Diffstat (limited to 'storage/innobase')
-rw-r--r--  storage/innobase/.clang-format-old | 11
-rw-r--r--  storage/innobase/CMakeLists.txt | 401
-rw-r--r--  storage/innobase/COPYING.Google | 30
-rw-r--r--  storage/innobase/COPYING.Percona | 30
-rw-r--r--  storage/innobase/btr/btr0btr.cc | 5192
-rw-r--r--  storage/innobase/btr/btr0bulk.cc | 1238
-rw-r--r--  storage/innobase/btr/btr0cur.cc | 8279
-rw-r--r--  storage/innobase/btr/btr0defragment.cc | 843
-rw-r--r--  storage/innobase/btr/btr0pcur.cc | 681
-rw-r--r--  storage/innobase/btr/btr0sea.cc | 2372
-rw-r--r--  storage/innobase/buf/buf0block_hint.cc | 59
-rw-r--r--  storage/innobase/buf/buf0buddy.cc | 764
-rw-r--r--  storage/innobase/buf/buf0buf.cc | 4728
-rw-r--r--  storage/innobase/buf/buf0checksum.cc | 129
-rw-r--r--  storage/innobase/buf/buf0dblwr.cc | 764
-rw-r--r--  storage/innobase/buf/buf0dump.cc | 824
-rw-r--r--  storage/innobase/buf/buf0flu.cc | 2530
-rw-r--r--  storage/innobase/buf/buf0lru.cc | 1477
-rw-r--r--  storage/innobase/buf/buf0rea.cc | 785
-rw-r--r--  storage/innobase/bzip2.cmake | 36
-rwxr-xr-x  storage/innobase/compile-innodb | 25
-rw-r--r--  storage/innobase/data/data0data.cc | 854
-rw-r--r--  storage/innobase/data/data0type.cc | 212
-rw-r--r--  storage/innobase/dict/dict0boot.cc | 492
-rw-r--r--  storage/innobase/dict/dict0crea.cc | 2237
-rw-r--r--  storage/innobase/dict/dict0defrag_bg.cc | 327
-rw-r--r--  storage/innobase/dict/dict0dict.cc | 5277
-rw-r--r--  storage/innobase/dict/dict0load.cc | 3687
-rw-r--r--  storage/innobase/dict/dict0mem.cc | 1396
-rw-r--r--  storage/innobase/dict/dict0stats.cc | 4306
-rw-r--r--  storage/innobase/dict/dict0stats_bg.cc | 479
-rw-r--r--  storage/innobase/eval/eval0eval.cc | 632
-rw-r--r--  storage/innobase/eval/eval0proc.cc | 286
-rw-r--r--  storage/innobase/fil/fil0crypt.cc | 2642
-rw-r--r--  storage/innobase/fil/fil0fil.cc | 3757
-rw-r--r--  storage/innobase/fil/fil0pagecompress.cc | 613
-rw-r--r--  storage/innobase/fsp/fsp0file.cc | 1043
-rw-r--r--  storage/innobase/fsp/fsp0fsp.cc | 2890
-rw-r--r--  storage/innobase/fsp/fsp0space.cc | 230
-rw-r--r--  storage/innobase/fsp/fsp0sysspace.cc | 994
-rw-r--r--  storage/innobase/fts/Makefile.query | 18
-rw-r--r--  storage/innobase/fts/fts0ast.cc | 815
-rw-r--r--  storage/innobase/fts/fts0blex.cc | 2177
-rw-r--r--  storage/innobase/fts/fts0blex.l | 74
-rw-r--r--  storage/innobase/fts/fts0config.cc | 432
-rw-r--r--  storage/innobase/fts/fts0fts.cc | 6316
-rw-r--r--  storage/innobase/fts/fts0opt.cc | 3053
-rw-r--r--  storage/innobase/fts/fts0pars.cc | 2007
-rw-r--r--  storage/innobase/fts/fts0pars.y | 293
-rw-r--r--  storage/innobase/fts/fts0plugin.cc | 283
-rw-r--r--  storage/innobase/fts/fts0que.cc | 4596
-rw-r--r--  storage/innobase/fts/fts0sql.cc | 258
-rw-r--r--  storage/innobase/fts/fts0tlex.cc | 2169
-rw-r--r--  storage/innobase/fts/fts0tlex.l | 69
-rwxr-xr-x  storage/innobase/fts/make_parser.sh | 49
-rw-r--r--  storage/innobase/fut/fut0lst.cc | 392
-rw-r--r--  storage/innobase/gis/gis0geo.cc | 650
-rw-r--r--  storage/innobase/gis/gis0rtree.cc | 1956
-rw-r--r--  storage/innobase/gis/gis0sea.cc | 2052
-rw-r--r--  storage/innobase/ha/ha0storage.cc | 178
-rw-r--r--  storage/innobase/handler/ha_innodb.cc | 21691
-rw-r--r--  storage/innobase/handler/ha_innodb.h | 973
-rw-r--r--  storage/innobase/handler/handler0alter.cc | 11565
-rw-r--r--  storage/innobase/handler/i_s.cc | 7461
-rw-r--r--  storage/innobase/handler/i_s.h | 147
-rw-r--r--  storage/innobase/ibuf/ibuf0ibuf.cc | 4811
-rw-r--r--  storage/innobase/include/btr0btr.h | 760
-rw-r--r--  storage/innobase/include/btr0btr.ic | 149
-rw-r--r--  storage/innobase/include/btr0bulk.h | 371
-rw-r--r--  storage/innobase/include/btr0cur.h | 1010
-rw-r--r--  storage/innobase/include/btr0cur.ic | 211
-rw-r--r--  storage/innobase/include/btr0defragment.h | 75
-rw-r--r--  storage/innobase/include/btr0pcur.h | 546
-rw-r--r--  storage/innobase/include/btr0pcur.ic | 645
-rw-r--r--  storage/innobase/include/btr0sea.h | 392
-rw-r--r--  storage/innobase/include/btr0sea.ic | 160
-rw-r--r--  storage/innobase/include/btr0types.h | 59
-rw-r--r--  storage/innobase/include/buf0block_hint.h | 76
-rw-r--r--  storage/innobase/include/buf0buddy.h | 92
-rw-r--r--  storage/innobase/include/buf0buf.h | 2456
-rw-r--r--  storage/innobase/include/buf0buf.ic | 422
-rw-r--r--  storage/innobase/include/buf0checksum.h | 67
-rw-r--r--  storage/innobase/include/buf0dblwr.h | 170
-rw-r--r--  storage/innobase/include/buf0dump.h | 44
-rw-r--r--  storage/innobase/include/buf0flu.h | 153
-rw-r--r--  storage/innobase/include/buf0flu.ic | 66
-rw-r--r--  storage/innobase/include/buf0lru.h | 204
-rw-r--r--  storage/innobase/include/buf0rea.h | 119
-rw-r--r--  storage/innobase/include/buf0types.h | 225
-rw-r--r--  storage/innobase/include/data0data.h | 710
-rw-r--r--  storage/innobase/include/data0data.ic | 633
-rw-r--r--  storage/innobase/include/data0type.h | 606
-rw-r--r--  storage/innobase/include/data0type.ic | 618
-rw-r--r--  storage/innobase/include/data0types.h | 36
-rw-r--r--  storage/innobase/include/db0err.h | 178
-rw-r--r--  storage/innobase/include/dict0boot.h | 330
-rw-r--r--  storage/innobase/include/dict0boot.ic | 78
-rw-r--r--  storage/innobase/include/dict0crea.h | 324
-rw-r--r--  storage/innobase/include/dict0crea.ic | 136
-rw-r--r--  storage/innobase/include/dict0defrag_bg.h | 106
-rw-r--r--  storage/innobase/include/dict0dict.h | 1804
-rw-r--r--  storage/innobase/include/dict0dict.ic | 1248
-rw-r--r--  storage/innobase/include/dict0load.h | 309
-rw-r--r--  storage/innobase/include/dict0mem.h | 2542
-rw-r--r--  storage/innobase/include/dict0mem.ic | 73
-rw-r--r--  storage/innobase/include/dict0pagecompress.h | 61
-rw-r--r--  storage/innobase/include/dict0pagecompress.ic | 81
-rw-r--r--  storage/innobase/include/dict0priv.h | 50
-rw-r--r--  storage/innobase/include/dict0priv.ic | 91
-rw-r--r--  storage/innobase/include/dict0stats.h | 251
-rw-r--r--  storage/innobase/include/dict0stats.ic | 221
-rw-r--r--  storage/innobase/include/dict0stats_bg.h | 122
-rw-r--r--  storage/innobase/include/dict0types.h | 177
-rw-r--r--  storage/innobase/include/dyn0buf.h | 496
-rw-r--r--  storage/innobase/include/dyn0types.h | 39
-rw-r--r--  storage/innobase/include/eval0eval.h | 109
-rw-r--r--  storage/innobase/include/eval0eval.ic | 254
-rw-r--r--  storage/innobase/include/eval0proc.h | 94
-rw-r--r--  storage/innobase/include/eval0proc.ic | 88
-rw-r--r--  storage/innobase/include/fil0crypt.h | 455
-rw-r--r--  storage/innobase/include/fil0crypt.ic | 81
-rw-r--r--  storage/innobase/include/fil0fil.h | 1799
-rw-r--r--  storage/innobase/include/fil0fil.ic | 144
-rw-r--r--  storage/innobase/include/fil0pagecompress.h | 60
-rw-r--r--  storage/innobase/include/fsp0file.h | 576
-rw-r--r--  storage/innobase/include/fsp0fsp.h | 761
-rw-r--r--  storage/innobase/include/fsp0space.h | 242
-rw-r--r--  storage/innobase/include/fsp0sysspace.h | 289
-rw-r--r--  storage/innobase/include/fsp0types.h | 405
-rw-r--r--  storage/innobase/include/fts0ast.h | 340
-rw-r--r--  storage/innobase/include/fts0blex.h | 702
-rw-r--r--  storage/innobase/include/fts0fts.h | 976
-rw-r--r--  storage/innobase/include/fts0opt.h | 39
-rw-r--r--  storage/innobase/include/fts0pars.h | 72
-rw-r--r--  storage/innobase/include/fts0plugin.h | 50
-rw-r--r--  storage/innobase/include/fts0priv.h | 502
-rw-r--r--  storage/innobase/include/fts0priv.ic | 121
-rw-r--r--  storage/innobase/include/fts0tlex.h | 702
-rw-r--r--  storage/innobase/include/fts0tokenize.h | 189
-rw-r--r--  storage/innobase/include/fts0types.h | 386
-rw-r--r--  storage/innobase/include/fts0types.ic | 231
-rw-r--r--  storage/innobase/include/fts0vlc.ic | 142
-rw-r--r--  storage/innobase/include/fut0fut.h | 74
-rw-r--r--  storage/innobase/include/fut0lst.h | 163
-rw-r--r--  storage/innobase/include/gis0geo.h | 122
-rw-r--r--  storage/innobase/include/gis0rtree.h | 494
-rw-r--r--  storage/innobase/include/gis0rtree.ic | 242
-rw-r--r--  storage/innobase/include/gis0type.h | 152
-rw-r--r--  storage/innobase/include/ha0ha.h | 60
-rw-r--r--  storage/innobase/include/ha0ha.ic | 154
-rw-r--r--  storage/innobase/include/ha0storage.h | 137
-rw-r--r--  storage/innobase/include/ha0storage.ic | 142
-rw-r--r--  storage/innobase/include/ha_prototypes.h | 522
-rw-r--r--  storage/innobase/include/handler0alter.h | 108
-rw-r--r--  storage/innobase/include/hash0hash.h | 236
-rw-r--r--  storage/innobase/include/ib0mutex.h | 773
-rw-r--r--  storage/innobase/include/ibuf0ibuf.h | 411
-rw-r--r--  storage/innobase/include/ibuf0ibuf.ic | 307
-rw-r--r--  storage/innobase/include/ibuf0types.h | 31
-rw-r--r--  storage/innobase/include/lock0iter.h | 66
-rw-r--r--  storage/innobase/include/lock0lock.h | 990
-rw-r--r--  storage/innobase/include/lock0lock.ic | 103
-rw-r--r--  storage/innobase/include/lock0prdt.h | 204
-rw-r--r--  storage/innobase/include/lock0priv.h | 653
-rw-r--r--  storage/innobase/include/lock0priv.ic | 321
-rw-r--r--  storage/innobase/include/lock0types.h | 273
-rw-r--r--  storage/innobase/include/log0crypt.h | 125
-rw-r--r--  storage/innobase/include/log0log.h | 751
-rw-r--r--  storage/innobase/include/log0log.ic | 326
-rw-r--r--  storage/innobase/include/log0recv.h | 426
-rw-r--r--  storage/innobase/include/log0types.h | 44
-rw-r--r--  storage/innobase/include/mach0data.h | 353
-rw-r--r--  storage/innobase/include/mach0data.ic | 836
-rw-r--r--  storage/innobase/include/mem0mem.h | 345
-rw-r--r--  storage/innobase/include/mem0mem.ic | 466
-rw-r--r--  storage/innobase/include/mtr0log.h | 673
-rw-r--r--  storage/innobase/include/mtr0mtr.h | 696
-rw-r--r--  storage/innobase/include/mtr0mtr.ic | 173
-rw-r--r--  storage/innobase/include/mtr0types.h | 347
-rw-r--r--  storage/innobase/include/os0event.h | 131
-rw-r--r--  storage/innobase/include/os0file.h | 1228
-rw-r--r--  storage/innobase/include/os0file.ic | 450
-rw-r--r--  storage/innobase/include/os0thread.h | 98
-rw-r--r--  storage/innobase/include/page0cur.h | 350
-rw-r--r--  storage/innobase/include/page0cur.ic | 291
-rw-r--r--  storage/innobase/include/page0page.h | 1171
-rw-r--r--  storage/innobase/include/page0page.ic | 724
-rw-r--r--  storage/innobase/include/page0types.h | 161
-rw-r--r--  storage/innobase/include/page0zip.h | 392
-rw-r--r--  storage/innobase/include/page0zip.ic | 334
-rw-r--r--  storage/innobase/include/pars0grm.h | 145
-rw-r--r--  storage/innobase/include/pars0opt.h | 68
-rw-r--r--  storage/innobase/include/pars0pars.h | 724
-rw-r--r--  storage/innobase/include/pars0sym.h | 243
-rw-r--r--  storage/innobase/include/pars0types.h | 50
-rw-r--r--  storage/innobase/include/que0que.h | 435
-rw-r--r--  storage/innobase/include/que0que.ic | 293
-rw-r--r--  storage/innobase/include/que0types.h | 97
-rw-r--r--  storage/innobase/include/read0types.h | 293
-rw-r--r--  storage/innobase/include/rem0cmp.h | 263
-rw-r--r--  storage/innobase/include/rem0cmp.ic | 107
-rw-r--r--  storage/innobase/include/rem0rec.h | 1299
-rw-r--r--  storage/innobase/include/rem0rec.ic | 1204
-rw-r--r--  storage/innobase/include/rem0types.h | 78
-rw-r--r--  storage/innobase/include/row0ext.h | 101
-rw-r--r--  storage/innobase/include/row0ext.ic | 87
-rw-r--r--  storage/innobase/include/row0ftsort.h | 265
-rw-r--r--  storage/innobase/include/row0import.h | 67
-rw-r--r--  storage/innobase/include/row0ins.h | 224
-rw-r--r--  storage/innobase/include/row0log.h | 268
-rw-r--r--  storage/innobase/include/row0log.ic | 84
-rw-r--r--  storage/innobase/include/row0merge.h | 464
-rw-r--r--  storage/innobase/include/row0mysql.h | 975
-rw-r--r--  storage/innobase/include/row0purge.h | 268
-rw-r--r--  storage/innobase/include/row0quiesce.h | 67
-rw-r--r--  storage/innobase/include/row0row.h | 432
-rw-r--r--  storage/innobase/include/row0row.ic | 221
-rw-r--r--  storage/innobase/include/row0sel.h | 482
-rw-r--r--  storage/innobase/include/row0sel.ic | 138
-rw-r--r--  storage/innobase/include/row0types.h | 54
-rw-r--r--  storage/innobase/include/row0uins.h | 50
-rw-r--r--  storage/innobase/include/row0umod.h | 46
-rw-r--r--  storage/innobase/include/row0undo.h | 128
-rw-r--r--  storage/innobase/include/row0upd.h | 568
-rw-r--r--  storage/innobase/include/row0upd.ic | 153
-rw-r--r--  storage/innobase/include/row0vers.h | 141
-rw-r--r--  storage/innobase/include/rw_lock.h | 112
-rw-r--r--  storage/innobase/include/srv0mon.h | 892
-rw-r--r--  storage/innobase/include/srv0mon.ic | 113
-rw-r--r--  storage/innobase/include/srv0srv.h | 868
-rw-r--r--  storage/innobase/include/srv0start.h | 129
-rw-r--r--  storage/innobase/include/sync0arr.h | 129
-rw-r--r--  storage/innobase/include/sync0arr.ic | 85
-rw-r--r--  storage/innobase/include/sync0debug.h | 101
-rw-r--r--  storage/innobase/include/sync0policy.h | 296
-rw-r--r--  storage/innobase/include/sync0rw.h | 838
-rw-r--r--  storage/innobase/include/sync0rw.ic | 842
-rw-r--r--  storage/innobase/include/sync0sync.h | 107
-rw-r--r--  storage/innobase/include/sync0types.h | 1060
-rw-r--r--  storage/innobase/include/trx0i_s.h | 278
-rw-r--r--  storage/innobase/include/trx0purge.h | 268
-rw-r--r--  storage/innobase/include/trx0rec.h | 321
-rw-r--r--  storage/innobase/include/trx0rec.ic | 73
-rw-r--r--  storage/innobase/include/trx0roll.h | 187
-rw-r--r--  storage/innobase/include/trx0rseg.h | 277
-rw-r--r--  storage/innobase/include/trx0rseg.ic | 72
-rw-r--r--  storage/innobase/include/trx0sys.h | 1235
-rw-r--r--  storage/innobase/include/trx0trx.h | 1126
-rw-r--r--  storage/innobase/include/trx0trx.ic | 206
-rw-r--r--  storage/innobase/include/trx0types.h | 142
-rw-r--r--  storage/innobase/include/trx0undo.h | 465
-rw-r--r--  storage/innobase/include/trx0undo.ic | 158
-rw-r--r--  storage/innobase/include/trx0xa.h | 61
-rw-r--r--  storage/innobase/include/univ.i | 581
-rw-r--r--  storage/innobase/include/ut0byte.h | 117
-rw-r--r--  storage/innobase/include/ut0byte.ic | 109
-rw-r--r--  storage/innobase/include/ut0counter.h | 125
-rw-r--r--  storage/innobase/include/ut0crc32.h | 37
-rw-r--r--  storage/innobase/include/ut0dbg.h | 179
-rw-r--r--  storage/innobase/include/ut0list.h | 146
-rw-r--r--  storage/innobase/include/ut0list.ic | 80
-rw-r--r--  storage/innobase/include/ut0lst.h | 568
-rw-r--r--  storage/innobase/include/ut0mem.h | 76
-rw-r--r--  storage/innobase/include/ut0mem.ic | 246
-rw-r--r--  storage/innobase/include/ut0mutex.h | 178
-rw-r--r--  storage/innobase/include/ut0new.h | 1105
-rw-r--r--  storage/innobase/include/ut0pool.h | 363
-rw-r--r--  storage/innobase/include/ut0rbt.h | 254
-rw-r--r--  storage/innobase/include/ut0rnd.h | 137
-rw-r--r--  storage/innobase/include/ut0rnd.ic | 150
-rw-r--r--  storage/innobase/include/ut0sort.h | 104
-rw-r--r--  storage/innobase/include/ut0stage.h | 499
-rw-r--r--  storage/innobase/include/ut0ut.h | 453
-rw-r--r--  storage/innobase/include/ut0ut.ic | 143
-rw-r--r--  storage/innobase/include/ut0vec.h | 285
-rw-r--r--  storage/innobase/include/ut0vec.ic | 348
-rw-r--r--  storage/innobase/include/ut0wqueue.h | 94
-rw-r--r--  storage/innobase/innodb.cmake | 191
-rw-r--r--  storage/innobase/lock/lock0iter.cc | 107
-rw-r--r--  storage/innobase/lock/lock0lock.cc | 6818
-rw-r--r--  storage/innobase/lock/lock0prdt.cc | 1028
-rw-r--r--  storage/innobase/lock/lock0wait.cc | 515
-rw-r--r--  storage/innobase/log/log0crypt.cc | 429
-rw-r--r--  storage/innobase/log/log0log.cc | 1340
-rw-r--r--  storage/innobase/log/log0recv.cc | 3783
-rw-r--r--  storage/innobase/log/log0sync.cc | 309
-rw-r--r--  storage/innobase/log/log0sync.h | 81
-rw-r--r--  storage/innobase/lz4.cmake | 38
-rw-r--r--  storage/innobase/lzma.cmake | 35
-rw-r--r--  storage/innobase/lzo.cmake | 34
-rw-r--r--  storage/innobase/mem/mem0mem.cc | 436
-rw-r--r--  storage/innobase/mtr/mtr0mtr.cc | 1121
-rw-r--r--  storage/innobase/mysql-test/storage_engine/alter_tablespace.opt | 2
-rw-r--r--  storage/innobase/mysql-test/storage_engine/autoinc_secondary.rdiff | 30
-rw-r--r--  storage/innobase/mysql-test/storage_engine/cache_index.rdiff | 71
-rw-r--r--  storage/innobase/mysql-test/storage_engine/checksum_table_live.rdiff | 13
-rw-r--r--  storage/innobase/mysql-test/storage_engine/col_opt_not_null.opt | 1
-rw-r--r--  storage/innobase/mysql-test/storage_engine/col_opt_null.opt | 1
-rw-r--r--  storage/innobase/mysql-test/storage_engine/define_engine.inc | 45
-rw-r--r--  storage/innobase/mysql-test/storage_engine/disabled.def | 9
-rw-r--r--  storage/innobase/mysql-test/storage_engine/fulltext_search.rdiff | 49
-rw-r--r--  storage/innobase/mysql-test/storage_engine/index_enable_disable.rdiff | 33
-rw-r--r--  storage/innobase/mysql-test/storage_engine/index_type_hash.rdiff | 60
-rw-r--r--  storage/innobase/mysql-test/storage_engine/insert_delayed.rdiff | 26
-rw-r--r--  storage/innobase/mysql-test/storage_engine/lock_concurrent.rdiff | 25
-rw-r--r--  storage/innobase/mysql-test/storage_engine/optimize_table.rdiff | 37
-rw-r--r--  storage/innobase/mysql-test/storage_engine/parts/checksum_table.rdiff | 13
-rw-r--r--  storage/innobase/mysql-test/storage_engine/parts/create_table.rdiff | 20
-rw-r--r--  storage/innobase/mysql-test/storage_engine/parts/disabled.def | 1
-rw-r--r--  storage/innobase/mysql-test/storage_engine/parts/optimize_table.rdiff | 58
-rw-r--r--  storage/innobase/mysql-test/storage_engine/parts/repair_table.rdiff | 158
-rw-r--r--  storage/innobase/mysql-test/storage_engine/parts/suite.opt | 2
-rw-r--r--  storage/innobase/mysql-test/storage_engine/repair_table.rdiff | 139
-rw-r--r--  storage/innobase/mysql-test/storage_engine/suite.opt | 1
-rw-r--r--  storage/innobase/mysql-test/storage_engine/tbl_opt_index_dir.rdiff | 23
-rw-r--r--  storage/innobase/mysql-test/storage_engine/tbl_opt_insert_method.rdiff | 11
-rw-r--r--  storage/innobase/mysql-test/storage_engine/tbl_opt_row_format.rdiff | 44
-rw-r--r--  storage/innobase/mysql-test/storage_engine/tbl_opt_union.rdiff | 16
-rw-r--r--  storage/innobase/mysql-test/storage_engine/trx/cons_snapshot_serializable.rdiff | 18
-rw-r--r--  storage/innobase/mysql-test/storage_engine/trx/level_read_committed.rdiff | 11
-rw-r--r--  storage/innobase/mysql-test/storage_engine/trx/level_read_uncommitted.rdiff | 11
-rw-r--r--  storage/innobase/mysql-test/storage_engine/trx/suite.opt | 3
-rw-r--r--  storage/innobase/mysql-test/storage_engine/type_blob.opt | 1
-rw-r--r--  storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff | 11
-rw-r--r--  storage/innobase/mysql-test/storage_engine/type_float_indexes.rdiff | 11
-rw-r--r--  storage/innobase/mysql-test/storage_engine/type_text.opt | 1
-rw-r--r--  storage/innobase/os/os0event.cc | 515
-rw-r--r--  storage/innobase/os/os0file.cc | 4349
-rw-r--r--  storage/innobase/os/os0thread.cc | 131
-rw-r--r--  storage/innobase/page/page0cur.cc | 2983
-rw-r--r--  storage/innobase/page/page0page.cc | 2499
-rw-r--r--  storage/innobase/page/page0zip.cc | 4713
-rw-r--r--  storage/innobase/pars/lexyy.cc | 2841
-rwxr-xr-x  storage/innobase/pars/make_bison.sh | 31
-rwxr-xr-x  storage/innobase/pars/make_flex.sh | 50
-rw-r--r--  storage/innobase/pars/pars0grm.cc | 2616
-rw-r--r--  storage/innobase/pars/pars0grm.y | 618
-rw-r--r--  storage/innobase/pars/pars0lex.l | 614
-rw-r--r--  storage/innobase/pars/pars0opt.cc | 1267
-rw-r--r--  storage/innobase/pars/pars0pars.cc | 2413
-rw-r--r--  storage/innobase/pars/pars0sym.cc | 416
-rw-r--r--  storage/innobase/plugin_exports | 14
-rw-r--r--  storage/innobase/que/que0que.cc | 1138
-rw-r--r--  storage/innobase/read/read0read.cc | 252
-rw-r--r--  storage/innobase/rem/rem0cmp.cc | 1005
-rw-r--r--  storage/innobase/rem/rem0rec.cc | 2844
-rw-r--r--  storage/innobase/row/row0ext.cc | 132
-rw-r--r--  storage/innobase/row/row0ftsort.cc | 1781
-rw-r--r--  storage/innobase/row/row0import.cc | 4290
-rw-r--r--  storage/innobase/row/row0ins.cc | 3838
-rw-r--r--  storage/innobase/row/row0log.cc | 4053
-rw-r--r--  storage/innobase/row/row0merge.cc | 4799
-rw-r--r--  storage/innobase/row/row0mysql.cc | 4902
-rw-r--r--  storage/innobase/row/row0purge.cc | 1221
-rw-r--r--  storage/innobase/row/row0quiesce.cc | 710
-rw-r--r--  storage/innobase/row/row0row.cc | 1741
-rw-r--r--  storage/innobase/row/row0sel.cc | 6082
-rw-r--r--  storage/innobase/row/row0uins.cc | 608
-rw-r--r--  storage/innobase/row/row0umod.cc | 1418
-rw-r--r--  storage/innobase/row/row0undo.cc | 491
-rw-r--r--  storage/innobase/row/row0upd.cc | 3237
-rw-r--r--  storage/innobase/row/row0vers.cc | 1353
-rw-r--r--  storage/innobase/snappy.cmake | 34
-rw-r--r--  storage/innobase/srv/srv0mon.cc | 2108
-rw-r--r--  storage/innobase/srv/srv0srv.cc | 2135
-rw-r--r--  storage/innobase/srv/srv0start.cc | 2168
-rw-r--r--  storage/innobase/sync/sync0arr.cc | 1296
-rw-r--r--  storage/innobase/sync/sync0debug.cc | 1423
-rw-r--r--  storage/innobase/sync/sync0rw.cc | 1216
-rw-r--r--  storage/innobase/sync/sync0sync.cc | 246
-rw-r--r--  storage/innobase/trx/trx0i_s.cc | 1490
-rw-r--r--  storage/innobase/trx/trx0purge.cc | 1297
-rw-r--r--  storage/innobase/trx/trx0rec.cc | 2559
-rw-r--r--  storage/innobase/trx/trx0roll.cc | 984
-rw-r--r--  storage/innobase/trx/trx0rseg.cc | 768
-rw-r--r--  storage/innobase/trx/trx0sys.cc | 339
-rw-r--r--  storage/innobase/trx/trx0trx.cc | 2300
-rw-r--r--  storage/innobase/trx/trx0undo.cc | 1401
-rw-r--r--  storage/innobase/ut/ut0dbg.cc | 61
-rw-r--r--  storage/innobase/ut/ut0list.cc | 151
-rw-r--r--  storage/innobase/ut/ut0mem.cc | 54
-rw-r--r--  storage/innobase/ut/ut0new.cc | 112
-rw-r--r--  storage/innobase/ut/ut0rbt.cc | 1140
-rw-r--r--  storage/innobase/ut/ut0rnd.cc | 93
-rw-r--r--  storage/innobase/ut/ut0ut.cc | 648
-rw-r--r--  storage/innobase/ut/ut0vec.cc | 73
-rw-r--r--  storage/innobase/ut/ut0wqueue.cc | 133
387 files changed, 326081 insertions, 0 deletions
diff --git a/storage/innobase/.clang-format-old b/storage/innobase/.clang-format-old
new file mode 100644
index 00000000..54f7b47b
--- /dev/null
+++ b/storage/innobase/.clang-format-old
@@ -0,0 +1,11 @@
+UseTab: Always
+TabWidth: 8
+IndentWidth: 8
+ContinuationIndentWidth: 8
+BreakBeforeBinaryOperators: All
+PointerAlignment: Left
+BreakBeforeBraces: Custom
+ColumnLimit: 79
+BraceWrapping:
+ AfterFunction: true
+AccessModifierOffset: -8
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
new file mode 100644
index 00000000..b7e7fb93
--- /dev/null
+++ b/storage/innobase/CMakeLists.txt
@@ -0,0 +1,401 @@
+
+# Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2014, 2020, MariaDB Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+# This is the CMakeLists for InnoDB
+
+
+
+INCLUDE(innodb.cmake)
+INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/tpool)
+
+SET(INNOBASE_SOURCES
+ btr/btr0btr.cc
+ btr/btr0bulk.cc
+ btr/btr0cur.cc
+ btr/btr0pcur.cc
+ btr/btr0sea.cc
+ btr/btr0defragment.cc
+ buf/buf0block_hint.cc
+ buf/buf0buddy.cc
+ buf/buf0buf.cc
+ buf/buf0dblwr.cc
+ buf/buf0checksum.cc
+ buf/buf0dump.cc
+ buf/buf0flu.cc
+ buf/buf0lru.cc
+ buf/buf0rea.cc
+ data/data0data.cc
+ data/data0type.cc
+ dict/dict0boot.cc
+ dict/dict0crea.cc
+ dict/dict0dict.cc
+ dict/dict0load.cc
+ dict/dict0mem.cc
+ dict/dict0stats.cc
+ dict/dict0stats_bg.cc
+ dict/dict0defrag_bg.cc
+ eval/eval0eval.cc
+ eval/eval0proc.cc
+ fil/fil0fil.cc
+ fil/fil0pagecompress.cc
+ fil/fil0crypt.cc
+ fsp/fsp0fsp.cc
+ fsp/fsp0file.cc
+ fsp/fsp0space.cc
+ fsp/fsp0sysspace.cc
+ fut/fut0lst.cc
+ ha/ha0storage.cc
+ fts/fts0fts.cc
+ fts/fts0ast.cc
+ fts/fts0blex.cc
+ fts/fts0config.cc
+ fts/fts0opt.cc
+ fts/fts0pars.cc
+ fts/fts0que.cc
+ fts/fts0sql.cc
+ fts/fts0tlex.cc
+ gis/gis0geo.cc
+ gis/gis0rtree.cc
+ gis/gis0sea.cc
+ fts/fts0plugin.cc
+ handler/ha_innodb.cc
+ handler/handler0alter.cc
+ handler/i_s.cc
+ ibuf/ibuf0ibuf.cc
+ include/btr0btr.h
+ include/btr0btr.ic
+ include/btr0bulk.h
+ include/btr0cur.h
+ include/btr0cur.ic
+ include/btr0defragment.h
+ include/btr0pcur.h
+ include/btr0pcur.ic
+ include/btr0sea.h
+ include/btr0sea.ic
+ include/btr0types.h
+ include/buf0buddy.h
+ include/buf0buf.h
+ include/buf0buf.ic
+ include/buf0checksum.h
+ include/buf0dblwr.h
+ include/buf0dump.h
+ include/buf0flu.h
+ include/buf0flu.ic
+ include/buf0lru.h
+ include/buf0rea.h
+ include/buf0types.h
+ include/data0data.h
+ include/data0data.ic
+ include/data0type.h
+ include/data0type.ic
+ include/data0types.h
+ include/db0err.h
+ include/dict0boot.h
+ include/dict0boot.ic
+ include/dict0crea.h
+ include/dict0crea.ic
+ include/dict0defrag_bg.h
+ include/dict0dict.h
+ include/dict0dict.ic
+ include/dict0load.h
+ include/dict0mem.h
+ include/dict0mem.ic
+ include/dict0pagecompress.h
+ include/dict0pagecompress.ic
+ include/dict0priv.h
+ include/dict0priv.ic
+ include/dict0stats.h
+ include/dict0stats.ic
+ include/dict0stats_bg.h
+ include/dict0types.h
+ include/dyn0buf.h
+ include/dyn0types.h
+ include/eval0eval.h
+ include/eval0eval.ic
+ include/eval0proc.h
+ include/eval0proc.ic
+ include/fil0crypt.h
+ include/fil0crypt.ic
+ include/fil0fil.h
+ include/fil0fil.ic
+ include/fil0pagecompress.h
+ include/fsp0file.h
+ include/fsp0fsp.h
+ include/fsp0space.h
+ include/fsp0sysspace.h
+ include/fsp0types.h
+ include/fts0ast.h
+ include/fts0blex.h
+ include/fts0fts.h
+ include/fts0opt.h
+ include/fts0pars.h
+ include/fts0plugin.h
+ include/fts0priv.h
+ include/fts0priv.ic
+ include/fts0tlex.h
+ include/fts0tokenize.h
+ include/fts0types.h
+ include/fts0types.ic
+ include/fts0vlc.ic
+ include/fut0fut.h
+ include/fut0lst.h
+ include/gis0geo.h
+ include/gis0rtree.h
+ include/gis0rtree.ic
+ include/gis0type.h
+ include/ha_prototypes.h
+ include/ha0ha.h
+ include/ha0ha.ic
+ include/ha0storage.h
+ include/ha0storage.ic
+ include/handler0alter.h
+ include/hash0hash.h
+ include/ib0mutex.h
+ include/ibuf0ibuf.h
+ include/ibuf0ibuf.ic
+ include/ibuf0types.h
+ include/lock0iter.h
+ include/lock0lock.h
+ include/lock0lock.ic
+ include/lock0prdt.h
+ include/lock0priv.h
+ include/lock0priv.ic
+ include/lock0types.h
+ include/log0crypt.h
+ include/log0log.h
+ include/log0log.ic
+ include/log0recv.h
+ include/log0types.h
+ include/mach0data.h
+ include/mach0data.ic
+ include/mem0mem.h
+ include/mem0mem.ic
+ include/mtr0log.h
+ include/mtr0mtr.h
+ include/mtr0mtr.ic
+ include/mtr0types.h
+ include/os0event.h
+ include/os0file.h
+ include/os0file.ic
+ include/os0thread.h
+ include/page0cur.h
+ include/page0cur.ic
+ include/page0page.h
+ include/page0page.ic
+ include/page0types.h
+ include/page0zip.h
+ include/page0zip.ic
+ include/pars0grm.h
+ include/pars0opt.h
+ include/pars0pars.h
+ include/pars0sym.h
+ include/pars0types.h
+ include/que0que.h
+ include/que0que.ic
+ include/que0types.h
+ include/read0types.h
+ include/rem0cmp.h
+ include/rem0cmp.ic
+ include/rem0rec.h
+ include/rem0rec.ic
+ include/rem0types.h
+ include/row0ext.h
+ include/row0ext.ic
+ include/row0ftsort.h
+ include/row0import.h
+ include/row0ins.h
+ include/row0log.h
+ include/row0log.ic
+ include/row0merge.h
+ include/row0mysql.h
+ include/row0purge.h
+ include/row0quiesce.h
+ include/row0row.h
+ include/row0row.ic
+ include/row0sel.h
+ include/row0sel.ic
+ include/row0types.h
+ include/row0uins.h
+ include/row0umod.h
+ include/row0undo.h
+ include/row0upd.h
+ include/row0upd.ic
+ include/row0vers.h
+ include/srv0mon.h
+ include/srv0mon.ic
+ include/srv0srv.h
+ include/srv0start.h
+ include/sync0arr.h
+ include/sync0arr.ic
+ include/sync0debug.h
+ include/sync0policy.h
+ include/sync0rw.h
+ include/sync0rw.ic
+ include/sync0sync.h
+ include/sync0types.h
+ include/trx0i_s.h
+ include/trx0purge.h
+ include/trx0rec.h
+ include/trx0rec.ic
+ include/trx0roll.h
+ include/trx0rseg.h
+ include/trx0rseg.ic
+ include/trx0sys.h
+ include/trx0trx.h
+ include/trx0trx.ic
+ include/trx0types.h
+ include/trx0undo.h
+ include/trx0undo.ic
+ include/trx0xa.h
+ include/univ.i
+ include/ut0byte.h
+ include/ut0byte.ic
+ include/ut0counter.h
+ include/ut0dbg.h
+ include/ut0list.h
+ include/ut0list.ic
+ include/ut0lst.h
+ include/ut0mem.h
+ include/ut0mem.ic
+ include/ut0mutex.h
+ include/ut0new.h
+ include/ut0pool.h
+ include/ut0rbt.h
+ include/ut0rnd.h
+ include/ut0rnd.ic
+ include/ut0sort.h
+ include/ut0stage.h
+ include/ut0ut.h
+ include/ut0ut.ic
+ include/ut0vec.h
+ include/ut0vec.ic
+ include/ut0wqueue.h
+ lock/lock0iter.cc
+ lock/lock0prdt.cc
+ lock/lock0lock.cc
+ lock/lock0wait.cc
+ log/log0log.cc
+ log/log0recv.cc
+ log/log0crypt.cc
+ log/log0sync.cc
+ mem/mem0mem.cc
+ mtr/mtr0mtr.cc
+ os/os0file.cc
+ os/os0event.cc
+ os/os0thread.cc
+ page/page0cur.cc
+ page/page0page.cc
+ page/page0zip.cc
+ pars/lexyy.cc
+ pars/pars0grm.cc
+ pars/pars0opt.cc
+ pars/pars0pars.cc
+ pars/pars0sym.cc
+ que/que0que.cc
+ read/read0read.cc
+ rem/rem0cmp.cc
+ rem/rem0rec.cc
+ row/row0ext.cc
+ row/row0ftsort.cc
+ row/row0import.cc
+ row/row0ins.cc
+ row/row0merge.cc
+ row/row0mysql.cc
+ row/row0log.cc
+ row/row0purge.cc
+ row/row0row.cc
+ row/row0sel.cc
+ row/row0uins.cc
+ row/row0umod.cc
+ row/row0undo.cc
+ row/row0upd.cc
+ row/row0quiesce.cc
+ row/row0vers.cc
+ srv/srv0mon.cc
+ srv/srv0srv.cc
+ srv/srv0start.cc
+ sync/sync0arr.cc
+ sync/sync0rw.cc
+ sync/sync0debug.cc
+ sync/sync0sync.cc
+ trx/trx0i_s.cc
+ trx/trx0purge.cc
+ trx/trx0rec.cc
+ trx/trx0roll.cc
+ trx/trx0rseg.cc
+ trx/trx0sys.cc
+ trx/trx0trx.cc
+ trx/trx0undo.cc
+ ut/ut0dbg.cc
+ ut/ut0list.cc
+ ut/ut0mem.cc
+ ut/ut0new.cc
+ ut/ut0rbt.cc
+ ut/ut0rnd.cc
+ ut/ut0ut.cc
+ ut/ut0vec.cc
+ ut/ut0wqueue.cc)
+
+MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE
+ MODULE_OUTPUT_NAME ha_innodb
+ DEFAULT RECOMPILE_FOR_EMBEDDED
+ LINK_LIBRARIES
+ ${ZLIB_LIBRARY}
+ ${NUMA_LIBRARY}
+ ${LIBSYSTEMD}
+ ${LINKER_SCRIPT}
+ ${LIBPMEM})
+
+IF(NOT TARGET innobase)
+ RETURN()
+ENDIF()
+
+ADD_DEFINITIONS(${SSL_DEFINES})
+
+# A GCC bug causes a crash when compiling these files on ARM64 with -O1
+# or higher. Compile them with -O0 as a workaround.
+IF(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64"
+ AND CMAKE_C_COMPILER_VERSION VERSION_LESS "5.2.0")
+ ADD_COMPILE_FLAGS(
+ btr/btr0btr.cc
+ btr/btr0cur.cc
+ buf/buf0buf.cc
+ fts/fts0fts.cc
+ gis/gis0sea.cc
+ handler/handler0alter.cc
+ mtr/mtr0mtr.cc
+ row/row0merge.cc
+ row/row0mysql.cc
+ srv/srv0srv.cc
+ COMPILE_FLAGS "-O0"
+ )
+ENDIF()
+IF(MSVC)
+ IF(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ ADD_COMPILE_FLAGS(
+ pars/lexyy.cc
+ COMPILE_FLAGS "/wd4267")
+ ENDIF()
+  # Silence "switch statement contains 'default' but no 'case' label"
+  # on a generated file.
+ TARGET_COMPILE_OPTIONS(innobase PRIVATE "/wd4065")
+ENDIF()
+
+IF(NOT (PLUGIN_INNOBASE STREQUAL DYNAMIC))
+ TARGET_LINK_LIBRARIES(innobase tpool mysys)
+ ADD_SUBDIRECTORY(${CMAKE_SOURCE_DIR}/extra/mariabackup ${CMAKE_BINARY_DIR}/extra/mariabackup)
+ENDIF()
diff --git a/storage/innobase/COPYING.Google b/storage/innobase/COPYING.Google
new file mode 100644
index 00000000..5ade2b0e
--- /dev/null
+++ b/storage/innobase/COPYING.Google
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Google, Inc.
+These contributions are used with the following license:
+
+Copyright (c) 2008, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+ * Neither the name of the Google Inc. nor the names of its
+ contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/innobase/COPYING.Percona b/storage/innobase/COPYING.Percona
new file mode 100644
index 00000000..8c786811
--- /dev/null
+++ b/storage/innobase/COPYING.Percona
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Percona, Inc.
+These contributions are used with the following license:
+
+Copyright (c) 2008, 2009, Percona Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+ * Neither the name of the Percona Inc. nor the names of its
+ contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
new file mode 100644
index 00000000..de87ad02
--- /dev/null
+++ b/storage/innobase/btr/btr0btr.cc
@@ -0,0 +1,5192 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0btr.cc
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0btr.h"
+
+#include "page0page.h"
+#include "page0zip.h"
+#include "gis0rtree.h"
+
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "btr0defragment.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "ibuf0ibuf.h"
+#include "trx0trx.h"
+#include "srv0mon.h"
+#include "gis0geo.h"
+#include "dict0boot.h"
+#include "row0sel.h" /* row_search_max_autoinc() */
+
+Atomic_counter<uint32_t> btr_validate_index_running;
+
+/**************************************************************//**
+Checks if the page in the cursor can be merged with the given page.
+If necessary, re-organizes the merge_page.
+@return true if it is possible to merge */
+static
+bool
+btr_can_merge_with_page(
+/*====================*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to merge */
+ uint32_t page_no, /*!< in: a sibling page */
+ buf_block_t** merge_block, /*!< out: the merge block */
+ mtr_t* mtr); /*!< in: mini-transaction */
+
+/** Report that an index page is corrupted.
+@param[in]	block	corrupted buffer block
+@param[in]	index	index tree */
+void btr_corruption_report(const buf_block_t* block, const dict_index_t* index)
+{
+ ib::fatal()
+ << "Flag mismatch in page " << block->page.id()
+ << " index " << index->name
+ << " of table " << index->table->name;
+}
+
+/*
+Latching strategy of the InnoDB B-tree
+--------------------------------------
+
+Acquisition of node pointer page latches is protected by the index->lock latch.
+
+Before MariaDB 10.2.2, all node pointer pages were protected by index->lock
+either in S (shared) or X (exclusive) mode and block->lock was not acquired on
+node pointer pages.
+
+After MariaDB 10.2.2, a block->lock S-latch or X-latch is used to protect
+node pointer pages, and the acquisition of node pointer page latches is
+protected by index->lock.
+
+(0) Definition: B-tree level.
+
+(0.1) The leaf pages of the B-tree are at level 0.
+
+(0.2) The parent of a page at level L has level L+1. (The level of the
+root page is equal to the tree height.)
+
+(0.3) The B-tree lock (index->lock) is the parent of the root page and
+has a level = tree height + 1.
+
+Index->lock has 3 possible locking modes:
+
+(1) S-latch:
+
+(1.1) All latches for pages must be obtained in descending order of tree level.
+
+(1.2) Before obtaining the first node pointer page latch at a given B-tree
+level, the parent latch (at that level + 1) must be held.
+
+(1.3) If a node pointer page is already latched at a given level,
+we may only obtain the latch on its right sibling page at the same level.
+
+(1.4) Release of the node pointer page latches must be done in
+child-to-parent order. (This prevents deadlocks when index->lock
+has been obtained in SX mode.)
+
+(1.4.1) A level L node pointer page latch can be released only when
+no latches are held at child levels, i.e. levels < L.
+
+(1.4.2) All latches from node pointer pages must be released in such
+a way that no new latches are obtained in between.
+
+(1.5) [implied by (1.1), (1.2)] The root page latch must be the first
+node pointer latch obtained.
+
+(2) SX-latch:
+
+In this case rules (1.2) and (1.3) from the S-latch case are relaxed and
+merged into (2.2), and rule (1.4) is removed. Thus, latch acquisition
+can be skipped at some tree levels and latches can be obtained in
+a less restricted order.
+
+(2.1) [identical to (1.1)]: All latches for pages must be obtained in descending
+order of tree level.
+
+(2.2) When a node pointer latch at level L is obtained,
+the left sibling page latch at the same level or some ancestor
+page latch (at level > L) must be held.
+
+(2.3) [implied by (2.1), (2.2)] The first node pointer page latch obtained can
+be any node pointer page.
+
+(3) X-latch:
+
+Node pointer latches can be obtained in any order.
+
+NOTE: The new rules after MariaDB 10.2.2 do not affect the latching rules of leaf pages:
+
+An index->lock S-latch is needed for reads during node pointer traversal. When
+the leaf level is reached, index->lock can be released (and, with the MariaDB
+10.2.2 changes, all node pointer latches). Left-to-right index traversal at the
+leaf page level can be done safely by obtaining the right sibling leaf page
+latch and then releasing the old page latch.
+
+Single leaf page modifications (BTR_MODIFY_LEAF) are protected by index->lock
+S-latch.
+
+B-tree operations involving page splits or merges (BTR_MODIFY_TREE) and page
+allocations are protected by index->lock X-latch.
+
+Node pointers
+-------------
+Leaf pages of a B-tree contain the index records stored in the
+tree. On levels n > 0 we store 'node pointers' to pages on level
+n - 1. For each page there is exactly one node pointer stored:
+thus our tree is an ordinary B-tree, not a B-link tree.
+
+A node pointer contains a prefix P of an index record. The prefix
+is long enough so that it determines an index record uniquely.
+The file page number of the child page is added as the last
+field. To the child page we can store node pointers or index records
+which are >= P in the alphabetical order, but < P1 if there is
+a next node pointer on the level, and P1 is its prefix.
+
+If a node pointer with a prefix P points to a non-leaf child,
+then the leftmost record in the child must have the same
+prefix P. If it points to a leaf node, the child is not required
+to contain any record with a prefix equal to P. The leaf case
+is decided this way to allow arbitrary deletions in a leaf node
+without touching upper levels of the tree.
+
+We have predefined a special minimum record which we
+define as the smallest record in any alphabetical order.
+A minimum record is denoted by setting a bit in the record
+header. A minimum record acts as the prefix of a node pointer
+which points to a leftmost node on any level of the tree.
+
+File page allocation
+--------------------
+In the root node of a B-tree there are two file segment headers.
+The leaf pages of a tree are allocated from one file segment, to
+make them consecutive on disk if possible. From the other file segment
+we allocate pages for the non-leaf levels of the tree.
+*/
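
A minimal stand-alone sketch of the S-latch descent described by rules (1.1)-(1.5) may help. Here std::shared_mutex stands in for block->lock, and Page and descend_to_leaf are names invented for the illustration only; this is a toy model of the rules, not InnoDB code:

#include <cassert>
#include <shared_mutex>
#include <vector>

struct Page {
	int level;                 // 0 = leaf, per definition (0.1)
	std::shared_mutex latch;   // stand-in for block->lock
	Page* child = nullptr;     // one child suffices for the sketch
};

// Descend from the root to a leaf: latches are taken in descending
// level order (1.1), the parent latch is held while the child latch
// is taken (1.2), and the root latch comes first (1.5). Once the leaf
// is reached, the node pointer latches are released child-to-parent
// (1.4), keeping only the leaf latch, as in the NOTE above.
Page* descend_to_leaf(Page* root)
{
	std::vector<Page*> node_ptr_pages;
	root->latch.lock_shared();
	Page* page = root;
	while (page->level > 0) {
		node_ptr_pages.push_back(page);
		page->child->latch.lock_shared();
		page = page->child;
	}
	while (!node_ptr_pages.empty()) {
		node_ptr_pages.back()->latch.unlock_shared();
		node_ptr_pages.pop_back();
	}
	return page;               // the leaf is returned S-latched
}

int main()
{
	Page leaf{0}, node{1, {}, &leaf}, root{2, {}, &node};
	Page* p = descend_to_leaf(&root);
	assert(p->level == 0);
	p->latch.unlock_shared();
}

The SX-latch and X-latch modes (2) and (3) relax this order; the sketch models only the S-latch case.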
+
+#ifdef UNIV_BTR_DEBUG
+/**************************************************************//**
+Checks a file segment header within a B-tree root page.
+@return TRUE if valid */
+static
+ibool
+btr_root_fseg_validate(
+/*===================*/
+ const fseg_header_t* seg_header, /*!< in: segment header */
+ ulint space) /*!< in: tablespace identifier */
+{
+ ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);
+
+ ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space);
+ ut_a(offset >= FIL_PAGE_DATA);
+ ut_a(offset <= srv_page_size - FIL_PAGE_DATA_END);
+ return(TRUE);
+}
+#endif /* UNIV_BTR_DEBUG */
+
+/**************************************************************//**
+Gets the root node of a tree and x- or s-latches it.
+@return root page, x- or s-latched */
+buf_block_t*
+btr_root_block_get(
+/*===============*/
+ const dict_index_t* index, /*!< in: index tree */
+ rw_lock_type_t mode, /*!< in: either RW_S_LATCH
+ or RW_X_LATCH */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (!index->table || !index->table->space) {
+ return NULL;
+ }
+
+ buf_block_t* block = btr_block_get(*index, index->page, mode, false,
+ mtr);
+
+ if (!block) {
+ index->table->file_unreadable = true;
+
+ ib_push_warning(
+ static_cast<THD*>(NULL), DB_DECRYPTION_FAILED,
+ "Table %s in file %s is encrypted but encryption service or"
+ " used key_id is not available. "
+ " Can't continue reading table.",
+ index->table->name.m_name,
+ UT_LIST_GET_FIRST(index->table->space->chain)->name);
+
+ return NULL;
+ }
+
+ btr_assert_not_corrupted(block, index);
+
+#ifdef UNIV_BTR_DEBUG
+ if (!dict_index_is_ibuf(index)) {
+ const page_t* root = buf_block_get_frame(block);
+
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + root, index->table->space_id));
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, index->table->space_id));
+ }
+#endif /* UNIV_BTR_DEBUG */
+
+ return(block);
+}
+
+/**************************************************************//**
+Gets the root node of a tree and sx-latches it for segment access.
+@return root page, sx-latched */
+page_t*
+btr_root_get(
+/*=========*/
+ const dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr) /*!< in: mtr */
+{
+	/* Intended to be used for segment list access.
+	An SX latch does not block other threads from reading user data,
+	but it does block segment list access by others. */
+ buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
+ return(root ? buf_block_get_frame(root) : NULL);
+}
+
+/**************************************************************//**
+Gets the height of the B-tree (the level of the root, when the leaf
+level is assumed to be 0). The caller must hold an S or X latch on
+the index.
+@return tree height (level of the root) */
+ulint
+btr_height_get(
+/*===========*/
+ const dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint height=0;
+ buf_block_t* root_block;
+
+ ut_ad(srv_read_only_mode
+ || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK
+ | MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+
+ /* S latches the page */
+ root_block = btr_root_block_get(index, RW_S_LATCH, mtr);
+
+ if (root_block) {
+ height = btr_page_get_level(buf_block_get_frame(root_block));
+
+ /* Release the S latch on the root page. */
+ mtr->memo_release(root_block, MTR_MEMO_PAGE_S_FIX);
+
+ ut_d(sync_check_unlock(&root_block->lock));
+ }
+
+ return(height);
+}
+
+/**************************************************************//**
+Checks a file segment header within a B-tree root page and updates
+the segment header space id.
+@return TRUE if valid */
+static
+bool
+btr_root_fseg_adjust_on_import(
+/*===========================*/
+ fseg_header_t* seg_header, /*!< in/out: segment header */
+ page_zip_des_t* page_zip, /*!< in/out: compressed page,
+ or NULL */
+ ulint space) /*!< in: tablespace identifier */
+{
+ ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);
+
+ if (offset < FIL_PAGE_DATA
+ || offset > srv_page_size - FIL_PAGE_DATA_END) {
+ return false;
+ }
+
+ seg_header += FSEG_HDR_SPACE;
+
+ mach_write_to_4(seg_header, space);
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ memcpy(page_zip->data + page_offset(seg_header), seg_header,
+ 4);
+ }
+
+ return true;
+}
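
The fseg header fields above are read and written with InnoDB's mach_* helpers, which store every on-page multi-byte integer big-endian (most significant byte first). Below is a self-contained sketch of that encoding; read_2, read_4 and write_4 are invented stand-ins for mach_read_from_2(), mach_read_from_4() and mach_write_to_4(), and the 10-byte header layout is an assumption made for the illustration:

#include <cassert>
#include <cstdint>

// Big-endian helpers mimicking the mach_* functions used above.
static uint32_t read_2(const unsigned char* b)
{
	return (uint32_t(b[0]) << 8) | b[1];
}

static uint32_t read_4(const unsigned char* b)
{
	return (uint32_t(b[0]) << 24) | (uint32_t(b[1]) << 16)
		| (uint32_t(b[2]) << 8) | b[3];
}

static void write_4(unsigned char* b, uint32_t v)
{
	b[0] = (unsigned char)(v >> 24);
	b[1] = (unsigned char)(v >> 16);
	b[2] = (unsigned char)(v >> 8);
	b[3] = (unsigned char)(v);
}

int main()
{
	// A fake segment header: space id (4 bytes), page number (4),
	// byte offset (2); the field order is assumed, not quoted.
	unsigned char seg_header[10] = {};
	seg_header[8] = 0x00;
	seg_header[9] = 0x26;                   // offset 38, FIL_PAGE_DATA
	assert(read_2(seg_header + 8) == 38);   // passes the range check
	write_4(seg_header, 42);                // adjust the space id
	assert(read_4(seg_header) == 42);
}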
+
+/**************************************************************//**
+Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
+@return error code, or DB_SUCCESS */
+dberr_t
+btr_root_adjust_on_import(
+/*======================*/
+ const dict_index_t* index) /*!< in: index tree */
+{
+ dberr_t err;
+ mtr_t mtr;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ dict_table_t* table = index->table;
+
+ DBUG_EXECUTE_IF("ib_import_trigger_corruption_3",
+ return(DB_CORRUPTION););
+
+ mtr_start(&mtr);
+
+ mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+
+ buf_block_t* block = buf_page_get_gen(
+ page_id_t(table->space->id, index->page),
+ table->space->zip_size(), RW_X_LATCH, NULL, BUF_GET,
+ __FILE__, __LINE__,
+ &mtr, &err);
+ if (!block) {
+ ut_ad(err != DB_SUCCESS);
+ goto func_exit;
+ }
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+
+ if (!fil_page_index_page_check(page) || page_has_siblings(page)) {
+ err = DB_CORRUPTION;
+
+ } else if (dict_index_is_clust(index)) {
+		const bool page_is_compact_format
+			= page_is_comp(page) > 0;
+
+ /* Check if the page format and table format agree. */
+ if (page_is_compact_format != dict_table_is_comp(table)) {
+ err = DB_CORRUPTION;
+ } else {
+ /* Check that the table flags and the tablespace
+ flags match. */
+ ulint tf = dict_tf_to_fsp_flags(table->flags);
+ ulint sf = table->space->flags;
+ sf &= ~FSP_FLAGS_MEM_MASK;
+ tf &= ~FSP_FLAGS_MEM_MASK;
+ if (fil_space_t::is_flags_equal(tf, sf)
+ || fil_space_t::is_flags_equal(sf, tf)) {
+ mutex_enter(&fil_system.mutex);
+ table->space->flags = (table->space->flags
+ & ~FSP_FLAGS_MEM_MASK)
+ | (tf & FSP_FLAGS_MEM_MASK);
+ mutex_exit(&fil_system.mutex);
+ err = DB_SUCCESS;
+ } else {
+ err = DB_CORRUPTION;
+ }
+ }
+ } else {
+ err = DB_SUCCESS;
+ }
+
+ /* Check and adjust the file segment headers, if all OK so far. */
+ if (err == DB_SUCCESS
+ && (!btr_root_fseg_adjust_on_import(
+ FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + page, page_zip, table->space_id)
+ || !btr_root_fseg_adjust_on_import(
+ FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + page, page_zip, table->space_id))) {
+
+ err = DB_CORRUPTION;
+ }
+
+func_exit:
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/**************************************************************//**
+Creates a new index page (not the root, and also not
+used in page reorganization). @see btr_page_empty(). */
+void
+btr_page_create(
+/*============*/
+ buf_block_t* block, /*!< in/out: page to be created */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ byte *index_id= my_assume_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID +
+ block->frame);
+
+ if (UNIV_LIKELY_NULL(page_zip))
+ {
+ mach_write_to_8(index_id, index->id);
+ page_create_zip(block, index, level, 0, mtr);
+ }
+ else
+ {
+ page_create(block, mtr, dict_table_is_comp(index->table));
+ if (index->is_spatial())
+ {
+ static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
+ FIL_PAGE_RTREE, "compatibility");
+ mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+ byte(FIL_PAGE_RTREE));
+ if (mach_read_from_8(block->frame + FIL_RTREE_SPLIT_SEQ_NUM))
+ mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0);
+ }
+ /* Set the level of the new index page */
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block,
+ my_assume_aligned<2>(PAGE_HEADER +
+ PAGE_LEVEL +
+ block->frame), level);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block, index_id, index->id);
+ }
+}
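
The static_assert in btr_page_create() above rests on a deliberate choice of constants: FIL_PAGE_INDEX (0x45BF) and FIL_PAGE_RTREE (0x45BE) share their high byte, so an index page can be retyped as an R-tree page by writing a single byte at FIL_PAGE_TYPE + 1. A stand-alone sketch of the trick follows; the two constants match InnoDB's values, everything else is invented for the illustration:

#include <cassert>
#include <cstdint>

static const uint16_t FIL_PAGE_INDEX = 0x45BF;  // B-tree index page
static const uint16_t FIL_PAGE_RTREE = 0x45BE;  // R-tree index page

int main()
{
	// The same compatibility guarantee as the static_assert above.
	static_assert(((FIL_PAGE_INDEX & 0xff00) | uint8_t(FIL_PAGE_RTREE))
		      == FIL_PAGE_RTREE, "compatibility");

	// The 2-byte page type field is stored big-endian on the page.
	uint8_t type[2] = { FIL_PAGE_INDEX >> 8, FIL_PAGE_INDEX & 0xff };

	// Retype the page by logging only the low byte, as the
	// mtr->write<1>(..., FIL_PAGE_TYPE + 1, ...) call does above.
	type[1] = uint8_t(FIL_PAGE_RTREE);
	assert(((uint32_t(type[0]) << 8) | type[1]) == FIL_PAGE_RTREE);
}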
+
+/**************************************************************//**
+Allocates a new file page to be used in an ibuf tree. Takes the page from
+the free list of the tree, which must contain pages!
+@return new allocated block, x-latched */
+static
+buf_block_t*
+btr_page_alloc_for_ibuf(
+/*====================*/
+ dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* new_block;
+
+ buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
+
+ fil_addr_t node_addr = flst_get_first(PAGE_HEADER
+ + PAGE_BTR_IBUF_FREE_LIST
+ + root->frame);
+ ut_a(node_addr.page != FIL_NULL);
+
+ new_block = buf_page_get(
+ page_id_t(index->table->space_id, node_addr.page),
+ index->table->space->zip_size(),
+ RW_X_LATCH, mtr);
+
+ buf_block_dbg_add_level(new_block, SYNC_IBUF_TREE_NODE_NEW);
+
+ flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ new_block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+ mtr);
+ ut_d(flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
+
+ return(new_block);
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@retval NULL if no page could be allocated */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+buf_block_t*
+btr_page_alloc_low(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ uint32_t hint_page_no, /*!< in: hint of a good page */
+ byte file_direction, /*!< in: direction where a possible
+ page split is made */
+ ulint level, /*!< in: level where the page is placed
+ in the tree */
+ mtr_t* mtr, /*!< in/out: mini-transaction
+ for the allocation */
+ mtr_t* init_mtr) /*!< in/out: mtr or another
+ mini-transaction in which the
+ page should be initialized. */
+{
+ page_t* root = btr_root_get(index, mtr);
+
+ fseg_header_t* seg_header = (level
+ ? PAGE_HEADER + PAGE_BTR_SEG_TOP
+ : PAGE_HEADER + PAGE_BTR_SEG_LEAF)
+ + root;
+
+ /* Parameter TRUE below states that the caller has made the
+ reservation for free extents, and thus we know that a page can
+ be allocated: */
+
+ buf_block_t* block = fseg_alloc_free_page_general(
+ seg_header, hint_page_no, file_direction,
+ true, mtr, init_mtr);
+
+ return block;
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@retval NULL if no page could be allocated */
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+ dict_index_t* index, /*!< in: index */
+ uint32_t hint_page_no, /*!< in: hint of a good page */
+ byte file_direction, /*!< in: direction where a possible
+ page split is made */
+ ulint level, /*!< in: level where the page is placed
+ in the tree */
+ mtr_t* mtr, /*!< in/out: mini-transaction
+ for the allocation */
+ mtr_t* init_mtr) /*!< in/out: mini-transaction
+ for x-latching and initializing
+ the page */
+{
+ buf_block_t* new_block;
+
+ if (dict_index_is_ibuf(index)) {
+
+ return(btr_page_alloc_for_ibuf(index, mtr));
+ }
+
+ new_block = btr_page_alloc_low(
+ index, hint_page_no, file_direction, level, mtr, init_mtr);
+
+ if (new_block) {
+ buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
+ }
+
+ return(new_block);
+}
+
+/**************************************************************//**
+Gets the number of pages in a B-tree.
+@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
+ulint
+btr_get_size(
+/*=========*/
+ const dict_index_t* index, /*!< in: index */
+ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+ mtr_t* mtr) /*!< in/out: mini-transaction where index
+ is s-latched */
+{
+ ulint n=0;
+
+ ut_ad(srv_read_only_mode
+ || mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK));
+ ut_ad(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
+
+ if (index->page == FIL_NULL
+ || dict_index_is_online_ddl(index)
+ || !index->is_committed()
+ || !index->table->space) {
+ return(ULINT_UNDEFINED);
+ }
+
+ buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
+ if (!root) {
+ return ULINT_UNDEFINED;
+ }
+ mtr_x_lock_space(index->table->space, mtr);
+ if (flag == BTR_N_LEAF_PAGES) {
+ fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF
+ + root->frame, &n, mtr);
+ } else {
+ ulint dummy;
+ n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_TOP
+ + root->frame, &dummy, mtr);
+ n += fseg_n_reserved_pages(*root,
+ PAGE_HEADER + PAGE_BTR_SEG_LEAF
+ + root->frame, &dummy, mtr);
+ }
+
+ return(n);
+}
+
+/**************************************************************//**
+Gets the number of reserved and used pages in a B-tree.
+@return number of pages reserved, or ULINT_UNDEFINED if the index
+is unavailable */
+UNIV_INTERN
+ulint
+btr_get_size_and_reserved(
+/*======================*/
+ dict_index_t* index, /*!< in: index */
+ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr) /*!< in/out: mini-transaction where index
+ is s-latched */
+{
+ ulint dummy;
+
+ ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK));
+ ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
+
+ if (index->page == FIL_NULL
+ || dict_index_is_online_ddl(index)
+ || !index->is_committed()
+ || !index->table->space) {
+ return(ULINT_UNDEFINED);
+ }
+
+ buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
+ *used = 0;
+ if (!root) {
+ return ULINT_UNDEFINED;
+ }
+
+ mtr_x_lock_space(index->table->space, mtr);
+
+ ulint n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF
+ + root->frame, used, mtr);
+ if (flag == BTR_TOTAL_SIZE) {
+ n += fseg_n_reserved_pages(*root,
+ PAGE_HEADER + PAGE_BTR_SEG_TOP
+ + root->frame, &dummy, mtr);
+ *used += dummy;
+ }
+
+ return(n);
+}
+
+/**************************************************************//**
+Frees a page used in an ibuf tree. Puts the page to the free list of the
+ibuf tree. */
+static
+void
+btr_page_free_for_ibuf(
+/*===================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: block to be freed, x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+
+ buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
+
+ flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
+
+ ut_d(flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
+}
+
+/** Free an index page.
+@param[in,out] index index tree
+@param[in,out] block block to be freed
+@param[in,out] mtr mini-transaction
+@param[in] blob whether this is freeing a BLOB page */
+void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
+ bool blob)
+{
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+#ifdef BTR_CUR_HASH_ADAPT
+ if (block->index && !block->index->freed()) {
+ ut_ad(!blob);
+ ut_ad(page_is_leaf(block->frame));
+ }
+#endif
+ const page_id_t id(block->page.id());
+ ut_ad(index->table->space_id == id.space());
+ /* The root page is freed by btr_free_root(). */
+ ut_ad(id.page_no() != index->page);
+ ut_ad(mtr->is_named_space(index->table->space));
+
+ /* The page gets invalid for optimistic searches: increment the frame
+ modify clock */
+
+ buf_block_modify_clock_inc(block);
+
+ if (dict_index_is_ibuf(index)) {
+ btr_page_free_for_ibuf(index, block, mtr);
+ return;
+ }
+
+ /* TODO: Discard any operations for block from mtr->log.
+ The page will be freed, so previous changes to it by this
+ mini-transaction should not matter. */
+ page_t* root = btr_root_get(index, mtr);
+ fseg_header_t* seg_header = &root[blob || page_is_leaf(block->frame)
+ ? PAGE_HEADER + PAGE_BTR_SEG_LEAF
+ : PAGE_HEADER + PAGE_BTR_SEG_TOP];
+ fil_space_t* space= index->table->space;
+ const uint32_t page= id.page_no();
+
+ fseg_free_page(seg_header, space, page, mtr);
+ buf_page_free(space, page, mtr, __FILE__, __LINE__);
+
+ /* The page was marked free in the allocation bitmap, but it
+ should remain exclusively latched until mtr_t::commit() or until it
+ is explicitly freed from the mini-transaction. */
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+}
+
+/** Set the child page number in a node pointer record.
+@param[in,out] block non-leaf index page
+@param[in,out] rec node pointer record in the page
+@param[in] offsets rec_get_offsets(rec)
+@param[in] page_no child page number
+@param[in,out]	mtr	mini-transaction */
+inline void btr_node_ptr_set_child_page_no(buf_block_t *block,
+ rec_t *rec, const rec_offs *offsets,
+ ulint page_no, mtr_t *mtr)
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(!page_rec_is_leaf(rec));
+ ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+ const ulint offs= rec_offs_data_size(offsets);
+ ut_ad(rec_offs_nth_size(offsets, rec_offs_n_fields(offsets) - 1) ==
+ REC_NODE_PTR_SIZE);
+
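+  /* The child page number is the last field of a node pointer
+  record, occupying its final REC_NODE_PTR_SIZE (4) bytes. */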
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ page_zip_write_node_ptr(block, rec, offs, page_no, mtr);
+ else
+ mtr->write<4>(*block, rec + offs - REC_NODE_PTR_SIZE, page_no);
+}
+
+/************************************************************//**
+Returns the child page of a node pointer and sx-latches it.
+@return child page, sx-latched */
+static
+buf_block_t*
+btr_node_ptr_get_child(
+/*===================*/
+ const rec_t* node_ptr,/*!< in: node pointer */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(rec_offs_validate(node_ptr, index, offsets));
+ ut_ad(index->table->space_id
+ == page_get_space_id(page_align(node_ptr)));
+
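+	/* The child is a leaf page if and only if this node pointer
+	page is at level 1. */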
+ return btr_block_get(
+ *index, btr_node_ptr_get_child_page_no(node_ptr, offsets),
+ RW_SX_LATCH, btr_page_get_level(page_align(node_ptr)) == 1,
+ mtr);
+}
+
+/************************************************************//**
+Returns the upper level node pointer to a page. It is assumed that mtr holds
+an sx-latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+static
+rec_offs*
+btr_page_get_father_node_ptr_func(
+/*==============================*/
+ rec_offs* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ btr_cur_t* cursor, /*!< in: cursor pointing to user record,
+ out: cursor on node pointer record,
+ its page x-latched */
+ ulint latch_mode,/*!< in: BTR_CONT_MODIFY_TREE
+ or BTR_CONT_SEARCH_TREE */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dtuple_t* tuple;
+ rec_t* user_rec;
+ rec_t* node_ptr;
+ ulint level;
+ ulint page_no;
+ dict_index_t* index;
+
+ ut_ad(latch_mode == BTR_CONT_MODIFY_TREE
+ || latch_mode == BTR_CONT_SEARCH_TREE);
+
+ page_no = btr_cur_get_block(cursor)->page.id().page_no();
+ index = btr_cur_get_index(cursor);
+ ut_ad(!dict_index_is_spatial(index));
+
+ ut_ad(srv_read_only_mode
+ || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+
+ ut_ad(dict_index_get_page(index) != page_no);
+
+ level = btr_page_get_level(btr_cur_get_page(cursor));
+
+ user_rec = btr_cur_get_rec(cursor);
+ ut_a(page_rec_is_user_rec(user_rec));
+
+ tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);
+	dberr_t err = btr_cur_search_to_nth_level(
+		index, level + 1, tuple,
+		PAGE_CUR_LE, latch_mode, cursor, 0,
+		file, line, mtr);
+
+ if (err != DB_SUCCESS) {
+		ib::warn() << "Error code: " << err
+			<< " in btr_page_get_father_node_ptr_func,"
+			<< " level: " << level + 1
+			<< ", called from file: "
+			<< file << " line: " << line
+			<< ", table: " << index->table->name
+			<< ", index: " << index->name();
+ }
+
+ node_ptr = btr_cur_get_rec(cursor);
+
+ offsets = rec_get_offsets(node_ptr, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
+ rec_t* print_rec;
+
+ ib::error()
+ << "Corruption of an index tree: table "
+ << index->table->name
+ << " index " << index->name
+ << ", father ptr page no "
+ << btr_node_ptr_get_child_page_no(node_ptr, offsets)
+ << ", child page no " << page_no;
+
+ print_rec = page_rec_get_next(
+ page_get_infimum_rec(page_align(user_rec)));
+ offsets = rec_get_offsets(print_rec, index, offsets,
+ page_rec_is_leaf(user_rec)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+ page_rec_print(print_rec, offsets);
+ offsets = rec_get_offsets(node_ptr, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ page_rec_print(node_ptr, offsets);
+
+		ib::fatal()
+			<< "You should dump + drop + reimport the table to"
+			<< " fix the corruption. If the crash happens at"
+			<< " database startup, " << FORCE_RECOVERY_MSG
+			<< " and then dump + drop + reimport.";
+ }
+
+ return(offsets);
+}
+
+#define btr_page_get_father_node_ptr(of,heap,cur,mtr) \
+ btr_page_get_father_node_ptr_func( \
+ of,heap,cur,BTR_CONT_MODIFY_TREE,__FILE__,__LINE__,mtr)
+
+#define btr_page_get_father_node_ptr_for_validate(of,heap,cur,mtr) \
+ btr_page_get_father_node_ptr_func( \
+ of,heap,cur,BTR_CONT_SEARCH_TREE,__FILE__,__LINE__,mtr)
+
+/************************************************************//**
+Returns the upper level node pointer to a page. It is assumed that mtr holds
+an x-latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+static
+rec_offs*
+btr_page_get_father_block(
+/*======================*/
+ rec_offs* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ dict_index_t* index, /*!< in: b-tree index */
+ buf_block_t* block, /*!< in: child page in the index */
+ mtr_t* mtr, /*!< in: mtr */
+ btr_cur_t* cursor) /*!< out: cursor on node pointer record,
+ its page x-latched */
+{
+ rec_t* rec
+ = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
+ block)));
+ btr_cur_position(index, rec, block, cursor);
+ return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr));
+}
+
+/** Seek to the parent page of a B-tree page.
+@param[in,out] index b-tree
+@param[in] block child page
+@param[in,out] mtr mini-transaction
+@param[out] cursor cursor pointing to the x-latched parent page */
+void btr_page_get_father(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
+ btr_cur_t* cursor)
+{
+ mem_heap_t* heap;
+ rec_t* rec
+ = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
+ block)));
+ btr_cur_position(index, rec, block, cursor);
+
+ heap = mem_heap_create(100);
+ btr_page_get_father_node_ptr(NULL, heap, cursor, mtr);
+ mem_heap_free(heap);
+}
+
+#ifdef UNIV_DEBUG
+/** PAGE_INDEX_ID value for freed index B-trees */
+constexpr index_id_t BTR_FREED_INDEX_ID = 0;
+#endif
+
+/** Free a B-tree root page. btr_free_but_not_root() must already
+have been called.
+In a persistent tablespace, the caller must invoke fsp_init_file_page()
+before mtr.commit().
+@param[in,out] block index root page
+@param[in,out] mtr mini-transaction */
+static void btr_free_root(buf_block_t *block, mtr_t *mtr)
+{
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+ ut_ad(mtr->is_named_space(block->page.id().space()));
+
+ btr_search_drop_page_hash_index(block);
+
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_root_fseg_validate(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->frame,
+ block->page.id().space()));
+#endif /* UNIV_BTR_DEBUG */
+
+ /* Free the entire segment in small steps. */
+ while (!fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->frame, mtr));
+}
+
+/** Prepare to free a B-tree.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] index_id PAGE_INDEX_ID contents
+@param[in,out] mtr mini-transaction
+@return root block, to invoke btr_free_but_not_root() and btr_free_root()
+@retval NULL if the page is no longer a matching B-tree page */
+static MY_ATTRIBUTE((warn_unused_result))
+buf_block_t*
+btr_free_root_check(
+ const page_id_t page_id,
+ ulint zip_size,
+ index_id_t index_id,
+ mtr_t* mtr)
+{
+ ut_ad(page_id.space() != SRV_TMP_SPACE_ID);
+ ut_ad(index_id != BTR_FREED_INDEX_ID);
+
+ buf_block_t* block = buf_page_get(
+ page_id, zip_size, RW_X_LATCH, mtr);
+
+ if (block) {
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+
+ if (fil_page_index_page_check(block->frame)
+ && index_id == btr_page_get_index_id(block->frame)) {
+ /* This should be a root page.
+ It should not be possible to reassign the same
+ index_id for some other index in the tablespace. */
+ ut_ad(!page_has_siblings(block->frame));
+ } else {
+ block = NULL;
+ }
+ }
+
+ return(block);
+}
+
+/** Create the root node for a new index tree.
+@param[in] type type of the index
+@param[in] index_id index id
+@param[in,out] space tablespace where created
+@param[in] index index, or NULL to create a system table
+@param[in,out] mtr mini-transaction
+@return page number of the created root
+@retval FIL_NULL if did not succeed */
+uint32_t
+btr_create(
+ ulint type,
+ fil_space_t* space,
+ index_id_t index_id,
+ dict_index_t* index,
+ mtr_t* mtr)
+{
+ buf_block_t* block;
+
+ ut_ad(mtr->is_named_space(space));
+ ut_ad(index_id != BTR_FREED_INDEX_ID);
+
+ /* Create the two new segments (one, in the case of an ibuf tree) for
+ the index tree; the segment headers are put on the allocated root page
+ (for an ibuf tree, not in the root, but on a separate ibuf header
+ page) */
+
+ if (UNIV_UNLIKELY(type & DICT_IBUF)) {
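+		/* The change buffer tree resides in the system
+		tablespace, with its header page and tree root at
+		fixed page numbers (see the assertions below). */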
+ /* Allocate first the ibuf header page */
+ buf_block_t* ibuf_hdr_block = fseg_create(
+ space, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);
+
+ if (ibuf_hdr_block == NULL) {
+ return(FIL_NULL);
+ }
+
+ buf_block_dbg_add_level(
+ ibuf_hdr_block, SYNC_IBUF_TREE_NODE_NEW);
+
+ ut_ad(ibuf_hdr_block->page.id().page_no()
+ == IBUF_HEADER_PAGE_NO);
+ /* Allocate then the next page to the segment: it will be the
+ tree root page */
+
+ block = fseg_alloc_free_page(
+ buf_block_get_frame(ibuf_hdr_block)
+ + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
+ IBUF_TREE_ROOT_PAGE_NO,
+ FSP_UP, mtr);
+
+ if (block == NULL) {
+ return(FIL_NULL);
+ }
+
+ ut_ad(block->page.id() == page_id_t(0,IBUF_TREE_ROOT_PAGE_NO));
+
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
+
+ flst_init(block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
+ } else {
+ block = fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_TOP,
+ mtr);
+
+ if (block == NULL) {
+ return(FIL_NULL);
+ }
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
+
+ if (!fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr,
+ false, block)) {
+ /* Not enough space for new segment, free root
+ segment before return. */
+ btr_free_root(block, mtr);
+ return(FIL_NULL);
+ }
+
+ /* The fseg create acquires a second latch on the page,
+ therefore we must declare it: */
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
+ }
+
+ ut_ad(!page_has_siblings(block->frame));
+
+ constexpr uint16_t field = PAGE_HEADER + PAGE_INDEX_ID;
+
+ byte* page_index_id = my_assume_aligned<2>(field + block->frame);
+
+ /* Create a new index page on the allocated segment page */
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ mach_write_to_8(page_index_id, index_id);
+ ut_ad(!page_has_siblings(block->page.zip.data));
+ page_create_zip(block, index, 0, 0, mtr);
+ } else {
+ page_create(block, mtr,
+ index && index->table->not_redundant());
+ if (index && index->is_spatial()) {
+ static_assert(((FIL_PAGE_INDEX & 0xff00)
+ | byte(FIL_PAGE_RTREE))
+ == FIL_PAGE_RTREE, "compatibility");
+ mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+ byte(FIL_PAGE_RTREE));
+ if (mach_read_from_8(block->frame
+ + FIL_RTREE_SPLIT_SEQ_NUM)) {
+ mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
+ 8, 0);
+ }
+ }
+ /* Set the level of the new index page */
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL
+ + block->frame, 0U);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block, page_index_id,
+ index_id);
+ }
+
+ /* We reset the free bits for the page in a separate
+ mini-transaction to allow creation of several trees in the
+ same mtr, otherwise the latch on a bitmap page would prevent
+ it because of the latching order.
+
+	Note: insert buffering is disabled for temporary tables, because
+	they tend to be small and short-lived. */
+ if (!(type & DICT_CLUSTERED)
+ && (!index || !index->table->is_temporary())) {
+ ibuf_reset_free_bits(block);
+ }
+
+ /* In the following assertion we test that two records of maximum
+ allowed size fit on the root page: this fact is needed to ensure
+ correctness of split algorithms */
+
+ ut_ad(page_get_max_insert_size(block->frame, 2)
+ > 2 * BTR_PAGE_MAX_REC_SIZE);
+
+ return(block->page.id().page_no());
+}
+
+/** Free a B-tree except the root page. The root page MUST be freed after
+this by calling btr_free_root.
+@param[in,out] block root page
+@param[in] log_mode mtr logging mode */
+static
+void
+btr_free_but_not_root(
+ buf_block_t* block,
+ mtr_log_t log_mode)
+{
+ mtr_t mtr;
+
+ ut_ad(fil_page_index_page_check(block->frame));
+ ut_ad(!page_has_siblings(block->frame));
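+	/* Free the leaf-level segment first, then the non-leaf segment,
+	one step per mini-transaction so that each step stays small. */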
+leaf_loop:
+ mtr_start(&mtr);
+ mtr_set_log_mode(&mtr, log_mode);
+ mtr.set_named_space_id(block->page.id().space());
+
+ page_t* root = block->frame;
+
+ if (!root) {
+ mtr_commit(&mtr);
+ return;
+ }
+
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + root, block->page.id().space()));
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, block->page.id().space()));
+#endif /* UNIV_BTR_DEBUG */
+
+ /* NOTE: page hash indexes are dropped when a page is freed inside
+ fsp0fsp. */
+
+ bool finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
+ &mtr);
+ mtr_commit(&mtr);
+
+ if (!finished) {
+
+ goto leaf_loop;
+ }
+top_loop:
+ mtr_start(&mtr);
+ mtr_set_log_mode(&mtr, log_mode);
+ mtr.set_named_space_id(block->page.id().space());
+
+ root = block->frame;
+
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, block->page.id().space()));
+#endif /* UNIV_BTR_DEBUG */
+
+ finished = fseg_free_step_not_header(
+ root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
+ mtr_commit(&mtr);
+
+ if (!finished) {
+ goto top_loop;
+ }
+}
+
+/** Free a persistent index tree if it exists.
+@param[in] page_id root page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] index_id PAGE_INDEX_ID contents
+@param[in,out] mtr mini-transaction */
+void
+btr_free_if_exists(
+ const page_id_t page_id,
+ ulint zip_size,
+ index_id_t index_id,
+ mtr_t* mtr)
+{
+ buf_block_t* root = btr_free_root_check(
+ page_id, zip_size, index_id, mtr);
+
+ if (root == NULL) {
+ return;
+ }
+
+ btr_free_but_not_root(root, mtr->get_log_mode());
+ mtr->set_named_space_id(page_id.space());
+ btr_free_root(root, mtr);
+}
+
+/** Free an index tree in a temporary tablespace.
+@param[in] page_id root page id */
+void btr_free(const page_id_t page_id)
+{
+ mtr_t mtr;
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr);
+
+ if (block) {
+ btr_free_but_not_root(block, MTR_LOG_NO_REDO);
+ btr_free_root(block, &mtr);
+ }
+ mtr.commit();
+}
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
+@param[in,out] index clustered index
+@return the last used AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc(dict_index_t* index)
+{
+ ut_ad(index->is_primary());
+ ut_ad(index->table->persistent_autoinc);
+ ut_ad(!index->table->is_temporary());
+ mtr_t mtr;
+ mtr.start();
+ ib_uint64_t autoinc;
+ if (buf_block_t* block = buf_page_get(
+ page_id_t(index->table->space_id, index->page),
+ index->table->space->zip_size(),
+ RW_S_LATCH, &mtr)) {
+ autoinc = page_get_autoinc(block->frame);
+ } else {
+ autoinc = 0;
+ }
+ mtr.commit();
+ return autoinc;
+}
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
+or fall back to MAX(auto_increment_column).
+@param[in] table table containing an AUTO_INCREMENT column
+@param[in] col_no index of the AUTO_INCREMENT column
+@return the AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
+{
+ ut_ad(table->persistent_autoinc);
+ ut_ad(!table->is_temporary());
+
+ dict_index_t* index = dict_table_get_first_index(table);
+
+ if (index == NULL) {
+ return 0;
+ }
+
+ mtr_t mtr;
+ mtr.start();
+ buf_block_t* block = buf_page_get(
+ page_id_t(index->table->space_id, index->page),
+ index->table->space->zip_size(),
+ RW_S_LATCH, &mtr);
+
+ ib_uint64_t autoinc = block ? page_get_autoinc(block->frame) : 0;
+ const bool retry = block && autoinc == 0
+ && !page_is_empty(block->frame);
+ mtr.commit();
+
+ if (retry) {
+ /* This should be an old data file where
+ PAGE_ROOT_AUTO_INC was initialized to 0.
+ Fall back to reading MAX(autoinc_col).
+ There should be an index on it. */
+ const dict_col_t* autoinc_col
+ = dict_table_get_nth_col(table, col_no);
+ while (index && index->fields[0].col != autoinc_col) {
+ index = dict_table_get_next_index(index);
+ }
+
+ if (index) {
+ autoinc = row_search_max_autoinc(index);
+ }
+ }
+
+ return autoinc;
+}
+
+/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC.
+@param[in,out] index clustered index
+@param[in] autoinc the AUTO_INCREMENT value
+@param[in] reset whether to reset the AUTO_INCREMENT
+ to a possibly smaller value than currently
+ exists in the page */
+void
+btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset)
+{
+ ut_ad(index->is_primary());
+ ut_ad(index->table->persistent_autoinc);
+ ut_ad(!index->table->is_temporary());
+
+ mtr_t mtr;
+ mtr.start();
+ fil_space_t* space = index->table->space;
+ mtr.set_named_space(space);
+ page_set_autoinc(buf_page_get(page_id_t(space->id, index->page),
+ space->zip_size(),
+ RW_SX_LATCH, &mtr),
+ autoinc, &mtr, reset);
+ mtr.commit();
+}
+
+/** Reorganize an index page.
+@param cursor index page cursor
+@param index the index that the cursor belongs to
+@param mtr mini-transaction */
+static void btr_page_reorganize_low(page_cur_t *cursor, dict_index_t *index,
+ mtr_t *mtr)
+{
+ const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NO_REDO);
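+  /* Redo logging is disabled while the page is rebuilt in place;
+  once log_mode is restored below, only the bytes that actually
+  changed are logged. */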
+
+ buf_block_t *const block= cursor->block;
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!is_buf_block_get_page_zip(block));
+ btr_assert_not_corrupted(block, index);
+ ut_ad(fil_page_index_page_check(block->frame));
+ ut_ad(index->is_dummy ||
+ block->page.id().space() == index->table->space->id);
+ ut_ad(index->is_dummy || block->page.id().page_no() != index->page ||
+ !page_has_siblings(block->frame));
+
+ buf_block_t *old= buf_block_alloc();
+ /* Copy the old page to temporary space */
+ memcpy_aligned<UNIV_PAGE_SIZE_MIN>(old->frame, block->frame, srv_page_size);
+
+ btr_search_drop_page_hash_index(block);
+
+ /* Save the cursor position. */
+ const ulint pos= page_rec_get_n_recs_before(cursor->rec);
+
+ page_create(block, mtr, index->table->not_redundant());
+ if (index->is_spatial())
+ block->frame[FIL_PAGE_TYPE + 1]= byte(FIL_PAGE_RTREE);
+
+ static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
+ FIL_PAGE_RTREE, "compatibility");
+
+ /* Copy the records from the temporary space to the recreated page;
+ do not copy the lock bits yet */
+
+ page_copy_rec_list_end_no_locks(block, old, page_get_infimum_rec(old->frame),
+ index, mtr);
+
+ /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
+ ut_ad(!page_get_max_trx_id(block->frame));
+ memcpy_aligned<8>(PAGE_MAX_TRX_ID + PAGE_HEADER + block->frame,
+ PAGE_MAX_TRX_ID + PAGE_HEADER + old->frame, 8);
+#ifdef UNIV_DEBUG
+ if (page_get_max_trx_id(block->frame))
+ /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
+ clustered index root pages. */
+ ut_ad(dict_index_is_sec_or_ibuf(index)
+ ? page_is_leaf(block->frame)
+ : block->page.id().page_no() == index->page);
+ else
+ /* PAGE_MAX_TRX_ID is unused in clustered index pages (other than
+ the root where it is repurposed as PAGE_ROOT_AUTO_INC), non-leaf
+ pages, and in temporary tables. It was always zero-initialized in
+ page_create(). PAGE_MAX_TRX_ID must be nonzero on
+ dict_index_is_sec_or_ibuf() leaf pages. */
+ ut_ad(index->table->is_temporary() || !page_is_leaf(block->frame) ||
+ !dict_index_is_sec_or_ibuf(index));
+#endif
+
+ const uint16_t data_size1= page_get_data_size(old->frame);
+ const uint16_t data_size2= page_get_data_size(block->frame);
+ const ulint max1= page_get_max_insert_size_after_reorganize(old->frame, 1);
+ const ulint max2= page_get_max_insert_size_after_reorganize(block->frame, 1);
+
+ if (UNIV_UNLIKELY(data_size1 != data_size2 || max1 != max2))
+ ib::fatal() << "Page old data size " << data_size1
+ << " new data size " << data_size2
+ << ", page old max ins size " << max1
+ << " new max ins size " << max2;
+
+ /* Restore the cursor position. */
+ if (pos)
+ cursor->rec = page_rec_get_nth(block->frame, pos);
+ else
+ ut_ad(cursor->rec == page_get_infimum_rec(block->frame));
+
+ if (block->page.id().page_no() == index->page &&
+ fil_page_get_type(old->frame) == FIL_PAGE_TYPE_INSTANT)
+ {
+ /* Preserve the PAGE_INSTANT information. */
+ ut_ad(index->is_instant());
+ memcpy_aligned<2>(FIL_PAGE_TYPE + block->frame,
+ FIL_PAGE_TYPE + old->frame, 2);
+ memcpy_aligned<2>(PAGE_HEADER + PAGE_INSTANT + block->frame,
+ PAGE_HEADER + PAGE_INSTANT + old->frame, 2);
+ if (!index->table->instant);
+ else if (page_is_comp(block->frame))
+ {
+ memcpy(PAGE_NEW_INFIMUM + block->frame,
+ PAGE_NEW_INFIMUM + old->frame, 8);
+ memcpy(PAGE_NEW_SUPREMUM + block->frame,
+ PAGE_NEW_SUPREMUM + old->frame, 8);
+ }
+ else
+ {
+ memcpy(PAGE_OLD_INFIMUM + block->frame,
+ PAGE_OLD_INFIMUM + old->frame, 8);
+ memcpy(PAGE_OLD_SUPREMUM + block->frame,
+ PAGE_OLD_SUPREMUM + old->frame, 8);
+ }
+ }
+
+ ut_ad(!memcmp(old->frame, block->frame, PAGE_HEADER));
+ ut_ad(!memcmp(old->frame + PAGE_MAX_TRX_ID + PAGE_HEADER,
+ block->frame + PAGE_MAX_TRX_ID + PAGE_HEADER,
+ PAGE_DATA - (PAGE_MAX_TRX_ID + PAGE_HEADER)));
+
+ if (!dict_table_is_locking_disabled(index->table))
+ lock_move_reorganize_page(block, old);
+
+ /* Write log for the changes, if needed. */
+ mtr->set_log_mode(log_mode);
+ if (log_mode == MTR_LOG_ALL)
+ {
+ /* Check and log the changes in the page header. */
+ ulint a, e;
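+    /* Scan from both ends to find the smallest byte range [a, e)
+    that differs between the old and the new frame, and write a
+    log record covering only that range. */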
+ for (a= PAGE_HEADER, e= PAGE_MAX_TRX_ID + PAGE_HEADER; a < e; a++)
+ {
+ if (old->frame[a] == block->frame[a])
+ continue;
+ while (--e, old->frame[e] == block->frame[e]);
+ e++;
+ ut_ad(a < e);
+ /* Write log for the changed page header fields. */
+ mtr->memcpy(*block, a, e - a);
+ break;
+ }
+
+ const uint16_t top= page_header_get_offs(block->frame, PAGE_HEAP_TOP);
+
+ if (page_is_comp(block->frame))
+ {
+ /* info_bits=0, n_owned=1, heap_no=0, status */
+ ut_ad(!memcmp(PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + block->frame,
+ PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + old->frame, 3));
+ /* If the 'next' pointer of the infimum record has changed, log it. */
+ a= PAGE_NEW_INFIMUM - 2;
+ e= a + 2;
+ if (block->frame[a] == old->frame[a])
+ a++;
+ if (--e, block->frame[e] != old->frame[e])
+ e++;
+ if (ulint len= e - a)
+ mtr->memcpy(*block, a, len);
+ /* The infimum record itself must not change. */
+ ut_ad(!memcmp(PAGE_NEW_INFIMUM + block->frame,
+ PAGE_NEW_INFIMUM + old->frame, 8));
+ /* Log any change of the n_owned of the supremum record. */
+ a= PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES;
+ if (block->frame[a] != old->frame[a])
+ mtr->memcpy(*block, a, 1);
+ /* The rest of the supremum record must not change. */
+ ut_ad(!memcmp(&block->frame[a + 1], &old->frame[a + 1],
+ PAGE_NEW_SUPREMUM_END - PAGE_NEW_SUPREMUM +
+ REC_N_NEW_EXTRA_BYTES - 1));
+
+ /* Log the differences in the payload. */
+ for (a= PAGE_NEW_SUPREMUM_END, e= top; a < e; a++)
+ {
+ if (old->frame[a] == block->frame[a])
+ continue;
+ while (--e, old->frame[e] == block->frame[e]);
+ e++;
+ ut_ad(a < e);
+ /* TODO: write MEMMOVE records to minimize this further! */
+ mtr->memcpy(*block, a, e - a);
+ break;
+ }
+ }
+ else
+ {
+ /* info_bits=0, n_owned=1, heap_no=0, number of fields, 1-byte format */
+ ut_ad(!memcmp(PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + block->frame,
+ PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + old->frame, 4));
+ /* If the 'next' pointer of the infimum record has changed, log it. */
+ a= PAGE_OLD_INFIMUM - 2;
+ e= a + 2;
+ if (block->frame[a] == old->frame[a])
+ a++;
+ if (--e, block->frame[e] != old->frame[e])
+ e++;
+ if (ulint len= e - a)
+ mtr->memcpy(*block, a, len);
+ /* The infimum record itself must not change. */
+ ut_ad(!memcmp(PAGE_OLD_INFIMUM + block->frame,
+ PAGE_OLD_INFIMUM + old->frame, 8));
+ /* Log any change of the n_owned of the supremum record. */
+ a= PAGE_OLD_SUPREMUM - REC_N_OLD_EXTRA_BYTES;
+ if (block->frame[a] != old->frame[a])
+ mtr->memcpy(*block, a, 1);
+ ut_ad(!memcmp(&block->frame[a + 1], &old->frame[a + 1],
+ PAGE_OLD_SUPREMUM_END - PAGE_OLD_SUPREMUM +
+ REC_N_OLD_EXTRA_BYTES - 1));
+
+ /* Log the differences in the payload. */
+ for (a= PAGE_OLD_SUPREMUM_END, e= top; a < e; a++)
+ {
+ if (old->frame[a] == block->frame[a])
+ continue;
+ while (--e, old->frame[e] == block->frame[e]);
+ e++;
+ ut_ad(a < e);
+ /* TODO: write MEMMOVE records to minimize this further! */
+ mtr->memcpy(*block, a, e - a);
+ break;
+ }
+ }
+
+ e= srv_page_size - PAGE_DIR;
+ a= e - PAGE_DIR_SLOT_SIZE * page_dir_get_n_slots(block->frame);
+
+    /* Zero out the now-unused area between the page heap top and
+    the page directory. */
+ mtr->memset(*block, top, a - top, 0);
+
+ /* Log changes to the page directory. */
+ for (; a < e; a++)
+ {
+ if (old->frame[a] == block->frame[a])
+ continue;
+ while (--e, old->frame[e] == block->frame[e]);
+ e++;
+ ut_ad(a < e);
+ /* Write log for the changed page directory slots. */
+ mtr->memcpy(*block, a, e - a);
+ break;
+ }
+ }
+
+ buf_block_free(old);
+
+ MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS);
+ MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL);
+}
+
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+bool
+btr_page_reorganize_block(
+ ulint z_level,/*!< in: compression level to be used
+ if dealing with compressed page */
+ buf_block_t* block, /*!< in/out: B-tree page */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ if (buf_block_get_page_zip(block)) {
+ return page_zip_reorganize(block, index, z_level, mtr, true);
+ }
+
+ page_cur_t cur;
+ page_cur_set_before_first(block, &cur);
+
+ btr_page_reorganize_low(&cur, index, mtr);
+ return true;
+}
+
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+bool
+btr_page_reorganize(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ if (!buf_block_get_page_zip(cursor->block)) {
+ btr_page_reorganize_low(cursor, index, mtr);
+ return true;
+ }
+
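+	/* page_zip_reorganize() rebuilds the page, invalidating any
+	pointers into it; save the cursor position as an ordinal
+	number and restore it afterwards. */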
+ ulint pos = page_rec_get_n_recs_before(cursor->rec);
+ if (!page_zip_reorganize(cursor->block, index, page_zip_level, mtr,
+ true)) {
+ return false;
+ }
+ if (pos) {
+ cursor->rec = page_rec_get_nth(cursor->block->frame, pos);
+ } else {
+ ut_ad(cursor->rec == page_get_infimum_rec(
+ cursor->block->frame));
+ }
+
+ return true;
+}
+
+/** Empty an index page (possibly the root page). @see btr_page_create().
+@param[in,out] block page to be emptied
+@param[in,out] page_zip compressed page frame, or NULL
+@param[in] index index of the page
+@param[in] level B-tree level of the page (0=leaf)
+@param[in,out] mtr mini-transaction */
+void
+btr_page_empty(
+ buf_block_t* block,
+ page_zip_des_t* page_zip,
+ dict_index_t* index,
+ ulint level,
+ mtr_t* mtr)
+{
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(page_zip == buf_block_get_page_zip(block));
+ ut_ad(!index->is_dummy);
+ ut_ad(index->table->space->id == block->page.id().space());
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ btr_search_drop_page_hash_index(block);
+
+	/* Recreate the page: note that the global data on the page
+	(possible segment headers, the next page field, etc.) is
+	preserved intact */
+
+ /* Preserve PAGE_ROOT_AUTO_INC when creating a clustered index
+ root page. */
+ const ib_uint64_t autoinc
+ = dict_index_is_clust(index)
+ && index->page == block->page.id().page_no()
+ ? page_get_autoinc(block->frame)
+ : 0;
+
+ if (page_zip) {
+ page_create_zip(block, index, level, autoinc, mtr);
+ } else {
+ page_create(block, mtr, index->table->not_redundant());
+ if (index->is_spatial()) {
+ static_assert(((FIL_PAGE_INDEX & 0xff00)
+ | byte(FIL_PAGE_RTREE))
+ == FIL_PAGE_RTREE, "compatibility");
+ mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+ byte(FIL_PAGE_RTREE));
+ if (mach_read_from_8(block->frame
+ + FIL_RTREE_SPLIT_SEQ_NUM)) {
+ mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
+ 8, 0);
+ }
+ }
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL
+ + block->frame, level);
+ if (autoinc) {
+ mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID
+ + block->frame, autoinc);
+ }
+ }
+}
+
+/** Write instant ALTER TABLE metadata to a root page.
+@param[in,out] root clustered index root page
+@param[in] index clustered index with instant ALTER TABLE
+@param[in,out] mtr mini-transaction */
+void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr)
+{
+ ut_ad(index.n_core_fields > 0);
+ ut_ad(index.n_core_fields < REC_MAX_N_FIELDS);
+ ut_ad(index.is_instant());
+ ut_ad(fil_page_get_type(root->frame) == FIL_PAGE_TYPE_INSTANT
+ || fil_page_get_type(root->frame) == FIL_PAGE_INDEX);
+ ut_ad(!page_has_siblings(root->frame));
+ ut_ad(root->page.id().page_no() == index.page);
+
+ rec_t* infimum = page_get_infimum_rec(root->frame);
+ rec_t* supremum = page_get_supremum_rec(root->frame);
+ byte* page_type = root->frame + FIL_PAGE_TYPE;
+ uint16_t i = page_header_get_field(root->frame, PAGE_INSTANT);
+
+ switch (mach_read_from_2(page_type)) {
+ case FIL_PAGE_TYPE_INSTANT:
+ ut_ad(page_get_instant(root->frame) == index.n_core_fields);
+ if (memcmp(infimum, "infimum", 8)
+ || memcmp(supremum, "supremum", 8)) {
+ ut_ad(index.table->instant);
+ ut_ad(!memcmp(infimum, field_ref_zero, 8));
+ ut_ad(!memcmp(supremum, field_ref_zero, 7));
+ /* The n_core_null_bytes only matters for
+ ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables. */
+ ut_ad(supremum[7] == index.n_core_null_bytes
+ || !index.table->not_redundant());
+ return;
+ }
+ break;
+ default:
+ ut_ad("wrong page type" == 0);
+ /* fall through */
+ case FIL_PAGE_INDEX:
+ ut_ad(!page_is_comp(root->frame)
+ || !page_get_instant(root->frame));
+ ut_ad(!memcmp(infimum, "infimum", 8));
+ ut_ad(!memcmp(supremum, "supremum", 8));
+ mtr->write<2>(*root, page_type, FIL_PAGE_TYPE_INSTANT);
+ ut_ad(i <= PAGE_NO_DIRECTION);
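+		/* The number of core fields is stored in the upper
+		bits of PAGE_INSTANT; the low 3 bits retain the
+		PAGE_DIRECTION. */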
+ i |= static_cast<uint16_t>(index.n_core_fields << 3);
+ mtr->write<2>(*root, PAGE_HEADER + PAGE_INSTANT + root->frame,
+ i);
+ break;
+ }
+
+ if (index.table->instant) {
+ mtr->memset(root, infimum - root->frame, 8, 0);
+ mtr->memset(root, supremum - root->frame, 7, 0);
+ mtr->write<1,mtr_t::MAYBE_NOP>(*root, &supremum[7],
+ index.n_core_null_bytes);
+ }
+}
+
+/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
+@param[in] index clustered index with instant ALTER TABLE
+@param[in] all whether to reset FIL_PAGE_TYPE as well
+@param[in,out] mtr mini-transaction */
+ATTRIBUTE_COLD
+void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr)
+{
+ ut_ad(!index.table->is_temporary());
+ ut_ad(index.is_primary());
+ if (buf_block_t *root = btr_root_block_get(&index, RW_SX_LATCH, mtr))
+ {
+ byte *page_type= root->frame + FIL_PAGE_TYPE;
+ if (all)
+ {
+ ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT ||
+ mach_read_from_2(page_type) == FIL_PAGE_INDEX);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*root, page_type, FIL_PAGE_INDEX);
+ byte *instant= PAGE_INSTANT + PAGE_HEADER + root->frame;
+ mtr->write<2,mtr_t::MAYBE_NOP>(*root, instant,
+ page_ptr_get_direction(instant + 1));
+ }
+ else
+ ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT);
+ static const byte supremuminfimum[8 + 8] = "supremuminfimum";
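+    /* Restore the canonical "infimum" and "supremum" record texts
+    that btr_set_instant() had overwritten. */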
+ uint16_t infimum, supremum;
+ if (page_is_comp(root->frame))
+ {
+ infimum= PAGE_NEW_INFIMUM;
+ supremum= PAGE_NEW_SUPREMUM;
+ }
+ else
+ {
+ infimum= PAGE_OLD_INFIMUM;
+ supremum= PAGE_OLD_SUPREMUM;
+ }
+ ut_ad(!memcmp(&root->frame[infimum], supremuminfimum + 8, 8) ==
+ !memcmp(&root->frame[supremum], supremuminfimum, 8));
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->frame[infimum],
+ supremuminfimum + 8, 8);
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->frame[supremum],
+ supremuminfimum, 8);
+ }
+}
+
+/*************************************************************//**
+Makes tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed,
+we cannot reverse it: therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ ulint new_page_no;
+ rec_t* rec;
+ dtuple_t* node_ptr;
+ ulint level;
+ rec_t* node_ptr_rec;
+ page_cur_t* page_cursor;
+ page_zip_des_t* root_page_zip;
+ page_zip_des_t* new_page_zip;
+ buf_block_t* root;
+ buf_block_t* new_block;
+
+ root = btr_cur_get_block(cursor);
+ root_page_zip = buf_block_get_page_zip(root);
+ ut_ad(!page_is_empty(root->frame));
+ index = btr_cur_get_index(cursor);
+ ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!root_page_zip || page_zip_validate(root_page_zip, root->frame,
+ index));
+#endif /* UNIV_ZIP_DEBUG */
+#ifdef UNIV_BTR_DEBUG
+ if (!dict_index_is_ibuf(index)) {
+ ulint space = index->table->space_id;
+
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + root->frame, space));
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root->frame, space));
+ }
+
+ ut_a(dict_index_get_page(index) == root->page.id().page_no());
+#endif /* UNIV_BTR_DEBUG */
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(root, MTR_MEMO_PAGE_X_FIX));
+
+ /* Allocate a new page to the tree. Root splitting is done by first
+ moving the root records to the new page, emptying the root, putting
+ a node pointer to the new page, and then splitting the new page. */
+
+ level = btr_page_get_level(root->frame);
+
+ new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr);
+
+ if (new_block == NULL && os_has_said_disk_full) {
+ return(NULL);
+ }
+
+ new_page_zip = buf_block_get_page_zip(new_block);
+ ut_a(!new_page_zip == !root_page_zip);
+ ut_a(!new_page_zip
+ || page_zip_get_size(new_page_zip)
+ == page_zip_get_size(root_page_zip));
+
+ btr_page_create(new_block, new_page_zip, index, level, mtr);
+ if (page_has_siblings(new_block->frame)) {
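+		/* FIL_PAGE_PREV and FIL_PAGE_NEXT are adjacent 32-bit
+		fields; one aligned 8-byte 0xff fill resets both to
+		FIL_NULL. */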
+ compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
+ memset_aligned<8>(new_block->frame + FIL_PAGE_PREV, 0xff, 8);
+ mtr->memset(new_block, FIL_PAGE_PREV, 8, 0xff);
+ if (UNIV_LIKELY_NULL(new_page_zip)) {
+ memset_aligned<8>(new_page_zip->data + FIL_PAGE_PREV,
+ 0xff, 8);
+ }
+ }
+
+ /* Copy the records from root to the new page one by one. */
+
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || new_page_zip
+#endif /* UNIV_ZIP_COPY */
+ || !page_copy_rec_list_end(new_block, root,
+ page_get_infimum_rec(root->frame),
+ index, mtr)) {
+ ut_a(new_page_zip);
+
+ /* Copy the page byte for byte. */
+ page_zip_copy_recs(new_block,
+ root_page_zip, root->frame, index, mtr);
+
+ /* Update the lock table and possible hash index. */
+ lock_move_rec_list_end(new_block, root,
+ page_get_infimum_rec(root->frame));
+
+ /* Move any existing predicate locks */
+ if (dict_index_is_spatial(index)) {
+ lock_prdt_rec_move(new_block, root);
+ } else {
+ btr_search_move_or_delete_hash_entries(
+ new_block, root);
+ }
+ }
+
+ constexpr uint16_t max_trx_id = PAGE_HEADER + PAGE_MAX_TRX_ID;
+ if (dict_index_is_sec_or_ibuf(index)) {
+ /* In secondary indexes and the change buffer,
+ PAGE_MAX_TRX_ID can be reset on the root page, because
+ the field only matters on leaf pages, and the root no
+ longer is a leaf page. (Older versions of InnoDB did
+ set PAGE_MAX_TRX_ID on all secondary index pages.) */
+ byte* p = my_assume_aligned<8>(
+ PAGE_HEADER + PAGE_MAX_TRX_ID + root->frame);
+ if (mach_read_from_8(p)) {
+ mtr->memset(root, max_trx_id, 8, 0);
+ if (UNIV_LIKELY_NULL(root->page.zip.data)) {
+ memset_aligned<8>(max_trx_id
+ + root->page.zip.data, 0, 8);
+ }
+ }
+ } else {
+ /* PAGE_ROOT_AUTO_INC is only present in the clustered index
+ root page; on other clustered index pages, we want to reserve
+ the field PAGE_MAX_TRX_ID for future use. */
+ byte* p = my_assume_aligned<8>(
+ PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->frame);
+ if (mach_read_from_8(p)) {
+ mtr->memset(new_block, max_trx_id, 8, 0);
+ if (UNIV_LIKELY_NULL(new_block->page.zip.data)) {
+ memset_aligned<8>(max_trx_id
+ + new_block->page.zip.data,
+ 0, 8);
+ }
+ }
+ }
+
+	/* If this is a pessimistic insert which is actually done to
+	perform a pessimistic update, then we have stored the lock
+	information of the record to be inserted on the infimum of the
+	root page: we cannot discard the lock structs on the root page. */
+
+ if (!dict_table_is_locking_disabled(index->table)) {
+ lock_update_root_raise(new_block, root);
+ }
+
+ /* Create a memory heap where the node pointer is stored */
+ if (!*heap) {
+ *heap = mem_heap_create(1000);
+ }
+
+ rec = page_rec_get_next(page_get_infimum_rec(new_block->frame));
+ new_page_no = new_block->page.id().page_no();
+
+ /* Build the node pointer (= node key and page address) for the
+ child */
+ if (dict_index_is_spatial(index)) {
+ rtr_mbr_t new_mbr;
+
+ rtr_page_cal_mbr(index, new_block, &new_mbr, *heap);
+ node_ptr = rtr_index_build_node_ptr(
+ index, &new_mbr, rec, new_page_no, *heap);
+ } else {
+ node_ptr = dict_index_build_node_ptr(
+ index, rec, new_page_no, *heap, level);
+ }
+ /* The node pointer must be marked as the predefined minimum record,
+ as there is no lower alphabetical limit to records in the leftmost
+ node of a level: */
+ dtuple_set_info_bits(node_ptr,
+ dtuple_get_info_bits(node_ptr)
+ | REC_INFO_MIN_REC_FLAG);
+
+ /* Rebuild the root page to get free space */
+ btr_page_empty(root, root_page_zip, index, level + 1, mtr);
+ /* btr_page_empty() is supposed to zero-initialize the field. */
+ ut_ad(!page_get_instant(root->frame));
+
+ if (index->is_instant()) {
+ ut_ad(!root_page_zip);
+ btr_set_instant(root, *index, mtr);
+ }
+
+ ut_ad(!page_has_siblings(root->frame));
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ /* Insert node pointer to the root */
+
+ page_cur_set_before_first(root, page_cursor);
+
+ node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
+ index, offsets, heap, 0, mtr);
+
+ /* The root page should only contain the node pointer
+ to new_block at this point. Thus, the data should fit. */
+ ut_a(node_ptr_rec);
+
+	/* We play it safe and reset the free bits for the new page */
+
+ if (!dict_index_is_clust(index)
+ && !index->table->is_temporary()) {
+ ibuf_reset_free_bits(new_block);
+ }
+
+ if (tuple != NULL) {
+ /* Reposition the cursor to the child node */
+ page_cur_search(new_block, index, tuple, page_cursor);
+ } else {
+ /* Set cursor to first record on child node */
+ page_cur_set_before_first(new_block, page_cursor);
+ }
+
+ /* Split the child and insert tuple */
+ if (dict_index_is_spatial(index)) {
+ /* Split rtree page and insert tuple */
+ return(rtr_page_split_and_insert(flags, cursor, offsets, heap,
+ tuple, n_ext, mtr));
+ } else {
+ return(btr_page_split_and_insert(flags, cursor, offsets, heap,
+ tuple, n_ext, mtr));
+ }
+}
+
+/** Decide if the page should be split at the convergence point of inserts
+converging to the left.
+@param[in] cursor insert position
+@return the first record to be moved to the right half page
+@retval NULL if no split is recommended */
+rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor)
+{
+ rec_t* split_rec = btr_cur_get_rec(cursor);
+ const page_t* page = page_align(split_rec);
+
+ if (page_header_get_ptr(page, PAGE_LAST_INSERT)
+ != page_rec_get_next(split_rec)) {
+ return NULL;
+ }
+
+ /* The metadata record must be present in the leftmost leaf page
+ of the clustered index, if and only if index->is_instant().
+ However, during innobase_instant_try(), index->is_instant()
+ would already hold when row_ins_clust_index_entry_low()
+	is being invoked to insert the metadata record.
+ So, we can only assert that when the metadata record exists,
+ index->is_instant() must hold. */
+ ut_ad(!page_is_leaf(page) || page_has_prev(page)
+ || cursor->index->is_instant()
+ || !(rec_get_info_bits(page_rec_get_next_const(
+ page_get_infimum_rec(page)),
+ cursor->index->table->not_redundant())
+ & REC_INFO_MIN_REC_FLAG));
+
+ const rec_t* infimum = page_get_infimum_rec(page);
+
+	/* If the convergence is in the middle of a page, also include
+	the record immediately before the new insert in the upper
+	page. Otherwise, we could repeatedly move lots of records
+	smaller than the convergence point from page to page. */
+
+ if (split_rec == infimum
+ || split_rec == page_rec_get_next_const(infimum)) {
+ split_rec = page_rec_get_next(split_rec);
+ }
+
+ return split_rec;
+}
+
+/** Decide if the page should be split at the convergence point of inserts
+converging to the right.
+@param[in] cursor insert position
+@param[out] split_rec if split recommended, the first record
+ on the right half page, or
+ NULL if the to-be-inserted record
+ should be first
+@return whether split is recommended */
+bool
+btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec)
+{
+ rec_t* insert_point = btr_cur_get_rec(cursor);
+ const page_t* page = page_align(insert_point);
+
+ /* We use eager heuristics: if the new insert would be right after
+ the previous insert on the same page, we assume that there is a
+ pattern of sequential inserts here. */
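+	/* For example, with a monotonically increasing key every insert
+	hits the end of the page, and splitting at the insert point
+	instead of the middle yields nearly full pages. */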
+
+ if (page_header_get_ptr(page, PAGE_LAST_INSERT) != insert_point) {
+ return false;
+ }
+
+ insert_point = page_rec_get_next(insert_point);
+
+ if (page_rec_is_supremum(insert_point)) {
+ insert_point = NULL;
+ } else {
+ insert_point = page_rec_get_next(insert_point);
+ if (page_rec_is_supremum(insert_point)) {
+ insert_point = NULL;
+ }
+
+ /* If there are >= 2 user records up from the insert
+ point, split all but 1 off. We want to keep one because
+ then sequential inserts can use the adaptive hash
+ index, as they can do the necessary checks of the right
+ search position just by looking at the records on this
+ page. */
+ }
+
+ *split_rec = insert_point;
+ return true;
+}
+
+/*************************************************************//**
+Calculates a split record such that the tuple will certainly fit on
+its half-page when the split is performed. We assume in this function
+only that the cursor page has at least one user record.
+@return split record, or NULL if tuple will be the first record on
+the lower or upper half-page (determined by btr_page_tuple_smaller()) */
+static
+rec_t*
+btr_page_get_split_rec(
+/*===================*/
+ btr_cur_t* cursor, /*!< in: cursor at which insert should be made */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ page_t* page;
+ page_zip_des_t* page_zip;
+ ulint insert_size;
+ ulint free_space;
+ ulint total_data;
+ ulint total_n_recs;
+ ulint total_space;
+ ulint incl_data;
+ rec_t* ins_rec;
+ rec_t* rec;
+ rec_t* next_rec;
+ ulint n;
+ mem_heap_t* heap;
+ rec_offs* offsets;
+
+ page = btr_cur_get_page(cursor);
+
+ insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
+ free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+ page_zip = btr_cur_get_page_zip(cursor);
+ if (page_zip) {
+ /* Estimate the free space of an empty compressed page. */
+ ulint free_space_zip = page_zip_empty_size(
+ cursor->index->n_fields,
+ page_zip_get_size(page_zip));
+
+ if (free_space > (ulint) free_space_zip) {
+ free_space = (ulint) free_space_zip;
+ }
+ }
+
+ /* free_space is now the free space of a created new page */
+
+ total_data = page_get_data_size(page) + insert_size;
+ total_n_recs = ulint(page_get_n_recs(page)) + 1;
+ ut_ad(total_n_recs >= 2);
+ total_space = total_data + page_dir_calc_reserved_space(total_n_recs);
+
+ n = 0;
+ incl_data = 0;
+ ins_rec = btr_cur_get_rec(cursor);
+ rec = page_get_infimum_rec(page);
+
+ heap = NULL;
+ offsets = NULL;
+
+	/* We start by including records in the left half. Once the
+	space they reserve exceeds half of total_space, the included
+	records are placed on the left page, provided they fit there
+	and something is left over for the right page; otherwise the
+	last included record becomes the first record on the right
+	half-page. */
+
+ do {
+ /* Decide the next record to include */
+ if (rec == ins_rec) {
+ rec = NULL; /* NULL denotes that tuple is
+ now included */
+ } else if (rec == NULL) {
+ rec = page_rec_get_next(ins_rec);
+ } else {
+ rec = page_rec_get_next(rec);
+ }
+
+ if (rec == NULL) {
+ /* Include tuple */
+ incl_data += insert_size;
+ } else {
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ page_is_leaf(page)
+ ? cursor->index->n_core_fields
+ : 0,
+ ULINT_UNDEFINED, &heap);
+ incl_data += rec_offs_size(offsets);
+ }
+
+ n++;
+ } while (incl_data + page_dir_calc_reserved_space(n)
+ < total_space / 2);
+
+ if (incl_data + page_dir_calc_reserved_space(n) <= free_space) {
+ /* The next record will be the first on
+ the right half page if it is not the
+ supremum record of page */
+
+ if (rec == ins_rec) {
+ rec = NULL;
+
+ goto func_exit;
+ } else if (rec == NULL) {
+ next_rec = page_rec_get_next(ins_rec);
+ } else {
+ next_rec = page_rec_get_next(rec);
+ }
+ ut_ad(next_rec);
+ if (!page_rec_is_supremum(next_rec)) {
+ rec = next_rec;
+ }
+ }
+
+func_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(rec);
+}
+
+/*************************************************************//**
+Returns true if the insert fits on the appropriate half-page with the
+chosen split_rec.
+@return true if fits */
+static MY_ATTRIBUTE((nonnull(1,3,4,6), warn_unused_result))
+bool
+btr_page_insert_fits(
+/*=================*/
+ btr_cur_t* cursor, /*!< in: cursor at which insert
+ should be made */
+ const rec_t* split_rec,/*!< in: suggestion for first record
+ on upper half-page, or NULL if
+ tuple to be inserted should be first */
+ rec_offs** offsets,/*!< in: rec_get_offsets(
+ split_rec, cursor->index); out: garbage */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mem_heap_t** heap) /*!< in: temporary memory heap */
+{
+ page_t* page;
+ ulint insert_size;
+ ulint free_space;
+ ulint total_data;
+ ulint total_n_recs;
+ const rec_t* rec;
+ const rec_t* end_rec;
+
+ page = btr_cur_get_page(cursor);
+
+ ut_ad(!split_rec
+ || !page_is_comp(page) == !rec_offs_comp(*offsets));
+ ut_ad(!split_rec
+ || rec_offs_validate(split_rec, cursor->index, *offsets));
+
+ insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
+ free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+ /* free_space is now the free space of a created new page */
+
+ total_data = page_get_data_size(page) + insert_size;
+ total_n_recs = ulint(page_get_n_recs(page)) + 1;
+
+ /* We determine which records (from rec to end_rec, not including
+ end_rec) will end up on the other half page from tuple when it is
+ inserted. */
+
+ if (split_rec == NULL) {
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
+
+ } else if (cmp_dtuple_rec(tuple, split_rec, *offsets) >= 0) {
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ end_rec = split_rec;
+ } else {
+ rec = split_rec;
+ end_rec = page_get_supremum_rec(page);
+ }
+
+ if (total_data + page_dir_calc_reserved_space(total_n_recs)
+ <= free_space) {
+
+ /* Ok, there will be enough available space on the
+ half page where the tuple is inserted */
+
+ return(true);
+ }
+
+ while (rec != end_rec) {
+ /* In this loop we calculate the amount of reserved
+ space after rec is removed from page. */
+
+ *offsets = rec_get_offsets(rec, cursor->index, *offsets,
+ page_is_leaf(page)
+ ? cursor->index->n_core_fields
+ : 0,
+ ULINT_UNDEFINED, heap);
+
+ total_data -= rec_offs_size(*offsets);
+ total_n_recs--;
+
+ if (total_data + page_dir_calc_reserved_space(total_n_recs)
+ <= free_space) {
+
+ /* Ok, there will be enough available space on the
+ half page where the tuple is inserted */
+
+ return(true);
+ }
+
+ rec = page_rec_get_next_const(rec);
+ }
+
+ return(false);
+}
+
+/*******************************************************//**
+Inserts a data tuple into a tree at a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree. */
+void
+btr_insert_on_non_leaf_level_func(
+/*==============================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level, must be > 0 */
+ dtuple_t* tuple, /*!< in: the record to be inserted */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ big_rec_t* dummy_big_rec;
+ btr_cur_t cursor;
+ dberr_t err;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+ rtr_info_t rtr_info;
+
+ ut_ad(level > 0);
+
+ if (!dict_index_is_spatial(index)) {
+ dberr_t err = btr_cur_search_to_nth_level(
+ index, level, tuple, PAGE_CUR_LE,
+ BTR_CONT_MODIFY_TREE,
+ &cursor, 0, file, line, mtr);
+
+ if (err != DB_SUCCESS) {
+			ib::warn() << "Error code: " << err
+				<< " in btr_insert_on_non_leaf_level_func,"
+				<< " level: " << level
+				<< ", called from file: "
+				<< file << " line: " << line
+				<< ", table: " << index->table->name
+				<< ", index: " << index->name;
+ }
+ } else {
+ /* For spatial index, initialize structures to track
+ its parents etc. */
+ rtr_init_rtr_info(&rtr_info, false, &cursor, index, false);
+
+ rtr_info_update_btr(&cursor, &rtr_info);
+
+ btr_cur_search_to_nth_level(index, level, tuple,
+ PAGE_CUR_RTREE_INSERT,
+ BTR_CONT_MODIFY_TREE,
+ &cursor, 0, file, line, mtr);
+ }
+
+ ut_ad(cursor.flag == BTR_CUR_BINARY);
+
+ err = btr_cur_optimistic_insert(
+ flags
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG
+ | BTR_NO_UNDO_LOG_FLAG,
+ &cursor, &offsets, &heap,
+ tuple, &rec, &dummy_big_rec, 0, NULL, mtr);
+
+ if (err == DB_FAIL) {
+ err = btr_cur_pessimistic_insert(flags
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG
+ | BTR_NO_UNDO_LOG_FLAG,
+ &cursor, &offsets, &heap,
+ tuple, &rec,
+ &dummy_big_rec, 0, NULL, mtr);
+ ut_a(err == DB_SUCCESS);
+ }
+
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+
+ if (dict_index_is_spatial(index)) {
+ ut_ad(cursor.rtr_info);
+
+ rtr_clean_rtr_info(&rtr_info, true);
+ }
+}
+
+/**************************************************************//**
+Attaches the halves of an index page on the appropriate level in an
+index tree. */
+static MY_ATTRIBUTE((nonnull))
+void
+btr_attach_half_pages(
+/*==================*/
+ ulint flags, /*!< in: undo logging and
+ locking flags */
+ dict_index_t* index, /*!< in: the index tree */
+ buf_block_t* block, /*!< in/out: page to be split */
+ const rec_t* split_rec, /*!< in: first record on upper
+ half page */
+ buf_block_t* new_block, /*!< in/out: the new half page */
+ ulint direction, /*!< in: FSP_UP or FSP_DOWN */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dtuple_t* node_ptr_upper;
+ mem_heap_t* heap;
+ buf_block_t* prev_block = NULL;
+ buf_block_t* next_block = NULL;
+ buf_block_t* lower_block;
+ buf_block_t* upper_block;
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr->memo_contains_flagged(new_block, MTR_MEMO_PAGE_X_FIX));
+
+ /* Create a memory heap where the data tuple is stored */
+ heap = mem_heap_create(1024);
+
+ /* Based on split direction, decide upper and lower pages */
+ if (direction == FSP_DOWN) {
+
+ btr_cur_t cursor;
+ rec_offs* offsets;
+
+ lower_block = new_block;
+ upper_block = block;
+
+ /* Look up the index for the node pointer to page */
+ offsets = btr_page_get_father_block(NULL, heap, index,
+ block, mtr, &cursor);
+
+ /* Replace the address of the old child node (= page) with the
+ address of the new lower half */
+
+ btr_node_ptr_set_child_page_no(
+ btr_cur_get_block(&cursor),
+ btr_cur_get_rec(&cursor),
+ offsets, lower_block->page.id().page_no(), mtr);
+ mem_heap_empty(heap);
+ } else {
+ lower_block = block;
+ upper_block = new_block;
+ }
+
+ /* Get the level of the split pages */
+ const ulint level = btr_page_get_level(buf_block_get_frame(block));
+ ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block)));
+
+ /* Get the previous and next pages of page */
+ const uint32_t prev_page_no = btr_page_get_prev(block->frame);
+ const uint32_t next_page_no = btr_page_get_next(block->frame);
+
+	/* For consistency, both neighboring blocks are x-latched before
+	their links are changed. */
+ if (prev_page_no != FIL_NULL && direction == FSP_DOWN) {
+ prev_block = btr_block_get(*index, prev_page_no, RW_X_LATCH,
+ !level, mtr);
+ }
+ if (next_page_no != FIL_NULL && direction != FSP_DOWN) {
+ next_block = btr_block_get(*index, next_page_no, RW_X_LATCH,
+ !level, mtr);
+ }
+
+ /* Build the node pointer (= node key and page address) for the upper
+ half */
+
+ node_ptr_upper = dict_index_build_node_ptr(
+ index, split_rec, upper_block->page.id().page_no(),
+ heap, level);
+
+ /* Insert it next to the pointer to the lower half. Note that this
+ may generate recursion leading to a split on the higher level. */
+
+ btr_insert_on_non_leaf_level(flags, index, level + 1,
+ node_ptr_upper, mtr);
+
+ /* Free the memory heap */
+ mem_heap_free(heap);
+
+ /* Update page links of the level */
+
+ if (prev_block) {
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(prev_block->frame)
+ == page_is_comp(block->frame));
+ ut_a(btr_page_get_next(prev_block->frame)
+ == block->page.id().page_no());
+#endif /* UNIV_BTR_DEBUG */
+ btr_page_set_next(prev_block, lower_block->page.id().page_no(),
+ mtr);
+ }
+
+ if (next_block) {
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(next_block->frame)
+ == page_is_comp(block->frame));
+ ut_a(btr_page_get_prev(next_block->frame)
+ == block->page.id().page_no());
+#endif /* UNIV_BTR_DEBUG */
+ btr_page_set_prev(next_block, upper_block->page.id().page_no(),
+ mtr);
+ }
+
+ if (direction == FSP_DOWN) {
+ ut_ad(lower_block == new_block);
+ ut_ad(btr_page_get_next(upper_block->frame) == next_page_no);
+ btr_page_set_prev(lower_block, prev_page_no, mtr);
+ } else {
+ ut_ad(upper_block == new_block);
+ ut_ad(btr_page_get_prev(lower_block->frame) == prev_page_no);
+ btr_page_set_next(upper_block, next_page_no, mtr);
+ }
+
+ btr_page_set_prev(upper_block, lower_block->page.id().page_no(), mtr);
+ btr_page_set_next(lower_block, upper_block->page.id().page_no(), mtr);
+}
+
+/*************************************************************//**
+Determine if a tuple is smaller than any record on the page.
+@return TRUE if smaller */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+btr_page_tuple_smaller(
+/*===================*/
+ btr_cur_t* cursor, /*!< in: b-tree cursor */
+ const dtuple_t* tuple, /*!< in: tuple to consider */
+ rec_offs** offsets,/*!< in/out: temporary storage */
+ ulint n_uniq, /*!< in: number of unique fields
+ in the index page records */
+ mem_heap_t** heap) /*!< in/out: heap for offsets */
+{
+ buf_block_t* block;
+ const rec_t* first_rec;
+ page_cur_t pcur;
+
+ /* Read the first user record in the page. */
+ block = btr_cur_get_block(cursor);
+ page_cur_set_before_first(block, &pcur);
+ page_cur_move_to_next(&pcur);
+ first_rec = page_cur_get_rec(&pcur);
+
+ *offsets = rec_get_offsets(
+ first_rec, cursor->index, *offsets,
+ page_is_leaf(block->frame) ? cursor->index->n_core_fields : 0,
+ n_uniq, heap);
+
+ return(cmp_dtuple_rec(tuple, first_rec, *offsets) < 0);
+}
+
+/** Insert the tuple into the right sibling page, if the cursor is at the end
+of a page.
+@param[in] flags undo logging and locking flags
+@param[in,out] cursor cursor at which to insert; when the function succeeds,
+ the cursor is positioned before the insert point.
+@param[out] offsets offsets on inserted record
+@param[in,out] heap memory heap for allocating offsets
+@param[in] tuple tuple to insert
+@param[in] n_ext number of externally stored columns
+@param[in,out] mtr mini-transaction
+@return inserted record (first record on the right sibling page);
+ the cursor will be positioned on the page infimum
+@retval NULL if the operation was not performed */
+static
+rec_t*
+btr_insert_into_right_sibling(
+ ulint flags,
+ btr_cur_t* cursor,
+ rec_offs** offsets,
+ mem_heap_t* heap,
+ const dtuple_t* tuple,
+ ulint n_ext,
+ mtr_t* mtr)
+{
+ buf_block_t* block = btr_cur_get_block(cursor);
+ page_t* page = buf_block_get_frame(block);
+ const uint32_t next_page_no = btr_page_get_next(page);
+
+ ut_ad(mtr->memo_contains_flagged(&cursor->index->lock,
+ MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(heap);
+
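+ /* This fast path applies only when there is a right sibling
+ and the cursor is positioned on the last user record of the
+ page. */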
+ if (next_page_no == FIL_NULL || !page_rec_is_supremum(
+ page_rec_get_next(btr_cur_get_rec(cursor)))) {
+
+ return(NULL);
+ }
+
+ page_cur_t next_page_cursor;
+ buf_block_t* next_block;
+ page_t* next_page;
+ btr_cur_t next_father_cursor;
+ rec_t* rec = NULL;
+ ulint max_size;
+
+ next_block = btr_block_get(*cursor->index, next_page_no, RW_X_LATCH,
+ page_is_leaf(page), mtr);
+ if (UNIV_UNLIKELY(!next_block)) {
+ return NULL;
+ }
+ next_page = buf_block_get_frame(next_block);
+
+ bool is_leaf = page_is_leaf(next_page);
+
+ btr_page_get_father(
+ cursor->index, next_block, mtr, &next_father_cursor);
+
+ page_cur_search(
+ next_block, cursor->index, tuple, PAGE_CUR_LE,
+ &next_page_cursor);
+
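+ /* Remember the free space available after a reorganization;
+ it is used for the insert buffer bitmap update below. */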
+ max_size = page_get_max_insert_size_after_reorganize(next_page, 1);
+
+ /* Extend the gap locks to the next page */
+ if (!dict_table_is_locking_disabled(cursor->index->table)) {
+ lock_update_split_left(next_block, block);
+ }
+
+ rec = page_cur_tuple_insert(
+ &next_page_cursor, tuple, cursor->index, offsets, &heap,
+ n_ext, mtr);
+
+ if (rec == NULL) {
+ if (is_leaf
+ && next_block->page.zip.ssize
+ && !dict_index_is_clust(cursor->index)
+ && !cursor->index->table->is_temporary()) {
+ /* Reset the IBUF_BITMAP_FREE bits, because
+ page_cur_tuple_insert() will have attempted a page
+ reorganize before failing. */
+ ibuf_reset_free_bits(next_block);
+ }
+ return(NULL);
+ }
+
+ ibool compressed;
+ dberr_t err;
+ ulint level = btr_page_get_level(next_page);
+
+ /* adjust cursor position */
+ *btr_cur_get_page_cur(cursor) = next_page_cursor;
+
+ ut_ad(btr_cur_get_rec(cursor) == page_get_infimum_rec(next_page));
+ ut_ad(page_rec_get_next(page_get_infimum_rec(next_page)) == rec);
+
+ /* We have to change the parent node pointer */
+
+ compressed = btr_cur_pessimistic_delete(
+ &err, TRUE, &next_father_cursor,
+ BTR_CREATE_FLAG, false, mtr);
+
+ ut_a(err == DB_SUCCESS);
+
+ if (!compressed) {
+ btr_cur_compress_if_useful(&next_father_cursor, FALSE, mtr);
+ }
+
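+ /* The first user record on next_block is now 'rec'; build and
+ insert a fresh node pointer for it on the upper level. */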
+ dtuple_t* node_ptr = dict_index_build_node_ptr(
+ cursor->index, rec, next_block->page.id().page_no(),
+ heap, level);
+
+ btr_insert_on_non_leaf_level(
+ flags, cursor->index, level + 1, node_ptr, mtr);
+
+ ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
+
+ if (is_leaf
+ && !dict_index_is_clust(cursor->index)
+ && !cursor->index->table->is_temporary()) {
+ /* Update the free bits of the B-tree page in the
+ insert buffer bitmap. */
+
+ if (next_block->page.zip.ssize) {
+ ibuf_update_free_bits_zip(next_block, mtr);
+ } else {
+ ibuf_update_free_bits_if_full(
+ next_block, max_size,
+ rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE);
+ }
+ }
+
+ return(rec);
+}
+
+/*************************************************************//**
+Splits an index page in halves and inserts the tuple. It is assumed
+that mtr holds an x-latch on the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed; we cannot reverse it. Therefore, enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+NOTE: jonaso added support for calling this function with tuple == NULL,
+which causes it to only split a page.
+
+@return inserted record or NULL if we run out of space */
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ buf_block_t* new_block;
+ page_t* new_page;
+ page_zip_des_t* new_page_zip;
+ rec_t* split_rec;
+ buf_block_t* left_block;
+ buf_block_t* right_block;
+ page_cur_t* page_cursor;
+ rec_t* first_rec;
+ byte* buf = 0; /* remove warning */
+ rec_t* move_limit;
+ ulint n_iterations = 0;
+ ulint n_uniq;
+
+ if (cursor->index->is_spatial()) {
+ /* Split rtree page and update parent */
+ return(rtr_page_split_and_insert(flags, cursor, offsets, heap,
+ tuple, n_ext, mtr));
+ }
+
+ if (!*heap) {
+ *heap = mem_heap_create(1024);
+ }
+ n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
+func_start:
+ mem_heap_empty(*heap);
+ *offsets = NULL;
+
+ ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(!dict_index_is_online_ddl(cursor->index)
+ || (flags & BTR_CREATE_FLAG)
+ || dict_index_is_clust(cursor->index));
+ ut_ad(rw_lock_own_flagged(dict_index_get_lock(cursor->index),
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!page_is_empty(page));
+
+ /* Try to insert into the next page if possible, before splitting */
+ if (rec_t* rec = btr_insert_into_right_sibling(
+ flags, cursor, offsets, *heap, tuple, n_ext, mtr)) {
+ return(rec);
+ }
+
+ /* 1. Decide the split record; split_rec == NULL means that the
+ tuple to be inserted should be the first record on the upper
+ half-page */
+ bool insert_left = false;
+ uint32_t hint_page_no = block->page.id().page_no() + 1;
+ byte direction = FSP_UP;
+
+ if (tuple && n_iterations > 0) {
+ split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);
+
+ if (split_rec == NULL) {
+ insert_left = btr_page_tuple_smaller(
+ cursor, tuple, offsets, n_uniq, heap);
+ }
+ } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
+ } else if ((split_rec = btr_page_get_split_rec_to_left(cursor))) {
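+ /* The new page will become the lower half: hint an
+ allocation just before the current page (the hint was
+ initialized to the following page). */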
+ direction = FSP_DOWN;
+ hint_page_no -= 2;
+ } else {
+ /* If there is only one record in the index page, we
+ can't split the node in the middle by default. We need
+ to determine whether the new record will be inserted
+ to the left or right. */
+
+ if (page_get_n_recs(page) > 1) {
+ split_rec = page_get_middle_rec(page);
+ } else if (btr_page_tuple_smaller(cursor, tuple,
+ offsets, n_uniq, heap)) {
+ split_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+ } else {
+ split_rec = NULL;
+ }
+ }
+
+ DBUG_EXECUTE_IF("disk_is_full",
+ os_has_said_disk_full = true;
+ return(NULL););
+
+ /* 2. Allocate a new page to the index */
+ const uint16_t page_level = btr_page_get_level(page);
+ new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
+ page_level, mtr, mtr);
+
+ if (!new_block) {
+ return(NULL);
+ }
+
+ new_page = buf_block_get_frame(new_block);
+ new_page_zip = buf_block_get_page_zip(new_block);
+
+ if (page_level && UNIV_LIKELY_NULL(new_page_zip)) {
+ /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected
+ to contain FIL_NULL in FIL_PAGE_PREV at this stage. */
+ memset_aligned<4>(new_page + FIL_PAGE_PREV, 0, 4);
+ }
+ btr_page_create(new_block, new_page_zip, cursor->index,
+ page_level, mtr);
+ /* Only record the leaf level page splits. */
+ if (!page_level) {
+ cursor->index->stat_defrag_n_page_split ++;
+ cursor->index->stat_defrag_modified_counter ++;
+ btr_defragment_save_defrag_stats_if_needed(cursor->index);
+ }
+
+ /* 3. Calculate the first record on the upper half-page, and the
+ first record (move_limit) on original page which ends up on the
+ upper half */
+
+ if (split_rec) {
+ first_rec = move_limit = split_rec;
+
+ *offsets = rec_get_offsets(split_rec, cursor->index, *offsets,
+ page_is_leaf(page)
+ ? cursor->index->n_core_fields : 0,
+ n_uniq, heap);
+
+ insert_left = !tuple
+ || cmp_dtuple_rec(tuple, split_rec, *offsets) < 0;
+
+ if (!insert_left && new_page_zip && n_iterations > 0) {
+ /* If a compressed page has already been split,
+ avoid further splits by inserting the record
+ to an empty page. */
+ split_rec = NULL;
+ goto insert_empty;
+ }
+ } else if (insert_left) {
+ ut_a(n_iterations > 0);
+ first_rec = page_rec_get_next(page_get_infimum_rec(page));
+ move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
+ } else {
+insert_empty:
+ ut_ad(!split_rec);
+ ut_ad(!insert_left);
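+ /* Convert the tuple to a throwaway physical record; it is
+ used only by btr_attach_half_pages() to build the node
+ pointer of the upper half, and buf is freed again below. */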
+ buf = UT_NEW_ARRAY_NOKEY(
+ byte,
+ rec_get_converted_size(cursor->index, tuple, n_ext));
+
+ first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
+ tuple, n_ext);
+ move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
+ }
+
+ /* 4. Do first the modifications in the tree structure */
+
+ /* FIXME: write FIL_PAGE_PREV,FIL_PAGE_NEXT in new_block earlier! */
+ btr_attach_half_pages(flags, cursor->index, block,
+ first_rec, new_block, direction, mtr);
+
+ /* If the split is made on the leaf level and the insert will fit
+ on the appropriate half-page, we may release the tree x-latch.
+ We can then move the records after releasing the tree latch,
+ thus reducing the tree latch contention. */
+ bool insert_will_fit;
+ if (tuple == NULL) {
+ insert_will_fit = true;
+ } else if (split_rec) {
+ insert_will_fit = !new_page_zip
+ && btr_page_insert_fits(cursor, split_rec,
+ offsets, tuple, n_ext, heap);
+ } else {
+ if (!insert_left) {
+ UT_DELETE_ARRAY(buf);
+ buf = NULL;
+ }
+
+ insert_will_fit = !new_page_zip
+ && btr_page_insert_fits(cursor, NULL,
+ offsets, tuple, n_ext, heap);
+ }
+
+ if (!srv_read_only_mode
+ && insert_will_fit
+ && page_is_leaf(page)
+ && !dict_index_is_online_ddl(cursor->index)) {
+
+ mtr->memo_release(
+ dict_index_get_lock(cursor->index),
+ MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
+
+ /* NOTE: We cannot release the root block latch here, because
+ it contains the segment header and has already been modified
+ in most cases. */
+ }
+
+ /* 5. Move then the records to the new page */
+ if (direction == FSP_DOWN) {
+ /* fputs("Split left\n", stderr); */
+
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || page_zip
+#endif /* UNIV_ZIP_COPY */
+ || !page_move_rec_list_start(new_block, block, move_limit,
+ cursor->index, mtr)) {
+ /* For some reason, compressing new_page failed,
+ even though it should contain fewer records than
+ the original page. Copy the page byte for byte
+ and then delete the records from both pages
+ as appropriate. Deleting will always succeed. */
+ ut_a(new_page_zip);
+
+ page_zip_copy_recs(new_block,
+ page_zip, page, cursor->index, mtr);
+ page_delete_rec_list_end(move_limit - page + new_page,
+ new_block, cursor->index,
+ ULINT_UNDEFINED,
+ ULINT_UNDEFINED, mtr);
+
+ /* Update the lock table and possible hash index. */
+ lock_move_rec_list_start(
+ new_block, block, move_limit,
+ new_page + PAGE_NEW_INFIMUM);
+
+ btr_search_move_or_delete_hash_entries(
+ new_block, block);
+
+ /* Delete the records from the source page. */
+
+ page_delete_rec_list_start(move_limit, block,
+ cursor->index, mtr);
+ }
+
+ left_block = new_block;
+ right_block = block;
+
+ if (!dict_table_is_locking_disabled(cursor->index->table)) {
+ lock_update_split_left(right_block, left_block);
+ }
+ } else {
+ /* fputs("Split right\n", stderr); */
+
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || page_zip
+#endif /* UNIV_ZIP_COPY */
+ || !page_move_rec_list_end(new_block, block, move_limit,
+ cursor->index, mtr)) {
+ /* For some reason, compressing new_page failed,
+ even though it should contain fewer records than
+ the original page. Copy the page byte for byte
+ and then delete the records from both pages
+ as appropriate. Deleting will always succeed. */
+ ut_a(new_page_zip);
+
+ page_zip_copy_recs(new_block,
+ page_zip, page, cursor->index, mtr);
+ page_delete_rec_list_start(move_limit - page
+ + new_page, new_block,
+ cursor->index, mtr);
+
+ /* Update the lock table and possible hash index. */
+ lock_move_rec_list_end(new_block, block, move_limit);
+
+ btr_search_move_or_delete_hash_entries(
+ new_block, block);
+
+ /* Delete the records from the source page. */
+
+ page_delete_rec_list_end(move_limit, block,
+ cursor->index,
+ ULINT_UNDEFINED,
+ ULINT_UNDEFINED, mtr);
+ }
+
+ left_block = block;
+ right_block = new_block;
+
+ if (!dict_table_is_locking_disabled(cursor->index->table)) {
+ lock_update_split_right(right_block, left_block);
+ }
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ if (page_zip) {
+ ut_a(page_zip_validate(page_zip, page, cursor->index));
+ ut_a(page_zip_validate(new_page_zip, new_page, cursor->index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* At this point, split_rec, move_limit and first_rec may point
+ to garbage on the old page. */
+
+ /* 6. The split and the tree modification are now completed. Decide
+ the page where the tuple should be inserted */
+ rec_t* rec;
+ buf_block_t* const insert_block = insert_left
+ ? left_block : right_block;
+
+ if (UNIV_UNLIKELY(!tuple)) {
+ rec = NULL;
+ goto func_exit;
+ }
+
+ /* 7. Reposition the cursor for insert and try insertion */
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ page_cur_search(insert_block, cursor->index, tuple, page_cursor);
+
+ rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+ offsets, heap, n_ext, mtr);
+
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_t* insert_page
+ = buf_block_get_frame(insert_block);
+
+ page_zip_des_t* insert_page_zip
+ = buf_block_get_page_zip(insert_block);
+
+ ut_a(!insert_page_zip
+ || page_zip_validate(insert_page_zip, insert_page,
+ cursor->index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (rec != NULL) {
+
+ goto func_exit;
+ }
+
+ /* 8. If insert did not fit, try page reorganization.
+ For compressed pages, page_cur_tuple_insert() will have
+ attempted this already. */
+
+ if (page_cur_get_page_zip(page_cursor)
+ || !btr_page_reorganize(page_cursor, cursor->index, mtr)) {
+
+ goto insert_failed;
+ }
+
+ rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+ offsets, heap, n_ext, mtr);
+
+ if (rec == NULL) {
+ /* The insert did not fit on the page: loop back to the
+ start of the function for a new split */
+insert_failed:
+ /* We play it safe and reset the free bits for new_page */
+ if (!dict_index_is_clust(cursor->index)
+ && !cursor->index->table->is_temporary()) {
+ ibuf_reset_free_bits(new_block);
+ ibuf_reset_free_bits(block);
+ }
+
+ n_iterations++;
+ ut_ad(n_iterations < 2
+ || buf_block_get_page_zip(insert_block));
+ ut_ad(!insert_will_fit);
+
+ goto func_start;
+ }
+
+func_exit:
+ /* The insert fit on the page: update the free bits for the
+ left and right pages in the same mtr */
+
+ if (!dict_index_is_clust(cursor->index)
+ && !cursor->index->table->is_temporary()
+ && page_is_leaf(page)) {
+
+ ibuf_update_free_bits_for_two_pages_low(
+ left_block, right_block, mtr);
+ }
+
+ MONITOR_INC(MONITOR_INDEX_SPLIT);
+
+ ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
+ ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));
+
+ ut_ad(tuple || !rec);
+ ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
+ return(rec);
+}
+
+/** Remove a page from the level list of pages.
+@param[in] block page to remove
+@param[in] index index tree
+@param[in,out] mtr mini-transaction */
+void btr_level_list_remove(const buf_block_t& block, const dict_index_t& index,
+ mtr_t* mtr)
+{
+ ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(block.zip_size() == index.table->space->zip_size());
+ ut_ad(index.table->space->id == block.page.id().space());
+ /* Get the previous and next page numbers of page */
+
+ const page_t* page = block.frame;
+ const uint32_t prev_page_no = btr_page_get_prev(page);
+ const uint32_t next_page_no = btr_page_get_next(page);
+
+ /* Update page links of the level */
+
+ if (prev_page_no != FIL_NULL) {
+ buf_block_t* prev_block = btr_block_get(
+ index, prev_page_no, RW_X_LATCH, page_is_leaf(page),
+ mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
+ static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+ ut_a(!memcmp_aligned<4>(prev_block->frame + FIL_PAGE_NEXT,
+ page + FIL_PAGE_OFFSET, 4));
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_set_next(prev_block, next_page_no, mtr);
+ }
+
+ if (next_page_no != FIL_NULL) {
+ buf_block_t* next_block = btr_block_get(
+ index, next_page_no, RW_X_LATCH, page_is_leaf(page),
+ mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
+ static_assert(FIL_PAGE_PREV % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+ ut_a(!memcmp_aligned<4>(next_block->frame + FIL_PAGE_PREV,
+ page + FIL_PAGE_OFFSET, 4));
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_set_prev(next_block, prev_page_no, mtr);
+ }
+}
+
+/*************************************************************//**
+If the page is the only one on its level, this function moves its
+records to the father page, thus reducing the tree height.
+@return father block */
+UNIV_INTERN
+buf_block_t*
+btr_lift_page_up(
+/*=============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page which is the only one on its level;
+ must not be empty: use
+ btr_discard_only_page_on_level if the last
+ record from the page should be removed */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* father_block;
+ ulint page_level;
+ page_zip_des_t* father_page_zip;
+ page_t* page = buf_block_get_frame(block);
+ ulint root_page_no;
+ buf_block_t* blocks[BTR_MAX_LEVELS];
+ ulint n_blocks; /*!< last used index in blocks[] */
+ ulint i;
+ bool lift_father_up;
+ buf_block_t* block_orig = block;
+
+ ut_ad(!page_has_siblings(page));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+
+ page_level = btr_page_get_level(page);
+ root_page_no = dict_index_get_page(index);
+
+ {
+ btr_cur_t cursor;
+ rec_offs* offsets = NULL;
+ mem_heap_t* heap = mem_heap_create(
+ sizeof(*offsets)
+ * (REC_OFFS_HEADER_SIZE + 1 + 1
+ + unsigned(index->n_fields)));
+ buf_block_t* b;
+
+ if (dict_index_is_spatial(index)) {
+ offsets = rtr_page_get_father_block(
+ NULL, heap, index, block, mtr,
+ NULL, &cursor);
+ } else {
+ offsets = btr_page_get_father_block(offsets, heap,
+ index, block,
+ mtr, &cursor);
+ }
+ father_block = btr_cur_get_block(&cursor);
+ father_page_zip = buf_block_get_page_zip(father_block);
+
+ n_blocks = 0;
+
+ /* Store all ancestor pages so we can reset their
+ levels later on. We have to do all the searches on
+ the tree now, because later on, after we have replaced
+ the first level, the tree is in an inconsistent state
+ and cannot be searched. */
+ for (b = father_block;
+ b->page.id().page_no() != root_page_no; ) {
+ ut_a(n_blocks < BTR_MAX_LEVELS);
+
+ if (dict_index_is_spatial(index)) {
+ offsets = rtr_page_get_father_block(
+ NULL, heap, index, b, mtr,
+ NULL, &cursor);
+ } else {
+ offsets = btr_page_get_father_block(offsets,
+ heap,
+ index, b,
+ mtr,
+ &cursor);
+ }
+
+ blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
+ }
+
+ lift_father_up = (n_blocks && page_level == 0);
+ if (lift_father_up) {
+ /* The father page should also be the only one on its
+ level (and not the root); we must lift up the father page
+ first, because a leaf page may be lifted up directly only
+ to the root page. Freeing a page chooses the file segment
+ based on page_level (==0 or !=0); if page_level changed
+ from !=0 to ==0, a later freeing of the page would not
+ find the page allocation to be freed. */
+
+ block = father_block;
+ page = buf_block_get_frame(block);
+ page_level = btr_page_get_level(page);
+
+ ut_ad(!page_has_siblings(page));
+ ut_ad(mtr->memo_contains_flagged(block,
+ MTR_MEMO_PAGE_X_FIX));
+
+ father_block = blocks[0];
+ father_page_zip = buf_block_get_page_zip(father_block);
+ }
+
+ mem_heap_free(heap);
+ }
+
+ btr_search_drop_page_hash_index(block);
+
+ /* Make the father empty */
+ btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
+ /* btr_page_empty() is supposed to zero-initialize the field. */
+ ut_ad(!page_get_instant(father_block->frame));
+
+ if (index->is_instant()
+ && father_block->page.id().page_no() == root_page_no) {
+ ut_ad(!father_page_zip);
+ btr_set_instant(father_block, *index, mtr);
+ }
+
+ page_level++;
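+ /* page_level now tracks the level that each stored ancestor
+ page will be assigned in the loop below. */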
+
+ /* Copy the records to the father page one by one. */
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || father_page_zip
+#endif /* UNIV_ZIP_COPY */
+ || !page_copy_rec_list_end(father_block, block,
+ page_get_infimum_rec(page),
+ index, mtr)) {
+ const page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(father_page_zip);
+ ut_a(page_zip);
+
+ /* Copy the page byte for byte. */
+ page_zip_copy_recs(father_block,
+ page_zip, page, index, mtr);
+
+ /* Update the lock table and possible hash index. */
+
+ lock_move_rec_list_end(father_block, block,
+ page_get_infimum_rec(page));
+
+ /* Also update the predicate locks */
+ if (dict_index_is_spatial(index)) {
+ lock_prdt_rec_move(father_block, block);
+ } else {
+ btr_search_move_or_delete_hash_entries(
+ father_block, block);
+ }
+ }
+
+ if (!dict_table_is_locking_disabled(index->table)) {
+ /* Free predicate page locks on the block */
+ if (dict_index_is_spatial(index)) {
+ lock_mutex_enter();
+ lock_prdt_page_free_from_discard(
+ block, &lock_sys.prdt_page_hash);
+ lock_mutex_exit();
+ }
+ lock_update_copy_and_discard(father_block, block);
+ }
+
+ /* Go upward to root page, decrementing levels by one. */
+ for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) {
+ ut_ad(btr_page_get_level(blocks[i]->frame) == page_level + 1);
+ btr_page_set_level(blocks[i], page_level, mtr);
+ }
+
+ if (dict_index_is_spatial(index)) {
+ rtr_check_discard_page(index, NULL, block);
+ }
+
+ /* Free the file page */
+ btr_page_free(index, block, mtr);
+
+ /* We play it safe and reset the free bits for the father */
+ if (!dict_index_is_clust(index)
+ && !index->table->is_temporary()) {
+ ibuf_reset_free_bits(father_block);
+ }
+ ut_ad(page_validate(father_block->frame, index));
+ ut_ad(btr_check_node_ptr(index, father_block, mtr));
+
+ return(lift_father_up ? block_orig : father_block);
+}
+
+/*************************************************************//**
+Tries to merge the page first to the left immediate brother if such a
+brother exists, and the node pointers to the current page and to the brother
+reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches to the
+brothers, if they exist.
+@return TRUE on success */
+ibool
+btr_compress(
+/*=========*/
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to merge
+ or lift; the page must not be empty:
+ when deleting records, use btr_discard_page()
+ if the page would become empty */
+ ibool adjust, /*!< in: TRUE if should adjust the
+ cursor position even if compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dict_index_t* index;
+ buf_block_t* merge_block;
+ page_t* merge_page = NULL;
+ page_zip_des_t* merge_page_zip;
+ ibool is_left;
+ buf_block_t* block;
+ page_t* page;
+ btr_cur_t father_cursor;
+ mem_heap_t* heap;
+ rec_offs* offsets;
+ ulint nth_rec = 0; /* remove bogus warning */
+ bool mbr_changed = false;
+#ifdef UNIV_DEBUG
+ bool leftmost_child;
+#endif
+ DBUG_ENTER("btr_compress");
+
+ block = btr_cur_get_block(cursor);
+ page = btr_cur_get_page(cursor);
+ index = btr_cur_get_index(cursor);
+
+ btr_assert_not_corrupted(block, index);
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+
+ MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS);
+
+ const uint32_t left_page_no = btr_page_get_prev(page);
+ const uint32_t right_page_no = btr_page_get_next(page);
+
+#ifdef UNIV_DEBUG
+ if (!page_is_leaf(page) && left_page_no == FIL_NULL) {
+ ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ page_rec_get_next(page_get_infimum_rec(page)),
+ page_is_comp(page)));
+ }
+#endif /* UNIV_DEBUG */
+
+ heap = mem_heap_create(100);
+
+ if (dict_index_is_spatial(index)) {
+ offsets = rtr_page_get_father_block(
+ NULL, heap, index, block, mtr, cursor, &father_cursor);
+ ut_ad(cursor->page_cur.block->page.id() == block->page.id());
+ rec_t* my_rec = father_cursor.page_cur.rec;
+
+ ulint page_no = btr_node_ptr_get_child_page_no(my_rec, offsets);
+
+ if (page_no != block->page.id().page_no()) {
+ ib::info() << "father positioned on page "
+ << page_no << "instead of "
+ << block->page.id().page_no();
+ offsets = btr_page_get_father_block(
+ NULL, heap, index, block, mtr, &father_cursor);
+ }
+ } else {
+ offsets = btr_page_get_father_block(
+ NULL, heap, index, block, mtr, &father_cursor);
+ }
+
+ if (adjust) {
+ nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
+ ut_ad(nth_rec > 0);
+ }
+
+ if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) {
+ /* The page is the only one on the level, lift the records
+ to the father */
+
+ merge_block = btr_lift_page_up(index, block, mtr);
+ goto func_exit;
+ }
+
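+ /* In debug builds, remember whether 'block' is the leftmost child
+ of its father; if so, a left-hand merge target has a different
+ parent, and btr_check_node_ptr() cannot be used on merge_block at
+ the end of this function. */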
+ ut_d(leftmost_child =
+ left_page_no != FIL_NULL
+ && (page_rec_get_next(
+ page_get_infimum_rec(
+ btr_cur_get_page(&father_cursor)))
+ == btr_cur_get_rec(&father_cursor)));
+
+ /* Decide the page to which we try to merge and which will inherit
+ the locks */
+
+ is_left = btr_can_merge_with_page(cursor, left_page_no,
+ &merge_block, mtr);
+
+ DBUG_EXECUTE_IF("ib_always_merge_right", is_left = FALSE;);
+retry:
+ if (!is_left
+ && !btr_can_merge_with_page(cursor, right_page_no, &merge_block,
+ mtr)) {
+ if (!merge_block) {
+ merge_page = NULL;
+ }
+ goto err_exit;
+ }
+
+ merge_page = buf_block_get_frame(merge_block);
+
+#ifdef UNIV_BTR_DEBUG
+ if (is_left) {
+ ut_a(btr_page_get_next(merge_page)
+ == block->page.id().page_no());
+ } else {
+ ut_a(btr_page_get_prev(merge_page)
+ == block->page.id().page_no());
+ }
+#endif /* UNIV_BTR_DEBUG */
+
+ ut_ad(page_validate(merge_page, index));
+
+ merge_page_zip = buf_block_get_page_zip(merge_block);
+#ifdef UNIV_ZIP_DEBUG
+ if (merge_page_zip) {
+ const page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(page_zip);
+ ut_a(page_zip_validate(merge_page_zip, merge_page, index));
+ ut_a(page_zip_validate(page_zip, page, index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* Move records to the merge page */
+ if (is_left) {
+ btr_cur_t cursor2;
+ rtr_mbr_t new_mbr;
+ rec_offs* offsets2 = NULL;
+
+ /* For rtree, we need to update father's mbr. */
+ if (index->is_spatial()) {
+ /* We only support merging pages that have the same
+ parent page */
+ if (!rtr_check_same_block(
+ index, &cursor2,
+ btr_cur_get_block(&father_cursor),
+ merge_block, heap)) {
+ is_left = false;
+ goto retry;
+ }
+
+ /* Set rtr_info for cursor2, since it is
+ necessary in recursive page merge. */
+ cursor2.rtr_info = cursor->rtr_info;
+ cursor2.tree_height = cursor->tree_height;
+
+ offsets2 = rec_get_offsets(
+ btr_cur_get_rec(&cursor2), index, NULL,
+ page_is_leaf(cursor2.page_cur.block->frame)
+ ? index->n_fields : 0,
+ ULINT_UNDEFINED, &heap);
+
+ /* Check if parent entry needs to be updated */
+ mbr_changed = rtr_merge_mbr_changed(
+ &cursor2, &father_cursor,
+ offsets2, offsets, &new_mbr);
+ }
+
+ rec_t* orig_pred = page_copy_rec_list_start(
+ merge_block, block, page_get_supremum_rec(page),
+ index, mtr);
+
+ if (!orig_pred) {
+ goto err_exit;
+ }
+
+ btr_search_drop_page_hash_index(block);
+
+ /* Remove the page from the level list */
+ btr_level_list_remove(*block, *index, mtr);
+
+ if (dict_index_is_spatial(index)) {
+ rec_t* my_rec = father_cursor.page_cur.rec;
+
+ ulint page_no = btr_node_ptr_get_child_page_no(
+ my_rec, offsets);
+
+ if (page_no != block->page.id().page_no()) {
+ ib::fatal() << "father positioned on "
+ << page_no << " instead of "
+ << block->page.id().page_no();
+ }
+
+ if (mbr_changed) {
+#ifdef UNIV_DEBUG
+ bool success = rtr_update_mbr_field(
+ &cursor2, offsets2, &father_cursor,
+ merge_page, &new_mbr, NULL, mtr);
+
+ ut_ad(success);
+#else
+ rtr_update_mbr_field(
+ &cursor2, offsets2, &father_cursor,
+ merge_page, &new_mbr, NULL, mtr);
+#endif
+ } else {
+ rtr_node_ptr_delete(&father_cursor, mtr);
+ }
+
+ /* No gap locks need to be worried about here */
+ lock_mutex_enter();
+ lock_prdt_page_free_from_discard(
+ block, &lock_sys.prdt_page_hash);
+ lock_rec_free_all_from_discard_page(block);
+ lock_mutex_exit();
+ } else {
+ btr_cur_node_ptr_delete(&father_cursor, mtr);
+ if (!dict_table_is_locking_disabled(index->table)) {
+ lock_update_merge_left(
+ merge_block, orig_pred, block);
+ }
+ }
+
+ if (adjust) {
+ nth_rec += page_rec_get_n_recs_before(orig_pred);
+ }
+ } else {
+ rec_t* orig_succ;
+ ibool compressed;
+ dberr_t err;
+ btr_cur_t cursor2;
+ /* father cursor pointing to the node ptr
+ of the right sibling */
+#ifdef UNIV_BTR_DEBUG
+ byte fil_page_prev[4];
+#endif /* UNIV_BTR_DEBUG */
+
+ if (dict_index_is_spatial(index)) {
+ cursor2.rtr_info = NULL;
+
+ /* For a spatial index, we disallow the merge of blocks
+ with different parents, since the merge would need
+ to update the entry (for MBR and primary key) in the
+ parent of the block being merged */
+ if (!rtr_check_same_block(
+ index, &cursor2,
+ btr_cur_get_block(&father_cursor),
+ merge_block, heap)) {
+ goto err_exit;
+ }
+
+ /* Set rtr_info for cursor2, since it is
+ necessary in recursive page merge. */
+ cursor2.rtr_info = cursor->rtr_info;
+ cursor2.tree_height = cursor->tree_height;
+ } else {
+ btr_page_get_father(index, merge_block, mtr, &cursor2);
+ }
+
+ if (merge_page_zip && left_page_no == FIL_NULL) {
+
+ /* The function page_zip_compress(), which will be
+ invoked by page_copy_rec_list_end() below,
+ requires that FIL_PAGE_PREV be FIL_NULL.
+ Clear the field, but prepare to restore it. */
+ static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
+#ifdef UNIV_BTR_DEBUG
+ memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
+#endif /* UNIV_BTR_DEBUG */
+ compile_time_assert(FIL_NULL == 0xffffffffU);
+ memset_aligned<4>(merge_page + FIL_PAGE_PREV, 0xff, 4);
+ }
+
+ orig_succ = page_copy_rec_list_end(merge_block, block,
+ page_get_infimum_rec(page),
+ cursor->index, mtr);
+
+ if (!orig_succ) {
+ ut_a(merge_page_zip);
+#ifdef UNIV_BTR_DEBUG
+ if (left_page_no == FIL_NULL) {
+ /* FIL_PAGE_PREV was restored from
+ merge_page_zip. */
+ ut_a(!memcmp(fil_page_prev,
+ merge_page + FIL_PAGE_PREV, 4));
+ }
+#endif /* UNIV_BTR_DEBUG */
+ goto err_exit;
+ }
+
+ btr_search_drop_page_hash_index(block);
+
+#ifdef UNIV_BTR_DEBUG
+ if (merge_page_zip && left_page_no == FIL_NULL) {
+
+ /* Restore FIL_PAGE_PREV in order to avoid an assertion
+ failure in btr_level_list_remove(), which will set
+ the field again to FIL_NULL. Even though this makes
+ merge_page and merge_page_zip inconsistent for a
+ split second, it is harmless, because the pages
+ are X-latched. */
+ memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
+ }
+#endif /* UNIV_BTR_DEBUG */
+
+ /* Remove the page from the level list */
+ btr_level_list_remove(*block, *index, mtr);
+
+ ut_ad(btr_node_ptr_get_child_page_no(
+ btr_cur_get_rec(&father_cursor), offsets)
+ == block->page.id().page_no());
+
+ /* Replace the address of the old child node (= page) with the
+ address of the merge page to the right */
+ btr_node_ptr_set_child_page_no(
+ btr_cur_get_block(&father_cursor),
+ btr_cur_get_rec(&father_cursor),
+ offsets, right_page_no, mtr);
+
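+ /* The father record of 'block' now points to the right
+ sibling; the sibling's original node pointer (at cursor2)
+ is removed or merged below. */
+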
+#ifdef UNIV_DEBUG
+ if (!page_is_leaf(page) && left_page_no == FIL_NULL) {
+ ut_ad(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ page_rec_get_next(page_get_infimum_rec(
+ buf_block_get_frame(merge_block))),
+ page_is_comp(page)));
+ }
+#endif /* UNIV_DEBUG */
+
+ /* For rtree, we need to update father's mbr. */
+ if (index->is_spatial()) {
+ rec_offs* offsets2;
+ ulint rec_info;
+
+ offsets2 = rec_get_offsets(
+ btr_cur_get_rec(&cursor2), index, NULL,
+ page_is_leaf(cursor2.page_cur.block->frame)
+ ? index->n_fields : 0,
+ ULINT_UNDEFINED, &heap);
+
+ ut_ad(btr_node_ptr_get_child_page_no(
+ btr_cur_get_rec(&cursor2), offsets2)
+ == right_page_no);
+
+ rec_info = rec_get_info_bits(
+ btr_cur_get_rec(&father_cursor),
+ rec_offs_comp(offsets));
+ if (rec_info & REC_INFO_MIN_REC_FLAG) {
+ /* When the father node ptr is the minimum rec,
+ we keep it and delete the node ptr of the
+ merge page. */
+ rtr_merge_and_update_mbr(&father_cursor,
+ &cursor2,
+ offsets, offsets2,
+ merge_page, mtr);
+ } else {
+ /* Otherwise, we keep the node ptr of the
+ merge page and delete the father node ptr,
+ in order to keep the rec order on the upper
+ level. */
+ rtr_merge_and_update_mbr(&cursor2,
+ &father_cursor,
+ offsets2, offsets,
+ merge_page, mtr);
+ }
+ lock_mutex_enter();
+ lock_prdt_page_free_from_discard(
+ block, &lock_sys.prdt_page_hash);
+ lock_rec_free_all_from_discard_page(block);
+ lock_mutex_exit();
+ } else {
+
+ compressed = btr_cur_pessimistic_delete(&err, TRUE,
+ &cursor2,
+ BTR_CREATE_FLAG,
+ false, mtr);
+ ut_a(err == DB_SUCCESS);
+
+ if (!compressed) {
+ btr_cur_compress_if_useful(&cursor2,
+ FALSE,
+ mtr);
+ }
+
+ if (!dict_table_is_locking_disabled(index->table)) {
+ lock_update_merge_right(
+ merge_block, orig_succ, block);
+ }
+ }
+ }
+
+ if (!dict_index_is_clust(index)
+ && !index->table->is_temporary()
+ && page_is_leaf(merge_page)) {
+ /* Update the free bits of the B-tree page in the
+ insert buffer bitmap. This has to be done in a
+ separate mini-transaction that is committed before the
+ main mini-transaction. We cannot update the insert
+ buffer bitmap in this mini-transaction, because
+ btr_compress() can be invoked recursively without
+ committing the mini-transaction in between. Since
+ insert buffer bitmap pages have a lower rank than
+ B-tree pages, we must not access other pages in the
+ same mini-transaction after accessing an insert buffer
+ bitmap page. */
+
+ /* The free bits in the insert buffer bitmap must
+ never exceed the free space on a page. It is safe to
+ decrement or reset the bits in the bitmap in a
+ mini-transaction that is committed before the
+ mini-transaction that affects the free space. */
+
+ /* It is unsafe to increment the bits in a separately
+ committed mini-transaction, because in crash recovery,
+ the free bits could momentarily be set too high. */
+
+ if (merge_block->zip_size()) {
+ /* Because the free bits may be incremented
+ and we cannot update the insert buffer bitmap
+ in the same mini-transaction, the only safe
+ thing we can do here is the pessimistic
+ approach: reset the free bits. */
+ ibuf_reset_free_bits(merge_block);
+ } else {
+ /* On uncompressed pages, the free bits will
+ never increase here. Thus, it is safe to
+ write the bits accurately in a separate
+ mini-transaction. */
+ ibuf_update_free_bits_if_full(merge_block,
+ srv_page_size,
+ ULINT_UNDEFINED);
+ }
+ }
+
+ ut_ad(page_validate(merge_page, index));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page,
+ index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (dict_index_is_spatial(index)) {
+ rtr_check_discard_page(index, NULL, block);
+ }
+
+ /* Free the file page */
+ btr_page_free(index, block, mtr);
+
+ /* btr_check_node_ptr() needs the parent block latched.
+ If merge_block's parent block is not the same,
+ we cannot use btr_check_node_ptr(). */
+ ut_ad(leftmost_child
+ || btr_check_node_ptr(index, merge_block, mtr));
+func_exit:
+ mem_heap_free(heap);
+
+ if (adjust) {
+ ut_ad(nth_rec > 0);
+ btr_cur_position(
+ index,
+ page_rec_get_nth(merge_block->frame, nth_rec),
+ merge_block, cursor);
+ }
+
+ MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL);
+
+ DBUG_RETURN(TRUE);
+
+err_exit:
+ /* We play it safe and reset the free bits. */
+ if (merge_block && merge_block->zip_size()
+ && page_is_leaf(merge_block->frame)
+ && !dict_index_is_clust(index)) {
+
+ ibuf_reset_free_bits(merge_block);
+ }
+
+ mem_heap_free(heap);
+ DBUG_RETURN(FALSE);
+}
+
+/*************************************************************//**
+Discards a page that is the only page on its level. This will empty
+the whole B-tree, leaving just an empty root page. This function
+should almost never be reached, because btr_compress(), which is invoked in
+delete operations, calls btr_lift_page_up() to flatten the B-tree. */
+ATTRIBUTE_COLD
+static
+void
+btr_discard_only_page_on_level(
+/*===========================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page which is the only one on its level */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint page_level = 0;
+
+ ut_ad(!index->is_dummy);
+
+ /* Save the PAGE_MAX_TRX_ID from the leaf page. */
+ const trx_id_t max_trx_id = page_get_max_trx_id(block->frame);
+ const rec_t* r = page_rec_get_next(page_get_infimum_rec(block->frame));
+ ut_ad(rec_is_metadata(r, *index) == index->is_instant());
+
+ while (block->page.id().page_no() != dict_index_get_page(index)) {
+ btr_cur_t cursor;
+ buf_block_t* father;
+ const page_t* page = buf_block_get_frame(block);
+
+ ut_a(page_get_n_recs(page) == 1);
+ ut_a(page_level == btr_page_get_level(page));
+ ut_a(!page_has_siblings(page));
+ ut_ad(fil_page_index_page_check(page));
+ ut_ad(block->page.id().space() == index->table->space->id);
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ btr_search_drop_page_hash_index(block);
+
+ if (dict_index_is_spatial(index)) {
+ /* Check any concurrent search having this page */
+ rtr_check_discard_page(index, NULL, block);
+ rtr_page_get_father(index, block, mtr, NULL, &cursor);
+ } else {
+ btr_page_get_father(index, block, mtr, &cursor);
+ }
+ father = btr_cur_get_block(&cursor);
+
+ if (!dict_table_is_locking_disabled(index->table)) {
+ lock_update_discard(
+ father, PAGE_HEAP_NO_SUPREMUM, block);
+ }
+
+ /* Free the file page */
+ btr_page_free(index, block, mtr);
+
+ block = father;
+ page_level++;
+ }
+
+ /* block is the root page, which must be empty, except
+ for the node pointer to the (now discarded) block(s). */
+ ut_ad(!page_has_siblings(block->frame));
+
+#ifdef UNIV_BTR_DEBUG
+ if (!dict_index_is_ibuf(index)) {
+ const page_t* root = buf_block_get_frame(block);
+ const ulint space = index->table->space_id;
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + root, space));
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, space));
+ }
+#endif /* UNIV_BTR_DEBUG */
+
+ mem_heap_t* heap = nullptr;
+ const rec_t* rec = nullptr;
+ rec_offs* offsets = nullptr;
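+ /* If the table may contain instant ALTER TABLE metadata,
+ preserve a copy of the metadata record before the root is
+ emptied, so that it can be re-inserted afterwards. */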
+ if (index->table->instant || index->must_avoid_clear_instant_add()) {
+ if (!rec_is_metadata(r, *index)) {
+ } else if (!index->table->instant
+ || rec_is_alter_metadata(r, *index)) {
+ heap = mem_heap_create(srv_page_size);
+ offsets = rec_get_offsets(r, index, nullptr,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ rec = rec_copy(mem_heap_alloc(heap,
+ rec_offs_size(offsets)),
+ r, offsets);
+ rec_offs_make_valid(rec, index, true, offsets);
+ }
+ }
+
+ btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);
+ ut_ad(page_is_leaf(buf_block_get_frame(block)));
+ /* btr_page_empty() is supposed to zero-initialize the field. */
+ ut_ad(!page_get_instant(block->frame));
+
+ if (index->is_primary()) {
+ if (rec) {
+ page_cur_t cur;
+ page_cur_set_before_first(block, &cur);
+ DBUG_ASSERT(index->table->instant);
+ DBUG_ASSERT(rec_is_alter_metadata(rec, *index));
+ btr_set_instant(block, *index, mtr);
+ rec = page_cur_insert_rec_low(&cur, index, rec,
+ offsets, mtr);
+ ut_ad(rec);
+ mem_heap_free(heap);
+ } else if (index->is_instant()) {
+ index->clear_instant_add();
+ }
+ } else if (!index->table->is_temporary()) {
+ /* We play it safe and reset the free bits for the root */
+ ibuf_reset_free_bits(block);
+
+ ut_a(max_trx_id);
+ page_set_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ max_trx_id, mtr);
+ }
+}
+
+/*************************************************************//**
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty. */
+void
+btr_discard_page(
+/*=============*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
+ the root page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ buf_block_t* merge_block;
+ buf_block_t* block;
+ btr_cur_t parent_cursor;
+
+ block = btr_cur_get_block(cursor);
+ index = btr_cur_get_index(cursor);
+
+ ut_ad(dict_index_get_page(index) != block->page.id().page_no());
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+
+ MONITOR_INC(MONITOR_INDEX_DISCARD);
+
+ if (dict_index_is_spatial(index)) {
+ rtr_page_get_father(index, block, mtr, cursor, &parent_cursor);
+ } else {
+ btr_page_get_father(index, block, mtr, &parent_cursor);
+ }
+
+ /* Decide the page which will inherit the locks */
+
+ const uint32_t left_page_no = btr_page_get_prev(block->frame);
+ const uint32_t right_page_no = btr_page_get_next(block->frame);
+
+ ut_d(bool parent_is_different = false);
+ if (left_page_no != FIL_NULL) {
+ merge_block = btr_block_get(*index, left_page_no, RW_X_LATCH,
+ true, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_next(merge_block->frame)
+ == block->page.id().page_no());
+#endif /* UNIV_BTR_DEBUG */
+ ut_d(parent_is_different =
+ (page_rec_get_next(
+ page_get_infimum_rec(
+ btr_cur_get_page(
+ &parent_cursor)))
+ == btr_cur_get_rec(&parent_cursor)));
+ } else if (right_page_no != FIL_NULL) {
+ merge_block = btr_block_get(*index, right_page_no, RW_X_LATCH,
+ true, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_prev(merge_block->frame)
+ == block->page.id().page_no());
+#endif /* UNIV_BTR_DEBUG */
+ ut_d(parent_is_different = page_rec_is_supremum(
+ page_rec_get_next(btr_cur_get_rec(&parent_cursor))));
+ if (!page_is_leaf(merge_block->frame)) {
+ rec_t* node_ptr = page_rec_get_next(
+ page_get_infimum_rec(merge_block->frame));
+ ut_ad(page_rec_is_user_rec(node_ptr));
+ /* We have to mark the leftmost node pointer as the
+ predefined minimum record. */
+ btr_set_min_rec_mark<true>(node_ptr, *merge_block,
+ mtr);
+ }
+ } else {
+ btr_discard_only_page_on_level(index, block, mtr);
+
+ return;
+ }
+
+ ut_a(page_is_comp(merge_block->frame) == page_is_comp(block->frame));
+ ut_ad(!memcmp_aligned<2>(&merge_block->frame[PAGE_HEADER + PAGE_LEVEL],
+ &block->frame[PAGE_HEADER + PAGE_LEVEL], 2));
+ btr_search_drop_page_hash_index(block);
+
+ if (dict_index_is_spatial(index)) {
+ rtr_node_ptr_delete(&parent_cursor, mtr);
+ } else {
+ btr_cur_node_ptr_delete(&parent_cursor, mtr);
+ }
+
+ /* Remove the page from the level list */
+ btr_level_list_remove(*block, *index, mtr);
+
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_zip_des_t* merge_page_zip
+ = buf_block_get_page_zip(merge_block);
+ ut_a(!merge_page_zip
+ || page_zip_validate(merge_page_zip, merge_block->frame,
+ index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (!dict_table_is_locking_disabled(index->table)) {
+ if (left_page_no != FIL_NULL) {
+ lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
+ block);
+ } else {
+ lock_update_discard(merge_block,
+ lock_get_min_heap_no(merge_block),
+ block);
+ }
+ }
+
+ if (dict_index_is_spatial(index)) {
+ rtr_check_discard_page(index, cursor, block);
+ }
+
+ /* Free the file page */
+ btr_page_free(index, block, mtr);
+
+ /* btr_check_node_ptr() needs the parent block latched.
+ If merge_block's parent block is not the same,
+ we cannot use btr_check_node_ptr(). */
+ ut_ad(parent_is_different
+ || btr_check_node_ptr(index, merge_block, mtr));
+
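+ /* If the parent is the root page and now contains only a
+ single node pointer, lift the merge page up to reduce the
+ tree height. */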
+ if (btr_cur_get_block(&parent_cursor)->page.id().page_no()
+ == index->page
+ && !page_has_siblings(btr_cur_get_page(&parent_cursor))
+ && page_get_n_recs(btr_cur_get_page(&parent_cursor)) == 1) {
+ btr_lift_page_up(index, merge_block, mtr);
+ }
+}
+
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree. */
+void
+btr_print_size(
+/*===========*/
+ dict_index_t* index) /*!< in: index tree */
+{
+ page_t* root;
+ fseg_header_t* seg;
+ mtr_t mtr;
+
+ if (dict_index_is_ibuf(index)) {
+ fputs("Sorry, cannot print info of an ibuf tree:"
+ " use ibuf functions\n", stderr);
+
+ return;
+ }
+
+ mtr_start(&mtr);
+
+ root = btr_root_get(index, &mtr);
+
+ seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+
+ fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
+ fseg_print(seg, &mtr);
+
+ if (!dict_index_is_ibuf(index)) {
+
+ seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+
+ fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
+ fseg_print(seg, &mtr);
+ }
+
+ mtr_commit(&mtr);
+}
+
+/************************************************************//**
+Prints recursively index tree pages. */
+static
+void
+btr_print_recursive(
+/*================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: index page */
+ ulint width, /*!< in: print this many entries from start
+ and end */
+ mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
+ rec_offs** offsets,/*!< in/out: buffer for rec_get_offsets() */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const page_t* page = buf_block_get_frame(block);
+ page_cur_t cursor;
+ ulint n_recs;
+ ulint i = 0;
+ mtr_t mtr2;
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX));
+
+ ib::info() << "NODE ON LEVEL " << btr_page_get_level(page)
+ << " page " << block->page.id;
+
+ page_print(block, index, width, width);
+
+ n_recs = page_get_n_recs(page);
+
+ page_cur_set_before_first(block, &cursor);
+ page_cur_move_to_next(&cursor);
+
+ while (!page_cur_is_after_last(&cursor)) {
+
+ if (page_is_leaf(page)) {
+
+ /* If this is the leaf level, do nothing */
+
+ } else if ((i <= width) || (i >= n_recs - width)) {
+
+ const rec_t* node_ptr;
+
+ mtr_start(&mtr2);
+
+ node_ptr = page_cur_get_rec(&cursor);
+
+ *offsets = rec_get_offsets(
+ node_ptr, index, *offsets, 0,
+ ULINT_UNDEFINED, heap);
+ btr_print_recursive(index,
+ btr_node_ptr_get_child(node_ptr,
+ index,
+ *offsets,
+ &mtr2),
+ width, heap, offsets, &mtr2);
+ mtr_commit(&mtr2);
+ }
+
+ page_cur_move_to_next(&cursor);
+ i++;
+ }
+}
+
+/**************************************************************//**
+Prints directories and other info of all nodes in the tree. */
+void
+btr_print_index(
+/*============*/
+ dict_index_t* index, /*!< in: index */
+ ulint width) /*!< in: print this many entries from start
+ and end */
+{
+ mtr_t mtr;
+ buf_block_t* root;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ fputs("--------------------------\n"
+ "INDEX TREE PRINT\n", stderr);
+
+ mtr_start(&mtr);
+
+ root = btr_root_block_get(index, RW_SX_LATCH, &mtr);
+
+ btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ mtr_commit(&mtr);
+
+ ut_ad(btr_validate_index(index, 0));
+}
+#endif /* UNIV_BTR_PRINT */
+
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Checks that the node pointer to a page is appropriate.
+@return TRUE */
+ibool
+btr_check_node_ptr(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: index page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mem_heap_t* heap;
+ dtuple_t* tuple;
+ rec_offs* offsets;
+ btr_cur_t cursor;
+ page_t* page = buf_block_get_frame(block);
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+
+ if (dict_index_get_page(index) == block->page.id().page_no()) {
+
+ return(TRUE);
+ }
+
+ heap = mem_heap_create(256);
+
+ if (dict_index_is_spatial(index)) {
+ offsets = rtr_page_get_father_block(NULL, heap, index, block, mtr,
+ NULL, &cursor);
+ } else {
+ offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
+ &cursor);
+ }
+
+ if (page_is_leaf(page)) {
+
+ goto func_exit;
+ }
+
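+ /* Build a node pointer from the first user record on the page
+ and check that it matches the node pointer record found in the
+ father page. */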
+ tuple = dict_index_build_node_ptr(
+ index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap,
+ btr_page_get_level(page));
+
+ /* For a spatial index, the MBR in the parent rec could
+ differ from that of the first rec of the child; their
+ relationship should be a "WITHIN" relationship */
+ if (dict_index_is_spatial(index)) {
+ ut_a(!cmp_dtuple_rec_with_gis(
+ tuple, btr_cur_get_rec(&cursor),
+ PAGE_CUR_WITHIN));
+ } else {
+ ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets));
+ }
+func_exit:
+ mem_heap_free(heap);
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+Display identification information for a record. */
+static
+void
+btr_index_rec_validate_report(
+/*==========================*/
+ const page_t* page, /*!< in: index page */
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index) /*!< in: index */
+{
+ ib::info() << "Record in index " << index->name
+ << " of table " << index->table->name
+ << ", page " << page_id_t(page_get_space_id(page),
+ page_get_page_no(page))
+ << ", at offset " << page_offset(rec);
+}
+
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+@return TRUE if ok */
+ibool
+btr_index_rec_validate(
+/*===================*/
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index, /*!< in: index */
+ ibool dump_on_error) /*!< in: TRUE if the function
+ should print hex dump of record
+ and page on error */
+{
+ ulint len;
+ const page_t* page;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ page = page_align(rec);
+
+ ut_ad(index->n_core_fields);
+
+ if (index->is_ibuf()) {
+ /* The insert buffer index tree can contain records from any
+ other index: we cannot check the number of fields or
+ their length */
+
+ return(TRUE);
+ }
+
+#ifdef VIRTUAL_INDEX_DEBUG
+ if (dict_index_has_virtual(index)) {
+ fprintf(stderr, "index name is %s\n", index->name());
+ }
+#endif
+ if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) {
+ btr_index_rec_validate_report(page, rec, index);
+
+ ib::error() << "Compact flag=" << !!page_is_comp(page)
+ << ", should be " << dict_table_is_comp(index->table);
+
+ return(FALSE);
+ }
+
+ const bool is_alter_metadata = page_is_leaf(page)
+ && !page_has_prev(page)
+ && index->is_primary() && index->table->instant
+ && rec == page_rec_get_next_const(page_get_infimum_rec(page));
+
+ if (is_alter_metadata
+ && !rec_is_alter_metadata(rec, page_is_comp(page))) {
+ btr_index_rec_validate_report(page, rec, index);
+
+ ib::error() << "First record is not ALTER TABLE metadata";
+ return FALSE;
+ }
+
+ if (!page_is_comp(page)) {
+ const ulint n_rec_fields = rec_get_n_fields_old(rec);
+ if (n_rec_fields == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD
+ && index->id == DICT_INDEXES_ID) {
+ /* A record for older SYS_INDEXES table
+ (missing merge_threshold column) is acceptable. */
+ } else if (is_alter_metadata) {
+ if (n_rec_fields != ulint(index->n_fields) + 1) {
+ goto n_field_mismatch;
+ }
+ } else if (n_rec_fields < index->n_core_fields
+ || n_rec_fields > index->n_fields) {
+n_field_mismatch:
+ btr_index_rec_validate_report(page, rec, index);
+
+ ib::error() << "Has " << rec_get_n_fields_old(rec)
+ << " fields, should have "
+ << index->n_core_fields << ".."
+ << index->n_fields;
+
+ if (dump_on_error) {
+ fputs("InnoDB: corrupt record ", stderr);
+ rec_print_old(stderr, rec);
+ putc('\n', stderr);
+ }
+ return(FALSE);
+ }
+ }
+
+ offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+ const dict_field_t* field = index->fields;
+ ut_ad(rec_offs_n_fields(offsets)
+ == ulint(index->n_fields) + is_alter_metadata);
+
+ for (unsigned i = 0; i < rec_offs_n_fields(offsets); i++) {
+ rec_get_nth_field_offs(offsets, i, &len);
+
+ ulint fixed_size;
+
+ if (is_alter_metadata && i == index->first_user_field()) {
+ fixed_size = FIELD_REF_SIZE;
+ if (len != FIELD_REF_SIZE
+ || !rec_offs_nth_extern(offsets, i)) {
+ goto len_mismatch;
+ }
+
+ continue;
+ } else {
+ fixed_size = dict_col_get_fixed_size(
+ field->col, page_is_comp(page));
+ if (rec_offs_nth_extern(offsets, i)) {
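+ /* The column is stored externally: read the
+ length of the externally stored part from the
+ BLOB reference that ends the locally stored
+ prefix. */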
+ const byte* data = rec_get_nth_field(
+ rec, offsets, i, &len);
+ len -= BTR_EXTERN_FIELD_REF_SIZE;
+ ulint extern_len = mach_read_from_4(
+ data + len + BTR_EXTERN_LEN + 4);
+ if (fixed_size == extern_len) {
+ goto next_field;
+ }
+ }
+ }
+
+ /* Note that if fixed_size != 0, it equals the
+ length of a fixed-size column in the clustered index.
+ We should adjust it here.
+ A prefix index of the column has a fixed, but
+ different, length. When fixed_size == 0, prefix_len
+ is the maximum length of the prefix index column. */
+
+ if (len_is_stored(len)
+ && (field->prefix_len
+ ? len > field->prefix_len
+ : (fixed_size && len != fixed_size))) {
+len_mismatch:
+ btr_index_rec_validate_report(page, rec, index);
+ ib::error error;
+
+ error << "Field " << i << " len is " << len
+ << ", should be " << fixed_size;
+
+ if (dump_on_error) {
+ error << "; ";
+ rec_print(error.m_oss, rec,
+ rec_get_info_bits(
+ rec, rec_offs_comp(offsets)),
+ offsets);
+ }
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(FALSE);
+ }
+next_field:
+ field++;
+ }
+
+#ifdef VIRTUAL_INDEX_DEBUG
+ if (dict_index_has_virtual(index)) {
+ rec_print_new(stderr, rec, offsets);
+ }
+#endif
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(TRUE);
+}
+
+/************************************************************//**
+Checks the size and number of fields in records based on the definition of
+the index.
+@return TRUE if ok */
+static
+ibool
+btr_index_page_validate(
+/*====================*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index) /*!< in: index */
+{
+ page_cur_t cur;
+ ibool ret = TRUE;
+#ifndef DBUG_OFF
+ ulint nth = 1;
+#endif /* !DBUG_OFF */
+
+ page_cur_set_before_first(block, &cur);
+
+ /* Directory slot 0 should only contain the infimum record. */
+ DBUG_EXECUTE_IF("check_table_rec_next",
+ ut_a(page_rec_get_nth_const(
+ page_cur_get_page(&cur), 0)
+ == cur.rec);
+ ut_a(page_dir_slot_get_n_owned(
+ page_dir_get_nth_slot(
+ page_cur_get_page(&cur), 0))
+ == 1););
+
+ page_cur_move_to_next(&cur);
+
+ for (;;) {
+ if (page_cur_is_after_last(&cur)) {
+
+ break;
+ }
+
+ if (!btr_index_rec_validate(cur.rec, index, TRUE)) {
+
+ return(FALSE);
+ }
+
+ /* Verify that page_rec_get_nth_const() is correctly
+ retrieving each record. */
+ DBUG_EXECUTE_IF("check_table_rec_next",
+ ut_a(cur.rec == page_rec_get_nth_const(
+ page_cur_get_page(&cur),
+ page_rec_get_n_recs_before(
+ cur.rec)));
+ ut_a(nth++ == page_rec_get_n_recs_before(
+ cur.rec)););
+
+ page_cur_move_to_next(&cur);
+ }
+
+ return(ret);
+}
+
+/************************************************************//**
+Report an error on one page of an index tree. */
+static
+void
+btr_validate_report1(
+/*=================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: B-tree level */
+ const buf_block_t* block) /*!< in: index page */
+{
+ ib::error error;
+ error << "In page " << block->page.id().page_no()
+ << " of index " << index->name
+ << " of table " << index->table->name;
+
+ if (level > 0) {
+ error << ", index tree level " << level;
+ }
+}
+
+/************************************************************//**
+Report an error on two pages of an index tree. */
+static
+void
+btr_validate_report2(
+/*=================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: B-tree level */
+ const buf_block_t* block1, /*!< in: first index page */
+ const buf_block_t* block2) /*!< in: second index page */
+{
+ ib::error error;
+ error << "In pages " << block1->page.id()
+ << " and " << block2->page.id() << " of index " << index->name
+ << " of table " << index->table->name;
+
+ if (level)
+ error << ", index tree level " << level;
+}
+
+/************************************************************//**
+Validates index tree level.
+@return TRUE if ok */
+static
+bool
+btr_validate_level(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ const trx_t* trx, /*!< in: transaction or NULL */
+ ulint level, /*!< in: level number */
+ bool lockout)/*!< in: true if X-latch index is intended */
+{
+ buf_block_t* block;
+ page_t* page;
+ buf_block_t* right_block = 0; /* remove warning */
+ page_t* right_page = 0; /* remove warning */
+ page_t* father_page;
+ btr_cur_t node_cur;
+ btr_cur_t right_node_cur;
+ rec_t* rec;
+ page_cur_t cursor;
+ dtuple_t* node_ptr_tuple;
+ bool ret = true;
+ mtr_t mtr;
+ mem_heap_t* heap = mem_heap_create(256);
+ rec_offs* offsets = NULL;
+ rec_offs* offsets2= NULL;
+#ifdef UNIV_ZIP_DEBUG
+ page_zip_des_t* page_zip;
+#endif /* UNIV_ZIP_DEBUG */
+ ulint savepoint = 0;
+ ulint savepoint2 = 0;
+ uint32_t parent_page_no = FIL_NULL;
+ uint32_t parent_right_page_no = FIL_NULL;
+ bool rightmost_child = false;
+
+ mtr.start();
+
+ if (!srv_read_only_mode) {
+ if (lockout) {
+ mtr_x_lock_index(index, &mtr);
+ } else {
+ mtr_sx_lock_index(index, &mtr);
+ }
+ }
+
+ block = btr_root_block_get(index, RW_SX_LATCH, &mtr);
+ page = buf_block_get_frame(block);
+
+ fil_space_t* space = index->table->space;
+
+ while (level != btr_page_get_level(page)) {
+ const rec_t* node_ptr;
+
+ if (fseg_page_is_free(space, block->page.id().page_no())) {
+
+ btr_validate_report1(index, level, block);
+
+ ib::warn() << "Page is free";
+
+ ret = false;
+ }
+
+ ut_a(index->table->space_id == block->page.id().space());
+ ut_a(block->page.id().space() == page_get_space_id(page));
+#ifdef UNIV_ZIP_DEBUG
+ page_zip = buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ ut_a(!page_is_leaf(page));
+
+ page_cur_set_before_first(block, &cursor);
+ page_cur_move_to_next(&cursor);
+
+ node_ptr = page_cur_get_rec(&cursor);
+ offsets = rec_get_offsets(node_ptr, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ savepoint2 = mtr_set_savepoint(&mtr);
+ block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr);
+ page = buf_block_get_frame(block);
+
+ /* For R-tree, since the record order might not match the
+ order of the linked index pages in the lower level, we need
+ to traverse backwards to reach the first page rec in this
+ level. This is only used for index validation. A spatial
+ index does not use such a scan for any of its DML or query
+ operations. */
+ if (dict_index_is_spatial(index)) {
+ uint32_t left_page_no = btr_page_get_prev(page);
+
+ while (left_page_no != FIL_NULL) {
+ /* To obey latch order of tree blocks,
+ we should release the right_block once to
+ obtain lock of the uncle block. */
+ mtr_release_block_at_savepoint(
+ &mtr, savepoint2, block);
+
+ savepoint2 = mtr_set_savepoint(&mtr);
+ block = btr_block_get(*index, left_page_no,
+ RW_SX_LATCH, false,
+ &mtr);
+ page = buf_block_get_frame(block);
+ left_page_no = btr_page_get_prev(page);
+ }
+ }
+ }
+
+ /* Now we are on the desired level. Loop through the pages on that
+ level. */
+
+loop:
+ mem_heap_empty(heap);
+ offsets = offsets2 = NULL;
+ if (!srv_read_only_mode) {
+ if (lockout) {
+ mtr_x_lock_index(index, &mtr);
+ } else {
+ mtr_sx_lock_index(index, &mtr);
+ }
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ page_zip = buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ ut_a(block->page.id().space() == index->table->space_id);
+
+ if (fseg_page_is_free(space, block->page.id().page_no())) {
+
+ btr_validate_report1(index, level, block);
+
+ ib::warn() << "Page is marked as free";
+ ret = false;
+
+ } else if (btr_page_get_index_id(page) != index->id) {
+
+ ib::error() << "Page index id " << btr_page_get_index_id(page)
+ << " != data dictionary index id " << index->id;
+
+ ret = false;
+
+ } else if (!page_validate(page, index)) {
+
+ btr_validate_report1(index, level, block);
+ ret = false;
+
+ } else if (level == 0 && !btr_index_page_validate(block, index)) {
+
+ /* We are on level 0. Check that the records have the right
+ number of fields, and field lengths are right. */
+
+ ret = false;
+ }
+
+ ut_a(btr_page_get_level(page) == level);
+
+ uint32_t right_page_no = btr_page_get_next(page);
+ uint32_t left_page_no = btr_page_get_prev(page);
+
+ ut_a(!page_is_empty(page)
+ || (level == 0
+ && page_get_page_no(page) == dict_index_get_page(index)));
+
+ if (right_page_no != FIL_NULL) {
+ const rec_t* right_rec;
+ savepoint = mtr_set_savepoint(&mtr);
+
+ right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH,
+ !level, &mtr);
+ right_page = buf_block_get_frame(right_block);
+
+ if (btr_page_get_prev(right_page) != page_get_page_no(page)) {
+ btr_validate_report2(index, level, block, right_block);
+ fputs("InnoDB: broken FIL_PAGE_NEXT"
+ " or FIL_PAGE_PREV links\n", stderr);
+
+ ret = false;
+ }
+
+ if (page_is_comp(right_page) != page_is_comp(page)) {
+ btr_validate_report2(index, level, block, right_block);
+ fputs("InnoDB: 'compact' flag mismatch\n", stderr);
+
+ ret = false;
+
+ goto node_ptr_fails;
+ }
+
+ rec = page_rec_get_prev(page_get_supremum_rec(page));
+ right_rec = page_rec_get_next(page_get_infimum_rec(
+ right_page));
+ offsets = rec_get_offsets(rec, index, offsets,
+ page_is_leaf(page)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+ offsets2 = rec_get_offsets(right_rec, index, offsets2,
+ page_is_leaf(right_page)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+
+ /* For a spatial index, we cannot guarantee the key ordering
+ across pages, so skip the record comparison verification for
+ now. This will be enhanced in a special R-tree index
+ validation scheme. */
+ if (!dict_index_is_spatial(index)
+ && cmp_rec_rec(rec, right_rec,
+ offsets, offsets2, index) >= 0) {
+
+ btr_validate_report2(index, level, block, right_block);
+
+ fputs("InnoDB: records in wrong order"
+ " on adjacent pages\n", stderr);
+
+ fputs("InnoDB: record ", stderr);
+ rec = page_rec_get_prev(page_get_supremum_rec(page));
+ rec_print(stderr, rec, index);
+ putc('\n', stderr);
+ fputs("InnoDB: record ", stderr);
+ rec = page_rec_get_next(
+ page_get_infimum_rec(right_page));
+ rec_print(stderr, rec, index);
+ putc('\n', stderr);
+
+ ret = false;
+ }
+ }
+
+ if (level > 0 && left_page_no == FIL_NULL) {
+ ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ page_rec_get_next(page_get_infimum_rec(page)),
+ page_is_comp(page)));
+ }
+
+ /* Similarly, skip the father node check for spatial indexes for now,
+ for a couple of reasons:
+ 1) As mentioned, there is no ordering relationship between records
+ in the parent level and the linked pages in the child level.
+ 2) Searching for the parent from the root is very costly for R-tree.
+ We will add a special validation mechanism for R-tree later (WL #7520). */
+ if (!dict_index_is_spatial(index)
+ && block->page.id().page_no() != dict_index_get_page(index)) {
+
+ /* Check father node pointers */
+ rec_t* node_ptr;
+
+ btr_cur_position(
+ index, page_rec_get_next(page_get_infimum_rec(page)),
+ block, &node_cur);
+ offsets = btr_page_get_father_node_ptr_for_validate(
+ offsets, heap, &node_cur, &mtr);
+
+ father_page = btr_cur_get_page(&node_cur);
+ node_ptr = btr_cur_get_rec(&node_cur);
+
+ parent_page_no = page_get_page_no(father_page);
+ parent_right_page_no = btr_page_get_next(father_page);
+ rightmost_child = page_rec_is_supremum(
+ page_rec_get_next(node_ptr));
+
+ btr_cur_position(
+ index,
+ page_rec_get_prev(page_get_supremum_rec(page)),
+ block, &node_cur);
+
+ offsets = btr_page_get_father_node_ptr_for_validate(
+ offsets, heap, &node_cur, &mtr);
+
+ if (node_ptr != btr_cur_get_rec(&node_cur)
+ || btr_node_ptr_get_child_page_no(node_ptr, offsets)
+ != block->page.id().page_no()) {
+
+ btr_validate_report1(index, level, block);
+
+ fputs("InnoDB: node pointer to the page is wrong\n",
+ stderr);
+
+ fputs("InnoDB: node ptr ", stderr);
+ rec_print(stderr, node_ptr, index);
+
+ rec = btr_cur_get_rec(&node_cur);
+ fprintf(stderr, "\n"
+ "InnoDB: node ptr child page n:o %u\n",
+ btr_node_ptr_get_child_page_no(rec, offsets));
+
+ fputs("InnoDB: record on page ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ ret = false;
+
+ goto node_ptr_fails;
+ }
+
+ if (!page_is_leaf(page)) {
+ node_ptr_tuple = dict_index_build_node_ptr(
+ index,
+ page_rec_get_next(page_get_infimum_rec(page)),
+ 0, heap, btr_page_get_level(page));
+
+ if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
+ offsets)) {
+ const rec_t* first_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+
+ btr_validate_report1(index, level, block);
+
+ ib::error() << "Node ptrs differ on levels > 0";
+
+ fputs("InnoDB: node ptr ",stderr);
+ rec_print_new(stderr, node_ptr, offsets);
+ fputs("InnoDB: first rec ", stderr);
+ rec_print(stderr, first_rec, index);
+ putc('\n', stderr);
+ ret = false;
+
+ goto node_ptr_fails;
+ }
+ }
+
+ if (left_page_no == FIL_NULL) {
+ ut_a(node_ptr == page_rec_get_next(
+ page_get_infimum_rec(father_page)));
+ ut_a(!page_has_prev(father_page));
+ }
+
+ if (right_page_no == FIL_NULL) {
+ ut_a(node_ptr == page_rec_get_prev(
+ page_get_supremum_rec(father_page)));
+ ut_a(!page_has_next(father_page));
+ } else {
+ const rec_t* right_node_ptr;
+
+ right_node_ptr = page_rec_get_next(node_ptr);
+
+ if (!lockout && rightmost_child) {
+
+ /* To obey latch order of tree blocks,
+ we should release the right_block once to
+ obtain lock of the uncle block. */
+ mtr_release_block_at_savepoint(
+ &mtr, savepoint, right_block);
+
+ if (parent_right_page_no != FIL_NULL) {
+ btr_block_get(*index,
+ parent_right_page_no,
+ RW_SX_LATCH, false,
+ &mtr);
+ }
+
+ right_block = btr_block_get(*index,
+ right_page_no,
+ RW_SX_LATCH,
+ !level, &mtr);
+ }
+
+ btr_cur_position(
+ index, page_rec_get_next(
+ page_get_infimum_rec(
+ buf_block_get_frame(
+ right_block))),
+ right_block, &right_node_cur);
+
+ offsets = btr_page_get_father_node_ptr_for_validate(
+ offsets, heap, &right_node_cur, &mtr);
+
+ if (right_node_ptr
+ != page_get_supremum_rec(father_page)) {
+
+ if (btr_cur_get_rec(&right_node_cur)
+ != right_node_ptr) {
+ ret = false;
+ fputs("InnoDB: node pointer to"
+ " the right page is wrong\n",
+ stderr);
+
+ btr_validate_report1(index, level,
+ block);
+ }
+ } else {
+ page_t* right_father_page
+ = btr_cur_get_page(&right_node_cur);
+
+ if (btr_cur_get_rec(&right_node_cur)
+ != page_rec_get_next(
+ page_get_infimum_rec(
+ right_father_page))) {
+ ret = false;
+ fputs("InnoDB: node pointer 2 to"
+ " the right page is wrong\n",
+ stderr);
+
+ btr_validate_report1(index, level,
+ block);
+ }
+
+ if (page_get_page_no(right_father_page)
+ != btr_page_get_next(father_page)) {
+
+ ret = false;
+ fputs("InnoDB: node pointer 3 to"
+ " the right page is wrong\n",
+ stderr);
+
+ btr_validate_report1(index, level,
+ block);
+ }
+ }
+ }
+ }
+
+node_ptr_fails:
+ /* Commit the mini-transaction to release the latch on 'page'.
+ Re-acquire the latch on right_page, which will become 'page'
+ on the next loop. The page has already been checked. */
+ mtr.commit();
+
+ if (trx_is_interrupted(trx)) {
+ /* On interrupt, return the current status. */
+ } else if (right_page_no != FIL_NULL) {
+
+ mtr.start();
+
+ if (!lockout) {
+ if (rightmost_child) {
+ if (parent_right_page_no != FIL_NULL) {
+ btr_block_get(*index,
+ parent_right_page_no,
+ RW_SX_LATCH, false,
+ &mtr);
+ }
+ } else if (parent_page_no != FIL_NULL) {
+ btr_block_get(*index, parent_page_no,
+ RW_SX_LATCH, false, &mtr);
+ }
+ }
+
+ block = btr_block_get(*index, right_page_no, RW_SX_LATCH,
+ !level, &mtr);
+ page = buf_block_get_frame(block);
+
+ goto loop;
+ }
+
+ mem_heap_free(heap);
+
+ return(ret);
+}
+
+/**************************************************************//**
+Checks the consistency of an index tree.
+@return DB_SUCCESS if ok, error code if not */
+dberr_t
+btr_validate_index(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ const trx_t* trx) /*!< in: transaction or NULL */
+{
+ dberr_t err = DB_SUCCESS;
+ bool lockout = dict_index_is_spatial(index);
+
+ /* Full Text index are implemented by auxiliary tables,
+ not the B-tree */
+ if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) {
+ return(err);
+ }
+
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ if (!srv_read_only_mode) {
+ if (lockout) {
+ mtr_x_lock_index(index, &mtr);
+ } else {
+ mtr_sx_lock_index(index, &mtr);
+ }
+ }
+
+ page_t* root = btr_root_get(index, &mtr);
+
+ if (!root) {
+ mtr_commit(&mtr);
+ return DB_CORRUPTION;
+ }
+
+ ulint n = btr_page_get_level(root);
+
+ btr_validate_index_running++;
+ for (ulint i = 0; i <= n; ++i) {
+
+ if (!btr_validate_level(index, trx, n - i, lockout)) {
+ err = DB_CORRUPTION;
+ }
+ }
+
+ mtr_commit(&mtr);
+ /* In theory we need a release barrier here, so that the
+ btr_validate_index_running decrement is guaranteed to
+ happen after the latches are released.
+
+ The original code issued SEQ_CST on the update and a
+ non-atomic access on the load, which means its
+ synchronisation was broken as well. */
+ btr_validate_index_running--;
+
+ return(err);
+}
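+
+/* A sketch of the level walk above: for a tree whose root is at level
+n and whose leaves are at level 0, btr_validate_level() is invoked
+n + 1 times, once per level from the root downwards; each invocation
+follows the FIL_PAGE_NEXT chain, so every page is visited exactly once
+for its own level. */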
+
+/**************************************************************//**
+Checks if the page in the cursor can be merged with the given page.
+If necessary, re-organize the merge_page.
+@return true if possible to merge. */
+static
+bool
+btr_can_merge_with_page(
+/*====================*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to merge */
+ uint32_t page_no, /*!< in: a sibling page */
+ buf_block_t** merge_block, /*!< out: the merge block */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ dict_index_t* index;
+ page_t* page;
+ ulint n_recs;
+ ulint data_size;
+ ulint max_ins_size_reorg;
+ ulint max_ins_size;
+ buf_block_t* mblock;
+ page_t* mpage;
+ DBUG_ENTER("btr_can_merge_with_page");
+
+ if (page_no == FIL_NULL) {
+ *merge_block = NULL;
+ DBUG_RETURN(false);
+ }
+
+ index = btr_cur_get_index(cursor);
+ page = btr_cur_get_page(cursor);
+
+ mblock = btr_block_get(*index, page_no, RW_X_LATCH, page_is_leaf(page),
+ mtr);
+ mpage = buf_block_get_frame(mblock);
+
+ n_recs = page_get_n_recs(page);
+ data_size = page_get_data_size(page);
+
+ max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
+ mpage, n_recs);
+
+ if (data_size > max_ins_size_reorg) {
+ goto error;
+ }
+
+ /* If the compression padding tells us that merging would result
+ in a page packed so tightly that compression is likely to fail,
+ then do not merge the pages. */
+ if (mblock->page.zip.data && page_is_leaf(mpage)
+ && (page_get_data_size(mpage) + data_size
+ >= dict_index_zip_pad_optimal_page_size(index))) {
+
+ goto error;
+ }
+
+ max_ins_size = page_get_max_insert_size(mpage, n_recs);
+
+ if (data_size > max_ins_size) {
+ /* We have to reorganize mpage */
+ if (!btr_page_reorganize_block(page_zip_level, mblock, index,
+ mtr)) {
+ goto error;
+ }
+
+ max_ins_size = page_get_max_insert_size(mpage, n_recs);
+
+ ut_ad(page_validate(mpage, index));
+ ut_ad(max_ins_size == max_ins_size_reorg);
+
+ if (data_size > max_ins_size) {
+
+ /* Add fault tolerance, though this should
+ never happen */
+
+ goto error;
+ }
+ }
+
+ *merge_block = mblock;
+ DBUG_RETURN(true);
+
+error:
+ *merge_block = NULL;
+ DBUG_RETURN(false);
+}
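+
+/* A worked example of the size check above (a sketch with made-up
+numbers): suppose the cursor page holds n_recs = 10 records with
+data_size = 600 bytes. If page_get_max_insert_size_after_reorganize()
+reports only 500 bytes for the sibling, then
+data_size > max_ins_size_reorg and the merge is rejected at once. If
+it reports 800 bytes, the merge can proceed, but the sibling may first
+have to be reorganized when its current max_ins_size is below 600. */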
diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc
new file mode 100644
index 00000000..9004064a
--- /dev/null
+++ b/storage/innobase/btr/btr0bulk.cc
@@ -0,0 +1,1238 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0bulk.cc
+The B-tree bulk load
+
+Created 03/11/2014 Shaohua Wang
+*******************************************************/
+
+#include "btr0bulk.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "ibuf0ibuf.h"
+#include "page0page.h"
+#include "trx0trx.h"
+
+/** InnoDB B-tree index fill factor for bulk load. */
+uint innobase_fill_factor;
+
+/** Initialize members, allocate page if needed and start mtr.
+Note: we commit all mtrs on failure.
+@return error code. */
+dberr_t
+PageBulk::init()
+{
+ buf_block_t* new_block;
+ page_t* new_page;
+
+ ut_ad(m_heap == NULL);
+ m_heap = mem_heap_create(1000);
+
+ m_mtr.start();
+ m_index->set_modified(m_mtr);
+
+ if (m_page_no == FIL_NULL) {
+ mtr_t alloc_mtr;
+
+ /* We commit redo log for allocation by a separate mtr,
+ because we don't guarantee pages are committed following
+ the allocation order, and we will always generate redo log
+ for page allocation, even when creating a new tablespace. */
+ alloc_mtr.start();
+ m_index->set_modified(alloc_mtr);
+
+ uint32_t n_reserved;
+ if (!fsp_reserve_free_extents(&n_reserved,
+ m_index->table->space,
+ 1, FSP_NORMAL, &alloc_mtr)) {
+ alloc_mtr.commit();
+ m_mtr.commit();
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ /* Allocate a new page. */
+ new_block = btr_page_alloc(m_index, 0, FSP_UP, m_level,
+ &alloc_mtr, &m_mtr);
+
+ m_index->table->space->release_free_extents(n_reserved);
+
+ alloc_mtr.commit();
+
+ new_page = buf_block_get_frame(new_block);
+ m_page_no = new_block->page.id().page_no();
+
+ byte* index_id = my_assume_aligned<2>
+ (PAGE_HEADER + PAGE_INDEX_ID + new_page);
+ compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ memset_aligned<8>(new_page + FIL_PAGE_PREV, 0xff, 8);
+
+ if (UNIV_LIKELY_NULL(new_block->page.zip.data)) {
+ mach_write_to_8(index_id, m_index->id);
+ page_create_zip(new_block, m_index, m_level, 0,
+ &m_mtr);
+ } else {
+ ut_ad(!m_index->is_spatial());
+ page_create(new_block, &m_mtr,
+ m_index->table->not_redundant());
+ m_mtr.memset(*new_block, FIL_PAGE_PREV, 8, 0xff);
+ m_mtr.write<2,mtr_t::MAYBE_NOP>(*new_block, PAGE_HEADER
+ + PAGE_LEVEL
+ + new_page, m_level);
+ m_mtr.write<8>(*new_block, index_id, m_index->id);
+ }
+ } else {
+ new_block = btr_block_get(*m_index, m_page_no, RW_X_LATCH,
+ false, &m_mtr);
+
+ new_page = buf_block_get_frame(new_block);
+ ut_ad(new_block->page.id().page_no() == m_page_no);
+
+ ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW);
+
+ btr_page_set_level(new_block, m_level, &m_mtr);
+ }
+
+ m_page_zip = buf_block_get_page_zip(new_block);
+
+ if (!m_level && dict_index_is_sec_or_ibuf(m_index)) {
+ page_update_max_trx_id(new_block, m_page_zip, m_trx_id,
+ &m_mtr);
+ }
+
+ m_block = new_block;
+ m_page = new_page;
+ m_cur_rec = page_get_infimum_rec(new_page);
+ ut_ad(m_is_comp == !!page_is_comp(new_page));
+ m_free_space = page_get_free_space_of_empty(m_is_comp);
+
+ if (innobase_fill_factor == 100 && dict_index_is_clust(m_index)) {
+ /* Keep default behavior compatible with 5.6 */
+ m_reserved_space = dict_index_get_space_reserve();
+ } else {
+ m_reserved_space =
+ srv_page_size * (100 - innobase_fill_factor) / 100;
+ }
+
+ m_padding_space =
+ srv_page_size - dict_index_zip_pad_optimal_page_size(m_index);
+ m_heap_top = page_header_get_ptr(new_page, PAGE_HEAP_TOP);
+ m_rec_no = page_header_get_field(new_page, PAGE_N_RECS);
+ /* Temporarily reset PAGE_DIRECTION_B from PAGE_NO_DIRECTION to 0,
+ without writing redo log, to ensure that needs_finish() will hold
+ on an empty page. */
+ ut_ad(m_page[PAGE_HEADER + PAGE_DIRECTION_B] == PAGE_NO_DIRECTION);
+ m_page[PAGE_HEADER + PAGE_DIRECTION_B] = 0;
+ ut_d(m_total_data = 0);
+
+ return(DB_SUCCESS);
+}
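+
+/* Example of the fill factor arithmetic above (a sketch with
+illustrative numbers): with srv_page_size = 16384 and
+innobase_fill_factor = 90, m_reserved_space = 16384 * (100 - 90) / 100
+= 1638 bytes, so roughly 10% of each bulk-loaded page is kept free for
+future updates. */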
+
+/** Insert a record in the page.
+@tparam fmt the page format
+@param[in,out] rec record
+@param[in] offsets record offsets */
+template<PageBulk::format fmt>
+inline void PageBulk::insertPage(rec_t *rec, rec_offs *offsets)
+{
+ ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED));
+ ut_ad((fmt != REDUNDANT) == m_is_comp);
+ ut_ad(page_align(m_heap_top) == m_page);
+ ut_ad(m_heap);
+
+ const ulint rec_size= rec_offs_size(offsets);
+ const ulint extra_size= rec_offs_extra_size(offsets);
+ ut_ad(page_align(m_heap_top + rec_size) == m_page);
+ ut_d(const bool is_leaf= page_rec_is_leaf(m_cur_rec));
+
+#ifdef UNIV_DEBUG
+ /* Check whether records are in order. */
+ if (page_offset(m_cur_rec) !=
+ (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM))
+ {
+ const rec_t *old_rec = m_cur_rec;
+ rec_offs *old_offsets= rec_get_offsets(old_rec, m_index, nullptr, is_leaf
+ ? m_index->n_core_fields : 0,
+ ULINT_UNDEFINED, &m_heap);
+ ut_ad(cmp_rec_rec(rec, old_rec, offsets, old_offsets, m_index) > 0);
+ }
+
+ m_total_data+= rec_size;
+#endif /* UNIV_DEBUG */
+
+ rec_t* const insert_rec= m_heap_top + extra_size;
+
+ /* Insert the record in the linked list. */
+ if (fmt != REDUNDANT)
+ {
+ const rec_t *next_rec= m_page +
+ page_offset(m_cur_rec + mach_read_from_2(m_cur_rec - REC_NEXT));
+ if (fmt != COMPRESSED)
+ m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT,
+ static_cast<uint16_t>(insert_rec - m_cur_rec));
+ else
+ {
+ mach_write_to_2(m_cur_rec - REC_NEXT,
+ static_cast<uint16_t>(insert_rec - m_cur_rec));
+ memcpy(m_heap_top, rec - extra_size, rec_size);
+ }
+
+ rec_t * const this_rec= fmt != COMPRESSED
+ ? const_cast<rec_t*>(rec) : insert_rec;
+ rec_set_bit_field_1(this_rec, 0, REC_NEW_N_OWNED, REC_N_OWNED_MASK,
+ REC_N_OWNED_SHIFT);
+ rec_set_bit_field_2(this_rec, PAGE_HEAP_NO_USER_LOW + m_rec_no,
+ REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ mach_write_to_2(this_rec - REC_NEXT,
+ static_cast<uint16_t>(next_rec - insert_rec));
+ }
+ else
+ {
+ memcpy(const_cast<rec_t*>(rec) - REC_NEXT, m_cur_rec - REC_NEXT, 2);
+ m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT, page_offset(insert_rec));
+ rec_set_bit_field_1(const_cast<rec_t*>(rec), 0,
+ REC_OLD_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ rec_set_bit_field_2(const_cast<rec_t*>(rec),
+ PAGE_HEAP_NO_USER_LOW + m_rec_no,
+ REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ }
+
+ if (fmt == COMPRESSED)
+ /* We already wrote the record. Log is written in PageBulk::compress(). */;
+ else if (page_offset(m_cur_rec) ==
+ (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM))
+ m_mtr.memcpy(*m_block, m_heap_top, rec - extra_size, rec_size);
+ else
+ {
+ /* Try to copy common prefix from the preceding record. */
+ const byte *r= rec - extra_size;
+ const byte * const insert_rec_end= m_heap_top + rec_size;
+ byte *b= m_heap_top;
+
+ /* Skip any unchanged prefix of the record. */
+ for (; * b == *r; b++, r++);
+
+ ut_ad(b < insert_rec_end);
+
+ const byte *c= m_cur_rec - (rec - r);
+ const byte * const c_end= std::min(m_cur_rec + rec_offs_data_size(offsets),
+ m_heap_top);
+
+ /* Try to copy any bytes of the preceding record. */
+ if (UNIV_LIKELY(c >= m_page && c < c_end))
+ {
+ const byte *cm= c;
+ byte *bm= b;
+ const byte *rm= r;
+ for (; cm < c_end && *rm == *cm; cm++, bm++, rm++);
+ ut_ad(bm <= insert_rec_end);
+ size_t len= static_cast<size_t>(rm - r);
+ ut_ad(!memcmp(r, c, len));
+ if (len > 2)
+ {
+ memcpy(b, c, len);
+ m_mtr.memmove(*m_block, page_offset(b), page_offset(c), len);
+ c= cm;
+ b= bm;
+ r= rm;
+ }
+ }
+
+ if (c < m_cur_rec)
+ {
+ if (!rec_offs_data_size(offsets))
+ {
+no_data:
+ m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c);
+ goto rec_done;
+ }
+ /* Some header bytes differ. Compare the data separately. */
+ const byte *cd= m_cur_rec;
+ byte *bd= insert_rec;
+ const byte *rd= rec;
+ /* Skip any unchanged prefix of the record. */
+ for (;; cd++, bd++, rd++)
+ if (bd == insert_rec_end)
+ goto no_data;
+ else if (*bd != *rd)
+ break;
+
+ /* Try to copy any data bytes of the preceding record. */
+ if (c_end - cd > 2)
+ {
+ const byte *cdm= cd;
+ const byte *rdm= rd;
+ for (; cdm < c_end && *rdm == *cdm; cdm++, rdm++)
+ ut_ad(rdm - rd + bd <= insert_rec_end);
+ size_t len= static_cast<size_t>(rdm - rd);
+ ut_ad(!memcmp(rd, cd, len));
+ if (len > 2)
+ {
+ m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c);
+ memcpy(bd, cd, len);
+ m_mtr.memmove(*m_block, page_offset(bd), page_offset(cd), len);
+ c= cdm;
+ b= rdm - rd + bd;
+ r= rdm;
+ }
+ }
+ }
+
+ if (size_t len= static_cast<size_t>(insert_rec_end - b))
+ m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, len);
+ }
+
+rec_done:
+ ut_ad(fmt == COMPRESSED || !memcmp(m_heap_top, rec - extra_size, rec_size));
+ rec_offs_make_valid(insert_rec, m_index, is_leaf, offsets);
+
+ /* Update the member variables. */
+ ulint slot_size= page_dir_calc_reserved_space(m_rec_no + 1) -
+ page_dir_calc_reserved_space(m_rec_no);
+
+ ut_ad(m_free_space >= rec_size + slot_size);
+ ut_ad(m_heap_top + rec_size < m_page + srv_page_size);
+
+ m_free_space-= rec_size + slot_size;
+ m_heap_top+= rec_size;
+ m_rec_no++;
+ m_cur_rec= insert_rec;
+}
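+
+/* The common-prefix logic above keeps the redo volume low: for sorted
+input, consecutive records often share a long prefix, and runs of
+matching bytes (longer than 2) are logged as a copy from the preceding
+record on the page rather than as literal writes. As a sketch,
+inserting the keys "abcde1" and then "abcde2" back to back would log
+the five shared data bytes as a memmove from the previous record and
+only the differing tail byte verbatim. */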
+
+/** Insert a record in the page.
+@param[in] rec record
+@param[in] offsets record offsets */
+inline void PageBulk::insert(const rec_t *rec, rec_offs *offsets)
+{
+ byte rec_hdr[REC_N_OLD_EXTRA_BYTES];
+ static_assert(REC_N_OLD_EXTRA_BYTES > REC_N_NEW_EXTRA_BYTES, "file format");
+
+ if (UNIV_LIKELY_NULL(m_page_zip))
+ insertPage<COMPRESSED>(const_cast<rec_t*>(rec), offsets);
+ else if (m_is_comp)
+ {
+ memcpy(rec_hdr, rec - REC_N_NEW_EXTRA_BYTES, REC_N_NEW_EXTRA_BYTES);
+ insertPage<DYNAMIC>(const_cast<rec_t*>(rec), offsets);
+ memcpy(const_cast<rec_t*>(rec) - REC_N_NEW_EXTRA_BYTES, rec_hdr,
+ REC_N_NEW_EXTRA_BYTES);
+ }
+ else
+ {
+ memcpy(rec_hdr, rec - REC_N_OLD_EXTRA_BYTES, REC_N_OLD_EXTRA_BYTES);
+ insertPage<REDUNDANT>(const_cast<rec_t*>(rec), offsets);
+ memcpy(const_cast<rec_t*>(rec) - REC_N_OLD_EXTRA_BYTES, rec_hdr,
+ REC_N_OLD_EXTRA_BYTES);
+ }
+}
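+
+/* The header save/restore above is needed because insertPage() for
+the uncompressed formats writes the new n_owned, heap number and
+next-record fields directly into the source record's extra bytes
+before copying it into the page; restoring the saved bytes leaves the
+caller's record (for example, one on the page being copied by
+copyIn()) unmodified. */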
+
+/** Set the number of owned records in the uncompressed page of
+a ROW_FORMAT=COMPRESSED record without redo-logging. */
+static void rec_set_n_owned_zip(rec_t *rec, ulint n_owned)
+{
+ rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+}
+
+/** Mark the end of inserts to the page. Scan all records to set the
+page directory, and set the page header members.
+@tparam fmt page format */
+template<PageBulk::format fmt>
+inline void PageBulk::finishPage()
+{
+ ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED));
+ ut_ad((fmt != REDUNDANT) == m_is_comp);
+
+ ulint count= 0;
+ ulint n_recs= 0;
+ byte *slot= my_assume_aligned<2>(m_page + srv_page_size -
+ (PAGE_DIR + PAGE_DIR_SLOT_SIZE));
+ const page_dir_slot_t *const slot0 = slot;
+ compile_time_assert(PAGE_DIR_SLOT_SIZE == 2);
+ if (fmt != REDUNDANT)
+ {
+ uint16_t offset= mach_read_from_2(PAGE_NEW_INFIMUM - REC_NEXT + m_page);
+ ut_ad(offset >= PAGE_NEW_SUPREMUM - PAGE_NEW_INFIMUM);
+ offset= static_cast<uint16_t>(offset + PAGE_NEW_INFIMUM);
+ /* Set owner & dir. */
+ while (offset != PAGE_NEW_SUPREMUM)
+ {
+ ut_ad(offset >= PAGE_NEW_SUPREMUM);
+ ut_ad(offset < page_offset(slot));
+ count++;
+ n_recs++;
+
+ if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)
+ {
+ slot-= PAGE_DIR_SLOT_SIZE;
+ mach_write_to_2(slot, offset);
+
+ if (fmt != COMPRESSED)
+ page_rec_set_n_owned<false>(m_block, m_page + offset, count, true,
+ &m_mtr);
+ else
+ rec_set_n_owned_zip(m_page + offset, count);
+
+ count= 0;
+ }
+
+ uint16_t next= static_cast<uint16_t>
+ ((mach_read_from_2(m_page + offset - REC_NEXT) + offset) &
+ (srv_page_size - 1));
+ ut_ad(next);
+ offset= next;
+ }
+
+ if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <=
+ PAGE_DIR_SLOT_MAX_N_OWNED))
+ {
+ /* Merge the last two slots, like page_cur_insert_rec_low() does. */
+ count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
+
+ rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot));
+ if (fmt != COMPRESSED)
+ page_rec_set_n_owned<false>(m_block, rec, 0, true, &m_mtr);
+ else
+ rec_set_n_owned_zip(rec, 0);
+ }
+ else
+ slot-= PAGE_DIR_SLOT_SIZE;
+
+ mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
+ if (fmt != COMPRESSED)
+ page_rec_set_n_owned<false>(m_block, m_page + PAGE_NEW_SUPREMUM,
+ count + 1, true, &m_mtr);
+ else
+ rec_set_n_owned_zip(m_page + PAGE_NEW_SUPREMUM, count + 1);
+ }
+ else
+ {
+ rec_t *insert_rec= m_page +
+ mach_read_from_2(PAGE_OLD_INFIMUM - REC_NEXT + m_page);
+
+ /* Set owner & dir. */
+ while (insert_rec != m_page + PAGE_OLD_SUPREMUM)
+ {
+ count++;
+ n_recs++;
+
+ if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)
+ {
+ slot-= PAGE_DIR_SLOT_SIZE;
+ mach_write_to_2(slot, page_offset(insert_rec));
+ page_rec_set_n_owned<false>(m_block, insert_rec, count, false, &m_mtr);
+ count= 0;
+ }
+
+ insert_rec= m_page + mach_read_from_2(insert_rec - REC_NEXT);
+ }
+
+ if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <=
+ PAGE_DIR_SLOT_MAX_N_OWNED))
+ {
+ /* Merge the last two slots, like page_cur_insert_rec_low() does. */
+ count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
+
+ rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot));
+ page_rec_set_n_owned<false>(m_block, rec, 0, false, &m_mtr);
+ }
+ else
+ slot-= PAGE_DIR_SLOT_SIZE;
+
+ mach_write_to_2(slot, PAGE_OLD_SUPREMUM);
+ page_rec_set_n_owned<false>(m_block, m_page + PAGE_OLD_SUPREMUM, count + 1,
+ false, &m_mtr);
+ }
+
+ if (!m_rec_no);
+ else if (fmt != COMPRESSED)
+ {
+ static_assert(PAGE_N_DIR_SLOTS == 0, "compatibility");
+ alignas(8) byte page_header[PAGE_N_HEAP + 2];
+ mach_write_to_2(page_header + PAGE_N_DIR_SLOTS,
+ 1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE);
+ mach_write_to_2(page_header + PAGE_HEAP_TOP, m_heap_top - m_page);
+ mach_write_to_2(page_header + PAGE_N_HEAP,
+ (PAGE_HEAP_NO_USER_LOW + m_rec_no) |
+ uint16_t{fmt != REDUNDANT} << 15);
+ m_mtr.memcpy(*m_block, PAGE_HEADER + m_page, page_header,
+ sizeof page_header);
+ m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no);
+ m_mtr.memcpy(*m_block, page_offset(slot), slot0 - slot);
+ }
+ else
+ {
+ /* For ROW_FORMAT=COMPRESSED, redo log may be written in
+ PageBulk::compress(). */
+ mach_write_to_2(PAGE_HEADER + PAGE_N_DIR_SLOTS + m_page,
+ 1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE);
+ mach_write_to_2(PAGE_HEADER + PAGE_HEAP_TOP + m_page,
+ static_cast<ulint>(m_heap_top - m_page));
+ mach_write_to_2(PAGE_HEADER + PAGE_N_HEAP + m_page,
+ (PAGE_HEAP_NO_USER_LOW + m_rec_no) | 1U << 15);
+ mach_write_to_2(PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no);
+ }
+}
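+
+/* Sketch of the slot ownership produced above (assuming the usual
+PAGE_DIR_SLOT_MAX_N_OWNED = 8): a new slot is written after every
+(8 + 1) / 2 = 4 user records. With 10 user records, slots are written
+after records 4 and 8, leaving count = 2; since 2 + 1 + 4 <= 8, the
+last two slots are merged and the supremum slot ends up owning
+2 + 4 + 1 = 7 records, counting the supremum itself. */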
+
+inline bool PageBulk::needs_finish() const
+{
+ ut_ad(page_align(m_cur_rec) == m_block->frame);
+ ut_ad(m_page == m_block->frame);
+ if (!m_page[PAGE_HEADER + PAGE_DIRECTION_B])
+ return true;
+ ulint heap_no, n_heap= page_header_get_field(m_page, PAGE_N_HEAP);
+ ut_ad((n_heap & 0x7fff) >= PAGE_HEAP_NO_USER_LOW);
+ if (n_heap & 0x8000)
+ {
+ n_heap&= 0x7fff;
+ heap_no= rec_get_heap_no_new(m_cur_rec);
+ if (heap_no == PAGE_HEAP_NO_INFIMUM &&
+ page_header_get_field(m_page, PAGE_HEAP_TOP) == PAGE_NEW_SUPREMUM_END)
+ return false;
+ }
+ else
+ {
+ heap_no= rec_get_heap_no_old(m_cur_rec);
+ if (heap_no == PAGE_HEAP_NO_INFIMUM &&
+ page_header_get_field(m_page, PAGE_HEAP_TOP) == PAGE_OLD_SUPREMUM_END)
+ return false;
+ }
+ return heap_no != n_heap - 1;
+}
+
+/** Mark the end of inserts to the page. Scan all records to set the
+page directory, and set the page header members. */
+inline void PageBulk::finish()
+{
+ ut_ad(!m_index->is_spatial());
+
+ if (!needs_finish());
+ else if (UNIV_LIKELY_NULL(m_page_zip))
+ finishPage<COMPRESSED>();
+ else if (m_is_comp)
+ finishPage<DYNAMIC>();
+ else
+ finishPage<REDUNDANT>();
+
+ /* In MariaDB 10.2, 10.3, 10.4, we would initialize
+ PAGE_DIRECTION_B, PAGE_N_DIRECTION, PAGE_LAST_INSERT
+ in the same way as we would during normal INSERT operations.
+ Starting with MariaDB Server 10.5, bulk insert will not
+ touch those fields. */
+ ut_ad(!m_page[PAGE_HEADER + PAGE_INSTANT]);
+ /* Restore the temporary change of PageBulk::init() that was necessary to
+ ensure that PageBulk::needs_finish() holds on an empty page. */
+ m_page[PAGE_HEADER + PAGE_DIRECTION_B]= PAGE_NO_DIRECTION;
+
+ ut_ad(!page_header_get_field(m_page, PAGE_FREE));
+ ut_ad(!page_header_get_field(m_page, PAGE_GARBAGE));
+ ut_ad(!page_header_get_field(m_page, PAGE_LAST_INSERT));
+ ut_ad(!page_header_get_field(m_page, PAGE_N_DIRECTION));
+ ut_ad(m_total_data + page_dir_calc_reserved_space(m_rec_no) <=
+ page_get_free_space_of_empty(m_is_comp));
+ ut_ad(!needs_finish());
+ ut_ad(page_validate(m_page, m_index));
+}
+
+/** Commit the inserts done to the page.
+@param[in] success whether all inserts succeeded */
+void PageBulk::commit(bool success)
+{
+ finish();
+ if (success && !dict_index_is_clust(m_index) && page_is_leaf(m_page))
+ ibuf_set_bitmap_for_bulk_load(m_block, innobase_fill_factor == 100);
+ m_mtr.commit();
+}
+
+/** Compress a page of a ROW_FORMAT=COMPRESSED table.
+@return true if compression succeeded or was not needed
+@return false if compression failed */
+bool
+PageBulk::compress()
+{
+ ut_ad(m_page_zip != NULL);
+
+ return page_zip_compress(m_block, m_index, page_zip_level, &m_mtr);
+}
+
+/** Get node pointer
+@return node pointer */
+dtuple_t*
+PageBulk::getNodePtr()
+{
+ rec_t* first_rec;
+ dtuple_t* node_ptr;
+
+ /* Create node pointer */
+ first_rec = page_rec_get_next(page_get_infimum_rec(m_page));
+ ut_a(page_rec_is_user_rec(first_rec));
+ node_ptr = dict_index_build_node_ptr(m_index, first_rec, m_page_no,
+ m_heap, m_level);
+
+ return(node_ptr);
+}
+
+/** Get the split rec in the left page. We split a page in half when
+compression fails, and the split rec will be copied to the right page.
+@return split rec */
+rec_t*
+PageBulk::getSplitRec()
+{
+ rec_t* rec;
+ rec_offs* offsets;
+ ulint total_used_size;
+ ulint total_recs_size;
+ ulint n_recs;
+
+ ut_ad(m_page_zip != NULL);
+ ut_ad(m_rec_no >= 2);
+ ut_ad(!m_index->is_instant());
+
+ ut_ad(page_get_free_space_of_empty(m_is_comp) > m_free_space);
+ total_used_size = page_get_free_space_of_empty(m_is_comp)
+ - m_free_space;
+
+ total_recs_size = 0;
+ n_recs = 0;
+ offsets = NULL;
+ rec = page_get_infimum_rec(m_page);
+ const ulint n_core = page_is_leaf(m_page) ? m_index->n_core_fields : 0;
+
+ do {
+ rec = page_rec_get_next(rec);
+ ut_ad(page_rec_is_user_rec(rec));
+
+ offsets = rec_get_offsets(rec, m_index, offsets, n_core,
+ ULINT_UNDEFINED, &m_heap);
+ total_recs_size += rec_offs_size(offsets);
+ n_recs++;
+ } while (total_recs_size + page_dir_calc_reserved_space(n_recs)
+ < total_used_size / 2);
+
+ /* Keep at least one record on left page */
+ if (page_rec_is_infimum(page_rec_get_prev(rec))) {
+ rec = page_rec_get_next(rec);
+ ut_ad(page_rec_is_user_rec(rec));
+ }
+
+ return(rec);
+}
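+
+/* A sketch of the split-point arithmetic above: if the page carries
+total_used_size = 1000 bytes of records plus directory reservation,
+the loop accumulates rec_offs_size() and the per-record directory
+space until the running total first reaches 500 bytes, and returns
+that record; the final check guarantees that at least one record
+stays on the left page. */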
+
+/** Copy all records after the split rec, including the split rec itself.
+@param[in] split_rec split rec */
+void
+PageBulk::copyIn(
+ rec_t* split_rec)
+{
+
+ rec_t* rec = split_rec;
+ rec_offs* offsets = NULL;
+
+ ut_ad(m_rec_no == 0);
+ ut_ad(page_rec_is_user_rec(rec));
+
+ const ulint n_core = page_rec_is_leaf(rec)
+ ? m_index->n_core_fields : 0;
+
+ do {
+ offsets = rec_get_offsets(rec, m_index, offsets, n_core,
+ ULINT_UNDEFINED, &m_heap);
+
+ insert(rec, offsets);
+
+ rec = page_rec_get_next(rec);
+ } while (!page_rec_is_supremum(rec));
+
+ ut_ad(m_rec_no > 0);
+}
+
+/** Remove all records after the split rec, including the split rec itself.
+@param[in] split_rec split rec */
+void
+PageBulk::copyOut(
+ rec_t* split_rec)
+{
+ rec_t* rec;
+ rec_t* last_rec;
+ ulint n;
+
+ /* Suppose that before copyOut, we have 5 records on the page:
+ infimum->r1->r2->r3->r4->r5->supremum, and r3 is the split rec.
+
+ After copyOut, we have 2 records on the page:
+ infimum->r1->r2->supremum. Slot adjustment is not done. */
+
+ rec = page_rec_get_next(page_get_infimum_rec(m_page));
+ last_rec = page_rec_get_prev(page_get_supremum_rec(m_page));
+ n = 0;
+
+ while (rec != split_rec) {
+ rec = page_rec_get_next(rec);
+ n++;
+ }
+
+ ut_ad(n > 0);
+
+ /* Set last record's next in page */
+ rec_offs* offsets = NULL;
+ rec = page_rec_get_prev(split_rec);
+ const ulint n_core = page_rec_is_leaf(split_rec)
+ ? m_index->n_core_fields : 0;
+
+ offsets = rec_get_offsets(rec, m_index, offsets, n_core,
+ ULINT_UNDEFINED, &m_heap);
+ mach_write_to_2(rec - REC_NEXT, m_is_comp
+ ? static_cast<uint16_t>
+ (PAGE_NEW_SUPREMUM - page_offset(rec))
+ : PAGE_OLD_SUPREMUM);
+
+ /* Set related members */
+ m_cur_rec = rec;
+ m_heap_top = rec_get_end(rec, offsets);
+
+ offsets = rec_get_offsets(last_rec, m_index, offsets, n_core,
+ ULINT_UNDEFINED, &m_heap);
+
+ m_free_space += ulint(rec_get_end(last_rec, offsets) - m_heap_top)
+ + page_dir_calc_reserved_space(m_rec_no)
+ - page_dir_calc_reserved_space(n);
+ ut_ad(lint(m_free_space) > 0);
+ m_rec_no = n;
+
+#ifdef UNIV_DEBUG
+ m_total_data -= ulint(rec_get_end(last_rec, offsets) - m_heap_top);
+#endif /* UNIV_DEBUG */
+}
+
+/** Set next page
+@param[in] next_page_no next page no */
+inline void PageBulk::setNext(ulint next_page_no)
+{
+ if (UNIV_LIKELY_NULL(m_page_zip))
+ /* For ROW_FORMAT=COMPRESSED, redo log may be written
+ in PageBulk::compress(). */
+ mach_write_to_4(m_page + FIL_PAGE_NEXT, next_page_no);
+ else
+ m_mtr.write<4>(*m_block, m_page + FIL_PAGE_NEXT, next_page_no);
+}
+
+/** Set previous page
+@param[in] prev_page_no previous page no */
+inline void PageBulk::setPrev(ulint prev_page_no)
+{
+ if (UNIV_LIKELY_NULL(m_page_zip))
+ /* For ROW_FORMAT=COMPRESSED, redo log may be written
+ in PageBulk::compress(). */
+ mach_write_to_4(m_page + FIL_PAGE_PREV, prev_page_no);
+ else
+ m_mtr.write<4>(*m_block, m_page + FIL_PAGE_PREV, prev_page_no);
+}
+
+/** Check if the required space is available in the page for the rec
+to be inserted. We check the fill factor and padding here.
+@param[in] rec_size required length
+@return true if space is available */
+bool
+PageBulk::isSpaceAvailable(
+ ulint rec_size)
+{
+ ulint slot_size;
+ ulint required_space;
+
+ slot_size = page_dir_calc_reserved_space(m_rec_no + 1)
+ - page_dir_calc_reserved_space(m_rec_no);
+
+ required_space = rec_size + slot_size;
+
+ if (required_space > m_free_space) {
+ ut_ad(m_rec_no > 0);
+ return false;
+ }
+
+ /* Fillfactor & Padding apply to both leaf and non-leaf pages.
+ Note: we keep at least 2 records in a page to avoid B-tree level
+ growing too high. */
+ if (m_rec_no >= 2
+ && ((m_page_zip == NULL && m_free_space - required_space
+ < m_reserved_space)
+ || (m_page_zip != NULL && m_free_space - required_space
+ < m_padding_space))) {
+ return(false);
+ }
+
+ return(true);
+}
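+
+/* Illustrative numbers for the checks above (a sketch): on an
+uncompressed 16KiB page with innobase_fill_factor = 90,
+m_reserved_space = 1638 bytes, so a 200-byte record is rejected once
+m_free_space - (200 + slot_size) would drop below 1638; in other
+words, the page is handed over to a sibling at roughly 90% full. The
+fill-factor check is skipped while the page has fewer than 2 records,
+so that every page keeps at least two records. */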
+
+/** Check whether the record needs to be stored externally.
+@return false if the entire record can be stored locally on the page */
+bool
+PageBulk::needExt(
+ const dtuple_t* tuple,
+ ulint rec_size)
+{
+ return page_zip_rec_needs_ext(rec_size, m_is_comp,
+ dtuple_get_n_fields(tuple),
+ m_block->zip_size());
+}
+
+/** Store an externally stored record.
+Since the record is not logged yet, we do not log the update to the record;
+the BLOB data is logged first, then the record is logged in bulk mode.
+@param[in] big_rec external record
+@param[in] offsets record offsets
+@return error code */
+dberr_t
+PageBulk::storeExt(
+ const big_rec_t* big_rec,
+ rec_offs* offsets)
+{
+ finish();
+
+ /* Note: not all fields are initialized in btr_pcur. */
+ btr_pcur_t btr_pcur;
+ btr_pcur.pos_state = BTR_PCUR_IS_POSITIONED;
+ btr_pcur.latch_mode = BTR_MODIFY_LEAF;
+ btr_pcur.btr_cur.index = m_index;
+ btr_pcur.btr_cur.page_cur.index = m_index;
+ btr_pcur.btr_cur.page_cur.rec = m_cur_rec;
+ btr_pcur.btr_cur.page_cur.offsets = offsets;
+ btr_pcur.btr_cur.page_cur.block = m_block;
+
+ dberr_t err = btr_store_big_rec_extern_fields(
+ &btr_pcur, offsets, big_rec, &m_mtr, BTR_STORE_INSERT_BULK);
+
+ /* Reset m_block and m_cur_rec from page cursor, because
+ block may be changed during blob insert. (FIXME: Can it really?) */
+ ut_ad(m_block == btr_pcur.btr_cur.page_cur.block);
+
+ m_block = btr_pcur.btr_cur.page_cur.block;
+ m_cur_rec = btr_pcur.btr_cur.page_cur.rec;
+ m_page = buf_block_get_frame(m_block);
+
+ return(err);
+}
+
+/** Release the block by committing the mtr.
+Note: log_free_check requires holding no lock/latch in the current thread. */
+void
+PageBulk::release()
+{
+ finish();
+
+ /* We fix the block because we will re-pin it soon. */
+ buf_block_buf_fix_inc(m_block, __FILE__, __LINE__);
+
+ /* No other threads can modify this block. */
+ m_modify_clock = buf_block_get_modify_clock(m_block);
+
+ m_mtr.commit();
+}
+
+/** Start mtr and latch the block */
+dberr_t
+PageBulk::latch()
+{
+ m_mtr.start();
+ m_index->set_modified(m_mtr);
+
+ ut_ad(m_block->page.buf_fix_count());
+
+ /* In case the block is S-latched by page_cleaner. */
+ if (!buf_page_optimistic_get(RW_X_LATCH, m_block, m_modify_clock,
+ __FILE__, __LINE__, &m_mtr)) {
+ m_block = buf_page_get_gen(page_id_t(m_index->table->space_id,
+ m_page_no),
+ 0, RW_X_LATCH,
+ m_block, BUF_GET_IF_IN_POOL,
+ __FILE__, __LINE__, &m_mtr, &m_err);
+
+ if (m_err != DB_SUCCESS) {
+ return (m_err);
+ }
+
+ ut_ad(m_block != NULL);
+ }
+
+ buf_block_buf_fix_dec(m_block);
+
+ ut_ad(m_block->page.buf_fix_count());
+
+ ut_ad(m_cur_rec > m_page && m_cur_rec < m_heap_top);
+
+ return (m_err);
+}
+
+/** Split a page
+@param[in] page_bulk page to split
+@param[in] next_page_bulk next page
+@return error code */
+dberr_t
+BtrBulk::pageSplit(
+ PageBulk* page_bulk,
+ PageBulk* next_page_bulk)
+{
+ ut_ad(page_bulk->getPageZip() != NULL);
+
+ if (page_bulk->getRecNo() <= 1) {
+ return(DB_TOO_BIG_RECORD);
+ }
+
+ /* Initialize a new page */
+ PageBulk new_page_bulk(m_index, m_trx->id, FIL_NULL,
+ page_bulk->getLevel());
+ dberr_t err = new_page_bulk.init();
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Copy the upper half to the new page. */
+ rec_t* split_rec = page_bulk->getSplitRec();
+ new_page_bulk.copyIn(split_rec);
+ page_bulk->copyOut(split_rec);
+
+ /* Commit the pages after split. */
+ err = pageCommit(page_bulk, &new_page_bulk, true);
+ if (err != DB_SUCCESS) {
+ pageAbort(&new_page_bulk);
+ return(err);
+ }
+
+ err = pageCommit(&new_page_bulk, next_page_bulk, true);
+ if (err != DB_SUCCESS) {
+ pageAbort(&new_page_bulk);
+ return(err);
+ }
+
+ return(err);
+}
+
+/** Commit (finish) a page. We set the next/prev page no, compress a page
+of a compressed table and split the page if compression fails, insert a
+node pointer to the father page if needed, and commit the mini-transaction.
+@param[in] page_bulk page to commit
+@param[in] next_page_bulk next page
+@param[in] insert_father false when page_bulk is a root page and
+ true when it's a non-root page
+@return error code */
+dberr_t
+BtrBulk::pageCommit(
+ PageBulk* page_bulk,
+ PageBulk* next_page_bulk,
+ bool insert_father)
+{
+ page_bulk->finish();
+
+ /* Set page links */
+ if (next_page_bulk != NULL) {
+ ut_ad(page_bulk->getLevel() == next_page_bulk->getLevel());
+
+ page_bulk->setNext(next_page_bulk->getPageNo());
+ next_page_bulk->setPrev(page_bulk->getPageNo());
+ } else {
+ ut_ad(!page_has_next(page_bulk->getPage()));
+ /* If a page is released and latched again, we need to
+ mark it modified in mini-transaction. */
+ page_bulk->set_modified();
+ }
+
+ ut_ad(!rw_lock_own_flagged(&m_index->lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX
+ | RW_LOCK_FLAG_S));
+
+ /* Compress page if it's a compressed table. */
+ if (page_bulk->getPageZip() != NULL && !page_bulk->compress()) {
+ return(pageSplit(page_bulk, next_page_bulk));
+ }
+
+ /* Insert node pointer to father page. */
+ if (insert_father) {
+ dtuple_t* node_ptr = page_bulk->getNodePtr();
+ dberr_t err = insert(node_ptr, page_bulk->getLevel()+1);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ /* Commit mtr. */
+ page_bulk->commit(true);
+
+ return(DB_SUCCESS);
+}
+
+/** Log free check */
+inline void BtrBulk::logFreeCheck()
+{
+ if (log_sys.check_flush_or_checkpoint()) {
+ release();
+
+ log_check_margins();
+
+ latch();
+ }
+}
+
+/** Release all latches */
+void
+BtrBulk::release()
+{
+ ut_ad(m_root_level + 1 == m_page_bulks.size());
+
+ for (ulint level = 0; level <= m_root_level; level++) {
+ PageBulk* page_bulk = m_page_bulks.at(level);
+
+ page_bulk->release();
+ }
+}
+
+/** Re-latch all latches */
+void
+BtrBulk::latch()
+{
+ ut_ad(m_root_level + 1 == m_page_bulks.size());
+
+ for (ulint level = 0; level <= m_root_level; level++) {
+ PageBulk* page_bulk = m_page_bulks.at(level);
+ page_bulk->latch();
+ }
+}
+
+/** Insert a tuple into a page at the given level.
+@param[in] tuple tuple to insert
+@param[in] level B-tree level
+@return error code */
+dberr_t
+BtrBulk::insert(
+ dtuple_t* tuple,
+ ulint level)
+{
+ bool is_left_most = false;
+ dberr_t err = DB_SUCCESS;
+
+ /* Check if we need to create a PageBulk for the level. */
+ if (level + 1 > m_page_bulks.size()) {
+ PageBulk* new_page_bulk
+ = UT_NEW_NOKEY(PageBulk(m_index, m_trx->id, FIL_NULL,
+ level));
+ err = new_page_bulk->init();
+ if (err != DB_SUCCESS) {
+ UT_DELETE(new_page_bulk);
+ return(err);
+ }
+
+ m_page_bulks.push_back(new_page_bulk);
+ ut_ad(level + 1 == m_page_bulks.size());
+ m_root_level = level;
+
+ is_left_most = true;
+ }
+
+ ut_ad(m_page_bulks.size() > level);
+
+ PageBulk* page_bulk = m_page_bulks.at(level);
+
+ if (is_left_most && level > 0 && page_bulk->getRecNo() == 0) {
+ /* The node pointer must be marked as the predefined minimum
+ record, as there is no lower alphabetical limit to records in
+ the leftmost node of a level: */
+ dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple)
+ | REC_INFO_MIN_REC_FLAG);
+ }
+
+ ulint n_ext = 0;
+ ulint rec_size = rec_get_converted_size(m_index, tuple, n_ext);
+ big_rec_t* big_rec = NULL;
+ rec_t* rec = NULL;
+ rec_offs* offsets = NULL;
+
+ if (page_bulk->needExt(tuple, rec_size)) {
+ /* The record is so big that we have to store some fields
+ externally on separate database pages */
+ big_rec = dtuple_convert_big_rec(m_index, 0, tuple, &n_ext);
+
+ if (big_rec == NULL) {
+ return(DB_TOO_BIG_RECORD);
+ }
+
+ rec_size = rec_get_converted_size(m_index, tuple, n_ext);
+ }
+
+ if (page_bulk->getPageZip() != NULL
+ && page_zip_is_too_big(m_index, tuple)) {
+ err = DB_TOO_BIG_RECORD;
+ goto func_exit;
+ }
+
+ if (!page_bulk->isSpaceAvailable(rec_size)) {
+ /* Create a sibling page_bulk. */
+ PageBulk* sibling_page_bulk;
+ sibling_page_bulk = UT_NEW_NOKEY(PageBulk(m_index, m_trx->id,
+ FIL_NULL, level));
+ err = sibling_page_bulk->init();
+ if (err != DB_SUCCESS) {
+ UT_DELETE(sibling_page_bulk);
+ goto func_exit;
+ }
+
+ /* Commit page bulk. */
+ err = pageCommit(page_bulk, sibling_page_bulk, true);
+ if (err != DB_SUCCESS) {
+ pageAbort(sibling_page_bulk);
+ UT_DELETE(sibling_page_bulk);
+ goto func_exit;
+ }
+
+ /* Set new page bulk to page_bulks. */
+ ut_ad(sibling_page_bulk->getLevel() <= m_root_level);
+ m_page_bulks.at(level) = sibling_page_bulk;
+
+ UT_DELETE(page_bulk);
+ page_bulk = sibling_page_bulk;
+
+ /* Important: log_free_check whether we need a checkpoint. */
+ if (page_is_leaf(sibling_page_bulk->getPage())) {
+ if (trx_is_interrupted(m_trx)) {
+ err = DB_INTERRUPTED;
+ goto func_exit;
+ }
+
+ srv_inc_activity_count();
+ logFreeCheck();
+ }
+ }
+
+ /* Convert tuple to rec. */
+ rec = rec_convert_dtuple_to_rec(static_cast<byte*>(mem_heap_alloc(
+ page_bulk->m_heap, rec_size)), m_index, tuple, n_ext);
+ offsets = rec_get_offsets(rec, m_index, offsets, level
+ ? 0 : m_index->n_core_fields,
+ ULINT_UNDEFINED, &page_bulk->m_heap);
+
+ page_bulk->insert(rec, offsets);
+
+ if (big_rec != NULL) {
+ ut_ad(dict_index_is_clust(m_index));
+ ut_ad(page_bulk->getLevel() == 0);
+ ut_ad(page_bulk == m_page_bulks.at(0));
+
+ /* Release all pages above the leaf level */
+ for (ulint level = 1; level <= m_root_level; level++) {
+ m_page_bulks.at(level)->release();
+ }
+
+ err = page_bulk->storeExt(big_rec, offsets);
+
+ /* Latch */
+ for (ulint level = 1; level <= m_root_level; level++) {
+ PageBulk* page_bulk = m_page_bulks.at(level);
+ page_bulk->latch();
+ }
+ }
+
+func_exit:
+ if (big_rec != NULL) {
+ dtuple_convert_back_big_rec(m_index, tuple, big_rec);
+ }
+
+ return(err);
+}
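+
+/* Usage sketch for the BtrBulk interface above (hypothetical driver
+code; next_sorted_tuple() is a placeholder, and a BtrBulk(index, trx)
+constructor as declared in btr0bulk.h is assumed; the level argument 0
+is shown explicitly, although the class may expose a leaf-level
+wrapper):
+
+	BtrBulk bulk(index, trx);
+	dberr_t err = DB_SUCCESS;
+	while (dtuple_t* tuple = next_sorted_tuple()) {
+		err = bulk.insert(tuple, 0);
+		if (err != DB_SUCCESS) {
+			break;
+		}
+	}
+	err = bulk.finish(err);
+
+Tuples must arrive in key order, because insert() only ever appends to
+the rightmost page of each level. */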
+
+/** Finish the B-tree bulk load. We commit the last page in each level
+and copy the last page in the top level to the root page of the index
+if no error occurs.
+@param[in] err error status of the bulk load so far
+@return error code */
+dberr_t
+BtrBulk::finish(dberr_t err)
+{
+ uint32_t last_page_no = FIL_NULL;
+
+ ut_ad(!m_index->table->is_temporary());
+
+ if (m_page_bulks.size() == 0) {
+ /* The table is empty. The root page of the index tree
+ is already in a consistent state. No need to flush. */
+ return(err);
+ }
+
+ ut_ad(m_root_level + 1 == m_page_bulks.size());
+
+ /* Finish all page bulks */
+ for (ulint level = 0; level <= m_root_level; level++) {
+ PageBulk* page_bulk = m_page_bulks.at(level);
+
+ last_page_no = page_bulk->getPageNo();
+
+ if (err == DB_SUCCESS) {
+ err = pageCommit(page_bulk, NULL,
+ level != m_root_level);
+ }
+
+ if (err != DB_SUCCESS) {
+ pageAbort(page_bulk);
+ }
+
+ UT_DELETE(page_bulk);
+ }
+
+ if (err == DB_SUCCESS) {
+ rec_t* first_rec;
+ mtr_t mtr;
+ buf_block_t* last_block;
+ PageBulk root_page_bulk(m_index, m_trx->id,
+ m_index->page, m_root_level);
+
+ mtr.start();
+ m_index->set_modified(mtr);
+ mtr_x_lock_index(m_index, &mtr);
+
+ ut_ad(last_page_no != FIL_NULL);
+ last_block = btr_block_get(*m_index, last_page_no, RW_X_LATCH,
+ false, &mtr);
+ first_rec = page_rec_get_next(
+ page_get_infimum_rec(last_block->frame));
+ ut_ad(page_rec_is_user_rec(first_rec));
+
+ /* Copy last page to root page. */
+ err = root_page_bulk.init();
+ if (err != DB_SUCCESS) {
+ mtr.commit();
+ return(err);
+ }
+ root_page_bulk.copyIn(first_rec);
+ root_page_bulk.finish();
+
+ /* Remove last page. */
+ btr_page_free(m_index, last_block, &mtr);
+
+ mtr.commit();
+
+ err = pageCommit(&root_page_bulk, NULL, false);
+ ut_ad(err == DB_SUCCESS);
+ }
+
+ ut_ad(!sync_check_iterate(dict_sync_check()));
+
+ ut_ad(err != DB_SUCCESS
+ || btr_validate_index(m_index, NULL) == DB_SUCCESS);
+ return(err);
+}
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
new file mode 100644
index 00000000..5bb2a0e2
--- /dev/null
+++ b/storage/innobase/btr/btr0cur.cc
@@ -0,0 +1,8279 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0cur.cc
+The index tree cursor
+
+All changes that row operations make to a B-tree or the records
+there must go through this module! Undo log records are written here
+of every modify or insert of a clustered index record.
+
+ NOTE!!!
+To make sure we do not run out of disk space during a pessimistic
+insert or update, we have to reserve twice as many pages as the height
+of the index tree in the tablespace before we start the operation,
+because once leaf splitting has been started, it is difficult to undo,
+except by crashing the database and doing a roll-forward.
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0cur.h"
+#include "row0upd.h"
+#include "mtr0log.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "rem0rec.h"
+#include "rem0cmp.h"
+#include "buf0lru.h"
+#include "btr0btr.h"
+#include "btr0sea.h"
+#include "row0log.h"
+#include "row0purge.h"
+#include "row0upd.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "zlib.h"
+#include "srv0start.h"
+#include "mysql_com.h"
+#include "dict0stats.h"
+#ifdef WITH_WSREP
+#include "mysql/service_wsrep.h"
+#endif /* WITH_WSREP */
+
+/** Buffered B-tree operation types, introduced as part of delete buffering. */
+enum btr_op_t {
+ BTR_NO_OP = 0, /*!< Not buffered */
+ BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */
+ BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */
+ BTR_DELETE_OP, /*!< Purge a delete-marked record */
+ BTR_DELMARK_OP /*!< Mark a record for deletion */
+};
+
+/** Modification types for the B-tree operation.
+ Note that the order must be DELETE, BOTH, INSERT !!
+ */
+enum btr_intention_t {
+ BTR_INTENTION_DELETE,
+ BTR_INTENTION_BOTH,
+ BTR_INTENTION_INSERT
+};
+
+/** For the index->lock scalability improvement, the only clear performance
+regression observed was caused by the history list growing huge. That is
+because the exclusive use of index->lock also had the side effect of
+reserving free blocks and read I/O bandwidth with priority. To keep the
+history list from growing as huge as under the previous implementation,
+pessimistic tree operations by purge are prioritized, as before, whenever
+the list seems to be growing too large.
+
+ Experimentally, the history list length starts to clearly affect
+performance throughput from about 100000. */
+#define BTR_CUR_FINE_HISTORY_LENGTH 100000
+
+/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
+Atomic_counter<ulint> btr_cur_n_non_sea;
+/** Old value of btr_cur_n_non_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+ulint btr_cur_n_non_sea_old;
+#ifdef BTR_CUR_HASH_ADAPT
+/** Number of successful adaptive hash index lookups in
+btr_cur_search_to_nth_level(). */
+ulint btr_cur_n_sea;
+/** Old value of btr_cur_n_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+ulint btr_cur_n_sea_old;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#ifdef UNIV_DEBUG
+/* Flag to limit optimistic insert records */
+uint btr_cur_limit_optimistic_insert_debug;
+#endif /* UNIV_DEBUG */
+
+/** In the optimistic insert, if the insert does not fit, but this much space
+can be released by page reorganize, then it is reorganized */
+#define BTR_CUR_PAGE_REORGANIZE_LIMIT (srv_page_size / 32)
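+
+/* With the default srv_page_size of 16384 bytes, this limit works out
+to 16384 / 32 = 512 bytes. */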
+
+/** The structure of a BLOB part header */
+/* @{ */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this
+ page */
+#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no,
+ FIL_NULL if none */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB
+ part header, in bytes */
+
+/** Estimate table-level stats from a sampled value.
+@param value sampled stats
+@param index index being sampled
+@param sample number of sampled rows
+@param ext_size externally stored data size
+@param not_empty table not empty
+@return estimated table-wide stats from the sampled value */
+#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
+ (((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
+ + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
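+
+/* A worked example of the estimate above (a sketch with made-up
+numbers): sampling sample = 20 leaf pages of an index with
+stat_n_leaf_pages = 1000 and observing value = 400 distinct keys, with
+ext_size = 0 and not_empty = 1, gives
+(400 * 1000 + 20 - 1 + 0 + 1) / (20 + 0) = 20001 estimated keys. */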
+
+/* @} */
+
+/*******************************************************************//**
+Marks all extern fields in a record as owned by the record. This function
+should be called when the delete mark of a record is removed: a record
+that is not delete-marked always owns all its extern fields. */
+static
+void
+btr_cur_unmark_extern_fields(
+/*=========================*/
+ buf_block_t* block, /*!< in/out: index page */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ mtr_t* mtr); /*!< in: mtr, or NULL if not logged */
+/*******************************************************************//**
+Adds path information to the cursor for the current page, for which
+the binary search has been performed. */
+static
+void
+btr_cur_add_path_info(
+/*==================*/
+ btr_cur_t* cursor, /*!< in: cursor positioned on a page */
+ ulint height, /*!< in: height of the page in tree;
+ 0 means leaf node */
+ ulint root_height); /*!< in: root node height in tree */
+/***********************************************************//**
+Frees the externally stored fields for a record, if the field is mentioned
+in the update vector. */
+static
+void
+btr_rec_free_updated_extern_fields(
+/*===============================*/
+ dict_index_t* index, /*!< in: index of rec; the index tree MUST be
+ X-latched */
+ rec_t* rec, /*!< in: record */
+ buf_block_t* block, /*!< in: index page of rec */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector */
+ bool rollback,/*!< in: performing rollback? */
+ mtr_t* mtr); /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the tree */
+/***********************************************************//**
+Frees the externally stored fields for a record. */
+static
+void
+btr_rec_free_externally_stored_fields(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched */
+ rec_t* rec, /*!< in: record */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ buf_block_t* block, /*!< in: index page of rec */
+ bool rollback,/*!< in: performing rollback? */
+ mtr_t* mtr); /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the index
+ tree */
+
+/*==================== B-TREE SEARCH =========================*/
+
+/** Latches the leaf page or pages requested.
+@param[in] block leaf page where the search converged
+@param[in] latch_mode BTR_SEARCH_LEAF, ...
+@param[in] cursor cursor
+@param[in] mtr mini-transaction
+@return the blocks and savepoints which were actually latched. */
+btr_latch_leaves_t
+btr_cur_latch_leaves(
+ buf_block_t* block,
+ ulint latch_mode,
+ btr_cur_t* cursor,
+ mtr_t* mtr)
+{
+ rw_lock_type_t mode;
+ uint32_t left_page_no;
+ uint32_t right_page_no;
+ buf_block_t* get_block;
+ bool spatial;
+ btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
+
+ compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH));
+ compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH));
+ compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH));
+ ut_ad(block->page.id().space() == cursor->index->table->space->id);
+
+ spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info;
+ ut_ad(block->page.in_file());
+
+ switch (latch_mode) {
+ case BTR_SEARCH_LEAF:
+ case BTR_MODIFY_LEAF:
+ case BTR_SEARCH_TREE:
+ if (spatial) {
+ cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS]
+ = mtr_set_savepoint(mtr);
+ }
+
+ mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
+ latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
+ get_block = btr_block_get(*cursor->index,
+ block->page.id().page_no(), mode,
+ true, mtr);
+ latch_leaves.blocks[1] = get_block;
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame)
+ == page_is_comp(block->frame));
+#endif /* UNIV_BTR_DEBUG */
+ if (spatial) {
+ cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
+ = get_block;
+ }
+
+ return(latch_leaves);
+ case BTR_MODIFY_TREE:
+		/* The latch is exclusive with respect to other
+		operations that call btr_page_set_prev() */
+ ut_ad(mtr->memo_contains_flagged(&cursor->index->lock,
+ MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ /* x-latch also siblings from left to right */
+ left_page_no = btr_page_get_prev(block->frame);
+
+ if (left_page_no != FIL_NULL) {
+
+ if (spatial) {
+ cursor->rtr_info->tree_savepoints[
+ RTR_MAX_LEVELS] = mtr_set_savepoint(mtr);
+ }
+
+ latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
+ get_block = btr_block_get(
+ *cursor->index, left_page_no, RW_X_LATCH,
+ true, mtr);
+ latch_leaves.blocks[0] = get_block;
+
+ if (spatial) {
+ cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
+ = get_block;
+ }
+ }
+
+ if (spatial) {
+ cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1]
+ = mtr_set_savepoint(mtr);
+ }
+
+ latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
+ get_block = btr_block_get(
+ *cursor->index, block->page.id().page_no(),
+ RW_X_LATCH, true, mtr);
+ latch_leaves.blocks[1] = get_block;
+
+#ifdef UNIV_BTR_DEBUG
+ /* Sanity check only after both the blocks are latched. */
+ if (latch_leaves.blocks[0] != NULL) {
+ ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
+ == page_is_comp(block->frame));
+ ut_a(btr_page_get_next(latch_leaves.blocks[0]->frame)
+ == block->page.id().page_no());
+ }
+ ut_a(page_is_comp(get_block->frame)
+ == page_is_comp(block->frame));
+#endif /* UNIV_BTR_DEBUG */
+
+ if (spatial) {
+ cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1]
+ = get_block;
+ }
+
+ right_page_no = btr_page_get_next(block->frame);
+
+ if (right_page_no != FIL_NULL) {
+ if (spatial) {
+ cursor->rtr_info->tree_savepoints[
+ RTR_MAX_LEVELS + 2] = mtr_set_savepoint(
+ mtr);
+ }
+ latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
+ get_block = btr_block_get(*cursor->index,
+ right_page_no, RW_X_LATCH,
+ true, mtr);
+ latch_leaves.blocks[2] = get_block;
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame)
+ == page_is_comp(block->frame));
+ ut_a(btr_page_get_prev(get_block->frame)
+ == block->page.id().page_no());
+#endif /* UNIV_BTR_DEBUG */
+ if (spatial) {
+ cursor->rtr_info->tree_blocks[
+ RTR_MAX_LEVELS + 2] = get_block;
+ }
+ }
+
+ return(latch_leaves);
+
+ case BTR_SEARCH_PREV:
+ case BTR_MODIFY_PREV:
+ mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
+ /* latch also left sibling */
+ rw_lock_s_lock(&block->lock);
+ left_page_no = btr_page_get_prev(block->frame);
+ rw_lock_s_unlock(&block->lock);
+
+ if (left_page_no != FIL_NULL) {
+ latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
+ get_block = btr_block_get(
+ *cursor->index, left_page_no, mode,
+ true, mtr);
+ latch_leaves.blocks[0] = get_block;
+ cursor->left_block = get_block;
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame)
+ == page_is_comp(block->frame));
+ ut_a(btr_page_get_next(get_block->frame)
+ == block->page.id().page_no());
+#endif /* UNIV_BTR_DEBUG */
+ }
+
+ latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
+ get_block = btr_block_get(*cursor->index,
+ block->page.id().page_no(), mode,
+ true, mtr);
+ latch_leaves.blocks[1] = get_block;
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame)
+ == page_is_comp(block->frame));
+#endif /* UNIV_BTR_DEBUG */
+ return(latch_leaves);
+ case BTR_CONT_MODIFY_TREE:
+ ut_ad(dict_index_is_spatial(cursor->index));
+ return(latch_leaves);
+ }
+
+ ut_error;
+ return(latch_leaves);
+}
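+
+/* A hedged usage sketch (illustrative only, not taken from an actual
+caller): code that wants to unlatch the leaves early can combine the
+returned blocks with the returned savepoints, e.g.
+
+	btr_latch_leaves_t latch_leaves
+		= btr_cur_latch_leaves(block, BTR_MODIFY_TREE, cursor, mtr);
+	for (ulint i = 0; i < 3; i++) {
+		if (latch_leaves.blocks[i]) {
+			mtr_release_block_at_savepoint(
+				mtr, latch_leaves.savepoints[i],
+				latch_leaves.blocks[i]);
+		}
+	}
+
+Slots 0, 1, 2 hold the left sibling, the page itself, and the right
+sibling respectively; unused slots remain NULL. */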
+
+/** Load the instant ALTER TABLE metadata from the clustered index
+when loading a table definition.
+@param[in,out] index clustered index definition
+@param[in,out] mtr mini-transaction
+@return error code
+@retval DB_SUCCESS if no error occurred
+@retval DB_CORRUPTION if any corruption was noticed */
+static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
+{
+ ut_ad(index->is_primary());
+ ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES);
+ ut_ad(index->table->supports_instant());
+ ut_ad(index->table->is_readable());
+
+ const fil_space_t* space = index->table->space;
+ if (!space) {
+unreadable:
+ ib::error() << "Table " << index->table->name
+ << " has an unreadable root page";
+ index->table->corrupted = true;
+ return DB_CORRUPTION;
+ }
+
+ page_t* root = btr_root_get(index, mtr);
+
+ if (!root || btr_cur_instant_root_init(index, root)) {
+ goto unreadable;
+ }
+
+ ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES);
+
+ if (fil_page_get_type(root) == FIL_PAGE_INDEX) {
+ ut_ad(!index->is_instant());
+ return DB_SUCCESS;
+ }
+
+ btr_cur_t cur;
+ /* Relax the assertion in rec_init_offsets(). */
+ ut_ad(!index->in_instant_init);
+ ut_d(index->in_instant_init = true);
+ dberr_t err = btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF,
+ &cur, 0, mtr);
+ ut_d(index->in_instant_init = false);
+ if (err != DB_SUCCESS) {
+ index->table->corrupted = true;
+ return err;
+ }
+
+ ut_ad(page_cur_is_before_first(&cur.page_cur));
+ ut_ad(page_is_leaf(cur.page_cur.block->frame));
+
+ page_cur_move_to_next(&cur.page_cur);
+
+ const rec_t* rec = cur.page_cur.rec;
+ const ulint comp = dict_table_is_comp(index->table);
+ const ulint info_bits = rec_get_info_bits(rec, comp);
+
+ if (page_rec_is_supremum(rec)
+ || !(info_bits & REC_INFO_MIN_REC_FLAG)) {
+ if (!index->is_instant()) {
+ /* The FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be
+ assigned even if instant ADD COLUMN was not
+ committed. Changes to these page header fields are not
+ undo-logged, but changes to the hidden metadata record
+ are. If the server is killed and restarted, the page
+ header fields could remain set even though no metadata
+ record is present. */
+ return DB_SUCCESS;
+ }
+
+ ib::error() << "Table " << index->table->name
+ << " is missing instant ALTER metadata";
+ index->table->corrupted = true;
+ return DB_CORRUPTION;
+ }
+
+ if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG
+ || (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) {
+incompatible:
+ ib::error() << "Table " << index->table->name
+ << " contains unrecognizable instant ALTER metadata";
+ index->table->corrupted = true;
+ return DB_CORRUPTION;
+ }
+
+ /* Read the metadata. We can get here on server restart
+ or when the table was evicted from the data dictionary cache
+ and is now being accessed again.
+
+ Here, READ COMMITTED and REPEATABLE READ should be equivalent.
+ Committing the ADD COLUMN operation would acquire
+ MDL_EXCLUSIVE and LOCK_X|LOCK_TABLE, which would prevent any
+ concurrent operations on the table, including table eviction
+ from the cache. */
+
+ if (info_bits & REC_INFO_DELETED_FLAG) {
+ /* This metadata record includes a BLOB that identifies
+ any dropped or reordered columns. */
+ ulint trx_id_offset = index->trx_id_offset;
+ /* If !index->trx_id_offset, the PRIMARY KEY contains
+ variable-length columns. For the metadata record,
+ variable-length columns should be written with zero
+ length. However, before MDEV-21088 was fixed, for
+ variable-length encoded PRIMARY KEY column of type
+ CHAR, we wrote more than zero bytes. That is why we
+ must determine the actual length of each PRIMARY KEY
+ column. The DB_TRX_ID will start right after any
+ PRIMARY KEY columns. */
+ ut_ad(index->n_uniq);
+
+ /* We cannot invoke rec_get_offsets() before
+ index->table->deserialise_columns(). Therefore,
+ we must duplicate some logic here. */
+ if (trx_id_offset) {
+ } else if (index->table->not_redundant()) {
+ /* The PRIMARY KEY contains variable-length columns.
+ For the metadata record, variable-length columns are
+ always written with zero length. The DB_TRX_ID will
+ start right after any fixed-length columns. */
+
+ /* OK, before MDEV-21088 was fixed, for
+ variable-length encoded PRIMARY KEY column of
+ type CHAR, we wrote more than zero bytes. In
+ order to allow affected tables to be accessed,
+ it would be nice to determine the actual
+ length of each PRIMARY KEY column. However, to
+ be able to do that, we should determine the
+ size of the null-bit bitmap in the metadata
+ record. And we cannot know that before reading
+ the metadata BLOB, whose starting point we are
+ trying to find here. (Although the PRIMARY KEY
+ columns cannot be NULL, we would have to know
+ where the lengths of variable-length PRIMARY KEY
+ columns start.)
+
+ So, unfortunately we cannot help users who
+ were affected by MDEV-21088 on a ROW_FORMAT=COMPACT
+ or ROW_FORMAT=DYNAMIC table. */
+
+ for (uint i = index->n_uniq; i--; ) {
+ trx_id_offset += index->fields[i].fixed_len;
+ }
+ } else if (rec_get_1byte_offs_flag(rec)) {
+ trx_id_offset = rec_1_get_field_end_info(
+ rec, index->n_uniq - 1);
+ ut_ad(!(trx_id_offset & REC_1BYTE_SQL_NULL_MASK));
+ trx_id_offset &= ~REC_1BYTE_SQL_NULL_MASK;
+ } else {
+ trx_id_offset = rec_2_get_field_end_info(
+ rec, index->n_uniq - 1);
+ ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK));
+ trx_id_offset &= ~REC_2BYTE_SQL_NULL_MASK;
+ }
+
+ const byte* ptr = rec + trx_id_offset
+ + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) {
+ goto incompatible;
+ }
+
+ uint len = mach_read_from_4(ptr + BTR_EXTERN_LEN + 4);
+ if (!len
+ || mach_read_from_4(ptr + BTR_EXTERN_OFFSET)
+ != FIL_PAGE_DATA
+ || mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
+ != space->id) {
+ goto incompatible;
+ }
+
+ buf_block_t* block = buf_page_get(
+ page_id_t(space->id,
+ mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
+ 0, RW_S_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
+ if (fil_page_get_type(block->frame) != FIL_PAGE_TYPE_BLOB
+ || mach_read_from_4(&block->frame[FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO])
+ != FIL_NULL
+ || mach_read_from_4(&block->frame[FIL_PAGE_DATA
+ + BTR_BLOB_HDR_PART_LEN])
+ != len) {
+ goto incompatible;
+ }
+
+ /* The unused part of the BLOB page should be zero-filled. */
+ for (const byte* b = block->frame
+ + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len,
+ * const end = block->frame + srv_page_size
+ - BTR_EXTERN_LEN;
+ b < end; ) {
+ if (*b++) {
+ goto incompatible;
+ }
+ }
+
+ if (index->table->deserialise_columns(
+ &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE],
+ len)) {
+ goto incompatible;
+ }
+
+ /* Proceed to initialize the default values of
+ any instantly added columns. */
+ }
+
+ mem_heap_t* heap = NULL;
+ rec_offs* offsets = rec_get_offsets(rec, index, NULL,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ if (rec_offs_any_default(offsets)) {
+inconsistent:
+ mem_heap_free(heap);
+ goto incompatible;
+ }
+
+ /* In fact, because we only ever append fields to the metadata
+ record, it is also OK to perform READ UNCOMMITTED and
+ then ignore any extra fields, provided that
+ trx_sys.is_registered(DB_TRX_ID). */
+ if (rec_offs_n_fields(offsets)
+ > ulint(index->n_fields) + !!index->table->instant
+ && !trx_sys.is_registered(current_trx(),
+ row_get_rec_trx_id(rec, index,
+ offsets))) {
+ goto inconsistent;
+ }
+
+ for (unsigned i = index->n_core_fields; i < index->n_fields; i++) {
+ dict_col_t* col = index->fields[i].col;
+ const unsigned o = i + !!index->table->instant;
+ ulint len;
+ const byte* data = rec_get_nth_field(rec, offsets, o, &len);
+ ut_ad(!col->is_added());
+ ut_ad(!col->def_val.data);
+ col->def_val.len = len;
+ switch (len) {
+ case UNIV_SQL_NULL:
+ continue;
+ case 0:
+ col->def_val.data = field_ref_zero;
+ continue;
+ }
+ ut_ad(len != UNIV_SQL_DEFAULT);
+ if (!rec_offs_nth_extern(offsets, o)) {
+ col->def_val.data = mem_heap_dup(
+ index->table->heap, data, len);
+ } else if (len < BTR_EXTERN_FIELD_REF_SIZE
+ || !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE)) {
+ col->def_val.len = UNIV_SQL_DEFAULT;
+ goto inconsistent;
+ } else {
+ col->def_val.data = btr_copy_externally_stored_field(
+ &col->def_val.len, data,
+ cur.page_cur.block->zip_size(),
+ len, index->table->heap);
+ }
+ }
+
+ mem_heap_free(heap);
+ return DB_SUCCESS;
+}
+
+/** Load the instant ALTER TABLE metadata from the clustered index
+when loading a table definition.
+@param[in,out] table table definition from the data dictionary
+@return error code
+@retval DB_SUCCESS if no error occurred */
+dberr_t
+btr_cur_instant_init(dict_table_t* table)
+{
+ mtr_t mtr;
+ dict_index_t* index = dict_table_get_first_index(table);
+ mtr.start();
+ dberr_t err = index
+ ? btr_cur_instant_init_low(index, &mtr)
+ : DB_CORRUPTION;
+ mtr.commit();
+ return(err);
+}
+
+/** Initialize the n_core_null_bytes on first access to a clustered
+index root page.
+@param[in] index clustered index that is on its first access
+@param[in] page clustered index root page
+@return whether the page is corrupted */
+bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
+{
+ ut_ad(!index->is_dummy);
+ ut_ad(fil_page_index_page_check(page));
+ ut_ad(!page_has_siblings(page));
+ ut_ad(page_get_space_id(page) == index->table->space_id);
+ ut_ad(page_get_page_no(page) == index->page);
+ ut_ad(!page_is_comp(page) == !dict_table_is_comp(index->table));
+ ut_ad(index->is_primary());
+ ut_ad(!index->is_instant());
+ ut_ad(index->table->supports_instant());
+ /* This is normally executed as part of btr_cur_instant_init()
+ when dict_load_table_one() is loading a table definition.
+ Other threads should not access or modify the n_core_null_bytes,
+ n_core_fields before dict_load_table_one() returns.
+
+ This can also be executed during IMPORT TABLESPACE, where the
+ table definition is exclusively locked. */
+
+ switch (fil_page_get_type(page)) {
+ default:
+ ut_ad("wrong page type" == 0);
+ return true;
+ case FIL_PAGE_INDEX:
+ /* The field PAGE_INSTANT is guaranteed 0 on clustered
+ index root pages of ROW_FORMAT=COMPACT or
+ ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. */
+ ut_ad(!page_is_comp(page) || !page_get_instant(page));
+ index->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
+ return false;
+ case FIL_PAGE_TYPE_INSTANT:
+ break;
+ }
+
+ const uint16_t n = page_get_instant(page);
+
+ if (n < index->n_uniq + DATA_ROLL_PTR) {
+ /* The PRIMARY KEY (or hidden DB_ROW_ID) and
+ DB_TRX_ID,DB_ROLL_PTR columns must always be present
+ as 'core' fields. */
+ return true;
+ }
+
+ if (n > REC_MAX_N_FIELDS) {
+ return true;
+ }
+
+ index->n_core_fields = n & dict_index_t::MAX_N_FIELDS;
+
+ const rec_t* infimum = page_get_infimum_rec(page);
+ const rec_t* supremum = page_get_supremum_rec(page);
+
+ if (!memcmp(infimum, "infimum", 8)
+ && !memcmp(supremum, "supremum", 8)) {
+ if (n > index->n_fields) {
+ /* All fields, including those for instantly
+ added columns, must be present in the
+ data dictionary. */
+ return true;
+ }
+
+ ut_ad(!index->is_dummy);
+ ut_d(index->is_dummy = true);
+ index->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(index->get_n_nullable(n)));
+ ut_d(index->is_dummy = false);
+ return false;
+ }
+
+ if (memcmp(infimum, field_ref_zero, 8)
+ || memcmp(supremum, field_ref_zero, 7)) {
+ /* The infimum and supremum records must either contain
+ the original strings, or they must be filled with zero
+ bytes, except for the bytes that we have repurposed. */
+ return true;
+ }
+
+ index->n_core_null_bytes = supremum[7];
+ return index->n_core_null_bytes > 128;
+}
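+
+/* An illustrative example (hypothetical schema): for a ROW_FORMAT=DYNAMIC
+clustered index whose root page is still FIL_PAGE_INDEX, with
+index->n_nullable == 3, the code above computes
+	index->n_core_null_bytes = UT_BITS_IN_BYTES(3) = 1,
+i.e. one null-flag byte covering all three nullable core fields. For a
+FIL_PAGE_TYPE_INSTANT root with a repurposed supremum, the byte count is
+instead read directly from supremum[7], and any value above 128 is
+treated as corruption. */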
+
+/** Optimistically latches the leaf page or pages requested.
+@param[in] block guessed buffer block
+@param[in] modify_clock modify clock value
+@param[in,out] latch_mode BTR_SEARCH_LEAF, ...
+@param[in,out] cursor cursor
+@param[in] file file name
+@param[in] line line where called
+@param[in] mtr mini-transaction
+@return true if success */
+bool
+btr_cur_optimistic_latch_leaves(
+ buf_block_t* block,
+ ib_uint64_t modify_clock,
+ ulint* latch_mode,
+ btr_cur_t* cursor,
+ const char* file,
+ unsigned line,
+ mtr_t* mtr)
+{
+ ut_ad(block->page.buf_fix_count());
+ ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+
+ switch (*latch_mode) {
+ default:
+ ut_error;
+ return(false);
+ case BTR_SEARCH_LEAF:
+ case BTR_MODIFY_LEAF:
+ return(buf_page_optimistic_get(*latch_mode, block,
+ modify_clock, file, line, mtr));
+ case BTR_SEARCH_PREV:
+ case BTR_MODIFY_PREV:
+ rw_lock_s_lock(&block->lock);
+ if (block->modify_clock != modify_clock) {
+ rw_lock_s_unlock(&block->lock);
+ return false;
+ }
+ const uint32_t curr_page_no = block->page.id().page_no();
+ const uint32_t left_page_no = btr_page_get_prev(block->frame);
+ rw_lock_s_unlock(&block->lock);
+
+ const rw_lock_type_t mode = *latch_mode == BTR_SEARCH_PREV
+ ? RW_S_LATCH : RW_X_LATCH;
+
+ if (left_page_no != FIL_NULL) {
+ dberr_t err = DB_SUCCESS;
+ cursor->left_block = buf_page_get_gen(
+ page_id_t(cursor->index->table->space_id,
+ left_page_no),
+ cursor->index->table->space->zip_size(),
+ mode, nullptr, BUF_GET_POSSIBLY_FREED,
+ __FILE__, __LINE__, mtr, &err);
+
+			if (!cursor->left_block) {
+				/* The page could not be read (for
+				example, a decryption failure); a NULL
+				block must not be dereferenced below. */
+				cursor->index->table->file_unreadable = true;
+				return false;
+			}
+
+ if (cursor->left_block->page.status
+ == buf_page_t::FREED
+ || btr_page_get_next(cursor->left_block->frame)
+ != curr_page_no) {
+ /* release the left block */
+ btr_leaf_page_release(
+ cursor->left_block, mode, mtr);
+ return false;
+ }
+ } else {
+ cursor->left_block = NULL;
+ }
+
+ if (buf_page_optimistic_get(mode, block, modify_clock,
+ file, line, mtr)) {
+ if (btr_page_get_prev(block->frame) == left_page_no) {
+ /* block was already buffer-fixed while
+ entering the function and
+ buf_page_optimistic_get() buffer-fixes
+ it again. */
+ ut_ad(2 <= block->page.buf_fix_count());
+ *latch_mode = mode;
+ return(true);
+ } else {
+ /* release the block and decrement of
+ buf_fix_count which was incremented
+ in buf_page_optimistic_get() */
+ btr_leaf_page_release(block, mode, mtr);
+ }
+ }
+
+ ut_ad(block->page.buf_fix_count());
+ /* release the left block */
+ if (cursor->left_block != NULL) {
+ btr_leaf_page_release(cursor->left_block,
+ mode, mtr);
+ }
+ }
+
+ return false;
+}
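+
+/* Sketch of the optimistic-restore contract (illustrative only): the
+caller must already hold a buffer-fix on the guessed block, and passes
+the modify_clock value that it saved while it last held the page latched,
+e.g.
+
+	ib_uint64_t saved_clock = block->modify_clock;	// under page latch
+	...
+	ulint latch_mode = BTR_SEARCH_PREV;
+	if (btr_cur_optimistic_latch_leaves(block, saved_clock,
+					    &latch_mode, cursor,
+					    __FILE__, __LINE__, mtr)) {
+		// the cursor position is still valid; latch_mode now
+		// holds the plain RW latch mode that was acquired
+	}
+
+If the clock has moved, the page was modified (or evicted and reused)
+in the meantime, and the caller must fall back to a full search. */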
+
+/**
+Gets the intention as a btr_intention_t from latch_mode, and clears the
+intention flags in the latch_mode.
+@param latch_mode in/out: pointer to latch_mode
+@return intention for latching the tree */
+static
+btr_intention_t
+btr_cur_get_and_clear_intention(
+ ulint *latch_mode)
+{
+ btr_intention_t intention;
+
+ switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) {
+ case BTR_LATCH_FOR_INSERT:
+ intention = BTR_INTENTION_INSERT;
+ break;
+ case BTR_LATCH_FOR_DELETE:
+ intention = BTR_INTENTION_DELETE;
+ break;
+ default:
+ /* both or unknown */
+ intention = BTR_INTENTION_BOTH;
+ }
+ *latch_mode &= ulint(~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE));
+
+ return(intention);
+}
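+
+/* Example of the decoding: a caller passing
+latch_mode = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE gets
+BTR_INTENTION_DELETE back, and *latch_mode is reduced to plain
+BTR_MODIFY_TREE. If both BTR_LATCH_FOR_INSERT and BTR_LATCH_FOR_DELETE
+are set (or neither is), the result is the conservative
+BTR_INTENTION_BOTH. */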
+
+/**
+Gets the desired latch type for the root leaf (the case where the root
+page is also a leaf page) for the given latch mode.
+@param latch_mode in: BTR_SEARCH_LEAF, ...
+@return latch type */
+static
+rw_lock_type_t
+btr_cur_latch_for_root_leaf(
+ ulint latch_mode)
+{
+ switch (latch_mode) {
+ case BTR_SEARCH_LEAF:
+ case BTR_SEARCH_TREE:
+ case BTR_SEARCH_PREV:
+ return(RW_S_LATCH);
+ case BTR_MODIFY_LEAF:
+ case BTR_MODIFY_TREE:
+ case BTR_MODIFY_PREV:
+ return(RW_X_LATCH);
+ case BTR_CONT_MODIFY_TREE:
+ case BTR_CONT_SEARCH_TREE:
+		/* The root page should already be latched, so it
+		does not need to be latched here.
+		Fall through (RW_NO_LATCH) */
+ case BTR_NO_LATCHES:
+ return(RW_NO_LATCH);
+ }
+
+ ut_error;
+ return(RW_NO_LATCH); /* avoid compiler warnings */
+}
+
+/** Detects whether modifying the record might require modifying the tree structure.
+@param[in] index index
+@param[in] page page
+@param[in] lock_intention lock intention for the tree operation
+@param[in] rec record (current node_ptr)
+@param[in] rec_size size of the record or max size of node_ptr
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] mtr mtr
+@return true if tree modification is needed */
+static
+bool
+btr_cur_will_modify_tree(
+ dict_index_t* index,
+ const page_t* page,
+ btr_intention_t lock_intention,
+ const rec_t* rec,
+ ulint rec_size,
+ ulint zip_size,
+ mtr_t* mtr)
+{
+ ut_ad(!page_is_leaf(page));
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+
+	/* A pessimistic delete of the first record causes a delete & insert
+	of the node_ptr at the upper level, and a subsequent page shrink is
+	possible, which causes a delete of the node_ptr at the upper level.
+	So we should pay attention not only to the first and last records,
+	but also to the second record: if the "delete & insert" is done on a
+	different page, the second record becomes the first record, and a
+	following compress might delete that record and cause another
+	node_ptr modification at the upper level. */
+
+ const ulint n_recs = page_get_n_recs(page);
+
+ if (lock_intention <= BTR_INTENTION_BOTH) {
+ compile_time_assert(BTR_INTENTION_DELETE < BTR_INTENTION_BOTH);
+ compile_time_assert(BTR_INTENTION_BOTH < BTR_INTENTION_INSERT);
+
+ if (!page_has_siblings(page)) {
+ return true;
+ }
+
+ ulint margin = rec_size;
+
+ if (lock_intention == BTR_INTENTION_BOTH) {
+ ulint level = btr_page_get_level(page);
+
+			/* This value is the worst-case expectation for the
+			number of node_ptr records to be deleted from this
+			page. It is used to estimate whether the cursor
+			position could become the leftmost record in this
+			page. */
+			ulint max_nodes_deleted = 0;
+
+			/* Tree-modifying operations arising from below this
+			level can logically cause at most 2 ^ (level - 1)
+			record deletions, even in the rarest worst case. */
+ if (level > 7) {
+ /* TODO: adjust this practical limit. */
+ max_nodes_deleted = 64;
+ } else if (level > 0) {
+ max_nodes_deleted = (ulint)1 << (level - 1);
+ }
+			/* Check what a delete would cause
+			(BTR_INTENTION_BOTH or BTR_INTENTION_DELETE). */
+ if (n_recs <= max_nodes_deleted * 2
+ || page_rec_is_first(rec, page)) {
+ /* The cursor record can be the left most record
+ in this page. */
+ return true;
+ }
+
+ if (page_has_prev(page)
+ && page_rec_distance_is_at_most(
+ page_get_infimum_rec(page), rec,
+ max_nodes_deleted)) {
+ return true;
+ }
+
+ if (page_has_next(page)
+ && page_rec_distance_is_at_most(
+ rec, page_get_supremum_rec(page),
+ max_nodes_deleted)) {
+ return true;
+ }
+
+			/* Deleting the leftmost record of a page causes a
+			delete & insert at its parent page. After that, the
+			delete might trigger btr_compress() and delete a
+			record at the parent page as well. Thus we should
+			account for the maximum number of deletes. */
+ margin *= max_nodes_deleted;
+ }
+
+ /* Safe because we already have SX latch of the index tree */
+ if (page_get_data_size(page)
+ < margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)) {
+ return(true);
+ }
+ }
+
+ if (lock_intention >= BTR_INTENTION_BOTH) {
+		/* Check what an insert would cause
+		(BTR_INTENTION_BOTH or BTR_INTENTION_INSERT). */
+
+		/* When btr_cur_limit_optimistic_insert_debug is in effect,
+		we should check it here in advance, since the maximum
+		number of records allowed in a page is limited. */
+ LIMIT_OPTIMISTIC_INSERT_DEBUG(n_recs, return true);
+
+		/* We need space for 2 records, for the case where a single
+		split and insert cannot fit.
+		page_get_max_insert_size_after_reorganize() already includes
+		space for the page directory. */
+ ulint max_size
+ = page_get_max_insert_size_after_reorganize(page, 2);
+
+ if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
+ || max_size < rec_size * 2) {
+ return(true);
+ }
+
+ /* TODO: optimize this condition for ROW_FORMAT=COMPRESSED.
+ This is based on the worst case, and we could invoke
+ page_zip_available() on the block->page.zip. */
+		/* We also need space for 2 records at the worst
+		compression rate. */
+ if (zip_size
+ && page_zip_empty_size(index->n_fields, zip_size)
+ <= rec_size * 2 + page_get_data_size(page)
+ + page_dir_calc_reserved_space(n_recs + 2)) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
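+
+/* A worked example of the delete margin above (hypothetical numbers):
+for a node_ptr page at level == 3, max_nodes_deleted = 1 << (3 - 1) = 4,
+so with lock_intention == BTR_INTENTION_BOTH the function returns true
+when the page holds n_recs <= 8 records, when the cursor is within 4
+records of either page end (and a sibling exists on that side), or when
+the page's data size falls below
+	margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index),
+where margin = 4 * rec_size. The insert-side checks further below may
+still trigger independently. */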
+
+/** Detects whether modifying the record might require a modification
+opposite to the intention.
+@param[in] page page
+@param[in] lock_intention lock intention for the tree operation
+@param[in] rec record (current node_ptr)
+@return true if tree modification is needed */
+static
+bool
+btr_cur_need_opposite_intention(
+ const page_t* page,
+ btr_intention_t lock_intention,
+ const rec_t* rec)
+{
+ switch (lock_intention) {
+ case BTR_INTENTION_DELETE:
+ return (page_has_prev(page) && page_rec_is_first(rec, page)) ||
+ (page_has_next(page) && page_rec_is_last(rec, page));
+ case BTR_INTENTION_INSERT:
+ return page_has_next(page) && page_rec_is_last(rec, page);
+ case BTR_INTENTION_BOTH:
+ return(false);
+ }
+
+ ut_error;
+ return(false);
+}
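+
+/* A concrete scenario for the check above: with BTR_INTENTION_DELETE,
+if the cursor is on the first record of a page that has a left sibling,
+deleting it changes the node_ptr key in the parent, which is a delete &
+insert (an insert-like modification) at the upper level; hence the
+opposite intention is needed and the caller must retry. */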
+
+/**
+@param[in] index b-tree
+@return maximum size of a node pointer record in bytes */
+static ulint btr_node_ptr_max_size(const dict_index_t* index)
+{
+ if (dict_index_is_ibuf(index)) {
+		/* We cannot estimate this accurately. This is the universal
+		index for the change buffer. The maximum size of an entry is
+		about max key length * 2
+		(the index key + the primary key to be inserted to the index).
+		(The max key length is UNIV_PAGE_SIZE / 16 * 3 at
+		ha_innobase::max_supported_key_length(), considering that
+		MAX_KEY_LENGTH = 3072 in MySQL imposes the historical InnoDB
+		value of 3500 for the 16K page size case.)
+		For the universal index, the node_ptr contains most of the
+		entry, and 512 bytes is enough for the ibuf columns and
+		metadata. */
+ return srv_page_size / 8 * 3 + 512;
+ }
+
+	/* Each node pointer record has the child page_no field, the
+	stored length of that field, and the record header. */
+ ulint comp = dict_table_is_comp(index->table);
+ ulint rec_max_size = comp
+ ? REC_NODE_PTR_SIZE + 1 + REC_N_NEW_EXTRA_BYTES
+ + UT_BITS_IN_BYTES(index->n_nullable)
+ : REC_NODE_PTR_SIZE + 2 + REC_N_OLD_EXTRA_BYTES
+ + 2 * index->n_fields;
+
+ /* Compute the maximum possible record size. */
+ for (ulint i = 0; i < dict_index_get_n_unique_in_tree(index); i++) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ ulint field_max_size;
+ ulint field_ext_max_size;
+
+ /* Determine the maximum length of the index field. */
+
+ field_max_size = dict_col_get_fixed_size(col, comp);
+ if (field_max_size) {
+ /* dict_index_add_col() should guarantee this */
+ ut_ad(!field->prefix_len
+ || field->fixed_len == field->prefix_len);
+ /* Fixed lengths are not encoded
+ in ROW_FORMAT=COMPACT. */
+ rec_max_size += field_max_size;
+ continue;
+ }
+
+ field_max_size = dict_col_get_max_size(col);
+ if (UNIV_UNLIKELY(!field_max_size)) {
+ switch (col->mtype) {
+ case DATA_VARCHAR:
+ if (!comp
+ && (!strcmp(index->table->name.m_name,
+ "SYS_FOREIGN")
+ || !strcmp(index->table->name.m_name,
+ "SYS_FOREIGN_COLS"))) {
+ break;
+ }
+ /* fall through */
+ case DATA_VARMYSQL:
+ case DATA_CHAR:
+ case DATA_MYSQL:
+ /* CHAR(0) and VARCHAR(0) are possible
+ data type definitions in MariaDB.
+ The InnoDB internal SQL parser maps
+ CHAR to DATA_VARCHAR, so DATA_CHAR (or
+ DATA_MYSQL) is only coming from the
+ MariaDB SQL layer. */
+ if (comp) {
+				/* Add a length byte, because
+				fixed-length empty fields are
+				encoded as variable-length.
+ For ROW_FORMAT=REDUNDANT,
+ these bytes were added to
+ rec_max_size before this loop. */
+ rec_max_size++;
+ }
+ continue;
+ }
+
+ /* SYS_FOREIGN.ID is defined as CHAR in the
+ InnoDB internal SQL parser, which translates
+ into the incorrect VARCHAR(0). InnoDB does
+ not enforce maximum lengths of columns, so
+ that is why any data can be inserted in the
+ first place.
+
+ Likewise, SYS_FOREIGN.FOR_NAME,
+ SYS_FOREIGN.REF_NAME, SYS_FOREIGN_COLS.ID, are
+ defined as CHAR, and also they are part of a key. */
+
+ ut_ad(!strcmp(index->table->name.m_name,
+ "SYS_FOREIGN")
+ || !strcmp(index->table->name.m_name,
+ "SYS_FOREIGN_COLS"));
+ ut_ad(!comp);
+ ut_ad(col->mtype == DATA_VARCHAR);
+
+ rec_max_size += (srv_page_size == UNIV_PAGE_SIZE_MAX)
+ ? REDUNDANT_REC_MAX_DATA_SIZE
+ : page_get_free_space_of_empty(FALSE) / 2;
+ } else if (field_max_size == NAME_LEN && i == 1
+ && (!strcmp(index->table->name.m_name,
+ TABLE_STATS_NAME)
+ || !strcmp(index->table->name.m_name,
+ INDEX_STATS_NAME))) {
+ /* Interpret "table_name" as VARCHAR(199) even
+ if it was incorrectly defined as VARCHAR(64).
+ While the caller of ha_innobase enforces the
+ maximum length on any data written, the InnoDB
+ internal SQL parser will happily write as much
+ data as is provided. The purpose of this hack
+ is to avoid InnoDB hangs after persistent
+ statistics on partitioned tables are
+ deleted. */
+ field_max_size = 199 * SYSTEM_CHARSET_MBMAXLEN;
+ }
+ field_ext_max_size = field_max_size < 256 ? 1 : 2;
+
+ if (field->prefix_len
+ && field->prefix_len < field_max_size) {
+ field_max_size = field->prefix_len;
+ }
+
+ if (comp) {
+ /* Add the extra size for ROW_FORMAT=COMPACT.
+ For ROW_FORMAT=REDUNDANT, these bytes were
+ added to rec_max_size before this loop. */
+ rec_max_size += field_ext_max_size;
+ }
+
+ rec_max_size += field_max_size;
+ }
+
+ return rec_max_size;
+}
+
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+Note that if mode is PAGE_CUR_LE, which is used in inserts, then
+cursor->up_match and cursor->low_match both will have sensible values.
+If mode is PAGE_CUR_GE, then up_match will have a sensible value.
+
+If mode is PAGE_CUR_LE, the cursor is left at the place where an insert of the
+search tuple should be performed in the B-tree. InnoDB does an insert
+immediately after the cursor. Thus, the cursor may end up on a user record,
+or on a page infimum record. */
+dberr_t
+btr_cur_search_to_nth_level_func(
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the tree level of search */
+ const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
+ tuple must be set so that it cannot get
+ compared to the node ptr page number field! */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
+ Inserts should always be made using
+ PAGE_CUR_LE to search the position! */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
+ at most one of BTR_INSERT, BTR_DELETE_MARK,
+ BTR_DELETE, or BTR_ESTIMATE;
+ cursor->left_block is used to store a pointer
+ to the left neighbor page, in the cases
+ BTR_SEARCH_PREV and BTR_MODIFY_PREV;
+ NOTE that if ahi_latch, we might not have a
+ cursor page latch, we assume that ahi_latch
+ protects the record! */
+ btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is
+ s- or x-latched, but see also above! */
+#ifdef BTR_CUR_HASH_ADAPT
+ rw_lock_t* ahi_latch,
+ /*!< in: currently held btr_search_latch
+ (in RW_S_LATCH mode), or NULL */
+#endif /* BTR_CUR_HASH_ADAPT */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr, /*!< in: mtr */
+ ib_uint64_t autoinc)/*!< in: PAGE_ROOT_AUTO_INC to be written
+ (0 if none) */
+{
+ page_t* page = NULL; /* remove warning */
+ buf_block_t* block;
+ buf_block_t* guess;
+ ulint height;
+ ulint up_match;
+ ulint up_bytes;
+ ulint low_match;
+ ulint low_bytes;
+ ulint rw_latch;
+ page_cur_mode_t page_mode;
+ page_cur_mode_t search_mode = PAGE_CUR_UNSUPP;
+ ulint buf_mode;
+ ulint estimate;
+ ulint node_ptr_max_size = srv_page_size / 2;
+ page_cur_t* page_cursor;
+ btr_op_t btr_op;
+ ulint root_height = 0; /* remove warning */
+ dberr_t err = DB_SUCCESS;
+
+ btr_intention_t lock_intention;
+ bool modify_external;
+ buf_block_t* tree_blocks[BTR_MAX_LEVELS];
+ ulint tree_savepoints[BTR_MAX_LEVELS];
+ ulint n_blocks = 0;
+ ulint n_releases = 0;
+ bool detected_same_key_root = false;
+
+ bool retrying_for_search_prev = false;
+ ulint leftmost_from_level = 0;
+ buf_block_t** prev_tree_blocks = NULL;
+ ulint* prev_tree_savepoints = NULL;
+ ulint prev_n_blocks = 0;
+ ulint prev_n_releases = 0;
+ bool need_path = true;
+ bool rtree_parent_modified = false;
+ bool mbr_adj = false;
+ bool found = false;
+
+ DBUG_ENTER("btr_cur_search_to_nth_level");
+
+#ifdef BTR_CUR_ADAPT
+ btr_search_t* info;
+#endif /* BTR_CUR_ADAPT */
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs offsets2_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets2 = offsets2_;
+ rec_offs_init(offsets_);
+ rec_offs_init(offsets2_);
+	/* Currently, PAGE_CUR_LE is the only search mode used for searches
+	ending at upper levels */
+
+ ut_ad(level == 0 || mode == PAGE_CUR_LE
+ || RTREE_SEARCH_MODE(mode));
+ ut_ad(dict_index_check_search_tuple(index, tuple));
+ ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(index->page != FIL_NULL);
+
+ MEM_UNDEFINED(&cursor->up_match, sizeof cursor->up_match);
+ MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes);
+ MEM_UNDEFINED(&cursor->low_match, sizeof cursor->low_match);
+ MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes);
+#ifdef UNIV_DEBUG
+ cursor->up_match = ULINT_UNDEFINED;
+ cursor->low_match = ULINT_UNDEFINED;
+#endif /* UNIV_DEBUG */
+
+ ibool s_latch_by_caller;
+
+ s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
+
+ ut_ad(!s_latch_by_caller
+ || srv_read_only_mode
+ || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK
+ | MTR_MEMO_SX_LOCK));
+
+ /* These flags are mutually exclusive, they are lumped together
+ with the latch mode for historical reasons. It's possible for
+ none of the flags to be set. */
+ switch (UNIV_EXPECT(latch_mode
+ & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
+ 0)) {
+ case 0:
+ btr_op = BTR_NO_OP;
+ break;
+ case BTR_INSERT:
+ btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
+ ? BTR_INSERT_IGNORE_UNIQUE_OP
+ : BTR_INSERT_OP;
+ break;
+ case BTR_DELETE:
+ btr_op = BTR_DELETE_OP;
+ ut_a(cursor->purge_node);
+ break;
+ case BTR_DELETE_MARK:
+ btr_op = BTR_DELMARK_OP;
+ break;
+ default:
+ /* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
+ should be specified at a time */
+ ut_error;
+ }
+
+ /* Operations on the insert buffer tree cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
+ /* Operations on the clustered index cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
+ /* Operations on the temporary table(indexes) cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !index->table->is_temporary());
+ /* Operation on the spatial index cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index));
+
+ estimate = latch_mode & BTR_ESTIMATE;
+
+ lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
+
+ modify_external = latch_mode & BTR_MODIFY_EXTERNAL;
+
+ /* Turn the flags unrelated to the latch mode off. */
+ latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+
+ ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF);
+
+ ut_ad(!s_latch_by_caller
+ || latch_mode == BTR_SEARCH_LEAF
+ || latch_mode == BTR_SEARCH_TREE
+ || latch_mode == BTR_MODIFY_LEAF);
+
+ ut_ad(autoinc == 0 || dict_index_is_clust(index));
+ ut_ad(autoinc == 0
+ || latch_mode == BTR_MODIFY_TREE
+ || latch_mode == BTR_MODIFY_LEAF);
+ ut_ad(autoinc == 0 || level == 0);
+
+ cursor->flag = BTR_CUR_BINARY;
+ cursor->index = index;
+
+#ifndef BTR_CUR_ADAPT
+ guess = NULL;
+#else
+ info = btr_search_get_info(index);
+ guess = info->root_guess;
+
+#ifdef BTR_CUR_HASH_ADAPT
+
+# ifdef UNIV_SEARCH_PERF_STAT
+ info->n_searches++;
+# endif
+ if (autoinc == 0
+ && latch_mode <= BTR_MODIFY_LEAF
+ && info->last_hash_succ
+# ifdef MYSQL_INDEX_DISABLE_AHI
+ && !index->disable_ahi
+# endif
+ && !estimate
+# ifdef PAGE_CUR_LE_OR_EXTENDS
+ && mode != PAGE_CUR_LE_OR_EXTENDS
+# endif /* PAGE_CUR_LE_OR_EXTENDS */
+ && !dict_index_is_spatial(index)
+ /* If !ahi_latch, we do a dirty read of
+ btr_search_enabled below, and btr_search_guess_on_hash()
+ will have to check it again. */
+ && btr_search_enabled
+ && !modify_external
+ && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG)
+ && btr_search_guess_on_hash(index, info, tuple, mode,
+ latch_mode, cursor,
+ ahi_latch, mtr)) {
+
+ /* Search using the hash index succeeded */
+
+ ut_ad(cursor->up_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_GE);
+ ut_ad(cursor->up_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_LE);
+ ut_ad(cursor->low_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_LE);
+ btr_cur_n_sea++;
+
+ DBUG_RETURN(err);
+ }
+# endif /* BTR_CUR_HASH_ADAPT */
+#endif /* BTR_CUR_ADAPT */
+ btr_cur_n_non_sea++;
+
+ /* If the hash search did not succeed, do binary search down the
+ tree */
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (ahi_latch) {
+ /* Release possible search latch to obey latching order */
+ rw_lock_s_unlock(ahi_latch);
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /* Store the position of the tree latch we push to mtr so that we
+ know how to release it when we have latched leaf node(s) */
+
+ ulint savepoint = mtr_set_savepoint(mtr);
+
+ rw_lock_type_t upper_rw_latch;
+
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+		/* Most delete-intended operations are purges. Free blocks
+		and read I/O bandwidth should be given to them with
+		priority when the history list is growing huge. */
+ if (lock_intention == BTR_INTENTION_DELETE
+ && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
+ && buf_pool.n_pend_reads) {
+x_latch_index:
+ mtr_x_lock_index(index, mtr);
+ } else if (index->is_spatial()
+ && lock_intention <= BTR_INTENTION_BOTH) {
+			/* X-latch the index if there is a possibility of a
+			pessimistic delete on a spatial index, as we might
+			have to latch upward in the tree */
+ goto x_latch_index;
+ } else {
+ mtr_sx_lock_index(index, mtr);
+ }
+ upper_rw_latch = RW_X_LATCH;
+ break;
+ case BTR_CONT_MODIFY_TREE:
+ case BTR_CONT_SEARCH_TREE:
+ /* Do nothing */
+ ut_ad(srv_read_only_mode
+ || mtr->memo_contains_flagged(&index->lock,
+ MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ if (dict_index_is_spatial(index)
+ && latch_mode == BTR_CONT_MODIFY_TREE) {
+			/* If we are about to locate the parent page for a
+			split and/or merge operation on an R-tree index,
+			X-latch the parent */
+ upper_rw_latch = RW_X_LATCH;
+ } else {
+ upper_rw_latch = RW_NO_LATCH;
+ }
+ break;
+ default:
+ if (!srv_read_only_mode) {
+ if (s_latch_by_caller) {
+ ut_ad(rw_lock_own(dict_index_get_lock(index),
+ RW_LOCK_S));
+ } else if (!modify_external) {
+ /* BTR_SEARCH_TREE is intended to be used with
+ BTR_ALREADY_S_LATCHED */
+ ut_ad(latch_mode != BTR_SEARCH_TREE);
+
+ mtr_s_lock_index(index, mtr);
+ } else {
+ /* BTR_MODIFY_EXTERNAL needs to be excluded */
+ mtr_sx_lock_index(index, mtr);
+ }
+ upper_rw_latch = RW_S_LATCH;
+ } else {
+ upper_rw_latch = RW_NO_LATCH;
+ }
+ }
+ const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
+ latch_mode);
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ const ulint zip_size = index->table->space->zip_size();
+
+ /* Start with the root page. */
+ page_id_t page_id(index->table->space_id, index->page);
+
+ if (root_leaf_rw_latch == RW_X_LATCH) {
+ node_ptr_max_size = btr_node_ptr_max_size(index);
+ }
+
+ up_match = 0;
+ up_bytes = 0;
+ low_match = 0;
+ low_bytes = 0;
+
+ height = ULINT_UNDEFINED;
+
+ /* We use these modified search modes on non-leaf levels of the
+ B-tree. These let us end up in the right B-tree leaf. In that leaf
+ we use the original search mode. */
+
+ switch (mode) {
+ case PAGE_CUR_GE:
+ page_mode = PAGE_CUR_L;
+ break;
+ case PAGE_CUR_G:
+ page_mode = PAGE_CUR_LE;
+ break;
+ default:
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+ || RTREE_SEARCH_MODE(mode)
+ || mode == PAGE_CUR_LE_OR_EXTENDS);
+#else /* PAGE_CUR_LE_OR_EXTENDS */
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+ || RTREE_SEARCH_MODE(mode));
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ page_mode = mode;
+ break;
+ }
+
+ /* Loop and search until we arrive at the desired level */
+ btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
+
+search_loop:
+ buf_mode = BUF_GET;
+ rw_latch = RW_NO_LATCH;
+ rtree_parent_modified = false;
+
+ if (height != 0) {
+ /* We are about to fetch the root or a non-leaf page. */
+ if ((latch_mode != BTR_MODIFY_TREE || height == level)
+ && !retrying_for_search_prev) {
+			/* If we do not hold an SX or X latch on the index,
+			each page should be latched before reading. */
+ if (height == ULINT_UNDEFINED
+ && upper_rw_latch == RW_S_LATCH
+ && (modify_external || autoinc)) {
+				/* We need an sx-latch on the root
+				page for an fseg operation or for
+				writing PAGE_ROOT_AUTO_INC */
+ rw_latch = RW_SX_LATCH;
+ } else {
+ rw_latch = upper_rw_latch;
+ }
+ }
+ } else if (latch_mode <= BTR_MODIFY_LEAF) {
+ rw_latch = latch_mode;
+
+ if (btr_op != BTR_NO_OP
+ && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
+
+ /* Try to buffer the operation if the leaf
+ page is not in the buffer pool. */
+
+ buf_mode = btr_op == BTR_DELETE_OP
+ ? BUF_GET_IF_IN_POOL_OR_WATCH
+ : BUF_GET_IF_IN_POOL;
+ }
+ }
+
+retry_page_get:
+ ut_ad(n_blocks < BTR_MAX_LEVELS);
+ tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
+ block = buf_page_get_gen(page_id, zip_size, rw_latch, guess,
+ buf_mode, file, line, mtr, &err,
+ height == 0 && !index->is_clust());
+ tree_blocks[n_blocks] = block;
+
+ /* Note that block==NULL signifies either an error or change
+ buffering. */
+
+ if (err != DB_SUCCESS) {
+ ut_ad(block == NULL);
+ if (err == DB_DECRYPTION_FAILED) {
+ ib_push_warning((void *)NULL,
+ DB_DECRYPTION_FAILED,
+ "Table %s is encrypted but encryption service or"
+ " used key_id is not available. "
+ " Can't continue reading table.",
+ index->table->name.m_name);
+ index->table->file_unreadable = true;
+ }
+
+ goto func_exit;
+ }
+
+ if (block == NULL) {
+		/* This must be a search to perform an insert, delete-mark,
+		or delete; try using the insert/delete buffer */
+
+ ut_ad(height == 0);
+ ut_ad(cursor->thr);
+
+ switch (btr_op) {
+ case BTR_INSERT_OP:
+ case BTR_INSERT_IGNORE_UNIQUE_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
+ ut_ad(!dict_index_is_spatial(index));
+
+ if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
+ page_id, zip_size, cursor->thr)) {
+
+ cursor->flag = BTR_CUR_INSERT_TO_IBUF;
+
+ goto func_exit;
+ }
+ break;
+
+ case BTR_DELMARK_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
+ ut_ad(!dict_index_is_spatial(index));
+
+ if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
+ index, page_id, zip_size,
+ cursor->thr)) {
+
+ cursor->flag = BTR_CUR_DEL_MARK_IBUF;
+
+ goto func_exit;
+ }
+
+ break;
+
+ case BTR_DELETE_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
+ ut_ad(!dict_index_is_spatial(index));
+
+ if (!row_purge_poss_sec(cursor->purge_node,
+ index, tuple)) {
+
+ /* The record cannot be purged yet. */
+ cursor->flag = BTR_CUR_DELETE_REF;
+ } else if (ibuf_insert(IBUF_OP_DELETE, tuple,
+ index, page_id, zip_size,
+ cursor->thr)) {
+
+ /* The purge was buffered. */
+ cursor->flag = BTR_CUR_DELETE_IBUF;
+ } else {
+ /* The purge could not be buffered. */
+ buf_pool.watch_unset(page_id);
+ break;
+ }
+
+ buf_pool.watch_unset(page_id);
+ goto func_exit;
+
+ default:
+ ut_error;
+ }
+
+ /* Insert to the insert/delete buffer did not succeed, we
+ must read the page from disk. */
+
+ buf_mode = BUF_GET;
+
+ goto retry_page_get;
+ }
+
+ if (retrying_for_search_prev && height != 0) {
+ /* also latch left sibling */
+ uint32_t left_page_no;
+ buf_block_t* get_block;
+
+ ut_ad(rw_latch == RW_NO_LATCH);
+
+ rw_latch = upper_rw_latch;
+
+ rw_lock_s_lock(&block->lock);
+ left_page_no = btr_page_get_prev(buf_block_get_frame(block));
+ rw_lock_s_unlock(&block->lock);
+
+ if (left_page_no != FIL_NULL) {
+ ut_ad(prev_n_blocks < leftmost_from_level);
+
+ prev_tree_savepoints[prev_n_blocks]
+ = mtr_set_savepoint(mtr);
+ get_block = buf_page_get_gen(
+ page_id_t(page_id.space(), left_page_no),
+ zip_size, rw_latch, NULL, buf_mode,
+ file, line, mtr, &err);
+ prev_tree_blocks[prev_n_blocks] = get_block;
+ prev_n_blocks++;
+
+ if (err != DB_SUCCESS) {
+ if (err == DB_DECRYPTION_FAILED) {
+ ib_push_warning((void *)NULL,
+ DB_DECRYPTION_FAILED,
+ "Table %s is encrypted but encryption service or"
+ " used key_id is not available. "
+ " Can't continue reading table.",
+ index->table->name.m_name);
+ index->table->file_unreadable = true;
+ }
+
+ goto func_exit;
+ }
+
+			/* BTR_MODIFY_TREE does not update prev/next_page_no
+			without holding the parent page's lock. So there is
+			no need to retry here, because we hold the parent
+			page's lock. */
+ }
+
+ /* release RW_NO_LATCH page and lock with RW_S_LATCH */
+ mtr_release_block_at_savepoint(
+ mtr, tree_savepoints[n_blocks],
+ tree_blocks[n_blocks]);
+
+ tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
+ block = buf_page_get_gen(page_id, zip_size,
+ rw_latch, NULL, buf_mode,
+ file, line, mtr, &err);
+ tree_blocks[n_blocks] = block;
+
+ if (err != DB_SUCCESS) {
+ if (err == DB_DECRYPTION_FAILED) {
+ ib_push_warning((void *)NULL,
+ DB_DECRYPTION_FAILED,
+ "Table %s is encrypted but encryption service or"
+ " used key_id is not available. "
+ " Can't continue reading table.",
+ index->table->name.m_name);
+ index->table->file_unreadable = true;
+ }
+
+ goto func_exit;
+ }
+ }
+
+ page = buf_block_get_frame(block);
+
+ if (height == ULINT_UNDEFINED
+ && page_is_leaf(page)
+ && rw_latch != RW_NO_LATCH
+ && rw_latch != root_leaf_rw_latch) {
+ /* The root page is also a leaf page (root_leaf).
+ We should reacquire the page, because the root page
+ is latched differently from leaf pages. */
+ ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
+ ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH);
+ ut_ad(rw_latch == RW_S_LATCH || modify_external || autoinc);
+ ut_ad(!autoinc || root_leaf_rw_latch == RW_X_LATCH);
+
+ ut_ad(n_blocks == 0);
+ mtr_release_block_at_savepoint(
+ mtr, tree_savepoints[n_blocks],
+ tree_blocks[n_blocks]);
+
+ upper_rw_latch = root_leaf_rw_latch;
+ goto search_loop;
+ }
+
+ if (rw_latch != RW_NO_LATCH) {
+#ifdef UNIV_ZIP_DEBUG
+ const page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ buf_block_dbg_add_level(
+ block, dict_index_is_ibuf(index)
+ ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
+ }
+
+ ut_ad(fil_page_index_page_check(page));
+ ut_ad(index->id == btr_page_get_index_id(page));
+
+ if (height == ULINT_UNDEFINED) {
+ /* We are in the root node */
+
+ height = btr_page_get_level(page);
+ root_height = height;
+ cursor->tree_height = root_height + 1;
+
+ if (dict_index_is_spatial(index)) {
+ ut_ad(cursor->rtr_info);
+
+ /* If SSN in memory is not initialized, fetch
+ it from root page */
+ if (!rtr_get_current_ssn_id(index)) {
+ /* FIXME: do this in dict_load_table_one() */
+ index->set_ssn(page_get_ssn_id(page) + 1);
+ }
+
+ /* Save the MBR */
+ cursor->rtr_info->thr = cursor->thr;
+ rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr);
+ }
+
+#ifdef BTR_CUR_ADAPT
+ info->root_guess = block;
+#endif
+ }
+
+ if (height == 0) {
+ if (rw_latch == RW_NO_LATCH) {
+ latch_leaves = btr_cur_latch_leaves(
+ block, latch_mode, cursor, mtr);
+ }
+
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+ case BTR_CONT_MODIFY_TREE:
+ case BTR_CONT_SEARCH_TREE:
+ break;
+ default:
+ if (!s_latch_by_caller
+ && !srv_read_only_mode
+ && !modify_external) {
+ /* Release the tree s-latch */
+ /* NOTE: BTR_MODIFY_EXTERNAL
+ needs to keep tree sx-latch */
+ mtr_release_s_latch_at_savepoint(
+ mtr, savepoint,
+ dict_index_get_lock(index));
+ }
+
+ /* release upper blocks */
+ if (retrying_for_search_prev) {
+ ut_ad(!autoinc);
+ for (;
+ prev_n_releases < prev_n_blocks;
+ prev_n_releases++) {
+ mtr_release_block_at_savepoint(
+ mtr,
+ prev_tree_savepoints[
+ prev_n_releases],
+ prev_tree_blocks[
+ prev_n_releases]);
+ }
+ }
+
+ for (; n_releases < n_blocks; n_releases++) {
+ if (n_releases == 0
+ && (modify_external || autoinc)) {
+ /* keep the root page latch */
+ ut_ad(mtr->memo_contains_flagged(
+ tree_blocks[n_releases],
+ MTR_MEMO_PAGE_SX_FIX
+ | MTR_MEMO_PAGE_X_FIX));
+ continue;
+ }
+
+ mtr_release_block_at_savepoint(
+ mtr, tree_savepoints[n_releases],
+ tree_blocks[n_releases]);
+ }
+ }
+
+ page_mode = mode;
+ }
+
+ if (dict_index_is_spatial(index)) {
+ /* Remember the page search mode */
+ search_mode = page_mode;
+
+		/* Adjust the search mode when the page search mode is
+		PAGE_CUR_RTREE_LOCATE or PAGE_CUR_RTREE_INSERT, as we are
+		searching with MBRs. When we are not at the target level,
+		we should search all subtrees that "CONTAIN" the search
+		range/MBR. At the target level, the search becomes
+		PAGE_CUR_LE */
+ if (page_mode == PAGE_CUR_RTREE_LOCATE
+ && level == height) {
+ if (level == 0) {
+ page_mode = PAGE_CUR_LE;
+ } else {
+ page_mode = PAGE_CUR_RTREE_GET_FATHER;
+ }
+ }
+
+ if (page_mode == PAGE_CUR_RTREE_INSERT) {
+ page_mode = (level == height)
+ ? PAGE_CUR_LE
+ : PAGE_CUR_RTREE_INSERT;
+
+ ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE);
+ }
+
+ /* "need_path" indicates if we need to tracking the parent
+ pages, if it is not spatial comparison, then no need to
+ track it */
+ if (page_mode < PAGE_CUR_CONTAIN) {
+ need_path = false;
+ }
+
+ up_match = 0;
+ low_match = 0;
+
+ if (latch_mode == BTR_MODIFY_TREE
+ || latch_mode == BTR_CONT_MODIFY_TREE
+ || latch_mode == BTR_CONT_SEARCH_TREE) {
+			/* The tree is locked; no page lock is needed to
+			protect the "path" */
+ cursor->rtr_info->need_page_lock = false;
+ }
+ }
+
+ if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) {
+ ut_ad(need_path);
+ found = rtr_cur_search_with_match(
+ block, index, tuple, page_mode, page_cursor,
+ cursor->rtr_info);
+
+ /* Need to use BTR_MODIFY_TREE to do the MBR adjustment */
+ if (search_mode == PAGE_CUR_RTREE_INSERT
+ && cursor->rtr_info->mbr_adj) {
+ if (latch_mode & BTR_MODIFY_LEAF) {
+				/* The parent MBR needs to be updated;
+				retry with BTR_MODIFY_TREE */
+ goto func_exit;
+ } else if (latch_mode & BTR_MODIFY_TREE) {
+ rtree_parent_modified = true;
+ cursor->rtr_info->mbr_adj = false;
+ mbr_adj = true;
+ } else {
+ ut_ad(0);
+ }
+ }
+
+ if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) {
+ cursor->low_match =
+ DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
+ }
+#ifdef BTR_CUR_HASH_ADAPT
+ } else if (height == 0 && btr_search_enabled
+ && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG)
+ && !dict_index_is_spatial(index)) {
+ /* The adaptive hash index is only used when searching
+ for leaf pages (height==0), but not in r-trees.
+ We only need the byte prefix comparison for the purpose
+ of updating the adaptive hash index. */
+ page_cur_search_with_match_bytes(
+ block, index, tuple, page_mode, &up_match, &up_bytes,
+ &low_match, &low_bytes, page_cursor);
+#endif /* BTR_CUR_HASH_ADAPT */
+ } else {
+ /* Search for complete index fields. */
+ up_bytes = low_bytes = 0;
+ page_cur_search_with_match(
+ block, index, tuple, page_mode, &up_match,
+ &low_match, page_cursor,
+ need_path ? cursor->rtr_info : NULL);
+ }
+
+ if (estimate) {
+ btr_cur_add_path_info(cursor, height, root_height);
+ }
+
+ /* If this is the desired level, leave the loop */
+
+ ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor)));
+
+	/* Add a predicate lock under SERIALIZABLE isolation,
+	and only in the search case */
+ if (dict_index_is_spatial(index)
+ && cursor->rtr_info->need_prdt_lock
+ && mode != PAGE_CUR_RTREE_INSERT
+ && mode != PAGE_CUR_RTREE_LOCATE
+ && mode >= PAGE_CUR_CONTAIN) {
+ trx_t* trx = thr_get_trx(cursor->thr);
+ lock_prdt_t prdt;
+
+ lock_mutex_enter();
+ lock_init_prdt_from_mbr(
+ &prdt, &cursor->rtr_info->mbr, mode,
+ trx->lock.lock_heap);
+ lock_mutex_exit();
+
+ if (rw_latch == RW_NO_LATCH && height != 0) {
+ rw_lock_s_lock(&(block->lock));
+ }
+
+ lock_prdt_lock(block, &prdt, index, LOCK_S,
+ LOCK_PREDICATE, cursor->thr);
+
+ if (rw_latch == RW_NO_LATCH && height != 0) {
+ rw_lock_s_unlock(&(block->lock));
+ }
+ }
+
+ if (level != height) {
+
+ const rec_t* node_ptr;
+ ut_ad(height > 0);
+
+ height--;
+ guess = NULL;
+
+ node_ptr = page_cur_get_rec(page_cursor);
+
+ offsets = rec_get_offsets(node_ptr, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+		/* If the rec is the first or the last in the page and we
+		have a pessimistic delete intention, it might cause a
+		node_ptr insert at the upper level. We should change the
+		intention and retry. */
+ if (latch_mode == BTR_MODIFY_TREE
+ && btr_cur_need_opposite_intention(
+ page, lock_intention, node_ptr)) {
+
+need_opposite_intention:
+ ut_ad(upper_rw_latch == RW_X_LATCH);
+
+ if (n_releases > 0) {
+ /* release root block */
+ mtr_release_block_at_savepoint(
+ mtr, tree_savepoints[0],
+ tree_blocks[0]);
+ }
+
+ /* release all blocks */
+ for (; n_releases <= n_blocks; n_releases++) {
+ mtr_release_block_at_savepoint(
+ mtr, tree_savepoints[n_releases],
+ tree_blocks[n_releases]);
+ }
+
+ lock_intention = BTR_INTENTION_BOTH;
+
+ page_id.set_page_no(index->page);
+ up_match = 0;
+ low_match = 0;
+ height = ULINT_UNDEFINED;
+
+ n_blocks = 0;
+ n_releases = 0;
+
+ goto search_loop;
+ }
+
+ if (dict_index_is_spatial(index)) {
+ if (page_rec_is_supremum(node_ptr)) {
+ cursor->low_match = 0;
+ cursor->up_match = 0;
+ goto func_exit;
+ }
+
+ /* If we are doing insertion or record locating,
+ remember the tree nodes we visited */
+ if (page_mode == PAGE_CUR_RTREE_INSERT
+ || (search_mode == PAGE_CUR_RTREE_LOCATE
+ && (latch_mode != BTR_MODIFY_LEAF))) {
+ bool add_latch = false;
+
+ if (latch_mode == BTR_MODIFY_TREE
+ && rw_latch == RW_NO_LATCH) {
+ ut_ad(mtr->memo_contains_flagged(
+ &index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ rw_lock_s_lock(&block->lock);
+ add_latch = true;
+ }
+
+ /* Store the parent cursor location */
+#ifdef UNIV_DEBUG
+ ulint num_stored = rtr_store_parent_path(
+ block, cursor, latch_mode,
+ height + 1, mtr);
+#else
+ rtr_store_parent_path(
+ block, cursor, latch_mode,
+ height + 1, mtr);
+#endif
+
+ if (page_mode == PAGE_CUR_RTREE_INSERT) {
+ btr_pcur_t* r_cursor =
+ rtr_get_parent_cursor(
+ cursor, height + 1,
+ true);
+					/* For an insertion, there
+					should be exactly one parent
+					at each level traversed */
+#ifdef UNIV_DEBUG
+ ut_ad(num_stored == 1);
+#endif
+
+ node_ptr = btr_pcur_get_rec(r_cursor);
+
+ }
+
+ if (add_latch) {
+ rw_lock_s_unlock(&block->lock);
+ }
+
+ ut_ad(!page_rec_is_supremum(node_ptr));
+ }
+
+ ut_ad(page_mode == search_mode
+ || (page_mode == PAGE_CUR_WITHIN
+ && search_mode == PAGE_CUR_RTREE_LOCATE));
+
+ page_mode = search_mode;
+ }
+
+		/* If the node pointer is the first or the last record
+		of the page, or shares its key value with the first or
+		the last record, another page might be chosen under
+		BTR_CONT_MODIFY_TREE. Therefore the parent page should
+		not be released, to avoid a deadlock from blocking
+		another search with the same key value. */
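+		/* A node pointer record carries the child page number
+		as its last field, so matching all fields except that
+		one means that the key values are equal. */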
+ if (!detected_same_key_root
+ && lock_intention == BTR_INTENTION_BOTH
+ && !dict_index_is_unique(index)
+ && latch_mode == BTR_MODIFY_TREE
+ && (up_match >= rec_offs_n_fields(offsets) - 1
+ || low_match >= rec_offs_n_fields(offsets) - 1)) {
+ const rec_t* first_rec = page_rec_get_next_const(
+ page_get_infimum_rec(page));
+ ulint matched_fields;
+
+ ut_ad(upper_rw_latch == RW_X_LATCH);
+
+ if (node_ptr == first_rec
+ || page_rec_is_last(node_ptr, page)) {
+ detected_same_key_root = true;
+ } else {
+ matched_fields = 0;
+
+ offsets2 = rec_get_offsets(
+ first_rec, index, offsets2,
+ 0, ULINT_UNDEFINED, &heap);
+ cmp_rec_rec(node_ptr, first_rec,
+ offsets, offsets2, index, false,
+ &matched_fields);
+
+ if (matched_fields
+ >= rec_offs_n_fields(offsets) - 1) {
+ detected_same_key_root = true;
+ } else {
+ const rec_t* last_rec;
+
+ last_rec = page_rec_get_prev_const(
+ page_get_supremum_rec(page));
+
+ matched_fields = 0;
+
+ offsets2 = rec_get_offsets(
+ last_rec, index, offsets2,
+ 0, ULINT_UNDEFINED, &heap);
+ cmp_rec_rec(
+ node_ptr, last_rec,
+ offsets, offsets2, index,
+ false, &matched_fields);
+ if (matched_fields
+ >= rec_offs_n_fields(offsets) - 1) {
+ detected_same_key_root = true;
+ }
+ }
+ }
+ }
+
+		/* Unless descending into this page might cause a tree
+		modification, the latches on the upper pages can be
+		released. */
+ if (!detected_same_key_root
+ && latch_mode == BTR_MODIFY_TREE
+ && !btr_cur_will_modify_tree(
+ index, page, lock_intention, node_ptr,
+ node_ptr_max_size, zip_size, mtr)
+ && !rtree_parent_modified) {
+ ut_ad(upper_rw_latch == RW_X_LATCH);
+ ut_ad(n_releases <= n_blocks);
+
+ /* we can release upper blocks */
+ for (; n_releases < n_blocks; n_releases++) {
+ if (n_releases == 0) {
+				/* Do not release the root page, so
+				that it stays pinned to the same
+				block. */
+ continue;
+ }
+
+ /* release unused blocks to unpin */
+ mtr_release_block_at_savepoint(
+ mtr, tree_savepoints[n_releases],
+ tree_blocks[n_releases]);
+ }
+ }
+
+ if (height == level
+ && latch_mode == BTR_MODIFY_TREE) {
+ ut_ad(upper_rw_latch == RW_X_LATCH);
+			/* SX-latch the root page if its latch has
+			already been released; it contains the file
+			segment headers. */
+ if (n_releases > 0) {
+ mtr_block_sx_latch_at_savepoint(
+ mtr, tree_savepoints[0],
+ tree_blocks[0]);
+ }
+
+ /* x-latch the branch blocks not released yet. */
+ for (ulint i = n_releases; i <= n_blocks; i++) {
+ mtr_block_x_latch_at_savepoint(
+ mtr, tree_savepoints[i],
+ tree_blocks[i]);
+ }
+ }
+
+		/* We must consider the prev_page of the parent page
+		when the node_ptr is the leftmost record of the page,
+		because BTR_SEARCH_PREV and BTR_MODIFY_PREV latch the
+		prev_page of the leaf page. */
+ if ((latch_mode == BTR_SEARCH_PREV
+ || latch_mode == BTR_MODIFY_PREV)
+ && !retrying_for_search_prev) {
+ /* block should be latched for consistent
+ btr_page_get_prev() */
+ ut_ad(mtr->memo_contains_flagged(
+ block, MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX));
+
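+		/* Track the start of the contiguous run of levels on
+		which the chosen node pointer is the first record of
+		its page. If that run reaches the leaf level, the
+		search is redone from the start of the run, so that
+		the prev_page can be latched on each of those levels. */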
+ if (page_has_prev(page)
+ && page_rec_is_first(node_ptr, page)) {
+
+ if (leftmost_from_level == 0) {
+ leftmost_from_level = height + 1;
+ }
+ } else {
+ leftmost_from_level = 0;
+ }
+
+ if (height == 0 && leftmost_from_level > 0) {
+			/* Retry, so that the prev_page is also latched
+			from level == leftmost_from_level downwards. */
+ retrying_for_search_prev = true;
+
+ prev_tree_blocks = static_cast<buf_block_t**>(
+ ut_malloc_nokey(sizeof(buf_block_t*)
+ * leftmost_from_level));
+
+ prev_tree_savepoints = static_cast<ulint*>(
+ ut_malloc_nokey(sizeof(ulint)
+ * leftmost_from_level));
+
+ /* back to the level (leftmost_from_level+1) */
+ ulint idx = n_blocks
+ - (leftmost_from_level - 1);
+
+ page_id.set_page_no(
+ tree_blocks[idx]->page.id().page_no());
+
+ for (ulint i = n_blocks
+ - (leftmost_from_level - 1);
+ i <= n_blocks; i++) {
+ mtr_release_block_at_savepoint(
+ mtr, tree_savepoints[i],
+ tree_blocks[i]);
+ }
+
+ n_blocks -= (leftmost_from_level - 1);
+ height = leftmost_from_level;
+ ut_ad(n_releases == 0);
+
+			/* Replay the search on the blocks that remain
+			fixed, to restore up_match and low_match. */
+ up_match = 0;
+ low_match = 0;
+ rtr_info_t* rtr_info = need_path
+ ? cursor->rtr_info : NULL;
+
+ for (ulint i = 0; i < n_blocks; i++) {
+ page_cur_search_with_match(
+ tree_blocks[i], index, tuple,
+ page_mode, &up_match,
+ &low_match, page_cursor,
+ rtr_info);
+ }
+
+ goto search_loop;
+ }
+ }
+
+ /* Go to the child node */
+ page_id.set_page_no(
+ btr_node_ptr_get_child_page_no(node_ptr, offsets));
+
+ n_blocks++;
+
+ if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
+ /* We're doing a search on an ibuf tree and we're one
+ level above the leaf page. */
+
+ ut_ad(level == 0);
+
+ buf_mode = BUF_GET;
+ rw_latch = RW_NO_LATCH;
+ goto retry_page_get;
+ }
+
+ if (dict_index_is_spatial(index)
+ && page_mode >= PAGE_CUR_CONTAIN
+ && page_mode != PAGE_CUR_RTREE_INSERT) {
+ ut_ad(need_path);
+ rtr_node_path_t* path =
+ cursor->rtr_info->path;
+
+ if (!path->empty() && found) {
+ ut_ad(path->back().page_no
+ == page_id.page_no());
+ path->pop_back();
+#ifdef UNIV_DEBUG
+ if (page_mode == PAGE_CUR_RTREE_LOCATE
+ && (latch_mode != BTR_MODIFY_LEAF)) {
+ btr_pcur_t* cur
+ = cursor->rtr_info->parent_path->back(
+ ).cursor;
+ rec_t* my_node_ptr
+ = btr_pcur_get_rec(cur);
+
+ offsets = rec_get_offsets(
+ my_node_ptr, index, offsets,
+ 0, ULINT_UNDEFINED, &heap);
+
+ ulint my_page_no
+ = btr_node_ptr_get_child_page_no(
+ my_node_ptr, offsets);
+
+ ut_ad(page_id.page_no() == my_page_no);
+ }
+#endif
+ }
+ }
+
+ goto search_loop;
+ } else if (!dict_index_is_spatial(index)
+ && latch_mode == BTR_MODIFY_TREE
+ && lock_intention == BTR_INTENTION_INSERT
+ && page_has_next(page)
+ && page_rec_is_last(page_cur_get_rec(page_cursor), page)) {
+
+		/* btr_insert_into_right_sibling() might cause the
+		node_ptr to be deleted at the upper level */
+
+ guess = NULL;
+
+ if (height == 0) {
+ /* release the leaf pages if latched */
+ for (uint i = 0; i < 3; i++) {
+ if (latch_leaves.blocks[i] != NULL) {
+ mtr_release_block_at_savepoint(
+ mtr, latch_leaves.savepoints[i],
+ latch_leaves.blocks[i]);
+ latch_leaves.blocks[i] = NULL;
+ }
+ }
+ }
+
+ goto need_opposite_intention;
+ }
+
+ if (level != 0) {
+ ut_ad(!autoinc);
+
+ if (upper_rw_latch == RW_NO_LATCH) {
+ ut_ad(latch_mode == BTR_CONT_MODIFY_TREE
+ || latch_mode == BTR_CONT_SEARCH_TREE);
+ buf_block_t* child_block = btr_block_get(
+ *index, page_id.page_no(),
+ latch_mode == BTR_CONT_MODIFY_TREE
+ ? RW_X_LATCH : RW_SX_LATCH, false, mtr);
+ btr_assert_not_corrupted(child_block, index);
+ } else {
+ ut_ad(mtr->memo_contains_flagged(block,
+ upper_rw_latch));
+ btr_assert_not_corrupted(block, index);
+
+ if (s_latch_by_caller) {
+ ut_ad(latch_mode == BTR_SEARCH_TREE);
+				/* The caller should hold an SX-latch
+				on the index, to exclude tree
+				modifications. */
+ ut_ad(mtr->memo_contains(index->lock,
+ MTR_MEMO_SX_LOCK));
+				/* Because the index is SX-latched,
+				the upper blocks can be released. */
+ for (; n_releases < n_blocks; n_releases++) {
+ mtr_release_block_at_savepoint(
+ mtr,
+ tree_savepoints[n_releases],
+ tree_blocks[n_releases]);
+ }
+ }
+ }
+
+ if (page_mode <= PAGE_CUR_LE) {
+ cursor->low_match = low_match;
+ cursor->up_match = up_match;
+ }
+ } else {
+ cursor->low_match = low_match;
+ cursor->low_bytes = low_bytes;
+ cursor->up_match = up_match;
+ cursor->up_bytes = up_bytes;
+
+ if (autoinc) {
+ page_set_autoinc(tree_blocks[0], autoinc, mtr, false);
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* We do a dirty read of btr_search_enabled here. We
+ will properly check btr_search_enabled again in
+ btr_search_build_page_hash_index() before building a
+ page hash index, while holding search latch. */
+ if (!btr_search_enabled) {
+# ifdef MYSQL_INDEX_DISABLE_AHI
+ } else if (index->disable_ahi) {
+# endif
+ } else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG) {
+ ut_ad(index->is_instant());
+ /* This may be a search tuple for
+ btr_pcur_restore_position(). */
+ ut_ad(tuple->is_metadata()
+ || (tuple->is_metadata(tuple->info_bits
+ ^ REC_STATUS_INSTANT)));
+ } else if (rec_is_metadata(btr_cur_get_rec(cursor), *index)) {
+ /* Only user records belong in the adaptive
+ hash index. */
+ } else {
+ btr_search_info_update(index, cursor);
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+ ut_ad(cursor->up_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_GE);
+ ut_ad(cursor->up_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_LE);
+ ut_ad(cursor->low_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_LE);
+ }
+
+ /* For spatial index, remember what blocks are still latched */
+ if (dict_index_is_spatial(index)
+ && (latch_mode == BTR_MODIFY_TREE
+ || latch_mode == BTR_MODIFY_LEAF)) {
+ for (ulint i = 0; i < n_releases; i++) {
+ cursor->rtr_info->tree_blocks[i] = NULL;
+ cursor->rtr_info->tree_savepoints[i] = 0;
+ }
+
+ for (ulint i = n_releases; i <= n_blocks; i++) {
+ cursor->rtr_info->tree_blocks[i] = tree_blocks[i];
+ cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i];
+ }
+ }
+
+func_exit:
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ if (retrying_for_search_prev) {
+ ut_free(prev_tree_blocks);
+ ut_free(prev_tree_savepoints);
+ }
+
+ if (mbr_adj) {
+ /* remember that we will need to adjust parent MBR */
+ cursor->rtr_info->mbr_adj = true;
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (ahi_latch) {
+ rw_lock_s_lock(ahi_latch);
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ DBUG_RETURN(err);
+}
+
+/*****************************************************************//**
+Opens a cursor at either end of an index. */
+dberr_t
+btr_cur_open_at_index_side_func(
+/*============================*/
+ bool from_left, /*!< in: true if open to the low end,
+ false if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_cur_t* cursor, /*!< in/out: cursor */
+ ulint level, /*!< in: level to search for
+ (0=leaf). */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_cur_t* page_cursor;
+ ulint node_ptr_max_size = srv_page_size / 2;
+ ulint height;
+ ulint root_height = 0; /* remove warning */
+ rec_t* node_ptr;
+ ulint estimate;
+ btr_intention_t lock_intention;
+ buf_block_t* tree_blocks[BTR_MAX_LEVELS];
+ ulint tree_savepoints[BTR_MAX_LEVELS];
+ ulint n_blocks = 0;
+ ulint n_releases = 0;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ dberr_t err = DB_SUCCESS;
+
+ rec_offs_init(offsets_);
+
+ estimate = latch_mode & BTR_ESTIMATE;
+ latch_mode &= ulint(~BTR_ESTIMATE);
+
+ ut_ad(level != ULINT_UNDEFINED);
+
+ bool s_latch_by_caller;
+
+ s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
+ latch_mode &= ulint(~BTR_ALREADY_S_LATCHED);
+
+ lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
+
+ ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
+
+	/* This function does not need to latch the page to the left
+	of the leaf page */
+ if (latch_mode == BTR_SEARCH_PREV) {
+ latch_mode = BTR_SEARCH_LEAF;
+ } else if (latch_mode == BTR_MODIFY_PREV) {
+ latch_mode = BTR_MODIFY_LEAF;
+ }
+
+ /* Store the position of the tree latch we push to mtr so that we
+ know how to release it when we have latched the leaf node */
+
+ ulint savepoint = mtr_set_savepoint(mtr);
+
+ rw_lock_type_t upper_rw_latch;
+
+ switch (latch_mode) {
+ case BTR_CONT_MODIFY_TREE:
+ case BTR_CONT_SEARCH_TREE:
+ upper_rw_latch = RW_NO_LATCH;
+ break;
+ case BTR_MODIFY_TREE:
+		/* Most delete-intended operations are purge operations.
+		They should be given priority for free blocks and read
+		I/O bandwidth when the history list is growing huge. */
+ if (lock_intention == BTR_INTENTION_DELETE
+ && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
+ && buf_pool.n_pend_reads) {
+ mtr_x_lock_index(index, mtr);
+ } else {
+ mtr_sx_lock_index(index, mtr);
+ }
+ upper_rw_latch = RW_X_LATCH;
+ break;
+ default:
+ ut_ad(!s_latch_by_caller
+ || mtr->memo_contains_flagged(&index->lock,
+ MTR_MEMO_SX_LOCK
+ | MTR_MEMO_S_LOCK));
+ if (!srv_read_only_mode) {
+ if (!s_latch_by_caller) {
+ /* BTR_SEARCH_TREE is intended to be used with
+ BTR_ALREADY_S_LATCHED */
+ ut_ad(latch_mode != BTR_SEARCH_TREE);
+
+ mtr_s_lock_index(index, mtr);
+ }
+ upper_rw_latch = RW_S_LATCH;
+ } else {
+ upper_rw_latch = RW_NO_LATCH;
+ }
+ }
+
+ const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
+ latch_mode);
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+ cursor->index = index;
+
+ page_id_t page_id(index->table->space_id, index->page);
+ const ulint zip_size = index->table->space->zip_size();
+
+ if (root_leaf_rw_latch == RW_X_LATCH) {
+ node_ptr_max_size = btr_node_ptr_max_size(index);
+ }
+
+ height = ULINT_UNDEFINED;
+
+ for (;;) {
+ ut_ad(n_blocks < BTR_MAX_LEVELS);
+ tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
+
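+		/* Under BTR_MODIFY_TREE, the pages above the requested
+		level are fetched with RW_NO_LATCH; they will be
+		x-latched later at their savepoints if the tree is
+		going to be modified. */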
+ const ulint rw_latch = height
+ && (latch_mode != BTR_MODIFY_TREE || height == level)
+ ? upper_rw_latch : RW_NO_LATCH;
+ buf_block_t* block = buf_page_get_gen(page_id, zip_size,
+ rw_latch, NULL, BUF_GET,
+ file, line, mtr, &err,
+ height == 0
+ && !index->is_clust());
+ ut_ad((block != NULL) == (err == DB_SUCCESS));
+ tree_blocks[n_blocks] = block;
+
+ if (err != DB_SUCCESS) {
+ if (err == DB_DECRYPTION_FAILED) {
+ ib_push_warning((void *)NULL,
+ DB_DECRYPTION_FAILED,
+ "Table %s is encrypted but encryption service or"
+					" used key_id is not available."
+					" Can't continue reading table.",
+ index->table->name.m_name);
+ index->table->file_unreadable = true;
+ }
+
+ goto exit_loop;
+ }
+
+ const page_t* page = buf_block_get_frame(block);
+
+ if (height == ULINT_UNDEFINED
+ && page_is_leaf(page)
+ && rw_latch != RW_NO_LATCH
+ && rw_latch != root_leaf_rw_latch) {
+			/* Retry reading the page, because the root page
+			was latched with a different latch mode than a
+			leaf page requires. */
+ ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
+ ut_ad(rw_latch == RW_S_LATCH);
+
+ ut_ad(n_blocks == 0);
+ mtr_release_block_at_savepoint(
+ mtr, tree_savepoints[n_blocks],
+ tree_blocks[n_blocks]);
+
+ upper_rw_latch = root_leaf_rw_latch;
+ continue;
+ }
+
+ ut_ad(fil_page_index_page_check(page));
+ ut_ad(index->id == btr_page_get_index_id(page));
+
+ if (height == ULINT_UNDEFINED) {
+ /* We are in the root node */
+
+ height = btr_page_get_level(page);
+ root_height = height;
+ ut_a(height >= level);
+ } else {
+ /* TODO: flag the index corrupted if this fails */
+ ut_ad(height == btr_page_get_level(page));
+ }
+
+ if (height == 0) {
+ if (rw_latch == RW_NO_LATCH) {
+ btr_cur_latch_leaves(block, latch_mode,
+ cursor, mtr);
+ }
+
+ /* In versions <= 3.23.52 we had forgotten to
+ release the tree latch here. If in an index
+ scan we had to scan far to find a record
+ visible to the current transaction, that could
+ starve others waiting for the tree latch. */
+
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+ case BTR_CONT_MODIFY_TREE:
+ case BTR_CONT_SEARCH_TREE:
+ break;
+ default:
+ if (UNIV_UNLIKELY(srv_read_only_mode)) {
+ break;
+ }
+ if (!s_latch_by_caller) {
+ /* Release the tree s-latch */
+ mtr_release_s_latch_at_savepoint(
+ mtr, savepoint, &index->lock);
+ }
+
+ /* release upper blocks */
+ for (; n_releases < n_blocks; n_releases++) {
+ mtr_release_block_at_savepoint(
+ mtr,
+ tree_savepoints[n_releases],
+ tree_blocks[n_releases]);
+ }
+ }
+ } else if (height == level /* height != 0 */
+ && UNIV_LIKELY(!srv_read_only_mode)) {
+ /* We already have the block latched. */
+ ut_ad(latch_mode == BTR_SEARCH_TREE);
+ ut_ad(s_latch_by_caller);
+ ut_ad(upper_rw_latch == RW_S_LATCH);
+ ut_ad(mtr->memo_contains_flagged(block,
+ MTR_MEMO_PAGE_S_FIX));
+
+ if (s_latch_by_caller) {
+				/* The caller should hold an SX-latch
+				on the index, to exclude tree
+				modifications. */
+ ut_ad(mtr->memo_contains(index->lock,
+ MTR_MEMO_SX_LOCK));
+				/* Because the index is SX-latched,
+				the upper blocks can be released. */
+ for (; n_releases < n_blocks; n_releases++) {
+ mtr_release_block_at_savepoint(
+ mtr,
+ tree_savepoints[n_releases],
+ tree_blocks[n_releases]);
+ }
+ }
+ }
+
+ if (from_left) {
+ page_cur_set_before_first(block, page_cursor);
+ } else {
+ page_cur_set_after_last(block, page_cursor);
+ }
+
+ if (height == level) {
+ if (estimate) {
+ btr_cur_add_path_info(cursor, height,
+ root_height);
+ }
+
+ break;
+ }
+
+ ut_ad(height > 0);
+
+ if (from_left) {
+ page_cur_move_to_next(page_cursor);
+ } else {
+ page_cur_move_to_prev(page_cursor);
+ }
+
+ if (estimate) {
+ btr_cur_add_path_info(cursor, height, root_height);
+ }
+
+ height--;
+
+ node_ptr = page_cur_get_rec(page_cursor);
+ offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
+ 0, ULINT_UNDEFINED, &heap);
+
+		/* If the record is the first or the last on the page
+		and the intention is pessimistic delete, the operation
+		might require a node_ptr insert at the upper level.
+		We should change the intention and retry. */
+ if (latch_mode == BTR_MODIFY_TREE
+ && btr_cur_need_opposite_intention(
+ page, lock_intention, node_ptr)) {
+
+ ut_ad(upper_rw_latch == RW_X_LATCH);
+ /* release all blocks */
+ for (; n_releases <= n_blocks; n_releases++) {
+ mtr_release_block_at_savepoint(
+ mtr, tree_savepoints[n_releases],
+ tree_blocks[n_releases]);
+ }
+
+ lock_intention = BTR_INTENTION_BOTH;
+
+ page_id.set_page_no(dict_index_get_page(index));
+
+ height = ULINT_UNDEFINED;
+
+ n_blocks = 0;
+ n_releases = 0;
+
+ continue;
+ }
+
+ if (latch_mode == BTR_MODIFY_TREE
+ && !btr_cur_will_modify_tree(
+ cursor->index, page, lock_intention, node_ptr,
+ node_ptr_max_size, zip_size, mtr)) {
+ ut_ad(upper_rw_latch == RW_X_LATCH);
+ ut_ad(n_releases <= n_blocks);
+
+ /* we can release upper blocks */
+ for (; n_releases < n_blocks; n_releases++) {
+ if (n_releases == 0) {
+					/* Do not release the root
+					page, so that it stays pinned
+					to the same block. */
+ continue;
+ }
+
+ /* release unused blocks to unpin */
+ mtr_release_block_at_savepoint(
+ mtr, tree_savepoints[n_releases],
+ tree_blocks[n_releases]);
+ }
+ }
+
+ if (height == level
+ && latch_mode == BTR_MODIFY_TREE) {
+ ut_ad(upper_rw_latch == RW_X_LATCH);
+			/* SX-latch the root page if its latch has
+			already been released; it contains the file
+			segment headers. */
+ if (n_releases > 0) {
+ mtr_block_sx_latch_at_savepoint(
+ mtr, tree_savepoints[0],
+ tree_blocks[0]);
+ }
+
+ /* x-latch the branch blocks not released yet. */
+ for (ulint i = n_releases; i <= n_blocks; i++) {
+ mtr_block_x_latch_at_savepoint(
+ mtr, tree_savepoints[i],
+ tree_blocks[i]);
+ }
+ }
+
+ /* Go to the child node */
+ page_id.set_page_no(
+ btr_node_ptr_get_child_page_no(node_ptr, offsets));
+
+ n_blocks++;
+ }
+
+ exit_loop:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ return err;
+}
+
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree.
+@return true if the index is available and the cursor has been
+positioned, false if the index is unavailable */
+bool
+btr_cur_open_at_rnd_pos_func(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /*!< in/out: B-tree cursor */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t* page_cursor;
+ ulint node_ptr_max_size = srv_page_size / 2;
+ ulint height;
+ rec_t* node_ptr;
+ btr_intention_t lock_intention;
+ buf_block_t* tree_blocks[BTR_MAX_LEVELS];
+ ulint tree_savepoints[BTR_MAX_LEVELS];
+ ulint n_blocks = 0;
+ ulint n_releases = 0;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(!index->is_spatial());
+
+ lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
+
+ ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
+
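+	/* Store the position of the tree latch we push to mtr so that
+	we know how to release it when we have latched the leaf node */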
+ ulint savepoint = mtr_set_savepoint(mtr);
+
+ rw_lock_type_t upper_rw_latch;
+
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+		/* Most delete-intended operations are purge operations.
+		They should be given priority for free blocks and read
+		I/O bandwidth when the history list is growing huge. */
+ if (lock_intention == BTR_INTENTION_DELETE
+ && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
+ && buf_pool.n_pend_reads) {
+ mtr_x_lock_index(index, mtr);
+ } else {
+ mtr_sx_lock_index(index, mtr);
+ }
+ upper_rw_latch = RW_X_LATCH;
+ break;
+ case BTR_SEARCH_PREV:
+ case BTR_MODIFY_PREV:
+		/* This function does not support latching the left
+		uncle page, which would be needed when latching the
+		left leaf page. */
+ case BTR_SEARCH_TREE:
+ case BTR_CONT_MODIFY_TREE:
+ case BTR_CONT_SEARCH_TREE:
+ ut_ad(0);
+ /* fall through */
+ default:
+ if (!srv_read_only_mode) {
+ mtr_s_lock_index(index, mtr);
+ upper_rw_latch = RW_S_LATCH;
+ } else {
+ upper_rw_latch = RW_NO_LATCH;
+ }
+ }
+
+ DBUG_EXECUTE_IF("test_index_is_unavailable",
+ return(false););
+
+ if (index->page == FIL_NULL) {
+		/* Because we were not holding the index latch until
+		just now, the index could have been modified by others.
+		For example, if this is a statistics updater for a
+		referenced table, the index could have been marked
+		unavailable by DROP TABLE in the meantime, because the
+		statistics updater does not hold a lock on it. */
+ return(false);
+ }
+
+ const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
+ latch_mode);
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+ cursor->index = index;
+
+ page_id_t page_id(index->table->space_id, index->page);
+ const ulint zip_size = index->table->space->zip_size();
+ dberr_t err = DB_SUCCESS;
+
+ if (root_leaf_rw_latch == RW_X_LATCH) {
+ node_ptr_max_size = btr_node_ptr_max_size(index);
+ }
+
+ height = ULINT_UNDEFINED;
+
+ for (;;) {
+ page_t* page;
+
+ ut_ad(n_blocks < BTR_MAX_LEVELS);
+ tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
+
+ const rw_lock_type_t rw_latch = height
+ && latch_mode != BTR_MODIFY_TREE
+ ? upper_rw_latch : RW_NO_LATCH;
+ buf_block_t* block = buf_page_get_gen(page_id, zip_size,
+ rw_latch, NULL, BUF_GET,
+ file, line, mtr, &err,
+ height == 0
+ && !index->is_clust());
+ tree_blocks[n_blocks] = block;
+
+ ut_ad((block != NULL) == (err == DB_SUCCESS));
+
+ if (err != DB_SUCCESS) {
+ if (err == DB_DECRYPTION_FAILED) {
+ ib_push_warning((void *)NULL,
+ DB_DECRYPTION_FAILED,
+ "Table %s is encrypted but encryption service or"
+					" used key_id is not available."
+					" Can't continue reading table.",
+ index->table->name.m_name);
+ index->table->file_unreadable = true;
+ }
+
+ break;
+ }
+
+ page = buf_block_get_frame(block);
+
+ if (height == ULINT_UNDEFINED
+ && page_is_leaf(page)
+ && rw_latch != RW_NO_LATCH
+ && rw_latch != root_leaf_rw_latch) {
+			/* Retry reading the page, because the root page
+			was latched with a different latch mode than a
+			leaf page requires. */
+ ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
+ ut_ad(rw_latch == RW_S_LATCH);
+
+ ut_ad(n_blocks == 0);
+ mtr_release_block_at_savepoint(
+ mtr, tree_savepoints[n_blocks],
+ tree_blocks[n_blocks]);
+
+ upper_rw_latch = root_leaf_rw_latch;
+ continue;
+ }
+
+ ut_ad(fil_page_index_page_check(page));
+ ut_ad(index->id == btr_page_get_index_id(page));
+
+ if (height == ULINT_UNDEFINED) {
+ /* We are in the root node */
+
+ height = btr_page_get_level(page);
+ }
+
+ if (height == 0) {
+ if (rw_latch == RW_NO_LATCH
+ || srv_read_only_mode) {
+ btr_cur_latch_leaves(block, latch_mode, cursor,
+ mtr);
+ }
+
+ /* btr_cur_open_at_index_side_func() and
+ btr_cur_search_to_nth_level() release
+		tree s-latch here. */
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+ case BTR_CONT_MODIFY_TREE:
+ case BTR_CONT_SEARCH_TREE:
+ break;
+ default:
+ /* Release the tree s-latch */
+ if (!srv_read_only_mode) {
+ mtr_release_s_latch_at_savepoint(
+ mtr, savepoint,
+ dict_index_get_lock(index));
+ }
+
+ /* release upper blocks */
+ for (; n_releases < n_blocks; n_releases++) {
+ mtr_release_block_at_savepoint(
+ mtr,
+ tree_savepoints[n_releases],
+ tree_blocks[n_releases]);
+ }
+ }
+ }
+
+ page_cur_open_on_rnd_user_rec(block, page_cursor);
+
+ if (height == 0) {
+
+ break;
+ }
+
+ ut_ad(height > 0);
+
+ height--;
+
+ node_ptr = page_cur_get_rec(page_cursor);
+ offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
+ 0, ULINT_UNDEFINED, &heap);
+
+		/* If the record is the first or the last on the page
+		and the intention is pessimistic delete, the operation
+		might require a node_ptr insert at the upper level.
+		We should change the intention and retry. */
+ if (latch_mode == BTR_MODIFY_TREE
+ && btr_cur_need_opposite_intention(
+ page, lock_intention, node_ptr)) {
+
+ ut_ad(upper_rw_latch == RW_X_LATCH);
+ /* release all blocks */
+ for (; n_releases <= n_blocks; n_releases++) {
+ mtr_release_block_at_savepoint(
+ mtr, tree_savepoints[n_releases],
+ tree_blocks[n_releases]);
+ }
+
+ lock_intention = BTR_INTENTION_BOTH;
+
+ page_id.set_page_no(dict_index_get_page(index));
+
+ height = ULINT_UNDEFINED;
+
+ n_blocks = 0;
+ n_releases = 0;
+
+ continue;
+ }
+
+ if (latch_mode == BTR_MODIFY_TREE
+ && !btr_cur_will_modify_tree(
+ cursor->index, page, lock_intention, node_ptr,
+ node_ptr_max_size, zip_size, mtr)) {
+ ut_ad(upper_rw_latch == RW_X_LATCH);
+ ut_ad(n_releases <= n_blocks);
+
+ /* we can release upper blocks */
+ for (; n_releases < n_blocks; n_releases++) {
+ if (n_releases == 0) {
+					/* Do not release the root
+					page, so that it stays pinned
+					to the same block. */
+ continue;
+ }
+
+ /* release unused blocks to unpin */
+ mtr_release_block_at_savepoint(
+ mtr, tree_savepoints[n_releases],
+ tree_blocks[n_releases]);
+ }
+ }
+
+ if (height == 0
+ && latch_mode == BTR_MODIFY_TREE) {
+ ut_ad(upper_rw_latch == RW_X_LATCH);
+			/* SX-latch the root page if its latch has
+			already been released; it contains the file
+			segment headers. */
+ if (n_releases > 0) {
+ mtr_block_sx_latch_at_savepoint(
+ mtr, tree_savepoints[0],
+ tree_blocks[0]);
+ }
+
+ /* x-latch the branch blocks not released yet. */
+ for (ulint i = n_releases; i <= n_blocks; i++) {
+ mtr_block_x_latch_at_savepoint(
+ mtr, tree_savepoints[i],
+ tree_blocks[i]);
+ }
+ }
+
+ /* Go to the child node */
+ page_id.set_page_no(
+ btr_node_ptr_get_child_page_no(node_ptr, offsets));
+
+ n_blocks++;
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return err == DB_SUCCESS;
+}
+
+/*==================== B-TREE INSERT =========================*/
+
+/*************************************************************//**
+Inserts a record if there is enough space, or if enough space can
+be freed by reorganizing. Differs from btr_cur_optimistic_insert
+in that no heuristic is applied as to whether it pays to spend CPU
+time on reorganizing the page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to inserted record if succeed, else NULL */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+rec_t*
+btr_cur_insert_if_possible(
+/*=======================*/
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
+ cursor stays valid */
+ const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not
+ have been stored to tuple */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_cur_t* page_cursor;
+ rec_t* rec;
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ /* Now, try the insert */
+ rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+ offsets, heap, n_ext, mtr);
+
+ /* If the record did not fit, reorganize.
+ For compressed pages, page_cur_tuple_insert()
+ attempted this already. */
+ if (!rec && !page_cur_get_page_zip(page_cursor)
+ && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
+ rec = page_cur_tuple_insert(
+ page_cursor, tuple, cursor->index,
+ offsets, heap, n_ext, mtr);
+ }
+
+ ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
+ return(rec);
+}
+
+/*************************************************************//**
+For an insert, checks the locks and does the undo logging if desired.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
+dberr_t
+btr_cur_ins_lock_and_undo(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if
+ not zero, the parameters index and thr
+ should be specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ bool* inherit)/*!< out: true if the inserted new record maybe
+ should inherit LOCK_GAP type locks from the
+ successor record */
+{
+ dict_index_t* index;
+ dberr_t err = DB_SUCCESS;
+ rec_t* rec;
+ roll_ptr_t roll_ptr;
+
+ /* Check if we have to wait for a lock: enqueue an explicit lock
+ request if yes */
+
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+ ut_ad(mtr->is_named_space(index->table->space));
+
+ /* Check if there is predicate or GAP lock preventing the insertion */
+ if (!(flags & BTR_NO_LOCKING_FLAG)) {
+ const unsigned type = index->type;
+ if (UNIV_UNLIKELY(type & DICT_SPATIAL)) {
+ lock_prdt_t prdt;
+ rtr_mbr_t mbr;
+
+ rtr_get_mbr_from_tuple(entry, &mbr);
+
+			/* Use an on-stack MBR variable to test whether
+			a lock is needed. If so, the predicate (MBR)
+			will be allocated from the lock heap in
+			lock_prdt_insert_check_and_lock() */
+ lock_init_prdt_from_mbr(
+ &prdt, &mbr, 0, NULL);
+
+ err = lock_prdt_insert_check_and_lock(
+ flags, rec, btr_cur_get_block(cursor),
+ index, thr, mtr, &prdt);
+ *inherit = false;
+ } else {
+#ifdef WITH_WSREP
+ trx_t* trx= thr_get_trx(thr);
+			/* If a transaction scanning a unique secondary
+			key is a wsrep high-priority (brute force)
+			thread, the scan may involve gap locking in the
+			index. Because such locking also happens when
+			replication events are applied in high-priority
+			applier threads, lock conflicts between two
+			wsrep high-priority threads are possible. To
+			avoid this gap locking, mark here that the
+			transaction is performing a unique key scan. */
+ if ((type & (DICT_CLUSTERED | DICT_UNIQUE)) == DICT_UNIQUE
+ && trx->is_wsrep()
+ && wsrep_thd_is_BF(trx->mysql_thd, false)) {
+ trx->wsrep_UK_scan= true;
+ }
+#endif /* WITH_WSREP */
+ err = lock_rec_insert_check_and_lock(
+ flags, rec, btr_cur_get_block(cursor),
+ index, thr, mtr, inherit);
+#ifdef WITH_WSREP
+ trx->wsrep_UK_scan= false;
+#endif /* WITH_WSREP */
+ }
+ }
+
+ if (err != DB_SUCCESS
+ || !(~flags | (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG))
+ || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
+
+ return(err);
+ }
+
+ if (flags & BTR_NO_UNDO_LOG_FLAG) {
+ roll_ptr = roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS;
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+upd_sys:
+ dfield_t* r = dtuple_get_nth_field(
+ entry, index->db_roll_ptr());
+ ut_ad(r->len == DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(static_cast<byte*>(r->data),
+ roll_ptr);
+ }
+ } else {
+ err = trx_undo_report_row_operation(thr, index, entry,
+ NULL, 0, NULL, NULL,
+ &roll_ptr);
+ if (err == DB_SUCCESS) {
+ goto upd_sys;
+ }
+ }
+
+ return(err);
+}
+
+/**
+Prefetch siblings of the leaf for the pessimistic operation.
+@param block leaf page
+@param index index of the page */
+static void btr_cur_prefetch_siblings(const buf_block_t *block,
+ const dict_index_t *index)
+{
+ ut_ad(page_is_leaf(block->frame));
+
+ if (index->is_ibuf())
+ return;
+
+ const page_t *page= block->frame;
+ uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
+ uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
+
+ if (prev == FIL_NULL);
+ else if (index->table->space->acquire())
+ buf_read_page_background(index->table->space,
+ page_id_t(block->page.id().space(), prev),
+ block->zip_size(), false);
+ if (next == FIL_NULL);
+ else if (index->table->space->acquire())
+ buf_read_page_background(index->table->space,
+ page_id_t(block->page.id().space(), next),
+ block->zip_size(), false);
+}
+
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+dberr_t
+btr_cur_optimistic_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameters index and thr should be
+ specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
+ cursor stays valid */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in/out: query thread; can be NULL if
+ !(~flags
+ & (BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG)) */
+ mtr_t* mtr) /*!< in/out: mini-transaction;
+ if this function returns DB_SUCCESS on
+ a leaf page of a secondary index in a
+ compressed tablespace, the caller must
+ mtr_commit(mtr) before latching
+ any further pages */
+{
+ big_rec_t* big_rec_vec = NULL;
+ dict_index_t* index;
+ page_cur_t* page_cursor;
+ buf_block_t* block;
+ page_t* page;
+ rec_t* dummy;
+ bool leaf;
+ bool reorg __attribute__((unused));
+ bool inherit = true;
+ ulint rec_size;
+ dberr_t err;
+
+ ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
+ *big_rec = NULL;
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ index = cursor->index;
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+ ut_ad(dtuple_check_typed(entry));
+
+#ifdef HAVE_valgrind
+ if (block->page.zip.data) {
+ MEM_CHECK_DEFINED(page, srv_page_size);
+ MEM_CHECK_DEFINED(block->page.zip.data, block->zip_size());
+ }
+#endif /* HAVE_valgrind */
+
+ leaf = page_is_leaf(page);
+
+ if (UNIV_UNLIKELY(entry->is_alter_metadata())) {
+ ut_ad(leaf);
+ goto convert_big_rec;
+ }
+
+ /* Calculate the record size when entry is converted to a record */
+ rec_size = rec_get_converted_size(index, entry, n_ext);
+
+ if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
+ dtuple_get_n_fields(entry),
+ block->zip_size())) {
+convert_big_rec:
+ /* The record is so big that we have to store some fields
+ externally on separate database pages */
+ big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
+
+ if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
+
+ return(DB_TOO_BIG_RECORD);
+ }
+
+ rec_size = rec_get_converted_size(index, entry, n_ext);
+ }
+
+ if (block->page.zip.data && page_zip_is_too_big(index, entry)) {
+ if (big_rec_vec != NULL) {
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
+
+ return(DB_TOO_BIG_RECORD);
+ }
+
+ LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
+ goto fail);
+
+ if (block->page.zip.data && leaf
+ && (page_get_data_size(page) + rec_size
+ >= dict_index_zip_pad_optimal_page_size(index))) {
+		/* If the compression padding heuristic indicates that
+		the insert would pack the page too full, which would be
+		likely to cause a compression failure, then do not
+		attempt an optimistic insert. */
+fail:
+ err = DB_FAIL;
+
+		/* If the page is a leaf, prefetch its siblings for the
+		pessimistic operation. */
+ if (page_is_leaf(page)) {
+ btr_cur_prefetch_siblings(block, index);
+ }
+fail_err:
+
+ if (big_rec_vec) {
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
+
+ return(err);
+ }
+
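+	/* An optimistic insert can succeed only if the record would
+	fit after a page reorganization at the latest. */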
+ ulint max_size = page_get_max_insert_size_after_reorganize(page, 1);
+ if (max_size < rec_size) {
+ goto fail;
+ }
+
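+	/* Enforce the limit imposed by the 13-bit record heap number;
+	it can only be reached with a 64KiB page size. */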
+ const ulint n_recs = page_get_n_recs(page);
+ if (UNIV_UNLIKELY(n_recs >= 8189)) {
+ ut_ad(srv_page_size == 65536);
+ goto fail;
+ }
+
+ if (page_has_garbage(page)) {
+ if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
+ && n_recs > 1
+ && page_get_max_insert_size(page, 1) < rec_size) {
+
+ goto fail;
+ }
+ }
+
+ /* If there have been many consecutive inserts to the
+ clustered index leaf page of an uncompressed table, check if
+ we have to split the page to reserve enough free space for
+ future updates of records. */
+
+ if (leaf && !block->page.zip.data && dict_index_is_clust(index)
+ && page_get_n_recs(page) >= 2
+ && dict_index_get_space_reserve() + rec_size > max_size
+ && (btr_page_get_split_rec_to_right(cursor, &dummy)
+ || btr_page_get_split_rec_to_left(cursor))) {
+ goto fail;
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ DBUG_LOG("ib_cur",
+ "insert " << index->name << " (" << index->id << ") by "
+ << ib::hex(thr ? thr->graph->trx->id : 0)
+ << ' ' << rec_printer(entry).str());
+ DBUG_EXECUTE_IF("do_page_reorganize",
+ btr_page_reorganize(page_cursor, index, mtr););
+
+ /* Now, try the insert */
+ {
+ const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
+
+ /* Check locks and write to the undo log,
+ if specified */
+ err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
+ thr, mtr, &inherit);
+ if (err != DB_SUCCESS) {
+ goto fail_err;
+ }
+
+#ifdef UNIV_DEBUG
+ if (!(flags & BTR_CREATE_FLAG)
+ && index->is_primary() && page_is_leaf(page)) {
+ const dfield_t* trx_id = dtuple_get_nth_field(
+ entry, dict_col_get_clust_pos(
+ dict_table_get_sys_col(index->table,
+ DATA_TRX_ID),
+ index));
+
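+			/* trx_id[1] is the adjacent dfield, DB_ROLL_PTR:
+			in a clustered index record the system columns
+			DB_TRX_ID and DB_ROLL_PTR are stored next to
+			each other. */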
+ ut_ad(trx_id->len == DATA_TRX_ID_LEN);
+ ut_ad(trx_id[1].len == DATA_ROLL_PTR_LEN);
+ ut_ad(*static_cast<const byte*>
+ (trx_id[1].data) & 0x80);
+ if (flags & BTR_NO_UNDO_LOG_FLAG) {
+ ut_ad(!memcmp(trx_id->data, reset_trx_id,
+ DATA_TRX_ID_LEN));
+ } else {
+ ut_ad(thr->graph->trx->id);
+ ut_ad(thr->graph->trx->id
+ == trx_read_trx_id(
+ static_cast<const byte*>(
+ trx_id->data))
+ || index->table->is_temporary());
+ }
+ }
+#endif
+
+ *rec = page_cur_tuple_insert(
+ page_cursor, entry, index, offsets, heap,
+ n_ext, mtr);
+
+ reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
+ }
+
+ if (*rec) {
+ } else if (block->page.zip.data) {
+ ut_ad(!index->table->is_temporary());
+ /* Reset the IBUF_BITMAP_FREE bits, because
+ page_cur_tuple_insert() will have attempted page
+ reorganize before failing. */
+ if (leaf
+ && !dict_index_is_clust(index)) {
+ ibuf_reset_free_bits(block);
+ }
+
+ goto fail;
+ } else {
+ ut_ad(!reorg);
+
+ /* If the record did not fit, reorganize */
+ if (!btr_page_reorganize(page_cursor, index, mtr)) {
+ ut_ad(0);
+ goto fail;
+ }
+
+ ut_ad(page_get_max_insert_size(page, 1) == max_size);
+
+ reorg = TRUE;
+
+ *rec = page_cur_tuple_insert(page_cursor, entry, index,
+ offsets, heap, n_ext, mtr);
+
+ if (UNIV_UNLIKELY(!*rec)) {
+ ib::fatal() << "Cannot insert tuple " << *entry
+				<< " into index " << index->name
+ << " of table " << index->table->name
+ << ". Max size: " << max_size;
+ }
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (!leaf) {
+# ifdef MYSQL_INDEX_DISABLE_AHI
+ } else if (index->disable_ahi) {
+# endif
+ } else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
+ ut_ad(entry->is_metadata());
+ ut_ad(index->is_instant());
+ ut_ad(flags == BTR_NO_LOCKING_FLAG);
+ } else {
+ rw_lock_t* ahi_latch = btr_search_sys.get_latch(*index);
+ if (!reorg && cursor->flag == BTR_CUR_HASH) {
+ btr_search_update_hash_node_on_insert(
+ cursor, ahi_latch);
+ } else {
+ btr_search_update_hash_on_insert(cursor, ahi_latch);
+ }
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
+
+ lock_update_insert(block, *rec);
+ }
+
+ if (leaf
+ && !dict_index_is_clust(index)
+ && !index->table->is_temporary()) {
+ /* Update the free bits of the B-tree page in the
+ insert buffer bitmap. */
+
+ /* The free bits in the insert buffer bitmap must
+ never exceed the free space on a page. It is safe to
+ decrement or reset the bits in the bitmap in a
+ mini-transaction that is committed before the
+ mini-transaction that affects the free space. */
+
+ /* It is unsafe to increment the bits in a separately
+ committed mini-transaction, because in crash recovery,
+ the free bits could momentarily be set too high. */
+
+ if (block->page.zip.data) {
+ /* Update the bits in the same mini-transaction. */
+ ibuf_update_free_bits_zip(block, mtr);
+ } else {
+ /* Decrement the bits in a separate
+ mini-transaction. */
+ ibuf_update_free_bits_if_full(
+ block, max_size,
+ rec_size + PAGE_DIR_SLOT_SIZE);
+ }
+ }
+
+ *big_rec = big_rec_vec;
+
+ return(DB_SUCCESS);
+}
+
+/*************************************************************//**
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also hold
+x-latches on the siblings of the page, if they exist.
+@return DB_SUCCESS or error number */
+dberr_t
+btr_cur_pessimistic_insert(
+/*=======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameter thr should be
+ specified; if no undo logging is specified,
+ then the caller must have reserved enough
+ free extents in the file space so that the
+ insertion will certainly succeed */
+ btr_cur_t* cursor, /*!< in: cursor after which to insert;
+ cursor stays valid */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in/out: query thread; can be NULL if
+ !(~flags
+ & (BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG)) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dict_index_t* index = cursor->index;
+ big_rec_t* big_rec_vec = NULL;
+ dberr_t err;
+ bool inherit = false;
+ bool success;
+ uint32_t n_reserved = 0;
+
+ ut_ad(dtuple_check_typed(entry));
+ ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
+
+ *big_rec = NULL;
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+
+ cursor->flag = BTR_CUR_BINARY;
+
+ /* Check locks and write to undo log, if specified */
+
+ err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
+ thr, mtr, &inherit);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
+ /* First reserve enough free space for the file segments
+ of the index tree, so that the insert will not fail because
+ of lack of space */
+
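+		/* This is a heuristic: reserve roughly one extent per
+		16 levels of the tree, plus a margin of 3 extents. */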
+ uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3);
+
+ success = fsp_reserve_free_extents(&n_reserved,
+ index->table->space,
+ n_extents, FSP_NORMAL, mtr);
+ if (!success) {
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+ }
+
+ if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
+ index->table->not_redundant(),
+ dtuple_get_n_fields(entry),
+ btr_cur_get_block(cursor)->zip_size())
+ || UNIV_UNLIKELY(entry->is_alter_metadata()
+ && !dfield_is_ext(
+ dtuple_get_nth_field(
+ entry,
+ index->first_user_field())))) {
+ /* The record is so big that we have to store some fields
+ externally on separate database pages */
+
+ if (UNIV_LIKELY_NULL(big_rec_vec)) {
+ /* This should never happen, but we handle
+ the situation in a robust manner. */
+ ut_ad(0);
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
+
+ big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
+
+ if (big_rec_vec == NULL) {
+
+ index->table->space->release_free_extents(n_reserved);
+ return(DB_TOO_BIG_RECORD);
+ }
+ }
+
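+	/* A pessimistic insert splits the page (or raises the root)
+	before inserting, so with the file space reserved above it is
+	guaranteed to find room for the record. */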
+ if (dict_index_get_page(index)
+ == btr_cur_get_block(cursor)->page.id().page_no()) {
+
+ /* The page is the root page */
+ *rec = btr_root_raise_and_insert(
+ flags, cursor, offsets, heap, entry, n_ext, mtr);
+ } else {
+ *rec = btr_page_split_and_insert(
+ flags, cursor, offsets, heap, entry, n_ext, mtr);
+ }
+
+ if (*rec == NULL && os_has_said_disk_full) {
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
+ || dict_index_is_spatial(index));
+
+ if (!(flags & BTR_NO_LOCKING_FLAG)) {
+ ut_ad(!index->table->is_temporary());
+ if (dict_index_is_spatial(index)) {
+ /* Do nothing */
+ } else {
+			/* The cursor might have moved to another page,
+			and the max trx id field must be updated after
+			the cursor position has been fixed. */
+ if (!dict_index_is_clust(index)) {
+ page_update_max_trx_id(
+ btr_cur_get_block(cursor),
+ btr_cur_get_page_zip(cursor),
+ thr_get_trx(thr)->id, mtr);
+ }
+
+ if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
+ || !page_has_prev(btr_cur_get_page(cursor))) {
+				/* After the split-and-insert,
+				lock_update_insert() must always be
+				called. */
+ inherit = true;
+ }
+ }
+ }
+
+ if (!page_is_leaf(btr_cur_get_page(cursor))) {
+ ut_ad(!big_rec_vec);
+ } else {
+#ifdef BTR_CUR_HASH_ADAPT
+# ifdef MYSQL_INDEX_DISABLE_AHI
+ if (index->disable_ahi); else
+# endif
+ if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
+ ut_ad(entry->is_metadata());
+ ut_ad(index->is_instant());
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ ut_ad(!(flags & BTR_CREATE_FLAG));
+ } else {
+ btr_search_update_hash_on_insert(
+ cursor, btr_search_sys.get_latch(*index));
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+ if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {
+
+ lock_update_insert(btr_cur_get_block(cursor), *rec);
+ }
+ }
+
+ index->table->space->release_free_extents(n_reserved);
+ *big_rec = big_rec_vec;
+
+ return(DB_SUCCESS);
+}
+
+/*==================== B-TREE UPDATE =========================*/
+
+/*************************************************************//**
+For an update, checks the locks and does the undo logging.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+btr_cur_upd_lock_and_undo(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on record to update */
+ const rec_offs* offsets,/*!< in: rec_get_offsets() on cursor */
+ const upd_t* update, /*!< in: update vector */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ roll_ptr_t* roll_ptr)/*!< out: roll pointer */
+{
+ dict_index_t* index;
+ const rec_t* rec;
+ dberr_t err;
+
+ ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG));
+
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mtr->is_named_space(index->table->space));
+
+ if (!dict_index_is_clust(index)) {
+ ut_ad(dict_index_is_online_ddl(index)
+ == !!(flags & BTR_CREATE_FLAG));
+
+ /* We do undo logging only when we update a clustered index
+ record */
+ return(lock_sec_rec_modify_check_and_lock(
+ flags, btr_cur_get_block(cursor), rec,
+ index, thr, mtr));
+ }
+
+ /* Check if we have to wait for a lock: enqueue an explicit lock
+ request if yes */
+
+ if (!(flags & BTR_NO_LOCKING_FLAG)) {
+ err = lock_clust_rec_modify_check_and_lock(
+ flags, btr_cur_get_block(cursor), rec, index,
+ offsets, thr);
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ /* Append the info about the update in the undo log */
+
+ return((flags & BTR_NO_UNDO_LOG_FLAG)
+ ? DB_SUCCESS
+ : trx_undo_report_row_operation(
+ thr, index, NULL, update,
+ cmpl_info, rec, offsets, roll_ptr));
+}
+
+/** Write DB_TRX_ID,DB_ROLL_PTR to a clustered index entry.
+@param[in,out] entry clustered index entry
+@param[in] index clustered index
+@param[in] trx_id DB_TRX_ID
+@param[in] roll_ptr DB_ROLL_PTR */
+static void btr_cur_write_sys(
+ dtuple_t* entry,
+ const dict_index_t* index,
+ trx_id_t trx_id,
+ roll_ptr_t roll_ptr)
+{
+ dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id());
+ ut_ad(t->len == DATA_TRX_ID_LEN);
+ trx_write_trx_id(static_cast<byte*>(t->data), trx_id);
+ dfield_t* r = dtuple_get_nth_field(entry, index->db_roll_ptr());
+ ut_ad(r->len == DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
+}
+
+/** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record.
+@param[in,out] block clustered index leaf page
+@param[in,out] rec clustered index record
+@param[in] index clustered index
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] trx transaction
+@param[in] roll_ptr DB_ROLL_PTR value
+@param[in,out] mtr mini-transaction */
+static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
+ dict_index_t *index, const rec_offs *offsets,
+ const trx_t *trx, roll_ptr_t roll_ptr,
+ mtr_t *mtr)
+{
+ ut_ad(index->is_primary());
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ {
+ page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(),
+ trx->id, roll_ptr, mtr);
+ return;
+ }
+
+ ulint offset= index->trx_id_offset;
+
+ if (!offset)
+ offset= row_get_trx_id_offset(index, offsets);
+
+ compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+
+ /* During IMPORT the trx id in the record can be in the future, if
+ the .ibd file is being imported from another instance. During IMPORT
+ roll_ptr will be 0. */
+ ut_ad(roll_ptr == 0 ||
+ lock_check_trx_id_sanity(trx_read_trx_id(rec + offset),
+ rec, index, offsets));
+
+ byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+
+ trx_write_trx_id(sys, trx->id);
+ trx_write_roll_ptr(sys + DATA_TRX_ID_LEN, roll_ptr);
+
+ ulint d= 0;
+ const byte *src= nullptr;
+ byte *dest= rec + offset;
+ ulint len= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ if (UNIV_LIKELY(index->trx_id_offset))
+ {
+ const rec_t *prev= page_rec_get_prev_const(rec);
+ if (UNIV_UNLIKELY(prev == rec))
+ ut_ad(0);
+ else if (page_rec_is_infimum(prev));
+ else
+ for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++)
+ if (src[d] != sys[d])
+ break;
+ if (d > 6 && memcmp(dest, sys, d))
+ {
+ /* We save space by replacing a single record
+
+ WRITE,page_offset(dest),byte[13]
+
+ with two records:
+
+ MEMMOVE,page_offset(dest),d(1 byte),offset(1..3 bytes),
+ WRITE|0x80,0,byte[13-d]
+
+ The single WRITE record would be x+13 bytes long, with x>2.
+ The MEMMOVE record would be up to x+1+3 = x+4 bytes, and the
+ second WRITE would be 1+1+13-d = 15-d bytes.
+
+ The total size is: x+13 versus x+4+15-d = x+19-d bytes.
+ To save space, we must have d>6, that is, the complete DB_TRX_ID and
+ the first byte(s) of DB_ROLL_PTR must match the previous record. */
+ memcpy(dest, src, d);
+ mtr->memmove(*block, page_offset(dest), page_offset(src), d);
+ dest+= d;
+ len-= d;
+ /* DB_TRX_ID,DB_ROLL_PTR must be unique in each record when
+ DB_TRX_ID refers to an active transaction. */
+ ut_ad(len);
+ }
+ else
+ d= 0;
+ }
+
+ if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*block, dest, sys + d, len);
+}
+
+/*************************************************************//**
+See if there is enough space in the page modification log to log
+an update-in-place.
+
+@retval false if out of space; IBUF_BITMAP_FREE will be reset
+outside mtr if the page was recompressed
+@retval true if enough space is available;
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
+a secondary index leaf page. This has to be done either within the
+same mini-transaction, or by invoking ibuf_reset_free_bits() before
+mtr_commit(mtr). */
+bool
+btr_cur_update_alloc_zip_func(
+/*==========================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ page_cur_t* cursor, /*!< in/out: B-tree page cursor */
+ dict_index_t* index, /*!< in: the index corresponding to cursor */
+#ifdef UNIV_DEBUG
+ rec_offs* offsets,/*!< in/out: offsets of the cursor record */
+#endif /* UNIV_DEBUG */
+ ulint length, /*!< in: size needed */
+ bool create, /*!< in: true=delete-and-insert,
+ false=update-in-place */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+
+	/* Keep a local copy of the page frame, as it can change
+	dynamically. */
+ const page_t* page = page_cur_get_page(cursor);
+
+ ut_ad(page_zip == page_cur_get_page_zip(cursor));
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
+
+ if (page_zip_available(page_zip, dict_index_is_clust(index),
+ length, create)) {
+ return(true);
+ }
+
+ if (!page_zip->m_nonempty && !page_has_garbage(page)) {
+ /* The page has been freshly compressed, so
+ reorganizing it will not help. */
+ return(false);
+ }
+
+ if (create && page_is_leaf(page)
+ && (length + page_get_data_size(page)
+ >= dict_index_zip_pad_optimal_page_size(index))) {
+ return(false);
+ }
+
+ if (!btr_page_reorganize(cursor, index, mtr)) {
+ goto out_of_space;
+ }
+
+ rec_offs_make_valid(page_cur_get_rec(cursor), index,
+ page_is_leaf(page), offsets);
+
+ /* After recompressing a page, we must make sure that the free
+ bits in the insert buffer bitmap will not exceed the free
+ space on the page. Because this function will not attempt
+ recompression unless page_zip_available() fails above, it is
+ safe to reset the free bits if page_zip_available() fails
+ again, below. The free bits can safely be reset in a separate
+ mini-transaction. If page_zip_available() succeeds below, we
+ can be sure that the btr_page_reorganize() above did not reduce
+ the free space available on the page. */
+
+ if (page_zip_available(page_zip, dict_index_is_clust(index),
+ length, create)) {
+ return(true);
+ }
+
+out_of_space:
+ ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
+
+ /* Out of space: reset the free bits. */
+ if (!dict_index_is_clust(index)
+ && !index->table->is_temporary()
+ && page_is_leaf(page)) {
+ ibuf_reset_free_bits(page_cur_get_block(cursor));
+ }
+
+ return(false);
+}
+
+/** Apply an update vector to a record. No field size changes are allowed.
+
+This is usually invoked on a clustered index. The only use case for a
+secondary index is row_ins_sec_index_entry_by_modify() or its
+counterpart in ibuf_insert_to_index_page().
+@param[in,out] rec index record
+@param[in] index the index of the record
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] update update vector
+@param[in,out] block index page
+@param[in,out] mtr mini-transaction */
+void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index,
+ const rec_offs *offsets, const upd_t *update,
+ buf_block_t *block, mtr_t *mtr)
+{
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!index->table->skip_alter_undo);
+ ut_ad(!block->page.zip.data || index->table->not_redundant());
+
+#ifdef UNIV_DEBUG
+ if (rec_offs_comp(offsets)) {
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_ORDINARY:
+ break;
+ case REC_STATUS_INSTANT:
+ ut_ad(index->is_instant());
+ break;
+ case REC_STATUS_NODE_PTR:
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ ut_ad("wrong record status in update" == 0);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+ static_assert(REC_INFO_BITS_SHIFT == 0, "compatibility");
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ ut_ad(rec_offs_comp(offsets));
+ byte* info_bits = &rec[-REC_NEW_INFO_BITS];
+ const bool flip_del_mark = (*info_bits ^ update->info_bits)
+ & REC_INFO_DELETED_FLAG;
+ *info_bits &= byte(~REC_INFO_BITS_MASK);
+ *info_bits |= update->info_bits;
+
+ if (flip_del_mark) {
+ page_zip_rec_set_deleted(block, rec, update->info_bits
+ & REC_INFO_DELETED_FLAG, mtr);
+ }
+ } else {
+ byte* info_bits = &rec[rec_offs_comp(offsets)
+ ? -REC_NEW_INFO_BITS
+ : -REC_OLD_INFO_BITS];
+
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, info_bits,
+ (*info_bits
+ & ~REC_INFO_BITS_MASK)
+ | update->info_bits);
+ }
+
+ for (ulint i = 0; i < update->n_fields; i++) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+ if (upd_fld_is_virtual_col(uf) && !index->has_virtual()) {
+ continue;
+ }
+ const ulint n = uf->field_no;
+
+ ut_ad(!dfield_is_ext(&uf->new_val)
+ == !rec_offs_nth_extern(offsets, n));
+ ut_ad(!rec_offs_nth_default(offsets, n));
+
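+ /* The field is being updated to SQL NULL. Unless this is
+ an instantly added column that already is NULL, this can
+ only happen in ROW_FORMAT=REDUNDANT: zero out the stored
+ payload and set the SQL NULL flag in the field offsets
+ array below. */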
+ if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) {
+ if (rec_offs_nth_sql_null(offsets, n)) {
+ ut_ad(index->table->is_instant());
+ ut_ad(n >= index->n_core_fields);
+ continue;
+ }
+
+ ut_ad(!index->table->not_redundant());
+ switch (ulint size = rec_get_nth_field_size(rec, n)) {
+ case 0:
+ break;
+ case 1:
+ mtr->write<1,mtr_t::MAYBE_NOP>(
+ *block,
+ rec_get_field_start_offs(rec, n) + rec,
+ 0U);
+ break;
+ default:
+ mtr->memset(
+ block,
+ page_offset(rec_get_field_start_offs(
+ rec, n) + rec),
+ size, 0);
+ }
+ ulint l = rec_get_1byte_offs_flag(rec)
+ ? (n + 1) : (n + 1) * 2;
+ byte* b = rec - REC_N_OLD_EXTRA_BYTES - l;
+ compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
+ == REC_2BYTE_SQL_NULL_MASK);
+ mtr->write<1>(*block, b,
+ byte(*b | REC_1BYTE_SQL_NULL_MASK));
+ continue;
+ }
+
+ ulint len;
+ byte* data = rec_get_nth_field(rec, offsets, n, &len);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ ut_ad(len == uf->new_val.len);
+ memcpy(data, uf->new_val.data, len);
+ continue;
+ }
+
+ if (UNIV_UNLIKELY(len != uf->new_val.len)) {
+ ut_ad(len == UNIV_SQL_NULL);
+ ut_ad(!rec_offs_comp(offsets));
+ len = uf->new_val.len;
+ ut_ad(len == rec_get_nth_field_size(rec, n));
+ ulint l = rec_get_1byte_offs_flag(rec)
+ ? (n + 1) : (n + 1) * 2;
+ byte* b = rec - REC_N_OLD_EXTRA_BYTES - l;
+ compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
+ == REC_2BYTE_SQL_NULL_MASK);
+ mtr->write<1>(*block, b,
+ byte(*b & ~REC_1BYTE_SQL_NULL_MASK));
+ }
+
+ if (len) {
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*block, data,
+ uf->new_val.data, len);
+ }
+ }
+
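+ /* On a ROW_FORMAT=COMPRESSED page, the record was updated
+ in place in the uncompressed copy above; also write it to
+ the compressed page image. */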
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ page_zip_write_rec(block, rec, index, offsets, 0, mtr);
+ }
+}
+
+/*************************************************************//**
+Updates a record when the update causes no size changes in its fields.
+We assume here that the ordering fields of the record do not change.
+@return locking or undo log related error code, or
+@retval DB_SUCCESS on success
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+dberr_t
+btr_cur_update_in_place(
+/*====================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ rec_offs* offsets,/*!< in/out: offsets on cursor->page_cur.rec */
+ const upd_t* update, /*!< in: update vector */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; if this
+ is a secondary index, the caller must
+ mtr_commit(mtr) before latching any
+ further pages */
+{
+ dict_index_t* index;
+ dberr_t err;
+ rec_t* rec;
+ roll_ptr_t roll_ptr = 0;
+ ulint was_delete_marked;
+
+ ut_ad(page_is_leaf(cursor->page_cur.block->frame));
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+ ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
+ || index->table->is_temporary());
+ /* The insert buffer tree should never be updated in place. */
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
+ || dict_index_is_clust(index));
+ ut_ad(thr_get_trx(thr)->id == trx_id
+ || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
+ == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
+ ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
+ ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
+ ut_ad(!(update->info_bits & REC_INFO_MIN_REC_FLAG));
+
+ DBUG_LOG("ib_cur",
+ "update-in-place " << index->name << " (" << index->id
+ << ") by " << ib::hex(trx_id) << ": "
+ << rec_printer(rec, offsets).str());
+
+ buf_block_t* block = btr_cur_get_block(cursor);
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+
+ /* Check that enough space is available on the compressed page. */
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ ut_ad(!index->table->is_temporary());
+
+ if (!btr_cur_update_alloc_zip(
+ page_zip, btr_cur_get_page_cur(cursor),
+ index, offsets, rec_offs_size(offsets),
+ false, mtr)) {
+ return(DB_ZIP_OVERFLOW);
+ }
+
+ rec = btr_cur_get_rec(cursor);
+ }
+
+ /* Do lock checking and undo logging */
+ err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
+ update, cmpl_info,
+ thr, mtr, &roll_ptr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+ goto func_exit;
+ }
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ btr_cur_upd_rec_sys(block, rec, index, offsets,
+ thr_get_trx(thr), roll_ptr, mtr);
+ }
+
+ was_delete_marked = rec_get_deleted_flag(
+ rec, page_is_comp(buf_block_get_frame(block)));
+ /* In delete-marked records, DB_TRX_ID must always refer to an
+ existing undo log record. */
+ ut_ad(!was_delete_marked
+ || !dict_index_is_clust(index)
+ || row_get_rec_trx_id(rec, index, offsets));
+
+#ifdef BTR_CUR_HASH_ADAPT
+ {
+ rw_lock_t* ahi_latch = block->index
+ ? btr_search_sys.get_latch(*index) : NULL;
+ if (ahi_latch) {
+ /* TODO: Can we skip this if none of the first
+ index->search_info->curr_n_fields fields
+ are being updated? */
+
+ /* The function row_upd_changes_ord_field_binary
+ does not work on a secondary index. */
+
+ if (!dict_index_is_clust(index)
+ || row_upd_changes_ord_field_binary(
+ index, update, thr, NULL, NULL)) {
+ ut_ad(!(update->info_bits
+ & REC_INFO_MIN_REC_FLAG));
+ /* Remove possible hash index pointer
+ to this record */
+ btr_search_update_hash_on_delete(cursor);
+ }
+
+ rw_lock_x_lock(ahi_latch);
+ }
+
+ assert_block_ahi_valid(block);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ btr_cur_upd_rec_in_place(rec, index, offsets, update, block,
+ mtr);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (ahi_latch) {
+ rw_lock_x_unlock(ahi_latch);
+ }
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (was_delete_marked
+ && !rec_get_deleted_flag(
+ rec, page_is_comp(buf_block_get_frame(block)))) {
+ /* The new updated record owns its possible externally
+ stored fields */
+
+ btr_cur_unmark_extern_fields(block, rec, index, offsets, mtr);
+ }
+
+ ut_ad(err == DB_SUCCESS);
+
+func_exit:
+ if (page_zip
+ && !(flags & BTR_KEEP_IBUF_BITMAP)
+ && !dict_index_is_clust(index)
+ && page_is_leaf(buf_block_get_frame(block))) {
+ /* Update the free bits in the insert buffer. */
+ ut_ad(!index->table->is_temporary());
+ ibuf_update_free_bits_zip(block, mtr);
+ }
+
+ return(err);
+}
+
+/** Trim a metadata record during the rollback of instant ALTER TABLE.
+@param[in] entry metadata tuple
+@param[in] index primary key
+@param[in] update update vector for the rollback */
+ATTRIBUTE_COLD
+static void btr_cur_trim_alter_metadata(dtuple_t* entry,
+ const dict_index_t* index,
+ const upd_t* update)
+{
+ ut_ad(index->is_instant());
+ ut_ad(update->is_alter_metadata());
+ ut_ad(entry->is_alter_metadata());
+
+ ut_ad(update->fields[0].field_no == index->first_user_field());
+ ut_ad(update->fields[0].new_val.ext);
+ ut_ad(update->fields[0].new_val.len == FIELD_REF_SIZE);
+ ut_ad(entry->n_fields - 1 == index->n_fields);
+
+ const byte* ptr = static_cast<const byte*>(
+ update->fields[0].new_val.data);
+ ut_ad(!mach_read_from_4(ptr + BTR_EXTERN_LEN));
+ ut_ad(mach_read_from_4(ptr + BTR_EXTERN_LEN + 4) > 4);
+ ut_ad(mach_read_from_4(ptr + BTR_EXTERN_OFFSET) == FIL_PAGE_DATA);
+ ut_ad(mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
+ == index->table->space->id);
+
+ ulint n_fields = update->fields[1].field_no;
+ ut_ad(n_fields <= index->n_fields);
+ if (n_fields != index->n_uniq) {
+ ut_ad(n_fields
+ >= index->n_core_fields);
+ entry->n_fields = n_fields;
+ return;
+ }
+
+ /* This is based on dict_table_t::deserialise_columns()
+ and btr_cur_instant_init_low(). */
+ mtr_t mtr;
+ mtr.start();
+ buf_block_t* block = buf_page_get(
+ page_id_t(index->table->space->id,
+ mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
+ 0, RW_S_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
+ ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_TYPE_BLOB);
+ ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO])
+ == FIL_NULL);
+ ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA
+ + BTR_BLOB_HDR_PART_LEN])
+ == mach_read_from_4(ptr + BTR_EXTERN_LEN + 4));
+ n_fields = mach_read_from_4(
+ &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE])
+ + index->first_user_field();
+ /* Rollback should not increase the number of fields. */
+ ut_ad(n_fields <= index->n_fields);
+ ut_ad(n_fields + 1 <= entry->n_fields);
+ /* dict_index_t::clear_instant_alter() cannot be invoked while
+ rollback of an instant ALTER TABLE transaction is in progress
+ for an is_alter_metadata() record. */
+ ut_ad(n_fields >= index->n_core_fields);
+
+ mtr.commit();
+ entry->n_fields = n_fields + 1;
+}
+
+/** Trim an update tuple due to instant ADD COLUMN, if needed.
+For normal records, the trailing instantly added fields that match
+the initial default values are omitted.
+
+For the special metadata record on a table on which instant
+ADD COLUMN has already been executed, both ADD COLUMN and the
+rollback of ADD COLUMN need to be handled specially.
+
+@param[in,out] entry index entry
+@param[in] index index
+@param[in] update update vector
+@param[in] thr execution thread */
+static inline
+void
+btr_cur_trim(
+ dtuple_t* entry,
+ const dict_index_t* index,
+ const upd_t* update,
+ const que_thr_t* thr)
+{
+ if (!index->is_instant()) {
+ } else if (UNIV_UNLIKELY(update->is_metadata())) {
+ /* We are either updating a metadata record
+ (instant ALTER TABLE on a table where instant ALTER was
+ already executed) or rolling back such an operation. */
+ ut_ad(!upd_get_nth_field(update, 0)->orig_len);
+ ut_ad(entry->is_metadata());
+
+ if (thr->graph->trx->in_rollback) {
+ /* This rollback can occur either as part of
+ ha_innobase::commit_inplace_alter_table() rolling
+ back after a failed innobase_add_instant_try(),
+ or as part of crash recovery. Either way, the
+ table will be in the data dictionary cache, with
+ the instantly added columns going to be removed
+ later in the rollback. */
+ ut_ad(index->table->cached);
+ /* The DB_TRX_ID,DB_ROLL_PTR are always last,
+ and there should be some change to roll back.
+ The first field in the update vector is the
+ first instantly added column logged by
+ innobase_add_instant_try(). */
+ ut_ad(update->n_fields > 2);
+ if (update->is_alter_metadata()) {
+ btr_cur_trim_alter_metadata(
+ entry, index, update);
+ return;
+ }
+ ut_ad(!entry->is_alter_metadata());
+
+ ulint n_fields = upd_get_nth_field(update, 0)
+ ->field_no;
+ ut_ad(n_fields + 1 >= entry->n_fields);
+ entry->n_fields = n_fields;
+ }
+ } else {
+ entry->trim(*index);
+ }
+}
+
+/*************************************************************//**
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended. We assume here that the ordering
+fields of the record do not change.
+@return error code, including
+@retval DB_SUCCESS on success
+@retval DB_OVERFLOW if the updated record does not fit
+@retval DB_UNDERFLOW if the page would become too empty
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+dberr_t
+btr_cur_optimistic_update(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */
+ const upd_t* update, /*!< in: update vector; this must also
+ contain trx id and roll ptr fields */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; if this
+ is a secondary index, the caller must
+ mtr_commit(mtr) before latching any
+ further pages */
+{
+ dict_index_t* index;
+ page_cur_t* page_cursor;
+ dberr_t err;
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ rec_t* rec;
+ ulint max_size;
+ ulint new_rec_size;
+ ulint old_rec_size;
+ ulint max_ins_size = 0;
+ dtuple_t* new_entry;
+ roll_ptr_t roll_ptr;
+ ulint i;
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+ ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
+ || index->table->is_temporary());
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ /* This is intended only for leaf page updates */
+ ut_ad(page_is_leaf(page));
+ /* The insert buffer tree should never be updated in place. */
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
+ || dict_index_is_clust(index));
+ ut_ad(thr_get_trx(thr)->id == trx_id
+ || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
+ == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
+ ut_ad(fil_page_index_page_check(page));
+ ut_ad(btr_page_get_index_id(page) == index->id);
+
+ *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields,
+ ULINT_UNDEFINED, heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(rec, *offsets)
+ || thr_get_trx(thr) == trx_roll_crash_recv_trx);
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ if (UNIV_LIKELY(!update->is_metadata())
+ && !row_upd_changes_field_size_or_external(index, *offsets,
+ update)) {
+
+ /* The simplest and the most common case: the update does not
+ change the size of any field and none of the updated fields is
+ externally stored in rec or update, and there is enough space
+ on the compressed page to log the update. */
+
+ return(btr_cur_update_in_place(
+ flags, cursor, *offsets, update,
+ cmpl_info, thr, trx_id, mtr));
+ }
+
+ if (rec_offs_any_extern(*offsets)) {
+any_extern:
+ ut_ad(!index->is_ibuf());
+ /* Externally stored fields are treated in pessimistic
+ update */
+
+ /* prefetch siblings of the leaf for the pessimistic
+ operation. */
+ btr_cur_prefetch_siblings(block, index);
+
+ return(DB_OVERFLOW);
+ }
+
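+ /* The metadata record of a general instant ALTER TABLE
+ (index->table->instant != NULL) carries a BLOB with the
+ serialised column information, so treat it like a record
+ with externally stored columns. */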
+ if (rec_is_metadata(rec, *index) && index->table->instant) {
+ goto any_extern;
+ }
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
+
+ goto any_extern;
+ }
+ }
+
+ DBUG_LOG("ib_cur",
+ "update " << index->name << " (" << index->id << ") by "
+ << ib::hex(trx_id) << ": "
+ << rec_printer(rec, *offsets).str());
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ if (!*heap) {
+ *heap = mem_heap_create(
+ rec_offs_size(*offsets)
+ + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
+ }
+
+ new_entry = row_rec_to_index_entry(rec, index, *offsets, *heap);
+ ut_ad(!dtuple_get_n_ext(new_entry));
+
+ /* The page containing the clustered index record
+ corresponding to new_entry is latched in mtr.
+ Thus the following call is safe. */
+ row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
+ *heap);
+ btr_cur_trim(new_entry, index, update, thr);
+ old_rec_size = rec_offs_size(*offsets);
+ new_rec_size = rec_get_converted_size(index, new_entry, 0);
+
+ page_zip = buf_block_get_page_zip(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_zip) {
+ ut_ad(!index->table->is_temporary());
+
+ if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page),
+ dict_index_get_n_fields(index),
+ block->zip_size())) {
+ goto any_extern;
+ }
+
+ if (!btr_cur_update_alloc_zip(
+ page_zip, page_cursor, index, *offsets,
+ new_rec_size, true, mtr)) {
+ return(DB_ZIP_OVERFLOW);
+ }
+
+ rec = page_cur_get_rec(page_cursor);
+ }
+
+ /* We limit max record size to 16k even for 64k page size. */
+ if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE ||
+ (!dict_table_is_comp(index->table)
+ && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) {
+ err = DB_OVERFLOW;
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(new_rec_size
+ >= (page_get_free_space_of_empty(page_is_comp(page))
+ / 2))) {
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+ err = DB_OVERFLOW;
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(page_get_data_size(page)
+ - old_rec_size + new_rec_size
+ < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+
+ /* The page would become too empty */
+ err = DB_UNDERFLOW;
+ goto func_exit;
+ }
+
+ /* We do not attempt to reorganize if the page is compressed.
+ This is because the page may fail to compress after reorganization. */
+ max_size = page_zip
+ ? page_get_max_insert_size(page, 1)
+ : (old_rec_size
+ + page_get_max_insert_size_after_reorganize(page, 1));
+
+ if (!page_zip) {
+ max_ins_size = page_get_max_insert_size_after_reorganize(
+ page, 1);
+ }
+
+ if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
+ && (max_size >= new_rec_size))
+ || (page_get_n_recs(page) <= 1))) {
+
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+
+ /* There was not enough space, or it did not pay to
+ reorganize: for simplicity, we decide what to do assuming a
+ reorganization is needed, though it might not be necessary */
+
+ err = DB_OVERFLOW;
+ goto func_exit;
+ }
+
+ /* Do lock checking and undo logging */
+ err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
+ update, cmpl_info,
+ thr, mtr, &roll_ptr);
+ if (err != DB_SUCCESS) {
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+ goto func_exit;
+ }
+
+ /* Ok, we may do the replacement. Store on the page infimum the
+ explicit locks on rec, before deleting rec (see the comment in
+ btr_cur_pessimistic_update). */
+ if (!dict_table_is_locking_disabled(index->table)) {
+ lock_rec_store_on_page_infimum(block, rec);
+ }
+
+ if (UNIV_UNLIKELY(update->is_metadata())) {
+ ut_ad(new_entry->is_metadata());
+ ut_ad(index->is_instant());
+ /* This can be innobase_add_instant_try() performing a
+ subsequent instant ADD COLUMN, or its rollback by
+ row_undo_mod_clust_low(). */
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ } else {
+ btr_search_update_hash_on_delete(cursor);
+ }
+
+ page_cur_delete_rec(page_cursor, index, *offsets, mtr);
+
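+ /* Position the cursor on the predecessor, so that the
+ updated record will be re-inserted in the place of the
+ deleted one. */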
+ page_cur_move_to_prev(page_cursor);
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
+ }
+
+ /* There are no externally stored columns in new_entry */
+ rec = btr_cur_insert_if_possible(
+ cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
+ ut_a(rec); /* <- We calculated above the insert would fit */
+
+ if (UNIV_UNLIKELY(update->is_metadata())) {
+ /* We must empty the PAGE_FREE list, because if this
+ was a rollback, the shortened metadata record
+ would have too many fields, and we would be unable to
+ know the size of the freed record. */
+ btr_page_reorganize(page_cursor, index, mtr);
+ } else if (!dict_table_is_locking_disabled(index->table)) {
+ /* Restore the old explicit lock state on the record */
+ lock_rec_restore_from_page_infimum(block, rec, block);
+ }
+
+ page_cur_move_to_next(page_cursor);
+ ut_ad(err == DB_SUCCESS);
+
+func_exit:
+ if (!(flags & BTR_KEEP_IBUF_BITMAP)
+ && !dict_index_is_clust(index)) {
+ /* Update the free bits in the insert buffer. */
+ if (page_zip) {
+ ut_ad(!index->table->is_temporary());
+ ibuf_update_free_bits_zip(block, mtr);
+ } else if (!index->table->is_temporary()) {
+ ibuf_update_free_bits_low(block, max_ins_size, mtr);
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ /* prefetch siblings of the leaf for the pessimistic
+ operation. */
+ btr_cur_prefetch_siblings(block, index);
+ }
+
+ return(err);
+}
+
+/*************************************************************//**
+If, in a split, a new supremum record was created as the predecessor of the
+updated record, the supremum record must inherit exactly the locks on the
+updated record. In the split it may have inherited locks from the successor
+of the updated record, which is not correct. This function restores the
+right locks for the new supremum. */
+static
+void
+btr_cur_pess_upd_restore_supremum(
+/*==============================*/
+ buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: updated record */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+ buf_block_t* prev_block;
+
+ page = buf_block_get_frame(block);
+
+ if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
+ /* Updated record is not the first user record on its page */
+
+ return;
+ }
+
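+ /* The caller checked that the updated record was not the
+ first user record on its page before the update, so after
+ the split a preceding page must exist. */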
+ const uint32_t prev_page_no = btr_page_get_prev(page);
+
+ const page_id_t page_id(block->page.id().space(), prev_page_no);
+
+ ut_ad(prev_page_no != FIL_NULL);
+ prev_block = buf_page_get_with_no_latch(page_id, block->zip_size(),
+ mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_next(prev_block->frame)
+ == block->page.id().page_no());
+#endif /* UNIV_BTR_DEBUG */
+
+ /* We must already have an x-latch on prev_block! */
+ ut_ad(mtr->memo_contains_flagged(prev_block, MTR_MEMO_PAGE_X_FIX));
+
+ lock_rec_reset_and_inherit_gap_locks(prev_block, block,
+ PAGE_HEAP_NO_SUPREMUM,
+ page_rec_get_heap_no(rec));
+}
+
+/*************************************************************//**
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist. We assume
+here that the ordering fields of the record do not change.
+@return DB_SUCCESS or error code */
+dberr_t
+btr_cur_pessimistic_update(
+/*=======================*/
+ ulint flags, /*!< in: undo logging, locking, and rollback
+ flags */
+ btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
+ cursor may become invalid if *big_rec == NULL
+ || !(flags & BTR_KEEP_POS_FLAG) */
+ rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: pointer to memory heap
+ that can be emptied */
+ mem_heap_t* entry_heap,
+ /*!< in/out: memory heap for allocating
+ big_rec and the index tuple */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller */
+ upd_t* update, /*!< in/out: update vector; this is allowed to
+ also contain trx id and roll ptr fields.
+ Non-updated columns that are moved offpage will
+ be appended to this. */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; must be
+ committed before latching any further pages */
+{
+ big_rec_t* big_rec_vec = NULL;
+ big_rec_t* dummy_big_rec;
+ dict_index_t* index;
+ buf_block_t* block;
+ page_zip_des_t* page_zip;
+ rec_t* rec;
+ page_cur_t* page_cursor;
+ dberr_t err;
+ dberr_t optim_err;
+ roll_ptr_t roll_ptr;
+ bool was_first;
+ uint32_t n_reserved = 0;
+
+ *offsets = NULL;
+ *big_rec = NULL;
+
+ block = btr_cur_get_block(cursor);
+ page_zip = buf_block_get_page_zip(block);
+ index = cursor->index;
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK |
+ MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+ ut_ad(!page_zip || !index->table->is_temporary());
+ /* The insert buffer tree should never be updated in place. */
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
+ || index->table->is_temporary());
+ ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
+ || dict_index_is_clust(index));
+ ut_ad(thr_get_trx(thr)->id == trx_id
+ || (flags & ulint(~BTR_KEEP_POS_FLAG))
+ == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
+
+ err = optim_err = btr_cur_optimistic_update(
+ flags | BTR_KEEP_IBUF_BITMAP,
+ cursor, offsets, offsets_heap, update,
+ cmpl_info, thr, trx_id, mtr);
+
+ switch (err) {
+ case DB_ZIP_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_OVERFLOW:
+ break;
+ default:
+ err_exit:
+ /* We suppressed this with BTR_KEEP_IBUF_BITMAP.
+ For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
+ already reset by btr_cur_update_alloc_zip() if the
+ page was recompressed. */
+ if (page_zip
+ && optim_err != DB_ZIP_OVERFLOW
+ && !dict_index_is_clust(index)
+ && page_is_leaf(block->frame)) {
+ ut_ad(!index->table->is_temporary());
+ ibuf_update_free_bits_zip(block, mtr);
+ }
+
+ if (big_rec_vec != NULL) {
+ dtuple_big_rec_free(big_rec_vec);
+ }
+
+ return(err);
+ }
+
+ rec = btr_cur_get_rec(cursor);
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ dtuple_t* new_entry;
+
+ const bool is_metadata = rec_is_metadata(rec, *index);
+
+ if (UNIV_UNLIKELY(is_metadata)) {
+ ut_ad(update->is_metadata());
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ ut_ad(index->is_instant());
+ new_entry = row_metadata_to_tuple(
+ rec, index, *offsets, entry_heap,
+ update->info_bits, !thr_get_trx(thr)->in_rollback);
+ ut_ad(new_entry->n_fields
+ == ulint(index->n_fields)
+ + update->is_alter_metadata());
+ } else {
+ new_entry = row_rec_to_index_entry(rec, index, *offsets,
+ entry_heap);
+ }
+
+ /* The page containing the clustered index record
+ corresponding to new_entry is latched in mtr. If the
+ clustered index record is delete-marked, then its externally
+ stored fields cannot have been purged yet, because then the
+ purge would also have removed the clustered index record
+ itself. Thus the following call is safe. */
+ row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
+ entry_heap);
+ btr_cur_trim(new_entry, index, update, thr);
+
+ /* We have to set appropriate extern storage bits in the new
+ record to be inserted: we have to remember which fields were such */
+
+ ut_ad(!page_is_comp(block->frame) || !rec_get_node_ptr_flag(rec));
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ if ((flags & BTR_NO_UNDO_LOG_FLAG)
+ && rec_offs_any_extern(*offsets)) {
+ /* We are in a transaction rollback undoing a row
+ update: we must free possible externally stored fields
+ which got new values in the update, if they are not
+ inherited values. They can be inherited if we have
+ updated the primary key to another value, and then
+ update it back again. */
+
+ ut_ad(big_rec_vec == NULL);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(thr_get_trx(thr)->in_rollback);
+
+ DEBUG_SYNC_C("blob_rollback_middle");
+
+ btr_rec_free_updated_extern_fields(
+ index, rec, block, *offsets, update, true, mtr);
+ }
+
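+ /* Externally stored columns are only supported in clustered
+ index records; for secondary indexes, n_ext must be 0. */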
+ ulint n_ext = index->is_primary() ? dtuple_get_n_ext(new_entry) : 0;
+
+ if (page_zip_rec_needs_ext(
+ rec_get_converted_size(index, new_entry, n_ext),
+ page_is_comp(block->frame),
+ dict_index_get_n_fields(index),
+ block->zip_size())
+ || (UNIV_UNLIKELY(update->is_alter_metadata())
+ && !dfield_is_ext(dtuple_get_nth_field(
+ new_entry,
+ index->first_user_field())))) {
+ big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
+ if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
+
+ /* We cannot goto return_after_reservations,
+ because we may need to update the
+ IBUF_BITMAP_FREE bits, which was suppressed by
+ BTR_KEEP_IBUF_BITMAP. */
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, block->frame,
+ index));
+#endif /* UNIV_ZIP_DEBUG */
+ index->table->space->release_free_extents(n_reserved);
+ err = DB_TOO_BIG_RECORD;
+ goto err_exit;
+ }
+
+ ut_ad(page_is_leaf(block->frame));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(flags & BTR_KEEP_POS_FLAG);
+ }
+
+ /* Do lock checking and undo logging */
+ err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
+ update, cmpl_info,
+ thr, mtr, &roll_ptr);
+ if (err != DB_SUCCESS) {
+ goto err_exit;
+ }
+
+ if (optim_err == DB_OVERFLOW) {
+
+ /* First reserve enough free space for the file segments
+ of the index tree, so that the update will not fail because
+ of lack of space */
+
+ uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3);
+
+ if (!fsp_reserve_free_extents(
+ &n_reserved, index->table->space, n_extents,
+ flags & BTR_NO_UNDO_LOG_FLAG
+ ? FSP_CLEANING : FSP_NORMAL,
+ mtr)) {
+ err = DB_OUT_OF_FILE_SPACE;
+ goto err_exit;
+ }
+ }
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
+ }
+
+ const ulint max_ins_size = page_zip
+ ? 0 : page_get_max_insert_size_after_reorganize(block->frame,
+ 1);
+
+ if (UNIV_UNLIKELY(is_metadata)) {
+ ut_ad(new_entry->is_metadata());
+ ut_ad(index->is_instant());
+ /* This can be innobase_add_instant_try() performing a
+ subsequent instant ALTER TABLE, or its rollback by
+ row_undo_mod_clust_low(). */
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ } else {
+ btr_search_update_hash_on_delete(cursor);
+
+ /* Store state of explicit locks on rec on the page
+ infimum record, before deleting rec. The page infimum
+ acts as a dummy carrier of the locks, taking care also
+ of lock releases, before we can move the locks back on
+ the actual record. There is a special case: if we are
+ inserting on the root page and the insert causes a
+ call to btr_root_raise_and_insert(). Therefore we
+ cannot delete, in the lock system, the lock structs
+ set on the root page even if the root page carries
+ just node pointers. */
+ if (!dict_table_is_locking_disabled(index->table)) {
+ lock_rec_store_on_page_infimum(block, rec);
+ }
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ page_cur_delete_rec(page_cursor, index, *offsets, mtr);
+
+ page_cur_move_to_prev(page_cursor);
+
+ rec = btr_cur_insert_if_possible(cursor, new_entry,
+ offsets, offsets_heap, n_ext, mtr);
+
+ if (rec) {
+ page_cursor->rec = rec;
+
+ if (UNIV_UNLIKELY(is_metadata)) {
+ /* We must empty the PAGE_FREE list, because if this
+ was a rollback, the shortened metadata record
+ would have too many fields, and we would be unable to
+ know the size of the freed record. */
+ btr_page_reorganize(page_cursor, index, mtr);
+ rec = page_cursor->rec;
+ rec_offs_make_valid(rec, index, true, *offsets);
+ if (page_cursor->block->page.id().page_no()
+ == index->page) {
+ btr_set_instant(page_cursor->block, *index,
+ mtr);
+ }
+ } else if (!dict_table_is_locking_disabled(index->table)) {
+ lock_rec_restore_from_page_infimum(
+ btr_cur_get_block(cursor), rec, block);
+ }
+
+ if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))
+ || rec_is_alter_metadata(rec, *index)) {
+ /* The new inserted record owns its possible externally
+ stored fields */
+ btr_cur_unmark_extern_fields(btr_cur_get_block(cursor),
+ rec, index, *offsets, mtr);
+ } else {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(row_get_rec_trx_id(rec, index, *offsets));
+ }
+
+ bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
+ ut_ad(!adjust || page_is_leaf(block->frame));
+
+ if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
+ if (adjust) {
+ rec_offs_make_valid(page_cursor->rec, index,
+ true, *offsets);
+ }
+ } else if (!dict_index_is_clust(index)
+ && page_is_leaf(block->frame)) {
+ /* Update the free bits in the insert buffer.
+ This is the same block which was skipped by
+ BTR_KEEP_IBUF_BITMAP. */
+ if (page_zip) {
+ ut_ad(!index->table->is_temporary());
+ ibuf_update_free_bits_zip(block, mtr);
+ } else if (!index->table->is_temporary()) {
+ ibuf_update_free_bits_low(block, max_ins_size,
+ mtr);
+ }
+ }
+
+ if (!srv_read_only_mode
+ && !big_rec_vec
+ && page_is_leaf(block->frame)
+ && !dict_index_is_online_ddl(index)) {
+
+ mtr_memo_release(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
+
+ /* NOTE: We cannot release the root block latch here,
+ because it contains segment headers and has already
+ been modified in most cases. */
+ }
+
+ err = DB_SUCCESS;
+ goto return_after_reservations;
+ } else {
+ /* If the page is compressed and it initially
+ compresses very well, and there is a subsequent insert
+ of a badly-compressing record, it is possible for
+ btr_cur_optimistic_update() to return DB_UNDERFLOW and
+ btr_cur_insert_if_possible() to return NULL. */
+ ut_a(page_zip || optim_err != DB_UNDERFLOW);
+
+ /* Out of space: reset the free bits.
+ This is the same block which was skipped by
+ BTR_KEEP_IBUF_BITMAP. */
+ if (!dict_index_is_clust(index)
+ && !index->table->is_temporary()
+ && page_is_leaf(block->frame)) {
+ ibuf_reset_free_bits(block);
+ }
+ }
+
+ if (big_rec_vec != NULL) {
+ ut_ad(page_is_leaf(block->frame));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(flags & BTR_KEEP_POS_FLAG);
+
+ /* btr_page_split_and_insert() in
+ btr_cur_pessimistic_insert() invokes
+ mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK).
+ We must keep the index->lock when we created a
+ big_rec, so that row_upd_clust_rec() can store the
+ big_rec in the same mini-transaction. */
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ mtr_sx_lock_index(index, mtr);
+ }
+
+ /* Was the record to be updated positioned as the first user
+ record on its page? */
+ was_first = page_cur_is_before_first(page_cursor);
+
+ /* Lock checks and undo logging were already performed by
+ btr_cur_upd_lock_and_undo(). We do not try
+ btr_cur_optimistic_insert() because
+ btr_cur_insert_if_possible() already failed above. */
+
+ err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ cursor, offsets, offsets_heap,
+ new_entry, &rec,
+ &dummy_big_rec, n_ext, NULL, mtr);
+ ut_a(rec);
+ ut_a(err == DB_SUCCESS);
+ ut_a(dummy_big_rec == NULL);
+ ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
+ page_cursor->rec = rec;
+
+ /* Multiple transactions cannot operate on the same
+ temporary table in parallel.
+ max_trx_id is ignored for temp tables because it is
+ not required for MVCC. */
+ if (dict_index_is_sec_or_ibuf(index)
+ && !index->table->is_temporary()) {
+ /* Update PAGE_MAX_TRX_ID in the index page header.
+ It was not updated by btr_cur_pessimistic_insert()
+ because of BTR_NO_LOCKING_FLAG. */
+ page_update_max_trx_id(btr_cur_get_block(cursor),
+ btr_cur_get_page_zip(cursor),
+ trx_id, mtr);
+ }
+
+ if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
+ /* The new inserted record owns its possible externally
+ stored fields */
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, block->frame,
+ index));
+#endif /* UNIV_ZIP_DEBUG */
+ btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), rec,
+ index, *offsets, mtr);
+ } else {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(row_get_rec_trx_id(rec, index, *offsets));
+ }
+
+ if (UNIV_UNLIKELY(is_metadata)) {
+ /* We must empty the PAGE_FREE list, because if this
+ was a rollback, the shortened metadata record
+ would have too many fields, and we would be unable to
+ know the size of the freed record. */
+ btr_page_reorganize(page_cursor, index, mtr);
+ rec = page_cursor->rec;
+ } else if (!dict_table_is_locking_disabled(index->table)) {
+ lock_rec_restore_from_page_infimum(
+ btr_cur_get_block(cursor), rec, block);
+ }
+
+ /* If necessary, restore also the correct lock state for a new,
+ preceding supremum record created in a page split. While the old
+ record was nonexistent, the supremum might have inherited its locks
+ from a wrong record. */
+
+ if (!was_first && !dict_table_is_locking_disabled(index->table)) {
+ btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
+ rec, mtr);
+ }
+
+return_after_reservations:
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(btr_cur_get_page_zip(cursor),
+ btr_cur_get_page(cursor), index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ index->table->space->release_free_extents(n_reserved);
+ *big_rec = big_rec_vec;
+ return(err);
+}
+
+/*==================== B-TREE DELETE MARK AND UNMARK ===============*/
+
+/** Modify the delete-mark flag of a record.
+@tparam flag the value of the delete-mark flag
+@param[in,out] block buffer block
+@param[in,out] rec record on a physical index page
+@param[in,out] mtr mini-transaction */
+template<bool flag>
+void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
+{
+ if (page_rec_is_comp(rec))
+ {
+ byte *b= &rec[-REC_NEW_INFO_BITS];
+ const byte v= flag
+ ? (*b | REC_INFO_DELETED_FLAG)
+ : (*b & byte(~REC_INFO_DELETED_FLAG));
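+ /* Avoid any write if the bits already have the desired
+ value. */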
+ if (*b == v);
+ else if (UNIV_LIKELY_NULL(block->page.zip.data))
+ {
+ *b= v;
+ page_zip_rec_set_deleted(block, rec, flag, mtr);
+ }
+ else
+ mtr->write<1>(*block, b, v);
+ }
+ else
+ {
+ ut_ad(!block->page.zip.data);
+ byte *b= &rec[-REC_OLD_INFO_BITS];
+ const byte v = flag
+ ? (*b | REC_INFO_DELETED_FLAG)
+ : (*b & byte(~REC_INFO_DELETED_FLAG));
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, b, v);
+ }
+}
+
+template void btr_rec_set_deleted<false>(buf_block_t *, rec_t *, mtr_t *);
+template void btr_rec_set_deleted<true>(buf_block_t *, rec_t *, mtr_t *);
+
+/***********************************************************//**
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field pointer to the
+undo log record created.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+dberr_t
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+ buf_block_t* block, /*!< in/out: buffer block of the record */
+ rec_t* rec, /*!< in/out: record */
+ dict_index_t* index, /*!< in: clustered index of the record */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec) */
+ que_thr_t* thr, /*!< in: query thread */
+ const dtuple_t* entry, /*!< in: dtuple for the deleting record, also
+ contains the virtual cols if there are any */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ roll_ptr_t roll_ptr;
+ dberr_t err;
+ trx_t* trx;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+ ut_ad(buf_block_get_frame(block) == page_align(rec));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(mtr->is_named_space(index->table->space));
+
+ if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+ /* We may already have delete-marked this record
+ when executing an ON DELETE CASCADE operation. */
+ ut_ad(row_get_rec_trx_id(rec, index, offsets)
+ == thr_get_trx(thr)->id);
+ return(DB_SUCCESS);
+ }
+
+ err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
+ rec, index, offsets, thr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = trx_undo_report_row_operation(thr, index,
+ entry, NULL, 0, rec, offsets,
+ &roll_ptr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ /* The search latch is not needed here, because
+ the adaptive hash index does not depend on the delete-mark
+ and the delete-mark is being updated in place. */
+
+ btr_rec_set_deleted<true>(block, rec, mtr);
+
+ trx = thr_get_trx(thr);
+
+ DBUG_LOG("ib_cur",
+ "delete-mark clust " << index->table->name
+ << " (" << index->id << ") by "
+ << ib::hex(trx_get_id_for_print(trx)) << ": "
+ << rec_printer(rec, offsets).str());
+
+ if (dict_index_is_online_ddl(index)) {
+ row_log_table_delete(rec, index, offsets, NULL);
+ }
+
+ btr_cur_upd_rec_sys(block, rec, index, offsets, trx, roll_ptr, mtr);
+ return(err);
+}
+
+/*==================== B-TREE RECORD REMOVE =========================*/
+
+/*************************************************************//**
+Tries to compress a page of the tree if it seems useful. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done!
+@return TRUE if compression occurred */
+ibool
+btr_cur_compress_if_useful(
+/*=======================*/
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
+ cursor does not stay valid if !adjust and
+ compression occurs */
+ ibool adjust, /*!< in: TRUE if should adjust the
+ cursor position even if compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mtr->memo_contains_flagged(&cursor->index->lock,
+ MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+ if (cursor->index->is_spatial()) {
+ const trx_t* trx = cursor->rtr_info->thr
+ ? thr_get_trx(cursor->rtr_info->thr)
+ : NULL;
+ const buf_block_t* block = btr_cur_get_block(cursor);
+
+ /* Check whether page lock prevents the compression */
+ if (!lock_test_prdt_page_lock(trx, block->page.id())) {
+ return(false);
+ }
+ }
+
+ return(btr_cur_compress_recommendation(cursor, mtr)
+ && btr_compress(cursor, adjust, mtr));
+}
+
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned on a leaf page.
+It is assumed that the mtr has an x-latch on the page where the cursor is
+positioned, but no latch on the whole tree.
+@return TRUE if success, i.e., the page did not become too empty */
+ibool
+btr_cur_optimistic_delete_func(
+/*===========================*/
+ btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to
+ delete; cursor stays valid: if deletion
+ succeeds, on function exit it points to the
+ successor of the deleted record */
+#ifdef UNIV_DEBUG
+ ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
+#endif /* UNIV_DEBUG */
+ mtr_t* mtr) /*!< in: mtr; if this function returns
+ TRUE on a leaf page of a secondary
+ index, the mtr must be committed
+ before latching any further pages */
+{
+ buf_block_t* block;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
+ ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr->is_named_space(cursor->index->table->space));
+ ut_ad(!cursor->index->is_dummy);
+
+ /* This is intended only for leaf page deletions */
+
+ block = btr_cur_get_block(cursor);
+
+ ut_ad(block->page.id().space() == cursor->index->table->space->id);
+ ut_ad(page_is_leaf(buf_block_get_frame(block)));
+ ut_ad(!dict_index_is_online_ddl(cursor->index)
+ || dict_index_is_clust(cursor->index)
+ || (flags & BTR_CREATE_FLAG));
+
+ rec = btr_cur_get_rec(cursor);
+
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ cursor->index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
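+ /* The deletion can be done optimistically only if the record
+ has no externally stored columns and removing it would not
+ shrink the page enough to recommend a page merge. */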
+ const ibool no_compress_needed = !rec_offs_any_extern(offsets)
+ && btr_cur_can_delete_without_compress(
+ cursor, rec_offs_size(offsets), mtr);
+
+ if (!no_compress_needed) {
+ /* prefetch siblings of the leaf for the pessimistic
+ operation. */
+ btr_cur_prefetch_siblings(block, cursor->index);
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index->page
+ && page_get_n_recs(block->frame) == 1
+ + (cursor->index->is_instant()
+ && !rec_is_metadata(rec, *cursor->index))
+ && !cursor->index->must_avoid_clear_instant_add())) {
+ /* The whole index (and table) becomes logically empty.
+ Empty the whole page. That is, if we are deleting the
+ only user record, also delete the metadata record
+ if one exists for instant ADD COLUMN (not generic ALTER TABLE).
+ If we are deleting the metadata record and the
+ table becomes empty, clean up the whole page. */
+ dict_index_t* index = cursor->index;
+ const rec_t* first_rec = page_rec_get_next_const(
+ page_get_infimum_rec(block->frame));
+ ut_ad(!index->is_instant()
+ || rec_is_metadata(first_rec, *index));
+ const bool is_metadata = rec_is_metadata(rec, *index);
+ /* We can remove the metadata when rolling back an
+ instant ALTER TABLE operation, or when deleting the
+ last user record on the page such that only metadata for
+ instant ADD COLUMN (not generic ALTER TABLE) remains. */
+ const bool empty_table = is_metadata
+ || !index->is_instant()
+ || (first_rec != rec
+ && rec_is_add_metadata(first_rec, *index));
+ if (UNIV_LIKELY(empty_table)) {
+ if (UNIV_LIKELY(!is_metadata)) {
+ lock_update_delete(block, rec);
+ }
+ btr_page_empty(block, buf_block_get_page_zip(block),
+ index, 0, mtr);
+ if (index->is_instant()) {
+ /* MDEV-17383: free metadata BLOBs! */
+ index->clear_instant_alter();
+ }
+ page_cur_set_after_last(block,
+ btr_cur_get_page_cur(cursor));
+ goto func_exit;
+ }
+ }
+
+ {
+ page_t* page = buf_block_get_frame(block);
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+
+ if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec))
+ & REC_INFO_MIN_REC_FLAG)) {
+ /* This should be rolling back instant ADD COLUMN.
+ If this is a recovered transaction, then
+ index->is_instant() will hold until the
+ insert into SYS_COLUMNS is rolled back. */
+ ut_ad(cursor->index->table->supports_instant());
+ ut_ad(cursor->index->is_primary());
+ ut_ad(!page_zip);
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ cursor->index, offsets, mtr);
+ /* We must empty the PAGE_FREE list, because
+ after rollback, this deleted metadata record
+ would have too many fields, and we would be
+ unable to know the size of the freed record. */
+ btr_page_reorganize(btr_cur_get_page_cur(cursor),
+ cursor->index, mtr);
+ goto func_exit;
+ } else {
+ lock_update_delete(block, rec);
+
+ btr_search_update_hash_on_delete(cursor);
+ }
+
+ if (page_zip) {
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, cursor->index));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ cursor->index, offsets, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, cursor->index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* On compressed pages, the IBUF_BITMAP_FREE
+ space is not affected by deleting (purging)
+ records, because it is defined as the minimum
+ of space available *without* reorganize, and
+ space available in the modification log. */
+ } else {
+ const ulint max_ins
+ = page_get_max_insert_size_after_reorganize(
+ page, 1);
+
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ cursor->index, offsets, mtr);
+
+ /* The change buffer does not handle inserts
+ into non-leaf pages, into clustered indexes,
+ or into the change buffer. */
+ if (!dict_index_is_clust(cursor->index)
+ && !cursor->index->table->is_temporary()
+ && !dict_index_is_ibuf(cursor->index)) {
+ ibuf_update_free_bits_low(block, max_ins, mtr);
+ }
+ }
+ }
+
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(no_compress_needed);
+}
+
+/*************************************************************//**
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page. To avoid deadlocks,
+mtr must also own x-latches to brothers of page, if those brothers
+exist.
+@return TRUE if compression occurred, or FALSE if it did not or if
+something went wrong. */
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+ dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+ the latter may occur because we may have
+ to update node pointers on upper levels,
+ and in the case of variable length keys
+ these may actually grow in size */
+ ibool has_reserved_extents, /*!< in: TRUE if the
+ caller has already reserved enough free
+ extents so that the operation is
+ guaranteed to succeed */
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ if compression does not occur, the cursor
+ stays valid: it points to successor of
+ deleted record on function exit */
+ ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
+ bool rollback,/*!< in: performing rollback? */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ dict_index_t* index;
+ rec_t* rec;
+ uint32_t n_reserved = 0;
+ bool success;
+ ibool ret = FALSE;
+ mem_heap_t* heap;
+ rec_offs* offsets;
+#ifdef UNIV_DEBUG
+ bool parent_latched = false;
+#endif /* UNIV_DEBUG */
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ index = btr_cur_get_index(cursor);
+
+ ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr->is_named_space(index->table->space));
+ ut_ad(!index->is_dummy);
+ ut_ad(block->page.id().space() == index->table->space->id);
+
+ if (!has_reserved_extents) {
+ /* First reserve enough free space for the file segments
+ of the index tree, so that the node pointer updates will
+ not fail because of lack of space */
+
+ uint32_t n_extents = uint32_t(cursor->tree_height / 32 + 1);
+
+ success = fsp_reserve_free_extents(&n_reserved,
+ index->table->space,
+ n_extents,
+ FSP_CLEANING, mtr);
+ if (!success) {
+ *err = DB_OUT_OF_FILE_SPACE;
+
+ return(FALSE);
+ }
+ }
+
+ heap = mem_heap_create(1024);
+ rec = btr_cur_get_rec(cursor);
+ page_zip = buf_block_get_page_zip(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+
+ if (rec_offs_any_extern(offsets)) {
+ btr_rec_free_externally_stored_fields(index,
+ rec, offsets, block,
+ rollback, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ rec_t* next_rec = NULL;
+ bool min_mark_next_rec = false;
+
+ if (page_is_leaf(page)) {
+ const bool is_metadata = rec_is_metadata(
+ rec, page_rec_is_comp(rec));
+ if (UNIV_UNLIKELY(is_metadata)) {
+ /* This should be rolling back instant ALTER TABLE.
+ If this is a recovered transaction, then
+ index->is_instant() will hold until the
+ insert into SYS_COLUMNS is rolled back. */
+ ut_ad(rollback);
+ ut_ad(index->table->supports_instant());
+ ut_ad(index->is_primary());
+ } else if (flags == 0) {
+ lock_update_delete(block, rec);
+ }
+
+ if (block->page.id().page_no() != index->page) {
+ if (page_get_n_recs(page) < 2) {
+ goto discard_page;
+ }
+ } else if (page_get_n_recs(page) == 1
+ + (index->is_instant() && !is_metadata)
+ && !index->must_avoid_clear_instant_add()) {
+ /* The whole index (and table) becomes logically empty.
+ Empty the whole page. That is, if we are deleting the
+ only user record, also delete the metadata record
+ if one exists for instant ADD COLUMN
+ (not generic ALTER TABLE).
+ If we are deleting the metadata record
+ (in the rollback of instant ALTER TABLE) and the
+ table becomes empty, clean up the whole page. */
+
+ const rec_t* first_rec = page_rec_get_next_const(
+ page_get_infimum_rec(page));
+ ut_ad(!index->is_instant()
+ || rec_is_metadata(first_rec, *index));
+ if (is_metadata || !index->is_instant()
+ || (first_rec != rec
+ && rec_is_add_metadata(first_rec, *index))) {
+ btr_page_empty(block, page_zip, index, 0, mtr);
+ if (index->is_instant()) {
+ /* MDEV-17383: free metadata BLOBs! */
+ index->clear_instant_alter();
+ }
+ page_cur_set_after_last(
+ block,
+ btr_cur_get_page_cur(cursor));
+ ret = TRUE;
+ goto return_after_reservations;
+ }
+ }
+
+ if (UNIV_LIKELY(!is_metadata)) {
+ btr_search_update_hash_on_delete(cursor);
+ } else {
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ index, offsets, mtr);
+ /* We must empty the PAGE_FREE list, because
+ after rollback, this deleted metadata record
+ would carry too many fields, and we would be
+ unable to know the size of the freed record. */
+ btr_page_reorganize(btr_cur_get_page_cur(cursor),
+ index, mtr);
+ ut_ad(!ret);
+ goto return_after_reservations;
+ }
+ } else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) {
+ if (page_rec_is_last(rec, page)) {
+discard_page:
+ ut_ad(page_get_n_recs(page) == 1);
+ /* If there is only one record, drop
+ the whole page. */
+
+ btr_discard_page(cursor, mtr);
+
+ ret = TRUE;
+ goto return_after_reservations;
+ }
+
+ next_rec = page_rec_get_next(rec);
+
+ if (!page_has_prev(page)) {
+ /* If we delete the leftmost node pointer on a
+ non-leaf level, we must mark the new leftmost node
+ pointer as the predefined minimum record */
+
+ min_mark_next_rec = true;
+ } else if (index->is_spatial()) {
+ /* For an R-tree, if we delete the leftmost node
+ pointer, we need to update the parent page. */
+ rtr_mbr_t father_mbr;
+ rec_t* father_rec;
+ btr_cur_t father_cursor;
+ rec_offs* offsets;
+ bool upd_ret;
+ ulint len;
+
+ rtr_page_get_father_block(NULL, heap, index,
+ block, mtr, NULL,
+ &father_cursor);
+ offsets = rec_get_offsets(
+ btr_cur_get_rec(&father_cursor), index, NULL,
+ 0, ULINT_UNDEFINED, &heap);
+
+ father_rec = btr_cur_get_rec(&father_cursor);
+ rtr_read_mbr(rec_get_nth_field(
+ father_rec, offsets, 0, &len), &father_mbr);
+
+ upd_ret = rtr_update_mbr_field(&father_cursor, offsets,
+ NULL, page, &father_mbr,
+ next_rec, mtr);
+
+ if (!upd_ret) {
+ *err = DB_ERROR;
+
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+
+ ut_d(parent_latched = true);
+ } else {
+ /* Otherwise, if we delete the leftmost node pointer
+ on a page, we have to change the parent node pointer
+ so that it is equal to the new leftmost node pointer
+ on the page */
+ btr_cur_t cursor;
+ btr_page_get_father(index, block, mtr, &cursor);
+ btr_cur_node_ptr_delete(&cursor, mtr);
+ const ulint level = btr_page_get_level(page);
+ // FIXME: reuse the node_ptr from above
+ dtuple_t* node_ptr = dict_index_build_node_ptr(
+ index, next_rec, block->page.id().page_no(),
+ heap, level);
+
+ btr_insert_on_non_leaf_level(
+ flags, index, level + 1, node_ptr, mtr);
+
+ ut_d(parent_latched = true);
+ }
+ }
+
+ /* A SPATIAL INDEX never uses SX locks; we can allow page
+ merges while holding an X lock on the spatial index tree.
+ Do not allow merges of non-leaf B-tree pages unless it is
+ safe to do so. */
+ {
+ const bool allow_merge = page_is_leaf(page)
+ || dict_index_is_spatial(index)
+ || btr_cur_will_modify_tree(
+ index, page, BTR_INTENTION_DELETE, rec,
+ btr_node_ptr_max_size(index),
+ block->zip_size(), mtr);
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor), index,
+ offsets, mtr);
+
+ if (min_mark_next_rec) {
+ btr_set_min_rec_mark(next_rec, *block, mtr);
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ ut_ad(!parent_latched
+ || btr_check_node_ptr(index, block, mtr));
+
+ if (!ret && btr_cur_compress_recommendation(cursor, mtr)) {
+ if (UNIV_LIKELY(allow_merge)) {
+ ret = btr_cur_compress_if_useful(
+ cursor, FALSE, mtr);
+ } else {
+ ib::warn() << "Not merging page "
+ << block->page.id()
+ << " in index " << index->name
+ << " of " << index->table->name;
+ ut_ad("MDEV-14637" == 0);
+ }
+ }
+ }
+
+return_after_reservations:
+ *err = DB_SUCCESS;
+
+ mem_heap_free(heap);
+
+ if (!srv_read_only_mode
+ && page_is_leaf(page)
+ && !dict_index_is_online_ddl(index)) {
+
+ mtr_memo_release(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
+
+ /* NOTE: We cannot release the root block latch here, because
+ it contains the segment header and has already been modified
+ in most cases. */
+ }
+
+ index->table->space->release_free_extents(n_reserved);
+ return(ret);
+}
+
+/** Delete the node pointer in a parent page.
+@param[in,out] parent cursor pointing to parent record
+@param[in,out] mtr mini-transaction */
+void btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
+{
+ ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(parent),
+ MTR_MEMO_PAGE_X_FIX));
+ dberr_t err;
+ ibool compressed = btr_cur_pessimistic_delete(&err, TRUE, parent,
+ BTR_CREATE_FLAG, false,
+ mtr);
+ ut_a(err == DB_SUCCESS);
+ if (!compressed) {
+ btr_cur_compress_if_useful(parent, FALSE, mtr);
+ }
+}
+
+/*******************************************************************//**
+Adds path information to the cursor for the current page, for which
+the binary search has been performed. */
+static
+void
+btr_cur_add_path_info(
+/*==================*/
+ btr_cur_t* cursor, /*!< in: cursor positioned on a page */
+ ulint height, /*!< in: height of the page in tree;
+ 0 means leaf node */
+ ulint root_height) /*!< in: root node height in tree */
+{
+ btr_path_t* slot;
+
+ ut_a(cursor->path_arr);
+
+ if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
+ /* Do nothing; return empty path */
+
+ slot = cursor->path_arr;
+ slot->nth_rec = ULINT_UNDEFINED;
+
+ return;
+ }
+
+ if (height == 0) {
+ /* Mark end of slots for path */
+ slot = cursor->path_arr + root_height + 1;
+ slot->nth_rec = ULINT_UNDEFINED;
+ }
+
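+ /* Note on the indexing below: the root page lands in
+ path_arr[0] and a leaf page (height == 0) in
+ path_arr[root_height], because the slot index is
+ root_height - height. */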
+ slot = cursor->path_arr + (root_height - height);
+
+ const buf_block_t* block = btr_cur_get_block(cursor);
+
+ slot->nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
+ slot->n_recs = page_get_n_recs(block->frame);
+ slot->page_no = block->page.id().page_no();
+ slot->page_level = btr_page_get_level(block->frame);
+}
+
+/*******************************************************************//**
+Estimate the number of rows between slot1 and slot2 for any level on a
+B-tree. This function starts from slot1->page and reads a few pages to
+the right, counting their records. If we reach slot2->page quickly then
+we know exactly how many records there are between slot1 and slot2 and
+we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
+then we calculate the average number of records in the pages scanned
+so far and assume that all pages that we did not scan up to slot2->page
+ contain the same number of records, then we multiply that average by
+the number of pages between slot1->page and slot2->page (which is
+n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
+@return number of rows, not including the borders (exact or estimated) */
+static
+ha_rows
+btr_estimate_n_rows_in_range_on_level(
+/*==================================*/
+ dict_index_t* index, /*!< in: index */
+ btr_path_t* slot1, /*!< in: left border */
+ btr_path_t* slot2, /*!< in: right border */
+ ha_rows n_rows_on_prev_level, /*!< in: number of rows
+ on the previous level for the
+ same descend paths; used to
+ determine the number of pages
+ on this level */
+ bool* is_n_rows_exact) /*!< out: TRUE if the returned
+ value is exact i.e. not an
+ estimation */
+{
+ ha_rows n_rows = 0;
+ uint n_pages_read = 0;
+ ulint level;
+
+ /* Assume by default that we will scan all pages between
+ slot1->page_no and slot2->page_no. */
+ *is_n_rows_exact = true;
+
+ /* Add records from slot1->page_no which are to the right of
+ the record which serves as a left border of the range, if any
+ (we don't include the record itself in this count). */
+ if (slot1->nth_rec <= slot1->n_recs) {
+ n_rows += slot1->n_recs - slot1->nth_rec;
+ }
+
+ /* Add records from slot2->page_no which are to the left of
+ the record which serves as a right border of the range, if any
+ (we don't include the record itself in this count). */
+ if (slot2->nth_rec > 1) {
+ n_rows += slot2->nth_rec - 1;
+ }
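+ /* For example, if slot1 points to record 3 of 7 on its page,
+ the 4 records to its right are added; if slot2 points to
+ record 5 on its page, the 4 records to its left are added. */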
+
+ /* Count the records in the pages between slot1->page_no and
+ slot2->page_no (non inclusive), if any. */
+
+ /* Do not read more than this number of pages in order not to hurt
+ performance with this code which is just an estimation. If we read
+ this many pages before reaching slot2->page_no then we estimate the
+ average from the pages scanned so far. */
+# define N_PAGES_READ_LIMIT 10
+
+ const fil_space_t* space = index->table->space;
+ page_id_t page_id(space->id, slot1->page_no);
+ const ulint zip_size = space->zip_size();
+
+ level = slot1->page_level;
+
+ do {
+ mtr_t mtr;
+ page_t* page;
+ buf_block_t* block;
+ dberr_t err=DB_SUCCESS;
+
+ mtr_start(&mtr);
+
+ /* Fetch the page. Because we are not holding the
+ index->lock, the tree may have changed and we may be
+ attempting to read a page that is no longer part of
+ the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
+ silence a debug assertion about this. */
+ block = buf_page_get_gen(page_id, zip_size, RW_S_LATCH,
+ NULL, BUF_GET_POSSIBLY_FREED,
+ __FILE__, __LINE__, &mtr, &err);
+
+ ut_ad((block != NULL) == (err == DB_SUCCESS));
+
+ if (!block) {
+ if (err == DB_DECRYPTION_FAILED) {
+ ib_push_warning((void *)NULL,
+ DB_DECRYPTION_FAILED,
+ "Table %s is encrypted but encryption service or"
+ " used key_id is not available. "
+ " Can't continue reading table.",
+ index->table->name.m_name);
+ index->table->file_unreadable = true;
+ }
+
+ mtr_commit(&mtr);
+ goto inexact;
+ }
+
+ page = buf_block_get_frame(block);
+
+ /* It is possible that the tree has been reorganized in the
+ meantime and this is a different page. If this happens the
+ calculated estimate will be bogus, which is not fatal as
+ this is only an estimate. We are sure that a page with
+ page_no exists because InnoDB never frees pages, only
+ reuses them. */
+ if (!fil_page_index_page_check(page)
+ || btr_page_get_index_id(page) != index->id
+ || btr_page_get_level(page) != level) {
+
+ /* The page got reused for something else */
+ mtr_commit(&mtr);
+ goto inexact;
+ }
+
+ /* It is possible but highly unlikely that the page was
+ originally written by an old version of InnoDB that did
+ not initialize FIL_PAGE_TYPE on other than B-tree pages.
+ For example, this could be an almost-empty BLOB page
+ that happens to contain the magic values in the fields
+ that we checked above. */
+
+ n_pages_read++;
+
+ if (page_id.page_no() != slot1->page_no) {
+ /* Do not count the records on slot1->page_no,
+ we already counted them before this loop. */
+ n_rows += page_get_n_recs(page);
+ }
+
+ page_id.set_page_no(btr_page_get_next(page));
+
+ mtr_commit(&mtr);
+
+ if (n_pages_read == N_PAGES_READ_LIMIT
+ || page_id.page_no() == FIL_NULL) {
+ /* Either we read too many pages, or
+ we reached the end of the level without passing
+ through slot2->page_no; the tree must have changed
+ in the meantime. */
+ goto inexact;
+ }
+
+ } while (page_id.page_no() != slot2->page_no);
+
+ return(n_rows);
+
+inexact:
+
+ *is_n_rows_exact = false;
+
+ /* We stopped before reaching slot2->page_no */
+
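+ /* Illustrative example: if we scanned n_pages_read = 10 pages
+ containing n_rows = 2000 records before giving up, and
+ n_rows_on_prev_level = 50 pages are expected on this level,
+ the estimate below becomes 50 * 2000 / 10 = 10000 rows. */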
+ if (n_pages_read > 0) {
+ /* The number of pages on this level is
+ n_rows_on_prev_level, multiply it by the
+ average number of recs per page so far */
+ n_rows = n_rows_on_prev_level * n_rows / n_pages_read;
+ } else {
+ /* The tree changed before we could even
+ start with slot1->page_no */
+ n_rows = 10;
+ }
+
+ return(n_rows);
+}
+
+/** If the tree gets changed too much between the two dives for the left
+and right boundary then btr_estimate_n_rows_in_range_low() will retry
+that many times before giving up and returning the value stored in
+rows_in_range_arbitrary_ret_val. */
+static const unsigned rows_in_range_max_retries = 4;
+
+/** We pretend that a range has that many records if the tree keeps changing
+for rows_in_range_max_retries retries while we try to estimate the records
+in a given range. */
+static const ha_rows rows_in_range_arbitrary_ret_val = 10;
+
+/** Estimates the number of rows in a given index range.
+@param[in] index index
+@param[in] tuple1 range start
+@param[in] tuple2 range end
+@param[in] nth_attempt if the tree gets modified too much while
+we are trying to analyze it, then we will retry (this function will call
+itself, incrementing this parameter)
+@return estimated number of rows; if after rows_in_range_max_retries
+retries the tree keeps changing, then we will just return
+rows_in_range_arbitrary_ret_val as a result (if
+nth_attempt >= rows_in_range_max_retries and the tree is modified between
+the two dives). */
+static
+ha_rows
+btr_estimate_n_rows_in_range_low(
+ dict_index_t* index,
+ btr_pos_t* tuple1,
+ btr_pos_t* tuple2,
+ unsigned nth_attempt)
+{
+ btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS];
+ btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS];
+ btr_cur_t cursor;
+ btr_path_t* slot1;
+ btr_path_t* slot2;
+ bool diverged;
+ bool diverged_lot;
+ ulint divergence_level;
+ ha_rows n_rows;
+ bool is_n_rows_exact;
+ ulint i;
+ mtr_t mtr;
+ ha_rows table_n_rows;
+ page_cur_mode_t mode2= tuple2->mode;
+
+ table_n_rows = dict_table_get_n_rows(index->table);
+
+ /* Below we dive to the two records specified by tuple1 and tuple2 and
+ we remember the entire dive paths from the tree root. The place where
+ the tuple1 path ends on the leaf level we call "left border" of our
+ interval and the place where the tuple2 path ends on the leaf level -
+ "right border". We take care to either include or exclude the interval
+ boundaries depending on whether <, <=, > or >= was specified. For
+ example if "5 < x AND x <= 10" then we should not include the left
+ boundary, but should include the right one. */
+
+ mtr_start(&mtr);
+
+ cursor.path_arr = path1;
+
+ bool should_count_the_left_border;
+
+ if (dtuple_get_n_fields(tuple1->tuple) > 0) {
+
+ btr_cur_search_to_nth_level(index, 0, tuple1->tuple,
+ tuple1->mode,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, 0,
+ __FILE__, __LINE__, &mtr);
+
+ ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor)));
+
+ /* We should count the border if there are any records to
+ match the criteria, i.e. if the maximum record on the tree is
+ 5 and x > 3 is specified then the cursor will be positioned at
+ 5 and we should count the border, but if x > 7 is specified,
+ then the cursor will be positioned at 'sup' on the rightmost
+ leaf page in the tree and we should not count the border. */
+ should_count_the_left_border
+ = !page_rec_is_supremum(btr_cur_get_rec(&cursor));
+ } else {
+ dberr_t err = DB_SUCCESS;
+
+ err = btr_cur_open_at_index_side(true, index,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, 0, &mtr);
+
+ if (err != DB_SUCCESS) {
+ ib::warn() << " Error code: " << err
+ << " btr_estimate_n_rows_in_range_low "
+ << " called from file: "
+ << __FILE__ << " line: " << __LINE__
+ << " table: " << index->table->name
+ << " index: " << index->name;
+ }
+
+ ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor)));
+
+ /* The range specified is without a left border, just
+ 'x < 123' or 'x <= 123' and btr_cur_open_at_index_side()
+ positioned the cursor on the infimum record on the leftmost
+ page, which must not be counted. */
+ should_count_the_left_border = false;
+ }
+
+ tuple1->page_id= cursor.page_cur.block->page.id();
+
+ mtr_commit(&mtr);
+
+ if (!index->is_readable()) {
+ return 0;
+ }
+
+ mtr_start(&mtr);
+
+ cursor.path_arr = path2;
+
+ bool should_count_the_right_border;
+
+ if (dtuple_get_n_fields(tuple2->tuple) > 0) {
+
+ btr_cur_search_to_nth_level(index, 0, tuple2->tuple,
+ mode2,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, 0,
+ __FILE__, __LINE__, &mtr);
+
+ const rec_t* rec = btr_cur_get_rec(&cursor);
+
+ ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec)));
+
+ should_count_the_right_border
+ = (mode2 == PAGE_CUR_LE /* if the range is '<=' */
+ /* and the record was found */
+ && cursor.low_match >= dtuple_get_n_fields(tuple2->tuple))
+ || (mode2 == PAGE_CUR_L /* or if the range is '<' */
+ /* and there are any records to match the criteria,
+ i.e. if the minimum record on the tree is 5 and
+ x < 7 is specified then the cursor will be
+ positioned at 5 and we should count the border, but
+ if x < 2 is specified, then the cursor will be
+ positioned at 'inf' and we should not count the
+ border */
+ && !page_rec_is_infimum(rec));
+ /* Notice that for "WHERE col <= 'foo'" MySQL passes to
+ ha_innobase::records_in_range():
+ min_key=NULL (left-unbounded) which is expected
+ max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
+ unexpected - one would expect
+ flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the
+ cursor will be positioned on the first record to the right of
+ the requested one (can also be positioned on the 'sup') and
+ we should not count the right border. */
+ } else {
+ dberr_t err = DB_SUCCESS;
+
+ err = btr_cur_open_at_index_side(false, index,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, 0, &mtr);
+
+ if (err != DB_SUCCESS) {
+ ib::warn() << " Error code: " << err
+ << " btr_estimate_n_rows_in_range_low "
+ << " called from file: "
+ << __FILE__ << " line: " << __LINE__
+ << " table: " << index->table->name
+ << " index: " << index->name;
+ }
+
+ ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor)));
+
+ /* The range specified is without a right border, just
+ 'x > 123' or 'x >= 123' and btr_cur_open_at_index_side()
+ positioned the cursor on the supremum record on the rightmost
+ page, which must not be counted. */
+ should_count_the_right_border = false;
+ }
+
+ tuple2->page_id= cursor.page_cur.block->page.id();
+
+ mtr_commit(&mtr);
+
+ /* We have the path information for the range in path1 and path2 */
+
+ n_rows = 0;
+ is_n_rows_exact = true;
+
+ /* This becomes true when the two paths do not pass through the
+ same pages anymore. */
+ diverged = false;
+
+ /* This becomes true when the paths are no longer the same or
+ adjacent, i.e. when they stop passing through the same or
+ neighboring-on-the-same-level pages. */
+ diverged_lot = false;
+
+ /* This is the level where paths diverged a lot. */
+ divergence_level = 1000000;
+
+ for (i = 0; ; i++) {
+ ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
+
+ slot1 = path1 + i;
+ slot2 = path2 + i;
+
+ if (slot1->nth_rec == ULINT_UNDEFINED
+ || slot2->nth_rec == ULINT_UNDEFINED) {
+
+ /* Here none of the borders were counted. For example,
+ if on the leaf level we descended to:
+ (inf, a, b, c, d, e, f, sup)
+ ^ ^
+ path1 path2
+ then n_rows will be 2 (c and d). */
+
+ if (is_n_rows_exact) {
+ /* Only bother adjusting for this off-by-one
+ if the number is exact; otherwise much
+ grosser adjustments are made below. */
+
+ btr_path_t* last1 = &path1[i - 1];
+ btr_path_t* last2 = &path2[i - 1];
+
+ /* If both paths end up on the same record on
+ the leaf level. */
+ if (last1->page_no == last2->page_no
+ && last1->nth_rec == last2->nth_rec) {
+
+ /* n_rows can be > 0 here if the paths
+ were first different and then converged
+ to the same record on the leaf level.
+ For example:
+ SELECT ... LIKE 'wait/synch/rwlock%'
+ mode1=PAGE_CUR_GE,
+ tuple1="wait/synch/rwlock"
+ path1[0]={nth_rec=58, n_recs=58,
+ page_no=3, page_level=1}
+ path1[1]={nth_rec=56, n_recs=55,
+ page_no=119, page_level=0}
+
+ mode2=PAGE_CUR_G
+ tuple2="wait/synch/rwlock"
+ path2[0]={nth_rec=57, n_recs=57,
+ page_no=3, page_level=1}
+ path2[1]={nth_rec=56, n_recs=55,
+ page_no=119, page_level=0} */
+
+ /* If the range is such that we should
+ count both borders, then avoid
+ counting that record twice - once as a
+ left border and once as a right
+ border. */
+ if (should_count_the_left_border
+ && should_count_the_right_border) {
+
+ n_rows = 1;
+ } else {
+ /* Some of the borders should
+ not be counted, e.g. [3,3). */
+ n_rows = 0;
+ }
+ } else {
+ if (should_count_the_left_border) {
+ n_rows++;
+ }
+
+ if (should_count_the_right_border) {
+ n_rows++;
+ }
+ }
+ }
+
+ if (i > divergence_level + 1 && !is_n_rows_exact) {
+ /* In trees whose height is > 1 our algorithm
+ tends to underestimate: multiply the estimate
+ by 2: */
+
+ n_rows = n_rows * 2;
+ }
+
+ DBUG_EXECUTE_IF("bug14007649", return(n_rows););
+
+ /* Do not estimate the number of rows in the range
+ to more than 1/2 of the estimated rows in the
+ whole table. */
+
+ if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
+
+ n_rows = table_n_rows / 2;
+
+ /* If there are just 0 or 1 rows in the table,
+ then we estimate all rows are in the range */
+
+ if (n_rows == 0) {
+ n_rows = table_n_rows;
+ }
+ }
+
+ return(n_rows);
+ }
+
+ if (!diverged && slot1->nth_rec != slot2->nth_rec) {
+
+ /* If the two slots do not point to the same page,
+ this means that the tree must have changed between
+ the dive for slot1 and the dive for slot2 at the
+ beginning of this function. */
+ if (slot1->page_no != slot2->page_no
+ || slot1->page_level != slot2->page_level) {
+
+ /* If the tree keeps changing even after a
+ few attempts, then just return some arbitrary
+ number. */
+ if (nth_attempt >= rows_in_range_max_retries) {
+ return(rows_in_range_arbitrary_ret_val);
+ }
+
+ return btr_estimate_n_rows_in_range_low(
+ index, tuple1, tuple2,
+ nth_attempt + 1);
+ }
+
+ diverged = true;
+
+ if (slot1->nth_rec < slot2->nth_rec) {
+ /* We do not count the borders (neither the
+ left nor the right one), thus "- 1". */
+ n_rows = slot2->nth_rec - slot1->nth_rec - 1;
+
+ if (n_rows > 0) {
+ /* There is at least one row between
+ the two borders pointed to by slot1
+ and slot2, so on the level below the
+ slots will point to non-adjacent
+ pages. */
+ diverged_lot = true;
+ divergence_level = i;
+ }
+ } else {
+ /* It is possible that
+ slot1->nth_rec >= slot2->nth_rec
+ if, for example, we have a single page
+ tree which contains (inf, 5, 6, sup)
+ and we select where x > 20 and x < 30;
+ in this case slot1->nth_rec will point
+ to the sup record and slot2->nth_rec
+ will point to 6. */
+ n_rows = 0;
+ should_count_the_left_border = false;
+ should_count_the_right_border = false;
+ }
+
+ } else if (diverged && !diverged_lot) {
+
+ if (slot1->nth_rec < slot1->n_recs
+ || slot2->nth_rec > 1) {
+
+ diverged_lot = true;
+ divergence_level = i;
+
+ n_rows = 0;
+
+ if (slot1->nth_rec < slot1->n_recs) {
+ n_rows += slot1->n_recs
+ - slot1->nth_rec;
+ }
+
+ if (slot2->nth_rec > 1) {
+ n_rows += slot2->nth_rec - 1;
+ }
+ }
+ } else if (diverged_lot) {
+
+ n_rows = btr_estimate_n_rows_in_range_on_level(
+ index, slot1, slot2, n_rows,
+ &is_n_rows_exact);
+ }
+ }
+}
+
+/** Estimates the number of rows in a given index range.
+@param[in] index index
+@param[in,out] tuple1 range start, may also be an empty tuple;
+the search mode is tuple1->mode, and tuple1->page_id is set on return
+@param[in,out] tuple2 range end, may also be an empty tuple;
+the search mode is tuple2->mode, and tuple2->page_id is set on return
+@return estimated number of rows */
+ha_rows
+btr_estimate_n_rows_in_range(
+ dict_index_t* index,
+ btr_pos_t *tuple1,
+ btr_pos_t *tuple2)
+{
+ return btr_estimate_n_rows_in_range_low(
+ index, tuple1, tuple2, 1);
+}
+
+/*******************************************************************//**
+Record the number of non-null key values in a given index for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are eventually stored in the array:
+index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
+static
+void
+btr_record_not_null_field_in_rec(
+/*=============================*/
+ ulint n_unique, /*!< in: dict_index_get_n_unique(index),
+ number of columns uniquely determine
+ an index entry */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index),
+ its size could be for all fields or
+ that of "n_unique" */
+ ib_uint64_t* n_not_null) /*!< in/out: array to record number of
+ not null rows for n-column prefix */
+{
+ ulint i;
+
+ ut_ad(rec_offs_n_fields(offsets) >= n_unique);
+
+ if (n_not_null == NULL) {
+ return;
+ }
+
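+ /* Example: for a key (1, NULL, 7) with n_unique = 3, only
+ n_not_null[0] is incremented; the loop below stops at the
+ first SQL NULL field. */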
+ for (i = 0; i < n_unique; i++) {
+ if (rec_offs_nth_sql_null(offsets, i)) {
+ break;
+ }
+
+ n_not_null[i]++;
+ }
+}
+
+/** Estimates the number of different key values in a given index, for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are stored in the array result.n_diff_key_vals[] (indexed
+0..n_uniq-1) and the number of pages that were sampled is saved in
+result.n_sample_sizes[].
+If innodb_stats_method is nulls_ignored, we also record the number of
+non-null values for each prefix and store the estimates in
+the array result.n_non_null_key_vals.
+@param[in] index index
+@return vector with statistics information;
+an empty vector if the index is unavailable. */
+std::vector<index_field_stats_t>
+btr_estimate_number_of_different_key_vals(dict_index_t* index)
+{
+ btr_cur_t cursor;
+ page_t* page;
+ rec_t* rec;
+ ulint n_cols;
+ ib_uint64_t* n_diff;
+ ib_uint64_t* n_not_null;
+ ibool stats_null_not_equal;
+ uintmax_t n_sample_pages=1; /* number of pages to sample */
+ ulint not_empty_flag = 0;
+ ulint total_external_size = 0;
+ ulint i;
+ ulint j;
+ uintmax_t add_on;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ rec_offs* offsets_rec = NULL;
+ rec_offs* offsets_next_rec = NULL;
+
+ std::vector<index_field_stats_t> result;
+
+ /* For a spatial index, no such statistics can be
+ fetched. */
+ ut_ad(!dict_index_is_spatial(index));
+
+ n_cols = dict_index_get_n_unique(index);
+
+ heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
+ * n_cols
+ + dict_index_get_n_fields(index)
+ * (sizeof *offsets_rec
+ + sizeof *offsets_next_rec));
+
+ n_diff = (ib_uint64_t*) mem_heap_zalloc(
+ heap, n_cols * sizeof(n_diff[0]));
+
+ n_not_null = NULL;
+
+ /* Check srv_innodb_stats_method setting, and decide whether we
+ need to record non-null value and also decide if NULL is
+ considered equal (by setting stats_null_not_equal value) */
+ switch (srv_innodb_stats_method) {
+ case SRV_STATS_NULLS_IGNORED:
+ n_not_null = (ib_uint64_t*) mem_heap_zalloc(
+ heap, n_cols * sizeof *n_not_null);
+ /* fall through */
+
+ case SRV_STATS_NULLS_UNEQUAL:
+ /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
+ case, we will treat NULLs as unequal value */
+ stats_null_not_equal = TRUE;
+ break;
+
+ case SRV_STATS_NULLS_EQUAL:
+ stats_null_not_equal = FALSE;
+ break;
+
+ default:
+ ut_error;
+ }
+
+ if (srv_stats_sample_traditional) {
+ /* It makes no sense to test more pages than are contained
+ in the index, thus we lower the number if it is too high */
+ if (srv_stats_transient_sample_pages > index->stat_index_size) {
+ if (index->stat_index_size > 0) {
+ n_sample_pages = index->stat_index_size;
+ }
+ } else {
+ n_sample_pages = srv_stats_transient_sample_pages;
+ }
+ } else {
+ /* Use a logarithmic number of pages for the estimate.
+ The number of pages estimated should be between 1 and
+ index->stat_index_size.
+
+ If we have only 0 or 1 index pages then we can only take 1
+ sample. We have already initialized n_sample_pages to 1.
+
+ So, writing the index size as I, the sample count as S and
+ log2(I)*S as L:
+
+ requirement 1) the outcome of the expression must not exceed I;
+ requirement 2) the number of sampled pages should be at least S;
+ so the expression is min(I, max(min(S,I), L)).
+
+ Looking for simplifications:
+
+ case 1: assume S < I
+ min(I, max(min(S,I), L)) -> min(I, max(S, L))
+
+ but since L = log2(I)*S and log2(I) >= 1, L > S always,
+ so max(S, L) = L.
+
+ So we have: min(I, L)
+
+ case 2: assume I <= S
+ min(I, max(min(S,I), L)) -> min(I, max(I, L))
+
+ case 2a: L > I
+ min(I, max(I, L)) -> min(I, L) -> I
+
+ case 2b: L < I
+ min(I, max(I, L)) -> min(I, I) -> I
+
+ so all case 2 paths yield I, and the expression is:
+ n_pages = S < I ? min(I, L) : I
+ */
+ if (index->stat_index_size > 1) {
+ n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
+ ? ut_min(index->stat_index_size,
+ static_cast<ulint>(
+ log2(double(index->stat_index_size))
+ * double(srv_stats_transient_sample_pages)))
+ : index->stat_index_size;
+ }
+ }
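+ /* Worked example (illustrative only): with I = 1000 pages and
+ the default S = srv_stats_transient_sample_pages = 8,
+ L = log2(1000) * 8 ~ 79, so n_sample_pages = min(1000, 79) = 79;
+ with I = 4 and S = 8, all 4 pages are sampled. */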
+
+ /* Sanity check */
+ ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
+
+ /* We sample some pages in the index to get an estimate */
+
+ for (i = 0; i < n_sample_pages; i++) {
+ mtr_start(&mtr);
+
+ bool available;
+
+ available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
+ &cursor, &mtr);
+
+ if (!available) {
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return result;
+ }
+
+ /* Count the number of different key values for each prefix of
+ the key on this index page. If the prefix does not determine
+ the index record uniquely in the B-tree, then we subtract one
+ because otherwise our algorithm would give a wrong estimate
+ for an index where there is just one key value. */
+
+ if (!index->is_readable()) {
+ mtr_commit(&mtr);
+ goto exit_loop;
+ }
+
+ page = btr_cur_get_page(&cursor);
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ const ulint n_core = page_is_leaf(page)
+ ? index->n_core_fields : 0;
+
+ if (!page_rec_is_supremum(rec)) {
+ not_empty_flag = 1;
+ offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+ n_core,
+ ULINT_UNDEFINED, &heap);
+
+ if (n_not_null != NULL) {
+ btr_record_not_null_field_in_rec(
+ n_cols, offsets_rec, n_not_null);
+ }
+ }
+
+ while (!page_rec_is_supremum(rec)) {
+ ulint matched_fields;
+ rec_t* next_rec = page_rec_get_next(rec);
+ if (page_rec_is_supremum(next_rec)) {
+ total_external_size +=
+ btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ break;
+ }
+
+ offsets_next_rec = rec_get_offsets(next_rec, index,
+ offsets_next_rec,
+ n_core,
+ ULINT_UNDEFINED,
+ &heap);
+
+ cmp_rec_rec(rec, next_rec,
+ offsets_rec, offsets_next_rec,
+ index, stats_null_not_equal,
+ &matched_fields);
+
+ for (j = matched_fields; j < n_cols; j++) {
+ /* We add one if this index record has
+ a different prefix from the previous */
+
+ n_diff[j]++;
+ }
+
+ if (n_not_null != NULL) {
+ btr_record_not_null_field_in_rec(
+ n_cols, offsets_next_rec, n_not_null);
+ }
+
+ total_external_size
+ += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+
+ rec = next_rec;
+ /* Initialize offsets_rec for the next round
+ and assign the old offsets_rec buffer to
+ offsets_next_rec. */
+ {
+ rec_offs* offsets_tmp = offsets_rec;
+ offsets_rec = offsets_next_rec;
+ offsets_next_rec = offsets_tmp;
+ }
+ }
+
+ if (n_cols == dict_index_get_n_unique_in_tree(index)
+ && page_has_siblings(page)) {
+
+ /* If there is more than one leaf page in the tree,
+ we add one because we know that the first record
+ on the page certainly had a different prefix than the
+ last record on the previous index page in the
+ alphabetical order. Before this fix, if there was
+ just one big record on each clustered index page, the
+ algorithm grossly underestimated the number of rows
+ in the table. */
+
+ n_diff[n_cols - 1]++;
+ }
+
+ mtr_commit(&mtr);
+ }
+
+exit_loop:
+ /* If we saw k borders between different key values on
+ n_sample_pages leaf pages, we can estimate how many
+ there will be in index->stat_n_leaf_pages */
+
+ /* We must take into account that our sample actually represents
+ also the pages used for external storage of fields (those pages are
+ included in index->stat_n_leaf_pages) */
+
+ result.reserve(n_cols);
+
+ for (j = 0; j < n_cols; j++) {
+ index_field_stats_t stat;
+
+ stat.n_diff_key_vals
+ = BTR_TABLE_STATS_FROM_SAMPLE(
+ n_diff[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
+
+ /* If the tree is small, smaller than
+ 10 * n_sample_pages + total_external_size, then
+ the above estimate is ok. For bigger trees it is common that we
+ do not see any borders between key values in the few pages
+ we pick. But still there may be n_sample_pages
+ different key values, or even more. Let us try to approximate
+ that: */
+
+ add_on = index->stat_n_leaf_pages
+ / (10 * (n_sample_pages
+ + total_external_size));
+
+ if (add_on > n_sample_pages) {
+ add_on = n_sample_pages;
+ }
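+ /* Example: with stat_n_leaf_pages = 10000, n_sample_pages = 20
+ and total_external_size = 0, add_on = 10000 / 200 = 50,
+ which is then capped to n_sample_pages = 20. */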
+
+ stat.n_diff_key_vals += add_on;
+
+ stat.n_sample_sizes = n_sample_pages;
+
+ if (n_not_null != NULL) {
+ stat.n_non_null_key_vals =
+ BTR_TABLE_STATS_FROM_SAMPLE(
+ n_not_null[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
+ }
+
+ result.push_back(stat);
+ }
+
+ mem_heap_free(heap);
+
+ return result;
+}
+
+/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
+
+/***********************************************************//**
+Gets the offset of the pointer to the externally stored part of a field.
+@return offset of the pointer to the externally stored part */
+static
+ulint
+btr_rec_get_field_ref_offs(
+/*=======================*/
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: index of the external field */
+{
+ ulint field_ref_offs;
+ ulint local_len;
+
+ ut_a(rec_offs_nth_extern(offsets, n));
+ field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
+ ut_a(len_is_stored(local_len));
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
+}
+
+/** Gets a pointer to the externally stored part of a field.
+@param rec record
+@param offsets rec_get_offsets(rec)
+@param n index of the externally stored field
+@return pointer to the externally stored part */
+#define btr_rec_get_field_ref(rec, offsets, n) \
+ ((rec) + btr_rec_get_field_ref_offs(offsets, n))
+
+/** Gets the externally stored size of a record, in units of a database page.
+@param[in] rec record
+@param[in] offsets array returned by rec_get_offsets()
+@return externally stored part, in units of a database page */
+ulint
+btr_rec_get_externally_stored_len(
+ const rec_t* rec,
+ const rec_offs* offsets)
+{
+ ulint n_fields;
+ ulint total_extern_len = 0;
+ ulint i;
+
+ ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(0);
+ }
+
+ n_fields = rec_offs_n_fields(offsets);
+
+ for (i = 0; i < n_fields; i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+
+ ulint extern_len = mach_read_from_4(
+ btr_rec_get_field_ref(rec, offsets, i)
+ + BTR_EXTERN_LEN + 4);
+
+ total_extern_len += ut_calc_align(
+ extern_len, ulint(srv_page_size));
+ }
+ }
+
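+ /* Example: a 70000-byte externally stored part with a 16KiB
+ page size is rounded up to 81920 bytes, i.e. 5 pages. */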
+ return total_extern_len >> srv_page_size_shift;
+}
+
+/*******************************************************************//**
+Sets the ownership bit of an externally stored field in a record. */
+static
+void
+btr_cur_set_ownership_of_extern_field(
+/*==================================*/
+ buf_block_t* block, /*!< in/out: index page */
+ rec_t* rec, /*!< in/out: clustered index record */
+ dict_index_t* index, /*!< in: index of the page */
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint i, /*!< in: field number */
+ bool val, /*!< in: value to set */
+ mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
+{
+ byte* data;
+ ulint local_len;
+ ulint byte_val;
+
+ data = rec_get_nth_field(rec, offsets, i, &local_len);
+ ut_ad(rec_offs_nth_extern(offsets, i));
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
+
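+ /* Note that BTR_EXTERN_OWNER_FLAG is set when the record does
+ NOT own the externally stored field: owning (val == true)
+ clears the bit, disowning sets it. */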
+ if (val) {
+ byte_val &= ~BTR_EXTERN_OWNER_FLAG;
+ } else {
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ byte_val |= BTR_EXTERN_OWNER_FLAG;
+ }
+
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
+ page_zip_write_blob_ptr(block, rec, index, offsets, i, mtr);
+ } else {
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, data + local_len
+ + BTR_EXTERN_LEN, byte_val);
+ }
+}
+
+/*******************************************************************//**
+Marks non-updated off-page fields as disowned by this record. The ownership
+must be transferred to the updated record which is inserted elsewhere in the
+index tree. In purge only the owner of externally stored field is allowed
+to free the field. */
+void
+btr_cur_disown_inherited_fields(
+/*============================*/
+ buf_block_t* block, /*!< in/out: index page */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+ ut_ad(rec_offs_any_extern(offsets));
+
+ for (uint16_t i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)
+ && !upd_get_field_by_field_no(update, i, false)) {
+ btr_cur_set_ownership_of_extern_field(
+ block, rec, index, offsets, i, false, mtr);
+ }
+ }
+}
+
+/*******************************************************************//**
+Marks all extern fields in a record as owned by the record. This function
+should be called if the delete mark of a record is removed: a not delete
+marked record always owns all its extern fields. */
+static
+void
+btr_cur_unmark_extern_fields(
+/*=========================*/
+ buf_block_t* block, /*!< in/out: index page */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
+{
+ ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+ if (!rec_offs_any_extern(offsets)) {
+ return;
+ }
+
+ const ulint n = rec_offs_n_fields(offsets);
+
+ for (ulint i = 0; i < n; i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ btr_cur_set_ownership_of_extern_field(
+ block, rec, index, offsets, i, true, mtr);
+ }
+ }
+}
+
+/*******************************************************************//**
+Returns the length of a BLOB part stored on the header page.
+@return part length */
+static
+uint32_t
+btr_blob_get_part_len(
+/*==================*/
+ const byte* blob_header) /*!< in: blob header */
+{
+ return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
+}
+
+/*******************************************************************//**
+Returns the page number where the next BLOB part is stored.
+@return page number or FIL_NULL if no more pages */
+static
+uint32_t
+btr_blob_get_next_page_no(
+/*======================*/
+ const byte* blob_header) /*!< in: blob header */
+{
+ return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
+}
+
+/** Deallocate a buffer block that was reserved for a BLOB part.
+@param block buffer block
+@param all flag whether to remove a ROW_FORMAT=COMPRESSED page
+@param mtr mini-transaction to commit */
+static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr)
+{
+ const page_id_t page_id(block->page.id());
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ mtr->commit();
+
+ const ulint fold= page_id.fold();
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
+ if (!buf_LRU_free_page(bpage, all) && all && bpage->zip.data)
+ /* Attempt to deallocate the redundant copy of the uncompressed page
+ if the whole ROW_FORMAT=COMPRESSED block cannot be deallocated. */
+ buf_LRU_free_page(bpage, false);
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/** Helper class used while writing blob pages, during insert or update. */
+struct btr_blob_log_check_t {
+ /** Persistent cursor on a clustered index record with blobs. */
+ btr_pcur_t* m_pcur;
+ /** Mini transaction holding the latches for m_pcur */
+ mtr_t* m_mtr;
+ /** rec_get_offsets(rec, index); offset of clust_rec */
+ const rec_offs* m_offsets;
+ /** The block containing clustered record */
+ buf_block_t** m_block;
+ /** The clustered record pointer */
+ rec_t** m_rec;
+ /** The blob operation code */
+ enum blob_op m_op;
+
+ /** Constructor
+ @param[in] pcur persistent cursor on a clustered
+ index record with blobs.
+ @param[in] mtr mini-transaction holding latches for
+ pcur.
+ @param[in] offsets offsets of the clust_rec
+ @param[in,out] block record block containing pcur record
+ @param[in,out] rec the clustered record pointer
+ @param[in] op the blob operation code */
+ btr_blob_log_check_t(
+ btr_pcur_t* pcur,
+ mtr_t* mtr,
+ const rec_offs* offsets,
+ buf_block_t** block,
+ rec_t** rec,
+ enum blob_op op)
+ : m_pcur(pcur),
+ m_mtr(mtr),
+ m_offsets(offsets),
+ m_block(block),
+ m_rec(rec),
+ m_op(op)
+ {
+ ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
+ ut_ad((*m_block)->frame == page_align(*m_rec));
+ ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
+ }
+
+ /** Check if there is enough space in the log file. Commit and
+ re-start the mini-transaction. */
+ void check()
+ {
+ dict_index_t* index = m_pcur->index();
+ ulint offs = 0;
+ uint32_t page_no = FIL_NULL;
+
+ if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) {
+ offs = page_offset(*m_rec);
+ page_no = (*m_block)->page.id().page_no();
+ buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__);
+ ut_ad(page_no != FIL_NULL);
+ } else {
+ btr_pcur_store_position(m_pcur, m_mtr);
+ }
+ m_mtr->commit();
+
+ DEBUG_SYNC_C("blob_write_middle");
+
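+ /* log_free_check() may wait for a log checkpoint, which can
+ require flushing the very pages that were latched; this is
+ why the mini-transaction was committed above and is
+ restarted below. */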
+ log_free_check();
+
+ DEBUG_SYNC_C("blob_write_middle_after_check");
+
+ const mtr_log_t log_mode = m_mtr->get_log_mode();
+ m_mtr->start();
+ m_mtr->set_log_mode(log_mode);
+ index->set_modified(*m_mtr);
+
+ if (UNIV_UNLIKELY(page_no != FIL_NULL)) {
+ m_pcur->btr_cur.page_cur.block = btr_block_get(
+ *index, page_no, RW_X_LATCH, false, m_mtr);
+ m_pcur->btr_cur.page_cur.rec
+ = m_pcur->btr_cur.page_cur.block->frame
+ + offs;
+
+ buf_block_buf_fix_dec(m_pcur->btr_cur.page_cur.block);
+ } else {
+ ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
+ bool ret = btr_pcur_restore_position(
+ BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL,
+ m_pcur, m_mtr);
+
+ ut_a(ret);
+ }
+
+ *m_block = btr_pcur_get_block(m_pcur);
+ *m_rec = btr_pcur_get_rec(m_pcur);
+
+ rec_offs_make_valid(*m_rec, index, true,
+ const_cast<rec_offs*>(m_offsets));
+
+ ut_ad(m_mtr->memo_contains_page_flagged(
+ *m_rec,
+ MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));
+
+ ut_ad((m_op == BTR_STORE_INSERT_BULK)
+ == !m_mtr->memo_contains_flagged(&index->lock,
+ MTR_MEMO_SX_LOCK
+ | MTR_MEMO_X_LOCK));
+ }
+};
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from leaf node
+file segment of the index tree.
+
+TODO: If the allocation extends the tablespace, the extension will not be
+redo-logged in any mini-transaction. Tablespace extension should be
+redo-logged, so that recovery will not fail when the big_rec was written to
+the extended portion of the file, in case the file was somehow truncated
+in the crash.
+
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+btr_store_big_rec_extern_fields(
+/*============================*/
+ btr_pcur_t* pcur, /*!< in/out: a persistent cursor. If
+ btr_mtr is restarted, then this can
+ be repositioned. */
+ rec_offs* offsets, /*!< in/out: rec_get_offsets() on
+ pcur. The "external storage" flags
+ in offsets will correctly correspond
+ to rec when this function returns */
+ const big_rec_t*big_rec_vec, /*!< in: vector containing fields
+ to be stored externally */
+ mtr_t* btr_mtr, /*!< in/out: mtr containing the
+ latches to the clustered index. It can
+ be committed and restarted. */
+ enum blob_op op) /*!< in: operation code */
+{
+ byte* field_ref;
+ ulint extern_len;
+ ulint store_len;
+ ulint space_id;
+ ulint i;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ page_zip_des_t* page_zip;
+ z_stream c_stream;
+ dberr_t error = DB_SUCCESS;
+ dict_index_t* index = pcur->index();
+ buf_block_t* rec_block = btr_pcur_get_block(pcur);
+ rec_t* rec = btr_pcur_get_rec(pcur);
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_any_extern(offsets));
+ ut_ad(op == BTR_STORE_INSERT_BULK
+ || btr_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(btr_mtr->memo_contains_flagged(rec_block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
+ ut_a(dict_index_is_clust(index));
+
+ btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
+ &rec, op);
+ page_zip = buf_block_get_page_zip(rec_block);
+ space_id = rec_block->page.id().space();
+ ut_a(fil_page_index_page_check(page_align(rec))
+ || op == BTR_STORE_INSERT_BULK);
+
+ if (page_zip) {
+ int err;
+
+ /* Zlib deflate needs 128 kilobytes for the default
+ window size, plus 512 << memLevel, plus a few
+ kilobytes for small objects. We use reduced memLevel
+ to limit the memory consumption, and preallocate the
+ heap, hoping to avoid memory fragmentation. */
+ heap = mem_heap_create(250000);
+ page_zip_set_alloc(&c_stream, heap);
+
+ err = deflateInit2(&c_stream, int(page_zip_level),
+ Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
+ ut_a(err == Z_OK);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* All pointers to externally stored columns in the record
+ must either be zero or they must be pointers to inherited
+ columns, owned by this record or an earlier record version. */
+ for (i = 0; i < big_rec_vec->n_fields; i++) {
+ field_ref = btr_rec_get_field_ref(
+ rec, offsets, big_rec_vec->fields[i].field_no);
+
+ ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
+ /* Either this must be an update in place,
+ or the BLOB must be inherited, or the BLOB pointer
+ must be zero (will be written in this function). */
+ ut_a(op == BTR_STORE_UPDATE
+ || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
+ || !memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ }
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ /* Space available in compressed page to carry blob data */
+ const ulint payload_size_zip = rec_block->physical_size()
+ - FIL_PAGE_DATA;
+
+ /* Space available in uncompressed page to carry blob data */
+ const ulint payload_size = payload_size_zip
+ - (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END);
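+ /* With the usual constants (FIL_PAGE_DATA = 38,
+ BTR_BLOB_HDR_SIZE = 8, FIL_PAGE_DATA_END = 8) and a 16KiB page,
+ payload_size = 16384 - 38 - 8 - 8 = 16330 bytes per page. */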
+
+ /* We have to create a file segment in the tablespace
+ for each field and put the pointer to the field in rec */
+
+ for (i = 0; i < big_rec_vec->n_fields; i++) {
+ const ulint field_no = big_rec_vec->fields[i].field_no;
+
+ field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* A zero BLOB pointer should have been initially inserted. */
+ ut_a(!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ extern_len = big_rec_vec->fields[i].len;
+ MEM_CHECK_DEFINED(big_rec_vec->fields[i].data, extern_len);
+ ut_a(extern_len > 0);
+
+ uint32_t prev_page_no = FIL_NULL;
+
+ if (page_zip) {
+ int err = deflateReset(&c_stream);
+ ut_a(err == Z_OK);
+
+ c_stream.next_in = (Bytef*)
+ big_rec_vec->fields[i].data;
+ c_stream.avail_in = static_cast<uInt>(extern_len);
+ }
+
+ for (ulint blob_npages = 0;; ++blob_npages) {
+ buf_block_t* block;
+ const ulint commit_freq = 4;
+ uint32_t r_extents;
+
+ ut_ad(page_align(field_ref) == page_align(rec));
+
+ if (!(blob_npages % commit_freq)) {
+
+ redo_log.check();
+
+ field_ref = btr_rec_get_field_ref(
+ rec, offsets, field_no);
+
+ page_zip = buf_block_get_page_zip(rec_block);
+ }
+
+ mtr.start();
+ index->set_modified(mtr);
+ mtr.set_log_mode(btr_mtr->get_log_mode());
+
+ buf_page_get(rec_block->page.id(),
+ rec_block->zip_size(), RW_X_LATCH, &mtr);
+
+ uint32_t hint_prev = prev_page_no;
+ if (hint_prev == FIL_NULL) {
+ hint_prev = rec_block->page.id().page_no();
+ }
+
+ if (!fsp_reserve_free_extents(&r_extents,
+ index->table->space, 1,
+ FSP_BLOB, &mtr, 1)) {
+ mtr.commit();
+ error = DB_OUT_OF_FILE_SPACE;
+ goto func_exit;
+ }
+
+ block = btr_page_alloc(index, hint_prev + 1,
+ FSP_NO_DIR, 0, &mtr, &mtr);
+
+ index->table->space->release_free_extents(r_extents);
+
+ ut_a(block != NULL);
+
+ const uint32_t page_no = block->page.id().page_no();
+
+ if (prev_page_no != FIL_NULL) {
+ buf_block_t* prev_block;
+
+ prev_block = buf_page_get(
+ page_id_t(space_id, prev_page_no),
+ rec_block->zip_size(),
+ RW_X_LATCH, &mtr);
+
+ buf_block_dbg_add_level(prev_block,
+ SYNC_EXTERN_STORAGE);
+
+ if (page_zip) {
+ mtr.write<4>(*prev_block,
+ prev_block->frame
+ + FIL_PAGE_NEXT,
+ page_no);
+ memcpy_aligned<4>(
+ buf_block_get_page_zip(
+ prev_block)
+ ->data + FIL_PAGE_NEXT,
+ prev_block->frame
+ + FIL_PAGE_NEXT, 4);
+ } else {
+ mtr.write<4>(*prev_block,
+ BTR_BLOB_HDR_NEXT_PAGE_NO
+ + FIL_PAGE_DATA
+ + prev_block->frame,
+ page_no);
+ }
+ } else if (dict_index_is_online_ddl(index)) {
+ row_log_table_blob_alloc(index, page_no);
+ }
+
+ ut_ad(!page_has_siblings(block->frame));
+ ut_ad(!fil_page_get_type(block->frame));
+
+ if (page_zip) {
+ int err;
+ page_zip_des_t* blob_page_zip;
+
+ mtr.write<1>(*block,
+ FIL_PAGE_TYPE + 1 + block->frame,
+ prev_page_no == FIL_NULL
+ ? FIL_PAGE_TYPE_ZBLOB
+ : FIL_PAGE_TYPE_ZBLOB2);
+ block->page.zip.data[FIL_PAGE_TYPE + 1]
+ = block->frame[FIL_PAGE_TYPE + 1];
+
+ c_stream.next_out = block->frame
+ + FIL_PAGE_DATA;
+ c_stream.avail_out = static_cast<uInt>(
+ payload_size_zip);
+
+ err = deflate(&c_stream, Z_FINISH);
+ ut_a(err == Z_OK || err == Z_STREAM_END);
+ ut_a(err == Z_STREAM_END
+ || c_stream.avail_out == 0);
+
+ mtr.memcpy(*block,
+ FIL_PAGE_DATA,
+ page_zip_get_size(page_zip)
+ - FIL_PAGE_DATA
+ - c_stream.avail_out);
+ /* Copy the page to compressed storage,
+ because it will be flushed to disk
+ from there. */
+ blob_page_zip = buf_block_get_page_zip(block);
+ ut_ad(blob_page_zip);
+ ut_ad(page_zip_get_size(blob_page_zip)
+ == page_zip_get_size(page_zip));
+ memcpy(blob_page_zip->data, block->frame,
+ page_zip_get_size(page_zip));
+
+ if (err == Z_OK && prev_page_no != FIL_NULL) {
+
+ goto next_zip_page;
+ }
+
+ if (err == Z_STREAM_END) {
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_LEN, 0);
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_LEN + 4,
+ c_stream.total_in);
+ } else {
+ memset(field_ref + BTR_EXTERN_LEN,
+ 0, 8);
+ }
+
+ if (prev_page_no == FIL_NULL) {
+ ut_ad(blob_npages == 0);
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_SPACE_ID,
+ space_id);
+
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_PAGE_NO,
+ page_no);
+
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_OFFSET,
+ FIL_PAGE_NEXT);
+ }
+
+ /* We compress the page when the bulk insert finishes. */
+ if (UNIV_LIKELY(op != BTR_STORE_INSERT_BULK)) {
+ page_zip_write_blob_ptr(
+ rec_block, rec, index, offsets,
+ field_no, &mtr);
+ }
+
+next_zip_page:
+ prev_page_no = page_no;
+
+ /* Commit mtr and release the
+ uncompressed page frame to save memory. */
+ btr_blob_free(block, FALSE, &mtr);
+
+ if (err == Z_STREAM_END) {
+ break;
+ }
+ } else {
+ mtr.write<1>(*block, FIL_PAGE_TYPE + 1
+ + block->frame,
+ FIL_PAGE_TYPE_BLOB);
+
+ if (extern_len > payload_size) {
+ store_len = payload_size;
+ } else {
+ store_len = extern_len;
+ }
+
+ mtr.memcpy<mtr_t::MAYBE_NOP>(
+ *block,
+ FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE
+ + block->frame,
+ static_cast<const byte*>
+ (big_rec_vec->fields[i].data)
+ + big_rec_vec->fields[i].len
+ - extern_len, store_len);
+ mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN
+ + FIL_PAGE_DATA + block->frame,
+ store_len);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ mtr.memset(block, BTR_BLOB_HDR_NEXT_PAGE_NO
+ + FIL_PAGE_DATA, 4, 0xff);
+
+ extern_len -= store_len;
+
+ ut_ad(!mach_read_from_4(BTR_EXTERN_LEN
+ + field_ref));
+ mtr.write<4>(*rec_block,
+ BTR_EXTERN_LEN + 4 + field_ref,
+ big_rec_vec->fields[i].len
+ - extern_len);
+
+ if (prev_page_no == FIL_NULL) {
+ ut_ad(blob_npages == 0);
+ mtr.write<4,mtr_t::MAYBE_NOP>(
+ *rec_block,
+ field_ref + BTR_EXTERN_SPACE_ID,
+ space_id);
+
+ mtr.write<4>(*rec_block, field_ref
+ + BTR_EXTERN_PAGE_NO,
+ page_no);
+
+ mtr.write<4>(*rec_block, field_ref
+ + BTR_EXTERN_OFFSET,
+ FIL_PAGE_DATA);
+ }
+
+ prev_page_no = page_no;
+
+ mtr.commit();
+
+ if (extern_len == 0) {
+ break;
+ }
+ }
+ }
+
+ DBUG_EXECUTE_IF("btr_store_big_rec_extern",
+ error = DB_OUT_OF_FILE_SPACE;
+ goto func_exit;);
+
+ rec_offs_make_nth_extern(offsets, field_no);
+ }
+
+func_exit:
+ if (page_zip) {
+ deflateEnd(&c_stream);
+ }
+
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* All pointers to externally stored columns in the record
+ must be valid. */
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (!rec_offs_nth_extern(offsets, i)) {
+ continue;
+ }
+
+ field_ref = btr_rec_get_field_ref(rec, offsets, i);
+
+ /* The pointer must not be zero if the operation
+ succeeded. */
+ ut_a(0 != memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE)
+ || error != DB_SUCCESS);
+ /* The column must not be disowned by this record. */
+ ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
+ }
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ return(error);
+}
+
+/** Check the FIL_PAGE_TYPE on an uncompressed BLOB page.
+@param[in] block uncompressed BLOB page
+@param[in] read true=read, false=purge */
+static void btr_check_blob_fil_page_type(const buf_block_t& block, bool read)
+{
+ uint16_t type= fil_page_get_type(block.frame);
+
+ if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB))
+ return;
+ /* FIXME: take the tablespace as a parameter */
+ if (fil_space_t *space= fil_space_t::get(block.page.id().space()))
+ {
+ /* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB
+ pages. Do not print anything about the type mismatch when reading
+ a BLOB page that may be from old versions. */
+ if (space->full_crc32() || DICT_TF_HAS_ATOMIC_BLOBS(space->flags))
+ {
+ ib::fatal() << "FIL_PAGE_TYPE=" << type
+ << (read ? " on BLOB read file " : " on BLOB purge file ")
+ << space->chain.start->name
+ << " page " << block.page.id().page_no();
+ }
+ space->release();
+ }
+}
+
+/*******************************************************************//**
+Frees the space of an externally stored field, returning it to the file
+space management, if the field is owned by the record pointed to by
+field_ref; in a rollback there is the additional condition that the
+field must not be inherited. */
+void
+btr_free_externally_stored_field(
+/*=============================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched; if the tree
+ height is 1, then also the root page
+ must be X-latched! (this is relevant
+ in the case this function is called
+ from purge where 'data' is located on
+ an undo log page, not an index
+ page) */
+ byte* field_ref, /*!< in/out: field reference */
+ const rec_t* rec, /*!< in: record containing field_ref, for
+ page_zip_write_blob_ptr(), or NULL */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index),
+ or NULL */
+ buf_block_t* block, /*!< in/out: page of field_ref */
+ ulint i, /*!< in: field number of field_ref;
+ ignored if rec == NULL */
+ bool rollback, /*!< in: performing rollback? */
+ mtr_t* local_mtr) /*!< in: mtr
+ containing the latch to data and an
+ X-latch to the index tree */
+{
+ page_t* page;
+ const uint32_t space_id = mach_read_from_4(
+ field_ref + BTR_EXTERN_SPACE_ID);
+ const uint32_t start_page = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ uint32_t page_no;
+ uint32_t next_page_no;
+ mtr_t mtr;
+
+ ut_ad(index->is_primary());
+ ut_ad(local_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(local_mtr->memo_contains_page_flagged(field_ref,
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+ ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
+ ut_ad(local_mtr->is_named_space(
+ page_get_space_id(page_align(field_ref))));
+
+ if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* In the rollback, we may encounter a clustered index
+ record with some unwritten off-page columns. There is
+ nothing to free then. */
+ ut_a(rollback);
+ return;
+ }
+
+ ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
+ & ~((BTR_EXTERN_OWNER_FLAG
+ | BTR_EXTERN_INHERITED_FLAG) << 24)));
+ ut_ad(space_id == index->table->space->id);
+ ut_ad(space_id == index->table->space_id);
+
+ const ulint ext_zip_size = index->table->space->zip_size();
+ const ulint rec_zip_size = rec ? ext_zip_size : 0;
+
+ /* !rec holds in a call from purge when field_ref is in an undo page */
+ ut_ad(rec || !block->page.zip.data);
+
+ for (;;) {
+#ifdef UNIV_DEBUG
+ buf_block_t* rec_block;
+#endif /* UNIV_DEBUG */
+ buf_block_t* ext_block;
+
+ mtr_start(&mtr);
+ mtr.set_spaces(*local_mtr);
+ mtr.set_log_mode(local_mtr->get_log_mode());
+
+ ut_ad(!index->table->is_temporary()
+ || local_mtr->get_log_mode() == MTR_LOG_NO_REDO);
+
+ const page_t* p = page_align(field_ref);
+
+ const page_id_t page_id(page_get_space_id(p),
+ page_get_page_no(p));
+
+#ifdef UNIV_DEBUG
+ rec_block =
+#endif /* UNIV_DEBUG */
+ buf_page_get(page_id, rec_zip_size, RW_X_LATCH, &mtr);
+
+ buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
+ page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
+
+ if (/* There is no external storage data */
+ page_no == FIL_NULL
+ /* This field does not own the externally stored field */
+ || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+ & BTR_EXTERN_OWNER_FLAG)
+ /* Rollback and inherited field */
+ || (rollback
+ && (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+ & BTR_EXTERN_INHERITED_FLAG))) {
+
+ /* Do not free */
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ if (page_no == start_page && dict_index_is_online_ddl(index)) {
+ row_log_table_blob_free(index, start_page);
+ }
+
+ ext_block = buf_page_get(
+ page_id_t(space_id, page_no), ext_zip_size,
+ RW_X_LATCH, &mtr);
+
+ buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
+ page = buf_block_get_frame(ext_block);
+
+ if (ext_zip_size) {
+ /* Note that page_zip will be NULL
+ in row_purge_upd_exist_or_extern(). */
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ break;
+ default:
+ ut_error;
+ }
+ next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
+
+ btr_page_free(index, ext_block, &mtr, true);
+
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
+ next_page_no);
+ memset(field_ref + BTR_EXTERN_LEN + 4, 0, 4);
+ page_zip_write_blob_ptr(block, rec, index,
+ offsets, i, &mtr);
+ } else {
+ mtr.write<4>(*block,
+ BTR_EXTERN_PAGE_NO + field_ref,
+ next_page_no);
+ mtr.write<4,mtr_t::MAYBE_NOP>(*block,
+ BTR_EXTERN_LEN
+ + 4 + field_ref,
+ 0U);
+ }
+ } else {
+ ut_ad(!block->page.zip.data);
+ btr_check_blob_fil_page_type(*ext_block, false);
+
+ next_page_no = mach_read_from_4(
+ page + FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO);
+ btr_page_free(index, ext_block, &mtr, true);
+
+ mtr.write<4>(*block, BTR_EXTERN_PAGE_NO + field_ref,
+ next_page_no);
+ /* Zero out the BLOB length. If the server
+ crashes during the execution of this function,
+ trx_rollback_all_recovered() could
+ dereference the half-deleted BLOB, fetching a
+ wrong prefix for the BLOB. */
+ mtr.write<4,mtr_t::MAYBE_NOP>(*block,
+ BTR_EXTERN_LEN + 4
+ + field_ref, 0U);
+ }
+
+ /* Commit mtr and release the BLOB block to save memory. */
+ btr_blob_free(ext_block, TRUE, &mtr);
+ }
+}
+
+/***********************************************************//**
+Frees the externally stored fields for a record. */
+static
+void
+btr_rec_free_externally_stored_fields(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched */
+ rec_t* rec, /*!< in/out: record */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ buf_block_t* block, /*!< in: index page of rec */
+ bool rollback,/*!< in: performing rollback? */
+ mtr_t* mtr) /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the index
+ tree */
+{
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(index->is_primary());
+ ut_ad(page_rec_is_leaf(rec));
+ /* Free possible externally stored fields in the record */
+
+ ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
+ n_fields = rec_offs_n_fields(offsets);
+
+ for (i = 0; i < n_fields; i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ btr_free_externally_stored_field(
+ index, btr_rec_get_field_ref(rec, offsets, i),
+ rec, offsets, block, i, rollback, mtr);
+ }
+ }
+}
+
+/***********************************************************//**
+Frees the externally stored fields for a record, if the field is mentioned
+in the update vector. */
+static
+void
+btr_rec_free_updated_extern_fields(
+/*===============================*/
+ dict_index_t* index, /*!< in: index of rec; the index tree MUST be
+ X-latched */
+ rec_t* rec, /*!< in/out: record */
+ buf_block_t* block, /*!< in: index page of rec */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector */
+ bool rollback,/*!< in: performing rollback? */
+ mtr_t* mtr) /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the tree */
+{
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));
+
+ /* Free possible externally stored fields in the record */
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ const upd_field_t* ufield = upd_get_nth_field(update, i);
+
+ if (rec_offs_nth_extern(offsets, ufield->field_no)) {
+ ulint len;
+ byte* data = rec_get_nth_field(
+ rec, offsets, ufield->field_no, &len);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ btr_free_externally_stored_field(
+ index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ rec, offsets, block,
+ ufield->field_no, rollback, mtr);
+ }
+ }
+}
+
+/*******************************************************************//**
+Copies the prefix of an uncompressed BLOB. The clustered index record
+that points to this BLOB must be protected by a lock or a page latch.
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_blob_prefix(
+/*=================*/
+ byte* buf, /*!< out: the externally stored part of
+ the field, or a prefix of it */
+ uint32_t len, /*!< in: length of buf, in bytes */
+ page_id_t id, /*!< in: page identifier of the first BLOB page */
+ uint32_t offset) /*!< in: offset on the first BLOB page */
+{
+ ulint copied_len = 0;
+
+ for (;;) {
+ mtr_t mtr;
+ buf_block_t* block;
+ const page_t* page;
+ const byte* blob_header;
+ ulint part_len;
+ ulint copy_len;
+
+ mtr_start(&mtr);
+
+ block = buf_page_get(id, 0, RW_S_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
+ page = buf_block_get_frame(block);
+
+ btr_check_blob_fil_page_type(*block, true);
+
+ blob_header = page + offset;
+ part_len = btr_blob_get_part_len(blob_header);
+ copy_len = ut_min(part_len, len - copied_len);
+
+ memcpy(buf + copied_len,
+ blob_header + BTR_BLOB_HDR_SIZE, copy_len);
+ copied_len += copy_len;
+
+ id.set_page_no(btr_blob_get_next_page_no(blob_header));
+
+ mtr_commit(&mtr);
+
+ if (id.page_no() == FIL_NULL || copy_len != part_len) {
+ MEM_CHECK_DEFINED(buf, copied_len);
+ return(copied_len);
+ }
+
+		/* On BLOB pages other than the first, the BLOB header
+		is always at the start of the page data: */
+
+ offset = FIL_PAGE_DATA;
+
+ ut_ad(copied_len <= len);
+ }
+}
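+
+/* For illustration: as the loop above implies, an uncompressed BLOB is a
+singly linked list of pages. Each page carries a small header (read via
+btr_blob_get_part_len() and btr_blob_get_next_page_no()) holding the number
+of payload bytes stored on this page and the page number of the next BLOB
+page, or FIL_NULL on the last page; BTR_BLOB_HDR_SIZE header bytes are
+followed by the payload. On every page after the first, this header sits at
+FIL_PAGE_DATA. */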
+
+/** Copies the prefix of a compressed BLOB.
+The clustered index record that points to this BLOB must be protected
+by a lock or a page latch.
+@param[out] buf the externally stored part of the field,
+or a prefix of it
+@param[in] len length of buf, in bytes
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size
+@param[in]	id		page identifier of the BLOB pages
+@param[in]	offset		offset on the first BLOB page
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_zblob_prefix(
+ byte* buf,
+ uint32_t len,
+ ulint zip_size,
+ page_id_t id,
+ uint32_t offset)
+{
+ ulint page_type = FIL_PAGE_TYPE_ZBLOB;
+ mem_heap_t* heap;
+ int err;
+ z_stream d_stream;
+
+ d_stream.next_out = buf;
+ d_stream.avail_out = static_cast<uInt>(len);
+ d_stream.next_in = Z_NULL;
+ d_stream.avail_in = 0;
+
+ /* Zlib inflate needs 32 kilobytes for the default
+ window size, plus a few kilobytes for small objects. */
+ heap = mem_heap_create(40000);
+ page_zip_set_alloc(&d_stream, heap);
+
+ ut_ad(zip_size);
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(id.space());
+
+ err = inflateInit(&d_stream);
+ ut_a(err == Z_OK);
+
+ for (;;) {
+ buf_page_t* bpage;
+ uint32_t next_page_no;
+
+ /* There is no latch on bpage directly. Instead,
+ bpage is protected by the B-tree page latch that
+ is being held on the clustered index record, or,
+ in row_merge_copy_blobs(), by an exclusive table lock. */
+ bpage = buf_page_get_zip(id, zip_size);
+
+ if (UNIV_UNLIKELY(!bpage)) {
+ ib::error() << "Cannot load compressed BLOB " << id;
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY
+ (fil_page_get_type(bpage->zip.data) != page_type)) {
+
+ ib::error() << "Unexpected type "
+ << fil_page_get_type(bpage->zip.data)
+ << " of compressed BLOB page " << id;
+
+ ut_ad(0);
+ goto end_of_blob;
+ }
+
+ next_page_no = mach_read_from_4(bpage->zip.data + offset);
+
+ if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
+ /* When the BLOB begins at page header,
+ the compressed data payload does not
+ immediately follow the next page pointer. */
+ offset = FIL_PAGE_DATA;
+ } else {
+ offset += 4;
+ }
+
+ d_stream.next_in = bpage->zip.data + offset;
+ d_stream.avail_in = uInt(zip_size - offset);
+
+ err = inflate(&d_stream, Z_NO_FLUSH);
+ switch (err) {
+ case Z_OK:
+ if (!d_stream.avail_out) {
+ goto end_of_blob;
+ }
+ break;
+ case Z_STREAM_END:
+ if (next_page_no == FIL_NULL) {
+ goto end_of_blob;
+ }
+ /* fall through */
+ default:
+inflate_error:
+ ib::error() << "inflate() of compressed BLOB page "
+ << id
+ << " returned " << err
+ << " (" << d_stream.msg << ")";
+
+ case Z_BUF_ERROR:
+ goto end_of_blob;
+ }
+
+ if (next_page_no == FIL_NULL) {
+ if (!d_stream.avail_in) {
+ ib::error()
+ << "Unexpected end of compressed "
+ << "BLOB page " << id;
+ } else {
+ err = inflate(&d_stream, Z_FINISH);
+ switch (err) {
+ case Z_STREAM_END:
+ case Z_BUF_ERROR:
+ break;
+ default:
+ goto inflate_error;
+ }
+ }
+
+end_of_blob:
+ buf_page_release_zip(bpage);
+ goto func_exit;
+ }
+
+ buf_page_release_zip(bpage);
+
+		/* On BLOB pages other than the first, the next-page
+		pointer is stored in the page header: */
+
+ id.set_page_no(next_page_no);
+ offset = FIL_PAGE_NEXT;
+ page_type = FIL_PAGE_TYPE_ZBLOB2;
+ }
+
+func_exit:
+ inflateEnd(&d_stream);
+ mem_heap_free(heap);
+ MEM_CHECK_DEFINED(buf, d_stream.total_out);
+ return(d_stream.total_out);
+}
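+
+/* Note on the zlib usage above: the compressed BLOB is one deflate stream
+split across a chain of pages, so a single z_stream is fed page by page. On
+pages after the first, the next-page pointer is read at FIL_PAGE_NEXT and
+the payload starts at FIL_PAGE_DATA, spanning zip_size - FIL_PAGE_DATA
+bytes; on the first page the payload immediately follows the 4-byte
+next-page pointer stored at the offset taken from the field reference.
+Z_STREAM_END on the last page marks a complete stream. */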
+
+/** Copies the prefix of an externally stored field of a record.
+The clustered index record that points to this BLOB must be protected
+by a lock or a page latch.
+@param[out] buf the externally stored part of the
+field, or a prefix of it
+@param[in] len length of buf, in bytes
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] id page identifier of the first BLOB page
+@param[in] offset offset on the first BLOB page
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_externally_stored_field_prefix_low(
+ byte* buf,
+ uint32_t len,
+ ulint zip_size,
+ page_id_t id,
+ uint32_t offset)
+{
+ if (len == 0)
+ return 0;
+
+ return zip_size
+ ? btr_copy_zblob_prefix(buf, len, zip_size, id, offset)
+ : btr_copy_blob_prefix(buf, len, id, offset);
+}
+
+/** Copies the prefix of an externally stored field of a record.
+The clustered index record must be protected by a lock or a page latch.
+@param[out] buf the field, or a prefix of it
+@param[in] len length of buf, in bytes
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] data 'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in] local_len length of data, in bytes
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+ulint
+btr_copy_externally_stored_field_prefix(
+ byte* buf,
+ ulint len,
+ ulint zip_size,
+ const byte* data,
+ ulint local_len)
+{
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY(local_len >= len)) {
+ memcpy(buf, data, len);
+ return(len);
+ }
+
+ memcpy(buf, data, local_len);
+ data += local_len;
+
+ ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
+
+ if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
+ /* The externally stored part of the column has been
+ (partially) deleted. Signal the half-deleted BLOB
+ to the caller. */
+
+ return(0);
+ }
+
+ uint32_t space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
+ uint32_t page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
+ uint32_t offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
+ len -= local_len;
+
+ return(local_len
+ + btr_copy_externally_stored_field_prefix_low(buf + local_len,
+ uint32_t(len),
+ zip_size,
+ page_id_t(
+ space_id,
+ page_no),
+ offset));
+}
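+
+/* Worked example of the prefix copy above (hypothetical values): if
+local_len is 788 (768 bytes of local data plus the 20-byte field reference)
+and the caller asks for len = 1024 bytes, then 768 bytes are copied from the
+locally stored part and the remaining 256 bytes are fetched from the first
+BLOB page, returning 1024, assuming the external part holds at least 256
+bytes. If len were 500, all 500 bytes would come from the local part
+alone. */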
+
+/** Copies an externally stored field of a record to mem heap.
+The clustered index record must be protected by a lock or a page latch.
+@param[out] len length of the whole field
+@param[in] data 'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] local_len length of data
+@param[in,out] heap mem heap
+@return the whole field copied to heap */
+byte*
+btr_copy_externally_stored_field(
+ ulint* len,
+ const byte* data,
+ ulint zip_size,
+ ulint local_len,
+ mem_heap_t* heap)
+{
+ byte* buf;
+
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ uint32_t space_id = mach_read_from_4(data + local_len
+ + BTR_EXTERN_SPACE_ID);
+ uint32_t page_no = mach_read_from_4(data + local_len
+ + BTR_EXTERN_PAGE_NO);
+ uint32_t offset = mach_read_from_4(data + local_len
+ + BTR_EXTERN_OFFSET);
+
+ /* Currently a BLOB cannot be bigger than 4 GB; we
+ leave the 4 upper bytes in the length field unused */
+
+ uint32_t extern_len = mach_read_from_4(data + local_len
+ + BTR_EXTERN_LEN + 4);
+
+ buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
+
+ memcpy(buf, data, local_len);
+ *len = local_len
+ + btr_copy_externally_stored_field_prefix_low(buf + local_len,
+ extern_len,
+ zip_size,
+ page_id_t(
+ space_id,
+ page_no),
+ offset);
+
+ return(buf);
+}
+
+/** Copies an externally stored field of a record to mem heap.
+@param[in] rec record in a clustered index; must be
+protected by a lock or a page latch
+@param[in]	offsets		array returned by rec_get_offsets()
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] no field number
+@param[out] len length of the field
+@param[in,out] heap mem heap
+@return the field copied to heap, or NULL if the field is incomplete */
+byte*
+btr_rec_copy_externally_stored_field(
+ const rec_t* rec,
+ const rec_offs* offsets,
+ ulint zip_size,
+ ulint no,
+ ulint* len,
+ mem_heap_t* heap)
+{
+ ulint local_len;
+ const byte* data;
+
+ ut_a(rec_offs_nth_extern(offsets, no));
+
+ /* An externally stored field can contain some initial
+ data from the field, and in the last 20 bytes it has the
+ space id, page number, and offset where the rest of the
+ field data is stored, and the data length in addition to
+ the data stored locally. We may need to store some data
+ locally to get the local record length above the 128 byte
+ limit so that field offsets are stored in two bytes, and
+ the extern bit is available in those two bytes. */
+
+ data = rec_get_nth_field(rec, offsets, no, &local_len);
+
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ if (UNIV_UNLIKELY
+ (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* The externally stored field was not written yet.
+ This record should only be seen by
+ trx_rollback_recovered() or any
+ TRX_ISO_READ_UNCOMMITTED transactions. */
+ return(NULL);
+ }
+
+ return(btr_copy_externally_stored_field(len, data,
+ zip_size, local_len, heap));
+}
diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc
new file mode 100644
index 00000000..ebe9854b
--- /dev/null
+++ b/storage/innobase/btr/btr0defragment.cc
@@ -0,0 +1,843 @@
+/*****************************************************************************
+
+Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved.
+Copyright (C) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file btr/btr0defragment.cc
+Index defragmentation.
+
+Created 05/29/2014 Rongrong Zhong
+Modified 16/07/2014 Sunguck Lee
+Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com
+*******************************************************/
+
+#include "btr0defragment.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "dict0defrag_bg.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "srv0start.h"
+
+#include <list>
+
+/* When there is no work, either because defragmentation is disabled or
+because no query has been submitted, the thread checks its state every
+BTR_DEFRAGMENT_SLEEP_IN_USECS. */
+#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000
+/* Reduce the target page size by this amount when a compression failure
+happens during defragmentation. 512 is chosen because it is a power of 2 and
+about 3% of the page size. When there are compression failures during
+defragmentation, our goal is to get a decent defragmentation ratio with as
+few compression failures as possible. Experimentation suggests that reducing
+the target size by 512 each time makes the page compressible within a couple
+of iterations. */
+
+/** Item in the work queue for the defragmentation task. */
+struct btr_defragment_item_t
+{
+ btr_pcur_t* pcur; /* persistent cursor where
+ btr_defragment_n_pages should start */
+ os_event_t event; /* if not null, signal after work
+ is done */
+ bool removed; /* Mark an item as removed */
+	ulonglong	last_processed; /* timestamp of the last time this
+					index was processed by the
+					defragmentation task */
+
+ btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event);
+ ~btr_defragment_item_t();
+};
+
+/* Work queue for defragmentation. */
+typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t;
+static btr_defragment_wq_t btr_defragment_wq;
+
+/* Mutex protecting the defragmentation work queue.*/
+ib_mutex_t btr_defragment_mutex;
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/* Number of compression failures caused by defragmentation since server
+start. */
+Atomic_counter<ulint> btr_defragment_compression_failures;
+/* Number of btr_defragment_n_pages calls that altered page but didn't
+manage to release any page. */
+Atomic_counter<ulint> btr_defragment_failures;
+/* Total number of btr_defragment_n_pages calls that altered page.
+The difference between btr_defragment_count and btr_defragment_failures shows
+the amount of effort wasted. */
+Atomic_counter<ulint> btr_defragment_count;
+
+bool btr_defragment_active;
+
+struct defragment_chunk_state_t
+{
+ btr_defragment_item_t* m_item;
+};
+
+static defragment_chunk_state_t defragment_chunk_state;
+static void btr_defragment_chunk(void*);
+
+static tpool::timer* btr_defragment_timer;
+static tpool::task_group task_group(1);
+static tpool::task btr_defragment_task(btr_defragment_chunk, 0, &task_group);
+static void btr_defragment_start();
+
+/******************************************************************//**
+Constructor for btr_defragment_item_t. */
+btr_defragment_item_t::btr_defragment_item_t(
+ btr_pcur_t* pcur,
+ os_event_t event)
+{
+ this->pcur = pcur;
+ this->event = event;
+ this->removed = false;
+ this->last_processed = 0;
+}
+
+/******************************************************************//**
+Destructor for btr_defragment_item_t. */
+btr_defragment_item_t::~btr_defragment_item_t() {
+ if (this->pcur) {
+ btr_pcur_free_for_mysql(this->pcur);
+ }
+ if (this->event) {
+ os_event_set(this->event);
+ }
+}
+
+static void submit_defragment_task(void* arg = 0)
+{
+ srv_thread_pool->submit_task(&btr_defragment_task);
+}
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init()
+{
+ srv_defragment_interval = 1000000000ULL / srv_defragment_frequency;
+ mutex_create(LATCH_ID_BTR_DEFRAGMENT_MUTEX, &btr_defragment_mutex);
+ defragment_chunk_state.m_item = 0;
+ btr_defragment_timer = srv_thread_pool->create_timer(submit_defragment_task);
+ btr_defragment_active = true;
+}
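+
+/* For example (hypothetical setting): with srv_defragment_frequency = 40,
+the computation above yields srv_defragment_interval
+= 1000000000 / 40 = 25000000 ns, i.e. each index is defragmented at most
+once every 25 ms. */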
+
+/******************************************************************//**
+Shutdown defragmentation. Release all resources. */
+void
+btr_defragment_shutdown()
+{
+ if (!btr_defragment_timer)
+ return;
+ delete btr_defragment_timer;
+ btr_defragment_timer = 0;
+ task_group.cancel_pending(&btr_defragment_task);
+ mutex_enter(&btr_defragment_mutex);
+ std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ while(iter != btr_defragment_wq.end()) {
+ btr_defragment_item_t* item = *iter;
+ iter = btr_defragment_wq.erase(iter);
+ delete item;
+ }
+ mutex_exit(&btr_defragment_mutex);
+ mutex_free(&btr_defragment_mutex);
+ btr_defragment_active = false;
+}
+
+
+/******************************************************************//**
+Functions used by the query threads: btr_defragment_xxx_index
+Query threads find/add/remove index. */
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. We use index->id
+to identify indices. */
+bool
+btr_defragment_find_index(
+ dict_index_t* index) /*!< Index to find. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ btr_defragment_item_t* item = *iter;
+ btr_pcur_t* pcur = item->pcur;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ dict_index_t* idx = btr_cur_get_index(cursor);
+ if (index->id == idx->id) {
+ mutex_exit(&btr_defragment_mutex);
+ return true;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+ return false;
+}
+
+/******************************************************************//**
+Query thread uses this function to add an index to btr_defragment_wq.
+Returns an os_event for the query thread to wait on if this is a
+synchronous defragmentation. */
+os_event_t
+btr_defragment_add_index(
+ dict_index_t* index, /*!< index to be added */
+ dberr_t* err) /*!< out: error code */
+{
+ mtr_t mtr;
+ *err = DB_SUCCESS;
+
+ mtr_start(&mtr);
+ buf_block_t* block = btr_root_block_get(index, RW_NO_LATCH, &mtr);
+ page_t* page = NULL;
+
+ if (block) {
+ page = buf_block_get_frame(block);
+ }
+
+ if (page == NULL && !index->is_readable()) {
+ mtr_commit(&mtr);
+ *err = DB_DECRYPTION_FAILED;
+ return NULL;
+ }
+
+ ut_ad(fil_page_index_page_check(page));
+ ut_ad(!page_has_siblings(page));
+
+ if (page_is_leaf(page)) {
+ // Index root is a leaf page, no need to defragment.
+ mtr_commit(&mtr);
+ return NULL;
+ }
+ btr_pcur_t* pcur = btr_pcur_create_for_mysql();
+ os_event_t event = os_event_create(0);
+ btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur,
+ true, 0, &mtr);
+ btr_pcur_move_to_next(pcur, &mtr);
+ btr_pcur_store_position(pcur, &mtr);
+ mtr_commit(&mtr);
+ dict_stats_empty_defrag_summary(index);
+ btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event);
+ mutex_enter(&btr_defragment_mutex);
+ btr_defragment_wq.push_back(item);
+ if(btr_defragment_wq.size() == 1){
+ /* Kick off defragmentation work */
+ btr_defragment_start();
+ }
+ mutex_exit(&btr_defragment_mutex);
+ return event;
+}
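+
+/* A sketch of how a synchronous caller might use this (hypothetical caller
+code; error handling elided):
+
+	dberr_t		err;
+	os_event_t	event = btr_defragment_add_index(index, &err);
+	if (event) {
+		os_event_wait(event);	// set by ~btr_defragment_item_t()
+		os_event_destroy(event);
+	}
+*/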
+
+/******************************************************************//**
+When a table is dropped, this function is called to mark all of its indexes
+as removed in btr_defragment_wq. The difference between this function and
+btr_defragment_remove_index() is that this one does not clear the event. */
+void
+btr_defragment_remove_table(
+	dict_table_t*	table)	/*!< Table whose indexes are to be removed. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ btr_defragment_item_t* item = *iter;
+ btr_pcur_t* pcur = item->pcur;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ dict_index_t* idx = btr_cur_get_index(cursor);
+ if (table->id == idx->table->id) {
+ item->removed = true;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+}
+
+/******************************************************************//**
+Query thread uses this function to mark an index as removed in
+btr_defragment_wq. */
+void
+btr_defragment_remove_index(
+ dict_index_t* index) /*!< Index to be removed. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ btr_defragment_item_t* item = *iter;
+ btr_pcur_t* pcur = item->pcur;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ dict_index_t* idx = btr_cur_get_index(cursor);
+ if (index->id == idx->id) {
+ item->removed = true;
+ item->event = NULL;
+ break;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+}
+
+/******************************************************************//**
+Functions used by defragmentation thread: btr_defragment_xxx_item.
+Defragmentation thread operates on the work *item*. It gets/removes
+item from the work queue. */
+/******************************************************************//**
+Defragment thread uses this to remove an item from btr_defragment_wq.
+When an item is removed from the work queue, all resources associated with it
+are freed as well. */
+void
+btr_defragment_remove_item(
+ btr_defragment_item_t* item) /*!< Item to be removed. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ if (item == *iter) {
+ btr_defragment_wq.erase(iter);
+ delete item;
+ break;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+}
+
+/******************************************************************//**
+Defragment thread uses this to get an item from btr_defragment_wq to work on.
+The item is not removed from the work queue so query threads can still access
+this item. We keep it this way so query threads can find and kill a
+defragmentation even if that index is being worked on. Be aware that while you
+work on this item you have no lock protection on it whatsoever. This is OK as
+long as the query threads and defragment thread won't modify the same fields
+without lock protection.
+*/
+btr_defragment_item_t*
+btr_defragment_get_item()
+{
+	if (btr_defragment_wq.empty()) {
+		return NULL;
+	}
+	mutex_enter(&btr_defragment_mutex);
+	if (btr_defragment_wq.empty()) {
+		/* The queue was emptied between the unlocked check
+		above and acquiring the mutex. */
+		mutex_exit(&btr_defragment_mutex);
+		return NULL;
+	}
+	btr_defragment_item_t* item = btr_defragment_wq.front();
+	mutex_exit(&btr_defragment_mutex);
+	return item;
+}
+
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent storage.
+Currently the stats are saved once the counter of modified records reaches
+srv_defragment_stats_accuracy. */
+UNIV_INTERN
+void
+btr_defragment_save_defrag_stats_if_needed(
+ dict_index_t* index) /*!< in: index */
+{
+	if (srv_defragment_stats_accuracy != 0 // 0 means tracking disabled
+ && index->table->space_id != 0 // do not track system tables
+ && index->stat_defrag_modified_counter
+ >= srv_defragment_stats_accuracy) {
+ dict_stats_defrag_pool_add(index);
+ index->stat_defrag_modified_counter = 0;
+ }
+}
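+
+/* For example, with srv_defragment_stats_accuracy = 100, the check above
+adds the index to the stats-saving pool once at least 100 records have been
+modified since the last save, and then resets the counter. */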
+
+/*********************************************************************//**
+Main defragment functionalities used by defragment thread.*/
+/*************************************************************//**
+Calculate number of records from beginning of block that can
+fit into size_limit
+@return number of records */
+UNIV_INTERN
+ulint
+btr_defragment_calc_n_recs_for_size(
+ buf_block_t* block, /*!< in: B-tree page */
+ dict_index_t* index, /*!< in: index of the page */
+ ulint size_limit, /*!< in: size limit to fit records in */
+ ulint* n_recs_size) /*!< out: actual size of the records that fit
+ in size_limit. */
+{
+ page_t* page = buf_block_get_frame(block);
+ ulint n_recs = 0;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+ mem_heap_t* heap = NULL;
+ ulint size = 0;
+ page_cur_t cur;
+
+ const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
+ page_cur_set_before_first(block, &cur);
+ page_cur_move_to_next(&cur);
+ while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) {
+ rec_t* cur_rec = page_cur_get_rec(&cur);
+ offsets = rec_get_offsets(cur_rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ ulint rec_size = rec_offs_size(offsets);
+ size += rec_size;
+ if (size > size_limit) {
+ size = size - rec_size;
+ break;
+ }
+ n_recs ++;
+ page_cur_move_to_next(&cur);
+ }
+ *n_recs_size = size;
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return n_recs;
+}
+
+/*************************************************************//**
+Merge as many records as possible from from_block into to_block. Free
+from_block if all of its records are successfully merged into to_block.
+@return the to_block to target for the next merge operation. */
+static
+buf_block_t*
+btr_defragment_merge_pages(
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* from_block, /*!< in: origin of merge */
+ buf_block_t* to_block, /*!< in: destination of merge */
+ ulint zip_size, /*!< in: ROW_FORMAT=COMPRESSED size */
+ ulint reserved_space, /*!< in: space reserved for future
+ insert to avoid immediate page split */
+ ulint* max_data_size, /*!< in/out: max data size to
+ fit in a single compressed page. */
+ mem_heap_t* heap, /*!< in/out: pointer to memory heap */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_t* from_page = buf_block_get_frame(from_block);
+ page_t* to_page = buf_block_get_frame(to_block);
+ ulint level = btr_page_get_level(from_page);
+ ulint n_recs = page_get_n_recs(from_page);
+ ulint new_data_size = page_get_data_size(to_page);
+ ulint max_ins_size =
+ page_get_max_insert_size(to_page, n_recs);
+ ulint max_ins_size_reorg =
+ page_get_max_insert_size_after_reorganize(
+ to_page, n_recs);
+ ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space
+ ? max_ins_size_reorg - reserved_space : 0;
+ ulint move_size = 0;
+ ulint n_recs_to_move = 0;
+ rec_t* rec = NULL;
+ ulint target_n_recs = 0;
+ rec_t* orig_pred;
+
+ // Estimate how many records can be moved from the from_page to
+ // the to_page.
+ if (zip_size) {
+ ulint page_diff = srv_page_size - *max_data_size;
+ max_ins_size_to_use = (max_ins_size_to_use > page_diff)
+ ? max_ins_size_to_use - page_diff : 0;
+ }
+ n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+ from_block, index, max_ins_size_to_use, &move_size);
+
+ // If max_ins_size >= move_size, we can move the records without
+ // reorganizing the page, otherwise we need to reorganize the page
+ // first to release more space.
+ if (move_size > max_ins_size) {
+ if (!btr_page_reorganize_block(page_zip_level,
+ to_block, index,
+ mtr)) {
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(to_page)) {
+ ibuf_reset_free_bits(to_block);
+ }
+			// If reorganization fails, the page is not
+			// compressible. There is no point in trying
+			// to merge into this page. Continue to the
+			// next page.
+ return from_block;
+ }
+ ut_ad(page_validate(to_page, index));
+ max_ins_size = page_get_max_insert_size(to_page, n_recs);
+ ut_a(max_ins_size >= move_size);
+ }
+
+	// Move records to pack to_page as full as possible.
+ orig_pred = NULL;
+ target_n_recs = n_recs_to_move;
+ while (n_recs_to_move > 0) {
+ rec = page_rec_get_nth(from_page,
+ n_recs_to_move + 1);
+ orig_pred = page_copy_rec_list_start(
+ to_block, from_block, rec, index, mtr);
+ if (orig_pred)
+ break;
+ // If we reach here, that means compression failed after packing
+ // n_recs_to_move number of records to to_page. We try to reduce
+ // the targeted data size on the to_page by
+ // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
+ btr_defragment_compression_failures++;
+ max_ins_size_to_use =
+ move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+ ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+ : 0;
+ if (max_ins_size_to_use == 0) {
+ n_recs_to_move = 0;
+ move_size = 0;
+ break;
+ }
+ n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+ from_block, index, max_ins_size_to_use, &move_size);
+ }
+	// If fewer than target_n_recs records are moved, there were
+	// compression failures during page_copy_rec_list_start. Adjust
+	// the max_data_size estimate to reduce compression failures
+	// in the following runs.
+ if (target_n_recs > n_recs_to_move
+ && *max_data_size > new_data_size + move_size) {
+ *max_data_size = new_data_size + move_size;
+ }
+ // Set ibuf free bits if necessary.
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(to_page)) {
+ if (zip_size) {
+ ibuf_reset_free_bits(to_block);
+ } else {
+ ibuf_update_free_bits_if_full(
+ to_block,
+ srv_page_size,
+ ULINT_UNDEFINED);
+ }
+ }
+ btr_cur_t parent;
+ if (n_recs_to_move == n_recs) {
+ /* The whole page is merged with the previous page,
+ free it. */
+ lock_update_merge_left(to_block, orig_pred,
+ from_block);
+ btr_search_drop_page_hash_index(from_block);
+ btr_level_list_remove(*from_block, *index, mtr);
+ btr_page_get_father(index, from_block, mtr, &parent);
+ btr_cur_node_ptr_delete(&parent, mtr);
+ /* btr_blob_dbg_remove(from_page, index,
+ "btr_defragment_n_pages"); */
+ btr_page_free(index, from_block, mtr);
+ } else {
+ // There are still records left on the page, so
+ // increment n_defragmented. Node pointer will be changed
+ // so remove the old node pointer.
+ if (n_recs_to_move > 0) {
+ // Part of the page is merged to left, remove
+ // the merged records, update record locks and
+ // node pointer.
+ dtuple_t* node_ptr;
+ page_delete_rec_list_start(rec, from_block,
+ index, mtr);
+ lock_update_split_and_merge(to_block,
+ orig_pred,
+ from_block);
+ // FIXME: reuse the node_ptr!
+ btr_page_get_father(index, from_block, mtr, &parent);
+ btr_cur_node_ptr_delete(&parent, mtr);
+ rec = page_rec_get_next(
+ page_get_infimum_rec(from_page));
+ node_ptr = dict_index_build_node_ptr(
+ index, rec, page_get_page_no(from_page),
+ heap, level);
+ btr_insert_on_non_leaf_level(0, index, level+1,
+ node_ptr, mtr);
+ }
+ to_block = from_block;
+ }
+ return to_block;
+}
+
+/*************************************************************//**
+Tries to merge N consecutive pages, starting from the page pointed to by the
+cursor. Skips space 0. Only considers leaf pages.
+This function first loads all N pages into memory, then for each of
+the pages other than the first page, it tries to move as many records
+as possible to the left sibling to keep the left sibling full. During
+the process, if any page becomes empty, that page will be removed from
+the level list. Record locks, hash, and node pointers are updated after
+page reorganization.
+@return pointer to the last block processed, or NULL if reaching end of index */
+UNIV_INTERN
+buf_block_t*
+btr_defragment_n_pages(
+ buf_block_t* block, /*!< in: starting block for defragmentation */
+ dict_index_t* index, /*!< in: index tree */
+ uint n_pages,/*!< in: number of pages to defragment */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+	/* We may need to load n_pages + 1 blocks, because if the last page
+	is freed, we need to modify the prev_page_no of the block that
+	follows it. */
+ buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1];
+ page_t* first_page;
+ buf_block_t* current_block;
+ ulint total_data_size = 0;
+ ulint total_n_recs = 0;
+ ulint data_size_per_rec;
+ ulint optimal_page_size;
+ ulint reserved_space;
+ ulint max_data_size = 0;
+ uint n_defragmented = 0;
+ uint n_new_slots;
+ mem_heap_t* heap;
+ ibool end_of_index = FALSE;
+
+ /* It doesn't make sense to call this function with n_pages = 1. */
+ ut_ad(n_pages > 1);
+
+ if (!page_is_leaf(block->frame)) {
+ return NULL;
+ }
+
+ if (!index->table->space || !index->table->space_id) {
+ /* Ignore space 0. */
+ return NULL;
+ }
+
+ if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) {
+ n_pages = BTR_DEFRAGMENT_MAX_N_PAGES;
+ }
+
+ first_page = buf_block_get_frame(block);
+ const ulint zip_size = index->table->space->zip_size();
+
+ /* 1. Load the pages and calculate the total data size. */
+ blocks[0] = block;
+ for (uint i = 1; i <= n_pages; i++) {
+ page_t* page = buf_block_get_frame(blocks[i-1]);
+ uint32_t page_no = btr_page_get_next(page);
+ total_data_size += page_get_data_size(page);
+ total_n_recs += page_get_n_recs(page);
+ if (page_no == FIL_NULL) {
+ n_pages = i;
+ end_of_index = TRUE;
+ break;
+ }
+
+ blocks[i] = btr_block_get(*index, page_no, RW_X_LATCH, true,
+ mtr);
+ }
+
+ if (n_pages == 1) {
+ if (!page_has_prev(first_page)) {
+			/* This is the only page on this level. */
+			if (dict_index_get_page(index)
+			    == page_get_page_no(first_page))
+				return NULL;
+			/* The page is not the root;
+			lift the records to the father. */
+ btr_lift_page_up(index, block, mtr);
+ }
+ return NULL;
+ }
+
+	/* 2. Calculate how many pages the data can fit in. If not
+	compressible, return early. */
+ ut_a(total_n_recs != 0);
+ data_size_per_rec = total_data_size / total_n_recs;
+	// For uncompressed pages, the optimal data size is the free space
+	// of an empty page.
+ optimal_page_size = page_get_free_space_of_empty(
+ page_is_comp(first_page));
+ // For compressed pages, we take compression failures into account.
+ if (zip_size) {
+ ulint size = 0;
+ uint i = 0;
+		// We estimate the optimal data size of the index using
+		// samples of the data size. These samples are taken when
+		// pages fail to compress due to an insertion on the page.
+		// We use the average of all samples we have as the
+		// estimate. Different pages of the same index vary in
+		// compressibility; the average gives a good enough estimate.
+ for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) {
+ if (index->stat_defrag_data_size_sample[i] == 0) {
+ break;
+ }
+ size += index->stat_defrag_data_size_sample[i];
+ }
+ if (i != 0) {
+ size /= i;
+ optimal_page_size = ut_min(optimal_page_size, size);
+ }
+ max_data_size = optimal_page_size;
+ }
+
+ reserved_space = ut_min(static_cast<ulint>(
+ static_cast<double>(optimal_page_size)
+ * (1 - srv_defragment_fill_factor)),
+ (data_size_per_rec
+ * srv_defragment_fill_factor_n_recs));
+ optimal_page_size -= reserved_space;
+ n_new_slots = uint((total_data_size + optimal_page_size - 1)
+ / optimal_page_size);
+ if (n_new_slots >= n_pages) {
+ /* Can't defragment. */
+ if (end_of_index)
+ return NULL;
+ return blocks[n_pages-1];
+ }
+
+ /* 3. Defragment pages. */
+ heap = mem_heap_create(256);
+ // First defragmented page will be the first page.
+ current_block = blocks[0];
+ // Start from the second page.
+ for (uint i = 1; i < n_pages; i ++) {
+ buf_block_t* new_block = btr_defragment_merge_pages(
+ index, blocks[i], current_block, zip_size,
+ reserved_space, &max_data_size, heap, mtr);
+ if (new_block != current_block) {
+ n_defragmented ++;
+ current_block = new_block;
+ }
+ }
+ mem_heap_free(heap);
+ n_defragmented ++;
+ btr_defragment_count++;
+ if (n_pages == n_defragmented) {
+ btr_defragment_failures++;
+ } else {
+ index->stat_defrag_n_pages_freed += (n_pages - n_defragmented);
+ }
+ if (end_of_index)
+ return NULL;
+ return current_block;
+}
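+
+/* Worked example of the step-2 check above (hypothetical numbers): suppose
+5 pages hold total_data_size = 30000 bytes and optimal_page_size, after
+subtracting reserved_space, is 12000 bytes. Then n_new_slots
+= (30000 + 12000 - 1) / 12000 = 3 < n_pages = 5, so the pages are merged;
+had n_new_slots been >= 5, the function would have given up on this
+chunk. */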
+
+
+
+void btr_defragment_start() {
+ if (!srv_defragment)
+ return;
+ ut_ad(!btr_defragment_wq.empty());
+ submit_defragment_task();
+}
+
+
+/**
+Callback used by defragment timer
+
+Throttling "sleep", is implemented via rescheduling the
+threadpool timer, which, when fired, will resume the work again,
+where it is left.
+
+The state (current item) is stored in function parameter.
+*/
+static void btr_defragment_chunk(void*)
+{
+ defragment_chunk_state_t* state = &defragment_chunk_state;
+
+ btr_pcur_t* pcur;
+ btr_cur_t* cursor;
+ dict_index_t* index;
+ mtr_t mtr;
+ buf_block_t* first_block;
+ buf_block_t* last_block;
+
+ while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+ if (!state->m_item) {
+ state->m_item = btr_defragment_get_item();
+ }
+ /* If an index is marked as removed, we remove it from the work
+ queue. No other thread could be using this item at this point so
+ it's safe to remove now. */
+ while (state->m_item && state->m_item->removed) {
+ btr_defragment_remove_item(state->m_item);
+ state->m_item = btr_defragment_get_item();
+ }
+ if (!state->m_item) {
+ /* Queue empty */
+ return;
+ }
+
+ pcur = state->m_item->pcur;
+ ulonglong now = my_interval_timer();
+ ulonglong elapsed = now - state->m_item->last_processed;
+
+ if (elapsed < srv_defragment_interval) {
+			/* If we see an index again before the interval
+			determined by the configured frequency is reached,
+			we just sleep until the interval passes. Since
+			defragmentation of all indices is queued on a single
+			thread, it is likely that the indices that follow
+			this one will not need to sleep again. */
+ int sleep_ms = (int)((srv_defragment_interval - elapsed) / 1000 / 1000);
+ if (sleep_ms) {
+ btr_defragment_timer->set_time(sleep_ms, 0);
+ return;
+ }
+ }
+ log_free_check();
+ mtr_start(&mtr);
+ cursor = btr_pcur_get_btr_cur(pcur);
+ index = btr_cur_get_index(cursor);
+ index->set_modified(mtr);
+ /* To follow the latching order defined in WL#6326, acquire index->lock X-latch.
+ This entitles us to acquire page latches in any order for the index. */
+ mtr_x_lock_index(index, &mtr);
+ /* This will acquire index->lock SX-latch, which per WL#6363 is allowed
+ when we are already holding the X-latch. */
+ btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr);
+ first_block = btr_cur_get_block(cursor);
+
+ last_block = btr_defragment_n_pages(first_block, index,
+ srv_defragment_n_pages,
+ &mtr);
+ if (last_block) {
+ /* If we haven't reached the end of the index,
+ place the cursor on the last record of last page,
+ store the cursor position, and put back in queue. */
+ page_t* last_page = buf_block_get_frame(last_block);
+ rec_t* rec = page_rec_get_prev(
+ page_get_supremum_rec(last_page));
+ ut_a(page_rec_is_user_rec(rec));
+ page_cur_position(rec, last_block,
+ btr_cur_get_page_cur(cursor));
+ btr_pcur_store_position(pcur, &mtr);
+ mtr_commit(&mtr);
+ /* Update the last_processed time of this index. */
+ state->m_item->last_processed = now;
+ } else {
+ dberr_t err = DB_SUCCESS;
+ mtr_commit(&mtr);
+ /* Reaching the end of the index. */
+ dict_stats_empty_defrag_stats(index);
+ err = dict_stats_save_defrag_stats(index);
+ if (err != DB_SUCCESS) {
+ ib::error() << "Saving defragmentation stats for table "
+ << index->table->name
+ << " index " << index->name()
+ << " failed with error " << err;
+ } else {
+ err = dict_stats_save_defrag_summary(index);
+
+ if (err != DB_SUCCESS) {
+ ib::error() << "Saving defragmentation summary for table "
+ << index->table->name
+ << " index " << index->name()
+ << " failed with error " << err;
+ }
+ }
+
+ btr_defragment_remove_item(state->m_item);
+ state->m_item = NULL;
+ }
+ }
+}
diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc
new file mode 100644
index 00000000..574998a9
--- /dev/null
+++ b/storage/innobase/btr/btr0pcur.cc
@@ -0,0 +1,681 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0pcur.cc
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#include "btr0pcur.h"
+#include "ut0byte.h"
+#include "rem0cmp.h"
+#include "trx0trx.h"
+
+/**************************************************************//**
+Allocates memory for a persistent cursor object and initializes the cursor.
+@return own: persistent cursor */
+btr_pcur_t*
+btr_pcur_create_for_mysql(void)
+/*============================*/
+{
+ btr_pcur_t* pcur;
+ DBUG_ENTER("btr_pcur_create_for_mysql");
+
+ pcur = (btr_pcur_t*) ut_malloc_nokey(sizeof(btr_pcur_t));
+
+ pcur->btr_cur.index = NULL;
+ btr_pcur_init(pcur);
+
+ DBUG_PRINT("btr_pcur_create_for_mysql", ("pcur: %p", pcur));
+ DBUG_RETURN(pcur);
+}
+
+/**************************************************************//**
+Resets a persistent cursor object, freeing ::old_rec_buf if it is
+allocated and resetting the other members to their initial values. */
+void
+btr_pcur_reset(
+/*===========*/
+ btr_pcur_t* cursor) /*!< in, out: persistent cursor */
+{
+ btr_pcur_free(cursor);
+ cursor->old_rec_buf = NULL;
+ cursor->btr_cur.index = NULL;
+ cursor->btr_cur.page_cur.rec = NULL;
+ cursor->old_rec = NULL;
+ cursor->old_n_core_fields = 0;
+ cursor->old_n_fields = 0;
+ cursor->old_stored = false;
+
+ cursor->latch_mode = BTR_NO_LATCHES;
+ cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+}
+
+/**************************************************************//**
+Frees the memory for a persistent cursor object. */
+void
+btr_pcur_free_for_mysql(
+/*====================*/
+ btr_pcur_t* cursor) /*!< in, own: persistent cursor */
+{
+ DBUG_ENTER("btr_pcur_free_for_mysql");
+ DBUG_PRINT("btr_pcur_free_for_mysql", ("pcur: %p", cursor));
+
+ btr_pcur_free(cursor);
+ ut_free(cursor);
+ DBUG_VOID_RETURN;
+}
+
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+void
+btr_pcur_store_position(
+/*====================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t* page_cursor;
+ buf_block_t* block;
+ rec_t* rec;
+ dict_index_t* index;
+ ulint offs;
+
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ block = btr_pcur_get_block(cursor);
+ index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
+
+ page_cursor = btr_pcur_get_page_cur(cursor);
+
+ rec = page_cur_get_rec(page_cursor);
+ offs = rec - block->frame;
+ ut_ad(block->page.id().page_no() == page_get_page_no(block->frame));
+ ut_ad(block->page.buf_fix_count());
+ /* For spatial index, when we do positioning on parent
+ buffer if necessary, it might not hold latches, but the
+ tree must be locked to prevent change on the page */
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX)
+ || (index->is_spatial()
+ && mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK)));
+
+ cursor->old_stored = true;
+
+ if (page_is_empty(block->frame)) {
+ /* It must be an empty index tree; NOTE that in this case
+ we do not store the modify_clock, but always do a search
+ if we restore the cursor position */
+
+ ut_a(!page_has_siblings(block->frame));
+ ut_ad(page_is_leaf(block->frame));
+ ut_ad(block->page.id().page_no() == index->page);
+
+ if (page_rec_is_supremum_low(offs)) {
+ cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+ } else {
+before_first:
+ cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE;
+ }
+
+ return;
+ }
+
+ if (page_rec_is_supremum_low(offs)) {
+ rec = page_rec_get_prev(rec);
+
+ ut_ad(!page_rec_is_infimum(rec));
+ if (UNIV_UNLIKELY(rec_is_metadata(rec, *index))) {
+#if 0 /* MDEV-22867 had to relax this */
+ /* If the table is emptied during an ALGORITHM=NOCOPY
+ DROP COLUMN ... that is not ALGORITHM=INSTANT,
+ then we must preserve any instant ADD metadata. */
+ ut_ad(index->table->instant
+ || block->page.id().page_no() != index->page);
+#endif
+ ut_ad(index->is_instant()
+ || block->page.id().page_no() != index->page);
+ ut_ad(page_get_n_recs(block->frame) == 1);
+ ut_ad(page_is_leaf(block->frame));
+ ut_ad(!page_has_prev(block->frame));
+ cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+ return;
+ }
+
+ cursor->rel_pos = BTR_PCUR_AFTER;
+ } else if (page_rec_is_infimum_low(offs)) {
+ rec = page_rec_get_next(rec);
+
+ if (rec_is_metadata(rec, *index)) {
+ ut_ad(!page_has_prev(block->frame));
+ rec = page_rec_get_next(rec);
+ if (page_rec_is_supremum(rec)) {
+ goto before_first;
+ }
+ }
+
+ cursor->rel_pos = BTR_PCUR_BEFORE;
+ } else {
+ cursor->rel_pos = BTR_PCUR_ON;
+ }
+
+ if (index->is_ibuf()) {
+ ut_ad(!index->table->not_redundant());
+ cursor->old_n_fields = uint16_t(rec_get_n_fields_old(rec));
+ } else {
+ cursor->old_n_fields = static_cast<uint16>(
+ dict_index_get_n_unique_in_tree(index));
+ if (index->is_spatial() && !page_rec_is_leaf(rec)) {
+ ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index)
+ == DICT_INDEX_SPATIAL_NODEPTR_SIZE);
+ /* For R-tree, we have to compare
+ the child page numbers as well. */
+ cursor->old_n_fields
+ = DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
+ }
+ }
+
+ cursor->old_n_core_fields = index->n_core_fields;
+ cursor->old_rec = rec_copy_prefix_to_buf(rec, index,
+ cursor->old_n_fields,
+ &cursor->old_rec_buf,
+ &cursor->buf_size);
+ cursor->block_when_stored.store(block);
+
+	/* buf_block_get_modify_clock() asserts in debug builds that
+	the block is S- or X-latched. */
+ cursor->modify_clock = buf_block_get_modify_clock(block);
+}
+
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+ btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the
+ position info */
+ btr_pcur_t* pcur_donate) /*!< in: pcur from which the info is
+ copied */
+{
+ ut_free(pcur_receive->old_rec_buf);
+ memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t));
+
+ if (pcur_donate->old_rec_buf) {
+
+ pcur_receive->old_rec_buf = (byte*)
+ ut_malloc_nokey(pcur_donate->buf_size);
+
+ memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf,
+ pcur_donate->buf_size);
+ pcur_receive->old_rec = pcur_receive->old_rec_buf
+ + (pcur_donate->old_rec - pcur_donate->old_rec_buf);
+ }
+
+ pcur_receive->old_n_core_fields = pcur_donate->old_n_core_fields;
+ pcur_receive->old_n_fields = pcur_donate->old_n_fields;
+}
+
+/** Structure acts as functor to do the latching of leaf pages.
+It returns true if latching of leaf pages succeeded and false
+otherwise. */
+struct optimistic_latch_leaves
+{
+ btr_pcur_t *const cursor;
+ ulint *latch_mode;
+ mtr_t *const mtr;
+
+ optimistic_latch_leaves(btr_pcur_t *cursor, ulint *latch_mode, mtr_t *mtr)
+ :cursor(cursor), latch_mode(latch_mode), mtr(mtr) {}
+
+ bool operator() (buf_block_t *hint) const
+ {
+ return hint && btr_cur_optimistic_latch_leaves(
+ hint, cursor->modify_clock, latch_mode,
+ btr_pcur_get_btr_cur(cursor), __FILE__, __LINE__, mtr);
+ }
+};
+
+/**************************************************************//**
+Restores the stored position of a persistent cursor buffer-fixing the page and
+obtaining the specified latches. If the cursor position was saved when the
+(1) cursor was positioned on a user record: this function restores the position
+to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the position to
+the last record LESS than the user record which was the successor of the page
+infimum;
+(3) cursor was positioned on the page supremum: restores to the first record
+GREATER than the user record which was the predecessor of the supremum.
+(4) cursor was positioned before the first or after the last in an empty tree:
+restores to before first or after the last in the tree.
+@return TRUE if the cursor position was stored when it was on a user
+record and it can be restored on a user record whose ordering fields
+are identical to the ones of the original user record */
+ibool
+btr_pcur_restore_position_func(
+/*===========================*/
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: detached persistent cursor */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ dtuple_t* tuple;
+ page_cur_mode_t mode;
+ page_cur_mode_t old_mode;
+ mem_heap_t* heap;
+
+ ut_ad(mtr->is_active());
+ //ut_ad(cursor->old_stored);
+ ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED
+ || cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
+
+ if (UNIV_UNLIKELY
+ (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
+ || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) {
+ dberr_t err = DB_SUCCESS;
+
+ /* In these cases we do not try an optimistic restoration,
+ but always do a search */
+
+ err = btr_cur_open_at_index_side(
+ cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE,
+ index, latch_mode,
+ btr_pcur_get_btr_cur(cursor), 0, mtr);
+
+ if (err != DB_SUCCESS) {
+ ib::warn() << " Error code: " << err
+ << " btr_pcur_restore_position_func "
+ << " called from file: "
+ << file << " line: " << line
+ << " table: " << index->table->name
+ << " index: " << index->name;
+ }
+
+ cursor->latch_mode =
+ BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+ cursor->block_when_stored.clear();
+
+ return(FALSE);
+ }
+
+ ut_a(cursor->old_rec);
+ ut_a(cursor->old_n_core_fields);
+ ut_a(cursor->old_n_core_fields <= index->n_core_fields);
+ ut_a(cursor->old_n_fields);
+
+ switch (latch_mode) {
+ case BTR_SEARCH_LEAF:
+ case BTR_MODIFY_LEAF:
+ case BTR_SEARCH_PREV:
+ case BTR_MODIFY_PREV:
+ /* Try optimistic restoration. */
+
+ if (cursor->block_when_stored.run_with_hint(
+ optimistic_latch_leaves(cursor, &latch_mode,
+ mtr))) {
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+ cursor->latch_mode = latch_mode;
+
+ buf_block_dbg_add_level(
+ btr_pcur_get_block(cursor),
+ dict_index_is_ibuf(index)
+ ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
+
+ if (cursor->rel_pos == BTR_PCUR_ON) {
+#ifdef UNIV_DEBUG
+ const rec_t* rec;
+ rec_offs offsets1_[REC_OFFS_NORMAL_SIZE];
+ rec_offs offsets2_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets1 = offsets1_;
+ rec_offs* offsets2 = offsets2_;
+ rec = btr_pcur_get_rec(cursor);
+
+ rec_offs_init(offsets1_);
+ rec_offs_init(offsets2_);
+
+ heap = mem_heap_create(256);
+ ut_ad(cursor->old_n_core_fields
+ == index->n_core_fields);
+
+ offsets1 = rec_get_offsets(
+ cursor->old_rec, index, offsets1,
+ cursor->old_n_core_fields,
+ cursor->old_n_fields, &heap);
+ offsets2 = rec_get_offsets(
+ rec, index, offsets2,
+ index->n_core_fields,
+ cursor->old_n_fields, &heap);
+
+ ut_ad(!cmp_rec_rec(cursor->old_rec,
+ rec, offsets1, offsets2,
+ index));
+ mem_heap_free(heap);
+#endif /* UNIV_DEBUG */
+ return(TRUE);
+ }
+ /* This is the same record as stored,
+ may need to be adjusted for BTR_PCUR_BEFORE/AFTER,
+ depending on search mode and direction. */
+ if (btr_pcur_is_on_user_rec(cursor)) {
+ cursor->pos_state
+ = BTR_PCUR_IS_POSITIONED_OPTIMISTIC;
+ }
+ return(FALSE);
+ }
+ }
+
+ /* If optimistic restoration did not succeed, open the cursor anew */
+
+ heap = mem_heap_create(256);
+
+ tuple = dtuple_create(heap, cursor->old_n_fields);
+
+ dict_index_copy_types(tuple, index, cursor->old_n_fields);
+
+ rec_copy_prefix_to_dtuple(tuple, cursor->old_rec, index,
+ cursor->old_n_core_fields,
+ cursor->old_n_fields, heap);
+ ut_ad(dtuple_check_typed(tuple));
+
+ /* Save the old search mode of the cursor */
+ old_mode = cursor->search_mode;
+
+ switch (cursor->rel_pos) {
+ case BTR_PCUR_ON:
+ mode = PAGE_CUR_LE;
+ break;
+ case BTR_PCUR_AFTER:
+ mode = PAGE_CUR_G;
+ break;
+ case BTR_PCUR_BEFORE:
+ mode = PAGE_CUR_L;
+ break;
+ default:
+ ut_error;
+ mode = PAGE_CUR_UNSUPP;
+ }
+
+ btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode,
+ cursor,
+#ifdef BTR_CUR_HASH_ADAPT
+ NULL,
+#endif /* BTR_CUR_HASH_ADAPT */
+ file, line, mtr);
+
+ /* Restore the old search mode */
+ cursor->search_mode = old_mode;
+
+ ut_ad(cursor->rel_pos == BTR_PCUR_ON
+ || cursor->rel_pos == BTR_PCUR_BEFORE
+ || cursor->rel_pos == BTR_PCUR_AFTER);
+ rec_offs offsets[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets);
+ if (cursor->rel_pos == BTR_PCUR_ON
+ && btr_pcur_is_on_user_rec(cursor)
+ && !cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor),
+ rec_get_offsets(btr_pcur_get_rec(cursor),
+ index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap))) {
+
+ /* We have to store the NEW value for the modify clock,
+ since the cursor can now be on a different page!
+ But we can retain the value of old_rec */
+
+ cursor->block_when_stored.store(btr_pcur_get_block(cursor));
+ cursor->modify_clock = buf_block_get_modify_clock(
+ cursor->block_when_stored.block());
+ cursor->old_stored = true;
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+
+ mem_heap_free(heap);
+
+ /* We have to store new position information, modify_clock etc.,
+ to the cursor because it can now be on a different page, the record
+ under it may have been removed, etc. */
+
+ btr_pcur_store_position(cursor, mtr);
+
+ return(FALSE);
+}
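+
+/* The typical store/restore pattern, also used by
+btr_pcur_move_backward_from_page() below: save the position, commit the
+mini-transaction to release all latches, and later restore the position
+inside a new mini-transaction:
+
+	btr_pcur_store_position(cursor, mtr);
+	mtr_commit(mtr);
+	mtr_start(mtr);
+	btr_pcur_restore_position(latch_mode, cursor, mtr);
+*/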
+
+/*********************************************************//**
+Moves the persistent cursor to the first record on the next page. Releases the
+latch on the current page, and buffer-unfixes it. Note that there must not be
+modifications on the current page, as then the x-latch can be released only in
+mtr_commit. */
+void
+btr_pcur_move_to_next_page(
+/*=======================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the
+ last record of the current page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ ut_ad(btr_pcur_is_after_last_on_page(cursor));
+
+ cursor->old_stored = false;
+
+ const page_t* page = btr_pcur_get_page(cursor);
+
+ if (UNIV_UNLIKELY(!page)) {
+ return;
+ }
+
+ const uint32_t next_page_no = btr_page_get_next(page);
+
+ ut_ad(next_page_no != FIL_NULL);
+
+ ulint mode = cursor->latch_mode;
+ switch (mode) {
+ case BTR_SEARCH_TREE:
+ mode = BTR_SEARCH_LEAF;
+ break;
+ case BTR_MODIFY_TREE:
+ mode = BTR_MODIFY_LEAF;
+ }
+
+ buf_block_t* next_block = btr_block_get(
+ *btr_pcur_get_btr_cur(cursor)->index, next_page_no, mode,
+ page_is_leaf(page), mtr);
+
+ if (UNIV_UNLIKELY(!next_block)) {
+ return;
+ }
+
+ const page_t* next_page = buf_block_get_frame(next_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(next_page) == page_is_comp(page));
+ ut_a(btr_page_get_prev(next_page)
+ == btr_pcur_get_block(cursor)->page.id().page_no());
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_leaf_page_release(btr_pcur_get_block(cursor), mode, mtr);
+
+ page_cur_set_before_first(next_block, btr_pcur_get_page_cur(cursor));
+
+ ut_d(page_check_dir(next_page));
+}
+
+/*********************************************************//**
+Moves the persistent cursor backward if it is on the first record of the page.
+Commits mtr. Note that to prevent a possible deadlock, the operation
+first stores the position of the cursor, commits mtr, acquires the necessary
+latches and restores the cursor position again before returning. The
+alphabetical position of the cursor is guaranteed to be sensible on
+return, but it may happen that the cursor is not positioned on the last
+record of any page, because the structure of the tree may have changed
+during the time when the cursor had no latches. */
+static
+void
+btr_pcur_move_backward_from_page(
+/*=============================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the first
+ record of the current page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint prev_page_no;
+ page_t* page;
+ buf_block_t* prev_block;
+ ulint latch_mode;
+ ulint latch_mode2;
+
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ ut_ad(btr_pcur_is_before_first_on_page(cursor));
+ ut_ad(!btr_pcur_is_before_first_in_tree(cursor));
+
+ latch_mode = cursor->latch_mode;
+
+ if (latch_mode == BTR_SEARCH_LEAF) {
+
+ latch_mode2 = BTR_SEARCH_PREV;
+
+ } else if (latch_mode == BTR_MODIFY_LEAF) {
+
+ latch_mode2 = BTR_MODIFY_PREV;
+ } else {
+ latch_mode2 = 0; /* To eliminate compiler warning */
+ ut_error;
+ }
+
+ btr_pcur_store_position(cursor, mtr);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ btr_pcur_restore_position(latch_mode2, cursor, mtr);
+
+ page = btr_pcur_get_page(cursor);
+
+ prev_page_no = btr_page_get_prev(page);
+
+ if (prev_page_no == FIL_NULL) {
+ } else if (btr_pcur_is_before_first_on_page(cursor)) {
+
+ prev_block = btr_pcur_get_btr_cur(cursor)->left_block;
+
+ btr_leaf_page_release(btr_pcur_get_block(cursor),
+ latch_mode, mtr);
+
+ page_cur_set_after_last(prev_block,
+ btr_pcur_get_page_cur(cursor));
+ } else {
+
+ /* The repositioned cursor did not end on an infimum
+ record on a page. Cursor repositioning acquired a latch
+ also on the previous page, but we do not need the latch:
+ release it. */
+
+ prev_block = btr_pcur_get_btr_cur(cursor)->left_block;
+
+ btr_leaf_page_release(prev_block, latch_mode, mtr);
+ }
+
+ cursor->latch_mode = latch_mode;
+ cursor->old_stored = false;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'.
+@return TRUE if the cursor was not before first in tree */
+ibool
+btr_pcur_move_to_prev(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ cursor->old_stored = false;
+
+ if (btr_pcur_is_before_first_on_page(cursor)) {
+
+ if (btr_pcur_is_before_first_in_tree(cursor)) {
+
+ return(FALSE);
+ }
+
+ btr_pcur_move_backward_from_page(cursor, mtr);
+
+ return(TRUE);
+ }
+
+ btr_pcur_move_to_prev_on_page(cursor);
+
+ return(TRUE);
+}
+
+/**************************************************************//**
+If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first
+user record satisfying the search condition, in the case PAGE_CUR_L or
+PAGE_CUR_LE, on the last user record. If no such user record exists, then
+in the first case sets the cursor after last in tree, and in the latter case
+before first in tree. The latching mode must be BTR_SEARCH_LEAF or
+BTR_MODIFY_LEAF. */
+void
+btr_pcur_open_on_user_rec_func(
+/*===========================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ... */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent
+ cursor */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_pcur_open_low(index, 0, tuple, mode, latch_mode, cursor,
+ file, line, 0, mtr);
+
+ if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) {
+
+ if (btr_pcur_is_after_last_on_page(cursor)) {
+
+ btr_pcur_move_to_next_user_rec(cursor, mtr);
+ }
+ } else {
+ ut_ad((mode == PAGE_CUR_LE) || (mode == PAGE_CUR_L));
+
+ /* Not implemented yet */
+
+ ut_error;
+ }
+}
diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc
new file mode 100644
index 00000000..f22e3a59
--- /dev/null
+++ b/storage/innobase/btr/btr0sea.cc
@@ -0,0 +1,2372 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file btr/btr0sea.cc
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "btr0sea.h"
+#ifdef BTR_CUR_HASH_ADAPT
+#include "buf0buf.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "srv0mon.h"
+
+/** Whether the adaptive hash index search system is enabled.
+The search system is protected by an array of latches. */
+char btr_search_enabled;
+
+/** Number of adaptive hash index partitions. */
+ulong btr_ahi_parts;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+ulint btr_search_n_succ = 0;
+/** Number of failed adaptive hash index lookups */
+ulint btr_search_n_hash_fail = 0;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+/** The adaptive hash index */
+btr_search_sys_t btr_search_sys;
+
+/** If the number of potentially successful hash lookups on a page exceeds
+the number of records on the page divided by this parameter, a hash index
+is built on the page, provided the global limit below has also been reached */
+#define BTR_SEARCH_PAGE_BUILD_LIMIT 16U
+
+/** The global limit for consecutive potentially successful hash searches,
+before hash index building is started */
+#define BTR_SEARCH_BUILD_LIMIT 100U
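+
+/* For illustration (hypothetical page): with the defaults above, a page
+holding 160 user records becomes a candidate for hashing once
+block->n_hash_helps exceeds 160 / BTR_SEARCH_PAGE_BUILD_LIMIT = 10
+potentially successful accesses, and only after info->n_hash_potential
+has reached BTR_SEARCH_BUILD_LIMIT = 100 consecutive potential successes;
+see btr_search_update_block_hash_info() below. */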
+
+/** Compute a hash value of a record in a page.
+@param[in] rec index record
+@param[in] offsets return value of rec_get_offsets()
+@param[in] n_fields number of complete fields to fold
+@param[in] n_bytes number of bytes to fold in the last field
+@param[in] tree_id index tree ID
+@return the hash value */
+static inline
+ulint
+rec_fold(
+ const rec_t* rec,
+ const rec_offs* offsets,
+ ulint n_fields,
+ ulint n_bytes,
+ index_id_t tree_id)
+{
+ ulint i;
+ const byte* data;
+ ulint len;
+ ulint fold;
+ ulint n_fields_rec;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_validate(rec, offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!page_rec_is_metadata(rec));
+ ut_ad(n_fields > 0 || n_bytes > 0);
+
+ n_fields_rec = rec_offs_n_fields(offsets);
+ ut_ad(n_fields <= n_fields_rec);
+ ut_ad(n_fields < n_fields_rec || n_bytes == 0);
+
+ if (n_fields > n_fields_rec) {
+ n_fields = n_fields_rec;
+ }
+
+ if (n_fields == n_fields_rec) {
+ n_bytes = 0;
+ }
+
+ fold = ut_fold_ull(tree_id);
+
+ for (i = 0; i < n_fields; i++) {
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ if (n_bytes > 0) {
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len > n_bytes) {
+ len = n_bytes;
+ }
+
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ return(fold);
+}
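+
+/* For illustration (hypothetical prefix): with n_fields=2 and n_bytes=3,
+the fold of a record is built from the index tree id, the complete first
+two fields, and the first 3 bytes of the third field:
+
+	fold = ut_fold_ull(tree_id);
+	fold = ut_fold_ulint_pair(fold, ut_fold_binary(field0, len0));
+	fold = ut_fold_ulint_pair(fold, ut_fold_binary(field1, len1));
+	fold = ut_fold_ulint_pair(fold, ut_fold_binary(field2, 3));
+
+SQL NULL fields contribute nothing to the fold. */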
+
+/** Determine the number of accessed key fields.
+@param[in] n_fields number of complete fields
+@param[in] n_bytes number of bytes in an incomplete last field
+@return number of complete or incomplete fields */
+inline MY_ATTRIBUTE((warn_unused_result))
+ulint
+btr_search_get_n_fields(
+ ulint n_fields,
+ ulint n_bytes)
+{
+ return(n_fields + (n_bytes > 0 ? 1 : 0));
+}
+
+/** Determine the number of accessed key fields.
+@param[in] cursor b-tree cursor
+@return number of complete or incomplete fields */
+inline MY_ATTRIBUTE((warn_unused_result))
+ulint
+btr_search_get_n_fields(
+ const btr_cur_t* cursor)
+{
+ return(btr_search_get_n_fields(cursor->n_fields, cursor->n_bytes));
+}
+
+/** This function should be called before reserving any btr search mutex, if
+the intended operation might add nodes to the search system hash table.
+Because of the latching order, once we have reserved the btr search system
+latch, we cannot allocate a free frame from the buffer pool. Checks that
+there is a free buffer frame allocated for the hash table heap in the btr
+search system. If not, allocates a free frame for the heap. This check makes
+it probable that, when we have reserved the btr search system latch and need
+to allocate a new node to the hash table, the allocation will succeed.
+However, the check does not guarantee success.
+@param[in] index index handler */
+static void btr_search_check_free_space_in_heap(const dict_index_t *index)
+{
+ /* Note that we peek the value of heap->free_block without reserving
+ the latch: this is ok, because we will not guarantee that there will
+ be enough free space in the hash table. */
+
+ buf_block_t *block= buf_block_alloc();
+ auto part= btr_search_sys.get_part(*index);
+
+ rw_lock_x_lock(&part->latch);
+
+ if (!btr_search_enabled || part->heap->free_block)
+ buf_block_free(block);
+ else
+ part->heap->free_block= block;
+
+ rw_lock_x_unlock(&part->latch);
+}
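+
+/* A typical call sequence (sketch; see btr_search_build_page_hash_index()
+and btr_search_update_hash_on_insert() for actual uses). The free-space
+check must happen before the partition latch is taken, because a buffer
+frame cannot be allocated while holding that latch:
+
+	btr_search_check_free_space_in_heap(index);
+	rw_lock_x_lock(&part->latch);
+	... ha_insert_for_fold(&part->table, part->heap, fold, block, rec) ...
+	rw_lock_x_unlock(&part->latch);
+*/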
+
+/** Set index->ref_count = 0 on all indexes of a table.
+@param[in,out] table table handler */
+static void btr_search_disable_ref_count(dict_table_t *table)
+{
+ for (dict_index_t *index= dict_table_get_first_index(table); index;
+ index= dict_table_get_next_index(index))
+ index->search_info->ref_count= 0;
+}
+
+/** Lazily free detached metadata when removing the last reference. */
+ATTRIBUTE_COLD static void btr_search_lazy_free(dict_index_t *index)
+{
+ ut_ad(index->freed());
+ dict_table_t *table= index->table;
+ /* Perform the skipped steps of dict_index_remove_from_cache_low(). */
+ UT_LIST_REMOVE(table->freed_indexes, index);
+ rw_lock_free(&index->lock);
+ dict_mem_index_free(index);
+
+ if (!UT_LIST_GET_LEN(table->freed_indexes) &&
+ !UT_LIST_GET_LEN(table->indexes))
+ {
+ ut_ad(table->id == 0);
+ dict_mem_table_free(table);
+ }
+}
+
+/** Disable the adaptive hash search system and empty the index. */
+void btr_search_disable()
+{
+ dict_table_t* table;
+
+ mutex_enter(&dict_sys.mutex);
+
+ btr_search_x_lock_all();
+
+ if (!btr_search_enabled) {
+ mutex_exit(&dict_sys.mutex);
+ btr_search_x_unlock_all();
+ return;
+ }
+
+ btr_search_enabled = false;
+
+ /* Clear the index->search_info->ref_count of every index in
+ the data dictionary cache. */
+ for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU); table;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ btr_search_disable_ref_count(table);
+ }
+
+ for (table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); table;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ btr_search_disable_ref_count(table);
+ }
+
+ mutex_exit(&dict_sys.mutex);
+
+ /* Set all block->index = NULL. */
+ buf_pool.clear_hash_index();
+
+ /* Clear the adaptive hash index. */
+ btr_search_sys.clear();
+
+ btr_search_x_unlock_all();
+}
+
+/** Enable the adaptive hash search system.
+@param resize whether buf_pool_t::resize() is the caller */
+void btr_search_enable(bool resize)
+{
+ if (!resize) {
+ mysql_mutex_lock(&buf_pool.mutex);
+ bool changed = srv_buf_pool_old_size != srv_buf_pool_size;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ if (changed) {
+ return;
+ }
+ }
+
+ btr_search_x_lock_all();
+ ulint hash_size = buf_pool_get_curr_size() / sizeof(void *) / 64;
+
+ if (btr_search_sys.parts[0].heap) {
+ ut_ad(btr_search_enabled);
+ btr_search_x_unlock_all();
+ return;
+ }
+
+ btr_search_sys.alloc(hash_size);
+
+ btr_search_enabled = true;
+ btr_search_x_unlock_all();
+}
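+
+/* For illustration (hypothetical configuration): with a 128 MiB buffer
+pool and 8-byte pointers, hash_size = 134217728 / 8 / 64 = 262144, i.e.
+roughly one hash cell per 512 bytes of buffer pool. */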
+
+/** Updates the search info of an index about hash successes. NOTE that info
+is NOT protected by any semaphore, to save CPU time! Do not assume its fields
+are consistent.
+@param[in,out] info search info
+@param[in] cursor cursor which was just positioned */
+static
+void
+btr_search_info_update_hash(
+ btr_search_t* info,
+ btr_cur_t* cursor)
+{
+ dict_index_t* index = cursor->index;
+ int cmp;
+
+ ut_ad(!btr_search_own_any(RW_LOCK_S));
+ ut_ad(!btr_search_own_any(RW_LOCK_X));
+
+ if (dict_index_is_ibuf(index)) {
+ /* So many deletes are performed on an insert buffer tree
+ that we do not consider a hash index useful on it: */
+
+ return;
+ }
+
+ uint16_t n_unique = dict_index_get_n_unique_in_tree(index);
+
+ if (info->n_hash_potential == 0) {
+
+ goto set_new_recomm;
+ }
+
+ /* Test if the search would have succeeded using the recommended
+ hash prefix */
+
+ if (info->n_fields >= n_unique && cursor->up_match >= n_unique) {
+increment_potential:
+ info->n_hash_potential++;
+
+ return;
+ }
+
+ cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+ cursor->low_match, cursor->low_bytes);
+
+ if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+ goto set_new_recomm;
+ }
+
+ cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+ cursor->up_match, cursor->up_bytes);
+
+ if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+ goto increment_potential;
+ }
+
+set_new_recomm:
+ /* We have to set a new recommendation; skip the hash analysis
+ for a while to avoid unnecessary CPU time usage when there is no
+ chance for success */
+
+ info->hash_analysis = 0;
+
+ cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes,
+ cursor->low_match, cursor->low_bytes);
+ info->left_side = cmp >= 0;
+ info->n_hash_potential = cmp != 0;
+
+ if (cmp == 0) {
+ /* For extra safety, we set some sensible values here */
+ info->n_fields = 1;
+ info->n_bytes = 0;
+ } else if (cmp > 0) {
+ info->n_hash_potential = 1;
+
+ if (cursor->up_match >= n_unique) {
+
+ info->n_fields = n_unique;
+ info->n_bytes = 0;
+
+ } else if (cursor->low_match < cursor->up_match) {
+
+ info->n_fields = static_cast<uint16_t>(
+ cursor->low_match + 1);
+ info->n_bytes = 0;
+ } else {
+ info->n_fields = static_cast<uint16_t>(
+ cursor->low_match);
+ info->n_bytes = static_cast<uint16_t>(
+ cursor->low_bytes + 1);
+ }
+ } else {
+ if (cursor->low_match >= n_unique) {
+
+ info->n_fields = n_unique;
+ info->n_bytes = 0;
+ } else if (cursor->low_match > cursor->up_match) {
+
+ info->n_fields = static_cast<uint16_t>(
+ cursor->up_match + 1);
+ info->n_bytes = 0;
+ } else {
+ info->n_fields = static_cast<uint16_t>(
+ cursor->up_match);
+ info->n_bytes = static_cast<uint16_t>(
+ cursor->up_bytes + 1);
+ }
+ }
+}
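+
+/* For illustration (hypothetical search): if a search positioned the
+cursor with cursor->low_match = 2 and cursor->up_match = 4 on an index
+with n_unique = 10, then cmp > 0 above, so the new recommendation is
+left_side = true, n_fields = low_match + 1 = 3, n_bytes = 0: a 3-field
+prefix is the shortest that still distinguishes the tuple from the
+records preceding it. */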
+
+/** Update the block search info on hash successes. NOTE that info and
+block->n_hash_helps, n_fields, n_bytes, left_side are NOT protected by any
+semaphore, to save CPU time! Do not assume the fields are consistent.
+@param[in,out] info search info
+@param[in,out] block buffer block
+@return true if building a (new) hash index on the block is recommended */
+static
+bool
+btr_search_update_block_hash_info(btr_search_t* info, buf_block_t* block)
+{
+ ut_ad(!btr_search_own_any());
+ ut_ad(rw_lock_own_flagged(&block->lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+
+ info->last_hash_succ = FALSE;
+ ut_d(auto state= block->page.state());
+ ut_ad(state == BUF_BLOCK_NOT_USED
+ || state == BUF_BLOCK_FILE_PAGE
+ || state == BUF_BLOCK_MEMORY
+ || state == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N);
+
+ if ((block->n_hash_helps > 0)
+ && (info->n_hash_potential > 0)
+ && (block->n_fields == info->n_fields)
+ && (block->n_bytes == info->n_bytes)
+ && (block->left_side == info->left_side)) {
+
+ if ((block->index)
+ && (block->curr_n_fields == info->n_fields)
+ && (block->curr_n_bytes == info->n_bytes)
+ && (block->curr_left_side == info->left_side)) {
+
+ /* The search would presumably have succeeded using
+ the hash index */
+
+ info->last_hash_succ = TRUE;
+ }
+
+ block->n_hash_helps++;
+ } else {
+ block->n_hash_helps = 1;
+ block->n_fields = info->n_fields;
+ block->n_bytes = info->n_bytes;
+ block->left_side = info->left_side;
+ }
+
+ if ((block->n_hash_helps > page_get_n_recs(block->frame)
+ / BTR_SEARCH_PAGE_BUILD_LIMIT)
+ && (info->n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)) {
+
+ if ((!block->index)
+ || (block->n_hash_helps
+ > 2U * page_get_n_recs(block->frame))
+ || (block->n_fields != block->curr_n_fields)
+ || (block->n_bytes != block->curr_n_bytes)
+ || (block->left_side != block->curr_left_side)) {
+
+ /* Build a new hash index on the page */
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Maximum number of records in a page */
+constexpr ulint MAX_N_POINTERS = UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+__attribute__((nonnull))
+/**
+Insert an entry into the hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@param table hash table
+@param heap memory heap
+@param fold folded value of the record
+@param block buffer block containing the record
+@param data the record
+@retval true on success
+@retval false if no more memory could be allocated */
+static bool ha_insert_for_fold(hash_table_t *table, mem_heap_t* heap,
+ ulint fold,
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t *block, /*!< buffer block of data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t *data)
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(block->frame == page_align(data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ ut_ad(btr_search_enabled);
+
+ hash_cell_t *cell= &table->array[table->calc_hash(fold)];
+
+ for (ha_node_t *prev= static_cast<ha_node_t*>(cell->node); prev;
+ prev= prev->next)
+ {
+ if (prev->fold == fold)
+ {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t *prev_block= prev->block;
+ ut_a(prev_block->frame == page_align(prev->data));
+ ut_a(prev_block->n_pointers-- < MAX_N_POINTERS);
+ ut_a(block->n_pointers++ < MAX_N_POINTERS);
+
+ prev->block= block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ prev->data= data;
+ return true;
+ }
+ }
+
+ /* We have to allocate a new chain node */
+ ha_node_t *node= static_cast<ha_node_t*>(mem_heap_alloc(heap, sizeof *node));
+
+ if (!node)
+ return false;
+
+ ha_node_set_data(node, block, data);
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(block->n_pointers++ < MAX_N_POINTERS);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ node->fold= fold;
+ node->next= nullptr;
+
+ ha_node_t *prev= static_cast<ha_node_t*>(cell->node);
+ if (!prev)
+ cell->node= node;
+ else
+ {
+ while (prev->next)
+ prev= prev->next;
+ prev->next= node;
+ }
+ return true;
+}
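+
+/* Note on the semantics above: two distinct records with equal fold
+values cannot coexist in the table; a second insert with the same fold
+simply redirects the existing node to the new record. For illustration
+(hypothetical fold values):
+
+	ha_insert_for_fold(t, h, 42, block, rec_a);	/* new node */
+	ha_insert_for_fold(t, h, 42, block, rec_b);	/* node now points
+							to rec_b */
+*/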
+
+__attribute__((nonnull))
+/** Delete a record.
+@param table hash table
+@param heap memory heap
+@param del_node record to be deleted */
+static void ha_delete_hash_node(hash_table_t *table, mem_heap_t *heap,
+ ha_node_t *del_node)
+{
+ ut_ad(btr_search_enabled);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(del_node->block->frame == page_align(del_node->data));
+ ut_a(del_node->block->n_pointers-- < MAX_N_POINTERS);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ const ulint fold= del_node->fold;
+
+ HASH_DELETE(ha_node_t, next, table, fold, del_node);
+
+ ha_node_t *top= static_cast<ha_node_t*>(mem_heap_get_top(heap, sizeof *top));
+
+ if (del_node != top)
+ {
+ /* Compact the heap of nodes by moving the top in the place of del_node. */
+ *del_node= *top;
+ hash_cell_t *cell= &table->array[table->calc_hash(top->fold)];
+
+ /* Look for the pointer to the top node, to update it */
+ if (cell->node == top)
+ /* The top node is the first in the chain */
+ cell->node= del_node;
+ else
+ {
+ /* We have to look for the predecessor */
+ ha_node_t *node= static_cast<ha_node_t*>(cell->node);
+
+ while (top != HASH_GET_NEXT(next, node))
+ node= static_cast<ha_node_t*>(HASH_GET_NEXT(next, node));
+
+ /* Now we have the predecessor node */
+ node->next= del_node;
+ }
+ }
+
+ /* Free the occupied space */
+ mem_heap_free_top(heap, sizeof *top);
+}
+
+__attribute__((nonnull))
+/** Delete all pointers to a page.
+@param table hash table
+@param heap memory heap
+@param fold fold value of the page records
+@param page page whose hash nodes are to be deleted */
+static void ha_remove_all_nodes_to_page(hash_table_t *table, mem_heap_t *heap,
+ ulint fold, const page_t *page)
+{
+ for (ha_node_t *node= ha_chain_get_first(table, fold); node; )
+ {
+ if (page_align(ha_node_get_data(node)) == page)
+ {
+ ha_delete_hash_node(table, heap, node);
+ /* The deletion may compact the heap of nodes and move other nodes! */
+ node= ha_chain_get_first(table, fold);
+ }
+ else
+ node= ha_chain_get_next(node);
+ }
+#ifdef UNIV_DEBUG
+ /* Check that all nodes really got deleted */
+ for (ha_node_t *node= ha_chain_get_first(table, fold); node;
+ node= ha_chain_get_next(node))
+ ut_ad(page_align(ha_node_get_data(node)) != page);
+#endif /* UNIV_DEBUG */
+}
+
+/** Delete a record if found.
+@param table hash table
+@param heap memory heap for the hash bucket chain
+@param fold folded value of the searched data
+@param data pointer to the record
+@return whether the record was found */
+static bool ha_search_and_delete_if_found(hash_table_t *table,
+ mem_heap_t *heap,
+ ulint fold, const rec_t *data)
+{
+ if (ha_node_t *node= ha_search_with_data(table, fold, data))
+ {
+ ha_delete_hash_node(table, heap, node);
+ return true;
+ }
+
+ return false;
+}
+
+__attribute__((nonnull))
+/** Looks for an element when we know the pointer to the data and
+updates the pointer to data if found.
+@param table hash table
+@param fold folded value of the searched data
+@param data pointer to the data
+@param new_data new pointer to the data
+@return whether the element was found */
+static bool ha_search_and_update_if_found(hash_table_t *table, ulint fold,
+ const rec_t *data,
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ /** block containing new_data */
+ buf_block_t *new_block,
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t *new_data)
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(new_block->frame == page_align(new_data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ if (!btr_search_enabled)
+ return false;
+
+ if (ha_node_t *node= ha_search_with_data(table, fold, data))
+ {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(node->block->n_pointers-- < MAX_N_POINTERS);
+ ut_a(new_block->n_pointers++ < MAX_N_POINTERS);
+ node->block= new_block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ node->data= new_data;
+
+ return true;
+ }
+
+ return false;
+}
+
+#if !defined UNIV_AHI_DEBUG && !defined UNIV_DEBUG
+# define ha_insert_for_fold(t,h,f,b,d) ha_insert_for_fold(t,h,f,d)
+# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
+ ha_search_and_update_if_found(table,fold,data,new_data)
+#endif
+
+/** Updates a hash node reference when it has been unsuccessfully used in a
+search which could have succeeded with the used hash parameters. This can
+happen because when building a hash index for a page, we do not check
+what happens at page boundaries, and therefore there can be misleading
+hash nodes. Also, collisions in the fold value can lead to misleading
+references. This function lazily fixes these imperfections in the hash
+index.
+@param[in] info search info
+@param[in] block buffer block where cursor positioned
+@param[in] cursor cursor */
+static
+void
+btr_search_update_hash_ref(
+ const btr_search_t* info,
+ buf_block_t* block,
+ const btr_cur_t* cursor)
+{
+ ut_ad(cursor->flag == BTR_CUR_HASH_FAIL);
+
+ ut_ad(rw_lock_own_flagged(&block->lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+ ut_ad(page_align(btr_cur_get_rec(cursor)) == block->frame);
+ ut_ad(page_is_leaf(block->frame));
+ assert_block_ahi_valid(block);
+
+ dict_index_t* index = block->index;
+
+ if (!index || !info->n_hash_potential) {
+ return;
+ }
+
+ if (index != cursor->index) {
+ ut_ad(index->id == cursor->index->id);
+ btr_search_drop_page_hash_index(block);
+ return;
+ }
+
+ ut_ad(block->page.id().space() == index->table->space_id);
+ ut_ad(index == cursor->index);
+ ut_ad(!dict_index_is_ibuf(index));
+ auto part = btr_search_sys.get_part(*index);
+ rw_lock_x_lock(&part->latch);
+ ut_ad(!block->index || block->index == index);
+
+ if (block->index
+ && (block->curr_n_fields == info->n_fields)
+ && (block->curr_n_bytes == info->n_bytes)
+ && (block->curr_left_side == info->left_side)
+ && btr_search_enabled) {
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ const rec_t* rec = btr_cur_get_rec(cursor);
+
+ if (!page_rec_is_user_rec(rec)) {
+ goto func_exit;
+ }
+
+ ulint fold = rec_fold(
+ rec,
+ rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap),
+ block->curr_n_fields,
+ block->curr_n_bytes, index->id);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ ha_insert_for_fold(&part->table, part->heap, fold, block, rec);
+
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
+ }
+
+func_exit:
+ rw_lock_x_unlock(&part->latch);
+}
+
+/** Checks if a guessed position for a tree cursor is right. Note that if
+mode is PAGE_CUR_LE, which is used in inserts, and the function returns
+true, then cursor->up_match and cursor->low_match both have sensible values.
+@param[in,out] cursor guess cursor position
+@param[in] can_only_compare_to_cursor_rec
+ if we do not have a latch on the page of cursor,
+ but a latch on the corresponding search system, then
+ ONLY the columns of the record UNDER the cursor
+ are protected, not the next or previous record
+ in the chain: we cannot look at the next or
+ previous record to check our guess!
+@param[in] tuple data tuple
+@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, PAGE_CUR_GE
+@return whether a match was found */
+static
+bool
+btr_search_check_guess(
+ btr_cur_t* cursor,
+ bool can_only_compare_to_cursor_rec,
+ const dtuple_t* tuple,
+ ulint mode)
+{
+ rec_t* rec;
+ ulint n_unique;
+ ulint match;
+ int cmp;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ ibool success = FALSE;
+ rec_offs_init(offsets_);
+
+ n_unique = dict_index_get_n_unique_in_tree(cursor->index);
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(page_rec_is_leaf(rec));
+
+ match = 0;
+
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ cursor->index->n_core_fields,
+ n_unique, &heap);
+ cmp = cmp_dtuple_rec_with_match(tuple, rec, offsets, &match);
+
+ if (mode == PAGE_CUR_GE) {
+ if (cmp > 0) {
+ goto exit_func;
+ }
+
+ cursor->up_match = match;
+
+ if (match >= n_unique) {
+ success = TRUE;
+ goto exit_func;
+ }
+ } else if (mode == PAGE_CUR_LE) {
+ if (cmp < 0) {
+ goto exit_func;
+ }
+
+ cursor->low_match = match;
+
+ } else if (mode == PAGE_CUR_G) {
+ if (cmp >= 0) {
+ goto exit_func;
+ }
+ } else if (mode == PAGE_CUR_L) {
+ if (cmp <= 0) {
+ goto exit_func;
+ }
+ }
+
+ if (can_only_compare_to_cursor_rec) {
+ /* Since we could not determine if our guess is right just by
+ looking at the record under the cursor, return FALSE */
+ goto exit_func;
+ }
+
+ match = 0;
+
+ if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) {
+ ut_ad(!page_rec_is_infimum(rec));
+
+ const rec_t* prev_rec = page_rec_get_prev(rec);
+
+ if (page_rec_is_infimum(prev_rec)) {
+ success = !page_has_prev(page_align(prev_rec));
+ goto exit_func;
+ }
+
+ offsets = rec_get_offsets(prev_rec, cursor->index, offsets,
+ cursor->index->n_core_fields,
+ n_unique, &heap);
+ cmp = cmp_dtuple_rec_with_match(
+ tuple, prev_rec, offsets, &match);
+ if (mode == PAGE_CUR_GE) {
+ success = cmp > 0;
+ } else {
+ success = cmp >= 0;
+ }
+ } else {
+ ut_ad(!page_rec_is_supremum(rec));
+
+ const rec_t* next_rec = page_rec_get_next(rec);
+
+ if (page_rec_is_supremum(next_rec)) {
+ if (!page_has_next(page_align(next_rec))) {
+ cursor->up_match = 0;
+ success = TRUE;
+ }
+
+ goto exit_func;
+ }
+
+ offsets = rec_get_offsets(next_rec, cursor->index, offsets,
+ cursor->index->n_core_fields,
+ n_unique, &heap);
+ cmp = cmp_dtuple_rec_with_match(
+ tuple, next_rec, offsets, &match);
+ if (mode == PAGE_CUR_LE) {
+ success = cmp < 0;
+ cursor->up_match = match;
+ } else {
+ success = cmp <= 0;
+ }
+ }
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(success);
+}
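+
+/* For illustration (hypothetical page contents): for mode PAGE_CUR_GE and
+tuple (5), a guessed cursor on record (5) is accepted only if the preceding
+record compares smaller, e.g. (3) < (5); if the preceding record is the page
+infimum, the guess is accepted only when the page has no left neighbour,
+since a smaller record might otherwise exist on the previous page. */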
+
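+/** Mark an adaptive hash index search as failed: set
+cursor->flag = BTR_CUR_HASH_FAIL and update the search statistics.
+@param[in,out] info search info
+@param[in,out] cursor cursor positioned by the failed hash lookup */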
+static
+void
+btr_search_failure(btr_search_t* info, btr_cur_t* cursor)
+{
+ cursor->flag = BTR_CUR_HASH_FAIL;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ ++info->n_hash_fail;
+
+ if (info->n_hash_succ > 0) {
+ --info->n_hash_succ;
+ }
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+ info->last_hash_succ = FALSE;
+}
+
+/** Clear the adaptive hash index on all pages in the buffer pool. */
+inline void buf_pool_t::clear_hash_index()
+{
+ ut_ad(btr_search_own_all(RW_LOCK_X));
+ ut_ad(!resizing);
+ ut_ad(!btr_search_enabled);
+
+ std::set<dict_index_t*> garbage;
+
+ for (chunk_t *chunk= chunks + n_chunks; chunk-- != chunks; )
+ {
+ for (buf_block_t *block= chunk->blocks, * const end= block + chunk->size;
+ block != end; block++)
+ {
+ dict_index_t *index= block->index;
+ assert_block_ahi_valid(block);
+
+ /* We can clear block->index and block->n_pointers when
+ btr_search_own_all(RW_LOCK_X); see the comments in buf0buf.h */
+
+ if (!index)
+ {
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(!block->n_pointers);
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ continue;
+ }
+
+ ut_d(buf_page_state state= block->page.state());
+ /* Another thread may have set the state to
+ BUF_BLOCK_REMOVE_HASH in buf_LRU_block_remove_hashed().
+
+ The state change in buf_pool_t::realloc() is not observable
+ here, because in that case we would have !block->index.
+
+ In the end, the entire adaptive hash index will be removed. */
+ ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH);
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ block->n_pointers= 0;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ if (index->freed())
+ garbage.insert(index);
+ block->index= nullptr;
+ }
+ }
+
+ for (dict_index_t *index : garbage)
+ btr_search_lazy_free(index);
+}
+
+/** Get a buffer block from an adaptive hash index pointer.
+This function does not return if the block is not identified.
+@param ptr pointer to within a page frame
+@return pointer to block, never NULL */
+inline buf_block_t* buf_pool_t::block_from_ahi(const byte *ptr) const
+{
+ chunk_t::map *chunk_map = chunk_t::map_ref;
+ ut_ad(chunk_t::map_ref == chunk_t::map_reg);
+ ut_ad(!resizing);
+
+ chunk_t::map::const_iterator it= chunk_map->upper_bound(ptr);
+ ut_a(it != chunk_map->begin());
+
+ chunk_t *chunk= it == chunk_map->end()
+ ? chunk_map->rbegin()->second
+ : (--it)->second;
+
+ const size_t offs= size_t(ptr - chunk->blocks->frame) >> srv_page_size_shift;
+ ut_a(offs < chunk->size);
+
+ buf_block_t *block= &chunk->blocks[offs];
+ /* buf_pool_t::chunk_t::init() invokes buf_block_init() so that
+ block[n].frame == block->frame + n * srv_page_size. Check it. */
+ ut_ad(block->frame == page_align(ptr));
+ /* Read the state of the block without holding hash_lock.
+ A state transition from BUF_BLOCK_FILE_PAGE to
+ BUF_BLOCK_REMOVE_HASH is possible during this execution. */
+ ut_d(const buf_page_state state = block->page.state());
+ ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH);
+ return block;
+}
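+
+/* For illustration (hypothetical addresses): with 16 KiB pages
+(srv_page_size_shift = 14), a pointer at byte offset 0x6123 from the
+chunk's first frame yields offs = 0x6123 >> 14 = 1, i.e. the pointer
+lies in the second block of the chunk. */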
+
+/** Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values.
+@param[in,out] index index
+@param[in,out] info index search info
+@param[in] tuple logical record
+@param[in] mode PAGE_CUR_L, ....
+@param[in] latch_mode BTR_SEARCH_LEAF, ...;
+ NOTE that only if ahi_latch is NULL, we will
+ have a latch set on the cursor page; otherwise
+ we assume the caller uses the AHI latch
+ to protect the record!
+@param[out] cursor tree cursor
+@param[in] ahi_latch the adaptive hash index latch being held,
+ or NULL
+@param[in] mtr mini-transaction
+@return whether the search succeeded */
+bool
+btr_search_guess_on_hash(
+ dict_index_t* index,
+ btr_search_t* info,
+ const dtuple_t* tuple,
+ ulint mode,
+ ulint latch_mode,
+ btr_cur_t* cursor,
+ rw_lock_t* ahi_latch,
+ mtr_t* mtr)
+{
+ ulint fold;
+ index_id_t index_id;
+
+ ut_ad(mtr->is_active());
+ ut_ad(!ahi_latch || rw_lock_own_flagged(
+ ahi_latch, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+
+ if (!btr_search_enabled) {
+ return false;
+ }
+
+ ut_ad(!index->is_ibuf());
+ ut_ad(!ahi_latch
+ || ahi_latch == &btr_search_sys.get_part(*index)->latch);
+ ut_ad((latch_mode == BTR_SEARCH_LEAF)
+ || (latch_mode == BTR_MODIFY_LEAF));
+ compile_time_assert(ulint{BTR_SEARCH_LEAF} == ulint{RW_S_LATCH});
+ compile_time_assert(ulint{BTR_MODIFY_LEAF} == ulint{RW_X_LATCH});
+
+ /* Not supported for spatial index */
+ ut_ad(!dict_index_is_spatial(index));
+
+ /* Note that, for efficiency, the struct info may not be protected by
+ any latch here! */
+
+ if (info->n_hash_potential == 0) {
+ return false;
+ }
+
+ cursor->n_fields = info->n_fields;
+ cursor->n_bytes = info->n_bytes;
+
+ if (dtuple_get_n_fields(tuple) < btr_search_get_n_fields(cursor)) {
+ return false;
+ }
+
+ index_id = index->id;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ info->n_hash_succ++;
+#endif
+ fold = dtuple_fold(tuple, cursor->n_fields, cursor->n_bytes, index_id);
+
+ cursor->fold = fold;
+ cursor->flag = BTR_CUR_HASH;
+
+ auto part = btr_search_sys.get_part(*index);
+ const rec_t* rec;
+
+ if (!ahi_latch) {
+ rw_lock_s_lock(&part->latch);
+
+ if (!btr_search_enabled) {
+ goto fail;
+ }
+ } else {
+ ut_ad(btr_search_enabled);
+ ut_ad(rw_lock_own(ahi_latch, RW_LOCK_S));
+ }
+
+ rec = static_cast<const rec_t*>(
+ ha_search_and_get_data(&part->table, fold));
+
+ if (!rec) {
+ if (!ahi_latch) {
+fail:
+ rw_lock_s_unlock(&part->latch);
+ }
+
+ btr_search_failure(info, cursor);
+ return false;
+ }
+
+ buf_block_t* block = buf_pool.block_from_ahi(rec);
+
+ if (!ahi_latch) {
+ page_hash_latch* hash_lock = buf_pool.hash_lock_get(
+ block->page.id());
+ hash_lock->read_lock();
+
+ if (block->page.state() == BUF_BLOCK_REMOVE_HASH) {
+ /* Another thread is just freeing the block
+ from the LRU list of the buffer pool: do not
+ try to access this page. */
+ hash_lock->read_unlock();
+ goto fail;
+ }
+
+ const bool fail = index != block->index
+ && index_id == block->index->id;
+ ut_a(!fail || block->index->freed());
+ ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+ DBUG_ASSERT(fail || block->page.status != buf_page_t::FREED);
+
+ buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+ hash_lock->read_unlock();
+ block->page.set_accessed();
+
+ buf_page_make_young_if_needed(&block->page);
+ mtr_memo_type_t fix_type;
+ if (latch_mode == BTR_SEARCH_LEAF) {
+ if (!rw_lock_s_lock_nowait(&block->lock,
+ __FILE__, __LINE__)) {
+got_no_latch:
+ buf_block_buf_fix_dec(block);
+ goto fail;
+ }
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ } else {
+ if (!rw_lock_x_lock_func_nowait_inline(
+ &block->lock, __FILE__, __LINE__)) {
+ goto got_no_latch;
+ }
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ }
+ mtr->memo_push(block, fix_type);
+
+ buf_pool.stat.n_page_gets++;
+
+ rw_lock_s_unlock(&part->latch);
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH);
+ if (UNIV_UNLIKELY(fail)) {
+ goto fail_and_release_page;
+ }
+ } else if (UNIV_UNLIKELY(index != block->index
+ && index_id == block->index->id)) {
+ ut_a(block->index->freed());
+ goto fail_and_release_page;
+ }
+
+ if (block->page.state() != BUF_BLOCK_FILE_PAGE) {
+
+ ut_ad(block->page.state() == BUF_BLOCK_REMOVE_HASH);
+
+fail_and_release_page:
+ if (!ahi_latch) {
+ btr_leaf_page_release(block, latch_mode, mtr);
+ }
+
+ btr_search_failure(info, cursor);
+ return false;
+ }
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ btr_cur_position(index, (rec_t*) rec, block, cursor);
+
+ /* Check the validity of the guess within the page */
+
+ /* If we only have the latch on search system, not on the
+ page, it only protects the columns of the record the cursor
+ is positioned on. We cannot look at the next of the previous
+ record to determine if our guess for the cursor position is
+ right. */
+ if (index_id != btr_page_get_index_id(block->frame)
+ || !btr_search_check_guess(cursor, !!ahi_latch, tuple, mode)) {
+ goto fail_and_release_page;
+ }
+
+ if (info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5) {
+
+ info->n_hash_potential++;
+ }
+
+#ifdef notdefined
+ /* These lines of code can be used in a debug version to check
+ the correctness of the searched cursor position: */
+
+ info->last_hash_succ = FALSE;
+
+ /* Currently, does not work if the following fails: */
+ ut_ad(!ahi_latch);
+
+ btr_leaf_page_release(block, latch_mode, mtr);
+
+ btr_cur_search_to_nth_level(
+ index, 0, tuple, mode, latch_mode, &cursor2, 0, mtr);
+
+ if (mode == PAGE_CUR_GE
+ && page_rec_is_supremum(btr_cur_get_rec(&cursor2))) {
+
+ /* If mode is PAGE_CUR_GE, then the binary search
+ in the index tree may actually take us to the supremum
+ of the previous page */
+
+ info->last_hash_succ = FALSE;
+
+ btr_pcur_open_on_user_rec(
+ index, tuple, mode, latch_mode, &pcur, mtr);
+
+ ut_ad(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor));
+ } else {
+ ut_ad(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor));
+ }
+
+ /* NOTE that it is theoretically possible that the above assertions
+ fail if the page of the cursor gets removed from the buffer pool
+ meanwhile! Thus it might not be a bug. */
+#endif
+ info->last_hash_succ = TRUE;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ btr_search_n_succ++;
+#endif
+ /* Increment the page get statistics though we did not really
+ fix the page: for user info only */
+ ++buf_pool.stat.n_page_gets;
+
+ if (!ahi_latch) {
+ buf_page_make_young_if_needed(&block->page);
+ }
+
+ return true;
+}
+
+/** Drop any adaptive hash index entries that point to an index page.
+@param[in,out] block block containing index page, s- or x-latched, or an
+ index page for which we know that
+ block->buf_fix_count == 0 or it is an index page which
+ has already been removed from the buf_pool.page_hash
+ i.e.: it is in state BUF_BLOCK_REMOVE_HASH */
+void btr_search_drop_page_hash_index(buf_block_t* block)
+{
+ ulint n_fields;
+ ulint n_bytes;
+ const page_t* page;
+ const rec_t* rec;
+ ulint fold;
+ ulint prev_fold;
+ ulint n_cached;
+ ulint n_recs;
+ ulint* folds;
+ ulint i;
+ mem_heap_t* heap;
+ rec_offs* offsets;
+
+retry:
+ /* This debug check uses a dirty read that could theoretically cause
+ false positives while buf_pool.clear_hash_index() is executing. */
+ assert_block_ahi_valid(block);
+ ut_ad(!btr_search_own_any(RW_LOCK_S));
+ ut_ad(!btr_search_own_any(RW_LOCK_X));
+
+ if (!block->index) {
+ return;
+ }
+
+ ut_ad(!block->page.buf_fix_count()
+ || block->page.state() == BUF_BLOCK_REMOVE_HASH
+ || rw_lock_own_flagged(&block->lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S
+ | RW_LOCK_FLAG_SX));
+ ut_ad(page_is_leaf(block->frame));
+
+ /* We must not dereference block->index here, because it could be freed
+ if (index->table->n_ref_count == 0 && !mutex_own(&dict_sys.mutex)).
+ Determine the ahi_slot based on the block contents. */
+
+ const index_id_t index_id
+ = btr_page_get_index_id(block->frame);
+
+ auto part = btr_search_sys.get_part(index_id,
+ block->page.id().space());
+
+ dict_index_t* index = block->index;
+ bool is_freed = index && index->freed();
+
+ if (is_freed) {
+ rw_lock_x_lock(&part->latch);
+ } else {
+ rw_lock_s_lock(&part->latch);
+ }
+
+ assert_block_ahi_valid(block);
+
+ if (!index || !btr_search_enabled) {
+ if (is_freed) {
+ rw_lock_x_unlock(&part->latch);
+ } else {
+ rw_lock_s_unlock(&part->latch);
+ }
+ return;
+ }
+
+#ifdef MYSQL_INDEX_DISABLE_AHI
+ ut_ad(!index->disable_ahi);
+#endif
+ ut_ad(btr_search_enabled);
+
+ ut_ad(block->page.id().space() == index->table->space_id);
+ ut_a(index_id == index->id);
+ ut_ad(!dict_index_is_ibuf(index));
+
+ n_fields = block->curr_n_fields;
+ n_bytes = block->curr_n_bytes;
+
+ /* NOTE: The AHI fields of block must not be accessed after
+ releasing search latch, as the index page might only be s-latched! */
+
+ if (!is_freed) {
+ rw_lock_s_unlock(&part->latch);
+ }
+
+ ut_a(n_fields > 0 || n_bytes > 0);
+
+ page = block->frame;
+ n_recs = page_get_n_recs(page);
+
+ /* Calculate and cache fold values into an array for fast deletion
+ from the hash index */
+
+ folds = (ulint*) ut_malloc_nokey(n_recs * sizeof(ulint));
+
+ n_cached = 0;
+
+ rec = page_get_infimum_rec(page);
+ rec = page_rec_get_next_low(rec, page_is_comp(page));
+ if (rec_is_metadata(rec, *index)) {
+ rec = page_rec_get_next_low(rec, page_is_comp(page));
+ }
+
+ prev_fold = 0;
+
+ heap = NULL;
+ offsets = NULL;
+
+ while (!page_rec_is_supremum(rec)) {
+ offsets = rec_get_offsets(
+ rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes),
+ &heap);
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id);
+
+ if (fold == prev_fold && prev_fold != 0) {
+
+ goto next_rec;
+ }
+
+ /* Remove all hash nodes pointing to this page from the
+ hash chain */
+
+ folds[n_cached] = fold;
+ n_cached++;
+next_rec:
+ rec = page_rec_get_next_low(rec, page_rec_is_comp(rec));
+ prev_fold = fold;
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ if (!is_freed) {
+ rw_lock_x_lock(&part->latch);
+
+ if (UNIV_UNLIKELY(!block->index)) {
+ /* Someone else has meanwhile dropped the
+ hash index */
+ goto cleanup;
+ }
+
+ ut_a(block->index == index);
+ }
+
+ if (block->curr_n_fields != n_fields
+ || block->curr_n_bytes != n_bytes) {
+
+ /* Someone else has meanwhile built a new hash index on the
+ page, with different parameters */
+
+ rw_lock_x_unlock(&part->latch);
+
+ ut_free(folds);
+ goto retry;
+ }
+
+ for (i = 0; i < n_cached; i++) {
+ ha_remove_all_nodes_to_page(&part->table, part->heap,
+ folds[i], page);
+ }
+
+ switch (index->search_info->ref_count--) {
+ case 0:
+ ut_error;
+ case 1:
+ if (index->freed()) {
+ btr_search_lazy_free(index);
+ }
+ }
+
+ block->index = NULL;
+
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_REMOVED);
+ MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_REMOVED, n_cached);
+
+cleanup:
+ assert_block_ahi_valid(block);
+ rw_lock_x_unlock(&part->latch);
+
+ ut_free(folds);
+}
+
+/** Drop possible adaptive hash index entries when a page is evicted
+from the buffer pool or freed in a file, or the index is being dropped.
+@param[in] page_id page id */
+void btr_search_drop_page_hash_when_freed(const page_id_t page_id)
+{
+ buf_block_t* block;
+ mtr_t mtr;
+ dberr_t err = DB_SUCCESS;
+
+ mtr_start(&mtr);
+
+ /* If the caller has a latch on the page, then the caller must
+ have a x-latch on the page and it must have already dropped
+ the hash index for the page. Because of the x-latch that we
+ are possibly holding, we cannot s-latch the page, but must
+ (recursively) x-latch it, even though we are only reading. */
+
+ block = buf_page_get_gen(page_id, 0, RW_X_LATCH, NULL,
+ BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__,
+ &mtr, &err);
+
+ if (block) {
+
+ /* If AHI is still valid, page can't be in free state.
+ AHI is dropped when page is freed. */
+ DBUG_ASSERT(block->page.status != buf_page_t::FREED);
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH);
+
+ dict_index_t* index = block->index;
+ if (index != NULL) {
+ /* In all our callers, the table handle should
+ be open, or we should be in the process of
+ dropping the table (preventing eviction). */
+ ut_ad(index->table->get_ref_count() > 0
+ || mutex_own(&dict_sys.mutex));
+ btr_search_drop_page_hash_index(block);
+ }
+ }
+
+ mtr_commit(&mtr);
+}
+
+/** Build a hash index on a page with the given parameters. If the page already
+has a hash index with different parameters, the old hash index is removed.
+This function checks that n_fields and n_bytes are
+sensible, and does not build a hash index if they are not.
+@param[in,out] index index for which to build.
+@param[in,out] block index page, s-/x- latched.
+@param[in,out] ahi_latch the adaptive search latch
+@param[in] n_fields hash this many full fields
+@param[in] n_bytes hash this many bytes of the next field
+@param[in] left_side hash for searches from left side */
+static
+void
+btr_search_build_page_hash_index(
+ dict_index_t* index,
+ buf_block_t* block,
+ rw_lock_t* ahi_latch,
+ uint16_t n_fields,
+ uint16_t n_bytes,
+ bool left_side)
+{
+ const rec_t* rec;
+ const rec_t* next_rec;
+ ulint fold;
+ ulint next_fold;
+ ulint n_cached;
+ ulint n_recs;
+ ulint* folds;
+ const rec_t** recs;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+#ifdef MYSQL_INDEX_DISABLE_AHI
+ if (index->disable_ahi) return;
+#endif
+ if (!btr_search_enabled) {
+ return;
+ }
+
+ rec_offs_init(offsets_);
+ ut_ad(ahi_latch == &btr_search_sys.get_part(*index)->latch);
+ ut_ad(index);
+ ut_ad(block->page.id().space() == index->table->space_id);
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(page_is_leaf(block->frame));
+
+ ut_ad(rw_lock_own_flagged(&block->lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+ ut_ad(block->page.id().page_no() >= 3);
+
+ rw_lock_s_lock(ahi_latch);
+
+ const bool enabled = btr_search_enabled;
+ const bool rebuild = enabled && block->index
+ && (block->curr_n_fields != n_fields
+ || block->curr_n_bytes != n_bytes
+ || block->curr_left_side != left_side);
+
+ rw_lock_s_unlock(ahi_latch);
+
+ if (!enabled) {
+ return;
+ }
+
+ if (rebuild) {
+ btr_search_drop_page_hash_index(block);
+ }
+
+ /* Check that the values for hash index build are sensible */
+
+ if (n_fields == 0 && n_bytes == 0) {
+
+ return;
+ }
+
+ if (dict_index_get_n_unique_in_tree(index)
+ < btr_search_get_n_fields(n_fields, n_bytes)) {
+ return;
+ }
+
+ page_t* page = buf_block_get_frame(block);
+ n_recs = page_get_n_recs(page);
+
+ if (n_recs == 0) {
+
+ return;
+ }
+
+ rec = page_rec_get_next_const(page_get_infimum_rec(page));
+
+ if (rec_is_metadata(rec, *index)) {
+ rec = page_rec_get_next_const(rec);
+ if (!--n_recs) return;
+ }
+
+ /* Calculate and cache fold values and corresponding records into
+ an array for fast insertion to the hash index */
+
+ folds = static_cast<ulint*>(ut_malloc_nokey(n_recs * sizeof *folds));
+ recs = static_cast<const rec_t**>(
+ ut_malloc_nokey(n_recs * sizeof *recs));
+
+ n_cached = 0;
+
+ ut_a(index->id == btr_page_get_index_id(page));
+
+ offsets = rec_get_offsets(
+ rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes),
+ &heap);
+ ut_ad(page_rec_is_supremum(rec)
+ || n_fields == rec_offs_n_fields(offsets) - (n_bytes > 0));
+
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id);
+
+ if (left_side) {
+
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+
+ for (;;) {
+ next_rec = page_rec_get_next_const(rec);
+
+ if (page_rec_is_supremum(next_rec)) {
+
+ if (!left_side) {
+
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+
+ break;
+ }
+
+ offsets = rec_get_offsets(
+ next_rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes), &heap);
+ next_fold = rec_fold(next_rec, offsets, n_fields,
+ n_bytes, index->id);
+
+ if (fold != next_fold) {
+ /* Insert an entry into the hash index */
+
+ if (left_side) {
+
+ folds[n_cached] = next_fold;
+ recs[n_cached] = next_rec;
+ n_cached++;
+ } else {
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+ }
+
+ rec = next_rec;
+ fold = next_fold;
+ }
+
+ btr_search_check_free_space_in_heap(index);
+
+ rw_lock_x_lock(ahi_latch);
+
+ if (!btr_search_enabled) {
+ goto exit_func;
+ }
+
+ /* This counter is decremented every time we drop page
+ hash index entries and is incremented here. Since we can
+ rebuild hash index for a page that is already hashed, we
+ have to take care not to increment the counter in that
+ case. */
+ if (!block->index) {
+ assert_block_ahi_empty(block);
+ index->search_info->ref_count++;
+ } else if (block->curr_n_fields != n_fields
+ || block->curr_n_bytes != n_bytes
+ || block->curr_left_side != left_side) {
+ goto exit_func;
+ }
+
+ block->n_hash_helps = 0;
+
+ block->curr_n_fields = n_fields & dict_index_t::MAX_N_FIELDS;
+ block->curr_n_bytes = n_bytes & ((1U << 15) - 1);
+ block->curr_left_side = left_side;
+ block->index = index;
+
+ {
+ auto part = btr_search_sys.get_part(*index);
+ for (ulint i = 0; i < n_cached; i++) {
+ ha_insert_for_fold(&part->table, part->heap,
+ folds[i], block, recs[i]);
+ }
+ }
+
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED);
+ MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached);
+exit_func:
+ assert_block_ahi_valid(block);
+ rw_lock_x_unlock(ahi_latch);
+
+ ut_free(folds);
+ ut_free(recs);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
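+
+/* Note on left_side above: for each group of consecutive records sharing
+a fold value, only one representative is inserted into the hash table:
+the first record of the group when left_side, the last one otherwise.
+For illustration, with record folds (f1, f1, f2) on a page, left_side
+caches the first f1-record and the f2-record, while !left_side caches
+the last f1-record and the f2-record. */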
+
+/** Updates the search info.
+@param[in,out] info search info
+@param[in,out] cursor cursor which was just positioned */
+void
+btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor)
+{
+ rw_lock_t* ahi_latch = &btr_search_sys.get_part(*cursor->index)
+ ->latch;
+ ut_ad(!rw_lock_own_flagged(ahi_latch,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+
+ buf_block_t* block = btr_cur_get_block(cursor);
+
+ /* NOTE that the following two function calls do NOT protect
+ info or block->n_fields etc. with any semaphore, to save CPU time!
+ We cannot assume the fields are consistent when we return from
+ those functions! */
+
+ btr_search_info_update_hash(info, cursor);
+
+ bool build_index = btr_search_update_block_hash_info(info, block);
+
+ if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) {
+
+ btr_search_check_free_space_in_heap(cursor->index);
+ }
+
+ if (cursor->flag == BTR_CUR_HASH_FAIL) {
+ /* Update the hash node reference, if appropriate */
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ btr_search_n_hash_fail++;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+ btr_search_update_hash_ref(info, block, cursor);
+ }
+
+ if (build_index) {
+ /* Note that since we did not protect block->n_fields etc.
+ with any semaphore, the values can be inconsistent. We have
+ to check inside the function call that they make sense. */
+ btr_search_build_page_hash_index(cursor->index, block,
+ ahi_latch,
+ block->n_fields,
+ block->n_bytes,
+ block->left_side);
+ }
+}
+
+/** Move or delete hash entries for moved records, usually in a page split.
+If new_block is already hashed, then any hash index for block is dropped.
+If new_block is not hashed, and block is hashed, then a new hash index is
+built to new_block with the same parameters as block.
+@param[in,out] new_block destination page
+@param[in,out] block source page (subject to deletion later) */
+void
+btr_search_move_or_delete_hash_entries(
+ buf_block_t* new_block,
+ buf_block_t* block)
+{
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X));
+ ut_ad(rw_lock_own(&(new_block->lock), RW_LOCK_X));
+
+ if (!btr_search_enabled) {
+ return;
+ }
+
+ dict_index_t* index = block->index;
+ if (!index) {
+ index = new_block->index;
+ } else {
+ ut_ad(!new_block->index || index == new_block->index);
+ }
+ assert_block_ahi_valid(block);
+ assert_block_ahi_valid(new_block);
+
+ rw_lock_t* ahi_latch = index
+ ? &btr_search_sys.get_part(*index)->latch
+ : nullptr;
+
+ if (new_block->index) {
+drop_exit:
+ btr_search_drop_page_hash_index(block);
+ return;
+ }
+
+ if (!index) {
+ return;
+ }
+
+ if (index->freed()) {
+ goto drop_exit;
+ }
+
+ rw_lock_s_lock(ahi_latch);
+
+ if (block->index) {
+ uint16_t n_fields = block->curr_n_fields;
+ uint16_t n_bytes = block->curr_n_bytes;
+ bool left_side = block->curr_left_side;
+
+ new_block->n_fields = block->curr_n_fields;
+ new_block->n_bytes = block->curr_n_bytes;
+ new_block->left_side = left_side;
+
+ rw_lock_s_unlock(ahi_latch);
+
+ ut_a(n_fields > 0 || n_bytes > 0);
+
+ btr_search_build_page_hash_index(
+ index, new_block, ahi_latch,
+ n_fields, n_bytes, left_side);
+ ut_ad(n_fields == block->curr_n_fields);
+ ut_ad(n_bytes == block->curr_n_bytes);
+ ut_ad(left_side == block->curr_left_side);
+ return;
+ }
+
+ rw_lock_s_unlock(ahi_latch);
+}
+
+/** Updates the page hash index when a single record is deleted from a page.
+@param[in] cursor cursor which was positioned on the record to delete
+ using btr_cur_search_; the record is not yet deleted. */
+void btr_search_update_hash_on_delete(btr_cur_t* cursor)
+{
+ buf_block_t* block;
+ const rec_t* rec;
+ ulint fold;
+ dict_index_t* index;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ mem_heap_t* heap = NULL;
+ rec_offs_init(offsets_);
+
+ ut_ad(page_is_leaf(btr_cur_get_page(cursor)));
+#ifdef MYSQL_INDEX_DISABLE_AHI
+ if (cursor->index->disable_ahi) return;
+#endif
+
+ if (!btr_search_enabled) {
+ return;
+ }
+
+ block = btr_cur_get_block(cursor);
+
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X));
+
+ assert_block_ahi_valid(block);
+ index = block->index;
+
+ if (!index) {
+
+ return;
+ }
+
+ if (index != cursor->index) {
+ btr_search_drop_page_hash_index(block);
+ return;
+ }
+
+ ut_ad(block->page.id().space() == index->table->space_id);
+ ut_a(index == cursor->index);
+ ut_a(block->curr_n_fields > 0 || block->curr_n_bytes > 0);
+ ut_ad(!dict_index_is_ibuf(index));
+
+ rec = btr_cur_get_rec(cursor);
+
+ fold = rec_fold(rec, rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap),
+ block->curr_n_fields, block->curr_n_bytes, index->id);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ auto part = btr_search_sys.get_part(*index);
+
+ rw_lock_x_lock(&part->latch);
+ assert_block_ahi_valid(block);
+
+ if (block->index && btr_search_enabled) {
+ ut_a(block->index == index);
+
+ if (ha_search_and_delete_if_found(&part->table, part->heap,
+ fold, rec)) {
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED);
+ } else {
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND);
+ }
+
+ assert_block_ahi_valid(block);
+ }
+
+ rw_lock_x_unlock(&part->latch);
+}
+
+/** Updates the page hash index when a single record is inserted on a page:
+if possible, the existing hash node is updated in place to point to the new
+record; otherwise btr_search_update_hash_on_insert() is called.
+@param[in] cursor cursor which was positioned to the place to insert
+ using btr_cur_search_, and the new record has been
+ inserted next to the cursor.
+@param[in] ahi_latch the adaptive hash index latch */
+void
+btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
+{
+ buf_block_t* block;
+ dict_index_t* index;
+ rec_t* rec;
+
+ ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index)->latch);
+ ut_ad(!btr_search_own_any(RW_LOCK_S));
+ ut_ad(!btr_search_own_any(RW_LOCK_X));
+#ifdef MYSQL_INDEX_DISABLE_AHI
+ if (cursor->index->disable_ahi) return;
+#endif
+ if (!btr_search_enabled) {
+ return;
+ }
+
+ rec = btr_cur_get_rec(cursor);
+
+ block = btr_cur_get_block(cursor);
+
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X));
+
+ index = block->index;
+
+ if (!index) {
+
+ return;
+ }
+
+ if (index != cursor->index) {
+ ut_ad(index->id == cursor->index->id);
+ btr_search_drop_page_hash_index(block);
+ return;
+ }
+
+ ut_a(cursor->index == index);
+ ut_ad(!dict_index_is_ibuf(index));
+ rw_lock_x_lock(ahi_latch);
+
+ if (!block->index || !btr_search_enabled) {
+
+ goto func_exit;
+ }
+
+ ut_a(block->index == index);
+
+ if ((cursor->flag == BTR_CUR_HASH)
+ && (cursor->n_fields == block->curr_n_fields)
+ && (cursor->n_bytes == block->curr_n_bytes)
+ && !block->curr_left_side) {
+
+ if (ha_search_and_update_if_found(
+ &btr_search_sys.get_part(*cursor->index)->table,
+ cursor->fold, rec, block,
+ page_rec_get_next(rec))) {
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED);
+ }
+
+func_exit:
+ assert_block_ahi_valid(block);
+ rw_lock_x_unlock(ahi_latch);
+ } else {
+ rw_lock_x_unlock(ahi_latch);
+
+ btr_search_update_hash_on_insert(cursor, ahi_latch);
+ }
+}
+
+/** Updates the page hash index when a single record is inserted on a page.
+@param[in,out] cursor cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor
+@param[in] ahi_latch the adaptive hash index latch */
+void
+btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
+{
+ buf_block_t* block;
+ dict_index_t* index;
+ const rec_t* rec;
+ const rec_t* ins_rec;
+ const rec_t* next_rec;
+ ulint fold;
+ ulint ins_fold;
+ ulint next_fold = 0; /* initialized only to silence
+ a compiler warning */
+ ulint n_fields;
+ ulint n_bytes;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index)->latch);
+ ut_ad(page_is_leaf(btr_cur_get_page(cursor)));
+ ut_ad(!btr_search_own_any(RW_LOCK_S));
+ ut_ad(!btr_search_own_any(RW_LOCK_X));
+#ifdef MYSQL_INDEX_DISABLE_AHI
+ if (cursor->index->disable_ahi) return;
+#endif
+ if (!btr_search_enabled) {
+ return;
+ }
+
+ block = btr_cur_get_block(cursor);
+
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X));
+ assert_block_ahi_valid(block);
+
+ index = block->index;
+
+ if (!index) {
+
+ return;
+ }
+
+ ut_ad(block->page.id().space() == index->table->space_id);
+ btr_search_check_free_space_in_heap(index);
+
+ rec = btr_cur_get_rec(cursor);
+
+#ifdef MYSQL_INDEX_DISABLE_AHI
+ ut_a(!index->disable_ahi);
+#endif
+ if (index != cursor->index) {
+ ut_ad(index->id == cursor->index->id);
+ btr_search_drop_page_hash_index(block);
+ return;
+ }
+
+ ut_a(index == cursor->index);
+ ut_ad(!dict_index_is_ibuf(index));
+
+ n_fields = block->curr_n_fields;
+ n_bytes = block->curr_n_bytes;
+ const bool left_side = block->curr_left_side;
+
+ ins_rec = page_rec_get_next_const(rec);
+ next_rec = page_rec_get_next_const(ins_rec);
+
+ offsets = rec_get_offsets(ins_rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, index->id);
+
+ if (!page_rec_is_supremum(next_rec)) {
+ offsets = rec_get_offsets(
+ next_rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes), &heap);
+ next_fold = rec_fold(next_rec, offsets, n_fields,
+ n_bytes, index->id);
+ }
+
+ /* We must not look up "part" before acquiring ahi_latch. */
+ btr_search_sys_t::partition* part= nullptr;
+ bool locked = false;
+
+ if (!page_rec_is_infimum(rec) && !rec_is_metadata(rec, *index)) {
+ offsets = rec_get_offsets(
+ rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes), &heap);
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id);
+ } else {
+ if (left_side) {
+ locked = true;
+ rw_lock_x_lock(ahi_latch);
+
+ if (!btr_search_enabled || !block->index) {
+ goto function_exit;
+ }
+
+ part = btr_search_sys.get_part(*index);
+ ha_insert_for_fold(&part->table, part->heap,
+ ins_fold, block, ins_rec);
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
+ }
+
+ goto check_next_rec;
+ }
+
+ if (fold != ins_fold) {
+
+ if (!locked) {
+ locked = true;
+ rw_lock_x_lock(ahi_latch);
+
+ if (!btr_search_enabled || !block->index) {
+ goto function_exit;
+ }
+
+ part = btr_search_sys.get_part(*index);
+ }
+
+ if (!left_side) {
+ ha_insert_for_fold(&part->table, part->heap,
+ fold, block, rec);
+ } else {
+ ha_insert_for_fold(&part->table, part->heap,
+ ins_fold, block, ins_rec);
+ }
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
+ }
+
+check_next_rec:
+ if (page_rec_is_supremum(next_rec)) {
+
+ if (!left_side) {
+ if (!locked) {
+ locked = true;
+ rw_lock_x_lock(ahi_latch);
+
+ if (!btr_search_enabled || !block->index) {
+ goto function_exit;
+ }
+
+ part = btr_search_sys.get_part(*index);
+ }
+
+ ha_insert_for_fold(&part->table, part->heap,
+ ins_fold, block, ins_rec);
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
+ }
+
+ goto function_exit;
+ }
+
+ if (ins_fold != next_fold) {
+ if (!locked) {
+ locked = true;
+ rw_lock_x_lock(ahi_latch);
+
+ if (!btr_search_enabled || !block->index) {
+ goto function_exit;
+ }
+
+ part = btr_search_sys.get_part(*index);
+ }
+
+ if (!left_side) {
+ ha_insert_for_fold(&part->table, part->heap,
+ ins_fold, block, ins_rec);
+ } else {
+ ha_insert_for_fold(&part->table, part->heap,
+ next_fold, block, next_rec);
+ }
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
+ }
+
+function_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ if (locked) {
+ rw_lock_x_unlock(ahi_latch);
+ }
+ ut_ad(!rw_lock_own(ahi_latch, RW_LOCK_X));
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+__attribute__((nonnull))
+/** @return whether a range of the cells is valid */
+static bool ha_validate(const hash_table_t *table,
+ ulint start_index, ulint end_index)
+{
+ ut_a(start_index <= end_index);
+ ut_a(end_index < table->n_cells);
+
+ bool ok= true;
+
+ for (ulint i= start_index; i <= end_index; i++)
+ {
+ for (auto node= static_cast<const ha_node_t*>(table->array[i].node); node;
+ node= node->next)
+ {
+ if (table->calc_hash(node->fold) != i) {
+ ib::error() << "Hash table node fold value " << node->fold
+ << " does not match the cell number " << i;
+ ok= false;
+ }
+ }
+ }
+
+ return ok;
+}
+
+/** Validates the search system for the given hash table.
+@param[in] hash_table_id hash table to validate
+@return TRUE if ok */
+static
+ibool
+btr_search_hash_table_validate(ulint hash_table_id)
+{
+ ha_node_t* node;
+ ibool ok = TRUE;
+ ulint i;
+ ulint cell_count;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ btr_search_x_lock_all();
+ if (!btr_search_enabled) {
+ btr_search_x_unlock_all();
+ return(TRUE);
+ }
+
+ /* How many cells to check before temporarily releasing
+ search latches. */
+ ulint chunk_size = 10000;
+
+ rec_offs_init(offsets_);
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ auto &part = btr_search_sys.parts[hash_table_id];
+
+ cell_count = part.table.n_cells;
+
+ for (i = 0; i < cell_count; i++) {
+ /* We release search latches every once in a while to
+ give other queries a chance to run. */
+ if ((i != 0) && ((i % chunk_size) == 0)) {
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ btr_search_x_unlock_all();
+
+ os_thread_yield();
+
+ btr_search_x_lock_all();
+
+ if (!btr_search_enabled) {
+ ok = true;
+ goto func_exit;
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ ulint curr_cell_count = part.table.n_cells;
+
+ if (cell_count != curr_cell_count) {
+
+ cell_count = curr_cell_count;
+
+ if (i >= cell_count) {
+ break;
+ }
+ }
+ }
+
+ node = static_cast<ha_node_t*>(part.table.array[i].node);
+
+ for (; node != NULL; node = node->next) {
+ const buf_block_t* block
+ = buf_pool.block_from_ahi((byte*) node->data);
+ index_id_t page_index_id;
+
+ if (UNIV_LIKELY(block->page.state()
+ == BUF_BLOCK_FILE_PAGE)) {
+
+ /* The space and offset are only valid
+ for file blocks. It is possible that
+ the block is being freed
+ (BUF_BLOCK_REMOVE_HASH, see the
+ assertion and the comment below) */
+ const page_id_t id(block->page.id());
+ if (const buf_page_t* hash_page
+ = buf_pool.page_hash_get_low(
+ id, id.fold())) {
+ ut_ad(hash_page == &block->page);
+ goto state_ok;
+ }
+ }
+
+ /* When a block is being freed,
+ buf_LRU_search_and_free_block() first removes
+ the block from buf_pool.page_hash by calling
+ buf_LRU_block_remove_hashed_page(). Then it
+ invokes btr_search_drop_page_hash_index(). */
+ ut_a(block->page.state() == BUF_BLOCK_REMOVE_HASH);
+state_ok:
+ ut_ad(!dict_index_is_ibuf(block->index));
+ ut_ad(block->page.id().space()
+ == block->index->table->space_id);
+
+ page_index_id = btr_page_get_index_id(block->frame);
+
+ offsets = rec_get_offsets(
+ node->data, block->index, offsets,
+ block->index->n_core_fields,
+ btr_search_get_n_fields(block->curr_n_fields,
+ block->curr_n_bytes),
+ &heap);
+
+ const ulint fold = rec_fold(
+ node->data, offsets,
+ block->curr_n_fields,
+ block->curr_n_bytes,
+ page_index_id);
+
+ if (node->fold != fold) {
+ const page_t* page = block->frame;
+
+ ok = FALSE;
+
+ ib::error() << "Error in an adaptive hash"
+ << " index pointer to page "
+ << block->page.id()
+ << ", ptr mem address "
+ << reinterpret_cast<const void*>(
+ node->data)
+ << ", index id " << page_index_id
+ << ", node fold " << node->fold
+ << ", rec fold " << fold;
+
+ fputs("InnoDB: Record ", stderr);
+ rec_print_new(stderr, node->data, offsets);
+ fprintf(stderr, "\nInnoDB: on that page."
+ " Page mem address %p, is hashed %p,"
+ " n fields %lu\n"
+ "InnoDB: side %lu\n",
+ (void*) page, (void*) block->index,
+ (ulong) block->curr_n_fields,
+ (ulong) block->curr_left_side);
+ ut_ad(0);
+ }
+ }
+ }
+
+ for (i = 0; i < cell_count; i += chunk_size) {
+ /* We release search latches every once in a while to
+ give other queries a chance to run. */
+ if (i != 0) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ btr_search_x_unlock_all();
+
+ os_thread_yield();
+
+ btr_search_x_lock_all();
+
+ if (!btr_search_enabled) {
+ ok = true;
+ goto func_exit;
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ ulint curr_cell_count = part.table.n_cells;
+
+ if (cell_count != curr_cell_count) {
+
+ cell_count = curr_cell_count;
+
+ if (i >= cell_count) {
+ break;
+ }
+ }
+ }
+
+ ulint end_index = ut_min(i + chunk_size - 1, cell_count - 1);
+
+ if (!ha_validate(&part.table, i, end_index)) {
+ ok = FALSE;
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+func_exit:
+ btr_search_x_unlock_all();
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(ok);
+}
+
+/** Validate the search system.
+@return true if ok. */
+bool
+btr_search_validate()
+{
+ for (ulint i = 0; i < btr_ahi_parts; ++i) {
+ if (!btr_search_hash_table_validate(i)) {
+ return(false);
+ }
+ }
+
+ return(true);
+}
+
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+#endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/buf/buf0block_hint.cc b/storage/innobase/buf/buf0block_hint.cc
new file mode 100644
index 00000000..6d99d0b6
--- /dev/null
+++ b/storage/innobase/buf/buf0block_hint.cc
@@ -0,0 +1,59 @@
+/*****************************************************************************
+
+Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License, version 2.0, as published by the
+Free Software Foundation.
+
+This program is also distributed with certain software (including but not
+limited to OpenSSL) that is licensed under separate terms, as designated in a
+particular file or component or in included license documentation. The authors
+of MySQL hereby grant you an additional permission to link the program and
+your derivative works with the separately licensed software that they have
+included with MySQL.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
+for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+#include "buf0block_hint.h"
+namespace buf {
+
+void Block_hint::buffer_fix_block_if_still_valid()
+{
+ /* To check if m_block belongs to the current buf_pool, we must
+ prevent freeing memory while we check, and until we buffer-fix the
+ block. For this purpose it is enough to latch any of the many
+ latches taken by buf_pool_t::resize().
+
+ Similar to buf_page_optimistic_get(), we must validate
+ m_block->page.id() after acquiring the hash_lock, because the object
+ may have been freed and not actually attached to buf_pool.page_hash
+ at the moment. (The block could have been reused to store a
+ different page, and that slice of buf_pool.page_hash could be protected
+ by another hash_lock that we are not holding.)
+
+ Finally, assuming that we have the correct hash bucket latched, we must
+ validate m_block->state() to ensure that the block is not being freed. */
+ if (m_block)
+ {
+ const ulint fold= m_page_id.fold();
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+ if (buf_pool.is_uncompressed(m_block) && m_page_id == m_block->page.id() &&
+ m_block->page.state() == BUF_BLOCK_FILE_PAGE)
+ buf_block_buf_fix_inc(m_block, __FILE__, __LINE__);
+ else
+ clear();
+ hash_lock->read_unlock();
+ }
+}
+} // namespace buf
diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc
new file mode 100644
index 00000000..f822adc3
--- /dev/null
+++ b/storage/innobase/buf/buf0buddy.cc
@@ -0,0 +1,764 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buddy.cc
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "page0zip.h"
+#include "srv0start.h"
+
+/** When freeing a buf we attempt to coalesce by looking at its buddy
+and deciding whether it is free or not. To ascertain if the buddy is
+free we look for BUF_BUDDY_STAMP_FREE at BUF_BUDDY_STAMP_OFFSET
+within the buddy. The question is how we can be sure that it is
+safe to look at BUF_BUDDY_STAMP_OFFSET.
+The answer lies in following invariants:
+* All blocks allocated by the buddy allocator are used for compressed
+page frames.
+* A compressed table always has space_id < SRV_SPACE_ID_UPPER_BOUND
+* BUF_BUDDY_STAMP_OFFSET always points to the space_id field in
+a frame.
+ -- The above is true because we look at these fields when the
+ corresponding buddy block is free which implies that:
+ * The block we are looking at must have an address aligned at
+ the same size that its free buddy has. For example, if we have
+ a free block of 8K then its buddy's address must be aligned at
+ 8K as well.
+ * It is possible that the block we are looking at may have been
+ further divided into smaller sized blocks but its starting
+ address must still remain the start of a page frame i.e.: it
+ cannot be middle of a block. For example, if we have a free
+ block of size 8K then its buddy may be divided into blocks
+ of, say, 1K, 1K, 2K, 4K but the buddy's address will still be
+ the starting address of first 1K compressed page.
+ * What is important to note is that for any given block, the
+ buddy's address cannot be in the middle of a larger block i.e.:
+ in above example, our 8K block cannot have a buddy whose address
+ is aligned on 8K but it is part of a larger 16K block.
+*/
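+
+/* A minimal, self-contained sketch (illustrative only, not InnoDB code)
+of the stamp-based freeness test described above, assuming a 4-byte
+big-endian stamp as written by mach_write_to_4(); the real check is
+buf_buddy_stamp_is_free() below. */
+#if 0
+# include <cstdint>
+# include <cstddef>
+static inline uint32_t read_be32(const unsigned char* p)
+{
+ /* Same byte order as mach_read_from_4(). */
+ return uint32_t(p[0]) << 24 | uint32_t(p[1]) << 16
+ | uint32_t(p[2]) << 8 | uint32_t(p[3]);
+}
+static inline bool stamped_free(const unsigned char* blk,
+ size_t stamp_offset, uint32_t free_stamp)
+{
+ return read_be32(blk + stamp_offset) == free_stamp;
+}
+#endif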
+
+/** Offset within buf_buddy_free_t where free or non_free stamps
+are written.*/
+#define BUF_BUDDY_STAMP_OFFSET FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
+
+/** Value that we stamp on all buffers that are currently on the zip_free
+list. This value is stamped at BUF_BUDDY_STAMP_OFFSET offset */
+#define BUF_BUDDY_STAMP_FREE SRV_SPACE_ID_UPPER_BOUND
+
+/** Stamp value for non-free buffers. Will be overwritten by a non-zero
+value by the consumer of the block */
+#define BUF_BUDDY_STAMP_NONFREE 0xFFFFFFFFUL
+
+/** Return type of buf_buddy_is_free() */
+enum buf_buddy_state_t {
+ BUF_BUDDY_STATE_FREE, /*!< The buddy is completely free */
+ BUF_BUDDY_STATE_USED, /*!< The buddy is currently in use */
+ BUF_BUDDY_STATE_PARTIALLY_USED/*!< Some sub-blocks in the buddy
+ are in use */
+};
+
+/**********************************************************************//**
+Invalidate memory area that we won't access while page is free */
+UNIV_INLINE
+void
+buf_buddy_mem_invalid(
+/*==================*/
+ buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of zip_free[] */
+{
+ ut_ad(i <= BUF_BUDDY_SIZES);
+
+ MEM_CHECK_ADDRESSABLE(buf, BUF_BUDDY_LOW << i);
+ MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i);
+}
+
+/**********************************************************************//**
+Check if a buddy is stamped free.
+@return whether the buddy is free */
+UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
+bool
+buf_buddy_stamp_is_free(
+/*====================*/
+ const buf_buddy_free_t* buf) /*!< in: block to check */
+{
+ compile_time_assert(BUF_BUDDY_STAMP_FREE < BUF_BUDDY_STAMP_NONFREE);
+ return(mach_read_from_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET)
+ == BUF_BUDDY_STAMP_FREE);
+}
+
+/**********************************************************************//**
+Stamps a buddy free. */
+UNIV_INLINE
+void
+buf_buddy_stamp_free(
+/*=================*/
+ buf_buddy_free_t* buf, /*!< in/out: block to stamp */
+ ulint i) /*!< in: block size */
+{
+ ut_d(memset(&buf->stamp.bytes, int(i), BUF_BUDDY_LOW << i));
+ buf_buddy_mem_invalid(buf, i);
+ mach_write_to_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET,
+ BUF_BUDDY_STAMP_FREE);
+ buf->stamp.size = i;
+}
+
+/**********************************************************************//**
+Stamps a buddy nonfree.
+@param[in,out] buf block to stamp
+@param[in] i block size */
+static inline void buf_buddy_stamp_nonfree(buf_buddy_free_t* buf, ulint i)
+{
+ buf_buddy_mem_invalid(buf, i);
+ compile_time_assert(BUF_BUDDY_STAMP_NONFREE == 0xffffffffU);
+ memset(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, 0xff, 4);
+}
+
+/**********************************************************************//**
+Get the offset of the buddy of a compressed page frame.
+@return the buddy of the given page */
+UNIV_INLINE
+void*
+buf_buddy_get(
+/*==========*/
+ byte* page, /*!< in: compressed page */
+ ulint size) /*!< in: page size in bytes */
+{
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size >= BUF_BUDDY_LOW);
+ ut_ad(BUF_BUDDY_LOW <= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size < BUF_BUDDY_HIGH);
+ ut_ad(BUF_BUDDY_HIGH == srv_page_size);
+ ut_ad(!ut_align_offset(page, size));
+
+ if (((ulint) page) & size) {
+ return(page - size);
+ } else {
+ return(page + size);
+ }
+}
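+
+/* Compile-time sanity check of the buddy arithmetic above, using
+hypothetical aligned addresses (illustrative only): with size = 0x1000,
+the blocks at 0x2000 and 0x3000 are each other's buddy. */
+static_assert((0x3000 & 0x1000) != 0, "buddy of 0x3000 is 0x3000 - size");
+static_assert((0x2000 & 0x1000) == 0, "buddy of 0x2000 is 0x2000 + size");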
+
+#ifdef UNIV_DEBUG
+/** Validate a given zip_free list. */
+struct CheckZipFree {
+ CheckZipFree(ulint i) : m_i(i) {}
+
+ void operator()(const buf_buddy_free_t* elem) const
+ {
+ ut_ad(buf_buddy_stamp_is_free(elem));
+ ut_ad(elem->stamp.size <= m_i);
+ }
+
+ const ulint m_i;
+};
+
+/** Validate a buddy list.
+@param[in] i buddy size to validate */
+static void buf_buddy_list_validate(ulint i)
+{
+ ut_list_validate(buf_pool.zip_free[i], CheckZipFree(i));
+}
+
+/**********************************************************************//**
+Debug function to validate that a buffer is indeed free i.e.: in the
+zip_free[].
+@param[in] buf block to check
+@param[in] i index of buf_pool.zip_free[]
+@return true if free */
+static bool buf_buddy_check_free(const buf_buddy_free_t* buf, ulint i)
+{
+ const ulint size = BUF_BUDDY_LOW << i;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(!ut_align_offset(buf, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ buf_buddy_free_t* itr;
+
+ for (itr = UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+ itr && itr != buf;
+ itr = UT_LIST_GET_NEXT(list, itr)) {
+ }
+
+ return(itr == buf);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Checks if a buf is free i.e.: in the zip_free[].
+@retval BUF_BUDDY_STATE_FREE if fully free
+@retval BUF_BUDDY_STATE_USED if currently in use
+@retval BUF_BUDDY_STATE_PARTIALLY_USED if partially in use. */
+static MY_ATTRIBUTE((warn_unused_result))
+buf_buddy_state_t
+buf_buddy_is_free(
+/*==============*/
+ buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of
+ buf_pool.zip_free[] */
+{
+#ifdef UNIV_DEBUG
+ const ulint size = BUF_BUDDY_LOW << i;
+ ut_ad(!ut_align_offset(buf, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+#endif /* UNIV_DEBUG */
+
+ /* We assume that all memory from buf_buddy_alloc()
+ is used for compressed page frames. */
+
+ /* We look inside the allocated objects returned by
+ buf_buddy_alloc() and assume that each block is a compressed
+ page that contains one of the following in space_id.
+ * BUF_BUDDY_STAMP_FREE if the block is in a zip_free list or
+ * BUF_BUDDY_STAMP_NONFREE if the block has been allocated but
+ not initialized yet or
+ * A valid space_id of a compressed tablespace
+
+ The call below attempts to read from free memory. The memory
+ is "owned" by the buddy allocator (and it has been allocated
+ from the buffer pool), so there is nothing wrong about this. */
+ if (!buf_buddy_stamp_is_free(buf)) {
+ return(BUF_BUDDY_STATE_USED);
+ }
+
+ /* A block may be free but a fragment of it may still be in use.
+ To guard against that, we store the free block size, expressed as
+ a zip_free[] index, at the start of the stamped block. Note that we
+ can safely rely on this value only if the buf is free. */
+ ut_ad(buf->stamp.size <= i);
+ return(buf->stamp.size == i
+ ? BUF_BUDDY_STATE_FREE
+ : BUF_BUDDY_STATE_PARTIALLY_USED);
+}
+
+/** Add a block to the head of the appropriate buddy free list.
+@param[in,out] buf block to be freed
+@param[in] i index of buf_pool.zip_free[] */
+UNIV_INLINE
+void
+buf_buddy_add_to_free(buf_buddy_free_t* buf, ulint i)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.zip_free[i].start != buf);
+
+ buf_buddy_stamp_free(buf, i);
+ UT_LIST_ADD_FIRST(buf_pool.zip_free[i], buf);
+ ut_d(buf_buddy_list_validate(i));
+}
+
+/** Remove a block from the appropriate buddy free list.
+@param[in,out] buf block to be freed
+@param[in] i index of buf_pool.zip_free[] */
+UNIV_INLINE
+void
+buf_buddy_remove_from_free(buf_buddy_free_t* buf, ulint i)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_buddy_check_free(buf, i));
+
+ UT_LIST_REMOVE(buf_pool.zip_free[i], buf);
+ buf_buddy_stamp_nonfree(buf, i);
+}
+
+/** Try to allocate a block from buf_pool.zip_free[].
+@param[in] i index of buf_pool.zip_free[]
+@return allocated block, or NULL if buf_pool.zip_free[] was empty */
+static buf_buddy_free_t* buf_buddy_alloc_zip(ulint i)
+{
+ buf_buddy_free_t* buf;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(i < BUF_BUDDY_SIZES);
+ ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ ut_d(buf_buddy_list_validate(i));
+
+ buf = UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+
+ if (buf_pool.curr_size < buf_pool.old_size
+ && UT_LIST_GET_LEN(buf_pool.withdraw)
+ < buf_pool.withdraw_target) {
+
+ while (buf != NULL
+ && buf_pool.will_be_withdrawn(
+ reinterpret_cast<byte*>(buf))) {
+ /* This block is marked for withdrawal; do not allocate it */
+ buf = UT_LIST_GET_NEXT(list, buf);
+ }
+ }
+
+ if (buf) {
+ buf_buddy_remove_from_free(buf, i);
+ } else if (i + 1 < BUF_BUDDY_SIZES) {
+ /* Attempt to split. */
+ buf = buf_buddy_alloc_zip(i + 1);
+
+ if (buf) {
+ buf_buddy_free_t* buddy =
+ reinterpret_cast<buf_buddy_free_t*>(
+ reinterpret_cast<byte*>(buf)
+ + (BUF_BUDDY_LOW << i));
+ ut_ad(!buf_pool.contains_zip(buddy));
+ buf_buddy_add_to_free(buddy, i);
+ }
+ }
+
+ if (buf) {
+ /* Trash the page contents, except the BUF_BUDDY_STAMP_NONFREE stamp. */
+ MEM_UNDEFINED(buf, BUF_BUDDY_STAMP_OFFSET);
+ MEM_UNDEFINED(BUF_BUDDY_STAMP_OFFSET + 4 + buf->stamp.bytes,
+ (BUF_BUDDY_LOW << i)
+ - (BUF_BUDDY_STAMP_OFFSET + 4));
+ ut_ad(mach_read_from_4(buf->stamp.bytes
+ + BUF_BUDDY_STAMP_OFFSET)
+ == BUF_BUDDY_STAMP_NONFREE);
+ }
+
+ return(buf);
+}
+
+/** Deallocate a buffer frame of srv_page_size.
+@param[in] buf buffer frame to deallocate */
+static
+void
+buf_buddy_block_free(void* buf)
+{
+ const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf);
+ buf_page_t* bpage;
+ buf_block_t* block;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(!ut_align_offset(buf, srv_page_size));
+
+ HASH_SEARCH(hash, &buf_pool.zip_hash, fold, buf_page_t*, bpage,
+ ut_ad(bpage->state() == BUF_BLOCK_MEMORY
+ && bpage->in_zip_hash),
+ ((buf_block_t*) bpage)->frame == buf);
+ ut_a(bpage);
+ ut_a(bpage->state() == BUF_BLOCK_MEMORY);
+ ut_ad(bpage->in_zip_hash);
+ ut_d(bpage->in_zip_hash = false);
+ HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, bpage);
+
+ ut_d(memset(buf, 0, srv_page_size));
+ MEM_UNDEFINED(buf, srv_page_size);
+
+ block = (buf_block_t*) bpage;
+ buf_LRU_block_free_non_file_page(block);
+
+ ut_ad(buf_pool.buddy_n_frames > 0);
+ ut_d(buf_pool.buddy_n_frames--);
+}
+
+/**********************************************************************//**
+Allocate a buffer block to the buddy allocator. */
+static
+void
+buf_buddy_block_register(
+/*=====================*/
+ buf_block_t* block) /*!< in: buffer frame to allocate */
+{
+ const ulint fold = BUF_POOL_ZIP_FOLD(block);
+ ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
+
+ ut_a(block->frame);
+ ut_a(!ut_align_offset(block->frame, srv_page_size));
+
+ ut_ad(!block->page.in_zip_hash);
+ ut_d(block->page.in_zip_hash = true);
+ HASH_INSERT(buf_page_t, hash, &buf_pool.zip_hash, fold, &block->page);
+
+ ut_d(buf_pool.buddy_n_frames++);
+}
+
+/** Allocate a block from a bigger object.
+@param[in] buf a block that is free to use
+@param[in] i index of buf_pool.zip_free[]
+@param[in] j size of buf as an index of buf_pool.zip_free[]
+@return allocated block */
+static
+void*
+buf_buddy_alloc_from(void* buf, ulint i, ulint j)
+{
+ ulint offs = BUF_BUDDY_LOW << j;
+ ut_ad(j <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ ut_ad(j >= i);
+ ut_ad(!ut_align_offset(buf, offs));
+
+ /* Add the unused parts of the block to the free lists. */
+ while (j > i) {
+ buf_buddy_free_t* zip_buf;
+
+ offs >>= 1;
+ j--;
+
+ zip_buf = reinterpret_cast<buf_buddy_free_t*>(
+ reinterpret_cast<byte*>(buf) + offs);
+ buf_buddy_add_to_free(zip_buf, j);
+ }
+
+ buf_buddy_stamp_nonfree(reinterpret_cast<buf_buddy_free_t*>(buf), i);
+ return(buf);
+}
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param i index of buf_pool.zip_free[] or BUF_BUDDY_SIZES
+@param lru set to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+byte *buf_buddy_alloc_low(ulint i, bool *lru)
+{
+ buf_block_t* block;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ if (i < BUF_BUDDY_SIZES) {
+ /* Try to allocate from the buddy system. */
+ block = (buf_block_t*) buf_buddy_alloc_zip(i);
+
+ if (block) {
+ goto func_exit;
+ }
+ }
+
+ /* Try allocating from the buf_pool.free list. */
+ block = buf_LRU_get_free_only();
+
+ if (block) {
+ goto alloc_big;
+ }
+
+ /* Try replacing an uncompressed page in the buffer pool. */
+ block = buf_LRU_get_free_block(true);
+ if (lru) {
+ *lru = true;
+ }
+
+alloc_big:
+ buf_buddy_block_register(block);
+
+ block = (buf_block_t*) buf_buddy_alloc_from(
+ block->frame, i, BUF_BUDDY_SIZES);
+
+func_exit:
+ buf_pool.buddy_stat[i].used++;
+ return reinterpret_cast<byte*>(block);
+}
+
+/** Try to relocate a block. The caller must hold buf_pool.mutex.
+@param[in] src block to relocate
+@param[in] dst free block to relocate to
+@param[in] i index of buf_pool.zip_free[]
+@param[in] force true if we must always relocate
+@return true if relocated */
+static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
+{
+ buf_page_t* bpage;
+ const ulint size = BUF_BUDDY_LOW << i;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(!ut_align_offset(src, size));
+ ut_ad(!ut_align_offset(dst, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ MEM_CHECK_ADDRESSABLE(dst, size);
+
+ uint32_t space = mach_read_from_4(static_cast<const byte*>(src)
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ uint32_t offset = mach_read_from_4(static_cast<const byte*>(src)
+ + FIL_PAGE_OFFSET);
+
+ /* Suppress Valgrind or MSAN warnings. */
+ MEM_MAKE_DEFINED(&space, sizeof space);
+ MEM_MAKE_DEFINED(&offset, sizeof offset);
+
+ ut_ad(space != BUF_BUDDY_STAMP_FREE);
+
+ const page_id_t page_id(space, offset);
+ const ulint fold= page_id.fold();
+
+ bpage = buf_pool.page_hash_get_low(page_id, fold);
+
+ if (!bpage || bpage->zip.data != src) {
+ /* The block has probably been freshly
+ allocated by buf_LRU_get_free_block() but not
+ added to buf_pool.page_hash yet. Obviously,
+ it cannot be relocated. */
+
+ if (!force || space != 0 || offset != 0) {
+ return(false);
+ }
+
+ /* It might just be an uninitialized page.
+ We should also search the LRU list. */
+
+ bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+ while (bpage != NULL) {
+ if (bpage->zip.data == src) {
+ ut_ad(bpage->id() == page_id);
+ break;
+ }
+ bpage = UT_LIST_GET_NEXT(LRU, bpage);
+ }
+
+ if (bpage == NULL) {
+ return(false);
+ }
+ }
+
+ if (page_zip_get_size(&bpage->zip) != size) {
+ /* The block is of different size. We would
+ have to relocate all blocks covered by src.
+ For the sake of simplicity, give up. */
+ ut_ad(page_zip_get_size(&bpage->zip) < size);
+ return(false);
+ }
+
+ /* The block must have been allocated, but it may
+ contain uninitialized data. */
+ MEM_CHECK_ADDRESSABLE(src, size);
+
+ if (!bpage->can_relocate()) {
+ return false;
+ }
+
+ page_hash_latch *hash_lock = buf_pool.page_hash.lock_get(fold);
+ hash_lock->write_lock();
+
+ if (bpage->can_relocate()) {
+ /* Relocate the compressed page. */
+ const ulonglong ns = my_interval_timer();
+
+ ut_a(bpage->zip.data == src);
+
+ memcpy(dst, src, size);
+ bpage->zip.data = reinterpret_cast<page_zip_t*>(dst);
+
+ hash_lock->write_unlock();
+
+ buf_buddy_mem_invalid(
+ reinterpret_cast<buf_buddy_free_t*>(src), i);
+
+ buf_buddy_stat_t* buddy_stat = &buf_pool.buddy_stat[i];
+ buddy_stat->relocated++;
+ buddy_stat->relocated_usec+= (my_interval_timer() - ns) / 1000;
+ return(true);
+ }
+
+ hash_lock->write_unlock();
+
+ return(false);
+}
+
+/** Deallocate a block.
+@param[in] buf block to be freed, must not be pointed to
+ by the buffer pool
+@param[in] i index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+void buf_buddy_free_low(void* buf, ulint i)
+{
+ buf_buddy_free_t* buddy;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ ut_ad(buf_pool.buddy_stat[i].used > 0);
+
+ buf_pool.buddy_stat[i].used--;
+recombine:
+ MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i);
+
+ if (i == BUF_BUDDY_SIZES) {
+ buf_buddy_block_free(buf);
+ return;
+ }
+
+ ut_ad(i < BUF_BUDDY_SIZES);
+ ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
+ ut_ad(!buf_pool.contains_zip(buf));
+
+ /* Do not recombine blocks if there are few free blocks.
+ We may waste up to 15360*max_len bytes to free blocks
+ (1024 + 2048 + 4096 + 8192 = 15360) */
+ if (UT_LIST_GET_LEN(buf_pool.zip_free[i]) < 16
+ && buf_pool.curr_size >= buf_pool.old_size) {
+ goto func_exit;
+ }
+
+ /* Try to combine adjacent blocks. */
+ buddy = reinterpret_cast<buf_buddy_free_t*>(
+ buf_buddy_get(reinterpret_cast<byte*>(buf),
+ BUF_BUDDY_LOW << i));
+
+ switch (buf_buddy_is_free(buddy, i)) {
+ case BUF_BUDDY_STATE_FREE:
+ /* The buddy is free: recombine */
+ buf_buddy_remove_from_free(buddy, i);
+buddy_is_free:
+ ut_ad(!buf_pool.contains_zip(buddy));
+ i++;
+ buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
+
+ goto recombine;
+
+ case BUF_BUDDY_STATE_USED:
+ ut_d(buf_buddy_list_validate(i));
+
+ /* The buddy is not free. Is there a free block of
+ this size? */
+ if (buf_buddy_free_t* zip_buf =
+ UT_LIST_GET_FIRST(buf_pool.zip_free[i])) {
+
+ /* Remove the block from the free list, because
+ a successful buf_buddy_relocate() will overwrite
+ zip_free->list. */
+ buf_buddy_remove_from_free(zip_buf, i);
+
+ /* Try to relocate the buddy of buf to the free
+ block. */
+ if (buf_buddy_relocate(buddy, zip_buf, i, false)) {
+ goto buddy_is_free;
+ }
+
+ buf_buddy_add_to_free(zip_buf, i);
+ }
+
+ break;
+ case BUF_BUDDY_STATE_PARTIALLY_USED:
+ /* Some sub-blocks in the buddy are still in use.
+ Relocation will fail. No need to try. */
+ break;
+ }
+
+func_exit:
+ /* Free the block to the buddy list. */
+ buf_buddy_add_to_free(reinterpret_cast<buf_buddy_free_t*>(buf), i);
+}
+
+/** Try to reallocate a block.
+@param[in] buf buf_pool block to be reallocated
+@param[in] size block size, up to srv_page_size
+@return whether the reallocation succeeded */
+bool
+buf_buddy_realloc(void* buf, ulint size)
+{
+ buf_block_t* block = NULL;
+ ulint i = buf_buddy_get_slot(size);
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ if (i < BUF_BUDDY_SIZES) {
+ /* Try to allocate from the buddy system. */
+ block = reinterpret_cast<buf_block_t*>(buf_buddy_alloc_zip(i));
+ }
+
+ if (block == NULL) {
+ /* Try allocating from the buf_pool.free list. */
+ block = buf_LRU_get_free_only();
+
+ if (block == NULL) {
+ return(false); /* free_list was not enough */
+ }
+
+ buf_buddy_block_register(block);
+
+ block = reinterpret_cast<buf_block_t*>(
+ buf_buddy_alloc_from(
+ block->frame, i, BUF_BUDDY_SIZES));
+ }
+
+ buf_pool.buddy_stat[i].used++;
+
+ /* Try to relocate the buddy of buf to the free block. */
+ if (buf_buddy_relocate(buf, block, i, true)) {
+ /* succeeded */
+ buf_buddy_free_low(buf, i);
+ } else {
+ /* failed */
+ buf_buddy_free_low(block, i);
+ }
+
+ return(true); /* free_list was enough */
+}
+
+/** Combine all pairs of free buddies. */
+void buf_buddy_condense_free()
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.curr_size < buf_pool.old_size);
+
+ for (ulint i = 0; i < UT_ARR_SIZE(buf_pool.zip_free); ++i) {
+ buf_buddy_free_t* buf =
+ UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+
+ /* seek to withdraw target */
+ while (buf != NULL
+ && !buf_pool.will_be_withdrawn(
+ reinterpret_cast<byte*>(buf))) {
+ buf = UT_LIST_GET_NEXT(list, buf);
+ }
+
+ while (buf != NULL) {
+ buf_buddy_free_t* next =
+ UT_LIST_GET_NEXT(list, buf);
+
+ buf_buddy_free_t* buddy =
+ reinterpret_cast<buf_buddy_free_t*>(
+ buf_buddy_get(
+ reinterpret_cast<byte*>(buf),
+ BUF_BUDDY_LOW << i));
+
+ /* seek to the next withdraw target */
+ while (true) {
+ while (next != NULL
+ && !buf_pool.will_be_withdrawn(
+ reinterpret_cast<byte*>(next))) {
+ next = UT_LIST_GET_NEXT(list, next);
+ }
+
+ if (buddy != next) {
+ break;
+ }
+
+ next = UT_LIST_GET_NEXT(list, next);
+ }
+
+ if (buf_buddy_is_free(buddy, i)
+ == BUF_BUDDY_STATE_FREE) {
+ /* Both buf and buddy are free.
+ Try to combine them. */
+ buf_buddy_remove_from_free(buf, i);
+ buf_pool.buddy_stat[i].used++;
+
+ buf_buddy_free_low(buf, i);
+ }
+
+ buf = next;
+ }
+ }
+}
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
new file mode 100644
index 00000000..b658bdfc
--- /dev/null
+++ b/storage/innobase/buf/buf0buf.cc
@@ -0,0 +1,4728 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buf.cc
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "assume_aligned.h"
+#include "mtr0types.h"
+#include "mach0data.h"
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "ut0crc32.h"
+#include <string.h>
+
+#ifndef UNIV_INNOCHECKSUM
+#include "my_cpu.h"
+#include "mem0mem.h"
+#include "btr0btr.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "buf0buddy.h"
+#include "buf0dblwr.h"
+#include "lock0lock.h"
+#include "sync0rw.h"
+#include "btr0sea.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "dict0stats_bg.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "dict0dict.h"
+#include "log0recv.h"
+#include "srv0mon.h"
+#include "log0crypt.h"
+#include "fil0pagecompress.h"
+#endif /* !UNIV_INNOCHECKSUM */
+#include "page0zip.h"
+#include "sync0sync.h"
+#include "buf0dump.h"
+#include <map>
+#include <sstream>
+
+using st_::span;
+
+#ifdef HAVE_LIBNUMA
+#include <numa.h>
+#include <numaif.h>
+struct set_numa_interleave_t
+{
+ set_numa_interleave_t()
+ {
+ if (srv_numa_interleave) {
+
+ struct bitmask *numa_mems_allowed = numa_get_mems_allowed();
+ ib::info() << "Setting NUMA memory policy to"
+ " MPOL_INTERLEAVE";
+ if (set_mempolicy(MPOL_INTERLEAVE,
+ numa_mems_allowed->maskp,
+ numa_mems_allowed->size) != 0) {
+
+ ib::warn() << "Failed to set NUMA memory"
+ " policy to MPOL_INTERLEAVE: "
+ << strerror(errno);
+ }
+ numa_bitmask_free(numa_mems_allowed);
+ }
+ }
+
+ ~set_numa_interleave_t()
+ {
+ if (srv_numa_interleave) {
+
+ ib::info() << "Setting NUMA memory policy to"
+ " MPOL_DEFAULT";
+ if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) {
+ ib::warn() << "Failed to set NUMA memory"
+ " policy to MPOL_DEFAULT: "
+ << strerror(errno);
+ }
+ }
+ }
+};
+
+#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE set_numa_interleave_t scoped_numa
+#else
+#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE
+#endif /* HAVE_LIBNUMA */
+
+/*
+ IMPLEMENTATION OF THE BUFFER POOL
+ =================================
+
+ Buffer frames and blocks
+ ------------------------
+Following the terminology of Gray and Reuter, we call the memory
+blocks where file pages are loaded buffer frames. For each buffer
+frame there is a control block, or shortly, a block, in the buffer
+control array. The control info which does not need to be stored
+in the file along with the file page, resides in the control block.
+
+ Buffer pool struct
+ ------------------
+The buffer buf_pool contains a single mutex which protects all the
+control data structures of the buf_pool. The content of a buffer frame is
+protected by a separate read-write lock in its control block, though.
+These locks can be locked and unlocked without owning the buf_pool.mutex.
+The OS events in the buf_pool struct can be waited for without owning the
+buf_pool.mutex.
+
+The buf_pool.mutex is a hot-spot in main memory, causing a lot of
+memory bus traffic on multiprocessor systems when processors
+alternately access the mutex. On our Pentium, the mutex is accessed
+maybe every 10 microseconds. We gave up the solution to have mutexes
+for each control block, for instance, because it seemed to be
+complicated.
+
+A solution to reduce mutex contention of the buf_pool.mutex is to
+create a separate mutex for the page hash table. On Pentium,
+accessing the hash table takes 2 microseconds, about half
+of the total buf_pool.mutex hold time.
+
+ Control blocks
+ --------------
+
+The control block contains, for instance, the bufferfix count
+which is incremented when a thread wants a file page to be fixed
+in a buffer frame. The bufferfix operation does not lock the
+contents of the frame, however. For this purpose, the control
+block contains a read-write lock.
+
+The buffer frames have to be aligned so that the start memory
+address of a frame is divisible by the universal page size, which
+is a power of two.
+
+The control blocks containing file pages are put to a hash table
+according to the file address of the page.
+We could speed up the access to an individual page by using
+"pointer swizzling": we could replace the page references on
+non-leaf index pages by direct pointers to the page, if it exists
+in the buf_pool. We could make a separate hash table where we could
+chain all the page references in non-leaf pages residing in the buf_pool,
+using the page reference as the hash key,
+and at the time of reading of a page update the pointers accordingly.
+Drawbacks of this solution are added complexity and,
+possibly, extra space required on non-leaf pages for memory pointers.
+A simpler solution is just to speed up the hash table mechanism
+in the database, using tables whose size is a power of 2.
+
+ Lists of blocks
+ ---------------
+
+There are several lists of control blocks.
+
+The free list (buf_pool.free) contains blocks which are currently not
+used.
+
+The common LRU list contains all the blocks holding a file page
+except those for which the bufferfix count is non-zero.
+The pages are in the LRU list roughly in the order of the last
+access to the page, so that the oldest pages are at the end of the
+list. We also keep a pointer to near the end of the LRU list,
+which we can use when we want to artificially age a page in the
+buf_pool. This is used if we know that some page is not needed
+again for some time: we insert the block right after the pointer,
+causing it to be replaced sooner than would normally be the case.
+Currently this aging mechanism is used by the read-ahead mechanism,
+and it can also be used when a scan of a full
+table cannot fit in memory. Putting the pages near the
+end of the LRU list, we make sure that most of the buf_pool stays
+in the main memory, undisturbed.
+
+The unzip_LRU list contains a subset of the common LRU list. The
+blocks on the unzip_LRU list hold a compressed file page and the
+corresponding uncompressed page frame. A block is in unzip_LRU if and
+only if the predicate block->page.belongs_to_unzip_LRU()
+holds. The blocks in unzip_LRU will be in the same order as they are in
+the common LRU list. That is, each manipulation of the common LRU
+list will result in the same manipulation of the unzip_LRU list.
+
+The chain of modified blocks (buf_pool.flush_list) contains the blocks
+holding persistent file pages that have been modified in the memory
+but not written to disk yet. The block with the oldest modification
+which has not yet been written to disk is at the end of the chain.
+The access to this list is protected by buf_pool.flush_list_mutex.
+
+The control blocks for uncompressed pages are accessible via
+buf_block_t objects that are reachable via buf_pool.chunks[].
+The control blocks (buf_page_t) of those ROW_FORMAT=COMPRESSED pages
+that are not in buf_pool.flush_list and for which no uncompressed
+page has been allocated in buf_pool are only accessible via
+buf_pool.LRU.
+
+The chains of free memory blocks (buf_pool.zip_free[]) are used by
+the buddy allocator (buf0buddy.cc) to keep track of currently unused
+memory blocks of size sizeof(buf_page_t)..srv_page_size / 2. These
+blocks are inside the srv_page_size-sized memory blocks of type
+BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
+pool. The buddy allocator is solely used for allocating control
+blocks for compressed pages (buf_page_t) and compressed page frames.
+
+ Loading a file page
+ -------------------
+
+First, a victim block for replacement has to be found in the
+buf_pool. It is taken from the free list or searched for from the
+end of the LRU-list. An exclusive lock is reserved for the frame,
+the io_fix field is set in the block fixing the block in buf_pool,
+and the io-operation for loading the page is queued. The io-handler thread
+releases the X-lock on the frame and resets the io_fix field
+when the io operation completes.
+
+A thread may request the above operation using the function
+buf_page_get(). It may then continue to request a lock on the frame.
+The lock is granted when the io-handler releases the x-lock.
+
+ Read-ahead
+ ----------
+
+The read-ahead mechanism is intended to be intelligent and
+isolated from the semantically higher levels of the database
+index management. From the higher level we only need the
+information if a file page has a natural successor or
+predecessor page. On the leaf level of a B-tree index,
+these are the next and previous pages in the natural
+order of the pages.
+
+Let us first explain the read-ahead mechanism when the leaves
+of a B-tree are scanned in ascending or descending order.
+When a page is referenced in the buf_pool for the first time,
+the buffer manager checks if it is at the border of a so-called
+linear read-ahead area. The tablespace is divided into these
+areas of size 64 blocks, for example. So if the page is at the
+border of such an area, the read-ahead mechanism checks if
+all the other blocks in the area have been accessed in an
+ascending or descending order. If this is the case, the system
+looks at the natural successor or predecessor of the page,
+checks if that is at the border of another area, and in this case
+issues read-requests for all the pages in that area. Maybe
+we could relax the condition that all the pages in the area
+have to be accessed: if data is deleted from a table, there may
+appear holes of unused pages in the area.
+
+A different read-ahead mechanism is used when there appears
+to be a random access pattern to a file.
+If a new page is referenced in the buf_pool, and several pages
+of its random access area (for instance, 32 consecutive pages
+in a tablespace) have recently been referenced, we may predict
+that the whole area may be needed in the near future, and issue
+the read requests for the whole area.
+*/
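+
+/* An illustrative sketch (not the actual implementation, which lives in
+buf0rea.cc) of the linear read-ahead border test described above, assuming
+a read-ahead area of 64 pages: */
+#if 0
+static bool linear_readahead_border(uint32_t page_no, uint32_t area= 64)
+{
+ /* Only the first and the last page of an aligned area are borders. */
+ return page_no % area == 0 || page_no % area == area - 1;
+}
+#endif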
+
+#ifndef UNIV_INNOCHECKSUM
+void page_hash_latch::read_lock_wait()
+{
+ /* First, try busy spinning for a while. */
+ for (auto spin= srv_n_spin_wait_rounds; spin--; )
+ {
+ ut_delay(srv_spin_wait_delay);
+ if (read_trylock())
+ return;
+ }
+ /* Fall back to yielding to other threads. */
+ do
+ os_thread_yield();
+ while (!read_trylock());
+}
+
+void page_hash_latch::write_lock_wait()
+{
+ write_lock_wait_start();
+
+ /* First, try busy spinning for a while. */
+ for (auto spin= srv_n_spin_wait_rounds; spin--; )
+ {
+ if (write_lock_poll())
+ return;
+ ut_delay(srv_spin_wait_delay);
+ }
+
+ /* Fall back to yielding to other threads. */
+ do
+ os_thread_yield();
+ while (!write_lock_poll());
+}
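+
+/* The two wait functions above share a spin-then-yield pattern. A
+self-contained restatement of that pattern with standard C++ primitives
+(illustrative only; the real latch also tracks write waiters via
+write_lock_wait_start()): */
+#if 0
+# include <atomic>
+# include <thread>
+static void spin_then_yield_lock(std::atomic_flag& latch, int spins)
+{
+  /* First, try busy spinning for a while. */
+  while (spins-- > 0)
+    if (!latch.test_and_set(std::memory_order_acquire))
+      return;
+  /* Fall back to yielding to other threads. */
+  while (latch.test_and_set(std::memory_order_acquire))
+    std::this_thread::yield();
+}
+#endif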
+
+/** Value in microseconds */
+constexpr int WAIT_FOR_READ= 100;
+constexpr int WAIT_FOR_WRITE= 100;
+/** Number of attempts made to read in a page in the buffer pool */
+constexpr ulint BUF_PAGE_READ_MAX_RETRIES= 100;
+/** The maximum portion of the buffer pool that can be used for the
+read-ahead buffer. (Divide buf_pool size by this amount) */
+constexpr uint32_t BUF_READ_AHEAD_PORTION= 32;
+
+/** A 64KiB buffer of NUL bytes, for use in assertions and checks,
+and dummy default values of instantly dropped columns.
+Initially, BLOB field references are set to NUL bytes, in
+dtuple_convert_big_rec(). */
+const byte *field_ref_zero;
+
+/** The InnoDB buffer pool */
+buf_pool_t buf_pool;
+buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_reg;
+buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref;
+
+#ifdef UNIV_DEBUG
+/** Disable buffer pool resizing, to keep debug assertions inexpensive. */
+my_bool buf_disable_resize_buffer_pool_debug = TRUE;
+
+/** This is used to insert validation operations in execution
+in the debug version */
+static ulint buf_dbg_counter;
+#endif /* UNIV_DEBUG */
+
+/** Macro to determine whether the read or the write counter is used,
+depending on the io_type */
+#define MONITOR_RW_COUNTER(io_type, counter) \
+ ((io_type == BUF_IO_READ) \
+ ? (counter##_READ) \
+ : (counter##_WRITTEN))
+
+
+/** Decrypt a page for temporary tablespace.
+@param[in,out] tmp_frame Temporary buffer
+@param[in] src_frame Page to decrypt
+@return true if temporary tablespace decrypted, false if not */
+static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame)
+{
+ if (buf_is_zeroes(span<const byte>(src_frame, srv_page_size))) {
+ return true;
+ }
+
+ /* Length of the FIL page header prefix that is stored unencrypted */
+ uint header_len = FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+
+ /* Copy FIL page header, it is not encrypted */
+ memcpy(tmp_frame, src_frame, header_len);
+
+ /* Calculate the offset where decryption starts */
+ const byte* src = src_frame + header_len;
+ byte* dst = tmp_frame + header_len;
+ uint srclen = uint(srv_page_size)
+ - (header_len + FIL_PAGE_FCRC32_CHECKSUM);
+ ulint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);
+
+ if (!log_tmp_block_decrypt(src, srclen, dst,
+ (offset * srv_page_size))) {
+ return false;
+ }
+
+ static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment");
+ memcpy_aligned<4>(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+ src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+ FIL_PAGE_FCRC32_CHECKSUM);
+
+ memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(src_frame, tmp_frame,
+ srv_page_size);
+ srv_stats.pages_decrypted.inc();
+ srv_stats.n_temp_blocks_decrypted.inc();
+
+ return true; /* page was decrypted */
+}
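+
+/* Worked example of the sizes above, assuming the conventional offsets
+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION = 26 and FIL_PAGE_FCRC32_CHECKSUM = 4:
+for srv_page_size = 16384, the first 26 bytes are copied unencrypted and
+srclen = 16384 - (26 + 4) = 16354 bytes are decrypted. */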
+
+/** Decrypt a page.
+@param[in,out] bpage Page control block
+@param[in] node data file
+@return whether the operation was successful */
+static bool buf_page_decrypt_after_read(buf_page_t *bpage,
+ const fil_node_t &node)
+{
+ ut_ad(node.space->referenced());
+ ut_ad(node.space->id == bpage->id().space());
+ const auto flags = node.space->flags;
+
+ byte* dst_frame = bpage->zip.data ? bpage->zip.data :
+ ((buf_block_t*) bpage)->frame;
+ bool page_compressed = node.space->is_compressed()
+ && buf_page_is_compressed(dst_frame, flags);
+ const page_id_t id(bpage->id());
+
+ if (id.page_no() == 0) {
+ /* File header pages are not encrypted/compressed */
+ return (true);
+ }
+
+ if (node.space->purpose == FIL_TYPE_TEMPORARY
+ && innodb_encrypt_temporary_tables) {
+ buf_tmp_buffer_t* slot = buf_pool.io_buf_reserve();
+ ut_a(slot);
+ slot->allocate();
+
+ if (!buf_tmp_page_decrypt(slot->crypt_buf, dst_frame)) {
+ slot->release();
+ ib::error() << "Encrypted page " << id
+ << " in file " << node.name;
+ return false;
+ }
+
+ slot->release();
+ return true;
+ }
+
+ /* The page is encrypted if encryption information is found in the
+ tablespace and the page contains a nonzero key_version. This is true
+ also for pages that were first compressed and then encrypted. */
+
+ buf_tmp_buffer_t* slot;
+ uint key_version = buf_page_get_key_version(dst_frame, flags);
+
+ if (page_compressed && !key_version) {
+ /* the page we read is unencrypted */
+ /* Find free slot from temporary memory array */
+decompress:
+ if (fil_space_t::full_crc32(flags)
+ && buf_page_is_corrupted(true, dst_frame, flags)) {
+ return false;
+ }
+
+ slot = buf_pool.io_buf_reserve();
+ ut_a(slot);
+ slot->allocate();
+
+decompress_with_slot:
+ ut_d(fil_page_type_validate(node.space, dst_frame));
+
+ ulint write_size = fil_page_decompress(
+ slot->crypt_buf, dst_frame, flags);
+ slot->release();
+ ut_ad(!write_size
+ || fil_page_type_validate(node.space, dst_frame));
+ ut_ad(node.space->referenced());
+ return write_size != 0;
+ }
+
+ if (key_version && node.space->crypt_data) {
+ /* Verify encryption checksum before we even try to
+ decrypt. */
+ if (!buf_page_verify_crypt_checksum(dst_frame, flags)) {
+decrypt_failed:
+ ib::error() << "Encrypted page " << id
+ << " in file " << node.name
+ << " looks corrupted; key_version="
+ << key_version;
+ return false;
+ }
+
+ slot = buf_pool.io_buf_reserve();
+ ut_a(slot);
+ slot->allocate();
+ ut_d(fil_page_type_validate(node.space, dst_frame));
+
+ /* decrypt using crypt_buf to dst_frame */
+ if (!fil_space_decrypt(node.space, slot->crypt_buf, dst_frame)) {
+ slot->release();
+ goto decrypt_failed;
+ }
+
+ ut_d(fil_page_type_validate(node.space, dst_frame));
+
+ if ((fil_space_t::full_crc32(flags) && page_compressed)
+ || fil_page_get_type(dst_frame)
+ == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+ goto decompress_with_slot;
+ }
+
+ slot->release();
+ } else if (fil_page_get_type(dst_frame)
+ == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+ goto decompress;
+ }
+
+ ut_ad(node.space->referenced());
+ return true;
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Checks if the page is in crc32 checksum format.
+@param[in] read_buf database page
+@param[in] checksum_field1 new checksum field
+@param[in] checksum_field2 old checksum field
+@return true if the page is in crc32 checksum format. */
+bool
+buf_page_is_checksum_valid_crc32(
+ const byte* read_buf,
+ ulint checksum_field1,
+ ulint checksum_field2)
+{
+ const uint32_t crc32 = buf_calc_page_crc32(read_buf);
+
+#ifdef UNIV_INNOCHECKSUM
+ if (log_file
+ && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
+ fprintf(log_file, "page::" UINT32PF ";"
+ " crc32 calculated = " UINT32PF ";"
+ " recorded checksum field1 = " ULINTPF " recorded"
+ " checksum field2 =" ULINTPF "\n", cur_page_num,
+ crc32, checksum_field1, checksum_field2);
+ }
+#endif /* UNIV_INNOCHECKSUM */
+
+ if (checksum_field1 != checksum_field2) {
+ return false;
+ }
+
+ return checksum_field1 == crc32;
+}
+
+/** Checks if the page is in innodb checksum format.
+@param[in] read_buf database page
+@param[in] checksum_field1 new checksum field
+@param[in] checksum_field2 old checksum field
+@return true if the page is in innodb checksum format. */
+bool
+buf_page_is_checksum_valid_innodb(
+ const byte* read_buf,
+ ulint checksum_field1,
+ ulint checksum_field2)
+{
+ /* There are 2 valid formulas for
+ checksum_field2 (old checksum field) which algo=innodb could have
+ written to the page:
+
+ 1. Very old versions of InnoDB only stored 8 byte lsn to the
+ start and the end of the page.
+
+ 2. Newer InnoDB versions store the old formula checksum
+ (buf_calc_page_old_checksum()). */
+
+ ulint old_checksum = buf_calc_page_old_checksum(read_buf);
+ ulint new_checksum = buf_calc_page_new_checksum(read_buf);
+
+#ifdef UNIV_INNOCHECKSUM
+ if (log_file
+ && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) {
+ fprintf(log_file, "page::" UINT32PF ";"
+ " old style: calculated ="
+ " " ULINTPF "; recorded = " ULINTPF "\n",
+ cur_page_num, old_checksum,
+ checksum_field2);
+ fprintf(log_file, "page::" UINT32PF ";"
+ " new style: calculated ="
+ " " ULINTPF "; crc32 = " UINT32PF "; recorded = " ULINTPF "\n",
+ cur_page_num, new_checksum,
+ buf_calc_page_crc32(read_buf), checksum_field1);
+ }
+
+ if (log_file
+ && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
+ fprintf(log_file, "page::" UINT32PF ";"
+ " old style: calculated ="
+ " " ULINTPF "; recorded checksum = " ULINTPF "\n",
+ cur_page_num, old_checksum,
+ checksum_field2);
+ fprintf(log_file, "page::" UINT32PF ";"
+ " new style: calculated ="
+ " " ULINTPF "; recorded checksum = " ULINTPF "\n",
+ cur_page_num, new_checksum,
+ checksum_field1);
+ }
+#endif /* UNIV_INNOCHECKSUM */
+
+
+ if (checksum_field2 != mach_read_from_4(read_buf + FIL_PAGE_LSN)
+ && checksum_field2 != old_checksum) {
+ DBUG_LOG("checksum",
+ "Page checksum crc32 not valid"
+ << " field1 " << checksum_field1
+ << " field2 " << checksum_field2
+ << " crc32 " << buf_calc_page_old_checksum(read_buf)
+ << " lsn " << mach_read_from_4(
+ read_buf + FIL_PAGE_LSN));
+ return(false);
+ }
+
+ /* old field is fine, check the new field */
+
+ /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
+ (always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
+
+ if (checksum_field1 != 0 && checksum_field1 != new_checksum) {
+ DBUG_LOG("checksum",
+ "Page checksum crc32 not valid"
+ << " field1 " << checksum_field1
+ << " field2 " << checksum_field2
+ << " crc32 " << buf_calc_page_new_checksum(read_buf)
+ << " lsn " << mach_read_from_4(
+ read_buf + FIL_PAGE_LSN));
+ return(false);
+ }
+
+ return(true);
+}
+
+/** Checks if the page is in none checksum format.
+@param[in] read_buf database page
+@param[in] checksum_field1 new checksum field
+@param[in] checksum_field2 old checksum field
+@return true if the page is in none checksum format. */
+bool
+buf_page_is_checksum_valid_none(
+ const byte* read_buf,
+ ulint checksum_field1,
+ ulint checksum_field2)
+{
+#ifndef DBUG_OFF
+ if (checksum_field1 != checksum_field2
+ && checksum_field1 != BUF_NO_CHECKSUM_MAGIC) {
+ DBUG_LOG("checksum",
+ "Page checksum crc32 not valid"
+ << " field1 " << checksum_field1
+ << " field2 " << checksum_field2
+ << " crc32 " << BUF_NO_CHECKSUM_MAGIC
+ << " lsn " << mach_read_from_4(read_buf
+ + FIL_PAGE_LSN));
+ }
+#endif /* DBUG_OFF */
+
+#ifdef UNIV_INNOCHECKSUM
+ if (log_file
+ && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_NONE) {
+ fprintf(log_file,
+ "page::" UINT32PF "; none checksum: calculated"
+ " = %lu; recorded checksum_field1 = " ULINTPF
+ " recorded checksum_field2 = " ULINTPF "\n",
+ cur_page_num, BUF_NO_CHECKSUM_MAGIC,
+ checksum_field1, checksum_field2);
+ }
+#endif /* UNIV_INNOCHECKSUM */
+
+ return(checksum_field1 == checksum_field2
+ && checksum_field1 == BUF_NO_CHECKSUM_MAGIC);
+}
+
+/** Check whether the LSN stored in the page is less than the
+current system LSN.
+@param[in] check_lsn whether to perform the check
+@param[in] read_buf database page */
+static void buf_page_check_lsn(bool check_lsn, const byte* read_buf)
+{
+#ifndef UNIV_INNOCHECKSUM
+ if (check_lsn && recv_lsn_checks_on) {
+ const lsn_t current_lsn = log_sys.get_lsn();
+ const lsn_t page_lsn
+ = mach_read_from_8(read_buf + FIL_PAGE_LSN);
+
+ /* Since we are going to reset the page LSN during the import
+ phase it makes no sense to spam the log with error messages. */
+ if (current_lsn < page_lsn) {
+
+ const uint32_t space_id = mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_ID);
+ const uint32_t page_no = mach_read_from_4(
+ read_buf + FIL_PAGE_OFFSET);
+
+ ib::error() << "Page " << page_id_t(space_id, page_no)
+ << " log sequence number " << page_lsn
+ << " is in the future! Current system"
+ << " log sequence number "
+ << current_lsn << ".";
+
+ ib::error() << "Your database may be corrupt or"
+ " you may have copied the InnoDB"
+ " tablespace but not the InnoDB"
+ " log files. "
+ << FORCE_RECOVERY_MSG;
+
+ }
+ }
+#endif /* !UNIV_INNOCHECKSUM */
+}
+
+/** Check if a buffer is all zeroes.
+@param[in] buf data to check
+@return whether the buffer is all zeroes */
+bool buf_is_zeroes(span<const byte> buf)
+{
+ ut_ad(buf.size() <= UNIV_PAGE_SIZE_MAX);
+ return memcmp(buf.data(), field_ref_zero, buf.size()) == 0;
+}
+
+/** Check if a page is corrupt.
+@param[in] check_lsn whether the LSN should be checked
+@param[in] read_buf database page
+@param[in] fsp_flags tablespace flags
+@return whether the page is corrupted */
+bool
+buf_page_is_corrupted(
+ bool check_lsn,
+ const byte* read_buf,
+ ulint fsp_flags)
+{
+#ifndef UNIV_INNOCHECKSUM
+ DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", return(true); );
+#endif
+ if (fil_space_t::full_crc32(fsp_flags)) {
+ bool compressed = false, corrupted = false;
+ const uint size = buf_page_full_crc32_size(
+ read_buf, &compressed, &corrupted);
+ if (corrupted) {
+ return true;
+ }
+ const byte* end = read_buf + (size - FIL_PAGE_FCRC32_CHECKSUM);
+ uint crc32 = mach_read_from_4(end);
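+ /* In the full_crc32 format, the last 4 bytes of the page (or of
+ the compressed stream) store a CRC-32C computed over all the
+ bytes that precede them. */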
+
+ if (!crc32 && size == srv_page_size
+ && buf_is_zeroes(span<const byte>(read_buf, size))) {
+ return false;
+ }
+
+ DBUG_EXECUTE_IF(
+ "page_intermittent_checksum_mismatch", {
+ static int page_counter;
+ if (page_counter++ == 2) {
+ crc32++;
+ }
+ });
+
+ if (crc32 != ut_crc32(read_buf,
+ size - FIL_PAGE_FCRC32_CHECKSUM)) {
+ return true;
+ }
+ static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "alignment");
+ static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+ if (!compressed
+ && !mach_read_from_4(FIL_PAGE_FCRC32_KEY_VERSION
+ + read_buf)
+ && memcmp_aligned<4>(read_buf + (FIL_PAGE_LSN + 4),
+ end - (FIL_PAGE_FCRC32_END_LSN
+ - FIL_PAGE_FCRC32_CHECKSUM),
+ 4)) {
+ return true;
+ }
+
+ buf_page_check_lsn(check_lsn, read_buf);
+ return false;
+ }
+
+ size_t checksum_field1 = 0;
+ size_t checksum_field2 = 0;
+ uint32_t crc32 = 0;
+ bool crc32_inited = false;
+ bool crc32_chksum = false;
+ const ulint zip_size = fil_space_t::zip_size(fsp_flags);
+ const uint16_t page_type = fil_page_get_type(read_buf);
+
+ /* We can trust the page type if the page compression flag is set
+ in the tablespace flags, because that flag implies the file was
+ created with 10.1 (later than the 5.5 code base). In 10.1, page
+ compressed tables contain neither a post-compression checksum nor
+ the FIL_PAGE_END_LSN_OLD_CHKSUM field. Note that the tablespace
+ can be unknown if we are in fil_check_first_page() and the first
+ page is not compressed or encrypted. Page checksums are verified
+ after decompression (i.e. normally pages are already
+ decompressed at this stage). */
+ if ((page_type == FIL_PAGE_PAGE_COMPRESSED ||
+ page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED)
+#ifndef UNIV_INNOCHECKSUM
+ && FSP_FLAGS_HAS_PAGE_COMPRESSION(fsp_flags)
+#endif
+ ) {
+ return(false);
+ }
+
+ static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 4 == 0, "alignment");
+
+ if (!zip_size
+ && memcmp_aligned<4>(read_buf + FIL_PAGE_LSN + 4,
+ read_buf + srv_page_size
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
+ /* The log sequence numbers stored at the start and the end
+ of the page do not match */
+
+ return(true);
+ }
+
+ buf_page_check_lsn(check_lsn, read_buf);
+
+ /* Check whether the checksum fields have correct values */
+
+ const srv_checksum_algorithm_t curr_algo =
+ static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
+
+ if (curr_algo == SRV_CHECKSUM_ALGORITHM_NONE) {
+ return(false);
+ }
+
+ if (zip_size) {
+ return !page_zip_verify_checksum(read_buf, zip_size);
+ }
+
+ checksum_field1 = mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ checksum_field2 = mach_read_from_4(
+ read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+ static_assert(FIL_PAGE_LSN % 8 == 0, "alignment");
+
+ /* A page filled with NUL bytes is considered not corrupted.
+ Before MariaDB Server 10.1.25 (MDEV-12113) or 10.2.2 (or MySQL 5.7),
+ the FIL_PAGE_FILE_FLUSH_LSN field may have been written nonzero
+ for the first page of each file of the system tablespace.
+ We want to ignore it for the system tablespace, but because
+ we do not know the expected tablespace here, we ignore the
+ field for all data files, except for
+ innodb_checksum_algorithm=full_crc32 which we handled above. */
+ if (!checksum_field1 && !checksum_field2) {
+ /* A checksum field of zero can be valid. If the page is
+ not entirely empty, fall through and verify the
+ checksums. */
+ bool all_zeroes = true;
+ for (size_t i = 0; i < srv_page_size; i++) {
+#ifndef UNIV_INNOCHECKSUM
+ if (i == FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) {
+ i += 8;
+ }
+#endif
+ if (read_buf[i]) {
+ all_zeroes = false;
+ break;
+ }
+ }
+
+ if (all_zeroes) {
+ return false;
+ }
+ }
+
+ switch (curr_algo) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ return !buf_page_is_checksum_valid_crc32(
+ read_buf, checksum_field1, checksum_field2);
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ return !buf_page_is_checksum_valid_innodb(
+ read_buf, checksum_field1, checksum_field2);
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ return !buf_page_is_checksum_valid_none(
+ read_buf, checksum_field1, checksum_field2);
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ if (buf_page_is_checksum_valid_none(read_buf,
+ checksum_field1, checksum_field2)) {
+#ifdef UNIV_INNOCHECKSUM
+ if (log_file) {
+ fprintf(log_file, "page::" UINT32PF ";"
+ " old style: calculated = %u;"
+ " recorded = " ULINTPF ";\n",
+ cur_page_num,
+ buf_calc_page_old_checksum(read_buf),
+ checksum_field2);
+ fprintf(log_file, "page::" UINT32PF ";"
+ " new style: calculated = " UINT32PF ";"
+ " crc32 = " UINT32PF "; recorded = " ULINTPF ";\n",
+ cur_page_num,
+ buf_calc_page_new_checksum(read_buf),
+ buf_calc_page_crc32(read_buf),
+ checksum_field1);
+ }
+#endif /* UNIV_INNOCHECKSUM */
+ return false;
+ }
+
+ crc32_chksum = curr_algo == SRV_CHECKSUM_ALGORITHM_CRC32
+ || curr_algo == SRV_CHECKSUM_ALGORITHM_FULL_CRC32;
+
+ /* Very old versions of InnoDB stored only the 8-byte LSN at
+ the start and the end of the page. */
+
+ /* Since innodb_checksum_algorithm is not strict_*, allow any
+ of the algorithms to match for the old field. */
+
+ if (checksum_field2
+ != mach_read_from_4(read_buf + FIL_PAGE_LSN)
+ && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) {
+
+ if (crc32_chksum) {
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = true;
+
+ DBUG_EXECUTE_IF(
+ "page_intermittent_checksum_mismatch", {
+ static int page_counter;
+ if (page_counter++ == 2) {
+ crc32++;
+ }
+ });
+
+ if (checksum_field2 != crc32
+ && checksum_field2
+ != buf_calc_page_old_checksum(read_buf)) {
+ return true;
+ }
+ } else {
+ ut_ad(curr_algo
+ == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+ if (checksum_field2
+ != buf_calc_page_old_checksum(read_buf)) {
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = true;
+
+ if (checksum_field2 != crc32) {
+ return true;
+ }
+ }
+ }
+ }
+
+ if (checksum_field1 == 0
+ || checksum_field1 == BUF_NO_CHECKSUM_MAGIC) {
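+ /* An empty new-style field is acceptable: InnoDB before
+ 4.0.14/4.1.1 wrote 0 here (see above), and
+ innodb_checksum_algorithm=none writes the magic value. */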
+ } else if (crc32_chksum) {
+
+ if (!crc32_inited) {
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = true;
+ }
+
+ if (checksum_field1 != crc32
+ && checksum_field1
+ != buf_calc_page_new_checksum(read_buf)) {
+ return true;
+ }
+ } else {
+ ut_ad(curr_algo == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+ if (checksum_field1
+ != buf_calc_page_new_checksum(read_buf)) {
+
+ if (!crc32_inited) {
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = true;
+ }
+
+ if (checksum_field1 != crc32) {
+ return true;
+ }
+ }
+ }
+
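+ /* Finally, if both fields were validated as crc32, they must
+ also agree with each other: one field matching crc32 while the
+ other matches only the legacy formula indicates corruption. */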
+ if (crc32_inited
+ && ((checksum_field1 == crc32
+ && checksum_field2 != crc32)
+ || (checksum_field1 != crc32
+ && checksum_field2 == crc32))) {
+ return true;
+ }
+
+ break;
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ /* should have returned false earlier */
+ break;
+ }
+
+ return false;
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
+/** Enable buffers to be dumped to core files
+
+A convenience function, not called anywhere directly; however,
+it is left available for gdb or any debugger to call
+in the event that you want all of the memory to be dumped
+to a core file.
+
+Returns number of errors found in madvise calls. */
+int
+buf_madvise_do_dump()
+{
+ int ret= 0;
+
+ /* mirrors allocation in log_t::create() */
+ if (log_sys.buf) {
+ ret += madvise(log_sys.buf,
+ srv_log_buffer_size,
+ MADV_DODUMP);
+ ret += madvise(log_sys.flush_buf,
+ srv_log_buffer_size,
+ MADV_DODUMP);
+ }
+ /* mirrors recv_sys_t::create() */
+ if (recv_sys.buf)
+ {
+ ret+= madvise(recv_sys.buf, recv_sys.len, MADV_DODUMP);
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ auto chunk = buf_pool.chunks;
+
+ for (ulint n = buf_pool.n_chunks; n--; chunk++) {
+ ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP);
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return ret;
+}
+#endif
+
+/** Dump a page to stderr.
+@param[in] read_buf database page
+@param[in] zip_size compressed page size, or 0 */
+void buf_page_print(const byte* read_buf, ulint zip_size)
+{
+ dict_index_t* index;
+
+#ifndef UNIV_DEBUG
+ const ulint size = zip_size ? zip_size : srv_page_size;
+ ib::info() << "Page dump in ascii and hex ("
+ << size << " bytes):";
+
+ ut_print_buf(stderr, read_buf, size);
+ fputs("\nInnoDB: End of page dump\n", stderr);
+#endif
+
+ if (zip_size) {
+ /* Print compressed page. */
+ ib::info() << "Compressed page type ("
+ << fil_page_get_type(read_buf)
+ << "); stored checksum in field1 "
+ << mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
+ << "; calculated checksums for field1: "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_CRC32)
+ << " "
+ << page_zip_calc_checksum(
+ read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_CRC32)
+ << ", "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_INNODB)
+ << " "
+ << page_zip_calc_checksum(
+ read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_INNODB)
+ << ", "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_NONE)
+ << " "
+ << page_zip_calc_checksum(
+ read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_NONE)
+ << "; page LSN "
+ << mach_read_from_8(read_buf + FIL_PAGE_LSN)
+ << "; page number (if stored to page"
+ << " already) "
+ << mach_read_from_4(read_buf + FIL_PAGE_OFFSET)
+ << "; space id (if stored to page already) "
+ << mach_read_from_4(
+ read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ } else {
+ const uint32_t crc32 = buf_calc_page_crc32(read_buf);
+ ulint page_type = fil_page_get_type(read_buf);
+
+ ib::info() << "Uncompressed page, stored checksum in field1 "
+ << mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
+ << ", calculated checksums for field1: "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_CRC32) << " "
+ << crc32
+ << ", "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_INNODB) << " "
+ << buf_calc_page_new_checksum(read_buf)
+ << ", "
+ << " page type " << page_type << " == "
+ << fil_get_page_type_name(page_type) << "."
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_NONE) << " "
+ << BUF_NO_CHECKSUM_MAGIC
+ << ", stored checksum in field2 "
+ << mach_read_from_4(read_buf + srv_page_size
+ - FIL_PAGE_END_LSN_OLD_CHKSUM)
+ << ", calculated checksums for field2: "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_CRC32) << " "
+ << crc32
+ << ", "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_INNODB) << " "
+ << buf_calc_page_old_checksum(read_buf)
+ << ", "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_NONE) << " "
+ << BUF_NO_CHECKSUM_MAGIC
+ << ", page LSN "
+ << mach_read_from_4(read_buf + FIL_PAGE_LSN)
+ << " "
+ << mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
+ << ", low 4 bytes of LSN at page end "
+ << mach_read_from_4(read_buf + srv_page_size
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)
+ << ", page number (if stored to page already) "
+ << mach_read_from_4(read_buf + FIL_PAGE_OFFSET)
+ << ", space id (if created with >= MySQL-4.1.1"
+ " and stored already) "
+ << mach_read_from_4(
+ read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ }
+
+ switch (fil_page_get_type(read_buf)) {
+ index_id_t index_id;
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_TYPE_INSTANT:
+ case FIL_PAGE_RTREE:
+ index_id = btr_page_get_index_id(read_buf);
+ ib::info() << "Page may be an index page where"
+ " index id is " << index_id;
+
+ index = dict_index_find_on_id_low(index_id);
+ if (index) {
+ ib::info()
+ << "Index " << index_id
+ << " is " << index->name
+ << " in table " << index->table->name;
+ }
+ break;
+ case FIL_PAGE_UNDO_LOG:
+ fputs("InnoDB: Page may be an undo log page\n", stderr);
+ break;
+ case FIL_PAGE_INODE:
+ fputs("InnoDB: Page may be an 'inode' page\n", stderr);
+ break;
+ case FIL_PAGE_IBUF_FREE_LIST:
+ fputs("InnoDB: Page may be an insert buffer free list page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_ALLOCATED:
+ fputs("InnoDB: Page may be a freshly allocated page\n",
+ stderr);
+ break;
+ case FIL_PAGE_IBUF_BITMAP:
+ fputs("InnoDB: Page may be an insert buffer bitmap page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_SYS:
+ fputs("InnoDB: Page may be a system page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_TRX_SYS:
+ fputs("InnoDB: Page may be a transaction system page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_FSP_HDR:
+ fputs("InnoDB: Page may be a file space header page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_XDES:
+ fputs("InnoDB: Page may be an extent descriptor page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_BLOB:
+ fputs("InnoDB: Page may be a BLOB page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ fputs("InnoDB: Page may be a compressed BLOB page\n",
+ stderr);
+ break;
+ }
+}
+
+/** Initialize a buffer page descriptor.
+@param[in,out] block buffer page descriptor
+@param[in] frame buffer page frame */
+static
+void
+buf_block_init(buf_block_t* block, byte* frame)
+{
+ /* This function should only be executed at database startup or by
+ buf_pool.resize(). Either way, the adaptive hash index must not exist. */
+ assert_block_ahi_empty_on_init(block);
+
+ block->frame = frame;
+
+ block->modify_clock = 0;
+ block->page.init(BUF_BLOCK_NOT_USED, page_id_t(~0ULL));
+#ifdef BTR_CUR_HASH_ADAPT
+ block->index = NULL;
+#endif /* BTR_CUR_HASH_ADAPT */
+ ut_d(block->in_unzip_LRU_list = false);
+ ut_d(block->in_withdraw_list = false);
+
+ page_zip_des_init(&block->page.zip);
+
+ ut_d(block->debug_latch = (rw_lock_t *) ut_malloc_nokey(sizeof(rw_lock_t)));
+
+ rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);
+
+ ut_d(rw_lock_create(PFS_NOT_INSTRUMENTED, block->debug_latch,
+ SYNC_LEVEL_VARYING));
+
+ block->lock.is_block_lock = 1;
+
+ ut_ad(rw_lock_validate(&(block->lock)));
+}
+
+/** Allocate a chunk of buffer frames.
+@param bytes requested size
+@return whether the allocation succeeded */
+inline bool buf_pool_t::chunk_t::create(size_t bytes)
+{
+ DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return false;);
+ /* Round down to a multiple of page size, although it already should be. */
+ bytes= ut_2pow_round<size_t>(bytes, srv_page_size);
+
+ mem= buf_pool.allocator.allocate_large_dontdump(bytes, &mem_pfx);
+
+ if (UNIV_UNLIKELY(!mem))
+ return false;
+
+ MEM_UNDEFINED(mem, mem_size());
+
+#ifdef HAVE_LIBNUMA
+ if (srv_numa_interleave)
+ {
+ struct bitmask *numa_mems_allowed= numa_get_mems_allowed();
+ if (mbind(mem, mem_size(), MPOL_INTERLEAVE,
+ numa_mems_allowed->maskp, numa_mems_allowed->size,
+ MPOL_MF_MOVE))
+ {
+ ib::warn() << "Failed to set NUMA memory policy of"
+ " buffer pool page frames to MPOL_INTERLEAVE"
+ " (error: " << strerror(errno) << ").";
+ }
+ numa_bitmask_free(numa_mems_allowed);
+ }
+#endif /* HAVE_LIBNUMA */
+
+ /* Allocate the block descriptors from
+ the start of the memory block. */
+ blocks= reinterpret_cast<buf_block_t*>(mem);
+
+ /* Align a pointer to the first frame. Note that when
+ opt_large_page_size is smaller than srv_page_size
+ (unlikely in practice, since srv_page_size is at most 64k),
+ we may allocate one fewer block than requested. When
+ it is bigger, we may allocate more blocks than requested. */
+ static_assert(sizeof(byte*) == sizeof(ulint), "pointer size");
+
+ byte *frame= reinterpret_cast<byte*>((reinterpret_cast<ulint>(mem) +
+ srv_page_size - 1) &
+ ~ulint{srv_page_size - 1});
+ size= (mem_pfx.m_size >> srv_page_size_shift) - (frame != mem);
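+ /* Illustration: with the typical 128 MiB chunk and 16 KiB pages,
+ this yields 8192 frames before the descriptor space is
+ subtracted below. */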
+
+ /* Subtract the space needed for block descriptors. */
+ {
+ ulint s= size;
+
+ while (frame < reinterpret_cast<const byte*>(blocks + s))
+ {
+ frame+= srv_page_size;
+ s--;
+ }
+
+ size= s;
+ }
+
+ /* Initialize the block descriptors and assign a frame to each
+ of them (the memory was already mapped above). */
+
+ buf_block_t *block= blocks;
+
+ for (auto i= size; i--; ) {
+ buf_block_init(block, frame);
+ MEM_UNDEFINED(block->frame, srv_page_size);
+ /* Add the block to the free list */
+ UT_LIST_ADD_LAST(buf_pool.free, &block->page);
+
+ ut_d(block->page.in_free_list = TRUE);
+ block++;
+ frame+= srv_page_size;
+ }
+
+ reg();
+
+ return true;
+}
+
+#ifdef UNIV_DEBUG
+/** Check that all file pages in the buffer chunk are in a replaceable state.
+@return address of a non-free block
+@retval nullptr if all freed */
+inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const
+{
+ buf_block_t *block= blocks;
+ for (auto i= size; i--; block++)
+ {
+ switch (block->page.state()) {
+ case BUF_BLOCK_ZIP_PAGE:
+ /* The uncompressed buffer pool should never
+ contain ROW_FORMAT=COMPRESSED block descriptors. */
+ ut_error;
+ break;
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ /* Skip blocks that are not being used for file pages. */
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ const lsn_t lsn= block->page.oldest_modification();
+
+ if (srv_read_only_mode)
+ {
+ /* The page cleaner is disabled in read-only mode. No pages
+ can be dirtied, so all of them must be clean. */
+ ut_ad(lsn == 0 || lsn == recv_sys.recovered_lsn ||
+ srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
+ ut_ad(!block->page.buf_fix_count());
+ ut_ad(block->page.io_fix() == BUF_IO_NONE);
+ break;
+ }
+
+ if (fsp_is_system_temporary(block->page.id().space()))
+ {
+ ut_ad(lsn == 0 || lsn == 2);
+ break;
+ }
+
+ if (lsn > 1 || !block->page.can_relocate())
+ return block;
+
+ break;
+ }
+ }
+
+ return nullptr;
+}
+#endif /* UNIV_DEBUG */
+
+/** Free the synchronization objects of a buffer pool block descriptor
+@param[in,out] block buffer pool block descriptor */
+static void buf_block_free_mutexes(buf_block_t* block)
+{
+ rw_lock_free(&block->lock);
+ ut_d(rw_lock_free(block->debug_latch));
+ ut_d(ut_free(block->debug_latch));
+}
+
+/** Create the hash table.
+@param n the lower bound of n_cells */
+void buf_pool_t::page_hash_table::create(ulint n)
+{
+ n_cells= ut_find_prime(n);
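+ /* A prime cell count helps the page_id fold values spread evenly
+ over the hash table. */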
+ const size_t size= pad(n_cells) * sizeof *array;
+ void* v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE);
+ memset(v, 0, size);
+ array= static_cast<hash_cell_t*>(v);
+}
+
+/** Create the buffer pool.
+@return whether the creation failed */
+bool buf_pool_t::create()
+{
+ ut_ad(this == &buf_pool);
+ ut_ad(srv_buf_pool_size % srv_buf_pool_chunk_unit == 0);
+ ut_ad(!is_initialised());
+ ut_ad(srv_buf_pool_size > 0);
+ ut_ad(!resizing);
+ ut_ad(!chunks_old);
+ ut_ad(!field_ref_zero);
+
+ NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
+
+ if (auto b= aligned_malloc(UNIV_PAGE_SIZE_MAX, 4096))
+ field_ref_zero= static_cast<const byte*>
+ (memset_aligned<4096>(b, 0, UNIV_PAGE_SIZE_MAX));
+ else
+ return true;
+
+ chunk_t::map_reg= UT_NEW_NOKEY(chunk_t::map());
+
+ new(&allocator) ut_allocator<unsigned char>(mem_key_buf_buf_pool);
+
+ n_chunks= srv_buf_pool_size / srv_buf_pool_chunk_unit;
+ const size_t chunk_size= srv_buf_pool_chunk_unit;
+
+ chunks= static_cast<chunk_t*>(ut_zalloc_nokey(n_chunks * sizeof *chunks));
+ UT_LIST_INIT(free, &buf_page_t::list);
+ curr_size= 0;
+ auto chunk= chunks;
+
+ do
+ {
+ if (!chunk->create(chunk_size))
+ {
+ while (--chunk >= chunks)
+ {
+ buf_block_t* block= chunk->blocks;
+
+ for (auto i= chunk->size; i--; block++)
+ buf_block_free_mutexes(block);
+
+ allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx);
+ }
+ ut_free(chunks);
+ chunks= nullptr;
+ UT_DELETE(chunk_t::map_reg);
+ chunk_t::map_reg= nullptr;
+ aligned_free(const_cast<byte*>(field_ref_zero));
+ field_ref_zero= nullptr;
+ ut_ad(!is_initialised());
+ return true;
+ }
+
+ curr_size+= chunk->size;
+ }
+ while (++chunk < chunks + n_chunks);
+
+ ut_ad(is_initialised());
+ mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST);
+
+ UT_LIST_INIT(LRU, &buf_page_t::LRU);
+ UT_LIST_INIT(withdraw, &buf_page_t::list);
+ withdraw_target= 0;
+ UT_LIST_INIT(flush_list, &buf_page_t::list);
+ UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU);
+
+ for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i)
+ UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list);
+ ulint s= curr_size;
+ old_size= s;
+ s/= BUF_READ_AHEAD_PORTION;
+ read_ahead_area= s >= READ_AHEAD_PAGES
+ ? READ_AHEAD_PAGES
+ : my_round_up_to_next_power(static_cast<uint32_t>(s));
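+ /* i.e. the read-ahead window is 1/BUF_READ_AHEAD_PORTION of the
+ pool, capped at READ_AHEAD_PAGES and rounded up to a power of
+ two. */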
+ curr_pool_size= srv_buf_pool_size;
+
+ n_chunks_new= n_chunks;
+
+ page_hash.create(2 * curr_size);
+ zip_hash.create(2 * curr_size);
+ last_printout_time= time(NULL);
+
+ mysql_mutex_init(flush_list_mutex_key, &flush_list_mutex,
+ MY_MUTEX_INIT_FAST);
+
+ pthread_cond_init(&done_flush_LRU, nullptr);
+ pthread_cond_init(&done_flush_list, nullptr);
+ pthread_cond_init(&do_flush_list, nullptr);
+ pthread_cond_init(&done_free, nullptr);
+
+ try_LRU_scan= true;
+
+ ut_d(flush_hp.m_mutex= &flush_list_mutex);
+ ut_d(lru_hp.m_mutex= &mutex);
+ ut_d(lru_scan_itr.m_mutex= &mutex);
+
+ io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) *
+ OS_AIO_N_PENDING_IOS_PER_THREAD);
+
+ /* FIXME: remove some of these variables */
+ srv_buf_pool_curr_size= curr_pool_size;
+ srv_buf_pool_old_size= srv_buf_pool_size;
+ srv_buf_pool_base_size= srv_buf_pool_size;
+
+ last_activity_count= srv_get_activity_count();
+
+ chunk_t::map_ref= chunk_t::map_reg;
+ buf_LRU_old_ratio_update(100 * 3 / 8, false);
+ btr_search_sys_create();
+ ut_ad(is_initialised());
+ return false;
+}
+
+/** Clean up after successful create() */
+void buf_pool_t::close()
+{
+ ut_ad(this == &buf_pool);
+ if (!is_initialised())
+ return;
+
+ mysql_mutex_destroy(&mutex);
+ mysql_mutex_destroy(&flush_list_mutex);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev_bpage= nullptr; bpage;
+ bpage= prev_bpage)
+ {
+ prev_bpage= UT_LIST_GET_PREV(LRU, bpage);
+ ut_ad(bpage->in_file());
+ ut_ad(bpage->in_LRU_list);
+ /* The buffer pool must be clean during normal shutdown.
+ Only on aborted startup (with recovery) or with innodb_fast_shutdown=2
+ we may discard changes. */
+ ut_d(const lsn_t oldest= bpage->oldest_modification();)
+ ut_ad(fsp_is_system_temporary(bpage->id().space())
+ ? (oldest == 0 || oldest == 2)
+ : oldest <= 1 || srv_is_being_started || srv_fast_shutdown == 2);
+
+ if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+ buf_page_free_descriptor(bpage);
+ }
+
+ for (auto chunk= chunks + n_chunks; --chunk >= chunks; )
+ {
+ buf_block_t *block= chunk->blocks;
+
+ for (auto i= chunk->size; i--; block++)
+ buf_block_free_mutexes(block);
+
+ allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx);
+ }
+
+ pthread_cond_destroy(&done_flush_LRU);
+ pthread_cond_destroy(&done_flush_list);
+ pthread_cond_destroy(&do_flush_list);
+ pthread_cond_destroy(&done_free);
+
+ ut_free(chunks);
+ chunks= nullptr;
+ page_hash.free();
+ zip_hash.free();
+
+ io_buf.close();
+ UT_DELETE(chunk_t::map_reg);
+ chunk_t::map_reg= chunk_t::map_ref= nullptr;
+ aligned_free(const_cast<byte*>(field_ref_zero));
+ field_ref_zero= nullptr;
+}
+
+/** Try to reallocate a control block.
+@param block control block to reallocate
+@return whether the reallocation succeeded */
+inline bool buf_pool_t::realloc(buf_block_t *block)
+{
+ buf_block_t* new_block;
+
+ mysql_mutex_assert_owner(&mutex);
+ ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+
+ new_block = buf_LRU_get_free_only();
+
+ if (new_block == NULL) {
+ return(false); /* free list was not enough */
+ }
+
+ const page_id_t id(block->page.id());
+ page_hash_latch* hash_lock = hash_lock_get(id);
+ hash_lock->write_lock();
+
+ if (block->page.can_relocate()) {
+ memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(
+ new_block->frame, block->frame, srv_page_size);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ new (&new_block->page) buf_page_t(block->page);
+
+ /* relocate LRU list */
+ if (buf_page_t* prev_b = buf_pool.LRU_remove(&block->page)) {
+ UT_LIST_INSERT_AFTER(LRU, prev_b, &new_block->page);
+ } else {
+ UT_LIST_ADD_FIRST(LRU, &new_block->page);
+ }
+
+ if (LRU_old == &block->page) {
+ LRU_old = &new_block->page;
+ }
+
+ ut_ad(new_block->page.in_LRU_list);
+
+ /* relocate unzip_LRU list */
+ if (block->page.zip.data != NULL) {
+ ut_ad(block->in_unzip_LRU_list);
+ ut_d(new_block->in_unzip_LRU_list = true);
+
+ buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
+ UT_LIST_REMOVE(unzip_LRU, block);
+
+ ut_d(block->in_unzip_LRU_list = false);
+ block->page.zip.data = NULL;
+ page_zip_set_size(&block->page.zip, 0);
+
+ if (prev_block != NULL) {
+ UT_LIST_INSERT_AFTER(unzip_LRU, prev_block, new_block);
+ } else {
+ UT_LIST_ADD_FIRST(unzip_LRU, new_block);
+ }
+ } else {
+ ut_ad(!block->in_unzip_LRU_list);
+ ut_d(new_block->in_unzip_LRU_list = false);
+ }
+
+ /* relocate page_hash */
+ ut_ad(block->page.in_page_hash);
+ ut_ad(new_block->page.in_page_hash);
+ const ulint fold = id.fold();
+ ut_ad(&block->page == page_hash_get_low(id, fold));
+ ut_d(block->page.in_page_hash = false);
+ HASH_REPLACE(buf_page_t, hash, &page_hash, fold,
+ &block->page, &new_block->page);
+
+ buf_block_modify_clock_inc(block);
+ static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+ memset_aligned<4>(block->frame + FIL_PAGE_OFFSET, 0xff, 4);
+ static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+ "not perfect alignment");
+ memset_aligned<2>(block->frame
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+ MEM_UNDEFINED(block->frame, srv_page_size);
+ block->page.set_state(BUF_BLOCK_REMOVE_HASH);
+ if (!fsp_is_system_temporary(id.space())) {
+ buf_flush_relocate_on_flush_list(&block->page,
+ &new_block->page);
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ block->page.set_corrupt_id();
+
+ /* set other flags of buf_block_t */
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* This code should only be executed by resize(),
+ while the adaptive hash index is disabled. */
+ assert_block_ahi_empty(block);
+ assert_block_ahi_empty_on_init(new_block);
+ ut_ad(!block->index);
+ new_block->index = NULL;
+ new_block->n_hash_helps = 0;
+ new_block->n_fields = 1;
+ new_block->left_side = TRUE;
+#endif /* BTR_CUR_HASH_ADAPT */
+ ut_d(block->page.set_state(BUF_BLOCK_MEMORY));
+ /* free block */
+ new_block = block;
+ }
+
+ hash_lock->write_unlock();
+ buf_LRU_block_free_non_file_page(new_block);
+ return(true); /* free_list was enough */
+}
+
+/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3).
+@param[in] fmt format
+@param[in] ... extra parameters according to fmt */
+static
+void
+buf_resize_status(
+ const char* fmt,
+ ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+
+ vsnprintf(
+ export_vars.innodb_buffer_pool_resize_status,
+ sizeof(export_vars.innodb_buffer_pool_resize_status),
+ fmt, ap);
+
+ va_end(ap);
+
+ ib::info() << export_vars.innodb_buffer_pool_resize_status;
+}
+
+/** Withdraw blocks from the buffer pool until meeting withdraw_target.
+@return whether retry is needed */
+inline bool buf_pool_t::withdraw_blocks()
+{
+ buf_block_t* block;
+ ulint loop_count = 0;
+
+ ib::info() << "start to withdraw the last "
+ << withdraw_target << " blocks";
+
+ /* Minimize zip_free[i] lists */
+ mysql_mutex_lock(&mutex);
+ buf_buddy_condense_free();
+ mysql_mutex_unlock(&mutex);
+
+ while (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
+
+ /* try to withdraw from free_list */
+ ulint count1 = 0;
+
+ mysql_mutex_lock(&mutex);
+ block = reinterpret_cast<buf_block_t*>(
+ UT_LIST_GET_FIRST(free));
+ while (block != NULL
+ && UT_LIST_GET_LEN(withdraw) < withdraw_target) {
+ ut_ad(block->page.in_free_list);
+ ut_ad(!block->page.oldest_modification());
+ ut_ad(!block->page.in_LRU_list);
+ ut_a(!block->page.in_file());
+
+ buf_block_t* next_block;
+ next_block = reinterpret_cast<buf_block_t*>(
+ UT_LIST_GET_NEXT(
+ list, &block->page));
+
+ if (will_be_withdrawn(block->page)) {
+ /* This should be withdrawn */
+ UT_LIST_REMOVE(free, &block->page);
+ UT_LIST_ADD_LAST(withdraw, &block->page);
+ ut_d(block->in_withdraw_list = true);
+ count1++;
+ }
+
+ block = next_block;
+ }
+ mysql_mutex_unlock(&mutex);
+
+ /* reserve free_list length */
+ if (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
+ ulint n_flushed = buf_flush_LRU(
+ std::max<ulint>(withdraw_target
+ - UT_LIST_GET_LEN(withdraw),
+ srv_LRU_scan_depth));
+ buf_flush_wait_batch_end_acquiring_mutex(true);
+
+ if (n_flushed) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_FLUSH_COUNT,
+ MONITOR_LRU_BATCH_FLUSH_PAGES,
+ n_flushed);
+ }
+ }
+
+ /* relocate blocks/buddies in withdrawn area */
+ ulint count2 = 0;
+
+ mysql_mutex_lock(&mutex);
+ buf_page_t* bpage;
+ bpage = UT_LIST_GET_FIRST(LRU);
+ while (bpage != NULL) {
+ buf_page_t* next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
+ if (bpage->zip.data != NULL
+ && will_be_withdrawn(bpage->zip.data)
+ && bpage->can_relocate()) {
+ buf_pool_mutex_exit_forbid();
+ if (!buf_buddy_realloc(
+ bpage->zip.data,
+ page_zip_get_size(&bpage->zip))) {
+ /* failed to allocate block */
+ buf_pool_mutex_exit_allow();
+ break;
+ }
+ buf_pool_mutex_exit_allow();
+ count2++;
+ }
+
+ if (bpage->state() == BUF_BLOCK_FILE_PAGE
+ && will_be_withdrawn(*bpage)) {
+ if (bpage->can_relocate()) {
+ buf_pool_mutex_exit_forbid();
+ if (!realloc(
+ reinterpret_cast<buf_block_t*>(
+ bpage))) {
+ /* failed to allocate block */
+ buf_pool_mutex_exit_allow();
+ break;
+ }
+ buf_pool_mutex_exit_allow();
+ count2++;
+ }
+ /* NOTE: if the page is in use, it has
+ not been relocated yet */
+ }
+
+ bpage = next_bpage;
+ }
+ mysql_mutex_unlock(&mutex);
+
+ buf_resize_status(
+ "withdrawing blocks. (" ULINTPF "/" ULINTPF ")",
+ UT_LIST_GET_LEN(withdraw),
+ withdraw_target);
+
+ ib::info() << "withdrew "
+ << count1 << " blocks from free list."
+ << " Tried to relocate " << count2 << " pages ("
+ << UT_LIST_GET_LEN(withdraw) << "/"
+ << withdraw_target << ")";
+
+ if (++loop_count >= 10) {
+ /* Give up for now; this will be retried
+ after the user threads have paused. */
+
+ ib::info() << "will retry to withdraw later";
+
+ /* need retry later */
+ return(true);
+ }
+ }
+
+ /* confirm that enough blocks were withdrawn */
+ for (const chunk_t* chunk = chunks + n_chunks_new,
+ * const echunk = chunks + n_chunks; chunk != echunk; chunk++) {
+ block = chunk->blocks;
+ for (ulint j = chunk->size; j--; block++) {
+ ut_a(block->page.state() == BUF_BLOCK_NOT_USED);
+ ut_ad(block->in_withdraw_list);
+ }
+ }
+
+ ib::info() << "withdrawn target: " << UT_LIST_GET_LEN(withdraw)
+ << " blocks";
+
+ return(false);
+}
+
+inline void buf_pool_t::page_hash_table::write_lock_all()
+{
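+ /* Every (ELEMENTS_PER_LATCH + 1)th array element is a latch that
+ protects the ELEMENTS_PER_LATCH hash cells after it; walk the
+ array backwards and acquire each latch. */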
+ for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
+ {
+ reinterpret_cast<page_hash_latch&>(array[n]).write_lock();
+ if (!n)
+ break;
+ }
+}
+
+inline void buf_pool_t::page_hash_table::write_unlock_all()
+{
+ for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
+ {
+ reinterpret_cast<page_hash_latch&>(array[n]).write_unlock();
+ if (!n)
+ break;
+ }
+}
+
+namespace
+{
+
+struct find_interesting_trx
+{
+ void operator()(const trx_t &trx)
+ {
+ if (trx.state == TRX_STATE_NOT_STARTED)
+ return;
+ if (trx.mysql_thd == nullptr)
+ return;
+ if (withdraw_started <= trx.start_time)
+ return;
+
+ if (!found)
+ {
+ ib::warn() << "The following trx might hold "
+ "the blocks in buffer pool to "
+ "be withdrawn. Buffer pool "
+ "resizing can complete only "
+ "after all the transactions "
+ "below release the blocks.";
+ found= true;
+ }
+
+ lock_trx_print_wait_and_mvcc_state(stderr, &trx, current_time);
+ }
+
+ bool &found;
+ time_t withdraw_started;
+ time_t current_time;
+};
+
+} // namespace
+
+/** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
+inline void buf_pool_t::resize()
+{
+ ut_ad(this == &buf_pool);
+
+ bool warning = false;
+
+ NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
+
+ ut_ad(!resize_in_progress());
+ ut_ad(srv_buf_pool_chunk_unit > 0);
+
+ ulint new_instance_size = srv_buf_pool_size >> srv_page_size_shift;
+
+ buf_resize_status("Resizing buffer pool from " ULINTPF " to "
+ ULINTPF " (unit=" ULINTPF ").",
+ srv_buf_pool_old_size, srv_buf_pool_size,
+ srv_buf_pool_chunk_unit);
+
+ mysql_mutex_lock(&mutex);
+ ut_ad(curr_size == old_size);
+ ut_ad(n_chunks_new == n_chunks);
+ ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
+
+ n_chunks_new = (new_instance_size << srv_page_size_shift)
+ / srv_buf_pool_chunk_unit;
+ curr_size = n_chunks_new * chunks->size;
+ mysql_mutex_unlock(&mutex);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* disable AHI if needed */
+ buf_resize_status("Disabling adaptive hash index.");
+
+ /* Read btr_search_enabled under the AHI latches so that a
+ stable value is observed before disabling the index. */
+ btr_search_s_lock_all();
+ const bool btr_search_disabled = btr_search_enabled;
+ btr_search_s_unlock_all();
+
+ btr_search_disable();
+
+ if (btr_search_disabled) {
+ ib::info() << "disabled adaptive hash index.";
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (curr_size < old_size) {
+ /* set withdraw target */
+ size_t w = 0;
+
+ for (const chunk_t* chunk = chunks + n_chunks_new,
+ * const echunk = chunks + n_chunks;
+ chunk != echunk; chunk++)
+ w += chunk->size;
+
+ ut_ad(withdraw_target == 0);
+ withdraw_target = w;
+ }
+
+ buf_resize_status("Withdrawing blocks to be shrunken.");
+
+ time_t withdraw_started = time(NULL);
+ double message_interval = 60;
+ ulint retry_interval = 1;
+
+withdraw_retry:
+ /* wait until the number of blocks fits the new size (if needed) */
+ bool should_retry_withdraw = curr_size < old_size
+ && withdraw_blocks();
+
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
+ /* abort the resize at shutdown */
+ return;
+ }
+
+ /* abort buffer pool load */
+ buf_load_abort();
+
+ const time_t current_time = time(NULL);
+
+ if (should_retry_withdraw
+ && difftime(current_time, withdraw_started) >= message_interval) {
+
+ if (message_interval > 900) {
+ message_interval = 1800;
+ } else {
+ message_interval *= 2;
+ }
+
+ lock_mutex_enter();
+ bool found = false;
+ trx_sys.trx_list.for_each(find_interesting_trx{
+ found, withdraw_started, current_time});
+ lock_mutex_exit();
+
+ withdraw_started = current_time;
+ }
+
+ if (should_retry_withdraw) {
+ ib::info() << "Will retry to withdraw " << retry_interval
+ << " seconds later.";
+ os_thread_sleep(retry_interval * 1000000);
+
+ if (retry_interval > 5) {
+ retry_interval = 10;
+ } else {
+ retry_interval *= 2;
+ }
+
+ goto withdraw_retry;
+ }
+
+ buf_resize_status("Latching whole of buffer pool.");
+
+#ifndef DBUG_OFF
+ {
+ bool should_wait = true;
+
+ while (should_wait) {
+ should_wait = false;
+ DBUG_EXECUTE_IF(
+ "ib_buf_pool_resize_wait_before_resize",
+ should_wait = true; os_thread_sleep(10000););
+ }
+ }
+#endif /* !DBUG_OFF */
+
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
+ return;
+ }
+
+ /* Indicate critical path */
+ resizing.store(true, std::memory_order_relaxed);
+
+ mysql_mutex_lock(&mutex);
+ page_hash.write_lock_all();
+
+ chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map());
+
+ /* add/delete chunks */
+
+ buf_resize_status("buffer pool resizing with chunks "
+ ULINTPF " to " ULINTPF ".",
+ n_chunks, n_chunks_new);
+
+ if (n_chunks_new < n_chunks) {
+ /* delete chunks */
+ chunk_t* chunk = chunks + n_chunks_new;
+ const chunk_t* const echunk = chunks + n_chunks;
+
+ ulint sum_freed = 0;
+
+ while (chunk < echunk) {
+ /* buf_LRU_block_free_non_file_page() invokes
+ MEM_NOACCESS() on any buf_pool.free blocks.
+ We must cancel the effect of that. In
+ MemorySanitizer, MEM_NOACCESS() is no-op, so
+ we must not do anything special for it here. */
+#ifdef HAVE_valgrind
+# if !__has_feature(memory_sanitizer)
+ MEM_MAKE_DEFINED(chunk->mem, chunk->mem_size());
+# endif
+#else
+ MEM_MAKE_ADDRESSABLE(chunk->mem, chunk->mem_size());
+#endif
+
+ buf_block_t* block = chunk->blocks;
+
+ for (ulint j = chunk->size; j--; block++) {
+ buf_block_free_mutexes(block);
+ }
+
+ allocator.deallocate_large_dodump(
+ chunk->mem, &chunk->mem_pfx);
+ sum_freed += chunk->size;
+ ++chunk;
+ }
+
+ /* discard withdraw list */
+ UT_LIST_INIT(withdraw, &buf_page_t::list);
+ withdraw_target = 0;
+
+ ib::info() << n_chunks - n_chunks_new
+ << " chunks (" << sum_freed
+ << " blocks) were freed.";
+
+ n_chunks = n_chunks_new;
+ }
+
+ {
+ /* reallocate chunks */
+ const size_t new_chunks_size
+ = n_chunks_new * sizeof(chunk_t);
+
+ chunk_t* new_chunks = static_cast<chunk_t*>(
+ ut_zalloc_nokey_nofatal(new_chunks_size));
+
+ DBUG_EXECUTE_IF("buf_pool_resize_chunk_null",
+ ut_free(new_chunks); new_chunks= nullptr; );
+
+ if (!new_chunks) {
+ ib::error() << "failed to allocate"
+ " the chunk array.";
+ n_chunks_new = n_chunks;
+ warning = true;
+ chunks_old = NULL;
+ goto calc_buf_pool_size;
+ }
+
+ ulint n_chunks_copy = ut_min(n_chunks_new,
+ n_chunks);
+
+ memcpy(new_chunks, chunks,
+ n_chunks_copy * sizeof *new_chunks);
+
+ for (ulint j = 0; j < n_chunks_copy; j++) {
+ new_chunks[j].reg();
+ }
+
+ chunks_old = chunks;
+ chunks = new_chunks;
+ }
+
+ if (n_chunks_new > n_chunks) {
+ /* add chunks */
+ ulint sum_added = 0;
+ ulint n = n_chunks;
+ const size_t unit = srv_buf_pool_chunk_unit;
+
+ for (chunk_t* chunk = chunks + n_chunks,
+ * const echunk = chunks + n_chunks_new;
+ chunk != echunk; chunk++) {
+ if (!chunk->create(unit)) {
+ ib::error() << "failed to allocate"
+ " memory for buffer pool chunk";
+
+ warning = true;
+ n_chunks_new = n_chunks;
+ break;
+ }
+
+ sum_added += chunk->size;
+ ++n;
+ }
+
+ ib::info() << n_chunks_new - n_chunks
+ << " chunks (" << sum_added
+ << " blocks) were added.";
+
+ n_chunks = n;
+ }
+calc_buf_pool_size:
+ /* recalc curr_size */
+ ulint new_size = 0;
+
+ {
+ chunk_t* chunk = chunks;
+ const chunk_t* const echunk = chunk + n_chunks;
+ do {
+ new_size += chunk->size;
+ } while (++chunk != echunk);
+ }
+
+ curr_size = new_size;
+ n_chunks_new = n_chunks;
+
+ if (chunks_old) {
+ ut_free(chunks_old);
+ chunks_old = NULL;
+ }
+
+ chunk_t::map* chunk_map_old = chunk_t::map_ref;
+ chunk_t::map_ref = chunk_t::map_reg;
+
+ /* set size */
+ ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
+ ulint s= curr_size;
+ old_size= s;
+ s/= BUF_READ_AHEAD_PORTION;
+ read_ahead_area= s >= READ_AHEAD_PAGES
+ ? READ_AHEAD_PAGES
+ : my_round_up_to_next_power(static_cast<uint32_t>(s));
+ curr_pool_size= n_chunks * srv_buf_pool_chunk_unit;
+ srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/
+ innodb_set_buf_pool_size(buf_pool_size_align(srv_buf_pool_curr_size));
+
+ const bool new_size_too_diff
+ = srv_buf_pool_base_size > srv_buf_pool_size * 2
+ || srv_buf_pool_base_size * 2 < srv_buf_pool_size;
+
+ mysql_mutex_unlock(&mutex);
+ page_hash.write_unlock_all();
+
+ UT_DELETE(chunk_map_old);
+
+ resizing.store(false, std::memory_order_relaxed);
+
+ /* Normalize other components, if the new size is too different */
+ if (!warning && new_size_too_diff) {
+ srv_buf_pool_base_size = srv_buf_pool_size;
+
+ buf_resize_status("Resizing also other hash tables.");
+
+ srv_lock_table_size = 5
+ * (srv_buf_pool_size >> srv_page_size_shift);
+ lock_sys.resize(srv_lock_table_size);
+ dict_sys.resize();
+
+ ib::info() << "Resized hash tables at lock_sys,"
+#ifdef BTR_CUR_HASH_ADAPT
+ " adaptive hash index,"
+#endif /* BTR_CUR_HASH_ADAPT */
+ " dictionary.";
+ }
+
+ /* normalize ibuf.max_size */
+ ibuf_max_size_update(srv_change_buffer_max_size);
+
+ if (srv_buf_pool_old_size != srv_buf_pool_size) {
+
+ ib::info() << "Completed to resize buffer pool from "
+ << srv_buf_pool_old_size
+ << " to " << srv_buf_pool_size << ".";
+ srv_buf_pool_old_size = srv_buf_pool_size;
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* enable AHI if needed */
+ if (btr_search_disabled) {
+ btr_search_enable(true);
+ ib::info() << "Re-enabled adaptive hash index.";
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ char now[32];
+
+ ut_sprintf_timestamp(now);
+ if (!warning) {
+ buf_resize_status("Completed resizing buffer pool at %s.",
+ now);
+ } else {
+ buf_resize_status("Resizing buffer pool failed,"
+ " finished resizing at %s.", now);
+ }
+
+ ut_d(validate());
+
+ return;
+}
+
+/** Thread pool task invoked by innodb_buffer_pool_size changes. */
+static void buf_resize_callback(void *)
+{
+ DBUG_ENTER("buf_resize_callback");
+ ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
+ mysql_mutex_lock(&buf_pool.mutex);
+ const auto size= srv_buf_pool_size;
+ const bool work= srv_buf_pool_old_size != size;
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (work)
+ buf_pool.resize();
+ else
+ {
+ std::ostringstream sout;
+ sout << "Size did not change: old size = new size = " << size;
+ buf_resize_status(sout.str().c_str());
+ }
+ DBUG_VOID_RETURN;
+}
+
+/* Ensure that task does not run in parallel, by setting max_concurrency to 1 for the thread group */
+static tpool::task_group single_threaded_group(1);
+static tpool::waitable_task buf_resize_task(buf_resize_callback,
+ nullptr, &single_threaded_group);
+
+void buf_resize_start()
+{
+ srv_thread_pool->submit_task(&buf_resize_task);
+}
+
+void buf_resize_shutdown()
+{
+ buf_resize_task.wait();
+}
+
+/** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and
+buf_pool.page_hash.
+The caller must relocate bpage->list.
+@param bpage BUF_BLOCK_ZIP_PAGE block
+@param dpage destination control block */
+static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
+{
+ const ulint fold= bpage->id().fold();
+ ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.hash_lock_get(bpage->id())->is_write_locked());
+ ut_a(bpage->io_fix() == BUF_IO_NONE);
+ ut_a(!bpage->buf_fix_count());
+ ut_ad(bpage == buf_pool.page_hash_get_low(bpage->id(), fold));
+ ut_ad(!buf_pool.watch_is_sentinel(*bpage));
+ ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
+
+ new (dpage) buf_page_t(*bpage);
+
+ /* Important that we adjust the hazard pointer before
+ removing bpage from LRU list. */
+ if (buf_page_t *b= buf_pool.LRU_remove(bpage))
+ UT_LIST_INSERT_AFTER(buf_pool.LRU, b, dpage);
+ else
+ UT_LIST_ADD_FIRST(buf_pool.LRU, dpage);
+
+ if (UNIV_UNLIKELY(buf_pool.LRU_old == bpage))
+ {
+ buf_pool.LRU_old= dpage;
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool.LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool.LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) ||
+ !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) ||
+ UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
+ }
+ else
+ {
+ /* Check that the "old" flag is consistent in
+ the block and its neighbours. */
+ dpage->set_old(dpage->is_old());
+#endif /* UNIV_LRU_DEBUG */
+ }
+
+ ut_d(CheckInLRUList::validate());
+
+ /* relocate buf_pool.page_hash */
+ ut_ad(bpage->in_page_hash);
+ ut_ad(dpage->in_page_hash);
+ ut_d(bpage->in_page_hash= false);
+ HASH_REPLACE(buf_page_t, hash, &buf_pool.page_hash, fold, bpage, dpage);
+}
+
+/** Register a watch for a page identifier. The caller must hold an
+exclusive page hash latch. The *hash_lock may be released,
+relocated, and reacquired.
+@param id page identifier
+@param hash_lock exclusively held page_hash latch
+@return a buffer pool block corresponding to id
+@retval nullptr if the block was not present, and a watch was installed */
+inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
+ page_hash_latch **hash_lock)
+{
+ const ulint fold= id.fold();
+ ut_ad(*hash_lock == page_hash.lock_get(fold));
+ ut_ad((*hash_lock)->is_write_locked());
+
+retry:
+ if (buf_page_t *bpage= page_hash_get_low(id, fold))
+ {
+ if (!watch_is_sentinel(*bpage))
+ /* The page was loaded meanwhile. */
+ return bpage;
+ /* Add to an existing watch. */
+ bpage->fix();
+ return nullptr;
+ }
+
+ (*hash_lock)->write_unlock();
+ /* Allocate a watch[] and then try to insert it into the page_hash. */
+ mysql_mutex_lock(&mutex);
+
+ /* The maximum number of purge tasks should never exceed
+ UT_ARR_SIZE(watch) - 1, and there is no way for a purge task to hold a
+ watch when setting another watch. */
+ for (buf_page_t *w= &watch[UT_ARR_SIZE(watch)]; w-- > watch; )
+ {
+ ut_ad(w->access_time == 0);
+ ut_ad(!w->oldest_modification());
+ ut_ad(!w->zip.data);
+ ut_ad(!w->in_zip_hash);
+ if (w->state() == BUF_BLOCK_ZIP_PAGE)
+ /* This watch may be in use for some other page. */
+ continue;
+ ut_ad(w->state() == BUF_BLOCK_NOT_USED);
+ ut_ad(!w->buf_fix_count());
+ /* w is pointing to watch[], which is protected by mutex.
+ Normally, buf_page_t::id for objects that are reachable by
+ page_hash_get_low(id, fold) are protected by hash_lock. */
+ w->set_state(BUF_BLOCK_ZIP_PAGE);
+ w->id_= id;
+
+ *hash_lock= page_hash.lock_get(fold);
+ (*hash_lock)->write_lock();
+ mysql_mutex_unlock(&mutex);
+
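+ /* While buf_pool.mutex and the hash latch were released, another
+ thread may have read the page in; if so, undo our reservation of
+ w and retry the lookup. */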
+ buf_page_t *bpage= page_hash_get_low(id, fold);
+ if (UNIV_LIKELY_NULL(bpage))
+ {
+ (*hash_lock)->write_unlock();
+ mysql_mutex_lock(&mutex);
+ w->set_state(BUF_BLOCK_NOT_USED);
+ *hash_lock= page_hash.lock_get(fold);
+ (*hash_lock)->write_lock();
+ mysql_mutex_unlock(&mutex);
+ goto retry;
+ }
+
+ ut_ad(!w->buf_fix_count_);
+ w->buf_fix_count_= 1;
+ ut_ad(!w->in_page_hash);
+ ut_d(w->in_page_hash= true); /* Not holding buf_pool.mutex here! */
+ HASH_INSERT(buf_page_t, hash, &page_hash, fold, w);
+ return nullptr;
+ }
+
+ ut_error;
+ mysql_mutex_unlock(&mutex);
+ return nullptr;
+}
+
+/** Mark the page status as FREED for the given tablespace id and
+page number. If the page is not in buffer pool then ignore it.
+@param[in,out] space tablespace
+@param[in] page page number
+@param[in,out] mtr mini-transaction
+@param[in] file file name
+@param[in] line line where called */
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr,
+ const char *file, unsigned line)
+{
+ ut_ad(mtr);
+ ut_ad(mtr->is_active());
+
+ if (srv_immediate_scrub_data_uncompressed
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ || space->is_compressed()
+#endif
+ )
+ mtr->add_freed_offset(space, page);
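+ /* Recording the freed offset lets the mini-transaction log the
+ free operation, so that the page can later be scrubbed or (for
+ page-compressed files) hole-punched. */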
+
+ buf_pool.stat.n_page_gets++;
+ const page_id_t page_id(space->id, page);
+ const ulint fold= page_id.fold();
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+ if (buf_block_t *block= reinterpret_cast<buf_block_t*>
+ (buf_pool.page_hash_get_low(page_id, fold)))
+ {
+ if (block->page.state() != BUF_BLOCK_FILE_PAGE)
+ /* FIXME: convert, but avoid buf_zip_decompress() */;
+ else
+ {
+ buf_block_buf_fix_inc(block, file, line);
+ ut_ad(block->page.buf_fix_count());
+ hash_lock->read_unlock();
+
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+ rw_lock_x_lock_inline(&block->lock, 0, file, line);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ block->page.status= buf_page_t::FREED;
+ return;
+ }
+ }
+
+ hash_lock->read_unlock();
+}
+
+/** Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with buf_page_release_zip().
+NOTE: the page is not protected by any latch. Mutual exclusion has to
+be implemented at a higher level. In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size
+@return pointer to the block */
+buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size)
+{
+ ut_ad(zip_size);
+ ut_ad(ut_is_2pow(zip_size));
+ buf_pool.stat.n_page_gets++;
+
+ bool discard_attempted= false;
+ const ulint fold= page_id.fold();
+ buf_page_t *bpage;
+ page_hash_latch *hash_lock;
+
+ for (;;)
+ {
+lookup:
+ bpage= buf_pool.page_hash_get_locked<false>(page_id, fold, &hash_lock);
+ if (bpage)
+ break;
+
+ dberr_t err= buf_read_page(page_id, zip_size);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ {
+ ib::error() << "Reading compressed page " << page_id
+ << " failed with error: " << err;
+ goto err_exit;
+ }
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ }
+
+ ut_ad(hash_lock->is_read_locked());
+
+ if (!bpage->zip.data)
+ {
+ /* There is no compressed page. */
+err_exit:
+ hash_lock->read_unlock();
+ return nullptr;
+ }
+
+ ut_ad(!buf_pool.watch_is_sentinel(*bpage));
+
+ switch (bpage->state()) {
+ case BUF_BLOCK_ZIP_PAGE:
+ bpage->fix();
+ goto got_block;
+ case BUF_BLOCK_FILE_PAGE:
+ /* Discard the uncompressed page frame if possible. */
+ if (!discard_attempted)
+ {
+ discard_attempted= true;
+ hash_lock->read_unlock();
+ mysql_mutex_lock(&buf_pool.mutex);
+ if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
+ buf_LRU_free_page(bpage, false);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto lookup;
+ }
+
+ buf_block_buf_fix_inc(reinterpret_cast<buf_block_t*>(bpage),
+ __FILE__, __LINE__);
+ goto got_block;
+ default:
+ break;
+ }
+
+ ut_error;
+ goto err_exit;
+
+got_block:
+ bool must_read= bpage->io_fix() == BUF_IO_READ;
+ hash_lock->read_unlock();
+
+ DBUG_ASSERT(bpage->status != buf_page_t::FREED);
+
+ bpage->set_accessed();
+ buf_page_make_young_if_needed(bpage);
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ ut_ad(bpage->buf_fix_count());
+ ut_ad(bpage->in_file());
+
+ if (must_read)
+ /* Let us wait until the read operation completes */
+ while (bpage->io_fix() == BUF_IO_READ)
+ os_thread_sleep(WAIT_FOR_READ);
+
+ return bpage;
+}
+
+/********************************************************************//**
+Initialize some fields of a control block. */
+UNIV_INLINE
+void
+buf_block_init_low(
+/*===============*/
+ buf_block_t* block) /*!< in: block to init */
+{
+#ifdef BTR_CUR_HASH_ADAPT
+ /* No adaptive hash index entries may point to a previously
+ unused (and now freshly allocated) block. */
+ assert_block_ahi_empty_on_init(block);
+ block->index = NULL;
+
+ block->n_hash_helps = 0;
+ block->n_fields = 1;
+ block->n_bytes = 0;
+ block->left_side = TRUE;
+#endif /* BTR_CUR_HASH_ADAPT */
+}
+
+/********************************************************************//**
+Decompress a block.
+@return TRUE if successful */
+ibool
+buf_zip_decompress(
+/*===============*/
+ buf_block_t* block, /*!< in/out: block */
+ ibool check) /*!< in: TRUE=verify the page checksum */
+{
+ const byte* frame = block->page.zip.data;
+ ulint size = page_zip_get_size(&block->page.zip);
+ /* The tablespace will not be found if this function is called
+ during IMPORT. */
+ fil_space_t* space= fil_space_t::get(block->page.id().space());
+ const unsigned key_version = mach_read_from_4(
+ frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+ fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL;
+ const bool encrypted = crypt_data
+ && crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
+ && (!crypt_data->is_default_encryption()
+ || srv_encrypt_tables);
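+ /* An apparently-corrupted compressed page on an encrypted
+ tablespace may simply have failed to decrypt; remember this so
+ that a hint can be printed on the error path below. */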
+
+ ut_ad(block->zip_size());
+ ut_a(block->page.id().space() != 0);
+
+ if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
+
+ ib::error() << "Compressed page checksum mismatch for "
+ << (space ? space->chain.start->name : "")
+ << block->page.id() << ": stored: "
+ << mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
+ << ", crc32: "
+ << page_zip_calc_checksum(
+ frame, size, SRV_CHECKSUM_ALGORITHM_CRC32)
+ << " innodb: "
+ << page_zip_calc_checksum(
+ frame, size, SRV_CHECKSUM_ALGORITHM_INNODB)
+ << ", none: "
+ << page_zip_calc_checksum(
+ frame, size, SRV_CHECKSUM_ALGORITHM_NONE)
+ << " (algorithm: " << srv_checksum_algorithm << ")";
+ goto err_exit;
+ }
+
+ switch (fil_page_get_type(frame)) {
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ if (page_zip_decompress(&block->page.zip,
+ block->frame, TRUE)) {
+ if (space) {
+ space->release();
+ }
+ return(TRUE);
+ }
+
+ ib::error() << "Unable to decompress "
+ << (space ? space->chain.start->name : "")
+ << block->page.id();
+ goto err_exit;
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ /* Copy to uncompressed storage. */
+ memcpy(block->frame, frame, block->zip_size());
+ if (space) {
+ space->release();
+ }
+
+ return(TRUE);
+ }
+
+ ib::error() << "Unknown compressed page type "
+ << fil_page_get_type(frame)
+ << " in " << (space ? space->chain.start->name : "")
+ << block->page.id();
+
+err_exit:
+ if (encrypted) {
+ ib::info() << "Row compressed page could be encrypted"
+ " with key_version " << key_version;
+ }
+
+ if (space) {
+ if (encrypted) {
+ dict_set_encrypted_by_space(space);
+ } else {
+ dict_set_corrupted_by_space(space);
+ }
+
+ space->release();
+ }
+
+ return(FALSE);
+}
+
+/** Wait for the block to be read in.
+@param[in] block The block to check */
+static
+void
+buf_wait_for_read(
+ buf_block_t* block)
+{
+ /* Note:
+
+ We are using the block->lock to check for IO state.
+ We set the IO_READ state under the protection of the hash_lock.
+ This is safe because another thread can only
+ access the block (and check for IO state) after the block has been
+ added to the page hashtable. */
+
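+ /* The read is initiated with block->lock held in exclusive mode,
+ so acquiring and releasing the shared latch below effectively
+ waits for the read to complete. */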
+ while (block->page.io_fix() == BUF_IO_READ) {
+ rw_lock_s_lock(&block->lock);
+ rw_lock_s_unlock(&block->lock);
+ }
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** If a stale adaptive hash index exists on the block, drop it.
+Multiple executions of btr_search_drop_page_hash_index() on the
+same block must be prevented by exclusive page latch. */
+ATTRIBUTE_COLD
+static void buf_defer_drop_ahi(buf_block_t *block, mtr_memo_type_t fix_type)
+{
+ switch (fix_type) {
+ case MTR_MEMO_BUF_FIX:
+ /* We do not drop the adaptive hash index, because safely doing
+ so would require acquiring block->lock, and that is not safe
+ to acquire in some RW_NO_LATCH access paths. Those code paths
+ should have no business accessing the adaptive hash index anyway. */
+ break;
+ case MTR_MEMO_PAGE_S_FIX:
+ /* Temporarily release our S-latch. */
+ rw_lock_s_unlock(&block->lock);
+ rw_lock_x_lock(&block->lock);
+ if (dict_index_t *index= block->index)
+ if (index->freed())
+ btr_search_drop_page_hash_index(block);
+ rw_lock_x_unlock(&block->lock);
+ rw_lock_s_lock(&block->lock);
+ break;
+ case MTR_MEMO_PAGE_SX_FIX:
+ rw_lock_sx_unlock(&block->lock);
+ rw_lock_x_lock(&block->lock);
+ if (dict_index_t *index= block->index)
+ if (index->freed())
+ btr_search_drop_page_hash_index(block);
+ rw_lock_x_unlock(&block->lock);
+ rw_lock_sx_lock(&block->lock);
+ break;
+ default:
+ ut_ad(fix_type == MTR_MEMO_PAGE_X_FIX);
+ btr_search_drop_page_hash_index(block);
+ }
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/** Lock the page with the given latch type.
+@param[in,out] block block to be locked
+@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in] mtr mini-transaction
+@param[in] file file name
+@param[in] line line where called
+@return pointer to locked block */
+static buf_block_t* buf_page_mtr_lock(buf_block_t *block,
+ ulint rw_latch,
+ mtr_t* mtr,
+ const char *file,
+ unsigned line)
+{
+ mtr_memo_type_t fix_type;
+ switch (rw_latch)
+ {
+ case RW_NO_LATCH:
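+ /* RW_NO_LATCH only buffer-fixes the block: no page latch is
+ acquired, and the stale-AHI check below is skipped. */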
+ fix_type= MTR_MEMO_BUF_FIX;
+ goto done;
+ case RW_S_LATCH:
+ rw_lock_s_lock_inline(&block->lock, 0, file, line);
+ fix_type= MTR_MEMO_PAGE_S_FIX;
+ break;
+ case RW_SX_LATCH:
+ rw_lock_sx_lock_inline(&block->lock, 0, file, line);
+ fix_type= MTR_MEMO_PAGE_SX_FIX;
+ break;
+ default:
+ ut_ad(rw_latch == RW_X_LATCH);
+ rw_lock_x_lock_inline(&block->lock, 0, file, line);
+ fix_type= MTR_MEMO_PAGE_X_FIX;
+ break;
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ {
+ dict_index_t *index= block->index;
+ if (index && index->freed())
+ buf_defer_drop_ahi(block, fix_type);
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+done:
+ mtr_memo_push(mtr, block, fix_type);
+ return block;
+}
+
+/** Low level function used to get access to a database page.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	rw_latch	RW_S_LATCH, RW_SX_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	guess	guessed block or NULL
+@param[in]	mode	BUF_GET, BUF_GET_IF_IN_POOL, BUF_PEEK_IF_IN_POOL,
+BUF_GET_NO_LATCH, BUF_GET_POSSIBLY_FREED, BUF_EVICT_IF_IN_POOL,
+or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in] file file name
+@param[in] line line where called
+@param[in] mtr mini-transaction
+@param[out] err DB_SUCCESS or error code
+@param[in]	allow_ibuf_merge	Allow change buffer merge while
+reading the page from file
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_low(
+ const page_id_t page_id,
+ ulint zip_size,
+ ulint rw_latch,
+ buf_block_t* guess,
+ ulint mode,
+ const char* file,
+ unsigned line,
+ mtr_t* mtr,
+ dberr_t* err,
+ bool allow_ibuf_merge)
+{
+ buf_block_t* block;
+ unsigned access_time;
+ ulint retries = 0;
+ const ulint fold = page_id.fold();
+
+ ut_ad((mtr == NULL) == (mode == BUF_EVICT_IF_IN_POOL));
+ ut_ad(!mtr || mtr->is_active());
+ ut_ad((rw_latch == RW_S_LATCH)
+ || (rw_latch == RW_X_LATCH)
+ || (rw_latch == RW_SX_LATCH)
+ || (rw_latch == RW_NO_LATCH));
+ ut_ad(!allow_ibuf_merge
+ || mode == BUF_GET
+ || mode == BUF_GET_POSSIBLY_FREED
+ || mode == BUF_GET_IF_IN_POOL
+ || mode == BUF_GET_IF_IN_POOL_OR_WATCH);
+
+ if (err) {
+ *err = DB_SUCCESS;
+ }
+
+#ifdef UNIV_DEBUG
+ switch (mode) {
+ case BUF_EVICT_IF_IN_POOL:
+ /* After DISCARD TABLESPACE, the tablespace would not exist,
+ but in IMPORT TABLESPACE, PageConverter::operator() must
+ replace any old pages, which were not evicted during DISCARD.
+ Skip the assertion on space_page_size. */
+ break;
+ case BUF_PEEK_IF_IN_POOL:
+ case BUF_GET_IF_IN_POOL:
+ /* The caller may pass a dummy page size,
+ because it does not really matter. */
+ break;
+ default:
+ ut_error;
+ case BUF_GET_POSSIBLY_FREED:
+ break;
+ case BUF_GET_NO_LATCH:
+ ut_ad(rw_latch == RW_NO_LATCH);
+ /* fall through */
+ case BUF_GET:
+ case BUF_GET_IF_IN_POOL_OR_WATCH:
+ fil_space_t* s = fil_space_get(page_id.space());
+ ut_ad(s);
+ ut_ad(s->zip_size() == zip_size);
+ }
+#endif /* UNIV_DEBUG */
+
+ ut_ad(!mtr || !ibuf_inside(mtr)
+ || ibuf_page_low(page_id, zip_size, FALSE, file, line, NULL));
+
+ buf_pool.stat.n_page_gets++;
+loop:
+ buf_block_t* fix_block;
+ block = guess;
+
+ page_hash_latch* hash_lock = buf_pool.page_hash.lock<false>(fold);
+
+ if (block) {
+
+ /* If the guess is a compressed page descriptor that
+ has been allocated by buf_page_alloc_descriptor(),
+ it may have been freed by buf_relocate(). */
+
+ if (!buf_pool.is_uncompressed(block)
+ || page_id != block->page.id()
+ || block->page.state() != BUF_BLOCK_FILE_PAGE) {
+ /* Our guess was bogus or things have changed
+ since. */
+ guess = nullptr;
+ goto lookup;
+ } else {
+ ut_ad(!block->page.in_zip_hash);
+ }
+ } else {
+lookup:
+ block = reinterpret_cast<buf_block_t*>(
+ buf_pool.page_hash_get_low(page_id, fold));
+ }
+
+ if (!block || buf_pool.watch_is_sentinel(block->page)) {
+ hash_lock->read_unlock();
+ block = nullptr;
+ }
+
+ if (UNIV_UNLIKELY(!block)) {
+ /* Page not in buf_pool: needs to be read from file */
+ if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+ hash_lock = buf_pool.page_hash.lock<true>(fold);
+
+ if (buf_page_t *bpage= buf_pool.watch_set(
+ page_id, &hash_lock)) {
+ /* We can release hash_lock after we
+ increment the fix count to make
+ sure that no state change takes place. */
+ bpage->fix();
+ hash_lock->write_unlock();
+ block = reinterpret_cast<buf_block_t*>(bpage);
+ fix_block = block;
+ goto got_block;
+ }
+
+ hash_lock->write_unlock();
+ }
+
+ switch (mode) {
+ case BUF_GET_IF_IN_POOL:
+ case BUF_GET_IF_IN_POOL_OR_WATCH:
+ case BUF_PEEK_IF_IN_POOL:
+ case BUF_EVICT_IF_IN_POOL:
+ return(NULL);
+ }
+
+		/* The call path is buf_read_page() ->
+		buf_read_page_low() (fil_space_t::io()) ->
+		buf_page_read_complete() ->
+		buf_decrypt_after_read(), where the page is decrypted
+		using the fil_space_t, and then
+		buf_page_check_corrupt(), where the page checksums are
+		compared. Decryption and decompression, as well as
+		error handling, take place at that lower level. Here we
+		only need to know whether the page really is corrupted,
+		or whether an encrypted page with a valid checksum
+		cannot be decrypted. */
+
+ dberr_t local_err = buf_read_page(page_id, zip_size);
+
+ if (local_err == DB_SUCCESS) {
+ buf_read_ahead_random(page_id, zip_size,
+ ibuf_inside(mtr));
+
+ retries = 0;
+ } else if (mode == BUF_GET_POSSIBLY_FREED) {
+ if (err) {
+ *err = local_err;
+ }
+ return NULL;
+ } else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
+ ++retries;
+
+ DBUG_EXECUTE_IF(
+ "innodb_page_corruption_retries",
+ retries = BUF_PAGE_READ_MAX_RETRIES;
+ );
+ } else {
+ if (err) {
+ *err = local_err;
+ }
+
+			/* Pages whose encryption key is unavailable, or
+			whose key, encryption algorithm or encryption
+			method is incorrect, are marked as encrypted in
+			buf_page_check_corrupt(). An unencrypted page
+			could be corrupted in a way that makes the
+			key_id field nonzero; there is no checksum on
+			the field FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION. */
+ if (local_err == DB_DECRYPTION_FAILED) {
+ return (NULL);
+ }
+
+ if (local_err == DB_PAGE_CORRUPTED
+ && srv_force_recovery) {
+ return NULL;
+ }
+
+ /* Try to set table as corrupted instead of
+ asserting. */
+ if (page_id.space() == TRX_SYS_SPACE) {
+ } else if (page_id.space() == SRV_TMP_SPACE_ID) {
+ } else if (fil_space_t* space= fil_space_t::get(
+ page_id.space())) {
+ bool set = dict_set_corrupted_by_space(space);
+ space->release();
+ if (set) {
+ return NULL;
+ }
+ }
+
+ ib::fatal() << "Unable to read page " << page_id
+ << " into the buffer pool after "
+ << BUF_PAGE_READ_MAX_RETRIES
+ << ". The most probable cause"
+ " of this error may be that the"
+ " table has been corrupted."
+ " See https://mariadb.com/kb/en/library/innodb-recovery-modes/";
+ }
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ goto loop;
+ } else {
+ fix_block = block;
+ }
+
+ fix_block->fix();
+ hash_lock->read_unlock();
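+	/* From this point on, fix_block is buffer-fixed and therefore
+	cannot be evicted or relocated, so the page hash latch is no
+	longer needed. */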
+
+got_block:
+ switch (mode) {
+ default:
+ ut_ad(block->zip_size() == zip_size);
+ break;
+ case BUF_GET_IF_IN_POOL:
+ case BUF_PEEK_IF_IN_POOL:
+ case BUF_EVICT_IF_IN_POOL:
+ if (fix_block->page.io_fix() == BUF_IO_READ) {
+ /* The page is being read to buffer pool,
+ but we cannot wait around for the read to
+ complete. */
+ fix_block->unfix();
+ return(NULL);
+ }
+ }
+
+ switch (UNIV_EXPECT(fix_block->page.state(), BUF_BLOCK_FILE_PAGE)) {
+ case BUF_BLOCK_FILE_PAGE:
+ if (fsp_is_system_temporary(page_id.space())
+ && block->page.io_fix() != BUF_IO_NONE) {
+ /* This suggests that the page is being flushed.
+ Avoid returning reference to this page.
+ Instead wait for the flush action to complete. */
+ fix_block->unfix();
+ os_thread_sleep(WAIT_FOR_WRITE);
+ goto loop;
+ }
+
+ if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
+evict_from_pool:
+ ut_ad(!fix_block->page.oldest_modification());
+ mysql_mutex_lock(&buf_pool.mutex);
+ fix_block->unfix();
+
+ if (!buf_LRU_free_page(&fix_block->page, true)) {
+ ut_ad(0);
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return(NULL);
+ }
+
+ break;
+ default:
+ ut_error;
+ break;
+
+ case BUF_BLOCK_ZIP_PAGE:
+ if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
+ goto evict_from_pool;
+ }
+
+ if (mode == BUF_PEEK_IF_IN_POOL) {
+ /* This mode is only used for dropping an
+ adaptive hash index. There cannot be an
+ adaptive hash index for a compressed-only
+ page, so do not bother decompressing the page. */
+ fix_block->unfix();
+
+ return(NULL);
+ }
+
+ buf_page_t* bpage = &block->page;
+
+ /* Note: We have already buffer fixed this block. */
+ if (bpage->buf_fix_count() > 1
+ || bpage->io_fix() != BUF_IO_NONE) {
+
+			/* This condition often occurs when the block
+			is not buffer-fixed, but I/O-fixed by
+			buf_page_init_for_read(). */
+ fix_block->unfix();
+
+ /* The block is buffer-fixed or I/O-fixed.
+ Try again later. */
+ os_thread_sleep(WAIT_FOR_READ);
+
+ goto loop;
+ }
+
+		/* The compressed page descriptor (bpage) has been
+		buffer-fixed above, so it cannot be evicted or relocated
+		while we allocate an uncompressed page for it. */
+
+		block = buf_LRU_get_free_block(false);
+		buf_block_init_low(block);
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ hash_lock = buf_pool.page_hash.lock_get(fold);
+
+ hash_lock->write_lock();
+
+ /* Buffer-fixing prevents the page_hash from changing. */
+ ut_ad(bpage == buf_pool.page_hash_get_low(page_id, fold));
+
+ fix_block->unfix(); /* hash_lock protects us after this */
+
+ if (bpage->buf_fix_count() || bpage->io_fix() != BUF_IO_NONE) {
+ /* The block was buffer-fixed or I/O-fixed while
+ buf_pool.mutex was not held by this thread.
+ Free the block that was allocated and retry.
+ This should be extremely unlikely, for example,
+ if buf_page_get_zip() was invoked. */
+
+ hash_lock->write_unlock();
+ buf_LRU_block_free_non_file_page(block);
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ /* Try again */
+ goto loop;
+ }
+
+ fix_block = block;
+
+ /* Move the compressed page from bpage to block,
+ and uncompress it. */
+
+ /* Note: this is the uncompressed block and it is not
+ accessible by other threads yet because it is not in
+ any list or hash table */
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_relocate(bpage, &block->page);
+
+ /* Set after buf_relocate(). */
+ block->page.set_buf_fix_count(1);
+
+ buf_flush_relocate_on_flush_list(bpage, &block->page);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ /* Buffer-fix, I/O-fix, and X-latch the block
+ for the duration of the decompression.
+ Also add the block to the unzip_LRU list. */
+ block->page.set_state(BUF_BLOCK_FILE_PAGE);
+
+ /* Insert at the front of unzip_LRU list */
+ buf_unzip_LRU_add_block(block, FALSE);
+
+ block->page.set_io_fix(BUF_IO_READ);
+ rw_lock_x_lock_inline(&block->lock, 0, file, line);
+
+ MEM_UNDEFINED(bpage, sizeof *bpage);
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ hash_lock->write_unlock();
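+		/* At this point, the uncompressed block is present in
+		buf_pool.page_hash and the LRU lists, and it is
+		I/O-fixed and X-latched, so other threads cannot modify
+		it while the page frame is decompressed below. */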
+ buf_pool.n_pend_unzip++;
+
+ access_time = block->page.is_accessed();
+
+ if (!access_time && !recv_no_ibuf_operations
+ && ibuf_page_exists(block->page.id(), zip_size)) {
+ block->page.ibuf_exist = true;
+ }
+
+ buf_page_free_descriptor(bpage);
+
+ /* Decompress the page while not holding
+ buf_pool.mutex. */
+
+ if (!buf_zip_decompress(block, false)) {
+ rw_lock_x_unlock(&fix_block->lock);
+ fix_block->page.io_unfix();
+ fix_block->unfix();
+ --buf_pool.n_pend_unzip;
+
+ if (err) {
+ *err = DB_PAGE_CORRUPTED;
+ }
+ return NULL;
+ }
+
+ rw_lock_x_unlock(&block->lock);
+ fix_block->page.io_unfix();
+ --buf_pool.n_pend_unzip;
+ break;
+ }
+
+ ut_ad(block == fix_block);
+ ut_ad(fix_block->page.buf_fix_count());
+
+ ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+re_evict:
+ if (mode != BUF_GET_IF_IN_POOL
+ && mode != BUF_GET_IF_IN_POOL_OR_WATCH) {
+ } else if (!ibuf_debug) {
+ } else if (fil_space_t* space = fil_space_t::get(page_id.space())) {
+ /* Try to evict the block from the buffer pool, to use the
+ insert buffer (change buffer) as much as possible. */
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ fix_block->unfix();
+
+ /* Blocks cannot be relocated or enter or exit the
+ buf_pool while we are holding the buf_pool.mutex. */
+ const bool evicted = buf_LRU_free_page(&fix_block->page, true);
+ space->release();
+
+ if (evicted) {
+ hash_lock = buf_pool.page_hash.lock_get(fold);
+ hash_lock->write_lock();
+ mysql_mutex_unlock(&buf_pool.mutex);
+ /* We may set the watch, as it would have
+ been set if the page were not in the
+ buffer pool in the first place. */
+ block= reinterpret_cast<buf_block_t*>(
+ mode == BUF_GET_IF_IN_POOL_OR_WATCH
+ ? buf_pool.watch_set(page_id, &hash_lock)
+ : buf_pool.page_hash_get_low(page_id, fold));
+ hash_lock->write_unlock();
+
+ if (block != NULL) {
+ /* Either the page has been read in or
+ a watch was set on that in the window
+ where we released the buf_pool.mutex
+ and before we acquire the hash_lock
+ above. Try again. */
+ guess = block;
+
+ goto loop;
+ }
+
+ return(NULL);
+ }
+
+ fix_block->fix();
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_flush_list();
+ buf_flush_wait_batch_end_acquiring_mutex(false);
+ while (buf_flush_list_space(space));
+ os_aio_wait_until_no_pending_writes();
+
+ if (fix_block->page.buf_fix_count() == 1
+ && !fix_block->page.oldest_modification()) {
+ goto re_evict;
+ }
+
+ /* Failed to evict the page; change it directly */
+ }
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+ ut_ad(fix_block->page.buf_fix_count());
+
+#ifdef UNIV_DEBUG
+	/* We have already buffer-fixed the page, and we are committed to
+	returning this page to the caller. Register for debugging.
+	Avoid debug latching if the page or block belongs to the system
+	temporary tablespace (hardly needed for tables with
+	single-threaded access). */
+ if (!fsp_is_system_temporary(page_id.space())) {
+ ibool ret;
+ ret = rw_lock_s_lock_nowait(
+ fix_block->debug_latch, file, line);
+ ut_a(ret);
+ }
+#endif /* UNIV_DEBUG */
+
+	/* While a tablespace is being reinitialized, its indexes have
+	already been freed, but blocks belonging to it may still reside
+	in the buffer pool. Trying to remove such blocks from the buffer
+	pool would invoke removal of the AHI entries associated with
+	them, and the logic that removes an AHI entry would try to load
+	a block that is already in the freed state. Handle this case
+	with mode = BUF_PEEK_IF_IN_POOL, which is used by
+	btr_search_drop_page_hash_when_freed(). */
+ ut_ad(mode == BUF_GET_POSSIBLY_FREED
+ || mode == BUF_PEEK_IF_IN_POOL
+ || fix_block->page.status != buf_page_t::FREED);
+
+ const bool not_first_access = fix_block->page.set_accessed();
+
+ if (mode != BUF_PEEK_IF_IN_POOL) {
+ buf_page_make_young_if_needed(&fix_block->page);
+ }
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
+
+ /* We have to wait here because the IO_READ state was set
+ under the protection of the hash_lock and not block->lock. */
+ buf_wait_for_read(fix_block);
+
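+	/* If the page identifier changed while we were waiting, the read
+	must have failed, and buf_pool_t::corrupted_evict() invalidated
+	the identifier (see set_corrupt_id() there); report corruption. */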
+ if (fix_block->page.id() != page_id) {
+ fix_block->unfix();
+
+#ifdef UNIV_DEBUG
+ if (!fsp_is_system_temporary(page_id.space())) {
+ rw_lock_s_unlock(fix_block->debug_latch);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (err) {
+ *err = DB_PAGE_CORRUPTED;
+ }
+
+ return NULL;
+ }
+
+ if (fix_block->page.status != buf_page_t::FREED
+ && allow_ibuf_merge
+ && fil_page_get_type(fix_block->frame) == FIL_PAGE_INDEX
+ && page_is_leaf(fix_block->frame)) {
+ rw_lock_x_lock_inline(&fix_block->lock, 0, file, line);
+
+ if (fix_block->page.ibuf_exist) {
+ fix_block->page.ibuf_exist = false;
+ ibuf_merge_or_delete_for_page(fix_block, page_id,
+ zip_size);
+ }
+
+ if (rw_latch == RW_X_LATCH) {
+ mtr->memo_push(fix_block, MTR_MEMO_PAGE_X_FIX);
+ } else {
+ rw_lock_x_unlock(&fix_block->lock);
+ goto get_latch;
+ }
+ } else {
+get_latch:
+ fix_block = buf_page_mtr_lock(fix_block, rw_latch, mtr,
+ file, line);
+ }
+
+ if (!not_first_access && mode != BUF_PEEK_IF_IN_POOL) {
+ /* In the case of a first access, try to apply linear
+ read-ahead */
+
+ buf_read_ahead_linear(page_id, zip_size, ibuf_inside(mtr));
+ }
+
+ return(fix_block);
+}
+
+/** Get access to a database page. Buffered redo log may be applied.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	rw_latch	RW_S_LATCH, RW_SX_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in] guess guessed block or NULL
+@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in] file file name
+@param[in] line line where called
+@param[in] mtr mini-transaction
+@param[out] err DB_SUCCESS or error code
+@param[in] allow_ibuf_merge Allow change buffer merge while
+reading the pages from file.
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_gen(
+ const page_id_t page_id,
+ ulint zip_size,
+ ulint rw_latch,
+ buf_block_t* guess,
+ ulint mode,
+ const char* file,
+ unsigned line,
+ mtr_t* mtr,
+ dberr_t* err,
+ bool allow_ibuf_merge)
+{
+ if (buf_block_t *block= recv_sys.recover(page_id))
+ {
+ block->fix();
+ ut_ad(rw_lock_s_lock_nowait(block->debug_latch, file, line));
+ if (err)
+ *err= DB_SUCCESS;
+ const bool must_merge= allow_ibuf_merge &&
+ ibuf_page_exists(page_id, block->zip_size());
+ if (block->page.status == buf_page_t::FREED)
+ ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL);
+ else if (must_merge && fil_page_get_type(block->frame) == FIL_PAGE_INDEX &&
+ page_is_leaf(block->frame))
+ {
+ rw_lock_x_lock_inline(&block->lock, 0, file, line);
+ block->page.ibuf_exist= false;
+ ibuf_merge_or_delete_for_page(block, page_id, block->zip_size());
+
+ if (rw_latch == RW_X_LATCH)
+ {
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+ return block;
+ }
+ rw_lock_x_unlock(&block->lock);
+ }
+ block= buf_page_mtr_lock(block, rw_latch, mtr, file, line);
+ return block;
+ }
+
+ return buf_page_get_low(page_id, zip_size, rw_latch,
+ guess, mode, file, line, mtr, err, allow_ibuf_merge);
+}
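+
+/* An illustrative usage sketch (an assumption for exposition, not code
+taken from any actual caller): a thread with an active mini-transaction
+would typically latch a page roughly as follows, where page_id, zip_size
+and mtr are assumed to be set up by the caller:
+
+	dberr_t err;
+	if (buf_block_t* block = buf_page_get_gen(
+		    page_id, zip_size, RW_S_LATCH, nullptr, BUF_GET,
+		    __FILE__, __LINE__, &mtr, &err, false)) {
+		// Read from block->frame; the S-latch and the buffer fix
+		// are released when the mini-transaction commits.
+	}
+*/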
+
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
+@return TRUE if success */
+ibool
+buf_page_optimistic_get(
+/*====================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: guessed buffer block */
+ ib_uint64_t modify_clock,/*!< in: modify clock value */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ ibool success;
+
+ ut_ad(block);
+ ut_ad(mtr);
+ ut_ad(mtr->is_active());
+ ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
+
+ if (UNIV_UNLIKELY(block->page.state() != BUF_BLOCK_FILE_PAGE
+ || block->page.io_fix() != BUF_IO_NONE)) {
+ return FALSE;
+ }
+
+ const page_id_t id(block->page.id());
+
+ page_hash_latch *hash_lock = buf_pool.hash_lock_get(id);
+ hash_lock->read_lock();
+
+ if (UNIV_UNLIKELY(id != block->page.id()
+ || block->page.state() != BUF_BLOCK_FILE_PAGE
+ || block->page.io_fix() != BUF_IO_NONE)) {
+ hash_lock->read_unlock();
+ return(FALSE);
+ }
+
+ buf_block_buf_fix_inc(block, file, line);
+ hash_lock->read_unlock();
+
+ block->page.set_accessed();
+
+ buf_page_make_young_if_needed(&block->page);
+
+ ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), NULL));
+
+ mtr_memo_type_t fix_type;
+
+ if (rw_latch == RW_S_LATCH) {
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ success = rw_lock_s_lock_nowait(&block->lock, file, line);
+ } else {
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ success = rw_lock_x_lock_func_nowait_inline(
+ &block->lock, file, line);
+ }
+
+ ut_ad(id == block->page.id());
+
+ if (!success) {
+ buf_block_buf_fix_dec(block);
+ return(FALSE);
+ }
+
+ if (modify_clock != block->modify_clock) {
+
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ if (rw_latch == RW_S_LATCH) {
+ rw_lock_s_unlock(&block->lock);
+ } else {
+ rw_lock_x_unlock(&block->lock);
+ }
+
+ buf_block_buf_fix_dec(block);
+ return(FALSE);
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ ut_ad(block->page.buf_fix_count());
+ ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+
+ buf_pool.stat.n_page_gets++;
+
+ return(TRUE);
+}
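+
+/* An illustrative sketch of the optimistic access pattern (assumed for
+exposition, not taken from any actual caller): a cursor saves the modify
+clock while holding a page latch, releases the latch, and revalidates
+later:
+
+	ib_uint64_t saved_clock = block->modify_clock; // while latched
+	// ... release the latch, do other work ...
+	if (buf_page_optimistic_get(RW_S_LATCH, block, saved_clock,
+				    __FILE__, __LINE__, &mtr)) {
+		// Saved record pointers into block->frame remain valid,
+		// because the modify clock has not changed.
+	}
+*/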
+
+/** Given a tablespace id and page number, try to get that page. If the
+page is not in the buffer pool, it is not loaded and NULL is returned.
+Suitable for use while holding lock_sys_t::mutex.
+@param[in] page_id page id
+@param[in] file file name
+@param[in] line line where called
+@param[in] mtr mini-transaction
+@return pointer to a page or NULL */
+buf_block_t*
+buf_page_try_get_func(
+ const page_id_t page_id,
+ const char* file,
+ unsigned line,
+ mtr_t* mtr)
+{
+ ut_ad(mtr);
+ ut_ad(mtr->is_active());
+
+ page_hash_latch *hash_lock;
+ buf_page_t *bpage= buf_pool.page_hash_get_locked<false>(page_id,
+ page_id.fold(),
+ &hash_lock);
+ if (!bpage)
+ return nullptr;
+ if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+ {
+ hash_lock->read_unlock();
+ return nullptr;
+ }
+
+ buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
+ buf_block_buf_fix_inc(block, file, line);
+ hash_lock->read_unlock();
+
+ mtr_memo_type_t fix_type= MTR_MEMO_PAGE_S_FIX;
+ if (!rw_lock_s_lock_nowait(&block->lock, file, line))
+ {
+ /* Let us try to get an X-latch. If the current thread
+ is holding an X-latch on the page, we cannot get an S-latch. */
+ fix_type= MTR_MEMO_PAGE_X_FIX;
+ if (!rw_lock_x_lock_func_nowait_inline(&block->lock, file, line))
+ {
+ buf_block_buf_fix_dec(block);
+ return nullptr;
+ }
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ ut_ad(bpage->buf_fix_count());
+ ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(bpage->id() == page_id);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ buf_pool.stat.n_page_gets++;
+ return block;
+}
+
+/** Initialize the block.
+@param page_id page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param fix initial buf_fix_count() */
+void buf_block_t::initialise(const page_id_t page_id, ulint zip_size,
+ uint32_t fix)
+{
+ ut_ad(page.state() != BUF_BLOCK_FILE_PAGE);
+ buf_block_init_low(this);
+ page.init(page_id, fix);
+ page_zip_set_size(&page.zip, zip_size);
+}
+
+/** Initialize a page in the buffer pool. The page is usually not read
+from a file, even if it cannot be found in the buffer pool. This is one
+of the functions that perform the state transition NOT_USED => FILE_PAGE
+on a block (the other is buf_page_get_gen).
+@param[in,out]	space	space object
+@param[in]	offset	page number within the tablespace
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction
+@param[in,out] free_block pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create(fil_space_t *space, uint32_t offset,
+ ulint zip_size, mtr_t *mtr, buf_block_t *free_block)
+{
+ page_id_t page_id(space->id, offset);
+ ut_ad(mtr->is_active());
+ ut_ad(page_id.space() != 0 || !zip_size);
+
+ space->free_page(offset, false);
+ free_block->initialise(page_id, zip_size, 1);
+
+ const ulint fold= page_id.fold();
+ mysql_mutex_lock(&buf_pool.mutex);
+
+loop:
+ buf_block_t *block= reinterpret_cast<buf_block_t*>
+ (buf_pool.page_hash_get_low(page_id, fold));
+
+ if (block && block->page.in_file() &&
+ !buf_pool.watch_is_sentinel(block->page))
+ {
+#ifdef BTR_CUR_HASH_ADAPT
+ const dict_index_t *drop_hash_entry= nullptr;
+#endif
+ switch (UNIV_EXPECT(block->page.state(), BUF_BLOCK_FILE_PAGE)) {
+ default:
+ ut_ad(0);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ if (!mtr->have_x_latch(*block))
+ {
+ buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+ while (!rw_lock_x_lock_nowait(&block->lock))
+ {
+ /* Wait for buf_page_write_complete() to release block->lock.
+ We must not hold buf_pool.mutex while waiting. */
+ timespec abstime;
+ set_timespec_nsec(abstime, 1000000);
+ my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
+ &abstime);
+ }
+ mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+ }
+ else
+ {
+ ut_ad(!block->page.ibuf_exist);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!block->index);
+#endif
+ }
+#ifdef BTR_CUR_HASH_ADAPT
+ drop_hash_entry= block->index;
+#endif
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+ hash_lock->write_lock();
+ if (block->page.io_fix() != BUF_IO_NONE)
+ {
+ hash_lock->write_unlock();
+ /* Wait for buf_page_write_complete() to release the I/O fix. */
+ timespec abstime;
+ set_timespec_nsec(abstime, 1000000);
+ my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
+ &abstime);
+ goto loop;
+ }
+
+ rw_lock_x_lock(&free_block->lock);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_relocate(&block->page, &free_block->page);
+ buf_flush_relocate_on_flush_list(&block->page, &free_block->page);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ free_block->page.set_state(BUF_BLOCK_FILE_PAGE);
+ buf_unzip_LRU_add_block(free_block, FALSE);
+ hash_lock->write_unlock();
+ buf_page_free_descriptor(&block->page);
+ block= free_block;
+ buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+ mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+ break;
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (drop_hash_entry)
+ btr_search_drop_page_hash_index(block);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (block->page.ibuf_exist)
+ {
+ if (!recv_recovery_is_on())
+ ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
+ block->page.ibuf_exist= false;
+ }
+
+ return block;
+ }
+
+ /* If we get here, the page was not in buf_pool: init it there */
+
+ DBUG_PRINT("ib_buf", ("create page %u:%u",
+ page_id.space(), page_id.page_no()));
+
+ block= free_block;
+
+ /* Duplicate buf_block_buf_fix_inc_func() */
+ ut_ad(block->page.buf_fix_count() == 1);
+ ut_ad(fsp_is_system_temporary(page_id.space()) ||
+ rw_lock_s_lock_nowait(block->debug_latch, __FILE__, __LINE__));
+
+ /* The block must be put to the LRU list */
+ buf_LRU_add_block(&block->page, false);
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+ hash_lock->write_lock();
+ block->page.set_state(BUF_BLOCK_FILE_PAGE);
+ ut_d(block->page.in_page_hash= true);
+ HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, &block->page);
+
+ rw_lock_x_lock(&block->lock);
+ if (UNIV_UNLIKELY(zip_size))
+ {
+ /* Prevent race conditions during buf_buddy_alloc(), which may
+ release and reacquire buf_pool.mutex, by IO-fixing and X-latching
+ the block. */
+ block->page.set_io_fix(BUF_IO_READ);
+ hash_lock->write_unlock();
+
+ /* buf_pool.mutex may be released and reacquired by
+ buf_buddy_alloc(). We must defer this operation until
+ after the block descriptor has been added to
+ buf_pool.LRU and buf_pool.page_hash. */
+ block->page.zip.data= buf_buddy_alloc(zip_size);
+
+ /* To maintain the invariant block->in_unzip_LRU_list ==
+ block->page.belongs_to_unzip_LRU() we have to add this
+ block to unzip_LRU after block->page.zip.data is set. */
+ ut_ad(block->page.belongs_to_unzip_LRU());
+ buf_unzip_LRU_add_block(block, FALSE);
+
+ block->page.set_io_fix(BUF_IO_NONE);
+ }
+ else
+ hash_lock->write_unlock();
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+ block->page.set_accessed();
+ buf_pool.stat.n_pages_created++;
+
+ /* Delete possible entries for the page from the insert buffer:
+ such can exist if the page belonged to an index which was dropped */
+ if (page_id < page_id_t{SRV_SPACE_ID_UPPER_BOUND, 0} &&
+ !recv_recovery_is_on())
+ ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
+
+ static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent");
+ memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8);
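+  /* The memset above set both FIL_PAGE_PREV and FIL_PAGE_NEXT to
+  FIL_NULL (0xffffffff). */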
+ mach_write_to_2(block->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
+
+ /* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the
+ following pages:
+ (1) The first page of the InnoDB system tablespace (page 0:0)
+ (2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages
+ (3) key_version on encrypted pages (not page 0:0) */
+
+ memset(block->frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+ memset_aligned<8>(block->frame + FIL_PAGE_LSN, 0, 8);
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ return block;
+}
+
+/** Monitor the buffer page read/write activity, and increment the
+corresponding counter value in MONITOR_MODULE_BUF_PAGE.
+@param bpage buffer page whose read or write was completed
+@param io_type BUF_IO_READ or BUF_IO_WRITE */
+ATTRIBUTE_COLD __attribute__((nonnull))
+void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type)
+{
+ const byte* frame;
+ monitor_id_t counter;
+
+ ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+
+ frame = bpage->zip.data
+ ? bpage->zip.data
+ : ((buf_block_t*) bpage)->frame;
+
+ switch (fil_page_get_type(frame)) {
+ ulint level;
+ case FIL_PAGE_TYPE_INSTANT:
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ level = btr_page_get_level(frame);
+
+ /* Check if it is an index page for insert buffer */
+ if (fil_page_get_type(frame) == FIL_PAGE_INDEX
+ && btr_page_get_index_id(frame)
+ == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
+ if (level == 0) {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
+ } else {
+ counter = MONITOR_RW_COUNTER(
+ io_type,
+ MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
+ }
+ } else {
+ if (level == 0) {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_LEAF_PAGE);
+ } else {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_NON_LEAF_PAGE);
+ }
+ }
+ break;
+
+ case FIL_PAGE_UNDO_LOG:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
+ break;
+
+ case FIL_PAGE_INODE:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
+ break;
+
+ case FIL_PAGE_IBUF_FREE_LIST:
+ counter = MONITOR_RW_COUNTER(io_type,
+ MONITOR_IBUF_FREELIST_PAGE);
+ break;
+
+ case FIL_PAGE_IBUF_BITMAP:
+ counter = MONITOR_RW_COUNTER(io_type,
+ MONITOR_IBUF_BITMAP_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_SYS:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_TRX_SYS:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_FSP_HDR:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_XDES:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_BLOB:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_ZBLOB:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_ZBLOB2:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
+ break;
+
+ default:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
+ }
+
+ MONITOR_INC_NOCHECK(counter);
+}
+
+/** Mark a table corrupted.
+@param[in] bpage corrupted page
+@param[in] space tablespace of the corrupted page */
+ATTRIBUTE_COLD
+static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space)
+{
+	/* If the block is not encrypted, find the table with the
+	specified space id and mark it corrupted. Encrypted tables
+	are marked unusable later, e.g. in ::open(). */
+ if (!space.crypt_data
+ || space.crypt_data->type == CRYPT_SCHEME_UNENCRYPTED) {
+ dict_set_corrupted_by_space(&space);
+ } else {
+ dict_set_encrypted_by_space(&space);
+ }
+}
+
+/** Release and evict a corrupted page.
+@param bpage page that was being read */
+ATTRIBUTE_COLD void buf_pool_t::corrupted_evict(buf_page_t *bpage)
+{
+ const page_id_t id(bpage->id());
+ page_hash_latch *hash_lock= hash_lock_get(id);
+
+ mysql_mutex_lock(&mutex);
+ hash_lock->write_lock();
+
+ ut_ad(bpage->io_fix() == BUF_IO_READ);
+ ut_ad(!bpage->oldest_modification());
+ bpage->set_corrupt_id();
+
+ if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+ rw_lock_x_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock,
+ BUF_IO_READ);
+
+ bpage->io_unfix();
+
+ /* remove from LRU and page_hash */
+ buf_LRU_free_one_page(bpage, id, hash_lock);
+ mysql_mutex_unlock(&mutex);
+
+ ut_d(auto n=) n_pend_reads--;
+ ut_ad(n > 0);
+}
+
+/** Release and evict a corrupted page from the buffer pool, and
+unless innodb_force_recovery is set, mark the table corrupted.
+@param[in]	bpage	corrupted page
+@param[in]	node	data file */
+ATTRIBUTE_COLD
+static void buf_corrupt_page_release(buf_page_t *bpage, const fil_node_t &node)
+{
+ ut_ad(bpage->id().space() == node.space->id);
+ buf_pool.corrupted_evict(bpage);
+
+ if (!srv_force_recovery)
+ buf_mark_space_corrupt(bpage, *node.space);
+}
+
+/** Check if the encrypted page is corrupted for the full crc32 format.
+@param[in] space_id page belongs to space id
+@param[in] d page
+@param[in] is_compressed compressed page
+@return true if page is corrupted or false if it isn't */
+static bool buf_page_full_crc32_is_corrupted(ulint space_id, const byte* d,
+ bool is_compressed)
+{
+ if (space_id != mach_read_from_4(d + FIL_PAGE_SPACE_ID))
+ return true;
+
+ static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+
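+  /* An uncompressed full_crc32 page also stores the least significant
+  32 bits of FIL_PAGE_LSN at the end of the page
+  (FIL_PAGE_FCRC32_END_LSN); a mismatch there indicates a torn write
+  or corruption. */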
+ return !is_compressed &&
+ memcmp_aligned<4>(FIL_PAGE_LSN + 4 + d,
+ d + srv_page_size - FIL_PAGE_FCRC32_END_LSN, 4);
+}
+
+/** Check whether a page is possibly compressed, encrypted, or both,
+when we encounter an apparently corrupted page. Note that we cannot be
+100% sure whether the page is really corrupted, or whether only the
+decryption or decompression failed.
+@param[in,out]	bpage	page
+@param[in]	node	data file
+@return whether the operation succeeded
+@retval DB_SUCCESS if the page has been read and is not corrupted
+@retval DB_PAGE_CORRUPTED if the page is corrupted based on its checksum
+@retval DB_DECRYPTION_FAILED if the post-encryption checksum matches but
+the normal page checksum does not match after decryption
+@retval DB_TABLESPACE_DELETED if the accessed tablespace is not found */
+static dberr_t buf_page_check_corrupt(buf_page_t *bpage,
+ const fil_node_t &node)
+{
+ ut_ad(node.space->referenced());
+
+ byte* dst_frame = (bpage->zip.data) ? bpage->zip.data :
+ ((buf_block_t*) bpage)->frame;
+ dberr_t err = DB_SUCCESS;
+ uint key_version = buf_page_get_key_version(dst_frame,
+ node.space->flags);
+
+	/* buf_decrypt_after_read() has decrypted the page if the
+	post-encryption checksum matched and the used key_id was found
+	by the encryption plugin. If the checksum did not match, the
+	page was not decrypted; it could be encrypted and corrupted,
+	simply corrupted, or a good page. Even if the page was
+	decrypted, it could still be corrupted if the wrong key was
+	used. */
+ const bool seems_encrypted = !node.space->full_crc32() && key_version
+ && node.space->crypt_data
+ && node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;
+ ut_ad(node.space->purpose != FIL_TYPE_TEMPORARY ||
+ node.space->full_crc32());
+
+	/* If the traditional checksums match, we assume that the page
+	is no longer encrypted. */
+ if (node.space->full_crc32()
+ && !buf_is_zeroes(span<const byte>(dst_frame,
+ node.space->physical_size()))
+ && (key_version || node.space->is_compressed()
+ || node.space->purpose == FIL_TYPE_TEMPORARY)) {
+ if (buf_page_full_crc32_is_corrupted(
+ bpage->id().space(), dst_frame,
+ node.space->is_compressed())) {
+ err = DB_PAGE_CORRUPTED;
+ }
+ } else if (buf_page_is_corrupted(true, dst_frame, node.space->flags)) {
+ err = DB_PAGE_CORRUPTED;
+ }
+
+ if (seems_encrypted && err == DB_PAGE_CORRUPTED
+ && bpage->id().page_no() != 0) {
+ err = DB_DECRYPTION_FAILED;
+
+ ib::error()
+ << "The page " << bpage->id()
+ << " in file '" << node.name
+ << "' cannot be decrypted.";
+
+		ib::info()
+			<< "However, the key management plugin or the used"
+			" key_version " << key_version
+			<< " is not found, or the used encryption algorithm"
+			" or method does not match.";
+
+ if (bpage->id().space() != TRX_SYS_SPACE) {
+ ib::info()
+ << "Marking tablespace as missing."
+ " You may drop this table or"
+ " install correct key management plugin"
+ " and key file.";
+ }
+ }
+
+ return (err);
+}
+
+/** Complete a read request of a file page to buf_pool.
+@param bpage recently read page
+@param node data file
+@return whether the operation succeeded
+@retval DB_SUCCESS if the page was read and is not corrupted
+@retval DB_PAGE_CORRUPTED if the checksum fails on a page read
+@retval DB_DECRYPTION_FAILED if the page cannot be decrypted */
+dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node)
+{
+ const page_id_t id(bpage->id());
+ ut_ad(bpage->in_file());
+ ut_ad(!buf_dblwr.is_inside(id));
+ ut_ad(id.space() == node.space->id);
+ ut_ad(bpage->zip_size() == node.space->zip_size());
+
+  /* We do not need to protect io_fix here with a mutex in order to
+  read it, because this and buf_page_write_complete() are the only
+  functions where the value can change from BUF_IO_READ or BUF_IO_WRITE
+  to some other value, and our code ensures that this is the only
+  thread that handles the i/o for this block. */
+
+ ut_ad(bpage->io_fix() == BUF_IO_READ);
+ ut_ad(!!bpage->zip.ssize == !!bpage->zip.data);
+ ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE || bpage->zip.data);
+
+ const byte *frame= bpage->zip.data
+ ? bpage->zip.data
+ : reinterpret_cast<buf_block_t*>(bpage)->frame;
+ ut_ad(frame);
+
+ dberr_t err;
+ if (!buf_page_decrypt_after_read(bpage, node))
+ {
+ err= DB_DECRYPTION_FAILED;
+ goto database_corrupted;
+ }
+
+ if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE)
+ {
+ buf_pool.n_pend_unzip++;
+ auto ok= buf_zip_decompress(reinterpret_cast<buf_block_t*>(bpage), FALSE);
+ buf_pool.n_pend_unzip--;
+
+ if (!ok)
+ {
+ ib::info() << "Page " << id << " zip_decompress failure.";
+ err= DB_PAGE_CORRUPTED;
+ goto database_corrupted;
+ }
+ }
+
+ {
+ const page_id_t read_id(mach_read_from_4(frame + FIL_PAGE_SPACE_ID),
+ mach_read_from_4(frame + FIL_PAGE_OFFSET));
+
+ if (read_id == id);
+ else if (read_id == page_id_t(0, 0))
+ /* This is likely an uninitialized page. */;
+ else if (!node.space->full_crc32() &&
+ page_id_t(0, read_id.page_no()) == id)
+ /* FIL_PAGE_SPACE_ID was written as garbage in the system tablespace
+ before MySQL 4.1.1, which introduced innodb_file_per_table. */;
+ else if (node.space->full_crc32() &&
+ *reinterpret_cast<const uint32_t*>
+ (&frame[FIL_PAGE_FCRC32_KEY_VERSION]) &&
+ node.space->crypt_data &&
+ node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)
+ {
+ ib::error() << "Cannot decrypt " << id;
+ err= DB_DECRYPTION_FAILED;
+ goto release_page;
+ }
+ else
+ ib::error() << "Space id and page no stored in the page, read in are "
+ << read_id << ", should be " << id;
+ }
+
+ err= buf_page_check_corrupt(bpage, node);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ {
+database_corrupted:
+ /* Not a real corruption if it was triggered by error injection */
+ DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
+ if (!is_predefined_tablespace(id.space()))
+ {
+ buf_corrupt_page_release(bpage, node);
+ ib::info() << "Simulated IMPORT corruption";
+ return err;
+ }
+ err= DB_SUCCESS;
+ goto page_not_corrupt;);
+
+ if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE)
+ memset(reinterpret_cast<buf_block_t*>(bpage)->frame, 0, srv_page_size);
+
+ if (err == DB_PAGE_CORRUPTED)
+ {
+ ib::error() << "Database page corruption on disk"
+ " or a failed read of file '"
+ << node.name << "' page " << id
+ << ". You may have to recover from a backup.";
+
+ buf_page_print(frame, bpage->zip_size());
+
+ ib::info() << " You can use CHECK TABLE to scan"
+ " your table for corruption. "
+ << FORCE_RECOVERY_MSG;
+ }
+
+ if (!srv_force_recovery)
+ {
+ /* If the corruption is in the system tablespace, we will
+ intentionally crash the server. */
+ if (id.space() == TRX_SYS_SPACE)
+ ib::fatal() << "Aborting because of a corrupt database page.";
+ buf_corrupt_page_release(bpage, node);
+ return err;
+ }
+ }
+
+ DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
+ page_not_corrupt: bpage= bpage; );
+
+ if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED)
+ {
+release_page:
+ buf_corrupt_page_release(bpage, node);
+ if (recv_recovery_is_on())
+ recv_sys.free_corrupted_page(id);
+ return err;
+ }
+
+ if (recv_recovery_is_on())
+ recv_recover_page(node.space, bpage);
+
+ if (bpage->state() == BUF_BLOCK_FILE_PAGE && !recv_no_ibuf_operations &&
+ (!id.space() || !is_predefined_tablespace(id.space())) &&
+ fil_page_get_type(frame) == FIL_PAGE_INDEX &&
+ page_is_leaf(frame))
+ bpage->ibuf_exist= true;
+
+ if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
+ buf_page_monitor(bpage, BUF_IO_READ);
+ DBUG_PRINT("ib_buf", ("read page %u:%u",
+ id.space(), id.page_no()));
+
+  /* Because the thread that does the unlocking might not be the same one
+  that did the locking, we use a pass value != 0 in unlock, which simply
+  removes the newest lock debug record, without checking the thread id. */
+ if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+ rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_READ);
+ bpage->io_unfix();
+
+ ut_d(auto n=) buf_pool.n_pend_reads--;
+ ut_ad(n > 0);
+ buf_pool.stat.n_pages_read++;
+
+ return DB_SUCCESS;
+}
+
+#ifdef UNIV_DEBUG
+/** Assert that all blocks in the buffer pool are in a replaceable state,
+i.e. not buffer-fixed, I/O-fixed or dirty; abort otherwise. */
+void buf_pool_t::assert_all_freed()
+{
+ mysql_mutex_lock(&mutex);
+ const chunk_t *chunk= chunks;
+ for (auto i= n_chunks; i--; chunk++)
+ if (const buf_block_t* block= chunk->not_freed())
+ ib::fatal() << "Page " << block->page.id() << " still fixed or dirty";
+ mysql_mutex_unlock(&mutex);
+}
+#endif /* UNIV_DEBUG */
+
+/** Refresh the statistics used to print per-second averages. */
+void buf_refresh_io_stats()
+{
+ buf_pool.last_printout_time = time(NULL);
+ buf_pool.old_stat = buf_pool.stat;
+}
+
+/** Invalidate all pages in the buffer pool.
+All pages must be in a replaceable state (not modified or latched). */
+void buf_pool_invalidate()
+{
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ buf_flush_wait_batch_end(true);
+ buf_flush_wait_batch_end(false);
+
+ /* It is possible that a write batch that has been posted
+ earlier is still not complete. For buffer pool invalidation to
+ proceed we must ensure there is NO write activity happening. */
+
+ ut_d(mysql_mutex_unlock(&buf_pool.mutex));
+ ut_d(buf_pool.assert_all_freed());
+ ut_d(mysql_mutex_lock(&buf_pool.mutex));
+
+ while (buf_LRU_scan_and_free_block());
+
+ ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0);
+ ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0);
+
+ buf_pool.freed_page_clock = 0;
+ buf_pool.LRU_old = NULL;
+ buf_pool.LRU_old_len = 0;
+
+ memset(&buf_pool.stat, 0x00, sizeof(buf_pool.stat));
+ buf_refresh_io_stats();
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+#ifdef UNIV_DEBUG
+/** Validate the buffer pool. */
+void buf_pool_t::validate()
+{
+ ulint n_lru = 0;
+ ulint n_flushing = 0;
+ ulint n_free = 0;
+ ulint n_zip = 0;
+
+ mysql_mutex_lock(&mutex);
+
+ chunk_t* chunk = chunks;
+
+ /* Check the uncompressed blocks. */
+
+ for (auto i = n_chunks; i--; chunk++) {
+
+ ulint j;
+ buf_block_t* block = chunk->blocks;
+
+ for (j = chunk->size; j--; block++) {
+ switch (block->page.state()) {
+ case BUF_BLOCK_ZIP_PAGE:
+			/* Block descriptors of this kind should
+			be allocated by malloc() only. */
+ ut_error;
+ break;
+
+ case BUF_BLOCK_NOT_USED:
+ n_free++;
+ break;
+
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ /* do nothing */
+ break;
+
+ case BUF_BLOCK_FILE_PAGE:
+ const page_id_t id = block->page.id();
+ ut_ad(page_hash_get_low(id, id.fold())
+ == &block->page);
+ n_lru++;
+ break;
+
+ }
+ }
+ }
+
+ /* Check dirty blocks. */
+
+ mysql_mutex_lock(&flush_list_mutex);
+ for (buf_page_t* b = UT_LIST_GET_FIRST(flush_list); b;
+ b = UT_LIST_GET_NEXT(list, b)) {
+ ut_ad(b->oldest_modification());
+ ut_ad(!fsp_is_system_temporary(b->id().space()));
+ n_flushing++;
+
+ switch (b->state()) {
+ case BUF_BLOCK_ZIP_PAGE:
+ n_lru++;
+ n_zip++;
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ /* uncompressed page */
+ break;
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+ const page_id_t id = b->id();
+ ut_ad(page_hash_get_low(id, id.fold()) == b);
+ }
+
+ ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing);
+
+ mysql_mutex_unlock(&flush_list_mutex);
+
+ if (curr_size == old_size
+ && n_lru + n_free > curr_size + n_zip) {
+
+ ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free
+ << ", pool " << curr_size
+ << " zip " << n_zip << ". Aborting...";
+ }
+
+ ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru);
+
+ if (curr_size == old_size
+ && UT_LIST_GET_LEN(free) != n_free) {
+
+ ib::fatal() << "Free list len "
+ << UT_LIST_GET_LEN(free)
+ << ", free blocks " << n_free << ". Aborting...";
+ }
+
+ mysql_mutex_unlock(&mutex);
+
+ ut_d(buf_LRU_validate());
+ ut_d(buf_flush_validate());
+}
+#endif /* UNIV_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Write information of the buf_pool to the error log. */
+void buf_pool_t::print()
+{
+ index_id_t* index_ids;
+ ulint* counts;
+ ulint size;
+ ulint i;
+ ulint j;
+ index_id_t id;
+ ulint n_found;
+ chunk_t* chunk;
+ dict_index_t* index;
+
+ size = curr_size;
+
+ index_ids = static_cast<index_id_t*>(
+ ut_malloc_nokey(size * sizeof *index_ids));
+
+ counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size));
+
+ mysql_mutex_lock(&mutex);
+ mysql_mutex_lock(&flush_list_mutex);
+
+ ib::info()
+ << "[buffer pool: size=" << curr_size
+ << ", database pages=" << UT_LIST_GET_LEN(LRU)
+ << ", free pages=" << UT_LIST_GET_LEN(free)
+ << ", modified database pages="
+ << UT_LIST_GET_LEN(flush_list)
+ << ", n pending decompressions=" << n_pend_unzip
+ << ", n pending reads=" << n_pend_reads
+ << ", n pending flush LRU=" << n_flush_LRU_
+ << " list=" << n_flush_list_
+ << ", pages made young=" << stat.n_pages_made_young
+ << ", not young=" << stat.n_pages_not_made_young
+ << ", pages read=" << stat.n_pages_read
+ << ", created=" << stat.n_pages_created
+ << ", written=" << stat.n_pages_written << "]";
+
+ mysql_mutex_unlock(&flush_list_mutex);
+
+ /* Count the number of blocks belonging to each index in the buffer */
+
+ n_found = 0;
+
+ chunk = chunks;
+
+ for (i = n_chunks; i--; chunk++) {
+ buf_block_t* block = chunk->blocks;
+ ulint n_blocks = chunk->size;
+
+ for (; n_blocks--; block++) {
+ const buf_frame_t* frame = block->frame;
+
+ if (fil_page_index_page_check(frame)) {
+
+ id = btr_page_get_index_id(frame);
+
+ /* Look for the id in the index_ids array */
+ j = 0;
+
+ while (j < n_found) {
+
+ if (index_ids[j] == id) {
+ counts[j]++;
+
+ break;
+ }
+ j++;
+ }
+
+ if (j == n_found) {
+ n_found++;
+ index_ids[j] = id;
+ counts[j] = 1;
+ }
+ }
+ }
+ }
+
+ mysql_mutex_unlock(&mutex);
+
+ for (i = 0; i < n_found; i++) {
+ index = dict_index_get_if_in_cache(index_ids[i]);
+
+ if (!index) {
+ ib::info() << "Block count for index "
+ << index_ids[i] << " in buffer is about "
+ << counts[i];
+ } else {
+ ib::info() << "Block count for index " << index_ids[i]
+ << " in buffer is about " << counts[i]
+ << ", index " << index->name
+ << " of table " << index->table->name;
+ }
+ }
+
+ ut_free(index_ids);
+ ut_free(counts);
+
+ validate();
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+/** @return the number of latched pages in the buffer pool */
+ulint buf_get_latched_pages_number()
+{
+ ulint fixed_pages_number= 0;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ for (buf_page_t *b= UT_LIST_GET_FIRST(buf_pool.LRU); b;
+ b= UT_LIST_GET_NEXT(LRU, b))
+ if (b->in_file() && (b->buf_fix_count() || b->io_fix() != BUF_IO_NONE))
+ fixed_pages_number++;
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ return fixed_pages_number;
+}
+#endif /* UNIV_DEBUG */
+
+/** Collect buffer pool metadata.
+@param[out] pool_info buffer pool metadata */
+void buf_stats_get_pool_info(buf_pool_info_t *pool_info)
+{
+ time_t current_time;
+ double time_elapsed;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ pool_info->pool_size = buf_pool.curr_size;
+
+ pool_info->lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+ pool_info->old_lru_len = buf_pool.LRU_old_len;
+
+ pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool.free);
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list);
+
+ pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ pool_info->n_pend_reads = buf_pool.n_pend_reads;
+
+ pool_info->n_pending_flush_lru = buf_pool.n_flush_LRU_;
+
+ pool_info->n_pending_flush_list = buf_pool.n_flush_list_;
+
+ current_time = time(NULL);
+ time_elapsed = 0.001 + difftime(current_time,
+ buf_pool.last_printout_time);
+
+ pool_info->n_pages_made_young = buf_pool.stat.n_pages_made_young;
+
+ pool_info->n_pages_not_made_young =
+ buf_pool.stat.n_pages_not_made_young;
+
+ pool_info->n_pages_read = buf_pool.stat.n_pages_read;
+
+ pool_info->n_pages_created = buf_pool.stat.n_pages_created;
+
+ pool_info->n_pages_written = buf_pool.stat.n_pages_written;
+
+ pool_info->n_page_gets = buf_pool.stat.n_page_gets;
+
+ pool_info->n_ra_pages_read_rnd = buf_pool.stat.n_ra_pages_read_rnd;
+ pool_info->n_ra_pages_read = buf_pool.stat.n_ra_pages_read;
+
+ pool_info->n_ra_pages_evicted = buf_pool.stat.n_ra_pages_evicted;
+
+ pool_info->page_made_young_rate =
+ static_cast<double>(buf_pool.stat.n_pages_made_young
+ - buf_pool.old_stat.n_pages_made_young)
+ / time_elapsed;
+
+ pool_info->page_not_made_young_rate =
+ static_cast<double>(buf_pool.stat.n_pages_not_made_young
+ - buf_pool.old_stat.n_pages_not_made_young)
+ / time_elapsed;
+
+ pool_info->pages_read_rate =
+ static_cast<double>(buf_pool.stat.n_pages_read
+ - buf_pool.old_stat.n_pages_read)
+ / time_elapsed;
+
+ pool_info->pages_created_rate =
+ static_cast<double>(buf_pool.stat.n_pages_created
+ - buf_pool.old_stat.n_pages_created)
+ / time_elapsed;
+
+ pool_info->pages_written_rate =
+ static_cast<double>(buf_pool.stat.n_pages_written
+ - buf_pool.old_stat.n_pages_written)
+ / time_elapsed;
+
+ pool_info->n_page_get_delta = buf_pool.stat.n_page_gets
+ - buf_pool.old_stat.n_page_gets;
+
+ if (pool_info->n_page_get_delta) {
+ pool_info->page_read_delta = buf_pool.stat.n_pages_read
+ - buf_pool.old_stat.n_pages_read;
+
+ pool_info->young_making_delta =
+ buf_pool.stat.n_pages_made_young
+ - buf_pool.old_stat.n_pages_made_young;
+
+ pool_info->not_young_making_delta =
+ buf_pool.stat.n_pages_not_made_young
+ - buf_pool.old_stat.n_pages_not_made_young;
+ }
+ pool_info->pages_readahead_rnd_rate =
+ static_cast<double>(buf_pool.stat.n_ra_pages_read_rnd
+ - buf_pool.old_stat.n_ra_pages_read_rnd)
+ / time_elapsed;
+
+
+ pool_info->pages_readahead_rate =
+ static_cast<double>(buf_pool.stat.n_ra_pages_read
+ - buf_pool.old_stat.n_ra_pages_read)
+ / time_elapsed;
+
+ pool_info->pages_evicted_rate =
+ static_cast<double>(buf_pool.stat.n_ra_pages_evicted
+ - buf_pool.old_stat.n_ra_pages_evicted)
+ / time_elapsed;
+
+ pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+
+ pool_info->io_sum = buf_LRU_stat_sum.io;
+
+ pool_info->io_cur = buf_LRU_stat_cur.io;
+
+ pool_info->unzip_sum = buf_LRU_stat_sum.unzip;
+
+ pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
+
+ buf_refresh_io_stats();
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+static
+void
+buf_print_io_instance(
+/*==================*/
+ buf_pool_info_t*pool_info, /*!< in: buffer pool info */
+ FILE* file) /*!< in/out: buffer where to print */
+{
+ ut_ad(pool_info);
+
+ fprintf(file,
+ "Buffer pool size " ULINTPF "\n"
+ "Free buffers " ULINTPF "\n"
+ "Database pages " ULINTPF "\n"
+ "Old database pages " ULINTPF "\n"
+ "Modified db pages " ULINTPF "\n"
+ "Percent of dirty pages(LRU & free pages): %.3f\n"
+ "Max dirty pages percent: %.3f\n"
+ "Pending reads " ULINTPF "\n"
+ "Pending writes: LRU " ULINTPF ", flush list " ULINTPF "\n",
+ pool_info->pool_size,
+ pool_info->free_list_len,
+ pool_info->lru_len,
+ pool_info->old_lru_len,
+ pool_info->flush_list_len,
+ static_cast<double>(pool_info->flush_list_len)
+ / (static_cast<double>(pool_info->lru_len
+ + pool_info->free_list_len) + 1.0)
+ * 100.0,
+ srv_max_buf_pool_modified_pct,
+ pool_info->n_pend_reads,
+ pool_info->n_pending_flush_lru,
+ pool_info->n_pending_flush_list);
+
+ fprintf(file,
+ "Pages made young " ULINTPF ", not young " ULINTPF "\n"
+ "%.2f youngs/s, %.2f non-youngs/s\n"
+ "Pages read " ULINTPF ", created " ULINTPF
+ ", written " ULINTPF "\n"
+ "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
+ pool_info->n_pages_made_young,
+ pool_info->n_pages_not_made_young,
+ pool_info->page_made_young_rate,
+ pool_info->page_not_made_young_rate,
+ pool_info->n_pages_read,
+ pool_info->n_pages_created,
+ pool_info->n_pages_written,
+ pool_info->pages_read_rate,
+ pool_info->pages_created_rate,
+ pool_info->pages_written_rate);
+
+ if (pool_info->n_page_get_delta) {
+ double hit_rate = static_cast<double>(
+ pool_info->page_read_delta)
+ / static_cast<double>(pool_info->n_page_get_delta);
+
+ if (hit_rate > 1) {
+ hit_rate = 1;
+ }
+
+ fprintf(file,
+ "Buffer pool hit rate " ULINTPF " / 1000,"
+ " young-making rate " ULINTPF " / 1000 not "
+ ULINTPF " / 1000\n",
+ ulint(1000 * (1 - hit_rate)),
+ ulint(1000
+ * double(pool_info->young_making_delta)
+ / double(pool_info->n_page_get_delta)),
+ ulint(1000 * double(pool_info->not_young_making_delta)
+ / double(pool_info->n_page_get_delta)));
+ } else {
+ fputs("No buffer pool page gets since the last printout\n",
+ file);
+ }
+
+ /* Statistics about read ahead algorithm */
+ fprintf(file, "Pages read ahead %.2f/s,"
+ " evicted without access %.2f/s,"
+ " Random read ahead %.2f/s\n",
+
+ pool_info->pages_readahead_rate,
+ pool_info->pages_evicted_rate,
+ pool_info->pages_readahead_rnd_rate);
+
+ /* Print some values to help us with visualizing what is
+ happening with LRU eviction. */
+ fprintf(file,
+ "LRU len: " ULINTPF ", unzip_LRU len: " ULINTPF "\n"
+ "I/O sum[" ULINTPF "]:cur[" ULINTPF "], "
+ "unzip sum[" ULINTPF "]:cur[" ULINTPF "]\n",
+ pool_info->lru_len, pool_info->unzip_lru_len,
+ pool_info->io_sum, pool_info->io_cur,
+ pool_info->unzip_sum, pool_info->unzip_cur);
+}
+
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+void
+buf_print_io(
+/*=========*/
+ FILE* file) /*!< in/out: buffer where to print */
+{
+ buf_pool_info_t pool_info;
+
+ buf_stats_get_pool_info(&pool_info);
+ buf_print_io_instance(&pool_info, file);
+}
+
+/** Verify that the stored post-encryption checksum matches the calculated
+checksum. This function should only be called if the tablespace contains
+crypt data metadata.
+@param[in]	page	page frame
+@param[in]	fsp_flags	tablespace flags
+@return true if the page is encrypted and OK, false otherwise */
+bool buf_page_verify_crypt_checksum(const byte* page, ulint fsp_flags)
+{
+ if (!fil_space_t::full_crc32(fsp_flags)) {
+ return fil_space_verify_crypt_checksum(
+ page, fil_space_t::zip_size(fsp_flags));
+ }
+
+ return !buf_page_is_corrupted(true, page, fsp_flags);
+}
+
+/** Print the given page_id_t object.
+@param[in,out] out the output stream
+@param[in] page_id the page_id_t object to be printed
+@return the output stream */
+std::ostream& operator<<(std::ostream &out, const page_id_t page_id)
+{
+ out << "[page id: space=" << page_id.space()
+ << ", page number=" << page_id.page_no() << "]";
+ return out;
+}
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc
new file mode 100644
index 00000000..e98dc184
--- /dev/null
+++ b/storage/innobase/buf/buf0checksum.cc
@@ -0,0 +1,129 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0checksum.cc
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#include "buf0checksum.h"
+#include "fil0fil.h"
+#include "ut0crc32.h"
+#include "ut0rnd.h"
+
+#ifndef UNIV_INNOCHECKSUM
+#include "srv0srv.h"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** the value of innodb_checksum_algorithm */
+ulong srv_checksum_algorithm;
+
+/** Calculate the CRC32 checksum of a page. The value is stored to the page
+when it is written to a file and also checked for a match when reading from
+the file. Note that we must be careful to calculate the same value on all
+architectures.
+@param[in] page buffer page (srv_page_size bytes)
+@return CRC-32C */
+uint32_t buf_calc_page_crc32(const byte* page)
+{
+ /* Note: innodb_checksum_algorithm=crc32 could and should have
+ included the entire page in the checksum, and CRC-32 values
+ should be combined with the CRC-32 function, not with
+ exclusive OR. We stick to the current algorithm in order to
+ remain compatible with old data files. */
+ return ut_crc32(page + FIL_PAGE_OFFSET,
+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ - FIL_PAGE_OFFSET)
+ ^ ut_crc32(page + FIL_PAGE_DATA,
+ srv_page_size
+ - (FIL_PAGE_DATA + FIL_PAGE_END_LSN_OLD_CHKSUM));
+}
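+
+/* Illustrative sketch, not part of the original change: the two-span
+CRC-32C combination above, written against plain integers. The offsets
+4 (FIL_PAGE_OFFSET), 26 (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION),
+38 (FIL_PAGE_DATA) and the 8-byte trailer match the InnoDB page layout;
+my_crc32c() is a hypothetical stand-in for ut_crc32(). */
+#if 0 /* example only; compiles standalone if enabled */
+#include <cstddef>
+#include <cstdint>
+
+extern uint32_t my_crc32c(const uint8_t *buf, size_t len); /* stand-in */
+
+uint32_t example_page_crc32(const uint8_t *page, size_t page_size)
+{
+  /* Span 1: bytes [4, 26) -- page number, space id, etc.; this skips
+  the 4-byte checksum field at offset 0. */
+  uint32_t c1 = my_crc32c(page + 4, 26 - 4);
+  /* Span 2: bytes [38, page_size - 8) -- the payload; this skips the
+  LSN/key-version field and the 8-byte old-checksum trailer. */
+  uint32_t c2 = my_crc32c(page + 38, page_size - (38 + 8));
+  /* As the comment above notes, the two values are combined with
+  exclusive OR only for compatibility with old data files. */
+  return c1 ^ c2;
+}
+#endif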
+
+/** Calculate a checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@param[in] page file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_new_checksum(const byte* page)
+{
+ ulint checksum;
+
+ /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool
+ to the first pages of data files, we have to skip them in the page
+ checksum calculation.
+ We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
+ checksum is stored, and also the last 8 bytes of page because
+ there we store the old formula checksum. */
+
+ checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ - FIL_PAGE_OFFSET)
+ + ut_fold_binary(page + FIL_PAGE_DATA,
+ srv_page_size - FIL_PAGE_DATA
+ - FIL_PAGE_END_LSN_OLD_CHKSUM);
+ return(static_cast<uint32_t>(checksum));
+}
+
+/** In MySQL before 4.0.14 or 4.1.1 there was an InnoDB bug that
+the checksum only looked at the first few bytes of the page.
+This calculates that old checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@param[in] page file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_old_checksum(const byte* page)
+{
+ return(static_cast<uint32_t>
+ (ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)));
+}
+
+/** Return a printable string describing the checksum algorithm.
+@param[in] algo algorithm
+@return algorithm name */
+const char*
+buf_checksum_algorithm_name(srv_checksum_algorithm_t algo)
+{
+ switch (algo) {
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ return("crc32");
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ return("strict_crc32");
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ return("innodb");
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ return("strict_innodb");
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ return("none");
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ return("strict_none");
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ return("full_crc32");
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ return("strict_full_crc32");
+ }
+
+ ut_error;
+ return(NULL);
+}
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
new file mode 100644
index 00000000..52e947b7
--- /dev/null
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -0,0 +1,764 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dblwr.cc
+Doublewrite buffer module
+
+Created 2011/12/19
+*******************************************************/
+
+#include "buf0dblwr.h"
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "sync0sync.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+#include "fil0crypt.h"
+#include "fil0pagecompress.h"
+
+using st_::span;
+
+/** The doublewrite buffer */
+buf_dblwr_t buf_dblwr;
+
+/** @return the TRX_SYS page */
+inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr)
+{
+ buf_block_t *block= buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+ 0, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+ return block;
+}
+
+/** Initialize the doublewrite buffer data structure.
+@param header doublewrite page header in the TRX_SYS page */
+inline void buf_dblwr_t::init(const byte *header)
+{
+ ut_ad(!active_slot->first_free);
+ ut_ad(!active_slot->reserved);
+ ut_ad(!batch_running);
+
+ mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr);
+ pthread_cond_init(&cond, nullptr);
+ block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1));
+ block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2));
+
+ const uint32_t buf_size= 2 * block_size();
+ for (int i= 0; i < 2; i++)
+ {
+ slots[i].write_buf= static_cast<byte*>
+ (aligned_malloc(buf_size << srv_page_size_shift, srv_page_size));
+ slots[i].buf_block_arr= static_cast<element*>
+ (ut_zalloc_nokey(buf_size * sizeof(element)));
+ }
+ active_slot= &slots[0];
+}
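+
+/* Illustrative sketch, not part of the original change: the sizing
+arithmetic used above. Each slot's write_buf covers 2 * block_size()
+pages. With a hypothetical 64-page doublewrite block and a 16 KiB page
+size (srv_page_size_shift == 14), that is 128 pages, i.e. 2 MiB. */
+#if 0 /* example only; compiles standalone if enabled */
+#include <cstdint>
+
+uint64_t example_slot_buf_bytes(uint32_t block_size_in_pages,
+                                uint32_t page_size_shift)
+{
+  uint32_t buf_size = 2 * block_size_in_pages;  /* pages per slot */
+  return uint64_t{buf_size} << page_size_shift; /* bytes per slot */
+}
+/* example_slot_buf_bytes(64, 14) == 2097152, i.e. 2 MiB */
+#endif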
+
+/** Create or restore the doublewrite buffer in the TRX_SYS page.
+@return whether the operation succeeded */
+bool buf_dblwr_t::create()
+{
+ if (is_initialised())
+ return true;
+
+ mtr_t mtr;
+ const ulint size= block_size();
+
+start_again:
+ mtr.start();
+
+ buf_block_t *trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+
+ if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+ trx_sys_block->frame) == TRX_SYS_DOUBLEWRITE_MAGIC_N)
+ {
+ /* The doublewrite buffer has already been created: just read in
+ some numbers */
+ init(TRX_SYS_DOUBLEWRITE + trx_sys_block->frame);
+ mtr.commit();
+ return true;
+ }
+
+ if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size < 3 * size)
+ {
+too_small:
+ ib::error() << "Cannot create doublewrite buffer: "
+ "the first file in innodb_data_file_path must be at least "
+ << (3 * (size >> (20U - srv_page_size_shift))) << "M.";
+ mtr.commit();
+ return false;
+ }
+ else
+ {
+ buf_block_t *b= fseg_create(fil_system.sys_space,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG,
+ &mtr, false, trx_sys_block);
+ if (!b)
+ goto too_small;
+ ib::info() << "Doublewrite buffer not found: creating new";
+
+ /* FIXME: After this point, the doublewrite buffer creation
+ is not atomic. The doublewrite buffer should not exist in
+ the InnoDB system tablespace file in the first place.
+ It could be located in separate optional file(s) in a
+ user-specified location. */
+
+ /* fseg_create acquires a second latch on the page,
+ therefore we must declare it: */
+ buf_block_dbg_add_level(b, SYNC_NO_ORDER_CHECK);
+ }
+
+ byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+ trx_sys_block->frame;
+ for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE;
+ i < 2 * size + extent_size / 2; i++)
+ {
+ buf_block_t *new_block= fseg_alloc_free_page(fseg_header, prev_page_no + 1,
+ FSP_UP, &mtr);
+ if (!new_block)
+ {
+ ib::error() << "Cannot create doublewrite buffer: "
+ " you must increase your tablespace size."
+ " Cannot continue operation.";
+ /* This may essentially corrupt the doublewrite
+ buffer. However, usually the doublewrite buffer
+ is created at database initialization, and it
+ should not matter (just remove all newly created
+ InnoDB files and restart). */
+ mtr.commit();
+ return false;
+ }
+
+ /* We read the allocated pages to the buffer pool; when they are
+ written to disk in a flush, the space id and page number fields
+ are also written to the pages. When we at database startup read
+ pages from the doublewrite buffer, we know that if the space id
+ and page number in them are the same as the page position in the
+ tablespace, then the page has not been written to in
+ doublewrite. */
+
+ ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+ const page_id_t id= new_block->page.id();
+ /* We only do this in the debug build, to ensure that the check in
+ buf_flush_init_for_writing() will see a valid page type. The
+ flushes of new_block are actually unnecessary here. */
+ ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->frame,
+ FIL_PAGE_TYPE_SYS));
+
+ if (i == size / 2)
+ {
+ ut_a(id.page_no() == size);
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 +
+ trx_sys_block->frame, id.page_no());
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
+ TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->frame,
+ id.page_no());
+ }
+ else if (i == size / 2 + size)
+ {
+ ut_a(id.page_no() == 2 * size);
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 +
+ trx_sys_block->frame, id.page_no());
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
+ TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->frame,
+ id.page_no());
+ }
+ else if (i > size / 2)
+ ut_a(id.page_no() == prev_page_no + 1);
+
+ if (((i + 1) & 15) == 0) {
+ /* rw_locks can only be recursively x-locked 2048 times. (on 32
+ bit platforms, (lint) 0 - (X_LOCK_DECR * 2049) is no longer a
+ negative number, and thus lock_word becomes like a shared lock).
+ For 4k page size this loop will lock the fseg header too many
+ times. Since this code is not done while any other threads are
+ active, restart the MTR occasionally. */
+ mtr.commit();
+ mtr.start();
+ trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+ fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+ trx_sys_block->frame;
+ }
+
+ prev_page_no= id.page_no();
+ }
+
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+ trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_MAGIC_N);
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+ TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->frame,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N);
+
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
+ trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N);
+ mtr.commit();
+
+ /* Flush the modified pages to disk and make a checkpoint */
+ log_make_checkpoint();
+
+ /* Remove doublewrite pages from LRU */
+ buf_pool_invalidate();
+
+ ib::info() << "Doublewrite buffer created";
+ goto start_again;
+}
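+
+/* Illustrative sketch, not part of the original change: the startup
+check that the comment in the allocation loop above describes. A
+doublewrite slot whose stored (space id, page number) still equals its
+own physical position in the system tablespace was never overwritten
+with a buffered page. Offsets 4 (FIL_PAGE_OFFSET) and 34 (the space id
+field) follow the InnoDB page header layout. */
+#if 0 /* example only; compiles standalone if enabled */
+#include <cstdint>
+
+static uint32_t read_be32(const uint8_t *p)
+{
+  return uint32_t{p[0]} << 24 | uint32_t{p[1]} << 16 |
+         uint32_t{p[2]} << 8 | uint32_t{p[3]};
+}
+
+bool example_slot_unused(const uint8_t *page, uint32_t slot_page_no)
+{
+  return read_be32(page + 4) == slot_page_no /* FIL_PAGE_OFFSET */
+      && read_be32(page + 34) == 0;          /* system tablespace id */
+}
+#endif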
+
+/** Initialize the doublewrite buffer memory structure on recovery.
+If we are upgrading from a version before MySQL 4.1, then this
+function performs the necessary update operations to support
+innodb_file_per_table. If we are in a crash recovery, this function
+loads the pages from double write buffer into memory.
+@param file File handle
+@param path Path name of file
+@return DB_SUCCESS or error code */
+dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path)
+{
+ ut_ad(this == &buf_dblwr);
+ const uint32_t size= block_size();
+
+  /* We do the file I/O bypassing the buffer pool */
+ byte *read_buf= static_cast<byte*>(aligned_malloc(srv_page_size,
+ srv_page_size));
+ /* Read the TRX_SYS header to check if we are using the doublewrite buffer */
+ dberr_t err= os_file_read(IORequestRead, file, read_buf,
+ TRX_SYS_PAGE_NO << srv_page_size_shift,
+ srv_page_size);
+
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << "Failed to read the system tablespace header page";
+func_exit:
+ aligned_free(read_buf);
+ return err;
+ }
+
+  /* TRX_SYS_PAGE_NO is not encrypted; see fil_crypt_rotate_page() */
+ if (mach_read_from_4(TRX_SYS_DOUBLEWRITE_MAGIC + TRX_SYS_DOUBLEWRITE +
+ read_buf) != TRX_SYS_DOUBLEWRITE_MAGIC_N)
+ {
+ /* There is no doublewrite buffer initialized in the TRX_SYS page.
+ This should normally not be possible; the doublewrite buffer should
+ be initialized when creating the database. */
+ err= DB_SUCCESS;
+ goto func_exit;
+ }
+
+ init(TRX_SYS_DOUBLEWRITE + read_buf);
+
+ const bool upgrade_to_innodb_file_per_table=
+ mach_read_from_4(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
+ TRX_SYS_DOUBLEWRITE + read_buf) !=
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N;
+
+ auto write_buf= active_slot->write_buf;
+ /* Read the pages from the doublewrite buffer to memory */
+ err= os_file_read(IORequestRead, file, write_buf,
+ block1.page_no() << srv_page_size_shift,
+ size << srv_page_size_shift);
+
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << "Failed to read the first double write buffer extent";
+ goto func_exit;
+ }
+
+ err= os_file_read(IORequestRead, file,
+ write_buf + (size << srv_page_size_shift),
+ block2.page_no() << srv_page_size_shift,
+ size << srv_page_size_shift);
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << "Failed to read the second double write buffer extent";
+ goto func_exit;
+ }
+
+ byte *page= write_buf;
+
+ if (UNIV_UNLIKELY(upgrade_to_innodb_file_per_table))
+ {
+    ib::info() << "Resetting space ids in the doublewrite buffer";
+
+ for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
+ {
+ memset(page + FIL_PAGE_SPACE_ID, 0, 4);
+ /* For innodb_checksum_algorithm=innodb, we do not need to
+ calculate new checksums for the pages because the field
+ .._SPACE_ID does not affect them. Write the page back to where
+ we read it from. */
+ const ulint source_page_no= i < size
+ ? block1.page_no() + i
+ : block2.page_no() + i - size;
+ err= os_file_write(IORequestWrite, path, file, page,
+ source_page_no << srv_page_size_shift, srv_page_size);
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << "Failed to upgrade the double write buffer";
+ goto func_exit;
+ }
+ }
+ os_file_flush(file);
+ }
+ else
+ for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
+ if (mach_read_from_8(my_assume_aligned<8>(page + FIL_PAGE_LSN)))
+ /* Each valid page header must contain a nonzero FIL_PAGE_LSN field. */
+ recv_sys.dblwr.add(page);
+
+ err= DB_SUCCESS;
+ goto func_exit;
+}
+
+/** Process and remove the double write buffer pages for all tablespaces. */
+void buf_dblwr_t::recover()
+{
+ ut_ad(recv_sys.parse_start_lsn);
+ if (!is_initialised())
+ return;
+
+ uint32_t page_no_dblwr= 0;
+ byte *read_buf= static_cast<byte*>(aligned_malloc(3 * srv_page_size,
+ srv_page_size));
+ byte *const buf= read_buf + srv_page_size;
+
+ for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin();
+ i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr)
+ {
+ byte *page= *i;
+ const uint32_t page_no= page_get_page_no(page);
+ if (!page_no) /* recovered via Datafile::restore_from_doublewrite() */
+ continue;
+
+ const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN);
+ if (recv_sys.parse_start_lsn > lsn)
+ /* Pages written before the checkpoint are not useful for recovery. */
+ continue;
+ const ulint space_id= page_get_space_id(page);
+ const page_id_t page_id(space_id, page_no);
+
+ if (recv_sys.scanned_lsn < lsn)
+ {
+ ib::info() << "Ignoring a doublewrite copy of page " << page_id
+ << " with future log sequence number " << lsn;
+ continue;
+ }
+
+ fil_space_t *space= fil_space_t::get(space_id);
+
+ if (!space)
+ /* The tablespace that this page once belonged to does not exist */
+ continue;
+
+ if (UNIV_UNLIKELY(page_no >= space->get_size()))
+ {
+ /* Do not report the warning for undo tablespaces, because they
+ can be truncated in place. */
+ if (!srv_is_undo_tablespace(space_id))
+ ib::warn() << "A copy of page " << page_no
+ << " in the doublewrite buffer slot " << page_no_dblwr
+ << " is beyond the end of tablespace " << space->name
+ << " (" << space->size << " pages)";
+next_page:
+ space->release();
+ continue;
+ }
+
+ const ulint physical_size= space->physical_size();
+ ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size)));
+
+ /* We want to ensure that for partial reads the unread portion of
+ the page is NUL. */
+ memset(read_buf, 0x0, physical_size);
+
+ /* Read in the actual page from the file */
+ fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER),
+ os_offset_t{page_no} * physical_size,
+ physical_size, read_buf);
+
+ if (UNIV_UNLIKELY(fio.err != DB_SUCCESS))
+ ib::warn() << "Double write buffer recovery: " << page_id
+ << " (tablespace '" << space->name
+ << "') read failed with error: " << fio.err;
+
+ if (buf_is_zeroes(span<const byte>(read_buf, physical_size)))
+ {
+ /* We will check if the copy in the doublewrite buffer is
+ valid. If not, we will ignore this page (there should be redo
+ log records to initialize it). */
+ }
+ else if (recv_sys.dblwr.validate_page(page_id, read_buf, space, buf))
+ goto next_page;
+ else
+ /* We intentionally skip this message for all-zero pages. */
+ ib::info() << "Trying to recover page " << page_id
+ << " from the doublewrite buffer.";
+
+ page= recv_sys.dblwr.find_page(page_id, space, buf);
+
+ if (!page)
+ goto next_page;
+
+ /* Write the good page from the doublewrite buffer to the intended
+ position. */
+ space->reacquire();
+ fio= space->io(IORequestWrite,
+ os_offset_t{page_id.page_no()} * physical_size,
+ physical_size, page);
+
+ if (fio.err == DB_SUCCESS)
+ ib::info() << "Recovered page " << page_id << " to '" << fio.node->name
+ << "' from the doublewrite buffer.";
+ goto next_page;
+ }
+
+ recv_sys.dblwr.pages.clear();
+ fil_flush_file_spaces();
+ aligned_free(read_buf);
+}
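+
+/* Illustrative sketch, not part of the original change: the per-page
+decision order that recover() applies, reduced to predicates. The
+on_disk_copy_valid flag stands in for the all-zero check plus
+recv_sys.dblwr.validate_page(). */
+#if 0 /* example only; compiles standalone if enabled */
+#include <cstdint>
+
+enum class dblwr_action { SKIP, RESTORE };
+
+dblwr_action example_recover_one(uint64_t page_lsn,
+                                 uint64_t checkpoint_lsn,
+                                 uint64_t scanned_lsn,
+                                 bool on_disk_copy_valid)
+{
+  if (page_lsn < checkpoint_lsn) /* older than the checkpoint */
+    return dblwr_action::SKIP;   /* the redo log will rebuild the page */
+  if (page_lsn > scanned_lsn)    /* "future" LSN: stale dblwr copy */
+    return dblwr_action::SKIP;
+  if (on_disk_copy_valid)        /* the data file copy is already OK */
+    return dblwr_action::SKIP;
+  return dblwr_action::RESTORE;  /* write the dblwr copy in place */
+}
+#endif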
+
+/** Free the doublewrite buffer. */
+void buf_dblwr_t::close()
+{
+ if (!is_initialised())
+ return;
+
+ /* Free the double write data structures. */
+ ut_ad(!active_slot->reserved);
+ ut_ad(!active_slot->first_free);
+ ut_ad(!batch_running);
+
+ pthread_cond_destroy(&cond);
+ for (int i= 0; i < 2; i++)
+ {
+ aligned_free(slots[i].write_buf);
+ ut_free(slots[i].buf_block_arr);
+ }
+ mysql_mutex_destroy(&mutex);
+
+ memset((void*) this, 0, sizeof *this);
+ active_slot= &slots[0];
+}
+
+/** Update the doublewrite buffer on write completion. */
+void buf_dblwr_t::write_completed()
+{
+ ut_ad(this == &buf_dblwr);
+ ut_ad(srv_use_doublewrite_buf);
+ ut_ad(is_initialised());
+ ut_ad(!srv_read_only_mode);
+
+ mysql_mutex_lock(&mutex);
+
+ ut_ad(batch_running);
+ slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+ ut_ad(flush_slot->reserved);
+ ut_ad(flush_slot->reserved <= flush_slot->first_free);
+
+ if (!--flush_slot->reserved)
+ {
+ mysql_mutex_unlock(&mutex);
+ /* This will finish the batch. Sync data files to the disk. */
+ fil_flush_file_spaces();
+ mysql_mutex_lock(&mutex);
+
+ /* We can now reuse the doublewrite memory buffer: */
+ flush_slot->first_free= 0;
+ batch_running= false;
+ pthread_cond_broadcast(&cond);
+ }
+
+ mysql_mutex_unlock(&mutex);
+}
+
+#ifdef UNIV_DEBUG
+/** Check the LSN values on the page.
+@param[in] page page to check
+@param[in] s tablespace */
+static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s)
+{
+ /* Ignore page_compressed or encrypted pages */
+ if (s.is_compressed() || buf_page_get_key_version(page, s.flags))
+ return;
+ const byte* lsn_start= FIL_PAGE_LSN + 4 + page;
+ const byte* lsn_end= page + srv_page_size -
+ (s.full_crc32()
+ ? FIL_PAGE_FCRC32_END_LSN
+ : FIL_PAGE_END_LSN_OLD_CHKSUM - 4);
+ static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+ ut_ad(!memcmp_aligned<4>(lsn_start, lsn_end, 4));
+}
+
+static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page)
+{
+ if (fil_space_t *space= fil_space_t::get(b.id().space()))
+ {
+ buf_dblwr_check_page_lsn(page, *space);
+ space->release();
+ }
+}
+
+/** Check the LSN values on the page with which this block is associated. */
+static void buf_dblwr_check_block(const buf_page_t *bpage)
+{
+ ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+ const page_t *page= reinterpret_cast<const buf_block_t*>(bpage)->frame;
+
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_TYPE_INSTANT:
+ case FIL_PAGE_RTREE:
+ if (page_is_comp(page))
+ {
+ if (page_simple_validate_new(page))
+ return;
+ }
+ else if (page_simple_validate_old(page))
+ return;
+    /* While it is possible that this is not an index page but just
+    happens to have a wrongly set FIL_PAGE_TYPE, such pages should never
+    be modified without also adjusting the page type during page
+    allocation, buf_flush_init_for_writing(), or
+    fil_block_reset_type(). */
+ buf_page_print(page);
+
+ ib::fatal() << "Apparent corruption of an index page " << bpage->id()
+ << " to be written to data file. We intentionally crash"
+ " the server to prevent corrupt data from ending up in"
+ " data files.";
+ }
+}
+#endif /* UNIV_DEBUG */
+
+bool buf_dblwr_t::flush_buffered_writes(const ulint size)
+{
+ mysql_mutex_assert_owner(&mutex);
+ ut_ad(size == block_size());
+
+ for (;;)
+ {
+ if (!active_slot->first_free)
+ return false;
+ if (!batch_running)
+ break;
+ my_cond_wait(&cond, &mutex.m_mutex);
+ }
+
+ ut_ad(active_slot->reserved == active_slot->first_free);
+ ut_ad(!flushing_buffered_writes);
+
+ /* Disallow anyone else to start another batch of flushing. */
+ slot *flush_slot= active_slot;
+ /* Switch the active slot */
+ active_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+ ut_a(active_slot->first_free == 0);
+ batch_running= true;
+ const ulint old_first_free= flush_slot->first_free;
+ auto write_buf= flush_slot->write_buf;
+ const bool multi_batch= block1 + static_cast<uint32_t>(size) != block2 &&
+ old_first_free > size;
+ flushing_buffered_writes= 1 + multi_batch;
+ pages_submitted+= old_first_free;
+ /* Now safe to release the mutex. */
+ mysql_mutex_unlock(&mutex);
+#ifdef UNIV_DEBUG
+ for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++)
+ {
+ buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage;
+
+ if (bpage->zip.data)
+ /* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */
+ continue;
+
+ /* Check that the actual page in the buffer pool is not corrupt
+ and the LSN values are sane. */
+ buf_dblwr_check_block(bpage);
+ ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2));
+ }
+#endif /* UNIV_DEBUG */
+ const IORequest request(nullptr, fil_system.sys_space->chain.start,
+ IORequest::DBLWR_BATCH);
+ ut_a(fil_system.sys_space->acquire());
+ if (multi_batch)
+ {
+ fil_system.sys_space->reacquire();
+ os_aio(request, write_buf,
+ os_offset_t{block1.page_no()} << srv_page_size_shift,
+ size << srv_page_size_shift);
+ os_aio(request, write_buf + (size << srv_page_size_shift),
+ os_offset_t{block2.page_no()} << srv_page_size_shift,
+ (old_first_free - size) << srv_page_size_shift);
+ }
+ else
+ os_aio(request, write_buf,
+ os_offset_t{block1.page_no()} << srv_page_size_shift,
+ old_first_free << srv_page_size_shift);
+ return true;
+}
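+
+/* Illustrative sketch, not part of the original change: the two-slot
+hand-off above, reduced to its core. One slot keeps accepting new pages
+while the other is being written out; the switch happens under the
+mutex, and a new batch waits until the running one completes. */
+#if 0 /* example only; compiles standalone if enabled */
+#include <condition_variable>
+#include <mutex>
+#include <vector>
+
+struct example_dblwr
+{
+  std::mutex m;
+  std::condition_variable cv;
+  bool batch_running = false;
+  std::vector<int> slots[2];
+  int active = 0;
+
+  /* Returns the slot to submit to storage, or nullptr if empty. */
+  std::vector<int> *begin_batch()
+  {
+    std::unique_lock<std::mutex> lk(m);
+    cv.wait(lk, [this] { return !batch_running; });
+    if (slots[active].empty())
+      return nullptr;
+    std::vector<int> *flush = &slots[active];
+    active ^= 1;          /* switch the active slot */
+    batch_running = true; /* disallow another batch until completion */
+    return flush;
+  }
+
+  void end_batch(std::vector<int> *flush)
+  {
+    std::lock_guard<std::mutex> lk(m);
+    flush->clear();       /* the slot memory can now be reused */
+    batch_running = false;
+    cv.notify_all();
+  }
+};
+#endif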
+
+void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request)
+{
+ ut_ad(this == &buf_dblwr);
+ ut_ad(srv_use_doublewrite_buf);
+ ut_ad(is_initialised());
+ ut_ad(!srv_read_only_mode);
+ ut_ad(!request.bpage);
+ ut_ad(request.node == fil_system.sys_space->chain.start);
+ ut_ad(request.type == IORequest::DBLWR_BATCH);
+ mysql_mutex_lock(&mutex);
+ ut_ad(batch_running);
+ ut_ad(flushing_buffered_writes);
+ ut_ad(flushing_buffered_writes <= 2);
+ writes_completed++;
+ if (UNIV_UNLIKELY(--flushing_buffered_writes))
+ {
+ mysql_mutex_unlock(&mutex);
+ return;
+ }
+
+ slot *const flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+ ut_ad(flush_slot->reserved == flush_slot->first_free);
+ /* increment the doublewrite flushed pages counter */
+ pages_written+= flush_slot->first_free;
+ mysql_mutex_unlock(&mutex);
+
+ /* Now flush the doublewrite buffer data to disk */
+ fil_system.sys_space->flush<false>();
+
+ /* The writes have been flushed to disk now and in recovery we will
+ find them in the doublewrite buffer blocks. Next, write the data pages. */
+ for (ulint i= 0, first_free= flush_slot->first_free; i < first_free; i++)
+ {
+ auto e= flush_slot->buf_block_arr[i];
+ buf_page_t* bpage= e.request.bpage;
+ ut_ad(bpage->in_file());
+
+ /* We request frame here to get correct buffer in case of
+ encryption and/or page compression */
+ void *frame= buf_page_get_frame(bpage);
+
+ auto e_size= e.size;
+
+ if (UNIV_LIKELY_NULL(bpage->zip.data))
+ {
+ e_size= bpage->zip_size();
+ ut_ad(e_size);
+ }
+ else
+ {
+ ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(!bpage->zip_size());
+ ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame)));
+ }
+
+ const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+ (FIL_PAGE_LSN +
+ static_cast<const byte*>(frame)));
+ ut_ad(lsn);
+ ut_ad(lsn >= bpage->oldest_modification());
+ log_write_up_to(lsn, true);
+ e.request.node->space->io(e.request, bpage->physical_offset(), e_size,
+ frame, bpage);
+ }
+}
+
+/** Flush possible buffered writes to persistent storage.
+It is very important to call this function after a batch of writes has been
+posted, and also when we may have to wait for a page latch!
+Otherwise a deadlock of threads can occur. */
+void buf_dblwr_t::flush_buffered_writes()
+{
+ if (!is_initialised() || !srv_use_doublewrite_buf)
+ {
+ fil_flush_file_spaces();
+ return;
+ }
+
+ ut_ad(!srv_read_only_mode);
+ const ulint size= block_size();
+
+ mysql_mutex_lock(&mutex);
+ if (!flush_buffered_writes(size))
+ mysql_mutex_unlock(&mutex);
+}
+
+/** Schedule a page write. If the doublewrite memory buffer is full,
+flush_buffered_writes() will be invoked to make space.
+@param request asynchronous write request
+@param size payload size in bytes */
+void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
+{
+ ut_ad(request.is_async());
+ ut_ad(request.is_write());
+ ut_ad(request.bpage);
+ ut_ad(request.bpage->in_file());
+ ut_ad(request.node);
+ ut_ad(request.node->space->id == request.bpage->id().space());
+ ut_ad(request.node->space->referenced());
+ ut_ad(!srv_read_only_mode);
+
+ const ulint buf_size= 2 * block_size();
+
+ mysql_mutex_lock(&mutex);
+
+ for (;;)
+ {
+ ut_ad(active_slot->first_free <= buf_size);
+ if (active_slot->first_free != buf_size)
+ break;
+
+ if (flush_buffered_writes(buf_size / 2))
+ mysql_mutex_lock(&mutex);
+ }
+
+ byte *p= active_slot->write_buf + srv_page_size * active_slot->first_free;
+
+ /* We request frame here to get correct buffer in case of
+ encryption and/or page compression */
+ void *frame= buf_page_get_frame(request.bpage);
+
+ /* "frame" is at least 1024-byte aligned for ROW_FORMAT=COMPRESSED pages,
+ and at least srv_page_size (4096-byte) for everything else. */
+ memcpy_aligned<UNIV_ZIP_SIZE_MIN>(p, frame, size);
+ /* fil_page_compress() for page_compressed guarantees 256-byte alignment */
+ memset_aligned<256>(p + size, 0, srv_page_size - size);
+ /* FIXME: Inform the compiler that "size" and "srv_page_size - size"
+ are integer multiples of 256, so the above can translate into simple
+ SIMD instructions. Currently, we make no such assumptions about the
+ non-pointer parameters that are passed to the _aligned templates. */
+ ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size);
+ ut_ad(active_slot->reserved == active_slot->first_free);
+ ut_ad(active_slot->reserved < buf_size);
+ new (active_slot->buf_block_arr + active_slot->first_free++)
+ element{request, size};
+ active_slot->reserved= active_slot->first_free;
+
+ if (active_slot->first_free != buf_size ||
+ !flush_buffered_writes(buf_size / 2))
+ mysql_mutex_unlock(&mutex);
+}
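+
+/* Illustrative sketch, not part of the original change: the copy and
+zero-padding step above. A payload smaller than the page (for example a
+ROW_FORMAT=COMPRESSED page) occupies the front of its page-sized slot
+and the remainder is zero-filled, so every slot write is a whole,
+deterministic page. */
+#if 0 /* example only; compiles standalone if enabled */
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+void example_fill_slot(uint8_t *slot, size_t page_size,
+                       const uint8_t *payload, size_t payload_size)
+{
+  memcpy(slot, payload, payload_size);                 /* the payload */
+  memset(slot + payload_size, 0, page_size - payload_size); /* padding */
+}
+#endif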
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
new file mode 100644
index 00000000..c6ddcb4f
--- /dev/null
+++ b/storage/innobase/buf/buf0dump.cc
@@ -0,0 +1,824 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dump.cc
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#include "my_global.h"
+#include "mysqld.h"
+#include "my_sys.h"
+
+#include "mysql/psi/mysql_stage.h"
+#include "mysql/psi/psi.h"
+
+#include "buf0buf.h"
+#include "buf0dump.h"
+#include "dict0dict.h"
+#include "os0file.h"
+#include "os0thread.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "sync0rw.h"
+#include "ut0byte.h"
+
+#include <algorithm>
+
+#include "mysql/service_wsrep.h" /* wsrep_recovery */
+#include <my_service_manager.h>
+
+static void buf_do_load_dump();
+
+enum status_severity {
+ STATUS_INFO,
+ STATUS_ERR
+};
+
+#define SHUTTING_DOWN() (srv_shutdown_state != SRV_SHUTDOWN_NONE)
+
+/* Flags that tell the buffer pool dump/load thread which action it
+should take after being woken up. */
+static volatile bool buf_dump_should_start;
+static volatile bool buf_load_should_start;
+
+static bool buf_load_abort_flag;
+
+/** Start the buffer pool dump/load task and instruct it to start a dump. */
+void buf_dump_start()
+{
+ buf_dump_should_start= true;
+ buf_do_load_dump();
+}
+
+/** Start the buffer pool dump/load task and instruct it to start a load. */
+void buf_load_start()
+{
+ buf_load_should_start= true;
+ buf_do_load_dump();
+}
+
+/*****************************************************************//**
+Sets the global variable that feeds MySQL's innodb_buffer_pool_dump_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3). The value of this variable can be
+retrieved by:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS';
+or by:
+SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */
+static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
+void
+buf_dump_status(
+/*============*/
+ enum status_severity severity,/*!< in: status severity */
+ const char* fmt, /*!< in: format */
+ ...) /*!< in: extra parameters according
+ to fmt */
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+
+ vsnprintf(
+ export_vars.innodb_buffer_pool_dump_status,
+ sizeof(export_vars.innodb_buffer_pool_dump_status),
+ fmt, ap);
+
+ switch (severity) {
+ case STATUS_INFO:
+ ib::info() << export_vars.innodb_buffer_pool_dump_status;
+ break;
+
+ case STATUS_ERR:
+ ib::error() << export_vars.innodb_buffer_pool_dump_status;
+ break;
+ }
+
+ va_end(ap);
+}
+
+/*****************************************************************//**
+Sets the global variable that feeds MySQL's innodb_buffer_pool_load_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3). The value of this variable can be
+retrieved by:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS';
+or by:
+SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */
+static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
+void
+buf_load_status(
+/*============*/
+ enum status_severity severity,/*!< in: status severity */
+ const char* fmt, /*!< in: format */
+ ...) /*!< in: extra parameters according to fmt */
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+
+ vsnprintf(
+ export_vars.innodb_buffer_pool_load_status,
+ sizeof(export_vars.innodb_buffer_pool_load_status),
+ fmt, ap);
+
+ switch (severity) {
+ case STATUS_INFO:
+ ib::info() << export_vars.innodb_buffer_pool_load_status;
+ break;
+
+ case STATUS_ERR:
+ ib::error() << export_vars.innodb_buffer_pool_load_status;
+ break;
+ }
+
+ va_end(ap);
+}
+
+/** Returns the directory path where the buffer pool dump file will be created.
+@return directory path */
+static
+const char*
+get_buf_dump_dir()
+{
+ const char* dump_dir;
+
+ /* The dump file should be created in the default data directory if
+	innodb_data_home_dir is set to an empty string. */
+ if (!*srv_data_home) {
+ dump_dir = fil_path_to_mysql_datadir;
+ } else {
+ dump_dir = srv_data_home;
+ }
+
+ return(dump_dir);
+}
+
+/** Generate the path to the buffer pool dump/load file.
+@param[out] path generated path
+@param[in] path_size size of 'path', used as in snprintf(3). */
+static void buf_dump_generate_path(char *path, size_t path_size)
+{
+ char buf[FN_REFLEN];
+
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ snprintf(buf, sizeof(buf), "%s%c%s", get_buf_dump_dir(),
+ OS_PATH_SEPARATOR, srv_buf_dump_filename);
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+
+ os_file_type_t type;
+ bool exists = false;
+ bool ret;
+
+ ret = os_file_status(buf, &exists, &type);
+
+ /* For realpath() to succeed the file must exist. */
+
+ if (ret && exists) {
+ /* my_realpath() assumes the destination buffer is big enough
+ to hold FN_REFLEN bytes. */
+ ut_a(path_size >= FN_REFLEN);
+
+ my_realpath(path, buf, 0);
+ } else {
+ /* If it does not exist, then resolve only srv_data_home
+ and append srv_buf_dump_filename to it. */
+ char srv_data_home_full[FN_REFLEN];
+
+ my_realpath(srv_data_home_full, get_buf_dump_dir(), 0);
+
+ if (srv_data_home_full[strlen(srv_data_home_full) - 1]
+ == OS_PATH_SEPARATOR) {
+
+ snprintf(path, path_size, "%s%s",
+ srv_data_home_full,
+ srv_buf_dump_filename);
+ } else {
+ snprintf(path, path_size, "%s%c%s",
+ srv_data_home_full,
+ OS_PATH_SEPARATOR,
+ srv_buf_dump_filename);
+ }
+ }
+}
+
+/*****************************************************************//**
+Perform a buffer pool dump into the file specified by
+innodb_buffer_pool_filename. If any errors occur then the value of
+innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status().
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_dump(
+/*=====*/
+ ibool obey_shutdown) /*!< in: quit if we are in a shutting down
+ state */
+{
+#define SHOULD_QUIT() (SHUTTING_DOWN() && obey_shutdown)
+
+ char full_filename[OS_FILE_MAX_PATH];
+ char tmp_filename[OS_FILE_MAX_PATH + sizeof "incomplete"];
+ char now[32];
+ FILE* f;
+ int ret;
+
+ buf_dump_generate_path(full_filename, sizeof(full_filename));
+
+ snprintf(tmp_filename, sizeof(tmp_filename),
+ "%s.incomplete", full_filename);
+
+ buf_dump_status(STATUS_INFO, "Dumping buffer pool(s) to %s",
+ full_filename);
+
+#if defined(__GLIBC__) || defined(__WIN__) || O_CLOEXEC == 0
+ f = fopen(tmp_filename, "w" STR_O_CLOEXEC);
+#else
+ {
+ int fd;
+ fd = open(tmp_filename, O_CREAT | O_TRUNC | O_CLOEXEC | O_WRONLY, 0640);
+ if (fd >= 0) {
+ f = fdopen(fd, "w");
+ }
+ else {
+ f = NULL;
+ }
+ }
+#endif
+ if (f == NULL) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot open '%s' for writing: %s",
+ tmp_filename, strerror(errno));
+ return;
+ }
+ const buf_page_t* bpage;
+ page_id_t* dump;
+ ulint n_pages;
+ ulint j;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ n_pages = UT_LIST_GET_LEN(buf_pool.LRU);
+
+ /* skip empty buffer pools */
+ if (n_pages == 0) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto done;
+ }
+
+ if (srv_buf_pool_dump_pct != 100) {
+ ulint t_pages;
+
+ /* limit the number of total pages dumped to X% of the
+ total number of pages */
+ t_pages = buf_pool.curr_size * srv_buf_pool_dump_pct / 100;
+ if (n_pages > t_pages) {
+ buf_dump_status(STATUS_INFO,
+ "Restricted to " ULINTPF
+ " pages due to "
+					"innodb_buffer_pool_dump_pct=%lu",
+ t_pages, srv_buf_pool_dump_pct);
+ n_pages = t_pages;
+ }
+
+ if (n_pages == 0) {
+ n_pages = 1;
+ }
+ }
+
+ dump = static_cast<page_id_t*>(ut_malloc_nokey(
+ n_pages * sizeof(*dump)));
+
+ if (dump == NULL) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ fclose(f);
+ buf_dump_status(STATUS_ERR,
+ "Cannot allocate " ULINTPF " bytes: %s",
+ (ulint) (n_pages * sizeof(*dump)),
+ strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+
+ for (bpage = UT_LIST_GET_FIRST(buf_pool.LRU), j = 0;
+ bpage != NULL && j < n_pages;
+ bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+
+ ut_a(bpage->in_file());
+ const page_id_t id(bpage->id());
+
+ if (id.space() == SRV_TMP_SPACE_ID) {
+ /* Ignore the innodb_temporary tablespace. */
+ continue;
+ }
+
+ if (bpage->status == buf_page_t::FREED) {
+ continue;
+ }
+
+ dump[j++] = id;
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ ut_a(j <= n_pages);
+ n_pages = j;
+
+ for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
+ ret = fprintf(f, "%u,%u\n",
+ dump[j].space(), dump[j].page_no());
+ if (ret < 0) {
+ ut_free(dump);
+ fclose(f);
+ buf_dump_status(STATUS_ERR,
+ "Cannot write to '%s': %s",
+ tmp_filename, strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+ if (SHUTTING_DOWN() && !(j & 1023)) {
+ service_manager_extend_timeout(
+ INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "Dumping buffer pool page "
+ ULINTPF "/" ULINTPF, j + 1, n_pages);
+ }
+ }
+
+ ut_free(dump);
+
+done:
+ ret = fclose(f);
+ if (ret != 0) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot close '%s': %s",
+ tmp_filename, strerror(errno));
+ return;
+ }
+ /* else */
+
+ ret = unlink(full_filename);
+ if (ret != 0 && errno != ENOENT) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot delete '%s': %s",
+ full_filename, strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+ /* else */
+
+ ret = rename(tmp_filename, full_filename);
+ if (ret != 0) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot rename '%s' to '%s': %s",
+ tmp_filename, full_filename,
+ strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+ /* else */
+
+ /* success */
+
+ ut_sprintf_timestamp(now);
+
+ buf_dump_status(STATUS_INFO,
+ "Buffer pool(s) dump completed at %s", now);
+
+	/* Though dumping is not related to an incomplete load,
+	we reset this to 0 here to indicate that a shutdown can also
+	perform a dump */
+ export_vars.innodb_buffer_pool_load_incomplete = 0;
+}
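+
+/* Illustrative sketch, not part of the original change: the
+write-to-temporary-then-rename pattern that buf_dump() uses, so a crash
+mid-dump never leaves a truncated dump file under the real name.
+Records are "space,page" pairs, one per line. */
+#if 0 /* example only; compiles standalone if enabled */
+#include <cstdio>
+#include <string>
+#include <utility>
+#include <vector>
+
+bool example_dump(const std::string &path,
+                  const std::vector<std::pair<unsigned, unsigned>> &pages)
+{
+  const std::string tmp = path + ".incomplete";
+  FILE *f = fopen(tmp.c_str(), "w");
+  if (!f)
+    return false;
+  for (const auto &p : pages)
+    if (fprintf(f, "%u,%u\n", p.first, p.second) < 0)
+    {
+      fclose(f);
+      return false; /* leave the temporary file behind for inspection */
+    }
+  if (fclose(f) != 0)
+    return false;
+  /* rename() atomically replaces any previous dump on POSIX systems. */
+  return rename(tmp.c_str(), path.c_str()) == 0;
+}
+#endif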
+
+/*****************************************************************//**
+Artificially delay the buffer pool loading if necessary. The idea of
+this function is to prevent hogging the server with IO and slowing down
+too much normal client queries. */
+UNIV_INLINE
+void
+buf_load_throttle_if_needed(
+/*========================*/
+ ulint* last_check_time, /*!< in/out: milliseconds since epoch
+ of the last time we did check if
+ throttling is needed, we do the check
+ every srv_io_capacity IO ops. */
+ ulint* last_activity_count,
+ ulint n_io) /*!< in: number of IO ops done since
+ buffer pool load has started */
+{
+ if (n_io % srv_io_capacity < srv_io_capacity - 1) {
+ return;
+ }
+
+ if (*last_check_time == 0 || *last_activity_count == 0) {
+ *last_check_time = ut_time_ms();
+ *last_activity_count = srv_get_activity_count();
+ return;
+ }
+
+ /* srv_io_capacity IO operations have been performed by buffer pool
+ load since the last time we were here. */
+
+ /* If no other activity, then keep going without any delay. */
+ if (srv_get_activity_count() == *last_activity_count) {
+ return;
+ }
+
+ /* There has been other activity, throttle. */
+
+ ulint now = ut_time_ms();
+ ulint elapsed_time = now - *last_check_time;
+
+ /* Notice that elapsed_time is not the time for the last
+ srv_io_capacity IO operations performed by BP load. It is the
+ time elapsed since the last time we detected that there has been
+ other activity. This has a small and acceptable deficiency, e.g.:
+ 1. BP load runs and there is no other activity.
+ 2. Other activity occurs, we run N IO operations after that and
+ enter here (where 0 <= N < srv_io_capacity).
+ 3. last_check_time is very old and we do not sleep at this time, but
+ only update last_check_time and last_activity_count.
+ 4. We run srv_io_capacity more IO operations and call this function
+ again.
+ 5. There has been more other activity and thus we enter here.
+ 6. Now last_check_time is recent and we sleep if necessary to prevent
+ more than srv_io_capacity IO operations per second.
+ The deficiency is that we could have slept at 3., but for this we
+ would have to update last_check_time before the
+ "cur_activity_count == *last_activity_count" check and calling
+ ut_time_ms() that often may turn out to be too expensive. */
+
+ if (elapsed_time < 1000 /* 1 sec (1000 milli secs) */) {
+ os_thread_sleep((1000 - elapsed_time) * 1000 /* micro secs */);
+ }
+
+ *last_check_time = ut_time_ms();
+ *last_activity_count = srv_get_activity_count();
+}
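+
+/* Illustrative sketch, not part of the original change: the rate cap
+above, reduced to arithmetic. After each window of io_capacity reads,
+sleep whatever remains of one second since the window began, which caps
+the load at roughly io_capacity pages per second. */
+#if 0 /* example only; compiles standalone if enabled */
+#include <chrono>
+#include <thread>
+
+void example_throttle(std::chrono::steady_clock::time_point &window_start)
+{
+  using namespace std::chrono;
+  const auto elapsed =
+      duration_cast<milliseconds>(steady_clock::now() - window_start);
+  if (elapsed < milliseconds(1000))
+    std::this_thread::sleep_for(milliseconds(1000) - elapsed);
+  window_start = steady_clock::now(); /* start the next window */
+}
+#endif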
+
+/*****************************************************************//**
+Perform a buffer pool load from the file specified by
+innodb_buffer_pool_filename. If any errors occur then the value of
+innodb_buffer_pool_load_status will be set accordingly, see buf_load_status().
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_load()
+/*======*/
+{
+ char full_filename[OS_FILE_MAX_PATH];
+ char now[32];
+ FILE* f;
+ page_id_t* dump;
+ ulint dump_n;
+ ulint i;
+ uint32_t space_id;
+ uint32_t page_no;
+ int fscanf_ret;
+
+ /* Ignore any leftovers from before */
+ buf_load_abort_flag = false;
+
+ buf_dump_generate_path(full_filename, sizeof(full_filename));
+
+ buf_load_status(STATUS_INFO,
+ "Loading buffer pool(s) from %s", full_filename);
+
+ f = fopen(full_filename, "r" STR_O_CLOEXEC);
+ if (f == NULL) {
+ buf_load_status(STATUS_INFO,
+ "Cannot open '%s' for reading: %s",
+ full_filename, strerror(errno));
+ return;
+ }
+ /* else */
+
+ /* First scan the file to estimate how many entries are in it.
+	This file is tiny (approx 500KB per 1GB buffer pool), so reading
+	it twice is fine. */
+ dump_n = 0;
+ while (fscanf(f, "%u,%u", &space_id, &page_no) == 2
+ && !SHUTTING_DOWN()) {
+ dump_n++;
+ }
+
+ if (!SHUTTING_DOWN() && !feof(f)) {
+ /* fscanf() returned != 2 */
+ const char* what;
+ if (ferror(f)) {
+ what = "reading";
+ } else {
+ what = "parsing";
+ }
+ fclose(f);
+ buf_load_status(STATUS_ERR, "Error %s '%s',"
+ " unable to load buffer pool (stage 1)",
+ what, full_filename);
+ return;
+ }
+
+	/* If the dump is larger than the buffer pool(s), we ignore the
+	extra trailing entries. This could happen if a dump is made, the
+	buffer pool is then shrunk, and a load is attempted. */
+ dump_n = std::min(dump_n, buf_pool.get_n_pages());
+
+ if (dump_n != 0) {
+ dump = static_cast<page_id_t*>(ut_malloc_nokey(
+ dump_n * sizeof(*dump)));
+ } else {
+ fclose(f);
+ ut_sprintf_timestamp(now);
+ buf_load_status(STATUS_INFO,
+ "Buffer pool(s) load completed at %s"
+ " (%s was empty)", now, full_filename);
+ return;
+ }
+
+ if (dump == NULL) {
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Cannot allocate " ULINTPF " bytes: %s",
+ dump_n * sizeof(*dump),
+ strerror(errno));
+ return;
+ }
+
+ rewind(f);
+
+ export_vars.innodb_buffer_pool_load_incomplete = 1;
+
+ for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+ fscanf_ret = fscanf(f, "%u,%u", &space_id, &page_no);
+
+ if (fscanf_ret != 2) {
+ if (feof(f)) {
+ break;
+ }
+ /* else */
+
+ ut_free(dump);
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Error parsing '%s', unable"
+ " to load buffer pool (stage 2)",
+ full_filename);
+ return;
+ }
+
+ if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) {
+ ut_free(dump);
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Error parsing '%s': bogus"
+ " space,page %u,%u at line " ULINTPF
+ ", unable to load buffer pool",
+ full_filename,
+ space_id, page_no,
+ i);
+ return;
+ }
+
+ dump[i] = page_id_t(space_id, page_no);
+ }
+
+ /* Set dump_n to the actual number of initialized elements,
+ i could be smaller than dump_n here if the file got truncated after
+ we read it the first time. */
+ dump_n = i;
+
+ fclose(f);
+
+ if (dump_n == 0) {
+ ut_free(dump);
+ ut_sprintf_timestamp(now);
+ buf_load_status(STATUS_INFO,
+ "Buffer pool(s) load completed at %s"
+ " (%s was empty or had errors)", now, full_filename);
+ return;
+ }
+
+ if (!SHUTTING_DOWN()) {
+ std::sort(dump, dump + dump_n);
+ }
+
+ ulint last_check_time = 0;
+ ulint last_activity_cnt = 0;
+
+ /* Avoid calling the expensive fil_space_t::get() for each
+ page within the same tablespace. dump[] is sorted by (space, page),
+ so all pages from a given tablespace are consecutive. */
+ ulint cur_space_id = dump[0].space();
+ fil_space_t* space = fil_space_t::get(cur_space_id);
+ ulint zip_size = space ? space->zip_size() : 0;
+
+ PSI_stage_progress* pfs_stage_progress __attribute__((unused))
+ = mysql_set_stage(srv_stage_buffer_pool_load.m_key);
+ mysql_stage_set_work_estimated(pfs_stage_progress, dump_n);
+ mysql_stage_set_work_completed(pfs_stage_progress, 0);
+
+ for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+
+ /* space_id for this iteration of the loop */
+ const ulint this_space_id = dump[i].space();
+
+ if (this_space_id == SRV_TMP_SPACE_ID) {
+ /* Ignore the innodb_temporary tablespace. */
+ continue;
+ }
+
+ if (this_space_id != cur_space_id) {
+ if (space) {
+ space->release();
+ }
+
+ cur_space_id = this_space_id;
+ space = fil_space_t::get(cur_space_id);
+
+ if (!space) {
+ continue;
+ }
+
+ zip_size = space->zip_size();
+ }
+
+		/* JAN: TODO: As we use the background page read below,
+		we can't use it if the tablespace is encrypted. */
+ if (!space || dump[i].page_no() >= space->get_size() ||
+ (space->crypt_data &&
+ space->crypt_data->encryption != FIL_ENCRYPTION_OFF &&
+ space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) {
+ continue;
+ }
+
+ if (space->is_stopping()) {
+ space->release();
+ space = nullptr;
+ continue;
+ }
+
+ space->reacquire();
+ buf_read_page_background(space, dump[i], zip_size, true);
+
+ if (buf_load_abort_flag) {
+ if (space) {
+ space->release();
+ }
+ buf_load_abort_flag = false;
+ ut_free(dump);
+ buf_load_status(
+ STATUS_INFO,
+ "Buffer pool(s) load aborted on request");
+ /* Premature end, set estimated = completed = i and
+ end the current stage event. */
+
+ mysql_stage_set_work_estimated(pfs_stage_progress, i);
+ mysql_stage_set_work_completed(pfs_stage_progress, i);
+
+ mysql_end_stage();
+ return;
+ }
+
+ buf_load_throttle_if_needed(
+ &last_check_time, &last_activity_cnt, i);
+
+#ifdef UNIV_DEBUG
+ if ((i+1) >= srv_buf_pool_load_pages_abort) {
+ buf_load_abort_flag = true;
+ }
+#endif
+ }
+
+ if (space) {
+ space->release();
+ }
+
+ ut_free(dump);
+
+ ut_sprintf_timestamp(now);
+
+ if (i == dump_n) {
+ buf_load_status(STATUS_INFO,
+ "Buffer pool(s) load completed at %s", now);
+ export_vars.innodb_buffer_pool_load_incomplete = 0;
+	} else if (!buf_load_abort_flag) {
+		buf_load_status(STATUS_INFO,
+			"Buffer pool(s) load aborted due to shutdown at %s",
+			now);
+		/* intentionally don't reset innodb_buffer_pool_load_incomplete
+		as we don't want a shutdown to save the buffer pool */
+	} else {
+		buf_load_status(STATUS_INFO,
+			"Buffer pool(s) load aborted due to user instigated abort at %s",
+			now);
+		/* intentionally don't reset innodb_buffer_pool_load_incomplete
+		as we want to abort without saving the buffer pool */
+	}
+
+ /* Make sure that estimated = completed when we end. */
+ mysql_stage_set_work_completed(pfs_stage_progress, dump_n);
+ /* End the stage progress event. */
+ mysql_end_stage();
+}
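+
+/* Illustrative sketch, not part of the original change: the two-pass
+"%u,%u" parse that buf_load() performs -- one pass to size the array, a
+rewind, then one pass to fill it. */
+#if 0 /* example only; compiles standalone if enabled */
+#include <cstdio>
+#include <utility>
+#include <vector>
+
+bool example_parse_dump(FILE *f,
+                        std::vector<std::pair<unsigned, unsigned>> &out)
+{
+  unsigned space, page;
+  size_t n = 0;
+  while (fscanf(f, "%u,%u", &space, &page) == 2)
+    n++;          /* pass 1: count the records */
+  if (!feof(f))
+    return false; /* read or parse error before end of file */
+  rewind(f);
+  out.reserve(n); /* pass 2: read the records in */
+  while (out.size() < n && fscanf(f, "%u,%u", &space, &page) == 2)
+    out.emplace_back(space, page);
+  return true;
+}
+#endif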
+
+/** Abort a currently running buffer pool load. */
+void buf_load_abort()
+{
+ buf_load_abort_flag= true;
+}
+
+/*****************************************************************//**
+This is the main task for buffer pool dump/load. When scheduled, it
+performs either a dump or a load, depending on the server state, the
+state of the variables, etc. */
+static void buf_dump_load_func(void *)
+{
+ ut_ad(!srv_read_only_mode);
+ static bool first_time = true;
+ if (first_time && srv_buffer_pool_load_at_startup) {
+
+#ifdef WITH_WSREP
+ if (!get_wsrep_recovery()) {
+#endif /* WITH_WSREP */
+ buf_load();
+#ifdef WITH_WSREP
+ }
+#endif /* WITH_WSREP */
+ }
+ first_time = false;
+
+ while (!SHUTTING_DOWN()) {
+ if (buf_dump_should_start) {
+ buf_dump_should_start = false;
+ buf_dump(true);
+ }
+ if (buf_load_should_start) {
+ buf_load_should_start = false;
+ buf_load();
+ }
+
+ if (!buf_dump_should_start && !buf_load_should_start) {
+ return;
+ }
+ }
+
+ /* In shutdown */
+ if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
+ if (export_vars.innodb_buffer_pool_load_incomplete) {
+ buf_dump_status(STATUS_INFO,
+ "Dumping of buffer pool not started"
+ " as load was incomplete");
+#ifdef WITH_WSREP
+ } else if (get_wsrep_recovery()) {
+#endif /* WITH_WSREP */
+ } else {
+ buf_dump(false/* do complete dump at shutdown */);
+ }
+ }
+}
+
+
+/* Execute the task with a maximum concurrency of 1 */
+static tpool::task_group tpool_group(1);
+static tpool::waitable_task buf_dump_load_task(buf_dump_load_func, &tpool_group);
+static bool load_dump_enabled;
+
+/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set.*/
+void buf_load_at_startup()
+{
+ load_dump_enabled= true;
+ if (srv_buffer_pool_load_at_startup)
+ buf_do_load_dump();
+}
+
+static void buf_do_load_dump()
+{
+ if (load_dump_enabled && !buf_dump_load_task.is_running())
+ srv_thread_pool->submit_task(&buf_dump_load_task);
+}
+
+/** Wait for currently running load/dumps to finish*/
+void buf_load_dump_end()
+{
+ ut_ad(SHUTTING_DOWN());
+ buf_dump_load_task.wait();
+}
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
new file mode 100644
index 00000000..10a84d99
--- /dev/null
+++ b/storage/innobase/buf/buf0flu.cc
@@ -0,0 +1,2530 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+Copyright (c) 2013, 2014, Fusion-io
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0flu.cc
+The database buffer buf_pool flush algorithm
+
+Created 11/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <my_service_manager.h>
+#include <mysql/service_thd_wait.h>
+#include <sql_class.h>
+
+#include "buf0flu.h"
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "buf0dblwr.h"
+#include "srv0start.h"
+#include "page0zip.h"
+#include "fil0fil.h"
+#include "log0crypt.h"
+#include "srv0mon.h"
+#include "fil0pagecompress.h"
+#ifdef HAVE_LZO
+# include "lzo/lzo1x.h"
+#elif defined HAVE_SNAPPY
+# include "snappy-c.h"
+#endif
+
+/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
+Also included in buf_flush_page_count. */
+ulint buf_lru_flush_page_count;
+
+/** Number of pages flushed. Protected by buf_pool.mutex. */
+ulint buf_flush_page_count;
+
+/** Flag indicating if the page_cleaner is in active state. */
+bool buf_page_cleaner_is_active;
+
+/** Factor for scan length to determine n_pages for intended oldest LSN
+progress */
+static constexpr ulint buf_flush_lsn_scan_factor = 3;
+
+/** Average redo generation rate */
+static lsn_t lsn_avg_rate = 0;
+
+/** Target oldest_modification for the page cleaner background flushing;
+writes are protected by buf_pool.flush_list_mutex */
+static Atomic_relaxed<lsn_t> buf_flush_async_lsn;
+/** Target oldest_modification for the page cleaner furious flushing;
+writes are protected by buf_pool.flush_list_mutex */
+static Atomic_relaxed<lsn_t> buf_flush_sync_lsn;
+
+#ifdef UNIV_PFS_THREAD
+mysql_pfs_key_t page_cleaner_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+/** Page cleaner structure */
+static struct
+{
+ /** total elapsed time in adaptive flushing, in seconds */
+ ulint flush_time;
+ /** number of adaptive flushing passes */
+ ulint flush_pass;
+} page_cleaner;
+
+#ifdef UNIV_DEBUG
+my_bool innodb_page_cleaner_disabled_debug;
+#endif /* UNIV_DEBUG */
+
+/** If the LRU list of the buffer pool is shorter than this, LRU eviction
+should not happen. This is because when we do LRU flushing we also put
+the blocks on the free list; if the LRU list is very small, we can end up
+thrashing. */
+#define BUF_LRU_MIN_LEN 256
+
+/* @} */
+
+#ifdef UNIV_DEBUG
+/** Validate the flush list. */
+static void buf_flush_validate_low();
+
+/** Validates the flush list some of the time. */
+static void buf_flush_validate_skip()
+{
+/** Try buf_flush_validate_low() every this many times */
+# define BUF_FLUSH_VALIDATE_SKIP 23
+
+ /** The buf_flush_validate_low() call skip counter.
+ Use a signed type because of the race condition below. */
+ static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+
+ /* There is a race condition below, but it does not matter,
+ because this call is only for heuristic purposes. We want to
+ reduce the call frequency of the costly buf_flush_validate_low()
+ check in debug builds. */
+ if (--buf_flush_validate_count > 0) {
+ return;
+ }
+
+ buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+ buf_flush_validate_low();
+}
+#endif /* UNIV_DEBUG */
+
+/** Wake up the page cleaner if needed */
+inline void buf_pool_t::page_cleaner_wakeup()
+{
+ if (!page_cleaner_idle())
+ return;
+ double dirty_pct= double(UT_LIST_GET_LEN(buf_pool.flush_list)) * 100.0 /
+ double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
+ double pct_lwm= srv_max_dirty_pages_pct_lwm;
+
+  /* If pct_lwm != 0.0, adaptive flushing is enabled.
+  Signal the page cleaner thread:
+  - if pct_lwm <= dirty_pct then it will invoke the adaptive flushing flow;
+  - if pct_lwm > dirty_pct then it will invoke the idle flushing flow.
+
+  idle_flushing:
+  dirty_pct < innodb_max_dirty_pages_pct_lwm, so it could be an
+  idle flushing use-case.
+
+  Why is last_activity_count not always updated?
+  - Let's first understand when the server activity count is updated.
+  - It is updated on the commit of a transaction in trx_t::commit() and not
+    on adding a page to the flush list.
+  - page_cleaner_wakeup() is called when a page is added to the flush list.
+
+  - Now let's say the first user thread updates the count from X -> Y but
+    has yet to commit the transaction (so the activity count is still Y).
+    Follow-up user threads will see the updated count (Y) matching
+    the universal server activity count (Y), giving a false impression that
+    the server is idle.
+
+  How to avoid this?
+  - By allowing last_activity_count to be updated when the page cleaner is
+    made active and has work to do. This ensures that the last_activity
+    signal is consumed by the page cleaner before the next one is
+    generated. */
+ if ((pct_lwm != 0.0 && pct_lwm <= dirty_pct) ||
+ (pct_lwm != 0.0 && last_activity_count == srv_get_activity_count()) ||
+ srv_max_buf_pool_modified_pct <= dirty_pct)
+ {
+ page_cleaner_is_idle= false;
+ pthread_cond_signal(&do_flush_list);
+ }
+}
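+
+/* Illustrative sketch, not part of the original change: the wakeup
+condition above with concrete numbers. With 8000 pages on the LRU list,
+2000 on the free list and 1500 on the flush list, dirty_pct is
+1500 * 100 / (8000 + 2000) = 15.0, so a pct_lwm of 10.0 would signal
+the page cleaner. server_idle stands in for the activity-count check. */
+#if 0 /* example only; compiles standalone if enabled */
+bool example_should_wake(double flush_len, double lru_len,
+                         double free_len, double pct_lwm,
+                         double max_modified_pct, bool server_idle)
+{
+  const double dirty_pct = flush_len * 100.0 / (lru_len + free_len);
+  return (pct_lwm != 0.0 && pct_lwm <= dirty_pct) /* adaptive flushing */
+      || (pct_lwm != 0.0 && server_idle)          /* idle flushing */
+      || max_modified_pct <= dirty_pct;           /* furious flushing */
+}
+/* example_should_wake(1500, 8000, 2000, 10.0, 90.0, false) == true */
+#endif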
+
+inline void buf_pool_t::delete_from_flush_list_low(buf_page_t *bpage)
+{
+ ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ flush_hp.adjust(bpage);
+ UT_LIST_REMOVE(flush_list, bpage);
+}
+
+/** Insert a modified block into the flush list.
+@param block modified block
+@param lsn start LSN of the mini-transaction that modified the block */
+void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn)
+{
+ mysql_mutex_assert_not_owner(&mutex);
+ mysql_mutex_assert_owner(&log_sys.flush_order_mutex);
+ ut_ad(lsn > 2);
+ ut_ad(!fsp_is_system_temporary(block->page.id().space()));
+
+ mysql_mutex_lock(&flush_list_mutex);
+ if (ut_d(const lsn_t old=) block->page.oldest_modification())
+ {
+ ut_ad(old == 1);
+ delete_from_flush_list_low(&block->page);
+ }
+ else
+ stat.flush_list_bytes+= block->physical_size();
+ ut_ad(stat.flush_list_bytes <= curr_pool_size);
+
+ block->page.set_oldest_modification(lsn);
+ MEM_CHECK_DEFINED(block->page.zip.data
+ ? block->page.zip.data : block->frame,
+ block->physical_size());
+ UT_LIST_ADD_FIRST(flush_list, &block->page);
+ ut_d(buf_flush_validate_skip());
+ page_cleaner_wakeup();
+ mysql_mutex_unlock(&flush_list_mutex);
+}
+
+/** Remove a block from flush_list.
+@param bpage buffer pool page
+@param clear whether to invoke buf_page_t::clear_oldest_modification() */
+void buf_pool_t::delete_from_flush_list(buf_page_t *bpage, bool clear)
+{
+ delete_from_flush_list_low(bpage);
+ stat.flush_list_bytes-= bpage->physical_size();
+ if (clear)
+ bpage->clear_oldest_modification();
+#ifdef UNIV_DEBUG
+ buf_flush_validate_skip();
+#endif /* UNIV_DEBUG */
+}
+
+/** Remove all dirty pages belonging to a given tablespace when we are
+deleting the data file of that tablespace.
+The pages still remain a part of the LRU list and are evicted
+as they age towards its tail.
+@param id tablespace identifier */
+void buf_flush_remove_pages(ulint id)
+{
+ const page_id_t first(id, 0), end(id + 1, 0);
+ ut_ad(id);
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ for (;;)
+ {
+ bool deferred= false;
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+ {
+ ut_d(const auto s= bpage->state());
+ ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
+ s == BUF_BLOCK_REMOVE_HASH);
+ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+
+ const page_id_t bpage_id(bpage->id());
+
+ if (bpage_id < first || bpage_id >= end);
+ else if (bpage->io_fix() != BUF_IO_NONE)
+ deferred= true;
+ else
+ buf_pool.delete_from_flush_list(bpage);
+
+ bpage= prev;
+ }
+
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (!deferred)
+ break;
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ os_thread_yield();
+ mysql_mutex_lock(&buf_pool.mutex);
+ buf_flush_wait_batch_end(false);
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage.
+IMPORTANT: When this function is called, bpage and dpage are not
+exact copies of each other. For example, each will have its own
+::state. Also, the ::list pointers in dpage may be stale. We must
+use the current list node (bpage) for the list manipulation, because
+the list pointers could have changed between the time that we copied
+the contents of bpage to dpage and the flush list manipulation
+below. */
+ATTRIBUTE_COLD
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage) /*!< in/out: destination block */
+{
+ buf_page_t* prev;
+
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+ ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+
+ const lsn_t lsn = bpage->oldest_modification();
+
+ if (!lsn) {
+ return;
+ }
+
+ ut_ad(lsn == 1 || lsn > 2);
+ ut_ad(dpage->oldest_modification() == lsn);
+
+ /* Important that we adjust the hazard pointer before removing
+ the bpage from the flush list. */
+ buf_pool.flush_hp.adjust(bpage);
+
+ prev = UT_LIST_GET_PREV(list, bpage);
+ UT_LIST_REMOVE(buf_pool.flush_list, bpage);
+
+ bpage->clear_oldest_modification();
+
+ if (lsn == 1) {
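+ /* The block had already been marked clean
+ (oldest_modification() == 1) but was still awaiting removal from
+ the flush list. Detach dpage instead of inserting it. */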
+ buf_pool.stat.flush_list_bytes -= dpage->physical_size();
+ dpage->list.prev = nullptr;
+ dpage->list.next = nullptr;
+ dpage->clear_oldest_modification();
+ } else if (prev) {
+ ut_ad(prev->oldest_modification());
+ UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev, dpage);
+ } else {
+ UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage);
+ }
+
+ ut_d(buf_flush_validate_low());
+}
+
+/** Complete write of a file page from buf_pool.
+@param request write request */
+void buf_page_write_complete(const IORequest &request)
+{
+ ut_ad(request.is_write());
+ ut_ad(!srv_read_only_mode/* ||
+ request.node->space->purpose == FIL_TYPE_TEMPORARY*/);
+ buf_page_t *bpage= request.bpage;
+ ut_ad(bpage);
+ ut_ad(bpage->in_file());
+ /* bpage->io_fix() can only be changed by buf_page_write_complete()
+ and buf_page_read_complete() from BUF_IO_READ or BUF_IO_WRITE */
+ ut_ad(bpage->io_fix() == BUF_IO_WRITE);
+ ut_ad(!buf_dblwr.is_inside(bpage->id()));
+ ut_ad(request.node->space->id == bpage->id().space());
+
+ if (bpage->status == buf_page_t::INIT_ON_FLUSH)
+ bpage->status= buf_page_t::NORMAL;
+ else
+ {
+ ut_ad(bpage->status == buf_page_t::NORMAL);
+ if (request.node->space->use_doublewrite())
+ {
+ ut_ad(request.node->space != fil_system.temp_space);
+ buf_dblwr.write_completed();
+ }
+ }
+
+ if (bpage->slot)
+ {
+ bpage->slot->release();
+ bpage->slot= nullptr;
+ }
+
+ if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
+ buf_page_monitor(bpage, BUF_IO_WRITE);
+ DBUG_PRINT("ib_buf", ("write page %u:%u",
+ bpage->id().space(), bpage->id().page_no()));
+ const bool temp= fsp_is_system_temporary(bpage->id().space());
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ buf_pool.stat.n_pages_written++;
+ /* While we do not need any mutex for clearing oldest_modification
+ here, we hope that it will be in the same cache line with io_fix,
+ whose changes must be protected by buf_pool.mutex. */
+ bpage->clear_oldest_modification(temp);
+ ut_ad(bpage->io_fix() == BUF_IO_WRITE);
+ bpage->set_io_fix(BUF_IO_NONE);
+
+ /* Because the thread that does the unlocking might not be the one
+ that did the locking, we use a pass value != 0 in unlock, which simply
+ removes the newest lock debug record, without checking the thread id. */
+ if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+ rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_WRITE);
+
+ if (request.is_LRU())
+ {
+ buf_LRU_free_page(bpage, true);
+
+ ut_ad(buf_pool.n_flush_LRU_);
+ if (!--buf_pool.n_flush_LRU_)
+ {
+ pthread_cond_broadcast(&buf_pool.done_flush_LRU);
+ pthread_cond_signal(&buf_pool.done_free);
+ }
+ }
+ else
+ {
+ ut_ad(!temp);
+ ut_ad(buf_pool.n_flush_list_);
+ if (!--buf_pool.n_flush_list_)
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
+@param[in,out] page page to update
+@param[in] size compressed page size */
+void buf_flush_update_zip_checksum(buf_frame_t *page, ulint size)
+{
+ ut_ad(size > 0);
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+ page_zip_calc_checksum(page, size,
+ static_cast<srv_checksum_algorithm_t>
+ (srv_checksum_algorithm)));
+}
+
+/** Assign the full crc32 checksum for non-compressed page.
+@param[in,out] page page to be updated */
+void buf_flush_assign_full_crc32_checksum(byte* page)
+{
+ ut_d(bool compressed = false);
+ ut_d(bool corrupted = false);
+ ut_d(const uint size = buf_page_full_crc32_size(page, &compressed,
+ &corrupted));
+ ut_ad(!compressed);
+ ut_ad(!corrupted);
+ ut_ad(size == uint(srv_page_size));
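+ /* The full_crc32 checksum occupies the last FIL_PAGE_FCRC32_CHECKSUM
+ (4) bytes of the page and covers everything that precedes it. */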
+ const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM;
+ mach_write_to_4(page + payload, ut_crc32(page, payload));
+}
+
+/** Initialize a page for writing to the tablespace.
+@param[in] block buffer block; NULL if bypassing
+ the buffer pool
+@param[in,out] page page frame
+@param[in,out] page_zip_ compressed page, or NULL if
+ uncompressed
+@param[in] use_full_checksum whether tablespace uses full checksum */
+void
+buf_flush_init_for_writing(
+ const buf_block_t* block,
+ byte* page,
+ void* page_zip_,
+ bool use_full_checksum)
+{
+ if (block != NULL && block->frame != page) {
+ /* If the page is encrypted in full_crc32 format, the
+ checksum was already stored as part of fil_encrypt_buf(). */
+ ut_ad(use_full_checksum);
+ return;
+ }
+
+ ut_ad(block == NULL || block->frame == page);
+ ut_ad(block == NULL || page_zip_ == NULL
+ || &block->page.zip == page_zip_);
+ ut_ad(page);
+
+ if (page_zip_) {
+ page_zip_des_t* page_zip;
+ ulint size;
+
+ page_zip = static_cast<page_zip_des_t*>(page_zip_);
+ size = page_zip_get_size(page_zip);
+
+ ut_ad(size);
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size <= UNIV_ZIP_SIZE_MAX);
+
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ /* These are essentially uncompressed pages. */
+ memcpy(page_zip->data, page, size);
+ /* fall through */
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ buf_flush_update_zip_checksum(page_zip->data, size);
+ return;
+ }
+
+ ib::error() << "The compressed page to be written"
+ " seems corrupt:";
+ ut_print_buf(stderr, page, size);
+ fputs("\nInnoDB: Possibly older version of the page:", stderr);
+ ut_print_buf(stderr, page_zip->data, size);
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ if (use_full_checksum) {
+ static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "aligned");
+ static_assert(FIL_PAGE_LSN % 4 == 0, "aligned");
+ memcpy_aligned<4>(page + srv_page_size
+ - FIL_PAGE_FCRC32_END_LSN,
+ FIL_PAGE_LSN + 4 + page, 4);
+ return buf_flush_assign_full_crc32_checksum(page);
+ }
+
+ static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 8 == 0, "aligned");
+ static_assert(FIL_PAGE_LSN % 8 == 0, "aligned");
+ memcpy_aligned<8>(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ FIL_PAGE_LSN + page, 8);
+
+ if (block && srv_page_size == 16384) {
+ /* The page type could be garbage in old files
+ created before MySQL 5.5. Such files always
+ had a page size of 16 kilobytes. */
+ ulint page_type = fil_page_get_type(page);
+ ulint reset_type = page_type;
+
+ switch (block->page.id().page_no() % 16384) {
+ case 0:
+ reset_type = block->page.id().page_no() == 0
+ ? FIL_PAGE_TYPE_FSP_HDR
+ : FIL_PAGE_TYPE_XDES;
+ break;
+ case 1:
+ reset_type = FIL_PAGE_IBUF_BITMAP;
+ break;
+ case FSP_TRX_SYS_PAGE_NO:
+ if (block->page.id()
+ == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO)) {
+ reset_type = FIL_PAGE_TYPE_TRX_SYS;
+ break;
+ }
+ /* fall through */
+ default:
+ switch (page_type) {
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_TYPE_INSTANT:
+ case FIL_PAGE_RTREE:
+ case FIL_PAGE_UNDO_LOG:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_FREE_LIST:
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_TYPE_SYS:
+ case FIL_PAGE_TYPE_TRX_SYS:
+ case FIL_PAGE_TYPE_BLOB:
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ break;
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ case FIL_PAGE_IBUF_BITMAP:
+ /* These pages should have
+ predetermined page numbers
+ (see above). */
+ default:
+ reset_type = FIL_PAGE_TYPE_UNKNOWN;
+ break;
+ }
+ }
+
+ if (UNIV_UNLIKELY(page_type != reset_type)) {
+ ib::info()
+ << "Resetting invalid page "
+ << block->page.id() << " type "
+ << page_type << " to "
+ << reset_type << " when flushing.";
+ fil_page_set_type(page, reset_type);
+ }
+ }
+
+ uint32_t checksum = BUF_NO_CHECKSUM_MAGIC;
+
+ switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) {
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ checksum = buf_calc_page_new_checksum(page);
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+ checksum);
+ /* With the InnoDB checksum, we overwrite the first 4 bytes of
+ the end lsn field to store the old formula checksum. Since it
+ depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
+ be calculated after storing the new formula checksum. */
+ checksum = buf_calc_page_old_checksum(page);
+ break;
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ /* In other cases we write the same checksum to both fields. */
+ checksum = buf_calc_page_crc32(page);
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+ checksum);
+ break;
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+ checksum);
+ break;
+ /* no default so that the compiler will emit a warning if a
+ new enum value is added and not handled here */
+ }
+
+ mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ checksum);
+}
+
+/** Reserve a buffer for compression.
+@param[in,out] slot reserved slot */
+static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot)
+{
+ if (slot->comp_buf)
+ return;
+ /* Both Snappy and LZO compression methods require that the output
+ buffer be bigger than the input buffer. Adjust the allocated size. */
+ ulint size= srv_page_size;
+#ifdef HAVE_LZO
+ size+= LZO1X_1_15_MEM_COMPRESS;
+#elif defined HAVE_SNAPPY
+ size= snappy_max_compressed_length(size);
+#endif
+ slot->comp_buf= static_cast<byte*>(aligned_malloc(size, srv_page_size));
+}
+
+/** Encrypt a buffer of temporary tablespace
+@param[in] offset Page offset
+@param[in] s Page to encrypt
+@param[in,out] d Output buffer
+@return encrypted buffer or NULL */
+static byte* buf_tmp_page_encrypt(ulint offset, const byte* s, byte* d)
+{
+ /* Calculate the start offset in a page */
+ uint srclen= static_cast<uint>(srv_page_size) -
+ (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION +
+ FIL_PAGE_FCRC32_CHECKSUM);
+ const byte* src= s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+ byte* dst= d + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+
+ memcpy(d, s, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
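+ /* The first FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION bytes of the page
+ header remain in clear text. The rest of the page, except the trailing
+ FIL_PAGE_FCRC32_CHECKSUM bytes, is encrypted, and the checksum is
+ recomputed over the encrypted page below. */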
+
+ if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size), true))
+ return NULL;
+
+ const ulint payload= srv_page_size - FIL_PAGE_FCRC32_CHECKSUM;
+ mach_write_to_4(d + payload, ut_crc32(d, payload));
+
+ srv_stats.pages_encrypted.inc();
+ srv_stats.n_temp_blocks_encrypted.inc();
+ return d;
+}
+
+/** Encryption and page_compression hook that is called just before
+a page is written to disk.
+@param[in,out] space tablespace
+@param[in,out] bpage buffer page
+@param[in] s physical page frame that is being encrypted
+@param[in,out] size payload size in bytes
+@return page frame to be written to file
+(may be src_frame or an encrypted/compressed copy of it) */
+static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s,
+ size_t *size)
+{
+ ut_ad(bpage->status != buf_page_t::FREED);
+ ut_ad(space->id == bpage->id().space());
+
+ ut_d(fil_page_type_validate(space, s));
+ const uint32_t page_no= bpage->id().page_no();
+
+ switch (page_no) {
+ case TRX_SYS_PAGE_NO:
+ if (bpage->id().space() != TRX_SYS_SPACE)
+ break;
+ /* The TRX_SYS page is neither encrypted nor compressed, because
+ it contains the address of the doublewrite buffer. */
+ /* fall through */
+ case 0:
+ /* Page 0 of a tablespace is not encrypted/compressed */
+ return s;
+ }
+
+ fil_space_crypt_t *crypt_data= space->crypt_data;
+ bool encrypted, page_compressed;
+ if (space->purpose == FIL_TYPE_TEMPORARY)
+ {
+ ut_ad(!crypt_data);
+ encrypted= innodb_encrypt_temporary_tables;
+ page_compressed= false;
+ }
+ else
+ {
+ encrypted= crypt_data && !crypt_data->not_encrypted() &&
+ crypt_data->type != CRYPT_SCHEME_UNENCRYPTED &&
+ (!crypt_data->is_default_encryption() || srv_encrypt_tables);
+ page_compressed= space->is_compressed();
+ }
+
+ const bool full_crc32= space->full_crc32();
+
+ if (!encrypted && !page_compressed)
+ {
+ /* No need to encrypt or compress. Clear key-version & crypt-checksum. */
+ static_assert(FIL_PAGE_FCRC32_KEY_VERSION % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION % 4 == 2,
+ "not perfect alignment");
+ if (full_crc32)
+ memset_aligned<4>(s + FIL_PAGE_FCRC32_KEY_VERSION, 0, 4);
+ else
+ memset_aligned<2>(s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+ return s;
+ }
+
+ static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_LSN % 8 == 0, "alignment");
+ if (full_crc32)
+ memcpy_aligned<4>(s + srv_page_size - FIL_PAGE_FCRC32_END_LSN,
+ FIL_PAGE_LSN + 4 + s, 4);
+
+ ut_ad(!bpage->zip_size() || !page_compressed);
+ /* Find free slot from temporary memory array */
+ buf_tmp_buffer_t *slot= buf_pool.io_buf_reserve();
+ ut_a(slot);
+ slot->allocate();
+ slot->out_buf= NULL;
+ bpage->slot= slot;
+
+ byte *d= slot->crypt_buf;
+
+ if (!page_compressed)
+ {
+not_compressed:
+ byte *tmp= space->purpose == FIL_TYPE_TEMPORARY
+ ? buf_tmp_page_encrypt(page_no, s, d)
+ : fil_space_encrypt(space, page_no, s, d);
+
+ slot->out_buf= d= tmp;
+
+ ut_d(fil_page_type_validate(space, tmp));
+ }
+ else
+ {
+ ut_ad(space->purpose != FIL_TYPE_TEMPORARY);
+ /* First we compress the page content */
+ buf_tmp_reserve_compression_buf(slot);
+ byte *tmp= slot->comp_buf;
+ ulint len= fil_page_compress(s, tmp, space->flags,
+ fil_space_get_block_size(space, page_no),
+ encrypted);
+
+ if (!len)
+ goto not_compressed;
+
+ *size= len;
+
+ if (full_crc32)
+ {
+ ut_d(bool compressed = false);
+ len= buf_page_full_crc32_size(tmp,
+#ifdef UNIV_DEBUG
+ &compressed,
+#else
+ NULL,
+#endif
+ NULL);
+ ut_ad(compressed);
+ }
+
+ /* Workaround for MDEV-15527. */
+ memset(tmp + len, 0 , srv_page_size - len);
+ ut_d(fil_page_type_validate(space, tmp));
+
+ if (encrypted)
+ tmp = fil_space_encrypt(space, page_no, tmp, d);
+
+ if (full_crc32)
+ {
+ static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment");
+ mach_write_to_4(tmp + len - 4, ut_crc32(tmp, len - 4));
+ ut_ad(!buf_page_is_corrupted(true, tmp, space->flags));
+ }
+
+ slot->out_buf= d= tmp;
+ }
+
+ ut_d(fil_page_type_validate(space, d));
+ return d;
+}
+
+/** Free a page whose underlying file page has been freed. */
+inline void buf_pool_t::release_freed_page(buf_page_t *bpage)
+{
+ ut_ad(bpage->in_file());
+ const bool uncompressed= bpage->state() == BUF_BLOCK_FILE_PAGE;
+ mysql_mutex_lock(&mutex);
+ bpage->set_io_fix(BUF_IO_NONE);
+ bpage->status= buf_page_t::NORMAL;
+ mysql_mutex_lock(&flush_list_mutex);
+ ut_d(const lsn_t oldest_modification= bpage->oldest_modification();)
+ if (fsp_is_system_temporary(bpage->id().space()))
+ {
+ ut_ad(uncompressed);
+ ut_ad(oldest_modification == 2);
+ }
+ else
+ {
+ ut_ad(oldest_modification > 2);
+ delete_from_flush_list(bpage, false);
+ }
+ bpage->clear_oldest_modification();
+ mysql_mutex_unlock(&flush_list_mutex);
+
+ if (uncompressed)
+ rw_lock_sx_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock,
+ BUF_IO_WRITE);
+
+ buf_LRU_free_page(bpage, true);
+ mysql_mutex_unlock(&mutex);
+}
+
+/** Write a flushable page from buf_pool to a file.
+buf_pool.mutex must be held.
+@param bpage buffer control block
+@param lru true=buf_pool.LRU; false=buf_pool.flush_list
+@param space tablespace
+@return whether the page was flushed and buf_pool.mutex was released */
+static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
+{
+ ut_ad(bpage->in_file());
+ ut_ad(bpage->ready_for_flush());
+ ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
+ (space == fil_system.temp_space));
+ ut_ad(space->purpose == FIL_TYPE_TABLESPACE ||
+ space->atomic_write_supported);
+ ut_ad(space->referenced());
+ ut_ad(lru || space != fil_system.temp_space);
+
+ rw_lock_t *rw_lock;
+
+ if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+ rw_lock= nullptr;
+ else
+ {
+ rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
+ if (!rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE))
+ return false;
+ }
+
+ bpage->set_io_fix(BUF_IO_WRITE);
+ /* Because bpage->status can only be changed while buf_block_t
+ exists, it cannot be modified for ROW_FORMAT=COMPRESSED pages
+ without first allocating the uncompressed page frame. Such
+ allocation cannot be completed due to our io_fix. So, bpage->status
+ is protected even if !rw_lock. */
+ const auto status= bpage->status;
+
+ if (status != buf_page_t::FREED)
+ {
+ if (lru)
+ buf_pool.n_flush_LRU_++;
+ else
+ buf_pool.n_flush_list_++;
+ buf_flush_page_count++;
+ }
+
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+
+ /* We are holding rw_lock = buf_block_t::lock in SX mode except if
+ this is a ROW_FORMAT=COMPRESSED page whose uncompressed page frame
+ has been evicted from the buffer pool.
+
+ Apart from possible rw_lock protection, bpage is also protected by
+ io_fix and oldest_modification()!=0. Thus, it cannot be relocated in
+ the buffer pool or removed from flush_list or LRU_list. */
+
+ DBUG_PRINT("ib_buf", ("%s %u page %u:%u",
+ lru ? "LRU" : "flush_list",
+ bpage->id().space(), bpage->id().page_no()));
+ ut_ad(bpage->io_fix() == BUF_IO_WRITE);
+ ut_d(const lsn_t oldest_modification= bpage->oldest_modification());
+ ut_ad(space == fil_system.temp_space
+ ? oldest_modification == 2
+ : oldest_modification > 2);
+ ut_ad(bpage->state() ==
+ (rw_lock ? BUF_BLOCK_FILE_PAGE : BUF_BLOCK_ZIP_PAGE));
+ ut_ad(ULINT_UNDEFINED >
+ (lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_));
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
+ page_t *frame= bpage->zip.data;
+
+ if (status == buf_page_t::FREED)
+ buf_pool.release_freed_page(&block->page);
+ else
+ {
+ space->reacquire();
+ ut_ad(status == buf_page_t::NORMAL || status == buf_page_t::INIT_ON_FLUSH);
+ size_t size;
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ size_t orig_size;
+#endif
+ IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC;
+
+ if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */
+ {
+ ut_ad(!space->full_crc32());
+ ut_ad(!space->is_compressed()); /* not page_compressed */
+ size= bpage->zip_size();
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ orig_size= size;
+#endif
+ buf_flush_update_zip_checksum(frame, size);
+ frame= buf_page_encrypt(space, bpage, frame, &size);
+ ut_ad(size == bpage->zip_size());
+ }
+ else
+ {
+ byte *page= block->frame;
+ size= block->physical_size();
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ orig_size= size;
+#endif
+
+ if (space->full_crc32())
+ {
+ /* innodb_checksum_algorithm=full_crc32 is not implemented for
+ ROW_FORMAT=COMPRESSED pages. */
+ ut_ad(!frame);
+ page= buf_page_encrypt(space, bpage, page, &size);
+ buf_flush_init_for_writing(block, page, nullptr, true);
+ }
+ else
+ {
+ buf_flush_init_for_writing(block, page, frame ? &bpage->zip : nullptr,
+ false);
+ page= buf_page_encrypt(space, bpage, frame ? frame : page, &size);
+ }
+
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ if (size != orig_size && space->punch_hole)
+ type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;
+#endif
+ frame=page;
+ }
+
+ ut_ad(status == bpage->status);
+ ut_ad(oldest_modification == bpage->oldest_modification());
+
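+ /* An INIT_ON_FLUSH page was initialized by redo log records written
+ after the latest checkpoint, so recovery can rebuild it without its
+ previous contents. Such pages, and pages of tablespaces that do not
+ use the doublewrite buffer, are written out directly, after
+ write-ahead logging has been enforced up to the page LSN. */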
+ if (status != buf_page_t::NORMAL || !space->use_doublewrite())
+ {
+ if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
+ {
+ const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+ (FIL_PAGE_LSN + (frame ? frame
+ : block->frame)));
+ ut_ad(lsn >= oldest_modification);
+ if (lsn > log_sys.get_flushed_lsn())
+ log_write_up_to(lsn, true);
+ }
+ space->io(IORequest(type, bpage),
+ bpage->physical_offset(), size, frame, bpage);
+ }
+ else
+ buf_dblwr.add_to_batch(IORequest(bpage, space->chain.start, type), size);
+ }
+
+ /* Increment the I/O operation count used for selecting LRU policy. */
+ buf_LRU_stat_inc_io();
+ return true;
+}
+
+/** Check whether a page can be flushed from the buf_pool.
+@param id page identifier
+@param fold id.fold()
+@param lru true=buf_pool.LRU; false=buf_pool.flush_list
+@return whether the page can be flushed */
+static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(fold == id.fold());
+
+ buf_page_t *bpage= buf_pool.page_hash_get_low(id, fold);
+
+ if (!bpage || buf_pool.watch_is_sentinel(*bpage))
+ return false;
+
+ /* We avoid flushing 'non-old' blocks in an LRU flush, because the
+ flushed blocks are soon freed */
+ if (lru && !bpage->is_old())
+ return false;
+
+ return bpage->oldest_modification() > 1 && bpage->ready_for_flush();
+}
+
+/** Check which neighbors of a page can be flushed from the buf_pool.
+@param space tablespace
+@param id page identifier of a dirty page
+@param contiguous whether to consider contiguous areas of pages
+@param lru true=buf_pool.LRU; false=buf_pool.flush_list
+@return last page number that can be flushed */
+static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
+ page_id_t &id, bool contiguous,
+ bool lru)
+{
+ ut_ad(id.page_no() < space.size);
+ /* When flushed, dirty blocks are searched in neighborhoods of this
+ size, and flushed along with the original page. */
+ const ulint s= buf_pool.curr_size / 16;
+ const uint32_t read_ahead= buf_pool.read_ahead_area;
+ const uint32_t buf_flush_area= read_ahead > s
+ ? static_cast<uint32_t>(s) : read_ahead;
+ page_id_t low= id - (id.page_no() % buf_flush_area);
+ page_id_t high= low + buf_flush_area;
+ high.set_page_no(std::min(high.page_no(), space.last_page_number()));
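+ /* Example: with buf_flush_area=64 and id.page_no()=100, the area spans
+ low=page 64 to high=page 128 (capped by the last page of the space). */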
+
+ if (!contiguous)
+ {
+ high= std::max(id + 1, high);
+ id= low;
+ return high;
+ }
+
+ /* Determine the contiguous dirty area around id. */
+ const ulint id_fold= id.fold();
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ if (id > low)
+ {
+ ulint fold= id_fold;
+ for (page_id_t i= id - 1;; --i)
+ {
+ fold--;
+ if (!buf_flush_check_neighbor(i, fold, lru))
+ {
+ low= i + 1;
+ break;
+ }
+ if (i == low)
+ break;
+ }
+ }
+
+ page_id_t i= id;
+ id= low;
+ ulint fold= id_fold;
+ while (++i < high)
+ {
+ ++fold;
+ if (!buf_flush_check_neighbor(i, fold, lru))
+ break;
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return i;
+}
+
+MY_ATTRIBUTE((nonnull))
+/** Punch holes for the freed page ranges, or overwrite them with zeroes
+when innodb_immediate_scrub_data_uncompressed is enabled.
+@param space tablespace which may contain ranges of freed pages */
+static void buf_flush_freed_pages(fil_space_t *space)
+{
+ const bool punch_hole= space->punch_hole;
+ if (!srv_immediate_scrub_data_uncompressed && !punch_hole)
+ return;
+ lsn_t flush_to_disk_lsn= log_sys.get_flushed_lsn();
+
+ std::unique_lock<std::mutex> freed_lock(space->freed_range_mutex);
+ if (space->freed_ranges.empty()
+ || flush_to_disk_lsn < space->get_last_freed_lsn())
+ {
+ freed_lock.unlock();
+ return;
+ }
+
+ range_set freed_ranges= std::move(space->freed_ranges);
+ freed_lock.unlock();
+
+ for (const auto &range : freed_ranges)
+ {
+ const ulint physical_size= space->physical_size();
+
+ if (punch_hole)
+ {
+ space->reacquire();
+ space->io(IORequest(IORequest::PUNCH_RANGE),
+ os_offset_t{range.first} * physical_size,
+ (range.last - range.first + 1) * physical_size,
+ nullptr);
+ }
+ else if (srv_immediate_scrub_data_uncompressed)
+ {
+ for (os_offset_t i= range.first; i <= range.last; i++)
+ {
+ space->reacquire();
+ space->io(IORequest(IORequest::WRITE_ASYNC),
+ i * physical_size, physical_size,
+ const_cast<byte*>(field_ref_zero));
+ }
+ }
+ buf_pool.stat.n_pages_written+= (range.last - range.first + 1);
+ }
+}
+
+/** Flushes to disk all flushable pages within the flush area
+and also write zeroes or punch the hole for the freed ranges of pages.
+@param space tablespace
+@param page_id page identifier
+@param contiguous whether to consider contiguous areas of pages
+@param lru true=buf_pool.LRU; false=buf_pool.flush_list
+@param n_flushed number of pages flushed so far in this batch
+@param n_to_flush maximum number of pages we are allowed to flush
+@return number of pages flushed */
+static ulint buf_flush_try_neighbors(fil_space_t *space,
+ const page_id_t page_id,
+ bool contiguous, bool lru,
+ ulint n_flushed, ulint n_to_flush)
+{
+ ut_ad(space->id == page_id.space());
+
+ ulint count= 0;
+ page_id_t id= page_id;
+ page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, lru);
+
+ ut_ad(page_id >= id);
+ ut_ad(page_id < high);
+
+ for (ulint id_fold= id.fold(); id < high && !space->is_stopping();
+ ++id, ++id_fold)
+ {
+ if (count + n_flushed >= n_to_flush)
+ {
+ if (id > page_id)
+ break;
+ /* If the page whose neighbors we are flushing has not been
+ flushed yet, we must flush the page that we selected originally. */
+ id= page_id;
+ id_fold= id.fold();
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ if (buf_page_t *bpage= buf_pool.page_hash_get_low(id, id_fold))
+ {
+ ut_ad(bpage->in_file());
+ /* We avoid flushing 'non-old' blocks in an LRU flush,
+ because the flushed blocks are soon freed */
+ if (!lru || id == page_id || bpage->is_old())
+ {
+ if (!buf_pool.watch_is_sentinel(*bpage) &&
+ bpage->oldest_modification() > 1 &&
+ bpage->ready_for_flush() && buf_flush_page(bpage, lru, space))
+ {
+ ++count;
+ continue;
+ }
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+
+ if (auto n= count - 1)
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+ MONITOR_FLUSH_NEIGHBOR_COUNT,
+ MONITOR_FLUSH_NEIGHBOR_PAGES, n);
+ }
+
+ return count;
+}
+
+/*******************************************************************//**
+This utility moves the uncompressed frames of pages to the free list.
+Note that this function does not actually flush any data to disk. It
+just detaches the uncompressed frames from the compressed pages at the
+tail of the unzip_LRU and puts those freed frames in the free list.
+Note that this is a best-effort attempt, and it is not guaranteed that
+after a call to this function there will be 'max' blocks in the free
+list.
+@param[in] max desired number of blocks in the free_list
+@return number of blocks moved to the free list. */
+static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
+{
+ ulint scanned = 0;
+ ulint count = 0;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+
+ while (block
+ && count < max
+ && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth
+ && UT_LIST_GET_LEN(buf_pool.unzip_LRU)
+ > UT_LIST_GET_LEN(buf_pool.LRU) / 10) {
+
+ ++scanned;
+ if (buf_LRU_free_page(&block->page, false)) {
+ /* Block was freed. buf_pool.mutex potentially
+ released and reacquired */
+ ++count;
+ block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+ } else {
+ block = UT_LIST_GET_PREV(unzip_LRU, block);
+ }
+ }
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+ scanned);
+ }
+
+ return(count);
+}
+
+/** Start writing out pages for a tablespace.
+@param id tablespace identifier
+@return tablespace
+@retval nullptr if the pages for this tablespace should be discarded */
+static fil_space_t *buf_flush_space(const uint32_t id)
+{
+ fil_space_t *space= fil_space_t::get(id);
+ if (space)
+ buf_flush_freed_pages(space);
+ return space;
+}
+
+struct flush_counters_t
+{
+ /** number of dirty pages flushed */
+ ulint flushed;
+ /** number of clean pages evicted */
+ ulint evicted;
+};
+
+/** Try to discard a dirty page.
+@param bpage dirty page whose tablespace is not accessible */
+static void buf_flush_discard_page(buf_page_t *bpage)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+ ut_ad(bpage->in_file());
+ ut_ad(bpage->oldest_modification());
+
+ rw_lock_t *rw_lock;
+
+ if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+ rw_lock= nullptr;
+ else
+ {
+ rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
+ if (!rw_lock_sx_lock_nowait(rw_lock, 0))
+ return;
+ }
+
+ bpage->status= buf_page_t::NORMAL;
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_pool.delete_from_flush_list(bpage);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (rw_lock)
+ rw_lock_sx_unlock(rw_lock);
+
+ buf_LRU_free_page(bpage, true);
+}
+
+/** Flush dirty blocks from the end of the LRU list.
+@param max maximum number of blocks to make available in buf_pool.free
+@param n counts of flushed and evicted pages */
+static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
+{
+ ulint scanned= 0;
+ ulint free_limit= srv_LRU_scan_depth;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ if (buf_pool.withdraw_target && buf_pool.curr_size < buf_pool.old_size)
+ free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw);
+
+ const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
+ ? 0 : srv_flush_neighbors;
+ fil_space_t *space= nullptr;
+ uint32_t last_space_id= FIL_NULL;
+ static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
+ static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU);
+ bpage && n->flushed + n->evicted < max &&
+ UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN &&
+ UT_LIST_GET_LEN(buf_pool.free) < free_limit; ++scanned)
+ {
+ buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
+ const lsn_t oldest_modification= bpage->oldest_modification();
+ buf_pool.lru_hp.set(prev);
+
+ if (oldest_modification <= 1 && bpage->can_relocate())
+ {
+ /* The block is ready for eviction, i.e., it is clean and is
+ neither IO-fixed nor buffer-fixed. */
+ if (buf_LRU_free_page(bpage, true))
+ ++n->evicted;
+ }
+ else if (oldest_modification > 1 && bpage->ready_for_flush())
+ {
+ /* The block is ready for flushing. Dispatch an IO request. The IO
+ helper thread will put it on the free list in the IO completion
+ routine. */
+ const page_id_t page_id(bpage->id());
+ const uint32_t space_id= page_id.space();
+ if (!space || space->id != space_id)
+ {
+ if (last_space_id != space_id)
+ {
+ if (space)
+ space->release();
+ space= buf_flush_space(space_id);
+ last_space_id= space_id;
+ }
+ else
+ ut_ad(!space);
+ }
+ else if (space->is_stopping())
+ {
+ space->release();
+ space= nullptr;
+ }
+
+ if (!space)
+ buf_flush_discard_page(bpage);
+ else if (neighbors && space->is_rotational())
+ {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
+ true, n->flushed, max);
+reacquire_mutex:
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+ else if (buf_flush_page(bpage, true, space))
+ {
+ ++n->flushed;
+ goto reacquire_mutex;
+ }
+ }
+ else
+ /* Can't evict or dispatch this block. Go to previous. */
+ ut_ad(buf_pool.lru_hp.is_hp(prev));
+ bpage= buf_pool.lru_hp.get();
+ }
+
+ buf_pool.lru_hp.set(nullptr);
+
+ if (space)
+ space->release();
+
+ /* We keep track of all flushes happening as part of LRU flush. When
+ estimating the desired rate at which flush_list should be flushed,
+ we factor in this value. */
+ buf_lru_flush_page_count+= n->flushed;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ if (scanned)
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+ scanned);
+}
+
+/** Flush and move pages from LRU or unzip_LRU list to the free list.
+Whether LRU or unzip_LRU is used depends on the state of the system.
+@param max maximum number of blocks to make available in buf_pool.free
+@return number of flushed pages */
+static ulint buf_do_LRU_batch(ulint max)
+{
+ const ulint n_unzip_LRU_evicted= buf_LRU_evict_from_unzip_LRU()
+ ? buf_free_from_unzip_LRU_list_batch(max)
+ : 0;
+ flush_counters_t n;
+ n.flushed= 0;
+ n.evicted= n_unzip_LRU_evicted;
+ buf_flush_LRU_list_batch(max, &n);
+
+ if (const ulint evicted= n.evicted - n_unzip_LRU_evicted)
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_EVICT_COUNT,
+ MONITOR_LRU_BATCH_EVICT_PAGES,
+ evicted);
+ }
+
+ return n.flushed;
+}
+
+/** This utility flushes dirty blocks from the end of the flush_list.
+The calling thread is not allowed to own any latches on pages!
+@param max_n maximum number of blocks to flush
+@param lsn once an oldest_modification>=lsn is found, terminate the batch
+@return number of blocks for which the write request was queued */
+static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
+{
+ ulint count= 0;
+ ulint scanned= 0;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
+ ? 0 : srv_flush_neighbors;
+ fil_space_t *space= nullptr;
+ uint32_t last_space_id= FIL_NULL;
+ static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
+ static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
+
+ /* Start from the end of the list looking for a suitable block to be
+ flushed. */
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
+ bpage && len && count < max_n; ++scanned, len--)
+ {
+ const lsn_t oldest_modification= bpage->oldest_modification();
+ if (oldest_modification >= lsn)
+ break;
+ ut_ad(bpage->in_file());
+
+ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+
+ if (oldest_modification == 1)
+ {
+ buf_pool.delete_from_flush_list(bpage);
+ skip:
+ bpage= prev;
+ continue;
+ }
+
+ ut_ad(oldest_modification > 2);
+ ut_ad(bpage->in_file());
+
+ if (!bpage->ready_for_flush())
+ goto skip;
+
+ /* In order not to degenerate this scan to O(n*n) we attempt to
+ preserve the pointer position. Any thread that would remove 'prev'
+ from buf_pool.flush_list must adjust the hazard pointer.
+
+ Note: A concurrent execution of buf_flush_list_space() may
+ terminate this scan prematurely. The buf_pool.n_flush_list()
+ should prevent multiple threads from executing
+ buf_do_flush_list_batch() concurrently,
+ but buf_flush_list_space() is ignoring that. */
+ buf_pool.flush_hp.set(prev);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ const page_id_t page_id(bpage->id());
+ const uint32_t space_id= page_id.space();
+ if (!space || space->id != space_id)
+ {
+ if (last_space_id != space_id)
+ {
+ if (space)
+ space->release();
+ space= buf_flush_space(space_id);
+ last_space_id= space_id;
+ }
+ else
+ ut_ad(!space);
+ }
+ else if (space->is_stopping())
+ {
+ space->release();
+ space= nullptr;
+ }
+
+ if (!space)
+ buf_flush_discard_page(bpage);
+ else if (neighbors && space->is_rotational())
+ {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ count+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
+ false, count, max_n);
+ reacquire_mutex:
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+ else if (buf_flush_page(bpage, false, space))
+ {
+ ++count;
+ goto reacquire_mutex;
+ }
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ bpage= buf_pool.flush_hp.get();
+ }
+
+ buf_pool.flush_hp.set(nullptr);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (space)
+ space->release();
+
+ if (scanned)
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
+ MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+ MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+ scanned);
+ if (count)
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_COUNT,
+ MONITOR_FLUSH_BATCH_PAGES,
+ count);
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ return count;
+}
+
+/** Wait until a flush batch ends.
+@param lru true=buf_pool.LRU; false=buf_pool.flush_list */
+void buf_flush_wait_batch_end(bool lru)
+{
+ const auto &n_flush= lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_;
+
+ if (n_flush)
+ {
+ auto cond= lru ? &buf_pool.done_flush_LRU : &buf_pool.done_flush_list;
+ tpool::tpool_wait_begin();
+ thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+ do
+ my_cond_wait(cond, &buf_pool.mutex.m_mutex);
+ while (n_flush);
+ tpool::tpool_wait_end();
+ thd_wait_end(nullptr);
+ pthread_cond_broadcast(cond);
+ }
+}
+
+/** Write out dirty blocks from buf_pool.flush_list.
+@param max_n desired maximum number of blocks flushed
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@return the number of processed pages
+@retval 0 if a buf_pool.flush_list batch is already running */
+ulint buf_flush_list(ulint max_n, lsn_t lsn)
+{
+ ut_ad(lsn);
+
+ if (buf_pool.n_flush_list())
+ return 0;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ const bool running= buf_pool.n_flush_list_ != 0;
+ /* FIXME: we are performing a dirty read of buf_pool.flush_list.count
+ while not holding buf_pool.flush_list_mutex */
+ if (running || !UT_LIST_GET_LEN(buf_pool.flush_list))
+ {
+ if (!running)
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return 0;
+ }
+
+ buf_pool.n_flush_list_++;
+ const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn);
+ const ulint n_flushing= --buf_pool.n_flush_list_;
+
+ buf_pool.try_LRU_scan= true;
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (!n_flushing)
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+
+ buf_dblwr.flush_buffered_writes();
+
+ DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed));
+ return n_flushed;
+}
+
+/** Try to flush all the dirty pages that belong to a given tablespace.
+@param space tablespace
+@param n_flushed number of pages written
+@return whether the flush for some pages might not have been initiated */
+bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
+{
+ const auto space_id= space->id;
+ ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND);
+
+ bool may_have_skipped= false;
+ ulint max_n_flush= srv_io_capacity;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ bool acquired= space->acquire();
+ buf_flush_freed_pages(space);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+ {
+ ut_d(const auto s= bpage->state());
+ ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
+ s == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(bpage->oldest_modification());
+ ut_ad(bpage->in_file());
+
+ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+ if (bpage->id().space() != space_id);
+ else if (bpage->oldest_modification() == 1)
+ buf_pool.delete_from_flush_list(bpage);
+ else if (!bpage->ready_for_flush())
+ may_have_skipped= true;
+ else
+ {
+ /* In order not to degenerate this scan to O(n*n) we attempt to
+ preserve the pointer position. Any thread that would remove 'prev'
+ from buf_pool.flush_list must adjust the hazard pointer.
+
+ Note: Multiple executions of buf_flush_list_space() may be
+ interleaved, and also buf_do_flush_list_batch() may be running
+ concurrently. This may terminate our iteration prematurely,
+ leading us to return may_have_skipped=true. */
+ buf_pool.flush_hp.set(prev);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (!acquired)
+ {
+ was_freed:
+ buf_flush_discard_page(bpage);
+ }
+ else
+ {
+ if (space->is_stopping())
+ {
+ space->release();
+ acquired= false;
+ goto was_freed;
+ }
+ if (!buf_flush_page(bpage, false, space))
+ {
+ may_have_skipped= true;
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ goto next_after_skip;
+ }
+ if (n_flushed)
+ ++*n_flushed;
+ if (!--max_n_flush)
+ {
+ mysql_mutex_lock(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ may_have_skipped= true;
+ break;
+ }
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (!buf_pool.flush_hp.is_hp(prev))
+ may_have_skipped= true;
+ next_after_skip:
+ bpage= buf_pool.flush_hp.get();
+ continue;
+ }
+
+ bpage= prev;
+ }
+
+ /* Note: this loop may have been executed concurrently with
+ buf_do_flush_list_batch() as well as other threads executing
+ buf_flush_list_space(). We should always return true from
+ buf_flush_list_space() if that should be the case; in
+ buf_do_flush_list_batch() we will simply perform less work. */
+
+ buf_pool.flush_hp.set(nullptr);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ buf_pool.try_LRU_scan= true;
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (acquired)
+ space->release();
+
+ if (space->purpose == FIL_TYPE_IMPORT)
+ os_aio_wait_until_no_pending_writes();
+ else
+ buf_dblwr.flush_buffered_writes();
+
+ return may_have_skipped;
+}
+
+/** Write out dirty blocks from buf_pool.LRU.
+@param max_n desired maximum number of blocks flushed
+@return the number of processed pages
+@retval 0 if a buf_pool.LRU batch is already running */
+ulint buf_flush_LRU(ulint max_n)
+{
+ if (buf_pool.n_flush_LRU())
+ return 0;
+
+ log_buffer_flush_to_disk(true);
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ if (buf_pool.n_flush_LRU_)
+ {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return 0;
+ }
+ buf_pool.n_flush_LRU_++;
+
+ ulint n_flushed= buf_do_LRU_batch(max_n);
+
+ const ulint n_flushing= --buf_pool.n_flush_LRU_;
+
+ buf_pool.try_LRU_scan= true;
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (!n_flushing)
+ {
+ pthread_cond_broadcast(&buf_pool.done_flush_LRU);
+ pthread_cond_signal(&buf_pool.done_free);
+ }
+
+ buf_dblwr.flush_buffered_writes();
+
+ DBUG_PRINT("ib_buf", ("LRU flush completed, " ULINTPF " pages", n_flushed));
+ return n_flushed;
+}
+
+/** Initiate a log checkpoint, discarding the start of the log.
+@param oldest_lsn the checkpoint LSN
+@param end_lsn log_sys.get_lsn()
+@return true if success, false if a checkpoint write was already running */
+static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn)
+{
+ ut_ad(!srv_read_only_mode);
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ ut_ad(oldest_lsn <= end_lsn);
+ ut_ad(end_lsn == log_sys.get_lsn());
+ ut_ad(!recv_no_log_write);
+
+ ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
+
+ if (oldest_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
+ /* Some log has been written since the previous checkpoint. */;
+ else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+ /* MariaDB startup expects the redo log file to be logically empty
+ (not even containing a FILE_CHECKPOINT record) after a clean shutdown.
+ Perform an extra checkpoint at shutdown. */;
+ else
+ {
+ /* Do nothing, because nothing was logged (other than a
+ FILE_CHECKPOINT record) since the previous checkpoint. */
+ mysql_mutex_unlock(&log_sys.mutex);
+ return true;
+ }
+
+ /* Repeat the FILE_MODIFY records after the checkpoint, in case some
+ log records between the checkpoint and log_sys.lsn need them.
+ Finally, write a FILE_CHECKPOINT record. Redo log apply expects to
+ see a FILE_CHECKPOINT after the checkpoint, except on clean
+ shutdown, where the log will be empty after the checkpoint.
+
+ It is important that we write out the redo log before any further
+ dirty pages are flushed to the tablespace files. At this point,
+ because we hold log_sys.mutex, mtr_t::commit() in other threads will
+ be blocked, and no pages can be added to the flush lists. */
+ lsn_t flush_lsn= oldest_lsn;
+
+ if (fil_names_clear(flush_lsn, oldest_lsn != end_lsn ||
+ srv_shutdown_state <= SRV_SHUTDOWN_INITIATED))
+ {
+ flush_lsn= log_sys.get_lsn();
+ ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT);
+ mysql_mutex_unlock(&log_sys.mutex);
+ log_write_up_to(flush_lsn, true, true);
+ mysql_mutex_lock(&log_sys.mutex);
+ if (log_sys.last_checkpoint_lsn >= oldest_lsn)
+ {
+ mysql_mutex_unlock(&log_sys.mutex);
+ return true;
+ }
+ }
+ else
+ ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
+
+ ut_ad(log_sys.get_flushed_lsn() >= flush_lsn);
+
+ if (log_sys.n_pending_checkpoint_writes)
+ {
+ /* A checkpoint write is running */
+ mysql_mutex_unlock(&log_sys.mutex);
+ return false;
+ }
+
+ log_sys.next_checkpoint_lsn= oldest_lsn;
+ log_write_checkpoint_info(end_lsn);
+ mysql_mutex_assert_not_owner(&log_sys.mutex);
+
+ return true;
+}
+
+/** Make a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only determines the LSN of the oldest
+modification in the pool, and writes checkpoint information about that
+LSN to the log file. Use log_make_checkpoint() to also flush the pool.
+@retval true if the checkpoint was or had been made
+@retval false if a checkpoint write was already running */
+static bool log_checkpoint()
+{
+ if (recv_recovery_is_on())
+ recv_sys.apply(true);
+
+ switch (srv_file_flush_method) {
+ case SRV_NOSYNC:
+ case SRV_O_DIRECT_NO_FSYNC:
+ break;
+ default:
+ fil_flush_file_spaces();
+ }
+
+ mysql_mutex_lock(&log_sys.mutex);
+ const lsn_t end_lsn= log_sys.get_lsn();
+ mysql_mutex_lock(&log_sys.flush_order_mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ mysql_mutex_unlock(&log_sys.flush_order_mutex);
+ return log_checkpoint_low(oldest_lsn, end_lsn);
+}
+
+/** Make a checkpoint. */
+ATTRIBUTE_COLD void log_make_checkpoint()
+{
+ buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire));
+ while (!log_checkpoint());
+}
+
+/** Wait until all persistent pages are flushed up to a limit.
+@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
+ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
+{
+ ut_ad(sync_lsn);
+ ut_ad(sync_lsn < LSN_MAX);
+ mysql_mutex_assert_not_owner(&log_sys.mutex);
+ ut_ad(!srv_read_only_mode);
+
+ if (recv_recovery_is_on())
+ recv_sys.apply(true);
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn)
+ {
+#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */
+ if (UNIV_UNLIKELY(!buf_page_cleaner_is_active)
+ ut_d(|| innodb_page_cleaner_disabled_debug))
+ {
+ do
+ {
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn);
+ buf_flush_wait_batch_end_acquiring_mutex(false);
+ if (n_pages)
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_COUNT,
+ MONITOR_FLUSH_SYNC_PAGES, n_pages);
+ }
+ MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ }
+ while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
+
+ goto try_checkpoint;
+ }
+#endif
+ if (buf_flush_sync_lsn < sync_lsn)
+ {
+ buf_flush_sync_lsn= sync_lsn;
+ pthread_cond_signal(&buf_pool.do_flush_list);
+ }
+
+ do
+ {
+ tpool::tpool_wait_begin();
+ thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+ my_cond_wait(&buf_pool.done_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex);
+ thd_wait_end(nullptr);
+ tpool::tpool_wait_end();
+
+ MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
+ }
+ while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
+ }
+
+try_checkpoint:
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (UNIV_UNLIKELY(log_sys.last_checkpoint_lsn < sync_lsn))
+ {
+ /* If the buffer pool was clean, no log write was guaranteed
+ to happen until now. There could be an outstanding FILE_CHECKPOINT
+ record from a previous fil_names_clear() call, which we must
+ write out before we can advance the checkpoint. */
+ if (sync_lsn > log_sys.get_flushed_lsn())
+ log_write_up_to(sync_lsn, true);
+ log_checkpoint();
+ }
+}
+
+/** Initiate more eager page flushing if the log checkpoint age is too old.
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@param furious true=furious flushing, false=limit to innodb_io_capacity */
+ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
+{
+ mysql_mutex_assert_not_owner(&log_sys.mutex);
+ ut_ad(!srv_read_only_mode);
+
+ if (recv_recovery_is_on())
+ recv_sys.apply(true);
+
+ Atomic_relaxed<lsn_t> &limit= furious
+ ? buf_flush_sync_lsn : buf_flush_async_lsn;
+
+ if (limit < lsn)
+ {
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (limit < lsn)
+ limit= lsn;
+ pthread_cond_signal(&buf_pool.do_flush_list);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ }
+}
+
+/** Wait for pending flushes to complete. */
+void buf_flush_wait_batch_end_acquiring_mutex(bool lru)
+{
+ if (lru ? buf_pool.n_flush_LRU() : buf_pool.n_flush_list())
+ {
+ mysql_mutex_lock(&buf_pool.mutex);
+ buf_flush_wait_batch_end(lru);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+}
+
+/** Conduct checkpoint-related flushing for innodb_flush_sync=ON,
+and try to initiate checkpoints until the target is met.
+@param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */
+ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
+{
+ ut_ad(!srv_read_only_mode);
+
+ for (;;)
+ {
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn))
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_COUNT,
+ MONITOR_FLUSH_SYNC_PAGES, n_flushed);
+ }
+
+ /* Attempt to perform a log checkpoint upon completing each batch. */
+ if (recv_recovery_is_on())
+ recv_sys.apply(true);
+
+ switch (srv_file_flush_method) {
+ case SRV_NOSYNC:
+ case SRV_O_DIRECT_NO_FSYNC:
+ break;
+ default:
+ fil_flush_file_spaces();
+ }
+
+ mysql_mutex_lock(&log_sys.mutex);
+ const lsn_t newest_lsn= log_sys.get_lsn();
+ mysql_mutex_lock(&log_sys.flush_order_mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ lsn_t measure= buf_pool.get_oldest_modification(0);
+ mysql_mutex_unlock(&log_sys.flush_order_mutex);
+ const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;
+
+ if (checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
+ {
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ log_checkpoint_low(checkpoint_lsn, newest_lsn);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ measure= buf_pool.get_oldest_modification(LSN_MAX);
+ }
+ else
+ {
+ mysql_mutex_unlock(&log_sys.mutex);
+ if (!measure)
+ measure= LSN_MAX;
+ }
+
+ mysql_mutex_assert_not_owner(&log_sys.mutex);
+
+ /* After attempting log checkpoint, check if we have reached our target. */
+ const lsn_t target= buf_flush_sync_lsn;
+
+ if (measure >= target)
+ buf_flush_sync_lsn= 0;
+ else if (measure >= buf_flush_async_lsn)
+ buf_flush_async_lsn= 0;
+
+ /* wake up buf_flush_wait_flushed() */
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+
+ lsn= std::max(lsn, target);
+
+ if (measure >= lsn)
+ return;
+ }
+}
+
+/** Check whether adaptive flushing is recommended, based on how much
+of the redo log capacity has been filled.
+@param oldest_lsn buf_pool.get_oldest_modification()
+@return true if adaptive flushing is recommended. */
+static bool af_needed_for_redo(lsn_t oldest_lsn)
+{
+ lsn_t age= (log_sys.get_lsn() - oldest_lsn);
+ lsn_t af_lwm= static_cast<lsn_t>(srv_adaptive_flushing_lwm *
+ static_cast<double>(log_sys.log_capacity) / 100);
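+ /* Example: with innodb_adaptive_flushing_lwm=10 and 1 GiB of redo log
+ capacity, af_lwm corresponds to roughly 100 MiB of LSN age. */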
+
+ /* if age > af_lwm adaptive flushing is recommended */
+ return (age > af_lwm);
+}
+
+/*********************************************************************//**
+Calculates if flushing is required based on redo generation rate.
+@return percent of io_capacity to flush to manage redo space */
+static
+ulint
+af_get_pct_for_lsn(
+/*===============*/
+ lsn_t age) /*!< in: current age of LSN. */
+{
+ lsn_t af_lwm = static_cast<lsn_t>(
+ srv_adaptive_flushing_lwm
+ * static_cast<double>(log_sys.log_capacity) / 100);
+
+ if (age < af_lwm) {
+ /* No adaptive flushing. */
+ return(0);
+ }
+
+ lsn_t lsn_age_factor = (age * 100) / log_sys.max_modified_age_async;
+
+ ut_ad(srv_max_io_capacity >= srv_io_capacity);
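+ /* Example: with innodb_io_capacity=200, innodb_max_io_capacity=2000
+ and lsn_age_factor=50, this returns
+ (2000 / 200 * 50) * sqrt(50) / 7.5 ~= 471 (per cent of io_capacity).
+ Note that srv_max_io_capacity / srv_io_capacity is integer division,
+ performed before the conversion to double. */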
+ return static_cast<ulint>(
+ (static_cast<double>(srv_max_io_capacity / srv_io_capacity
+ * lsn_age_factor)
+ * sqrt(static_cast<double>(lsn_age_factor))
+ / 7.5));
+}
+
+/** This function is called approximately once every second by the
+page_cleaner thread if innodb_adaptive_flushing=ON.
+Based on various factors it decides if there is a need to do flushing.
+@return number of pages recommended to be flushed
+@param last_pages_in number of pages flushed in previous batch
+@param oldest_lsn buf_pool.get_oldest_modification(0)
+@param dirty_blocks UT_LIST_GET_LEN(buf_pool.flush_list)
+@param dirty_pct 100*flush_list.count / (LRU.count + free.count) */
+static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
+ lsn_t oldest_lsn,
+ ulint dirty_blocks,
+ double dirty_pct)
+{
+ static lsn_t prev_lsn = 0;
+ static ulint sum_pages = 0;
+ static ulint avg_page_rate = 0;
+ static ulint n_iterations = 0;
+ static time_t prev_time;
+ lsn_t lsn_rate;
+ ulint n_pages = 0;
+
+ const lsn_t cur_lsn = log_sys.get_lsn();
+ ut_ad(oldest_lsn <= cur_lsn);
+ ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn);
+ time_t curr_time = time(nullptr);
+ const double max_pct = srv_max_buf_pool_modified_pct;
+
+ if (!prev_lsn || !pct_for_lsn) {
+ prev_time = curr_time;
+ prev_lsn = cur_lsn;
+ if (max_pct > 0.0) {
+ dirty_pct /= max_pct;
+ }
+
+ n_pages = ulint(dirty_pct * double(srv_io_capacity));
+ if (n_pages < dirty_blocks) {
+ n_pages= std::min<ulint>(srv_io_capacity, dirty_blocks);
+ }
+
+ return n_pages;
+ }
+
+ sum_pages += last_pages_in;
+
+ double time_elapsed = difftime(curr_time, prev_time);
+
+ /* We update our variables every srv_flushing_avg_loops
+ iterations to smooth out transitions in the workload. */
+ if (++n_iterations >= srv_flushing_avg_loops
+ || time_elapsed >= static_cast<double>(srv_flushing_avg_loops)) {
+
+ if (time_elapsed < 1) {
+ time_elapsed = 1;
+ }
+
+ avg_page_rate = static_cast<ulint>(
+ ((static_cast<double>(sum_pages)
+ / time_elapsed)
+ + static_cast<double>(avg_page_rate)) / 2);
+
+ /* How much LSN we have generated since the last call. */
+ lsn_rate = static_cast<lsn_t>(
+ static_cast<double>(cur_lsn - prev_lsn)
+ / time_elapsed);
+
+ lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
+
+ ulint flush_tm = page_cleaner.flush_time;
+ ulint flush_pass = page_cleaner.flush_pass;
+
+ page_cleaner.flush_time = 0;
+ page_cleaner.flush_pass = 0;
+
+ if (flush_pass) {
+ flush_tm /= flush_pass;
+ }
+
+ MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, flush_tm);
+ MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, flush_pass);
+
+ prev_lsn = cur_lsn;
+ prev_time = curr_time;
+
+ n_iterations = 0;
+
+ sum_pages = 0;
+ }
+
+ const ulint pct_for_dirty = srv_max_dirty_pages_pct_lwm == 0
+ ? (dirty_pct >= max_pct ? 100 : 0)
+ : static_cast<ulint>
+ (max_pct > 0.0 ? dirty_pct / max_pct : dirty_pct);
+ ulint pct_total = std::max(pct_for_dirty, pct_for_lsn);
+
+ /* Estimate pages to be flushed for the lsn progress */
+ lsn_t target_lsn = oldest_lsn
+ + lsn_avg_rate * buf_flush_lsn_scan_factor;
+ ulint pages_for_lsn = 0;
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list);
+ b != NULL;
+ b = UT_LIST_GET_PREV(list, b)) {
+ if (b->oldest_modification() > target_lsn) {
+ break;
+ }
+ if (++pages_for_lsn >= srv_max_io_capacity) {
+ break;
+ }
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ pages_for_lsn /= buf_flush_lsn_scan_factor;
+ if (pages_for_lsn < 1) {
+ pages_for_lsn = 1;
+ }
+
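+ /* The computation below blends three signals with equal weight:
+ the percentage-based target (a share of innodb_io_capacity), the
+ observed average page flush rate, and the page count estimated from
+ the redo growth rate. Editor's illustration with hypothetical values:
+ io_capacity = 200, pct_total = 90, avg_page_rate = 120 and
+ pages_for_lsn = 300 would yield (180 + 120 + 300) / 3 = 200 pages,
+ before the srv_max_io_capacity cap below. */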
+ n_pages = (ulint(double(srv_io_capacity) * double(pct_total) / 100.0)
+ + avg_page_rate + pages_for_lsn) / 3;
+
+ if (n_pages > srv_max_io_capacity) {
+ n_pages = srv_max_io_capacity;
+ }
+
+ MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
+
+ MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, pages_for_lsn);
+
+ MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
+ MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
+ MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
+ MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
+
+ return(n_pages);
+}
+
+/******************************************************************//**
+page_cleaner thread tasked with flushing dirty pages from the buffer
+pool. As of now there is only one coordinator.
+@return a dummy parameter */
+static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
+{
+ my_thread_init();
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(page_cleaner_thread_key);
+#endif /* UNIV_PFS_THREAD */
+ ut_ad(!srv_read_only_mode);
+ ut_ad(buf_page_cleaner_is_active);
+
+ ulint last_pages= 0;
+ timespec abstime;
+ set_timespec(abstime, 1);
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ lsn_t lsn_limit;
+ ulint last_activity_count= srv_get_activity_count();
+
+ for (;;)
+ {
+ lsn_limit= buf_flush_sync_lsn;
+
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+furious_flush:
+ if (UNIV_LIKELY(srv_flush_sync))
+ {
+ buf_flush_sync_for_checkpoint(lsn_limit);
+ last_pages= 0;
+ set_timespec(abstime, 1);
+ continue;
+ }
+ }
+ else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+ break;
+
+ /* If the page cleaner is idle and there is no work (either all
+ dirty pages have been flushed or adaptive flushing is not enabled),
+ opt for an untimed wait */
+ if (buf_pool.page_cleaner_idle() &&
+ (!UT_LIST_GET_LEN(buf_pool.flush_list) ||
+ srv_max_dirty_pages_pct_lwm == 0.0))
+ my_cond_wait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex);
+ else
+ my_cond_timedwait(&buf_pool.do_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex, &abstime);
+
+ set_timespec(abstime, 1);
+
+ lsn_t soft_lsn_limit= buf_flush_async_lsn;
+ lsn_limit= buf_flush_sync_lsn;
+
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+ if (UNIV_LIKELY(srv_flush_sync))
+ goto furious_flush;
+ }
+ else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+ break;
+
+ const lsn_t oldest_lsn= buf_pool.get_oldest_modification(0);
+
+ if (!oldest_lsn)
+ {
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+ buf_flush_sync_lsn= 0;
+ /* wake up buf_flush_wait_flushed() */
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ }
+unemployed:
+ buf_flush_async_lsn= 0;
+ buf_pool.page_cleaner_set_idle(true);
+ continue;
+ }
+
+ const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list);
+ ut_ad(dirty_blocks);
+ /* We perform dirty reads of the LRU+free list lengths here.
+ Division by zero is not possible, because buf_pool.flush_list is
+ guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */
+ const double dirty_pct= double(dirty_blocks) * 100.0 /
+ double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
+
+ bool idle_flush= false;
+
+ if (lsn_limit || soft_lsn_limit);
+ else if (af_needed_for_redo(oldest_lsn));
+ else if (srv_max_dirty_pages_pct_lwm != 0.0)
+ {
+ const ulint activity_count= srv_get_activity_count();
+ if (activity_count != last_activity_count)
+ last_activity_count= activity_count;
+ else if (buf_pool.page_cleaner_idle() && buf_pool.n_pend_reads == 0)
+ {
+ /* reaching here means 3 things:
+ - last_activity_count == activity_count: suggesting server is idle
+ (no trx_t::commit activity)
+ - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm)
+ - there are no pending reads but there are dirty pages to flush */
+ idle_flush= true;
+ buf_pool.update_last_activity_count(activity_count);
+ }
+
+ if (!idle_flush && dirty_pct < srv_max_dirty_pages_pct_lwm)
+ goto unemployed;
+ }
+ else if (dirty_pct < srv_max_buf_pool_modified_pct)
+ goto unemployed;
+
+ if (UNIV_UNLIKELY(lsn_limit != 0) && oldest_lsn >= lsn_limit)
+ lsn_limit= buf_flush_sync_lsn= 0;
+ if (UNIV_UNLIKELY(soft_lsn_limit != 0) && oldest_lsn >= soft_lsn_limit)
+ soft_lsn_limit= buf_flush_async_lsn= 0;
+
+ buf_pool.page_cleaner_set_idle(false);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (!lsn_limit)
+ lsn_limit= soft_lsn_limit;
+
+ ulint n_flushed;
+
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+ n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit);
+ /* wake up buf_flush_wait_flushed() */
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ goto try_checkpoint;
+ }
+ else if (idle_flush || !srv_adaptive_flushing)
+ {
+ n_flushed= buf_flush_list(srv_io_capacity);
+try_checkpoint:
+ if (n_flushed)
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_COUNT,
+ MONITOR_FLUSH_BACKGROUND_PAGES,
+ n_flushed);
+do_checkpoint:
+ /* The periodic log_checkpoint() call here makes it harder to
+ reproduce bugs in crash recovery or mariabackup --prepare, or
+ in code that writes the redo log records. Omitting the call
+ here should not affect correctness, because log_free_check()
+ should still be invoking checkpoints when needed. */
+ DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", goto next;);
+
+ if (!recv_recovery_is_on() && srv_operation == SRV_OPERATION_NORMAL)
+ log_checkpoint();
+ }
+ }
+ else if (ulint n= page_cleaner_flush_pages_recommendation(last_pages,
+ oldest_lsn,
+ dirty_blocks,
+ dirty_pct))
+ {
+ page_cleaner.flush_pass++;
+ const ulint tm= ut_time_ms();
+ last_pages= n_flushed= buf_flush_list(n);
+ page_cleaner.flush_time+= ut_time_ms() - tm;
+
+ if (n_flushed)
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_COUNT,
+ MONITOR_FLUSH_ADAPTIVE_PAGES,
+ n_flushed);
+ goto do_checkpoint;
+ }
+ }
+ else if (buf_flush_async_lsn <= oldest_lsn)
+ {
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ goto unemployed;
+ }
+
+#ifdef UNIV_DEBUG
+ while (innodb_page_cleaner_disabled_debug && !buf_flush_sync_lsn &&
+ srv_shutdown_state == SRV_SHUTDOWN_NONE)
+ os_thread_sleep(100000);
+#endif /* UNIV_DEBUG */
+
+#ifndef DBUG_OFF
+next:
+#endif /* !DBUG_OFF */
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ /* When idle flushing kicks in, the page cleaner is marked active.
+ Reset it back to idle, since it was only made active as part of the
+ idle flushing stage. */
+ if (idle_flush)
+ buf_pool.page_cleaner_set_idle(true);
+ }
+
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (srv_fast_shutdown != 2)
+ {
+ buf_flush_wait_batch_end_acquiring_mutex(true);
+ buf_flush_wait_batch_end_acquiring_mutex(false);
+ }
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ lsn_limit= buf_flush_sync_lsn;
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ goto furious_flush;
+ buf_page_cleaner_is_active= false;
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ my_thread_end();
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit instead of returning. */
+ os_thread_exit();
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/** Initialize page_cleaner. */
+ATTRIBUTE_COLD void buf_flush_page_cleaner_init()
+{
+ ut_ad(!buf_page_cleaner_is_active);
+ ut_ad(srv_operation == SRV_OPERATION_NORMAL ||
+ srv_operation == SRV_OPERATION_RESTORE ||
+ srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+ buf_flush_async_lsn= 0;
+ buf_flush_sync_lsn= 0;
+ buf_page_cleaner_is_active= true;
+ os_thread_create(buf_flush_page_cleaner);
+}
+
+/** @return the number of dirty pages in the buffer pool */
+static ulint buf_flush_list_length()
+{
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ const ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ return len;
+}
+
+/** Flush the buffer pool on shutdown. */
+ATTRIBUTE_COLD void buf_flush_buffer_pool()
+{
+ ut_ad(!buf_page_cleaner_is_active);
+ ut_ad(!buf_flush_sync_lsn);
+
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "Waiting to flush the buffer pool");
+
+ while (buf_pool.n_flush_list() || buf_flush_list_length())
+ {
+ buf_flush_list(srv_max_io_capacity);
+ timespec abstime;
+
+ if (buf_pool.n_flush_list())
+ {
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "Waiting to flush " ULINTPF " pages",
+ buf_flush_list_length());
+ set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2);
+ mysql_mutex_lock(&buf_pool.mutex);
+ while (buf_pool.n_flush_list_)
+ my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
+ &abstime);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+ }
+
+ ut_ad(!buf_pool.any_io_pending());
+}
+
+/** Synchronously flush dirty blocks.
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync()
+{
+ ut_ad(!sync_check_iterate(dict_sync_check()));
+
+ for (;;)
+ {
+ const ulint n_flushed= buf_flush_list(srv_max_io_capacity);
+ buf_flush_wait_batch_end_acquiring_mutex(false);
+ if (!n_flushed && !buf_flush_list_length())
+ return;
+ }
+}
+
+#ifdef UNIV_DEBUG
+/** Functor to validate the flush list. */
+struct Check {
+ void operator()(const buf_page_t* elem) const
+ {
+ ut_ad(elem->oldest_modification());
+ ut_ad(!fsp_is_system_temporary(elem->id().space()));
+ }
+};
+
+/** Validate the flush list. */
+static void buf_flush_validate_low()
+{
+ buf_page_t* bpage;
+
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+
+ ut_list_validate(buf_pool.flush_list, Check());
+
+ bpage = UT_LIST_GET_FIRST(buf_pool.flush_list);
+
+ while (bpage != NULL) {
+ const lsn_t om = bpage->oldest_modification();
+ /* A page in buf_pool.flush_list can be in
+ BUF_BLOCK_REMOVE_HASH state. This happens when a page
+ is in the middle of being relocated. In that case the
+ original descriptor can have this state and still be
+ in the flush list waiting to acquire the
+ buf_pool.flush_list_mutex to complete the relocation. */
+ ut_d(const auto s= bpage->state());
+ ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE
+ || s == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(om == 1 || om > 2);
+
+ bpage = UT_LIST_GET_NEXT(list, bpage);
+ ut_ad(om == 1 || !bpage || recv_recovery_is_on()
+ || om >= bpage->oldest_modification());
+ }
+}
+
+/** Validate the flush list. */
+void buf_flush_validate()
+{
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_validate_low();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
new file mode 100644
index 00000000..b282eb17
--- /dev/null
+++ b/storage/innobase/buf/buf0lru.cc
@@ -0,0 +1,1477 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0lru.cc
+The database buffer replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0lru.h"
+#include "sync0rw.h"
+#include "fil0fil.h"
+#include "btr0btr.h"
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0rea.h"
+#include "btr0sea.h"
+#include "os0file.h"
+#include "page0zip.h"
+#include "log0recv.h"
+#include "srv0srv.h"
+#include "srv0mon.h"
+
+/** Flush this many pages in buf_LRU_get_free_block() */
+size_t innodb_lru_flush_size;
+
+/** The number of blocks from the LRU_old pointer onward, including
+the block pointed to, must be buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+of the whole LRU list length, except that the tolerance defined below
+is allowed. Note that the tolerance must be small enough such that for
+even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not
+allowed to point to either end of the LRU list. */
+
+static constexpr ulint BUF_LRU_OLD_TOLERANCE = 20;
+
+/** The minimum amount of non-old blocks when the LRU_old list exists
+(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks).
+@see buf_LRU_old_adjust_len */
+#define BUF_LRU_NON_OLD_MIN_LEN 5
+
+/** If we switch on the InnoDB monitor because there are too few available
+frames in the buffer pool, we set this to TRUE */
+static bool buf_lru_switched_on_innodb_mon = false;
+
+/** True if the diagnostic message about difficulty in finding free
+blocks in the buffer pool has already been printed. */
+static bool buf_lru_free_blocks_error_printed;
+
+/******************************************************************//**
+These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O
+and page_zip_decompress() operations. Based on the statistics,
+buf_LRU_evict_from_unzip_LRU() decides if we want to evict from
+unzip_LRU or the regular LRU. From unzip_LRU, we will only evict the
+uncompressed frame (meaning we can evict dirty blocks as well). From
+the regular LRU, we will evict the entire block (i.e.: both the
+uncompressed and compressed data), which must be clean. */
+
+/* @{ */
+
+/** Number of intervals for which we keep the history of these stats.
+Updated at SRV_MONITOR_INTERVAL (the buf_LRU_stat_update() call rate). */
+static constexpr ulint BUF_LRU_STAT_N_INTERVAL= 4;
+
+/** Coefficient with which we multiply I/O operations to equate them
+with page_zip_decompress() operations. */
+static constexpr ulint BUF_LRU_IO_TO_UNZIP_FACTOR= 50;
+
+/** Sampled values of buf_LRU_stat_cur.
+Not protected by any mutex. Updated by buf_LRU_stat_update(). */
+static buf_LRU_stat_t buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL];
+
+/** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */
+static ulint buf_LRU_stat_arr_ind;
+
+/** Current operation counters. Not protected by any mutex. Cleared
+by buf_LRU_stat_update(). */
+buf_LRU_stat_t buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update(). Not protected by any mutex. */
+buf_LRU_stat_t buf_LRU_stat_sum;
+
+/* @} */
+
+/** @name Heuristics for detecting index scan @{ */
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago. Not protected by any mutex or latch. */
+uint buf_LRU_old_threshold_ms;
+/* @} */
+
+/** Remove bpage from buf_pool.LRU and buf_pool.page_hash.
+
+If bpage->state() == BUF_BLOCK_ZIP_PAGE && bpage->oldest_modification() <= 1,
+the object will be freed.
+
+@param bpage buffer block
+@param id page identifier
+@param hash_lock buf_pool.page_hash latch (will be released here)
+@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
+
+If a compressed page is freed other compressed pages may be relocated.
+@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
+static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
+ page_hash_latch *hash_lock, bool zip);
+
+/** Free a block to buf_pool */
+static void buf_LRU_block_free_hashed_page(buf_block_t *block)
+{
+ block->page.free_file_page();
+ buf_LRU_block_free_non_file_page(block);
+}
+
+/** Increase LRU size in bytes by the page size.
+@param[in] bpage control block */
+static inline void incr_LRU_size_in_bytes(const buf_page_t* bpage)
+{
+ /* FIXME: use atomics, not mutex */
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ buf_pool.stat.LRU_bytes += bpage->physical_size();
+
+ ut_ad(buf_pool.stat.LRU_bytes <= buf_pool.curr_pool_size);
+}
+
+/** @return whether the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list */
+bool buf_LRU_evict_from_unzip_LRU()
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ /* If the unzip_LRU list is empty, we can only use the LRU. */
+ if (UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0) {
+ return false;
+ }
+
+ /* If unzip_LRU is at most 10% of the size of the LRU list,
+ then use the LRU. This slack allows us to keep hot
+ decompressed pages in the buffer pool. */
+ if (UT_LIST_GET_LEN(buf_pool.unzip_LRU)
+ <= UT_LIST_GET_LEN(buf_pool.LRU) / 10) {
+ return false;
+ }
+
+ /* If eviction hasn't started yet, we assume by default
+ that the workload is disk bound. */
+ if (buf_pool.freed_page_clock == 0) {
+ return true;
+ }
+
+ /* Calculate the average over past intervals, and add the values
+ of the current interval. */
+ ulint io_avg = buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL
+ + buf_LRU_stat_cur.io;
+
+ ulint unzip_avg = buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL
+ + buf_LRU_stat_cur.unzip;
+
+ /* Decide based on our formula. If the load is I/O bound
+ (unzip_avg is smaller than the weighted io_avg), evict an
+ uncompressed frame from unzip_LRU. Otherwise we assume that
+ the load is CPU bound and evict from the regular LRU. */
+ return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR);
+}
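+
+/* Editor's note, a worked instance of the decision above (hypothetical
+numbers): with averaged counts io_avg = 10 and unzip_avg = 400, the
+weighted comparison is 400 <= 10 * BUF_LRU_IO_TO_UNZIP_FACTOR, i.e.
+400 <= 500, so the workload still counts as I/O bound and a victim is
+taken from the unzip_LRU list. */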
+
+/** Try to free an uncompressed page of a compressed block from the unzip
+LRU list. The compressed page is preserved, and it need not be clean.
+@param limit maximum number of blocks to scan
+@return true if freed */
+static bool buf_LRU_free_from_unzip_LRU_list(ulint limit)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ if (!buf_LRU_evict_from_unzip_LRU()) {
+ return(false);
+ }
+
+ ulint scanned = 0;
+ bool freed = false;
+
+ for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+ block && scanned < limit; ++scanned) {
+ buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
+
+ ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->in_unzip_LRU_list);
+ ut_ad(block->page.in_LRU_list);
+
+ freed = buf_LRU_free_page(&block->page, false);
+ if (freed) {
+ break;
+ }
+
+ block = prev_block;
+ }
+
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+ scanned);
+ }
+
+ return(freed);
+}
+
+/** Try to free a clean page from the common LRU list.
+@param limit maximum number of blocks to scan
+@return whether a page was freed */
+static bool buf_LRU_free_from_common_LRU_list(ulint limit)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ ulint scanned = 0;
+ bool freed = false;
+
+ for (buf_page_t* bpage = buf_pool.lru_scan_itr.start();
+ bpage && scanned < limit;
+ ++scanned, bpage = buf_pool.lru_scan_itr.get()) {
+ buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
+ buf_pool.lru_scan_itr.set(prev);
+
+ const auto accessed = bpage->is_accessed();
+
+ if (buf_LRU_free_page(bpage, true)) {
+ if (!accessed) {
+ /* Keep track of pages that are evicted without
+ ever being accessed. This gives us a measure of
+ the effectiveness of readahead */
+ ++buf_pool.stat.n_ra_pages_evicted;
+ }
+
+ freed = true;
+ break;
+ }
+ }
+
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_SEARCH_SCANNED,
+ MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+ scanned);
+ }
+
+ return(freed);
+}
+
+/** Try to free a replaceable block.
+@param limit maximum number of blocks to scan
+@return true if found and freed */
+bool buf_LRU_scan_and_free_block(ulint limit)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ return buf_LRU_free_from_unzip_LRU_list(limit) ||
+ buf_LRU_free_from_common_LRU_list(limit);
+}
+
+/** @return a buffer block from the buf_pool.free list
+@retval NULL if the free list is empty */
+buf_block_t* buf_LRU_get_free_only()
+{
+ buf_block_t* block;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ block = reinterpret_cast<buf_block_t*>(
+ UT_LIST_GET_FIRST(buf_pool.free));
+
+ while (block != NULL) {
+ ut_ad(block->page.in_free_list);
+ ut_d(block->page.in_free_list = FALSE);
+ ut_ad(!block->page.oldest_modification());
+ ut_ad(!block->page.in_LRU_list);
+ ut_a(!block->page.in_file());
+ UT_LIST_REMOVE(buf_pool.free, &block->page);
+
+ if (buf_pool.curr_size >= buf_pool.old_size
+ || UT_LIST_GET_LEN(buf_pool.withdraw)
+ >= buf_pool.withdraw_target
+ || !buf_pool.will_be_withdrawn(block->page)) {
+ /* No adaptive hash index entries may point to
+ a free block. */
+ assert_block_ahi_empty(block);
+
+ block->page.set_state(BUF_BLOCK_MEMORY);
+ MEM_MAKE_ADDRESSABLE(block->frame, srv_page_size);
+ break;
+ }
+
+ /* This should be withdrawn */
+ UT_LIST_ADD_LAST(
+ buf_pool.withdraw,
+ &block->page);
+ ut_d(block->in_withdraw_list = true);
+
+ block = reinterpret_cast<buf_block_t*>(
+ UT_LIST_GET_FIRST(buf_pool.free));
+ }
+
+ return(block);
+}
+
+/******************************************************************//**
+Checks how much of buf_pool is occupied by non-data objects like
+AHI, lock heaps etc. Depending on the size of non-data objects this
+function will either assert or issue a warning and switch on the
+status monitor. */
+static void buf_LRU_check_size_of_non_data_objects()
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ if (recv_recovery_is_on() || buf_pool.curr_size != buf_pool.old_size)
+ return;
+
+ const auto s= UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU);
+
+ if (s < buf_pool.curr_size / 20)
+ ib::fatal() << "Over 95 percent of the buffer pool is"
+ " occupied by lock heaps"
+#ifdef BTR_CUR_HASH_ADAPT
+ " or the adaptive hash index"
+#endif /* BTR_CUR_HASH_ADAPT */
+ "! Check that your transactions do not set too many"
+ " row locks, or review if innodb_buffer_pool_size="
+ << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+ << "M could be bigger.";
+
+ if (s < buf_pool.curr_size / 3)
+ {
+ if (!buf_lru_switched_on_innodb_mon && srv_monitor_timer)
+ {
+ /* Over 67 % of the buffer pool is occupied by lock heaps or
+ the adaptive hash index. This may be a memory leak! */
+ ib::warn() << "Over 67 percent of the buffer pool is"
+ " occupied by lock heaps"
+#ifdef BTR_CUR_HASH_ADAPT
+ " or the adaptive hash index"
+#endif /* BTR_CUR_HASH_ADAPT */
+ "! Check that your transactions do not set too many row locks."
+ " innodb_buffer_pool_size="
+ << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+ << "M. Starting the InnoDB Monitor to print diagnostics.";
+ buf_lru_switched_on_innodb_mon= true;
+ srv_print_innodb_monitor= TRUE;
+ srv_monitor_timer_schedule_now();
+ }
+ }
+ else if (buf_lru_switched_on_innodb_mon)
+ {
+ /* Switch off the InnoDB Monitor; this is a simple way to stop the
+ monitor if the situation becomes less urgent, but may also
+ surprise users who did SET GLOBAL innodb_status_output=ON earlier! */
+ buf_lru_switched_on_innodb_mon= false;
+ srv_print_innodb_monitor= FALSE;
+ }
+}
+
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
+
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in the LRU
+scan, we put it on the free list to be used.
+* iteration 0:
+ * get a block from the buf_pool.free list, success:done
+ * if buf_pool.try_LRU_scan is set
+ * scan LRU up to 100 pages to free a clean block
+ * success:retry the free list
+ * flush up to innodb_lru_flush_size LRU blocks to data files
+ (while UT_LIST_GET_LEN(buf_pool.free) < innodb_lru_scan_depth)
+ * on buf_page_write_complete() the blocks will be put on the buf_pool.free list
+ * success: retry the free list
+* subsequent iterations: same as iteration 0 except:
+ * scan whole LRU list
+ * scan LRU list even if buf_pool.try_LRU_scan is not set
+
+@param have_mutex whether buf_pool.mutex is already being held
+@return the free control block, in state BUF_BLOCK_MEMORY */
+buf_block_t *buf_LRU_get_free_block(bool have_mutex)
+{
+ ulint n_iterations = 0;
+ ulint flush_failures = 0;
+ MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
+ if (have_mutex) {
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ goto got_mutex;
+ }
+ mysql_mutex_lock(&buf_pool.mutex);
+got_mutex:
+ buf_LRU_check_size_of_non_data_objects();
+ buf_block_t* block;
+
+ DBUG_EXECUTE_IF("ib_lru_force_no_free_page",
+ if (!buf_lru_free_blocks_error_printed) {
+ n_iterations = 21;
+ goto not_found;});
+
+retry:
+ /* If there is a block in the free list, take it */
+ if ((block = buf_LRU_get_free_only()) != nullptr) {
+got_block:
+ if (!have_mutex) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+ memset(&block->page.zip, 0, sizeof block->page.zip);
+ return block;
+ }
+
+ MONITOR_INC(MONITOR_LRU_GET_FREE_LOOPS);
+ if (n_iterations || buf_pool.try_LRU_scan) {
+ /* If no block was in the free list, search from the
+ end of the LRU list and try to free a block there.
+ If this is the first iteration, we scan only the tail
+ of the LRU list; otherwise we scan the whole LRU
+ list. */
+ if (buf_LRU_scan_and_free_block(n_iterations
+ ? ULINT_UNDEFINED : 100)) {
+ goto retry;
+ }
+
+ /* Tell other threads that there is no point
+ in scanning the LRU list. */
+ buf_pool.try_LRU_scan = false;
+ }
+
+ for (;;) {
+ if ((block = buf_LRU_get_free_only()) != nullptr) {
+ goto got_block;
+ }
+ if (!buf_pool.n_flush_LRU_) {
+ break;
+ }
+ my_cond_wait(&buf_pool.done_free, &buf_pool.mutex.m_mutex);
+ }
+
+#ifndef DBUG_OFF
+not_found:
+#endif
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (n_iterations > 20 && !buf_lru_free_blocks_error_printed
+ && srv_buf_pool_old_size == srv_buf_pool_size) {
+
+ ib::warn() << "Difficult to find free blocks in the buffer pool"
+ " (" << n_iterations << " search iterations)! "
+ << flush_failures << " failed attempts to"
+ " flush a page!"
+ " Consider increasing innodb_buffer_pool_size."
+ " Pending flushes (fsync) log: "
+ << log_sys.get_pending_flushes()
+ << "; buffer pool: "
+ << fil_n_pending_tablespace_flushes
+ << ". " << os_n_file_reads << " OS file reads, "
+ << os_n_file_writes << " OS file writes, "
+ << os_n_fsyncs
+ << " OS fsyncs.";
+
+ buf_lru_free_blocks_error_printed = true;
+ }
+
+ if (n_iterations > 1) {
+ MONITOR_INC(MONITOR_LRU_GET_FREE_WAITS);
+ }
+
+ /* No free block was found: try to flush the LRU list.
+ The freed blocks will be up for grabs for all threads.
+
+ TODO: A more elegant way would have been to return one freed
+ up block to the caller here but the code that deals with
+ removing the block from buf_pool.page_hash and buf_pool.LRU is fairly
+ involved (particularly in the case of ROW_FORMAT=COMPRESSED pages).
+ We can do that in a separate patch sometime in the future. */
+
+ if (!buf_flush_LRU(innodb_lru_flush_size)) {
+ MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
+ ++flush_failures;
+ }
+
+ n_iterations++;
+ mysql_mutex_lock(&buf_pool.mutex);
+ buf_pool.stat.LRU_waits++;
+ goto got_mutex;
+}
+
+/** Move the LRU_old pointer so that the length of the old blocks list
+is inside the allowed limits. */
+static void buf_LRU_old_adjust_len()
+{
+ ulint old_len;
+ ulint new_len;
+
+ ut_a(buf_pool.LRU_old);
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN);
+ ut_ad(buf_pool.LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX);
+ compile_time_assert(BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN
+ > BUF_LRU_OLD_RATIO_DIV
+ * (BUF_LRU_OLD_TOLERANCE + 5));
+ compile_time_assert(BUF_LRU_NON_OLD_MIN_LEN < BUF_LRU_OLD_MIN_LEN);
+
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool.LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool.LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)
+ || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)
+ || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+
+ old_len = buf_pool.LRU_old_len;
+ new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU)
+ * buf_pool.LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV,
+ UT_LIST_GET_LEN(buf_pool.LRU)
+ - (BUF_LRU_OLD_TOLERANCE
+ + BUF_LRU_NON_OLD_MIN_LEN));
+
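+ /* Example of the target computed above (editor's sketch; it assumes
+ BUF_LRU_OLD_RATIO_DIV = 512, a constant defined outside this file):
+ with UT_LIST_GET_LEN(buf_pool.LRU) = 1000 and
+ buf_pool.LRU_old_ratio = 189 (about 37%), new_len =
+ ut_min(1000 * 189 / 512, 1000 - (20 + 5)) = ut_min(369, 975) = 369.
+ The loop below then moves LRU_old until LRU_old_len is within
+ BUF_LRU_OLD_TOLERANCE of that target. */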
+ for (;;) {
+ buf_page_t* LRU_old = buf_pool.LRU_old;
+
+ ut_a(LRU_old);
+ ut_ad(LRU_old->in_LRU_list);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(LRU_old->old);
+#endif /* UNIV_LRU_DEBUG */
+
+ /* Update the LRU_old pointer if necessary */
+
+ if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) {
+
+ buf_pool.LRU_old = LRU_old = UT_LIST_GET_PREV(
+ LRU, LRU_old);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(!LRU_old->old);
+#endif /* UNIV_LRU_DEBUG */
+ old_len = ++buf_pool.LRU_old_len;
+ LRU_old->set_old(true);
+
+ } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) {
+
+ buf_pool.LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old);
+ old_len = --buf_pool.LRU_old_len;
+ LRU_old->set_old(false);
+ } else {
+ return;
+ }
+ }
+}
+
+/** Initialize the old blocks pointer in the LRU list. This function should be
+called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */
+static void buf_LRU_old_init()
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN);
+
+ /* We first initialize all blocks in the LRU list as old and then use
+ the adjust function to move the LRU_old pointer to the right
+ position */
+
+ for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool.LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage)) {
+
+ ut_ad(bpage->in_LRU_list);
+
+ /* This loop temporarily violates the
+ assertions of buf_page_t::set_old(). */
+ bpage->old = true;
+ }
+
+ buf_pool.LRU_old = UT_LIST_GET_FIRST(buf_pool.LRU);
+ buf_pool.LRU_old_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+ buf_LRU_old_adjust_len();
+}
+
+/** Remove a block from the unzip_LRU list if it belonged to the list.
+@param[in] bpage control block */
+static void buf_unzip_LRU_remove_block_if_needed(buf_page_t* bpage)
+{
+ ut_ad(bpage->in_file());
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ if (bpage->belongs_to_unzip_LRU()) {
+ buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
+
+ ut_ad(block->in_unzip_LRU_list);
+ ut_d(block->in_unzip_LRU_list = false);
+
+ UT_LIST_REMOVE(buf_pool.unzip_LRU, block);
+ }
+}
+
+/** Removes a block from the LRU list.
+@param[in] bpage control block */
+static inline void buf_LRU_remove_block(buf_page_t* bpage)
+{
+ /* Important that we adjust the hazard pointers before removing
+ bpage from the LRU list. */
+ buf_page_t* prev_bpage = buf_pool.LRU_remove(bpage);
+
+ /* If the LRU_old pointer is defined and points to just this block,
+ move it backward one step */
+
+ if (bpage == buf_pool.LRU_old) {
+
+ /* Below: the previous block is guaranteed to exist,
+ because the LRU_old pointer is only allowed to differ
+ by BUF_LRU_OLD_TOLERANCE from strict
+ buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU
+ list length. */
+ ut_a(prev_bpage);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(!prev_bpage->old);
+#endif /* UNIV_LRU_DEBUG */
+ buf_pool.LRU_old = prev_bpage;
+ prev_bpage->set_old(true);
+
+ buf_pool.LRU_old_len++;
+ }
+
+ buf_pool.stat.LRU_bytes -= bpage->physical_size();
+
+ buf_unzip_LRU_remove_block_if_needed(bpage);
+
+ /* If the LRU list is so short that LRU_old is not defined,
+ clear the "old" flags and return */
+ if (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN) {
+
+ for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+
+ /* This loop temporarily violates the
+ assertions of buf_page_t::set_old(). */
+ bpage->old = false;
+ }
+
+ buf_pool.LRU_old = NULL;
+ buf_pool.LRU_old_len = 0;
+
+ return;
+ }
+
+ ut_ad(buf_pool.LRU_old);
+
+ /* Update the LRU_old_len field if necessary */
+ if (bpage->old) {
+ buf_pool.LRU_old_len--;
+ }
+
+ /* Adjust the length of the old block list if necessary */
+ buf_LRU_old_adjust_len();
+}
+
+/******************************************************************//**
+Adds a block to the LRU list of decompressed zip pages. */
+void
+buf_unzip_LRU_add_block(
+/*====================*/
+ buf_block_t* block, /*!< in: control block */
+ ibool old) /*!< in: TRUE if should be put to the end
+ of the list, else put to the start */
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(block->page.belongs_to_unzip_LRU());
+ ut_ad(!block->in_unzip_LRU_list);
+ ut_d(block->in_unzip_LRU_list = true);
+
+ if (old) {
+ UT_LIST_ADD_LAST(buf_pool.unzip_LRU, block);
+ } else {
+ UT_LIST_ADD_FIRST(buf_pool.unzip_LRU, block);
+ }
+}
+
+/******************************************************************//**
+Adds a block to the LRU list. Make sure that the page_size is
+already set when invoking this function, so that the correct
+page_size can be obtained from the buffer page when adding the
+block to the LRU list. */
+void
+buf_LRU_add_block(
+ buf_page_t* bpage, /*!< in: control block */
+ bool old) /*!< in: true if should be put to the old blocks
+ in the LRU list, else put to the start; if the
+ LRU list is very short, the block is added to
+ the start, regardless of this parameter */
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(!bpage->in_LRU_list);
+
+ if (!old || (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN)) {
+
+ UT_LIST_ADD_FIRST(buf_pool.LRU, bpage);
+
+ bpage->freed_page_clock = buf_pool.freed_page_clock
+ & ((1U << 31) - 1);
+ } else {
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool.LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool.LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)
+ || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)
+ || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+ UT_LIST_INSERT_AFTER(buf_pool.LRU, buf_pool.LRU_old,
+ bpage);
+
+ buf_pool.LRU_old_len++;
+ }
+
+ ut_d(bpage->in_LRU_list = TRUE);
+
+ incr_LRU_size_in_bytes(bpage);
+
+ if (UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_OLD_MIN_LEN) {
+
+ ut_ad(buf_pool.LRU_old);
+
+ /* Adjust the length of the old block list if necessary */
+
+ bpage->set_old(old);
+ buf_LRU_old_adjust_len();
+
+ } else if (UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN) {
+
+ /* The LRU list is now long enough for LRU_old to become
+ defined: init it */
+
+ buf_LRU_old_init();
+ } else {
+ bpage->set_old(buf_pool.LRU_old != NULL);
+ }
+
+ /* If this is a zipped block with decompressed frame as well
+ then put it on the unzip_LRU list */
+ if (bpage->belongs_to_unzip_LRU()) {
+ buf_unzip_LRU_add_block((buf_block_t*) bpage, old);
+ }
+}
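+
+/* Editor's note (assumes the default innodb_old_blocks_pct = 37, which
+is not visible in this file): a newly read page inserted with old=true
+lands right after buf_pool.LRU_old, i.e. at the head of the "old"
+sublist that covers roughly the last 37% of the LRU list. It is moved
+to the "new" part only on a later access, subject to the
+buf_LRU_old_threshold_ms heuristic above. */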
+
+/** Move a block to the start of the LRU list. */
+void buf_page_make_young(buf_page_t *bpage)
+{
+ ut_ad(bpage->in_file());
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ if (UNIV_UNLIKELY(bpage->old))
+ buf_pool.stat.n_pages_made_young++;
+
+ buf_LRU_remove_block(bpage);
+ buf_LRU_add_block(bpage, false);
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/** Try to free a block. If bpage is a descriptor of a compressed-only
+ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well.
+The caller must hold buf_pool.mutex.
+@param bpage block to be freed
+@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page
+@retval true if freed and buf_pool.mutex may have been temporarily released
+@retval false if the page was not freed */
+bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
+{
+ const page_id_t id(bpage->id());
+ buf_page_t* b = nullptr;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(bpage->in_file());
+ ut_ad(bpage->in_LRU_list);
+
+ /* First, perform a quick check before we acquire hash_lock. */
+ if (!bpage->can_relocate()) {
+ return false;
+ }
+
+ /* We must hold an exclusive hash_lock to prevent
+ bpage->can_relocate() from changing due to a concurrent
+ execution of buf_page_get_low(). */
+ const ulint fold = id.fold();
+ page_hash_latch* hash_lock = buf_pool.page_hash.lock_get(fold);
+ hash_lock->write_lock();
+ lsn_t oldest_modification = bpage->oldest_modification_acquire();
+
+ if (UNIV_UNLIKELY(!bpage->can_relocate())) {
+ /* Do not free buffer fixed and I/O-fixed blocks. */
+ goto func_exit;
+ }
+
+ if (oldest_modification == 1) {
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ oldest_modification = bpage->oldest_modification();
+ if (oldest_modification) {
+ ut_ad(oldest_modification == 1);
+ buf_pool.delete_from_flush_list(bpage);
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ ut_ad(!bpage->oldest_modification());
+ oldest_modification = 0;
+ }
+
+ if (zip || !bpage->zip.data) {
+ /* This would completely free the block. */
+ /* Do not completely free dirty blocks. */
+
+ if (oldest_modification) {
+ goto func_exit;
+ }
+ } else if (oldest_modification
+ && bpage->state() != BUF_BLOCK_FILE_PAGE) {
+func_exit:
+ hash_lock->write_unlock();
+ return(false);
+
+ } else if (bpage->state() == BUF_BLOCK_FILE_PAGE) {
+ b = buf_page_alloc_descriptor();
+ ut_a(b);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ new (b) buf_page_t(*bpage);
+ b->set_state(BUF_BLOCK_ZIP_PAGE);
+ }
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(bpage->in_file());
+ ut_ad(bpage->in_LRU_list);
+
+ DBUG_PRINT("ib_buf", ("free page %u:%u",
+ id.space(), id.page_no()));
+
+ ut_ad(bpage->can_relocate());
+
+ if (!buf_LRU_block_remove_hashed(bpage, id, hash_lock, zip)) {
+ ut_ad(!b);
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+ return(true);
+ }
+
+ /* We have just freed a BUF_BLOCK_FILE_PAGE. If b != nullptr
+ then it was a compressed page with an uncompressed frame and
+ we are interested in freeing only the uncompressed frame.
+ Therefore we have to reinsert the compressed page descriptor
+ into the LRU and page_hash (and possibly flush_list).
+ If b == nullptr, then it was a regular page that has been freed. */
+
+ if (UNIV_LIKELY_NULL(b)) {
+ buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
+
+ ut_ad(!buf_pool.page_hash_get_low(id, fold));
+ ut_ad(b->zip_size());
+
+ /* The field in_LRU_list of
+ the to-be-freed block descriptor should have
+ been cleared in
+ buf_LRU_block_remove_hashed(), which
+ invokes buf_LRU_remove_block(). */
+ ut_ad(!bpage->in_LRU_list);
+
+ /* bpage->state was BUF_BLOCK_FILE_PAGE because
+ b != nullptr. The type cast below is thus valid. */
+ ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
+
+ /* The fields of bpage were copied to b before
+ buf_LRU_block_remove_hashed() was invoked. */
+ ut_ad(!b->in_zip_hash);
+ ut_ad(b->in_LRU_list);
+ ut_ad(b->in_page_hash);
+
+ HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, b);
+
+ /* Insert b where bpage was in the LRU list. */
+ if (prev_b) {
+ ulint lru_len;
+
+ ut_ad(prev_b->in_LRU_list);
+ ut_ad(prev_b->in_file());
+
+ UT_LIST_INSERT_AFTER(buf_pool.LRU, prev_b, b);
+
+ incr_LRU_size_in_bytes(b);
+
+ if (b->is_old()) {
+ buf_pool.LRU_old_len++;
+ if (buf_pool.LRU_old
+ == UT_LIST_GET_NEXT(LRU, b)) {
+
+ buf_pool.LRU_old = b;
+ }
+ }
+
+ lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+ if (lru_len > BUF_LRU_OLD_MIN_LEN) {
+ ut_ad(buf_pool.LRU_old);
+ /* Adjust the length of the
+ old block list if necessary */
+ buf_LRU_old_adjust_len();
+ } else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
+ /* The LRU list is now long
+ enough for LRU_old to become
+ defined: init it */
+ buf_LRU_old_init();
+ }
+#ifdef UNIV_LRU_DEBUG
+ /* Check that the "old" flag is consistent
+ in the block and its neighbours. */
+ b->set_old(b->is_old());
+#endif /* UNIV_LRU_DEBUG */
+ } else {
+ ut_d(b->in_LRU_list = FALSE);
+ buf_LRU_add_block(b, b->old);
+ }
+
+ buf_flush_relocate_on_flush_list(bpage, b);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ bpage->zip.data = nullptr;
+
+ page_zip_set_size(&bpage->zip, 0);
+
+ /* Prevent buf_page_get_gen() from
+ decompressing the block while we release
+ hash_lock. */
+ b->set_io_fix(BUF_IO_PIN);
+ hash_lock->write_unlock();
+ } else if (!zip) {
+ hash_lock->write_unlock();
+ }
+
+ buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (block->index) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ /* Remove the adaptive hash index on the page.
+ The page was declared uninitialized by
+ buf_LRU_block_remove_hashed(). We need to flag
+ the contents of the page valid (which it still is) in
+ order to avoid bogus Valgrind or MSAN warnings.*/
+
+ MEM_MAKE_DEFINED(block->frame, srv_page_size);
+ btr_search_drop_page_hash_index(block);
+ MEM_UNDEFINED(block->frame, srv_page_size);
+
+ if (UNIV_LIKELY_NULL(b)) {
+ ut_ad(b->zip_size());
+ b->io_unfix();
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ } else
+#endif
+ if (UNIV_LIKELY_NULL(b)) {
+ ut_ad(b->zip_size());
+ b->io_unfix();
+ }
+
+ buf_LRU_block_free_hashed_page(block);
+
+ return(true);
+}
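+
+/* Illustrative caller pattern for buf_LRU_free_page() (editor's sketch,
+not a verbatim call site from this tree):
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ if (buf_LRU_free_page(bpage, true)) {
+ // freed; buf_pool.mutex is still held on return, but it may
+ // have been released and reacquired in between, so LRU list
+ // iterators must be revalidated (cf. buf_pool.lru_scan_itr)
+ }
+ mysql_mutex_unlock(&buf_pool.mutex);
+*/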
+
+/******************************************************************//**
+Puts a block back to the free list. */
+void
+buf_LRU_block_free_non_file_page(
+/*=============================*/
+ buf_block_t* block) /*!< in: block, must not contain a file page */
+{
+ void* data;
+
+ ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
+ assert_block_ahi_empty(block);
+ ut_ad(!block->page.in_free_list);
+ ut_ad(!block->page.oldest_modification());
+ ut_ad(!block->page.in_LRU_list);
+
+ block->page.set_state(BUF_BLOCK_NOT_USED);
+
+ MEM_UNDEFINED(block->frame, srv_page_size);
+ /* Wipe page_no and space_id */
+ static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+ memset_aligned<4>(block->frame + FIL_PAGE_OFFSET, 0xfe, 4);
+ static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+ "not perfect alignment");
+ memset_aligned<2>(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ 0xfe, 4);
+ data = block->page.zip.data;
+
+ if (data != NULL) {
+ block->page.zip.data = NULL;
+ buf_pool_mutex_exit_forbid();
+
+ ut_ad(block->zip_size());
+
+ buf_buddy_free(data, block->zip_size());
+
+ buf_pool_mutex_exit_allow();
+ page_zip_set_size(&block->page.zip, 0);
+ }
+
+ if (buf_pool.curr_size < buf_pool.old_size
+ && UT_LIST_GET_LEN(buf_pool.withdraw) < buf_pool.withdraw_target
+ && buf_pool.will_be_withdrawn(block->page)) {
+ /* This should be withdrawn */
+ UT_LIST_ADD_LAST(
+ buf_pool.withdraw,
+ &block->page);
+ ut_d(block->in_withdraw_list = true);
+ } else {
+ UT_LIST_ADD_FIRST(buf_pool.free, &block->page);
+ ut_d(block->page.in_free_list = true);
+ pthread_cond_signal(&buf_pool.done_free);
+ }
+
+ MEM_NOACCESS(block->frame, srv_page_size);
+}
+
+/** Release a memory block to the buffer pool. */
+ATTRIBUTE_COLD void buf_pool_t::free_block(buf_block_t *block)
+{
+ ut_ad(this == &buf_pool);
+ mysql_mutex_lock(&mutex);
+ buf_LRU_block_free_non_file_page(block);
+ mysql_mutex_unlock(&mutex);
+}
+
+
+/** Remove bpage from buf_pool.LRU and buf_pool.page_hash.
+
+If bpage->state() == BUF_BLOCK_ZIP_PAGE && !bpage->oldest_modification(),
+the object will be freed.
+
+@param bpage buffer block
+@param id page identifier
+@param hash_lock buf_pool.page_hash latch (will be released here)
+@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
+
+If a compressed page is freed other compressed pages may be relocated.
+@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
+static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
+ page_hash_latch *hash_lock, bool zip)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(hash_lock->is_write_locked());
+
+ ut_a(bpage->io_fix() == BUF_IO_NONE);
+ ut_a(!bpage->buf_fix_count());
+
+ buf_LRU_remove_block(bpage);
+
+ buf_pool.freed_page_clock += 1;
+
+ switch (bpage->state()) {
+ case BUF_BLOCK_FILE_PAGE:
+ MEM_CHECK_ADDRESSABLE(bpage, sizeof(buf_block_t));
+ MEM_CHECK_ADDRESSABLE(((buf_block_t*) bpage)->frame,
+ srv_page_size);
+ buf_block_modify_clock_inc((buf_block_t*) bpage);
+ if (bpage->zip.data) {
+ const page_t* page = ((buf_block_t*) bpage)->frame;
+
+ ut_a(!zip || !bpage->oldest_modification());
+ ut_ad(bpage->zip_size());
+
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ /* These are essentially uncompressed pages. */
+ if (!zip) {
+ /* InnoDB writes the data to the
+ uncompressed page frame. Copy it
+ to the compressed page, which will
+ be preserved. */
+ memcpy(bpage->zip.data, page,
+ bpage->zip_size());
+ }
+ break;
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ break;
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+#if defined UNIV_ZIP_DEBUG && defined BTR_CUR_HASH_ADAPT
+ /* During recovery, we only update the
+ compressed page, not the uncompressed one. */
+ ut_a(recv_recovery_is_on()
+ || page_zip_validate(
+ &bpage->zip, page,
+ ((buf_block_t*) bpage)->index));
+#endif /* UNIV_ZIP_DEBUG && BTR_CUR_HASH_ADAPT */
+ break;
+ default:
+ ib::error() << "The compressed page to be"
+ " evicted seems corrupt:";
+ ut_print_buf(stderr, page, srv_page_size);
+
+ ib::error() << "Possibly older version of"
+ " the page:";
+
+ ut_print_buf(stderr, bpage->zip.data,
+ bpage->zip_size());
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ break;
+ }
+ /* fall through */
+ case BUF_BLOCK_ZIP_PAGE:
+ ut_a(!bpage->oldest_modification());
+ MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size());
+ break;
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+
+ ut_ad(!bpage->in_zip_hash);
+ HASH_DELETE(buf_page_t, hash, &buf_pool.page_hash, id.fold(), bpage);
+
+ switch (bpage->state()) {
+ case BUF_BLOCK_ZIP_PAGE:
+ ut_ad(!bpage->in_free_list);
+ ut_ad(!bpage->in_LRU_list);
+ ut_a(bpage->zip.data);
+ ut_a(bpage->zip.ssize);
+ ut_ad(!bpage->oldest_modification());
+
+ hash_lock->write_unlock();
+ buf_pool_mutex_exit_forbid();
+
+ buf_buddy_free(bpage->zip.data, bpage->zip_size());
+
+ buf_pool_mutex_exit_allow();
+ buf_page_free_descriptor(bpage);
+ return(false);
+
+ case BUF_BLOCK_FILE_PAGE:
+ static_assert(FIL_NULL == 0xffffffffU, "fill pattern");
+ static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+ memset_aligned<4>(reinterpret_cast<buf_block_t*>(bpage)->frame
+ + FIL_PAGE_OFFSET, 0xff, 4);
+ static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+ "not perfect alignment");
+ memset_aligned<2>(reinterpret_cast<buf_block_t*>(bpage)->frame
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+ MEM_UNDEFINED(((buf_block_t*) bpage)->frame, srv_page_size);
+ bpage->set_state(BUF_BLOCK_REMOVE_HASH);
+
+ if (!zip) {
+ return true;
+ }
+
+ /* Question: If we release hash_lock here
+ then what protects us against:
+ 1) Some other thread buffer fixing this page
+ 2) Some other thread trying to read this page and
+ not finding it in buffer pool attempting to read it
+ from the disk.
+ Answer:
+ 1) Cannot happen, because the page is no longer in the
+ page_hash. The only possibility would be that while
+ invalidating a tablespace we buffer-fix the prev_page in
+ the LRU to avoid relocation during the scan. But that is
+ not possible, because we are holding the buf_pool mutex.
+
+ 2) Not possible, because in buf_page_init_for_read()
+ we look up the page_hash while holding the buf_pool
+ mutex. Since we are holding the buf_pool mutex here,
+ by the time we release it in the caller we will have
+ inserted the compressed-only descriptor into the
+ page_hash. */
+ hash_lock->write_unlock();
+
+ if (bpage->zip.data) {
+ /* Free the compressed page. */
+ void* data = bpage->zip.data;
+ bpage->zip.data = NULL;
+
+ ut_ad(!bpage->in_free_list);
+ ut_ad(!bpage->oldest_modification());
+ ut_ad(!bpage->in_LRU_list);
+ buf_pool_mutex_exit_forbid();
+
+ buf_buddy_free(data, bpage->zip_size());
+
+ buf_pool_mutex_exit_allow();
+
+ page_zip_set_size(&bpage->zip, 0);
+ }
+
+ return(true);
+
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ }
+
+ ut_error;
+ return(false);
+}
+
+/** Remove one page from LRU list and put it to free list.
+@param bpage file page to be freed
+@param id page identifier
+@param hash_lock buf_pool.page_hash latch (will be released here) */
+void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
+ page_hash_latch *hash_lock)
+{
+ while (bpage->buf_fix_count())
+ /* Wait for other threads to release the fix count
+ before releasing bpage from the LRU list. */
+ (void) LF_BACKOFF();
+
+ if (buf_LRU_block_remove_hashed(bpage, id, hash_lock, true))
+ buf_LRU_block_free_hashed_page(reinterpret_cast<buf_block_t*>(bpage));
+}
+
+/** Update buf_pool.LRU_old_ratio.
+@param[in] old_pct Reserve this percentage of
+ the buffer pool for "old" blocks
+@param[in] adjust true=adjust the LRU list;
+ false=just assign buf_pool.LRU_old_ratio
+ during the initialization of InnoDB
+@return updated old_pct */
+uint buf_LRU_old_ratio_update(uint old_pct, bool adjust)
+{
+ uint ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100;
+ if (ratio < BUF_LRU_OLD_RATIO_MIN) {
+ ratio = BUF_LRU_OLD_RATIO_MIN;
+ } else if (ratio > BUF_LRU_OLD_RATIO_MAX) {
+ ratio = BUF_LRU_OLD_RATIO_MAX;
+ }
+
+ if (adjust) {
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ if (ratio != buf_pool.LRU_old_ratio) {
+ buf_pool.LRU_old_ratio = ratio;
+
+ if (UT_LIST_GET_LEN(buf_pool.LRU)
+ >= BUF_LRU_OLD_MIN_LEN) {
+ buf_LRU_old_adjust_len();
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ } else {
+ buf_pool.LRU_old_ratio = ratio;
+ }
+ /* the reverse of
+ ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */
+ return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5));
+}
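+
+/* Round-trip example for the conversion above (editor's sketch,
+assuming BUF_LRU_OLD_RATIO_DIV = 512): old_pct = 37 gives
+ratio = 37 * 512 / 100 = 189, and converting back,
+(uint)(189 * 100 / 512.0 + 0.5) = (uint)(36.9 + 0.5) = 37, so the
+reported percentage matches the requested one despite the integer
+rounding in between. */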
+
+/********************************************************************//**
+Update the historical stats that we are collecting for LRU eviction
+policy at the end of each interval. */
+void
+buf_LRU_stat_update()
+{
+ buf_LRU_stat_t* item;
+ buf_LRU_stat_t cur_stat;
+
+ if (!buf_pool.freed_page_clock) {
+ goto func_exit;
+ }
+
+ /* Update the index. */
+ item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind];
+ buf_LRU_stat_arr_ind++;
+ buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL;
+
+ /* Add the current value and subtract the obsolete entry.
+ Since buf_LRU_stat_cur is not protected by any mutex,
+ it can be changing between adding to buf_LRU_stat_sum
+ and copying to item. Assign it to local variables to make
+ sure the same value assign to the buf_LRU_stat_sum
+ and item */
+ cur_stat = buf_LRU_stat_cur;
+
+ buf_LRU_stat_sum.io += cur_stat.io - item->io;
+ buf_LRU_stat_sum.unzip += cur_stat.unzip - item->unzip;
+
+ /* Put current entry in the array. */
+ memcpy(item, &cur_stat, sizeof *item);
+
+func_exit:
+ /* Clear the current entry. */
+ memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur);
+}
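+
+/* Editor's note on the sliding window above: buf_LRU_stat_sum always
+holds the sum of the last BUF_LRU_STAT_N_INTERVAL (= 4) samples. For
+example, with stored io samples [10, 20, 30, 40] (sum 100), a new
+sample of 50 replacing the oldest entry 10 updates the sum to
+100 + 50 - 10 = 140 without rescanning the array. */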
+
+#ifdef UNIV_DEBUG
+/** Validate the LRU list. */
+void buf_LRU_validate()
+{
+ ulint old_len;
+ ulint new_len;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ if (UT_LIST_GET_LEN(buf_pool.LRU) >= BUF_LRU_OLD_MIN_LEN) {
+
+ ut_a(buf_pool.LRU_old);
+ old_len = buf_pool.LRU_old_len;
+
+ new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU)
+ * buf_pool.LRU_old_ratio
+ / BUF_LRU_OLD_RATIO_DIV,
+ UT_LIST_GET_LEN(buf_pool.LRU)
+ - (BUF_LRU_OLD_TOLERANCE
+ + BUF_LRU_NON_OLD_MIN_LEN));
+
+ ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE);
+ ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE);
+ }
+
+ CheckInLRUList::validate();
+
+ old_len = 0;
+
+ for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+
+ switch (bpage->state()) {
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ ut_ad(reinterpret_cast<buf_block_t*>(bpage)
+ ->in_unzip_LRU_list
+ == bpage->belongs_to_unzip_LRU());
+ case BUF_BLOCK_ZIP_PAGE:
+ break;
+ }
+
+ if (bpage->is_old()) {
+ const buf_page_t* prev
+ = UT_LIST_GET_PREV(LRU, bpage);
+ const buf_page_t* next
+ = UT_LIST_GET_NEXT(LRU, bpage);
+
+ if (!old_len++) {
+ ut_a(buf_pool.LRU_old == bpage);
+ } else {
+ ut_a(!prev || prev->is_old());
+ }
+
+ ut_a(!next || next->is_old());
+ }
+ }
+
+ ut_a(buf_pool.LRU_old_len == old_len);
+
+ CheckInFreeList::validate();
+
+ for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.free);
+ bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(list, bpage)) {
+
+ ut_a(bpage->state() == BUF_BLOCK_NOT_USED);
+ }
+
+ CheckUnzipLRUAndLRUList::validate();
+
+ for (buf_block_t* block = UT_LIST_GET_FIRST(buf_pool.unzip_LRU);
+ block != NULL;
+ block = UT_LIST_GET_NEXT(unzip_LRU, block)) {
+
+ ut_ad(block->in_unzip_LRU_list);
+ ut_ad(block->page.in_LRU_list);
+ ut_a(block->page.belongs_to_unzip_LRU());
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+#endif /* UNIV_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Dump the LRU list to stderr. */
+void buf_LRU_print()
+{
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+ const page_id_t id(bpage->id());
+
+ fprintf(stderr, "BLOCK space %u page %u ",
+ id.space(), id.page_no());
+
+ if (bpage->is_old()) {
+ fputs("old ", stderr);
+ }
+
+ if (const uint32_t buf_fix_count = bpage->buf_fix_count()) {
+ fprintf(stderr, "buffix count %u ", buf_fix_count);
+ }
+
+ if (const auto io_fix = bpage->io_fix()) {
+ fprintf(stderr, "io_fix %d ", io_fix);
+ }
+
+ if (bpage->oldest_modification()) {
+ fputs("modif. ", stderr);
+ }
+
+ switch (const auto state = bpage->state()) {
+ const byte* frame;
+ case BUF_BLOCK_FILE_PAGE:
+ frame = buf_block_get_frame((buf_block_t*) bpage);
+ fprintf(stderr, "\ntype %u index id " IB_ID_FMT "\n",
+ fil_page_get_type(frame),
+ btr_page_get_index_id(frame));
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ frame = bpage->zip.data;
+ fprintf(stderr, "\ntype %u size " ULINTPF
+ " index id " IB_ID_FMT "\n",
+ fil_page_get_type(frame),
+ bpage->zip_size(),
+ btr_page_get_index_id(frame));
+ break;
+
+ default:
+ fprintf(stderr, "\n!state %d!\n", state);
+ break;
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
new file mode 100644
index 00000000..253a2542
--- /dev/null
+++ b/storage/innobase/buf/buf0rea.cc
@@ -0,0 +1,785 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0rea.cc
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <mysql/service_thd_wait.h>
+
+#include "buf0rea.h"
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "buf0buddy.h"
+#include "buf0dblwr.h"
+#include "ibuf0ibuf.h"
+#include "log0recv.h"
+#include "trx0sys.h"
+#include "os0file.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+
+/** If the number of pending reads exceeds buf_pool.curr_size divided by
+the value below, read-ahead is not done: this is to prevent flooding the
+buffer pool with i/o-fixed buffer blocks */
+#define BUF_READ_AHEAD_PEND_LIMIT 2
+
+/** Remove the sentinel block for the watch before replacing it with a
+real block. watch_unset() or watch_occurred() will notice
+that the block has been replaced with the real block.
+@param watch sentinel */
+inline void buf_pool_t::watch_remove(buf_page_t *watch)
+{
+ ut_ad(hash_lock_get(watch->id())->is_write_locked());
+ ut_a(watch_is_sentinel(*watch));
+ if (watch->buf_fix_count())
+ {
+ ut_ad(watch->in_page_hash);
+ ut_d(watch->in_page_hash= false);
+ HASH_DELETE(buf_page_t, hash, &page_hash, watch->id().fold(), watch);
+ watch->set_buf_fix_count(0);
+ }
+ ut_ad(!watch->in_page_hash);
+ watch->set_state(BUF_BLOCK_NOT_USED);
+ watch->id_= page_id_t(~0ULL);
+}
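
A minimal standalone model of the sentinel replacement that
buf_page_init_for_read() performs below: a watch entry holds a reference
count while no real page descriptor exists, and that count must be carried
over when the real page arrives. PageStub and page_hash here are invented
for illustration; they are not the real buf_page_t or page hash.

#include <cassert>
#include <cstdint>
#include <unordered_map>

struct PageStub { uint32_t buf_fix_count; bool is_sentinel; };

int main()
{
  std::unordered_map<uint64_t, PageStub> page_hash;
  page_hash[42] = {3, true};          // watch sentinel holding 3 references

  PageStub real{0, false};            // descriptor for the page being read
  real.buf_fix_count += page_hash[42].buf_fix_count; // preserve references
  page_hash[42] = real;               // sentinel replaced by the real page

  assert(page_hash[42].buf_fix_count == 3);
  assert(!page_hash[42].is_sentinel);
  return 0;
}
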
+
+/** Initialize a page for read to the buffer buf_pool. If the page is
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later.
+@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ...
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] unzip whether the uncompressed page is
+ requested (for ROW_FORMAT=COMPRESSED)
+@return pointer to the block
+@retval NULL in case of an error */
+static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
+ ulint zip_size, bool unzip)
+{
+ mtr_t mtr;
+
+ if (mode == BUF_READ_IBUF_PAGES_ONLY)
+ {
+ /* It is a read-ahead within an ibuf routine */
+ ut_ad(!ibuf_bitmap_page(page_id, zip_size));
+ ibuf_mtr_start(&mtr);
+
+ if (!recv_no_ibuf_operations && !ibuf_page(page_id, zip_size, &mtr))
+ {
+ ibuf_mtr_commit(&mtr);
+ return nullptr;
+ }
+ }
+ else
+ ut_ad(mode == BUF_READ_ANY_PAGE);
+
+ buf_page_t *bpage= nullptr;
+ buf_block_t *block= nullptr;
+ if (!zip_size || unzip || recv_recovery_is_on())
+ {
+ block= buf_LRU_get_free_block(false);
+ block->initialise(page_id, zip_size);
+ /* We set a pass-type x-lock on the frame because then
+ the same thread which called for the read operation
+ (and is running now at this point of code) can wait
+ for the read to complete by waiting for the x-lock on
+ the frame; if the x-lock were recursive, the same
+ thread would illegally get the x-lock before the page
+ read is completed. The x-lock will be released
+ in buf_page_read_complete() by the io-handler thread. */
+ rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
+ }
+
+ const ulint fold= page_id.fold();
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ /* We must acquire hash_lock this early to prevent
+ a race condition with buf_pool_t::watch_remove() */
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+ hash_lock->write_lock();
+
+ buf_page_t *hash_page= buf_pool.page_hash_get_low(page_id, fold);
+ if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
+ {
+ /* The page is already in the buffer pool. */
+ hash_lock->write_unlock();
+ if (block)
+ {
+ rw_lock_x_unlock_gen(&block->lock, BUF_IO_READ);
+ buf_LRU_block_free_non_file_page(block);
+ }
+ goto func_exit;
+ }
+
+ if (UNIV_LIKELY(block != nullptr))
+ {
+ bpage= &block->page;
+
+ /* Insert into the hash table of file pages */
+ if (hash_page)
+ {
+ /* Preserve the reference count. */
+ auto buf_fix_count= hash_page->buf_fix_count();
+ ut_a(buf_fix_count > 0);
+ block->page.add_buf_fix_count(buf_fix_count);
+ buf_pool.watch_remove(hash_page);
+ }
+
+ block->page.set_io_fix(BUF_IO_READ);
+ block->page.set_state(BUF_BLOCK_FILE_PAGE);
+ ut_ad(!block->page.in_page_hash);
+ ut_d(block->page.in_page_hash= true);
+ HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
+ hash_lock->write_unlock();
+
+ /* The block must be put to the LRU list, to the old blocks */
+ buf_LRU_add_block(bpage, true/* to old blocks */);
+
+ if (UNIV_UNLIKELY(zip_size))
+ {
+ /* buf_pool.mutex may be released and reacquired by
+ buf_buddy_alloc(). We must defer this operation until after the
+ block descriptor has been added to buf_pool.LRU and
+ buf_pool.page_hash. */
+ block->page.zip.data= static_cast<page_zip_t*>
+ (buf_buddy_alloc(zip_size));
+
+ /* To maintain the invariant
+ block->in_unzip_LRU_list == block->page.belongs_to_unzip_LRU()
+ we have to add this block to unzip_LRU
+ after block->page.zip.data is set. */
+ ut_ad(block->page.belongs_to_unzip_LRU());
+ buf_unzip_LRU_add_block(block, TRUE);
+ }
+ }
+ else
+ {
+ hash_lock->write_unlock();
+
+ /* The compressed page must be allocated before the
+ control block (bpage), in order to avoid the
+ invocation of buf_buddy_relocate_block() on
+ uninitialized data. */
+ bool lru= false;
+ void *data= buf_buddy_alloc(zip_size, &lru);
+
+ hash_lock->write_lock();
+
+ /* If buf_buddy_alloc() allocated storage from the LRU list,
+ it released and reacquired buf_pool.mutex. Thus, we must
+ check the page_hash again, as it may have been modified. */
+ if (UNIV_UNLIKELY(lru))
+ {
+ hash_page= buf_pool.page_hash_get_low(page_id, fold);
+
+ if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
+ {
+ /* The block was added by some other thread. */
+ hash_lock->write_unlock();
+ buf_buddy_free(data, zip_size);
+ goto func_exit;
+ }
+ }
+
+ bpage= buf_page_alloc_descriptor();
+
+ page_zip_des_init(&bpage->zip);
+ page_zip_set_size(&bpage->zip, zip_size);
+ bpage->zip.data = (page_zip_t*) data;
+
+ bpage->init(BUF_BLOCK_ZIP_PAGE, page_id);
+
+ if (hash_page)
+ {
+ /* Preserve the reference count. It can be 0 if
+ buf_pool_t::watch_unset() is executing concurrently,
+ waiting for buf_pool.mutex, which we are holding. */
+ bpage->add_buf_fix_count(hash_page->buf_fix_count());
+ buf_pool.watch_remove(hash_page);
+ }
+
+ ut_ad(!bpage->in_page_hash);
+ ut_d(bpage->in_page_hash= true);
+ HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
+ bpage->set_io_fix(BUF_IO_READ);
+ hash_lock->write_unlock();
+
+ /* The block must be put to the LRU list, to the old blocks.
+ The zip size is already set into the page zip */
+ buf_LRU_add_block(bpage, true/* to old blocks */);
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_pool.n_pend_reads++;
+ goto func_exit_no_mutex;
+func_exit:
+ mysql_mutex_unlock(&buf_pool.mutex);
+func_exit_no_mutex:
+ if (mode == BUF_READ_IBUF_PAGES_ONLY)
+ ibuf_mtr_commit(&mtr);
+
+ ut_ad(!bpage || bpage->in_file());
+
+ return bpage;
+}
+
+/** Low-level function which reads a page asynchronously from a file into
+the buffer pool if it is not already there; if it is already there, the
+function does nothing.
+Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
+flag is cleared and the x-lock released by an i/o-handler thread.
+
+@param[out]	err		DB_SUCCESS or DB_TABLESPACE_DELETED
+				if we are trying to read from a
+				non-existent tablespace
+@param[in,out] space tablespace
+@param[in] sync true if synchronous aio is desired
+@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ...,
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] unzip true=request uncompressed page
+@return whether a read request was queued */
+static
+bool
+buf_read_page_low(
+ dberr_t* err,
+ fil_space_t* space,
+ bool sync,
+ ulint mode,
+ const page_id_t page_id,
+ ulint zip_size,
+ bool unzip)
+{
+ buf_page_t* bpage;
+
+ *err = DB_SUCCESS;
+
+ if (buf_dblwr.is_inside(page_id)) {
+ ib::error() << "Trying to read doublewrite buffer page "
+ << page_id;
+ ut_ad(0);
+nothing_read:
+ space->release();
+ return false;
+ }
+
+ if (sync) {
+ } else if (trx_sys_hdr_page(page_id)
+ || ibuf_bitmap_page(page_id, zip_size)
+ || (!recv_no_ibuf_operations
+ && ibuf_page(page_id, zip_size, nullptr))) {
+
+ /* Trx sys header is so low in the latching order that we play
+ safe and do not leave the i/o-completion to an asynchronous
+ i/o-thread. Change buffer pages must always be read with
+		synchronous i/o, to make sure they do not get involved in
+ thread deadlocks. */
+ sync = true;
+ }
+
+ /* The following call will also check if the tablespace does not exist
+ or is being dropped; if we succeed in initing the page in the buffer
+ pool for read, then DISCARD cannot proceed until the read has
+ completed */
+ bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip);
+
+ if (bpage == NULL) {
+ goto nothing_read;
+ }
+
+ ut_ad(bpage->in_file());
+
+ if (sync) {
+ thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+ }
+
+ DBUG_LOG("ib_buf",
+ "read page " << page_id << " zip_size=" << zip_size
+ << " unzip=" << unzip << ',' << (sync ? "sync" : "async"));
+
+ void* dst;
+
+ if (zip_size) {
+ dst = bpage->zip.data;
+ } else {
+ ut_a(bpage->state() == BUF_BLOCK_FILE_PAGE);
+
+ dst = ((buf_block_t*) bpage)->frame;
+ }
+
+ const ulint len = zip_size ? zip_size : srv_page_size;
+
+ auto fio = space->io(IORequest(sync
+ ? IORequest::READ_SYNC
+ : IORequest::READ_ASYNC),
+ page_id.page_no() * len, len, dst, bpage);
+ *err= fio.err;
+
+ if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) {
+ if (!sync || fio.err == DB_TABLESPACE_DELETED) {
+ buf_pool.corrupted_evict(bpage);
+ return false;
+ }
+
+ ut_error;
+ }
+
+ if (sync) {
+ thd_wait_end(NULL);
+
+ /* The i/o was already completed in space->io() */
+ *err = buf_page_read_complete(bpage, *fio.node);
+ space->release();
+
+ if (*err != DB_SUCCESS) {
+ return false;
+ }
+ }
+
+ return true;
+}
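
The sync-forcing rule above reduces to a predicate on the page id: the
TRX_SYS header and change buffer pages are never handed to an asynchronous
i/o thread. A standalone sketch under simplified assumptions; the constants
and the one-bitmap-page-per-group layout are illustrative stand-ins for
trx_sys_hdr_page() and ibuf_bitmap_page(), not their exact logic.

#include <cstdint>
#include <iostream>

static const uint32_t TRX_SYS_SPACE_ID = 0;    // illustrative constants
static const uint32_t TRX_SYS_PAGE = 5;
static const uint32_t PAGES_PER_GROUP = 16384; // one bitmap page per group

static bool force_sync_read(uint32_t space, uint32_t page)
{
  const bool trx_sys_hdr = space == TRX_SYS_SPACE_ID && page == TRX_SYS_PAGE;
  const bool ibuf_bitmap = page % PAGES_PER_GROUP == 1;
  return trx_sys_hdr || ibuf_bitmap;
}

int main()
{
  std::cout << force_sync_read(0, 5)      // 1: TRX_SYS header page
            << force_sync_read(7, 16385)  // 1: a bitmap page
            << force_sync_read(7, 100)    // 0: ordinary page, async is fine
            << '\n';
}
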
+
+/** Applies a random read-ahead in buf_pool if there are at least a threshold
+value of accessed pages from the random read-ahead area. Does not read any
+page, not even the one at page_id, if the read-ahead
+mechanism is not activated. NOTE 1: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches! NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@param[in] page_id page id of a page which the current thread
+wants to access
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] ibuf whether we are inside ibuf routine
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value! */
+ulint
+buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
+{
+ if (!srv_random_read_ahead)
+ return 0;
+
+ if (srv_startup_is_before_trx_rollback_phase)
+ /* No read-ahead to avoid thread deadlocks */
+ return 0;
+
+ if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
+ /* If it is an ibuf bitmap page or trx sys hdr, we do no
+ read-ahead, as that could break the ibuf page access order */
+ return 0;
+
+ if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
+ return 0;
+
+ fil_space_t* space= fil_space_t::get(page_id.space());
+ if (!space)
+ return 0;
+
+ const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
+ ulint count= 5 + buf_read_ahead_area / 8;
+ const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
+ page_id_t high= low + buf_read_ahead_area;
+ high.set_page_no(std::min(high.page_no(), space->last_page_number()));
+
+ /* Count how many blocks in the area have been recently accessed,
+ that is, reside near the start of the LRU list. */
+
+ for (page_id_t i= low; i < high; ++i)
+ {
+ const ulint fold= i.fold();
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+ const buf_page_t *bpage= buf_pool.page_hash_get_low(i, fold);
+ bool found= bpage && bpage->is_accessed() && buf_page_peek_if_young(bpage);
+ hash_lock->read_unlock();
+ if (found && !--count)
+ goto read_ahead;
+ }
+
+no_read_ahead:
+ space->release();
+ return 0;
+
+read_ahead:
+ if (space->is_stopping())
+ goto no_read_ahead;
+
+ /* Read all the suitable blocks within the area */
+ const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
+
+ for (page_id_t i= low; i < high; ++i)
+ {
+ if (ibuf_bitmap_page(i, zip_size))
+ continue;
+ if (space->is_stopping())
+ break;
+ dberr_t err;
+ space->reacquire();
+ if (buf_read_page_low(&err, space, false, ibuf_mode, i, zip_size, false))
+ count++;
+ }
+
+ if (count)
+ DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
+ count, space->chain.start->name,
+ low.page_no()));
+ space->release();
+
+ /* Read ahead is considered one I/O operation for the purpose of
+ LRU policy decision. */
+ buf_LRU_stat_inc_io();
+
+ buf_pool.stat.n_ra_pages_read_rnd+= count;
+ srv_stats.buf_pool_reads.add(count);
+ return count;
+}
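
The window arithmetic used above deserves a worked example: the area is
aligned down to a multiple of buf_pool.read_ahead_area and its upper bound
is clipped to the last page of the tablespace. A sketch with plain integers
standing in for page_id_t; all values are illustrative.

#include <algorithm>
#include <cstdint>
#include <iostream>

int main()
{
  const uint32_t area = 64;        // buf_pool.read_ahead_area (illustrative)
  const uint32_t page_no = 1000;   // the page the caller wants
  const uint32_t last_page = 1000; // space->last_page_number()

  const uint32_t low = page_no - page_no % area;          // 960
  const uint32_t high = std::min(low + area, last_page);  // clipped to 1000
  std::cout << "scan pages [" << low << ", " << high << ")\n";
}
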
+
+/** High-level function which reads a page from a file to buf_pool
+if it is not already there. Sets the io_fix and an exclusive lock
+on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@retval DB_SUCCESS if the page was read and is not corrupted,
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
+@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
+after decryption normal page checksum does not match.
+@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
+dberr_t buf_read_page(const page_id_t page_id, ulint zip_size)
+{
+ fil_space_t *space= fil_space_t::get(page_id.space());
+ if (!space)
+ {
+ ib::info() << "trying to read page " << page_id
+               << " in a non-existing or being-dropped tablespace";
+ return DB_TABLESPACE_DELETED;
+ }
+
+ dberr_t err;
+ if (buf_read_page_low(&err, space, true, BUF_READ_ANY_PAGE,
+ page_id, zip_size, false))
+ srv_stats.buf_pool_reads.add(1);
+
+ buf_LRU_stat_inc_io();
+ return err;
+}
+
+/** High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param[in,out] space tablespace
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] sync true if synchronous aio is desired */
+void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
+ ulint zip_size, bool sync)
+{
+ dberr_t err;
+
+ if (buf_read_page_low(&err, space, sync, BUF_READ_ANY_PAGE,
+ page_id, zip_size, false)) {
+ srv_stats.buf_pool_reads.add(1);
+ }
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_ERROR:
+ break;
+ case DB_TABLESPACE_DELETED:
+ ib::info() << "trying to read page " << page_id
+ << " in the background"
+ " in a non-existing or being-dropped tablespace";
+ break;
+ case DB_PAGE_CORRUPTED:
+ case DB_DECRYPTION_FAILED:
+ ib::error()
+			<< "Background page read failed to "
+ "read or decrypt " << page_id;
+ break;
+ default:
+ ib::fatal() << "Error " << err << " in background read of "
+ << page_id;
+ }
+
+ /* We do not increment number of I/O operations used for LRU policy
+ here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
+ about evicting uncompressed version of compressed pages from the
+	buffer pool. Since this function is called from buffer pool load,
+	these I/Os are deliberate and not part of the normal workload, so we
+	can ignore them in our heuristics. */
+}
+
+/** Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+by page_id must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens, if these are not initialized to any
+sensible value? No problem, before applying read-ahead we check that the
+area to read is within the span of the space, if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+that is very improbable.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@param[in] page_id page id; see NOTE 3 above
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	ibuf		whether we are inside an ibuf routine
+@return number of page read requests issued */
+ulint
+buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
+{
+ /* check if readahead is disabled */
+ if (!srv_read_ahead_threshold)
+ return 0;
+
+ if (srv_startup_is_before_trx_rollback_phase)
+ /* No read-ahead to avoid thread deadlocks */
+ return 0;
+
+ if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
+ return 0;
+
+ const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
+ const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
+ const page_id_t high_1= low + (buf_read_ahead_area - 1);
+
+ /* We will check that almost all pages in the area have been accessed
+ in the desired order. */
+ const bool descending= page_id == low;
+
+ if (!descending && page_id != high_1)
+ /* This is not a border page of the area */
+ return 0;
+
+ if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
+ /* If it is an ibuf bitmap page or trx sys hdr, we do no
+ read-ahead, as that could break the ibuf page access order */
+ return 0;
+
+ fil_space_t *space= fil_space_t::get(page_id.space());
+ if (!space)
+ return 0;
+
+ if (high_1.page_no() > space->last_page_number())
+ {
+ /* The area is not whole. */
+fail:
+ space->release();
+ return 0;
+ }
+
+  /* How many out-of-order accessed pages can we ignore
+  when working out the access pattern for linear read-ahead */
+ ulint count= std::min<ulint>(buf_pool_t::READ_AHEAD_PAGES -
+ srv_read_ahead_threshold,
+ uint32_t{buf_pool.read_ahead_area});
+ page_id_t new_low= low, new_high_1= high_1;
+ unsigned prev_accessed= 0;
+ for (page_id_t i= low; i != high_1; ++i)
+ {
+ const ulint fold= i.fold();
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+ const buf_page_t* bpage= buf_pool.page_hash_get_low(i, fold);
+ if (i == page_id)
+ {
+ /* Read the natural predecessor and successor page addresses from
+ the page; NOTE that because the calling thread may have an x-latch
+ on the page, we do not acquire an s-latch on the page, this is to
+ prevent deadlocks. The hash_lock is only protecting the
+ buf_pool.page_hash for page i, not the bpage contents itself. */
+ if (!bpage)
+ {
+hard_fail:
+ hash_lock->read_unlock();
+ goto fail;
+ }
+ const byte *f;
+ switch (UNIV_EXPECT(bpage->state(), BUF_BLOCK_FILE_PAGE)) {
+ case BUF_BLOCK_FILE_PAGE:
+ f= reinterpret_cast<const buf_block_t*>(bpage)->frame;
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ f= bpage->zip.data;
+ break;
+ default:
+ goto hard_fail;
+ }
+
+ uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV));
+ uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT));
+ if (prev == FIL_NULL || next == FIL_NULL)
+ goto hard_fail;
+ page_id_t id= page_id;
+ if (descending && next - 1 == page_id.page_no())
+ id.set_page_no(prev);
+ else if (!descending && prev + 1 == page_id.page_no())
+ id.set_page_no(next);
+ else
+ goto hard_fail; /* Successor or predecessor not in the right order */
+
+ new_low= id - (id.page_no() % buf_read_ahead_area);
+ new_high_1= new_low + (buf_read_ahead_area - 1);
+
+ if (id != new_low && id != new_high_1)
+ /* This is not a border page of the area: return */
+ goto hard_fail;
+ if (new_high_1.page_no() > space->last_page_number())
+ /* The area is not whole */
+ goto hard_fail;
+ }
+ else if (!bpage)
+ {
+failed:
+ hash_lock->read_unlock();
+ if (--count)
+ continue;
+ goto fail;
+ }
+
+ const unsigned accessed= bpage->is_accessed();
+ if (!accessed)
+ goto failed;
+ /* Note that buf_page_t::is_accessed() returns the time of the
+ first access. If some blocks of the extent existed in the buffer
+ pool at the time of a linear access pattern, the first access
+ times may be nonmonotonic, even though the latest access times
+  were linear. The threshold (srv_read_ahead_threshold) should help a
+ little against this. */
+ bool fail= prev_accessed &&
+ (descending ? prev_accessed > accessed : prev_accessed < accessed);
+ prev_accessed= accessed;
+ if (fail)
+ goto failed;
+ hash_lock->read_unlock();
+ }
+
+ /* If we got this far, read-ahead can be sensible: do it */
+ count= 0;
+ for (ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
+ new_low != new_high_1; ++new_low)
+ {
+ if (ibuf_bitmap_page(new_low, zip_size))
+ continue;
+ if (space->is_stopping())
+ break;
+ dberr_t err;
+ space->reacquire();
+ count+= buf_read_page_low(&err, space, false, ibuf_mode, new_low, zip_size,
+ false);
+ }
+
+ if (count)
+    DBUG_PRINT("ib_buf", ("linear read-ahead %zu pages from %s: %u",
+ count, space->chain.start->name,
+ new_low.page_no()));
+ space->release();
+
+ /* Read ahead is considered one I/O operation for the purpose of
+ LRU policy decision. */
+ buf_LRU_stat_inc_io();
+
+ buf_pool.stat.n_ra_pages_read+= count;
+ return count;
+}
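
Linear read-ahead only ever fires from a border page of the aligned area,
as checked near the top of the function. A tiny standalone version of that
test, with plain integers modelling page numbers:

#include <cstdint>
#include <iostream>

static bool is_border(uint32_t page_no, uint32_t area)
{
  const uint32_t low = page_no - page_no % area;
  return page_no == low || page_no == low + area - 1;
}

int main()
{
  std::cout << is_border(960, 64)    // 1: first page of [960, 1023]
            << is_border(1023, 64)   // 1: last page of the area
            << is_border(1000, 64)   // 0: interior page, no read-ahead
            << '\n';
}
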
+
+/** Issues read requests for pages which recovery wants to read in.
+@param[in] space_id tablespace id
+@param[in] page_nos array of page numbers to read, with the
+highest page number the last in the array
+@param[in] n number of page numbers in the array */
+void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n)
+{
+ fil_space_t* space = fil_space_t::get(space_id);
+
+ if (!space) {
+ /* The tablespace is missing or unreadable: do nothing */
+ return;
+ }
+
+ const ulint zip_size = space->zip_size();
+
+ for (ulint i = 0; i < n; i++) {
+
+ /* Ignore if the page already present in freed ranges. */
+ if (space->freed_ranges.contains(page_nos[i])) {
+ continue;
+ }
+
+ const page_id_t cur_page_id(space_id, page_nos[i]);
+
+ ulint limit = 0;
+ for (ulint j = 0; j < buf_pool.n_chunks; j++) {
+ limit += buf_pool.chunks[j].size / 2;
+ }
+
+ for (ulint count = 0; buf_pool.n_pend_reads >= limit; ) {
+ os_thread_sleep(10000);
+
+ if (!(++count % 1000)) {
+
+ ib::error()
+ << "Waited for " << count / 100
+ << " seconds for "
+ << buf_pool.n_pend_reads
+ << " pending reads";
+ }
+ }
+
+ dberr_t err;
+ space->reacquire();
+ buf_read_page_low(&err, space, false,
+ BUF_READ_ANY_PAGE, cur_page_id, zip_size,
+ true);
+
+ if (err == DB_DECRYPTION_FAILED || err == DB_PAGE_CORRUPTED) {
+ ib::error() << "Recovery failed to read or decrypt "
+ << cur_page_id;
+ }
+ }
+
+	DBUG_PRINT("ib_buf", ("recovery read (" ULINTPF " pages) for %s", n,
+ space->chain.start->name));
+ space->release();
+}
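
The recovery throttle in buf_read_recv_pages() caps pending reads at half
the buffer pool, summed per chunk. A worked example with invented chunk
sizes:

#include <cstdint>
#include <iostream>

int main()
{
  const uint32_t chunk_sizes[] = {8192, 8192}; // pages per chunk, invented
  uint32_t limit = 0;
  for (uint32_t pages : chunk_sizes)
    limit += pages / 2;                        // half of each chunk
  std::cout << "stall recovery reads at " << limit << " pending\n"; // 8192
}
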
diff --git a/storage/innobase/bzip2.cmake b/storage/innobase/bzip2.cmake
new file mode 100644
index 00000000..91dd2bf0
--- /dev/null
+++ b/storage/innobase/bzip2.cmake
@@ -0,0 +1,36 @@
+# Copyright (C) 2014, SkySQL Ab. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+SET(WITH_INNODB_BZIP2 AUTO CACHE STRING
+ "Build with bzip2. Possible values are 'ON', 'OFF', 'AUTO' and default is 'AUTO'")
+
+MACRO (MYSQL_CHECK_BZIP2)
+ IF (WITH_INNODB_BZIP2 STREQUAL "ON" OR WITH_INNODB_BZIP2 STREQUAL "AUTO")
+ CHECK_INCLUDE_FILES(bzlib.h HAVE_BZLIB2_H)
+ CHECK_LIBRARY_EXISTS(bz2 BZ2_bzBuffToBuffCompress "" HAVE_BZLIB2_COMPRESS)
+ CHECK_LIBRARY_EXISTS(bz2 BZ2_bzBuffToBuffDecompress "" HAVE_BZLIB2_DECOMPRESS)
+
+ IF (HAVE_BZLIB2_COMPRESS AND HAVE_BZLIB2_DECOMPRESS AND HAVE_BZLIB2_H)
+ SET(HAVE_INNODB_BZLIB2 TRUE)
+ ADD_DEFINITIONS(-DHAVE_BZIP2=1)
+ LINK_LIBRARIES(bz2)
+ ELSE()
+ IF (WITH_INNODB_BZIP2 STREQUAL "ON")
+ MESSAGE(FATAL_ERROR "Required bzip2 library is not found")
+ ENDIF()
+ ENDIF()
+ ENDIF()
+ ADD_FEATURE_INFO(INNODB_BZIP2 HAVE_INNODB_BZLIB2
+ "BZIP2 compression in the InnoDB storage engine")
+ENDMACRO()
diff --git a/storage/innobase/compile-innodb b/storage/innobase/compile-innodb
new file mode 100755
index 00000000..47073d3c
--- /dev/null
+++ b/storage/innobase/compile-innodb
@@ -0,0 +1,25 @@
+#!/bin/sh
+#
+# Copyright (c) 2006, 2013, Oracle and/or its affiliates. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+#
+
+# we assume this script is in storage/innobase/
+
+MYSQL_ROOT="$(dirname "${0}")/../.."
+
+cd "${MYSQL_ROOT}"
+
+cmake . -DWITH_INNOBASE_STORAGE_ENGINE:BOOL=ON
+make -j$(nproc)
diff --git a/storage/innobase/data/data0data.cc b/storage/innobase/data/data0data.cc
new file mode 100644
index 00000000..14a0b3e1
--- /dev/null
+++ b/storage/innobase/data/data0data.cc
@@ -0,0 +1,854 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file data/data0data.cc
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "data0data.h"
+#include "rem0rec.h"
+#include "rem0cmp.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "dict0dict.h"
+#include "btr0cur.h"
+#include "row0upd.h"
+
+#ifdef UNIV_DEBUG
+/** Dummy variable to catch access to uninitialized fields. In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+ut_d(byte data_error);
+#endif /* UNIV_DEBUG */
+
+/** Trim the tail of an index tuple before insert or update.
+After instant ADD COLUMN, if the last fields of a clustered index tuple
+match the default values that were explicitly specified or implied during
+ADD COLUMN, there will be no need to store them.
+NOTE: A page latch in the index must be held, so that the index
+may not lose 'instantness' before the trimmed tuple has been
+inserted or updated.
+@param[in] index index possibly with instantly added columns */
+void dtuple_t::trim(const dict_index_t& index)
+{
+ ut_ad(n_fields >= index.n_core_fields);
+ ut_ad(n_fields <= index.n_fields);
+ ut_ad(index.is_instant());
+
+ ulint i = n_fields;
+ for (; i > index.n_core_fields; i--) {
+ const dfield_t* dfield = dtuple_get_nth_field(this, i - 1);
+ const dict_col_t* col = dict_index_get_nth_col(&index, i - 1);
+
+ if (col->is_dropped()) {
+ continue;
+ }
+
+ ut_ad(col->is_added());
+ ulint len = dfield_get_len(dfield);
+ if (len != col->def_val.len) {
+ break;
+ }
+
+ if (len != 0 && len != UNIV_SQL_NULL
+ && dfield->data != col->def_val.data
+ && memcmp(dfield->data, col->def_val.data, len)) {
+ break;
+ }
+ }
+
+ n_fields = i;
+}
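
A standalone model of the trimming loop: trailing instantly-added fields
whose values equal the column default need not be stored, and the scan
stops at the first mismatch or at the core-field boundary. The
vector-of-strings representation is invented for illustration and skips
the dropped-column case.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main()
{
  const std::size_t n_core = 2;                          // core index fields
  const std::vector<std::string> def = {"", "", "d3", "d4"}; // column defaults
  const std::vector<std::string> tup = {"a", "b", "x", "d4"};

  std::size_t n = tup.size();
  while (n > n_core && tup[n - 1] == def[n - 1])
    --n;                        // "d4" matches its default and is trimmed
  std::cout << "store " << n << " of " << tup.size() << " fields\n"; // 3 of 4
}
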
+
+/** Compare two data tuples.
+@param[in] tuple1 first data tuple
+@param[in] tuple2 second data tuple
+@return positive, 0, negative if tuple1 is greater, equal, less, than tuple2,
+respectively */
+int
+dtuple_coll_cmp(
+ const dtuple_t* tuple1,
+ const dtuple_t* tuple2)
+{
+ ulint n_fields;
+ ulint i;
+ int cmp;
+
+ ut_ad(tuple1 != NULL);
+ ut_ad(tuple2 != NULL);
+ ut_ad(tuple1->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(tuple2->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(dtuple_check_typed(tuple1));
+ ut_ad(dtuple_check_typed(tuple2));
+
+ n_fields = dtuple_get_n_fields(tuple1);
+
+ cmp = (int) n_fields - (int) dtuple_get_n_fields(tuple2);
+
+ for (i = 0; cmp == 0 && i < n_fields; i++) {
+ const dfield_t* field1 = dtuple_get_nth_field(tuple1, i);
+ const dfield_t* field2 = dtuple_get_nth_field(tuple2, i);
+ cmp = cmp_dfield_dfield(field1, field2);
+ }
+
+ return(cmp);
+}
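
The comparison order above, field count first and then field by field until
the first difference, can be mirrored with plain vectors:

#include <cstddef>
#include <iostream>
#include <vector>

static int tuple_cmp(const std::vector<int>& a, const std::vector<int>& b)
{
  int cmp = int(a.size()) - int(b.size());
  for (std::size_t i = 0; cmp == 0 && i < a.size(); i++)
    cmp = (a[i] > b[i]) - (a[i] < b[i]);
  return cmp;
}

int main()
{
  std::cout << tuple_cmp({1, 2}, {1, 3}) << ' '  // -1: second field differs
            << tuple_cmp({1}, {1, 0}) << '\n';   // -1: fewer fields
}
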
+
+/*********************************************************************//**
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you want later to set it smaller, you can use this. */
+void
+dtuple_set_n_fields(
+/*================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields) /*!< in: number of fields */
+{
+ tuple->n_fields = n_fields;
+ tuple->n_fields_cmp = n_fields;
+}
+
+/**********************************************************//**
+Checks that a data field is typed.
+@return TRUE if ok */
+static
+ibool
+dfield_check_typed_no_assert(
+/*=========================*/
+ const dfield_t* field) /*!< in: data field */
+{
+ if (dfield_get_type(field)->mtype > DATA_MTYPE_CURRENT_MAX
+ || dfield_get_type(field)->mtype < DATA_MTYPE_CURRENT_MIN) {
+
+ ib::error() << "Data field type "
+ << dfield_get_type(field)->mtype
+ << ", len " << dfield_get_len(field);
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************//**
+Checks that a data tuple is typed.
+@return TRUE if ok */
+static
+ibool
+dtuple_check_typed_no_assert(
+/*=========================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ const dfield_t* field;
+ ulint i;
+
+ if (dtuple_get_n_fields(tuple) > REC_MAX_N_FIELDS) {
+ ib::error() << "Index entry has "
+ << dtuple_get_n_fields(tuple) << " fields";
+dump:
+ fputs("InnoDB: Tuple contents: ", stderr);
+ dtuple_print(stderr, tuple);
+ putc('\n', stderr);
+
+ return(FALSE);
+ }
+
+ for (i = 0; i < dtuple_get_n_fields(tuple); i++) {
+
+ field = dtuple_get_nth_field(tuple, i);
+
+ if (!dfield_check_typed_no_assert(field)) {
+ goto dump;
+ }
+ }
+
+ return(TRUE);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Checks that a data field is typed. Asserts an error if not.
+@return TRUE if ok */
+ibool
+dfield_check_typed(
+/*===============*/
+ const dfield_t* field) /*!< in: data field */
+{
+ if (dfield_get_type(field)->mtype > DATA_MTYPE_CURRENT_MAX
+ || dfield_get_type(field)->mtype < DATA_MTYPE_CURRENT_MIN) {
+
+ ib::fatal() << "Data field type "
+ << dfield_get_type(field)->mtype
+ << ", len " << dfield_get_len(field);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************//**
+Checks that a data tuple is typed. Asserts an error if not.
+@return TRUE if ok */
+ibool
+dtuple_check_typed(
+/*===============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ const dfield_t* field;
+ ulint i;
+
+ for (i = 0; i < dtuple_get_n_fields(tuple); i++) {
+
+ field = dtuple_get_nth_field(tuple, i);
+
+ ut_a(dfield_check_typed(field));
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************//**
+Validates the consistency of a tuple which must be complete, i.e,
+all fields must have been set.
+@return TRUE if ok */
+ibool
+dtuple_validate(
+/*============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+#ifdef HAVE_valgrind
+ const ulint n_fields = dtuple_get_n_fields(tuple);
+
+ for (ulint i = 0; i < n_fields; i++) {
+ const dfield_t* field = dtuple_get_nth_field(tuple, i);
+
+ if (!dfield_is_null(field)) {
+ MEM_CHECK_DEFINED(dfield_get_data(field),
+ dfield_get_len(field));
+ }
+ }
+#endif /* HAVE_valgrind */
+ ut_ad(dtuple_check_typed(tuple));
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. */
+void
+dfield_print(
+/*=========*/
+ const dfield_t* dfield) /*!< in: dfield */
+{
+ const byte* data;
+ ulint len;
+ ulint i;
+
+ len = dfield_get_len(dfield);
+ data = static_cast<const byte*>(dfield_get_data(dfield));
+
+ if (dfield_is_null(dfield)) {
+ fputs("NULL", stderr);
+
+ return;
+ }
+
+ switch (dtype_get_mtype(dfield_get_type(dfield))) {
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ for (i = 0; i < len; i++) {
+ int c = *data++;
+ putc(isprint(c) ? c : ' ', stderr);
+ }
+
+ if (dfield_is_ext(dfield)) {
+ fputs("(external)", stderr);
+ }
+ break;
+ case DATA_INT:
+ ut_a(len == 4); /* only works for 32-bit integers */
+ fprintf(stderr, "%d", (int) mach_read_from_4(data));
+ break;
+ default:
+ ut_error;
+ }
+}
+
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. The hex string
+is also printed if the string contains non-printable characters. */
+void
+dfield_print_also_hex(
+/*==================*/
+ const dfield_t* dfield) /*!< in: dfield */
+{
+ const byte* data;
+ ulint len;
+ ulint prtype;
+ ulint i;
+ ibool print_also_hex;
+
+ len = dfield_get_len(dfield);
+ data = static_cast<const byte*>(dfield_get_data(dfield));
+
+ if (dfield_is_null(dfield)) {
+ fputs("NULL", stderr);
+
+ return;
+ }
+
+ prtype = dtype_get_prtype(dfield_get_type(dfield));
+
+ switch (dtype_get_mtype(dfield_get_type(dfield))) {
+ ib_id_t id;
+ case DATA_INT:
+ switch (len) {
+ ulint val;
+ case 1:
+ val = mach_read_from_1(data);
+
+ if (!(prtype & DATA_UNSIGNED)) {
+ val &= ~0x80U;
+ fprintf(stderr, "%ld", (long) val);
+ } else {
+ fprintf(stderr, "%lu", (ulong) val);
+ }
+ break;
+
+ case 2:
+ val = mach_read_from_2(data);
+
+ if (!(prtype & DATA_UNSIGNED)) {
+ val &= ~0x8000U;
+ fprintf(stderr, "%ld", (long) val);
+ } else {
+ fprintf(stderr, "%lu", (ulong) val);
+ }
+ break;
+
+ case 3:
+ val = mach_read_from_3(data);
+
+ if (!(prtype & DATA_UNSIGNED)) {
+ val &= ~0x800000U;
+ fprintf(stderr, "%ld", (long) val);
+ } else {
+ fprintf(stderr, "%lu", (ulong) val);
+ }
+ break;
+
+ case 4:
+ val = mach_read_from_4(data);
+
+ if (!(prtype & DATA_UNSIGNED)) {
+ val &= ~0x80000000;
+ fprintf(stderr, "%ld", (long) val);
+ } else {
+ fprintf(stderr, "%lu", (ulong) val);
+ }
+ break;
+
+ case 6:
+ id = mach_read_from_6(data);
+ fprintf(stderr, IB_ID_FMT, id);
+ break;
+
+ case 7:
+ id = mach_read_from_7(data);
+ fprintf(stderr, IB_ID_FMT, id);
+ break;
+ case 8:
+ id = mach_read_from_8(data);
+ fprintf(stderr, IB_ID_FMT, id);
+ break;
+ default:
+ goto print_hex;
+ }
+ break;
+
+ case DATA_SYS:
+ switch (prtype & DATA_SYS_PRTYPE_MASK) {
+ case DATA_TRX_ID:
+ id = mach_read_from_6(data);
+
+ fprintf(stderr, "trx_id " TRX_ID_FMT, id);
+ break;
+
+ case DATA_ROLL_PTR:
+ id = mach_read_from_7(data);
+
+ fprintf(stderr, "roll_ptr " TRX_ID_FMT, id);
+ break;
+
+ case DATA_ROW_ID:
+ id = mach_read_from_6(data);
+
+ fprintf(stderr, "row_id " TRX_ID_FMT, id);
+ break;
+
+ default:
+ goto print_hex;
+ }
+ break;
+
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ print_also_hex = FALSE;
+
+ for (i = 0; i < len; i++) {
+ int c = *data++;
+
+ if (!isprint(c)) {
+ print_also_hex = TRUE;
+
+ fprintf(stderr, "\\x%02x", (unsigned char) c);
+ } else {
+ putc(c, stderr);
+ }
+ }
+
+ if (dfield_is_ext(dfield)) {
+ fputs("(external)", stderr);
+ }
+
+ if (!print_also_hex) {
+ break;
+ }
+
+ data = static_cast<const byte*>(dfield_get_data(dfield));
+ /* fall through */
+
+ case DATA_BINARY:
+ default:
+print_hex:
+		fputs(" Hex: ", stderr);
+
+ for (i = 0; i < len; i++) {
+ fprintf(stderr, "%02x", *data++);
+ }
+
+ if (dfield_is_ext(dfield)) {
+ fputs("(external)", stderr);
+ }
+ }
+}
+
+/*************************************************************//**
+Print a dfield value using ut_print_buf. */
+static
+void
+dfield_print_raw(
+/*=============*/
+ FILE* f, /*!< in: output stream */
+ const dfield_t* dfield) /*!< in: dfield */
+{
+ ulint len = dfield_get_len(dfield);
+ if (!dfield_is_null(dfield)) {
+ ulint print_len = ut_min(len, static_cast<ulint>(1000));
+ ut_print_buf(f, dfield_get_data(dfield), print_len);
+ if (len != print_len) {
+ fprintf(f, "(total %lu bytes%s)",
+ (ulong) len,
+ dfield_is_ext(dfield) ? ", external" : "");
+ }
+ } else {
+ fputs(" SQL NULL", f);
+ }
+}
+
+/**********************************************************//**
+The following function prints the contents of a tuple. */
+void
+dtuple_print(
+/*=========*/
+ FILE* f, /*!< in: output stream */
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ulint n_fields;
+ ulint i;
+
+ n_fields = dtuple_get_n_fields(tuple);
+
+ fprintf(f, "DATA TUPLE: %lu fields;\n", (ulong) n_fields);
+
+ for (i = 0; i < n_fields; i++) {
+ fprintf(f, " %lu:", (ulong) i);
+
+ dfield_print_raw(f, dtuple_get_nth_field(tuple, i));
+
+ putc(';', f);
+ putc('\n', f);
+ }
+
+ ut_ad(dtuple_validate(tuple));
+}
+
+/** Print the contents of a tuple.
+@param[out] o output stream
+@param[in] field array of data fields
+@param[in] n number of data fields */
+void
+dfield_print(
+ std::ostream& o,
+ const dfield_t* field,
+ ulint n)
+{
+ for (ulint i = 0; i < n; i++, field++) {
+ const void* data = dfield_get_data(field);
+ const ulint len = dfield_get_len(field);
+
+ if (i) {
+ o << ',';
+ }
+
+ if (dfield_is_null(field)) {
+ o << "NULL";
+ } else if (dfield_is_ext(field)) {
+ ulint local_len = len - BTR_EXTERN_FIELD_REF_SIZE;
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ o << '['
+ << local_len
+ << '+' << BTR_EXTERN_FIELD_REF_SIZE << ']';
+ ut_print_buf(o, data, local_len);
+ ut_print_buf_hex(o, static_cast<const byte*>(data)
+ + local_len,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ } else {
+ o << '[' << len << ']';
+ ut_print_buf(o, data, len);
+ }
+ }
+}
+
+/** Print the contents of a tuple.
+@param[out] o output stream
+@param[in] tuple data tuple */
+void
+dtuple_print(
+ std::ostream& o,
+ const dtuple_t* tuple)
+{
+ const ulint n = dtuple_get_n_fields(tuple);
+
+ o << "TUPLE (info_bits=" << dtuple_get_info_bits(tuple)
+ << ", " << n << " fields): {";
+
+ dfield_print(o, tuple->fields, n);
+
+ o << "}";
+}
+
+/**************************************************************//**
+Moves parts of long fields in entry to the big record vector so that
+the size of tuple drops below the maximum record size allowed in the
+database. Moves data only from those fields which are not necessary
+to determine uniquely the insertion place of the tuple in the index.
+@return own: created big record vector, NULL if we are not able to
+shorten the entry enough, i.e., if there are too many fixed-length or
+short fields in entry or the index is not clustered */
+big_rec_t*
+dtuple_convert_big_rec(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ upd_t* upd, /*!< in/out: update vector */
+ dtuple_t* entry, /*!< in/out: index entry */
+ ulint* n_ext) /*!< in/out: number of
+ externally stored columns */
+{
+ mem_heap_t* heap;
+ big_rec_t* vector;
+ dfield_t* dfield;
+ ulint size;
+ ulint n_fields;
+ ulint local_prefix_len;
+
+ if (!dict_index_is_clust(index)) {
+ return(NULL);
+ }
+
+ if (!index->table->space) {
+ return NULL;
+ }
+
+ ulint local_len = index->table->get_overflow_field_local_len();
+ const auto zip_size = index->table->space->zip_size();
+
+ ut_ad(index->n_uniq > 0);
+
+ ut_a(dtuple_check_typed_no_assert(entry));
+
+ size = rec_get_converted_size(index, entry, *n_ext);
+
+ if (UNIV_UNLIKELY(size > 1000000000)) {
+ ib::warn() << "Tuple size is very big: " << size;
+ fputs("InnoDB: Tuple contents: ", stderr);
+ dtuple_print(stderr, entry);
+ putc('\n', stderr);
+ }
+
+ heap = mem_heap_create(size + dtuple_get_n_fields(entry)
+ * sizeof(big_rec_field_t) + 1000);
+
+ vector = big_rec_t::alloc(heap, dtuple_get_n_fields(entry));
+
+ /* Decide which fields to shorten: the algorithm is to look for
+ a variable-length field that yields the biggest savings when
+ stored externally */
+
+ n_fields = 0;
+ uint16_t longest_i;
+ ulint longest;
+
+ const bool mblob = entry->is_alter_metadata();
+ ut_ad(entry->n_fields - mblob >= index->first_user_field());
+ ut_ad(entry->n_fields - mblob <= index->n_fields);
+
+ if (mblob) {
+ longest_i = index->first_user_field();
+ dfield = dtuple_get_nth_field(entry, longest_i);
+ local_len = BTR_EXTERN_FIELD_REF_SIZE;
+ ut_ad(!dfield_is_ext(dfield));
+ goto ext_write;
+ }
+
+ if (!dict_table_has_atomic_blobs(index->table)) {
+ /* up to MySQL 5.1: store a 768-byte prefix locally */
+ local_len = BTR_EXTERN_FIELD_REF_SIZE
+ + DICT_ANTELOPE_MAX_INDEX_COL_LEN;
+ } else {
+ /* new-format table: do not store any BLOB prefix locally */
+ local_len = BTR_EXTERN_FIELD_REF_SIZE;
+ }
+
+ while (page_zip_rec_needs_ext(rec_get_converted_size(index, entry,
+ *n_ext),
+ index->table->not_redundant(),
+ dict_index_get_n_fields(index),
+ zip_size)) {
+ longest_i = 0;
+ longest = 0;
+ for (uint16_t i = index->first_user_field();
+ i < entry->n_fields - mblob; i++) {
+ ulint savings;
+ dfield = dtuple_get_nth_field(entry, i + mblob);
+
+ const dict_field_t* ifield = dict_index_get_nth_field(
+ index, i);
+
+ /* Skip fixed-length, NULL, externally stored,
+ or short columns */
+
+ if (ifield->fixed_len
+ || dfield_is_null(dfield)
+ || dfield_is_ext(dfield)
+ || dfield_get_len(dfield) <= local_len
+ || dfield_get_len(dfield)
+ <= BTR_EXTERN_LOCAL_STORED_MAX_SIZE) {
+ goto skip_field;
+ }
+
+ savings = dfield_get_len(dfield) - local_len;
+
+ /* Check that there would be savings */
+ if (longest >= savings) {
+ goto skip_field;
+ }
+
+ /* In DYNAMIC and COMPRESSED format, store
+ locally any non-BLOB columns whose maximum
+ length does not exceed 256 bytes. This is
+ because there is no room for the "external
+ storage" flag when the maximum length is 255
+ bytes or less. This restriction trivially
+ holds in REDUNDANT and COMPACT format, because
+ there we always store locally columns whose
+ length is up to local_len == 788 bytes.
+ @see rec_init_offsets_comp_ordinary */
+ if (!DATA_BIG_COL(ifield->col)) {
+ goto skip_field;
+ }
+
+ longest_i = uint16_t(i + mblob);
+ longest = savings;
+
+skip_field:
+ continue;
+ }
+
+ if (!longest_i) {
+ /* Cannot shorten more */
+
+ mem_heap_free(heap);
+
+ return(NULL);
+ }
+
+ /* Move data from field longest_i to big rec vector.
+
+ We store the first bytes locally to the record. Then
+ we can calculate all ordering fields in all indexes
+ from locally stored data. */
+ dfield = dtuple_get_nth_field(entry, longest_i);
+ext_write:
+ local_prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ vector->append(
+ big_rec_field_t(
+ longest_i,
+ dfield_get_len(dfield) - local_prefix_len,
+ static_cast<char*>(dfield_get_data(dfield))
+ + local_prefix_len));
+
+ /* Allocate the locally stored part of the column. */
+ byte* data = static_cast<byte*>(
+ mem_heap_alloc(heap, local_len));
+
+ /* Copy the local prefix. */
+ memcpy(data, dfield_get_data(dfield), local_prefix_len);
+ /* Clear the extern field reference (BLOB pointer). */
+ memset(data + local_prefix_len, 0, BTR_EXTERN_FIELD_REF_SIZE);
+
+ dfield_set_data(dfield, data, local_len);
+ dfield_set_ext(dfield);
+
+ n_fields++;
+ (*n_ext)++;
+ ut_ad(n_fields < dtuple_get_n_fields(entry));
+
+ if (upd && !upd->is_modified(longest_i)) {
+
+ DEBUG_SYNC_C("ib_mv_nonupdated_column_offpage");
+
+ upd_field_t upd_field;
+ upd_field.field_no = longest_i;
+ upd_field.orig_len = 0;
+ upd_field.exp = NULL;
+ upd_field.old_v_val = NULL;
+ dfield_copy(&upd_field.new_val,
+ dfield->clone(upd->heap));
+ upd->append(upd_field);
+ ut_ad(upd->is_modified(longest_i));
+
+ ut_ad(upd_field.new_val.len
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_ad(upd_field.new_val.len == local_len);
+ ut_ad(upd_field.new_val.len == dfield_get_len(dfield));
+ }
+ }
+
+ ut_ad(n_fields == vector->n_fields);
+
+ return(vector);
+}
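
The greedy choice in the loop above picks, per iteration, the variable-length
column with the biggest savings, i.e. its length minus the local prefix that
stays in the record. A worked example assuming the atomic-blob formats, where
only the 20-byte field reference (BTR_EXTERN_FIELD_REF_SIZE) remains local:

#include <cstdint>
#include <iostream>

int main()
{
  const uint32_t local_len = 20;     // BTR_EXTERN_FIELD_REF_SIZE, no prefix
  const uint32_t col_len[] = {300, 9000, 4000};

  uint32_t best = 0, best_savings = 0;
  for (uint32_t i = 0; i < 3; i++) {
    if (col_len[i] <= local_len)
      continue;                      // too short to move out of the record
    const uint32_t savings = col_len[i] - local_len;
    if (savings > best_savings) { best = i; best_savings = savings; }
  }
  std::cout << "externalize column " << best << ", saving "
            << best_savings << " bytes\n";   // column 1, 8980 bytes
}
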
+
+/**************************************************************//**
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+ dict_index_t* index MY_ATTRIBUTE((unused)), /*!< in: index */
+ dtuple_t* entry, /*!< in/out: entry whose data was put to vector */
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+{
+ big_rec_field_t* b = vector->fields;
+ const big_rec_field_t* const end = b + vector->n_fields;
+
+ for (; b < end; b++) {
+ dfield_t* dfield;
+ ulint local_len;
+
+ dfield = dtuple_get_nth_field(entry, b->field_no);
+ local_len = dfield_get_len(dfield);
+
+ ut_ad(dfield_is_ext(dfield));
+ ut_ad(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Only in REDUNDANT and COMPACT format, we store
+ up to DICT_ANTELOPE_MAX_INDEX_COL_LEN (768) bytes
+ locally */
+ ut_ad(local_len <= DICT_ANTELOPE_MAX_INDEX_COL_LEN);
+
+ dfield_set_data(dfield,
+ (char*) b->data - local_len,
+ b->len + local_len);
+ }
+
+ mem_heap_free(vector->heap);
+}
+
+/** Allocate a big_rec_t object in the given memory heap, capable of
+storing up to n_fld fields.
+@param[in] heap memory heap in which this object is allocated
+@param[in] n_fld maximum number of fields that can be stored in
+ this object
+
+@return the allocated object */
+big_rec_t*
+big_rec_t::alloc(
+ mem_heap_t* heap,
+ ulint n_fld)
+{
+ big_rec_t* rec = static_cast<big_rec_t*>(
+ mem_heap_alloc(heap, sizeof(big_rec_t)));
+
+ new(rec) big_rec_t(n_fld);
+
+ rec->heap = heap;
+ rec->fields = static_cast<big_rec_field_t*>(
+ mem_heap_alloc(heap,
+ n_fld * sizeof(big_rec_field_t)));
+
+ rec->n_fields = 0;
+ return(rec);
+}
+
+/** Create a deep copy of this object.
+@param[in,out] heap memory heap in which the clone will be created
+@return the cloned object */
+dfield_t*
+dfield_t::clone(mem_heap_t* heap) const
+{
+ const ulint size = len == UNIV_SQL_NULL ? 0 : len;
+ dfield_t* obj = static_cast<dfield_t*>(
+ mem_heap_alloc(heap, sizeof(dfield_t) + size));
+
+ ut_ad(len != UNIV_SQL_DEFAULT);
+ obj->ext = ext;
+ obj->len = len;
+ obj->type = type;
+ obj->spatial_status = spatial_status;
+
+ if (len != UNIV_SQL_NULL) {
+ obj->data = obj + 1;
+ memcpy(obj->data, data, len);
+ } else {
+ obj->data = 0;
+ }
+
+ return(obj);
+}
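
clone() places the header and the payload in one allocation, with data
pointing just past the header (obj + 1). A self-contained sketch of the
same layout, using an invented Field struct rather than the real dfield_t:

#include <cassert>
#include <cstdlib>
#include <cstring>

struct Field { void* data; unsigned len; };

static Field* clone_field(const void* src, unsigned len)
{
  Field* obj = static_cast<Field*>(std::malloc(sizeof(Field) + len));
  obj->len = len;
  obj->data = obj + 1;               // payload lives right after the header
  std::memcpy(obj->data, src, len);
  return obj;
}

int main()
{
  Field* f = clone_field("abc", 3);
  assert(std::memcmp(f->data, "abc", 3) == 0);
  std::free(f);
  return 0;
}
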
diff --git a/storage/innobase/data/data0type.cc b/storage/innobase/data/data0type.cc
new file mode 100644
index 00000000..7de4cc02
--- /dev/null
+++ b/storage/innobase/data/data0type.cc
@@ -0,0 +1,212 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file data/data0type.cc
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0mem.h"
+#include "my_sys.h"
+
+/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */
+const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] = {
+ 0, 0, 0, 0, 0, 0,
+ 0x80, 0, 0, 0, 0, 0, 0
+};
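
The lone 0x80 above is the high bit of the 7-byte DB_ROLL_PTR: the "insert"
flag, marking a record for which no undo history exists. Checking it is a
single bit test:

#include <cstdint>
#include <iostream>

int main()
{
  const uint8_t roll_ptr[7] = {0x80, 0, 0, 0, 0, 0, 0}; // as in reset_trx_id
  std::cout << "insert flag: " << bool(roll_ptr[0] & 0x80) << '\n'; // 1
}
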
+
+/* At the database startup we store the default-charset collation number of
+this MySQL installation in this global variable. If we have < 4.1.2 format
+column definitions, or records in the insert buffer, we use this
+charset-collation code for them. */
+
+ulint data_mysql_default_charset_coll;
+
+/*********************************************************************//**
+Determine how many bytes the first n characters of the given string occupy.
+If the string is shorter than n characters, returns the number of bytes
+the characters in the string occupy.
+@return length of the prefix, in bytes */
+ulint
+dtype_get_at_most_n_mbchars(
+/*========================*/
+ ulint prtype, /*!< in: precise type */
+ ulint mbminlen, /*!< in: minimum length of
+ a multi-byte character, in bytes */
+ ulint mbmaxlen, /*!< in: maximum length of
+ a multi-byte character, in bytes */
+ ulint prefix_len, /*!< in: length of the requested
+ prefix, in characters, multiplied by
+ dtype_get_mbmaxlen(dtype) */
+ ulint data_len, /*!< in: length of str (in bytes) */
+ const char* str) /*!< in: the string whose prefix
+ length is being determined */
+{
+ ut_a(len_is_stored(data_len));
+ ut_ad(!mbmaxlen || !(prefix_len % mbmaxlen));
+
+ if (mbminlen != mbmaxlen) {
+ ut_a(!(prefix_len % mbmaxlen));
+ return(innobase_get_at_most_n_mbchars(
+ dtype_get_charset_coll(prtype),
+ prefix_len, data_len, str));
+ }
+
+ if (prefix_len < data_len) {
+
+ return(prefix_len);
+
+ }
+
+ return(data_len);
+}
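
For fixed-width charsets (mbminlen == mbmaxlen) the function above avoids
the charset walk entirely: the n-character prefix is simply
min(prefix_len, data_len) bytes. Worked numbers, all illustrative:

#include <algorithm>
#include <iostream>

int main()
{
  const unsigned mbmaxlen = 2;                // fixed 2-byte charset
  const unsigned prefix_len = 10 * mbmaxlen;  // 10 characters = 20 bytes
  const unsigned data_len = 14;               // the string is only 7 chars
  std::cout << std::min(prefix_len, data_len) << " bytes\n";  // 14
}
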
+
+/*********************************************************************//**
+Validates a data type structure.
+@return TRUE if ok */
+ibool
+dtype_validate(
+/*===========*/
+ const dtype_t* type) /*!< in: type struct to validate */
+{
+ ut_a(type);
+ ut_a(type->mtype >= DATA_VARCHAR);
+ ut_a(type->mtype <= DATA_MTYPE_MAX);
+
+ if (type->mtype == DATA_SYS) {
+ ut_a((type->prtype & DATA_MYSQL_TYPE_MASK) < DATA_N_SYS_COLS);
+ }
+
+ ut_a(dtype_get_mbminlen(type) <= dtype_get_mbmaxlen(type));
+
+ return(TRUE);
+}
+
+#ifdef UNIV_DEBUG
+/** Print a data type structure.
+@param[in] type data type */
+void
+dtype_print(const dtype_t* type)
+{
+ ulint mtype;
+ ulint prtype;
+ ulint len;
+
+ ut_a(type);
+
+ mtype = type->mtype;
+ prtype = type->prtype;
+
+ switch (mtype) {
+ case DATA_VARCHAR:
+ fputs("DATA_VARCHAR", stderr);
+ break;
+
+ case DATA_CHAR:
+ fputs("DATA_CHAR", stderr);
+ break;
+
+ case DATA_BINARY:
+ fputs("DATA_BINARY", stderr);
+ break;
+
+ case DATA_FIXBINARY:
+ fputs("DATA_FIXBINARY", stderr);
+ break;
+
+ case DATA_BLOB:
+ fputs("DATA_BLOB", stderr);
+ break;
+
+ case DATA_GEOMETRY:
+ fputs("DATA_GEOMETRY", stderr);
+ break;
+
+ case DATA_INT:
+ fputs("DATA_INT", stderr);
+ break;
+
+ case DATA_MYSQL:
+ fputs("DATA_MYSQL", stderr);
+ break;
+
+ case DATA_SYS:
+ fputs("DATA_SYS", stderr);
+ break;
+
+ case DATA_FLOAT:
+ fputs("DATA_FLOAT", stderr);
+ break;
+
+ case DATA_DOUBLE:
+ fputs("DATA_DOUBLE", stderr);
+ break;
+
+ case DATA_DECIMAL:
+ fputs("DATA_DECIMAL", stderr);
+ break;
+
+ case DATA_VARMYSQL:
+ fputs("DATA_VARMYSQL", stderr);
+ break;
+
+ default:
+ fprintf(stderr, "type %lu", (ulong) mtype);
+ break;
+ }
+
+ len = type->len;
+
+ if ((type->mtype == DATA_SYS)
+ || (type->mtype == DATA_VARCHAR)
+ || (type->mtype == DATA_CHAR)) {
+ putc(' ', stderr);
+ if (prtype == DATA_ROW_ID) {
+ fputs("DATA_ROW_ID", stderr);
+ len = DATA_ROW_ID_LEN;
+ } else if (prtype == DATA_ROLL_PTR) {
+ fputs("DATA_ROLL_PTR", stderr);
+ len = DATA_ROLL_PTR_LEN;
+ } else if (prtype == DATA_TRX_ID) {
+ fputs("DATA_TRX_ID", stderr);
+ len = DATA_TRX_ID_LEN;
+ } else if (prtype == DATA_ENGLISH) {
+ fputs("DATA_ENGLISH", stderr);
+ } else {
+ fprintf(stderr, "prtype %lu", (ulong) prtype);
+ }
+ } else {
+ if (prtype & DATA_UNSIGNED) {
+ fputs(" DATA_UNSIGNED", stderr);
+ }
+
+ if (prtype & DATA_BINARY_TYPE) {
+ fputs(" DATA_BINARY_TYPE", stderr);
+ }
+
+ if (prtype & DATA_NOT_NULL) {
+ fputs(" DATA_NOT_NULL", stderr);
+ }
+ }
+
+ fprintf(stderr, " len %lu", (ulong) len);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc
new file mode 100644
index 00000000..bd2cf4ff
--- /dev/null
+++ b/storage/innobase/dict/dict0boot.cc
@@ -0,0 +1,492 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0boot.cc
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "btr0btr.h"
+#include "dict0load.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+#include "ibuf0ibuf.h"
+#include "buf0flu.h"
+#include "log0recv.h"
+#include "os0file.h"
+
+/** @return the DICT_HDR block, x-latched */
+buf_block_t *dict_hdr_get(mtr_t* mtr)
+{
+ buf_block_t *block= buf_page_get(page_id_t(DICT_HDR_SPACE, DICT_HDR_PAGE_NO),
+ 0, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_DICT_HEADER);
+ return block;
+}
+
+/**********************************************************************//**
+Returns a new table, index, or space id. */
+void
+dict_hdr_get_new_id(
+/*================*/
+ table_id_t* table_id, /*!< out: table id
+ (not assigned if NULL) */
+ index_id_t* index_id, /*!< out: index id
+ (not assigned if NULL) */
+ ulint* space_id) /*!< out: space id
+ (not assigned if NULL) */
+{
+ ib_id_t id;
+ mtr_t mtr;
+
+ mtr.start();
+ buf_block_t* dict_hdr = dict_hdr_get(&mtr);
+
+ if (table_id) {
+ id = mach_read_from_8(DICT_HDR + DICT_HDR_TABLE_ID
+ + dict_hdr->frame);
+ id++;
+ mtr.write<8>(*dict_hdr, DICT_HDR + DICT_HDR_TABLE_ID
+ + dict_hdr->frame, id);
+ *table_id = id;
+ }
+
+ if (index_id) {
+ id = mach_read_from_8(DICT_HDR + DICT_HDR_INDEX_ID
+ + dict_hdr->frame);
+ id++;
+ mtr.write<8>(*dict_hdr, DICT_HDR + DICT_HDR_INDEX_ID
+ + dict_hdr->frame, id);
+ *index_id = id;
+ }
+
+ if (space_id) {
+ *space_id = mach_read_from_4(DICT_HDR + DICT_HDR_MAX_SPACE_ID
+ + dict_hdr->frame);
+ if (fil_assign_new_space_id(space_id)) {
+ mtr.write<4>(*dict_hdr,
+ DICT_HDR + DICT_HDR_MAX_SPACE_ID
+ + dict_hdr->frame, *space_id);
+ }
+ }
+
+ mtr.commit();
+}
+
+/**********************************************************************//**
+Writes the current value of the row id counter to the dictionary header file
+page. */
+void
+dict_hdr_flush_row_id(void)
+/*=======================*/
+{
+ row_id_t id;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ id = dict_sys.row_id;
+
+ mtr.start();
+
+ buf_block_t* d = dict_hdr_get(&mtr);
+
+ mtr.write<8>(*d, DICT_HDR + DICT_HDR_ROW_ID + d->frame, id);
+
+ mtr.commit();
+}
+
+/*****************************************************************//**
+Creates the file page for the dictionary header. This function is
+called only at database creation.
+@return TRUE if successful */
+static
+ibool
+dict_hdr_create(
+/*============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ ulint root_page_no;
+
+ ut_ad(mtr);
+ compile_time_assert(DICT_HDR_SPACE == 0);
+
+	/* Create the dictionary header file block in a newly allocated
+	file segment in the system tablespace */
+ block = fseg_create(fil_system.sys_space,
+ DICT_HDR + DICT_HDR_FSEG_HEADER, mtr);
+
+ ut_a(block->page.id() == page_id_t(DICT_HDR_SPACE, DICT_HDR_PAGE_NO));
+
+ buf_block_t* d = dict_hdr_get(mtr);
+
+ /* Start counting row, table, index, and tree ids from
+ DICT_HDR_FIRST_ID */
+ mtr->write<8>(*d, DICT_HDR + DICT_HDR_ROW_ID + d->frame,
+ DICT_HDR_FIRST_ID);
+ mtr->write<8>(*d, DICT_HDR + DICT_HDR_TABLE_ID + d->frame,
+ DICT_HDR_FIRST_ID);
+ mtr->write<8>(*d, DICT_HDR + DICT_HDR_INDEX_ID + d->frame,
+ DICT_HDR_FIRST_ID);
+
+ ut_ad(!mach_read_from_4(DICT_HDR + DICT_HDR_MAX_SPACE_ID + d->frame));
+
+ /* Obsolete, but we must initialize it anyway. */
+ mtr->write<4>(*d, DICT_HDR + DICT_HDR_MIX_ID_LOW + d->frame,
+ DICT_HDR_FIRST_ID);
+
+ /* Create the B-tree roots for the clustered indexes of the basic
+ system tables */
+
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+ fil_system.sys_space, DICT_TABLES_ID,
+ nullptr, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mtr->write<4>(*d, DICT_HDR + DICT_HDR_TABLES + d->frame, root_page_no);
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_UNIQUE,
+ fil_system.sys_space, DICT_TABLE_IDS_ID,
+ nullptr, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mtr->write<4>(*d, DICT_HDR + DICT_HDR_TABLE_IDS + d->frame,
+ root_page_no);
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+ fil_system.sys_space, DICT_COLUMNS_ID,
+ nullptr, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mtr->write<4>(*d, DICT_HDR + DICT_HDR_COLUMNS + d->frame,
+ root_page_no);
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+ fil_system.sys_space, DICT_INDEXES_ID,
+ nullptr, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mtr->write<4>(*d, DICT_HDR + DICT_HDR_INDEXES + d->frame,
+ root_page_no);
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+ fil_system.sys_space, DICT_FIELDS_ID,
+ nullptr, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mtr->write<4>(*d, DICT_HDR + DICT_HDR_FIELDS + d->frame, root_page_no);
+ /*--------------------------*/
+
+ return(TRUE);
+}
+
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created.
+@return DB_SUCCESS or error code. */
+dberr_t
+dict_boot(void)
+/*===========*/
+{
+ dict_table_t* table;
+ dict_index_t* index;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ /* Be sure these constants do not ever change. To avoid bloat,
+ only check the *NUM_FIELDS* in each table */
+
+ ut_ad(DICT_NUM_COLS__SYS_TABLES == 8);
+ ut_ad(DICT_NUM_FIELDS__SYS_TABLES == 10);
+ ut_ad(DICT_NUM_FIELDS__SYS_TABLE_IDS == 2);
+ ut_ad(DICT_NUM_COLS__SYS_COLUMNS == 7);
+ ut_ad(DICT_NUM_FIELDS__SYS_COLUMNS == 9);
+ ut_ad(DICT_NUM_COLS__SYS_INDEXES == 8);
+ ut_ad(DICT_NUM_FIELDS__SYS_INDEXES == 10);
+ ut_ad(DICT_NUM_COLS__SYS_FIELDS == 3);
+ ut_ad(DICT_NUM_FIELDS__SYS_FIELDS == 5);
+ ut_ad(DICT_NUM_COLS__SYS_FOREIGN == 4);
+ ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN == 6);
+ ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME == 2);
+ ut_ad(DICT_NUM_COLS__SYS_FOREIGN_COLS == 4);
+ ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_COLS == 6);
+
+ mtr_start(&mtr);
+
+ /* Create the hash tables etc. */
+ dict_sys.create();
+
+ heap = mem_heap_create(450);
+
+ mutex_enter(&dict_sys.mutex);
+
+ /* Get the dictionary header */
+ const byte* dict_hdr = &dict_hdr_get(&mtr)->frame[DICT_HDR];
+
+	/* Because we only write a new row id to the disk-based data
+	structure (the dictionary header) when it is divisible by
+	DICT_HDR_ROW_ID_WRITE_MARGIN, recovery will not restore the
+	latest value of the row id counter. Therefore we advance the
+	counter at database startup to avoid overlapping values.
+	Note that when a user asks for a new row id for the first
+	time after startup, the counter is divisible by ..._MARGIN,
+	so it will immediately be written to the disk-based header. */
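+	/* For illustration (assuming DICT_HDR_ROW_ID_WRITE_MARGIN were
+	256): if the stored counter is 1000, ut_uint64_align_up() yields
+	1024, so the in-memory counter resumes at 1280, safely above any
+	row id that was assigned but not yet flushed before a crash. */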
+
+ dict_sys.row_id = DICT_HDR_ROW_ID_WRITE_MARGIN
+ + ut_uint64_align_up(mach_read_from_8(dict_hdr + DICT_HDR_ROW_ID),
+ DICT_HDR_ROW_ID_WRITE_MARGIN);
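+	/* The stored value is the maximum assigned space id. Passing
+	max_space_id - 1 to fil_assign_new_space_id(), which hands back
+	the next id after the one given, restores that maximum as the
+	high-water mark for future assignments (a sketch of the intent;
+	the actual bookkeeping lives in fil_assign_new_space_id()). */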
+ if (ulint max_space_id = mach_read_from_4(dict_hdr
+ + DICT_HDR_MAX_SPACE_ID)) {
+ max_space_id--;
+ fil_assign_new_space_id(&max_space_id);
+ }
+
+ /* Insert into the dictionary cache the descriptions of the basic
+ system tables */
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_TABLES", fil_system.sys_space,
+ 8, 0, 0, 0);
+
+ dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0,
+ MAX_FULL_NAME_LEN);
+ dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 8);
+ /* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */
+ dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4);
+ /* The low order bit of TYPE is always set to 1. If ROW_FORMAT
+ is not REDUNDANT or COMPACT, this field matches table->flags. */
+ dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0);
+ /* MIX_LEN may contain additional table flags when
+ ROW_FORMAT!=REDUNDANT. */
+ dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
+
+ table->id = DICT_TABLES_ID;
+
+ dict_table_add_system_columns(table, heap);
+ table->add_to_cache();
+ dict_sys.sys_tables = table;
+ mem_heap_empty(heap);
+
+ index = dict_mem_index_create(table, "CLUST_IND",
+ DICT_UNIQUE | DICT_CLUSTERED, 1);
+
+ dict_mem_index_add_field(index, "NAME", 0);
+
+ index->id = DICT_TABLES_ID;
+ dberr_t error = dict_index_add_to_cache(
+ index, mach_read_from_4(dict_hdr + DICT_HDR_TABLES));
+ ut_a(error == DB_SUCCESS);
+ ut_ad(!table->is_instant());
+ table->indexes.start->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable)));
+
+ /*-------------------------*/
+ index = dict_mem_index_create(table, "ID_IND", DICT_UNIQUE, 1);
+ dict_mem_index_add_field(index, "ID", 0);
+
+ index->id = DICT_TABLE_IDS_ID;
+ error = dict_index_add_to_cache(
+ index, mach_read_from_4(dict_hdr + DICT_HDR_TABLE_IDS));
+ ut_a(error == DB_SUCCESS);
+
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_COLUMNS", fil_system.sys_space,
+ 7, 0, 0, 0);
+
+ dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 8);
+ dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "MTYPE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "PRTYPE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "LEN", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "PREC", DATA_INT, 0, 4);
+
+ table->id = DICT_COLUMNS_ID;
+
+ dict_table_add_system_columns(table, heap);
+ table->add_to_cache();
+ dict_sys.sys_columns = table;
+ mem_heap_empty(heap);
+
+ index = dict_mem_index_create(table, "CLUST_IND",
+ DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+ dict_mem_index_add_field(index, "TABLE_ID", 0);
+ dict_mem_index_add_field(index, "POS", 0);
+
+ index->id = DICT_COLUMNS_ID;
+ error = dict_index_add_to_cache(
+ index, mach_read_from_4(dict_hdr + DICT_HDR_COLUMNS));
+ ut_a(error == DB_SUCCESS);
+ ut_ad(!table->is_instant());
+ table->indexes.start->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable)));
+
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_INDEXES", fil_system.sys_space,
+ DICT_NUM_COLS__SYS_INDEXES, 0, 0, 0);
+
+ dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 8);
+ dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 8);
+ dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
+ /* SYS_INDEXES.SPACE is redundant and not being read;
+ SYS_TABLES.SPACE is being used instead. */
+ dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "MERGE_THRESHOLD", DATA_INT, 0, 4);
+
+ table->id = DICT_INDEXES_ID;
+
+ dict_table_add_system_columns(table, heap);
+ /* The column SYS_INDEXES.MERGE_THRESHOLD was "instantly"
+ added in MySQL 5.7 and MariaDB 10.2.2. Assign it DEFAULT NULL.
+ Because of file format compatibility, we must treat SYS_INDEXES
+ as a special case, relaxing some debug assertions
+ for DICT_INDEXES_ID. */
+ dict_table_get_nth_col(table, DICT_COL__SYS_INDEXES__MERGE_THRESHOLD)
+ ->def_val.len = UNIV_SQL_NULL;
+ table->add_to_cache();
+ dict_sys.sys_indexes = table;
+ mem_heap_empty(heap);
+
+ index = dict_mem_index_create(table, "CLUST_IND",
+ DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+ dict_mem_index_add_field(index, "TABLE_ID", 0);
+ dict_mem_index_add_field(index, "ID", 0);
+
+ index->id = DICT_INDEXES_ID;
+ error = dict_index_add_to_cache(
+ index, mach_read_from_4(dict_hdr + DICT_HDR_INDEXES));
+ ut_a(error == DB_SUCCESS);
+ ut_ad(!table->is_instant());
+ table->indexes.start->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable)));
+
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_FIELDS", fil_system.sys_space,
+ 3, 0, 0, 0);
+
+ dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 8);
+ dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0);
+
+ table->id = DICT_FIELDS_ID;
+
+ dict_table_add_system_columns(table, heap);
+ table->add_to_cache();
+ dict_sys.sys_fields = table;
+ mem_heap_free(heap);
+
+ index = dict_mem_index_create(table, "CLUST_IND",
+ DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+ dict_mem_index_add_field(index, "INDEX_ID", 0);
+ dict_mem_index_add_field(index, "POS", 0);
+
+ index->id = DICT_FIELDS_ID;
+ error = dict_index_add_to_cache(
+ index, mach_read_from_4(dict_hdr + DICT_HDR_FIELDS));
+ ut_a(error == DB_SUCCESS);
+ ut_ad(!table->is_instant());
+ table->indexes.start->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable)));
+
+ mtr_commit(&mtr);
+
+ /*-------------------------*/
+
+ /* Initialize the insert buffer table and index for each tablespace */
+
+ dberr_t err = ibuf_init_at_db_start();
+
+ if (err == DB_SUCCESS) {
+ /* Load definitions of other indexes on system tables */
+
+ dict_load_sys_table(dict_sys.sys_tables);
+ dict_load_sys_table(dict_sys.sys_columns);
+ dict_load_sys_table(dict_sys.sys_indexes);
+ dict_load_sys_table(dict_sys.sys_fields);
+ }
+
+ mutex_exit(&dict_sys.mutex);
+
+ return(err);
+}
+
+/*****************************************************************//**
+Inserts the basic system table data into the system tables themselves
+at database creation. */
+static
+void
+dict_insert_initial_data(void)
+/*==========================*/
+{
+ /* Does nothing yet */
+}
+
+/*****************************************************************//**
+Creates and initializes the data dictionary at the server bootstrap.
+@return DB_SUCCESS or error code. */
+dberr_t
+dict_create(void)
+/*=============*/
+{
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ dict_hdr_create(&mtr);
+
+ mtr_commit(&mtr);
+
+ dberr_t err = dict_boot();
+
+ if (err == DB_SUCCESS) {
+ dict_insert_initial_data();
+ }
+
+ return(err);
+}
diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc
new file mode 100644
index 00000000..55e3191c
--- /dev/null
+++ b/storage/innobase/dict/dict0crea.cc
@@ -0,0 +1,2237 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0crea.cc
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0crea.h"
+#include "btr0pcur.h"
+#ifdef BTR_CUR_HASH_ADAPT
+# include "btr0sea.h"
+#endif /* BTR_CUR_HASH_ADAPT */
+#include "page0page.h"
+#include "mach0data.h"
+#include "dict0boot.h"
+#include "dict0dict.h"
+#include "que0que.h"
+#include "row0ins.h"
+#include "row0mysql.h"
+#include "pars0pars.h"
+#include "trx0roll.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "ut0vec.h"
+#include "dict0priv.h"
+#include "fts0priv.h"
+#include "srv0start.h"
+
+/*****************************************************************//**
+Based on a table object, this function builds the entry to be inserted
+in the SYS_TABLES system table.
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_tables_tuple(
+/*=========================*/
+ const dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
+{
+ dtuple_t* entry;
+ dfield_t* dfield;
+ byte* ptr;
+ ulint type;
+
+ ut_ad(table);
+ ut_ad(!table->space || table->space->id == table->space_id);
+ ut_ad(heap);
+ ut_ad(table->n_cols >= DATA_N_SYS_COLS);
+
+ entry = dtuple_create(heap, 8 + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, dict_sys.sys_tables);
+
+ /* 0: NAME -----------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__NAME);
+
+ dfield_set_data(dfield,
+ table->name.m_name, strlen(table->name.m_name));
+
+ /* 1: DB_TRX_ID added later */
+ /* 2: DB_ROLL_PTR added later */
+ /* 3: ID -------------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__ID);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(ptr, table->id);
+
+ dfield_set_data(dfield, ptr, 8);
+
+ /* 4: N_COLS ---------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__N_COLS);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+	/* If there are any virtual columns, encode their count in N_COLS */
+ mach_write_to_4(ptr, dict_table_encode_n_col(
+ ulint(table->n_cols - DATA_N_SYS_COLS),
+ ulint(table->n_v_def))
+ | (ulint(table->flags & DICT_TF_COMPACT) << 31));
+ dfield_set_data(dfield, ptr, 4);
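+	/* For illustration (assuming dict_table_encode_n_col() packs the
+	virtual column count into bits 16..30): a ROW_FORMAT=COMPACT table
+	with 5 user columns and no virtual columns stores 5 | 1U << 31. */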
+
+ /* 5: TYPE (table flags) -----------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__TYPE);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ /* Validate the table flags and convert them to what is saved in
+ SYS_TABLES.TYPE. Table flag values 0 and 1 are both written to
+ SYS_TABLES.TYPE as 1. */
+ type = dict_tf_to_sys_tables_type(table->flags);
+ mach_write_to_4(ptr, type);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 6: MIX_ID (obsolete) ---------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__MIX_ID);
+
+ ptr = static_cast<byte*>(mem_heap_zalloc(heap, 8));
+
+ dfield_set_data(dfield, ptr, 8);
+
+ /* 7: MIX_LEN (additional flags) --------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__MIX_LEN);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+	/* Be sure all unused bits are zero. */
+ ut_a(!(table->flags2 & DICT_TF2_UNUSED_BIT_MASK));
+ mach_write_to_4(ptr, table->flags2);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 8: CLUSTER_NAME ---------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__CLUSTER_ID);
+ dfield_set_null(dfield); /* not supported */
+
+ /* 9: SPACE ----------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__SPACE);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, table->space_id);
+
+ dfield_set_data(dfield, ptr, 4);
+ /*----------------------------------*/
+
+ return(entry);
+}
+
+/*****************************************************************//**
+Based on a table object, this function builds the entry to be inserted
+in the SYS_COLUMNS system table.
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_columns_tuple(
+/*==========================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint i, /*!< in: column number */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
+{
+ dtuple_t* entry;
+ const dict_col_t* column;
+ dfield_t* dfield;
+ byte* ptr;
+ const char* col_name;
+ ulint num_base = 0;
+ ulint v_col_no = ULINT_UNDEFINED;
+
+ ut_ad(table);
+ ut_ad(heap);
+
+ /* Any column beyond table->n_def would be virtual columns */
+ if (i >= table->n_def) {
+ dict_v_col_t* v_col = dict_table_get_nth_v_col(
+ table, i - table->n_def);
+ column = &v_col->m_col;
+ num_base = v_col->num_base;
+ v_col_no = column->ind;
+ } else {
+ column = dict_table_get_nth_col(table, i);
+ ut_ad(!column->is_virtual());
+ }
+
+ entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, dict_sys.sys_columns);
+
+ /* 0: TABLE_ID -----------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__TABLE_ID);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(ptr, table->id);
+
+ dfield_set_data(dfield, ptr, 8);
+
+ /* 1: POS ----------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__POS);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ if (v_col_no != ULINT_UNDEFINED) {
+		/* Encode the virtual column's position in the MySQL
+		table and in the InnoDB table in "POS" */
+ mach_write_to_4(ptr, dict_create_v_col_pos(
+ i - table->n_def, v_col_no));
+ } else {
+ mach_write_to_4(ptr, i);
+ }
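+	/* A sketch of the packing done by dict_create_v_col_pos(): the
+	sequence number among the virtual columns goes to the high 16
+	bits and the position within the whole table to the low 16 bits,
+	so a single 4-byte POS value identifies both. */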
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 2: DB_TRX_ID added later */
+ /* 3: DB_ROLL_PTR added later */
+ /* 4: NAME ---------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__NAME);
+
+ if (i >= table->n_def) {
+ col_name = dict_table_get_v_col_name(table, i - table->n_def);
+ } else {
+ col_name = dict_table_get_col_name(table, i);
+ }
+
+ dfield_set_data(dfield, col_name, strlen(col_name));
+
+ /* 5: MTYPE --------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__MTYPE);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, column->mtype);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 6: PRTYPE -------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PRTYPE);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, column->prtype);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 7: LEN ----------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__LEN);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, column->len);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 8: PREC ---------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PREC);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, num_base);
+
+ dfield_set_data(dfield, ptr, 4);
+ /*---------------------------------*/
+
+ return(entry);
+}
+
+/** Based on a table object, this function builds the entry to be inserted
+in the SYS_VIRTUAL system table. Each row maps a virtual column to one of
+its base columns.
+@param[in] table table
+@param[in] v_col_n virtual column number
+@param[in] b_col_n base column sequence num
+@param[in] heap memory heap
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_virtual_tuple(
+ const dict_table_t* table,
+ ulint v_col_n,
+ ulint b_col_n,
+ mem_heap_t* heap)
+{
+ dtuple_t* entry;
+ const dict_col_t* base_column;
+ dfield_t* dfield;
+ byte* ptr;
+
+ ut_ad(table);
+ ut_ad(heap);
+
+ ut_ad(v_col_n < table->n_v_def);
+ dict_v_col_t* v_col = dict_table_get_nth_v_col(table, v_col_n);
+ base_column = v_col->base_col[b_col_n];
+
+ entry = dtuple_create(heap, DICT_NUM_COLS__SYS_VIRTUAL
+ + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, dict_sys.sys_virtual);
+
+ /* 0: TABLE_ID -----------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_VIRTUAL__TABLE_ID);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(ptr, table->id);
+
+ dfield_set_data(dfield, ptr, 8);
+
+ /* 1: POS ---------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_VIRTUAL__POS);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ ulint v_col_no = dict_create_v_col_pos(v_col_n, v_col->m_col.ind);
+ mach_write_to_4(ptr, v_col_no);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 2: BASE_POS ----------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_VIRTUAL__BASE_POS);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, base_column->ind);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 3: DB_TRX_ID added later */
+ /* 4: DB_ROLL_PTR added later */
+
+ /*---------------------------------*/
+ return(entry);
+}
+
+/***************************************************************//**
+Builds a table definition to insert.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+dict_build_table_def_step(
+/*======================*/
+ que_thr_t* thr, /*!< in: query thread */
+ tab_node_t* node) /*!< in: table create node */
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+ dict_table_t* table = node->table;
+ trx_t* trx = thr_get_trx(thr);
+ ut_ad(!table->is_temporary());
+ ut_ad(!table->space);
+ ut_ad(table->space_id == ULINT_UNDEFINED);
+ dict_hdr_get_new_id(&table->id, NULL, NULL);
+ trx->table_id = table->id;
+
+	/* Always set this bit for all newly created tables */
+ DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME);
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ DICT_TF2_FLAG_UNSET(table,
+ DICT_TF2_FTS_AUX_HEX_NAME););
+
+ if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_FILE_PER_TABLE)) {
+ /* This table will need a new tablespace. */
+
+ ut_ad(DICT_TF_GET_ZIP_SSIZE(table->flags) == 0
+ || dict_table_has_atomic_blobs(table));
+ mtr_t mtr;
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+ if (undo && !undo->table_id
+ && trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE) {
+ /* This must be a TRUNCATE operation where
+ the empty table is created after the old table
+ was renamed. Be sure to mark the transaction
+ associated with the new empty table, so that
+ we can remove it on recovery. */
+ mtr.start();
+ undo->table_id = trx->table_id;
+ undo->dict_operation = TRUE;
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(trx->rsegs.m_redo.rseg->space->id,
+ undo->hdr_page_no),
+ &mtr);
+ mtr.write<1,mtr_t::MAYBE_NOP>(
+ *block,
+ block->frame + undo->hdr_offset
+ + TRX_UNDO_DICT_TRANS, 1U);
+ mtr.write<8,mtr_t::MAYBE_NOP>(
+ *block,
+ block->frame + undo->hdr_offset
+ + TRX_UNDO_TABLE_ID, trx->table_id);
+ mtr.commit();
+ log_write_up_to(mtr.commit_lsn(), true);
+ }
+ /* Get a new tablespace ID */
+ ulint space_id;
+ dict_hdr_get_new_id(NULL, NULL, &space_id);
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_out_of_space_ids",
+ space_id = ULINT_UNDEFINED;
+ );
+
+ if (space_id == ULINT_UNDEFINED) {
+ return DB_ERROR;
+ }
+
+ /* Determine the tablespace flags. */
+ bool has_data_dir = DICT_TF_HAS_DATA_DIR(table->flags);
+ ulint fsp_flags = dict_tf_to_fsp_flags(table->flags);
+ ut_ad(!has_data_dir || table->data_dir_path);
+ char* filepath = has_data_dir
+ ? fil_make_filepath(table->data_dir_path,
+ table->name.m_name, IBD, true)
+ : fil_make_filepath(NULL,
+ table->name.m_name, IBD, false);
+
+ /* We create a new single-table tablespace for the table.
+ We initially let it be 4 pages:
+ - page 0 is the fsp header and an extent descriptor page,
+ - page 1 is an ibuf bitmap page,
+ - page 2 is the first inode page,
+ - page 3 will contain the root of the clustered index of
+ the table we create here. */
+
+ dberr_t err;
+ table->space = fil_ibd_create(
+ space_id, table->name.m_name, filepath, fsp_flags,
+ FIL_IBD_FILE_INITIAL_SIZE,
+ node->mode, node->key_id, &err);
+
+ ut_free(filepath);
+
+ if (!table->space) {
+ ut_ad(err != DB_SUCCESS);
+ return err;
+ }
+
+ table->space_id = space_id;
+ mtr.start();
+ mtr.set_named_space(table->space);
+ fsp_header_init(table->space, FIL_IBD_FILE_INITIAL_SIZE, &mtr);
+ mtr.commit();
+ } else {
+ ut_ad(dict_tf_get_rec_format(table->flags)
+ != REC_FORMAT_COMPRESSED);
+ table->space = fil_system.sys_space;
+ table->space_id = TRX_SYS_SPACE;
+ }
+
+ ins_node_set_new_row(node->tab_def,
+ dict_create_sys_tables_tuple(table, node->heap));
+ return DB_SUCCESS;
+}
+
+/** Builds a SYS_VIRTUAL row definition to insert.
+@param[in] node table create node */
+static
+void
+dict_build_v_col_def_step(
+ tab_node_t* node)
+{
+ dtuple_t* row;
+
+ row = dict_create_sys_virtual_tuple(node->table, node->col_no,
+ node->base_col_no,
+ node->heap);
+ ins_node_set_new_row(node->v_col_def, row);
+}
+
+/*****************************************************************//**
+Based on an index object, this function builds the entry to be inserted
+in the SYS_INDEXES system table.
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_indexes_tuple(
+/*==========================*/
+ const dict_index_t* index, /*!< in: index */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
+{
+ dtuple_t* entry;
+ dfield_t* dfield;
+ byte* ptr;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(index);
+ ut_ad(index->table->space || index->table->file_unreadable);
+ ut_ad(!index->table->space
+ || index->table->space->id == index->table->space_id);
+ ut_ad(heap);
+
+ entry = dtuple_create(
+ heap, DICT_NUM_COLS__SYS_INDEXES + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, dict_sys.sys_indexes);
+
+ /* 0: TABLE_ID -----------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__TABLE_ID);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(ptr, index->table->id);
+
+ dfield_set_data(dfield, ptr, 8);
+
+ /* 1: ID ----------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__ID);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(ptr, index->id);
+
+ dfield_set_data(dfield, ptr, 8);
+
+ /* 2: DB_TRX_ID added later */
+ /* 3: DB_ROLL_PTR added later */
+ /* 4: NAME --------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__NAME);
+
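+	/* An uncommitted index is recorded under a name with a one-byte
+	TEMP_INDEX_PREFIX prepended, so that after a crash such
+	half-created indexes can be recognized and dropped. */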
+ if (!index->is_committed()) {
+ ulint len = strlen(index->name) + 1;
+ char* name = static_cast<char*>(
+ mem_heap_alloc(heap, len));
+ *name = *TEMP_INDEX_PREFIX_STR;
+ memcpy(name + 1, index->name, len - 1);
+ dfield_set_data(dfield, name, len);
+ } else {
+ dfield_set_data(dfield, index->name, strlen(index->name));
+ }
+
+ /* 5: N_FIELDS ----------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__N_FIELDS);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, index->n_fields);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 6: TYPE --------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__TYPE);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, index->type);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 7: SPACE --------------------------*/
+
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__SPACE);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, index->table->space_id);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 8: PAGE_NO --------------------------*/
+
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__PAGE_NO);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, FIL_NULL);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 9: MERGE_THRESHOLD ----------------*/
+
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__MERGE_THRESHOLD);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, DICT_INDEX_MERGE_THRESHOLD_DEFAULT);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /*--------------------------------*/
+
+ return(entry);
+}
+
+/*****************************************************************//**
+Based on an index object, this function builds the entry to be inserted
+in the SYS_FIELDS system table.
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_fields_tuple(
+/*=========================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint fld_no, /*!< in: field number */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
+{
+ dtuple_t* entry;
+ dict_field_t* field;
+ dfield_t* dfield;
+ byte* ptr;
+ ibool index_contains_column_prefix_field = FALSE;
+ ulint j;
+
+ ut_ad(index);
+ ut_ad(heap);
+
+ for (j = 0; j < index->n_fields; j++) {
+ if (dict_index_get_nth_field(index, j)->prefix_len > 0) {
+ index_contains_column_prefix_field = TRUE;
+ break;
+ }
+ }
+
+ field = dict_index_get_nth_field(index, fld_no);
+
+ entry = dtuple_create(heap, 3 + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, dict_sys.sys_fields);
+
+ /* 0: INDEX_ID -----------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__INDEX_ID);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(ptr, index->id);
+
+ dfield_set_data(dfield, ptr, 8);
+
+ /* 1: POS; FIELD NUMBER & PREFIX LENGTH -----------------------*/
+
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__POS);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ if (index_contains_column_prefix_field) {
+		/* If there are column prefix fields in the index, then
+		we store the field number in the 2 HIGH bytes and the
+		prefix length in the 2 LOW bytes. */
+
+ mach_write_to_4(ptr, (fld_no << 16) + field->prefix_len);
+ } else {
+		/* Else we store the field number in the 2 LOW bytes.
+		This keeps the storage format compatible with
+		InnoDB versions < 4.0.14. */
+
+ mach_write_to_4(ptr, fld_no);
+ }
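+	/* Example: field number 2 with a 767-byte prefix is stored as
+	(2 << 16) + 767 = 0x202FF; without prefixes, field number 2 is
+	stored simply as 2. */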
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 2: DB_TRX_ID added later */
+ /* 3: DB_ROLL_PTR added later */
+ /* 4: COL_NAME -------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__COL_NAME);
+
+ dfield_set_data(dfield, field->name, strlen(field->name));
+ /*---------------------------------*/
+
+ return(entry);
+}
+
+/*****************************************************************//**
+Creates the tuple with which the index entry is searched for writing the index
+tree root page number, if such a tree is created.
+@return the tuple for search */
+static
+dtuple_t*
+dict_create_search_tuple(
+/*=====================*/
+ const dtuple_t* tuple, /*!< in: the tuple inserted in the SYS_INDEXES
+ table */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory for
+ the built tuple is allocated */
+{
+ dtuple_t* search_tuple;
+ const dfield_t* field1;
+ dfield_t* field2;
+
+ ut_ad(tuple && heap);
+
+ search_tuple = dtuple_create(heap, 2);
+
+ field1 = dtuple_get_nth_field(tuple, 0);
+ field2 = dtuple_get_nth_field(search_tuple, 0);
+
+ dfield_copy(field2, field1);
+
+ field1 = dtuple_get_nth_field(tuple, 1);
+ field2 = dtuple_get_nth_field(search_tuple, 1);
+
+ dfield_copy(field2, field1);
+
+ ut_ad(dtuple_validate(search_tuple));
+
+ return(search_tuple);
+}
+
+/***************************************************************//**
+Builds an index definition row to insert.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+dict_build_index_def_step(
+/*======================*/
+ que_thr_t* thr, /*!< in: query thread */
+ ind_node_t* node) /*!< in: index create node */
+{
+ dict_table_t* table;
+ dict_index_t* index;
+ dtuple_t* row;
+ trx_t* trx;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ trx = thr_get_trx(thr);
+
+ index = node->index;
+
+ table = index->table = node->table = dict_table_open_on_name(
+ node->table_name, TRUE, FALSE, DICT_ERR_IGNORE_NONE);
+
+ if (table == NULL) {
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ if (!trx->table_id) {
+ /* Record only the first table id. */
+ trx->table_id = table->id;
+ }
+
+ ut_ad((UT_LIST_GET_LEN(table->indexes) > 0)
+ || dict_index_is_clust(index));
+
+ dict_hdr_get_new_id(NULL, &index->id, NULL);
+
+ /* Inherit the space id from the table; we store all indexes of a
+ table in the same tablespace */
+
+ node->page_no = FIL_NULL;
+ row = dict_create_sys_indexes_tuple(index, node->heap);
+ node->ind_row = row;
+
+ ins_node_set_new_row(node->ind_def, row);
+
+ /* Note that the index was created by this transaction. */
+ index->trx_id = trx->id;
+ ut_ad(table->def_trx_id <= trx->id);
+ table->def_trx_id = trx->id;
+ dict_table_close(table, true, false);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Builds an index definition without updating SYSTEM TABLES.
+@return DB_SUCCESS or error code */
+void
+dict_build_index_def(
+/*=================*/
+ const dict_table_t* table, /*!< in: table */
+ dict_index_t* index, /*!< in/out: index */
+ trx_t* trx) /*!< in/out: InnoDB transaction handle */
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ if (trx->table_id == 0) {
+ /* Record only the first table id. */
+ trx->table_id = table->id;
+ }
+
+ ut_ad((UT_LIST_GET_LEN(table->indexes) > 0)
+ || dict_index_is_clust(index));
+
+ dict_hdr_get_new_id(NULL, &index->id, NULL);
+
+ /* Note that the index was created by this transaction. */
+ index->trx_id = trx->id;
+}
+
+/***************************************************************//**
+Builds a field definition row to insert. */
+static
+void
+dict_build_field_def_step(
+/*======================*/
+ ind_node_t* node) /*!< in: index create node */
+{
+ dict_index_t* index;
+ dtuple_t* row;
+
+ index = node->index;
+
+ row = dict_create_sys_fields_tuple(index, node->field_no, node->heap);
+
+ ins_node_set_new_row(node->field_def, row);
+}
+
+/***************************************************************//**
+Creates an index tree for the index if it is not a member of a cluster.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+dict_create_index_tree_step(
+/*========================*/
+ ind_node_t* node) /*!< in: index create node */
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ dict_index_t* index;
+ dtuple_t* search_tuple;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ index = node->index;
+
+ if (index->type == DICT_FTS) {
+ /* FTS index does not need an index tree */
+ return(DB_SUCCESS);
+ }
+
+ /* Run a mini-transaction in which the index tree is allocated for
+ the index and its root address is written to the index entry in
+ sys_indexes */
+
+ mtr.start();
+
+ search_tuple = dict_create_search_tuple(node->ind_row, node->heap);
+
+ btr_pcur_open(UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes),
+ search_tuple, PAGE_CUR_L, BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ dberr_t err = DB_SUCCESS;
+
+ if (!index->is_readable()) {
+ node->page_no = FIL_NULL;
+ } else {
+ index->set_modified(mtr);
+
+ node->page_no = btr_create(
+ index->type, index->table->space,
+ index->id, index, &mtr);
+
+ if (node->page_no == FIL_NULL) {
+ err = DB_OUT_OF_FILE_SPACE;
+ }
+
+ DBUG_EXECUTE_IF("ib_import_create_index_failure_1",
+ node->page_no = FIL_NULL;
+ err = DB_OUT_OF_FILE_SPACE; );
+ }
+
+ ulint len;
+ byte* data = rec_get_nth_field_old(btr_pcur_get_rec(&pcur),
+ DICT_FLD__SYS_INDEXES__PAGE_NO,
+ &len);
+ ut_ad(len == 4);
+ mtr.write<4,mtr_t::MAYBE_NOP>(*btr_pcur_get_block(&pcur), data,
+ node->page_no);
+
+ mtr.commit();
+
+ return(err);
+}
+
+/***************************************************************//**
+Creates an index tree for the index if it is not a member of a cluster.
+Don't update SYSTEM TABLES.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+dict_create_index_tree_in_mem(
+/*==========================*/
+ dict_index_t* index, /*!< in/out: index */
+ const trx_t* trx) /*!< in: InnoDB transaction handle */
+{
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(!(index->type & DICT_FTS));
+
+ mtr_start(&mtr);
+ mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+
+	/* Currently this function is used only for temporary tables.
+	IMPORT/DISCARD of temporary tables is blocked, hence these
+	assertions. */
+ ut_ad(index->is_readable());
+ ut_ad(!(index->table->flags2 & DICT_TF2_DISCARDED));
+
+ index->page = btr_create(index->type, index->table->space,
+ index->id, index, &mtr);
+ mtr_commit(&mtr);
+
+ index->trx_id = trx->id;
+
+ return index->page == FIL_NULL ? DB_OUT_OF_FILE_SPACE : DB_SUCCESS;
+}
+
+/** Drop the index tree associated with a row in SYS_INDEXES table.
+@param[in,out] pcur persistent cursor on rec
+@param[in,out] trx dictionary transaction
+@param[in,out] mtr mini-transaction */
+void dict_drop_index_tree(btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr)
+{
+ rec_t* rec = btr_pcur_get_rec(pcur);
+ byte* ptr;
+ ulint len;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+ ut_a(!dict_table_is_comp(dict_sys.sys_indexes));
+
+ ptr = rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
+
+ ut_ad(len == 4);
+
+ btr_pcur_store_position(pcur, mtr);
+
+ const uint32_t root_page_no = mach_read_from_4(ptr);
+
+ if (root_page_no == FIL_NULL) {
+ /* The tree has already been freed */
+ return;
+ }
+
+ compile_time_assert(FIL_NULL == 0xffffffff);
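+	/* Writing four 0xff bytes over PAGE_NO stores FIL_NULL, marking
+	the index tree as freed for any later reader of this record. */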
+ mtr->memset(btr_pcur_get_block(pcur), page_offset(ptr), 4, 0xff);
+
+ ptr = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__SPACE, &len);
+
+ ut_ad(len == 4);
+
+ const uint32_t space_id = mach_read_from_4(ptr);
+ ut_ad(space_id < SRV_TMP_SPACE_ID);
+ if (space_id != TRX_SYS_SPACE
+ && trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE) {
+ /* We are about to delete the entire .ibd file;
+ do not bother to free pages inside it. */
+ return;
+ }
+
+ ptr = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__ID, &len);
+
+ ut_ad(len == 8);
+
+ if (fil_space_t* s = fil_space_t::get(space_id)) {
+ /* Ensure that the tablespace file exists
+ in order to avoid a crash in buf_page_get_gen(). */
+ if (root_page_no < s->get_size()) {
+ btr_free_if_exists(page_id_t(space_id, root_page_no),
+ s->zip_size(),
+ mach_read_from_8(ptr), mtr);
+ }
+ s->release();
+ }
+}
+
+/*********************************************************************//**
+Creates a table create graph.
+@return own: table create node */
+tab_node_t*
+tab_create_graph_create(
+/*====================*/
+ dict_table_t* table, /*!< in: table to create, built as a memory data
+ structure */
+ mem_heap_t* heap, /*!< in: heap where created */
+ fil_encryption_t mode, /*!< in: encryption mode */
+ uint32_t key_id) /*!< in: encryption key_id */
+{
+ tab_node_t* node;
+
+ node = static_cast<tab_node_t*>(
+ mem_heap_alloc(heap, sizeof(tab_node_t)));
+
+ node->common.type = QUE_NODE_CREATE_TABLE;
+
+ node->table = table;
+
+ node->state = TABLE_BUILD_TABLE_DEF;
+ node->heap = mem_heap_create(256);
+ node->mode = mode;
+ node->key_id = key_id;
+
+ node->tab_def = ins_node_create(INS_DIRECT, dict_sys.sys_tables,
+ heap);
+ node->tab_def->common.parent = node;
+
+ node->col_def = ins_node_create(INS_DIRECT, dict_sys.sys_columns,
+ heap);
+ node->col_def->common.parent = node;
+
+ node->v_col_def = ins_node_create(INS_DIRECT, dict_sys.sys_virtual,
+ heap);
+ node->v_col_def->common.parent = node;
+
+ return(node);
+}
+
+/** Creates an index create graph.
+@param[in] index index to create, built as a memory data structure
+@param[in] table table name
+@param[in,out] heap heap where created
+@param[in] add_v new virtual columns added in the same clause with
+ add index
+@return own: index create node */
+ind_node_t*
+ind_create_graph_create(
+ dict_index_t* index,
+ const char* table,
+ mem_heap_t* heap,
+ const dict_add_v_col_t* add_v)
+{
+ ind_node_t* node;
+
+ node = static_cast<ind_node_t*>(
+ mem_heap_alloc(heap, sizeof(ind_node_t)));
+
+ node->common.type = QUE_NODE_CREATE_INDEX;
+
+ node->index = index;
+
+ node->table_name = table;
+
+ node->add_v = add_v;
+
+ node->state = INDEX_BUILD_INDEX_DEF;
+ node->page_no = FIL_NULL;
+ node->heap = mem_heap_create(256);
+
+ node->ind_def = ins_node_create(INS_DIRECT,
+ dict_sys.sys_indexes, heap);
+ node->ind_def->common.parent = node;
+
+ node->field_def = ins_node_create(INS_DIRECT,
+ dict_sys.sys_fields, heap);
+ node->field_def->common.parent = node;
+
+ return(node);
+}
+
+/***********************************************************//**
+Creates a table. This is a high-level function used in SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+dict_create_table_step(
+/*===================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ tab_node_t* node;
+ dberr_t err = DB_ERROR;
+ trx_t* trx;
+
+ ut_ad(thr);
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ trx = thr_get_trx(thr);
+
+ node = static_cast<tab_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_TABLE);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = TABLE_BUILD_TABLE_DEF;
+ }
+
+ if (node->state == TABLE_BUILD_TABLE_DEF) {
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = dict_build_table_def_step(thr, node);
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->state = TABLE_BUILD_COL_DEF;
+ node->col_no = 0;
+
+ thr->run_node = node->tab_def;
+
+ return(thr);
+ }
+
+ if (node->state == TABLE_BUILD_COL_DEF) {
+
+ if (node->col_no + DATA_N_SYS_COLS
+ < (static_cast<ulint>(node->table->n_def)
+ + static_cast<ulint>(node->table->n_v_def))) {
+
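+			/* In node->table, the user columns come first,
+			then the DATA_N_SYS_COLS system columns
+			(DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR), then the
+			virtual columns. Skip over the system columns
+			so that the virtual columns are also written
+			to SYS_COLUMNS. */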
+ ulint i = node->col_no++;
+ if (i + DATA_N_SYS_COLS >= node->table->n_def) {
+ i += DATA_N_SYS_COLS;
+ }
+
+ ins_node_set_new_row(
+ node->col_def,
+ dict_create_sys_columns_tuple(node->table, i,
+ node->heap));
+
+ thr->run_node = node->col_def;
+
+ return(thr);
+ } else {
+ /* Move on to SYS_VIRTUAL table */
+ node->col_no = 0;
+ node->base_col_no = 0;
+ node->state = TABLE_BUILD_V_COL_DEF;
+ }
+ }
+
+ if (node->state == TABLE_BUILD_V_COL_DEF) {
+
+ if (node->col_no < static_cast<ulint>(node->table->n_v_def)) {
+ dict_v_col_t* v_col = dict_table_get_nth_v_col(
+ node->table, node->col_no);
+
+			/* Skip virtual columns that have no base columns */
+ while (v_col->num_base == 0) {
+ node->col_no++;
+ if (node->col_no == static_cast<ulint>(
+ (node->table)->n_v_def)) {
+ node->state = TABLE_ADD_TO_CACHE;
+ break;
+ }
+
+ v_col = dict_table_get_nth_v_col(
+ node->table, node->col_no);
+ node->base_col_no = 0;
+ }
+
+ if (node->state != TABLE_ADD_TO_CACHE) {
+ ut_ad(node->col_no == v_col->v_pos);
+ dict_build_v_col_def_step(node);
+
+ if (node->base_col_no
+ < unsigned{v_col->num_base} - 1) {
+ /* move on to next base column */
+ node->base_col_no++;
+ } else {
+ /* move on to next virtual column */
+ node->col_no++;
+ node->base_col_no = 0;
+ }
+
+ thr->run_node = node->v_col_def;
+
+ return(thr);
+ }
+ } else {
+ node->state = TABLE_ADD_TO_CACHE;
+ }
+ }
+
+ if (node->state == TABLE_ADD_TO_CACHE) {
+ DBUG_EXECUTE_IF("ib_ddl_crash_during_create", DBUG_SUICIDE(););
+
+ node->table->can_be_evicted = true;
+ node->table->add_to_cache();
+
+ err = DB_SUCCESS;
+ }
+
+function_exit:
+ trx->error_state = err;
+
+ if (err == DB_SUCCESS) {
+ /* Ok: do nothing */
+
+ } else if (err == DB_LOCK_WAIT) {
+
+ return(NULL);
+ } else {
+ /* SQL error detected */
+
+ return(NULL);
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
+
+/***********************************************************//**
+Creates an index. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+dict_create_index_step(
+/*===================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ind_node_t* node;
+ dberr_t err = DB_ERROR;
+ trx_t* trx;
+
+ ut_ad(thr);
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ trx = thr_get_trx(thr);
+
+ node = static_cast<ind_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_INDEX);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = INDEX_BUILD_INDEX_DEF;
+ }
+
+ if (node->state == INDEX_BUILD_INDEX_DEF) {
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+ err = dict_build_index_def_step(thr, node);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->state = INDEX_BUILD_FIELD_DEF;
+ node->field_no = 0;
+
+ thr->run_node = node->ind_def;
+
+ return(thr);
+ }
+
+ if (node->state == INDEX_BUILD_FIELD_DEF) {
+
+ if (node->field_no < (node->index)->n_fields) {
+
+ dict_build_field_def_step(node);
+
+ node->field_no++;
+
+ thr->run_node = node->field_def;
+
+ return(thr);
+ } else {
+ node->state = INDEX_ADD_TO_CACHE;
+ }
+ }
+
+ if (node->state == INDEX_ADD_TO_CACHE) {
+ ut_ad(node->index->table == node->table);
+ err = dict_index_add_to_cache(node->index, FIL_NULL,
+ node->add_v);
+
+ ut_ad((node->index == NULL) == (err != DB_SUCCESS));
+
+ if (!node->index) {
+ goto function_exit;
+ }
+
+ ut_ad(!node->index->is_instant());
+ ut_ad(node->index->n_core_null_bytes
+ == ((dict_index_is_clust(node->index)
+ && node->table->supports_instant())
+ ? dict_index_t::NO_CORE_NULL_BYTES
+ : UT_BITS_IN_BYTES(
+ unsigned(node->index->n_nullable))));
+ node->index->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(node->index->n_nullable)));
+ node->state = INDEX_CREATE_INDEX_TREE;
+ }
+
+ if (node->state == INDEX_CREATE_INDEX_TREE) {
+
+ err = dict_create_index_tree_step(node);
+
+ DBUG_EXECUTE_IF("ib_dict_create_index_tree_fail",
+ err = DB_OUT_OF_MEMORY;);
+
+ if (err != DB_SUCCESS) {
+ /* If this is a FTS index, we will need to remove
+ it from fts->cache->indexes list as well */
+ if ((node->index->type & DICT_FTS)
+ && node->table->fts) {
+ fts_index_cache_t* index_cache;
+
+ rw_lock_x_lock(
+ &node->table->fts->cache->init_lock);
+
+ index_cache = (fts_index_cache_t*)
+ fts_find_index_cache(
+ node->table->fts->cache,
+ node->index);
+
+ if (index_cache->words) {
+ rbt_free(index_cache->words);
+ index_cache->words = 0;
+ }
+
+ ib_vector_remove(
+ node->table->fts->cache->indexes,
+ *reinterpret_cast<void**>(index_cache));
+
+ rw_lock_x_unlock(
+ &node->table->fts->cache->init_lock);
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!node->index->search_info->ref_count);
+#endif /* BTR_CUR_HASH_ADAPT */
+ dict_index_remove_from_cache(node->table, node->index);
+ node->index = NULL;
+
+ goto function_exit;
+ }
+
+ node->index->page = node->page_no;
+ /* These should have been set in
+ dict_build_index_def_step() and
+ dict_index_add_to_cache(). */
+ ut_ad(node->index->trx_id == trx->id);
+ ut_ad(node->index->table->def_trx_id == trx->id);
+ }
+
+function_exit:
+ trx->error_state = err;
+
+ if (err == DB_SUCCESS) {
+ /* Ok: do nothing */
+
+ } else if (err == DB_LOCK_WAIT) {
+
+ return(NULL);
+ } else {
+ /* SQL error detected */
+
+ return(NULL);
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
+
+/****************************************************************//**
+Check whether a system table exists. Additionally, if it exists,
+move it to the non-LRU end of the table LRU list. This is only used
+for system tables that can be upgraded or added to an older database,
+which include SYS_FOREIGN, SYS_FOREIGN_COLS, SYS_TABLESPACES and
+SYS_DATAFILES.
+@return DB_SUCCESS if the sys table exists, DB_CORRUPTION if it exists
+but is not current, DB_TABLE_NOT_FOUND if it does not exist */
+static
+dberr_t
+dict_check_if_system_table_exists(
+/*==============================*/
+ const char* tablename, /*!< in: name of table */
+ ulint num_fields, /*!< in: number of fields */
+ ulint num_indexes) /*!< in: number of indexes */
+{
+ dict_table_t* sys_table;
+ dberr_t error = DB_SUCCESS;
+
+ ut_ad(!srv_any_background_activity());
+
+ mutex_enter(&dict_sys.mutex);
+
+ sys_table = dict_table_get_low(tablename);
+
+ if (sys_table == NULL) {
+ error = DB_TABLE_NOT_FOUND;
+
+ } else if (UT_LIST_GET_LEN(sys_table->indexes) != num_indexes
+ || sys_table->n_cols != num_fields) {
+ error = DB_CORRUPTION;
+
+ } else {
+ /* This table has already been created, and it is OK.
+ Ensure that it can't be evicted from the table LRU cache. */
+
+ dict_table_prevent_eviction(sys_table);
+ }
+
+ mutex_exit(&dict_sys.mutex);
+
+ return(error);
+}
+
+/****************************************************************//**
+Creates the foreign key constraints system tables inside InnoDB
+at server bootstrap or server start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_create_or_check_foreign_constraint_tables(void)
+/*================================================*/
+{
+ trx_t* trx;
+ my_bool srv_file_per_table_backup;
+ dberr_t err;
+ dberr_t sys_foreign_err;
+ dberr_t sys_foreign_cols_err;
+
+ ut_ad(!srv_any_background_activity());
+
+ /* Note: The master thread has not been started at this point. */
+
+ sys_foreign_err = dict_check_if_system_table_exists(
+ "SYS_FOREIGN", DICT_NUM_FIELDS__SYS_FOREIGN + 1, 3);
+ sys_foreign_cols_err = dict_check_if_system_table_exists(
+ "SYS_FOREIGN_COLS", DICT_NUM_FIELDS__SYS_FOREIGN_COLS + 1, 1);
+
+ if (sys_foreign_err == DB_SUCCESS
+ && sys_foreign_cols_err == DB_SUCCESS) {
+ return(DB_SUCCESS);
+ }
+
+ if (srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) {
+ return(DB_READ_ONLY);
+ }
+
+ trx = trx_create();
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ trx->op_info = "creating foreign key sys tables";
+
+ row_mysql_lock_data_dictionary(trx);
+
+ DBUG_EXECUTE_IF(
+ "create_and_drop_garbage",
+ err = que_eval_sql(
+ NULL,
+ "PROCEDURE CREATE_GARBAGE_TABLE_PROC () IS\n"
+ "BEGIN\n"
+ "CREATE TABLE\n"
+ "\"test/#sql-ib-garbage\"(ID CHAR);\n"
+ "CREATE UNIQUE CLUSTERED INDEX PRIMARY"
+ " ON \"test/#sql-ib-garbage\"(ID);\n"
+ "END;\n", FALSE, trx);
+ ut_ad(err == DB_SUCCESS);
+ row_drop_table_for_mysql("test/#sql-ib-garbage", trx,
+ SQLCOM_DROP_DB, true););
+
+ /* Check which incomplete table definition to drop. */
+
+ if (sys_foreign_err == DB_CORRUPTION) {
+ row_drop_table_after_create_fail("SYS_FOREIGN", trx);
+ }
+
+ if (sys_foreign_cols_err == DB_CORRUPTION) {
+ row_drop_table_after_create_fail("SYS_FOREIGN_COLS", trx);
+ }
+
+ ib::info() << "Creating foreign key constraint system tables.";
+
+ /* NOTE: in dict_load_foreigns we use the fact that
+ there are 2 secondary indexes on SYS_FOREIGN, and they
+ are defined just like below */
+
+ /* NOTE: when designing InnoDB's foreign key support in 2001, we made
+ an error and made the table names and the foreign key id of type
+ 'CHAR' (internally, really a VARCHAR). We should have made the type
+ VARBINARY, like in other InnoDB system tables, to get a clean
+ design. */
+
+ srv_file_per_table_backup = srv_file_per_table;
+
+ /* We always want SYSTEM tables to be created inside the system
+ tablespace. */
+
+ srv_file_per_table = 0;
+
+ err = que_eval_sql(
+ NULL,
+ "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n"
+ "BEGIN\n"
+ "CREATE TABLE\n"
+ "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR,"
+ " REF_NAME CHAR, N_COLS INT);\n"
+ "CREATE UNIQUE CLUSTERED INDEX ID_IND"
+ " ON SYS_FOREIGN (ID);\n"
+ "CREATE INDEX FOR_IND"
+ " ON SYS_FOREIGN (FOR_NAME);\n"
+ "CREATE INDEX REF_IND"
+ " ON SYS_FOREIGN (REF_NAME);\n"
+ "CREATE TABLE\n"
+ "SYS_FOREIGN_COLS(ID CHAR, POS INT,"
+ " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n"
+ "CREATE UNIQUE CLUSTERED INDEX ID_IND"
+ " ON SYS_FOREIGN_COLS (ID, POS);\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+		ib::error() << "Creation of SYS_FOREIGN and SYS_FOREIGN_COLS"
+			" failed: " << err << ". Tablespace is full"
+			" or too many transactions."
+			" Dropping incompletely created tables.";
+
+ ut_ad(err == DB_OUT_OF_FILE_SPACE
+ || err == DB_TOO_MANY_CONCURRENT_TRXS);
+
+ row_drop_table_after_create_fail("SYS_FOREIGN", trx);
+ row_drop_table_after_create_fail("SYS_FOREIGN_COLS", trx);
+
+ if (err == DB_OUT_OF_FILE_SPACE) {
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+ }
+ }
+
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->free();
+
+ srv_file_per_table = srv_file_per_table_backup;
+
+ /* Note: The master thread has not been started at this point. */
+ /* Confirm and move to the non-LRU part of the table LRU list. */
+ sys_foreign_err = dict_check_if_system_table_exists(
+ "SYS_FOREIGN", DICT_NUM_FIELDS__SYS_FOREIGN + 1, 3);
+ ut_a(sys_foreign_err == DB_SUCCESS);
+
+ sys_foreign_cols_err = dict_check_if_system_table_exists(
+ "SYS_FOREIGN_COLS", DICT_NUM_FIELDS__SYS_FOREIGN_COLS + 1, 1);
+ ut_a(sys_foreign_cols_err == DB_SUCCESS);
+
+ return(err);
+}
+
+/** Creates the virtual column system table (SYS_VIRTUAL) inside InnoDB
+at server bootstrap or server start if the table is not found or is
+not of the right form.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_create_or_check_sys_virtual()
+{
+ trx_t* trx;
+ my_bool srv_file_per_table_backup;
+ dberr_t err;
+
+ ut_ad(!srv_any_background_activity());
+
+ /* Note: The master thread has not been started at this point. */
+ err = dict_check_if_system_table_exists(
+ "SYS_VIRTUAL", DICT_NUM_FIELDS__SYS_VIRTUAL + 1, 1);
+
+ if (err == DB_SUCCESS) {
+ mutex_enter(&dict_sys.mutex);
+ dict_sys.sys_virtual = dict_table_get_low("SYS_VIRTUAL");
+ mutex_exit(&dict_sys.mutex);
+ return(DB_SUCCESS);
+ }
+
+ if (srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) {
+ return(DB_READ_ONLY);
+ }
+
+ trx = trx_create();
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ trx->op_info = "creating sys_virtual tables";
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Check which incomplete table definition to drop. */
+
+ if (err == DB_CORRUPTION) {
+ row_drop_table_after_create_fail("SYS_VIRTUAL", trx);
+ }
+
+	ib::info() << "Creating the SYS_VIRTUAL system table.";
+
+ srv_file_per_table_backup = srv_file_per_table;
+
+ /* We always want SYSTEM tables to be created inside the system
+ tablespace. */
+
+ srv_file_per_table = 0;
+
+ err = que_eval_sql(
+ NULL,
+ "PROCEDURE CREATE_SYS_VIRTUAL_TABLES_PROC () IS\n"
+ "BEGIN\n"
+ "CREATE TABLE\n"
+ "SYS_VIRTUAL(TABLE_ID BIGINT, POS INT,"
+ " BASE_POS INT);\n"
+ "CREATE UNIQUE CLUSTERED INDEX BASE_IDX"
+ " ON SYS_VIRTUAL(TABLE_ID, POS, BASE_POS);\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ ib::error() << "Creation of SYS_VIRTUAL"
+ " failed: " << err << ". Tablespace is"
+ " full or too many transactions."
+ " Dropping incompletely created tables.";
+
+ ut_ad(err == DB_OUT_OF_FILE_SPACE
+ || err == DB_TOO_MANY_CONCURRENT_TRXS);
+
+ row_drop_table_after_create_fail("SYS_VIRTUAL", trx);
+
+ if (err == DB_OUT_OF_FILE_SPACE) {
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+ }
+ }
+
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->free();
+
+ srv_file_per_table = srv_file_per_table_backup;
+
+ /* Note: The master thread has not been started at this point. */
+ /* Confirm and move to the non-LRU part of the table LRU list. */
+ dberr_t sys_virtual_err = dict_check_if_system_table_exists(
+ "SYS_VIRTUAL", DICT_NUM_FIELDS__SYS_VIRTUAL + 1, 1);
+ ut_a(sys_virtual_err == DB_SUCCESS);
+ mutex_enter(&dict_sys.mutex);
+ dict_sys.sys_virtual = dict_table_get_low("SYS_VIRTUAL");
+ mutex_exit(&dict_sys.mutex);
+
+ return(err);
+}
+
+/****************************************************************//**
+Evaluate the given foreign key SQL statement.
+@return error code or DB_SUCCESS */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+dict_foreign_eval_sql(
+/*==================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* sql, /*!< in: SQL string to evaluate */
+ const char* name, /*!< in: table name (for diagnostics) */
+ const char* id, /*!< in: foreign key id */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ dberr_t error;
+ FILE* ef = dict_foreign_err_file;
+
+ error = que_eval_sql(info, sql, FALSE, trx);
+
+ if (error == DB_DUPLICATE_KEY) {
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Error in foreign key constraint creation for table ",
+ ef);
+ ut_print_name(ef, trx, name);
+ fputs(".\nA foreign key constraint of name ", ef);
+ ut_print_name(ef, trx, id);
+ fputs("\nalready exists."
+ " (Note that internally InnoDB adds 'databasename'\n"
+ "in front of the user-defined constraint name.)\n"
+ "Note that InnoDB's FOREIGN KEY system tables store\n"
+ "constraint names as case-insensitive, with the\n"
+ "MySQL standard latin1_swedish_ci collation. If you\n"
+ "create tables or databases whose names differ only in\n"
+ "the character case, then collisions in constraint\n"
+ "names can occur. Workaround: name your constraints\n"
+ "explicitly with unique names.\n",
+ ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(error);
+ }
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ib::error() << "Foreign key constraint creation failed: "
+ << error;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ ut_print_timestamp(ef);
+ fputs(" Internal error in foreign key constraint creation"
+ " for table ", ef);
+ ut_print_name(ef, trx, name);
+ fputs(".\n"
+ "See the MySQL .err log in the datadir"
+ " for more information.\n", ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(error);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Add a single foreign key field definition to the data dictionary tables in
+the database.
+@return error code or DB_SUCCESS */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+dict_create_add_foreign_field_to_dictionary(
+/*========================================*/
+ ulint field_nr, /*!< in: field number */
+ const char* table_name, /*!< in: table name */
+ const dict_foreign_t* foreign, /*!< in: foreign */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ DBUG_ENTER("dict_create_add_foreign_field_to_dictionary");
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", foreign->id);
+
+ pars_info_add_int4_literal(info, "pos", field_nr);
+
+ pars_info_add_str_literal(info, "for_col_name",
+ foreign->foreign_col_names[field_nr]);
+
+ pars_info_add_str_literal(info, "ref_col_name",
+ foreign->referenced_col_names[field_nr]);
+
+ DBUG_RETURN(dict_foreign_eval_sql(
+ info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "INSERT INTO SYS_FOREIGN_COLS VALUES"
+ "(:id, :pos, :for_col_name, :ref_col_name);\n"
+ "END;\n",
+ table_name, foreign->id, trx));
+}
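+
+/* A sketch of the effect: for a constraint with id 'db/t1_ibfk_1'
+declared as FOREIGN KEY (a) REFERENCES t2 (b), the call with
+field_nr == 0 inserts the row ('db/t1_ibfk_1', 0, 'a', 'b') into
+SYS_FOREIGN_COLS. */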
+
+/********************************************************************//**
+Construct a foreign key constraint definition from data dictionary
+information.
+@return the constraint definition string, allocated from foreign->heap */
+UNIV_INTERN
+char*
+dict_foreign_def_get(
+/*=================*/
+ dict_foreign_t* foreign,/*!< in: foreign */
+ trx_t* trx) /*!< in: trx */
+{
+ char* fk_def = (char *)mem_heap_alloc(foreign->heap, 4*1024);
+ const char* tbname;
+ char tablebuf[MAX_TABLE_NAME_LEN + 1] = "";
+ unsigned i;
+ char* bufend;
+
+ tbname = dict_remove_db_name(foreign->id);
+ bufend = innobase_convert_name(tablebuf, MAX_TABLE_NAME_LEN,
+ tbname, strlen(tbname), trx->mysql_thd);
+ tablebuf[bufend - tablebuf] = '\0';
+
+	sprintf(fk_def, "CONSTRAINT %s FOREIGN KEY (", tablebuf);
+
+	for (i = 0; i < foreign->n_fields; i++) {
+		char buf[MAX_TABLE_NAME_LEN + 1] = "";
+		innobase_convert_name(buf, MAX_TABLE_NAME_LEN,
+				foreign->foreign_col_names[i],
+				strlen(foreign->foreign_col_names[i]),
+				trx->mysql_thd);
+		strcat(fk_def, buf);
+		if (i < static_cast<unsigned>(foreign->n_fields - 1)) {
+			strcat(fk_def, ",");
+		}
+	}
+
+	strcat(fk_def, ") REFERENCES ");
+
+	bufend = innobase_convert_name(tablebuf, MAX_TABLE_NAME_LEN,
+			foreign->referenced_table_name,
+			strlen(foreign->referenced_table_name),
+			trx->mysql_thd);
+	tablebuf[bufend - tablebuf] = '\0';
+
+	strcat(fk_def, tablebuf);
+	strcat(fk_def, " (");
+
+	for (i = 0; i < foreign->n_fields; i++) {
+		char buf[MAX_TABLE_NAME_LEN + 1] = "";
+		bufend = innobase_convert_name(buf, MAX_TABLE_NAME_LEN,
+				foreign->referenced_col_names[i],
+				strlen(foreign->referenced_col_names[i]),
+				trx->mysql_thd);
+		buf[bufend - buf] = '\0';
+		strcat(fk_def, buf);
+		if (i < static_cast<unsigned>(foreign->n_fields - 1)) {
+			strcat(fk_def, ",");
+		}
+	}
+	strcat(fk_def, ")");
+
+ return fk_def;
+}
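+
+/* A sketch of the output: for a constraint with id 'db/t1_ibfk_1' on
+columns (a, b) referencing t2 (c, d), the rendered definition is
+"CONSTRAINT `t1_ibfk_1` FOREIGN KEY (`a`,`b`) REFERENCES `t2` (`c`,`d`)"
+(the exact quoting depends on innobase_convert_name() and the session).
+Note that the definition is assembled with strcat() into a fixed 4 KiB
+buffer allocated from foreign->heap; callers assume that it fits. */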
+
+/********************************************************************//**
+Convert foreign key column names from data dictionary to SQL-layer.
+*/
+static
+void
+dict_foreign_def_get_fields(
+/*========================*/
+ dict_foreign_t* foreign,/*!< in: foreign */
+ trx_t* trx, /*!< in: trx */
+ char** field, /*!< out: foreign column */
+ char** field2, /*!< out: referenced column */
+ ulint col_no) /*!< in: column number */
+{
+ char* bufend;
+ char* fieldbuf = (char *)mem_heap_alloc(foreign->heap, MAX_TABLE_NAME_LEN+1);
+ char* fieldbuf2 = (char *)mem_heap_alloc(foreign->heap, MAX_TABLE_NAME_LEN+1);
+
+ bufend = innobase_convert_name(fieldbuf, MAX_TABLE_NAME_LEN,
+ foreign->foreign_col_names[col_no],
+ strlen(foreign->foreign_col_names[col_no]),
+ trx->mysql_thd);
+
+ fieldbuf[bufend - fieldbuf] = '\0';
+
+ bufend = innobase_convert_name(fieldbuf2, MAX_TABLE_NAME_LEN,
+ foreign->referenced_col_names[col_no],
+ strlen(foreign->referenced_col_names[col_no]),
+ trx->mysql_thd);
+
+ fieldbuf2[bufend - fieldbuf2] = '\0';
+ *field = fieldbuf;
+ *field2 = fieldbuf2;
+}
+
+/********************************************************************//**
+Add a foreign key definition to the data dictionary tables.
+@return error code or DB_SUCCESS */
+dberr_t
+dict_create_add_foreign_to_dictionary(
+/*==================================*/
+ const char* name, /*!< in: table name */
+ const dict_foreign_t* foreign,/*!< in: foreign key */
+ trx_t* trx) /*!< in/out: dictionary transaction */
+{
+ dberr_t error;
+
+ DBUG_ENTER("dict_create_add_foreign_to_dictionary");
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", foreign->id);
+
+ pars_info_add_str_literal(info, "for_name", name);
+
+ pars_info_add_str_literal(info, "ref_name",
+ foreign->referenced_table_name);
+
+ pars_info_add_int4_literal(info, "n_cols",
+ ulint(foreign->n_fields)
+ | (ulint(foreign->type) << 24));
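+	/* The low 24 bits of N_COLS store the column count and the
+	high bits store the foreign->type flags. For example (assuming
+	the action flag values in dict0mem.h, where
+	DICT_FOREIGN_ON_DELETE_CASCADE is 1), a two-column constraint
+	with ON DELETE CASCADE encodes as 2 | (1 << 24) = 0x01000002. */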
+
+ DBUG_PRINT("dict_create_add_foreign_to_dictionary",
+ ("'%s', '%s', '%s', %d", foreign->id, name,
+ foreign->referenced_table_name,
+ foreign->n_fields + (foreign->type << 24)));
+
+ error = dict_foreign_eval_sql(info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "INSERT INTO SYS_FOREIGN VALUES"
+ "(:id, :for_name, :ref_name, :n_cols);\n"
+ "END;\n"
+ , name, foreign->id, trx);
+
+ if (error != DB_SUCCESS) {
+
+ if (error == DB_DUPLICATE_KEY) {
+ char buf[MAX_TABLE_NAME_LEN + 1] = "";
+ char tablename[MAX_TABLE_NAME_LEN + 1] = "";
+ char* fk_def;
+
+ innobase_convert_name(tablename, MAX_TABLE_NAME_LEN,
+ name, strlen(name), trx->mysql_thd);
+
+ innobase_convert_name(buf, MAX_TABLE_NAME_LEN,
+ foreign->id, strlen(foreign->id), trx->mysql_thd);
+
+ fk_def = dict_foreign_def_get((dict_foreign_t*)foreign, trx);
+
+			ib_push_warning(trx, error,
+				"Create or Alter table %s with foreign key constraint"
+				" failed. Foreign key constraint %s"
+				" already exists in the data dictionary."
+				" Foreign key constraint names need to be unique in the database."
+				" Error in foreign key definition: %s.",
+				tablename, buf, fk_def);
+ }
+
+ DBUG_RETURN(error);
+ }
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ error = dict_create_add_foreign_field_to_dictionary(
+ i, name, foreign, trx);
+
+ if (error != DB_SUCCESS) {
+ char buf[MAX_TABLE_NAME_LEN + 1] = "";
+ char tablename[MAX_TABLE_NAME_LEN + 1] = "";
+			char* field = NULL;
+			char* field2 = NULL;
+ char* fk_def;
+
+ innobase_convert_name(tablename, MAX_TABLE_NAME_LEN,
+ name, strlen(name), trx->mysql_thd);
+ innobase_convert_name(buf, MAX_TABLE_NAME_LEN,
+ foreign->id, strlen(foreign->id), trx->mysql_thd);
+ fk_def = dict_foreign_def_get((dict_foreign_t*)foreign, trx);
+ dict_foreign_def_get_fields((dict_foreign_t*)foreign, trx, &field, &field2, i);
+
+			ib_push_warning(trx, error,
+				"Create or Alter table %s with foreign key constraint"
+				" failed. Error adding foreign key constraint name %s"
+				" fields %s or %s to the dictionary."
+				" Error in foreign key definition: %s.",
+				tablename, buf, field, field2, fk_def);
+
+ DBUG_RETURN(error);
+ }
+ }
+
+ DBUG_RETURN(error);
+}
+
+/** Check if the given column name is a base column of any stored
+(generated) column in the table.
+@param[in]	col_name	column name to check
+@param[in]	table		table to which the foreign key constraint belongs
+@return true if the column is a base column of some stored column,
+false otherwise */
+static
+bool
+dict_foreign_base_for_stored(
+ const char* col_name,
+ const dict_table_t* table)
+{
+ /* Loop through each stored column and check if its base column has
+ the same name as the column name being checked */
+ dict_s_col_list::const_iterator it;
+ for (it = table->s_cols->begin();
+ it != table->s_cols->end(); ++it) {
+ dict_s_col_t s_col = *it;
+
+ for (ulint j = 0; j < s_col.num_base; j++) {
+ if (strcmp(col_name, dict_table_get_col_name(
+ table,
+ s_col.base_col[j]->ind)) == 0) {
+ return(true);
+ }
+ }
+ }
+
+ return(false);
+}
+
+/** Check if a foreign key constraint is on columns that serve as base
+columns of any stored column. This is to prevent creating a SET NULL
+or CASCADE constraint on such columns.
+@param[in]	local_fk_set	set of foreign key objects, to be added to
+the dictionary tables
+@param[in]	table	table to which the foreign key objects in
+local_fk_set belong
+@return true if any such constraint exists, false otherwise */
+bool
+dict_foreigns_has_s_base_col(
+ const dict_foreign_set& local_fk_set,
+ const dict_table_t* table)
+{
+ dict_foreign_t* foreign;
+
+ if (table->s_cols == NULL) {
+ return (false);
+ }
+
+ for (dict_foreign_set::const_iterator it = local_fk_set.begin();
+ it != local_fk_set.end(); ++it) {
+
+ foreign = *it;
+ ulint type = foreign->type;
+
+ type &= ~(DICT_FOREIGN_ON_DELETE_NO_ACTION
+ | DICT_FOREIGN_ON_UPDATE_NO_ACTION);
+
+ if (type == 0) {
+ continue;
+ }
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ /* Check if the constraint is on a column that
+ is a base column of any stored column */
+ if (dict_foreign_base_for_stored(
+ foreign->foreign_col_names[i], table)) {
+ return(true);
+ }
+ }
+ }
+
+ return(false);
+}
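+
+/* A sketch: if the table has a stored generated column g defined as
+(a + 1), then a foreign key with ON DELETE SET NULL or ON DELETE
+CASCADE on column a makes this function return true, so the caller can
+reject the constraint; constraints whose action flags are cleared by
+the mask above (plain NO ACTION, or the default RESTRICT with
+type == 0) are skipped and remain allowed. */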
+
+/** Adds the given set of foreign key objects to the dictionary tables
+in the database. This function does not modify the dictionary cache. The
+caller must ensure that all foreign key objects contain a valid constraint
+name in foreign->id.
+@param[in] local_fk_set set of foreign key objects, to be added to
+the dictionary tables
+@param[in]	table	table to which the foreign key objects in
+local_fk_set belong
+@param[in,out] trx transaction
+@return error code or DB_SUCCESS */
+dberr_t
+dict_create_add_foreigns_to_dictionary(
+/*===================================*/
+ const dict_foreign_set& local_fk_set,
+ const dict_table_t* table,
+ trx_t* trx)
+{
+ dict_foreign_t* foreign;
+ dberr_t error;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ if (NULL == dict_table_get_low("SYS_FOREIGN")) {
+
+ ib::error() << "Table SYS_FOREIGN not found"
+ " in internal data dictionary";
+
+ return(DB_ERROR);
+ }
+
+ error = DB_SUCCESS;
+
+ for (dict_foreign_set::const_iterator it = local_fk_set.begin();
+ it != local_fk_set.end();
+ ++it) {
+
+ foreign = *it;
+ ut_ad(foreign->id != NULL);
+
+ error = dict_create_add_foreign_to_dictionary(
+ table->name.m_name, foreign, trx);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+ }
+
+ return error;
+}
+
+/****************************************************************//**
+Creates the tablespaces and datafiles system tables inside InnoDB
+at server bootstrap or server start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_create_or_check_sys_tablespace(void)
+/*=====================================*/
+{
+ trx_t* trx;
+ my_bool srv_file_per_table_backup;
+ dberr_t err;
+ dberr_t sys_tablespaces_err;
+ dberr_t sys_datafiles_err;
+
+ ut_ad(!srv_any_background_activity());
+
+ /* Note: The master thread has not been started at this point. */
+
+ sys_tablespaces_err = dict_check_if_system_table_exists(
+ "SYS_TABLESPACES", DICT_NUM_FIELDS__SYS_TABLESPACES + 1, 1);
+ sys_datafiles_err = dict_check_if_system_table_exists(
+ "SYS_DATAFILES", DICT_NUM_FIELDS__SYS_DATAFILES + 1, 1);
+
+ if (sys_tablespaces_err == DB_SUCCESS
+ && sys_datafiles_err == DB_SUCCESS) {
+ srv_sys_tablespaces_open = true;
+ return(DB_SUCCESS);
+ }
+
+ if (srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) {
+ return(DB_READ_ONLY);
+ }
+
+ trx = trx_create();
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+	trx->op_info = "creating tablespace and datafile sys tables";
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Check which incomplete table definition to drop. */
+
+ if (sys_tablespaces_err == DB_CORRUPTION) {
+ row_drop_table_after_create_fail("SYS_TABLESPACES", trx);
+ }
+
+ if (sys_datafiles_err == DB_CORRUPTION) {
+ row_drop_table_after_create_fail("SYS_DATAFILES", trx);
+ }
+
+ ib::info() << "Creating tablespace and datafile system tables.";
+
+ /* We always want SYSTEM tables to be created inside the system
+ tablespace. */
+ srv_file_per_table_backup = srv_file_per_table;
+ srv_file_per_table = 0;
+
+ err = que_eval_sql(
+ NULL,
+ "PROCEDURE CREATE_SYS_TABLESPACE_PROC () IS\n"
+ "BEGIN\n"
+ "CREATE TABLE SYS_TABLESPACES(\n"
+ " SPACE INT, NAME CHAR, FLAGS INT);\n"
+ "CREATE UNIQUE CLUSTERED INDEX SYS_TABLESPACES_SPACE"
+ " ON SYS_TABLESPACES (SPACE);\n"
+ "CREATE TABLE SYS_DATAFILES(\n"
+ " SPACE INT, PATH CHAR);\n"
+ "CREATE UNIQUE CLUSTERED INDEX SYS_DATAFILES_SPACE"
+ " ON SYS_DATAFILES (SPACE);\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ ib::error() << "Creation of SYS_TABLESPACES and SYS_DATAFILES"
+ " has failed with error " << err
+ << ". Dropping incompletely created tables.";
+
+ ut_a(err == DB_OUT_OF_FILE_SPACE
+ || err == DB_DUPLICATE_KEY
+ || err == DB_TOO_MANY_CONCURRENT_TRXS);
+
+ row_drop_table_after_create_fail("SYS_TABLESPACES", trx);
+ row_drop_table_after_create_fail("SYS_DATAFILES", trx);
+
+ if (err == DB_OUT_OF_FILE_SPACE) {
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+ }
+ }
+
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->free();
+
+ srv_file_per_table = srv_file_per_table_backup;
+
+ if (err == DB_SUCCESS) {
+ srv_sys_tablespaces_open = true;
+ }
+
+ /* Note: The master thread has not been started at this point. */
+ /* Confirm and move to the non-LRU part of the table LRU list. */
+
+ sys_tablespaces_err = dict_check_if_system_table_exists(
+ "SYS_TABLESPACES", DICT_NUM_FIELDS__SYS_TABLESPACES + 1, 1);
+ ut_a(sys_tablespaces_err == DB_SUCCESS || err != DB_SUCCESS);
+
+ sys_datafiles_err = dict_check_if_system_table_exists(
+ "SYS_DATAFILES", DICT_NUM_FIELDS__SYS_DATAFILES + 1, 1);
+ ut_a(sys_datafiles_err == DB_SUCCESS || err != DB_SUCCESS);
+
+ return(err);
+}
+
+/** Put a tablespace definition into the data dictionary,
+replacing what was there previously.
+@param[in]	space_id	Tablespace id
+@param[in] name Tablespace name
+@param[in] flags Tablespace flags
+@param[in] path Tablespace path
+@param[in] trx Transaction
+@return error code or DB_SUCCESS */
+dberr_t
+dict_replace_tablespace_in_dictionary(
+ ulint space_id,
+ const char* name,
+ ulint flags,
+ const char* path,
+ trx_t* trx)
+{
+ if (!srv_sys_tablespaces_open) {
+ /* Startup procedure is not yet ready for updates. */
+ return(DB_SUCCESS);
+ }
+
+ dberr_t error;
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_int4_literal(info, "space", space_id);
+
+ pars_info_add_str_literal(info, "name", name);
+
+ pars_info_add_int4_literal(info, "flags", flags);
+
+ pars_info_add_str_literal(info, "path", path);
+
+ error = que_eval_sql(info,
+ "PROCEDURE P () IS\n"
+ "p CHAR;\n"
+
+ "DECLARE CURSOR c IS\n"
+ " SELECT PATH FROM SYS_DATAFILES\n"
+ " WHERE SPACE=:space FOR UPDATE;\n"
+
+ "BEGIN\n"
+ "OPEN c;\n"
+ "FETCH c INTO p;\n"
+
+		     "IF (SQL % NOTFOUND) THEN\n"
+ " DELETE FROM SYS_TABLESPACES "
+ "WHERE SPACE=:space;\n"
+ " INSERT INTO SYS_TABLESPACES VALUES"
+ "(:space, :name, :flags);\n"
+ " INSERT INTO SYS_DATAFILES VALUES"
+ "(:space, :path);\n"
+ "ELSIF p <> :path THEN\n"
+ " UPDATE SYS_DATAFILES SET PATH=:path"
+ " WHERE CURRENT OF c;\n"
+ "END IF;\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ return(error);
+ }
+
+ trx->op_info = "";
+
+ return(error);
+}
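+
+/* The procedure above implements an upsert: if SYS_DATAFILES has no
+row for :space, any stale SYS_TABLESPACES row is deleted and fresh
+rows are inserted into both tables; if the row exists but its PATH
+differs, only SYS_DATAFILES.PATH is updated in place. */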
diff --git a/storage/innobase/dict/dict0defrag_bg.cc b/storage/innobase/dict/dict0defrag_bg.cc
new file mode 100644
index 00000000..0d9cb185
--- /dev/null
+++ b/storage/innobase/dict/dict0defrag_bg.cc
@@ -0,0 +1,327 @@
+/*****************************************************************************
+
+Copyright (c) 2016, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0defrag_bg.cc
+Defragmentation routines.
+
+Created 25/08/2016 Jan Lindström
+*******************************************************/
+
+#include "dict0dict.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "dict0defrag_bg.h"
+#include "btr0btr.h"
+#include "srv0start.h"
+
+static ib_mutex_t defrag_pool_mutex;
+
+#ifdef MYSQL_PFS
+static mysql_pfs_key_t defrag_pool_mutex_key;
+#endif
+
+/** Iterator type for iterating over the elements of objects of type
+defrag_pool_t. */
+typedef defrag_pool_t::iterator defrag_pool_iterator_t;
+
+/** Pool where we store information on which tables are to be processed
+by background defragmentation. */
+defrag_pool_t defrag_pool;
+
+
+/*****************************************************************//**
+Initialize the defrag pool, called once during thread initialization. */
+void
+dict_defrag_pool_init(void)
+/*=======================*/
+{
+ ut_ad(!srv_read_only_mode);
+
+ /* We choose SYNC_STATS_DEFRAG to be below SYNC_FSP_PAGE. */
+ mutex_create(LATCH_ID_DEFRAGMENT_MUTEX, &defrag_pool_mutex);
+}
+
+/*****************************************************************//**
+Free the resources occupied by the defrag pool, called once during
+thread de-initialization. */
+void
+dict_defrag_pool_deinit(void)
+/*=========================*/
+{
+ ut_ad(!srv_read_only_mode);
+
+ mutex_free(&defrag_pool_mutex);
+}
+
+/*****************************************************************//**
+Get an index from the auto defrag pool. The returned entry is removed
+from the pool.
+@return true if the pool was non-empty and "table_id" and "index_id"
+were set, false otherwise */
+static
+bool
+dict_stats_defrag_pool_get(
+/*=======================*/
+ table_id_t* table_id, /*!< out: table id, or unmodified if
+ list is empty */
+ index_id_t* index_id) /*!< out: index id, or unmodified if
+ list is empty */
+{
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&defrag_pool_mutex);
+
+ if (defrag_pool.empty()) {
+ mutex_exit(&defrag_pool_mutex);
+ return(false);
+ }
+
+ defrag_pool_item_t& item = defrag_pool.back();
+ *table_id = item.table_id;
+ *index_id = item.index_id;
+
+ defrag_pool.pop_back();
+
+ mutex_exit(&defrag_pool_mutex);
+
+ return(true);
+}
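+
+/* Note: entries are popped from the back of the vector, so the most
+recently enqueued index is processed first (LIFO order). */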
+
+/*****************************************************************//**
+Add an index to the defrag pool, which is processed by the
+background stats gathering thread. Only the table id and index id are
+added to the list, so the table can be closed after being enqueued and
+will be reopened when needed. If the table or index no longer exists
+by then (has been dropped), it is removed from the pool and skipped. */
+void
+dict_stats_defrag_pool_add(
+/*=======================*/
+	const dict_index_t*	index)	/*!< in: index to add */
+{
+ defrag_pool_item_t item;
+
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&defrag_pool_mutex);
+
+ /* quit if already in the list */
+ for (defrag_pool_iterator_t iter = defrag_pool.begin();
+ iter != defrag_pool.end();
+ ++iter) {
+ if ((*iter).table_id == index->table->id
+ && (*iter).index_id == index->id) {
+ mutex_exit(&defrag_pool_mutex);
+ return;
+ }
+ }
+
+ item.table_id = index->table->id;
+ item.index_id = index->id;
+ defrag_pool.push_back(item);
+ if (defrag_pool.size() == 1) {
+ /* Kick off dict stats optimizer work */
+ dict_stats_schedule_now();
+ }
+ mutex_exit(&defrag_pool_mutex);
+}
+
+/*****************************************************************//**
+Delete a given index from the auto defrag pool. */
+void
+dict_stats_defrag_pool_del(
+/*=======================*/
+	const dict_table_t*	table, /*!< in: if given, remove
+ all entries for the table */
+ const dict_index_t* index) /*!< in: if given, remove this index */
+{
+ ut_a((table && !index) || (!table && index));
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ mutex_enter(&defrag_pool_mutex);
+
+ defrag_pool_iterator_t iter = defrag_pool.begin();
+ while (iter != defrag_pool.end()) {
+ if ((table && (*iter).table_id == table->id)
+ || (index
+ && (*iter).table_id == index->table->id
+ && (*iter).index_id == index->id)) {
+ /* erase() invalidates the iterator */
+ iter = defrag_pool.erase(iter);
+ if (index)
+ break;
+ } else {
+ iter++;
+ }
+ }
+
+ mutex_exit(&defrag_pool_mutex);
+}
+
+/*****************************************************************//**
+Pop one index from the auto defrag pool and, if the table and index
+are still cached and usable, save its defragmentation stats. */
+static
+void
+dict_stats_process_entry_from_defrag_pool()
+{
+ table_id_t table_id;
+ index_id_t index_id;
+
+ ut_ad(!srv_read_only_mode);
+
+ /* pop the first index from the auto defrag pool */
+ if (!dict_stats_defrag_pool_get(&table_id, &index_id)) {
+ /* no index in defrag pool */
+ return;
+ }
+
+ dict_table_t* table;
+
+ mutex_enter(&dict_sys.mutex);
+
+	/* If the table is no longer cached, we have already lost the
+	in-memory stats, so there is nothing to write to disk. */
+ table = dict_table_open_on_id(table_id, TRUE,
+ DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
+
+ dict_index_t* index = table && !table->corrupted
+ ? dict_table_find_index_on_id(table, index_id)
+ : NULL;
+
+ if (!index || index->is_corrupted()) {
+ if (table) {
+ dict_table_close(table, TRUE, FALSE);
+ }
+ mutex_exit(&dict_sys.mutex);
+ return;
+ }
+
+ mutex_exit(&dict_sys.mutex);
+ dict_stats_save_defrag_stats(index);
+ dict_table_close(table, FALSE, FALSE);
+}
+
+/*****************************************************************//**
+Process all entries in the defrag pool: pop each index in turn and
+save its defragmentation stats. */
+void
+dict_defrag_process_entries_from_defrag_pool()
+/*==========================================*/
+{
+ while (defrag_pool.size()) {
+ dict_stats_process_entry_from_defrag_pool();
+ }
+}
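+
+/* This loop is expected to run in the background statistics thread
+(see dict0stats_bg.cc). Each iteration releases dict_sys.mutex before
+saving stats, so the dictionary is not locked across disk writes. */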
+
+/*********************************************************************//**
+Save defragmentation result.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_save_defrag_summary(
+/*============================*/
+ dict_index_t* index) /*!< in: index */
+{
+	dberr_t ret = DB_SUCCESS;
+
+ if (dict_index_is_ibuf(index)) {
+ return DB_SUCCESS;
+ }
+
+ dict_sys_lock();
+
+ ret = dict_stats_save_index_stat(index, time(NULL), "n_pages_freed",
+ index->stat_defrag_n_pages_freed,
+ NULL,
+ "Number of pages freed during"
+ " last defragmentation run.",
+ NULL);
+
+ dict_sys_unlock();
+
+ return (ret);
+}
+
+/*********************************************************************//**
+Save defragmentation stats for a given index.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_save_defrag_stats(
+/*============================*/
+ dict_index_t* index) /*!< in: index */
+{
+ dberr_t ret;
+
+ if (dict_index_is_ibuf(index)) {
+ return DB_SUCCESS;
+ }
+
+ if (!index->is_readable()) {
+ return dict_stats_report_error(index->table, true);
+ }
+
+ const time_t now = time(NULL);
+ mtr_t mtr;
+ ulint n_leaf_pages;
+ ulint n_leaf_reserved;
+ mtr.start();
+ mtr_s_lock_index(index, &mtr);
+ n_leaf_reserved = btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES,
+ &n_leaf_pages, &mtr);
+ mtr.commit();
+
+ if (n_leaf_reserved == ULINT_UNDEFINED) {
+ // The index name is different during fast index creation,
+ // so the stats won't be associated with the right index
+ // for later use. We just return without saving.
+ return DB_SUCCESS;
+ }
+
+ dict_sys_lock();
+ ret = dict_stats_save_index_stat(index, now, "n_page_split",
+ index->stat_defrag_n_page_split,
+ NULL,
+ "Number of new page splits on leaves"
+ " since last defragmentation.",
+ NULL);
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+
+ ret = dict_stats_save_index_stat(
+ index, now, "n_leaf_pages_defrag",
+ n_leaf_pages,
+ NULL,
+ "Number of leaf pages when this stat is saved to disk",
+ NULL);
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+
+ ret = dict_stats_save_index_stat(
+ index, now, "n_leaf_pages_reserved",
+ n_leaf_reserved,
+ NULL,
+		"Number of pages reserved for this index's leaves when this stat "
+		"is saved to disk",
+ NULL);
+
+end:
+ dict_sys_unlock();
+ return ret;
+}
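+
+/* Together with dict_stats_save_defrag_summary(), this persists four
+per-index defragmentation stats: n_pages_freed, n_page_split,
+n_leaf_pages_defrag and n_leaf_pages_reserved, all written through
+dict_stats_save_index_stat() (presumably into the persistent
+statistics tables). */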
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
new file mode 100644
index 00000000..7d80fc7e
--- /dev/null
+++ b/storage/innobase/dict/dict0dict.cc
@@ -0,0 +1,5277 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file dict/dict0dict.cc
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include <my_config.h>
+#include <string>
+
+#include "ha_prototypes.h"
+#include <mysqld.h>
+#include <strfunc.h>
+
+#include "dict0dict.h"
+#include "fts0fts.h"
+#include "fil0fil.h"
+#include <algorithm>
+#include "sql_class.h"
+#include "sql_table.h"
+#include <mysql/service_thd_mdl.h>
+
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "buf0buf.h"
+#include "data0type.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "dict0mem.h"
+#include "dict0priv.h"
+#include "dict0stats.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "lock0lock.h"
+#include "mach0data.h"
+#include "mem0mem.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "pars0pars.h"
+#include "pars0sym.h"
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "row0log.h"
+#include "row0merge.h"
+#include "row0mysql.h"
+#include "row0upd.h"
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "sync0sync.h"
+#include "trx0undo.h"
+
+#include <vector>
+#include <algorithm>
+
+/** the dictionary system */
+dict_sys_t dict_sys;
+
+/** Percentage of compression failures that are allowed in a single
+round */
+ulong zip_failure_threshold_pct = 5;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+ulong zip_pad_max = 50;
+
+#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
+ creating a table or index object */
+#define DICT_POOL_PER_TABLE_HASH 512 /*!< buffer pool max size per table
+ hash table fixed size in bytes */
+#define DICT_POOL_PER_VARYING 4 /*!< buffer pool max size per data
+ dictionary varying size in bytes */
+
+/** Identifies generated InnoDB foreign key names */
+static char dict_ibfk[] = "_ibfk_";
+
+bool innodb_table_stats_not_found = false;
+bool innodb_index_stats_not_found = false;
+static bool innodb_table_stats_not_found_reported = false;
+static bool innodb_index_stats_not_found_reported = false;
+
+/*******************************************************************//**
+Tries to find column names for the index and sets the col field of the
+index.
+@param[in] index index
+@param[in] add_v new virtual columns added along with an add index call
+@return whether the column names were found */
+static
+bool
+dict_index_find_cols(
+ dict_index_t* index,
+ const dict_add_v_col_t* add_v);
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the clustered index */
+static
+dict_index_t*
+dict_index_build_internal_clust(
+/*============================*/
+ dict_index_t* index); /*!< in: user representation of
+ a clustered index */
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a non-clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the non-clustered index */
+static
+dict_index_t*
+dict_index_build_internal_non_clust(
+/*================================*/
+ dict_index_t* index); /*!< in: user representation of
+ a non-clustered index */
+/**********************************************************************//**
+Builds the internal dictionary cache representation for an FTS index.
+@return own: the internal representation of the FTS index */
+static
+dict_index_t*
+dict_index_build_internal_fts(
+/*==========================*/
+ dict_index_t* index); /*!< in: user representation of an FTS index */
+
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+static
+void
+dict_index_remove_from_cache_low(
+/*=============================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index, /*!< in, own: index */
+	ibool		lru_evict);	/*!< in: TRUE if the table is being
+					evicted to make room in the table
+					LRU list */
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate the dictionary table LRU list.
+@return TRUE if validate OK */
+static
+ibool
+dict_lru_validate(void);
+/*===================*/
+#endif /* UNIV_DEBUG */
+
+/* Stream for storing detailed information about the latest foreign key
+and unique key errors. Only created if !srv_read_only_mode */
+FILE* dict_foreign_err_file = NULL;
+/* mutex protecting the foreign and unique error buffers */
+ib_mutex_t dict_foreign_err_mutex;
+
+/********************************************************************//**
+Checks if the database name in two table names is the same.
+@return TRUE if same db name */
+ibool
+dict_tables_have_same_db(
+/*=====================*/
+ const char* name1, /*!< in: table name in the form
+ dbname '/' tablename */
+ const char* name2) /*!< in: table name in the form
+ dbname '/' tablename */
+{
+ for (; *name1 == *name2; name1++, name2++) {
+ if (*name1 == '/') {
+ return(TRUE);
+ }
+ ut_a(*name1); /* the names must contain '/' */
+ }
+ return(FALSE);
+}
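+
+/* For example: dict_tables_have_same_db("db1/t1", "db1/t2") returns
+TRUE, while dict_tables_have_same_db("db1/t1", "db2/t1") returns
+FALSE. */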
+
+/********************************************************************//**
+Return the end of table name where we have removed dbname and '/'.
+@return table name */
+const char*
+dict_remove_db_name(
+/*================*/
+ const char* name) /*!< in: table name in the form
+ dbname '/' tablename */
+{
+ const char* s = strchr(name, '/');
+ ut_a(s);
+
+ return(s + 1);
+}
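+
+/* For example: dict_remove_db_name("db1/t1") returns "t1". */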
+
+/** Open a persistent table.
+@param[in] table_id persistent table identifier
+@param[in] ignore_err errors to ignore
+@param[in] cached_only whether to skip loading
+@return persistent table
+@retval NULL if not found */
+static dict_table_t* dict_table_open_on_id_low(
+ table_id_t table_id,
+ dict_err_ignore_t ignore_err,
+ bool cached_only)
+{
+ dict_table_t* table = dict_sys.get_table(table_id);
+
+ if (!table && !cached_only) {
+ table = dict_load_table_on_id(table_id, ignore_err);
+ }
+
+ return table;
+}
+
+/**********************************************************************//**
+Try to drop any indexes after an aborted index creation.
+This can also be after a server kill during DROP INDEX. */
+static
+void
+dict_table_try_drop_aborted(
+/*========================*/
+ dict_table_t* table, /*!< in: table, or NULL if it
+ needs to be looked up again */
+ table_id_t table_id, /*!< in: table identifier */
+ uint32_t ref_count) /*!< in: expected table->n_ref_count */
+{
+ trx_t* trx;
+
+ trx = trx_create();
+ trx->op_info = "try to drop any indexes after an aborted index creation";
+ row_mysql_lock_data_dictionary(trx);
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+ if (table == NULL) {
+ table = dict_table_open_on_id_low(
+ table_id, DICT_ERR_IGNORE_FK_NOKEY, FALSE);
+ } else {
+ ut_ad(table->id == table_id);
+ }
+
+ if (table && table->get_ref_count() == ref_count && table->drop_aborted
+ && !UT_LIST_GET_FIRST(table->locks)) {
+ /* Silence a debug assertion in row_merge_drop_indexes(). */
+ ut_d(table->acquire());
+ row_merge_drop_indexes(trx, table, true);
+ ut_d(table->release());
+ ut_ad(table->get_ref_count() == ref_count);
+ trx_commit_for_mysql(trx);
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+ trx->free();
+}
+
+/**********************************************************************//**
+When opening a table,
+try to drop any indexes after an aborted index creation.
+Release the dict_sys.mutex. */
+static
+void
+dict_table_try_drop_aborted_and_mutex_exit(
+/*=======================================*/
+ dict_table_t* table, /*!< in: table (may be NULL) */
+	ibool		try_drop)	/*!< in: TRUE=try to drop
+					indexes whose online creation
+					was aborted */
+{
+ if (try_drop
+ && table != NULL
+ && table->drop_aborted
+ && table->get_ref_count() == 1
+ && dict_table_get_first_index(table)) {
+
+ /* Attempt to drop the indexes whose online creation
+ was aborted. */
+ table_id_t table_id = table->id;
+
+ mutex_exit(&dict_sys.mutex);
+
+ dict_table_try_drop_aborted(table, table_id, 1);
+ } else {
+ mutex_exit(&dict_sys.mutex);
+ }
+}
+
+/** Decrements the count of open handles of a table.
+@param[in,out] table table
+@param[in] dict_locked data dictionary locked
+@param[in] try_drop try to drop any orphan indexes after
+ an aborted online index creation
+@param[in] thd thread to release MDL
+@param[in] mdl metadata lock or NULL if the thread
+ is a foreground one. */
+void
+dict_table_close(
+ dict_table_t* table,
+ bool dict_locked,
+ bool try_drop,
+ THD* thd,
+ MDL_ticket* mdl)
+{
+ if (!dict_locked) {
+ mutex_enter(&dict_sys.mutex);
+ }
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+ ut_a(table->get_ref_count() > 0);
+
+ const bool last_handle = table->release();
+
+ /* Force persistent stats re-read upon next open of the table
+ so that FLUSH TABLE can be used to forcibly fetch stats from disk
+ if they have been manually modified. We reset table->stat_initialized
+ only if table reference count is 0 because we do not want too frequent
+ stats re-reads (e.g. in other cases than FLUSH TABLE). */
+ if (last_handle && strchr(table->name.m_name, '/') != NULL
+ && dict_stats_is_persistent_enabled(table)) {
+
+ dict_stats_deinit(table);
+ }
+
+ MONITOR_DEC(MONITOR_TABLE_REFERENCE);
+
+ ut_ad(dict_lru_validate());
+ ut_ad(dict_sys.find(table));
+
+ if (!dict_locked) {
+ table_id_t table_id = table->id;
+ const bool drop_aborted = last_handle && try_drop
+ && table->drop_aborted
+ && dict_table_get_first_index(table);
+
+ mutex_exit(&dict_sys.mutex);
+
+ /* dict_table_try_drop_aborted() can generate undo logs.
+ So it should be avoided after shutdown of background
+ threads */
+		if (drop_aborted && srv_undo_sources) {
+ dict_table_try_drop_aborted(NULL, table_id, 0);
+ }
+ }
+
+ if (!thd || !mdl) {
+ } else if (MDL_context *mdl_context= static_cast<MDL_context*>(
+ thd_mdl_context(thd))) {
+ mdl_context->release_lock(mdl);
+ }
+}
+
+/********************************************************************//**
+Closes the only open handle to a table and drops a table while assuring
+that dict_sys.mutex is held the whole time. This assures that the table
+is not evicted after the close when the count of open handles goes to zero.
+Because dict_sys.mutex is held, we do not need to call
+dict_table_prevent_eviction(). */
+void
+dict_table_close_and_drop(
+/*======================*/
+ trx_t* trx, /*!< in: data dictionary transaction */
+ dict_table_t* table) /*!< in/out: table */
+{
+ dberr_t err = DB_SUCCESS;
+
+ ut_d(dict_sys.assert_locked());
+ ut_ad(trx->dict_operation != TRX_DICT_OP_NONE);
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+
+ dict_table_close(table, true, false);
+
+#if defined UNIV_DEBUG || defined UNIV_DDL_DEBUG
+ /* Nobody should have initialized the stats of the newly created
+ table when this is called. So we know that it has not been added
+ for background stats gathering. */
+ ut_a(!table->stat_initialized);
+#endif /* UNIV_DEBUG || UNIV_DDL_DEBUG */
+
+ err = row_merge_drop_table(trx, table);
+
+ if (err != DB_SUCCESS) {
+ ib::error() << "At " << __FILE__ << ":" << __LINE__
+ << " row_merge_drop_table returned error: " << err
+ << " table: " << table->name;
+ }
+}
+
+/** Check if the table has a given (non-virtual) column.
+@param[in] table table object
+@param[in] col_name column name
+@param[in] col_nr column number guessed, 0 as default
+@return column number if the table has the specified column,
+otherwise table->n_def */
+ulint
+dict_table_has_column(
+ const dict_table_t* table,
+ const char* col_name,
+ ulint col_nr)
+{
+ ulint col_max = table->n_def;
+
+ ut_ad(table);
+ ut_ad(col_name);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ if (col_nr < col_max
+ && innobase_strcasecmp(
+ col_name, dict_table_get_col_name(table, col_nr)) == 0) {
+ return(col_nr);
+ }
+
+	/* The column order may have changed; check the other columns. */
+ for (ulint i = 0; i < col_max; i++) {
+ if (i != col_nr
+ && innobase_strcasecmp(
+ col_name, dict_table_get_col_name(table, i)) == 0) {
+
+ return(i);
+ }
+ }
+
+ return(col_max);
+}
+
+/** Retrieve the column name.
+@param[in] table the table of this column */
+const char* dict_col_t::name(const dict_table_t& table) const
+{
+ ut_ad(table.magic_n == DICT_TABLE_MAGIC_N);
+
+ size_t col_nr;
+ const char *s;
+
+ if (is_virtual()) {
+ col_nr = size_t(reinterpret_cast<const dict_v_col_t*>(this)
+ - table.v_cols);
+ ut_ad(col_nr < table.n_v_def);
+ s = table.v_col_names;
+ } else {
+ col_nr = size_t(this - table.cols);
+ ut_ad(col_nr < table.n_def);
+ s = table.col_names;
+ }
+
+ if (s) {
+ for (size_t i = 0; i < col_nr; i++) {
+ s += strlen(s) + 1;
+ }
+ }
+
+ return(s);
+}
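+
+/* The column names are stored as a single buffer of NUL-terminated
+strings concatenated in column order, so the lookup above simply skips
+col_nr strings; e.g. for the buffer "a\0b\0c\0", column 2 resolves to
+"c". */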
+
+/** Returns a virtual column's name.
+@param[in] table target table
+@param[in] col_nr virtual column number (nth virtual column)
+@return column name or NULL if column number out of range. */
+const char*
+dict_table_get_v_col_name(
+ const dict_table_t* table,
+ ulint col_nr)
+{
+ const char* s;
+
+ ut_ad(table);
+ ut_ad(col_nr < table->n_v_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ if (col_nr >= table->n_v_def) {
+ return(NULL);
+ }
+
+ s = table->v_col_names;
+
+ if (s != NULL) {
+ for (ulint i = 0; i < col_nr; i++) {
+ s += strlen(s) + 1;
+ }
+ }
+
+ return(s);
+}
+
+/** Search for a virtual column's position in InnoDB according to its
+position in the original MySQL table
+@param[in]	table	target table
+@param[in]	col_nr	column number (nth column in the MySQL table)
+@return virtual column's position in InnoDB, ULINT_UNDEFINED if not found */
+static
+ulint
+dict_table_get_v_col_pos_for_mysql(
+ const dict_table_t* table,
+ ulint col_nr)
+{
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(col_nr < static_cast<ulint>(table->n_t_def));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ for (i = 0; i < table->n_v_def; i++) {
+ if (col_nr == dict_get_v_col_mysql_pos(
+ table->v_cols[i].m_col.ind)) {
+ break;
+ }
+ }
+
+ if (i == table->n_v_def) {
+ return(ULINT_UNDEFINED);
+ }
+
+ return(i);
+}
+
+/** Returns a virtual column's name according to its original
+MySQL table position.
+@param[in] table target table
+@param[in] col_nr column number (nth column in the table)
+@return column name. */
+static
+const char*
+dict_table_get_v_col_name_mysql(
+ const dict_table_t* table,
+ ulint col_nr)
+{
+ ulint i = dict_table_get_v_col_pos_for_mysql(table, col_nr);
+
+ if (i == ULINT_UNDEFINED) {
+ return(NULL);
+ }
+
+ return(dict_table_get_v_col_name(table, i));
+}
+
+/** Get nth virtual column according to its original MySQL table position
+@param[in] table target table
+@param[in] col_nr column number in MySQL Table definition
+@return dict_v_col_t ptr */
+dict_v_col_t*
+dict_table_get_nth_v_col_mysql(
+ const dict_table_t* table,
+ ulint col_nr)
+{
+ ulint i = dict_table_get_v_col_pos_for_mysql(table, col_nr);
+
+ if (i == ULINT_UNDEFINED) {
+ return(NULL);
+ }
+
+ return(dict_table_get_nth_v_col(table, i));
+}
+
+
+/** Get all the FTS indexes on a table.
+@param[in] table table
+@param[out] indexes all FTS indexes on this table
+@return number of FTS indexes */
+ulint
+dict_table_get_all_fts_indexes(
+ const dict_table_t* table,
+ ib_vector_t* indexes)
+{
+ dict_index_t* index;
+
+ ut_a(ib_vector_size(indexes) == 0);
+
+ for (index = dict_table_get_first_index(table);
+ index;
+ index = dict_table_get_next_index(index)) {
+
+ if (index->type == DICT_FTS) {
+ ib_vector_push(indexes, &index);
+ }
+ }
+
+ return(ib_vector_size(indexes));
+}
+
+/** Looks for column n in an index.
+@param[in] index index
+@param[in] n column number
+@param[in] inc_prefix true=consider column prefixes too
+@param[in] is_virtual true==virtual column
+@param[out] prefix_col_pos col num if prefix
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+ulint
+dict_index_get_nth_col_or_prefix_pos(
+ const dict_index_t* index,
+ ulint n,
+ bool inc_prefix,
+ bool is_virtual,
+ ulint* prefix_col_pos)
+{
+ const dict_field_t* field;
+ const dict_col_t* col;
+ ulint pos;
+ ulint n_fields;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ if (prefix_col_pos) {
+ *prefix_col_pos = ULINT_UNDEFINED;
+ }
+
+ if (is_virtual) {
+ col = &(dict_table_get_nth_v_col(index->table, n)->m_col);
+ } else {
+ col = dict_table_get_nth_col(index->table, n);
+ }
+
+ if (dict_index_is_clust(index)) {
+
+ return(dict_col_get_clust_pos(col, index));
+ }
+
+ n_fields = dict_index_get_n_fields(index);
+
+ for (pos = 0; pos < n_fields; pos++) {
+ field = dict_index_get_nth_field(index, pos);
+
+ if (col == field->col) {
+ if (prefix_col_pos) {
+ *prefix_col_pos = pos;
+ }
+ if (inc_prefix || field->prefix_len == 0) {
+ return(pos);
+ }
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Check if the index contains a column or a prefix of that column.
+@param[in] n column number
+@param[in] is_virtual whether it is a virtual col
+@return whether the index contains the column or its prefix */
+bool dict_index_t::contains_col_or_prefix(ulint n, bool is_virtual) const
+{
+ ut_ad(magic_n == DICT_INDEX_MAGIC_N);
+
+ if (is_primary()) {
+ return(!is_virtual);
+ }
+
+ const dict_col_t* col = is_virtual
+ ? &dict_table_get_nth_v_col(table, n)->m_col
+ : dict_table_get_nth_col(table, n);
+
+ for (ulint pos = 0; pos < n_fields; pos++) {
+ if (col == fields[pos].col) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/********************************************************************//**
+Looks for a matching field in an index. The column has to be the same. The
+column in index must be complete, or must contain a prefix longer than the
+column in index2. That is, we must be able to construct the prefix in index2
+from the prefix in index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+ulint
+dict_index_get_nth_field_pos(
+/*=========================*/
+ const dict_index_t* index, /*!< in: index from which to search */
+ const dict_index_t* index2, /*!< in: index */
+ ulint n) /*!< in: field number in index2 */
+{
+ const dict_field_t* field;
+ const dict_field_t* field2;
+ ulint n_fields;
+ ulint pos;
+
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ field2 = dict_index_get_nth_field(index2, n);
+
+ n_fields = dict_index_get_n_fields(index);
+
+	/* Are we looking for the MBR (Minimum Bounding Rectangle)
+	field of a spatial index? */
+ bool is_mbr_fld = (n == 0 && dict_index_is_spatial(index2));
+
+ for (pos = 0; pos < n_fields; pos++) {
+ field = dict_index_get_nth_field(index, pos);
+
+		/* The first field of a spatial index is an MBR (Minimum
+		Bounding Rectangle) field derived from the original
+		column, so its field->col still points to the original
+		clustered index column, but the actual content differs.
+		Hence we cannot consider the fields equal when only one
+		of them is an MBR field. */
+ if (pos == 0 && dict_index_is_spatial(index) && !is_mbr_fld) {
+ continue;
+ }
+
+ if (field->col == field2->col
+ && (field->prefix_len == 0
+ || (field->prefix_len >= field2->prefix_len
+ && field2->prefix_len != 0))) {
+
+ return(pos);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Parse the table file name into table name and database name.
+@tparam dict_locked whether dict_sys.mutex is being held
+@param[in,out] db_name database name buffer
+@param[in,out] tbl_name table name buffer
+@param[out] db_name_len database name length
+@param[out] tbl_name_len table name length
+@return whether the table name is visible to SQL */
+template<bool dict_locked>
+bool dict_table_t::parse_name(char (&db_name)[NAME_LEN + 1],
+ char (&tbl_name)[NAME_LEN + 1],
+ size_t *db_name_len, size_t *tbl_name_len) const
+{
+ char db_buf[MAX_DATABASE_NAME_LEN + 1];
+ char tbl_buf[MAX_TABLE_NAME_LEN + 1];
+
+ if (!dict_locked)
+ mutex_enter(&dict_sys.mutex); /* protect against renaming */
+ else
+ ut_ad(mutex_own(&dict_sys.mutex));
+ const size_t db_len= name.dblen();
+ ut_ad(db_len <= MAX_DATABASE_NAME_LEN);
+
+ memcpy(db_buf, name.m_name, db_len);
+ db_buf[db_len]= 0;
+
+ size_t tbl_len= strlen(name.m_name + db_len + 1);
+
+  const bool is_temp= tbl_len > TEMP_FILE_PREFIX_LENGTH &&
+    !strncmp(name.m_name + db_len + 1, TEMP_FILE_PREFIX,
+             TEMP_FILE_PREFIX_LENGTH);
+
+ if (is_temp);
+ else if (const char *is_part= static_cast<const char*>
+ (memchr(name.m_name + db_len + 1, '#', tbl_len)))
+ tbl_len= static_cast<size_t>(is_part - &name.m_name[db_len + 1]);
+
+ memcpy(tbl_buf, name.m_name + db_len + 1, tbl_len);
+ tbl_buf[tbl_len]= 0;
+
+ if (!dict_locked)
+ mutex_exit(&dict_sys.mutex);
+
+ *db_name_len= filename_to_tablename(db_buf, db_name,
+ MAX_DATABASE_NAME_LEN + 1, true);
+
+ if (is_temp)
+ return false;
+
+ *tbl_name_len= filename_to_tablename(tbl_buf, tbl_name,
+ MAX_TABLE_NAME_LEN + 1, true);
+ return true;
+}
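+
+/* A sketch of the parsing: "test/t1#P#p0" yields db_name "test" and
+tbl_name "t1" (everything from the first '#' in the table part is
+treated as a partition suffix and stripped), while a table part that
+carries the #sql temporary prefix makes the function return false. */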
+
+template bool
+dict_table_t::parse_name<>(char(&)[NAME_LEN + 1], char(&)[NAME_LEN + 1],
+ size_t*, size_t*) const;
+
+/** Acquire MDL shared for the table name.
+@tparam trylock whether to use non-blocking operation
+@param[in,out] table table object
+@param[in,out] thd background thread
+@param[out] mdl mdl ticket
+@param[in] table_op operation to perform when opening
+@return table object after locking MDL shared
+@retval nullptr if the table is not readable, or if trylock && MDL blocked */
+template<bool trylock>
+dict_table_t*
+dict_acquire_mdl_shared(dict_table_t *table,
+ THD *thd,
+ MDL_ticket **mdl,
+ dict_table_op_t table_op)
+{
+ if (!table || !mdl)
+ return table;
+
+ MDL_context *mdl_context= static_cast<MDL_context*>(thd_mdl_context(thd));
+ size_t db_len;
+
+ if (trylock)
+ {
+ mutex_enter(&dict_sys.mutex);
+ db_len= dict_get_db_name_len(table->name.m_name);
+ mutex_exit(&dict_sys.mutex);
+ }
+ else
+ {
+ ut_ad(mutex_own(&dict_sys.mutex));
+ db_len= dict_get_db_name_len(table->name.m_name);
+ }
+
+ if (db_len == 0)
+ return table; /* InnoDB system tables are not covered by MDL */
+
+ if (!mdl_context)
+ return nullptr;
+
+ table_id_t table_id= table->id;
+ char db_buf[NAME_LEN + 1], db_buf1[NAME_LEN + 1];
+ char tbl_buf[NAME_LEN + 1], tbl_buf1[NAME_LEN + 1];
+ size_t tbl_len;
+ bool unaccessible= false;
+
+ if (!table->parse_name<!trylock>(db_buf, tbl_buf, &db_len, &tbl_len))
+ /* The name of an intermediate table starts with #sql */
+ return table;
+
+retry:
+ if (!unaccessible && (!table->is_readable() || table->corrupted))
+ {
+is_unaccessible:
+ if (*mdl)
+ {
+ mdl_context->release_lock(*mdl);
+ *mdl= nullptr;
+ }
+ unaccessible= true;
+ }
+
+ if (!trylock)
+ table->release();
+
+ if (unaccessible)
+ return nullptr;
+
+ if (!trylock)
+ mutex_exit(&dict_sys.mutex);
+ {
+ MDL_request request;
+ MDL_REQUEST_INIT(&request,MDL_key::TABLE, db_buf, tbl_buf, MDL_SHARED, MDL_EXPLICIT);
+ if (trylock
+ ? mdl_context->try_acquire_lock(&request)
+ : mdl_context->acquire_lock(&request,
+ /* FIXME: use compatible type, and maybe
+ remove this parameter altogether! */
+ static_cast<double>(global_system_variables
+ .lock_wait_timeout)))
+ {
+ *mdl= nullptr;
+ if (trylock)
+ return nullptr;
+ }
+ else
+ *mdl= request.ticket;
+ }
+
+ if (!trylock)
+ mutex_enter(&dict_sys.mutex);
+ else if (!*mdl)
+ return nullptr;
+
+ table= dict_table_open_on_id(table_id, !trylock, table_op);
+
+ if (!table)
+ {
+ /* The table was dropped. */
+ if (*mdl)
+ {
+ mdl_context->release_lock(*mdl);
+ *mdl= nullptr;
+ }
+ return nullptr;
+ }
+
+ if (!table->is_accessible())
+ goto is_unaccessible;
+
+ size_t db1_len, tbl1_len;
+
+ if (!table->parse_name<!trylock>(db_buf1, tbl_buf1, &db1_len, &tbl1_len))
+ {
+ /* The table was renamed to #sql prefix.
+ Release MDL (if any) for the old name and return. */
+ if (*mdl)
+ {
+ mdl_context->release_lock(*mdl);
+ *mdl= nullptr;
+ }
+ return table;
+ }
+
+ if (*mdl)
+ {
+ if (db_len == db1_len && tbl_len == tbl1_len &&
+ !memcmp(db_buf, db_buf1, db_len) &&
+ !memcmp(tbl_buf, tbl_buf1, tbl_len))
+ return table;
+
+ /* The table was renamed. Release MDL for the old name and
+ try to acquire MDL for the new name. */
+ mdl_context->release_lock(*mdl);
+ *mdl= nullptr;
+ }
+
+ db_len= db1_len;
+ tbl_len= tbl1_len;
+
+ memcpy(tbl_buf, tbl_buf1, tbl_len + 1);
+ memcpy(db_buf, db_buf1, db_len + 1);
+ goto retry;
+}
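+
+/* The retry loop above guards against a concurrent RENAME TABLE:
+after the MDL has been acquired, the table is looked up again by id
+and its name is re-parsed; if the name changed while we waited, the
+old MDL is released and the acquisition is retried under the new
+name. */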
+
+template dict_table_t*
+dict_acquire_mdl_shared<true>(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
+
+/** Look up a table by numeric identifier.
+@param[in] table_id table identifier
+@param[in] dict_locked data dictionary locked
+@param[in] table_op operation to perform when opening
+@param[in,out] thd background thread, or NULL to not acquire MDL
+@param[out] mdl mdl ticket, or NULL
+@return table, NULL if it does not exist */
+dict_table_t*
+dict_table_open_on_id(table_id_t table_id, bool dict_locked,
+ dict_table_op_t table_op, THD *thd,
+ MDL_ticket **mdl)
+{
+ ut_ad(!dict_locked || !thd);
+
+ if (!dict_locked) {
+ mutex_enter(&dict_sys.mutex);
+ }
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ dict_table_t* table = dict_table_open_on_id_low(
+ table_id,
+ table_op == DICT_TABLE_OP_LOAD_TABLESPACE
+ ? DICT_ERR_IGNORE_RECOVER_LOCK
+ : DICT_ERR_IGNORE_FK_NOKEY,
+ table_op == DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
+
+ if (table != NULL) {
+ dict_sys.acquire(table);
+ MONITOR_INC(MONITOR_TABLE_REFERENCE);
+ }
+
+ if (!dict_locked) {
+ if (thd) {
+ table = dict_acquire_mdl_shared<false>(
+ table, thd, mdl, table_op);
+ }
+
+ dict_table_try_drop_aborted_and_mutex_exit(
+ table, table_op == DICT_TABLE_OP_DROP_ORPHAN);
+ }
+
+ return table;
+}
+
+/********************************************************************//**
+Looks for column n position in the clustered index.
+@return position in internal representation of the clustered index */
+unsigned
+dict_table_get_nth_col_pos(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos)
+{
+ ulint pos= dict_index_get_nth_col_pos(dict_table_get_first_index(table),
+ n, prefix_col_pos);
+ DBUG_ASSERT(pos <= dict_index_t::MAX_N_FIELDS);
+ return static_cast<unsigned>(pos);
+}
+
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Column prefixes are treated like whole columns.
+@return TRUE if the column, or its prefix, is in the clustered key */
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n) /*!< in: column number */
+{
+ const dict_index_t* index;
+ const dict_field_t* field;
+ const dict_col_t* col;
+ ulint pos;
+ ulint n_fields;
+
+ col = dict_table_get_nth_col(table, n);
+
+ index = dict_table_get_first_index(table);
+
+ n_fields = dict_index_get_n_unique(index);
+
+ for (pos = 0; pos < n_fields; pos++) {
+ field = dict_index_get_nth_field(index, pos);
+
+ if (col == field->col) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/** Initialise the data dictionary cache. */
+void dict_sys_t::create()
+{
+ ut_ad(this == &dict_sys);
+ ut_ad(!is_initialised());
+ m_initialised= true;
+ UT_LIST_INIT(table_LRU, &dict_table_t::table_LRU);
+ UT_LIST_INIT(table_non_LRU, &dict_table_t::table_LRU);
+
+ mutex_create(LATCH_ID_DICT_SYS, &mutex);
+
+ const ulint hash_size = buf_pool_get_curr_size()
+ / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE);
+
+ table_hash.create(hash_size);
+ table_id_hash.create(hash_size);
+ temp_id_hash.create(hash_size);
+
+ rw_lock_create(dict_operation_lock_key, &latch, SYNC_DICT_OPERATION);
+
+ if (!srv_read_only_mode)
+ {
+ dict_foreign_err_file= os_file_create_tmpfile();
+ ut_a(dict_foreign_err_file);
+ }
+
+ mutex_create(LATCH_ID_DICT_FOREIGN_ERR, &dict_foreign_err_mutex);
+}
+
+/** Acquire a reference to a cached table. */
+inline void dict_sys_t::acquire(dict_table_t* table)
+{
+ ut_ad(dict_sys.find(table));
+ if (table->can_be_evicted)
+ {
+ UT_LIST_REMOVE(dict_sys.table_LRU, table);
+ UT_LIST_ADD_FIRST(dict_sys.table_LRU, table);
+ }
+
+ table->acquire();
+}
+
+/**********************************************************************//**
+Returns a table object and increments its open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' module. Inside this directory dict_table_get_low
+is usually the appropriate function.
+@return table, NULL if it does not exist */
+dict_table_t*
+dict_table_open_on_name(
+/*====================*/
+ const char* table_name, /*!< in: table name */
+ ibool dict_locked, /*!< in: TRUE=data dictionary locked */
+ ibool try_drop, /*!< in: TRUE=try to drop any orphan
+ indexes after an aborted online
+ index creation */
+ dict_err_ignore_t
+ ignore_err) /*!< in: error to be ignored when
+ loading a table definition */
+{
+ dict_table_t* table;
+ DBUG_ENTER("dict_table_open_on_name");
+ DBUG_PRINT("dict_table_open_on_name", ("table: '%s'", table_name));
+
+ if (!dict_locked) {
+ mutex_enter(&dict_sys.mutex);
+ }
+
+ ut_ad(table_name);
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ table = dict_table_check_if_in_cache_low(table_name);
+
+ if (table == NULL) {
+ table = dict_load_table(table_name, ignore_err);
+ }
+
+ ut_ad(!table || table->cached);
+
+ if (table != NULL) {
+
+ /* If the table is encrypted or corrupted */
+ if (!(ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY)
+ && !table->is_readable()) {
+ /* Make life easy for drop table. */
+ dict_sys.prevent_eviction(table);
+
+ if (table->corrupted) {
+
+ ib::error() << "Table " << table->name
+ << " is corrupted. Please "
+ "drop the table and recreate.";
+ if (!dict_locked) {
+ mutex_exit(&dict_sys.mutex);
+ }
+
+ DBUG_RETURN(NULL);
+ }
+
+ dict_sys.acquire(table);
+
+ if (!dict_locked) {
+ mutex_exit(&dict_sys.mutex);
+ }
+
+ DBUG_RETURN(table);
+ }
+
+ dict_sys.acquire(table);
+ MONITOR_INC(MONITOR_TABLE_REFERENCE);
+ }
+
+ ut_ad(dict_lru_validate());
+
+ if (!dict_locked) {
+ dict_table_try_drop_aborted_and_mutex_exit(table, try_drop);
+ }
+
+ DBUG_RETURN(table);
+}
+
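+/* Usage sketch, assuming the caller does not hold dict_sys.mutex and
+releases the handle with dict_table_close() (defined elsewhere in this
+module); the table name is illustrative:
+@code
+ dict_table_t* table = dict_table_open_on_name(
+  "test/t1", FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+ if (table != NULL) {
+  ... use the cached table definition ...
+  dict_table_close(table, FALSE, FALSE);
+ }
+@endcode */
+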
+/**********************************************************************//**
+Adds system columns to a table object. */
+void
+dict_table_add_system_columns(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ mem_heap_t* heap) /*!< in: temporary heap */
+{
+ ut_ad(table->n_def == table->n_cols - DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(!table->cached);
+
+ /* NOTE: the system columns MUST be added in the following order
+ (so that they can be indexed by the numerical value of DATA_ROW_ID,
+ etc.) and as the last columns of the table memory object.
+ The clustered index will not always physically contain all system
+ columns. */
+
+ dict_mem_table_add_col(table, heap, "DB_ROW_ID", DATA_SYS,
+ DATA_ROW_ID | DATA_NOT_NULL,
+ DATA_ROW_ID_LEN);
+
+ compile_time_assert(DATA_ROW_ID == 0);
+ dict_mem_table_add_col(table, heap, "DB_TRX_ID", DATA_SYS,
+ DATA_TRX_ID | DATA_NOT_NULL,
+ DATA_TRX_ID_LEN);
+ compile_time_assert(DATA_TRX_ID == 1);
+ dict_mem_table_add_col(table, heap, "DB_ROLL_PTR", DATA_SYS,
+ DATA_ROLL_PTR | DATA_NOT_NULL,
+ DATA_ROLL_PTR_LEN);
+ compile_time_assert(DATA_ROLL_PTR == 2);
+
+ /* This check is a reminder that if a new system column is
+ added to the program, it must be dealt with here */
+ compile_time_assert(DATA_N_SYS_COLS == 3);
+}
+
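+/* Sketch of how a table memory object acquires the system columns. This
+is illustrative only; the exact dict_mem_table_create() arguments vary
+between versions:
+@code
+ dict_table_t* table = dict_mem_table_create(
+  "test/t1", NULL, 2, 0, 0, 0);
+ dict_mem_table_add_col(table, table->heap, "a", DATA_INT,
+         DATA_NOT_NULL, 4);
+ dict_mem_table_add_col(table, table->heap, "b", DATA_INT, 0, 4);
+ dict_table_add_system_columns(table, table->heap);
+@endcode
+The columns are then a, b, DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR. */
+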
+/** Add the table definition to the data dictionary cache */
+void dict_table_t::add_to_cache()
+{
+ cached = TRUE;
+
+ dict_sys.add(this);
+}
+
+/** Add a table definition to the data dictionary cache */
+inline void dict_sys_t::add(dict_table_t* table)
+{
+ ut_ad(!find(table));
+
+ ulint fold = ut_fold_string(table->name.m_name);
+
+ new (&table->autoinc_mutex) std::mutex();
+
+ /* Look for a table with the same name: error if such exists */
+ {
+ dict_table_t* table2;
+ HASH_SEARCH(name_hash, &table_hash, fold,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ !strcmp(table2->name.m_name, table->name.m_name));
+ ut_a(table2 == NULL);
+
+#ifdef UNIV_DEBUG
+ /* Look for the same table pointer with a different name */
+ HASH_SEARCH_ALL(name_hash, &table_hash,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ table2 == table);
+ ut_ad(table2 == NULL);
+#endif /* UNIV_DEBUG */
+ }
+ HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table);
+
+ /* Look for a table with the same id: error if such exists */
+ hash_table_t* id_hash = table->is_temporary()
+ ? &temp_id_hash : &table_id_hash;
+ const ulint id_fold = ut_fold_ull(table->id);
+ {
+ dict_table_t* table2;
+ HASH_SEARCH(id_hash, id_hash, id_fold,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ table2->id == table->id);
+ ut_a(table2 == NULL);
+
+#ifdef UNIV_DEBUG
+ /* Look for the same table pointer with a different id */
+ HASH_SEARCH_ALL(id_hash, id_hash,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ table2 == table);
+ ut_ad(table2 == NULL);
+#endif /* UNIV_DEBUG */
+
+ HASH_INSERT(dict_table_t, id_hash, id_hash, id_fold, table);
+ }
+
+ UT_LIST_ADD_FIRST(table->can_be_evicted ? table_LRU : table_non_LRU,
+ table);
+ ut_ad(dict_lru_validate());
+}
+
+/**********************************************************************//**
+Test whether a table can be evicted from the LRU cache.
+@return TRUE if table can be evicted. */
+static
+ibool
+dict_table_can_be_evicted(
+/*======================*/
+ dict_table_t* table) /*!< in: table to test */
+{
+ ut_d(dict_sys.assert_locked());
+ ut_a(table->can_be_evicted);
+ ut_a(table->foreign_set.empty());
+ ut_a(table->referenced_set.empty());
+
+ if (table->get_ref_count() == 0) {
+ /* The transaction commit and rollback are called from
+ outside the handler interface. This means that there is
+ a window where the table->n_ref_count can be zero but
+ the table instance is in "use". */
+
+ if (lock_table_has_locks(table)) {
+ return(FALSE);
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* We cannot really evict the table if adaptive hash
+ index entries are pointing to any of its indexes. */
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ if (index->n_ahi_pages()) {
+ return(FALSE);
+ }
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** @return a clone of this */
+dict_index_t *dict_index_t::clone() const
+{
+ ut_ad(n_fields);
+ ut_ad(!(type & (DICT_IBUF | DICT_SPATIAL | DICT_FTS)));
+ ut_ad(online_status == ONLINE_INDEX_COMPLETE);
+ ut_ad(is_committed());
+ ut_ad(!is_dummy);
+ ut_ad(!parser);
+ ut_ad(!online_log);
+ ut_ad(!rtr_track);
+
+ const size_t size= sizeof *this + n_fields * sizeof(*fields) +
+#ifdef BTR_CUR_ADAPT
+ sizeof *search_info +
+#endif
+ 1 + strlen(name) +
+ n_uniq * (sizeof *stat_n_diff_key_vals +
+ sizeof *stat_n_sample_sizes +
+ sizeof *stat_n_non_null_key_vals);
+
+ mem_heap_t* heap= mem_heap_create(size);
+ dict_index_t *index= static_cast<dict_index_t*>(mem_heap_dup(heap, this,
+ sizeof *this));
+ *index= *this;
+ rw_lock_create(index_tree_rw_lock_key, &index->lock, SYNC_INDEX_TREE);
+ index->heap= heap;
+ index->name= mem_heap_strdup(heap, name);
+ index->fields= static_cast<dict_field_t*>
+ (mem_heap_dup(heap, fields, n_fields * sizeof *fields));
+#ifdef BTR_CUR_ADAPT
+ index->search_info= btr_search_info_create(index->heap);
+#endif /* BTR_CUR_ADAPT */
+ index->stat_n_diff_key_vals= static_cast<ib_uint64_t*>
+ (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_diff_key_vals));
+ index->stat_n_sample_sizes= static_cast<ib_uint64_t*>
+ (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_sample_sizes));
+ index->stat_n_non_null_key_vals= static_cast<ib_uint64_t*>
+ (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_non_null_key_vals));
+ new (&index->zip_pad.mutex) std::mutex();
+ return index;
+}
+
+/** Clone this index for lazy dropping of the adaptive hash.
+@return this or a clone */
+dict_index_t *dict_index_t::clone_if_needed()
+{
+ if (!search_info->ref_count)
+ return this;
+ dict_index_t *prev= UT_LIST_GET_PREV(indexes, this);
+
+ UT_LIST_REMOVE(table->indexes, this);
+ UT_LIST_ADD_LAST(table->freed_indexes, this);
+ dict_index_t *index= clone();
+ set_freed();
+ if (prev)
+ UT_LIST_INSERT_AFTER(table->indexes, prev, index);
+ else
+ UT_LIST_ADD_FIRST(table->indexes, index);
+ return index;
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/**********************************************************************//**
+Make room in the table cache by evicting an unused table. The unused table
+should not be part of an FK relationship and should not currently be used
+in any user transaction. There is no guarantee that a table will be removed.
+@return number of tables evicted. If the number of tables in the dict_LRU
+list is less than max_tables, nothing is done. */
+ulint
+dict_make_room_in_cache(
+/*====================*/
+ ulint max_tables, /*!< in: max tables allowed in cache */
+ ulint pct_check) /*!< in: max percent to check */
+{
+ ulint i;
+ ulint len;
+ dict_table_t* table;
+ ulint check_up_to;
+ ulint n_evicted = 0;
+
+ ut_a(pct_check > 0);
+ ut_a(pct_check <= 100);
+ ut_d(dict_sys.assert_locked());
+ ut_ad(dict_lru_validate());
+
+ i = len = UT_LIST_GET_LEN(dict_sys.table_LRU);
+
+ if (len < max_tables) {
+ return(0);
+ }
+
+ check_up_to = len - ((len * pct_check) / 100);
+
+ /* Check for overflow */
+ ut_a(i == 0 || check_up_to <= i);
+
+ /* Find a suitable candidate to evict from the cache. Don't scan the
+ entire LRU list. Only scan pct_check list entries. */
+
+ for (table = UT_LIST_GET_LAST(dict_sys.table_LRU);
+ table != NULL
+ && i > check_up_to
+ && (len - n_evicted) > max_tables;
+ --i) {
+
+ dict_table_t* prev_table;
+
+ prev_table = UT_LIST_GET_PREV(table_LRU, table);
+
+ if (dict_table_can_be_evicted(table)) {
+ ut_ad(!table->fts);
+ dict_sys.remove(table, true);
+
+ ++n_evicted;
+ }
+
+ table = prev_table;
+ }
+
+ return(n_evicted);
+}
+
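+/* Worked example of the scan bound above in dict_make_room_in_cache()
+(illustrative numbers): with len = 200 tables in dict_sys.table_LRU and
+pct_check = 10, check_up_to = 200 - (200 * 10) / 100 = 180, so at most
+the 20 least-recently-used tables are examined. */
+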
+/** Looks for an index with the given id in a given table instance.
+@param[in] table table instance
+@param[in] id index id
+@return index or NULL */
+dict_index_t*
+dict_table_find_index_on_id(
+ const dict_table_t* table,
+ index_id_t id)
+{
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (id == index->id) {
+ /* Found */
+
+ return(index);
+ }
+ }
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Looks for an index with the given id. NOTE that we do not reserve
+the dictionary mutex: this function is for emergency purposes like
+printing info of a corrupt database page!
+@return index or NULL if not found in cache */
+dict_index_t*
+dict_index_find_on_id_low(
+/*======================*/
+ index_id_t id) /*!< in: index id */
+{
+ if (!dict_sys.is_initialised()) return NULL;
+
+ dict_table_t* table;
+
+ for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU);
+ table != NULL;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ dict_index_t* index = dict_table_find_index_on_id(table, id);
+
+ if (index != NULL) {
+ return(index);
+ }
+ }
+
+ for (table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU);
+ table != NULL;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ dict_index_t* index = dict_table_find_index_on_id(table, id);
+
+ if (index != NULL) {
+ return(index);
+ }
+ }
+
+ return(NULL);
+}
+
+/** Function object to remove a foreign key constraint from the
+referenced_set of the referenced table. The foreign key object is
+also removed from the dictionary cache. The foreign key constraint
+is not removed from the foreign_set of the table containing the
+constraint. */
+struct dict_foreign_remove_partial
+{
+ void operator()(dict_foreign_t* foreign) {
+ dict_table_t* table = foreign->referenced_table;
+ if (table != NULL) {
+ table->referenced_set.erase(foreign);
+ }
+ dict_foreign_free(foreign);
+ }
+};
+
+/**********************************************************************//**
+Renames a table object.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_table_rename_in_cache(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ const char* new_name, /*!< in: new name */
+ bool rename_also_foreigns,
+ /*!< in: in ALTER TABLE we want
+ to preserve the original table name
+ in constraints which reference it */
+ bool replace_new_file)
+ /*!< in: whether to replace the
+ file with the new name
+ (as part of rolling back TRUNCATE) */
+{
+ dberr_t err;
+ dict_foreign_t* foreign;
+ ulint fold;
+ char old_name[MAX_FULL_NAME_LEN + 1];
+ os_file_type_t ftype;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ /* Store the old/current name in an automatic variable */
+ ut_a(strlen(table->name.m_name) < sizeof old_name);
+ strcpy(old_name, table->name.m_name);
+
+ fold = ut_fold_string(new_name);
+
+ /* Look for a table with the same name: error if such exists */
+ dict_table_t* table2;
+ HASH_SEARCH(name_hash, &dict_sys.table_hash, fold,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ (strcmp(table2->name.m_name, new_name) == 0));
+ DBUG_EXECUTE_IF("dict_table_rename_in_cache_failure",
+ if (table2 == NULL) {
+ table2 = (dict_table_t*) -1;
+ } );
+ if (table2) {
+ ib::error() << "Cannot rename table '" << old_name
+ << "' to '" << new_name << "' since the"
+ " dictionary cache already contains '" << new_name << "'.";
+ return(DB_ERROR);
+ }
+
+ /* If the table is stored in a single-table tablespace, rename the
+ .ibd file and rebuild the .isl file if needed. */
+
+ if (!table->space) {
+ bool exists;
+ char* filepath;
+
+ ut_ad(dict_table_is_file_per_table(table));
+ ut_ad(!table->is_temporary());
+
+ /* Make sure the data_dir_path is set. */
+ dict_get_and_save_data_dir_path(table, true);
+
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ ut_a(table->data_dir_path);
+
+ filepath = fil_make_filepath(
+ table->data_dir_path, table->name.m_name,
+ IBD, true);
+ } else {
+ filepath = fil_make_filepath(
+ NULL, table->name.m_name, IBD, false);
+ }
+
+ if (filepath == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ fil_delete_tablespace(table->space_id, !table->space);
+
+ /* Delete any temp file hanging around. */
+ if (os_file_status(filepath, &exists, &ftype)
+ && exists
+ && !os_file_delete_if_exists(innodb_temp_file_key,
+ filepath, NULL)) {
+
+ ib::info() << "Delete of " << filepath << " failed.";
+ }
+ ut_free(filepath);
+
+ } else if (dict_table_is_file_per_table(table)) {
+ char* new_path;
+ const char* old_path = UT_LIST_GET_FIRST(table->space->chain)
+ ->name;
+
+ ut_ad(!table->is_temporary());
+
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ new_path = os_file_make_new_pathname(
+ old_path, new_name);
+ err = RemoteDatafile::create_link_file(
+ new_name, new_path);
+
+ if (err != DB_SUCCESS) {
+ ut_free(new_path);
+ return(DB_TABLESPACE_EXISTS);
+ }
+ } else {
+ new_path = fil_make_filepath(
+ NULL, new_name, IBD, false);
+ }
+
+ /* New filepath must not exist. */
+ err = table->space->rename(new_name, new_path, true,
+ replace_new_file);
+ ut_free(new_path);
+
+ /* If the tablespace is remote, a new .isl file was created.
+ If successful, delete the old one; if not, delete the new
+ one. */
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ RemoteDatafile::delete_link_file(
+ err == DB_SUCCESS ? old_name : new_name);
+ }
+
+ if (err != DB_SUCCESS) {
+ return err;
+ }
+ }
+
+ /* Remove table from the hash tables of tables */
+ HASH_DELETE(dict_table_t, name_hash, &dict_sys.table_hash,
+ ut_fold_string(old_name), table);
+
+ if (strlen(new_name) > strlen(table->name.m_name)) {
+ /* We allocate MAX_FULL_NAME_LEN + 1 bytes here to avoid
+ memory fragmentation; we assume that repeated calls of
+ ut_realloc() with the same size do not cause fragmentation */
+ ut_a(strlen(new_name) <= MAX_FULL_NAME_LEN);
+
+ table->name.m_name = static_cast<char*>(
+ ut_realloc(table->name.m_name, MAX_FULL_NAME_LEN + 1));
+ }
+ strcpy(table->name.m_name, new_name);
+
+ /* Add table to hash table of tables */
+ HASH_INSERT(dict_table_t, name_hash, &dict_sys.table_hash, fold,
+ table);
+
+ if (!rename_also_foreigns) {
+ /* In ALTER TABLE we think of the rename table operation
+ in the direction table -> temporary table (#sql...)
+ as dropping the table with the old name and creating
+ a new table with the new name. Thus we effectively drop
+ the constraints from the dictionary cache here. The foreign
+ key constraints will be inherited by the new table from the
+ system tables through a call to dict_load_foreigns. */
+
+ /* Remove the foreign constraints from the cache */
+ std::for_each(table->foreign_set.begin(),
+ table->foreign_set.end(),
+ dict_foreign_remove_partial());
+ table->foreign_set.clear();
+
+ /* Reset table field in referencing constraints */
+ for (dict_foreign_set::iterator it
+ = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+ foreign->referenced_table = NULL;
+ foreign->referenced_index = NULL;
+
+ }
+
+ /* Make the set of referencing constraints empty */
+ table->referenced_set.clear();
+
+ return(DB_SUCCESS);
+ }
+
+ /* Update the table name fields in foreign constraints, and also
+ update the constraint ids of new-format (>= 4.0.18) constraints.
+ Note that at this point we have already changed table->name to the
+ new name. */
+
+ dict_foreign_set fk_set;
+
+ for (;;) {
+
+ dict_foreign_set::iterator it
+ = table->foreign_set.begin();
+
+ if (it == table->foreign_set.end()) {
+ break;
+ }
+
+ foreign = *it;
+
+ if (foreign->referenced_table) {
+ foreign->referenced_table->referenced_set.erase(foreign);
+ }
+
+ if (strlen(foreign->foreign_table_name)
+ < strlen(table->name.m_name)) {
+ /* Allocate a longer name buffer;
+ TODO: store buf len to save memory */
+
+ foreign->foreign_table_name = mem_heap_strdup(
+ foreign->heap, table->name.m_name);
+ dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+ } else {
+ strcpy(foreign->foreign_table_name,
+ table->name.m_name);
+ dict_mem_foreign_table_name_lookup_set(foreign, FALSE);
+ }
+ if (strchr(foreign->id, '/')) {
+ /* This is a >= 4.0.18 format id */
+
+ ulint db_len;
+ char* old_id;
+ char old_name_cs_filename[MAX_FULL_NAME_LEN+1];
+ uint errors = 0;
+
+ /* All table names are internally stored in charset
+ my_charset_filename (except the temp tables and the
+ partition identifier suffix in partition tables). The
+ foreign key constraint names are internally stored
+ in UTF-8 charset. The variable fkid here is used
+ to store foreign key constraint name in charset
+ my_charset_filename for comparison further below. */
+ char fkid[MAX_TABLE_NAME_LEN+20];
+ ibool on_tmp = FALSE;
+
+ /* The old table name in my_charset_filename is stored
+ in old_name_cs_filename */
+
+ strcpy(old_name_cs_filename, old_name);
+ old_name_cs_filename[MAX_FULL_NAME_LEN] = '\0';
+ if (strstr(old_name, TEMP_TABLE_PATH_PREFIX) == NULL) {
+
+ innobase_convert_to_system_charset(
+ strchr(old_name_cs_filename, '/') + 1,
+ strchr(old_name, '/') + 1,
+ MAX_TABLE_NAME_LEN, &errors);
+
+ if (errors) {
+ /* There was an error converting the
+ old table name to UTF-8. This probably
+ means that the old table name is
+ actually in UTF-8. */
+ innobase_convert_to_filename_charset(
+ strchr(old_name_cs_filename,
+ '/') + 1,
+ strchr(old_name, '/') + 1,
+ MAX_TABLE_NAME_LEN);
+ } else {
+ /* Old name already in
+ my_charset_filename */
+ strcpy(old_name_cs_filename, old_name);
+ old_name_cs_filename[MAX_FULL_NAME_LEN]
+ = '\0';
+ }
+ }
+
+ strncpy(fkid, foreign->id, MAX_TABLE_NAME_LEN);
+
+ if (strstr(fkid, TEMP_TABLE_PATH_PREFIX) == NULL) {
+ innobase_convert_to_filename_charset(
+ strchr(fkid, '/') + 1,
+ strchr(foreign->id, '/') + 1,
+ MAX_TABLE_NAME_LEN+20);
+ } else {
+ on_tmp = TRUE;
+ }
+
+ old_id = mem_strdup(foreign->id);
+
+ if (strlen(fkid) > strlen(old_name_cs_filename)
+ + ((sizeof dict_ibfk) - 1)
+ && !memcmp(fkid, old_name_cs_filename,
+ strlen(old_name_cs_filename))
+ && !memcmp(fkid + strlen(old_name_cs_filename),
+ dict_ibfk, (sizeof dict_ibfk) - 1)) {
+
+ /* This is a generated >= 4.0.18 format id */
+
+ char table_name[MAX_TABLE_NAME_LEN + 1];
+ uint errors = 0;
+
+ if (strlen(table->name.m_name)
+ > strlen(old_name)) {
+ foreign->id = static_cast<char*>(
+ mem_heap_alloc(
+ foreign->heap,
+ strlen(table->name.m_name)
+ + strlen(old_id) + 1));
+ }
+
+ /* Convert the table name to UTF-8 */
+ strncpy(table_name, table->name.m_name,
+ MAX_TABLE_NAME_LEN);
+ table_name[MAX_TABLE_NAME_LEN] = '\0';
+ innobase_convert_to_system_charset(
+ strchr(table_name, '/') + 1,
+ strchr(table->name.m_name, '/') + 1,
+ MAX_TABLE_NAME_LEN, &errors);
+
+ if (errors) {
+ /* Table name could not be converted
+ from charset my_charset_filename to
+ UTF-8. This means that the table name
+ is already in UTF-8 (#mysql50#). */
+ strncpy(table_name, table->name.m_name,
+ MAX_TABLE_NAME_LEN);
+ table_name[MAX_TABLE_NAME_LEN] = '\0';
+ }
+
+ /* Replace the prefix 'databasename/tablename'
+ with the new names */
+ strcpy(foreign->id, table_name);
+ if (on_tmp) {
+ strcat(foreign->id,
+ old_id + strlen(old_name));
+ } else {
+ sprintf(strchr(foreign->id, '/') + 1,
+  "%s%s",
+  strchr(table_name, '/') + 1,
+  strstr(old_id, "_ibfk_"));
+ }
+
+ } else {
+ /* This is a >= 4.0.18 format id where the user
+ gave the id name */
+ db_len = dict_get_db_name_len(
+ table->name.m_name) + 1;
+
+ if (db_len - 1
+ > dict_get_db_name_len(foreign->id)) {
+
+ foreign->id = static_cast<char*>(
+ mem_heap_alloc(
+ foreign->heap,
+ db_len + strlen(old_id) + 1));
+ }
+
+ /* Replace the database prefix in id with the
+ one from table->name */
+
+ memcpy(foreign->id,
+ table->name.m_name, db_len);
+
+ strcpy(foreign->id + db_len,
+ dict_remove_db_name(old_id));
+ }
+
+ ut_free(old_id);
+ }
+
+ table->foreign_set.erase(it);
+ fk_set.insert(foreign);
+
+ if (foreign->referenced_table) {
+ foreign->referenced_table->referenced_set.insert(foreign);
+ }
+ }
+
+ ut_a(table->foreign_set.empty());
+ table->foreign_set.swap(fk_set);
+
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (strlen(foreign->referenced_table_name)
+ < strlen(table->name.m_name)) {
+ /* Allocate a longer name buffer;
+ TODO: store buf len to save memory */
+
+ foreign->referenced_table_name = mem_heap_strdup(
+ foreign->heap, table->name.m_name);
+
+ dict_mem_referenced_table_name_lookup_set(
+ foreign, TRUE);
+ } else {
+ /* Use the same buffer */
+ strcpy(foreign->referenced_table_name,
+ table->name.m_name);
+
+ dict_mem_referenced_table_name_lookup_set(
+ foreign, FALSE);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
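+/* Example of the constraint id renaming above (names illustrative):
+when "test/t1" is renamed to "test/t2", a generated constraint id
+"test/t1_ibfk_1" becomes "test/t2_ibfk_1", while a user-specified id
+such as "test/my_fk" only has its database prefix replaced. */
+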
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table object already in cache */
+ table_id_t new_id) /*!< in: new id to set */
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(!table->is_temporary());
+
+ /* Remove the table from the hash table of id's */
+
+ HASH_DELETE(dict_table_t, id_hash, &dict_sys.table_id_hash,
+ ut_fold_ull(table->id), table);
+ table->id = new_id;
+
+ /* Add the table back to the hash table */
+ HASH_INSERT(dict_table_t, id_hash, &dict_sys.table_id_hash,
+ ut_fold_ull(table->id), table);
+}
+
+/** Evict a table definition from the InnoDB data dictionary cache.
+@param[in,out] table cached table definition to be evicted
+@param[in] lru whether this is part of least-recently-used eviction
+@param[in] keep whether to keep (not free) the object */
+void dict_sys_t::remove(dict_table_t* table, bool lru, bool keep)
+{
+ dict_foreign_t* foreign;
+ dict_index_t* index;
+
+ ut_ad(dict_lru_validate());
+ ut_a(table->get_ref_count() == 0);
+ ut_a(table->n_rec_locks == 0);
+ ut_ad(find(table));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ /* Remove the foreign constraints from the cache */
+ std::for_each(table->foreign_set.begin(), table->foreign_set.end(),
+ dict_foreign_remove_partial());
+ table->foreign_set.clear();
+
+ /* Reset table field in referencing constraints */
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+ foreign->referenced_table = NULL;
+ foreign->referenced_index = NULL;
+ }
+
+ /* Remove the indexes from the cache */
+
+ for (index = UT_LIST_GET_LAST(table->indexes);
+ index != NULL;
+ index = UT_LIST_GET_LAST(table->indexes)) {
+
+ dict_index_remove_from_cache_low(table, index, lru);
+ }
+
+ /* Remove table from the hash tables of tables */
+
+ HASH_DELETE(dict_table_t, name_hash, &table_hash,
+ ut_fold_string(table->name.m_name), table);
+
+ hash_table_t* id_hash = table->is_temporary()
+ ? &temp_id_hash : &table_id_hash;
+ const ulint id_fold = ut_fold_ull(table->id);
+ HASH_DELETE(dict_table_t, id_hash, id_hash, id_fold, table);
+
+ /* Remove table from LRU or non-LRU list. */
+ if (table->can_be_evicted) {
+ UT_LIST_REMOVE(table_LRU, table);
+ } else {
+ UT_LIST_REMOVE(table_non_LRU, table);
+ }
+
+ if (lru && table->drop_aborted) {
+ /* When evicting the table definition,
+ drop the orphan indexes from the data dictionary
+ and free the index pages. */
+ trx_t* trx = trx_create();
+
+ ut_d(dict_sys.assert_locked());
+ /* Mimic row_mysql_lock_data_dictionary(). */
+ trx->dict_operation_lock_mode = RW_X_LATCH;
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+ row_merge_drop_indexes_dict(trx, table->id);
+ trx_commit_for_mysql(trx);
+ trx->dict_operation_lock_mode = 0;
+ trx->free();
+ }
+
+ /* Free virtual column template if any */
+ if (table->vc_templ != NULL) {
+ dict_free_vc_templ(table->vc_templ);
+ UT_DELETE(table->vc_templ);
+ }
+
+ table->autoinc_mutex.~mutex();
+
+ if (keep) {
+ return;
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (UNIV_UNLIKELY(UT_LIST_GET_LEN(table->freed_indexes) != 0)) {
+ if (table->fts) {
+ fts_optimize_remove_table(table);
+ fts_free(table);
+ table->fts = NULL;
+ }
+
+ table->vc_templ = NULL;
+ table->id = 0;
+ return;
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ dict_mem_table_free(table);
+}
+
+/****************************************************************//**
+If the given column name is reserved for InnoDB system columns, return
+TRUE.
+@return TRUE if name is reserved */
+ibool
+dict_col_name_is_reserved(
+/*======================*/
+ const char* name) /*!< in: column name */
+{
+ static const char* reserved_names[] = {
+ "DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR"
+ };
+
+ compile_time_assert(UT_ARR_SIZE(reserved_names) == DATA_N_SYS_COLS);
+
+ for (ulint i = 0; i < UT_ARR_SIZE(reserved_names); i++) {
+ if (innobase_strcasecmp(name, reserved_names[i]) == 0) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
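+/* Usage sketch (hypothetical caller validating a user-supplied column
+name):
+@code
+ if (dict_col_name_is_reserved(col_name)) {
+  return(DB_ERROR);
+ }
+@endcode */
+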
+/** Adds an index to the dictionary cache, possibly indexing newly
+added virtual columns.
+@param[in,out] index index; NOTE! The index memory
+ object is freed in this function!
+@param[in] page_no root page number of the index
+@param[in] add_v virtual columns being added along with ADD INDEX
+@return DB_SUCCESS, or DB_CORRUPTION */
+dberr_t
+dict_index_add_to_cache(
+ dict_index_t*& index,
+ ulint page_no,
+ const dict_add_v_col_t* add_v)
+{
+ dict_index_t* new_index;
+ ulint n_ord;
+ ulint i;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(index->n_def == index->n_fields);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(!dict_index_is_ibuf(index));
+
+ ut_d(mem_heap_validate(index->heap));
+ ut_a(!dict_index_is_clust(index)
+ || UT_LIST_GET_LEN(index->table->indexes) == 0);
+ ut_ad(dict_index_is_clust(index) || !index->table->no_rollback());
+
+ if (!dict_index_find_cols(index, add_v)) {
+
+ dict_mem_index_free(index);
+ index = NULL;
+ return DB_CORRUPTION;
+ }
+
+ /* Build the cache internal representation of the index,
+ containing also the added system fields */
+
+ if (dict_index_is_clust(index)) {
+ new_index = dict_index_build_internal_clust(index);
+ } else {
+ new_index = (index->type & DICT_FTS)
+ ? dict_index_build_internal_fts(index)
+ : dict_index_build_internal_non_clust(index);
+ new_index->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(new_index->n_nullable)));
+ }
+
+ /* Set the n_fields value in new_index to the actual defined
+ number of fields in the cache internal representation */
+
+ new_index->n_fields = new_index->n_def;
+ new_index->trx_id = index->trx_id;
+ new_index->set_committed(index->is_committed());
+ new_index->nulls_equal = index->nulls_equal;
+#ifdef MYSQL_INDEX_DISABLE_AHI
+ new_index->disable_ahi = index->disable_ahi;
+#endif
+
+ n_ord = new_index->n_uniq;
+ /* Flag the ordering columns and also set column max_prefix */
+
+ for (i = 0; i < n_ord; i++) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(new_index, i);
+
+ /* Check whether the column is being added to an index
+ for the first time and flag it as an ordering column. */
+ if (field->col->ord_part == 0) {
+ field->col->max_prefix = field->prefix_len;
+ field->col->ord_part = 1;
+ } else if (field->prefix_len == 0) {
+ /* Set the max_prefix for a column to 0 if
+ its prefix length is 0 (for this index)
+ even if it was a part of any other index
+ with some prefix length. */
+ field->col->max_prefix = 0;
+ } else if (field->col->max_prefix != 0
+ && field->prefix_len
+ > field->col->max_prefix) {
+ /* Set the max_prefix value based on the
+ prefix_len. */
+ ut_ad(field->col->is_binary()
+ || field->prefix_len % field->col->mbmaxlen == 0);
+ field->col->max_prefix = field->prefix_len;
+ }
+ ut_ad(field->col->ord_part == 1);
+ }
+
+ new_index->stat_n_diff_key_vals =
+ static_cast<ib_uint64_t*>(mem_heap_zalloc(
+ new_index->heap,
+ dict_index_get_n_unique(new_index)
+ * sizeof(*new_index->stat_n_diff_key_vals)));
+
+ new_index->stat_n_sample_sizes =
+ static_cast<ib_uint64_t*>(mem_heap_zalloc(
+ new_index->heap,
+ dict_index_get_n_unique(new_index)
+ * sizeof(*new_index->stat_n_sample_sizes)));
+
+ new_index->stat_n_non_null_key_vals =
+ static_cast<ib_uint64_t*>(mem_heap_zalloc(
+ new_index->heap,
+ dict_index_get_n_unique(new_index)
+ * sizeof(*new_index->stat_n_non_null_key_vals)));
+
+ new_index->stat_index_size = 1;
+ new_index->stat_n_leaf_pages = 1;
+
+ new_index->stat_defrag_n_pages_freed = 0;
+ new_index->stat_defrag_n_page_split = 0;
+
+ new_index->stat_defrag_sample_next_slot = 0;
+ memset(&new_index->stat_defrag_data_size_sample,
+ 0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE);
+
+ /* Add the new index as the last index for the table */
+
+ UT_LIST_ADD_LAST(new_index->table->indexes, new_index);
+#ifdef BTR_CUR_ADAPT
+ new_index->search_info = btr_search_info_create(new_index->heap);
+#endif /* BTR_CUR_ADAPT */
+
+ new_index->page = unsigned(page_no);
+ rw_lock_create(index_tree_rw_lock_key, &new_index->lock,
+ SYNC_INDEX_TREE);
+
+ new_index->n_core_fields = new_index->n_fields;
+
+ dict_mem_index_free(index);
+ index = new_index;
+ return DB_SUCCESS;
+}
+
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+static
+void
+dict_index_remove_from_cache_low(
+/*=============================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index, /*!< in, own: index */
+ ibool lru_evict) /*!< in: TRUE if index being evicted
+ to make room in the table LRU list */
+{
+ ut_ad(table && index);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(table->id);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!index->freed());
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /* No need to acquire the dict_index_t::lock here because
+ there can't be any active operations on this index (or table). */
+
+ if (index->online_log) {
+ ut_ad(index->online_status == ONLINE_INDEX_CREATION);
+ row_log_free(index->online_log);
+ index->online_log = NULL;
+ }
+
+ /* Remove the index from the list of indexes of the table */
+ UT_LIST_REMOVE(table->indexes, index);
+
+ /* The index is being dropped, remove any compression stats for it. */
+ if (!lru_evict && DICT_TF_GET_ZIP_SSIZE(index->table->flags)) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index.erase(index->id);
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+
+ /* Remove the index from affected virtual column index list */
+ index->detach_columns();
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* We always create the search info, whether or not the
+ adaptive hash index is enabled. */
+ /* We are not allowed to free the in-memory index struct
+ dict_index_t until all entries in the adaptive hash index
+ that point to any of the pages belonging to this B-tree index
+ have been dropped. This is because dropping these entries
+ requires access to the dict_index_t struct. To avoid such a
+ scenario, we keep a count of the number of such pages in the
+ search_info and only free the dict_index_t struct when this
+ count drops to zero. See also: dict_table_can_be_evicted() */
+
+ if (index->n_ahi_pages()) {
+ index->set_freed();
+ UT_LIST_ADD_LAST(table->freed_indexes, index);
+ return;
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ rw_lock_free(&index->lock);
+
+ dict_mem_index_free(index);
+}
+
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+void
+dict_index_remove_from_cache(
+/*=========================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index) /*!< in, own: index */
+{
+ dict_index_remove_from_cache_low(table, index, FALSE);
+}
+
+/** Tries to find the columns of the index by name and sets the col
+field of each index field.
+@param[in] table table
+@param[in,out] index index
+@param[in] add_v new virtual columns added along with an add index call
+@return whether the column names were found */
+static
+bool
+dict_index_find_cols(
+ dict_index_t* index,
+ const dict_add_v_col_t* add_v)
+{
+ std::vector<ulint, ut_allocator<ulint> > col_added;
+ std::vector<ulint, ut_allocator<ulint> > v_col_added;
+
+ const dict_table_t* table = index->table;
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ for (ulint i = 0; i < index->n_fields; i++) {
+ ulint j;
+ dict_field_t* field = dict_index_get_nth_field(index, i);
+
+ for (j = 0; j < table->n_cols; j++) {
+ if (!innobase_strcasecmp(dict_table_get_col_name(table, j),
+ field->name)) {
+
+ /* Check if the same column is being assigned
+ again, which suggests that the column has a
+ duplicate name. */
+ bool exists =
+ std::find(col_added.begin(),
+ col_added.end(), j)
+ != col_added.end();
+
+ if (exists) {
+ /* Duplicate column found. */
+ goto dup_err;
+ }
+
+ field->col = dict_table_get_nth_col(table, j);
+
+ col_added.push_back(j);
+
+ goto found;
+ }
+ }
+
+ /* Let's check if it is a virtual column */
+ for (j = 0; j < table->n_v_cols; j++) {
+ if (!strcmp(dict_table_get_v_col_name(table, j),
+ field->name)) {
+
+ /* Check if the same column is being assigned
+ again, which suggests that the column has a
+ duplicate name. */
+ bool exists =
+ std::find(v_col_added.begin(),
+ v_col_added.end(), j)
+ != v_col_added.end();
+
+ if (exists) {
+ /* Duplicate column found. */
+ break;
+ }
+
+ field->col = reinterpret_cast<dict_col_t*>(
+ dict_table_get_nth_v_col(table, j));
+
+ v_col_added.push_back(j);
+
+ goto found;
+ }
+ }
+
+ if (add_v) {
+ for (j = 0; j < add_v->n_v_col; j++) {
+ if (!strcmp(add_v->v_col_name[j],
+ field->name)) {
+ field->col = const_cast<dict_col_t*>(
+ &add_v->v_col[j].m_col);
+ goto found;
+ }
+ }
+ }
+
+dup_err:
+#ifdef UNIV_DEBUG
+ /* It is an error not to find a matching column. */
+ ib::error() << "No matching column for " << field->name
+ << " in index " << index->name
+ << " of table " << table->name;
+#endif /* UNIV_DEBUG */
+ return(FALSE);
+
+found:
+ ;
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Adds a column to index. */
+void
+dict_index_add_col(
+/*===============*/
+ dict_index_t* index, /*!< in/out: index */
+ const dict_table_t* table, /*!< in: table */
+ dict_col_t* col, /*!< in: column */
+ ulint prefix_len) /*!< in: column prefix length */
+{
+ dict_field_t* field;
+ const char* col_name;
+
+ if (col->is_virtual()) {
+ dict_v_col_t* v_col = reinterpret_cast<dict_v_col_t*>(col);
+ /* Register the index with the virtual column index list */
+ v_col->v_indexes.push_front(dict_v_idx_t(index, index->n_def));
+ col_name = dict_table_get_v_col_name_mysql(
+ table, dict_col_get_no(col));
+ } else {
+ col_name = dict_table_get_col_name(table, dict_col_get_no(col));
+ }
+
+ dict_mem_index_add_field(index, col_name, prefix_len);
+
+ field = dict_index_get_nth_field(index, unsigned(index->n_def) - 1);
+
+ field->col = col;
+ field->fixed_len = static_cast<uint16_t>(
+ dict_col_get_fixed_size(
+ col, dict_table_is_comp(table)))
+ & ((1U << 10) - 1);
+
+ if (prefix_len && field->fixed_len > prefix_len) {
+ field->fixed_len = static_cast<uint16_t>(prefix_len)
+ & ((1U << 10) - 1);
+ }
+
+ /* Long fixed-length fields that need external storage are treated as
+ variable-length fields, so that the extern flag can be embedded in
+ the length word. */
+
+ if (field->fixed_len > DICT_MAX_FIXED_COL_LEN) {
+ field->fixed_len = 0;
+ }
+
+ /* The comparison limit above must be constant. If it were
+ changed, the disk format of some fixed-length columns would
+ change, which would be a disaster. */
+ compile_time_assert(DICT_MAX_FIXED_COL_LEN == 768);
+
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ index->n_nullable++;
+ }
+}
+
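+/* Worked example of the fixed_len adjustments above (illustrative): a
+CHAR(255) column in a charset with mbmaxlen = 3 can have a fixed size of
+765 bytes; with prefix_len = 100 the field's fixed_len is clamped to
+100, and any fixed length greater than DICT_MAX_FIXED_COL_LEN (768) is
+reset to 0, i.e. the field is treated as variable-length. */
+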
+/*******************************************************************//**
+Copies fields contained in index2 to index1. */
+static
+void
+dict_index_copy(
+/*============*/
+ dict_index_t* index1, /*!< in: index to copy to */
+ const dict_index_t* index2, /*!< in: index to copy from */
+ ulint start, /*!< in: first position to copy */
+ ulint end) /*!< in: last position to copy */
+{
+ dict_field_t* field;
+ ulint i;
+
+ /* Copy fields contained in index2 */
+
+ for (i = start; i < end; i++) {
+
+ field = dict_index_get_nth_field(index2, i);
+
+ dict_index_add_col(index1, index2->table, field->col,
+ field->prefix_len);
+ }
+}
+
+/*******************************************************************//**
+Copies types of fields contained in index to tuple. */
+void
+dict_index_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_index_t* index, /*!< in: index */
+ ulint n_fields) /*!< in: number of
+ field types to copy */
+{
+ ulint i;
+
+ if (dict_index_is_ibuf(index)) {
+ dtuple_set_types_binary(tuple, n_fields);
+
+ return;
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ const dict_field_t* ifield;
+ dtype_t* dfield_type;
+
+ ifield = dict_index_get_nth_field(index, i);
+ dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i));
+ dict_col_copy_type(dict_field_get_col(ifield), dfield_type);
+ if (dict_index_is_spatial(index)
+ && DATA_GEOMETRY_MTYPE(dfield_type->mtype)) {
+ dfield_type->prtype |= DATA_GIS_MBR;
+ }
+ }
+}
+
+/** Copies types of virtual columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value. This function should
+be called right after dtuple_create().
+@param[in,out] tuple data tuple
+@param[in] table table
+*/
+void
+dict_table_copy_v_types(
+ dtuple_t* tuple,
+ const dict_table_t* table)
+{
+ /* The tuple could have more virtual columns than the
+ existing table if we are calling this while creating an
+ index along with adding virtual columns */
+ ulint n_fields = ut_min(dtuple_get_n_v_fields(tuple),
+ static_cast<ulint>(table->n_v_def));
+
+ for (ulint i = 0; i < n_fields; i++) {
+
+ dfield_t* dfield = dtuple_get_nth_v_field(tuple, i);
+ dtype_t* dtype = dfield_get_type(dfield);
+
+ dfield_set_null(dfield);
+ dict_col_copy_type(
+ &(dict_table_get_nth_v_col(table, i)->m_col),
+ dtype);
+ }
+}
+
+/*******************************************************************//**
+Copies types of columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value. This function should
+be called right after dtuple_create(). */
+void
+dict_table_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_table_t* table) /*!< in: table */
+{
+ ulint i;
+
+ for (i = 0; i < dtuple_get_n_fields(tuple); i++) {
+
+ dfield_t* dfield = dtuple_get_nth_field(tuple, i);
+ dtype_t* dtype = dfield_get_type(dfield);
+
+ dfield_set_null(dfield);
+ dict_col_copy_type(dict_table_get_nth_col(table, i), dtype);
+ }
+
+ dict_table_copy_v_types(tuple, table);
+}
+
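+/* Usage sketch (heap is assumed to be a caller-owned mem_heap_t*; per
+the comment above, call this right after dtuple_create()):
+@code
+ dtuple_t* tuple = dtuple_create(heap, dict_table_get_n_cols(table));
+ dict_table_copy_types(tuple, table);
+@endcode
+All fields of the tuple are then SQL NULL with the table's column
+types. */
+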
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the clustered index */
+static
+dict_index_t*
+dict_index_build_internal_clust(
+/*============================*/
+ dict_index_t* index) /*!< in: user representation of
+ a clustered index */
+{
+ dict_table_t* table = index->table;
+ dict_index_t* new_index;
+ dict_field_t* field;
+ ulint trx_id_pos;
+ ulint i;
+ ibool* indexed;
+
+ ut_ad(index->is_primary());
+ ut_ad(!index->has_virtual());
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ /* Create a new index object with certainly enough fields */
+ new_index = dict_mem_index_create(index->table, index->name,
+ index->type,
+ unsigned(index->n_fields
+ + table->n_cols));
+
+ /* Copy other relevant data from the old index struct to the new
+ struct: it inherits the values */
+
+ new_index->n_user_defined_cols = index->n_fields;
+
+ new_index->id = index->id;
+
+ /* Copy the fields of index */
+ dict_index_copy(new_index, index, 0, index->n_fields);
+
+ if (dict_index_is_unique(index)) {
+ /* Only the fields defined so far are needed to identify
+ the index entry uniquely */
+
+ new_index->n_uniq = new_index->n_def;
+ } else {
+ /* Also the row id is needed to identify the entry */
+ new_index->n_uniq = unsigned(new_index->n_def + 1)
+ & dict_index_t::MAX_N_FIELDS;
+ }
+
+ new_index->trx_id_offset = 0;
+
+ /* Add system columns, trx id first */
+
+ trx_id_pos = new_index->n_def;
+
+ compile_time_assert(DATA_ROW_ID == 0);
+ compile_time_assert(DATA_TRX_ID == 1);
+ compile_time_assert(DATA_ROLL_PTR == 2);
+
+ if (!dict_index_is_unique(index)) {
+ dict_index_add_col(new_index, table,
+ dict_table_get_sys_col(
+ table, DATA_ROW_ID),
+ 0);
+ trx_id_pos++;
+ }
+
+ dict_index_add_col(
+ new_index, table,
+ dict_table_get_sys_col(table, DATA_TRX_ID), 0);
+
+ for (i = 0; i < trx_id_pos; i++) {
+
+ ulint fixed_size = dict_col_get_fixed_size(
+ dict_index_get_nth_col(new_index, i),
+ dict_table_is_comp(table));
+
+ if (fixed_size == 0) {
+ new_index->trx_id_offset = 0;
+
+ break;
+ }
+
+ dict_field_t* field = dict_index_get_nth_field(
+ new_index, i);
+ if (field->prefix_len > 0) {
+ new_index->trx_id_offset = 0;
+
+ break;
+ }
+
+ /* Add fixed_size to new_index->trx_id_offset.
+ Because the latter is a bit-field, an overflow
+ can theoretically occur. Check for it. */
+ fixed_size += new_index->trx_id_offset;
+
+ new_index->trx_id_offset = static_cast<unsigned>(fixed_size)
+ & ((1U << 12) - 1);
+
+ if (new_index->trx_id_offset != fixed_size) {
+ /* Overflow. Pretend that this is a
+ variable-length PRIMARY KEY. */
+ ut_ad(0);
+ new_index->trx_id_offset = 0;
+ break;
+ }
+ }
+
+ dict_index_add_col(
+ new_index, table,
+ dict_table_get_sys_col(table, DATA_ROLL_PTR), 0);
+
+ /* Remember the table columns already contained in new_index */
+ indexed = static_cast<ibool*>(
+ ut_zalloc_nokey(table->n_cols * sizeof *indexed));
+
+ /* Mark the table columns already contained in new_index */
+ for (i = 0; i < new_index->n_def; i++) {
+
+ field = dict_index_get_nth_field(new_index, i);
+
+ /* If there is only a prefix of the column in the index
+ field, do not mark the column as contained in the index */
+
+ if (field->prefix_len == 0) {
+
+ indexed[field->col->ind] = TRUE;
+ }
+ }
+
+ /* Add to new_index non-system columns of table not yet included
+ there */
+ for (i = 0; i + DATA_N_SYS_COLS < ulint(table->n_cols); i++) {
+ dict_col_t* col = dict_table_get_nth_col(table, i);
+ ut_ad(col->mtype != DATA_SYS);
+
+ if (!indexed[col->ind]) {
+ dict_index_add_col(new_index, table, col, 0);
+ }
+ }
+
+ ut_free(indexed);
+
+ ut_ad(UT_LIST_GET_LEN(table->indexes) == 0);
+
+ new_index->n_core_null_bytes = table->supports_instant()
+ ? dict_index_t::NO_CORE_NULL_BYTES
+ : static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(new_index->n_nullable)));
+ new_index->cached = TRUE;
+
+ return(new_index);
+}
+
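+/* Worked example (illustrative): for CREATE TABLE t(a INT PRIMARY KEY,
+b INT, c INT), the internal clustered index representation built above
+is (a, DB_TRX_ID, DB_ROLL_PTR, b, c) with n_uniq = 1; for a table
+without a usable PRIMARY KEY the representation starts with DB_ROW_ID
+instead. */
+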
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a non-clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the non-clustered index */
+static
+dict_index_t*
+dict_index_build_internal_non_clust(
+/*================================*/
+ dict_index_t* index) /*!< in: user representation of
+ a non-clustered index */
+{
+ dict_field_t* field;
+ dict_index_t* new_index;
+ dict_index_t* clust_index;
+ dict_table_t* table = index->table;
+ ulint i;
+ ibool* indexed;
+
+ ut_ad(table && index);
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ /* The clustered index should be the first in the list of indexes */
+ clust_index = UT_LIST_GET_FIRST(table->indexes);
+
+ ut_ad(clust_index);
+ ut_ad(dict_index_is_clust(clust_index));
+ ut_ad(!dict_index_is_ibuf(clust_index));
+
+ /* Create a new index */
+ new_index = dict_mem_index_create(
+ index->table, index->name, index->type,
+ ulint(index->n_fields + 1 + clust_index->n_uniq));
+
+ /* Copy other relevant data from the old index
+ struct to the new struct: it inherits the values */
+
+ new_index->n_user_defined_cols = index->n_fields;
+
+ new_index->id = index->id;
+
+ /* Copy fields from index to new_index */
+ dict_index_copy(new_index, index, 0, index->n_fields);
+
+ /* Remember the table columns already contained in new_index */
+ indexed = static_cast<ibool*>(
+ ut_zalloc_nokey(table->n_cols * sizeof *indexed));
+
+ /* Mark the table columns already contained in new_index */
+ for (i = 0; i < new_index->n_def; i++) {
+
+ field = dict_index_get_nth_field(new_index, i);
+
+ if (field->col->is_virtual()) {
+ continue;
+ }
+
+ /* If there is only a prefix of the column in the index
+ field, do not mark the column as contained in the index */
+
+ if (field->prefix_len == 0) {
+
+ indexed[field->col->ind] = TRUE;
+ }
+ }
+
+ /* Add to new_index the columns necessary to determine the clustered
+ index entry uniquely */
+
+ for (i = 0; i < clust_index->n_uniq; i++) {
+
+ field = dict_index_get_nth_field(clust_index, i);
+
+ if (!indexed[field->col->ind]) {
+ dict_index_add_col(new_index, table, field->col,
+ field->prefix_len);
+ } else if (dict_index_is_spatial(index)) {
+ /* For a spatial index, we still need to add
+ the field to the index. */
+ dict_index_add_col(new_index, table, field->col,
+ field->prefix_len);
+ }
+ }
+
+ ut_free(indexed);
+
+ if (dict_index_is_unique(index)) {
+ new_index->n_uniq = index->n_fields;
+ } else {
+ new_index->n_uniq = new_index->n_def;
+ }
+
+ /* Set the n_fields value in new_index to the actual defined
+ number of fields */
+
+ new_index->n_fields = new_index->n_def;
+
+ new_index->cached = TRUE;
+
+ return(new_index);
+}
+
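+/* Worked example (illustrative): continuing with t(a INT PRIMARY KEY,
+b INT, c INT), a secondary index on (b) is represented internally as
+(b, a): the clustered key column a is appended so that the entry can
+locate the clustered index record. */
+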
+/***********************************************************************
+Builds the internal dictionary cache representation for an FTS index.
+@return own: the internal representation of the FTS index */
+static
+dict_index_t*
+dict_index_build_internal_fts(
+/*==========================*/
+ dict_index_t* index) /*!< in: user representation of an FTS index */
+{
+ dict_index_t* new_index;
+
+ ut_ad(index->type == DICT_FTS);
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ /* Create a new index */
+ new_index = dict_mem_index_create(index->table, index->name,
+ index->type, index->n_fields);
+
+ /* Copy other relevant data from the old index struct to the new
+ struct: it inherits the values */
+
+ new_index->n_user_defined_cols = index->n_fields;
+
+ new_index->id = index->id;
+
+ /* Copy fields from index to new_index */
+ dict_index_copy(new_index, index, 0, index->n_fields);
+
+ new_index->n_uniq = 0;
+ new_index->cached = TRUE;
+
+ dict_table_t* table = index->table;
+
+ if (table->fts->cache == NULL) {
+ table->fts->cache = fts_cache_create(table);
+ }
+
+ rw_lock_x_lock(&table->fts->cache->init_lock);
+ /* Notify the FTS cache about this index. */
+ fts_cache_index_cache_create(table, new_index);
+ rw_lock_x_unlock(&table->fts->cache->init_lock);
+
+ return(new_index);
+}
+
+/*====================== FOREIGN KEY PROCESSING ========================*/
+
+/*********************************************************************//**
+Checks if a table is referenced by foreign keys.
+@return TRUE if table is referenced by a foreign key */
+ibool
+dict_table_is_referenced_by_foreign_key(
+/*====================================*/
+ const dict_table_t* table) /*!< in: InnoDB table */
+{
+ return(!table->referenced_set.empty());
+}
+
+/**********************************************************************//**
+Removes a foreign constraint struct from the dictionary cache. */
+void
+dict_foreign_remove_from_cache(
+/*===========================*/
+ dict_foreign_t* foreign) /*!< in, own: foreign constraint */
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+ ut_a(foreign);
+
+ if (foreign->referenced_table != NULL) {
+ foreign->referenced_table->referenced_set.erase(foreign);
+ }
+
+ if (foreign->foreign_table != NULL) {
+ foreign->foreign_table->foreign_set.erase(foreign);
+ }
+
+ dict_foreign_free(foreign);
+}
+
+/**********************************************************************//**
+Looks for the foreign constraint from the foreign and referenced lists
+of a table.
+@return foreign constraint */
+static
+dict_foreign_t*
+dict_foreign_find(
+/*==============*/
+ dict_table_t* table, /*!< in: table object */
+ dict_foreign_t* foreign) /*!< in: foreign constraint */
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ ut_ad(dict_foreign_set_validate(table->foreign_set));
+ ut_ad(dict_foreign_set_validate(table->referenced_set));
+
+ dict_foreign_set::iterator it = table->foreign_set.find(foreign);
+
+ if (it != table->foreign_set.end()) {
+ return(*it);
+ }
+
+ it = table->referenced_set.find(foreign);
+
+ if (it != table->referenced_set.end()) {
+ return(*it);
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Tries to find an index whose first fields are the columns in the array
+in the same order, which is not marked for deletion and is not the same
+as types_idx.
+@return matching index, NULL if not found */
+dict_index_t*
+dict_foreign_find_index(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols, /*!< in: number of columns */
+ const dict_index_t* types_idx,
+ /*!< in: NULL or an index
+ whose types the column types
+ must match */
+ bool check_charsets,
+ /*!< in: whether to check
+ charsets. only has an effect
+ if types_idx != NULL */
+ ulint check_null,
+ /*!< in: nonzero if none of
+ the columns must be declared
+ NOT NULL */
+ fkerr_t* error, /*!< out: error code */
+ ulint* err_col_no,
+ /*!< out: column number where
+ error happened */
+ dict_index_t** err_index)
+ /*!< out: index where error
+ happened */
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ if (error) {
+ *error = FK_INDEX_NOT_FOUND;
+ }
+
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index;
+ index = dict_table_get_next_index(index)) {
+ if (types_idx != index
+ && !index->to_be_dropped
+ && !dict_index_is_online_ddl(index)
+ && dict_foreign_qualify_index(
+ table, col_names, columns, n_cols,
+ index, types_idx,
+ check_charsets, check_null,
+ error, err_col_no, err_index)) {
+ if (error) {
+ *error = FK_SUCCESS;
+ }
+
+ return(index);
+ }
+ }
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Report an error in a foreign key definition. */
+static
+void
+dict_foreign_error_report_low(
+/*==========================*/
+ FILE* file, /*!< in: output stream */
+ const char* name) /*!< in: table name */
+{
+ rewind(file);
+ ut_print_timestamp(file);
+ fprintf(file, " Error in foreign key constraint of table %s:\n",
+ name);
+}
+
+/**********************************************************************//**
+Report an error in a foreign key definition. */
+static
+void
+dict_foreign_error_report(
+/*======================*/
+ FILE* file, /*!< in: output stream */
+ dict_foreign_t* fk, /*!< in: foreign key constraint */
+ const char* msg) /*!< in: the error message */
+{
+ std::string fk_str;
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(file, fk->foreign_table_name);
+ fputs(msg, file);
+ fputs(" Constraint:\n", file);
+ fk_str = dict_print_info_on_foreign_key_in_create_format(NULL, fk, TRUE);
+ fputs(fk_str.c_str(), file);
+ putc('\n', file);
+ if (fk->foreign_index) {
+ fprintf(file, "The index in the foreign key in table is"
+ " %s\n%s\n", fk->foreign_index->name(),
+ FOREIGN_KEY_CONSTRAINTS_MSG);
+ }
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/**********************************************************************//**
+Adds a foreign key constraint object to the dictionary cache. May free
+the object if there already is an object with the same identifier in
+the cache.
+At least one of the foreign table and the referenced table must already
+be in the dictionary cache!
+@return DB_SUCCESS or error code */
+dberr_t
+dict_foreign_add_to_cache(
+/*======================*/
+ dict_foreign_t* foreign,
+ /*!< in, own: foreign key constraint */
+ const char** col_names,
+ /*!< in: column names, or NULL to use
+ foreign->foreign_table->col_names */
+ bool check_charsets,
+ /*!< in: whether to check charset
+ compatibility */
+ dict_err_ignore_t ignore_err)
+ /*!< in: error to be ignored */
+{
+ dict_table_t* for_table;
+ dict_table_t* ref_table;
+ dict_foreign_t* for_in_cache = NULL;
+ dict_index_t* index;
+ ibool added_to_referenced_list = FALSE;
+ FILE* ef = dict_foreign_err_file;
+
+ DBUG_ENTER("dict_foreign_add_to_cache");
+ DBUG_PRINT("dict_foreign_add_to_cache", ("id: %s", foreign->id));
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ for_table = dict_table_check_if_in_cache_low(
+ foreign->foreign_table_name_lookup);
+
+ ref_table = dict_table_check_if_in_cache_low(
+ foreign->referenced_table_name_lookup);
+ ut_a(for_table || ref_table);
+
+ if (for_table) {
+ for_in_cache = dict_foreign_find(for_table, foreign);
+ }
+
+ if (!for_in_cache && ref_table) {
+ for_in_cache = dict_foreign_find(ref_table, foreign);
+ }
+
+ if (for_in_cache) {
+ dict_foreign_free(foreign);
+ } else {
+ for_in_cache = foreign;
+ }
+
+ if (ref_table && !for_in_cache->referenced_table) {
+ index = dict_foreign_find_index(
+ ref_table, NULL,
+ for_in_cache->referenced_col_names,
+ for_in_cache->n_fields, for_in_cache->foreign_index,
+ check_charsets, false);
+
+ if (index == NULL
+ && !(ignore_err & DICT_ERR_IGNORE_FK_NOKEY)) {
+ dict_foreign_error_report(
+ ef, for_in_cache,
+ "there is no index in referenced table"
+ " which would contain\n"
+ "the columns as the first columns,"
+ " or the data types in the\n"
+ "referenced table do not match"
+ " the ones in table.");
+
+ if (for_in_cache == foreign) {
+ dict_foreign_free(foreign);
+ }
+
+ DBUG_RETURN(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ for_in_cache->referenced_table = ref_table;
+ for_in_cache->referenced_index = index;
+
+ std::pair<dict_foreign_set::iterator, bool> ret
+ = ref_table->referenced_set.insert(for_in_cache);
+
+ ut_a(ret.second); /* second is true if the insertion
+ took place */
+ added_to_referenced_list = TRUE;
+ }
+
+ if (for_table && !for_in_cache->foreign_table) {
+ index = dict_foreign_find_index(
+ for_table, col_names,
+ for_in_cache->foreign_col_names,
+ for_in_cache->n_fields,
+ for_in_cache->referenced_index, check_charsets,
+ for_in_cache->type
+ & (DICT_FOREIGN_ON_DELETE_SET_NULL
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL));
+
+ if (index == NULL
+ && !(ignore_err & DICT_ERR_IGNORE_FK_NOKEY)) {
+ dict_foreign_error_report(
+ ef, for_in_cache,
+ "there is no index in the table"
+ " which would contain\n"
+ "the columns as the first columns,"
+ " or the data types in the\n"
+ "table do not match"
+ " the ones in the referenced table\n"
+ "or one of the ON ... SET NULL columns"
+ " is declared NOT NULL.");
+
+ if (for_in_cache == foreign) {
+ if (added_to_referenced_list) {
+ const dict_foreign_set::size_type
+ n = ref_table->referenced_set
+ .erase(for_in_cache);
+
+ ut_a(n == 1); /* the number of
+ elements removed must
+ be one */
+ }
+
+ dict_foreign_free(foreign);
+ }
+
+ DBUG_RETURN(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ for_in_cache->foreign_table = for_table;
+ for_in_cache->foreign_index = index;
+
+ std::pair<dict_foreign_set::iterator, bool> ret
+ = for_table->foreign_set.insert(for_in_cache);
+
+ ut_a(ret.second); /* second is true if the insertion
+ took place */
+ }
+
+ /* We need to move the table to the non-LRU end of the table LRU
+ list. Otherwise it will be evicted from the cache. */
+
+ if (ref_table != NULL) {
+ dict_sys.prevent_eviction(ref_table);
+ }
+
+ if (for_table != NULL) {
+ dict_sys.prevent_eviction(for_table);
+ }
+
+ ut_ad(dict_lru_validate());
+ DBUG_RETURN(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Scans from the pointer onwards. Stops if it is at the start of a copy of
+'string', where characters are compared case-insensitively and only
+outside `` or "" quotes. Also stops at NUL.
+@return scanned up to this */
+static
+const char*
+dict_scan_to(
+/*=========*/
+ const char* ptr, /*!< in: scan from */
+ const char* string) /*!< in: look for this */
+{
+ char quote = '\0';
+ bool escape = false;
+
+ for (; *ptr; ptr++) {
+ if (*ptr == quote) {
+ /* Closing quote character: do not look for
+ starting quote or the keyword. */
+
+ /* If the quote character is escaped by a
+ backslash, ignore it. */
+ if (escape) {
+ escape = false;
+ } else {
+ quote = '\0';
+ }
+ } else if (quote) {
+ /* Within quotes: do nothing. */
+ if (escape) {
+ escape = false;
+ } else if (*ptr == '\\') {
+ escape = true;
+ }
+ } else if (*ptr == '`' || *ptr == '"' || *ptr == '\'') {
+ /* Starting quote: remember the quote character. */
+ quote = *ptr;
+ } else {
+ /* Outside quotes: look for the keyword. */
+ ulint i;
+ for (i = 0; string[i]; i++) {
+ if (toupper((int)(unsigned char)(ptr[i]))
+ != toupper((int)(unsigned char)
+ (string[i]))) {
+ goto nomatch;
+ }
+ }
+ break;
+nomatch:
+ ;
+ }
+ }
+
+ return(ptr);
+}
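+
+/* Editorial example (illustrative, not part of the original source):
+dict_scan_to() ignores keyword matches inside quotes, so given
+
+	const char* sql = "ALTER TABLE `drop` drop FOREIGN KEY fk1";
+	const char* p = dict_scan_to(sql, "DROP");
+
+p points at the unquoted "drop", not at the backquoted identifier,
+because the comparison is case-insensitive and performed only outside
+quotes. */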
+
+/*********************************************************************//**
+Accepts a specified string. Comparisons are case-insensitive.
+@return if the string was accepted, the pointer moved past it; otherwise
+ptr is returned unchanged */
+static
+const char*
+dict_accept(
+/*========*/
+ CHARSET_INFO* cs, /*!< in: the character set of ptr */
+ const char* ptr, /*!< in: scan from this */
+ const char* string, /*!< in: accept only this string as the next
+ non-whitespace string */
+ ibool* success)/*!< out: TRUE if accepted */
+{
+ const char* old_ptr = ptr;
+ const char* old_ptr2;
+
+ *success = FALSE;
+
+ while (my_isspace(cs, *ptr)) {
+ ptr++;
+ }
+
+ old_ptr2 = ptr;
+
+ ptr = dict_scan_to(ptr, string);
+
+ if (*ptr == '\0' || old_ptr2 != ptr) {
+ return(old_ptr);
+ }
+
+ *success = TRUE;
+
+ return ptr + strlen(string);
+}
+
+/*********************************************************************//**
+Scans an id. For the lexical definition of an 'id', see the code below.
+Strips backquotes or double quotes from around the id.
+@return scanned to */
+static
+const char*
+dict_scan_id(
+/*=========*/
+ CHARSET_INFO* cs, /*!< in: the character set of ptr */
+ const char* ptr, /*!< in: scanned to */
+ mem_heap_t* heap, /*!< in: heap where to allocate the id
+ (NULL=id will not be allocated, but it
+ will point to string near ptr) */
+ const char** id, /*!< out,own: the id; NULL if no id was
+ scannable */
+ ibool table_id,/*!< in: TRUE=convert the allocated id
+ as a table name; FALSE=convert to UTF-8 */
+ ibool accept_also_dot)
+ /*!< in: TRUE if also a dot can appear in a
+					non-quoted id; in a quoted id it can
+					always appear */
+{
+ char quote = '\0';
+ ulint len = 0;
+ const char* s;
+ char* str;
+ char* dst;
+
+ *id = NULL;
+
+ while (my_isspace(cs, *ptr)) {
+ ptr++;
+ }
+
+ if (*ptr == '\0') {
+
+ return(ptr);
+ }
+
+ if (*ptr == '`' || *ptr == '"') {
+ quote = *ptr++;
+ }
+
+ s = ptr;
+
+ if (quote) {
+ for (;;) {
+ if (!*ptr) {
+ /* Syntax error */
+ return(ptr);
+ }
+ if (*ptr == quote) {
+ ptr++;
+ if (*ptr != quote) {
+ break;
+ }
+ }
+ ptr++;
+ len++;
+ }
+ } else {
+ while (!my_isspace(cs, *ptr) && *ptr != '(' && *ptr != ')'
+ && (accept_also_dot || *ptr != '.')
+ && *ptr != ',' && *ptr != '\0') {
+
+ ptr++;
+ }
+
+ len = ulint(ptr - s);
+ }
+
+ if (heap == NULL) {
+ /* no heap given: id will point to source string */
+ *id = s;
+ return(ptr);
+ }
+
+ if (quote) {
+ char* d;
+
+ str = d = static_cast<char*>(
+ mem_heap_alloc(heap, len + 1));
+
+ while (len--) {
+ if ((*d++ = *s++) == quote) {
+ s++;
+ }
+ }
+ *d++ = 0;
+ len = ulint(d - str);
+ ut_ad(*s == quote);
+ ut_ad(s + 1 == ptr);
+ } else {
+ str = mem_heap_strdupl(heap, s, len);
+ }
+
+ if (!table_id) {
+convert_id:
+ /* Convert the identifier from connection character set
+ to UTF-8. */
+ len = 3 * len + 1;
+ *id = dst = static_cast<char*>(mem_heap_alloc(heap, len));
+
+ innobase_convert_from_id(cs, dst, str, len);
+ } else if (!strncmp(str, srv_mysql50_table_name_prefix,
+ sizeof(srv_mysql50_table_name_prefix) - 1)) {
+ /* This is a pre-5.1 table name
+ containing chars other than [A-Za-z0-9].
+ Discard the prefix and use raw UTF-8 encoding. */
+ str += sizeof(srv_mysql50_table_name_prefix) - 1;
+ len -= sizeof(srv_mysql50_table_name_prefix) - 1;
+ goto convert_id;
+ } else {
+ /* Encode using filename-safe characters. */
+ len = 5 * len + 1;
+ *id = dst = static_cast<char*>(mem_heap_alloc(heap, len));
+
+ innobase_convert_from_table_id(cs, dst, str, len);
+ }
+
+ return(ptr);
+}
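+
+/* Editorial example (illustrative): inside a quoted id, the quote
+character is escaped by doubling it, so scanning the input `a``b`
+yields the three-character id a`b:
+
+	const char* id;
+	ptr = dict_scan_id(cs, "`a``b`", heap, &id, FALSE, TRUE);
+	// id is now the UTF-8 string "a`b"
+
+(cs and heap are assumed to be in scope.) */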
+
+/*********************************************************************//**
+Opens a table from its database and table name; this is currently used by
+the foreign key constraint parser to get the referenced table.
+@return complete table name with database and table name, allocated from
+heap memory passed in */
+char*
+dict_get_referenced_table(
+ const char* name, /*!< in: foreign key table name */
+ const char* database_name, /*!< in: table db name */
+ ulint database_name_len, /*!< in: db name length */
+ const char* table_name, /*!< in: table name */
+ ulint table_name_len, /*!< in: table name length */
+ dict_table_t** table, /*!< out: table object or NULL */
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ CHARSET_INFO* from_cs) /*!< in: table name charset */
+{
+ char* ref;
+ char db_name[MAX_DATABASE_NAME_LEN];
+ char tbl_name[MAX_TABLE_NAME_LEN];
+ CHARSET_INFO* to_cs = &my_charset_filename;
+ uint errors;
+ ut_ad(database_name || name);
+ ut_ad(table_name);
+
+ if (!strncmp(table_name, srv_mysql50_table_name_prefix,
+ sizeof(srv_mysql50_table_name_prefix) - 1)) {
+ /* This is a pre-5.1 table name
+ containing chars other than [A-Za-z0-9].
+ Discard the prefix and use raw UTF-8 encoding. */
+ table_name += sizeof(srv_mysql50_table_name_prefix) - 1;
+ table_name_len -= sizeof(srv_mysql50_table_name_prefix) - 1;
+
+ to_cs = system_charset_info;
+ }
+
+ table_name_len = strconvert(from_cs, table_name, table_name_len, to_cs,
+ tbl_name, MAX_TABLE_NAME_LEN, &errors);
+ table_name = tbl_name;
+
+ if (database_name) {
+ to_cs = &my_charset_filename;
+ if (!strncmp(database_name, srv_mysql50_table_name_prefix,
+ sizeof(srv_mysql50_table_name_prefix) - 1)) {
+ database_name
+ += sizeof(srv_mysql50_table_name_prefix) - 1;
+ database_name_len
+ -= sizeof(srv_mysql50_table_name_prefix) - 1;
+ to_cs = system_charset_info;
+ }
+
+ database_name_len = strconvert(
+ from_cs, database_name, database_name_len, to_cs,
+ db_name, MAX_DATABASE_NAME_LEN, &errors);
+ database_name = db_name;
+ } else {
+ /* Use the database name of the foreign key table */
+
+ database_name = name;
+ database_name_len = dict_get_db_name_len(name);
+ }
+
+ /* Copy database_name, '/', table_name, '\0' */
+ ref = static_cast<char*>(mem_heap_alloc(
+ heap, database_name_len + table_name_len + 2));
+ memcpy(ref, database_name, database_name_len);
+ ref[database_name_len] = '/';
+ memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
+
+	/* Values: 0 = Store and compare as given; case sensitive
+ 1 = Store and compare in lower; case insensitive
+ 2 = Store as given, compare in lower; case semi-sensitive */
+ if (innobase_get_lower_case_table_names() == 2) {
+ innobase_casedn_str(ref);
+ *table = dict_table_get_low(ref);
+ memcpy(ref, database_name, database_name_len);
+ ref[database_name_len] = '/';
+ memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
+
+ } else {
+#ifndef _WIN32
+ if (innobase_get_lower_case_table_names() == 1) {
+ innobase_casedn_str(ref);
+ }
+#else
+ innobase_casedn_str(ref);
+#endif /* !_WIN32 */
+ *table = dict_table_get_low(ref);
+ }
+
+ return(ref);
+}
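+
+/* Editorial usage sketch (illustrative): the returned name is in the
+internal "database/table" format. For a constraint in table "test/child"
+that references `parent` without an explicit database, the database of
+the foreign key table is reused:
+
+	dict_table_t* parent;
+	char* ref = dict_get_referenced_table(
+		"test/child", NULL, 0, "parent", 6, &parent, heap, cs);
+	// ref == "test/parent"; parent is NULL if not in the cache
+
+(heap and cs are assumed to be in scope.) */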
+
+/*********************************************************************//**
+Removes MySQL comments from an SQL string. A comment is either
+(a) '#' to the end of the line,
+(b) '--[space]' to the end of the line, or
+(c) '[slash][asterisk]' till the next '[asterisk][slash]' (like the familiar
+C comment syntax).
+@return own: SQL string stripped of comments; the caller must free
+this with ut_free()! */
+static
+char*
+dict_strip_comments(
+/*================*/
+ const char* sql_string, /*!< in: SQL string */
+ size_t sql_length) /*!< in: length of sql_string */
+{
+ char* str;
+ const char* sptr;
+ const char* eptr = sql_string + sql_length;
+ char* ptr;
+ /* unclosed quote character (0 if none) */
+ char quote = 0;
+ bool escape = false;
+
+ DBUG_ENTER("dict_strip_comments");
+
+ DBUG_PRINT("dict_strip_comments", ("%s", sql_string));
+
+ str = static_cast<char*>(ut_malloc_nokey(sql_length + 1));
+
+ sptr = sql_string;
+ ptr = str;
+
+ for (;;) {
+scan_more:
+ if (sptr >= eptr || *sptr == '\0') {
+end_of_string:
+ *ptr = '\0';
+
+ ut_a(ptr <= str + sql_length);
+
+ DBUG_PRINT("dict_strip_comments", ("%s", str));
+ DBUG_RETURN(str);
+ }
+
+ if (*sptr == quote) {
+ /* Closing quote character: do not look for
+ starting quote or comments. */
+
+ /* If the quote character is escaped by a
+ backslash, ignore it. */
+ if (escape) {
+ escape = false;
+ } else {
+ quote = 0;
+ }
+ } else if (quote) {
+ /* Within quotes: do not look for
+ starting quotes or comments. */
+ if (escape) {
+ escape = false;
+ } else if (*sptr == '\\') {
+ escape = true;
+ }
+ } else if (*sptr == '"' || *sptr == '`' || *sptr == '\'') {
+ /* Starting quote: remember the quote character. */
+ quote = *sptr;
+ } else if (*sptr == '#'
+ || (sptr[0] == '-' && sptr[1] == '-'
+ && sptr[2] == ' ')) {
+ for (;;) {
+ if (++sptr >= eptr) {
+ goto end_of_string;
+ }
+
+ /* In Unix a newline is 0x0A while in Windows
+ it is 0x0D followed by 0x0A */
+
+ switch (*sptr) {
+				case (char) 0x0A:
+ case (char) 0x0D:
+ case '\0':
+ goto scan_more;
+ }
+ }
+ } else if (!quote && *sptr == '/' && *(sptr + 1) == '*') {
+ sptr += 2;
+ for (;;) {
+ if (sptr >= eptr) {
+ goto end_of_string;
+ }
+
+ switch (*sptr) {
+ case '\0':
+ goto scan_more;
+ case '*':
+ if (sptr[1] == '/') {
+ sptr += 2;
+ goto scan_more;
+ }
+ }
+
+ sptr++;
+ }
+ }
+
+ *ptr = *sptr;
+
+ ptr++;
+ sptr++;
+ }
+}
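+
+/* Editorial usage sketch (illustrative); the result is owned by the
+caller and must be released with ut_free():
+
+	size_t len;
+	const char* sql = innobase_get_stmt_unsafe(trx->mysql_thd, &len);
+	char* stripped = dict_strip_comments(sql, len);
+	// ... scan `stripped` for keywords ...
+	ut_free(stripped);
+*/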
+
+/*********************************************************************//**
+Finds the highest [number] for foreign key constraints of the table. Looks
+only at the >= 4.0.18-format ids, which are of the form
+databasename/tablename_ibfk_[number].
+@return highest number, 0 if table has no new format foreign key constraints */
+ulint
+dict_table_get_highest_foreign_id(
+/*==============================*/
+ dict_table_t* table) /*!< in: table in the dictionary memory cache */
+{
+ dict_foreign_t* foreign;
+ char* endp;
+ ulint biggest_id = 0;
+ ulint id;
+ ulint len;
+
+ DBUG_ENTER("dict_table_get_highest_foreign_id");
+
+ ut_a(table);
+
+ len = strlen(table->name.m_name);
+
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++it) {
+ char fkid[MAX_TABLE_NAME_LEN+20];
+ foreign = *it;
+
+ strcpy(fkid, foreign->id);
+ /* Convert foreign key identifier on dictionary memory
+ cache to filename charset. */
+ innobase_convert_to_filename_charset(
+ strchr(fkid, '/') + 1,
+ strchr(foreign->id, '/') + 1,
+ MAX_TABLE_NAME_LEN);
+
+ if (strlen(fkid) > ((sizeof dict_ibfk) - 1) + len
+ && 0 == memcmp(fkid, table->name.m_name, len)
+ && 0 == memcmp(fkid + len,
+ dict_ibfk, (sizeof dict_ibfk) - 1)
+ && fkid[len + ((sizeof dict_ibfk) - 1)] != '0') {
+ /* It is of the >= 4.0.18 format */
+
+ id = strtoul(fkid + len
+ + ((sizeof dict_ibfk) - 1),
+ &endp, 10);
+ if (*endp == '\0') {
+ ut_a(id != biggest_id);
+
+ if (id > biggest_id) {
+ biggest_id = id;
+ }
+ }
+ }
+ }
+
+ DBUG_PRINT("dict_table_get_highest_foreign_id",
+ ("id: " ULINTPF, biggest_id));
+
+ DBUG_RETURN(biggest_id);
+}
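+
+/* Editorial worked example (assumed constraint names): for a table
+"test/t1" carrying the constraints "test/t1_ibfk_2" and
+"test/t1_ibfk_10", the function returns 10, so the next auto-generated
+constraint id would be t1_ibfk_11. An id such as "test/t1_ibfk_01" is
+skipped because of the leading '0', and ids with trailing garbage after
+the number are skipped because *endp is not NUL. */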
+
+/**********************************************************************//**
+Parses the CONSTRAINT ids to be dropped in an ALTER TABLE statement.
+@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
+constraint id does not match */
+dberr_t
+dict_foreign_parse_drop_constraints(
+/*================================*/
+ mem_heap_t* heap, /*!< in: heap from which we can
+ allocate memory */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: table */
+ ulint* n, /*!< out: number of constraints
+ to drop */
+ const char*** constraints_to_drop) /*!< out: id's of the
+ constraints to drop */
+{
+ ibool success;
+ char* str;
+ size_t len;
+ const char* ptr;
+ const char* ptr1;
+ const char* id;
+ CHARSET_INFO* cs;
+
+ ut_a(trx->mysql_thd);
+
+ cs = thd_charset(trx->mysql_thd);
+
+ *n = 0;
+
+ *constraints_to_drop = static_cast<const char**>(
+ mem_heap_alloc(heap, 1000 * sizeof(char*)));
+
+ ptr = innobase_get_stmt_unsafe(trx->mysql_thd, &len);
+
+ str = dict_strip_comments(ptr, len);
+
+ ptr = str;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+loop:
+ ptr = dict_scan_to(ptr, "DROP");
+
+ if (*ptr == '\0') {
+ ut_free(str);
+
+ return(DB_SUCCESS);
+ }
+
+ ptr = dict_accept(cs, ptr, "DROP", &success);
+
+ if (!my_isspace(cs, *ptr)) {
+
+ goto loop;
+ }
+
+ ptr = dict_accept(cs, ptr, "FOREIGN", &success);
+
+ if (!success || !my_isspace(cs, *ptr)) {
+
+ goto loop;
+ }
+
+ ptr = dict_accept(cs, ptr, "KEY", &success);
+
+ if (!success) {
+
+ goto syntax_error;
+ }
+
+ ptr1 = dict_accept(cs, ptr, "IF", &success);
+
+ if (success && my_isspace(cs, *ptr1)) {
+ ptr1 = dict_accept(cs, ptr1, "EXISTS", &success);
+ if (success) {
+ ptr = ptr1;
+ }
+ }
+
+ ptr = dict_scan_id(cs, ptr, heap, &id, FALSE, TRUE);
+
+ if (id == NULL) {
+
+ goto syntax_error;
+ }
+
+ ut_a(*n < 1000);
+ (*constraints_to_drop)[*n] = id;
+ (*n)++;
+
+ if (std::find_if(table->foreign_set.begin(),
+ table->foreign_set.end(),
+ dict_foreign_matches_id(id))
+ == table->foreign_set.end()) {
+
+ if (!srv_read_only_mode) {
+ FILE* ef = dict_foreign_err_file;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Error in dropping of a foreign key"
+ " constraint of table ", ef);
+ ut_print_name(ef, NULL, table->name.m_name);
+ fprintf(ef, ",\nin SQL command\n%s"
+ "\nCannot find a constraint with the"
+ " given id %s.\n", str, id);
+ mutex_exit(&dict_foreign_err_mutex);
+ }
+
+ ut_free(str);
+
+ return(DB_CANNOT_DROP_CONSTRAINT);
+ }
+
+ goto loop;
+
+syntax_error:
+ if (!srv_read_only_mode) {
+ FILE* ef = dict_foreign_err_file;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Syntax error in dropping of a"
+ " foreign key constraint of table ", ef);
+ ut_print_name(ef, NULL, table->name.m_name);
+ fprintf(ef, ",\n"
+ "close to:\n%s\n in SQL command\n%s\n", ptr, str);
+ mutex_exit(&dict_foreign_err_mutex);
+ }
+
+ ut_free(str);
+
+ return(DB_CANNOT_DROP_CONSTRAINT);
+}
+
+/*==================== END OF FOREIGN KEY PROCESSING ====================*/
+
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+Assumes that dict_sys.mutex is already being held.
+@return index, NULL if not found */
+dict_index_t*
+dict_index_get_if_in_cache_low(
+/*===========================*/
+ index_id_t index_id) /*!< in: index id */
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ return(dict_index_find_on_id_low(index_id));
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+ index_id_t index_id) /*!< in: index id */
+{
+ dict_index_t* index;
+
+ if (!dict_sys.is_initialised()) {
+ return(NULL);
+ }
+
+ mutex_enter(&dict_sys.mutex);
+
+ index = dict_index_get_if_in_cache_low(index_id);
+
+ mutex_exit(&dict_sys.mutex);
+
+ return(index);
+}
+
+/**********************************************************************//**
+Checks that a tuple has n_fields_cmp value in a sensible range, so that
+no comparison can occur with the page number field in a node pointer.
+@return TRUE if ok */
+ibool
+dict_index_check_search_tuple(
+/*==========================*/
+ const dict_index_t* index, /*!< in: index tree */
+ const dtuple_t* tuple) /*!< in: tuple used in a search */
+{
+ ut_ad(dtuple_get_n_fields_cmp(tuple)
+ <= dict_index_get_n_unique_in_tree(index));
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Builds a node pointer out of a physical record and a page number.
+@return own: node pointer */
+dtuple_t*
+dict_index_build_node_ptr(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to build node
+ pointer */
+ ulint page_no,/*!< in: page number to put in node
+ pointer */
+ mem_heap_t* heap, /*!< in: memory heap where pointer
+ created */
+ ulint level) /*!< in: level of rec in tree:
+ 0 means leaf level */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ byte* buf;
+ ulint n_unique;
+
+ if (dict_index_is_ibuf(index)) {
+ /* In a universal index tree, we take the whole record as
+ the node pointer if the record is on the leaf level,
+ on non-leaf levels we remove the last field, which
+ contains the page number of the child page */
+
+ ut_a(!dict_table_is_comp(index->table));
+ n_unique = rec_get_n_fields_old(rec);
+
+ if (level > 0) {
+ ut_a(n_unique > 1);
+ n_unique--;
+ }
+ } else {
+ n_unique = dict_index_get_n_unique_in_tree_nonleaf(index);
+ }
+
+ tuple = dtuple_create(heap, n_unique + 1);
+
+ /* When searching in the tree for the node pointer, we must not do
+ comparison on the last field, the page number field, as on upper
+ levels in the tree there may be identical node pointers with a
+ different page number; therefore, we set the n_fields_cmp to one
+ less: */
+
+ dtuple_set_n_fields_cmp(tuple, n_unique);
+
+ dict_index_copy_types(tuple, index, n_unique);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ mach_write_to_4(buf, page_no);
+
+ field = dtuple_get_nth_field(tuple, n_unique);
+ dfield_set_data(field, buf, 4);
+
+ dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4);
+
+ rec_copy_prefix_to_dtuple(tuple, rec, index,
+ level ? 0 : index->n_core_fields,
+ n_unique, heap);
+ dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple)
+ | REC_STATUS_NODE_PTR);
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ return(tuple);
+}
+
+/** Convert a physical record into a search tuple.
+@param[in] rec index record (not necessarily in an index page)
+@param[in] index index
+@param[in] leaf whether rec is in a leaf page
+@param[in] n_fields number of data fields
+@param[in,out] heap memory heap for allocation
+@return own: data tuple */
+dtuple_t*
+dict_index_build_data_tuple(
+ const rec_t* rec,
+ const dict_index_t* index,
+ bool leaf,
+ ulint n_fields,
+ mem_heap_t* heap)
+{
+ ut_ad(!index->is_clust());
+
+ dtuple_t* tuple = dtuple_create(heap, n_fields);
+
+ dict_index_copy_types(tuple, index, n_fields);
+
+ rec_copy_prefix_to_dtuple(tuple, rec, index,
+ leaf ? n_fields : 0, n_fields, heap);
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ return(tuple);
+}
+
+/*********************************************************************//**
+Calculates the minimum record length in an index. */
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ulint sum = 0;
+ ulint i;
+ ulint comp = dict_table_is_comp(index->table);
+
+ if (comp) {
+ ulint nullable = 0;
+ sum = REC_N_NEW_EXTRA_BYTES;
+ for (i = 0; i < dict_index_get_n_fields(index); i++) {
+ const dict_col_t* col
+ = dict_index_get_nth_col(index, i);
+ ulint size = dict_col_get_fixed_size(col, comp);
+ sum += size;
+ if (!size) {
+ size = col->len;
+ sum += size < 128 ? 1 : 2;
+ }
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ nullable++;
+ }
+ }
+
+ /* round the NULL flags up to full bytes */
+ sum += UT_BITS_IN_BYTES(nullable);
+
+ return(sum);
+ }
+
+ for (i = 0; i < dict_index_get_n_fields(index); i++) {
+ sum += dict_col_get_fixed_size(
+ dict_index_get_nth_col(index, i), comp);
+ }
+
+ if (sum > 127) {
+ sum += 2 * dict_index_get_n_fields(index);
+ } else {
+ sum += dict_index_get_n_fields(index);
+ }
+
+ sum += REC_N_OLD_EXTRA_BYTES;
+
+ return(sum);
+}
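+
+/* Editorial worked example (assumed column layout): for a
+ROW_FORMAT=COMPACT index over a NOT NULL INT (fixed size 4) and a
+nullable VARCHAR(10) (fixed size 0, len 10 < 128), the estimate is
+
+	sum = REC_N_NEW_EXTRA_BYTES  // new-style record header
+	    + 4                      // the fixed-size INT column
+	    + 1                      // one length byte for the short VARCHAR
+	    + UT_BITS_IN_BYTES(1);   // one NULL flag, rounded up to a byte
+*/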
+
+/**********************************************************************//**
+Outputs info on a foreign key of a table in a format suitable for
+CREATE TABLE. */
+std::string
+dict_print_info_on_foreign_key_in_create_format(
+/*============================================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ ibool add_newline) /*!< in: whether to add a newline */
+{
+ const char* stripped_id;
+ ulint i;
+ std::string str;
+
+ if (strchr(foreign->id, '/')) {
+ /* Strip the preceding database name from the constraint id */
+ stripped_id = foreign->id + 1
+ + dict_get_db_name_len(foreign->id);
+ } else {
+ stripped_id = foreign->id;
+ }
+
+ str.append(",");
+
+ if (add_newline) {
+ /* SHOW CREATE TABLE wants constraints each printed nicely
+ on its own line, while error messages want no newlines
+ inserted. */
+ str.append("\n ");
+ }
+
+ str.append(" CONSTRAINT ");
+
+ str.append(innobase_quote_identifier(trx, stripped_id));
+ str.append(" FOREIGN KEY (");
+
+ for (i = 0;;) {
+ str.append(innobase_quote_identifier(trx, foreign->foreign_col_names[i]));
+
+ if (++i < foreign->n_fields) {
+ str.append(", ");
+ } else {
+ break;
+ }
+ }
+
+ str.append(") REFERENCES ");
+
+ if (dict_tables_have_same_db(foreign->foreign_table_name_lookup,
+ foreign->referenced_table_name_lookup)) {
+ /* Do not print the database name of the referenced table */
+ str.append(ut_get_name(trx,
+ dict_remove_db_name(
+ foreign->referenced_table_name)));
+ } else {
+ str.append(ut_get_name(trx,
+ foreign->referenced_table_name));
+ }
+
+ str.append(" (");
+
+ for (i = 0;;) {
+ str.append(innobase_quote_identifier(trx,
+ foreign->referenced_col_names[i]));
+
+ if (++i < foreign->n_fields) {
+ str.append(", ");
+ } else {
+ break;
+ }
+ }
+
+ str.append(")");
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) {
+ str.append(" ON DELETE CASCADE");
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) {
+ str.append(" ON DELETE SET NULL");
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) {
+ str.append(" ON DELETE NO ACTION");
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) {
+ str.append(" ON UPDATE CASCADE");
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) {
+ str.append(" ON UPDATE SET NULL");
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) {
+ str.append(" ON UPDATE NO ACTION");
+ }
+
+ return str;
+}
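+
+/* Editorial example (illustrative output): for a constraint "test/fk1"
+on column a referencing parent(id) in the same database, with
+add_newline == TRUE, the function produces roughly
+
+	,\n   CONSTRAINT `fk1` FOREIGN KEY (`a`) REFERENCES `parent` (`id`)
+	ON DELETE CASCADE
+
+i.e. a fragment ready to be appended to SHOW CREATE TABLE output. */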
+
+/**********************************************************************//**
+Outputs info on foreign keys of a table. */
+std::string
+dict_print_info_on_foreign_keys(
+/*============================*/
+ ibool create_table_format, /*!< in: if TRUE then print in
+ a format suitable to be inserted into
+ a CREATE TABLE, otherwise in the format
+ of SHOW TABLE STATUS */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table) /*!< in: table */
+{
+ dict_foreign_t* foreign;
+ std::string str;
+
+ mutex_enter(&dict_sys.mutex);
+
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (create_table_format) {
+ str.append(
+ dict_print_info_on_foreign_key_in_create_format(
+ trx, foreign, TRUE));
+ } else {
+ ulint i;
+ str.append("; (");
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ if (i) {
+ str.append(" ");
+ }
+
+ str.append(innobase_quote_identifier(trx,
+ foreign->foreign_col_names[i]));
+ }
+
+ str.append(") REFER ");
+ str.append(ut_get_name(trx,
+ foreign->referenced_table_name));
+ str.append(")");
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ if (i) {
+ str.append(" ");
+ }
+ str.append(innobase_quote_identifier(
+ trx,
+ foreign->referenced_col_names[i]));
+ }
+
+ str.append(")");
+
+ if (foreign->type == DICT_FOREIGN_ON_DELETE_CASCADE) {
+ str.append(" ON DELETE CASCADE");
+ }
+
+ if (foreign->type == DICT_FOREIGN_ON_DELETE_SET_NULL) {
+ str.append(" ON DELETE SET NULL");
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) {
+ str.append(" ON DELETE NO ACTION");
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) {
+ str.append(" ON UPDATE CASCADE");
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) {
+ str.append(" ON UPDATE SET NULL");
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) {
+ str.append(" ON UPDATE NO ACTION");
+ }
+ }
+ }
+
+ mutex_exit(&dict_sys.mutex);
+ return str;
+}
+
+/** Given a space_id of a file-per-table tablespace, search the
+dict_sys.table_LRU list and return the dict_table_t* pointer for it.
+@param space tablespace
+@return table if found, NULL if not */
+static
+dict_table_t*
+dict_find_single_table_by_space(const fil_space_t* space)
+{
+ dict_table_t* table;
+ ulint num_item;
+ ulint count = 0;
+
+ ut_ad(space->id > 0);
+
+ if (!dict_sys.is_initialised()) {
+		/* This could happen during redo log processing. */
+ return(NULL);
+ }
+
+ table = UT_LIST_GET_FIRST(dict_sys.table_LRU);
+ num_item = UT_LIST_GET_LEN(dict_sys.table_LRU);
+
+	/* This function intentionally does not acquire the mutex, as it
+	is used by error handling code deep in the call stack as a last
+	resort to avoid killing the server, so it is worth risking some
+	consequences of that choice. */
+ while (table && count < num_item) {
+ if (table->space == space) {
+ if (dict_table_is_file_per_table(table)) {
+ return(table);
+ }
+ return(NULL);
+ }
+
+ table = UT_LIST_GET_NEXT(table_LRU, table);
+ count++;
+ }
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Flags the table with the specified space_id as corrupted in the data
+dictionary cache.
+@return true if successful */
+bool dict_set_corrupted_by_space(const fil_space_t* space)
+{
+ dict_table_t* table;
+
+ table = dict_find_single_table_by_space(space);
+
+ if (!table) {
+ return false;
+ }
+
+ /* mark the table->corrupted bit only, since the caller
+ could be too deep in the stack for SYS_INDEXES update */
+ table->corrupted = true;
+ table->file_unreadable = true;
+ return true;
+}
+
+/** Flag a table encrypted in the data dictionary cache. */
+void dict_set_encrypted_by_space(const fil_space_t* space)
+{
+ if (dict_table_t* table = dict_find_single_table_by_space(space)) {
+ table->file_unreadable = true;
+ }
+}
+
+/**********************************************************************//**
+Flags an index corrupted both in the data dictionary cache
+and in the SYS_INDEXES */
+void
+dict_set_corrupted(
+/*===============*/
+ dict_index_t* index, /*!< in/out: index */
+ trx_t* trx, /*!< in/out: transaction */
+ const char* ctx) /*!< in: context */
+{
+ mem_heap_t* heap;
+ mtr_t mtr;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ byte* buf;
+ const char* status;
+ btr_cur_t cursor;
+ bool locked = RW_X_LATCH == trx->dict_operation_lock_mode;
+
+ if (!locked) {
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(!dict_table_is_comp(dict_sys.sys_tables));
+ ut_ad(!dict_table_is_comp(dict_sys.sys_indexes));
+ ut_ad(!sync_check_iterate(dict_sync_check()));
+
+ /* Mark the table as corrupted only if the clustered index
+ is corrupted */
+ if (dict_index_is_clust(index)) {
+ index->table->corrupted = TRUE;
+ }
+
+ if (index->type & DICT_CORRUPT) {
+ /* The index was already flagged corrupted. */
+ ut_ad(!dict_index_is_clust(index) || index->table->corrupted);
+ goto func_exit;
+ }
+
+	/* In read-only mode, do not update SYS_INDEXES; just
+	mark the index as corrupted in memory */
+ if (high_level_read_only) {
+ index->type |= DICT_CORRUPT;
+ goto func_exit;
+ }
+
+ heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t)
+ + sizeof(que_fork_t) + sizeof(upd_node_t)
+ + sizeof(upd_t) + 12));
+ mtr_start(&mtr);
+ index->type |= DICT_CORRUPT;
+
+ sys_index = UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes);
+
+ /* Find the index row in SYS_INDEXES */
+ tuple = dtuple_create(heap, 2);
+
+ dfield = dtuple_get_nth_field(tuple, 0);
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, index->table->id);
+ dfield_set_data(dfield, buf, 8);
+
+ dfield = dtuple_get_nth_field(tuple, 1);
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, index->id);
+ dfield_set_data(dfield, buf, 8);
+
+ dict_index_copy_types(tuple, sys_index, 2);
+
+ btr_cur_search_to_nth_level(sys_index, 0, tuple, PAGE_CUR_LE,
+ BTR_MODIFY_LEAF,
+ &cursor, 0, __FILE__, __LINE__, &mtr);
+
+ if (cursor.low_match == dtuple_get_n_fields(tuple)) {
+ /* UPDATE SYS_INDEXES SET TYPE=index->type
+ WHERE TABLE_ID=index->table->id AND INDEX_ID=index->id */
+ ulint len;
+ byte* field = rec_get_nth_field_old(
+ btr_cur_get_rec(&cursor),
+ DICT_FLD__SYS_INDEXES__TYPE, &len);
+ if (len != 4) {
+ goto fail;
+ }
+ mtr.write<4>(*btr_cur_get_block(&cursor), field, index->type);
+ status = "Flagged";
+ } else {
+fail:
+ status = "Unable to flag";
+ }
+
+ mtr_commit(&mtr);
+ mem_heap_empty(heap);
+ ib::error() << status << " corruption of " << index->name
+ << " in table " << index->table->name << " in " << ctx;
+ mem_heap_free(heap);
+
+func_exit:
+ if (!locked) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+}
+
+/** Flags an index as corrupted in the data dictionary cache only. This
+is used mostly to mark a corrupted index when the index's own dictionary
+information is corrupted, and we force such an index to be loaded for
+repair purposes.
+@param[in,out] index index which is corrupted */
+void
+dict_set_corrupted_index_cache_only(
+ dict_index_t* index)
+{
+ ut_ad(index != NULL);
+ ut_ad(index->table != NULL);
+ ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(!dict_table_is_comp(dict_sys.sys_tables));
+ ut_ad(!dict_table_is_comp(dict_sys.sys_indexes));
+
+ /* Mark the table as corrupted only if the clustered index
+ is corrupted */
+ if (dict_index_is_clust(index)) {
+ index->table->corrupted = TRUE;
+ index->table->file_unreadable = true;
+ }
+
+ index->type |= DICT_CORRUPT;
+}
+
+/** Sets merge_threshold in the SYS_INDEXES
+@param[in,out] index index
+@param[in] merge_threshold value to set */
+void
+dict_index_set_merge_threshold(
+ dict_index_t* index,
+ ulint merge_threshold)
+{
+ mem_heap_t* heap;
+ mtr_t mtr;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ byte* buf;
+ btr_cur_t cursor;
+
+ ut_ad(index != NULL);
+ ut_ad(!dict_table_is_comp(dict_sys.sys_tables));
+ ut_ad(!dict_table_is_comp(dict_sys.sys_indexes));
+
+ dict_sys_lock();
+
+ heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t)
+ + sizeof(que_fork_t) + sizeof(upd_node_t)
+ + sizeof(upd_t) + 12));
+
+ mtr_start(&mtr);
+
+ sys_index = UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes);
+
+ /* Find the index row in SYS_INDEXES */
+ tuple = dtuple_create(heap, 2);
+
+ dfield = dtuple_get_nth_field(tuple, 0);
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, index->table->id);
+ dfield_set_data(dfield, buf, 8);
+
+ dfield = dtuple_get_nth_field(tuple, 1);
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, index->id);
+ dfield_set_data(dfield, buf, 8);
+
+ dict_index_copy_types(tuple, sys_index, 2);
+
+ btr_cur_search_to_nth_level(sys_index, 0, tuple, PAGE_CUR_GE,
+ BTR_MODIFY_LEAF,
+ &cursor, 0, __FILE__, __LINE__, &mtr);
+
+ if (cursor.up_match == dtuple_get_n_fields(tuple)
+ && rec_get_n_fields_old(btr_cur_get_rec(&cursor))
+ == DICT_NUM_FIELDS__SYS_INDEXES) {
+ ulint len;
+ byte* field = rec_get_nth_field_old(
+ btr_cur_get_rec(&cursor),
+ DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD, &len);
+
+ ut_ad(len == 4);
+ mtr.write<4,mtr_t::MAYBE_NOP>(*btr_cur_get_block(&cursor),
+ field, merge_threshold);
+ }
+
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ dict_sys_unlock();
+}
+
+#ifdef UNIV_DEBUG
+/** Sets merge_threshold for all indexes in the list of tables
+@param[in] list pointer to the list of tables */
+inline
+void
+dict_set_merge_threshold_list_debug(
+ UT_LIST_BASE_NODE_T(dict_table_t)* list,
+ uint merge_threshold_all)
+{
+ for (dict_table_t* table = UT_LIST_GET_FIRST(*list);
+ table != NULL;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+ for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index != NULL;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+ rw_lock_x_lock(dict_index_get_lock(index));
+ index->merge_threshold = merge_threshold_all
+ & ((1U << 6) - 1);
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ }
+ }
+}
+
+/** Sets merge_threshold for all indexes in dictionary cache for debug.
+@param[in] merge_threshold_all value to set for all indexes */
+void
+dict_set_merge_threshold_all_debug(
+ uint merge_threshold_all)
+{
+ mutex_enter(&dict_sys.mutex);
+
+ dict_set_merge_threshold_list_debug(
+ &dict_sys.table_LRU, merge_threshold_all);
+ dict_set_merge_threshold_list_debug(
+ &dict_sys.table_non_LRU, merge_threshold_all);
+
+ mutex_exit(&dict_sys.mutex);
+}
+
+#endif /* UNIV_DEBUG */
+
+/** Get an index by name.
+@param[in] table the table where to look for the index
+@param[in] name the index name to look for
+@return index, NULL if does not exist */
+dict_index_t*
+dict_table_get_index_on_name(dict_table_t* table, const char* name)
+{
+ dict_index_t* index;
+
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ if (index->is_committed() && !strcmp(index->name, name)) {
+ return(index);
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Replace the index passed in with another equivalent index in the
+foreign key lists of the table.
+@return whether all replacements were found */
+bool
+dict_foreign_replace_index(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const dict_index_t* index) /*!< in: index to be replaced */
+{
+ bool found = true;
+ dict_foreign_t* foreign;
+
+ ut_ad(index->to_be_dropped);
+ ut_ad(index->table == table);
+
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+ if (foreign->foreign_index == index) {
+ ut_ad(foreign->foreign_table == index->table);
+
+ dict_index_t* new_index = dict_foreign_find_index(
+ foreign->foreign_table, col_names,
+ foreign->foreign_col_names,
+ foreign->n_fields, index,
+ /*check_charsets=*/TRUE, /*check_null=*/FALSE,
+ NULL, NULL, NULL);
+ if (new_index) {
+ ut_ad(new_index->table == index->table);
+ ut_ad(!new_index->to_be_dropped);
+ } else {
+ found = false;
+ }
+
+ foreign->foreign_index = new_index;
+ }
+ }
+
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+ if (foreign->referenced_index == index) {
+ ut_ad(foreign->referenced_table == index->table);
+
+ dict_index_t* new_index = dict_foreign_find_index(
+ foreign->referenced_table, NULL,
+ foreign->referenced_col_names,
+ foreign->n_fields, index,
+ /*check_charsets=*/TRUE, /*check_null=*/FALSE,
+ NULL, NULL, NULL);
+ /* There must exist an alternative index,
+ since this must have been checked earlier. */
+ if (new_index) {
+ ut_ad(new_index->table == index->table);
+ ut_ad(!new_index->to_be_dropped);
+ } else {
+ found = false;
+ }
+
+ foreign->referenced_index = new_index;
+ }
+ }
+
+ return(found);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Check for duplicate index entries in a table [using the index name] */
+void
+dict_table_check_for_dup_indexes(
+/*=============================*/
+ const dict_table_t* table, /*!< in: Check for dup indexes
+ in this table */
+ enum check_name check) /*!< in: whether and when to allow
+ temporary index names */
+{
+ /* Check for duplicates, ignoring indexes that are marked
+ as to be dropped */
+
+ const dict_index_t* index1;
+ const dict_index_t* index2;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ /* The primary index _must_ exist */
+ ut_a(UT_LIST_GET_LEN(table->indexes) > 0);
+
+ index1 = UT_LIST_GET_FIRST(table->indexes);
+
+ do {
+ if (!index1->is_committed()) {
+ ut_a(!dict_index_is_clust(index1));
+
+ switch (check) {
+ case CHECK_ALL_COMPLETE:
+ ut_error;
+ case CHECK_ABORTED_OK:
+ switch (dict_index_get_online_status(index1)) {
+ case ONLINE_INDEX_COMPLETE:
+ case ONLINE_INDEX_CREATION:
+ ut_error;
+ break;
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ break;
+ }
+ /* fall through */
+ case CHECK_PARTIAL_OK:
+ break;
+ }
+ }
+
+ for (index2 = UT_LIST_GET_NEXT(indexes, index1);
+ index2 != NULL;
+ index2 = UT_LIST_GET_NEXT(indexes, index2)) {
+ ut_ad(index1->is_committed()
+ != index2->is_committed()
+ || strcmp(index1->name, index2->name) != 0);
+ }
+
+ index1 = UT_LIST_GET_NEXT(indexes, index1);
+ } while (index1);
+}
+#endif /* UNIV_DEBUG */
+
+/** Auxiliary macro used inside dict_table_schema_check(). */
+#define CREATE_TYPES_NAMES() \
+ dtype_sql_name((unsigned) req_schema->columns[i].mtype, \
+ (unsigned) req_schema->columns[i].prtype_mask, \
+ (unsigned) req_schema->columns[i].len, \
+ req_type, sizeof(req_type)); \
+ dtype_sql_name(table->cols[j].mtype, \
+ table->cols[j].prtype, \
+ table->cols[j].len, \
+ actual_type, sizeof(actual_type))
+
+/*********************************************************************//**
+Checks whether a table exists and whether it has the given structure.
+The table must have the same number of columns with the same names and
+types. The order of the columns does not matter.
+The caller must own the dictionary mutex.
+dict_table_schema_check() @{
+@return DB_SUCCESS if the table exists and contains the necessary columns */
+dberr_t
+dict_table_schema_check(
+/*====================*/
+ dict_table_schema_t* req_schema, /*!< in/out: required table
+ schema */
+ char* errstr, /*!< out: human readable error
+ message if != DB_SUCCESS is
+ returned */
+ size_t errstr_sz) /*!< in: errstr size */
+{
+ char buf[MAX_FULL_NAME_LEN];
+ char req_type[64];
+ char actual_type[64];
+ dict_table_t* table;
+ ulint i;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ table = dict_table_get_low(req_schema->table_name);
+
+ if (table == NULL) {
+		bool should_print = true;
+ /* no such table */
+
+ if (innobase_strcasecmp(req_schema->table_name, "mysql/innodb_table_stats") == 0) {
+ if (innodb_table_stats_not_found_reported == false) {
+ innodb_table_stats_not_found = true;
+ innodb_table_stats_not_found_reported = true;
+ } else {
+ should_print = false;
+ }
+ } else if (innobase_strcasecmp(req_schema->table_name, "mysql/innodb_index_stats") == 0 ) {
+ if (innodb_index_stats_not_found_reported == false) {
+ innodb_index_stats_not_found = true;
+ innodb_index_stats_not_found_reported = true;
+ } else {
+ should_print = false;
+ }
+ }
+
+ if (should_print) {
+ snprintf(errstr, errstr_sz,
+ "Table %s not found.",
+ ut_format_name(req_schema->table_name,
+ buf, sizeof(buf)));
+ return(DB_TABLE_NOT_FOUND);
+ } else {
+ return(DB_STATS_DO_NOT_EXIST);
+ }
+ }
+
+ if (!table->is_readable() && !table->space) {
+ /* missing tablespace */
+
+ snprintf(errstr, errstr_sz,
+ "Tablespace for table %s is missing.",
+ ut_format_name(req_schema->table_name,
+ buf, sizeof(buf)));
+
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ if (ulint(table->n_def - DATA_N_SYS_COLS) != req_schema->n_cols) {
+ /* the table has a different number of columns than required */
+ snprintf(errstr, errstr_sz,
+ "%s has %d columns but should have " ULINTPF ".",
+ ut_format_name(req_schema->table_name, buf,
+ sizeof buf),
+ table->n_def - DATA_N_SYS_COLS,
+ req_schema->n_cols);
+
+ return(DB_ERROR);
+ }
+
+ /* For each column from req_schema->columns[] search
+ whether it is present in table->cols[].
+ The following algorithm is O(n_cols^2), but is optimized to
+ be O(n_cols) if the columns are in the same order in both arrays. */
+
+ for (i = 0; i < req_schema->n_cols; i++) {
+ ulint j = dict_table_has_column(
+ table, req_schema->columns[i].name, i);
+
+ if (j == table->n_def) {
+
+ snprintf(errstr, errstr_sz,
+ "required column %s"
+ " not found in table %s.",
+ req_schema->columns[i].name,
+ ut_format_name(
+ req_schema->table_name,
+ buf, sizeof(buf)));
+
+ return(DB_ERROR);
+ }
+
+		/* we found a column with the same name in the j'th
+		position; compare the column types and flags */
+
+ /* check length for exact match */
+ if (req_schema->columns[i].len == table->cols[j].len) {
+ } else if (!strcmp(req_schema->table_name, TABLE_STATS_NAME)
+ || !strcmp(req_schema->table_name,
+ INDEX_STATS_NAME)) {
+ ut_ad(table->cols[j].len < req_schema->columns[i].len);
+ ib::warn() << "Table " << req_schema->table_name
+ << " has length mismatch in the"
+ << " column name "
+ << req_schema->columns[i].name
+ << ". Please run mysql_upgrade";
+ } else {
+ CREATE_TYPES_NAMES();
+
+ snprintf(errstr, errstr_sz,
+ "Column %s in table %s is %s"
+ " but should be %s (length mismatch).",
+ req_schema->columns[i].name,
+ ut_format_name(req_schema->table_name,
+ buf, sizeof(buf)),
+ actual_type, req_type);
+
+ return(DB_ERROR);
+ }
+
+ /*
+ check mtype for exact match.
+		This check is relaxed to allow us to use TIMESTAMP
+		(i.e. INT) for last_update instead of DATA_BINARY.
+ We have to test for both values as the innodb_table_stats
+ table may come from MySQL and have the old type.
+ */
+ if (req_schema->columns[i].mtype != table->cols[j].mtype &&
+ !(req_schema->columns[i].mtype == DATA_INT &&
+ table->cols[j].mtype == DATA_FIXBINARY))
+ {
+ CREATE_TYPES_NAMES();
+
+ snprintf(errstr, errstr_sz,
+ "Column %s in table %s is %s"
+ " but should be %s (type mismatch).",
+ req_schema->columns[i].name,
+ ut_format_name(req_schema->table_name,
+ buf, sizeof(buf)),
+ actual_type, req_type);
+
+ return(DB_ERROR);
+ }
+
+ /* check whether required prtype mask is set */
+ if (req_schema->columns[i].prtype_mask != 0
+ && (table->cols[j].prtype
+ & req_schema->columns[i].prtype_mask)
+ != req_schema->columns[i].prtype_mask) {
+
+ CREATE_TYPES_NAMES();
+
+ snprintf(errstr, errstr_sz,
+ "Column %s in table %s is %s"
+ " but should be %s (flags mismatch).",
+ req_schema->columns[i].name,
+ ut_format_name(req_schema->table_name,
+ buf, sizeof(buf)),
+ actual_type, req_type);
+
+ return(DB_ERROR);
+ }
+ }
+
+ if (req_schema->n_foreign != table->foreign_set.size()) {
+ snprintf(
+ errstr, errstr_sz,
+ "Table %s has " ULINTPF " foreign key(s) pointing"
+ " to other tables, but it must have " ULINTPF ".",
+ ut_format_name(req_schema->table_name,
+ buf, sizeof(buf)),
+ static_cast<ulint>(table->foreign_set.size()),
+ req_schema->n_foreign);
+ return(DB_ERROR);
+ }
+
+ if (req_schema->n_referenced != table->referenced_set.size()) {
+ snprintf(
+ errstr, errstr_sz,
+ "There are " ULINTPF " foreign key(s) pointing to %s, "
+ "but there must be " ULINTPF ".",
+ static_cast<ulint>(table->referenced_set.size()),
+ ut_format_name(req_schema->table_name,
+ buf, sizeof(buf)),
+ req_schema->n_referenced);
+ return(DB_ERROR);
+ }
+
+ return(DB_SUCCESS);
+}
+/* @} */
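+
+/* Editorial usage sketch (illustrative; the field names follow the
+accesses above, the rest is assumed):
+
+	dict_table_schema_t schema;
+	char errstr[512];
+
+	schema.table_name = "mysql/innodb_table_stats";
+	schema.n_cols = ...;       // columns[] filled in with name, mtype,
+	                           // prtype_mask and len, as checked above
+	schema.n_foreign = 0;
+	schema.n_referenced = 0;
+
+	if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+	    != DB_SUCCESS) {
+		ib::warn() << errstr;
+	}
+*/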
+
+/*********************************************************************//**
+Converts a database and table name from filesystem encoding
+(e.g. d@i1b/a@q1b@1Kc, the same format as used in dict_table_t::name) into
+two strings in UTF-8 encoding (e.g. dцb and aюbØc). The output buffers must be
+at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */
+void
+dict_fs2utf8(
+/*=========*/
+ const char* db_and_table, /*!< in: database and table names,
+ e.g. d@i1b/a@q1b@1Kc */
+ char* db_utf8, /*!< out: database name, e.g. dцb */
+ size_t db_utf8_size, /*!< in: dbname_utf8 size */
+ char* table_utf8, /*!< out: table name, e.g. aюbØc */
+ size_t table_utf8_size)/*!< in: table_utf8 size */
+{
+ char db[MAX_DATABASE_NAME_LEN + 1];
+ ulint db_len;
+ uint errors;
+
+ db_len = dict_get_db_name_len(db_and_table);
+
+ ut_a(db_len <= sizeof(db));
+
+ memcpy(db, db_and_table, db_len);
+ db[db_len] = '\0';
+
+ strconvert(
+ &my_charset_filename, db, uint(db_len), system_charset_info,
+ db_utf8, uint(db_utf8_size), &errors);
+
+	/* convert each # to @0023 in the table name and store the result in buf */
+ const char* table = dict_remove_db_name(db_and_table);
+ const char* table_p;
+ char buf[MAX_TABLE_NAME_LEN * 5 + 1];
+ char* buf_p;
+ for (table_p = table, buf_p = buf; table_p[0] != '\0'; table_p++) {
+ if (table_p[0] != '#') {
+ buf_p[0] = table_p[0];
+ buf_p++;
+ } else {
+ buf_p[0] = '@';
+ buf_p[1] = '0';
+ buf_p[2] = '0';
+ buf_p[3] = '2';
+ buf_p[4] = '3';
+ buf_p += 5;
+ }
+ ut_a((size_t) (buf_p - buf) < sizeof(buf));
+ }
+ buf_p[0] = '\0';
+
+ errors = 0;
+ strconvert(
+ &my_charset_filename, buf, (uint) (buf_p - buf),
+ system_charset_info,
+ table_utf8, uint(table_utf8_size),
+ &errors);
+
+ if (errors != 0) {
+ snprintf(table_utf8, table_utf8_size, "%s%s",
+ srv_mysql50_table_name_prefix, table);
+ }
+}
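+
+/* Editorial example (illustrative): each '#' in the table name is
+first rewritten to the filename-safe sequence "@0023" and then decoded
+together with the other @-encoded characters, so
+
+	char db[MAX_DB_UTF8_LEN];
+	char tbl[MAX_TABLE_UTF8_LEN];
+	dict_fs2utf8("test/t#1", db, sizeof db, tbl, sizeof tbl);
+
+yields db == "test" and tbl == "t#1". */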
+
+/** Resize the hash tables based on the current buffer pool size. */
+void dict_sys_t::resize()
+{
+ ut_ad(this == &dict_sys);
+ ut_ad(is_initialised());
+ mutex_enter(&mutex);
+
+ /* all table entries are in table_LRU and table_non_LRU lists */
+ table_hash.free();
+ table_id_hash.free();
+ temp_id_hash.free();
+
+ const ulint hash_size = buf_pool_get_curr_size()
+ / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE);
+ table_hash.create(hash_size);
+ table_id_hash.create(hash_size);
+ temp_id_hash.create(hash_size);
+
+ for (dict_table_t *table= UT_LIST_GET_FIRST(table_LRU); table;
+ table= UT_LIST_GET_NEXT(table_LRU, table))
+ {
+ ut_ad(!table->is_temporary());
+ ulint fold= ut_fold_string(table->name.m_name);
+ ulint id_fold= ut_fold_ull(table->id);
+
+ HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table);
+ HASH_INSERT(dict_table_t, id_hash, &table_id_hash, id_fold, table);
+ }
+
+ for (dict_table_t *table = UT_LIST_GET_FIRST(table_non_LRU); table;
+ table= UT_LIST_GET_NEXT(table_LRU, table))
+ {
+ ulint fold= ut_fold_string(table->name.m_name);
+ ulint id_fold= ut_fold_ull(table->id);
+
+ HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table);
+
+ hash_table_t *id_hash= table->is_temporary()
+ ? &temp_id_hash : &table_id_hash;
+
+ HASH_INSERT(dict_table_t, id_hash, id_hash, id_fold, table);
+ }
+
+ mutex_exit(&mutex);
+}
+
+/** Close the data dictionary cache on shutdown. */
+void dict_sys_t::close()
+{
+ ut_ad(this == &dict_sys);
+ if (!is_initialised()) return;
+
+ mutex_enter(&mutex);
+
+ /* Free the hash elements. We don't remove them from the table
+ because we are going to destroy the table anyway. */
+ for (ulint i= table_hash.n_cells; i--; )
+ while (dict_table_t *table= static_cast<dict_table_t*>
+ (HASH_GET_FIRST(&table_hash, i)))
+ dict_sys.remove(table);
+
+ table_hash.free();
+
+ /* table_id_hash contains the same elements as in table_hash,
+ therefore we don't delete the individual elements. */
+ table_id_hash.free();
+
+ /* No temporary tables should exist at this point. */
+ temp_id_hash.free();
+
+ mutex_exit(&mutex);
+ mutex_free(&mutex);
+ rw_lock_free(&latch);
+
+ mutex_free(&dict_foreign_err_mutex);
+
+ if (dict_foreign_err_file)
+ {
+ my_fclose(dict_foreign_err_file, MYF(MY_WME));
+ dict_foreign_err_file = NULL;
+ }
+
+ m_initialised= false;
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate the dictionary table LRU list.
+@return TRUE if valid */
+static
+ibool
+dict_lru_validate(void)
+/*===================*/
+{
+ dict_table_t* table;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU);
+ table != NULL;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ ut_a(table->can_be_evicted);
+ }
+
+ for (table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU);
+ table != NULL;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ ut_a(!table->can_be_evicted);
+ }
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Checks an index to see whether its first fields are the columns in the
+array, in the same order, and whether the index is not marked for deletion
+and is not the same as types_idx.
+@return true if the index qualifies, otherwise false */
+bool
+dict_foreign_qualify_index(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols, /*!< in: number of columns */
+ const dict_index_t* index, /*!< in: index to check */
+ const dict_index_t* types_idx,
+ /*!< in: NULL or an index
+ whose types the column types
+ must match */
+ bool check_charsets,
+ /*!< in: whether to check
+ charsets. only has an effect
+ if types_idx != NULL */
+ ulint check_null,
+				/*!< in: nonzero if none of
+				the columns may be declared
+				NOT NULL */
+ fkerr_t* error, /*!< out: error code */
+ ulint* err_col_no,
+ /*!< out: column number where
+ error happened */
+ dict_index_t** err_index)
+ /*!< out: index where error
+ happened */
+{
+ if (dict_index_get_n_fields(index) < n_cols) {
+ return(false);
+ }
+
+ if (index->type & (DICT_SPATIAL | DICT_FTS | DICT_CORRUPT)) {
+ return false;
+ }
+
+ if (index->online_status >= ONLINE_INDEX_ABORTED) {
+ return false;
+ }
+
+ for (ulint i = 0; i < n_cols; i++) {
+ dict_field_t* field;
+ const char* col_name;
+ ulint col_no;
+
+ field = dict_index_get_nth_field(index, i);
+ col_no = dict_col_get_no(field->col);
+
+ if (field->prefix_len != 0) {
+ /* We do not accept column prefix
+ indexes here */
+ if (error && err_col_no && err_index) {
+ *error = FK_IS_PREFIX_INDEX;
+ *err_col_no = i;
+ *err_index = (dict_index_t*)index;
+ }
+ return(false);
+ }
+
+ if (check_null
+ && (field->col->prtype & DATA_NOT_NULL)) {
+ if (error && err_col_no && err_index) {
+ *error = FK_COL_NOT_NULL;
+ *err_col_no = i;
+ *err_index = (dict_index_t*)index;
+ }
+ return(false);
+ }
+
+ if (field->col->is_virtual()) {
+ col_name = "";
+ for (ulint j = 0; j < table->n_v_def; j++) {
+ col_name = dict_table_get_v_col_name(table, j);
+				if (innobase_strcasecmp(field->name, col_name) == 0) {
+ break;
+ }
+ }
+ } else {
+ col_name = col_names
+ ? col_names[col_no]
+ : dict_table_get_col_name(table, col_no);
+ }
+
+ if (0 != innobase_strcasecmp(columns[i], col_name)) {
+ return(false);
+ }
+
+ if (types_idx && !cmp_cols_are_equal(
+ dict_index_get_nth_col(index, i),
+ dict_index_get_nth_col(types_idx, i),
+ check_charsets)) {
+ if (error && err_col_no && err_index) {
+ *error = FK_COLS_NOT_EQUAL;
+ *err_col_no = i;
+ *err_index = (dict_index_t*)index;
+ }
+
+ return(false);
+ }
+ }
+
+ return(true);
+}
+
+/*********************************************************************//**
+Update the state of compression failure padding heuristics. This is
+called whenever a compression operation succeeds or fails.
+The caller must be holding info->mutex */
+static
+void
+dict_index_zip_pad_update(
+/*======================*/
+ zip_pad_info_t* info, /*<! in/out: info to be updated */
+ ulint zip_threshold) /*<! in: zip threshold value */
+{
+ ulint total;
+ ulint fail_pct;
+
+ ut_ad(info);
+ ut_ad(info->pad % ZIP_PAD_INCR == 0);
+
+ total = info->success + info->failure;
+
+ ut_ad(total > 0);
+
+ if (zip_threshold == 0) {
+ /* User has just disabled the padding. */
+ return;
+ }
+
+ if (total < ZIP_PAD_ROUND_LEN) {
+		/* We are in the middle of a round. Do nothing. */
+ return;
+ }
+
+	/* We are at a 'round' boundary. Reset the values, but first
+	calculate the failure rate for our heuristic. */
+ fail_pct = (info->failure * 100) / total;
+ info->failure = 0;
+ info->success = 0;
+
+ if (fail_pct > zip_threshold) {
+		/* Compression failures exceed the user-defined
+		threshold. Increase the pad size to reduce the chance
+		of compression failures.
+
+		Only increment if it will not increase the padding
+		beyond the maximum pad size. */
+ if (info->pad + ZIP_PAD_INCR
+ < (srv_page_size * zip_pad_max) / 100) {
+ info->pad.fetch_add(ZIP_PAD_INCR);
+
+ MONITOR_INC(MONITOR_PAD_INCREMENTS);
+ }
+
+ info->n_rounds = 0;
+
+ } else {
+ /* Failure rate was OK. Another successful round
+ completed. */
+ ++info->n_rounds;
+
+		/* If enough successful rounds have completed with the
+		compression failure rate under control, decrease the
+		padding. */
+ if (info->n_rounds >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT
+ && info->pad > 0) {
+ info->pad.fetch_sub(ZIP_PAD_INCR);
+
+ info->n_rounds = 0;
+
+ MONITOR_INC(MONITOR_PAD_DECREMENTS);
+ }
+ }
+}
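+
+/* Editorial worked example (assumed values): with zip_threshold == 5
+and a completed round in which 8% of ZIP_PAD_ROUND_LEN operations
+failed, fail_pct (8) exceeds the threshold, so info->pad grows by
+ZIP_PAD_INCR (as long as it stays below srv_page_size * zip_pad_max /
+100) and n_rounds restarts. Had fail_pct been 3, n_rounds would instead
+advance towards ZIP_PAD_SUCCESSFUL_ROUND_LIMIT, after which the pad
+shrinks by ZIP_PAD_INCR again. */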
+
+/*********************************************************************//**
+This function should be called whenever a page is successfully
+compressed. Updates the compression padding information. */
+void
+dict_index_zip_success(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+{
+ ulint zip_threshold = zip_failure_threshold_pct;
+ if (!zip_threshold) {
+ /* Disabled by user. */
+ return;
+ }
+
+ index->zip_pad.mutex.lock();
+ ++index->zip_pad.success;
+ dict_index_zip_pad_update(&index->zip_pad, zip_threshold);
+ index->zip_pad.mutex.unlock();
+}
+
+/*********************************************************************//**
+This function should be called whenever a page compression attempt
+fails. Updates the compression padding information. */
+void
+dict_index_zip_failure(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+{
+ ulint zip_threshold = zip_failure_threshold_pct;
+ if (!zip_threshold) {
+ /* Disabled by user. */
+ return;
+ }
+
+ index->zip_pad.mutex.lock();
+ ++index->zip_pad.failure;
+ dict_index_zip_pad_update(&index->zip_pad, zip_threshold);
+ index->zip_pad.mutex.unlock();
+}
+
+/*********************************************************************//**
+Returns the optimal page size, for which the page will likely compress.
+@return page size beyond which the page might not compress */
+ulint
+dict_index_zip_pad_optimal_page_size(
+/*=================================*/
+ dict_index_t* index) /*!< in: index for which page size
+ is requested */
+{
+ ulint pad;
+ ulint min_sz;
+ ulint sz;
+
+ if (!zip_failure_threshold_pct) {
+ /* Disabled by user. */
+ return(srv_page_size);
+ }
+
+ pad = index->zip_pad.pad;
+
+ ut_ad(pad < srv_page_size);
+ sz = srv_page_size - pad;
+
+ /* Min size allowed by user. */
+ ut_ad(zip_pad_max < 100);
+ min_sz = (srv_page_size * (100 - zip_pad_max)) / 100;
+
+ return(ut_max(sz, min_sz));
+}
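+
+/* Editorial worked example (assumed values): with srv_page_size ==
+16384, a current pad of 1024 and zip_pad_max == 50, the function
+returns ut_max(16384 - 1024, 16384 * 50 / 100) == 15360, i.e. the
+padded size, which is still above the user-imposed floor of 8192. */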
+
+/*************************************************************//**
+Converts table flags to a row format string.
+@return row format name. */
+const char*
+dict_tf_to_row_format_string(
+/*=========================*/
+ ulint table_flag) /*!< in: row format setting */
+{
+ switch (dict_tf_get_rec_format(table_flag)) {
+ case REC_FORMAT_REDUNDANT:
+ return("ROW_TYPE_REDUNDANT");
+ case REC_FORMAT_COMPACT:
+ return("ROW_TYPE_COMPACT");
+ case REC_FORMAT_COMPRESSED:
+ return("ROW_TYPE_COMPRESSED");
+ case REC_FORMAT_DYNAMIC:
+ return("ROW_TYPE_DYNAMIC");
+ }
+
+ ut_error;
+ return(0);
+}
+
+bool dict_table_t::is_stats_table() const
+{
+ return !strcmp(name.m_name, TABLE_STATS_NAME) ||
+ !strcmp(name.m_name, INDEX_STATS_NAME);
+}
diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc
new file mode 100644
index 00000000..34b04eb3
--- /dev/null
+++ b/storage/innobase/dict/dict0load.cc
@@ -0,0 +1,3687 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0load.cc
+Loads database object definitions from the dictionary tables
+into the memory cache
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0load.h"
+
+#include "mysql_version.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "dict0priv.h"
+#include "dict0stats.h"
+#include "fsp0file.h"
+#include "fts0priv.h"
+#include "mach0data.h"
+#include "page0page.h"
+#include "rem0cmp.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "fts0opt.h"
+
+/** Following are the InnoDB system tables. The positions in
+this array are referenced by enum dict_system_id_t. */
+static const char* SYSTEM_TABLE_NAME[] = {
+ "SYS_TABLES",
+ "SYS_INDEXES",
+ "SYS_COLUMNS",
+ "SYS_FIELDS",
+ "SYS_FOREIGN",
+ "SYS_FOREIGN_COLS",
+ "SYS_TABLESPACES",
+ "SYS_DATAFILES",
+ "SYS_VIRTUAL"
+};
+
+/** Loads a table definition and also all its index definitions.
+
+Loads those foreign key constraints whose referenced table is already in
+dictionary cache. If a foreign key constraint is not loaded, then the
+referenced table is pushed into the output stack (fk_tables), if it is not
+NULL. These tables must be subsequently loaded so that all the foreign
+key constraints are loaded into memory.
+
+@param[in] name Table name in the db/tablename format
+@param[in] ignore_err Error to be ignored when loading table
+ and its index definition
+@param[out] fk_tables Related table names that must also be
+ loaded to ensure that all foreign key
+ constraints are loaded.
+@return table, NULL if it does not exist; if the table is stored in an
+.ibd file but the file does not exist, then we set the
+file_unreadable flag in the table object we return */
+static
+dict_table_t*
+dict_load_table_one(
+ const table_name_t& name,
+ dict_err_ignore_t ignore_err,
+ dict_names_t& fk_tables);
+
+/** Load a table definition from a SYS_TABLES record to dict_table_t.
+Do not load any columns or indexes.
+@param[in] name Table name
+@param[in] rec SYS_TABLES record
+@param[out,own] table table, or NULL
+@return error message
+@retval NULL on success */
+static const char* dict_load_table_low(const table_name_t& name,
+ const rec_t* rec, dict_table_t** table)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Load an index definition from a SYS_INDEXES record to dict_index_t.
+If allocate=TRUE, we will create a dict_index_t structure and fill it
+accordingly. If allocate=FALSE, the dict_index_t will be supplied by
+the caller and filled with information read from the record.
+@return error message
+@retval NULL on success */
+static
+const char*
+dict_load_index_low(
+ byte* table_id, /*!< in/out: table id (8 bytes),
+ an "in" value if allocate=TRUE
+ and "out" when allocate=FALSE */
+ mem_heap_t* heap, /*!< in/out: temporary memory heap */
+ const rec_t* rec, /*!< in: SYS_INDEXES record */
+ ibool allocate, /*!< in: TRUE=allocate *index,
+ FALSE=fill in a pre-allocated
+ *index */
+ dict_index_t** index); /*!< out,own: index, or NULL */
+
+/** Load a table column definition from a SYS_COLUMNS record to dict_table_t.
+@return error message
+@retval NULL on success */
+static
+const char*
+dict_load_column_low(
+ dict_table_t* table, /*!< in/out: table, could be NULL
+ if we just populate a dict_column_t
+ struct with information from
+ a SYS_COLUMNS record */
+ mem_heap_t* heap, /*!< in/out: memory heap
+ for temporary storage */
+ dict_col_t* column, /*!< out: dict_column_t to fill,
+ or NULL if table != NULL */
+ table_id_t* table_id, /*!< out: table id */
+ const char** col_name, /*!< out: column name */
+ const rec_t* rec, /*!< in: SYS_COLUMNS record */
+ ulint* nth_v_col); /*!< out: if not NULL, this
+ records the "n" of "nth" virtual
+ column */
+
+/** Load the mapping of a virtual column to its base columns
+from a SYS_VIRTUAL record
+@param[in,out] table table
+@param[in,out] column mapped base column's dict_column_t
+@param[in,out] table_id table id
+@param[in,out] pos virtual column position
+@param[in,out] base_pos base column position
+@param[in] rec SYS_VIRTUAL record
+@return error message
+@retval NULL on success */
+static
+const char*
+dict_load_virtual_low(
+ dict_table_t* table,
+ dict_col_t** column,
+ table_id_t* table_id,
+ ulint* pos,
+ ulint* base_pos,
+ const rec_t* rec);
+
+/** Load an index field definition from a SYS_FIELDS record to dict_index_t.
+@return error message
+@retval NULL on success */
+static
+const char*
+dict_load_field_low(
+ byte* index_id, /*!< in/out: index id (8 bytes)
+ an "in" value if index != NULL
+ and "out" if index == NULL */
+ dict_index_t* index, /*!< in/out: index, could be NULL
+ if we just populate a dict_field_t
+ struct with information from
+ a SYS_FIELDS record */
+ dict_field_t* sys_field, /*!< out: dict_field_t to be
+ filled */
+ ulint* pos, /*!< out: Field position */
+ byte* last_index_id, /*!< in: last index id */
+ mem_heap_t* heap, /*!< in/out: memory heap
+ for temporary storage */
+ const rec_t* rec); /*!< in: SYS_FIELDS record */
+
+/* If this flag is TRUE, then we will load the clustered index's (and table's)
+metadata even if it is marked as "corrupted". */
+my_bool srv_load_corrupted;
+
+#ifdef UNIV_DEBUG
+/****************************************************************//**
+Compare the name of an index column.
+@return TRUE if the i'th column of index is 'name'. */
+static
+ibool
+name_of_col_is(
+/*===========*/
+ const dict_table_t* table, /*!< in: table */
+ const dict_index_t* index, /*!< in: index */
+ ulint i, /*!< in: index field offset */
+ const char* name) /*!< in: name to compare to */
+{
+ ulint tmp = dict_col_get_no(dict_field_get_col(
+ dict_index_get_nth_field(
+ index, i)));
+
+ return(strcmp(name, dict_table_get_col_name(table, tmp)) == 0);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Finds the first table name in the given database.
+@return own: table name, NULL if none exists; the caller must free
+the memory in the string! */
+char*
+dict_get_first_table_name_in_db(
+/*============================*/
+ const char* name) /*!< in: database name which ends in '/' */
+{
+ dict_table_t* sys_tables;
+ btr_pcur_t pcur;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ mem_heap_t* heap;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ heap = mem_heap_create(1000);
+
+ mtr_start(&mtr);
+
+ sys_tables = dict_table_get_low("SYS_TABLES");
+ sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+ ut_ad(!dict_table_is_comp(sys_tables));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, name, strlen(name));
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+loop:
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* Not found */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(NULL);
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__NAME, &len);
+
+ if (len < strlen(name)
+ || memcmp(name, field, strlen(name))) {
+ /* Not found */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(NULL);
+ }
+
+ if (!rec_get_deleted_flag(rec, 0)) {
+
+ /* We found one */
+
+ char* table_name = mem_strdupl((char*) field, len);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(table_name);
+ }
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ goto loop;
+}
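+
+/* A minimal usage sketch (a hypothetical caller, for illustration
+only; dict_sys.mutex must be held, as asserted above):
+
+	char*	first = dict_get_first_table_name_in_db("test/");
+	if (first != NULL) {
+		...			// use the returned name
+		ut_free(first);		// the caller owns the copy
+	}
+*/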
+
+/********************************************************************//**
+This function gets the next system table record as it scans the table.
+@return the next record if found, NULL if end of scan */
+static
+const rec_t*
+dict_getnext_system_low(
+/*====================*/
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor to the
+ record*/
+ mtr_t* mtr) /*!< in: the mini-transaction */
+{
+ rec_t* rec = NULL;
+
+ while (!rec || rec_get_deleted_flag(rec, 0)) {
+ btr_pcur_move_to_next_user_rec(pcur, mtr);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!btr_pcur_is_on_user_rec(pcur)) {
+ /* end of index */
+ btr_pcur_close(pcur);
+
+ return(NULL);
+ }
+ }
+
+	/* We found a user record; save the cursor position */
+ btr_pcur_store_position(pcur, mtr);
+
+ return(rec);
+}
+
+/********************************************************************//**
+This function opens a system table, and returns the first record.
+@return first record of the system table */
+const rec_t*
+dict_startscan_system(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor to
+ the record */
+ mtr_t* mtr, /*!< in: the mini-transaction */
+ dict_system_id_t system_id) /*!< in: which system table to open */
+{
+ dict_table_t* system_table;
+ dict_index_t* clust_index;
+ const rec_t* rec;
+
+ ut_a(system_id < SYS_NUM_SYSTEM_TABLES);
+
+ system_table = dict_table_get_low(SYSTEM_TABLE_NAME[system_id]);
+
+ clust_index = UT_LIST_GET_FIRST(system_table->indexes);
+
+ btr_pcur_open_at_index_side(true, clust_index, BTR_SEARCH_LEAF, pcur,
+ true, 0, mtr);
+
+ rec = dict_getnext_system_low(pcur, mtr);
+
+ return(rec);
+}
+
+/********************************************************************//**
+This function gets the next system table record as it scans the table.
+@return the next record if found, NULL if end of scan */
+const rec_t*
+dict_getnext_system(
+/*================*/
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor
+ to the record */
+ mtr_t* mtr) /*!< in: the mini-transaction */
+{
+ const rec_t* rec;
+
+ /* Restore the position */
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+ /* Get the next record */
+ rec = dict_getnext_system_low(pcur, mtr);
+
+ return(rec);
+}
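+
+/* The two functions above are meant to be used together. A sketch of
+the typical scan loop (dict_check_sys_tables() below follows the same
+shape):
+
+	mtr_start(&mtr);
+	for (rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
+	     rec != NULL;
+	     mtr_commit(&mtr), mtr_start(&mtr),
+	     rec = dict_getnext_system(&pcur, &mtr)) {
+		... process rec while the mini-transaction is active
+	}
+	mtr_commit(&mtr);
+*/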
+
+/********************************************************************//**
+This function processes one SYS_TABLES record and populates the dict_table_t
+struct for the table.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_tables_rec_and_mtr_commit(
+/*=======================================*/
+ mem_heap_t* heap, /*!< in/out: temporary memory heap */
+ const rec_t* rec, /*!< in: SYS_TABLES record */
+ dict_table_t** table, /*!< out: dict_table_t to fill */
+ bool cached, /*!< in: whether to load from cache */
+ mtr_t* mtr) /*!< in/out: mini-transaction,
+ will be committed */
+{
+ ulint len;
+ const char* field;
+
+ field = (const char*) rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__NAME, &len);
+
+ ut_a(!rec_get_deleted_flag(rec, 0));
+
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_S_FIX));
+
+ /* Get the table name */
+ table_name_t table_name(mem_heap_strdupl(heap, field, len));
+
+ if (cached) {
+		/* Commit before loading the table again */
+ mtr_commit(mtr);
+
+ *table = dict_table_get_low(table_name.m_name);
+ return *table ? NULL : "Table not found in cache";
+ } else {
+ const char* err = dict_load_table_low(table_name, rec, table);
+ mtr_commit(mtr);
+ return err;
+ }
+}
+
+/********************************************************************//**
+This function parses a SYS_INDEXES record and populates a dict_index_t
+structure with the information from the record. For detailed information
+about SYS_INDEXES fields, please refer to the dict_boot() function.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_indexes_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_INDEXES rec */
+ dict_index_t* index, /*!< out: index to be filled */
+ table_id_t* table_id) /*!< out: index table id */
+{
+ const char* err_msg;
+ byte* buf;
+
+ ut_d(index->is_dummy = true);
+ ut_d(index->in_instant_init = false);
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+
+ /* Parse the record, and get "dict_index_t" struct filled */
+ err_msg = dict_load_index_low(buf, heap, rec, FALSE, &index);
+
+ *table_id = mach_read_from_8(buf);
+
+ return(err_msg);
+}
+
+/********************************************************************//**
+This function parses a SYS_COLUMNS record and populates a dict_column_t
+structure with the information from the record.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_columns_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_COLUMNS rec */
+ dict_col_t* column, /*!< out: dict_col_t to be filled */
+ table_id_t* table_id, /*!< out: table id */
+ const char** col_name, /*!< out: column name */
+ ulint* nth_v_col) /*!< out: if virtual col, this is
+ record's sequence number */
+{
+ const char* err_msg;
+
+ /* Parse the record, and get "dict_col_t" struct filled */
+ err_msg = dict_load_column_low(NULL, heap, column,
+ table_id, col_name, rec, nth_v_col);
+
+ return(err_msg);
+}
+
+/** This function parses a SYS_VIRTUAL record and extracts virtual column
+information
+@param[in]	rec		current SYS_VIRTUAL rec
+@param[in,out] table_id table id
+@param[in,out] pos virtual column position
+@param[in,out] base_pos base column position
+@return error message, or NULL on success */
+const char*
+dict_process_sys_virtual_rec(
+ const rec_t* rec,
+ table_id_t* table_id,
+ ulint* pos,
+ ulint* base_pos)
+{
+ const char* err_msg;
+
+	/* Parse the record and extract the virtual column mapping */
+ err_msg = dict_load_virtual_low(NULL, NULL, table_id,
+ pos, base_pos, rec);
+
+ return(err_msg);
+}
+
+/********************************************************************//**
+This function parses a SYS_FIELDS record and populates a dict_field_t
+structure with the information from the record.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_fields_rec(
+/*========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FIELDS rec */
+ dict_field_t* sys_field, /*!< out: dict_field_t to be
+ filled */
+ ulint* pos, /*!< out: Field position */
+ index_id_t* index_id, /*!< out: current index id */
+ index_id_t last_id) /*!< in: previous index id */
+{
+ byte* buf;
+ byte* last_index_id;
+ const char* err_msg;
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+
+ last_index_id = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(last_index_id, last_id);
+
+ err_msg = dict_load_field_low(buf, NULL, sys_field,
+ pos, last_index_id, heap, rec);
+
+ *index_id = mach_read_from_8(buf);
+
+ return(err_msg);
+}
+
+/********************************************************************//**
+This function parses a SYS_FOREIGN record and populates a dict_foreign_t
+structure with the information from the record. For detailed information
+about SYS_FOREIGN fields, please refer to the dict_load_foreign() function.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_foreign_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FOREIGN rec */
+ dict_foreign_t* foreign) /*!< out: dict_foreign_t struct
+ to be filled */
+{
+ ulint len;
+ const byte* field;
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return("delete-marked record in SYS_FOREIGN");
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN) {
+ return("wrong number of columns in SYS_FOREIGN record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__ID, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+err_len:
+ return("incorrect column length in SYS_FOREIGN");
+ }
+
+ /* This receives a dict_foreign_t* that points to a stack variable.
+ So dict_foreign_free(foreign) is not used as elsewhere.
+ Since the heap used here is freed elsewhere, foreign->heap
+ is not assigned. */
+ foreign->id = mem_heap_strdupl(heap, (const char*) field, len);
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_FOREIGN__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ /* The _lookup versions of the referenced and foreign table names
+ are not assigned since they are not used in this dict_foreign_t */
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ foreign->foreign_table_name = mem_heap_strdupl(
+ heap, (const char*) field, len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ foreign->referenced_table_name = mem_heap_strdupl(
+ heap, (const char*) field, len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ uint32_t n_fields_and_type = mach_read_from_4(field);
+
+ foreign->type = n_fields_and_type >> 24 & ((1U << 6) - 1);
+ foreign->n_fields = n_fields_and_type & dict_index_t::MAX_N_FIELDS;
+
+ return(NULL);
+}
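+
+/* Example of the N_COLS encoding decoded above (the value is
+illustrative): n_fields_and_type = 0x05000002 yields
+type = (0x05000002 >> 24) & 0x3f = 5 and n_fields = 2; the type bits
+carry the DICT_FOREIGN_ON_* action flags. */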
+
+/********************************************************************//**
+This function parses a SYS_FOREIGN_COLS record, extracts the necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_foreign_col_rec(
+/*=============================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FOREIGN_COLS rec */
+ const char** name, /*!< out: foreign key constraint name */
+ const char** for_col_name, /*!< out: referencing column name */
+ const char** ref_col_name, /*!< out: referenced column name
+ in referenced table */
+ ulint* pos) /*!< out: column position */
+{
+ ulint len;
+ const byte* field;
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return("delete-marked record in SYS_FOREIGN_COLS");
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN_COLS) {
+ return("wrong number of columns in SYS_FOREIGN_COLS record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+err_len:
+ return("incorrect column length in SYS_FOREIGN_COLS");
+ }
+ *name = mem_heap_strdupl(heap, (char*) field, len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ *pos = mach_read_from_4(field);
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ *for_col_name = mem_heap_strdupl(heap, (char*) field, len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ *ref_col_name = mem_heap_strdupl(heap, (char*) field, len);
+
+ return(NULL);
+}
+
+/********************************************************************//**
+This function parses a SYS_TABLESPACES record, extracts the necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_tablespaces(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_TABLESPACES rec */
+ uint32_t* space, /*!< out: tablespace identifier */
+ const char** name, /*!< out: tablespace name */
+ ulint* flags) /*!< out: tablespace flags */
+{
+ ulint len;
+ const byte* field;
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return("delete-marked record in SYS_TABLESPACES");
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLESPACES) {
+ return("wrong number of columns in SYS_TABLESPACES record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLESPACES__SPACE, &len);
+ if (len != DICT_FLD_LEN_SPACE) {
+err_len:
+ return("incorrect column length in SYS_TABLESPACES");
+ }
+ *space = mach_read_from_4(field);
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLESPACES__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLESPACES__NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ *name = mem_heap_strdupl(heap, (char*) field, len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLESPACES__FLAGS, &len);
+ if (len != DICT_FLD_LEN_FLAGS) {
+ goto err_len;
+ }
+ *flags = mach_read_from_4(field);
+
+ return(NULL);
+}
+
+/********************************************************************//**
+This function parses a SYS_DATAFILES record, extracts necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_datafiles(
+/*=======================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_DATAFILES rec */
+ uint32_t* space, /*!< out: space id */
+ const char** path) /*!< out: datafile paths */
+{
+ ulint len;
+ const byte* field;
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return("delete-marked record in SYS_DATAFILES");
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_DATAFILES) {
+ return("wrong number of columns in SYS_DATAFILES record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_DATAFILES__SPACE, &len);
+ if (len != DICT_FLD_LEN_SPACE) {
+err_len:
+ return("incorrect column length in SYS_DATAFILES");
+ }
+ *space = mach_read_from_4(field);
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_DATAFILES__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_DATAFILES__PATH, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ *path = mem_heap_strdupl(heap, (char*) field, len);
+
+ return(NULL);
+}
+
+/** Get the first filepath from SYS_DATAFILES for a given space_id.
+@param[in] space_id Tablespace ID
+@return First filepath (caller must invoke ut_free() on it)
+@retval NULL if no SYS_DATAFILES entry was found. */
+static char*
+dict_get_first_path(
+ ulint space_id)
+{
+ mtr_t mtr;
+ dict_table_t* sys_datafiles;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ byte* buf;
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ char* filepath = NULL;
+ mem_heap_t* heap = mem_heap_create(1024);
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ mtr_start(&mtr);
+
+ sys_datafiles = dict_table_get_low("SYS_DATAFILES");
+ sys_index = UT_LIST_GET_FIRST(sys_datafiles->indexes);
+
+ ut_ad(!dict_table_is_comp(sys_datafiles));
+ ut_ad(name_of_col_is(sys_datafiles, sys_index,
+ DICT_FLD__SYS_DATAFILES__SPACE, "SPACE"));
+ ut_ad(name_of_col_is(sys_datafiles, sys_index,
+ DICT_FLD__SYS_DATAFILES__PATH, "PATH"));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, DICT_FLD__SYS_DATAFILES__SPACE);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(buf, space_id);
+
+ dfield_set_data(dfield, buf, 4);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ /* Get the filepath from this SYS_DATAFILES record. */
+ if (btr_pcur_is_on_user_rec(&pcur)) {
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_DATAFILES__SPACE, &len);
+ ut_a(len == 4);
+
+ if (space_id == mach_read_from_4(field)) {
+ /* A record for this space ID was found. */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_DATAFILES__PATH, &len);
+
+ ut_ad(len > 0);
+ ut_ad(len < OS_FILE_MAX_PATH);
+
+ if (len > 0 && len < UNIV_SQL_NULL) {
+ filepath = mem_strdupl(
+ reinterpret_cast<const char*>(field),
+ len);
+ ut_ad(filepath != NULL);
+
+ /* The dictionary may have been written on
+ another OS. */
+ os_normalize_path(filepath);
+ }
+ }
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(filepath);
+}
+
+/** Update the record for space_id in SYS_DATAFILES to this filepath.
+@param[in]	space_id	Tablespace ID
+@param[in]	filepath	Tablespace filepath
+@return DB_SUCCESS if OK, dberr_t if the update failed */
+dberr_t
+dict_update_filepath(
+ ulint space_id,
+ const char* filepath)
+{
+ if (!srv_sys_tablespaces_open) {
+ /* Startup procedure is not yet ready for updates. */
+ return(DB_SUCCESS);
+ }
+
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx;
+
+ ut_d(dict_sys.assert_locked());
+
+ trx = trx_create();
+ trx->op_info = "update filepath";
+ trx->dict_operation_lock_mode = RW_X_LATCH;
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_int4_literal(info, "space", space_id);
+ pars_info_add_str_literal(info, "path", filepath);
+
+ err = que_eval_sql(info,
+ "PROCEDURE UPDATE_FILEPATH () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_DATAFILES"
+ " SET PATH = :path\n"
+ " WHERE SPACE = :space;\n"
+ "END;\n", FALSE, trx);
+
+ trx_commit_for_mysql(trx);
+ trx->dict_operation_lock_mode = 0;
+ trx->free();
+
+ if (UNIV_LIKELY(err == DB_SUCCESS)) {
+ /* We just updated SYS_DATAFILES due to the contents in
+ a link file. Make a note that we did this. */
+ ib::info() << "The InnoDB data dictionary table SYS_DATAFILES"
+ " for tablespace ID " << space_id
+ << " was updated to use file " << filepath << ".";
+ } else {
+ ib::warn() << "Error occurred while updating InnoDB data"
+ " dictionary table SYS_DATAFILES for tablespace ID "
+ << space_id << " to file " << filepath << ": "
+ << err << ".";
+ }
+
+ return(err);
+}
+
+/** Replace records in SYS_TABLESPACES and SYS_DATAFILES associated with
+the given space_id using an independent transaction.
+@param[in] space_id Tablespace ID
+@param[in] name Tablespace name
+@param[in] filepath First filepath
+@param[in] fsp_flags Tablespace flags
+@return DB_SUCCESS if OK, dberr_t if the insert failed */
+dberr_t
+dict_replace_tablespace_and_filepath(
+ ulint space_id,
+ const char* name,
+ const char* filepath,
+ ulint fsp_flags)
+{
+ if (!srv_sys_tablespaces_open) {
+ /* Startup procedure is not yet ready for updates.
+ Return success since this will likely get updated
+ later. */
+ return(DB_SUCCESS);
+ }
+
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx;
+
+ DBUG_EXECUTE_IF("innodb_fail_to_update_tablespace_dict",
+ return(DB_INTERRUPTED););
+
+ ut_d(dict_sys.assert_locked());
+ ut_ad(filepath);
+
+ trx = trx_create();
+ trx->op_info = "insert tablespace and filepath";
+ trx->dict_operation_lock_mode = RW_X_LATCH;
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+
+ /* A record for this space ID was not found in
+ SYS_DATAFILES. Assume the record is also missing in
+ SYS_TABLESPACES. Insert records into them both. */
+ err = dict_replace_tablespace_in_dictionary(
+ space_id, name, fsp_flags, filepath, trx);
+
+ trx_commit_for_mysql(trx);
+ trx->dict_operation_lock_mode = 0;
+ trx->free();
+
+ return(err);
+}
+
+/** Check the validity of a SYS_TABLES record.
+Make sure the fields are the right length and that they
+do not contain invalid contents.
+@param[in] rec SYS_TABLES record
+@return error message, or NULL on success */
+static
+const char*
+dict_sys_tables_rec_check(
+ const rec_t* rec)
+{
+ const byte* field;
+ ulint len;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return("delete-marked record in SYS_TABLES");
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLES) {
+ return("wrong number of columns in SYS_TABLES record");
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLES__NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+err_len:
+ return("incorrect column length in SYS_TABLES");
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLES__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__ID, &len);
+ if (len != 8) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+ if (field == NULL || len != 4) {
+ goto err_len;
+ }
+
+ rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__TYPE, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLES__MIX_ID, &len);
+ if (len != 8) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len);
+ if (field == NULL || len != 4) {
+ goto err_len;
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLES__CLUSTER_ID, &len);
+ if (len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__SPACE, &len);
+ if (field == NULL || len != 4) {
+ goto err_len;
+ }
+
+ return(NULL);
+}
+
+/** Read and return the contents of a SYS_TABLESPACES record.
+@param[in] rec A record of SYS_TABLESPACES
+@param[out] id Pointer to the space_id for this table
+@param[in,out] name Buffer for Tablespace Name of length NAME_LEN
+@param[out] flags Pointer to tablespace flags
+@return true if the record was read correctly, false if not. */
+bool
+dict_sys_tablespaces_rec_read(
+ const rec_t* rec,
+ ulint* id,
+ char* name,
+ ulint* flags)
+{
+ const byte* field;
+ ulint len;
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLESPACES__SPACE, &len);
+ if (len != DICT_FLD_LEN_SPACE) {
+ ib::error() << "Wrong field length in SYS_TABLESPACES.SPACE: "
+ << len;
+ return(false);
+ }
+ *id = mach_read_from_4(field);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLESPACES__NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ ib::error() << "Wrong field length in SYS_TABLESPACES.NAME: "
+ << len;
+ return(false);
+ }
+ strncpy(name, reinterpret_cast<const char*>(field), NAME_LEN);
+
+	/* Read the 4 byte flags from the FLAGS field */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLESPACES__FLAGS, &len);
+ if (len != 4) {
+ ib::error() << "Wrong field length in SYS_TABLESPACES.FLAGS: "
+ << len;
+ return(false);
+ }
+ *flags = mach_read_from_4(field);
+
+ return(true);
+}
+
+/** Check if SYS_TABLES.TYPE is valid
+@param[in] type SYS_TABLES.TYPE
+@param[in] not_redundant whether ROW_FORMAT=REDUNDANT is not used
+@return whether the SYS_TABLES.TYPE value is valid */
+static
+bool
+dict_sys_tables_type_valid(ulint type, bool not_redundant)
+{
+ /* The DATA_DIRECTORY flag can be assigned fully independently
+ of all other persistent table flags. */
+ type &= ~DICT_TF_MASK_DATA_DIR;
+
+ if (type == 1) {
+ return(true); /* ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT */
+ }
+
+ if (!(type & 1)) {
+ /* For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT,
+		SYS_TABLES.TYPE=1. Otherwise, it is the same as
+		dict_table_t::flags, and the least significant bit
+		would be set, so the bit can never be 0. */
+ return(false);
+ }
+
+ if (!not_redundant) {
+ /* SYS_TABLES.TYPE must be 1 or 1|DICT_TF_MASK_NO_ROLLBACK
+ for ROW_FORMAT=REDUNDANT. */
+ return !(type & ~(1U | DICT_TF_MASK_NO_ROLLBACK));
+ }
+
+ if (type >= 1U << DICT_TF_POS_UNUSED) {
+ /* Some unknown bits are set. */
+ return(false);
+ }
+
+ return(dict_tf_is_valid_not_redundant(type));
+}
+
+/** Convert SYS_TABLES.TYPE to dict_table_t::flags.
+@param[in] type SYS_TABLES.TYPE
+@param[in] not_redundant whether ROW_FORMAT=REDUNDANT is not used
+@return table flags */
+static
+ulint
+dict_sys_tables_type_to_tf(ulint type, bool not_redundant)
+{
+ ut_ad(dict_sys_tables_type_valid(type, not_redundant));
+ ulint flags = not_redundant ? 1 : 0;
+
+ /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION,
+ PAGE_COMPRESSION_LEVEL are the same. */
+ flags |= type & (DICT_TF_MASK_ZIP_SSIZE
+ | DICT_TF_MASK_ATOMIC_BLOBS
+ | DICT_TF_MASK_DATA_DIR
+ | DICT_TF_MASK_PAGE_COMPRESSION
+ | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL
+ | DICT_TF_MASK_NO_ROLLBACK);
+
+ ut_ad(dict_tf_is_valid(flags));
+ return(flags);
+}
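+
+/* A worked example for the two functions above (the value is
+illustrative): SYS_TABLES.TYPE = 0x21 has the least significant bit
+and the ATOMIC_BLOBS bit (1 << 5) set, which is a valid encoding of
+ROW_FORMAT=DYNAMIC, and dict_sys_tables_type_to_tf(0x21, true)
+returns 0x21 unchanged, because these bit positions coincide with
+dict_table_t::flags. */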
+
+/** Read and return 5 integer fields from a SYS_TABLES record.
+@param[in] rec A record of SYS_TABLES
+@param[in]	table_name	Table name, the same as SYS_TABLES.NAME
+@param[out] table_id Pointer to the table_id for this table
+@param[out] space_id Pointer to the space_id for this table
+@param[out] n_cols Pointer to number of columns for this table.
+@param[out] flags Pointer to table flags
+@param[out] flags2 Pointer to table flags2
+@return true if the record was read correctly, false if not. */
+MY_ATTRIBUTE((warn_unused_result))
+static
+bool
+dict_sys_tables_rec_read(
+ const rec_t* rec,
+ const table_name_t& table_name,
+ table_id_t* table_id,
+ ulint* space_id,
+ ulint* n_cols,
+ ulint* flags,
+ ulint* flags2)
+{
+ const byte* field;
+ ulint len;
+ ulint type;
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__ID, &len);
+ ut_ad(len == 8);
+ *table_id = static_cast<table_id_t>(mach_read_from_8(field));
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__SPACE, &len);
+ ut_ad(len == 4);
+ *space_id = mach_read_from_4(field);
+
+ /* Read the 4 byte flags from the TYPE field */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__TYPE, &len);
+ ut_a(len == 4);
+ type = mach_read_from_4(field);
+
+ /* Handle MDEV-12873 InnoDB SYS_TABLES.TYPE incompatibility
+ for PAGE_COMPRESSED=YES in MariaDB 10.2.2 to 10.2.6.
+
+ MariaDB 10.2.2 introduced the SHARED_SPACE flag from MySQL 5.7,
+ shifting the flags PAGE_COMPRESSION, PAGE_COMPRESSION_LEVEL,
+ ATOMIC_WRITES (repurposed to NO_ROLLBACK in 10.3.1) by one bit.
+ The SHARED_SPACE flag would always
+ be written as 0 by MariaDB, because MariaDB does not support
+ CREATE TABLESPACE or CREATE TABLE...TABLESPACE for InnoDB.
+
+ So, instead of the bits AALLLLCxxxxxxx we would have
+ AALLLLC0xxxxxxx if the table was created with MariaDB 10.2.2
+ to 10.2.6. (AA=ATOMIC_WRITES, LLLL=PAGE_COMPRESSION_LEVEL,
+ C=PAGE_COMPRESSED, xxxxxxx=7 bits that were not moved.)
+
+ The case LLLLC=00000 is not a problem. The problem is the case
+ AALLLL10DB00001 where D is the (mostly ignored) DATA_DIRECTORY
+ flag and B is the ATOMIC_BLOBS flag (1 for ROW_FORMAT=DYNAMIC
+ and 0 for ROW_FORMAT=COMPACT in this case). Other low-order
+ bits must be so, because PAGE_COMPRESSED=YES is only allowed
+ for ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPACT, not for
+ ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPRESSED.
+
+ Starting with MariaDB 10.2.4, the flags would be
+ 00LLLL10DB00001, because ATOMIC_WRITES is always written as 0.
+
+ We will concentrate on the PAGE_COMPRESSION_LEVEL and
+ PAGE_COMPRESSED=YES. PAGE_COMPRESSED=NO implies
+ PAGE_COMPRESSION_LEVEL=0, and in that case all the affected
+ bits will be 0. For PAGE_COMPRESSED=YES, the values 1..9 are
+ allowed for PAGE_COMPRESSION_LEVEL. That is, we must interpret
+ the bits AALLLL10DB00001 as AALLLL1DB00001.
+
+ If someone created a table in MariaDB 10.2.2 or 10.2.3 with
+ the attribute ATOMIC_WRITES=OFF (value 2) and without
+ PAGE_COMPRESSED=YES or PAGE_COMPRESSION_LEVEL, that should be
+ rejected. The value ATOMIC_WRITES=ON (1) would look like
+ ATOMIC_WRITES=OFF, but it would be ignored starting with
+ MariaDB 10.2.4. */
+ compile_time_assert(DICT_TF_POS_PAGE_COMPRESSION == 7);
+ compile_time_assert(DICT_TF_POS_UNUSED == 14);
+
+ if ((type & 0x19f) != 0x101) {
+ /* The table cannot have been created with MariaDB
+ 10.2.2 to 10.2.6, because they would write the
+ low-order bits of SYS_TABLES.TYPE as 0b10xx00001 for
+ PAGE_COMPRESSED=YES. No adjustment is applicable. */
+ } else if (type >= 3 << 13) {
+ /* 10.2.2 and 10.2.3 write ATOMIC_WRITES less than 3,
+ and no other flags above that can be set for the
+ SYS_TABLES.TYPE to be in the 10.2.2..10.2.6 format.
+ This would in any case be invalid format for 10.2 and
+ earlier releases. */
+ ut_ad(!dict_sys_tables_type_valid(type, true));
+ } else {
+ /* SYS_TABLES.TYPE is of the form AALLLL10DB00001. We
+		must still validate that the LLLL bits are between 1
+		and 9 before we can discard the extraneous 0 bit. */
+ ut_ad(!DICT_TF_GET_PAGE_COMPRESSION(type));
+
+ if ((((type >> 9) & 0xf) - 1) < 9) {
+ ut_ad(DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type) & 1);
+
+ type = (type & 0x7fU) | (type >> 1 & ~0x7fU);
+
+ ut_ad(DICT_TF_GET_PAGE_COMPRESSION(type));
+ ut_ad(DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type) >= 1);
+ ut_ad(DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type) <= 9);
+ } else {
+ ut_ad(!dict_sys_tables_type_valid(type, true));
+ }
+ }
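+
+	/* A worked example (the value is illustrative): MariaDB 10.2.2
+	to 10.2.6 would write ROW_FORMAT=DYNAMIC PAGE_COMPRESSED=YES
+	PAGE_COMPRESSION_LEVEL=6 as type = 0xd21
+	= 00 0110 1 0 0 1 00001 (AA LLLL C 0 D B 00001). The test
+	(0xd21 & 0x19f) == 0x101 holds, and
+	(type & 0x7f) | (type >> 1 & ~0x7f) drops the extraneous zero
+	bit, yielding 0x6a1: PAGE_COMPRESSED=YES,
+	PAGE_COMPRESSION_LEVEL=6, ATOMIC_BLOBS, not redundant. */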
+
+ /* The low order bit of SYS_TABLES.TYPE is always set to 1. But in
+	dict_table_t::flags the low order bit is used to determine whether
+	the format is ROW_FORMAT=REDUNDANT (0) or anything else (1).
+ Read the 4 byte N_COLS field and look at the high order bit. It
+ should be set for COMPACT and later. It should not be set for
+ REDUNDANT. */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+ ut_a(len == 4);
+ *n_cols = mach_read_from_4(field);
+
+ const bool not_redundant = 0 != (*n_cols & DICT_N_COLS_COMPACT);
+
+ if (!dict_sys_tables_type_valid(type, not_redundant)) {
+ ib::error() << "Table " << table_name << " in InnoDB"
+ " data dictionary contains invalid flags."
+ " SYS_TABLES.TYPE=" << type <<
+ " SYS_TABLES.N_COLS=" << *n_cols;
+ return(false);
+ }
+
+ *flags = dict_sys_tables_type_to_tf(type, not_redundant);
+
+ /* For tables created before MySQL 4.1, there may be
+ garbage in SYS_TABLES.MIX_LEN where flags2 are found. Such tables
+	would always be in ROW_FORMAT=REDUNDANT, which does not have the
+ high bit set in n_cols, and flags would be zero.
+ MySQL 4.1 was the first version to support innodb_file_per_table,
+ that is, *space_id != 0. */
+ if (not_redundant || *space_id != 0 || *n_cols & DICT_N_COLS_COMPACT) {
+
+ /* Get flags2 from SYS_TABLES.MIX_LEN */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len);
+ *flags2 = mach_read_from_4(field);
+
+ if (!dict_tf2_is_valid(*flags, *flags2)) {
+ ib::error() << "Table " << table_name << " in InnoDB"
+ " data dictionary contains invalid flags."
+ " SYS_TABLES.TYPE=" << type
+ << " SYS_TABLES.MIX_LEN=" << *flags2;
+ return(false);
+ }
+
+ /* DICT_TF2_FTS will be set when indexes are being loaded */
+ *flags2 &= ~DICT_TF2_FTS;
+
+ /* Now that we have used this bit, unset it. */
+ *n_cols &= ~DICT_N_COLS_COMPACT;
+ } else {
+ *flags2 = 0;
+ }
+
+ return(true);
+}
+
+/** Load and check each non-predefined tablespace mentioned in SYS_TABLES.
+Search SYS_TABLES and check each tablespace mentioned that has not
+already been added to the fil_system. If it is valid, add it to the
+fil_system list.
+@return the highest space ID found. */
+static ulint dict_check_sys_tables()
+{
+ ulint max_space_id = 0;
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mtr_t mtr;
+
+ DBUG_ENTER("dict_check_sys_tables");
+
+ ut_d(dict_sys.assert_locked());
+
+ mtr_start(&mtr);
+
+ /* Before traversing SYS_TABLES, let's make sure we have
+ SYS_TABLESPACES and SYS_DATAFILES loaded. */
+ dict_table_t* sys_tablespaces;
+ dict_table_t* sys_datafiles;
+ sys_tablespaces = dict_table_get_low("SYS_TABLESPACES");
+ ut_a(sys_tablespaces != NULL);
+ sys_datafiles = dict_table_get_low("SYS_DATAFILES");
+ ut_a(sys_datafiles != NULL);
+
+ for (rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
+ rec != NULL;
+ mtr.commit(), mtr.start(),
+ rec = dict_getnext_system(&pcur, &mtr)) {
+ const byte* field;
+ ulint len;
+ table_id_t table_id;
+ ulint space_id;
+ ulint n_cols;
+ ulint flags;
+ ulint flags2;
+
+		/* If a table record is not usable, ignore it and continue
+ on to the next record. Error messages were logged. */
+ if (dict_sys_tables_rec_check(rec) != NULL) {
+ continue;
+ }
+
+ /* Copy the table name from rec */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__NAME, &len);
+
+ table_name_t table_name(mem_strdupl((char*) field, len));
+ DBUG_PRINT("dict_check_sys_tables",
+ ("name: %p, '%s'", table_name.m_name,
+ table_name.m_name));
+
+ if (!dict_sys_tables_rec_read(rec, table_name,
+ &table_id, &space_id,
+ &n_cols, &flags, &flags2)
+ || space_id == TRX_SYS_SPACE) {
+next:
+ ut_free(table_name.m_name);
+ continue;
+ }
+
+ if (strstr(table_name.m_name, "/" TEMP_FILE_PREFIX "-")) {
+ /* This table will be dropped by
+ row_mysql_drop_garbage_tables().
+ We do not care if the file exists. */
+ goto next;
+ }
+
+ if (flags2 & DICT_TF2_DISCARDED) {
+ ib::info() << "Ignoring tablespace for " << table_name
+				<< " because the DISCARD flag is set.";
+ goto next;
+ }
+
+ /* For tables or partitions using .ibd files, the flag
+ DICT_TF2_USE_FILE_PER_TABLE was not set in MIX_LEN
+ before MySQL 5.6.5. The flag should not have been
+ introduced in persistent storage. MariaDB will keep
+ setting the flag when writing SYS_TABLES entries for
+ newly created or rebuilt tables or partitions, but
+ will otherwise ignore the flag. */
+
+ /* Now that we have the proper name for this tablespace,
+ look to see if it is already in the tablespace cache. */
+ if (const fil_space_t* space
+ = fil_space_for_table_exists_in_mem(
+ space_id, table_name.m_name, flags)) {
+ /* Recovery can open a datafile that does not
+ match SYS_DATAFILES. If they don't match, update
+ SYS_DATAFILES. */
+ char *dict_path = dict_get_first_path(space_id);
+ const char *fil_path = space->chain.start->name;
+ if (dict_path
+ && strcmp(dict_path, fil_path)) {
+ dict_update_filepath(space_id, fil_path);
+ }
+ ut_free(dict_path);
+ ut_free(table_name.m_name);
+ continue;
+ }
+
+ /* Set the expected filepath from the data dictionary.
+ If the file is found elsewhere (from an ISL or the default
+ location) or this path is the same file but looks different,
+ fil_ibd_open() will update the dictionary with what is
+ opened. */
+ char* filepath = dict_get_first_path(space_id);
+
+ /* Check that the .ibd file exists. */
+ if (!fil_ibd_open(
+ false,
+ !srv_read_only_mode && srv_log_file_size != 0,
+ FIL_TYPE_TABLESPACE,
+ space_id, dict_tf_to_fsp_flags(flags),
+ table_name, filepath)) {
+ ib::warn() << "Ignoring tablespace for "
+ << table_name
+ << " because it could not be opened.";
+ }
+
+ max_space_id = ut_max(max_space_id, space_id);
+
+ ut_free(table_name.m_name);
+ ut_free(filepath);
+ }
+
+ mtr_commit(&mtr);
+
+ DBUG_RETURN(max_space_id);
+}
+
+/** Check each tablespace found in the data dictionary.
+Then look at each table defined in SYS_TABLES that has a space_id > 0
+to find all the file-per-table tablespaces.
+
+In a crash recovery we already have some tablespace objects created from
+processing the REDO log. Any other tablespace in SYS_TABLESPACES not
+previously used in recovery will be opened here. We will compare the
+space_id information in the data dictionary to what we find in the
+tablespace file. In addition, more validation will be done if recovery
+was needed and force_recovery is not set.
+
+We also determine the biggest space id and store it in fil_system. */
+void dict_check_tablespaces_and_store_max_id()
+{
+ mtr_t mtr;
+
+ DBUG_ENTER("dict_check_tablespaces_and_store_max_id");
+
+ dict_sys_lock();
+
+ /* Initialize the max space_id from sys header */
+ mtr.start();
+ ulint max_space_id = mach_read_from_4(DICT_HDR_MAX_SPACE_ID
+ + DICT_HDR
+ + dict_hdr_get(&mtr)->frame);
+ mtr.commit();
+
+ fil_set_max_space_id_if_bigger(max_space_id);
+
+ /* Open all tablespaces referenced in SYS_TABLES.
+ This will update SYS_TABLESPACES and SYS_DATAFILES if it
+ finds any file-per-table tablespaces not already there. */
+ max_space_id = dict_check_sys_tables();
+ fil_set_max_space_id_if_bigger(max_space_id);
+
+ dict_sys_unlock();
+
+ DBUG_VOID_RETURN;
+}
+
+/** Error message for a delete-marked record in dict_load_column_low() */
+static const char* dict_load_column_del = "delete-marked record in SYS_COLUMNS";
+
+/** Load a table column definition from a SYS_COLUMNS record to dict_table_t.
+@return error message
+@retval NULL on success */
+static
+const char*
+dict_load_column_low(
+ dict_table_t* table, /*!< in/out: table, could be NULL
+ if we just populate a dict_column_t
+ struct with information from
+ a SYS_COLUMNS record */
+ mem_heap_t* heap, /*!< in/out: memory heap
+ for temporary storage */
+ dict_col_t* column, /*!< out: dict_column_t to fill,
+ or NULL if table != NULL */
+ table_id_t* table_id, /*!< out: table id */
+ const char** col_name, /*!< out: column name */
+ const rec_t* rec, /*!< in: SYS_COLUMNS record */
+ ulint* nth_v_col) /*!< out: if not NULL, this
+ records the "n" of "nth" virtual
+ column */
+{
+ char* name;
+ const byte* field;
+ ulint len;
+ ulint mtype;
+ ulint prtype;
+ ulint col_len;
+ ulint pos;
+ ulint num_base;
+
+ ut_ad(!table == !!column);
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return(dict_load_column_del);
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_COLUMNS) {
+ return("wrong number of columns in SYS_COLUMNS record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len);
+ if (len != 8) {
+err_len:
+ return("incorrect column length in SYS_COLUMNS");
+ }
+
+ if (table_id) {
+ *table_id = mach_read_from_8(field);
+ } else if (table->id != mach_read_from_8(field)) {
+ return("SYS_COLUMNS.TABLE_ID mismatch");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__POS, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+ pos = mach_read_from_4(field);
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_COLUMNS__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ name = mem_heap_strdupl(heap, (const char*) field, len);
+
+ if (col_name) {
+ *col_name = name;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__MTYPE, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+ mtype = mach_read_from_4(field);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__PRTYPE, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ prtype = mach_read_from_4(field);
+
+ if (dtype_get_charset_coll(prtype) == 0
+ && dtype_is_string_type(mtype)) {
+ /* The table was created with < 4.1.2. */
+
+ if (dtype_is_binary_string_type(mtype, prtype)) {
+ /* Use the binary collation for
+ string columns of binary type. */
+
+ prtype = dtype_form_prtype(
+ prtype,
+ DATA_MYSQL_BINARY_CHARSET_COLL);
+ } else {
+ /* Use the default charset for
+ other than binary columns. */
+
+ prtype = dtype_form_prtype(
+ prtype,
+ data_mysql_default_charset_coll);
+ }
+ }
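+
+	/* dtype_form_prtype() folds the charset-collation number into
+	the high 16 bits of prtype; for example (illustrative), a
+	non-binary string column from a pre-4.1.2 table starts with a
+	zero collation there and has the server default,
+	data_mysql_default_charset_coll, filled in. */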
+
+ if (table && table->n_def != pos && !(prtype & DATA_VIRTUAL)) {
+ return("SYS_COLUMNS.POS mismatch");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__LEN, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ col_len = mach_read_from_4(field);
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__PREC, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ num_base = mach_read_from_4(field);
+
+ if (table) {
+ if (prtype & DATA_VIRTUAL) {
+#ifdef UNIV_DEBUG
+ dict_v_col_t* vcol =
+#endif
+ dict_mem_table_add_v_col(
+ table, heap, name, mtype,
+ prtype, col_len,
+ dict_get_v_col_mysql_pos(pos), num_base);
+ ut_ad(vcol->v_pos == dict_get_v_col_pos(pos));
+ } else {
+ ut_ad(num_base == 0);
+ dict_mem_table_add_col(table, heap, name, mtype,
+ prtype, col_len);
+ }
+ } else {
+ dict_mem_fill_column_struct(column, pos, mtype,
+ prtype, col_len);
+ }
+
+ /* Report the virtual column number */
+ if ((prtype & DATA_VIRTUAL) && nth_v_col != NULL) {
+ *nth_v_col = dict_get_v_col_pos(pos);
+ }
+
+ return(NULL);
+}
+
+/** Error message for a delete-marked record in dict_load_virtual_low() */
+static const char* dict_load_virtual_del = "delete-marked record in SYS_VIRTUAL";
+
+/** Load the mapping of a virtual column to its base columns
+from a SYS_VIRTUAL record
+@param[in,out] table table
+@param[in,out] column mapped base column's dict_column_t
+@param[in,out] table_id table id
+@param[in,out] pos virtual column position
+@param[in,out] base_pos base column position
+@param[in] rec SYS_VIRTUAL record
+@return error message
+@retval NULL on success */
+static
+const char*
+dict_load_virtual_low(
+ dict_table_t* table,
+ dict_col_t** column,
+ table_id_t* table_id,
+ ulint* pos,
+ ulint* base_pos,
+ const rec_t* rec)
+{
+ const byte* field;
+ ulint len;
+ ulint base;
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return(dict_load_virtual_del);
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_VIRTUAL) {
+ return("wrong number of columns in SYS_VIRTUAL record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_VIRTUAL__TABLE_ID, &len);
+ if (len != 8) {
+err_len:
+ return("incorrect column length in SYS_VIRTUAL");
+ }
+
+ if (table_id != NULL) {
+ *table_id = mach_read_from_8(field);
+ } else if (table->id != mach_read_from_8(field)) {
+ return("SYS_VIRTUAL.TABLE_ID mismatch");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_VIRTUAL__POS, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+ if (pos != NULL) {
+ *pos = mach_read_from_4(field);
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_VIRTUAL__BASE_POS, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+ base = mach_read_from_4(field);
+
+ if (base_pos != NULL) {
+ *base_pos = base;
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_VIRTUAL__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_VIRTUAL__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ if (column != NULL) {
+ *column = dict_table_get_nth_col(table, base);
+ }
+
+ return(NULL);
+}
+
+/********************************************************************//**
+Loads definitions for table columns. */
+static
+void
+dict_load_columns(
+/*==============*/
+ dict_table_t* table, /*!< in/out: table */
+ mem_heap_t* heap) /*!< in/out: memory heap
+ for temporary storage */
+{
+ dict_table_t* sys_columns;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ const rec_t* rec;
+ byte* buf;
+ ulint i;
+ mtr_t mtr;
+ ulint n_skipped = 0;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ mtr_start(&mtr);
+
+ sys_columns = dict_table_get_low("SYS_COLUMNS");
+ sys_index = UT_LIST_GET_FIRST(sys_columns->indexes);
+ ut_ad(!dict_table_is_comp(sys_columns));
+
+ ut_ad(name_of_col_is(sys_columns, sys_index,
+ DICT_FLD__SYS_COLUMNS__NAME, "NAME"));
+ ut_ad(name_of_col_is(sys_columns, sys_index,
+ DICT_FLD__SYS_COLUMNS__PREC, "PREC"));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, table->id);
+
+ dfield_set_data(dfield, buf, 8);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ ut_ad(table->n_t_cols == static_cast<ulint>(
+ table->n_cols) + static_cast<ulint>(table->n_v_cols));
+
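+	/* Every user column, including virtual columns, has one
+	SYS_COLUMNS record, while the DATA_N_SYS_COLS system columns
+	(DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR) have none; hence the loop
+	bound below. Delete-marked records are compensated for via
+	n_skipped. */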
+ for (i = 0;
+ i + DATA_N_SYS_COLS < table->n_t_cols + n_skipped;
+ i++) {
+ const char* err_msg;
+ const char* name = NULL;
+ ulint nth_v_col = ULINT_UNDEFINED;
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ ut_a(btr_pcur_is_on_user_rec(&pcur));
+
+ err_msg = dict_load_column_low(table, heap, NULL, NULL,
+ &name, rec, &nth_v_col);
+
+ if (err_msg == dict_load_column_del) {
+ n_skipped++;
+ goto next_rec;
+ } else if (err_msg) {
+ ib::fatal() << err_msg;
+ }
+
+		/* Note: Currently we have one DOC_ID column that is
+		shared by all FTS indexes on a table, and only a
+		non-virtual column can be used for a FULLTEXT index. */
+ if (innobase_strcasecmp(name,
+ FTS_DOC_ID_COL_NAME) == 0
+ && nth_v_col == ULINT_UNDEFINED) {
+ dict_col_t* col;
+ /* As part of normal loading of tables the
+			the table's FTS flag is not set for tables with
+			FTS until after the FTS indexes are loaded. So we
+ create the fts_t instance here if there isn't
+ one already created.
+
+ This case does not arise for table create as
+ the flag is set before the table is created. */
+ if (table->fts == NULL) {
+ table->fts = fts_create(table);
+ }
+
+ ut_a(table->fts->doc_col == ULINT_UNDEFINED);
+
+ col = dict_table_get_nth_col(table, i - n_skipped);
+
+ ut_ad(col->len == sizeof(doc_id_t));
+
+ if (col->prtype & DATA_FTS_DOC_ID) {
+ DICT_TF2_FLAG_SET(
+ table, DICT_TF2_FTS_HAS_DOC_ID);
+ DICT_TF2_FLAG_UNSET(
+ table, DICT_TF2_FTS_ADD_DOC_ID);
+ }
+
+ table->fts->doc_col = i - n_skipped;
+ }
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+}
+
+/** Loads SYS_VIRTUAL info for one virtual column
+@param[in,out] table table
+@param[in] nth_v_col virtual column sequence num
+@param[in,out] v_col virtual column
+@param[in,out] heap memory heap
+*/
+static
+void
+dict_load_virtual_one_col(
+ dict_table_t* table,
+ ulint nth_v_col,
+ dict_v_col_t* v_col,
+ mem_heap_t* heap)
+{
+ dict_table_t* sys_virtual;
+ dict_index_t* sys_virtual_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ const rec_t* rec;
+ byte* buf;
+ ulint i = 0;
+ mtr_t mtr;
+ ulint skipped = 0;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ if (v_col->num_base == 0) {
+ return;
+ }
+
+ mtr_start(&mtr);
+
+ sys_virtual = dict_table_get_low("SYS_VIRTUAL");
+ sys_virtual_index = UT_LIST_GET_FIRST(sys_virtual->indexes);
+ ut_ad(!dict_table_is_comp(sys_virtual));
+
+ ut_ad(name_of_col_is(sys_virtual, sys_virtual_index,
+ DICT_FLD__SYS_VIRTUAL__POS, "POS"));
+
+ tuple = dtuple_create(heap, 2);
+
+ /* table ID field */
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, table->id);
+
+ dfield_set_data(dfield, buf, 8);
+
+ /* virtual column pos field */
+ dfield = dtuple_get_nth_field(tuple, 1);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ ulint vcol_pos = dict_create_v_col_pos(nth_v_col, v_col->m_col.ind);
+ mach_write_to_4(buf, vcol_pos);
+
+ dfield_set_data(dfield, buf, 4);
+
+ dict_index_copy_types(tuple, sys_virtual_index, 2);
+
+ btr_pcur_open_on_user_rec(sys_virtual_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ for (i = 0; i < unsigned{v_col->num_base} + skipped; i++) {
+ const char* err_msg;
+ ulint pos;
+
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ ut_a(btr_pcur_is_on_user_rec(&pcur));
+
+ err_msg = dict_load_virtual_low(table,
+ &v_col->base_col[i - skipped],
+ NULL,
+ &pos, NULL, rec);
+
+ if (err_msg) {
+ if (err_msg != dict_load_virtual_del) {
+ ib::fatal() << err_msg;
+ } else {
+ skipped++;
+ }
+ } else {
+ ut_ad(pos == vcol_pos);
+ }
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+}
+
+/** Loads info from SYS_VIRTUAL for virtual columns.
+@param[in,out] table table
+@param[in] heap memory heap
+*/
+static
+void
+dict_load_virtual(
+ dict_table_t* table,
+ mem_heap_t* heap)
+{
+ for (ulint i = 0; i < table->n_v_cols; i++) {
+ dict_v_col_t* v_col = dict_table_get_nth_v_col(table, i);
+
+ dict_load_virtual_one_col(table, i, v_col, heap);
+ }
+}
+
+/** Error message for a delete-marked record in dict_load_field_low() */
+static const char* dict_load_field_del = "delete-marked record in SYS_FIELDS";
+
+/** Load an index field definition from a SYS_FIELDS record to dict_index_t.
+@return error message
+@retval NULL on success */
+static
+const char*
+dict_load_field_low(
+ byte* index_id, /*!< in/out: index id (8 bytes)
+ an "in" value if index != NULL
+ and "out" if index == NULL */
+ dict_index_t* index, /*!< in/out: index, could be NULL
+ if we just populate a dict_field_t
+ struct with information from
+ a SYS_FIELDS record */
+ dict_field_t* sys_field, /*!< out: dict_field_t to be
+ filled */
+ ulint* pos, /*!< out: Field position */
+ byte* last_index_id, /*!< in: last index id */
+ mem_heap_t* heap, /*!< in/out: memory heap
+ for temporary storage */
+ const rec_t* rec) /*!< in: SYS_FIELDS record */
+{
+ const byte* field;
+ ulint len;
+ unsigned pos_and_prefix_len;
+ unsigned prefix_len;
+ bool first_field;
+ ulint position;
+
+ /* Either index or sys_field is supplied, not both */
+ ut_a((!index) || (!sys_field));
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return(dict_load_field_del);
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FIELDS) {
+ return("wrong number of columns in SYS_FIELDS record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FIELDS__INDEX_ID, &len);
+ if (len != 8) {
+err_len:
+ return("incorrect column length in SYS_FIELDS");
+ }
+
+ if (!index) {
+ ut_a(last_index_id);
+ memcpy(index_id, (const char*) field, 8);
+ first_field = memcmp(index_id, last_index_id, 8) != 0;
+ } else {
+ first_field = (index->n_def == 0);
+ if (memcmp(field, index_id, 8)) {
+ return("SYS_FIELDS.INDEX_ID mismatch");
+ }
+ }
+
+ /* The next field stores the field position in the index and a
+ possible column prefix length if the index field does not
+ contain the whole column. The storage format is like this: if
+ there is at least one prefix field in the index, then the HIGH
+ 2 bytes contain the field number (index->n_def) and the low 2
+ bytes the prefix length for the field. Otherwise the field
+ number (index->n_def) is contained in the 2 LOW bytes. */
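+ /* For example (illustrative): a stored value of 0x0001000A
+ decodes below to field position 1 with a 10-byte column prefix,
+ while a value of 0x0002 on a non-first field decodes to
+ position 2 with no prefix. */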
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FIELDS__POS, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+ pos_and_prefix_len = mach_read_from_4(field);
+
+ if (index && UNIV_UNLIKELY
+ ((pos_and_prefix_len & 0xFFFFUL) != index->n_def
+ && (pos_and_prefix_len >> 16 & 0xFFFF) != index->n_def)) {
+ return("SYS_FIELDS.POS mismatch");
+ }
+
+ if (first_field || pos_and_prefix_len > 0xFFFFUL) {
+ prefix_len = pos_and_prefix_len & 0xFFFFUL;
+ position = (pos_and_prefix_len & 0xFFFF0000UL) >> 16;
+ } else {
+ prefix_len = 0;
+ position = pos_and_prefix_len & 0xFFFFUL;
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_FIELDS__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_FIELDS__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FIELDS__COL_NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ if (index) {
+ dict_mem_index_add_field(
+ index, mem_heap_strdupl(heap, (const char*) field, len),
+ prefix_len);
+ } else {
+ ut_a(sys_field);
+ ut_a(pos);
+
+ sys_field->name = mem_heap_strdupl(
+ heap, (const char*) field, len);
+ sys_field->prefix_len = prefix_len & ((1U << 12) - 1);
+ *pos = position;
+ }
+
+ return(NULL);
+}
+
+/********************************************************************//**
+Loads definitions for index fields.
+@return DB_SUCCESS if ok, DB_CORRUPTION if corruption */
+static
+ulint
+dict_load_fields(
+/*=============*/
+ dict_index_t* index, /*!< in/out: index whose fields to load */
+ mem_heap_t* heap) /*!< in: memory heap for temporary storage */
+{
+ dict_table_t* sys_fields;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ const rec_t* rec;
+ byte* buf;
+ ulint i;
+ mtr_t mtr;
+ dberr_t error;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ mtr_start(&mtr);
+
+ sys_fields = dict_table_get_low("SYS_FIELDS");
+ sys_index = UT_LIST_GET_FIRST(sys_fields->indexes);
+ ut_ad(!dict_table_is_comp(sys_fields));
+ ut_ad(name_of_col_is(sys_fields, sys_index,
+ DICT_FLD__SYS_FIELDS__COL_NAME, "COL_NAME"));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, index->id);
+
+ dfield_set_data(dfield, buf, 8);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ for (i = 0; i < index->n_fields; i++) {
+ const char* err_msg;
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ ut_a(btr_pcur_is_on_user_rec(&pcur));
+
+ err_msg = dict_load_field_low(buf, index, NULL, NULL, NULL,
+ heap, rec);
+
+ if (err_msg == dict_load_field_del) {
+ /* There could be delete-marked records in
+ SYS_FIELDS because SYS_FIELDS.INDEX_ID can be
+ updated by ALTER TABLE ADD INDEX. */
+
+ goto next_rec;
+ } else if (err_msg) {
+ ib::error() << err_msg;
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ error = DB_SUCCESS;
+func_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ return(error);
+}
+
+/** Error message for a delete-marked record in dict_load_index_low() */
+static const char* dict_load_index_del = "delete-marked record in SYS_INDEXES";
+/** Error message for table->id mismatch in dict_load_index_low() */
+static const char* dict_load_index_id_err = "SYS_INDEXES.TABLE_ID mismatch";
+/** Error message for SYS_TABLES flags mismatch in dict_load_table_low() */
+static const char* dict_load_table_flags = "incorrect flags in SYS_TABLES";
+
+/** Load an index definition from a SYS_INDEXES record to dict_index_t.
+If allocate=TRUE, we will create a dict_index_t structure and fill it
+accordingly. If allocate=FALSE, the dict_index_t will be supplied by
+the caller and filled with information read from the record.
+@return error message
+@retval NULL on success */
+static
+const char*
+dict_load_index_low(
+ byte* table_id, /*!< in/out: table id (8 bytes),
+ an "in" value if allocate=TRUE
+ and "out" when allocate=FALSE */
+ mem_heap_t* heap, /*!< in/out: temporary memory heap */
+ const rec_t* rec, /*!< in: SYS_INDEXES record */
+ ibool allocate, /*!< in: TRUE=allocate *index,
+ FALSE=fill in a pre-allocated
+ *index */
+ dict_index_t** index) /*!< out,own: index, or NULL */
+{
+ const byte* field;
+ ulint len;
+ ulint name_len;
+ char* name_buf;
+ index_id_t id;
+ ulint n_fields;
+ ulint type;
+ unsigned merge_threshold;
+
+ if (allocate) {
+ /* If allocate=TRUE, no dict_index_t will
+ be supplied. Initialize "*index" to NULL */
+ *index = NULL;
+ }
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return(dict_load_index_del);
+ }
+
+ if (rec_get_n_fields_old(rec) == DICT_NUM_FIELDS__SYS_INDEXES) {
+ /* MERGE_THRESHOLD exists */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD, &len);
+ switch (len) {
+ case 4:
+ merge_threshold = mach_read_from_4(field);
+ break;
+ case UNIV_SQL_NULL:
+ merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
+ break;
+ default:
+ return("incorrect MERGE_THRESHOLD length"
+ " in SYS_INDEXES");
+ }
+ } else if (rec_get_n_fields_old(rec)
+ == DICT_NUM_FIELDS__SYS_INDEXES - 1) {
+ /* MERGE_THRESHOLD doesn't exist */
+
+ merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
+ } else {
+ return("wrong number of columns in SYS_INDEXES record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len);
+ if (len != 8) {
+err_len:
+ return("incorrect column length in SYS_INDEXES");
+ }
+
+ if (!allocate) {
+ /* We are reading a SYS_INDEXES record. Copy the table_id */
+ memcpy(table_id, (const char*) field, 8);
+ } else if (memcmp(field, table_id, 8)) {
+ /* Caller supplied table_id, verify it is the same
+ id as on the index record */
+ return(dict_load_index_id_err);
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__ID, &len);
+ if (len != 8) {
+ goto err_len;
+ }
+
+ id = mach_read_from_8(field);
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_INDEXES__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_INDEXES__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__NAME, &name_len);
+ if (name_len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ name_buf = mem_heap_strdupl(heap, (const char*) field,
+ name_len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__N_FIELDS, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ n_fields = mach_read_from_4(field);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__TYPE, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ type = mach_read_from_4(field);
+ if (type & (~0U << DICT_IT_BITS)) {
+ return("unknown SYS_INDEXES.TYPE bits");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+ if (allocate) {
+ *index = dict_mem_index_create(NULL, name_buf, type, n_fields);
+ } else {
+ ut_a(*index);
+
+ dict_mem_fill_index_struct(*index, NULL, name_buf,
+ type, n_fields);
+ }
+
+ (*index)->id = id;
+ (*index)->page = mach_read_from_4(field);
+ ut_ad((*index)->page);
+ (*index)->merge_threshold = merge_threshold & ((1U << 6) - 1);
+
+ return(NULL);
+}
+
+/********************************************************************//**
+Loads definitions for table indexes. Adds them to the data dictionary
+cache.
+@return DB_SUCCESS if ok, DB_CORRUPTION if corruption of dictionary
+table or DB_UNSUPPORTED if table has unknown index type */
+static MY_ATTRIBUTE((nonnull))
+dberr_t
+dict_load_indexes(
+/*==============*/
+ dict_table_t* table, /*!< in/out: table */
+ mem_heap_t* heap, /*!< in: memory heap for temporary storage */
+ dict_err_ignore_t ignore_err)
+ /*!< in: error to be ignored when
+ loading the index definition */
+{
+ dict_table_t* sys_indexes;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ const rec_t* rec;
+ byte* buf;
+ mtr_t mtr;
+ dberr_t error = DB_SUCCESS;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ mtr_start(&mtr);
+
+ sys_indexes = dict_table_get_low("SYS_INDEXES");
+ sys_index = UT_LIST_GET_FIRST(sys_indexes->indexes);
+ ut_ad(!dict_table_is_comp(sys_indexes));
+ ut_ad(name_of_col_is(sys_indexes, sys_index,
+ DICT_FLD__SYS_INDEXES__NAME, "NAME"));
+ ut_ad(name_of_col_is(sys_indexes, sys_index,
+ DICT_FLD__SYS_INDEXES__PAGE_NO, "PAGE_NO"));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, table->id);
+
+ dfield_set_data(dfield, buf, 8);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ for (;;) {
+ dict_index_t* index = NULL;
+ const char* err_msg;
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+
+ /* We should allow the table to be opened even
+ without indexes when DICT_ERR_IGNORE_CORRUPT is
+ set. DICT_ERR_IGNORE_CORRUPT is currently only
+ set for DROP TABLE. */
+ if (dict_table_get_first_index(table) == NULL
+ && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)) {
+ ib::warn() << "Cannot load table "
+ << table->name
+ << " because it has no indexes in"
+ " InnoDB internal data dictionary.";
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ if ((ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)
+ && (rec_get_n_fields_old(rec)
+ == DICT_NUM_FIELDS__SYS_INDEXES
+ /* A record from an older SYS_INDEXES table
+ (missing the MERGE_THRESHOLD column) is acceptable. */
+ || rec_get_n_fields_old(rec)
+ == DICT_NUM_FIELDS__SYS_INDEXES - 1)) {
+ const byte* field;
+ ulint len;
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__NAME, &len);
+
+ if (len != UNIV_SQL_NULL
+ && static_cast<char>(*field)
+ == static_cast<char>(*TEMP_INDEX_PREFIX_STR)) {
+ /* Skip indexes whose name starts with
+ TEMP_INDEX_PREFIX_STR, because they will
+ be dropped by row_merge_drop_temp_indexes()
+ during crash recovery. */
+ goto next_rec;
+ }
+ }
+
+ err_msg = dict_load_index_low(buf, heap, rec, TRUE, &index);
+ ut_ad((index == NULL && err_msg != NULL)
+ || (index != NULL && err_msg == NULL));
+
+ if (err_msg == dict_load_index_id_err) {
+ /* TABLE_ID mismatch means that we have
+ run out of index definitions for the table. */
+
+ if (dict_table_get_first_index(table) == NULL
+ && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)) {
+
+ ib::warn() << "Failed to load the"
+ " clustered index for table "
+ << table->name
+ << " because of the following error: "
+ << err_msg << "."
+ " Refusing to load the rest of the"
+ " indexes (if any) and the whole table"
+ " altogether.";
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ break;
+ } else if (err_msg == dict_load_index_del) {
+ /* Skip delete-marked records. */
+ goto next_rec;
+ } else if (err_msg) {
+ ib::error() << err_msg;
+ if (ignore_err & DICT_ERR_IGNORE_CORRUPT) {
+ goto next_rec;
+ }
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ ut_ad(index);
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ /* Check whether the index is corrupted */
+ if (index->is_corrupted()) {
+ ib::error() << "Index " << index->name
+ << " of table " << table->name
+ << " is corrupted";
+
+ if (!srv_load_corrupted
+ && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)
+ && dict_index_is_clust(index)) {
+ dict_mem_index_free(index);
+
+ error = DB_INDEX_CORRUPT;
+ goto func_exit;
+ } else {
+ /* We will load the index if
+ 1) srv_load_corrupted is TRUE,
+ 2) ignore_err is set with
+ DICT_ERR_IGNORE_CORRUPT, or
+ 3) the corrupted index is a
+ secondary index. */
+ ib::info() << "Load corrupted index "
+ << index->name
+ << " of table " << table->name;
+ }
+ }
+
+ if (index->type & DICT_FTS
+ && !dict_table_has_fts_index(table)) {
+ /* This should have been created by now. */
+ ut_a(table->fts != NULL);
+ DICT_TF2_FLAG_SET(table, DICT_TF2_FTS);
+ }
+
+ /* We check for unsupported types first, so that the
+ subsequent checks are relevant for the supported types. */
+ if (index->type & ~(DICT_CLUSTERED | DICT_UNIQUE
+ | DICT_CORRUPT | DICT_FTS
+ | DICT_SPATIAL | DICT_VIRTUAL)) {
+
+ ib::error() << "Unknown type " << index->type
+ << " of index " << index->name
+ << " of table " << table->name;
+
+ error = DB_UNSUPPORTED;
+ dict_mem_index_free(index);
+ goto func_exit;
+ } else if (index->page == FIL_NULL
+ && table->is_readable()
+ && (!(index->type & DICT_FTS))) {
+
+ ib::error() << "Trying to load index " << index->name
+ << " for table " << table->name
+ << ", but the index tree has been freed!";
+
+ if (ignore_err & DICT_ERR_IGNORE_INDEX_ROOT) {
+ /* If the caller can tolerate this error,
+ we will continue to load the index and
+ let the caller deal with the error. However,
+ mark the index and table as corrupted. For
+ this kind of metadata corruption, we only
+ need to mark it in the index dictionary
+ cache, since we can always set it again
+ when loading the dictionary cache. */
+ index->table = table;
+ dict_set_corrupted_index_cache_only(index);
+
+ ib::info() << "Index is corrupt but forcing"
+ " load into data dictionary";
+ } else {
+corrupted:
+ dict_mem_index_free(index);
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+ } else if (!dict_index_is_clust(index)
+ && NULL == dict_table_get_first_index(table)) {
+
+ ib::error() << "Trying to load index " << index->name
+ << " for table " << table->name
+ << ", but the first index is not clustered!";
+
+ goto corrupted;
+ } else if (dict_is_sys_table(table->id)
+ && (dict_index_is_clust(index)
+ || ((table == dict_sys.sys_tables)
+ && !strcmp("ID_IND", index->name)))) {
+
+ /* The index was created in memory already at booting
+ of the database server */
+ dict_mem_index_free(index);
+ } else {
+ dict_load_fields(index, heap);
+ index->table = table;
+
+ /* The data dictionary tables should never contain
+ invalid index definitions. If we ignored this error
+ and simply did not load this index definition, the
+ .frm file would disagree with the index definitions
+ inside InnoDB. */
+ if ((error = dict_index_add_to_cache(index,
+ index->page))
+ != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ ut_ad(table->fts_doc_id_index == NULL);
+
+ if (table->fts != NULL) {
+ table->fts_doc_id_index = dict_table_get_index_on_name(
+ table, FTS_DOC_ID_INDEX_NAME);
+ }
+
+ /* If the table contains FTS indexes, populate table->fts->indexes */
+ if (dict_table_has_fts_index(table)) {
+ ut_ad(table->fts_doc_id_index != NULL);
+ /* table->fts->indexes should have been created. */
+ ut_a(table->fts->indexes != NULL);
+ dict_table_get_all_fts_indexes(table, table->fts->indexes);
+ }
+
+func_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(error);
+}
+
+/** Load a table definition from a SYS_TABLES record to dict_table_t.
+Do not load any columns or indexes.
+@param[in] name Table name
+@param[in] rec SYS_TABLES record
+@param[out,own] table table, or NULL
+@return error message
+@retval NULL on success */
+static const char* dict_load_table_low(const table_name_t& name,
+ const rec_t* rec, dict_table_t** table)
+{
+ table_id_t table_id;
+ ulint space_id;
+ ulint n_cols;
+ ulint t_num;
+ ulint flags;
+ ulint flags2;
+ ulint n_v_col;
+
+ if (const char* error_text = dict_sys_tables_rec_check(rec)) {
+ *table = NULL;
+ return(error_text);
+ }
+
+ if (!dict_sys_tables_rec_read(rec, name, &table_id, &space_id,
+ &t_num, &flags, &flags2)) {
+ *table = NULL;
+ return(dict_load_table_flags);
+ }
+
+ dict_table_decode_n_col(t_num, &n_cols, &n_v_col);
+
+ *table = dict_mem_table_create(
+ name.m_name, NULL, n_cols + n_v_col, n_v_col, flags, flags2);
+ (*table)->space_id = space_id;
+ (*table)->id = table_id;
+ (*table)->file_unreadable = !!(flags2 & DICT_TF2_DISCARDED);
+
+ return(NULL);
+}
+
+/********************************************************************//**
+Using the table->heap, copy the null-terminated filepath into
+table->data_dir_path and replace the 'databasename/tablename.ibd'
+portion with 'tablename'.
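+For example (illustrative), '/mnt/data/db1/t1.ibd' becomes '/mnt/data/t1'.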
+This allows SHOW CREATE TABLE to return the correct DATA DIRECTORY path.
+Save this data directory path only if it has not yet been saved. */
+static
+void
+dict_save_data_dir_path(
+/*====================*/
+ dict_table_t* table, /*!< in/out: table */
+ const char* filepath) /*!< in: filepath of tablespace */
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+ ut_a(DICT_TF_HAS_DATA_DIR(table->flags));
+
+ ut_a(!table->data_dir_path);
+ ut_a(filepath);
+
+ /* Be sure this filepath is not the default filepath. */
+ char* default_filepath = fil_make_filepath(
+ NULL, table->name.m_name, IBD, false);
+ if (default_filepath) {
+ if (0 != strcmp(filepath, default_filepath)) {
+ ulint pathlen = strlen(filepath);
+ ut_a(pathlen < OS_FILE_MAX_PATH);
+ ut_a(0 == strcmp(filepath + pathlen - 4, DOT_IBD));
+
+ table->data_dir_path = mem_heap_strdup(
+ table->heap, filepath);
+ os_file_make_data_dir_path(table->data_dir_path);
+ }
+
+ ut_free(default_filepath);
+ }
+}
+
+/** Make sure the data_dir_path is saved in dict_table_t if DATA DIRECTORY
+was used. Try to read it from the fil_system first, then from SYS_DATAFILES.
+@param[in] table Table object
+@param[in] dict_mutex_own true if dict_sys.mutex is owned already */
+void
+dict_get_and_save_data_dir_path(
+ dict_table_t* table,
+ bool dict_mutex_own)
+{
+ ut_ad(!table->is_temporary());
+ ut_ad(!table->space || table->space->id == table->space_id);
+
+ if (!table->data_dir_path && table->space_id && table->space) {
+ if (!dict_mutex_own) {
+ dict_mutex_enter_for_mysql();
+ }
+
+ table->flags |= 1 << DICT_TF_POS_DATA_DIR
+ & ((1U << DICT_TF_BITS) - 1);
+ dict_save_data_dir_path(table,
+ table->space->chain.start->name);
+
+ if (table->data_dir_path == NULL) {
+ /* Since we did not set the table data_dir_path,
+ unset the flag. This does not change SYS_DATAFILES
+ or SYS_TABLES or FSP_SPACE_FLAGS on the header page
+ of the tablespace, but it makes dict_table_t
+ consistent. */
+ table->flags &= ~DICT_TF_MASK_DATA_DIR
+ & ((1U << DICT_TF_BITS) - 1);
+ }
+
+ if (!dict_mutex_own) {
+ dict_mutex_exit_for_mysql();
+ }
+ }
+}
+
+/** Loads a table definition and all its index definitions, as well as
+the cluster definition if the table is a member of a cluster. Also loads
+all foreign key constraints where the foreign key is in the table or where
+a foreign key references columns in this table.
+@param[in] name Table name in the dbname/tablename format
+@param[in] ignore_err Error to be ignored when loading
+ table and its index definition
+@return table, NULL if it does not exist; if the table is stored in an
+.ibd file, but the file does not exist, then we set the file_unreadable
+flag in the table object we return. */
+dict_table_t* dict_load_table(const char* name, dict_err_ignore_t ignore_err)
+{
+ dict_names_t fk_list;
+ dict_table_t* result;
+ dict_names_t::iterator i;
+
+ DBUG_ENTER("dict_load_table");
+ DBUG_PRINT("dict_load_table", ("loading table: '%s'", name));
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ result = dict_table_check_if_in_cache_low(name);
+
+ if (!result) {
+ result = dict_load_table_one(const_cast<char*>(name),
+ ignore_err, fk_list);
+ while (!fk_list.empty()) {
+ if (!dict_table_check_if_in_cache_low(fk_list.front()))
+ dict_load_table_one(
+ const_cast<char*>(fk_list.front()),
+ ignore_err, fk_list);
+ fk_list.pop_front();
+ }
+ }
+
+ DBUG_RETURN(result);
+}
+
+/** Opens a tablespace for dict_load_table_one()
+@param[in,out] table A table that refers to the tablespace to open
+@param[in] ignore_err Errors to be ignored when loading the tablespace. */
+UNIV_INLINE
+void
+dict_load_tablespace(
+ dict_table_t* table,
+ dict_err_ignore_t ignore_err)
+{
+ ut_ad(!table->is_temporary());
+ ut_ad(!table->space);
+ ut_ad(table->space_id < SRV_SPACE_ID_UPPER_BOUND);
+ ut_ad(fil_system.sys_space);
+
+ if (table->space_id == TRX_SYS_SPACE) {
+ table->space = fil_system.sys_space;
+ return;
+ }
+
+ if (table->flags2 & DICT_TF2_DISCARDED) {
+ ib::warn() << "Tablespace for table " << table->name
+ << " is set as discarded.";
+ table->file_unreadable = true;
+ return;
+ }
+
+ /* The tablespace may already be open. */
+ table->space = fil_space_for_table_exists_in_mem(
+ table->space_id, table->name.m_name, table->flags);
+ if (table->space) {
+ return;
+ }
+
+ if (ignore_err == DICT_ERR_IGNORE_DROP) {
+ table->file_unreadable = true;
+ return;
+ }
+
+ if (!(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)) {
+ ib::error() << "Failed to find tablespace for table "
+ << table->name << " in the cache. Attempting"
+ " to load the tablespace with space id "
+ << table->space_id;
+ }
+
+ /* Use the remote filepath if needed. This parameter is optional
+ in the call to fil_ibd_open(). If not supplied, it will be built
+ from the table->name. */
+ char* filepath = NULL;
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ /* This will set table->data_dir_path from either
+ fil_system or SYS_DATAFILES */
+ dict_get_and_save_data_dir_path(table, true);
+
+ if (table->data_dir_path) {
+ filepath = fil_make_filepath(
+ table->data_dir_path,
+ table->name.m_name, IBD, true);
+ }
+ }
+
+ /* Try to open the tablespace. We set the 2nd param (fix_dict) to
+ false because we do not have an x-lock on dict_sys.latch */
+ table->space = fil_ibd_open(
+ true, false, FIL_TYPE_TABLESPACE, table->space_id,
+ dict_tf_to_fsp_flags(table->flags),
+ table->name, filepath);
+
+ if (!table->space) {
+ /* We failed to find a sensible tablespace file */
+ table->file_unreadable = true;
+ }
+
+ ut_free(filepath);
+}
+
+/** Loads a table definition and all its index definitions.
+
+Loads those foreign key constraints whose referenced table is already in
+the dictionary cache. If a foreign key constraint is not loaded, then the
+referenced table is pushed into the output stack (fk_tables), if it is not
+NULL. These tables must be subsequently loaded so that all the foreign
+key constraints are loaded into memory.
+
+@param[in] name Table name in the db/tablename format
+@param[in] ignore_err Error to be ignored when loading table
+ and its index definition
+@param[out] fk_tables Related table names that must also be
+ loaded to ensure that all foreign key
+ constraints are loaded.
+@return table, NULL if it does not exist; if the table is stored in an
+.ibd file, but the file does not exist, then we set the
+file_unreadable flag in the table object we return */
+static
+dict_table_t*
+dict_load_table_one(
+ const table_name_t& name,
+ dict_err_ignore_t ignore_err,
+ dict_names_t& fk_tables)
+{
+ dberr_t err;
+ dict_table_t* sys_tables;
+ btr_pcur_t pcur;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ mem_heap_t* heap;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ mtr_t mtr;
+
+ DBUG_ENTER("dict_load_table_one");
+ DBUG_PRINT("dict_load_table_one", ("table: %s", name.m_name));
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ heap = mem_heap_create(32000);
+
+ mtr_start(&mtr);
+
+ sys_tables = dict_table_get_low("SYS_TABLES");
+ sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+ ut_ad(!dict_table_is_comp(sys_tables));
+ ut_ad(name_of_col_is(sys_tables, sys_index,
+ DICT_FLD__SYS_TABLES__ID, "ID"));
+ ut_ad(name_of_col_is(sys_tables, sys_index,
+ DICT_FLD__SYS_TABLES__N_COLS, "N_COLS"));
+ ut_ad(name_of_col_is(sys_tables, sys_index,
+ DICT_FLD__SYS_TABLES__TYPE, "TYPE"));
+ ut_ad(name_of_col_is(sys_tables, sys_index,
+ DICT_FLD__SYS_TABLES__MIX_LEN, "MIX_LEN"));
+ ut_ad(name_of_col_is(sys_tables, sys_index,
+ DICT_FLD__SYS_TABLES__SPACE, "SPACE"));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, name.m_name, strlen(name.m_name));
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)
+ || rec_get_deleted_flag(rec, 0)) {
+ /* Not found */
+err_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(NULL);
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__NAME, &len);
+
+ /* Check if the table name in record is the searched one */
+ if (len != strlen(name.m_name)
+ || memcmp(name.m_name, field, len)) {
+
+ goto err_exit;
+ }
+
+ dict_table_t* table;
+ if (const char* err_msg = dict_load_table_low(name, rec, &table)) {
+ if (err_msg != dict_load_table_flags) {
+ ib::error() << err_msg;
+ }
+ goto err_exit;
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ dict_load_tablespace(table, ignore_err);
+
+ dict_load_columns(table, heap);
+
+ dict_load_virtual(table, heap);
+
+ dict_table_add_system_columns(table, heap);
+
+ table->can_be_evicted = true;
+ table->add_to_cache();
+
+ mem_heap_empty(heap);
+
+ ut_ad(dict_tf2_is_valid(table->flags, table->flags2));
+
+ /* If there is no tablespace for the table then we only need to
+ load the index definitions, so that we can IMPORT the tablespace
+ later. When recovering table locks for resurrected incomplete
+ transactions, the tablespace should exist, because DDL operations
+ were not allowed while the table was locked by a transaction. */
+ dict_err_ignore_t index_load_err =
+ !(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)
+ && !table->is_readable()
+ ? DICT_ERR_IGNORE_ALL
+ : ignore_err;
+
+ err = dict_load_indexes(table, heap, index_load_err);
+
+ if (err == DB_INDEX_CORRUPT) {
+ /* Refuse to load the table if the table has a corrupted
+ cluster index */
+ if (!srv_load_corrupted) {
+
+ ib::error() << "Load table " << table->name
+ << " failed, the table has"
+ " corrupted clustered indexes. Turn on"
+ " 'innodb_force_load_corrupted' to drop it";
+ dict_sys.remove(table);
+ table = NULL;
+ goto func_exit;
+ } else {
+ if (table->indexes.start->is_corrupted()) {
+ table->corrupted = true;
+ }
+ }
+ }
+
+ if (err == DB_SUCCESS && table->is_readable()) {
+ const auto root = dict_table_get_first_index(table)->page;
+
+ if (root >= table->space->get_size()) {
+corrupted:
+ table->corrupted = true;
+ table->file_unreadable = true;
+ err = DB_CORRUPTION;
+ } else {
+ const page_id_t page_id(table->space->id, root);
+ mtr.start();
+ buf_block_t* block = buf_page_get(
+ page_id, table->space->zip_size(),
+ RW_S_LATCH, &mtr);
+ const bool corrupted = !block
+ || page_get_space_id(block->frame)
+ != page_id.space()
+ || page_get_page_no(block->frame)
+ != page_id.page_no()
+ || (mach_read_from_2(FIL_PAGE_TYPE
+ + block->frame)
+ != FIL_PAGE_INDEX
+ && mach_read_from_2(FIL_PAGE_TYPE
+ + block->frame)
+ != FIL_PAGE_TYPE_INSTANT);
+ mtr.commit();
+ if (corrupted) {
+ goto corrupted;
+ }
+
+ if (table->supports_instant()) {
+ err = btr_cur_instant_init(table);
+ }
+ }
+ }
+
+ /* Initialize the table's maximum foreign key recursion
+ level. Its value could be changed when dict_load_foreigns()
+ is called below. */
+ table->fk_max_recusive_level = 0;
+
+ /* If the force recovery flag is set, we open the table irrespective
+ of the error condition, since the user may want to dump data from the
+ clustered index. However we load the foreign key information only if
+ all indexes were loaded. */
+ if (!table->is_readable()) {
+ /* Don't attempt to load the indexes from disk. */
+ } else if (err == DB_SUCCESS) {
+ err = dict_load_foreigns(table->name.m_name, NULL,
+ true, true,
+ ignore_err, fk_tables);
+
+ if (err != DB_SUCCESS) {
+ ib::warn() << "Load table " << table->name
+ << " failed, the table has missing"
+ " foreign key indexes. Turn off"
+ " 'foreign_key_checks' and try again.";
+
+ dict_sys.remove(table);
+ table = NULL;
+ } else {
+ dict_mem_table_fill_foreign_vcol_set(table);
+ table->fk_max_recusive_level = 0;
+ }
+ } else {
+ dict_index_t* index;
+
+ /* Make sure that at least the clustered index was loaded.
+ Otherwise refuse to load the table */
+ index = dict_table_get_first_index(table);
+
+ if (!srv_force_recovery
+ || !index
+ || !index->is_primary()) {
+ dict_sys.remove(table);
+ table = NULL;
+ } else if (index->is_corrupted()
+ && table->is_readable()) {
+ /* It is possible that we are forced to load a
+ corrupted clustered index if srv_load_corrupted
+ is set. Mark the table as corrupted in this case. */
+ table->corrupted = true;
+ }
+ }
+
+func_exit:
+ mem_heap_free(heap);
+
+ ut_ad(!table
+ || (ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY)
+ || !table->is_readable()
+ || !table->corrupted);
+
+ if (table && table->fts) {
+ if (!(dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID))) {
+ /* table->fts could have been created in
+ dict_load_column() when a user-defined FTS_DOC_ID
+ column is present, but there is no FTS index. */
+ fts_free(table);
+ } else if (fts_optimize_wq) {
+ fts_optimize_add_table(table);
+ } else if (table->can_be_evicted) {
+ /* fts_optimize_thread is not started yet.
+ So make the table as non-evictable from cache. */
+ dict_sys.prevent_eviction(table);
+ }
+ }
+
+ ut_ad(err != DB_SUCCESS || dict_foreign_set_validate(*table));
+
+ DBUG_RETURN(table);
+}
+
+/***********************************************************************//**
+Loads a table object based on the table id.
+@return table; NULL if table does not exist */
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+ table_id_t table_id, /*!< in: table id */
+ dict_err_ignore_t ignore_err) /*!< in: errors to ignore
+ when loading the table */
+{
+ byte id_buf[8];
+ btr_pcur_t pcur;
+ mem_heap_t* heap;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ dict_index_t* sys_table_ids;
+ dict_table_t* sys_tables;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ dict_table_t* table;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ table = NULL;
+
+ /* NOTE that the operation of this function is protected by
+ the dictionary mutex, and therefore no deadlocks can occur
+ with other dictionary operations. */
+
+ mtr_start(&mtr);
+ /*---------------------------------------------------*/
+ /* Get the secondary index based on ID for table SYS_TABLES */
+ sys_tables = dict_sys.sys_tables;
+ sys_table_ids = dict_table_get_next_index(
+ dict_table_get_first_index(sys_tables));
+ ut_ad(!dict_table_is_comp(sys_tables));
+ ut_ad(!dict_index_is_clust(sys_table_ids));
+ heap = mem_heap_create(256);
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ /* Write the table id in byte format to id_buf */
+ mach_write_to_8(id_buf, table_id);
+
+ dfield_set_data(dfield, id_buf, 8);
+ dict_index_copy_types(tuple, sys_table_ids, 1);
+
+ btr_pcur_open_on_user_rec(sys_table_ids, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (page_rec_is_user_rec(rec)) {
+ /*---------------------------------------------------*/
+ /* Now we have the record in the secondary index
+ containing the table ID and NAME */
+check_rec:
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLE_IDS__ID, &len);
+ ut_ad(len == 8);
+
+ /* Check if the table id in record is the one searched for */
+ if (table_id == mach_read_from_8(field)) {
+ if (rec_get_deleted_flag(rec, 0)) {
+ /* Until purge has completed, there
+ may be delete-marked duplicate records
+ for the same SYS_TABLES.ID, but different
+ SYS_TABLES.NAME. */
+ while (btr_pcur_move_to_next(&pcur, &mtr)) {
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (page_rec_is_user_rec(rec)) {
+ goto check_rec;
+ }
+ }
+ } else {
+ /* Now we get the table name from the record */
+ field = rec_get_nth_field_old(rec,
+ DICT_FLD__SYS_TABLE_IDS__NAME, &len);
+ /* Load the table definition to memory */
+ char* table_name = mem_heap_strdupl(
+ heap, (char*) field, len);
+ table = dict_load_table(table_name, ignore_err);
+ }
+ }
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(table);
+}
+
+/********************************************************************//**
+This function is called when the database is booted. Loads system table
+index definitions except for the clustered index, which is added to the
+dictionary cache at booting before calling this function. */
+void
+dict_load_sys_table(
+/*================*/
+ dict_table_t* table) /*!< in: system table */
+{
+ mem_heap_t* heap;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ heap = mem_heap_create(1000);
+
+ dict_load_indexes(table, heap, DICT_ERR_IGNORE_NONE);
+
+ mem_heap_free(heap);
+}
+
+/********************************************************************//**
+Loads foreign key constraint col names (also for the referenced table).
+Members that must be set (and valid) in foreign:
+foreign->heap
+foreign->n_fields
+foreign->id ('\0'-terminated)
+Members that will be created and set by this function:
+foreign->foreign_col_names[i]
+foreign->referenced_col_names[i]
+(for i=0..foreign->n_fields-1) */
+static
+void
+dict_load_foreign_cols(
+/*===================*/
+ dict_foreign_t* foreign)/*!< in/out: foreign constraint object */
+{
+ dict_table_t* sys_foreign_cols;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ ulint i;
+ mtr_t mtr;
+ size_t id_len;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ id_len = strlen(foreign->id);
+
+ foreign->foreign_col_names = static_cast<const char**>(
+ mem_heap_alloc(foreign->heap,
+ foreign->n_fields * sizeof(void*)));
+
+ foreign->referenced_col_names = static_cast<const char**>(
+ mem_heap_alloc(foreign->heap,
+ foreign->n_fields * sizeof(void*)));
+
+ mtr_start(&mtr);
+
+ sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS");
+
+ sys_index = UT_LIST_GET_FIRST(sys_foreign_cols->indexes);
+ ut_ad(!dict_table_is_comp(sys_foreign_cols));
+
+ tuple = dtuple_create(foreign->heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, foreign->id, id_len);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ for (i = 0; i < foreign->n_fields; i++) {
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ ut_a(btr_pcur_is_on_user_rec(&pcur));
+ ut_a(!rec_get_deleted_flag(rec, 0));
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len);
+
+ if (len != id_len || memcmp(foreign->id, field, len)) {
+ const rec_t* pos;
+ ulint pos_len;
+ const rec_t* for_col_name;
+ ulint for_col_name_len;
+ const rec_t* ref_col_name;
+ ulint ref_col_name_len;
+
+ pos = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__POS,
+ &pos_len);
+
+ for_col_name = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME,
+ &for_col_name_len);
+
+ ref_col_name = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME,
+ &ref_col_name_len);
+
+ ib::fatal sout;
+
+ sout << "Unable to load column names for foreign"
+ " key '" << foreign->id
+ << "' because it was not found in"
+ " InnoDB internal table SYS_FOREIGN_COLS. The"
+ " closest entry we found is:"
+ " (ID='";
+ sout.write(field, len);
+ sout << "', POS=" << mach_read_from_4(pos)
+ << ", FOR_COL_NAME='";
+ sout.write(for_col_name, for_col_name_len);
+ sout << "', REF_COL_NAME='";
+ sout.write(ref_col_name, ref_col_name_len);
+ sout << "')";
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len);
+ ut_a(len == 4);
+ ut_a(i == mach_read_from_4(field));
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len);
+ foreign->foreign_col_names[i] = mem_heap_strdupl(
+ foreign->heap, (char*) field, len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len);
+ foreign->referenced_col_names[i] = mem_heap_strdupl(
+ foreign->heap, (char*) field, len);
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+}
+
+/***********************************************************************//**
+Loads a foreign key constraint to the dictionary cache. If the referenced
+table is not yet loaded, it is added to the output parameter (fk_tables).
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull(1), warn_unused_result))
+dberr_t
+dict_load_foreign(
+/*==============*/
+ const char* id,
+ /*!< in: foreign constraint id, must be
+ '\0'-terminated */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use foreign->foreign_table->col_names */
+ bool check_recursive,
+ /*!< in: whether to record the foreign table
+ parent count to avoid unlimited recursive
+ load of chained foreign tables */
+ bool check_charsets,
+ /*!< in: whether to check charset
+ compatibility */
+ dict_err_ignore_t ignore_err,
+ /*!< in: error to be ignored */
+ dict_names_t& fk_tables)
+ /*!< out: the foreign key constraint is added
+ to the dictionary cache only if the referenced
+ table is already in cache. Otherwise, the
+ foreign key constraint is not added to cache,
+ and the referenced table is added to this
+ stack. */
+{
+ dict_foreign_t* foreign;
+ dict_table_t* sys_foreign;
+ btr_pcur_t pcur;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ mem_heap_t* heap2;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ mtr_t mtr;
+ dict_table_t* for_table;
+ dict_table_t* ref_table;
+ size_t id_len;
+
+ DBUG_ENTER("dict_load_foreign");
+ DBUG_PRINT("dict_load_foreign",
+ ("id: '%s', check_recursive: %d", id, check_recursive));
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ id_len = strlen(id);
+
+ heap2 = mem_heap_create(1000);
+
+ mtr_start(&mtr);
+
+ sys_foreign = dict_table_get_low("SYS_FOREIGN");
+
+ sys_index = UT_LIST_GET_FIRST(sys_foreign->indexes);
+ ut_ad(!dict_table_is_comp(sys_foreign));
+
+ tuple = dtuple_create(heap2, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, id, id_len);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)
+ || rec_get_deleted_flag(rec, 0)) {
+ /* Not found */
+
+ ib::error() << "Cannot load foreign constraint " << id
+ << ": could not find the relevant record in "
+ << "SYS_FOREIGN";
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap2);
+
+ DBUG_RETURN(DB_ERROR);
+ }
+
+ field = rec_get_nth_field_old(rec, DICT_FLD__SYS_FOREIGN__ID, &len);
+
+ /* Check if the id in record is the searched one */
+ if (len != id_len || memcmp(id, field, len)) {
+ {
+ ib::error err;
+ err << "Cannot load foreign constraint " << id
+ << ": found ";
+ err.write(field, len);
+ err << " instead in SYS_FOREIGN";
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap2);
+
+ DBUG_RETURN(DB_ERROR);
+ }
+
+ /* Read the table names and the number of columns associated
+ with the constraint */
+
+ mem_heap_free(heap2);
+
+ foreign = dict_mem_foreign_create();
+
+ uint32_t n_fields_and_type = mach_read_from_4(
+ rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len));
+
+ ut_a(len == 4);
+
+ /* We store the type in the bits 24..29 of n_fields_and_type. */
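+ /* For example (illustrative): a stored value of 0x03000002
+ decodes below to type = 3 and n_fields = 2. */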
+
+ foreign->type = (n_fields_and_type >> 24) & ((1U << 6) - 1);
+ foreign->n_fields = n_fields_and_type & dict_index_t::MAX_N_FIELDS;
+
+ foreign->id = mem_heap_strdupl(foreign->heap, id, id_len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len);
+
+ foreign->foreign_table_name = mem_heap_strdupl(
+ foreign->heap, (char*) field, len);
+ dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+
+ const ulint foreign_table_name_len = len;
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len);
+ foreign->referenced_table_name = mem_heap_strdupl(
+ foreign->heap, (char*) field, len);
+ dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ dict_load_foreign_cols(foreign);
+
+ ref_table = dict_table_check_if_in_cache_low(
+ foreign->referenced_table_name_lookup);
+ for_table = dict_table_check_if_in_cache_low(
+ foreign->foreign_table_name_lookup);
+
+ if (!for_table) {
+ /* To avoid recursively loading the tables related through
+ the foreign key constraints, the child table name is saved
+ here. The child table will be loaded later, along with its
+ foreign key constraint. */
+
+ ut_a(ref_table != NULL);
+ fk_tables.push_back(
+ mem_heap_strdupl(ref_table->heap,
+ foreign->foreign_table_name_lookup,
+ foreign_table_name_len));
+
+ dict_foreign_remove_from_cache(foreign);
+ DBUG_RETURN(DB_SUCCESS);
+ }
+
+ ut_a(for_table || ref_table);
+
+ /* Note that there may already be a foreign constraint object in
+ the dictionary cache for this constraint: then the following
+ call only sets the pointers in it to point to the appropriate table
+ and index objects and frees the newly created object foreign.
+ Adding to the cache should always succeed since we are not creating
+ a new foreign key constraint but loading one from the data
+ dictionary. */
+
+ DBUG_RETURN(dict_foreign_add_to_cache(foreign, col_names,
+ check_charsets,
+ ignore_err));
+}
+
+/***********************************************************************//**
+Loads foreign key constraints where the table is either the foreign key
+holder or is referenced by a foreign key. Adds these constraints to
+the data dictionary.
+
+The foreign key constraint is loaded only if the referenced table is also
+in the dictionary cache. If the referenced table is not in the
+dictionary cache, then it is added to the output parameter (fk_tables).
+
+@return DB_SUCCESS or error code */
+dberr_t
+dict_load_foreigns(
+ const char* table_name, /*!< in: table name */
+ const char** col_names, /*!< in: column names, or NULL
+ to use table->col_names */
+ bool check_recursive,/*!< in: Whether to check
+ recursive load of tables
+ chained by FK */
+ bool check_charsets, /*!< in: whether to check
+ charset compatibility */
+ dict_err_ignore_t ignore_err, /*!< in: error to be ignored */
+ dict_names_t& fk_tables)
+ /*!< out: stack of table
+ names which must be loaded
+ subsequently to load all the
+ foreign key constraints. */
+{
+ ulint tuple_buf[(DTUPLE_EST_ALLOC(1) + sizeof(ulint) - 1)
+ / sizeof(ulint)];
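+ /* The array size rounds DTUPLE_EST_ALLOC(1) bytes up to a
+ whole number of ulint slots; e.g. (illustrative), if
+ DTUPLE_EST_ALLOC(1) were 84 and sizeof(ulint) were 8, this
+ would reserve 11 slots. */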
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ dict_index_t* sec_index;
+ dict_table_t* sys_foreign;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ dberr_t err;
+ mtr_t mtr;
+
+ DBUG_ENTER("dict_load_foreigns");
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ sys_foreign = dict_table_get_low("SYS_FOREIGN");
+
+ if (sys_foreign == NULL) {
+ /* No foreign keys defined yet in this database */
+
+ ib::info() << "No foreign key system tables in the database";
+ DBUG_RETURN(DB_ERROR);
+ }
+
+ ut_ad(!dict_table_is_comp(sys_foreign));
+ mtr_start(&mtr);
+
+ /* Get the secondary index based on FOR_NAME from table
+ SYS_FOREIGN */
+
+ sec_index = dict_table_get_next_index(
+ dict_table_get_first_index(sys_foreign));
+ ut_ad(!dict_index_is_clust(sec_index));
+start_load:
+
+ tuple = dtuple_create_from_mem(tuple_buf, sizeof(tuple_buf), 1, 0);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, table_name, strlen(table_name));
+ dict_index_copy_types(tuple, sec_index, 1);
+
+ btr_pcur_open_on_user_rec(sec_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+loop:
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* End of index */
+
+ goto load_next_index;
+ }
+
+ /* Now we have the record in the secondary index containing a table
+ name and a foreign constraint ID */
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME, &len);
+
+ /* Check if the table name in the record is the one searched for; the
+ following call does the comparison in the latin1_swedish_ci
+ charset-collation, in a case-insensitive way. */
+
+ if (0 != cmp_data_data(dfield_get_type(dfield)->mtype,
+ dfield_get_type(dfield)->prtype,
+ static_cast<const byte*>(
+ dfield_get_data(dfield)),
+ dfield_get_len(dfield),
+ field, len)) {
+
+ goto load_next_index;
+ }
+
+ /* Since table names in SYS_FOREIGN are stored in a case-insensitive
+ order, we have to check that the table name matches also in a binary
+ string comparison. On Unix, MySQL allows table names that only differ
+ in character case. If lower_case_table_names=2 then what is stored
+ may not be the same case, but the previous comparison showed that they
+ match case-insensitively. */
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ goto next_rec;
+ }
+
+ if (innobase_get_lower_case_table_names() != 2
+ && memcmp(field, table_name, len)) {
+ goto next_rec;
+ }
+
+ /* Now we get a foreign key constraint id */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__ID, &len);
+
+ /* Copy the string because the page may be modified or evicted
+ after mtr_commit() below. */
+ char fk_id[MAX_TABLE_NAME_LEN + 1];
+
+ ut_a(len <= MAX_TABLE_NAME_LEN);
+ memcpy(fk_id, field, len);
+ fk_id[len] = '\0';
+
+ btr_pcur_store_position(&pcur, &mtr);
+
+ mtr_commit(&mtr);
+
+ /* Load the foreign constraint definition to the dictionary cache */
+
+ err = dict_load_foreign(fk_id, col_names,
+ check_recursive, check_charsets, ignore_err,
+ fk_tables);
+
+ if (err != DB_SUCCESS) {
+ btr_pcur_close(&pcur);
+
+ DBUG_RETURN(err);
+ }
+
+ mtr_start(&mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ goto loop;
+
+load_next_index:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ sec_index = dict_table_get_next_index(sec_index);
+
+ if (sec_index != NULL) {
+
+ mtr_start(&mtr);
+
+ /* Switch to scanning the index on REF_NAME.
+ fk_max_recusive_level has already been updated while
+ scanning the FOR_NAME index, so there is no need to
+ update it again. */
+ check_recursive = false;
+
+ goto start_load;
+ }
+
+ DBUG_RETURN(DB_SUCCESS);
+}
diff --git a/storage/innobase/dict/dict0mem.cc b/storage/innobase/dict/dict0mem.cc
new file mode 100644
index 00000000..97889e22
--- /dev/null
+++ b/storage/innobase/dict/dict0mem.cc
@@ -0,0 +1,1396 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file dict/dict0mem.cc
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "ha_prototypes.h"
+#include <mysql_com.h>
+
+#include "dict0mem.h"
+#include "rem0rec.h"
+#include "data0type.h"
+#include "mach0data.h"
+#include "dict0dict.h"
+#include "fts0priv.h"
+#include "lock0lock.h"
+#include "sync0sync.h"
+#include "row0row.h"
+#include "sql_string.h"
+#include <iostream>
+
+#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
+ creating a table or index object */
+
+/** System databases */
+static const char* innobase_system_databases[] = {
+ "mysql/",
+ "information_schema/",
+ "performance_schema/",
+ NullS
+};
+
+/** Determine if a table belongs to innobase_system_databases[]
+@param[in] name database_name/table_name
+@return whether the table is a system table: its database is in
+innobase_system_databases[], or the name has no database prefix (SYS_*) */
+static bool dict_mem_table_is_system(const char *name)
+{
+ /* A table name has the format database/table;
+ some system tables are of the form SYS_*. */
+ if (!strchr(name, '/')) {
+ return true;
+ }
+ size_t table_len = strlen(name);
+ const char *system_db;
+ int i = 0;
+ while ((system_db = innobase_system_databases[i++]) != NullS) {
+ size_t len = strlen(system_db);
+ if (table_len > len && !strncmp(name, system_db, len)) {
+ return true;
+ }
+ }
+ return false;
+}
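+/* Examples (illustrative, derived from the checks above):
+dict_mem_table_is_system("SYS_TABLES") returns true (no '/'),
+dict_mem_table_is_system("mysql/user") returns true, and
+dict_mem_table_is_system("test/t1") returns false. */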
+
+/** The start of the table basename suffix for partitioned tables */
+const char table_name_t::part_suffix[4]
+#ifdef _WIN32
+= "#p#";
+#else
+= "#P#";
+#endif
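+/* For example (illustrative): partition p0 of table t1 is stored
+under the basename "t1#P#p0" ("t1#p#p0" on Windows). */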
+
+/** Display an identifier.
+@param[in,out] s output stream
+@param[in] id_name SQL identifier (other than table name)
+@return the output stream */
+std::ostream&
+operator<<(
+ std::ostream& s,
+ const id_name_t& id_name)
+{
+ const char q = '`';
+ const char* c = id_name;
+ s << q;
+ for (; *c != 0; c++) {
+ if (*c == q) {
+ s << *c;
+ }
+ s << *c;
+ }
+ s << q;
+ return(s);
+}
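+/* For example (illustrative): the identifier my`col is printed as
+`my``col`, with the embedded quote doubled by the loop above. */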
+
+/** Display a table name.
+@param[in,out] s output stream
+@param[in] table_name table name
+@return the output stream */
+std::ostream&
+operator<<(
+ std::ostream& s,
+ const table_name_t& table_name)
+{
+ return(s << ut_get_name(NULL, table_name.m_name));
+}
+
+bool dict_col_t::same_encoding(uint16_t a, uint16_t b)
+{
+ if (const CHARSET_INFO *acs= get_charset(a, MYF(MY_WME)))
+ if (const CHARSET_INFO *bcs= get_charset(b, MYF(MY_WME)))
+ return Charset(bcs).encoding_allows_reinterpret_as(acs);
+ return false;
+}
+
+/** Create a table memory object.
+@param name table name
+@param space tablespace
+@param n_cols total number of columns (both virtual and non-virtual)
+@param n_v_cols number of virtual columns
+@param flags table flags
+@param flags2 table flags2
+@return own: table object */
+dict_table_t *dict_mem_table_create(const char *name, fil_space_t *space,
+ ulint n_cols, ulint n_v_cols, ulint flags,
+ ulint flags2)
+{
+ dict_table_t* table;
+ mem_heap_t* heap;
+
+ ut_ad(name);
+ ut_ad(!space
+ || space->purpose == FIL_TYPE_TABLESPACE
+ || space->purpose == FIL_TYPE_TEMPORARY
+ || space->purpose == FIL_TYPE_IMPORT);
+ ut_a(dict_tf2_is_valid(flags, flags2));
+ ut_a(!(flags2 & DICT_TF2_UNUSED_BIT_MASK));
+
+ heap = mem_heap_create(DICT_HEAP_SIZE);
+
+ table = static_cast<dict_table_t*>(
+ mem_heap_zalloc(heap, sizeof(*table)));
+
+ lock_table_lock_list_init(&table->locks);
+
+ UT_LIST_INIT(table->indexes, &dict_index_t::indexes);
+#ifdef BTR_CUR_HASH_ADAPT
+ UT_LIST_INIT(table->freed_indexes, &dict_index_t::indexes);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ table->heap = heap;
+
+ ut_d(table->magic_n = DICT_TABLE_MAGIC_N);
+
+ table->flags = static_cast<unsigned>(flags)
+ & ((1U << DICT_TF_BITS) - 1);
+ table->flags2 = static_cast<unsigned>(flags2)
+ & ((1U << DICT_TF2_BITS) - 1);
+ table->name.m_name = mem_strdup(name);
+ table->is_system_db = dict_mem_table_is_system(table->name.m_name);
+ table->space = space;
+ table->space_id = space ? space->id : ULINT_UNDEFINED;
+ table->n_t_cols = static_cast<unsigned>(n_cols + DATA_N_SYS_COLS)
+ & dict_index_t::MAX_N_FIELDS;
+ table->n_v_cols = static_cast<unsigned>(n_v_cols)
+ & dict_index_t::MAX_N_FIELDS;
+ table->n_cols = static_cast<unsigned>(
+ table->n_t_cols - table->n_v_cols)
+ & dict_index_t::MAX_N_FIELDS;
+
+ table->cols = static_cast<dict_col_t*>(
+ mem_heap_alloc(heap, table->n_cols * sizeof(dict_col_t)));
+ table->v_cols = static_cast<dict_v_col_t*>(
+ mem_heap_alloc(heap, n_v_cols * sizeof(*table->v_cols)));
+ for (ulint i = n_v_cols; i--; ) {
+ new (&table->v_cols[i]) dict_v_col_t();
+ }
+
+ table->autoinc_lock = static_cast<ib_lock_t*>(
+ mem_heap_alloc(heap, lock_get_size()));
+
+ /* If the table has an FTS index or we are in the process
+ of building one, create the table->fts */
+ if (dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+ table->fts = fts_create(table);
+ table->fts->cache = fts_cache_create(table);
+ }
+
+ new(&table->foreign_set) dict_foreign_set();
+ new(&table->referenced_set) dict_foreign_set();
+
+ return(table);
+}
+
+/****************************************************************//**
+Free a table memory object. */
+void
+dict_mem_table_free(
+/*================*/
+ dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(UT_LIST_GET_LEN(table->indexes) == 0);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(UT_LIST_GET_LEN(table->freed_indexes) == 0);
+#endif /* BTR_CUR_HASH_ADAPT */
+ ut_d(table->cached = FALSE);
+
+ if (dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+ if (table->fts) {
+ fts_free(table);
+ }
+ }
+
+ dict_mem_table_free_foreign_vcol_set(table);
+
+ table->foreign_set.~dict_foreign_set();
+ table->referenced_set.~dict_foreign_set();
+
+ ut_free(table->name.m_name);
+ table->name.m_name = NULL;
+
+ /* Clean up virtual index info structures that are registered
+ with virtual columns */
+ for (ulint i = 0; i < table->n_v_def; i++) {
+ dict_table_get_nth_v_col(table, i)->~dict_v_col_t();
+ }
+
+ UT_DELETE(table->s_cols);
+
+ mem_heap_free(table->heap);
+}
+
+/****************************************************************//**
+Append 'name' to 'col_names'. @see dict_table_t::col_names
+@return new column names array */
+static
+const char*
+dict_add_col_name(
+/*==============*/
+ const char* col_names, /*!< in: existing column names, or
+ NULL */
+ ulint cols, /*!< in: number of existing columns */
+ const char* name, /*!< in: new column name */
+ mem_heap_t* heap) /*!< in: heap */
+{
+ ulint old_len;
+ ulint new_len;
+ ulint total_len;
+ char* res;
+
+ ut_ad(!cols == !col_names);
+
+ /* Find out length of existing array. */
+ if (col_names) {
+ const char* s = col_names;
+ ulint i;
+
+ for (i = 0; i < cols; i++) {
+ s += strlen(s) + 1;
+ }
+
+ old_len = unsigned(s - col_names);
+ } else {
+ old_len = 0;
+ }
+
+ new_len = strlen(name) + 1;
+ total_len = old_len + new_len;
+
+ res = static_cast<char*>(mem_heap_alloc(heap, total_len));
+
+ if (old_len > 0) {
+ memcpy(res, col_names, old_len);
+ }
+
+ memcpy(res + old_len, name, new_len);
+
+ return(res);
+}
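+/* For example (illustrative): appending "c" to the existing array
+"a\0b\0" (cols = 2) yields the new array "a\0b\0c\0". */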
+
+/**********************************************************************//**
+Adds a column definition to a table. */
+void
+dict_mem_table_add_col(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */
+ const char* name, /*!< in: column name, or NULL */
+ ulint mtype, /*!< in: main datatype */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision */
+{
+ dict_col_t* col;
+ unsigned i;
+
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(!heap == !name);
+
+ ut_ad(!(prtype & DATA_VIRTUAL));
+
+ i = table->n_def++;
+
+ table->n_t_def++;
+
+ if (name) {
+ if (table->n_def == table->n_cols) {
+ heap = table->heap;
+ }
+ if (i && !table->col_names) {
+ /* All preceding column names are empty. */
+ char* s = static_cast<char*>(
+ mem_heap_zalloc(heap, table->n_def));
+
+ table->col_names = s;
+ }
+
+ table->col_names = dict_add_col_name(table->col_names,
+ i, name, heap);
+ }
+
+ col = dict_table_get_nth_col(table, i);
+
+ dict_mem_fill_column_struct(col, i, mtype, prtype, len);
+
+ switch (prtype & DATA_VERSIONED) {
+ case DATA_VERS_START:
+ ut_ad(!table->vers_start);
+ table->vers_start = i & dict_index_t::MAX_N_FIELDS;
+ break;
+ case DATA_VERS_END:
+ ut_ad(!table->vers_end);
+ table->vers_end = i & dict_index_t::MAX_N_FIELDS;
+ }
+}
+
+/** Adds a virtual column definition to a table.
+@param[in,out] table table
+@param[in,out] heap temporary memory heap, or NULL. It is
+ used to store the name while we have not finished
+ adding all columns. When all columns are
+ added, the whole name list is copied to memory
+ allocated from table->heap
+@param[in] name column name
+@param[in] mtype main datatype
+@param[in] prtype precise type
+@param[in] len length
+@param[in] pos position in a table
+@param[in] num_base number of base columns
+@return the virtual column definition */
+dict_v_col_t*
+dict_mem_table_add_v_col(
+ dict_table_t* table,
+ mem_heap_t* heap,
+ const char* name,
+ ulint mtype,
+ ulint prtype,
+ ulint len,
+ ulint pos,
+ ulint num_base)
+{
+ dict_v_col_t* v_col;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(!heap == !name);
+
+ ut_ad(prtype & DATA_VIRTUAL);
+
+ unsigned i = table->n_v_def++;
+
+ table->n_t_def++;
+
+ if (name != NULL) {
+ if (table->n_v_def == table->n_v_cols) {
+ heap = table->heap;
+ }
+
+ if (i && !table->v_col_names) {
+ /* All preceding column names are empty. */
+ char* s = static_cast<char*>(
+ mem_heap_zalloc(heap, table->n_v_def));
+
+ table->v_col_names = s;
+ }
+
+ table->v_col_names = dict_add_col_name(table->v_col_names,
+ i, name, heap);
+ }
+
+ v_col = &table->v_cols[i];
+
+ dict_mem_fill_column_struct(&v_col->m_col, pos, mtype, prtype, len);
+ v_col->v_pos = i & dict_index_t::MAX_N_FIELDS;
+
+ if (num_base != 0) {
+ v_col->base_col = static_cast<dict_col_t**>(mem_heap_zalloc(
+ table->heap, num_base * sizeof(
+ *v_col->base_col)));
+ } else {
+ v_col->base_col = NULL;
+ }
+
+ v_col->num_base = static_cast<unsigned>(num_base)
+ & dict_index_t::MAX_N_FIELDS;
+
+ /* Initialize the index list for virtual columns */
+ ut_ad(v_col->v_indexes.empty());
+
+ return(v_col);
+}
+
+/** Adds a stored column definition to a table.
+@param[in] table table
+@param[in] num_base number of base columns. */
+void
+dict_mem_table_add_s_col(
+ dict_table_t* table,
+ ulint num_base)
+{
+ unsigned i = unsigned(table->n_def) - 1;
+ dict_col_t* col = dict_table_get_nth_col(table, i);
+ dict_s_col_t s_col;
+
+ ut_ad(col != NULL);
+
+ if (table->s_cols == NULL) {
+ table->s_cols = UT_NEW_NOKEY(dict_s_col_list());
+ }
+
+ s_col.m_col = col;
+ s_col.s_pos = i + table->n_v_def;
+
+ if (num_base != 0) {
+ s_col.base_col = static_cast<dict_col_t**>(mem_heap_zalloc(
+ table->heap, num_base * sizeof(dict_col_t*)));
+ } else {
+ s_col.base_col = NULL;
+ }
+
+ s_col.num_base = num_base;
+ table->s_cols->push_front(s_col);
+}
+
+/**********************************************************************//**
+Renames a column of a table in the data dictionary cache. */
+static MY_ATTRIBUTE((nonnull))
+void
+dict_mem_table_col_rename_low(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ unsigned i, /*!< in: column offset corresponding to s */
+ const char* to, /*!< in: new column name */
+ const char* s, /*!< in: pointer to table->col_names */
+ bool is_virtual)
+ /*!< in: if this is a virtual column */
+{
+ char* t_col_names = const_cast<char*>(
+ is_virtual ? table->v_col_names : table->col_names);
+ ulint n_col = is_virtual ? table->n_v_def : table->n_def;
+
+ size_t from_len = strlen(s), to_len = strlen(to);
+
+ ut_ad(i < table->n_def || is_virtual);
+ ut_ad(i < table->n_v_def || !is_virtual);
+
+ ut_ad(from_len <= NAME_LEN);
+ ut_ad(to_len <= NAME_LEN);
+
+ char from[NAME_LEN + 1];
+ strncpy(from, s, sizeof from - 1);
+ from[sizeof from - 1] = '\0';
+
+ if (from_len == to_len) {
+ /* The easy case: simply replace the column name in
+ table->col_names. */
+ strcpy(const_cast<char*>(s), to);
+ } else {
+ /* We need to adjust all affected index->field
+ pointers, as in dict_index_add_col(). First, copy
+ table->col_names. */
+ ulint prefix_len = ulint(s - t_col_names);
+
+ for (; i < n_col; i++) {
+ s += strlen(s) + 1;
+ }
+
+ ulint full_len = ulint(s - t_col_names);
+ char* col_names;
+
+ if (to_len > from_len) {
+ col_names = static_cast<char*>(
+ mem_heap_alloc(
+ table->heap,
+ full_len + to_len - from_len));
+
+ memcpy(col_names, t_col_names, prefix_len);
+ } else {
+ col_names = const_cast<char*>(t_col_names);
+ }
+
+ memcpy(col_names + prefix_len, to, to_len);
+ memmove(col_names + prefix_len + to_len,
+ t_col_names + (prefix_len + from_len),
+ full_len - (prefix_len + from_len));
+
+ /* Replace the field names in every index. */
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ ulint n_fields = dict_index_get_n_fields(index);
+
+ for (ulint i = 0; i < n_fields; i++) {
+ dict_field_t* field
+ = dict_index_get_nth_field(
+ index, i);
+
+ ut_ad(!field->name
+ == field->col->is_dropped());
+ if (!field->name) {
+ /* dropped columns lack a name */
+ ut_ad(index->is_instant());
+ continue;
+ }
+
+				/* skip the field if its column's
+				virtual-ness does not match is_virtual */
+ if ((!is_virtual) !=
+ (!field->col->is_virtual())) {
+ continue;
+ }
+
+ ulint name_ofs
+ = ulint(field->name - t_col_names);
+ if (name_ofs <= prefix_len) {
+ field->name = col_names + name_ofs;
+ } else {
+ ut_a(name_ofs < full_len);
+ field->name = col_names
+ + name_ofs + to_len - from_len;
+ }
+ }
+ }
+
+ if (is_virtual) {
+ table->v_col_names = col_names;
+ } else {
+ table->col_names = col_names;
+ }
+ }
+
+ /* Virtual columns are not allowed for foreign key */
+ if (is_virtual) {
+ return;
+ }
+
+ dict_foreign_t* foreign;
+
+ /* Replace the field names in every foreign key constraint. */
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (foreign->foreign_index == NULL) {
+			/* We may get here when foreign_key_checks was
+			set to 0, and we then try to rename a column and
+			modify the corresponding foreign key constraint.
+			The index would have been dropped; we have to
+			find an equivalent one */
+ for (unsigned f = 0; f < foreign->n_fields; f++) {
+ if (strcmp(foreign->foreign_col_names[f], from)
+ == 0) {
+
+ char** rc = const_cast<char**>(
+ foreign->foreign_col_names
+ + f);
+
+ if (to_len <= strlen(*rc)) {
+ memcpy(*rc, to, to_len + 1);
+ } else {
+ *rc = static_cast<char*>(
+ mem_heap_dup(
+ foreign->heap,
+ to,
+ to_len + 1));
+ }
+ }
+ }
+
+ /* New index can be null if InnoDB already dropped
+ the foreign index when FOREIGN_KEY_CHECKS is
+ disabled */
+ foreign->foreign_index = dict_foreign_find_index(
+ foreign->foreign_table, NULL,
+ foreign->foreign_col_names,
+ foreign->n_fields, NULL, true, false,
+ NULL, NULL, NULL);
+
+ } else {
+
+ for (unsigned f = 0; f < foreign->n_fields; f++) {
+ /* These can point straight to
+ table->col_names, because the foreign key
+ constraints will be freed at the same time
+ when the table object is freed. */
+ foreign->foreign_col_names[f]
+ = dict_index_get_nth_field(
+ foreign->foreign_index,
+ f)->name;
+ }
+ }
+ }
+
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (!foreign->referenced_index) {
+ /* Referenced index could have been dropped
+ when foreign_key_checks is disabled. In that case,
+ rename the corresponding referenced_col_names and
+ find the equivalent referenced index also */
+ for (unsigned f = 0; f < foreign->n_fields; f++) {
+
+ const char*& rc =
+ foreign->referenced_col_names[f];
+ if (strcmp(rc, from)) {
+ continue;
+ }
+
+ if (to_len <= strlen(rc)) {
+ memcpy(const_cast<char*>(rc), to,
+ to_len + 1);
+ } else {
+ rc = static_cast<char*>(
+ mem_heap_dup(
+ foreign->heap,
+ to, to_len + 1));
+ }
+ }
+
+ /* New index can be null if InnoDB already dropped
+ the referenced index when FOREIGN_KEY_CHECKS is
+ disabled */
+ foreign->referenced_index = dict_foreign_find_index(
+ foreign->referenced_table, NULL,
+ foreign->referenced_col_names,
+ foreign->n_fields, NULL, true, false,
+ NULL, NULL, NULL);
+ return;
+ }
+
+ for (unsigned f = 0; f < foreign->n_fields; f++) {
+ /* foreign->referenced_col_names[] need to be
+ copies, because the constraint may become
+ orphan when foreign_key_checks=0 and the
+ parent table is dropped. */
+
+ const char* col_name = dict_index_get_nth_field(
+ foreign->referenced_index, f)->name;
+
+ if (strcmp(foreign->referenced_col_names[f],
+ col_name)) {
+ char** rc = const_cast<char**>(
+ foreign->referenced_col_names + f);
+ size_t col_name_len_1 = strlen(col_name) + 1;
+
+ if (col_name_len_1 <= strlen(*rc) + 1) {
+ memcpy(*rc, col_name, col_name_len_1);
+ } else {
+ *rc = static_cast<char*>(
+ mem_heap_dup(
+ foreign->heap,
+ col_name,
+ col_name_len_1));
+ }
+ }
+ }
+ }
+}
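+
+/* Worked example for the unequal-length rename path above (illustrative,
+not from the original sources): let table->col_names be "a\0bb\0ccc\0"
+and rename "bb" to "bbbb". Then s points at "bb", prefix_len = 2 and
+full_len = 9. Because to_len (4) > from_len (2), an 11-byte buffer is
+allocated: the 2-byte prefix "a\0" is copied, "bbbb" is written at
+offset 2, and the tail "\0ccc\0" is moved in after it, yielding
+"a\0bbbb\0ccc\0". An index field whose name offset is <= prefix_len
+keeps its offset, while later fields shift by to_len - from_len = 2. */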
+
+/**********************************************************************//**
+Renames a column of a table in the data dictionary cache. */
+void
+dict_mem_table_col_rename(
+/*======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ulint nth_col,/*!< in: column index */
+ const char* from, /*!< in: old column name */
+ const char* to, /*!< in: new column name */
+ bool is_virtual)
+ /*!< in: if this is a virtual column */
+{
+ const char* s = is_virtual ? table->v_col_names : table->col_names;
+
+ ut_ad((!is_virtual && nth_col < table->n_def)
+ || (is_virtual && nth_col < table->n_v_def));
+
+ for (ulint i = 0; i < nth_col; i++) {
+ size_t len = strlen(s);
+ ut_ad(len > 0);
+ s += len + 1;
+ }
+
+ ut_ad(!my_strcasecmp(system_charset_info, from, s));
+
+ dict_mem_table_col_rename_low(table, static_cast<unsigned>(nth_col),
+ to, s, is_virtual);
+}
+
+/**********************************************************************//**
+This function populates a dict_col_t memory structure with
+supplied information. */
+void
+dict_mem_fill_column_struct(
+/*========================*/
+ dict_col_t* column, /*!< out: column struct to be
+ filled */
+ ulint col_pos, /*!< in: column position */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint col_len) /*!< in: column length */
+{
+ unsigned mbminlen, mbmaxlen;
+
+ column->ind = static_cast<unsigned>(col_pos)
+ & dict_index_t::MAX_N_FIELDS;
+ column->ord_part = 0;
+ column->max_prefix = 0;
+ column->mtype = static_cast<uint8_t>(mtype);
+ column->prtype = static_cast<unsigned>(prtype);
+ column->len = static_cast<uint16_t>(col_len);
+ dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen);
+ column->mbminlen = mbminlen & 7;
+ column->mbmaxlen = mbmaxlen & 7;
+ column->def_val.data = NULL;
+ column->def_val.len = UNIV_SQL_DEFAULT;
+ ut_ad(!column->is_dropped());
+}
+
+/**********************************************************************//**
+Creates an index memory object.
+@return own: index object */
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+ dict_table_t* table, /*!< in: table */
+ const char* index_name, /*!< in: index name */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields) /*!< in: number of fields */
+{
+ dict_index_t* index;
+ mem_heap_t* heap;
+
+ ut_ad(!table || table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(index_name);
+
+ heap = mem_heap_create(DICT_HEAP_SIZE);
+
+ index = static_cast<dict_index_t*>(
+ mem_heap_zalloc(heap, sizeof(*index)));
+ index->table = table;
+
+ dict_mem_fill_index_struct(index, heap, index_name, type, n_fields);
+
+ new (&index->zip_pad.mutex) std::mutex();
+
+ if (type & DICT_SPATIAL) {
+ index->rtr_track = new
+ (mem_heap_alloc(heap, sizeof *index->rtr_track))
+ rtr_info_track_t();
+ mutex_create(LATCH_ID_RTR_ACTIVE_MUTEX,
+ &index->rtr_track->rtr_active_mutex);
+ }
+
+ return(index);
+}
+
+/**********************************************************************//**
+Creates and initializes a foreign constraint memory object.
+@return own: foreign constraint struct */
+dict_foreign_t*
+dict_mem_foreign_create(void)
+/*=========================*/
+{
+ dict_foreign_t* foreign;
+ mem_heap_t* heap;
+ DBUG_ENTER("dict_mem_foreign_create");
+
+ heap = mem_heap_create(100);
+
+ foreign = static_cast<dict_foreign_t*>(
+ mem_heap_zalloc(heap, sizeof(dict_foreign_t)));
+
+ foreign->heap = heap;
+
+ foreign->v_cols = NULL;
+
+ DBUG_PRINT("dict_mem_foreign_create", ("heap: %p", heap));
+
+ DBUG_RETURN(foreign);
+}
+
+/**********************************************************************//**
+Sets the foreign_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup
+will point to foreign_table_name. If 2, then another string is
+allocated from foreign->heap and set to lower case. */
+void
+dict_mem_foreign_table_name_lookup_set(
+/*===================================*/
+ dict_foreign_t* foreign, /*!< in/out: foreign struct */
+ ibool do_alloc) /*!< in: is an alloc needed */
+{
+ if (innobase_get_lower_case_table_names() == 2) {
+ if (do_alloc) {
+ ulint len;
+
+ len = strlen(foreign->foreign_table_name) + 1;
+
+ foreign->foreign_table_name_lookup =
+ static_cast<char*>(
+ mem_heap_alloc(foreign->heap, len));
+ }
+ strcpy(foreign->foreign_table_name_lookup,
+ foreign->foreign_table_name);
+ innobase_casedn_str(foreign->foreign_table_name_lookup);
+ } else {
+ foreign->foreign_table_name_lookup
+ = foreign->foreign_table_name;
+ }
+}
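+
+/* For example (illustrative): with lower_case_table_names=2 and
+foreign_table_name "Test/Child", foreign_table_name_lookup becomes a
+lower-cased copy "test/child"; with lower_case_table_names=0 or 1 it
+simply points at foreign_table_name itself. */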
+
+/**********************************************************************//**
+Sets the referenced_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup
+will point to referenced_table_name. If 2, then another string is
+allocated from foreign->heap and set to lower case. */
+void
+dict_mem_referenced_table_name_lookup_set(
+/*======================================*/
+ dict_foreign_t* foreign, /*!< in/out: foreign struct */
+ ibool do_alloc) /*!< in: is an alloc needed */
+{
+ if (innobase_get_lower_case_table_names() == 2) {
+ if (do_alloc) {
+ ulint len;
+
+ len = strlen(foreign->referenced_table_name) + 1;
+
+ foreign->referenced_table_name_lookup =
+ static_cast<char*>(
+ mem_heap_alloc(foreign->heap, len));
+ }
+ strcpy(foreign->referenced_table_name_lookup,
+ foreign->referenced_table_name);
+ innobase_casedn_str(foreign->referenced_table_name_lookup);
+ } else {
+ foreign->referenced_table_name_lookup
+ = foreign->referenced_table_name;
+ }
+}
+
+/** Fill the virtual column set with virtual column information
+present in the given virtual index.
+@param[in] index virtual index
+@param[out] v_cols virtual column set. */
+static
+void
+dict_mem_fill_vcol_has_index(
+ const dict_index_t* index,
+ dict_vcol_set** v_cols)
+{
+ for (ulint i = 0; i < index->table->n_v_cols; i++) {
+ dict_v_col_t* v_col = dict_table_get_nth_v_col(
+ index->table, i);
+ if (!v_col->m_col.ord_part) {
+ continue;
+ }
+
+ for (const auto& v_idx : v_col->v_indexes) {
+ if (v_idx.index != index) {
+ continue;
+ }
+
+ if (*v_cols == NULL) {
+ *v_cols = UT_NEW_NOKEY(dict_vcol_set());
+ }
+
+ (*v_cols)->insert(v_col);
+ }
+ }
+}
+
+/** Fill the virtual column set with the virtual columns of the index
+if the index contains the given column name.
+@param[in] col_name column name
+@param[in] table innodb table object
+@param[out] v_cols set of virtual column information. */
+static
+void
+dict_mem_fill_vcol_from_v_indexes(
+ const char* col_name,
+ const dict_table_t* table,
+ dict_vcol_set** v_cols)
+{
+	/* a virtual column cannot be part of the primary key,
+	so start with the first secondary index */
+ for (dict_index_t* index = dict_table_get_next_index(
+ dict_table_get_first_index(table));
+ index;
+ index = dict_table_get_next_index(index)) {
+
+		/* Skip if the index has a newly added
+		virtual column, because its field name is NULL.
+		The virtual column set will be refreshed
+		later, during loading of the table. */
+ if (!dict_index_has_virtual(index)
+ || index->has_new_v_col()) {
+ continue;
+ }
+
+ for (ulint i = 0; i < index->n_fields; i++) {
+ dict_field_t* field =
+ dict_index_get_nth_field(index, i);
+
+ if (strcmp(field->name, col_name) == 0) {
+ dict_mem_fill_vcol_has_index(
+ index, v_cols);
+ }
+ }
+ }
+}
+
+/** Fill the virtual column set with the virtual columns that have the
+given col_name as one of their base columns
+@param[in] col_name column name
+@param[in] table table object
+@param[out] v_cols set of virtual columns. */
+static
+void
+dict_mem_fill_vcol_set_for_base_col(
+ const char* col_name,
+ const dict_table_t* table,
+ dict_vcol_set** v_cols)
+{
+ for (ulint i = 0; i < table->n_v_cols; i++) {
+ dict_v_col_t* v_col = dict_table_get_nth_v_col(table, i);
+
+ if (!v_col->m_col.ord_part) {
+ continue;
+ }
+
+ for (ulint j = 0; j < unsigned{v_col->num_base}; j++) {
+ if (strcmp(col_name, dict_table_get_col_name(
+ table,
+ v_col->base_col[j]->ind)) == 0) {
+
+ if (*v_cols == NULL) {
+ *v_cols = UT_NEW_NOKEY(dict_vcol_set());
+ }
+
+ (*v_cols)->insert(v_col);
+ }
+ }
+ }
+}
+
+/** Fills the set of dependent virtual columns.
+A virtual column is dependent when
+1) the FK is present on a base column of the virtual column, or
+2) the FK is present on a column that is part of a virtual index.
+@param[in,out] foreign foreign key information. */
+void
+dict_mem_foreign_fill_vcol_set(
+ dict_foreign_t* foreign)
+{
+ ulint type = foreign->type;
+
+ if (type == 0) {
+ return;
+ }
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ /** FK can be present on base columns
+ of virtual columns. */
+ dict_mem_fill_vcol_set_for_base_col(
+ foreign->foreign_col_names[i],
+ foreign->foreign_table,
+ &foreign->v_cols);
+
+ /** FK can be present on the columns
+ which can be a part of virtual index. */
+ dict_mem_fill_vcol_from_v_indexes(
+ foreign->foreign_col_names[i],
+ foreign->foreign_table,
+ &foreign->v_cols);
+ }
+}
+
+/** Fill the virtual column set in each fk constraint present in the table.
+@param[in,out] table innodb table object. */
+void
+dict_mem_table_fill_foreign_vcol_set(
+ dict_table_t* table)
+{
+ dict_foreign_set fk_set = table->foreign_set;
+ dict_foreign_t* foreign;
+
+ dict_foreign_set::iterator it;
+ for (it = fk_set.begin(); it != fk_set.end(); ++it) {
+ foreign = *it;
+
+ dict_mem_foreign_fill_vcol_set(foreign);
+ }
+}
+
+/** Free the vcol_set from all foreign key constraints on the table.
+@param[in,out] table innodb table object. */
+void
+dict_mem_table_free_foreign_vcol_set(
+ dict_table_t* table)
+{
+ dict_foreign_set fk_set = table->foreign_set;
+ dict_foreign_t* foreign;
+
+ dict_foreign_set::iterator it;
+ for (it = fk_set.begin(); it != fk_set.end(); ++it) {
+
+ foreign = *it;
+
+ if (foreign->v_cols != NULL) {
+ UT_DELETE(foreign->v_cols);
+ foreign->v_cols = NULL;
+ }
+ }
+}
+
+/**********************************************************************//**
+Adds a field definition to an index. NOTE: does not take a copy
+of the column name if the field is a column. The memory occupied
+by the column name may be released only after publishing the index. */
+void
+dict_mem_index_add_field(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ const char* name, /*!< in: column name */
+ ulint prefix_len) /*!< in: 0 or the column prefix length
+ in a MySQL index like
+ INDEX (textcol(25)) */
+{
+ dict_field_t* field;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ index->n_def++;
+
+ field = dict_index_get_nth_field(index, unsigned(index->n_def) - 1);
+
+ field->name = name;
+ field->prefix_len = prefix_len & ((1U << 12) - 1);
+}
+
+/**********************************************************************//**
+Frees an index memory object. */
+void
+dict_mem_index_free(
+/*================*/
+ dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ index->zip_pad.mutex.~mutex();
+
+ if (dict_index_is_spatial(index)) {
+ for (auto& rtr_info : index->rtr_track->rtr_active) {
+ rtr_info->index = NULL;
+ }
+
+ mutex_destroy(&index->rtr_track->rtr_active_mutex);
+ index->rtr_track->~rtr_info_track_t();
+ }
+
+ index->detach_columns();
+ mem_heap_free(index->heap);
+}
+
+/** Create a temporary tablename like "#sql-ibNNN".
+@param[in] heap A memory heap
+@param[in] dbtab Table name in the form database/table name
+@param[in] id Table id
+@return A unique temporary tablename suitable for InnoDB use */
+char*
+dict_mem_create_temporary_tablename(
+ mem_heap_t* heap,
+ const char* dbtab,
+ table_id_t id)
+{
+ size_t size;
+ char* name;
+ const char* dbend = strchr(dbtab, '/');
+ ut_ad(dbend);
+ size_t dblen = size_t(dbend - dbtab) + 1;
+
+ size = dblen + (sizeof(TEMP_FILE_PREFIX) + 3 + 20);
+ name = static_cast<char*>(mem_heap_alloc(heap, size));
+ memcpy(name, dbtab, dblen);
+ snprintf(name + dblen, size - dblen,
+ TEMP_FILE_PREFIX_INNODB UINT64PF, id);
+
+ return(name);
+}
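+
+/* For example (illustrative values): for dbtab = "test/t1" and id = 42,
+the function keeps the "test/" prefix and produces something like
+"test/#sql-ib42", assuming TEMP_FILE_PREFIX_INNODB expands to "#sql-ib"
+as the "#sql-ibNNN" pattern in the comment above suggests. */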
+
+/** Validate the search order in the foreign key set.
+@param[in] fk_set the foreign key set to be validated
+@return true if search order is fine in the set, false otherwise. */
+bool
+dict_foreign_set_validate(
+ const dict_foreign_set& fk_set)
+{
+ dict_foreign_not_exists not_exists(fk_set);
+
+ dict_foreign_set::const_iterator it = std::find_if(
+ fk_set.begin(), fk_set.end(), not_exists);
+
+ if (it == fk_set.end()) {
+ return(true);
+ }
+
+ dict_foreign_t* foreign = *it;
+ std::cerr << "Foreign key lookup failed: " << *foreign;
+ std::cerr << fk_set;
+ ut_ad(0);
+ return(false);
+}
+
+/** Validate the search order in the foreign key sets of the table
+(foreign_set and referenced_set).
+@param[in] table table whose foreign key sets are to be validated
+@return true if foreign key sets are fine, false otherwise. */
+bool
+dict_foreign_set_validate(
+ const dict_table_t& table)
+{
+ return(dict_foreign_set_validate(table.foreign_set)
+ && dict_foreign_set_validate(table.referenced_set));
+}
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_t& foreign)
+{
+ out << "[dict_foreign_t: id='" << foreign.id << "'";
+
+ if (foreign.foreign_table_name != NULL) {
+ out << ",for: '" << foreign.foreign_table_name << "'";
+ }
+
+ out << "]";
+ return(out);
+}
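+
+/* For example (illustrative), a constraint with id "fk_1" whose foreign
+table is "test/child" prints as:
+[dict_foreign_t: id='fk_1',for: 'test/child'] */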
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_set& fk_set)
+{
+ out << "[dict_foreign_set:";
+ std::for_each(fk_set.begin(), fk_set.end(), dict_foreign_print(out));
+ out << "]" << std::endl;
+ return(out);
+}
+
+/** Check whether a fulltext index is affected by this foreign
+key constraint. */
+bool dict_foreign_t::affects_fulltext() const
+{
+ if (foreign_table == referenced_table || !foreign_table->fts)
+ return false;
+
+ for (ulint i= 0; i < n_fields; i++)
+ {
+ const dict_col_t *col= dict_index_get_nth_col(foreign_index, i);
+ if (dict_table_is_fts_column(foreign_table->fts->indexes, col->ind,
+ col->is_virtual()) != ULINT_UNDEFINED)
+ return true;
+ }
+
+ return false;
+}
+
+/** Reconstruct the clustered index fields. */
+inline void dict_index_t::reconstruct_fields()
+{
+ DBUG_ASSERT(is_primary());
+
+ n_fields = (n_fields + table->instant->n_dropped)
+ & dict_index_t::MAX_N_FIELDS;
+ n_def = (n_def + table->instant->n_dropped)
+ & dict_index_t::MAX_N_FIELDS;
+
+ const unsigned n_first = first_user_field();
+
+ dict_field_t* tfields = static_cast<dict_field_t*>(
+ mem_heap_zalloc(heap, n_fields * sizeof *fields));
+
+ memcpy(tfields, fields, n_first * sizeof *fields);
+
+ n_nullable = 0;
+ ulint n_core_null = 0;
+ const bool comp = dict_table_is_comp(table);
+ const auto* field_map_it = table->instant->field_map;
+ for (unsigned i = n_first, j = 0; i < n_fields; ) {
+ dict_field_t& f = tfields[i++];
+ auto c = *field_map_it++;
+ if (c.is_dropped()) {
+ f.col = &table->instant->dropped[j++];
+ DBUG_ASSERT(f.col->is_dropped());
+ f.fixed_len = dict_col_get_fixed_size(f.col, comp)
+ & ((1U << 10) - 1);
+ } else {
+ DBUG_ASSERT(!c.is_not_null());
+ const auto old = std::find_if(
+ fields + n_first, fields + n_fields,
+ [c](const dict_field_t& o)
+ { return o.col->ind == c.ind(); });
+ ut_ad(old >= &fields[n_first]);
+ ut_ad(old < &fields[n_fields]);
+ DBUG_ASSERT(!old->prefix_len);
+ DBUG_ASSERT(old->col == &table->cols[c.ind()]);
+ f = *old;
+ }
+
+ f.col->clear_instant();
+ if (f.col->is_nullable()) {
+ n_nullable++;
+ n_core_null += i <= n_core_fields;
+ }
+ }
+
+ fields = tfields;
+ n_core_null_bytes = static_cast<byte>(UT_BITS_IN_BYTES(n_core_null));
+}
+
+/** Reconstruct dropped or reordered columns.
+@param[in] metadata data from serialise_columns()
+@param[in] len length of the metadata, in bytes
+@return whether parsing the metadata failed */
+bool dict_table_t::deserialise_columns(const byte* metadata, ulint len)
+{
+ DBUG_ASSERT(!instant);
+
+ unsigned num_non_pk_fields = mach_read_from_4(metadata);
+ metadata += 4;
+
+ if (num_non_pk_fields >= REC_MAX_N_FIELDS - 3) {
+ return true;
+ }
+
+ dict_index_t* index = UT_LIST_GET_FIRST(indexes);
+
+ if (num_non_pk_fields < unsigned(index->n_fields)
+ - index->first_user_field()) {
+ return true;
+ }
+
+ field_map_element_t* field_map = static_cast<field_map_element_t*>(
+ mem_heap_alloc(heap,
+ num_non_pk_fields * sizeof *field_map));
+
+ unsigned n_dropped_cols = 0;
+
+ for (unsigned i = 0; i < num_non_pk_fields; i++) {
+ auto c = field_map[i] = mach_read_from_2(metadata);
+ metadata += 2;
+
+ if (field_map[i].is_dropped()) {
+ if (c.ind() > DICT_MAX_FIXED_COL_LEN + 1) {
+ return true;
+ }
+ n_dropped_cols++;
+ } else if (c >= n_cols) {
+ return true;
+ }
+ }
+
+ dict_col_t* dropped_cols = static_cast<dict_col_t*>(mem_heap_zalloc(
+ heap, n_dropped_cols * sizeof(dict_col_t)));
+ instant = new (mem_heap_alloc(heap, sizeof *instant)) dict_instant_t();
+ instant->n_dropped = n_dropped_cols;
+ instant->dropped = dropped_cols;
+ instant->field_map = field_map;
+
+ dict_col_t* col = dropped_cols;
+ for (unsigned i = 0; i < num_non_pk_fields; i++) {
+ if (field_map[i].is_dropped()) {
+ auto fixed_len = field_map[i].ind();
+ DBUG_ASSERT(fixed_len <= DICT_MAX_FIXED_COL_LEN + 1);
+ (col++)->set_dropped(field_map[i].is_not_null(),
+ fixed_len == 1,
+ fixed_len > 1 ? fixed_len - 1
+ : 0);
+ }
+ }
+ DBUG_ASSERT(col == &dropped_cols[n_dropped_cols]);
+
+ UT_LIST_GET_FIRST(indexes)->reconstruct_fields();
+ return false;
+}
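+
+/* Sketch of the metadata layout parsed above (as implied by the reads;
+not an authoritative format description): a 4-byte count of non-PK
+fields, followed by one 2-byte field_map_element_t per field. For a
+dropped element, ind() encodes length information (values above 1 carry
+a fixed length of ind() - 1, per the set_dropped() call above); for a
+surviving element the value is the column's position in table->cols. */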
+
+/** Check if record in clustered index is historical row.
+@param[in] rec clustered row
+@param[in] offsets offsets
+@return true if row is historical */
+bool
+dict_index_t::vers_history_row(
+ const rec_t* rec,
+ const rec_offs* offsets)
+{
+ ut_ad(is_primary());
+
+ ulint len;
+ dict_col_t& col= table->cols[table->vers_end];
+ ut_ad(col.vers_sys_end());
+ ulint nfield = dict_col_get_clust_pos(&col, this);
+ const byte *data = rec_get_nth_field(rec, offsets, nfield, &len);
+ if (col.vers_native()) {
+ ut_ad(len == sizeof trx_id_max_bytes);
+ return 0 != memcmp(data, trx_id_max_bytes, len);
+ }
+ ut_ad(len == sizeof timestamp_max_bytes);
+ return 0 != memcmp(data, timestamp_max_bytes, len);
+}
+
+/** Check if record in secondary index is historical row.
+@param[in] rec record in a secondary index
+@param[out] history_row true if row is historical
+@return true on error */
+bool
+dict_index_t::vers_history_row(
+ const rec_t* rec,
+ bool &history_row)
+{
+ ut_ad(!is_primary());
+
+ bool error = false;
+ mem_heap_t* heap = NULL;
+ dict_index_t* clust_index = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ mtr_t mtr;
+ mtr.start();
+
+ rec_t* clust_rec =
+ row_get_clust_rec(BTR_SEARCH_LEAF, rec, this, &clust_index, &mtr);
+ if (clust_rec) {
+ offsets = rec_get_offsets(clust_rec, clust_index, offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ history_row = clust_index->vers_history_row(clust_rec, offsets);
+ } else {
+ ib::error() << "foreign constraints: secondary index is out of "
+ "sync";
+ ut_ad("secondary index is out of sync" == 0);
+ error = true;
+ }
+ mtr.commit();
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(error);
+}
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
new file mode 100644
index 00000000..42f75252
--- /dev/null
+++ b/storage/innobase/dict/dict0stats.cc
@@ -0,0 +1,4306 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0stats.cc
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#include "dict0stats.h"
+#include "ut0ut.h"
+#include "ut0rnd.h"
+#include "dyn0buf.h"
+#include "row0sel.h"
+#include "trx0trx.h"
+#include "pars0pars.h"
+#include <mysql_com.h>
+#include "btr0btr.h"
+#include "sync0sync.h"
+
+#include <algorithm>
+#include <map>
+#include <vector>
+
+/* Sampling algorithm description @{
+
+The algorithm is controlled by one number - N_SAMPLE_PAGES(index),
+let it be A, which is the number of leaf pages to analyze for a given index
+for each n-prefix (if the index is on 3 columns, then 3*A leaf pages will be
+analyzed).
+
+Let the total number of leaf pages in the table be T.
+Level 0 - leaf pages, level H - root.
+
+Definition: an n-prefix-boring record is a record on a non-leaf page that
+equals the next (to the right, crossing page boundaries, skipping the supremum
+and infimum) record on the same level when looking at the first n-prefix
+columns.
+The last (user) record on a level is not boring (it does not match the
+non-existent user record to the right). We call the records boring because all
+the records on the page below a boring record are equal to that boring record.
+
+We avoid diving below boring records when searching for a leaf page to
+estimate the number of distinct records because we know that such a leaf
+page will have number of distinct records == 1.
+
+For each n-prefix: start from the root level and fully scan subsequent lower
+levels until a level that contains at least A*10 distinct records is found.
+Let's call this level LA.
+As an optimization the search is canceled once it reaches level 1 (it never
+descends to level 0, the leaf level) and also if the next level to be scanned
+would contain more than A pages. The latter is because the user has asked
+to analyze A leaf pages and it does not make sense to scan much more than
+A non-leaf pages with the sole purpose of finding a good sample of A leaf
+pages.
+
+After finding the appropriate level LA with >A*10 distinct records (or fewer,
+in the exceptions described above), divide it into groups of equal records and
+pick A such groups. Then pick the last record from each group. For example,
+let the level be:
+
+index: 0,1,2,3,4,5,6,7,8,9,10
+record: 1,1,1,2,2,7,7,7,7,7,9
+
+There are 4 groups of distinct records and if A=2 random ones are selected,
+e.g. 1,1,1 and 7,7,7,7,7, then records with indexes 2 and 9 will be selected.
+
+After selecting A records as described above, dive below them to find A leaf
+pages and analyze them, finding the total number of distinct records. The
+dive to the leaf level is performed by selecting a non-boring record from
+each page and diving below it.
+
+This way, a total of A leaf pages are analyzed for the given n-prefix.
+
+Let the number of different key values found in each leaf page i be Pi (i=1..A).
+Let N_DIFF_AVG_LEAF be (P1 + P2 + ... + PA) / A.
+Let the number of different key values on level LA be N_DIFF_LA.
+Let the total number of records on level LA be TOTAL_LA.
+Let R be N_DIFF_LA / TOTAL_LA, we assume this ratio is the same on the
+leaf level.
+Let the number of leaf pages be N.
+Then the total number of different key values on the leaf level is:
+N * R * N_DIFF_AVG_LEAF.
+See REF01 for the implementation.
+
+The above describes how to calculate the cardinality of an index.
+This algorithm is executed for each n-prefix of a multi-column index
+where n=1..n_uniq.
+@} */
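+
+/* A worked instance of the formula above (illustrative numbers only):
+suppose level LA holds TOTAL_LA = 1000 records of which N_DIFF_LA = 100
+are distinct, so R = 0.1. If the index has N = 50000 leaf pages and the
+A sampled leaf pages average N_DIFF_AVG_LEAF = 8 distinct keys each, the
+estimated number of distinct keys on the leaf level is
+N * R * N_DIFF_AVG_LEAF = 50000 * 0.1 * 8 = 40000. */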
+
+/* names of the tables from the persistent statistics storage */
+#define TABLE_STATS_NAME_PRINT "mysql.innodb_table_stats"
+#define INDEX_STATS_NAME_PRINT "mysql.innodb_index_stats"
+
+#ifdef UNIV_STATS_DEBUG
+#define DEBUG_PRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__)
+#else /* UNIV_STATS_DEBUG */
+#define DEBUG_PRINTF(fmt, ...) /* noop */
+#endif /* UNIV_STATS_DEBUG */
+
+/* Gets the number of leaf pages to sample in persistent stats estimation */
+#define N_SAMPLE_PAGES(index) \
+ static_cast<ib_uint64_t>( \
+ (index)->table->stats_sample_pages != 0 \
+ ? (index)->table->stats_sample_pages \
+ : srv_stats_persistent_sample_pages)
+
+/* number of distinct records on a given level that are required to stop
+descending to lower levels and fetch N_SAMPLE_PAGES(index) records
+from that level */
+#define N_DIFF_REQUIRED(index) (N_SAMPLE_PAGES(index) * 10)
+
+/* A dynamic array where we store the boundaries of each distinct group
+of keys. For example if a btree level is:
+index: 0,1,2,3,4,5,6,7,8,9,10,11,12
+data: b,b,b,b,b,b,g,g,j,j,j, x, y
+then we would store 5,7,10,11,12 in the array. */
+typedef std::vector<ib_uint64_t, ut_allocator<ib_uint64_t> > boundaries_t;
+
+/** Allocator type used for index_map_t. */
+typedef ut_allocator<std::pair<const char* const, dict_index_t*> >
+ index_map_t_allocator;
+
+/** Auxiliary map used for sorting indexes by name in dict_stats_save(). */
+typedef std::map<const char*, dict_index_t*, ut_strcmp_functor,
+ index_map_t_allocator> index_map_t;
+
+/*********************************************************************//**
+Checks whether an index should be ignored in stats manipulations:
+* stats fetch
+* stats recalc
+* stats save
+@return true if the index should be ignored */
+UNIV_INLINE
+bool
+dict_stats_should_ignore_index(
+/*===========================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ return((index->type & (DICT_FTS | DICT_SPATIAL))
+ || index->is_corrupted()
+ || index->to_be_dropped
+ || !index->is_committed());
+}
+
+/*********************************************************************//**
+Checks whether the persistent statistics storage exists and that all
+tables have the proper structure.
+@return true if exists and all tables are ok */
+static
+bool
+dict_stats_persistent_storage_check(
+/*================================*/
+ bool caller_has_dict_sys_mutex) /*!< in: true if the caller
+ owns dict_sys.mutex */
+{
+ /* definition for the table TABLE_STATS_NAME */
+ dict_col_meta_t table_stats_columns[] = {
+ {"database_name", DATA_VARMYSQL,
+ DATA_NOT_NULL, 192},
+
+ {"table_name", DATA_VARMYSQL,
+ DATA_NOT_NULL, 597},
+
+ {"last_update", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED, 4},
+
+ {"n_rows", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+ {"clustered_index_size", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+ {"sum_of_other_index_sizes", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED, 8}
+ };
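+	/* Note (explanatory, not in the original): the lengths above are
+	byte lengths allowing up to 3 bytes per character for the VARMYSQL
+	columns, e.g. 192 = 64*3 and 597 = 199*3, matching the 64*3 and
+	1024*3 spellings used for INDEX_STATS_NAME below. */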
+ dict_table_schema_t table_stats_schema = {
+ TABLE_STATS_NAME,
+ UT_ARR_SIZE(table_stats_columns),
+ table_stats_columns,
+ 0 /* n_foreign */,
+ 0 /* n_referenced */
+ };
+
+ /* definition for the table INDEX_STATS_NAME */
+ dict_col_meta_t index_stats_columns[] = {
+ {"database_name", DATA_VARMYSQL,
+ DATA_NOT_NULL, 192},
+
+ {"table_name", DATA_VARMYSQL,
+ DATA_NOT_NULL, 597},
+
+ {"index_name", DATA_VARMYSQL,
+ DATA_NOT_NULL, 192},
+
+ {"last_update", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED, 4},
+
+ {"stat_name", DATA_VARMYSQL,
+ DATA_NOT_NULL, 64*3},
+
+ {"stat_value", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+ {"sample_size", DATA_INT,
+ DATA_UNSIGNED, 8},
+
+ {"stat_description", DATA_VARMYSQL,
+ DATA_NOT_NULL, 1024*3}
+ };
+ dict_table_schema_t index_stats_schema = {
+ INDEX_STATS_NAME,
+ UT_ARR_SIZE(index_stats_columns),
+ index_stats_columns,
+ 0 /* n_foreign */,
+ 0 /* n_referenced */
+ };
+
+ char errstr[512];
+ dberr_t ret;
+
+ if (!caller_has_dict_sys_mutex) {
+ mutex_enter(&dict_sys.mutex);
+ }
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ /* first check table_stats */
+ ret = dict_table_schema_check(&table_stats_schema, errstr,
+ sizeof(errstr));
+ if (ret == DB_SUCCESS) {
+ /* if it is ok, then check index_stats */
+ ret = dict_table_schema_check(&index_stats_schema, errstr,
+ sizeof(errstr));
+ }
+
+ if (!caller_has_dict_sys_mutex) {
+ mutex_exit(&dict_sys.mutex);
+ }
+
+ if (ret != DB_SUCCESS && ret != DB_STATS_DO_NOT_EXIST) {
+ ib::error() << errstr;
+ return(false);
+ } else if (ret == DB_STATS_DO_NOT_EXIST) {
+ return false;
+ }
+ /* else */
+
+ return(true);
+}
+
+/** Executes a given SQL statement using the InnoDB internal SQL parser.
+This function will free the pinfo object.
+@param[in,out]	pinfo	pinfo to pass to que_eval_sql(); must already
+have any literals bound to it
+@param[in] sql SQL string to execute
+@param[in,out]	trx	if NULL, the function will allocate and
+free the trx object; if not NULL, the trx will be rolled back
+only in case of error, but not freed.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+dict_stats_exec_sql(
+ pars_info_t* pinfo,
+ const char* sql,
+ trx_t* trx)
+{
+ dberr_t err;
+ bool trx_started = false;
+
+ ut_d(dict_sys.assert_locked());
+
+ if (!dict_stats_persistent_storage_check(true)) {
+ pars_info_free(pinfo);
+ return(DB_STATS_DO_NOT_EXIST);
+ }
+
+ if (trx == NULL) {
+ trx = trx_create();
+ trx_started = true;
+
+ if (srv_read_only_mode) {
+ trx_start_internal_read_only(trx);
+ } else {
+ trx_start_internal(trx);
+ }
+ }
+
+ err = que_eval_sql(pinfo, sql, FALSE, trx); /* pinfo is freed here */
+
+ DBUG_EXECUTE_IF("stats_index_error",
+ if (!trx_started) {
+ err = DB_STATS_DO_NOT_EXIST;
+ trx->error_state = DB_STATS_DO_NOT_EXIST;
+ });
+
+ if (!trx_started && err == DB_SUCCESS) {
+ return(DB_SUCCESS);
+ }
+
+ if (err == DB_SUCCESS) {
+ trx_commit_for_mysql(trx);
+ } else {
+ trx->op_info = "rollback of internal trx on stats tables";
+ trx->dict_operation_lock_mode = RW_X_LATCH;
+ trx->rollback();
+ trx->dict_operation_lock_mode = 0;
+ trx->op_info = "";
+ ut_a(trx->error_state == DB_SUCCESS);
+ }
+
+ if (trx_started) {
+ trx->free();
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Duplicate a table object and its indexes.
+This function creates a dummy dict_table_t object and initializes the
+following table and index members:
+dict_table_t::id (copied)
+dict_table_t::heap (newly created)
+dict_table_t::name (copied)
+dict_table_t::corrupted (copied)
+dict_table_t::indexes<> (newly created)
+dict_table_t::magic_n
+for each entry in dict_table_t::indexes, the following are initialized:
+(indexes that have DICT_FTS set in index->type are skipped)
+dict_index_t::id (copied)
+dict_index_t::name (copied)
+dict_index_t::table_name (points to the copied table name)
+dict_index_t::table (points to the above semi-initialized object)
+dict_index_t::type (copied)
+dict_index_t::to_be_dropped (copied)
+dict_index_t::online_status (copied)
+dict_index_t::n_uniq (copied)
+dict_index_t::fields[] (newly created, only first n_uniq, only fields[i].name)
+dict_index_t::indexes<> (newly created)
+dict_index_t::stat_n_diff_key_vals[] (only allocated, left uninitialized)
+dict_index_t::stat_n_sample_sizes[] (only allocated, left uninitialized)
+dict_index_t::stat_n_non_null_key_vals[] (only allocated, left uninitialized)
+dict_index_t::magic_n
+The returned object should be freed with dict_stats_table_clone_free()
+when no longer needed.
+@return incomplete table object */
+static
+dict_table_t*
+dict_stats_table_clone_create(
+/*==========================*/
+ const dict_table_t* table) /*!< in: table whose stats to copy */
+{
+ size_t heap_size;
+ dict_index_t* index;
+
+ /* Estimate the size needed for the table and all of its indexes */
+
+ heap_size = 0;
+ heap_size += sizeof(dict_table_t);
+ heap_size += strlen(table->name.m_name) + 1;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (dict_stats_should_ignore_index(index)) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ ulint n_uniq = dict_index_get_n_unique(index);
+
+ heap_size += sizeof(dict_index_t);
+ heap_size += strlen(index->name) + 1;
+ heap_size += n_uniq * sizeof(index->fields[0]);
+ for (ulint i = 0; i < n_uniq; i++) {
+ heap_size += strlen(index->fields[i].name) + 1;
+ }
+ heap_size += n_uniq * sizeof(index->stat_n_diff_key_vals[0]);
+ heap_size += n_uniq * sizeof(index->stat_n_sample_sizes[0]);
+ heap_size += n_uniq * sizeof(index->stat_n_non_null_key_vals[0]);
+ }
+
+ /* Allocate the memory and copy the members */
+
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(heap_size);
+
+ dict_table_t* t;
+
+ t = (dict_table_t*) mem_heap_alloc(heap, sizeof(*t));
+
+ MEM_CHECK_DEFINED(&table->id, sizeof(table->id));
+ t->id = table->id;
+
+ t->heap = heap;
+
+ t->name.m_name = mem_heap_strdup(heap, table->name.m_name);
+
+ t->corrupted = table->corrupted;
+
+ UT_LIST_INIT(t->indexes, &dict_index_t::indexes);
+#ifdef BTR_CUR_HASH_ADAPT
+ UT_LIST_INIT(t->freed_indexes, &dict_index_t::indexes);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (dict_stats_should_ignore_index(index)) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ dict_index_t* idx;
+
+ idx = (dict_index_t*) mem_heap_alloc(heap, sizeof(*idx));
+
+ MEM_CHECK_DEFINED(&index->id, sizeof(index->id));
+ idx->id = index->id;
+
+ idx->name = mem_heap_strdup(heap, index->name);
+
+ idx->table = t;
+
+ idx->type = index->type;
+
+ idx->to_be_dropped = 0;
+
+ idx->online_status = ONLINE_INDEX_COMPLETE;
+ idx->set_committed(true);
+
+ idx->n_uniq = index->n_uniq;
+
+ idx->fields = (dict_field_t*) mem_heap_alloc(
+ heap, idx->n_uniq * sizeof(idx->fields[0]));
+
+ for (ulint i = 0; i < idx->n_uniq; i++) {
+ idx->fields[i].name = mem_heap_strdup(
+ heap, index->fields[i].name);
+ }
+
+ /* hook idx into t->indexes */
+ UT_LIST_ADD_LAST(t->indexes, idx);
+
+ idx->stat_n_diff_key_vals = (ib_uint64_t*) mem_heap_alloc(
+ heap,
+ idx->n_uniq * sizeof(idx->stat_n_diff_key_vals[0]));
+
+ idx->stat_n_sample_sizes = (ib_uint64_t*) mem_heap_alloc(
+ heap,
+ idx->n_uniq * sizeof(idx->stat_n_sample_sizes[0]));
+
+ idx->stat_n_non_null_key_vals = (ib_uint64_t*) mem_heap_alloc(
+ heap,
+ idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0]));
+ ut_d(idx->magic_n = DICT_INDEX_MAGIC_N);
+
+ idx->stat_defrag_n_page_split = 0;
+ idx->stat_defrag_n_pages_freed = 0;
+ }
+
+ ut_d(t->magic_n = DICT_TABLE_MAGIC_N);
+
+ return(t);
+}
+
+/*********************************************************************//**
+Free the resources occupied by an object returned by
+dict_stats_table_clone_create(). */
+static
+void
+dict_stats_table_clone_free(
+/*========================*/
+ dict_table_t* t) /*!< in: dummy table object to free */
+{
+ mem_heap_free(t->heap);
+}
+
+/*********************************************************************//**
+Write all zeros (or 1 where it makes sense) into an index's
+statistics members. The resulting stats correspond to an empty index. */
+static
+void
+dict_stats_empty_index(
+/*===================*/
+ dict_index_t* index, /*!< in/out: index */
+ bool empty_defrag_stats)
+ /*!< in: whether to empty defrag stats */
+{
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ ulint n_uniq = index->n_uniq;
+
+ for (ulint i = 0; i < n_uniq; i++) {
+ index->stat_n_diff_key_vals[i] = 0;
+ index->stat_n_sample_sizes[i] = 1;
+ index->stat_n_non_null_key_vals[i] = 0;
+ }
+
+ index->stat_index_size = 1;
+ index->stat_n_leaf_pages = 1;
+
+ if (empty_defrag_stats) {
+ dict_stats_empty_defrag_stats(index);
+ dict_stats_empty_defrag_summary(index);
+ }
+}
+
+/*********************************************************************//**
+Write all zeros (or 1 where it makes sense) into a table and its indexes'
+statistics members. The resulting stats correspond to an empty table. */
+static
+void
+dict_stats_empty_table(
+/*===================*/
+ dict_table_t* table, /*!< in/out: table */
+ bool empty_defrag_stats)
+ /*!< in: whether to empty defrag stats */
+{
+ mutex_enter(&dict_sys.mutex);
+
+ /* Zero the stats members */
+ table->stat_n_rows = 0;
+ table->stat_clustered_index_size = 1;
+ /* 1 page for each index, not counting the clustered */
+ table->stat_sum_of_other_index_sizes
+ = UT_LIST_GET_LEN(table->indexes) - 1;
+ table->stat_modified_counter = 0;
+
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ dict_stats_empty_index(index, empty_defrag_stats);
+ }
+
+ table->stat_initialized = TRUE;
+ mutex_exit(&dict_sys.mutex);
+}
+
+/*********************************************************************//**
+Check whether index's stats are initialized (assert if they are not). */
+static
+void
+dict_stats_assert_initialized_index(
+/*================================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ MEM_CHECK_DEFINED(
+ index->stat_n_diff_key_vals,
+ index->n_uniq * sizeof(index->stat_n_diff_key_vals[0]));
+
+ MEM_CHECK_DEFINED(
+ index->stat_n_sample_sizes,
+ index->n_uniq * sizeof(index->stat_n_sample_sizes[0]));
+
+ MEM_CHECK_DEFINED(
+ index->stat_n_non_null_key_vals,
+ index->n_uniq * sizeof(index->stat_n_non_null_key_vals[0]));
+
+ MEM_CHECK_DEFINED(
+ &index->stat_index_size,
+ sizeof(index->stat_index_size));
+
+ MEM_CHECK_DEFINED(
+ &index->stat_n_leaf_pages,
+ sizeof(index->stat_n_leaf_pages));
+}
+
+/*********************************************************************//**
+Check whether table's stats are initialized (assert if they are not). */
+static
+void
+dict_stats_assert_initialized(
+/*==========================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_a(table->stat_initialized);
+
+ MEM_CHECK_DEFINED(&table->stats_last_recalc,
+ sizeof table->stats_last_recalc);
+
+ MEM_CHECK_DEFINED(&table->stat_persistent,
+ sizeof table->stat_persistent);
+
+ MEM_CHECK_DEFINED(&table->stats_auto_recalc,
+ sizeof table->stats_auto_recalc);
+
+ MEM_CHECK_DEFINED(&table->stats_sample_pages,
+ sizeof table->stats_sample_pages);
+
+ MEM_CHECK_DEFINED(&table->stat_n_rows,
+ sizeof table->stat_n_rows);
+
+ MEM_CHECK_DEFINED(&table->stat_clustered_index_size,
+ sizeof table->stat_clustered_index_size);
+
+ MEM_CHECK_DEFINED(&table->stat_sum_of_other_index_sizes,
+ sizeof table->stat_sum_of_other_index_sizes);
+
+ MEM_CHECK_DEFINED(&table->stat_modified_counter,
+ sizeof table->stat_modified_counter);
+
+ MEM_CHECK_DEFINED(&table->stats_bg_flag,
+ sizeof table->stats_bg_flag);
+
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (!dict_stats_should_ignore_index(index)) {
+ dict_stats_assert_initialized_index(index);
+ }
+ }
+}
+
+#define INDEX_EQ(i1, i2) \
+ ((i1) != NULL \
+ && (i2) != NULL \
+ && (i1)->id == (i2)->id \
+ && strcmp((i1)->name, (i2)->name) == 0)
+
+/*********************************************************************//**
+Copy table and index statistics from one table to another.
+Extra indexes in src are ignored and extra indexes in dst are
+initialized to correspond to an empty index. */
+static
+void
+dict_stats_copy(
+/*============*/
+ dict_table_t* dst, /*!< in/out: destination table */
+ const dict_table_t* src, /*!< in: source table */
+ bool reset_ignored_indexes) /*!< in: if true, set ignored indexes
+ to have the same statistics as if
+ the table was empty */
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ dst->stats_last_recalc = src->stats_last_recalc;
+ dst->stat_n_rows = src->stat_n_rows;
+ dst->stat_clustered_index_size = src->stat_clustered_index_size;
+ dst->stat_sum_of_other_index_sizes = src->stat_sum_of_other_index_sizes;
+ dst->stat_modified_counter = src->stat_modified_counter;
+
+ dict_index_t* dst_idx;
+ dict_index_t* src_idx;
+
+ for (dst_idx = dict_table_get_first_index(dst),
+ src_idx = dict_table_get_first_index(src);
+ dst_idx != NULL;
+ dst_idx = dict_table_get_next_index(dst_idx),
+ (src_idx != NULL
+ && (src_idx = dict_table_get_next_index(src_idx)))) {
+
+ if (dict_stats_should_ignore_index(dst_idx)) {
+ if (reset_ignored_indexes) {
+ /* Reset index statistics for all ignored indexes,
+ unless they are FT indexes (these have no statistics)*/
+ if (dst_idx->type & DICT_FTS) {
+ continue;
+ }
+ dict_stats_empty_index(dst_idx, true);
+ } else {
+ continue;
+ }
+ }
+
+ ut_ad(!dict_index_is_ibuf(dst_idx));
+
+ if (!INDEX_EQ(src_idx, dst_idx)) {
+ for (src_idx = dict_table_get_first_index(src);
+ src_idx != NULL;
+ src_idx = dict_table_get_next_index(src_idx)) {
+
+ if (INDEX_EQ(src_idx, dst_idx)) {
+ break;
+ }
+ }
+ }
+
+ if (!INDEX_EQ(src_idx, dst_idx)) {
+ dict_stats_empty_index(dst_idx, true);
+ continue;
+ }
+
+ ulint n_copy_el;
+
+ if (dst_idx->n_uniq > src_idx->n_uniq) {
+ n_copy_el = src_idx->n_uniq;
+ /* Since src is smaller some elements in dst
+ will remain untouched by the following memmove(),
+ thus we init all of them here. */
+ dict_stats_empty_index(dst_idx, true);
+ } else {
+ n_copy_el = dst_idx->n_uniq;
+ }
+
+ memmove(dst_idx->stat_n_diff_key_vals,
+ src_idx->stat_n_diff_key_vals,
+ n_copy_el * sizeof(dst_idx->stat_n_diff_key_vals[0]));
+
+ memmove(dst_idx->stat_n_sample_sizes,
+ src_idx->stat_n_sample_sizes,
+ n_copy_el * sizeof(dst_idx->stat_n_sample_sizes[0]));
+
+ memmove(dst_idx->stat_n_non_null_key_vals,
+ src_idx->stat_n_non_null_key_vals,
+ n_copy_el * sizeof(dst_idx->stat_n_non_null_key_vals[0]));
+
+ dst_idx->stat_index_size = src_idx->stat_index_size;
+
+ dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages;
+
+ dst_idx->stat_defrag_modified_counter =
+ src_idx->stat_defrag_modified_counter;
+ dst_idx->stat_defrag_n_pages_freed =
+ src_idx->stat_defrag_n_pages_freed;
+ dst_idx->stat_defrag_n_page_split =
+ src_idx->stat_defrag_n_page_split;
+ }
+
+ dst->stat_initialized = TRUE;
+}
+
+/** Duplicate the stats of a table and its indexes.
+This function creates a dummy dict_table_t object and copies the input
+table's stats into it. The returned table object is not in the dictionary
+cache and cannot be accessed by any other threads. In addition to the
+members copied in dict_stats_table_clone_create() this function initializes
+the following:
+dict_table_t::stat_initialized
+dict_table_t::stat_persistent
+dict_table_t::stat_n_rows
+dict_table_t::stat_clustered_index_size
+dict_table_t::stat_sum_of_other_index_sizes
+dict_table_t::stat_modified_counter
+dict_index_t::stat_n_diff_key_vals[]
+dict_index_t::stat_n_sample_sizes[]
+dict_index_t::stat_n_non_null_key_vals[]
+dict_index_t::stat_index_size
+dict_index_t::stat_n_leaf_pages
+dict_index_t::stat_defrag_modified_counter
+dict_index_t::stat_defrag_n_pages_freed
+dict_index_t::stat_defrag_n_page_split
+The returned object should be freed with dict_stats_snapshot_free()
+when no longer needed.
+@param[in] table table whose stats to copy
+@return incomplete table object */
+static
+dict_table_t*
+dict_stats_snapshot_create(
+ dict_table_t* table)
+{
+ mutex_enter(&dict_sys.mutex);
+
+ dict_stats_assert_initialized(table);
+
+ dict_table_t* t;
+
+ t = dict_stats_table_clone_create(table);
+
+ dict_stats_copy(t, table, false);
+
+ t->stat_persistent = table->stat_persistent;
+ t->stats_auto_recalc = table->stats_auto_recalc;
+ t->stats_sample_pages = table->stats_sample_pages;
+ t->stats_bg_flag = table->stats_bg_flag;
+
+ mutex_exit(&dict_sys.mutex);
+
+ return(t);
+}
+
+/*********************************************************************//**
+Free the resources occupied by an object returned by
+dict_stats_snapshot_create(). */
+static
+void
+dict_stats_snapshot_free(
+/*=====================*/
+ dict_table_t* t) /*!< in: dummy table object to free */
+{
+ dict_stats_table_clone_free(t);
+}
+
+/*********************************************************************//**
+Calculates new estimates for index statistics. This function is
+relatively quick and is used to calculate transient statistics that
+are not saved on disk. This was the only way to calculate statistics
+before the Persistent Statistics feature was introduced.
+This function does not update the defragmentation-related stats;
+only persistent statistics support defragmentation stats. */
+static
+void
+dict_stats_update_transient_for_index(
+/*==================================*/
+ dict_index_t* index) /*!< in/out: index */
+{
+ if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+ && (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO
+ || !dict_index_is_clust(index))) {
+ /* If we have set a high innodb_force_recovery
+ level, do not calculate statistics, as a badly
+ corrupted index can cause a crash in it.
+ Initialize some bogus index cardinality
+ statistics, so that the data can be queried in
+ various means, also via secondary indexes. */
+ mutex_enter(&dict_sys.mutex);
+ dict_stats_empty_index(index, false);
+ mutex_exit(&dict_sys.mutex);
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ } else if (ibuf_debug && !dict_index_is_clust(index)) {
+ mutex_enter(&dict_sys.mutex);
+ dict_stats_empty_index(index, false);
+ mutex_exit(&dict_sys.mutex);
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+ } else {
+ mtr_t mtr;
+ ulint size;
+
+ mtr.start();
+ mtr_s_lock_index(index, &mtr);
+ size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
+
+ if (size != ULINT_UNDEFINED) {
+ index->stat_index_size = size;
+
+ size = btr_get_size(
+ index, BTR_N_LEAF_PAGES, &mtr);
+ }
+
+ mtr.commit();
+
+ switch (size) {
+ case ULINT_UNDEFINED:
+ mutex_enter(&dict_sys.mutex);
+ dict_stats_empty_index(index, false);
+ mutex_exit(&dict_sys.mutex);
+ return;
+ case 0:
+ /* The root node of the tree is a leaf */
+ size = 1;
+ }
+
+ index->stat_n_leaf_pages = size;
+
+ /* Do not continue if table decryption has failed or
+ table is already marked as corrupted. */
+ if (index->is_readable()) {
+ std::vector<index_field_stats_t> stats
+ = btr_estimate_number_of_different_key_vals(
+ index);
+
+ if (!stats.empty()) {
+ ut_ad(!mutex_own(&dict_sys.mutex));
+ mutex_enter(&dict_sys.mutex);
+ for (size_t i = 0; i < stats.size(); ++i) {
+ index->stat_n_diff_key_vals[i]
+ = stats[i].n_diff_key_vals;
+ index->stat_n_sample_sizes[i]
+ = stats[i].n_sample_sizes;
+ index->stat_n_non_null_key_vals[i]
+ = stats[i].n_non_null_key_vals;
+ }
+ mutex_exit(&dict_sys.mutex);
+ }
+ }
+ }
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. This function
+is relatively quick and is used to calculate transient statistics that
+are not saved on disk.
+This was the only way to calculate statistics before the
+Persistent Statistics feature was introduced. */
+static
+void
+dict_stats_update_transient(
+/*========================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ ut_ad(!mutex_own(&dict_sys.mutex));
+
+ dict_index_t* index;
+ ulint sum_of_index_sizes = 0;
+
+ /* Find out the sizes of the indexes and how many different values
+ for the key they approximately have */
+
+ index = dict_table_get_first_index(table);
+
+ if (!table->space) {
+ /* Nothing to do. */
+ dict_stats_empty_table(table, true);
+ return;
+ } else if (index == NULL) {
+ /* Table definition is corrupt */
+
+ ib::warn() << "Table " << table->name
+ << " has no indexes. Cannot calculate statistics.";
+ dict_stats_empty_table(table, true);
+ return;
+ }
+
+ for (; index != NULL; index = dict_table_get_next_index(index)) {
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ if (index->type & (DICT_FTS | DICT_SPATIAL)) {
+ continue;
+ }
+
+ if (dict_stats_should_ignore_index(index)
+ || !index->is_readable()) {
+ mutex_enter(&dict_sys.mutex);
+ dict_stats_empty_index(index, false);
+ mutex_exit(&dict_sys.mutex);
+ continue;
+ }
+
+ dict_stats_update_transient_for_index(index);
+
+ sum_of_index_sizes += index->stat_index_size;
+ }
+
+ mutex_enter(&dict_sys.mutex);
+
+ index = dict_table_get_first_index(table);
+
+ table->stat_n_rows = index->stat_n_diff_key_vals[
+ dict_index_get_n_unique(index) - 1];
+
+ table->stat_clustered_index_size = index->stat_index_size;
+
+ table->stat_sum_of_other_index_sizes = sum_of_index_sizes
+ - index->stat_index_size;
+
+ table->stats_last_recalc = time(NULL);
+
+ table->stat_modified_counter = 0;
+
+ table->stat_initialized = TRUE;
+
+ mutex_exit(&dict_sys.mutex);
+}
+
+/* @{ Pseudo code about the relation between the following functions
+
+let N = N_SAMPLE_PAGES(index)
+
+dict_stats_analyze_index()
+ for each n_prefix
+ search for good enough level:
+ dict_stats_analyze_index_level() // only called if level has <= N pages
+ // full scan of the level in one mtr
+ collect statistics about the given level
+ if we are not satisfied with the level, search next lower level
+ we have found a good enough level here
+ dict_stats_analyze_index_for_n_prefix(that level, stats collected above)
+ // full scan of the level in one mtr
+ dive below some records and analyze the leaf page there:
+ dict_stats_analyze_index_below_cur()
+@} */
+
+/*********************************************************************//**
+Find the total number and the number of distinct keys on a given level in
+an index. Each of the 1..n_uniq prefixes is looked up and the results are
+saved in the array n_diff[0] .. n_diff[n_uniq - 1]. The total number of
+records on the level is saved in total_recs.
+Also, the index of the last record in each group of equal records is saved
+in n_diff_boundaries[0..n_uniq - 1]. Record indexing starts from the leftmost
+record on the level and continues across page boundaries, counting from 0.
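+For example: if an index with n_uniq=2 has the records (1,1), (1,2), (2,2)
+on the scanned level, then afterwards n_diff[0] == 2, n_diff[1] == 3,
+*total_recs == 3 and, if boundaries were requested,
+n_diff_boundaries[0] == {1, 2} and n_diff_boundaries[1] == {0, 1, 2}. */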
+static
+void
+dict_stats_analyze_index_level(
+/*===========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level */
+ ib_uint64_t* n_diff, /*!< out: array for number of
+ distinct keys for all prefixes */
+ ib_uint64_t* total_recs, /*!< out: total number of records */
+ ib_uint64_t* total_pages, /*!< out: total number of pages */
+ boundaries_t* n_diff_boundaries,/*!< out: boundaries of the groups
+ of distinct keys */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint n_uniq;
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ const page_t* page;
+ const rec_t* rec;
+ const rec_t* prev_rec;
+ bool prev_rec_is_copied;
+ byte* prev_rec_buf = NULL;
+ ulint prev_rec_buf_size = 0;
+ rec_offs* rec_offsets;
+ rec_offs* prev_rec_offsets;
+ ulint i;
+
+ DEBUG_PRINTF(" %s(table=%s, index=%s, level=" ULINTPF ")\n",
+ __func__, index->table->name, index->name, level);
+
+ ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK));
+
+ n_uniq = dict_index_get_n_unique(index);
+
+ /* elements in the n_diff array are 0..n_uniq-1 (inclusive) */
+ memset(n_diff, 0x0, n_uniq * sizeof(n_diff[0]));
+
+ /* Allocate space for the offsets header (the allocation size at
+ offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_uniq + 1,
+ so that this will never be less than the size calculated in
+ rec_get_offsets_func(). */
+ i = (REC_OFFS_HEADER_SIZE + 1 + 1) + n_uniq;
+
+ heap = mem_heap_create((2 * sizeof *rec_offsets) * i);
+ rec_offsets = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, i * sizeof *rec_offsets));
+ prev_rec_offsets = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, i * sizeof *prev_rec_offsets));
+ rec_offs_set_n_alloc(rec_offsets, i);
+ rec_offs_set_n_alloc(prev_rec_offsets, i);
+
+ /* reset the dynamic arrays n_diff_boundaries[0..n_uniq-1] */
+ if (n_diff_boundaries != NULL) {
+ for (i = 0; i < n_uniq; i++) {
+ n_diff_boundaries[i].erase(
+ n_diff_boundaries[i].begin(),
+ n_diff_boundaries[i].end());
+ }
+ }
+
+ /* Position pcur on the leftmost record on the leftmost page
+ on the desired level. */
+
+ btr_pcur_open_at_index_side(
+ true, index, BTR_SEARCH_TREE_ALREADY_S_LATCHED,
+ &pcur, true, level, mtr);
+ btr_pcur_move_to_next_on_page(&pcur);
+
+ page = btr_pcur_get_page(&pcur);
+
+ /* The page must not be empty, except when
+ it is the root page (and the whole index is empty). */
+ ut_ad(btr_pcur_is_on_user_rec(&pcur) || page_is_leaf(page));
+ ut_ad(btr_pcur_get_rec(&pcur)
+ == page_rec_get_next_const(page_get_infimum_rec(page)));
+
+ /* check that we are indeed on the desired level */
+ ut_a(btr_page_get_level(page) == level);
+
+ /* there should not be any pages on the left */
+ ut_a(!page_has_prev(page));
+
+ if (REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ btr_pcur_get_rec(&pcur), page_is_comp(page))) {
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+ if (level == 0) {
+ /* Skip the metadata pseudo-record */
+ ut_ad(index->is_instant());
+ btr_pcur_move_to_next_user_rec(&pcur, mtr);
+ }
+ } else {
+ /* The first record on the leftmost page must be
+ marked as such on each level except the leaf level. */
+ ut_a(level == 0);
+ }
+
+ prev_rec = NULL;
+ prev_rec_is_copied = false;
+
+ /* no records by default */
+ *total_recs = 0;
+
+ *total_pages = 0;
+
+ /* iterate over all user records on this level
+	and compare each pair of adjacent records, even the last on page
+	X and the first on page X+1 */
+ for (;
+ btr_pcur_is_on_user_rec(&pcur);
+ btr_pcur_move_to_next_user_rec(&pcur, mtr)) {
+
+ bool rec_is_last_on_page;
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ /* If rec and prev_rec are on different pages, then prev_rec
+ must have been copied, because we hold latch only on the page
+ where rec resides. */
+ if (prev_rec != NULL
+ && page_align(rec) != page_align(prev_rec)) {
+
+ ut_a(prev_rec_is_copied);
+ }
+
+ rec_is_last_on_page =
+ page_rec_is_supremum(page_rec_get_next_const(rec));
+
+ /* increment the pages counter at the end of each page */
+ if (rec_is_last_on_page) {
+
+ (*total_pages)++;
+ }
+
+ /* Skip delete-marked records on the leaf level. If we
+ do not skip them, then ANALYZE quickly after DELETE
+ could count them or not (purge may have already wiped
+ them away) which brings non-determinism. We skip only
+ leaf-level delete marks because delete marks on
+ non-leaf level do not make sense. */
+
+ if (level == 0
+ && !srv_stats_include_delete_marked
+ && rec_get_deleted_flag(
+ rec,
+ page_is_comp(btr_pcur_get_page(&pcur)))) {
+
+ if (rec_is_last_on_page
+ && !prev_rec_is_copied
+ && prev_rec != NULL) {
+ /* copy prev_rec */
+
+ prev_rec_offsets = rec_get_offsets(
+ prev_rec, index, prev_rec_offsets,
+ index->n_core_fields,
+ n_uniq, &heap);
+
+ prev_rec = rec_copy_prefix_to_buf(
+ prev_rec, index, n_uniq,
+ &prev_rec_buf, &prev_rec_buf_size);
+
+ prev_rec_is_copied = true;
+ }
+
+ continue;
+ }
+ rec_offsets = rec_get_offsets(rec, index, rec_offsets,
+ level ? 0 : index->n_core_fields,
+ n_uniq, &heap);
+
+ (*total_recs)++;
+
+ if (prev_rec != NULL) {
+ ulint matched_fields;
+
+ prev_rec_offsets = rec_get_offsets(
+ prev_rec, index, prev_rec_offsets,
+ level ? 0 : index->n_core_fields,
+ n_uniq, &heap);
+
+ cmp_rec_rec(prev_rec, rec,
+ prev_rec_offsets, rec_offsets, index,
+ false, &matched_fields);
+
+ for (i = matched_fields; i < n_uniq; i++) {
+
+ if (n_diff_boundaries != NULL) {
+ /* push the index of the previous
+ record, that is - the last one from
+ a group of equal keys */
+
+ ib_uint64_t idx;
+
+ /* the index of the current record
+ is total_recs - 1, the index of the
+ previous record is total_recs - 2;
+ we know that idx is not going to
+ become negative here because if we
+ are in this branch then there is a
+ previous record and thus
+ total_recs >= 2 */
+ idx = *total_recs - 2;
+
+ n_diff_boundaries[i].push_back(idx);
+ }
+
+ /* increment the number of different keys
+ for n_prefix=i+1 (e.g. if i=0 then we increment
+ for n_prefix=1 which is stored in n_diff[0]) */
+ n_diff[i]++;
+ }
+ } else {
+ /* this is the first non-delete marked record */
+ for (i = 0; i < n_uniq; i++) {
+ n_diff[i] = 1;
+ }
+ }
+
+ if (rec_is_last_on_page) {
+ /* end of a page has been reached */
+
+ /* we need to copy the record instead of assigning
+ like prev_rec = rec; because when we traverse the
+ records on this level at some point we will jump from
+ one page to the next and then rec and prev_rec will
+ be on different pages and
+ btr_pcur_move_to_next_user_rec() will release the
+ latch on the page that prev_rec is on */
+ prev_rec = rec_copy_prefix_to_buf(
+ rec, index, n_uniq,
+ &prev_rec_buf, &prev_rec_buf_size);
+ prev_rec_is_copied = true;
+
+ } else {
+ /* still on the same page, the next call to
+ btr_pcur_move_to_next_user_rec() will not jump
+ on the next page, we can simply assign pointers
+ instead of copying the records like above */
+
+ prev_rec = rec;
+ prev_rec_is_copied = false;
+ }
+ }
+
+	/* if *total_pages is left untouched then either the above loop
+	was not entered at all and the whole tree consists of a single
+	empty page, or the loop was entered but this is level 0, which
+	contains one page whose records are all delete-marked */
+ if (*total_pages == 0) {
+
+ ut_ad(level == 0);
+ ut_ad(*total_recs == 0);
+
+ *total_pages = 1;
+ }
+
+ /* if there are records on this level and boundaries
+ should be saved */
+ if (*total_recs > 0 && n_diff_boundaries != NULL) {
+
+ /* remember the index of the last record on the level as the
+ last one from the last group of equal keys; this holds for
+ all possible prefixes */
+ for (i = 0; i < n_uniq; i++) {
+ ib_uint64_t idx;
+
+ idx = *total_recs - 1;
+
+ n_diff_boundaries[i].push_back(idx);
+ }
+ }
+
+ /* now in n_diff_boundaries[i] there are exactly n_diff[i] integers,
+ for i=0..n_uniq-1 */
+
+#ifdef UNIV_STATS_DEBUG
+ for (i = 0; i < n_uniq; i++) {
+
+ DEBUG_PRINTF(" %s(): total recs: " UINT64PF
+ ", total pages: " UINT64PF
+ ", n_diff[" ULINTPF "]: " UINT64PF "\n",
+ __func__, *total_recs,
+ *total_pages,
+ i, n_diff[i]);
+
+#if 0
+ if (n_diff_boundaries != NULL) {
+ ib_uint64_t j;
+
+ DEBUG_PRINTF(" %s(): boundaries[%lu]: ",
+ __func__, i);
+
+ for (j = 0; j < n_diff[i]; j++) {
+ ib_uint64_t idx;
+
+ idx = n_diff_boundaries[i][j];
+
+ DEBUG_PRINTF(UINT64PF "=" UINT64PF ", ",
+ j, idx);
+ }
+ DEBUG_PRINTF("\n");
+ }
+#endif
+ }
+#endif /* UNIV_STATS_DEBUG */
+
+	/* Release the latch on the last page, because that is not done
+	by btr_pcur_close(). This function also works for non-leaf
+	pages. */
+ btr_leaf_page_release(btr_pcur_get_block(&pcur), BTR_SEARCH_LEAF, mtr);
+
+ btr_pcur_close(&pcur);
+ ut_free(prev_rec_buf);
+ mem_heap_free(heap);
+}
+
+/** Scan a page, reading records from left to right and counting the number
+of distinct records (looking only at the first n_prefix
+columns) and the number of external pages pointed by records from this page.
+If n_core is 0, that is, a non-leaf page is being scanned, then the function
+will return as soon as it finds a record that does not match its neighbor
+to the right; in this quit-on-first-non-boring mode the returned n_diff can
+either be 0 (empty page), 1 (the whole page has all keys equal) or 2 (the
+function found a non-boring record and returned).
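+For example, when scanning a non-leaf page (n_core=0) whose records have
+5, 5, 7 as their first column and n_prefix=1, the function returns as soon
+as it compares the second record with the third: *n_diff == 2 and *out_rec
+points to the second record, whose right neighbor is the first mismatch.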
+@param[out] out_rec record, or NULL
+@param[out] offsets1 rec_get_offsets() working space (must
+be big enough)
+@param[out] offsets2 rec_get_offsets() working space (must
+be big enough)
+@param[in] index index of the page
+@param[in] page the page to scan
+@param[in] n_prefix look at the first n_prefix columns
+@param[in] n_core 0, or index->n_core_fields for leaf
+@param[out] n_diff number of distinct records encountered
+@param[out] n_external_pages if this is non-NULL then it will be set
+to the number of externally stored pages which were encountered
+@return offsets1 or offsets2 (the offsets of *out_rec),
+or NULL if the page is empty and does not contain user records. */
+UNIV_INLINE
+rec_offs*
+dict_stats_scan_page(
+ const rec_t** out_rec,
+ rec_offs* offsets1,
+ rec_offs* offsets2,
+ const dict_index_t* index,
+ const page_t* page,
+ ulint n_prefix,
+ ulint n_core,
+ ib_uint64_t* n_diff,
+ ib_uint64_t* n_external_pages)
+{
+ rec_offs* offsets_rec = offsets1;
+ rec_offs* offsets_next_rec = offsets2;
+ const rec_t* rec;
+ const rec_t* next_rec;
+ /* A dummy heap, to be passed to rec_get_offsets().
+ Because offsets1,offsets2 should be big enough,
+ this memory heap should never be used. */
+ mem_heap_t* heap = NULL;
+ ut_ad(!!n_core == page_is_leaf(page));
+ const rec_t* (*get_next)(const rec_t*)
+ = !n_core || srv_stats_include_delete_marked
+ ? page_rec_get_next_const
+ : page_rec_get_next_non_del_marked;
+
+ const bool should_count_external_pages = n_external_pages != NULL;
+
+ if (should_count_external_pages) {
+ *n_external_pages = 0;
+ }
+
+ rec = get_next(page_get_infimum_rec(page));
+
+ if (page_rec_is_supremum(rec)) {
+ /* the page is empty or contains only delete-marked records */
+ *n_diff = 0;
+ *out_rec = NULL;
+ return(NULL);
+ }
+
+ offsets_rec = rec_get_offsets(rec, index, offsets_rec, n_core,
+ ULINT_UNDEFINED, &heap);
+
+ if (should_count_external_pages) {
+ *n_external_pages += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ }
+
+ next_rec = get_next(rec);
+
+ *n_diff = 1;
+
+ while (!page_rec_is_supremum(next_rec)) {
+
+ ulint matched_fields;
+
+ offsets_next_rec = rec_get_offsets(next_rec, index,
+ offsets_next_rec, n_core,
+ ULINT_UNDEFINED,
+ &heap);
+
+ /* check whether rec != next_rec when looking at
+ the first n_prefix fields */
+ cmp_rec_rec(rec, next_rec, offsets_rec, offsets_next_rec,
+ index, false, &matched_fields);
+
+ if (matched_fields < n_prefix) {
+ /* rec != next_rec, => rec is non-boring */
+
+ (*n_diff)++;
+
+ if (!n_core) {
+ break;
+ }
+ }
+
+ rec = next_rec;
+ /* Assign offsets_rec = offsets_next_rec so that
+ offsets_rec matches with rec which was just assigned
+ rec = next_rec above. Also need to point
+ offsets_next_rec to the place where offsets_rec was
+ pointing before because we have just 2 placeholders
+ where data is actually stored: offsets1 and offsets2
+ and we are using them in circular fashion
+ (offsets[_next]_rec are just pointers to those
+ placeholders). */
+ std::swap(offsets_rec, offsets_next_rec);
+
+ if (should_count_external_pages) {
+ *n_external_pages += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ }
+
+ next_rec = get_next(next_rec);
+ }
+
+ /* offsets1,offsets2 should have been big enough */
+ ut_a(heap == NULL);
+ *out_rec = rec;
+ return(offsets_rec);
+}
+
+/** Dive below the current position of a cursor and calculate the number of
+distinct records on the leaf page, when looking at the first n_prefix
+columns. Also calculate the number of external pages pointed by records
+on the leaf page.
+@param[in] cur cursor
+@param[in] n_prefix look at the first n_prefix columns
+when comparing records
+@param[out]	n_diff		number of distinct records on the leaf page
+@param[out]	n_external_pages number of external pages pointed to by
+records on the leaf page */
+static
+void
+dict_stats_analyze_index_below_cur(
+ const btr_cur_t* cur,
+ ulint n_prefix,
+ ib_uint64_t* n_diff,
+ ib_uint64_t* n_external_pages)
+{
+ dict_index_t* index;
+ buf_block_t* block;
+ const page_t* page;
+ mem_heap_t* heap;
+ const rec_t* rec;
+ rec_offs* offsets1;
+ rec_offs* offsets2;
+ rec_offs* offsets_rec;
+ ulint size;
+ mtr_t mtr;
+
+ index = btr_cur_get_index(cur);
+
+ /* Allocate offsets for the record and the node pointer, for
+ node pointer records. In a secondary index, the node pointer
+ record will consist of all index fields followed by a child
+ page number.
+ Allocate space for the offsets header (the allocation size at
+ offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_fields + 1,
+ so that this will never be less than the size calculated in
+ rec_get_offsets_func(). */
+ size = (1 + REC_OFFS_HEADER_SIZE) + 1 + dict_index_get_n_fields(index);
+
+ heap = mem_heap_create(size * (sizeof *offsets1 + sizeof *offsets2));
+
+ offsets1 = static_cast<rec_offs*>(mem_heap_alloc(
+ heap, size * sizeof *offsets1));
+
+ offsets2 = static_cast<rec_offs*>(mem_heap_alloc(
+ heap, size * sizeof *offsets2));
+
+ rec_offs_set_n_alloc(offsets1, size);
+ rec_offs_set_n_alloc(offsets2, size);
+
+ rec = btr_cur_get_rec(cur);
+ page = page_align(rec);
+ ut_ad(!page_rec_is_leaf(rec));
+
+ offsets_rec = rec_get_offsets(rec, index, offsets1, 0,
+ ULINT_UNDEFINED, &heap);
+
+ page_id_t page_id(index->table->space_id,
+ btr_node_ptr_get_child_page_no(
+ rec, offsets_rec));
+ const ulint zip_size = index->table->space->zip_size();
+
+ /* assume no external pages by default - in case we quit from this
+ function without analyzing any leaf pages */
+ *n_external_pages = 0;
+
+ mtr_start(&mtr);
+
+ /* descend to the leaf level on the B-tree */
+ for (;;) {
+
+ dberr_t err = DB_SUCCESS;
+
+ block = buf_page_get_gen(page_id, zip_size,
+ RW_S_LATCH, NULL, BUF_GET,
+ __FILE__, __LINE__, &mtr, &err,
+ !index->is_clust()
+ && 1 == btr_page_get_level(page));
+
+ page = buf_block_get_frame(block);
+
+ if (page_is_leaf(page)) {
+ /* leaf level */
+ break;
+ }
+ /* else */
+
+ /* search for the first non-boring record on the page */
+ offsets_rec = dict_stats_scan_page(
+ &rec, offsets1, offsets2, index, page, n_prefix,
+ 0, n_diff, NULL);
+
+ /* pages on level > 0 are not allowed to be empty */
+ ut_a(offsets_rec != NULL);
+ /* if page is not empty (offsets_rec != NULL) then n_diff must
+ be > 0, otherwise there is a bug in dict_stats_scan_page() */
+ ut_a(*n_diff > 0);
+
+ if (*n_diff == 1) {
+ mtr_commit(&mtr);
+
+ /* page has all keys equal and the end of the page
+ was reached by dict_stats_scan_page(), no need to
+ descend to the leaf level */
+ mem_heap_free(heap);
+ /* can't get an estimate for n_external_pages here
+ because we do not dive to the leaf level, assume no
+ external pages (*n_external_pages was assigned to 0
+ above). */
+ return;
+ }
+ /* else */
+
+ /* when we instruct dict_stats_scan_page() to quit on the
+ first non-boring record it finds, then the returned n_diff
+ can either be 0 (empty page), 1 (page has all keys equal) or
+ 2 (non-boring record was found) */
+ ut_a(*n_diff == 2);
+
+ /* we have a non-boring record in rec, descend below it */
+
+ page_id.set_page_no(
+ btr_node_ptr_get_child_page_no(rec, offsets_rec));
+ }
+
+ /* make sure we got a leaf page as a result from the above loop */
+ ut_ad(page_is_leaf(page));
+
+ /* scan the leaf page and find the number of distinct keys,
+ when looking only at the first n_prefix columns; also estimate
+ the number of externally stored pages pointed by records on this
+ page */
+
+ offsets_rec = dict_stats_scan_page(
+ &rec, offsets1, offsets2, index, page, n_prefix,
+ index->n_core_fields, n_diff,
+ n_external_pages);
+
+#if 0
+ DEBUG_PRINTF(" %s(): n_diff below page_no=%lu: " UINT64PF "\n",
+ __func__, page_no, n_diff);
+#endif
+
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+}
+
+/** Input data that is used to calculate dict_index_t::stat_n_diff_key_vals[]
+for each n-columns prefix (n from 1 to n_uniq). */
+struct n_diff_data_t {
+ /** Index of the level on which the descent through the btree
+	stopped. Level 0 is the leaf level. This is >= 1 because we
+	avoid scanning the leaf level: it may contain too many pages and
+	scanning it is useless when combined with the random dives - if
+	we were to scan the leaf level, that would amount to a full scan
+	and we could simply do that instead of fiddling with picking
+	random records higher in the tree and diving below them. At the
+	start of the analysis we may decide to do a full scan of the leaf
+	level, but then this structure is not used in that code path. */
+ ulint level;
+
+	/** Number of records on the level where the descent through the btree
+ stopped. When we scan the btree from the root, we stop at some mid
+ level, choose some records from it and dive below them towards a leaf
+ page to analyze. */
+ ib_uint64_t n_recs_on_level;
+
+ /** Number of different key values that were found on the mid level. */
+ ib_uint64_t n_diff_on_level;
+
+ /** Number of leaf pages that are analyzed. This is also the same as
+ the number of records that we pick from the mid level and dive below
+ them. */
+ ib_uint64_t n_leaf_pages_to_analyze;
+
+ /** Cumulative sum of the number of different key values that were
+ found on all analyzed pages. */
+ ib_uint64_t n_diff_all_analyzed_pages;
+
+ /** Cumulative sum of the number of external pages (stored outside of
+ the btree but in the same file segment). */
+ ib_uint64_t n_external_pages_sum;
+};
+
+/** Estimate the number of different key values in an index when looking at
+the first n_prefix columns. For a given level in an index select
+n_diff_data->n_leaf_pages_to_analyze records from that level and dive below
+them to the corresponding leaf pages, then scan those leaf pages and save the
+sampling results in n_diff_data->n_diff_all_analyzed_pages.
+@param[in] index index
+@param[in] n_prefix look at first 'n_prefix' columns when
+comparing records
+@param[in] boundaries a vector that contains
+n_diff_data->n_diff_on_level integers each of which represents the index (on
+level 'level', counting from left/smallest to right/biggest from 0) of the
+last record from each group of distinct keys
+@param[in,out] n_diff_data n_diff_all_analyzed_pages and
+n_external_pages_sum in this structure will be set by this function. The
+members level, n_diff_on_level and n_leaf_pages_to_analyze must be set by the
+caller in advance - they are used by some calculations inside this function
+@param[in,out] mtr mini-transaction */
+static
+void
+dict_stats_analyze_index_for_n_prefix(
+ dict_index_t* index,
+ ulint n_prefix,
+ const boundaries_t* boundaries,
+ n_diff_data_t* n_diff_data,
+ mtr_t* mtr)
+{
+ btr_pcur_t pcur;
+ const page_t* page;
+ ib_uint64_t rec_idx;
+ ib_uint64_t i;
+
+#if 0
+ DEBUG_PRINTF(" %s(table=%s, index=%s, level=%lu, n_prefix=%lu,"
+ " n_diff_on_level=" UINT64PF ")\n",
+ __func__, index->table->name, index->name, level,
+ n_prefix, n_diff_data->n_diff_on_level);
+#endif
+
+ ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK));
+
+ /* Position pcur on the leftmost record on the leftmost page
+ on the desired level. */
+
+ btr_pcur_open_at_index_side(
+ true, index, BTR_SEARCH_TREE_ALREADY_S_LATCHED,
+ &pcur, true, n_diff_data->level, mtr);
+ btr_pcur_move_to_next_on_page(&pcur);
+
+ page = btr_pcur_get_page(&pcur);
+
+ const rec_t* first_rec = btr_pcur_get_rec(&pcur);
+
+ /* We shouldn't be scanning the leaf level. The caller of this function
+	should have stopped the descent on level 1 or higher. */
+ ut_ad(n_diff_data->level > 0);
+ ut_ad(!page_is_leaf(page));
+
+ /* The page must not be empty, except when
+ it is the root page (and the whole index is empty). */
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+ ut_ad(first_rec == page_rec_get_next_const(page_get_infimum_rec(page)));
+
+ /* check that we are indeed on the desired level */
+ ut_a(btr_page_get_level(page) == n_diff_data->level);
+
+ /* there should not be any pages on the left */
+ ut_a(!page_has_prev(page));
+
+ /* check whether the first record on the leftmost page is marked
+ as such; we are on a non-leaf level */
+ ut_a(rec_get_info_bits(first_rec, page_is_comp(page))
+ & REC_INFO_MIN_REC_FLAG);
+
+ const ib_uint64_t last_idx_on_level = boundaries->at(
+ static_cast<unsigned>(n_diff_data->n_diff_on_level - 1));
+
+ rec_idx = 0;
+
+ n_diff_data->n_diff_all_analyzed_pages = 0;
+ n_diff_data->n_external_pages_sum = 0;
+
+ for (i = 0; i < n_diff_data->n_leaf_pages_to_analyze; i++) {
+ /* there are n_diff_on_level elements
+ in 'boundaries' and we divide those elements
+ into n_leaf_pages_to_analyze segments, for example:
+
+ let n_diff_on_level=100, n_leaf_pages_to_analyze=4, then:
+ segment i=0: [0, 24]
+ segment i=1: [25, 49]
+ segment i=2: [50, 74]
+ segment i=3: [75, 99] or
+
+ let n_diff_on_level=1, n_leaf_pages_to_analyze=1, then:
+ segment i=0: [0, 0] or
+
+ let n_diff_on_level=2, n_leaf_pages_to_analyze=2, then:
+ segment i=0: [0, 0]
+ segment i=1: [1, 1] or
+
+ let n_diff_on_level=13, n_leaf_pages_to_analyze=7, then:
+ segment i=0: [0, 0]
+ segment i=1: [1, 2]
+ segment i=2: [3, 4]
+ segment i=3: [5, 6]
+ segment i=4: [7, 8]
+ segment i=5: [9, 10]
+ segment i=6: [11, 12]
+
+ then we select a random record from each segment and dive
+ below it */
+ const ib_uint64_t n_diff = n_diff_data->n_diff_on_level;
+ const ib_uint64_t n_pick
+ = n_diff_data->n_leaf_pages_to_analyze;
+
+ const ib_uint64_t left = n_diff * i / n_pick;
+ const ib_uint64_t right = n_diff * (i + 1) / n_pick - 1;
+
+ ut_a(left <= right);
+ ut_a(right <= last_idx_on_level);
+
+ const ulint rnd = ut_rnd_interval(
+ static_cast<ulint>(right - left));
+
+ const ib_uint64_t dive_below_idx
+ = boundaries->at(static_cast<unsigned>(left + rnd));
+
+#if 0
+ DEBUG_PRINTF(" %s(): dive below record with index="
+ UINT64PF "\n", __func__, dive_below_idx);
+#endif
+
+ /* seek to the record with index dive_below_idx */
+ while (rec_idx < dive_below_idx
+ && btr_pcur_is_on_user_rec(&pcur)) {
+
+ btr_pcur_move_to_next_user_rec(&pcur, mtr);
+ rec_idx++;
+ }
+
+ /* if the level has finished before the record we are
+ searching for, this means that the B-tree has changed in
+ the meantime, quit our sampling and use whatever stats
+ we have collected so far */
+ if (rec_idx < dive_below_idx) {
+
+ ut_ad(!btr_pcur_is_on_user_rec(&pcur));
+ break;
+ }
+
+		/* it could be that the tree has changed in such a way that
+		the record under dive_below_idx is the supremum record; in
+		this case rec_idx == dive_below_idx and pcur is positioned
+		on the supremum, and we do not want to dive below it */
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ break;
+ }
+
+ ut_a(rec_idx == dive_below_idx);
+
+ ib_uint64_t n_diff_on_leaf_page;
+ ib_uint64_t n_external_pages;
+
+ dict_stats_analyze_index_below_cur(btr_pcur_get_btr_cur(&pcur),
+ n_prefix,
+ &n_diff_on_leaf_page,
+ &n_external_pages);
+
+ /* We adjust n_diff_on_leaf_page here to avoid counting
+ one value twice - once as the last on some page and once
+ as the first on another page. Consider the following example:
+ Leaf level:
+ page: (2,2,2,2,3,3)
+ ... many pages like (3,3,3,3,3,3) ...
+ page: (3,3,3,3,5,5)
+ ... many pages like (5,5,5,5,5,5) ...
+ page: (5,5,5,5,8,8)
+ page: (8,8,8,8,9,9)
+		our algo would (correctly) get an estimate that there are
+		2 distinct records per page on average. Having analyzed
+		4 pages below non-boring records, it would (wrongly) estimate
+		the number of distinct records at 8. */
+ if (n_diff_on_leaf_page > 0) {
+ n_diff_on_leaf_page--;
+ }
+
+ n_diff_data->n_diff_all_analyzed_pages += n_diff_on_leaf_page;
+
+ n_diff_data->n_external_pages_sum += n_external_pages;
+ }
+
+ btr_pcur_close(&pcur);
+}
+
+/** statistics for an index */
+struct index_stats_t
+{
+ std::vector<index_field_stats_t> stats;
+ ulint index_size;
+ ulint n_leaf_pages;
+
+ index_stats_t(ulint n_uniq) : index_size(1), n_leaf_pages(1)
+ {
+ stats.reserve(n_uniq);
+ for (ulint i= 0; i < n_uniq; ++i)
+ stats.push_back(index_field_stats_t(0, 1, 0));
+ }
+};
+
+/** Set dict_index_t::stat_n_diff_key_vals[] and stat_n_sample_sizes[].
+@param[in] n_diff_data input data to use to derive the results
+@param[in,out] index_stats index stats to set */
+UNIV_INLINE
+void
+dict_stats_index_set_n_diff(
+ const n_diff_data_t* n_diff_data,
+ index_stats_t& index_stats)
+{
+ for (ulint n_prefix = index_stats.stats.size();
+ n_prefix >= 1;
+ n_prefix--) {
+ /* n_diff_all_analyzed_pages can be 0 here if
+ all the leaf pages sampled contained only
+ delete-marked records. In this case we should assign
+ 0 to index->stat_n_diff_key_vals[n_prefix - 1], which
+ the formula below does. */
+
+ const n_diff_data_t* data = &n_diff_data[n_prefix - 1];
+
+ ut_ad(data->n_leaf_pages_to_analyze > 0);
+ ut_ad(data->n_recs_on_level > 0);
+
+ ib_uint64_t n_ordinary_leaf_pages;
+
+ if (data->level == 1) {
+ /* If we know the number of records on level 1, then
+ this number is the same as the number of pages on
+ level 0 (leaf). */
+ n_ordinary_leaf_pages = data->n_recs_on_level;
+ } else {
+ /* If we analyzed D ordinary leaf pages and found E
+ external pages in total linked from those D ordinary
+ leaf pages, then this means that the ratio
+ ordinary/external is D/E. Then the ratio ordinary/total
+ is D / (D + E). Knowing that the total number of pages
+ is T (including ordinary and external) then we estimate
+ that the total number of ordinary leaf pages is
+ T * D / (D + E). */
+ n_ordinary_leaf_pages
+ = index_stats.n_leaf_pages
+ * data->n_leaf_pages_to_analyze
+ / (data->n_leaf_pages_to_analyze
+ + data->n_external_pages_sum);
+ }
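+		/* For example, if the index has T == 1000 leaf pages in
+		total, we analyzed D == 20 ordinary leaf pages and they
+		linked to E == 5 external pages, then the estimate is
+		1000 * 20 / (20 + 5) = 800 ordinary leaf pages. */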
+
+ /* See REF01 for an explanation of the algorithm */
+ index_stats.stats[n_prefix - 1].n_diff_key_vals
+ = n_ordinary_leaf_pages
+
+ * data->n_diff_on_level
+ / data->n_recs_on_level
+
+ * data->n_diff_all_analyzed_pages
+ / data->n_leaf_pages_to_analyze;
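+		/* Continuing the example above: with 800 estimated ordinary
+		leaf pages, n_diff_on_level == 100, n_recs_on_level == 400,
+		n_diff_all_analyzed_pages == 60 and
+		n_leaf_pages_to_analyze == 20, this evaluates to
+		800 * 100 / 400 * 60 / 20 == 600 distinct key values. */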
+
+ index_stats.stats[n_prefix - 1].n_sample_sizes
+ = data->n_leaf_pages_to_analyze;
+
+ DEBUG_PRINTF(" %s(): n_diff=" UINT64PF
+ " for n_prefix=" ULINTPF
+ " (" ULINTPF
+ " * " UINT64PF " / " UINT64PF
+ " * " UINT64PF " / " UINT64PF ")\n",
+ __func__,
+ index_stats.stats[n_prefix - 1].n_diff_key_vals,
+ n_prefix,
+ index_stats.n_leaf_pages,
+ data->n_diff_on_level,
+ data->n_recs_on_level,
+ data->n_diff_all_analyzed_pages,
+ data->n_leaf_pages_to_analyze);
+ }
+}
+
+/** Calculates new statistics for a given index and saves them to the index
+members stat_n_diff_key_vals[], stat_n_sample_sizes[], stat_index_size and
+stat_n_leaf_pages. This function can be slow.
+@param[in] index index to analyze
+@return index stats */
+static index_stats_t dict_stats_analyze_index(dict_index_t* index)
+{
+ ulint root_level;
+ ulint level;
+ bool level_is_analyzed;
+ ulint n_uniq;
+ ulint n_prefix;
+ ib_uint64_t total_recs;
+ ib_uint64_t total_pages;
+ mtr_t mtr;
+ ulint size;
+ index_stats_t result(index->n_uniq);
+ DBUG_ENTER("dict_stats_analyze_index");
+
+ DBUG_PRINT("info", ("index: %s, online status: %d", index->name(),
+ dict_index_get_online_status(index)));
+
+ ut_ad(!mutex_own(&dict_sys.mutex)); // because this function is slow
+ ut_ad(index->table->get_ref_count());
+
+ /* Disable update statistic for Rtree */
+ if (dict_index_is_spatial(index)) {
+ DBUG_RETURN(result);
+ }
+
+ DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name());
+
+ mtr.start();
+ mtr_s_lock_index(index, &mtr);
+ size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
+
+ if (size != ULINT_UNDEFINED) {
+ result.index_size = size;
+ size = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr);
+ }
+
+ /* Release the X locks on the root page taken by btr_get_size() */
+ mtr.commit();
+
+ switch (size) {
+ case ULINT_UNDEFINED:
+ dict_stats_assert_initialized_index(index);
+ DBUG_RETURN(result);
+ case 0:
+ /* The root node of the tree is a leaf */
+ size = 1;
+ }
+
+ result.n_leaf_pages = size;
+
+ mtr.start();
+ mtr_sx_lock_index(index, &mtr);
+ root_level = btr_height_get(index, &mtr);
+
+ n_uniq = dict_index_get_n_unique(index);
+
+ /* If the tree has just one level (and one page) or if the user
+ has requested to sample too many pages then do full scan.
+
+ For each n-column prefix (for n=1..n_uniq) N_SAMPLE_PAGES(index)
+ will be sampled, so in total N_SAMPLE_PAGES(index) * n_uniq leaf
+ pages will be sampled. If that number is bigger than the total
+ number of leaf pages then do full scan of the leaf level instead
+ since it will be faster and will give better results. */
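+	/* For example, if N_SAMPLE_PAGES(index) == 20 and n_uniq == 3,
+	then any index with fewer than 60 leaf pages is scanned in full. */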
+
+ if (root_level == 0
+ || N_SAMPLE_PAGES(index) * n_uniq > result.n_leaf_pages) {
+
+ if (root_level == 0) {
+ DEBUG_PRINTF(" %s(): just one page,"
+ " doing full scan\n", __func__);
+ } else {
+ DEBUG_PRINTF(" %s(): too many pages requested for"
+ " sampling, doing full scan\n", __func__);
+ }
+
+ /* do full scan of level 0; save results directly
+ into the index */
+
+ dict_stats_analyze_index_level(index,
+ 0 /* leaf level */,
+ index->stat_n_diff_key_vals,
+ &total_recs,
+ &total_pages,
+ NULL /* boundaries not needed */,
+ &mtr);
+
+ mtr.commit();
+
+ mutex_enter(&dict_sys.mutex);
+ for (ulint i = 0; i < n_uniq; i++) {
+ result.stats[i].n_diff_key_vals = index->stat_n_diff_key_vals[i];
+ result.stats[i].n_sample_sizes = total_pages;
+ result.stats[i].n_non_null_key_vals = index->stat_n_non_null_key_vals[i];
+ }
+ result.n_leaf_pages = index->stat_n_leaf_pages;
+ mutex_exit(&dict_sys.mutex);
+
+ DBUG_RETURN(result);
+ }
+
+ /* For each level that is being scanned in the btree, this contains the
+ number of different key values for all possible n-column prefixes. */
+ ib_uint64_t* n_diff_on_level = UT_NEW_ARRAY(
+ ib_uint64_t, n_uniq, mem_key_dict_stats_n_diff_on_level);
+
+ /* For each level that is being scanned in the btree, this contains the
+ index of the last record from each group of equal records (when
+ comparing only the first n columns, n=1..n_uniq). */
+ boundaries_t* n_diff_boundaries = UT_NEW_ARRAY_NOKEY(boundaries_t,
+ n_uniq);
+
+ /* For each n-column prefix this array contains the input data that is
+ used to calculate dict_index_t::stat_n_diff_key_vals[]. */
+ n_diff_data_t* n_diff_data = UT_NEW_ARRAY_NOKEY(n_diff_data_t, n_uniq);
+
+ /* total_recs is also used to estimate the number of pages on one
+ level below, so at the start we have 1 page (the root) */
+ total_recs = 1;
+
+ /* Here we use the following optimization:
+ If we find that level L is the first one (searching from the
+ root) that contains at least D distinct keys when looking at
+ the first n_prefix columns, then:
+ if we look at the first n_prefix-1 columns then the first
+ level that contains D distinct keys will be either L or a
+ lower one.
+ So if we find that the first level containing D distinct
+ keys (on n_prefix columns) is L, we continue from L when
+ searching for D distinct keys on n_prefix-1 columns. */
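+	/* This holds because any two records that are equal on their first
+	n_prefix columns are also equal on their first n_prefix-1 columns,
+	so each level contains at most as many distinct (n_prefix-1)-prefixes
+	as n_prefix-prefixes; a level too shallow for n_prefix is thus also
+	too shallow for n_prefix-1. */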
+ level = root_level;
+ level_is_analyzed = false;
+
+ for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) {
+
+ DEBUG_PRINTF(" %s(): searching level with >=%llu "
+ "distinct records, n_prefix=" ULINTPF "\n",
+ __func__, N_DIFF_REQUIRED(index), n_prefix);
+
+ /* Commit the mtr to release the tree S lock to allow
+ other threads to do some work too. */
+ mtr.commit();
+ mtr.start();
+ mtr_sx_lock_index(index, &mtr);
+ if (root_level != btr_height_get(index, &mtr)) {
+ /* Just quit if the tree has changed beyond
+ recognition here. The old stats from previous
+ runs will remain in the values that we have
+ not calculated yet. Initially when the index
+ object is created the stats members are given
+ some sensible values so leaving them untouched
+ here even the first time will not cause us to
+ read uninitialized memory later. */
+ break;
+ }
+
+ /* check whether we should pick the current level;
+ we pick level 1 even if it does not have enough
+ distinct records because we do not want to scan the
+ leaf level because it may contain too many records */
+ if (level_is_analyzed
+ && (n_diff_on_level[n_prefix - 1] >= N_DIFF_REQUIRED(index)
+ || level == 1)) {
+
+ goto found_level;
+ }
+
+ /* search for a level that contains enough distinct records */
+
+ if (level_is_analyzed && level > 1) {
+
+ /* if this does not hold we should be on
+ "found_level" instead of here */
+ ut_ad(n_diff_on_level[n_prefix - 1]
+ < N_DIFF_REQUIRED(index));
+
+ level--;
+ level_is_analyzed = false;
+ }
+
+ /* descend into the tree, searching for "good enough" level */
+ for (;;) {
+
+ /* make sure we do not scan the leaf level
+ accidentally, it may contain too many pages */
+ ut_ad(level > 0);
+
+ /* scanning the same level twice is an optimization
+ bug */
+ ut_ad(!level_is_analyzed);
+
+ /* Do not scan if this would read too many pages.
+ Here we use the following fact:
+ the number of pages on level L equals the number
+ of records on level L+1, thus we deduce that the
+ following call would scan total_recs pages, because
+ total_recs is left from the previous iteration when
+ we scanned one level upper or we have not scanned any
+ levels yet in which case total_recs is 1. */
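+			/* (Each record on level L+1 is a node pointer
+			to exactly one page on level L, which is why
+			the two counts are equal.) */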
+ if (total_recs > N_SAMPLE_PAGES(index)) {
+
+ /* if the above cond is true then we are
+ not at the root level since on the root
+ level total_recs == 1 (set before we
+ enter the n-prefix loop) and cannot
+ be > N_SAMPLE_PAGES(index) */
+ ut_a(level != root_level);
+
+ /* step one level back and be satisfied with
+ whatever it contains */
+ level++;
+ level_is_analyzed = true;
+
+ break;
+ }
+
+ dict_stats_analyze_index_level(index,
+ level,
+ n_diff_on_level,
+ &total_recs,
+ &total_pages,
+ n_diff_boundaries,
+ &mtr);
+
+ level_is_analyzed = true;
+
+ if (level == 1
+ || n_diff_on_level[n_prefix - 1]
+ >= N_DIFF_REQUIRED(index)) {
+ /* we have reached the last level we could scan
+ or we found a good level with many distinct
+ records */
+ break;
+ }
+
+ level--;
+ level_is_analyzed = false;
+ }
+found_level:
+
+ DEBUG_PRINTF(" %s(): found level " ULINTPF
+ " that has " UINT64PF
+ " distinct records for n_prefix=" ULINTPF "\n",
+ __func__, level, n_diff_on_level[n_prefix - 1],
+ n_prefix);
+ /* here we are either on level 1 or the level that we are on
+ contains >= N_DIFF_REQUIRED distinct keys or we did not scan
+ deeper levels because they would contain too many pages */
+
+ ut_ad(level > 0);
+
+ ut_ad(level_is_analyzed);
+
+ /* if any of these is 0 then there is exactly one page in the
+ B-tree and it is empty and we should have done full scan and
+ should not be here */
+ ut_ad(total_recs > 0);
+ ut_ad(n_diff_on_level[n_prefix - 1] > 0);
+
+ ut_ad(N_SAMPLE_PAGES(index) > 0);
+
+ n_diff_data_t* data = &n_diff_data[n_prefix - 1];
+
+ data->level = level;
+
+ data->n_recs_on_level = total_recs;
+
+ data->n_diff_on_level = n_diff_on_level[n_prefix - 1];
+
+ data->n_leaf_pages_to_analyze = std::min(
+ N_SAMPLE_PAGES(index),
+ n_diff_on_level[n_prefix - 1]);
+
+ /* pick some records from this level and dive below them for
+ the given n_prefix */
+
+ dict_stats_analyze_index_for_n_prefix(
+ index, n_prefix, &n_diff_boundaries[n_prefix - 1],
+ data, &mtr);
+ }
+
+ mtr.commit();
+
+ UT_DELETE_ARRAY(n_diff_boundaries);
+
+ UT_DELETE_ARRAY(n_diff_on_level);
+
+	/* n_prefix == 0 means that the above loop did not end prematurely
+	due to the tree being changed, and so n_diff_data[] is fully set up. */
+ if (n_prefix == 0) {
+ dict_stats_index_set_n_diff(n_diff_data, result);
+ }
+
+ UT_DELETE_ARRAY(n_diff_data);
+
+ DBUG_RETURN(result);
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. This function
+is relatively slow and is used to calculate persistent statistics that
+will be saved on disk.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+dict_stats_update_persistent(
+/*=========================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ dict_index_t* index;
+
+ DEBUG_PRINTF("%s(table=%s)\n", __func__, table->name);
+
+ DEBUG_SYNC_C("dict_stats_update_persistent");
+
+ /* analyze the clustered index first */
+
+ index = dict_table_get_first_index(table);
+
+ if (index == NULL
+ || index->is_corrupted()
+ || (index->type | DICT_UNIQUE) != (DICT_CLUSTERED | DICT_UNIQUE)) {
+
+ /* Table definition is corrupt */
+ dict_stats_empty_table(table, true);
+
+ return(DB_CORRUPTION);
+ }
+
+ ut_ad(!dict_index_is_ibuf(index));
+ mutex_enter(&dict_sys.mutex);
+ dict_stats_empty_index(index, false);
+ mutex_exit(&dict_sys.mutex);
+
+ index_stats_t stats = dict_stats_analyze_index(index);
+
+ mutex_enter(&dict_sys.mutex);
+ index->stat_index_size = stats.index_size;
+ index->stat_n_leaf_pages = stats.n_leaf_pages;
+ for (size_t i = 0; i < stats.stats.size(); ++i) {
+ index->stat_n_diff_key_vals[i] = stats.stats[i].n_diff_key_vals;
+ index->stat_n_sample_sizes[i] = stats.stats[i].n_sample_sizes;
+ index->stat_n_non_null_key_vals[i] = stats.stats[i].n_non_null_key_vals;
+ }
+
+ ulint n_unique = dict_index_get_n_unique(index);
+
+ table->stat_n_rows = index->stat_n_diff_key_vals[n_unique - 1];
+
+ table->stat_clustered_index_size = index->stat_index_size;
+
+ /* analyze other indexes from the table, if any */
+
+ table->stat_sum_of_other_index_sizes = 0;
+
+ for (index = dict_table_get_next_index(index);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ if (index->type & (DICT_FTS | DICT_SPATIAL)) {
+ continue;
+ }
+
+ dict_stats_empty_index(index, false);
+
+ if (dict_stats_should_ignore_index(index)) {
+ continue;
+ }
+
+ if (!(table->stats_bg_flag & BG_STAT_SHOULD_QUIT)) {
+ mutex_exit(&dict_sys.mutex);
+ stats = dict_stats_analyze_index(index);
+ mutex_enter(&dict_sys.mutex);
+
+ index->stat_index_size = stats.index_size;
+ index->stat_n_leaf_pages = stats.n_leaf_pages;
+ for (size_t i = 0; i < stats.stats.size(); ++i) {
+ index->stat_n_diff_key_vals[i]
+ = stats.stats[i].n_diff_key_vals;
+ index->stat_n_sample_sizes[i]
+ = stats.stats[i].n_sample_sizes;
+ index->stat_n_non_null_key_vals[i]
+ = stats.stats[i].n_non_null_key_vals;
+ }
+ }
+
+ table->stat_sum_of_other_index_sizes
+ += index->stat_index_size;
+ }
+
+ table->stats_last_recalc = time(NULL);
+
+ table->stat_modified_counter = 0;
+
+ table->stat_initialized = TRUE;
+
+ dict_stats_assert_initialized(table);
+
+ mutex_exit(&dict_sys.mutex);
+
+ return(DB_SUCCESS);
+}
+
+#include "mysql_com.h"
+/** Save an individual index's statistic into the persistent statistics
+storage.
+@param[in] index index to be updated
+@param[in] last_update timestamp of the stat
+@param[in] stat_name name of the stat
+@param[in] stat_value value of the stat
+@param[in] sample_size n pages sampled or NULL
+@param[in] stat_description description of the stat
+@param[in,out] trx in case of NULL the function will
+allocate and free the trx object. If it is not NULL then it will be
+rolled back only in the case of error, but not freed.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_save_index_stat(
+ dict_index_t* index,
+ time_t last_update,
+ const char* stat_name,
+ ib_uint64_t stat_value,
+ ib_uint64_t* sample_size,
+ const char* stat_description,
+ trx_t* trx)
+{
+ dberr_t ret;
+ pars_info_t* pinfo;
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+
+ ut_ad(!trx || trx->internal || trx->mysql_thd);
+ ut_d(dict_sys.assert_locked());
+
+ dict_fs2utf8(index->table->name.m_name, db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ pinfo = pars_info_create();
+ pars_info_add_str_literal(pinfo, "database_name", db_utf8);
+ pars_info_add_str_literal(pinfo, "table_name", table_utf8);
+ pars_info_add_str_literal(pinfo, "index_name", index->name);
+ MEM_CHECK_DEFINED(&last_update, 4);
+ pars_info_add_int4_literal(pinfo, "last_update", uint32(last_update));
+ MEM_CHECK_DEFINED(stat_name, strlen(stat_name));
+ pars_info_add_str_literal(pinfo, "stat_name", stat_name);
+ MEM_CHECK_DEFINED(&stat_value, 8);
+ pars_info_add_ull_literal(pinfo, "stat_value", stat_value);
+ if (sample_size != NULL) {
+ MEM_CHECK_DEFINED(sample_size, 8);
+ pars_info_add_ull_literal(pinfo, "sample_size", *sample_size);
+ } else {
+ pars_info_add_literal(pinfo, "sample_size", NULL,
+ UNIV_SQL_NULL, DATA_FIXBINARY, 0);
+ }
+ pars_info_add_str_literal(pinfo, "stat_description",
+ stat_description);
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE INDEX_STATS_SAVE () IS\n"
+ "BEGIN\n"
+
+ "DELETE FROM \"" INDEX_STATS_NAME "\"\n"
+ "WHERE\n"
+ "database_name = :database_name AND\n"
+ "table_name = :table_name AND\n"
+ "index_name = :index_name AND\n"
+ "stat_name = :stat_name;\n"
+
+ "INSERT INTO \"" INDEX_STATS_NAME "\"\n"
+ "VALUES\n"
+ "(\n"
+ ":database_name,\n"
+ ":table_name,\n"
+ ":index_name,\n"
+ ":last_update,\n"
+ ":stat_name,\n"
+ ":stat_value,\n"
+ ":sample_size,\n"
+ ":stat_description\n"
+ ");\n"
+ "END;", trx);
+
+ if (UNIV_UNLIKELY(ret != DB_SUCCESS)) {
+ if (innodb_index_stats_not_found == false &&
+ index->stats_error_printed == false) {
+ ib::error() << "Cannot save index statistics for table "
+ << index->table->name
+ << ", index " << index->name
+ << ", stat name \"" << stat_name << "\": "
+ << ret;
+ index->stats_error_printed = true;
+ }
+ }
+
+ return(ret);
+}
+
+/** Report an error if updating table statistics failed because
+.ibd file is missing, table decryption failed or table is corrupted.
+@param[in,out] table Table
+@param[in] defragment true if statistics is for defragment
+@retval DB_DECRYPTION_FAILED if decryption of the table failed
+@retval DB_TABLESPACE_DELETED if .ibd file is missing
+@retval DB_CORRUPTION if table is marked as corrupted */
+dberr_t
+dict_stats_report_error(dict_table_t* table, bool defragment)
+{
+ dberr_t err;
+
+ const char* df = defragment ? " defragment" : "";
+
+ if (!table->space) {
+ ib::warn() << "Cannot save" << df << " statistics for table "
+ << table->name
+ << " because the .ibd file is missing. "
+ << TROUBLESHOOTING_MSG;
+ err = DB_TABLESPACE_DELETED;
+ } else {
+ ib::warn() << "Cannot save" << df << " statistics for table "
+ << table->name
+ << " because file "
+ << table->space->chain.start->name
+ << (table->corrupted
+ ? " is corrupted."
+ : " cannot be decrypted.");
+ err = table->corrupted ? DB_CORRUPTION : DB_DECRYPTION_FAILED;
+ }
+
+ dict_stats_empty_table(table, defragment);
+ return err;
+}
+
+/** Save the table's statistics into the persistent statistics storage.
+@param[in] table_orig table whose stats to save
+@param[in] only_for_index if this is non-NULL, then stats for indexes
+that are not equal to it will not be saved, if NULL, then all indexes' stats
+are saved
+@return DB_SUCCESS or error code */
+static
+dberr_t
+dict_stats_save(
+ dict_table_t* table_orig,
+ const index_id_t* only_for_index)
+{
+ pars_info_t* pinfo;
+ dberr_t ret;
+ dict_table_t* table;
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+
+ if (high_level_read_only) {
+ return DB_READ_ONLY;
+ }
+
+ if (!table_orig->is_readable()) {
+ return (dict_stats_report_error(table_orig));
+ }
+
+ table = dict_stats_snapshot_create(table_orig);
+
+ dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ const time_t now = time(NULL);
+ dict_sys_lock();
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "database_name", db_utf8);
+ pars_info_add_str_literal(pinfo, "table_name", table_utf8);
+ pars_info_add_int4_literal(pinfo, "last_update", uint32(now));
+ pars_info_add_ull_literal(pinfo, "n_rows", table->stat_n_rows);
+ pars_info_add_ull_literal(pinfo, "clustered_index_size",
+ table->stat_clustered_index_size);
+ pars_info_add_ull_literal(pinfo, "sum_of_other_index_sizes",
+ table->stat_sum_of_other_index_sizes);
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE TABLE_STATS_SAVE () IS\n"
+ "BEGIN\n"
+
+ "DELETE FROM \"" TABLE_STATS_NAME "\"\n"
+ "WHERE\n"
+ "database_name = :database_name AND\n"
+ "table_name = :table_name;\n"
+
+ "INSERT INTO \"" TABLE_STATS_NAME "\"\n"
+ "VALUES\n"
+ "(\n"
+ ":database_name,\n"
+ ":table_name,\n"
+ ":last_update,\n"
+ ":n_rows,\n"
+ ":clustered_index_size,\n"
+ ":sum_of_other_index_sizes\n"
+ ");\n"
+ "END;", NULL);
+
+ if (UNIV_UNLIKELY(ret != DB_SUCCESS)) {
+ ib::error() << "Cannot save table statistics for table "
+ << table->name << ": " << ret;
+func_exit:
+ dict_sys_unlock();
+ dict_stats_snapshot_free(table);
+ return ret;
+ }
+
+ trx_t* trx = trx_create();
+ trx_start_internal(trx);
+
+ dict_index_t* index;
+ index_map_t indexes(
+ (ut_strcmp_functor()),
+ index_map_t_allocator(mem_key_dict_stats_index_map_t));
+
+ /* Below we do all the modifications in innodb_index_stats in a single
+ transaction for performance reasons. Modifying more than one row in a
+ single transaction may deadlock with other transactions if they
+ lock the rows in different order. Other transaction could be for
+ example when we DROP a table and do
+ DELETE FROM innodb_index_stats WHERE database_name = '...'
+ AND table_name = '...'; which will affect more than one row. To
+ prevent deadlocks we always lock the rows in the same order - the
+ order of the PK, which is (database_name, table_name, index_name,
+ stat_name). This is why below we sort the indexes by name and then
+ for each index, do the mods ordered by stat_name. */
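+	/* For example, with indexes named PRIMARY and k1, the writes below
+	touch the rows in ascending PK order: PRIMARY's stats first, each
+	index's stats ordered by stat_name (n_diff_pfx01, ..., n_leaf_pages,
+	size), and then k1's stats in the same order. */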
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ indexes[index->name] = index;
+ }
+
+ index_map_t::const_iterator it;
+
+ for (it = indexes.begin(); it != indexes.end(); ++it) {
+
+ index = it->second;
+
+ if (only_for_index != NULL && index->id != *only_for_index) {
+ continue;
+ }
+
+ if (dict_stats_should_ignore_index(index)) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ for (unsigned i = 0; i < index->n_uniq; i++) {
+
+ char stat_name[16];
+ char stat_description[1024];
+
+ snprintf(stat_name, sizeof(stat_name),
+ "n_diff_pfx%02u", i + 1);
+
+ /* craft a string that contains the column names */
+ snprintf(stat_description, sizeof(stat_description),
+ "%s", index->fields[0].name());
+ for (unsigned j = 1; j <= i; j++) {
+ size_t len;
+
+ len = strlen(stat_description);
+
+ snprintf(stat_description + len,
+ sizeof(stat_description) - len,
+ ",%s", index->fields[j].name());
+ }
+
+ ret = dict_stats_save_index_stat(
+ index, now, stat_name,
+ index->stat_n_diff_key_vals[i],
+ &index->stat_n_sample_sizes[i],
+ stat_description, trx);
+
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+ }
+
+ ret = dict_stats_save_index_stat(index, now, "n_leaf_pages",
+ index->stat_n_leaf_pages,
+ NULL,
+ "Number of leaf pages "
+ "in the index", trx);
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+
+ ret = dict_stats_save_index_stat(index, now, "size",
+ index->stat_index_size,
+ NULL,
+ "Number of pages "
+ "in the index", trx);
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+ }
+
+ trx_commit_for_mysql(trx);
+
+end:
+ trx->free();
+ goto func_exit;
+}
+
+/*********************************************************************//**
+Called for the row that is selected by
+SELECT ... FROM mysql.innodb_table_stats WHERE table='...'
+The second argument is a pointer to the table and the fetched stats are
+written to it.
+@return non-NULL dummy */
+static
+ibool
+dict_stats_fetch_table_stats_step(
+/*==============================*/
+ void* node_void, /*!< in: select node */
+ void* table_void) /*!< out: table */
+{
+ sel_node_t* node = (sel_node_t*) node_void;
+ dict_table_t* table = (dict_table_t*) table_void;
+ que_common_t* cnode;
+ int i;
+
+ /* this should loop exactly 3 times - for
+ n_rows,clustered_index_size,sum_of_other_index_sizes */
+ for (cnode = static_cast<que_common_t*>(node->select_list), i = 0;
+ cnode != NULL;
+ cnode = static_cast<que_common_t*>(que_node_get_next(cnode)),
+ i++) {
+
+ const byte* data;
+ dfield_t* dfield = que_node_get_val(cnode);
+ dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ data = static_cast<const byte*>(dfield_get_data(dfield));
+
+ switch (i) {
+ case 0: /* mysql.innodb_table_stats.n_rows */
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(len == 8);
+
+ table->stat_n_rows = mach_read_from_8(data);
+
+ break;
+
+ case 1: /* mysql.innodb_table_stats.clustered_index_size */
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(len == 8);
+
+ table->stat_clustered_index_size
+ = (ulint) mach_read_from_8(data);
+
+ break;
+
+ case 2: /* mysql.innodb_table_stats.sum_of_other_index_sizes */
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(len == 8);
+
+ table->stat_sum_of_other_index_sizes
+ = (ulint) mach_read_from_8(data);
+
+ break;
+
+ default:
+
+ /* someone changed SELECT
+ n_rows,clustered_index_size,sum_of_other_index_sizes
+ to select more columns from innodb_table_stats without
+ adjusting here */
+ ut_error;
+ }
+ }
+
+ /* if i < 3 this means someone changed the
+ SELECT n_rows,clustered_index_size,sum_of_other_index_sizes
+ to select less columns from innodb_table_stats without adjusting here;
+ if i > 3 we would have ut_error'ed earlier */
+ ut_a(i == 3 /*n_rows,clustered_index_size,sum_of_other_index_sizes*/);
+
+ /* XXX this is not used but returning non-NULL is necessary */
+ return(TRUE);
+}
+
+/** Aux struct used to pass a table and a boolean to
+dict_stats_fetch_index_stats_step(). */
+struct index_fetch_t {
+ dict_table_t* table; /*!< table whose indexes are to be modified */
+ bool stats_were_modified; /*!< will be set to true if at
+				least one index's stats were modified */
+};
+
+/*********************************************************************//**
+Called for the rows that are selected by
+SELECT ... FROM mysql.innodb_index_stats WHERE table='...'
+The second argument is a pointer to the table and the fetched stats are
+written to its indexes.
+Suppose a table has N indexes and index i has U_i unique columns, for i=1..N;
+then mysql.innodb_index_stats will have SUM(U_i), i=1..N, rows for that table.
+So this function will be called SUM(U_i) times, where SUM(U_i) is of magnitude
+N*AVG(U_i). In each call it searches for the currently fetched index in
+table->indexes linearly, assuming this list is not sorted. Thus, overall,
+fetching all indexes' stats from mysql.innodb_index_stats is O(N^2) where N
+is the number of indexes.
+This can be improved if we sort table->indexes in a temporary area just once
+and then search in that sorted list. Then the complexity will be O(N*log(N)).
+We assume a table will not have more than 100 indexes, so we go with the
+simpler N^2 algorithm.
+@return non-NULL dummy */
+static
+ibool
+dict_stats_fetch_index_stats_step(
+/*==============================*/
+ void* node_void, /*!< in: select node */
+ void* arg_void) /*!< out: table + a flag that tells if we
+ modified anything */
+{
+ sel_node_t* node = (sel_node_t*) node_void;
+ index_fetch_t* arg = (index_fetch_t*) arg_void;
+ dict_table_t* table = arg->table;
+ dict_index_t* index = NULL;
+ que_common_t* cnode;
+ const char* stat_name = NULL;
+ ulint stat_name_len = ULINT_UNDEFINED;
+ ib_uint64_t stat_value = UINT64_UNDEFINED;
+ ib_uint64_t sample_size = UINT64_UNDEFINED;
+ int i;
+
+ /* this should loop exactly 4 times - for the columns that
+ were selected: index_name,stat_name,stat_value,sample_size */
+ for (cnode = static_cast<que_common_t*>(node->select_list), i = 0;
+ cnode != NULL;
+ cnode = static_cast<que_common_t*>(que_node_get_next(cnode)),
+ i++) {
+
+ const byte* data;
+ dfield_t* dfield = que_node_get_val(cnode);
+ dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ data = static_cast<const byte*>(dfield_get_data(dfield));
+
+ switch (i) {
+ case 0: /* mysql.innodb_index_stats.index_name */
+
+ ut_a(dtype_get_mtype(type) == DATA_VARMYSQL);
+
+ /* search for index in table's indexes whose name
+ matches data; the fetched index name is in data,
+ has no terminating '\0' and has length len */
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (index->is_committed()
+ && strlen(index->name) == len
+ && memcmp(index->name, data, len) == 0) {
+ /* the corresponding index was found */
+ break;
+ }
+ }
+
+ /* if index is NULL here this means that
+ mysql.innodb_index_stats contains more rows than the
+ number of indexes in the table; this is ok, we just
+ return ignoring those extra rows; in other words
+ dict_stats_fetch_index_stats_step() has been called
+ for a row from index_stats with unknown index_name
+ column */
+ if (index == NULL) {
+
+ return(TRUE);
+ }
+
+ break;
+
+ case 1: /* mysql.innodb_index_stats.stat_name */
+
+ ut_a(dtype_get_mtype(type) == DATA_VARMYSQL);
+
+ ut_a(index != NULL);
+
+ stat_name = (const char*) data;
+ stat_name_len = len;
+
+ break;
+
+ case 2: /* mysql.innodb_index_stats.stat_value */
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(len == 8);
+
+ ut_a(index != NULL);
+ ut_a(stat_name != NULL);
+ ut_a(stat_name_len != ULINT_UNDEFINED);
+
+ stat_value = mach_read_from_8(data);
+
+ break;
+
+ case 3: /* mysql.innodb_index_stats.sample_size */
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(len == 8 || len == UNIV_SQL_NULL);
+
+ ut_a(index != NULL);
+ ut_a(stat_name != NULL);
+ ut_a(stat_name_len != ULINT_UNDEFINED);
+ ut_a(stat_value != UINT64_UNDEFINED);
+
+ if (len == UNIV_SQL_NULL) {
+ break;
+ }
+ /* else */
+
+ sample_size = mach_read_from_8(data);
+
+ break;
+
+ default:
+
+ /* someone changed
+ SELECT index_name,stat_name,stat_value,sample_size
+ to select more columns from innodb_index_stats without
+ adjusting here */
+ ut_error;
+ }
+ }
+
+ /* if i < 4 this means someone changed the
+ SELECT index_name,stat_name,stat_value,sample_size
+ to select less columns from innodb_index_stats without adjusting here;
+ if i > 4 we would have ut_error'ed earlier */
+ ut_a(i == 4 /* index_name,stat_name,stat_value,sample_size */);
+
+ ut_a(index != NULL);
+ ut_a(stat_name != NULL);
+ ut_a(stat_name_len != ULINT_UNDEFINED);
+ ut_a(stat_value != UINT64_UNDEFINED);
+ /* sample_size could be UINT64_UNDEFINED here, if it is NULL */
+
+#define PFX "n_diff_pfx"
+#define PFX_LEN 10
+
+ if (stat_name_len == 4 /* strlen("size") */
+ && strncasecmp("size", stat_name, stat_name_len) == 0) {
+ index->stat_index_size = (ulint) stat_value;
+ arg->stats_were_modified = true;
+ } else if (stat_name_len == 12 /* strlen("n_leaf_pages") */
+ && strncasecmp("n_leaf_pages", stat_name, stat_name_len)
+ == 0) {
+ index->stat_n_leaf_pages = (ulint) stat_value;
+ arg->stats_were_modified = true;
+ } else if (stat_name_len == 12 /* strlen("n_page_split") */
+ && strncasecmp("n_page_split", stat_name, stat_name_len)
+ == 0) {
+ index->stat_defrag_n_page_split = (ulint) stat_value;
+ arg->stats_were_modified = true;
+ } else if (stat_name_len == 13 /* strlen("n_pages_freed") */
+ && strncasecmp("n_pages_freed", stat_name, stat_name_len)
+ == 0) {
+ index->stat_defrag_n_pages_freed = (ulint) stat_value;
+ arg->stats_were_modified = true;
+ } else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */
+ && strncasecmp(PFX, stat_name, PFX_LEN) == 0) {
+
+ const char* num_ptr;
+ unsigned long n_pfx;
+
+ /* point num_ptr at the first digit, e.g. the "1" in "n_diff_pfx12..." */
+ num_ptr = stat_name + PFX_LEN;
+
+ /* stat_name should have exactly 2 chars appended to PFX
+ and they should be digits */
+ if (stat_name_len != PFX_LEN + 2
+ || num_ptr[0] < '0' || num_ptr[0] > '9'
+ || num_ptr[1] < '0' || num_ptr[1] > '9') {
+
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+
+ dict_fs2utf8(table->name.m_name,
+ db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ ib::info out;
+ out << "Ignoring strange row from "
+ << INDEX_STATS_NAME_PRINT << " WHERE"
+ " database_name = '" << db_utf8
+ << "' AND table_name = '" << table_utf8
+ << "' AND index_name = '" << index->name()
+ << "' AND stat_name = '";
+ out.write(stat_name, stat_name_len);
+ out << "'; because stat_name is malformed";
+ return(TRUE);
+ }
+ /* else */
+
+ /* extract the 12 from "n_diff_pfx12..." into n_pfx;
+ note that stat_name does not have a terminating '\0' */
+ n_pfx = ulong(num_ptr[0] - '0') * 10 + ulong(num_ptr[1] - '0');
+
+ ulint n_uniq = index->n_uniq;
+
+ if (n_pfx == 0 || n_pfx > n_uniq) {
+
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+
+ dict_fs2utf8(table->name.m_name,
+ db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ ib::info out;
+ out << "Ignoring strange row from "
+ << INDEX_STATS_NAME_PRINT << " WHERE"
+ " database_name = '" << db_utf8
+ << "' AND table_name = '" << table_utf8
+ << "' AND index_name = '" << index->name()
+ << "' AND stat_name = '";
+ out.write(stat_name, stat_name_len);
+ out << "'; because stat_name is out of range, the index"
+ " has " << n_uniq << " unique columns";
+
+ return(TRUE);
+ }
+ /* else */
+
+ index->stat_n_diff_key_vals[n_pfx - 1] = stat_value;
+
+ if (sample_size != UINT64_UNDEFINED) {
+ index->stat_n_sample_sizes[n_pfx - 1] = sample_size;
+ } else {
+ /* hmm, strange... the user must have UPDATEd the
+ table manually and SET sample_size = NULL */
+ index->stat_n_sample_sizes[n_pfx - 1] = 0;
+ }
+
+ index->stat_n_non_null_key_vals[n_pfx - 1] = 0;
+
+ arg->stats_were_modified = true;
+ } else {
+ /* silently ignore rows with unknown stat_name, the
+ user may have developed her own stats */
+ }
+
+ /* XXX this is not used but returning non-NULL is necessary */
+ return(TRUE);
+}
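+
+/* For illustration (a worked example of the mapping above, with
+hypothetical names): assuming an index on columns (a, b, c), a fetched
+row (index_name, stat_name, stat_value, sample_size) =
+('idx_abc', 'n_diff_pfx02', 1700, 50) yields n_pfx == 2 and thus sets
+
+	index->stat_n_diff_key_vals[1] = 1700;
+	index->stat_n_sample_sizes[1] = 50;
+
+i.e. an estimate of 1700 distinct (a, b) prefixes, derived from a
+sample of 50 leaf pages. */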
+
+/*********************************************************************//**
+Read table's statistics from the persistent statistics storage.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+dict_stats_fetch_from_ps(
+/*=====================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ index_fetch_t index_fetch_arg;
+ trx_t* trx;
+ pars_info_t* pinfo;
+ dberr_t ret;
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+
+ ut_ad(!mutex_own(&dict_sys.mutex));
+
+ /* Initialize all stats to dummy values before fetching because if
+ the persistent storage contains incomplete stats (e.g. missing stats
+ for some index) then we would end up with (partially) uninitialized
+ stats. */
+ dict_stats_empty_table(table, true);
+
+ trx = trx_create();
+
+ /* Use 'read-uncommitted' so that the SELECTs we execute
+ do not get blocked in case some user has locked the rows we
+ are SELECTing */
+
+ trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
+
+ if (srv_read_only_mode) {
+ trx_start_internal_read_only(trx);
+ } else {
+ trx_start_internal(trx);
+ }
+
+ dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "database_name", db_utf8);
+
+ pars_info_add_str_literal(pinfo, "table_name", table_utf8);
+
+ pars_info_bind_function(pinfo,
+ "fetch_table_stats_step",
+ dict_stats_fetch_table_stats_step,
+ table);
+
+ index_fetch_arg.table = table;
+ index_fetch_arg.stats_were_modified = false;
+ pars_info_bind_function(pinfo,
+ "fetch_index_stats_step",
+ dict_stats_fetch_index_stats_step,
+ &index_fetch_arg);
+
+ ret = que_eval_sql(pinfo,
+ "PROCEDURE FETCH_STATS () IS\n"
+ "found INT;\n"
+ "DECLARE FUNCTION fetch_table_stats_step;\n"
+ "DECLARE FUNCTION fetch_index_stats_step;\n"
+ "DECLARE CURSOR table_stats_cur IS\n"
+ " SELECT\n"
+ /* if you change the selected fields, be
+ sure to adjust
+ dict_stats_fetch_table_stats_step() */
+ " n_rows,\n"
+ " clustered_index_size,\n"
+ " sum_of_other_index_sizes\n"
+ " FROM \"" TABLE_STATS_NAME "\"\n"
+ " WHERE\n"
+ " database_name = :database_name AND\n"
+ " table_name = :table_name;\n"
+ "DECLARE CURSOR index_stats_cur IS\n"
+ " SELECT\n"
+ /* if you change the selected fields, be
+ sure to adjust
+ dict_stats_fetch_index_stats_step() */
+ " index_name,\n"
+ " stat_name,\n"
+ " stat_value,\n"
+ " sample_size\n"
+ " FROM \"" INDEX_STATS_NAME "\"\n"
+ " WHERE\n"
+ " database_name = :database_name AND\n"
+ " table_name = :table_name;\n"
+
+ "BEGIN\n"
+
+ "OPEN table_stats_cur;\n"
+ "FETCH table_stats_cur INTO\n"
+ " fetch_table_stats_step();\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " CLOSE table_stats_cur;\n"
+ " RETURN;\n"
+ "END IF;\n"
+ "CLOSE table_stats_cur;\n"
+
+ "OPEN index_stats_cur;\n"
+ "found := 1;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH index_stats_cur INTO\n"
+ " fetch_index_stats_step();\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE index_stats_cur;\n"
+
+ "END;",
+ TRUE, trx);
+ /* pinfo is freed by que_eval_sql() */
+
+ trx_commit_for_mysql(trx);
+
+ trx->free();
+
+ if (!index_fetch_arg.stats_were_modified) {
+ return(DB_STATS_DO_NOT_EXIST);
+ }
+
+ return(ret);
+}
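+
+/* The internal SQL above drives the two bound callbacks: each
+"FETCH ... INTO fetch_table_stats_step()" invokes
+dict_stats_fetch_table_stats_step() with one row, and likewise for the
+index stats cursor. A minimal sketch of this binding pattern
+(illustrative only, with hypothetical names):
+
+	pars_info_t*	pinfo = pars_info_create();
+	pars_info_add_str_literal(pinfo, "database_name", "db");
+	pars_info_bind_function(pinfo, "my_step", my_step_func, my_arg);
+	que_eval_sql(pinfo,
+		     "PROCEDURE P () IS ... FETCH cur INTO my_step(); ...",
+		     TRUE, trx);
+
+pinfo is freed by que_eval_sql(). */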
+
+/*********************************************************************//**
+Clear defragmentation stats modified counter for all indices in table. */
+static
+void
+dict_stats_empty_defrag_modified_counter(
+ dict_table_t* table) /*!< in: table */
+{
+ dict_index_t* index;
+ ut_a(table);
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ index->stat_defrag_modified_counter = 0;
+ }
+}
+
+/*********************************************************************//**
+Fetches or calculates new estimates for index statistics. */
+void
+dict_stats_update_for_index(
+/*========================*/
+ dict_index_t* index) /*!< in/out: index */
+{
+ DBUG_ENTER("dict_stats_update_for_index");
+
+ ut_ad(!mutex_own(&dict_sys.mutex));
+
+ if (dict_stats_is_persistent_enabled(index->table)) {
+
+ if (dict_stats_persistent_storage_check(false)) {
+ index_stats_t stats = dict_stats_analyze_index(index);
+ mutex_enter(&dict_sys.mutex);
+ index->stat_index_size = stats.index_size;
+ index->stat_n_leaf_pages = stats.n_leaf_pages;
+ for (size_t i = 0; i < stats.stats.size(); ++i) {
+ index->stat_n_diff_key_vals[i]
+ = stats.stats[i].n_diff_key_vals;
+ index->stat_n_sample_sizes[i]
+ = stats.stats[i].n_sample_sizes;
+ index->stat_n_non_null_key_vals[i]
+ = stats.stats[i].n_non_null_key_vals;
+ }
+ index->table->stat_sum_of_other_index_sizes
+ += index->stat_index_size;
+ mutex_exit(&dict_sys.mutex);
+
+ dict_stats_save(index->table, &index->id);
+ DBUG_VOID_RETURN;
+ }
+ /* else */
+
+ if (innodb_index_stats_not_found == false &&
+ index->stats_error_printed == false) {
+ /* Fall back to transient stats since the persistent
+ storage is not present or is corrupted */
+
+ ib::info() << "Recalculation of persistent statistics"
+ " requested for table " << index->table->name
+ << " index " << index->name
+ << " but the required"
+ " persistent statistics storage is not present or is"
+ " corrupted. Using transient stats instead.";
+ index->stats_error_printed = true;
+ }
+ }
+
+ dict_stats_update_transient_for_index(index);
+
+ DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_update(
+/*==============*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_stats_upd_option_t stats_upd_option)
+ /*!< in: whether to (re) calc
+ the stats or to fetch them from
+ the persistent statistics
+ storage */
+{
+ ut_ad(!mutex_own(&dict_sys.mutex));
+
+ if (!table->is_readable()) {
+ return (dict_stats_report_error(table));
+ } else if (srv_force_recovery > SRV_FORCE_NO_IBUF_MERGE) {
+ /* If we have set a high innodb_force_recovery level, do
+ not calculate statistics, as a badly corrupted index can
+ cause a crash during the calculation. */
+ dict_stats_empty_table(table, false);
+ return(DB_SUCCESS);
+ }
+
+ switch (stats_upd_option) {
+ case DICT_STATS_RECALC_PERSISTENT:
+
+ if (srv_read_only_mode) {
+ goto transient;
+ }
+
+ /* Persistent recalculation requested, called from
+ 1) ANALYZE TABLE, or
+ 2) the auto recalculation background thread, or
+ 3) open table if stats do not exist on disk and auto recalc
+ is enabled */
+
+ /* InnoDB internal tables (e.g. SYS_TABLES) cannot have
+ persistent stats enabled */
+ ut_a(strchr(table->name.m_name, '/') != NULL);
+
+ /* check if the persistent statistics storage exists
+ before calling the potentially slow function
+ dict_stats_update_persistent(); that is a
+ prerequisite for dict_stats_save() succeeding */
+ if (dict_stats_persistent_storage_check(false)) {
+
+ dberr_t err;
+
+ err = dict_stats_update_persistent(table);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ err = dict_stats_save(table, NULL);
+
+ return(err);
+ }
+
+ /* Fall back to transient stats since the persistent
+ storage is not present or is corrupted */
+
+ if (innodb_table_stats_not_found == false &&
+ table->stats_error_printed == false) {
+ ib::warn() << "Recalculation of persistent statistics"
+ " requested for table "
+ << table->name
+ << " but the required persistent"
+ " statistics storage is not present or is corrupted."
+ " Using transient stats instead.";
+ table->stats_error_printed = true;
+ }
+
+ goto transient;
+
+ case DICT_STATS_RECALC_TRANSIENT:
+
+ goto transient;
+
+ case DICT_STATS_EMPTY_TABLE:
+
+ dict_stats_empty_table(table, true);
+
+ /* If table is using persistent stats,
+ then save the stats on disk */
+
+ if (dict_stats_is_persistent_enabled(table)) {
+
+ if (dict_stats_persistent_storage_check(false)) {
+
+ return(dict_stats_save(table, NULL));
+ }
+
+ return(DB_STATS_DO_NOT_EXIST);
+ }
+
+ return(DB_SUCCESS);
+
+ case DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY:
+
+ /* fetch requested, either fetch from persistent statistics
+ storage or use the old method */
+
+ if (table->stat_initialized) {
+ return(DB_SUCCESS);
+ }
+
+ /* InnoDB internal tables (e.g. SYS_TABLES) cannot have
+ persistent stats enabled */
+ ut_a(strchr(table->name.m_name, '/') != NULL);
+
+ if (!dict_stats_persistent_storage_check(false)) {
+ /* persistent statistics storage does not exist
+ or is corrupted, calculate the transient stats */
+
+ if (innodb_table_stats_not_found == false &&
+ table->stats_error_printed == false) {
+ ib::error() << "Fetch of persistent statistics"
+ " requested for table "
+ << table->name
+ << " but the required system tables "
+ << TABLE_STATS_NAME_PRINT
+ << " and " << INDEX_STATS_NAME_PRINT
+ << " are not present or have unexpected"
+ " structure. Using transient stats instead.";
+ table->stats_error_printed = true;
+ }
+
+ goto transient;
+ }
+
+ dict_table_t* t;
+
+ /* Create a dummy table object with the same name and
+ indexes, suitable for fetching the stats into it. */
+ t = dict_stats_table_clone_create(table);
+
+ dberr_t err = dict_stats_fetch_from_ps(t);
+
+ t->stats_last_recalc = table->stats_last_recalc;
+ t->stat_modified_counter = 0;
+ dict_stats_empty_defrag_modified_counter(t);
+
+ switch (err) {
+ case DB_SUCCESS:
+
+ mutex_enter(&dict_sys.mutex);
+
+ /* Pass reset_ignored_indexes=true as parameter
+ to dict_stats_copy(). This will cause statistics
+ for corrupted indexes to be set to empty values */
+ dict_stats_copy(table, t, true);
+
+ dict_stats_assert_initialized(table);
+
+ mutex_exit(&dict_sys.mutex);
+
+ dict_stats_table_clone_free(t);
+
+ return(DB_SUCCESS);
+ case DB_STATS_DO_NOT_EXIST:
+
+ dict_stats_table_clone_free(t);
+
+ if (srv_read_only_mode) {
+ goto transient;
+ }
+
+ if (dict_stats_auto_recalc_is_enabled(table)) {
+ return(dict_stats_update(
+ table,
+ DICT_STATS_RECALC_PERSISTENT));
+ }
+
+ ib::info() << "Trying to use table " << table->name
+ << " which has persistent statistics enabled,"
+ " but auto recalculation turned off and the"
+ " statistics do not exist in "
+ TABLE_STATS_NAME_PRINT
+ " and " INDEX_STATS_NAME_PRINT
+ ". Please either run \"ANALYZE TABLE "
+ << table->name << ";\" manually or enable the"
+ " auto recalculation with \"ALTER TABLE "
+ << table->name << " STATS_AUTO_RECALC=1;\"."
+ " InnoDB will now use transient statistics for "
+ << table->name << ".";
+
+ goto transient;
+ default:
+
+ dict_stats_table_clone_free(t);
+
+ if (innodb_table_stats_not_found == false &&
+ table->stats_error_printed == false) {
+ ib::error() << "Error fetching persistent statistics"
+ " for table "
+ << table->name
+ << " from " TABLE_STATS_NAME_PRINT " and "
+ INDEX_STATS_NAME_PRINT ": " << err
+ << ". Using transient stats method instead.";
+ }
+
+ goto transient;
+ }
+ /* no "default:" in order to produce a compilation warning
+ about unhandled enumeration value */
+ }
+
+transient:
+ dict_stats_update_transient(table);
+
+ return(DB_SUCCESS);
+}
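+
+/* A hypothetical caller sketch (illustrative only; see ha_innodb.cc
+for the actual call sites): ANALYZE TABLE effectively maps to one of
+the recalc options depending on the table's STATS_PERSISTENT setting,
+e.g.
+
+	dberr_t	err = dict_stats_update(
+		table,
+		dict_stats_is_persistent_enabled(table)
+		? DICT_STATS_RECALC_PERSISTENT
+		: DICT_STATS_RECALC_TRANSIENT);
+*/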
+
+/** Remove the information for a particular index's stats from the persistent
+storage if it exists and if there is data stored for this index.
+This function creates its own trx and commits it.
+
+We must modify system tables in a separate transaction in order to
+adhere to the InnoDB design constraint that dict_sys.latch prevents
+lock waits on system tables. If we modified system and user tables in
+the same transaction, we should exclusively hold dict_sys.latch until
+the transaction is committed, and effectively block other transactions
+that will attempt to open any InnoDB tables. Because we have no
+guarantee that user transactions will be committed fast, we cannot
+afford to keep the system tables locked in a user transaction.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_drop_index(
+/*==================*/
+ const char* db_and_table,/*!< in: db and table, e.g. 'db/table' */
+ const char* iname, /*!< in: index name */
+ char* errstr, /*!< out: error message if != DB_SUCCESS
+ is returned */
+ ulint errstr_sz)/*!< in: size of the errstr buffer */
+{
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+ pars_info_t* pinfo;
+ dberr_t ret;
+
+ ut_ad(!mutex_own(&dict_sys.mutex));
+
+ /* skip indexes whose table names do not contain a database name
+ e.g. if we are dropping an index from SYS_TABLES */
+ if (strchr(db_and_table, '/') == NULL) {
+
+ return(DB_SUCCESS);
+ }
+
+ dict_fs2utf8(db_and_table, db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "database_name", db_utf8);
+
+ pars_info_add_str_literal(pinfo, "table_name", table_utf8);
+
+ pars_info_add_str_literal(pinfo, "index_name", iname);
+
+ dict_sys_lock();
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE DROP_INDEX_STATS () IS\n"
+ "BEGIN\n"
+ "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n"
+ "database_name = :database_name AND\n"
+ "table_name = :table_name AND\n"
+ "index_name = :index_name;\n"
+ "END;\n", NULL);
+
+ dict_sys_unlock();
+
+ if (ret == DB_STATS_DO_NOT_EXIST) {
+ ret = DB_SUCCESS;
+ }
+
+ if (ret != DB_SUCCESS) {
+ snprintf(errstr, errstr_sz,
+ "Unable to delete statistics for index %s"
+ " from %s%s: %s. They can be deleted later using"
+ " DELETE FROM %s WHERE"
+ " database_name = '%s' AND"
+ " table_name = '%s' AND"
+ " index_name = '%s';",
+ iname,
+ INDEX_STATS_NAME_PRINT,
+ (ret == DB_LOCK_WAIT_TIMEOUT
+ ? " because the rows are locked"
+ : ""),
+ ut_strerr(ret),
+ INDEX_STATS_NAME_PRINT,
+ db_utf8,
+ table_utf8,
+ iname);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: %s\n", errstr);
+ }
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Executes
+DELETE FROM mysql.innodb_table_stats
+WHERE database_name = '...' AND table_name = '...';
+Creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+dberr_t
+dict_stats_delete_from_table_stats(
+/*===============================*/
+ const char* database_name, /*!< in: database name, e.g. 'db' */
+ const char* table_name) /*!< in: table name, e.g. 'table' */
+{
+ pars_info_t* pinfo;
+ dberr_t ret;
+
+ ut_d(dict_sys.assert_locked());
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "database_name", database_name);
+ pars_info_add_str_literal(pinfo, "table_name", table_name);
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE DELETE_FROM_TABLE_STATS () IS\n"
+ "BEGIN\n"
+ "DELETE FROM \"" TABLE_STATS_NAME "\" WHERE\n"
+ "database_name = :database_name AND\n"
+ "table_name = :table_name;\n"
+ "END;\n", NULL);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Executes
+DELETE FROM mysql.innodb_index_stats
+WHERE database_name = '...' AND table_name = '...';
+Creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+dberr_t
+dict_stats_delete_from_index_stats(
+/*===============================*/
+ const char* database_name, /*!< in: database name, e.g. 'db' */
+ const char* table_name) /*!< in: table name, e.g. 'table' */
+{
+ pars_info_t* pinfo;
+ dberr_t ret;
+
+ ut_d(dict_sys.assert_locked());
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "database_name", database_name);
+ pars_info_add_str_literal(pinfo, "table_name", table_name);
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE DELETE_FROM_INDEX_STATS () IS\n"
+ "BEGIN\n"
+ "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n"
+ "database_name = :database_name AND\n"
+ "table_name = :table_name;\n"
+ "END;\n", NULL);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Removes the statistics for a table and all of its indexes from the
+persistent statistics storage if it exists and if there is data stored for
+the table. This function creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_drop_table(
+/*==================*/
+ const char* db_and_table, /*!< in: db and table, e.g. 'db/table' */
+ char* errstr, /*!< out: error message
+ if != DB_SUCCESS is returned */
+ ulint errstr_sz) /*!< in: size of errstr buffer */
+{
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+ dberr_t ret;
+
+ ut_d(dict_sys.assert_locked());
+
+ /* skip tables that do not contain a database name
+ e.g. if we are dropping SYS_TABLES */
+ if (strchr(db_and_table, '/') == NULL) {
+
+ return(DB_SUCCESS);
+ }
+
+ /* skip innodb_table_stats and innodb_index_stats themselves */
+ if (strcmp(db_and_table, TABLE_STATS_NAME) == 0
+ || strcmp(db_and_table, INDEX_STATS_NAME) == 0) {
+
+ return(DB_SUCCESS);
+ }
+
+ dict_fs2utf8(db_and_table, db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ ret = dict_stats_delete_from_table_stats(db_utf8, table_utf8);
+
+ if (ret == DB_SUCCESS) {
+ ret = dict_stats_delete_from_index_stats(db_utf8, table_utf8);
+ }
+
+ if (ret == DB_STATS_DO_NOT_EXIST) {
+ ret = DB_SUCCESS;
+ }
+
+ if (ret != DB_SUCCESS) {
+
+ snprintf(errstr, errstr_sz,
+ "Unable to delete statistics for table %s.%s: %s."
+ " They can be deleted later using"
+
+ " DELETE FROM %s WHERE"
+ " database_name = '%s' AND"
+ " table_name = '%s';"
+
+ " DELETE FROM %s WHERE"
+ " database_name = '%s' AND"
+ " table_name = '%s';",
+
+ db_utf8, table_utf8,
+ ut_strerr(ret),
+
+ INDEX_STATS_NAME_PRINT,
+ db_utf8, table_utf8,
+
+ TABLE_STATS_NAME_PRINT,
+ db_utf8, table_utf8);
+ }
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Executes
+UPDATE mysql.innodb_table_stats SET
+database_name = '...', table_name = '...'
+WHERE database_name = '...' AND table_name = '...';
+Creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+dberr_t
+dict_stats_rename_table_in_table_stats(
+/*===================================*/
+ const char* old_dbname_utf8,/*!< in: database name, e.g. 'olddb' */
+ const char* old_tablename_utf8,/*!< in: table name, e.g. 'oldtable' */
+ const char* new_dbname_utf8,/*!< in: database name, e.g. 'newdb' */
+ const char* new_tablename_utf8)/*!< in: table name, e.g. 'newtable' */
+{
+ pars_info_t* pinfo;
+ dberr_t ret;
+
+ ut_d(dict_sys.assert_locked());
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "old_dbname_utf8", old_dbname_utf8);
+ pars_info_add_str_literal(pinfo, "old_tablename_utf8", old_tablename_utf8);
+ pars_info_add_str_literal(pinfo, "new_dbname_utf8", new_dbname_utf8);
+ pars_info_add_str_literal(pinfo, "new_tablename_utf8", new_tablename_utf8);
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE RENAME_TABLE_IN_TABLE_STATS () IS\n"
+ "BEGIN\n"
+ "UPDATE \"" TABLE_STATS_NAME "\" SET\n"
+ "database_name = :new_dbname_utf8,\n"
+ "table_name = :new_tablename_utf8\n"
+ "WHERE\n"
+ "database_name = :old_dbname_utf8 AND\n"
+ "table_name = :old_tablename_utf8;\n"
+ "END;\n", NULL);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Executes
+UPDATE mysql.innodb_index_stats SET
+database_name = '...', table_name = '...'
+WHERE database_name = '...' AND table_name = '...';
+Creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+dberr_t
+dict_stats_rename_table_in_index_stats(
+/*===================================*/
+ const char* old_dbname_utf8,/*!< in: database name, e.g. 'olddb' */
+ const char* old_tablename_utf8,/*!< in: table name, e.g. 'oldtable' */
+ const char* new_dbname_utf8,/*!< in: database name, e.g. 'newdb' */
+ const char* new_tablename_utf8)/*!< in: table name, e.g. 'newtable' */
+{
+ pars_info_t* pinfo;
+ dberr_t ret;
+
+ ut_d(dict_sys.assert_locked());
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "old_dbname_utf8", old_dbname_utf8);
+ pars_info_add_str_literal(pinfo, "old_tablename_utf8", old_tablename_utf8);
+ pars_info_add_str_literal(pinfo, "new_dbname_utf8", new_dbname_utf8);
+ pars_info_add_str_literal(pinfo, "new_tablename_utf8", new_tablename_utf8);
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE RENAME_TABLE_IN_INDEX_STATS () IS\n"
+ "BEGIN\n"
+ "UPDATE \"" INDEX_STATS_NAME "\" SET\n"
+ "database_name = :new_dbname_utf8,\n"
+ "table_name = :new_tablename_utf8\n"
+ "WHERE\n"
+ "database_name = :old_dbname_utf8 AND\n"
+ "table_name = :old_tablename_utf8;\n"
+ "END;\n", NULL);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Renames a table in InnoDB persistent stats storage.
+This function creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_rename_table(
+/*====================*/
+ const char* old_name, /*!< in: old name, e.g. 'db/table' */
+ const char* new_name, /*!< in: new name, e.g. 'db/table' */
+ char* errstr, /*!< out: error string if != DB_SUCCESS
+ is returned */
+ size_t errstr_sz) /*!< in: errstr size */
+{
+ char old_db_utf8[MAX_DB_UTF8_LEN];
+ char new_db_utf8[MAX_DB_UTF8_LEN];
+ char old_table_utf8[MAX_TABLE_UTF8_LEN];
+ char new_table_utf8[MAX_TABLE_UTF8_LEN];
+ dberr_t ret;
+
+ /* skip innodb_table_stats and innodb_index_stats themselves */
+ if (strcmp(old_name, TABLE_STATS_NAME) == 0
+ || strcmp(old_name, INDEX_STATS_NAME) == 0
+ || strcmp(new_name, TABLE_STATS_NAME) == 0
+ || strcmp(new_name, INDEX_STATS_NAME) == 0) {
+
+ return(DB_SUCCESS);
+ }
+
+ dict_fs2utf8(old_name, old_db_utf8, sizeof(old_db_utf8),
+ old_table_utf8, sizeof(old_table_utf8));
+
+ dict_fs2utf8(new_name, new_db_utf8, sizeof(new_db_utf8),
+ new_table_utf8, sizeof(new_table_utf8));
+
+ dict_sys_lock();
+
+ ulint n_attempts = 0;
+ do {
+ n_attempts++;
+
+ ret = dict_stats_rename_table_in_table_stats(
+ old_db_utf8, old_table_utf8,
+ new_db_utf8, new_table_utf8);
+
+ if (ret == DB_DUPLICATE_KEY) {
+ dict_stats_delete_from_table_stats(
+ new_db_utf8, new_table_utf8);
+ }
+
+ if (ret == DB_STATS_DO_NOT_EXIST) {
+ ret = DB_SUCCESS;
+ }
+
+ if (ret != DB_SUCCESS) {
+ dict_sys_unlock();
+ os_thread_sleep(200000 /* 0.2 sec */);
+ dict_sys_lock();
+ }
+ } while ((ret == DB_DEADLOCK
+ || ret == DB_DUPLICATE_KEY
+ || ret == DB_LOCK_WAIT_TIMEOUT)
+ && n_attempts < 5);
+
+ if (ret != DB_SUCCESS) {
+ snprintf(errstr, errstr_sz,
+ "Unable to rename statistics from"
+ " %s.%s to %s.%s in %s: %s."
+ " They can be renamed later using"
+
+ " UPDATE %s SET"
+ " database_name = '%s',"
+ " table_name = '%s'"
+ " WHERE"
+ " database_name = '%s' AND"
+ " table_name = '%s';",
+
+ old_db_utf8, old_table_utf8,
+ new_db_utf8, new_table_utf8,
+ TABLE_STATS_NAME_PRINT,
+ ut_strerr(ret),
+
+ TABLE_STATS_NAME_PRINT,
+ new_db_utf8, new_table_utf8,
+ old_db_utf8, old_table_utf8);
+ dict_sys_unlock();
+ return(ret);
+ }
+ /* else */
+
+ n_attempts = 0;
+ do {
+ n_attempts++;
+
+ ret = dict_stats_rename_table_in_index_stats(
+ old_db_utf8, old_table_utf8,
+ new_db_utf8, new_table_utf8);
+
+ if (ret == DB_DUPLICATE_KEY) {
+ dict_stats_delete_from_index_stats(
+ new_db_utf8, new_table_utf8);
+ }
+
+ if (ret == DB_STATS_DO_NOT_EXIST) {
+ ret = DB_SUCCESS;
+ }
+
+ if (ret != DB_SUCCESS) {
+ dict_sys_unlock();
+ os_thread_sleep(200000 /* 0.2 sec */);
+ dict_sys_lock();
+ }
+ } while ((ret == DB_DEADLOCK
+ || ret == DB_DUPLICATE_KEY
+ || ret == DB_LOCK_WAIT_TIMEOUT)
+ && n_attempts < 5);
+
+ dict_sys_unlock();
+
+ if (ret != DB_SUCCESS) {
+ snprintf(errstr, errstr_sz,
+ "Unable to rename statistics from"
+ " %s.%s to %s.%s in %s: %s."
+ " They can be renamed later using"
+
+ " UPDATE %s SET"
+ " database_name = '%s',"
+ " table_name = '%s'"
+ " WHERE"
+ " database_name = '%s' AND"
+ " table_name = '%s';",
+
+ old_db_utf8, old_table_utf8,
+ new_db_utf8, new_table_utf8,
+ INDEX_STATS_NAME_PRINT,
+ ut_strerr(ret),
+
+ INDEX_STATS_NAME_PRINT,
+ new_db_utf8, new_table_utf8,
+ old_db_utf8, old_table_utf8);
+ }
+
+ return(ret);
+}
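+
+/* Note on the retry loops above: DB_DUPLICATE_KEY means rows for the
+new name already exist (e.g. leftovers from a previously dropped table
+with the same name), so those rows are deleted and the UPDATE is
+retried; each failed attempt sleeps 0.2 s with dict_sys unlocked, and
+we give up after 5 attempts. */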
+
+/*********************************************************************//**
+Renames an index in InnoDB persistent stats storage.
+This function creates its own transaction and commits it.
+@return DB_SUCCESS or error code. DB_STATS_DO_NOT_EXIST will be returned
+if the persistent stats do not exist. */
+dberr_t
+dict_stats_rename_index(
+/*====================*/
+ const dict_table_t* table, /*!< in: table whose index
+ is renamed */
+ const char* old_index_name, /*!< in: old index name */
+ const char* new_index_name) /*!< in: new index name */
+{
+ dict_sys_lock();
+
+ if (!dict_stats_persistent_storage_check(true)) {
+ dict_sys_unlock();
+ return(DB_STATS_DO_NOT_EXIST);
+ }
+
+ char dbname_utf8[MAX_DB_UTF8_LEN];
+ char tablename_utf8[MAX_TABLE_UTF8_LEN];
+
+ dict_fs2utf8(table->name.m_name, dbname_utf8, sizeof(dbname_utf8),
+ tablename_utf8, sizeof(tablename_utf8));
+
+ pars_info_t* pinfo;
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "dbname_utf8", dbname_utf8);
+ pars_info_add_str_literal(pinfo, "tablename_utf8", tablename_utf8);
+ pars_info_add_str_literal(pinfo, "new_index_name", new_index_name);
+ pars_info_add_str_literal(pinfo, "old_index_name", old_index_name);
+
+ dberr_t ret;
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE RENAME_INDEX_IN_INDEX_STATS () IS\n"
+ "BEGIN\n"
+ "UPDATE \"" INDEX_STATS_NAME "\" SET\n"
+ "index_name = :new_index_name\n"
+ "WHERE\n"
+ "database_name = :dbname_utf8 AND\n"
+ "table_name = :tablename_utf8 AND\n"
+ "index_name = :old_index_name;\n"
+ "END;\n", NULL);
+
+ dict_sys_unlock();
+
+ return(ret);
+}
+
+/* tests @{ */
+#ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS
+
+/* The following unit tests test some of the functions in this file
+individually, such testing cannot be performed by the mysql-test framework
+via SQL. */
+
+/* test_dict_table_schema_check() @{ */
+void
+test_dict_table_schema_check()
+{
+ /*
+ CREATE TABLE tcheck (
+ c01 VARCHAR(123),
+ c02 INT,
+ c03 INT NOT NULL,
+ c04 INT UNSIGNED,
+ c05 BIGINT,
+ c06 BIGINT UNSIGNED NOT NULL,
+ c07 TIMESTAMP
+ ) ENGINE=INNODB;
+ */
+ /* definition for the table 'test/tcheck' */
+ dict_col_meta_t columns[] = {
+ {"c01", DATA_VARCHAR, 0, 123},
+ {"c02", DATA_INT, 0, 4},
+ {"c03", DATA_INT, DATA_NOT_NULL, 4},
+ {"c04", DATA_INT, DATA_UNSIGNED, 4},
+ {"c05", DATA_INT, 0, 8},
+ {"c06", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ {"c07", DATA_INT, 0, 4},
+ {"c_extra", DATA_INT, 0, 4}
+ };
+ dict_table_schema_t schema = {
+ "test/tcheck",
+ 0 /* will be set individually for each test below */,
+ columns
+ };
+ char errstr[512];
+
+ snprintf(errstr, sizeof(errstr), "Table not found");
+
+ /* prevent any data dictionary modifications while we are checking
+ the tables' structure */
+
+ mutex_enter(&dict_sys.mutex);
+
+ /* check that a valid table is reported as valid */
+ schema.n_cols = 7;
+ if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+ == DB_SUCCESS) {
+ printf("OK: test.tcheck ok\n");
+ } else {
+ printf("ERROR: %s\n", errstr);
+ printf("ERROR: test.tcheck not present or corrupted\n");
+ goto test_dict_table_schema_check_end;
+ }
+
+ /* check columns with wrong length */
+ schema.columns[1].len = 8;
+ if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+ != DB_SUCCESS) {
+ printf("OK: test.tcheck.c02 has different length and is"
+ " reported as corrupted\n");
+ } else {
+ printf("OK: test.tcheck.c02 has different length but is"
+ " reported as ok\n");
+ goto test_dict_table_schema_check_end;
+ }
+ schema.columns[1].len = 4;
+
+ /* request that c02 be NOT NULL although it does not actually
+ have this flag set */
+ schema.columns[1].prtype_mask |= DATA_NOT_NULL;
+ if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+ != DB_SUCCESS) {
+ printf("OK: test.tcheck.c02 does not have NOT NULL while"
+ " it should and is reported as corrupted\n");
+ } else {
+ printf("ERROR: test.tcheck.c02 does not have NOT NULL while"
+ " it should and is not reported as corrupted\n");
+ goto test_dict_table_schema_check_end;
+ }
+ schema.columns[1].prtype_mask &= ~DATA_NOT_NULL;
+
+ /* check a table that contains some extra columns */
+ schema.n_cols = 6;
+ if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+ == DB_SUCCESS) {
+ printf("ERROR: test.tcheck has more columns but is not"
+ " reported as corrupted\n");
+ goto test_dict_table_schema_check_end;
+ } else {
+ printf("OK: test.tcheck has more columns and is"
+ " reported as corrupted\n");
+ }
+
+ /* check a table that has some columns missing */
+ schema.n_cols = 8;
+ if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+ != DB_SUCCESS) {
+ printf("OK: test.tcheck has missing columns and is"
+ " reported as corrupted\n");
+ } else {
+ printf("ERROR: test.tcheck has missing columns but is"
+ " reported as ok\n");
+ goto test_dict_table_schema_check_end;
+ }
+
+ /* check non-existent table */
+ schema.table_name = "test/tcheck_nonexistent";
+ if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+ != DB_SUCCESS) {
+ printf("OK: test.tcheck_nonexistent is not present\n");
+ } else {
+ printf("ERROR: test.tcheck_nonexistent is present!?\n");
+ goto test_dict_table_schema_check_end;
+ }
+
+test_dict_table_schema_check_end:
+
+ mutex_exit(&dict_sys.mutex);
+}
+/* @} */
+
+/* save/fetch aux macros @{ */
+#define TEST_DATABASE_NAME "foobardb"
+#define TEST_TABLE_NAME "test_dict_stats"
+
+#define TEST_N_ROWS 111
+#define TEST_CLUSTERED_INDEX_SIZE 222
+#define TEST_SUM_OF_OTHER_INDEX_SIZES 333
+
+#define TEST_IDX1_NAME "tidx1"
+#define TEST_IDX1_COL1_NAME "tidx1_col1"
+#define TEST_IDX1_INDEX_SIZE 123
+#define TEST_IDX1_N_LEAF_PAGES 234
+#define TEST_IDX1_N_DIFF1 50
+#define TEST_IDX1_N_DIFF1_SAMPLE_SIZE 500
+
+#define TEST_IDX2_NAME "tidx2"
+#define TEST_IDX2_COL1_NAME "tidx2_col1"
+#define TEST_IDX2_COL2_NAME "tidx2_col2"
+#define TEST_IDX2_COL3_NAME "tidx2_col3"
+#define TEST_IDX2_COL4_NAME "tidx2_col4"
+#define TEST_IDX2_INDEX_SIZE 321
+#define TEST_IDX2_N_LEAF_PAGES 432
+#define TEST_IDX2_N_DIFF1 60
+#define TEST_IDX2_N_DIFF1_SAMPLE_SIZE 600
+#define TEST_IDX2_N_DIFF2 61
+#define TEST_IDX2_N_DIFF2_SAMPLE_SIZE 610
+#define TEST_IDX2_N_DIFF3 62
+#define TEST_IDX2_N_DIFF3_SAMPLE_SIZE 620
+#define TEST_IDX2_N_DIFF4 63
+#define TEST_IDX2_N_DIFF4_SAMPLE_SIZE 630
+/* @} */
+
+/* test_dict_stats_save() @{ */
+void
+test_dict_stats_save()
+{
+ dict_table_t table;
+ dict_index_t index1;
+ dict_field_t index1_fields[1];
+ ib_uint64_t index1_stat_n_diff_key_vals[1];
+ ib_uint64_t index1_stat_n_sample_sizes[1];
+ dict_index_t index2;
+ dict_field_t index2_fields[4];
+ ib_uint64_t index2_stat_n_diff_key_vals[4];
+ ib_uint64_t index2_stat_n_sample_sizes[4];
+ dberr_t ret;
+
+ /* craft a dummy dict_table_t */
+ table.name.m_name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME);
+ table.stat_n_rows = TEST_N_ROWS;
+ table.stat_clustered_index_size = TEST_CLUSTERED_INDEX_SIZE;
+ table.stat_sum_of_other_index_sizes = TEST_SUM_OF_OTHER_INDEX_SIZES;
+ UT_LIST_INIT(table.indexes, &dict_index_t::indexes);
+#ifdef BTR_CUR_HASH_ADAPT
+ UT_LIST_INIT(table.freed_indexes, &dict_index_t::indexes);
+#endif /* BTR_CUR_HASH_ADAPT */
+ UT_LIST_ADD_LAST(table.indexes, &index1);
+ UT_LIST_ADD_LAST(table.indexes, &index2);
+ ut_d(table.magic_n = DICT_TABLE_MAGIC_N);
+ ut_d(index1.magic_n = DICT_INDEX_MAGIC_N);
+
+ index1.name = TEST_IDX1_NAME;
+ index1.table = &table;
+ index1.cached = 1;
+ index1.n_uniq = 1;
+ index1.fields = index1_fields;
+ index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals;
+ index1.stat_n_sample_sizes = index1_stat_n_sample_sizes;
+ index1.stat_index_size = TEST_IDX1_INDEX_SIZE;
+ index1.stat_n_leaf_pages = TEST_IDX1_N_LEAF_PAGES;
+ index1_fields[0].name = TEST_IDX1_COL1_NAME;
+ index1_stat_n_diff_key_vals[0] = TEST_IDX1_N_DIFF1;
+ index1_stat_n_sample_sizes[0] = TEST_IDX1_N_DIFF1_SAMPLE_SIZE;
+
+ ut_d(index2.magic_n = DICT_INDEX_MAGIC_N);
+ index2.name = TEST_IDX2_NAME;
+ index2.table = &table;
+ index2.cached = 1;
+ index2.n_uniq = 4;
+ index2.fields = index2_fields;
+ index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals;
+ index2.stat_n_sample_sizes = index2_stat_n_sample_sizes;
+ index2.stat_index_size = TEST_IDX2_INDEX_SIZE;
+ index2.stat_n_leaf_pages = TEST_IDX2_N_LEAF_PAGES;
+ index2_fields[0].name = TEST_IDX2_COL1_NAME;
+ index2_fields[1].name = TEST_IDX2_COL2_NAME;
+ index2_fields[2].name = TEST_IDX2_COL3_NAME;
+ index2_fields[3].name = TEST_IDX2_COL4_NAME;
+ index2_stat_n_diff_key_vals[0] = TEST_IDX2_N_DIFF1;
+ index2_stat_n_diff_key_vals[1] = TEST_IDX2_N_DIFF2;
+ index2_stat_n_diff_key_vals[2] = TEST_IDX2_N_DIFF3;
+ index2_stat_n_diff_key_vals[3] = TEST_IDX2_N_DIFF4;
+ index2_stat_n_sample_sizes[0] = TEST_IDX2_N_DIFF1_SAMPLE_SIZE;
+ index2_stat_n_sample_sizes[1] = TEST_IDX2_N_DIFF2_SAMPLE_SIZE;
+ index2_stat_n_sample_sizes[2] = TEST_IDX2_N_DIFF3_SAMPLE_SIZE;
+ index2_stat_n_sample_sizes[3] = TEST_IDX2_N_DIFF4_SAMPLE_SIZE;
+
+ ret = dict_stats_save(&table, NULL);
+
+ ut_a(ret == DB_SUCCESS);
+
+ printf("\nOK: stats saved successfully, now go ahead and read"
+ " what's inside %s and %s:\n\n",
+ TABLE_STATS_NAME_PRINT,
+ INDEX_STATS_NAME_PRINT);
+
+ printf("SELECT COUNT(*) = 1 AS table_stats_saved_successfully\n"
+ "FROM %s\n"
+ "WHERE\n"
+ "database_name = '%s' AND\n"
+ "table_name = '%s' AND\n"
+ "n_rows = %d AND\n"
+ "clustered_index_size = %d AND\n"
+ "sum_of_other_index_sizes = %d;\n"
+ "\n",
+ TABLE_STATS_NAME_PRINT,
+ TEST_DATABASE_NAME,
+ TEST_TABLE_NAME,
+ TEST_N_ROWS,
+ TEST_CLUSTERED_INDEX_SIZE,
+ TEST_SUM_OF_OTHER_INDEX_SIZES);
+
+ printf("SELECT COUNT(*) = 3 AS tidx1_stats_saved_successfully\n"
+ "FROM %s\n"
+ "WHERE\n"
+ "database_name = '%s' AND\n"
+ "table_name = '%s' AND\n"
+ "index_name = '%s' AND\n"
+ "(\n"
+ " (stat_name = 'size' AND stat_value = %d AND"
+ " sample_size IS NULL) OR\n"
+ " (stat_name = 'n_leaf_pages' AND stat_value = %d AND"
+ " sample_size IS NULL) OR\n"
+ " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND"
+ " sample_size = '%d' AND stat_description = '%s')\n"
+ ");\n"
+ "\n",
+ INDEX_STATS_NAME_PRINT,
+ TEST_DATABASE_NAME,
+ TEST_TABLE_NAME,
+ TEST_IDX1_NAME,
+ TEST_IDX1_INDEX_SIZE,
+ TEST_IDX1_N_LEAF_PAGES,
+ TEST_IDX1_N_DIFF1,
+ TEST_IDX1_N_DIFF1_SAMPLE_SIZE,
+ TEST_IDX1_COL1_NAME);
+
+ printf("SELECT COUNT(*) = 6 AS tidx2_stats_saved_successfully\n"
+ "FROM %s\n"
+ "WHERE\n"
+ "database_name = '%s' AND\n"
+ "table_name = '%s' AND\n"
+ "index_name = '%s' AND\n"
+ "(\n"
+ " (stat_name = 'size' AND stat_value = %d AND"
+ " sample_size IS NULL) OR\n"
+ " (stat_name = 'n_leaf_pages' AND stat_value = %d AND"
+ " sample_size IS NULL) OR\n"
+ " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND"
+ " sample_size = '%d' AND stat_description = '%s') OR\n"
+ " (stat_name = 'n_diff_pfx02' AND stat_value = %d AND"
+ " sample_size = '%d' AND stat_description = '%s,%s') OR\n"
+ " (stat_name = 'n_diff_pfx03' AND stat_value = %d AND"
+ " sample_size = '%d' AND stat_description = '%s,%s,%s') OR\n"
+ " (stat_name = 'n_diff_pfx04' AND stat_value = %d AND"
+ " sample_size = '%d' AND stat_description = '%s,%s,%s,%s')\n"
+ ");\n"
+ "\n",
+ INDEX_STATS_NAME_PRINT,
+ TEST_DATABASE_NAME,
+ TEST_TABLE_NAME,
+ TEST_IDX2_NAME,
+ TEST_IDX2_INDEX_SIZE,
+ TEST_IDX2_N_LEAF_PAGES,
+ TEST_IDX2_N_DIFF1,
+ TEST_IDX2_N_DIFF1_SAMPLE_SIZE, TEST_IDX2_COL1_NAME,
+ TEST_IDX2_N_DIFF2,
+ TEST_IDX2_N_DIFF2_SAMPLE_SIZE,
+ TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME,
+ TEST_IDX2_N_DIFF3,
+ TEST_IDX2_N_DIFF3_SAMPLE_SIZE,
+ TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME,
+ TEST_IDX2_N_DIFF4,
+ TEST_IDX2_N_DIFF4_SAMPLE_SIZE,
+ TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME,
+ TEST_IDX2_COL4_NAME);
+}
+/* @} */
+
+/* test_dict_stats_fetch_from_ps() @{ */
+void
+test_dict_stats_fetch_from_ps()
+{
+ dict_table_t table;
+ dict_index_t index1;
+ ib_uint64_t index1_stat_n_diff_key_vals[1];
+ ib_uint64_t index1_stat_n_sample_sizes[1];
+ dict_index_t index2;
+ ib_uint64_t index2_stat_n_diff_key_vals[4];
+ ib_uint64_t index2_stat_n_sample_sizes[4];
+ dberr_t ret;
+
+ /* craft a dummy dict_table_t */
+ table.name.m_name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME);
+ UT_LIST_INIT(table.indexes, &dict_index_t::indexes);
+#ifdef BTR_CUR_HASH_ADAPT
+ UT_LIST_INIT(table.freed_indexes, &dict_index_t::indexes);
+#endif /* BTR_CUR_HASH_ADAPT */
+ UT_LIST_ADD_LAST(table.indexes, &index1);
+ UT_LIST_ADD_LAST(table.indexes, &index2);
+ ut_d(table.magic_n = DICT_TABLE_MAGIC_N);
+
+ index1.name = TEST_IDX1_NAME;
+ ut_d(index1.magic_n = DICT_INDEX_MAGIC_N);
+ index1.cached = 1;
+ index1.n_uniq = 1;
+ index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals;
+ index1.stat_n_sample_sizes = index1_stat_n_sample_sizes;
+
+ index2.name = TEST_IDX2_NAME;
+ ut_d(index2.magic_n = DICT_INDEX_MAGIC_N);
+ index2.cached = 1;
+ index2.n_uniq = 4;
+ index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals;
+ index2.stat_n_sample_sizes = index2_stat_n_sample_sizes;
+
+ ret = dict_stats_fetch_from_ps(&table);
+
+ ut_a(ret == DB_SUCCESS);
+
+ ut_a(table.stat_n_rows == TEST_N_ROWS);
+ ut_a(table.stat_clustered_index_size == TEST_CLUSTERED_INDEX_SIZE);
+ ut_a(table.stat_sum_of_other_index_sizes
+ == TEST_SUM_OF_OTHER_INDEX_SIZES);
+
+ ut_a(index1.stat_index_size == TEST_IDX1_INDEX_SIZE);
+ ut_a(index1.stat_n_leaf_pages == TEST_IDX1_N_LEAF_PAGES);
+ ut_a(index1_stat_n_diff_key_vals[0] == TEST_IDX1_N_DIFF1);
+ ut_a(index1_stat_n_sample_sizes[0] == TEST_IDX1_N_DIFF1_SAMPLE_SIZE);
+
+ ut_a(index2.stat_index_size == TEST_IDX2_INDEX_SIZE);
+ ut_a(index2.stat_n_leaf_pages == TEST_IDX2_N_LEAF_PAGES);
+ ut_a(index2_stat_n_diff_key_vals[0] == TEST_IDX2_N_DIFF1);
+ ut_a(index2_stat_n_sample_sizes[0] == TEST_IDX2_N_DIFF1_SAMPLE_SIZE);
+ ut_a(index2_stat_n_diff_key_vals[1] == TEST_IDX2_N_DIFF2);
+ ut_a(index2_stat_n_sample_sizes[1] == TEST_IDX2_N_DIFF2_SAMPLE_SIZE);
+ ut_a(index2_stat_n_diff_key_vals[2] == TEST_IDX2_N_DIFF3);
+ ut_a(index2_stat_n_sample_sizes[2] == TEST_IDX2_N_DIFF3_SAMPLE_SIZE);
+ ut_a(index2_stat_n_diff_key_vals[3] == TEST_IDX2_N_DIFF4);
+ ut_a(index2_stat_n_sample_sizes[3] == TEST_IDX2_N_DIFF4_SAMPLE_SIZE);
+
+ printf("OK: fetch successful\n");
+}
+/* @} */
+
+/* test_dict_stats_all() @{ */
+void
+test_dict_stats_all()
+{
+ test_dict_table_schema_check();
+
+ test_dict_stats_save();
+
+ test_dict_stats_fetch_from_ps();
+}
+/* @} */
+
+#endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */
+/* @} */
diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc
new file mode 100644
index 00000000..afeb8ef6
--- /dev/null
+++ b/storage/innobase/dict/dict0stats_bg.cc
@@ -0,0 +1,479 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0stats_bg.cc
+Code used for background table and index stats gathering.
+
+Created Apr 25, 2012 Vasil Dimov
+*******************************************************/
+
+#include "dict0dict.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "dict0defrag_bg.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "fil0fil.h"
+#ifdef WITH_WSREP
+# include "trx0trx.h"
+# include "mysql/service_wsrep.h"
+# include "wsrep.h"
+# include "log.h"
+# include "wsrep_mysqld.h"
+#endif
+
+#include <vector>
+
+/** Minimum time interval between stats recalc for a given table */
+#define MIN_RECALC_INTERVAL 10 /* seconds */
+static void dict_stats_schedule(int ms);
+
+#ifdef UNIV_DEBUG
+/** Used by SET GLOBAL innodb_dict_stats_disabled_debug = 1; */
+my_bool innodb_dict_stats_disabled_debug;
+#endif /* UNIV_DEBUG */
+
+/** This mutex protects the "recalc_pool" variable. */
+static ib_mutex_t recalc_pool_mutex;
+
+/** Allocator type, used by std::vector */
+typedef ut_allocator<table_id_t>
+ recalc_pool_allocator_t;
+
+/** The set of tables whose stats are to be automatically
+recalculated, stored as an STL vector of table ids */
+typedef std::vector<table_id_t, recalc_pool_allocator_t>
+ recalc_pool_t;
+
+/** Iterator type for iterating over the elements of objects of type
+recalc_pool_t. */
+typedef recalc_pool_t::iterator
+ recalc_pool_iterator_t;
+
+/** Pool where we store information on which tables are to be processed
+by background statistics gathering. */
+static recalc_pool_t recalc_pool;
+/** Whether the global data structures have been initialized */
+static bool stats_initialised;
+
+/*****************************************************************//**
+Free the resources occupied by the recalc pool, called once during
+thread de-initialization. */
+static void dict_stats_recalc_pool_deinit()
+{
+ ut_ad(!srv_read_only_mode);
+
+ recalc_pool.clear();
+ defrag_pool.clear();
+ /*
+ recalc_pool may still have its buffer allocated; it would be freed
+ when its destructor is called. The problem is that the memory leak
+ detector runs before the destructors of these static pools are
+ invoked, and would report their buffers as leaked memory. To avoid
+ that, we force each pool to surrender its buffer to a local empty
+ pool object, which frees it when leaving this function:
+ */
+ recalc_pool_t recalc_empty_pool;
+ defrag_pool_t defrag_empty_pool;
+ recalc_pool.swap(recalc_empty_pool);
+ defrag_pool.swap(defrag_empty_pool);
+}
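+
+/* The swap above is the classic C++ idiom for forcing a std::vector
+to release its heap buffer immediately, rather than at destructor
+time; a minimal self-contained sketch of the same technique:
+
+	std::vector<int> v;
+	v.reserve(1000);		// capacity() >= 1000
+	std::vector<int>().swap(v);	// buffer freed here, capacity() == 0
+*/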
+
+/*****************************************************************//**
+Add a table to the recalc pool, which is processed by the
+background stats gathering thread. Only the table id is added to the
+list, so the table can be closed after being enqueued and it will be
+opened when needed. If the table does not exist later (has been DROPped),
+then it will be removed from the pool and skipped. */
+static
+void
+dict_stats_recalc_pool_add(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table to add */
+ bool schedule_dict_stats_task = true /*!< in: schedule dict stats task */
+)
+{
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&recalc_pool_mutex);
+
+ /* quit if already in the list */
+ for (recalc_pool_iterator_t iter = recalc_pool.begin();
+ iter != recalc_pool.end();
+ ++iter) {
+
+ if (*iter == table->id) {
+ mutex_exit(&recalc_pool_mutex);
+ return;
+ }
+ }
+
+ recalc_pool.push_back(table->id);
+ if (recalc_pool.size() == 1 && schedule_dict_stats_task) {
+ dict_stats_schedule_now();
+ }
+ mutex_exit(&recalc_pool_mutex);
+
+}
+
+#ifdef WITH_WSREP
+/** Update the table modification counter and if necessary,
+schedule new estimates for table and index statistics to be calculated.
+@param[in,out] table persistent or temporary table
+@param[in] trx current transaction */
+void dict_stats_update_if_needed(dict_table_t *table, const trx_t &trx)
+#else
+/** Update the table modification counter and if necessary,
+schedule new estimates for table and index statistics to be calculated.
+@param[in,out] table persistent or temporary table */
+void dict_stats_update_if_needed_func(dict_table_t *table)
+#endif
+{
+ ut_ad(!mutex_own(&dict_sys.mutex));
+
+ if (UNIV_UNLIKELY(!table->stat_initialized)) {
+ /* The table may have been evicted from dict_sys
+ and reloaded internally by InnoDB for FOREIGN KEY
+ processing, but not reloaded by the SQL layer.
+
+ We can (re)compute the transient statistics when the
+ table is actually loaded by the SQL layer.
+
+ Note: If InnoDB persistent statistics are enabled,
+ we will skip the updates. We must do this, because
+ dict_table_get_n_rows() below assumes that the
+ statistics have been initialized. The DBA may have
+ to execute ANALYZE TABLE. */
+ return;
+ }
+
+ ulonglong counter = table->stat_modified_counter++;
+ ulonglong n_rows = dict_table_get_n_rows(table);
+
+ if (dict_stats_is_persistent_enabled(table)) {
+ if (counter > n_rows / 10 /* 10% */
+ && dict_stats_auto_recalc_is_enabled(table)) {
+
+#ifdef WITH_WSREP
+ /* Do not add the table to the background
+ statistics calculation pool if this thread
+ is a BF (high-priority) thread that is not
+ an applier: all replicated DDL (i.e. DDL
+ that is binlogged on the master node) is
+ executed with high priority (a.k.a. BF) on
+ slave nodes. Applier threads still add the
+ table, which could again lead to BF lock
+ waits on the applier node, but that is
+ better than having no persistent
+ index/table statistics on applier nodes.
+ TODO: allow BF threads to wait for these
+ InnoDB internal SQL-parser generated row
+ locks, and enqueue BF thread lock waits at
+ the head of the waiting queue. */
+ if (trx.is_wsrep()
+ && !wsrep_thd_is_applying(trx.mysql_thd)
+ && wsrep_thd_is_BF(trx.mysql_thd, 0)) {
+ WSREP_DEBUG("Avoiding background statistics"
+ " calculation for table %s.",
+ table->name.m_name);
+ return;
+ }
+#endif /* WITH_WSREP */
+
+ dict_stats_recalc_pool_add(table);
+ table->stat_modified_counter = 0;
+ }
+ return;
+ }
+
+ /* Calculate new statistics if 1 / 16 of table has been modified
+ since the last time a statistics batch was run.
+ We calculate statistics at most every 16th round, since we may have
+ a counter table which is very small and updated very often. */
+ ulonglong threshold = 16 + n_rows / 16; /* 6.25% */
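+
+ /* Worked example (illustrative): for n_rows = 1600 the
+ threshold is 16 + 1600 / 16 = 116 modifications; for a tiny
+ table the constant 16 dominates, so statistics are not
+ recalculated on every single row change. */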
+
+ if (srv_stats_modified_counter) {
+ threshold = std::min(srv_stats_modified_counter, threshold);
+ }
+
+ if (counter > threshold) {
+ /* this will reset table->stat_modified_counter to 0 */
+ dict_stats_update(table, DICT_STATS_RECALC_TRANSIENT);
+ }
+}
+
+/*****************************************************************//**
+Get a table from the auto recalc pool. The returned table id is removed
+from the pool.
+@return true if the pool was non-empty and "id" was set, false otherwise */
+static
+bool
+dict_stats_recalc_pool_get(
+/*=======================*/
+ table_id_t* id) /*!< out: table id, or unmodified if list is
+ empty */
+{
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&recalc_pool_mutex);
+
+ if (recalc_pool.empty()) {
+ mutex_exit(&recalc_pool_mutex);
+ return(false);
+ }
+
+ *id = recalc_pool.at(0);
+
+ recalc_pool.erase(recalc_pool.begin());
+
+ mutex_exit(&recalc_pool_mutex);
+
+ return(true);
+}
+
+/*****************************************************************//**
+Delete a given table from the auto recalc pool. */
+void
+dict_stats_recalc_pool_del(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table to remove */
+{
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ mutex_enter(&recalc_pool_mutex);
+
+ ut_ad(table->id > 0);
+
+ for (recalc_pool_iterator_t iter = recalc_pool.begin();
+ iter != recalc_pool.end();
+ ++iter) {
+
+ if (*iter == table->id) {
+ /* erase() invalidates the iterator */
+ recalc_pool.erase(iter);
+ break;
+ }
+ }
+
+ mutex_exit(&recalc_pool_mutex);
+}
+
+/*****************************************************************//**
+Wait until background stats thread has stopped using the specified table.
+The caller must have locked the data dictionary using
+row_mysql_lock_data_dictionary() and this function may unlock it temporarily
+and restore the lock before it exits.
+The background stats thread is guaranteed not to start using the specified
+table after this function returns and before the caller unlocks the data
+dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag
+under dict_sys.mutex. */
+void
+dict_stats_wait_bg_to_stop_using_table(
+/*===================================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx) /*!< in/out: transaction to use for
+ unlocking/locking the data dict */
+{
+ while (!dict_stats_stop_bg(table)) {
+ DICT_BG_YIELD(trx);
+ }
+}
+
+/*****************************************************************//**
+Initialize global variables needed for the operation of dict_stats_thread()
+Must be called before dict_stats_thread() is started. */
+void dict_stats_init()
+{
+ ut_ad(!srv_read_only_mode);
+
+ /* The recalc_pool_mutex is acquired from:
+ 1) the background stats gathering thread before any other latch
+ and released without latching anything else in between (thus
+ any level would do here)
+ 2) from dict_stats_update_if_needed()
+ and released without latching anything else in between. We know
+ that dict_sys.mutex (SYNC_DICT) is not acquired when
+ dict_stats_update_if_needed() is called and it may be acquired
+ inside that function (thus a level <=SYNC_DICT would do).
+ 3) from row_drop_table_for_mysql() after dict_sys.mutex (SYNC_DICT)
+ and dict_sys.latch (SYNC_DICT_OPERATION) have been locked
+ (thus a level <SYNC_DICT && <SYNC_DICT_OPERATION would do)
+ So we choose SYNC_STATS_AUTO_RECALC to be just below SYNC_DICT. */
+
+ mutex_create(LATCH_ID_RECALC_POOL, &recalc_pool_mutex);
+
+ dict_defrag_pool_init();
+ stats_initialised = true;
+}
+
+/*****************************************************************//**
+Free resources allocated by dict_stats_init(), must be called
+after dict_stats task has exited. */
+void dict_stats_deinit()
+{
+ if (!stats_initialised) {
+ return;
+ }
+
+ ut_ad(!srv_read_only_mode);
+ stats_initialised = false;
+
+ dict_stats_recalc_pool_deinit();
+ dict_defrag_pool_deinit();
+
+ mutex_free(&recalc_pool_mutex);
+}
+
+/**
+Get the first table that has been added for auto recalc and, unless its
+stats were recalculated too recently, update its stats.
+@return whether an entry was processed and the next one may be tried
+immediately */
+static bool dict_stats_process_entry_from_recalc_pool()
+{
+ table_id_t table_id;
+
+ ut_ad(!srv_read_only_mode);
+
+next_table_id:
+ /* pop the first table from the auto recalc pool */
+ if (!dict_stats_recalc_pool_get(&table_id)) {
+ /* no tables for auto recalc */
+ return false;
+ }
+
+ dict_table_t* table;
+
+ mutex_enter(&dict_sys.mutex);
+
+ table = dict_table_open_on_id(table_id, TRUE, DICT_TABLE_OP_NORMAL);
+
+ if (table == NULL) {
+ /* table does not exist, must have been DROPped
+ after its id was enqueued */
+ mutex_exit(&dict_sys.mutex);
+ goto next_table_id;
+ }
+
+ ut_ad(!table->is_temporary());
+
+ if (!table->is_accessible()) {
+ dict_table_close(table, TRUE, FALSE);
+ mutex_exit(&dict_sys.mutex);
+ goto next_table_id;
+ }
+
+ table->stats_bg_flag |= BG_STAT_IN_PROGRESS;
+
+ mutex_exit(&dict_sys.mutex);
+
+ /* time() could be expensive; the current function is called
+ every time a table has been changed by more than 10%, and on a
+ system with lots of small tables this could become hot. If we
+ find out that this is a problem, then the check below could
+ eventually be replaced with something else, though a time
+ interval is the natural approach. */
+ bool ret;
+ if (difftime(time(NULL), table->stats_last_recalc)
+ < MIN_RECALC_INTERVAL) {
+
+ /* Stats were (re)calculated not long ago. To avoid
+ too frequent stats updates we put back the table on
+ the auto recalc list and do nothing. */
+
+ dict_stats_recalc_pool_add(table, false);
+ dict_stats_schedule(MIN_RECALC_INTERVAL*1000);
+ ret = false;
+ } else {
+
+ dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT);
+ ret = true;
+ }
+
+ mutex_enter(&dict_sys.mutex);
+
+ table->stats_bg_flag = BG_STAT_NONE;
+
+ dict_table_close(table, TRUE, FALSE);
+
+ mutex_exit(&dict_sys.mutex);
+ return ret;
+}
+
+#ifdef UNIV_DEBUG
+/** Enables or disables the dict stats background task. It's used by:
+ SET GLOBAL innodb_dict_stats_disabled_debug = 1 (0).
+@param[in] save immediate result from check function */
+void dict_stats_disabled_debug_update(THD*, st_mysql_sys_var*, void*,
+ const void* save)
+{
+ const bool disable = *static_cast<const my_bool*>(save);
+ if (disable)
+ dict_stats_shutdown();
+ else
+ dict_stats_start();
+}
+#endif /* UNIV_DEBUG */
+
+static tpool::timer* dict_stats_timer;
+static std::mutex dict_stats_mutex;
+
+static void dict_stats_func(void*)
+{
+ while (dict_stats_process_entry_from_recalc_pool()) {}
+ dict_defrag_process_entries_from_defrag_pool();
+}
+
+
+void dict_stats_start()
+{
+ std::lock_guard<std::mutex> lk(dict_stats_mutex);
+ if (!dict_stats_timer)
+ dict_stats_timer= srv_thread_pool->create_timer(dict_stats_func);
+}
+
+
+static void dict_stats_schedule(int ms)
+{
+ std::unique_lock<std::mutex> lk(dict_stats_mutex, std::defer_lock);
+ /*
+ Use try_lock() to avoid deadlock in dict_stats_shutdown(), which
+ uses dict_stats_mutex too. If there are simultaneous timer
+ reschedules, the first one wins, which is fine.
+ */
+ if (!lk.try_lock())
+ {
+ return;
+ }
+ if (dict_stats_timer)
+ dict_stats_timer->set_time(ms,0);
+}
+
+void dict_stats_schedule_now()
+{
+ dict_stats_schedule(0);
+}
+
+/** Shut down the dict_stats_thread. */
+void dict_stats_shutdown()
+{
+ std::lock_guard<std::mutex> lk(dict_stats_mutex);
+ delete dict_stats_timer;
+ dict_stats_timer= 0;
+}
diff --git a/storage/innobase/eval/eval0eval.cc b/storage/innobase/eval/eval0eval.cc
new file mode 100644
index 00000000..193a5814
--- /dev/null
+++ b/storage/innobase/eval/eval0eval.cc
@@ -0,0 +1,632 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file eval/eval0eval.cc
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "eval0eval.h"
+#include "data0data.h"
+#include "row0sel.h"
+#include "rem0cmp.h"
+
+/** Dummy address used when we should allocate a buffer of size 0 in
+eval_node_alloc_val_buf */
+
+static byte eval_dummy;
+
+/*************************************************************************
+Gets the like node from the node */
+UNIV_INLINE
+que_node_t*
+que_node_get_like_node(
+/*===================*/
+ /* out: the like node of the node */
+ que_node_t* node) /* in: node in a list */
+{
+ return(((sym_node_t*) node)->like_node);
+}
+
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has an allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return pointer to allocated buffer */
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+ que_node_t* node, /*!< in: query graph node; sets the val field
+ data field to point to the new buffer, and
+ len field equal to size */
+ ulint size) /*!< in: buffer size */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+ || que_node_get_type(node) == QUE_NODE_FUNC);
+
+ dfield = que_node_get_val(node);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (data != &eval_dummy) {
+ ut_free(data);
+ }
+
+ if (size == 0) {
+ data = &eval_dummy;
+ } else {
+ data = static_cast<byte*>(ut_malloc_nokey(size));
+ }
+
+ que_node_set_val_buf_size(node, size);
+
+ dfield_set_data(dfield, data, size);
+
+ return(data);
+}
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated in the above function. The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+void
+eval_node_free_val_buf(
+/*===================*/
+ que_node_t* node) /*!< in: query graph node */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+ || que_node_get_type(node) == QUE_NODE_FUNC);
+
+ dfield = que_node_get_val(node);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (que_node_get_val_buf_size(node) > 0) {
+ ut_a(data);
+
+ ut_free(data);
+ }
+}
+
+/*********************************************************************
+Evaluates a LIKE comparison node.
+@return the result of the comparison */
+UNIV_INLINE
+ibool
+eval_cmp_like(
+/*==========*/
+ que_node_t* arg1, /*!< in: left operand */
+ que_node_t* arg2) /*!< in: right operand */
+{
+ ib_like_t op;
+ que_node_t* arg3;
+ que_node_t* arg4;
+ const dfield_t* dfield;
+
+ arg3 = que_node_get_like_node(arg2);
+
+ /* Get the comparison type operator */
+ ut_a(arg3);
+
+ dfield = que_node_get_val(arg3);
+ ut_ad(dtype_get_mtype(dfield_get_type(dfield)) == DATA_INT);
+ op = static_cast<ib_like_t>(
+ mach_read_from_4(static_cast<const byte*>(
+ dfield_get_data(dfield))));
+
+ switch (op) {
+ case IB_LIKE_PREFIX:
+ arg4 = que_node_get_next(arg3);
+ return(!cmp_dfield_dfield_like_prefix(que_node_get_val(arg1),
+ que_node_get_val(arg4)));
+ case IB_LIKE_EXACT:
+ return(!cmp_dfield_dfield(que_node_get_val(arg1),
+ que_node_get_val(arg2)));
+ }
+
+ ut_error;
+ return(FALSE);
+}
+
+/*********************************************************************
+Evaluates a comparison node.
+@return the result of the comparison */
+ibool
+eval_cmp(
+/*=====*/
+ func_node_t* cmp_node) /*!< in: comparison node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ int res;
+ ibool val = FALSE; /* remove warning */
+
+ ut_ad(que_node_get_type(cmp_node) == QUE_NODE_FUNC);
+
+ arg1 = cmp_node->args;
+ arg2 = que_node_get_next(arg1);
+
+ switch (cmp_node->func) {
+ case '<':
+ case '=':
+ case '>':
+ case PARS_LE_TOKEN:
+ case PARS_NE_TOKEN:
+ case PARS_GE_TOKEN:
+ res = cmp_dfield_dfield(
+ que_node_get_val(arg1), que_node_get_val(arg2));
+
+ switch (cmp_node->func) {
+ case '<':
+ val = (res < 0);
+ break;
+ case '=':
+ val = (res == 0);
+ break;
+ case '>':
+ val = (res > 0);
+ break;
+ case PARS_LE_TOKEN:
+ val = (res <= 0);
+ break;
+ case PARS_NE_TOKEN:
+ val = (res != 0);
+ break;
+ case PARS_GE_TOKEN:
+ val = (res >= 0);
+ break;
+ }
+ break;
+ default:
+ val = eval_cmp_like(arg1, arg2);
+ break;
+ }
+
+ eval_node_set_ibool_val(cmp_node, val);
+
+ return(val);
+}
+
+/*****************************************************************//**
+Evaluates a logical operation node. */
+UNIV_INLINE
+void
+eval_logical(
+/*=========*/
+ func_node_t* logical_node) /*!< in: logical operation node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ ibool val1;
+ ibool val2 = 0; /* remove warning */
+ ibool val = 0; /* remove warning */
+ int func;
+
+ ut_ad(que_node_get_type(logical_node) == QUE_NODE_FUNC);
+
+ arg1 = logical_node->args;
+ arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is 'NOT' */
+
+ val1 = eval_node_get_ibool_val(arg1);
+
+ if (arg2) {
+ val2 = eval_node_get_ibool_val(arg2);
+ }
+
+ func = logical_node->func;
+
+ if (func == PARS_AND_TOKEN) {
+ val = val1 & val2;
+ } else if (func == PARS_OR_TOKEN) {
+ val = val1 | val2;
+ } else if (func == PARS_NOT_TOKEN) {
+ val = TRUE - val1; /* logical NOT: val1 is 0 or 1 */
+ } else {
+ ut_error;
+ }
+
+ eval_node_set_ibool_val(logical_node, val);
+}
+
+/*****************************************************************//**
+Evaluates an arithmetic operation node. */
+UNIV_INLINE
+void
+eval_arith(
+/*=======*/
+ func_node_t* arith_node) /*!< in: arithmetic operation node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ lint val1;
+ lint val2 = 0; /* remove warning */
+ lint val;
+ int func;
+
+ ut_ad(que_node_get_type(arith_node) == QUE_NODE_FUNC);
+
+ arg1 = arith_node->args;
+ arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is unary '-' */
+
+ val1 = eval_node_get_int_val(arg1);
+
+ if (arg2) {
+ val2 = eval_node_get_int_val(arg2);
+ }
+
+ func = arith_node->func;
+
+ if (func == '+') {
+ val = val1 + val2;
+ } else if ((func == '-') && arg2) {
+ val = val1 - val2;
+ } else if (func == '-') {
+ val = -val1;
+ } else if (func == '*') {
+ val = val1 * val2;
+ } else {
+ ut_ad(func == '/');
+ val = val1 / val2;
+ }
+
+ eval_node_set_int_val(arith_node, val);
+}
+
+/*****************************************************************//**
+Evaluates an aggregate operation node. */
+UNIV_INLINE
+void
+eval_aggregate(
+/*===========*/
+ func_node_t* node) /*!< in: aggregate operation node */
+{
+ lint val;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+ val = eval_node_get_int_val(node);
+
+ ut_a(node->func == PARS_COUNT_TOKEN);
+ val = val + 1;
+ eval_node_set_int_val(node, val);
+}
+
+/*****************************************************************//**
+Evaluates a notfound-function node. */
+UNIV_INLINE
+void
+eval_notfound(
+/*==========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ sym_node_t* cursor;
+ sel_node_t* sel_node;
+ ibool ibool_val;
+
+ ut_ad(func_node->func == PARS_NOTFOUND_TOKEN);
+
+ cursor = static_cast<sym_node_t*>(func_node->args);
+
+ ut_ad(que_node_get_type(cursor) == QUE_NODE_SYMBOL);
+
+ if (cursor->token_type == SYM_LIT) {
+ ut_ad(!memcmp(dfield_get_data(que_node_get_val(cursor)),
+ "SQL", 3));
+ sel_node = cursor->sym_table->query_graph->last_sel_node;
+ } else {
+ sel_node = cursor->alias->cursor_def;
+ }
+
+ if (sel_node->state == SEL_NODE_NO_MORE_ROWS) {
+ ibool_val = TRUE;
+ } else {
+ ibool_val = FALSE;
+ }
+
+ eval_node_set_ibool_val(func_node, ibool_val);
+}
+
+/*****************************************************************//**
+Evaluates a substr-function node. */
+UNIV_INLINE
+void
+eval_substr(
+/*========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ que_node_t* arg3;
+ dfield_t* dfield;
+ byte* str1;
+ ulint len1;
+ ulint len2;
+
+ arg1 = func_node->args;
+ arg2 = que_node_get_next(arg1);
+
+ ut_ad(func_node->func == PARS_SUBSTR_TOKEN);
+
+ arg3 = que_node_get_next(arg2);
+
+ str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
+
+ len1 = (ulint) eval_node_get_int_val(arg2);
+ len2 = (ulint) eval_node_get_int_val(arg3);
+
+ dfield = que_node_get_val(func_node);
+
+ dfield_set_data(dfield, str1 + len1, len2);
+}
+
+/*****************************************************************//**
+Evaluates an instr-function node. */
+static
+void
+eval_instr(
+/*=======*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ dfield_t* dfield1;
+ dfield_t* dfield2;
+ lint int_val;
+ byte* str1;
+ byte* str2;
+ byte match_char;
+ ulint len1;
+ ulint len2;
+ ulint i;
+ ulint j;
+
+ arg1 = func_node->args;
+ arg2 = que_node_get_next(arg1);
+
+ dfield1 = que_node_get_val(arg1);
+ dfield2 = que_node_get_val(arg2);
+
+ str1 = static_cast<byte*>(dfield_get_data(dfield1));
+ str2 = static_cast<byte*>(dfield_get_data(dfield2));
+
+ len1 = dfield_get_len(dfield1);
+ len2 = dfield_get_len(dfield2);
+
+ if (len2 == 0) {
+ ut_error;
+ }
+
+ match_char = str2[0];
+
+ for (i = 0; i < len1; i++) {
+ /* In this outer loop, the number of matched characters is 0 */
+
+ if (str1[i] == match_char) {
+
+ if (i + len2 > len1) {
+
+ break;
+ }
+
+ for (j = 1;; j++) {
+ /* We have already matched j characters */
+
+ if (j == len2) {
+ int_val = lint(i) + 1;
+
+ goto match_found;
+ }
+
+ if (str1[i + j] != str2[j]) {
+
+ break;
+ }
+ }
+ }
+ }
+
+ int_val = 0;
+
+match_found:
+ eval_node_set_int_val(func_node, int_val);
+}
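+
+/* Worked example (illustrative): with arg1 = "foobarbar" and
+arg2 = "bar", the outer loop first hits match_char at i = 3, the
+inner loop runs until j == len2, and the stored result is the
+1-based position lint(3) + 1 = 4, matching SQL INSTR() semantics.
+If arg2 never occurs in arg1, int_val remains 0. */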
+
+/*****************************************************************//**
+Evaluates a CONCAT function node, concatenating its arguments. */
+static
+void
+eval_concat(
+/*========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg;
+ dfield_t* dfield;
+ byte* data;
+ ulint len;
+ ulint len1;
+
+ arg = func_node->args;
+ len = 0;
+
+ while (arg) {
+ len1 = dfield_get_len(que_node_get_val(arg));
+
+ len += len1;
+
+ arg = que_node_get_next(arg);
+ }
+
+ data = eval_node_ensure_val_buf(func_node, len);
+
+ arg = func_node->args;
+ len = 0;
+
+ while (arg) {
+ dfield = que_node_get_val(arg);
+ len1 = dfield_get_len(dfield);
+
+ memcpy(data + len, dfield_get_data(dfield), len1);
+
+ len += len1;
+
+ arg = que_node_get_next(arg);
+ }
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node. If the first argument is an integer,
+this function looks at the second argument which is the integer length in
+bytes, and converts the integer to a VARCHAR.
+If the first argument is of some other type, this function converts it to
+BINARY. */
+UNIV_INLINE
+void
+eval_to_binary(
+/*===========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ dfield_t* dfield;
+ byte* str1;
+ ulint len;
+ ulint len1;
+
+ arg1 = func_node->args;
+
+ str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
+
+ if (dtype_get_mtype(que_node_get_data_type(arg1)) != DATA_INT) {
+
+ len = dfield_get_len(que_node_get_val(arg1));
+
+ dfield = que_node_get_val(func_node);
+
+ dfield_set_data(dfield, str1, len);
+
+ return;
+ }
+
+ arg2 = que_node_get_next(arg1);
+
+ len1 = (ulint) eval_node_get_int_val(arg2);
+
+ if (len1 > 4) {
+
+ ut_error;
+ }
+
+ dfield = que_node_get_val(func_node);
+
+ dfield_set_data(dfield, str1 + (4 - len1), len1);
+}
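+
+/* Worked example (illustrative): for a DATA_INT argument stored as
+the 4 bytes {0x00, 0x00, 0x01, 0x02} with a length argument of 2,
+the result is set to point at the last two bytes {0x01, 0x02}
+(str1 + (4 - 2)); a length argument greater than 4 fails the
+ut_error check above. */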
+
+/*****************************************************************//**
+Evaluate LENGTH(). */
+inline void eval_length(func_node_t* func_node)
+{
+ eval_node_set_int_val(func_node,
+ dfield_get_len(que_node_get_val
+ (func_node->args)));
+}
+
+/*****************************************************************//**
+Evaluates a function node. */
+void
+eval_func(
+/*======*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg;
+ ulint fclass;
+
+ ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC);
+
+ fclass = func_node->fclass;
+ const int func = func_node->func;
+
+ arg = func_node->args;
+
+ /* Evaluate first the argument list */
+ while (arg) {
+ eval_exp(arg);
+
+ /* The functions are not defined for SQL null argument
+ values, except for eval_cmp and notfound */
+
+ if (dfield_is_null(que_node_get_val(arg))
+ && (fclass != PARS_FUNC_CMP)
+ && (func != PARS_NOTFOUND_TOKEN)) {
+ ut_error;
+ }
+
+ arg = que_node_get_next(arg);
+ }
+
+ switch (fclass) {
+ case PARS_FUNC_CMP:
+ eval_cmp(func_node);
+ return;
+ case PARS_FUNC_ARITH:
+ eval_arith(func_node);
+ return;
+ case PARS_FUNC_AGGREGATE:
+ eval_aggregate(func_node);
+ return;
+ case PARS_FUNC_PREDEFINED:
+ switch (func) {
+ case PARS_NOTFOUND_TOKEN:
+ eval_notfound(func_node);
+ return;
+ case PARS_SUBSTR_TOKEN:
+ eval_substr(func_node);
+ return;
+ case PARS_INSTR_TOKEN:
+ eval_instr(func_node);
+ return;
+ case PARS_CONCAT_TOKEN:
+ eval_concat(func_node);
+ return;
+ case PARS_TO_BINARY_TOKEN:
+ eval_to_binary(func_node);
+ return;
+ case PARS_LENGTH_TOKEN:
+ eval_length(func_node);
+ return;
+ default:
+ ut_error;
+ }
+ case PARS_FUNC_LOGICAL:
+ eval_logical(func_node);
+ return;
+ }
+
+ ut_error;
+}
diff --git a/storage/innobase/eval/eval0proc.cc b/storage/innobase/eval/eval0proc.cc
new file mode 100644
index 00000000..7e39443f
--- /dev/null
+++ b/storage/innobase/eval/eval0proc.cc
@@ -0,0 +1,286 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file eval/eval0proc.cc
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#include "eval0proc.h"
+
+/**********************************************************************//**
+Performs an execution step of an if-statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+if_step(
+/*====*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ if_node_t* node;
+ elsif_node_t* elsif_node;
+
+ ut_ad(thr);
+
+ node = static_cast<if_node_t*>(thr->run_node);
+ ut_ad(que_node_get_type(node) == QUE_NODE_IF);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+
+ /* Evaluate the condition */
+
+ eval_exp(node->cond);
+
+ if (eval_node_get_ibool_val(node->cond)) {
+
+ /* The condition evaluated to TRUE: start execution
+ from the first statement in the statement list */
+
+ thr->run_node = node->stat_list;
+
+ } else if (node->else_part) {
+ thr->run_node = node->else_part;
+
+ } else if (node->elsif_list) {
+ elsif_node = node->elsif_list;
+
+ for (;;) {
+ eval_exp(elsif_node->cond);
+
+ if (eval_node_get_ibool_val(
+ elsif_node->cond)) {
+
+ /* The condition evaluated to TRUE:
+ start execution from the first
+ statement in the statement list */
+
+ thr->run_node = elsif_node->stat_list;
+
+ break;
+ }
+
+ elsif_node = static_cast<elsif_node_t*>(
+ que_node_get_next(elsif_node));
+
+ if (elsif_node == NULL) {
+ thr->run_node = NULL;
+
+ break;
+ }
+ }
+ } else {
+ thr->run_node = NULL;
+ }
+ } else {
+ /* Move to the next statement */
+ ut_ad(que_node_get_next(thr->prev_node) == NULL);
+
+ thr->run_node = NULL;
+ }
+
+ if (thr->run_node == NULL) {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a while-statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+while_step(
+/*=======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ while_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<while_node_t*>(thr->run_node);
+ ut_ad(que_node_get_type(node) == QUE_NODE_WHILE);
+
+ ut_ad((thr->prev_node == que_node_get_parent(node))
+ || (que_node_get_next(thr->prev_node) == NULL));
+
+ /* Evaluate the condition */
+
+ eval_exp(node->cond);
+
+ if (eval_node_get_ibool_val(node->cond)) {
+
+ /* The condition evaluated to TRUE: start execution
+ from the first statement in the statement list */
+
+ thr->run_node = node->stat_list;
+ } else {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of an assignment statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+assign_step(
+/*========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ assign_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<assign_node_t*>(thr->run_node);
+ ut_ad(que_node_get_type(node) == QUE_NODE_ASSIGNMENT);
+
+ /* Evaluate the value to assign */
+
+ eval_exp(node->val);
+
+ eval_node_copy_val(node->var->alias, node->val);
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a for-loop node.
+@return query thread to run next or NULL */
+que_thr_t*
+for_step(
+/*=====*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ for_node_t* node;
+ que_node_t* parent;
+ lint loop_var_value;
+
+ ut_ad(thr);
+
+ node = static_cast<for_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FOR);
+
+ parent = que_node_get_parent(node);
+
+ if (thr->prev_node != parent) {
+
+ /* Move to the next statement */
+ thr->run_node = que_node_get_next(thr->prev_node);
+
+ if (thr->run_node != NULL) {
+
+ return(thr);
+ }
+
+ /* Increment the value of loop_var */
+
+ loop_var_value = 1 + eval_node_get_int_val(node->loop_var);
+ } else {
+ /* Initialize the loop */
+
+ eval_exp(node->loop_start_limit);
+ eval_exp(node->loop_end_limit);
+
+ loop_var_value = eval_node_get_int_val(node->loop_start_limit);
+
+ node->loop_end_value
+ = (int) eval_node_get_int_val(node->loop_end_limit);
+ }
+
+ /* Check if we should do another loop */
+
+ if (loop_var_value > node->loop_end_value) {
+
+ /* Enough loops done */
+
+ thr->run_node = parent;
+ } else {
+ eval_node_set_int_val(node->loop_var, loop_var_value);
+
+ thr->run_node = node->stat_list;
+ }
+
+ return(thr);
+}
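+
+/* Execution sketch (illustrative): for a loop such as
+FOR i IN 1 .. 3 LOOP ... END LOOP, the first visit (prev_node ==
+parent) evaluates both limits and sets i = 1; each revisit after the
+statement list increments i; once the incremented value exceeds
+loop_end_value (here, when i would become 4), control returns to the
+parent node. */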
+
+/**********************************************************************//**
+Performs an execution step of an exit statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+exit_step(
+/*======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ exit_node_t* node;
+ que_node_t* loop_node;
+
+ ut_ad(thr);
+
+ node = static_cast<exit_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_EXIT);
+
+ /* Loops exit by setting thr->run_node to the loop node's parent, so
+ find our containing loop node and get its parent. */
+
+ loop_node = que_node_get_containing_loop_node(node);
+
+ /* If an EXIT statement is used outside of a loop, this
+ assertion will fail. */
+ ut_a(loop_node);
+
+ thr->run_node = que_node_get_parent(loop_node);
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a return-statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+return_step(
+/*========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ return_node_t* node;
+ que_node_t* parent;
+
+ ut_ad(thr);
+
+ node = static_cast<return_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_RETURN);
+
+ parent = node;
+
+ while (que_node_get_type(parent) != QUE_NODE_PROC) {
+
+ parent = que_node_get_parent(parent);
+ }
+
+ ut_a(parent);
+
+ thr->run_node = que_node_get_parent(parent);
+
+ return(thr);
+}
diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc
new file mode 100644
index 00000000..240a2682
--- /dev/null
+++ b/storage/innobase/fil/fil0crypt.cc
@@ -0,0 +1,2642 @@
+/*****************************************************************************
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (c) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file fil0crypt.cc
+Innodb file space encrypt/decrypt
+
+Created by Jonas Oreland, Google
+Modified by Jan Lindström, jan.lindstrom@mariadb.com
+*******************************************************/
+
+#include "fil0crypt.h"
+#include "mtr0types.h"
+#include "mach0data.h"
+#include "page0zip.h"
+#include "buf0checksum.h"
+#ifdef UNIV_INNOCHECKSUM
+# include "buf0buf.h"
+#else
+#include "buf0dblwr.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "ut0ut.h"
+#include "fsp0fsp.h"
+#include "fil0pagecompress.h"
+#include <my_crypt.h>
+
+static bool fil_crypt_threads_inited = false;
+
+/** Is encryption enabled/disabled */
+UNIV_INTERN ulong srv_encrypt_tables = 0;
+
+/** Number of key rotation threads requested */
+UNIV_INTERN uint srv_n_fil_crypt_threads = 0;
+
+/** Number of key rotation threads started */
+UNIV_INTERN uint srv_n_fil_crypt_threads_started = 0;
+
+/** At this age or older a space/page will be rotated */
+UNIV_INTERN uint srv_fil_crypt_rotate_key_age;
+
+/** Whether the encryption plugin does key rotation */
+static bool srv_encrypt_rotate;
+
+/** Event to signal FROM the key rotation threads. */
+static os_event_t fil_crypt_event;
+
+/** Event to signal TO the key rotation threads. */
+UNIV_INTERN os_event_t fil_crypt_threads_event;
+
+/** Event for waking up throttled threads. */
+static os_event_t fil_crypt_throttle_sleep_event;
+
+/** Mutex for key rotation threads. */
+UNIV_INTERN ib_mutex_t fil_crypt_threads_mutex;
+
+/** Variable ensuring that only one thread at a time does the initial conversion */
+static bool fil_crypt_start_converting = false;
+
+/** Variables for throttling */
+UNIV_INTERN uint srv_n_fil_crypt_iops = 100; // 10ms per iop
+static uint srv_alloc_time = 3; // allocate iops for 3s at a time
+static uint n_fil_crypt_iops_allocated = 0;
+
+#define DEBUG_KEYROTATION_THROTTLING 0
+
+/** Statistics variables */
+static fil_crypt_stat_t crypt_stat;
+static ib_mutex_t crypt_stat_mutex;
+
+/***********************************************************************
+Check if a key needs rotation given a key_state
+@param[in] crypt_data Encryption information
+@param[in] key_version Current key version
+@param[in] latest_key_version Latest key version
+@param[in] rotate_key_age when to rotate
+@return true if key needs rotation, false if not */
+static bool
+fil_crypt_needs_rotation(
+ const fil_space_crypt_t* crypt_data,
+ uint key_version,
+ uint latest_key_version,
+ uint rotate_key_age)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************
+Init space crypt */
+UNIV_INTERN
+void
+fil_space_crypt_init()
+{
+ fil_crypt_throttle_sleep_event = os_event_create(0);
+
+ mutex_create(LATCH_ID_FIL_CRYPT_STAT_MUTEX, &crypt_stat_mutex);
+ memset(&crypt_stat, 0, sizeof(crypt_stat));
+}
+
+/*********************************************************************
+Cleanup space crypt */
+UNIV_INTERN
+void
+fil_space_crypt_cleanup()
+{
+ os_event_destroy(fil_crypt_throttle_sleep_event);
+ mutex_free(&crypt_stat_mutex);
+}
+
+/**
+Get latest key version from encryption plugin.
+@return key version or ENCRYPTION_KEY_VERSION_INVALID */
+uint
+fil_space_crypt_t::key_get_latest_version(void)
+{
+ uint key_version = key_found;
+
+ if (is_key_found()) {
+ key_version = encryption_key_get_latest_version(key_id);
+ /* InnoDB does a dirty read of srv_fil_crypt_rotate_key_age
+ here. That is fine, because srv_encrypt_rotate can only
+ be set to true once. */
+ if (!srv_encrypt_rotate
+ && key_version > srv_fil_crypt_rotate_key_age) {
+ srv_encrypt_rotate = true;
+ }
+
+ srv_stats.n_key_requests.inc();
+ key_found = key_version;
+ }
+
+ return key_version;
+}
+
+/******************************************************************
+Get the latest key version, waking the encryption threads if needed
+@param[in,out] crypt_data Crypt data */
+static inline
+uint
+fil_crypt_get_latest_key_version(
+ fil_space_crypt_t* crypt_data)
+{
+ ut_ad(crypt_data != NULL);
+
+ uint key_version = crypt_data->key_get_latest_version();
+
+ if (crypt_data->is_key_found()) {
+
+ if (fil_crypt_needs_rotation(
+ crypt_data,
+ crypt_data->min_key_version,
+ key_version,
+ srv_fil_crypt_rotate_key_age)) {
+ /* The event below has been seen as a NULL pointer
+ at startup, when a new database was created and
+ we create a checkpoint. Only seen when debugging. */
+ if (fil_crypt_threads_inited) {
+ os_event_set(fil_crypt_threads_event);
+ }
+ }
+ }
+
+ return key_version;
+}
+
+/******************************************************************
+Mutex helper for crypt_data->scheme */
+void
+crypt_data_scheme_locker(
+/*=====================*/
+ st_encryption_scheme* scheme,
+ int exit)
+{
+ fil_space_crypt_t* crypt_data =
+ static_cast<fil_space_crypt_t*>(scheme);
+
+ if (exit) {
+ mutex_exit(&crypt_data->mutex);
+ } else {
+ mutex_enter(&crypt_data->mutex);
+ }
+}
+
+/******************************************************************
+Create a fil_space_crypt_t object
+@param[in] type CRYPT_SCHEME_UNENCRYPTED or
+ CRYPT_SCHEME_1
+@param[in] encrypt_mode FIL_ENCRYPTION_DEFAULT or
+ FIL_ENCRYPTION_ON or
+ FIL_ENCRYPTION_OFF
+@param[in] min_key_version key_version or 0
+@param[in] key_id Used key id
+@return crypt object */
+static
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+ uint type,
+ fil_encryption_t encrypt_mode,
+ uint min_key_version,
+ uint key_id)
+{
+ fil_space_crypt_t* crypt_data = NULL;
+ if (void* buf = ut_zalloc_nokey(sizeof(fil_space_crypt_t))) {
+ crypt_data = new(buf)
+ fil_space_crypt_t(
+ type,
+ min_key_version,
+ key_id,
+ encrypt_mode);
+ }
+
+ return crypt_data;
+}
+
+/******************************************************************
+Create a fil_space_crypt_t object
+@param[in] encrypt_mode FIL_ENCRYPTION_DEFAULT or
+ FIL_ENCRYPTION_ON or
+ FIL_ENCRYPTION_OFF
+
+@param[in] key_id Encryption key id
+@return crypt object */
+UNIV_INTERN
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+ fil_encryption_t encrypt_mode,
+ uint key_id)
+{
+ return (fil_space_create_crypt_data(0, encrypt_mode, 0, key_id));
+}
+
+/******************************************************************
+Merge fil_space_crypt_t object
+@param[in,out] dst Destination crypt data
+@param[in] src Source crypt data */
+UNIV_INTERN
+void
+fil_space_merge_crypt_data(
+ fil_space_crypt_t* dst,
+ const fil_space_crypt_t* src)
+{
+ mutex_enter(&dst->mutex);
+
+ /* validate that they are mergeable */
+ ut_a(src->type == CRYPT_SCHEME_UNENCRYPTED ||
+ src->type == CRYPT_SCHEME_1);
+
+ ut_a(dst->type == CRYPT_SCHEME_UNENCRYPTED ||
+ dst->type == CRYPT_SCHEME_1);
+
+ dst->encryption = src->encryption;
+ dst->type = src->type;
+ dst->min_key_version = src->min_key_version;
+ dst->keyserver_requests += src->keyserver_requests;
+
+ mutex_exit(&dst->mutex);
+}
+
+/** Initialize encryption parameters from a tablespace header page.
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] page first page of the tablespace
+@return crypt data from page 0
+@retval NULL if not present or not valid */
+fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page)
+{
+ const ulint offset = FSP_HEADER_OFFSET
+ + fsp_header_get_encryption_offset(zip_size);
+
+ if (memcmp(page + offset, CRYPT_MAGIC, MAGIC_SZ) != 0) {
+ /* Crypt data is not stored. */
+ return NULL;
+ }
+
+ uint8_t type = mach_read_from_1(page + offset + MAGIC_SZ + 0);
+ uint8_t iv_length = mach_read_from_1(page + offset + MAGIC_SZ + 1);
+ fil_space_crypt_t* crypt_data;
+
+ if (!(type == CRYPT_SCHEME_UNENCRYPTED ||
+ type == CRYPT_SCHEME_1)
+ || iv_length != sizeof crypt_data->iv) {
+ ib::error() << "Found non sensible crypt scheme: "
+ << type << "," << iv_length
+ << " for space: "
+ << page_get_space_id(page);
+ return NULL;
+ }
+
+ uint min_key_version = mach_read_from_4
+ (page + offset + MAGIC_SZ + 2 + iv_length);
+
+ uint key_id = mach_read_from_4
+ (page + offset + MAGIC_SZ + 2 + iv_length + 4);
+
+ fil_encryption_t encryption = (fil_encryption_t)mach_read_from_1(
+ page + offset + MAGIC_SZ + 2 + iv_length + 8);
+
+ crypt_data = fil_space_create_crypt_data(encryption, key_id);
+ /* We need to overwrite these, because the function above
+ initialized the members to default values */
+ crypt_data->type = type;
+ crypt_data->min_key_version = min_key_version;
+ memcpy(crypt_data->iv, page + offset + MAGIC_SZ + 2, iv_length);
+
+ return crypt_data;
+}
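+
+/* On-disk layout of the crypt data parsed above, as derived from the
+reads in this function and the writes in write_page0() (sizes in
+bytes):
+
+	offset + 0				CRYPT_MAGIC		(MAGIC_SZ)
+	offset + MAGIC_SZ			type			(1)
+	offset + MAGIC_SZ + 1			iv_length		(1)
+	offset + MAGIC_SZ + 2			iv			(iv_length)
+	offset + MAGIC_SZ + 2 + iv_length	min_key_version		(4)
+	offset + MAGIC_SZ + 2 + iv_length + 4	key_id			(4)
+	offset + MAGIC_SZ + 2 + iv_length + 8	fil_encryption_t	(1)
+*/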
+
+/******************************************************************
+Free a crypt data object
+@param[in,out] crypt_data crypt data to be freed */
+UNIV_INTERN
+void
+fil_space_destroy_crypt_data(
+ fil_space_crypt_t **crypt_data)
+{
+ if (crypt_data != NULL && (*crypt_data) != NULL) {
+ fil_space_crypt_t* c;
+ if (UNIV_LIKELY(fil_crypt_threads_inited)) {
+ mutex_enter(&fil_crypt_threads_mutex);
+ c = *crypt_data;
+ *crypt_data = NULL;
+ mutex_exit(&fil_crypt_threads_mutex);
+ } else {
+ ut_ad(srv_read_only_mode || !srv_was_started);
+ c = *crypt_data;
+ *crypt_data = NULL;
+ }
+ if (c) {
+ c->~fil_space_crypt_t();
+ ut_free(c);
+ }
+ }
+}
+
+/** Amend encryption information from redo log.
+@param[in] space tablespace
+@param[in] data encryption metadata */
+void fil_crypt_parse(fil_space_t* space, const byte* data)
+{
+ ut_ad(data[1] == MY_AES_BLOCK_SIZE);
+ if (void* buf = ut_zalloc_nokey(sizeof(fil_space_crypt_t))) {
+ fil_space_crypt_t* crypt_data = new(buf)
+ fil_space_crypt_t(
+ data[0],
+ mach_read_from_4(&data[2 + MY_AES_BLOCK_SIZE]),
+ mach_read_from_4(&data[6 + MY_AES_BLOCK_SIZE]),
+ static_cast<fil_encryption_t>
+ (data[10 + MY_AES_BLOCK_SIZE]));
+ memcpy(crypt_data->iv, data + 2, MY_AES_BLOCK_SIZE);
+ mutex_enter(&fil_system.mutex);
+ if (space->crypt_data) {
+ fil_space_merge_crypt_data(space->crypt_data,
+ crypt_data);
+ fil_space_destroy_crypt_data(&crypt_data);
+ crypt_data = space->crypt_data;
+ } else {
+ space->crypt_data = crypt_data;
+ }
+ mutex_exit(&fil_system.mutex);
+ }
+}
+
+/** Write the crypt data information to the given page.
+It should be called during ibd file creation.
+@param[in] flags tablespace flags
+@param[in,out] page first page of the tablespace */
+void
+fil_space_crypt_t::fill_page0(
+ ulint flags,
+ byte* page)
+{
+ const uint len = sizeof(iv);
+ const ulint offset = FSP_HEADER_OFFSET
+ + fsp_header_get_encryption_offset(
+ fil_space_t::zip_size(flags));
+
+ memcpy(page + offset, CRYPT_MAGIC, MAGIC_SZ);
+ mach_write_to_1(page + offset + MAGIC_SZ, type);
+ mach_write_to_1(page + offset + MAGIC_SZ + 1, len);
+ memcpy(page + offset + MAGIC_SZ + 2, &iv, len);
+
+ mach_write_to_4(page + offset + MAGIC_SZ + 2 + len,
+ min_key_version);
+ mach_write_to_4(page + offset + MAGIC_SZ + 2 + len + 4,
+ key_id);
+ mach_write_to_1(page + offset + MAGIC_SZ + 2 + len + 8,
+ encryption);
+}
+
+/** Write encryption metadata to the first page.
+@param[in,out] block first page of the tablespace
+@param[in,out] mtr mini-transaction */
+void fil_space_crypt_t::write_page0(buf_block_t* block, mtr_t* mtr)
+{
+ const ulint offset = FSP_HEADER_OFFSET
+ + fsp_header_get_encryption_offset(block->zip_size());
+ byte* b = block->frame + offset;
+
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*block, b, CRYPT_MAGIC, MAGIC_SZ);
+
+ b += MAGIC_SZ;
+ byte* const start = b;
+ *b++ = static_cast<byte>(type);
+ compile_time_assert(sizeof iv == MY_AES_BLOCK_SIZE);
+ compile_time_assert(sizeof iv == CRYPT_SCHEME_1_IV_LEN);
+ *b++ = sizeof iv;
+ memcpy(b, iv, sizeof iv);
+ b += sizeof iv;
+ mach_write_to_4(b, min_key_version);
+ b += 4;
+ mach_write_to_4(b, key_id);
+ b += 4;
+ *b++ = byte(encryption);
+ ut_ad(b - start == 11 + MY_AES_BLOCK_SIZE);
+ /* We must log also any unchanged bytes, because recovery will
+ invoke fil_crypt_parse() based on this log record. */
+ mtr->memcpy(*block, offset + MAGIC_SZ, b - start);
+}
+
+/** Encrypt a buffer for the non-full-checksum format.
+@param[in,out] crypt_data Crypt data
+@param[in] space space_id
+@param[in] offset Page offset
+@param[in] lsn Log sequence number
+@param[in] src_frame Page to encrypt
+@param[in] zip_size ROW_FORMAT=COMPRESSED
+ page size, or 0
+@param[in,out] dst_frame Output buffer
+@return encrypted buffer or NULL */
+static byte* fil_encrypt_buf_for_non_full_checksum(
+ fil_space_crypt_t* crypt_data,
+ ulint space,
+ ulint offset,
+ lsn_t lsn,
+ const byte* src_frame,
+ ulint zip_size,
+ byte* dst_frame)
+{
+ uint size = uint(zip_size ? zip_size : srv_page_size);
+ uint key_version = fil_crypt_get_latest_key_version(crypt_data);
+ ut_a(key_version != ENCRYPTION_KEY_VERSION_INVALID);
+ ut_ad(!ut_align_offset(src_frame, 8));
+ ut_ad(!ut_align_offset(dst_frame, 8));
+
+ const bool page_compressed = fil_page_get_type(src_frame)
+ == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED;
+ uint header_len = FIL_PAGE_DATA;
+
+ if (page_compressed) {
+ header_len += FIL_PAGE_ENCRYPT_COMP_METADATA_LEN;
+ }
+
+ /* FIL page header is not encrypted */
+ memcpy(dst_frame, src_frame, header_len);
+ mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
+ key_version);
+
+ /* Calculate the start offset in a page */
+ uint unencrypted_bytes = header_len + FIL_PAGE_DATA_END;
+ uint srclen = size - unencrypted_bytes;
+ const byte* src = src_frame + header_len;
+ byte* dst = dst_frame + header_len;
+ uint32 dstlen = 0;
+ ib_uint32_t checksum = 0;
+
+ if (page_compressed) {
+ srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA);
+ }
+
+ int rc = encryption_scheme_encrypt(src, srclen, dst, &dstlen,
+ crypt_data, key_version,
+ (uint32)space, (uint32)offset, lsn);
+ ut_a(rc == MY_AES_OK);
+ ut_a(dstlen == srclen);
+
+ /* For compressed tables we do not store the FIL header because
+ the whole page is not stored to the disk. In compressed tables only
+ the FIL header + compressed (and now encrypted) payload aligned
+ to a sector boundary is written. */
+ if (!page_compressed) {
+ /* FIL page trailer is also not encrypted */
+ static_assert(FIL_PAGE_DATA_END == 8, "alignment");
+ memcpy_aligned<8>(dst_frame + size - FIL_PAGE_DATA_END,
+ src_frame + size - FIL_PAGE_DATA_END, 8);
+ } else {
+ /* Clean up rest of buffer */
+ memset(dst_frame+header_len+srclen, 0,
+ size - (header_len + srclen));
+ }
+
+ checksum = fil_crypt_calculate_checksum(zip_size, dst_frame);
+
+ /* store the post-encryption checksum after the key-version */
+ mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4,
+ checksum);
+
+ ut_ad(fil_space_verify_crypt_checksum(dst_frame, zip_size));
+
+ srv_stats.pages_encrypted.inc();
+
+ return dst_frame;
+}
+
+/** Encrypt a buffer for the full_crc32 format.
+@param[in,out] crypt_data Crypt data
+@param[in] space space_id
+@param[in] offset Page offset
+@param[in] lsn Log sequence number
+@param[in] src_frame Page to encrypt
+@param[in,out] dst_frame Output buffer
+@return encrypted buffer or NULL */
+static byte* fil_encrypt_buf_for_full_crc32(
+ fil_space_crypt_t* crypt_data,
+ ulint space,
+ ulint offset,
+ lsn_t lsn,
+ const byte* src_frame,
+ byte* dst_frame)
+{
+ uint key_version = fil_crypt_get_latest_key_version(crypt_data);
+ ut_d(bool corrupted = false);
+ const uint size = buf_page_full_crc32_size(src_frame, NULL,
+#ifdef UNIV_DEBUG
+ &corrupted
+#else
+ NULL
+#endif
+ );
+ ut_ad(!corrupted);
+ uint srclen = size - (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ + FIL_PAGE_FCRC32_CHECKSUM);
+ const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+ byte* dst = dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+ uint dstlen = 0;
+
+ ut_a(key_version != ENCRYPTION_KEY_VERSION_INVALID);
+
+ /* Till FIL_PAGE_LSN, page is not encrypted */
+ memcpy(dst_frame, src_frame, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+
+ /* Write key version to the page. */
+ mach_write_to_4(dst_frame + FIL_PAGE_FCRC32_KEY_VERSION, key_version);
+
+ int rc = encryption_scheme_encrypt(src, srclen, dst, &dstlen,
+ crypt_data, key_version,
+ uint(space), uint(offset), lsn);
+ ut_a(rc == MY_AES_OK);
+ ut_a(dstlen == srclen);
+
+ const ulint payload = size - FIL_PAGE_FCRC32_CHECKSUM;
+ mach_write_to_4(dst_frame + payload, ut_crc32(dst_frame, payload));
+ /* Clean the rest of the buffer. FIXME: Punch holes when writing! */
+ memset(dst_frame + (payload + 4), 0, srv_page_size - (payload + 4));
+
+ srv_stats.pages_encrypted.inc();
+
+ return dst_frame;
+}
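+
+/* Plaintext layout sketch for the two variants above (assuming the
+usual header constants FIL_PAGE_DATA = 38,
+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION = 26, FIL_PAGE_DATA_END = 8
+and FIL_PAGE_FCRC32_CHECKSUM = 4):
+
+	non-full-checksum: bytes [0, 38) stay in the clear (plus
+	compression metadata for page_compressed), the payload is
+	encrypted, and the last 8 trailer bytes stay in the clear;
+	full_crc32: bytes [0, 26) stay in the clear, the payload is
+	encrypted, and the last 4 bytes hold ut_crc32() of everything
+	before them. */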
+
+/** Encrypt a buffer.
+@param[in,out] crypt_data Crypt data
+@param[in] space space_id
+@param[in] offset Page offset
+@param[in] src_frame Page to encrypt
+@param[in] zip_size ROW_FORMAT=COMPRESSED
+ page size, or 0
+@param[in,out] dst_frame Output buffer
+@param[in] use_full_checksum full crc32 algo is used
+@return encrypted buffer or NULL */
+byte* fil_encrypt_buf(
+ fil_space_crypt_t* crypt_data,
+ ulint space,
+ ulint offset,
+ const byte* src_frame,
+ ulint zip_size,
+ byte* dst_frame,
+ bool use_full_checksum)
+{
+ const lsn_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN);
+
+ if (use_full_checksum) {
+ ut_ad(!zip_size);
+ return fil_encrypt_buf_for_full_crc32(
+ crypt_data, space, offset,
+ lsn, src_frame, dst_frame);
+ }
+
+ return fil_encrypt_buf_for_non_full_checksum(
+ crypt_data, space, offset, lsn,
+ src_frame, zip_size, dst_frame);
+}
+
+/** Check whether a page of this type may be encrypted.
+@param[in] space tablespace object
+@param[in] src_frame source page
+@return true if the page type may be encrypted */
+static bool fil_space_encrypt_valid_page_type(
+ const fil_space_t* space,
+ const byte* src_frame)
+{
+ switch (fil_page_get_type(src_frame)) {
+ case FIL_PAGE_RTREE:
+ return space->full_crc32();
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ return false;
+ }
+
+ return true;
+}
+
+/******************************************************************
+Encrypt a page
+
+@param[in] space Tablespace
+@param[in] offset Page offset
+@param[in] src_frame Page to encrypt
+@param[in,out] dst_frame Output buffer
+@return encrypted buffer or NULL */
+byte* fil_space_encrypt(
+ const fil_space_t* space,
+ ulint offset,
+ byte* src_frame,
+ byte* dst_frame)
+{
+ if (!fil_space_encrypt_valid_page_type(space, src_frame)) {
+ return src_frame;
+ }
+
+ if (!space->crypt_data || !space->crypt_data->is_encrypted()) {
+ return (src_frame);
+ }
+
+ ut_ad(space->referenced());
+
+ return fil_encrypt_buf(space->crypt_data, space->id, offset,
+ src_frame, space->zip_size(),
+ dst_frame, space->full_crc32());
+}
+
+/** Decrypt a page for the full_crc32 format.
+@param[in] space space id
+@param[in] crypt_data crypt_data
+@param[in] tmp_frame Temporary buffer
+@param[in,out] src_frame Page to decrypt
+@param[out] err DB_SUCCESS or DB_DECRYPTION_FAILED
+@return true if page decrypted, false if not.*/
+static bool fil_space_decrypt_full_crc32(
+ ulint space,
+ fil_space_crypt_t* crypt_data,
+ byte* tmp_frame,
+ byte* src_frame,
+ dberr_t* err)
+{
+ uint key_version = mach_read_from_4(
+ src_frame + FIL_PAGE_FCRC32_KEY_VERSION);
+ lsn_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN);
+ uint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);
+ *err = DB_SUCCESS;
+
+ if (key_version == ENCRYPTION_KEY_NOT_ENCRYPTED) {
+ return false;
+ }
+
+ ut_ad(crypt_data);
+ ut_ad(crypt_data->is_encrypted());
+
+ memcpy(tmp_frame, src_frame, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+
+ /* Calculate the offset where decryption starts */
+ const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+ byte* dst = tmp_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+ uint dstlen = 0;
+ bool corrupted = false;
+ uint size = buf_page_full_crc32_size(src_frame, NULL, &corrupted);
+ if (UNIV_UNLIKELY(corrupted)) {
+fail:
+ *err = DB_DECRYPTION_FAILED;
+ return false;
+ }
+
+ uint srclen = size - (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ + FIL_PAGE_FCRC32_CHECKSUM);
+
+ int rc = encryption_scheme_decrypt(src, srclen, dst, &dstlen,
+ crypt_data, key_version,
+ (uint) space, offset, lsn);
+
+ if (rc != MY_AES_OK || dstlen != srclen) {
+ if (rc == -1) {
+ goto fail;
+ }
+
+ ib::fatal() << "Unable to decrypt data-block "
+ << " src: " << src << "srclen: "
+ << srclen << " buf: " << dst << "buflen: "
+ << dstlen << " return-code: " << rc
+ << " Can't continue!";
+ }
+
+ /* Copy only checksum part in the trailer */
+ memcpy(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+ src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+ FIL_PAGE_FCRC32_CHECKSUM);
+
+ srv_stats.pages_decrypted.inc();
+
+ return true; /* page was decrypted */
+}
+
+/** Decrypt a page for the non-full-checksum format.
+@param[in] crypt_data crypt_data
+@param[in] tmp_frame Temporary buffer
+@param[in] physical_size page size
+@param[in,out] src_frame Page to decrypt
+@param[out] err DB_SUCCESS or DB_DECRYPTION_FAILED
+@return true if page decrypted, false if not.*/
+static bool fil_space_decrypt_for_non_full_checksum(
+ fil_space_crypt_t* crypt_data,
+ byte* tmp_frame,
+ ulint physical_size,
+ byte* src_frame,
+ dberr_t* err)
+{
+ uint key_version = mach_read_from_4(
+ src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+ bool page_compressed = (fil_page_get_type(src_frame)
+ == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
+ uint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);
+ uint space = mach_read_from_4(
+ src_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ ib_uint64_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN);
+
+ *err = DB_SUCCESS;
+
+ if (key_version == ENCRYPTION_KEY_NOT_ENCRYPTED) {
+ return false;
+ }
+
+ ut_a(crypt_data != NULL && crypt_data->is_encrypted());
+
+ /* Compute the unencrypted header length */
+ uint header_len = FIL_PAGE_DATA;
+
+ if (page_compressed) {
+ header_len += FIL_PAGE_ENCRYPT_COMP_METADATA_LEN;
+ }
+
+ /* Copy FIL page header, it is not encrypted */
+ memcpy(tmp_frame, src_frame, header_len);
+
+ /* Calculate the offset where decryption starts */
+ const byte* src = src_frame + header_len;
+ byte* dst = tmp_frame + header_len;
+ uint32 dstlen = 0;
+ uint srclen = uint(physical_size) - header_len - FIL_PAGE_DATA_END;
+
+ if (page_compressed) {
+ srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA);
+ }
+
+ int rc = encryption_scheme_decrypt(src, srclen, dst, &dstlen,
+ crypt_data, key_version,
+ space, offset, lsn);
+
+ if (! ((rc == MY_AES_OK) && ((ulint) dstlen == srclen))) {
+
+ if (rc == -1) {
+ *err = DB_DECRYPTION_FAILED;
+ return false;
+ }
+
+ ib::fatal() << "Unable to decrypt data-block "
+ << " src: " << static_cast<const void*>(src)
+ << "srclen: "
+ << srclen << " buf: "
+ << static_cast<const void*>(dst) << "buflen: "
+ << dstlen << " return-code: " << rc
+ << " Can't continue!";
+ }
+
+ /* For compressed tables we do not store the FIL header because
+ the whole page is not stored to the disk. In compressed tables only
+ the FIL header + compressed (and now encrypted) payload aligned
+ to a sector boundary is written. */
+ if (!page_compressed) {
+ /* Copy FIL trailer */
+ memcpy(tmp_frame + physical_size - FIL_PAGE_DATA_END,
+ src_frame + physical_size - FIL_PAGE_DATA_END,
+ FIL_PAGE_DATA_END);
+ }
+
+ srv_stats.pages_decrypted.inc();
+
+ return true; /* page was decrypted */
+}
+
+/** Decrypt a page.
+@param[in] space_id tablespace id
+@param[in] crypt_data crypt_data
+@param[in] tmp_frame Temporary buffer
+@param[in] physical_size page size
+@param[in] fsp_flags Tablespace flags
+@param[in,out] src_frame Page to decrypt
+@param[out] err DB_SUCCESS or DB_DECRYPTION_FAILED
+@return true if page decrypted, false if not.*/
+UNIV_INTERN
+bool
+fil_space_decrypt(
+ ulint space_id,
+ fil_space_crypt_t* crypt_data,
+ byte* tmp_frame,
+ ulint physical_size,
+ ulint fsp_flags,
+ byte* src_frame,
+ dberr_t* err)
+{
+ if (fil_space_t::full_crc32(fsp_flags)) {
+ return fil_space_decrypt_full_crc32(
+ space_id, crypt_data, tmp_frame, src_frame, err);
+ }
+
+ return fil_space_decrypt_for_non_full_checksum(crypt_data, tmp_frame,
+ physical_size, src_frame,
+ err);
+}
+
+/**
+Decrypt a page.
+@param[in] space Tablespace
+@param[in] tmp_frame Temporary buffer used for decrypting
+@param[in,out] src_frame Page to decrypt
+@return decrypted page, or the original unencrypted page if
+decryption is not needed.*/
+UNIV_INTERN
+byte*
+fil_space_decrypt(
+ const fil_space_t* space,
+ byte* tmp_frame,
+ byte* src_frame)
+{
+ dberr_t err = DB_SUCCESS;
+ byte* res = NULL;
+ const ulint physical_size = space->physical_size();
+
+ ut_ad(space->crypt_data != NULL && space->crypt_data->is_encrypted());
+ ut_ad(space->referenced());
+
+ bool encrypted = fil_space_decrypt(space->id, space->crypt_data,
+ tmp_frame, physical_size,
+ space->flags,
+ src_frame, &err);
+
+ if (err == DB_SUCCESS) {
+ if (encrypted) {
+ /* Copy the decrypted page back to the page buffer;
+ there are not really any other options. */
+ memcpy(src_frame, tmp_frame, physical_size);
+ }
+
+ res = src_frame;
+ }
+
+ return res;
+}
+
+/**
+Calculate the post-encryption checksum.
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] dst_frame Block where checksum is calculated
+@return page checksum */
+uint32_t
+fil_crypt_calculate_checksum(ulint zip_size, const byte* dst_frame)
+{
+ /* For encrypted tables we use only crc32 and strict_crc32 */
+ return zip_size
+ ? page_zip_calc_checksum(dst_frame, zip_size,
+ SRV_CHECKSUM_ALGORITHM_CRC32)
+ : buf_calc_page_crc32(dst_frame);
+}
+
+/***********************************************************************/
+
+/** A copy of global key state */
+struct key_state_t {
+ key_state_t() : key_id(0), key_version(0),
+ rotate_key_age(srv_fil_crypt_rotate_key_age) {}
+ bool operator==(const key_state_t& other) const {
+ return key_version == other.key_version &&
+ rotate_key_age == other.rotate_key_age;
+ }
+ uint key_id;
+ uint key_version;
+ uint rotate_key_age;
+};
+
+/***********************************************************************
+Copy global key state
+@param[in,out] new_state key state
+@param[in] crypt_data crypt data */
+static void
+fil_crypt_get_key_state(
+ key_state_t* new_state,
+ fil_space_crypt_t* crypt_data)
+{
+ if (srv_encrypt_tables) {
+ new_state->key_version = crypt_data->key_get_latest_version();
+ new_state->rotate_key_age = srv_fil_crypt_rotate_key_age;
+
+ ut_a(new_state->key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
+ } else {
+ new_state->key_version = 0;
+ new_state->rotate_key_age = 0;
+ }
+}
+
+/***********************************************************************
+Check if a key needs rotation given a key_state
+@param[in] crypt_data Encryption information
+@param[in] key_version Current key version
+@param[in] latest_key_version Latest key version
+@param[in] rotate_key_age when to rotate
+@return true if key needs rotation, false if not */
+static bool
+fil_crypt_needs_rotation(
+ const fil_space_crypt_t* crypt_data,
+ uint key_version,
+ uint latest_key_version,
+ uint rotate_key_age)
+{
+ if (key_version == ENCRYPTION_KEY_VERSION_INVALID) {
+ return false;
+ }
+
+ if (key_version == 0 && latest_key_version != 0) {
+ /* this is rotation unencrypted => encrypted
+ * ignore rotate_key_age */
+ return true;
+ }
+
+ if (latest_key_version == 0 && key_version != 0) {
+ if (crypt_data->encryption == FIL_ENCRYPTION_DEFAULT) {
+ /* this is rotation encrypted => unencrypted */
+ return true;
+ }
+ return false;
+ }
+
+ if (crypt_data->encryption == FIL_ENCRYPTION_DEFAULT
+ && crypt_data->type == CRYPT_SCHEME_1
+ && !srv_encrypt_tables) {
+ /* This is rotation encrypted => unencrypted */
+ return true;
+ }
+
+ if (rotate_key_age == 0) {
+ return false;
+ }
+
+ /* this is rotation encrypted => encrypted,
+ * only reencrypt if key is sufficiently old */
+ if (key_version + rotate_key_age < latest_key_version) {
+ return true;
+ }
+
+ return false;
+}
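+
+/* Worked example (illustrative): with rotate_key_age = 100, a page
+encrypted with key_version = 1 becomes due for rotation only once the
+latest key version reaches 102, because the test is
+key_version + rotate_key_age < latest_key_version; rotate_key_age = 0
+disables this age-based rotation entirely. */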
+
+/** Read page 0 and the possible crypt data from it.
+@param[in,out] space Tablespace */
+static inline
+void
+fil_crypt_read_crypt_data(fil_space_t* space)
+{
+ if (space->crypt_data || space->size || !space->get_size()) {
+ /* The encryption metadata has already been read, or
+ the tablespace is not encrypted and the file has been
+ opened already, or the file cannot be accessed,
+ likely due to a concurrent DROP
+ (possibly as part of TRUNCATE or ALTER TABLE).
+ FIXME: The file can become unaccessible any time
+ after this check! We should really remove this
+ function and instead make crypt_data an integral
+ part of fil_space_t. */
+ return;
+ }
+
+ const ulint zip_size = space->zip_size();
+ mtr_t mtr;
+ mtr.start();
+ if (buf_block_t* block = buf_page_get_gen(page_id_t(space->id, 0),
+ zip_size, RW_S_LATCH,
+ nullptr,
+ BUF_GET_POSSIBLY_FREED,
+ __FILE__, __LINE__, &mtr)) {
+ if (block->page.status == buf_page_t::FREED) {
+ goto func_exit;
+ }
+ mutex_enter(&fil_system.mutex);
+ if (!space->crypt_data && !space->is_stopping()) {
+ space->crypt_data = fil_space_read_crypt_data(
+ zip_size, block->frame);
+ }
+ mutex_exit(&fil_system.mutex);
+ }
+func_exit:
+ mtr.commit();
+}
+
+/** Start encrypting a space
+@param[in,out] space Tablespace
+@return true if a recheck of tablespace is needed by encryption thread. */
+static bool fil_crypt_start_encrypting_space(fil_space_t* space)
+{
+ mutex_enter(&fil_crypt_threads_mutex);
+
+ fil_space_crypt_t *crypt_data = space->crypt_data;
+
+ /* If space is not encrypted and encryption is not enabled, then
+ do not continue encrypting the space. */
+ if (!crypt_data && !srv_encrypt_tables) {
+ mutex_exit(&fil_crypt_threads_mutex);
+ return false;
+ }
+
+ const bool recheck = fil_crypt_start_converting;
+
+ if (recheck || crypt_data || space->is_stopping()) {
+ mutex_exit(&fil_crypt_threads_mutex);
+ return recheck;
+ }
+
+ /* NOTE: we need to write and flush page 0 before publishing
+ * the crypt data. This is so that after a restart there is
+ * no risk of finding encrypted pages without having
+ * crypt data in page 0 */
+
+ /* 1 - create crypt data */
+ crypt_data = fil_space_create_crypt_data(
+ FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY);
+
+ if (crypt_data == NULL) {
+ mutex_exit(&fil_crypt_threads_mutex);
+ return false;
+ }
+
+ fil_crypt_start_converting = true;
+ mutex_exit(&fil_crypt_threads_mutex);
+
+ mtr_t mtr;
+ mtr.start();
+
+ /* 2 - get page 0 */
+ dberr_t err = DB_SUCCESS;
+ if (buf_block_t* block = buf_page_get_gen(
+ page_id_t(space->id, 0), space->zip_size(),
+ RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED,
+ __FILE__, __LINE__, &mtr, &err)) {
+ if (block->page.status == buf_page_t::FREED) {
+ goto abort;
+ }
+
+ crypt_data->type = CRYPT_SCHEME_1;
+ crypt_data->min_key_version = 0; // all pages are unencrypted
+ crypt_data->rotate_state.start_time = time(0);
+ crypt_data->rotate_state.starting = true;
+ crypt_data->rotate_state.active_threads = 1;
+
+ mutex_enter(&fil_system.mutex);
+ const bool stopping = space->is_stopping();
+ if (!stopping) {
+ space->crypt_data = crypt_data;
+ }
+ mutex_exit(&fil_system.mutex);
+
+ if (stopping) {
+ goto abort;
+ }
+
+ /* 3 - write crypt data to page 0 */
+ mtr.set_named_space(space);
+ crypt_data->write_page0(block, &mtr);
+
+ mtr.commit();
+
+ /* 4 - sync tablespace before publishing crypt data */
+ while (buf_flush_list_space(space));
+
+ /* 5 - publish crypt data */
+ mutex_enter(&fil_crypt_threads_mutex);
+ mutex_enter(&crypt_data->mutex);
+ crypt_data->type = CRYPT_SCHEME_1;
+ ut_a(crypt_data->rotate_state.active_threads == 1);
+ crypt_data->rotate_state.active_threads = 0;
+ crypt_data->rotate_state.starting = false;
+
+ fil_crypt_start_converting = false;
+ mutex_exit(&crypt_data->mutex);
+ mutex_exit(&fil_crypt_threads_mutex);
+
+ return false;
+ }
+
+abort:
+ mtr.commit();
+ mutex_enter(&fil_crypt_threads_mutex);
+ fil_crypt_start_converting = false;
+ mutex_exit(&fil_crypt_threads_mutex);
+
+ crypt_data->~fil_space_crypt_t();
+ ut_free(crypt_data);
+ return false;
+}
+
+/** State of a rotation thread */
+struct rotate_thread_t {
+ explicit rotate_thread_t(uint no) {
+ memset(this, 0, sizeof(* this));
+ thread_no = no;
+ first = true;
+ estimated_max_iops = 20;
+ }
+
+ uint thread_no;
+ bool first; /*!< is position before first space */
+ fil_space_t* space; /*!< current space or NULL */
+ uint32_t offset; /*!< current page number */
+ ulint batch; /*!< #pages to rotate */
+ uint min_key_version_found;/*!< min key version found but not rotated */
+ lsn_t end_lsn; /*!< max lsn when rotating this space */
+
+ uint estimated_max_iops; /*!< estimation of max iops */
+ uint allocated_iops; /*!< allocated iops */
+ ulint cnt_waited; /*!< #times waited during this slot */
+ uintmax_t sum_waited_us; /*!< wait time during this slot */
+
+ fil_crypt_stat_t crypt_stat; // statistics
+
+ /** @return whether this thread should terminate */
+ bool should_shutdown() const {
+ switch (srv_shutdown_state) {
+ case SRV_SHUTDOWN_NONE:
+ return thread_no >= srv_n_fil_crypt_threads;
+ case SRV_SHUTDOWN_EXIT_THREADS:
+ /* srv_init_abort() must have been invoked */
+ case SRV_SHUTDOWN_CLEANUP:
+ case SRV_SHUTDOWN_INITIATED:
+ return true;
+ case SRV_SHUTDOWN_LAST_PHASE:
+ break;
+ }
+ ut_ad(0);
+ return true;
+ }
+};
+
+/** Check whether a tablespace may be removed from
+default_encrypt_list. It must be kept on the list while
+1) another encryption thread is actively working on it,
+2) it is eligible for key rotation, or
+3) it is in the flushing phase.
+@return true if the tablespace should be removed from
+the default encrypt list */
+static bool fil_crypt_must_remove(const fil_space_t &space)
+{
+ ut_ad(space.purpose == FIL_TYPE_TABLESPACE);
+ fil_space_crypt_t *crypt_data = space.crypt_data;
+ ut_ad(mutex_own(&fil_system.mutex));
+ const ulong encrypt_tables= srv_encrypt_tables;
+ if (!crypt_data)
+ return !encrypt_tables;
+ if (!crypt_data->is_key_found())
+ return true;
+
+ mutex_enter(&crypt_data->mutex);
+ const bool remove= (space.is_stopping() || crypt_data->not_encrypted()) &&
+ (!crypt_data->rotate_state.flushing &&
+ !encrypt_tables == !!crypt_data->min_key_version &&
+ !crypt_data->rotate_state.active_threads);
+ mutex_exit(&crypt_data->mutex);
+ return remove;
+}
+
+/***********************************************************************
+Check if space needs rotation given a key_state
+@param[in,out] state Key rotation state
+@param[in,out] key_state Key state
+@param[in,out] recheck needs recheck ?
+@return true if space needs key rotation */
+static
+bool
+fil_crypt_space_needs_rotation(
+ rotate_thread_t* state,
+ key_state_t* key_state,
+ bool* recheck)
+{
+ fil_space_t* space = state->space;
+
+ /* Make sure that tablespace is normal tablespace */
+ if (space->purpose != FIL_TYPE_TABLESPACE) {
+ return false;
+ }
+
+ ut_ad(space->referenced());
+
+ fil_space_crypt_t *crypt_data = space->crypt_data;
+
+ if (crypt_data == NULL) {
+ /* The space has no crypt data;
+ start encrypting it... */
+ *recheck = fil_crypt_start_encrypting_space(space);
+ crypt_data = space->crypt_data;
+
+ if (crypt_data == NULL) {
+ return false;
+ }
+
+ crypt_data->key_get_latest_version();
+ }
+
+ /* If the key_id in use is not found in the encryption plugin,
+ we cannot continue rotating the tablespace */
+ if (!crypt_data->is_key_found()) {
+ return false;
+ }
+
+ bool need_key_rotation = false;
+ mutex_enter(&crypt_data->mutex);
+
+ do {
+ /* prevent threads from starting to rotate space */
+ if (crypt_data->rotate_state.starting) {
+ /* recheck this space later */
+ *recheck = true;
+ break;
+ }
+
+ /* skip tablespaces that are being stopped */
+ if (space->is_stopping()) {
+ break;
+ }
+
+ if (crypt_data->rotate_state.flushing) {
+ break;
+ }
+
+ /* No need to rotate space if encryption is disabled */
+ if (crypt_data->not_encrypted()) {
+ break;
+ }
+
+ if (crypt_data->key_id != key_state->key_id) {
+ key_state->key_id= crypt_data->key_id;
+ fil_crypt_get_key_state(key_state, crypt_data);
+ }
+
+ need_key_rotation = fil_crypt_needs_rotation(
+ crypt_data,
+ crypt_data->min_key_version,
+ key_state->key_version,
+ key_state->rotate_key_age);
+ } while (0);
+
+ mutex_exit(&crypt_data->mutex);
+ return need_key_rotation;
+}
+
+/***********************************************************************
+Update global statistics with thread statistics
+@param[in,out] state rotation state holding thread-local statistics */
+static void
+fil_crypt_update_total_stat(
+ rotate_thread_t *state)
+{
+ mutex_enter(&crypt_stat_mutex);
+ crypt_stat.pages_read_from_cache +=
+ state->crypt_stat.pages_read_from_cache;
+ crypt_stat.pages_read_from_disk +=
+ state->crypt_stat.pages_read_from_disk;
+ crypt_stat.pages_modified += state->crypt_stat.pages_modified;
+ crypt_stat.pages_flushed += state->crypt_stat.pages_flushed;
+ // remove old estimate
+ crypt_stat.estimated_iops -= state->crypt_stat.estimated_iops;
+ // add new estimate
+ crypt_stat.estimated_iops += state->estimated_max_iops;
+ mutex_exit(&crypt_stat_mutex);
+
+ // reset the thread-local counters
+ memset(&state->crypt_stat, 0, sizeof(state->crypt_stat));
+ // remember the estimate we just published, so that it can be
+ // subtracted from the global total on the next update
+ state->crypt_stat.estimated_iops = state->estimated_max_iops;
+}
+
+/***********************************************************************
+Allocate iops to thread from global setting,
+used before starting to rotate a space.
+@param[in,out] state Rotation state
+@return true if allocation succeeded, false if failed */
+static
+bool
+fil_crypt_alloc_iops(
+ rotate_thread_t *state)
+{
+ ut_ad(state->allocated_iops == 0);
+
+ /* We have not yet selected the space to rotate, thus
+ state might not contain space and we can't check
+ its status yet. */
+
+ uint max_iops = state->estimated_max_iops;
+ mutex_enter(&fil_crypt_threads_mutex);
+
+ if (n_fil_crypt_iops_allocated >= srv_n_fil_crypt_iops) {
+ /* this can happen when the user decreases srv_n_fil_crypt_iops */
+ mutex_exit(&fil_crypt_threads_mutex);
+ return false;
+ }
+
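+ /* grant the remaining global iops budget, capped at the
+ thread's own estimated maximum */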
+ uint alloc = srv_n_fil_crypt_iops - n_fil_crypt_iops_allocated;
+
+ if (alloc > max_iops) {
+ alloc = max_iops;
+ }
+
+ n_fil_crypt_iops_allocated += alloc;
+ mutex_exit(&fil_crypt_threads_mutex);
+
+ state->allocated_iops = alloc;
+
+ return alloc > 0;
+}
+
+/***********************************************************************
+Reallocate iops to thread,
+used when inside a space
+@param[in,out] state Rotation state */
+static
+void
+fil_crypt_realloc_iops(
+ rotate_thread_t *state)
+{
+ ut_a(state->allocated_iops > 0);
+
+ if (10 * state->cnt_waited > state->batch) {
+ /* if we waited more than 10% re-estimate max_iops */
+ ulint avg_wait_time_us =
+ ulint(state->sum_waited_us / state->cnt_waited);
+
+ if (avg_wait_time_us == 0) {
+ avg_wait_time_us = 1; // prevent division by zero
+ }
+
+ DBUG_PRINT("ib_crypt",
+ ("thr_no: %u - update estimated_max_iops from %u to "
+ ULINTPF ".",
+ state->thread_no,
+ state->estimated_max_iops,
+ 1000000 / avg_wait_time_us));
+
+ state->estimated_max_iops = uint(1000000 / avg_wait_time_us);
+ state->cnt_waited = 0;
+ state->sum_waited_us = 0;
+ } else {
+ DBUG_PRINT("ib_crypt",
+ ("thr_no: %u only waited " ULINTPF
+ "%% skip re-estimate.",
+ state->thread_no,
+ (100 * state->cnt_waited)
+ / (state->batch ? state->batch : 1)));
+ }
+
+ if (state->estimated_max_iops <= state->allocated_iops) {
+ /* return extra iops */
+ uint extra = state->allocated_iops - state->estimated_max_iops;
+
+ if (extra > 0) {
+ mutex_enter(&fil_crypt_threads_mutex);
+ if (n_fil_crypt_iops_allocated < extra) {
+ /* unknown bug!
+ * crash in debug
+ * keep n_fil_crypt_iops_allocated unchanged
+ * in release */
+ ut_ad(0);
+ extra = 0;
+ }
+ n_fil_crypt_iops_allocated -= extra;
+ state->allocated_iops -= extra;
+
+ if (state->allocated_iops == 0) {
+ /* no matter how slow io system seems to be
+ * never decrease allocated_iops to 0... */
+ state->allocated_iops ++;
+ n_fil_crypt_iops_allocated ++;
+ }
+
+ os_event_set(fil_crypt_threads_event);
+ mutex_exit(&fil_crypt_threads_mutex);
+ }
+ } else {
+ /* see if there are more to get */
+ mutex_enter(&fil_crypt_threads_mutex);
+ if (n_fil_crypt_iops_allocated < srv_n_fil_crypt_iops) {
+ /* there are extra iops free */
+ uint extra = srv_n_fil_crypt_iops -
+ n_fil_crypt_iops_allocated;
+ if (state->allocated_iops + extra >
+ state->estimated_max_iops) {
+ /* but don't alloc more than our max */
+ extra = state->estimated_max_iops -
+ state->allocated_iops;
+ }
+ n_fil_crypt_iops_allocated += extra;
+ state->allocated_iops += extra;
+
+ DBUG_PRINT("ib_crypt",
+ ("thr_no: %u increased iops from %u to %u.",
+ state->thread_no,
+ state->allocated_iops - extra,
+ state->allocated_iops));
+
+ }
+ mutex_exit(&fil_crypt_threads_mutex);
+ }
+
+ fil_crypt_update_total_stat(state);
+}
+
+/** Release excess allocated iops
+@param state rotation state
+@param wake whether to wake up other threads */
+static void fil_crypt_return_iops(rotate_thread_t *state, bool wake= true)
+{
+ if (state->allocated_iops > 0) {
+ uint iops = state->allocated_iops;
+ mutex_enter(&fil_crypt_threads_mutex);
+ if (n_fil_crypt_iops_allocated < iops) {
+ /* unknown bug!
+ * crash in debug
+ * keep n_fil_crypt_iops_allocated unchanged
+ * in release */
+ ut_ad(0);
+ iops = 0;
+ }
+
+ n_fil_crypt_iops_allocated -= iops;
+ state->allocated_iops = 0;
+ if (wake) {
+ os_event_set(fil_crypt_threads_event);
+ }
+ mutex_exit(&fil_crypt_threads_mutex);
+ }
+
+ fil_crypt_update_total_stat(state);
+}
+
+/** Acquire a tablespace reference.
+@return whether a tablespace reference was successfully acquired */
+inline bool fil_space_t::acquire_if_not_stopped()
+{
+ ut_ad(mutex_own(&fil_system.mutex));
+ const uint32_t n= acquire_low();
+ if (UNIV_LIKELY(!(n & (STOPPING | CLOSING))))
+ return true;
+ if (UNIV_UNLIKELY(n & STOPPING))
+ return false;
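+ /* the file is marked CLOSING; prepare(true) reopens it */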
+ return UNIV_LIKELY(!(n & CLOSING)) || prepare(true);
+}
+
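+/** @return whether encryption threads should iterate the
+default_encrypt_tables list instead of all tablespaces */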
+bool fil_crypt_must_default_encrypt()
+{
+ return !srv_fil_crypt_rotate_key_age || !srv_encrypt_rotate;
+}
+
+/** Return the next tablespace from default_encrypt_tables list.
+@param space previous tablespace (nullptr to start from the beginning)
+@param recheck whether the removal condition needs to be rechecked after
+the encryption parameters were changed
+@param encrypt expected state of innodb_encrypt_tables
+@return the next tablespace to process (n_pending_ops incremented)
+@retval fil_system.temp_space if there is no work to do
+@retval nullptr upon reaching the end of the iteration */
+inline fil_space_t *fil_system_t::default_encrypt_next(fil_space_t *space,
+ bool recheck,
+ bool encrypt)
+{
+ ut_ad(mutex_own(&mutex));
+
+ sized_ilist<fil_space_t, rotation_list_tag_t>::iterator it=
+ space && space->is_in_default_encrypt
+ ? space
+ : default_encrypt_tables.begin();
+ const sized_ilist<fil_space_t, rotation_list_tag_t>::iterator end=
+ default_encrypt_tables.end();
+
+ if (space)
+ {
+ const bool released= !space->release();
+
+ if (space->is_in_default_encrypt)
+ {
+ while (++it != end &&
+ (!UT_LIST_GET_LEN(it->chain) || it->is_stopping()));
+
+ /* If one of the encryption threads already started
+ the encryption of the table then don't remove the
+ unencrypted spaces from default encrypt list.
+
+ If there is a change in innodb_encrypt_tables variables
+ value then don't remove the last processed tablespace
+ from the default encrypt list. */
+ if (released && !recheck && fil_crypt_must_remove(*space))
+ {
+ ut_a(!default_encrypt_tables.empty());
+ default_encrypt_tables.remove(*space);
+ space->is_in_default_encrypt= false;
+ }
+ }
+ }
+ else while (it != end &&
+ (!UT_LIST_GET_LEN(it->chain) || it->is_stopping()))
+ {
+ /* Find the next suitable default encrypt table if
+ beginning of default_encrypt_tables list has been scheduled
+ to be deleted */
+ it++;
+ }
+
+ if (it == end)
+ return temp_space;
+
+ do
+ {
+ space= &*it;
+ if (space->acquire_if_not_stopped())
+ return space;
+ if (++it == end)
+ return nullptr;
+ }
+ while (!UT_LIST_GET_LEN(it->chain) || it->is_stopping());
+
+ return nullptr;
+}
+
+/** Determine the next tablespace for encryption key rotation.
+@param space current tablespace (nullptr to start from the beginning)
+@param recheck whether the removal condition needs to be rechecked after
+encryption parameters were changed
+@param encrypt expected state of innodb_encrypt_tables
+@return the next tablespace
+@retval fil_system.temp_space if there is no work to do
+@retval nullptr upon reaching the end of the iteration */
+inline fil_space_t *fil_space_t::next(fil_space_t *space, bool recheck,
+ bool encrypt)
+{
+ mutex_enter(&fil_system.mutex);
+
+ if (fil_crypt_must_default_encrypt())
+ space= fil_system.default_encrypt_next(space, recheck, encrypt);
+ else
+ {
+ if (!space)
+ space= UT_LIST_GET_FIRST(fil_system.space_list);
+ else
+ {
+ /* Move on to the next fil_space_t */
+ space->release();
+ space= UT_LIST_GET_NEXT(space_list, space);
+ }
+
+ for (; space; space= UT_LIST_GET_NEXT(space_list, space))
+ {
+ if (space->purpose != FIL_TYPE_TABLESPACE)
+ continue;
+ const uint32_t n= space->acquire_low();
+ if (UNIV_LIKELY(!(n & (STOPPING | CLOSING))))
+ break;
+ if (!(n & STOPPING) && space->prepare(true))
+ break;
+ }
+ }
+
+ mutex_exit(&fil_system.mutex);
+ return space;
+}
+
+/** Search for a space needing rotation
+@param[in,out] key_state Key state
+@param[in,out] state Rotation state
+@param[in,out] recheck whether the tablespace needs to be rechecked
+ later (e.g. an encryption thread is still writing page 0) */
+static bool fil_crypt_find_space_to_rotate(
+ key_state_t* key_state,
+ rotate_thread_t* state,
+ bool* recheck)
+{
+ /* we need iops to start rotating */
+ while (!state->should_shutdown() && !fil_crypt_alloc_iops(state)) {
+ if (state->space && state->space->is_stopping()) {
+ state->space->release();
+ state->space = NULL;
+ }
+
+ os_event_reset(fil_crypt_threads_event);
+ os_event_wait_time(fil_crypt_threads_event, 100000);
+ }
+
+ if (state->should_shutdown()) {
+ if (state->space) {
+ state->space->release();
+ state->space = NULL;
+ }
+ return false;
+ }
+
+ if (state->first) {
+ state->first = false;
+ if (state->space) {
+ state->space->release();
+ }
+ state->space = NULL;
+ }
+
+ bool wake;
+ for (;;) {
+ state->space = fil_space_t::next(state->space, *recheck,
+ key_state->key_version != 0);
+ wake = state->should_shutdown();
+
+ if (state->space == fil_system.temp_space) {
+ goto done;
+ } else if (wake) {
+ break;
+ } else {
+ wake = true;
+ }
+
+ if (!state->space) {
+ break;
+ }
+
+ /* If there is no crypt data and we have not yet read
+ page 0 for this tablespace, we need to read it before
+ we can continue. */
+ if (!state->space->crypt_data) {
+ fil_crypt_read_crypt_data(state->space);
+ }
+
+ if (fil_crypt_space_needs_rotation(state, key_state, recheck)) {
+ ut_ad(key_state->key_id);
+ /* init state->min_key_version_found before
+ * starting on a space */
+ state->min_key_version_found = key_state->key_version;
+ return true;
+ }
+ }
+
+ if (state->space) {
+ state->space->release();
+done:
+ state->space = NULL;
+ }
+
+ /* no work to do; release our allocation of I/O capacity */
+ fil_crypt_return_iops(state, wake);
+
+ return false;
+}
+
+/***********************************************************************
+Start rotating a space
+@param[in] key_state Key state
+@param[in,out] state Rotation state */
+static
+void
+fil_crypt_start_rotate_space(
+ const key_state_t* key_state,
+ rotate_thread_t* state)
+{
+ fil_space_crypt_t *crypt_data = state->space->crypt_data;
+
+ ut_ad(crypt_data);
+ mutex_enter(&crypt_data->mutex);
+ ut_ad(key_state->key_id == crypt_data->key_id);
+
+ if (crypt_data->rotate_state.active_threads == 0) {
+ /* only first thread needs to init */
+ crypt_data->rotate_state.next_offset = 1; // skip page 0
+ /* no need to rotate beyond current max
+ * if space extends, it will be encrypted with newer version */
+ /* FIXME: max_offset could be removed and instead
+ space->size consulted.*/
+ crypt_data->rotate_state.max_offset = state->space->size;
+ crypt_data->rotate_state.end_lsn = 0;
+ crypt_data->rotate_state.min_key_version_found =
+ key_state->key_version;
+
+ crypt_data->rotate_state.start_time = time(0);
+
+ if (crypt_data->type == CRYPT_SCHEME_UNENCRYPTED &&
+ crypt_data->is_encrypted() &&
+ key_state->key_version != 0) {
+ /* this is rotation unencrypted => encrypted */
+ crypt_data->type = CRYPT_SCHEME_1;
+ }
+ }
+
+ /* count active threads in space */
+ crypt_data->rotate_state.active_threads++;
+
+ /* Initialize thread local state */
+ state->end_lsn = crypt_data->rotate_state.end_lsn;
+ state->min_key_version_found =
+ crypt_data->rotate_state.min_key_version_found;
+
+ mutex_exit(&crypt_data->mutex);
+}
+
+/***********************************************************************
+Search for batch of pages needing rotation
+@param[in] key_state Key state
+@param[in,out] state Rotation state
+@return true if page needing key rotation found, false if not found */
+static
+bool
+fil_crypt_find_page_to_rotate(
+ const key_state_t* key_state,
+ rotate_thread_t* state)
+{
+ ulint batch = srv_alloc_time * state->allocated_iops;
+ fil_space_t* space = state->space;
+
+ ut_ad(!space || space->referenced());
+
+ /* If space is marked to be dropped stop rotation. */
+ if (!space || space->is_stopping()) {
+ return false;
+ }
+
+ fil_space_crypt_t *crypt_data = space->crypt_data;
+
+ mutex_enter(&crypt_data->mutex);
+ ut_ad(key_state->key_id == crypt_data->key_id);
+
+ bool found = crypt_data->rotate_state.max_offset >=
+ crypt_data->rotate_state.next_offset;
+
+ if (found) {
+ state->offset = crypt_data->rotate_state.next_offset;
+ ulint remaining = crypt_data->rotate_state.max_offset -
+ crypt_data->rotate_state.next_offset;
+
+ if (batch <= remaining) {
+ state->batch = batch;
+ } else {
+ state->batch = remaining;
+ }
+ }
+
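+ /* claim this batch by advancing next_offset, so that other
+ threads continue from the end of our range */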
+ crypt_data->rotate_state.next_offset += uint32_t(batch);
+ mutex_exit(&crypt_data->mutex);
+ return found;
+}
+
+#define fil_crypt_get_page_throttle(state,offset,mtr,sleeptime_ms) \
+ fil_crypt_get_page_throttle_func(state, offset, mtr, \
+ sleeptime_ms, __FILE__, __LINE__)
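+/* The macro forwards the caller's source location to
+buf_page_get_gen() for buffer pool diagnostics. */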
+
+/***********************************************************************
+Get a page and compute sleep time
+@param[in,out] state Rotation state
+@param[in] offset Page offset
+@param[in,out] mtr Mini-transaction
+@param[out] sleeptime_ms Sleep time
+@param[in] file File where called
+@param[in] line Line where called
+@return page or NULL */
+static
+buf_block_t*
+fil_crypt_get_page_throttle_func(
+ rotate_thread_t* state,
+ uint32_t offset,
+ mtr_t* mtr,
+ ulint* sleeptime_ms,
+ const char* file,
+ unsigned line)
+{
+ fil_space_t* space = state->space;
+ const ulint zip_size = space->zip_size();
+ const page_id_t page_id(space->id, offset);
+ ut_ad(space->referenced());
+
+ /* Before reading from tablespace we need to make sure that
+ the tablespace is not about to be dropped. */
+ if (space->is_stopping()) {
+ return NULL;
+ }
+
+ dberr_t err = DB_SUCCESS;
+ buf_block_t* block = buf_page_get_gen(page_id, zip_size, RW_X_LATCH,
+ NULL,
+ BUF_PEEK_IF_IN_POOL, file, line,
+ mtr, &err);
+ if (block != NULL) {
+ /* page was in buffer pool */
+ state->crypt_stat.pages_read_from_cache++;
+ return block;
+ }
+
+ if (space->is_stopping()) {
+ return NULL;
+ }
+
+ if (fseg_page_is_free(space, offset)) {
+ /* page is already freed */
+ return NULL;
+ }
+
+ state->crypt_stat.pages_read_from_disk++;
+
+ const ulonglong start = my_interval_timer();
+ block = buf_page_get_gen(page_id, zip_size,
+ RW_X_LATCH,
+ NULL, BUF_GET_POSSIBLY_FREED,
+ file, line, mtr, &err);
+ const ulonglong end = my_interval_timer();
+
+ state->cnt_waited++;
+
+ if (end > start) {
+ state->sum_waited_us += (end - start) / 1000;
+ }
+
+ /* average page load */
+ ulint add_sleeptime_ms = 0;
+ ulint avg_wait_time_us = ulint(state->sum_waited_us / state->cnt_waited);
+ ulint alloc_wait_us = 1000000 / state->allocated_iops;
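+ /* alloc_wait_us is the time budget per page implied by the
+ allocated iops */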
+
+ if (avg_wait_time_us < alloc_wait_us) {
+ /* we are reading faster than our iops allocation allows */
+ add_sleeptime_ms = (alloc_wait_us - avg_wait_time_us) / 1000;
+ } else {
+ /* if page load time is longer than we want, skip sleeping */
+ }
+
+ *sleeptime_ms += add_sleeptime_ms;
+
+ return block;
+}
+
+/***********************************************************************
+Rotate one page
+@param[in,out] key_state Key state
+@param[in,out] state Rotation state */
+static
+void
+fil_crypt_rotate_page(
+ const key_state_t* key_state,
+ rotate_thread_t* state)
+{
+ fil_space_t* space = state->space;
+ ulint space_id = space->id;
+ uint32_t offset = state->offset;
+ ulint sleeptime_ms = 0;
+ fil_space_crypt_t *crypt_data = space->crypt_data;
+
+ ut_ad(space->referenced());
+ ut_ad(offset > 0);
+
+ /* In fil_crypt_thread where key rotation is done we have
+ acquired space and checked that this space is not yet
+ marked to be dropped. Similarly, in fil_crypt_find_page_to_rotate().
+ Check here also to give DROP TABLE or similar a chance. */
+ if (space->is_stopping()) {
+ return;
+ }
+
+ if (space_id == TRX_SYS_SPACE && offset == TRX_SYS_PAGE_NO) {
+ /* don't encrypt this page; it contains the address of the doublewrite buffer */
+ return;
+ }
+
+ mtr_t mtr;
+ mtr.start();
+ if (buf_block_t* block = fil_crypt_get_page_throttle(state,
+ offset, &mtr,
+ &sleeptime_ms)) {
+ bool modified = false;
+ byte* frame = buf_block_get_frame(block);
+ const lsn_t block_lsn = mach_read_from_8(FIL_PAGE_LSN + frame);
+ uint kv = buf_page_get_key_version(frame, space->flags);
+
+ if (block->page.status == buf_page_t::FREED) {
+ /* Do not modify freed pages to avoid an assertion
+ failure on recovery.*/
+ } else if (block->page.oldest_modification() > 1) {
+ /* Do not unnecessarily touch pages that are
+ already dirty. */
+ } else if (space->is_stopping()) {
+ /* The tablespace is closing (in DROP TABLE or
+ TRUNCATE TABLE or similar): avoid further access */
+ } else if (!kv && !*reinterpret_cast<uint16_t*>
+ (&frame[FIL_PAGE_TYPE])) {
+ /* It looks like this page is not
+ allocated. Because key rotation is accessing
+ pages in a pattern that is unlike the normal
+ B-tree and undo log access pattern, we cannot
+ invoke fseg_page_is_free() here, because that
+ could result in a deadlock. If we invoked
+ fseg_page_is_free() and released the
+ tablespace latch before acquiring block->lock,
+ then the fseg_page_is_free() information
+ could be stale already. */
+
+ /* If the data file was originally created
+ before MariaDB 10.0 or MySQL 5.6, some
+ allocated data pages could carry 0 in
+ FIL_PAGE_TYPE. The FIL_PAGE_TYPE on those
+ pages will be updated in
+ buf_flush_init_for_writing() when the page
+ is modified the next time.
+
+ Also, when the doublewrite buffer pages are
+ allocated on bootstrap in a non-debug build,
+ some dummy pages will be allocated, with 0 in
+ the FIL_PAGE_TYPE. Those pages should be
+ skipped from key rotation forever. */
+ } else if (fil_crypt_needs_rotation(
+ crypt_data,
+ kv,
+ key_state->key_version,
+ key_state->rotate_key_age)) {
+
+ mtr.set_named_space(space);
+ modified = true;
+
+ /* force rotation by dummy updating page */
+ mtr.write<1,mtr_t::FORCED>(*block,
+ &frame[FIL_PAGE_SPACE_ID],
+ frame[FIL_PAGE_SPACE_ID]);
+
+ /* statistics */
+ state->crypt_stat.pages_modified++;
+ } else {
+ if (crypt_data->is_encrypted()) {
+ if (kv < state->min_key_version_found) {
+ state->min_key_version_found = kv;
+ }
+ }
+ }
+
+ mtr.commit();
+ lsn_t end_lsn = mtr.commit_lsn();
+
+
+ if (modified) {
+ /* if we modified page, we take lsn from mtr */
+ ut_a(end_lsn > state->end_lsn);
+ ut_a(end_lsn > block_lsn);
+ state->end_lsn = end_lsn;
+ } else {
+ /* if we did not modify page, check for max lsn */
+ if (block_lsn > state->end_lsn) {
+ state->end_lsn = block_lsn;
+ }
+ }
+ } else {
+ /* If block read failed mtr memo and log should be empty. */
+ ut_ad(!mtr.has_modifications());
+ ut_ad(!mtr.is_dirty());
+ ut_ad(mtr.get_memo()->size() == 0);
+ ut_ad(mtr.get_log()->size() == 0);
+ mtr.commit();
+ }
+
+ if (sleeptime_ms) {
+ os_event_reset(fil_crypt_throttle_sleep_event);
+ os_event_wait_time(fil_crypt_throttle_sleep_event,
+ 1000 * sleeptime_ms);
+ }
+}
+
+/***********************************************************************
+Rotate a batch of pages
+@param[in,out] key_state Key state
+@param[in,out] state Rotation state */
+static
+void
+fil_crypt_rotate_pages(
+ const key_state_t* key_state,
+ rotate_thread_t* state)
+{
+ ulint space_id = state->space->id;
+ uint32_t end = std::min(state->offset + uint32_t(state->batch),
+ state->space->free_limit);
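+ /* pages at or above free_limit have not been initialized,
+ so they need no rotation */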
+
+ ut_ad(state->space->referenced());
+
+ for (; state->offset < end; state->offset++) {
+
+ /* we can't rotate pages in the doublewrite buffer, as
+ * it's not possible to read them due to lots of asserts
+ * in the buffer pool.
+ *
+ * However, since these are only (short-lived) copies of
+ * real pages, they will be updated anyway when the
+ * real page is updated
+ */
+ if (buf_dblwr.is_inside(page_id_t(space_id, state->offset))) {
+ continue;
+ }
+
+ /* If space is marked as stopping, stop rotating
+ pages. */
+ if (state->space->is_stopping()) {
+ break;
+ }
+
+ fil_crypt_rotate_page(key_state, state);
+ }
+}
+
+/***********************************************************************
+Flush rotated pages and then update page 0
+
+@param[in,out] state rotation state */
+static
+void
+fil_crypt_flush_space(
+ rotate_thread_t* state)
+{
+ fil_space_t* space = state->space;
+ fil_space_crypt_t *crypt_data = space->crypt_data;
+
+ ut_ad(space->referenced());
+
+ /* flush tablespace pages so that there are no pages left with old key */
+ lsn_t end_lsn = crypt_data->rotate_state.end_lsn;
+
+ if (end_lsn > 0 && !space->is_stopping()) {
+ ulint sum_pages = 0;
+ const ulonglong start = my_interval_timer();
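+ /* keep flushing until no dirty pages of this tablespace
+ remain in the buffer pool */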
+ while (buf_flush_list_space(space, &sum_pages));
+ if (sum_pages) {
+ const ulonglong end = my_interval_timer();
+
+ state->cnt_waited += sum_pages;
+ state->sum_waited_us += (end - start) / 1000;
+
+ /* statistics */
+ state->crypt_stat.pages_flushed += sum_pages;
+ }
+ }
+
+ if (crypt_data->min_key_version == 0) {
+ crypt_data->type = CRYPT_SCHEME_UNENCRYPTED;
+ }
+
+ if (space->is_stopping()) {
+ return;
+ }
+
+ /* update page 0 */
+ mtr_t mtr;
+ mtr.start();
+
+ if (buf_block_t* block = buf_page_get_gen(
+ page_id_t(space->id, 0), space->zip_size(),
+ RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED,
+ __FILE__, __LINE__, &mtr)) {
+ if (block->page.status != buf_page_t::FREED) {
+ mtr.set_named_space(space);
+ crypt_data->write_page0(block, &mtr);
+ }
+ }
+
+ mtr.commit();
+}
+
+/***********************************************************************
+Complete rotating a space
+@param[in,out] state Rotation state */
+static void fil_crypt_complete_rotate_space(rotate_thread_t* state)
+{
+ fil_space_crypt_t *crypt_data = state->space->crypt_data;
+
+ ut_ad(crypt_data);
+ ut_ad(state->space->referenced());
+
+ /* Space might already be dropped */
+ if (!state->space->is_stopping()) {
+ mutex_enter(&crypt_data->mutex);
+
+ /**
+ * Update crypt data state with state from thread
+ */
+ if (state->min_key_version_found <
+ crypt_data->rotate_state.min_key_version_found) {
+ crypt_data->rotate_state.min_key_version_found =
+ state->min_key_version_found;
+ }
+
+ if (state->end_lsn > crypt_data->rotate_state.end_lsn) {
+ crypt_data->rotate_state.end_lsn = state->end_lsn;
+ }
+
+ ut_a(crypt_data->rotate_state.active_threads > 0);
+ crypt_data->rotate_state.active_threads--;
+ bool last = crypt_data->rotate_state.active_threads == 0;
+
+ /**
+ * check if space is fully done;
+ * when threads shut down, we may "complete" the
+ * iteration before the full space has been scanned
+ */
+ bool done = crypt_data->rotate_state.next_offset >=
+ crypt_data->rotate_state.max_offset;
+
+ /**
+ * we should flush space if we're last thread AND
+ * the iteration is done
+ */
+ bool should_flush = last && done;
+
+ if (should_flush) {
+ /* we're the last active thread */
+ crypt_data->rotate_state.flushing = true;
+ crypt_data->min_key_version =
+ crypt_data->rotate_state.min_key_version_found;
+ mutex_exit(&crypt_data->mutex);
+ fil_crypt_flush_space(state);
+
+ mutex_enter(&crypt_data->mutex);
+ crypt_data->rotate_state.flushing = false;
+ mutex_exit(&crypt_data->mutex);
+ } else {
+ mutex_exit(&crypt_data->mutex);
+ }
+ } else {
+ mutex_enter(&crypt_data->mutex);
+ ut_a(crypt_data->rotate_state.active_threads > 0);
+ crypt_data->rotate_state.active_threads--;
+ mutex_exit(&crypt_data->mutex);
+ }
+}
+
+/*********************************************************************//**
+A thread which monitors global key state and rotates tablespaces accordingly
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(fil_crypt_thread)(void*)
+{
+ mutex_enter(&fil_crypt_threads_mutex);
+ uint thread_no = srv_n_fil_crypt_threads_started;
+ srv_n_fil_crypt_threads_started++;
+ os_event_set(fil_crypt_event); /* signal that we started */
+ mutex_exit(&fil_crypt_threads_mutex);
+
+ /* state of this thread */
+ rotate_thread_t thr(thread_no);
+
+ /* if we find a space that is starting, skip over it and recheck it later */
+ bool recheck = false;
+
+ while (!thr.should_shutdown()) {
+
+ key_state_t new_state;
+
+ while (!thr.should_shutdown()) {
+
+ /* wait for key state changes,
+ * i.e. either a new key version or a
+ * new rotate_key_age */
+ os_event_reset(fil_crypt_threads_event);
+
+ if (os_event_wait_time(fil_crypt_threads_event, 1000000) == 0) {
+ break;
+ }
+
+ if (recheck) {
+ /* check recheck here, after the sleep, so
+ * that we don't busy-loop while one thread
+ * is starting a space */
+ break;
+ }
+ }
+
+ recheck = false;
+ thr.first = true; // restart from first tablespace
+
+ /* iterate all spaces searching for those needing rotation */
+ while (!thr.should_shutdown() &&
+ fil_crypt_find_space_to_rotate(&new_state, &thr, &recheck)) {
+
+ /* we found a space to rotate */
+ fil_crypt_start_rotate_space(&new_state, &thr);
+
+ /* iterate all pages (cooperatively with other threads) */
+ while (!thr.should_shutdown() &&
+ fil_crypt_find_page_to_rotate(&new_state, &thr)) {
+
+ if (!thr.space->is_stopping()) {
+ /* rotate a (set) of pages */
+ fil_crypt_rotate_pages(&new_state, &thr);
+ }
+
+ /* If space is marked as stopping, release
+ space and stop rotation. */
+ if (thr.space->is_stopping()) {
+ fil_crypt_complete_rotate_space(&thr);
+ thr.space->release();
+ thr.space = NULL;
+ break;
+ }
+
+ /* realloc iops */
+ fil_crypt_realloc_iops(&thr);
+ }
+
+ /* complete rotation */
+ if (thr.space) {
+ fil_crypt_complete_rotate_space(&thr);
+ }
+
+ /* force key state refresh */
+ new_state.key_id = 0;
+
+ /* return iops */
+ fil_crypt_return_iops(&thr);
+ }
+ }
+
+ /* return iops if shutting down */
+ fil_crypt_return_iops(&thr);
+
+ /* release current space if shutting down */
+ if (thr.space) {
+ thr.space->release();
+ thr.space = NULL;
+ }
+
+ mutex_enter(&fil_crypt_threads_mutex);
+ srv_n_fil_crypt_threads_started--;
+ os_event_set(fil_crypt_event); /* signal that we stopped */
+ mutex_exit(&fil_crypt_threads_mutex);
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit();
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************
+Adjust thread count for key rotation
+@param[in] new_cnt Number of threads to be used */
+UNIV_INTERN
+void
+fil_crypt_set_thread_cnt(
+ const uint new_cnt)
+{
+ if (!fil_crypt_threads_inited) {
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE)
+ return;
+ fil_crypt_threads_init();
+ }
+
+ mutex_enter(&fil_crypt_threads_mutex);
+
+ if (new_cnt > srv_n_fil_crypt_threads) {
+ uint add = new_cnt - srv_n_fil_crypt_threads;
+ srv_n_fil_crypt_threads = new_cnt;
+ for (uint i = 0; i < add; i++) {
+ ib::info() << "Creating #"
+ << i+1 << " encryption thread id "
+ << os_thread_create(fil_crypt_thread)
+ << " total threads " << new_cnt << ".";
+ }
+ } else if (new_cnt < srv_n_fil_crypt_threads) {
+ srv_n_fil_crypt_threads = new_cnt;
+ os_event_set(fil_crypt_threads_event);
+ }
+
+ mutex_exit(&fil_crypt_threads_mutex);
+
+ while (srv_n_fil_crypt_threads_started != srv_n_fil_crypt_threads) {
+ os_event_reset(fil_crypt_event);
+ os_event_wait_time(fil_crypt_event, 100000);
+ }
+
+ /* Send a message to encryption threads that there could be
+ something to do. */
+ if (srv_n_fil_crypt_threads) {
+ os_event_set(fil_crypt_threads_event);
+ }
+}
+
+/** Initialize the tablespace default_encrypt_tables
+if innodb_encryption_rotate_key_age=0. */
+static void fil_crypt_default_encrypt_tables_fill()
+{
+ ut_ad(mutex_own(&fil_system.mutex));
+
+ for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list);
+ space != NULL;
+ space = UT_LIST_GET_NEXT(space_list, space)) {
+ if (space->purpose != FIL_TYPE_TABLESPACE
+ || space->is_in_default_encrypt
+ || UT_LIST_GET_LEN(space->chain) == 0
+ || !space->acquire_if_not_stopped()) {
+ continue;
+ }
+
+ /* Ensure that crypt_data has been initialized. */
+ ut_ad(space->size);
+
+ /* Skip ENCRYPTION!=DEFAULT tablespaces. */
+ if (space->crypt_data
+ && !space->crypt_data->is_default_encryption()) {
+ goto next;
+ }
+
+ if (srv_encrypt_tables) {
+ /* Skip encrypted tablespaces if
+ innodb_encrypt_tables!=OFF */
+ if (space->crypt_data
+ && space->crypt_data->min_key_version) {
+ goto next;
+ }
+ } else {
+ /* Skip unencrypted tablespaces if
+ innodb_encrypt_tables=OFF */
+ if (!space->crypt_data
+ || !space->crypt_data->min_key_version) {
+ goto next;
+ }
+ }
+
+ fil_system.default_encrypt_tables.push_back(*space);
+ space->is_in_default_encrypt = true;
+next:
+ space->release();
+ }
+}
+
+/*********************************************************************
+Adjust max key age
+@param[in] val New max key age */
+UNIV_INTERN
+void
+fil_crypt_set_rotate_key_age(
+ uint val)
+{
+ mutex_enter(&fil_system.mutex);
+ srv_fil_crypt_rotate_key_age = val;
+ if (val == 0) {
+ fil_crypt_default_encrypt_tables_fill();
+ }
+ mutex_exit(&fil_system.mutex);
+ os_event_set(fil_crypt_threads_event);
+}
+
+/*********************************************************************
+Adjust rotation iops
+@param[in] val New max rotation iops */
+UNIV_INTERN
+void
+fil_crypt_set_rotation_iops(
+ uint val)
+{
+ srv_n_fil_crypt_iops = val;
+ os_event_set(fil_crypt_threads_event);
+}
+
+/*********************************************************************
+Adjust encrypt tables
+@param[in] val New setting for innodb-encrypt-tables */
+void fil_crypt_set_encrypt_tables(ulong val)
+{
+ mutex_enter(&fil_system.mutex);
+
+ srv_encrypt_tables = val;
+
+ if (fil_crypt_must_default_encrypt()) {
+ fil_crypt_default_encrypt_tables_fill();
+ }
+
+ mutex_exit(&fil_system.mutex);
+
+ os_event_set(fil_crypt_threads_event);
+}
+
+/*********************************************************************
+Init threads for key rotation */
+UNIV_INTERN
+void
+fil_crypt_threads_init()
+{
+ if (!fil_crypt_threads_inited) {
+ fil_crypt_event = os_event_create(0);
+ fil_crypt_threads_event = os_event_create(0);
+ mutex_create(LATCH_ID_FIL_CRYPT_THREADS_MUTEX,
+ &fil_crypt_threads_mutex);
+
+ uint cnt = srv_n_fil_crypt_threads;
+ srv_n_fil_crypt_threads = 0;
+ fil_crypt_threads_inited = true;
+ fil_crypt_set_thread_cnt(cnt);
+ }
+}
+
+/*********************************************************************
+Clean up key rotation threads resources */
+UNIV_INTERN
+void
+fil_crypt_threads_cleanup()
+{
+ if (!fil_crypt_threads_inited) {
+ return;
+ }
+ ut_a(!srv_n_fil_crypt_threads_started);
+ os_event_destroy(fil_crypt_event);
+ os_event_destroy(fil_crypt_threads_event);
+ mutex_free(&fil_crypt_threads_mutex);
+ fil_crypt_threads_inited = false;
+}
+
+/*********************************************************************
+Wait for crypt threads to stop accessing space
+@param[in] space Tablespace */
+UNIV_INTERN
+void
+fil_space_crypt_close_tablespace(
+ const fil_space_t* space)
+{
+ fil_space_crypt_t* crypt_data = space->crypt_data;
+
+ if (!crypt_data || srv_n_fil_crypt_threads == 0
+ || !fil_crypt_threads_inited) {
+ return;
+ }
+
+ mutex_enter(&fil_crypt_threads_mutex);
+
+ time_t start = time(0);
+ time_t last = start;
+
+ mutex_enter(&crypt_data->mutex);
+ mutex_exit(&fil_crypt_threads_mutex);
+
+ ulint cnt = crypt_data->rotate_state.active_threads;
+ bool flushing = crypt_data->rotate_state.flushing;
+
+ while (cnt > 0 || flushing) {
+ mutex_exit(&crypt_data->mutex);
+ /* release dict mutex so that scrub threads can release their
+ * table references */
+ dict_mutex_exit_for_mysql();
+
+ /* wakeup throttle (all) sleepers */
+ os_event_set(fil_crypt_throttle_sleep_event);
+ os_event_set(fil_crypt_threads_event);
+
+ os_thread_sleep(20000);
+ dict_mutex_enter_for_mysql();
+ mutex_enter(&crypt_data->mutex);
+ cnt = crypt_data->rotate_state.active_threads;
+ flushing = crypt_data->rotate_state.flushing;
+
+ time_t now = time(0);
+
+ if (now >= last + 30) {
+ ib::warn() << "Waited "
+ << now - start
+ << " seconds to drop space: "
+ << space->name << " ("
+ << space->id << ") active threads "
+ << cnt << " flushing="
+ << flushing << ".";
+ last = now;
+ }
+ }
+
+ mutex_exit(&crypt_data->mutex);
+}
+
+/*********************************************************************
+Get crypt status for a space (used by information_schema)
+@param[in] space Tablespace
+@param[out] status Crypt status */
+UNIV_INTERN
+void
+fil_space_crypt_get_status(
+ const fil_space_t* space,
+ struct fil_space_crypt_status_t* status)
+{
+ memset(status, 0, sizeof(*status));
+
+ ut_ad(space->referenced());
+
+ /* If there is no crypt data and we have not yet read
+ page 0 for this tablespace, we need to read it before
+ we can continue. */
+ if (!space->crypt_data) {
+ fil_crypt_read_crypt_data(const_cast<fil_space_t*>(space));
+ }
+
+ status->space = ULINT_UNDEFINED;
+
+ if (fil_space_crypt_t* crypt_data = space->crypt_data) {
+ status->space = space->id;
+ mutex_enter(&crypt_data->mutex);
+ status->scheme = crypt_data->type;
+ status->keyserver_requests = crypt_data->keyserver_requests;
+ status->min_key_version = crypt_data->min_key_version;
+ status->key_id = crypt_data->key_id;
+
+ if (crypt_data->rotate_state.active_threads > 0 ||
+ crypt_data->rotate_state.flushing) {
+ status->rotating = true;
+ status->flushing =
+ crypt_data->rotate_state.flushing;
+ status->rotate_next_page_number =
+ crypt_data->rotate_state.next_offset;
+ status->rotate_max_page_number =
+ crypt_data->rotate_state.max_offset;
+ }
+
+ mutex_exit(&crypt_data->mutex);
+
+ if (srv_encrypt_tables || crypt_data->min_key_version) {
+ status->current_key_version =
+ fil_crypt_get_latest_key_version(crypt_data);
+ }
+ }
+}
+
+/*********************************************************************
+Return crypt statistics
+@param[out] stat Crypt statistics */
+UNIV_INTERN
+void
+fil_crypt_total_stat(
+ fil_crypt_stat_t *stat)
+{
+ mutex_enter(&crypt_stat_mutex);
+ *stat = crypt_stat;
+ mutex_exit(&crypt_stat_mutex);
+}
+
+#endif /* UNIV_INNOCHECKSUM */
+
+/**
+Verify that the post-encryption checksum matches the calculated checksum.
+This function should be called only if the tablespace contains crypt_data
+metadata (a strong indication that the tablespace is encrypted).
+The function also verifies that the traditional checksum does not match
+the calculated checksum, because if it did, the page could equally be a
+valid unencrypted page, an encrypted page, or a corrupted page.
+
+@param[in,out] page page frame (checksum is temporarily modified)
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return true if page is encrypted AND OK, false otherwise */
+bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size)
+{
+ ut_ad(mach_read_from_4(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION));
+
+ /* Compressed and encrypted pages do not have checksum. Assume not
+ corrupted. Page verification happens after decompression in
+ buf_page_read_complete() using buf_page_is_corrupted(). */
+ if (fil_page_get_type(page) == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+ return true;
+ }
+
+ /* Read stored post encryption checksum. */
+ const ib_uint32_t checksum = mach_read_from_4(
+ page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4);
+
+ /* If stored checksum matches one of the calculated checksums
+ page is not corrupted. */
+
+ switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ if (zip_size) {
+ return checksum == page_zip_calc_checksum(
+ page, zip_size, SRV_CHECKSUM_ALGORITHM_CRC32);
+ }
+
+ return checksum == buf_calc_page_crc32(page);
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ /* Starting with MariaDB 10.1.25, 10.2.7, 10.3.1,
+ due to MDEV-12114, fil_crypt_calculate_checksum()
+ is only using CRC32 for the encrypted pages.
+ Due to this, we must treat "strict_none" as "none". */
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ return true;
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ /* Starting with MariaDB 10.1.25, 10.2.7, 10.3.1,
+ due to MDEV-12114, fil_crypt_calculate_checksum()
+ is only using CRC32 for the encrypted pages.
+ Due to this, we must treat "strict_innodb" as "innodb". */
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ if (checksum == BUF_NO_CHECKSUM_MAGIC) {
+ return true;
+ }
+ if (zip_size) {
+ return checksum == page_zip_calc_checksum(
+ page, zip_size,
+ SRV_CHECKSUM_ALGORITHM_CRC32)
+ || checksum == page_zip_calc_checksum(
+ page, zip_size,
+ SRV_CHECKSUM_ALGORITHM_INNODB);
+ }
+
+ return checksum == buf_calc_page_crc32(page)
+ || checksum == buf_calc_page_new_checksum(page);
+ }
+
+ ut_ad("unhandled innodb_checksum_algorithm" == 0);
+ return false;
+}
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
new file mode 100644
index 00000000..a2591dd9
--- /dev/null
+++ b/storage/innobase/fil/fil0fil.cc
@@ -0,0 +1,3757 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2021, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fil/fil0fil.cc
+The tablespace memory cache
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#include "fil0fil.h"
+#include "fil0crypt.h"
+
+#include "btr0btr.h"
+#include "buf0buf.h"
+#include "dict0boot.h"
+#include "dict0dict.h"
+#include "dict0load.h"
+#include "fsp0file.h"
+#include "fsp0fsp.h"
+#include "hash0hash.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "os0file.h"
+#include "page0zip.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "buf0lru.h"
+#include "ibuf0ibuf.h"
+#include "os0event.h"
+#include "sync0sync.h"
+#include "buf0flu.h"
+#ifdef UNIV_LINUX
+# include <sys/types.h>
+# include <sys/sysmacros.h>
+# include <dirent.h>
+#endif
+
+/** Determine if the space id is a user tablespace id or not.
+@param space_id tablespace identifier
+@return true if it is a user tablespace ID */
+inline bool fil_is_user_tablespace_id(ulint space_id)
+{
+ return space_id != TRX_SYS_SPACE && space_id != SRV_TMP_SPACE_ID &&
+ !srv_is_undo_tablespace(space_id);
+}
+
+/** Try to close a file to adhere to the innodb_open_files limit.
+@param print_info whether to diagnose why a file cannot be closed
+@return whether a file was closed */
+bool fil_space_t::try_to_close(bool print_info)
+{
+ ut_ad(mutex_own(&fil_system.mutex));
+ for (fil_space_t *space= UT_LIST_GET_FIRST(fil_system.space_list); space;
+ space= UT_LIST_GET_NEXT(space_list, space))
+ {
+ switch (space->purpose) {
+ case FIL_TYPE_TEMPORARY:
+ continue;
+ case FIL_TYPE_IMPORT:
+ break;
+ case FIL_TYPE_TABLESPACE:
+ if (!fil_is_user_tablespace_id(space->id))
+ continue;
+ }
+
+ /* We are using an approximation of LRU replacement policy. In
+ fil_node_open_file_low(), newly opened files are moved to the end
+ of fil_system.space_list, so that they would be less likely to be
+ closed here. */
+ fil_node_t *node= UT_LIST_GET_FIRST(space->chain);
+ ut_ad(node);
+ ut_ad(!UT_LIST_GET_NEXT(chain, node));
+
+ if (!node->is_open())
+ continue;
+
+ if (const auto n= space->set_closing())
+ {
+ if (print_info)
+ ib::info() << "Cannot close file " << node->name
+ << " because of "
+ << (n & PENDING)
+ << ((n & NEEDS_FSYNC)
+ ? " pending operations and pending fsync"
+ : " pending operations");
+ continue;
+ }
+
+ node->close();
+ return true;
+ }
+
+ return false;
+}
+
+/** Rename a single-table tablespace.
+The tablespace must exist in the memory cache.
+@param[in] id tablespace identifier
+@param[in] old_path old file name
+@param[in] new_name new table name in the
+databasename/tablename format
+@param[in] new_path_in new file name,
+or NULL if it is located in the normal data directory
+@return true if success */
+static bool
+fil_rename_tablespace(
+ ulint id,
+ const char* old_path,
+ const char* new_name,
+ const char* new_path_in);
+
+/*
+ IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE
+ =============================================
+
+The tablespace cache is responsible for providing fast read/write access to
+tablespaces and logs of the database. File creation and deletion is done
+in other modules which know more of the logic of the operation, however.
+
+A tablespace consists of a chain of files. The size of the files does not
+have to be divisible by the database block size, because we may just leave
+the last incomplete block unused. When a new file is appended to the
+tablespace, the maximum size of the file is also specified. At the moment,
+we think that it is best to extend the file to its maximum size already at
+the creation of the file, because then we can avoid dynamically extending
+the file when more space is needed for the tablespace.
+
+A block's position in the tablespace is specified with a 32-bit unsigned
+integer. The files in the chain are thought to be catenated, and the block
+corresponding to an address n is the nth block in the catenated file (where
+the first block is named the 0th block, and the incomplete block fragments
+at the end of files are not taken into account). A tablespace can be extended
+by appending a new file at the end of the chain.
+
+Our tablespace concept is similar to the one of Oracle.
+
+To acquire more speed in disk transfers, a technique called disk striping is
+sometimes used. This means that logical block addresses are divided in a
+round-robin fashion across several disks. Windows NT supports disk striping,
+so there we do not need to support it in the database. Disk striping is
+implemented in hardware in RAID disks. We conclude that it is not necessary
+to implement it in the database. Oracle 7 does not support disk striping,
+either.
+
+Another trick used at some database sites is replacing tablespace files by
+raw disks, that is, the whole physical disk drive, or a partition of it, is
+opened as a single file, and it is accessed through byte offsets calculated
+from the start of the disk or the partition. This is recommended in some
+books on database tuning to achieve more speed in i/o. Using raw disk
+certainly prevents the OS from fragmenting disk space, but it is not clear
+if it really adds speed. We measured on the Pentium 100 MHz + NT + NTFS file
+system + EIDE Conner disk only a negligible difference in speed when reading
+from a file, versus reading from a raw disk.
+
+To have fast access to a tablespace or a log file, we put the data structures
+into a hash table. Each tablespace and log file is given a unique 32-bit
+identifier. */
+
+/** Reference to the server data directory. Usually it is the
+current working directory ".", but in the MySQL Embedded Server Library
+it is an absolute path. */
+const char* fil_path_to_mysql_datadir;
+
+/** Common InnoDB file extensions */
+const char* dot_ext[] = { "", ".ibd", ".isl", ".cfg" };
+
+/** Number of pending tablespace flushes */
+Atomic_counter<ulint> fil_n_pending_tablespace_flushes;
+
+/** The tablespace memory cache. This variable is NULL before the module is
+initialized. */
+fil_system_t fil_system;
+
+/** At this age or older a space/page will be rotated */
+UNIV_INTERN extern uint srv_fil_crypt_rotate_key_age;
+
+#ifdef UNIV_DEBUG
+/** Try fil_validate() every this many times */
+# define FIL_VALIDATE_SKIP 17
+
+/******************************************************************//**
+Checks the consistency of the tablespace cache some of the time.
+@return true if ok or the check was skipped */
+static
+bool
+fil_validate_skip(void)
+/*===================*/
+{
+ /** The fil_validate() call skip counter. */
+ static Atomic_counter<uint32_t> fil_validate_count;
+
+ /* We want to reduce the call frequency of the costly fil_validate()
+ check in debug builds. */
+ return (fil_validate_count++ % FIL_VALIDATE_SKIP) || fil_validate();
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Returns the table space by a given id, NULL if not found.
+It is unsafe to dereference the returned pointer. It is fine to check
+for NULL. */
+fil_space_t*
+fil_space_get_by_id(
+/*================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system.is_initialised());
+ ut_ad(mutex_own(&fil_system.mutex));
+
+ HASH_SEARCH(hash, &fil_system.spaces, id,
+ fil_space_t*, space,
+ ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
+ space->id == id);
+
+ return(space);
+}
+
+/** Look up a tablespace.
+The caller should hold an InnoDB table lock or a MDL that prevents
+the tablespace from being dropped during the operation,
+or the caller should be in single-threaded crash recovery mode
+(no user connections that could drop tablespaces).
+Normally, fil_space_t::get() should be used instead.
+@param[in] id tablespace ID
+@return tablespace, or NULL if not found */
+fil_space_t*
+fil_space_get(
+ ulint id)
+{
+ mutex_enter(&fil_system.mutex);
+ fil_space_t* space = fil_space_get_by_id(id);
+ mutex_exit(&fil_system.mutex);
+ return(space);
+}
+
+/** Validate the compression algorithm for full crc32 format.
+@param[in] space tablespace object
+@return whether the compression algorithm is supported */
+static bool fil_comp_algo_validate(const fil_space_t* space)
+{
+ if (!space->full_crc32()) {
+ return true;
+ }
+
+ DBUG_EXECUTE_IF("fil_comp_algo_validate_fail",
+ return false;);
+
+ ulint comp_algo = space->get_compression_algo();
+ switch (comp_algo) {
+ case PAGE_UNCOMPRESSED:
+ case PAGE_ZLIB_ALGORITHM:
+#ifdef HAVE_LZ4
+ case PAGE_LZ4_ALGORITHM:
+#endif /* HAVE_LZ4 */
+#ifdef HAVE_LZO
+ case PAGE_LZO_ALGORITHM:
+#endif /* HAVE_LZO */
+#ifdef HAVE_LZMA
+ case PAGE_LZMA_ALGORITHM:
+#endif /* HAVE_LZMA */
+#ifdef HAVE_BZIP2
+ case PAGE_BZIP2_ALGORITHM:
+#endif /* HAVE_BZIP2 */
+#ifdef HAVE_SNAPPY
+ case PAGE_SNAPPY_ALGORITHM:
+#endif /* HAVE_SNAPPY */
+ return true;
+ }
+
+ return false;
+}
+
+/** Append a file to the chain of files of a space.
+@param[in] name file name of a file that is not open
+@param[in] handle file handle, or OS_FILE_CLOSED
+@param[in] size file size in entire database pages
+@param[in] is_raw whether this is a raw device
+@param[in] atomic_write true if atomic write could be enabled
+@param[in] max_pages maximum number of pages in file,
+or UINT32_MAX for unlimited
+@return file object */
+fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle,
+ uint32_t size, bool is_raw, bool atomic_write,
+ uint32_t max_pages)
+{
+ fil_node_t* node;
+
+ ut_ad(name != NULL);
+ ut_ad(fil_system.is_initialised());
+
+ node = reinterpret_cast<fil_node_t*>(ut_zalloc_nokey(sizeof(*node)));
+
+ node->handle = handle;
+
+ node->name = mem_strdup(name);
+
+ ut_a(!is_raw || srv_start_raw_disk_in_use);
+
+ node->is_raw_disk = is_raw;
+
+ node->size = size;
+
+ node->magic_n = FIL_NODE_MAGIC_N;
+
+ node->init_size = size;
+ node->max_size = max_pages;
+
+ node->space = this;
+
+ node->atomic_write = atomic_write;
+
+ mutex_enter(&fil_system.mutex);
+ this->size += size;
+ UT_LIST_ADD_LAST(chain, node);
+ if (node->is_open()) {
+ n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
+ if (++fil_system.n_open >= srv_max_n_open_files) {
+ reacquire();
+ try_to_close(true);
+ release();
+ }
+ }
+ mutex_exit(&fil_system.mutex);
+
+ return node;
+}
+
+/** Open a tablespace file.
+@param node data file
+@return whether the file was successfully opened */
+static bool fil_node_open_file_low(fil_node_t *node)
+{
+ ut_ad(!node->is_open());
+ ut_ad(node->space->is_closing());
+ ut_ad(mutex_own(&fil_system.mutex));
+ ulint type;
+ static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility");
+ switch (FSP_FLAGS_GET_ZIP_SSIZE(node->space->flags)) {
+ case 1:
+ case 2:
+ type= OS_DATA_FILE_NO_O_DIRECT;
+ break;
+ default:
+ type= OS_DATA_FILE;
+ }
+
+ for (;;)
+ {
+ bool success;
+ node->handle= os_file_create(innodb_data_file_key, node->name,
+ node->is_raw_disk
+ ? OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT
+ : OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
+ OS_FILE_AIO, type,
+ srv_read_only_mode, &success);
+ if (success)
+ break;
+
+ /* The following call prints an error message */
+ if (os_file_get_last_error(true) == EMFILE + 100 &&
+ fil_space_t::try_to_close(true))
+ continue;
+
+ ib::warn() << "Cannot open '" << node->name << "'.";
+ return false;
+ }
+
+ if (!node->size &&
+ (!node->read_page0() || !fil_comp_algo_validate(node->space)))
+ {
+ os_file_close(node->handle);
+ node->handle= OS_FILE_CLOSED;
+ return false;
+ }
+
+ ut_ad(node->is_open());
+
+ if (UNIV_LIKELY(!fil_system.freeze_space_list))
+ {
+ /* Move the file last in fil_system.space_list, so that
+ fil_space_t::try_to_close() should close it as a last resort. */
+ UT_LIST_REMOVE(fil_system.space_list, node->space);
+ UT_LIST_ADD_LAST(fil_system.space_list, node->space);
+ }
+
+ fil_system.n_open++;
+ return true;
+}
+
+/** Open a tablespace file.
+@param node data file
+@return whether the file was successfully opened */
+static bool fil_node_open_file(fil_node_t *node)
+{
+ ut_ad(mutex_own(&fil_system.mutex));
+ ut_ad(!node->is_open());
+ ut_ad(fil_is_user_tablespace_id(node->space->id) ||
+ srv_operation == SRV_OPERATION_BACKUP ||
+ srv_operation == SRV_OPERATION_RESTORE ||
+ srv_operation == SRV_OPERATION_RESTORE_DELTA);
+ ut_ad(node->space->purpose != FIL_TYPE_TEMPORARY);
+ ut_ad(node->space->referenced());
+
+ for (ulint count= 0; fil_system.n_open >= srv_max_n_open_files; count++)
+ {
+ if (fil_space_t::try_to_close(count > 1))
+ count= 0;
+ else if (count >= 2)
+ {
+ ib::warn() << "innodb_open_files=" << srv_max_n_open_files
+ << " is exceeded (" << fil_system.n_open
+ << ") files stay open)";
+ break;
+ }
+ else
+ {
+ mutex_exit(&fil_system.mutex);
+ os_thread_sleep(20000);
+ /* Flush tablespaces so that we can close modified files. */
+ fil_flush_file_spaces();
+ mutex_enter(&fil_system.mutex);
+ }
+ }
+
+ return fil_node_open_file_low(node);
+}
+
+/** Close the file handle. */
+void fil_node_t::close()
+{
+ prepare_to_close_or_detach();
+
+ /* printf("Closing file %s\n", name); */
+ int ret= os_file_close(handle);
+ ut_a(ret);
+ handle= OS_FILE_CLOSED;
+}
+
+pfs_os_file_t fil_node_t::detach()
+{
+ prepare_to_close_or_detach();
+
+ pfs_os_file_t result= handle;
+ handle= OS_FILE_CLOSED;
+ return result;
+}
+
+void fil_node_t::prepare_to_close_or_detach()
+{
+ ut_ad(mutex_own(&fil_system.mutex));
+ ut_ad(space->is_ready_to_close() || srv_operation == SRV_OPERATION_BACKUP ||
+ srv_operation == SRV_OPERATION_RESTORE_DELTA);
+ ut_a(is_open());
+ ut_a(!being_extended);
+ ut_a(space->is_ready_to_close() || space->purpose == FIL_TYPE_TEMPORARY ||
+ srv_fast_shutdown == 2 || !srv_was_started);
+
+ ut_a(fil_system.n_open > 0);
+ fil_system.n_open--;
+}
+
+/** Flush any writes cached by the file system. */
+void fil_space_t::flush_low()
+{
+ ut_ad(!mutex_own(&fil_system.mutex));
+
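+ /* Try to set NEEDS_FSYNC, starting from the expected state of a
+ single pending reference (ours). If STOPPING is set, give up; if
+ NEEDS_FSYNC is already set, proceed to flush anyway. */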
+ uint32_t n= 1;
+ while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC,
+ std::memory_order_acquire,
+ std::memory_order_relaxed))
+ {
+ ut_ad(n & PENDING);
+ if (n & STOPPING)
+ return;
+ if (n & NEEDS_FSYNC)
+ break;
+ }
+
+ fil_n_pending_tablespace_flushes++;
+ for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ {
+ if (!node->is_open())
+ {
+ ut_ad(!is_in_unflushed_spaces);
+ continue;
+ }
+ IF_WIN(if (node->is_raw_disk) continue,);
+ os_file_flush(node->handle);
+ }
+
+ if (is_in_unflushed_spaces)
+ {
+ mutex_enter(&fil_system.mutex);
+ if (is_in_unflushed_spaces)
+ {
+ is_in_unflushed_spaces= false;
+ fil_system.unflushed_spaces.remove(*this);
+ }
+ mutex_exit(&fil_system.mutex);
+ }
+
+ clear_flush();
+ fil_n_pending_tablespace_flushes--;
+}
+
+/** Try to extend a tablespace.
+@param[in,out] space tablespace to be extended
+@param[in,out] node last file of the tablespace
+@param[in] size desired size in number of pages
+@param[out] success whether the operation succeeded
+@return whether the operation should be retried */
+static ATTRIBUTE_COLD __attribute__((warn_unused_result, nonnull))
+bool
+fil_space_extend_must_retry(
+ fil_space_t* space,
+ fil_node_t* node,
+ uint32_t size,
+ bool* success)
+{
+ ut_ad(mutex_own(&fil_system.mutex));
+ ut_ad(UT_LIST_GET_LAST(space->chain) == node);
+ ut_ad(size >= FIL_IBD_FILE_INITIAL_SIZE);
+ ut_ad(node->space == space);
+ ut_ad(space->referenced() || space->is_being_truncated);
+
+ *success = space->size >= size;
+
+ if (*success) {
+ /* Space already big enough */
+ return(false);
+ }
+
+ if (node->being_extended) {
+ /* Another thread is currently extending the file. Wait
+ for it to finish.
+ It'd have been better to use event driven mechanism but
+ the entire module is peppered with polling stuff. */
+ mutex_exit(&fil_system.mutex);
+ os_thread_sleep(100000);
+ return(true);
+ }
+
+ node->being_extended = true;
+
+ /* At this point it is safe to release fil_system.mutex. No
+ other thread can rename, delete, close or extend the file because
+ we have set the node->being_extended flag. */
+ mutex_exit(&fil_system.mutex);
+
+ ut_ad(size >= space->size);
+
+ uint32_t last_page_no = space->size;
+ const uint32_t file_start_page_no = last_page_no - node->size;
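+ /* file_start_page_no is the first page number covered by
+ this (last) file of the tablespace */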
+
+ const unsigned page_size = space->physical_size();
+
+ /* Datafile::read_first_page() expects srv_page_size bytes.
+ fil_node_t::read_page0() expects at least 4 * srv_page_size bytes.*/
+ os_offset_t new_size = std::max(
+ os_offset_t(size - file_start_page_no) * page_size,
+ os_offset_t(FIL_IBD_FILE_INITIAL_SIZE << srv_page_size_shift));
+
+ *success = os_file_set_size(node->name, node->handle, new_size,
+ space->is_compressed());
+
+ os_has_said_disk_full = *success;
+ if (*success) {
+ os_file_flush(node->handle);
+ last_page_no = size;
+ } else {
+ /* Let us measure the size of the file
+ to determine how much we were able to
+ extend it */
+ os_offset_t fsize = os_file_get_size(node->handle);
+ ut_a(fsize != os_offset_t(-1));
+
+ last_page_no = uint32_t(fsize / page_size)
+ + file_start_page_no;
+ }
+ mutex_enter(&fil_system.mutex);
+
+ ut_a(node->being_extended);
+ node->being_extended = false;
+ ut_a(last_page_no - file_start_page_no >= node->size);
+
+ uint32_t file_size = last_page_no - file_start_page_no;
+ space->size += file_size - node->size;
+ node->size = file_size;
+ const uint32_t pages_in_MiB = node->size
+ & ~uint32_t((1U << (20U - srv_page_size_shift)) - 1);
+
+ /* Keep the last data file size info up to date, rounded to
+ full megabytes */
+
+ switch (space->id) {
+ case TRX_SYS_SPACE:
+ srv_sys_space.set_last_file_size(pages_in_MiB);
+ do_flush:
+ space->reacquire();
+ mutex_exit(&fil_system.mutex);
+ space->flush_low();
+ space->release();
+ mutex_enter(&fil_system.mutex);
+ break;
+ default:
+ ut_ad(space->purpose == FIL_TYPE_TABLESPACE
+ || space->purpose == FIL_TYPE_IMPORT);
+ if (space->purpose == FIL_TYPE_TABLESPACE
+ && !space->is_being_truncated) {
+ goto do_flush;
+ }
+ break;
+ case SRV_TMP_SPACE_ID:
+ ut_ad(space->purpose == FIL_TYPE_TEMPORARY);
+ srv_tmp_space.set_last_file_size(pages_in_MiB);
+ break;
+ }
+
+ return false;
+}
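+
+/* Illustrative caller pattern (editor's sketch; see also
+fil_space_extend() below). Whenever the function above returns true,
+it has already released fil_system.mutex, so the caller must
+re-acquire the mutex before retrying; on a false return the mutex is
+still held:
+
+  bool success;
+  mutex_enter(&fil_system.mutex);
+  while (fil_space_extend_must_retry(space, node, size, &success))
+    mutex_enter(&fil_system.mutex);
+  mutex_exit(&fil_system.mutex);
+*/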
+
+/** @return whether the file is usable for io() */
+ATTRIBUTE_COLD bool fil_space_t::prepare(bool have_mutex)
+{
+ ut_ad(referenced());
+ if (!have_mutex)
+ mutex_enter(&fil_system.mutex);
+ ut_ad(mutex_own(&fil_system.mutex));
+ fil_node_t *node= UT_LIST_GET_LAST(chain);
+ ut_ad(!id || purpose == FIL_TYPE_TEMPORARY ||
+ node == UT_LIST_GET_FIRST(chain));
+
+ const bool is_open= node && (node->is_open() || fil_node_open_file(node));
+
+ if (!is_open)
+ release();
+ else if (auto desired_size= recv_size)
+ {
+ bool success;
+ while (fil_space_extend_must_retry(this, node, desired_size, &success))
+ mutex_enter(&fil_system.mutex);
+
+ ut_ad(mutex_own(&fil_system.mutex));
+ /* Crash recovery requires the file extension to succeed. */
+ ut_a(success);
+ /* InnoDB data files cannot shrink. */
+ ut_a(size >= desired_size);
+ if (desired_size > committed_size)
+ committed_size= desired_size;
+
+ /* There could be multiple concurrent I/O requests for this
+ tablespace (multiple threads trying to extend this tablespace).
+
+ Also, fil_space_set_recv_size_and_flags() may have been invoked
+ again during the file extension while fil_system.mutex was not
+ being held by us.
+
+ Only if recv_size matches what we read originally, reset the
+ field. In this way, a subsequent I/O request will handle any
+ pending fil_space_set_recv_size_and_flags(). */
+
+ if (desired_size == recv_size)
+ {
+ recv_size= 0;
+ goto clear;
+ }
+ }
+ else
+clear:
+ n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
+
+ if (!have_mutex)
+ mutex_exit(&fil_system.mutex);
+ return is_open;
+}
+
+/** Try to extend a tablespace if it is smaller than the specified size.
+@param[in,out] space tablespace
+@param[in] size desired size in pages
+@return whether the tablespace is at least as big as requested */
+bool fil_space_extend(fil_space_t *space, uint32_t size)
+{
+ ut_ad(!srv_read_only_mode || space->purpose == FIL_TYPE_TEMPORARY);
+ bool success= false;
+ const bool acquired= space->acquire();
+ mutex_enter(&fil_system.mutex);
+ if (acquired || space->is_being_truncated)
+ {
+ while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain),
+ size, &success))
+ mutex_enter(&fil_system.mutex);
+ }
+ mutex_exit(&fil_system.mutex);
+ if (acquired)
+ space->release();
+ return success;
+}
+
+/** Prepare to free a file from fil_system. */
+inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle)
+{
+ ut_ad(mutex_own(&fil_system.mutex));
+ ut_a(magic_n == FIL_NODE_MAGIC_N);
+ ut_a(!being_extended);
+
+ if (is_open() &&
+ (space->n_pending.fetch_or(fil_space_t::CLOSING,
+ std::memory_order_acquire) &
+ fil_space_t::PENDING))
+ {
+ mutex_exit(&fil_system.mutex);
+ while (space->referenced())
+ os_thread_sleep(100);
+ mutex_enter(&fil_system.mutex);
+ }
+
+ while (is_open())
+ {
+ if (space->is_in_unflushed_spaces)
+ {
+ ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC);
+ space->is_in_unflushed_spaces= false;
+ fil_system.unflushed_spaces.remove(*space);
+ }
+
+ ut_a(!being_extended);
+ if (detach_handle)
+ {
+ auto result= handle;
+ handle= OS_FILE_CLOSED;
+ return result;
+ }
+ bool ret= os_file_close(handle);
+ ut_a(ret);
+ handle= OS_FILE_CLOSED;
+ break;
+ }
+
+ return OS_FILE_CLOSED;
+}
+
+/** Detach a tablespace from the cache and close the files. */
+std::vector<pfs_os_file_t> fil_system_t::detach(fil_space_t *space,
+ bool detach_handle)
+{
+ ut_ad(mutex_own(&fil_system.mutex));
+ HASH_DELETE(fil_space_t, hash, &spaces, space->id, space);
+
+ if (space->is_in_unflushed_spaces)
+ {
+ ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC);
+ space->is_in_unflushed_spaces= false;
+ unflushed_spaces.remove(*space);
+ }
+
+ if (space->is_in_default_encrypt)
+ {
+ space->is_in_default_encrypt= false;
+ default_encrypt_tables.remove(*space);
+ }
+ UT_LIST_REMOVE(space_list, space);
+ if (space == sys_space)
+ sys_space= nullptr;
+ else if (space == temp_space)
+ temp_space= nullptr;
+
+ ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
+
+ for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ if (node->is_open())
+ {
+ ut_ad(n_open > 0);
+ n_open--;
+ }
+
+ std::vector<pfs_os_file_t> handles;
+ handles.reserve(UT_LIST_GET_LEN(space->chain));
+
+ for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ {
+ auto handle= node->close_to_free(detach_handle);
+ if (handle != OS_FILE_CLOSED)
+ handles.push_back(handle);
+ }
+
+ ut_ad(!space->referenced());
+ return handles;
+}
+
+/** Free a tablespace object on which fil_system_t::detach() was invoked.
+There must not be any pending i/o's or flushes on the files.
+@param[in,out] space tablespace */
+static
+void
+fil_space_free_low(
+ fil_space_t* space)
+{
+ /* The tablespace must not be in fil_system.named_spaces. */
+ ut_ad(srv_fast_shutdown == 2 || !srv_was_started
+ || space->max_lsn == 0);
+
+ /* After fil_system_t::detach(), the tablespace can no longer be
+ looked up, so fil_space_t::get() would return NULL. Here we only
+ wait for any remaining references to be released. */
+ while (space->referenced()) {
+ os_thread_sleep(100);
+ }
+
+ for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
+ node != NULL; ) {
+ ut_d(space->size -= node->size);
+ ut_free(node->name);
+ fil_node_t* old_node = node;
+ node = UT_LIST_GET_NEXT(chain, node);
+ ut_free(old_node);
+ }
+
+ ut_ad(space->size == 0);
+
+ rw_lock_free(&space->latch);
+ fil_space_destroy_crypt_data(&space->crypt_data);
+
+ space->~fil_space_t();
+ ut_free(space->name);
+ ut_free(space);
+}
+
+/** Frees a space object from the tablespace memory cache.
+Closes the files in the chain but does not delete them.
+There must not be any pending i/o's or flushes on the files.
+@param[in] id tablespace identifier
+@param[in] x_latched whether the caller holds X-mode space->latch
+@return true if success */
+bool
+fil_space_free(
+ ulint id,
+ bool x_latched)
+{
+ ut_ad(id != TRX_SYS_SPACE);
+
+ mutex_enter(&fil_system.mutex);
+ fil_space_t* space = fil_space_get_by_id(id);
+
+ if (space != NULL) {
+ fil_system.detach(space);
+ }
+
+ mutex_exit(&fil_system.mutex);
+
+ if (space != NULL) {
+ if (x_latched) {
+ rw_lock_x_unlock(&space->latch);
+ }
+
+ if (!recv_recovery_is_on()) {
+ mysql_mutex_lock(&log_sys.mutex);
+ }
+
+ mysql_mutex_assert_owner(&log_sys.mutex);
+
+ if (space->max_lsn != 0) {
+ ut_d(space->max_lsn = 0);
+ UT_LIST_REMOVE(fil_system.named_spaces, space);
+ }
+
+ if (!recv_recovery_is_on()) {
+ mysql_mutex_unlock(&log_sys.mutex);
+ }
+
+ fil_space_free_low(space);
+ }
+
+ return(space != NULL);
+}
+
+/** Create a tablespace in fil_system.
+@param name tablespace name
+@param id tablespace identifier
+@param flags tablespace flags
+@param purpose tablespace purpose
+@param crypt_data encryption information
+@param mode encryption mode
+@return pointer to created tablespace, to be filled in with add()
+@retval nullptr on failure (such as when the same tablespace exists) */
+fil_space_t *fil_space_t::create(const char *name, ulint id, ulint flags,
+ fil_type_t purpose,
+ fil_space_crypt_t *crypt_data,
+ fil_encryption_t mode)
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system.is_initialised());
+ ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id));
+ ut_ad(srv_page_size == UNIV_PAGE_SIZE_ORIG || flags != 0);
+
+ DBUG_EXECUTE_IF("fil_space_create_failure", return(NULL););
+
+ /* FIXME: if calloc() is defined as an inline function that calls
+ memset() or bzero(), then GCC 6 -flifetime-dse can optimize it away */
+ space= new (ut_zalloc_nokey(sizeof(*space))) fil_space_t;
+
+ space->id = id;
+ space->name = mem_strdup(name);
+
+ UT_LIST_INIT(space->chain, &fil_node_t::chain);
+
+ space->purpose = purpose;
+ space->flags = flags;
+
+ space->magic_n = FIL_SPACE_MAGIC_N;
+ space->crypt_data = crypt_data;
+ space->n_pending.store(CLOSING, std::memory_order_relaxed);
+
+ DBUG_LOG("tablespace",
+ "Created metadata for " << id << " name " << name);
+ if (crypt_data) {
+ DBUG_LOG("crypt",
+ "Tablespace " << id << " name " << name
+ << " encryption " << crypt_data->encryption
+ << " key id " << crypt_data->key_id
+ << ":" << fil_crypt_get_mode(crypt_data)
+ << " " << fil_crypt_get_type(crypt_data));
+ }
+
+ rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP);
+
+ if (space->purpose == FIL_TYPE_TEMPORARY) {
+ /* SysTablespace::open_or_create() would pass
+ size!=0 to fil_space_t::add(), so first_time_open
+ would not hold in fil_node_open_file(), and we
+ must assign this manually. We do not care about
+ the durability or atomicity of writes to the
+ temporary tablespace files. */
+ space->atomic_write_supported = true;
+ }
+
+ mutex_enter(&fil_system.mutex);
+
+ if (const fil_space_t *old_space = fil_space_get_by_id(id)) {
+ ib::error() << "Trying to add tablespace '" << name
+ << "' with id " << id
+ << " to the tablespace memory cache, but tablespace '"
+ << old_space->name << "' already exists in the cache!";
+ mutex_exit(&fil_system.mutex);
+ rw_lock_free(&space->latch);
+ space->~fil_space_t();
+ ut_free(space->name);
+ ut_free(space);
+ return(NULL);
+ }
+
+ HASH_INSERT(fil_space_t, hash, &fil_system.spaces, id, space);
+
+ UT_LIST_ADD_LAST(fil_system.space_list, space);
+
+ switch (id) {
+ case 0:
+ ut_ad(!fil_system.sys_space);
+ fil_system.sys_space = space;
+ break;
+ case SRV_TMP_SPACE_ID:
+ ut_ad(!fil_system.temp_space);
+ fil_system.temp_space = space;
+ break;
+ default:
+ ut_ad(purpose != FIL_TYPE_TEMPORARY);
+ if (UNIV_LIKELY(id <= fil_system.max_assigned_id)) {
+ break;
+ }
+ if (!fil_system.space_id_reuse_warned) {
+ ib::warn() << "Allocated tablespace ID " << id
+ << " for " << name << ", old maximum was "
+ << fil_system.max_assigned_id;
+ }
+
+ fil_system.max_assigned_id = id;
+ }
+
+ const bool rotate =
+ (purpose == FIL_TYPE_TABLESPACE
+ && (mode == FIL_ENCRYPTION_ON
+ || mode == FIL_ENCRYPTION_OFF || srv_encrypt_tables)
+ && fil_crypt_must_default_encrypt());
+
+ /* Inform key rotation that there could be something
+ to do */
+ if (rotate) {
+ /* Key rotation is not enabled; we need to inform the
+ background encryption threads. */
+ fil_system.default_encrypt_tables.push_back(*space);
+ space->is_in_default_encrypt = true;
+ }
+
+ mutex_exit(&fil_system.mutex);
+
+ if (rotate && srv_n_fil_crypt_threads_started) {
+ os_event_set(fil_crypt_threads_event);
+ }
+
+ return(space);
+}
+
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion ids are not enough, we may need
+to recycle ids.
+@return true if assigned, false if not */
+bool
+fil_assign_new_space_id(
+/*====================*/
+ ulint* space_id) /*!< in/out: space id */
+{
+ ulint id;
+ bool success;
+
+ mutex_enter(&fil_system.mutex);
+
+ id = *space_id;
+
+ if (id < fil_system.max_assigned_id) {
+ id = fil_system.max_assigned_id;
+ }
+
+ id++;
+
+ if (id > (SRV_SPACE_ID_UPPER_BOUND / 2) && (id % 1000000UL == 0)) {
+ ib::warn() << "You are running out of new single-table"
+ " tablespace id's. Current counter is " << id
+ << " and it must not exceed" <<SRV_SPACE_ID_UPPER_BOUND
+ << "! To reset the counter to zero you have to dump"
+ " all your tables and recreate the whole InnoDB"
+ " installation.";
+ }
+
+ success = (id < SRV_SPACE_ID_UPPER_BOUND);
+
+ if (success) {
+ *space_id = fil_system.max_assigned_id = id;
+ } else {
+ ib::warn() << "You have run out of single-table tablespace"
+ " id's! Current counter is " << id
+ << ". To reset the counter to zero"
+ " you have to dump all your tables and"
+ " recreate the whole InnoDB installation.";
+ *space_id = ULINT_UNDEFINED;
+ }
+
+ mutex_exit(&fil_system.mutex);
+
+ return(success);
+}
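+
+/* Illustrative use (editor's sketch; hypothetical caller):
+
+  ulint space_id = 0;
+  if (fil_assign_new_space_id(&space_id)) {
+    // space_id now holds a freshly assigned identifier
+  } else {
+    // the counter is exhausted; space_id == ULINT_UNDEFINED
+  }
+*/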
+
+/** Read the first page of a data file.
+@return whether the page was found valid */
+bool fil_space_t::read_page0()
+{
+ ut_ad(fil_system.is_initialised());
+ ut_ad(mutex_own(&fil_system.mutex));
+ if (size)
+ return true;
+
+ fil_node_t *node= UT_LIST_GET_FIRST(chain);
+ if (!node)
+ return false;
+ ut_ad(!UT_LIST_GET_NEXT(chain, node));
+
+ if (UNIV_UNLIKELY(acquire_low() & STOPPING))
+ {
+ ut_ad("this should not happen" == 0);
+ return false;
+ }
+ const bool ok= node->is_open() || fil_node_open_file(node);
+ release();
+ return ok;
+}
+
+/** Look up a tablespace and ensure that its first page has been validated. */
+static fil_space_t *fil_space_get_space(ulint id)
+{
+ if (fil_space_t *space= fil_space_get_by_id(id))
+ if (space->read_page0())
+ return space;
+ return nullptr;
+}
+
+void fil_space_set_recv_size_and_flags(ulint id, uint32_t size, uint32_t flags)
+{
+ ut_ad(id < SRV_SPACE_ID_UPPER_BOUND);
+ mutex_enter(&fil_system.mutex);
+ if (fil_space_t *space= fil_space_get_space(id))
+ {
+ if (size)
+ space->recv_size= size;
+ if (flags != FSP_FLAGS_FCRC32_MASK_MARKER)
+ space->flags= flags;
+ }
+ mutex_exit(&fil_system.mutex);
+}
+
+/** Open each file. Never invoked on .ibd files.
+@param create_new_db whether to skip the call to fil_node_t::read_page0()
+@return whether all files were opened */
+bool fil_space_t::open(bool create_new_db)
+{
+ ut_ad(fil_system.is_initialised());
+ ut_ad(!id || create_new_db);
+
+ bool success= true;
+ bool skip_read= create_new_db;
+
+ mutex_enter(&fil_system.mutex);
+
+ for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ {
+ if (!node->is_open() && !fil_node_open_file_low(node))
+ {
+err_exit:
+ success= false;
+ break;
+ }
+
+ if (create_new_db)
+ continue;
+ if (skip_read)
+ {
+ size+= node->size;
+ continue;
+ }
+
+ if (!node->read_page0())
+ {
+ fil_system.n_open--;
+ os_file_close(node->handle);
+ node->handle= OS_FILE_CLOSED;
+ goto err_exit;
+ }
+
+ skip_read= true;
+ }
+
+ if (!create_new_db)
+ committed_size= size;
+ mutex_exit(&fil_system.mutex);
+ return success;
+}
+
+/** Close each file. Only invoked on fil_system.temp_space. */
+void fil_space_t::close()
+{
+ if (!fil_system.is_initialised()) {
+ return;
+ }
+
+ mutex_enter(&fil_system.mutex);
+ ut_ad(this == fil_system.temp_space
+ || srv_operation == SRV_OPERATION_BACKUP
+ || srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_DELTA);
+
+ for (fil_node_t* node = UT_LIST_GET_FIRST(chain);
+ node != NULL;
+ node = UT_LIST_GET_NEXT(chain, node)) {
+ if (node->is_open()) {
+ node->close();
+ }
+ }
+
+ mutex_exit(&fil_system.mutex);
+}
+
+void fil_system_t::create(ulint hash_size)
+{
+ ut_ad(this == &fil_system);
+ ut_ad(!is_initialised());
+ ut_ad(!(srv_page_size % FSP_EXTENT_SIZE));
+ ut_ad(srv_page_size);
+ ut_ad(!spaces.array);
+
+ m_initialised = true;
+
+ compile_time_assert(!(UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX));
+ compile_time_assert(!(UNIV_PAGE_SIZE_MIN % FSP_EXTENT_SIZE_MIN));
+
+ ut_ad(hash_size > 0);
+
+ mutex_create(LATCH_ID_FIL_SYSTEM, &mutex);
+
+ spaces.create(hash_size);
+
+ fil_space_crypt_init();
+#ifdef UNIV_LINUX
+ ssd.clear();
+ char fn[sizeof(dirent::d_name)
+ + sizeof "/sys/block/" "/queue/rotational"];
+ const size_t sizeof_fnp = (sizeof fn) - sizeof "/sys/block";
+ memcpy(fn, "/sys/block/", sizeof "/sys/block");
+ char* fnp = &fn[sizeof "/sys/block"];
+
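+ /* Editor's note: on Linux, /sys/block/<dev>/queue/rotational
+ contains "0\n" for a non-rotational (SSD) device and "1\n"
+ otherwise, and /sys/block/<dev>/dev contains the device number as
+ "major:minor\n" (for example "8:0" for sda). The loop below
+ collects the device numbers of all non-rotational devices. */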
+ std::set<std::string> ssd_devices;
+ if (DIR* d = opendir("/sys/block")) {
+ while (struct dirent* e = readdir(d)) {
+ if (e->d_name[0] == '.') {
+ continue;
+ }
+ snprintf(fnp, sizeof_fnp, "%s/queue/rotational",
+ e->d_name);
+ int f = open(fn, O_RDONLY);
+ if (f == -1) {
+ continue;
+ }
+ char b[sizeof "4294967295:4294967295\n"];
+ ssize_t l = read(f, b, sizeof b);
+ ::close(f);
+ if (l != 2 || memcmp("0\n", b, 2)) {
+ continue;
+ }
+ snprintf(fnp, sizeof_fnp, "%s/dev", e->d_name);
+ f = open(fn, O_RDONLY);
+ if (f == -1) {
+ continue;
+ }
+ l = read(f, b, sizeof b);
+ ::close(f);
+ if (l <= 0 || b[l - 1] != '\n') {
+ continue;
+ }
+ b[l - 1] = '\0';
+ char* end = b;
+ unsigned long dev_major = strtoul(b, &end, 10);
+ if (b == end || *end != ':'
+ || dev_major != unsigned(dev_major)) {
+ continue;
+ }
+ char* c = end + 1;
+ unsigned long dev_minor = strtoul(c, &end, 10);
+ if (c == end || *end
+ || dev_minor != unsigned(dev_minor)) {
+ continue;
+ }
+ ssd.push_back(makedev(unsigned(dev_major),
+ unsigned(dev_minor)));
+ }
+ closedir(d);
+ }
+ /* fil_system_t::is_ssd() assumes the following */
+ ut_ad(makedev(0, 8) == 8);
+ ut_ad(makedev(0, 4) == 4);
+ ut_ad(makedev(0, 2) == 2);
+ ut_ad(makedev(0, 1) == 1);
+#endif
+}
+
+void fil_system_t::close()
+{
+ ut_ad(this == &fil_system);
+ ut_a(unflushed_spaces.empty());
+ ut_a(!UT_LIST_GET_LEN(space_list));
+ ut_ad(!sys_space);
+ ut_ad(!temp_space);
+
+ if (is_initialised())
+ {
+ m_initialised= false;
+ spaces.free();
+ mutex_free(&mutex);
+ fil_space_crypt_cleanup();
+ }
+
+ ut_ad(!spaces.array);
+
+#ifdef UNIV_LINUX
+ ssd.clear();
+ ssd.shrink_to_fit();
+#endif /* UNIV_LINUX */
+}
+
+/** Extend all open data files to the recovered size */
+ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size()
+{
+ ut_ad(is_initialised());
+ mutex_enter(&mutex);
+ for (fil_space_t *space= UT_LIST_GET_FIRST(fil_system.space_list); space;
+ space= UT_LIST_GET_NEXT(space_list, space))
+ {
+ const uint32_t size= space->recv_size;
+
+ if (size > space->size)
+ {
+ if (space->is_closing())
+ continue;
+ space->reacquire();
+ bool success;
+ while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain),
+ size, &success))
+ mutex_enter(&mutex);
+ /* Crash recovery requires the file extension to succeed. */
+ ut_a(success);
+ space->release();
+ }
+ }
+ mutex_exit(&mutex);
+}
+
+/** Close all tablespace files at shutdown */
+void fil_space_t::close_all()
+{
+ if (!fil_system.is_initialised()) {
+ return;
+ }
+
+ fil_space_t* space;
+
+ /* At shutdown, we should not have any files in this list. */
+ ut_ad(srv_fast_shutdown == 2
+ || !srv_was_started
+ || UT_LIST_GET_LEN(fil_system.named_spaces) == 0);
+ fil_flush_file_spaces();
+
+ mutex_enter(&fil_system.mutex);
+
+ for (space = UT_LIST_GET_FIRST(fil_system.space_list); space; ) {
+ fil_node_t* node;
+ fil_space_t* prev_space = space;
+
+ for (node = UT_LIST_GET_FIRST(space->chain);
+ node != NULL;
+ node = UT_LIST_GET_NEXT(chain, node)) {
+
+ if (!node->is_open()) {
+next:
+ continue;
+ }
+
+ for (ulint count = 10000; count--; ) {
+ if (!space->set_closing()) {
+ node->close();
+ goto next;
+ }
+ mutex_exit(&fil_system.mutex);
+ os_thread_sleep(100);
+ mutex_enter(&fil_system.mutex);
+ if (!node->is_open()) {
+ goto next;
+ }
+ }
+
+ ib::error() << "File '" << node->name
+ << "' has " << space->referenced()
+ << " operations";
+ }
+
+ space = UT_LIST_GET_NEXT(space_list, space);
+ fil_system.detach(prev_space);
+ fil_space_free_low(prev_space);
+ }
+
+ mutex_exit(&fil_system.mutex);
+
+ ut_ad(srv_fast_shutdown == 2
+ || !srv_was_started
+ || UT_LIST_GET_LEN(fil_system.named_spaces) == 0);
+}
+
+/*******************************************************************//**
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+void
+fil_set_max_space_id_if_bigger(
+/*===========================*/
+ ulint max_id) /*!< in: maximum known id */
+{
+ if (max_id >= SRV_SPACE_ID_UPPER_BOUND) {
+ ib::fatal() << "Max tablespace id is too high, " << max_id;
+ }
+
+ mutex_enter(&fil_system.mutex);
+
+ if (fil_system.max_assigned_id < max_id) {
+
+ fil_system.max_assigned_id = max_id;
+ }
+
+ mutex_exit(&fil_system.mutex);
+}
+
+/** Write the flushed LSN to the page header of the first page in the
+system tablespace.
+@param[in] lsn flushed LSN
+@return DB_SUCCESS or error number */
+dberr_t
+fil_write_flushed_lsn(
+ lsn_t lsn)
+{
+ byte* buf;
+ ut_ad(!srv_read_only_mode);
+
+ if (!fil_system.sys_space->acquire()) {
+ return DB_ERROR;
+ }
+
+ buf = static_cast<byte*>(aligned_malloc(srv_page_size, srv_page_size));
+
+ auto fio = fil_system.sys_space->io(IORequestRead, 0, srv_page_size,
+ buf);
+
+ if (fio.err == DB_SUCCESS) {
+ mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
+ lsn);
+
+ ulint fsp_flags = mach_read_from_4(
+ buf + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS);
+
+ if (fil_space_t::full_crc32(fsp_flags)) {
+ buf_flush_assign_full_crc32_checksum(buf);
+ }
+
+ fio = fil_system.sys_space->io(IORequestWrite,
+ 0, srv_page_size, buf);
+ fil_flush_file_spaces();
+ } else {
+ fil_system.sys_space->release();
+ }
+
+ aligned_free(buf);
+ return fio.err;
+}
+
+/** Acquire a tablespace reference.
+@param id tablespace identifier
+@return tablespace
+@retval nullptr if the tablespace is missing or inaccessible */
+fil_space_t *fil_space_t::get(ulint id)
+{
+ mutex_enter(&fil_system.mutex);
+ fil_space_t *space= fil_space_get_by_id(id);
+ const uint32_t n= space ? space->acquire_low() : 0;
+ mutex_exit(&fil_system.mutex);
+
+ if (n & STOPPING)
+ space= nullptr;
+ else if ((n & CLOSING) && !space->prepare())
+ space= nullptr;
+
+ return space;
+}
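+
+/* Illustrative use of fil_space_t::get() (editor's sketch): the
+returned tablespace, if any, carries a reference that the caller must
+release:
+
+  if (fil_space_t *space= fil_space_t::get(space_id))
+  {
+    // ... perform I/O via space->io(...) ...
+    space->release();
+  }
+*/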
+
+/** Write a log record about a file operation.
+@param type file operation
+@param space_id tablespace identifier
+@param path file path
+@param new_path new file path for type=FILE_RENAME */
+inline void mtr_t::log_file_op(mfile_type_t type, ulint space_id,
+ const char *path, const char *new_path)
+{
+ ut_ad((new_path != nullptr) == (type == FILE_RENAME));
+ ut_ad(!(byte(type) & 15));
+
+ /* fil_name_parse() requires that there be at least one path
+ separator and that the file path end with ".ibd". */
+ ut_ad(strchr(path, OS_PATH_SEPARATOR) != NULL);
+ ut_ad(!strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD));
+
+ flag_modified();
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+ m_last= nullptr;
+
+ const size_t len= strlen(path);
+ const size_t new_len= type == FILE_RENAME ? 1 + strlen(new_path) : 0;
+ ut_ad(len > 0);
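+
+ /* Editor's sketch of the encoded record layout (simplified):
+    byte 0:  type; the low 4 bits hold the length of everything that
+             follows when it fits in 1..15 bytes, otherwise they are
+             0 and a variable-length total length follows
+    varint:  space_id
+    byte:    page number (always 0 for file operations)
+    payload: path, and for FILE_RENAME a NUL separator followed by
+             new_path */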
+ byte *const log_ptr= m_log.open(1 + 3/*length*/ + 5/*space_id*/ +
+ 1/*page_no=0*/);
+ byte *end= log_ptr + 1;
+ end= mlog_encode_varint(end, space_id);
+ *end++= 0;
+ if (UNIV_LIKELY(end + len + new_len >= &log_ptr[16]))
+ {
+ *log_ptr= type;
+ size_t total_len= len + new_len + end - log_ptr - 15;
+ if (total_len >= MIN_3BYTE)
+ total_len+= 2;
+ else if (total_len >= MIN_2BYTE)
+ total_len++;
+ end= mlog_encode_varint(log_ptr + 1, total_len);
+ end= mlog_encode_varint(end, space_id);
+ *end++= 0;
+ }
+ else
+ {
+ *log_ptr= static_cast<byte>(type | (end + len + new_len - &log_ptr[1]));
+ ut_ad(*log_ptr & 15);
+ }
+
+ m_log.close(end);
+
+ if (type == FILE_RENAME)
+ {
+ ut_ad(strchr(new_path, OS_PATH_SEPARATOR));
+ m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len + 1));
+ m_log.push(reinterpret_cast<const byte*>(new_path), uint32_t(new_len - 1));
+ }
+ else
+ m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len));
+}
+
+/** Write redo log for renaming a file.
+@param[in] space_id tablespace id
+@param[in] old_name tablespace file name
+@param[in] new_name tablespace file name after renaming
+@param[in,out] mtr mini-transaction */
+static
+void
+fil_name_write_rename_low(
+ ulint space_id,
+ const char* old_name,
+ const char* new_name,
+ mtr_t* mtr)
+{
+ ut_ad(!is_predefined_tablespace(space_id));
+ mtr->log_file_op(FILE_RENAME, space_id, old_name, new_name);
+}
+
+/** Write redo log for renaming a file.
+@param[in] space_id tablespace id
+@param[in] old_name tablespace file name
+@param[in] new_name tablespace file name after renaming */
+static void
+fil_name_write_rename(
+ ulint space_id,
+ const char* old_name,
+ const char* new_name)
+{
+ mtr_t mtr;
+ mtr.start();
+ fil_name_write_rename_low(space_id, old_name, new_name, &mtr);
+ mtr.commit();
+ log_write_up_to(mtr.commit_lsn(), true);
+}
+
+/** Write FILE_MODIFY for a file.
+@param[in] space_id tablespace id
+@param[in] name tablespace file name
+@param[in,out] mtr mini-transaction */
+static
+void
+fil_name_write(
+ ulint space_id,
+ const char* name,
+ mtr_t* mtr)
+{
+ ut_ad(!is_predefined_tablespace(space_id));
+ mtr->log_file_op(FILE_MODIFY, space_id, name);
+}
+
+/** Check for pending operations.
+@param[in] space tablespace
+@param[in] count number of attempts so far
+@return 0 if no operations else count + 1. */
+static ulint fil_check_pending_ops(const fil_space_t* space, ulint count)
+{
+ ut_ad(mutex_own(&fil_system.mutex));
+
+ if (!space) {
+ return 0;
+ }
+
+ if (auto n_pending_ops = space->referenced()) {
+
+ /* Give a warning every 10 seconds, starting after 1 second
+ (each retry sleeps 20ms: 50 * 20ms = 1s, 500 * 20ms = 10s) */
+ if ((count % 500) == 50) {
+ ib::warn() << "Trying to delete"
+ " tablespace '" << space->name
+ << "' but there are " << n_pending_ops
+ << " pending operations on it.";
+ }
+
+ return(count + 1);
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Check for pending IO.
+@return 0 if no pending else count + 1. */
+static
+ulint
+fil_check_pending_io(
+/*=================*/
+ fil_space_t* space, /*!< in/out: Tablespace to check */
+ fil_node_t** node, /*!< out: Node in space list */
+ ulint count) /*!< in: number of attempts so far */
+{
+ ut_ad(mutex_own(&fil_system.mutex));
+
+ /* The following code must change when InnoDB supports
+ multiple datafiles per tablespace. */
+ ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
+
+ *node = UT_LIST_GET_FIRST(space->chain);
+
+ if (const uint32_t p = space->referenced()) {
+ ut_a(!(*node)->being_extended);
+
+ /* Give a warning every 10 seconds, starting after 1 second */
+ if ((count % 500) == 50) {
+ ib::info() << "Trying to delete"
+ " tablespace '" << space->name
+ << "' but there are " << p
+ << " pending i/o's on it.";
+ }
+
+ return(count + 1);
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Check pending operations on a tablespace.
+@return tablespace */
+static
+fil_space_t*
+fil_check_pending_operations(
+/*=========================*/
+ ulint id, /*!< in: space id */
+ bool truncate, /*!< in: whether to truncate a file */
+ char** path) /*!< out/own: tablespace path */
+{
+ ulint count = 0;
+
+ ut_a(!is_system_tablespace(id));
+ mutex_enter(&fil_system.mutex);
+ fil_space_t* sp = fil_space_get_by_id(id);
+
+ if (sp) {
+ sp->set_stopping(true);
+ if (sp->crypt_data) {
+ sp->reacquire();
+ mutex_exit(&fil_system.mutex);
+ fil_space_crypt_close_tablespace(sp);
+ mutex_enter(&fil_system.mutex);
+ sp->release();
+ }
+ }
+
+ /* Check for pending operations. */
+
+ do {
+ count = fil_check_pending_ops(sp, count);
+
+ mutex_exit(&fil_system.mutex);
+
+ if (count) {
+ os_thread_sleep(20000); // Wait 0.02 seconds
+ } else if (!sp) {
+ return nullptr;
+ }
+
+ mutex_enter(&fil_system.mutex);
+
+ sp = fil_space_get_by_id(id);
+ } while (count);
+
+ /* Check for pending IO. */
+
+ for (;;) {
+ if (truncate) {
+ sp->is_being_truncated = true;
+ }
+
+ fil_node_t* node;
+
+ count = fil_check_pending_io(sp, &node, count);
+
+ if (count == 0 && path) {
+ *path = mem_strdup(node->name);
+ }
+
+ mutex_exit(&fil_system.mutex);
+
+ if (count == 0) {
+ break;
+ }
+
+ os_thread_sleep(20000); // Wait 0.02 seconds
+ mutex_enter(&fil_system.mutex);
+ sp = fil_space_get_by_id(id);
+
+ if (!sp) {
+ mutex_exit(&fil_system.mutex);
+ break;
+ }
+ }
+
+ return sp;
+}
+
+/** Close a single-table tablespace on failed IMPORT TABLESPACE.
+The tablespace must be cached in the memory cache.
+Free all pages used by the tablespace. */
+void fil_close_tablespace(ulint id)
+{
+ ut_ad(!is_system_tablespace(id));
+ char* path = nullptr;
+ fil_space_t* space = fil_check_pending_operations(id, false, &path);
+ if (!space) {
+ return;
+ }
+
+ rw_lock_x_lock(&space->latch);
+
+ /* Invalidate in the buffer pool all pages belonging to the
+ tablespace. Since we have invoked space->set_stopping(), readahead
+ can no longer read more pages of this tablespace to buf_pool.
+ Thus we can clean the tablespace out of buf_pool
+ completely and permanently. */
+ while (buf_flush_list_space(space));
+ ut_ad(space->is_stopping());
+
+ /* If the free is successful, the X lock will be released before
+ the space memory data structure is freed. */
+
+ if (!fil_space_free(id, true)) {
+ rw_lock_x_unlock(&space->latch);
+ }
+
+ /* Also delete any generated files (such as the .cfg file);
+ otherwise removing the database directory on DROP DATABASE
+ would fail. */
+
+ if (char* cfg_name = fil_make_filepath(path, NULL, CFG, false)) {
+ os_file_delete_if_exists(innodb_data_file_key, cfg_name, NULL);
+ ut_free(cfg_name);
+ }
+
+ ut_free(path);
+}
+
+/** Delete a tablespace and associated .ibd file.
+@param[in] id tablespace identifier
+@param[in] if_exists whether to ignore missing tablespace
+@param[in,out] detached_handles return detached handles if not nullptr
+@return DB_SUCCESS or error */
+dberr_t fil_delete_tablespace(ulint id, bool if_exists,
+ std::vector<pfs_os_file_t>* detached_handles)
+{
+ char* path = NULL;
+ ut_ad(!is_system_tablespace(id));
+ ut_ad(!detached_handles || detached_handles->empty());
+
+ dberr_t err;
+ fil_space_t *space = fil_check_pending_operations(id, false, &path);
+
+ if (!space) {
+ err = DB_TABLESPACE_NOT_FOUND;
+ if (!if_exists) {
+ ib::error() << "Cannot delete tablespace " << id
+ << " because it is not found"
+ " in the tablespace memory cache.";
+ }
+
+ goto func_exit;
+ }
+
+ /* IMPORTANT: Because we have invoked space->set_stopping(),
+ there can't be any new reads or flushes. We are here because
+ the reference count was zero above. However, it is still
+ possible to have pending read and write requests:
+
+ A read request can happen because the reader thread has
+ gone through the is_stopping() check in buf_page_init_for_read()
+ before the flag was set and has not yet acquired a reference
+ when we checked it above.
+
+ A write request can be issued any time because we don't check
+ fil_space_t::is_stopping() when queueing a block for write.
+
+ We deal with pending write requests in the following function
+ where we'd minimally evict all dirty pages belonging to this
+ space from the flush_list. Note that if a block is IO-fixed
+ we'll wait for IO to complete.
+
+ To deal with potential read requests, we will check the
+ is_stopping() in fil_space_t::io(). */
+
+ err = DB_SUCCESS;
+ buf_flush_remove_pages(id);
+
+ /* If it is a delete then also delete any generated files, otherwise
+ when we drop the database the remove directory will fail. */
+ {
+ /* Before deleting the file, write a log record about
+ it, so that InnoDB crash recovery will expect the file
+ to be gone. */
+ mtr_t mtr;
+
+ mtr.start();
+ mtr.log_file_op(FILE_DELETE, id, path);
+ mtr.commit();
+ /* Even if we got killed shortly after deleting the
+ tablespace file, the record must have already been
+ written to the redo log. */
+ log_write_up_to(mtr.commit_lsn(), true);
+
+ char* cfg_name = fil_make_filepath(path, NULL, CFG, false);
+ if (cfg_name != NULL) {
+ os_file_delete_if_exists(innodb_data_file_key, cfg_name, NULL);
+ ut_free(cfg_name);
+ }
+ }
+
+ /* Delete the link file pointing to the ibd file we are deleting. */
+ if (FSP_FLAGS_HAS_DATA_DIR(space->flags)) {
+ RemoteDatafile::delete_link_file(space->name);
+ }
+
+ mutex_enter(&fil_system.mutex);
+
+ /* Double check the sanity of pending ops after reacquiring
+ the fil_system.mutex. */
+ if (const fil_space_t* s = fil_space_get_by_id(id)) {
+ ut_a(s == space);
+ ut_a(!space->referenced());
+ ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+ auto handles = fil_system.detach(space,
+ detached_handles != nullptr);
+ if (detached_handles) {
+ *detached_handles = std::move(handles);
+ }
+ mutex_exit(&fil_system.mutex);
+
+ mysql_mutex_lock(&log_sys.mutex);
+
+ if (space->max_lsn != 0) {
+ ut_d(space->max_lsn = 0);
+ UT_LIST_REMOVE(fil_system.named_spaces, space);
+ }
+
+ mysql_mutex_unlock(&log_sys.mutex);
+ fil_space_free_low(space);
+
+ if (!os_file_delete(innodb_data_file_key, path)
+ && !os_file_delete_if_exists(
+ innodb_data_file_key, path, NULL)) {
+
+ /* Note: This is because we have removed the
+ tablespace instance from the cache. */
+
+ err = DB_IO_ERROR;
+ }
+ } else {
+ mutex_exit(&fil_system.mutex);
+ err = DB_TABLESPACE_NOT_FOUND;
+ }
+
+func_exit:
+ ut_free(path);
+ ibuf_delete_for_discarded_space(id);
+ return(err);
+}
+
+/** Prepare to truncate an undo tablespace.
+@param[in] space_id undo tablespace id
+@return the tablespace
+@retval NULL if tablespace not found */
+fil_space_t *fil_truncate_prepare(ulint space_id)
+{
+ return fil_check_pending_operations(space_id, true, nullptr);
+}
+
+/*******************************************************************//**
+Allocates and builds a file name from a path, a table or tablespace name
+and a suffix. The string must be freed by the caller with ut_free().
+@param[in] path NULL or the directory path or the full path and filename.
+@param[in] name NULL if path is full, or Table/Tablespace name
+@param[in] ext the file extension to use.
+@param[in] trim_name true if the last name on the path should be trimmed.
+@return own: file name */
+char*
+fil_make_filepath(
+ const char* path,
+ const char* name,
+ ib_extention ext,
+ bool trim_name)
+{
+ /* The path may contain the basename of the file, if so we do not
+ need the name. If the path is NULL, we can use the default path,
+ but there needs to be a name. */
+ ut_ad(path != NULL || name != NULL);
+
+ /* If we are going to strip a name off the path, there better be a
+ path and a new name to put back on. */
+ ut_ad(!trim_name || (path != NULL && name != NULL));
+
+ if (path == NULL) {
+ path = fil_path_to_mysql_datadir;
+ }
+
+ ulint len = 0; /* current length */
+ ulint path_len = strlen(path);
+ ulint name_len = (name ? strlen(name) : 0);
+ const char* suffix = dot_ext[ext];
+ ulint suffix_len = strlen(suffix);
+ ulint full_len = path_len + 1 + name_len + suffix_len + 1;
+
+ char* full_name = static_cast<char*>(ut_malloc_nokey(full_len));
+ if (full_name == NULL) {
+ return NULL;
+ }
+
+ /* If the name is a relative path, do not prepend "./". */
+ if (path[0] == '.'
+ && (path[1] == '\0' || path[1] == OS_PATH_SEPARATOR)
+ && name != NULL && name[0] == '.') {
+ path = NULL;
+ path_len = 0;
+ }
+
+ if (path != NULL) {
+ memcpy(full_name, path, path_len);
+ len = path_len;
+ full_name[len] = '\0';
+ os_normalize_path(full_name);
+ }
+
+ if (trim_name) {
+ /* Find the offset of the last DIR separator and set it to
+ null in order to strip off the old basename from this path. */
+ char* last_dir_sep = strrchr(full_name, OS_PATH_SEPARATOR);
+ if (last_dir_sep) {
+ last_dir_sep[0] = '\0';
+ len = strlen(full_name);
+ }
+ }
+
+ if (name != NULL) {
+ if (len && full_name[len - 1] != OS_PATH_SEPARATOR) {
+ /* Add a DIR separator */
+ full_name[len] = OS_PATH_SEPARATOR;
+ full_name[++len] = '\0';
+ }
+
+ char* ptr = &full_name[len];
+ memcpy(ptr, name, name_len);
+ len += name_len;
+ full_name[len] = '\0';
+ os_normalize_path(ptr);
+ }
+
+ /* Make sure that the specified suffix is at the end of the filepath
+ string provided. This assumes that the suffix starts with '.'.
+ If the first char of the suffix is found in the filepath at the same
+ length as the suffix from the end, then we will assume that there is
+ a previous suffix that needs to be replaced. */
+ if (suffix != NULL) {
+ /* Need room for the trailing null byte. */
+ ut_ad(len < full_len);
+
+ if ((len > suffix_len)
+ && (full_name[len - suffix_len] == suffix[0])) {
+ /* Another suffix exists, make it the one requested. */
+ memcpy(&full_name[len - suffix_len], suffix, suffix_len);
+
+ } else {
+ /* No previous suffix, add it. */
+ ut_ad(len + suffix_len < full_len);
+ memcpy(&full_name[len], suffix, suffix_len);
+ full_name[len + suffix_len] = '\0';
+ }
+ }
+
+ return(full_name);
+}
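+
+/* Illustrative calls (editor's sketch; hypothetical paths):
+
+  fil_make_filepath("/data", "db1/t1", IBD, false)
+    returns "/data/db1/t1.ibd"
+  fil_make_filepath("/data/db1/t1.ibd", NULL, CFG, false)
+    returns "/data/db1/t1.cfg"
+*/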
+
+/** Test if a tablespace file can be renamed to a new filepath by checking
+if that the old filepath exists and the new filepath does not exist.
+@param[in] old_path old filepath
+@param[in] new_path new filepath
+@param[in] replace_new whether to ignore the existence of new_path
+@return innodb error code */
+static dberr_t
+fil_rename_tablespace_check(
+ const char* old_path,
+ const char* new_path,
+ bool replace_new)
+{
+ bool exists = false;
+ os_file_type_t ftype;
+
+ if (os_file_status(old_path, &exists, &ftype) && !exists) {
+ ib::error() << "Cannot rename '" << old_path
+ << "' to '" << new_path
+ << "' because the source file"
+ << " does not exist.";
+ return(DB_TABLESPACE_NOT_FOUND);
+ }
+
+ exists = false;
+ if (os_file_status(new_path, &exists, &ftype) && !exists) {
+ return DB_SUCCESS;
+ }
+
+ if (!replace_new) {
+ ib::error() << "Cannot rename '" << old_path
+ << "' to '" << new_path
+ << "' because the target file exists."
+ " Remove the target file and try again.";
+ return(DB_TABLESPACE_EXISTS);
+ }
+
+ /* This must be during the ROLLBACK of TRUNCATE TABLE.
+ Because InnoDB only allows at most one data dictionary
+ transaction at a time, and because this incomplete TRUNCATE
+ would have created a new tablespace file, we must remove
+ a possibly existing tablespace that is associated with the
+ new tablespace file. */
+retry:
+ mutex_enter(&fil_system.mutex);
+ for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list);
+ space; space = UT_LIST_GET_NEXT(space_list, space)) {
+ ulint id = space->id;
+ if (id
+ && space->purpose == FIL_TYPE_TABLESPACE
+ && !strcmp(new_path,
+ UT_LIST_GET_FIRST(space->chain)->name)) {
+ ib::info() << "TRUNCATE rollback: " << id
+ << "," << new_path;
+ mutex_exit(&fil_system.mutex);
+ dberr_t err = fil_delete_tablespace(id);
+ if (err != DB_SUCCESS) {
+ return err;
+ }
+ goto retry;
+ }
+ }
+ mutex_exit(&fil_system.mutex);
+ fil_delete_file(new_path);
+
+ return(DB_SUCCESS);
+}
+
+dberr_t fil_space_t::rename(const char* name, const char* path, bool log,
+ bool replace)
+{
+ ut_ad(UT_LIST_GET_LEN(chain) == 1);
+ ut_ad(!is_system_tablespace(id));
+
+ if (log) {
+ dberr_t err = fil_rename_tablespace_check(
+ chain.start->name, path, replace);
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ fil_name_write_rename(id, chain.start->name, path);
+ }
+
+ return fil_rename_tablespace(id, chain.start->name, name, path)
+ ? DB_SUCCESS : DB_ERROR;
+}
+
+/** Rename a single-table tablespace.
+The tablespace must exist in the memory cache.
+@param[in] id tablespace identifier
+@param[in] old_path old file name
+@param[in] new_name new table name in the
+databasename/tablename format
+@param[in] new_path_in new file name,
+or NULL if it is located in the normal data directory
+@return true if success */
+static bool
+fil_rename_tablespace(
+ ulint id,
+ const char* old_path,
+ const char* new_name,
+ const char* new_path_in)
+{
+ fil_space_t* space;
+ fil_node_t* node;
+ ut_a(id != 0);
+
+ ut_ad(strchr(new_name, '/') != NULL);
+
+ mutex_enter(&fil_system.mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space == NULL) {
+ ib::error() << "Cannot find space id " << id
+ << " in the tablespace memory cache, though the file '"
+ << old_path
+ << "' in a rename operation should have that id.";
+ mutex_exit(&fil_system.mutex);
+ return(false);
+ }
+
+ /* The following code must change when InnoDB supports
+ multiple datafiles per tablespace. */
+ ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+ node = UT_LIST_GET_FIRST(space->chain);
+ space->reacquire();
+
+ mutex_exit(&fil_system.mutex);
+
+ char* new_file_name = new_path_in == NULL
+ ? fil_make_filepath(NULL, new_name, IBD, false)
+ : mem_strdup(new_path_in);
+ char* old_file_name = node->name;
+ char* new_space_name = mem_strdup(new_name);
+ char* old_space_name = space->name;
+
+ ut_ad(strchr(old_file_name, OS_PATH_SEPARATOR) != NULL);
+ ut_ad(strchr(new_file_name, OS_PATH_SEPARATOR) != NULL);
+
+ if (!recv_recovery_is_on()) {
+ mysql_mutex_lock(&log_sys.mutex);
+ }
+
+ /* log_sys.mutex is above fil_system.mutex in the latching order */
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ mutex_enter(&fil_system.mutex);
+ space->release();
+ ut_ad(space->name == old_space_name);
+ ut_ad(node->name == old_file_name);
+ bool success;
+ DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2",
+ goto skip_second_rename; );
+ success = os_file_rename(innodb_data_file_key,
+ old_file_name,
+ new_file_name);
+ DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2",
+skip_second_rename:
+ success = false; );
+
+ ut_ad(node->name == old_file_name);
+
+ if (success) {
+ node->name = new_file_name;
+ }
+
+ if (!recv_recovery_is_on()) {
+ mysql_mutex_unlock(&log_sys.mutex);
+ }
+
+ ut_ad(space->name == old_space_name);
+ if (success) {
+ space->name = new_space_name;
+ } else {
+ /* Because nothing was renamed, we must free the new
+ names, not the old ones. */
+ old_file_name = new_file_name;
+ old_space_name = new_space_name;
+ }
+
+ mutex_exit(&fil_system.mutex);
+
+ ut_free(old_file_name);
+ ut_free(old_space_name);
+
+ return(success);
+}
+
+/* FIXME: remove this! */
+IF_WIN(, bool os_is_sparse_file_supported(os_file_t fh));
+
+/** Create a tablespace file.
+@param[in] space_id Tablespace ID
+@param[in] name Tablespace name in dbname/tablename format.
+@param[in] path Path and filename of the datafile to create.
+@param[in] flags Tablespace flags
+@param[in] size Initial size of the tablespace file in pages,
+must be >= FIL_IBD_FILE_INITIAL_SIZE
+@param[in] mode MariaDB encryption mode
+@param[in] key_id MariaDB encryption key_id
+@param[out] err DB_SUCCESS or error code
+@return the created tablespace
+@retval NULL on error */
+fil_space_t*
+fil_ibd_create(
+ ulint space_id,
+ const char* name,
+ const char* path,
+ ulint flags,
+ uint32_t size,
+ fil_encryption_t mode,
+ uint32_t key_id,
+ dberr_t* err)
+{
+ pfs_os_file_t file;
+ byte* page;
+ bool success;
+ bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags) != 0;
+
+ ut_ad(!is_system_tablespace(space_id));
+ ut_ad(!srv_read_only_mode);
+ ut_a(space_id < SRV_SPACE_ID_UPPER_BOUND);
+ ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
+ ut_a(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, space_id));
+
+ /* Create the subdirectories in the path, if they are
+ not there already. */
+ *err = os_file_create_subdirs_if_needed(path);
+ if (*err != DB_SUCCESS) {
+ return NULL;
+ }
+
+ ulint type;
+ static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096,
+ "compatibility");
+ switch (FSP_FLAGS_GET_ZIP_SSIZE(flags)) {
+ case 1:
+ case 2:
+ type = OS_DATA_FILE_NO_O_DIRECT;
+ break;
+ default:
+ type = OS_DATA_FILE;
+ }
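+
+ /* Editor's note: ZIP_SSIZE values 1 and 2 correspond to 1KiB and
+ 2KiB pages, which are smaller than a typical 4096-byte sector, so
+ such files cannot be written with O_DIRECT; the static_assert
+ above checks that 4096-byte boundary assumption. */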
+
+ file = os_file_create(
+ innodb_data_file_key, path,
+ OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
+ OS_FILE_AIO, type, srv_read_only_mode, &success);
+
+ if (!success) {
+ /* The following call will print an error message */
+ switch (os_file_get_last_error(true)) {
+ case OS_FILE_ALREADY_EXISTS:
+ ib::info() << "The file '" << path << "'"
+ " already exists though the"
+ " corresponding table did not exist"
+ " in the InnoDB data dictionary."
+ " You can resolve the problem by removing"
+ " the file.";
+ *err = DB_TABLESPACE_EXISTS;
+ break;
+ case OS_FILE_DISK_FULL:
+ *err = DB_OUT_OF_FILE_SPACE;
+ break;
+ default:
+ *err = DB_ERROR;
+ }
+ ib::error() << "Cannot create file '" << path << "'";
+ return NULL;
+ }
+
+ const bool is_compressed = fil_space_t::is_compressed(flags);
+ bool punch_hole = is_compressed;
+ fil_space_crypt_t* crypt_data = nullptr;
+#ifdef _WIN32
+ if (is_compressed) {
+ os_file_set_sparse_win32(file);
+ }
+#endif
+
+ if (!os_file_set_size(
+ path, file,
+ os_offset_t(size) << srv_page_size_shift, is_compressed)) {
+ *err = DB_OUT_OF_FILE_SPACE;
+err_exit:
+ os_file_close(file);
+ os_file_delete(innodb_data_file_key, path);
+ free(crypt_data);
+ return NULL;
+ }
+
+ /* FIXME: remove this */
+ IF_WIN(, punch_hole = punch_hole && os_is_sparse_file_supported(file));
+
+ /* We have to write the space id to the file immediately and flush the
+ file to disk. This is because in crash recovery we must be aware what
+ tablespaces exist and what are their space id's, so that we can apply
+ the log records to the right file. It may take quite a while until
+ buffer pool flush algorithms write anything to the file and flush it to
+ disk. If we would not write here anything, the file would be filled
+ with zeros from the call of os_file_set_size(), until a buffer pool
+ flush would write to it. */
+
+ /* Align the memory for file i/o if we might have O_DIRECT set */
+ page = static_cast<byte*>(aligned_malloc(2 * srv_page_size,
+ srv_page_size));
+
+ memset(page, '\0', srv_page_size);
+
+ if (fil_space_t::full_crc32(flags)) {
+ flags |= FSP_FLAGS_FCRC32_PAGE_SSIZE();
+ } else {
+ flags |= FSP_FLAGS_PAGE_SSIZE();
+ }
+
+ fsp_header_init_fields(page, space_id, flags);
+ mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
+
+ /* Create crypt data if the tablespace is either encrypted or user has
+ requested it to remain unencrypted. */
+ crypt_data = (mode != FIL_ENCRYPTION_DEFAULT || srv_encrypt_tables)
+ ? fil_space_create_crypt_data(mode, key_id)
+ : NULL;
+
+ if (crypt_data) {
+ /* Write crypt data information in page0 while creating
+ ibd file. */
+ crypt_data->fill_page0(flags, page);
+ }
+
+ if (ulint zip_size = fil_space_t::zip_size(flags)) {
+ page_zip_des_t page_zip;
+ page_zip_set_size(&page_zip, zip_size);
+ page_zip.data = page + srv_page_size;
+#ifdef UNIV_DEBUG
+ page_zip.m_start = 0;
+#endif /* UNIV_DEBUG */
+ page_zip.m_end = 0;
+ page_zip.m_nonempty = 0;
+ page_zip.n_blobs = 0;
+
+ buf_flush_init_for_writing(NULL, page, &page_zip, false);
+
+ *err = os_file_write(IORequestWrite, path, file,
+ page_zip.data, 0, zip_size);
+ } else {
+ buf_flush_init_for_writing(NULL, page, NULL,
+ fil_space_t::full_crc32(flags));
+
+ *err = os_file_write(IORequestWrite, path, file,
+ page, 0, srv_page_size);
+ }
+
+ aligned_free(page);
+
+ if (*err != DB_SUCCESS) {
+ ib::error()
+ << "Could not write the first page to"
+ << " tablespace '" << path << "'";
+ goto err_exit;
+ }
+
+ if (!os_file_flush(file)) {
+ ib::error() << "File flush of tablespace '"
+ << path << "' failed";
+ *err = DB_ERROR;
+ goto err_exit;
+ }
+
+ if (has_data_dir) {
+ /* Make the ISL file if the IBD file is not
+ in the default location. */
+ *err = RemoteDatafile::create_link_file(name, path);
+ if (*err != DB_SUCCESS) {
+ goto err_exit;
+ }
+ }
+
+ if (fil_space_t* space = fil_space_t::create(name, space_id, flags,
+ FIL_TYPE_TABLESPACE,
+ crypt_data, mode)) {
+ space->punch_hole = punch_hole;
+ fil_node_t* node = space->add(path, file, size, false, true);
+ mtr_t mtr;
+ mtr.start();
+ mtr.log_file_op(FILE_CREATE, space_id, node->name);
+ mtr.commit();
+
+ node->find_metadata(file);
+ *err = DB_SUCCESS;
+ return space;
+ }
+
+ if (has_data_dir) {
+ RemoteDatafile::delete_link_file(name);
+ }
+
+ *err = DB_ERROR;
+ goto err_exit;
+}
+
+/** Try to open a single-table tablespace and optionally check that the
+space id in it is correct. If this does not succeed, print an error message
+to the .err log. This function is used to open a tablespace when we start
+mysqld after the dictionary has been booted, and also in IMPORT TABLESPACE.
+
+NOTE that we assume this operation is used either at the database startup
+or under the protection of the dictionary mutex, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+
+If the validate boolean is set, we read the first page of the file and
+check that the space id in the file is what we expect. We assume that
+this function runs much faster if no check is made, since accessing the
+file inode is probably much faster (the OS caches them) than accessing
+the first page of the file. This boolean may be initially false, but if
+a remote tablespace is found it will be changed to true.
+
+If the fix_dict boolean is set, then it is safe to use an internal SQL
+statement to update the dictionary tables if they are incorrect.
+
+@param[in] validate true if we should validate the tablespace
+@param[in] fix_dict true if the dictionary is available to be fixed
+@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY
+@param[in] id tablespace ID
+@param[in] flags expected FSP_SPACE_FLAGS
+@param[in] tablename name of the datafile.
+If file-per-table, it is the table name in the databasename/tablename format
+@param[in] path_in expected filepath, usually read from dictionary
+@param[out] err DB_SUCCESS or error code
+@return tablespace
+@retval NULL if the tablespace could not be opened */
+fil_space_t*
+fil_ibd_open(
+ bool validate,
+ bool fix_dict,
+ fil_type_t purpose,
+ ulint id,
+ ulint flags,
+ const table_name_t& tablename,
+ const char* path_in,
+ dberr_t* err)
+{
+ mutex_enter(&fil_system.mutex);
+ if (fil_space_t* space = fil_space_get_by_id(id)) {
+ if (strcmp(space->name, tablename.m_name)) {
+ table_name_t space_name;
+ space_name.m_name = space->name;
+ ib::error()
+ << "Trying to open table " << tablename
+ << " with id " << id
+ << ", conflicting with " << space_name;
+ space = NULL;
+ if (err) *err = DB_TABLESPACE_EXISTS;
+ } else if (err) *err = DB_SUCCESS;
+
+ mutex_exit(&fil_system.mutex);
+
+ if (space && validate && !srv_read_only_mode) {
+ fsp_flags_try_adjust(space,
+ flags & ~FSP_FLAGS_MEM_MASK);
+ }
+
+ return space;
+ }
+ mutex_exit(&fil_system.mutex);
+
+ bool dict_filepath_same_as_default = false;
+ bool link_file_found = false;
+ bool link_file_is_bad = false;
+ Datafile df_default; /* default location */
+ Datafile df_dict; /* dictionary location */
+ RemoteDatafile df_remote; /* remote location */
+ ulint tablespaces_found = 0;
+ ulint valid_tablespaces_found = 0;
+
+ if (fix_dict) {
+ ut_d(dict_sys.assert_locked());
+ ut_ad(!srv_read_only_mode);
+ ut_ad(srv_log_file_size != 0);
+ }
+
+ /* Table flags can be ULINT_UNDEFINED if
+ dict_tf_to_fsp_flags_failure is set. */
+ if (flags == ULINT_UNDEFINED) {
+corrupted:
+ if (err) *err = DB_CORRUPTION;
+ return NULL;
+ }
+
+ ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id));
+ df_default.init(tablename.m_name, flags);
+ df_dict.init(tablename.m_name, flags);
+ df_remote.init(tablename.m_name, flags);
+
+ /* Discover the correct file by looking in three possible locations
+ while avoiding unnecessary effort. */
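+
+ /* Editor's note, for illustration: for a table db1/t1 the three
+ candidates are the default location <datadir>/db1/t1.ibd, the path
+ stored in the link file <datadir>/db1/t1.isl (remote), and the
+ path recorded in the data dictionary (path_in). */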
+
+ /* We will always look for an ibd in the default location. */
+ df_default.make_filepath(NULL, tablename.m_name, IBD);
+
+ /* Look for a filepath embedded in an ISL where the default file
+ would be. */
+ if (df_remote.open_read_only(true) == DB_SUCCESS) {
+ ut_ad(df_remote.is_open());
+
+ /* Always validate a file opened from an ISL pointer */
+ validate = true;
+ ++tablespaces_found;
+ link_file_found = true;
+ } else if (df_remote.filepath() != NULL) {
+ /* An ISL file was found but contained a bad filepath in it.
+ Better validate anything we do find. */
+ validate = true;
+ }
+
+ /* Attempt to open the tablespace at the dictionary filepath. */
+ if (path_in) {
+ if (df_default.same_filepath_as(path_in)) {
+ dict_filepath_same_as_default = true;
+ } else {
+ /* Dict path is not the default path. Always validate
+ remote files. If default is opened, it was moved. */
+ validate = true;
+ df_dict.set_filepath(path_in);
+ if (df_dict.open_read_only(true) == DB_SUCCESS) {
+ ut_ad(df_dict.is_open());
+ ++tablespaces_found;
+ }
+ }
+ }
+
+ /* Always look for a file at the default location. But don't log
+ an error if the tablespace is already open in remote or dict. */
+ ut_a(df_default.filepath());
+ const bool strict = (tablespaces_found == 0);
+ if (df_default.open_read_only(strict) == DB_SUCCESS) {
+ ut_ad(df_default.is_open());
+ ++tablespaces_found;
+ }
+
+ /* Check if multiple locations point to the same file. */
+ if (tablespaces_found > 1 && df_default.same_as(df_remote)) {
+ /* A link file was found with the default path in it.
+ Use the default path and delete the link file. */
+ --tablespaces_found;
+ df_remote.delete_link_file();
+ df_remote.close();
+ }
+ if (tablespaces_found > 1 && df_default.same_as(df_dict)) {
+ --tablespaces_found;
+ df_dict.close();
+ }
+ if (tablespaces_found > 1 && df_remote.same_as(df_dict)) {
+ --tablespaces_found;
+ df_dict.close();
+ }
+
+ /* We have now checked all possible tablespace locations and
+ have a count of how many unique files we found. If things are
+ normal, we only found 1. */
+ /* For encrypted tablespace, we need to check the
+ encryption in header of first page. */
+ if (!validate && tablespaces_found == 1) {
+ goto skip_validate;
+ }
+
+ /* Read and validate the first page of these three tablespace
+ locations, if found. */
+ valid_tablespaces_found +=
+ (df_remote.validate_to_dd(id, flags) == DB_SUCCESS);
+
+ valid_tablespaces_found +=
+ (df_default.validate_to_dd(id, flags) == DB_SUCCESS);
+
+ valid_tablespaces_found +=
+ (df_dict.validate_to_dd(id, flags) == DB_SUCCESS);
+
+ /* Make sense of these three possible locations.
+ First, bail out if no tablespace files were found. */
+ if (valid_tablespaces_found == 0) {
+ os_file_get_last_error(true);
+ ib::error() << "Could not find a valid tablespace file for `"
+ << tablename << "`. " << TROUBLESHOOT_DATADICT_MSG;
+ goto corrupted;
+ }
+ if (!validate) {
+ goto skip_validate;
+ }
+
+ /* Do not open any tablespaces if more than one tablespace with
+ the correct space ID and flags were found. */
+ if (tablespaces_found > 1) {
+ ib::error() << "A tablespace for `" << tablename
+ << "` has been found in multiple places;";
+
+ if (df_default.is_open()) {
+ ib::error() << "Default location: "
+ << df_default.filepath()
+ << ", Space ID=" << df_default.space_id()
+ << ", Flags=" << df_default.flags();
+ }
+ if (df_remote.is_open()) {
+ ib::error() << "Remote location: "
+ << df_remote.filepath()
+ << ", Space ID=" << df_remote.space_id()
+ << ", Flags=" << df_remote.flags();
+ }
+ if (df_dict.is_open()) {
+ ib::error() << "Dictionary location: "
+ << df_dict.filepath()
+ << ", Space ID=" << df_dict.space_id()
+ << ", Flags=" << df_dict.flags();
+ }
+
+ /* Force-recovery will allow some tablespaces to be
+ skipped by REDO if there was more than one file found.
+ Unlike during the REDO phase of recovery, we now know
+ if the tablespace is valid according to the dictionary,
+ which was not available then. So if we did not force
+ recovery and there is only one good tablespace, ignore
+ any bad tablespaces. */
+ if (valid_tablespaces_found > 1 || srv_force_recovery > 0) {
+ ib::error() << "Will not open tablespace `"
+ << tablename << "`";
+
+ /* If the file is not open it cannot be valid. */
+ ut_ad(df_default.is_open() || !df_default.is_valid());
+ ut_ad(df_dict.is_open() || !df_dict.is_valid());
+ ut_ad(df_remote.is_open() || !df_remote.is_valid());
+
+ /* Having established that, this is an easy way to
+ look for corrupted data files. */
+ if (df_default.is_open() != df_default.is_valid()
+ || df_dict.is_open() != df_dict.is_valid()
+ || df_remote.is_open() != df_remote.is_valid()) {
+ goto corrupted;
+ }
+error:
+ if (err) *err = DB_ERROR;
+ return NULL;
+ }
+
+ /* There is only one valid tablespace found and we did
+ not use srv_force_recovery during REDO. Use this one
+ tablespace and clean up invalid tablespace pointers */
+ if (df_default.is_open() && !df_default.is_valid()) {
+ df_default.close();
+ tablespaces_found--;
+ }
+
+ if (df_dict.is_open() && !df_dict.is_valid()) {
+ df_dict.close();
+ /* Leave dict.filepath so that SYS_DATAFILES
+ can be corrected below. */
+ tablespaces_found--;
+ }
+
+ if (df_remote.is_open() && !df_remote.is_valid()) {
+ df_remote.close();
+ tablespaces_found--;
+ link_file_is_bad = true;
+ }
+ }
+
+ /* At this point, there should be only one filepath. */
+ ut_a(tablespaces_found == 1);
+ ut_a(valid_tablespaces_found == 1);
+
+ /* Only fix the dictionary at startup when there is only one thread.
+ Calls to dict_load_table() can be done while holding other latches. */
+ if (!fix_dict) {
+ goto skip_validate;
+ }
+
+ /* We may need to update what is stored in SYS_DATAFILES or
+ SYS_TABLESPACES or adjust the link file. Since a failure to
+ update SYS_TABLESPACES or SYS_DATAFILES does not prevent opening
+ and using the tablespace either this time or the next, we do not
+ check the return code or fail to open the tablespace. But if it
+ fails, dict_update_filepath() will issue a warning to the log. */
+ if (df_dict.filepath()) {
+ ut_ad(path_in != NULL);
+ ut_ad(df_dict.same_filepath_as(path_in));
+
+ if (df_remote.is_open()) {
+ if (!df_remote.same_filepath_as(path_in)) {
+ dict_update_filepath(id, df_remote.filepath());
+ }
+
+ } else if (df_default.is_open()) {
+ ut_ad(!dict_filepath_same_as_default);
+ dict_update_filepath(id, df_default.filepath());
+ if (link_file_is_bad) {
+ RemoteDatafile::delete_link_file(
+ tablename.m_name);
+ }
+
+ } else if (!link_file_found || link_file_is_bad) {
+ ut_ad(df_dict.is_open());
+ /* Fix the link file if we got our filepath
+ from the dictionary but a link file did not
+ exist or it did not point to a valid file. */
+ RemoteDatafile::delete_link_file(tablename.m_name);
+ RemoteDatafile::create_link_file(
+ tablename.m_name, df_dict.filepath());
+ }
+
+ } else if (df_remote.is_open()) {
+ if (dict_filepath_same_as_default) {
+ dict_update_filepath(id, df_remote.filepath());
+
+ } else if (path_in == NULL) {
+ /* SYS_DATAFILES record for this space ID
+ was not found. */
+ dict_replace_tablespace_and_filepath(
+ id, tablename.m_name,
+ df_remote.filepath(), flags);
+ }
+
+ } else if (df_default.is_open()) {
+ /* We opened the tablespace in the default location.
+ SYS_DATAFILES.PATH needs to be updated if it is different
+ from this default path or if the SYS_DATAFILES.PATH was not
+ supplied and it should have been. Also update the dictionary
+ if we found an ISL file (since !df_remote.is_open). Since
+ path_in is not supplied for file-per-table, we must assume
+ that it matched the ISL. */
+ if ((path_in != NULL && !dict_filepath_same_as_default)
+ || (path_in == NULL && DICT_TF_HAS_DATA_DIR(flags))
+ || df_remote.filepath() != NULL) {
+ dict_replace_tablespace_and_filepath(
+ id, tablename.m_name, df_default.filepath(),
+ flags);
+ }
+ }
+
+skip_validate:
+ const byte* first_page =
+ df_default.is_open() ? df_default.get_first_page() :
+ df_dict.is_open() ? df_dict.get_first_page() :
+ df_remote.get_first_page();
+
+ fil_space_crypt_t* crypt_data = first_page
+ ? fil_space_read_crypt_data(fil_space_t::zip_size(flags),
+ first_page)
+ : NULL;
+
+ fil_space_t* space = fil_space_t::create(
+ tablename.m_name, id, flags, purpose, crypt_data);
+ if (!space) {
+ goto error;
+ }
+
+ /* We do not measure the size of the file here, which is
+ why we pass 0 below. */
+
+ space->add(
+ df_remote.is_open() ? df_remote.filepath() :
+ df_dict.is_open() ? df_dict.filepath() :
+ df_default.filepath(), OS_FILE_CLOSED, 0, false, true);
+
+ if (validate && !srv_read_only_mode) {
+ df_remote.close();
+ df_dict.close();
+ df_default.close();
+ if (space->acquire()) {
+ if (purpose != FIL_TYPE_IMPORT) {
+ fsp_flags_try_adjust(space, flags
+ & ~FSP_FLAGS_MEM_MASK);
+ }
+ space->release();
+ }
+ }
+
+ if (err) *err = DB_SUCCESS;
+ return space;
+}
+
+/** Looks for a pre-existing fil_space_t with the given tablespace ID
+and, if found, returns the name and filepath in newly allocated buffers
+that the caller must free.
+@param[in] space_id The tablespace ID to search for.
+@param[out] name Name of the tablespace found.
+@param[out] filepath The filepath of the first datafile for the
+tablespace.
+@return true if tablespace is found, false if not. */
+bool
+fil_space_read_name_and_filepath(
+ ulint space_id,
+ char** name,
+ char** filepath)
+{
+ bool success = false;
+ *name = NULL;
+ *filepath = NULL;
+
+ mutex_enter(&fil_system.mutex);
+
+ fil_space_t* space = fil_space_get_by_id(space_id);
+
+ if (space != NULL) {
+ *name = mem_strdup(space->name);
+
+ fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
+ *filepath = mem_strdup(node->name);
+
+ success = true;
+ }
+
+ mutex_exit(&fil_system.mutex);
+
+ return(success);
+}
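+
+/* Editor's note: an illustrative caller sketch, not part of this change.
+It relies only on what is documented above: both out-parameters are
+allocated with mem_strdup() and must be freed by the caller. */
+#if 0 /* example only */
+static void example_report_space(ulint space_id)
+{
+ char* name;
+ char* filepath;
+
+ if (fil_space_read_name_and_filepath(space_id, &name, &filepath)) {
+ ib::info() << "tablespace " << space_id << ": name=" << name
+ << ", first file=" << filepath;
+ }
+
+ /* ut_free() tolerates NULL, so this is safe on failure as well. */
+ ut_free(name);
+ ut_free(filepath);
+}
+#endif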
+
+/** Convert a file name to a tablespace name.
+@param[in] filename directory/databasename/tablename.ibd
+@return database/tablename string, to be freed with ut_free() */
+char*
+fil_path_to_space_name(
+ const char* filename)
+{
+ /* Strip the file name prefix and suffix, leaving
+ only databasename/tablename. */
+ ulint filename_len = strlen(filename);
+ const char* end = filename + filename_len;
+#ifdef HAVE_MEMRCHR
+ const char* tablename = 1 + static_cast<const char*>(
+ memrchr(filename, OS_PATH_SEPARATOR,
+ filename_len));
+ const char* dbname = 1 + static_cast<const char*>(
+ memrchr(filename, OS_PATH_SEPARATOR,
+ tablename - filename - 1));
+#else /* HAVE_MEMRCHR */
+ const char* tablename = filename;
+ const char* dbname = NULL;
+
+ while (const char* t = static_cast<const char*>(
+ memchr(tablename, OS_PATH_SEPARATOR,
+ ulint(end - tablename)))) {
+ dbname = tablename;
+ tablename = t + 1;
+ }
+#endif /* HAVE_MEMRCHR */
+
+ ut_ad(dbname != NULL);
+ ut_ad(tablename > dbname);
+ ut_ad(tablename < end);
+ ut_ad(end - tablename > 4);
+ ut_ad(memcmp(end - 4, DOT_IBD, 4) == 0);
+
+ char* name = mem_strdupl(dbname, ulint(end - dbname) - 4);
+
+ ut_ad(name[tablename - dbname - 1] == OS_PATH_SEPARATOR);
+#if OS_PATH_SEPARATOR != '/'
+ /* space->name uses '/', not OS_PATH_SEPARATOR. */
+ name[tablename - dbname - 1] = '/';
+#endif
+
+ return(name);
+}
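+
+/* Editor's note: example only. Given "./test/t1.ibd" (or the
+equivalent path with '\\' separators on Windows), the function above
+returns the tablespace name "test/t1"; the result is always
+'/'-separated and must be freed with ut_free(). */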
+
+/** Discover the correct IBD file to open given a remote or missing
+filepath from the REDO log. Administrators can move a crashed
+database to another location on the same machine and try to recover it.
+Remote IBD files might be moved as well to the new location.
+ The problem with this is that the REDO log contains the old location,
+which may still be accessible. During recovery, if files are found in
+both locations, we can choose one based on these priorities:
+1. Default location
+2. ISL location
+3. REDO location
+@param[in] space_id tablespace ID
+@param[in] df Datafile object with path from redo
+@return true if a valid datafile was found, false if not */
+static
+bool
+fil_ibd_discover(
+ ulint space_id,
+ Datafile& df)
+{
+ Datafile df_def_per; /* default file-per-table datafile */
+ RemoteDatafile df_rem_per; /* remote file-per-table datafile */
+
+ /* Look for the datafile in the default location. */
+ const char* filename = df.filepath();
+ const char* basename = base_name(filename);
+
+ /* If this datafile is file-per-table it will have a schema dir. */
+ ulint sep_found = 0;
+ const char* db = basename;
+ for (; db > filename && sep_found < 2; db--) {
+ if (db[0] == OS_PATH_SEPARATOR) {
+ sep_found++;
+ }
+ }
+ if (sep_found == 2) {
+ db += 2;
+ df_def_per.init(db, 0);
+ df_def_per.make_filepath(NULL, db, IBD);
+ if (df_def_per.open_read_only(false) == DB_SUCCESS
+ && df_def_per.validate_for_recovery() == DB_SUCCESS
+ && df_def_per.space_id() == space_id) {
+ df.set_filepath(df_def_per.filepath());
+ df.open_read_only(false);
+ return(true);
+ }
+
+ /* Look for a remote file-per-table tablespace. */
+
+ switch (srv_operation) {
+ case SRV_OPERATION_BACKUP:
+ case SRV_OPERATION_RESTORE_DELTA:
+ ut_ad(0);
+ break;
+ case SRV_OPERATION_RESTORE_EXPORT:
+ case SRV_OPERATION_RESTORE:
+ break;
+ case SRV_OPERATION_NORMAL:
+ df_rem_per.set_name(db);
+ if (df_rem_per.open_link_file() != DB_SUCCESS) {
+ break;
+ }
+
+ /* An ISL file was found with contents. */
+ if (df_rem_per.open_read_only(false) != DB_SUCCESS
+ || df_rem_per.validate_for_recovery()
+ != DB_SUCCESS) {
+
+ /* Assume that this ISL file is intended to
+ be used. Do not continue looking for another
+ if this file cannot be opened or is not
+ a valid IBD file. */
+ ib::error() << "ISL file '"
+ << df_rem_per.link_filepath()
+ << "' was found but the linked file '"
+ << df_rem_per.filepath()
+ << "' could not be opened or is"
+ " not correct.";
+ return(false);
+ }
+
+ /* Use this file if it has the space_id from the
+ MLOG record. */
+ if (df_rem_per.space_id() == space_id) {
+ df.set_filepath(df_rem_per.filepath());
+ df.open_read_only(false);
+ return(true);
+ }
+
+ /* Since old MLOG records can use the same basename
+ in multiple CREATE/DROP TABLE sequences, this ISL
+ file could be pointing to a later version of this
+ basename.ibd file which has a different space_id.
+ Keep looking. */
+ }
+ }
+
+ /* No ISL files were found in the default location. Use the location
+ given in the redo log. */
+ if (df.open_read_only(false) == DB_SUCCESS
+ && df.validate_for_recovery() == DB_SUCCESS
+ && df.space_id() == space_id) {
+ return(true);
+ }
+
+ /* A datafile was not discovered for the filename given. */
+ return(false);
+}
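+
+/* Editor's note: example only. If the redo log recorded
+"/old/dir/test/t1.ibd", fil_ibd_discover() first tries the default
+location "./test/t1.ibd", then (in normal server operation) the
+"test/t1.isl" link file, and finally the redo-log path itself. */
+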
+/** Open an ibd tablespace and add it to the InnoDB data structures.
+This is similar to fil_ibd_open() except that it is used while processing
+the REDO log, so the data dictionary is not available and very little
+validation is done. The tablespace name is extracted from the
+dbname/tablename.ibd portion of the filename, which assumes that the file
+is a file-per-table tablespace. Any name will do for now. General
+tablespace names will be read from the dictionary after it has been
+recovered. The tablespace flags are read at this time from the first page
+of the file in validate_for_recovery().
+@param[in] space_id tablespace ID
+@param[in] filename path/to/databasename/tablename.ibd
+@param[out] space the tablespace, or NULL on error
+@return status of the operation */
+enum fil_load_status
+fil_ibd_load(
+ ulint space_id,
+ const char* filename,
+ fil_space_t*& space)
+{
+ /* If a tablespace with this space ID is already in the file
+ system cache, then there is nothing to do. */
+ mutex_enter(&fil_system.mutex);
+ space = fil_space_get_by_id(space_id);
+ mutex_exit(&fil_system.mutex);
+
+ if (space) {
+ /* Compare the filename we are trying to open with the
+ filename from the first node of the tablespace we opened
+ previously. Fail if it is different. */
+ fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
+ if (0 != strcmp(innobase_basename(filename),
+ innobase_basename(node->name))) {
+ ib::info()
+ << "Ignoring data file '" << filename
+ << "' with space ID " << space->id
+ << ". Another data file called " << node->name
+ << " exists with the same space ID.";
+ space = NULL;
+ return(FIL_LOAD_ID_CHANGED);
+ }
+ return(FIL_LOAD_OK);
+ }
+
+ if (srv_operation == SRV_OPERATION_RESTORE) {
+ /* Replace absolute DATA DIRECTORY file paths with
+ short names relative to the backup directory. */
+ if (const char* name = strrchr(filename, OS_PATH_SEPARATOR)) {
+ while (--name > filename
+ && *name != OS_PATH_SEPARATOR);
+ if (name > filename) {
+ filename = name + 1;
+ }
+ }
+ }
+
+ Datafile file;
+ file.set_filepath(filename);
+ file.open_read_only(false);
+
+ if (!file.is_open()) {
+ /* The file has been moved or it is a remote datafile. */
+ if (!fil_ibd_discover(space_id, file)
+ || !file.is_open()) {
+ return(FIL_LOAD_NOT_FOUND);
+ }
+ }
+
+ os_offset_t size;
+
+ /* Read and validate the first page of the tablespace.
+ Assign a tablespace name based on the tablespace type. */
+ switch (file.validate_for_recovery()) {
+ os_offset_t minimum_size;
+ case DB_SUCCESS:
+ if (file.space_id() != space_id) {
+ return(FIL_LOAD_ID_CHANGED);
+ }
+ /* Get and test the file size. */
+ size = os_file_get_size(file.handle());
+
+ /* Every .ibd file is created >= 4 pages in size.
+ Smaller files cannot be OK. */
+ minimum_size = os_offset_t(FIL_IBD_FILE_INITIAL_SIZE)
+ << srv_page_size_shift;
+
+ if (size == static_cast<os_offset_t>(-1)) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+
+ ib::error() << "Could not measure the size of"
+ " single-table tablespace file '"
+ << file.filepath() << "'";
+ } else if (size < minimum_size) {
+ ib::error() << "The size of tablespace file '"
+ << file.filepath() << "' is only " << size
+ << ", should be at least " << minimum_size
+ << "!";
+ } else {
+ /* Everything is fine so far. */
+ break;
+ }
+
+ /* fall through */
+
+ case DB_TABLESPACE_EXISTS:
+ return(FIL_LOAD_INVALID);
+
+ default:
+ return(FIL_LOAD_NOT_FOUND);
+ }
+
+ ut_ad(space == NULL);
+
+ /* Adjust the memory-based flags that would normally be set by
+ dict_tf_to_fsp_flags(). In recovery, we have no data dictionary. */
+ ulint flags = file.flags();
+ if (fil_space_t::is_compressed(flags)) {
+ flags |= page_zip_level
+ << FSP_FLAGS_MEM_COMPRESSION_LEVEL;
+ }
+
+ const byte* first_page = file.get_first_page();
+ fil_space_crypt_t* crypt_data = first_page
+ ? fil_space_read_crypt_data(fil_space_t::zip_size(flags),
+ first_page)
+ : NULL;
+ space = fil_space_t::create(
+ file.name(), space_id, flags, FIL_TYPE_TABLESPACE, crypt_data);
+
+ if (space == NULL) {
+ return(FIL_LOAD_INVALID);
+ }
+
+ ut_ad(space->id == file.space_id());
+ ut_ad(space->id == space_id);
+
+ /* We do not use the size information we have about the file, because
+ the rounding formula for extents and pages is somewhat complex; we
+ let fil_node_open() do that task. */
+
+ space->add(file.filepath(), OS_FILE_CLOSED, 0, false, false);
+
+ return(FIL_LOAD_OK);
+}
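+
+/* Editor's note: an illustrative caller sketch, not part of this
+change. It uses only the fil_load_status values handled above;
+space_id and the path are placeholders. */
+#if 0 /* example only */
+fil_space_t* space;
+
+switch (fil_ibd_load(space_id, "./test/t1.ibd", space)) {
+case FIL_LOAD_OK: /* space now points to the tablespace */
+ break;
+case FIL_LOAD_ID_CHANGED: /* same name, different space ID */
+case FIL_LOAD_NOT_FOUND: /* no datafile could be discovered */
+case FIL_LOAD_INVALID: /* a datafile was found but is unusable */
+ break;
+}
+#endif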
+
+/** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations.
+(Typically when upgrading from MariaDB 10.1.0..10.1.20.)
+@param[in,out] space tablespace
+@param[in] flags desired tablespace flags */
+void fsp_flags_try_adjust(fil_space_t* space, ulint flags)
+{
+ ut_ad(!srv_read_only_mode);
+ ut_ad(fil_space_t::is_valid_flags(flags, space->id));
+ if (space->full_crc32() || fil_space_t::full_crc32(flags)) {
+ return;
+ }
+ if (!space->size && (space->purpose != FIL_TYPE_TABLESPACE
+ || !space->get_size())) {
+ return;
+ }
+ /* This code is executed during server startup while no
+ connections are allowed. We do not need to protect against
+ DROP TABLE by fil_space_acquire(). */
+ mtr_t mtr;
+ mtr.start();
+ if (buf_block_t* b = buf_page_get(
+ page_id_t(space->id, 0), space->zip_size(),
+ RW_X_LATCH, &mtr)) {
+ uint32_t f = fsp_header_get_flags(b->frame);
+ if (fil_space_t::full_crc32(f)) {
+ goto func_exit;
+ }
+ if (fil_space_t::is_flags_equal(f, flags)) {
+ goto func_exit;
+ }
+ /* Suppress the message if only the DATA_DIR flag differs. */
+ if ((f ^ flags) & ~(1U << FSP_FLAGS_POS_RESERVED)) {
+ ib::warn()
+ << "adjusting FSP_SPACE_FLAGS of file '"
+ << UT_LIST_GET_FIRST(space->chain)->name
+ << "' from " << ib::hex(f)
+ << " to " << ib::hex(flags);
+ }
+ mtr.set_named_space(space);
+ mtr.write<4,mtr_t::FORCED>(*b,
+ FSP_HEADER_OFFSET + FSP_SPACE_FLAGS
+ + b->frame, flags);
+ }
+func_exit:
+ mtr.commit();
+}
+
+/** Determine if a matching tablespace exists in the InnoDB tablespace
+memory cache. Note that if we have not done a crash recovery at the database
+startup, there may be many tablespaces which are not yet in the memory cache.
+@param[in] id Tablespace ID
+@param[in] name Tablespace name used in fil_space_t::create().
+@param[in] table_flags table flags
+@return the tablespace
+@retval NULL if no matching tablespace exists in the memory cache */
+fil_space_t*
+fil_space_for_table_exists_in_mem(
+ ulint id,
+ const char* name,
+ ulint table_flags)
+{
+ const ulint expected_flags = dict_tf_to_fsp_flags(table_flags);
+
+ mutex_enter(&fil_system.mutex);
+ if (fil_space_t* space = fil_space_get_by_id(id)) {
+ ulint tf = expected_flags & ~FSP_FLAGS_MEM_MASK;
+ ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK;
+
+ if (!fil_space_t::is_flags_equal(tf, sf)
+ && !fil_space_t::is_flags_equal(sf, tf)) {
+ goto func_exit;
+ }
+
+ if (strcmp(space->name, name)) {
+ ib::error() << "Table " << name
+ << " in InnoDB data dictionary"
+ " has tablespace id " << id
+ << ", but the tablespace"
+ " with that id has name " << space->name << "."
+ " Have you deleted or moved .ibd files?";
+ ib::info() << TROUBLESHOOT_DATADICT_MSG;
+ goto func_exit;
+ }
+
+ /* Adjust the flags that are in FSP_FLAGS_MEM_MASK.
+ FSP_SPACE_FLAGS will not be written back here. */
+ space->flags = (space->flags & ~FSP_FLAGS_MEM_MASK)
+ | (expected_flags & FSP_FLAGS_MEM_MASK);
+ mutex_exit(&fil_system.mutex);
+ if (!srv_read_only_mode) {
+ fsp_flags_try_adjust(space, expected_flags
+ & ~FSP_FLAGS_MEM_MASK);
+ }
+ return space;
+ }
+
+func_exit:
+ mutex_exit(&fil_system.mutex);
+ return NULL;
+}
+
+/*============================ FILE I/O ================================*/
+
+/** Report information about an invalid page access. */
+ATTRIBUTE_COLD __attribute__((noreturn))
+static void
+fil_report_invalid_page_access(const char *name,
+ os_offset_t offset, ulint len, bool is_read)
+{
+ ib::fatal() << "Trying to " << (is_read ? "read " : "write ") << len
+ << " bytes at " << offset
+ << " outside the bounds of the file: " << name;
+}
+
+
+/** Update the data structures on write completion */
+inline void fil_node_t::complete_write()
+{
+ ut_ad(!mutex_own(&fil_system.mutex));
+
+ if (space->purpose != FIL_TYPE_TEMPORARY &&
+ srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC &&
+ space->set_needs_flush())
+ {
+ mutex_enter(&fil_system.mutex);
+ if (!space->is_in_unflushed_spaces)
+ {
+ space->is_in_unflushed_spaces= true;
+ fil_system.unflushed_spaces.push_front(*space);
+ }
+ mutex_exit(&fil_system.mutex);
+ }
+}
+
+/** Read or write data.
+@param type I/O context
+@param offset offset in bytes
+@param len number of bytes
+@param buf the data to be read or written
+@param bpage buffer block (for type.is_async() completion callback)
+@return status and file descriptor */
+fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len,
+ void *buf, buf_page_t *bpage)
+{
+ ut_ad(referenced());
+ ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad((len % OS_FILE_LOG_BLOCK_SIZE) == 0);
+ ut_ad(fil_validate_skip());
+ ut_ad(type.is_read() || type.is_write());
+ ut_ad(type.type != IORequest::DBLWR_BATCH);
+
+ if (type.is_read()) {
+ srv_stats.data_read.add(len);
+ } else {
+ ut_ad(!srv_read_only_mode || this == fil_system.temp_space);
+ srv_stats.data_written.add(len);
+ }
+
+ fil_node_t* node= UT_LIST_GET_FIRST(chain);
+ ut_ad(node);
+
+ if (type.type == IORequest::READ_ASYNC && is_stopping()
+ && !is_being_truncated) {
+ release();
+ return {DB_TABLESPACE_DELETED, nullptr};
+ }
+
+ ulint p = static_cast<ulint>(offset >> srv_page_size_shift);
+
+ if (UNIV_LIKELY_NULL(UT_LIST_GET_NEXT(chain, node))) {
+ ut_ad(this == fil_system.sys_space
+ || this == fil_system.temp_space);
+ ut_ad(!(offset & ((1 << srv_page_size_shift) - 1)));
+
+ while (node->size <= p) {
+ p -= node->size;
+ node = UT_LIST_GET_NEXT(chain, node);
+ if (!node) {
+ if (type.type == IORequest::READ_ASYNC) {
+ release();
+ return {DB_ERROR, nullptr};
+ }
+ fil_report_invalid_page_access(name, offset,
+ len,
+ type.is_read());
+ }
+ }
+
+ offset = os_offset_t{p} << srv_page_size_shift;
+ }
+
+ if (UNIV_UNLIKELY(node->size <= p)) {
+ if (type.type == IORequest::READ_ASYNC) {
+ release();
+ /* If we can tolerate the non-existent pages, we
+ should return DB_ERROR and let the caller decide
+ what to do. */
+ return {DB_ERROR, nullptr};
+ }
+
+ fil_report_invalid_page_access(
+ node->name, offset, len, type.is_read());
+ }
+
+ dberr_t err;
+
+ if (type.type == IORequest::PUNCH_RANGE) {
+ err = os_file_punch_hole(node->handle, offset, len);
+ /* Punch hole is not supported; mark the tablespace
+ as not supporting punch hole. */
+ if (UNIV_UNLIKELY(err == DB_IO_NO_PUNCH_HOLE)) {
+ punch_hole = false;
+ err = DB_SUCCESS;
+ }
+ goto release_sync_write;
+ } else {
+ /* Queue the aio request */
+ err = os_aio(IORequest(bpage, node, type.type),
+ buf, offset, len);
+ }
+
+ /* We can try to recover the page from the doublewrite buffer if
+ the decompression fails or the page is corrupt. */
+
+ ut_a(type.type == IORequest::DBLWR_RECOVER || err == DB_SUCCESS);
+ if (!type.is_async()) {
+ if (type.is_write()) {
+release_sync_write:
+ node->complete_write();
+release:
+ release();
+ }
+ ut_ad(fil_validate_skip());
+ }
+ if (err != DB_SUCCESS) {
+ goto release;
+ }
+ return {err, node};
+}
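+
+/* Editor's note: an illustrative synchronous read, not part of this
+change. It assumes IORequestRead denotes a synchronous read request,
+that fil_io_t exposes the status as result.err, and that the caller
+already holds a reference to the tablespace, as asserted by
+referenced() above. */
+#if 0 /* example only */
+byte* buf = static_cast<byte*>(
+ aligned_malloc(srv_page_size, srv_page_size));
+
+/* Read page number 3 of this tablespace. */
+fil_io_t result = space->io(IORequestRead,
+ os_offset_t{3} << srv_page_size_shift,
+ srv_page_size, buf);
+
+if (result.err == DB_SUCCESS) {
+ /* buf now holds the page image */
+}
+
+aligned_free(buf);
+#endif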
+
+#include <tpool.h>
+
+/** Callback for AIO completion */
+void fil_aio_callback(const IORequest &request)
+{
+ ut_ad(fil_validate_skip());
+ ut_ad(request.node);
+
+ if (!request.bpage)
+ {
+ ut_ad(!srv_read_only_mode);
+ if (request.type == IORequest::DBLWR_BATCH)
+ buf_dblwr.flush_buffered_writes_completed(request);
+ else
+ ut_ad(request.type == IORequest::WRITE_ASYNC);
+write_completed:
+ request.node->complete_write();
+ }
+ else if (request.is_write())
+ {
+ buf_page_write_complete(request);
+ goto write_completed;
+ }
+ else
+ {
+ ut_ad(request.is_read());
+
+ /* IMPORTANT: since i/o handling for reads will read also the insert
+ buffer in fil_system.sys_space, we have to be very careful not to
+ introduce deadlocks. We never close fil_system.sys_space data
+ files and never issue asynchronous reads of change buffer pages. */
+ const page_id_t id(request.bpage->id());
+
+ if (dberr_t err= buf_page_read_complete(request.bpage, *request.node))
+ {
+ if (recv_recovery_is_on() && !srv_force_recovery)
+ recv_sys.found_corrupt_fs= true;
+
+ ib::error() << "Failed to read page " << id.page_no()
+ << " from file '" << request.node->name << "': " << err;
+ }
+ }
+
+ request.node->space->release();
+}
+
+/** Flush to disk the writes in file spaces of the given type
+possibly cached by the OS. */
+void fil_flush_file_spaces()
+{
+ if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
+ {
+ ut_d(mutex_enter(&fil_system.mutex));
+ ut_ad(fil_system.unflushed_spaces.empty());
+ ut_d(mutex_exit(&fil_system.mutex));
+ return;
+ }
+
+rescan:
+ mutex_enter(&fil_system.mutex);
+
+ for (fil_space_t &space : fil_system.unflushed_spaces)
+ {
+ if (space.needs_flush_not_stopping())
+ {
+ space.reacquire();
+ mutex_exit(&fil_system.mutex);
+ space.flush_low();
+ space.release();
+ goto rescan;
+ }
+ }
+
+ mutex_exit(&fil_system.mutex);
+}
+
+/** Functor to validate the file node list of a tablespace. */
+struct Check {
+ /** Total size of file nodes visited so far */
+ ulint size;
+ /** Total number of open files visited so far */
+ ulint n_open;
+
+ /** Constructor */
+ Check() : size(0), n_open(0) {}
+
+ /** Visit a file node
+ @param[in] elem file node to visit */
+ void operator()(const fil_node_t* elem)
+ {
+ n_open += elem->is_open();
+ size += elem->size;
+ }
+
+ /** Validate a tablespace.
+ @param[in] space tablespace to validate
+ @return number of open file nodes */
+ static ulint validate(const fil_space_t* space)
+ {
+ ut_ad(mutex_own(&fil_system.mutex));
+ Check check;
+ ut_list_validate(space->chain, check);
+ ut_a(space->size == check.size);
+
+ switch (space->id) {
+ case TRX_SYS_SPACE:
+ ut_ad(fil_system.sys_space == NULL
+ || fil_system.sys_space == space);
+ break;
+ case SRV_TMP_SPACE_ID:
+ ut_ad(fil_system.temp_space == NULL
+ || fil_system.temp_space == space);
+ break;
+ default:
+ break;
+ }
+
+ return(check.n_open);
+ }
+};
+
+/******************************************************************//**
+Checks the consistency of the tablespace cache.
+@return true if ok */
+bool fil_validate()
+{
+ ulint n_open = 0;
+
+ mutex_enter(&fil_system.mutex);
+
+ for (fil_space_t *space = UT_LIST_GET_FIRST(fil_system.space_list);
+ space != NULL;
+ space = UT_LIST_GET_NEXT(space_list, space)) {
+ n_open += Check::validate(space);
+ }
+
+ ut_a(fil_system.n_open == n_open);
+
+ mutex_exit(&fil_system.mutex);
+
+ return(true);
+}
+
+/*********************************************************************//**
+Sets the file page type. */
+void
+fil_page_set_type(
+/*==============*/
+ byte* page, /*!< in/out: file page */
+ ulint type) /*!< in: type */
+{
+ ut_ad(page);
+
+ mach_write_to_2(page + FIL_PAGE_TYPE, type);
+}
+
+/********************************************************************//**
+Delete the tablespace file and any related files like .cfg.
+This should not be called for temporary tables.
+@param[in] ibd_filepath File path of the IBD tablespace */
+void
+fil_delete_file(
+/*============*/
+ const char* ibd_filepath)
+{
+ /* Force a delete of any stale .ibd files that are lying around. */
+
+ ib::info() << "Deleting " << ibd_filepath;
+ os_file_delete_if_exists(innodb_data_file_key, ibd_filepath, NULL);
+
+ char* cfg_filepath = fil_make_filepath(
+ ibd_filepath, NULL, CFG, false);
+ if (cfg_filepath != NULL) {
+ os_file_delete_if_exists(
+ innodb_data_file_key, cfg_filepath, NULL);
+ ut_free(cfg_filepath);
+ }
+}
+
+#ifdef UNIV_DEBUG
+/** Check that a tablespace is valid for mtr_commit().
+@param[in] space persistent tablespace that has been changed */
+static
+void
+fil_space_validate_for_mtr_commit(
+ const fil_space_t* space)
+{
+ ut_ad(!mutex_own(&fil_system.mutex));
+ ut_ad(space != NULL);
+ ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
+ ut_ad(!is_predefined_tablespace(space->id));
+
+ /* We are serving mtr_commit(). While there is an active
+ mini-transaction, we should have !space->stop_new_ops. This is
+ guaranteed by meta-data locks or transactional locks, or
+ dict_sys.latch (X-lock in DROP, S-lock in purge). */
+ ut_ad(!space->is_stopping()
+ || space->is_being_truncated /* fil_truncate_prepare() */
+ || space->referenced());
+}
+#endif /* UNIV_DEBUG */
+
+/** Write a FILE_MODIFY record for a persistent tablespace.
+@param[in] space tablespace
+@param[in,out] mtr mini-transaction */
+static
+void
+fil_names_write(
+ const fil_space_t* space,
+ mtr_t* mtr)
+{
+ ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
+ fil_name_write(space->id, UT_LIST_GET_FIRST(space->chain)->name, mtr);
+}
+
+/** Note that a non-predefined persistent tablespace has been modified
+by redo log.
+@param[in,out] space tablespace */
+void
+fil_names_dirty(
+ fil_space_t* space)
+{
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ ut_ad(recv_recovery_is_on());
+ ut_ad(log_sys.get_lsn() != 0);
+ ut_ad(space->max_lsn == 0);
+ ut_d(fil_space_validate_for_mtr_commit(space));
+
+ UT_LIST_ADD_LAST(fil_system.named_spaces, space);
+ space->max_lsn = log_sys.get_lsn();
+}
+
+/** Write FILE_MODIFY records when a non-predefined persistent
+tablespace was modified for the first time since the latest
+fil_names_clear().
+@param[in,out] space tablespace */
+void fil_names_dirty_and_write(fil_space_t* space)
+{
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ ut_d(fil_space_validate_for_mtr_commit(space));
+ ut_ad(space->max_lsn == log_sys.get_lsn());
+
+ UT_LIST_ADD_LAST(fil_system.named_spaces, space);
+ mtr_t mtr;
+ mtr.start();
+ fil_names_write(space, &mtr);
+
+ DBUG_EXECUTE_IF("fil_names_write_bogus",
+ {
+ char bogus_name[] = "./test/bogus file.ibd";
+ os_normalize_path(bogus_name);
+ fil_name_write(
+ SRV_SPACE_ID_UPPER_BOUND,
+ bogus_name, &mtr);
+ });
+
+ mtr.commit_files();
+}
+
+/** On a log checkpoint, reset fil_names_dirty_and_write() flags
+and write out FILE_MODIFY and FILE_CHECKPOINT if needed.
+@param[in] lsn checkpoint LSN
+@param[in] do_write whether to always write FILE_CHECKPOINT
+@return whether anything was written to the redo log
+@retval false if no flags were set and nothing written
+@retval true if anything was written to the redo log */
+bool
+fil_names_clear(
+ lsn_t lsn,
+ bool do_write)
+{
+ mtr_t mtr;
+ ulint mtr_checkpoint_size = RECV_SCAN_SIZE - 1;
+
+ DBUG_EXECUTE_IF(
+ "increase_mtr_checkpoint_size",
+ mtr_checkpoint_size = 75 * 1024;
+ );
+
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ ut_ad(lsn);
+
+ mtr.start();
+
+ for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.named_spaces);
+ space != NULL; ) {
+ if (mtr.get_log()->size()
+ + (3 + 5 + 1) + strlen(space->chain.start->name)
+ >= mtr_checkpoint_size) {
+ /* Prevent log parse buffer overflow */
+ mtr.commit_files();
+ mtr.start();
+ }
+
+ fil_space_t* next = UT_LIST_GET_NEXT(named_spaces, space);
+
+ ut_ad(space->max_lsn > 0);
+ if (space->max_lsn < lsn) {
+ /* The tablespace was last dirtied before the
+ checkpoint LSN. Remove it from the list, so
+ that if the tablespace is not going to be
+ modified any more, subsequent checkpoints will
+ avoid calling fil_names_write() on it. */
+ space->max_lsn = 0;
+ UT_LIST_REMOVE(fil_system.named_spaces, space);
+ }
+
+ /* max_lsn is the last LSN where fil_names_dirty_and_write()
+ was called. If we kept track of "min_lsn" (the first LSN
+ where max_lsn turned nonzero), we could avoid the
+ fil_names_write() call if min_lsn > lsn. */
+
+ fil_names_write(space, &mtr);
+ do_write = true;
+
+ space = next;
+ }
+
+ if (do_write) {
+ mtr.commit_files(lsn);
+ } else {
+ ut_ad(!mtr.has_modifications());
+ }
+
+ return(do_write);
+}
+
+/* Unit Tests */
+#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
+#define MF fil_make_filepath
+#define DISPLAY ib::info() << path
+void
+test_make_filepath()
+{
+ char* path;
+ const char* long_path =
+ "this/is/a/very/long/path/including/a/very/"
+ "looooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooong"
+ "/folder/name";
+ path = MF("/this/is/a/path/with/a/filename", NULL, IBD, false); DISPLAY;
+ path = MF("/this/is/a/path/with/a/filename", NULL, ISL, false); DISPLAY;
+ path = MF("/this/is/a/path/with/a/filename", NULL, CFG, false); DISPLAY;
+ path = MF("/this/is/a/path/with/a/filename.ibd", NULL, IBD, false); DISPLAY;
+ path = MF("/this/is/a/path/with/a/filename.ibd", NULL, IBD, false); DISPLAY;
+ path = MF("/this/is/a/path/with/a/filename.dat", NULL, IBD, false); DISPLAY;
+ path = MF(NULL, "tablespacename", NO_EXT, false); DISPLAY;
+ path = MF(NULL, "tablespacename", IBD, false); DISPLAY;
+ path = MF(NULL, "dbname/tablespacename", NO_EXT, false); DISPLAY;
+ path = MF(NULL, "dbname/tablespacename", IBD, false); DISPLAY;
+ path = MF(NULL, "dbname/tablespacename", ISL, false); DISPLAY;
+ path = MF(NULL, "dbname/tablespacename", CFG, false); DISPLAY;
+ path = MF(NULL, "dbname\\tablespacename", NO_EXT, false); DISPLAY;
+ path = MF(NULL, "dbname\\tablespacename", IBD, false); DISPLAY;
+ path = MF("/this/is/a/path", "dbname/tablespacename", IBD, false); DISPLAY;
+ path = MF("/this/is/a/path", "dbname/tablespacename", IBD, true); DISPLAY;
+ path = MF("./this/is/a/path", "dbname/tablespacename.ibd", IBD, true); DISPLAY;
+ path = MF("this\\is\\a\\path", "dbname/tablespacename", IBD, true); DISPLAY;
+ path = MF("/this/is/a/path", "dbname\\tablespacename", IBD, true); DISPLAY;
+ path = MF(long_path, NULL, IBD, false); DISPLAY;
+ path = MF(long_path, "tablespacename", IBD, false); DISPLAY;
+ path = MF(long_path, "tablespacename", IBD, true); DISPLAY;
+}
+#endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */
+/* @} */
+
+/** Determine the block size of the data file.
+@param[in] space tablespace
+@param[in] offset page number
+@return block size */
+UNIV_INTERN
+ulint
+fil_space_get_block_size(const fil_space_t* space, unsigned offset)
+{
+ ulint block_size = 512;
+
+ for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
+ node != NULL;
+ node = UT_LIST_GET_NEXT(chain, node)) {
+ block_size = node->block_size;
+ if (node->size > offset) {
+ ut_ad(node->size <= 0xFFFFFFFFU);
+ break;
+ }
+ offset -= static_cast<unsigned>(node->size);
+ }
+
+ /* We currently support block sizes up to 4K;
+ fall back to the default if a larger one is encountered. */
+ if (block_size > 4096) {
+ block_size = 512;
+ }
+
+ return block_size;
+}
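+
+/* Editor's note: a worked example of the lookup above. With two file
+nodes of 100 pages each, offset 150 is beyond the first node
+(100 <= 150), so 100 is subtracted and the search continues; the
+second node covers the remaining offset 50, and its block_size is
+returned (or 512 if it exceeds 4096). */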
diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc
new file mode 100644
index 00000000..909e8092
--- /dev/null
+++ b/storage/innobase/fil/fil0pagecompress.cc
@@ -0,0 +1,613 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fil/fil0pagecompress.cc
+Implementation for page compressed file spaces.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@mariadb.com
+Updated 14/02/2015
+***********************************************************************/
+
+#include "fil0fil.h"
+#include "fil0pagecompress.h"
+
+#include <my_dbug.h>
+
+#include "mem0mem.h"
+#include "hash0hash.h"
+#include "os0file.h"
+#include "mach0data.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "log0recv.h"
+#include "fsp0fsp.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+#include "row0mysql.h"
+#include "buf0lru.h"
+#include "ibuf0ibuf.h"
+#include "sync0sync.h"
+#include "zlib.h"
+#ifdef __linux__
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#endif
+#include "row0mysql.h"
+#ifdef HAVE_LZ4
+#include "lz4.h"
+#endif
+#ifdef HAVE_LZO
+#include "lzo/lzo1x.h"
+#endif
+#ifdef HAVE_LZMA
+#include "lzma.h"
+#endif
+#ifdef HAVE_BZIP2
+#include "bzlib.h"
+#endif
+#ifdef HAVE_SNAPPY
+#include "snappy-c.h"
+#endif
+
+/** Compress a page for the given compression algorithm.
+@param[in] buf page to be compressed
+@param[out] out_buf compressed page
+@param[in] header_len header length of the page
+@param[in] comp_algo compression algorithm
+@param[in] comp_level compression level
+@return actual length of compressed page data
+@retval 0 if the page was not compressed */
+static ulint fil_page_compress_low(
+ const byte* buf,
+ byte* out_buf,
+ ulint header_len,
+ ulint comp_algo,
+ unsigned comp_level)
+{
+ ulint write_size = srv_page_size - header_len;
+
+ switch (comp_algo) {
+ default:
+ ut_ad("unknown compression method" == 0);
+ /* fall through */
+ case PAGE_UNCOMPRESSED:
+ return 0;
+ case PAGE_ZLIB_ALGORITHM:
+ {
+ ulong len = uLong(write_size);
+ if (Z_OK == compress2(
+ out_buf + header_len, &len, buf,
+ uLong(srv_page_size), int(comp_level))) {
+ return len;
+ }
+ }
+ break;
+#ifdef HAVE_LZ4
+ case PAGE_LZ4_ALGORITHM:
+# ifdef HAVE_LZ4_COMPRESS_DEFAULT
+ write_size = LZ4_compress_default(
+ reinterpret_cast<const char*>(buf),
+ reinterpret_cast<char*>(out_buf) + header_len,
+ int(srv_page_size), int(write_size));
+# else
+ write_size = LZ4_compress_limitedOutput(
+ reinterpret_cast<const char*>(buf),
+ reinterpret_cast<char*>(out_buf) + header_len,
+ int(srv_page_size), int(write_size));
+# endif
+
+ return write_size;
+#endif /* HAVE_LZ4 */
+#ifdef HAVE_LZO
+ case PAGE_LZO_ALGORITHM: {
+ lzo_uint len = write_size;
+
+ if (LZO_E_OK == lzo1x_1_15_compress(
+ buf, srv_page_size,
+ out_buf + header_len, &len,
+ out_buf + srv_page_size)
+ && len <= write_size) {
+ return len;
+ }
+ break;
+ }
+#endif /* HAVE_LZO */
+#ifdef HAVE_LZMA
+ case PAGE_LZMA_ALGORITHM: {
+ size_t out_pos = 0;
+
+ if (LZMA_OK == lzma_easy_buffer_encode(
+ comp_level, LZMA_CHECK_NONE, NULL,
+ buf, srv_page_size, out_buf + header_len,
+ &out_pos, write_size)
+ && out_pos <= write_size) {
+ return out_pos;
+ }
+ break;
+ }
+#endif /* HAVE_LZMA */
+
+#ifdef HAVE_BZIP2
+ case PAGE_BZIP2_ALGORITHM: {
+ unsigned len = unsigned(write_size);
+ if (BZ_OK == BZ2_bzBuffToBuffCompress(
+ reinterpret_cast<char*>(out_buf + header_len),
+ &len,
+ const_cast<char*>(
+ reinterpret_cast<const char*>(buf)),
+ unsigned(srv_page_size), 1, 0, 0)
+ && len <= write_size) {
+ return len;
+ }
+ break;
+ }
+#endif /* HAVE_BZIP2 */
+
+#ifdef HAVE_SNAPPY
+ case PAGE_SNAPPY_ALGORITHM: {
+ size_t len = snappy_max_compressed_length(srv_page_size);
+
+ if (SNAPPY_OK == snappy_compress(
+ reinterpret_cast<const char*>(buf),
+ srv_page_size,
+ reinterpret_cast<char*>(out_buf) + header_len,
+ &len)
+ && len <= write_size) {
+ return len;
+ }
+ break;
+ }
+#endif /* HAVE_SNAPPY */
+ }
+
+ return 0;
+}
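+
+/* Editor's note: a self-contained sketch of the zlib branch above,
+not part of this change. As in fil_page_compress_low(), failure to
+fit within the available space is reported as "not compressed"
+(return 0) so that the caller can store the page uncompressed. */
+#if 0 /* example only */
+#include <zlib.h>
+
+static size_t example_zlib_compress(
+ const unsigned char* page, /* srv_page_size bytes of input */
+ size_t page_size,
+ unsigned char* out, /* at least out_capacity bytes */
+ size_t out_capacity, /* srv_page_size - header_len */
+ int level) /* 1..9 */
+{
+ uLongf len = uLongf(out_capacity);
+
+ if (compress2(out, &len, page, uLong(page_size), level) != Z_OK) {
+ return 0; /* did not fit, or a zlib error */
+ }
+
+ return size_t(len); /* actual compressed length */
+}
+#endif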
+
+/** Compress a page_compressed page for full crc32 format.
+@param[in] buf page to be compressed
+@param[out] out_buf compressed page
+@param[in] flags tablespace flags
+@param[in] block_size file system block size
+@param[in] encrypted whether the page will be subsequently encrypted
+@return actual length of compressed page
+@retval 0 if the page was not compressed */
+static ulint fil_page_compress_for_full_crc32(
+ const byte* buf,
+ byte* out_buf,
+ ulint flags,
+ ulint block_size,
+ bool encrypted)
+{
+ ulint comp_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags);
+
+ if (comp_level == 0) {
+ comp_level = page_zip_level;
+ }
+
+ const ulint header_len = FIL_PAGE_COMP_ALGO;
+
+ ulint write_size = fil_page_compress_low(
+ buf, out_buf, header_len,
+ fil_space_t::get_compression_algo(flags),
+ static_cast<unsigned>(comp_level));
+
+ if (write_size == 0) {
+fail:
+ srv_stats.pages_page_compression_error.inc();
+ return 0;
+ }
+
+ write_size += header_len;
+ const ulint actual_size = write_size;
+ /* Write the actual length of the data & page type
+ for full crc32 format. */
+ const bool lsb = fil_space_t::full_crc32_page_compressed_len(flags);
+ /* In the MSB, store the rounded-up page size. */
+ write_size = (write_size + lsb + (4 + 255)) & ~255;
+ if (write_size >= srv_page_size) {
+ goto fail;
+ }
+
+ /* Set up the page header */
+ memcpy(out_buf, buf, header_len);
+ out_buf[FIL_PAGE_TYPE] = 1U << (FIL_PAGE_COMPRESS_FCRC32_MARKER - 8);
+ out_buf[FIL_PAGE_TYPE + 1] = byte(write_size >> 8);
+ /* Clean up the buffer for the remaining write_size (except checksum) */
+ memset(out_buf + actual_size, 0, write_size - actual_size - 4);
+ if (lsb) {
+ /* Store the LSB */
+ out_buf[write_size - 5] = byte(actual_size + (1 + 4));
+ }
+
+ if (!block_size) {
+ block_size = 512;
+ }
+
+ ut_ad(write_size);
+ if (write_size & (block_size - 1)) {
+ size_t tmp = write_size;
+ write_size = (write_size + (block_size - 1))
+ & ~(block_size - 1);
+ memset(out_buf + tmp, 0, write_size - tmp);
+ }
+
+ srv_stats.page_compression_saved.add(srv_page_size - write_size);
+ srv_stats.pages_page_compressed.inc();
+
+ return write_size;
+}
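+
+/* Editor's note: a worked example of the rounding above. With
+header_len + compressed payload = 5000 bytes and the trailing LSB
+byte present (lsb == 1), write_size becomes
+(5000 + 1 + 4 + 255) & ~255 = 5120: the next multiple of 256 that
+still leaves room for the 4-byte checksum. */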
+
+/** Compress a page_compressed page for non full crc32 format.
+@param[in] buf page to be compressed
+@param[out] out_buf compressed page
+@param[in] flags tablespace flags
+@param[in] block_size file system block size
+@param[in] encrypted whether the page will be subsequently encrypted
+@return actual length of compressed page
+@retval 0 if the page was not compressed */
+static ulint fil_page_compress_for_non_full_crc32(
+ const byte* buf,
+ byte* out_buf,
+ ulint flags,
+ ulint block_size,
+ bool encrypted)
+{
+ uint comp_level = static_cast<uint>(
+ FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags));
+ ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN;
+ /* Cache the value, since the global setting may change concurrently. */
+ ulint comp_algo = innodb_compression_algorithm;
+
+ if (encrypted) {
+ header_len += FIL_PAGE_ENCRYPT_COMP_ALGO;
+ }
+
+ /* If no compression level was provided to this table, use system
+ default level */
+ if (comp_level == 0) {
+ comp_level = page_zip_level;
+ }
+
+ ulint write_size = fil_page_compress_low(
+ buf, out_buf,
+ header_len, comp_algo, comp_level);
+
+ if (write_size == 0) {
+ srv_stats.pages_page_compression_error.inc();
+ return 0;
+ }
+
+ /* Set up the page header */
+ memcpy(out_buf, buf, FIL_PAGE_DATA);
+ /* Set up the checksum */
+ mach_write_to_4(out_buf + FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC);
+
+ /* Set up the compression algorithm */
+ mach_write_to_8(out_buf + FIL_PAGE_COMP_ALGO, comp_algo);
+
+ if (encrypted) {
+ /* Set up the correct page type */
+ mach_write_to_2(out_buf + FIL_PAGE_TYPE,
+ FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
+
+ mach_write_to_2(out_buf + FIL_PAGE_DATA
+ + FIL_PAGE_ENCRYPT_COMP_ALGO, comp_algo);
+ } else {
+ /* Set up the correct page type */
+ mach_write_to_2(out_buf + FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED);
+ }
+
+ /* Set up the actual payload length */
+ mach_write_to_2(out_buf + FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE,
+ write_size);
+
+ ut_ad(mach_read_from_4(out_buf + FIL_PAGE_SPACE_OR_CHKSUM)
+ == BUF_NO_CHECKSUM_MAGIC);
+
+ ut_ad(mach_read_from_2(out_buf + FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE)
+ == write_size);
+
+#ifdef UNIV_DEBUG
+ bool is_compressed = (mach_read_from_8(out_buf + FIL_PAGE_COMP_ALGO)
+ == (ulint) comp_algo);
+
+ bool is_encrypted_compressed =
+ (mach_read_from_2(out_buf + FIL_PAGE_DATA
+ + FIL_PAGE_ENCRYPT_COMP_ALGO)
+ == (ulint) comp_algo);
+#endif /* UNIV_DEBUG */
+
+ ut_ad(is_compressed || is_encrypted_compressed);
+
+ write_size+=header_len;
+
+ if (block_size <= 0) {
+ block_size = 512;
+ }
+
+ ut_ad(write_size > 0 && block_size > 0);
+
+ /* The actual write needs to be aligned on the block size */
+ if (write_size % block_size) {
+ size_t tmp = write_size;
+ write_size = (size_t)ut_uint64_align_up(
+ (ib_uint64_t)write_size, block_size);
+ /* Clean up the end of buffer */
+ memset(out_buf+tmp, 0, write_size - tmp);
+#ifdef UNIV_DEBUG
+ ut_a(write_size > 0 && ((write_size % block_size) == 0));
+ ut_a(write_size >= tmp);
+#endif
+ }
+
+ srv_stats.page_compression_saved.add(srv_page_size - write_size);
+ srv_stats.pages_page_compressed.inc();
+
+ return write_size;
+}
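+
+/* Editor's note: a worked example of the alignment above. With
+write_size = 5100 and a 512-byte file system block, the buffer is
+zero-padded up to ut_uint64_align_up(5100, 512) = 5120 so that the
+write is block-aligned. */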
+
+/** Compress a page_compressed page before writing to a data file.
+@param[in] buf page to be compressed
+@param[out] out_buf compressed page
+@param[in] flags tablespace flags
+@param[in] block_size file system block size
+@param[in] encrypted whether the page will be subsequently encrypted
+@return actual length of compressed page
+@retval 0 if the page was not compressed */
+ulint fil_page_compress(
+ const byte* buf,
+ byte* out_buf,
+ ulint flags,
+ ulint block_size,
+ bool encrypted)
+{
+ /* The full_crc32 page_compressed format assumes this. */
+ ut_ad(!(block_size & 255));
+ ut_ad(ut_is_2pow(block_size));
+
+ /* Let's not compress file space header or
+ extent descriptor */
+ switch (fil_page_get_type(buf)) {
+ case 0:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ case FIL_PAGE_PAGE_COMPRESSED:
+ return 0;
+ }
+
+ if (fil_space_t::full_crc32(flags)) {
+ return fil_page_compress_for_full_crc32(
+ buf, out_buf, flags, block_size, encrypted);
+ }
+
+ return fil_page_compress_for_non_full_crc32(
+ buf, out_buf, flags, block_size, encrypted);
+}
+
+/** Decompress a page that may be subject to page_compressed compression.
+@param[in,out] tmp_buf temporary buffer (of innodb_page_size)
+@param[in,out] buf possibly compressed page buffer
+@param[in] comp_algo compression algorithm
+@param[in] header_len header length of the page
+@param[in] actual_size actual size of the compressed data
+@return true if the page was decompressed, false on failure */
+static bool fil_page_decompress_low(
+ byte* tmp_buf,
+ byte* buf,
+ ulint comp_algo,
+ ulint header_len,
+ ulint actual_size)
+{
+ switch (comp_algo) {
+ default:
+ ib::error() << "Unknown compression algorithm "
+ << comp_algo;
+ return false;
+ case PAGE_ZLIB_ALGORITHM:
+ {
+ uLong len = srv_page_size;
+ return (Z_OK == uncompress(tmp_buf, &len,
+ buf + header_len,
+ uLong(actual_size))
+ && len == srv_page_size);
+ }
+#ifdef HAVE_LZ4
+ case PAGE_LZ4_ALGORITHM:
+ return LZ4_decompress_safe(
+ reinterpret_cast<const char*>(buf) + header_len,
+ reinterpret_cast<char*>(tmp_buf),
+ static_cast<int>(actual_size),
+ static_cast<int>(srv_page_size)) ==
+ static_cast<int>(srv_page_size);
+#endif /* HAVE_LZ4 */
+#ifdef HAVE_LZO
+ case PAGE_LZO_ALGORITHM:
+ {
+ lzo_uint len_lzo = srv_page_size;
+ return (LZO_E_OK == lzo1x_decompress_safe(
+ buf + header_len,
+ actual_size, tmp_buf, &len_lzo, NULL)
+ && len_lzo == srv_page_size);
+ }
+#endif /* HAVE_LZO */
+#ifdef HAVE_LZMA
+ case PAGE_LZMA_ALGORITHM:
+ {
+ size_t src_pos = 0;
+ size_t dst_pos = 0;
+ uint64_t memlimit = UINT64_MAX;
+
+ return LZMA_OK == lzma_stream_buffer_decode(
+ &memlimit, 0, NULL, buf + header_len,
+ &src_pos, actual_size, tmp_buf, &dst_pos,
+ srv_page_size)
+ && dst_pos == srv_page_size;
+ }
+#endif /* HAVE_LZMA */
+#ifdef HAVE_BZIP2
+ case PAGE_BZIP2_ALGORITHM:
+ {
+ uint dst_pos = static_cast<uint>(srv_page_size);
+ return BZ_OK == BZ2_bzBuffToBuffDecompress(
+ reinterpret_cast<char*>(tmp_buf),
+ &dst_pos,
+ reinterpret_cast<char*>(buf) + header_len,
+ static_cast<uint>(actual_size), 1, 0)
+ && dst_pos == srv_page_size;
+ }
+#endif /* HAVE_BZIP2 */
+#ifdef HAVE_SNAPPY
+ case PAGE_SNAPPY_ALGORITHM:
+ {
+ size_t olen = srv_page_size;
+
+ return SNAPPY_OK == snappy_uncompress(
+ reinterpret_cast<const char*>(buf)
+ + header_len,
+ actual_size,
+ reinterpret_cast<char*>(tmp_buf), &olen)
+ && olen == srv_page_size;
+ }
+#endif /* HAVE_SNAPPY */
+ }
+
+ return false;
+}
+
+/** Decompress a page for full crc32 format.
+@param[in,out] tmp_buf temporary buffer (of innodb_page_size)
+@param[in,out] buf possibly compressed page buffer
+@param[in] flags tablespace flags
+@return size of the compressed data
+@retval 0 if decompression failed
+@retval srv_page_size if the page was not compressed */
+ulint fil_page_decompress_for_full_crc32(byte* tmp_buf, byte* buf, ulint flags)
+{
+ ut_ad(fil_space_t::full_crc32(flags));
+ bool compressed = false;
+ size_t size = buf_page_full_crc32_size(buf, &compressed, NULL);
+ if (!compressed) {
+ ut_ad(size == srv_page_size);
+ return size;
+ }
+
+ if (!fil_space_t::is_compressed(flags)) {
+ return 0;
+ }
+
+ if (size >= srv_page_size) {
+ return 0;
+ }
+
+ if (fil_space_t::full_crc32_page_compressed_len(flags)) {
+ compile_time_assert(FIL_PAGE_FCRC32_CHECKSUM == 4);
+ if (size_t lsb = buf[size - 5]) {
+ size += lsb - 0x100;
+ }
+ size -= 5;
+ }
+
+ const size_t header_len = FIL_PAGE_COMP_ALGO;
+
+ if (!fil_page_decompress_low(tmp_buf, buf,
+ fil_space_t::get_compression_algo(flags),
+ header_len, size - header_len)) {
+ return 0;
+ }
+
+ srv_stats.pages_page_decompressed.inc();
+ memcpy(buf, tmp_buf, srv_page_size);
+ return size;
+}
+
+/** Decompress a page for non full crc32 format.
+@param[in,out] tmp_buf temporary buffer (of innodb_page_size)
+@param[in,out] buf possibly compressed page buffer
+@return size of the compressed data
+@retval 0 if decompression failed
+@retval srv_page_size if the page was not compressed */
+ulint fil_page_decompress_for_non_full_crc32(
+ byte* tmp_buf,
+ byte* buf)
+{
+ ulint header_len;
+ uint comp_algo;
+ switch (fil_page_get_type(buf)) {
+ case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED:
+ header_len= FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_METADATA_LEN;
+ comp_algo = mach_read_from_2(
+ FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_ALGO + buf);
+ break;
+ case FIL_PAGE_PAGE_COMPRESSED:
+ header_len = FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN;
+ if (mach_read_from_6(FIL_PAGE_COMP_ALGO + buf)) {
+ return 0;
+ }
+ comp_algo = mach_read_from_2(FIL_PAGE_COMP_ALGO + 6 + buf);
+ break;
+ default:
+ return srv_page_size;
+ }
+
+ if (mach_read_from_4(buf + FIL_PAGE_SPACE_OR_CHKSUM)
+ != BUF_NO_CHECKSUM_MAGIC) {
+ return 0;
+ }
+
+ ulint actual_size = mach_read_from_2(buf + FIL_PAGE_DATA
+ + FIL_PAGE_COMP_SIZE);
+
+ /* Check if payload size is corrupted */
+ if (actual_size == 0 || actual_size > srv_page_size - header_len) {
+ return 0;
+ }
+
+ if (!fil_page_decompress_low(tmp_buf, buf, comp_algo, header_len,
+ actual_size)) {
+ return 0;
+ }
+
+ srv_stats.pages_page_decompressed.inc();
+ memcpy(buf, tmp_buf, srv_page_size);
+ return actual_size;
+}
+
+/** Decompress a page that may be subject to page_compressed compression.
+@param[in,out] tmp_buf temporary buffer (of innodb_page_size)
+@param[in,out] buf possibly compressed page buffer
+@return size of the compressed data
+@retval 0 if decompression failed
+@retval srv_page_size if the page was not compressed */
+ulint fil_page_decompress(
+ byte* tmp_buf,
+ byte* buf,
+ ulint flags)
+{
+ if (fil_space_t::full_crc32(flags)) {
+ return fil_page_decompress_for_full_crc32(tmp_buf, buf, flags);
+ }
+
+ return fil_page_decompress_for_non_full_crc32(tmp_buf, buf);
+}
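+
+/* Editor's note: an illustrative round trip through the two entry
+points above, not part of this change. It assumes page-aligned
+buffers of srv_page_size bytes, a page image in `page`, and valid
+tablespace flags and block size. */
+#if 0 /* example only */
+byte* out = static_cast<byte*>(
+ aligned_malloc(srv_page_size, srv_page_size));
+byte* tmp = static_cast<byte*>(
+ aligned_malloc(srv_page_size, srv_page_size));
+
+ulint wlen = fil_page_compress(page, out, flags, block_size, false);
+
+if (wlen == 0) {
+ /* incompressible or exempt page type: store it uncompressed */
+} else if (fil_page_decompress(tmp, out, flags) == 0) {
+ /* decompression failed; the stored page would be corrupt */
+} else {
+ /* out again holds the full srv_page_size-byte page image */
+}
+
+aligned_free(out);
+aligned_free(tmp);
+#endif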
diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc
new file mode 100644
index 00000000..57164113
--- /dev/null
+++ b/storage/innobase/fsp/fsp0file.cc
@@ -0,0 +1,1043 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fsp/fsp0file.cc
+Tablespace data file implementation
+
+Created 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#include "fil0fil.h"
+#include "fsp0types.h"
+#include "os0file.h"
+#include "page0page.h"
+#include "srv0start.h"
+
+/** Initialize the name, size and order of this datafile
+@param[in] name tablespace name, will be copied
+@param[in] flags tablespace flags */
+void
+Datafile::init(
+ const char* name,
+ ulint flags)
+{
+ ut_ad(m_name == NULL);
+ ut_ad(name != NULL);
+
+ m_name = mem_strdup(name);
+ m_flags = flags;
+}
+
+/** Release the resources. */
+void
+Datafile::shutdown()
+{
+ close();
+
+ ut_free(m_name);
+ m_name = NULL;
+ free_filepath();
+ free_first_page();
+}
+
+/** Create/open a data file.
+@param[in] read_only_mode if true, then readonly mode checks are enforced.
+@return DB_SUCCESS or error code */
+dberr_t
+Datafile::open_or_create(bool read_only_mode)
+{
+ bool success;
+ ut_a(m_filepath != NULL);
+ ut_ad(m_handle == OS_FILE_CLOSED);
+
+ m_handle = os_file_create(
+ innodb_data_file_key, m_filepath, m_open_flags,
+ OS_FILE_NORMAL, OS_DATA_FILE, read_only_mode, &success);
+
+ if (!success) {
+ m_last_os_error = os_file_get_last_error(true);
+ ib::error() << "Cannot open datafile '" << m_filepath << "'";
+ return(DB_CANNOT_OPEN_FILE);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Open a data file in read-only mode to check if it exists so that it
+can be validated.
+@param[in] strict whether to issue error messages
+@return DB_SUCCESS or error code */
+dberr_t
+Datafile::open_read_only(bool strict)
+{
+ bool success = false;
+ ut_ad(m_handle == OS_FILE_CLOSED);
+
+ /* This function can be called for file objects that do not need
+ to be opened, which is the case when m_filepath is NULL. */
+ if (m_filepath == NULL) {
+ return(DB_ERROR);
+ }
+
+ set_open_flags(OS_FILE_OPEN);
+ m_handle = os_file_create_simple_no_error_handling(
+ innodb_data_file_key, m_filepath, m_open_flags,
+ OS_FILE_READ_ONLY, true, &success);
+
+ if (success) {
+ m_exists = true;
+ init_file_info();
+
+ return(DB_SUCCESS);
+ }
+
+ if (strict) {
+ m_last_os_error = os_file_get_last_error(true);
+ ib::error() << "Cannot open datafile for read-only: '"
+ << m_filepath << "' OS error: " << m_last_os_error;
+ }
+
+ return(DB_CANNOT_OPEN_FILE);
+}
+
+/** Open a data file in read-write mode during start-up so that
+doublewrite pages can be restored and then it can be validated.
+@param[in] read_only_mode if true, then readonly mode checks are enforced.
+@return DB_SUCCESS or error code */
+dberr_t
+Datafile::open_read_write(bool read_only_mode)
+{
+ bool success = false;
+ ut_ad(m_handle == OS_FILE_CLOSED);
+
+ /* This function can be called for file objects that do not need
+ to be opened, which is the case when m_filepath is NULL. */
+ if (m_filepath == NULL) {
+ return(DB_ERROR);
+ }
+
+ set_open_flags(OS_FILE_OPEN);
+ m_handle = os_file_create_simple_no_error_handling(
+ innodb_data_file_key, m_filepath, m_open_flags,
+ OS_FILE_READ_WRITE, read_only_mode, &success);
+
+ if (!success) {
+ m_last_os_error = os_file_get_last_error(true);
+ ib::error() << "Cannot open datafile for read-write: '"
+ << m_filepath << "'";
+ return(DB_CANNOT_OPEN_FILE);
+ }
+
+ m_exists = true;
+
+ init_file_info();
+
+ return(DB_SUCCESS);
+}
+
+/** Initialize OS specific file info. */
+void
+Datafile::init_file_info()
+{
+#ifdef _WIN32
+ GetFileInformationByHandle((os_file_t)m_handle, &m_file_info);
+#else
+ fstat(m_handle, &m_file_info);
+#endif /* WIN32 */
+}
+
+/** Close a data file.
+@return DB_SUCCESS or error code */
+dberr_t
+Datafile::close()
+{
+ if (m_handle != OS_FILE_CLOSED) {
+ ibool success = os_file_close(m_handle);
+ ut_a(success);
+
+ m_handle = OS_FILE_CLOSED;
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Make a full filepath from a directory path and a filename.
+Prepend the dirpath to filename using the extension given.
+If dirpath is NULL, prepend the default datadir to filepath.
+Store the result in m_filepath.
+@param[in] dirpath directory path
+@param[in] filename filename or filepath
+@param[in] ext filename extension */
+void
+Datafile::make_filepath(
+ const char* dirpath,
+ const char* filename,
+ ib_extention ext)
+{
+ ut_ad(dirpath != NULL || filename != NULL);
+
+ free_filepath();
+
+ m_filepath = fil_make_filepath(dirpath, filename, ext, false);
+
+ ut_ad(m_filepath != NULL);
+
+ set_filename();
+}
+
+/** Set the filepath by duplicating the filepath sent in. This is the
+name of the file with its extension and absolute or relative path.
+@param[in] filepath filepath to set */
+void
+Datafile::set_filepath(const char* filepath)
+{
+ free_filepath();
+ m_filepath = static_cast<char*>(ut_malloc_nokey(strlen(filepath) + 1));
+ ::strcpy(m_filepath, filepath);
+ set_filename();
+}
+
+/** Free the filepath buffer. */
+void
+Datafile::free_filepath()
+{
+ if (m_filepath != NULL) {
+ ut_free(m_filepath);
+ m_filepath = NULL;
+ m_filename = NULL;
+ }
+}
+
+/** Do a quick test if the filepath provided looks the same as this filepath
+byte by byte. If they are two different-looking paths to the same file,
+same_as() will be used to show that after the files are opened.
+@param[in] other filepath to compare with
+@retval true if it is the same filename by byte comparison
+@retval false if it looks different */
+bool
+Datafile::same_filepath_as(
+ const char* other) const
+{
+ return(0 == strcmp(m_filepath, other));
+}
+
+/** Test if another opened datafile is the same file as this object.
+@param[in] other Datafile to compare with
+@return true if it is the same file, else false */
+bool
+Datafile::same_as(
+ const Datafile& other) const
+{
+#ifdef _WIN32
+ return(m_file_info.dwVolumeSerialNumber
+ == other.m_file_info.dwVolumeSerialNumber
+ && m_file_info.nFileIndexHigh
+ == other.m_file_info.nFileIndexHigh
+ && m_file_info.nFileIndexLow
+ == other.m_file_info.nFileIndexLow);
+#else
+ return(m_file_info.st_ino == other.m_file_info.st_ino
+ && m_file_info.st_dev == other.m_file_info.st_dev);
+#endif /* WIN32 */
+}
+
+/** Allocate and set the datafile or tablespace name in m_name.
+If a name is provided, use it; else extract a file-per-table
+tablespace name from m_filepath. The value of m_name
+will be freed in the destructor.
+@param[in] name tablespace name if known, NULL if not */
+void
+Datafile::set_name(const char* name)
+{
+ ut_free(m_name);
+
+ if (name != NULL) {
+ m_name = mem_strdup(name);
+ } else {
+ m_name = fil_path_to_space_name(m_filepath);
+ }
+}
+
+/** Reads a few significant fields from the first page of the first
+datafile. The Datafile must already be open.
+@param[in] read_only_mode If true, then readonly mode checks are enforced.
+@return DB_SUCCESS or DB_IO_ERROR if page cannot be read */
+dberr_t
+Datafile::read_first_page(bool read_only_mode)
+{
+ if (m_handle == OS_FILE_CLOSED) {
+
+ dberr_t err = open_or_create(read_only_mode);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ /* Align the memory for a possible read from a raw device */
+
+ m_first_page = static_cast<byte*>(
+ aligned_malloc(UNIV_PAGE_SIZE_MAX, srv_page_size));
+
+ dberr_t err = DB_ERROR;
+ size_t page_size = UNIV_PAGE_SIZE_MAX;
+
+ /* Don't want unnecessary complaints about partial reads. */
+
+ while (page_size >= UNIV_PAGE_SIZE_MIN) {
+
+ ulint n_read = 0;
+
+ err = os_file_read_no_error_handling(
+ IORequestReadPartial, m_handle, m_first_page, 0,
+ page_size, &n_read);
+
+ if (err == DB_IO_ERROR && n_read >= UNIV_PAGE_SIZE_MIN) {
+
+ page_size >>= 1;
+
+ } else if (err == DB_SUCCESS) {
+
+ ut_a(n_read == page_size);
+
+ break;
+
+ } else if (srv_operation == SRV_OPERATION_BACKUP) {
+ break;
+ } else {
+
+ ib::error()
+ << "Cannot read first page of '"
+ << m_filepath << "' "
+ << err;
+ break;
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ if (m_order == 0) {
+ if (memcmp_aligned<2>(FIL_PAGE_SPACE_ID + m_first_page,
+ FSP_HEADER_OFFSET + FSP_SPACE_ID
+ + m_first_page, 4)) {
+ ib::error()
+ << "Inconsistent tablespace ID in "
+ << m_filepath;
+ return DB_CORRUPTION;
+ }
+
+ m_space_id = mach_read_from_4(FIL_PAGE_SPACE_ID
+ + m_first_page);
+ m_flags = fsp_header_get_flags(m_first_page);
+ if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) {
+ ulint cflags = fsp_flags_convert_from_101(m_flags);
+ if (cflags == ULINT_UNDEFINED) {
+ ib::error()
+ << "Invalid flags " << ib::hex(m_flags)
+ << " in " << m_filepath;
+ return(DB_CORRUPTION);
+ } else {
+ m_flags = cflags;
+ }
+ }
+ }
+
+ const size_t physical_size = fil_space_t::physical_size(m_flags);
+
+ if (physical_size > page_size) {
+ ib::error() << "File " << m_filepath
+ << " should be longer than "
+ << page_size << " bytes";
+ return(DB_CORRUPTION);
+ }
+
+ return(err);
+}
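+
+/* Note on the probing loop above: because the physical page size is
+not yet known, the first read asks for UNIV_PAGE_SIZE_MAX bytes. A
+short file answers with a partial-read DB_IO_ERROR, the requested size
+is halved, and the loop settles on the largest power-of-two size the
+file can satisfy, so m_first_page holds at least one physical page. */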
+
+/** Free the first page from memory when it is no longer needed. */
+void Datafile::free_first_page()
+{
+ aligned_free(m_first_page);
+ m_first_page= nullptr;
+}
+
+/** Validates the datafile and checks that it conforms with the expected
+space ID and flags. The file should exist and be successfully opened
+in order for this function to validate it.
+@param[in] space_id The expected tablespace ID.
+@param[in] flags The expected tablespace flags.
+@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+m_is_valid is also set true on success, else false. */
+dberr_t
+Datafile::validate_to_dd(ulint space_id, ulint flags)
+{
+ dberr_t err;
+
+ if (!is_open()) {
+ return DB_ERROR;
+ }
+
+ /* Validate this single-table-tablespace with the data dictionary,
+ but do not compare the DATA_DIR flag, in case the tablespace was
+ remotely located. */
+ err = validate_first_page(0);
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ flags &= ~FSP_FLAGS_MEM_MASK;
+
+ /* Make sure the datafile we found matched the space ID.
+ If the datafile is a file-per-table tablespace then also match
+ the row format and zip page size. */
+ if (m_space_id == space_id
+ && (fil_space_t::is_flags_equal(flags, m_flags)
+ || fil_space_t::is_flags_equal(m_flags, flags))) {
+ /* Datafile matches the tablespace expected. */
+ return(DB_SUCCESS);
+ }
+
+ /* else do not use this tablespace. */
+ m_is_valid = false;
+
+ ib::error() << "Refusing to load '" << m_filepath << "' (id="
+ << m_space_id << ", flags=" << ib::hex(m_flags)
+ << "); dictionary contains id="
+ << space_id << ", flags=" << ib::hex(flags);
+
+ return(DB_ERROR);
+}
+
+/** Validates this datafile for the purpose of recovery. The file should
+exist and be successfully opened. We initially open it in read-only mode
+because we just want to read the SpaceID. However, if the first page is
+corrupt and needs to be restored from the doublewrite buffer, we will
+reopen it in write mode and try to restore that page.
+@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+m_is_valid is also set true on success, else false. */
+dberr_t
+Datafile::validate_for_recovery()
+{
+ dberr_t err;
+
+ ut_ad(is_open());
+ ut_ad(!srv_read_only_mode);
+
+ err = validate_first_page(0);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_TABLESPACE_EXISTS:
+ break;
+
+ default:
+ /* Re-open the file in read-write mode. Attempt to restore
+ page 0 from the doublewrite buffer and read the space ID
+ from a survey of the first few pages. */
+ close();
+ err = open_read_write(srv_read_only_mode);
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ err = find_space_id();
+ if (err != DB_SUCCESS || m_space_id == 0) {
+ ib::error() << "Datafile '" << m_filepath << "' is"
+ " corrupted. Cannot determine the space ID from"
+ " the first 64 pages.";
+ return(err);
+ }
+
+ if (restore_from_doublewrite()) {
+ return(DB_CORRUPTION);
+ }
+
+ /* Free the previously read first page and then re-validate. */
+ free_first_page();
+ err = validate_first_page(0);
+ }
+
+ if (err == DB_SUCCESS) {
+ set_name(NULL);
+ }
+
+ return(err);
+}
+
+/** Check the consistency of the first page of a datafile when the
+tablespace is opened. This occurs before the fil_space_t is created
+so the Space ID found here must not already be open.
+m_is_valid is set true on success, else false.
+@param[out] flush_lsn contents of FIL_PAGE_FILE_FLUSH_LSN
+@retval DB_SUCCESS if the datafile is valid
+@retval DB_CORRUPTION if the datafile is not readable
+@retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */
+dberr_t
+Datafile::validate_first_page(lsn_t* flush_lsn)
+{
+ char* prev_name;
+ char* prev_filepath;
+ const char* error_txt = NULL;
+
+ m_is_valid = true;
+
+ if (m_first_page == NULL
+ && read_first_page(srv_read_only_mode) != DB_SUCCESS) {
+
+ error_txt = "Cannot read first page";
+ } else {
+ ut_ad(m_first_page);
+
+ if (flush_lsn != NULL) {
+
+ *flush_lsn = mach_read_from_8(
+ m_first_page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+ }
+ }
+
+ if (error_txt != NULL) {
+err_exit:
+ ib::info() << error_txt << " in datafile: " << m_filepath
+ << ", Space ID:" << m_space_id << ", Flags: "
+ << m_flags;
+ m_is_valid = false;
+ free_first_page();
+ return(DB_CORRUPTION);
+ }
+
+ /* Check if the whole page is blank. */
+ if (!m_space_id && !m_flags) {
+ const byte* b = m_first_page;
+ ulint nonzero_bytes = srv_page_size;
+
+ while (*b == '\0' && --nonzero_bytes != 0) {
+
+ b++;
+ }
+
+ if (nonzero_bytes == 0) {
+ error_txt = "Header page consists of zero bytes";
+ goto err_exit;
+ }
+ }
+
+ if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) {
+ /* Tablespace flags must be valid. */
+ error_txt = "Tablespace flags are invalid";
+ goto err_exit;
+ }
+
+ ulint logical_size = fil_space_t::logical_size(m_flags);
+
+ if (srv_page_size != logical_size) {
+ /* Logical size must be innodb_page_size. */
+ ib::error()
+ << "Data file '" << m_filepath << "' uses page size "
+ << logical_size << ", but the innodb_page_size"
+ " start-up parameter is "
+ << srv_page_size;
+ free_first_page();
+ return(DB_ERROR);
+ }
+
+ if (page_get_page_no(m_first_page) != 0) {
+ /* First page must be number 0 */
+ error_txt = "Header page contains inconsistent data";
+ goto err_exit;
+ }
+
+ if (m_space_id >= SRV_SPACE_ID_UPPER_BOUND) {
+ error_txt = "A bad Space ID was found";
+ goto err_exit;
+ }
+
+ if (buf_page_is_corrupted(false, m_first_page, m_flags)) {
+ /* Look for checksum and other corruptions. */
+ error_txt = "Checksum mismatch";
+ goto err_exit;
+ }
+
+ if (fil_space_read_name_and_filepath(
+ m_space_id, &prev_name, &prev_filepath)) {
+
+ if (0 == strcmp(m_filepath, prev_filepath)) {
+ ut_free(prev_name);
+ ut_free(prev_filepath);
+ return(DB_SUCCESS);
+ }
+
+ /* Make sure the space_id has not already been opened. */
+ ib::error() << "Attempted to open a previously opened"
+ " tablespace. Previous tablespace " << prev_name
+ << " at filepath: " << prev_filepath
+ << " uses space ID: " << m_space_id
+ << ". Cannot open filepath: " << m_filepath
+ << " which uses the same space ID.";
+
+ ut_free(prev_name);
+ ut_free(prev_filepath);
+
+ m_is_valid = false;
+
+ free_first_page();
+
+ return(is_predefined_tablespace(m_space_id)
+ ? DB_CORRUPTION
+ : DB_TABLESPACE_EXISTS);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Determine the space id of the given file descriptor by reading a few
+pages from the beginning of the .ibd file.
+@return DB_SUCCESS if space id was successfully identified, else DB_ERROR. */
+dberr_t
+Datafile::find_space_id()
+{
+ os_offset_t file_size;
+
+ ut_ad(m_handle != OS_FILE_CLOSED);
+
+ file_size = os_file_get_size(m_handle);
+
+ if (file_size == (os_offset_t) -1) {
+ ib::error() << "Could not get file size of datafile '"
+ << m_filepath << "'";
+ return(DB_CORRUPTION);
+ }
+
+ /* Assuming a page size, read the space_id from each page and store it
+ in a map. Find out which space_id is agreed on by majority of the
+ pages. Choose that space_id. */
+ for (ulint page_size = UNIV_ZIP_SIZE_MIN;
+ page_size <= UNIV_PAGE_SIZE_MAX;
+ page_size <<= 1) {
+ /* map[space_id] = count of pages */
+ typedef std::map<
+ ulint,
+ ulint,
+ std::less<ulint>,
+ ut_allocator<std::pair<const ulint, ulint> > >
+ Pages;
+
+ Pages verify;
+ ulint page_count = 64;
+ ulint valid_pages = 0;
+
+ /* Adjust the number of pages to analyze based on file size */
+ while ((page_count * page_size) > file_size) {
+ --page_count;
+ }
+
+ ib::info()
+ << "Page size:" << page_size
+ << ". Pages to analyze:" << page_count;
+
+ byte* page = static_cast<byte*>(
+ aligned_malloc(page_size, page_size));
+
+ ulint fsp_flags;
+ /* provide dummy value if the first os_file_read() fails */
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER
+ | FSP_FLAGS_FCRC32_PAGE_SSIZE()
+ | innodb_compression_algorithm
+ << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
+ break;
+ default:
+ fsp_flags = 0;
+ }
+
+ for (ulint j = 0; j < page_count; ++j) {
+ if (os_file_read(IORequestRead, m_handle, page,
+ j * page_size, page_size)) {
+ ib::info()
+ << "READ FAIL: page_no:" << j;
+ continue;
+ }
+
+ if (j == 0) {
+ fsp_flags = mach_read_from_4(
+ page + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS);
+ }
+
+ bool noncompressed_ok = false;
+
+ /* For noncompressed pages, the page size must be
+ equal to srv_page_size. */
+ if (page_size == srv_page_size
+ && !fil_space_t::zip_size(fsp_flags)) {
+ noncompressed_ok = !buf_page_is_corrupted(
+ false, page, fsp_flags);
+ }
+
+ bool compressed_ok = false;
+
+ if (srv_page_size <= UNIV_PAGE_SIZE_DEF
+ && page_size == fil_space_t::zip_size(fsp_flags)) {
+ compressed_ok = !buf_page_is_corrupted(
+ false, page, fsp_flags);
+ }
+
+ if (noncompressed_ok || compressed_ok) {
+
+ ulint space_id = mach_read_from_4(page
+ + FIL_PAGE_SPACE_ID);
+
+ if (space_id > 0) {
+
+ ib::info()
+ << "VALID: space:"
+ << space_id << " page_no:" << j
+ << " page_size:" << page_size;
+
+ ++valid_pages;
+
+ ++verify[space_id];
+ }
+ }
+ }
+
+ aligned_free(page);
+
+ ib::info()
+ << "Page size: " << page_size
+ << ". Possible space_id count:" << verify.size();
+
+ const ulint pages_corrupted = 3;
+
+ for (ulint missed = 0; missed <= pages_corrupted; ++missed) {
+
+ for (Pages::const_iterator it = verify.begin();
+ it != verify.end();
+ ++it) {
+
+ ib::info() << "space_id:" << it->first
+ << ", Number of pages matched: "
+ << it->second << "/" << valid_pages
+ << " (" << page_size << ")";
+
+ if (it->second == (valid_pages - missed)) {
+ ib::info() << "Chosen space:"
+ << it->first;
+
+ m_space_id = it->first;
+ return(DB_SUCCESS);
+ }
+ }
+
+ }
+ }
+
+ return(DB_CORRUPTION);
+}
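+
+/* The scan above is effectively a majority vote: for each candidate
+page size, every readable page that passes the corruption check casts
+one vote for the space_id it carries, and a space_id wins when it
+accounts for all valid pages minus at most pages_corrupted (3)
+presumed-damaged ones. A condensed sketch of the tally:
+
+	std::map<ulint, ulint> verify;	// space_id -> matching pages
+	++verify[mach_read_from_4(page + FIL_PAGE_SPACE_ID)];
+	...
+	if (it->second == valid_pages - missed)
+		m_space_id = it->first;	// accept this space_id
+*/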
+
+/** Restore the first page of the tablespace from
+the double write buffer.
+@return whether the operation failed */
+bool
+Datafile::restore_from_doublewrite()
+{
+ if (srv_operation != SRV_OPERATION_NORMAL) {
+ return true;
+ }
+
+ /* Find if double write buffer contains page_no of given space id. */
+ const page_id_t page_id(m_space_id, 0);
+ const byte* page = recv_sys.dblwr.find_page(page_id);
+
+ if (!page) {
+ /* If the first page of the given user tablespace is not there
+ in the doublewrite buffer, then the recovery is going to fail
+ now. Hence this is treated as an error. */
+
+ ib::error()
+ << "Corrupted page " << page_id
+ << " of datafile '" << m_filepath
+ << "' could not be found in the doublewrite buffer.";
+
+ return(true);
+ }
+
+ ulint flags = mach_read_from_4(
+ FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page);
+
+ if (!fil_space_t::is_valid_flags(flags, m_space_id)) {
+ flags = fsp_flags_convert_from_101(flags);
+ /* recv_dblwr_t::validate_page() inside find_page()
+ checked this already. */
+ ut_ad(flags != ULINT_UNDEFINED);
+ /* The flags on the page should be converted later. */
+ }
+
+ ulint physical_size = fil_space_t::physical_size(flags);
+
+ ut_a(page_get_page_no(page) == page_id.page_no());
+
+ ib::info() << "Restoring page " << page_id
+ << " of datafile '" << m_filepath
+ << "' from the doublewrite buffer. Writing "
+ << physical_size << " bytes into file '"
+ << m_filepath << "'";
+
+ return(os_file_write(
+ IORequestWrite,
+ m_filepath, m_handle, page, 0, physical_size)
+ != DB_SUCCESS);
+}
+
+/** Create a link filename based on the contents of m_name,
+open that file, and read the contents into m_filepath.
+@retval DB_SUCCESS if remote linked tablespace file is opened and read.
+@retval DB_CANNOT_OPEN_FILE if the link file does not exist. */
+dberr_t
+RemoteDatafile::open_link_file()
+{
+ if (m_link_filepath == NULL) {
+ m_link_filepath = fil_make_filepath(NULL, name(), ISL, false);
+ }
+
+ m_filepath = read_link_file(m_link_filepath);
+
+ return(m_filepath == NULL ? DB_CANNOT_OPEN_FILE : DB_SUCCESS);
+}
+
+/** Opens a handle to the file linked to in an InnoDB Symbolic Link file
+in read-only mode so that it can be validated.
+@param[in] strict whether to issue error messages
+@return DB_SUCCESS if remote linked tablespace file is found and opened. */
+dberr_t
+RemoteDatafile::open_read_only(bool strict)
+{
+ if (m_filepath == NULL && open_link_file() == DB_CANNOT_OPEN_FILE) {
+ return(DB_ERROR);
+ }
+
+ dberr_t err = Datafile::open_read_only(strict);
+
+ if (err != DB_SUCCESS && strict) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+ ib::error() << "A link file was found named '"
+ << m_link_filepath << "' but the linked tablespace '"
+ << m_filepath << "' could not be opened read-only.";
+ }
+
+ return(err);
+}
+
+/** Opens a handle to the file linked to in an InnoDB Symbolic Link file
+in read-write mode so that it can be restored from doublewrite and validated.
+@param[in] read_only_mode If true, then readonly mode checks are enforced.
+@return DB_SUCCESS if remote linked tablespace file is found and opened. */
+dberr_t
+RemoteDatafile::open_read_write(bool read_only_mode)
+{
+ if (m_filepath == NULL && open_link_file() == DB_CANNOT_OPEN_FILE) {
+ return(DB_ERROR);
+ }
+
+ dberr_t err = Datafile::open_read_write(read_only_mode);
+
+ if (err != DB_SUCCESS) {
+ /* The following call prints an error message */
+ m_last_os_error = os_file_get_last_error(true);
+ ib::error() << "A link file was found named '"
+ << m_link_filepath << "' but the linked data file '"
+ << m_filepath << "' could not be opened for writing.";
+ }
+
+ return(err);
+}
+
+/** Release the resources. */
+void
+RemoteDatafile::shutdown()
+{
+ Datafile::shutdown();
+
+ if (m_link_filepath != 0) {
+ ut_free(m_link_filepath);
+ m_link_filepath = 0;
+ }
+}
+
+/** Creates a new InnoDB Symbolic Link (ISL) file. It is always created
+under the 'datadir' of MySQL. The datadir is the directory of a
+running mysqld program. We can refer to it by simply using the path ".".
+@param[in] name tablespace name
+@param[in] filepath remote filepath of tablespace datafile
+@return DB_SUCCESS or error code */
+dberr_t
+RemoteDatafile::create_link_file(
+ const char* name,
+ const char* filepath)
+{
+ bool success;
+ dberr_t err = DB_SUCCESS;
+ char* link_filepath = NULL;
+ char* prev_filepath = NULL;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(0 == strcmp(&filepath[strlen(filepath) - 4], DOT_IBD));
+
+ link_filepath = fil_make_filepath(NULL, name, ISL, false);
+
+ if (link_filepath == NULL) {
+ return(DB_ERROR);
+ }
+
+ prev_filepath = read_link_file(link_filepath);
+ if (prev_filepath) {
+ /* TRUNCATE TABLE (starting with MySQL 5.6; probably no
+ longer since MariaDB Server 10.2.19) used to call this
+ with an existing link file containing the same filepath. */
+ bool same = !strcmp(prev_filepath, filepath);
+ ut_free(prev_filepath);
+ if (same) {
+ ut_free(link_filepath);
+ return(DB_SUCCESS);
+ }
+ }
+
+ /* Check if the file already exists. */
+ FILE* file = NULL;
+ bool exists;
+ os_file_type_t ftype;
+
+ success = os_file_status(link_filepath, &exists, &ftype);
+ ulint error = 0;
+
+ if (success && !exists) {
+
+ file = fopen(link_filepath, "w");
+ if (file == NULL) {
+ /* This call will print its own error message */
+ error = os_file_get_last_error(true);
+ }
+ } else {
+ error = OS_FILE_ALREADY_EXISTS;
+ }
+
+ if (error != 0) {
+
+ ib::error() << "Cannot create file " << link_filepath << ".";
+
+ if (error == OS_FILE_ALREADY_EXISTS) {
+ ib::error() << "The link file: " << link_filepath
+ << " already exists.";
+ err = DB_TABLESPACE_EXISTS;
+
+ } else if (error == OS_FILE_DISK_FULL) {
+ err = DB_OUT_OF_FILE_SPACE;
+
+ } else {
+ err = DB_ERROR;
+ }
+
+ /* file is not open, no need to close it. */
+ ut_free(link_filepath);
+ return(err);
+ }
+
+ ulint rbytes = fwrite(filepath, 1, strlen(filepath), file);
+
+ if (rbytes != strlen(filepath)) {
+ error = os_file_get_last_error(true);
+ ib::error() <<
+ "Cannot write link file: "
+ << link_filepath << " filepath: " << filepath;
+ err = DB_ERROR;
+ }
+
+ /* Close the file, we only need it at startup */
+ fclose(file);
+
+ ut_free(link_filepath);
+
+ return(err);
+}
+
+/** Delete an InnoDB Symbolic Link (ISL) file. */
+void
+RemoteDatafile::delete_link_file(void)
+{
+ ut_ad(m_link_filepath != NULL);
+
+ if (m_link_filepath != NULL) {
+ os_file_delete_if_exists(innodb_data_file_key,
+ m_link_filepath, NULL);
+ }
+}
+
+/** Delete an InnoDB Symbolic Link (ISL) file by name.
+@param[in] name tablespace name */
+void
+RemoteDatafile::delete_link_file(
+ const char* name)
+{
+ char* link_filepath = fil_make_filepath(NULL, name, ISL, false);
+
+ if (link_filepath != NULL) {
+ os_file_delete_if_exists(
+ innodb_data_file_key, link_filepath, NULL);
+
+ ut_free(link_filepath);
+ }
+}
+
+/** Read an InnoDB Symbolic Link (ISL) file.
+It is always created under the datadir of MySQL.
+For file-per-table tablespaces, the isl file is expected to be
+in a 'database' directory and called 'tablename.isl'.
+The caller must free the memory returned if it is not null.
+@param[in] link_filepath filepath of the ISL file
+@return Filepath of the IBD file read from the ISL file */
+char*
+RemoteDatafile::read_link_file(
+ const char* link_filepath)
+{
+ FILE* file = fopen(link_filepath, "r+b" STR_O_CLOEXEC);
+ if (file == NULL) {
+ return(NULL);
+ }
+
+ char* filepath = static_cast<char*>(ut_malloc_nokey(OS_FILE_MAX_PATH));
+
+ os_file_read_string(file, filepath, OS_FILE_MAX_PATH);
+ fclose(file);
+
+ if (filepath[0] != '\0') {
+ /* Trim whitespace from end of filepath */
+ ulint last_ch = strlen(filepath) - 1;
+ while (last_ch > 4 && filepath[last_ch] <= 0x20) {
+ filepath[last_ch--] = 0x00;
+ }
+ os_normalize_path(filepath);
+ }
+
+ return(filepath);
+}
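+
+/* A round-trip sketch of the ISL helpers above (paths hypothetical):
+
+	// writes "/ssd/db/t.ibd" into ./db/t.isl under the datadir
+	RemoteDatafile::create_link_file("db/t", "/ssd/db/t.ibd");
+
+	// later, at open time, recovers the remote path
+	char* path = RemoteDatafile::read_link_file("./db/t.isl");
+	// ... use path; the caller must ut_free(path)
+*/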
diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc
new file mode 100644
index 00000000..3d5a7edd
--- /dev/null
+++ b/storage/innobase/fsp/fsp0fsp.cc
@@ -0,0 +1,2890 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fsp/fsp0fsp.cc
+File space management
+
+Created 11/29/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fsp0fsp.h"
+#include "buf0buf.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "page0page.h"
+#include "fut0fut.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "ibuf0ibuf.h"
+#include "btr0btr.h"
+#include "btr0sea.h"
+#include "dict0boot.h"
+#include "log0log.h"
+#include "dict0mem.h"
+#include "fsp0types.h"
+
+// JAN: MySQL 5.7 Encryption
+// #include <my_aes.h>
+
+typedef uint32_t page_no_t;
+
+/** Return an extent to the free list of a space.
+@param[in,out] space tablespace
+@param[in] offset page number in the extent
+@param[in,out] mtr mini-transaction */
+MY_ATTRIBUTE((nonnull))
+static
+void
+fsp_free_extent(
+ fil_space_t* space,
+ page_no_t offset,
+ mtr_t* mtr);
+
+/** Returns the first extent descriptor for a segment.
+We think of the extent lists of the segment catenated in the order
+FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE.
+@param[in] inode segment inode
+@param[in] space tablespace
+@param[in,out] mtr mini-transaction
+@return the first extent descriptor, or NULL if none */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+static
+xdes_t*
+fseg_get_first_extent(
+ fseg_inode_t* inode,
+ const fil_space_t* space,
+ mtr_t* mtr);
+
+/** Put new extents to the free list if there are free extents above the free
+limit. If an extent happens to contain an extent descriptor page, the extent
+is put to the FSP_FREE_FRAG list with the page marked as used.
+@param[in] init_space true if this is a single-table tablespace
+and we are only initializing the first extent and the first bitmap pages;
+then we will not allocate more extents
+@param[in,out] space tablespace
+@param[in,out] header tablespace header
+@param[in,out] mtr mini-transaction */
+static ATTRIBUTE_COLD
+void
+fsp_fill_free_list(
+ bool init_space,
+ fil_space_t* space,
+ buf_block_t* header,
+ mtr_t* mtr);
+
+/** Allocates a single free page from a segment.
+This function implements the intelligent allocation strategy which tries to
+minimize file space fragmentation.
+@param[in,out] space tablespace
+@param[in,out] seg_inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] hint hint of which page would be desirable
+@param[in] direction if the new page is needed because of
+an index page split, and records are inserted there in order, into which
+direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR
+@param[in,out] mtr mini-transaction
+@param[in,out] init_mtr mtr or another mini-transaction in
+which the page should be initialized.
+@retval NULL if no page could be allocated */
+static
+buf_block_t*
+fseg_alloc_free_page_low(
+ fil_space_t* space,
+ fseg_inode_t* seg_inode,
+ buf_block_t* iblock,
+ uint32_t hint,
+ byte direction,
+#ifdef UNIV_DEBUG
+ bool has_done_reservation,
+ /*!< whether the space has already been reserved */
+#endif /* UNIV_DEBUG */
+ mtr_t* mtr,
+ mtr_t* init_mtr)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Get the tablespace header block, SX-latched
+@param[in] space tablespace
+@param[in,out] mtr mini-transaction
+@return pointer to the space header, page SX-latched */
+inline buf_block_t *fsp_get_header(const fil_space_t *space, mtr_t *mtr)
+{
+ buf_block_t *block= buf_page_get(page_id_t(space->id, 0), space->zip_size(),
+ RW_SX_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+ ut_ad(space->id == mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID +
+ block->frame));
+ return block;
+}
+
+/** Set the XDES_FREE_BIT of a page.
+@tparam free desired value of XDES_FREE_BIT
+@param[in] block extent descriptor block
+@param[in,out] descr extent descriptor
+@param[in] offset page offset within the extent
+@param[in,out] mtr mini-transaction */
+template<bool free>
+inline void xdes_set_free(const buf_block_t &block, xdes_t *descr,
+ ulint offset, mtr_t *mtr)
+{
+ ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(offset < FSP_EXTENT_SIZE);
+ ut_ad(page_align(descr) == block.frame);
+ compile_time_assert(XDES_BITS_PER_PAGE == 2);
+ compile_time_assert(XDES_FREE_BIT == 0);
+ compile_time_assert(XDES_CLEAN_BIT == 1);
+
+ ulint index= XDES_BITS_PER_PAGE * offset;
+ byte *b= &descr[XDES_BITMAP + (index >> 3)];
+ /* xdes_init() should have set all XDES_CLEAN_BIT. */
+ ut_ad(!(~*b & 0xaa));
+ /* Clear or set XDES_FREE_BIT. */
+ byte val= free
+ ? static_cast<byte>(*b | 1 << (index & 7))
+ : static_cast<byte>(*b & ~(1 << (index & 7)));
+ mtr->write<1>(block, b, val);
+}
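+
+/* Worked example of the bit addressing above: with XDES_BITS_PER_PAGE
+== 2 and XDES_FREE_BIT == 0, page offset 13 within the extent maps to
+bit index 26, that is byte XDES_BITMAP + 3 of the descriptor and bit
+mask 1 << 2; only the free bit is toggled, while the interleaved
+XDES_CLEAN_BIT positions (mask 0xaa) remain set. */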
+
+/**
+Find a free page.
+@param descr extent descriptor
+@param hint page offset to start searching from (towards larger pages)
+@return free page offset
+@retval FIL_NULL if no page is free */
+inline uint32_t xdes_find_free(const xdes_t *descr, uint32_t hint= 0)
+{
+ const uint32_t extent_size= FSP_EXTENT_SIZE;
+ ut_ad(hint < extent_size);
+ for (uint32_t i= hint; i < extent_size; i++)
+ if (xdes_is_free(descr, i))
+ return i;
+ for (uint32_t i= 0; i < hint; i++)
+ if (xdes_is_free(descr, i))
+ return i;
+ return FIL_NULL;
+}
+
+/**
+Determine the number of used pages in a descriptor.
+@param descr extent descriptor
+@return number of pages used */
+inline uint32_t xdes_get_n_used(const xdes_t *descr)
+{
+ uint32_t count= 0;
+
+ for (uint32_t i= FSP_EXTENT_SIZE; i--; )
+ if (!xdes_is_free(descr, i))
+ count++;
+
+ return count;
+}
+
+/**
+Determine whether a file extent is full.
+@param descr extent descriptor
+@return whether all pages have been allocated */
+inline bool xdes_is_full(const xdes_t *descr)
+{
+ return FSP_EXTENT_SIZE == xdes_get_n_used(descr);
+}
+
+/** Set the state of an extent descriptor.
+@param[in] block extent descriptor block
+@param[in,out] descr extent descriptor
+@param[in] state the state
+@param[in,out] mtr mini-transaction */
+inline void xdes_set_state(const buf_block_t &block, xdes_t *descr,
+ byte state, mtr_t *mtr)
+{
+ ut_ad(descr && mtr);
+ ut_ad(state >= XDES_FREE);
+ ut_ad(state <= XDES_FSEG);
+ ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(page_align(descr) == block.frame);
+ ut_ad(mach_read_from_4(descr + XDES_STATE) <= XDES_FSEG);
+ mtr->write<1>(block, XDES_STATE + 3 + descr, state);
+}
+
+/**********************************************************************//**
+Gets the state of an xdes.
+@return state */
+UNIV_INLINE
+ulint
+xdes_get_state(
+/*===========*/
+ const xdes_t* descr) /*!< in: descriptor */
+{
+ ulint state;
+
+ ut_ad(descr);
+ state = mach_read_from_4(descr + XDES_STATE);
+ ut_ad(state - 1 < XDES_FSEG);
+ return(state);
+}
+
+/**********************************************************************//**
+Inits an extent descriptor to the free and clean state. */
+inline void xdes_init(const buf_block_t &block, xdes_t *descr, mtr_t *mtr)
+{
+ ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+ MTR_MEMO_PAGE_X_FIX));
+ mtr->memset(&block, uint16_t(descr - block.frame) + XDES_BITMAP,
+ XDES_SIZE - XDES_BITMAP, 0xff);
+ xdes_set_state(block, descr, XDES_FREE, mtr);
+}
+
+/** Mark a page used in an extent descriptor.
+@param[in,out] seg_inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] page page number
+@param[in,out] descr extent descriptor
+@param[in,out] xdes extent descriptor page
+@param[in,out] mtr mini-transaction */
+static MY_ATTRIBUTE((nonnull))
+void
+fseg_mark_page_used(fseg_inode_t *seg_inode, buf_block_t *iblock,
+ ulint page, xdes_t *descr, buf_block_t *xdes, mtr_t *mtr)
+{
+ ut_ad(fil_page_get_type(iblock->frame) == FIL_PAGE_INODE);
+ ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ ut_ad(!memcmp(seg_inode + FSEG_ID, descr + XDES_ID, 4));
+
+ const uint16_t xoffset= uint16_t(descr - xdes->frame + XDES_FLST_NODE);
+ const uint16_t ioffset= uint16_t(seg_inode - iblock->frame);
+
+ if (!xdes_get_n_used(descr))
+ {
+ /* We move the extent from the free list to the NOT_FULL list */
+ flst_remove(iblock, uint16_t(FSEG_FREE + ioffset), xdes, xoffset, mtr);
+ flst_add_last(iblock, uint16_t(FSEG_NOT_FULL + ioffset),
+ xdes, xoffset, mtr);
+ }
+
+ ut_ad(xdes_is_free(descr, page % FSP_EXTENT_SIZE));
+
+ /* We mark the page as used */
+ xdes_set_free<false>(*xdes, descr, page % FSP_EXTENT_SIZE, mtr);
+
+ byte* p_not_full= seg_inode + FSEG_NOT_FULL_N_USED;
+ const uint32_t not_full_n_used= mach_read_from_4(p_not_full) + 1;
+ mtr->write<4>(*iblock, p_not_full, not_full_n_used);
+ if (xdes_is_full(descr))
+ {
+ /* We move the extent from the NOT_FULL list to the FULL list */
+ flst_remove(iblock, uint16_t(FSEG_NOT_FULL + ioffset), xdes, xoffset, mtr);
+ flst_add_last(iblock, uint16_t(FSEG_FULL + ioffset), xdes, xoffset, mtr);
+ mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
+ not_full_n_used - FSP_EXTENT_SIZE);
+ }
+}
+
+/** Get a pointer to the extent descriptor of a page.
+@param[in,out] header tablespace header page, x-latched
+@param[in] space tablespace
+@param[in] offset page offset
+@param[out] desc_block descriptor block
+@param[in,out] mtr mini-transaction
+@param[in] init_space whether the tablespace is being initialized
+@return pointer to the extent descriptor, NULL if the page does not
+exist in the space or if the offset exceeds free limit */
+UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
+xdes_t*
+xdes_get_descriptor_with_space_hdr(
+ buf_block_t* header,
+ const fil_space_t* space,
+ page_no_t offset,
+ buf_block_t** desc_block,
+ mtr_t* mtr,
+ bool init_space = false)
+{
+ ut_ad(mtr->memo_contains(*space));
+ ut_ad(mtr->memo_contains_flagged(header, MTR_MEMO_PAGE_SX_FIX
+ | MTR_MEMO_PAGE_X_FIX));
+ /* Read free limit and space size */
+ uint32_t limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
+ + header->frame);
+ uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+ + header->frame);
+ ut_ad(limit == space->free_limit
+ || (space->free_limit == 0
+ && (init_space
+ || space->purpose == FIL_TYPE_TEMPORARY
+ || (srv_startup_is_before_trx_rollback_phase
+ && (space->id == TRX_SYS_SPACE
+ || srv_is_undo_tablespace(space->id))))));
+ ut_ad(size == space->size_in_header);
+
+ if ((offset >= size) || (offset >= limit)) {
+ return(NULL);
+ }
+
+ const unsigned zip_size = space->zip_size();
+
+ uint32_t descr_page_no = xdes_calc_descriptor_page(zip_size, offset);
+
+ buf_block_t* block = header;
+
+ if (descr_page_no) {
+ block = buf_page_get(
+ page_id_t(space->id, descr_page_no), zip_size,
+ RW_SX_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+ }
+
+ if (desc_block != NULL) {
+ *desc_block = block;
+ }
+
+ return XDES_ARR_OFFSET + XDES_SIZE
+ * xdes_calc_descriptor_index(zip_size, offset)
+ + block->frame;
+}
+
+/** Get the extent descriptor of a page.
+The page where the extent descriptor resides is x-locked. If the page
+offset is equal to the free limit of the space, we will add new
+extents from above the free limit to the space free list, unless the
+free limit equals the space size. This adding is necessary to make the
+descriptors defined, as they are uninitialized above the free limit.
+@param[in] space tablespace
+@param[in] offset page offset; if equal to the free limit, we
+try to add new extents to the space free list
+@param[out] xdes extent descriptor page
+@param[in,out] mtr mini-transaction
+@return the extent descriptor */
+static xdes_t* xdes_get_descriptor(const fil_space_t *space, page_no_t offset,
+ buf_block_t **xdes, mtr_t *mtr)
+{
+ buf_block_t *block= buf_page_get(page_id_t(space->id, 0), space->zip_size(),
+ RW_SX_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+ return xdes_get_descriptor_with_space_hdr(block, space, offset, xdes, mtr);
+}
+
+/** Get the extent descriptor of a page.
+The page where the extent descriptor resides is x-locked. If the page
+offset is equal to the free limit of the space, we will add new
+extents from above the free limit to the space free list, unless the
+free limit equals the space size. This adding is necessary to make the
+descriptors defined, as they are uninitialized above the free limit.
+@param[in] space tablespace
+@param[in] page descriptor page offset
+@param[in] offset page offset
+@param[in,out] mtr mini-transaction
+@return the extent descriptor
+@retval NULL if the descriptor is not available */
+MY_ATTRIBUTE((warn_unused_result))
+static
+const xdes_t*
+xdes_get_descriptor_const(
+ const fil_space_t* space,
+ page_no_t page,
+ page_no_t offset,
+ mtr_t* mtr)
+{
+ ut_ad(mtr->memo_contains(space->latch, MTR_MEMO_SX_LOCK));
+ ut_ad(offset < space->free_limit);
+ ut_ad(offset < space->size_in_header);
+
+ const ulint zip_size = space->zip_size();
+
+ if (buf_block_t* block = buf_page_get_gen(page_id_t(space->id, page),
+ zip_size, RW_S_LATCH,
+ nullptr,
+ BUF_GET_POSSIBLY_FREED,
+ __FILE__, __LINE__, mtr)) {
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ if (block->page.status == buf_page_t::FREED) {
+ return nullptr;
+ }
+
+ ut_ad(page != 0 || space->free_limit == mach_read_from_4(
+ FSP_FREE_LIMIT + FSP_HEADER_OFFSET
+ + block->frame));
+ ut_ad(page != 0 || space->size_in_header == mach_read_from_4(
+ FSP_SIZE + FSP_HEADER_OFFSET
+ + block->frame));
+
+ return(block->frame + XDES_ARR_OFFSET + XDES_SIZE
+ * xdes_calc_descriptor_index(zip_size, offset));
+ }
+
+ return(NULL);
+}
+
+/** Get a pointer to the extent descriptor. The page where the
+extent descriptor resides is x-locked.
+@param[in] space tablespace
+@param[in] lst_node file address of the list node
+ contained in the descriptor
+@param[out] block extent descriptor block
+@param[in,out] mtr mini-transaction
+@return pointer to the extent descriptor */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+UNIV_INLINE
+xdes_t*
+xdes_lst_get_descriptor(
+ const fil_space_t* space,
+ fil_addr_t lst_node,
+ buf_block_t** block,
+ mtr_t* mtr)
+{
+ ut_ad(mtr->memo_contains(*space));
+ return fut_get_ptr(space->id, space->zip_size(),
+ lst_node, RW_SX_LATCH, mtr, block)
+ - XDES_FLST_NODE;
+}
+
+/********************************************************************//**
+Returns page offset of the first page in extent described by a descriptor.
+@return offset of the first page in extent */
+static uint32_t xdes_get_offset(const xdes_t *descr)
+{
+ ut_ad(descr);
+ return page_get_page_no(page_align(descr)) +
+ uint32_t(((page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE) *
+ FSP_EXTENT_SIZE);
+}
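+
+/* Worked example of the arithmetic above: a descriptor located
+3 * XDES_SIZE bytes past XDES_ARR_OFFSET on descriptor page 0
+describes the fourth extent, so with FSP_EXTENT_SIZE == 64 the
+function returns 0 + 3 * 64 = 192, the first page of that extent. */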
+
+/** Initialize a file page whose prior contents should be ignored.
+@param[in,out] block buffer pool block */
+void fsp_apply_init_file_page(buf_block_t *block)
+{
+ memset_aligned<UNIV_PAGE_SIZE_MIN>(block->frame, 0, srv_page_size);
+ const page_id_t id(block->page.id());
+
+ mach_write_to_4(block->frame + FIL_PAGE_OFFSET, id.page_no());
+ if (log_sys.is_physical())
+ memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8);
+ mach_write_to_4(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id.space());
+ if (page_zip_des_t* page_zip= buf_block_get_page_zip(block))
+ {
+ memset_aligned<UNIV_ZIP_SIZE_MIN>(page_zip->data, 0,
+ page_zip_get_size(page_zip));
+ static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
+ memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET,
+ block->frame + FIL_PAGE_OFFSET, 4);
+ if (log_sys.is_physical())
+ memset_aligned<8>(page_zip->data + FIL_PAGE_PREV, 0xff, 8);
+ static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+ "not perfect alignment");
+ memcpy_aligned<2>(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
+ }
+}
+
+#ifdef UNIV_DEBUG
+/** Assert that the mini-transaction is compatible with
+updating an allocation bitmap page.
+@param[in] mtr mini-transaction */
+void fil_space_t::modify_check(const mtr_t& mtr) const
+{
+ switch (mtr.get_log_mode()) {
+ case MTR_LOG_NONE:
+ /* These modes are only allowed within a non-bitmap page
+ when there is a higher-level redo log record written. */
+ ut_ad(purpose == FIL_TYPE_TABLESPACE
+ || purpose == FIL_TYPE_TEMPORARY);
+ break;
+ case MTR_LOG_NO_REDO:
+ ut_ad(purpose == FIL_TYPE_TEMPORARY
+ || purpose == FIL_TYPE_IMPORT);
+ return;
+ case MTR_LOG_ALL:
+ /* We may only write redo log for a persistent
+ tablespace. */
+ ut_ad(purpose == FIL_TYPE_TABLESPACE);
+ ut_ad(mtr.is_named_space(id));
+ return;
+ }
+
+ ut_ad("invalid log mode" == 0);
+}
+#endif
+
+/**********************************************************************//**
+Writes the space id and flags to a tablespace header. The flags contain
+row type, physical/compressed page size, and logical/uncompressed page
+size of the tablespace. */
+void
+fsp_header_init_fields(
+/*===================*/
+ page_t* page, /*!< in/out: first page in the space */
+ ulint space_id, /*!< in: space id */
+ ulint flags) /*!< in: tablespace flags (FSP_SPACE_FLAGS) */
+{
+ flags &= ~FSP_FLAGS_MEM_MASK;
+ ut_a(fil_space_t::is_valid_flags(flags, space_id));
+
+ mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page,
+ space_id);
+ mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page,
+ flags);
+}
+
+/** Initialize a tablespace header.
+@param[in,out] space tablespace
+@param[in] size current size in blocks
+@param[in,out] mtr mini-transaction */
+void fsp_header_init(fil_space_t* space, uint32_t size, mtr_t* mtr)
+{
+ const page_id_t page_id(space->id, 0);
+ const ulint zip_size = space->zip_size();
+
+ buf_block_t *free_block = buf_LRU_get_free_block(false);
+
+ mtr_x_lock_space(space, mtr);
+
+ buf_block_t* block = buf_page_create(space, 0, zip_size, mtr,
+ free_block);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ if (UNIV_UNLIKELY(block != free_block)) {
+ buf_pool.free_block(free_block);
+ }
+
+ space->size_in_header = size;
+ space->free_len = 0;
+ space->free_limit = 0;
+
+ /* The prior contents of the file page should be ignored */
+
+ fsp_init_file_page(space, block, mtr);
+
+ mtr->write<2>(*block, block->frame + FIL_PAGE_TYPE,
+ FIL_PAGE_TYPE_FSP_HDR);
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, FSP_HEADER_OFFSET + FSP_SPACE_ID
+ + block->frame, space->id);
+ ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_NOT_USED
+ + block->frame));
+ /* recv_sys_t::parse() expects to find a WRITE record that
+ covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
+ in order to avoid optimizing away any unchanged most
+ significant bytes of FSP_SIZE. */
+ mtr->write<4,mtr_t::FORCED>(*block, FSP_HEADER_OFFSET + FSP_SIZE
+ + block->frame, size);
+ ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
+ + block->frame));
+ if (auto f = space->flags & ~FSP_FLAGS_MEM_MASK) {
+ mtr->write<4,mtr_t::FORCED>(*block,
+ FSP_HEADER_OFFSET + FSP_SPACE_FLAGS
+ + block->frame, f);
+ }
+ ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+ + block->frame));
+
+ flst_init(block, FSP_HEADER_OFFSET + FSP_FREE, mtr);
+ flst_init(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, mtr);
+ flst_init(block, FSP_HEADER_OFFSET + FSP_FULL_FRAG, mtr);
+ flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, mtr);
+ flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, mtr);
+
+ mtr->write<8>(*block, FSP_HEADER_OFFSET + FSP_SEG_ID + block->frame,
+ 1U);
+
+ fsp_fill_free_list(!is_system_tablespace(space->id),
+ space, block, mtr);
+
+ /* Write encryption metadata to page 0 if tablespace is
+ encrypted or encryption is disabled by table option. */
+ if (space->crypt_data &&
+ (space->crypt_data->should_encrypt() ||
+ space->crypt_data->not_encrypted())) {
+ space->crypt_data->write_page0(block, mtr);
+ }
+}
+
+/** Try to extend a single-table tablespace so that a page would fit in the
+data file.
+@param[in,out] space tablespace
+@param[in] page_no page number
+@param[in,out] header tablespace header
+@param[in,out] mtr mini-transaction
+@return true if success */
+static ATTRIBUTE_COLD __attribute__((warn_unused_result))
+bool
+fsp_try_extend_data_file_with_pages(
+ fil_space_t* space,
+ uint32_t page_no,
+ buf_block_t* header,
+ mtr_t* mtr)
+{
+ bool success;
+ ulint size;
+
+ ut_a(!is_system_tablespace(space->id));
+ ut_d(space->modify_check(*mtr));
+
+ size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + header->frame);
+ ut_ad(size == space->size_in_header);
+
+ ut_a(page_no >= size);
+
+ success = fil_space_extend(space, page_no + 1);
+ /* The size may be less than we wanted if we ran out of disk space. */
+ /* recv_sys_t::parse() expects to find a WRITE record that
+ covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
+ in order to avoid optimizing away any unchanged most
+ significant bytes of FSP_SIZE. */
+ mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE
+ + header->frame, space->size);
+ space->size_in_header = space->size;
+
+ return(success);
+}
+
+/** Calculate the number of physical pages in an extent for this file.
+@param[in] physical_size page_size of the datafile
+@return number of pages in an extent for this file */
+inline uint32_t fsp_get_extent_size_in_pages(ulint physical_size)
+{
+ return uint32_t((FSP_EXTENT_SIZE << srv_page_size_shift) / physical_size);
+}
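+
+/* For example, with the default 16KiB srv_page_size (FSP_EXTENT_SIZE
+== 64) and a ROW_FORMAT=COMPRESSED file of 8KiB physical pages, this
+returns (64 << 14) / 8192 = 128: an extent always spans 1MiB of the
+file, so halving the physical page size doubles the page count. */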
+
+/** Calculate the number of pages to extend a datafile.
+We extend single-table tablespaces first one extent at a time,
+but 4 at a time for bigger tablespaces. It is not enough to extend always
+by one extent, because we need to add at least one extent to FSP_FREE.
+A single extent descriptor page will track many extents. And the extent
+that uses its extent descriptor page is put onto the FSP_FREE_FRAG list.
+Extents that do not use their extent descriptor page are added to FSP_FREE.
+The physical page size is used to determine how many extents are tracked
+on one extent descriptor page. See xdes_calc_descriptor_page().
+@param[in] physical_size page size in data file
+@param[in] size current number of pages in the datafile
+@return number of pages to extend the file. */
+static uint32_t fsp_get_pages_to_extend_ibd(unsigned physical_size,
+ uint32_t size)
+{
+ uint32_t extent_size = fsp_get_extent_size_in_pages(physical_size);
+ /* The threshold is set at 32MiB except when the physical page
+ size is small enough that it must be done sooner. */
+ uint32_t threshold = std::min(32 * extent_size, physical_size);
+
+ if (size >= threshold) {
+ /* Below in fsp_fill_free_list() we assume
+ that we add at most FSP_FREE_ADD extents at
+ a time */
+ extent_size *= FSP_FREE_ADD;
+ }
+
+ return extent_size;
+}
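+
+/* For example, with 16KiB pages the threshold is min(32 * 64, 16384)
+= 2048 pages (32MiB): a smaller file is extended by one extent
+(64 pages, 1MiB) per call, a larger one by FSP_FREE_ADD (4) extents
+(4MiB) per call. */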
+
+/** Try to extend the last data file of a tablespace if it is auto-extending.
+@param[in,out] space tablespace
+@param[in,out] header tablespace header
+@param[in,out] mtr mini-transaction
+@return number of pages added
+@retval 0 if the tablespace was not extended */
+ATTRIBUTE_COLD __attribute__((nonnull))
+static
+ulint
+fsp_try_extend_data_file(fil_space_t *space, buf_block_t *header, mtr_t *mtr)
+{
+ const char* OUT_OF_SPACE_MSG =
+ "ran out of space. Please add another file or use"
+ " 'autoextend' for the last file in setting";
+
+ ut_d(space->modify_check(*mtr));
+
+ if (space->id == TRX_SYS_SPACE
+ && !srv_sys_space.can_auto_extend_last_file()) {
+
+ /* We print the error message only once to avoid
+ spamming the error log. Note that we don't need
+ to reset the flag to false as dealing with this
+ error requires server restart. */
+ if (!srv_sys_space.get_tablespace_full_status()) {
+ ib::error() << "The InnoDB system tablespace "
+ << OUT_OF_SPACE_MSG
+ << " innodb_data_file_path.";
+ srv_sys_space.set_tablespace_full_status(true);
+ }
+ return(0);
+ } else if (space->id == SRV_TMP_SPACE_ID
+ && !srv_tmp_space.can_auto_extend_last_file()) {
+
+ /* We print the error message only once to avoid
+ spamming the error log. Note that we don't need
+ to reset the flag to false as dealing with this
+ error requires server restart. */
+ if (!srv_tmp_space.get_tablespace_full_status()) {
+ ib::error() << "The InnoDB temporary tablespace "
+ << OUT_OF_SPACE_MSG
+ << " innodb_temp_data_file_path.";
+ srv_tmp_space.set_tablespace_full_status(true);
+ }
+ return(0);
+ }
+
+ uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+ + header->frame);
+ ut_ad(size == space->size_in_header);
+ uint32_t size_increase;
+
+ const unsigned ps = space->physical_size();
+
+ switch (space->id) {
+ case TRX_SYS_SPACE:
+ size_increase = srv_sys_space.get_increment();
+ break;
+ case SRV_TMP_SPACE_ID:
+ size_increase = srv_tmp_space.get_increment();
+ break;
+ default:
+ uint32_t extent_pages = fsp_get_extent_size_in_pages(ps);
+ if (size < extent_pages) {
+ /* Let us first extend the file to extent_size */
+ if (!fsp_try_extend_data_file_with_pages(
+ space, extent_pages - 1, header, mtr)) {
+ return(0);
+ }
+
+ size = extent_pages;
+ }
+
+ size_increase = fsp_get_pages_to_extend_ibd(ps, size);
+ }
+
+ if (size_increase == 0) {
+ return(0);
+ }
+
+ if (!fil_space_extend(space, size + size_increase)) {
+ return(0);
+ }
+
+ /* We ignore any fragments of a full megabyte when storing the size
+ to the space header */
+
+ space->size_in_header = ut_2pow_round(space->size, (1024 * 1024) / ps);
+
+ /* recv_sys_t::parse() expects to find a WRITE record that
+ covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
+ in order to avoid optimizing away any unchanged most
+ significant bytes of FSP_SIZE. */
+ mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE
+ + header->frame, space->size_in_header);
+
+ return(size_increase);
+}
+
+/** Reset the page type.
+Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE.
+In MySQL 3.23.53, only undo log pages and index pages were tagged.
+Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
+@param[in] block block with invalid FIL_PAGE_TYPE
+@param[in] type expected page type
+@param[in,out] mtr mini-transaction */
+ATTRIBUTE_COLD
+void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr)
+{
+ ib::info()
+ << "Resetting invalid page " << block.page.id() << " type "
+ << fil_page_get_type(block.frame) << " to " << type << ".";
+ mtr->write<2>(block, block.frame + FIL_PAGE_TYPE, type);
+}
+
+/** Put new extents to the free list if there are free extents above the free
+limit. If an extent happens to contain an extent descriptor page, the extent
+is put to the FSP_FREE_FRAG list with the page marked as used.
+@param[in] init_space true if this is a single-table tablespace
+and we are only initializing the first extent and the first bitmap pages;
+then we will not allocate more extents
+@param[in,out] space tablespace
+@param[in,out] header tablespace header
+@param[in,out] mtr mini-transaction */
+static
+void
+fsp_fill_free_list(
+ bool init_space,
+ fil_space_t* space,
+ buf_block_t* header,
+ mtr_t* mtr)
+{
+ ut_d(space->modify_check(*mtr));
+
+ /* Check if we can fill free list from above the free list limit */
+ uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+ + header->frame);
+ uint32_t limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
+ + header->frame);
+
+ ut_ad(size == space->size_in_header);
+ ut_ad(limit == space->free_limit);
+
+ const ulint zip_size = space->zip_size();
+
+ if (size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) {
+ bool skip_resize = init_space;
+ switch (space->id) {
+ case TRX_SYS_SPACE:
+ skip_resize = !srv_sys_space.can_auto_extend_last_file();
+ break;
+ case SRV_TMP_SPACE_ID:
+ skip_resize = !srv_tmp_space.can_auto_extend_last_file();
+ break;
+ }
+
+ if (!skip_resize) {
+ fsp_try_extend_data_file(space, header, mtr);
+ size = space->size_in_header;
+ }
+ }
+
+ uint32_t count = 0;
+
+ for (uint32_t i = limit, extent_size = FSP_EXTENT_SIZE,
+ physical_size = space->physical_size();
+ (init_space && i < 1)
+ || (i + extent_size <= size && count < FSP_FREE_ADD);
+ i += extent_size) {
+ const bool init_xdes = !ut_2pow_remainder(i, physical_size);
+
+ space->free_limit = i + extent_size;
+ mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FREE_LIMIT
+ + header->frame, i + extent_size);
+
+ if (init_xdes) {
+
+ buf_block_t* block;
+
+ /* We are going to initialize a new descriptor page
+ and a new ibuf bitmap page: the prior contents of the
+ pages should be ignored. */
+
+ if (i > 0) {
+ buf_block_t *f= buf_LRU_get_free_block(false);
+ block= buf_page_create(
+ space, static_cast<uint32_t>(i),
+ zip_size, mtr, f);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+ if (UNIV_UNLIKELY(block != f)) {
+ buf_pool.free_block(f);
+ }
+ fsp_init_file_page(space, block, mtr);
+ mtr->write<2>(*block,
+ FIL_PAGE_TYPE + block->frame,
+ FIL_PAGE_TYPE_XDES);
+ }
+
+ if (space->purpose != FIL_TYPE_TEMPORARY) {
+ buf_block_t *f= buf_LRU_get_free_block(false);
+ block = buf_page_create(
+ space,
+ static_cast<uint32_t>(
+ i + FSP_IBUF_BITMAP_OFFSET),
+ zip_size, mtr, f);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+ if (UNIV_UNLIKELY(block != f)) {
+ buf_pool.free_block(f);
+ }
+ fsp_init_file_page(space, block, mtr);
+ mtr->write<2>(*block,
+ block->frame + FIL_PAGE_TYPE,
+ FIL_PAGE_IBUF_BITMAP);
+ }
+ }
+
+ buf_block_t* xdes;
+ xdes_t* descr = xdes_get_descriptor_with_space_hdr(
+ header, space, i, &xdes, mtr, init_space);
+ if (xdes != header && !space->full_crc32()) {
+ fil_block_check_type(*xdes, FIL_PAGE_TYPE_XDES, mtr);
+ }
+ xdes_init(*xdes, descr, mtr);
+ const uint16_t xoffset= static_cast<uint16_t>(
+ descr - xdes->frame + XDES_FLST_NODE);
+
+ if (UNIV_UNLIKELY(init_xdes)) {
+
+ /* The first page in the extent is a descriptor page
+ and the second is an ibuf bitmap page: mark them
+ used */
+
+ xdes_set_free<false>(*xdes, descr, 0, mtr);
+ xdes_set_free<false>(*xdes, descr,
+ FSP_IBUF_BITMAP_OFFSET, mtr);
+ xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+
+ flst_add_last(header,
+ FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ xdes, xoffset, mtr);
+ byte* n_used = FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+ + header->frame;
+ mtr->write<4>(*header, n_used,
+ 2U + mach_read_from_4(n_used));
+ } else {
+ flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE,
+ xdes, xoffset, mtr);
+ count++;
+ }
+ }
+
+ space->free_len += count;
+}
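+
+/* In the loop above, init_xdes is true once every physical_size pages
+(with 16KiB pages, every 16384 pages, i.e. every 256MiB of file): such
+an extent must donate its first pages to the new descriptor page and
+ibuf bitmap page, so it joins FSP_FREE_FRAG with two pages marked
+used, while all other extents are appended whole to FSP_FREE. */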
+
+/** Allocates a new free extent.
+@param[in,out] space tablespace
+@param[in] hint hint of which extent would be desirable: any
+page offset in the extent will do; the hint must not be > FSP_FREE_LIMIT
+@param[out] xdes extent descriptor page
+@param[in,out] mtr mini-transaction
+@return extent descriptor, NULL if cannot be allocated */
+static
+xdes_t*
+fsp_alloc_free_extent(
+ fil_space_t* space,
+ uint32_t hint,
+ buf_block_t** xdes,
+ mtr_t* mtr)
+{
+ fil_addr_t first;
+ xdes_t* descr;
+ buf_block_t* desc_block = NULL;
+
+ buf_block_t* header = fsp_get_header(space, mtr);
+
+ descr = xdes_get_descriptor_with_space_hdr(
+ header, space, hint, &desc_block, mtr);
+
+ if (desc_block != header && !space->full_crc32()) {
+ fil_block_check_type(*desc_block, FIL_PAGE_TYPE_XDES, mtr);
+ }
+
+ if (descr && (xdes_get_state(descr) == XDES_FREE)) {
+ /* Ok, we can take this extent */
+ } else {
+ /* Take the first extent in the free list */
+ first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE
+ + header->frame);
+
+ if (first.page == FIL_NULL) {
+ fsp_fill_free_list(false, space, header, mtr);
+
+ first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE
+ + header->frame);
+ if (first.page == FIL_NULL) {
+ return nullptr; /* No free extents left */
+ }
+ }
+
+ descr = xdes_lst_get_descriptor(space, first, &desc_block,
+ mtr);
+ }
+
+ flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE, desc_block,
+ static_cast<uint16_t>(
+ descr - desc_block->frame + XDES_FLST_NODE), mtr);
+ space->free_len--;
+ *xdes = desc_block;
+
+ return(descr);
+}
+
+/** Allocate a single free page.
+@param[in,out] header tablespace header
+@param[in,out] xdes extent descriptor page
+@param[in,out] descr extent descriptor
+@param[in] bit slot to allocate in the extent
+@param[in,out] mtr mini-transaction */
+static void
+fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr,
+ ulint bit, mtr_t *mtr)
+{
+ ut_ad(xdes_get_state(descr) == XDES_FREE_FRAG);
+ ut_a(xdes_is_free(descr, bit));
+ xdes_set_free<false>(*xdes, descr, bit, mtr);
+
+ /* Update the FRAG_N_USED field */
+ byte* n_used_p = FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->frame;
+
+ uint32_t n_used = mach_read_from_4(n_used_p) + 1;
+
+ if (xdes_is_full(descr)) {
+ /* The fragment is full: move it to another list */
+ const uint16_t xoffset= static_cast<uint16_t>(
+ descr - xdes->frame + XDES_FLST_NODE);
+ flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ xdes, xoffset, mtr);
+ xdes_set_state(*xdes, descr, XDES_FULL_FRAG, mtr);
+
+ flst_add_last(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
+ xdes, xoffset, mtr);
+ n_used -= FSP_EXTENT_SIZE;
+ }
+
+ mtr->write<4>(*header, n_used_p, n_used);
+}
+
+/** Gets a buffer block for an allocated page.
+@param[in,out] space tablespace
+@param[in] offset page number of the allocated page
+@param[in,out] mtr mini-transaction
+@return block, initialized */
+static
+buf_block_t*
+fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr)
+{
+ buf_block_t *free_block= buf_LRU_get_free_block(false);
+ buf_block_t *block= buf_page_create(space, static_cast<uint32_t>(offset),
+ space->zip_size(), mtr, free_block);
+ if (UNIV_UNLIKELY(block != free_block))
+ buf_pool.free_block(free_block);
+ fsp_init_file_page(space, block, mtr);
+ return block;
+}
+
+/** Allocates a single free page from a space.
+The page is marked as used.
+@param[in,out] space tablespace
+@param[in] hint hint of which page would be desirable
+@param[in,out] mtr mini-transaction
+@param[in,out] init_mtr mini-transaction in which the page should be
+initialized (may be the same as mtr)
+@retval NULL if no page could be allocated */
+static MY_ATTRIBUTE((warn_unused_result, nonnull))
+buf_block_t*
+fsp_alloc_free_page(
+ fil_space_t* space,
+ uint32_t hint,
+ mtr_t* mtr,
+ mtr_t* init_mtr)
+{
+ fil_addr_t first;
+ xdes_t* descr;
+ const ulint space_id = space->id;
+
+ ut_d(space->modify_check(*mtr));
+ buf_block_t* block = fsp_get_header(space, mtr);
+ buf_block_t *xdes;
+
+ /* Get the hinted descriptor */
+ descr = xdes_get_descriptor_with_space_hdr(block, space, hint, &xdes,
+ mtr);
+
+ if (descr && (xdes_get_state(descr) == XDES_FREE_FRAG)) {
+ /* Ok, we can take this extent */
+ } else {
+ /* Else take the first extent in free_frag list */
+ first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE_FRAG
+ + block->frame);
+
+ if (first.page == FIL_NULL) {
+ /* There are no partially full fragments: allocate
+ a free extent and add it to the FREE_FRAG list. NOTE
+ that the allocation may have as a side-effect that an
+ extent containing a descriptor page is added to the
+ FREE_FRAG list. But we will allocate our page from
+ the free extent anyway. */
+
+ descr = fsp_alloc_free_extent(space, hint, &xdes, mtr);
+
+ if (descr == NULL) {
+ /* No free space left */
+
+ return(NULL);
+ }
+
+ xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+ flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ xdes, static_cast<uint16_t>(
+ descr - xdes->frame
+ + XDES_FLST_NODE), mtr);
+ } else {
+ descr = xdes_lst_get_descriptor(space, first, &xdes,
+ mtr);
+ }
+
+ /* Reset the hint */
+ hint = 0;
+ }
+
+ /* Now we have in descr an extent with at least one free page. Look
+ for a free page in the extent. */
+
+ uint32_t free = xdes_find_free(descr, hint % FSP_EXTENT_SIZE);
+ if (free == FIL_NULL) {
+
+ ut_print_buf(stderr, ((byte*) descr) - 500, 1000);
+ putc('\n', stderr);
+
+ ut_error;
+ }
+
+ uint32_t page_no = xdes_get_offset(descr) + free;
+
+ uint32_t space_size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+ + block->frame);
+ ut_ad(space_size == space->size_in_header
+ || (space_id == TRX_SYS_SPACE
+ && srv_startup_is_before_trx_rollback_phase));
+
+ if (space_size <= page_no) {
+ /* It must be that we are extending a single-table tablespace
+ whose size is still < 64 pages */
+
+ ut_a(!is_system_tablespace(space_id));
+ if (page_no >= FSP_EXTENT_SIZE) {
+ ib::error() << "Trying to extend a single-table"
+ " tablespace " << space->name << " , by single"
+ " page(s) though the space size " << space_size
+ << ". Page no " << page_no << ".";
+ return(NULL);
+ }
+
+ if (!fsp_try_extend_data_file_with_pages(space, page_no,
+ block, mtr)) {
+ /* No disk space left */
+ return(NULL);
+ }
+ }
+
+ fsp_alloc_from_free_frag(block, xdes, descr, free, mtr);
+ return fsp_page_create(space, page_no, init_mtr);
+}
+
+/** Frees a single page of a space.
+The page is marked as free and clean.
+@param[in,out] space tablespace
+@param[in] offset page number
+@param[in,out] mtr mini-transaction */
+static void fsp_free_page(fil_space_t* space, page_no_t offset, mtr_t* mtr)
+{
+ xdes_t* descr;
+ ulint state;
+ ulint frag_n_used;
+
+ ut_ad(mtr);
+ ut_d(space->modify_check(*mtr));
+
+ /* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */
+
+ buf_block_t* header = fsp_get_header(space, mtr);
+ buf_block_t* xdes= 0;
+
+ descr = xdes_get_descriptor_with_space_hdr(header, space, offset,
+ &xdes, mtr);
+
+ state = xdes_get_state(descr);
+
+ if (UNIV_UNLIKELY(state != XDES_FREE_FRAG
+ && state != XDES_FULL_FRAG)) {
+ ib::error() << "File space extent descriptor of page "
+ << page_id_t(space->id, offset)
+ << " has state " << state;
+ /* Crash in debug version, so that we get a core dump
+ of this corruption. */
+ ut_ad(0);
+
+ if (state == XDES_FREE) {
+ /* We put here some fault tolerance: if the page
+ is already free, return without doing anything! */
+
+ return;
+ }
+
+ ut_error;
+ }
+
+ if (xdes_is_free(descr, offset % FSP_EXTENT_SIZE)) {
+ ib::error() << "File space extent descriptor of page "
+ << page_id_t(space->id, offset)
+ << " says it is free.";
+ /* Crash in debug version, so that we get a core dump
+ of this corruption. */
+ ut_ad(0);
+
+ /* We put here some fault tolerance: if the page
+ is already free, return without doing anything! */
+
+ return;
+ }
+
+ mtr->free(*space, static_cast<uint32_t>(offset));
+
+ const ulint bit = offset % FSP_EXTENT_SIZE;
+
+ xdes_set_free<true>(*xdes, descr, bit, mtr);
+
+ frag_n_used = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+ + header->frame);
+
+ const uint16_t xoffset= static_cast<uint16_t>(descr - xdes->frame
+ + XDES_FLST_NODE);
+
+ if (state == XDES_FULL_FRAG) {
+ /* The fragment was full: move it to another list */
+ flst_remove(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
+ xdes, xoffset, mtr);
+ xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+ flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ xdes, xoffset, mtr);
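+		/* This extent joins FSP_FREE_FRAG with one page just
+		freed, so it contributes FSP_EXTENT_SIZE - 1 used pages
+		to the FSP_FRAG_N_USED count. */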
+ mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+ + header->frame,
+ frag_n_used + FSP_EXTENT_SIZE - 1);
+ } else {
+ ut_a(frag_n_used > 0);
+ mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+ + header->frame, frag_n_used - 1);
+ }
+
+ if (!xdes_get_n_used(descr)) {
+ /* The extent has become free: move it to another list */
+ flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ xdes, xoffset, mtr);
+ fsp_free_extent(space, offset, mtr);
+ }
+}
+
+/** Return an extent to the free list of a space.
+@param[in,out] space tablespace
+@param[in] offset page number in the extent
+@param[in,out] mtr mini-transaction */
+static void fsp_free_extent(fil_space_t* space, page_no_t offset, mtr_t* mtr)
+{
+ ut_ad(mtr->memo_contains(*space));
+
+ buf_block_t *block= fsp_get_header(space, mtr);
+ buf_block_t *xdes= 0;
+
+ xdes_t* descr= xdes_get_descriptor_with_space_hdr(block, space, offset,
+ &xdes, mtr);
+ ut_a(xdes_get_state(descr) != XDES_FREE);
+
+ xdes_init(*xdes, descr, mtr);
+
+ flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE,
+ xdes, static_cast<uint16_t>(descr - xdes->frame +
+ XDES_FLST_NODE), mtr);
+ space->free_len++;
+}
+
+/** @return Number of segment inodes which fit on a single page */
+inline ulint FSP_SEG_INODES_PER_PAGE(ulint physical_size)
+{
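+  /* With the default 16KiB page size this evaluates to
+  (16384 - 50 - 10) / 192 = 85 inodes per page. */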
+ return (physical_size - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE;
+}
+
+/** Returns the nth inode slot on an inode page.
+@param[in] page segment inode page
+@param[in] i inode index on page
+@return segment inode */
+#define fsp_seg_inode_page_get_nth_inode(page, i) \
+ FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i + page
+
+/** Looks for a used segment inode on a segment inode page.
+@param[in] page segment inode page
+@param[in] physical_size page size
+@return segment inode index, or ULINT_UNDEFINED if not found */
+static
+ulint
+fsp_seg_inode_page_find_used(const page_t* page, ulint physical_size)
+{
+ for (ulint i = 0; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) {
+ if (!mach_read_from_8(
+ FSEG_ID
+ + fsp_seg_inode_page_get_nth_inode(page, i))) {
+ continue;
+ }
+ /* This is used */
+ ut_ad(FSEG_MAGIC_N_VALUE == mach_read_from_4(
+ FSEG_MAGIC_N
+ + fsp_seg_inode_page_get_nth_inode(page, i)));
+ return i;
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Looks for an unused segment inode on a segment inode page.
+@param[in] page segment inode page
+@param[in] i search forward starting from this index
+@param[in] physical_size page size
+@return segment inode index, or ULINT_UNDEFINED if not found */
+static
+ulint
+fsp_seg_inode_page_find_free(const page_t* page, ulint i, ulint physical_size)
+{
+ for (; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) {
+ if (!mach_read_from_8(
+ FSEG_ID
+ + fsp_seg_inode_page_get_nth_inode(page, i))) {
+ /* This is unused */
+ return i;
+ }
+
+ ut_ad(FSEG_MAGIC_N_VALUE == mach_read_from_4(
+ FSEG_MAGIC_N
+ + fsp_seg_inode_page_get_nth_inode(page, i)));
+ }
+
+ return ULINT_UNDEFINED;
+}
+
+/** Allocate a file segment inode page.
+@param[in,out] space tablespace
+@param[in,out] header tablespace header
+@param[in,out] mtr mini-transaction
+@return whether the allocation succeeded */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+static
+bool
+fsp_alloc_seg_inode_page(fil_space_t *space, buf_block_t *header, mtr_t *mtr)
+{
+ ut_ad(header->page.id().space() == space->id);
+ buf_block_t *block= fsp_alloc_free_page(space, 0, mtr, mtr);
+
+ if (!block)
+ return false;
+
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+ ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
+
+ mtr->write<2>(*block, block->frame + FIL_PAGE_TYPE, FIL_PAGE_INODE);
+
+#ifdef UNIV_DEBUG
+ const byte *inode= FSEG_ID + FSEG_ARR_OFFSET + block->frame;
+ for (ulint i= FSP_SEG_INODES_PER_PAGE(space->physical_size()); i--;
+ inode += FSEG_INODE_SIZE)
+ ut_ad(!mach_read_from_8(inode));
+#endif
+
+ flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+ block, FSEG_INODE_PAGE_NODE, mtr);
+ return true;
+}
+
+/** Allocate a file segment inode.
+@param[in,out] space tablespace
+@param[in,out] header tablespace header
+@param[out] iblock segment inode page
+@param[in,out] mtr mini-transaction
+@return segment inode
+@retval NULL if not enough space */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+static fseg_inode_t*
+fsp_alloc_seg_inode(fil_space_t *space, buf_block_t *header,
+ buf_block_t **iblock, mtr_t *mtr)
+{
+ buf_block_t* block;
+ fseg_inode_t* inode;
+
+ /* Allocate a new segment inode page if needed. */
+ if (!flst_get_len(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE
+ + header->frame)
+ && !fsp_alloc_seg_inode_page(space, header, mtr)) {
+ return(NULL);
+ }
+ const page_id_t page_id(
+ space->id,
+ flst_get_first(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE
+ + header->frame).page);
+
+ block = buf_page_get(page_id, space->zip_size(), RW_SX_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+ if (!space->full_crc32()) {
+ fil_block_check_type(*block, FIL_PAGE_INODE, mtr);
+ }
+
+ const ulint physical_size = space->physical_size();
+
+ ulint n = fsp_seg_inode_page_find_free(block->frame, 0, physical_size);
+
+ ut_a(n < FSP_SEG_INODES_PER_PAGE(physical_size));
+
+ inode = fsp_seg_inode_page_get_nth_inode(block->frame, n);
+
+ if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(block->frame,
+ n + 1,
+ physical_size)) {
+ /* There are no other unused headers left on the page: move it
+ to another list */
+ flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+ block, FSEG_INODE_PAGE_NODE, mtr);
+ flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
+ block, FSEG_INODE_PAGE_NODE, mtr);
+ }
+
+ ut_ad(!mach_read_from_8(inode + FSEG_ID)
+ || mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ *iblock = block;
+ return(inode);
+}
+
+/** Frees a file segment inode.
+@param[in,out] space tablespace
+@param[in,out] inode segment inode
+@param[in,out] iblock segment inode page
+@param[in,out] mtr mini-transaction */
+static void fsp_free_seg_inode(
+ fil_space_t* space,
+ fseg_inode_t* inode,
+ buf_block_t* iblock,
+ mtr_t* mtr)
+{
+ ut_d(space->modify_check(*mtr));
+
+ buf_block_t* header = fsp_get_header(space, mtr);
+
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+ const ulint physical_size = space->physical_size();
+
+ if (ULINT_UNDEFINED
+ == fsp_seg_inode_page_find_free(iblock->frame, 0, physical_size)) {
+ /* Move the page to another list */
+ flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
+ iblock, FSEG_INODE_PAGE_NODE, mtr);
+ flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+ iblock, FSEG_INODE_PAGE_NODE, mtr);
+ }
+
+ mtr->memset(iblock, page_offset(inode) + FSEG_ID, FSEG_INODE_SIZE, 0);
+
+ if (ULINT_UNDEFINED
+ == fsp_seg_inode_page_find_used(iblock->frame, physical_size)) {
+ /* There are no other used headers left on the page: free it */
+ flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+ iblock, FSEG_INODE_PAGE_NODE, mtr);
+ fsp_free_page(space, iblock->page.id().page_no(), mtr);
+ }
+}
+
+/** Returns the file segment inode, page x-latched.
+@param[in] header segment header
+@param[in] space space id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction
+@param[out] block inode block, or NULL to ignore
+@return segment inode, page x-latched; NULL if the inode is free */
+static
+fseg_inode_t*
+fseg_inode_try_get(
+ const fseg_header_t* header,
+ ulint space,
+ ulint zip_size,
+ mtr_t* mtr,
+ buf_block_t** block)
+{
+ fil_addr_t inode_addr;
+ fseg_inode_t* inode;
+
+ inode_addr.page = mach_read_from_4(header + FSEG_HDR_PAGE_NO);
+ inode_addr.boffset = mach_read_from_2(header + FSEG_HDR_OFFSET);
+ ut_ad(space == mach_read_from_4(header + FSEG_HDR_SPACE));
+
+ inode = fut_get_ptr(space, zip_size, inode_addr, RW_SX_LATCH, mtr,
+ block);
+
+ if (UNIV_UNLIKELY(!mach_read_from_8(inode + FSEG_ID))) {
+
+ inode = NULL;
+ } else {
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ }
+
+ return(inode);
+}
+
+/** Returns the file segment inode, page x-latched.
+@param[in] header segment header
+@param[in] space space id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction
+@param[out] block inode block
+@return segment inode, page x-latched */
+static
+fseg_inode_t*
+fseg_inode_get(
+ const fseg_header_t* header,
+ ulint space,
+ ulint zip_size,
+ mtr_t* mtr,
+ buf_block_t** block = NULL)
+{
+ fseg_inode_t* inode
+ = fseg_inode_try_get(header, space, zip_size, mtr, block);
+ ut_a(inode);
+ return(inode);
+}
+
+/** Get the page number from the nth fragment page slot.
+@param inode segment inode
+@param n slot index
+@return page number
+@retval FIL_NULL if not in use */
+static uint32_t fseg_get_nth_frag_page_no(const fseg_inode_t *inode, ulint n)
+{
+ ut_ad(inode);
+ ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ return(mach_read_from_4(inode + FSEG_FRAG_ARR
+ + n * FSEG_FRAG_SLOT_SIZE));
+}
+
+/** Set the page number in the nth fragment page slot.
+@param[in,out] inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] n slot index
+@param[in] page_no page number to set
+@param[in,out] mtr mini-transaction */
+inline void fseg_set_nth_frag_page_no(fseg_inode_t *inode, buf_block_t *iblock,
+ ulint n, ulint page_no, mtr_t *mtr)
+{
+ ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+ ut_ad(mtr->memo_contains_flagged(iblock, MTR_MEMO_PAGE_SX_FIX));
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+ mtr->write<4>(*iblock, inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE,
+ page_no);
+}
+
+/**********************************************************************//**
+Finds a fragment page slot which is free.
+@return slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_free_frag_page_slot(
+/*==========================*/
+ fseg_inode_t* inode) /*!< in: segment inode */
+{
+ ulint i;
+ ulint page_no;
+
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ page_no = fseg_get_nth_frag_page_no(inode, i);
+
+ if (page_no == FIL_NULL) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Finds a fragment page slot which is used and last in the array.
+@return slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_last_used_frag_page_slot(
+/*===============================*/
+ fseg_inode_t* inode) /*!< in: segment inode */
+{
+ ulint i;
+ ulint page_no;
+
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ page_no = fseg_get_nth_frag_page_no(
+ inode, FSEG_FRAG_ARR_N_SLOTS - i - 1);
+
+ if (page_no != FIL_NULL) {
+
+ return(FSEG_FRAG_ARR_N_SLOTS - i - 1);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Calculate the number of reserved fragment page slots.
+@param inode segment inode
+@return number of fragment pages */
+static ulint fseg_get_n_frag_pages(const fseg_inode_t *inode)
+{
+ ulint i;
+ ulint count = 0;
+
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i)) {
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/** Create a new segment.
+@param space tablespace
+@param byte_offset byte offset of the created segment header
+@param mtr mini-transaction
+@param has_done_reservation whether fsp_reserve_free_extents() was invoked
+@param block block where segment header is placed,
+ or NULL to allocate an additional page for that
+@return the block where the segment header is placed, x-latched
+@retval NULL if the segment could not be created due to lack of space */
+buf_block_t*
+fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr,
+ bool has_done_reservation, buf_block_t *block)
+{
+ fseg_inode_t* inode;
+ ib_id_t seg_id;
+ uint32_t n_reserved;
+
+ DBUG_ENTER("fseg_create");
+
+ ut_ad(mtr);
+ ut_ad(byte_offset >= FIL_PAGE_DATA);
+ ut_ad(byte_offset + FSEG_HEADER_SIZE
+ <= srv_page_size - FIL_PAGE_DATA_END);
+
+ mtr_x_lock_space(space, mtr);
+ ut_d(space->modify_check(*mtr));
+
+ if (block) {
+ ut_ad(block->page.id().space() == space->id);
+
+ if (!space->full_crc32()) {
+ fil_block_check_type(*block, block->page.id()
+ == page_id_t(TRX_SYS_SPACE,
+ TRX_SYS_PAGE_NO)
+ ? FIL_PAGE_TYPE_TRX_SYS
+ : FIL_PAGE_TYPE_SYS,
+ mtr);
+ }
+ }
+
+ if (!has_done_reservation
+ && !fsp_reserve_free_extents(&n_reserved, space, 2,
+ FSP_NORMAL, mtr)) {
+ DBUG_RETURN(NULL);
+ }
+
+ buf_block_t* header = fsp_get_header(space, mtr);
+ buf_block_t* iblock;
+
+ inode = fsp_alloc_seg_inode(space, header, &iblock, mtr);
+
+ if (inode == NULL) {
+ goto funct_exit;
+ }
+
+ /* Read the next segment id from space header and increment the
+ value in space header */
+
+ seg_id = mach_read_from_8(FSP_HEADER_OFFSET + FSP_SEG_ID
+ + header->frame);
+
+ mtr->write<8>(*header, FSP_HEADER_OFFSET + FSP_SEG_ID + header->frame,
+ seg_id + 1);
+ mtr->write<8>(*iblock, inode + FSEG_ID, seg_id);
+ ut_ad(!mach_read_from_4(inode + FSEG_NOT_FULL_N_USED));
+
+ flst_init(*iblock, inode + FSEG_FREE, mtr);
+ flst_init(*iblock, inode + FSEG_NOT_FULL, mtr);
+ flst_init(*iblock, inode + FSEG_FULL, mtr);
+
+ mtr->write<4>(*iblock, inode + FSEG_MAGIC_N, FSEG_MAGIC_N_VALUE);
+ compile_time_assert(FSEG_FRAG_SLOT_SIZE == 4);
+ compile_time_assert(FIL_NULL == 0xffffffff);
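+	/* Initialize every fragment page slot to FIL_NULL by filling
+	the array with 0xff bytes. */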
+ mtr->memset(iblock, uint16_t(inode - iblock->frame) + FSEG_FRAG_ARR,
+ FSEG_FRAG_SLOT_SIZE * FSEG_FRAG_ARR_N_SLOTS, 0xff);
+
+ if (!block) {
+ block = fseg_alloc_free_page_low(space,
+ inode, iblock, 0, FSP_UP,
+#ifdef UNIV_DEBUG
+ has_done_reservation,
+#endif /* UNIV_DEBUG */
+ mtr, mtr);
+
+ /* The allocation cannot fail if we have already reserved a
+ space for the page. */
+ ut_ad(!has_done_reservation || block != NULL);
+
+ if (block == NULL) {
+ fsp_free_seg_inode(space, inode, iblock, mtr);
+ goto funct_exit;
+ }
+
+ ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
+ ut_ad(!fil_page_get_type(block->frame));
+ mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+ FIL_PAGE_TYPE_SYS);
+ }
+
+ mtr->write<2>(*block, byte_offset + FSEG_HDR_OFFSET
+ + block->frame, page_offset(inode));
+
+ mtr->write<4>(*block, byte_offset + FSEG_HDR_PAGE_NO
+ + block->frame, iblock->page.id().page_no());
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, byte_offset + FSEG_HDR_SPACE
+ + block->frame, space->id);
+
+funct_exit:
+ if (!has_done_reservation) {
+ space->release_free_extents(n_reserved);
+ }
+
+ DBUG_RETURN(block);
+}
+
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used.
+@return number of reserved pages */
+static
+ulint
+fseg_n_reserved_pages_low(
+/*======================*/
+ const fseg_inode_t* inode, /*!< in: segment inode */
+ ulint* used) /*!< out: number of pages used (not
+ more than reserved) */
+{
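+	/* Pages in FSEG_FULL extents are all in use, FSEG_NOT_FULL_N_USED
+	counts the used pages in the FSEG_NOT_FULL extents, and fragment
+	pages are allocated individually and always in use. */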
+ *used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL)
+ + fseg_get_n_frag_pages(inode);
+
+ return fseg_get_n_frag_pages(inode)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL);
+}
+
+/** Calculate the number of pages reserved by a segment,
+and how many pages are currently used.
+@param[in] block buffer block containing the file segment header
+@param[in] header file segment header
+@param[out] used number of pages that are used (not more than reserved)
+@param[in,out] mtr mini-transaction
+@return number of reserved pages */
+ulint fseg_n_reserved_pages(const buf_block_t &block,
+ const fseg_header_t *header, ulint *used,
+ mtr_t *mtr)
+{
+ ut_ad(page_align(header) == block.frame);
+ return fseg_n_reserved_pages_low(fseg_inode_get(header,
+ block.page.id().space(),
+ block.zip_size(), mtr),
+ used);
+}
+
+/** Tries to fill the free list of a segment with consecutive free extents.
+This happens if the segment is big enough to allow extents in the free list,
+the free list is empty, and the extents can be allocated consecutively from
+the hint onward.
+@param[in,out] inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] space tablespace
+@param[in] hint hint which extent would be good as the first extent
+@param[in,out] mtr mini-transaction */
+static
+void
+fseg_fill_free_list(
+ fseg_inode_t* inode,
+ buf_block_t* iblock,
+ fil_space_t* space,
+ uint32_t hint,
+ mtr_t* mtr)
+{
+ xdes_t* descr;
+ ulint i;
+ ib_id_t seg_id;
+ ulint reserved;
+ ulint used;
+
+ ut_ad(inode && mtr);
+ ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_d(space->modify_check(*mtr));
+
+ reserved = fseg_n_reserved_pages_low(inode, &used);
+
+ if (reserved < FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE) {
+
+ /* The segment is too small to allow extents in free list */
+
+ return;
+ }
+
+ if (flst_get_len(inode + FSEG_FREE) > 0) {
+ /* Free list is not empty */
+
+ return;
+ }
+
+ for (i = 0; i < FSEG_FREE_LIST_MAX_LEN; i++) {
+ buf_block_t* xdes;
+ descr = xdes_get_descriptor(space, hint, &xdes, mtr);
+
+ if (!descr || (XDES_FREE != xdes_get_state(descr))) {
+ /* We cannot allocate the desired extent: stop */
+ return;
+ }
+
+ descr = fsp_alloc_free_extent(space, hint, &xdes, mtr);
+
+ xdes_set_state(*xdes, descr, XDES_FSEG, mtr);
+
+ seg_id = mach_read_from_8(inode + FSEG_ID);
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ mtr->write<8>(*xdes, descr + XDES_ID, seg_id);
+
+ flst_add_last(iblock,
+ static_cast<uint16_t>(inode - iblock->frame
+ + FSEG_FREE), xdes,
+ static_cast<uint16_t>(descr - xdes->frame
+ + XDES_FLST_NODE), mtr);
+ hint += FSP_EXTENT_SIZE;
+ }
+}
+
+/** Allocates a free extent for the segment: looks first in the free list of
+the segment, then tries to allocate from the space free list.
+NOTE that the extent returned still resides in the segment free list, it is
+not yet taken off it!
+@param[in,out] inode segment inode
+@param[in,out] iblock segment inode page
+@param[out] xdes extent descriptor page
+@param[in,out] space tablespace
+@param[in,out] mtr mini-transaction
+@retval NULL if no extent could be allocated */
+static
+xdes_t*
+fseg_alloc_free_extent(
+ fseg_inode_t* inode,
+ buf_block_t* iblock,
+ buf_block_t** xdes,
+ fil_space_t* space,
+ mtr_t* mtr)
+{
+ xdes_t* descr;
+ ib_id_t seg_id;
+ fil_addr_t first;
+
+ ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ ut_d(space->modify_check(*mtr));
+
+ if (flst_get_len(inode + FSEG_FREE) > 0) {
+ /* Segment free list is not empty, allocate from it */
+
+ first = flst_get_first(inode + FSEG_FREE);
+
+ descr = xdes_lst_get_descriptor(space, first, xdes, mtr);
+ } else {
+ /* Segment free list was empty, allocate from space */
+ descr = fsp_alloc_free_extent(space, 0, xdes, mtr);
+
+ if (descr == NULL) {
+
+ return(NULL);
+ }
+
+ seg_id = mach_read_from_8(inode + FSEG_ID);
+
+ xdes_set_state(**xdes, descr, XDES_FSEG, mtr);
+ mtr->write<8,mtr_t::MAYBE_NOP>(**xdes, descr + XDES_ID,
+ seg_id);
+ flst_add_last(iblock,
+ static_cast<uint16_t>(inode - iblock->frame
+ + FSEG_FREE), *xdes,
+ static_cast<uint16_t>(descr - (*xdes)->frame
+ + XDES_FLST_NODE), mtr);
+
+ /* Try to fill the segment free list */
+ fseg_fill_free_list(inode, iblock, space,
+ xdes_get_offset(descr) + FSP_EXTENT_SIZE,
+ mtr);
+ }
+
+ return(descr);
+}
+
+/** Allocates a single free page from a segment.
+This function implements the intelligent allocation strategy which tries to
+minimize file space fragmentation.
+@param[in,out] space tablespace
+@param[in,out] seg_inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] hint hint of which page would be desirable
+@param[in] direction if the new page is needed because of
+an index page split, and records are inserted there in order, into which
+direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR
+@param[in,out] mtr mini-transaction
+@param[in,out] init_mtr mtr or another mini-transaction in
+which the page should be initialized.
+@retval NULL if no page could be allocated */
+static
+buf_block_t*
+fseg_alloc_free_page_low(
+ fil_space_t* space,
+ fseg_inode_t* seg_inode,
+ buf_block_t* iblock,
+ uint32_t hint,
+ byte direction,
+#ifdef UNIV_DEBUG
+ bool has_done_reservation,
+ /*!< whether the space has already been reserved */
+#endif /* UNIV_DEBUG */
+ mtr_t* mtr,
+ mtr_t* init_mtr)
+{
+ ib_id_t seg_id;
+ ulint used;
+ ulint reserved;
+ xdes_t* descr; /*!< extent of the hinted page */
+ uint32_t ret_page; /*!< the allocated page offset, FIL_NULL
+					if it could not be allocated */
+ xdes_t* ret_descr; /*!< the extent of the allocated page */
+ buf_block_t* xdes;
+ ulint n;
+ const ulint space_id = space->id;
+
+ ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR));
+ ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ seg_id = mach_read_from_8(seg_inode + FSEG_ID);
+
+ ut_ad(seg_id);
+ ut_d(space->modify_check(*mtr));
+ ut_ad(fil_page_get_type(page_align(seg_inode)) == FIL_PAGE_INODE);
+
+ reserved = fseg_n_reserved_pages_low(seg_inode, &used);
+
+ buf_block_t* header = fsp_get_header(space, mtr);
+
+ descr = xdes_get_descriptor_with_space_hdr(header, space, hint,
+ &xdes, mtr);
+ if (descr == NULL) {
+ /* Hint outside space or too high above free limit: reset
+ hint */
+ /* The file space header page is always allocated. */
+ hint = 0;
+ descr = xdes_get_descriptor(space, hint, &xdes, mtr);
+ }
+
+ /* In the big if-else below we look for ret_page and ret_descr */
+ /*-------------------------------------------------------------*/
+ if ((xdes_get_state(descr) == XDES_FSEG)
+ && mach_read_from_8(descr + XDES_ID) == seg_id
+ && xdes_is_free(descr, hint % FSP_EXTENT_SIZE)) {
+take_hinted_page:
+ /* 1. We can take the hinted page
+ =================================*/
+ ret_descr = descr;
+ ret_page = hint;
+ /* Skip the check for extending the tablespace. If the
+ page hint were not within the size of the tablespace,
+ we would have got (descr == NULL) above and reset the hint. */
+ goto got_hinted_page;
+ /*-----------------------------------------------------------*/
+ } else if (xdes_get_state(descr) == XDES_FREE
+ && reserved - used < reserved / FSEG_FILLFACTOR
+ && used >= FSEG_FRAG_LIMIT) {
+
+ /* 2. We allocate the free extent from space and can take
+ =========================================================
+ the hinted page
+ ===============*/
+ ret_descr = fsp_alloc_free_extent(space, hint, &xdes, mtr);
+
+ ut_a(ret_descr == descr);
+
+ xdes_set_state(*xdes, ret_descr, XDES_FSEG, mtr);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*xdes, ret_descr + XDES_ID,
+ seg_id);
+ flst_add_last(iblock,
+ static_cast<uint16_t>(seg_inode - iblock->frame
+ + FSEG_FREE), xdes,
+ static_cast<uint16_t>(ret_descr - xdes->frame
+ + XDES_FLST_NODE), mtr);
+
+ /* Try to fill the segment free list */
+ fseg_fill_free_list(seg_inode, iblock, space,
+ hint + FSP_EXTENT_SIZE, mtr);
+ goto take_hinted_page;
+ /*-----------------------------------------------------------*/
+ } else if ((direction != FSP_NO_DIR)
+ && ((reserved - used) < reserved / FSEG_FILLFACTOR)
+ && (used >= FSEG_FRAG_LIMIT)
+ && !!(ret_descr = fseg_alloc_free_extent(seg_inode, iblock,
+ &xdes, space,
+ mtr))) {
+ /* 3. We take any free extent (which was already assigned above
+ ===============================================================
+ in the if-condition to ret_descr) and take the lowest or
+ ========================================================
+ highest page in it, depending on the direction
+ ==============================================*/
+ ret_page = xdes_get_offset(ret_descr);
+
+ if (direction == FSP_DOWN) {
+ ret_page += FSP_EXTENT_SIZE - 1;
+ }
+ ut_ad(!has_done_reservation || ret_page != FIL_NULL);
+ /*-----------------------------------------------------------*/
+ } else if ((xdes_get_state(descr) == XDES_FSEG)
+ && mach_read_from_8(descr + XDES_ID) == seg_id
+ && (!xdes_is_full(descr))) {
+
+ /* 4. We can take the page from the same extent as the
+ ======================================================
+ hinted page (and the extent already belongs to the
+ ==================================================
+ segment)
+ ========*/
+ ret_descr = descr;
+ ret_page = xdes_find_free(ret_descr, hint % FSP_EXTENT_SIZE);
+ if (ret_page == FIL_NULL) {
+ ut_ad(!has_done_reservation);
+ } else {
+ ret_page += xdes_get_offset(ret_descr);
+ }
+ /*-----------------------------------------------------------*/
+ } else if (reserved - used > 0) {
+ /* 5. We take any unused page from the segment
+ ==============================================*/
+ fil_addr_t first;
+
+ if (flst_get_len(seg_inode + FSEG_NOT_FULL) > 0) {
+ first = flst_get_first(seg_inode + FSEG_NOT_FULL);
+ } else if (flst_get_len(seg_inode + FSEG_FREE) > 0) {
+ first = flst_get_first(seg_inode + FSEG_FREE);
+ } else {
+ ut_ad(!has_done_reservation);
+ return(NULL);
+ }
+
+ ret_descr = xdes_lst_get_descriptor(space, first, &xdes, mtr);
+ ret_page = xdes_find_free(ret_descr);
+ if (ret_page == FIL_NULL) {
+ ut_ad(!has_done_reservation);
+ } else {
+ ret_page += xdes_get_offset(ret_descr);
+ }
+ /*-----------------------------------------------------------*/
+ } else if (used < FSEG_FRAG_LIMIT) {
+ /* 6. We allocate an individual page from the space
+ ===================================================*/
+ buf_block_t* block = fsp_alloc_free_page(
+ space, hint, mtr, init_mtr);
+
+ ut_ad(!has_done_reservation || block);
+
+ if (block) {
+ /* Put the page in the fragment page array of the
+ segment */
+ n = fseg_find_free_frag_page_slot(seg_inode);
+ ut_a(n != ULINT_UNDEFINED);
+
+ fseg_set_nth_frag_page_no(
+ seg_inode, iblock, n,
+ block->page.id().page_no(), mtr);
+ }
+
+ /* fsp_alloc_free_page() invoked fsp_init_file_page()
+ already. */
+ return(block);
+ /*-----------------------------------------------------------*/
+ } else {
+ /* 7. We allocate a new extent and take its first page
+ ======================================================*/
+ ret_descr = fseg_alloc_free_extent(seg_inode, iblock, &xdes,
+ space, mtr);
+
+ if (ret_descr == NULL) {
+ ret_page = FIL_NULL;
+ ut_ad(!has_done_reservation);
+ } else {
+ ret_page = xdes_get_offset(ret_descr);
+ ut_ad(!has_done_reservation || ret_page != FIL_NULL);
+ }
+ }
+
+ if (ret_page == FIL_NULL) {
+ /* Page could not be allocated */
+
+ ut_ad(!has_done_reservation);
+ return(NULL);
+ }
+
+ if (space->size <= ret_page && !is_system_tablespace(space_id)) {
+ /* It must be that we are extending a single-table
+ tablespace whose size is still < 64 pages */
+
+ if (ret_page >= FSP_EXTENT_SIZE) {
+			ib::error() << "Error (2): trying to extend"
+				" a single-table tablespace " << space_id
+				<< " by single page(s) though the"
+				<< " space size is " << space->size
+				<< ". Page no " << ret_page << ".";
+ ut_ad(!has_done_reservation);
+ return(NULL);
+ }
+
+ if (!fsp_try_extend_data_file_with_pages(
+ space, ret_page, header, mtr)) {
+ /* No disk space left */
+ ut_ad(!has_done_reservation);
+ return(NULL);
+ }
+ }
+
+got_hinted_page:
+ /* ret_descr == NULL if the block was allocated from free_frag
+ (XDES_FREE_FRAG) */
+ if (ret_descr != NULL) {
+ /* At this point we know the extent and the page offset.
+ The extent is still in the appropriate list (FSEG_NOT_FULL
+ or FSEG_FREE), and the page is not yet marked as used. */
+
+ ut_d(buf_block_t* xxdes);
+ ut_ad(xdes_get_descriptor(space, ret_page, &xxdes, mtr)
+ == ret_descr);
+ ut_ad(xdes == xxdes);
+ ut_ad(xdes_is_free(ret_descr, ret_page % FSP_EXTENT_SIZE));
+
+ fseg_mark_page_used(seg_inode, iblock, ret_page, ret_descr,
+ xdes, mtr);
+ }
+
+ return fsp_page_create(space, ret_page, init_mtr);
+}
+
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@retval NULL if no page could be allocated */
+buf_block_t*
+fseg_alloc_free_page_general(
+/*=========================*/
+ fseg_header_t* seg_header,/*!< in/out: segment header */
+ uint32_t hint, /*!< in: hint of which page would be
+ desirable */
+ byte direction,/*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ bool has_done_reservation, /*!< in: true if the caller has
+ already done the reservation for the page
+ with fsp_reserve_free_extents, then there
+ is no need to do the check for this individual
+ page */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction
+ in which the page should be initialized. */
+{
+ fseg_inode_t* inode;
+ ulint space_id;
+ fil_space_t* space;
+ buf_block_t* iblock;
+ buf_block_t* block;
+ uint32_t n_reserved;
+
+ space_id = page_get_space_id(page_align(seg_header));
+ space = mtr_x_lock_space(space_id, mtr);
+ inode = fseg_inode_get(seg_header, space_id, space->zip_size(),
+ mtr, &iblock);
+ if (!space->full_crc32()) {
+ fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
+ }
+
+ if (!has_done_reservation
+ && !fsp_reserve_free_extents(&n_reserved, space, 2,
+ FSP_NORMAL, mtr)) {
+ return(NULL);
+ }
+
+ block = fseg_alloc_free_page_low(space,
+ inode, iblock, hint, direction,
+#ifdef UNIV_DEBUG
+ has_done_reservation,
+#endif /* UNIV_DEBUG */
+ mtr, init_mtr);
+
+ /* The allocation cannot fail if we have already reserved a
+ space for the page. */
+ ut_ad(!has_done_reservation || block != NULL);
+
+ if (!has_done_reservation) {
+ space->release_free_extents(n_reserved);
+ }
+
+ return(block);
+}
+
+/** Check that we have at least n_pages frag pages free in the first extent
+of a single-table tablespace, and that they are also physically initialized
+in the data file; that is, that we have already extended the data file so
+that those pages are inside it. If not, this function extends the tablespace
+with pages.
+@param[in,out] space tablespace
+@param[in,out] header tablespace header, x-latched
+@param[in] size tablespace size in pages, less than FSP_EXTENT_SIZE
+@param[in,out] mtr mini-transaction
+@param[in] n_pages number of pages to reserve
+@return true if there were at least n_pages free pages, or we were able
+to extend */
+static
+bool
+fsp_reserve_free_pages(
+ fil_space_t* space,
+ buf_block_t* header,
+ ulint size,
+ mtr_t* mtr,
+ uint32_t n_pages)
+{
+ xdes_t* descr;
+
+ ut_a(!is_system_tablespace(space->id));
+ ut_a(size < FSP_EXTENT_SIZE);
+
+ buf_block_t* xdes;
+ descr = xdes_get_descriptor_with_space_hdr(header, space, 0, &xdes,
+ mtr);
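+	/* All pages of a tablespace that is smaller than an extent reside
+	in the first (incomplete) extent, so the n_used count of its
+	descriptor equals the number of pages in use in the tablespace. */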
+ uint32_t n_used = xdes_get_n_used(descr);
+
+ ut_a(n_used <= size);
+
+ return(size >= n_used + n_pages
+ || fsp_try_extend_data_file_with_pages(
+ space, n_used + n_pages - 1, header, mtr));
+}
+
+/** Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_t::release_free_extents()!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid a dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < FSP_EXTENT_SIZE pages are a special
+case. In this function we would liberally reserve several extents for
+every page split or merge in a B-tree. But we do not want to waste disk space
+if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply
+different rules in that special case, just ensuring that there are n_pages
+free pages available.
+
+@param[out] n_reserved number of extents actually reserved; if we
+ return true and the tablespace size is <
+ FSP_EXTENT_SIZE pages, then this can be 0,
+ otherwise it is n_ext
+@param[in,out] space tablespace
+@param[in] n_ext number of extents to reserve
+@param[in] alloc_type page reservation type (FSP_BLOB, etc)
+@param[in,out]	mtr	the mini-transaction
+@param[in] n_pages for small tablespaces (tablespace size is
+ less than FSP_EXTENT_SIZE), number of free
+ pages to reserve.
+@return true if we were able to make the reservation */
+bool
+fsp_reserve_free_extents(
+ uint32_t* n_reserved,
+ fil_space_t* space,
+ uint32_t n_ext,
+ fsp_reserve_t alloc_type,
+ mtr_t* mtr,
+ uint32_t n_pages)
+{
+ ulint reserve;
+
+ ut_ad(mtr);
+ *n_reserved = n_ext;
+
+ const uint32_t extent_size = FSP_EXTENT_SIZE;
+
+ mtr_x_lock_space(space, mtr);
+ const unsigned physical_size = space->physical_size();
+
+ buf_block_t* header = fsp_get_header(space, mtr);
+try_again:
+ uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+ + header->frame);
+ ut_ad(size == space->size_in_header);
+
+ if (size < extent_size && n_pages < extent_size / 2) {
+ /* Use different rules for small single-table tablespaces */
+ *n_reserved = 0;
+ return(fsp_reserve_free_pages(space, header, size,
+ mtr, n_pages));
+ }
+
+ uint32_t n_free_list_ext = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE
+ + header->frame);
+ ut_ad(space->free_len == n_free_list_ext);
+
+ uint32_t free_limit = mach_read_from_4(FSP_HEADER_OFFSET
+ + FSP_FREE_LIMIT
+ + header->frame);
+ ut_ad(space->free_limit == free_limit);
+
+ /* Below we play safe when counting free extents above the free limit:
+ some of them will contain extent descriptor pages, and therefore
+ will not be free extents */
+
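+	/* Each extent descriptor page describes physical_size / extent_size
+	extents, so roughly one in that many extents above the free limit
+	will hold a descriptor page; these are discounted below. */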
+ uint32_t n_free_up;
+
+ if (size >= free_limit) {
+ n_free_up = (size - free_limit) / extent_size;
+ if (n_free_up) {
+ n_free_up--;
+ n_free_up -= n_free_up / (physical_size / extent_size);
+ }
+ } else {
+ ut_ad(alloc_type == FSP_BLOB);
+ n_free_up = 0;
+ }
+
+ uint32_t n_free = n_free_list_ext + n_free_up;
+
+ switch (alloc_type) {
+ case FSP_NORMAL:
+ /* We reserve 1 extent + 0.5 % of the space size to undo logs
+ and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+ code is duplicated in the function below! */
+
+ reserve = 2 + ((size / extent_size) * 2) / 200;
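+		/* Example: with 64-page extents, a tablespace of 1000
+		extents reserves 2 + (1000 * 2) / 200 = 12 extents here. */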
+
+ if (n_free <= reserve + n_ext) {
+
+ goto try_to_extend;
+ }
+ break;
+ case FSP_UNDO:
+ /* We reserve 0.5 % of the space size to cleaning operations */
+
+ reserve = 1 + ((size / extent_size) * 1) / 200;
+
+ if (n_free <= reserve + n_ext) {
+
+ goto try_to_extend;
+ }
+ break;
+ case FSP_CLEANING:
+ case FSP_BLOB:
+ reserve = 0;
+ break;
+ default:
+ ut_error;
+ }
+
+ if (space->reserve_free_extents(n_free, n_ext)) {
+ return(true);
+ }
+try_to_extend:
+ if (fsp_try_extend_data_file(space, header, mtr)) {
+ goto try_again;
+ }
+
+ return(false);
+}
+
+/** Frees a single page of a segment.
+@param[in]	seg_inode	segment inode
+@param[in,out]	iblock		segment inode page
+@param[in,out] space tablespace
+@param[in] offset page number
+@param[in,out] mtr mini-transaction */
+static
+void
+fseg_free_page_low(
+ fseg_inode_t* seg_inode,
+ buf_block_t* iblock,
+ fil_space_t* space,
+ page_no_t offset,
+ mtr_t* mtr)
+{
+ ib_id_t descr_id;
+ ib_id_t seg_id;
+
+ ut_ad(seg_inode != NULL);
+ ut_ad(mtr != NULL);
+ ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_ad(iblock->frame == page_align(seg_inode));
+ ut_d(space->modify_check(*mtr));
+
+ const uint32_t extent_size = FSP_EXTENT_SIZE;
+ ut_ad(ut_is_2pow(extent_size));
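+	/* Since extent_size is a power of two, offset & (extent_size - 1)
+	below is equivalent to offset % extent_size. */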
+ buf_block_t* xdes;
+ xdes_t* descr = xdes_get_descriptor(space, offset, &xdes, mtr);
+
+ if (xdes_is_free(descr, offset & (extent_size - 1))) {
+ ib::fatal() << "InnoDB is trying to free page "
+ << page_id_t(space->id, offset)
+ << " though it is already marked as free in the"
+ " tablespace! The tablespace free space info is"
+ " corrupt. You may need to dump your tables and"
+ " recreate the whole database!"
+ << FORCE_RECOVERY_MSG;
+ }
+
+ if (xdes_get_state(descr) != XDES_FSEG) {
+ /* The page is in the fragment pages of the segment */
+ for (ulint i = 0;; i++) {
+ if (fseg_get_nth_frag_page_no(seg_inode, i)
+ != offset) {
+ continue;
+ }
+
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ mtr->memset(iblock, uint16_t(seg_inode - iblock->frame)
+ + FSEG_FRAG_ARR
+ + i * FSEG_FRAG_SLOT_SIZE, 4, 0xff);
+ break;
+ }
+
+ fsp_free_page(space, offset, mtr);
+ return;
+ }
+
+ /* If we get here, the page is in some extent of the segment */
+
+ descr_id = mach_read_from_8(descr + XDES_ID);
+ seg_id = mach_read_from_8(seg_inode + FSEG_ID);
+
+ if (UNIV_UNLIKELY(descr_id != seg_id)) {
+ fputs("InnoDB: Dump of the tablespace extent descriptor: ",
+ stderr);
+ ut_print_buf(stderr, descr, 40);
+ fputs("\nInnoDB: Dump of the segment inode: ", stderr);
+ ut_print_buf(stderr, seg_inode, 40);
+ putc('\n', stderr);
+
+ ib::fatal() << "InnoDB is trying to free page "
+ << page_id_t(space->id, offset)
+ << ", which does not belong to segment " << descr_id
+ << " but belongs to segment " << seg_id << "."
+ << FORCE_RECOVERY_MSG;
+ }
+
+ byte* p_not_full = seg_inode + FSEG_NOT_FULL_N_USED;
+ uint32_t not_full_n_used = mach_read_from_4(p_not_full);
+ const uint16_t xoffset= uint16_t(descr - xdes->frame + XDES_FLST_NODE);
+ const uint16_t ioffset= uint16_t(seg_inode - iblock->frame);
+
+ if (xdes_is_full(descr)) {
+ /* The fragment is full: move it to another list */
+ flst_remove(iblock, static_cast<uint16_t>(FSEG_FULL + ioffset),
+ xdes, xoffset, mtr);
+ flst_add_last(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
+ + ioffset),
+ xdes, xoffset, mtr);
+ not_full_n_used += extent_size - 1;
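+		/* The extent moves to FSEG_NOT_FULL with all of its pages
+		still marked as used; the page freed below leaves
+		extent_size - 1 of them counted in FSEG_NOT_FULL_N_USED. */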
+ } else {
+ ut_a(not_full_n_used > 0);
+ not_full_n_used--;
+ }
+
+ mtr->write<4>(*iblock, p_not_full, not_full_n_used);
+
+ const ulint bit = offset & (extent_size - 1);
+
+ xdes_set_free<true>(*xdes, descr, bit, mtr);
+
+ if (!xdes_get_n_used(descr)) {
+ /* The extent has become free: free it to space */
+ flst_remove(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
+ + ioffset),
+ xdes, xoffset, mtr);
+ fsp_free_extent(space, offset, mtr);
+ }
+
+ mtr->free(*space, static_cast<uint32_t>(offset));
+}
+
+/** Free a page in a file segment.
+@param[in,out] seg_header file segment header
+@param[in,out] space tablespace
+@param[in] offset page number
+@param[in,out] mtr mini-transaction */
+void
+fseg_free_page(
+ fseg_header_t* seg_header,
+ fil_space_t* space,
+ uint32_t offset,
+ mtr_t* mtr)
+{
+ DBUG_ENTER("fseg_free_page");
+ fseg_inode_t* seg_inode;
+ buf_block_t* iblock;
+ mtr_x_lock_space(space, mtr);
+
+ DBUG_LOG("fseg_free_page", "space_id: " << space->id
+ << ", page_no: " << offset);
+
+ seg_inode = fseg_inode_get(seg_header, space->id, space->zip_size(),
+ mtr,
+ &iblock);
+ if (!space->full_crc32()) {
+ fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
+ }
+
+ fseg_free_page_low(seg_inode, iblock, space, offset, mtr);
+
+ DBUG_VOID_RETURN;
+}
+
+/** Determine whether a page is free.
+@param[in,out] space tablespace
+@param[in] page page number
+@return whether the page is marked as free */
+bool
+fseg_page_is_free(fil_space_t* space, unsigned page)
+{
+ bool is_free;
+ mtr_t mtr;
+ page_no_t dpage = xdes_calc_descriptor_page(space->zip_size(),
+ page);
+
+ mtr.start();
+ mtr_sx_lock_space(space, &mtr);
+
+ if (page >= space->free_limit || page >= space->size_in_header) {
+ is_free = true;
+ } else if (const xdes_t* descr = xdes_get_descriptor_const(
+ space, dpage, page, &mtr)) {
+ is_free = xdes_is_free(descr, page % FSP_EXTENT_SIZE);
+ } else {
+ is_free = true;
+ }
+ mtr.commit();
+
+ return(is_free);
+}
+
+/** Free an extent of a segment to the space free list.
+@param[in,out]	seg_inode	segment inode
+@param[in,out]	iblock		segment inode page
+@param[in,out] space tablespace
+@param[in] page page number in the extent
+@param[in,out] mtr mini-transaction */
+MY_ATTRIBUTE((nonnull))
+static
+void
+fseg_free_extent(
+ fseg_inode_t* seg_inode,
+ buf_block_t* iblock,
+ fil_space_t* space,
+ uint32_t page,
+ mtr_t* mtr)
+{
+
+ ut_ad(mtr != NULL);
+
+ buf_block_t* xdes;
+ xdes_t* descr = xdes_get_descriptor(space, page, &xdes, mtr);
+
+ ut_a(xdes_get_state(descr) == XDES_FSEG);
+ ut_a(!memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8));
+ ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ ut_d(space->modify_check(*mtr));
+ const uint32_t first_page_in_extent = page - (page % FSP_EXTENT_SIZE);
+
+ const uint16_t xoffset= uint16_t(descr - xdes->frame + XDES_FLST_NODE);
+ const uint16_t ioffset= uint16_t(seg_inode - iblock->frame);
+
+ if (xdes_is_full(descr)) {
+ flst_remove(iblock, static_cast<uint16_t>(FSEG_FULL + ioffset),
+ xdes, xoffset, mtr);
+ } else if (!xdes_get_n_used(descr)) {
+ flst_remove(iblock, static_cast<uint16_t>(FSEG_FREE + ioffset),
+ xdes, xoffset, mtr);
+ } else {
+ flst_remove(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
+ + ioffset),
+ xdes, xoffset, mtr);
+ uint32_t not_full_n_used = mach_read_from_4(
+ FSEG_NOT_FULL_N_USED + seg_inode);
+ uint32_t descr_n_used = xdes_get_n_used(descr);
+ ut_a(not_full_n_used >= descr_n_used);
+ mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
+ not_full_n_used - descr_n_used);
+ }
+
+ fsp_free_extent(space, page, mtr);
+
+ for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) {
+ if (!xdes_is_free(descr, i)) {
+ buf_page_free(space, first_page_in_extent + i, mtr,
+ __FILE__, __LINE__);
+ }
+ }
+}
+
+/**********************************************************************//**
+Frees part of a segment. This function can be used to free a segment by
+repeatedly calling this function in different mini-transactions. Doing
+the freeing in a single mini-transaction might result in too big a
+mini-transaction.
+@return whether the freeing was completed */
+bool
+fseg_free_step(
+ fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header
+ resides on the first page of the frag list
+ of the segment, this pointer becomes obsolete
+ after the last freeing step */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint n;
+ fseg_inode_t* inode;
+
+ DBUG_ENTER("fseg_free_step");
+
+ const uint32_t space_id = page_get_space_id(page_align(header));
+ const uint32_t header_page = page_get_page_no(page_align(header));
+
+ fil_space_t* space = mtr_x_lock_space(space_id, mtr);
+ buf_block_t* xdes;
+ xdes_t* descr = xdes_get_descriptor(space, header_page, &xdes, mtr);
+
+ /* Check that the header resides on a page which has not been
+ freed yet */
+
+ ut_a(!xdes_is_free(descr, header_page % FSP_EXTENT_SIZE));
+ buf_block_t* iblock;
+ const ulint zip_size = space->zip_size();
+ inode = fseg_inode_try_get(header, space_id, zip_size, mtr, &iblock);
+
+ if (inode == NULL) {
+ ib::info() << "Double free of inode from "
+ << page_id_t(space_id, header_page);
+ DBUG_RETURN(true);
+ }
+
+ if (!space->full_crc32()) {
+ fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
+ }
+ descr = fseg_get_first_extent(inode, space, mtr);
+
+ if (descr != NULL) {
+ /* Free the extent held by the segment */
+ fseg_free_extent(inode, iblock, space, xdes_get_offset(descr),
+ mtr);
+ DBUG_RETURN(false);
+ }
+
+ /* Free a frag page */
+ n = fseg_find_last_used_frag_page_slot(inode);
+
+ if (n == ULINT_UNDEFINED) {
+ /* Freeing completed: free the segment inode */
+ fsp_free_seg_inode(space, inode, iblock, mtr);
+
+ DBUG_RETURN(true);
+ }
+
+ page_no_t page_no = fseg_get_nth_frag_page_no(inode, n);
+
+ fseg_free_page_low(inode, iblock, space, page_no, mtr);
+
+ buf_page_free(space, page_no, mtr, __FILE__, __LINE__);
+
+ n = fseg_find_last_used_frag_page_slot(inode);
+
+ if (n == ULINT_UNDEFINED) {
+ /* Freeing completed: free the segment inode */
+ fsp_free_seg_inode(space, inode, iblock, mtr);
+
+ DBUG_RETURN(true);
+ }
+
+ DBUG_RETURN(false);
+}
+
+/**********************************************************************//**
+Frees part of a segment. Differs from fseg_free_step because this function
+leaves the header page unfreed.
+@return whether the freeing was completed, except for the header page */
+bool
+fseg_free_step_not_header(
+ fseg_header_t* header, /*!< in: segment header which must reside on
+ the first fragment page of the segment */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint n;
+ xdes_t* descr;
+ fseg_inode_t* inode;
+
+ const uint32_t space_id = page_get_space_id(page_align(header));
+ ut_ad(mtr->is_named_space(space_id));
+
+ fil_space_t* space = mtr_x_lock_space(space_id, mtr);
+ buf_block_t* iblock;
+
+ inode = fseg_inode_get(header, space_id, space->zip_size(), mtr,
+ &iblock);
+ if (!space->full_crc32()) {
+ fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
+ }
+
+ descr = fseg_get_first_extent(inode, space, mtr);
+
+ if (descr != NULL) {
+ /* Free the extent held by the segment */
+ fseg_free_extent(inode, iblock, space, xdes_get_offset(descr),
+ mtr);
+ return false;
+ }
+
+ /* Free a frag page */
+
+ n = fseg_find_last_used_frag_page_slot(inode);
+
+ ut_a(n != ULINT_UNDEFINED);
+
+ uint32_t page_no = fseg_get_nth_frag_page_no(inode, n);
+
+ if (page_no == page_get_page_no(page_align(header))) {
+ return true;
+ }
+
+ fseg_free_page_low(inode, iblock, space, page_no, mtr);
+ buf_page_free(space, page_no, mtr, __FILE__, __LINE__);
+ return false;
+}
+
+/** Returns the first extent descriptor for a segment.
+We think of the extent lists of the segment as concatenated in the order
+FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE.
+@param[in] inode segment inode
+@param[in] space tablespace
+@param[in,out] mtr mini-transaction
+@return the first extent descriptor, or NULL if none */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+static
+xdes_t*
+fseg_get_first_extent(
+ fseg_inode_t* inode,
+ const fil_space_t* space,
+ mtr_t* mtr)
+{
+ fil_addr_t first;
+
+ ut_ad(space->id == page_get_space_id(page_align(inode)));
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+ if (flst_get_len(inode + FSEG_FULL) > 0) {
+ first = flst_get_first(inode + FSEG_FULL);
+ } else if (flst_get_len(inode + FSEG_NOT_FULL) > 0) {
+ first = flst_get_first(inode + FSEG_NOT_FULL);
+ } else if (flst_get_len(inode + FSEG_FREE) > 0) {
+ first = flst_get_first(inode + FSEG_FREE);
+ } else {
+ return(NULL);
+ }
+
+ DBUG_ASSERT(first.page != FIL_NULL);
+
+ buf_block_t *xdes;
+
+ return(first.page == FIL_NULL ? NULL
+ : xdes_lst_get_descriptor(space, first, &xdes, mtr));
+}
+
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment. */
+static void fseg_print_low(const fseg_inode_t *inode)
+{
+ ulint space;
+ ulint n_used;
+ ulint n_frag;
+ ulint n_free;
+ ulint n_not_full;
+ ulint n_full;
+ ulint reserved;
+ ulint used;
+ ulint page_no;
+ ib_id_t seg_id;
+
+ space = page_get_space_id(page_align(inode));
+ page_no = page_get_page_no(page_align(inode));
+
+ reserved = fseg_n_reserved_pages_low(inode, &used);
+
+ seg_id = mach_read_from_8(inode + FSEG_ID);
+ n_used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED);
+ n_frag = fseg_get_n_frag_pages(inode);
+ n_free = flst_get_len(inode + FSEG_FREE);
+ n_not_full = flst_get_len(inode + FSEG_NOT_FULL);
+ n_full = flst_get_len(inode + FSEG_FULL);
+
+ ib::info() << "SEGMENT id " << seg_id
+ << " space " << space << ";"
+ << " page " << page_no << ";"
+ << " res " << reserved << " used " << used << ";"
+ << " full ext " << n_full << ";"
+ << " fragm pages " << n_frag << ";"
+ << " free extents " << n_free << ";"
+ << " not full extents " << n_not_full << ": pages " << n_used;
+
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+}
+
+/*******************************************************************//**
+Writes info of a segment. */
+void
+fseg_print(
+/*=======*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fseg_inode_t* inode;
+ ulint space_id;
+
+ space_id = page_get_space_id(page_align(header));
+ const fil_space_t* space = mtr_x_lock_space(space_id, mtr);
+
+ inode = fseg_inode_get(header, space_id, space->zip_size(), mtr);
+
+ fseg_print_low(inode);
+}
+#endif /* UNIV_BTR_PRINT */
+
+#ifdef UNIV_DEBUG
+std::ostream &fseg_header::to_stream(std::ostream &out) const
+{
+ out << "[fseg_header_t: space="
+ << mach_read_from_4(m_header + FSEG_HDR_SPACE)
+ << ", page=" << mach_read_from_4(m_header + FSEG_HDR_PAGE_NO)
+ << ", offset=" << mach_read_from_2(m_header + FSEG_HDR_OFFSET) << "]";
+ return out;
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/fsp/fsp0space.cc b/storage/innobase/fsp/fsp0space.cc
new file mode 100644
index 00000000..b0a80efe
--- /dev/null
+++ b/storage/innobase/fsp/fsp0space.cc
@@ -0,0 +1,230 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fsp/fsp0space.cc
+Shared tablespace implementation.
+
+Created 2012-11-16 by Sunny Bains as srv/srv0space.cc
+*******************************************************/
+
+#include "fsp0sysspace.h"
+#include "fsp0fsp.h"
+#include "os0file.h"
+#include "my_sys.h"
+
+/** Check if two tablespaces have common data file names.
+@param other_space Tablespace to check against this.
+@return true if any data file name occurs in both tablespaces */
+bool
+Tablespace::intersection(
+ const Tablespace* other_space)
+{
+ for (files_t::const_iterator it(other_space->begin()),
+ end(other_space->end()); it != end; ++it) {
+
+ if (find(it->m_filename)) {
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Frees the memory allocated by the Tablespace object. */
+void
+Tablespace::shutdown()
+{
+ for (iterator it = begin(); it != end(); ++it) {
+ it->shutdown();
+ }
+
+ m_files.clear();
+ ut_free(m_path);
+ m_path = NULL;
+ m_space_id = ULINT_UNDEFINED;
+}
+
+/** Note that the data file was found.
+@param[in,out] file Data file object to set */
+void
+Tablespace::file_found(Datafile& file)
+{
+ /* Note that the file exists and can be opened
+ in the appropriate mode. */
+ file.m_exists = true;
+
+ file.set_open_flags(
+ &file == &m_files.front()
+ ? OS_FILE_OPEN_RETRY : OS_FILE_OPEN);
+}
+
+/** Open the data files if they exist; otherwise create them.
+@param[in] is_temp whether this is a temporary tablespace
+@return DB_SUCCESS or error code */
+dberr_t
+Tablespace::open_or_create(bool is_temp)
+{
+ fil_space_t* space = NULL;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(!m_files.empty());
+
+ for (iterator it = begin(); it != end(); ++it) {
+
+ if (it->m_exists) {
+ err = it->open_or_create(
+ m_ignore_read_only
+ ? false : srv_read_only_mode);
+ } else {
+ err = it->open_or_create(
+ m_ignore_read_only
+ ? false : srv_read_only_mode);
+
+ /* Set the correct open flags now that we have
+ successfully created the file. */
+ if (err == DB_SUCCESS) {
+ file_found(*it);
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ /* We can close the handle now and open the tablespace
+ the proper way. */
+ it->close();
+
+ if (it == begin()) {
+ /* First data file. */
+
+ /* Create the tablespace entry for the multi-file
+ tablespace in the tablespace manager. */
+ ulint fsp_flags = 0;
+
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ fsp_flags = (FSP_FLAGS_FCRC32_MASK_MARKER
+ | FSP_FLAGS_FCRC32_PAGE_SSIZE());
+ break;
+ default:
+ fsp_flags = FSP_FLAGS_PAGE_SSIZE();
+ }
+
+ space = fil_space_t::create(
+ m_name, m_space_id, fsp_flags,
+ is_temp
+ ? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE,
+ NULL);
+ if (!space) {
+ return DB_ERROR;
+ }
+ }
+
+ ut_a(fil_validate());
+
+ space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size,
+ false, true);
+ }
+
+ return(err);
+}
+
+/** Find a filename in the list of Datafiles for a tablespace
+@param[in]	filename	filename to look for
+@return true if the filename exists in the data files */
+bool
+Tablespace::find(const char* filename) const
+{
+ for (const_iterator it = begin(); it != end(); ++it) {
+
+ if (innobase_strcasecmp(filename, it->m_filename) == 0) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Delete all the data files. */
+void
+Tablespace::delete_files()
+{
+ for (iterator it = begin(); it != end(); ++it) {
+
+ it->close();
+
+ bool file_pre_exists;
+ bool success = os_file_delete_if_exists(
+ innodb_data_file_key, it->m_filepath, &file_pre_exists);
+
+ if (success && file_pre_exists) {
+ ib::info() << "Removed temporary tablespace data"
+ " file: \"" << it->m_name << "\"";
+ }
+ }
+}
+
+/** Use the ADD DATAFILE path to create a Datafile object and add it to
+m_files.
+Parse the datafile path into a path and a filename with extension 'ibd'.
+The datafile path provided may or may not be an absolute path, but it
+must end with the extension .ibd and have a basename of at least 1 byte.
+
+Set the tablespace m_path member and add a Datafile with the filename.
+@param[in]	datafile_added	full path of the tablespace file
+@return DB_SUCCESS or error code */
+dberr_t
+Tablespace::add_datafile(
+ const char* datafile_added)
+{
+ /* The path provided ends in ".ibd". This was assured by
+ validate_create_tablespace_info() */
+ ut_d(const char* dot = strrchr(datafile_added, '.'));
+ ut_ad(dot != NULL && 0 == strcmp(dot, DOT_IBD));
+
+ char* filepath = mem_strdup(datafile_added);
+ os_normalize_path(filepath);
+
+ /* If the path is an absolute path, separate it onto m_path and a
+ basename. For relative paths, make the whole thing a basename so that
+ it can be appended to the datadir. */
+ bool is_abs_path = is_absolute_path(filepath);
+ size_t dirlen = (is_abs_path ? dirname_length(filepath) : 0);
+ const char* basename = filepath + dirlen;
+
+ /* If the pathname contains a directory separator, fill the
+ m_path member which is the default directory for files in this
+ tablespace. Leave it null otherwise. */
+ if (dirlen > 0) {
+ set_path(filepath, dirlen);
+ }
+
+ /* Now add a new Datafile and set the filepath
+ using the m_path created above. */
+ m_files.push_back(Datafile(m_name, m_flags,
+ FIL_IBD_FILE_INITIAL_SIZE, 0));
+ Datafile* datafile = &m_files.back();
+ datafile->make_filepath(m_path, basename, IBD);
+
+ ut_free(filepath);
+
+ return(DB_SUCCESS);
+}
diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc
new file mode 100644
index 00000000..a2c9e1bc
--- /dev/null
+++ b/storage/innobase/fsp/fsp0sysspace.cc
@@ -0,0 +1,994 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fsp/fsp0sysspace.cc
+Multi file, shared, system tablespace implementation.
+
+Created 2012-11-16 by Sunny Bains as srv/srv0space.cc
+Refactored 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#include "fsp0sysspace.h"
+#include "srv0start.h"
+#include "trx0sys.h"
+#include "dict0load.h"
+#include "mem0mem.h"
+#include "os0file.h"
+#include "row0mysql.h"
+#include "buf0dblwr.h"
+
+/** The server header file is included to access the opt_initialize global
+variable. If the server ever passes the create/open-DB option down to the
+storage engine, this direct reference to a server header and global variable
+should be removed. */
+#include "mysqld.h"
+
+/** The control info of the system tablespace. */
+SysTablespace srv_sys_space;
+
+/** The control info of a temporary table shared tablespace. */
+SysTablespace srv_tmp_space;
+
+/** If the last data file is auto-extended, we add this many pages to it
+at a time. We have to make this public because it is a config variable. */
+uint sys_tablespace_auto_extend_increment;
+
+/** Convert a numeric string that optionally ends in G, M or K
+to a number of megabytes.
+@param[in]	ptr	string with a quantity, optionally ending in G, M or K
+@param[out]	megs	the number in megabytes
+@return next character in string */
+char*
+SysTablespace::parse_units(
+ char* ptr,
+ ulint* megs)
+{
+ char* endp;
+
+ *megs = strtoul(ptr, &endp, 10);
+
+ ptr = endp;
+
+ switch (*ptr) {
+ case 'G': case 'g':
+ *megs *= 1024;
+ /* fall through */
+ case 'M': case 'm':
+ ++ptr;
+ break;
+ case 'K': case 'k':
+ *megs /= 1024;
+ ++ptr;
+ break;
+ default:
+ *megs /= 1024 * 1024;
+ break;
+ }
+
+ return(ptr);
+}
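+
+/* Editor's note, illustrative only (not part of the original source):
+given the integer arithmetic above,
+	parse_units("2G;...", &megs)    sets megs = 2048 and returns ";..."
+	parse_units("10M;...", &megs)   sets megs = 10
+	parse_units("2048K;...", &megs) sets megs = 2 (512K truncates to 0)
+	parse_units("1048576", &megs)   sets megs = 1 (bare numbers are bytes)
+*/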
+
+/** Parse the input params and populate member variables.
+@param[in]	filepath_spec	path and size specification of the data files
+@param[in]	supports_raw	true if the tablespace supports raw devices
+@return true on successful parse */
+bool
+SysTablespace::parse_params(
+ const char* filepath_spec,
+ bool supports_raw)
+{
+ char* filepath;
+ ulint size;
+ char* input_str;
+ ulint n_files = 0;
+
+ ut_ad(m_last_file_size_max == 0);
+ ut_ad(!m_auto_extend_last_file);
+
+ char* new_str = mem_strdup(filepath_spec);
+ char* str = new_str;
+
+ input_str = str;
+
+ /*---------------------- PASS 1 ---------------------------*/
+ /* First calculate the number of data files and check syntax:
+	filepath:size[K|M|G];filepath:size[K|M|G]...
+ Note that a Windows path may contain a drive name and a ':'. */
+ while (*str != '\0') {
+ filepath = str;
+
+ while ((*str != ':' && *str != '\0')
+ || (*str == ':'
+ && (*(str + 1) == '\\' || *(str + 1) == '/'
+ || *(str + 1) == ':'))) {
+ str++;
+ }
+
+ if (*str == '\0') {
+ ut_free(new_str);
+
+ ib::error()
+ << "syntax error in file path or size"
+ " specified is less than 1 megabyte";
+ return(false);
+ }
+
+ str++;
+
+ str = parse_units(str, &size);
+
+ if (0 == strncmp(str, ":autoextend",
+ (sizeof ":autoextend") - 1)) {
+
+ str += (sizeof ":autoextend") - 1;
+
+ if (0 == strncmp(str, ":max:",
+ (sizeof ":max:") - 1)) {
+
+ str += (sizeof ":max:") - 1;
+
+ str = parse_units(str, &size);
+ }
+
+ if (*str != '\0') {
+ ut_free(new_str);
+ ib::error()
+ << "syntax error in file path or"
+ << " size specified is less than"
+ << " 1 megabyte";
+ return(false);
+ }
+ }
+
+ if (::strlen(str) >= 6
+ && *str == 'n'
+ && *(str + 1) == 'e'
+ && *(str + 2) == 'w') {
+
+ if (!supports_raw) {
+ ib::error()
+ << "Tablespace doesn't support raw"
+ " devices";
+ ut_free(new_str);
+ return(false);
+ }
+
+ str += 3;
+ }
+
+ if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
+ str += 3;
+
+ if (!supports_raw) {
+ ib::error()
+ << "Tablespace doesn't support raw"
+ " devices";
+ ut_free(new_str);
+ return(false);
+ }
+ }
+
+ if (size == 0) {
+
+ ut_free(new_str);
+
+ ib::error()
+ << "syntax error in file path or size"
+ " specified is less than 1 megabyte";
+
+ return(false);
+ }
+
+ ++n_files;
+
+ if (*str == ';') {
+ str++;
+ } else if (*str != '\0') {
+ ut_free(new_str);
+
+ ib::error()
+ << "syntax error in file path or size"
+ " specified is less than 1 megabyte";
+ return(false);
+ }
+ }
+
+ if (n_files == 0) {
+
+ /* filepath_spec must contain at least one data file
+ definition */
+
+ ut_free(new_str);
+
+ ib::error()
+ << "syntax error in file path or size specified"
+ " is less than 1 megabyte";
+
+ return(false);
+ }
+
+ /*---------------------- PASS 2 ---------------------------*/
+	/* Then store the actual values in our arrays */
+ str = input_str;
+ ulint order = 0;
+
+ while (*str != '\0') {
+ filepath = str;
+
+ /* Note that we must step over the ':' in a Windows filepath;
+ a Windows path normally looks like C:\ibdata\ibdata1:1G, but
+ a Windows raw partition may have a specification like
+ \\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */
+
+ while ((*str != ':' && *str != '\0')
+ || (*str == ':'
+ && (*(str + 1) == '\\' || *(str + 1) == '/'
+ || *(str + 1) == ':'))) {
+ str++;
+ }
+
+ if (*str == ':') {
+ /* Make filepath a null-terminated string */
+ *str = '\0';
+ str++;
+ }
+
+ str = parse_units(str, &size);
+
+ if (0 == strncmp(str, ":autoextend",
+ (sizeof ":autoextend") - 1)) {
+
+ m_auto_extend_last_file = true;
+
+ str += (sizeof ":autoextend") - 1;
+
+ if (0 == strncmp(str, ":max:",
+ (sizeof ":max:") - 1)) {
+
+ str += (sizeof ":max:") - 1;
+
+ str = parse_units(str, &m_last_file_size_max);
+ }
+
+ if (*str != '\0') {
+ ut_free(new_str);
+ ib::error() << "syntax error in file path or"
+ " size specified is less than 1"
+ " megabyte";
+ return(false);
+ }
+ }
+
+ m_files.push_back(Datafile(filepath, flags(), uint32_t(size),
+ order));
+ Datafile* datafile = &m_files.back();
+ datafile->make_filepath(path(), filepath, NO_EXT);
+
+ if (::strlen(str) >= 6
+ && *str == 'n'
+ && *(str + 1) == 'e'
+ && *(str + 2) == 'w') {
+
+ ut_a(supports_raw);
+
+ str += 3;
+
+			/* Initialize a new raw device only during server
+			initialization */
+ /* JAN: TODO: MySQL 5.7 used opt_initialize */
+ m_files.back().m_type =
+ opt_bootstrap ? SRV_NEW_RAW : SRV_OLD_RAW;
+ }
+
+ if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
+
+ ut_a(supports_raw);
+
+ str += 3;
+
+			/* Initialize a new raw device only during server
+			initialization */
+ if (m_files.back().m_type == SRV_NOT_RAW) {
+ /* JAN: TODO: MySQL 5.7 used opt_initialize */
+ m_files.back().m_type =
+ opt_bootstrap ? SRV_NEW_RAW : SRV_OLD_RAW;
+ }
+ }
+
+ if (*str == ';') {
+ ++str;
+ }
+ order++;
+ }
+
+ ut_ad(n_files == ulint(m_files.size()));
+
+ ut_free(new_str);
+
+ return(true);
+}
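+
+/* Editor's note, illustrative only (not part of the original source):
+for the specification "ibdata1:12M;ibdata2:12M:autoextend:max:500M",
+pass 1 counts two data files, and pass 2 adds Datafile("ibdata1", 12)
+and Datafile("ibdata2", 12), with the sizes still in megabytes, setting
+m_auto_extend_last_file = true and m_last_file_size_max = 500; the
+megabyte values are converted to pages later by normalize_size(). */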
+
+/** Frees the memory allocated by the parse method. */
+void
+SysTablespace::shutdown()
+{
+ Tablespace::shutdown();
+
+ m_auto_extend_last_file = 0;
+ m_last_file_size_max = 0;
+ m_created_new_raw = 0;
+ m_is_tablespace_full = false;
+ m_sanity_checks_done = false;
+}
+
+/** Verify the size of the physical file.
+@param[in] file data file object
+@return DB_SUCCESS if OK else error code. */
+dberr_t
+SysTablespace::check_size(
+ Datafile& file)
+{
+ os_offset_t size = os_file_get_size(file.m_handle);
+ ut_a(size != (os_offset_t) -1);
+
+	/* Under some error conditions, such as a full disk or the file
+	size reaching a filesystem limit, the data file can end with an
+	incomplete extent. The same can happen if a failure occurs while
+	we are extending the data file. Therefore, round the size down
+	to a full megabyte. */
+
+ const uint32_t rounded_size_pages = static_cast<uint32_t>(
+ size >> srv_page_size_shift);
+
+ /* If last file */
+ if (&file == &m_files.back() && m_auto_extend_last_file) {
+
+ if (file.m_size > rounded_size_pages
+ || (m_last_file_size_max > 0
+ && m_last_file_size_max < rounded_size_pages)) {
+ ib::error() << "The Auto-extending " << name()
+ << " data file '" << file.filepath() << "' is"
+ " of a different size " << rounded_size_pages
+ << " pages than specified"
+ " in the .cnf file: initial " << file.m_size
+ << " pages, max " << m_last_file_size_max
+ << " (relevant if non-zero) pages!";
+ return(DB_ERROR);
+ }
+
+ file.m_size = rounded_size_pages;
+ }
+
+ if (rounded_size_pages != file.m_size) {
+ ib::error() << "The " << name() << " data file '"
+ << file.filepath() << "' is of a different size "
+ << rounded_size_pages << " pages"
+ " than the " << file.m_size << " pages specified in"
+ " the .cnf file!";
+ return(DB_ERROR);
+ }
+
+ return(DB_SUCCESS);
+}
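+
+/* Editor's note, illustrative only (not part of the original source):
+with a 16KiB page size (srv_page_size_shift == 14), a 100MiB file gives
+rounded_size_pages = (100 << 20) >> 14 = 6400 pages, which must match the
+configured file.m_size unless this is the auto-extending last file. */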
+
+/** Set the size of the file.
+@param[in] file data file object
+@return DB_SUCCESS or error code */
+dberr_t
+SysTablespace::set_size(
+ Datafile& file)
+{
+ ut_ad(!srv_read_only_mode || m_ignore_read_only);
+
+ /* We created the data file and now write it full of zeros */
+ ib::info() << "Setting file '" << file.filepath() << "' size to "
+ << (file.m_size >> (20U - srv_page_size_shift)) << " MB."
+ " Physically writing the file full; Please wait ...";
+
+ bool success = os_file_set_size(
+ file.m_filepath, file.m_handle,
+ static_cast<os_offset_t>(file.m_size) << srv_page_size_shift);
+
+ if (success) {
+ ib::info() << "File '" << file.filepath() << "' size is now "
+ << (file.m_size >> (20U - srv_page_size_shift))
+ << " MB.";
+ } else {
+ ib::error() << "Could not set the file size of '"
+ << file.filepath() << "'. Probably out of disk space";
+
+ return(DB_ERROR);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Create a data file.
+@param[in] file data file object
+@return DB_SUCCESS or error code */
+dberr_t
+SysTablespace::create_file(
+ Datafile& file)
+{
+ dberr_t err = DB_SUCCESS;
+
+ ut_a(!file.m_exists);
+ ut_ad(!srv_read_only_mode || m_ignore_read_only);
+
+ switch (file.m_type) {
+ case SRV_NEW_RAW:
+
+ /* The partition is opened, not created; then it is
+ written over */
+ m_created_new_raw = true;
+
+ /* Fall through. */
+
+ case SRV_OLD_RAW:
+
+ srv_start_raw_disk_in_use = TRUE;
+
+ /* Fall through. */
+
+ case SRV_NOT_RAW:
+ err = file.open_or_create(
+ m_ignore_read_only ? false : srv_read_only_mode);
+ break;
+ }
+
+
+ if (err == DB_SUCCESS && file.m_type != SRV_OLD_RAW) {
+ err = set_size(file);
+ }
+
+ return(err);
+}
+
+/** Open a data file.
+@param[in] file data file object
+@return DB_SUCCESS or error code */
+dberr_t
+SysTablespace::open_file(
+ Datafile& file)
+{
+ dberr_t err = DB_SUCCESS;
+
+ ut_a(file.m_exists);
+
+ switch (file.m_type) {
+ case SRV_NEW_RAW:
+ /* The partition is opened, not created; then it is
+ written over */
+ m_created_new_raw = true;
+
+ /* Fall through */
+
+ case SRV_OLD_RAW:
+ srv_start_raw_disk_in_use = TRUE;
+
+ if (srv_read_only_mode && !m_ignore_read_only) {
+ ib::error() << "Can't open a raw device '"
+ << file.m_filepath << "' when"
+ " --innodb-read-only is set";
+
+ return(DB_ERROR);
+ }
+
+ /* Fall through */
+
+ case SRV_NOT_RAW:
+ err = file.open_or_create(
+ m_ignore_read_only ? false : srv_read_only_mode);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ break;
+ }
+
+ switch (file.m_type) {
+ case SRV_NEW_RAW:
+ /* Set file size for new raw device. */
+ err = set_size(file);
+ break;
+
+ case SRV_NOT_RAW:
+ /* Check file size for existing file. */
+ err = check_size(file);
+ break;
+
+ case SRV_OLD_RAW:
+ err = DB_SUCCESS;
+ break;
+
+ }
+
+ if (err != DB_SUCCESS) {
+ file.close();
+ }
+
+ return(err);
+}
+
+/** Check the tablespace header for this tablespace.
+@param[out] flushed_lsn the value of FIL_PAGE_FILE_FLUSH_LSN
+@return DB_SUCCESS or error code */
+dberr_t
+SysTablespace::read_lsn_and_check_flags(lsn_t* flushed_lsn)
+{
+ dberr_t err;
+
+ /* Only relevant for the system tablespace. */
+ ut_ad(space_id() == TRX_SYS_SPACE);
+
+ files_t::iterator it = m_files.begin();
+
+ ut_a(it->m_exists);
+
+ if (it->m_handle == OS_FILE_CLOSED) {
+
+ err = it->open_or_create(
+ m_ignore_read_only ? false : srv_read_only_mode);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ err = it->read_first_page(
+ m_ignore_read_only ? false : srv_read_only_mode);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ ut_a(it->order() == 0);
+
+ if (srv_operation == SRV_OPERATION_NORMAL) {
+ buf_dblwr.init_or_load_pages(it->handle(), it->filepath());
+ }
+
+ /* Check the contents of the first page of the
+ first datafile. */
+ for (int retry = 0; retry < 2; ++retry) {
+
+ err = it->validate_first_page(flushed_lsn);
+
+ if (err != DB_SUCCESS
+ && (retry == 1
+ || it->restore_from_doublewrite())) {
+
+ it->close();
+
+ return(err);
+ }
+ }
+
+ /* Make sure the tablespace space ID matches the
+ space ID on the first page of the first datafile. */
+ if (space_id() != it->m_space_id) {
+
+ ib::error()
+ << "The " << name() << " data file '" << it->name()
+ << "' has the wrong space ID. It should be "
+ << space_id() << ", but " << it->m_space_id
+ << " was found";
+
+ it->close();
+
+		return(DB_ERROR);
+ }
+
+ it->close();
+
+ return(DB_SUCCESS);
+}
+
+/** Check if a file can be opened in the correct mode.
+@param[in] file data file object
+@param[out] reason exact reason if file_status check failed.
+@return DB_SUCCESS or error code. */
+dberr_t
+SysTablespace::check_file_status(
+ const Datafile& file,
+ file_status_t& reason)
+{
+ os_file_stat_t stat;
+
+ memset(&stat, 0x0, sizeof(stat));
+
+ dberr_t err = os_file_get_status(
+ file.m_filepath, &stat, true,
+ m_ignore_read_only ? false : srv_read_only_mode);
+
+ reason = FILE_STATUS_VOID;
+ /* File exists but we can't read the rw-permission settings. */
+ switch (err) {
+ case DB_FAIL:
+ ib::error() << "os_file_get_status() failed on '"
+ << file.filepath()
+ << "'. Can't determine file permissions";
+ err = DB_ERROR;
+ reason = FILE_STATUS_RW_PERMISSION_ERROR;
+ break;
+
+ case DB_SUCCESS:
+
+ /* Note: stat.rw_perm is only valid for "regular" files */
+
+ if (stat.type == OS_FILE_TYPE_FILE) {
+
+ if (!stat.rw_perm) {
+ const char *p = (!srv_read_only_mode
+ || m_ignore_read_only)
+ ? "writable"
+ : "readable";
+
+ ib::error() << "The " << name() << " data file"
+ << " '" << file.name() << "' must be "
+ << p;
+
+ err = DB_ERROR;
+ reason = FILE_STATUS_READ_WRITE_ERROR;
+ }
+
+ } else {
+ /* Not a regular file, bail out. */
+ ib::error() << "The " << name() << " data file '"
+ << file.name() << "' is not a regular"
+ " InnoDB data file.";
+
+ err = DB_ERROR;
+ reason = FILE_STATUS_NOT_REGULAR_FILE_ERROR;
+ }
+ break;
+
+ case DB_NOT_FOUND:
+ break;
+
+ default:
+ ut_ad(0);
+ }
+
+ return(err);
+}
+
+/** Note that the data file was not found.
+@param[in]	file		data file object
+@param[out]	create_new_db	true if a new instance is to be created
+@return DB_SUCCESS or error code */
+dberr_t
+SysTablespace::file_not_found(
+ Datafile& file,
+ bool* create_new_db)
+{
+ file.m_exists = false;
+
+ if (srv_read_only_mode && !m_ignore_read_only) {
+ ib::error() << "Can't create file '" << file.filepath()
+ << "' when --innodb-read-only is set";
+
+ return(DB_ERROR);
+
+ } else if (&file == &m_files.front()) {
+
+ /* First data file. */
+ ut_a(!*create_new_db);
+ *create_new_db = TRUE;
+
+ if (space_id() == TRX_SYS_SPACE) {
+ ib::info() << "The first " << name() << " data file '"
+ << file.name() << "' did not exist."
+ " A new tablespace will be created!";
+ }
+
+ } else {
+ ib::info() << "Need to create a new " << name()
+ << " data file '" << file.name() << "'.";
+ }
+
+ /* Set the file create mode. */
+ switch (file.m_type) {
+ case SRV_NOT_RAW:
+ file.set_open_flags(OS_FILE_CREATE);
+ break;
+
+ case SRV_NEW_RAW:
+ case SRV_OLD_RAW:
+ file.set_open_flags(OS_FILE_OPEN_RAW);
+ break;
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Note that the data file was found.
+@param[in,out]	file	data file object
+@return true if a new instance is to be created */
+bool
+SysTablespace::file_found(
+ Datafile& file)
+{
+ /* Note that the file exists and can be opened
+ in the appropriate mode. */
+ file.m_exists = true;
+
+ /* Set the file open mode */
+ switch (file.m_type) {
+ case SRV_NOT_RAW:
+ file.set_open_flags(
+ &file == &m_files.front()
+ ? OS_FILE_OPEN_RETRY : OS_FILE_OPEN);
+ break;
+
+ case SRV_NEW_RAW:
+ case SRV_OLD_RAW:
+ file.set_open_flags(OS_FILE_OPEN_RAW);
+ break;
+ }
+
+	/* Need to create the system tablespace for a new raw device. */
+ return(file.m_type == SRV_NEW_RAW);
+}
+
+/** Check the data file specification.
+@param[out] create_new_db true if a new database is to be created
+@param[in] min_expected_size Minimum expected tablespace size in bytes
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+SysTablespace::check_file_spec(
+ bool* create_new_db,
+ ulint min_expected_size)
+{
+ *create_new_db = FALSE;
+
+ if (m_files.size() >= 1000) {
+ ib::error() << "There must be < 1000 data files in "
+ << name() << " but " << m_files.size() << " have been"
+ " defined.";
+
+ return(DB_ERROR);
+ }
+
+ if (!m_auto_extend_last_file
+ && get_sum_of_sizes()
+ < (min_expected_size >> srv_page_size_shift)) {
+ ib::error() << "Tablespace size must be at least "
+ << (min_expected_size >> 20) << " MB";
+ return(DB_ERROR);
+ }
+
+ dberr_t err = DB_SUCCESS;
+
+ ut_a(!m_files.empty());
+
+ /* If there is more than one data file and the last data file
+	doesn't exist, that is OK. We allow adding new data files. */
+
+ files_t::iterator begin = m_files.begin();
+ files_t::iterator end = m_files.end();
+
+ for (files_t::iterator it = begin; it != end; ++it) {
+
+ file_status_t reason_if_failed;
+ err = check_file_status(*it, reason_if_failed);
+
+ if (err == DB_NOT_FOUND) {
+
+ err = file_not_found(*it, create_new_db);
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ } else if (err != DB_SUCCESS) {
+ if (reason_if_failed == FILE_STATUS_READ_WRITE_ERROR) {
+ const char* p = (!srv_read_only_mode
+ || m_ignore_read_only)
+ ? "writable" : "readable";
+ ib::error() << "The " << name() << " data file"
+ << " '" << it->name() << "' must be "
+ << p;
+ }
+
+ ut_a(err != DB_FAIL);
+ break;
+
+ } else if (*create_new_db) {
+ ib::error() << "The " << name() << " data file '"
+ << begin->m_name << "' was not found but"
+ " one of the other data files '" << it->m_name
+ << "' exists.";
+
+ err = DB_ERROR;
+ break;
+
+ } else {
+ *create_new_db = file_found(*it);
+ }
+ }
+
+ return(err);
+}
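+
+/* Editor's note, illustrative only (not part of the original source):
+for the specification "ibdata1:12M;ibdata2:12M", a missing ibdata2 alone
+is acceptable and is merely flagged for creation; a missing ibdata1 while
+ibdata2 exists is an error; and when both files are missing,
+*create_new_db is set and a fresh instance is bootstrapped. */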
+
+/** Open or create the data files
+@param[in] is_temp whether this is a temporary tablespace
+@param[in] create_new_db whether we are creating a new database
+@param[out] sum_new_sizes sum of sizes of the new files added
+@param[out] flush_lsn FIL_PAGE_FILE_FLUSH_LSN of first file
+@return DB_SUCCESS or error code */
+dberr_t
+SysTablespace::open_or_create(
+ bool is_temp,
+ bool create_new_db,
+ ulint* sum_new_sizes,
+ lsn_t* flush_lsn)
+{
+ dberr_t err = DB_SUCCESS;
+ fil_space_t* space = NULL;
+
+ ut_ad(!m_files.empty());
+
+ if (sum_new_sizes) {
+ *sum_new_sizes = 0;
+ }
+
+ files_t::iterator begin = m_files.begin();
+ files_t::iterator end = m_files.end();
+
+ ut_ad(begin->order() == 0);
+
+ for (files_t::iterator it = begin; it != end; ++it) {
+
+ if (it->m_exists) {
+ err = open_file(*it);
+
+			/* For a new raw device, count its size as new. */
+ if (sum_new_sizes && it->m_type == SRV_NEW_RAW) {
+
+ *sum_new_sizes += it->m_size;
+ }
+
+ } else {
+ err = create_file(*it);
+
+ if (sum_new_sizes) {
+ *sum_new_sizes += it->m_size;
+ }
+
+ /* Set the correct open flags now that we have
+ successfully created the file. */
+ if (err == DB_SUCCESS) {
+				/* We ignore the new_db return value of
+				file_found() here, as that information is
+				already known at this stage. */
+ file_found(*it);
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ }
+
+ if (!create_new_db && flush_lsn) {
+		/* Validate the header page in the first datafile
+		and read LSNs from the others. */
+ err = read_lsn_and_check_flags(flush_lsn);
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+	/* Close the current handles, add space and file info to the
+	fil_system cache and the Data Dictionary, and re-open them
+	in the fil_system cache so that they stay open until shutdown. */
+ ulint node_counter = 0;
+ for (files_t::iterator it = begin; it != end; ++it) {
+ it->close();
+ it->m_exists = true;
+
+ if (it != begin) {
+ } else if (is_temp) {
+ ut_ad(space_id() == SRV_TMP_SPACE_ID);
+ space = fil_space_t::create(
+ name(), SRV_TMP_SPACE_ID, flags(),
+ FIL_TYPE_TEMPORARY, NULL);
+ ut_ad(space == fil_system.temp_space);
+ if (!space) {
+ return DB_ERROR;
+ }
+ ut_ad(!space->is_compressed());
+ ut_ad(space->full_crc32());
+ } else {
+ ut_ad(space_id() == TRX_SYS_SPACE);
+ space = fil_space_t::create(
+ name(), TRX_SYS_SPACE, it->flags(),
+ FIL_TYPE_TABLESPACE, NULL);
+ ut_ad(space == fil_system.sys_space);
+ if (!space) {
+ return DB_ERROR;
+ }
+ }
+
+ ut_a(fil_validate());
+
+ uint32_t max_size = (++node_counter == m_files.size()
+ ? (m_last_file_size_max == 0
+ ? UINT32_MAX
+ : uint32_t(m_last_file_size_max))
+ : it->m_size);
+
+ space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size,
+ it->m_type != SRV_NOT_RAW, true, max_size);
+ }
+
+ return(err);
+}
+
+/** Normalize the file size, converting from megabytes to a number of pages. */
+void
+SysTablespace::normalize_size()
+{
+ files_t::iterator end = m_files.end();
+
+ for (files_t::iterator it = m_files.begin(); it != end; ++it) {
+
+ it->m_size <<= (20U - srv_page_size_shift);
+ }
+
+ m_last_file_size_max <<= (20U - srv_page_size_shift);
+}
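+
+/* Editor's note, illustrative only (not part of the original source):
+with the default 16KiB page size, 20U - srv_page_size_shift == 6, so a
+12 (megabyte) entry becomes 12 << 6 == 768 pages. */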
+
+
+/**
+@return next increment size */
+uint32_t SysTablespace::get_increment() const
+{
+ if (m_last_file_size_max == 0)
+ return get_autoextend_increment();
+
+ if (!is_valid_size())
+ {
+ ib::error() << "The last data file in " << name()
+ << " has a size of " << last_file_size()
+ << " but the max size allowed is "
+ << m_last_file_size_max;
+ }
+
+ return std::min(uint32_t(m_last_file_size_max) - last_file_size(),
+ get_autoextend_increment());
+}
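+
+/* Editor's note, illustrative only (not part of the original source):
+if the last file currently holds 99500 pages and m_last_file_size_max is
+100000 pages, the next increment is min(500, get_autoextend_increment()),
+so the file never grows beyond the configured :max: size. */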
+
+
+/**
+@return true if configured to use raw devices */
+bool
+SysTablespace::has_raw_device()
+{
+ files_t::iterator end = m_files.end();
+
+ for (files_t::iterator it = m_files.begin(); it != end; ++it) {
+
+ if (it->is_raw_device()) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
diff --git a/storage/innobase/fts/Makefile.query b/storage/innobase/fts/Makefile.query
new file mode 100644
index 00000000..d91b1b92
--- /dev/null
+++ b/storage/innobase/fts/Makefile.query
@@ -0,0 +1,18 @@
+LEX=flex
+YACC=bison
+PREFIX=fts
+
+all: fts0pars.cc fts0blex.cc fts0tlex.cc
+
+fts0pars.cc: fts0pars.y
+fts0blex.cc: fts0blex.l
+fts0tlex.cc: fts0tlex.l
+
+.l.cc:
+ echo '#include "univ.i"' > $*.cc
+ $(LEX) --stdout -P$(subst lex,,$*) -o $*.cc \
+ --header-file=../include/$*.h $< >> $*.cc
+
+.y.cc:
+ $(YACC) -p $(PREFIX) -o $*.cc -d $<
+ mv $*.h ../include
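+
+# Editor's note, illustrative only (not part of the original file): for
+# fts0blex.l the .l.cc suffix rule expands roughly to
+#	echo '#include "univ.i"' > fts0blex.cc
+#	flex --stdout -Pfts0b -o fts0blex.cc \
+#		--header-file=../include/fts0blex.h fts0blex.l >> fts0blex.cc
+# since $(subst lex,,fts0blex) yields the scanner prefix "fts0b".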
diff --git a/storage/innobase/fts/fts0ast.cc b/storage/innobase/fts/fts0ast.cc
new file mode 100644
index 00000000..bb42f7c9
--- /dev/null
+++ b/storage/innobase/fts/fts0ast.cc
@@ -0,0 +1,815 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0ast.cc
+Full Text Search parser helper file.
+
+Created 2007/3/16 Sunny Bains.
+***********************************************************************/
+
+#include "row0sel.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+#include "fts0fts.h"
+
+/* The FTS ast visit pass. */
+enum fts_ast_visit_pass_t {
+ FTS_PASS_FIRST, /*!< First visit pass,
+ process operators excluding
+ FTS_EXIST and FTS_IGNORE */
+ FTS_PASS_EXIST, /*!< Exist visit pass,
+ process operator FTS_EXIST */
+ FTS_PASS_IGNORE /*!< Ignore visit pass,
+ process operator FTS_IGNORE */
+};
+
+/******************************************************************//**
+Create an empty fts_ast_node_t.
+@return new node */
+static
+fts_ast_node_t*
+fts_ast_node_create(void)
+/*=====================*/
+{
+ fts_ast_node_t* node;
+
+ node = (fts_ast_node_t*) ut_zalloc_nokey(sizeof(*node));
+
+ return(node);
+}
+
+/** Track node allocations, in case there is an error during parsing. */
+static
+void
+fts_ast_state_add_node(
+ fts_ast_state_t*state, /*!< in: ast instance */
+ fts_ast_node_t* node) /*!< in: node to add to ast */
+{
+ if (!state->list.head) {
+ ut_a(!state->list.tail);
+
+ state->list.head = state->list.tail = node;
+ } else {
+ state->list.tail->next_alloc = node;
+ state->list.tail = node;
+ }
+}
+
+/******************************************************************//**
+Create an operator fts_ast_node_t.
+@return new node */
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+ void* arg, /*!< in: ast state instance */
+ fts_ast_oper_t oper) /*!< in: ast operator */
+{
+ fts_ast_node_t* node = fts_ast_node_create();
+
+ node->type = FTS_AST_OPER;
+ node->oper = oper;
+
+ fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+ return(node);
+}
+
+/******************************************************************//**
+This function takes ownership of the ptr and is responsible
+for freeing it.
+@return new node or a node list with tokenized words */
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+ void* arg, /*!< in: ast state instance */
+ const fts_ast_string_t* ptr) /*!< in: ast term string */
+{
+ fts_ast_state_t* state = static_cast<fts_ast_state_t*>(arg);
+ ulint len = ptr->len;
+ ulint cur_pos = 0;
+ fts_ast_node_t* node = NULL;
+ fts_ast_node_t* node_list = NULL;
+ fts_ast_node_t* first_node = NULL;
+
+ /* Scan the incoming string and filter out any "non-word" characters */
+ while (cur_pos < len) {
+ fts_string_t str;
+ ulint cur_len;
+
+ cur_len = innobase_mysql_fts_get_token(
+ state->charset,
+ reinterpret_cast<const byte*>(ptr->str) + cur_pos,
+ reinterpret_cast<const byte*>(ptr->str) + len, &str);
+
+ if (cur_len == 0) {
+ break;
+ }
+
+ cur_pos += cur_len;
+
+ if (str.f_n_char > 0) {
+			/* If a subsequent term (after the first one) is
+			shorter than fts_min_token_size, or any term is
+			longer than fts_max_token_size, ignore it. This
+			is consistent with MyISAM behavior. */
+ if ((first_node && (str.f_n_char < fts_min_token_size))
+ || str.f_n_char > fts_max_token_size) {
+ continue;
+ }
+
+ node = fts_ast_node_create();
+
+ node->type = FTS_AST_TERM;
+
+ node->term.ptr = fts_ast_string_create(
+ str.f_str, str.f_len);
+
+ fts_ast_state_add_node(
+ static_cast<fts_ast_state_t*>(arg), node);
+
+ if (first_node) {
+ /* There is more than one word, create
+ a list to organize them */
+ if (!node_list) {
+ node_list = fts_ast_create_node_list(
+ static_cast<fts_ast_state_t*>(
+ arg),
+ first_node);
+ }
+
+ fts_ast_add_node(node_list, node);
+ } else {
+ first_node = node;
+ }
+ }
+ }
+
+ return((node_list != NULL) ? node_list : first_node);
+}
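+
+/* Editor's note, illustrative only (not part of the original source):
+for the input term string "hello world" this returns an FTS_AST_LIST
+holding two FTS_AST_TERM nodes ("hello" and "world"), while a single
+word such as "hello" returns the bare FTS_AST_TERM node itself. */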
+
+/******************************************************************//**
+Create an AST term node, makes a copy of ptr for plugin parser
+@return node */
+fts_ast_node_t*
+fts_ast_create_node_term_for_parser(
+/*================================*/
+ void* arg, /*!< in: ast state */
+ const char* ptr, /*!< in: term string */
+ const ulint len) /*!< in: term string length */
+{
+ fts_ast_node_t* node = NULL;
+
+	/* A '%' as the first character is forbidden for LIKE in the internal
+	SQL parser; a '%' as the last character is reserved for wildcard
+	search. */
+ if (len == 0 || len > FTS_MAX_WORD_LEN
+ || ptr[0] == '%' || ptr[len - 1] == '%') {
+ return(NULL);
+ }
+
+ node = fts_ast_node_create();
+
+ node->type = FTS_AST_TERM;
+
+ node->term.ptr = fts_ast_string_create(
+ reinterpret_cast<const byte*>(ptr), len);
+
+ fts_ast_state_add_node(static_cast<fts_ast_state_t*>(arg), node);
+
+ return(node);
+}
+
+/******************************************************************//**
+This function takes ownership of the ptr and is responsible
+for freeing it.
+@return new node */
+fts_ast_node_t*
+fts_ast_create_node_text(
+/*=====================*/
+ void* arg, /*!< in: ast state instance */
+ const fts_ast_string_t* ptr) /*!< in: ast text string */
+{
+ ulint len = ptr->len;
+ fts_ast_node_t* node = NULL;
+
+	/* Once we get here, the string must be surrounded by at least two
+	quotes "", and the quoted query string may be empty. The query string
+	may also contain 0x00 bytes, so we do not treat it as
+	NUL-terminated. */
+ ut_ad(len >= 2);
+ ut_ad(ptr->str[0] == '\"' && ptr->str[len - 1] == '\"');
+
+ if (len == 2) {
+ /* If the query string contains nothing except quotes,
+ it's obviously an invalid query. */
+ return(NULL);
+ }
+
+ node = fts_ast_node_create();
+
+	/* We ignore the actual quotes "" */
+ len -= 2;
+
+ node->type = FTS_AST_TEXT;
+	/* Skip copying the first quote */
+ node->text.ptr = fts_ast_string_create(
+ reinterpret_cast<const byte*>(ptr->str + 1), len);
+ node->text.distance = ULINT_UNDEFINED;
+
+ fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+ return(node);
+}
+
+/******************************************************************//**
+Create an AST phrase list node for plugin parser
+@return node */
+fts_ast_node_t*
+fts_ast_create_node_phrase_list(
+/*============================*/
+ void* arg) /*!< in: ast state */
+{
+ fts_ast_node_t* node = fts_ast_node_create();
+
+ node->type = FTS_AST_PARSER_PHRASE_LIST;
+
+ node->text.distance = ULINT_UNDEFINED;
+ node->list.head = node->list.tail = NULL;
+
+ fts_ast_state_add_node(static_cast<fts_ast_state_t*>(arg), node);
+
+ return(node);
+}
+
+/******************************************************************//**
+This function takes ownership of the expr and is responsible
+for freeing it.
+@return new node */
+fts_ast_node_t*
+fts_ast_create_node_list(
+/*=====================*/
+ void* arg, /*!< in: ast state instance */
+ fts_ast_node_t* expr) /*!< in: ast expr instance */
+{
+ fts_ast_node_t* node = fts_ast_node_create();
+
+ node->type = FTS_AST_LIST;
+ node->list.head = node->list.tail = expr;
+
+ fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+ return(node);
+}
+
+/******************************************************************//**
+Create a sub-expression list node. This function takes ownership of
+expr and is responsible for deleting it.
+@return new node */
+fts_ast_node_t*
+fts_ast_create_node_subexp_list(
+/*============================*/
+ void* arg, /*!< in: ast state instance */
+ fts_ast_node_t* expr) /*!< in: ast expr instance */
+{
+ fts_ast_node_t* node = fts_ast_node_create();
+
+ node->type = FTS_AST_SUBEXP_LIST;
+ node->list.head = node->list.tail = expr;
+
+ fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+ return(node);
+}
+
+/******************************************************************//**
+Free the elements of an expr list node. */
+static
+void
+fts_ast_free_list(
+/*==============*/
+ fts_ast_node_t* node) /*!< in: ast node to free */
+{
+ ut_a(node->type == FTS_AST_LIST
+ || node->type == FTS_AST_SUBEXP_LIST
+ || node->type == FTS_AST_PARSER_PHRASE_LIST);
+
+ for (node = node->list.head;
+ node != NULL;
+ node = fts_ast_free_node(node)) {
+
+		/* No op */
+ }
+}
+
+/********************************************************************//**
+Free a fts_ast_node_t instance.
+@return next node to free */
+fts_ast_node_t*
+fts_ast_free_node(
+/*==============*/
+ fts_ast_node_t* node) /*!< in: the node to free */
+{
+ fts_ast_node_t* next_node;
+
+ switch (node->type) {
+ case FTS_AST_TEXT:
+ if (node->text.ptr) {
+ fts_ast_string_free(node->text.ptr);
+ node->text.ptr = NULL;
+ }
+ break;
+
+ case FTS_AST_TERM:
+ if (node->term.ptr) {
+ fts_ast_string_free(node->term.ptr);
+ node->term.ptr = NULL;
+ }
+ break;
+
+ case FTS_AST_LIST:
+ case FTS_AST_SUBEXP_LIST:
+ case FTS_AST_PARSER_PHRASE_LIST:
+ fts_ast_free_list(node);
+ node->list.head = node->list.tail = NULL;
+ break;
+
+ case FTS_AST_OPER:
+ break;
+
+ default:
+ ut_error;
+ }
+
+	/* Get next node before freeing the node itself */
+ next_node = node->next;
+
+ ut_free(node);
+
+ return(next_node);
+}
+
+/******************************************************************//**
+This function takes ownership of the elem node and is responsible
+for freeing it.
+@return the list node passed in as "node" */
+fts_ast_node_t*
+fts_ast_add_node(
+/*=============*/
+ fts_ast_node_t* node, /*!< in: list instance */
+ fts_ast_node_t* elem) /*!< in: node to add to list */
+{
+ if (!elem) {
+ return(NULL);
+ }
+
+ ut_a(!elem->next);
+ ut_a(node->type == FTS_AST_LIST
+ || node->type == FTS_AST_SUBEXP_LIST
+ || node->type == FTS_AST_PARSER_PHRASE_LIST);
+
+ if (!node->list.head) {
+ ut_a(!node->list.tail);
+
+ node->list.head = node->list.tail = elem;
+ } else {
+ ut_a(node->list.tail);
+
+ node->list.tail->next = elem;
+ node->list.tail = elem;
+ }
+
+ return(node);
+}
+
+/******************************************************************//**
+Set the wildcard attribute of a term. */
+void
+fts_ast_term_set_wildcard(
+/*======================*/
+ fts_ast_node_t* node) /*!< in/out: set attribute of
+ a term node */
+{
+ if (!node) {
+ return;
+ }
+
+	/* If it's a node list, the wildcard is set on the tail node. */
+ if (node->type == FTS_AST_LIST) {
+ ut_ad(node->list.tail != NULL);
+ node = node->list.tail;
+ }
+
+ ut_a(node->type == FTS_AST_TERM);
+ ut_a(!node->term.wildcard);
+
+ node->term.wildcard = TRUE;
+}
+
+/******************************************************************//**
+Set the proximity attribute of a text node. */
+void
+fts_ast_text_set_distance(
+/*======================*/
+ fts_ast_node_t* node, /*!< in/out: text node */
+ ulint distance) /*!< in: the text proximity
+ distance */
+{
+ if (node == NULL) {
+ return;
+ }
+
+ ut_a(node->type == FTS_AST_TEXT);
+ ut_a(node->text.distance == ULINT_UNDEFINED);
+
+ node->text.distance = distance;
+}
+
+/******************************************************************//**
+Free node and expr allocations. */
+void
+fts_ast_state_free(
+/*===============*/
+ fts_ast_state_t*state) /*!< in: ast state to free */
+{
+ fts_ast_node_t* node = state->list.head;
+
+ /* Free the nodes that were allocated during parsing. */
+ while (node) {
+ fts_ast_node_t* next = node->next_alloc;
+
+ if (node->type == FTS_AST_TEXT && node->text.ptr) {
+ fts_ast_string_free(node->text.ptr);
+ node->text.ptr = NULL;
+ } else if (node->type == FTS_AST_TERM && node->term.ptr) {
+ fts_ast_string_free(node->term.ptr);
+ node->term.ptr = NULL;
+ }
+
+ ut_free(node);
+ node = next;
+ }
+
+ state->root = state->list.head = state->list.tail = NULL;
+}
+
+/** Print the ast string
+@param[in]	ast_str	string to print */
+static
+void
+fts_ast_string_print(
+ const fts_ast_string_t* ast_str)
+{
+ for (ulint i = 0; i < ast_str->len; ++i) {
+ printf("%c", ast_str->str[i]);
+ }
+
+ printf("\n");
+}
+
+/******************************************************************//**
+Print an ast node recursively. */
+static
+void
+fts_ast_node_print_recursive(
+/*=========================*/
+ fts_ast_node_t* node, /*!< in: ast node to print */
+ ulint level) /*!< in: recursive level */
+{
+	/* Print indentation blanks */
+ for (ulint i = 0; i < level; i++) {
+ printf(" ");
+ }
+
+ switch (node->type) {
+ case FTS_AST_TEXT:
+ printf("TEXT: ");
+ fts_ast_string_print(node->text.ptr);
+ break;
+
+ case FTS_AST_TERM:
+ printf("TERM: ");
+ fts_ast_string_print(node->term.ptr);
+ break;
+
+ case FTS_AST_LIST:
+ printf("LIST: \n");
+
+ for (node = node->list.head; node; node = node->next) {
+ fts_ast_node_print_recursive(node, level + 1);
+ }
+ break;
+
+ case FTS_AST_SUBEXP_LIST:
+ printf("SUBEXP_LIST: \n");
+
+ for (node = node->list.head; node; node = node->next) {
+ fts_ast_node_print_recursive(node, level + 1);
+ }
+ break;
+
+ case FTS_AST_OPER:
+ printf("OPER: %d\n", node->oper);
+ break;
+
+ case FTS_AST_PARSER_PHRASE_LIST:
+ printf("PARSER_PHRASE_LIST: \n");
+
+ for (node = node->list.head; node; node = node->next) {
+ fts_ast_node_print_recursive(node, level + 1);
+ }
+ break;
+
+ default:
+ ut_error;
+ }
+}
+
+/******************************************************************//**
+Print an ast node */
+void
+fts_ast_node_print(
+/*===============*/
+ fts_ast_node_t* node) /*!< in: ast node to print */
+{
+ fts_ast_node_print_recursive(node, 0);
+}
+
+/** Check whether only union operations are involved in the node
+@param[in]	node	ast node to check
+@return true if the node contains only union operations, else false */
+bool
+fts_ast_node_check_union(
+ fts_ast_node_t* node)
+{
+ if (node->type == FTS_AST_LIST
+ || node->type == FTS_AST_SUBEXP_LIST) {
+
+ for (node = node->list.head; node; node = node->next) {
+ if (!fts_ast_node_check_union(node)) {
+ return(false);
+ }
+ }
+
+ } else if (node->type == FTS_AST_PARSER_PHRASE_LIST) {
+ /* Phrase search for plugin parser */
+ return(false);
+ } else if (node->type == FTS_AST_OPER
+ && (node->oper == FTS_IGNORE
+ || node->oper == FTS_EXIST)) {
+
+ return(false);
+ } else if (node->type == FTS_AST_TEXT) {
+ /* Distance or phrase search query. */
+ return(false);
+ }
+
+ return(true);
+}
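+
+/* Editor's note, illustrative only (not part of the original source):
+the query 'a b c' is union-only and returns true, whereas 'a +b',
+'a -b', the phrase '"a b"' and any plugin-parser phrase list all
+return false. */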
+
+/******************************************************************//**
+Traverse the AST - in-order traversal, except for the FTS_EXIST and FTS_IGNORE
+nodes, which will be ignored in the first pass of each level, and visited in a
+second and third pass after all other nodes in the same level are visited.
+@return DB_SUCCESS if all went well */
+dberr_t
+fts_ast_visit(
+/*==========*/
+ fts_ast_oper_t oper, /*!< in: current operator */
+ fts_ast_node_t* node, /*!< in: current root node */
+ fts_ast_callback visitor, /*!< in: callback function */
+ void* arg, /*!< in: arg for callback */
+ bool* has_ignore) /*!< out: true, if the operator
+ was ignored during processing,
+ currently we ignore FTS_EXIST
+ and FTS_IGNORE operators */
+{
+ dberr_t error = DB_SUCCESS;
+ fts_ast_node_t* oper_node = NULL;
+ fts_ast_node_t* start_node;
+ bool revisit = false;
+ bool will_be_ignored = false;
+ fts_ast_visit_pass_t visit_pass = FTS_PASS_FIRST;
+ const trx_t* trx = node->trx;
+
+ start_node = node->list.head;
+
+ ut_a(node->type == FTS_AST_LIST
+ || node->type == FTS_AST_SUBEXP_LIST);
+
+ if (oper == FTS_EXIST_SKIP) {
+ visit_pass = FTS_PASS_EXIST;
+ } else if (oper == FTS_IGNORE_SKIP) {
+ visit_pass = FTS_PASS_IGNORE;
+ }
+
+	/* In the first pass over the tree, at the leaf level,
+	FTS_EXIST and FTS_IGNORE operations are ignored. The pass is
+	repeated at the level above the leaf level.
+
+	The basic idea here is that when we encounter FTS_EXIST or
+	FTS_IGNORE, we change the operator node into FTS_EXIST_SKIP
+	or FTS_IGNORE_SKIP, and the term and text nodes under those
+	operators are ignored in the first pass. We have two passes
+	during the revisit: we process nodes with FTS_EXIST_SKIP in the
+	exist pass, and then process nodes with FTS_IGNORE_SKIP in the
+	ignore pass.
+
+	This order must be strictly followed, or we will get wrong results.
+	For example, for the query 'a +b -c d +e -f':
+	first pass: process 'a' and 'd' by union;
+	exist pass: process '+b' and '+e' by intersection;
+	ignore pass: process '-c' and '-f' by difference. */
+
+ for (node = node->list.head;
+ node && (error == DB_SUCCESS);
+ node = node->next) {
+
+ switch (node->type) {
+ case FTS_AST_LIST:
+ if (visit_pass != FTS_PASS_FIRST) {
+ break;
+ }
+
+ error = fts_ast_visit(oper, node, visitor,
+ arg, &will_be_ignored);
+
+			/* If will_be_ignored is set to true, then
+			we encountered and ignored an FTS_EXIST or
+			FTS_IGNORE operator. */
+ if (will_be_ignored) {
+ revisit = true;
+				/* Remember oper for the list; e.g. in
+				'-abc&def' the ignored oper comes from
+				the previous node of the list. */
+ node->oper = oper;
+ }
+
+ break;
+
+ case FTS_AST_OPER:
+ oper = node->oper;
+ oper_node = node;
+
+ /* Change the operator for revisit */
+ if (oper == FTS_EXIST) {
+ oper_node->oper = FTS_EXIST_SKIP;
+ } else if (oper == FTS_IGNORE) {
+ oper_node->oper = FTS_IGNORE_SKIP;
+ }
+
+ break;
+
+ default:
+ if (node->visited) {
+ continue;
+ }
+
+ ut_a(oper == FTS_NONE || !oper_node
+ || oper_node->oper == oper
+ || oper_node->oper == FTS_EXIST_SKIP
+ || oper_node->oper == FTS_IGNORE_SKIP);
+
+			if (oper == FTS_EXIST || oper == FTS_IGNORE) {
+ *has_ignore = true;
+ continue;
+ }
+
+			/* Process the leaf node according to its pass. */
+ if (oper == FTS_EXIST_SKIP
+ && visit_pass == FTS_PASS_EXIST) {
+ error = visitor(FTS_EXIST, node, arg);
+ node->visited = true;
+ } else if (oper == FTS_IGNORE_SKIP
+ && visit_pass == FTS_PASS_IGNORE) {
+ error = visitor(FTS_IGNORE, node, arg);
+ node->visited = true;
+ } else if (visit_pass == FTS_PASS_FIRST) {
+ error = visitor(oper, node, arg);
+ node->visited = true;
+ }
+ }
+ }
+
+ if (trx_is_interrupted(trx)) {
+ return DB_INTERRUPTED;
+ }
+
+ if (revisit) {
+ /* Exist pass processes the skipped FTS_EXIST operation. */
+ for (node = start_node;
+ node && error == DB_SUCCESS;
+ node = node->next) {
+
+ if (node->type == FTS_AST_LIST
+ && node->oper != FTS_IGNORE) {
+ error = fts_ast_visit(FTS_EXIST_SKIP, node,
+ visitor, arg, &will_be_ignored);
+ }
+ }
+
+ /* Ignore pass processes the skipped FTS_IGNORE operation. */
+ for (node = start_node;
+ node && error == DB_SUCCESS;
+ node = node->next) {
+
+ if (node->type == FTS_AST_LIST) {
+ error = fts_ast_visit(FTS_IGNORE_SKIP, node,
+ visitor, arg, &will_be_ignored);
+ }
+ }
+ }
+
+ return(error);
+}
+
+/**
+Create an ast string object with a NUL terminator, so the string
+has one more byte than len
+@param[in] str pointer to string
+@param[in] len length of the string
+@return ast string with NUL-terminator */
+fts_ast_string_t*
+fts_ast_string_create(
+ const byte* str,
+ ulint len)
+{
+ fts_ast_string_t* ast_str;
+
+ ut_ad(len > 0);
+
+ ast_str = static_cast<fts_ast_string_t*>(
+ ut_malloc_nokey(sizeof(fts_ast_string_t)));
+
+ ast_str->str = static_cast<byte*>(ut_malloc_nokey(len + 1));
+
+ ast_str->len = len;
+ memcpy(ast_str->str, str, len);
+ ast_str->str[len] = '\0';
+
+ return(ast_str);
+}
+
+/**
+Free an ast string instance
+@param[in,out] ast_str string to free */
+void
+fts_ast_string_free(
+ fts_ast_string_t* ast_str)
+{
+ if (ast_str != NULL) {
+ ut_free(ast_str->str);
+ ut_free(ast_str);
+ }
+}
+
+/**
+Translate an ast string of type FTS_AST_NUMB to unsigned long using strtoul
+@param[in]	ast_str	string to translate
+@param[in] base the base
+@return translated number */
+ulint
+fts_ast_string_to_ul(
+ const fts_ast_string_t* ast_str,
+ int base)
+{
+ return(strtoul(reinterpret_cast<const char*>(ast_str->str),
+ NULL, base));
+}
+
+#ifdef UNIV_DEBUG
+const char*
+fts_ast_node_type_get(fts_ast_type_t type)
+{
+ switch (type) {
+ case FTS_AST_OPER:
+ return("FTS_AST_OPER");
+ case FTS_AST_NUMB:
+ return("FTS_AST_NUMB");
+ case FTS_AST_TERM:
+ return("FTS_AST_TERM");
+ case FTS_AST_TEXT:
+ return("FTS_AST_TEXT");
+ case FTS_AST_LIST:
+ return("FTS_AST_LIST");
+ case FTS_AST_SUBEXP_LIST:
+ return("FTS_AST_SUBEXP_LIST");
+ case FTS_AST_PARSER_PHRASE_LIST:
+ return("FTS_AST_PARSER_PHRASE_LIST");
+ }
+ ut_ad(0);
+ return("FTS_UNKNOWN");
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/fts/fts0blex.cc b/storage/innobase/fts/fts0blex.cc
new file mode 100644
index 00000000..6a2b4202
--- /dev/null
+++ b/storage/innobase/fts/fts0blex.cc
@@ -0,0 +1,2177 @@
+#include "univ.i"
+#line 2 "fts0blex.cc"
+
+#line 4 "fts0blex.cc"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 6
+#define YY_FLEX_SUBMINOR_VERSION 4
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+#ifdef yy_create_buffer
+#define fts0b_create_buffer_ALREADY_DEFINED
+#else
+#define yy_create_buffer fts0b_create_buffer
+#endif
+
+#ifdef yy_delete_buffer
+#define fts0b_delete_buffer_ALREADY_DEFINED
+#else
+#define yy_delete_buffer fts0b_delete_buffer
+#endif
+
+#ifdef yy_scan_buffer
+#define fts0b_scan_buffer_ALREADY_DEFINED
+#else
+#define yy_scan_buffer fts0b_scan_buffer
+#endif
+
+#ifdef yy_scan_string
+#define fts0b_scan_string_ALREADY_DEFINED
+#else
+#define yy_scan_string fts0b_scan_string
+#endif
+
+#ifdef yy_scan_bytes
+#define fts0b_scan_bytes_ALREADY_DEFINED
+#else
+#define yy_scan_bytes fts0b_scan_bytes
+#endif
+
+#ifdef yy_init_buffer
+#define fts0b_init_buffer_ALREADY_DEFINED
+#else
+#define yy_init_buffer fts0b_init_buffer
+#endif
+
+#ifdef yy_flush_buffer
+#define fts0b_flush_buffer_ALREADY_DEFINED
+#else
+#define yy_flush_buffer fts0b_flush_buffer
+#endif
+
+#ifdef yy_load_buffer_state
+#define fts0b_load_buffer_state_ALREADY_DEFINED
+#else
+#define yy_load_buffer_state fts0b_load_buffer_state
+#endif
+
+#ifdef yy_switch_to_buffer
+#define fts0b_switch_to_buffer_ALREADY_DEFINED
+#else
+#define yy_switch_to_buffer fts0b_switch_to_buffer
+#endif
+
+#ifdef yypush_buffer_state
+#define fts0bpush_buffer_state_ALREADY_DEFINED
+#else
+#define yypush_buffer_state fts0bpush_buffer_state
+#endif
+
+#ifdef yypop_buffer_state
+#define fts0bpop_buffer_state_ALREADY_DEFINED
+#else
+#define yypop_buffer_state fts0bpop_buffer_state
+#endif
+
+#ifdef yyensure_buffer_stack
+#define fts0bensure_buffer_stack_ALREADY_DEFINED
+#else
+#define yyensure_buffer_stack fts0bensure_buffer_stack
+#endif
+
+#ifdef yylex
+#define fts0blex_ALREADY_DEFINED
+#else
+#define yylex fts0blex
+#endif
+
+#ifdef yyrestart
+#define fts0brestart_ALREADY_DEFINED
+#else
+#define yyrestart fts0brestart
+#endif
+
+#ifdef yylex_init
+#define fts0blex_init_ALREADY_DEFINED
+#else
+#define yylex_init fts0blex_init
+#endif
+
+#ifdef yylex_init_extra
+#define fts0blex_init_extra_ALREADY_DEFINED
+#else
+#define yylex_init_extra fts0blex_init_extra
+#endif
+
+#ifdef yylex_destroy
+#define fts0blex_destroy_ALREADY_DEFINED
+#else
+#define yylex_destroy fts0blex_destroy
+#endif
+
+#ifdef yyget_debug
+#define fts0bget_debug_ALREADY_DEFINED
+#else
+#define yyget_debug fts0bget_debug
+#endif
+
+#ifdef yyset_debug
+#define fts0bset_debug_ALREADY_DEFINED
+#else
+#define yyset_debug fts0bset_debug
+#endif
+
+#ifdef yyget_extra
+#define fts0bget_extra_ALREADY_DEFINED
+#else
+#define yyget_extra fts0bget_extra
+#endif
+
+#ifdef yyset_extra
+#define fts0bset_extra_ALREADY_DEFINED
+#else
+#define yyset_extra fts0bset_extra
+#endif
+
+#ifdef yyget_in
+#define fts0bget_in_ALREADY_DEFINED
+#else
+#define yyget_in fts0bget_in
+#endif
+
+#ifdef yyset_in
+#define fts0bset_in_ALREADY_DEFINED
+#else
+#define yyset_in fts0bset_in
+#endif
+
+#ifdef yyget_out
+#define fts0bget_out_ALREADY_DEFINED
+#else
+#define yyget_out fts0bget_out
+#endif
+
+#ifdef yyset_out
+#define fts0bset_out_ALREADY_DEFINED
+#else
+#define yyset_out fts0bset_out
+#endif
+
+#ifdef yyget_leng
+#define fts0bget_leng_ALREADY_DEFINED
+#else
+#define yyget_leng fts0bget_leng
+#endif
+
+#ifdef yyget_text
+#define fts0bget_text_ALREADY_DEFINED
+#else
+#define yyget_text fts0bget_text
+#endif
+
+#ifdef yyget_lineno
+#define fts0bget_lineno_ALREADY_DEFINED
+#else
+#define yyget_lineno fts0bget_lineno
+#endif
+
+#ifdef yyset_lineno
+#define fts0bset_lineno_ALREADY_DEFINED
+#else
+#define yyset_lineno fts0bset_lineno
+#endif
+
+#ifdef yyget_column
+#define fts0bget_column_ALREADY_DEFINED
+#else
+#define yyget_column fts0bget_column
+#endif
+
+#ifdef yyset_column
+#define fts0bset_column_ALREADY_DEFINED
+#else
+#define yyset_column fts0bset_column
+#endif
+
+#ifdef yywrap
+#define fts0bwrap_ALREADY_DEFINED
+#else
+#define yywrap fts0bwrap
+#endif
+
+#ifdef yyalloc
+#define fts0balloc_ALREADY_DEFINED
+#else
+#define yyalloc fts0balloc
+#endif
+
+#ifdef yyrealloc
+#define fts0brealloc_ALREADY_DEFINED
+#else
+#define yyrealloc fts0brealloc
+#endif
+
+#ifdef yyfree
+#define fts0bfree_ALREADY_DEFINED
+#else
+#define yyfree fts0bfree
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#ifndef SIZE_MAX
+#define SIZE_MAX (~(size_t)0)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+/* begin standard C++ headers. */
+
+/* TODO: this is always defined, so inline it */
+#define yyconst const
+
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define yynoreturn __attribute__((__noreturn__))
+#else
+#define yynoreturn
+#endif
+
+/* Returned upon end-of-file. */
+#define YY_NULL 0
+
+/* Promotes a possibly negative, possibly signed char to an
+ * integer in range [0..255] for use as an array index.
+ */
+#define YY_SC_TO_UI(c) ((YY_CHAR) (c))
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+ are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Enter a start condition. This macro really ought to take a parameter,
+ * but we do it the disgusting crufty way forced on us by the ()-less
+ * definition of BEGIN.
+ */
+#define BEGIN yyg->yy_start = 1 + 2 *
+/* Translate the current start state into a value that can be later handed
+ * to BEGIN to return to the state. The YYSTATE alias is for lex
+ * compatibility.
+ */
+#define YY_START ((yyg->yy_start - 1) / 2)
+#define YYSTATE YY_START
+/* Action number for EOF rule of a given start state. */
+#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
+/* Special action meaning "start processing a new file". */
+#define YY_NEW_FILE yyrestart( yyin , yyscanner )
+#define YY_END_OF_BUFFER_CHAR 0
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+/* The state buf must be large enough to hold one state per character in the main buffer.
+ */
+#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type))
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#define EOB_ACT_CONTINUE_SCAN 0
+#define EOB_ACT_END_OF_FILE 1
+#define EOB_ACT_LAST_MATCH 2
+
+ #define YY_LESS_LINENO(n)
+ #define YY_LINENO_REWIND_TO(ptr)
+
+/* Return all but the first "n" matched characters back to the input stream. */
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ *yy_cp = yyg->yy_hold_char; \
+ YY_RESTORE_YY_MORE_OFFSET \
+ yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \
+ YY_DO_BEFORE_ACTION; /* set up yytext again */ \
+ } \
+ while ( 0 )
+#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner )
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+ {
+ FILE *yy_input_file;
+
+ char *yy_ch_buf; /* input buffer */
+ char *yy_buf_pos; /* current position in input buffer */
+
+ /* Size of input buffer in bytes, not including room for EOB
+ * characters.
+ */
+ int yy_buf_size;
+
+ /* Number of characters read into yy_ch_buf, not including EOB
+ * characters.
+ */
+ int yy_n_chars;
+
+ /* Whether we "own" the buffer - i.e., we know we created it,
+ * and can realloc() it to grow it, and should free() it to
+ * delete it.
+ */
+ int yy_is_our_buffer;
+
+ /* Whether this is an "interactive" input source; if so, and
+ * if we're using stdio for input, then we want to use getc()
+ * instead of fread(), to make sure we stop fetching input after
+ * each newline.
+ */
+ int yy_is_interactive;
+
+ /* Whether we're considered to be at the beginning of a line.
+ * If so, '^' rules will be active on the next match, otherwise
+ * not.
+ */
+ int yy_at_bol;
+
+ int yy_bs_lineno; /**< The line count. */
+ int yy_bs_column; /**< The column count. */
+
+ /* Whether to try to fill the input buffer when we reach the
+ * end of it.
+ */
+ int yy_fill_buffer;
+
+ int yy_buffer_status;
+
+#define YY_BUFFER_NEW 0
+#define YY_BUFFER_NORMAL 1
+ /* When an EOF's been seen but there's still some text to process
+ * then we mark the buffer as YY_EOF_PENDING, to indicate that we
+ * shouldn't try reading from the input source any more. We might
+ * still have a bunch of tokens to match, though, because of
+ * possible backing-up.
+ *
+ * When we actually see the EOF, we change the status to "new"
+ * (via yyrestart()), so that the user can continue scanning by
+ * just pointing yyin at a new input file.
+ */
+#define YY_BUFFER_EOF_PENDING 2
+
+ };
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+/* We provide macros for accessing buffer states in case in the
+ * future we want to put the buffer states in a more general
+ * "scanner state".
+ *
+ * Returns the top of the stack, or NULL.
+ */
+#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \
+ ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \
+ : 0)
+/* Same as previous macro, but useful when we know that the buffer stack is not
+ * NULL or when we need an lvalue. For internal use only.
+ */
+#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top]
+
+void yyrestart ( FILE *input_file , yyscan_t yyscanner );
+void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner );
+void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+void yypop_buffer_state ( yyscan_t yyscanner );
+
+static void yyensure_buffer_stack ( yyscan_t yyscanner );
+static void yy_load_buffer_state ( yyscan_t yyscanner );
+static void yy_init_buffer ( YY_BUFFER_STATE b, FILE *file , yyscan_t yyscanner );
+#define YY_FLUSH_BUFFER yy_flush_buffer( YY_CURRENT_BUFFER , yyscanner)
+
+YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner );
+
+void *yyalloc ( yy_size_t , yyscan_t yyscanner );
+void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner );
+void yyfree ( void * , yyscan_t yyscanner );
+
+#define yy_new_buffer yy_create_buffer
+#define yy_set_interactive(is_interactive) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){ \
+ yyensure_buffer_stack (yyscanner); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \
+ }
+#define yy_set_bol(at_bol) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){\
+ yyensure_buffer_stack (yyscanner); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \
+ }
+#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
+
+/* Begin user sect3 */
+
+#define fts0bwrap(yyscanner) (/*CONSTCOND*/1)
+#define YY_SKIP_YYWRAP
+typedef flex_uint8_t YY_CHAR;
+
+typedef int yy_state_type;
+
+#define yytext_ptr yytext_r
+
+static yy_state_type yy_get_previous_state ( yyscan_t yyscanner );
+static yy_state_type yy_try_NUL_trans ( yy_state_type current_state , yyscan_t yyscanner);
+static int yy_get_next_buffer ( yyscan_t yyscanner );
+static void yynoreturn yy_fatal_error ( const char* msg , yyscan_t yyscanner );
+
+/* Done after the current pattern has been matched and before the
+ * corresponding action - sets up yytext.
+ */
+#define YY_DO_BEFORE_ACTION \
+ yyg->yytext_ptr = yy_bp; \
+ yyleng = (int) (yy_cp - yy_bp); \
+ yyg->yy_hold_char = *yy_cp; \
+ *yy_cp = '\0'; \
+ yyg->yy_c_buf_p = yy_cp;
+#define YY_NUM_RULES 7
+#define YY_END_OF_BUFFER 8
+/* This struct is not used in this scanner,
+ but its presence is necessary. */
+struct yy_trans_info
+ {
+ flex_int32_t yy_verify;
+ flex_int32_t yy_nxt;
+ };
+static const flex_int16_t yy_accept[19] =
+ { 0,
+ 4, 4, 8, 4, 1, 6, 1, 7, 7, 2,
+ 3, 4, 1, 1, 0, 5, 3, 0
+ } ;
+
+static const YY_CHAR yy_ec[256] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 4, 1, 5, 1, 1, 6, 1, 1, 7,
+ 7, 7, 7, 1, 7, 1, 1, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 1, 1, 7,
+ 1, 7, 1, 7, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 7, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1
+ } ;
+
+static const YY_CHAR yy_meta[9] =
+ { 0,
+ 1, 2, 3, 4, 5, 5, 5, 1
+ } ;
+
+static const flex_int16_t yy_base[22] =
+ { 0,
+ 0, 0, 22, 0, 7, 23, 0, 14, 23, 23,
+ 7, 0, 0, 0, 5, 23, 0, 23, 11, 12,
+ 16
+ } ;
+
+static const flex_int16_t yy_def[22] =
+ { 0,
+ 18, 1, 18, 19, 19, 18, 20, 21, 18, 18,
+ 19, 19, 5, 20, 21, 18, 11, 0, 18, 18,
+ 18
+ } ;
+
+static const flex_int16_t yy_nxt[32] =
+ { 0,
+ 4, 5, 6, 7, 8, 9, 10, 11, 13, 16,
+ 14, 12, 12, 14, 17, 14, 15, 15, 16, 15,
+ 15, 18, 3, 18, 18, 18, 18, 18, 18, 18,
+ 18
+ } ;
+
+static const flex_int16_t yy_chk[32] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 5, 15,
+ 5, 19, 19, 20, 11, 20, 21, 21, 8, 21,
+ 21, 3, 18, 18, 18, 18, 18, 18, 18, 18,
+ 18
+ } ;
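+
+/* A sketch of how the compressed tables above drive one transition
+ * (mirroring the inner loop of yylex() below; `s` is the current state
+ * and `sym` one input byte):
+ *
+ *	YY_CHAR c = yy_ec[(unsigned char) sym];	// byte -> equivalence class
+ *	while ( yy_chk[yy_base[s] + c] != s )	// entry owned by another state?
+ *		{
+ *		s = yy_def[s];			// fall back to default state
+ *		if ( s >= 19 )			// template states remap the class
+ *			c = yy_meta[c];
+ *		}
+ *	s = yy_nxt[yy_base[s] + c];		// take the transition
+ *
+ * State 18 is the jam (dead) state, and yy_accept[s] names the rule
+ * accepted in state s (0 means "back up to the last accepting state").
+ */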
+
+/* The intent behind this definition is that it'll catch
+ * any uses of REJECT which flex missed.
+ */
+#define REJECT reject_used_but_not_detected
+#define yymore() yymore_used_but_not_detected
+#define YY_MORE_ADJ 0
+#define YY_RESTORE_YY_MORE_OFFSET
+#line 1 "fts0blex.l"
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**
+ * @file fts/fts0blex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+#line 27 "fts0blex.l"
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner)
+#define exit(A) ut_error
+
+#line 675 "fts0blex.cc"
+#define YY_NO_INPUT 1
+#line 677 "fts0blex.cc"
+
+#define INITIAL 0
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+/* Holds the entire state of the reentrant scanner. */
+struct yyguts_t
+ {
+
+ /* User-defined. Not touched by flex. */
+ YY_EXTRA_TYPE yyextra_r;
+
+ /* The rest are the same as the globals declared in the non-reentrant scanner. */
+ FILE *yyin_r, *yyout_r;
+ size_t yy_buffer_stack_top; /**< index of top of stack. */
+ size_t yy_buffer_stack_max; /**< capacity of stack. */
+ YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */
+ char yy_hold_char;
+ int yy_n_chars;
+ int yyleng_r;
+ char *yy_c_buf_p;
+ int yy_init;
+ int yy_start;
+ int yy_did_buffer_switch_on_eof;
+ int yy_start_stack_ptr;
+ int yy_start_stack_depth;
+ int *yy_start_stack;
+ yy_state_type yy_last_accepting_state;
+ char* yy_last_accepting_cpos;
+
+ int yylineno_r;
+ int yy_flex_debug_r;
+
+ char *yytext_r;
+ int yy_more_flag;
+ int yy_more_len;
+
+ }; /* end struct yyguts_t */
+
+static int yy_init_globals ( yyscan_t yyscanner );
+
+int yylex_init (yyscan_t* scanner);
+
+int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner);
+
+/* Accessor methods to globals.
+ These are made visible to non-reentrant scanners for convenience. */
+
+int yylex_destroy ( yyscan_t yyscanner );
+
+int yyget_debug ( yyscan_t yyscanner );
+
+void yyset_debug ( int debug_flag , yyscan_t yyscanner );
+
+YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner );
+
+void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner );
+
+FILE *yyget_in ( yyscan_t yyscanner );
+
+void yyset_in ( FILE * _in_str , yyscan_t yyscanner );
+
+FILE *yyget_out ( yyscan_t yyscanner );
+
+void yyset_out ( FILE * _out_str , yyscan_t yyscanner );
+
+ int yyget_leng ( yyscan_t yyscanner );
+
+char *yyget_text ( yyscan_t yyscanner );
+
+int yyget_lineno ( yyscan_t yyscanner );
+
+void yyset_lineno ( int _line_number , yyscan_t yyscanner );
+
+int yyget_column ( yyscan_t yyscanner );
+
+void yyset_column ( int _column_no , yyscan_t yyscanner );
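+
+/* The yyextra_r member of yyguts_t is per-scanner user data, reachable
+ * through the accessors above.  A short example of carrying caller
+ * context through a reentrant scan (my_ctx_t is a made-up type for
+ * illustration):
+ *
+ *	typedef struct { int tokens_seen; } my_ctx_t;
+ *	my_ctx_t ctx = { 0 };
+ *	yyscan_t scanner;
+ *	yylex_init_extra(&ctx, &scanner);
+ *	((my_ctx_t*) yyget_extra(scanner))->tokens_seen++;	// same object as ctx
+ *	yylex_destroy(scanner);
+ */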
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap ( yyscan_t yyscanner );
+#else
+extern int yywrap ( yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef YY_NO_UNPUT
+
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen ( const char * , yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+static int yyinput ( yyscan_t yyscanner );
+#else
+static int input ( yyscan_t yyscanner );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ */
+#define ECHO do { if (fwrite( yytext, (size_t) yyleng, 1, yyout )) {} } while (0)
+#endif
+
+/* Gets input and stuffs it into "buf". The number of characters read,
+ * or YY_NULL, is returned in "result".
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+ { \
+ int c = '*'; \
+ int n; \
+ for ( n = 0; n < max_size && \
+ (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+ buf[n] = (char) c; \
+ if ( c == '\n' ) \
+ buf[n++] = (char) c; \
+ if ( c == EOF && ferror( yyin ) ) \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ result = n; \
+ } \
+ else \
+ { \
+ errno=0; \
+ while ( (result = (int) fread(buf, 1, (yy_size_t) max_size, yyin)) == 0 && ferror(yyin)) \
+ { \
+ if( errno != EINTR) \
+ { \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ break; \
+ } \
+ errno=0; \
+ clearerr(yyin); \
+ } \
+ }\
+\
+
+#endif
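+
+/* YY_INPUT above is only the default; section 1 of the .l file may
+ * override it.  A sketch of a replacement that feeds the scanner from an
+ * in-memory string instead of yyin (my_src/my_len are made-up globals
+ * for the example; avail == 0 signals EOF, i.e. YY_NULL):
+ *
+ *	#undef YY_INPUT
+ *	#define YY_INPUT(buf, result, max_size)		\
+ *		{					\
+ *		size_t avail = my_len;			\
+ *		if ( avail > (size_t) (max_size) )	\
+ *			avail = (size_t) (max_size);	\
+ *		memcpy( (buf), my_src, avail );		\
+ *		my_src += avail;			\
+ *		my_len -= avail;			\
+ *		(result) = (int) avail;			\
+ *		}
+ */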
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner)
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int yylex (yyscan_t yyscanner);
+
+#define YY_DECL int yylex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* Code executed at the beginning of each rule, after yytext and yyleng
+ * have been set up.
+ */
+#ifndef YY_USER_ACTION
+#define YY_USER_ACTION
+#endif
+
+/* Code executed at the end of each rule. */
+#ifndef YY_BREAK
+#define YY_BREAK /*LINTED*/break;
+#endif
+
+#define YY_RULE_SETUP \
+ YY_USER_ACTION
+
+/** The main scanner function which does all the work.
+ */
+YY_DECL
+{
+ yy_state_type yy_current_state;
+ char *yy_cp, *yy_bp;
+ int yy_act;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if ( !yyg->yy_init )
+ {
+ yyg->yy_init = 1;
+
+#ifdef YY_USER_INIT
+ YY_USER_INIT;
+#endif
+
+ if ( ! yyg->yy_start )
+ yyg->yy_start = 1; /* first start state */
+
+ if ( ! yyin )
+ yyin = stdin;
+
+ if ( ! yyout )
+ yyout = stdout;
+
+ if ( ! YY_CURRENT_BUFFER ) {
+ yyensure_buffer_stack (yyscanner);
+ YY_CURRENT_BUFFER_LVALUE =
+ yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner);
+ }
+
+ yy_load_buffer_state( yyscanner );
+ }
+
+ {
+#line 44 "fts0blex.l"
+
+
+#line 938 "fts0blex.cc"
+
+ while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */
+ {
+ yy_cp = yyg->yy_c_buf_p;
+
+ /* Support of yytext. */
+ *yy_cp = yyg->yy_hold_char;
+
+ /* yy_bp points to the position in yy_ch_buf of the start of
+ * the current run.
+ */
+ yy_bp = yy_cp;
+
+ yy_current_state = yyg->yy_start;
+yy_match:
+ do
+ {
+ YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ;
+ if ( yy_accept[yy_current_state] )
+ {
+ yyg->yy_last_accepting_state = yy_current_state;
+ yyg->yy_last_accepting_cpos = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 19 )
+ yy_c = yy_meta[yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c];
+ ++yy_cp;
+ }
+ while ( yy_current_state != 18 );
+ yy_cp = yyg->yy_last_accepting_cpos;
+ yy_current_state = yyg->yy_last_accepting_state;
+
+yy_find_action:
+ yy_act = yy_accept[yy_current_state];
+
+ YY_DO_BEFORE_ACTION;
+
+do_action: /* This label is used only to access EOF actions. */
+
+ switch ( yy_act )
+ { /* beginning of action switch */
+ case 0: /* must back up */
+ /* undo the effects of YY_DO_BEFORE_ACTION */
+ *yy_cp = yyg->yy_hold_char;
+ yy_cp = yyg->yy_last_accepting_cpos;
+ yy_current_state = yyg->yy_last_accepting_state;
+ goto yy_find_action;
+
+case 1:
+YY_RULE_SETUP
+#line 46 "fts0blex.l"
+/* Ignore whitespace */ ;
+ YY_BREAK
+case 2:
+YY_RULE_SETUP
+#line 48 "fts0blex.l"
+{
+ val->oper = fts0bget_text(yyscanner)[0];
+
+ return(val->oper);
+}
+ YY_BREAK
+case 3:
+YY_RULE_SETUP
+#line 54 "fts0blex.l"
+{
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+ return(FTS_NUMB);
+}
+ YY_BREAK
+case 4:
+YY_RULE_SETUP
+#line 60 "fts0blex.l"
+{
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+ return(FTS_TERM);
+}
+ YY_BREAK
+case 5:
+YY_RULE_SETUP
+#line 66 "fts0blex.l"
+{
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+ return(FTS_TEXT);
+}
+ YY_BREAK
+case 6:
+/* rule 6 can match eol */
+YY_RULE_SETUP
+#line 72 "fts0blex.l"
+
+ YY_BREAK
+case 7:
+YY_RULE_SETUP
+#line 74 "fts0blex.l"
+ECHO;
+ YY_BREAK
+#line 1043 "fts0blex.cc"
+case YY_STATE_EOF(INITIAL):
+ yyterminate();
+
+ case YY_END_OF_BUFFER:
+ {
+ /* Amount of text matched not including the EOB char. */
+ int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1;
+
+ /* Undo the effects of YY_DO_BEFORE_ACTION. */
+ *yy_cp = yyg->yy_hold_char;
+ YY_RESTORE_YY_MORE_OFFSET
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
+ {
+ /* We're scanning a new file or input source. It's
+ * possible that this happened because the user
+ * just pointed yyin at a new source and called
+ * yylex(). If so, then we have to assure
+ * consistency between YY_CURRENT_BUFFER and our
+ * globals. Here is the right place to do so, because
+ * this is the first action (other than possibly a
+ * back-up) that will match for the new input source.
+ */
+ yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
+ }
+
+ /* Note that here we test for yy_c_buf_p "<=" to the position
+ * of the first EOB in the buffer, since yy_c_buf_p will
+ * already have been incremented past the NUL character
+ * (since all states make transitions on EOB to the
+ * end-of-buffer state). Contrast this with the test
+ * in input().
+ */
+ if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+ { /* This was really a NUL. */
+ yy_state_type yy_next_state;
+
+ yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( yyscanner );
+
+ /* Okay, we're now positioned to make the NUL
+ * transition. We couldn't have
+ * yy_get_previous_state() go ahead and do it
+ * for us because it doesn't know how to deal
+ * with the possibility of jamming (and we don't
+ * want to build jamming into it because then it
+ * will run more slowly).
+ */
+
+ yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner);
+
+ yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+
+ if ( yy_next_state )
+ {
+ /* Consume the NUL. */
+ yy_cp = ++yyg->yy_c_buf_p;
+ yy_current_state = yy_next_state;
+ goto yy_match;
+ }
+
+ else
+ {
+ yy_cp = yyg->yy_last_accepting_cpos;
+ yy_current_state = yyg->yy_last_accepting_state;
+ goto yy_find_action;
+ }
+ }
+
+ else switch ( yy_get_next_buffer( yyscanner ) )
+ {
+ case EOB_ACT_END_OF_FILE:
+ {
+ yyg->yy_did_buffer_switch_on_eof = 0;
+
+ if ( yywrap( yyscanner ) )
+ {
+ /* Note: because we've taken care in
+ * yy_get_next_buffer() to have set up
+ * yytext, we can now set up
+ * yy_c_buf_p so that if some total
+ * hoser (like flex itself) wants to
+ * call the scanner after we return the
+ * YY_NULL, it'll still work - another
+ * YY_NULL will get returned.
+ */
+ yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ;
+
+ yy_act = YY_STATE_EOF(YY_START);
+ goto do_action;
+ }
+
+ else
+ {
+ if ( ! yyg->yy_did_buffer_switch_on_eof )
+ YY_NEW_FILE;
+ }
+ break;
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ yyg->yy_c_buf_p =
+ yyg->yytext_ptr + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( yyscanner );
+
+ yy_cp = yyg->yy_c_buf_p;
+ yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+ goto yy_match;
+
+ case EOB_ACT_LAST_MATCH:
+ yyg->yy_c_buf_p =
+ &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars];
+
+ yy_current_state = yy_get_previous_state( yyscanner );
+
+ yy_cp = yyg->yy_c_buf_p;
+ yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+ goto yy_find_action;
+ }
+ break;
+ }
+
+ default:
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--no action found" );
+ } /* end of action switch */
+ } /* end of scanning one token */
+ } /* end of user's declarations */
+} /* end of yylex */
+
+/* yy_get_next_buffer - try to read in a new buffer
+ *
+ * Returns a code representing an action:
+ *	EOB_ACT_LAST_MATCH - process the text matched so far, then handle EOF
+ * EOB_ACT_CONTINUE_SCAN - continue scanning from current position
+ * EOB_ACT_END_OF_FILE - end of file
+ */
+static int yy_get_next_buffer (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf;
+ char *source = yyg->yytext_ptr;
+ int number_to_move, i;
+ int ret_val;
+
+ if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] )
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--end of buffer missed" );
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 )
+ { /* Don't try to fill the buffer, so this is an EOF. */
+ if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 )
+ {
+ /* We matched a single character, the EOB, so
+ * treat this as a final EOF.
+ */
+ return EOB_ACT_END_OF_FILE;
+ }
+
+ else
+ {
+ /* We matched some text prior to the EOB, first
+ * process it.
+ */
+ return EOB_ACT_LAST_MATCH;
+ }
+ }
+
+ /* Try to read more data. */
+
+ /* First move last chars to start of buffer. */
+ number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr - 1);
+
+ for ( i = 0; i < number_to_move; ++i )
+ *(dest++) = *(source++);
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING )
+ /* don't do the read, it's not guaranteed to return an EOF,
+ * just force an EOF
+ */
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0;
+
+ else
+ {
+ int num_to_read =
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1;
+
+ while ( num_to_read <= 0 )
+ { /* Not enough room in the buffer - grow it. */
+
+ /* just a shorter name for the current buffer */
+ YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE;
+
+ int yy_c_buf_p_offset =
+ (int) (yyg->yy_c_buf_p - b->yy_ch_buf);
+
+ if ( b->yy_is_our_buffer )
+ {
+ int new_size = b->yy_buf_size * 2;
+
+ if ( new_size <= 0 )
+ b->yy_buf_size += b->yy_buf_size / 8;
+ else
+ b->yy_buf_size *= 2;
+
+ b->yy_ch_buf = (char *)
+					/* Include room for 2 EOB chars. */
+ yyrealloc( (void *) b->yy_ch_buf,
+ (yy_size_t) (b->yy_buf_size + 2) , yyscanner );
+ }
+ else
+ /* Can't grow it, we don't own it. */
+ b->yy_ch_buf = NULL;
+
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR(
+ "fatal error - scanner input buffer overflow" );
+
+ yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset];
+
+ num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size -
+ number_to_move - 1;
+
+ }
+
+ if ( num_to_read > YY_READ_BUF_SIZE )
+ num_to_read = YY_READ_BUF_SIZE;
+
+ /* Read in more data. */
+ YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
+ yyg->yy_n_chars, num_to_read );
+
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+ }
+
+ if ( yyg->yy_n_chars == 0 )
+ {
+ if ( number_to_move == YY_MORE_ADJ )
+ {
+ ret_val = EOB_ACT_END_OF_FILE;
+ yyrestart( yyin , yyscanner);
+ }
+
+ else
+ {
+ ret_val = EOB_ACT_LAST_MATCH;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status =
+ YY_BUFFER_EOF_PENDING;
+ }
+ }
+
+ else
+ ret_val = EOB_ACT_CONTINUE_SCAN;
+
+ if ((yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) {
+ /* Extend the array by 50%, plus the number we really need. */
+ int new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1);
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc(
+ (void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf, (yy_size_t) new_size , yyscanner );
+ if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" );
+ /* "- 2" to take care of EOB's */
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_size = (int) (new_size - 2);
+ }
+
+ yyg->yy_n_chars += number_to_move;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR;
+
+ yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0];
+
+ return ret_val;
+}
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+ static yy_state_type yy_get_previous_state (yyscan_t yyscanner)
+{
+ yy_state_type yy_current_state;
+ char *yy_cp;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ yy_current_state = yyg->yy_start;
+
+ for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp )
+ {
+ YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1);
+ if ( yy_accept[yy_current_state] )
+ {
+ yyg->yy_last_accepting_state = yy_current_state;
+ yyg->yy_last_accepting_cpos = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 19 )
+ yy_c = yy_meta[yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c];
+ }
+
+ return yy_current_state;
+}
+
+/* yy_try_NUL_trans - try to make a transition on the NUL character
+ *
+ * synopsis
+ * next_state = yy_try_NUL_trans( current_state );
+ */
+ static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner)
+{
+ int yy_is_jam;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */
+ char *yy_cp = yyg->yy_c_buf_p;
+
+ YY_CHAR yy_c = 1;
+ if ( yy_accept[yy_current_state] )
+ {
+ yyg->yy_last_accepting_state = yy_current_state;
+ yyg->yy_last_accepting_cpos = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 19 )
+ yy_c = yy_meta[yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c];
+ yy_is_jam = (yy_current_state == 18);
+
+ (void)yyg;
+ return yy_is_jam ? 0 : yy_current_state;
+}
+
+#ifndef YY_NO_UNPUT
+
+#endif
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+ static int yyinput (yyscan_t yyscanner)
+#else
+ static int input (yyscan_t yyscanner)
+#endif
+
+{
+ int c;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ *yyg->yy_c_buf_p = yyg->yy_hold_char;
+
+ if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR )
+ {
+ /* yy_c_buf_p now points to the character we want to return.
+ * If this occurs *before* the EOB characters, then it's a
+ * valid NUL; if not, then we've hit the end of the buffer.
+ */
+ if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+ /* This was really a NUL. */
+ *yyg->yy_c_buf_p = '\0';
+
+ else
+ { /* need more input */
+ int offset = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr);
+ ++yyg->yy_c_buf_p;
+
+ switch ( yy_get_next_buffer( yyscanner ) )
+ {
+ case EOB_ACT_LAST_MATCH:
+					/* This happens because yy_get_next_buffer()
+ * sees that we've accumulated a
+ * token and flags that we need to
+ * try matching the token before
+ * proceeding. But for input(),
+ * there's no matching to consider.
+ * So convert the EOB_ACT_LAST_MATCH
+ * to EOB_ACT_END_OF_FILE.
+ */
+
+ /* Reset buffer status. */
+ yyrestart( yyin , yyscanner);
+
+ /*FALLTHROUGH*/
+
+ case EOB_ACT_END_OF_FILE:
+ {
+ if ( yywrap( yyscanner ) )
+ return 0;
+
+ if ( ! yyg->yy_did_buffer_switch_on_eof )
+ YY_NEW_FILE;
+#ifdef __cplusplus
+ return yyinput(yyscanner);
+#else
+ return input(yyscanner);
+#endif
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ yyg->yy_c_buf_p = yyg->yytext_ptr + offset;
+ break;
+ }
+ }
+ }
+
+ c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */
+ *yyg->yy_c_buf_p = '\0'; /* preserve yytext */
+ yyg->yy_hold_char = *++yyg->yy_c_buf_p;
+
+ return c;
+}
+#endif /* ifndef YY_NO_INPUT */
+
+/** Immediately switch to a different input stream.
+ * @param input_file A readable stream.
+ * @param yyscanner The scanner object.
+ * @note This function does not reset the start condition to @c INITIAL .
+ */
+ void yyrestart (FILE * input_file , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if ( ! YY_CURRENT_BUFFER ){
+ yyensure_buffer_stack (yyscanner);
+ YY_CURRENT_BUFFER_LVALUE =
+ yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner);
+ }
+
+ yy_init_buffer( YY_CURRENT_BUFFER, input_file , yyscanner);
+ yy_load_buffer_state( yyscanner );
+}
+
+/** Switch to a different input buffer.
+ * @param new_buffer The new input buffer.
+ * @param yyscanner The scanner object.
+ */
+ void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* TODO. We should be able to replace this entire function body
+ * with
+ * yypop_buffer_state();
+ * yypush_buffer_state(new_buffer);
+ */
+ yyensure_buffer_stack (yyscanner);
+ if ( YY_CURRENT_BUFFER == new_buffer )
+ return;
+
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *yyg->yy_c_buf_p = yyg->yy_hold_char;
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+ }
+
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+ yy_load_buffer_state( yyscanner );
+
+ /* We don't actually know whether we did this switch during
+ * EOF (yywrap()) processing, but the only time this flag
+ * is looked at is after yywrap() is called, so it's safe
+ * to go ahead and always set it.
+ */
+ yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+static void yy_load_buffer_state (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos;
+ yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file;
+ yyg->yy_hold_char = *yyg->yy_c_buf_p;
+}
+
+/** Allocate and initialize an input buffer state.
+ * @param file A readable stream.
+ * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE.
+ * @param yyscanner The scanner object.
+ * @return the allocated buffer state.
+ */
+ YY_BUFFER_STATE yy_create_buffer (FILE * file, int size , yyscan_t yyscanner)
+{
+ YY_BUFFER_STATE b;
+
+ b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) , yyscanner );
+ if ( ! b )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
+
+ b->yy_buf_size = size;
+
+ /* yy_ch_buf has to be 2 characters longer than the size given because
+ * we need to put in 2 end-of-buffer characters.
+ */
+ b->yy_ch_buf = (char *) yyalloc( (yy_size_t) (b->yy_buf_size + 2) , yyscanner );
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
+
+ b->yy_is_our_buffer = 1;
+
+ yy_init_buffer( b, file , yyscanner);
+
+ return b;
+}
+
+/** Destroy the buffer.
+ * @param b a buffer created with yy_create_buffer()
+ * @param yyscanner The scanner object.
+ */
+ void yy_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if ( ! b )
+ return;
+
+ if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */
+ YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
+
+ if ( b->yy_is_our_buffer )
+ yyfree( (void *) b->yy_ch_buf , yyscanner );
+
+ yyfree( (void *) b , yyscanner );
+}
+
+/* Initializes or reinitializes a buffer.
+ * This function is sometimes called more than once on the same buffer,
+ * such as during a yyrestart() or at EOF.
+ */
+ static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file , yyscan_t yyscanner)
+
+{
+ int oerrno = errno;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ yy_flush_buffer( b , yyscanner);
+
+ b->yy_input_file = file;
+ b->yy_fill_buffer = 1;
+
+ /* If b is the current buffer, then yy_init_buffer was _probably_
+ * called from yyrestart() or through yy_get_next_buffer.
+ * In that case, we don't want to reset the lineno or column.
+ */
+ if (b != YY_CURRENT_BUFFER){
+ b->yy_bs_lineno = 1;
+ b->yy_bs_column = 0;
+ }
+
+ b->yy_is_interactive = 0;
+
+ errno = oerrno;
+}
+
+/** Discard all buffered characters. On the next scan, YY_INPUT will be called.
+ * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
+ * @param yyscanner The scanner object.
+ */
+ void yy_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ if ( ! b )
+ return;
+
+ b->yy_n_chars = 0;
+
+ /* We always need two end-of-buffer characters. The first causes
+ * a transition to the end-of-buffer state. The second causes
+ * a jam in that state.
+ */
+ b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR;
+ b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR;
+
+ b->yy_buf_pos = &b->yy_ch_buf[0];
+
+ b->yy_at_bol = 1;
+ b->yy_buffer_status = YY_BUFFER_NEW;
+
+ if ( b == YY_CURRENT_BUFFER )
+ yy_load_buffer_state( yyscanner );
+}
+
+/** Pushes the new state onto the stack. The new state becomes
+ * the current state. This function will allocate the stack
+ * if necessary.
+ * @param new_buffer The new state.
+ * @param yyscanner The scanner object.
+ */
+void yypush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ if (new_buffer == NULL)
+ return;
+
+ yyensure_buffer_stack(yyscanner);
+
+ /* This block is copied from yy_switch_to_buffer. */
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *yyg->yy_c_buf_p = yyg->yy_hold_char;
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+ }
+
+ /* Only push if top exists. Otherwise, replace top. */
+ if (YY_CURRENT_BUFFER)
+ yyg->yy_buffer_stack_top++;
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+
+ /* copied from yy_switch_to_buffer. */
+ yy_load_buffer_state( yyscanner );
+ yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+/** Removes and deletes the top of the stack, if present.
+ * The next element becomes the new top.
+ * @param yyscanner The scanner object.
+ */
+void yypop_buffer_state (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ if (!YY_CURRENT_BUFFER)
+ return;
+
+ yy_delete_buffer(YY_CURRENT_BUFFER , yyscanner);
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ if (yyg->yy_buffer_stack_top > 0)
+ --yyg->yy_buffer_stack_top;
+
+ if (YY_CURRENT_BUFFER) {
+ yy_load_buffer_state( yyscanner );
+ yyg->yy_did_buffer_switch_on_eof = 1;
+ }
+}
+
+/* Allocates the stack if it does not exist.
+ * Guarantees space for at least one push.
+ */
+static void yyensure_buffer_stack (yyscan_t yyscanner)
+{
+ yy_size_t num_to_alloc;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if (!yyg->yy_buffer_stack) {
+
+		/* The first allocation is for a single element, since we
+		 * don't know yet whether this scanner will even need a
+		 * stack; it grows on demand in the branch below.
+		 */
+		num_to_alloc = 1;
+ yyg->yy_buffer_stack = (struct yy_buffer_state**)yyalloc
+ (num_to_alloc * sizeof(struct yy_buffer_state*)
+ , yyscanner);
+ if ( ! yyg->yy_buffer_stack )
+ YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+
+ memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*));
+
+ yyg->yy_buffer_stack_max = num_to_alloc;
+ yyg->yy_buffer_stack_top = 0;
+ return;
+ }
+
+ if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){
+
+ /* Increase the buffer to prepare for a possible push. */
+ yy_size_t grow_size = 8 /* arbitrary grow size */;
+
+ num_to_alloc = yyg->yy_buffer_stack_max + grow_size;
+ yyg->yy_buffer_stack = (struct yy_buffer_state**)yyrealloc
+ (yyg->yy_buffer_stack,
+ num_to_alloc * sizeof(struct yy_buffer_state*)
+ , yyscanner);
+ if ( ! yyg->yy_buffer_stack )
+ YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+
+ /* zero only the new slots.*/
+ memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*));
+ yyg->yy_buffer_stack_max = num_to_alloc;
+ }
+}
+
+/** Set up the input buffer state to scan directly from a user-specified character buffer.
+ * @param base the character buffer
+ * @param size the size in bytes of the character buffer
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner)
+{
+ YY_BUFFER_STATE b;
+
+ if ( size < 2 ||
+ base[size-2] != YY_END_OF_BUFFER_CHAR ||
+ base[size-1] != YY_END_OF_BUFFER_CHAR )
+ /* They forgot to leave room for the EOB's. */
+ return NULL;
+
+ b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) , yyscanner );
+ if ( ! b )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" );
+
+ b->yy_buf_size = (int) (size - 2); /* "- 2" to take care of EOB's */
+ b->yy_buf_pos = b->yy_ch_buf = base;
+ b->yy_is_our_buffer = 0;
+ b->yy_input_file = NULL;
+ b->yy_n_chars = b->yy_buf_size;
+ b->yy_is_interactive = 0;
+ b->yy_at_bol = 1;
+ b->yy_fill_buffer = 0;
+ b->yy_buffer_status = YY_BUFFER_NEW;
+
+ yy_switch_to_buffer( b , yyscanner );
+
+ return b;
+}
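+
+/* Calling convention sketch for yy_scan_buffer() (the raw/src/len names
+ * are assumptions for the example): the caller supplies a writable
+ * buffer whose final two bytes are YY_END_OF_BUFFER_CHAR, and `size`
+ * includes those two sentinels -- exactly what yy_scan_bytes() below
+ * does internally:
+ *
+ *	yy_size_t n = len + 2;
+ *	char* raw = (char*) yyalloc( n, scanner );
+ *	memcpy( raw, src, len );
+ *	raw[len] = raw[len + 1] = YY_END_OF_BUFFER_CHAR;
+ *	YY_BUFFER_STATE b = yy_scan_buffer( raw, n, scanner );
+ *	// ... scan ...; caller still owns `raw` (yy_is_our_buffer == 0)
+ */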
+
+/** Set up the input buffer state to scan a string. The next call to yylex() will
+ * scan from a @e copy of @a str.
+ * @param yystr a NUL-terminated string to scan
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ * @note If you want to scan bytes that may contain NUL values, then use
+ * yy_scan_bytes() instead.
+ */
+YY_BUFFER_STATE yy_scan_string (const char * yystr , yyscan_t yyscanner)
+{
+
+ return yy_scan_bytes( yystr, (int) strlen(yystr) , yyscanner);
+}
+
+/** Set up the input buffer state to scan the given bytes. The next call to yylex() will
+ * scan from a @e copy of @a bytes.
+ * @param yybytes the byte buffer to scan
+ * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes.
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE yy_scan_bytes (const char * yybytes, int _yybytes_len , yyscan_t yyscanner)
+{
+ YY_BUFFER_STATE b;
+ char *buf;
+ yy_size_t n;
+ int i;
+
+ /* Get memory for full buffer, including space for trailing EOB's. */
+ n = (yy_size_t) (_yybytes_len + 2);
+ buf = (char *) yyalloc( n , yyscanner );
+ if ( ! buf )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" );
+
+ for ( i = 0; i < _yybytes_len; ++i )
+ buf[i] = yybytes[i];
+
+ buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR;
+
+ b = yy_scan_buffer( buf, n , yyscanner);
+ if ( ! b )
+ YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" );
+
+	/* It's okay to grow or otherwise modify this buffer, and we
+	 * should throw it away when we're done.
+	 */
+ b->yy_is_our_buffer = 1;
+
+ return b;
+}
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+static void yynoreturn yy_fatal_error (const char* msg , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ (void)yyg;
+ fprintf( stderr, "%s\n", msg );
+ exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code. */
+
+#undef yyless
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ yytext[yyleng] = yyg->yy_hold_char; \
+ yyg->yy_c_buf_p = yytext + yyless_macro_arg; \
+ yyg->yy_hold_char = *yyg->yy_c_buf_p; \
+ *yyg->yy_c_buf_p = '\0'; \
+ yyleng = yyless_macro_arg; \
+ } \
+ while ( 0 )
+
+/* Accessor methods (get/set functions) to struct members. */
+
+/** Get the user-defined data for this scanner.
+ * @param yyscanner The scanner object.
+ */
+YY_EXTRA_TYPE yyget_extra (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyextra;
+}
+
+/** Get the current line number.
+ * @param yyscanner The scanner object.
+ */
+int yyget_lineno (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if (! YY_CURRENT_BUFFER)
+ return 0;
+
+ return yylineno;
+}
+
+/** Get the current column number.
+ * @param yyscanner The scanner object.
+ */
+int yyget_column (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if (! YY_CURRENT_BUFFER)
+ return 0;
+
+ return yycolumn;
+}
+
+/** Get the input stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *yyget_in (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyin;
+}
+
+/** Get the output stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *yyget_out (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyout;
+}
+
+/** Get the length of the current token.
+ * @param yyscanner The scanner object.
+ */
+int yyget_leng (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyleng;
+}
+
+/** Get the current token.
+ * @param yyscanner The scanner object.
+ */
+
+char *yyget_text (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yytext;
+}
+
+/** Set the user-defined data. This data is never touched by the scanner.
+ * @param user_defined The data to be associated with this scanner.
+ * @param yyscanner The scanner object.
+ */
+void yyset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyextra = user_defined ;
+}
+
+/** Set the current line number.
+ * @param _line_number line number
+ * @param yyscanner The scanner object.
+ */
+void yyset_lineno (int _line_number , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* lineno is only valid if an input buffer exists. */
+ if (! YY_CURRENT_BUFFER )
+ YY_FATAL_ERROR( "yyset_lineno called with no buffer" );
+
+ yylineno = _line_number;
+}
+
+/** Set the current column.
+ * @param _column_no column number
+ * @param yyscanner The scanner object.
+ */
+void yyset_column (int _column_no , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* column is only valid if an input buffer exists. */
+ if (! YY_CURRENT_BUFFER )
+ YY_FATAL_ERROR( "yyset_column called with no buffer" );
+
+ yycolumn = _column_no;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * @param _in_str A readable stream.
+ * @param yyscanner The scanner object.
+ * @see yy_switch_to_buffer
+ */
+void yyset_in (FILE * _in_str , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyin = _in_str ;
+}
+
+void yyset_out (FILE * _out_str , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyout = _out_str ;
+}
+
+int yyget_debug (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yy_flex_debug;
+}
+
+void yyset_debug (int _bdebug , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yy_flex_debug = _bdebug ;
+}
+
+/* Accessor methods for yylval and yylloc */
+
+/* User-visible API */
+
+/* yylex_init is special because it creates the scanner itself, so it is
+ * the ONLY reentrant function that doesn't take the scanner as the last argument.
+ * That's why we explicitly handle the declaration, instead of using our macros.
+ */
+int yylex_init(yyscan_t* ptr_yy_globals)
+{
+ if (ptr_yy_globals == NULL){
+ errno = EINVAL;
+ return 1;
+ }
+
+ *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), NULL );
+
+ if (*ptr_yy_globals == NULL){
+ errno = ENOMEM;
+ return 1;
+ }
+
+ /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */
+ memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+
+ return yy_init_globals ( *ptr_yy_globals );
+}
+
+/* yylex_init_extra has the same functionality as yylex_init, but follows the
+ * convention of taking the scanner as the last argument. Note however, that
+ * this is a *pointer* to a scanner, as it will be allocated by this call (and
+ * is the reason, too, why this function also must handle its own declaration).
+ * The user defined value in the first argument will be available to yyalloc in
+ * the yyextra field.
+ */
+int yylex_init_extra( YY_EXTRA_TYPE yy_user_defined, yyscan_t* ptr_yy_globals )
+{
+ struct yyguts_t dummy_yyguts;
+
+ yyset_extra (yy_user_defined, &dummy_yyguts);
+
+ if (ptr_yy_globals == NULL){
+ errno = EINVAL;
+ return 1;
+ }
+
+ *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), &dummy_yyguts );
+
+ if (*ptr_yy_globals == NULL){
+ errno = ENOMEM;
+ return 1;
+ }
+
+ /* By setting to 0xAA, we expose bugs in
+ yy_init_globals. Leave at 0x00 for releases. */
+ memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+
+ yyset_extra (yy_user_defined, *ptr_yy_globals);
+
+ return yy_init_globals ( *ptr_yy_globals );
+}
+
+static int yy_init_globals (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ /* Initialization is the same as for the non-reentrant scanner.
+ * This function is called from yylex_destroy(), so don't allocate here.
+ */
+
+ yyg->yy_buffer_stack = NULL;
+ yyg->yy_buffer_stack_top = 0;
+ yyg->yy_buffer_stack_max = 0;
+ yyg->yy_c_buf_p = NULL;
+ yyg->yy_init = 0;
+ yyg->yy_start = 0;
+
+ yyg->yy_start_stack_ptr = 0;
+ yyg->yy_start_stack_depth = 0;
+ yyg->yy_start_stack = NULL;
+
+/* Defined in main.c */
+#ifdef YY_STDINIT
+ yyin = stdin;
+ yyout = stdout;
+#else
+ yyin = NULL;
+ yyout = NULL;
+#endif
+
+ /* For future reference: Set errno on error, since we are called by
+ * yylex_init()
+ */
+ return 0;
+}
+
+/* yylex_destroy is for both reentrant and non-reentrant scanners. */
+int yylex_destroy (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* Pop the buffer stack, destroying each element. */
+ while(YY_CURRENT_BUFFER){
+ yy_delete_buffer( YY_CURRENT_BUFFER , yyscanner );
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ yypop_buffer_state(yyscanner);
+ }
+
+ /* Destroy the stack itself. */
+ yyfree(yyg->yy_buffer_stack , yyscanner);
+ yyg->yy_buffer_stack = NULL;
+
+ /* Destroy the start condition stack. */
+ yyfree( yyg->yy_start_stack , yyscanner );
+ yyg->yy_start_stack = NULL;
+
+ /* Reset the globals. This is important in a non-reentrant scanner so the next time
+ * yylex() is called, initialization will occur. */
+ yy_init_globals( yyscanner);
+
+ /* Destroy the main struct (reentrant only). */
+ yyfree ( yyscanner , yyscanner );
+ yyscanner = NULL;
+ return 0;
+}
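+
+/* End-to-end lifecycle sketch for this scanner (an assumption-laden
+ * example: it presumes the usual flex prefix mapping of these yy*
+ * symbols to fts0b* names, applied when this file is generated, and
+ * YYSTYPE from fts0pars.h):
+ *
+ *	yyscan_t scanner;
+ *	YYSTYPE val;
+ *	fts0blex_init(&scanner);
+ *	fts0b_scan_string("apple +banana", scanner);
+ *	while (fts_blexer(&val, scanner) != 0) {
+ *		// one operator char or FTS_* token code per call
+ *	}
+ *	fts0blex_destroy(scanner);
+ */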
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char* s1, const char * s2, int n , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ (void)yyg;
+
+ int i;
+ for ( i = 0; i < n; ++i )
+ s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (const char * s , yyscan_t yyscanner)
+{
+ int n;
+ for ( n = 0; s[n]; ++n )
+ ;
+
+ return n;
+}
+#endif
+
+void *yyalloc (yy_size_t size , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ (void)yyg;
+ return malloc(size);
+}
+
+void *yyrealloc (void * ptr, yy_size_t size , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ (void)yyg;
+
+ /* The cast to (char *) in the following accommodates both
+ * implementations that use char* generic pointers, and those
+ * that use void* generic pointers. It works with the latter
+ * because both ANSI C and C++ allow castless assignment from
+ * any pointer type to void*, and deal with argument conversions
+ * as though doing an assignment.
+ */
+ return realloc(ptr, size);
+}
+
+void yyfree (void * ptr , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ (void)yyg;
+ free( (char *) ptr ); /* see yyrealloc() for (char *) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#line 74 "fts0blex.l"
+
+
diff --git a/storage/innobase/fts/fts0blex.l b/storage/innobase/fts/fts0blex.l
new file mode 100644
index 00000000..cf19cd0f
--- /dev/null
+++ b/storage/innobase/fts/fts0blex.l
@@ -0,0 +1,74 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0blex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner)
+#define exit(A) ut_error
+
+%}
+
+%option noinput
+%option nounput
+%option noyywrap
+%option nostdinit
+%option reentrant
+%option never-interactive
+
+%%
+
+[\t ]+ /* Ignore whitespace */ ;
+
+[*()+\-<>~@] {
+ val->oper = fts0bget_text(yyscanner)[0];
+
+ return(val->oper);
+}
+
+[0-9]+ {
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+ return(FTS_NUMB);
+}
+
+[^" \n*()+\-<>~@%]* {
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+ return(FTS_TERM);
+}
+
+\"[^\"\n]*\" {
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+ return(FTS_TEXT);
+}
+
+\n
+
+%%
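+
+/* Worked example: given the boolean query
+ *
+ *	+apple -"fruit juice"
+ *
+ * the rules above emit, in order: '+' (operator), FTS_TERM "apple",
+ * '-' (operator), FTS_TEXT "\"fruit juice\"" (quotes included in the
+ * matched text).  The whitespace between tokens is consumed by the
+ * first rule and produces nothing.
+ */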
diff --git a/storage/innobase/fts/fts0config.cc b/storage/innobase/fts/fts0config.cc
new file mode 100644
index 00000000..9e2b4091
--- /dev/null
+++ b/storage/innobase/fts/fts0config.cc
@@ -0,0 +1,432 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0config.cc
+Full Text Search configuration table.
+
+Created 2007/5/9 Sunny Bains
+***********************************************************************/
+
+#include "trx0roll.h"
+#include "row0sel.h"
+
+#include "fts0priv.h"
+
+/******************************************************************//**
+Callback function for fetching the config value.
+@return always returns TRUE */
+static
+ibool
+fts_config_fetch_value(
+/*===================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to
+ ib_vector_t */
+{
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ fts_string_t* value = static_cast<fts_string_t*>(user_arg);
+
+ dfield_t* dfield = que_node_get_val(node->select_list);
+ dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+ void* data = dfield_get_data(dfield);
+
+ ut_a(dtype_get_mtype(type) == DATA_VARCHAR);
+
+ if (len != UNIV_SQL_NULL) {
+ ulint max_len = ut_min(value->f_len - 1, len);
+
+ memcpy(value->f_str, data, max_len);
+ value->f_len = max_len;
+ value->f_str[value->f_len] = '\0';
+ }
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Get value from the config table. The caller must ensure that enough
+space is allocated for value to hold the column contents.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_value(
+/*=================*/
+ trx_t* trx, /*!< transaction */
+ fts_table_t* fts_table, /*!< in: the indexed
+ FTS table */
+ const char* name, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< out: value read from
+ config table */
+{
+ pars_info_t* info;
+ que_t* graph;
+ dberr_t error;
+ ulint name_len = strlen(name);
+ char table_name[MAX_FULL_NAME_LEN];
+
+ info = pars_info_create();
+
+ *value->f_str = '\0';
+ ut_a(value->f_len > 0);
+
+ pars_info_bind_function(info, "my_func", fts_config_fetch_value,
+ value);
+
+ /* The len field of value must be set to the max bytes that
+ it can hold. On a successful read, the len field will be set
+ to the actual number of bytes copied to value. */
+ pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len);
+
+ fts_table->suffix = "CONFIG";
+ fts_get_table_name(fts_table, table_name);
+ pars_info_bind_id(info, true, "table_name", table_name);
+
+ graph = fts_parse_sql(
+ fts_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS SELECT value FROM $table_name"
+ " WHERE key = :name;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ trx->op_info = "getting FTS config value";
+
+ error = fts_eval_sql(trx, graph);
+
+ mutex_enter(&dict_sys.mutex);
+ que_graph_free(graph);
+ mutex_exit(&dict_sys.mutex);
+
+ return(error);
+}
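+
+/* Usage sketch for fts_config_get_value() (a hedged example: trx and
+ * fts_table are assumed to exist in the caller; FTS_SYNCED_DOC_ID is
+ * one of the config keys declared in fts0fts.h):
+ *
+ *	byte		buf[FTS_MAX_CONFIG_VALUE_LEN + 1];
+ *	fts_string_t	value;
+ *
+ *	value.f_str = buf;
+ *	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;	// max bytes it can hold
+ *
+ *	dberr_t err = fts_config_get_value(
+ *		trx, &fts_table, FTS_SYNCED_DOC_ID, &value);
+ *	// on success, value.f_str is NUL-terminated and value.f_len
+ *	// holds the number of bytes actually copied
+ */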
+
+/*********************************************************************//**
+Create the config table name for retrieving index specific value.
+@return index config parameter name */
+char*
+fts_config_create_index_param_name(
+/*===============================*/
+ const char* param, /*!< in: base name of param */
+ const dict_index_t* index) /*!< in: index for config */
+{
+ ulint len;
+ char* name;
+
+ /* The format of the config name is: name_<index_id>. */
+ len = strlen(param);
+
+ /* Caller is responsible for deleting name. */
+ name = static_cast<char*>(ut_malloc_nokey(
+ len + FTS_AUX_MIN_TABLE_ID_LENGTH + 2));
+ ::strcpy(name, param);
+ name[len] = '_';
+
+ fts_write_object_id(index->id, name + len + 1);
+
+ return(name);
+}
+
+/******************************************************************//**
+Get value specific to an FTS index from the config table. The caller
+must ensure that enough space is allocated for value to hold the
+column contents.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_index_value(
+/*=======================*/
+ trx_t* trx, /*!< transaction */
+ dict_index_t* index, /*!< in: index */
+ const char* param, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< out: value read from
+ config table */
+{
+ char* name;
+ dberr_t error;
+ fts_table_t fts_table;
+
+ FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE,
+ index->table);
+
+	/* We are responsible for freeing name. */
+ name = fts_config_create_index_param_name(param, index);
+
+ error = fts_config_get_value(trx, &fts_table, name, value);
+
+ ut_free(name);
+
+ return(error);
+}
+
+/******************************************************************//**
+Set the value in the config table for name.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_value(
+/*=================*/
+ trx_t* trx, /*!< transaction */
+ fts_table_t* fts_table, /*!< in: the indexed
+ FTS table */
+	const char*	name,		/*!< in: set config value for
+					this parameter name */
+ const fts_string_t*
+ value) /*!< in: value to update */
+{
+ pars_info_t* info;
+ que_t* graph;
+ dberr_t error;
+ undo_no_t undo_no;
+ undo_no_t n_rows_updated;
+ ulint name_len = strlen(name);
+ char table_name[MAX_FULL_NAME_LEN];
+
+ info = pars_info_create();
+
+ pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len);
+ pars_info_bind_varchar_literal(info, "value",
+ value->f_str, value->f_len);
+
+ const bool dict_locked = fts_table->table->fts->dict_locked;
+
+ fts_table->suffix = "CONFIG";
+ fts_get_table_name(fts_table, table_name, dict_locked);
+ pars_info_bind_id(info, true, "table_name", table_name);
+
+ graph = fts_parse_sql(
+ fts_table, info,
+ "BEGIN UPDATE $table_name SET value = :value"
+ " WHERE key = :name;");
+
+ trx->op_info = "setting FTS config value";
+
+ undo_no = trx->undo_no;
+
+ error = fts_eval_sql(trx, graph);
+
+ fts_que_graph_free_check_lock(fts_table, NULL, graph);
+
+ n_rows_updated = trx->undo_no - undo_no;
+
+ /* Check if we need to do an insert. */
+ if (n_rows_updated == 0) {
+ info = pars_info_create();
+
+ pars_info_bind_varchar_literal(
+ info, "name", (byte*) name, name_len);
+
+ pars_info_bind_varchar_literal(
+ info, "value", value->f_str, value->f_len);
+
+ fts_get_table_name(fts_table, table_name, dict_locked);
+ pars_info_bind_id(info, true, "table_name", table_name);
+
+ graph = fts_parse_sql(
+ fts_table, info,
+ "BEGIN\n"
+ "INSERT INTO $table_name VALUES(:name, :value);");
+
+ trx->op_info = "inserting FTS config value";
+
+ error = fts_eval_sql(trx, graph);
+
+ fts_que_graph_free_check_lock(fts_table, NULL, graph);
+ }
+
+ return(error);
+}
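+
+/* The function above is an upsert: it runs the UPDATE first, infers the
+ * number of affected rows from the growth of trx->undo_no (each row
+ * modification appends one undo record), and only falls back to INSERT
+ * when nothing was updated.  The row-count idiom in isolation:
+ *
+ *	undo_no_t before = trx->undo_no;
+ *	error = fts_eval_sql(trx, graph);	// run the UPDATE
+ *	if (trx->undo_no - before == 0) {
+ *		// key was absent: INSERT instead
+ *	}
+ */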
+
+/******************************************************************//**
+Set the value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_index_value(
+/*=======================*/
+ trx_t* trx, /*!< transaction */
+ dict_index_t* index, /*!< in: index */
+	const char*	param,		/*!< in: set config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< in: value to set in the
+					config table */
+{
+ char* name;
+ dberr_t error;
+ fts_table_t fts_table;
+
+ FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE,
+ index->table);
+
+	/* We are responsible for freeing name. */
+ name = fts_config_create_index_param_name(param, index);
+
+ error = fts_config_set_value(trx, &fts_table, name, value);
+
+ ut_free(name);
+
+ return(error);
+}
+
+#ifdef FTS_OPTIMIZE_DEBUG
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_get_index_ulint(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: param name */
+ ulint* int_value) /*!< out: value */
+{
+ dberr_t error;
+ fts_string_t value;
+
+ /* We set the length of value to the max bytes it can hold. This
+ information is used by the callback that reads the value.*/
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1));
+
+ error = fts_config_get_index_value(trx, index, name, &value);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ib::error() << "(" << error << ") reading `" << name << "'";
+ } else {
+ *int_value = strtoul((char*) value.f_str, NULL, 10);
+ }
+
+ ut_free(value.f_str);
+
+ return(error);
+}
+
+/******************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_set_index_ulint(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: param name */
+ ulint int_value) /*!< in: value */
+{
+ dberr_t error;
+ fts_string_t value;
+
+ /* We set the length of value to the max bytes it can hold. This
+ information is used by the callback that reads the value.*/
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1));
+
+ // FIXME: Get rid of snprintf
+ ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN);
+
+ value.f_len = snprintf(
+ (char*) value.f_str, FTS_MAX_INT_LEN, ULINTPF, int_value);
+
+ error = fts_config_set_index_value(trx, index, name, &value);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ib::error() << "(" << error << ") writing `" << name << "'";
+ }
+
+ ut_free(value.f_str);
+
+ return(error);
+}
+#endif /* FTS_OPTIMIZE_DEBUG */
+
+/******************************************************************//**
+Get a ulint value from the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_get_ulint(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed
+ FTS table */
+ const char* name, /*!< in: param name */
+ ulint* int_value) /*!< out: value */
+{
+ dberr_t error;
+ fts_string_t value;
+
+ /* We set the length of value to the max bytes it can hold. This
+ information is used by the callback that reads the value.*/
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1));
+
+ error = fts_config_get_value(trx, fts_table, name, &value);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ib::error() << "(" << error << ") reading `" << name << "'";
+ } else {
+ *int_value = strtoul((char*) value.f_str, NULL, 10);
+ }
+
+ ut_free(value.f_str);
+
+ return(error);
+}
+
+/******************************************************************//**
+Set a ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_set_ulint(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed
+ FTS table */
+ const char* name, /*!< in: param name */
+ ulint int_value) /*!< in: value */
+{
+ dberr_t error;
+ fts_string_t value;
+
+ /* We set the length of value to the max bytes it can hold. This
+ information is used by the callback that reads the value.*/
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1));
+
+ ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN);
+
+ value.f_len = (ulint) snprintf(
+ (char*) value.f_str, FTS_MAX_INT_LEN, ULINTPF, int_value);
+
+ error = fts_config_set_value(trx, fts_table, name, &value);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ib::error() << "(" << error << ") writing `" << name << "'";
+ }
+
+ ut_free(value.f_str);
+
+ return(error);
+}
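+
+/* Example (illustrative only): an integer setting round-trips through
+the CONFIG table as a decimal string.  fts_config_set_ulint(trx,
+fts_table, "my_key", 180) stores the row ('my_key', '180') via the
+snprintf() call above, and a later fts_config_get_ulint(trx,
+fts_table, "my_key", &n) recovers 180 via strtoul().  "my_key" is a
+hypothetical parameter name. */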
diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc
new file mode 100644
index 00000000..96ad0570
--- /dev/null
+++ b/storage/innobase/fts/fts0fts.cc
@@ -0,0 +1,6316 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2021, Oracle and/or its affiliates.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0fts.cc
+Full Text Search interface
+***********************************************************************/
+
+#include "trx0roll.h"
+#include "row0mysql.h"
+#include "row0upd.h"
+#include "dict0types.h"
+#include "dict0stats_bg.h"
+#include "row0sel.h"
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#include "fts0plugin.h"
+#include "dict0priv.h"
+#include "dict0stats.h"
+#include "btr0pcur.h"
+#include "sync0sync.h"
+
+static const ulint FTS_MAX_ID_LEN = 32;
+
+/** Column name from the FTS config table */
+#define FTS_MAX_CACHE_SIZE_IN_MB "cache_size_in_mb"
+
+/** Verify whether an aux table name is an obsolete table
+by looking up the keyword in the obsolete table names */
+#define FTS_IS_OBSOLETE_AUX_TABLE(table_name) \
+ (strstr((table_name), "DOC_ID") != NULL \
+ || strstr((table_name), "ADDED") != NULL \
+ || strstr((table_name), "STOPWORDS") != NULL)
+
+/** This is the maximum FTS cache size for each table;
+it is a configurable variable */
+ulong fts_max_cache_size;
+
+/** Whether the total memory used for FTS cache is exhausted, and we will
+need a sync to free some memory */
+bool fts_need_sync = false;
+
+/** Variable specifying the total memory allocated for FTS cache */
+ulong fts_max_total_cache_size;
+
+/** This is the FTS result cache limit for each query;
+it is a configurable variable */
+size_t fts_result_cache_limit;
+
+/** Variable specifying the maximum FTS max token size */
+ulong fts_max_token_size;
+
+/** Variable specifying the minimum FTS max token size */
+ulong fts_min_token_size;
+
+
+// FIXME: testing
+static time_t elapsed_time;
+static ulint n_nodes;
+
+#ifdef FTS_CACHE_SIZE_DEBUG
+/** The cache size permissible lower limit (1K) */
+static const ulint FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB = 1;
+
+/** The cache size permissible upper limit (1G) */
+static const ulint FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB = 1024;
+#endif
+
+/** Time to sleep after DEADLOCK error before retrying operation. */
+static const ulint FTS_DEADLOCK_RETRY_WAIT = 100000;
+
+/** InnoDB default stopword list:
+There are different versions of stopword lists; the stop words listed
+below come from the "Google Stopword" list. Reference:
+http://meta.wikimedia.org/wiki/Stop_word_list/google_stop_word_list.
+The final version of the InnoDB default stopword list is still
+pending decision */
+const char *fts_default_stopword[] =
+{
+ "a",
+ "about",
+ "an",
+ "are",
+ "as",
+ "at",
+ "be",
+ "by",
+ "com",
+ "de",
+ "en",
+ "for",
+ "from",
+ "how",
+ "i",
+ "in",
+ "is",
+ "it",
+ "la",
+ "of",
+ "on",
+ "or",
+ "that",
+ "the",
+ "this",
+ "to",
+ "was",
+ "what",
+ "when",
+ "where",
+ "who",
+ "will",
+ "with",
+ "und",
+ "the",
+ "www",
+ NULL
+};
+
+/** For storing table info when checking for orphaned tables. */
+struct fts_aux_table_t {
+ table_id_t id; /*!< Table id */
+ table_id_t parent_id; /*!< Parent table id */
+ table_id_t index_id; /*!< Table FT index id */
+ char* name; /*!< Name of the table */
+};
+
+/** FTS auxiliary table suffixes that are common to all FT indexes. */
+const char* fts_common_tables[] = {
+ "BEING_DELETED",
+ "BEING_DELETED_CACHE",
+ "CONFIG",
+ "DELETED",
+ "DELETED_CACHE",
+ NULL
+};
+
+/** FTS auxiliary INDEX split intervals. */
+const fts_index_selector_t fts_index_selector[] = {
+ { 9, "INDEX_1" },
+ { 65, "INDEX_2" },
+ { 70, "INDEX_3" },
+ { 75, "INDEX_4" },
+ { 80, "INDEX_5" },
+ { 85, "INDEX_6" },
+ { 0 , NULL }
+};
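+
+/* The selector above partitions tokens across the six auxiliary
+INDEX_* tables by the collation weight of a token's first character;
+fts_select_index() walks this array to pick the bucket whose boundary
+value covers that weight (so, roughly, for a latin1-style weight a
+token starting with 'a' and one starting with 'z' land in different
+tables).  The terminating {0, NULL} entry marks the end of the
+array. */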
+
+/** Default config values for FTS indexes on a table. */
+static const char* fts_config_table_insert_values_sql =
+ "BEGIN\n"
+ "\n"
+ "INSERT INTO $config_table VALUES('"
+ FTS_MAX_CACHE_SIZE_IN_MB "', '256');\n"
+ ""
+ "INSERT INTO $config_table VALUES('"
+ FTS_OPTIMIZE_LIMIT_IN_SECS "', '180');\n"
+ ""
+ "INSERT INTO $config_table VALUES ('"
+ FTS_SYNCED_DOC_ID "', '0');\n"
+ ""
+ "INSERT INTO $config_table VALUES ('"
+ FTS_TOTAL_DELETED_COUNT "', '0');\n"
+ "" /* Note: 0 == FTS_TABLE_STATE_RUNNING */
+ "INSERT INTO $config_table VALUES ('"
+ FTS_TABLE_STATE "', '0');\n";
+
+/** FTS tokenize parameter for the plugin parser */
+struct fts_tokenize_param_t {
+ fts_doc_t* result_doc; /*!< Result doc for tokens */
+ ulint add_pos; /*!< Added position for tokens */
+};
+
+/** Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@param[in,out] sync sync state
+@param[in] unlock_cache whether unlock cache lock when write node
+@param[in] wait whether wait when a sync is in progress
+@return DB_SUCCESS if all OK */
+static
+dberr_t
+fts_sync(
+ fts_sync_t* sync,
+ bool unlock_cache,
+ bool wait);
+
+/****************************************************************//**
+Release all resources held by the words rb tree, e.g., the node ilist. */
+static
+void
+fts_words_free(
+/*===========*/
+ ib_rbt_t* words) /*!< in: rb tree of words */
+ MY_ATTRIBUTE((nonnull));
+#ifdef FTS_CACHE_SIZE_DEBUG
+/****************************************************************//**
+Read the max cache size parameter from the config table. */
+static
+void
+fts_update_max_cache_size(
+/*======================*/
+ fts_sync_t* sync); /*!< in: sync state */
+#endif
+
+/*********************************************************************//**
+This function fetches the document just inserted right before
+we commit the transaction, tokenizes the inserted text data
+and inserts it into the FTS auxiliary table and its cache.
+@return TRUE if successful */
+static
+ulint
+fts_add_doc_by_id(
+/*==============*/
+ fts_trx_table_t*ftt, /*!< in: FTS trx table */
+ doc_id_t doc_id, /*!< in: doc id */
+ ib_vector_t* fts_indexes MY_ATTRIBUTE((unused)));
+ /*!< in: affected fts indexes */
+/******************************************************************//**
+Update the last document id. This function could create a new
+transaction to update the last document id.
+@return DB_SUCCESS if OK */
+static
+dberr_t
+fts_update_sync_doc_id(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t doc_id, /*!< in: last document id */
+ trx_t* trx) /*!< in: update trx, or NULL */
+ MY_ATTRIBUTE((nonnull(1)));
+
+/** Tokenize a document.
+@param[in,out] doc document to tokenize
+@param[out] result tokenization result
+@param[in] parser pluggable parser */
+static
+void
+fts_tokenize_document(
+ fts_doc_t* doc,
+ fts_doc_t* result,
+ st_mysql_ftparser* parser);
+
+/** Continue to tokenize a document.
+@param[in,out] doc document to tokenize
+@param[in] add_pos add this position to all tokens from this tokenization
+@param[out] result tokenization result
+@param[in] parser pluggable parser */
+static
+void
+fts_tokenize_document_next(
+ fts_doc_t* doc,
+ ulint add_pos,
+ fts_doc_t* result,
+ st_mysql_ftparser* parser);
+
+/** Create the vector of fts_get_doc_t instances.
+@param[in,out] cache fts cache
+@return vector of fts_get_doc_t instances */
+static
+ib_vector_t*
+fts_get_docs_create(
+ fts_cache_t* cache);
+
+/** Free the FTS cache.
+@param[in,out]	cache		cache to be freed */
+static
+void
+fts_cache_destroy(fts_cache_t* cache)
+{
+ rw_lock_free(&cache->lock);
+ rw_lock_free(&cache->init_lock);
+ mutex_free(&cache->deleted_lock);
+ mutex_free(&cache->doc_id_lock);
+ os_event_destroy(cache->sync->event);
+
+ if (cache->stopword_info.cached_stopword) {
+ rbt_free(cache->stopword_info.cached_stopword);
+ }
+
+ if (cache->sync_heap->arg) {
+ mem_heap_free(static_cast<mem_heap_t*>(cache->sync_heap->arg));
+ }
+
+ mem_heap_free(cache->cache_heap);
+}
+
+/** Get a character set based on precise type.
+@param prtype precise type
+@return the corresponding character set */
+UNIV_INLINE
+CHARSET_INFO*
+fts_get_charset(ulint prtype)
+{
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case MYSQL_TYPE_BIT:
+ case MYSQL_TYPE_STRING:
+ case MYSQL_TYPE_VAR_STRING:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ case MYSQL_TYPE_VARCHAR:
+ break;
+ default:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ uint cs_num = (uint) dtype_get_charset_coll(prtype);
+
+ if (CHARSET_INFO* cs = get_charset(cs_num, MYF(MY_WME))) {
+ return(cs);
+ }
+
+ ib::fatal() << "Unable to find charset-collation " << cs_num;
+ return(NULL);
+}
+
+/****************************************************************//**
+This function loads the default InnoDB stopword list */
+static
+void
+fts_load_default_stopword(
+/*======================*/
+ fts_stopword_t* stopword_info) /*!< in: stopword info */
+{
+ fts_string_t str;
+ mem_heap_t* heap;
+ ib_alloc_t* allocator;
+ ib_rbt_t* stop_words;
+
+ allocator = stopword_info->heap;
+ heap = static_cast<mem_heap_t*>(allocator->arg);
+
+ if (!stopword_info->cached_stopword) {
+ stopword_info->cached_stopword = rbt_create_arg_cmp(
+ sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp,
+ &my_charset_latin1);
+ }
+
+ stop_words = stopword_info->cached_stopword;
+
+ str.f_n_char = 0;
+
+ for (ulint i = 0; fts_default_stopword[i]; ++i) {
+ char* word;
+ fts_tokenizer_word_t new_word;
+
+ /* We are going to duplicate the value below. */
+ word = const_cast<char*>(fts_default_stopword[i]);
+
+ new_word.nodes = ib_vector_create(
+ allocator, sizeof(fts_node_t), 4);
+
+ str.f_len = strlen(word);
+ str.f_str = reinterpret_cast<byte*>(word);
+
+ fts_string_dup(&new_word.text, &str, heap);
+
+ rbt_insert(stop_words, &new_word, &new_word);
+ }
+
+ stopword_info->status = STOPWORD_FROM_DEFAULT;
+}
+
+/****************************************************************//**
+Callback function to read a single stopword value.
+@return Always return TRUE */
+static
+ibool
+fts_read_stopword(
+/*==============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to ib_vector_t */
+{
+ ib_alloc_t* allocator;
+ fts_stopword_t* stopword_info;
+ sel_node_t* sel_node;
+ que_node_t* exp;
+ ib_rbt_t* stop_words;
+ dfield_t* dfield;
+ fts_string_t str;
+ mem_heap_t* heap;
+ ib_rbt_bound_t parent;
+
+ sel_node = static_cast<sel_node_t*>(row);
+ stopword_info = static_cast<fts_stopword_t*>(user_arg);
+
+ stop_words = stopword_info->cached_stopword;
+ allocator = static_cast<ib_alloc_t*>(stopword_info->heap);
+ heap = static_cast<mem_heap_t*>(allocator->arg);
+
+ exp = sel_node->select_list;
+
+ /* We only need to read the first column */
+ dfield = que_node_get_val(exp);
+
+ str.f_n_char = 0;
+ str.f_str = static_cast<byte*>(dfield_get_data(dfield));
+ str.f_len = dfield_get_len(dfield);
+
+	/* Only create a new node if the value does not already exist */
+ if (str.f_len != UNIV_SQL_NULL
+ && rbt_search(stop_words, &parent, &str) != 0) {
+
+ fts_tokenizer_word_t new_word;
+
+ new_word.nodes = ib_vector_create(
+ allocator, sizeof(fts_node_t), 4);
+
+ new_word.text.f_str = static_cast<byte*>(
+ mem_heap_alloc(heap, str.f_len + 1));
+
+ memcpy(new_word.text.f_str, str.f_str, str.f_len);
+
+ new_word.text.f_n_char = 0;
+ new_word.text.f_len = str.f_len;
+ new_word.text.f_str[str.f_len] = 0;
+
+ rbt_insert(stop_words, &new_word, &new_word);
+ }
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Load user-defined stopwords from the designated user table
+@return whether the operation is successful */
+static
+bool
+fts_load_user_stopword(
+/*===================*/
+ fts_t* fts, /*!< in: FTS struct */
+ const char* stopword_table_name, /*!< in: Stopword table
+ name */
+ fts_stopword_t* stopword_info) /*!< in: Stopword info */
+{
+ if (!fts->dict_locked) {
+ mutex_enter(&dict_sys.mutex);
+ }
+
+ /* Validate the user table existence in the right format */
+ bool ret= false;
+ stopword_info->charset = fts_valid_stopword_table(stopword_table_name);
+ if (!stopword_info->charset) {
+cleanup:
+ if (!fts->dict_locked) {
+ mutex_exit(&dict_sys.mutex);
+ }
+
+ return ret;
+ }
+
+ trx_t* trx = trx_create();
+ trx->op_info = "Load user stopword table into FTS cache";
+
+ if (!stopword_info->cached_stopword) {
+ /* Create the stopword RB tree with the stopword column
+ charset. All comparison will use this charset */
+ stopword_info->cached_stopword = rbt_create_arg_cmp(
+ sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp,
+ (void*)stopword_info->charset);
+
+ }
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_bind_id(info, TRUE, "table_stopword", stopword_table_name);
+
+ pars_info_bind_function(info, "my_func", fts_read_stopword,
+ stopword_info);
+
+ que_t* graph = fts_parse_sql_no_dict_lock(
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT value"
+ " FROM $table_stopword;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ for (;;) {
+ dberr_t error = fts_eval_sql(trx, graph);
+
+		if (UNIV_LIKELY(error == DB_SUCCESS)) {
+			fts_sql_commit(trx);
+			stopword_info->status = STOPWORD_USER_TABLE;
+			ret = true;
+			break;
+ } else {
+ fts_sql_rollback(trx);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ ib::warn() << "Lock wait timeout reading user"
+ " stopword table. Retrying!";
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ ib::error() << "Error '" << error
+ << "' while reading user stopword"
+ " table.";
+				ret = false;
+ break;
+ }
+ }
+ }
+
+	que_graph_free(graph);
+	trx->free();
+	goto cleanup;
+}
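+
+/* Example (illustrative only): a stopword table acceptable to
+fts_valid_stopword_table() has a single VARCHAR column named "value".
+Assuming a database "mydb", a user could set one up like this:
+
+	CREATE TABLE my_stopwords(value VARCHAR(30)) ENGINE = InnoDB;
+	INSERT INTO my_stopwords VALUES ('lorem'), ('ipsum');
+	SET GLOBAL innodb_ft_server_stopword_table = 'mydb/my_stopwords';
+
+fts_load_user_stopword() then reads every row of that column into the
+cached stopword rb tree through the fts_read_stopword() callback. */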
+
+/******************************************************************//**
+Initialize the index cache. */
+static
+void
+fts_index_cache_init(
+/*=================*/
+ ib_alloc_t* allocator, /*!< in: the allocator to use */
+ fts_index_cache_t* index_cache) /*!< in: index cache */
+{
+ ulint i;
+
+ ut_a(index_cache->words == NULL);
+
+ index_cache->words = rbt_create_arg_cmp(
+ sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp,
+ (void*) index_cache->charset);
+
+ ut_a(index_cache->doc_stats == NULL);
+
+ index_cache->doc_stats = ib_vector_create(
+ allocator, sizeof(fts_doc_stats_t), 4);
+
+ for (i = 0; i < FTS_NUM_AUX_INDEX; ++i) {
+ ut_a(index_cache->ins_graph[i] == NULL);
+ ut_a(index_cache->sel_graph[i] == NULL);
+ }
+}
+
+/*********************************************************************//**
+Initialize FTS cache. */
+void
+fts_cache_init(
+/*===========*/
+ fts_cache_t* cache) /*!< in: cache to initialize */
+{
+ ulint i;
+
+ /* Just to make sure */
+ ut_a(cache->sync_heap->arg == NULL);
+
+ cache->sync_heap->arg = mem_heap_create(1024);
+
+ cache->total_size = 0;
+
+ mutex_enter((ib_mutex_t*) &cache->deleted_lock);
+ cache->deleted_doc_ids = ib_vector_create(
+ cache->sync_heap, sizeof(doc_id_t), 4);
+ mutex_exit((ib_mutex_t*) &cache->deleted_lock);
+
+ /* Reset the cache data for all the FTS indexes. */
+ for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(cache->indexes, i));
+
+ fts_index_cache_init(cache->sync_heap, index_cache);
+ }
+}
+
+/****************************************************************//**
+Create a FTS cache. */
+fts_cache_t*
+fts_cache_create(
+/*=============*/
+ dict_table_t* table) /*!< in: table owns the FTS cache */
+{
+ mem_heap_t* heap;
+ fts_cache_t* cache;
+
+ heap = static_cast<mem_heap_t*>(mem_heap_create(512));
+
+ cache = static_cast<fts_cache_t*>(
+ mem_heap_zalloc(heap, sizeof(*cache)));
+
+ cache->cache_heap = heap;
+
+ rw_lock_create(fts_cache_rw_lock_key, &cache->lock, SYNC_FTS_CACHE);
+
+ rw_lock_create(
+ fts_cache_init_rw_lock_key, &cache->init_lock,
+ SYNC_FTS_CACHE_INIT);
+
+ mutex_create(LATCH_ID_FTS_DELETE, &cache->deleted_lock);
+
+ mutex_create(LATCH_ID_FTS_DOC_ID, &cache->doc_id_lock);
+
+ /* This is the heap used to create the cache itself. */
+ cache->self_heap = ib_heap_allocator_create(heap);
+
+ /* This is a transient heap, used for storing sync data. */
+ cache->sync_heap = ib_heap_allocator_create(heap);
+ cache->sync_heap->arg = NULL;
+
+ cache->sync = static_cast<fts_sync_t*>(
+ mem_heap_zalloc(heap, sizeof(fts_sync_t)));
+
+ cache->sync->table = table;
+ cache->sync->event = os_event_create(0);
+
+ /* Create the index cache vector that will hold the inverted indexes. */
+ cache->indexes = ib_vector_create(
+ cache->self_heap, sizeof(fts_index_cache_t), 2);
+
+ fts_cache_init(cache);
+
+ cache->stopword_info.cached_stopword = NULL;
+ cache->stopword_info.charset = NULL;
+
+ cache->stopword_info.heap = cache->self_heap;
+
+ cache->stopword_info.status = STOPWORD_NOT_INIT;
+
+ return(cache);
+}
+
+/*******************************************************************//**
+Add a newly created index to the FTS cache */
+void
+fts_add_index(
+/*==========*/
+	dict_index_t*	index,	/*!< in: FTS index to be added */
+	dict_table_t*	table)	/*!< in: table */
+{
+ fts_t* fts = table->fts;
+ fts_cache_t* cache;
+ fts_index_cache_t* index_cache;
+
+ ut_ad(fts);
+ cache = table->fts->cache;
+
+ rw_lock_x_lock(&cache->init_lock);
+
+ ib_vector_push(fts->indexes, &index);
+
+ index_cache = fts_find_index_cache(cache, index);
+
+ if (!index_cache) {
+ /* Add new index cache structure */
+ index_cache = fts_cache_index_cache_create(table, index);
+ }
+
+ rw_lock_x_unlock(&cache->init_lock);
+}
+
+/*******************************************************************//**
+Recalibrate the get_doc structures after the index_cache entries in
+cache->indexes have changed */
+static
+void
+fts_reset_get_doc(
+/*==============*/
+ fts_cache_t* cache) /*!< in: FTS index cache */
+{
+ fts_get_doc_t* get_doc;
+ ulint i;
+
+ ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_X));
+
+ ib_vector_reset(cache->get_docs);
+
+ for (i = 0; i < ib_vector_size(cache->indexes); i++) {
+ fts_index_cache_t* ind_cache;
+
+ ind_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(cache->indexes, i));
+
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_push(cache->get_docs, NULL));
+
+ memset(get_doc, 0x0, sizeof(*get_doc));
+
+ get_doc->index_cache = ind_cache;
+ get_doc->cache = cache;
+ }
+
+ ut_ad(ib_vector_size(cache->get_docs)
+ == ib_vector_size(cache->indexes));
+}
+
+/*******************************************************************//**
+Check whether an index is in the table->indexes list
+@return TRUE if it exists */
+static
+ibool
+fts_in_dict_index(
+/*==============*/
+ dict_table_t* table, /*!< in: Table */
+ dict_index_t* index_check) /*!< in: index to be checked */
+{
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (index == index_check) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Check whether an index is in the fts->cache->indexes list
+@return TRUE if it exists */
+static
+ibool
+fts_in_index_cache(
+/*===============*/
+ dict_table_t* table, /*!< in: Table */
+ dict_index_t* index) /*!< in: index to be checked */
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(table->fts->cache->indexes); i++) {
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(table->fts->cache->indexes, i));
+
+ if (index_cache->index == index) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Check that the indexes in fts->indexes are also present in the index
+cache and in the table->indexes list
+@return TRUE if all indexes match */
+ibool
+fts_check_cached_index(
+/*===================*/
+ dict_table_t* table) /*!< in: Table where indexes are dropped */
+{
+ ulint i;
+
+ if (!table->fts || !table->fts->cache) {
+ return(TRUE);
+ }
+
+ ut_a(ib_vector_size(table->fts->indexes)
+ == ib_vector_size(table->fts->cache->indexes));
+
+ for (i = 0; i < ib_vector_size(table->fts->indexes); i++) {
+ dict_index_t* index;
+
+ index = static_cast<dict_index_t*>(
+ ib_vector_getp(table->fts->indexes, i));
+
+ if (!fts_in_index_cache(table, index)) {
+ return(FALSE);
+ }
+
+ if (!fts_in_dict_index(table, index)) {
+ return(FALSE);
+ }
+ }
+
+ return(TRUE);
+}
+
+/** Clear all fts resources when there is no internal DOC_ID
+and there are no new fts indexes to add.
+@param[in,out] table table where fts is to be freed
+@param[in] trx transaction to drop all fts tables */
+void fts_clear_all(dict_table_t *table, trx_t *trx)
+{
+ if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) ||
+ !table->fts ||
+ !ib_vector_is_empty(table->fts->indexes))
+ return;
+
+ for (const dict_index_t *index= dict_table_get_first_index(table);
+ index; index= dict_table_get_next_index(index))
+ if (index->type & DICT_FTS)
+ return;
+
+ fts_optimize_remove_table(table);
+
+ fts_drop_tables(trx, table);
+ fts_free(table);
+ DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS);
+}
+
+/*******************************************************************//**
+Drop auxiliary tables related to an FTS index
+@return DB_SUCCESS or error number */
+dberr_t
+fts_drop_index(
+/*===========*/
+ dict_table_t* table, /*!< in: Table where indexes are dropped */
+ dict_index_t* index, /*!< in: Index to be dropped */
+ trx_t* trx) /*!< in: Transaction for the drop */
+{
+ ib_vector_t* indexes = table->fts->indexes;
+ dberr_t err = DB_SUCCESS;
+
+ ut_a(indexes);
+
+ if ((ib_vector_size(indexes) == 1
+ && (index == static_cast<dict_index_t*>(
+ ib_vector_getp(table->fts->indexes, 0)))
+ && DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID))
+ || ib_vector_is_empty(indexes)) {
+ doc_id_t current_doc_id;
+ doc_id_t first_doc_id;
+
+ DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS);
+
+ current_doc_id = table->fts->cache->next_doc_id;
+ first_doc_id = table->fts->cache->first_doc_id;
+ fts_cache_clear(table->fts->cache);
+ fts_cache_destroy(table->fts->cache);
+ table->fts->cache = fts_cache_create(table);
+ table->fts->cache->next_doc_id = current_doc_id;
+ table->fts->cache->first_doc_id = first_doc_id;
+ } else {
+ fts_cache_t* cache = table->fts->cache;
+ fts_index_cache_t* index_cache;
+
+ rw_lock_x_lock(&cache->init_lock);
+
+ index_cache = fts_find_index_cache(cache, index);
+
+ if (index_cache != NULL) {
+ if (index_cache->words) {
+ fts_words_free(index_cache->words);
+ rbt_free(index_cache->words);
+ }
+
+ ib_vector_remove(cache->indexes, *(void**) index_cache);
+ }
+
+ if (cache->get_docs) {
+ fts_reset_get_doc(cache);
+ }
+
+ rw_lock_x_unlock(&cache->init_lock);
+ }
+
+ err = fts_drop_index_tables(trx, index);
+
+ ib_vector_remove(indexes, (const void*) index);
+
+ return(err);
+}
+
+/****************************************************************//**
+Free the query graph but check whether dict_sys.mutex is already
+held */
+void
+fts_que_graph_free_check_lock(
+/*==========================*/
+ fts_table_t* fts_table, /*!< in: FTS table */
+ const fts_index_cache_t*index_cache, /*!< in: FTS index cache */
+ que_t* graph) /*!< in: query graph */
+{
+	bool	has_dict = false;
+
+ if (fts_table && fts_table->table) {
+ ut_ad(fts_table->table->fts);
+
+ has_dict = fts_table->table->fts->dict_locked;
+ } else if (index_cache) {
+ ut_ad(index_cache->index->table->fts);
+
+ has_dict = index_cache->index->table->fts->dict_locked;
+ }
+
+ if (!has_dict) {
+ mutex_enter(&dict_sys.mutex);
+ }
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ que_graph_free(graph);
+
+ if (!has_dict) {
+ mutex_exit(&dict_sys.mutex);
+ }
+}
+
+/****************************************************************//**
+Get the character set used by an FTS index.
+@return charset of the FTS index */
+CHARSET_INFO*
+fts_index_get_charset(
+/*==================*/
+ dict_index_t* index) /*!< in: FTS index */
+{
+ CHARSET_INFO* charset = NULL;
+ dict_field_t* field;
+ ulint prtype;
+
+ field = dict_index_get_nth_field(index, 0);
+ prtype = field->col->prtype;
+
+ charset = fts_get_charset(prtype);
+
+#ifdef FTS_DEBUG
+	/* Set up charset info for this index. Please note that all
+	fields of the FTS index should have the same charset */
+	for (ulint i = 1; i < index->n_fields; i++) {
+ CHARSET_INFO* fld_charset;
+
+ field = dict_index_get_nth_field(index, i);
+ prtype = field->col->prtype;
+
+ fld_charset = fts_get_charset(prtype);
+
+ /* All FTS columns should have the same charset */
+ if (charset) {
+ ut_a(charset == fld_charset);
+ } else {
+ charset = fld_charset;
+ }
+ }
+#endif
+
+	return(charset);
+}
+
+/****************************************************************//**
+Create an FTS index cache.
+@return Index Cache */
+fts_index_cache_t*
+fts_cache_index_cache_create(
+/*=========================*/
+ dict_table_t* table, /*!< in: table with FTS index */
+ dict_index_t* index) /*!< in: FTS index */
+{
+ ulint n_bytes;
+ fts_index_cache_t* index_cache;
+ fts_cache_t* cache = table->fts->cache;
+
+ ut_a(cache != NULL);
+
+ ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_X));
+
+ /* Must not already exist in the cache vector. */
+ ut_a(fts_find_index_cache(cache, index) == NULL);
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_push(cache->indexes, NULL));
+
+ memset(index_cache, 0x0, sizeof(*index_cache));
+
+ index_cache->index = index;
+
+ index_cache->charset = fts_index_get_charset(index);
+
+ n_bytes = sizeof(que_t*) * FTS_NUM_AUX_INDEX;
+
+ index_cache->ins_graph = static_cast<que_t**>(
+ mem_heap_zalloc(static_cast<mem_heap_t*>(
+ cache->self_heap->arg), n_bytes));
+
+ index_cache->sel_graph = static_cast<que_t**>(
+ mem_heap_zalloc(static_cast<mem_heap_t*>(
+ cache->self_heap->arg), n_bytes));
+
+ fts_index_cache_init(cache->sync_heap, index_cache);
+
+ if (cache->get_docs) {
+ fts_reset_get_doc(cache);
+ }
+
+ return(index_cache);
+}
+
+/****************************************************************//**
+Release all resources held by the words rb tree, e.g., the node ilist. */
+static
+void
+fts_words_free(
+/*===========*/
+ ib_rbt_t* words) /*!< in: rb tree of words */
+{
+ const ib_rbt_node_t* rbt_node;
+
+ /* Free the resources held by a word. */
+ for (rbt_node = rbt_first(words);
+ rbt_node != NULL;
+ rbt_node = rbt_first(words)) {
+
+ ulint i;
+ fts_tokenizer_word_t* word;
+
+ word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+ /* Free the ilists of this word. */
+ for (i = 0; i < ib_vector_size(word->nodes); ++i) {
+
+ fts_node_t* fts_node = static_cast<fts_node_t*>(
+ ib_vector_get(word->nodes, i));
+
+ ut_free(fts_node->ilist);
+ fts_node->ilist = NULL;
+ }
+
+ /* NOTE: We are responsible for free'ing the node */
+ ut_free(rbt_remove_node(words, rbt_node));
+ }
+}
+
+/** Clear cache.
+@param[in,out] cache fts cache */
+void
+fts_cache_clear(
+ fts_cache_t* cache)
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+ ulint j;
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(cache->indexes, i));
+
+ fts_words_free(index_cache->words);
+
+ rbt_free(index_cache->words);
+
+ index_cache->words = NULL;
+
+ for (j = 0; j < FTS_NUM_AUX_INDEX; ++j) {
+
+ if (index_cache->ins_graph[j] != NULL) {
+
+ fts_que_graph_free_check_lock(
+ NULL, index_cache,
+ index_cache->ins_graph[j]);
+
+ index_cache->ins_graph[j] = NULL;
+ }
+
+ if (index_cache->sel_graph[j] != NULL) {
+
+ fts_que_graph_free_check_lock(
+ NULL, index_cache,
+ index_cache->sel_graph[j]);
+
+ index_cache->sel_graph[j] = NULL;
+ }
+ }
+
+ index_cache->doc_stats = NULL;
+ }
+
+ fts_need_sync = false;
+
+ cache->total_size = 0;
+
+ mutex_enter((ib_mutex_t*) &cache->deleted_lock);
+ cache->deleted_doc_ids = NULL;
+ mutex_exit((ib_mutex_t*) &cache->deleted_lock);
+
+ mem_heap_free(static_cast<mem_heap_t*>(cache->sync_heap->arg));
+ cache->sync_heap->arg = NULL;
+}
+
+/*********************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index cache else NULL */
+UNIV_INLINE
+fts_index_cache_t*
+fts_get_index_cache(
+/*================*/
+ fts_cache_t* cache, /*!< in: cache to search */
+ const dict_index_t* index) /*!< in: index to search for */
+{
+ ulint i;
+
+ ut_ad(rw_lock_own((rw_lock_t*) &cache->lock, RW_LOCK_X)
+ || rw_lock_own((rw_lock_t*) &cache->init_lock, RW_LOCK_X));
+
+ for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(cache->indexes, i));
+
+ if (index_cache->index == index) {
+
+ return(index_cache);
+ }
+ }
+
+ return(NULL);
+}
+
+#ifdef FTS_DEBUG
+/*********************************************************************//**
+Search the index cache for a get_doc structure.
+@return the fts_get_doc_t item else NULL */
+static
+fts_get_doc_t*
+fts_get_index_get_doc(
+/*==================*/
+ fts_cache_t* cache, /*!< in: cache to search */
+ const dict_index_t* index) /*!< in: index to search for */
+{
+ ulint i;
+
+ ut_ad(rw_lock_own((rw_lock_t*) &cache->init_lock, RW_LOCK_X));
+
+ for (i = 0; i < ib_vector_size(cache->get_docs); ++i) {
+ fts_get_doc_t* get_doc;
+
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_get(cache->get_docs, i));
+
+ if (get_doc->index_cache->index == index) {
+
+ return(get_doc);
+ }
+ }
+
+ return(NULL);
+}
+#endif
+
+/**********************************************************************//**
+Find an existing word, or if not found, create one and return it.
+@return specified word token */
+static
+fts_tokenizer_word_t*
+fts_tokenizer_word_get(
+/*===================*/
+ fts_cache_t* cache, /*!< in: cache */
+ fts_index_cache_t*
+ index_cache, /*!< in: index cache */
+ fts_string_t* text) /*!< in: node text */
+{
+ fts_tokenizer_word_t* word;
+ ib_rbt_bound_t parent;
+
+ ut_ad(rw_lock_own(&cache->lock, RW_LOCK_X));
+
+ /* If it is a stopword, do not index it */
+ if (!fts_check_token(text,
+ cache->stopword_info.cached_stopword,
+ index_cache->charset)) {
+
+ return(NULL);
+ }
+
+ /* Check if we found a match, if not then add word to tree. */
+ if (rbt_search(index_cache->words, &parent, text) != 0) {
+ mem_heap_t* heap;
+ fts_tokenizer_word_t new_word;
+
+ heap = static_cast<mem_heap_t*>(cache->sync_heap->arg);
+
+ new_word.nodes = ib_vector_create(
+ cache->sync_heap, sizeof(fts_node_t), 4);
+
+ fts_string_dup(&new_word.text, text, heap);
+
+ parent.last = rbt_add_node(
+ index_cache->words, &parent, &new_word);
+
+ /* Take into account the RB tree memory use and the vector. */
+ cache->total_size += sizeof(new_word)
+ + sizeof(ib_rbt_node_t)
+ + text->f_len
+ + (sizeof(fts_node_t) * 4)
+ + sizeof(*new_word.nodes);
+
+ ut_ad(rbt_validate(index_cache->words));
+ }
+
+ word = rbt_value(fts_tokenizer_word_t, parent.last);
+
+ return(word);
+}
+
+/**********************************************************************//**
+Add the given doc_id/word positions to the given node's ilist. */
+void
+fts_cache_node_add_positions(
+/*=========================*/
+ fts_cache_t* cache, /*!< in: cache */
+ fts_node_t* node, /*!< in: word node */
+ doc_id_t doc_id, /*!< in: doc id */
+ ib_vector_t* positions) /*!< in: fts_token_t::positions */
+{
+ ulint i;
+ byte* ptr;
+ byte* ilist;
+ ulint enc_len;
+ ulint last_pos;
+ byte* ptr_start;
+ ulint doc_id_delta;
+
+#ifdef UNIV_DEBUG
+ if (cache) {
+ ut_ad(rw_lock_own(&cache->lock, RW_LOCK_X));
+ }
+#endif /* UNIV_DEBUG */
+
+ ut_ad(doc_id >= node->last_doc_id);
+
+ /* Calculate the space required to store the ilist. */
+ doc_id_delta = (ulint)(doc_id - node->last_doc_id);
+ enc_len = fts_get_encoded_len(doc_id_delta);
+
+ last_pos = 0;
+ for (i = 0; i < ib_vector_size(positions); i++) {
+ ulint pos = *(static_cast<ulint*>(
+ ib_vector_get(positions, i)));
+
+ ut_ad(last_pos == 0 || pos > last_pos);
+
+ enc_len += fts_get_encoded_len(pos - last_pos);
+ last_pos = pos;
+ }
+
+ /* The 0x00 byte at the end of the token positions list. */
+ enc_len++;
+
+ if ((node->ilist_size_alloc - node->ilist_size) >= enc_len) {
+ /* No need to allocate more space, we can fit in the new
+ data at the end of the old one. */
+ ilist = NULL;
+ ptr = node->ilist + node->ilist_size;
+ } else {
+ ulint new_size = node->ilist_size + enc_len;
+
+ /* Over-reserve space by a fixed size for small lengths and
+ by 20% for lengths >= 48 bytes. */
+ if (new_size < 16) {
+ new_size = 16;
+ } else if (new_size < 32) {
+ new_size = 32;
+ } else if (new_size < 48) {
+ new_size = 48;
+ } else {
+ new_size = new_size * 6 / 5;
+ }
+
+ ilist = static_cast<byte*>(ut_malloc_nokey(new_size));
+ ptr = ilist + node->ilist_size;
+
+ node->ilist_size_alloc = new_size;
+ if (cache) {
+ cache->total_size += new_size;
+ }
+ }
+
+ ptr_start = ptr;
+
+ /* Encode the new fragment. */
+ ptr += fts_encode_int(doc_id_delta, ptr);
+
+ last_pos = 0;
+ for (i = 0; i < ib_vector_size(positions); i++) {
+ ulint pos = *(static_cast<ulint*>(
+ ib_vector_get(positions, i)));
+
+ ptr += fts_encode_int(pos - last_pos, ptr);
+ last_pos = pos;
+ }
+
+ *ptr++ = 0;
+
+ ut_a(enc_len == (ulint)(ptr - ptr_start));
+
+ if (ilist) {
+ /* Copy old ilist to the start of the new one and switch the
+ new one into place in the node. */
+ if (node->ilist_size > 0) {
+ memcpy(ilist, node->ilist, node->ilist_size);
+ ut_free(node->ilist);
+ if (cache) {
+ cache->total_size -= node->ilist_size;
+ }
+ }
+
+ node->ilist = ilist;
+ }
+
+ node->ilist_size += enc_len;
+
+ if (node->first_doc_id == FTS_NULL_DOC_ID) {
+ node->first_doc_id = doc_id;
+ }
+
+ node->last_doc_id = doc_id;
+ ++node->doc_count;
+}
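+
+/* Worked example (illustrative only), assuming the variable-length
+encoding in fts0vlc.ic in which the final byte of each integer has its
+high bit set: appending doc_id 102 to a node whose last_doc_id is 100,
+with token positions {5, 13, 40}, writes the doc id delta 2 as 0x82
+and the position deltas {5, 8, 27} as 0x85 0x88 0x9B, then the 0x00
+terminator, i.e. the five bytes 82 85 88 9B 00 are appended to
+node->ilist. */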
+
+/**********************************************************************//**
+Add document to the cache. */
+static
+void
+fts_cache_add_doc(
+/*==============*/
+ fts_cache_t* cache, /*!< in: cache */
+ fts_index_cache_t*
+ index_cache, /*!< in: index cache */
+ doc_id_t doc_id, /*!< in: doc id to add */
+ ib_rbt_t* tokens) /*!< in: document tokens */
+{
+ const ib_rbt_node_t* node;
+ ulint n_words;
+ fts_doc_stats_t* doc_stats;
+
+ if (!tokens) {
+ return;
+ }
+
+ ut_ad(rw_lock_own(&cache->lock, RW_LOCK_X));
+
+ n_words = rbt_size(tokens);
+
+ for (node = rbt_first(tokens); node; node = rbt_first(tokens)) {
+
+ fts_tokenizer_word_t* word;
+ fts_node_t* fts_node = NULL;
+ fts_token_t* token = rbt_value(fts_token_t, node);
+
+ /* Find and/or add token to the cache. */
+ word = fts_tokenizer_word_get(
+ cache, index_cache, &token->text);
+
+ if (!word) {
+ ut_free(rbt_remove_node(tokens, node));
+ continue;
+ }
+
+ if (ib_vector_size(word->nodes) > 0) {
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_last(word->nodes));
+ }
+
+ if (fts_node == NULL || fts_node->synced
+ || fts_node->ilist_size > FTS_ILIST_MAX_SIZE
+ || doc_id < fts_node->last_doc_id) {
+
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_push(word->nodes, NULL));
+
+ memset(fts_node, 0x0, sizeof(*fts_node));
+
+ cache->total_size += sizeof(*fts_node);
+ }
+
+ fts_cache_node_add_positions(
+ cache, fts_node, doc_id, token->positions);
+
+ ut_free(rbt_remove_node(tokens, node));
+ }
+
+ ut_a(rbt_empty(tokens));
+
+ /* Add to doc ids processed so far. */
+ doc_stats = static_cast<fts_doc_stats_t*>(
+ ib_vector_push(index_cache->doc_stats, NULL));
+
+ doc_stats->doc_id = doc_id;
+ doc_stats->word_count = n_words;
+
+ /* Add the doc stats memory usage too. */
+ cache->total_size += sizeof(*doc_stats);
+
+ if (doc_id > cache->sync->max_doc_id) {
+ cache->sync->max_doc_id = doc_id;
+ }
+}
+
+/****************************************************************//**
+Drops a table. If the table can't be found we return DB_FAIL.
+@return DB_SUCCESS, DB_FAIL if the table did not exist, or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_drop_table(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ const char* table_name) /*!< in: table to drop */
+{
+ dict_table_t* table;
+ dberr_t error = DB_SUCCESS;
+
+ /* Check that the table exists in our data dictionary.
+ Similar to regular drop table case, we will open table with
+ DICT_ERR_IGNORE_INDEX_ROOT and DICT_ERR_IGNORE_CORRUPT option */
+ table = dict_table_open_on_name(
+ table_name, TRUE, FALSE,
+ static_cast<dict_err_ignore_t>(
+ DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT));
+
+ if (table != 0) {
+
+ dict_table_close(table, TRUE, FALSE);
+
+ /* Pass nonatomic=false (don't allow data dict unlock),
+ because the transaction may hold locks on SYS_* tables from
+ previous calls to fts_drop_table(). */
+ error = row_drop_table_for_mysql(table_name, trx,
+ SQLCOM_DROP_DB, false, false);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ib::error() << "Unable to drop FTS index aux table "
+ << table_name << ": " << error;
+ }
+ } else {
+ error = DB_FAIL;
+ }
+
+ return(error);
+}
+
+/****************************************************************//**
+Rename a single auxiliary table due to database name change.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_rename_one_aux_table(
+/*=====================*/
+ const char* new_name, /*!< in: new parent tbl name */
+ const char* fts_table_old_name, /*!< in: old aux tbl name */
+ trx_t* trx) /*!< in: transaction */
+{
+ char fts_table_new_name[MAX_TABLE_NAME_LEN];
+ ulint new_db_name_len = dict_get_db_name_len(new_name);
+ ulint old_db_name_len = dict_get_db_name_len(fts_table_old_name);
+ ulint table_new_name_len = strlen(fts_table_old_name)
+ + new_db_name_len - old_db_name_len;
+
+ /* Check if the new and old database names are the same, if so,
+ nothing to do */
+ ut_ad((new_db_name_len != old_db_name_len)
+ || strncmp(new_name, fts_table_old_name, old_db_name_len) != 0);
+
+ /* Get the database name from "new_name", and table name
+ from the fts_table_old_name */
+ strncpy(fts_table_new_name, new_name, new_db_name_len);
+ strncpy(fts_table_new_name + new_db_name_len,
+ strchr(fts_table_old_name, '/'),
+ table_new_name_len - new_db_name_len);
+ fts_table_new_name[table_new_name_len] = 0;
+
+ return row_rename_table_for_mysql(
+ fts_table_old_name, fts_table_new_name, trx, false, false);
+}
+
+/****************************************************************//**
+Rename the auxiliary tables of all fts indexes of a table. This rename
+is due to a database name change
+@return DB_SUCCESS or error code */
+dberr_t
+fts_rename_aux_tables(
+/*==================*/
+ dict_table_t* table, /*!< in: user Table */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx) /*!< in: transaction */
+{
+ ulint i;
+ fts_table_t fts_table;
+
+ FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+ dberr_t err = DB_SUCCESS;
+ char old_table_name[MAX_FULL_NAME_LEN];
+
+ /* Rename common auxiliary tables */
+ for (i = 0; fts_common_tables[i] != NULL; ++i) {
+ fts_table.suffix = fts_common_tables[i];
+ fts_get_table_name(&fts_table, old_table_name, true);
+
+ err = fts_rename_one_aux_table(new_name, old_table_name, trx);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ fts_t* fts = table->fts;
+
+ /* Rename index specific auxiliary tables */
+ for (i = 0; fts->indexes != 0 && i < ib_vector_size(fts->indexes);
+ ++i) {
+ dict_index_t* index;
+
+ index = static_cast<dict_index_t*>(
+ ib_vector_getp(fts->indexes, i));
+
+ FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index);
+
+ for (ulint j = 0; j < FTS_NUM_AUX_INDEX; ++j) {
+ fts_table.suffix = fts_get_suffix(j);
+ fts_get_table_name(&fts_table, old_table_name, true);
+
+ err = fts_rename_one_aux_table(
+ new_name, old_table_name, trx);
+
+ DBUG_EXECUTE_IF("fts_rename_failure",
+ err = DB_DEADLOCK;
+ fts_sql_rollback(trx););
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Drops the common ancillary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been called
+before this.
+@param[in] trx transaction to drop fts common table
+@param[in] fts_table table with an FTS index
+@param[in]	drop_orphan	true if the function is used to drop
+				orphaned tables
+@return DB_SUCCESS or error code */
+static dberr_t
+fts_drop_common_tables(
+ trx_t* trx,
+ fts_table_t* fts_table,
+ bool drop_orphan=false)
+{
+ ulint i;
+ dberr_t error = DB_SUCCESS;
+
+ for (i = 0; fts_common_tables[i] != NULL; ++i) {
+ dberr_t err;
+ char table_name[MAX_FULL_NAME_LEN];
+
+ fts_table->suffix = fts_common_tables[i];
+ fts_get_table_name(fts_table, table_name, true);
+
+ err = fts_drop_table(trx, table_name);
+
+ /* We only return the status of the last error. */
+ if (err != DB_SUCCESS && err != DB_FAIL) {
+ error = err;
+ }
+
+ if (drop_orphan && err == DB_FAIL) {
+ char* path = fil_make_filepath(
+ NULL, table_name, IBD, false);
+ if (path != NULL) {
+ os_file_delete_if_exists(
+ innodb_data_file_key, path, NULL);
+ ut_free(path);
+ }
+ }
+ }
+
+ return(error);
+}
+
+/****************************************************************//**
+Since we do a horizontal split on the index table, we need to drop
+all the split tables.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+fts_drop_index_split_tables(
+/*========================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index) /*!< in: fts instance */
+
+{
+ ulint i;
+ fts_table_t fts_table;
+ dberr_t error = DB_SUCCESS;
+
+ FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index);
+
+ for (i = 0; i < FTS_NUM_AUX_INDEX; ++i) {
+ dberr_t err;
+ char table_name[MAX_FULL_NAME_LEN];
+
+ fts_table.suffix = fts_get_suffix(i);
+ fts_get_table_name(&fts_table, table_name, true);
+
+ err = fts_drop_table(trx, table_name);
+
+ /* We only return the status of the last error. */
+ if (err != DB_SUCCESS && err != DB_FAIL) {
+ error = err;
+ }
+ }
+
+ return(error);
+}
+
+/****************************************************************//**
+Drops FTS auxiliary tables for an FTS index
+@return DB_SUCCESS or error code */
+dberr_t
+fts_drop_index_tables(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index) /*!< in: Index to drop */
+{
+ return(fts_drop_index_split_tables(trx, index));
+}
+
+/****************************************************************//**
+Drops FTS ancillary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been called
+before this.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_drop_all_index_tables(
+/*======================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_t* fts) /*!< in: fts instance */
+{
+ dberr_t error = DB_SUCCESS;
+
+ for (ulint i = 0;
+ fts->indexes != 0 && i < ib_vector_size(fts->indexes);
+ ++i) {
+
+ dberr_t err;
+ dict_index_t* index;
+
+ index = static_cast<dict_index_t*>(
+ ib_vector_getp(fts->indexes, i));
+
+ err = fts_drop_index_tables(trx, index);
+
+ if (err != DB_SUCCESS) {
+ error = err;
+ }
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Drops the ancillary tables needed for supporting an FTS index on a
+given table. row_mysql_lock_data_dictionary must have been called before
+this.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_drop_tables(
+/*============*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table) /*!< in: table has the FTS index */
+{
+ dberr_t error;
+ fts_table_t fts_table;
+
+ FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+ /* TODO: This is not atomic and can cause problems during recovery. */
+
+ error = fts_drop_common_tables(trx, &fts_table);
+
+ if (error == DB_SUCCESS && table->fts) {
+ error = fts_drop_all_index_tables(trx, table->fts);
+ }
+
+ return(error);
+}
+
+/** Create dict_table_t object for FTS Aux tables.
+@param[in] aux_table_name FTS Aux table name
+@param[in] table table object of FTS Index
+@param[in] n_cols number of columns for FTS Aux table
+@return table object for FTS Aux table */
+static
+dict_table_t*
+fts_create_in_mem_aux_table(
+ const char* aux_table_name,
+ const dict_table_t* table,
+ ulint n_cols)
+{
+ dict_table_t* new_table = dict_mem_table_create(
+ aux_table_name, NULL, n_cols, 0, table->flags,
+ table->space_id == TRX_SYS_SPACE
+ ? 0 : table->space_id == SRV_TMP_SPACE_ID
+ ? DICT_TF2_TEMPORARY : DICT_TF2_USE_FILE_PER_TABLE);
+
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ ut_ad(table->data_dir_path != NULL);
+ new_table->data_dir_path = mem_heap_strdup(
+ new_table->heap, table->data_dir_path);
+ }
+
+ return(new_table);
+}
+
+/** Function to create one FTS common table.
+@param[in,out] trx InnoDB transaction
+@param[in] table Table that has FTS Index
+@param[in] fts_table_name FTS AUX table name
+@param[in] fts_suffix FTS AUX table suffix
+@param[in,out] heap temporary memory heap
+@return table object if created, else NULL */
+static
+dict_table_t*
+fts_create_one_common_table(
+ trx_t* trx,
+ const dict_table_t* table,
+ const char* fts_table_name,
+ const char* fts_suffix,
+ mem_heap_t* heap)
+{
+ dict_table_t* new_table;
+ dberr_t error;
+ bool is_config = strcmp(fts_suffix, "CONFIG") == 0;
+
+ if (!is_config) {
+
+ new_table = fts_create_in_mem_aux_table(
+ fts_table_name, table, FTS_DELETED_TABLE_NUM_COLS);
+
+ dict_mem_table_add_col(
+ new_table, heap, "doc_id", DATA_INT, DATA_UNSIGNED,
+ FTS_DELETED_TABLE_COL_LEN);
+ } else {
+ /* Config table has different schema. */
+ new_table = fts_create_in_mem_aux_table(
+ fts_table_name, table, FTS_CONFIG_TABLE_NUM_COLS);
+
+ dict_mem_table_add_col(
+ new_table, heap, "key", DATA_VARCHAR, 0,
+ FTS_CONFIG_TABLE_KEY_COL_LEN);
+
+ dict_mem_table_add_col(
+ new_table, heap, "value", DATA_VARCHAR, DATA_NOT_NULL,
+ FTS_CONFIG_TABLE_VALUE_COL_LEN);
+ }
+
+ dict_table_add_system_columns(new_table, heap);
+ error = row_create_table_for_mysql(new_table, trx,
+ FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY);
+ if (error == DB_SUCCESS) {
+
+ dict_index_t* index = dict_mem_index_create(
+ new_table, "FTS_COMMON_TABLE_IND",
+ DICT_UNIQUE|DICT_CLUSTERED, 1);
+
+ if (!is_config) {
+ dict_mem_index_add_field(index, "doc_id", 0);
+ } else {
+ dict_mem_index_add_field(index, "key", 0);
+ }
+
+ /* We save and restore trx->dict_operation because
+ row_create_index_for_mysql() changes the operation to
+ TRX_DICT_OP_TABLE. */
+ trx_dict_op_t op = trx_get_dict_operation(trx);
+
+ error = row_create_index_for_mysql(index, trx, NULL);
+
+ trx->dict_operation = op;
+ } else {
+err_exit:
+ new_table = NULL;
+ ib::warn() << "Failed to create FTS common table "
+ << fts_table_name;
+ trx->error_state = error;
+ return NULL;
+ }
+
+ if (error != DB_SUCCESS) {
+ dict_mem_table_free(new_table);
+ trx->error_state = DB_SUCCESS;
+ row_drop_table_for_mysql(fts_table_name, trx, SQLCOM_DROP_DB);
+ goto err_exit;
+ }
+
+ return(new_table);
+}
+
+/** Creates the common auxiliary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been called
+before this.
+The following tables are created.
+CREATE TABLE $FTS_PREFIX_DELETED
+ (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_DELETED_CACHE
+ (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_BEING_DELETED
+ (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_BEING_DELETED_CACHE
+ (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_CONFIG
+ (key CHAR(50), value CHAR(200), UNIQUE CLUSTERED INDEX on key)
+@param[in,out] trx transaction
+@param[in,out] table table with FTS index
+@param[in] skip_doc_id_index Skip index on doc id
+@return DB_SUCCESS if succeed */
+dberr_t
+fts_create_common_tables(
+ trx_t* trx,
+ dict_table_t* table,
+ bool skip_doc_id_index)
+{
+ dberr_t error;
+ que_t* graph;
+ fts_table_t fts_table;
+ mem_heap_t* heap = mem_heap_create(1024);
+ pars_info_t* info;
+ char fts_name[MAX_FULL_NAME_LEN];
+ char full_name[sizeof(fts_common_tables) / sizeof(char*)]
+ [MAX_FULL_NAME_LEN];
+
+ dict_index_t* index = NULL;
+ trx_dict_op_t op;
+ /* common_tables vector is used for dropping FTS common tables
+ on error condition. */
+ std::vector<dict_table_t*> common_tables;
+ std::vector<dict_table_t*>::const_iterator it;
+
+ FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+ op = trx_get_dict_operation(trx);
+
+ error = fts_drop_common_tables(trx, &fts_table);
+
+ if (error != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+
+ /* Create the FTS tables that are common to an FTS index. */
+ for (ulint i = 0; fts_common_tables[i] != NULL; ++i) {
+
+ fts_table.suffix = fts_common_tables[i];
+ fts_get_table_name(&fts_table, full_name[i], true);
+ dict_table_t* common_table = fts_create_one_common_table(
+ trx, table, full_name[i], fts_table.suffix, heap);
+
+ if (!common_table) {
+ trx->error_state = DB_SUCCESS;
+ error = DB_ERROR;
+ goto func_exit;
+ } else {
+ common_tables.push_back(common_table);
+ }
+
+ mem_heap_empty(heap);
+
+ DBUG_EXECUTE_IF("ib_fts_aux_table_error",
+ /* Return error after creating FTS_AUX_CONFIG table. */
+ if (i == 4) {
+ error = DB_ERROR;
+ goto func_exit;
+ }
+ );
+
+ }
+
+ /* Write the default settings to the config table. */
+ info = pars_info_create();
+
+ fts_table.suffix = "CONFIG";
+ fts_get_table_name(&fts_table, fts_name, true);
+ pars_info_bind_id(info, true, "config_table", fts_name);
+
+ graph = fts_parse_sql_no_dict_lock(
+ info, fts_config_table_insert_values_sql);
+
+ error = fts_eval_sql(trx, graph);
+
+ que_graph_free(graph);
+
+ if (error != DB_SUCCESS || skip_doc_id_index) {
+
+ goto func_exit;
+ }
+
+ index = dict_mem_index_create(table, FTS_DOC_ID_INDEX_NAME,
+ DICT_UNIQUE, 1);
+ dict_mem_index_add_field(index, FTS_DOC_ID_COL_NAME, 0);
+
+ op = trx_get_dict_operation(trx);
+
+ error = row_create_index_for_mysql(index, trx, NULL);
+
+func_exit:
+ if (error != DB_SUCCESS) {
+ for (it = common_tables.begin(); it != common_tables.end();
+ ++it) {
+ row_drop_table_for_mysql((*it)->name.m_name, trx,
+ SQLCOM_DROP_DB);
+ }
+ }
+
+ trx->dict_operation = op;
+
+ common_tables.clear();
+ mem_heap_free(heap);
+
+ return(error);
+}
+
+/** Create one FTS auxiliary index table for an FTS index.
+@param[in,out] trx transaction
+@param[in] index the index instance
+@param[in] fts_table fts_table structure
+@param[in,out] heap temporary memory heap
+@see row_merge_create_fts_sort_index()
+@return table object if created, else NULL */
+static
+dict_table_t*
+fts_create_one_index_table(
+ trx_t* trx,
+ const dict_index_t* index,
+ const fts_table_t* fts_table,
+ mem_heap_t* heap)
+{
+ dict_field_t* field;
+ dict_table_t* new_table;
+ char table_name[MAX_FULL_NAME_LEN];
+ dberr_t error;
+ CHARSET_INFO* charset;
+
+ ut_ad(index->type & DICT_FTS);
+
+ fts_get_table_name(fts_table, table_name, true);
+
+ new_table = fts_create_in_mem_aux_table(
+ table_name, fts_table->table,
+ FTS_AUX_INDEX_TABLE_NUM_COLS);
+
+ field = dict_index_get_nth_field(index, 0);
+ charset = fts_get_charset(field->col->prtype);
+
+ dict_mem_table_add_col(new_table, heap, "word",
+ charset == &my_charset_latin1
+ ? DATA_VARCHAR : DATA_VARMYSQL,
+ field->col->prtype,
+ FTS_MAX_WORD_LEN_IN_CHAR
+ * unsigned(field->col->mbmaxlen));
+
+ dict_mem_table_add_col(new_table, heap, "first_doc_id", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED,
+ FTS_INDEX_FIRST_DOC_ID_LEN);
+
+ dict_mem_table_add_col(new_table, heap, "last_doc_id", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED,
+ FTS_INDEX_LAST_DOC_ID_LEN);
+
+ dict_mem_table_add_col(new_table, heap, "doc_count", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED,
+ FTS_INDEX_DOC_COUNT_LEN);
+
+ /* The precise type calculation is as follows:
+	least significant byte: MySQL type code (not applicable for sys cols)
+ second least : DATA_NOT_NULL | DATA_BINARY_TYPE
+ third least : the MySQL charset-collation code (DATA_MTYPE_MAX) */
+
+ dict_mem_table_add_col(
+ new_table, heap, "ilist", DATA_BLOB,
+ (DATA_MTYPE_MAX << 16) | DATA_UNSIGNED | DATA_NOT_NULL,
+ FTS_INDEX_ILIST_LEN);
+
+ dict_table_add_system_columns(new_table, heap);
+ error = row_create_table_for_mysql(new_table, trx,
+ FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY);
+
+ if (error == DB_SUCCESS) {
+ dict_index_t* index = dict_mem_index_create(
+ new_table, "FTS_INDEX_TABLE_IND",
+ DICT_UNIQUE|DICT_CLUSTERED, 2);
+ dict_mem_index_add_field(index, "word", 0);
+ dict_mem_index_add_field(index, "first_doc_id", 0);
+
+ trx_dict_op_t op = trx_get_dict_operation(trx);
+
+ error = row_create_index_for_mysql(index, trx, NULL);
+
+ trx->dict_operation = op;
+ } else {
+err_exit:
+ new_table = NULL;
+ ib::warn() << "Failed to create FTS index table "
+ << table_name;
+ trx->error_state = error;
+ return NULL;
+ }
+
+ if (error != DB_SUCCESS) {
+ dict_mem_table_free(new_table);
+ trx->error_state = DB_SUCCESS;
+ row_drop_table_for_mysql(table_name, trx, SQLCOM_DROP_DB);
+ goto err_exit;
+ }
+
+ return(new_table);
+}
+
+/** Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table. row_mysql_lock_data_dictionary must have
+been called before this.
+
+All FTS AUX Index tables have the following schema.
+CREATE TABLE $FTS_PREFIX_INDEX_[1-6](
+	word VARCHAR(FTS_MAX_WORD_LEN),
+	first_doc_id BIGINT UNSIGNED NOT NULL,
+	last_doc_id BIGINT UNSIGNED NOT NULL,
+	doc_count INT UNSIGNED NOT NULL,
+ ilist VARBINARY NOT NULL,
+ UNIQUE CLUSTERED INDEX ON (word, first_doc_id))
+@param[in,out] trx dictionary transaction
+@param[in] index fulltext index
+@param[in] id table id
+@return DB_SUCCESS or error code */
+dberr_t
+fts_create_index_tables(trx_t* trx, const dict_index_t* index, table_id_t id)
+{
+ ulint i;
+ fts_table_t fts_table;
+ dberr_t error = DB_SUCCESS;
+ mem_heap_t* heap = mem_heap_create(1024);
+
+ fts_table.type = FTS_INDEX_TABLE;
+ fts_table.index_id = index->id;
+ fts_table.table_id = id;
+ fts_table.table = index->table;
+
+ /* aux_idx_tables vector is used for dropping FTS AUX INDEX
+ tables on error condition. */
+ std::vector<dict_table_t*> aux_idx_tables;
+ std::vector<dict_table_t*>::const_iterator it;
+
+ for (i = 0; i < FTS_NUM_AUX_INDEX && error == DB_SUCCESS; ++i) {
+ dict_table_t* new_table;
+
+ /* Create the FTS auxiliary tables that are specific
+ to an FTS index. We need to preserve the table_id %s
+ which fts_parse_sql_no_dict_lock() will fill in for us. */
+ fts_table.suffix = fts_get_suffix(i);
+
+ new_table = fts_create_one_index_table(
+ trx, index, &fts_table, heap);
+
+ if (new_table == NULL) {
+ error = DB_FAIL;
+ break;
+ } else {
+ aux_idx_tables.push_back(new_table);
+ }
+
+ mem_heap_empty(heap);
+
+ DBUG_EXECUTE_IF("ib_fts_index_table_error",
+ /* Return error after creating FTS_INDEX_5
+ aux table. */
+ if (i == 4) {
+ error = DB_FAIL;
+ break;
+ }
+ );
+ }
+
+ if (error != DB_SUCCESS) {
+
+ for (it = aux_idx_tables.begin(); it != aux_idx_tables.end();
+ ++it) {
+ row_drop_table_for_mysql((*it)->name.m_name, trx,
+ SQLCOM_DROP_DB);
+ }
+ }
+
+ aux_idx_tables.clear();
+ mem_heap_free(heap);
+
+ return(error);
+}
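+
+/* Note: FTS_NUM_AUX_INDEX is 6, so the loop above creates six
+auxiliary tables per fulltext index, with suffixes INDEX_1 .. INDEX_6
+(see the schema comment above). When words are written out,
+fts_select_index() partitions them across these six tables based on
+the word's leading characters and charset, spreading the write
+load. */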
+
+/******************************************************************//**
+Calculate the new state of a row given the existing state and a new event.
+@return new state of row */
+static
+fts_row_state
+fts_trx_row_get_new_state(
+/*======================*/
+ fts_row_state old_state, /*!< in: existing state of row */
+ fts_row_state event) /*!< in: new event */
+{
+ /* The rules for transforming states:
+
+ I = inserted
+ M = modified
+ D = deleted
+ N = nothing
+
+ M+D -> D:
+
+ If the row existed before the transaction started and it is modified
+ during the transaction, followed by a deletion of the row, only the
+ deletion will be signaled.
+
+ M+ -> M:
+
+ If the row existed before the transaction started and it is modified
+ more than once during the transaction, only the last modification
+ will be signaled.
+
+ IM*D -> N:
+
+ If a new row is added during the transaction (and possibly modified
+ after its initial insertion) but it is deleted before the end of the
+ transaction, nothing will be signaled.
+
+ IM* -> I:
+
+ If a new row is added during the transaction and modified after its
+ initial insertion, only the addition will be signaled.
+
+ M*DI -> M:
+
+ If the row existed before the transaction started and it is deleted,
+ then re-inserted, only a modification will be signaled. Note that
+ this case is only possible if the table is using the row's primary
+ key for FTS row ids, since those can be re-inserted by the user,
+ which is not true for InnoDB generated row ids.
+
+ It is easily seen that the above rules decompose such that we do not
+ need to store the row's entire history of events. Instead, we can
+ store just one state for the row and update that when new events
+ arrive. Then we can implement the above rules as a two-dimensional
+ look-up table, and get checking of invalid combinations "for free"
+ in the process. */
+
+ /* The lookup table for transforming states. old_state is the
+ Y-axis, event is the X-axis. */
+ static const fts_row_state table[4][4] = {
+ /* I M D N */
+ /* I */ { FTS_INVALID, FTS_INSERT, FTS_NOTHING, FTS_INVALID },
+ /* M */ { FTS_INVALID, FTS_MODIFY, FTS_DELETE, FTS_INVALID },
+ /* D */ { FTS_MODIFY, FTS_INVALID, FTS_INVALID, FTS_INVALID },
+ /* N */ { FTS_INVALID, FTS_INVALID, FTS_INVALID, FTS_INVALID }
+ };
+
+ fts_row_state result;
+
+ ut_a(old_state < FTS_INVALID);
+ ut_a(event < FTS_INVALID);
+
+ result = table[(int) old_state][(int) event];
+ ut_a(result != FTS_INVALID);
+
+ return(result);
+}
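+
+/* Worked example of the lookup table above: a row in state M
+(modified earlier in this transaction) that receives a D (delete)
+event maps to row M, column D, i.e. FTS_DELETE -- the M+D -> D rule.
+Likewise, a row in state I that receives a D event yields
+FTS_NOTHING (IM*D -> N); fts_trx_table_add_op() below then removes
+such a row from its rows tree. */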
+
+/******************************************************************//**
+Create a savepoint instance.
+@return savepoint instance */
+static
+fts_savepoint_t*
+fts_savepoint_create(
+/*=================*/
+ ib_vector_t* savepoints, /*!< in/out: vector of savepoints */
+ const char* name, /*!< in: savepoint name */
+ mem_heap_t* heap) /*!< in: heap */
+{
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_push(savepoints, NULL));
+
+ memset(savepoint, 0x0, sizeof(*savepoint));
+
+ if (name) {
+ savepoint->name = mem_heap_strdup(heap, name);
+ }
+
+ savepoint->tables = rbt_create(
+ sizeof(fts_trx_table_t*), fts_trx_table_cmp);
+
+ return(savepoint);
+}
+
+/******************************************************************//**
+Create an FTS trx.
+@return FTS trx */
+fts_trx_t*
+fts_trx_create(
+/*===========*/
+ trx_t* trx) /*!< in/out: InnoDB
+ transaction */
+{
+ fts_trx_t* ftt;
+ ib_alloc_t* heap_alloc;
+ mem_heap_t* heap = mem_heap_create(1024);
+ trx_named_savept_t* savep;
+
+ ut_a(trx->fts_trx == NULL);
+
+ ftt = static_cast<fts_trx_t*>(mem_heap_alloc(heap, sizeof(fts_trx_t)));
+ ftt->trx = trx;
+ ftt->heap = heap;
+
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ ftt->savepoints = static_cast<ib_vector_t*>(ib_vector_create(
+ heap_alloc, sizeof(fts_savepoint_t), 4));
+
+ ftt->last_stmt = static_cast<ib_vector_t*>(ib_vector_create(
+ heap_alloc, sizeof(fts_savepoint_t), 4));
+
+ /* Default instance has no name and no heap. */
+ fts_savepoint_create(ftt->savepoints, NULL, NULL);
+ fts_savepoint_create(ftt->last_stmt, NULL, NULL);
+
+ /* Copy savepoints that already set before. */
+ for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+ savep != NULL;
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
+
+ fts_savepoint_take(ftt, savep->name);
+ }
+
+ return(ftt);
+}
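+
+/* Note: both vectors created above always contain at least the
+default unnamed savepoint pushed by fts_savepoint_create(), so the
+ib_vector_last() calls in fts_trx_init() and fts_commit() can rely
+on the vectors being non-empty. */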
+
+/******************************************************************//**
+Create an FTS trx table.
+@return FTS trx table */
+static
+fts_trx_table_t*
+fts_trx_table_create(
+/*=================*/
+ fts_trx_t* fts_trx, /*!< in: FTS trx */
+ dict_table_t* table) /*!< in: table */
+{
+ fts_trx_table_t* ftt;
+
+ ftt = static_cast<fts_trx_table_t*>(
+ mem_heap_alloc(fts_trx->heap, sizeof(*ftt)));
+
+ memset(ftt, 0x0, sizeof(*ftt));
+
+ ftt->table = table;
+ ftt->fts_trx = fts_trx;
+
+ ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Clone an FTS trx table.
+@return FTS trx table */
+static
+fts_trx_table_t*
+fts_trx_table_clone(
+/*=================*/
+ const fts_trx_table_t* ftt_src) /*!< in: FTS trx table to clone */
+{
+ fts_trx_table_t* ftt;
+
+ ftt = static_cast<fts_trx_table_t*>(
+ mem_heap_alloc(ftt_src->fts_trx->heap, sizeof(*ftt)));
+
+ memset(ftt, 0x0, sizeof(*ftt));
+
+ ftt->table = ftt_src->table;
+ ftt->fts_trx = ftt_src->fts_trx;
+
+ ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+
+ /* Copy the rb tree values to the new savepoint. */
+ rbt_merge_uniq(ftt->rows, ftt_src->rows);
+
+ /* These are only added on commit. At this stage we only have
+ the updated row state. */
+ ut_a(ftt_src->added_doc_ids == NULL);
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Get the FTS trx table instance for the given table, creating it if
+it does not yet exist in the current savepoint.
+@return FTS trx table instance */
+static
+fts_trx_table_t*
+fts_trx_init(
+/*=========*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: FTS table instance */
+ ib_vector_t* savepoints) /*!< in: Savepoints */
+{
+ fts_trx_table_t* ftt;
+ ib_rbt_bound_t parent;
+ ib_rbt_t* tables;
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints));
+
+ tables = savepoint->tables;
+ rbt_search_cmp(tables, &parent, &table->id, fts_trx_table_id_cmp, NULL);
+
+ if (parent.result == 0) {
+ fts_trx_table_t** fttp;
+
+ fttp = rbt_value(fts_trx_table_t*, parent.last);
+ ftt = *fttp;
+ } else {
+ ftt = fts_trx_table_create(trx->fts_trx, table);
+ rbt_add_node(tables, &parent, &ftt);
+ }
+
+ ut_a(ftt->table == table);
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Record an operation on a doc id in the given FTS trx table's rows tree. */
+static
+void
+fts_trx_table_add_op(
+/*=================*/
+ fts_trx_table_t*ftt, /*!< in: FTS trx table */
+ doc_id_t doc_id, /*!< in: doc id */
+ fts_row_state state, /*!< in: state of the row */
+ ib_vector_t* fts_indexes) /*!< in: FTS indexes affected */
+{
+ ib_rbt_t* rows;
+ ib_rbt_bound_t parent;
+
+ rows = ftt->rows;
+ rbt_search(rows, &parent, &doc_id);
+
+ /* Row id found, update state, and if new state is FTS_NOTHING,
+ we delete the row from our tree. */
+ if (parent.result == 0) {
+ fts_trx_row_t* row = rbt_value(fts_trx_row_t, parent.last);
+
+ row->state = fts_trx_row_get_new_state(row->state, state);
+
+ if (row->state == FTS_NOTHING) {
+ if (row->fts_indexes) {
+ ib_vector_free(row->fts_indexes);
+ }
+
+ ut_free(rbt_remove_node(rows, parent.last));
+ row = NULL;
+ } else if (row->fts_indexes != NULL) {
+ ib_vector_free(row->fts_indexes);
+ row->fts_indexes = fts_indexes;
+ }
+
+ } else { /* Row-id not found, create a new one. */
+ fts_trx_row_t row;
+
+ row.doc_id = doc_id;
+ row.state = state;
+ row.fts_indexes = fts_indexes;
+
+ rbt_add_node(rows, &parent, &row);
+ }
+}
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table.
+The operation is recorded both in the transaction-level and the
+statement-level savepoint trees. */
+void
+fts_trx_add_op(
+/*===========*/
+ trx_t* trx, /*!< in: InnoDB transaction */
+ dict_table_t* table, /*!< in: table */
+ doc_id_t doc_id, /*!< in: new doc id */
+ fts_row_state state, /*!< in: state of the row */
+ ib_vector_t* fts_indexes) /*!< in: FTS indexes affected
+ (NULL=all) */
+{
+ fts_trx_table_t* tran_ftt;
+ fts_trx_table_t* stmt_ftt;
+
+ if (!trx->fts_trx) {
+ trx->fts_trx = fts_trx_create(trx);
+ }
+
+ tran_ftt = fts_trx_init(trx, table, trx->fts_trx->savepoints);
+ stmt_ftt = fts_trx_init(trx, table, trx->fts_trx->last_stmt);
+
+ fts_trx_table_add_op(tran_ftt, doc_id, state, fts_indexes);
+ fts_trx_table_add_op(stmt_ftt, doc_id, state, fts_indexes);
+}
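+
+/* A minimal usage sketch (hypothetical caller; the real call sites
+live in the row insert/update/delete paths, named here only as an
+assumption):
+
+ fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL);
+ ...
+ fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+
+Per the state table in fts_trx_row_get_new_state(), the second call
+collapses the row to FTS_NOTHING (IM*D -> N) and it is removed from
+the rows tree, so nothing is written for this doc id at commit. */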
+
+/******************************************************************//**
+Fetch callback that converts a textual document id to a binary value and
+stores it in the given place.
+@return always returns FALSE */
+static
+ibool
+fts_fetch_store_doc_id(
+/*===================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: doc_id_t* to store
+ doc_id in */
+{
+ int n_parsed;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ doc_id_t* doc_id = static_cast<doc_id_t*>(user_arg);
+ dfield_t* dfield = que_node_get_val(node->select_list);
+ dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ char buf[32];
+
+ ut_a(dtype_get_mtype(type) == DATA_VARCHAR);
+ ut_a(len > 0 && len < sizeof(buf));
+
+ memcpy(buf, dfield_get_data(dfield), len);
+ buf[len] = '\0';
+
+ n_parsed = sscanf(buf, FTS_DOC_ID_FORMAT, doc_id);
+ ut_a(n_parsed == 1);
+
+ return(FALSE);
+}
+
+#ifdef FTS_CACHE_SIZE_DEBUG
+/******************************************************************//**
+Get the max cache size in bytes. If there is an error reading the
+value we simply print an error message here and return the default
+value to the caller.
+@return max cache size in bytes */
+static
+ulint
+fts_get_max_cache_size(
+/*===================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table) /*!< in: table instance */
+{
+ dberr_t error;
+ fts_string_t value;
+ ulong cache_size_in_mb;
+
+ /* Set to the default value. */
+ cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB;
+
+ /* We set the length of value to the max bytes it can hold. This
+ information is used by the callback that reads the value. */
+ value.f_n_char = 0;
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = ut_malloc_nokey(value.f_len + 1);
+
+ error = fts_config_get_value(
+ trx, fts_table, FTS_MAX_CACHE_SIZE_IN_MB, &value);
+
+ if (UNIV_LIKELY(error == DB_SUCCESS)) {
+ value.f_str[value.f_len] = 0;
+ cache_size_in_mb = strtoul((char*) value.f_str, NULL, 10);
+
+ if (cache_size_in_mb > FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB) {
+
+ ib::warn() << "FTS max cache size ("
+ << cache_size_in_mb << ") out of range."
+ " Minimum value is "
+ << FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB
+ << "MB and the maximum value is "
+ << FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB
+ << "MB, setting cache size to upper limit";
+
+ cache_size_in_mb = FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB;
+
+ } else if (cache_size_in_mb
+ < FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB) {
+
+ ib::warn() << "FTS max cache size ("
+ << cache_size_in_mb << ") out of range."
+ " Minimum value is "
+ << FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB
+ << "MB and the maximum value is "
+ << FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB
+ << "MB, setting cache size to lower limit";
+
+ cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB;
+ }
+ } else {
+ ib::error() << "(" << error << ") reading max"
+ " cache config value from config table "
+ << fts_table->table->name;
+ }
+
+ ut_free(value.f_str);
+
+ return(cache_size_in_mb * 1024 * 1024);
+}
+#endif
+
+/*********************************************************************//**
+Update the next and last Doc ID in the CONFIG table to be the input
+"doc_id" value (+ 1). This is done after each FTS index build or
+table truncate. */
+void
+fts_update_next_doc_id(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t doc_id) /*!< in: DOC ID to set */
+{
+ table->fts->cache->synced_doc_id = doc_id;
+ table->fts->cache->next_doc_id = doc_id + 1;
+
+ table->fts->cache->first_doc_id = table->fts->cache->next_doc_id;
+
+ fts_update_sync_doc_id(
+ table, table->fts->cache->synced_doc_id, trx);
+}
+
+/*********************************************************************//**
+Get the next available document id.
+@return DB_SUCCESS if OK */
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t* doc_id) /*!< out: new document id */
+{
+ fts_cache_t* cache = table->fts->cache;
+
+ /* If the Doc ID system has not yet been initialized, we
+ will consult the CONFIG table and user table to re-establish
+ the initial value of the Doc ID */
+ if (cache->first_doc_id == FTS_NULL_DOC_ID) {
+ fts_init_doc_id(table);
+ }
+
+ if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ *doc_id = FTS_NULL_DOC_ID;
+ return(DB_SUCCESS);
+ }
+
+ DEBUG_SYNC_C("get_next_FTS_DOC_ID");
+ mutex_enter(&cache->doc_id_lock);
+ *doc_id = cache->next_doc_id++;
+ mutex_exit(&cache->doc_id_lock);
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Fetch the Doc ID from the CONFIG table and compare it with the
+Doc ID supplied; store the larger of the two back in the CONFIG table.
+@return DB_SUCCESS if OK */
+static MY_ATTRIBUTE((nonnull))
+dberr_t
+fts_cmp_set_sync_doc_id(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t cmp_doc_id, /*!< in: Doc ID to compare */
+ ibool read_only, /*!< in: TRUE if read the
+ synced_doc_id only */
+ doc_id_t* doc_id) /*!< out: larger document id
+ after comparing "cmp_doc_id"
+ to the one stored in CONFIG
+ table */
+{
+ trx_t* trx;
+ pars_info_t* info;
+ dberr_t error;
+ fts_table_t fts_table;
+ que_t* graph = NULL;
+ fts_cache_t* cache = table->fts->cache;
+ char table_name[MAX_FULL_NAME_LEN];
+retry:
+ ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+ fts_table.suffix = "CONFIG";
+ fts_table.table_id = table->id;
+ fts_table.type = FTS_COMMON_TABLE;
+ fts_table.table = table;
+
+ trx = trx_create();
+ if (srv_read_only_mode) {
+ trx_start_internal_read_only(trx);
+ } else {
+ trx_start_internal(trx);
+ }
+
+ trx->op_info = "update the next FTS document id";
+
+ info = pars_info_create();
+
+ pars_info_bind_function(
+ info, "my_func", fts_fetch_store_doc_id, doc_id);
+
+ fts_get_table_name(&fts_table, table_name);
+ pars_info_bind_id(info, true, "config_table", table_name);
+
+ graph = fts_parse_sql(
+ &fts_table, info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS SELECT value FROM $config_table"
+ " WHERE key = 'synced_doc_id' FOR UPDATE;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ *doc_id = 0;
+
+ error = fts_eval_sql(trx, graph);
+
+ fts_que_graph_free_check_lock(&fts_table, NULL, graph);
+
+ // FIXME: We need to retry deadlock errors
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (read_only) {
+ /* InnoDB stores actual synced_doc_id value + 1 in
+ FTS_CONFIG table. Reduce the value by 1 while reading
+ after startup. */
+ if (*doc_id) *doc_id -= 1;
+ goto func_exit;
+ }
+
+ if (cmp_doc_id == 0 && *doc_id) {
+ cache->synced_doc_id = *doc_id - 1;
+ } else {
+ cache->synced_doc_id = ut_max(cmp_doc_id, *doc_id);
+ }
+
+ mutex_enter(&cache->doc_id_lock);
+ /* For each sync operation, we advance next_doc_id by 1
+ to mark that a sync has taken place. */
+ if (cache->next_doc_id < cache->synced_doc_id + 1) {
+ cache->next_doc_id = cache->synced_doc_id + 1;
+ }
+ mutex_exit(&cache->doc_id_lock);
+
+ if (cmp_doc_id > *doc_id) {
+ error = fts_update_sync_doc_id(
+ table, cache->synced_doc_id, trx);
+ }
+
+ *doc_id = cache->next_doc_id;
+
+func_exit:
+
+ if (UNIV_LIKELY(error == DB_SUCCESS)) {
+ fts_sql_commit(trx);
+ } else {
+ *doc_id = 0;
+
+ ib::error() << "(" << error << ") while getting next doc id "
+ "for table " << table->name;
+ fts_sql_rollback(trx);
+
+ if (error == DB_DEADLOCK) {
+ os_thread_sleep(FTS_DEADLOCK_RETRY_WAIT);
+ goto retry;
+ }
+ }
+
+ trx->free();
+
+ return(error);
+}
+
+/*********************************************************************//**
+Update the last document id. This function could create a new
+transaction to update the last document id.
+@return DB_SUCCESS if OK */
+static
+dberr_t
+fts_update_sync_doc_id(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t doc_id, /*!< in: last document id */
+ trx_t* trx) /*!< in: update trx, or NULL */
+{
+ byte id[FTS_MAX_ID_LEN];
+ pars_info_t* info;
+ fts_table_t fts_table;
+ ulint id_len;
+ que_t* graph = NULL;
+ dberr_t error;
+ ibool local_trx = FALSE;
+ fts_cache_t* cache = table->fts->cache;
+ char fts_name[MAX_FULL_NAME_LEN];
+
+ if (srv_read_only_mode) {
+ return DB_READ_ONLY;
+ }
+
+ fts_table.suffix = "CONFIG";
+ fts_table.table_id = table->id;
+ fts_table.type = FTS_COMMON_TABLE;
+ fts_table.table = table;
+
+ if (!trx) {
+ trx = trx_create();
+ trx_start_internal(trx);
+
+ trx->op_info = "setting last FTS document id";
+ local_trx = TRUE;
+ }
+
+ info = pars_info_create();
+
+ id_len = (ulint) snprintf(
+ (char*) id, sizeof(id), FTS_DOC_ID_FORMAT, doc_id + 1);
+
+ pars_info_bind_varchar_literal(info, "doc_id", id, id_len);
+
+ fts_get_table_name(&fts_table, fts_name,
+ table->fts->dict_locked);
+ pars_info_bind_id(info, true, "table_name", fts_name);
+
+ graph = fts_parse_sql(
+ &fts_table, info,
+ "BEGIN"
+ " UPDATE $table_name SET value = :doc_id"
+ " WHERE key = 'synced_doc_id';");
+
+ error = fts_eval_sql(trx, graph);
+
+ fts_que_graph_free_check_lock(&fts_table, NULL, graph);
+
+ if (local_trx) {
+ if (UNIV_LIKELY(error == DB_SUCCESS)) {
+ fts_sql_commit(trx);
+ cache->synced_doc_id = doc_id;
+ } else {
+ ib::error() << "(" << error << ") while"
+ " updating last doc id for table"
+ << table->name;
+
+ fts_sql_rollback(trx);
+ }
+ trx->free();
+ }
+
+ return(error);
+}
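+
+/* Example (assuming FTS_DOC_ID_FORMAT renders a plain decimal
+string): after syncing up to doc id 100, the CONFIG row becomes
+('synced_doc_id', '101'), since the stored value is doc_id + 1.
+fts_cmp_set_sync_doc_id() subtracts 1 again when it reads the value
+back with read_only == TRUE. */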
+
+/*********************************************************************//**
+Create a new fts_doc_ids_t.
+@return new fts_doc_ids_t */
+fts_doc_ids_t*
+fts_doc_ids_create(void)
+/*====================*/
+{
+ fts_doc_ids_t* fts_doc_ids;
+ mem_heap_t* heap = mem_heap_create(512);
+
+ fts_doc_ids = static_cast<fts_doc_ids_t*>(
+ mem_heap_alloc(heap, sizeof(*fts_doc_ids)));
+
+ fts_doc_ids->self_heap = ib_heap_allocator_create(heap);
+
+ fts_doc_ids->doc_ids = static_cast<ib_vector_t*>(ib_vector_create(
+ fts_doc_ids->self_heap, sizeof(doc_id_t), 32));
+
+ return(fts_doc_ids);
+}
+
+/*********************************************************************//**
+Do commit-phase steps necessary for the insertion of a new row. */
+void
+fts_add(
+/*====*/
+ fts_trx_table_t*ftt, /*!< in: FTS trx table */
+ fts_trx_row_t* row) /*!< in: row */
+{
+ dict_table_t* table = ftt->table;
+ doc_id_t doc_id = row->doc_id;
+
+ ut_a(row->state == FTS_INSERT || row->state == FTS_MODIFY);
+
+ fts_add_doc_by_id(ftt, doc_id, row->fts_indexes);
+
+ mutex_enter(&table->fts->cache->deleted_lock);
+ ++table->fts->cache->added;
+ mutex_exit(&table->fts->cache->deleted_lock);
+
+ if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+ && doc_id >= table->fts->cache->next_doc_id) {
+ table->fts->cache->next_doc_id = doc_id + 1;
+ }
+}
+
+/*********************************************************************//**
+Do commit-phase steps necessary for the deletion of a row.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_delete(
+/*=======*/
+ fts_trx_table_t*ftt, /*!< in: FTS trx table */
+ fts_trx_row_t* row) /*!< in: row */
+{
+ que_t* graph;
+ fts_table_t fts_table;
+ dberr_t error = DB_SUCCESS;
+ doc_id_t write_doc_id;
+ dict_table_t* table = ftt->table;
+ doc_id_t doc_id = row->doc_id;
+ trx_t* trx = ftt->fts_trx->trx;
+ pars_info_t* info = pars_info_create();
+ fts_cache_t* cache = table->fts->cache;
+
+ /* We do not index documents whose Doc ID value is 0. */
+ if (doc_id == FTS_NULL_DOC_ID) {
+ ut_ad(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID));
+ return(error);
+ }
+
+ ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY);
+
+ FTS_INIT_FTS_TABLE(&fts_table, "DELETED", FTS_COMMON_TABLE, table);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &write_doc_id, doc_id);
+ fts_bind_doc_id(info, "doc_id", &write_doc_id);
+
+ /* It is possible that we are updating a record that has not yet
+ been sync-ed into the cache since the last crash (deleting a doc
+ will not initialize the sync). Avoid any "added" counter accounting
+ until the FTS cache is re-established and sync-ed. */
+ if (table->fts->added_synced
+ && doc_id > cache->synced_doc_id) {
+ mutex_enter(&table->fts->cache->deleted_lock);
+
+ /* The Doc ID could belong to those left in the
+ ADDED table from the last crash, so we need to check
+ that it is not less than first_doc_id, which is set
+ when we initialize the Doc ID system after reboot. */
+ if (doc_id >= table->fts->cache->first_doc_id
+ && table->fts->cache->added > 0) {
+ --table->fts->cache->added;
+ }
+
+ mutex_exit(&table->fts->cache->deleted_lock);
+
+ /* Only if the row was really deleted. */
+ ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY);
+ }
+
+ /* Note the deleted document for OPTIMIZE to purge. */
+ if (error == DB_SUCCESS) {
+ char table_name[MAX_FULL_NAME_LEN];
+
+ trx->op_info = "adding doc id to FTS DELETED";
+
+ info->graph_owns_us = TRUE;
+
+ fts_table.suffix = "DELETED";
+
+ fts_get_table_name(&fts_table, table_name);
+ pars_info_bind_id(info, true, "deleted", table_name);
+
+ graph = fts_parse_sql(
+ &fts_table,
+ info,
+ "BEGIN INSERT INTO $deleted VALUES (:doc_id);");
+
+ error = fts_eval_sql(trx, graph);
+
+ fts_que_graph_free(graph);
+ } else {
+ pars_info_free(info);
+ }
+
+ /* Increment the total deleted count, this is used to calculate the
+ number of documents indexed. */
+ if (error == DB_SUCCESS) {
+ mutex_enter(&table->fts->cache->deleted_lock);
+
+ ++table->fts->cache->deleted;
+
+ mutex_exit(&table->fts->cache->deleted_lock);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Do commit-phase steps necessary for the modification of a row.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_modify(
+/*=======*/
+ fts_trx_table_t* ftt, /*!< in: FTS trx table */
+ fts_trx_row_t* row) /*!< in: row */
+{
+ dberr_t error;
+
+ ut_a(row->state == FTS_MODIFY);
+
+ error = fts_delete(ftt, row);
+
+ if (error == DB_SUCCESS) {
+ fts_add(ftt, row);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_commit_table(
+/*=============*/
+ fts_trx_table_t* ftt) /*!< in: FTS table to commit*/
+{
+ if (srv_read_only_mode) {
+ return DB_READ_ONLY;
+ }
+
+ const ib_rbt_node_t* node;
+ ib_rbt_t* rows;
+ dberr_t error = DB_SUCCESS;
+ fts_cache_t* cache = ftt->table->fts->cache;
+ trx_t* trx = trx_create();
+
+ trx_start_internal(trx);
+
+ rows = ftt->rows;
+
+ ftt->fts_trx->trx = trx;
+
+ if (cache->get_docs == NULL) {
+ rw_lock_x_lock(&cache->init_lock);
+ if (cache->get_docs == NULL) {
+ cache->get_docs = fts_get_docs_create(cache);
+ }
+ rw_lock_x_unlock(&cache->init_lock);
+ }
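+ /* The unlocked test above combined with the re-test under
+ cache->init_lock is double-checked locking: get_docs is
+ created at most once per cache instance. */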
+
+ for (node = rbt_first(rows);
+ node != NULL && error == DB_SUCCESS;
+ node = rbt_next(rows, node)) {
+
+ fts_trx_row_t* row = rbt_value(fts_trx_row_t, node);
+
+ switch (row->state) {
+ case FTS_INSERT:
+ fts_add(ftt, row);
+ break;
+
+ case FTS_MODIFY:
+ error = fts_modify(ftt, row);
+ break;
+
+ case FTS_DELETE:
+ error = fts_delete(ftt, row);
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+
+ fts_sql_commit(trx);
+
+ trx->free();
+
+ return(error);
+}
+
+/*********************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_commit(
+/*=======*/
+ trx_t* trx) /*!< in: transaction */
+{
+ const ib_rbt_node_t* node;
+ dberr_t error;
+ ib_rbt_t* tables;
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(trx->fts_trx->savepoints));
+ tables = savepoint->tables;
+
+ for (node = rbt_first(tables), error = DB_SUCCESS;
+ node != NULL && error == DB_SUCCESS;
+ node = rbt_next(tables, node)) {
+
+ fts_trx_table_t** ftt;
+
+ ftt = rbt_value(fts_trx_table_t*, node);
+
+ error = fts_commit_table(*ftt);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Initialize a document. */
+void
+fts_doc_init(
+/*=========*/
+ fts_doc_t* doc) /*!< in: doc to initialize */
+{
+ mem_heap_t* heap = mem_heap_create(32);
+
+ memset(doc, 0, sizeof(*doc));
+
+ doc->self_heap = ib_heap_allocator_create(heap);
+}
+
+/*********************************************************************//**
+Free document. */
+void
+fts_doc_free(
+/*=========*/
+ fts_doc_t* doc) /*!< in: document */
+{
+ mem_heap_t* heap = static_cast<mem_heap_t*>(doc->self_heap->arg);
+
+ if (doc->tokens) {
+ rbt_free(doc->tokens);
+ }
+
+ ut_d(memset(doc, 0, sizeof(*doc)));
+
+ mem_heap_free(heap);
+}
+
+/*********************************************************************//**
+Callback function for fetch that stores the text of an FTS document,
+converting each column to UTF-16.
+@return always FALSE */
+ibool
+fts_query_expansion_fetch_doc(
+/*==========================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts_doc_t* */
+{
+ que_node_t* exp;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ fts_doc_t* result_doc = static_cast<fts_doc_t*>(user_arg);
+ dfield_t* dfield;
+ ulint len;
+ ulint doc_len;
+ fts_doc_t doc;
+ CHARSET_INFO* doc_charset = NULL;
+ ulint field_no = 0;
+
+ len = 0;
+
+ fts_doc_init(&doc);
+ doc.found = TRUE;
+
+ exp = node->select_list;
+ doc_len = 0;
+
+ doc_charset = result_doc->charset;
+
+ /* Copy each indexed column content into doc->text.f_str */
+ while (exp) {
+ dfield = que_node_get_val(exp);
+ len = dfield_get_len(dfield);
+
+ /* NULL column */
+ if (len == UNIV_SQL_NULL) {
+ exp = que_node_get_next(exp);
+ continue;
+ }
+
+ if (!doc_charset) {
+ doc_charset = fts_get_charset(dfield->type.prtype);
+ }
+
+ doc.charset = doc_charset;
+
+ if (dfield_is_ext(dfield)) {
+ /* We ignore columns that are stored externally, since
+ tokenizing them could result in too many words to search. */
+ exp = que_node_get_next(exp);
+ continue;
+ } else {
+ doc.text.f_n_char = 0;
+
+ doc.text.f_str = static_cast<byte*>(
+ dfield_get_data(dfield));
+
+ doc.text.f_len = len;
+ }
+
+ if (field_no == 0) {
+ fts_tokenize_document(&doc, result_doc,
+ result_doc->parser);
+ } else {
+ fts_tokenize_document_next(&doc, doc_len, result_doc,
+ result_doc->parser);
+ }
+
+ exp = que_node_get_next(exp);
+
+ doc_len += (exp) ? len + 1 : len;
+
+ field_no++;
+ }
+
+ ut_ad(doc_charset);
+
+ if (!result_doc->charset) {
+ result_doc->charset = doc_charset;
+ }
+
+ fts_doc_free(&doc);
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Fetch and tokenize the document. */
+static
+void
+fts_fetch_doc_from_rec(
+/*===================*/
+ fts_get_doc_t* get_doc, /*!< in: FTS index's get_doc struct */
+ dict_index_t* clust_index, /*!< in: clustered index */
+ btr_pcur_t* pcur, /*!< in: cursor whose position
+ has been stored */
+ rec_offs* offsets, /*!< in: offsets */
+ fts_doc_t* doc) /*!< out: fts doc to hold parsed
+ documents */
+{
+ dict_index_t* index;
+ const rec_t* clust_rec;
+ const dict_field_t* ifield;
+ ulint clust_pos;
+ ulint doc_len = 0;
+ st_mysql_ftparser* parser;
+
+ if (!get_doc) {
+ return;
+ }
+
+ index = get_doc->index_cache->index;
+ parser = get_doc->index_cache->index->parser;
+
+ clust_rec = btr_pcur_get_rec(pcur);
+ ut_ad(!page_rec_is_comp(clust_rec)
+ || rec_get_status(clust_rec) == REC_STATUS_ORDINARY);
+
+ for (ulint i = 0; i < index->n_fields; i++) {
+ ifield = dict_index_get_nth_field(index, i);
+ clust_pos = dict_col_get_clust_pos(ifield->col, clust_index);
+
+ if (!get_doc->index_cache->charset) {
+ get_doc->index_cache->charset = fts_get_charset(
+ ifield->col->prtype);
+ }
+
+ if (rec_offs_nth_extern(offsets, clust_pos)) {
+ doc->text.f_str =
+ btr_rec_copy_externally_stored_field(
+ clust_rec, offsets,
+ btr_pcur_get_block(pcur)->zip_size(),
+ clust_pos, &doc->text.f_len,
+ static_cast<mem_heap_t*>(
+ doc->self_heap->arg));
+ } else {
+ doc->text.f_str = (byte*) rec_get_nth_field(
+ clust_rec, offsets, clust_pos,
+ &doc->text.f_len);
+ }
+
+ doc->found = TRUE;
+ doc->charset = get_doc->index_cache->charset;
+
+ /* Null Field */
+ if (doc->text.f_len == UNIV_SQL_NULL || doc->text.f_len == 0) {
+ continue;
+ }
+
+ if (!doc_len) {
+ fts_tokenize_document(doc, NULL, parser);
+ } else {
+ fts_tokenize_document_next(doc, doc_len, NULL, parser);
+ }
+
+ doc_len += doc->text.f_len + 1;
+ }
+}
+
+/** Fetch the data from tuple and tokenize the document.
+@param[in] get_doc FTS index's get_doc struct
+@param[in] tuple tuple should be arranged in table schema order
+@param[out] doc fts doc to hold parsed documents. */
+static
+void
+fts_fetch_doc_from_tuple(
+ fts_get_doc_t* get_doc,
+ const dtuple_t* tuple,
+ fts_doc_t* doc)
+{
+ dict_index_t* index;
+ st_mysql_ftparser* parser;
+ ulint doc_len = 0;
+ ulint processed_doc = 0;
+ ulint num_field;
+
+ if (get_doc == NULL) {
+ return;
+ }
+
+ index = get_doc->index_cache->index;
+ parser = get_doc->index_cache->index->parser;
+ num_field = dict_index_get_n_fields(index);
+
+ for (ulint i = 0; i < num_field; i++) {
+ const dict_field_t* ifield;
+ const dict_col_t* col;
+ ulint pos;
+
+ ifield = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ifield);
+ pos = dict_col_get_no(col);
+ const dfield_t* field = dtuple_get_nth_field(tuple, pos);
+
+ if (!get_doc->index_cache->charset) {
+ get_doc->index_cache->charset = fts_get_charset(
+ ifield->col->prtype);
+ }
+
+ ut_ad(!dfield_is_ext(field));
+
+ doc->text.f_str = (byte*) dfield_get_data(field);
+ doc->text.f_len = dfield_get_len(field);
+ doc->found = TRUE;
+ doc->charset = get_doc->index_cache->charset;
+
+ /* field data is NULL. */
+ if (doc->text.f_len == UNIV_SQL_NULL || doc->text.f_len == 0) {
+ continue;
+ }
+
+ if (processed_doc == 0) {
+ fts_tokenize_document(doc, NULL, parser);
+ } else {
+ fts_tokenize_document_next(doc, doc_len, NULL, parser);
+ }
+
+ processed_doc++;
+ doc_len += doc->text.f_len + 1;
+ }
+}
+
+/** Fetch the document from the tuple, tokenize the text data and
+insert the text data into the FTS auxiliary table and
+its cache. The tuple fields do not contain any information about
+externally stored fields; the tuple contains data directly
+converted from MySQL.
+@param[in] ftt FTS transaction table
+@param[in] doc_id doc id
+@param[in] tuple tuple from where data can be retrieved
+ and tuple should be arranged in table
+ schema order. */
+void
+fts_add_doc_from_tuple(
+ fts_trx_table_t*ftt,
+ doc_id_t doc_id,
+ const dtuple_t* tuple)
+{
+ mtr_t mtr;
+ fts_cache_t* cache = ftt->table->fts->cache;
+
+ ut_ad(cache->get_docs);
+
+ if (!ftt->table->fts->added_synced) {
+ fts_init_index(ftt->table, FALSE);
+ }
+
+ mtr_start(&mtr);
+
+ ulint num_idx = ib_vector_size(cache->get_docs);
+
+ for (ulint i = 0; i < num_idx; ++i) {
+ fts_doc_t doc;
+ dict_table_t* table;
+ fts_get_doc_t* get_doc;
+
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_get(cache->get_docs, i));
+ table = get_doc->index_cache->index->table;
+
+ fts_doc_init(&doc);
+ fts_fetch_doc_from_tuple(
+ get_doc, tuple, &doc);
+
+ if (doc.found) {
+ mtr_commit(&mtr);
+ rw_lock_x_lock(&table->fts->cache->lock);
+
+ if (table->fts->cache->stopword_info.status
+ & STOPWORD_NOT_INIT) {
+ fts_load_stopword(table, NULL, NULL,
+ true, true);
+ }
+
+ fts_cache_add_doc(
+ table->fts->cache,
+ get_doc->index_cache,
+ doc_id, doc.tokens);
+
+ rw_lock_x_unlock(&table->fts->cache->lock);
+
+ if (cache->total_size > fts_max_cache_size / 5
+ || fts_need_sync) {
+ fts_sync(cache->sync, true, false);
+ }
+
+ mtr_start(&mtr);
+
+ }
+
+ fts_doc_free(&doc);
+ }
+
+ mtr_commit(&mtr);
+}
+
+/*********************************************************************//**
+This function fetches the document inserted during the committing
+transaction, tokenizes the inserted text data and inserts it into
+the FTS auxiliary table and its cache.
+@return always TRUE */
+static
+ulint
+fts_add_doc_by_id(
+/*==============*/
+ fts_trx_table_t*ftt, /*!< in: FTS trx table */
+ doc_id_t doc_id, /*!< in: doc id */
+ ib_vector_t* fts_indexes MY_ATTRIBUTE((unused)))
+ /*!< in: affected fts indexes */
+{
+ mtr_t mtr;
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ dict_table_t* table;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ fts_get_doc_t* get_doc;
+ doc_id_t temp_doc_id;
+ dict_index_t* clust_index;
+ dict_index_t* fts_id_index;
+ ibool is_id_cluster;
+ fts_cache_t* cache = ftt->table->fts->cache;
+
+ ut_ad(cache->get_docs);
+
+ /* If Doc ID has been supplied by the user, then the table
+ might not yet be sync-ed */
+
+ if (!ftt->table->fts->added_synced) {
+ fts_init_index(ftt->table, FALSE);
+ }
+
+ /* Get the first FTS index's get_doc */
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_get(cache->get_docs, 0));
+ ut_ad(get_doc);
+
+ table = get_doc->index_cache->index->table;
+
+ heap = mem_heap_create(512);
+
+ clust_index = dict_table_get_first_index(table);
+ fts_id_index = table->fts_doc_id_index;
+
+ /* Check whether the index on FTS_DOC_ID is cluster index */
+ is_id_cluster = (clust_index == fts_id_index);
+
+ mtr_start(&mtr);
+ btr_pcur_init(&pcur);
+
+ /* Search based on Doc ID. Here, we'll need to consider the case
+ when there is no primary index on Doc ID */
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+ dfield->type.mtype = DATA_INT;
+ dfield->type.prtype = DATA_NOT_NULL | DATA_UNSIGNED | DATA_BINARY_TYPE;
+
+ mach_write_to_8((byte*) &temp_doc_id, doc_id);
+ dfield_set_data(dfield, &temp_doc_id, sizeof(temp_doc_id));
+
+ btr_pcur_open_with_no_init(
+ fts_id_index, tuple, PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ &pcur, 0, &mtr);
+
+ /* If we have a match, add the data to doc structure */
+ if (btr_pcur_get_low_match(&pcur) == 1) {
+ const rec_t* rec;
+ btr_pcur_t* doc_pcur;
+ const rec_t* clust_rec;
+ btr_pcur_t clust_pcur;
+ rec_offs* offsets = NULL;
+ ulint num_idx = ib_vector_size(cache->get_docs);
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ /* Doc could be deleted */
+ if (page_rec_is_infimum(rec)
+ || rec_get_deleted_flag(rec, dict_table_is_comp(table))) {
+
+ goto func_exit;
+ }
+
+ if (is_id_cluster) {
+ clust_rec = rec;
+ doc_pcur = &pcur;
+ } else {
+ dtuple_t* clust_ref;
+ ulint n_fields;
+
+ btr_pcur_init(&clust_pcur);
+ n_fields = dict_index_get_n_unique(clust_index);
+
+ clust_ref = dtuple_create(heap, n_fields);
+ dict_index_copy_types(clust_ref, clust_index, n_fields);
+
+ row_build_row_ref_in_tuple(
+ clust_ref, rec, fts_id_index, NULL);
+
+ btr_pcur_open_with_no_init(
+ clust_index, clust_ref, PAGE_CUR_LE,
+ BTR_SEARCH_LEAF, &clust_pcur, 0, &mtr);
+
+ doc_pcur = &clust_pcur;
+ clust_rec = btr_pcur_get_rec(&clust_pcur);
+
+ }
+
+ offsets = rec_get_offsets(clust_rec, clust_index, NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ for (ulint i = 0; i < num_idx; ++i) {
+ fts_doc_t doc;
+ dict_table_t* table;
+ fts_get_doc_t* get_doc;
+
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_get(cache->get_docs, i));
+
+ table = get_doc->index_cache->index->table;
+
+ fts_doc_init(&doc);
+
+ fts_fetch_doc_from_rec(
+ get_doc, clust_index, doc_pcur, offsets, &doc);
+
+ if (doc.found) {
+ ibool success MY_ATTRIBUTE((unused));
+
+ btr_pcur_store_position(doc_pcur, &mtr);
+ mtr_commit(&mtr);
+
+ rw_lock_x_lock(&table->fts->cache->lock);
+
+ if (table->fts->cache->stopword_info.status
+ & STOPWORD_NOT_INIT) {
+ fts_load_stopword(table, NULL,
+ NULL, true, true);
+ }
+
+ fts_cache_add_doc(
+ table->fts->cache,
+ get_doc->index_cache,
+ doc_id, doc.tokens);
+
+ bool need_sync = false;
+ if ((cache->total_size > fts_max_cache_size / 10
+ || fts_need_sync)
+ && !cache->sync->in_progress) {
+ need_sync = true;
+ }
+
+ rw_lock_x_unlock(&table->fts->cache->lock);
+
+ DBUG_EXECUTE_IF(
+ "fts_instrument_sync",
+ fts_optimize_request_sync_table(table);
+ os_event_wait(cache->sync->event);
+ );
+
+ DBUG_EXECUTE_IF(
+ "fts_instrument_sync_debug",
+ fts_sync(cache->sync, true, true);
+ );
+
+ DEBUG_SYNC_C("fts_instrument_sync_request");
+ DBUG_EXECUTE_IF(
+ "fts_instrument_sync_request",
+ fts_optimize_request_sync_table(table);
+ );
+
+ if (need_sync) {
+ fts_optimize_request_sync_table(table);
+ }
+
+ mtr_start(&mtr);
+
+ if (i < num_idx - 1) {
+
+ success = btr_pcur_restore_position(
+ BTR_SEARCH_LEAF, doc_pcur,
+ &mtr);
+
+ ut_ad(success);
+ }
+ }
+
+ fts_doc_free(&doc);
+ }
+
+ if (!is_id_cluster) {
+ btr_pcur_close(doc_pcur);
+ }
+ }
+func_exit:
+ mtr_commit(&mtr);
+
+ btr_pcur_close(&pcur);
+
+ mem_heap_free(heap);
+ return(TRUE);
+}
+
+
+/*********************************************************************//**
+Callback function to read a single ulint column.
+@return always returns TRUE */
+static
+ibool
+fts_read_ulint(
+/*===========*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to ulint */
+{
+ sel_node_t* sel_node = static_cast<sel_node_t*>(row);
+ ulint* value = static_cast<ulint*>(user_arg);
+ que_node_t* exp = sel_node->select_list;
+ dfield_t* dfield = que_node_get_val(exp);
+ void* data = dfield_get_data(dfield);
+
+ *value = mach_read_from_4(static_cast<const byte*>(data));
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists
+@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */
+doc_id_t
+fts_get_max_doc_id(
+/*===============*/
+ dict_table_t* table) /*!< in: user table */
+{
+ dict_index_t* index;
+ dict_field_t* dfield MY_ATTRIBUTE((unused)) = NULL;
+ doc_id_t doc_id = 0;
+ mtr_t mtr;
+ btr_pcur_t pcur;
+
+ index = table->fts_doc_id_index;
+
+ if (!index) {
+ return(0);
+ }
+
+ ut_ad(!index->is_instant());
+
+ dfield = dict_index_get_nth_field(index, 0);
+
+#if 0 /* This can fail when renaming a column to FTS_DOC_ID_COL_NAME. */
+ ut_ad(innobase_strcasecmp(FTS_DOC_ID_COL_NAME, dfield->name) == 0);
+#endif
+
+ mtr_start(&mtr);
+
+ /* Fetch the largest value in the index. */
+ btr_pcur_open_at_index_side(
+ false, index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+
+ if (!page_is_empty(btr_pcur_get_page(&pcur))) {
+ const rec_t* rec = NULL;
+
+ do {
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (page_rec_is_user_rec(rec)) {
+ break;
+ }
+ } while (btr_pcur_move_to_prev(&pcur, &mtr));
+
+ if (!rec || rec_is_metadata(rec, *index)) {
+ goto func_exit;
+ }
+
+ doc_id = fts_read_doc_id(rec);
+ }
+
+func_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ return(doc_id);
+}
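+
+/* Note: the cursor above is opened at the rightmost end of
+FTS_DOC_ID_INDEX (from_left == false) and moved backwards until a
+user record is found, so the first user record encountered holds
+the maximum Doc ID. */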
+
+/*********************************************************************//**
+Fetch document with the given document id.
+@return DB_SUCCESS if OK else error */
+dberr_t
+fts_doc_fetch_by_doc_id(
+/*====================*/
+ fts_get_doc_t* get_doc, /*!< in: state */
+ doc_id_t doc_id, /*!< in: id of document to
+ fetch */
+ dict_index_t* index_to_use, /*!< in: caller supplied FTS index,
+ or NULL */
+ ulint option, /*!< in: search option:
+ FTS_FETCH_DOC_BY_ID_EQUAL or
+ FTS_FETCH_DOC_BY_ID_LARGE */
+ fts_sql_callback
+ callback, /*!< in: callback to read */
+ void* arg) /*!< in: callback arg */
+{
+ pars_info_t* info;
+ dberr_t error;
+ const char* select_str;
+ doc_id_t write_doc_id;
+ dict_index_t* index;
+ trx_t* trx = trx_create();
+ que_t* graph;
+
+ trx->op_info = "fetching indexed FTS document";
+
+ /* The FTS index can be supplied by caller directly with
+ "index_to_use", otherwise, get it from "get_doc" */
+ index = (index_to_use) ? index_to_use : get_doc->index_cache->index;
+
+ if (get_doc && get_doc->get_document_graph) {
+ info = get_doc->get_document_graph->info;
+ } else {
+ info = pars_info_create();
+ }
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &write_doc_id, doc_id);
+ fts_bind_doc_id(info, "doc_id", &write_doc_id);
+ pars_info_bind_function(info, "my_func", callback, arg);
+
+ select_str = fts_get_select_columns_str(index, info, info->heap);
+ pars_info_bind_id(info, TRUE, "table_name", index->table->name.m_name);
+
+ if (!get_doc || !get_doc->get_document_graph) {
+ if (option == FTS_FETCH_DOC_BY_ID_EQUAL) {
+ graph = fts_parse_sql(
+ NULL,
+ info,
+ mem_heap_printf(info->heap,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT %s FROM $table_name"
+ " WHERE %s = :doc_id;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c %% NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;",
+ select_str, FTS_DOC_ID_COL_NAME));
+ } else {
+ ut_ad(option == FTS_FETCH_DOC_BY_ID_LARGE);
+
+ /* This is used for crash recovery of a table with a
+ hidden DOC ID or FTS indexes. We will scan the table
+ to re-process user table rows whose DOC ID or
+ FTS indexed documents have not been sync-ed to disk
+ before the recent crash.
+ In the case that all fulltext indexes are dropped
+ for a table, we will keep the "hidden" FTS_DOC_ID
+ column, and this scan is to retrieve the largest
+ DOC ID being used in the table to determine the
+ appropriate next DOC ID.
+ In the case that fulltext index(es) exist, this
+ operation will re-tokenize any docs that have not
+ been sync-ed to disk, and re-prime the FTS
+ cache. */
+ graph = fts_parse_sql(
+ NULL,
+ info,
+ mem_heap_printf(info->heap,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT %s, %s FROM $table_name"
+ " WHERE %s > :doc_id;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c %% NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;",
+ FTS_DOC_ID_COL_NAME,
+ select_str, FTS_DOC_ID_COL_NAME));
+ }
+ if (get_doc) {
+ get_doc->get_document_graph = graph;
+ }
+ } else {
+ graph = get_doc->get_document_graph;
+ }
+
+ error = fts_eval_sql(trx, graph);
+ fts_sql_commit(trx);
+ trx->free();
+
+ if (!get_doc) {
+ fts_que_graph_free(graph);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+dberr_t
+fts_write_node(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** graph, /*!< in: query graph */
+ fts_table_t* fts_table, /*!< in: aux table */
+ fts_string_t* word, /*!< in: word in UTF-8 */
+ fts_node_t* node) /*!< in: node columns */
+{
+ pars_info_t* info;
+ dberr_t error;
+ ib_uint32_t doc_count;
+ time_t start_time;
+ doc_id_t last_doc_id;
+ doc_id_t first_doc_id;
+ char table_name[MAX_FULL_NAME_LEN];
+
+ ut_a(node->ilist != NULL);
+
+ if (*graph) {
+ info = (*graph)->info;
+ } else {
+ info = pars_info_create();
+
+ fts_get_table_name(fts_table, table_name);
+ pars_info_bind_id(info, true, "index_table_name", table_name);
+ }
+
+ pars_info_bind_varchar_literal(info, "token", word->f_str, word->f_len);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &first_doc_id, node->first_doc_id);
+ fts_bind_doc_id(info, "first_doc_id", &first_doc_id);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &last_doc_id, node->last_doc_id);
+ fts_bind_doc_id(info, "last_doc_id", &last_doc_id);
+
+ ut_a(node->last_doc_id >= node->first_doc_id);
+
+ /* Convert to "storage" byte order. */
+ mach_write_to_4((byte*) &doc_count, node->doc_count);
+ pars_info_bind_int4_literal(
+ info, "doc_count", (const ib_uint32_t*) &doc_count);
+
+ /* Set copy_name to FALSE since it's a static. */
+ pars_info_bind_literal(
+ info, "ilist", node->ilist, node->ilist_size,
+ DATA_BLOB, DATA_BINARY_TYPE);
+
+ if (!*graph) {
+
+ *graph = fts_parse_sql(
+ fts_table,
+ info,
+ "BEGIN\n"
+ "INSERT INTO $index_table_name VALUES"
+ " (:token, :first_doc_id,"
+ " :last_doc_id, :doc_count, :ilist);");
+ }
+
+ start_time = time(NULL);
+ error = fts_eval_sql(trx, *graph);
+ elapsed_time += time(NULL) - start_time;
+ ++n_nodes;
+
+ return(error);
+}
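+
+/* Illustrative example: syncing the word "apple" with
+first_doc_id = 5, last_doc_id = 9 and doc_count = 3 inserts a row
+conceptually equal to ('apple', 5, 9, 3, <ilist>) into the selected
+FTS_*_INDEX_[1-6] table, where the ilist blob encodes the doc ids
+(delta-compressed) and the word positions within each document. */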
+
+/*********************************************************************//**
+Add rows to the DELETED_CACHE table.
+@return DB_SUCCESS if all went well else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_sync_add_deleted_cache(
+/*=======================*/
+ fts_sync_t* sync, /*!< in: sync state */
+ ib_vector_t* doc_ids) /*!< in: doc ids to add */
+{
+ ulint i;
+ pars_info_t* info;
+ que_t* graph;
+ fts_table_t fts_table;
+ char table_name[MAX_FULL_NAME_LEN];
+ doc_id_t dummy = 0;
+ dberr_t error = DB_SUCCESS;
+ ulint n_elems = ib_vector_size(doc_ids);
+
+ ut_a(ib_vector_size(doc_ids) > 0);
+
+ ib_vector_sort(doc_ids, fts_doc_id_cmp);
+
+ info = pars_info_create();
+
+ fts_bind_doc_id(info, "doc_id", &dummy);
+
+ FTS_INIT_FTS_TABLE(
+ &fts_table, "DELETED_CACHE", FTS_COMMON_TABLE, sync->table);
+
+ fts_get_table_name(&fts_table, table_name);
+ pars_info_bind_id(info, true, "table_name", table_name);
+
+ graph = fts_parse_sql(
+ &fts_table,
+ info,
+ "BEGIN INSERT INTO $table_name VALUES (:doc_id);");
+
+ for (i = 0; i < n_elems && error == DB_SUCCESS; ++i) {
+ doc_id_t* update;
+ doc_id_t write_doc_id;
+
+ update = static_cast<doc_id_t*>(ib_vector_get(doc_ids, i));
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &write_doc_id, *update);
+ fts_bind_doc_id(info, "doc_id", &write_doc_id);
+
+ error = fts_eval_sql(sync->trx, graph);
+ }
+
+ fts_que_graph_free(graph);
+
+ return(error);
+}
+
+/** Write the words and ilist to disk.
+@param[in,out] trx transaction
+@param[in] index_cache index cache
+@param[in] unlock_cache whether unlock cache when write node
+@return DB_SUCCESS if all went well else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_sync_write_words(
+ trx_t* trx,
+ fts_index_cache_t* index_cache,
+ bool unlock_cache)
+{
+ fts_table_t fts_table;
+ ulint n_nodes = 0;
+ ulint n_words = 0;
+ const ib_rbt_node_t* rbt_node;
+ dberr_t error = DB_SUCCESS;
+ ibool print_error = FALSE;
+ dict_table_t* table = index_cache->index->table;
+
+ FTS_INIT_INDEX_TABLE(
+ &fts_table, NULL, FTS_INDEX_TABLE, index_cache->index);
+
+ n_words = rbt_size(index_cache->words);
+
+ /* We iterate over the entire tree, even if there is an error,
+ since we want to free the memory used during caching. */
+ for (rbt_node = rbt_first(index_cache->words);
+ rbt_node;
+ rbt_node = rbt_next(index_cache->words, rbt_node)) {
+
+ ulint i;
+ ulint selected;
+ fts_tokenizer_word_t* word;
+
+ word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+ DBUG_EXECUTE_IF("fts_instrument_write_words_before_select_index",
+ os_thread_sleep(300000););
+
+ selected = fts_select_index(
+ index_cache->charset, word->text.f_str,
+ word->text.f_len);
+
+ fts_table.suffix = fts_get_suffix(selected);
+
+ /* We iterate over all the nodes even if there was an error */
+ for (i = 0; i < ib_vector_size(word->nodes); ++i) {
+
+ fts_node_t* fts_node = static_cast<fts_node_t*>(
+ ib_vector_get(word->nodes, i));
+
+ if (fts_node->synced) {
+ continue;
+ } else {
+ fts_node->synced = true;
+ }
+
+ /* FIXME: we need to handle the error properly. */
+ if (error == DB_SUCCESS) {
+ if (unlock_cache) {
+ rw_lock_x_unlock(
+ &table->fts->cache->lock);
+ }
+
+ error = fts_write_node(
+ trx,
+ &index_cache->ins_graph[selected],
+ &fts_table, &word->text, fts_node);
+
+ DEBUG_SYNC_C("fts_write_node");
+ DBUG_EXECUTE_IF("fts_write_node_crash",
+ DBUG_SUICIDE(););
+
+ DBUG_EXECUTE_IF("fts_instrument_sync_sleep",
+ os_thread_sleep(1000000);
+ );
+
+ if (unlock_cache) {
+ rw_lock_x_lock(
+ &table->fts->cache->lock);
+ }
+ }
+ }
+
+ n_nodes += ib_vector_size(word->nodes);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS) && !print_error) {
+ ib::error() << "(" << error << ") writing"
+ " word node to FTS auxiliary index table "
+ << table->name;
+ print_error = TRUE;
+ }
+ }
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ printf("Avg number of nodes: %lf\n",
+ (double) n_nodes / (double) (n_words > 1 ? n_words : 1));
+ }
+
+ return(error);
+}
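+
+/* Note: when unlock_cache is set, fts_write_node() runs with the
+cache lock released, so concurrent DML may add new nodes while a
+sync is in progress; fts_sync() detects this via
+fts_sync_index_check() and loops back to begin_sync until every
+node is marked synced. */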
+
+/*********************************************************************//**
+Begin Sync, create transaction, acquire locks, etc. */
+static
+void
+fts_sync_begin(
+/*===========*/
+ fts_sync_t* sync) /*!< in: sync state */
+{
+ fts_cache_t* cache = sync->table->fts->cache;
+
+ n_nodes = 0;
+ elapsed_time = 0;
+
+ sync->start_time = time(NULL);
+
+ sync->trx = trx_create();
+ trx_start_internal(sync->trx);
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ ib::info() << "FTS SYNC for table " << sync->table->name
+ << ", deleted count: "
+ << ib_vector_size(cache->deleted_doc_ids)
+ << " size: " << cache->total_size << " bytes";
+ }
+}
+
+/*********************************************************************//**
+Run SYNC on the table, i.e., write out data from the index specific
+cache to the FTS aux INDEX table and FTS aux doc id stats table.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_sync_index(
+/*===========*/
+ fts_sync_t* sync, /*!< in: sync state */
+ fts_index_cache_t* index_cache) /*!< in: index cache */
+{
+ trx_t* trx = sync->trx;
+
+ trx->op_info = "doing SYNC index";
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ ib::info() << "SYNC words: " << rbt_size(index_cache->words);
+ }
+
+ ut_ad(rbt_validate(index_cache->words));
+
+ return(fts_sync_write_words(trx, index_cache, sync->unlock_cache));
+}
+
+/** Check if index cache has been synced completely
+@param[in,out] index_cache index cache
+@return true if index is synced, otherwise false. */
+static
+bool
+fts_sync_index_check(
+ fts_index_cache_t* index_cache)
+{
+ const ib_rbt_node_t* rbt_node;
+
+ for (rbt_node = rbt_first(index_cache->words);
+ rbt_node != NULL;
+ rbt_node = rbt_next(index_cache->words, rbt_node)) {
+
+ fts_tokenizer_word_t* word;
+ word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+ fts_node_t* fts_node;
+ fts_node = static_cast<fts_node_t*>(ib_vector_last(word->nodes));
+
+ if (!fts_node->synced) {
+ return(false);
+ }
+ }
+
+ return(true);
+}
+
+/** Reset the synced flag in the index cache on rollback
+@param[in,out] index_cache index cache */
+static
+void
+fts_sync_index_reset(
+ fts_index_cache_t* index_cache)
+{
+ const ib_rbt_node_t* rbt_node;
+
+ for (rbt_node = rbt_first(index_cache->words);
+ rbt_node != NULL;
+ rbt_node = rbt_next(index_cache->words, rbt_node)) {
+
+ fts_tokenizer_word_t* word;
+ word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+ fts_node_t* fts_node;
+ fts_node = static_cast<fts_node_t*>(ib_vector_last(word->nodes));
+
+ fts_node->synced = false;
+ }
+}
+
+/** Commit the SYNC, change state of processed doc ids etc.
+@param[in,out] sync sync state
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_sync_commit(
+ fts_sync_t* sync)
+{
+ dberr_t error;
+ trx_t* trx = sync->trx;
+ fts_cache_t* cache = sync->table->fts->cache;
+ doc_id_t last_doc_id;
+
+ trx->op_info = "doing SYNC commit";
+
+ /* After each Sync, update the CONFIG table about the max doc id
+ we just sync-ed to index table */
+ error = fts_cmp_set_sync_doc_id(sync->table, sync->max_doc_id, FALSE,
+ &last_doc_id);
+
+ /* Get the list of deleted documents that are either in the
+ cache or were headed there but were deleted before the add
+ thread got to them. */
+
+ if (error == DB_SUCCESS && ib_vector_size(cache->deleted_doc_ids) > 0) {
+
+ error = fts_sync_add_deleted_cache(
+ sync, cache->deleted_doc_ids);
+ }
+
+ /* We need to do this within the deleted lock since fts_delete() can
+ attempt to add a deleted doc id to the cache deleted id array. */
+ fts_cache_clear(cache);
+ DEBUG_SYNC_C("fts_deleted_doc_ids_clear");
+ fts_cache_init(cache);
+ rw_lock_x_unlock(&cache->lock);
+
+ if (UNIV_LIKELY(error == DB_SUCCESS)) {
+ fts_sql_commit(trx);
+ } else {
+ fts_sql_rollback(trx);
+ ib::error() << "(" << error << ") during SYNC of "
+ "table " << sync->table->name;
+ }
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print) && elapsed_time) {
+ ib::info() << "SYNC for table " << sync->table->name
+ << ": SYNC time: "
+ << (time(NULL) - sync->start_time)
+ << " secs: elapsed "
+ << static_cast<double>(n_nodes)
+ / static_cast<double>(elapsed_time)
+ << " ins/sec";
+ }
+
+ /* Avoid assertion in trx_t::free(). */
+ trx->dict_operation_lock_mode = 0;
+ trx->free();
+
+ return(error);
+}
+
+/** Rollback a sync operation
+@param[in,out] sync sync state */
+static
+void
+fts_sync_rollback(
+ fts_sync_t* sync)
+{
+ trx_t* trx = sync->trx;
+ fts_cache_t* cache = sync->table->fts->cache;
+
+ for (ulint i = 0; i < ib_vector_size(cache->indexes); ++i) {
+ ulint j;
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(cache->indexes, i));
+
+ /* Reset synced flag so nodes will not be skipped
+ in the next sync, see fts_sync_write_words(). */
+ fts_sync_index_reset(index_cache);
+
+ for (j = 0; fts_index_selector[j].value; ++j) {
+
+ if (index_cache->ins_graph[j] != NULL) {
+
+ fts_que_graph_free_check_lock(
+ NULL, index_cache,
+ index_cache->ins_graph[j]);
+
+ index_cache->ins_graph[j] = NULL;
+ }
+
+ if (index_cache->sel_graph[j] != NULL) {
+
+ fts_que_graph_free_check_lock(
+ NULL, index_cache,
+ index_cache->sel_graph[j]);
+
+ index_cache->sel_graph[j] = NULL;
+ }
+ }
+ }
+
+ rw_lock_x_unlock(&cache->lock);
+
+ fts_sql_rollback(trx);
+
+ /* Avoid assertion in trx_t::free(). */
+ trx->dict_operation_lock_mode = 0;
+ trx->free();
+}
+
+/** Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@param[in,out] sync sync state
+@param[in] unlock_cache whether unlock cache lock when write node
+@param[in] wait whether wait when a sync is in progress
+@return DB_SUCCESS if all OK */
+static
+dberr_t
+fts_sync(
+ fts_sync_t* sync,
+ bool unlock_cache,
+ bool wait)
+{
+ if (srv_read_only_mode) {
+ return DB_READ_ONLY;
+ }
+
+ ulint i;
+ dberr_t error = DB_SUCCESS;
+ fts_cache_t* cache = sync->table->fts->cache;
+
+ rw_lock_x_lock(&cache->lock);
+
+ /* Check if cache is being synced.
+ Note: we release the cache lock in fts_sync_write_words() to
+ avoid making other threads wait too long for the lock. */
+ while (sync->in_progress) {
+ rw_lock_x_unlock(&cache->lock);
+
+ if (wait) {
+ os_event_wait(sync->event);
+ } else {
+ return(DB_SUCCESS);
+ }
+
+ rw_lock_x_lock(&cache->lock);
+ }
+
+ sync->unlock_cache = unlock_cache;
+ sync->in_progress = true;
+
+ DEBUG_SYNC_C("fts_sync_begin");
+ fts_sync_begin(sync);
+
+begin_sync:
+ if (cache->total_size > fts_max_cache_size) {
+ /* Avoid the case where the sync never finishes
+ because inserts/updates keep coming. */
+ ut_ad(sync->unlock_cache);
+ sync->unlock_cache = false;
+ }
+
+ for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(cache->indexes, i));
+
+ if (index_cache->index->to_be_dropped
+ || index_cache->index->table->to_be_dropped) {
+ continue;
+ }
+
+ DBUG_EXECUTE_IF("fts_instrument_sync_before_syncing",
+ os_thread_sleep(300000););
+ error = fts_sync_index(sync, index_cache);
+
+ if (error != DB_SUCCESS) {
+ goto end_sync;
+ }
+ }
+
+ DBUG_EXECUTE_IF("fts_instrument_sync_interrupted",
+ sync->interrupted = true;
+ error = DB_INTERRUPTED;
+ goto end_sync;
+ );
+
+ /* Make sure all the caches are synced. */
+ for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(cache->indexes, i));
+
+ if (index_cache->index->to_be_dropped
+ || index_cache->index->table->to_be_dropped
+ || fts_sync_index_check(index_cache)) {
+ continue;
+ }
+
+ goto begin_sync;
+ }
+
+end_sync:
+ if (error == DB_SUCCESS && !sync->interrupted) {
+ error = fts_sync_commit(sync);
+ } else {
+ fts_sync_rollback(sync);
+ }
+
+ rw_lock_x_lock(&cache->lock);
+
+ sync->interrupted = false;
+ sync->in_progress = false;
+ os_event_set(sync->event);
+ rw_lock_x_unlock(&cache->lock);
+
+ /* We need to check whether an optimize is required, for that
+ we make copies of the two variables that control the trigger. These
+ variables can change behind our back and we don't want to hold the
+ lock for longer than is needed. */
+ mutex_enter(&cache->deleted_lock);
+
+ cache->added = 0;
+ cache->deleted = 0;
+
+ mutex_exit(&cache->deleted_lock);
+
+ return(error);
+}
+
+/** Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@param[in,out] table fts table
+@param[in]	wait		whether to wait for an existing sync to finish
+@return DB_SUCCESS on success, error code on failure. */
+dberr_t fts_sync_table(dict_table_t* table, bool wait)
+{
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(table->fts);
+
+ if (table->space && table->fts->cache
+ && !dict_table_is_corrupted(table)) {
+ err = fts_sync(table->fts->cache->sync, !wait, wait);
+ }
+
+ return(err);
+}
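+
+/* A minimal usage sketch (the caller below is hypothetical; "my_table"
+is assumed to be a dict_table_t* that has at least one fulltext index):
+
+	if (my_table->fts != NULL) {
+		// Request a SYNC and wait for any SYNC in progress.
+		dberr_t	err = fts_sync_table(my_table, true);
+
+		if (err != DB_SUCCESS) {
+			ib::warn() << "FTS SYNC returned " << err;
+		}
+	}
+*/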
+
+/** Check if a fts token is a stopword or less than fts_min_token_size
+or greater than fts_max_token_size.
+@param[in] token token string
+@param[in] stopwords stopwords rb tree
+@param[in] cs token charset
+@retval	true	if it is not a stopword and its length is in range
+@retval	false	if it is a stopword or its length is out of range */
+bool
+fts_check_token(
+ const fts_string_t* token,
+ const ib_rbt_t* stopwords,
+ const CHARSET_INFO* cs)
+{
+ ut_ad(cs != NULL || stopwords == NULL);
+
+ ib_rbt_bound_t parent;
+
+ return(token->f_n_char >= fts_min_token_size
+ && token->f_n_char <= fts_max_token_size
+ && (stopwords == NULL
+ || rbt_search(stopwords, &parent, token) != 0));
+}
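+
+/* Illustrative example (the values are assumptions, not defaults):
+with fts_min_token_size = 3, the 2-character token "db" is rejected,
+while "innodb" passes unless it appears in the stopword rb tree:
+
+	fts_string_t	token;
+	token.f_str = (byte*) "innodb";
+	token.f_len = 6;	// length in bytes
+	token.f_n_char = 6;	// length in characters
+
+	bool	indexable = fts_check_token(&token, stopwords, cs);
+*/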
+
+/** Add the token and its start position to the token's list of positions.
+@param[in,out] result_doc result doc rb tree
+@param[in] str token string
+@param[in] position token position */
+static
+void
+fts_add_token(
+ fts_doc_t* result_doc,
+ fts_string_t str,
+ ulint position)
+{
+	/* Ignore strings whose character count is less than
+ "fts_min_token_size" or more than "fts_max_token_size" */
+
+ if (fts_check_token(&str, NULL, result_doc->charset)) {
+
+ mem_heap_t* heap;
+ fts_string_t t_str;
+ fts_token_t* token;
+ ib_rbt_bound_t parent;
+ ulint newlen;
+
+ heap = static_cast<mem_heap_t*>(result_doc->self_heap->arg);
+
+ t_str.f_n_char = str.f_n_char;
+
+ t_str.f_len = str.f_len * result_doc->charset->casedn_multiply + 1;
+
+ t_str.f_str = static_cast<byte*>(
+ mem_heap_alloc(heap, t_str.f_len));
+
+ /* For binary collations, a case sensitive search is
+ performed. Hence don't convert to lower case. */
+ if (my_binary_compare(result_doc->charset)) {
+ memcpy(t_str.f_str, str.f_str, str.f_len);
+ t_str.f_str[str.f_len]= 0;
+ newlen= str.f_len;
+ } else {
+ newlen = innobase_fts_casedn_str(
+ result_doc->charset, (char*) str.f_str, str.f_len,
+ (char*) t_str.f_str, t_str.f_len);
+ }
+
+ t_str.f_len = newlen;
+ t_str.f_str[newlen] = 0;
+
+ /* Add the word to the document statistics. If the word
+ hasn't been seen before we create a new entry for it. */
+ if (rbt_search(result_doc->tokens, &parent, &t_str) != 0) {
+ fts_token_t new_token;
+
+ new_token.text.f_len = newlen;
+ new_token.text.f_str = t_str.f_str;
+ new_token.text.f_n_char = t_str.f_n_char;
+
+ new_token.positions = ib_vector_create(
+ result_doc->self_heap, sizeof(ulint), 32);
+
+ parent.last = rbt_add_node(
+ result_doc->tokens, &parent, &new_token);
+
+ ut_ad(rbt_validate(result_doc->tokens));
+ }
+
+ token = rbt_value(fts_token_t, parent.last);
+ ib_vector_push(token->positions, &position);
+ }
+}
+
+/********************************************************************
+Process next token from document starting at the given position, i.e., add
+the token's start position to the token's list of positions.
+@return number of characters handled in this call */
+static
+ulint
+fts_process_token(
+/*==============*/
+ fts_doc_t* doc, /* in/out: document to
+ tokenize */
+ fts_doc_t* result, /* out: if provided, save
+ result here */
+ ulint start_pos, /*!< in: start position in text */
+ ulint add_pos) /*!< in: add this position to all
+ tokens from this tokenization */
+{
+ ulint ret;
+ fts_string_t str;
+ ulint position;
+ fts_doc_t* result_doc;
+ byte buf[FTS_MAX_WORD_LEN + 1];
+
+ str.f_str = buf;
+
+ /* Determine where to save the result. */
+ result_doc = (result != NULL) ? result : doc;
+
+ /* The length of a string in characters is set here only. */
+
+ ret = innobase_mysql_fts_get_token(
+ doc->charset, doc->text.f_str + start_pos,
+ doc->text.f_str + doc->text.f_len, &str);
+
+ position = start_pos + ret - str.f_len + add_pos;
+
+ fts_add_token(result_doc, str, position);
+
+ return(ret);
+}
+
+/*************************************************************//**
+Get token char size by charset
+@return token size */
+ulint
+fts_get_token_size(
+/*===============*/
+ const CHARSET_INFO* cs, /*!< in: Character set */
+ const char* token, /*!< in: token */
+ ulint len) /*!< in: token length */
+{
+ char* start;
+ char* end;
+ ulint size = 0;
+
+ /* const_cast is for reinterpret_cast below, or it will fail. */
+ start = const_cast<char*>(token);
+ end = start + len;
+ while (start < end) {
+ int ctype;
+ int mbl;
+
+ mbl = cs->ctype(
+ &ctype,
+ reinterpret_cast<uchar*>(start),
+ reinterpret_cast<uchar*>(end));
+
+ size++;
+
+ start += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
+ }
+
+ return(size);
+}
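+
+/* Worked example (assuming a UTF-8 charset): for the ASCII token
+"innodb", cs->ctype() consumes one byte per character, so the function
+returns 6. A 3-byte UTF-8 character advances "start" by three bytes
+but increments "size" only once:
+
+	ulint	n_char = fts_get_token_size(cs, "innodb", 6);	// == 6
+*/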
+
+/*************************************************************//**
+FTS plugin parser 'mysql_parse' callback function for document tokenize.
+Refer to 'st_mysql_ftparser_param' for more detail.
+@return always returns 0 */
+int
+fts_tokenize_document_internal(
+/*===========================*/
+ MYSQL_FTPARSER_PARAM* param, /*!< in: parser parameter */
+ const char* doc,/*!< in/out: document */
+ int len) /*!< in: document length */
+{
+ fts_string_t str;
+ byte buf[FTS_MAX_WORD_LEN + 1];
+ /* JAN: TODO: MySQL 5.7
+ MYSQL_FTPARSER_BOOLEAN_INFO bool_info =
+ { FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0 };
+ */
+ MYSQL_FTPARSER_BOOLEAN_INFO bool_info =
+ { FT_TOKEN_WORD, 0, 0, 0, 0, ' ', 0};
+
+ ut_ad(len >= 0);
+
+ str.f_str = buf;
+
+ for (ulint i = 0, inc = 0; i < static_cast<ulint>(len); i += inc) {
+ inc = innobase_mysql_fts_get_token(
+ const_cast<CHARSET_INFO*>(param->cs),
+ (uchar*)(doc) + i,
+ (uchar*)(doc) + len,
+ &str);
+
+ if (str.f_len > 0) {
+ /* JAN: TODO: MySQL 5.7
+ bool_info.position =
+ static_cast<int>(i + inc - str.f_len);
+ ut_ad(bool_info.position >= 0);
+ */
+
+ /* Stop when add word fails */
+ if (param->mysql_add_word(
+ param,
+ reinterpret_cast<char*>(str.f_str),
+ static_cast<int>(str.f_len),
+ &bool_info)) {
+ break;
+ }
+ }
+ }
+
+ return(0);
+}
+
+/******************************************************************//**
+FTS plugin parser 'mysql_add_word' callback function for document tokenize.
+Refer to 'st_mysql_ftparser_param' for more detail.
+@return always returns 0 */
+static
+int
+fts_tokenize_add_word_for_parser(
+/*=============================*/
+	MYSQL_FTPARSER_PARAM*	param,		/* in: parser parameter */
+ const char* word, /* in: token word */
+ int word_len, /* in: word len */
+ MYSQL_FTPARSER_BOOLEAN_INFO*)
+{
+ fts_string_t str;
+ fts_tokenize_param_t* fts_param;
+ fts_doc_t* result_doc;
+ ulint position;
+
+ fts_param = static_cast<fts_tokenize_param_t*>(param->mysql_ftparam);
+ result_doc = fts_param->result_doc;
+ ut_ad(result_doc != NULL);
+
+ str.f_str = (byte*)(word);
+ str.f_len = ulint(word_len);
+ str.f_n_char = fts_get_token_size(
+ const_cast<CHARSET_INFO*>(param->cs), word, str.f_len);
+
+ /* JAN: TODO: MySQL 5.7 FTS
+ ut_ad(boolean_info->position >= 0);
+ position = boolean_info->position + fts_param->add_pos;
+ */
+ position = fts_param->add_pos;
+
+ fts_add_token(result_doc, str, position);
+
+ return(0);
+}
+
+/******************************************************************//**
+Parse a document using an external / user supplied parser */
+static
+void
+fts_tokenize_by_parser(
+/*===================*/
+ fts_doc_t* doc, /* in/out: document to tokenize */
+ st_mysql_ftparser* parser, /* in: plugin fts parser */
+ fts_tokenize_param_t* fts_param) /* in: fts tokenize param */
+{
+ MYSQL_FTPARSER_PARAM param;
+
+ ut_a(parser);
+
+	/* Set the parameters for the parser */
+ param.mysql_parse = fts_tokenize_document_internal;
+ param.mysql_add_word = fts_tokenize_add_word_for_parser;
+ param.mysql_ftparam = fts_param;
+ param.cs = doc->charset;
+ param.doc = reinterpret_cast<char*>(doc->text.f_str);
+ param.length = static_cast<int>(doc->text.f_len);
+ param.mode= MYSQL_FTPARSER_SIMPLE_MODE;
+
+ PARSER_INIT(parser, &param);
+ parser->parse(&param);
+ PARSER_DEINIT(parser, &param);
+}
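+
+/* Call flow sketch: parser->parse() may invoke param.mysql_parse()
+(fts_tokenize_document_internal above) to run the built-in tokenizer,
+and every extracted token reaches param.mysql_add_word()
+(fts_tokenize_add_word_for_parser), which forwards it to
+fts_add_token() on the result document. */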
+
+/** Tokenize a document.
+@param[in,out] doc document to tokenize
+@param[out] result tokenization result
+@param[in] parser pluggable parser */
+static
+void
+fts_tokenize_document(
+ fts_doc_t* doc,
+ fts_doc_t* result,
+ st_mysql_ftparser* parser)
+{
+ ut_a(!doc->tokens);
+ ut_a(doc->charset);
+
+ doc->tokens = rbt_create_arg_cmp(sizeof(fts_token_t),
+ innobase_fts_text_cmp,
+ (void*) doc->charset);
+
+ if (parser != NULL) {
+ fts_tokenize_param_t fts_param;
+ fts_param.result_doc = (result != NULL) ? result : doc;
+ fts_param.add_pos = 0;
+
+ fts_tokenize_by_parser(doc, parser, &fts_param);
+ } else {
+ ulint inc;
+
+ for (ulint i = 0; i < doc->text.f_len; i += inc) {
+ inc = fts_process_token(doc, result, i, 0);
+ ut_a(inc > 0);
+ }
+ }
+}
+
+/** Continue to tokenize a document.
+@param[in,out] doc document to tokenize
+@param[in] add_pos add this position to all tokens from this tokenization
+@param[out] result tokenization result
+@param[in] parser pluggable parser */
+static
+void
+fts_tokenize_document_next(
+ fts_doc_t* doc,
+ ulint add_pos,
+ fts_doc_t* result,
+ st_mysql_ftparser* parser)
+{
+ ut_a(doc->tokens);
+
+ if (parser) {
+ fts_tokenize_param_t fts_param;
+
+ fts_param.result_doc = (result != NULL) ? result : doc;
+ fts_param.add_pos = add_pos;
+
+ fts_tokenize_by_parser(doc, parser, &fts_param);
+ } else {
+ ulint inc;
+
+ for (ulint i = 0; i < doc->text.f_len; i += inc) {
+ inc = fts_process_token(doc, result, i, add_pos);
+ ut_a(inc > 0);
+ }
+ }
+}
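+
+/* Sketch of the intended calling pattern (see fts_init_recover_doc()
+below for a real caller): the first indexed column is tokenized with
+fts_tokenize_document(), and every following column continues with
+fts_tokenize_document_next(), passing the accumulated document length
+so that token positions stay unique across columns:
+
+	fts_tokenize_document(&doc, NULL, parser);		 // column 1
+	fts_tokenize_document_next(&doc, doc_len, NULL, parser); // column 2
+*/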
+
+/** Create the vector of fts_get_doc_t instances.
+@param[in,out] cache fts cache
+@return vector of fts_get_doc_t instances */
+static
+ib_vector_t*
+fts_get_docs_create(
+ fts_cache_t* cache)
+{
+ ib_vector_t* get_docs;
+
+ ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_X));
+
+ /* We need one instance of fts_get_doc_t per index. */
+ get_docs = ib_vector_create(cache->self_heap, sizeof(fts_get_doc_t), 4);
+
+ /* Create the get_doc instance, we need one of these
+ per FTS index. */
+ for (ulint i = 0; i < ib_vector_size(cache->indexes); ++i) {
+
+ dict_index_t** index;
+ fts_get_doc_t* get_doc;
+
+ index = static_cast<dict_index_t**>(
+ ib_vector_get(cache->indexes, i));
+
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_push(get_docs, NULL));
+
+ memset(get_doc, 0x0, sizeof(*get_doc));
+
+ get_doc->index_cache = fts_get_index_cache(cache, *index);
+ get_doc->cache = cache;
+
+ /* Must find the index cache. */
+ ut_a(get_doc->index_cache != NULL);
+ }
+
+ return(get_docs);
+}
+
+/********************************************************************
+Release any resources held by the fts_get_doc_t instances. */
+static
+void
+fts_get_docs_clear(
+/*===============*/
+ ib_vector_t* get_docs) /*!< in: Doc retrieval vector */
+{
+ ulint i;
+
+ /* Release the get doc graphs if any. */
+ for (i = 0; i < ib_vector_size(get_docs); ++i) {
+
+ fts_get_doc_t* get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_get(get_docs, i));
+
+ if (get_doc->get_document_graph != NULL) {
+
+ ut_a(get_doc->index_cache);
+
+ fts_que_graph_free(get_doc->get_document_graph);
+ get_doc->get_document_graph = NULL;
+ }
+ }
+}
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the CONFIG table
+@return initial Doc ID */
+doc_id_t
+fts_init_doc_id(
+/*============*/
+ const dict_table_t* table) /*!< in: table */
+{
+ doc_id_t max_doc_id = 0;
+
+ rw_lock_x_lock(&table->fts->cache->lock);
+
+ /* Return if the table is already initialized for DOC ID */
+ if (table->fts->cache->first_doc_id != FTS_NULL_DOC_ID) {
+ rw_lock_x_unlock(&table->fts->cache->lock);
+ return(0);
+ }
+
+ DEBUG_SYNC_C("fts_initialize_doc_id");
+
+ /* Then compare this value with the ID value stored in the CONFIG
+ table. The larger one will be our new initial Doc ID */
+ fts_cmp_set_sync_doc_id(table, 0, FALSE, &max_doc_id);
+
+	/* If DICT_TF2_FTS_ADD_DOC_ID is set, we are in the process of
+	creating an index (and adding the doc id column). No need to
+	recover documents. */
+ if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+ fts_init_index((dict_table_t*) table, TRUE);
+ }
+
+ table->fts->added_synced = true;
+
+ table->fts->cache->first_doc_id = max_doc_id;
+
+ rw_lock_x_unlock(&table->fts->cache->lock);
+
+ ut_ad(max_doc_id > 0);
+
+ return(max_doc_id);
+}
+
+#ifdef FTS_MULT_INDEX
+/*********************************************************************//**
+Check if the index is in the affected set.
+@return TRUE if index is updated */
+static
+ibool
+fts_is_index_updated(
+/*=================*/
+ const ib_vector_t* fts_indexes, /*!< in: affected FTS indexes */
+ const fts_get_doc_t* get_doc) /*!< in: info for reading
+ document */
+{
+ ulint i;
+ dict_index_t* index = get_doc->index_cache->index;
+
+ for (i = 0; i < ib_vector_size(fts_indexes); ++i) {
+ const dict_index_t* updated_fts_index;
+
+ updated_fts_index = static_cast<const dict_index_t*>(
+ ib_vector_getp_const(fts_indexes, i));
+
+ ut_a(updated_fts_index != NULL);
+
+ if (updated_fts_index == index) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+#endif
+
+/*********************************************************************//**
+Fetch COUNT(*) from specified table.
+@return the number of rows in the table */
+ulint
+fts_get_rows_count(
+/*===============*/
+ fts_table_t* fts_table) /*!< in: fts table to read */
+{
+ trx_t* trx;
+ pars_info_t* info;
+ que_t* graph;
+ dberr_t error;
+ ulint count = 0;
+ char table_name[MAX_FULL_NAME_LEN];
+
+ trx = trx_create();
+ trx->op_info = "fetching FT table rows count";
+
+ info = pars_info_create();
+
+ pars_info_bind_function(info, "my_func", fts_read_ulint, &count);
+
+ fts_get_table_name(fts_table, table_name);
+ pars_info_bind_id(info, true, "table_name", table_name);
+
+ graph = fts_parse_sql(
+ fts_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT COUNT(*)"
+ " FROM $table_name;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ for (;;) {
+ error = fts_eval_sql(trx, graph);
+
+ if (UNIV_LIKELY(error == DB_SUCCESS)) {
+ fts_sql_commit(trx);
+
+ break; /* Exit the loop. */
+ } else {
+ fts_sql_rollback(trx);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ ib::warn() << "lock wait timeout reading"
+ " FTS table. Retrying!";
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ ib::error() << "(" << error
+ << ") while reading FTS table "
+ << table_name;
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ fts_que_graph_free(graph);
+
+ trx->free();
+
+ return(count);
+}
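+
+/* Note on the pattern above: fts_parse_sql() substitutes $table_name
+with the identifier bound via pars_info_bind_id(), and the internal
+SQL parser invokes fts_read_ulint() once per fetched row, accumulating
+the COUNT(*) result into "count". Only DB_LOCK_WAIT_TIMEOUT is
+retried; any other error ends the loop. */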
+
+#ifdef FTS_CACHE_SIZE_DEBUG
+/*********************************************************************//**
+Read the max cache size parameter from the config table. */
+static
+void
+fts_update_max_cache_size(
+/*======================*/
+ fts_sync_t* sync) /*!< in: sync state */
+{
+ trx_t* trx;
+ fts_table_t fts_table;
+
+ trx = trx_create();
+
+ FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, sync->table);
+
+ /* The size returned is in bytes. */
+ sync->max_cache_size = fts_get_max_cache_size(trx, &fts_table);
+
+ fts_sql_commit(trx);
+
+ trx->free();
+}
+#endif /* FTS_CACHE_SIZE_DEBUG */
+
+/*********************************************************************//**
+Free the modified rows of a table. */
+UNIV_INLINE
+void
+fts_trx_table_rows_free(
+/*====================*/
+ ib_rbt_t* rows) /*!< in: rbt of rows to free */
+{
+ const ib_rbt_node_t* node;
+
+ for (node = rbt_first(rows); node; node = rbt_first(rows)) {
+ fts_trx_row_t* row;
+
+ row = rbt_value(fts_trx_row_t, node);
+
+ if (row->fts_indexes != NULL) {
+ /* This vector shouldn't be using the
+ heap allocator. */
+ ut_a(row->fts_indexes->allocator->arg == NULL);
+
+ ib_vector_free(row->fts_indexes);
+ row->fts_indexes = NULL;
+ }
+
+ ut_free(rbt_remove_node(rows, node));
+ }
+
+ ut_a(rbt_empty(rows));
+ rbt_free(rows);
+}
+
+/*********************************************************************//**
+Free an FTS savepoint instance. */
+UNIV_INLINE
+void
+fts_savepoint_free(
+/*===============*/
+ fts_savepoint_t* savepoint) /*!< in: savepoint instance */
+{
+ const ib_rbt_node_t* node;
+ ib_rbt_t* tables = savepoint->tables;
+
+ /* Nothing to free! */
+ if (tables == NULL) {
+ return;
+ }
+
+ for (node = rbt_first(tables); node; node = rbt_first(tables)) {
+ fts_trx_table_t* ftt;
+ fts_trx_table_t** fttp;
+
+ fttp = rbt_value(fts_trx_table_t*, node);
+ ftt = *fttp;
+
+ /* This can be NULL if a savepoint was released. */
+ if (ftt->rows != NULL) {
+ fts_trx_table_rows_free(ftt->rows);
+ ftt->rows = NULL;
+ }
+
+ /* This can be NULL if a savepoint was released. */
+ if (ftt->added_doc_ids != NULL) {
+ fts_doc_ids_free(ftt->added_doc_ids);
+ ftt->added_doc_ids = NULL;
+ }
+
+		/* Free the associated query graph, if any. */
+ if (ftt->docs_added_graph) {
+ fts_que_graph_free(ftt->docs_added_graph);
+ }
+
+		/* NOTE: We are responsible for freeing the node. */
+ ut_free(rbt_remove_node(tables, node));
+ }
+
+ ut_a(rbt_empty(tables));
+ rbt_free(tables);
+ savepoint->tables = NULL;
+}
+
+/*********************************************************************//**
+Free an FTS trx. */
+void
+fts_trx_free(
+/*=========*/
+ fts_trx_t* fts_trx) /* in, own: FTS trx */
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(fts_trx->savepoints); ++i) {
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_get(fts_trx->savepoints, i));
+
+ /* The default savepoint name must be NULL. */
+ if (i == 0) {
+ ut_a(savepoint->name == NULL);
+ }
+
+ fts_savepoint_free(savepoint);
+ }
+
+ for (i = 0; i < ib_vector_size(fts_trx->last_stmt); ++i) {
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_get(fts_trx->last_stmt, i));
+
+ /* The default savepoint name must be NULL. */
+ if (i == 0) {
+ ut_a(savepoint->name == NULL);
+ }
+
+ fts_savepoint_free(savepoint);
+ }
+
+ if (fts_trx->heap) {
+ mem_heap_free(fts_trx->heap);
+ }
+}
+
+/*********************************************************************//**
+Extract the doc id from the FTS hidden column.
+@return doc id that was extracted from rec */
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ dtuple_t* row) /*!< in: row whose FTS doc id we
+ want to extract.*/
+{
+ dfield_t* field;
+ doc_id_t doc_id = 0;
+
+ ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+ field = dtuple_get_nth_field(row, table->fts->doc_col);
+
+ ut_a(dfield_get_len(field) == sizeof(doc_id));
+ ut_a(dfield_get_type(field)->mtype == DATA_INT);
+
+ doc_id = fts_read_doc_id(
+ static_cast<const byte*>(dfield_get_data(field)));
+
+ return(doc_id);
+}
+
+/** Extract the doc id from the record that belongs to index.
+@param[in] rec record containing FTS_DOC_ID
+@param[in] index index of rec
+@param[in] offsets rec_get_offsets(rec,index)
+@return doc id that was extracted from rec */
+doc_id_t
+fts_get_doc_id_from_rec(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets)
+{
+ ulint f = dict_col_get_index_pos(
+ &index->table->cols[index->table->fts->doc_col], index);
+ ulint len;
+ doc_id_t doc_id = mach_read_from_8(
+ rec_get_nth_field(rec, offsets, f, &len));
+ ut_ad(len == 8);
+ return doc_id;
+}
+
+/*********************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index specific cache else NULL */
+fts_index_cache_t*
+fts_find_index_cache(
+/*=================*/
+ const fts_cache_t* cache, /*!< in: cache to search */
+ const dict_index_t* index) /*!< in: index to search for */
+{
+	/* We cast away the const because our internal function takes a
+	non-const cache arg and returns a non-const pointer. */
+ return(static_cast<fts_index_cache_t*>(
+ fts_get_index_cache((fts_cache_t*) cache, index)));
+}
+
+/*********************************************************************//**
+Search cache for word.
+@return the word node vector if found else NULL */
+const ib_vector_t*
+fts_cache_find_word(
+/*================*/
+ const fts_index_cache_t*index_cache, /*!< in: cache to search */
+ const fts_string_t* text) /*!< in: word to search for */
+{
+ ib_rbt_bound_t parent;
+ const ib_vector_t* nodes = NULL;
+#ifdef UNIV_DEBUG
+ dict_table_t* table = index_cache->index->table;
+ fts_cache_t* cache = table->fts->cache;
+
+ ut_ad(rw_lock_own(&cache->lock, RW_LOCK_X));
+#endif /* UNIV_DEBUG */
+
+ /* Lookup the word in the rb tree */
+ if (rbt_search(index_cache->words, &parent, text) == 0) {
+ const fts_tokenizer_word_t* word;
+
+ word = rbt_value(fts_tokenizer_word_t, parent.last);
+
+ nodes = word->nodes;
+ }
+
+ return(nodes);
+}
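+
+/* Usage sketch (hypothetical caller; as asserted above, cache->lock
+must be held in X mode):
+
+	fts_string_t	text;
+	text.f_str = (byte*) "innodb";
+	text.f_len = 6;
+	text.f_n_char = 6;
+
+	const ib_vector_t*	nodes =
+		fts_cache_find_word(index_cache, &text);
+
+	if (nodes != NULL) {
+		// the word has unsynced nodes in the cache
+	}
+*/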
+
+/*********************************************************************//**
+Append deleted doc ids to vector. */
+void
+fts_cache_append_deleted_doc_ids(
+/*=============================*/
+ const fts_cache_t* cache, /*!< in: cache to use */
+ ib_vector_t* vector) /*!< in: append to this vector */
+{
+ mutex_enter(const_cast<ib_mutex_t*>(&cache->deleted_lock));
+
+ if (cache->deleted_doc_ids == NULL) {
+ mutex_exit((ib_mutex_t*) &cache->deleted_lock);
+ return;
+ }
+
+
+ for (ulint i = 0; i < ib_vector_size(cache->deleted_doc_ids); ++i) {
+ doc_id_t* update;
+
+ update = static_cast<doc_id_t*>(
+ ib_vector_get(cache->deleted_doc_ids, i));
+
+ ib_vector_push(vector, &update);
+ }
+
+ mutex_exit((ib_mutex_t*) &cache->deleted_lock);
+}
+
+/*********************************************************************//**
+Add the FTS document id hidden column. */
+void
+fts_add_doc_id_column(
+/*==================*/
+ dict_table_t* table, /*!< in/out: Table with FTS index */
+ mem_heap_t* heap) /*!< in: temporary memory heap, or NULL */
+{
+ dict_mem_table_add_col(
+ table, heap,
+ FTS_DOC_ID_COL_NAME,
+ DATA_INT,
+ dtype_form_prtype(
+ DATA_NOT_NULL | DATA_UNSIGNED
+ | DATA_BINARY_TYPE | DATA_FTS_DOC_ID, 0),
+ sizeof(doc_id_t));
+ DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_HAS_DOC_ID);
+}
+
+/** Add new fts doc id to the update vector.
+@param[in] table the table that contains the FTS index.
+@param[in,out] ufield the fts doc id field in the update vector.
+ No new memory is allocated for this in this
+ function.
+@param[in,out] next_doc_id the fts doc id that has been added to the
+ update vector. If 0, a new fts doc id is
+ automatically generated. The memory provided
+ for this argument will be used by the update
+ vector. Ensure that the life time of this
+ memory matches that of the update vector.
+@return the fts doc id used in the update vector */
+doc_id_t
+fts_update_doc_id(
+ dict_table_t* table,
+ upd_field_t* ufield,
+ doc_id_t* next_doc_id)
+{
+ doc_id_t doc_id;
+ dberr_t error = DB_SUCCESS;
+
+ if (*next_doc_id) {
+ doc_id = *next_doc_id;
+ } else {
+ /* Get the new document id that will be added. */
+ error = fts_get_next_doc_id(table, &doc_id);
+ }
+
+ if (error == DB_SUCCESS) {
+ dict_index_t* clust_index;
+ dict_col_t* col = dict_table_get_nth_col(
+ table, table->fts->doc_col);
+
+ ufield->exp = NULL;
+
+ ufield->new_val.len = sizeof(doc_id);
+
+ clust_index = dict_table_get_first_index(table);
+
+ ufield->field_no = static_cast<unsigned>(
+ dict_col_get_clust_pos(col, clust_index))
+ & dict_index_t::MAX_N_FIELDS;
+ dict_col_copy_type(col, dfield_get_type(&ufield->new_val));
+
+		/* It is possible that we update a record that has
+		not yet been sync-ed since the last crash. */
+
+ /* Convert to storage byte order. */
+ ut_a(doc_id != FTS_NULL_DOC_ID);
+ fts_write_doc_id((byte*) next_doc_id, doc_id);
+
+ ufield->new_val.data = next_doc_id;
+ ufield->new_val.ext = 0;
+ }
+
+ return(doc_id);
+}
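+
+/* Usage sketch (hypothetical caller; "update" and "pos" are assumed to
+come from the surrounding update vector code): next_doc_id must outlive
+the update vector, because its memory becomes the new field value:
+
+	doc_id_t	next_doc_id = 0;	// 0 = generate a new id
+	upd_field_t*	ufield = upd_get_nth_field(update, pos);
+
+	doc_id_t	doc_id = fts_update_doc_id(
+		table, ufield, &next_doc_id);
+*/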
+
+/** fts_t constructor.
+@param[in] table table with FTS indexes
+@param[in,out] heap memory heap where 'this' is stored */
+fts_t::fts_t(
+ const dict_table_t* table,
+ mem_heap_t* heap)
+ :
+ added_synced(0), dict_locked(0),
+ add_wq(NULL),
+ cache(NULL),
+ doc_col(ULINT_UNDEFINED), in_queue(false), sync_message(false),
+ fts_heap(heap)
+{
+ ut_a(table->fts == NULL);
+
+ ib_alloc_t* heap_alloc = ib_heap_allocator_create(fts_heap);
+
+ indexes = ib_vector_create(heap_alloc, sizeof(dict_index_t*), 4);
+
+ dict_table_get_all_fts_indexes(table, indexes);
+}
+
+/** fts_t destructor. */
+fts_t::~fts_t()
+{
+ ut_ad(add_wq == NULL);
+
+ if (cache != NULL) {
+ fts_cache_clear(cache);
+ fts_cache_destroy(cache);
+ cache = NULL;
+ }
+
+ /* There is no need to call ib_vector_free() on this->indexes
+ because it is stored in this->fts_heap. */
+}
+
+/*********************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+fts_t*
+fts_create(
+/*=======*/
+ dict_table_t* table) /*!< in/out: table with FTS indexes */
+{
+ fts_t* fts;
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(512);
+
+ fts = static_cast<fts_t*>(mem_heap_alloc(heap, sizeof(*fts)));
+
+ new(fts) fts_t(table, heap);
+
+ return(fts);
+}
+
+/*********************************************************************//**
+Free the FTS resources. */
+void
+fts_free(
+/*=====*/
+ dict_table_t* table) /*!< in/out: table with FTS indexes */
+{
+ fts_t* fts = table->fts;
+
+ fts->~fts_t();
+
+ mem_heap_free(fts->fts_heap);
+
+ table->fts = NULL;
+}
+
+/*********************************************************************//**
+Copy an FTS savepoint. */
+UNIV_INLINE
+void
+fts_savepoint_copy(
+/*===============*/
+ const fts_savepoint_t* src, /*!< in: source savepoint */
+ fts_savepoint_t* dst) /*!< out: destination savepoint */
+{
+ const ib_rbt_node_t* node;
+ const ib_rbt_t* tables;
+
+ tables = src->tables;
+
+ for (node = rbt_first(tables); node; node = rbt_next(tables, node)) {
+
+ fts_trx_table_t* ftt_dst;
+ const fts_trx_table_t** ftt_src;
+
+ ftt_src = rbt_value(const fts_trx_table_t*, node);
+
+ ftt_dst = fts_trx_table_clone(*ftt_src);
+
+ rbt_insert(dst->tables, &ftt_dst, &ftt_dst);
+ }
+}
+
+/*********************************************************************//**
+Take a FTS savepoint. */
+void
+fts_savepoint_take(
+/*===============*/
+ fts_trx_t* fts_trx, /*!< in: fts transaction */
+ const char* name) /*!< in: savepoint name */
+{
+ mem_heap_t* heap;
+ fts_savepoint_t* savepoint;
+ fts_savepoint_t* last_savepoint;
+
+ ut_a(name != NULL);
+
+ heap = fts_trx->heap;
+
+ /* The implied savepoint must exist. */
+ ut_a(ib_vector_size(fts_trx->savepoints) > 0);
+
+ last_savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(fts_trx->savepoints));
+ savepoint = fts_savepoint_create(fts_trx->savepoints, name, heap);
+
+ if (last_savepoint->tables != NULL) {
+ fts_savepoint_copy(last_savepoint, savepoint);
+ }
+}
+
+/*********************************************************************//**
+Lookup a savepoint instance by name.
+@return ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+fts_savepoint_lookup(
+/*==================*/
+ ib_vector_t* savepoints, /*!< in: savepoints */
+ const char* name) /*!< in: savepoint name */
+{
+ ulint i;
+
+ ut_a(ib_vector_size(savepoints) > 0);
+
+ for (i = 1; i < ib_vector_size(savepoints); ++i) {
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_get(savepoints, i));
+
+ if (strcmp(name, savepoint->name) == 0) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/*********************************************************************//**
+Release the savepoint data identified by name. All savepoints created
+after the named savepoint are kept. */
+void
+fts_savepoint_release(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name) /*!< in: savepoint name */
+{
+ ut_a(name != NULL);
+
+ ib_vector_t* savepoints = trx->fts_trx->savepoints;
+
+ ut_a(ib_vector_size(savepoints) > 0);
+
+ ulint i = fts_savepoint_lookup(savepoints, name);
+ if (i != ULINT_UNDEFINED) {
+ ut_a(i >= 1);
+
+ fts_savepoint_t* savepoint;
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_get(savepoints, i));
+
+ if (i == ib_vector_size(savepoints) - 1) {
+ /* If the savepoint is the last, we save its
+ tables to the previous savepoint. */
+ fts_savepoint_t* prev_savepoint;
+ prev_savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_get(savepoints, i - 1));
+
+ ib_rbt_t* tables = savepoint->tables;
+ savepoint->tables = prev_savepoint->tables;
+ prev_savepoint->tables = tables;
+ }
+
+ fts_savepoint_free(savepoint);
+ ib_vector_remove(savepoints, *(void**)savepoint);
+
+ /* Make sure we don't delete the implied savepoint. */
+ ut_a(ib_vector_size(savepoints) > 0);
+ }
+}
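+
+/* Example of the stack behaviour (the savepoint names are
+illustrative): with savepoints [implied, "a", "b", "c"], releasing "b"
+frees only "b"; releasing the last savepoint "c" first swaps its
+tables into "b", so the accumulated FTS changes survive the release. */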
+
+/**********************************************************************//**
+Refresh last statement savepoint. */
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+ trx_t* trx) /*!< in: transaction */
+{
+
+ fts_trx_t* fts_trx;
+ fts_savepoint_t* savepoint;
+
+ fts_trx = trx->fts_trx;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_pop(fts_trx->last_stmt));
+ fts_savepoint_free(savepoint);
+
+ ut_ad(ib_vector_is_empty(fts_trx->last_stmt));
+ savepoint = fts_savepoint_create(fts_trx->last_stmt, NULL, NULL);
+}
+
+/********************************************************************
+Undo the Doc ID add/delete operations in last stmt */
+static
+void
+fts_undo_last_stmt(
+/*===============*/
+ fts_trx_table_t* s_ftt, /*!< in: Transaction FTS table */
+ fts_trx_table_t* l_ftt) /*!< in: last stmt FTS table */
+{
+ ib_rbt_t* s_rows;
+ ib_rbt_t* l_rows;
+ const ib_rbt_node_t* node;
+
+ l_rows = l_ftt->rows;
+ s_rows = s_ftt->rows;
+
+ for (node = rbt_first(l_rows);
+ node;
+ node = rbt_next(l_rows, node)) {
+ fts_trx_row_t* l_row = rbt_value(fts_trx_row_t, node);
+ ib_rbt_bound_t parent;
+
+ rbt_search(s_rows, &parent, &(l_row->doc_id));
+
+ if (parent.result == 0) {
+ fts_trx_row_t* s_row = rbt_value(
+ fts_trx_row_t, parent.last);
+
+ switch (l_row->state) {
+ case FTS_INSERT:
+ ut_free(rbt_remove_node(s_rows, parent.last));
+ break;
+
+ case FTS_DELETE:
+ if (s_row->state == FTS_NOTHING) {
+ s_row->state = FTS_INSERT;
+ } else if (s_row->state == FTS_DELETE) {
+ ut_free(rbt_remove_node(
+ s_rows, parent.last));
+ }
+ break;
+
+ /* FIXME: Check if FTS_MODIFY need to be addressed */
+ case FTS_MODIFY:
+ case FTS_NOTHING:
+ break;
+ default:
+ ut_error;
+ }
+ }
+ }
+}
+
+/**********************************************************************//**
+Undo the Doc ID add/delete operations of the last statement. */
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ib_vector_t* savepoints;
+ fts_savepoint_t* savepoint;
+ fts_savepoint_t* last_stmt;
+ fts_trx_t* fts_trx;
+ ib_rbt_bound_t parent;
+ const ib_rbt_node_t* node;
+ ib_rbt_t* l_tables;
+ ib_rbt_t* s_tables;
+
+ fts_trx = trx->fts_trx;
+ savepoints = fts_trx->savepoints;
+
+ savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints));
+ last_stmt = static_cast<fts_savepoint_t*>(
+ ib_vector_last(fts_trx->last_stmt));
+
+ l_tables = last_stmt->tables;
+ s_tables = savepoint->tables;
+
+ for (node = rbt_first(l_tables);
+ node;
+ node = rbt_next(l_tables, node)) {
+
+ fts_trx_table_t** l_ftt;
+
+ l_ftt = rbt_value(fts_trx_table_t*, node);
+
+ rbt_search_cmp(
+ s_tables, &parent, &(*l_ftt)->table->id,
+ fts_trx_table_id_cmp, NULL);
+
+ if (parent.result == 0) {
+ fts_trx_table_t** s_ftt;
+
+ s_ftt = rbt_value(fts_trx_table_t*, parent.last);
+
+ fts_undo_last_stmt(*s_ftt, *l_ftt);
+ }
+ }
+}
+
+/**********************************************************************//**
+Rollback to the savepoint identified by name. */
+void
+fts_savepoint_rollback(
+/*===================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name) /*!< in: savepoint name */
+{
+ ulint i;
+ ib_vector_t* savepoints;
+
+ ut_a(name != NULL);
+
+ savepoints = trx->fts_trx->savepoints;
+
+	/* We pop all savepoints from the top of the stack up to
+ and including the instance that was found. */
+ i = fts_savepoint_lookup(savepoints, name);
+
+ if (i != ULINT_UNDEFINED) {
+ fts_savepoint_t* savepoint;
+
+ ut_a(i > 0);
+
+ while (ib_vector_size(savepoints) > i) {
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_pop(savepoints));
+
+ if (savepoint->name != NULL) {
+ /* Since name was allocated on the heap, the
+ memory will be released when the transaction
+ completes. */
+ savepoint->name = NULL;
+
+ fts_savepoint_free(savepoint);
+ }
+ }
+
+		/* Pop all elements from the top of the stack that may
+ have been released. We have to be careful that we don't
+ delete the implied savepoint. */
+
+ for (savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(savepoints));
+ ib_vector_size(savepoints) > 1
+ && savepoint->name == NULL;
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(savepoints))) {
+
+ ib_vector_pop(savepoints);
+ }
+
+ /* Make sure we don't delete the implied savepoint. */
+ ut_a(ib_vector_size(savepoints) > 0);
+
+ /* Restore the savepoint. */
+ fts_savepoint_take(trx->fts_trx, name);
+ }
+}
+
+bool fts_check_aux_table(const char *name,
+ table_id_t *table_id,
+ index_id_t *index_id)
+{
+ ulint len= strlen(name);
+ const char* ptr;
+ const char* end= name + len;
+
+ ut_ad(len <= MAX_FULL_NAME_LEN);
+ ptr= static_cast<const char*>(memchr(name, '/', len));
+
+ if (ptr != NULL)
+ {
+ /* We will start the match after the '/' */
+ ++ptr;
+ len = end - ptr;
+ }
+
+ /* All auxiliary tables are prefixed with "FTS_" and the name
+	will be longer than 20 bytes. */
+ if (ptr && len > 20 && !memcmp(ptr, "FTS_", 4))
+ {
+ /* Skip the prefix. */
+ ptr+= 4;
+ len-= 4;
+
+ const char *table_id_ptr= ptr;
+ /* Skip the table id. */
+ ptr= static_cast<const char*>(memchr(ptr, '_', len));
+
+ if (!ptr)
+ return false;
+
+ /* Skip the underscore. */
+ ++ptr;
+ ut_ad(end > ptr);
+ len= end - ptr;
+
+ sscanf(table_id_ptr, UINT64PFx, table_id);
+ /* First search the common table suffix array. */
+ for (ulint i = 0; fts_common_tables[i]; ++i)
+ {
+ if (!strncmp(ptr, fts_common_tables[i], len))
+ return true;
+ }
+
+ /* Could be obsolete common tables. */
+ if ((len == 5 && !memcmp(ptr, "ADDED", len)) ||
+ (len == 9 && !memcmp(ptr, "STOPWORDS", len)))
+ return true;
+
+ const char* index_id_ptr= ptr;
+ /* Skip the index id. */
+ ptr= static_cast<const char*>(memchr(ptr, '_', len));
+ if (!ptr)
+ return false;
+
+ sscanf(index_id_ptr, UINT64PFx, index_id);
+
+ /* Skip the underscore. */
+ ++ptr;
+ ut_a(end > ptr);
+ len= end - ptr;
+
+ if (len > 7)
+ return false;
+
+ /* Search the FT index specific array. */
+ for (ulint i = 0; i < FTS_NUM_AUX_INDEX; ++i)
+ {
+ if (!memcmp(ptr, "INDEX_", len - 1))
+ return true;
+ }
+
+ /* Other FT index specific table(s). */
+ if (len == 6 && !memcmp(ptr, "DOC_ID", len))
+ return true;
+ }
+
+ return false;
+}
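+
+/* Examples of names this function accepts (the hexadecimal table and
+index ids below are made up for illustration):
+
+	"test/FTS_0000000000000123_CONFIG"
+		-> common table, *table_id = 0x123
+	"test/FTS_0000000000000123_00000000000001a2_INDEX_1"
+		-> index table, *table_id = 0x123, *index_id = 0x1a2
+	"test/FTS_0000000000000123_00000000000001a2_DOC_ID"
+		-> index table
+*/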
+
+typedef std::pair<table_id_t,index_id_t> fts_aux_id;
+typedef std::set<fts_aux_id> fts_space_set_t;
+
+/** Iterate over all the spaces in the space list and fetch the
+fts parent table id and index id.
+@param[in,out]	fts_space_set	set of parent table id and
+				index id pairs */
+static void fil_get_fts_spaces(fts_space_set_t& fts_space_set)
+{
+ mutex_enter(&fil_system.mutex);
+
+ for (fil_space_t *space= UT_LIST_GET_FIRST(fil_system.space_list);
+ space;
+ space= UT_LIST_GET_NEXT(space_list, space))
+ {
+ index_id_t index_id= 0;
+ table_id_t table_id= 0;
+
+ if (space->purpose == FIL_TYPE_TABLESPACE
+ && fts_check_aux_table(space->name, &table_id, &index_id))
+ fts_space_set.insert(std::make_pair(table_id, index_id));
+ }
+
+ mutex_exit(&fil_system.mutex);
+}
+
+/** Check the parent table id and index id of fts auxiliary tables
+against SYS_INDEXES. If a matching index exists, the auxiliary table
+is removed from the set, so that only orphaned tables remain.
+@param[in,out]	fts_space_set	set of auxiliary table ids */
+static void fts_check_orphaned_tables(fts_space_set_t& fts_space_set)
+{
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ trx_t* trx = trx_create();
+ trx->op_info = "checking fts orphaned tables";
+
+ row_mysql_lock_data_dictionary(trx);
+
+ mtr.start();
+ btr_pcur_open_at_index_side(
+ true, dict_table_get_first_index(dict_sys.sys_indexes),
+ BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+
+ do
+ {
+ const rec_t *rec;
+ const byte *tbl_field;
+ const byte *index_field;
+ ulint len;
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ if (!btr_pcur_is_on_user_rec(&pcur))
+ break;
+
+ rec= btr_pcur_get_rec(&pcur);
+ if (rec_get_deleted_flag(rec, 0))
+ continue;
+
+ tbl_field= rec_get_nth_field_old(rec, 0, &len);
+ if (len != 8)
+ continue;
+
+ index_field= rec_get_nth_field_old(rec, 1, &len);
+ if (len != 8)
+ continue;
+
+ table_id_t table_id = mach_read_from_8(tbl_field);
+ index_id_t index_id = mach_read_from_8(index_field);
+
+ fts_space_set_t::iterator it = fts_space_set.find(
+ fts_aux_id(table_id, index_id));
+
+ if (it != fts_space_set.end())
+ fts_space_set.erase(*it);
+ else
+ {
+ it= fts_space_set.find(fts_aux_id(table_id, 0));
+ if (it != fts_space_set.end())
+ fts_space_set.erase(*it);
+ }
+ } while(!fts_space_set.empty());
+
+ btr_pcur_close(&pcur);
+ mtr.commit();
+ row_mysql_unlock_data_dictionary(trx);
+ trx->free();
+}
+
+/** Drop all fts auxiliary index tables for the given fts table.
+@param[in,out]	trx		transaction
+@param[in]	fts_table	fts table definition */
+static void fts_drop_all_aux_tables(trx_t *trx, fts_table_t *fts_table)
+{
+ char fts_table_name[MAX_FULL_NAME_LEN];
+ for (ulint i= 0;i < FTS_NUM_AUX_INDEX; i++)
+ {
+ fts_table->suffix= fts_get_suffix(i);
+ fts_get_table_name(fts_table, fts_table_name, true);
+
+ /* Drop all fts aux and common table */
+ dberr_t err= fts_drop_table(trx, fts_table_name);
+
+ if (err == DB_FAIL)
+ {
+ char *path= fil_make_filepath(NULL, fts_table_name, IBD, false);
+
+ if (path != NULL)
+ {
+ os_file_delete_if_exists(innodb_data_file_key, path , NULL);
+ ut_free(path);
+ }
+ }
+ }
+}
+
+/** Drop all orphaned FTS auxiliary tables, those that don't have
+a parent table or FTS index defined on them. */
+void fts_drop_orphaned_tables()
+{
+ fts_space_set_t fts_space_set;
+ fil_get_fts_spaces(fts_space_set);
+
+ if (fts_space_set.empty())
+ return;
+
+ fts_check_orphaned_tables(fts_space_set);
+
+ if (fts_space_set.empty())
+ return;
+
+ trx_t* trx= trx_create();
+ trx->op_info= "Drop orphaned aux FTS tables";
+ row_mysql_lock_data_dictionary(trx);
+
+ for (fts_space_set_t::iterator it = fts_space_set.begin();
+ it != fts_space_set.end(); it++)
+ {
+ fts_table_t fts_table;
+ dict_table_t *table= dict_table_open_on_id(it->first, TRUE,
+ DICT_TABLE_OP_NORMAL);
+ if (!table)
+ continue;
+
+ FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+ fts_drop_common_tables(trx, &fts_table, true);
+
+ fts_table.type= FTS_INDEX_TABLE;
+ fts_table.index_id= it->second;
+ fts_drop_all_aux_tables(trx, &fts_table);
+
+ dict_table_close(table, true, false);
+ }
+ trx_commit_for_mysql(trx);
+ row_mysql_unlock_data_dictionary(trx);
+ trx->dict_operation_lock_mode= 0;
+ trx->free();
+}
+
+/**********************************************************************//**
+Check whether the user supplied stopword table is of the right format.
+The caller is responsible for holding dictionary locks.
+@return the stopword column charset if qualifies */
+CHARSET_INFO*
+fts_valid_stopword_table(
+/*=====================*/
+ const char* stopword_table_name) /*!< in: Stopword table
+ name */
+{
+ dict_table_t* table;
+ dict_col_t* col = NULL;
+
+ if (!stopword_table_name) {
+ return(NULL);
+ }
+
+ table = dict_table_get_low(stopword_table_name);
+
+ if (!table) {
+ ib::error() << "User stopword table " << stopword_table_name
+ << " does not exist.";
+
+ return(NULL);
+ } else {
+ if (strcmp(dict_table_get_col_name(table, 0), "value")) {
+ ib::error() << "Invalid column name for stopword"
+ " table " << stopword_table_name << ". Its"
+				" first column must be named 'value'.";
+
+ return(NULL);
+ }
+
+ col = dict_table_get_nth_col(table, 0);
+
+ if (col->mtype != DATA_VARCHAR
+ && col->mtype != DATA_VARMYSQL) {
+ ib::error() << "Invalid column type for stopword"
+ " table " << stopword_table_name << ". Its"
+ " first column must be of varchar type";
+
+ return(NULL);
+ }
+ }
+
+ ut_ad(col);
+
+ return(fts_get_charset(col->prtype));
+}
+
+/**********************************************************************//**
+This function loads stopwords into the FTS cache. It also
+records/fetches the stopword configuration to/from the FTS CONFIG
+table, depending on whether we are creating or reloading the
+FTS indexes.
+@return true if load operation is successful */
+bool
+fts_load_stopword(
+/*==============*/
+ const dict_table_t*
+ table, /*!< in: Table with FTS */
+ trx_t* trx, /*!< in: Transactions */
+ const char* session_stopword_table, /*!< in: Session stopword table
+ name */
+ bool stopword_is_on, /*!< in: Whether stopword
+ option is turned on/off */
+ bool reload) /*!< in: Whether it is
+ for reloading FTS table */
+{
+ fts_table_t fts_table;
+ fts_string_t str;
+ dberr_t error = DB_SUCCESS;
+ ulint use_stopword;
+ fts_cache_t* cache;
+ const char* stopword_to_use = NULL;
+ ibool new_trx = FALSE;
+ byte str_buffer[MAX_FULL_NAME_LEN + 1];
+
+ FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, table);
+
+ cache = table->fts->cache;
+
+ if (!reload && !(cache->stopword_info.status & STOPWORD_NOT_INIT)) {
+ return true;
+ }
+
+ if (!trx) {
+ trx = trx_create();
+ if (srv_read_only_mode) {
+ trx_start_internal_read_only(trx);
+ } else {
+ trx_start_internal(trx);
+ }
+ trx->op_info = "upload FTS stopword";
+ new_trx = TRUE;
+ }
+
+ /* First check whether stopword filtering is turned off */
+ if (reload) {
+ error = fts_config_get_ulint(
+ trx, &fts_table, FTS_USE_STOPWORD, &use_stopword);
+ } else {
+ use_stopword = (ulint) stopword_is_on;
+
+ error = fts_config_set_ulint(
+ trx, &fts_table, FTS_USE_STOPWORD, use_stopword);
+ }
+
+ if (error != DB_SUCCESS) {
+ goto cleanup;
+ }
+
+ /* If stopword is turned off, no need to continue to load the
+ stopword into cache, but still need to do initialization */
+ if (!use_stopword) {
+ cache->stopword_info.status = STOPWORD_OFF;
+ goto cleanup;
+ }
+
+ if (reload) {
+ /* Fetch the stopword table name from FTS config
+ table */
+ str.f_n_char = 0;
+ str.f_str = str_buffer;
+ str.f_len = sizeof(str_buffer) - 1;
+
+ error = fts_config_get_value(
+ trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str);
+
+ if (error != DB_SUCCESS) {
+ goto cleanup;
+ }
+
+ if (*str.f_str) {
+ stopword_to_use = (const char*) str.f_str;
+ }
+ } else {
+ stopword_to_use = session_stopword_table;
+ }
+
+ if (stopword_to_use
+ && fts_load_user_stopword(table->fts, stopword_to_use,
+ &cache->stopword_info)) {
+ /* Save the stopword table name to the configure
+ table */
+ if (!reload) {
+ str.f_n_char = 0;
+ str.f_str = (byte*) stopword_to_use;
+ str.f_len = strlen(stopword_to_use);
+
+ error = fts_config_set_value(
+ trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str);
+ }
+ } else {
+ /* Load system default stopword list */
+ fts_load_default_stopword(&cache->stopword_info);
+ }
+
+cleanup:
+ if (new_trx) {
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(trx);
+ } else {
+ fts_sql_rollback(trx);
+ }
+
+ trx->free();
+ }
+
+ if (!cache->stopword_info.cached_stopword) {
+ cache->stopword_info.cached_stopword = rbt_create_arg_cmp(
+ sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp,
+ &my_charset_latin1);
+ }
+
+ return error == DB_SUCCESS;
+}
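+
+/* Usage sketch (hypothetical caller; the stopword table name is made
+up): load a user stopword table when FTS is created for a table,
+letting the function start and commit its own transaction:
+
+	if (!fts_load_stopword(table, NULL,
+			       "test/my_stopwords",	// session table
+			       true,			// stopwords on
+			       false)) {		// create, not reload
+		ib::warn() << "Failed to load user stopwords";
+	}
+*/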
+
+/**********************************************************************//**
+Callback function invoked when we initialize FTS at startup.
+It recovers the maximum Doc ID present in the current table.
+@return always returns TRUE */
+static
+ibool
+fts_init_get_doc_id(
+/*================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts cache */
+{
+ doc_id_t doc_id = FTS_NULL_DOC_ID;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ que_node_t* exp = node->select_list;
+ fts_cache_t* cache = static_cast<fts_cache_t*>(user_arg);
+
+ ut_ad(ib_vector_is_empty(cache->get_docs));
+
+ /* Copy each indexed column content into doc->text.f_str */
+ if (exp) {
+ dfield_t* dfield = que_node_get_val(exp);
+ dtype_t* type = dfield_get_type(dfield);
+ void* data = dfield_get_data(dfield);
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+
+ doc_id = static_cast<doc_id_t>(mach_read_from_8(
+ static_cast<const byte*>(data)));
+
+ if (doc_id >= cache->next_doc_id) {
+ cache->next_doc_id = doc_id + 1;
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Callback function invoked when we initialize FTS at startup.
+It recovers Doc IDs that have not been sync-ed to the auxiliary
+tables, and brings them back into the FTS index.
+@return always returns TRUE */
+static
+ibool
+fts_init_recover_doc(
+/*=================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts cache */
+{
+
+ fts_doc_t doc;
+ ulint doc_len = 0;
+ ulint field_no = 0;
+ fts_get_doc_t* get_doc = static_cast<fts_get_doc_t*>(user_arg);
+ doc_id_t doc_id = FTS_NULL_DOC_ID;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ que_node_t* exp = node->select_list;
+ fts_cache_t* cache = get_doc->cache;
+ st_mysql_ftparser* parser = get_doc->index_cache->index->parser;
+
+ fts_doc_init(&doc);
+ doc.found = TRUE;
+
+ ut_ad(cache);
+
+ /* Copy each indexed column content into doc->text.f_str */
+ while (exp) {
+ dfield_t* dfield = que_node_get_val(exp);
+ ulint len = dfield_get_len(dfield);
+
+ if (field_no == 0) {
+ dtype_t* type = dfield_get_type(dfield);
+ void* data = dfield_get_data(dfield);
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+
+ doc_id = static_cast<doc_id_t>(mach_read_from_8(
+ static_cast<const byte*>(data)));
+
+ field_no++;
+ exp = que_node_get_next(exp);
+ continue;
+ }
+
+ if (len == UNIV_SQL_NULL) {
+ exp = que_node_get_next(exp);
+ continue;
+ }
+
+ ut_ad(get_doc);
+
+ if (!get_doc->index_cache->charset) {
+ get_doc->index_cache->charset = fts_get_charset(
+ dfield->type.prtype);
+ }
+
+ doc.charset = get_doc->index_cache->charset;
+
+ if (dfield_is_ext(dfield)) {
+ dict_table_t* table = cache->sync->table;
+
+ doc.text.f_str = btr_copy_externally_stored_field(
+ &doc.text.f_len,
+ static_cast<byte*>(dfield_get_data(dfield)),
+ table->space->zip_size(), len,
+ static_cast<mem_heap_t*>(doc.self_heap->arg));
+ } else {
+ doc.text.f_str = static_cast<byte*>(
+ dfield_get_data(dfield));
+
+ doc.text.f_len = len;
+ }
+
+ if (field_no == 1) {
+ fts_tokenize_document(&doc, NULL, parser);
+ } else {
+ fts_tokenize_document_next(&doc, doc_len, NULL, parser);
+ }
+
+ exp = que_node_get_next(exp);
+
+ doc_len += (exp) ? len + 1 : len;
+
+ field_no++;
+ }
+
+ fts_cache_add_doc(cache, get_doc->index_cache, doc_id, doc.tokens);
+
+ fts_doc_free(&doc);
+
+ cache->added++;
+
+ if (doc_id >= cache->next_doc_id) {
+ cache->next_doc_id = doc_id + 1;
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+This function brings the FTS index into sync when the index is first
+used. Documents that had not yet been sync-ed to the auxiliary
+tables when the server last shut down abnormally must be brought
+back into the FTS cache before any further operations.
+@return TRUE if all OK */
+ibool
+fts_init_index(
+/*===========*/
+ dict_table_t* table, /*!< in: Table with FTS */
+ ibool has_cache_lock) /*!< in: Whether we already have
+ cache lock */
+{
+ dict_index_t* index;
+ doc_id_t start_doc;
+ fts_get_doc_t* get_doc = NULL;
+ fts_cache_t* cache = table->fts->cache;
+ bool need_init = false;
+
+ ut_ad(!mutex_own(&dict_sys.mutex));
+
+ /* First check cache->get_docs is initialized */
+ if (!has_cache_lock) {
+ rw_lock_x_lock(&cache->lock);
+ }
+
+ rw_lock_x_lock(&cache->init_lock);
+ if (cache->get_docs == NULL) {
+ cache->get_docs = fts_get_docs_create(cache);
+ }
+ rw_lock_x_unlock(&cache->init_lock);
+
+ if (table->fts->added_synced) {
+ goto func_exit;
+ }
+
+ need_init = true;
+
+ start_doc = cache->synced_doc_id;
+
+ if (!start_doc) {
+ fts_cmp_set_sync_doc_id(table, 0, TRUE, &start_doc);
+ cache->synced_doc_id = start_doc;
+ }
+
+	/* No FTS index: this is the case when the previous FTS index
+	was dropped, and we re-initialize the Doc ID system for
+	subsequent insertions. */
+ if (ib_vector_is_empty(cache->get_docs)) {
+ index = table->fts_doc_id_index;
+
+ ut_a(index);
+
+ fts_doc_fetch_by_doc_id(NULL, start_doc, index,
+ FTS_FETCH_DOC_BY_ID_LARGE,
+ fts_init_get_doc_id, cache);
+ } else {
+ if (table->fts->cache->stopword_info.status
+ & STOPWORD_NOT_INIT) {
+ fts_load_stopword(table, NULL, NULL, true, true);
+ }
+
+ for (ulint i = 0; i < ib_vector_size(cache->get_docs); ++i) {
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_get(cache->get_docs, i));
+
+ index = get_doc->index_cache->index;
+
+ fts_doc_fetch_by_doc_id(NULL, start_doc, index,
+ FTS_FETCH_DOC_BY_ID_LARGE,
+ fts_init_recover_doc, get_doc);
+ }
+ }
+
+ table->fts->added_synced = true;
+
+ fts_get_docs_clear(cache->get_docs);
+
+func_exit:
+ if (!has_cache_lock) {
+ rw_lock_x_unlock(&cache->lock);
+ }
+
+ if (need_init) {
+ mutex_enter(&dict_sys.mutex);
+ /* Register the table with the optimize thread. */
+ fts_optimize_add_table(table);
+ mutex_exit(&dict_sys.mutex);
+ }
+
+ return(TRUE);
+}
diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc
new file mode 100644
index 00000000..e3c0f8f5
--- /dev/null
+++ b/storage/innobase/fts/fts0opt.cc
@@ -0,0 +1,3053 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0opt.cc
+Full Text Search optimize thread
+
+Created 2007/03/27 Sunny Bains
+Completed 2011/7/10 Sunny and Jimmy Yang
+
+***********************************************************************/
+
+#include "fts0fts.h"
+#include "row0sel.h"
+#include "que0types.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+#include "ut0wqueue.h"
+#include "srv0start.h"
+#include "ut0list.h"
+#include "zlib.h"
+#include "fts0opt.h"
+
+/** The FTS optimize thread's work queue. */
+ib_wqueue_t* fts_optimize_wq;
+static void fts_optimize_callback(void *);
+static void timer_callback(void*);
+static tpool::timer* timer;
+
+static tpool::task_group task_group(1);
+static tpool::task task(fts_optimize_callback,0, &task_group);
+
+/** FTS optimize thread, for MDL acquisition */
+static THD *fts_opt_thd;
+
+/** The FTS vector to store fts_slot_t */
+static ib_vector_t* fts_slots;
+
+/** Default optimize interval in secs. */
+static const ulint FTS_OPTIMIZE_INTERVAL_IN_SECS = 300;
+
+/** Set when the server is shutting down, telling the optimize thread to exit */
+static bool fts_opt_start_shutdown = false;
+
+/** Event to wait for shutdown of the optimize thread */
+static os_event_t fts_opt_shutdown_event = NULL;
+
+/** Initial size of nodes in fts_word_t. */
+static const ulint FTS_WORD_NODES_INIT_SIZE = 64;
+
+/** Last time we checked whether the system needs a sync */
+static time_t last_check_sync_time;
+
+/** FTS optimize thread message types. */
+enum fts_msg_type_t {
+ FTS_MSG_STOP, /*!< Stop optimizing and exit thread */
+
+ FTS_MSG_ADD_TABLE, /*!< Add table to the optimize thread's
+ work queue */
+
+ FTS_MSG_DEL_TABLE, /*!< Remove a table from the optimize
+ threads work queue */
+ FTS_MSG_SYNC_TABLE /*!< Sync fts cache of a table */
+};
+
+/** Compressed list of words that have been read from FTS INDEX
+that need to be optimized. */
+struct fts_zip_t {
+	lint		status;		/*!< Status of the (un)zip operation */
+
+ ulint n_words; /*!< Number of words compressed */
+
+ ulint block_sz; /*!< Size of a block in bytes */
+
+ ib_vector_t* blocks; /*!< Vector of compressed blocks */
+
+ ib_alloc_t* heap_alloc; /*!< Heap to use for allocations */
+
+ ulint pos; /*!< Offset into blocks */
+
+ ulint last_big_block; /*!< Offset of last block in the
+ blocks array that is of size
+ block_sz. Blocks beyond this offset
+ are of size FTS_MAX_WORD_LEN */
+
+ z_streamp zp; /*!< ZLib state */
+
+	fts_string_t	word;		/*!< The last word (UTF-8) read
+					from the FTS INDEX table; used
+					to discard duplicates */
+
+	ulint		max_words;	/*!< maximum number of words to read
+					in one pass */
+};
+
+/** Prepared statements used during optimize */
+struct fts_optimize_graph_t {
+ /*!< Delete a word from FTS INDEX */
+ que_t* delete_nodes_graph;
+ /*!< Insert a word into FTS INDEX */
+ que_t* write_nodes_graph;
+ /*!< COMMIT a transaction */
+ que_t* commit_graph;
+ /*!< Read the nodes from FTS_INDEX */
+ que_t* read_nodes_graph;
+};
+
+/** Used by fts_optimize() to store state. */
+struct fts_optimize_t {
+ trx_t* trx; /*!< The transaction used for all SQL */
+
+ ib_alloc_t* self_heap; /*!< Heap to use for allocations */
+
+ char* name_prefix; /*!< FTS table name prefix */
+
+	fts_table_t	fts_index_table;/*!< INDEX table definition */
+
+ /*!< Common table definition */
+ fts_table_t fts_common_table;
+
+ dict_table_t* table; /*!< Table that has to be queried */
+
+ dict_index_t* index; /*!< The FTS index to be optimized */
+
+ fts_doc_ids_t* to_delete; /*!< doc ids to delete, we check against
+ this vector and purge the matching
+ entries during the optimizing
+ process. The vector entries are
+ sorted on doc id */
+
+ ulint del_pos; /*!< Offset within to_delete vector,
+ this is used to keep track of where
+ we are up to in the vector */
+
+ ibool done; /*!< TRUE when optimize finishes */
+
+ ib_vector_t* words; /*!< Word + Nodes read from FTS_INDEX,
+ it contains instances of fts_word_t */
+
+ fts_zip_t* zip; /*!< Words read from the FTS_INDEX */
+
+	fts_optimize_graph_t
+			graph;		/*!< Prepared statements used
+					during optimize */
+
+ ulint n_completed; /*!< Number of FTS indexes that have
+ been optimized */
+ ibool del_list_regenerated;
+					/*!< BEING_DELETED list regenerated */
+};
+
+/** Used by the optimize, to keep state during compacting nodes. */
+struct fts_encode_t {
+ doc_id_t src_last_doc_id;/*!< Last doc id read from src node */
+ byte* src_ilist_ptr; /*!< Current ptr within src ilist */
+};
+
+/** We use this information to determine when to start the optimize
+cycle for a table. */
+struct fts_slot_t {
+ /** table, or NULL if the slot is unused */
+ dict_table_t* table;
+
+ /** whether this slot is being processed */
+ bool running;
+
+ ulint added; /*!< Number of doc ids added since the
+ last time this table was optimized */
+
+ ulint deleted; /*!< Number of doc ids deleted since the
+ last time this table was optimized */
+
+ /** time(NULL) of completing fts_optimize_table_bk() */
+ time_t last_run;
+
+ /** time(NULL) of latest successful fts_optimize_table() */
+ time_t completed;
+};
+
+/** A table remove message for the FTS optimize thread. */
+struct fts_msg_del_t {
+ dict_table_t* table; /*!< The table to remove */
+
+ os_event_t event; /*!< Event to synchronize acknowledgement
+ of receipt and processing of this
+ message by the consumer */
+};
+
+/** The FTS optimize message work queue message type. */
+struct fts_msg_t {
+ fts_msg_type_t type; /*!< Message type */
+
+ void* ptr; /*!< The message contents */
+
+ mem_heap_t* heap; /*!< The heap used to allocate this
+ message, the message consumer will
+ free the heap. */
+};
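+
+/* For illustration only: a consumer of the optimize work queue would
+dispatch on fts_msg_t::type roughly as below; the real dispatch loop
+lives in the optimize thread function, outside this hunk:
+
+	switch (msg->type) {
+	case FTS_MSG_STOP:	...exit the thread...		break;
+	case FTS_MSG_ADD_TABLE:	...add msg->ptr to the slots...	break;
+	case FTS_MSG_DEL_TABLE:	...remove it, signal event...	break;
+	case FTS_MSG_SYNC_TABLE: ...sync the table's cache...	break;
+	}
+*/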
+
+/** The number of words to read and optimize in a single pass. */
+ulong fts_num_word_optimize;
+
+/** Whether to enable additional FTS diagnostic printout. */
+char fts_enable_diag_print;
+
+/** ZLib compressed block size.*/
+static ulint FTS_ZIP_BLOCK_SIZE = 1024;
+
+/** The amount of time optimizing in a single pass, in seconds. */
+static ulint fts_optimize_time_limit;
+
+/** Defined in fts0fts.cc */
+extern const char* fts_common_tables[];
+
+/** SQL Statement for changing state of rows to be deleted from FTS Index. */
+static const char* fts_init_delete_sql =
+ "BEGIN\n"
+ "\n"
+ "INSERT INTO $BEING_DELETED\n"
+ "SELECT doc_id FROM $DELETED;\n"
+ "\n"
+ "INSERT INTO $BEING_DELETED_CACHE\n"
+ "SELECT doc_id FROM $DELETED_CACHE;\n";
+
+static const char* fts_delete_doc_ids_sql =
+ "BEGIN\n"
+ "\n"
+ "DELETE FROM $DELETED WHERE doc_id = :doc_id1;\n"
+ "DELETE FROM $DELETED_CACHE WHERE doc_id = :doc_id2;\n";
+
+static const char* fts_end_delete_sql =
+ "BEGIN\n"
+ "\n"
+ "DELETE FROM $BEING_DELETED;\n"
+ "DELETE FROM $BEING_DELETED_CACHE;\n";
+
+/**********************************************************************//**
+Initialize fts_zip_t. */
+static
+void
+fts_zip_initialize(
+/*===============*/
+ fts_zip_t* zip) /*!< out: zip instance to initialize */
+{
+ zip->pos = 0;
+ zip->n_words = 0;
+
+ zip->status = Z_OK;
+
+ zip->last_big_block = 0;
+
+ zip->word.f_len = 0;
+ *zip->word.f_str = 0;
+
+ ib_vector_reset(zip->blocks);
+
+ memset(zip->zp, 0, sizeof(*zip->zp));
+}
+
+/**********************************************************************//**
+Create an instance of fts_zip_t.
+@return a new instance of fts_zip_t */
+static
+fts_zip_t*
+fts_zip_create(
+/*===========*/
+ mem_heap_t* heap, /*!< in: heap */
+ ulint block_sz, /*!< in: size of a zip block.*/
+ ulint max_words) /*!< in: max words to read */
+{
+ fts_zip_t* zip;
+
+ zip = static_cast<fts_zip_t*>(mem_heap_zalloc(heap, sizeof(*zip)));
+
+ zip->word.f_str = static_cast<byte*>(
+ mem_heap_zalloc(heap, FTS_MAX_WORD_LEN + 1));
+
+ zip->block_sz = block_sz;
+
+ zip->heap_alloc = ib_heap_allocator_create(heap);
+
+ zip->blocks = ib_vector_create(zip->heap_alloc, sizeof(void*), 128);
+
+ zip->max_words = max_words;
+
+ zip->zp = static_cast<z_stream*>(
+ mem_heap_zalloc(heap, sizeof(*zip->zp)));
+
+ return(zip);
+}
+
+/**********************************************************************//**
+Initialize an instance of fts_zip_t. */
+static
+void
+fts_zip_init(
+/*=========*/
+
+ fts_zip_t* zip) /*!< in: zip instance to init */
+{
+ memset(zip->zp, 0, sizeof(*zip->zp));
+
+ zip->word.f_len = 0;
+ *zip->word.f_str = '\0';
+}
+
+/**********************************************************************//**
+Initialize an fts_word_t instance.
+@return the initialized instance */
+static
+fts_word_t*
+fts_word_init(
+/*==========*/
+ fts_word_t* word, /*!< in: word to initialize */
+ byte* utf8, /*!< in: UTF-8 string */
+ ulint len) /*!< in: length of string in bytes */
+{
+ mem_heap_t* heap = mem_heap_create(sizeof(fts_node_t));
+
+ memset(word, 0, sizeof(*word));
+
+ word->text.f_len = len;
+ word->text.f_str = static_cast<byte*>(mem_heap_alloc(heap, len + 1));
+
+ /* Copy the word and NUL-terminate it. */
+ memcpy(word->text.f_str, utf8, word->text.f_len);
+ word->text.f_str[word->text.f_len] = 0;
+
+ word->heap_alloc = ib_heap_allocator_create(heap);
+
+ word->nodes = ib_vector_create(
+ word->heap_alloc, sizeof(fts_node_t), FTS_WORD_NODES_INIT_SIZE);
+
+ return(word);
+}
+
+/**********************************************************************//**
+Read the FTS INDEX row.
+@return fts_node_t instance */
+static
+fts_node_t*
+fts_optimize_read_node(
+/*===================*/
+ fts_word_t* word, /*!< in: word whose nodes we append to */
+ que_node_t* exp) /*!< in: select list expression */
+{
+ int i;
+ fts_node_t* node = static_cast<fts_node_t*>(
+ ib_vector_push(word->nodes, NULL));
+
+ /* Start from 1 since the first node has been read by the caller */
+ for (i = 1; exp; exp = que_node_get_next(exp), ++i) {
+
+ dfield_t* dfield = que_node_get_val(exp);
+ byte* data = static_cast<byte*>(
+ dfield_get_data(dfield));
+ ulint len = dfield_get_len(dfield);
+
+ ut_a(len != UNIV_SQL_NULL);
+
+ /* Note: The column numbers below must match the SELECT */
+ switch (i) {
+ case 1: /* DOC_COUNT */
+ node->doc_count = mach_read_from_4(data);
+ break;
+
+ case 2: /* FIRST_DOC_ID */
+ node->first_doc_id = fts_read_doc_id(data);
+ break;
+
+ case 3: /* LAST_DOC_ID */
+ node->last_doc_id = fts_read_doc_id(data);
+ break;
+
+ case 4: /* ILIST */
+ node->ilist_size_alloc = node->ilist_size = len;
+ node->ilist = static_cast<byte*>(ut_malloc_nokey(len));
+ memcpy(node->ilist, data, len);
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+
+ /* Make sure all columns were read. */
+ ut_a(i == 5);
+
+ return(node);
+}
+
+/**********************************************************************//**
+Callback function to fetch the rows in an FTS INDEX record.
+@return TRUE to continue fetching, FALSE once the total memory
+ exceeds fts_result_cache_limit */
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to fts_fetch_t */
+{
+ fts_word_t* word;
+ sel_node_t* sel_node = static_cast<sel_node_t*>(row);
+ fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg);
+ ib_vector_t* words = static_cast<ib_vector_t*>(fetch->read_arg);
+ que_node_t* exp = sel_node->select_list;
+ dfield_t* dfield = que_node_get_val(exp);
+ void* data = dfield_get_data(dfield);
+ ulint dfield_len = dfield_get_len(dfield);
+ fts_node_t* node;
+ bool is_word_init = false;
+
+ ut_a(dfield_len <= FTS_MAX_WORD_LEN);
+
+ if (ib_vector_size(words) == 0) {
+
+ word = static_cast<fts_word_t*>(ib_vector_push(words, NULL));
+ fts_word_init(word, (byte*) data, dfield_len);
+ is_word_init = true;
+ }
+
+ word = static_cast<fts_word_t*>(ib_vector_last(words));
+
+ if (dfield_len != word->text.f_len
+ || memcmp(word->text.f_str, data, dfield_len)) {
+
+ word = static_cast<fts_word_t*>(ib_vector_push(words, NULL));
+ fts_word_init(word, (byte*) data, dfield_len);
+ is_word_init = true;
+ }
+
+ node = fts_optimize_read_node(word, que_node_get_next(exp));
+
+ fetch->total_memory += node->ilist_size;
+ if (is_word_init) {
+ fetch->total_memory += sizeof(fts_word_t)
+ + sizeof(ib_alloc_t) + sizeof(ib_vector_t) + dfield_len
+ + sizeof(fts_node_t) * FTS_WORD_NODES_INIT_SIZE;
+ } else if (ib_vector_size(words) > FTS_WORD_NODES_INIT_SIZE) {
+ fetch->total_memory += sizeof(fts_node_t);
+ }
+
+ if (fetch->total_memory >= fts_result_cache_limit) {
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Read the rows from the FTS index.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_index_fetch_nodes(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** graph, /*!< in: prepared statement */
+ fts_table_t* fts_table, /*!< in: table of the FTS INDEX */
+ const fts_string_t*
+ word, /*!< in: the word to fetch */
+ fts_fetch_t* fetch) /*!< in: fetch callback.*/
+{
+ pars_info_t* info;
+ dberr_t error;
+ char table_name[MAX_FULL_NAME_LEN];
+
+ trx->op_info = "fetching FTS index nodes";
+
+ if (*graph) {
+ info = (*graph)->info;
+ } else {
+ ulint selected;
+
+ info = pars_info_create();
+
+ ut_a(fts_table->type == FTS_INDEX_TABLE);
+
+ selected = fts_select_index(fts_table->charset,
+ word->f_str, word->f_len);
+
+ fts_table->suffix = fts_get_suffix(selected);
+
+ fts_get_table_name(fts_table, table_name);
+
+ pars_info_bind_id(info, true, "table_name", table_name);
+ }
+
+ pars_info_bind_function(info, "my_func", fetch->read_record, fetch);
+ pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+ if (!*graph) {
+
+ *graph = fts_parse_sql(
+ fts_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT word, doc_count, first_doc_id, last_doc_id,"
+ " ilist\n"
+ " FROM $table_name\n"
+ " WHERE word LIKE :word\n"
+ " ORDER BY first_doc_id;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+ }
+
+ for (;;) {
+ error = fts_eval_sql(trx, *graph);
+
+ if (UNIV_LIKELY(error == DB_SUCCESS)) {
+ fts_sql_commit(trx);
+
+ break; /* Exit the loop. */
+ } else {
+ fts_sql_rollback(trx);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ ib::warn() << "lock wait timeout reading"
+ " FTS index. Retrying!";
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ ib::error() << "(" << error
+ << ") while reading FTS index.";
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ return(error);
+}
+
+/**********************************************************************//**
+Read and uncompress the next word from the zip stream.
+@return pointer to the word, or NULL on error or end of stream */
+static
+byte*
+fts_zip_read_word(
+/*==============*/
+ fts_zip_t* zip, /*!< in: Zip state + data */
+ fts_string_t* word) /*!< out: uncompressed word */
+{
+ short len = 0;
+ void* null = NULL;
+ byte* ptr = word->f_str;
+ int flush = Z_NO_FLUSH;
+
+ /* Either there was an error or we are at the Z_STREAM_END. */
+ if (zip->status != Z_OK) {
+ return(NULL);
+ }
+
+ zip->zp->next_out = reinterpret_cast<byte*>(&len);
+ zip->zp->avail_out = sizeof(len);
+
+ while (zip->status == Z_OK && zip->zp->avail_out > 0) {
+
+ /* Finished decompressing block. */
+ if (zip->zp->avail_in == 0) {
+
+ /* Free the block that's been decompressed. */
+ if (zip->pos > 0) {
+ ulint prev = zip->pos - 1;
+
+ ut_a(zip->pos < ib_vector_size(zip->blocks));
+
+ ut_free(ib_vector_getp(zip->blocks, prev));
+ ib_vector_set(zip->blocks, prev, &null);
+ }
+
+ /* Any more blocks to decompress? */
+ if (zip->pos < ib_vector_size(zip->blocks)) {
+
+ zip->zp->next_in = static_cast<byte*>(
+ ib_vector_getp(
+ zip->blocks, zip->pos));
+
+ if (zip->pos > zip->last_big_block) {
+ zip->zp->avail_in =
+ FTS_MAX_WORD_LEN;
+ } else {
+ zip->zp->avail_in =
+ static_cast<uInt>(zip->block_sz);
+ }
+
+ ++zip->pos;
+ } else {
+ flush = Z_FINISH;
+ }
+ }
+
+ switch (zip->status = inflate(zip->zp, flush)) {
+ case Z_OK:
+ if (zip->zp->avail_out == 0 && len > 0) {
+
+ ut_a(len <= FTS_MAX_WORD_LEN);
+ ptr[len] = 0;
+
+ zip->zp->next_out = ptr;
+ zip->zp->avail_out = uInt(len);
+
+ word->f_len = ulint(len);
+ len = 0;
+ }
+ break;
+
+ case Z_BUF_ERROR: /* No progress possible. */
+ case Z_STREAM_END:
+ inflateEnd(zip->zp);
+ break;
+
+ case Z_STREAM_ERROR:
+ default:
+ ut_error;
+ }
+ }
+
+ /* All blocks must be freed at end of inflate. */
+ if (zip->status != Z_OK) {
+ for (ulint i = 0; i < ib_vector_size(zip->blocks); ++i) {
+ if (ib_vector_getp(zip->blocks, i)) {
+ ut_free(ib_vector_getp(zip->blocks, i));
+ ib_vector_set(zip->blocks, i, &null);
+ }
+ }
+ }
+
+ if (ptr != NULL) {
+ ut_ad(word->f_len == strlen((char*) ptr));
+ }
+
+ return(zip->status == Z_OK || zip->status == Z_STREAM_END ? ptr : NULL);
+}
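+
+/* Typical usage, mirroring fts_optimize_words() below: after
+inflateInit(), keep pulling words until the stream is exhausted (a
+sketch; error handling omitted):
+
+	while (fts_zip_read_word(optim->zip, &word)) {
+		... optimize all nodes for "word" ...
+	}
+
+A NULL return means either an error or Z_STREAM_END was reached. */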
+
+/**********************************************************************//**
+Callback function to fetch and compress the word in an FTS
+INDEX record.
+@return FALSE on EOF */
+static
+ibool
+fts_fetch_index_words(
+/*==================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to fts_zip_t */
+{
+ sel_node_t* sel_node = static_cast<sel_node_t*>(row);
+ fts_zip_t* zip = static_cast<fts_zip_t*>(user_arg);
+ que_node_t* exp = sel_node->select_list;
+ dfield_t* dfield = que_node_get_val(exp);
+
+ ut_a(dfield_get_len(dfield) <= FTS_MAX_WORD_LEN);
+
+ uint16 len = uint16(dfield_get_len(dfield));
+ void* data = dfield_get_data(dfield);
+
+ /* Skip the duplicate words. */
+ if (zip->word.f_len == len && !memcmp(zip->word.f_str, data, len)) {
+ return(TRUE);
+ }
+
+ memcpy(zip->word.f_str, data, len);
+ zip->word.f_len = len;
+
+ ut_a(zip->zp->avail_in == 0);
+ ut_a(zip->zp->next_in == NULL);
+
+ /* The string is prefixed by len. */
+ /* FIXME: This is not byte order agnostic (InnoDB data files
+ with FULLTEXT INDEX are not portable between little-endian and
+ big-endian systems!) */
+ zip->zp->next_in = reinterpret_cast<byte*>(&len);
+ zip->zp->avail_in = sizeof(len);
+
+ /* Compress the word, create output blocks as necessary. */
+ while (zip->zp->avail_in > 0) {
+
+ /* No space left in output buffer, create a new one. */
+ if (zip->zp->avail_out == 0) {
+ byte* block;
+
+ block = static_cast<byte*>(
+ ut_malloc_nokey(zip->block_sz));
+
+ ib_vector_push(zip->blocks, &block);
+
+ zip->zp->next_out = block;
+ zip->zp->avail_out = static_cast<uInt>(zip->block_sz);
+ }
+
+ switch (zip->status = deflate(zip->zp, Z_NO_FLUSH)) {
+ case Z_OK:
+ if (zip->zp->avail_in == 0) {
+ zip->zp->next_in = static_cast<byte*>(data);
+ zip->zp->avail_in = uInt(len);
+ ut_a(len <= FTS_MAX_WORD_LEN);
+ len = 0;
+ }
+ continue;
+
+ case Z_STREAM_END:
+ case Z_BUF_ERROR:
+ case Z_STREAM_ERROR:
+ default:
+ ut_error;
+ }
+ }
+
+ /* All data should have been compressed. */
+ ut_a(zip->zp->avail_in == 0);
+ zip->zp->next_in = NULL;
+
+ ++zip->n_words;
+
+ return(zip->n_words >= zip->max_words ? FALSE : TRUE);
+}
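+
+/* A note on the stream layout, inferred from the code above and from
+fts_zip_read_word(): each word is framed as
+
+	uint16 len;	// host byte order, see the FIXME above
+	byte word[len];	// word bytes, not NUL-terminated
+
+repeated for every distinct word; the stream is then finished with
+Z_FINISH in fts_zip_deflate_end() below. */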
+
+/**********************************************************************//**
+Finish Zip deflate. */
+static
+void
+fts_zip_deflate_end(
+/*================*/
+ fts_zip_t* zip) /*!< in: instance that should be closed*/
+{
+ ut_a(zip->zp->avail_in == 0);
+ ut_a(zip->zp->next_in == NULL);
+
+ zip->status = deflate(zip->zp, Z_FINISH);
+
+ ut_a(ib_vector_size(zip->blocks) > 0);
+ zip->last_big_block = ib_vector_size(zip->blocks) - 1;
+
+ /* Allocate smaller block(s), since this is trailing data. */
+ while (zip->status == Z_OK) {
+ byte* block;
+
+ ut_a(zip->zp->avail_out == 0);
+
+ block = static_cast<byte*>(
+ ut_malloc_nokey(FTS_MAX_WORD_LEN + 1));
+
+ ib_vector_push(zip->blocks, &block);
+
+ zip->zp->next_out = block;
+ zip->zp->avail_out = FTS_MAX_WORD_LEN;
+
+ zip->status = deflate(zip->zp, Z_FINISH);
+ }
+
+ ut_a(zip->status == Z_STREAM_END);
+
+ zip->status = deflateEnd(zip->zp);
+ ut_a(zip->status == Z_OK);
+
+ /* Reset the ZLib data structure. */
+ memset(zip->zp, 0, sizeof(*zip->zp));
+}
+
+/**********************************************************************//**
+Read the words from the FTS INDEX.
+@return DB_SUCCESS if all OK, DB_TABLE_NOT_FOUND if there are no more
+ indexes to search, else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_index_fetch_words(
+/*==================*/
+ fts_optimize_t* optim, /*!< in: optimize scratch pad */
+ const fts_string_t* word, /*!< in: get words greater than this
+ word */
+ ulint n_words)/*!< in: max words to read */
+{
+ pars_info_t* info;
+ que_t* graph;
+ ulint selected;
+ fts_zip_t* zip = NULL;
+ dberr_t error = DB_SUCCESS;
+ mem_heap_t* heap = static_cast<mem_heap_t*>(optim->self_heap->arg);
+ ibool inited = FALSE;
+
+ optim->trx->op_info = "fetching FTS index words";
+
+ if (optim->zip == NULL) {
+ optim->zip = fts_zip_create(heap, FTS_ZIP_BLOCK_SIZE, n_words);
+ } else {
+ fts_zip_initialize(optim->zip);
+ }
+
+ for (selected = fts_select_index(
+ optim->fts_index_table.charset, word->f_str, word->f_len);
+ selected < FTS_NUM_AUX_INDEX;
+ selected++) {
+
+ char table_name[MAX_FULL_NAME_LEN];
+
+ optim->fts_index_table.suffix = fts_get_suffix(selected);
+
+ info = pars_info_create();
+
+ pars_info_bind_function(
+ info, "my_func", fts_fetch_index_words, optim->zip);
+
+ pars_info_bind_varchar_literal(
+ info, "word", word->f_str, word->f_len);
+
+ fts_get_table_name(&optim->fts_index_table, table_name);
+ pars_info_bind_id(info, true, "table_name", table_name);
+
+ graph = fts_parse_sql(
+ &optim->fts_index_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT word\n"
+ " FROM $table_name\n"
+ " WHERE word > :word\n"
+ " ORDER BY word;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ zip = optim->zip;
+
+ for (;;) {
+ int err;
+
+ if (!inited && ((err = deflateInit(zip->zp, 9))
+ != Z_OK)) {
+ ib::error() << "ZLib deflateInit() failed: "
+ << err;
+
+ error = DB_ERROR;
+ break;
+ } else {
+ inited = TRUE;
+ error = fts_eval_sql(optim->trx, graph);
+ }
+
+ if (UNIV_LIKELY(error == DB_SUCCESS)) {
+ //FIXME fts_sql_commit(optim->trx);
+ break;
+ } else {
+ //FIXME fts_sql_rollback(optim->trx);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ ib::warn() << "Lock wait timeout"
+ " reading document. Retrying!";
+
+ /* We need to reset the ZLib state. */
+ inited = FALSE;
+ deflateEnd(zip->zp);
+ fts_zip_init(zip);
+
+ optim->trx->error_state = DB_SUCCESS;
+ } else {
+ ib::error() << "(" << error
+ << ") while reading document.";
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ fts_que_graph_free(graph);
+
+ /* Check if the maximum number of words to fetch has been
+ exceeded. */
+ if (optim->zip->n_words >= n_words) {
+ break;
+ }
+ }
+
+ if (error == DB_SUCCESS && zip->status == Z_OK && zip->n_words > 0) {
+
+ /* All data should have been read. */
+ ut_a(zip->zp->avail_in == 0);
+
+ fts_zip_deflate_end(zip);
+ } else {
+ deflateEnd(zip->zp);
+ }
+
+ return(error);
+}
+
+/**********************************************************************//**
+Callback function to fetch the doc id from the record.
+@return always returns TRUE */
+static
+ibool
+fts_fetch_doc_ids(
+/*==============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to fts_doc_ids_t */
+{
+ que_node_t* exp;
+ int i = 0;
+ sel_node_t* sel_node = static_cast<sel_node_t*>(row);
+ fts_doc_ids_t* fts_doc_ids = static_cast<fts_doc_ids_t*>(user_arg);
+ doc_id_t* update = static_cast<doc_id_t*>(
+ ib_vector_push(fts_doc_ids->doc_ids, NULL));
+
+ for (exp = sel_node->select_list;
+ exp;
+ exp = que_node_get_next(exp), ++i) {
+
+ dfield_t* dfield = que_node_get_val(exp);
+ void* data = dfield_get_data(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ ut_a(len != UNIV_SQL_NULL);
+
+ /* Note: The column numbers below must match the SELECT. */
+ switch (i) {
+ case 0: /* DOC_ID */
+ *update = fts_read_doc_id(
+ static_cast<byte*>(data));
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Read the rows from an FTS common auxiliary table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_table_fetch_doc_ids(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: table */
+ fts_doc_ids_t* doc_ids) /*!< in/out: for collecting doc ids */
+{
+ dberr_t error;
+ que_t* graph;
+ pars_info_t* info = pars_info_create();
+ ibool alloc_bk_trx = FALSE;
+ char table_name[MAX_FULL_NAME_LEN];
+
+ ut_a(fts_table->suffix != NULL);
+ ut_a(fts_table->type == FTS_COMMON_TABLE);
+
+ if (!trx) {
+ trx = trx_create();
+ alloc_bk_trx = TRUE;
+ }
+
+ trx->op_info = "fetching FTS doc ids";
+
+ pars_info_bind_function(info, "my_func", fts_fetch_doc_ids, doc_ids);
+
+ fts_get_table_name(fts_table, table_name);
+ pars_info_bind_id(info, true, "table_name", table_name);
+
+ graph = fts_parse_sql(
+ fts_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT doc_id FROM $table_name;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ error = fts_eval_sql(trx, graph);
+ fts_sql_commit(trx);
+
+ mutex_enter(&dict_sys.mutex);
+ que_graph_free(graph);
+ mutex_exit(&dict_sys.mutex);
+
+ if (error == DB_SUCCESS) {
+ ib_vector_sort(doc_ids->doc_ids, fts_doc_id_cmp);
+ }
+
+ if (alloc_bk_trx) {
+ trx->free();
+ }
+
+ return(error);
+}
+
+/**********************************************************************//**
+Do a binary search for a doc id in the array
+@return positive index if found; otherwise the negated insertion
+ point (note that -1 is ambiguous between slots 0 and 1) */
+int
+fts_bsearch(
+/*========*/
+ doc_id_t* array, /*!< in: sorted array to search */
+ int lower, /*!< in: the array lower bound */
+ int upper, /*!< in: the array upper bound */
+ doc_id_t doc_id) /*!< in: the doc id to search for */
+{
+ int orig_size = upper;
+
+ if (upper == 0) {
+ /* Nothing to search */
+ return(-1);
+ } else {
+ while (lower < upper) {
+ int i = (lower + upper) >> 1;
+
+ if (doc_id > array[i]) {
+ lower = i + 1;
+ } else if (doc_id < array[i]) {
+ upper = i - 1;
+ } else {
+ return(i); /* Found. */
+ }
+ }
+ }
+
+ if (lower == upper && lower < orig_size) {
+ if (doc_id == array[lower]) {
+ return(lower);
+ } else if (lower == 0) {
+ return(-1);
+ }
+ }
+
+ /* Not found. */
+ return( (lower == 0) ? -1 : -(lower));
+}
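+
+/* A sketch of the return convention (compare fts_optimize_lookup()
+below). A negative return is the negated insertion point, but -1 is
+ambiguous between slots 0 and 1, which is why the caller below
+double-checks that case:
+
+	int pos = fts_bsearch(array, 0, (int) n, doc_id);
+
+	if (pos >= 0) {
+		... found: doc_id == array[pos] ...
+	} else {
+		... absent: doc_id would sort near array[-pos] ...
+	}
+*/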
+
+/**********************************************************************//**
+Search the deleted doc id vector for any doc ids within the
+[first, last] range.
+@return positive index if found; otherwise the negated insertion
+ point */
+static
+int
+fts_optimize_lookup(
+/*================*/
+ ib_vector_t* doc_ids, /*!< in: array to search */
+ ulint lower, /*!< in: lower limit of array */
+ doc_id_t first_doc_id, /*!< in: doc id to lookup */
+ doc_id_t last_doc_id) /*!< in: doc id to lookup */
+{
+ int pos;
+ int upper = static_cast<int>(ib_vector_size(doc_ids));
+ doc_id_t* array = (doc_id_t*) doc_ids->data;
+
+ pos = fts_bsearch(array, static_cast<int>(lower), upper, first_doc_id);
+
+ ut_a(abs(pos) <= upper + 1);
+
+ if (pos < 0) {
+
+ int i = abs(pos);
+
+ /* If i is 1, first_doc_id could be less than either
+ the first or the second array item, so double
+ check */
+ if (i == 1 && array[0] <= last_doc_id
+ && first_doc_id < array[0]) {
+ pos = 0;
+ } else if (i < upper && array[i] <= last_doc_id) {
+
+ /* Check if the "next" doc id is within the
+ first & last doc id of the node. */
+ pos = i;
+ }
+ }
+
+ return(pos);
+}
+
+/**********************************************************************//**
+Encode the word pos list into the node
+@return DB_SUCCESS or error code*/
+static MY_ATTRIBUTE((nonnull))
+dberr_t
+fts_optimize_encode_node(
+/*=====================*/
+ fts_node_t* node, /*!< in: node to fill*/
+ doc_id_t doc_id, /*!< in: doc id to encode */
+ fts_encode_t* enc) /*!< in: encoding state.*/
+{
+ byte* dst;
+ ulint enc_len;
+ ulint pos_enc_len;
+ doc_id_t doc_id_delta;
+ dberr_t error = DB_SUCCESS;
+ byte* src = enc->src_ilist_ptr;
+
+ if (node->first_doc_id == 0) {
+ ut_a(node->last_doc_id == 0);
+
+ node->first_doc_id = doc_id;
+ }
+
+ /* Calculate the space required to store the ilist. */
+ ut_ad(doc_id > node->last_doc_id);
+ doc_id_delta = doc_id - node->last_doc_id;
+ enc_len = fts_get_encoded_len(static_cast<ulint>(doc_id_delta));
+
+ /* Calculate the size of the encoded pos array. */
+ while (*src) {
+ fts_decode_vlc(&src);
+ }
+
+ /* Skip the 0x00 byte at the end of the word positions list. */
+ ++src;
+
+ /* Number of encoded pos bytes to copy. */
+ pos_enc_len = ulint(src - enc->src_ilist_ptr);
+
+ /* Total number of bytes required for copy. */
+ enc_len += pos_enc_len;
+
+ /* Check we have enough space in the destination buffer for
+ copying the document word list. */
+ if (!node->ilist) {
+ ulint new_size;
+
+ ut_a(node->ilist_size == 0);
+
+ new_size = enc_len > FTS_ILIST_MAX_SIZE
+ ? enc_len : FTS_ILIST_MAX_SIZE;
+
+ node->ilist = static_cast<byte*>(ut_malloc_nokey(new_size));
+ node->ilist_size_alloc = new_size;
+
+ } else if ((node->ilist_size + enc_len) > node->ilist_size_alloc) {
+ ulint new_size = node->ilist_size + enc_len;
+ byte* ilist = static_cast<byte*>(ut_malloc_nokey(new_size));
+
+ memcpy(ilist, node->ilist, node->ilist_size);
+
+ ut_free(node->ilist);
+
+ node->ilist = ilist;
+ node->ilist_size_alloc = new_size;
+ }
+
+ src = enc->src_ilist_ptr;
+ dst = node->ilist + node->ilist_size;
+
+ /* Encode the doc id. Cast to ulint, the delta should be small and
+ therefore no loss of precision. */
+ dst += fts_encode_int((ulint) doc_id_delta, dst);
+
+ /* Copy the encoded pos array. */
+ memcpy(dst, src, pos_enc_len);
+
+ node->last_doc_id = doc_id;
+
+ /* Data copied up to here. */
+ node->ilist_size += enc_len;
+ enc->src_ilist_ptr += pos_enc_len;
+
+ ut_a(node->ilist_size <= node->ilist_size_alloc);
+
+ return(error);
+}
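+
+/* The ilist layout written above, and decoded again by
+fts_optimize_node() below, is one entry per document:
+
+	[VLC doc id delta][VLC position]...[VLC position][0x00]
+
+A decoding sketch, assuming ptr points at one document's entry:
+
+	delta = fts_decode_vlc(&ptr);		// doc id delta
+	while (*ptr) {
+		pos = fts_decode_vlc(&ptr);	// a word position
+	}
+	++ptr;					// skip the 0x00 terminator
+*/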
+
+/**********************************************************************//**
+Optimize the data contained in a node.
+@return DB_SUCCESS or error code*/
+static MY_ATTRIBUTE((nonnull))
+dberr_t
+fts_optimize_node(
+/*==============*/
+ ib_vector_t* del_vec, /*!< in: vector of doc ids to delete*/
+ int* del_pos, /*!< in: offset into above vector */
+ fts_node_t* dst_node, /*!< in: node to fill*/
+ fts_node_t* src_node, /*!< in: source node for data*/
+ fts_encode_t* enc) /*!< in: encoding state */
+{
+ ulint copied;
+ dberr_t error = DB_SUCCESS;
+ doc_id_t doc_id = enc->src_last_doc_id;
+
+ if (!enc->src_ilist_ptr) {
+ enc->src_ilist_ptr = src_node->ilist;
+ }
+
+ copied = ulint(enc->src_ilist_ptr - src_node->ilist);
+
+ /* While there is data in the source node and space to copy
+ into in the destination node. */
+ while (copied < src_node->ilist_size
+ && dst_node->ilist_size < FTS_ILIST_MAX_SIZE) {
+
+ doc_id_t delta;
+ doc_id_t del_doc_id = FTS_NULL_DOC_ID;
+
+ delta = fts_decode_vlc(&enc->src_ilist_ptr);
+
+test_again:
+ /* Check whether the doc id is in the delete list; if
+ so, we skip its entries, but we still need to track
+ the delta for decoding the entries that follow this
+ document's entries. */
+ if (*del_pos >= 0 && *del_pos < (int) ib_vector_size(del_vec)) {
+ doc_id_t* update;
+
+ update = (doc_id_t*) ib_vector_get(
+ del_vec, ulint(*del_pos));
+
+ del_doc_id = *update;
+ }
+
+ if (enc->src_ilist_ptr == src_node->ilist && doc_id == 0) {
+ ut_a(delta == src_node->first_doc_id);
+ }
+
+ doc_id += delta;
+
+ if (del_doc_id > 0 && doc_id == del_doc_id) {
+
+ ++*del_pos;
+
+ /* Skip the entries for this document. */
+ while (*enc->src_ilist_ptr) {
+ fts_decode_vlc(&enc->src_ilist_ptr);
+ }
+
+ /* Skip the end of word position marker. */
+ ++enc->src_ilist_ptr;
+
+ } else {
+
+ /* The doc id has already become larger than
+ del_doc_id; check the next del_doc_id */
+ if (del_doc_id > 0 && doc_id > del_doc_id) {
+ del_doc_id = 0;
+ ++*del_pos;
+ delta = 0;
+ goto test_again;
+ }
+
+ /* Decode and copy the word positions into
+ the dest node. */
+ fts_optimize_encode_node(dst_node, doc_id, enc);
+
+ ++dst_node->doc_count;
+
+ ut_a(dst_node->last_doc_id == doc_id);
+ }
+
+ /* Bytes copied so far from the source. */
+ copied = ulint(enc->src_ilist_ptr - src_node->ilist);
+ }
+
+ if (copied >= src_node->ilist_size) {
+ ut_a(doc_id == src_node->last_doc_id);
+ }
+
+ enc->src_last_doc_id = doc_id;
+
+ return(error);
+}
+
+/**********************************************************************//**
+Determine the starting pos within the deleted doc id vector for a word.
+@return delete position */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+int
+fts_optimize_deleted_pos(
+/*=====================*/
+ fts_optimize_t* optim, /*!< in: optimize state data */
+ fts_word_t* word) /*!< in: the word data to check */
+{
+ int del_pos;
+ ib_vector_t* del_vec = optim->to_delete->doc_ids;
+
+ /* Get the first and last doc ids for the word; we will use
+ these values to determine which doc ids need to be removed
+ when we coalesce the nodes. This way we can reduce the number
+ of elements that need to be searched in the deleted doc ids
+ vector, and secondly we can remove the doc ids during the
+ coalescing phase. */
+ if (ib_vector_size(del_vec) > 0) {
+ fts_node_t* node;
+ doc_id_t last_id;
+ doc_id_t first_id;
+ ulint size = ib_vector_size(word->nodes);
+
+ node = (fts_node_t*) ib_vector_get(word->nodes, 0);
+ first_id = node->first_doc_id;
+
+ node = (fts_node_t*) ib_vector_get(word->nodes, size - 1);
+ last_id = node->last_doc_id;
+
+ ut_a(first_id <= last_id);
+
+ del_pos = fts_optimize_lookup(
+ del_vec, optim->del_pos, first_id, last_id);
+ } else {
+
+ del_pos = -1; /* Note that there is nothing to delete. */
+ }
+
+ return(del_pos);
+}
+
+#define FTS_DEBUG_PRINT
+/**********************************************************************//**
+Compact the nodes for a word; we also remove any deleted doc ids
+during the compaction pass.
+@return vector of compacted nodes */
+static
+ib_vector_t*
+fts_optimize_word(
+/*==============*/
+ fts_optimize_t* optim, /*!< in: optimize state data */
+ fts_word_t* word) /*!< in: the word to optimize */
+{
+ fts_encode_t enc;
+ ib_vector_t* nodes;
+ ulint i = 0;
+ int del_pos;
+ fts_node_t* dst_node = NULL;
+ ib_vector_t* del_vec = optim->to_delete->doc_ids;
+ ulint size = ib_vector_size(word->nodes);
+
+ del_pos = fts_optimize_deleted_pos(optim, word);
+ nodes = ib_vector_create(word->heap_alloc, sizeof(*dst_node), 128);
+
+ enc.src_last_doc_id = 0;
+ enc.src_ilist_ptr = NULL;
+
+ while (i < size) {
+ ulint copied;
+ fts_node_t* src_node;
+
+ src_node = (fts_node_t*) ib_vector_get(word->nodes, i);
+
+ if (dst_node == NULL
+ || dst_node->last_doc_id > src_node->first_doc_id) {
+
+ dst_node = static_cast<fts_node_t*>(
+ ib_vector_push(nodes, NULL));
+ memset(dst_node, 0, sizeof(*dst_node));
+ }
+
+ /* Copy from the src to the dst node. */
+ fts_optimize_node(del_vec, &del_pos, dst_node, src_node, &enc);
+
+ ut_a(enc.src_ilist_ptr != NULL);
+
+ /* Determine the number of bytes copied to dst_node. */
+ copied = ulint(enc.src_ilist_ptr - src_node->ilist);
+
+ /* Can't copy more than what's in the VLC array. */
+ ut_a(copied <= src_node->ilist_size);
+
+ /* We are done with this node; release the resources. */
+ if (copied == src_node->ilist_size) {
+
+ enc.src_last_doc_id = 0;
+ enc.src_ilist_ptr = NULL;
+
+ ut_free(src_node->ilist);
+
+ src_node->ilist = NULL;
+ src_node->ilist_size = src_node->ilist_size_alloc = 0;
+
+ src_node = NULL;
+
+ ++i; /* Get next source node to OPTIMIZE. */
+ }
+
+ if (dst_node->ilist_size >= FTS_ILIST_MAX_SIZE || i >= size) {
+
+ dst_node = NULL;
+ }
+ }
+
+ /* All dst nodes created should have been added to the vector. */
+ ut_a(dst_node == NULL);
+
+ /* Return the OPTIMIZED nodes. */
+ return(nodes);
+}
+
+/**********************************************************************//**
+Update the FTS index table. This is a delete followed by an insert.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_write_word(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: table of FTS index */
+ fts_string_t* word, /*!< in: word data to write */
+ ib_vector_t* nodes) /*!< in: the nodes to write */
+{
+ ulint i;
+ pars_info_t* info;
+ que_t* graph;
+ ulint selected;
+ dberr_t error = DB_SUCCESS;
+ char table_name[MAX_FULL_NAME_LEN];
+
+ info = pars_info_create();
+
+ ut_ad(fts_table->charset);
+
+ pars_info_bind_varchar_literal(
+ info, "word", word->f_str, word->f_len);
+
+ selected = fts_select_index(fts_table->charset,
+ word->f_str, word->f_len);
+
+ fts_table->suffix = fts_get_suffix(selected);
+ fts_get_table_name(fts_table, table_name);
+ pars_info_bind_id(info, true, "table_name", table_name);
+
+ graph = fts_parse_sql(
+ fts_table,
+ info,
+ "BEGIN DELETE FROM $table_name WHERE word = :word;");
+
+ error = fts_eval_sql(trx, graph);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ib::error() << "(" << error << ") during optimize,"
+ " when deleting a word from the FTS index.";
+ }
+
+ fts_que_graph_free(graph);
+ graph = NULL;
+
+ /* Even if the operation needs to be rolled back and redone,
+ we iterate over the nodes in order to free the ilist. */
+ for (i = 0; i < ib_vector_size(nodes); ++i) {
+
+ fts_node_t* node = (fts_node_t*) ib_vector_get(nodes, i);
+
+ if (error == DB_SUCCESS) {
+ /* Skip empty node. */
+ if (node->ilist == NULL) {
+ ut_ad(node->ilist_size == 0);
+ continue;
+ }
+
+ error = fts_write_node(
+ trx, &graph, fts_table, word, node);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ib::error() << "(" << error << ")"
+ " during optimize, while adding a"
+ " word to the FTS index.";
+ }
+ }
+
+ ut_free(node->ilist);
+ node->ilist = NULL;
+ node->ilist_size = node->ilist_size_alloc = 0;
+ }
+
+ if (graph != NULL) {
+ fts_que_graph_free(graph);
+ }
+
+ return(error);
+}
+
+/**********************************************************************//**
+Free an fts_word_t instance. */
+void
+fts_word_free(
+/*==========*/
+ fts_word_t* word) /*!< in: instance to free.*/
+{
+ mem_heap_t* heap = static_cast<mem_heap_t*>(word->heap_alloc->arg);
+
+#ifdef UNIV_DEBUG
+ memset(word, 0, sizeof(*word));
+#endif /* UNIV_DEBUG */
+
+ mem_heap_free(heap);
+}
+
+/**********************************************************************//**
+Optimize the word ilist and rewrite data to the FTS index.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_compact(
+/*=================*/
+ fts_optimize_t* optim, /*!< in: optimize state data */
+ dict_index_t* index, /*!< in: current FTS being optimized */
+ time_t start_time) /*!< in: optimize start time */
+{
+ ulint i;
+ dberr_t error = DB_SUCCESS;
+ ulint size = ib_vector_size(optim->words);
+
+ for (i = 0; i < size && error == DB_SUCCESS && !optim->done; ++i) {
+ fts_word_t* word;
+ ib_vector_t* nodes;
+ trx_t* trx = optim->trx;
+
+ word = (fts_word_t*) ib_vector_get(optim->words, i);
+
+ /* nodes is allocated from the word heap and will be destroyed
+ when the word is freed. We do, however, have to be careful
+ with the ilist, which needs to be freed explicitly. */
+ nodes = fts_optimize_word(optim, word);
+
+ /* Update the data on disk. */
+ error = fts_optimize_write_word(
+ trx, &optim->fts_index_table, &word->text, nodes);
+
+ if (error == DB_SUCCESS) {
+ /* Write the last word optimized to the config table,
+ we use this value for restarting optimize. */
+ error = fts_config_set_index_value(
+ optim->trx, index,
+ FTS_LAST_OPTIMIZED_WORD, &word->text);
+ }
+
+ /* Free the word that was optimized. */
+ fts_word_free(word);
+
+ ulint interval = ulint(time(NULL) - start_time);
+
+ if (fts_optimize_time_limit > 0
+ && (lint(interval) < 0
+ || interval > fts_optimize_time_limit)) {
+
+ optim->done = TRUE;
+ }
+ }
+
+ return(error);
+}
+
+/**********************************************************************//**
+Create an instance of fts_optimize_t. Also create a new
+background transaction.*/
+static
+fts_optimize_t*
+fts_optimize_create(
+/*================*/
+ dict_table_t* table) /*!< in: table with FTS indexes */
+{
+ fts_optimize_t* optim;
+ mem_heap_t* heap = mem_heap_create(128);
+
+ optim = (fts_optimize_t*) mem_heap_zalloc(heap, sizeof(*optim));
+
+ optim->self_heap = ib_heap_allocator_create(heap);
+
+ optim->to_delete = fts_doc_ids_create();
+
+ optim->words = ib_vector_create(
+ optim->self_heap, sizeof(fts_word_t), 256);
+
+ optim->table = table;
+
+ optim->trx = trx_create();
+ trx_start_internal(optim->trx);
+
+ optim->fts_common_table.table_id = table->id;
+ optim->fts_common_table.type = FTS_COMMON_TABLE;
+ optim->fts_common_table.table = table;
+
+ optim->fts_index_table.table_id = table->id;
+ optim->fts_index_table.type = FTS_INDEX_TABLE;
+ optim->fts_index_table.table = table;
+
+ /* The common prefix for all this parent table's aux tables. */
+ optim->name_prefix = fts_get_table_name_prefix(
+ &optim->fts_common_table);
+
+ return(optim);
+}
+
+#ifdef FTS_OPTIMIZE_DEBUG
+/**********************************************************************//**
+Get optimize start time of an FTS index.
+@return DB_SUCCESS if all OK else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_get_index_start_time(
+/*==============================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ time_t* start_time) /*!< out: time in secs */
+{
+ return(fts_config_get_index_ulint(
+ trx, index, FTS_OPTIMIZE_START_TIME,
+ (ulint*) start_time));
+}
+
+/**********************************************************************//**
+Set the optimize start time of an FTS index.
+@return DB_SUCCESS if all OK else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_set_index_start_time(
+/*==============================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ time_t start_time) /*!< in: start time */
+{
+ return(fts_config_set_index_ulint(
+ trx, index, FTS_OPTIMIZE_START_TIME,
+ (ulint) start_time));
+}
+
+/**********************************************************************//**
+Get optimize end time of an FTS index.
+@return DB_SUCCESS if all OK else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_get_index_end_time(
+/*============================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ time_t* end_time) /*!< out: time in secs */
+{
+ return(fts_config_get_index_ulint(
+ trx, index, FTS_OPTIMIZE_END_TIME, (ulint*) end_time));
+}
+
+/**********************************************************************//**
+Set the optimize end time of an FTS index.
+@return DB_SUCCESS if all OK else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_set_index_end_time(
+/*============================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ time_t end_time) /*!< in: end time */
+{
+ return(fts_config_set_index_ulint(
+ trx, index, FTS_OPTIMIZE_END_TIME, (ulint) end_time));
+}
+#endif
+
+/**********************************************************************//**
+Free the optimize prepared statements.*/
+static
+void
+fts_optimize_graph_free(
+/*====================*/
+ fts_optimize_graph_t* graph) /*!< in/out: The graph instances
+ to free */
+{
+ if (graph->commit_graph) {
+ que_graph_free(graph->commit_graph);
+ graph->commit_graph = NULL;
+ }
+
+ if (graph->write_nodes_graph) {
+ que_graph_free(graph->write_nodes_graph);
+ graph->write_nodes_graph = NULL;
+ }
+
+ if (graph->delete_nodes_graph) {
+ que_graph_free(graph->delete_nodes_graph);
+ graph->delete_nodes_graph = NULL;
+ }
+
+ if (graph->read_nodes_graph) {
+ que_graph_free(graph->read_nodes_graph);
+ graph->read_nodes_graph = NULL;
+ }
+}
+
+/**********************************************************************//**
+Free all optimize resources. */
+static
+void
+fts_optimize_free(
+/*==============*/
+ fts_optimize_t* optim) /*!< in: optimize instance to free */
+{
+ mem_heap_t* heap = static_cast<mem_heap_t*>(optim->self_heap->arg);
+
+ trx_commit_for_mysql(optim->trx);
+ optim->trx->free();
+ optim->trx = NULL;
+
+ fts_doc_ids_free(optim->to_delete);
+ fts_optimize_graph_free(&optim->graph);
+
+ ut_free(optim->name_prefix);
+
+ /* This will free the heap from which optim itself was allocated. */
+ mem_heap_free(heap);
+}
+
+/**********************************************************************//**
+Get the max time optimize should run in millisecs.
+@return max optimize time limit in millisecs. */
+static
+ulint
+fts_optimize_get_time_limit(
+/*========================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table) /*!< in: aux table */
+{
+ ulint time_limit = 0;
+
+ fts_config_get_ulint(
+ trx, fts_table,
+ FTS_OPTIMIZE_LIMIT_IN_SECS, &time_limit);
+
+ /* FIXME: This is returning milliseconds, while the variable
+ is being stored and interpreted as seconds! */
+ return(time_limit * 1000);
+}
+
+/**********************************************************************//**
+Run OPTIMIZE on the words of the given FTS index. Note: this can take
+a very long time (hours). */
+static
+void
+fts_optimize_words(
+/*===============*/
+ fts_optimize_t* optim, /*!< in: optimize instance */
+ dict_index_t* index, /*!< in: current FTS being optimized */
+ fts_string_t* word) /*!< in: the starting word to optimize */
+{
+ fts_fetch_t fetch;
+ que_t* graph = NULL;
+ CHARSET_INFO* charset = optim->fts_index_table.charset;
+
+ ut_a(!optim->done);
+
+ /* Get the time limit from the config table. */
+ fts_optimize_time_limit = fts_optimize_get_time_limit(
+ optim->trx, &optim->fts_common_table);
+
+ const time_t start_time = time(NULL);
+
+ /* Set up the callback to use for fetching the word ilist etc. */
+ fetch.read_arg = optim->words;
+ fetch.read_record = fts_optimize_index_fetch_node;
+
+ while (!optim->done) {
+ dberr_t error;
+ trx_t* trx = optim->trx;
+ ulint selected;
+
+ ut_a(ib_vector_size(optim->words) == 0);
+
+ selected = fts_select_index(charset, word->f_str, word->f_len);
+
+ /* Read the index records to optimize. */
+ fetch.total_memory = 0;
+ error = fts_index_fetch_nodes(
+ trx, &graph, &optim->fts_index_table, word,
+ &fetch);
+ ut_ad(fetch.total_memory < fts_result_cache_limit);
+
+ if (error == DB_SUCCESS) {
+ /* There must be some nodes to read. */
+ ut_a(ib_vector_size(optim->words) > 0);
+
+ /* Optimize the nodes that were read and write
+ back to DB. */
+ error = fts_optimize_compact(optim, index, start_time);
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(optim->trx);
+ } else {
+ fts_sql_rollback(optim->trx);
+ }
+ }
+
+ ib_vector_reset(optim->words);
+
+ if (error == DB_SUCCESS) {
+ if (!optim->done) {
+ if (!fts_zip_read_word(optim->zip, word)) {
+ optim->done = TRUE;
+ } else if (selected
+ != fts_select_index(
+ charset, word->f_str,
+ word->f_len)
+ && graph) {
+ fts_que_graph_free(graph);
+ graph = NULL;
+ }
+ }
+ } else if (error == DB_LOCK_WAIT_TIMEOUT) {
+ ib::warn() << "Lock wait timeout during optimize."
+ " Retrying!";
+
+ trx->error_state = DB_SUCCESS;
+ } else if (error == DB_DEADLOCK) {
+ ib::warn() << "Deadlock during optimize. Retrying!";
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ optim->done = TRUE; /* Exit the loop. */
+ }
+ }
+
+ if (graph != NULL) {
+ fts_que_graph_free(graph);
+ }
+}
+
+/**********************************************************************//**
+Optimize is complete. Set the completion time, and reset the optimize
+start string for this FTS index to "".
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_index_completed(
+/*=========================*/
+ fts_optimize_t* optim, /*!< in: optimize instance */
+ dict_index_t* index) /*!< in: table with one FTS index */
+{
+ fts_string_t word;
+ dberr_t error;
+ byte buf[sizeof(ulint)];
+#ifdef FTS_OPTIMIZE_DEBUG
+ time_t end_time = time(NULL);
+
+ error = fts_optimize_set_index_end_time(optim->trx, index, end_time);
+#endif
+
+ /* If we've reached the end of the index then set the start
+ word to the empty string. */
+
+ word.f_len = 0;
+ word.f_str = buf;
+ *word.f_str = '\0';
+
+ error = fts_config_set_index_value(
+ optim->trx, index, FTS_LAST_OPTIMIZED_WORD, &word);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ib::error() << "(" << error << ") while updating"
+ " last optimized word!";
+ }
+
+ return(error);
+}
+
+
+/**********************************************************************//**
+Read the list of words from the FTS auxiliary index that will be
+optimized in this pass.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_index_read_words(
+/*==========================*/
+ fts_optimize_t* optim, /*!< in: optimize instance */
+ dict_index_t* index, /*!< in: table with one FTS index */
+ fts_string_t* word) /*!< in: buffer to use */
+{
+ dberr_t error = DB_SUCCESS;
+
+ if (optim->del_list_regenerated) {
+ word->f_len = 0;
+ } else {
+
+ /* Get the last word that was optimized from
+ the config table. */
+ error = fts_config_get_index_value(
+ optim->trx, index, FTS_LAST_OPTIMIZED_WORD, word);
+ }
+
+ /* If the record is not found, we start from the top. */
+ if (error == DB_RECORD_NOT_FOUND) {
+ word->f_len = 0;
+ error = DB_SUCCESS;
+ }
+
+ while (error == DB_SUCCESS) {
+
+ error = fts_index_fetch_words(
+ optim, word, fts_num_word_optimize);
+
+ if (error == DB_SUCCESS) {
+ /* Reset the last optimized word to '' if no
+ more words could be read from the FTS index. */
+ if (optim->zip->n_words == 0) {
+ word->f_len = 0;
+ *word->f_str = 0;
+ }
+
+ break;
+ }
+ }
+
+ return(error);
+}
+
+/**********************************************************************//**
+Run OPTIMIZE on the given FTS index. Note: this can take a very long
+time (hours).
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_index(
+/*===============*/
+ fts_optimize_t* optim, /*!< in: optimize instance */
+ dict_index_t* index) /*!< in: table with one FTS index */
+{
+ fts_string_t word;
+ dberr_t error;
+ byte str[FTS_MAX_WORD_LEN + 1];
+
+ /* Set the current index that we have to optimize. */
+ optim->fts_index_table.index_id = index->id;
+ optim->fts_index_table.charset = fts_index_get_charset(index);
+
+ optim->done = FALSE; /* Optimize until !done */
+
+ /* We need to read the last word optimized so that we start from
+ the next word. */
+ word.f_str = str;
+
+ /* We set the length of word to the size of str since we
+ need to pass the max len info to fts_config_get_index_value(). */
+ word.f_len = sizeof(str) - 1;
+
+ memset(word.f_str, 0x0, word.f_len);
+
+ /* Read the words that will be optimized in this pass. */
+ error = fts_optimize_index_read_words(optim, index, &word);
+
+ if (error == DB_SUCCESS) {
+ int zip_error;
+
+ ut_a(optim->zip->pos == 0);
+ ut_a(optim->zip->zp->total_in == 0);
+ ut_a(optim->zip->zp->total_out == 0);
+
+ zip_error = inflateInit(optim->zip->zp);
+ ut_a(zip_error == Z_OK);
+
+ word.f_len = 0;
+ word.f_str = str;
+
+ /* Read the first word to optimize from the Zip buffer. */
+ if (!fts_zip_read_word(optim->zip, &word)) {
+
+ optim->done = TRUE;
+ } else {
+ fts_optimize_words(optim, index, &word);
+ }
+
+ /* If we couldn't read any records then optimize is
+ complete. Increment the number of indexes that have
+ been optimized and set FTS index optimize state to
+ completed. */
+ if (error == DB_SUCCESS && optim->zip->n_words == 0) {
+
+ error = fts_optimize_index_completed(optim, index);
+
+ if (error == DB_SUCCESS) {
+ ++optim->n_completed;
+ }
+ }
+ }
+
+ return(error);
+}
+
+/**********************************************************************//**
+Delete the document ids from the DELETED and DELETED_CACHE tables.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_purge_deleted_doc_ids(
+/*===============================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ ulint i;
+ pars_info_t* info;
+ que_t* graph;
+ doc_id_t* update;
+ doc_id_t write_doc_id;
+ dberr_t error = DB_SUCCESS;
+ char deleted[MAX_FULL_NAME_LEN];
+ char deleted_cache[MAX_FULL_NAME_LEN];
+
+ info = pars_info_create();
+
+ ut_a(ib_vector_size(optim->to_delete->doc_ids) > 0);
+
+ update = static_cast<doc_id_t*>(
+ ib_vector_get(optim->to_delete->doc_ids, 0));
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &write_doc_id, *update);
+
+ /* The SQL parser must be able to resolve both of the
+ following variables at parse time, so we bind the same
+ doc id twice. */
+ fts_bind_doc_id(info, "doc_id1", &write_doc_id);
+ fts_bind_doc_id(info, "doc_id2", &write_doc_id);
+
+ /* Make sure the following two names are consistent with the
+ names used in fts_delete_doc_ids_sql */
+ optim->fts_common_table.suffix = fts_common_tables[3];
+ fts_get_table_name(&optim->fts_common_table, deleted);
+ pars_info_bind_id(info, true, fts_common_tables[3], deleted);
+
+ optim->fts_common_table.suffix = fts_common_tables[4];
+ fts_get_table_name(&optim->fts_common_table, deleted_cache);
+ pars_info_bind_id(info, true, fts_common_tables[4], deleted_cache);
+
+ graph = fts_parse_sql(NULL, info, fts_delete_doc_ids_sql);
+
+ /* Delete the doc ids that were copied at the start. */
+ for (i = 0; i < ib_vector_size(optim->to_delete->doc_ids); ++i) {
+
+ update = static_cast<doc_id_t*>(ib_vector_get(
+ optim->to_delete->doc_ids, i));
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &write_doc_id, *update);
+
+ fts_bind_doc_id(info, "doc_id1", &write_doc_id);
+
+ fts_bind_doc_id(info, "doc_id2", &write_doc_id);
+
+ error = fts_eval_sql(optim->trx, graph);
+
+ // FIXME: Check whether delete actually succeeded!
+ if (error != DB_SUCCESS) {
+
+ fts_sql_rollback(optim->trx);
+ break;
+ }
+ }
+
+ fts_que_graph_free(graph);
+
+ return(error);
+}
+
+/**********************************************************************//**
+Delete the document ids from the pending delete snapshot
+(BEING_DELETED and BEING_DELETED_CACHE) tables.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_purge_deleted_doc_id_snapshot(
+/*=======================================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ dberr_t error;
+ que_t* graph;
+ pars_info_t* info;
+ char being_deleted[MAX_FULL_NAME_LEN];
+ char being_deleted_cache[MAX_FULL_NAME_LEN];
+
+ info = pars_info_create();
+
+ /* Make sure the following two names are consistent with the
+ names used in fts_end_delete_sql */
+ optim->fts_common_table.suffix = fts_common_tables[0];
+ fts_get_table_name(&optim->fts_common_table, being_deleted);
+ pars_info_bind_id(info, true, fts_common_tables[0], being_deleted);
+
+ optim->fts_common_table.suffix = fts_common_tables[1];
+ fts_get_table_name(&optim->fts_common_table, being_deleted_cache);
+ pars_info_bind_id(info, true, fts_common_tables[1],
+ being_deleted_cache);
+
+ /* Delete the doc ids that were copied to delete pending state at
+ the start of optimize. */
+ graph = fts_parse_sql(NULL, info, fts_end_delete_sql);
+
+ error = fts_eval_sql(optim->trx, graph);
+ fts_que_graph_free(graph);
+
+ return(error);
+}
+
+/**********************************************************************//**
+Get the number of rows in the BEING_DELETED auxiliary table; a
+non-zero count means a deleted doc id snapshot is still pending.
+@return number of rows */
+static
+ulint
+fts_optimize_being_deleted_count(
+/*=============================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ fts_table_t fts_table;
+
+ FTS_INIT_FTS_TABLE(&fts_table, "BEING_DELETED", FTS_COMMON_TABLE,
+ optim->table);
+
+ return(fts_get_rows_count(&fts_table));
+}
+
+/*********************************************************************//**
+Copy the deleted doc ids that will be purged during this optimize run
+to the being deleted FTS auxiliary tables. The transaction is committed
+upon successful copy and rolled back on DB_DUPLICATE_KEY error.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_create_deleted_doc_id_snapshot(
+/*========================================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ dberr_t error;
+ que_t* graph;
+ pars_info_t* info;
+ char being_deleted[MAX_FULL_NAME_LEN];
+ char deleted[MAX_FULL_NAME_LEN];
+ char being_deleted_cache[MAX_FULL_NAME_LEN];
+ char deleted_cache[MAX_FULL_NAME_LEN];
+
+ info = pars_info_create();
+
+ /* Make sure the following four names are consistent with the
+ names used in fts_init_delete_sql */
+ optim->fts_common_table.suffix = fts_common_tables[0];
+ fts_get_table_name(&optim->fts_common_table, being_deleted);
+ pars_info_bind_id(info, true, fts_common_tables[0], being_deleted);
+
+ optim->fts_common_table.suffix = fts_common_tables[3];
+ fts_get_table_name(&optim->fts_common_table, deleted);
+ pars_info_bind_id(info, true, fts_common_tables[3], deleted);
+
+ optim->fts_common_table.suffix = fts_common_tables[1];
+ fts_get_table_name(&optim->fts_common_table, being_deleted_cache);
+ pars_info_bind_id(info, true, fts_common_tables[1],
+ being_deleted_cache);
+
+ optim->fts_common_table.suffix = fts_common_tables[4];
+ fts_get_table_name(&optim->fts_common_table, deleted_cache);
+ pars_info_bind_id(info, true, fts_common_tables[4], deleted_cache);
+
+ /* Move doc_ids that are to be deleted to state being deleted. */
+ graph = fts_parse_sql(NULL, info, fts_init_delete_sql);
+
+ error = fts_eval_sql(optim->trx, graph);
+
+ fts_que_graph_free(graph);
+
+ if (error != DB_SUCCESS) {
+ fts_sql_rollback(optim->trx);
+ } else {
+ fts_sql_commit(optim->trx);
+ }
+
+ optim->del_list_regenerated = TRUE;
+
+ return(error);
+}
+
+/*********************************************************************//**
+Read in the document ids that are to be purged during optimize. The
+transaction is committed upon a successful read.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_read_deleted_doc_id_snapshot(
+/*======================================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ dberr_t error;
+
+ optim->fts_common_table.suffix = "BEING_DELETED";
+
+ /* Read the doc_ids to delete. */
+ error = fts_table_fetch_doc_ids(
+ optim->trx, &optim->fts_common_table, optim->to_delete);
+
+ if (error == DB_SUCCESS) {
+
+ optim->fts_common_table.suffix = "BEING_DELETED_CACHE";
+
+ /* Read additional doc_ids to delete. */
+ error = fts_table_fetch_doc_ids(
+ optim->trx, &optim->fts_common_table, optim->to_delete);
+ }
+
+ if (error != DB_SUCCESS) {
+
+ fts_doc_ids_free(optim->to_delete);
+ optim->to_delete = NULL;
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Optimize all the FTS indexes, skipping those that have already been
+optimized, since the FTS auxiliary indexes are not guaranteed to be
+of the same cardinality.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_indexes(
+/*=================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ ulint i;
+ dberr_t error = DB_SUCCESS;
+ fts_t* fts = optim->table->fts;
+
+ /* Optimize the FTS indexes. */
+ for (i = 0; i < ib_vector_size(fts->indexes); ++i) {
+ dict_index_t* index = static_cast<dict_index_t*>(
+ ib_vector_getp(fts->indexes, i));
+
+#ifdef FTS_OPTIMIZE_DEBUG
+ time_t end_time;
+ time_t start_time;
+
+ /* Get the start and end optimize times for this index. */
+ error = fts_optimize_get_index_start_time(
+ optim->trx, index, &start_time);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ error = fts_optimize_get_index_end_time(
+ optim->trx, index, &end_time);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ /* Start time will be 0 only for the first time or after
+ completing the optimization of all FTS indexes. */
+ if (start_time == 0) {
+ start_time = time(NULL);
+
+ error = fts_optimize_set_index_start_time(
+ optim->trx, index, start_time);
+ }
+
+ /* Check if this index needs to be optimized or not. */
+ if (difftime(end_time, start_time) < 0) {
+ error = fts_optimize_index(optim, index);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+ } else {
+ ++optim->n_completed;
+ }
+#endif
+ error = fts_optimize_index(optim, index);
+ }
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(optim->trx);
+ } else {
+ fts_sql_rollback(optim->trx);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Cleanup the snapshot tables and the master deleted table.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_purge_snapshot(
+/*========================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ dberr_t error;
+
+ /* Delete the doc ids from the master deleted tables, that were
+ in the snapshot that was taken at the start of optimize. */
+ error = fts_optimize_purge_deleted_doc_ids(optim);
+
+ if (error == DB_SUCCESS) {
+ /* Destroy the deleted doc id snapshot. */
+ error = fts_optimize_purge_deleted_doc_id_snapshot(optim);
+ }
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(optim->trx);
+ } else {
+ fts_sql_rollback(optim->trx);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Reset the start time to 0 so that a new optimize can be started.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_reset_start_time(
+/*==========================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ dberr_t error = DB_SUCCESS;
+#ifdef FTS_OPTIMIZE_DEBUG
+ fts_t* fts = optim->table->fts;
+
+ /* Optimization should have been completed for all indexes. */
+ ut_a(optim->n_completed == ib_vector_size(fts->indexes));
+
+ for (uint i = 0; i < ib_vector_size(fts->indexes); ++i) {
+		dict_index_t*	index = static_cast<dict_index_t*>(
+			ib_vector_getp(fts->indexes, i));
+
+		time_t	start_time = 0;
+
+		/* Reset the start time to 0 for this index. */
+		error = fts_optimize_set_index_start_time(
+			optim->trx, index, start_time);
+ }
+#endif
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(optim->trx);
+ } else {
+ fts_sql_rollback(optim->trx);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Run OPTIMIZE on the given table by a background thread.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull))
+dberr_t
+fts_optimize_table_bk(
+/*==================*/
+	fts_slot_t*	slot)	/*!< in: table to optimize */
+{
+ const time_t now = time(NULL);
+ const ulint interval = ulint(now - slot->last_run);
+
+ /* Avoid optimizing tables that were optimized recently. */
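+	/* A negative interval means that the system clock moved
+	backwards; in that case, do not skip the run. */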
+ if (slot->last_run > 0
+ && lint(interval) >= 0
+ && interval < FTS_OPTIMIZE_INTERVAL_IN_SECS) {
+
+ return(DB_SUCCESS);
+ }
+
+ dict_table_t* table = slot->table;
+ dberr_t error;
+
+ if (table->is_accessible()
+ && table->fts && table->fts->cache
+ && table->fts->cache->deleted >= FTS_OPTIMIZE_THRESHOLD) {
+ error = fts_optimize_table(table);
+
+ slot->last_run = time(NULL);
+
+ if (error == DB_SUCCESS) {
+ slot->running = false;
+ slot->completed = slot->last_run;
+ }
+ } else {
+ /* Note time this run completed. */
+ slot->last_run = now;
+ error = DB_SUCCESS;
+ }
+
+ return(error);
+}
+/*********************************************************************//**
+Run OPTIMIZE on the given table.
+@return DB_SUCCESS if all OK */
+dberr_t
+fts_optimize_table(
+/*===============*/
+	dict_table_t*	table)	/*!< in: table to optimize */
+{
+ if (srv_read_only_mode) {
+ return DB_READ_ONLY;
+ }
+
+ dberr_t error = DB_SUCCESS;
+ fts_optimize_t* optim = NULL;
+ fts_t* fts = table->fts;
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ ib::info() << "FTS start optimize " << table->name;
+ }
+
+ optim = fts_optimize_create(table);
+
+ // FIXME: Call this only at the start of optimize, currently we
+ // rely on DB_DUPLICATE_KEY to handle corrupting the snapshot.
+
+ /* Check whether there are still records in BEING_DELETED table */
+ if (fts_optimize_being_deleted_count(optim) == 0) {
+ /* Take a snapshot of the deleted document ids, they are copied
+ to the BEING_ tables. */
+ error = fts_optimize_create_deleted_doc_id_snapshot(optim);
+ }
+
+ /* A duplicate error is OK, since we don't erase the
+ doc ids from the being deleted state until all FTS
+ indexes have been optimized. */
+ if (error == DB_DUPLICATE_KEY) {
+ error = DB_SUCCESS;
+ }
+
+ if (error == DB_SUCCESS) {
+
+ /* These document ids will be filtered out during the
+ index optimization phase. They are in the snapshot that we
+ took above, at the start of the optimize. */
+ error = fts_optimize_read_deleted_doc_id_snapshot(optim);
+
+ if (error == DB_SUCCESS) {
+
+			/* Commit the transaction that read the
+			being-deleted doc ids. */
+			fts_sql_commit(optim->trx);
+
+			/* Do the optimization only if there are
+			deleted records to be cleaned up. */
+ if (ib_vector_size(optim->to_delete->doc_ids) > 0) {
+ error = fts_optimize_indexes(optim);
+ }
+
+ } else {
+ ut_a(optim->to_delete == NULL);
+ }
+
+ /* Only after all indexes have been optimized can we
+ delete the (snapshot) doc ids in the pending delete,
+ and master deleted tables. */
+ if (error == DB_SUCCESS
+ && optim->n_completed == ib_vector_size(fts->indexes)) {
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ ib::info() << "FTS_OPTIMIZE: Completed"
+ " Optimize, cleanup DELETED table";
+ }
+
+ if (ib_vector_size(optim->to_delete->doc_ids) > 0) {
+
+ /* Purge the doc ids that were in the
+ snapshot from the snapshot tables and
+ the master deleted table. */
+ error = fts_optimize_purge_snapshot(optim);
+ }
+
+ if (error == DB_SUCCESS) {
+ /* Reset the start time of all the FTS indexes
+ so that optimize can be restarted. */
+ error = fts_optimize_reset_start_time(optim);
+ }
+ }
+ }
+
+ fts_optimize_free(optim);
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ ib::info() << "FTS end optimize " << table->name;
+ }
+
+ return(error);
+}
+
+/********************************************************************//**
+Create a message for the OPTIMIZER's work queue.
+@return new message instance */
+static
+fts_msg_t*
+fts_optimize_create_msg(
+/*====================*/
+ fts_msg_type_t type, /*!< in: type of message */
+ void* ptr) /*!< in: message payload */
+{
+ mem_heap_t* heap;
+ fts_msg_t* msg;
+
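+	/* Reserve space for the message itself, for the list node
+	that ib_wqueue_add() will allocate from this heap, and a
+	little slack. */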
+ heap = mem_heap_create(sizeof(*msg) + sizeof(ib_list_node_t) + 16);
+ msg = static_cast<fts_msg_t*>(mem_heap_alloc(heap, sizeof(*msg)));
+
+ msg->ptr = ptr;
+ msg->type = type;
+ msg->heap = heap;
+
+ return(msg);
+}
+
+/** Add a message to the work queue and signal the thread pool.
+@param msg		message to enqueue
+@param wq_locked	whether the caller already holds fts_optimize_wq->mutex */
+static void add_msg(fts_msg_t *msg, bool wq_locked= false)
+{
+ ib_wqueue_add(fts_optimize_wq, msg, msg->heap, wq_locked);
+ srv_thread_pool->submit_task(&task);
+}
+
+/**
+Called by the "idle" timer. Submits the optimize task which, when
+the queue is empty, will only recalculate whether a sync is needed.
+*/
+static void timer_callback(void*)
+{
+ srv_thread_pool->submit_task(&task);
+}
+
+/** Add the given table to the OPTIMIZER's list.
+@param[in] table table to add */
+void fts_optimize_add_table(dict_table_t* table)
+{
+ fts_msg_t* msg;
+
+ if (!fts_optimize_wq) {
+ return;
+ }
+
+ /* Make sure table with FTS index cannot be evicted */
+ dict_table_prevent_eviction(table);
+
+ msg = fts_optimize_create_msg(FTS_MSG_ADD_TABLE, table);
+
+ mutex_enter(&fts_optimize_wq->mutex);
+
+ add_msg(msg, true);
+
+ table->fts->in_queue = true;
+
+ mutex_exit(&fts_optimize_wq->mutex);
+}
+
+/**********************************************************************//**
+Remove the table from the OPTIMIZER's list. We wait for an
+acknowledgement from the consumer of the message. */
+void
+fts_optimize_remove_table(
+/*======================*/
+ dict_table_t* table) /*!< in: table to remove */
+{
+ fts_msg_t* msg;
+ os_event_t event;
+ fts_msg_del_t* remove;
+
+	/* If the optimize system is not yet initialized, return. */
+ if (!fts_optimize_wq) {
+ return;
+ }
+
+	/* The FTS optimizer thread has already exited. */
+	if (fts_opt_start_shutdown) {
+		ib::info() << "Trying to remove table " << table->name
+			<< " after the FTS optimize thread exited.";
+		/* If the table can't be removed, then wait until the
+		FTS optimize thread shuts down. */
+ while (fts_optimize_wq) {
+ os_thread_sleep(10000);
+ }
+ return;
+ }
+
+ mutex_enter(&fts_optimize_wq->mutex);
+
+ if (!table->fts->in_queue) {
+ mutex_exit(&fts_optimize_wq->mutex);
+ return;
+ }
+
+ msg = fts_optimize_create_msg(FTS_MSG_DEL_TABLE, NULL);
+
+ /* We will wait on this event until signalled by the consumer. */
+ event = os_event_create(0);
+
+ remove = static_cast<fts_msg_del_t*>(
+ mem_heap_alloc(msg->heap, sizeof(*remove)));
+
+ remove->table = table;
+ remove->event = event;
+ msg->ptr = remove;
+
+ ut_ad(!mutex_own(&dict_sys.mutex));
+
+ add_msg(msg, true);
+
+ mutex_exit(&fts_optimize_wq->mutex);
+
+ os_event_wait(event);
+
+ os_event_destroy(event);
+
+#ifdef UNIV_DEBUG
+ if (!fts_opt_start_shutdown) {
+ mutex_enter(&fts_optimize_wq->mutex);
+ ut_ad(!table->fts->in_queue);
+ mutex_exit(&fts_optimize_wq->mutex);
+ }
+#endif /* UNIV_DEBUG */
+}
+
+/** Request a sync of the FTS cache for the table.
+@param[in]	table	table to sync */
+void
+fts_optimize_request_sync_table(
+ dict_table_t* table)
+{
+	/* If the optimize system is not yet initialized, return. */
+ if (!fts_optimize_wq) {
+ return;
+ }
+
+	/* The FTS optimizer thread has already exited. */
+	if (fts_opt_start_shutdown) {
+		ib::info() << "Trying to sync table " << table->name
+			<< " after the FTS optimize thread exited.";
+ return;
+ }
+
+ mutex_enter(&fts_optimize_wq->mutex);
+
+ if (table->fts->sync_message) {
+		/* If the table already has a SYNC message in the
+		fts_optimize_wq queue, then ignore this request. */
+ mutex_exit(&fts_optimize_wq->mutex);
+ return;
+ }
+
+ fts_msg_t* msg = fts_optimize_create_msg(FTS_MSG_SYNC_TABLE, table);
+
+ add_msg(msg, true);
+
+ table->fts->sync_message = true;
+
+ mutex_exit(&fts_optimize_wq->mutex);
+}
+
+/** Add a table to fts_slots if it doesn't already exist. */
+static bool fts_optimize_new_table(dict_table_t* table)
+{
+ ut_ad(table);
+
+ ulint i;
+ fts_slot_t* slot;
+ fts_slot_t* empty = NULL;
+
+ /* Search for duplicates, also find a free slot if one exists. */
+ for (i = 0; i < ib_vector_size(fts_slots); ++i) {
+
+ slot = static_cast<fts_slot_t*>(ib_vector_get(fts_slots, i));
+
+ if (!slot->table) {
+ empty = slot;
+ } else if (slot->table == table) {
+ /* Already exists in our optimize queue. */
+ return false;
+ }
+ }
+
+ slot = empty ? empty : static_cast<fts_slot_t*>(
+ ib_vector_push(fts_slots, NULL));
+
+ memset(slot, 0x0, sizeof(*slot));
+
+ slot->table = table;
+ return true;
+}
+
+/** Remove a table from fts_slots if it exists.
+@param[in,out] table table to be removed from fts_slots */
+static bool fts_optimize_del_table(const dict_table_t* table)
+{
+ ut_ad(table);
+ for (ulint i = 0; i < ib_vector_size(fts_slots); ++i) {
+ fts_slot_t* slot;
+
+ slot = static_cast<fts_slot_t*>(ib_vector_get(fts_slots, i));
+
+ if (slot->table == table) {
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ ib::info() << "FTS Optimize Removing table "
+ << table->name;
+ }
+
+ mutex_enter(&fts_optimize_wq->mutex);
+ slot->table->fts->in_queue = false;
+ mutex_exit(&fts_optimize_wq->mutex);
+ slot->table = NULL;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/**********************************************************************//**
+Calculate how many tables in fts_slots need to be optimized.
+@return no. of tables to optimize */
+static ulint fts_optimize_how_many()
+{
+ ulint n_tables = 0;
+ const time_t current_time = time(NULL);
+
+ for (ulint i = 0; i < ib_vector_size(fts_slots); ++i) {
+ const fts_slot_t* slot = static_cast<const fts_slot_t*>(
+ ib_vector_get_const(fts_slots, i));
+ if (!slot->table) {
+ continue;
+ }
+
+ const time_t end = slot->running
+ ? slot->last_run : slot->completed;
+ ulint interval = ulint(current_time - end);
+
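+		/* A table is due if the optimize interval has elapsed,
+		or if the interval is negative because the system clock
+		moved backwards. */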
+ if (lint(interval) < 0
+ || interval >= FTS_OPTIMIZE_INTERVAL_IN_SECS) {
+ ++n_tables;
+ }
+ }
+
+ return(n_tables);
+}
+
+/**********************************************************************//**
+Check if the total memory used by all FTS tables exceeds the maximum limit.
+@return true if a sync is needed, false otherwise */
+static bool fts_is_sync_needed()
+{
+ ulint total_memory = 0;
+ const time_t now = time(NULL);
+ double time_diff = difftime(now, last_check_sync_time);
+
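+	/* Rate-limit the check to once every 5 seconds, and do nothing
+	if a sync has already been requested. */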
+ if (fts_need_sync || (time_diff >= 0 && time_diff < 5)) {
+ return(false);
+ }
+
+ last_check_sync_time = now;
+
+ for (ulint i = 0; i < ib_vector_size(fts_slots); ++i) {
+ const fts_slot_t* slot = static_cast<const fts_slot_t*>(
+ ib_vector_get_const(fts_slots, i));
+
+ if (!slot->table) {
+ continue;
+ }
+
+ if (slot->table->fts && slot->table->fts->cache) {
+ total_memory += slot->table->fts->cache->total_size;
+ }
+
+ if (total_memory > fts_max_total_cache_size) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Sync fts cache of a table
+@param[in,out] table table to be synced
+@param[in] process_message processing messages from fts_optimize_wq */
+static void fts_optimize_sync_table(dict_table_t *table,
+ bool process_message= false)
+{
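+  /* Acquire a shared MDL on the table so that it cannot be dropped
+  while its cache is being synced. */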
+ MDL_ticket* mdl_ticket= nullptr;
+ dict_table_t *sync_table= dict_acquire_mdl_shared<true>(table, fts_opt_thd,
+ &mdl_ticket);
+
+ if (!sync_table)
+ return;
+
+ if (sync_table->fts && sync_table->fts->cache && sync_table->is_accessible())
+ {
+ fts_sync_table(sync_table, false);
+ if (process_message)
+ {
+ mutex_enter(&fts_optimize_wq->mutex);
+ sync_table->fts->sync_message = false;
+ mutex_exit(&fts_optimize_wq->mutex);
+ }
+ }
+
+ DBUG_EXECUTE_IF("ib_optimize_wq_hang", os_thread_sleep(6000000););
+
+ if (mdl_ticket)
+ dict_table_close(sync_table, false, false, fts_opt_thd, mdl_ticket);
+}
+
+/**********************************************************************//**
+Optimize all FTS tables and process pending messages from
+fts_optimize_wq. */
+static void fts_optimize_callback(void *)
+{
+ ut_ad(!srv_read_only_mode);
+
+ if (!fts_optimize_wq) {
+		/* Possibly a timer-initiated callback; it can arrive
+		after FTS_MSG_STOP. */
+ return;
+ }
+
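+	/* This state is static because the callback returns whenever
+	the queue drains and is re-entered by the next submitted task
+	or timer tick; the scan position in fts_slots must survive
+	in between. */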
+ static ulint current = 0;
+ static ibool done = FALSE;
+ static ulint n_tables = ib_vector_size(fts_slots);
+ static ulint n_optimize = 0;
+
+ while (!done && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
+ /* If there is no message in the queue and we have tables
+ to optimize then optimize the tables. */
+
+ if (!done
+ && ib_wqueue_is_empty(fts_optimize_wq)
+ && n_tables > 0
+ && n_optimize > 0) {
+ fts_slot_t* slot = static_cast<fts_slot_t*>(
+ ib_vector_get(fts_slots, current));
+
+ /* Handle the case of empty slots. */
+ if (slot->table) {
+ slot->running = true;
+ fts_optimize_table_bk(slot);
+ }
+
+ /* Wrap around the counter. */
+ if (++current >= ib_vector_size(fts_slots)) {
+ n_optimize = fts_optimize_how_many();
+ current = 0;
+ }
+
+ } else if (n_optimize == 0
+ || !ib_wqueue_is_empty(fts_optimize_wq)) {
+ fts_msg_t* msg = static_cast<fts_msg_t*>
+ (ib_wqueue_nowait(fts_optimize_wq));
+ /* Timeout ? */
+ if (msg == NULL) {
+ if (fts_is_sync_needed()) {
+ fts_need_sync = true;
+ }
+ if (n_tables)
+ timer->set_time(5000, 0);
+ return;
+ }
+
+ switch (msg->type) {
+ case FTS_MSG_STOP:
+ done = TRUE;
+ break;
+
+ case FTS_MSG_ADD_TABLE:
+ ut_a(!done);
+ if (fts_optimize_new_table(
+ static_cast<dict_table_t*>(
+ msg->ptr))) {
+ ++n_tables;
+ }
+ break;
+
+ case FTS_MSG_DEL_TABLE:
+ if (fts_optimize_del_table(
+ static_cast<fts_msg_del_t*>(
+ msg->ptr)->table)) {
+ --n_tables;
+ }
+
+ /* Signal the producer that we have
+ removed the table. */
+ os_event_set(
+ ((fts_msg_del_t*) msg->ptr)->event);
+ break;
+
+ case FTS_MSG_SYNC_TABLE:
+ DBUG_EXECUTE_IF(
+ "fts_instrument_msg_sync_sleep",
+ os_thread_sleep(300000););
+
+ fts_optimize_sync_table(
+ static_cast<dict_table_t*>(msg->ptr),
+ true);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ mem_heap_free(msg->heap);
+ n_optimize = done ? 0 : fts_optimize_how_many();
+ }
+ }
+
+	/* The server is being shut down; sync the data from the FTS
+	cache to disk if needed. */
+ if (n_tables > 0) {
+ for (ulint i = 0; i < ib_vector_size(fts_slots); i++) {
+ fts_slot_t* slot = static_cast<fts_slot_t*>(
+ ib_vector_get(fts_slots, i));
+
+ if (slot->table) {
+ fts_optimize_sync_table(slot->table);
+ }
+ }
+ }
+
+ ib_vector_free(fts_slots);
+ fts_slots = NULL;
+
+ ib_wqueue_free(fts_optimize_wq);
+ fts_optimize_wq = NULL;
+
+ innobase_destroy_background_thd(fts_opt_thd);
+ ib::info() << "FTS optimize thread exiting.";
+
+ os_event_set(fts_opt_shutdown_event);
+}
+
+/**********************************************************************//**
+Start up the optimize thread and create the work queue. */
+void
+fts_optimize_init(void)
+/*===================*/
+{
+ mem_heap_t* heap;
+ ib_alloc_t* heap_alloc;
+
+ ut_ad(!srv_read_only_mode);
+
+ /* For now we only support one optimize thread. */
+ ut_a(!fts_optimize_wq);
+
+ /* Create FTS optimize work queue */
+ fts_optimize_wq = ib_wqueue_create();
+ ut_a(fts_optimize_wq != NULL);
+ timer = srv_thread_pool->create_timer(timer_callback);
+
+ /* Create FTS vector to store fts_slot_t */
+ heap = mem_heap_create(sizeof(dict_table_t*) * 64);
+ heap_alloc = ib_heap_allocator_create(heap);
+ fts_slots = ib_vector_create(heap_alloc, sizeof(fts_slot_t), 4);
+
+ fts_opt_thd = innobase_create_background_thd("InnoDB FTS optimizer");
+	/* Add FTS tables to fts_slots that could have been skipped
+	during dict_load_table_one() because the fts_optimize thread
+	had not been started yet. */
+ mutex_enter(&dict_sys.mutex);
+ for (dict_table_t* table = UT_LIST_GET_FIRST(dict_sys.table_LRU);
+ table != NULL;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+ if (!table->fts || !dict_table_has_fts_index(table)) {
+ continue;
+ }
+
+ /* fts_optimize_thread is not started yet. So there is no
+ need to acquire fts_optimize_wq->mutex for adding the fts
+ table to the fts slots. */
+ ut_ad(!table->can_be_evicted);
+ fts_optimize_new_table(table);
+ table->fts->in_queue = true;
+ }
+ mutex_exit(&dict_sys.mutex);
+
+ fts_opt_shutdown_event = os_event_create(0);
+ last_check_sync_time = time(NULL);
+}
+
+/** Shutdown fts optimize thread. */
+void
+fts_optimize_shutdown()
+{
+ ut_ad(!srv_read_only_mode);
+
+ fts_msg_t* msg;
+
+	/* If there is ongoing activity on the dictionary, such as
+	srv_master_evict_from_table_cache(), wait for it. */
+ dict_mutex_enter_for_mysql();
+
+	/* Tell the FTS optimizer system that we are exiting from the
+	optimizer thread; messages sent after this will not be
+	processed. */
+ fts_opt_start_shutdown = true;
+ dict_mutex_exit_for_mysql();
+
+	/* We tell the OPTIMIZE thread to switch to state done; we
+	can't delete the work queue here because the add thread needs
+	to deregister the FTS tables. */
+ timer->disarm();
+ task_group.cancel_pending(&task);
+
+ msg = fts_optimize_create_msg(FTS_MSG_STOP, NULL);
+
+ add_msg(msg);
+
+ os_event_wait(fts_opt_shutdown_event);
+
+ os_event_destroy(fts_opt_shutdown_event);
+ fts_opt_thd = NULL;
+ delete timer;
+ timer = NULL;
+}
+
+/** Sync the table during commit phase
+@param[in] table table to be synced */
+void fts_sync_during_ddl(dict_table_t* table)
+{
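+  /* Only sync if the optimizer has a pending SYNC request for this
+  table, i.e. fts_optimize_request_sync_table() set the flag earlier;
+  clear the flag once the sync is done. */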
+ mutex_enter(&fts_optimize_wq->mutex);
+ if (!table->fts->sync_message)
+ {
+ mutex_exit(&fts_optimize_wq->mutex);
+ return;
+ }
+
+ mutex_exit(&fts_optimize_wq->mutex);
+ fts_sync_table(table, false);
+
+ mutex_enter(&fts_optimize_wq->mutex);
+ table->fts->sync_message = false;
+ mutex_exit(&fts_optimize_wq->mutex);
+}
diff --git a/storage/innobase/fts/fts0pars.cc b/storage/innobase/fts/fts0pars.cc
new file mode 100644
index 00000000..56cc8d60
--- /dev/null
+++ b/storage/innobase/fts/fts0pars.cc
@@ -0,0 +1,2007 @@
+/* A Bison parser, made by GNU Bison 2.5. */
+
+/* Bison implementation for Yacc-like parsers in C
+
+ Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* As a special exception, you may create a larger work that contains
+ part or all of the Bison parser skeleton and distribute that work
+ under terms of your choice, so long as that work isn't itself a
+ parser generator using the skeleton or a modified version thereof
+ as a parser skeleton. Alternatively, if you modify or redistribute
+ the parser skeleton itself, you may (at your option) remove this
+ special exception, which will cause the skeleton and the resulting
+ Bison output files to be licensed under the GNU General Public
+ License without this special exception.
+
+ This special exception was added by the Free Software Foundation in
+ version 2.2 of Bison. */
+
+/* C LALR(1) parser skeleton written by Richard Stallman, by
+ simplifying the original so-called "semantic" parser. */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+ infringing on user name space. This should be done even for local
+ variables, as they might otherwise be expanded by user macros.
+ There are some unavoidable exceptions within include files to
+ define necessary library symbols; they are noted "INFRINGES ON
+ USER NAME SPACE" below. */
+
+/* Identify Bison output. */
+#define YYBISON 1
+
+/* Bison version. */
+#define YYBISON_VERSION "2.5"
+
+/* Skeleton name. */
+#define YYSKELETON_NAME "yacc.c"
+
+/* Pure parsers. */
+#define YYPURE 1
+
+/* Push parsers. */
+#define YYPUSH 0
+
+/* Pull parsers. */
+#define YYPULL 1
+
+/* Using locations. */
+#define YYLSP_NEEDED 0
+
+/* Substitute the variable and function names. */
+#define yyparse ftsparse
+#define yylex ftslex
+#define yyerror ftserror
+#define yylval ftslval
+#define yychar ftschar
+#define yydebug ftsdebug
+#define yynerrs ftsnerrs
+
+
+/* Copy the first part of user declarations. */
+
+/* Line 268 of yacc.c */
+#line 26 "fts0pars.y"
+
+#include "ha_prototypes.h"
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0blex.h"
+#include "fts0tlex.h"
+#include "fts0pars.h"
+#include <my_sys.h>
+
+extern int fts_lexer(YYSTYPE*, fts_lexer_t*);
+extern int fts_blexer(YYSTYPE*, yyscan_t);
+extern int fts_tlexer(YYSTYPE*, yyscan_t);
+
+
+
+extern int ftserror(const char* p);
+
+/* Required for reentrant parser */
+#define ftslex fts_lexer
+
+#define YYERROR_VERBOSE
+
+/* For passing an argument to yyparse() */
+#define YYPARSE_PARAM state
+#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer
+
+#define YYTOKENFREE(token) fts_ast_string_free((token))
+
+
+typedef int (*fts_scanner)(YYSTYPE* val, yyscan_t yyscanner);
+
+struct fts_lexer_t {
+ fts_scanner scanner;
+ void* yyscanner;
+};
+
+
+
+/* Line 268 of yacc.c */
+#line 115 "fts0pars.cc"
+
+/* Enabling traces. */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+
+/* Enabling verbose error messages. */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+/* Enabling the token table. */
+#ifndef YYTOKEN_TABLE
+# define YYTOKEN_TABLE 0
+#endif
+
+
+/* Tokens. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+ /* Put the tokens into the symbol table, so that GDB and other debuggers
+ know about them. */
+ enum yytokentype {
+ FTS_OPER = 258,
+ FTS_TEXT = 259,
+ FTS_TERM = 260,
+ FTS_NUMB = 261
+ };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 293 of yacc.c */
+#line 61 "fts0pars.y"
+
+ int oper;
+ fts_ast_string_t* token;
+ fts_ast_node_t* node;
+
+
+
+/* Line 293 of yacc.c */
+#line 165 "fts0pars.cc"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+/* Copy the second part of user declarations. */
+
+
+/* Line 343 of yacc.c */
+#line 177 "fts0pars.cc"
+
+#ifdef short
+# undef short
+#endif
+
+#ifdef YYTYPE_UINT8
+typedef YYTYPE_UINT8 yytype_uint8;
+#else
+typedef unsigned char yytype_uint8;
+#endif
+
+#ifdef YYTYPE_INT8
+typedef YYTYPE_INT8 yytype_int8;
+#elif (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+typedef signed char yytype_int8;
+#else
+typedef short int yytype_int8;
+#endif
+
+#ifdef YYTYPE_UINT16
+typedef YYTYPE_UINT16 yytype_uint16;
+#else
+typedef unsigned short int yytype_uint16;
+#endif
+
+#ifdef YYTYPE_INT16
+typedef YYTYPE_INT16 yytype_int16;
+#else
+typedef short int yytype_int16;
+#endif
+
+#ifndef YYSIZE_T
+# ifdef __SIZE_TYPE__
+# define YYSIZE_T __SIZE_TYPE__
+# elif defined size_t
+# define YYSIZE_T size_t
+# elif ! defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+# include <stddef.h> /* INFRINGES ON USER NAME SPACE */
+# define YYSIZE_T size_t
+# else
+# define YYSIZE_T unsigned int
+# endif
+#endif
+
+#define YYSIZE_MAXIMUM ((YYSIZE_T) -1)
+
+#ifndef YY_
+# if defined YYENABLE_NLS && YYENABLE_NLS
+# if ENABLE_NLS
+# include <libintl.h> /* INFRINGES ON USER NAME SPACE */
+# define YY_(msgid) dgettext ("bison-runtime", msgid)
+# endif
+# endif
+# ifndef YY_
+# define YY_(msgid) msgid
+# endif
+#endif
+
+/* Suppress unused-variable warnings by "using" E. */
+#if ! defined lint || defined __GNUC__
+# define YYUSE(e) ((void) (e))
+#else
+# define YYUSE(e) /* empty */
+#endif
+
+/* Identity function, used to suppress warnings about constant conditions. */
+#ifndef lint
+# define YYID(n) (n)
+#else
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static int
+YYID (int yyi)
+#else
+static int
+YYID (yyi)
+ int yyi;
+#endif
+{
+ return yyi;
+}
+#endif
+
+#if ! defined yyoverflow || YYERROR_VERBOSE
+
+/* The parser invokes alloca or malloc; define the necessary symbols. */
+
+# ifdef YYSTACK_USE_ALLOCA
+# if YYSTACK_USE_ALLOCA
+# ifdef __GNUC__
+# define YYSTACK_ALLOC __builtin_alloca
+# elif defined __BUILTIN_VA_ARG_INCR
+# include <alloca.h> /* INFRINGES ON USER NAME SPACE */
+# elif defined _MSC_VER
+# include <malloc.h> /* INFRINGES ON USER NAME SPACE */
+# define alloca _alloca
+# else
+# define YYSTACK_ALLOC alloca
+# if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+# ifndef EXIT_SUCCESS
+# define EXIT_SUCCESS 0
+# endif
+# endif
+# endif
+# endif
+# endif
+
+# ifdef YYSTACK_ALLOC
+ /* Pacify GCC's `empty if-body' warning. */
+# define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0))
+# ifndef YYSTACK_ALLOC_MAXIMUM
+ /* The OS might guarantee only one guard page at the bottom of the stack,
+ and a page size can be as small as 4096 bytes. So we cannot safely
+ invoke alloca (N) if N exceeds 4096. Use a slightly smaller number
+ to allow for a few compiler-allocated temporary stack slots. */
+# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */
+# endif
+# else
+# define YYSTACK_ALLOC YYMALLOC
+# define YYSTACK_FREE YYFREE
+# ifndef YYSTACK_ALLOC_MAXIMUM
+# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM
+# endif
+# if (defined __cplusplus && ! defined EXIT_SUCCESS \
+ && ! ((defined YYMALLOC || defined malloc) \
+ && (defined YYFREE || defined free)))
+# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+# ifndef EXIT_SUCCESS
+# define EXIT_SUCCESS 0
+# endif
+# endif
+# ifndef YYMALLOC
+# define YYMALLOC malloc
+# if ! defined malloc && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */
+# endif
+# endif
+# ifndef YYFREE
+# define YYFREE free
+# if ! defined free && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+void free (void *); /* INFRINGES ON USER NAME SPACE */
+# endif
+# endif
+# endif
+#endif /* ! defined yyoverflow || YYERROR_VERBOSE */
+
+
+#if (! defined yyoverflow \
+ && (! defined __cplusplus \
+ || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL)))
+
+/* A type that is properly aligned for any stack member. */
+union yyalloc
+{
+ yytype_int16 yyss_alloc;
+ YYSTYPE yyvs_alloc;
+};
+
+/* The size of the maximum gap between one aligned stack and the next. */
+# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
+
+/* The size of an array large enough to hold all stacks, each with
+   N elements.  */
+# define YYSTACK_BYTES(N) \
+ ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \
+ + YYSTACK_GAP_MAXIMUM)
+
+# define YYCOPY_NEEDED 1
+
+/* Relocate STACK from its old location to the new one. The
+ local variables YYSIZE and YYSTACKSIZE give the old and new number of
+ elements in the stack, and YYPTR gives the new location of the
+ stack. Advance YYPTR to a properly aligned location for the next
+ stack. */
+# define YYSTACK_RELOCATE(Stack_alloc, Stack) \
+ do \
+ { \
+ YYSIZE_T yynewbytes; \
+ YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \
+ Stack = &yyptr->Stack_alloc; \
+ yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
+ yyptr += yynewbytes / sizeof (*yyptr); \
+ } \
+ while (YYID (0))
+
+#endif
+
+#if defined YYCOPY_NEEDED && YYCOPY_NEEDED
+/* Copy COUNT objects from FROM to TO. The source and destination do
+ not overlap. */
+# ifndef YYCOPY
+# if defined __GNUC__ && 1 < __GNUC__
+# define YYCOPY(To, From, Count) \
+ __builtin_memcpy (To, From, (Count) * sizeof (*(From)))
+# else
+# define YYCOPY(To, From, Count) \
+ do \
+ { \
+ YYSIZE_T yyi; \
+ for (yyi = 0; yyi < (Count); yyi++) \
+ (To)[yyi] = (From)[yyi]; \
+ } \
+ while (YYID (0))
+# endif
+# endif
+#endif /* !YYCOPY_NEEDED */
+
+/* YYFINAL -- State number of the termination state. */
+#define YYFINAL 3
+/* YYLAST -- Last index in YYTABLE. */
+#define YYLAST 52
+
+/* YYNTOKENS -- Number of terminals. */
+#define YYNTOKENS 16
+/* YYNNTS -- Number of nonterminals. */
+#define YYNNTS 8
+/* YYNRULES -- Number of rules. */
+#define YYNRULES 24
+/* YYNRULES -- Number of states. */
+#define YYNSTATES 33
+
+/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */
+#define YYUNDEFTOK 2
+#define YYMAXUTOK 261
+
+#define YYTRANSLATE(YYX) \
+ ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+
+/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. */
+static const yytype_uint8 yytranslate[] =
+{
+ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 12, 13, 14, 7, 2, 8, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 10, 2, 11, 2, 15, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 9, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 1, 2, 3, 4,
+ 5, 6
+};
+
+#if YYDEBUG
+/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in
+ YYRHS. */
+static const yytype_uint8 yyprhs[] =
+{
+ 0, 0, 3, 5, 6, 9, 12, 16, 21, 23,
+ 25, 28, 32, 36, 39, 44, 47, 49, 51, 53,
+ 55, 57, 59, 61, 64
+};
+
+/* YYRHS -- A `-1'-separated list of the rules' RHS. */
+static const yytype_int8 yyrhs[] =
+{
+ 17, 0, -1, 18, -1, -1, 18, 20, -1, 18,
+ 19, -1, 12, 18, 13, -1, 21, 12, 18, 13,
+ -1, 22, -1, 23, -1, 22, 14, -1, 23, 15,
+ 6, -1, 21, 22, 14, -1, 21, 22, -1, 21,
+ 23, 15, 6, -1, 21, 23, -1, 8, -1, 7,
+ -1, 9, -1, 10, -1, 11, -1, 5, -1, 6,
+ -1, 14, 22, -1, 4, -1
+};
+
+/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
+static const yytype_uint8 yyrline[] =
+{
+ 0, 79, 79, 85, 89, 99, 111, 119, 129, 133,
+ 137, 141, 146, 152, 157, 164, 170, 174, 178, 182,
+ 186, 191, 196, 202, 207
+};
+#endif
+
+#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE
+/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
+ First, the terminals, then, starting at YYNTOKENS, nonterminals. */
+static const char *const yytname[] =
+{
+ "$end", "error", "$undefined", "FTS_OPER", "FTS_TEXT", "FTS_TERM",
+ "FTS_NUMB", "'+'", "'-'", "'~'", "'<'", "'>'", "'('", "')'", "'*'",
+ "'@'", "$accept", "query", "expr_lst", "sub_expr", "expr", "prefix",
+ "term", "text", 0
+};
+#endif
+
+# ifdef YYPRINT
+/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to
+ token YYLEX-NUM. */
+static const yytype_uint16 yytoknum[] =
+{
+ 0, 256, 257, 258, 259, 260, 261, 43, 45, 126,
+ 60, 62, 40, 41, 42, 64
+};
+# endif
+
+/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */
+static const yytype_uint8 yyr1[] =
+{
+ 0, 16, 17, 18, 18, 18, 19, 19, 20, 20,
+ 20, 20, 20, 20, 20, 20, 21, 21, 21, 21,
+ 21, 22, 22, 22, 23
+};
+
+/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. */
+static const yytype_uint8 yyr2[] =
+{
+ 0, 2, 1, 0, 2, 2, 3, 4, 1, 1,
+ 2, 3, 3, 2, 4, 2, 1, 1, 1, 1,
+ 1, 1, 1, 2, 1
+};
+
+/* YYDEFACT[STATE-NAME] -- Default reduction number in state STATE-NUM.
+ Performed when YYTABLE doesn't specify something else to do. Zero
+ means the default is an error. */
+static const yytype_uint8 yydefact[] =
+{
+ 3, 0, 2, 1, 24, 21, 22, 17, 16, 18,
+ 19, 20, 3, 0, 5, 4, 0, 8, 9, 0,
+ 23, 3, 13, 15, 10, 0, 6, 0, 12, 0,
+ 11, 7, 14
+};
+
+/* YYDEFGOTO[NTERM-NUM]. */
+static const yytype_int8 yydefgoto[] =
+{
+ -1, 1, 2, 14, 15, 16, 17, 18
+};
+
+/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
+ STATE-NUM. */
+#define YYPACT_NINF -5
+static const yytype_int8 yypact[] =
+{
+ -5, 38, 18, -5, -5, -5, -5, -5, -5, -5,
+ -5, -5, -5, 31, -5, -5, 29, 30, 32, -4,
+ -5, -5, 34, 35, -5, 40, -5, 7, -5, 43,
+ -5, -5, -5
+};
+
+/* YYPGOTO[NTERM-NUM]. */
+static const yytype_int8 yypgoto[] =
+{
+ -5, -5, 19, -5, -5, -5, 26, 36
+};
+
+/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If
+   positive, shift that token.  If negative, reduce the rule whose
+   number is the opposite.  If YYTABLE_NINF, syntax error.  */
+#define YYTABLE_NINF -1
+static const yytype_uint8 yytable[] =
+{
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 26,
+ 13, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 31, 13, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 19, 13, 4, 5, 6, 5, 6, 3, 20,
+ 27, 21, 22, 13, 24, 13, 30, 25, 28, 32,
+ 29, 0, 23
+};
+
+#define yypact_value_is_default(yystate) \
+ ((yystate) == (-5))
+
+#define yytable_value_is_error(yytable_value) \
+ YYID (0)
+
+static const yytype_int8 yycheck[] =
+{
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 12, 14, 4, 5, 6, 5, 6, 0, 13,
+ 21, 12, 16, 14, 14, 14, 6, 15, 14, 6,
+ 15, -1, 16
+};
+
+/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
+ symbol of state STATE-NUM. */
+static const yytype_uint8 yystos[] =
+{
+ 0, 17, 18, 0, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 14, 19, 20, 21, 22, 23, 18,
+ 22, 12, 22, 23, 14, 15, 13, 18, 14, 15,
+ 6, 13, 6
+};
+
+#define yyerrok (yyerrstatus = 0)
+#define yyclearin (yychar = YYEMPTY)
+#define YYEMPTY (-2)
+#define YYEOF 0
+
+#define YYACCEPT goto yyacceptlab
+#define YYABORT goto yyabortlab
+#define YYERROR goto yyerrorlab
+
+
+/* Like YYERROR except do call yyerror. This remains here temporarily
+ to ease the transition to the new meaning of YYERROR, for GCC.
+ Once GCC version 2 has supplanted version 1, this can go. However,
+ YYFAIL appears to be in use. Nevertheless, it is formally deprecated
+ in Bison 2.4.2's NEWS entry, where a plan to phase it out is
+ discussed. */
+
+#define YYFAIL goto yyerrlab
+#if defined YYFAIL
+ /* This is here to suppress warnings from the GCC cpp's
+ -Wunused-macros. Normally we don't worry about that warning, but
+ some users do, and we want to make it easy for users to remove
+ YYFAIL uses, which will produce warnings from Bison 2.5. */
+#endif
+
+#define YYRECOVERING() (!!yyerrstatus)
+
+#define YYBACKUP(Token, Value) \
+do \
+ if (yychar == YYEMPTY && yylen == 1) \
+ { \
+ yychar = (Token); \
+ yylval = (Value); \
+ YYPOPSTACK (1); \
+ goto yybackup; \
+ } \
+ else \
+ { \
+ yyerror (YY_("syntax error: cannot back up")); \
+ YYERROR; \
+ } \
+while (YYID (0))
+
+
+#define YYTERROR 1
+#define YYERRCODE 256
+
+#define YYERRCLEANUP \
+do \
+ switch (yylastchar) \
+ { \
+ case FTS_NUMB: \
+ case FTS_TEXT: \
+ case FTS_TERM: \
+ YYTOKENFREE(yylval.token); \
+ break; \
+ default: \
+ break; \
+ } \
+while (YYID (0))
+
+/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N].
+ If N is 0, then set CURRENT to the empty location which ends
+ the previous symbol: RHS[0] (always defined). */
+
+#define YYRHSLOC(Rhs, K) ((Rhs)[K])
+#ifndef YYLLOC_DEFAULT
+# define YYLLOC_DEFAULT(Current, Rhs, N) \
+ do \
+ if (YYID (N)) \
+ { \
+ (Current).first_line = YYRHSLOC (Rhs, 1).first_line; \
+ (Current).first_column = YYRHSLOC (Rhs, 1).first_column; \
+ (Current).last_line = YYRHSLOC (Rhs, N).last_line; \
+ (Current).last_column = YYRHSLOC (Rhs, N).last_column; \
+ } \
+ else \
+ { \
+ (Current).first_line = (Current).last_line = \
+ YYRHSLOC (Rhs, 0).last_line; \
+ (Current).first_column = (Current).last_column = \
+ YYRHSLOC (Rhs, 0).last_column; \
+ } \
+ while (YYID (0))
+#endif
+
+
+/* This macro is provided for backward compatibility. */
+
+#ifndef YY_LOCATION_PRINT
+# define YY_LOCATION_PRINT(File, Loc) ((void) 0)
+#endif
+
+
+/* YYLEX -- calling `yylex' with the right arguments. */
+
+#ifdef YYLEX_PARAM
+# define YYLEX yylex (&yylval, YYLEX_PARAM)
+#else
+# define YYLEX yylex (&yylval)
+#endif
+
+/* Enable debugging if requested. */
+#if YYDEBUG
+
+# ifndef YYFPRINTF
+# include <stdio.h> /* INFRINGES ON USER NAME SPACE */
+# define YYFPRINTF fprintf
+# endif
+
+# define YYDPRINTF(Args) \
+do { \
+ if (yydebug) \
+ YYFPRINTF Args; \
+} while (YYID (0))
+
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \
+do { \
+ if (yydebug) \
+ { \
+ YYFPRINTF (stderr, "%s ", Title); \
+ yy_symbol_print (stderr, \
+ Type, Value); \
+ YYFPRINTF (stderr, "\n"); \
+ } \
+} while (YYID (0))
+
+
+/*--------------------------------.
+| Print this symbol on YYOUTPUT. |
+`--------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_value_print (yyoutput, yytype, yyvaluep)
+ FILE *yyoutput;
+ int yytype;
+ YYSTYPE const * const yyvaluep;
+#endif
+{
+ if (!yyvaluep)
+ return;
+# ifdef YYPRINT
+ if (yytype < YYNTOKENS)
+ YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep);
+# else
+ YYUSE (yyoutput);
+# endif
+ switch (yytype)
+ {
+ default:
+ break;
+ }
+}
+
+
+/*--------------------------------.
+| Print this symbol on YYOUTPUT. |
+`--------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_print (yyoutput, yytype, yyvaluep)
+ FILE *yyoutput;
+ int yytype;
+ YYSTYPE const * const yyvaluep;
+#endif
+{
+ if (yytype < YYNTOKENS)
+ YYFPRINTF (yyoutput, "token %s (", yytname[yytype]);
+ else
+ YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]);
+
+ yy_symbol_value_print (yyoutput, yytype, yyvaluep);
+ YYFPRINTF (yyoutput, ")");
+}
+
+/*------------------------------------------------------------------.
+| yy_stack_print -- Print the state stack from its BOTTOM up to its |
+| TOP (included). |
+`------------------------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop)
+#else
+static void
+yy_stack_print (yybottom, yytop)
+ yytype_int16 *yybottom;
+ yytype_int16 *yytop;
+#endif
+{
+ YYFPRINTF (stderr, "Stack now");
+ for (; yybottom <= yytop; yybottom++)
+ {
+ int yybot = *yybottom;
+ YYFPRINTF (stderr, " %d", yybot);
+ }
+ YYFPRINTF (stderr, "\n");
+}
+
+# define YY_STACK_PRINT(Bottom, Top) \
+do { \
+ if (yydebug) \
+ yy_stack_print ((Bottom), (Top)); \
+} while (YYID (0))
+
+
+/*------------------------------------------------.
+| Report that the YYRULE is going to be reduced. |
+`------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yy_reduce_print (YYSTYPE *yyvsp, int yyrule)
+#else
+static void
+yy_reduce_print (yyvsp, yyrule)
+ YYSTYPE *yyvsp;
+ int yyrule;
+#endif
+{
+ int yynrhs = yyr2[yyrule];
+ int yyi;
+ unsigned long int yylno = yyrline[yyrule];
+ YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n",
+ yyrule - 1, yylno);
+ /* The symbols being reduced. */
+ for (yyi = 0; yyi < yynrhs; yyi++)
+ {
+ YYFPRINTF (stderr, " $%d = ", yyi + 1);
+ yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi],
+ &(yyvsp[(yyi + 1) - (yynrhs)])
+ );
+ YYFPRINTF (stderr, "\n");
+ }
+}
+
+# define YY_REDUCE_PRINT(Rule) \
+do { \
+ if (yydebug) \
+ yy_reduce_print (yyvsp, Rule); \
+} while (YYID (0))
+
+/* Nonzero means print parse trace. It is left uninitialized so that
+ multiple parsers can coexist. */
+int yydebug;
+#else /* !YYDEBUG */
+# define YYDPRINTF(Args)
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
+# define YY_STACK_PRINT(Bottom, Top)
+# define YY_REDUCE_PRINT(Rule)
+#endif /* !YYDEBUG */
+
+
+/* YYINITDEPTH -- initial size of the parser's stacks. */
+#ifndef YYINITDEPTH
+# define YYINITDEPTH 200
+#endif
+
+/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
+ if the built-in stack extension method is used).
+
+ Do not make this value too large; the results are undefined if
+ YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH)
+ evaluated with infinite-precision integer arithmetic. */
+
+#ifndef YYMAXDEPTH
+# define YYMAXDEPTH 10000
+#endif
+
+
+#if YYERROR_VERBOSE
+
+# ifndef yystrlen
+# if defined __GLIBC__ && defined _STRING_H
+# define yystrlen strlen
+# else
+/* Return the length of YYSTR. */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static YYSIZE_T
+yystrlen (const char *yystr)
+#else
+static YYSIZE_T
+yystrlen (yystr)
+ const char *yystr;
+#endif
+{
+ YYSIZE_T yylen;
+ for (yylen = 0; yystr[yylen]; yylen++)
+ continue;
+ return yylen;
+}
+# endif
+# endif
+
+# ifndef yystpcpy
+# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE
+# define yystpcpy stpcpy
+# else
+/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
+ YYDEST. */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static char *
+yystpcpy (char *yydest, const char *yysrc)
+#else
+static char *
+yystpcpy (yydest, yysrc)
+ char *yydest;
+ const char *yysrc;
+#endif
+{
+ char *yyd = yydest;
+ const char *yys = yysrc;
+
+ while ((*yyd++ = *yys++) != '\0')
+ continue;
+
+ return yyd - 1;
+}
+# endif
+# endif
+
+# ifndef yytnamerr
+/* Copy to YYRES the contents of YYSTR after stripping away unnecessary
+ quotes and backslashes, so that it's suitable for yyerror. The
+ heuristic is that double-quoting is unnecessary unless the string
+ contains an apostrophe, a comma, or backslash (other than
+ backslash-backslash). YYSTR is taken from yytname. If YYRES is
+ null, do not copy; instead, return the length of what the result
+ would have been. */
+static YYSIZE_T
+yytnamerr (char *yyres, const char *yystr)
+{
+ if (*yystr == '"')
+ {
+ YYSIZE_T yyn = 0;
+ char const *yyp = yystr;
+
+ for (;;)
+ switch (*++yyp)
+ {
+ case '\'':
+ case ',':
+ goto do_not_strip_quotes;
+
+ case '\\':
+ if (*++yyp != '\\')
+ goto do_not_strip_quotes;
+ /* Fall through. */
+ default:
+ if (yyres)
+ yyres[yyn] = *yyp;
+ yyn++;
+ break;
+
+ case '"':
+ if (yyres)
+ yyres[yyn] = '\0';
+ return yyn;
+ }
+ do_not_strip_quotes: ;
+ }
+
+ if (! yyres)
+ return yystrlen (yystr);
+
+ return yystpcpy (yyres, yystr) - yyres;
+}
+# endif
+
+/* Copy into *YYMSG, which is of size *YYMSG_ALLOC, an error message
+ about the unexpected token YYTOKEN for the state stack whose top is
+ YYSSP.
+
+ Return 0 if *YYMSG was successfully written. Return 1 if *YYMSG is
+ not large enough to hold the message. In that case, also set
+ *YYMSG_ALLOC to the required number of bytes. Return 2 if the
+ required number of bytes is too large to store. */
+static int
+yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg,
+ yytype_int16 *yyssp, int yytoken)
+{
+ YYSIZE_T yysize0 = yytnamerr (0, yytname[yytoken]);
+ YYSIZE_T yysize = yysize0;
+ YYSIZE_T yysize1;
+ enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 };
+ /* Internationalized format string. */
+ const char *yyformat = 0;
+ /* Arguments of yyformat. */
+ char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM];
+ /* Number of reported tokens (one for the "unexpected", one per
+ "expected"). */
+ int yycount = 0;
+
+ /* There are many possibilities here to consider:
+ - Assume YYFAIL is not used. It's too flawed to consider. See
+ <http://lists.gnu.org/archive/html/bison-patches/2009-12/msg00024.html>
+ for details. YYERROR is fine as it does not invoke this
+ function.
+ - If this state is a consistent state with a default action, then
+ the only way this function was invoked is if the default action
+ is an error action. In that case, don't check for expected
+ tokens because there are none.
+ - The only way there can be no lookahead present (in yychar) is if
+ this state is a consistent state with a default action. Thus,
+ detecting the absence of a lookahead is sufficient to determine
+ that there is no unexpected or expected token to report. In that
+ case, just report a simple "syntax error".
+ - Don't assume there isn't a lookahead just because this state is a
+ consistent state with a default action. There might have been a
+ previous inconsistent state, consistent state with a non-default
+ action, or user semantic action that manipulated yychar.
+ - Of course, the expected token list depends on states to have
+ correct lookahead information, and it depends on the parser not
+ to perform extra reductions after fetching a lookahead from the
+ scanner and before detecting a syntax error. Thus, state merging
+ (from LALR or IELR) and default reductions corrupt the expected
+ token list. However, the list is correct for canonical LR with
+ one exception: it will still contain any token that will not be
+ accepted due to an error action in a later state.
+ */
+ if (yytoken != YYEMPTY)
+ {
+ int yyn = yypact[*yyssp];
+ yyarg[yycount++] = yytname[yytoken];
+ if (!yypact_value_is_default (yyn))
+ {
+ /* Start YYX at -YYN if negative to avoid negative indexes in
+ YYCHECK. In other words, skip the first -YYN actions for
+ this state because they are default actions. */
+ int yyxbegin = yyn < 0 ? -yyn : 0;
+ /* Stay within bounds of both yycheck and yytname. */
+ int yychecklim = YYLAST - yyn + 1;
+ int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
+ int yyx;
+
+ for (yyx = yyxbegin; yyx < yyxend; ++yyx)
+ if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR
+ && !yytable_value_is_error (yytable[yyx + yyn]))
+ {
+ if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM)
+ {
+ yycount = 1;
+ yysize = yysize0;
+ break;
+ }
+ yyarg[yycount++] = yytname[yyx];
+ yysize1 = yysize + yytnamerr (0, yytname[yyx]);
+ if (! (yysize <= yysize1
+ && yysize1 <= YYSTACK_ALLOC_MAXIMUM))
+ return 2;
+ yysize = yysize1;
+ }
+ }
+ }
+
+ switch (yycount)
+ {
+# define YYCASE_(N, S) \
+ case N: \
+ yyformat = S; \
+ break
+ YYCASE_(0, YY_("syntax error"));
+ YYCASE_(1, YY_("syntax error, unexpected %s"));
+ YYCASE_(2, YY_("syntax error, unexpected %s, expecting %s"));
+ YYCASE_(3, YY_("syntax error, unexpected %s, expecting %s or %s"));
+ YYCASE_(4, YY_("syntax error, unexpected %s, expecting %s or %s or %s"));
+ YYCASE_(5, YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s"));
+# undef YYCASE_
+ }
+
+ yysize1 = yysize + yystrlen (yyformat);
+ if (! (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM))
+ return 2;
+ yysize = yysize1;
+
+ if (*yymsg_alloc < yysize)
+ {
+ *yymsg_alloc = 2 * yysize;
+ if (! (yysize <= *yymsg_alloc
+ && *yymsg_alloc <= YYSTACK_ALLOC_MAXIMUM))
+ *yymsg_alloc = YYSTACK_ALLOC_MAXIMUM;
+ return 1;
+ }
+
+ /* Avoid sprintf, as that infringes on the user's name space.
+ Don't have undefined behavior even if the translation
+ produced a string with the wrong number of "%s"s. */
+ {
+ char *yyp = *yymsg;
+ int yyi = 0;
+ while ((*yyp = *yyformat) != '\0')
+ if (*yyp == '%' && yyformat[1] == 's' && yyi < yycount)
+ {
+ yyp += yytnamerr (yyp, yyarg[yyi++]);
+ yyformat += 2;
+ }
+ else
+ {
+ yyp++;
+ yyformat++;
+ }
+ }
+ return 0;
+}
+#endif /* YYERROR_VERBOSE */
+
+/*-----------------------------------------------.
+| Release the memory associated to this symbol. |
+`-----------------------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep)
+#else
+static void
+yydestruct (yymsg, yytype, yyvaluep)
+ const char *yymsg;
+ int yytype;
+ YYSTYPE *yyvaluep;
+#endif
+{
+ YYUSE (yyvaluep);
+
+ if (!yymsg)
+ yymsg = "Deleting";
+ YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
+
+ switch (yytype)
+ {
+
+ default:
+ break;
+ }
+}
+
+
+/* Prevent warnings from -Wmissing-prototypes. */
+#ifdef YYPARSE_PARAM
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void *YYPARSE_PARAM);
+#else
+int yyparse ();
+#endif
+#else /* ! YYPARSE_PARAM */
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void);
+#else
+int yyparse ();
+#endif
+#endif /* ! YYPARSE_PARAM */
+
+
+/*----------.
+| yyparse. |
+`----------*/
+
+#ifdef YYPARSE_PARAM
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void *YYPARSE_PARAM)
+#else
+int
+yyparse (YYPARSE_PARAM)
+ void *YYPARSE_PARAM;
+#endif
+#else /* ! YYPARSE_PARAM */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void)
+#else
+int
+yyparse ()
+
+#endif
+#endif
+{
+/* The lookahead symbol. */
+int yychar;
+/* The backup of yychar when there is an error and we're in yyerrlab. */
+int yylastchar;
+
+/* The semantic value of the lookahead symbol. */
+YYSTYPE yylval;
+
+ /* Number of syntax errors so far. */
+ int yynerrs;
+
+ int yystate;
+ /* Number of tokens to shift before error messages enabled. */
+ int yyerrstatus;
+
+ /* The stacks and their tools:
+ `yyss': related to states.
+ `yyvs': related to semantic values.
+
+ Refer to the stacks thru separate pointers, to allow yyoverflow
+ to reallocate them elsewhere. */
+
+ /* The state stack. */
+ yytype_int16 yyssa[YYINITDEPTH];
+ yytype_int16 *yyss;
+ yytype_int16 *yyssp;
+
+ /* The semantic value stack. */
+ YYSTYPE yyvsa[YYINITDEPTH];
+ YYSTYPE *yyvs;
+ YYSTYPE *yyvsp;
+
+ YYSIZE_T yystacksize;
+
+ int yyn;
+ int yyresult;
+ /* Lookahead token as an internal (translated) token number. */
+ int yytoken;
+ /* The variables used to return semantic value and location from the
+ action routines. */
+ YYSTYPE yyval;
+
+#if YYERROR_VERBOSE
+ /* Buffer for error messages, and its allocated size. */
+ char yymsgbuf[128];
+ char *yymsg = yymsgbuf;
+ YYSIZE_T yymsg_alloc = sizeof yymsgbuf;
+#endif
+
+#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N))
+
+ /* The number of symbols on the RHS of the reduced rule.
+ Keep to zero when no symbol should be popped. */
+ int yylen = 0;
+
+ yytoken = 0;
+ yyss = yyssa;
+ yyvs = yyvsa;
+ yystacksize = YYINITDEPTH;
+
+ YYDPRINTF ((stderr, "Starting parse\n"));
+
+ yystate = 0;
+ yyerrstatus = 0;
+ yynerrs = 0;
+ yychar = YYEMPTY; /* Cause a token to be read. */
+
+ /* Initialize stack pointers.
+ Waste one element of value and location stack
+ so that they stay on the same level as the state stack.
+ The wasted elements are never initialized. */
+ yyssp = yyss;
+ yyvsp = yyvs;
+
+ goto yysetstate;
+
+/*------------------------------------------------------------.
+| yynewstate -- Push a new state, which is found in yystate. |
+`------------------------------------------------------------*/
+ yynewstate:
+ /* In all cases, when you get here, the value and location stacks
+ have just been pushed. So pushing a state here evens the stacks. */
+ yyssp++;
+
+ yysetstate:
+ *yyssp = yystate;
+
+ if (yyss + yystacksize - 1 <= yyssp)
+ {
+ /* Get the current used size of the three stacks, in elements. */
+ YYSIZE_T yysize = yyssp - yyss + 1;
+
+#ifdef yyoverflow
+ {
+ /* Give user a chance to reallocate the stack. Use copies of
+ these so that the &'s don't force the real ones into
+ memory. */
+ YYSTYPE *yyvs1 = yyvs;
+ yytype_int16 *yyss1 = yyss;
+
+ /* Each stack pointer address is followed by the size of the
+ data in use in that stack, in bytes. This used to be a
+ conditional around just the two extra args, but that might
+ be undefined if yyoverflow is a macro. */
+ yyoverflow (YY_("memory exhausted"),
+ &yyss1, yysize * sizeof (*yyssp),
+ &yyvs1, yysize * sizeof (*yyvsp),
+ &yystacksize);
+
+ yyss = yyss1;
+ yyvs = yyvs1;
+ }
+#else /* no yyoverflow */
+# ifndef YYSTACK_RELOCATE
+ goto yyexhaustedlab;
+# else
+ /* Extend the stack our own way. */
+ if (YYMAXDEPTH <= yystacksize)
+ goto yyexhaustedlab;
+ yystacksize *= 2;
+ if (YYMAXDEPTH < yystacksize)
+ yystacksize = YYMAXDEPTH;
+
+ {
+ yytype_int16 *yyss1 = yyss;
+ union yyalloc *yyptr =
+ (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
+ if (! yyptr)
+ goto yyexhaustedlab;
+ YYSTACK_RELOCATE (yyss_alloc, yyss);
+ YYSTACK_RELOCATE (yyvs_alloc, yyvs);
+# undef YYSTACK_RELOCATE
+ if (yyss1 != yyssa)
+ YYSTACK_FREE (yyss1);
+ }
+# endif
+#endif /* no yyoverflow */
+
+ yyssp = yyss + yysize - 1;
+ yyvsp = yyvs + yysize - 1;
+
+ YYDPRINTF ((stderr, "Stack size increased to %lu\n",
+ (unsigned long int) yystacksize));
+
+ if (yyss + yystacksize - 1 <= yyssp)
+ YYABORT;
+ }
+
+ YYDPRINTF ((stderr, "Entering state %d\n", yystate));
+
+ if (yystate == YYFINAL)
+ YYACCEPT;
+
+ goto yybackup;
+
+/*-----------.
+| yybackup. |
+`-----------*/
+yybackup:
+
+ /* Do appropriate processing given the current state. Read a
+ lookahead token if we need one and don't already have one. */
+
+ /* First try to decide what to do without reference to lookahead token. */
+ yyn = yypact[yystate];
+ if (yypact_value_is_default (yyn))
+ goto yydefault;
+
+ /* Not known => get a lookahead token if don't already have one. */
+
+ /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol. */
+ if (yychar == YYEMPTY)
+ {
+ YYDPRINTF ((stderr, "Reading a token: "));
+ yychar = YYLEX;
+ }
+
+ if (yychar <= YYEOF)
+ {
+ yychar = yytoken = YYEOF;
+ YYDPRINTF ((stderr, "Now at end of input.\n"));
+ }
+ else
+ {
+ yytoken = YYTRANSLATE (yychar);
+ YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
+ }
+
+ /* If the proper action on seeing token YYTOKEN is to reduce or to
+ detect an error, take that action. */
+ yyn += yytoken;
+ if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
+ goto yydefault;
+ yyn = yytable[yyn];
+ if (yyn <= 0)
+ {
+ if (yytable_value_is_error (yyn))
+ goto yyerrlab;
+ yyn = -yyn;
+ goto yyreduce;
+ }
+
+ /* Count tokens shifted since error; after three, turn off error
+ status. */
+ if (yyerrstatus)
+ yyerrstatus--;
+
+ /* Shift the lookahead token. */
+ YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
+
+ /* Discard the shifted token. */
+ yychar = YYEMPTY;
+
+ yystate = yyn;
+ *++yyvsp = yylval;
+
+ goto yynewstate;
+
+
+/*-----------------------------------------------------------.
+| yydefault -- do the default action for the current state. |
+`-----------------------------------------------------------*/
+yydefault:
+ yyn = yydefact[yystate];
+ if (yyn == 0)
+ goto yyerrlab;
+ goto yyreduce;
+
+
+/*-----------------------------.
+| yyreduce -- Do a reduction. |
+`-----------------------------*/
+yyreduce:
+ /* yyn is the number of a rule to reduce with. */
+ yylen = yyr2[yyn];
+
+ /* If YYLEN is nonzero, implement the default value of the action:
+ `$$ = $1'.
+
+ Otherwise, the following line sets YYVAL to garbage.
+ This behavior is undocumented and Bison
+ users should not rely upon it. Assigning to YYVAL
+ unconditionally makes the parser a bit smaller, and it avoids a
+ GCC warning that YYVAL may be used uninitialized. */
+ yyval = yyvsp[1-yylen];
+
+
+ YY_REDUCE_PRINT (yyn);
+ switch (yyn)
+ {
+ case 2:
+
+/* Line 1806 of yacc.c */
+#line 79 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(1) - (1)].node);
+ ((fts_ast_state_t*) state)->root = (yyval.node);
+ }
+ break;
+
+ case 3:
+
+/* Line 1806 of yacc.c */
+#line 85 "fts0pars.y"
+ {
+ (yyval.node) = NULL;
+ }
+ break;
+
+ case 4:
+
+/* Line 1806 of yacc.c */
+#line 89 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(1) - (2)].node);
+
+ if (!(yyval.node)) {
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(2) - (2)].node));
+ } else {
+ fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+ }
+ }
+ break;
+
+ case 5:
+
+/* Line 1806 of yacc.c */
+#line 99 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(1) - (2)].node);
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node));
+
+ if (!(yyval.node)) {
+ (yyval.node) = (yyvsp[(2) - (2)].node);
+ } else {
+ fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+ }
+ }
+ break;
+
+ case 6:
+
+/* Line 1806 of yacc.c */
+#line 111 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(2) - (3)].node);
+
+ if ((yyval.node)) {
+ (yyval.node) = fts_ast_create_node_subexp_list(state, (yyval.node));
+ }
+ }
+ break;
+
+ case 7:
+
+/* Line 1806 of yacc.c */
+#line 119 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (4)].node));
+
+ if ((yyvsp[(3) - (4)].node)) {
+ fts_ast_add_node((yyval.node),
+ fts_ast_create_node_subexp_list(state, (yyvsp[(3) - (4)].node)));
+ }
+ }
+ break;
+
+ case 8:
+
+/* Line 1806 of yacc.c */
+#line 129 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(1) - (1)].node);
+ }
+ break;
+
+ case 9:
+
+/* Line 1806 of yacc.c */
+#line 133 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(1) - (1)].node);
+ }
+ break;
+
+ case 10:
+
+/* Line 1806 of yacc.c */
+#line 137 "fts0pars.y"
+ {
+ fts_ast_term_set_wildcard((yyvsp[(1) - (2)].node));
+ }
+ break;
+
+ case 11:
+
+/* Line 1806 of yacc.c */
+#line 141 "fts0pars.y"
+ {
+ fts_ast_text_set_distance((yyvsp[(1) - (3)].node), fts_ast_string_to_ul((yyvsp[(3) - (3)].token), 10));
+ fts_ast_string_free((yyvsp[(3) - (3)].token));
+ }
+ break;
+
+ case 12:
+
+/* Line 1806 of yacc.c */
+#line 146 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (3)].node));
+ fts_ast_add_node((yyval.node), (yyvsp[(2) - (3)].node));
+ fts_ast_term_set_wildcard((yyvsp[(2) - (3)].node));
+ }
+ break;
+
+ case 13:
+
+/* Line 1806 of yacc.c */
+#line 152 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node));
+ fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+ }
+ break;
+
+ case 14:
+
+/* Line 1806 of yacc.c */
+#line 157 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (4)].node));
+ fts_ast_add_node((yyval.node), (yyvsp[(2) - (4)].node));
+ fts_ast_text_set_distance((yyvsp[(2) - (4)].node), fts_ast_string_to_ul((yyvsp[(4) - (4)].token), 10));
+ fts_ast_string_free((yyvsp[(4) - (4)].token));
+ }
+ break;
+
+ case 15:
+
+/* Line 1806 of yacc.c */
+#line 164 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node));
+ fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+ }
+ break;
+
+ case 16:
+
+/* Line 1806 of yacc.c */
+#line 170 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_oper(state, FTS_IGNORE);
+ }
+ break;
+
+ case 17:
+
+/* Line 1806 of yacc.c */
+#line 174 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_oper(state, FTS_EXIST);
+ }
+ break;
+
+ case 18:
+
+/* Line 1806 of yacc.c */
+#line 178 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_oper(state, FTS_NEGATE);
+ }
+ break;
+
+ case 19:
+
+/* Line 1806 of yacc.c */
+#line 182 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_oper(state, FTS_DECR_RATING);
+ }
+ break;
+
+ case 20:
+
+/* Line 1806 of yacc.c */
+#line 186 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_oper(state, FTS_INCR_RATING);
+ }
+ break;
+
+ case 21:
+
+/* Line 1806 of yacc.c */
+#line 191 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token));
+ fts_ast_string_free((yyvsp[(1) - (1)].token));
+ }
+ break;
+
+ case 22:
+
+/* Line 1806 of yacc.c */
+#line 196 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token));
+ fts_ast_string_free((yyvsp[(1) - (1)].token));
+ }
+ break;
+
+ case 23:
+
+/* Line 1806 of yacc.c */
+#line 202 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(2) - (2)].node);
+ }
+ break;
+
+ case 24:
+
+/* Line 1806 of yacc.c */
+#line 207 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_text(state, (yyvsp[(1) - (1)].token));
+ fts_ast_string_free((yyvsp[(1) - (1)].token));
+ }
+ break;
+
+
+
+/* Line 1806 of yacc.c */
+#line 1663 "fts0pars.cc"
+ default: break;
+ }
+ /* User semantic actions sometimes alter yychar, and that requires
+ that yytoken be updated with the new translation. We take the
+ approach of translating immediately before every use of yytoken.
+ One alternative is translating here after every semantic action,
+ but that translation would be missed if the semantic action invokes
+ YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or
+ if it invokes YYBACKUP. In the case of YYABORT or YYACCEPT, an
+ incorrect destructor might then be invoked immediately. In the
+ case of YYERROR or YYBACKUP, subsequent parser actions might lead
+ to an incorrect destructor call or verbose syntax error message
+ before the lookahead is translated. */
+ YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc);
+
+ YYPOPSTACK (yylen);
+ yylen = 0;
+ YY_STACK_PRINT (yyss, yyssp);
+
+ *++yyvsp = yyval;
+
+ /* Now `shift' the result of the reduction. Determine what state
+ that goes to, based on the state we popped back to and the rule
+ number reduced by. */
+
+ yyn = yyr1[yyn];
+
+ yystate = yypgoto[yyn - YYNTOKENS] + *yyssp;
+ if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp)
+ yystate = yytable[yystate];
+ else
+ yystate = yydefgoto[yyn - YYNTOKENS];
+
+ goto yynewstate;
+
+
+/*------------------------------------.
+| yyerrlab -- here on detecting error |
+`------------------------------------*/
+yyerrlab:
+ /* Back up yychar, in case we change it. */
+ yylastchar = yychar;
+ /* Make sure we have latest lookahead translation. See comments at
+ user semantic actions for why this is necessary. */
+ yytoken = yychar == YYEMPTY ? YYEMPTY : YYTRANSLATE (yychar);
+
+ /* If not already recovering from an error, report this error. */
+ if (!yyerrstatus)
+ {
+ ++yynerrs;
+#if ! YYERROR_VERBOSE
+ yyerror (YY_("syntax error"));
+#else
+# define YYSYNTAX_ERROR yysyntax_error (&yymsg_alloc, &yymsg, \
+ yyssp, yytoken)
+ {
+ char const *yymsgp = YY_("syntax error");
+ int yysyntax_error_status;
+ yysyntax_error_status = YYSYNTAX_ERROR;
+ if (yysyntax_error_status == 0)
+ yymsgp = yymsg;
+ else if (yysyntax_error_status == 1)
+ {
+ if (yymsg != yymsgbuf)
+ YYSTACK_FREE (yymsg);
+ yymsg = (char *) YYSTACK_ALLOC (yymsg_alloc);
+ if (!yymsg)
+ {
+ yymsg = yymsgbuf;
+ yymsg_alloc = sizeof yymsgbuf;
+ yysyntax_error_status = 2;
+ }
+ else
+ {
+ yysyntax_error_status = YYSYNTAX_ERROR;
+ yymsgp = yymsg;
+ }
+ }
+ yyerror (yymsgp);
+ if (yysyntax_error_status == 2)
+ goto yyexhaustedlab;
+ }
+# undef YYSYNTAX_ERROR
+#endif
+ }
+
+
+
+ if (yyerrstatus == 3)
+ {
+ /* If just tried and failed to reuse lookahead token after an
+ error, discard it. */
+
+ if (yychar <= YYEOF)
+ {
+ /* Return failure if at end of input. */
+ if (yychar == YYEOF)
+ {
+ /* Since we don't need the token, we have to free it first. */
+ YYERRCLEANUP;
+ YYABORT;
+ }
+ }
+ else
+ {
+ yydestruct ("Error: discarding",
+ yytoken, &yylval);
+ yychar = YYEMPTY;
+ }
+ }
+
+ /* Else will try to reuse lookahead token after shifting the error
+ token. */
+ goto yyerrlab1;
+
+
+/*---------------------------------------------------.
+| yyerrorlab -- error raised explicitly by YYERROR. |
+`---------------------------------------------------*/
+yyerrorlab:
+
+ /* Pacify compilers like GCC when the user code never invokes
+ YYERROR and the label yyerrorlab therefore never appears in user
+ code. */
+ if (/*CONSTCOND*/ 0)
+ goto yyerrorlab;
+
+ /* Do not reclaim the symbols of the rule which action triggered
+ this YYERROR. */
+ YYPOPSTACK (yylen);
+ yylen = 0;
+ YY_STACK_PRINT (yyss, yyssp);
+ yystate = *yyssp;
+ goto yyerrlab1;
+
+
+/*-------------------------------------------------------------.
+| yyerrlab1 -- common code for both syntax error and YYERROR. |
+`-------------------------------------------------------------*/
+yyerrlab1:
+ yyerrstatus = 3; /* Each real token shifted decrements this. */
+
+ for (;;)
+ {
+ yyn = yypact[yystate];
+ if (!yypact_value_is_default (yyn))
+ {
+ yyn += YYTERROR;
+ if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
+ {
+ yyn = yytable[yyn];
+ if (0 < yyn)
+ break;
+ }
+ }
+
+ /* Pop the current state because it cannot handle the error token. */
+ if (yyssp == yyss)
+ {
+ /* Since we don't need the error token, we have to free it first. */
+ YYERRCLEANUP;
+ YYABORT;
+ }
+
+
+ yydestruct ("Error: popping",
+ yystos[yystate], yyvsp);
+ YYPOPSTACK (1);
+ yystate = *yyssp;
+ YY_STACK_PRINT (yyss, yyssp);
+ }
+
+ *++yyvsp = yylval;
+
+
+ /* Shift the error token. */
+ YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
+
+ yystate = yyn;
+ goto yynewstate;
+
+
+/*-------------------------------------.
+| yyacceptlab -- YYACCEPT comes here. |
+`-------------------------------------*/
+yyacceptlab:
+ yyresult = 0;
+ goto yyreturn;
+
+/*-----------------------------------.
+| yyabortlab -- YYABORT comes here. |
+`-----------------------------------*/
+yyabortlab:
+ yyresult = 1;
+ goto yyreturn;
+
+#if !defined(yyoverflow) || YYERROR_VERBOSE
+/*-------------------------------------------------.
+| yyexhaustedlab -- memory exhaustion comes here. |
+`-------------------------------------------------*/
+yyexhaustedlab:
+ yyerror (YY_("memory exhausted"));
+ yyresult = 2;
+ /* Fall through. */
+#endif
+
+yyreturn:
+ if (yychar != YYEMPTY)
+ {
+ /* Make sure we have latest lookahead translation. See comments at
+ user semantic actions for why this is necessary. */
+ yytoken = YYTRANSLATE (yychar);
+ yydestruct ("Cleanup: discarding lookahead",
+ yytoken, &yylval);
+ }
+ /* Do not reclaim the symbols of the rule which action triggered
+ this YYABORT or YYACCEPT. */
+ YYPOPSTACK (yylen);
+ YY_STACK_PRINT (yyss, yyssp);
+ while (yyssp != yyss)
+ {
+ yydestruct ("Cleanup: popping",
+ yystos[*yyssp], yyvsp);
+ YYPOPSTACK (1);
+ }
+#ifndef yyoverflow
+ if (yyss != yyssa)
+ YYSTACK_FREE (yyss);
+#endif
+#if YYERROR_VERBOSE
+ if (yymsg != yymsgbuf)
+ YYSTACK_FREE (yymsg);
+#endif
+ /* Make sure YYID is used. */
+ return YYID (yyresult);
+}
+
+
+
+/* Line 2067 of yacc.c */
+#line 212 "fts0pars.y"
+
+
+/********************************************************************
+Report a query parse error.*/
+int
+ftserror(
+/*=====*/
+ const char* p)
+{
+ my_printf_error(ER_PARSE_ERROR, "%s", MYF(0), p);
+ return(0);
+}
+
+/********************************************************************
+Create an fts_lexer_t instance.*/
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+ ibool boolean_mode,
+ const byte* query,
+ ulint query_len)
+{
+ fts_lexer_t* fts_lexer = static_cast<fts_lexer_t*>(
+ ut_malloc_nokey(sizeof(fts_lexer_t)));
+
+ if (boolean_mode) {
+ fts0blex_init(&fts_lexer->yyscanner);
+ fts0b_scan_bytes(
+ reinterpret_cast<const char*>(query),
+ static_cast<int>(query_len),
+ fts_lexer->yyscanner);
+ fts_lexer->scanner = fts_blexer;
+ /* FIXME: Debugging */
+ /* fts0bset_debug(1 , fts_lexer->yyscanner); */
+ } else {
+ fts0tlex_init(&fts_lexer->yyscanner);
+ fts0t_scan_bytes(
+ reinterpret_cast<const char*>(query),
+ static_cast<int>(query_len),
+ fts_lexer->yyscanner);
+ fts_lexer->scanner = fts_tlexer;
+ }
+
+ return(fts_lexer);
+}
+
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+fts_lexer_free(
+/*===========*/
+ fts_lexer_t* fts_lexer)
+{
+ if (fts_lexer->scanner == fts_blexer) {
+ fts0blex_destroy(fts_lexer->yyscanner);
+ } else {
+ fts0tlex_destroy(fts_lexer->yyscanner);
+ }
+
+ ut_free(fts_lexer);
+}
+
+/********************************************************************
+Call the appropriate scanner.*/
+int
+fts_lexer(
+/*======*/
+ YYSTYPE* val,
+ fts_lexer_t* fts_lexer)
+{
+ fts_scanner func_ptr;
+
+ func_ptr = fts_lexer->scanner;
+
+ return(func_ptr(val, fts_lexer->yyscanner));
+}
+
+/********************************************************************
+Parse the query.*/
+int
+fts_parse(
+/*======*/
+ fts_ast_state_t* state)
+{
+ return(ftsparse(state));
+}
+
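+/********************************************************************
+A minimal usage sketch, assuming the caller has already allocated and
+initialized an fts_ast_state_t 'state' (its 'lexer' member is the one
+read via YYLEX_PARAM):
+
+ state->lexer = fts_lexer_create(TRUE, query, query_len);
+ int ret = fts_parse(state); // 0 on success; AST root in state->root
+ fts_lexer_free(state->lexer);
+ state->lexer = NULL;
+*/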
diff --git a/storage/innobase/fts/fts0pars.y b/storage/innobase/fts/fts0pars.y
new file mode 100644
index 00000000..deebc79e
--- /dev/null
+++ b/storage/innobase/fts/fts0pars.y
@@ -0,0 +1,293 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0pars.y
+ * FTS parser: input file for the GNU Bison parser generator
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+#include "ha_prototypes.h"
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0blex.h"
+#include "fts0tlex.h"
+#include "fts0pars.h"
+#include <my_sys.h>
+
+extern int fts_lexer(YYSTYPE*, fts_lexer_t*);
+extern int fts_blexer(YYSTYPE*, yyscan_t);
+extern int fts_tlexer(YYSTYPE*, yyscan_t);
+
+
+
+extern int ftserror(const char* p);
+
+/* Required for reentrant parser */
+#define ftslex fts_lexer
+
+#define YYERROR_VERBOSE
+
+/* For passing an argument to yyparse() */
+#define YYPARSE_PARAM state
+#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer
+
+
+typedef int (*fts_scanner)(YYSTYPE* val, yyscan_t yyscanner);
+
+struct fts_lexer_struct {
+ fts_scanner scanner;
+ void* yyscanner;
+};
+
+%}
+
+%union {
+ int oper;
+ fts_ast_string_t* token;
+ fts_ast_node_t* node;
+};
+
+/* Enable re-entrant parser */
+%pure_parser
+
+%token<oper> FTS_OPER
+%token<token> FTS_TEXT FTS_TERM FTS_NUMB
+
+%type<node> prefix term text expr sub_expr expr_lst query
+
+%nonassoc '+' '-' '~' '<' '>'
+
+%%
+
+query : expr_lst {
+ $$ = $1;
+ ((fts_ast_state_t*) state)->root = $$;
+ }
+ ;
+
+expr_lst: /* Empty */ {
+ $$ = NULL;
+ }
+
+ | expr_lst expr {
+ $$ = $1;
+
+ if (!$$) {
+ $$ = fts_ast_create_node_list(state, $2);
+ } else {
+ fts_ast_add_node($$, $2);
+ }
+ }
+
+ | expr_lst sub_expr {
+ $$ = $1;
+ $$ = fts_ast_create_node_list(state, $1);
+
+ if (!$$) {
+ $$ = $2;
+ } else {
+ fts_ast_add_node($$, $2);
+ }
+ }
+ ;
+
+sub_expr: '(' expr_lst ')' {
+ $$ = $2;
+
+ if ($$) {
+ $$ = fts_ast_create_node_subexp_list(state, $$);
+ }
+ }
+
+ | prefix '(' expr_lst ')' {
+ $$ = fts_ast_create_node_list(state, $1);
+
+ if ($3) {
+ fts_ast_add_node($$,
+ fts_ast_create_node_subexp_list(state, $3));
+ }
+ }
+ ;
+
+expr : term {
+ $$ = $1;
+ }
+
+ | text {
+ $$ = $1;
+ }
+
+ | term '*' {
+ fts_ast_term_set_wildcard($1);
+ }
+
+ | text '@' FTS_NUMB {
+ fts_ast_text_set_distance($1, fts_ast_string_to_ul($3, 10));
+ fts_ast_string_free($3);
+ }
+
+ | prefix term '*' {
+ $$ = fts_ast_create_node_list(state, $1);
+ fts_ast_add_node($$, $2);
+ fts_ast_term_set_wildcard($2);
+ }
+
+ | prefix term {
+ $$ = fts_ast_create_node_list(state, $1);
+ fts_ast_add_node($$, $2);
+ }
+
+ | prefix text '@' FTS_NUMB {
+ $$ = fts_ast_create_node_list(state, $1);
+ fts_ast_add_node($$, $2);
+ fts_ast_text_set_distance($2, fts_ast_string_to_ul($4, 10));
+ fts_ast_string_free($4);
+ }
+
+ | prefix text {
+ $$ = fts_ast_create_node_list(state, $1);
+ fts_ast_add_node($$, $2);
+ }
+ ;
+
+prefix : '-' {
+ $$ = fts_ast_create_node_oper(state, FTS_IGNORE);
+ }
+
+ | '+' {
+ $$ = fts_ast_create_node_oper(state, FTS_EXIST);
+ }
+
+ | '~' {
+ $$ = fts_ast_create_node_oper(state, FTS_NEGATE);
+ }
+
+ | '<' {
+ $$ = fts_ast_create_node_oper(state, FTS_DECR_RATING);
+ }
+
+ | '>' {
+ $$ = fts_ast_create_node_oper(state, FTS_INCR_RATING);
+ }
+ ;
+
+term : FTS_TERM {
+ $$ = fts_ast_create_node_term(state, $1);
+ fts_ast_string_free($1);
+ }
+
+ | FTS_NUMB {
+ $$ = fts_ast_create_node_term(state, $1);
+ fts_ast_string_free($1);
+ }
+
+ /* Ignore leading '*' */
+ | '*' term {
+ $$ = $2;
+ }
+ ;
+
+text : FTS_TEXT {
+ $$ = fts_ast_create_node_text(state, $1);
+ fts_ast_string_free($1);
+ }
+ ;
+%%
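+
+/********************************************************************
+A worked example of the grammar above (a sketch; the sample query is
+hypothetical): the boolean query
+
+ +apple -juice "fresh fruit" @3
+
+lexes as '+' FTS_TERM '-' FTS_TERM FTS_TEXT '@' FTS_NUMB and reduces as
+
+ prefix('+') term(apple) -> expr (FTS_EXIST list + term node)
+ prefix('-') term(juice) -> expr (FTS_IGNORE list + term node)
+ text("fresh fruit") '@' numb(3) -> expr (proximity distance set to 3)
+
+expr_lst chains the three exprs, and the query rule stores the final
+list in ((fts_ast_state_t*) state)->root. */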
+
+/********************************************************************
+Report a query parse error.*/
+int
+ftserror(
+/*=====*/
+ const char* p)
+{
+ fprintf(stderr, "%s\n", p);
+ return(0);
+}
+
+/********************************************************************
+Create an fts_lexer_t instance.*/
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+ ibool boolean_mode,
+ const byte* query,
+ ulint query_len)
+{
+ fts_lexer_t* fts_lexer = static_cast<fts_lexer_t*>(
+ ut_malloc_nokey(sizeof(fts_lexer_t)));
+
+ if (boolean_mode) {
+ fts0blex_init(&fts_lexer->yyscanner);
+ fts0b_scan_bytes((char*) query, (int) query_len, fts_lexer->yyscanner);
+ fts_lexer->scanner = fts_blexer;
+ /* FIXME: Debugging */
+ /* fts0bset_debug(1 , fts_lexer->yyscanner); */
+ } else {
+ fts0tlex_init(&fts_lexer->yyscanner);
+ fts0t_scan_bytes((char*) query, (int) query_len, fts_lexer->yyscanner);
+ fts_lexer->scanner = fts_tlexer;
+ }
+
+ return(fts_lexer);
+}
+
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+fts_lexer_free(
+/*===========*/
+ fts_lexer_t* fts_lexer)
+{
+ if (fts_lexer->scanner == fts_blexer) {
+ fts0blex_destroy(fts_lexer->yyscanner);
+ } else {
+ fts0tlex_destroy(fts_lexer->yyscanner);
+ }
+
+ ut_free(fts_lexer);
+}
+
+/********************************************************************
+Call the appropriate scanner.*/
+int
+fts_lexer(
+/*======*/
+ YYSTYPE* val,
+ fts_lexer_t* fts_lexer)
+{
+ fts_scanner func_ptr;
+
+ func_ptr = fts_lexer->scanner;
+
+ return(func_ptr(val, fts_lexer->yyscanner));
+}
+
+/********************************************************************
+Parse the query.*/
+int
+fts_parse(
+/*======*/
+ fts_ast_state_t* state)
+{
+ return(ftsparse(state));
+}
diff --git a/storage/innobase/fts/fts0plugin.cc b/storage/innobase/fts/fts0plugin.cc
new file mode 100644
index 00000000..de99d170
--- /dev/null
+++ b/storage/innobase/fts/fts0plugin.cc
@@ -0,0 +1,283 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0plugin.cc
+Full Text Search plugin support.
+
+Created 2013/06/04 Shaohua Wang
+***********************************************************************/
+
+#include "fts0ast.h"
+#include "fts0plugin.h"
+#include "fts0tokenize.h"
+
+#include "ft_global.h"
+
+/******************************************************************//**
+FTS default parser init
+@return 0 */
+static int fts_default_parser_init(MYSQL_FTPARSER_PARAM*) { return 0; }
+
+/******************************************************************//**
+FTS default parser deinit
+@return 0 */
+static int fts_default_parser_deinit(MYSQL_FTPARSER_PARAM*) { return 0; }
+
+/******************************************************************//**
+FTS default parser parse from ft_static.c in MYISAM.
+@return 0 if parsed successfully, or non-zero otherwise */
+static
+int
+fts_default_parser_parse(
+/*=====================*/
+ MYSQL_FTPARSER_PARAM *param) /*!< in: plugin parser param */
+{
+ return(param->mysql_parse(param, param->doc, param->length));
+}
+
+/* FTS default parser from ft_static.c in MYISAM. */
+struct st_mysql_ftparser fts_default_parser =
+{
+ MYSQL_FTPARSER_INTERFACE_VERSION,
+ fts_default_parser_parse,
+ fts_default_parser_init,
+ fts_default_parser_deinit
+};
+
+/******************************************************************//**
+Get an operator node from the token's boolean info
+@return node */
+static
+fts_ast_node_t*
+fts_query_get_oper_node(
+/*====================*/
+ MYSQL_FTPARSER_BOOLEAN_INFO* info, /*!< in: token info */
+ fts_ast_state_t* state) /*!< in/out: query parse state*/
+{
+ fts_ast_node_t* oper_node = NULL;
+
+ if (info->yesno > 0) {
+ oper_node = fts_ast_create_node_oper(state, FTS_EXIST);
+ } else if (info->yesno < 0) {
+ oper_node = fts_ast_create_node_oper(state, FTS_IGNORE);
+ } else if (info->weight_adjust > 0) {
+ oper_node = fts_ast_create_node_oper(state, FTS_INCR_RATING);
+ } else if (info->weight_adjust < 0) {
+ oper_node = fts_ast_create_node_oper(state, FTS_DECR_RATING);
+ } else if (info->wasign > 0) {
+ oper_node = fts_ast_create_node_oper(state, FTS_NEGATE);
+ }
+
+ return(oper_node);
+}
+
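+/* The mapping above mirrors the prefix rules in fts0pars.y, i.e. the
+boolean-mode operators: '+' -> FTS_EXIST, '-' -> FTS_IGNORE,
+'~' -> FTS_NEGATE, '<' -> FTS_DECR_RATING, '>' -> FTS_INCR_RATING. */
+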
+/******************************************************************//**
+FTS plugin parser 'mysql_add_word' callback function for query parse.
+Refer to 'st_mysql_ftparser_param' for more detail.
+Note:
+a. Parse logic refers to 'ftb_query_add_word' from ft_boolean_search.c in MYISAM;
+b. Parse node or tree refers to fts0pars.y.
+@return 0 if add successfully, or return non-zero. */
+static
+int
+fts_query_add_word_for_parser(
+/*==========================*/
+ MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */
+ const char* word, /*!< in: token */
+ int word_len, /*!< in: token length */
+ MYSQL_FTPARSER_BOOLEAN_INFO* info) /*!< in: token info */
+{
+ fts_ast_state_t* state =
+ static_cast<fts_ast_state_t*>(param->mysql_ftparam);
+ fts_ast_node_t* cur_node = state->cur_node;
+ fts_ast_node_t* oper_node = NULL;
+ fts_ast_node_t* term_node = NULL;
+ fts_ast_node_t* node = NULL;
+
+ switch (info->type) {
+ case FT_TOKEN_STOPWORD:
+ /* We only handle stopwords inside a phrase */
+ if (cur_node->type != FTS_AST_PARSER_PHRASE_LIST) {
+ break;
+ }
+ /* fall through */
+
+ case FT_TOKEN_WORD:
+ term_node = fts_ast_create_node_term_for_parser(
+ state, word, ulint(word_len));
+
+ if (info->trunc) {
+ fts_ast_term_set_wildcard(term_node);
+ }
+
+ if (cur_node->type == FTS_AST_PARSER_PHRASE_LIST) {
+ /* Ignore operator inside phrase */
+ fts_ast_add_node(cur_node, term_node);
+ } else {
+ ut_ad(cur_node->type == FTS_AST_LIST
+ || cur_node->type == FTS_AST_SUBEXP_LIST);
+ oper_node = fts_query_get_oper_node(info, state);
+
+ if (oper_node) {
+ node = fts_ast_create_node_list(state, oper_node);
+ fts_ast_add_node(node, term_node);
+ fts_ast_add_node(cur_node, node);
+ } else {
+ fts_ast_add_node(cur_node, term_node);
+ }
+ }
+
+ break;
+
+ case FT_TOKEN_LEFT_PAREN:
+ /* Check parse error */
+ if (cur_node->type != FTS_AST_LIST
+ && cur_node->type != FTS_AST_SUBEXP_LIST) {
+ return(1);
+ }
+
+ /* Set operator */
+ oper_node = fts_query_get_oper_node(info, state);
+ if (oper_node != NULL) {
+ node = fts_ast_create_node_list(state, oper_node);
+ fts_ast_add_node(cur_node, node);
+ node->go_up = true;
+ node->up_node = cur_node;
+ cur_node = node;
+ }
+
+ if (info->quot) {
+ /* Phrase node */
+ node = fts_ast_create_node_phrase_list(state);
+ } else {
+ /* Subexp list node */
+ node = fts_ast_create_node_subexp_list(state, NULL);
+ }
+
+ fts_ast_add_node(cur_node, node);
+
+ node->up_node = cur_node;
+ state->cur_node = node;
+ state->depth += 1;
+
+ break;
+
+ case FT_TOKEN_RIGHT_PAREN:
+ info->quot = 0;
+
+ if (cur_node->up_node != NULL) {
+ cur_node = cur_node->up_node;
+
+ if (cur_node->go_up) {
+ ut_a(cur_node->up_node
+ && !(cur_node->up_node->go_up));
+ cur_node = cur_node->up_node;
+ }
+ }
+
+ state->cur_node = cur_node;
+
+ if (state->depth > 0) {
+ state->depth--;
+ } else {
+ /* Parentheses mismatch */
+ return(1);
+ }
+
+ break;
+
+ case FT_TOKEN_EOF:
+ default:
+ break;
+ }
+
+ return(0);
+}
+
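+/* A sketch of how the callback above assembles the tree, assuming the
+plugin tokenizer reports a '+' prefix via info->yesno: for the query
+(apple +banana) it would emit
+
+ FT_TOKEN_LEFT_PAREN -> push an FTS_AST_SUBEXP_LIST node
+ FT_TOKEN_WORD "apple" -> term node appended to it
+ FT_TOKEN_WORD "banana", yesno > 0 -> FTS_EXIST oper + term node,
+ wrapped in a list node and appended
+ FT_TOKEN_RIGHT_PAREN -> pop back to the parent node
+
+A quoted phrase instead takes the FTS_AST_PARSER_PHRASE_LIST branch
+(info->quot set), where operators inside the phrase are ignored. */
+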
+/******************************************************************//**
+FTS plugin parser 'mysql_parse' callback function for query parse.
+Refer to 'st_mysql_ftparser_param' for more detail.
+@return 0 if parsed successfully */
+static
+int
+fts_parse_query_internal(
+/*=====================*/
+ MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */
+ const char* query, /*!< in: query string */
+ int len) /*!< in: query length */
+{
+ MYSQL_FTPARSER_BOOLEAN_INFO info;
+ const CHARSET_INFO* cs = param->cs;
+ uchar** start = (uchar**)(&query);
+ uchar* end = (uchar*)(query + len);
+ FT_WORD w = {NULL, 0, 0};
+
+ info.prev = ' ';
+ info.quot = 0;
+ memset(&w, 0, sizeof(w));
+ /* Note: We don't handle simple parser mode here,
+ but a user-supplied plugin parser should handle it. */
+ while (fts_get_word(cs, start, end, &w, &info)) {
+ int ret = param->mysql_add_word(
+ param,
+ reinterpret_cast<char*>(w.pos),
+ int(w.len), &info);
+ if (ret) {
+ return(ret);
+ }
+ }
+
+ return(0);
+}
+
+/******************************************************************//**
+Parse the query using the plugin parser.
+@return 0 if parsed successfully, or non-zero otherwise. */
+int
+fts_parse_by_parser(
+/*================*/
+ ibool mode, /*!< in: parse boolean mode */
+ uchar* query_str, /*!< in: query string */
+ ulint query_len, /*!< in: query string length */
+ st_mysql_ftparser* parser, /*!< in: fts plugin parser */
+ fts_ast_state_t* state) /*!< in/out: parser state */
+{
+ MYSQL_FTPARSER_PARAM param;
+ int ret;
+
+ ut_ad(parser);
+
+ /* Initialize the parser param */
+ param.mysql_parse = fts_parse_query_internal;
+ param.mysql_add_word = fts_query_add_word_for_parser;
+ param.mysql_ftparam = static_cast<void*>(state);
+ param.cs = state->charset;
+ param.doc = reinterpret_cast<char*>(query_str);
+ param.length = static_cast<int>(query_len);
+ param.flags = 0;
+ param.mode = mode ?
+ MYSQL_FTPARSER_FULL_BOOLEAN_INFO :
+ MYSQL_FTPARSER_SIMPLE_MODE;
+
+ PARSER_INIT(parser, &param);
+ ret = parser->parse(&param);
+ PARSER_DEINIT(parser, &param);
+
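+ /* A non-zero parser return and a leftover paren depth (more '('
+ than ')') both make the final result non-zero. */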
+ return(ret | state->depth);
+}
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
new file mode 100644
index 00000000..8e2cb838
--- /dev/null
+++ b/storage/innobase/fts/fts0que.cc
@@ -0,0 +1,4596 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0que.cc
+Full Text Search functionality.
+
+Created 2007/03/27 Sunny Bains
+Completed 2011/7/10 Sunny and Jimmy Yang
+*******************************************************/
+
+#include "dict0dict.h"
+#include "ut0rbt.h"
+#include "row0sel.h"
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+#include "fts0types.h"
+#include "fts0plugin.h"
+
+#include <iomanip>
+#include <vector>
+
+#define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
+
+#define RANK_DOWNGRADE (-1.0F)
+#define RANK_UPGRADE (1.0F)
+
+/* Maximum number of words supported in a phrase or proximity search. */
+#define MAX_PROXIMITY_ITEM 128
+
+/* Memory used by rbt itself for create and node add */
+#define SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
+#define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t)
+
+/*Initial byte length for 'words' in fts_ranking_t */
+#define RANKING_WORDS_INIT_LEN 4
+
+// FIXME: Need to have a generic iterator that traverses the ilist.
+
+typedef std::vector<fts_string_t, ut_allocator<fts_string_t> > word_vector_t;
+
+struct fts_word_freq_t;
+
+/** State of an FTS query. */
+struct fts_query_t {
+ mem_heap_t* heap; /*!< Heap to use for allocations */
+
+ trx_t* trx; /*!< The query transaction */
+
+ dict_index_t* index; /*!< The FTS index to search */
+ /*!< FTS auxiliary common table def */
+
+ fts_table_t fts_common_table;
+
+ fts_table_t fts_index_table;/*!< FTS auxiliary index table def */
+
+ size_t total_size; /*!< total memory size used by query */
+
+ fts_doc_ids_t* deleted; /*!< Deleted doc ids that need to be
+ filtered from the output */
+
+ fts_ast_node_t* root; /*!< Abstract syntax tree */
+
+ fts_ast_node_t* cur_node; /*!< Current tree node */
+
+ ib_rbt_t* word_map; /*!< Matched word map for
+ searching by word*/
+
+ word_vector_t* word_vector; /*!< Matched word vector for
+ searching by index */
+
+ ib_rbt_t* doc_ids; /*!< The current set of matching
+ doc ids, elements are of
+ type fts_ranking_t */
+
+ ib_rbt_t* intersection; /*!< The doc ids that were found in
+ doc_ids, this tree will become
+ the new doc_ids, elements are of type
+ fts_ranking_t */
+
+ /*!< Prepared statement to read the
+ nodes from the FTS INDEX */
+ que_t* read_nodes_graph;
+
+ fts_ast_oper_t oper; /*!< Current boolean mode operator */
+
+ /*!< TRUE if we want to collect the
+ word positions within the document */
+ ibool collect_positions;
+
+ ulint flags; /*!< Specify the full text search type,
+ such as boolean search, phrase
+ search, proximity search etc. */
+
+ ulint distance; /*!< The proximity distance of a
+ phrase search. */
+
+ /*!< These doc ids are used as a
+ boundary condition when searching the
+ FTS index rows */
+
+ doc_id_t lower_doc_id; /*!< Lowest doc id in doc_ids */
+
+ doc_id_t upper_doc_id; /*!< Highest doc id in doc_ids */
+
+ bool boolean_mode; /*!< TRUE if boolean mode query */
+
+ ib_vector_t* matched; /*!< Array of matching documents
+ (fts_match_t) to search for a phrase */
+
+ ib_vector_t** match_array; /*!< Used for proximity search, contains
+ position info for each matched word
+ in the word list */
+
+ ib_uint64_t total_docs; /*!< The total number of documents */
+
+ ulint total_words; /*!< The total number of words */
+
+ dberr_t error; /*!< Error code if any, that is
+ encountered during query processing */
+
+ ib_rbt_t* word_freqs; /*!< RB tree of word frequencies per
+ document, its elements are of type
+ fts_word_freq_t */
+
+ ib_rbt_t* wildcard_words; /*!< words with wildcard */
+
+ bool multi_exist; /*!< multiple FTS_EXIST oper */
+ byte visiting_sub_exp; /*!< count of nested
+ fts_ast_visit_sub_exp() */
+
+ st_mysql_ftparser* parser; /*!< fts plugin parser */
+};
+
+/** For phrase matching, first we collect the documents and the positions
+then we match. */
+struct fts_match_t {
+ doc_id_t doc_id; /*!< Document id */
+
+ ulint start; /*!< Start the phrase match from
+ this offset within the positions
+ vector. */
+
+ ib_vector_t* positions; /*!< Offsets of a word in a
+ document */
+};
+
+/** For matching tokens in a phrase search. We use this data structure in
+the callback that determines whether a document should be accepted or
+rejected for a phrase search. */
+struct fts_select_t {
+ doc_id_t doc_id; /*!< The document id to match */
+
+ ulint min_pos; /*!< For found to be TRUE at least
+ one position must be greater than
+ min_pos. */
+
+ ibool found; /*!< TRUE if found */
+
+ fts_word_freq_t*
+ word_freq; /*!< Word frequency instance of the
+ current word being looked up in
+ the FTS index */
+};
+
+typedef std::vector<ulint, ut_allocator<ulint> > pos_vector_t;
+
+/** This structure defines a set of ranges within the original documents,
+each of which has a minimum and a maximum position. Text in such a range
+should contain all the words of the proximity search. We will need to count
+the words in such a range to make sure it does not exceed the specified
+distance of the proximity search. */
+struct fts_proximity_t {
+ ulint n_pos; /*!< number of position set, defines
+ a range (min to max) containing all
+ matching words */
+ pos_vector_t min_pos; /*!< the minimum position (in bytes)
+ of the range */
+ pos_vector_t max_pos; /*!< the maximum position (in bytes)
+ of the range */
+};
+
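+/* For example (a sketch): for a proximity query on "a b c", each i in
+[0, n_pos) describes one candidate window, and the text between
+min_pos[i] and max_pos[i] (byte offsets) contains an occurrence of every
+word; the verifier then counts the words inside that window against the
+requested distance. */
+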
+/** The match positions and tokens to match */
+struct fts_phrase_t {
+ fts_phrase_t(const dict_table_t* table)
+ :
+ found(false),
+ match(NULL),
+ tokens(NULL),
+ distance(0),
+ charset(NULL),
+ heap(NULL),
+ zip_size(table->space->zip_size()),
+ proximity_pos(NULL),
+ parser(NULL)
+ {
+ }
+
+ /** Match result */
+ ibool found;
+
+ /** Positions within text */
+ const fts_match_t* match;
+
+ /** Tokens to match */
+ const ib_vector_t* tokens;
+
+ /** For matching on proximity distance. Can be 0 for exact match */
+ ulint distance;
+
+ /** Phrase match charset */
+ CHARSET_INFO* charset;
+
+ /** Heap for word processing */
+ mem_heap_t* heap;
+
+ /** ROW_FORMAT=COMPRESSED page size, or 0 */
+ const ulint zip_size;
+
+ /** Position info for proximity search verification. Records the
+ min and max position of words matched */
+ fts_proximity_t* proximity_pos;
+
+ /** FTS plugin parser */
+ st_mysql_ftparser* parser;
+};
+
+/** Parameter passed to the fts phrase match by the plugin parser */
+struct fts_phrase_param_t {
+ fts_phrase_t* phrase; /*!< Match phrase instance */
+ ulint token_index; /*!< Index of token to match next */
+ mem_heap_t* heap; /*!< Heap for word processing */
+};
+
+/** For storing the frequency of a word/term in a document */
+struct fts_doc_freq_t {
+ doc_id_t doc_id; /*!< Document id */
+ ulint freq; /*!< Frequency of a word in a document */
+};
+
+/** To determine the word frequency per document. */
+struct fts_word_freq_t {
+ fts_string_t word; /*!< Word for which we need the freq,
+ it's allocated on the query heap */
+
+ ib_rbt_t* doc_freqs; /*!< RB Tree for storing per document
+ word frequencies. The elements are
+ of type fts_doc_freq_t */
+ ib_uint64_t doc_count; /*!< Total number of documents that
+ contain this word */
+ double idf; /*!< Inverse document frequency */
+};
+
+/********************************************************************
+Callback function to fetch the rows in an FTS INDEX record.
+@return always TRUE */
+static
+ibool
+fts_query_index_fetch_nodes(
+/*========================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg); /*!< in: pointer to ib_vector_t */
+
+/********************************************************************
+Read and filter nodes.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+fts_query_filter_doc_ids(
+/*=====================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_string_t* word, /*!< in: the current word */
+ fts_word_freq_t* word_freq, /*!< in/out: word frequency */
+ const fts_node_t* node, /*!< in: current FTS node */
+ void* data, /*!< in: doc id ilist */
+ ulint len, /*!< in: doc id ilist size */
+ ibool calc_doc_count);/*!< in: whether to remember doc
+ count */
+
+/** Process (nested) sub-expression, create a new result set to store the
+sub-expression result by processing nodes under current sub-expression
+list. Merge the sub-expression result with that of parent expression list.
+@param[in,out] node current root node
+@param[in,out] visitor callback function
+@param[in,out] arg argument for callback
+@return DB_SUCCESS if all goes well */
+static
+dberr_t
+fts_ast_visit_sub_exp(
+ fts_ast_node_t* node,
+ fts_ast_callback visitor,
+ void* arg);
+
+#if 0
+/*****************************************************************//***
+Find a doc_id in a word's ilist.
+@return TRUE if found. */
+static
+ibool
+fts_query_find_doc_id(
+/*==================*/
+ fts_select_t* select, /*!< in/out: search the doc id selected,
+ update the frequency if found. */
+ void* data, /*!< in: doc id ilist */
+ ulint len); /*!< in: doc id ilist size */
+#endif
+
+/*************************************************************//**
+This function implements a simple "blind" query expansion search:
+words in documents found in the first search pass will be used as
+search arguments to search the document again, thus "expanding"
+the search result set.
+@return DB_SUCCESS on success, otherwise an error code */
+static
+dberr_t
+fts_expand_query(
+/*=============*/
+ dict_index_t* index, /*!< in: FTS index to search */
+ fts_query_t* query) /*!< in: query result, to be freed
+ by the client */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+This function finds documents that contain all words in a
+phrase or proximity search. For a proximity search, it also verifies
+that the words are close enough to each other, as in the specified distance.
+This function is called for phrase and proximity searches.
+@return TRUE if documents are found, FALSE otherwise */
+static
+ibool
+fts_phrase_or_proximity_search(
+/*===========================*/
+ fts_query_t* query, /*!< in/out: query instance
+ query->doc_ids might be instantiated
+ with qualified doc IDs */
+ ib_vector_t* tokens); /*!< in: Tokens contain words */
+/*************************************************************//**
+This function checks whether words in result documents are close to
+each other (within proximity range as specified by "distance").
+If "distance" is MAX_ULINT, then it will find all combinations of
+positions of matching words and store min and max positions
+in the "qualified_pos" for later verification.
+@return true if words are close to each other, false otherwise */
+static
+bool
+fts_proximity_get_positions(
+/*========================*/
+ fts_match_t** match, /*!< in: query instance */
+ ulint num_match, /*!< in: number of matching
+ items */
+ ulint distance, /*!< in: distance value
+ for proximity search */
+ fts_proximity_t* qualified_pos); /*!< out: the position info
+ records ranges containing
+ all matching words. */
+#if 0
+/********************************************************************
+Get the total number of words in a document. */
+static
+ulint
+fts_query_terms_in_document(
+/*========================*/
+ /*!< out: DB_SUCCESS if all go well
+ else error code */
+ fts_query_t* query, /*!< in: FTS query state */
+ doc_id_t doc_id, /*!< in: the word to check */
+ ulint* total); /*!< out: total words in document */
+#endif
+
+/********************************************************************
+Compare two fts_doc_freq_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_freq_doc_id_cmp(
+/*================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1;
+ const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2;
+
+ return((int) (fq1->doc_id - fq2->doc_id));
+}
+
+#if 0
+/*******************************************************************//**
+Print the table used for calculating LCS. */
+static
+void
+fts_print_lcs_table(
+/*================*/
+ const ulint* table, /*!< in: array to print */
+ ulint n_rows, /*!< in: total no. of rows */
+ ulint n_cols) /*!< in: total no. of cols */
+{
+ ulint i;
+
+ for (i = 0; i < n_rows; ++i) {
+ ulint j;
+
+ printf("\n");
+
+ for (j = 0; j < n_cols; ++j) {
+
+ printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
+ }
+ }
+}
+
+/********************************************************************
+Find the longest common subsequence between the query string and
+the document. */
+static
+ulint
+fts_query_lcs(
+/*==========*/
+ /*!< out: LCS (length) between
+ two ilists */
+ const ulint* p1, /*!< in: word positions of query */
+ ulint len_p1, /*!< in: no. of elements in p1 */
+ const ulint* p2, /*!< in: word positions within document */
+ ulint len_p2) /*!< in: no. of elements in p2 */
+{
+ int i;
+ ulint len = 0;
+ ulint r = len_p1;
+ ulint c = len_p2;
+ ulint size = (r + 1) * (c + 1) * sizeof(ulint);
+ ulint* table = (ulint*) ut_malloc_nokey(size);
+
+ /* Traverse the table backwards, from the last row to the first and
+ also from the last column to the first. We compute the smaller
+ common subsequences first, then use the calculated values to determine
+ the longest common subsequence. The result will be in TABLE[0][0]. */
+ for (i = r; i >= 0; --i) {
+ int j;
+
+ for (j = c; j >= 0; --j) {
+
+ if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
+
+ FTS_ELEM(table, c, i, j) = 0;
+
+ } else if (p1[i] == p2[j]) {
+
+ FTS_ELEM(table, c, i, j) = FTS_ELEM(
+ table, c, i + 1, j + 1) + 1;
+
+ } else {
+
+ ulint value;
+
+ value = ut_max(
+ FTS_ELEM(table, c, i + 1, j),
+ FTS_ELEM(table, c, i, j + 1));
+
+ FTS_ELEM(table, c, i, j) = value;
+ }
+ }
+ }
+
+ len = FTS_ELEM(table, c, 0, 0);
+
+ fts_print_lcs_table(table, r, c);
+ printf("\nLen=" ULINTPF "\n", len);
+
+ ut_free(table);
+
+ return(len);
+}
+#endif
+
+/*******************************************************************//**
+Compare two fts_ranking_t instances on their rank value and doc ids:
+descending order on the rank and ascending order on doc id.
+@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
+static
+int
+fts_query_compare_rank(
+/*===================*/
+ const void* p1, /*!< in: pointer to elem */
+ const void* p2) /*!< in: pointer to elem */
+{
+ const fts_ranking_t* r1 = (const fts_ranking_t*) p1;
+ const fts_ranking_t* r2 = (const fts_ranking_t*) p2;
+
+ if (r2->rank < r1->rank) {
+ return(-1);
+ } else if (r2->rank == r1->rank) {
+
+ if (r1->doc_id < r2->doc_id) {
+ return(-1);
+ } else if (r1->doc_id > r2->doc_id) {
+ return(1);
+ }
+
+ return(0);
+ }
+
+ return(1);
+}
+
+/*******************************************************************//**
+Create words in ranking */
+static
+void
+fts_ranking_words_create(
+/*=====================*/
+ fts_query_t* query, /*!< in: query instance */
+ fts_ranking_t* ranking) /*!< in: ranking instance */
+{
+ ranking->words = static_cast<byte*>(
+ mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
+ ranking->words_len = RANKING_WORDS_INIT_LEN;
+}
+
+/*
+The optimization here is to use a char array (a bitmap) in fts_ranking_t in
+place of a words rb tree.
+
+It saves a lot of memory, except in some cases of QUERY EXPANSION.
+
+'word_map' is used as a word dictionary, in which the key is a word and the
+value is a number. In 'fts_ranking_words_add', we first check whether the
+word is in 'word_map'. If not, we add it to 'word_map' and assign it a
+position (actually a number). Then we set the bit at that position in the
+char array 'words' to '1'.
+
+'word_vector' is a useful backup of 'word_map': it lets us fetch a word by
+its position more quickly than searching 'word_map' by value. We use
+'word_vector' in 'fts_query_calculate_ranking' and 'fts_expand_query'. Both
+functions scan the bitmap 'words', fetch the word whenever a bit is '1', and
+then look up the word_freq by that word.
+*/
+
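+/* A minimal sketch of the bitmap convention, where 'pos' is the number
+assigned to a word in 'word_map':
+
+ // set: mark word #pos as present in this ranking
+ ranking->words[pos / CHAR_BIT] |= (byte) (1 << (pos % CHAR_BIT));
+
+ // test: is word #pos present?
+ bool hit = ranking->words[pos / CHAR_BIT] & (1 << (pos % CHAR_BIT));
+
+fts_ranking_words_add() below also grows 'words' (doubling words_len)
+whenever pos falls past the end of the current bitmap. */
+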
+/*******************************************************************//**
+Add a word into ranking */
+static
+void
+fts_ranking_words_add(
+/*==================*/
+ fts_query_t* query, /*!< in: query instance */
+ fts_ranking_t* ranking, /*!< in: ranking instance */
+ const fts_string_t* word) /*!< in: term/word to add */
+{
+ ulint pos;
+ ulint byte_offset;
+ ulint bit_offset;
+ ib_rbt_bound_t parent;
+
+ /* Note: we suppose the word map and vector are append-only. */
+ ut_ad(query->word_vector->size() == rbt_size(query->word_map));
+
+ /* We use ib_rbt to simulate a map; f_n_char stores the position. */
+ if (rbt_search(query->word_map, &parent, word) == 0) {
+ fts_string_t* result_word;
+
+ result_word = rbt_value(fts_string_t, parent.last);
+ pos = result_word->f_n_char;
+ ut_ad(pos < rbt_size(query->word_map));
+ } else {
+ /* Add the word to map. */
+ fts_string_t new_word;
+
+ pos = rbt_size(query->word_map);
+
+ fts_string_dup(&new_word, word, query->heap);
+ new_word.f_n_char = pos;
+
+ rbt_add_node(query->word_map, &parent, &new_word);
+ ut_ad(rbt_validate(query->word_map));
+ query->word_vector->push_back(new_word);
+ }
+
+ /* Check words len */
+ byte_offset = pos / CHAR_BIT;
+ if (byte_offset >= ranking->words_len) {
+ byte* words = ranking->words;
+ ulint words_len = ranking->words_len;
+
+ while (byte_offset >= words_len) {
+ words_len *= 2;
+ }
+
+ ranking->words = static_cast<byte*>(
+ mem_heap_zalloc(query->heap, words_len));
+ memcpy(ranking->words, words, ranking->words_len);
+ ranking->words_len = words_len;
+ }
+
+ /* Set ranking words */
+ ut_ad(byte_offset < ranking->words_len);
+ bit_offset = pos % CHAR_BIT;
+ ranking->words[byte_offset] = static_cast<byte>(
+ ranking->words[byte_offset] | 1 << bit_offset);
+}
+
+/*******************************************************************//**
+Get a word from a ranking
+@return true if it's successful */
+static
+bool
+fts_ranking_words_get_next(
+/*=======================*/
+ const fts_query_t* query, /*!< in: query instance */
+ fts_ranking_t* ranking,/*!< in: ranking instance */
+ ulint* pos, /*!< in/out: word start pos */
+ fts_string_t* word) /*!< in/out: term/word to add */
+{
+ bool ret = false;
+ ulint max_pos = ranking->words_len * CHAR_BIT;
+
+ /* Search for next word */
+ while (*pos < max_pos) {
+ ulint byte_offset = *pos / CHAR_BIT;
+ ulint bit_offset = *pos % CHAR_BIT;
+
+ if (ranking->words[byte_offset] & (1 << bit_offset)) {
+ ret = true;
+ break;
+ }
+
+ *pos += 1;
+ }
+
+ /* Get next word from word vector */
+ if (ret) {
+ ut_ad(*pos < query->word_vector->size());
+ *word = query->word_vector->at((size_t)*pos);
+ *pos += 1;
+ }
+
+ return ret;
+}
+
+/*******************************************************************//**
+Add a word if it doesn't exist, to the term freq RB tree. We store
+a pointer to the word that is passed in as the argument.
+@return pointer to word */
+static
+fts_word_freq_t*
+fts_query_add_word_freq(
+/*====================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_string_t* word) /*!< in: term/word to add */
+{
+ ib_rbt_bound_t parent;
+
+ /* Lookup the word in our rb tree and add if it doesn't exist. */
+ if (rbt_search(query->word_freqs, &parent, word) != 0) {
+ fts_word_freq_t word_freq;
+
+ memset(&word_freq, 0, sizeof(word_freq));
+
+ fts_string_dup(&word_freq.word, word, query->heap);
+
+ word_freq.doc_count = 0;
+
+ word_freq.doc_freqs = rbt_create(
+ sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
+
+ parent.last = rbt_add_node(
+ query->word_freqs, &parent, &word_freq);
+
+ query->total_size += word->f_len
+ + SIZEOF_RBT_CREATE
+ + SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_word_freq_t);
+ }
+
+ return(rbt_value(fts_word_freq_t, parent.last));
+}
+
+/*******************************************************************//**
+Add a doc id if it doesn't exist, to the doc freq RB tree.
+@return pointer to word */
+static
+fts_doc_freq_t*
+fts_query_add_doc_freq(
+/*===================*/
+ fts_query_t* query, /*!< in: query instance */
+ ib_rbt_t* doc_freqs, /*!< in: rb tree of fts_doc_freq_t */
+ doc_id_t doc_id) /*!< in: doc id to add */
+{
+ ib_rbt_bound_t parent;
+
+ /* Lookup the doc id in our rb tree and add if it doesn't exist. */
+ if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
+ fts_doc_freq_t doc_freq;
+
+ memset(&doc_freq, 0, sizeof(doc_freq));
+
+ doc_freq.freq = 0;
+ doc_freq.doc_id = doc_id;
+
+ parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
+
+ query->total_size += SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_doc_freq_t);
+ }
+
+ return(rbt_value(fts_doc_freq_t, parent.last));
+}
+
+/*******************************************************************//**
+Add the doc id to the query set only if it's not in the
+deleted array. */
+static
+void
+fts_query_union_doc_id(
+/*===================*/
+ fts_query_t* query, /*!< in: query instance */
+ doc_id_t doc_id, /*!< in: the doc id to add */
+ fts_rank_t rank) /*!< in: if non-zero, it is the
+ rank associated with the doc_id */
+{
+ ib_rbt_bound_t parent;
+ ulint size = ib_vector_size(query->deleted->doc_ids);
+ doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data;
+
+ /* Check if the doc id is deleted and it's not already in our set. */
+ if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
+ && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
+
+ fts_ranking_t ranking;
+
+ ranking.rank = rank;
+ ranking.doc_id = doc_id;
+ fts_ranking_words_create(query, &ranking);
+
+ rbt_add_node(query->doc_ids, &parent, &ranking);
+
+ query->total_size += SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
+ }
+}
+
+/*******************************************************************//**
+Remove the doc id from the query set only if it's not in the
+deleted set. */
+static
+void
+fts_query_remove_doc_id(
+/*====================*/
+ fts_query_t* query, /*!< in: query instance */
+ doc_id_t doc_id) /*!< in: the doc id to add */
+{
+ ib_rbt_bound_t parent;
+ ulint size = ib_vector_size(query->deleted->doc_ids);
+ doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data;
+
+ /* Check if the doc id is deleted and it's in our set. */
+ if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
+ && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+ ut_free(rbt_remove_node(query->doc_ids, parent.last));
+
+ ut_ad(query->total_size >=
+ SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
+ query->total_size -= SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t);
+ }
+}
+
+/*******************************************************************//**
+Find the doc id in the query set but not in the deleted set, and artificially
+downgrade or upgrade its ranking by a fixed value, possibly pushing the
+ranking below or above its normal range of 0 to 1. This is used for Boolean
+Search operators such as the negation operator, which makes a word's
+contribution to the row's relevance negative. */
+static
+void
+fts_query_change_ranking(
+/*====================*/
+ fts_query_t* query, /*!< in: query instance */
+ doc_id_t doc_id, /*!< in: the doc id to add */
+ ibool downgrade) /*!< in: Whether to downgrade ranking */
+{
+ ib_rbt_bound_t parent;
+ ulint size = ib_vector_size(query->deleted->doc_ids);
+ doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data;
+
+ /* Check if the doc id is deleted and it's in our set. */
+ if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0
+ && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, parent.last);
+
+ ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
+
+ /* Clamp the rank to [-1.0, 1.0] after adjustment by
+ RANK_DOWNGRADE (-1.0) or RANK_UPGRADE (+1.0) */
+ if (ranking->rank >= 1.0F) {
+ ranking->rank = 1.0F;
+ } else if (ranking->rank <= -1.0F) {
+ ranking->rank = -1.0F;
+ }
+ }
+}
+
+/*******************************************************************//**
+Check the doc id in the query set only if it's not in the
+deleted array. The doc ids that were found are stored in
+another rb tree (fts_query_t::intersection). */
+static
+void
+fts_query_intersect_doc_id(
+/*=======================*/
+ fts_query_t* query, /*!< in: query instance */
+ doc_id_t doc_id, /*!< in: the doc id to add */
+ fts_rank_t rank) /*!< in: if non-zero, it is the
+ rank associated with the doc_id */
+{
+ ib_rbt_bound_t parent;
+ ulint size = ib_vector_size(query->deleted->doc_ids);
+ doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data;
+ fts_ranking_t* ranking = NULL;
+
+ /* There are three types of intersect:
+ 1. '+a': doc_ids is empty; add a doc into intersect if it matches 'a'.
+ 2. 'a +b': docs matching 'a' are in doc_ids; add a doc into intersect
+ if it matches 'b'. If the doc is also in doc_ids, then change the
+ doc's rank, and add 'a' to the doc's words.
+ 3. '+a +b': docs matching '+a' are in doc_ids; add a doc into intersect
+ if it matches 'b' and it is in doc_ids (multi_exist = true). */
+
+ /* Check if the doc id is deleted and it's in our set */
+ if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0) {
+ fts_ranking_t new_ranking;
+
+ if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
+ if (query->multi_exist) {
+ return;
+ } else {
+ new_ranking.words = NULL;
+ }
+ } else {
+ ranking = rbt_value(fts_ranking_t, parent.last);
+
+ /* We've just checked the doc id before */
+ if (ranking->words == NULL) {
+ ut_ad(rbt_search(query->intersection, &parent,
+ ranking) == 0);
+ return;
+ }
+
+ /* Merge rank */
+ rank += ranking->rank;
+ if (rank >= 1.0F) {
+ rank = 1.0F;
+ } else if (rank <= -1.0F) {
+ rank = -1.0F;
+ }
+
+ /* Take words */
+ new_ranking.words = ranking->words;
+ new_ranking.words_len = ranking->words_len;
+ }
+
+ new_ranking.rank = rank;
+ new_ranking.doc_id = doc_id;
+
+ if (rbt_search(query->intersection, &parent,
+ &new_ranking) != 0) {
+ if (new_ranking.words == NULL) {
+ fts_ranking_words_create(query, &new_ranking);
+
+ query->total_size += RANKING_WORDS_INIT_LEN;
+ } else {
+ /* Note that the intersection has taken
+ ownership of the ranking data. */
+ ranking->words = NULL;
+ }
+
+ rbt_add_node(query->intersection,
+ &parent, &new_ranking);
+
+ query->total_size += SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t);
+ }
+ }
+}
+
+/*******************************************************************//**
+Free the document ranking rb tree. */
+static
+void
+fts_query_free_doc_ids(
+/*===================*/
+ fts_query_t* query, /*!< in: query instance */
+ ib_rbt_t* doc_ids) /*!< in: rb tree to free */
+{
+ const ib_rbt_node_t* node;
+
+ for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
+
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, node);
+
+ if (ranking->words) {
+ ranking->words = NULL;
+ }
+
+ ut_free(rbt_remove_node(doc_ids, node));
+
+ ut_ad(query->total_size >=
+ SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
+ query->total_size -= SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t);
+ }
+
+ rbt_free(doc_ids);
+
+ ut_ad(query->total_size >= SIZEOF_RBT_CREATE);
+ query->total_size -= SIZEOF_RBT_CREATE;
+}
+
+/*******************************************************************//**
+Add the word to the document's "list" of matching words from
+the query. We make a copy of the word from the query heap. */
+static
+void
+fts_query_add_word_to_document(
+/*===========================*/
+ fts_query_t* query, /*!< in: query to update */
+ doc_id_t doc_id, /*!< in: the document to update */
+ const fts_string_t* word) /*!< in: the token to add */
+{
+ ib_rbt_bound_t parent;
+ fts_ranking_t* ranking = NULL;
+
+ if (query->flags == FTS_OPT_RANKING) {
+ return;
+ }
+
+ /* First we search the intersection RB tree as it could have
+ taken ownership of the words rb tree instance. */
+ if (query->intersection
+ && rbt_search(query->intersection, &parent, &doc_id) == 0) {
+
+ ranking = rbt_value(fts_ranking_t, parent.last);
+ }
+
+ if (ranking == NULL
+ && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+
+ ranking = rbt_value(fts_ranking_t, parent.last);
+ }
+
+ if (ranking != NULL) {
+ fts_ranking_words_add(query, ranking, word);
+ }
+}
+
+/*******************************************************************//**
+Check the node ilist. */
+static
+void
+fts_query_check_node(
+/*=================*/
+ fts_query_t* query, /*!< in: query to update */
+ const fts_string_t* token, /*!< in: the token to search */
+ const fts_node_t* node) /*!< in: node to check */
+{
+ /* Skip nodes whose doc ids are out of range. */
+ if (query->oper == FTS_EXIST
+ && ((query->upper_doc_id > 0
+ && node->first_doc_id > query->upper_doc_id)
+ || (query->lower_doc_id > 0
+ && node->last_doc_id < query->lower_doc_id))) {
+
+ /* Ignore */
+
+ } else {
+ int ret;
+ ib_rbt_bound_t parent;
+ ulint ilist_size = node->ilist_size;
+ fts_word_freq_t*word_freqs;
+
+ /* The word must exist. */
+ ret = rbt_search(query->word_freqs, &parent, token);
+ ut_a(ret == 0);
+
+ word_freqs = rbt_value(fts_word_freq_t, parent.last);
+
+ query->error = fts_query_filter_doc_ids(
+ query, token, word_freqs, node,
+ node->ilist, ilist_size, TRUE);
+ }
+}
+
+/*****************************************************************//**
+Search index cache for word with wildcard match.
+@return number of words matched */
+static
+ulint
+fts_cache_find_wildcard(
+/*====================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_index_cache_t*index_cache, /*!< in: cache to search */
+ const fts_string_t* token) /*!< in: token to search */
+{
+ ib_rbt_bound_t parent;
+ const ib_vector_t* nodes = NULL;
+ fts_string_t srch_text;
+ byte term[FTS_MAX_WORD_LEN + 1];
+ ulint num_word = 0;
+
+ srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
+ ? token->f_len - 1
+ : token->f_len;
+
+ strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
+ term[srch_text.f_len] = '\0';
+ srch_text.f_str = term;
+
+ /* Lookup the word in the rb tree */
+ if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
+ innobase_fts_text_cmp_prefix) == 0) {
+ const fts_tokenizer_word_t* word;
+ ulint i;
+ const ib_rbt_node_t* cur_node;
+ ibool forward = FALSE;
+
+ word = rbt_value(fts_tokenizer_word_t, parent.last);
+ cur_node = parent.last;
+
+ while (innobase_fts_text_cmp_prefix(
+ index_cache->charset, &srch_text, &word->text) == 0) {
+
+ nodes = word->nodes;
+
+ for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
+ int ret;
+ const fts_node_t* node;
+ ib_rbt_bound_t freq_parent;
+ fts_word_freq_t* word_freqs;
+
+ node = static_cast<const fts_node_t*>(
+ ib_vector_get_const(nodes, i));
+
+ ret = rbt_search(query->word_freqs,
+ &freq_parent,
+ &srch_text);
+
+ ut_a(ret == 0);
+
+ word_freqs = rbt_value(
+ fts_word_freq_t,
+ freq_parent.last);
+
+ query->error = fts_query_filter_doc_ids(
+ query, &srch_text,
+ word_freqs, node,
+ node->ilist, node->ilist_size, TRUE);
+
+ if (query->error != DB_SUCCESS) {
+ return(0);
+ }
+ }
+
+ num_word++;
+
+ if (!forward) {
+ cur_node = rbt_prev(
+ index_cache->words, cur_node);
+ } else {
+cont_search:
+ cur_node = rbt_next(
+ index_cache->words, cur_node);
+ }
+
+ if (!cur_node) {
+ break;
+ }
+
+ word = rbt_value(fts_tokenizer_word_t, cur_node);
+ }
+
+ if (!forward) {
+ forward = TRUE;
+ cur_node = parent.last;
+ goto cont_search;
+ }
+ }
+
+ return(num_word);
+}
+
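+/* A standalone sketch of the prefix scan above, with a std::map in
+place of the rb tree; standard C++ only, names illustrative. The code
+above walks in both directions because rbt_search_cmp() may land in
+the middle of the run of matching words; with lower_bound() a forward
+walk alone is enough, since prefix matches are contiguous. */
+#if 0
+#include <map>
+#include <string>
+#include <vector>
+
+static std::vector<std::string>
+find_by_prefix(const std::map<std::string, int>& words, std::string term)
+{
+	if (!term.empty() && term.back() == '%') {
+		term.pop_back();	/* strip the trailing wildcard */
+	}
+
+	std::vector<std::string>	hits;
+
+	/* lower_bound() yields the first key >= the prefix. */
+	for (auto it = words.lower_bound(term);
+	     it != words.end()
+	     && it->first.compare(0, term.size(), term) == 0;
+	     ++it) {
+		hits.push_back(it->first);
+	}
+
+	return(hits);
+}
+#endif
+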
+/*****************************************************************//**
+Set difference.
+@return DB_SUCCESS if all go well */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_query_difference(
+/*=================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_string_t* token) /*!< in: token to search */
+{
+ ulint n_doc_ids= 0;
+ trx_t* trx = query->trx;
+ dict_table_t* table = query->index->table;
+
+ ut_a(query->oper == FTS_IGNORE);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ {
+ ib::info out;
+ out << "DIFFERENCE: Searching: '";
+ out.write(token->f_str, token->f_len);
+ out << "'";
+ }
+#endif
+
+ if (query->doc_ids) {
+ n_doc_ids = rbt_size(query->doc_ids);
+ }
+
+	/* There is nothing we can subtract from an empty set. */
+ if (query->doc_ids && !rbt_empty(query->doc_ids)) {
+ ulint i;
+ fts_fetch_t fetch;
+ const ib_vector_t* nodes;
+ const fts_index_cache_t*index_cache;
+ que_t* graph = NULL;
+ fts_cache_t* cache = table->fts->cache;
+ dberr_t error;
+
+ rw_lock_x_lock(&cache->lock);
+
+ index_cache = fts_find_index_cache(cache, query->index);
+
+ /* Must find the index cache */
+ ut_a(index_cache != NULL);
+
+ /* Search the cache for a matching word first. */
+ if (query->cur_node->term.wildcard
+ && query->flags != FTS_PROXIMITY
+ && query->flags != FTS_PHRASE) {
+ fts_cache_find_wildcard(query, index_cache, token);
+ } else {
+ nodes = fts_cache_find_word(index_cache, token);
+
+ for (i = 0; nodes && i < ib_vector_size(nodes)
+ && query->error == DB_SUCCESS; ++i) {
+ const fts_node_t* node;
+
+ node = static_cast<const fts_node_t*>(
+ ib_vector_get_const(nodes, i));
+
+ fts_query_check_node(query, token, node);
+ }
+ }
+
+ rw_lock_x_unlock(&cache->lock);
+
+ /* error is passed by 'query->error' */
+ if (query->error != DB_SUCCESS) {
+ ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ return(query->error);
+ }
+
+ /* Setup the callback args for filtering and
+ consolidating the ilist. */
+ fetch.read_arg = query;
+ fetch.read_record = fts_query_index_fetch_nodes;
+
+ error = fts_index_fetch_nodes(
+ trx, &graph, &query->fts_index_table, token, &fetch);
+
+ /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+ ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+ if (error != DB_SUCCESS) {
+ query->error = error;
+ }
+
+ fts_que_graph_free(graph);
+ }
+
+ /* The size can't increase. */
+ ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
+
+ return(query->error);
+}
+
+/*****************************************************************//**
+Intersect the token doc ids with the current set.
+@return DB_SUCCESS if all go well */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_query_intersect(
+/*================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_string_t* token) /*!< in: the token to search */
+{
+ trx_t* trx = query->trx;
+ dict_table_t* table = query->index->table;
+
+ ut_a(query->oper == FTS_EXIST);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ {
+ ib::info out;
+ out << "INTERSECT: Searching: '";
+ out.write(token->f_str, token->f_len);
+ out << "'";
+ }
+#endif
+
+	/* If the current doc id set is already empty and this is not
+	the first FTS_EXIST operation, we know the intersection set
+	is empty in advance, so skip the search. */
+ if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
+ ulint n_doc_ids = 0;
+ ulint i;
+ fts_fetch_t fetch;
+ const ib_vector_t* nodes;
+ const fts_index_cache_t*index_cache;
+ que_t* graph = NULL;
+ fts_cache_t* cache = table->fts->cache;
+ dberr_t error;
+
+ ut_a(!query->intersection);
+
+ n_doc_ids = rbt_size(query->doc_ids);
+
+ /* Create the rb tree that will hold the doc ids of
+ the intersection. */
+ query->intersection = rbt_create(
+ sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+ query->total_size += SIZEOF_RBT_CREATE;
+
+ /* This is to avoid decompressing the ilist if the
+ node's ilist doc ids are out of range. */
+ if (!rbt_empty(query->doc_ids) && query->multi_exist) {
+ const ib_rbt_node_t* node;
+ doc_id_t* doc_id;
+
+ node = rbt_first(query->doc_ids);
+ doc_id = rbt_value(doc_id_t, node);
+ query->lower_doc_id = *doc_id;
+
+ node = rbt_last(query->doc_ids);
+ doc_id = rbt_value(doc_id_t, node);
+ query->upper_doc_id = *doc_id;
+
+ } else {
+ query->lower_doc_id = 0;
+ query->upper_doc_id = 0;
+ }
+
+ /* Search the cache for a matching word first. */
+
+ rw_lock_x_lock(&cache->lock);
+
+ /* Search for the index specific cache. */
+ index_cache = fts_find_index_cache(cache, query->index);
+
+ /* Must find the index cache. */
+ ut_a(index_cache != NULL);
+
+ if (query->cur_node->term.wildcard) {
+ /* Wildcard search the index cache */
+ fts_cache_find_wildcard(query, index_cache, token);
+ } else {
+ nodes = fts_cache_find_word(index_cache, token);
+
+ for (i = 0; nodes && i < ib_vector_size(nodes)
+ && query->error == DB_SUCCESS; ++i) {
+ const fts_node_t* node;
+
+ node = static_cast<const fts_node_t*>(
+ ib_vector_get_const(nodes, i));
+
+ fts_query_check_node(query, token, node);
+ }
+ }
+
+ rw_lock_x_unlock(&cache->lock);
+
+ /* error is passed by 'query->error' */
+ if (query->error != DB_SUCCESS) {
+ ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ return(query->error);
+ }
+
+ /* Setup the callback args for filtering and
+ consolidating the ilist. */
+ fetch.read_arg = query;
+ fetch.read_record = fts_query_index_fetch_nodes;
+
+ error = fts_index_fetch_nodes(
+ trx, &graph, &query->fts_index_table, token, &fetch);
+
+ /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+ ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+ if (error != DB_SUCCESS) {
+ query->error = error;
+ }
+
+ fts_que_graph_free(graph);
+
+ if (query->error == DB_SUCCESS) {
+			/* Make the intersection (rb tree) the current doc id
+ set and free the old set. */
+ fts_query_free_doc_ids(query, query->doc_ids);
+ query->doc_ids = query->intersection;
+ query->intersection = NULL;
+
+ ut_a(!query->multi_exist || (query->multi_exist
+ && rbt_size(query->doc_ids) <= n_doc_ids));
+ }
+ }
+
+ return(query->error);
+}
+
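+/* A standalone sketch of the range pruning used above: when doing the
+intersection, any candidate doc id outside [lower_doc_id, upper_doc_id]
+of the current set can be rejected before the expensive ilist
+decompression. Standard C++ only, names illustrative. */
+#if 0
+#include <cstdint>
+#include <set>
+
+static std::set<uint64_t>
+intersect_sets(const std::set<uint64_t>& current,
+	       const std::set<uint64_t>& candidates)
+{
+	std::set<uint64_t>	result;
+
+	if (current.empty()) {
+		return(result);	/* nothing intersects an empty set */
+	}
+
+	const uint64_t	lower = *current.begin();
+	const uint64_t	upper = *current.rbegin();
+
+	for (uint64_t id : candidates) {
+		/* Cheap range check first, set lookup second. */
+		if (id >= lower && id <= upper && current.count(id)) {
+			result.insert(id);
+		}
+	}
+
+	return(result);
+}
+#endif
+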
+/*****************************************************************//**
+Query index cache.
+@return DB_SUCCESS if all go well */
+static
+dberr_t
+fts_query_cache(
+/*============*/
+ fts_query_t* query, /*!< in/out: query instance */
+ const fts_string_t* token) /*!< in: token to search */
+{
+ const fts_index_cache_t*index_cache;
+ dict_table_t* table = query->index->table;
+ fts_cache_t* cache = table->fts->cache;
+
+ /* Search the cache for a matching word first. */
+ rw_lock_x_lock(&cache->lock);
+
+ /* Search for the index specific cache. */
+ index_cache = fts_find_index_cache(cache, query->index);
+
+ /* Must find the index cache. */
+ ut_a(index_cache != NULL);
+
+ if (query->cur_node->term.wildcard
+ && query->flags != FTS_PROXIMITY
+ && query->flags != FTS_PHRASE) {
+ /* Wildcard search the index cache */
+ fts_cache_find_wildcard(query, index_cache, token);
+ } else {
+ const ib_vector_t* nodes;
+ ulint i;
+
+ nodes = fts_cache_find_word(index_cache, token);
+
+ for (i = 0; nodes && i < ib_vector_size(nodes)
+ && query->error == DB_SUCCESS; ++i) {
+ const fts_node_t* node;
+
+ node = static_cast<const fts_node_t*>(
+ ib_vector_get_const(nodes, i));
+
+ fts_query_check_node(query, token, node);
+ }
+ }
+
+ rw_lock_x_unlock(&cache->lock);
+
+ return(query->error);
+}
+
+/*****************************************************************//**
+Set union.
+@return DB_SUCCESS if all go well */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_query_union(
+/*============*/
+ fts_query_t* query, /*!< in: query instance */
+ fts_string_t* token) /*!< in: token to search */
+{
+ fts_fetch_t fetch;
+ ulint n_doc_ids = 0;
+ trx_t* trx = query->trx;
+ que_t* graph = NULL;
+ dberr_t error;
+
+ ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
+ query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ {
+ ib::info out;
+ out << "UNION: Searching: '";
+ out.write(token->f_str, token->f_len);
+ out << "'";
+ }
+#endif
+
+ if (query->doc_ids) {
+ n_doc_ids = rbt_size(query->doc_ids);
+ }
+
+ if (token->f_len == 0) {
+ return(query->error);
+ }
+
+ fts_query_cache(query, token);
+
+ /* Setup the callback args for filtering and
+ consolidating the ilist. */
+ fetch.read_arg = query;
+ fetch.read_record = fts_query_index_fetch_nodes;
+
+ /* Read the nodes from disk. */
+ error = fts_index_fetch_nodes(
+ trx, &graph, &query->fts_index_table, token, &fetch);
+
+ /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+ ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+ if (error != DB_SUCCESS) {
+ query->error = error;
+ }
+
+ fts_que_graph_free(graph);
+
+ if (query->error == DB_SUCCESS) {
+
+ /* The size can't decrease. */
+ ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
+
+		/* Calculate the number of doc ids that were added to
+ the current doc id set. */
+ if (query->doc_ids) {
+ n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
+ }
+ }
+
+ return(query->error);
+}
+
+/*****************************************************************//**
+Depending upon the current query operator, process the doc id.
+@return DB_SUCCESS if all go well,
+or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
+static
+dberr_t
+fts_query_process_doc_id(
+/*=====================*/
+ fts_query_t* query, /*!< in: query instance */
+ doc_id_t doc_id, /*!< in: doc id to process */
+ fts_rank_t rank) /*!< in: if non-zero, it is the
+ rank associated with the doc_id */
+{
+ if (query->flags == FTS_OPT_RANKING) {
+ return(DB_SUCCESS);
+ }
+
+ switch (query->oper) {
+ case FTS_NONE:
+ fts_query_union_doc_id(query, doc_id, rank);
+ break;
+
+ case FTS_EXIST:
+ fts_query_intersect_doc_id(query, doc_id, rank);
+ break;
+
+ case FTS_IGNORE:
+ fts_query_remove_doc_id(query, doc_id);
+ break;
+
+ case FTS_NEGATE:
+ fts_query_change_ranking(query, doc_id, TRUE);
+ break;
+
+ case FTS_DECR_RATING:
+ fts_query_union_doc_id(query, doc_id, rank);
+ fts_query_change_ranking(query, doc_id, TRUE);
+ break;
+
+ case FTS_INCR_RATING:
+ fts_query_union_doc_id(query, doc_id, rank);
+ fts_query_change_ranking(query, doc_id, FALSE);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ if (query->total_size > fts_result_cache_limit) {
+ return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ } else {
+ return(DB_SUCCESS);
+ }
+}
+
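+/* A condensed sketch of how the IN BOOLEAN MODE operators map to the
+set operations dispatched above; the empty stubs stand in for the
+fts_query_*() handlers and are purely illustrative. */
+#if 0
+#include <cstdint>
+
+static void union_add(uint64_t) {}	/* fts_query_union_doc_id() */
+static void intersect_add(uint64_t) {}	/* fts_query_intersect_doc_id() */
+static void remove_id(uint64_t) {}	/* fts_query_remove_doc_id() */
+static void change_rank(uint64_t, bool) {} /* fts_query_change_ranking() */
+
+static void process_doc_id(char oper, uint64_t doc_id)
+{
+	switch (oper) {
+	case ' ':	/* bare word (FTS_NONE): union */
+		union_add(doc_id);
+		break;
+	case '+':	/* FTS_EXIST: intersection */
+		intersect_add(doc_id);
+		break;
+	case '-':	/* FTS_IGNORE: set difference */
+		remove_id(doc_id);
+		break;
+	case '~':	/* FTS_NEGATE: keep, demote the rank */
+		change_rank(doc_id, true);
+		break;
+	case '<':	/* FTS_DECR_RATING: union, then demote */
+		union_add(doc_id);
+		change_rank(doc_id, true);
+		break;
+	case '>':	/* FTS_INCR_RATING: union, then promote */
+		union_add(doc_id);
+		change_rank(doc_id, false);
+		break;
+	}
+}
+#endif
+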
+/*****************************************************************//**
+Merge two result sets. */
+static
+dberr_t
+fts_merge_doc_ids(
+/*==============*/
+ fts_query_t* query, /*!< in,out: query instance */
+ const ib_rbt_t* doc_ids) /*!< in: result set to merge */
+{
+ const ib_rbt_node_t* node;
+
+ DBUG_ENTER("fts_merge_doc_ids");
+
+ ut_a(!query->intersection);
+
+ /* To process FTS_EXIST operation (intersection), we need
+ to create a new result set for fts_query_intersect(). */
+ if (query->oper == FTS_EXIST) {
+
+ query->intersection = rbt_create(
+ sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+ query->total_size += SIZEOF_RBT_CREATE;
+ }
+
+ /* Merge the elements to the result set. */
+ for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
+ fts_ranking_t* ranking;
+ ulint pos = 0;
+ fts_string_t word;
+
+ ranking = rbt_value(fts_ranking_t, node);
+
+ query->error = fts_query_process_doc_id(
+ query, ranking->doc_id, ranking->rank);
+
+ if (query->error != DB_SUCCESS) {
+ DBUG_RETURN(query->error);
+ }
+
+ /* Merge words. Don't need to take operator into account. */
+ ut_a(ranking->words);
+ while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
+ fts_query_add_word_to_document(query, ranking->doc_id,
+ &word);
+ }
+ }
+
+ /* If it is an intersection operation, reset query->doc_ids
+ to query->intersection and free the old result list. */
+ if (query->oper == FTS_EXIST && query->intersection != NULL) {
+ fts_query_free_doc_ids(query, query->doc_ids);
+ query->doc_ids = query->intersection;
+ query->intersection = NULL;
+ }
+
+ DBUG_RETURN(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Skip non-whitespace in a string. Move ptr to the next word boundary.
+@return pointer to first whitespace character or end */
+UNIV_INLINE
+byte*
+fts_query_skip_word(
+/*================*/
+ byte* ptr, /*!< in: start of scan */
+ const byte* end) /*!< in: pointer to end of string */
+{
+ /* TODO: Does this have to be UTF-8 too ? */
+ while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
+ ++ptr;
+ }
+
+ return(ptr);
+}
+
+/*****************************************************************//**
+Check whether the remaining terms in the phrase match the text.
+@return TRUE if matched else FALSE */
+static
+ibool
+fts_query_match_phrase_terms(
+/*=========================*/
+ fts_phrase_t* phrase, /*!< in: phrase to match */
+ byte** start, /*!< in/out: text to search, we can't
+					make this const because we need to
+ first convert the string to
+ lowercase */
+ const byte* end, /*!< in: pointer to the end of
+ the string to search */
+ mem_heap_t* heap) /*!< in: heap */
+{
+ ulint i;
+ byte* ptr = *start;
+ const ib_vector_t* tokens = phrase->tokens;
+ ulint distance = phrase->distance;
+
+ /* We check only from the second term onwards, since the first
+ must have matched otherwise we wouldn't be here. */
+ for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
+ fts_string_t match;
+ fts_string_t cmp_str;
+ const fts_string_t* token;
+ int result;
+ ulint ret;
+
+ ret = innobase_mysql_fts_get_token(
+ phrase->charset, ptr,
+ const_cast<byte*>(end), &match);
+
+ if (match.f_len > 0) {
+ /* Get next token to match. */
+ token = static_cast<const fts_string_t*>(
+ ib_vector_get_const(tokens, i));
+
+ fts_string_dup(&cmp_str, &match, heap);
+
+ result = innobase_fts_text_case_cmp(
+ phrase->charset, token, &cmp_str);
+
+ /* Skip the rest of the tokens if this one doesn't
+ match and the proximity distance is exceeded. */
+ if (result
+ && (distance == ULINT_UNDEFINED
+ || distance == 0)) {
+
+ break;
+ }
+
+			/* This token matched, move on to the next token. */
+ if (result == 0) {
+ /* Advance the text to search by the length
+ of the last token. */
+ ptr += ret;
+
+ /* Advance to the next token. */
+ ++i;
+ } else {
+
+ ut_a(distance != ULINT_UNDEFINED);
+
+ ptr = fts_query_skip_word(ptr, end);
+ }
+
+ /* Distance can be 0 for exact matches. */
+ if (distance != ULINT_UNDEFINED && distance > 0) {
+ --distance;
+ }
+ } else {
+ ptr += ret;
+ }
+ }
+
+ *start = ptr;
+
+ /* Can't be greater than the number of elements. */
+ ut_a(i <= ib_vector_size(tokens));
+
+ /* This is the case for multiple words. */
+ if (i == ib_vector_size(tokens)) {
+ phrase->found = TRUE;
+ }
+
+ return(phrase->found);
+}
+
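+/* A standalone sketch of the walk above: after the first term has
+matched, consume tokens from the text and advance through the rest
+of the phrase; a mismatch fails an exact phrase (distance 0) or
+spends one unit of the proximity budget. Whitespace tokenisation
+stands in for the charset-aware tokenizer; names illustrative. */
+#if 0
+#include <sstream>
+#include <string>
+#include <vector>
+
+static bool
+match_remaining_terms(const std::vector<std::string>& phrase,
+		      std::istringstream& text,	/* just past term 0 */
+		      size_t distance)		/* 0 = exact phrase */
+{
+	size_t		i = 1;	/* term 0 matched by the caller */
+	std::string	word;
+
+	while (i < phrase.size() && text >> word) {
+		if (word == phrase[i]) {
+			++i;		/* next phrase term */
+		} else if (distance == 0) {
+			return(false);	/* exact match required */
+		} else {
+			--distance;	/* spend proximity budget */
+		}
+	}
+
+	return(i == phrase.size());
+}
+#endif
+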
+/*****************************************************************//**
+Callback function to count the number of words in position ranges,
+and check whether the word count is within the specified "phrase->distance"
+@return true if the number of words is within the "distance" */
+static
+bool
+fts_proximity_is_word_in_range(
+/*===========================*/
+ const fts_phrase_t*
+ phrase, /*!< in: phrase with the search info */
+ byte* start, /*!< in: text to search */
+ ulint total_len) /*!< in: length of text */
+{
+ fts_proximity_t* proximity_pos = phrase->proximity_pos;
+
+ ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size());
+ ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size());
+
+ /* Search each matched position pair (with min and max positions)
+ and count the number of words in the range */
+ for (ulint i = 0; i < proximity_pos->n_pos; i++) {
+ ulint cur_pos = proximity_pos->min_pos[i];
+ ulint n_word = 0;
+
+ ut_ad(proximity_pos->max_pos[i] <= total_len);
+
+ /* Walk through words in the range and count them */
+ while (cur_pos <= proximity_pos->max_pos[i]) {
+ ulint len;
+ fts_string_t str;
+
+ len = innobase_mysql_fts_get_token(
+ phrase->charset,
+ start + cur_pos,
+ start + total_len, &str);
+
+ if (len == 0) {
+ break;
+ }
+
+ /* Advances position with "len" bytes */
+ cur_pos += len;
+
+ /* Record the number of words */
+ if (str.f_n_char > 0) {
+ n_word++;
+ }
+
+ if (n_word > phrase->distance) {
+ break;
+ }
+ }
+
+ /* Check if the number of words is less than specified
+ "distance" */
+ if (n_word && n_word <= phrase->distance) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
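+/* A standalone sketch of the check above: count the tokens inside a
+candidate [min_pos, max_pos] byte range and accept the range when the
+word count stays within the allowed distance. Whitespace separation
+stands in for the charset-aware tokenizer; names illustrative. */
+#if 0
+#include <cctype>
+#include <cstddef>
+
+static bool
+words_within_distance(const char* text, size_t min_pos, size_t max_pos,
+		      size_t distance)
+{
+	size_t	n_words = 0;
+	size_t	i = min_pos;
+
+	while (i <= max_pos && n_words <= distance) {
+		/* Skip separators to the start of the next word. */
+		while (i <= max_pos && isspace((unsigned char) text[i])) {
+			i++;
+		}
+
+		if (i > max_pos) {
+			break;
+		}
+
+		n_words++;
+
+		/* Skip over the word itself. */
+		while (i <= max_pos && !isspace((unsigned char) text[i])) {
+			i++;
+		}
+	}
+
+	return(n_words > 0 && n_words <= distance);
+}
+#endif
+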
+/*****************************************************************//**
+FTS plugin parser 'mysql_add_word' callback function for phrase match.
+Refer to 'st_mysql_ftparser_param' for more detail.
+@return 0 if match, or return non-zero */
+static
+int
+fts_query_match_phrase_add_word_for_parser(
+/*=======================================*/
+ MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */
+ const char* word, /*!< in: token */
+ int word_len, /*!< in: token length */
+ MYSQL_FTPARSER_BOOLEAN_INFO*)
+{
+ fts_phrase_param_t* phrase_param;
+ fts_phrase_t* phrase;
+ const ib_vector_t* tokens;
+ fts_string_t match;
+ fts_string_t cmp_str;
+ const fts_string_t* token;
+ int result;
+ mem_heap_t* heap;
+
+ phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam);
+ heap = phrase_param->heap;
+ phrase = phrase_param->phrase;
+ tokens = phrase->tokens;
+
+	/* In case the plugin parser doesn't check the return value */
+ if (phrase_param->token_index == ib_vector_size(tokens)) {
+ return(1);
+ }
+
+ match.f_str = (uchar *)(word);
+ match.f_len = ulint(word_len);
+ match.f_n_char= fts_get_token_size(phrase->charset, word, match.f_len);
+
+ if (match.f_len > 0) {
+ /* Get next token to match. */
+ ut_a(phrase_param->token_index < ib_vector_size(tokens));
+ token = static_cast<const fts_string_t*>(
+ ib_vector_get_const(tokens, phrase_param->token_index));
+
+ fts_string_dup(&cmp_str, &match, heap);
+
+ result = innobase_fts_text_case_cmp(
+ phrase->charset, token, &cmp_str);
+
+ if (result == 0) {
+ phrase_param->token_index++;
+ } else {
+ return(1);
+ }
+ }
+
+ /* Can't be greater than the number of elements. */
+ ut_a(phrase_param->token_index <= ib_vector_size(tokens));
+
+ /* This is the case for multiple words. */
+ if (phrase_param->token_index == ib_vector_size(tokens)) {
+ phrase->found = TRUE;
+ }
+
+ return(static_cast<int>(phrase->found));
+}
+
+/*****************************************************************//**
+Check whether the terms in the phrase match the text.
+@return TRUE if matched else FALSE */
+static
+ibool
+fts_query_match_phrase_terms_by_parser(
+/*===================================*/
+ fts_phrase_param_t* phrase_param, /* in/out: phrase param */
+ st_mysql_ftparser* parser, /* in: plugin fts parser */
+ byte* text, /* in: text to check */
+ ulint len) /* in: text length */
+{
+ MYSQL_FTPARSER_PARAM param;
+
+ ut_a(parser);
+
+	/* Set up the parameters for the plugin parser */
+ param.mysql_parse = fts_tokenize_document_internal;
+ param.mysql_add_word = fts_query_match_phrase_add_word_for_parser;
+ param.mysql_ftparam = phrase_param;
+ param.cs = phrase_param->phrase->charset;
+ param.doc = reinterpret_cast<char*>(text);
+ param.length = static_cast<int>(len);
+ param.mode= MYSQL_FTPARSER_WITH_STOPWORDS;
+
+ PARSER_INIT(parser, &param);
+ parser->parse(&param);
+ PARSER_DEINIT(parser, &param);
+
+ return(phrase_param->phrase->found);
+}
+
+/*****************************************************************//**
+Match the phrase tokens against the fetched document text.
+@return TRUE if matched else FALSE */
+static
+ibool
+fts_query_match_phrase(
+/*===================*/
+ fts_phrase_t* phrase, /*!< in: phrase to match */
+ byte* start, /*!< in: text to search, we can't make
+					this const because we need to first
+ convert the string to lowercase */
+ ulint cur_len, /*!< in: length of text */
+ ulint prev_len, /*!< in: total length for searched
+ doc fields*/
+ mem_heap_t* heap) /* heap */
+{
+ ulint i;
+ const fts_string_t* first;
+ const byte* end = start + cur_len;
+ const ib_vector_t* tokens = phrase->tokens;
+ const ib_vector_t* positions = phrase->match->positions;
+
+ ut_a(!phrase->found);
+ ut_a(phrase->match->doc_id > 0);
+ ut_a(ib_vector_size(tokens) > 0);
+ ut_a(ib_vector_size(positions) > 0);
+
+ first = static_cast<const fts_string_t*>(
+ ib_vector_get_const(tokens, 0));
+
+ ut_a(phrase->match->start < ib_vector_size(positions));
+
+ for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
+ ulint pos;
+ byte* ptr = start;
+
+ pos = *(ulint*) ib_vector_get_const(positions, i);
+
+ if (pos == ULINT_UNDEFINED) {
+ break;
+ }
+
+ if (pos < prev_len) {
+ continue;
+ }
+
+ /* Document positions are calculated from the beginning
+ of the first field, need to save the length for each
+		searched field to adjust the doc position when searching
+ phrases. */
+ pos -= prev_len;
+ ptr = start + pos;
+
+ /* Within limits ? */
+ if (ptr >= end) {
+ break;
+ }
+
+ if (phrase->parser) {
+ fts_phrase_param_t phrase_param;
+
+ phrase_param.phrase = phrase;
+ phrase_param.token_index = 0;
+ phrase_param.heap = heap;
+
+ if (fts_query_match_phrase_terms_by_parser(
+ &phrase_param,
+ phrase->parser,
+ ptr,
+ ulint(end - ptr))) {
+ break;
+ }
+ } else {
+ fts_string_t match;
+ fts_string_t cmp_str;
+ ulint ret;
+
+ match.f_str = ptr;
+ ret = innobase_mysql_fts_get_token(
+ phrase->charset, start + pos,
+ const_cast<byte*>(end), &match);
+
+ if (match.f_len == 0) {
+ break;
+ }
+
+ fts_string_dup(&cmp_str, &match, heap);
+
+ if (innobase_fts_text_case_cmp(
+ phrase->charset, first, &cmp_str) == 0) {
+
+ /* This is the case for the single word
+ in the phrase. */
+ if (ib_vector_size(phrase->tokens) == 1) {
+ phrase->found = TRUE;
+ break;
+ }
+
+ ptr += ret;
+
+ /* Match the remaining terms in the phrase. */
+ if (fts_query_match_phrase_terms(phrase, &ptr,
+ end, heap)) {
+ break;
+ }
+ }
+ }
+ }
+
+ return(phrase->found);
+}
+
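+/* A sketch of the position-driven outer loop above: each stored hit
+of the first term is a byte offset from the start of the first
+indexed column, so it is shifted by the combined length of the
+columns already searched before matching is attempted in the current
+column. match_phrase_at() is an illustrative stub; standard C++. */
+#if 0
+#include <cstddef>
+#include <vector>
+
+static bool match_phrase_at(size_t) { return(false); }	/* stub */
+
+static bool
+try_positions(const std::vector<size_t>& positions, /* term 0 hits */
+	      size_t prev_len,	/* bytes in columns already searched */
+	      size_t cur_len)	/* bytes in the current column */
+{
+	for (size_t pos : positions) {
+		if (pos < prev_len) {
+			continue;	/* hit in an earlier column */
+		}
+
+		size_t	offset = pos - prev_len;
+
+		if (offset >= cur_len) {
+			break;		/* beyond this column */
+		}
+
+		if (match_phrase_at(offset)) {
+			return(true);
+		}
+	}
+
+	return(false);
+}
+#endif
+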
+/*****************************************************************//**
+Callback function to fetch and search the document.
+@return whether the phrase is found */
+static
+ibool
+fts_query_fetch_document(
+/*=====================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts_doc_t* */
+{
+
+ que_node_t* exp;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg);
+ ulint prev_len = 0;
+ ulint total_len = 0;
+ byte* document_text = NULL;
+
+ exp = node->select_list;
+
+ phrase->found = FALSE;
+
+ /* For proximity search, we will need to get the whole document
+ from all fields, so first count the total length of the document
+ from all the fields */
+ if (phrase->proximity_pos) {
+ while (exp) {
+ ulint field_len;
+ dfield_t* dfield = que_node_get_val(exp);
+ byte* data = static_cast<byte*>(
+ dfield_get_data(dfield));
+
+ if (dfield_is_ext(dfield)) {
+ ulint local_len = dfield_get_len(dfield);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ field_len = mach_read_from_4(
+ data + local_len + BTR_EXTERN_LEN + 4);
+ } else {
+ field_len = dfield_get_len(dfield);
+ }
+
+ if (field_len != UNIV_SQL_NULL) {
+ total_len += field_len + 1;
+ }
+
+ exp = que_node_get_next(exp);
+ }
+
+ document_text = static_cast<byte*>(mem_heap_zalloc(
+ phrase->heap, total_len));
+
+ if (!document_text) {
+ return(FALSE);
+ }
+ }
+
+ exp = node->select_list;
+
+ while (exp) {
+ dfield_t* dfield = que_node_get_val(exp);
+ byte* data = static_cast<byte*>(
+ dfield_get_data(dfield));
+ ulint cur_len;
+
+ if (dfield_is_ext(dfield)) {
+ data = btr_copy_externally_stored_field(
+ &cur_len, data, phrase->zip_size,
+ dfield_get_len(dfield), phrase->heap);
+ } else {
+ cur_len = dfield_get_len(dfield);
+ }
+
+ if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
+ if (phrase->proximity_pos) {
+ ut_ad(prev_len + cur_len <= total_len);
+ memcpy(document_text + prev_len, data, cur_len);
+ } else {
+ /* For phrase search */
+ phrase->found =
+ fts_query_match_phrase(
+ phrase,
+ static_cast<byte*>(data),
+ cur_len, prev_len,
+ phrase->heap);
+ }
+
+ /* Document positions are calculated from the beginning
+ of the first field, need to save the length for each
+			searched field to adjust the doc position when searching
+ phrases. */
+ prev_len += cur_len + 1;
+ }
+
+ if (phrase->found) {
+ break;
+ }
+
+ exp = que_node_get_next(exp);
+ }
+
+ if (phrase->proximity_pos) {
+ ut_ad(prev_len <= total_len);
+
+ phrase->found = fts_proximity_is_word_in_range(
+ phrase, document_text, total_len);
+ }
+
+ return(phrase->found);
+}
+
+#if 0
+/********************************************************************
+Callback function to check whether a record was found or not. */
+static
+ibool
+fts_query_select(
+/*=============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts_doc_t* */
+{
+ int i;
+ que_node_t* exp;
+ sel_node_t* node = row;
+ fts_select_t* select = user_arg;
+
+ ut_a(select->word_freq);
+ ut_a(select->word_freq->doc_freqs);
+
+ exp = node->select_list;
+
+ for (i = 0; exp && !select->found; ++i) {
+ dfield_t* dfield = que_node_get_val(exp);
+ void* data = dfield_get_data(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ switch (i) {
+ case 0: /* DOC_COUNT */
+ if (len != UNIV_SQL_NULL && len != 0) {
+
+ select->word_freq->doc_count +=
+ mach_read_from_4(data);
+ }
+ break;
+
+ case 1: /* ILIST */
+ if (len != UNIV_SQL_NULL && len != 0) {
+
+ fts_query_find_doc_id(select, data, len);
+ }
+ break;
+
+ default:
+ ut_error;
+ }
+
+ exp = que_node_get_next(exp);
+ }
+
+ return(FALSE);
+}
+
+/********************************************************************
+Read the rows from the FTS index, that match word and where the
+doc id is between first and last doc id.
+@return DB_SUCCESS if all go well else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_query_find_term(
+/*================*/
+ fts_query_t* query, /*!< in: FTS query state */
+ que_t** graph, /*!< in: prepared statement */
+ const fts_string_t* word, /*!< in: the word to fetch */
+ doc_id_t doc_id, /*!< in: doc id to match */
+ ulint* min_pos,/*!< in/out: pos found must be
+ greater than this minimum value. */
+ ibool* found) /*!< out: TRUE if found else FALSE */
+{
+ pars_info_t* info;
+ dberr_t error;
+ fts_select_t select;
+ doc_id_t match_doc_id;
+ trx_t* trx = query->trx;
+ char table_name[MAX_FULL_NAME_LEN];
+
+ trx->op_info = "fetching FTS index matching nodes";
+
+ if (*graph) {
+ info = (*graph)->info;
+ } else {
+ ulint selected;
+
+ info = pars_info_create();
+
+ selected = fts_select_index(*word->f_str);
+ query->fts_index_table.suffix = fts_get_suffix(selected);
+
+ fts_get_table_name(&query->fts_index_table, table_name);
+ pars_info_bind_id(info, true, "index_table_name", table_name);
+ }
+
+ select.found = FALSE;
+ select.doc_id = doc_id;
+ select.min_pos = *min_pos;
+ select.word_freq = fts_query_add_word_freq(query, word->f_str);
+
+ pars_info_bind_function(info, "my_func", fts_query_select, &select);
+ pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &match_doc_id, doc_id);
+
+ fts_bind_doc_id(info, "min_doc_id", &match_doc_id);
+
+ fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
+
+ if (!*graph) {
+
+ *graph = fts_parse_sql(
+ &query->fts_index_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT doc_count, ilist\n"
+ " FROM $index_table_name\n"
+ " WHERE word LIKE :word AND"
+ " first_doc_id <= :min_doc_id AND"
+ " last_doc_id >= :max_doc_id\n"
+ " ORDER BY first_doc_id;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+ }
+
+ for (;;) {
+ error = fts_eval_sql(trx, *graph);
+
+ if (error == DB_SUCCESS) {
+
+ break; /* Exit the loop. */
+ } else {
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ ib::warn() << "lock wait timeout reading FTS"
+ " index. Retrying!";
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ ib::error() << error
+ << " while reading FTS index.";
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ /* Value to return */
+ *found = select.found;
+
+ if (*found) {
+ *min_pos = select.min_pos;
+ }
+
+ return(error);
+}
+
+/********************************************************************
+Callback aggregator for int columns. */
+static
+ibool
+fts_query_sum(
+/*==========*/
+ /*!< out: always returns TRUE */
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: ulint* */
+{
+
+ que_node_t* exp;
+ sel_node_t* node = row;
+ ulint* total = user_arg;
+
+ exp = node->select_list;
+
+ while (exp) {
+ dfield_t* dfield = que_node_get_val(exp);
+ void* data = dfield_get_data(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ if (len != UNIV_SQL_NULL && len != 0) {
+ *total += mach_read_from_4(data);
+ }
+
+ exp = que_node_get_next(exp);
+ }
+
+ return(TRUE);
+}
+
+/********************************************************************
+Calculate the total documents that contain a particular word (term).
+@return DB_SUCCESS if all go well else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_query_total_docs_containing_term(
+/*=================================*/
+ fts_query_t* query, /*!< in: FTS query state */
+ const fts_string_t* word, /*!< in: the word to check */
+ ulint* total) /*!< out: documents containing word */
+{
+ pars_info_t* info;
+ dberr_t error;
+ que_t* graph;
+ ulint selected;
+ trx_t* trx = query->trx;
+	char table_name[MAX_FULL_NAME_LEN];
+
+ trx->op_info = "fetching FTS index document count";
+
+ *total = 0;
+
+ info = pars_info_create();
+
+ pars_info_bind_function(info, "my_func", fts_query_sum, total);
+ pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+ selected = fts_select_index(*word->f_str);
+
+ query->fts_index_table.suffix = fts_get_suffix(selected);
+
+ fts_get_table_name(&query->fts_index_table, table_name);
+
+ pars_info_bind_id(info, true, "index_table_name", table_name);
+
+ graph = fts_parse_sql(
+ &query->fts_index_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT doc_count\n"
+ " FROM $index_table_name\n"
+ " WHERE word = :word"
+ " ORDER BY first_doc_id;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ for (;;) {
+ error = fts_eval_sql(trx, graph);
+
+ if (error == DB_SUCCESS) {
+
+ break; /* Exit the loop. */
+ } else {
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ ib::warn() << "lock wait timeout reading FTS"
+ " index. Retrying!";
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ ib::error() << error
+ << " while reading FTS index.";
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ fts_que_graph_free(graph);
+
+ return(error);
+}
+
+/********************************************************************
+Get the total number of words in a document.
+@return DB_SUCCESS if all go well else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_query_terms_in_document(
+/*========================*/
+ fts_query_t* query, /*!< in: FTS query state */
+ doc_id_t doc_id, /*!< in: the word to check */
+ ulint* total) /*!< out: total words in document */
+{
+ pars_info_t* info;
+ dberr_t error;
+ que_t* graph;
+ doc_id_t read_doc_id;
+ trx_t* trx = query->trx;
+ char table_name[MAX_FULL_NAME_LEN];
+
+ trx->op_info = "fetching FTS document term count";
+
+ *total = 0;
+
+ info = pars_info_create();
+
+ pars_info_bind_function(info, "my_func", fts_query_sum, total);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &read_doc_id, doc_id);
+ fts_bind_doc_id(info, "doc_id", &read_doc_id);
+
+ query->fts_index_table.suffix = "DOC_ID";
+
+ fts_get_table_name(&query->fts_index_table, table_name);
+
+ pars_info_bind_id(info, true, "index_table_name", table_name);
+
+ graph = fts_parse_sql(
+ &query->fts_index_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT count\n"
+ " FROM $index_table_name\n"
+		" WHERE doc_id = :doc_id;\n"
+		"BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ for (;;) {
+ error = fts_eval_sql(trx, graph);
+
+ if (error == DB_SUCCESS) {
+
+ break; /* Exit the loop. */
+ } else {
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ ib::warn() << "lock wait timeout reading FTS"
+ " doc id table. Retrying!";
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ ib::error() << error << " while reading FTS"
+ " doc id table.";
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ fts_que_graph_free(graph);
+
+ return(error);
+}
+#endif
+
+/*****************************************************************//**
+Retrieve the document and match the phrase tokens.
+@return DB_SUCCESS or error code */
+MY_ATTRIBUTE((nonnull(1,2,3,6), warn_unused_result))
+static
+dberr_t
+fts_query_match_document(
+/*=====================*/
+ ib_vector_t* tokens, /*!< in: phrase tokens */
+ fts_get_doc_t* get_doc, /*!< in: table and prepared statements */
+ fts_match_t* match, /*!< in: doc id and positions */
+ ulint distance, /*!< in: proximity distance */
+ st_mysql_ftparser* parser, /*!< in: fts plugin parser */
+ ibool* found) /*!< out: TRUE if phrase found */
+{
+ dberr_t error;
+ fts_phrase_t phrase(get_doc->index_cache->index->table);
+
+ phrase.match = match; /* Positions to match */
+ phrase.tokens = tokens; /* Tokens to match */
+ phrase.distance = distance;
+ phrase.charset = get_doc->index_cache->charset;
+ phrase.heap = mem_heap_create(512);
+ phrase.parser = parser;
+
+ *found = phrase.found = FALSE;
+
+ error = fts_doc_fetch_by_doc_id(
+ get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
+ fts_query_fetch_document, &phrase);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ib::error() << "(" << error << ") matching document.";
+ } else {
+ *found = phrase.found;
+ }
+
+ mem_heap_free(phrase.heap);
+
+ return(error);
+}
+
+/*****************************************************************//**
+This function fetches the original document and counts the number of
+words between the matching words, to check whether they are within the
+specified proximity distance.
+@return true if the words are within the specified distance */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+fts_query_is_in_proximity_range(
+/*============================*/
+ const fts_query_t* query, /*!< in: query instance */
+	fts_match_t**		match,		/*!< in: matched doc id and
+						positions */
+ fts_proximity_t* qualified_pos) /*!< in: position info for
+ qualified ranges */
+{
+ fts_get_doc_t get_doc;
+ fts_cache_t* cache = query->index->table->fts->cache;
+ dberr_t err;
+
+ memset(&get_doc, 0x0, sizeof(get_doc));
+
+ rw_lock_x_lock(&cache->lock);
+ get_doc.index_cache = fts_find_index_cache(cache, query->index);
+ rw_lock_x_unlock(&cache->lock);
+ ut_a(get_doc.index_cache != NULL);
+
+ fts_phrase_t phrase(get_doc.index_cache->index->table);
+
+ phrase.distance = query->distance;
+ phrase.charset = get_doc.index_cache->charset;
+ phrase.heap = mem_heap_create(512);
+ phrase.proximity_pos = qualified_pos;
+ phrase.found = FALSE;
+
+ err = fts_doc_fetch_by_doc_id(
+ &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
+ fts_query_fetch_document, &phrase);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ ib::error() << "(" << err << ") in verification"
+ " phase of proximity search";
+ }
+
+ /* Free the prepared statement. */
+ if (get_doc.get_document_graph) {
+ fts_que_graph_free(get_doc.get_document_graph);
+ get_doc.get_document_graph = NULL;
+ }
+
+ mem_heap_free(phrase.heap);
+
+ return(err == DB_SUCCESS && phrase.found);
+}
+
+/*****************************************************************//**
+Iterate over the matched document ids and search for the
+actual phrase in the text.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_query_search_phrase(
+/*====================*/
+ fts_query_t* query, /*!< in: query instance */
+ ib_vector_t* orig_tokens, /*!< in: tokens to search,
+ with any stopwords in the
+ original phrase */
+	ib_vector_t*	tokens)		/*!< in: tokens that do
+ not include stopwords and
+ can be used to calculate
+ ranking */
+{
+ ulint i;
+ fts_get_doc_t get_doc;
+ ulint n_matched;
+ fts_cache_t* cache = query->index->table->fts->cache;
+
+ n_matched = ib_vector_size(query->matched);
+
+ /* Setup the doc retrieval infrastructure. */
+ memset(&get_doc, 0x0, sizeof(get_doc));
+
+ rw_lock_x_lock(&cache->lock);
+
+ get_doc.index_cache = fts_find_index_cache(cache, query->index);
+
+ /* Must find the index cache */
+ ut_a(get_doc.index_cache != NULL);
+
+ rw_lock_x_unlock(&cache->lock);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ ib::info() << "Start phrase search";
+#endif
+
+ /* Read the document from disk and do the actual
+ match, matching documents will be added to the current
+ doc id set. */
+ for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
+ fts_match_t* match;
+ ibool found = FALSE;
+
+ match = static_cast<fts_match_t*>(
+ ib_vector_get(query->matched, i));
+
+ /* Skip the document ids that were filtered out by
+ an earlier pass. */
+ if (match->doc_id != 0) {
+
+ query->error = fts_query_match_document(
+ orig_tokens, &get_doc, match,
+ query->distance, query->parser, &found);
+
+ if (query->error == DB_SUCCESS && found) {
+ ulint z;
+
+ query->error = fts_query_process_doc_id(query,
+ match->doc_id, 0);
+ if (query->error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ for (z = 0; z < ib_vector_size(tokens); z++) {
+ fts_string_t* token;
+ token = static_cast<fts_string_t*>(
+ ib_vector_get(tokens, z));
+ fts_query_add_word_to_document(
+ query, match->doc_id, token);
+ }
+ }
+ }
+ }
+
+func_exit:
+ /* Free the prepared statement. */
+ if (get_doc.get_document_graph) {
+ fts_que_graph_free(get_doc.get_document_graph);
+ get_doc.get_document_graph = NULL;
+ }
+
+ return(query->error);
+}
+
+/** Split the phrase into tokens
+@param[in,out] query query instance
+@param[in] node query node to search
+@param[in,out] tokens token vector
+@param[in,out] orig_tokens original node tokens include stopword
+@param[in,out] heap mem heap */
+static
+void
+fts_query_phrase_split(
+ fts_query_t* query,
+ const fts_ast_node_t* node,
+ ib_vector_t* tokens,
+ ib_vector_t* orig_tokens,
+ mem_heap_t* heap)
+{
+ fts_string_t phrase;
+ ulint len = 0;
+ ulint cur_pos = 0;
+ fts_ast_node_t* term_node = NULL;
+
+ if (node->type == FTS_AST_TEXT) {
+ phrase.f_str = node->text.ptr->str;
+ phrase.f_len = node->text.ptr->len;
+ len = phrase.f_len;
+ } else {
+ ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
+ phrase.f_str = NULL;
+ phrase.f_len = 0;
+ term_node = node->list.head;
+ }
+
+ while (true) {
+ fts_cache_t* cache = query->index->table->fts->cache;
+ ulint cur_len;
+ fts_string_t result_str;
+
+ if (node->type == FTS_AST_TEXT) {
+ if (cur_pos >= len) {
+ break;
+ }
+
+ cur_len = innobase_mysql_fts_get_token(
+ query->fts_index_table.charset,
+ reinterpret_cast<const byte*>(phrase.f_str)
+ + cur_pos,
+ reinterpret_cast<const byte*>(phrase.f_str)
+ + len,
+ &result_str);
+
+ if (cur_len == 0) {
+ break;
+ }
+
+ cur_pos += cur_len;
+ } else {
+ ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
+ /* Term node in parser phrase list */
+ if (term_node == NULL) {
+ break;
+ }
+
+ ut_a(term_node->type == FTS_AST_TERM);
+ result_str.f_str = term_node->term.ptr->str;
+ result_str.f_len = term_node->term.ptr->len;
+ result_str.f_n_char = fts_get_token_size(
+ query->fts_index_table.charset,
+ reinterpret_cast<char*>(result_str.f_str),
+ result_str.f_len);
+
+ term_node = term_node->next;
+ }
+
+ if (result_str.f_n_char == 0) {
+ continue;
+ }
+
+ fts_string_t* token = static_cast<fts_string_t*>(
+ ib_vector_push(tokens, NULL));
+ fts_string_dup(token, &result_str, heap);
+
+ if (fts_check_token(
+ &result_str,
+ cache->stopword_info.cached_stopword,
+ query->fts_index_table.charset)) {
+ /* Add the word to the RB tree so that we can
+			calculate its frequency within a document. */
+ fts_query_add_word_freq(query, token);
+ } else {
+ ib_vector_pop(tokens);
+ }
+
+		/* We will start to store all words, including stopwords,
+		in the "orig_tokens" vector, but skip any leading words
+		that are stopwords. */
+ if (!ib_vector_is_empty(tokens)) {
+ fts_string_t* orig_token = static_cast<fts_string_t*>(
+ ib_vector_push(orig_tokens, NULL));
+
+ orig_token->f_str = token->f_str;
+ orig_token->f_len = token->f_len;
+ }
+ }
+}
+
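+/* A standalone sketch of the split above: every token is kept for
+exact-phrase matching ("orig_tokens"), only non-stopwords are kept
+for the index search and ranking ("tokens"), and leading stopwords
+are dropped entirely. Whitespace tokenisation and std::set stand in
+for the charset-aware tokenizer and the stopword rb tree. */
+#if 0
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+static void
+split_phrase(const std::string& phrase,
+	     const std::set<std::string>& stopwords,
+	     std::vector<std::string>& tokens,	    /* search/ranking */
+	     std::vector<std::string>& orig_tokens) /* phrase match */
+{
+	std::istringstream	in(phrase);
+	std::string		word;
+
+	while (in >> word) {
+		if (stopwords.count(word) == 0) {
+			tokens.push_back(word);
+		}
+
+		/* Keep stopwords too, but only once a real token
+		has been seen: leading stopwords are dropped. */
+		if (!tokens.empty()) {
+			orig_tokens.push_back(word);
+		}
+	}
+}
+#endif
+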
+/*****************************************************************//**
+Text/Phrase search.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+fts_query_phrase_search(
+/*====================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_ast_node_t* node) /*!< in: node to search */
+{
+ ib_vector_t* tokens;
+ ib_vector_t* orig_tokens;
+ mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t));
+ ib_alloc_t* heap_alloc;
+ ulint num_token;
+
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
+ orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
+
+ if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
+ query->flags = FTS_PROXIMITY;
+ } else {
+ query->flags = FTS_PHRASE;
+ }
+
+ /* Split the phrase into tokens. */
+ fts_query_phrase_split(query, node, tokens, orig_tokens, heap);
+
+ num_token = ib_vector_size(tokens);
+ if (num_token > MAX_PROXIMITY_ITEM) {
+ query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
+ goto func_exit;
+ }
+
+ ut_ad(ib_vector_size(orig_tokens) >= num_token);
+
+ /* Ignore empty strings. */
+ if (num_token > 0) {
+ fts_string_t* token = NULL;
+ fts_fetch_t fetch;
+ trx_t* trx = query->trx;
+ fts_ast_oper_t oper = query->oper;
+ que_t* graph = NULL;
+ ulint i;
+ dberr_t error;
+
+ /* Create the vector for storing matching document ids
+ and the positions of the first token of the phrase. */
+ if (!query->matched) {
+ ib_alloc_t* heap_alloc;
+
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ if (!(query->flags & FTS_PROXIMITY)
+ && !(query->flags & FTS_PHRASE)) {
+ query->matched = ib_vector_create(
+ heap_alloc, sizeof(fts_match_t),
+ 64);
+ } else {
+ ut_a(num_token <= MAX_PROXIMITY_ITEM);
+ query->match_array =
+ (ib_vector_t**) mem_heap_alloc(
+ heap,
+ num_token *
+ sizeof(query->matched));
+
+ for (i = 0; i < num_token; i++) {
+ query->match_array[i] =
+ ib_vector_create(
+ heap_alloc, sizeof(fts_match_t),
+ 64);
+ }
+
+ query->matched = query->match_array[0];
+ }
+ }
+
+ /* Setup the callback args for filtering and consolidating
+ the ilist. */
+ fetch.read_arg = query;
+ fetch.read_record = fts_query_index_fetch_nodes;
+
+ for (i = 0; i < num_token; i++) {
+ /* Search for the first word from the phrase. */
+ token = static_cast<fts_string_t*>(
+ ib_vector_get(tokens, i));
+
+ if (query->flags & FTS_PROXIMITY
+ || query->flags & FTS_PHRASE) {
+ query->matched = query->match_array[i];
+ }
+
+ error = fts_index_fetch_nodes(
+ trx, &graph, &query->fts_index_table,
+ token, &fetch);
+
+ /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+ ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+ if (error != DB_SUCCESS) {
+ query->error = error;
+ }
+
+ fts_que_graph_free(graph);
+ graph = NULL;
+
+ fts_query_cache(query, token);
+
+ if (!(query->flags & FTS_PHRASE)
+ && !(query->flags & FTS_PROXIMITY)) {
+ break;
+ }
+
+			/* If any of the tokens can't be found,
+			there is no need to continue the match */
+ if (ib_vector_is_empty(query->match_array[i])
+ || query->error != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+
+ /* Just a single word, no need to fetch the original
+ documents to do phrase matching */
+ if (ib_vector_size(orig_tokens) == 1
+ && !ib_vector_is_empty(query->match_array[0])) {
+ fts_match_t* match;
+ ulint n_matched;
+
+ n_matched = ib_vector_size(query->match_array[0]);
+
+ for (i = 0; i < n_matched; i++) {
+ match = static_cast<fts_match_t*>(
+ ib_vector_get(
+ query->match_array[0], i));
+
+ query->error = fts_query_process_doc_id(
+ query, match->doc_id, 0);
+ if (query->error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ fts_query_add_word_to_document(
+ query, match->doc_id, token);
+ }
+ query->oper = oper;
+ goto func_exit;
+ }
+
+		/* If we are doing a proximity search, verify the distance
+		between all words, and check that they are within the
+		specified distance. */
+ if (query->flags & FTS_PROXIMITY) {
+ fts_phrase_or_proximity_search(query, tokens);
+ } else {
+ ibool matched;
+
+ /* Phrase Search case:
+ We filter out the doc ids that don't contain
+ all the tokens in the phrase. It's cheaper to
+ search the ilist than bringing the documents in
+ and then doing a search through the text. Isolated
+ testing shows this also helps in mitigating disruption
+ of the buffer cache. */
+ matched = fts_phrase_or_proximity_search(query, tokens);
+ query->matched = query->match_array[0];
+
+ /* Read the actual text in and search for the phrase. */
+ if (matched) {
+ ut_ad(query->error == DB_SUCCESS);
+ query->error = fts_query_search_phrase(
+ query, orig_tokens, tokens);
+ }
+ }
+
+ /* Restore original operation. */
+ query->oper = oper;
+
+ if (query->error != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+
+func_exit:
+ mem_heap_free(heap);
+
+ /* Don't need it anymore. */
+ query->matched = NULL;
+
+ return(query->error);
+}
+
+/*****************************************************************//**
+Find the word and evaluate.
+@return DB_SUCCESS if all go well */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_query_execute(
+/*==============*/
+ fts_query_t* query, /*!< in: query instance */
+ fts_string_t* token) /*!< in: token to search */
+{
+ switch (query->oper) {
+ case FTS_NONE:
+ case FTS_NEGATE:
+ case FTS_INCR_RATING:
+ case FTS_DECR_RATING:
+ query->error = fts_query_union(query, token);
+ break;
+
+ case FTS_EXIST:
+ query->error = fts_query_intersect(query, token);
+ break;
+
+ case FTS_IGNORE:
+ query->error = fts_query_difference(query, token);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ return(query->error);
+}
+
+/*****************************************************************//**
+Create a wildcard string. It's the responsibility of the caller to
+free the byte* pointer. It's allocated using ut_malloc_nokey().
+@return ptr to allocated memory */
+static
+byte*
+fts_query_get_token(
+/*================*/
+ fts_ast_node_t* node, /*!< in: the current sub tree */
+ fts_string_t* token) /*!< in: token to create */
+{
+ ulint str_len;
+ byte* new_ptr = NULL;
+
+ str_len = node->term.ptr->len;
+
+ ut_a(node->type == FTS_AST_TERM);
+
+ token->f_len = str_len;
+ token->f_str = node->term.ptr->str;
+
+ if (node->term.wildcard) {
+
+ token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + 2));
+ token->f_len = str_len + 1;
+
+ memcpy(token->f_str, node->term.ptr->str, str_len);
+
+ token->f_str[str_len] = '%';
+ token->f_str[token->f_len] = 0;
+
+ new_ptr = token->f_str;
+ }
+
+ return(new_ptr);
+}
+
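+/* A sketch of the allocation contract above: a wildcard term gets a
+fresh buffer with '%' appended, suitable as a LIKE pattern, which the
+caller must free; a plain term aliases the AST node's string and NULL
+is returned. Standard C/C++ only, names illustrative. */
+#if 0
+#include <cstdlib>
+#include <cstring>
+
+/* Returns a malloc()ed pattern that the caller must free(). */
+static char*
+make_like_pattern(const char* term, size_t len)
+{
+	char*	pat = static_cast<char*>(malloc(len + 2));
+
+	if (pat != NULL) {
+		memcpy(pat, term, len);
+		pat[len] = '%';		/* trailing wildcard */
+		pat[len + 1] = '\0';
+	}
+
+	return(pat);
+}
+#endif
+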
+static dberr_t fts_ast_visit_sub_exp(fts_ast_node_t*, fts_ast_callback, void*);
+
+/*****************************************************************//**
+Visit every node of the AST. */
+static
+dberr_t
+fts_query_visitor(
+/*==============*/
+ fts_ast_oper_t oper, /*!< in: current operator */
+ fts_ast_node_t* node, /*!< in: The root of the current subtree*/
+ void* arg) /*!< in: callback arg*/
+{
+ byte* ptr;
+ fts_string_t token;
+ fts_query_t* query = static_cast<fts_query_t*>(arg);
+
+ ut_a(node);
+ DBUG_ENTER("fts_query_visitor");
+ DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));
+
+ token.f_n_char = 0;
+ query->oper = oper;
+ query->cur_node = node;
+
+ switch (node->type) {
+ case FTS_AST_TEXT:
+ case FTS_AST_PARSER_PHRASE_LIST:
+
+ if (query->oper == FTS_EXIST) {
+ ut_ad(query->intersection == NULL);
+ query->intersection = rbt_create(
+ sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+ query->total_size += SIZEOF_RBT_CREATE;
+ }
+
+ /* Set the current proximity distance. */
+ query->distance = node->text.distance;
+
+ /* Force collection of doc ids and the positions. */
+ query->collect_positions = TRUE;
+
+ query->error = fts_query_phrase_search(query, node);
+
+ query->collect_positions = FALSE;
+
+ if (query->oper == FTS_EXIST) {
+ fts_query_free_doc_ids(query, query->doc_ids);
+ query->doc_ids = query->intersection;
+ query->intersection = NULL;
+ }
+
+ break;
+
+ case FTS_AST_TERM:
+ token.f_str = node->term.ptr->str;
+ token.f_len = node->term.ptr->len;
+
+ /* Collect wildcard words for QUERY EXPANSION. */
+ if (node->term.wildcard && query->wildcard_words != NULL) {
+ ib_rbt_bound_t parent;
+
+ if (rbt_search(query->wildcard_words, &parent, &token)
+ != 0) {
+ fts_string_t word;
+
+ fts_string_dup(&word, &token, query->heap);
+ rbt_add_node(query->wildcard_words, &parent,
+ &word);
+ }
+ }
+
+ /* Add the word to our RB tree that will be used to
+	calculate this term's per-document frequency. */
+ fts_query_add_word_freq(query, &token);
+
+ ptr = fts_query_get_token(node, &token);
+ query->error = fts_query_execute(query, &token);
+
+ if (ptr) {
+ ut_free(ptr);
+ }
+
+ break;
+
+ case FTS_AST_SUBEXP_LIST:
+ query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ if (query->oper == FTS_EXIST) {
+ query->multi_exist = true;
+ }
+
+ DBUG_RETURN(query->error);
+}
+
+/** Process (nested) sub-expression, create a new result set to store the
+sub-expression result by processing nodes under current sub-expression
+list. Merge the sub-expression result with that of parent expression list.
+@param[in,out] node current root node
+@param[in,out] visitor callback function
+@param[in,out] arg argument for callback
+@return DB_SUCCESS if all go well */
+static
+dberr_t
+fts_ast_visit_sub_exp(
+ fts_ast_node_t* node,
+ fts_ast_callback visitor,
+ void* arg)
+{
+ fts_ast_oper_t cur_oper;
+ fts_query_t* query = static_cast<fts_query_t*>(arg);
+ ib_rbt_t* parent_doc_ids;
+ ib_rbt_t* subexpr_doc_ids;
+ dberr_t error = DB_SUCCESS;
+ bool will_be_ignored = false;
+ bool multi_exist;
+
+ DBUG_ENTER("fts_ast_visit_sub_exp");
+
+ ut_a(node->type == FTS_AST_SUBEXP_LIST);
+
+ /* To avoid stack overflow, we limit the mutual recursion
+ depth between fts_ast_visit(), fts_query_visitor() and
+ fts_ast_visit_sub_exp(). */
+ if (query->visiting_sub_exp++ > 31) {
+ query->error = DB_OUT_OF_MEMORY;
+ DBUG_RETURN(query->error);
+ }
+
+ cur_oper = query->oper;
+
+ /* Save current result set */
+ parent_doc_ids = query->doc_ids;
+
+ /* Create new result set to store the sub-expression result. We
+ will merge this result set with the parent after processing. */
+ query->doc_ids = rbt_create(sizeof(fts_ranking_t),
+ fts_ranking_doc_id_cmp);
+
+ query->total_size += SIZEOF_RBT_CREATE;
+
+ multi_exist = query->multi_exist;
+ query->multi_exist = false;
+ /* Process nodes in current sub-expression and store its
+ result set in query->doc_ids we created above. */
+ error = fts_ast_visit(FTS_NONE, node, visitor,
+ arg, &will_be_ignored);
+
+ /* Reinstate parent node state */
+ query->multi_exist = multi_exist;
+ query->oper = cur_oper;
+ query->visiting_sub_exp--;
+
+ /* Merge the sub-expression result with the parent result set. */
+ subexpr_doc_ids = query->doc_ids;
+ query->doc_ids = parent_doc_ids;
+ if (error == DB_SUCCESS) {
+ error = fts_merge_doc_ids(query, subexpr_doc_ids);
+ }
+
+ /* Free current result set. Result already merged into parent. */
+ fts_query_free_doc_ids(query, subexpr_doc_ids);
+
+ DBUG_RETURN(error);
+}
+
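+/* A sketch of the depth guard above: the mutual recursion between
+the visitor and the sub-expression handler is bounded by a counter
+in the shared query state, so a deeply nested expression fails
+cleanly instead of overflowing the stack. Names illustrative. */
+#if 0
+struct visit_state {
+	int	depth;		/* current nesting level */
+	/* ... result sets, operator, error status ... */
+};
+
+static bool
+visit_subexpr(visit_state* st)
+{
+	if (st->depth++ > 31) {	/* same limit as above */
+		return(false);	/* refuse to recurse further */
+	}
+
+	/* ... save the parent result set, visit the children
+	(which may re-enter visit_subexpr()), merge results ... */
+
+	st->depth--;
+	return(true);
+}
+#endif
+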
+#if 0
+/*****************************************************************//**
+Check if the doc id exists in the ilist.
+@return TRUE if doc id found */
+static
+ulint
+fts_query_find_doc_id(
+/*==================*/
+ fts_select_t* select, /*!< in/out: contains the doc id to
+ find, we update the word freq if
+ document found */
+ void* data, /*!< in: doc id ilist */
+ ulint len) /*!< in: doc id ilist size */
+{
+ byte* ptr = data;
+ doc_id_t doc_id = 0;
+ ulint decoded = 0;
+
+ /* Decode the ilist and search for selected doc_id. We also
+ calculate the frequency of the word in the document if found. */
+ while (decoded < len && !select->found) {
+ ulint freq = 0;
+ ulint min_pos = 0;
+ ulint last_pos = 0;
+ ulint pos = fts_decode_vlc(&ptr);
+
+ /* Add the delta. */
+ doc_id += pos;
+
+ while (*ptr) {
+ ++freq;
+ last_pos += fts_decode_vlc(&ptr);
+
+ /* Only if min_pos is not set and the current
+ term exists in a position greater than the
+ min_pos of the previous term. */
+ if (min_pos == 0 && last_pos > select->min_pos) {
+ min_pos = last_pos;
+ }
+ }
+
+ /* Skip the end of word position marker. */
+ ++ptr;
+
+ /* Bytes decoded so far. */
+ decoded = ptr - (byte*) data;
+
+ /* A word may exist in the document but we only consider a
+ match if it exists in a position that is greater than the
+ position of the previous term. */
+ if (doc_id == select->doc_id && min_pos > 0) {
+ fts_doc_freq_t* doc_freq;
+
+ /* Add the doc id to the doc freq rb tree, if
+ the doc id doesn't exist it will be created. */
+ doc_freq = fts_query_add_doc_freq(
+ select->word_freq->doc_freqs, doc_id);
+
+ /* Avoid duplicating the frequency tally */
+ if (doc_freq->freq == 0) {
+ doc_freq->freq = freq;
+ }
+
+ select->found = TRUE;
+ select->min_pos = min_pos;
+ }
+ }
+
+ return(select->found);
+}
+#endif
+
+/*****************************************************************//**
+Read and filter nodes.
+@return DB_SUCCESS if all go well,
+or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
+static
+dberr_t
+fts_query_filter_doc_ids(
+/*=====================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_string_t* word, /*!< in: the current word */
+ fts_word_freq_t* word_freq, /*!< in/out: word frequency */
+ const fts_node_t* node, /*!< in: current FTS node */
+ void* data, /*!< in: doc id ilist */
+ ulint len, /*!< in: doc id ilist size */
+ ibool calc_doc_count) /*!< in: whether to remember doc count */
+{
+ byte* ptr = static_cast<byte*>(data);
+ doc_id_t doc_id = 0;
+ ulint decoded = 0;
+ ib_rbt_t* doc_freqs = word_freq->doc_freqs;
+
+ /* Decode the ilist and add the doc ids to the query doc_id set. */
+ while (decoded < len) {
+ ulint freq = 0;
+ fts_doc_freq_t* doc_freq;
+ fts_match_t* match = NULL;
+ ulint last_pos = 0;
+ ulint pos = fts_decode_vlc(&ptr);
+
+ /* Some sanity checks. */
+ if (doc_id == 0) {
+ ut_a(pos == node->first_doc_id);
+ }
+
+ /* Add the delta. */
+ doc_id += pos;
+
+ if (calc_doc_count) {
+ word_freq->doc_count++;
+ }
+
+ /* We simply collect the matching instances here. */
+ if (query->collect_positions) {
+ ib_alloc_t* heap_alloc;
+
+ /* Create a new fts_match_t instance. */
+ match = static_cast<fts_match_t*>(
+ ib_vector_push(query->matched, NULL));
+
+ match->start = 0;
+ match->doc_id = doc_id;
+ heap_alloc = ib_vector_allocator(query->matched);
+
+ /* Allocate from the same heap as the
+ parent container. */
+ match->positions = ib_vector_create(
+ heap_alloc, sizeof(ulint), 64);
+
+ query->total_size += sizeof(fts_match_t)
+ + sizeof(ib_vector_t)
+ + sizeof(ulint) * 64;
+ }
+
+ /* Unpack the positions within the document. */
+ while (*ptr) {
+ last_pos += fts_decode_vlc(&ptr);
+
+ /* Collect the matching word positions, for phrase
+ matching later. */
+ if (query->collect_positions) {
+ ib_vector_push(match->positions, &last_pos);
+ }
+
+ ++freq;
+ }
+
+ /* End of list marker. */
+ last_pos = (ulint) -1;
+
+ if (query->collect_positions) {
+ ut_a(match != NULL);
+ ib_vector_push(match->positions, &last_pos);
+ }
+
+ /* Add the doc id to the doc freq rb tree, if the doc id
+ doesn't exist it will be created. */
+ doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
+
+ /* Avoid duplicating frequency tally. */
+ if (doc_freq->freq == 0) {
+ doc_freq->freq = freq;
+ }
+
+ /* Skip the end of word position marker. */
+ ++ptr;
+
+ /* Bytes decoded so far. */
+ decoded = ulint(ptr - (byte*) data);
+
+ /* We simply collect the matching documents and the
+ positions here and match later. */
+ if (!query->collect_positions) {
+ /* We ignore error here and will check it later */
+ fts_query_process_doc_id(query, doc_id, 0);
+
+ /* Add the word to the document's matched RB tree. */
+ fts_query_add_word_to_document(query, doc_id, word);
+ }
+ }
+
+ /* Some sanity checks. */
+ ut_a(doc_id == node->last_doc_id);
+
+ if (query->total_size > fts_result_cache_limit) {
+ return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ } else {
+ return(DB_SUCCESS);
+ }
+}
+
+/*****************************************************************//**
+Read the FTS INDEX row.
+@return DB_SUCCESS if all goes well. */
+static
+dberr_t
+fts_query_read_node(
+/*================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_string_t* word, /*!< in: current word */
+ que_node_t* exp) /*!< in: query graph node */
+{
+ int i;
+ int ret;
+ fts_node_t node;
+ ib_rbt_bound_t parent;
+ fts_word_freq_t* word_freq;
+ ibool skip = FALSE;
+ fts_string_t term;
+ byte buf[FTS_MAX_WORD_LEN + 1];
+ dberr_t error = DB_SUCCESS;
+
+ ut_a(query->cur_node->type == FTS_AST_TERM
+ || query->cur_node->type == FTS_AST_TEXT
+ || query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST);
+
+ memset(&node, 0, sizeof(node));
+ term.f_str = buf;
+
+ /* We need to consider the wildcard search case: the word
+ frequency is created for the search string, not the actual
+ word, so we assign the frequency on the search string's
+ behalf. */
+ if (query->cur_node->type == FTS_AST_TERM
+ && query->cur_node->term.wildcard) {
+
+ term.f_len = query->cur_node->term.ptr->len;
+ ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
+ memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
+ } else {
+ term.f_len = word->f_len;
+ ut_ad(FTS_MAX_WORD_LEN >= word->f_len);
+ memcpy(term.f_str, word->f_str, word->f_len);
+ }
+
+ /* Lookup the word in our rb tree, it must exist. */
+ ret = rbt_search(query->word_freqs, &parent, &term);
+
+ ut_a(ret == 0);
+
+ word_freq = rbt_value(fts_word_freq_t, parent.last);
+
+ /* Start from 1 since the first column has been read by the caller.
+ Also, we rely on the order of the projected columns to filter
+ out ilists that are out of range, and we always want to read
+ the doc_count irrespective of the suitability of the row. */
+
+ for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
+
+ dfield_t* dfield = que_node_get_val(exp);
+ byte* data = static_cast<byte*>(
+ dfield_get_data(dfield));
+ ulint len = dfield_get_len(dfield);
+
+ ut_a(len != UNIV_SQL_NULL);
+
+ /* Note: The column numbers below must match the SELECT. */
+
+ switch (i) {
+ case 1: /* DOC_COUNT */
+ word_freq->doc_count += mach_read_from_4(data);
+ break;
+
+ case 2: /* FIRST_DOC_ID */
+ node.first_doc_id = fts_read_doc_id(data);
+
+ /* Skip nodes whose doc ids are out of range. */
+ if (query->oper == FTS_EXIST
+ && query->upper_doc_id > 0
+ && node.first_doc_id > query->upper_doc_id) {
+ skip = TRUE;
+ }
+ break;
+
+ case 3: /* LAST_DOC_ID */
+ node.last_doc_id = fts_read_doc_id(data);
+
+ /* Skip nodes whose doc ids are out of range. */
+ if (query->oper == FTS_EXIST
+ && query->lower_doc_id > 0
+ && node.last_doc_id < query->lower_doc_id) {
+ skip = TRUE;
+ }
+ break;
+
+ case 4: /* ILIST */
+
+ error = fts_query_filter_doc_ids(
+ query, &word_freq->word, word_freq,
+ &node, data, len, FALSE);
+
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+
+ if (!skip) {
+ /* Make sure all columns were read. */
+
+ ut_a(i == 5);
+ }
+
+ return error;
+}
+
+/*****************************************************************//**
+Callback function to fetch the rows in an FTS INDEX record.
+@return TRUE to continue fetching, FALSE on error to stop */
+static
+ibool
+fts_query_index_fetch_nodes(
+/*========================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to fts_fetch_t */
+{
+ fts_string_t key;
+ sel_node_t* sel_node = static_cast<sel_node_t*>(row);
+ fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg);
+ fts_query_t* query = static_cast<fts_query_t*>(fetch->read_arg);
+ que_node_t* exp = sel_node->select_list;
+ dfield_t* dfield = que_node_get_val(exp);
+ void* data = dfield_get_data(dfield);
+ ulint dfield_len = dfield_get_len(dfield);
+
+ key.f_str = static_cast<byte*>(data);
+ key.f_len = dfield_len;
+
+ ut_a(dfield_len <= FTS_MAX_WORD_LEN);
+
+ /* Note: we pass the error out via 'query->error' */
+ query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
+
+ if (query->error != DB_SUCCESS) {
+ ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ return(FALSE);
+ } else {
+ return(TRUE);
+ }
+}
+
+/*****************************************************************//**
+Calculate the inverse document frequency (IDF) for all the terms. */
+static
+void
+fts_query_calculate_idf(
+/*====================*/
+ fts_query_t* query) /*!< in: Query state */
+{
+ const ib_rbt_node_t* node;
+ ib_uint64_t total_docs = query->total_docs;
+
+ /* Iterate over all the terms and compute each term's
+ inverse document frequency (IDF). */
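+ /* For example, with total_docs = 1000 and a term that
+ occurs in 10 documents, idf = log10(1000 / 10) = 2. */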
+ for (node = rbt_first(query->word_freqs);
+ node;
+ node = rbt_next(query->word_freqs, node)) {
+
+ fts_word_freq_t* word_freq;
+
+ word_freq = rbt_value(fts_word_freq_t, node);
+
+ if (word_freq->doc_count > 0) {
+ if (total_docs == word_freq->doc_count) {
+ /* The query processor assumes ranking > 0
+ when we find a match. Since log10(1) = 0,
+ the IDF must not become zero when a word
+ occurs in every document, so use an
+ arbitrary very small value instead. */
+ word_freq->idf = log10(1.0001);
+ } else {
+ word_freq->idf = log10(
+ static_cast<double>(total_docs)
+ / static_cast<double>(
+ word_freq->doc_count));
+ }
+ }
+ }
+}
+
+/*****************************************************************//**
+Calculate the ranking of the document. */
+static
+void
+fts_query_calculate_ranking(
+/*========================*/
+ const fts_query_t* query, /*!< in: query state */
+ fts_ranking_t* ranking) /*!< in: Document to rank */
+{
+ ulint pos = 0;
+ fts_string_t word;
+
+ /* At this stage, ranking->rank should not exceed the 1.0
+ bound */
+ ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
+ ut_ad(rbt_size(query->word_map) == query->word_vector->size());
+
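+ /* Each matching word contributes freq * idf * idf to the
+ document's rank; e.g. (illustrative) freq = 3 with
+ idf = 2 adds 12 to the rank. */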
+ while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
+ int ret;
+ ib_rbt_bound_t parent;
+ double weight;
+ fts_doc_freq_t* doc_freq;
+ fts_word_freq_t* word_freq;
+
+ ret = rbt_search(query->word_freqs, &parent, &word);
+
+ /* It must exist. */
+ ut_a(ret == 0);
+
+ word_freq = rbt_value(fts_word_freq_t, parent.last);
+
+ ret = rbt_search(
+ word_freq->doc_freqs, &parent, &ranking->doc_id);
+
+ /* It must exist. */
+ ut_a(ret == 0);
+
+ doc_freq = rbt_value(fts_doc_freq_t, parent.last);
+
+ weight = (double) doc_freq->freq * word_freq->idf;
+
+ ranking->rank += (fts_rank_t) (weight * word_freq->idf);
+ }
+}
+
+/*****************************************************************//**
+Add ranking to the result set. */
+static
+void
+fts_query_add_ranking(
+/*==================*/
+ fts_query_t* query, /*!< in: query state */
+ ib_rbt_t* ranking_tree, /*!< in: ranking tree */
+ const fts_ranking_t* new_ranking) /*!< in: ranking of a document */
+{
+ ib_rbt_bound_t parent;
+
+ /* Lookup the ranking in our rb tree and add if it doesn't exist. */
+ if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, parent.last);
+
+ ranking->rank += new_ranking->rank;
+
+ ut_a(ranking->words == NULL);
+ } else {
+ rbt_add_node(ranking_tree, &parent, new_ranking);
+
+ query->total_size += SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t);
+ }
+}
+
+/*****************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+@return the relevance ranking value, 0 if no ranking value
+present. */
+float
+fts_retrieve_ranking(
+/*=================*/
+ fts_result_t* result, /*!< in: FTS result structure */
+ doc_id_t doc_id) /*!< in: doc_id of the item to retrieve */
+{
+ ib_rbt_bound_t parent;
+ fts_ranking_t new_ranking;
+
+ DBUG_ENTER("fts_retrieve_ranking");
+
+ if (!result || !result->rankings_by_id) {
+ DBUG_RETURN(0);
+ }
+
+ new_ranking.doc_id = doc_id;
+
+ /* Lookup the ranking in our rb tree */
+ if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, parent.last);
+
+ DBUG_RETURN(ranking->rank);
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Create the result and copy the data to it. */
+static
+fts_result_t*
+fts_query_prepare_result(
+/*=====================*/
+ fts_query_t* query, /*!< in: Query state */
+ fts_result_t* result) /*!< in: result; this can contain
+ data from a previous search on
+ another FTS index */
+{
+ const ib_rbt_node_t* node;
+ bool result_is_null = false;
+
+ DBUG_ENTER("fts_query_prepare_result");
+
+ if (result == NULL) {
+ result = static_cast<fts_result_t*>(
+ ut_zalloc_nokey(sizeof(*result)));
+
+ result->rankings_by_id = rbt_create(
+ sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+ query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
+ result_is_null = true;
+ }
+
+ if (query->flags == FTS_OPT_RANKING) {
+ fts_word_freq_t* word_freq;
+ ulint size = ib_vector_size(query->deleted->doc_ids);
+ doc_id_t* updates =
+ (doc_id_t*) query->deleted->doc_ids->data;
+
+ node = rbt_first(query->word_freqs);
+ ut_ad(node);
+ word_freq = rbt_value(fts_word_freq_t, node);
+
+ for (node = rbt_first(word_freq->doc_freqs);
+ node;
+ node = rbt_next(word_freq->doc_freqs, node)) {
+ fts_doc_freq_t* doc_freq;
+ fts_ranking_t ranking;
+
+ doc_freq = rbt_value(fts_doc_freq_t, node);
+
+ /* Don't put deleted docs into result */
+ if (fts_bsearch(updates, 0, static_cast<int>(size),
+ doc_freq->doc_id) >= 0) {
+ /* one less matching doc count */
+ --word_freq->doc_count;
+ continue;
+ }
+
+ ranking.doc_id = doc_freq->doc_id;
+ ranking.rank = static_cast<fts_rank_t>(doc_freq->freq);
+ ranking.words = NULL;
+
+ fts_query_add_ranking(query, result->rankings_by_id,
+ &ranking);
+
+ if (query->total_size > fts_result_cache_limit) {
+ query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+ fts_query_free_result(result);
+ DBUG_RETURN(NULL);
+ }
+ }
+
+ /* Calculate IDF only after we exclude the deleted items */
+ fts_query_calculate_idf(query);
+
+ node = rbt_first(query->word_freqs);
+ word_freq = rbt_value(fts_word_freq_t, node);
+
+ /* Calculate the ranking for each doc */
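+ /* As in fts_query_calculate_ranking(), the resulting
+ rank amounts to freq * idf * idf for the single term. */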
+ for (node = rbt_first(result->rankings_by_id);
+ node != NULL;
+ node = rbt_next(result->rankings_by_id, node)) {
+
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, node);
+
+ ranking->rank = static_cast<fts_rank_t>(
+ ranking->rank * word_freq->idf * word_freq->idf);
+ }
+
+ DBUG_RETURN(result);
+ }
+
+ ut_a(rbt_size(query->doc_ids) > 0);
+
+ for (node = rbt_first(query->doc_ids);
+ node;
+ node = rbt_next(query->doc_ids, node)) {
+
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, node);
+ fts_query_calculate_ranking(query, ranking);
+
+ // FIXME: I think we may require this information to improve the
+ // ranking of doc ids which have more word matches from
+ // different FTS indexes.
+
+ /* We don't need these anymore free the resources. */
+ ranking->words = NULL;
+
+ if (!result_is_null) {
+ fts_query_add_ranking(query, result->rankings_by_id, ranking);
+
+ if (query->total_size > fts_result_cache_limit) {
+ query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+ fts_query_free_result(result);
+ DBUG_RETURN(NULL);
+ }
+ }
+ }
+
+ if (result_is_null) {
+ /* Use doc_ids directly */
+ rbt_free(result->rankings_by_id);
+ result->rankings_by_id = query->doc_ids;
+ query->doc_ids = NULL;
+ }
+
+ DBUG_RETURN(result);
+}
+
+/*****************************************************************//**
+Get the result of the query. Calculate the similarity coefficient. */
+static
+fts_result_t*
+fts_query_get_result(
+/*=================*/
+ fts_query_t* query, /*!< in: query instance */
+ fts_result_t* result) /*!< in: result */
+{
+ DBUG_ENTER("fts_query_get_result");
+
+ if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
+ /* Copy the doc ids to the result. */
+ result = fts_query_prepare_result(query, result);
+ } else {
+ /* Create an empty result instance. */
+ result = static_cast<fts_result_t*>(
+ ut_zalloc_nokey(sizeof(*result)));
+ }
+
+ DBUG_RETURN(result);
+}
+
+/*****************************************************************//**
+FTS Query free resources and reset. */
+static
+void
+fts_query_free(
+/*===========*/
+ fts_query_t* query) /*!< in: query instance to free*/
+{
+
+ if (query->read_nodes_graph) {
+ fts_que_graph_free(query->read_nodes_graph);
+ }
+
+ if (query->root) {
+ fts_ast_free_node(query->root);
+ }
+
+ if (query->deleted) {
+ fts_doc_ids_free(query->deleted);
+ }
+
+ if (query->intersection) {
+ fts_query_free_doc_ids(query, query->intersection);
+ }
+
+ if (query->doc_ids) {
+ fts_query_free_doc_ids(query, query->doc_ids);
+ }
+
+ if (query->word_freqs) {
+ const ib_rbt_node_t* node;
+
+ /* We need to free any instances of fts_doc_freq_t that we
+ may have allocated. */
+ for (node = rbt_first(query->word_freqs);
+ node;
+ node = rbt_next(query->word_freqs, node)) {
+
+ fts_word_freq_t* word_freq;
+
+ word_freq = rbt_value(fts_word_freq_t, node);
+
+ /* We need to cast away the const. */
+ rbt_free(word_freq->doc_freqs);
+ }
+
+ rbt_free(query->word_freqs);
+ }
+
+ if (query->wildcard_words != NULL) {
+ rbt_free(query->wildcard_words);
+ }
+
+ ut_a(!query->intersection);
+
+ if (query->word_map) {
+ rbt_free(query->word_map);
+ }
+
+ if (query->word_vector != NULL) {
+ UT_DELETE(query->word_vector);
+ }
+
+ if (query->heap) {
+ mem_heap_free(query->heap);
+ }
+
+ memset(query, 0, sizeof(*query));
+}
+
+/*****************************************************************//**
+Parse the query using flex/bison or plugin parser.
+@return parse tree node. */
+static
+fts_ast_node_t*
+fts_query_parse(
+/*============*/
+ fts_query_t* query, /*!< in: query instance */
+ byte* query_str, /*!< in: query string */
+ ulint query_len) /*!< in: query string length */
+{
+ int error;
+ fts_ast_state_t state;
+ bool mode = query->boolean_mode;
+ DBUG_ENTER("fts_query_parse");
+
+ memset(&state, 0x0, sizeof(state));
+
+ state.charset = query->fts_index_table.charset;
+
+ DBUG_EXECUTE_IF("fts_instrument_query_disable_parser",
+ query->parser = NULL;);
+
+ if (query->parser) {
+ state.root = state.cur_node =
+ fts_ast_create_node_list(&state, NULL);
+ error = fts_parse_by_parser(mode, query_str, query_len,
+ query->parser, &state);
+ } else {
+ /* Setup the scanner to use, this depends on the mode flag. */
+ state.lexer = fts_lexer_create(mode, query_str, query_len);
+ state.charset = query->fts_index_table.charset;
+ error = fts_parse(&state);
+ fts_lexer_free(state.lexer);
+ state.lexer = NULL;
+ }
+
+ /* Error during parsing? */
+ if (error) {
+ /* Free the nodes that were allocated during parsing. */
+ fts_ast_state_free(&state);
+ } else {
+ query->root = state.root;
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print) && query->root) {
+ fts_ast_node_print(query->root);
+ }
+ }
+
+ DBUG_RETURN(state.root);
+}
+
+/*******************************************************************//**
+FTS Query optimization
+Set FTS_OPT_RANKING if it is a simple term query */
+static
+void
+fts_query_can_optimize(
+/*===================*/
+ fts_query_t* query, /*!< in/out: query instance */
+ uint flags) /*!< In: FTS search mode */
+{
+ fts_ast_node_t* node = query->root;
+
+ if (flags & FTS_EXPAND) {
+ return;
+ }
+
+ /* Check if it has only a term without oper */
+ ut_ad(node->type == FTS_AST_LIST);
+ node = node->list.head;
+ if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
+ query->flags = FTS_OPT_RANKING;
+ }
+}
+
+/** FTS Query entry point.
+@param[in,out] trx transaction
+@param[in] index fts index to search
+@param[in] flags FTS search mode
+@param[in] query_str FTS query
+@param[in] query_len FTS query string len in bytes
+@param[in,out] result result doc ids
+@return DB_SUCCESS if successful otherwise error code */
+dberr_t
+fts_query(
+ trx_t* trx,
+ dict_index_t* index,
+ uint flags,
+ const byte* query_str,
+ ulint query_len,
+ fts_result_t** result)
+{
+ fts_query_t query;
+ dberr_t error = DB_SUCCESS;
+ byte* lc_query_str;
+ ulint lc_query_str_len;
+ ulint result_len;
+ bool boolean_mode;
+ trx_t* query_trx; /* FIXME: use provided trx */
+ CHARSET_INFO* charset;
+ ulint start_time_ms;
+ bool will_be_ignored = false;
+
+ boolean_mode = flags & FTS_BOOL;
+
+ *result = NULL;
+ memset(&query, 0x0, sizeof(query));
+ query_trx = trx_create();
+ query_trx->op_info = "FTS query";
+
+ start_time_ms = ut_time_ms();
+
+ query.trx = query_trx;
+ query.index = index;
+ query.boolean_mode = boolean_mode;
+ query.deleted = fts_doc_ids_create();
+ query.cur_node = NULL;
+
+ query.fts_common_table.type = FTS_COMMON_TABLE;
+ query.fts_common_table.table_id = index->table->id;
+ query.fts_common_table.table = index->table;
+
+ charset = fts_index_get_charset(index);
+
+ query.fts_index_table.type = FTS_INDEX_TABLE;
+ query.fts_index_table.index_id = index->id;
+ query.fts_index_table.table_id = index->table->id;
+ query.fts_index_table.charset = charset;
+ query.fts_index_table.table = index->table;
+
+ query.word_map = rbt_create_arg_cmp(
+ sizeof(fts_string_t), innobase_fts_text_cmp, (void*)charset);
+ query.word_vector = UT_NEW_NOKEY(word_vector_t());
+ query.error = DB_SUCCESS;
+
+ /* Setup the RB tree that will be used to collect per term
+ statistics. */
+ query.word_freqs = rbt_create_arg_cmp(
+ sizeof(fts_word_freq_t), innobase_fts_text_cmp,
+ (void*) charset);
+
+ if (flags & FTS_EXPAND) {
+ query.wildcard_words = rbt_create_arg_cmp(
+ sizeof(fts_string_t), innobase_fts_text_cmp, (void *)charset);
+ }
+
+ query.total_size += SIZEOF_RBT_CREATE;
+
+ query.total_docs = dict_table_get_n_rows(index->table);
+
+ query.fts_common_table.suffix = "DELETED";
+
+ /* Read the deleted doc_ids, we need these for filtering. */
+ error = fts_table_fetch_doc_ids(
+ NULL, &query.fts_common_table, query.deleted);
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ query.fts_common_table.suffix = "DELETED_CACHE";
+
+ error = fts_table_fetch_doc_ids(
+ NULL, &query.fts_common_table, query.deleted);
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ /* Get the deleted doc ids that are in the cache. */
+ fts_cache_append_deleted_doc_ids(
+ index->table->fts->cache, query.deleted->doc_ids);
+ DEBUG_SYNC_C("fts_deleted_doc_ids_append");
+
+ /* Sort the vector so that we can do a binary search over the ids. */
+ ib_vector_sort(query.deleted->doc_ids, fts_doc_id_cmp);
+
+ /* Convert the query string to lower case before parsing. We own
+ the ut_malloc'ed result, so remember to free it before returning. */
+
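+ /* Note: casedn_multiply is the worst-case growth factor of
+ a string under lower-casing for this charset (it can exceed
+ 1 for some multi-byte charsets); the extra byte is for the
+ terminating NUL. */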
+ lc_query_str_len = query_len * charset->casedn_multiply + 1;
+ lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len));
+
+ /* For binary collations, a case sensitive search is
+ performed. Hence don't convert to lower case. */
+ if (my_binary_compare(charset)) {
+ memcpy(lc_query_str, query_str, query_len);
+ lc_query_str[query_len]= 0;
+ result_len= query_len;
+ } else {
+ result_len = innobase_fts_casedn_str(
+ charset, (char*)( query_str), query_len,
+ (char*)(lc_query_str), lc_query_str_len);
+ }
+
+ ut_ad(result_len < lc_query_str_len);
+
+ lc_query_str[result_len] = 0;
+
+ query.heap = mem_heap_create(128);
+
+ /* Create the rb tree for the doc id (current) set. */
+ query.doc_ids = rbt_create(
+ sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+ query.parser = index->parser;
+
+ query.total_size += SIZEOF_RBT_CREATE;
+
+ /* Parse the input query string. */
+ if (fts_query_parse(&query, lc_query_str, result_len)) {
+ fts_ast_node_t* ast = query.root;
+ ast->trx = trx;
+
+ /* Optimize query to check if it's a single term */
+ fts_query_can_optimize(&query, flags);
+
+ DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
+ fts_result_cache_limit = 2048;
+ );
+
+ /* Traverse the Abstract Syntax Tree (AST) and execute
+ the query. */
+ query.error = fts_ast_visit(
+ FTS_NONE, ast, fts_query_visitor,
+ &query, &will_be_ignored);
+ if (query.error == DB_INTERRUPTED) {
+ error = DB_INTERRUPTED;
+ ut_free(lc_query_str);
+ goto func_exit;
+ }
+
+ /* If query expansion is requested, extend the search
+ with first search pass result */
+ if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
+ query.error = fts_expand_query(index, &query);
+ }
+
+ /* Calculate the inverse document frequency of the terms. */
+ if (query.error == DB_SUCCESS
+ && query.flags != FTS_OPT_RANKING) {
+ fts_query_calculate_idf(&query);
+ }
+
+ /* Copy the result from the query state, so that we can
+ return it to the caller. */
+ if (query.error == DB_SUCCESS) {
+ *result = fts_query_get_result(&query, *result);
+ }
+
+ error = query.error;
+ } else {
+ /* still return an empty result set */
+ *result = static_cast<fts_result_t*>(
+ ut_zalloc_nokey(sizeof(**result)));
+ }
+
+ if (trx_is_interrupted(trx)) {
+ error = DB_INTERRUPTED;
+ ut_free(lc_query_str);
+ if (*result) {
+ fts_query_free_result(*result);
+ }
+ goto func_exit;
+ }
+
+ ut_free(lc_query_str);
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print) && (*result)) {
+ ulint diff_time = ut_time_ms() - start_time_ms;
+
+ ib::info() << "FTS Search Processing time: "
+ << diff_time / 1000 << " secs: " << diff_time % 1000
+ << " millisec: row(s) "
+ << ((*result)->rankings_by_id
+ ? lint(rbt_size((*result)->rankings_by_id))
+ : -1);
+
+ /* Log memory consumption & result size */
+ ib::info() << "Full Search Memory: " << query.total_size
+ << " (bytes), Row: "
+ << ((*result)->rankings_by_id
+ ? rbt_size((*result)->rankings_by_id)
+ : 0)
+ << ".";
+ }
+
+func_exit:
+ fts_query_free(&query);
+
+ query_trx->free();
+
+ return(error);
+}
+
+/*****************************************************************//**
+FTS Query free result, returned by fts_query(). */
+void
+fts_query_free_result(
+/*==================*/
+ fts_result_t* result) /*!< in: result instance to free.*/
+{
+ if (result) {
+ if (result->rankings_by_id != NULL) {
+ rbt_free(result->rankings_by_id);
+ result->rankings_by_id = NULL;
+ }
+ if (result->rankings_by_rank != NULL) {
+ rbt_free(result->rankings_by_rank);
+ result->rankings_by_rank = NULL;
+ }
+
+ ut_free(result);
+ result = NULL;
+ }
+}
+
+/*****************************************************************//**
+FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
+void
+fts_query_sort_result_on_rank(
+/*==========================*/
+ fts_result_t* result) /*!< out: result instance to sort.*/
+{
+ const ib_rbt_node_t* node;
+ ib_rbt_t* ranked;
+
+ ut_a(result->rankings_by_id != NULL);
+ if (result->rankings_by_rank) {
+ rbt_free(result->rankings_by_rank);
+ }
+
+ ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
+
+ /* Copy every ranking from the tree sorted on doc id
+ into the new tree sorted on rank. */
+ for (node = rbt_first(result->rankings_by_id);
+ node;
+ node = rbt_next(result->rankings_by_id, node)) {
+
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, node);
+
+ ut_a(ranking->words == NULL);
+
+ rbt_insert(ranked, ranking, ranking);
+ }
+
+ /* Reset the current node too. */
+ result->current = NULL;
+ result->rankings_by_rank = ranked;
+}
+
+/*******************************************************************//**
+A debug function to print result doc_id set. */
+static
+void
+fts_print_doc_id(
+/*=============*/
+ fts_query_t* query) /*!< in : tree that stores doc_ids.*/
+{
+ const ib_rbt_node_t* node;
+
+ /* Iterate over each member of the doc_id set */
+ for (node = rbt_first(query->doc_ids);
+ node;
+ node = rbt_next(query->doc_ids, node)) {
+ fts_ranking_t* ranking;
+ ranking = rbt_value(fts_ranking_t, node);
+
+ ib::info() << "doc_ids info, doc_id: " << ranking->doc_id;
+
+ ulint pos = 0;
+ fts_string_t word;
+
+ while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
+ ib::info() << "doc_ids info, value: " << word.f_str;
+ }
+ }
+}
+
+/*************************************************************//**
+This function implements a simple "blind" query expansion search:
+words in documents found in the first search pass will be used as
+search arguments to search the documents again, thus "expanding"
+the search result set.
+@return DB_SUCCESS if success, otherwise the error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_expand_query(
+/*=============*/
+ dict_index_t* index, /*!< in: FTS index to search */
+ fts_query_t* query) /*!< in: FTS query instance */
+{
+ const ib_rbt_node_t* node;
+ const ib_rbt_node_t* token_node;
+ fts_doc_t result_doc;
+ dberr_t error = DB_SUCCESS;
+ const fts_index_cache_t*index_cache;
+
+ /* If no doc is found in the first search pass, return. */
+ if (!rbt_size(query->doc_ids)) {
+ return(error);
+ }
+
+ /* Init "result_doc", to hold words from the first search pass */
+ fts_doc_init(&result_doc);
+
+ rw_lock_x_lock(&index->table->fts->cache->lock);
+ index_cache = fts_find_index_cache(index->table->fts->cache, index);
+ rw_lock_x_unlock(&index->table->fts->cache->lock);
+
+ ut_a(index_cache);
+
+ result_doc.tokens = rbt_create_arg_cmp(
+ sizeof(fts_token_t), innobase_fts_text_cmp,
+ (void*) index_cache->charset);
+
+ result_doc.charset = index_cache->charset;
+ result_doc.parser = index_cache->index->parser;
+
+ query->total_size += SIZEOF_RBT_CREATE;
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ fts_print_doc_id(query);
+ }
+
+ for (node = rbt_first(query->doc_ids);
+ node;
+ node = rbt_next(query->doc_ids, node)) {
+
+ fts_ranking_t* ranking;
+ ulint prev_token_size;
+ ulint estimate_size;
+
+ prev_token_size = rbt_size(result_doc.tokens);
+
+ ranking = rbt_value(fts_ranking_t, node);
+
+ /* Fetch the documents with the doc_id from the
+ result of the first search pass. Since we do not
+ store a document-to-word mapping, we need to
+ fetch the original documents and parse them.
+ Future optimization could be done here if we
+ support some form of document-to-word mapping. */
+ fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
+ FTS_FETCH_DOC_BY_ID_EQUAL,
+ fts_query_expansion_fetch_doc,
+ &result_doc);
+
+ /* Estimate memory used, see fts_process_token and fts_token_t.
+ We ignore token size here. */
+ estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
+ * (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
+ + sizeof(ib_vector_t) + sizeof(ulint) * 32);
+ query->total_size += estimate_size;
+
+ if (query->total_size > fts_result_cache_limit) {
+ error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+ goto func_exit;
+ }
+ }
+
+ /* Remove words that have already been searched in the first pass */
+ for (ulint i = 0; i < query->word_vector->size(); i++) {
+ fts_string_t word = query->word_vector->at(i);
+ ib_rbt_bound_t parent;
+
+ if (query->wildcard_words
+ && rbt_search(query->wildcard_words, &parent, &word) == 0) {
+ /* If it's a wildcard word, remove words having
+ it as prefix. */
+ while (rbt_search_cmp(result_doc.tokens,
+ &parent, &word, NULL,
+ innobase_fts_text_cmp_prefix)
+ == 0) {
+ ut_free(rbt_remove_node(result_doc.tokens,
+ parent.last));
+ }
+ } else {
+ /* We don't check the return value, because the word
+ may already have been deleted by a previous wildcard
+ word that is its prefix, e.g. 'g*' and 'good'. */
+ rbt_delete(result_doc.tokens, &word);
+ }
+ }
+
+ /* Search the table the second time with expanded search list */
+ for (token_node = rbt_first(result_doc.tokens);
+ token_node;
+ token_node = rbt_next(result_doc.tokens, token_node)) {
+ fts_token_t* mytoken;
+ mytoken = rbt_value(fts_token_t, token_node);
+
+ /* A '%' at the end is treated as a prefix search;
+ it can cause an assertion failure, so we skip it. */
+ if (mytoken->text.f_str[mytoken->text.f_len - 1] == '%') {
+ continue;
+ }
+
+ ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0);
+ fts_query_add_word_freq(query, &mytoken->text);
+ error = fts_query_union(query, &mytoken->text);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+ }
+
+func_exit:
+ fts_doc_free(&result_doc);
+
+ return(error);
+}
+/*************************************************************//**
+This function finds documents that contain all words in a
+phrase or proximity search. For a proximity search, it also verifies
+that the words are close enough to each other, within the specified
+distance. This function is called for phrase and proximity search.
+@return TRUE if documents are found, FALSE otherwise */
+static
+ibool
+fts_phrase_or_proximity_search(
+/*===========================*/
+ fts_query_t* query, /*!< in/out: query instance.
+ query->doc_ids might be instantiated
+ with qualified doc IDs */
+ ib_vector_t* tokens) /*!< in: Tokens contain words */
+{
+ ulint n_matched;
+ ulint i;
+ ibool matched = FALSE;
+ ulint num_token = ib_vector_size(tokens);
+ fts_match_t* match[MAX_PROXIMITY_ITEM];
+ ibool end_list = FALSE;
+
+ /* Number of matched documents for the first token */
+ n_matched = ib_vector_size(query->match_array[0]);
+
+ /* We have a match list for each word; we walk
+ through the lists and find the common documents
+ that contain all the matching words. */
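+ /* E.g. (illustrative) if word 0 matched doc ids {1, 3, 5}
+ and word 1 matched {3, 5, 8}, only doc ids 3 and 5 can
+ proceed to the phrase or proximity check. */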
+ for (i = 0; i < n_matched; i++) {
+ ulint j;
+ ulint k = 0;
+ fts_proximity_t qualified_pos;
+
+ match[0] = static_cast<fts_match_t*>(
+ ib_vector_get(query->match_array[0], i));
+
+ /* For the remaining match lists of the tokens
+ (words), we try to see if there is a document
+ with the same doc id. */
+ for (j = 1; j < num_token; j++) {
+ match[j] = static_cast<fts_match_t*>(
+ ib_vector_get(query->match_array[j], k));
+
+ while (match[j]->doc_id < match[0]->doc_id
+ && k < ib_vector_size(query->match_array[j])) {
+ match[j] = static_cast<fts_match_t*>(
+ ib_vector_get(
+ query->match_array[j], k));
+ k++;
+ }
+
+ if (match[j]->doc_id > match[0]->doc_id) {
+ /* no match */
+ if (query->flags & FTS_PHRASE) {
+ match[0]->doc_id = 0;
+ }
+ break;
+ }
+
+ if (k == ib_vector_size(query->match_array[j])) {
+ end_list = TRUE;
+
+ if (query->flags & FTS_PHRASE) {
+ ulint s;
+ /* We have reached the end of match_array[j],
+ so no entry of match_array[0] after position
+ i can match; invalidate them. */
+ fts_match_t* match_temp;
+ for (s = i + 1; s < n_matched; s++) {
+ match_temp = static_cast<
+ fts_match_t*>(ib_vector_get(
+ query->match_array[0], s));
+ match_temp->doc_id = 0;
+ }
+
+ if (match[j]->doc_id !=
+ match[0]->doc_id) {
+ /* no match */
+ match[0]->doc_id = 0;
+ }
+ }
+
+ if (match[j]->doc_id != match[0]->doc_id) {
+ goto func_exit;
+ }
+ }
+
+ /* FIXME: A better solution would be a counter array
+ that remembers each run's last position, so we don't
+ reset it here every time. */
+ k = 0;
+ }
+
+ if (j != num_token) {
+ continue;
+ }
+
+ /* For this matching doc, we need to further
+ verify whether the words in the doc are close
+ to each other, and within the distance specified
+ in the proximity search */
+ if (query->flags & FTS_PHRASE) {
+ matched = TRUE;
+ } else if (fts_proximity_get_positions(
+ match, num_token, ULINT_MAX, &qualified_pos)) {
+
+ /* Fetch the original documents and count the
+ words between the matching words to verify they
+ are within the specified distance. */
+ if (fts_query_is_in_proximity_range(
+ query, match, &qualified_pos)) {
+ /* If so, mark we find a matching doc */
+ query->error = fts_query_process_doc_id(
+ query, match[0]->doc_id, 0);
+ if (query->error != DB_SUCCESS) {
+ matched = FALSE;
+ goto func_exit;
+ }
+
+ matched = TRUE;
+ for (ulint z = 0; z < num_token; z++) {
+ fts_string_t* token;
+ token = static_cast<fts_string_t*>(
+ ib_vector_get(tokens, z));
+ fts_query_add_word_to_document(
+ query, match[0]->doc_id, token);
+ }
+ }
+ }
+
+ if (end_list) {
+ break;
+ }
+ }
+
+func_exit:
+ return(matched);
+}
+
+/*************************************************************//**
+This function checks whether words in result documents are close to
+each other (within proximity range as specified by "distance").
+If "distance" is MAX_ULINT, then it will find all combinations of
+positions of matching words and store min and max positions
+in the "qualified_pos" for later verification.
+@return true if words are close to each other, false if otherwise */
+static
+bool
+fts_proximity_get_positions(
+/*========================*/
+ fts_match_t** match, /*!< in: query instance */
+ ulint num_match, /*!< in: number of matching
+ items */
+ ulint distance, /*!< in: distance value
+ for proximity search */
+ fts_proximity_t* qualified_pos) /*!< out: the position info
+ records ranges containing
+ all matching words. */
+{
+ ulint i;
+ ulint idx[MAX_PROXIMITY_ITEM];
+ ulint num_pos[MAX_PROXIMITY_ITEM];
+ ulint min_idx;
+
+ qualified_pos->n_pos = 0;
+
+ ut_a(num_match <= MAX_PROXIMITY_ITEM);
+
+ /* Each word could appear multiple times in a doc. So
+ we need to walk through each word's position list and
+ find the closest distance between different words to
+ see if they are within the proximity distance. */
+
+ /* Assume each word's position list is sorted; we
+ just walk through all the words' lists, similar
+ to the merge phase of a merge sort. */
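+ /* Illustrative example: with positions {3, 17} for one
+ word and {5, 40} for another, and distance 10, the pass
+ over (3, 5) gives max - min = 2 <= 10 and qualifies,
+ while the later passes over (17, 5) and (17, 40) do
+ not. */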
+ for (i = 0; i < num_match; i++) {
+ /* idx is the current position we are checking
+ for a particular word */
+ idx[i] = 0;
+
+ /* Number of positions for this word */
+ num_pos[i] = ib_vector_size(match[i]->positions);
+ }
+
+ /* Start with the first word */
+ min_idx = 0;
+
+ while (idx[min_idx] < num_pos[min_idx]) {
+ ulint position[MAX_PROXIMITY_ITEM];
+ ulint min_pos = ULINT_MAX;
+ ulint max_pos = 0;
+
+ /* Check positions in each word position list, and
+ record the max/min position */
+ for (i = 0; i < num_match; i++) {
+ position[i] = *(ulint*) ib_vector_get_const(
+ match[i]->positions, idx[i]);
+
+ if (position[i] == ULINT_UNDEFINED) {
+ break;
+ }
+
+ if (position[i] < min_pos) {
+ min_pos = position[i];
+ min_idx = i;
+ }
+
+ if (position[i] > max_pos) {
+ max_pos = position[i];
+ }
+ }
+
+ /* If the max and min positions are within range,
+ we have found a good match. */
+ if (max_pos - min_pos <= distance
+ && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
+ /* The charset may use a variable-length
+ character encoding, so record min_pos and
+ max_pos; we will need to verify the actual
+ number of characters later. */
+ qualified_pos->min_pos.push_back(min_pos);
+ qualified_pos->max_pos.push_back(max_pos);
+ qualified_pos->n_pos++;
+ }
+
+ /* Move to the next position in the list
+ for the word with the smallest position. */
+ idx[min_idx]++;
+ }
+
+ return(qualified_pos->n_pos != 0);
+}
diff --git a/storage/innobase/fts/fts0sql.cc b/storage/innobase/fts/fts0sql.cc
new file mode 100644
index 00000000..180500f6
--- /dev/null
+++ b/storage/innobase/fts/fts0sql.cc
@@ -0,0 +1,258 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0sql.cc
+Full Text Search functionality.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#include "que0que.h"
+#include "trx0roll.h"
+#include "pars0pars.h"
+#include "dict0dict.h"
+#include "fts0types.h"
+#include "fts0priv.h"
+
+/** SQL statements for creating the ancillary FTS tables. */
+
+/** Preamble to all SQL statements. */
+static const char* fts_sql_begin=
+ "PROCEDURE P() IS\n";
+
+/** Postamble to non-committing SQL statements. */
+static const char* fts_sql_end=
+ "\n"
+ "END;\n";
+
+/******************************************************************//**
+Get the table id.
+@return number of bytes written */
+int
+fts_get_table_id(
+/*=============*/
+ const fts_table_t*
+ fts_table, /*!< in: FTS Auxiliary table */
+ char* table_id) /*!< out: table id, must be at least
+ FTS_AUX_MIN_TABLE_ID_LENGTH bytes
+ long */
+{
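+ /* Illustrative sketch of the output: a common table yields
+ just the encoded table id written by fts_write_object_id(),
+ while an index table appends "_<encoded index id>", i.e.
+ "<table id>" or "<table id>_<index id>". */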
+ int len;
+
+ ut_a(fts_table->table != NULL);
+
+ switch (fts_table->type) {
+ case FTS_COMMON_TABLE:
+ len = fts_write_object_id(fts_table->table_id, table_id);
+ break;
+
+ case FTS_INDEX_TABLE:
+
+ len = fts_write_object_id(fts_table->table_id, table_id);
+
+ table_id[len] = '_';
+ ++len;
+ table_id += len;
+
+ len += fts_write_object_id(fts_table->index_id, table_id);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ ut_a(len >= 16);
+ ut_a(len < FTS_AUX_MIN_TABLE_ID_LENGTH);
+
+ return(len);
+}
+
+/** Construct the name of an internal FTS table for the given table.
+@param[in] fts_table metadata on fulltext-indexed table
+@return the prefix, must be freed with ut_free() */
+char* fts_get_table_name_prefix(const fts_table_t* fts_table)
+{
+ char table_id[FTS_AUX_MIN_TABLE_ID_LENGTH];
+ const size_t table_id_len = size_t(fts_get_table_id(fts_table,
+ table_id)) + 1;
+ mutex_enter(&dict_sys.mutex);
+ /* Include the separator as well. */
+ const size_t dbname_len = fts_table->table->name.dblen() + 1;
+ ut_ad(dbname_len > 1);
+ const size_t prefix_name_len = dbname_len + 4 + table_id_len;
+ char* prefix_name = static_cast<char*>(
+ ut_malloc_nokey(prefix_name_len));
+ memcpy(prefix_name, fts_table->table->name.m_name, dbname_len);
+ mutex_exit(&dict_sys.mutex);
+ memcpy(prefix_name + dbname_len, "FTS_", 4);
+ memcpy(prefix_name + dbname_len + 4, table_id, table_id_len);
+ return prefix_name;
+}
+
+/** Construct the name of an internal FTS table for the given table.
+@param[in] fts_table metadata on fulltext-indexed table
+@param[out] table_name a name up to MAX_FULL_NAME_LEN
+@param[in] dict_locked whether dict_sys.mutex is being held */
+void fts_get_table_name(const fts_table_t* fts_table, char* table_name,
+ bool dict_locked)
+{
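+ /* The generated name has the form
+ "<dbname>/FTS_<table id>[_<index id>]_<suffix>", e.g.
+ (illustrative) "test/FTS_0000000000000123_DELETED". */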
+ if (!dict_locked) {
+ mutex_enter(&dict_sys.mutex);
+ }
+ ut_ad(mutex_own(&dict_sys.mutex));
+ /* Include the separator as well. */
+ const size_t dbname_len = fts_table->table->name.dblen() + 1;
+ ut_ad(dbname_len > 1);
+ memcpy(table_name, fts_table->table->name.m_name, dbname_len);
+ if (!dict_locked) {
+ mutex_exit(&dict_sys.mutex);
+ }
+ memcpy(table_name += dbname_len, "FTS_", 4);
+ table_name += 4;
+ table_name += fts_get_table_id(fts_table, table_name);
+ *table_name++ = '_';
+ strcpy(table_name, fts_table->suffix);
+}
+
+/******************************************************************//**
+Parse an SQL string.
+@return query graph */
+que_t*
+fts_parse_sql(
+/*==========*/
+ fts_table_t* fts_table, /*!< in: FTS auxiliary table info */
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql) /*!< in: SQL string to evaluate */
+{
+ char* str;
+ que_t* graph;
+ ibool dict_locked;
+
+ str = ut_str3cat(fts_sql_begin, sql, fts_sql_end);
+
+ dict_locked = (fts_table && fts_table->table->fts
+ && fts_table->table->fts->dict_locked);
+
+ if (!dict_locked) {
+ ut_ad(!mutex_own(&dict_sys.mutex));
+
+ /* The InnoDB SQL parser is not re-entrant. */
+ mutex_enter(&dict_sys.mutex);
+ }
+
+ graph = pars_sql(info, str);
+ ut_a(graph);
+
+ if (!dict_locked) {
+ mutex_exit(&dict_sys.mutex);
+ }
+
+ ut_free(str);
+
+ return(graph);
+}
+
+/******************************************************************//**
+Parse an SQL string.
+@return query graph */
+que_t*
+fts_parse_sql_no_dict_lock(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql) /*!< in: SQL string to evaluate */
+{
+ char* str;
+ que_t* graph;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ str = ut_str3cat(fts_sql_begin, sql, fts_sql_end);
+
+ graph = pars_sql(info, str);
+ ut_a(graph);
+
+ ut_free(str);
+
+ return(graph);
+}
+
+/******************************************************************//**
+Evaluate an SQL query graph.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_eval_sql(
+/*=========*/
+ trx_t* trx, /*!< in: transaction */
+ que_t* graph) /*!< in: Query graph to evaluate */
+{
+ que_thr_t* thr;
+
+ graph->trx = trx;
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ ut_a(thr = que_fork_start_command(graph));
+
+ que_run_threads(thr);
+
+ return(trx->error_state);
+}
+
+/******************************************************************//**
+Construct the column specification part of the SQL string for selecting the
+indexed FTS columns for the given table. Adds the necessary bound
+ids to the given 'info' and returns the SQL string. Examples:
+
+One indexed column named "text":
+
+ "$sel0",
+ info/ids: sel0 -> "text"
+
+Two indexed columns named "subject" and "content":
+
+ "$sel0, $sel1",
+ info/ids: sel0 -> "subject", sel1 -> "content",
+@return heap-allocated column specification string */
+const char*
+fts_get_select_columns_str(
+/*=======================*/
+ dict_index_t* index, /*!< in: index */
+ pars_info_t* info, /*!< in/out: parser info */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint i;
+ const char* str = "";
+
+ for (i = 0; i < index->n_user_defined_cols; i++) {
+ char* sel_str;
+
+ dict_field_t* field = dict_index_get_nth_field(index, i);
+
+ sel_str = mem_heap_printf(heap, "sel%lu", (ulong) i);
+
+ /* Set copy_name to TRUE since it's dynamic. */
+ pars_info_bind_id(info, TRUE, sel_str, field->name);
+
+ str = mem_heap_printf(
+ heap, "%s%s$%s", str, (*str) ? ", " : "", sel_str);
+ }
+
+ return(str);
+}
diff --git a/storage/innobase/fts/fts0tlex.cc b/storage/innobase/fts/fts0tlex.cc
new file mode 100644
index 00000000..29f73f23
--- /dev/null
+++ b/storage/innobase/fts/fts0tlex.cc
@@ -0,0 +1,2169 @@
+#include "univ.i"
+#line 2 "fts0tlex.cc"
+
+#line 4 "fts0tlex.cc"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 6
+#define YY_FLEX_SUBMINOR_VERSION 4
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+#ifdef yy_create_buffer
+#define fts0t_create_buffer_ALREADY_DEFINED
+#else
+#define yy_create_buffer fts0t_create_buffer
+#endif
+
+#ifdef yy_delete_buffer
+#define fts0t_delete_buffer_ALREADY_DEFINED
+#else
+#define yy_delete_buffer fts0t_delete_buffer
+#endif
+
+#ifdef yy_scan_buffer
+#define fts0t_scan_buffer_ALREADY_DEFINED
+#else
+#define yy_scan_buffer fts0t_scan_buffer
+#endif
+
+#ifdef yy_scan_string
+#define fts0t_scan_string_ALREADY_DEFINED
+#else
+#define yy_scan_string fts0t_scan_string
+#endif
+
+#ifdef yy_scan_bytes
+#define fts0t_scan_bytes_ALREADY_DEFINED
+#else
+#define yy_scan_bytes fts0t_scan_bytes
+#endif
+
+#ifdef yy_init_buffer
+#define fts0t_init_buffer_ALREADY_DEFINED
+#else
+#define yy_init_buffer fts0t_init_buffer
+#endif
+
+#ifdef yy_flush_buffer
+#define fts0t_flush_buffer_ALREADY_DEFINED
+#else
+#define yy_flush_buffer fts0t_flush_buffer
+#endif
+
+#ifdef yy_load_buffer_state
+#define fts0t_load_buffer_state_ALREADY_DEFINED
+#else
+#define yy_load_buffer_state fts0t_load_buffer_state
+#endif
+
+#ifdef yy_switch_to_buffer
+#define fts0t_switch_to_buffer_ALREADY_DEFINED
+#else
+#define yy_switch_to_buffer fts0t_switch_to_buffer
+#endif
+
+#ifdef yypush_buffer_state
+#define fts0tpush_buffer_state_ALREADY_DEFINED
+#else
+#define yypush_buffer_state fts0tpush_buffer_state
+#endif
+
+#ifdef yypop_buffer_state
+#define fts0tpop_buffer_state_ALREADY_DEFINED
+#else
+#define yypop_buffer_state fts0tpop_buffer_state
+#endif
+
+#ifdef yyensure_buffer_stack
+#define fts0tensure_buffer_stack_ALREADY_DEFINED
+#else
+#define yyensure_buffer_stack fts0tensure_buffer_stack
+#endif
+
+#ifdef yylex
+#define fts0tlex_ALREADY_DEFINED
+#else
+#define yylex fts0tlex
+#endif
+
+#ifdef yyrestart
+#define fts0trestart_ALREADY_DEFINED
+#else
+#define yyrestart fts0trestart
+#endif
+
+#ifdef yylex_init
+#define fts0tlex_init_ALREADY_DEFINED
+#else
+#define yylex_init fts0tlex_init
+#endif
+
+#ifdef yylex_init_extra
+#define fts0tlex_init_extra_ALREADY_DEFINED
+#else
+#define yylex_init_extra fts0tlex_init_extra
+#endif
+
+#ifdef yylex_destroy
+#define fts0tlex_destroy_ALREADY_DEFINED
+#else
+#define yylex_destroy fts0tlex_destroy
+#endif
+
+#ifdef yyget_debug
+#define fts0tget_debug_ALREADY_DEFINED
+#else
+#define yyget_debug fts0tget_debug
+#endif
+
+#ifdef yyset_debug
+#define fts0tset_debug_ALREADY_DEFINED
+#else
+#define yyset_debug fts0tset_debug
+#endif
+
+#ifdef yyget_extra
+#define fts0tget_extra_ALREADY_DEFINED
+#else
+#define yyget_extra fts0tget_extra
+#endif
+
+#ifdef yyset_extra
+#define fts0tset_extra_ALREADY_DEFINED
+#else
+#define yyset_extra fts0tset_extra
+#endif
+
+#ifdef yyget_in
+#define fts0tget_in_ALREADY_DEFINED
+#else
+#define yyget_in fts0tget_in
+#endif
+
+#ifdef yyset_in
+#define fts0tset_in_ALREADY_DEFINED
+#else
+#define yyset_in fts0tset_in
+#endif
+
+#ifdef yyget_out
+#define fts0tget_out_ALREADY_DEFINED
+#else
+#define yyget_out fts0tget_out
+#endif
+
+#ifdef yyset_out
+#define fts0tset_out_ALREADY_DEFINED
+#else
+#define yyset_out fts0tset_out
+#endif
+
+#ifdef yyget_leng
+#define fts0tget_leng_ALREADY_DEFINED
+#else
+#define yyget_leng fts0tget_leng
+#endif
+
+#ifdef yyget_text
+#define fts0tget_text_ALREADY_DEFINED
+#else
+#define yyget_text fts0tget_text
+#endif
+
+#ifdef yyget_lineno
+#define fts0tget_lineno_ALREADY_DEFINED
+#else
+#define yyget_lineno fts0tget_lineno
+#endif
+
+#ifdef yyset_lineno
+#define fts0tset_lineno_ALREADY_DEFINED
+#else
+#define yyset_lineno fts0tset_lineno
+#endif
+
+#ifdef yyget_column
+#define fts0tget_column_ALREADY_DEFINED
+#else
+#define yyget_column fts0tget_column
+#endif
+
+#ifdef yyset_column
+#define fts0tset_column_ALREADY_DEFINED
+#else
+#define yyset_column fts0tset_column
+#endif
+
+#ifdef yywrap
+#define fts0twrap_ALREADY_DEFINED
+#else
+#define yywrap fts0twrap
+#endif
+
+#ifdef yyalloc
+#define fts0talloc_ALREADY_DEFINED
+#else
+#define yyalloc fts0talloc
+#endif
+
+#ifdef yyrealloc
+#define fts0trealloc_ALREADY_DEFINED
+#else
+#define yyrealloc fts0trealloc
+#endif
+
+#ifdef yyfree
+#define fts0tfree_ALREADY_DEFINED
+#else
+#define yyfree fts0tfree
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#ifndef SIZE_MAX
+#define SIZE_MAX (~(size_t)0)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+/* begin standard C++ headers. */
+
+/* TODO: this is always defined, so inline it */
+#define yyconst const
+
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define yynoreturn __attribute__((__noreturn__))
+#else
+#define yynoreturn
+#endif
+
+/* Returned upon end-of-file. */
+#define YY_NULL 0
+
+/* Promotes a possibly negative, possibly signed char to an
+ * integer in range [0..255] for use as an array index.
+ */
+#define YY_SC_TO_UI(c) ((YY_CHAR) (c))
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+ are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Enter a start condition. This macro really ought to take a parameter,
+ * but we do it the disgusting crufty way forced on us by the ()-less
+ * definition of BEGIN.
+ */
+#define BEGIN yyg->yy_start = 1 + 2 *
+/* Translate the current start state into a value that can be later handed
+ * to BEGIN to return to the state. The YYSTATE alias is for lex
+ * compatibility.
+ */
+#define YY_START ((yyg->yy_start - 1) / 2)
+#define YYSTATE YY_START
+/* Action number for EOF rule of a given start state. */
+#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
+/* Special action meaning "start processing a new file". */
+#define YY_NEW_FILE yyrestart( yyin , yyscanner )
+#define YY_END_OF_BUFFER_CHAR 0
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+/* The state buf must be large enough to hold one state per character in the main buffer.
+ */
+#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type))
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#define EOB_ACT_CONTINUE_SCAN 0
+#define EOB_ACT_END_OF_FILE 1
+#define EOB_ACT_LAST_MATCH 2
+
+ #define YY_LESS_LINENO(n)
+ #define YY_LINENO_REWIND_TO(ptr)
+
+/* Return all but the first "n" matched characters back to the input stream. */
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ *yy_cp = yyg->yy_hold_char; \
+ YY_RESTORE_YY_MORE_OFFSET \
+ yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \
+ YY_DO_BEFORE_ACTION; /* set up yytext again */ \
+ } \
+ while ( 0 )
+#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner )
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+ {
+ FILE *yy_input_file;
+
+ char *yy_ch_buf; /* input buffer */
+ char *yy_buf_pos; /* current position in input buffer */
+
+ /* Size of input buffer in bytes, not including room for EOB
+ * characters.
+ */
+ int yy_buf_size;
+
+ /* Number of characters read into yy_ch_buf, not including EOB
+ * characters.
+ */
+ int yy_n_chars;
+
+ /* Whether we "own" the buffer - i.e., we know we created it,
+ * and can realloc() it to grow it, and should free() it to
+ * delete it.
+ */
+ int yy_is_our_buffer;
+
+ /* Whether this is an "interactive" input source; if so, and
+ * if we're using stdio for input, then we want to use getc()
+ * instead of fread(), to make sure we stop fetching input after
+ * each newline.
+ */
+ int yy_is_interactive;
+
+ /* Whether we're considered to be at the beginning of a line.
+ * If so, '^' rules will be active on the next match, otherwise
+ * not.
+ */
+ int yy_at_bol;
+
+ int yy_bs_lineno; /**< The line count. */
+ int yy_bs_column; /**< The column count. */
+
+ /* Whether to try to fill the input buffer when we reach the
+ * end of it.
+ */
+ int yy_fill_buffer;
+
+ int yy_buffer_status;
+
+#define YY_BUFFER_NEW 0
+#define YY_BUFFER_NORMAL 1
+ /* When an EOF's been seen but there's still some text to process
+ * then we mark the buffer as YY_EOF_PENDING, to indicate that we
+ * shouldn't try reading from the input source any more. We might
+ * still have a bunch of tokens to match, though, because of
+ * possible backing-up.
+ *
+ * When we actually see the EOF, we change the status to "new"
+ * (via yyrestart()), so that the user can continue scanning by
+ * just pointing yyin at a new input file.
+ */
+#define YY_BUFFER_EOF_PENDING 2
+
+ };
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+/* We provide macros for accessing buffer states in case in the
+ * future we want to put the buffer states in a more general
+ * "scanner state".
+ *
+ * Returns the top of the stack, or NULL.
+ */
+#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \
+ ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \
+ : NULL)
+/* Same as previous macro, but useful when we know that the buffer stack is not
+ * NULL or when we need an lvalue. For internal use only.
+ */
+#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top]
+
+void yyrestart ( FILE *input_file , yyscan_t yyscanner );
+void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner );
+void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+void yypop_buffer_state ( yyscan_t yyscanner );
+
+static void yyensure_buffer_stack ( yyscan_t yyscanner );
+static void yy_load_buffer_state ( yyscan_t yyscanner );
+static void yy_init_buffer ( YY_BUFFER_STATE b, FILE *file , yyscan_t yyscanner );
+#define YY_FLUSH_BUFFER yy_flush_buffer( YY_CURRENT_BUFFER , yyscanner)
+
+YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner );
+
+void *yyalloc ( yy_size_t , yyscan_t yyscanner );
+void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner );
+void yyfree ( void * , yyscan_t yyscanner );
+
+#define yy_new_buffer yy_create_buffer
+#define yy_set_interactive(is_interactive) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){ \
+ yyensure_buffer_stack (yyscanner); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \
+ }
+#define yy_set_bol(at_bol) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){\
+ yyensure_buffer_stack (yyscanner); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \
+ }
+#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
+
+/* Begin user sect3 */
+
+#define fts0twrap(yyscanner) (/*CONSTCOND*/1)
+#define YY_SKIP_YYWRAP
+typedef flex_uint8_t YY_CHAR;
+
+typedef int yy_state_type;
+
+#define yytext_ptr yytext_r
+
+static yy_state_type yy_get_previous_state ( yyscan_t yyscanner );
+static yy_state_type yy_try_NUL_trans ( yy_state_type current_state , yyscan_t yyscanner);
+static int yy_get_next_buffer ( yyscan_t yyscanner );
+static void yynoreturn yy_fatal_error ( const char* msg , yyscan_t yyscanner );
+
+/* Done after the current pattern has been matched and before the
+ * corresponding action - sets up yytext.
+ */
+#define YY_DO_BEFORE_ACTION \
+ yyg->yytext_ptr = yy_bp; \
+ yyleng = (int) (yy_cp - yy_bp); \
+ yyg->yy_hold_char = *yy_cp; \
+ *yy_cp = '\0'; \
+ yyg->yy_c_buf_p = yy_cp;
+#define YY_NUM_RULES 7
+#define YY_END_OF_BUFFER 8
+/* This struct is not used in this scanner,
+ but its presence is necessary. */
+struct yy_trans_info
+ {
+ flex_int32_t yy_verify;
+ flex_int32_t yy_nxt;
+ };
+static const flex_int16_t yy_accept[17] =
+ { 0,
+ 4, 4, 8, 4, 1, 6, 1, 5, 5, 2,
+ 4, 1, 1, 0, 3, 0
+ } ;
+
+static const YY_CHAR yy_ec[256] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 4, 1, 5, 1, 1, 6, 1, 1, 1,
+ 1, 7, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1
+ } ;
+
+static const YY_CHAR yy_meta[8] =
+ { 0,
+ 1, 2, 3, 4, 5, 5, 1
+ } ;
+
+static const flex_int16_t yy_base[20] =
+ { 0,
+ 0, 0, 18, 0, 6, 21, 0, 9, 21, 0,
+ 0, 0, 0, 4, 21, 21, 10, 11, 15
+ } ;
+
+static const flex_int16_t yy_def[20] =
+ { 0,
+ 16, 1, 16, 17, 17, 16, 18, 19, 16, 17,
+ 17, 5, 18, 19, 16, 0, 16, 16, 16
+ } ;
+
+static const flex_int16_t yy_nxt[29] =
+ { 0,
+ 4, 5, 6, 7, 8, 9, 10, 12, 15, 13,
+ 11, 11, 13, 15, 13, 14, 14, 16, 14, 14,
+ 3, 16, 16, 16, 16, 16, 16, 16
+ } ;
+
+static const flex_int16_t yy_chk[29] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 5, 14, 5,
+ 17, 17, 18, 8, 18, 19, 19, 3, 19, 19,
+ 16, 16, 16, 16, 16, 16, 16, 16
+ } ;
+
+/* The intent behind this definition is that it'll catch
+ * any uses of REJECT which flex missed.
+ */
+#define REJECT reject_used_but_not_detected
+#define yymore() yymore_used_but_not_detected
+#define YY_MORE_ADJ 0
+#define YY_RESTORE_YY_MORE_OFFSET
+#line 1 "fts0tlex.l"
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**
+ * @file fts/fts0tlex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+#line 27 "fts0tlex.l"
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner)
+#define exit(A) ut_error
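+/* With the redefinition above, a fatal scanner error (yy_fatal_error()
+ * below calls exit()) raises an InnoDB assertion via ut_error instead of
+ * terminating the server process. */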
+
+#line 671 "fts0tlex.cc"
+#define YY_NO_INPUT 1
+#line 673 "fts0tlex.cc"
+
+#define INITIAL 0
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+/* Holds the entire state of the reentrant scanner. */
+struct yyguts_t
+ {
+
+ /* User-defined. Not touched by flex. */
+ YY_EXTRA_TYPE yyextra_r;
+
+ /* The rest are the same as the globals declared in the non-reentrant scanner. */
+ FILE *yyin_r, *yyout_r;
+ size_t yy_buffer_stack_top; /**< index of top of stack. */
+ size_t yy_buffer_stack_max; /**< capacity of stack. */
+ YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */
+ char yy_hold_char;
+ int yy_n_chars;
+ int yyleng_r;
+ char *yy_c_buf_p;
+ int yy_init;
+ int yy_start;
+ int yy_did_buffer_switch_on_eof;
+ int yy_start_stack_ptr;
+ int yy_start_stack_depth;
+ int *yy_start_stack;
+ yy_state_type yy_last_accepting_state;
+ char* yy_last_accepting_cpos;
+
+ int yylineno_r;
+ int yy_flex_debug_r;
+
+ char *yytext_r;
+ int yy_more_flag;
+ int yy_more_len;
+
+ }; /* end struct yyguts_t */
+
+static int yy_init_globals ( yyscan_t yyscanner );
+
+int yylex_init (yyscan_t* scanner);
+
+int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner);
+
+/* Accessor methods to globals.
+ These are made visible to non-reentrant scanners for convenience. */
+
+int yylex_destroy ( yyscan_t yyscanner );
+
+int yyget_debug ( yyscan_t yyscanner );
+
+void yyset_debug ( int debug_flag , yyscan_t yyscanner );
+
+YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner );
+
+void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner );
+
+FILE *yyget_in ( yyscan_t yyscanner );
+
+void yyset_in ( FILE * _in_str , yyscan_t yyscanner );
+
+FILE *yyget_out ( yyscan_t yyscanner );
+
+void yyset_out ( FILE * _out_str , yyscan_t yyscanner );
+
+ int yyget_leng ( yyscan_t yyscanner );
+
+char *yyget_text ( yyscan_t yyscanner );
+
+int yyget_lineno ( yyscan_t yyscanner );
+
+void yyset_lineno ( int _line_number , yyscan_t yyscanner );
+
+int yyget_column ( yyscan_t yyscanner );
+
+void yyset_column ( int _column_no , yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap ( yyscan_t yyscanner );
+#else
+extern int yywrap ( yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef YY_NO_UNPUT
+
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen ( const char * , yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+static int yyinput ( yyscan_t yyscanner );
+#else
+static int input ( yyscan_t yyscanner );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ */
+#define ECHO do { if (fwrite( yytext, (size_t) yyleng, 1, yyout )) {} } while (0)
+#endif
+
+/* Gets input and stuffs it into "buf". The number of characters read, or
+ * YY_NULL, is returned in "result".
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+ { \
+ int c = '*'; \
+ int n; \
+ for ( n = 0; n < max_size && \
+ (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+ buf[n] = (char) c; \
+ if ( c == '\n' ) \
+ buf[n++] = (char) c; \
+ if ( c == EOF && ferror( yyin ) ) \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ result = n; \
+ } \
+ else \
+ { \
+ errno=0; \
+ while ( (result = (int) fread(buf, 1, (yy_size_t) max_size, yyin)) == 0 && ferror(yyin)) \
+ { \
+ if( errno != EINTR) \
+ { \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ break; \
+ } \
+ errno=0; \
+ clearerr(yyin); \
+ } \
+ }\
+\
+
+#endif
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner)
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int yylex (yyscan_t yyscanner);
+
+#define YY_DECL int yylex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
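+/* Note: section 1 of fts0tlex.l already defined YY_DECL as fts_tlexer(),
+ * so the default declaration in the block above is not used in this
+ * scanner. */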
+
+/* Code executed at the beginning of each rule, after yytext and yyleng
+ * have been set up.
+ */
+#ifndef YY_USER_ACTION
+#define YY_USER_ACTION
+#endif
+
+/* Code executed at the end of each rule. */
+#ifndef YY_BREAK
+#define YY_BREAK /*LINTED*/break;
+#endif
+
+#define YY_RULE_SETUP \
+ YY_USER_ACTION
+
+/** The main scanner function which does all the work.
+ */
+YY_DECL
+{
+ yy_state_type yy_current_state;
+ char *yy_cp, *yy_bp;
+ int yy_act;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if ( !yyg->yy_init )
+ {
+ yyg->yy_init = 1;
+
+#ifdef YY_USER_INIT
+ YY_USER_INIT;
+#endif
+
+ if ( ! yyg->yy_start )
+ yyg->yy_start = 1; /* first start state */
+
+ if ( ! yyin )
+ yyin = stdin;
+
+ if ( ! yyout )
+ yyout = stdout;
+
+ if ( ! YY_CURRENT_BUFFER ) {
+ yyensure_buffer_stack (yyscanner);
+ YY_CURRENT_BUFFER_LVALUE =
+ yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner);
+ }
+
+ yy_load_buffer_state( yyscanner );
+ }
+
+ {
+#line 45 "fts0tlex.l"
+
+
+#line 934 "fts0tlex.cc"
+
+ while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */
+ {
+ yy_cp = yyg->yy_c_buf_p;
+
+ /* Support of yytext. */
+ *yy_cp = yyg->yy_hold_char;
+
+ /* yy_bp points to the position in yy_ch_buf of the start of
+ * the current run.
+ */
+ yy_bp = yy_cp;
+
+ yy_current_state = yyg->yy_start;
+yy_match:
+ do
+ {
+ YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ;
+ if ( yy_accept[yy_current_state] )
+ {
+ yyg->yy_last_accepting_state = yy_current_state;
+ yyg->yy_last_accepting_cpos = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 17 )
+ yy_c = yy_meta[yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c];
+ ++yy_cp;
+ }
+ while ( yy_current_state != 16 );
+ yy_cp = yyg->yy_last_accepting_cpos;
+ yy_current_state = yyg->yy_last_accepting_state;
+
+yy_find_action:
+ yy_act = yy_accept[yy_current_state];
+
+ YY_DO_BEFORE_ACTION;
+
+do_action: /* This label is used only to access EOF actions. */
+
+ switch ( yy_act )
+ { /* beginning of action switch */
+ case 0: /* must back up */
+ /* undo the effects of YY_DO_BEFORE_ACTION */
+ *yy_cp = yyg->yy_hold_char;
+ yy_cp = yyg->yy_last_accepting_cpos;
+ yy_current_state = yyg->yy_last_accepting_state;
+ goto yy_find_action;
+
+case 1:
+YY_RULE_SETUP
+#line 47 "fts0tlex.l"
+/* Ignore whitespace */ ;
+ YY_BREAK
+case 2:
+YY_RULE_SETUP
+#line 49 "fts0tlex.l"
+{
+ val->oper = fts0tget_text(yyscanner)[0];
+
+ return(val->oper);
+}
+ YY_BREAK
+case 3:
+YY_RULE_SETUP
+#line 55 "fts0tlex.l"
+{
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner));
+
+ return(FTS_TEXT);
+}
+ YY_BREAK
+case 4:
+YY_RULE_SETUP
+#line 61 "fts0tlex.l"
+{
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner));
+
+ return(FTS_TERM);
+}
+ YY_BREAK
+case 5:
+YY_RULE_SETUP
+#line 66 "fts0tlex.l"
+;
+ YY_BREAK
+case 6:
+/* rule 6 can match eol */
+YY_RULE_SETUP
+#line 67 "fts0tlex.l"
+
+ YY_BREAK
+case 7:
+YY_RULE_SETUP
+#line 69 "fts0tlex.l"
+ECHO;
+ YY_BREAK
+#line 1035 "fts0tlex.cc"
+case YY_STATE_EOF(INITIAL):
+ yyterminate();
+
+ case YY_END_OF_BUFFER:
+ {
+ /* Amount of text matched not including the EOB char. */
+ int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1;
+
+ /* Undo the effects of YY_DO_BEFORE_ACTION. */
+ *yy_cp = yyg->yy_hold_char;
+ YY_RESTORE_YY_MORE_OFFSET
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
+ {
+ /* We're scanning a new file or input source. It's
+ * possible that this happened because the user
+ * just pointed yyin at a new source and called
+ * yylex(). If so, then we have to assure
+ * consistency between YY_CURRENT_BUFFER and our
+ * globals. Here is the right place to do so, because
+ * this is the first action (other than possibly a
+ * back-up) that will match for the new input source.
+ */
+ yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
+ }
+
+ /* Note that here we test for yy_c_buf_p "<=" to the position
+ * of the first EOB in the buffer, since yy_c_buf_p will
+ * already have been incremented past the NUL character
+ * (since all states make transitions on EOB to the
+ * end-of-buffer state). Contrast this with the test
+ * in input().
+ */
+ if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+ { /* This was really a NUL. */
+ yy_state_type yy_next_state;
+
+ yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( yyscanner );
+
+ /* Okay, we're now positioned to make the NUL
+ * transition. We couldn't have
+ * yy_get_previous_state() go ahead and do it
+ * for us because it doesn't know how to deal
+ * with the possibility of jamming (and we don't
+ * want to build jamming into it because then it
+ * will run more slowly).
+ */
+
+ yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner);
+
+ yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+
+ if ( yy_next_state )
+ {
+ /* Consume the NUL. */
+ yy_cp = ++yyg->yy_c_buf_p;
+ yy_current_state = yy_next_state;
+ goto yy_match;
+ }
+
+ else
+ {
+ yy_cp = yyg->yy_last_accepting_cpos;
+ yy_current_state = yyg->yy_last_accepting_state;
+ goto yy_find_action;
+ }
+ }
+
+ else switch ( yy_get_next_buffer( yyscanner ) )
+ {
+ case EOB_ACT_END_OF_FILE:
+ {
+ yyg->yy_did_buffer_switch_on_eof = 0;
+
+ if ( yywrap( yyscanner ) )
+ {
+ /* Note: because we've taken care in
+ * yy_get_next_buffer() to have set up
+ * yytext, we can now set up
+ * yy_c_buf_p so that if some total
+ * hoser (like flex itself) wants to
+ * call the scanner after we return the
+ * YY_NULL, it'll still work - another
+ * YY_NULL will get returned.
+ */
+ yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ;
+
+ yy_act = YY_STATE_EOF(YY_START);
+ goto do_action;
+ }
+
+ else
+ {
+ if ( ! yyg->yy_did_buffer_switch_on_eof )
+ YY_NEW_FILE;
+ }
+ break;
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ yyg->yy_c_buf_p =
+ yyg->yytext_ptr + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( yyscanner );
+
+ yy_cp = yyg->yy_c_buf_p;
+ yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+ goto yy_match;
+
+ case EOB_ACT_LAST_MATCH:
+ yyg->yy_c_buf_p =
+ &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars];
+
+ yy_current_state = yy_get_previous_state( yyscanner );
+
+ yy_cp = yyg->yy_c_buf_p;
+ yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+ goto yy_find_action;
+ }
+ break;
+ }
+
+ default:
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--no action found" );
+ } /* end of action switch */
+ } /* end of scanning one token */
+ } /* end of user's declarations */
+} /* end of yylex */
+
+/* yy_get_next_buffer - try to read in a new buffer
+ *
+ * Returns a code representing an action:
+ * EOB_ACT_LAST_MATCH - process the text matched before the end of buffer first
+ * EOB_ACT_CONTINUE_SCAN - continue scanning from current position
+ * EOB_ACT_END_OF_FILE - end of file
+ */
+static int yy_get_next_buffer (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf;
+ char *source = yyg->yytext_ptr;
+ int number_to_move, i;
+ int ret_val;
+
+ if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] )
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--end of buffer missed" );
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 )
+ { /* Don't try to fill the buffer, so this is an EOF. */
+ if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 )
+ {
+ /* We matched a single character, the EOB, so
+ * treat this as a final EOF.
+ */
+ return EOB_ACT_END_OF_FILE;
+ }
+
+ else
+ {
+ /* We matched some text prior to the EOB, first
+ * process it.
+ */
+ return EOB_ACT_LAST_MATCH;
+ }
+ }
+
+ /* Try to read more data. */
+
+ /* First move last chars to start of buffer. */
+ number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr - 1);
+
+ for ( i = 0; i < number_to_move; ++i )
+ *(dest++) = *(source++);
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING )
+ /* don't do the read, it's not guaranteed to return an EOF,
+ * just force an EOF
+ */
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0;
+
+ else
+ {
+ int num_to_read =
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1;
+
+ while ( num_to_read <= 0 )
+ { /* Not enough room in the buffer - grow it. */
+
+ /* just a shorter name for the current buffer */
+ YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE;
+
+ int yy_c_buf_p_offset =
+ (int) (yyg->yy_c_buf_p - b->yy_ch_buf);
+
+ if ( b->yy_is_our_buffer )
+ {
+ int new_size = b->yy_buf_size * 2;
+
+ if ( new_size <= 0 )
+ b->yy_buf_size += b->yy_buf_size / 8;
+ else
+ b->yy_buf_size *= 2;
+
+ b->yy_ch_buf = (char *)
+ /* Include room for 2 EOB chars. */
+ yyrealloc( (void *) b->yy_ch_buf,
+ (yy_size_t) (b->yy_buf_size + 2) , yyscanner );
+ }
+ else
+ /* Can't grow it, we don't own it. */
+ b->yy_ch_buf = NULL;
+
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR(
+ "fatal error - scanner input buffer overflow" );
+
+ yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset];
+
+ num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size -
+ number_to_move - 1;
+
+ }
+
+ if ( num_to_read > YY_READ_BUF_SIZE )
+ num_to_read = YY_READ_BUF_SIZE;
+
+ /* Read in more data. */
+ YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
+ yyg->yy_n_chars, num_to_read );
+
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+ }
+
+ if ( yyg->yy_n_chars == 0 )
+ {
+ if ( number_to_move == YY_MORE_ADJ )
+ {
+ ret_val = EOB_ACT_END_OF_FILE;
+ yyrestart( yyin , yyscanner);
+ }
+
+ else
+ {
+ ret_val = EOB_ACT_LAST_MATCH;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status =
+ YY_BUFFER_EOF_PENDING;
+ }
+ }
+
+ else
+ ret_val = EOB_ACT_CONTINUE_SCAN;
+
+ if ((yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) {
+ /* Extend the array by 50%, plus the number we really need. */
+ int new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1);
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc(
+ (void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf, (yy_size_t) new_size , yyscanner );
+ if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" );
+ /* "- 2" to take care of EOB's */
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_size = (int) (new_size - 2);
+ }
+
+ yyg->yy_n_chars += number_to_move;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR;
+
+ yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0];
+
+ return ret_val;
+}
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+ static yy_state_type yy_get_previous_state (yyscan_t yyscanner)
+{
+ yy_state_type yy_current_state;
+ char *yy_cp;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ yy_current_state = yyg->yy_start;
+
+ for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp )
+ {
+ YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1);
+ if ( yy_accept[yy_current_state] )
+ {
+ yyg->yy_last_accepting_state = yy_current_state;
+ yyg->yy_last_accepting_cpos = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 17 )
+ yy_c = yy_meta[yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c];
+ }
+
+ return yy_current_state;
+}
+
+/* yy_try_NUL_trans - try to make a transition on the NUL character
+ *
+ * synopsis
+ * next_state = yy_try_NUL_trans( current_state );
+ */
+ static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner)
+{
+ int yy_is_jam;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */
+ char *yy_cp = yyg->yy_c_buf_p;
+
+ YY_CHAR yy_c = 1;
+ if ( yy_accept[yy_current_state] )
+ {
+ yyg->yy_last_accepting_state = yy_current_state;
+ yyg->yy_last_accepting_cpos = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 17 )
+ yy_c = yy_meta[yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c];
+ yy_is_jam = (yy_current_state == 16);
+
+ (void)yyg;
+ return yy_is_jam ? 0 : yy_current_state;
+}
+
+#ifndef YY_NO_UNPUT
+
+#endif
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+ static int yyinput (yyscan_t yyscanner)
+#else
+ static int input (yyscan_t yyscanner)
+#endif
+
+{
+ int c;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ *yyg->yy_c_buf_p = yyg->yy_hold_char;
+
+ if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR )
+ {
+ /* yy_c_buf_p now points to the character we want to return.
+ * If this occurs *before* the EOB characters, then it's a
+ * valid NUL; if not, then we've hit the end of the buffer.
+ */
+ if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+ /* This was really a NUL. */
+ *yyg->yy_c_buf_p = '\0';
+
+ else
+ { /* need more input */
+ int offset = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr);
+ ++yyg->yy_c_buf_p;
+
+ switch ( yy_get_next_buffer( yyscanner ) )
+ {
+ case EOB_ACT_LAST_MATCH:
+ /* This happens because yy_get_next_buffer()
+ * sees that we've accumulated a
+ * token and flags that we need to
+ * try matching the token before
+ * proceeding. But for input(),
+ * there's no matching to consider.
+ * So convert the EOB_ACT_LAST_MATCH
+ * to EOB_ACT_END_OF_FILE.
+ */
+
+ /* Reset buffer status. */
+ yyrestart( yyin , yyscanner);
+
+ /*FALLTHROUGH*/
+
+ case EOB_ACT_END_OF_FILE:
+ {
+ if ( yywrap( yyscanner ) )
+ return 0;
+
+ if ( ! yyg->yy_did_buffer_switch_on_eof )
+ YY_NEW_FILE;
+#ifdef __cplusplus
+ return yyinput(yyscanner);
+#else
+ return input(yyscanner);
+#endif
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ yyg->yy_c_buf_p = yyg->yytext_ptr + offset;
+ break;
+ }
+ }
+ }
+
+ c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */
+ *yyg->yy_c_buf_p = '\0'; /* preserve yytext */
+ yyg->yy_hold_char = *++yyg->yy_c_buf_p;
+
+ return c;
+}
+#endif /* ifndef YY_NO_INPUT */
+
+/** Immediately switch to a different input stream.
+ * @param input_file A readable stream.
+ * @param yyscanner The scanner object.
+ * @note This function does not reset the start condition to @c INITIAL.
+ */
+ void yyrestart (FILE * input_file , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if ( ! YY_CURRENT_BUFFER ){
+ yyensure_buffer_stack (yyscanner);
+ YY_CURRENT_BUFFER_LVALUE =
+ yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner);
+ }
+
+ yy_init_buffer( YY_CURRENT_BUFFER, input_file , yyscanner);
+ yy_load_buffer_state( yyscanner );
+}
+
+/** Switch to a different input buffer.
+ * @param new_buffer The new input buffer.
+ * @param yyscanner The scanner object.
+ */
+ void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* TODO. We should be able to replace this entire function body
+ * with
+ * yypop_buffer_state();
+ * yypush_buffer_state(new_buffer);
+ */
+ yyensure_buffer_stack (yyscanner);
+ if ( YY_CURRENT_BUFFER == new_buffer )
+ return;
+
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *yyg->yy_c_buf_p = yyg->yy_hold_char;
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+ }
+
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+ yy_load_buffer_state( yyscanner );
+
+ /* We don't actually know whether we did this switch during
+ * EOF (yywrap()) processing, but the only time this flag
+ * is looked at is after yywrap() is called, so it's safe
+ * to go ahead and always set it.
+ */
+ yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+static void yy_load_buffer_state (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos;
+ yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file;
+ yyg->yy_hold_char = *yyg->yy_c_buf_p;
+}
+
+/** Allocate and initialize an input buffer state.
+ * @param file A readable stream.
+ * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE.
+ * @param yyscanner The scanner object.
+ * @return the allocated buffer state.
+ */
+ YY_BUFFER_STATE yy_create_buffer (FILE * file, int size , yyscan_t yyscanner)
+{
+ YY_BUFFER_STATE b;
+
+ b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) , yyscanner );
+ if ( ! b )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
+
+ b->yy_buf_size = size;
+
+ /* yy_ch_buf has to be 2 characters longer than the size given because
+ * we need to put in 2 end-of-buffer characters.
+ */
+ b->yy_ch_buf = (char *) yyalloc( (yy_size_t) (b->yy_buf_size + 2) , yyscanner );
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
+
+ b->yy_is_our_buffer = 1;
+
+ yy_init_buffer( b, file , yyscanner);
+
+ return b;
+}
+
+/** Destroy the buffer.
+ * @param b a buffer created with yy_create_buffer()
+ * @param yyscanner The scanner object.
+ */
+ void yy_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if ( ! b )
+ return;
+
+ if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */
+ YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
+
+ if ( b->yy_is_our_buffer )
+ yyfree( (void *) b->yy_ch_buf , yyscanner );
+
+ yyfree( (void *) b , yyscanner );
+}
+
+/* Initializes or reinitializes a buffer.
+ * This function is sometimes called more than once on the same buffer,
+ * such as during a yyrestart() or at EOF.
+ */
+ static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file , yyscan_t yyscanner)
+{
+ int oerrno = errno;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ yy_flush_buffer( b , yyscanner);
+
+ b->yy_input_file = file;
+ b->yy_fill_buffer = 1;
+
+ /* If b is the current buffer, then yy_init_buffer was _probably_
+ * called from yyrestart() or through yy_get_next_buffer.
+ * In that case, we don't want to reset the lineno or column.
+ */
+ if (b != YY_CURRENT_BUFFER){
+ b->yy_bs_lineno = 1;
+ b->yy_bs_column = 0;
+ }
+
+ b->yy_is_interactive = 0;
+
+ errno = oerrno;
+}
+
+/** Discard all buffered characters. On the next scan, YY_INPUT will be called.
+ * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
+ * @param yyscanner The scanner object.
+ */
+ void yy_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ if ( ! b )
+ return;
+
+ b->yy_n_chars = 0;
+
+ /* We always need two end-of-buffer characters. The first causes
+ * a transition to the end-of-buffer state. The second causes
+ * a jam in that state.
+ */
+ b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR;
+ b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR;
+
+ b->yy_buf_pos = &b->yy_ch_buf[0];
+
+ b->yy_at_bol = 1;
+ b->yy_buffer_status = YY_BUFFER_NEW;
+
+ if ( b == YY_CURRENT_BUFFER )
+ yy_load_buffer_state( yyscanner );
+}
+
+/** Pushes the new state onto the stack. The new state becomes
+ * the current state. This function will allocate the stack
+ * if necessary.
+ * @param new_buffer The new state.
+ * @param yyscanner The scanner object.
+ */
+void yypush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ if (new_buffer == NULL)
+ return;
+
+ yyensure_buffer_stack(yyscanner);
+
+ /* This block is copied from yy_switch_to_buffer. */
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *yyg->yy_c_buf_p = yyg->yy_hold_char;
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+ }
+
+ /* Only push if top exists. Otherwise, replace top. */
+ if (YY_CURRENT_BUFFER)
+ yyg->yy_buffer_stack_top++;
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+
+ /* copied from yy_switch_to_buffer. */
+ yy_load_buffer_state( yyscanner );
+ yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+/** Removes and deletes the top of the stack, if present.
+ * The next element becomes the new top.
+ * @param yyscanner The scanner object.
+ */
+void yypop_buffer_state (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ if (!YY_CURRENT_BUFFER)
+ return;
+
+ yy_delete_buffer(YY_CURRENT_BUFFER , yyscanner);
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ if (yyg->yy_buffer_stack_top > 0)
+ --yyg->yy_buffer_stack_top;
+
+ if (YY_CURRENT_BUFFER) {
+ yy_load_buffer_state( yyscanner );
+ yyg->yy_did_buffer_switch_on_eof = 1;
+ }
+}
+
+/* Allocates the stack if it does not exist.
+ * Guarantees space for at least one push.
+ */
+static void yyensure_buffer_stack (yyscan_t yyscanner)
+{
+ yy_size_t num_to_alloc;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if (!yyg->yy_buffer_stack) {
+
+ /* First allocation is just for 2 elements, since we don't know if this
+ * scanner will even need a stack. We use 2 instead of 1 to avoid an
+ * immediate realloc on the next call.
+ */
+ num_to_alloc = 1; /* After all that talk, this was set to 1 anyways... */
+ yyg->yy_buffer_stack = (struct yy_buffer_state**)yyalloc
+ (num_to_alloc * sizeof(struct yy_buffer_state*)
+ , yyscanner);
+ if ( ! yyg->yy_buffer_stack )
+ YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+
+ memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*));
+
+ yyg->yy_buffer_stack_max = num_to_alloc;
+ yyg->yy_buffer_stack_top = 0;
+ return;
+ }
+
+ if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){
+
+ /* Increase the buffer to prepare for a possible push. */
+ yy_size_t grow_size = 8 /* arbitrary grow size */;
+
+ num_to_alloc = yyg->yy_buffer_stack_max + grow_size;
+ yyg->yy_buffer_stack = (struct yy_buffer_state**)yyrealloc
+ (yyg->yy_buffer_stack,
+ num_to_alloc * sizeof(struct yy_buffer_state*)
+ , yyscanner);
+ if ( ! yyg->yy_buffer_stack )
+ YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+
+ /* zero only the new slots.*/
+ memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*));
+ yyg->yy_buffer_stack_max = num_to_alloc;
+ }
+}
+
+/** Set up the input buffer state to scan directly from a user-specified character buffer.
+ * @param base the character buffer
+ * @param size the size in bytes of the character buffer
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner)
+{
+ YY_BUFFER_STATE b;
+
+ if ( size < 2 ||
+ base[size-2] != YY_END_OF_BUFFER_CHAR ||
+ base[size-1] != YY_END_OF_BUFFER_CHAR )
+ /* They forgot to leave room for the EOB's. */
+ return NULL;
+
+ b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) , yyscanner );
+ if ( ! b )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" );
+
+ b->yy_buf_size = (int) (size - 2); /* "- 2" to take care of EOB's */
+ b->yy_buf_pos = b->yy_ch_buf = base;
+ b->yy_is_our_buffer = 0;
+ b->yy_input_file = NULL;
+ b->yy_n_chars = b->yy_buf_size;
+ b->yy_is_interactive = 0;
+ b->yy_at_bol = 1;
+ b->yy_fill_buffer = 0;
+ b->yy_buffer_status = YY_BUFFER_NEW;
+
+ yy_switch_to_buffer( b , yyscanner );
+
+ return b;
+}
+
+/** Set up the input buffer state to scan a string. The next call to yylex() will
+ * scan from a @e copy of @a str.
+ * @param yystr a NUL-terminated string to scan
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ * @note If you want to scan bytes that may contain NUL values, then use
+ * yy_scan_bytes() instead.
+ */
+YY_BUFFER_STATE yy_scan_string (const char * yystr , yyscan_t yyscanner)
+{
+
+ return yy_scan_bytes( yystr, (int) strlen(yystr) , yyscanner);
+}
+
+/** Set up the input buffer state to scan the given bytes. The next call to yylex() will
+ * scan from a @e copy of @a bytes.
+ * @param yybytes the byte buffer to scan
+ * @param _yybytes_len the number of bytes in the buffer pointed to by @a yybytes.
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE yy_scan_bytes (const char * yybytes, int _yybytes_len , yyscan_t yyscanner)
+{
+ YY_BUFFER_STATE b;
+ char *buf;
+ yy_size_t n;
+ int i;
+
+ /* Get memory for full buffer, including space for trailing EOB's. */
+ n = (yy_size_t) (_yybytes_len + 2);
+ buf = (char *) yyalloc( n , yyscanner );
+ if ( ! buf )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" );
+
+ for ( i = 0; i < _yybytes_len; ++i )
+ buf[i] = yybytes[i];
+
+ buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR;
+
+ b = yy_scan_buffer( buf, n , yyscanner);
+ if ( ! b )
+ YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" );
+
+ /* It's okay to grow etc. this buffer, and we should throw it
+ * away when we're done.
+ */
+ b->yy_is_our_buffer = 1;
+
+ return b;
+}
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+static void yynoreturn yy_fatal_error (const char* msg , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ (void)yyg;
+ fprintf( stderr, "%s\n", msg );
+ exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code. */
+
+#undef yyless
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ yytext[yyleng] = yyg->yy_hold_char; \
+ yyg->yy_c_buf_p = yytext + yyless_macro_arg; \
+ yyg->yy_hold_char = *yyg->yy_c_buf_p; \
+ *yyg->yy_c_buf_p = '\0'; \
+ yyleng = yyless_macro_arg; \
+ } \
+ while ( 0 )
+
+/* Accessor methods (get/set functions) to struct members. */
+
+/** Get the user-defined data for this scanner.
+ * @param yyscanner The scanner object.
+ */
+YY_EXTRA_TYPE yyget_extra (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyextra;
+}
+
+/** Get the current line number.
+ * @param yyscanner The scanner object.
+ */
+int yyget_lineno (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if (! YY_CURRENT_BUFFER)
+ return 0;
+
+ return yylineno;
+}
+
+/** Get the current column number.
+ * @param yyscanner The scanner object.
+ */
+int yyget_column (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if (! YY_CURRENT_BUFFER)
+ return 0;
+
+ return yycolumn;
+}
+
+/** Get the input stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *yyget_in (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyin;
+}
+
+/** Get the output stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *yyget_out (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyout;
+}
+
+/** Get the length of the current token.
+ * @param yyscanner The scanner object.
+ */
+int yyget_leng (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyleng;
+}
+
+/** Get the current token.
+ * @param yyscanner The scanner object.
+ */
+
+char *yyget_text (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yytext;
+}
+
+/** Set the user-defined data. This data is never touched by the scanner.
+ * @param user_defined The data to be associated with this scanner.
+ * @param yyscanner The scanner object.
+ */
+void yyset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyextra = user_defined ;
+}
+
+/** Set the current line number.
+ * @param _line_number line number
+ * @param yyscanner The scanner object.
+ */
+void yyset_lineno (int _line_number , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* lineno is only valid if an input buffer exists. */
+ if (! YY_CURRENT_BUFFER )
+ YY_FATAL_ERROR( "yyset_lineno called with no buffer" );
+
+ yylineno = _line_number;
+}
+
+/** Set the current column.
+ * @param _column_no column number
+ * @param yyscanner The scanner object.
+ */
+void yyset_column (int _column_no , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* column is only valid if an input buffer exists. */
+ if (! YY_CURRENT_BUFFER )
+ YY_FATAL_ERROR( "yyset_column called with no buffer" );
+
+ yycolumn = _column_no;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * @param _in_str A readable stream.
+ * @param yyscanner The scanner object.
+ * @see yy_switch_to_buffer
+ */
+void yyset_in (FILE * _in_str , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyin = _in_str ;
+}
+
+void yyset_out (FILE * _out_str , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyout = _out_str ;
+}
+
+int yyget_debug (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yy_flex_debug;
+}
+
+void yyset_debug (int _bdebug , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yy_flex_debug = _bdebug ;
+}
+
+/* Accessor methods for yylval and yylloc */
+
+/* User-visible API */
+
+/* yylex_init is special because it creates the scanner itself, so it is
+ * the ONLY reentrant function that doesn't take the scanner as the last argument.
+ * That's why we explicitly handle the declaration, instead of using our macros.
+ */
+int yylex_init(yyscan_t* ptr_yy_globals)
+{
+ if (ptr_yy_globals == NULL){
+ errno = EINVAL;
+ return 1;
+ }
+
+ *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), NULL );
+
+ if (*ptr_yy_globals == NULL){
+ errno = ENOMEM;
+ return 1;
+ }
+
+ /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */
+ memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+
+ return yy_init_globals ( *ptr_yy_globals );
+}
+
+/* yylex_init_extra has the same functionality as yylex_init, but follows the
+ * convention of taking the scanner as the last argument. Note however, that
+ * this is a *pointer* to a scanner, as it will be allocated by this call (and
+ * is the reason, too, why this function also must handle its own declaration).
+ * The user defined value in the first argument will be available to yyalloc in
+ * the yyextra field.
+ */
+int yylex_init_extra( YY_EXTRA_TYPE yy_user_defined, yyscan_t* ptr_yy_globals )
+{
+ struct yyguts_t dummy_yyguts;
+
+ yyset_extra (yy_user_defined, &dummy_yyguts);
+
+ if (ptr_yy_globals == NULL){
+ errno = EINVAL;
+ return 1;
+ }
+
+ *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), &dummy_yyguts );
+
+ if (*ptr_yy_globals == NULL){
+ errno = ENOMEM;
+ return 1;
+ }
+
+ /* By setting to 0xAA, we expose bugs in
+ yy_init_globals. Leave at 0x00 for releases. */
+ memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+
+ yyset_extra (yy_user_defined, *ptr_yy_globals);
+
+ return yy_init_globals ( *ptr_yy_globals );
+}
+
+static int yy_init_globals (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ /* Initialization is the same as for the non-reentrant scanner.
+ * This function is called from yylex_destroy(), so don't allocate here.
+ */
+
+ yyg->yy_buffer_stack = NULL;
+ yyg->yy_buffer_stack_top = 0;
+ yyg->yy_buffer_stack_max = 0;
+ yyg->yy_c_buf_p = NULL;
+ yyg->yy_init = 0;
+ yyg->yy_start = 0;
+
+ yyg->yy_start_stack_ptr = 0;
+ yyg->yy_start_stack_depth = 0;
+ yyg->yy_start_stack = NULL;
+
+/* Defined in main.c */
+#ifdef YY_STDINIT
+ yyin = stdin;
+ yyout = stdout;
+#else
+ yyin = NULL;
+ yyout = NULL;
+#endif
+
+ /* For future reference: Set errno on error, since we are called by
+ * yylex_init()
+ */
+ return 0;
+}
+
+/* yylex_destroy is for both reentrant and non-reentrant scanners. */
+int yylex_destroy (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* Pop the buffer stack, destroying each element. */
+ while(YY_CURRENT_BUFFER){
+ yy_delete_buffer( YY_CURRENT_BUFFER , yyscanner );
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ yypop_buffer_state(yyscanner);
+ }
+
+ /* Destroy the stack itself. */
+ yyfree(yyg->yy_buffer_stack , yyscanner);
+ yyg->yy_buffer_stack = NULL;
+
+ /* Destroy the start condition stack. */
+ yyfree( yyg->yy_start_stack , yyscanner );
+ yyg->yy_start_stack = NULL;
+
+ /* Reset the globals. This is important in a non-reentrant scanner so the next time
+ * yylex() is called, initialization will occur. */
+ yy_init_globals( yyscanner);
+
+ /* Destroy the main struct (reentrant only). */
+ yyfree ( yyscanner , yyscanner );
+ yyscanner = NULL;
+ return 0;
+}
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char* s1, const char * s2, int n , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ (void)yyg;
+
+ int i;
+ for ( i = 0; i < n; ++i )
+ s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (const char * s , yyscan_t yyscanner)
+{
+ int n;
+ for ( n = 0; s[n]; ++n )
+ ;
+
+ return n;
+}
+#endif
+
+void *yyalloc (yy_size_t size , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ (void)yyg;
+ return malloc(size);
+}
+
+void *yyrealloc (void * ptr, yy_size_t size , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ (void)yyg;
+
+ /* The cast to (char *) in the following accommodates both
+ * implementations that use char* generic pointers, and those
+ * that use void* generic pointers. It works with the latter
+ * because both ANSI C and C++ allow castless assignment from
+ * any pointer type to void*, and deal with argument conversions
+ * as though doing an assignment.
+ */
+ return realloc(ptr, size);
+}
+
+void yyfree (void * ptr , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ (void)yyg;
+ free( (char *) ptr ); /* see yyrealloc() for (char *) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#line 69 "fts0tlex.l"
+
+
diff --git a/storage/innobase/fts/fts0tlex.l b/storage/innobase/fts/fts0tlex.l
new file mode 100644
index 00000000..e19e907f
--- /dev/null
+++ b/storage/innobase/fts/fts0tlex.l
@@ -0,0 +1,69 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0tlex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner)
+#define exit(A) ut_error
+
+%}
+
+%option noinput
+%option nounput
+%option noyywrap
+%option nostdinit
+%option reentrant
+%option never-interactive
+
+
+%%
+
+[\t ]+ /* Ignore whitespace */ ;
+
+[*] {
+ val->oper = fts0tget_text(yyscanner)[0];
+
+ return(val->oper);
+}
+
+\"[^\"\n]*\" {
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner));
+
+ return(FTS_TEXT);
+}
+
+[^" \n\%]* {
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner));
+
+ return(FTS_TERM);
+}
+. ;
+\n
+
+%%
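+
+/* Illustrative tokenization (a sketch, not part of the grammar): for the
+ * input
+ *     "multi word" term
+ * the rules above return FTS_TEXT for the quoted phrase (the quotes are
+ * included in the matched text), silently skip the whitespace, and
+ * return FTS_TERM for the bare word. */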
diff --git a/storage/innobase/fts/make_parser.sh b/storage/innobase/fts/make_parser.sh
new file mode 100755
index 00000000..6b82c5ba
--- /dev/null
+++ b/storage/innobase/fts/make_parser.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+#
+# Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+
+TMPF=t.$$
+
+make -f Makefile.query
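+# Makefile.query is assumed to run flex and bison to regenerate the FTS
+# scanner and parser sources; the two flex outputs are then post-processed
+# with sed below.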
+
+echo '#include "univ.i"' > $TMPF
+
+# This is to avoid compiler warnings about unused parameters.
+# FIXME: the gcc extension "MY_ATTRIBUTE" causes compilation errors on the
+# Windows platform. Quote them out for now.
+sed -e '
+s/^\(static.*void.*yy_fatal_error.*msg.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/;
+s/^\(static.*void.*yy_flex_strncpy.*n.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/;
+s/^\(static.*int.*yy_flex_strlen.*s.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/;
+s/^\(\(static\|void\).*fts0[bt]alloc.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/;
+s/^\(\(static\|void\).*fts0[bt]realloc.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/;
+s/^\(\(static\|void\).*fts0[bt]free.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/;
+' < fts0blex.cc >> $TMPF
+
+mv $TMPF fts0blex.cc
+
+echo '#include "univ.i"' > $TMPF
+
+sed -e '
+s/^\(static.*void.*yy_fatal_error.*msg.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/;
+s/^\(static.*void.*yy_flex_strncpy.*n.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/;
+s/^\(static.*int.*yy_flex_strlen.*s.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/;
+s/^\(\(static\|void\).*fts0[bt]alloc.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/;
+s/^\(\(static\|void\).*fts0[bt]realloc.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/;
+s/^\(\(static\|void\).*fts0[bt]free.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/;
+' < fts0tlex.cc >> $TMPF
+
+mv $TMPF fts0tlex.cc
diff --git a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc
new file mode 100644
index 00000000..e084f0b7
--- /dev/null
+++ b/storage/innobase/fut/fut0lst.cc
@@ -0,0 +1,392 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fut/fut0lst.cc
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fut0lst.h"
+#include "buf0buf.h"
+#include "page0page.h"
+
+
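+/* For orientation, a sketch of the on-page structures that these routines
+manipulate (the field offsets are defined in fut0lst.h and fil0fil.h;
+those headers are authoritative):
+
+  file address (FIL_ADDR_SIZE = 6 bytes):
+    0: FIL_ADDR_PAGE  4-byte page number
+    4: FIL_ADDR_BYTE  2-byte offset within the page
+  list base node (16 bytes): FLST_LEN (4 bytes), FLST_FIRST, FLST_LAST
+  list node (12 bytes):      FLST_PREV, FLST_NEXT
+*/
+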
+/** Write a file address.
+@param[in] block file page
+@param[in,out] faddr file address location
+@param[in] page page number
+@param[in] boffset byte offset
+@param[in,out] mtr mini-transaction */
+static void flst_write_addr(const buf_block_t& block, byte *faddr,
+ uint32_t page, uint16_t boffset, mtr_t* mtr)
+{
+ ut_ad(mtr->memo_contains_page_flagged(faddr,
+ MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_SX_FIX));
+ ut_a(page == FIL_NULL || boffset >= FIL_PAGE_DATA);
+ ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA);
+
+ static_assert(FIL_ADDR_PAGE == 0, "compatibility");
+ static_assert(FIL_ADDR_BYTE == 4, "compatibility");
+ static_assert(FIL_ADDR_SIZE == 6, "compatibility");
+
+ const bool same_page= mach_read_from_4(faddr + FIL_ADDR_PAGE) == page;
+ const bool same_offset= mach_read_from_2(faddr + FIL_ADDR_BYTE) == boffset;
+ if (same_page)
+ {
+ if (!same_offset)
+ mtr->write<2>(block, faddr + FIL_ADDR_BYTE, boffset);
+ return;
+ }
+ if (same_offset)
+ mtr->write<4>(block, faddr + FIL_ADDR_PAGE, page);
+ else
+ {
+ alignas(4) byte fil_addr[6];
+ mach_write_to_4(fil_addr + FIL_ADDR_PAGE, page);
+ mach_write_to_2(fil_addr + FIL_ADDR_BYTE, boffset);
+ mtr->memcpy(block, faddr + FIL_ADDR_PAGE, fil_addr, 6);
+ }
+}
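+
+/* Intent note (inferred from the code above): by skipping the write when
+the page number or byte offset already holds the desired value,
+flst_write_addr() avoids generating redo log for unchanged bytes. */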
+
+/** Write 2 null file addresses.
+@param[in] b file page
+@param[in,out] addr file address to be zeroed out
+@param[in,out] mtr mini-transaction */
+static void flst_zero_both(const buf_block_t& b, byte *addr, mtr_t *mtr)
+{
+ if (mach_read_from_4(addr + FIL_ADDR_PAGE) != FIL_NULL)
+ mtr->memset(&b, ulint(addr - b.frame) + FIL_ADDR_PAGE, 4, 0xff);
+ mtr->write<2,mtr_t::MAYBE_NOP>(b, addr + FIL_ADDR_BYTE, 0U);
+ /* Initialize the other address by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source)
+ which is 4 bytes, or less than FIL_ADDR_SIZE. */
+ memcpy(addr + FIL_ADDR_SIZE, addr, FIL_ADDR_SIZE);
+ const uint16_t boffset= page_offset(addr);
+ mtr->memmove(b, boffset + FIL_ADDR_SIZE, boffset, FIL_ADDR_SIZE);
+}
+
+/** Add a node to an empty list. */
+static void flst_add_to_empty(buf_block_t *base, uint16_t boffset,
+ buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+{
+ ut_ad(base != add || boffset != aoffset);
+ ut_ad(boffset < base->physical_size());
+ ut_ad(aoffset < add->physical_size());
+ ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+ ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+
+ ut_ad(!mach_read_from_4(base->frame + boffset + FLST_LEN));
+ mtr->write<1>(*base, base->frame + boffset + (FLST_LEN + 3), 1U);
+ /* Update first and last fields of base node */
+ flst_write_addr(*base, base->frame + boffset + FLST_FIRST,
+ add->page.id().page_no(), aoffset, mtr);
+ memcpy(base->frame + boffset + FLST_LAST, base->frame + boffset + FLST_FIRST,
+ FIL_ADDR_SIZE);
+ /* Initialize FLST_LAST by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source)
+ which is 4 bytes, or less than FIL_ADDR_SIZE. */
+ mtr->memmove(*base, boffset + FLST_LAST, boffset + FLST_FIRST,
+ FIL_ADDR_SIZE);
+
+ /* Set prev and next fields of node to add */
+ static_assert(FLST_NEXT == FLST_PREV + FIL_ADDR_SIZE, "compatibility");
+ flst_zero_both(*add, add->frame + aoffset + FLST_PREV, mtr);
+}
+
+/** Insert a node after another one.
+@param[in,out] base base node block
+@param[in] boffset byte offset of the base node
+@param[in,out] cur insert position block
+@param[in] coffset byte offset of the insert position
+@param[in,out] add block to be added
+@param[in] aoffset byte offset of the block to be added
+@param[in,out] mtr mini-transaction */
+static void flst_insert_after(buf_block_t *base, uint16_t boffset,
+ buf_block_t *cur, uint16_t coffset,
+ buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+{
+ ut_ad(base != cur || boffset != coffset);
+ ut_ad(base != add || boffset != aoffset);
+ ut_ad(cur != add || coffset != aoffset);
+ ut_ad(boffset < base->physical_size());
+ ut_ad(coffset < cur->physical_size());
+ ut_ad(aoffset < add->physical_size());
+ ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+ ut_ad(mtr->memo_contains_flagged(cur, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+ ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+
+ fil_addr_t next_addr= flst_get_next_addr(cur->frame + coffset);
+
+ flst_write_addr(*add, add->frame + aoffset + FLST_PREV,
+ cur->page.id().page_no(), coffset, mtr);
+ flst_write_addr(*add, add->frame + aoffset + FLST_NEXT,
+ next_addr.page, next_addr.boffset, mtr);
+
+ if (next_addr.page == FIL_NULL)
+ flst_write_addr(*base, base->frame + boffset + FLST_LAST,
+ add->page.id().page_no(), aoffset, mtr);
+ else
+ {
+ buf_block_t *block;
+ flst_node_t *next= fut_get_ptr(add->page.id().space(), add->zip_size(),
+ next_addr, RW_SX_LATCH, mtr, &block);
+ flst_write_addr(*block, next + FLST_PREV,
+ add->page.id().page_no(), aoffset, mtr);
+ }
+
+ flst_write_addr(*cur, cur->frame + coffset + FLST_NEXT,
+ add->page.id().page_no(), aoffset, mtr);
+
+ byte *len= &base->frame[boffset + FLST_LEN];
+ mtr->write<4>(*base, len, mach_read_from_4(len) + 1);
+}
+
+/** Insert a node before another one.
+@param[in,out] base base node block
+@param[in] boffset byte offset of the base node
+@param[in,out] cur insert position block
+@param[in] coffset byte offset of the insert position
+@param[in,out] add block to be added
+@param[in] aoffset byte offset of the block to be added
+@param[in,out] mtr mini-transaction */
+static void flst_insert_before(buf_block_t *base, uint16_t boffset,
+ buf_block_t *cur, uint16_t coffset,
+ buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+{
+ ut_ad(base != cur || boffset != coffset);
+ ut_ad(base != add || boffset != aoffset);
+ ut_ad(cur != add || coffset != aoffset);
+ ut_ad(boffset < base->physical_size());
+ ut_ad(coffset < cur->physical_size());
+ ut_ad(aoffset < add->physical_size());
+ ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+ ut_ad(mtr->memo_contains_flagged(cur, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+ ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+
+ fil_addr_t prev_addr= flst_get_prev_addr(cur->frame + coffset);
+
+ flst_write_addr(*add, add->frame + aoffset + FLST_PREV,
+ prev_addr.page, prev_addr.boffset, mtr);
+ flst_write_addr(*add, add->frame + aoffset + FLST_NEXT,
+ cur->page.id().page_no(), coffset, mtr);
+
+ if (prev_addr.page == FIL_NULL)
+ flst_write_addr(*base, base->frame + boffset + FLST_FIRST,
+ add->page.id().page_no(), aoffset, mtr);
+ else
+ {
+ buf_block_t *block;
+ flst_node_t *prev= fut_get_ptr(add->page.id().space(), add->zip_size(),
+ prev_addr, RW_SX_LATCH, mtr, &block);
+ flst_write_addr(*block, prev + FLST_NEXT,
+ add->page.id().page_no(), aoffset, mtr);
+ }
+
+ flst_write_addr(*cur, cur->frame + coffset + FLST_PREV,
+ add->page.id().page_no(), aoffset, mtr);
+
+ byte *len= &base->frame[boffset + FLST_LEN];
+ mtr->write<4>(*base, len, mach_read_from_4(len) + 1);
+}
+
+/** Initialize a list base node.
+@param[in] block file page
+@param[in,out] base base node
+@param[in,out] mtr mini-transaction */
+void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr)
+{
+ ut_ad(mtr->memo_contains_page_flagged(base, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+ mtr->write<4,mtr_t::MAYBE_NOP>(block, base + FLST_LEN, 0U);
+ static_assert(FLST_LAST == FLST_FIRST + FIL_ADDR_SIZE, "compatibility");
+ flst_zero_both(block, base + FLST_FIRST, mtr);
+}
+
+/** Append a file list node to a list.
+@param[in,out] base base node block
+@param[in] boffset byte offset of the base node
+@param[in,out] add block to be added
+@param[in] aoffset byte offset of the node to be added
+@param[in,out] mtr mini-transaction */
+void flst_add_last(buf_block_t *base, uint16_t boffset,
+ buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+{
+ ut_ad(base != add || boffset != aoffset);
+ ut_ad(boffset < base->physical_size());
+ ut_ad(aoffset < add->physical_size());
+ ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+ ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+
+ if (!flst_get_len(base->frame + boffset))
+ flst_add_to_empty(base, boffset, add, aoffset, mtr);
+ else
+ {
+ fil_addr_t addr= flst_get_last(base->frame + boffset);
+ buf_block_t *cur= add;
+ const flst_node_t *c= addr.page == add->page.id().page_no()
+ ? add->frame + addr.boffset
+ : fut_get_ptr(add->page.id().space(), add->zip_size(), addr,
+ RW_SX_LATCH, mtr, &cur);
+ flst_insert_after(base, boffset, cur,
+ static_cast<uint16_t>(c - cur->frame),
+ add, aoffset, mtr);
+ }
+}
+
+/** Prepend a file list node to a list.
+@param[in,out] base base node block
+@param[in] boffset byte offset of the base node
+@param[in,out] add block to be added
+@param[in] aoffset byte offset of the node to be added
+@param[in,out]	mtr	mini-transaction */
+void flst_add_first(buf_block_t *base, uint16_t boffset,
+ buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+{
+ ut_ad(base != add || boffset != aoffset);
+ ut_ad(boffset < base->physical_size());
+ ut_ad(aoffset < add->physical_size());
+ ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+ ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+
+ if (!flst_get_len(base->frame + boffset))
+ flst_add_to_empty(base, boffset, add, aoffset, mtr);
+ else
+ {
+ fil_addr_t addr= flst_get_first(base->frame + boffset);
+ buf_block_t *cur= add;
+ const flst_node_t *c= addr.page == add->page.id().page_no()
+ ? add->frame + addr.boffset
+ : fut_get_ptr(add->page.id().space(), add->zip_size(), addr,
+ RW_SX_LATCH, mtr, &cur);
+ flst_insert_before(base, boffset, cur,
+ static_cast<uint16_t>(c - cur->frame),
+ add, aoffset, mtr);
+ }
+}
+
+/** Remove a file list node.
+@param[in,out] base base node block
+@param[in] boffset byte offset of the base node
+@param[in,out] cur block to be removed
+@param[in] coffset byte offset of the current record to be removed
+@param[in,out]	mtr	mini-transaction */
+void flst_remove(buf_block_t *base, uint16_t boffset,
+ buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
+{
+ ut_ad(boffset < base->physical_size());
+ ut_ad(coffset < cur->physical_size());
+ ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+ ut_ad(mtr->memo_contains_flagged(cur, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+
+ const fil_addr_t prev_addr= flst_get_prev_addr(cur->frame + coffset);
+ const fil_addr_t next_addr= flst_get_next_addr(cur->frame + coffset);
+
+ if (prev_addr.page == FIL_NULL)
+ flst_write_addr(*base, base->frame + boffset + FLST_FIRST,
+ next_addr.page, next_addr.boffset, mtr);
+ else
+ {
+ buf_block_t *block= cur;
+ flst_node_t *prev= prev_addr.page == cur->page.id().page_no()
+ ? cur->frame + prev_addr.boffset
+ : fut_get_ptr(cur->page.id().space(), cur->zip_size(), prev_addr,
+ RW_SX_LATCH, mtr, &block);
+ flst_write_addr(*block, prev + FLST_NEXT,
+ next_addr.page, next_addr.boffset, mtr);
+ }
+
+ if (next_addr.page == FIL_NULL)
+ flst_write_addr(*base, base->frame + boffset + FLST_LAST,
+ prev_addr.page, prev_addr.boffset, mtr);
+ else
+ {
+ buf_block_t *block= cur;
+ flst_node_t *next= next_addr.page == cur->page.id().page_no()
+ ? cur->frame + next_addr.boffset
+ : fut_get_ptr(cur->page.id().space(), cur->zip_size(), next_addr,
+ RW_SX_LATCH, mtr, &block);
+ flst_write_addr(*block, next + FLST_PREV,
+ prev_addr.page, prev_addr.boffset, mtr);
+ }
+
+ byte *len= &base->frame[boffset + FLST_LEN];
+ ut_ad(mach_read_from_4(len) > 0);
+ mtr->write<4>(*base, len, mach_read_from_4(len) - 1);
+}
+
+#ifdef UNIV_DEBUG
+/** Validate a file-based list. */
+void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr)
+{
+ ut_ad(boffset < base->physical_size());
+ ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+
+ /* We use two mini-transaction handles: the first is used to lock
+ the base node, and prevent other threads from modifying the list.
+ The second is used to traverse the list. We cannot run the second
+ mtr without committing it at times, because if the list is long,
+ the x-locked pages could fill the buffer, resulting in a deadlock. */
+ mtr_t mtr2;
+
+ const uint32_t len= flst_get_len(base->frame + boffset);
+ fil_addr_t addr= flst_get_first(base->frame + boffset);
+
+ for (uint32_t i= len; i--; )
+ {
+ mtr2.start();
+ const flst_node_t *node= fut_get_ptr(base->page.id().space(),
+ base->zip_size(), addr,
+ RW_SX_LATCH, &mtr2);
+ addr= flst_get_next_addr(node);
+ mtr2.commit();
+ }
+
+ ut_ad(addr.page == FIL_NULL);
+
+ addr= flst_get_last(base->frame + boffset);
+
+ for (uint32_t i= len; i--; )
+ {
+ mtr2.start();
+ const flst_node_t *node= fut_get_ptr(base->page.id().space(),
+ base->zip_size(), addr,
+ RW_SX_LATCH, &mtr2);
+ addr= flst_get_prev_addr(node);
+ mtr2.commit();
+ }
+
+ ut_ad(addr.page == FIL_NULL);
+}
+#endif
diff --git a/storage/innobase/gis/gis0geo.cc b/storage/innobase/gis/gis0geo.cc
new file mode 100644
index 00000000..4c3ff188
--- /dev/null
+++ b/storage/innobase/gis/gis0geo.cc
@@ -0,0 +1,650 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file gis/gis0geo.cc
+InnoDB R-tree related functions.
+
+Created 2013/03/27 Allen Lai and Jimmy Yang
+*******************************************************/
+
+#include "page0types.h"
+#include "gis0geo.h"
+#include "page0cur.h"
+#include "ut0rnd.h"
+#include "mach0data.h"
+
+#include <spatial.h>
+#include <cmath>
+
+/* These definitions are for comparing 2 mbrs. */
+
+/* Check if a intersects b.
+Return false if a intersects b, otherwise true. */
+#define INTERSECT_CMP(amin, amax, bmin, bmax) \
+(((amin) > (bmax)) || ((bmin) > (amax)))
+
+/* Check if b contains a.
+Return false if b contains a, otherwise true. */
+#define CONTAIN_CMP(amin, amax, bmin, bmax) \
+(((bmin) > (amin)) || ((bmax) < (amax)))
+
+/* Check if b is within a.
+Return false if b is within a, otherwise true. */
+#define WITHIN_CMP(amin, amax, bmin, bmax) \
+(((amin) > (bmin)) || ((amax) < (bmax)))
+
+/* Check if a is disjoint from b.
+Return false if a and b are disjoint, otherwise true. */
+#define DISJOINT_CMP(amin, amax, bmin, bmax) \
+(((amin) <= (bmax)) && ((bmin) <= (amax)))
+
+/* Check if a equals b.
+Return false if equal, otherwise true. */
+#define EQUAL_CMP(amin, amax, bmin, bmax) \
+(((amin) != (bmin)) || ((amax) != (bmax)))
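+
+/* A quick sanity check of the inverted convention above, using the 1-D
+intervals a = [1, 4] and b = [3, 6] (illustration only):
+
+  static_assert(!INTERSECT_CMP(1, 4, 3, 6), "a and b do intersect");
+  static_assert(DISJOINT_CMP(1, 4, 3, 6), "a and b are not disjoint");
+  static_assert(CONTAIN_CMP(1, 4, 3, 6), "b does not contain a");
+
+Each macro evaluates to false exactly when its predicate holds; this
+"false means the predicate holds" convention is what rtree_key_cmp()
+below relies on. */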
+
+/****************************************************************
+Functions for generating mbr
+****************************************************************/
+/*************************************************************//**
+Add one point stored in wkb to a given mbr.
+@return 0 if the point in wkb is valid, otherwise -1. */
+static
+int
+rtree_add_point_to_mbr(
+/*===================*/
+ const uchar** wkb, /*!< in: pointer to wkb,
+ where point is stored */
+ const uchar* end, /*!< in: end of wkb. */
+ uint n_dims, /*!< in: dimensions. */
+ double* mbr) /*!< in/out: mbr, which
+ must be of length n_dims * 2. */
+{
+ double ord;
+ double* mbr_end = mbr + n_dims * 2;
+
+ while (mbr < mbr_end) {
+ if ((*wkb) + sizeof(double) > end) {
+ return(-1);
+ }
+
+ ord = mach_double_read(*wkb);
+ (*wkb) += sizeof(double);
+
+ if (ord < *mbr) {
+ *mbr = ord;
+ }
+ mbr++;
+
+ if (ord > *mbr) {
+ *mbr = ord;
+ }
+ mbr++;
+ }
+
+ return(0);
+}
+
+/*************************************************************//**
+Get mbr of point stored in wkb.
+@return 0 if ok, otherwise -1. */
+static
+int
+rtree_get_point_mbr(
+/*================*/
+ const uchar** wkb, /*!< in: pointer to wkb,
+ where point is stored. */
+ const uchar* end, /*!< in: end of wkb. */
+ uint n_dims, /*!< in: dimensions. */
+ double* mbr) /*!< in/out: mbr,
+ must be of length n_dims * 2. */
+{
+ return rtree_add_point_to_mbr(wkb, end, n_dims, mbr);
+}
+
+
+/*************************************************************//**
+Get mbr of linestring stored in wkb.
+@return 0 if the linestring is valid, otherwise -1. */
+static
+int
+rtree_get_linestring_mbr(
+/*=====================*/
+ const uchar** wkb, /*!< in: pointer to wkb,
+ where point is stored. */
+ const uchar* end, /*!< in: end of wkb. */
+ uint n_dims, /*!< in: dimensions. */
+ double* mbr) /*!< in/out: mbr,
+ must be of length n_dims * 2. */
+{
+ uint n_points;
+
+ n_points = uint4korr(*wkb);
+ (*wkb) += 4;
+
+ for (; n_points > 0; --n_points) {
+ /* Add next point to mbr */
+ if (rtree_add_point_to_mbr(wkb, end, n_dims, mbr)) {
+ return(-1);
+ }
+ }
+
+ return(0);
+}
+
+/*************************************************************//**
+Get mbr of polygon stored in wkb.
+@return 0 if the polygon is valid, otherwise -1. */
+static
+int
+rtree_get_polygon_mbr(
+/*==================*/
+ const uchar** wkb, /*!< in: pointer to wkb,
+ where point is stored. */
+ const uchar* end, /*!< in: end of wkb. */
+ uint n_dims, /*!< in: dimensions. */
+ double* mbr) /*!< in/out: mbr,
+ must be of length n_dims * 2. */
+{
+ uint n_linear_rings;
+ uint n_points;
+
+ n_linear_rings = uint4korr((*wkb));
+ (*wkb) += 4;
+
+ for (; n_linear_rings > 0; --n_linear_rings) {
+ n_points = uint4korr((*wkb));
+ (*wkb) += 4;
+
+ for (; n_points > 0; --n_points) {
+ /* Add next point to mbr */
+ if (rtree_add_point_to_mbr(wkb, end, n_dims, mbr)) {
+ return(-1);
+ }
+ }
+ }
+
+ return(0);
+}
+
+/*************************************************************//**
+Get mbr of geometry stored in wkb.
+@return 0 if the geometry is valid, otherwise -1. */
+static
+int
+rtree_get_geometry_mbr(
+/*===================*/
+ const uchar** wkb, /*!< in: pointer to wkb,
+ where point is stored. */
+ const uchar* end, /*!< in: end of wkb. */
+ uint n_dims, /*!< in: dimensions. */
+ double* mbr, /*!< in/out: mbr. */
+ int top) /*!< in: nonzero for the outermost call,
+ i.e. when it is not being called
+ recursively. */
+{
+ int res;
+ uint wkb_type = 0;
+ uint n_items;
+
+ /* byte_order = *(*wkb); */
+ ++(*wkb);
+
+ wkb_type = uint4korr((*wkb));
+ (*wkb) += 4;
+
+ switch ((enum wkbType) wkb_type) {
+ case wkbPoint:
+ res = rtree_get_point_mbr(wkb, end, n_dims, mbr);
+ break;
+ case wkbLineString:
+ res = rtree_get_linestring_mbr(wkb, end, n_dims, mbr);
+ break;
+ case wkbPolygon:
+ res = rtree_get_polygon_mbr(wkb, end, n_dims, mbr);
+ break;
+ case wkbMultiPoint:
+ n_items = uint4korr((*wkb));
+ (*wkb) += 4;
+ for (; n_items > 0; --n_items) {
+ /* byte_order = *(*wkb); */
+ ++(*wkb);
+ (*wkb) += 4;
+ if (rtree_get_point_mbr(wkb, end, n_dims, mbr)) {
+ return(-1);
+ }
+ }
+ res = 0;
+ break;
+ case wkbMultiLineString:
+ n_items = uint4korr((*wkb));
+ (*wkb) += 4;
+ for (; n_items > 0; --n_items) {
+ /* byte_order = *(*wkb); */
+ ++(*wkb);
+ (*wkb) += 4;
+ if (rtree_get_linestring_mbr(wkb, end, n_dims, mbr)) {
+ return(-1);
+ }
+ }
+ res = 0;
+ break;
+ case wkbMultiPolygon:
+ n_items = uint4korr((*wkb));
+ (*wkb) += 4;
+ for (; n_items > 0; --n_items) {
+ /* byte_order = *(*wkb); */
+ ++(*wkb);
+ (*wkb) += 4;
+ if (rtree_get_polygon_mbr(wkb, end, n_dims, mbr)) {
+ return(-1);
+ }
+ }
+ res = 0;
+ break;
+ case wkbGeometryCollection:
+ if (!top) {
+ return(-1);
+ }
+
+ n_items = uint4korr((*wkb));
+ (*wkb) += 4;
+ for (; n_items > 0; --n_items) {
+ if (rtree_get_geometry_mbr(wkb, end, n_dims,
+ mbr, 0)) {
+ return(-1);
+ }
+ }
+ res = 0;
+ break;
+ default:
+ res = -1;
+ }
+
+ return(res);
+}
+
+/*************************************************************//**
+Calculate Minimal Bounding Rectangle (MBR) of the spatial object
+stored in "well-known binary representation" (wkb) format.
+@return 0 if ok. */
+int
+rtree_mbr_from_wkb(
+/*===============*/
+ const uchar* wkb, /*!< in: wkb */
+ uint size, /*!< in: size of wkb. */
+ uint n_dims, /*!< in: dimensions. */
+ double* mbr) /*!< in/out: mbr, which must
+ be of length n_dims * 2. */
+{
+ for (uint i = 0; i < n_dims; ++i) {
+ mbr[i * 2] = DBL_MAX;
+ mbr[i * 2 + 1] = -DBL_MAX;
+ }
+
+ return rtree_get_geometry_mbr(&wkb, wkb + size, n_dims, mbr, 1);
+}
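+
+/* Worked example, assuming standard OGC WKB (which is what the parser
+above consumes; note that the byte-order byte is skipped without being
+interpreted): a little-endian POINT(3 5) is encoded as
+
+  01                       byte order
+  01 00 00 00              wkbPoint
+  00 00 00 00 00 00 08 40  x = 3.0
+  00 00 00 00 00 00 14 40  y = 5.0
+
+With n_dims == 2 the MBR starts as {DBL_MAX, -DBL_MAX, DBL_MAX,
+-DBL_MAX} and collapses to the degenerate rectangle {3.0, 3.0, 5.0,
+5.0}, stored dimension-major as xmin, xmax, ymin, ymax. */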
+
+
+/****************************************************************
+Functions for Rtree split
+****************************************************************/
+/*************************************************************//**
+Join 2 mbrs of dimensions n_dim. */
+static
+void
+mbr_join(
+/*=====*/
+ double* a, /*!< in/out: the first mbr,
+ where the joined result will be. */
+ const double* b, /*!< in: the second mbr. */
+ int n_dim) /*!< in: dimensions. */
+{
+ double* end = a + n_dim * 2;
+
+ do {
+ if (a[0] > b[0]) {
+ a[0] = b[0];
+ }
+
+ if (a[1] < b[1]) {
+ a[1] = b[1];
+ }
+
+ a += 2;
+ b += 2;
+
+ } while (a != end);
+}
+
+/*************************************************************//**
+Computes the area ("square") of the MBR that is the join of a and b.
+Both a and b have n_dim dimensions. */
+static
+double
+mbr_join_square(
+/*============*/
+ const double* a, /*!< in: the first mbr. */
+ const double* b, /*!< in: the second mbr. */
+ int n_dim) /*!< in: dimensions. */
+{
+ const double* end = a + n_dim * 2;
+ double square = 1.0;
+
+ do {
+ square *= std::max(a[1], b[1]) - std::min(a[0], b[0]);
+
+ a += 2;
+ b += 2;
+ } while (a != end);
+
+ /* Check if finite (not infinity or NaN),
+ so we don't get NaN in calculations */
+ if (!std::isfinite(square)) {
+ return DBL_MAX;
+ }
+
+ return square;
+}
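+
+/* For example, joining a = [0,1] x [0,1] with b = [2,4] x [0,1]
+(stored dimension-major as {0,1,0,1} and {2,4,0,1}) gives
+(max(1,4) - min(0,2)) * (max(1,1) - min(0,0)) = 4.0 * 1.0 = 4.0,
+even though the rectangles themselves are disjoint: the join square
+measures the covering MBR, including any dead space between them. */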
+
+/*************************************************************//**
+Computes the area ("square") of an MBR of n_dim dimensions. */
+static
+double
+count_square(
+/*=========*/
+ const double* a, /*!< in: the mbr. */
+ int n_dim) /*!< in: dimensions. */
+{
+ const double* end = a + n_dim * 2;
+ double square = 1.0;
+
+ do {
+ square *= a[1] - a[0];
+ a += 2;
+ } while (a != end);
+
+ return square;
+}
+
+/*************************************************************//**
+Copy mbr of dimension n_dim from src to dst. */
+inline
+static
+void
+copy_coords(
+/*========*/
+ double* dst, /*!< in/out: destination. */
+ const double* src, /*!< in: source. */
+ int)
+{
+ memcpy(dst, src, DATA_MBR_LEN);
+}
+
+/*************************************************************//**
+Select the two seed nodes around which the two groups are collected. */
+static
+void
+pick_seeds(
+/*=======*/
+ rtr_split_node_t* node, /*!< in: split nodes. */
+ int n_entries, /*!< in: entries number. */
+ rtr_split_node_t** seed_a, /*!< out: seed 1. */
+ rtr_split_node_t** seed_b, /*!< out: seed 2. */
+ int n_dim) /*!< in: dimensions. */
+{
+ rtr_split_node_t* cur1;
+ rtr_split_node_t* lim1 = node + (n_entries - 1);
+ rtr_split_node_t* cur2;
+ rtr_split_node_t* lim2 = node + n_entries;
+
+ double max_d = -DBL_MAX;
+ double d;
+
+ *seed_a = node;
+ *seed_b = node + 1;
+
+ for (cur1 = node; cur1 < lim1; ++cur1) {
+ for (cur2 = cur1 + 1; cur2 < lim2; ++cur2) {
+ d = mbr_join_square(cur1->coords, cur2->coords, n_dim) -
+ cur1->square - cur2->square;
+ if (d > max_d) {
+ max_d = d;
+ *seed_a = cur1;
+ *seed_b = cur2;
+ }
+ }
+ }
+}
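+
+/* This is the PickSeeds step of Guttman's quadratic split: the pair
+whose covering MBR wastes the most area, d = area(join(a, b)) -
+area(a) - area(b), seeds the two groups. Continuing the example above,
+a = {0,1,0,1} and b = {2,4,0,1} give d = 4.0 - 1.0 - 2.0 = 1.0,
+exactly the dead space between the two rectangles. */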
+
+/*************************************************************//**
+Select the next node to assign, and the group to add it to. */
+static
+void
+pick_next(
+/*======*/
+ rtr_split_node_t* node, /*!< in: split nodes. */
+ int n_entries, /*!< in: entries number. */
+ double* g1, /*!< in: mbr of group 1. */
+ double* g2, /*!< in: mbr of group 2. */
+ rtr_split_node_t** choice, /*!< out: the next node.*/
+ int* n_group, /*!< out: group number.*/
+ int n_dim) /*!< in: dimensions. */
+{
+ rtr_split_node_t* cur = node;
+ rtr_split_node_t* end = node + n_entries;
+ double max_diff = -DBL_MAX;
+
+ for (; cur < end; ++cur) {
+ double diff;
+ double abs_diff;
+
+ if (cur->n_node != 0) {
+ continue;
+ }
+
+ diff = mbr_join_square(g1, cur->coords, n_dim) -
+ mbr_join_square(g2, cur->coords, n_dim);
+
+ abs_diff = fabs(diff);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+
+ /* Introduce some randomness if the
+ enlargement costs are identical */
+ if (diff == 0) {
+ diff = static_cast<double>(ut_rnd_gen() & 1);
+ }
+
+ *n_group = 1 + (diff > 0);
+ *choice = cur;
+ }
+ }
+}
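+
+/* This is the matching PickNext step: the unassigned entry whose two
+enlargement costs differ the most is the one whose placement matters
+most, so it is chosen first. diff > 0 means joining with g1 would cost
+more, so *n_group = 1 + (diff > 0) sends the entry to group 2; exact
+ties are broken by a random bit to avoid degenerate splits. */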
+
+/*************************************************************//**
+Mark all entries not yet assigned to a group as belonging to n_group. */
+static
+void
+mark_all_entries(
+/*=============*/
+ rtr_split_node_t* node, /*!< in/out: split nodes. */
+ int n_entries, /*!< in: entries number. */
+ int n_group) /*!< in: group number. */
+{
+ rtr_split_node_t* cur = node;
+ rtr_split_node_t* end = node + n_entries;
+ for (; cur < end; ++cur) {
+ if (cur->n_node != 0) {
+ continue;
+ }
+ cur->n_node = n_group;
+ }
+}
+
+/*************************************************************//**
+Split an R-tree node.
+Return the group that the first rec is assigned to. */
+int
+split_rtree_node(
+/*=============*/
+ rtr_split_node_t* node, /*!< in: split nodes. */
+ int n_entries, /*!< in: entries number. */
+ int all_size, /*!< in: total key's size. */
+ int key_size, /*!< in: key's size. */
+ int min_size, /*!< in: minimal group size. */
+ int size1, /*!< in: initial size of group 1. */
+ int size2, /*!< in: initial size of group 2. */
+ double** d_buffer, /*!< in/out: buffer. */
+ int n_dim, /*!< in: dimensions. */
+ uchar* first_rec) /*!< in: the first rec. */
+{
+ rtr_split_node_t* cur;
+ rtr_split_node_t* a = NULL;
+ rtr_split_node_t* b = NULL;
+ double* g1 = reserve_coords(d_buffer, n_dim);
+ double* g2 = reserve_coords(d_buffer, n_dim);
+ rtr_split_node_t* next = NULL;
+ int next_node = 0;
+ int i;
+ int first_rec_group = 1;
+ rtr_split_node_t* end = node + n_entries;
+
+ if (all_size < min_size * 2) {
+ return 1;
+ }
+
+ cur = node;
+ for (; cur < end; ++cur) {
+ cur->square = count_square(cur->coords, n_dim);
+ cur->n_node = 0;
+ }
+
+ pick_seeds(node, n_entries, &a, &b, n_dim);
+ a->n_node = 1;
+ b->n_node = 2;
+
+ copy_coords(g1, a->coords, n_dim);
+ size1 += key_size;
+ copy_coords(g2, b->coords, n_dim);
+ size2 += key_size;
+
+ for (i = n_entries - 2; i > 0; --i) {
+ /* Can't write into group 2 */
+ if (all_size - (size2 + key_size) < min_size) {
+ mark_all_entries(node, n_entries, 1);
+ break;
+ }
+
+ /* Can't write into group 1 */
+ if (all_size - (size1 + key_size) < min_size) {
+ mark_all_entries(node, n_entries, 2);
+ break;
+ }
+
+ pick_next(node, n_entries, g1, g2, &next, &next_node, n_dim);
+ if (next_node == 1) {
+ size1 += key_size;
+ mbr_join(g1, next->coords, n_dim);
+ } else {
+ size2 += key_size;
+ mbr_join(g2, next->coords, n_dim);
+ }
+
+ next->n_node = next_node;
+
+ /* Find out where the first rec (of the page) will be at,
+ and inform the caller */
+ if (first_rec && first_rec == next->key) {
+ first_rec_group = next_node;
+ }
+ }
+
+ return(first_rec_group);
+}
+
+/** Compare two minimum bounding rectangles.
+@param mode comparison operator
+ MBR_INTERSECT(a,b) a overlaps b
+ MBR_CONTAIN(a,b) a contains b
+ MBR_DISJOINT(a,b) a disjoint b
+ MBR_WITHIN(a,b) a within b
+ MBR_EQUAL(a,b) All coordinates of MBRs are equal
+ MBR_DATA(a,b) Data reference is the same
+@param b first MBR
+@param a second MBR
+@retval 0 if the predicate holds
+@retval 1 if the predicate does not hold */
+int rtree_key_cmp(page_cur_mode_t mode, const void *b, const void *a)
+{
+ const byte *b_= static_cast<const byte*>(b);
+ const byte *a_= static_cast<const byte*>(a);
+
+ static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double), "compatibility");
+
+ for (auto i = SPDIMS; i--; )
+ {
+ double amin= mach_double_read(a_);
+ double bmin= mach_double_read(b_);
+ a_+= sizeof(double);
+ b_+= sizeof(double);
+ double amax= mach_double_read(a_);
+ double bmax= mach_double_read(b_);
+ a_+= sizeof(double);
+ b_+= sizeof(double);
+
+ switch (mode) {
+ case PAGE_CUR_INTERSECT:
+ if (INTERSECT_CMP(amin, amax, bmin, bmax))
+ return 1;
+ continue;
+ case PAGE_CUR_CONTAIN:
+ if (CONTAIN_CMP(amin, amax, bmin, bmax))
+ return 1;
+ continue;
+ case PAGE_CUR_WITHIN:
+ if (WITHIN_CMP(amin, amax, bmin, bmax))
+ return 1;
+ continue;
+ case PAGE_CUR_MBR_EQUAL:
+ if (EQUAL_CMP(amin, amax, bmin, bmax))
+ return 1;
+ continue;
+ case PAGE_CUR_DISJOINT:
+ if (!DISJOINT_CMP(amin, amax, bmin, bmax))
+ return 0;
+ if (!i)
+ return 1;
+ continue;
+ case PAGE_CUR_UNSUPP:
+ case PAGE_CUR_G:
+ case PAGE_CUR_GE:
+ case PAGE_CUR_L:
+ case PAGE_CUR_LE:
+ case PAGE_CUR_RTREE_LOCATE:
+ case PAGE_CUR_RTREE_GET_FATHER:
+ case PAGE_CUR_RTREE_INSERT:
+ break;
+ }
+ ut_ad("unknown comparison operator" == 0);
+ }
+
+ return 0;
+}
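+
+/* Note the asymmetry for PAGE_CUR_DISJOINT above: two MBRs are
+disjoint as soon as they are separated in any single dimension, so the
+loop may return 0 ("holds") early, but can return 1 only after all
+SPDIMS dimensions have been seen to overlap (the !i case). Every other
+predicate must hold in all dimensions and therefore fails fast
+instead. For example, a = [0,1] x [0,10] and b = [5,6] x [0,10]
+overlap in y but are separated in x, hence disjoint. */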
diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc
new file mode 100644
index 00000000..22e6e08f
--- /dev/null
+++ b/storage/innobase/gis/gis0rtree.cc
@@ -0,0 +1,1956 @@
+/*****************************************************************************
+
+Copyright (c) 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file gis/gis0rtree.cc
+InnoDB R-tree interfaces
+
+Created 2013/03/27 Allen Lai and Jimmy Yang
+***********************************************************************/
+
+#include "fsp0fsp.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "page0zip.h"
+#include "gis0rtree.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "srv0mon.h"
+#include "gis0geo.h"
+#include <cmath>
+
+/*************************************************************//**
+Initialize the split node info for an R-tree split.
+@return initialized split nodes array */
+static
+rtr_split_node_t*
+rtr_page_split_initialize_nodes(
+/*============================*/
+ mem_heap_t* heap, /*!< in: pointer to memory heap, or NULL */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ rec_offs** offsets,/*!< in: offsets on inserted record */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ double** buf_pos)/*!< in/out: current buffer position */
+{
+ rtr_split_node_t* split_node_array;
+ double* buf;
+ ulint n_recs;
+ rtr_split_node_t* task;
+ rtr_split_node_t* stop;
+ rtr_split_node_t* cur;
+ rec_t* rec;
+ buf_block_t* block;
+ page_t* page;
+ ulint n_uniq;
+ ulint len;
+ const byte* source_cur;
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
+
+ n_recs = ulint(page_get_n_recs(page)) + 1;
+
+ /* We reserve memory space for two MBRs that hold the temporary
+ result of the split algorithm; together with the new MBR that
+ needs to be inserted, we need (n_recs + 3) * MBR size for
+ storing all MBRs. */
+ buf = static_cast<double*>(mem_heap_alloc(
+ heap, DATA_MBR_LEN * (n_recs + 3)
+ + sizeof(rtr_split_node_t) * (n_recs + 1)));
+
+ split_node_array = (rtr_split_node_t*)(buf + SPDIMS * 2 * (n_recs + 3));
+ task = split_node_array;
+ *buf_pos = buf;
+ stop = task + n_recs;
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ const ulint n_core = page_is_leaf(page)
+ ? cursor->index->n_core_fields : 0;
+ *offsets = rec_get_offsets(rec, cursor->index, *offsets, n_core,
+ n_uniq, &heap);
+
+ source_cur = rec_get_nth_field(rec, *offsets, 0, &len);
+
+ for (cur = task; cur < stop - 1; ++cur) {
+ cur->coords = reserve_coords(buf_pos, SPDIMS);
+ cur->key = rec;
+
+ memcpy(cur->coords, source_cur, DATA_MBR_LEN);
+
+ rec = page_rec_get_next(rec);
+ *offsets = rec_get_offsets(rec, cursor->index, *offsets,
+ n_core, n_uniq, &heap);
+ source_cur = rec_get_nth_field(rec, *offsets, 0, &len);
+ }
+
+ /* Put the insert key into the node list */
+ source_cur = static_cast<const byte*>(dfield_get_data(
+ dtuple_get_nth_field(tuple, 0)));
+ cur->coords = reserve_coords(buf_pos, SPDIMS);
+ rec = (byte*) mem_heap_alloc(
+ heap, rec_get_converted_size(cursor->index, tuple, 0));
+
+ rec = rec_convert_dtuple_to_rec(rec, cursor->index, tuple, 0);
+ cur->key = rec;
+
+ memcpy(cur->coords, source_cur, DATA_MBR_LEN);
+
+ return split_node_array;
+}
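+
+/* The single heap allocation above is laid out as
+
+  [ (n_recs + 3) * DATA_MBR_LEN bytes of MBR coordinates |
+    (n_recs + 1) rtr_split_node_t entries ]
+
+reserve_coords() carves MBR slots from the front of the buffer; the
+extra slots beyond the per-key MBRs become the scratch group MBRs g1
+and g2 inside split_rtree_node(). */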
+
+/**********************************************************************//**
+Builds an R-tree node pointer out of a physical record and a page number.
+Note: for an R-tree, non-leaf pages keep only the MBR and the page number
+field, unlike a B-tree, whose node pointers still include the PK fields.
+@return own: node pointer */
+dtuple_t*
+rtr_index_build_node_ptr(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ const rtr_mbr_t* mbr, /*!< in: mbr of lower page */
+ const rec_t* rec, /*!< in: record for which to build node
+ pointer */
+ ulint page_no,/*!< in: page number to put in node
+ pointer */
+ mem_heap_t* heap) /*!< in: memory heap where pointer
+ created */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ byte* buf;
+ ulint n_unique;
+ ulint info_bits;
+
+ ut_ad(dict_index_is_spatial(index));
+
+ n_unique = DICT_INDEX_SPATIAL_NODEPTR_SIZE;
+
+ tuple = dtuple_create(heap, n_unique + 1);
+
+ /* For an rtree internal node, we need to compare the page
+ number fields. */
+ dtuple_set_n_fields_cmp(tuple, n_unique + 1);
+
+ dict_index_copy_types(tuple, index, n_unique);
+
+ /* Write page no field */
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ mach_write_to_4(buf, page_no);
+
+ field = dtuple_get_nth_field(tuple, n_unique);
+ dfield_set_data(field, buf, 4);
+
+ dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4);
+
+ /* Set info bits. */
+ info_bits = rec_get_info_bits(rec, dict_table_is_comp(index->table));
+ dtuple_set_info_bits(tuple, info_bits | REC_STATUS_NODE_PTR);
+
+ /* Set mbr as index entry data */
+ field = dtuple_get_nth_field(tuple, 0);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_MBR_LEN));
+
+ rtr_write_mbr(buf, mbr);
+
+ dfield_set_data(field, buf, DATA_MBR_LEN);
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ return(tuple);
+}
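+
+/* Sketch of the node pointer tuple built above:
+
+  field 0: MBR      DATA_MBR_LEN bytes (xmin, xmax, ymin, ymax)
+  field 1: page no  4 bytes, the child page number
+
+Because no PK fields follow the MBR, two node pointers can compare
+equal on the key part alone; this is why dtuple_set_n_fields_cmp()
+includes the page number field in comparisons. */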
+
+/**************************************************************//**
+Update the mbr field of a spatial index row.
+@return true if update is successful */
+bool
+rtr_update_mbr_field(
+/*=================*/
+ btr_cur_t* cursor, /*!< in/out: cursor pointed to rec.*/
+ rec_offs* offsets, /*!< in/out: offsets on rec. */
+ btr_cur_t* cursor2, /*!< in/out: cursor pointed to rec
+ that should be deleted.
+ this cursor is for btr_compress to
+ delete the merged page's father rec.*/
+ page_t* child_page, /*!< in: child page. */
+ rtr_mbr_t* mbr, /*!< in: the new mbr. */
+ rec_t* new_rec, /*!< in: rec to use */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index = cursor->index;
+ mem_heap_t* heap;
+ page_t* page;
+ rec_t* rec;
+ constexpr ulint flags = BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG;
+ dberr_t err;
+ big_rec_t* dummy_big_rec;
+ buf_block_t* block;
+ rec_t* child_rec;
+ ulint up_match = 0;
+ ulint low_match = 0;
+ ulint child;
+ ulint rec_info;
+ bool ins_suc = true;
+ ulint cur2_pos = 0;
+ ulint del_page_no = 0;
+ rec_offs* offsets2;
+
+ rec = btr_cur_get_rec(cursor);
+ page = page_align(rec);
+
+ rec_info = rec_get_info_bits(rec, rec_offs_comp(offsets));
+
+ heap = mem_heap_create(100);
+ block = btr_cur_get_block(cursor);
+ ut_ad(page == buf_block_get_frame(block));
+
+ child = btr_node_ptr_get_child_page_no(rec, offsets);
+ const ulint n_core = page_is_leaf(block->frame)
+ ? index->n_core_fields : 0;
+
+ if (new_rec) {
+ child_rec = new_rec;
+ } else {
+ child_rec = page_rec_get_next(page_get_infimum_rec(child_page));
+ }
+
+ dtuple_t* node_ptr = rtr_index_build_node_ptr(
+ index, mbr, child_rec, child, heap);
+
+ /* We need to remember the child page no of cursor2, since the page
+ could be reorganized, or a new rec could be inserted before it. */
+ if (cursor2) {
+ rec_t* del_rec = btr_cur_get_rec(cursor2);
+ offsets2 = rec_get_offsets(btr_cur_get_rec(cursor2),
+ index, NULL, 0,
+ ULINT_UNDEFINED, &heap);
+ del_page_no = btr_node_ptr_get_child_page_no(del_rec, offsets2);
+ cur2_pos = page_rec_get_n_recs_before(btr_cur_get_rec(cursor2));
+ }
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_base(offsets)[0 + 1] == DATA_MBR_LEN);
+ ut_ad(node_ptr->fields[0].len == DATA_MBR_LEN);
+
+ if (rec_info & REC_INFO_MIN_REC_FLAG) {
+ /* When the rec is the minimum rec on this level, we update
+ it in place, to avoid moving it to another place. */
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ /* Check if there is enough space for an
+ in-place update of the zip page. */
+ if (!btr_cur_update_alloc_zip(
+ page_zip,
+ btr_cur_get_page_cur(cursor),
+ index, offsets,
+ rec_offs_size(offsets),
+ false, mtr)) {
+
+ /* If there is not enough space for an
+ in-place update of the zip page, we do a
+ delete+insert. */
+ ins_suc = false;
+
+ /* Since btr_cur_update_alloc_zip could
+ reorganize the page, we need to reposition
+ cursor2. */
+ if (cursor2) {
+ cursor2->page_cur.rec =
+ page_rec_get_nth(page,
+ cur2_pos);
+ }
+
+ goto update_mbr;
+ }
+
+ /* Record could be repositioned */
+ rec = btr_cur_get_rec(cursor);
+
+#ifdef UNIV_DEBUG
+ /* Make sure it is still the first record */
+ rec_info = rec_get_info_bits(
+ rec, rec_offs_comp(offsets));
+ ut_ad(rec_info & REC_INFO_MIN_REC_FLAG);
+#endif /* UNIV_DEBUG */
+ memcpy(rec, node_ptr->fields[0].data, DATA_MBR_LEN);
+ page_zip_write_rec(block, rec, index, offsets, 0, mtr);
+ } else {
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*block, rec,
+ node_ptr->fields[0].data,
+ DATA_MBR_LEN);
+ }
+
+ if (cursor2) {
+ rec_offs* offsets2;
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ cursor2->page_cur.rec
+ = page_rec_get_nth(page, cur2_pos);
+ }
+ offsets2 = rec_get_offsets(btr_cur_get_rec(cursor2),
+ index, NULL, 0,
+ ULINT_UNDEFINED, &heap);
+ ut_ad(del_page_no == btr_node_ptr_get_child_page_no(
+ cursor2->page_cur.rec,
+ offsets2));
+
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor2),
+ index, offsets2, mtr);
+ }
+ } else if (page_get_n_recs(page) == 1) {
+ /* When there's only one rec in the page, we do insert/delete to
+ avoid page merge. */
+
+ page_cur_t page_cur;
+ rec_t* insert_rec;
+ rec_offs* insert_offsets = NULL;
+ ulint old_pos;
+ rec_t* old_rec;
+
+ ut_ad(cursor2 == NULL);
+
+ /* Insert the new mbr rec. */
+ old_pos = page_rec_get_n_recs_before(rec);
+
+ err = btr_cur_optimistic_insert(
+ flags,
+ cursor, &insert_offsets, &heap,
+ node_ptr, &insert_rec, &dummy_big_rec, 0, NULL, mtr);
+
+ ut_ad(err == DB_SUCCESS);
+
+ btr_cur_position(index, insert_rec, block, cursor);
+
+ /* Delete the old mbr rec. */
+ old_rec = page_rec_get_nth(page, old_pos);
+ ut_ad(old_rec != insert_rec);
+
+ page_cur_position(old_rec, block, &page_cur);
+ offsets2 = rec_get_offsets(old_rec, index, NULL, n_core,
+ ULINT_UNDEFINED, &heap);
+ page_cur_delete_rec(&page_cur, index, offsets2, mtr);
+
+ } else {
+update_mbr:
+ /* When there is more than one rec in the page, we do a
+ delete/insert to avoid a page split. */
+ rec_t* insert_rec;
+ rec_offs* insert_offsets = NULL;
+ rec_t* next_rec;
+
+ /* Delete the rec which cursor points to. */
+ next_rec = page_rec_get_next(rec);
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ index, offsets, mtr);
+ if (!ins_suc) {
+ ut_ad(rec_info & REC_INFO_MIN_REC_FLAG);
+
+ btr_set_min_rec_mark(next_rec, *block, mtr);
+ }
+
+ /* If there is more than 1 rec left in the page, delete
+ the rec which cursor2 points to. Otherwise, delete it later. */
+ if (cursor2 && page_get_n_recs(page) > 1) {
+ ulint cur2_rec_info;
+ rec_t* cur2_rec;
+
+ cur2_rec = cursor2->page_cur.rec;
+ offsets2 = rec_get_offsets(cur2_rec, index, NULL,
+ n_core,
+ ULINT_UNDEFINED, &heap);
+
+ cur2_rec_info = rec_get_info_bits(cur2_rec,
+ rec_offs_comp(offsets2));
+ if (cur2_rec_info & REC_INFO_MIN_REC_FLAG) {
+ /* If we delete the leftmost node
+ pointer on a non-leaf level, we must
+ mark the new leftmost node pointer as
+ the predefined minimum record */
+ rec_t* next_rec = page_rec_get_next(cur2_rec);
+ btr_set_min_rec_mark(next_rec, *block, mtr);
+ }
+
+ ut_ad(del_page_no
+ == btr_node_ptr_get_child_page_no(cur2_rec,
+ offsets2));
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor2),
+ index, offsets2, mtr);
+ cursor2 = NULL;
+ }
+
+ /* Insert the new rec. */
+ page_cur_search_with_match(block, index, node_ptr,
+ PAGE_CUR_LE , &up_match, &low_match,
+ btr_cur_get_page_cur(cursor), NULL);
+
+ err = btr_cur_optimistic_insert(flags, cursor, &insert_offsets,
+ &heap, node_ptr, &insert_rec,
+ &dummy_big_rec, 0, NULL, mtr);
+
+ if (!ins_suc && err == DB_SUCCESS) {
+ ins_suc = true;
+ }
+
+ /* If optimistic insert fail, try reorganize the page
+ and insert again. */
+ if (err != DB_SUCCESS && ins_suc) {
+ btr_page_reorganize(btr_cur_get_page_cur(cursor),
+ index, mtr);
+
+ err = btr_cur_optimistic_insert(flags,
+ cursor,
+ &insert_offsets,
+ &heap,
+ node_ptr,
+ &insert_rec,
+ &dummy_big_rec,
+ 0, NULL, mtr);
+
+ /* Will do pessimistic insert */
+ if (err != DB_SUCCESS) {
+ ins_suc = false;
+ }
+ }
+
+ /* The insert succeeded; position the cursor on the inserted rec. */
+ if (ins_suc) {
+ btr_cur_position(index, insert_rec, block, cursor);
+ offsets = rec_get_offsets(insert_rec,
+ index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ }
+
+ /* Delete the rec which cursor2 points to. */
+ if (cursor2) {
+ ulint cur2_pno;
+ rec_t* cur2_rec;
+
+ cursor2->page_cur.rec = page_rec_get_nth(page,
+ cur2_pos);
+
+ cur2_rec = btr_cur_get_rec(cursor2);
+
+ offsets2 = rec_get_offsets(cur2_rec, index, NULL,
+ n_core,
+ ULINT_UNDEFINED, &heap);
+
+ /* If cursor2 is positioned on the wrong rec, we
+ need to reposition it. */
+ cur2_pno = btr_node_ptr_get_child_page_no(cur2_rec, offsets2);
+ if ((del_page_no != cur2_pno)
+ || (cur2_rec == insert_rec)) {
+ cur2_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+
+ while (!page_rec_is_supremum(cur2_rec)) {
+ offsets2 = rec_get_offsets(cur2_rec, index,
+ NULL,
+ n_core,
+ ULINT_UNDEFINED,
+ &heap);
+ cur2_pno = btr_node_ptr_get_child_page_no(
+ cur2_rec, offsets2);
+ if (cur2_pno == del_page_no) {
+ if (insert_rec != cur2_rec) {
+ cursor2->page_cur.rec =
+ cur2_rec;
+ break;
+ }
+ }
+ cur2_rec = page_rec_get_next(cur2_rec);
+ }
+
+ ut_ad(!page_rec_is_supremum(cur2_rec));
+ }
+
+ rec_info = rec_get_info_bits(cur2_rec,
+ rec_offs_comp(offsets2));
+ if (rec_info & REC_INFO_MIN_REC_FLAG) {
+ /* If we delete the leftmost node
+ pointer on a non-leaf level, we must
+ mark the new leftmost node pointer as
+ the predefined minimum record */
+ rec_t* next_rec = page_rec_get_next(cur2_rec);
+ btr_set_min_rec_mark(next_rec, *block, mtr);
+ }
+
+ ut_ad(cur2_pno == del_page_no && cur2_rec != insert_rec);
+
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor2),
+ index, offsets2, mtr);
+ }
+
+ if (!ins_suc) {
+ mem_heap_t* new_heap = NULL;
+
+ err = btr_cur_pessimistic_insert(
+ flags,
+ cursor, &insert_offsets, &new_heap,
+ node_ptr, &insert_rec, &dummy_big_rec,
+ 0, NULL, mtr);
+
+ ut_ad(err == DB_SUCCESS);
+
+ if (new_heap) {
+ mem_heap_free(new_heap);
+ }
+
+ }
+
+ if (cursor2) {
+ btr_cur_compress_if_useful(cursor, FALSE, mtr);
+ }
+ }
+
+ ut_ad(page_has_prev(page)
+ || (REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ page_rec_get_next(page_get_infimum_rec(page)),
+ page_is_comp(page))));
+
+ mem_heap_free(heap);
+
+ return(true);
+}
+
+/**************************************************************//**
+Update the parent page's MBR and predicate lock information during a split */
+static MY_ATTRIBUTE((nonnull))
+void
+rtr_adjust_upper_level(
+/*===================*/
+ btr_cur_t* sea_cur, /*!< in: search cursor */
+ ulint flags, /*!< in: undo logging and
+ locking flags */
+ buf_block_t* block, /*!< in/out: page to be split */
+ buf_block_t* new_block, /*!< in/out: the new half page */
+ rtr_mbr_t* mbr, /*!< in: MBR on the old page */
+ rtr_mbr_t* new_mbr, /*!< in: MBR on the new page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint page_no;
+ ulint new_page_no;
+ dict_index_t* index = sea_cur->index;
+ btr_cur_t cursor;
+ rec_offs* offsets;
+ mem_heap_t* heap;
+ ulint level;
+ dtuple_t* node_ptr_upper;
+ page_cur_t* page_cursor;
+ lock_prdt_t prdt;
+ lock_prdt_t new_prdt;
+ dberr_t err;
+ big_rec_t* dummy_big_rec;
+ rec_t* rec;
+
+ /* Create a memory heap where the data tuple is stored */
+ heap = mem_heap_create(1024);
+ cursor.init();
+
+ cursor.thr = sea_cur->thr;
+
+ /* Get the level of the split pages */
+ level = btr_page_get_level(buf_block_get_frame(block));
+ ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block)));
+
+ page_no = block->page.id().page_no();
+
+ new_page_no = new_block->page.id().page_no();
+
+ /* Set new mbr for the old page on the upper level. */
+ /* Look up the index for the node pointer to page */
+ offsets = rtr_page_get_father_block(
+ NULL, heap, index, block, mtr, sea_cur, &cursor);
+
+ page_cursor = btr_cur_get_page_cur(&cursor);
+
+ rtr_update_mbr_field(&cursor, offsets, NULL, block->frame, mbr, NULL,
+ mtr);
+
+ /* The parent MBR has already been updated; reset the
+ recorded MBR increment in our search path */
+ if (sea_cur->rtr_info) {
+ node_visit_t* node_visit = rtr_get_parent_node(
+ sea_cur, level + 1, true);
+ if (node_visit) {
+ node_visit->mbr_inc = 0;
+ }
+ }
+
+ /* Insert the node for the new page. */
+ node_ptr_upper = rtr_index_build_node_ptr(
+ index, new_mbr,
+ page_rec_get_next(page_get_infimum_rec(new_block->frame)),
+ new_page_no, heap);
+
+ ulint up_match = 0;
+ ulint low_match = 0;
+
+ buf_block_t* father_block = btr_cur_get_block(&cursor);
+
+ page_cur_search_with_match(
+ father_block, index, node_ptr_upper,
+ PAGE_CUR_LE , &up_match, &low_match,
+ btr_cur_get_page_cur(&cursor), NULL);
+
+ err = btr_cur_optimistic_insert(
+ flags
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG
+ | BTR_NO_UNDO_LOG_FLAG,
+ &cursor, &offsets, &heap,
+ node_ptr_upper, &rec, &dummy_big_rec, 0, NULL, mtr);
+
+ if (err == DB_FAIL) {
+ cursor.rtr_info = sea_cur->rtr_info;
+ cursor.tree_height = sea_cur->tree_height;
+
+ /* Recreate a memory heap as input parameter for
+ btr_cur_pessimistic_insert(), because the heap may be
+ emptied in btr_cur_pessimistic_insert(). */
+ mem_heap_t* new_heap = mem_heap_create(1024);
+
+ err = btr_cur_pessimistic_insert(flags
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG
+ | BTR_NO_UNDO_LOG_FLAG,
+ &cursor, &offsets, &new_heap,
+ node_ptr_upper, &rec,
+ &dummy_big_rec, 0, NULL, mtr);
+ cursor.rtr_info = NULL;
+ ut_a(err == DB_SUCCESS);
+
+ mem_heap_free(new_heap);
+ }
+
+ prdt.data = static_cast<void*>(mbr);
+ prdt.op = 0;
+ new_prdt.data = static_cast<void*>(new_mbr);
+ new_prdt.op = 0;
+
+ lock_prdt_update_parent(block, new_block, &prdt, &new_prdt,
+ page_cursor->block->page.id());
+
+ mem_heap_free(heap);
+
+ ut_ad(block->zip_size() == index->table->space->zip_size());
+
+ const uint32_t next_page_no = btr_page_get_next(block->frame);
+
+ if (next_page_no != FIL_NULL) {
+ buf_block_t* next_block = btr_block_get(
+ *index, next_page_no, RW_X_LATCH, false, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(next_block->frame)
+ == page_is_comp(block->frame));
+ ut_a(btr_page_get_prev(next_block->frame)
+ == block->page.id().page_no());
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_set_prev(next_block, new_page_no, mtr);
+ }
+
+ btr_page_set_next(block, new_page_no, mtr);
+
+ btr_page_set_prev(new_block, page_no, mtr);
+ btr_page_set_next(new_block, next_page_no, mtr);
+}
+
+/*************************************************************//**
+Moves record list to another page for rtree splitting.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return TRUE on success; FALSE on compression failure */
+static
+ibool
+rtr_split_page_move_rec_list(
+/*=========================*/
+ rtr_split_node_t* node_array, /*!< in: split node array. */
+ int first_rec_group,/*!< in: group number of the
+ first rec. */
+ buf_block_t* new_block, /*!< in/out: index page
+ where to move */
+ buf_block_t* block, /*!< in/out: page containing
+ split_rec */
+ rec_t* first_rec, /*!< in: first record not to
+ move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mem_heap_t* heap, /*!< in: pointer to memory
+ heap, or NULL */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ rtr_split_node_t* cur_split_node;
+ rtr_split_node_t* end_split_node;
+ page_cur_t page_cursor;
+ page_cur_t new_page_cursor;
+ page_t* page;
+ page_t* new_page;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ page_zip_des_t* new_page_zip
+ = buf_block_get_page_zip(new_block);
+ rec_t* rec;
+ rec_t* ret;
+ ulint moved = 0;
+ ulint max_to_move = 0;
+ rtr_rec_move_t* rec_move = NULL;
+
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(dict_index_is_spatial(index));
+
+ rec_offs_init(offsets_);
+
+ page_cur_set_before_first(block, &page_cursor);
+ page_cur_set_before_first(new_block, &new_page_cursor);
+
+ page = buf_block_get_frame(block);
+ new_page = buf_block_get_frame(new_block);
+ ret = page_rec_get_prev(page_get_supremum_rec(new_page));
+
+ end_split_node = node_array + page_get_n_recs(page);
+
+ mtr_log_t log_mode = MTR_LOG_NONE;
+
+ if (new_page_zip) {
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+ }
+
+ max_to_move = page_get_n_recs(
+ buf_block_get_frame(block));
+ rec_move = static_cast<rtr_rec_move_t*>(mem_heap_alloc(
+ heap,
+ sizeof (*rec_move) * max_to_move));
+ const ulint n_core = page_is_leaf(page)
+ ? index->n_core_fields : 0;
+
+ /* Insert the recs in group 2 to new page. */
+ for (cur_split_node = node_array;
+ cur_split_node < end_split_node; ++cur_split_node) {
+ if (cur_split_node->n_node != first_rec_group) {
+ lock_rec_store_on_page_infimum(
+ block, cur_split_node->key);
+
+ offsets = rec_get_offsets(cur_split_node->key,
+ index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+
+ ut_ad(!n_core || cur_split_node->key != first_rec);
+
+ rec = page_cur_insert_rec_low(
+ &new_page_cursor,
+ index, cur_split_node->key, offsets, mtr);
+
+ ut_a(rec);
+
+ lock_rec_restore_from_page_infimum(
+ new_block, rec, block);
+
+ page_cur_move_to_next(&new_page_cursor);
+
+ rec_move[moved].new_rec = rec;
+ rec_move[moved].old_rec = cur_split_node->key;
+ rec_move[moved].moved = false;
+ moved++;
+
+ if (moved > max_to_move) {
+ ut_ad(0);
+ break;
+ }
+ }
+ }
+
+ /* Update PAGE_MAX_TRX_ID on the uncompressed page.
+ Modifications will be redo logged and copied to the compressed
+ page in page_zip_compress() or page_zip_reorganize() below.
+ Multiple transactions cannot operate on the same temp-table
+ in parallel.
+ max_trx_id is ignored for temp tables because it is not
+ required for MVCC. */
+ if (n_core && !index->table->is_temporary()) {
+ page_update_max_trx_id(new_block, NULL,
+ page_get_max_trx_id(page),
+ mtr);
+ }
+
+ if (new_page_zip) {
+ mtr_set_log_mode(mtr, log_mode);
+
+ if (!page_zip_compress(new_block, index,
+ page_zip_level, mtr)) {
+ ulint ret_pos;
+
+ /* Before trying to reorganize the page,
+ store the number of preceding records on the page. */
+ ret_pos = page_rec_get_n_recs_before(ret);
+ /* Before copying, "ret" was the predecessor
+ of the predefined supremum record. If it was
+ the predefined infimum record, then it would
+ still be the infimum, and we would have
+ ret_pos == 0. */
+
+ if (UNIV_UNLIKELY
+ (!page_zip_reorganize(new_block, index,
+ page_zip_level, mtr))) {
+
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress(new_page_zip,
+ new_page, FALSE))) {
+ ut_error;
+ }
+#ifdef UNIV_GIS_DEBUG
+ ut_ad(page_validate(new_page, index));
+#endif
+
+ return(false);
+ }
+
+ /* The page was reorganized: Seek to ret_pos. */
+ ret = page_rec_get_nth(new_page, ret_pos);
+ }
+ }
+
+ /* Update the lock table */
+ lock_rtr_move_rec_list(new_block, block, rec_move, moved);
+
+ /* Delete recs in second group from the old page. */
+ for (cur_split_node = node_array;
+ cur_split_node < end_split_node; ++cur_split_node) {
+ if (cur_split_node->n_node != first_rec_group) {
+ page_cur_position(cur_split_node->key,
+ block, &page_cursor);
+ offsets = rec_get_offsets(
+ page_cur_get_rec(&page_cursor), index,
+ offsets, n_core, ULINT_UNDEFINED,
+ &heap);
+ page_cur_delete_rec(&page_cursor,
+ index, offsets, mtr);
+ }
+ }
+
+ return(true);
+}
+
+/*************************************************************//**
+Splits an R-tree index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed, we cannot reverse it: therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+@return inserted record */
+rec_t*
+rtr_page_split_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in/out: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ page_t* page;
+ page_t* new_page;
+ buf_block_t* new_block;
+ page_zip_des_t* page_zip;
+ page_zip_des_t* new_page_zip;
+ buf_block_t* insert_block;
+ page_cur_t* page_cursor;
+ rec_t* rec = 0;
+ ulint n_recs;
+ ulint total_data;
+ ulint insert_size;
+ rtr_split_node_t* rtr_split_node_array;
+ rtr_split_node_t* cur_split_node;
+ rtr_split_node_t* end_split_node;
+ double* buf_pos;
+ node_seq_t current_ssn;
+ node_seq_t next_ssn;
+ buf_block_t* root_block;
+ rtr_mbr_t mbr;
+ rtr_mbr_t new_mbr;
+ lock_prdt_t prdt;
+ lock_prdt_t new_prdt;
+ rec_t* first_rec = NULL;
+ int first_rec_group = 1;
+ ulint n_iterations = 0;
+
+ if (!*heap) {
+ *heap = mem_heap_create(1024);
+ }
+
+func_start:
+ mem_heap_empty(*heap);
+ *offsets = NULL;
+
+ ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(!dict_index_is_online_ddl(cursor->index)
+ || (flags & BTR_CREATE_FLAG)
+ || dict_index_is_clust(cursor->index));
+ ut_ad(rw_lock_own_flagged(dict_index_get_lock(cursor->index),
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+ current_ssn = page_get_ssn_id(page);
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(page_get_n_recs(page) >= 1);
+
+ const page_id_t page_id(block->page.id());
+
+ if (!page_has_prev(page) && !page_is_leaf(page)) {
+ first_rec = page_rec_get_next(
+ page_get_infimum_rec(buf_block_get_frame(block)));
+ }
+
+ /* Initialize the split nodes array. */
+ rtr_split_node_array = rtr_page_split_initialize_nodes(
+ *heap, cursor, offsets, tuple, &buf_pos);
+
+ /* Divide all MBRs into two groups. */
+ n_recs = ulint(page_get_n_recs(page)) + 1;
+
+ end_split_node = rtr_split_node_array + n_recs;
+
+#ifdef UNIV_GIS_DEBUG
+ fprintf(stderr, "Before split a page:\n");
+ for (cur_split_node = rtr_split_node_array;
+ cur_split_node < end_split_node; ++cur_split_node) {
+ for (int i = 0; i < SPDIMS * 2; i++) {
+ fprintf(stderr, "%.2lf ",
+ *(cur_split_node->coords + i));
+ }
+ fprintf(stderr, "\n");
+ }
+#endif
+
+ insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
+ total_data = page_get_data_size(page) + insert_size;
+ first_rec_group = split_rtree_node(rtr_split_node_array,
+ static_cast<int>(n_recs),
+ static_cast<int>(total_data),
+ static_cast<int>(insert_size),
+ 0, 2, 2, &buf_pos, SPDIMS,
+ static_cast<uchar*>(first_rec));
+
+ /* Allocate a new page to the index */
+ const uint16_t page_level = btr_page_get_level(page);
+ new_block = btr_page_alloc(cursor->index, page_id.page_no() + 1,
+ FSP_UP, page_level, mtr, mtr);
+ if (!new_block) {
+ return NULL;
+ }
+
+ new_page_zip = buf_block_get_page_zip(new_block);
+ if (page_level && UNIV_LIKELY_NULL(new_page_zip)) {
+ /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected
+ to contain FIL_NULL in FIL_PAGE_PREV at this stage. */
+ memset_aligned<4>(new_block->frame + FIL_PAGE_PREV, 0, 4);
+ }
+ btr_page_create(new_block, new_page_zip, cursor->index,
+ page_level, mtr);
+
+ new_page = buf_block_get_frame(new_block);
+ ut_ad(page_get_ssn_id(new_page) == 0);
+
+ /* Set the ssn: the new page inherits the current ssn, and
+ the old page gets a freshly generated one. */
+ page_set_ssn_id(new_block, new_page_zip, current_ssn, mtr);
+ next_ssn = rtr_get_new_ssn_id(cursor->index);
+
+ page_set_ssn_id(block, page_zip, next_ssn, mtr);
+
+ /* Keep the recs in the first group on the old page; move the
+ recs in the second group to the new page. */
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || page_zip
+#endif
+ || !rtr_split_page_move_rec_list(rtr_split_node_array,
+ first_rec_group,
+ new_block, block, first_rec,
+ cursor->index, *heap, mtr)) {
+ ulint n = 0;
+ rec_t* rec;
+ ulint moved = 0;
+ ulint max_to_move = 0;
+ rtr_rec_move_t* rec_move = NULL;
+ ulint pos;
+
+ /* For some reason, compressing new_page failed,
+ even though it should contain fewer records than
+ the original page. Copy the page byte for byte
+ and then delete the records from both pages
+ as appropriate. Deleting will always succeed. */
+ ut_a(new_page_zip);
+
+ page_zip_copy_recs(new_block,
+ page_zip, page, cursor->index, mtr);
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ /* Move locks on recs. */
+ max_to_move = page_get_n_recs(page);
+ rec_move = static_cast<rtr_rec_move_t*>(mem_heap_alloc(
+ *heap,
+ sizeof (*rec_move) * max_to_move));
+
+ /* Init the rec_move array for moving lock on recs. */
+ for (cur_split_node = rtr_split_node_array;
+ cur_split_node < end_split_node - 1; ++cur_split_node) {
+ if (cur_split_node->n_node != first_rec_group) {
+ pos = page_rec_get_n_recs_before(
+ cur_split_node->key);
+ rec = page_rec_get_nth(new_page, pos);
+ ut_a(rec);
+
+ rec_move[moved].new_rec = rec;
+ rec_move[moved].old_rec = cur_split_node->key;
+ rec_move[moved].moved = false;
+ moved++;
+
+ if (moved > max_to_move) {
+ ut_ad(0);
+ break;
+ }
+ }
+ }
+
+ /* Update the lock table */
+ lock_rtr_move_rec_list(new_block, block, rec_move, moved);
+
+ const ulint n_core = page_level
+ ? 0 : cursor->index->n_core_fields;
+
+ /* Delete recs in first group from the new page. */
+ for (cur_split_node = rtr_split_node_array;
+ cur_split_node < end_split_node - 1; ++cur_split_node) {
+ if (cur_split_node->n_node == first_rec_group) {
+ ulint pos;
+
+ pos = page_rec_get_n_recs_before(
+ cur_split_node->key);
+ ut_a(pos > 0);
+ rec_t* new_rec = page_rec_get_nth(new_page,
+ pos - n);
+
+ ut_a(new_rec && page_rec_is_user_rec(new_rec));
+ page_cur_position(new_rec, new_block,
+ page_cursor);
+
+ *offsets = rec_get_offsets(
+ page_cur_get_rec(page_cursor),
+ cursor->index, *offsets, n_core,
+ ULINT_UNDEFINED, heap);
+
+ page_cur_delete_rec(page_cursor,
+ cursor->index, *offsets, mtr);
+ n++;
+ }
+ }
+
+ /* Delete recs in second group from the old page. */
+ for (cur_split_node = rtr_split_node_array;
+ cur_split_node < end_split_node - 1; ++cur_split_node) {
+ if (cur_split_node->n_node != first_rec_group) {
+ page_cur_position(cur_split_node->key,
+ block, page_cursor);
+ *offsets = rec_get_offsets(
+ page_cur_get_rec(page_cursor),
+ cursor->index, *offsets, n_core,
+ ULINT_UNDEFINED, heap);
+ page_cur_delete_rec(page_cursor,
+ cursor->index, *offsets, mtr);
+ }
+ }
+
+#ifdef UNIV_GIS_DEBUG
+ ut_ad(page_validate(new_page, cursor->index));
+ ut_ad(page_validate(page, cursor->index));
+#endif
+ }
+
+ /* Insert the new rec into the proper page. */
+ cur_split_node = end_split_node - 1;
+ if (cur_split_node->n_node != first_rec_group) {
+ insert_block = new_block;
+ } else {
+ insert_block = block;
+ }
+
+ /* Reposition the cursor for insert and try insertion */
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ page_cur_search(insert_block, cursor->index, tuple,
+ PAGE_CUR_LE, page_cursor);
+
+ /* It is possible that the new record is too big to be inserted
+ into the page; in that case it will need a second round of
+ splitting. We test this scenario here. */
+ DBUG_EXECUTE_IF("rtr_page_need_second_split",
+ if (n_iterations == 0) {
+ rec = NULL;
+ goto after_insert; }
+ );
+
+ rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+ offsets, heap, n_ext, mtr);
+
+ /* If insert did not fit, try page reorganization.
+ For compressed pages, page_cur_tuple_insert() will have
+ attempted this already. */
+ if (rec == NULL) {
+ if (!page_cur_get_page_zip(page_cursor)
+ && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
+ rec = page_cur_tuple_insert(page_cursor, tuple,
+ cursor->index, offsets,
+ heap, n_ext, mtr);
+
+ }
+ /* If the insert fails, we will try to split the insert_block
+ again. */
+ }
+
+#ifdef UNIV_DEBUG
+after_insert:
+#endif
+ /* Calculate the mbr on the upper half-page, and the mbr on
+ the original page. */
+ rtr_page_cal_mbr(cursor->index, block, &mbr, *heap);
+ rtr_page_cal_mbr(cursor->index, new_block, &new_mbr, *heap);
+ prdt.data = &mbr;
+ new_prdt.data = &new_mbr;
+
+ /* Check whether any predicate locks need to be moved/copied to the
+ new page */
+ lock_prdt_update_split(new_block, &prdt, &new_prdt, page_id);
+
+ /* Adjust the upper level. */
+ rtr_adjust_upper_level(cursor, flags, block, new_block,
+ &mbr, &new_mbr, mtr);
+
+ /* Save the new ssn to the root page, since we need to
+ re-initialize the first ssn value from it after a server restart. */
+
+ root_block = btr_root_block_get(cursor->index, RW_SX_LATCH, mtr);
+
+ page_zip = buf_block_get_page_zip(root_block);
+ page_set_ssn_id(root_block, page_zip, next_ssn, mtr);
+
+ /* Insert fit on the page: update the free bits for the
+ left and right pages in the same mtr */
+
+ if (page_is_leaf(page)) {
+ ibuf_update_free_bits_for_two_pages_low(
+ block, new_block, mtr);
+ }
+
+
+ /* If the insert of the new rec failed, we need to do
+ another split. */
+ if (!rec) {
+ /* We play safe and reset the free bits for new_page */
+ if (!dict_index_is_clust(cursor->index)
+ && !cursor->index->table->is_temporary()) {
+ ibuf_reset_free_bits(new_block);
+ ibuf_reset_free_bits(block);
+ }
+
+ /* We need to clean the parent path here and search for the
+ father node later; otherwise, it is possible that we find a
+ wrong parent. */
+ rtr_clean_rtr_info(cursor->rtr_info, true);
+ cursor->rtr_info = NULL;
+ n_iterations++;
+
+ rec_t* i_rec = page_rec_get_next(page_get_infimum_rec(
+ buf_block_get_frame(block)));
+ btr_cur_position(cursor->index, i_rec, block, cursor);
+
+ goto func_start;
+ }
+
+#ifdef UNIV_GIS_DEBUG
+ ut_ad(page_validate(buf_block_get_frame(block), cursor->index));
+ ut_ad(page_validate(buf_block_get_frame(new_block), cursor->index));
+
+ ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
+#endif
+ MONITOR_INC(MONITOR_INDEX_SPLIT);
+
+ return(rec);
+}
+
+/****************************************************************//**
+Enlarge the MBRs of the ancestor pages after an insert has enlarged a
+child MBR.
+@return DB_SUCCESS, or an error code on failure */
+dberr_t
+rtr_ins_enlarge_mbr(
+/*================*/
+ btr_cur_t* btr_cur, /*!< in: btr cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dberr_t err = DB_SUCCESS;
+ rtr_mbr_t new_mbr;
+ buf_block_t* block;
+ mem_heap_t* heap;
+ dict_index_t* index = btr_cur->index;
+ page_cur_t* page_cursor;
+ rec_offs* offsets;
+ node_visit_t* node_visit;
+ btr_cur_t cursor;
+ page_t* page;
+
+ ut_ad(dict_index_is_spatial(index));
+
+ /* If there is no rtr_info, or the rtree is a one-level tree, return. */
+ if (!btr_cur->rtr_info || btr_cur->tree_height == 1) {
+ return(err);
+ }
+
+ /* Check path info is not empty. */
+ ut_ad(!btr_cur->rtr_info->parent_path->empty());
+
+ /* Create a memory heap. */
+ heap = mem_heap_create(1024);
+
+ /* Leaf level page is stored in cursor */
+ page_cursor = btr_cur_get_page_cur(btr_cur);
+ block = page_cur_get_block(page_cursor);
+
+ for (ulint i = 1; i < btr_cur->tree_height; i++) {
+ node_visit = rtr_get_parent_node(btr_cur, i, true);
+ ut_ad(node_visit != NULL);
+
+ /* If there is no mbr enlargement at this level, move on
+ to the next level up. */
+ if (node_visit->mbr_inc == 0) {
+ block = btr_pcur_get_block(node_visit->cursor);
+ continue;
+ }
+
+ /* Calculate the mbr of the child page. */
+ rtr_page_cal_mbr(index, block, &new_mbr, heap);
+
+ /* Get father block. */
+ cursor.init();
+ offsets = rtr_page_get_father_block(
+ NULL, heap, index, block, mtr, btr_cur, &cursor);
+
+ page = buf_block_get_frame(block);
+
+ /* Update the mbr field of the rec. */
+ if (!rtr_update_mbr_field(&cursor, offsets, NULL, page,
+ &new_mbr, NULL, mtr)) {
+ err = DB_ERROR;
+ break;
+ }
+
+ page_cursor = btr_cur_get_page_cur(&cursor);
+ block = page_cur_get_block(page_cursor);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/*************************************************************//**
+Copy records from a page to new_block of an rtree.
+Differs from page_copy_rec_list_end, because this function does not
+touch the lock table and max trx id on the page or compress the page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
+void
+rtr_page_copy_rec_list_end_no_locks(
+/*================================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ rtr_rec_move_t* rec_move, /*!< in: recording records moved */
+ ulint max_move, /*!< in: maximum number of recs to move */
+ ulint* num_moved, /*!< out: number of recs moved */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ page_cur_t page_cur;
+ page_cur_t cur1;
+ rec_t* cur_rec;
+ rec_offs offsets_1[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets1 = offsets_1;
+ rec_offs offsets_2[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets2 = offsets_2;
+ ulint moved = 0;
+ const ulint n_core = page_is_leaf(new_page)
+ ? index->n_core_fields : 0;
+
+ rec_offs_init(offsets_1);
+ rec_offs_init(offsets_2);
+
+ page_cur_position(rec, block, &cur1);
+
+ if (page_cur_is_before_first(&cur1)) {
+ page_cur_move_to_next(&cur1);
+ }
+
+ btr_assert_not_corrupted(new_block, index);
+ ut_a(page_is_comp(new_page) == page_rec_is_comp(rec));
+ ut_a(mach_read_from_2(new_page + srv_page_size - 10) == (ulint)
+ (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM));
+
+ cur_rec = page_rec_get_next(
+ page_get_infimum_rec(buf_block_get_frame(new_block)));
+ page_cur_position(cur_rec, new_block, &page_cur);
+
+ /* Copy records from the original page to the new page */
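+ /* Note: this loop is effectively a merge. For each record
+ taken from the source page, cur_rec is advanced on the new
+ page until the correct insert position is found, so the new
+ page stays ordered even when it is not empty. */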
+ while (!page_cur_is_after_last(&cur1)) {
+ rec_t* cur1_rec = page_cur_get_rec(&cur1);
+ rec_t* ins_rec;
+
+ if (page_rec_is_infimum(cur_rec)) {
+ cur_rec = page_rec_get_next(cur_rec);
+ }
+
+ offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core,
+ ULINT_UNDEFINED, &heap);
+ while (!page_rec_is_supremum(cur_rec)) {
+ ulint cur_matched_fields = 0;
+ int cmp;
+
+ offsets2 = rec_get_offsets(cur_rec, index, offsets2,
+ n_core,
+ ULINT_UNDEFINED, &heap);
+ cmp = cmp_rec_rec(cur1_rec, cur_rec,
+ offsets1, offsets2, index, false,
+ &cur_matched_fields);
+ if (cmp < 0) {
+ page_cur_move_to_prev(&page_cur);
+ break;
+ } else if (cmp > 0) {
+ /* Skip small recs. */
+ page_cur_move_to_next(&page_cur);
+ cur_rec = page_cur_get_rec(&page_cur);
+ } else if (n_core) {
+ if (rec_get_deleted_flag(cur1_rec,
+ dict_table_is_comp(index->table))) {
+ goto next;
+ } else {
+ /* We have two identical leaf records:
+ skip copying the non-deleted one, and
+ clear the deleted flag on the new page */
+ btr_rec_set_deleted<false>(
+ new_block, cur_rec, mtr);
+ goto next;
+ }
+ }
+ }
+
+ /* If the position is on the supremum rec, we need to
+ move to the previous rec. */
+ if (page_rec_is_supremum(cur_rec)) {
+ page_cur_move_to_prev(&page_cur);
+ }
+
+ cur_rec = page_cur_get_rec(&page_cur);
+
+ offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core,
+ ULINT_UNDEFINED, &heap);
+
+ ins_rec = page_cur_insert_rec_low(&page_cur, index,
+ cur1_rec, offsets1, mtr);
+ if (UNIV_UNLIKELY(!ins_rec)) {
+ fprintf(stderr, "page number %u and %u\n",
+ new_block->page.id().page_no(),
+ block->page.id().page_no());
+
+ ib::fatal() << "rec offset " << page_offset(rec)
+ << ", cur1 offset "
+ << page_offset(page_cur_get_rec(&cur1))
+ << ", cur_rec offset "
+ << page_offset(cur_rec);
+ }
+
+ rec_move[moved].new_rec = ins_rec;
+ rec_move[moved].old_rec = cur1_rec;
+ rec_move[moved].moved = false;
+ moved++;
+next:
+ if (moved > max_move) {
+ ut_ad(0);
+ break;
+ }
+
+ page_cur_move_to_next(&cur1);
+ }
+
+ *num_moved = moved;
+}
+
+/*************************************************************//**
+Copy records from a page to new_block of an rtree, up to but not
+including a specified rec. */
+void
+rtr_page_copy_rec_list_start_no_locks(
+/*==================================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ rtr_rec_move_t* rec_move, /*!< in: recording records moved */
+ ulint max_move, /*!< in: maximum number of recs to move */
+ ulint* num_moved, /*!< out: number of recs moved */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t cur1;
+ rec_t* cur_rec;
+ rec_offs offsets_1[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets1 = offsets_1;
+ rec_offs offsets_2[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets2 = offsets_2;
+ page_cur_t page_cur;
+ ulint moved = 0;
+ const ulint n_core = page_is_leaf(buf_block_get_frame(block))
+ ? index->n_core_fields : 0;
+
+ rec_offs_init(offsets_1);
+ rec_offs_init(offsets_2);
+
+ page_cur_set_before_first(block, &cur1);
+ page_cur_move_to_next(&cur1);
+
+ cur_rec = page_rec_get_next(
+ page_get_infimum_rec(buf_block_get_frame(new_block)));
+ page_cur_position(cur_rec, new_block, &page_cur);
+
+ while (page_cur_get_rec(&cur1) != rec) {
+ rec_t* cur1_rec = page_cur_get_rec(&cur1);
+ rec_t* ins_rec;
+
+ if (page_rec_is_infimum(cur_rec)) {
+ cur_rec = page_rec_get_next(cur_rec);
+ }
+
+ offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core,
+ ULINT_UNDEFINED, &heap);
+
+ while (!page_rec_is_supremum(cur_rec)) {
+ ulint cur_matched_fields = 0;
+
+ offsets2 = rec_get_offsets(cur_rec, index, offsets2,
+ n_core,
+ ULINT_UNDEFINED, &heap);
+ int cmp = cmp_rec_rec(cur1_rec, cur_rec,
+ offsets1, offsets2, index, false,
+ &cur_matched_fields);
+ if (cmp < 0) {
+ page_cur_move_to_prev(&page_cur);
+ cur_rec = page_cur_get_rec(&page_cur);
+ break;
+ } else if (cmp > 0) {
+ /* Skip small recs. */
+ page_cur_move_to_next(&page_cur);
+ cur_rec = page_cur_get_rec(&page_cur);
+ } else if (n_core) {
+ if (rec_get_deleted_flag(
+ cur1_rec,
+ dict_table_is_comp(index->table))) {
+ goto next;
+ } else {
+ /* We have two identical leaf records:
+ skip copying the non-deleted one, and
+ clear the deleted flag on the new page */
+ btr_rec_set_deleted<false>(
+ new_block, cur_rec, mtr);
+ goto next;
+ }
+ }
+ }
+
+ /* If the position is on the supremum rec, we need to
+ move to the previous rec. */
+ if (page_rec_is_supremum(cur_rec)) {
+ page_cur_move_to_prev(&page_cur);
+ }
+
+ cur_rec = page_cur_get_rec(&page_cur);
+
+ offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core,
+ ULINT_UNDEFINED, &heap);
+
+ ins_rec = page_cur_insert_rec_low(&page_cur, index,
+ cur1_rec, offsets1, mtr);
+ if (UNIV_UNLIKELY(!ins_rec)) {
+ ib::fatal() << new_block->page.id()
+ << "rec offset " << page_offset(rec)
+ << ", cur1 offset "
+ << page_offset(page_cur_get_rec(&cur1))
+ << ", cur_rec offset "
+ << page_offset(cur_rec);
+ }
+
+ rec_move[moved].new_rec = ins_rec;
+ rec_move[moved].old_rec = cur1_rec;
+ rec_move[moved].moved = false;
+ moved++;
+next:
+ if (moved > max_move) {
+ ut_ad(0);
+ break;
+ }
+
+ page_cur_move_to_next(&cur1);
+ }
+
+ *num_moved = moved;
+}
+
+/****************************************************************//**
+Check whether two MBRs are identical or need to be merged,
+computing the merged MBR into new_mbr */
+bool
+rtr_merge_mbr_changed(
+/*==================*/
+ btr_cur_t* cursor, /*!< in/out: cursor */
+ btr_cur_t* cursor2, /*!< in: the other cursor */
+ rec_offs* offsets, /*!< in: rec offsets */
+ rec_offs* offsets2, /*!< in: rec offsets */
+ rtr_mbr_t* new_mbr) /*!< out: MBR to update */
+{
+ double* mbr;
+ double mbr1[SPDIMS * 2];
+ double mbr2[SPDIMS * 2];
+ rec_t* rec;
+ ulint len;
+ bool changed = false;
+
+ ut_ad(dict_index_is_spatial(cursor->index));
+
+ rec = btr_cur_get_rec(cursor);
+
+ rtr_read_mbr(rec_get_nth_field(rec, offsets, 0, &len),
+ reinterpret_cast<rtr_mbr_t*>(mbr1));
+
+ rec = btr_cur_get_rec(cursor2);
+
+ rtr_read_mbr(rec_get_nth_field(rec, offsets2, 0, &len),
+ reinterpret_cast<rtr_mbr_t*>(mbr2));
+
+ mbr = reinterpret_cast<double*>(new_mbr);
+
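+ /* Per dimension, the merged MBR takes the smaller lower
+ bound and the larger upper bound; "changed" is set as soon
+ as the two input MBRs differ in any bound. */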
+ for (int i = 0; i < SPDIMS * 2; i += 2) {
+ changed = (changed || mbr1[i] != mbr2[i]);
+ *mbr = mbr1[i] < mbr2[i] ? mbr1[i] : mbr2[i];
+ mbr++;
+ changed = (changed || mbr1[i + 1] != mbr2[i + 1]);
+ *mbr = mbr1[i + 1] > mbr2[i + 1] ? mbr1[i + 1] : mbr2[i + 1];
+ mbr++;
+ }
+
+ return(changed);
+}
+
+/****************************************************************//**
+Merge 2 mbrs and update the mbr that the cursor is on. */
+dberr_t
+rtr_merge_and_update_mbr(
+/*=====================*/
+ btr_cur_t* cursor, /*!< in/out: cursor */
+ btr_cur_t* cursor2, /*!< in: the other cursor */
+ rec_offs* offsets, /*!< in: rec offsets */
+ rec_offs* offsets2, /*!< in: rec offsets */
+ page_t* child_page, /*!< in: the page. */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dberr_t err = DB_SUCCESS;
+ rtr_mbr_t new_mbr;
+ bool changed = false;
+
+ ut_ad(dict_index_is_spatial(cursor->index));
+
+ changed = rtr_merge_mbr_changed(cursor, cursor2, offsets, offsets2,
+ &new_mbr);
+
+ /* Update the mbr field of the rec, and delete the record
+ pointed to by cursor2 */
+ if (changed) {
+ if (!rtr_update_mbr_field(cursor, offsets, cursor2, child_page,
+ &new_mbr, NULL, mtr)) {
+ err = DB_ERROR;
+ }
+ } else {
+ rtr_node_ptr_delete(cursor2, mtr);
+ }
+
+ return(err);
+}
+
+/*************************************************************//**
+Deletes on the upper level the node pointer to a page. */
+void
+rtr_node_ptr_delete(
+/*================*/
+ btr_cur_t* cursor, /*!< in: search cursor, contains information
+ about parent nodes in search */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ibool compressed;
+ dberr_t err;
+
+ compressed = btr_cur_pessimistic_delete(&err, TRUE, cursor,
+ BTR_CREATE_FLAG, false, mtr);
+ ut_a(err == DB_SUCCESS);
+
+ if (!compressed) {
+ btr_cur_compress_if_useful(cursor, FALSE, mtr);
+ }
+}
+
+/**************************************************************//**
+Check whether an R-tree page is a child of a parent page
+@return true if there is a child/parent relationship */
+bool
+rtr_check_same_block(
+/*================*/
+ dict_index_t* index, /*!< in: index tree */
+ btr_cur_t* cursor, /*!< in/out: position at the parent entry
+ pointing to the child if successful */
+ buf_block_t* parentb,/*!< in: parent page to check */
+ buf_block_t* childb, /*!< in: child Page */
+ mem_heap_t* heap) /*!< in: memory heap */
+
+{
+ ulint page_no = childb->page.id().page_no();
+ rec_offs* offsets;
+ rec_t* rec = page_rec_get_next(page_get_infimum_rec(
+ buf_block_get_frame(parentb)));
+
+ while (!page_rec_is_supremum(rec)) {
+ offsets = rec_get_offsets(
+ rec, index, NULL, 0, ULINT_UNDEFINED, &heap);
+
+ if (btr_node_ptr_get_child_page_no(rec, offsets) == page_no) {
+ btr_cur_position(index, rec, parentb, cursor);
+ return(true);
+ }
+
+ rec = page_rec_get_next(rec);
+ }
+
+ return(false);
+}
+
+/*************************************************************//**
+Calculates MBR_AREA(a+b) - MBR_AREA(a)
+Note: when 'a' and 'b' objects are far from each other,
+the area increase can be really big, so this function
+can return 'inf' as a result.
+Return the area increased. */
+static double
+rtree_area_increase(
+ const uchar* a, /*!< in: original mbr. */
+ const uchar* b, /*!< in: new mbr. */
+ double* ab_area) /*!< out: increased area. */
+{
+ double a_area = 1.0;
+ double loc_ab_area = 1.0;
+ double amin, amax, bmin, bmax;
+ double data_round = 1.0;
+
+ static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double),
+ "compatibility");
+
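+ /* Per dimension, a_area accumulates the extent of 'a' and
+ loc_ab_area the extent of the bounding box of 'a' and 'b'.
+ Zero-extent dimensions contribute LINE_MBR_WEIGHTS instead of
+ 0, so lines and points still produce nonzero areas. */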
+ for (auto i = SPDIMS; i--; ) {
+ double area;
+
+ amin = mach_double_read(a);
+ bmin = mach_double_read(b);
+ amax = mach_double_read(a + sizeof(double));
+ bmax = mach_double_read(b + sizeof(double));
+
+ a += 2 * sizeof(double);
+ b += 2 * sizeof(double);
+
+ area = amax - amin;
+ if (area == 0) {
+ a_area *= LINE_MBR_WEIGHTS;
+ } else {
+ a_area *= area;
+ }
+
+ area = (double)std::max(amax, bmax) -
+ (double)std::min(amin, bmin);
+ if (area == 0) {
+ loc_ab_area *= LINE_MBR_WEIGHTS;
+ } else {
+ loc_ab_area *= area;
+ }
+
+ /* The value of amax or bmin can be so large that small
+ differences are ignored. For example: 3.2884281489988079e+284
+ - 100 = 3.2884281489988079e+284. As a result, some area
+ differences are not detected */
+ if (loc_ab_area == a_area) {
+ if (bmin < amin || bmax > amax) {
+ data_round *= ((double)std::max(amax, bmax)
+ - amax
+ + (amin - (double)std::min(
+ amin, bmin)));
+ } else {
+ data_round *= area;
+ }
+ }
+ }
+
+ *ab_area = loc_ab_area;
+
+ if (loc_ab_area == a_area && data_round != 1.0) {
+ return(data_round);
+ }
+
+ return(loc_ab_area - a_area);
+}
+
+/** Calculates overlapping area
+@param[in] a mbr a
+@param[in] b mbr b
+@return overlapping area */
+static double rtree_area_overlapping(const byte *a, const byte *b)
+{
+ double area = 1.0;
+ double amin;
+ double amax;
+ double bmin;
+ double bmax;
+
+ static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double),
+ "compatibility");
+
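+ /* The overlap in each dimension is the interval
+ [max(amin, bmin), min(amax, bmax)]; if this interval is empty
+ in any dimension, the MBRs do not overlap at all. */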
+ for (auto i = SPDIMS; i--; ) {
+ amin = mach_double_read(a);
+ bmin = mach_double_read(b);
+ amax = mach_double_read(a + sizeof(double));
+ bmax = mach_double_read(b + sizeof(double));
+ a += 2 * sizeof(double);
+ b += 2 * sizeof(double);
+
+ amin = std::max(amin, bmin);
+ amax = std::min(amax, bmax);
+
+ if (amin > amax) {
+ return(0);
+ } else {
+ area *= (amax - amin);
+ }
+ }
+
+ return(area);
+}
+
+/****************************************************************//**
+Calculate the area increased for a new record
+@return area increased */
+double
+rtr_rec_cal_increase(
+/*=================*/
+ const dtuple_t* dtuple, /*!< in: data tuple to insert, which
+ causes the area increase */
+ const rec_t* rec, /*!< in: physical record which differs from
+ dtuple in some of the common fields, or which
+ has an equal number or more fields than
+ dtuple */
+ double* area) /*!< out: increased area */
+{
+ const dfield_t* dtuple_field;
+
+ ut_ad(!page_rec_is_supremum(rec));
+ ut_ad(!page_rec_is_infimum(rec));
+
+ dtuple_field = dtuple_get_nth_field(dtuple, 0);
+ ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN);
+
+ return rtree_area_increase(rec,
+ static_cast<const byte*>(
+ dfield_get_data(dtuple_field)),
+ area);
+}
+
+/** Estimates the number of rows in a given area.
+@param[in] index index
+@param[in] tuple range tuple containing mbr, may also be empty tuple
+@param[in] mode search mode
+@return estimated number of rows */
+ha_rows
+rtr_estimate_n_rows_in_range(
+ dict_index_t* index,
+ const dtuple_t* tuple,
+ page_cur_mode_t mode)
+{
+ ut_ad(dict_index_is_spatial(index));
+
+ /* Check tuple & mode */
+ if (tuple->n_fields == 0) {
+ return(HA_POS_ERROR);
+ }
+
+ switch (mode) {
+ case PAGE_CUR_DISJOINT:
+ case PAGE_CUR_CONTAIN:
+ case PAGE_CUR_INTERSECT:
+ case PAGE_CUR_WITHIN:
+ case PAGE_CUR_MBR_EQUAL:
+ break;
+ default:
+ return(HA_POS_ERROR);
+ }
+
+ DBUG_EXECUTE_IF("rtr_pcur_move_to_next_return",
+ return(2);
+ );
+
+ /* Read mbr from tuple. */
+ rtr_mbr_t range_mbr;
+ double range_area;
+
+ const dfield_t* dtuple_field = dtuple_get_nth_field(tuple, 0);
+ ut_ad(dfield_get_len(dtuple_field) >= DATA_MBR_LEN);
+ const byte* range_mbr_ptr = reinterpret_cast<const byte*>(
+ dfield_get_data(dtuple_field));
+
+ rtr_read_mbr(range_mbr_ptr, &range_mbr);
+ range_area = (range_mbr.xmax - range_mbr.xmin)
+ * (range_mbr.ymax - range_mbr.ymin);
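+
+ /* range_area is the area of the search MBR; it is used below
+ to estimate the qualifying fraction of a subtree in the
+ PAGE_CUR_WITHIN and PAGE_CUR_MBR_EQUAL modes. */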
+
+ /* Get index root page. */
+ mtr_t mtr;
+
+ mtr.start();
+ index->set_modified(mtr);
+ mtr_s_lock_index(index, &mtr);
+
+ buf_block_t* block = btr_root_block_get(index, RW_S_LATCH, &mtr);
+ if (!block) {
+err_exit:
+ mtr.commit();
+ return HA_POS_ERROR;
+ }
+ const page_t* page = buf_block_get_frame(block);
+ const unsigned n_recs = page_header_get_field(page, PAGE_N_RECS);
+
+ if (n_recs == 0) {
+ goto err_exit;
+ }
+
+ /* Scan records in root page and calculate area. */
+ double area = 0;
+ for (const rec_t* rec = page_rec_get_next(
+ page_get_infimum_rec(block->frame));
+ !page_rec_is_supremum(rec);
+ rec = page_rec_get_next_const(rec)) {
+ rtr_mbr_t mbr;
+ double rec_area;
+
+ rtr_read_mbr(rec, &mbr);
+
+ rec_area = (mbr.xmax - mbr.xmin) * (mbr.ymax - mbr.ymin);
+
+ if (rec_area == 0) {
+ switch (mode) {
+ case PAGE_CUR_CONTAIN:
+ case PAGE_CUR_INTERSECT:
+ area += 1;
+ break;
+
+ case PAGE_CUR_DISJOINT:
+ break;
+
+ case PAGE_CUR_WITHIN:
+ case PAGE_CUR_MBR_EQUAL:
+ if (!rtree_key_cmp(
+ PAGE_CUR_WITHIN, range_mbr_ptr,
+ rec)) {
+ area += 1;
+ }
+
+ break;
+
+ default:
+ ut_error;
+ }
+ } else {
+ switch (mode) {
+ case PAGE_CUR_CONTAIN:
+ case PAGE_CUR_INTERSECT:
+ area += rtree_area_overlapping(
+ range_mbr_ptr, rec)
+ / rec_area;
+ break;
+
+ case PAGE_CUR_DISJOINT:
+ area += 1;
+ area -= rtree_area_overlapping(
+ range_mbr_ptr, rec)
+ / rec_area;
+ break;
+
+ case PAGE_CUR_WITHIN:
+ case PAGE_CUR_MBR_EQUAL:
+ if (!rtree_key_cmp(
+ PAGE_CUR_WITHIN, range_mbr_ptr,
+ rec)) {
+ area += range_area / rec_area;
+ }
+
+ break;
+ default:
+ ut_error;
+ }
+ }
+ }
+
+ mtr.commit();
+
+ if (!std::isfinite(area)) {
+ return(HA_POS_ERROR);
+ }
+
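+ /* "area" now accumulates, for each root-level entry, an
+ estimate of the fraction of that subtree which qualifies;
+ dividing by n_recs gives the average fraction, which then
+ scales the table row count. */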
+ area /= n_recs;
+ return ha_rows(static_cast<double>(dict_table_get_n_rows(index->table))
+ * area);
+}
diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc
new file mode 100644
index 00000000..1c22aab4
--- /dev/null
+++ b/storage/innobase/gis/gis0sea.cc
@@ -0,0 +1,2052 @@
+/*****************************************************************************
+
+Copyright (c) 2016, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file gis/gis0sea.cc
+InnoDB R-tree search interfaces
+
+Created 2014/01/16 Jimmy Yang
+***********************************************************************/
+
+#include "fsp0fsp.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "page0zip.h"
+#include "gis0rtree.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "ibuf0ibuf.h"
+#include "trx0trx.h"
+#include "srv0mon.h"
+#include "que0que.h"
+#include "gis0geo.h"
+
+/** Restore the stored position of a persistent cursor, buffer-fixing the page */
+static
+bool
+rtr_cur_restore_position(
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /*!< in: detached persistent cursor */
+ ulint level, /*!< in: index level */
+ mtr_t* mtr); /*!< in: mtr */
+
+/*************************************************************//**
+Pop out used parent path entry, until we find the parent with matching
+page number */
+static
+void
+rtr_adjust_parent_path(
+/*===================*/
+ rtr_info_t* rtr_info, /* R-Tree info struct */
+ ulint page_no) /* page number to look for */
+{
+ while (!rtr_info->parent_path->empty()) {
+ if (rtr_info->parent_path->back().child_no == page_no) {
+ break;
+ } else {
+ if (rtr_info->parent_path->back().cursor) {
+ btr_pcur_close(
+ rtr_info->parent_path->back().cursor);
+ ut_free(rtr_info->parent_path->back().cursor);
+ }
+
+ rtr_info->parent_path->pop_back();
+ }
+ }
+}
+
+/*************************************************************//**
+Find the next matching record. This function is used by search
+or record locating during index delete/update.
+@return true if there is suitable record found, otherwise false */
+static
+bool
+rtr_pcur_getnext_from_path(
+/*=======================*/
+ const dtuple_t* tuple, /*!< in: data tuple */
+ page_cur_mode_t mode, /*!< in: cursor search mode */
+ btr_cur_t* btr_cur,/*!< in: tree cursor; NOTE that the
+ function may release the page latch */
+ ulint target_level,
+ /*!< in: target level */
+ ulint latch_mode,
+ /*!< in: latch_mode */
+ bool index_locked,
+ /*!< in: index tree locked */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index = btr_cur->index;
+ bool found = false;
+ page_cur_t* page_cursor;
+ ulint level = 0;
+ node_visit_t next_rec;
+ rtr_info_t* rtr_info = btr_cur->rtr_info;
+ node_seq_t page_ssn;
+ ulint my_latch_mode;
+ bool skip_parent = false;
+ bool new_split = false;
+ bool need_parent;
+ bool for_delete = false;
+ bool for_undo_ins = false;
+
+ /* exhausted all the pages to be searched */
+ if (rtr_info->path->empty()) {
+ return(false);
+ }
+
+ ut_ad(dtuple_get_n_fields_cmp(tuple));
+
+ my_latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+
+ for_delete = latch_mode & BTR_RTREE_DELETE_MARK;
+ for_undo_ins = latch_mode & BTR_RTREE_UNDO_INS;
+
+ /* There should be no inserts coming to this function. The
+ only BTR_MODIFY_* operations here should be deletes */
+ ut_ad(mode != PAGE_CUR_RTREE_INSERT);
+ ut_ad(my_latch_mode == BTR_SEARCH_LEAF
+ || my_latch_mode == BTR_MODIFY_LEAF
+ || my_latch_mode == BTR_MODIFY_TREE
+ || my_latch_mode == BTR_CONT_MODIFY_TREE);
+
+ /* Whether we need to track parent information. This is only
+ needed for tree-altering operations (such as index page merge) */
+ need_parent = ((my_latch_mode == BTR_MODIFY_TREE
+ || my_latch_mode == BTR_CONT_MODIFY_TREE)
+ && mode == PAGE_CUR_RTREE_LOCATE);
+
+ if (!index_locked) {
+ ut_ad(latch_mode & BTR_SEARCH_LEAF
+ || latch_mode & BTR_MODIFY_LEAF);
+ mtr_s_lock_index(index, mtr);
+ } else {
+ ut_ad(mtr->memo_contains_flagged(&index->lock,
+ MTR_MEMO_SX_LOCK
+ | MTR_MEMO_S_LOCK
+ | MTR_MEMO_X_LOCK));
+ }
+
+ const ulint zip_size = index->table->space->zip_size();
+
+ /* Pop each node/page to be searched from the "path" structure
+ and do a search on it. Please note, any pages that are in
+ the "path" structure are protected by a "page" lock, so they
+ cannot be shrunk away */
+ do {
+ buf_block_t* block;
+ node_seq_t path_ssn;
+ const page_t* page;
+ ulint rw_latch = RW_X_LATCH;
+ ulint tree_idx;
+
+ mutex_enter(&rtr_info->rtr_path_mutex);
+ next_rec = rtr_info->path->back();
+ rtr_info->path->pop_back();
+ level = next_rec.level;
+ path_ssn = next_rec.seq_no;
+ tree_idx = btr_cur->tree_height - level - 1;
+
+ /* Maintain the parent path info as well, if needed */
+ if (need_parent && !skip_parent && !new_split) {
+ ulint old_level;
+ ulint new_level;
+
+ ut_ad(!rtr_info->parent_path->empty());
+
+ /* Cleanup unused parent info */
+ if (rtr_info->parent_path->back().cursor) {
+ btr_pcur_close(
+ rtr_info->parent_path->back().cursor);
+ ut_free(rtr_info->parent_path->back().cursor);
+ }
+
+ old_level = rtr_info->parent_path->back().level;
+
+ rtr_info->parent_path->pop_back();
+
+ ut_ad(!rtr_info->parent_path->empty());
+
+ /* check whether there is a level change. If so,
+ the current parent path needs to pop enough
+ nodes to adjust to the new search page */
+ new_level = rtr_info->parent_path->back().level;
+
+ if (old_level < new_level) {
+ rtr_adjust_parent_path(
+ rtr_info, next_rec.page_no);
+ }
+
+ ut_ad(!rtr_info->parent_path->empty());
+
+ ut_ad(next_rec.page_no
+ == rtr_info->parent_path->back().child_no);
+ }
+
+ mutex_exit(&rtr_info->rtr_path_mutex);
+
+ skip_parent = false;
+ new_split = false;
+
+ /* Once we have pages in "path", these pages are
+ predicate page locked, so they cannot be shrunk away.
+ They also have an SSN (split sequence number) to detect
+ splits, so we can directly latch a single page while
+ getting them. They can be unlatched if not qualified.
+ One reason for the pre-latch is that we might need to
+ position on some parent node (requires a latch) during
+ the search */
+ if (level == 0) {
+ /* S latched for SEARCH_LEAF, and X latched
+ for MODIFY_LEAF */
+ if (my_latch_mode <= BTR_MODIFY_LEAF) {
+ rw_latch = my_latch_mode;
+ }
+
+ if (my_latch_mode == BTR_CONT_MODIFY_TREE
+ || my_latch_mode == BTR_MODIFY_TREE) {
+ rw_latch = RW_NO_LATCH;
+ }
+
+ } else if (level == target_level) {
+ rw_latch = RW_X_LATCH;
+ }
+
+ /* Release previous locked blocks */
+ if (my_latch_mode != BTR_SEARCH_LEAF) {
+ for (ulint idx = 0; idx < btr_cur->tree_height;
+ idx++) {
+ if (rtr_info->tree_blocks[idx]) {
+ mtr_release_block_at_savepoint(
+ mtr,
+ rtr_info->tree_savepoints[idx],
+ rtr_info->tree_blocks[idx]);
+ rtr_info->tree_blocks[idx] = NULL;
+ }
+ }
+ for (ulint idx = RTR_MAX_LEVELS; idx < RTR_MAX_LEVELS + 3;
+ idx++) {
+ if (rtr_info->tree_blocks[idx]) {
+ mtr_release_block_at_savepoint(
+ mtr,
+ rtr_info->tree_savepoints[idx],
+ rtr_info->tree_blocks[idx]);
+ rtr_info->tree_blocks[idx] = NULL;
+ }
+ }
+ }
+
+ /* set up savepoint to record any locks to be taken */
+ rtr_info->tree_savepoints[tree_idx] = mtr_set_savepoint(mtr);
+
+#ifdef UNIV_RTR_DEBUG
+ ut_ad(!(rw_lock_own_flagged(&btr_cur->page_cur.block->lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S))
+ || my_latch_mode == BTR_MODIFY_TREE
+ || my_latch_mode == BTR_CONT_MODIFY_TREE
+ || !page_is_leaf(buf_block_get_frame(
+ btr_cur->page_cur.block)));
+#endif /* UNIV_RTR_DEBUG */
+
+ dberr_t err = DB_SUCCESS;
+
+ block = buf_page_get_gen(
+ page_id_t(index->table->space_id,
+ next_rec.page_no), zip_size,
+ rw_latch, NULL, BUF_GET, __FILE__, __LINE__, mtr, &err);
+
+ if (block == NULL) {
+ continue;
+ } else if (rw_latch != RW_NO_LATCH) {
+ ut_ad(!dict_index_is_ibuf(index));
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+ }
+
+ rtr_info->tree_blocks[tree_idx] = block;
+
+ page = buf_block_get_frame(block);
+ page_ssn = page_get_ssn_id(page);
+
+ /* If there are splits, push the split page.
+ Note that we have an SX lock on index->lock; there
+ should not be any split/shrink happening here */
+ if (page_ssn > path_ssn) {
+ uint32_t next_page_no = btr_page_get_next(page);
+ rtr_non_leaf_stack_push(
+ rtr_info->path, next_page_no, path_ssn,
+ level, 0, NULL, 0);
+
+ if (!srv_read_only_mode
+ && mode != PAGE_CUR_RTREE_INSERT
+ && mode != PAGE_CUR_RTREE_LOCATE) {
+ ut_ad(rtr_info->thr);
+ lock_place_prdt_page_lock(
+ page_id_t(block->page.id().space(),
+ next_page_no),
+ index,
+ rtr_info->thr);
+ }
+ new_split = true;
+#if defined(UNIV_GIS_DEBUG)
+ fprintf(stderr,
+ "GIS_DIAG: Splitted page found: %d, %ld\n",
+ static_cast<int>(need_parent), next_page_no);
+#endif
+ }
+
+ page_cursor = btr_cur_get_page_cur(btr_cur);
+ page_cursor->rec = NULL;
+
+ if (mode == PAGE_CUR_RTREE_LOCATE) {
+ if (level == target_level && level == 0) {
+ ulint low_match;
+
+ found = false;
+
+ low_match = page_cur_search(
+ block, index, tuple,
+ PAGE_CUR_LE,
+ btr_cur_get_page_cur(btr_cur));
+
+ if (low_match == dtuple_get_n_fields_cmp(
+ tuple)) {
+ rec_t* rec = btr_cur_get_rec(btr_cur);
+
+ if (!rec_get_deleted_flag(rec,
+ dict_table_is_comp(index->table))
+ || (!for_delete && !for_undo_ins)) {
+ found = true;
+ btr_cur->low_match = low_match;
+ } else {
+ /* mark we found deleted row */
+ btr_cur->rtr_info->fd_del
+ = true;
+ }
+ }
+ } else {
+ page_cur_mode_t page_mode = mode;
+
+ if (level == target_level
+ && target_level != 0) {
+ page_mode = PAGE_CUR_RTREE_GET_FATHER;
+ }
+ found = rtr_cur_search_with_match(
+ block, index, tuple, page_mode,
+ page_cursor, btr_cur->rtr_info);
+
+ /* Save the position of parent if needed */
+ if (found && need_parent) {
+ btr_pcur_t* r_cursor =
+ rtr_get_parent_cursor(
+ btr_cur, level, false);
+
+ rec_t* rec = page_cur_get_rec(
+ page_cursor);
+ page_cur_position(
+ rec, block,
+ btr_pcur_get_page_cur(r_cursor));
+ r_cursor->pos_state =
+ BTR_PCUR_IS_POSITIONED;
+ r_cursor->latch_mode = my_latch_mode;
+ btr_pcur_store_position(r_cursor, mtr);
+#ifdef UNIV_DEBUG
+ ulint num_stored =
+ rtr_store_parent_path(
+ block, btr_cur,
+ rw_latch, level, mtr);
+ ut_ad(num_stored > 0);
+#else
+ rtr_store_parent_path(
+ block, btr_cur, rw_latch,
+ level, mtr);
+#endif /* UNIV_DEBUG */
+ }
+ }
+ } else {
+ found = rtr_cur_search_with_match(
+ block, index, tuple, mode, page_cursor,
+ btr_cur->rtr_info);
+ }
+
+ /* Attach predicate lock if needed, no matter whether
+ there are matched records */
+ if (mode != PAGE_CUR_RTREE_INSERT
+ && mode != PAGE_CUR_RTREE_LOCATE
+ && mode >= PAGE_CUR_CONTAIN
+ && btr_cur->rtr_info->need_prdt_lock) {
+ lock_prdt_t prdt;
+
+ trx_t* trx = thr_get_trx(
+ btr_cur->rtr_info->thr);
+ lock_mutex_enter();
+ lock_init_prdt_from_mbr(
+ &prdt, &btr_cur->rtr_info->mbr,
+ mode, trx->lock.lock_heap);
+ lock_mutex_exit();
+
+ if (rw_latch == RW_NO_LATCH) {
+ rw_lock_s_lock(&(block->lock));
+ }
+
+ lock_prdt_lock(block, &prdt, index, LOCK_S,
+ LOCK_PREDICATE, btr_cur->rtr_info->thr);
+
+ if (rw_latch == RW_NO_LATCH) {
+ rw_lock_s_unlock(&(block->lock));
+ }
+ }
+
+ if (found) {
+ if (level == target_level) {
+ page_cur_t* r_cur;
+
+ if (my_latch_mode == BTR_MODIFY_TREE
+ && level == 0) {
+ ut_ad(rw_latch == RW_NO_LATCH);
+
+ btr_cur_latch_leaves(
+ block,
+ BTR_MODIFY_TREE,
+ btr_cur, mtr);
+ }
+
+ r_cur = btr_cur_get_page_cur(btr_cur);
+
+ page_cur_position(
+ page_cur_get_rec(page_cursor),
+ page_cur_get_block(page_cursor),
+ r_cur);
+
+ btr_cur->low_match = level != 0 ?
+ DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1
+ : btr_cur->low_match;
+ break;
+ }
+
+ /* Keep the parent path node, which points to the
+ last node just located */
+ skip_parent = true;
+ } else {
+ /* Release latch on the current page */
+ ut_ad(rtr_info->tree_blocks[tree_idx]);
+
+ mtr_release_block_at_savepoint(
+ mtr, rtr_info->tree_savepoints[tree_idx],
+ rtr_info->tree_blocks[tree_idx]);
+ rtr_info->tree_blocks[tree_idx] = NULL;
+ }
+
+ } while (!rtr_info->path->empty());
+
+ const rec_t* rec = btr_cur_get_rec(btr_cur);
+
+ if (page_rec_is_infimum(rec) || page_rec_is_supremum(rec)) {
+ mtr_commit(mtr);
+ mtr_start(mtr);
+ } else if (!index_locked) {
+ mtr_memo_release(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK);
+ }
+
+ return(found);
+}
+
+/*************************************************************//**
+Find the next matching record. This function will first exhaust
+the copied record listed in the rtr_info->matches vector before
+moving to the next page
+@return true if there is suitable record found, otherwise false */
+bool
+rtr_pcur_move_to_next(
+/*==================*/
+ const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
+ tuple must be set so that it cannot get
+ compared to the node ptr page number field! */
+ page_cur_mode_t mode, /*!< in: cursor search mode */
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ ulint level, /*!< in: target level */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ rtr_info_t* rtr_info = cursor->btr_cur.rtr_info;
+
+ ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ mutex_enter(&rtr_info->matches->rtr_match_mutex);
+ /* First retrieve the next record on the current page */
+ if (!rtr_info->matches->matched_recs->empty()) {
+ rtr_rec_t rec;
+ rec = rtr_info->matches->matched_recs->back();
+ rtr_info->matches->matched_recs->pop_back();
+ mutex_exit(&rtr_info->matches->rtr_match_mutex);
+
+ cursor->btr_cur.page_cur.rec = rec.r_rec;
+ cursor->btr_cur.page_cur.block = &rtr_info->matches->block;
+
+ DEBUG_SYNC_C("rtr_pcur_move_to_next_return");
+ return(true);
+ }
+
+ mutex_exit(&rtr_info->matches->rtr_match_mutex);
+
+ /* Fetch the next page */
+ return(rtr_pcur_getnext_from_path(tuple, mode, &cursor->btr_cur,
+ level, cursor->latch_mode,
+ false, mtr));
+}
+
+/*************************************************************//**
+Check if the cursor holds record pointing to the specified child page
+@return true if it is (pointing to the child page) false otherwise */
+static
+bool
+rtr_compare_cursor_rec(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ btr_cur_t* cursor, /*!< in: Cursor to check */
+ ulint page_no, /*!< in: desired child page number */
+ mem_heap_t** heap) /*!< in: memory heap */
+{
+ const rec_t* rec;
+ rec_offs* offsets;
+
+ rec = btr_cur_get_rec(cursor);
+
+ offsets = rec_get_offsets(rec, index, NULL, 0, ULINT_UNDEFINED, heap);
+
+ return(btr_node_ptr_get_child_page_no(rec, offsets) == page_no);
+}
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. Mainly called by row_search_index_entry() */
+void
+rtr_pcur_open_low(
+/*==============*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level in the rtree */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_RTREE_LOCATE, ... */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_cur_t* btr_cursor;
+ ulint n_fields;
+ ulint low_match;
+ rec_t* rec;
+ bool tree_latched = false;
+ bool for_delete = false;
+ bool for_undo_ins = false;
+
+ ut_ad(level == 0);
+
+ ut_ad(latch_mode & BTR_MODIFY_LEAF || latch_mode & BTR_MODIFY_TREE);
+ ut_ad(mode == PAGE_CUR_RTREE_LOCATE);
+
+ /* Initialize the cursor */
+
+ btr_pcur_init(cursor);
+
+ for_delete = latch_mode & BTR_RTREE_DELETE_MARK;
+ for_undo_ins = latch_mode & BTR_RTREE_UNDO_INS;
+
+ cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+ cursor->search_mode = mode;
+
+ /* Search with the tree cursor */
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ btr_cursor->rtr_info = rtr_create_rtr_info(false, false,
+ btr_cursor, index);
+
+ /* Purge will SX-lock the tree instead of taking page locks */
+ if (btr_cursor->thr) {
+ btr_cursor->rtr_info->need_page_lock = true;
+ btr_cursor->rtr_info->thr = btr_cursor->thr;
+ }
+
+ btr_cur_search_to_nth_level(index, level, tuple, mode, latch_mode,
+ btr_cursor, 0, file, line, mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ cursor->trx_if_known = NULL;
+
+ low_match = btr_pcur_get_low_match(cursor);
+
+ rec = btr_pcur_get_rec(cursor);
+
+ n_fields = dtuple_get_n_fields(tuple);
+
+ if (latch_mode & BTR_ALREADY_S_LATCHED) {
+ ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK));
+ tree_latched = true;
+ }
+
+ if (latch_mode & BTR_MODIFY_TREE) {
+ ut_ad(mtr->memo_contains_flagged(&index->lock,
+ MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ tree_latched = true;
+ }
+
+ if (page_rec_is_infimum(rec) || low_match != n_fields
+ || (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))
+ && (for_delete || for_undo_ins))) {
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))
+ && for_delete) {
+ btr_cursor->rtr_info->fd_del = true;
+ btr_cursor->low_match = 0;
+ }
+ /* Did not find a matching row in the first dive. Release
+ the latched block, if any, before searching more pages */
+ if (latch_mode & BTR_MODIFY_LEAF) {
+ ulint tree_idx = btr_cursor->tree_height - 1;
+ rtr_info_t* rtr_info = btr_cursor->rtr_info;
+
+ ut_ad(level == 0);
+
+ if (rtr_info->tree_blocks[tree_idx]) {
+ mtr_release_block_at_savepoint(
+ mtr,
+ rtr_info->tree_savepoints[tree_idx],
+ rtr_info->tree_blocks[tree_idx]);
+ rtr_info->tree_blocks[tree_idx] = NULL;
+ }
+ }
+
+ bool ret = rtr_pcur_getnext_from_path(
+ tuple, mode, btr_cursor, level, latch_mode,
+ tree_latched, mtr);
+
+ if (ret) {
+ low_match = btr_pcur_get_low_match(cursor);
+ ut_ad(low_match == n_fields);
+ }
+ }
+}
+
+/** Get the rtree page father.
+@param[in] index rtree index
+@param[in] block child page in the index
+@param[in] mtr mtr
+@param[in] sea_cur search cursor, contains information
+ about parent nodes in search
+@param[out] cursor cursor on node pointer record,
+ its page x-latched */
+void
+rtr_page_get_father(
+ dict_index_t* index,
+ buf_block_t* block,
+ mtr_t* mtr,
+ btr_cur_t* sea_cur,
+ btr_cur_t* cursor)
+{
+ mem_heap_t* heap = mem_heap_create(100);
+#ifdef UNIV_DEBUG
+ rec_offs* offsets;
+
+ offsets = rtr_page_get_father_block(
+ NULL, heap, index, block, mtr, sea_cur, cursor);
+
+ ulint page_no = btr_node_ptr_get_child_page_no(cursor->page_cur.rec,
+ offsets);
+
+ ut_ad(page_no == block->page.id().page_no());
+#else
+ rtr_page_get_father_block(
+ NULL, heap, index, block, mtr, sea_cur, cursor);
+#endif
+
+ mem_heap_free(heap);
+}
+
+/********************************************************************//**
+Returns the upper level node pointer to a R-Tree page. It is assumed
+that mtr holds an x-latch on the tree. */
+static void rtr_get_father_node(
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the tree level of search */
+ const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
+ tuple must be set so that it cannot get
+ compared to the node ptr page number field! */
+ btr_cur_t* sea_cur,/*!< in: search cursor */
+ btr_cur_t* btr_cur,/*!< in/out: tree cursor; the cursor page is
+ s- or x-latched, but see also above! */
+ ulint page_no,/*!< Current page no */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mem_heap_t* heap = NULL;
+ bool ret = false;
+ const rec_t* rec;
+ ulint n_fields;
+ bool new_rtr = false;
+
+ /* Try to optimally locate the parent node. Level should
+ always be less than sea_cur->tree_height unless the root is
+ splitting */
+ if (sea_cur && sea_cur->tree_height > level) {
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ret = rtr_cur_restore_position(
+ BTR_CONT_MODIFY_TREE, sea_cur, level, mtr);
+
+ /* Since we block shrinking of tree nodes while there are
+ active searches on them, this optimistic locating should
+ always succeed */
+ ut_ad(ret);
+
+ if (ret) {
+ btr_pcur_t* r_cursor = rtr_get_parent_cursor(
+ sea_cur, level, false);
+
+ rec = btr_pcur_get_rec(r_cursor);
+
+ ut_ad(r_cursor->rel_pos == BTR_PCUR_ON);
+ page_cur_position(rec,
+ btr_pcur_get_block(r_cursor),
+ btr_cur_get_page_cur(btr_cur));
+ btr_cur->rtr_info = sea_cur->rtr_info;
+ btr_cur->tree_height = sea_cur->tree_height;
+ ut_ad(rtr_compare_cursor_rec(
+ index, btr_cur, page_no, &heap));
+ goto func_exit;
+ }
+ }
+
+ /* We arrive here in one of two scenarios:
+ 1) check table and btr_validate
+ 2) the index root page being raised */
+ ut_ad(!sea_cur || sea_cur->tree_height == level);
+
+ if (btr_cur->rtr_info) {
+ rtr_clean_rtr_info(btr_cur->rtr_info, true);
+ } else {
+ new_rtr = true;
+ }
+
+ btr_cur->rtr_info = rtr_create_rtr_info(false, false, btr_cur, index);
+
+ if (sea_cur && sea_cur->tree_height == level) {
+ /* root split, and search the new root */
+ btr_cur_search_to_nth_level(
+ index, level, tuple, PAGE_CUR_RTREE_LOCATE,
+ BTR_CONT_MODIFY_TREE, btr_cur, 0,
+ __FILE__, __LINE__, mtr);
+
+ } else {
+ /* btr_validate */
+ ut_ad(level >= 1);
+ ut_ad(!sea_cur);
+
+ btr_cur_search_to_nth_level(
+ index, level, tuple, PAGE_CUR_RTREE_LOCATE,
+ BTR_CONT_MODIFY_TREE, btr_cur, 0,
+ __FILE__, __LINE__, mtr);
+
+ rec = btr_cur_get_rec(btr_cur);
+ n_fields = dtuple_get_n_fields_cmp(tuple);
+
+ if (page_rec_is_infimum(rec)
+ || (btr_cur->low_match != n_fields)) {
+ ret = rtr_pcur_getnext_from_path(
+ tuple, PAGE_CUR_RTREE_LOCATE, btr_cur,
+ level, BTR_CONT_MODIFY_TREE,
+ true, mtr);
+
+ ut_ad(ret && btr_cur->low_match == n_fields);
+ }
+ }
+
+ ret = rtr_compare_cursor_rec(
+ index, btr_cur, page_no, &heap);
+
+ ut_ad(ret);
+
+func_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ if (new_rtr && btr_cur->rtr_info) {
+ rtr_clean_rtr_info(btr_cur->rtr_info, true);
+ btr_cur->rtr_info = NULL;
+ }
+}
+
+/** Returns the upper level node pointer to a R-Tree page. It is assumed
+that mtr holds an SX-latch or X-latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+static
+rec_offs*
+rtr_page_get_father_node_ptr(
+ rec_offs* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ btr_cur_t* sea_cur,/*!< in: search cursor */
+ btr_cur_t* cursor, /*!< in: cursor pointing to user record,
+ out: cursor on node pointer record,
+ its page x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dtuple_t* tuple;
+ rec_t* user_rec;
+ rec_t* node_ptr;
+ ulint level;
+ ulint page_no;
+ dict_index_t* index;
+ rtr_mbr_t mbr;
+
+ page_no = btr_cur_get_block(cursor)->page.id().page_no();
+ index = btr_cur_get_index(cursor);
+
+ ut_ad(srv_read_only_mode
+ || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+
+ ut_ad(dict_index_get_page(index) != page_no);
+
+ level = btr_page_get_level(btr_cur_get_page(cursor));
+
+ user_rec = btr_cur_get_rec(cursor);
+ ut_a(page_rec_is_user_rec(user_rec));
+
+ offsets = rec_get_offsets(user_rec, index, offsets,
+ level ? 0 : index->n_fields,
+ ULINT_UNDEFINED, &heap);
+ rtr_get_mbr_from_rec(user_rec, offsets, &mbr);
+
+ tuple = rtr_index_build_node_ptr(
+ index, &mbr, user_rec, page_no, heap);
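+
+ /* The node pointer tuple carries the child page's MBR and its
+ page number; searching one level up with it locates the parent
+ record that points back to this child. */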
+
+ if (sea_cur && !sea_cur->rtr_info) {
+ sea_cur = NULL;
+ }
+
+ rtr_get_father_node(index, level + 1, tuple, sea_cur, cursor,
+ page_no, mtr);
+
+ node_ptr = btr_cur_get_rec(cursor);
+ ut_ad(!page_rec_is_comp(node_ptr)
+ || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
+ offsets = rec_get_offsets(node_ptr, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ ulint child_page = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+
+ if (child_page != page_no) {
+ const rec_t* print_rec;
+
+ ib::fatal error;
+
+ error << "Corruption of index " << index->name
+ << " of table " << index->table->name
+ << " parent page " << page_no
+ << " child page " << child_page;
+
+ print_rec = page_rec_get_next(
+ page_get_infimum_rec(page_align(user_rec)));
+ offsets = rec_get_offsets(print_rec, index, offsets,
+ page_rec_is_leaf(user_rec)
+ ? index->n_fields : 0,
+ ULINT_UNDEFINED, &heap);
+ error << "; child ";
+ rec_print(error.m_oss, print_rec,
+ rec_get_info_bits(print_rec, rec_offs_comp(offsets)),
+ offsets);
+ offsets = rec_get_offsets(node_ptr, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ error << "; parent ";
+ rec_print(error.m_oss, print_rec,
+ rec_get_info_bits(print_rec, rec_offs_comp(offsets)),
+ offsets);
+
+ error << ". You should dump + drop + reimport the table to"
+ " fix the corruption. If the crash happens at"
+ " database startup, see "
+ "https://mariadb.com/kb/en/library/innodb-recovery-modes/"
+ " about forcing"
+ " recovery. Then dump + drop + reimport.";
+ }
+
+ return(offsets);
+}
+
+/************************************************************//**
+Returns the father block to a page. It is assumed that mtr holds
+an X or SX latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+rec_offs*
+rtr_page_get_father_block(
+/*======================*/
+ rec_offs* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ dict_index_t* index, /*!< in: b-tree index */
+ buf_block_t* block, /*!< in: child page in the index */
+ mtr_t* mtr, /*!< in: mtr */
+ btr_cur_t* sea_cur,/*!< in: search cursor, contains information
+ about parent nodes in search */
+ btr_cur_t* cursor) /*!< out: cursor on node pointer record,
+ its page x-latched */
+{
+ rec_t* rec = page_rec_get_next(
+ page_get_infimum_rec(buf_block_get_frame(block)));
+ btr_cur_position(index, rec, block, cursor);
+
+ return(rtr_page_get_father_node_ptr(offsets, heap, sea_cur,
+ cursor, mtr));
+}
+
+/*******************************************************************//**
+Create a RTree search info structure */
+rtr_info_t*
+rtr_create_rtr_info(
+/******************/
+ bool need_prdt, /*!< in: Whether predicate lock
+ is needed */
+ bool init_matches, /*!< in: Whether to initiate the
+ "matches" structure for collecting
+ matched leaf records */
+ btr_cur_t* cursor, /*!< in: tree search cursor */
+ dict_index_t* index) /*!< in: index struct */
+{
+ rtr_info_t* rtr_info;
+
+ index = index ? index : cursor->index;
+ ut_ad(index);
+
+ rtr_info = static_cast<rtr_info_t*>(ut_zalloc_nokey(sizeof(*rtr_info)));
+
+ rtr_info->allocated = true;
+ rtr_info->cursor = cursor;
+ rtr_info->index = index;
+
+ if (init_matches) {
+ rtr_info->heap = mem_heap_create(sizeof(*(rtr_info->matches)));
+ rtr_info->matches = static_cast<matched_rec_t*>(
+ mem_heap_zalloc(
+ rtr_info->heap,
+ sizeof(*rtr_info->matches)));
+
+ rtr_info->matches->matched_recs
+ = UT_NEW_NOKEY(rtr_rec_vector());
+
+ rtr_info->matches->bufp = page_align(rtr_info->matches->rec_buf
+ + UNIV_PAGE_SIZE_MAX + 1);
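+ /* bufp is the first page-aligned address within rec_buf,
+ presumably so that the dummy matches->block can present a
+ page-aligned frame for the copied match records. */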
+ mutex_create(LATCH_ID_RTR_MATCH_MUTEX,
+ &rtr_info->matches->rtr_match_mutex);
+ rw_lock_create(PFS_NOT_INSTRUMENTED,
+ &(rtr_info->matches->block.lock),
+ SYNC_LEVEL_VARYING);
+ }
+
+ rtr_info->path = UT_NEW_NOKEY(rtr_node_path_t());
+ rtr_info->parent_path = UT_NEW_NOKEY(rtr_node_path_t());
+ rtr_info->need_prdt_lock = need_prdt;
+ mutex_create(LATCH_ID_RTR_PATH_MUTEX,
+ &rtr_info->rtr_path_mutex);
+
+ mutex_enter(&index->rtr_track->rtr_active_mutex);
+ index->rtr_track->rtr_active.push_front(rtr_info);
+ mutex_exit(&index->rtr_track->rtr_active_mutex);
+ return(rtr_info);
+}
+
+/*******************************************************************//**
+Update a btr_cur_t with rtr_info */
+void
+rtr_info_update_btr(
+/******************/
+ btr_cur_t* cursor, /*!< in/out: tree cursor */
+ rtr_info_t* rtr_info) /*!< in: rtr_info to set to the
+ cursor */
+{
+ ut_ad(rtr_info);
+
+ cursor->rtr_info = rtr_info;
+}
+
+/*******************************************************************//**
+Initialize a R-Tree Search structure */
+void
+rtr_init_rtr_info(
+/****************/
+ rtr_info_t* rtr_info, /*!< in: rtr_info to set to the
+ cursor */
+ bool need_prdt, /*!< in: Whether predicate lock is
+ needed */
+ btr_cur_t* cursor, /*!< in: tree search cursor */
+ dict_index_t* index, /*!< in: index structure */
+ bool reinit) /*!< in: Whether this is a reinit */
+{
+ ut_ad(rtr_info);
+
+ if (!reinit) {
+ /* Reset all members. */
+ rtr_info->path = NULL;
+ rtr_info->parent_path = NULL;
+ rtr_info->matches = NULL;
+
+ mutex_create(LATCH_ID_RTR_PATH_MUTEX,
+ &rtr_info->rtr_path_mutex);
+
+ memset(rtr_info->tree_blocks, 0x0,
+ sizeof(rtr_info->tree_blocks));
+ memset(rtr_info->tree_savepoints, 0x0,
+ sizeof(rtr_info->tree_savepoints));
+ rtr_info->mbr.xmin = 0.0;
+ rtr_info->mbr.xmax = 0.0;
+ rtr_info->mbr.ymin = 0.0;
+ rtr_info->mbr.ymax = 0.0;
+ rtr_info->thr = NULL;
+ rtr_info->heap = NULL;
+ rtr_info->cursor = NULL;
+ rtr_info->index = NULL;
+ rtr_info->need_prdt_lock = false;
+ rtr_info->need_page_lock = false;
+ rtr_info->allocated = false;
+ rtr_info->mbr_adj = false;
+ rtr_info->fd_del = false;
+ rtr_info->search_tuple = NULL;
+ rtr_info->search_mode = PAGE_CUR_UNSUPP;
+ }
+
+ ut_ad(!rtr_info->matches || rtr_info->matches->matched_recs->empty());
+
+ rtr_info->path = UT_NEW_NOKEY(rtr_node_path_t());
+ rtr_info->parent_path = UT_NEW_NOKEY(rtr_node_path_t());
+ rtr_info->need_prdt_lock = need_prdt;
+ rtr_info->cursor = cursor;
+ rtr_info->index = index;
+
+ mutex_enter(&index->rtr_track->rtr_active_mutex);
+ index->rtr_track->rtr_active.push_front(rtr_info);
+ mutex_exit(&index->rtr_track->rtr_active_mutex);
+}
+
+/**************************************************************//**
+Clean up R-Tree search structure */
+void
+rtr_clean_rtr_info(
+/*===============*/
+ rtr_info_t* rtr_info, /*!< in: RTree search info */
+ bool free_all) /*!< in: need to free rtr_info itself */
+{
+ dict_index_t* index;
+ bool initialized = false;
+
+ if (!rtr_info) {
+ return;
+ }
+
+ index = rtr_info->index;
+
+ if (index) {
+ mutex_enter(&index->rtr_track->rtr_active_mutex);
+ }
+
+ while (rtr_info->parent_path && !rtr_info->parent_path->empty()) {
+ btr_pcur_t* cur = rtr_info->parent_path->back().cursor;
+ rtr_info->parent_path->pop_back();
+
+ if (cur) {
+ btr_pcur_close(cur);
+ ut_free(cur);
+ }
+ }
+
+ UT_DELETE(rtr_info->parent_path);
+ rtr_info->parent_path = NULL;
+
+ if (rtr_info->path != NULL) {
+ UT_DELETE(rtr_info->path);
+ rtr_info->path = NULL;
+ initialized = true;
+ }
+
+ if (rtr_info->matches) {
+ rtr_info->matches->used = false;
+ rtr_info->matches->locked = false;
+ rtr_info->matches->valid = false;
+ rtr_info->matches->matched_recs->clear();
+ }
+
+ if (index) {
+ index->rtr_track->rtr_active.remove(rtr_info);
+ mutex_exit(&index->rtr_track->rtr_active_mutex);
+ }
+
+ if (free_all) {
+ if (rtr_info->matches) {
+ if (rtr_info->matches->matched_recs != NULL) {
+ UT_DELETE(rtr_info->matches->matched_recs);
+ }
+
+ rw_lock_free(&(rtr_info->matches->block.lock));
+
+ mutex_destroy(&rtr_info->matches->rtr_match_mutex);
+ }
+
+ if (rtr_info->heap) {
+ mem_heap_free(rtr_info->heap);
+ }
+
+ if (initialized) {
+ mutex_destroy(&rtr_info->rtr_path_mutex);
+ }
+
+ if (rtr_info->allocated) {
+ ut_free(rtr_info);
+ }
+ }
+}
+
+/**************************************************************//**
+Rebuild the "path" to exclude the page number being removed */
+static
+void
+rtr_rebuild_path(
+/*=============*/
+ rtr_info_t* rtr_info, /*!< in: RTree search info */
+ ulint page_no) /*!< in: page number to exclude */
+{
+ rtr_node_path_t* new_path
+ = UT_NEW_NOKEY(rtr_node_path_t());
+
+ rtr_node_path_t::iterator rit;
+#ifdef UNIV_DEBUG
+ ulint before_size = rtr_info->path->size();
+#endif /* UNIV_DEBUG */
+
+ for (rit = rtr_info->path->begin();
+ rit != rtr_info->path->end(); ++rit) {
+ node_visit_t next_rec = *rit;
+
+ if (next_rec.page_no == page_no) {
+ continue;
+ }
+
+ new_path->push_back(next_rec);
+#ifdef UNIV_DEBUG
+ node_visit_t rec = new_path->back();
+ ut_ad(rec.level < rtr_info->cursor->tree_height
+ && rec.page_no > 0);
+#endif /* UNIV_DEBUG */
+ }
+
+ UT_DELETE(rtr_info->path);
+
+ ut_ad(new_path->size() == before_size - 1);
+
+ rtr_info->path = new_path;
+
+ if (!rtr_info->parent_path->empty()) {
+ rtr_node_path_t* new_parent_path = UT_NEW_NOKEY(
+ rtr_node_path_t());
+
+ for (rit = rtr_info->parent_path->begin();
+ rit != rtr_info->parent_path->end(); ++rit) {
+ node_visit_t next_rec = *rit;
+
+ if (next_rec.child_no == page_no) {
+ btr_pcur_t* cur = next_rec.cursor;
+
+ if (cur) {
+ btr_pcur_close(cur);
+ ut_free(cur);
+ }
+
+ continue;
+ }
+
+ new_parent_path->push_back(next_rec);
+ }
+ UT_DELETE(rtr_info->parent_path);
+ rtr_info->parent_path = new_parent_path;
+ }
+
+}
+
+/**************************************************************//**
+Check whether a discarding page is in anyone's search path */
+void
+rtr_check_discard_page(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
+ the root page */
+ buf_block_t* block) /*!< in: block of page to be discarded */
+{
+ const ulint pageno = block->page.id().page_no();
+
+ mutex_enter(&index->rtr_track->rtr_active_mutex);
+
+ for (const auto& rtr_info : index->rtr_track->rtr_active) {
+ if (cursor && rtr_info == cursor->rtr_info) {
+ continue;
+ }
+
+ mutex_enter(&rtr_info->rtr_path_mutex);
+ for (const node_visit_t& node : *rtr_info->path) {
+ if (node.page_no == pageno) {
+ rtr_rebuild_path(rtr_info, pageno);
+ break;
+ }
+ }
+ mutex_exit(&rtr_info->rtr_path_mutex);
+
+ if (rtr_info->matches) {
+ mutex_enter(&rtr_info->matches->rtr_match_mutex);
+
+ if (rtr_info->matches->block.page.id().page_no()
+ == pageno) {
+ if (!rtr_info->matches->matched_recs->empty()) {
+ rtr_info->matches->matched_recs->clear();
+ }
+ ut_ad(rtr_info->matches->matched_recs->empty());
+ rtr_info->matches->valid = false;
+ }
+
+ mutex_exit(&rtr_info->matches->rtr_match_mutex);
+ }
+ }
+
+ mutex_exit(&index->rtr_track->rtr_active_mutex);
+
+ lock_mutex_enter();
+ lock_prdt_page_free_from_discard(block, &lock_sys.prdt_hash);
+ lock_prdt_page_free_from_discard(block, &lock_sys.prdt_page_hash);
+ lock_mutex_exit();
+}
+
+/** Structure that acts as a functor to get optimistic access to a page.
+It returns true if it successfully gets the page. */
+struct optimistic_get
+{
+ btr_pcur_t *const r_cursor;
+ mtr_t *const mtr;
+
+ optimistic_get(btr_pcur_t *r_cursor,mtr_t *mtr)
+ :r_cursor(r_cursor), mtr(mtr) {}
+
+ bool operator()(buf_block_t *hint) const
+ {
+ return hint && buf_page_optimistic_get(
+ RW_X_LATCH, hint, r_cursor->modify_clock, __FILE__,
+ __LINE__, mtr);
+ }
+};
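+
+/* optimistic_get is used below with block_when_stored.run_with_hint()
+to revalidate the cached block via its modify clock before a stored
+cursor position is trusted. */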
+
+/** Restore the stored position of a persistent cursor, buffer-fixing the page */
+static
+bool
+rtr_cur_restore_position(
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* btr_cur, /*!< in: detached persistent cursor */
+ ulint level, /*!< in: index level */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ mem_heap_t* heap;
+ btr_pcur_t* r_cursor = rtr_get_parent_cursor(btr_cur, level, false);
+ dtuple_t* tuple;
+ bool ret = false;
+
+ ut_ad(mtr);
+ ut_ad(r_cursor);
+ ut_ad(mtr->is_active());
+
+ index = btr_cur_get_index(btr_cur);
+
+ if (r_cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
+ || r_cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
+ return(false);
+ }
+
+ DBUG_EXECUTE_IF(
+ "rtr_pessimistic_position",
+ r_cursor->modify_clock = 100;
+ );
+
+ ut_ad(latch_mode == BTR_CONT_MODIFY_TREE);
+
+ if (r_cursor->block_when_stored.run_with_hint(
+ optimistic_get(r_cursor, mtr))) {
+ ut_ad(r_cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ ut_ad(r_cursor->rel_pos == BTR_PCUR_ON);
+#ifdef UNIV_DEBUG
+ do {
+ const rec_t* rec;
+ const rec_offs* offsets1;
+ const rec_offs* offsets2;
+ ulint comp;
+
+ rec = btr_pcur_get_rec(r_cursor);
+
+ heap = mem_heap_create(256);
+ offsets1 = rec_get_offsets(
+ r_cursor->old_rec, index, NULL,
+ level ? 0 : r_cursor->old_n_fields,
+ r_cursor->old_n_fields, &heap);
+ offsets2 = rec_get_offsets(
+ rec, index, NULL,
+ level ? 0 : r_cursor->old_n_fields,
+ r_cursor->old_n_fields, &heap);
+
+ comp = rec_offs_comp(offsets1);
+
+ if (rec_get_info_bits(r_cursor->old_rec, comp)
+ & REC_INFO_MIN_REC_FLAG) {
+ ut_ad(rec_get_info_bits(rec, comp)
+ & REC_INFO_MIN_REC_FLAG);
+ } else {
+
+ ut_ad(!cmp_rec_rec(r_cursor->old_rec,
+ rec, offsets1, offsets2,
+ index));
+ }
+
+ mem_heap_free(heap);
+ } while (0);
+#endif /* UNIV_DEBUG */
+
+ return(true);
+ }
+
+ /* The page has changed. For an R-Tree, the page cannot be
+ shrunk away, so we search the page and its right siblings */
+ buf_block_t* block;
+ node_seq_t page_ssn;
+ const page_t* page;
+ page_cur_t* page_cursor;
+ node_visit_t* node = rtr_get_parent_node(btr_cur, level, false);
+ node_seq_t path_ssn = node->seq_no;
+ const unsigned zip_size = index->table->space->zip_size();
+ uint32_t page_no = node->page_no;
+
+ heap = mem_heap_create(256);
+
+ tuple = dict_index_build_data_tuple(r_cursor->old_rec, index, !level,
+ r_cursor->old_n_fields, heap);
+
+ page_cursor = btr_pcur_get_page_cur(r_cursor);
+ ut_ad(r_cursor == node->cursor);
+
+search_again:
+ dberr_t err = DB_SUCCESS;
+
+ block = buf_page_get_gen(
+ page_id_t(index->table->space_id, page_no),
+ zip_size, RW_X_LATCH, NULL,
+ BUF_GET, __FILE__, __LINE__, mtr, &err);
+
+ ut_ad(block);
+
+ /* Get the page SSN */
+ page = buf_block_get_frame(block);
+ page_ssn = page_get_ssn_id(page);
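+
+ /* A page SSN larger than the SSN recorded in the stored path
+ means the page has been split since the position was stored,
+ so the record may have moved to a right sibling (checked at
+ the end of this function). */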
+
+ ulint low_match = page_cur_search(
+ block, index, tuple, PAGE_CUR_LE, page_cursor);
+
+ if (low_match == r_cursor->old_n_fields) {
+ const rec_t* rec;
+ const rec_offs* offsets1;
+ const rec_offs* offsets2;
+ ulint comp;
+
+ rec = btr_pcur_get_rec(r_cursor);
+
+ offsets1 = rec_get_offsets(r_cursor->old_rec, index, NULL,
+ level ? 0 : r_cursor->old_n_fields,
+ r_cursor->old_n_fields, &heap);
+ offsets2 = rec_get_offsets(rec, index, NULL,
+ level ? 0 : r_cursor->old_n_fields,
+ r_cursor->old_n_fields, &heap);
+
+ comp = rec_offs_comp(offsets1);
+
+ if ((rec_get_info_bits(r_cursor->old_rec, comp)
+ & REC_INFO_MIN_REC_FLAG)
+ && (rec_get_info_bits(rec, comp) & REC_INFO_MIN_REC_FLAG)) {
+ r_cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+ ret = true;
+ } else if (!cmp_rec_rec(r_cursor->old_rec, rec, offsets1, offsets2,
+ index)) {
+ r_cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+ ret = true;
+ }
+ }
+
+ /* Check the page SSN to see if the page has been split;
+ if so, search the right page */
+ if (!ret && page_ssn > path_ssn) {
+ page_no = btr_page_get_next(page);
+ goto search_again;
+ }
+
+ mem_heap_free(heap);
+
+ return(ret);
+}
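+
+/* Implementation note on the re-search loop above (a sketch of the
+invariant, not additional logic): every R-tree page carries a split
+sequence number (SSN). When a parent pointer is pushed onto the path
+stack, the SSN observed at that time is recorded in node->seq_no. If the
+page's current SSN is newer than the recorded one when the position is
+restored, the page has been split in the meantime and the sought record
+may have moved to a right sibling; hence the loop keeps following
+btr_page_get_next() until the record is found again. */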
+
+/****************************************************************//**
+Copy the leaf-level R-tree record and push it to matched_rec in rtr_info */
+static
+void
+rtr_leaf_push_match_rec(
+/*====================*/
+ const rec_t* rec, /*!< in: record to copy */
+ rtr_info_t* rtr_info, /*!< in/out: search stack */
+ rec_offs* offsets, /*!< in: offsets */
+ bool is_comp) /*!< in: is compact format */
+{
+ byte* buf;
+ matched_rec_t* match_rec = rtr_info->matches;
+ rec_t* copy;
+ ulint data_len;
+ rtr_rec_t rtr_rec;
+
+ buf = match_rec->block.frame + match_rec->used;
+ ut_ad(page_rec_is_leaf(rec));
+
+ copy = rec_copy(buf, rec, offsets);
+
+ if (is_comp) {
+ rec_set_next_offs_new(copy, PAGE_NEW_SUPREMUM);
+ } else {
+ rec_set_next_offs_old(copy, PAGE_OLD_SUPREMUM);
+ }
+
+ rtr_rec.r_rec = copy;
+ rtr_rec.locked = false;
+
+ match_rec->matched_recs->push_back(rtr_rec);
+ match_rec->valid = true;
+
+ data_len = rec_offs_data_size(offsets) + rec_offs_extra_size(offsets);
+ match_rec->used += data_len;
+
+ ut_ad(match_rec->used < srv_page_size);
+}
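+
+/* Design note for rtr_leaf_push_match_rec() above: matched leaf records
+are copied one after another into the shadow page buffer that
+rtr_init_match() prepares (match_rec->block.frame), and each copy's
+next-record pointer is rewired to the supremum. This way every copy can
+be treated as the last user record of an ordinary page when the cursor
+is later positioned on it. */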
+
+/**************************************************************//**
+Store the parent path cursors.
+@return number of cursors stored */
+ulint
+rtr_store_parent_path(
+/*==================*/
+ const buf_block_t* block, /*!< in: block of the page */
+ btr_cur_t* btr_cur,/*!< in/out: persistent cursor */
+ ulint latch_mode,
+ /*!< in: latch_mode */
+ ulint level, /*!< in: index level */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint num = btr_cur->rtr_info->parent_path->size();
+ ulint num_stored = 0;
+
+ while (num >= 1) {
+ node_visit_t* node = &(*btr_cur->rtr_info->parent_path)[
+ num - 1];
+ btr_pcur_t* r_cursor = node->cursor;
+ buf_block_t* cur_block;
+
+ if (node->level > level) {
+ break;
+ }
+
+ r_cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+ r_cursor->latch_mode = latch_mode;
+
+ cur_block = btr_pcur_get_block(r_cursor);
+
+ if (cur_block == block) {
+ btr_pcur_store_position(r_cursor, mtr);
+ num_stored++;
+ } else {
+ break;
+ }
+
+ num--;
+ }
+
+ return(num_stored);
+}
+/**************************************************************//**
+Push a non-leaf index node to the search path for insertion */
+static
+void
+rtr_non_leaf_insert_stack_push(
+/*===========================*/
+ dict_index_t* index, /*!< in: index descriptor */
+ rtr_node_path_t* path, /*!< in/out: search path */
+ ulint level, /*!< in: index page level */
+ uint32_t child_no,/*!< in: child page no */
+ const buf_block_t* block, /*!< in: block of the page */
+ const rec_t* rec, /*!< in: positioned record */
+ double mbr_inc)/*!< in: MBR needs to be enlarged */
+{
+ node_seq_t new_seq;
+ btr_pcur_t* my_cursor;
+
+ my_cursor = static_cast<btr_pcur_t*>(
+ ut_malloc_nokey(sizeof(*my_cursor)));
+
+ btr_pcur_init(my_cursor);
+
+ page_cur_position(rec, block, btr_pcur_get_page_cur(my_cursor));
+
+ (btr_pcur_get_btr_cur(my_cursor))->index = index;
+
+ new_seq = rtr_get_current_ssn_id(index);
+ rtr_non_leaf_stack_push(path, block->page.id().page_no(),
+ new_seq, level, child_no, my_cursor, mbr_inc);
+}
+
+/** Copy a buf_block_t, except "block->lock".
+@param[in,out] matches copy to match->block
+@param[in] block block to copy */
+static
+void
+rtr_copy_buf(
+ matched_rec_t* matches,
+ const buf_block_t* block)
+{
+ /* Copy all members of "block" to "matches->block" except "lock".
+ We skip "lock" because it is not used
+ from the dummy buf_block_t we create here and because memcpy()ing
+ it generates (valid) compiler warnings that the vtable pointer
+ will be copied. */
+ new (&matches->block.page) buf_page_t(block->page);
+ matches->block.frame = block->frame;
+ matches->block.unzip_LRU = block->unzip_LRU;
+
+ ut_d(matches->block.in_unzip_LRU_list = block->in_unzip_LRU_list);
+ ut_d(matches->block.in_withdraw_list = block->in_withdraw_list);
+
+ /* Skip buf_block_t::lock */
+ matches->block.modify_clock = block->modify_clock;
+#ifdef BTR_CUR_HASH_ADAPT
+ matches->block.n_hash_helps = block->n_hash_helps;
+ matches->block.n_fields = block->n_fields;
+ matches->block.left_side = block->left_side;
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ matches->block.n_pointers = 0;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ matches->block.curr_n_fields = block->curr_n_fields;
+ matches->block.curr_left_side = block->curr_left_side;
+ matches->block.index = block->index;
+#endif /* BTR_CUR_HASH_ADAPT */
+ ut_d(matches->block.debug_latch = NULL);
+}
+
+/****************************************************************//**
+Generate a shadow copy of the page block header to save the
+matched records */
+static
+void
+rtr_init_match(
+/*===========*/
+ matched_rec_t* matches,/*!< in/out: match to initialize */
+ const buf_block_t* block, /*!< in: buffer block */
+ const page_t* page) /*!< in: buffer page */
+{
+ ut_ad(matches->matched_recs->empty());
+ matches->locked = false;
+ rtr_copy_buf(matches, block);
+ matches->block.frame = matches->bufp;
+ matches->valid = false;
+	/* We have to copy PAGE_NEW_SUPREMUM_END or PAGE_OLD_SUPREMUM_END
+	bytes so that we can use the infimum/supremum of this copy as on a
+	normal btr page during the search. */
+ memcpy(matches->block.frame, page, page_is_comp(page)
+ ? PAGE_NEW_SUPREMUM_END
+ : PAGE_OLD_SUPREMUM_END);
+ matches->used = page_is_comp(page)
+ ? PAGE_NEW_SUPREMUM_END
+ : PAGE_OLD_SUPREMUM_END;
+#ifdef RTR_SEARCH_DIAGNOSTIC
+ ulint pageno = page_get_page_no(page);
+ fprintf(stderr, "INNODB_RTR: Searching leaf page %d\n",
+ static_cast<int>(pageno));
+#endif /* RTR_SEARCH_DIAGNOSTIC */
+}
+
+/****************************************************************//**
+Get the bounding box content from an index record */
+void
+rtr_get_mbr_from_rec(
+/*=================*/
+	const rec_t*	rec,	/*!< in: index record */
+	const rec_offs*	offsets,/*!< in: offsets array */
+	rtr_mbr_t*	mbr)	/*!< out: MBR */
+{
+ ulint rec_f_len;
+ const byte* data;
+
+ data = rec_get_nth_field(rec, offsets, 0, &rec_f_len);
+
+ rtr_read_mbr(data, mbr);
+}
+
+/****************************************************************//**
+Get the bounding box content from an MBR data tuple */
+void
+rtr_get_mbr_from_tuple(
+/*===================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ rtr_mbr* mbr) /*!< out: mbr to fill */
+{
+ const dfield_t* dtuple_field;
+ ulint dtuple_f_len;
+
+ dtuple_field = dtuple_get_nth_field(dtuple, 0);
+ dtuple_f_len = dfield_get_len(dtuple_field);
+ ut_a(dtuple_f_len >= 4 * sizeof(double));
+
+ rtr_read_mbr(static_cast<const byte*>(dfield_get_data(dtuple_field)),
+ mbr);
+}
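+
+/* In both functions above, the MBR is stored as four IEEE doubles
+(DATA_MBR_LEN bytes in total). A minimal sketch of the decoding performed
+by rtr_read_mbr(), assuming the usual InnoDB field order (minimum and
+maximum of each of the two dimensions):
+
+	mbr->xmin = mach_double_read(data);
+	mbr->xmax = mach_double_read(data + sizeof(double));
+	mbr->ymin = mach_double_read(data + 2 * sizeof(double));
+	mbr->ymax = mach_double_read(data + 3 * sizeof(double));
+*/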
+
+/** Compare minimum bounding rectangles.
+@return 1, 0 or -1 if mode == PAGE_CUR_MBR_EQUAL; otherwise 1 or 0,
+depending on whether a and b satisfy the requested relationship
+(CONTAINS, WITHIN, etc.) */
+static int cmp_gis_field(page_cur_mode_t mode, const void *a, const void *b)
+{
+ return mode == PAGE_CUR_MBR_EQUAL
+ ? cmp_geometry_field(a, b)
+ : rtree_key_cmp(mode, a, b);
+}
+
+/** Compare a GIS data tuple to a physical record in an R-tree non-leaf node.
+We also need to check the child page number field, since the PK fields are
+not stored in R-tree non-leaf nodes.
+@param[in] dtuple data tuple
+@param[in] rec R-tree record
+@return whether dtuple is less than rec */
+static bool
+cmp_dtuple_rec_with_gis_internal(const dtuple_t* dtuple, const rec_t* rec)
+{
+ const dfield_t *dtuple_field= dtuple_get_nth_field(dtuple, 0);
+ ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN);
+
+ if (cmp_gis_field(PAGE_CUR_WITHIN, dfield_get_data(dtuple_field), rec))
+ return true;
+
+ dtuple_field= dtuple_get_nth_field(dtuple, 1);
+ ut_ad(dfield_get_len(dtuple_field) == 4); /* child page number */
+ ut_ad(dtuple_field->type.mtype == DATA_SYS_CHILD);
+ ut_ad(!(dtuple_field->type.prtype & ~DATA_NOT_NULL));
+
+ return memcmp(dtuple_field->data, rec + DATA_MBR_LEN, 4) != 0;
+}
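+
+/* Layout assumed by the comparison above for an R-tree non-leaf
+(node pointer) record:
+
+	bytes 0 .. DATA_MBR_LEN - 1	minimum bounding rectangle (MBR)
+	bytes DATA_MBR_LEN .. + 3	child page number (4 bytes)
+
+Unlike B-tree node pointers, no PK fields follow the MBR, which is why
+the child page number can be compared directly with memcmp(). */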
+
+#ifndef UNIV_DEBUG
+static
+#endif
+/** Compare a GIS data tuple to a physical record.
+@param[in] dtuple data tuple
+@param[in] rec R-tree record
+@param[in] mode compare mode
+@retval negative if dtuple is less than rec */
+int cmp_dtuple_rec_with_gis(const dtuple_t *dtuple, const rec_t *rec,
+ page_cur_mode_t mode)
+{
+ const dfield_t *dtuple_field= dtuple_get_nth_field(dtuple, 0);
+ /* FIXME: TABLE_SHARE::init_from_binary_frm_image() is adding
+ field->key_part_length_bytes() to the key length */
+ ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN ||
+ dfield_get_len(dtuple_field) == DATA_MBR_LEN + 2);
+
+ return cmp_gis_field(mode, dfield_get_data(dtuple_field), rec);
+}
+
+/****************************************************************//**
+Searches the right position in rtree for a page cursor. */
+bool
+rtr_cur_search_with_match(
+/*======================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ dict_index_t* index, /*!< in: index descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_RTREE_INSERT,
+ PAGE_CUR_RTREE_LOCATE etc. */
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ rtr_info_t* rtr_info)/*!< in/out: search stack */
+{
+ bool found = false;
+ const page_t* page;
+ const rec_t* rec;
+ const rec_t* last_rec;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ mem_heap_t* heap = NULL;
+ int cmp = 1;
+ double least_inc = DBL_MAX;
+ const rec_t* best_rec;
+ const rec_t* last_match_rec = NULL;
+ bool match_init = false;
+ page_cur_mode_t orig_mode = mode;
+ const rec_t* first_rec = NULL;
+
+ rec_offs_init(offsets_);
+
+ ut_ad(RTREE_SEARCH_MODE(mode));
+
+ ut_ad(dict_index_is_spatial(index));
+
+ page = buf_block_get_frame(block);
+
+ const ulint level = btr_page_get_level(page);
+ const ulint n_core = level ? 0 : index->n_fields;
+
+ if (mode == PAGE_CUR_RTREE_LOCATE) {
+ ut_ad(level != 0);
+ mode = PAGE_CUR_WITHIN;
+ }
+
+ rec = page_dir_slot_get_rec(page_dir_get_nth_slot(page, 0));
+
+ last_rec = rec;
+ best_rec = rec;
+
+ if (page_rec_is_infimum(rec)) {
+ rec = page_rec_get_next_const(rec);
+ }
+
+	/* Check whether the tuple to be inserted is larger than the
+	first record; if so, try to avoid that record if possible */
+ if (mode == PAGE_CUR_RTREE_INSERT && !page_rec_is_supremum(rec)) {
+
+ ulint new_rec_size = rec_get_converted_size(index, tuple, 0);
+
+ offsets = rec_get_offsets(rec, index, offsets, n_core,
+ dtuple_get_n_fields_cmp(tuple),
+ &heap);
+
+ if (rec_offs_size(offsets) < new_rec_size) {
+ first_rec = rec;
+ }
+
+		/* If this is the left-most page of this index level
+		and the table is a compressed table, try to avoid the
+		first record as much as possible, as there would be
+		problems when updating the MIN_REC record on a
+		compressed page */
+ if (is_buf_block_get_page_zip(block)
+ && !page_has_prev(page)
+ && page_get_n_recs(page) >= 2) {
+
+ rec = page_rec_get_next_const(rec);
+ }
+ }
+
+ while (!page_rec_is_supremum(rec)) {
+ if (!n_core) {
+ switch (mode) {
+ case PAGE_CUR_CONTAIN:
+ case PAGE_CUR_INTERSECT:
+ case PAGE_CUR_MBR_EQUAL:
+				/* At non-leaf level, we need to check
+				both CONTAIN and INTERSECT for either
+				search mode */
+ cmp = cmp_dtuple_rec_with_gis(
+ tuple, rec, PAGE_CUR_CONTAIN);
+
+ if (cmp != 0) {
+ cmp = cmp_dtuple_rec_with_gis(
+ tuple, rec,
+ PAGE_CUR_INTERSECT);
+ }
+ break;
+ case PAGE_CUR_DISJOINT:
+ cmp = cmp_dtuple_rec_with_gis(
+ tuple, rec, mode);
+
+ if (cmp != 0) {
+ cmp = cmp_dtuple_rec_with_gis(
+ tuple, rec,
+ PAGE_CUR_INTERSECT);
+ }
+ break;
+ case PAGE_CUR_RTREE_INSERT:
+ double increase;
+ double area;
+
+ cmp = cmp_dtuple_rec_with_gis(
+ tuple, rec, PAGE_CUR_WITHIN);
+
+ if (cmp != 0) {
+ increase = rtr_rec_cal_increase(
+ tuple, rec, &area);
+					/* Once the increase reaches
+					DBL_MAX, it makes no sense to
+					record such a value; clamp it
+					to DBL_MAX / 2 */
+ if (increase >= DBL_MAX) {
+ increase = DBL_MAX / 2;
+ }
+
+ if (increase < least_inc) {
+ least_inc = increase;
+ best_rec = rec;
+ } else if (best_rec
+ && best_rec == first_rec) {
+ /* if first_rec is set,
+ we will try to avoid it */
+ least_inc = increase;
+ best_rec = rec;
+ }
+ }
+ break;
+ case PAGE_CUR_RTREE_GET_FATHER:
+ cmp = cmp_dtuple_rec_with_gis_internal(
+ tuple, rec);
+ break;
+ default:
+ /* WITHIN etc. */
+ cmp = cmp_dtuple_rec_with_gis(
+ tuple, rec, mode);
+ }
+ } else {
+ /* At leaf level, INSERT should translate to LE */
+ ut_ad(mode != PAGE_CUR_RTREE_INSERT);
+
+ cmp = cmp_dtuple_rec_with_gis(
+ tuple, rec, mode);
+ }
+
+ if (cmp == 0) {
+ found = true;
+
+ /* If located, the matching node/rec will be pushed
+ to rtr_info->path for non-leaf nodes, or
+ rtr_info->matches for leaf nodes */
+ if (rtr_info && mode != PAGE_CUR_RTREE_INSERT) {
+ if (!n_core) {
+ uint32_t page_no;
+ node_seq_t new_seq;
+ bool is_loc;
+
+ is_loc = (orig_mode
+ == PAGE_CUR_RTREE_LOCATE
+ || orig_mode
+ == PAGE_CUR_RTREE_GET_FATHER);
+
+ offsets = rec_get_offsets(
+ rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ page_no = btr_node_ptr_get_child_page_no(
+ rec, offsets);
+
+ ut_ad(level >= 1);
+
+ /* Get current SSN, before we insert
+ it into the path stack */
+ new_seq = rtr_get_current_ssn_id(index);
+
+ rtr_non_leaf_stack_push(
+ rtr_info->path,
+ page_no,
+ new_seq, level - 1, 0,
+ NULL, 0);
+
+ if (is_loc) {
+ rtr_non_leaf_insert_stack_push(
+ index,
+ rtr_info->parent_path,
+ level, page_no, block,
+ rec, 0);
+ }
+
+ if (!srv_read_only_mode
+ && (rtr_info->need_page_lock
+ || !is_loc)) {
+
+ /* Lock the page, preventing it
+ from being shrunk */
+ lock_place_prdt_page_lock(
+ page_id_t(block->page
+ .id()
+ .space(),
+ page_no),
+ index,
+ rtr_info->thr);
+ }
+ } else {
+ ut_ad(orig_mode
+ != PAGE_CUR_RTREE_LOCATE);
+
+ if (!match_init) {
+ rtr_init_match(
+ rtr_info->matches,
+ block, page);
+ match_init = true;
+ }
+
+ /* Collect matched records on page */
+ offsets = rec_get_offsets(
+ rec, index, offsets,
+ index->n_fields,
+ ULINT_UNDEFINED, &heap);
+ rtr_leaf_push_match_rec(
+ rec, rtr_info, offsets,
+ page_is_comp(page));
+ }
+
+ last_match_rec = rec;
+ } else {
+				/* This is the insertion case; the loop breaks
+				once it finds the first MBR that can accommodate
+				the record being inserted */
+ break;
+ }
+ }
+
+ last_rec = rec;
+
+ rec = page_rec_get_next_const(rec);
+ }
+
+	/* All records on the page have been searched */
+ if (page_rec_is_supremum(rec)) {
+ if (!n_core) {
+ if (!found) {
+				/* No match. If this is for insertion,
+				we select the record that results in the
+				least increase in area */
+ if (mode == PAGE_CUR_RTREE_INSERT) {
+ ut_ad(least_inc < DBL_MAX);
+ offsets = rec_get_offsets(
+ best_rec, index, offsets,
+ 0, ULINT_UNDEFINED, &heap);
+ uint32_t child_no =
+ btr_node_ptr_get_child_page_no(
+ best_rec, offsets);
+
+ rtr_non_leaf_insert_stack_push(
+ index, rtr_info->parent_path,
+ level, child_no, block,
+ best_rec, least_inc);
+
+ page_cur_position(best_rec, block,
+ cursor);
+ rtr_info->mbr_adj = true;
+ } else {
+ /* Position at the last rec of the
+ page, if it is not the leaf page */
+ page_cur_position(last_rec, block,
+ cursor);
+ }
+ } else {
+				/* There are matching records; position
+				at the last matching record */
+ if (rtr_info) {
+ rec = last_match_rec;
+ page_cur_position(
+ rec, block, cursor);
+ }
+ }
+ } else if (rtr_info) {
+			/* Leaf level: if there was no match, position
+			at the last (supremum) rec */
+ if (!last_match_rec) {
+ page_cur_position(rec, block, cursor);
+ goto func_exit;
+ }
+
+ /* There are matched records */
+ matched_rec_t* match_rec = rtr_info->matches;
+
+ rtr_rec_t test_rec;
+
+ test_rec = match_rec->matched_recs->back();
+#ifdef UNIV_DEBUG
+ rec_offs offsets_2[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets2 = offsets_2;
+ rec_offs_init(offsets_2);
+
+ ut_ad(found);
+
+		/* Verify that the record to be positioned on is the
+		same as the last record in the matched_rec vector */
+ offsets2 = rec_get_offsets(test_rec.r_rec, index,
+ offsets2, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ offsets = rec_get_offsets(last_match_rec, index,
+ offsets, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ ut_ad(cmp_rec_rec(test_rec.r_rec, last_match_rec,
+ offsets2, offsets, index) == 0);
+#endif /* UNIV_DEBUG */
+ /* Pop the last match record and position on it */
+ match_rec->matched_recs->pop_back();
+ page_cur_position(test_rec.r_rec, &match_rec->block,
+ cursor);
+ }
+ } else {
+
+ if (mode == PAGE_CUR_RTREE_INSERT) {
+ ut_ad(!last_match_rec);
+ rtr_non_leaf_insert_stack_push(
+ index, rtr_info->parent_path, level,
+ mach_read_from_4(rec + DATA_MBR_LEN),
+ block, rec, 0);
+
+ } else if (rtr_info && found && !n_core) {
+ rec = last_match_rec;
+ }
+
+ page_cur_position(rec, block, cursor);
+ }
+
+#ifdef UNIV_DEBUG
+ /* Verify that we are positioned at the same child page as pushed in
+ the path stack */
+ if (!n_core && (!page_rec_is_supremum(rec) || found)
+ && mode != PAGE_CUR_RTREE_INSERT) {
+ ulint page_no;
+
+ offsets = rec_get_offsets(rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ page_no = btr_node_ptr_get_child_page_no(rec, offsets);
+
+ if (rtr_info && found) {
+ rtr_node_path_t* path = rtr_info->path;
+ node_visit_t last_visit = path->back();
+
+ ut_ad(last_visit.page_no == page_no);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(found);
+}
diff --git a/storage/innobase/ha/ha0storage.cc b/storage/innobase/ha/ha0storage.cc
new file mode 100644
index 00000000..acde71b0
--- /dev/null
+++ b/storage/innobase/ha/ha0storage.cc
@@ -0,0 +1,178 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ha/ha0storage.cc
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 22, 2007 Vasil Dimov
+*******************************************************/
+
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+#include "ut0rnd.h"
+
+/*******************************************************************//**
+Retrieves data from the storage. If the data is present, a pointer to
+the stored copy is returned; otherwise NULL is returned. */
+static
+const void*
+ha_storage_get(
+/*===========*/
+ ha_storage_t* storage, /*!< in: hash storage */
+ const void* data, /*!< in: data to check for */
+ ulint data_len) /*!< in: data length */
+{
+ ha_storage_node_t* node;
+ ulint fold;
+
+ /* avoid repetitive calls to ut_fold_binary() in the HASH_SEARCH
+ macro */
+ fold = ut_fold_binary(static_cast<const byte*>(data), data_len);
+
+#define IS_FOUND \
+ node->data_len == data_len && memcmp(node->data, data, data_len) == 0
+
+ HASH_SEARCH(
+ next, /* node->"next" */
+ &storage->hash, /* the hash table */
+ fold, /* key */
+ ha_storage_node_t*, /* type of node->next */
+ node, /* auxiliary variable */
+ , /* assertion */
+ IS_FOUND); /* search criteria */
+
+ if (node == NULL) {
+
+ return(NULL);
+ }
+ /* else */
+
+ return(node->data);
+}
+
+/*******************************************************************//**
+Copies data into the storage and returns a pointer to the copy. If the
+same data chunk is already present, then a pointer to it is returned.
+Data chunks are considered equal if len1 == len2 and
+memcmp(data1, data2, len1) == 0. If "data" is not present (and thus
+data_len bytes need to be allocated) and the size of the storage would
+exceed "memlim", then "data" is not added and NULL is returned.
+To disable this behavior, "memlim" can be set to 0, which stands for
+"no limit". */
+const void*
+ha_storage_put_memlim(
+/*==================*/
+ ha_storage_t* storage, /*!< in/out: hash storage */
+ const void* data, /*!< in: data to store */
+ ulint data_len, /*!< in: data length */
+ ulint memlim) /*!< in: memory limit to obey */
+{
+ void* raw;
+ ha_storage_node_t* node;
+ const void* data_copy;
+ ulint fold;
+
+ /* check if data chunk is already present */
+ data_copy = ha_storage_get(storage, data, data_len);
+ if (data_copy != NULL) {
+
+ return(data_copy);
+ }
+
+ /* not present */
+
+ /* check if we are allowed to allocate data_len bytes */
+ if (memlim > 0
+ && ha_storage_get_size(storage) + data_len > memlim) {
+
+ return(NULL);
+ }
+
+	/* we put the auxiliary node struct and the data itself in one
+	contiguous block */
+ raw = mem_heap_alloc(storage->heap,
+ sizeof(ha_storage_node_t) + data_len);
+
+ node = (ha_storage_node_t*) raw;
+ data_copy = (byte*) raw + sizeof(*node);
+
+ memcpy((byte*) raw + sizeof(*node), data, data_len);
+
+ node->data_len = data_len;
+ node->data = data_copy;
+
+ /* avoid repetitive calls to ut_fold_binary() in the HASH_INSERT
+ macro */
+ fold = ut_fold_binary(static_cast<const byte*>(data), data_len);
+
+ HASH_INSERT(
+ ha_storage_node_t, /* type used in the hash chain */
+ next, /* node->"next" */
+ &storage->hash, /* the hash table */
+ fold, /* key */
+ node); /* add this data to the hash */
+
+	/* the caller must not modify the returned data, because that
+	would corrupt the hash table */
+ return(data_copy);
+}
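+
+/* A minimal usage sketch (see also test_ha_storage() below); the
+ha_storage_put() convenience macro calls this function with memlim == 0,
+i.e. with no memory limit:
+
+	ha_storage_t*	storage = ha_storage_create(0, 0);
+	const void*	p1 = ha_storage_put(storage, "abc", 3);
+	const void*	p2 = ha_storage_put(storage, "abc", 3);
+	ut_a(p1 == p2);	// duplicates resolve to the same stored copy
+	ha_storage_free(storage);
+*/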
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+void
+test_ha_storage()
+{
+ ha_storage_t* storage;
+ char buf[1024];
+ int i;
+ const void* stored[256];
+ const void* p;
+
+ storage = ha_storage_create(0, 0);
+
+ for (i = 0; i < 256; i++) {
+
+ memset(buf, i, sizeof(buf));
+ stored[i] = ha_storage_put(storage, buf, sizeof(buf));
+ }
+
+ //ha_storage_empty(&storage);
+
+ for (i = 255; i >= 0; i--) {
+
+ memset(buf, i, sizeof(buf));
+ p = ha_storage_put(storage, buf, sizeof(buf));
+
+ if (p != stored[i]) {
+ ib::warn() << "ha_storage_put() returned " << p
+ << " instead of " << stored[i] << ", i=" << i;
+ return;
+ }
+ }
+
+ ib::info() << "all ok";
+
+ ha_storage_free(storage);
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
new file mode 100644
index 00000000..d61624a5
--- /dev/null
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -0,0 +1,21691 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/** @file ha_innodb.cc */
+
+#include "univ.i"
+
+/* Include necessary SQL headers */
+#include "ha_prototypes.h"
+#include <debug_sync.h>
+#include <gstream.h>
+#include <log.h>
+#include <mysys_err.h>
+#include <innodb_priv.h>
+#include <strfunc.h>
+#include <sql_acl.h>
+#include <sql_class.h>
+#include <sql_show.h>
+#include <sql_table.h>
+#include <table_cache.h>
+#include <my_check_opt.h>
+#include <my_bitmap.h>
+#include <mysql/service_thd_alloc.h>
+#include <mysql/service_thd_wait.h>
+#include "field.h"
+#include "scope.h"
+#include "srv0srv.h"
+
+// MYSQL_PLUGIN_IMPORT extern my_bool lower_case_file_system;
+// MYSQL_PLUGIN_IMPORT extern char mysql_unpacked_real_data_home[];
+
+#include <my_service_manager.h>
+#include <key.h>
+#include <sql_manager.h>
+
+/* Include necessary InnoDB headers */
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0bulk.h"
+#include "btr0sea.h"
+#include "buf0dblwr.h"
+#include "buf0dump.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "dict0boot.h"
+#include "dict0load.h"
+#include "btr0defragment.h"
+#include "dict0crea.h"
+#include "dict0dict.h"
+#include "dict0priv.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "fil0fil.h"
+#include "fsp0fsp.h"
+#include "fts0fts.h"
+#include "fts0plugin.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "log0crypt.h"
+#include "mtr0mtr.h"
+#include "os0file.h"
+#include "page0zip.h"
+#include "pars0pars.h"
+#include "rem0types.h"
+#include "row0import.h"
+#include "row0ins.h"
+#include "row0merge.h"
+#include "row0mysql.h"
+#include "row0quiesce.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "fil0crypt.h"
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "rem0rec.h"
+#ifdef UNIV_DEBUG
+#include "trx0purge.h"
+#endif /* UNIV_DEBUG */
+#include "trx0roll.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "fil0pagecompress.h"
+#include "ut0mem.h"
+#include "ut0mutex.h"
+#include "row0ext.h"
+
+#include <limits>
+
+#define thd_get_trx_isolation(X) ((enum_tx_isolation)thd_tx_isolation(X))
+
+extern "C" void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);
+unsigned long long thd_get_query_id(const MYSQL_THD thd);
+void thd_clear_error(MYSQL_THD thd);
+
+TABLE *find_fk_open_table(THD *thd, const char *db, size_t db_len,
+ const char *table, size_t table_len);
+MYSQL_THD create_background_thd();
+void destroy_background_thd(MYSQL_THD thd);
+void reset_thd(MYSQL_THD thd);
+TABLE *get_purge_table(THD *thd);
+TABLE *open_purge_table(THD *thd, const char *db, size_t dblen,
+ const char *tb, size_t tblen);
+void close_thread_tables(THD* thd);
+
+#ifdef MYSQL_DYNAMIC_PLUGIN
+#define tc_size 400
+#define tdc_size 400
+#endif
+
+#include <mysql/plugin.h>
+#include <mysql/service_wsrep.h>
+
+#include "ha_innodb.h"
+#include "i_s.h"
+#include "sync0sync.h"
+
+#include <string>
+#include <sstream>
+
+#ifdef WITH_WSREP
+#include "dict0priv.h"
+#include <mysql/service_md5.h>
+#include "wsrep_sst.h"
+#endif /* WITH_WSREP */
+
+#define INSIDE_HA_INNOBASE_CC
+
+#define EQ_CURRENT_THD(thd) ((thd) == current_thd)
+
+struct handlerton* innodb_hton_ptr;
+
+static const long AUTOINC_OLD_STYLE_LOCKING = 0;
+static const long AUTOINC_NEW_STYLE_LOCKING = 1;
+static const long AUTOINC_NO_LOCKING = 2;
+
+static ulong innobase_open_files;
+static long innobase_autoinc_lock_mode;
+
+static ulonglong innobase_buffer_pool_size;
+
+/** Percentage of the buffer pool to reserve for 'old' blocks.
+Connected to buf_LRU_old_ratio. */
+static uint innobase_old_blocks_pct;
+
+static char* innobase_data_file_path;
+static char* innobase_temp_data_file_path;
+
+/* The default values for the following char* start-up parameters
+are determined in innodb_init_params(). */
+
+static char* innobase_data_home_dir;
+static char* innobase_enable_monitor_counter;
+static char* innobase_disable_monitor_counter;
+static char* innobase_reset_monitor_counter;
+static char* innobase_reset_all_monitor_counter;
+
+static ulong innodb_flush_method;
+
+/* This variable can be set in the server configuration file, specifying
+the stopword table to be used */
+static char* innobase_server_stopword_table;
+
+static my_bool innobase_rollback_on_timeout;
+static my_bool innobase_create_status_file;
+my_bool innobase_stats_on_metadata;
+static my_bool innodb_optimize_fulltext_only;
+
+static char* innodb_version_str = (char*) INNODB_VERSION_STR;
+
+extern uint srv_fil_crypt_rotate_key_age;
+extern uint srv_n_fil_crypt_iops;
+
+#ifdef UNIV_DEBUG
+my_bool innodb_evict_tables_on_commit_debug;
+#endif
+
+/** File format constraint for ALTER TABLE */
+ulong innodb_instant_alter_column_allowed;
+
+/** Note that we cannot use rec_format_enum because we do not allow the
+COMPRESSED row format for the innodb_default_row_format option. */
+enum default_row_format_enum {
+ DEFAULT_ROW_FORMAT_REDUNDANT = 0,
+ DEFAULT_ROW_FORMAT_COMPACT = 1,
+ DEFAULT_ROW_FORMAT_DYNAMIC = 2,
+};
+
+/** A dummy variable */
+static uint innodb_max_purge_lag_wait;
+
+/** Wait for trx_sys_t::rseg_history_len to be below a limit. */
+static void innodb_max_purge_lag_wait_update(THD *thd, st_mysql_sys_var *,
+ void *, const void *limit)
+{
+ const uint l= *static_cast<const uint*>(limit);
+ if (trx_sys.rseg_history_len <= l)
+ return;
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ while (trx_sys.rseg_history_len > l)
+ {
+ if (thd_kill_level(thd))
+ break;
+ srv_wake_purge_thread_if_not_active();
+ os_thread_sleep(100000);
+ }
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
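+
+/* Note: the loop above deliberately releases LOCK_global_system_variables
+while waiting, so that a long wait for the purge lag to drop does not block
+other SET statements; it polls every 100 ms (os_thread_sleep() takes
+microseconds) and stops early if the client connection is killed. */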
+
+static
+void set_my_errno(int err)
+{
+ errno = err;
+}
+
+/** Checks whether the file name belongs to a partition of a table.
+@param[in] file_name file name
+@return pointer to the end of the table name part of the file name, or NULL */
+static
+char*
+is_partition(
+/*=========*/
+ char* file_name)
+{
+	/* We look for the pattern #P# to see if the table is a
+	partitioned MariaDB table. */
+ return strstr(file_name, table_name_t::part_suffix);
+}
+
+
+
+/** Return the InnoDB ROW_FORMAT enum value
+@param[in] row_format row_format from "innodb_default_row_format"
+@return InnoDB ROW_FORMAT value from rec_format_t enum. */
+static
+rec_format_t
+get_row_format(
+ ulong row_format)
+{
+ switch(row_format) {
+ case DEFAULT_ROW_FORMAT_REDUNDANT:
+ return(REC_FORMAT_REDUNDANT);
+ case DEFAULT_ROW_FORMAT_COMPACT:
+ return(REC_FORMAT_COMPACT);
+ case DEFAULT_ROW_FORMAT_DYNAMIC:
+ return(REC_FORMAT_DYNAMIC);
+ default:
+ ut_ad(0);
+ return(REC_FORMAT_DYNAMIC);
+ }
+}
+
+static ulong innodb_default_row_format = DEFAULT_ROW_FORMAT_DYNAMIC;
+
+/** Possible values for the system variable "innodb_stats_method". The
+values are defined the same as those of the corresponding MyISAM system
+variable "myisam_stats_method" (see "myisam_stats_method_names"), for
+better usability */
+static const char* innodb_stats_method_names[] = {
+ "nulls_equal",
+ "nulls_unequal",
+ "nulls_ignored",
+ NullS
+};
+
+/** Used to define an enumerate type of the system variable innodb_stats_method.
+This is the same as "myisam_stats_method_typelib" */
+static TYPELIB innodb_stats_method_typelib = {
+ array_elements(innodb_stats_method_names) - 1,
+ "innodb_stats_method_typelib",
+ innodb_stats_method_names,
+ NULL
+};
+
+/** Possible values of the parameter innodb_checksum_algorithm */
+const char* innodb_checksum_algorithm_names[] = {
+ "crc32",
+ "strict_crc32",
+ "innodb",
+ "strict_innodb",
+ "none",
+ "strict_none",
+ "full_crc32",
+ "strict_full_crc32",
+ NullS
+};
+
+/** Used to define an enumerate type of the system variable
+innodb_checksum_algorithm. */
+TYPELIB innodb_checksum_algorithm_typelib = {
+ array_elements(innodb_checksum_algorithm_names) - 1,
+ "innodb_checksum_algorithm_typelib",
+ innodb_checksum_algorithm_names,
+ NULL
+};
+
+/** Possible values for system variable "innodb_default_row_format". */
+static const char* innodb_default_row_format_names[] = {
+ "redundant",
+ "compact",
+ "dynamic",
+ NullS
+};
+
+/** Used to define an enumerate type of the system variable
+innodb_default_row_format. */
+static TYPELIB innodb_default_row_format_typelib = {
+ array_elements(innodb_default_row_format_names) - 1,
+ "innodb_default_row_format_typelib",
+ innodb_default_row_format_names,
+ NULL
+};
+
+/** Possible values of the parameter innodb_lock_schedule_algorithm */
+static const char* innodb_lock_schedule_algorithm_names[] = {
+ "fcfs",
+ "vats",
+ NullS
+};
+
+/** Used to define an enumerate type of the system variable
+innodb_lock_schedule_algorithm. */
+static TYPELIB innodb_lock_schedule_algorithm_typelib = {
+ array_elements(innodb_lock_schedule_algorithm_names) - 1,
+ "innodb_lock_schedule_algorithm_typelib",
+ innodb_lock_schedule_algorithm_names,
+ NULL
+};
+
+/** Names of allowed values of innodb_flush_method */
+const char* innodb_flush_method_names[] = {
+ "fsync",
+ "O_DSYNC",
+ "littlesync",
+ "nosync",
+ "O_DIRECT",
+ "O_DIRECT_NO_FSYNC",
+#ifdef _WIN32
+ "unbuffered",
+ "async_unbuffered" /* alias for "unbuffered" */,
+ "normal" /* alias for "fsync" */,
+#endif
+ NullS
+};
+
+/** Enumeration of innodb_flush_method */
+TYPELIB innodb_flush_method_typelib = {
+ array_elements(innodb_flush_method_names) - 1,
+ "innodb_flush_method_typelib",
+ innodb_flush_method_names,
+ NULL
+};
+
+/** Allowed values of innodb_change_buffering */
+static const char* innodb_change_buffering_names[] = {
+ "none", /* IBUF_USE_NONE */
+ "inserts", /* IBUF_USE_INSERT */
+ "deletes", /* IBUF_USE_DELETE_MARK */
+ "changes", /* IBUF_USE_INSERT_DELETE_MARK */
+ "purges", /* IBUF_USE_DELETE */
+ "all", /* IBUF_USE_ALL */
+ NullS
+};
+
+/** Enumeration of innodb_change_buffering */
+static TYPELIB innodb_change_buffering_typelib = {
+ array_elements(innodb_change_buffering_names) - 1,
+ "innodb_change_buffering_typelib",
+ innodb_change_buffering_names,
+ NULL
+};
+
+/** Allowed values of innodb_instant_alter_column_allowed */
+const char* innodb_instant_alter_column_allowed_names[] = {
+ "never", /* compatible with MariaDB 5.5 to 10.2 */
+ "add_last",/* allow instant ADD COLUMN ... LAST */
+ "add_drop_reorder", /* allow instant ADD anywhere & DROP & reorder */
+ NullS
+};
+
+/** Enumeration of innodb_instant_alter_column_allowed */
+static TYPELIB innodb_instant_alter_column_allowed_typelib = {
+ array_elements(innodb_instant_alter_column_allowed_names) - 1,
+ "innodb_instant_alter_column_allowed_typelib",
+ innodb_instant_alter_column_allowed_names,
+ NULL
+};
+
+/** Retrieve the FTS Relevance Ranking result for doc with doc_id
+of m_prebuilt->fts_doc_id
+@param[in,out] fts_hdl FTS handler
+@return the relevance ranking value */
+static
+float
+innobase_fts_retrieve_ranking(
+ FT_INFO* fts_hdl);
+/** Free the memory for the FTS handler
+@param[in,out] fts_hdl FTS handler */
+static
+void
+innobase_fts_close_ranking(
+ FT_INFO* fts_hdl);
+/** Find and Retrieve the FTS Relevance Ranking result for doc with doc_id
+of m_prebuilt->fts_doc_id
+@param[in,out] fts_hdl FTS handler
+@return the relevance ranking value */
+static
+float
+innobase_fts_find_ranking(
+ FT_INFO* fts_hdl,
+ uchar*,
+ uint);
+
+/* Call back function array defined by MySQL and used to
+retrieve FTS results. */
+const struct _ft_vft ft_vft_result = {NULL,
+ innobase_fts_find_ranking,
+ innobase_fts_close_ranking,
+ innobase_fts_retrieve_ranking,
+ NULL};
+
+/** @return version of the extended FTS API */
+static
+uint
+innobase_fts_get_version()
+{
+	/* Currently this doesn't make much sense, as returning
+	HA_CAN_FULLTEXT_EXT automatically means this version is supported.
+	This is supposed to ease future extensions. */
+ return(2);
+}
+
+/** @return Which part of the extended FTS API is supported */
+static
+ulonglong
+innobase_fts_flags()
+{
+ return(FTS_ORDERED_RESULT | FTS_DOCID_IN_RESULT);
+}
+
+/** Find and Retrieve the FTS doc_id for the current result row
+@param[in,out] fts_hdl FTS handler
+@return the document ID */
+static
+ulonglong
+innobase_fts_retrieve_docid(
+ FT_INFO_EXT* fts_hdl);
+
+/** Find and retrieve the size of the current result
+@param[in,out] fts_hdl FTS handler
+@return number of matching rows */
+static
+ulonglong
+innobase_fts_count_matches(
+ FT_INFO_EXT* fts_hdl) /*!< in: FTS handler */
+{
+ NEW_FT_INFO* handle = reinterpret_cast<NEW_FT_INFO*>(fts_hdl);
+
+ if (handle->ft_result->rankings_by_id != NULL) {
+ return(rbt_size(handle->ft_result->rankings_by_id));
+ } else {
+ return(0);
+ }
+}
+
+const struct _ft_vft_ext ft_vft_ext_result = {innobase_fts_get_version,
+ innobase_fts_flags,
+ innobase_fts_retrieve_docid,
+ innobase_fts_count_matches};
+
+#ifdef HAVE_PSI_INTERFACE
+# define PSI_KEY(n) {&n##_key, #n, 0}
+/* All RWLOCK used in Innodb are SX-locks */
+# define PSI_RWLOCK_KEY(n) {&n##_key, #n, PSI_RWLOCK_FLAG_SX}
+
+/* Keys to register pthread mutexes in the current file with
+performance schema */
+static mysql_pfs_key_t pending_checkpoint_mutex_key;
+
+# ifdef UNIV_PFS_MUTEX
+/* all_innodb_mutexes array contains mutexes that are
+performance schema instrumented if "UNIV_PFS_MUTEX"
+is defined */
+static PSI_mutex_info all_innodb_mutexes[] = {
+ PSI_KEY(pending_checkpoint_mutex),
+ PSI_KEY(buf_pool_mutex),
+ PSI_KEY(dict_foreign_err_mutex),
+ PSI_KEY(dict_sys_mutex),
+ PSI_KEY(recalc_pool_mutex),
+ PSI_KEY(fil_system_mutex),
+ PSI_KEY(flush_list_mutex),
+ PSI_KEY(fts_delete_mutex),
+ PSI_KEY(fts_doc_id_mutex),
+ PSI_KEY(log_flush_order_mutex),
+ PSI_KEY(ibuf_bitmap_mutex),
+ PSI_KEY(ibuf_mutex),
+ PSI_KEY(ibuf_pessimistic_insert_mutex),
+ PSI_KEY(index_online_log),
+ PSI_KEY(log_sys_mutex),
+ PSI_KEY(page_zip_stat_per_index_mutex),
+ PSI_KEY(purge_sys_pq_mutex),
+ PSI_KEY(recv_sys_mutex),
+ PSI_KEY(redo_rseg_mutex),
+ PSI_KEY(noredo_rseg_mutex),
+# ifdef UNIV_DEBUG
+ PSI_KEY(rw_lock_debug_mutex),
+# endif /* UNIV_DEBUG */
+ PSI_KEY(rw_lock_list_mutex),
+ PSI_KEY(srv_innodb_monitor_mutex),
+ PSI_KEY(srv_misc_tmpfile_mutex),
+ PSI_KEY(srv_monitor_file_mutex),
+ PSI_KEY(buf_dblwr_mutex),
+ PSI_KEY(trx_pool_mutex),
+ PSI_KEY(trx_pool_manager_mutex),
+ PSI_KEY(lock_mutex),
+ PSI_KEY(lock_wait_mutex),
+ PSI_KEY(trx_mutex),
+ PSI_KEY(srv_threads_mutex),
+ PSI_KEY(rtr_active_mutex),
+ PSI_KEY(rtr_match_mutex),
+ PSI_KEY(rtr_path_mutex),
+ PSI_KEY(trx_sys_mutex),
+};
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+/* all_innodb_rwlocks array contains rwlocks that are
+performance schema instrumented if "UNIV_PFS_RWLOCK"
+is defined */
+static PSI_rwlock_info all_innodb_rwlocks[] = {
+ PSI_RWLOCK_KEY(btr_search_latch),
+ PSI_RWLOCK_KEY(dict_operation_lock),
+ PSI_RWLOCK_KEY(fil_space_latch),
+ PSI_RWLOCK_KEY(fts_cache_rw_lock),
+ PSI_RWLOCK_KEY(fts_cache_init_rw_lock),
+ PSI_RWLOCK_KEY(trx_i_s_cache_lock),
+ PSI_RWLOCK_KEY(trx_purge_latch),
+ PSI_RWLOCK_KEY(index_tree_rw_lock),
+};
+# endif /* UNIV_PFS_RWLOCK */
+
+# ifdef UNIV_PFS_THREAD
+/* all_innodb_threads array contains threads that are
+performance schema instrumented if "UNIV_PFS_THREAD"
+is defined */
+static PSI_thread_info all_innodb_threads[] = {
+ PSI_KEY(page_cleaner_thread),
+ PSI_KEY(trx_rollback_clean_thread),
+ PSI_KEY(thread_pool_thread)
+};
+# endif /* UNIV_PFS_THREAD */
+
+# ifdef UNIV_PFS_IO
+/* all_innodb_files array contains the type of files that are
+performance schema instrumented if "UNIV_PFS_IO" is defined */
+static PSI_file_info all_innodb_files[] = {
+ PSI_KEY(innodb_data_file),
+ PSI_KEY(innodb_log_file),
+ PSI_KEY(innodb_temp_file)
+};
+# endif /* UNIV_PFS_IO */
+#endif /* HAVE_PSI_INTERFACE */
+
+static void innodb_remember_check_sysvar_funcs();
+mysql_var_check_func check_sysvar_enum;
+mysql_var_check_func check_sysvar_int;
+
+// Should page compression be used by default for new tables?
+static MYSQL_THDVAR_BOOL(compression_default, PLUGIN_VAR_OPCMDARG,
+ "Is compression the default for new tables",
+ NULL, NULL, FALSE);
+
+/** Update callback for SET [SESSION] innodb_default_encryption_key_id */
+static void
+innodb_default_encryption_key_id_update(THD* thd, st_mysql_sys_var* var,
+ void* var_ptr, const void *save)
+{
+ uint key_id = *static_cast<const uint*>(save);
+ if (key_id != FIL_DEFAULT_ENCRYPTION_KEY
+ && !encryption_key_id_exists(key_id)) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "innodb_default_encryption_key=%u"
+ " is not available", key_id);
+ }
+ *static_cast<uint*>(var_ptr) = key_id;
+}
+
+static MYSQL_THDVAR_UINT(default_encryption_key_id, PLUGIN_VAR_RQCMDARG,
+ "Default encryption key id used for table encryption.",
+ NULL, innodb_default_encryption_key_id_update,
+ FIL_DEFAULT_ENCRYPTION_KEY, 1, UINT_MAX32, 0);
+
+/**
+ Structure for CREATE TABLE options (table options).
+ It needs to be called ha_table_option_struct.
+
+  The option values can be specified at the end of the CREATE TABLE statement:
+ CREATE TABLE ( ... ) *here*
+*/
+
+ha_create_table_option innodb_table_option_list[]=
+{
+  /* With this option the user can enable the page compression feature
+  for the table */
+  HA_TOPTION_SYSVAR("PAGE_COMPRESSED", page_compressed, compression_default),
+  /* With this option the user can set the compression level used for
+  page compression of this table */
+  HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, 0, 1, 9, 1),
+  /* With this option the user can enable encryption for the table */
+  HA_TOPTION_ENUM("ENCRYPTED", encryption, "DEFAULT,YES,NO", 0),
+  /* With this option the user defines the key identifier used for the
+  encryption */
+  HA_TOPTION_SYSVAR("ENCRYPTION_KEY_ID", encryption_key_id, default_encryption_key_id),
+
+ HA_TOPTION_END
+};
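+
+/* For illustration, these options appear at the end of a CREATE TABLE
+statement, e.g. (the key id 2 is hypothetical and must exist in the
+encryption key management plugin):
+
+	CREATE TABLE t (a INT) ENGINE=InnoDB
+	PAGE_COMPRESSED=1 PAGE_COMPRESSION_LEVEL=9
+	ENCRYPTED=YES ENCRYPTION_KEY_ID=2;
+*/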
+
+/*************************************************************//**
+Check whether valid argument given to innodb_ft_*_stopword_table.
+This function is registered as a callback with MySQL.
+@return 0 for valid stopword table */
+static
+int
+innodb_stopword_table_validate(
+/*===========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value); /*!< in: incoming string */
+
+static bool is_mysql_datadir_path(const char *path);
+
+/** Validate passed-in "value" is a valid directory name.
+This function is registered as a callback with MySQL.
+@param[in,out] thd thread handle
+@param[in] var pointer to system variable
+@param[out] save immediate result for update
+@param[in] value incoming string
+@return 0 for valid name */
+static
+int
+innodb_tmpdir_validate(
+ THD* thd,
+ struct st_mysql_sys_var*,
+ void* save,
+ struct st_mysql_value* value)
+{
+
+ char* alter_tmp_dir;
+ char* innodb_tmp_dir;
+ char buff[OS_FILE_MAX_PATH];
+ int len = sizeof(buff);
+ char tmp_abs_path[FN_REFLEN + 2];
+
+ ut_ad(save != NULL);
+ ut_ad(value != NULL);
+
+ if (check_global_access(thd, FILE_ACL)) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "InnoDB: FILE Permissions required");
+ *static_cast<const char**>(save) = NULL;
+ return(1);
+ }
+
+ alter_tmp_dir = (char*) value->val_str(value, buff, &len);
+
+ if (!alter_tmp_dir) {
+ *static_cast<const char**>(save) = alter_tmp_dir;
+ return(0);
+ }
+
+ if (strlen(alter_tmp_dir) > FN_REFLEN) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Path length should not exceed %d bytes", FN_REFLEN);
+ *static_cast<const char**>(save) = NULL;
+ return(1);
+ }
+
+ os_normalize_path(alter_tmp_dir);
+ my_realpath(tmp_abs_path, alter_tmp_dir, 0);
+ size_t tmp_abs_len = strlen(tmp_abs_path);
+
+ if (my_access(tmp_abs_path, F_OK)) {
+
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "InnoDB: Path doesn't exist.");
+ *static_cast<const char**>(save) = NULL;
+ return(1);
+ } else if (my_access(tmp_abs_path, R_OK | W_OK)) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "InnoDB: Server doesn't have permission in "
+ "the given location.");
+ *static_cast<const char**>(save) = NULL;
+ return(1);
+ }
+
+ MY_STAT stat_info_dir;
+
+ if (my_stat(tmp_abs_path, &stat_info_dir, MYF(0))) {
+ if ((stat_info_dir.st_mode & S_IFDIR) != S_IFDIR) {
+
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+				"Given path is not a directory.");
+ *static_cast<const char**>(save) = NULL;
+ return(1);
+ }
+ }
+
+ if (!is_mysql_datadir_path(tmp_abs_path)) {
+
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+			"InnoDB: Path location should not be the same "
+			"as the mysql data directory location.");
+ *static_cast<const char**>(save) = NULL;
+ return(1);
+ }
+
+ innodb_tmp_dir = static_cast<char*>(
+ thd_memdup(thd, tmp_abs_path, tmp_abs_len + 1));
+ *static_cast<const char**>(save) = innodb_tmp_dir;
+ return(0);
+}
+
+/******************************************************************//**
+Maps a MySQL trx isolation level code to the InnoDB isolation level code
+@return InnoDB isolation level */
+static inline
+uint
+innobase_map_isolation_level(
+/*=========================*/
+ enum_tx_isolation iso); /*!< in: MySQL isolation level code */
+
+/** Gets field offset for a field in a table.
+@param[in] table MySQL table object
+@param[in] field MySQL field object (from table->field array)
+@return offset */
+static inline
+uint
+get_field_offset(
+ const TABLE* table,
+ const Field* field)
+{
+ return field->offset(table->record[0]);
+}
+
+
+/*************************************************************//**
+Check for a valid value of innobase_compression_algorithm.
+@return 0 for valid innodb_compression_algorithm. */
+static
+int
+innodb_compression_algorithm_validate(
+/*==================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value); /*!< in: incoming string */
+
+static ibool innodb_have_lzo=IF_LZO(1, 0);
+static ibool innodb_have_lz4=IF_LZ4(1, 0);
+static ibool innodb_have_lzma=IF_LZMA(1, 0);
+static ibool innodb_have_bzip2=IF_BZIP2(1, 0);
+static ibool innodb_have_snappy=IF_SNAPPY(1, 0);
+static ibool innodb_have_punch_hole=IF_PUNCH_HOLE(1, 0);
+
+static
+int
+innodb_encrypt_tables_validate(
+/*==================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value); /*!< in: incoming string */
+
+static const char innobase_hton_name[]= "InnoDB";
+
+static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG,
+ "Enable InnoDB locking in LOCK TABLES",
+ /* check_func */ NULL, /* update_func */ NULL,
+ /* default */ TRUE);
+
+static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG,
+ "Use strict mode when evaluating create options.",
+ NULL, NULL, TRUE);
+
+static MYSQL_THDVAR_BOOL(ft_enable_stopword, PLUGIN_VAR_OPCMDARG,
+ "Create FTS index with stopword.",
+ NULL, NULL,
+ /* default */ TRUE);
+
+static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
+ "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
+ NULL, NULL, 50, 0, 1024 * 1024 * 1024, 0);
+
+static MYSQL_THDVAR_STR(ft_user_stopword_table,
+ PLUGIN_VAR_OPCMDARG|PLUGIN_VAR_MEMALLOC,
+ "User supplied stopword table name, effective in the session level.",
+ innodb_stopword_table_validate, NULL, NULL);
+
+static MYSQL_THDVAR_STR(tmpdir,
+ PLUGIN_VAR_OPCMDARG|PLUGIN_VAR_MEMALLOC,
+ "Directory for temporary non-tablespace files.",
+ innodb_tmpdir_validate, NULL, NULL);
+
+static SHOW_VAR innodb_status_variables[]= {
+#ifdef BTR_CUR_HASH_ADAPT
+ {"adaptive_hash_hash_searches", &btr_cur_n_sea, SHOW_SIZE_T},
+ {"adaptive_hash_non_hash_searches", &btr_cur_n_non_sea, SHOW_SIZE_T},
+#endif
+ {"background_log_sync", &srv_log_writes_and_flush, SHOW_SIZE_T},
+ {"buffer_pool_dump_status",
+ (char*) &export_vars.innodb_buffer_pool_dump_status, SHOW_CHAR},
+ {"buffer_pool_load_status",
+ (char*) &export_vars.innodb_buffer_pool_load_status, SHOW_CHAR},
+ {"buffer_pool_resize_status",
+ (char*) &export_vars.innodb_buffer_pool_resize_status, SHOW_CHAR},
+ {"buffer_pool_load_incomplete",
+ &export_vars.innodb_buffer_pool_load_incomplete, SHOW_BOOL},
+ {"buffer_pool_pages_data",
+ &export_vars.innodb_buffer_pool_pages_data, SHOW_SIZE_T},
+ {"buffer_pool_bytes_data",
+ &export_vars.innodb_buffer_pool_bytes_data, SHOW_SIZE_T},
+ {"buffer_pool_pages_dirty",
+ &export_vars.innodb_buffer_pool_pages_dirty, SHOW_SIZE_T},
+ {"buffer_pool_bytes_dirty",
+ &export_vars.innodb_buffer_pool_bytes_dirty, SHOW_SIZE_T},
+ {"buffer_pool_pages_flushed", &buf_flush_page_count, SHOW_SIZE_T},
+ {"buffer_pool_pages_free",
+ &export_vars.innodb_buffer_pool_pages_free, SHOW_SIZE_T},
+#ifdef UNIV_DEBUG
+ {"buffer_pool_pages_latched",
+ &export_vars.innodb_buffer_pool_pages_latched, SHOW_SIZE_T},
+#endif /* UNIV_DEBUG */
+ {"buffer_pool_pages_made_not_young",
+ &export_vars.innodb_buffer_pool_pages_made_not_young, SHOW_SIZE_T},
+ {"buffer_pool_pages_made_young",
+ &export_vars.innodb_buffer_pool_pages_made_young, SHOW_SIZE_T},
+ {"buffer_pool_pages_misc",
+ &export_vars.innodb_buffer_pool_pages_misc, SHOW_SIZE_T},
+ {"buffer_pool_pages_old",
+ &export_vars.innodb_buffer_pool_pages_old, SHOW_SIZE_T},
+ {"buffer_pool_pages_total",
+ &export_vars.innodb_buffer_pool_pages_total, SHOW_SIZE_T},
+ {"buffer_pool_pages_LRU_flushed", &buf_lru_flush_page_count, SHOW_SIZE_T},
+ {"buffer_pool_read_ahead_rnd",
+ &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_SIZE_T},
+ {"buffer_pool_read_ahead",
+ &export_vars.innodb_buffer_pool_read_ahead, SHOW_SIZE_T},
+ {"buffer_pool_read_ahead_evicted",
+ &export_vars.innodb_buffer_pool_read_ahead_evicted, SHOW_SIZE_T},
+ {"buffer_pool_read_requests",
+ &export_vars.innodb_buffer_pool_read_requests, SHOW_SIZE_T},
+ {"buffer_pool_reads",
+ &export_vars.innodb_buffer_pool_reads, SHOW_SIZE_T},
+ {"buffer_pool_wait_free", &buf_pool.stat.LRU_waits, SHOW_SIZE_T},
+ {"buffer_pool_write_requests",
+ &export_vars.innodb_buffer_pool_write_requests, SHOW_SIZE_T},
+ {"checkpoint_age", &export_vars.innodb_checkpoint_age, SHOW_SIZE_T},
+ {"checkpoint_max_age", &export_vars.innodb_checkpoint_max_age, SHOW_SIZE_T},
+ {"data_fsyncs", &export_vars.innodb_data_fsyncs, SHOW_SIZE_T},
+ {"data_pending_fsyncs", &export_vars.innodb_data_pending_fsyncs,SHOW_SIZE_T},
+ {"data_pending_reads", &export_vars.innodb_data_pending_reads, SHOW_SIZE_T},
+ {"data_pending_writes", &export_vars.innodb_data_pending_writes,SHOW_SIZE_T},
+ {"data_read", &export_vars.innodb_data_read, SHOW_SIZE_T},
+ {"data_reads", &export_vars.innodb_data_reads, SHOW_SIZE_T},
+ {"data_writes", &export_vars.innodb_data_writes, SHOW_SIZE_T},
+ {"data_written", &export_vars.innodb_data_written, SHOW_SIZE_T},
+ {"dblwr_pages_written", &export_vars.innodb_dblwr_pages_written,SHOW_SIZE_T},
+ {"dblwr_writes", &export_vars.innodb_dblwr_writes, SHOW_SIZE_T},
+ {"deadlocks", &srv_stats.lock_deadlock_count, SHOW_SIZE_T},
+ {"history_list_length", &export_vars.innodb_history_list_length,SHOW_SIZE_T},
+ {"ibuf_discarded_delete_marks", &ibuf.n_discarded_ops[IBUF_OP_DELETE_MARK],
+ SHOW_SIZE_T},
+ {"ibuf_discarded_deletes", &ibuf.n_discarded_ops[IBUF_OP_DELETE],
+ SHOW_SIZE_T},
+ {"ibuf_discarded_inserts", &ibuf.n_discarded_ops[IBUF_OP_INSERT],
+ SHOW_SIZE_T},
+ {"ibuf_free_list", &ibuf.free_list_len, SHOW_SIZE_T},
+ {"ibuf_merged_delete_marks", &ibuf.n_merged_ops[IBUF_OP_DELETE_MARK],
+ SHOW_SIZE_T},
+ {"ibuf_merged_deletes", &ibuf.n_merged_ops[IBUF_OP_DELETE], SHOW_SIZE_T},
+ {"ibuf_merged_inserts", &ibuf.n_merged_ops[IBUF_OP_INSERT], SHOW_SIZE_T},
+ {"ibuf_merges", &ibuf.n_merges, SHOW_SIZE_T},
+ {"ibuf_segment_size", &ibuf.seg_size, SHOW_SIZE_T},
+ {"ibuf_size", &ibuf.size, SHOW_SIZE_T},
+ {"log_waits", &export_vars.innodb_log_waits, SHOW_SIZE_T},
+ {"log_write_requests", &export_vars.innodb_log_write_requests, SHOW_SIZE_T},
+ {"log_writes", &export_vars.innodb_log_writes, SHOW_SIZE_T},
+ {"lsn_current", &export_vars.innodb_lsn_current, SHOW_ULONGLONG},
+ {"lsn_flushed", &export_vars.innodb_lsn_flushed, SHOW_ULONGLONG},
+ {"lsn_last_checkpoint", &export_vars.innodb_lsn_last_checkpoint,
+ SHOW_ULONGLONG},
+ {"master_thread_active_loops", &srv_main_active_loops, SHOW_SIZE_T},
+ {"master_thread_idle_loops", &srv_main_idle_loops, SHOW_SIZE_T},
+ {"max_trx_id", &export_vars.innodb_max_trx_id, SHOW_ULONGLONG},
+#ifdef BTR_CUR_HASH_ADAPT
+ {"mem_adaptive_hash", &export_vars.innodb_mem_adaptive_hash, SHOW_SIZE_T},
+#endif
+ {"mem_dictionary", &export_vars.innodb_mem_dictionary, SHOW_SIZE_T},
+ {"os_log_fsyncs", &export_vars.innodb_os_log_fsyncs, SHOW_SIZE_T},
+ {"os_log_pending_fsyncs", &export_vars.innodb_os_log_pending_fsyncs,
+ SHOW_SIZE_T},
+ {"os_log_pending_writes", &export_vars.innodb_os_log_pending_writes,
+ SHOW_SIZE_T},
+ {"os_log_written", &export_vars.innodb_os_log_written, SHOW_SIZE_T},
+ {"page_size", &srv_page_size, SHOW_ULONG},
+ {"pages_created", &buf_pool.stat.n_pages_created, SHOW_SIZE_T},
+ {"pages_read", &buf_pool.stat.n_pages_read, SHOW_SIZE_T},
+ {"pages_written", &buf_pool.stat.n_pages_written, SHOW_SIZE_T},
+ {"row_lock_current_waits", &export_vars.innodb_row_lock_current_waits,
+ SHOW_SIZE_T},
+ {"row_lock_time", &export_vars.innodb_row_lock_time, SHOW_LONGLONG},
+ {"row_lock_time_avg", &export_vars.innodb_row_lock_time_avg, SHOW_SIZE_T},
+ {"row_lock_time_max", &export_vars.innodb_row_lock_time_max, SHOW_SIZE_T},
+ {"row_lock_waits", &export_vars.innodb_row_lock_waits, SHOW_SIZE_T},
+ {"rows_deleted", &export_vars.innodb_rows_deleted, SHOW_SIZE_T},
+ {"rows_inserted", &export_vars.innodb_rows_inserted, SHOW_SIZE_T},
+ {"rows_read", &export_vars.innodb_rows_read, SHOW_SIZE_T},
+ {"rows_updated", &export_vars.innodb_rows_updated, SHOW_SIZE_T},
+ {"system_rows_deleted", &export_vars.innodb_system_rows_deleted,SHOW_SIZE_T},
+ {"system_rows_inserted", &export_vars.innodb_system_rows_inserted,
+ SHOW_SIZE_T},
+ {"system_rows_read", &export_vars.innodb_system_rows_read, SHOW_SIZE_T},
+ {"system_rows_updated", &export_vars.innodb_system_rows_updated,
+ SHOW_SIZE_T},
+ {"num_open_files", &fil_system.n_open, SHOW_SIZE_T},
+ {"truncated_status_writes", &export_vars.innodb_truncated_status_writes,
+ SHOW_SIZE_T},
+ {"available_undo_logs", &srv_available_undo_logs, SHOW_ULONG},
+ {"undo_truncations", &export_vars.innodb_undo_truncations, SHOW_ULONG},
+
+ /* Status variables for page compression */
+ {"page_compression_saved",
+ &export_vars.innodb_page_compression_saved, SHOW_LONGLONG},
+ {"num_index_pages_written",
+ &export_vars.innodb_index_pages_written, SHOW_LONGLONG},
+ {"num_non_index_pages_written",
+ &export_vars.innodb_non_index_pages_written, SHOW_LONGLONG},
+ {"num_pages_page_compressed",
+ &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG},
+ {"num_page_compressed_trim_op",
+ &export_vars.innodb_page_compressed_trim_op, SHOW_LONGLONG},
+ {"num_pages_page_decompressed",
+ &export_vars.innodb_pages_page_decompressed, SHOW_LONGLONG},
+ {"num_pages_page_compression_error",
+ &export_vars.innodb_pages_page_compression_error, SHOW_LONGLONG},
+ {"num_pages_encrypted",
+ &export_vars.innodb_pages_encrypted, SHOW_LONGLONG},
+ {"num_pages_decrypted",
+ &export_vars.innodb_pages_decrypted, SHOW_LONGLONG},
+ {"have_lz4", &innodb_have_lz4, SHOW_BOOL},
+ {"have_lzo", &innodb_have_lzo, SHOW_BOOL},
+ {"have_lzma", &innodb_have_lzma, SHOW_BOOL},
+ {"have_bzip2", &innodb_have_bzip2, SHOW_BOOL},
+ {"have_snappy", &innodb_have_snappy, SHOW_BOOL},
+ {"have_punch_hole", &innodb_have_punch_hole, SHOW_BOOL},
+
+ /* Defragmentation */
+ {"defragment_compression_failures",
+ &export_vars.innodb_defragment_compression_failures, SHOW_SIZE_T},
+ {"defragment_failures", &export_vars.innodb_defragment_failures,SHOW_SIZE_T},
+ {"defragment_count", &export_vars.innodb_defragment_count, SHOW_SIZE_T},
+
+ {"instant_alter_column",
+ &export_vars.innodb_instant_alter_column, SHOW_ULONG},
+
+ /* Online alter table status variables */
+ {"onlineddl_rowlog_rows",
+ &export_vars.innodb_onlineddl_rowlog_rows, SHOW_SIZE_T},
+ {"onlineddl_rowlog_pct_used",
+ &export_vars.innodb_onlineddl_rowlog_pct_used, SHOW_SIZE_T},
+ {"onlineddl_pct_progress",
+ &export_vars.innodb_onlineddl_pct_progress, SHOW_SIZE_T},
+
+	/* Number of times a secondary index lookup triggered a clustered
+	index lookup, and number of times the prefix optimization avoided
+	triggering one */
+ {"secondary_index_triggered_cluster_reads",
+ &export_vars.innodb_sec_rec_cluster_reads, SHOW_SIZE_T},
+ {"secondary_index_triggered_cluster_reads_avoided",
+ &export_vars.innodb_sec_rec_cluster_reads_avoided, SHOW_SIZE_T},
+
+ /* Encryption */
+ {"encryption_rotation_pages_read_from_cache",
+ &export_vars.innodb_encryption_rotation_pages_read_from_cache, SHOW_SIZE_T},
+ {"encryption_rotation_pages_read_from_disk",
+ &export_vars.innodb_encryption_rotation_pages_read_from_disk, SHOW_SIZE_T},
+ {"encryption_rotation_pages_modified",
+ &export_vars.innodb_encryption_rotation_pages_modified, SHOW_SIZE_T},
+ {"encryption_rotation_pages_flushed",
+ &export_vars.innodb_encryption_rotation_pages_flushed, SHOW_SIZE_T},
+ {"encryption_rotation_estimated_iops",
+ &export_vars.innodb_encryption_rotation_estimated_iops, SHOW_SIZE_T},
+ {"encryption_key_rotation_list_length",
+ &export_vars.innodb_key_rotation_list_length, SHOW_LONGLONG},
+ {"encryption_n_merge_blocks_encrypted",
+ &export_vars.innodb_n_merge_blocks_encrypted, SHOW_LONGLONG},
+ {"encryption_n_merge_blocks_decrypted",
+ &export_vars.innodb_n_merge_blocks_decrypted, SHOW_LONGLONG},
+ {"encryption_n_rowlog_blocks_encrypted",
+ &export_vars.innodb_n_rowlog_blocks_encrypted, SHOW_LONGLONG},
+ {"encryption_n_rowlog_blocks_decrypted",
+ &export_vars.innodb_n_rowlog_blocks_decrypted, SHOW_LONGLONG},
+ {"encryption_n_temp_blocks_encrypted",
+ &export_vars.innodb_n_temp_blocks_encrypted, SHOW_LONGLONG},
+ {"encryption_n_temp_blocks_decrypted",
+ &export_vars.innodb_n_temp_blocks_decrypted, SHOW_LONGLONG},
+ {"encryption_num_key_requests", &export_vars.innodb_encryption_key_requests,
+ SHOW_LONGLONG},
+
+ {NullS, NullS, SHOW_LONG}
+};
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return 0 or error number */
+static
+int
+innobase_close_connection(
+/*======================*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ THD* thd); /*!< in: MySQL thread handle for
+ which to close the connection */
+
+/** Cancel any pending lock request associated with the current THD.
+@sa THD::awake() @sa ha_kill_query() */
+static void innobase_kill_query(handlerton*, THD* thd, enum thd_kill_levels);
+static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all);
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database or marks an SQL statement
+ended.
+@return 0 */
+static
+int
+innobase_commit(
+/*============*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ THD* thd, /*!< in: MySQL thread handle of the
+ user for whom the transaction should
+ be committed */
+ bool commit_trx); /*!< in: true - commit transaction
+ false - the current SQL statement
+ ended */
+
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback(
+/*==============*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction should
+ be rolled back */
+ bool rollback_trx); /*!< in: TRUE - rollback entire
+ transaction FALSE - rollback the current
+ statement only */
+
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback_to_savepoint(
+/*===========================*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of
+ the user whose XA transaction should
+ be rolled back to savepoint */
+ void* savepoint); /*!< in: savepoint data */
+
+/*****************************************************************//**
+Check whether the InnoDB state allows MDL locks to be safely released
+after a rollback to savepoint.
+@return true if it is safe, false if it is not. */
+static
+bool
+innobase_rollback_to_savepoint_can_release_mdl(
+/*===========================================*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ THD* thd); /*!< in: handle to the MySQL thread of
+ the user whose XA transaction should
+ be rolled back to savepoint */
+
+/*****************************************************************//**
+Sets a transaction savepoint.
+@return always 0, that is, always succeeds */
+static
+int
+innobase_savepoint(
+/*===============*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of
+ the user's XA transaction for which
+ we need to take a savepoint */
+ void* savepoint); /*!< in: savepoint data */
+
+/*****************************************************************//**
+Release transaction savepoint name.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_release_savepoint(
+/*=======================*/
+ handlerton* hton, /*!< in/out: handlerton for InnoDB */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction's
+ savepoint should be released */
+ void* savepoint); /*!< in: savepoint data */
+
+/** Request notification of log writes */
+static void innodb_log_flush_request(void *cookie);
+
+/** Requests for log flushes */
+struct log_flush_request
+{
+ /** earlier request (for a smaller LSN) */
+ log_flush_request *next;
+ /** parameter provided to innodb_log_flush_request() */
+ void *cookie;
+ /** log sequence number that is being waited for */
+ lsn_t lsn;
+};
+
+/** Buffer of pending innodb_log_flush_request() */
+MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) static
+struct
+{
+ /** first request */
+ std::atomic<log_flush_request*> start;
+ /** last request */
+ log_flush_request *end;
+ /** mutex protecting this object */
+ mysql_mutex_t mutex;
+}
+log_requests;
+
+/** @brief Adjust some InnoDB startup parameters based on file contents
+or innodb_page_size. */
+static
+void
+innodb_params_adjust();
+
+/*******************************************************************//**
+This function is used to prepare an X/Open XA distributed transaction.
+@return 0 or error number */
+static
+int
+innobase_xa_prepare(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of
+ the user whose XA transaction should
+ be prepared */
+ bool all); /*!< in: true - prepare transaction
+ false - the current SQL statement
+ ended */
+/*******************************************************************//**
+This function is used to recover X/Open XA distributed transactions.
+@return number of prepared transactions stored in xid_list */
+static
+int
+innobase_xa_recover(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid_list, /*!< in/out: prepared transactions */
+ uint len); /*!< in: number of slots in xid_list */
+/*******************************************************************//**
+This function is used to commit one X/Open XA distributed transaction
+which is in the prepared state
+@return 0 or error number */
+static
+int
+innobase_commit_by_xid(
+/*===================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid); /*!< in: X/Open XA transaction
+ identification */
+/** Remove all tables in the named database inside InnoDB.
+@param[in] hton handlerton from InnoDB
+@param[in] path Database path; Inside InnoDB the name of the last
+directory in the path is used as the database name.
+For example, in 'mysql/data/test' the database name is 'test'. */
+static
+void
+innobase_drop_database(
+ handlerton* hton,
+ char* path);
+
+/** Shut down the InnoDB storage engine.
+@return 0 */
+static
+int
+innobase_end(handlerton*, ha_panic_function);
+
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started, and
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one.
+@return 0 */
+static
+int
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+ handlerton* hton, /* in: InnoDB handlerton */
+ THD* thd); /* in: MySQL thread handle of the
+ user for whom the transaction should
+ be committed */
+
+/** Flush InnoDB redo logs to the file system.
+@param[in] hton InnoDB handlerton
+@param[in] binlog_group_flush true if we got invoked by binlog
+group commit during flush stage, false in other cases.
+@return false */
+static
+bool
+innobase_flush_logs(
+ handlerton* hton,
+ bool binlog_group_flush)
+{
+ DBUG_ENTER("innobase_flush_logs");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (srv_read_only_mode) {
+ DBUG_RETURN(false);
+ }
+
+ /* If !binlog_group_flush, we got invoked by FLUSH LOGS or similar.
+ Else, we got invoked by binlog group commit during flush stage. */
+
+ if (binlog_group_flush && srv_flush_log_at_trx_commit == 0) {
+ /* innodb_flush_log_at_trx_commit=0
+ (write and sync once per second).
+ Do not flush the redo log during binlog group commit. */
+ DBUG_RETURN(false);
+ }
+
+ /* Flush the redo log buffer to the redo log file.
+ Sync it to disc if we are in FLUSH LOGS, or if
+ innodb_flush_log_at_trx_commit=1
+ (write and sync at each commit). */
+ log_buffer_flush_to_disk(!binlog_group_flush
+ || srv_flush_log_at_trx_commit == 1);
+
+ DBUG_RETURN(false);
+}
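+
+/* A summary of the behaviour above, derived from the code for the
+reader's convenience: nothing is done in read-only mode; with
+innodb_flush_log_at_trx_commit=0 a binlog group commit leaves the redo
+log untouched; in all other cases the log buffer is written out, and it
+is additionally synced to disk when invoked by FLUSH LOGS or when
+innodb_flush_log_at_trx_commit=1. */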
+
+/** Flush InnoDB redo logs to the file system, as if invoked by binlog
+group commit during the flush stage.
+@param[in] hton InnoDB handlerton
+@return false */
+static
+bool
+innobase_flush_logs(
+ handlerton* hton)
+{
+ return innobase_flush_logs(hton, true);
+}
+
+/************************************************************************//**
+Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the
+InnoDB Monitor to the client.
+@return 0 on success */
+static
+int
+innodb_show_status(
+/*===============*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread of
+ the caller */
+ stat_print_fn* stat_print);
+/************************************************************************//**
+Return 0 on success and non-zero on failure. Note: the bool return type
+seems to be abused here, should be an int. */
+static
+bool
+innobase_show_status(
+/*=================*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread of
+ the caller */
+ stat_print_fn* stat_print,
+ enum ha_stat_type stat_type);
+
+/** After ALTER TABLE, recompute statistics. */
+inline void ha_innobase::reload_statistics()
+{
+ if (dict_table_t *table= m_prebuilt ? m_prebuilt->table : nullptr)
+ {
+ if (table->is_readable())
+ dict_stats_init(table);
+ else
+ table->stat_initialized= 1;
+ }
+}
+
+/** After ALTER TABLE, recompute statistics. */
+static int innodb_notify_tabledef_changed(handlerton *,
+ LEX_CSTRING *, LEX_CSTRING *,
+ LEX_CUSTRING *, LEX_CUSTRING *,
+ handler *handler)
+{
+ DBUG_ENTER("innodb_notify_tabledef_changed");
+ if (handler)
+ static_cast<ha_innobase*>(handler)->reload_statistics();
+ DBUG_RETURN(0);
+}
+
+/****************************************************************//**
+Parse and enable InnoDB monitor counters during server startup.
+Users can enable monitor counters/groups by specifying
+"loose-innodb_monitor_enable = monitor_name1;monitor_name2..."
+in the server configuration file or on the command line. */
+static
+void
+innodb_enable_monitor_at_startup(
+/*=============================*/
+ char* str); /*!< in: monitor counter enable list */
+
+#ifdef MYSQL_STORE_FTS_DOC_ID
+/** Store doc_id value into FTS_DOC_ID field
+@param[in,out] tbl table containing FULLTEXT index
+@param[in] doc_id FTS_DOC_ID value */
+static
+void
+innobase_fts_store_docid(
+ TABLE* tbl,
+ ulonglong doc_id)
+{
+ my_bitmap_map* old_map
+ = dbug_tmp_use_all_columns(tbl, tbl->write_set);
+
+ tbl->fts_doc_id_field->store(static_cast<longlong>(doc_id), true);
+
+ dbug_tmp_restore_column_map(tbl->write_set, old_map);
+}
+#endif
+
+/*******************************************************************//**
+Function for constructing an InnoDB table handler instance. */
+static
+handler*
+innobase_create_handler(
+/*====================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ TABLE_SHARE* table,
+ MEM_ROOT* mem_root)
+{
+ return(new (mem_root) ha_innobase(hton, table));
+}
+
+/* General functions */
+
+/** Check that a page_size is correct for InnoDB.
+If correct, set the associated page_size_shift which is the power of 2
+for this page size.
+@param[in] page_size Page Size to evaluate
+@return an associated page_size_shift if valid, 0 if invalid. */
+inline
+ulong
+innodb_page_size_validate(
+ ulong page_size)
+{
+ ulong n;
+
+ DBUG_ENTER("innodb_page_size_validate");
+
+ for (n = UNIV_PAGE_SIZE_SHIFT_MIN;
+ n <= UNIV_PAGE_SIZE_SHIFT_MAX;
+ n++) {
+ if (page_size == static_cast<ulong>(1 << n)) {
+ DBUG_RETURN(n);
+ }
+ }
+
+ DBUG_RETURN(0);
+}
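+
+/* For illustration (assuming the usual UNIV_PAGE_SIZE_SHIFT_MIN/MAX
+bounds): innodb_page_size_validate(16384) returns 14, since
+1 << 14 == 16384, while a non-power-of-two value such as 10000
+yields 0 (invalid). */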
+
+/******************************************************************//**
+Returns true if the thread is the replication thread on the slave
+server.
+@return true if thd is the replication thread */
+ibool
+thd_is_replication_slave_thread(
+/*============================*/
+ THD* thd) /*!< in: thread handle */
+{
+ return thd && ((ibool) thd_slave_thread(thd));
+}
+
+/******************************************************************//**
+Returns true if transaction should be flagged as read-only.
+@return true if the thd is marked as read-only */
+bool
+thd_trx_is_read_only(
+/*=================*/
+ THD* thd) /*!< in: thread handle */
+{
+ return(thd != 0 && thd_tx_is_read_only(thd));
+}
+
+static MYSQL_THDVAR_BOOL(background_thread,
+ PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_NOSYSVAR,
+ "Internal (not user visible) flag to mark "
+ "background purge threads", NULL, NULL, 0);
+
+/** Create a MYSQL_THD for a background thread and mark it as such.
+@param name thread info for SHOW PROCESSLIST
+@return new MYSQL_THD */
+MYSQL_THD
+innobase_create_background_thd(const char* name)
+/*============================*/
+{
+ MYSQL_THD thd= create_background_thd();
+ thd_proc_info(thd, name);
+ THDVAR(thd, background_thread) = true;
+ return thd;
+}
+
+
+/** Destroy a background purge thread THD.
+@param[in] thd MYSQL_THD to destroy */
+void
+innobase_destroy_background_thd(
+/*============================*/
+ MYSQL_THD thd)
+{
+	/* We need to close the connection explicitly; the server won't do
+	it if InnoDB is in the PLUGIN_IS_DYING state. */
+ innobase_close_connection(innodb_hton_ptr, thd);
+ thd_set_ha_data(thd, innodb_hton_ptr, NULL);
+ destroy_background_thd(thd);
+}
+
+/** Close opened tables, free memory, delete items for a MYSQL_THD.
+@param[in] thd MYSQL_THD to reset */
+void
+innobase_reset_background_thd(MYSQL_THD thd)
+{
+ if (!thd) {
+ thd = current_thd;
+ }
+
+ ut_ad(thd);
+ ut_ad(THDVAR(thd, background_thread));
+
+ /* background purge thread */
+ const char *proc_info= thd_proc_info(thd, "reset");
+ reset_thd(thd);
+ thd_proc_info(thd, proc_info);
+}
+
+
+/******************************************************************//**
+Check if the transaction is an auto-commit transaction. TRUE also
+implies that it is a SELECT (read-only) transaction.
+@return true if the transaction is an auto commit read-only transaction. */
+ibool
+thd_trx_is_auto_commit(
+/*===================*/
+ THD* thd) /*!< in: thread handle, can be NULL */
+{
+ return(thd != NULL
+ && !thd_test_options(
+ thd,
+ OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)
+ && thd_sql_command(thd) == SQLCOM_SELECT);
+}
+
+/******************************************************************//**
+Returns the NUL terminated value of glob_hostname.
+@return pointer to glob_hostname. */
+const char*
+server_get_hostname()
+/*=================*/
+{
+ return(glob_hostname);
+}
+
+/******************************************************************//**
+Returns true if the transaction this thread is processing has edited
+non-transactional tables. Used by the deadlock detector when deciding
+which transaction to roll back in case of a deadlock - we try to avoid
+rolling back transactions that have edited non-transactional tables.
+@return true if non-transactional tables have been edited */
+ibool
+thd_has_edited_nontrans_tables(
+/*===========================*/
+ THD* thd) /*!< in: thread handle */
+{
+ return((ibool) thd_non_transactional_update(thd));
+}
+
+/* Return the high-resolution timestamp for the start of the current query */
+UNIV_INTERN
+unsigned long long
+thd_query_start_micro(
+ const THD* thd) /*!< in: thread handle */
+{
+ return thd_start_utime(thd);
+}
+
+/******************************************************************//**
+Returns the lock wait timeout for the current connection.
+@return the lock wait timeout, in seconds */
+ulong
+thd_lock_wait_timeout(
+/*==================*/
+ THD* thd) /*!< in: thread handle, or NULL to query
+ the global innodb_lock_wait_timeout */
+{
+ /* According to <mysql/plugin.h>, passing thd == NULL
+ returns the global value of the session variable. */
+ return(THDVAR(thd, lock_wait_timeout));
+}
+
+/** Get the value of innodb_tmpdir.
+@param[in] thd thread handle, or NULL to query
+ the global innodb_tmpdir.
+@retval NULL if innodb_tmpdir="" */
+const char*
+thd_innodb_tmpdir(
+ THD* thd)
+{
+ ut_ad(!sync_check_iterate(sync_check()));
+
+ const char* tmp_dir = THDVAR(thd, tmpdir);
+
+ if (tmp_dir != NULL && *tmp_dir == '\0') {
+ tmp_dir = NULL;
+ }
+
+ return(tmp_dir);
+}
+
+/** Obtain the InnoDB transaction of a MySQL thread.
+@param[in,out] thd thread handle
+@return reference to transaction pointer */
+static trx_t* thd_to_trx(THD* thd)
+{
+ return reinterpret_cast<trx_t*>(thd_get_ha_data(thd, innodb_hton_ptr));
+}
+
+#ifdef WITH_WSREP
+/********************************************************************//**
+Obtain the InnoDB transaction id of a MySQL thread.
+@return transaction id */
+__attribute__((warn_unused_result, nonnull))
+ulonglong
+thd_to_trx_id(
+ THD* thd) /*!< in: MySQL thread */
+{
+ return(thd_to_trx(thd)->id);
+}
+
+static void wsrep_abort_transaction(handlerton*, THD *, THD *, my_bool);
+static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid);
+static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid);
+#endif /* WITH_WSREP */
+/********************************************************************//**
+Converts an InnoDB error code to a MySQL error code and also tells to MySQL
+about a possible transaction rollback inside InnoDB caused by a lock wait
+timeout or a deadlock.
+@return MySQL error code */
+static int
+convert_error_code_to_mysql(
+/*========================*/
+ dberr_t error, /*!< in: InnoDB error code */
+ ulint flags, /*!< in: InnoDB table flags, or 0 */
+ THD* thd) /*!< in: user thread handle or NULL */
+{
+ switch (error) {
+ case DB_SUCCESS:
+ return(0);
+
+ case DB_INTERRUPTED:
+ return(HA_ERR_ABORTED_BY_USER);
+
+ case DB_FOREIGN_EXCEED_MAX_CASCADE:
+ ut_ad(thd);
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_ROW_IS_REFERENCED,
+ "InnoDB: Cannot delete/update "
+ "rows with cascading foreign key "
+ "constraints that exceed max "
+ "depth of %d. Please "
+ "drop extra constraints and try "
+ "again", DICT_FK_MAX_RECURSIVE_LOAD);
+ return(HA_ERR_FK_DEPTH_EXCEEDED);
+
+ case DB_CANT_CREATE_GEOMETRY_OBJECT:
+ my_error(ER_CANT_CREATE_GEOMETRY_OBJECT, MYF(0));
+ return(HA_ERR_NULL_IN_SPATIAL);
+
+ case DB_ERROR:
+ default:
+ return(HA_ERR_GENERIC); /* unspecified error */
+
+ case DB_DUPLICATE_KEY:
+ /* Be cautious with returning this error, since
+ mysql could re-enter the storage layer to get
+ duplicated key info, the operation requires a
+ valid table handle and/or transaction information,
+ which might not always be available in the error
+ handling stage. */
+ return(HA_ERR_FOUND_DUPP_KEY);
+
+ case DB_READ_ONLY:
+ return(HA_ERR_TABLE_READONLY);
+
+ case DB_FOREIGN_DUPLICATE_KEY:
+ return(HA_ERR_FOREIGN_DUPLICATE_KEY);
+
+ case DB_MISSING_HISTORY:
+ return(HA_ERR_TABLE_DEF_CHANGED);
+
+ case DB_RECORD_NOT_FOUND:
+ return(HA_ERR_NO_ACTIVE_RECORD);
+
+ case DB_DEADLOCK:
+ /* Since we rolled back the whole transaction, we must
+ tell it also to MySQL so that MySQL knows to empty the
+ cached binlog for this transaction */
+
+ if (thd != NULL) {
+ thd_mark_transaction_to_rollback(thd, 1);
+ }
+
+ return(HA_ERR_LOCK_DEADLOCK);
+
+ case DB_LOCK_WAIT_TIMEOUT:
+ /* Starting from 5.0.13, we let MySQL just roll back the
+ latest SQL statement in a lock wait timeout. Previously, we
+ rolled back the whole transaction. */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(
+ thd, (bool) row_rollback_on_timeout);
+ }
+
+ return(HA_ERR_LOCK_WAIT_TIMEOUT);
+
+ case DB_NO_REFERENCED_ROW:
+ return(HA_ERR_NO_REFERENCED_ROW);
+
+ case DB_ROW_IS_REFERENCED:
+ return(HA_ERR_ROW_IS_REFERENCED);
+
+ case DB_NO_FK_ON_S_BASE_COL:
+ case DB_CANNOT_ADD_CONSTRAINT:
+ case DB_CHILD_NO_INDEX:
+ case DB_PARENT_NO_INDEX:
+ return(HA_ERR_CANNOT_ADD_FOREIGN);
+
+ case DB_CANNOT_DROP_CONSTRAINT:
+
+ return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit
+ misleading, a new MySQL error
+ code should be introduced */
+
+ case DB_CORRUPTION:
+ return(HA_ERR_CRASHED);
+
+ case DB_OUT_OF_FILE_SPACE:
+ return(HA_ERR_RECORD_FILE_FULL);
+
+ case DB_TEMP_FILE_WRITE_FAIL:
+ my_error(ER_GET_ERRMSG, MYF(0),
+ DB_TEMP_FILE_WRITE_FAIL,
+ ut_strerr(DB_TEMP_FILE_WRITE_FAIL),
+ "InnoDB");
+ return(HA_ERR_INTERNAL_ERROR);
+
+ case DB_TABLE_IN_FK_CHECK:
+ return(HA_ERR_TABLE_IN_FK_CHECK);
+
+ case DB_TABLE_IS_BEING_USED:
+ return(HA_ERR_WRONG_COMMAND);
+
+ case DB_TABLE_NOT_FOUND:
+ return(HA_ERR_NO_SUCH_TABLE);
+
+ case DB_DECRYPTION_FAILED:
+ return(HA_ERR_DECRYPTION_FAILED);
+
+ case DB_TABLESPACE_NOT_FOUND:
+ return(HA_ERR_TABLESPACE_MISSING);
+
+ case DB_TOO_BIG_RECORD: {
+ /* If prefix is true then a 768-byte prefix is stored
+ locally for BLOB fields. Refer to dict_table_get_format().
+ We limit max record size to 16k for 64k page size. */
+ bool prefix = !DICT_TF_HAS_ATOMIC_BLOBS(flags);
+ bool comp = !!(flags & DICT_TF_COMPACT);
+ ulint free_space = page_get_free_space_of_empty(comp) / 2;
+
+ if (free_space >= ulint(comp ? COMPRESSED_REC_MAX_DATA_SIZE :
+ REDUNDANT_REC_MAX_DATA_SIZE)) {
+ free_space = (comp ? COMPRESSED_REC_MAX_DATA_SIZE :
+ REDUNDANT_REC_MAX_DATA_SIZE) - 1;
+ }
+
+ my_printf_error(ER_TOO_BIG_ROWSIZE,
+ "Row size too large (> " ULINTPF "). Changing some columns "
+ "to TEXT or BLOB %smay help. In current row "
+ "format, BLOB prefix of %d bytes is stored inline.",
+ MYF(0),
+ free_space,
+ prefix
+ ? "or using ROW_FORMAT=DYNAMIC or"
+ " ROW_FORMAT=COMPRESSED "
+ : "",
+ prefix
+ ? DICT_MAX_FIXED_COL_LEN
+ : 0);
+ return(HA_ERR_TO_BIG_ROW);
+ }
+
+ case DB_TOO_BIG_INDEX_COL:
+ my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+ (ulong) DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
+ return(HA_ERR_INDEX_COL_TOO_LONG);
+
+ case DB_NO_SAVEPOINT:
+ return(HA_ERR_NO_SAVEPOINT);
+
+ case DB_LOCK_TABLE_FULL:
+ /* Since we rolled back the whole transaction, we must
+ tell it also to MySQL so that MySQL knows to empty the
+ cached binlog for this transaction */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(thd, 1);
+ }
+
+ return(HA_ERR_LOCK_TABLE_FULL);
+
+ case DB_FTS_INVALID_DOCID:
+ return(HA_FTS_INVALID_DOCID);
+ case DB_FTS_EXCEED_RESULT_CACHE_LIMIT:
+ return(HA_ERR_OUT_OF_MEM);
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ return(HA_ERR_TOO_MANY_CONCURRENT_TRXS);
+ case DB_UNSUPPORTED:
+ return(HA_ERR_UNSUPPORTED);
+ case DB_INDEX_CORRUPT:
+ return(HA_ERR_INDEX_CORRUPT);
+ case DB_UNDO_RECORD_TOO_BIG:
+ return(HA_ERR_UNDO_REC_TOO_BIG);
+ case DB_OUT_OF_MEMORY:
+ return(HA_ERR_OUT_OF_MEM);
+ case DB_TABLESPACE_EXISTS:
+ return(HA_ERR_TABLESPACE_EXISTS);
+ case DB_TABLESPACE_DELETED:
+ return(HA_ERR_TABLESPACE_MISSING);
+ case DB_IDENTIFIER_TOO_LONG:
+ return(HA_ERR_INTERNAL_ERROR);
+ case DB_TABLE_CORRUPT:
+ return(HA_ERR_TABLE_CORRUPT);
+ case DB_FTS_TOO_MANY_WORDS_IN_PHRASE:
+ return(HA_ERR_FTS_TOO_MANY_WORDS_IN_PHRASE);
+ case DB_COMPUTE_VALUE_FAILED:
+ return(HA_ERR_GENERIC); // impossible
+ }
+}
+
+/*************************************************************//**
+Prints info of a THD object (== user session thread) to the given file. */
+void
+innobase_mysql_print_thd(
+/*=====================*/
+ FILE* f, /*!< in: output stream */
+ THD* thd, /*!< in: MySQL THD object */
+ uint max_query_len) /*!< in: max query length to print, or 0 to
+ use the default max length */
+{
+ char buffer[1024];
+
+ fputs(thd_get_error_context_description(thd, buffer, sizeof buffer,
+ max_query_len), f);
+ putc('\n', f);
+}
+
+/******************************************************************//**
+Get the variable length bounds of the given character set. */
+void
+innobase_get_cset_width(
+/*====================*/
+ ulint cset, /*!< in: MySQL charset-collation code */
+ unsigned*mbminlen, /*!< out: minimum length of a char (in bytes) */
+ unsigned*mbmaxlen) /*!< out: maximum length of a char (in bytes) */
+{
+ CHARSET_INFO* cs;
+ ut_ad(cset <= MAX_CHAR_COLL_NUM);
+ ut_ad(mbminlen);
+ ut_ad(mbmaxlen);
+
+ cs = all_charsets[cset];
+ if (cs) {
+ *mbminlen = cs->mbminlen;
+ *mbmaxlen = cs->mbmaxlen;
+ ut_ad(*mbminlen < DATA_MBMAX);
+ ut_ad(*mbmaxlen < DATA_MBMAX);
+ } else {
+ THD* thd = current_thd;
+
+ if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) {
+
+ /* Fix bug#46256: allow tables to be dropped if the
+ collation is not found, but issue a warning. */
+ if (cset != 0) {
+
+ sql_print_warning(
+ "Unknown collation #" ULINTPF ".",
+ cset);
+ }
+ } else {
+
+ ut_a(cset == 0);
+ }
+
+ *mbminlen = *mbmaxlen = 0;
+ }
+}
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+void
+innobase_convert_from_table_id(
+/*===========================*/
+ CHARSET_INFO* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len) /*!< in: length of 'to', in bytes */
+{
+ uint errors;
+
+ strconvert(cs, from, FN_REFLEN, &my_charset_filename, to, (uint) len, &errors);
+}
+
+/**********************************************************************//**
+Check if the length of the identifier exceeds the maximum allowed.
+@return true when the length of the identifier is too long */
+my_bool
+innobase_check_identifier_length(
+/*=============================*/
+ const char* id) /* in: FK identifier to check excluding the
+ database portion. */
+{
+ int well_formed_error = 0;
+ CHARSET_INFO *cs = system_charset_info;
+ DBUG_ENTER("innobase_check_identifier_length");
+
+ size_t len = my_well_formed_length(
+ cs, id, id + strlen(id),
+ NAME_CHAR_LEN, &well_formed_error);
+
+ if (well_formed_error || len == NAME_CHAR_LEN) {
+ my_error(ER_TOO_LONG_IDENT, MYF(0), id);
+ DBUG_RETURN(true);
+ }
+ DBUG_RETURN(false);
+}
+
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+void
+innobase_convert_from_id(
+/*=====================*/
+ CHARSET_INFO* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len) /*!< in: length of 'to', in bytes */
+{
+ uint errors;
+
+ strconvert(cs, from, FN_REFLEN, system_charset_info, to, (uint) len, &errors);
+}
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return 0 if a=b, <0 if a<b, >0 if a>b */
+int
+innobase_strcasecmp(
+/*================*/
+ const char* a, /*!< in: first string to compare */
+ const char* b) /*!< in: second string to compare */
+{
+ if (!a) {
+ if (!b) {
+ return(0);
+ } else {
+ return(-1);
+ }
+ } else if (!b) {
+ return(1);
+ }
+
+ return(my_strcasecmp(system_charset_info, a, b));
+}
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively. The
+second string contains wildcards.
+@return 0 if a match is found, 1 if not */
+static
+int
+innobase_wildcasecmp(
+/*=================*/
+ const char* a, /*!< in: string to compare */
+ const char* b) /*!< in: wildcard string to compare */
+{
+ return(wild_case_compare(system_charset_info, a, b));
+}
+
+/** Strip dir name from a full path name and return only the file name
+@param[in] path_name full path name
+@return file name or "null" if no file name */
+const char*
+innobase_basename(
+ const char* path_name)
+{
+ const char* name = base_name(path_name);
+
+ return((name) ? name : "null");
+}
+
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+void
+innobase_casedn_str(
+/*================*/
+ char* a) /*!< in/out: string to put in lower case */
+{
+ my_casedn_str(system_charset_info, a);
+}
+
+/** Determines the current SQL statement.
+Thread unsafe, can only be called from the thread owning the THD.
+@param[in] thd MySQL thread handle
+@param[out] length Length of the SQL statement
+@return SQL statement string */
+const char*
+innobase_get_stmt_unsafe(
+ THD* thd,
+ size_t* length)
+{
+ if (const LEX_STRING *stmt = thd_query_string(thd)) {
+ *length = stmt->length;
+ return stmt->str;
+ }
+
+ *length = 0;
+ return NULL;
+}
+
+/**********************************************************************//**
+Get the current setting of the tdc_size global parameter. We do
+a dirty read because, for one, there is no synchronization object and,
+secondly, there is little harm in doing so even if we get a torn read.
+@return value of tdc_size */
+ulint
+innobase_get_table_cache_size(void)
+/*===============================*/
+{
+ return(tdc_size);
+}
+
+/**********************************************************************//**
+Get the current setting of the lower_case_table_names global parameter from
+mysqld.cc. We do a dirty read because, for one, there is no synchronization
+object and, secondly, there is little harm in doing so even if we get a torn
+read.
+@return value of lower_case_table_names */
+ulint
+innobase_get_lower_case_table_names(void)
+/*=====================================*/
+{
+ return(lower_case_table_names);
+}
+
+/**
+  Test whether a file path is the same as the mysql data directory path.
+
+ @param path null terminated character string
+
+ @return
+  @retval TRUE The path is different from the mysql data directory.
+  @retval FALSE The path is the same as the mysql data directory.
+*/
+static bool is_mysql_datadir_path(const char *path)
+{
+ if (path == NULL)
+ return false;
+
+ char mysql_data_dir[FN_REFLEN], path_dir[FN_REFLEN];
+ convert_dirname(path_dir, path, NullS);
+ convert_dirname(mysql_data_dir, mysql_unpacked_real_data_home, NullS);
+ size_t mysql_data_home_len= dirname_length(mysql_data_dir);
+ size_t path_len = dirname_length(path_dir);
+
+ if (path_len < mysql_data_home_len)
+ return true;
+
+ if (!lower_case_file_system)
+ return(memcmp(mysql_data_dir, path_dir, mysql_data_home_len));
+
+ return(files_charset_info->strnncoll((uchar *) path_dir, path_len,
+ (uchar *) mysql_data_dir,
+ mysql_data_home_len,
+ TRUE));
+}
+
+/*********************************************************************//**
+Wrapper around MySQL's copy_and_convert function.
+@return number of bytes copied to 'to' */
+static
+ulint
+innobase_convert_string(
+/*====================*/
+ void* to, /*!< out: converted string */
+ ulint to_length, /*!< in: number of bytes reserved
+ for the converted string */
+ CHARSET_INFO* to_cs, /*!< in: character set to convert to */
+ const void* from, /*!< in: string to convert */
+ ulint from_length, /*!< in: number of bytes to convert */
+ CHARSET_INFO* from_cs, /*!< in: character set to convert
+ from */
+ uint* errors) /*!< out: number of errors encountered
+ during the conversion */
+{
+ return(copy_and_convert(
+ (char*) to, (uint32) to_length, to_cs,
+ (const char*) from, (uint32) from_length, from_cs,
+ errors));
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
+the result to "buf". The result is converted to "system_charset_info".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+ulint
+innobase_raw_format(
+/*================*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint charset_coll, /*!< in: charset collation */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+ /* XXX we use a hard limit instead of allocating
+	buf_size bytes from the heap */
+ CHARSET_INFO* data_cs;
+ char buf_tmp[8192];
+ ulint buf_tmp_used;
+ uint num_errors;
+
+ data_cs = all_charsets[charset_coll];
+
+ buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp),
+ system_charset_info,
+ data, data_len, data_cs,
+ &num_errors);
+
+ return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size));
+}
+
+/*
+The helper function nlz(x) calculates the number of leading zeros
+in the binary representation of the number "x", either using a
+built-in compiler function or a substitute trick based on the use
+of the multiplication operation and a table indexed by the prefix
+of the multiplication result:
+*/
+#ifdef __GNUC__
+#define nlz(x) __builtin_clzll(x)
+#elif defined(_MSC_VER) && !defined(_M_CEE_PURE) && \
+ (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64))
+#ifndef __INTRIN_H_
+#pragma warning(push, 4)
+#pragma warning(disable: 4255 4668)
+#include <intrin.h>
+#pragma warning(pop)
+#endif
+__forceinline unsigned int nlz (ulonglong x)
+{
+#if defined(_M_IX86) || defined(_M_X64)
+ unsigned long n;
+#ifdef _M_X64
+ _BitScanReverse64(&n, x);
+ return (unsigned int) n ^ 63;
+#else
+ unsigned long y = (unsigned long) (x >> 32);
+ unsigned int m = 31;
+ if (y == 0)
+ {
+ y = (unsigned long) x;
+ m = 63;
+ }
+ _BitScanReverse(&n, y);
+ return (unsigned int) n ^ m;
+#endif
+#elif defined(_M_ARM64)
+ return _CountLeadingZeros(x);
+#endif
+}
+#else
+inline unsigned int nlz (ulonglong x)
+{
+ static unsigned char table [48] = {
+ 32, 6, 5, 0, 4, 12, 0, 20,
+ 15, 3, 11, 0, 0, 18, 25, 31,
+ 8, 14, 2, 0, 10, 0, 0, 0,
+ 0, 0, 0, 21, 0, 0, 19, 26,
+ 7, 0, 13, 0, 16, 1, 22, 27,
+ 9, 0, 17, 23, 28, 24, 29, 30
+ };
+ unsigned int y= (unsigned int) (x >> 32);
+ unsigned int n= 0;
+ if (y == 0) {
+ y= (unsigned int) x;
+ n= 32;
+ }
+ y = y | (y >> 1); // Propagate leftmost 1-bit to the right.
+ y = y | (y >> 2);
+ y = y | (y >> 4);
+ y = y | (y >> 8);
+ y = y & ~(y >> 16);
+ y = y * 0x3EF5D037;
+ return n + table[y >> 26];
+}
+#endif
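+
+/* A worked example of the table-based fallback above (illustrative
+only): for x == 1 the high word is 0, so y is reloaded with the low
+word (y == 1) and n == 32. The propagation steps leave y == 1,
+y & ~(y >> 16) == 1, and 1 * 0x3EF5D037 == 0x3EF5D037, whose top six
+bits (0x3EF5D037 >> 26 == 15) select table[15] == 31. The result is
+n + 31 == 63, the number of leading zeros in the 64-bit value 1. */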
+
+/*********************************************************************//**
+Compute the next autoinc value.
+
+For MySQL replication the autoincrement values can be partitioned among
+the nodes. The offset is the start or origin of the autoincrement value
+for a particular node. For n nodes the increment will be n and the offset
+will be in the interval [1, n]. The formula tries to allocate the next
+value for a particular node.
+
+Note: This function is also called with increment set to the number of
+values we want to reserve for multi-value inserts e.g.,
+
+ INSERT INTO T VALUES(), (), ();
+
+innobase_next_autoinc() will be called with increment set to 3 where
+autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
+the multi-value INSERT above.
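+
+As a worked illustration (hypothetical values): with current=7, need=1,
+step=5 and offset=3, the current value is first rounded down to a step
+boundary relative to the offset (7 - 3 = 4, 4 - 4 % 5 = 0), the offset
+is added back (0 + 3 = 3), and one block (need * step = 5) is added,
+yielding 8, the next member of the sequence 3, 8, 13, ...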
+@return the next value */
+ulonglong
+innobase_next_autoinc(
+/*==================*/
+ ulonglong current, /*!< in: Current value */
+ ulonglong need, /*!< in: count of values needed */
+ ulonglong step, /*!< in: AUTOINC increment step */
+ ulonglong offset, /*!< in: AUTOINC offset */
+ ulonglong max_value) /*!< in: max value for type */
+{
+ ulonglong next_value;
+ ulonglong block;
+
+ /* Should never be 0. */
+ ut_a(need > 0);
+ ut_a(step > 0);
+ ut_a(max_value > 0);
+
+ /*
+ We need to calculate the "block" value equal to the product
+ "step * need". However, when calculating this product, an integer
+ overflow can occur, so we cannot simply use the usual multiplication
+ operation. The snippet below calculates the product of two numbers
+ and detects an unsigned integer overflow:
+ */
+ unsigned int m= nlz(need);
+ unsigned int n= nlz(step);
+ if (m + n <= 8 * sizeof(ulonglong) - 2) {
+ // The bit width of the original values is too large,
+ // therefore we are guaranteed to get an overflow.
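+		// (With m and n leading zeros, need and step have 64 - m
+		// and 64 - n significant bits, so their product needs at
+		// least (64 - m) + (64 - n) - 1 = 127 - m - n bits; for
+		// m + n <= 62 this is at least 65 bits, more than an
+		// ulonglong can hold.)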
+ goto overflow;
+ }
+ block = need * (step >> 1);
+ if ((longlong) block < 0) {
+ goto overflow;
+ }
+ block += block;
+ if (step & 1) {
+ block += need;
+ if (block < need) {
+ goto overflow;
+ }
+ }
+
+ /* Check for overflow. Current can be > max_value if the value
+ is in reality a negative value. Also, the visual studio compiler
+ converts large double values (which hypothetically can then be
+ passed here as the values of the "current" parameter) automatically
+	into the maximum value of the unsigned long long datatype: */
+ if (current > max_value) {
+ goto overflow;
+ }
+
+ /* According to MySQL documentation, if the offset is greater than
+ the step then the offset is ignored. */
+ if (offset > step) {
+ offset = 0;
+ }
+
+ /*
+ Let's round the current value to within a step-size block:
+ */
+ if (current > offset) {
+ next_value = current - offset;
+ } else {
+ next_value = offset - current;
+ }
+ next_value -= next_value % step;
+
+ /*
+ Add an offset to the next value and check that the addition
+ does not cause an integer overflow:
+ */
+ next_value += offset;
+ if (next_value < offset) {
+ goto overflow;
+ }
+
+ /*
+ Add a block to the next value and check that the addition
+ does not cause an integer overflow:
+ */
+ next_value += block;
+ if (next_value < block) {
+ goto overflow;
+ }
+
+ return(next_value);
+
+overflow:
+ /*
+ Allow auto_increment to go over max_value up to max ulonglong.
+ This allows us to detect that all values are exhausted.
+ If we don't do this, we will return max_value several times
+ and get duplicate key errors instead of auto increment value
+ out of range:
+ */
+ return(~(ulonglong) 0);
+}
+
+/********************************************************************//**
+Reset the autoinc value in the table.
+@return DB_SUCCESS if all went well else error code */
+UNIV_INTERN
+dberr_t
+ha_innobase::innobase_reset_autoinc(
+/*================================*/
+ ulonglong autoinc) /*!< in: value to store */
+{
+ dberr_t error;
+
+ error = innobase_lock_autoinc();
+
+ if (error == DB_SUCCESS) {
+
+ dict_table_autoinc_initialize(m_prebuilt->table, autoinc);
+ m_prebuilt->table->autoinc_mutex.unlock();
+ }
+
+ return(error);
+}
+
+/*******************************************************************//**
+Reset the auto-increment counter to the given value, i.e. the next row
+inserted will get the given value. This is called e.g. after TRUNCATE
+is emulated by doing a 'DELETE FROM t'. HA_ERR_WRONG_COMMAND is
+returned by storage engines that don't support this operation.
+@return 0 or error code */
+UNIV_INTERN
+int
+ha_innobase::reset_auto_increment(
+/*==============================*/
+ ulonglong value) /*!< in: new value for table autoinc */
+{
+ DBUG_ENTER("ha_innobase::reset_auto_increment");
+
+ dberr_t error;
+
+ update_thd(ha_thd());
+
+ error = row_lock_table_autoinc_for_mysql(m_prebuilt);
+
+ if (error != DB_SUCCESS) {
+ DBUG_RETURN(convert_error_code_to_mysql(
+ error, m_prebuilt->table->flags, m_user_thd));
+ }
+
+ /* The next value can never be 0. */
+ if (value == 0) {
+ value = 1;
+ }
+
+ innobase_reset_autoinc(value);
+
+ DBUG_RETURN(0);
+}
+
+/*********************************************************************//**
+Initializes some fields in an InnoDB transaction object. */
+static
+void
+innobase_trx_init(
+/*==============*/
+ THD* thd, /*!< in: user thread handle */
+ trx_t* trx) /*!< in/out: InnoDB transaction handle */
+{
+ DBUG_ENTER("innobase_trx_init");
+ DBUG_ASSERT(thd == trx->mysql_thd);
+
+ /* Ensure that thd_lock_wait_timeout(), which may be called
+ while holding lock_sys.mutex, by lock_rec_enqueue_waiting(),
+ will not end up acquiring LOCK_global_system_variables in
+ intern_sys_var_ptr(). */
+ THDVAR(thd, lock_wait_timeout);
+
+ trx->check_foreigns = !thd_test_options(
+ thd, OPTION_NO_FOREIGN_KEY_CHECKS);
+
+ trx->check_unique_secondary = !thd_test_options(
+ thd, OPTION_RELAXED_UNIQUE_CHECKS);
+#ifdef WITH_WSREP
+ trx->wsrep = wsrep_on(thd);
+#endif
+
+ DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Allocates an InnoDB transaction for a MySQL handler object for DML.
+@return InnoDB transaction handle */
+trx_t*
+innobase_trx_allocate(
+/*==================*/
+ THD* thd) /*!< in: user thread handle */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_trx_allocate");
+ DBUG_ASSERT(thd != NULL);
+ DBUG_ASSERT(EQ_CURRENT_THD(thd));
+
+ trx = trx_create();
+
+ trx->mysql_thd = thd;
+
+ innobase_trx_init(thd, trx);
+
+ DBUG_RETURN(trx);
+}
+
+/*********************************************************************//**
+Gets the InnoDB transaction handle for a MySQL handler object, creates
+an InnoDB transaction struct if the corresponding MySQL thread struct still
+lacks one.
+@return InnoDB transaction handle */
+static inline
+trx_t*
+check_trx_exists(
+/*=============*/
+ THD* thd) /*!< in: user thread handle */
+{
+ if (trx_t* trx = thd_to_trx(thd)) {
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+ innobase_trx_init(thd, trx);
+ return trx;
+ } else {
+ trx = innobase_trx_allocate(thd);
+ thd_set_ha_data(thd, innodb_hton_ptr, trx);
+ return trx;
+ }
+}
+
+/**
+ Gets current trx.
+
+ This function may be called during InnoDB initialisation, when
+  innodb_hton_ptr->slot is not yet set to a meaningful value.
+*/
+
+trx_t *current_trx()
+{
+ THD *thd=current_thd;
+ if (likely(thd != 0) && innodb_hton_ptr->slot != HA_SLOT_UNDEF) {
+ return thd_to_trx(thd);
+ } else {
+ return(NULL);
+ }
+}
+
+/*********************************************************************//**
+Check whether a transaction has been registered with MySQL.
+@return true if the transaction is registered with the MySQL 2PC coordinator */
+static inline
+bool
+trx_is_registered_for_2pc(
+/*======================*/
+ const trx_t* trx) /* in: transaction */
+{
+ return(trx->is_registered == 1);
+}
+
+/*********************************************************************//**
+Note that a transaction has been deregistered. */
+static inline
+void
+trx_deregister_from_2pc(
+/*====================*/
+ trx_t* trx) /* in: transaction */
+{
+ trx->is_registered= false;
+ trx->active_commit_ordered= false;
+}
+
+/*********************************************************************//**
+Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object.
+Those flags are stored in .frm file and end up in the MySQL table object,
+but are frequently used inside InnoDB so we keep their copies into the
+InnoDB table object. */
+static
+void
+innobase_copy_frm_flags_from_create_info(
+/*=====================================*/
+ dict_table_t* innodb_table, /*!< in/out: InnoDB table */
+ const HA_CREATE_INFO* create_info) /*!< in: create info */
+{
+ ibool ps_on;
+ ibool ps_off;
+
+ if (innodb_table->is_temporary()
+ || innodb_table->no_rollback()) {
+ /* Temp tables do not use persistent stats. */
+ ps_on = FALSE;
+ ps_off = TRUE;
+ } else {
+ ps_on = create_info->table_options
+ & HA_OPTION_STATS_PERSISTENT;
+ ps_off = create_info->table_options
+ & HA_OPTION_NO_STATS_PERSISTENT;
+ }
+
+ dict_stats_set_persistent(innodb_table, ps_on, ps_off);
+
+ dict_stats_auto_recalc_set(
+ innodb_table,
+ create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON,
+ create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF);
+
+ innodb_table->stats_sample_pages = create_info->stats_sample_pages;
+}
+
+/*********************************************************************//**
+Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object.
+Those flags are stored in .frm file and end up in the MySQL table object,
+but are frequently used inside InnoDB so we keep their copies into the
+InnoDB table object. */
+void
+innobase_copy_frm_flags_from_table_share(
+/*=====================================*/
+ dict_table_t* innodb_table, /*!< in/out: InnoDB table */
+ const TABLE_SHARE* table_share) /*!< in: table share */
+{
+ ibool ps_on;
+ ibool ps_off;
+
+ if (innodb_table->is_temporary()) {
+ /* Temp tables do not use persistent stats */
+ ps_on = FALSE;
+ ps_off = TRUE;
+ } else {
+ ps_on = table_share->db_create_options
+ & HA_OPTION_STATS_PERSISTENT;
+ ps_off = table_share->db_create_options
+ & HA_OPTION_NO_STATS_PERSISTENT;
+ }
+
+ dict_stats_set_persistent(innodb_table, ps_on, ps_off);
+
+ dict_stats_auto_recalc_set(
+ innodb_table,
+ table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON,
+ table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF);
+
+ innodb_table->stats_sample_pages = table_share->stats_sample_pages;
+}
+
+/*********************************************************************//**
+Construct ha_innobase handler. */
+
+ha_innobase::ha_innobase(
+/*=====================*/
+ handlerton* hton,
+ TABLE_SHARE* table_arg)
+ :handler(hton, table_arg),
+ m_prebuilt(),
+ m_user_thd(),
+ m_int_table_flags(HA_REC_NOT_IN_SEQ
+ | HA_NULL_IN_KEY
+ | HA_CAN_VIRTUAL_COLUMNS
+ | HA_CAN_INDEX_BLOBS
+ | HA_CAN_SQL_HANDLER
+ | HA_REQUIRES_KEY_COLUMNS_FOR_DELETE
+ | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION
+ | HA_PRIMARY_KEY_IN_READ_INDEX
+ | HA_BINLOG_ROW_CAPABLE
+ | HA_CAN_GEOMETRY
+ | HA_PARTIAL_COLUMN_READ
+ | HA_TABLE_SCAN_ON_INDEX
+ | HA_CAN_FULLTEXT
+ | HA_CAN_FULLTEXT_EXT
+ /* JAN: TODO: MySQL 5.7
+ | HA_CAN_FULLTEXT_HINTS
+ */
+ | HA_CAN_EXPORT
+ | HA_ONLINE_ANALYZE
+ | HA_CAN_RTREEKEYS
+ | HA_CAN_TABLES_WITHOUT_ROLLBACK
+ | HA_CAN_ONLINE_BACKUPS
+ | HA_CONCURRENT_OPTIMIZE
+ | (srv_force_primary_key ? HA_REQUIRE_PRIMARY_KEY : 0)
+ ),
+ m_start_of_scan(),
+ m_mysql_has_locked()
+{}
+
+/*********************************************************************//**
+Destruct ha_innobase handler. */
+
+ha_innobase::~ha_innobase()
+/*======================*/
+{
+}
+
+/*********************************************************************//**
+Updates the user_thd field in a handle and also allocates a new InnoDB
+transaction handle if needed, and updates the transaction fields in the
+m_prebuilt struct. */
+void
+ha_innobase::update_thd(
+/*====================*/
+ THD* thd) /*!< in: thd to use the handle */
+{
+ DBUG_ENTER("ha_innobase::update_thd");
+ DBUG_PRINT("ha_innobase::update_thd", ("user_thd: %p -> %p",
+ m_user_thd, thd));
+
+ /* The table should have been opened in ha_innobase::open(). */
+ DBUG_ASSERT(m_prebuilt->table->get_ref_count() > 0);
+
+ trx_t* trx = check_trx_exists(thd);
+
+ ut_ad(trx->dict_operation_lock_mode == 0);
+ ut_ad(trx->dict_operation == TRX_DICT_OP_NONE);
+
+ if (m_prebuilt->trx != trx) {
+
+ row_update_prebuilt_trx(m_prebuilt, trx);
+ }
+
+ m_user_thd = thd;
+
+ DBUG_ASSERT(m_prebuilt->trx->magic_n == TRX_MAGIC_N);
+ DBUG_ASSERT(m_prebuilt->trx == thd_to_trx(m_user_thd));
+
+ DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Updates the user_thd field in a handle and also allocates a new InnoDB
+transaction handle if needed, and updates the transaction fields in the
+m_prebuilt struct. */
+
+void
+ha_innobase::update_thd()
+/*=====================*/
+{
+ THD* thd = ha_thd();
+
+ ut_ad(EQ_CURRENT_THD(thd));
+ update_thd(thd);
+}
+
+/*********************************************************************//**
+Registers an InnoDB transaction with the MySQL 2PC coordinator, so that
+the MySQL XA code knows to call the InnoDB prepare and commit, or rollback
+for the transaction. This MUST be called for every transaction for which
+the user may call commit or rollback. Calling this several times to register
+the same transaction is allowed, too. This function also registers the
+current SQL statement. */
+static inline
+void
+innobase_register_trx(
+/*==================*/
+ handlerton* hton, /* in: Innobase handlerton */
+ THD* thd, /* in: MySQL thd (connection) object */
+ trx_t* trx) /* in: transaction to register */
+{
+ ut_ad(!trx->active_commit_ordered);
+ const trx_id_t trx_id= trx->id;
+
+ trans_register_ha(thd, false, hton, trx_id);
+
+ if (!trx->is_registered)
+ {
+ trx->is_registered= true;
+ if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
+ trans_register_ha(thd, true, hton, trx_id);
+ }
+}
+
+/* BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
+ ------------------------------------------------------------
+
+1) The use of the query cache for TBL is disabled when there is an
+uncommitted change to TBL.
+
+2) When a change to TBL commits, InnoDB stores the current value of
+its global trx id counter, let us denote it by INV_TRX_ID, to the table object
+in the InnoDB data dictionary, and only allows transactions whose
+id <= INV_TRX_ID to use the query cache.
+
+3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or an implicit
+modification because of an ON DELETE CASCADE, we invalidate the MySQL query cache
+of TBL immediately.
+
+How this is implemented inside InnoDB:
+
+1) Since every modification always sets an IX type table lock on the InnoDB
+table, it is easy to check if there can be uncommitted modifications for a
+table: just check if there are locks in the lock list of the table.
+
+2) When a transaction inside InnoDB commits, it reads the global trx id
+counter and stores the value INV_TRX_ID to the tables on which it had a lock.
+
+3) If there is an implicit table change from ON DELETE CASCADE or SET NULL,
+InnoDB calls an invalidate method for the MySQL query cache for that table.
+
+How this is implemented inside sql_cache.cc:
+
+1) The query cache for an InnoDB table TBL is invalidated immediately at an
+INSERT/UPDATE/DELETE, just like in the case of MyISAM. No need to delay
+invalidation to the transaction commit.
+
+2) To store or retrieve a value from the query cache of an InnoDB table TBL,
+any query must first ask InnoDB's permission. We must pass the thd as a
+parameter because InnoDB will look at the trx id, if any, associated with
+that thd. Also the full_name which is used as key to search for the table
+object. The full_name is a string containing the normalized path to the
+table in the canonical format.
+
+3) Use of the query cache for InnoDB tables is now allowed also when
+AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer
+put restrictions on the use of the query cache.
+*/
+
+/** Check if mysql can allow the transaction to read from/store to
+the query cache.
+@param[in] table table object
+@param[in] trx transaction object
+@return whether the storing or retrieving from the query cache is permitted */
+static bool innobase_query_caching_table_check_low(
+ const dict_table_t* table,
+ trx_t* trx)
+{
+	/* The following conditions decide whether the query cache
+	may be used for retrieval or storing:
+
+	(1) There should not be any locks on the table.
+	(2) Some other trx should not have invalidated the cache after
+	this transaction started.
+	(3) A read view should not exist. If one exists, its
+	low_limit_id should be greater than or equal to the transaction that
+	invalidates the cache for the particular table.
+
+ For read-only transaction: should satisfy (1) and (3)
+ For read-write transaction: should satisfy (1), (2), (3) */
+
+ if (lock_table_get_n_locks(table)) {
+ return false;
+ }
+
+ if (trx->id && trx->id < table->query_cache_inv_trx_id) {
+ return false;
+ }
+
+ return !trx->read_view.is_open()
+ || trx->read_view.low_limit_id()
+ >= table->query_cache_inv_trx_id;
+}
+
+/** Checks if MySQL at the moment is allowed for this table to retrieve a
+consistent read result, or store it to the query cache.
+@param[in,out] trx transaction
+@param[in] norm_name concatenation of database name,
+ '/' char, table name
+@return whether storing or retrieving from the query cache is permitted */
+static bool innobase_query_caching_table_check(
+ trx_t* trx,
+ const char* norm_name)
+{
+ dict_table_t* table = dict_table_open_on_name(
+ norm_name, FALSE, FALSE, DICT_ERR_IGNORE_FK_NOKEY);
+
+ if (table == NULL) {
+ return false;
+ }
+
+ /* Start the transaction if it is not started yet */
+ trx_start_if_not_started(trx, false);
+
+ bool allow = innobase_query_caching_table_check_low(table, trx);
+
+ dict_table_close(table, FALSE, FALSE);
+
+ if (allow) {
+ /* If the isolation level is high, assign a read view for the
+ transaction if it does not yet have one */
+
+ if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
+ && !srv_read_only_mode
+ && !trx->read_view.is_open()) {
+
+ /* Start the transaction if it is not started yet */
+ trx_start_if_not_started(trx, false);
+
+ trx->read_view.open(trx);
+ }
+ }
+
+ return allow;
+}
+
+/******************************************************************//**
+The MySQL query cache uses this to check from InnoDB if the query cache at
+the moment is allowed to operate on an InnoDB table. The SQL query must
+be a non-locking SELECT.
+
+The query cache is allowed to operate on a certain query only if this function
+returns TRUE for all tables in the query.
+
+If thd is not in the autocommit state, this function also starts a new
+transaction for thd if there is no active trx yet, and assigns a consistent
+read view to it if there is no read view yet.
+
+Why a deadlock of threads is not possible: the query cache calls this function
+at the start of a SELECT processing. Then the calling thread cannot be
+holding any InnoDB semaphores. The calling thread is holding the
+query cache mutex, and this function will reserve the InnoDB trx_sys.mutex.
+Thus, the 'rank' in sync0mutex.h of the MySQL query cache mutex is above
+the InnoDB trx_sys.mutex.
+@return TRUE if permitted, FALSE if not; note that the value FALSE
+does not mean we should invalidate the query cache: invalidation is
+called explicitly */
+static
+my_bool
+innobase_query_caching_of_table_permitted(
+/*======================================*/
+ THD* thd, /*!< in: thd of the user who is trying to
+ store a result to the query cache or
+ retrieve it */
+ const char* full_name, /*!< in: normalized path to the table */
+ uint full_name_len, /*!< in: length of the normalized path
+ to the table */
+ ulonglong *)
+{
+ char norm_name[1000];
+ trx_t* trx = check_trx_exists(thd);
+
+ ut_a(full_name_len < 999);
+
+ if (trx->isolation_level == TRX_ISO_SERIALIZABLE) {
+ /* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every
+ plain SELECT if AUTOCOMMIT is not on. */
+
+ return(false);
+ }
+
+ if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)
+ && trx->n_mysql_tables_in_use == 0) {
+ /* We are going to retrieve the query result from the query
+ cache. This cannot be a store operation to the query cache
+ because then MySQL would have locks on tables already.
+
+ TODO: if the user has used LOCK TABLES to lock the table,
+ then we open a transaction in the call of row_.. below.
+ That trx can stay open until UNLOCK TABLES. The same problem
+ exists even if we do not use the query cache. MySQL should be
+ modified so that it ALWAYS calls some cleanup function when
+ the processing of a query ends!
+
+ We can imagine we instantaneously serialize this consistent
+ read trx to the current trx id counter. If trx2 would have
+ changed the tables of a query result stored in the cache, and
+ trx2 would have already committed, making the result obsolete,
+ then trx2 would have already invalidated the cache. Thus we
+ can trust the result in the cache is ok for this query. */
+
+ return(true);
+ }
+
+ /* Normalize the table name to InnoDB format */
+ normalize_table_name(norm_name, full_name);
+
+ innobase_register_trx(innodb_hton_ptr, thd, trx);
+
+ return innobase_query_caching_table_check(trx, norm_name);
+}
+
+/*****************************************************************//**
+Invalidates the MySQL query cache for the table. */
+void
+innobase_invalidate_query_cache(
+/*============================*/
+ trx_t* trx, /*!< in: transaction which
+ modifies the table */
+ const char* full_name) /*!< in: concatenation of
+ database name, path separator,
+	table name, and a terminating NUL;
+ NOTE that in Windows this is
+ always in LOWER CASE! */
+{
+ /* Note that the sync0mutex.h rank of the query cache mutex is just
+ above the InnoDB trx_sys_t->lock. The caller of this function must
+ not have latches of a lower rank. */
+
+#ifdef HAVE_QUERY_CACHE
+ char qcache_key_name[2 * (NAME_LEN + 1)];
+ char db_name[NAME_CHAR_LEN * MY_CS_MBMAXLEN + 1];
+ const char *key_ptr;
+ size_t tabname_len;
+
+ // Extract the database name.
+ key_ptr= strchr(full_name, '/');
+ DBUG_ASSERT(key_ptr != NULL); // Database name should be present
+ size_t dbname_len= size_t(key_ptr - full_name);
+ memcpy(db_name, full_name, dbname_len);
+ db_name[dbname_len]= '\0';
+
+ /* Construct the key("db-name\0table$name\0") for the query cache using
+ the path name("db@002dname\0table@0024name\0") of the table in its
+ canonical form. */
+ dbname_len = filename_to_tablename(db_name, qcache_key_name,
+ sizeof(qcache_key_name));
+ tabname_len = filename_to_tablename(++key_ptr,
+ (qcache_key_name + dbname_len + 1),
+ sizeof(qcache_key_name) -
+ dbname_len - 1);
+
+ /* Argument TRUE below means we are using transactions */
+ mysql_query_cache_invalidate4(trx->mysql_thd,
+ qcache_key_name,
+ uint(dbname_len + tabname_len + 2),
+ TRUE);
+#endif
+}
+
+/** Quote a standard SQL identifier like index or column name.
+@param[in] file output stream
+@param[in] trx InnoDB transaction, or NULL
+@param[in] id identifier to quote */
+void
+innobase_quote_identifier(
+ FILE* file,
+ trx_t* trx,
+ const char* id)
+{
+ const int q = trx != NULL && trx->mysql_thd != NULL
+ ? get_quote_char_for_identifier(trx->mysql_thd, id, strlen(id))
+ : '`';
+
+ if (q == EOF) {
+ fputs(id, file);
+ } else {
+ putc(q, file);
+
+ while (int c = *id++) {
+ if (c == q) {
+ putc(c, file);
+ }
+ putc(c, file);
+ }
+
+ putc(q, file);
+ }
+}
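+
+/* Usage sketch (illustrative; the concrete arguments are assumptions):
+with trx == NULL the default quote character '`' is used, so
+	innobase_quote_identifier(stderr, NULL, "my`index");
+would print `my``index` -- the loop above doubles any embedded quote
+character. */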
+
+/** Quote a standard SQL identifier like tablespace, index or column name.
+@param[in] trx InnoDB transaction, or NULL
+@param[in] id identifier to quote
+@return quoted identifier */
+std::string
+innobase_quote_identifier(
+/*======================*/
+ trx_t* trx,
+ const char* id)
+{
+ std::string quoted_identifier;
+ const int q = trx != NULL && trx->mysql_thd != NULL
+ ? get_quote_char_for_identifier(trx->mysql_thd, id, strlen(id))
+ : '`';
+
+ if (q == EOF) {
+ quoted_identifier.append(id);
+ } else {
+ quoted_identifier += char(q);
+ quoted_identifier.append(id);
+ quoted_identifier += char(q);
+ }
+
+ return (quoted_identifier);
+}
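+
+/* Usage sketch (illustrative): with trx == NULL the default quote
+character '`' is used, so innobase_quote_identifier(NULL, "idx_a")
+returns the std::string "`idx_a`". Note that unlike the FILE* variant
+above, this overload does not double embedded quote characters. */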
+
+/** Convert a table name to the MySQL system_charset_info (UTF-8)
+and quote it.
+@param[out] buf buffer for converted identifier
+@param[in] buflen length of buf, in bytes
+@param[in] id identifier to convert
+@param[in] idlen length of id, in bytes
+@param[in] thd MySQL connection thread, or NULL
+@return pointer to the end of buf */
+static
+char*
+innobase_convert_identifier(
+ char* buf,
+ ulint buflen,
+ const char* id,
+ ulint idlen,
+ THD* thd)
+{
+ const char* s = id;
+
+ char nz[MAX_TABLE_NAME_LEN + 1];
+ char nz2[MAX_TABLE_NAME_LEN + 1];
+
+ /* Decode the table name. The MySQL function expects
+ a NUL-terminated string. The input and output string
+ buffers must not be shared. */
+ ut_a(idlen <= MAX_TABLE_NAME_LEN);
+ memcpy(nz, id, idlen);
+ nz[idlen] = 0;
+
+ s = nz2;
+ idlen = explain_filename(thd, nz, nz2, sizeof nz2,
+ EXPLAIN_PARTITIONS_AS_COMMENT);
+ if (idlen > buflen) {
+ idlen = buflen;
+ }
+ memcpy(buf, s, idlen);
+ return(buf + idlen);
+}
+
+/*****************************************************************//**
+Convert a table name to the MySQL system_charset_info (UTF-8).
+@return pointer to the end of buf */
+char*
+innobase_convert_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* id, /*!< in: table name to convert */
+ ulint idlen, /*!< in: length of id, in bytes */
+ THD* thd) /*!< in: MySQL connection thread, or NULL */
+{
+ char* s = buf;
+ const char* bufend = buf + buflen;
+
+ const char* slash = (const char*) memchr(id, '/', idlen);
+
+ if (slash == NULL) {
+ return(innobase_convert_identifier(
+ buf, buflen, id, idlen, thd));
+ }
+
+ /* Print the database name and table name separately. */
+ s = innobase_convert_identifier(s, ulint(bufend - s),
+ id, ulint(slash - id), thd);
+ if (s < bufend) {
+ *s++ = '.';
+ s = innobase_convert_identifier(s, ulint(bufend - s),
+ slash + 1, idlen
+ - ulint(slash - id) - 1,
+ thd);
+ }
+
+ return(s);
+}
+
+/*****************************************************************//**
+A wrapper function of innobase_convert_name(), which converts a table name
+to the MySQL system_charset_info (UTF-8) and quotes it if needed; the
+result in buf is NUL-terminated. */
+void
+innobase_format_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* name) /*!< in: table name to format */
+{
+ const char* bufend;
+
+ bufend = innobase_convert_name(buf, buflen, name, strlen(name), NULL);
+
+ ut_ad((ulint) (bufend - buf) < buflen);
+
+ buf[bufend - buf] = '\0';
+}
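+
+/* Example (illustrative; the exact output is produced by the server's
+explain_filename()): a name such as "test/t@0024" has its
+filename-encoded parts decoded (@0024 -> '$') and is formatted roughly
+as `test`.`t$`, NUL-terminated in buf. */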
+
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted.
+@return true if interrupted */
+bool
+trx_is_interrupted(
+/*===============*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ return(trx && trx->mysql_thd && thd_kill_level(trx->mysql_thd));
+}
+
+/**************************************************************//**
+Resets some fields of a m_prebuilt struct. The template is used in fast
+retrieval of just those column values MySQL needs in its processing. */
+void
+ha_innobase::reset_template(void)
+/*=============================*/
+{
+ ut_ad(m_prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+ ut_ad(m_prebuilt->magic_n2 == m_prebuilt->magic_n);
+
+ /* Force table to be freed in close_thread_table(). */
+ DBUG_EXECUTE_IF("free_table_in_fts_query",
+ if (m_prebuilt->in_fts_query) {
+ table->mark_table_for_reopen();
+ }
+ );
+
+ m_prebuilt->keep_other_fields_on_keyread = false;
+ m_prebuilt->read_just_key = 0;
+ m_prebuilt->in_fts_query = 0;
+
+ /* Reset index condition pushdown state. */
+ if (m_prebuilt->idx_cond) {
+ m_prebuilt->idx_cond = NULL;
+ m_prebuilt->idx_cond_n_cols = 0;
+ /* Invalidate m_prebuilt->mysql_template
+ in ha_innobase::write_row(). */
+ m_prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
+ }
+ if (m_prebuilt->pk_filter) {
+ m_prebuilt->pk_filter = NULL;
+ m_prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
+ }
+}
+
+/*****************************************************************//**
+Call this when you have opened a new table handle in HANDLER, before you
+call index_read_map() etc. We can even let the cursor stay open across a
+transaction commit; in that case, call this before every operation (fetch
+next etc.). This function initializes the necessary state even after a
+transaction commit. */
+
+void
+ha_innobase::init_table_handle_for_HANDLER(void)
+/*============================================*/
+{
+ /* If current thd does not yet have a trx struct, create one.
+ If the current handle does not yet have a m_prebuilt struct, create
+ one. Update the trx pointers in the m_prebuilt struct. Normally
+ this operation is done in external_lock. */
+
+ update_thd(ha_thd());
+
+ /* Initialize the m_prebuilt struct much like it would be inited in
+ external_lock */
+
+ /* If the transaction is not started yet, start it */
+
+ trx_start_if_not_started_xa(m_prebuilt->trx, false);
+
+ /* Assign a read view if the transaction does not have it yet */
+
+ m_prebuilt->trx->read_view.open(m_prebuilt->trx);
+
+ innobase_register_trx(ht, m_user_thd, m_prebuilt->trx);
+
+ /* We did the necessary inits in this function, no need to repeat them
+ in row_search_for_mysql */
+
+ m_prebuilt->sql_stat_start = FALSE;
+
+ /* We always let HANDLER do the reads as consistent reads, even
+ if the trx isolation level would have been specified as SERIALIZABLE */
+
+ m_prebuilt->select_lock_type = LOCK_NONE;
+ m_prebuilt->stored_select_lock_type = LOCK_NONE;
+
+ /* Always fetch all columns in the index record */
+
+ m_prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS;
+
+ /* We always fetch all columns in the whole row... or do we? */
+
+ m_prebuilt->used_in_HANDLER = TRUE;
+
+ reset_template();
+}
+
+/*********************************************************************//**
+Free any resources that were allocated and return failure.
+@return always return 1 */
+static int innodb_init_abort()
+{
+ DBUG_ENTER("innodb_init_abort");
+
+ if (fil_system.temp_space) {
+ fil_system.temp_space->close();
+ }
+
+ srv_sys_space.shutdown();
+ if (srv_tmp_space.get_sanity_check_status()) {
+ srv_tmp_space.delete_files();
+ }
+ srv_tmp_space.shutdown();
+
+#ifdef WITH_INNODB_DISALLOW_WRITES
+ os_event_destroy(srv_allow_writes_event);
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+ DBUG_RETURN(1);
+}
+
+static const char* deprecated_innodb_checksum_algorithm
+ = "Setting innodb_checksum_algorithm to values other than"
+ " crc32, full_crc32, strict_crc32 or strict_full_crc32"
+ " is UNSAFE and DEPRECATED."
+ " These deprecated values will be disallowed in MariaDB 10.6.";
+
+static void innodb_checksum_algorithm_update(THD *thd, st_mysql_sys_var*,
+ void *, const void *save)
+{
+ srv_checksum_algorithm= *static_cast<const ulong*>(save);
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ break;
+ default:
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ deprecated_innodb_checksum_algorithm);
+ }
+}
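+
+/* Example (illustrative): SET GLOBAL innodb_checksum_algorithm=innodb
+would still take effect, but it falls into the default: branch above
+and pushes the deprecation warning to the issuing session. */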
+
+/****************************************************************//**
+Gives the file extension of an InnoDB single-table tablespace. */
+static const char* ha_innobase_exts[] = {
+ dot_ext[IBD],
+ dot_ext[ISL],
+ NullS
+};
+
+/** Determine if system-versioned data was modified by the transaction.
+@param[in,out] thd current session
+@param[out] trx_id transaction start ID
+@return transaction commit ID
+@retval 0 if no system-versioned data was affected by the transaction */
+static ulonglong innodb_prepare_commit_versioned(THD* thd, ulonglong *trx_id)
+{
+ if (const trx_t* trx = thd_to_trx(thd)) {
+ *trx_id = trx->id;
+
+ for (trx_mod_tables_t::const_iterator t
+ = trx->mod_tables.begin();
+ t != trx->mod_tables.end(); t++) {
+ if (t->second.is_versioned()) {
+ DBUG_ASSERT(t->first->versioned_by_id());
+ DBUG_ASSERT(trx->rsegs.m_redo.rseg);
+
+ return trx_sys.get_new_trx_id();
+ }
+ }
+
+ return 0;
+ }
+
+ *trx_id = 0;
+ return 0;
+}
+
+/** Initialize and normalize innodb_buffer_pool_size. */
+static void innodb_buffer_pool_size_init()
+{
+ if (srv_buf_pool_chunk_unit > srv_buf_pool_size) {
+ /* The buffer pool chunk unit is larger than srv_buf_pool_size;
+ adjust srv_buf_pool_chunk_unit down to srv_buf_pool_size. */
+ srv_buf_pool_chunk_unit = ulong(srv_buf_pool_size);
+ }
+
+ srv_buf_pool_size = buf_pool_size_align(srv_buf_pool_size);
+ innobase_buffer_pool_size = srv_buf_pool_size;
+}
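+
+/* Example (illustrative numbers): with srv_buf_pool_chunk_unit of
+128 MiB and srv_buf_pool_size of 96 MiB, the chunk unit is first
+clamped down to 96 MiB, and the pool size is then realigned by
+buf_pool_size_align(). */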
+
+namespace deprecated {
+/** Deprecated; no effect other than issuing a deprecation warning. */
+char* innodb_file_format;
+/** Deprecated; no effect other than issuing a deprecation warning. */
+char* innodb_large_prefix;
+
+/** Deprecated parameter with no effect */
+static my_bool innodb_log_checksums;
+/** Deprecation message for innodb_log_checksums */
+static const char* innodb_log_checksums_msg
+= "The parameter innodb_log_checksums is deprecated and has no effect.";
+/** Deprecated parameter with no effect */
+static my_bool innodb_log_compressed_pages;
+/** Deprecation message for innodb_log_compressed_pages */
+static const char* innodb_log_compressed_pages_msg
+= "The parameter innodb_log_compressed_pages is deprecated and has no effect.";
+/** Deprecated parameter with no effect */
+static my_bool innodb_log_optimize_ddl;
+static const char* innodb_log_optimize_ddl_msg
+= "The parameter innodb_log_optimize_ddl is deprecated and has no effect.";
+/** Deprecated parameter with no effect */
+static my_bool innodb_scrub_log;
+/** Deprecation message for innodb_scrub_log */
+static const char* innodb_scrub_log_msg
+= "The parameter innodb_scrub_log is deprecated and has no effect.";
+/** Deprecated parameter with no effect */
+static ulonglong innodb_scrub_log_speed;
+/** Deprecation message for innodb_scrub_log_speed */
+static const char* innodb_scrub_log_speed_msg
+= "The parameter innodb_scrub_log_speed is deprecated and has no effect.";
+/** Deprecated parameter with no effect */
+static ulong innodb_undo_logs;
+/** Deprecation message for innodb_undo_logs */
+static const char* innodb_undo_logs_msg
+= "The parameter innodb_undo_logs is deprecated and has no effect.";
+/** Deprecated parameter with no effect */
+static ulong innodb_buffer_pool_instances;
+/** Deprecated parameter with no effect */
+static ulong innodb_page_cleaners;
+static const char* innodb_page_cleaners_msg
+= "The parameter innodb_page_cleaners is deprecated and has no effect.";
+
+ulong srv_n_log_files;
+static const char* srv_n_log_files_msg
+= "The parameter innodb_log_files_in_group is deprecated and has no effect.";
+
+static my_bool innodb_background_scrub_data_uncompressed;
+
+static const char* innodb_background_scrub_data_uncompressed_msg
+= "The parameter innodb_background_scrub_data_uncompressed is deprecated and"
+ " has no effect.";
+
+static my_bool innodb_background_scrub_data_compressed;
+
+static const char* innodb_background_scrub_data_compressed_msg
+= "The parameter innodb_background_scrub_data_compressed is deprecated and"
+ " has no effect.";
+
+static uint innodb_background_scrub_data_check_interval;
+
+static const char* innodb_background_scrub_data_check_interval_msg
+= "The parameter innodb_background_scrub_data_check_interval is deprecated and"
+ " has no effect.";
+
+static uint innodb_background_scrub_data_interval;
+
+static const char* innodb_background_scrub_data_interval_msg
+= "The parameter innodb_background_scrub_data_interval is deprecated and"
+ " has no effect.";
+
+uint replication_delay;
+uint thread_concurrency;
+uint commit_concurrency;
+uint concurrency_tickets;
+uint adaptive_max_sleep_delay;
+uint thread_sleep_delay;
+
+static const char * const replication_delay_msg
+= "The parameter innodb_replication_delay is deprecated and has no effect.";
+static const char * const thread_concurrency_msg
+= "The parameter innodb_thread_concurrency is deprecated and has no effect.";
+static const char * const commit_concurrency_msg
+= "The parameter innodb_commit_concurrency is deprecated and has no effect.";
+static const char * const concurrency_tickets_msg
+= "The parameter innodb_concurrency_tickets is deprecated and has no effect.";
+static const char * const adaptive_max_sleep_delay_msg
+= "The parameter innodb_adaptive_max_sleep_delay is deprecated and"
+ " has no effect.";
+static const char * const thread_sleep_delay_msg
+= "The parameter innodb_thread_sleep_delay is deprecated and has no effect.";
+
+static void replication_delay_warn(THD* thd, st_mysql_sys_var*, void*,
+ const void*)
+{
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+ replication_delay_msg);
+}
+static void thread_concurrency_warn(THD* thd, st_mysql_sys_var*, void*,
+ const void*)
+{
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+ thread_concurrency_msg);
+}
+static void commit_concurrency_warn(THD* thd, st_mysql_sys_var*, void*,
+ const void*)
+{
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+ commit_concurrency_msg);
+}
+static void concurrency_tickets_warn(THD* thd, st_mysql_sys_var*, void*,
+ const void*)
+{
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+ concurrency_tickets_msg);
+}
+static void adaptive_max_sleep_delay_warn(THD* thd, st_mysql_sys_var*, void*,
+ const void*)
+{
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+ adaptive_max_sleep_delay_msg);
+}
+static void thread_sleep_delay_warn(THD* thd, st_mysql_sys_var*, void*,
+ const void*)
+{
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
+ thread_sleep_delay_msg);
+}
+
+} // namespace deprecated
+
+/** Initialize, validate and normalize the InnoDB startup parameters.
+@return failure code
+@retval 0 on success
+@retval HA_ERR_OUT_OF_MEM when out of memory
+@retval HA_ERR_INITIALIZATION when some parameters are out of range */
+static int innodb_init_params()
+{
+ DBUG_ENTER("innodb_init_params");
+
+ static char current_dir[3];
+ char *default_path;
+ ulong num_pll_degree;
+
+ if (deprecated::innodb_large_prefix || deprecated::innodb_file_format) {
+ const char* p = deprecated::innodb_file_format
+ ? "file_format"
+ : "large_prefix";
+ sql_print_warning("The parameter innodb_%s is deprecated"
+ " and has no effect."
+ " It may be removed in future releases."
+ " See https://mariadb.com/kb/en/library/"
+ "xtradbinnodb-file-format/", p);
+ }
+
+ if (UNIV_UNLIKELY(!deprecated::innodb_log_checksums)) {
+ sql_print_warning(deprecated::innodb_log_checksums_msg);
+ deprecated::innodb_log_checksums = TRUE;
+ }
+
+ if (UNIV_UNLIKELY(!deprecated::innodb_log_compressed_pages)) {
+ sql_print_warning(deprecated::innodb_log_compressed_pages_msg);
+ deprecated::innodb_log_compressed_pages = TRUE;
+ }
+
+ if (UNIV_UNLIKELY(deprecated::innodb_log_optimize_ddl)) {
+ sql_print_warning(deprecated::innodb_log_optimize_ddl_msg);
+ deprecated::innodb_log_optimize_ddl = FALSE;
+ }
+
+ if (UNIV_UNLIKELY(deprecated::innodb_scrub_log)) {
+ sql_print_warning(deprecated::innodb_scrub_log_msg);
+ deprecated::innodb_scrub_log = FALSE;
+ }
+
+ if (UNIV_UNLIKELY(deprecated::innodb_scrub_log_speed != 256)) {
+ sql_print_warning(deprecated::innodb_scrub_log_speed_msg);
+ deprecated::innodb_scrub_log_speed = 256;
+ }
+
+ if (UNIV_UNLIKELY(deprecated::innodb_buffer_pool_instances)) {
+ sql_print_warning("The parameter innodb_buffer_pool_instances"
+ " is deprecated and has no effect.");
+ }
+
+ if (UNIV_UNLIKELY(deprecated::innodb_page_cleaners)) {
+ sql_print_warning(deprecated::innodb_page_cleaners_msg);
+ }
+
+ if (UNIV_UNLIKELY(deprecated::srv_n_log_files != 1)) {
+ sql_print_warning(deprecated::srv_n_log_files_msg);
+ deprecated::srv_n_log_files = 1;
+ }
+
+ deprecated::innodb_buffer_pool_instances = 1;
+
+ deprecated::innodb_page_cleaners = 1;
+
+ if (UNIV_UNLIKELY(deprecated::innodb_undo_logs != TRX_SYS_N_RSEGS)) {
+ sql_print_warning(deprecated::innodb_undo_logs_msg);
+ deprecated::innodb_undo_logs = TRX_SYS_N_RSEGS;
+ }
+
+ if (UNIV_UNLIKELY(deprecated::replication_delay)) {
+ sql_print_warning(deprecated::replication_delay_msg);
+ deprecated::replication_delay = 0;
+ }
+ if (UNIV_UNLIKELY(deprecated::thread_concurrency)) {
+ sql_print_warning(deprecated::thread_concurrency_msg);
+ deprecated::thread_concurrency = 0;
+ }
+ if (UNIV_UNLIKELY(deprecated::commit_concurrency)) {
+ sql_print_warning(deprecated::commit_concurrency_msg);
+ deprecated::commit_concurrency = 0;
+ }
+ if (UNIV_UNLIKELY(deprecated::concurrency_tickets)) {
+ sql_print_warning(deprecated::concurrency_tickets_msg);
+ deprecated::concurrency_tickets = 0;
+ }
+ if (UNIV_UNLIKELY(deprecated::adaptive_max_sleep_delay)) {
+ sql_print_warning(deprecated::adaptive_max_sleep_delay_msg);
+ deprecated::adaptive_max_sleep_delay = 0;
+ }
+ if (UNIV_UNLIKELY(deprecated::thread_sleep_delay)) {
+ sql_print_warning(deprecated::thread_sleep_delay_msg);
+ deprecated::thread_sleep_delay = 0;
+ }
+
+ /* Check that values don't overflow on 32-bit systems. */
+ if (sizeof(ulint) == 4) {
+ if (innobase_buffer_pool_size > UINT_MAX32) {
+ sql_print_error(
+ "innodb_buffer_pool_size can't be over 4GB"
+ " on 32-bit systems");
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+ }
+ }
+
+ /* The buffer pool needs to be able to accommodate a sufficient
+ number of pages, even with larger page sizes */
+ if (srv_page_size > UNIV_PAGE_SIZE_DEF
+ && innobase_buffer_pool_size < (24 * 1024 * 1024)) {
+ ib::error() << "innodb_page_size="
+ << srv_page_size << " requires "
+ << "innodb_buffer_pool_size > 24M current "
+ << innobase_buffer_pool_size;
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+
+ if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) {
+ ib::warn() << "The parameter innodb_lock_schedule_algorithm"
+ " is deprecated, and the setting"
+ " innodb_lock_schedule_algorithm=vats"
+ " may cause corruption. The parameter may be removed"
+ " in future releases.";
+
+#ifdef WITH_WSREP
+ /* Currently, Galera does not support VATS lock schedule algorithm. */
+ if (global_system_variables.wsrep_on) {
+ ib::info() << "For Galera, using innodb_lock_schedule_algorithm=fcfs";
+ innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS;
+ }
+#endif /* WITH_WSREP */
+ }
+
+#ifdef WITH_WSREP
+ /* Print deprecation info if xtrabackup is used for SST method */
+ if (global_system_variables.wsrep_on
+ && wsrep_sst_method
+ && (!strcmp(wsrep_sst_method, "xtrabackup")
+ || !strcmp(wsrep_sst_method, "xtrabackup-v2"))) {
+ ib::info() << "Galera SST method xtrabackup is deprecated and the"
+ " support for it may be removed in future releases.";
+ }
+#endif /* WITH_WSREP */
+
+#ifndef HAVE_LZ4
+ if (innodb_compression_algorithm == PAGE_LZ4_ALGORITHM) {
+ sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: liblz4 is not installed. \n",
+ innodb_compression_algorithm);
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+#endif
+
+#ifndef HAVE_LZO
+ if (innodb_compression_algorithm == PAGE_LZO_ALGORITHM) {
+ sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: liblzo is not installed. \n",
+ innodb_compression_algorithm);
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+#endif
+
+#ifndef HAVE_LZMA
+ if (innodb_compression_algorithm == PAGE_LZMA_ALGORITHM) {
+ sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: liblzma is not installed. \n",
+ innodb_compression_algorithm);
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+#endif
+
+#ifndef HAVE_BZIP2
+ if (innodb_compression_algorithm == PAGE_BZIP2_ALGORITHM) {
+ sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: libbz2 is not installed. \n",
+ innodb_compression_algorithm);
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+#endif
+
+#ifndef HAVE_SNAPPY
+ if (innodb_compression_algorithm == PAGE_SNAPPY_ALGORITHM) {
+ sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: libsnappy is not installed. \n",
+ innodb_compression_algorithm);
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+#endif
+
+ if ((srv_encrypt_tables || srv_encrypt_log
+ || innodb_encrypt_temporary_tables)
+ && !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) {
+ sql_print_error("InnoDB: cannot enable encryption, "
+ "encryption plugin is not available");
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+
+#ifdef _WIN32
+ if (!is_filename_allowed(srv_buf_dump_filename,
+ strlen(srv_buf_dump_filename), FALSE)) {
+ sql_print_error("InnoDB: innodb_buffer_pool_filename"
+ " cannot have colon (:) in the file name.");
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+#endif
+
+ /* First calculate the default path for innodb_data_home_dir etc.,
+ in case the user has not given any value.
+
+ Note that when using the embedded server, the data directory is not
+ necessarily the current directory of this program. */
+
+ if (mysqld_embedded) {
+ default_path = mysql_real_data_home;
+ } else {
+ /* It is better to use the current directory (FN_CURLIB), to keep paths short */
+ current_dir[0] = FN_CURLIB;
+ current_dir[1] = FN_LIBCHAR;
+ current_dir[2] = 0;
+ default_path = current_dir;
+ }
+
+ ut_a(default_path);
+
+ fil_path_to_mysql_datadir = default_path;
+
+ /* Set InnoDB initialization parameters according to the values
+ read from MySQL .cnf file */
+
+ /* The default dir for data files is the datadir of MySQL */
+
+ srv_data_home = innobase_data_home_dir
+ ? innobase_data_home_dir : default_path;
+#ifdef WITH_WSREP
+ /* If we use the wsrep API, then we need to tell the server
+ the path to the data files (for passing it to the SST scripts): */
+ wsrep_set_data_home_dir(srv_data_home);
+#endif /* WITH_WSREP */
+
+ /*--------------- Shared tablespaces -------------------------*/
+
+ /* Check that the value of system variable innodb_page_size was
+ set correctly. Its value was put into srv_page_size. If valid,
+ return the associated srv_page_size_shift. */
+ srv_page_size_shift = innodb_page_size_validate(srv_page_size);
+ if (!srv_page_size_shift) {
+ sql_print_error("InnoDB: Invalid page size=%lu.\n",
+ srv_page_size);
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+
+ srv_sys_space.set_space_id(TRX_SYS_SPACE);
+
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ srv_sys_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER
+ | FSP_FLAGS_FCRC32_PAGE_SSIZE());
+ break;
+ default:
+ srv_sys_space.set_flags(FSP_FLAGS_PAGE_SSIZE());
+ }
+
+ srv_sys_space.set_name("innodb_system");
+ srv_sys_space.set_path(srv_data_home);
+
+ /* Supports raw devices */
+ if (!srv_sys_space.parse_params(innobase_data_file_path, true)) {
+ ib::error() << "Unable to parse innodb_data_file_path="
+ << innobase_data_file_path;
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+
+ srv_tmp_space.set_name("innodb_temporary");
+ srv_tmp_space.set_path(srv_data_home);
+
+ /* Temporary tablespace is in full crc32 format. */
+ srv_tmp_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER
+ | FSP_FLAGS_FCRC32_PAGE_SSIZE());
+
+ if (!srv_tmp_space.parse_params(innobase_temp_data_file_path, false)) {
+ ib::error() << "Unable to parse innodb_temp_data_file_path="
+ << innobase_temp_data_file_path;
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+
+ /* Perform all sanity checks before we take the action of deleting files */
+ if (srv_sys_space.intersection(&srv_tmp_space)) {
+ sql_print_error("%s and %s file names seem to be the same.",
+ srv_tmp_space.name(), srv_sys_space.name());
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+
+ srv_sys_space.normalize_size();
+ srv_tmp_space.normalize_size();
+
+ /* ------------ UNDO tablespaces files ---------------------*/
+ if (!srv_undo_dir) {
+ srv_undo_dir = default_path;
+ }
+
+ os_normalize_path(srv_undo_dir);
+
+ if (strchr(srv_undo_dir, ';')) {
+ sql_print_error("syntax error in innodb_undo_directory");
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+
+ /* -------------- All log files ---------------------------*/
+
+ /* The default dir for log files is the datadir of MySQL */
+
+ if (!srv_log_group_home_dir) {
+ srv_log_group_home_dir = default_path;
+ }
+
+ os_normalize_path(srv_log_group_home_dir);
+
+ if (strchr(srv_log_group_home_dir, ';')) {
+ sql_print_error("syntax error in innodb_log_group_home_dir");
+ DBUG_RETURN(HA_ERR_INITIALIZATION);
+ }
+
+ DBUG_ASSERT(innodb_change_buffering <= IBUF_USE_ALL);
+
+ /* Check that interdependent parameters have sane values. */
+ if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) {
+ sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm"
+ " cannot be set higher than"
+ " innodb_max_dirty_pages_pct.\n"
+ "InnoDB: Setting"
+ " innodb_max_dirty_pages_pct_lwm to %lf\n",
+ srv_max_buf_pool_modified_pct);
+
+ srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct;
+ }
+
+ if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) {
+
+ if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) {
+ /* Avoid overflow. */
+ srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT;
+ } else {
+ /* The user has not set the value. We should
+ set it based on innodb_io_capacity. */
+ srv_max_io_capacity =
+ ut_max(2 * srv_io_capacity, 2000UL);
+ }
+
+ } else if (srv_max_io_capacity < srv_io_capacity) {
+ sql_print_warning("InnoDB: innodb_io_capacity"
+ " cannot be set higher than"
+ " innodb_io_capacity_max."
+ "Setting innodb_io_capacity=%lu",
+ srv_max_io_capacity);
+
+ srv_io_capacity = srv_max_io_capacity;
+ }
+
+ if (UNIV_PAGE_SIZE_DEF != srv_page_size) {
+ ib::info() << "innodb_page_size=" << srv_page_size;
+
+ srv_max_undo_log_size = std::max(
+ srv_max_undo_log_size,
+ ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES)
+ << srv_page_size_shift);
+ }
+
+ if (srv_log_write_ahead_size > srv_page_size) {
+ srv_log_write_ahead_size = srv_page_size;
+ } else {
+ ulong srv_log_write_ahead_size_tmp = OS_FILE_LOG_BLOCK_SIZE;
+
+ while (srv_log_write_ahead_size_tmp
+ < srv_log_write_ahead_size) {
+ srv_log_write_ahead_size_tmp
+ = srv_log_write_ahead_size_tmp * 2;
+ }
+ if (srv_log_write_ahead_size_tmp
+ != srv_log_write_ahead_size) {
+ srv_log_write_ahead_size
+ = srv_log_write_ahead_size_tmp / 2;
+ }
+ }
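+
+ /* Illustrative example (assuming OS_FILE_LOG_BLOCK_SIZE == 512):
+ a requested srv_log_write_ahead_size of 3000 is not a power of
+ two; the probe grows 512, 1024, 2048, 4096, and the value is
+ rounded down to 4096 / 2 = 2048. */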
+
+ srv_buf_pool_size = ulint(innobase_buffer_pool_size);
+
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ break;
+ default:
+ ib::warn() << deprecated_innodb_checksum_algorithm;
+ }
+
+ row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout;
+
+ if (innobase_open_files < 10) {
+ innobase_open_files = 300;
+ if (srv_file_per_table && tc_size > 300 && tc_size < open_files_limit) {
+ innobase_open_files = tc_size;
+ }
+ }
+
+ if (innobase_open_files > open_files_limit) {
+ ib::warn() << "innodb_open_files " << innobase_open_files
+ << " should not be greater"
+ << " than the open_files_limit " << open_files_limit;
+ if (innobase_open_files > tc_size) {
+ innobase_open_files = tc_size;
+ }
+ }
+
+ srv_max_n_open_files = innobase_open_files;
+ srv_innodb_status = (ibool) innobase_create_status_file;
+
+ srv_print_verbose_log = mysqld_embedded ? 0 : 1;
+
+ /* Round fts_sort_pll_degree up to the nearest power of 2 */
+ for (num_pll_degree = 1;
+ num_pll_degree < fts_sort_pll_degree;
+ num_pll_degree <<= 1) {
+
+ /* No op */
+ }
+
+ fts_sort_pll_degree = num_pll_degree;
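+
+ /* Illustrative example: fts_sort_pll_degree == 3 is rounded up to
+ num_pll_degree == 4 (the probe doubles 1 -> 2 -> 4). */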
+
+ /* Store the default charset-collation number of this MySQL
+ installation */
+
+ data_mysql_default_charset_coll = (ulint) default_charset_info->number;
+
+#ifndef _WIN32
+ if (srv_use_atomic_writes && my_may_have_atomic_write) {
+ /*
+ Force O_DIRECT on Unixes (on Windows writes are always
+ unbuffered)
+ */
+ switch (innodb_flush_method) {
+ case SRV_O_DIRECT:
+ case SRV_O_DIRECT_NO_FSYNC:
+ break;
+ default:
+ innodb_flush_method = SRV_O_DIRECT;
+ fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n");
+ }
+ }
+#endif
+
+ if (srv_read_only_mode) {
+ ib::info() << "Started in read only mode";
+ srv_use_doublewrite_buf = FALSE;
+ }
+
+#ifdef LINUX_NATIVE_AIO
+#elif !defined _WIN32
+ /* Currently, native AIO is supported only on Windows and Linux,
+ and only when the support is compiled in. In all other
+ cases, we ignore the setting of innodb_use_native_aio. */
+ srv_use_native_aio = FALSE;
+#endif
+
+#ifndef _WIN32
+ ut_ad(innodb_flush_method <= SRV_O_DIRECT_NO_FSYNC);
+#else
+ switch (innodb_flush_method) {
+ case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */:
+ innodb_flush_method = SRV_ALL_O_DIRECT_FSYNC;
+ break;
+ case SRV_ALL_O_DIRECT_FSYNC + 2 /* "normal"="fsync" */:
+ innodb_flush_method = SRV_FSYNC;
+ break;
+ default:
+ ut_ad(innodb_flush_method <= SRV_ALL_O_DIRECT_FSYNC);
+ }
+#endif
+ srv_file_flush_method = srv_flush_t(innodb_flush_method);
+
+ innodb_buffer_pool_size_init();
+
+ srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift);
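+
+ /* Illustrative example: a 128 MiB buffer pool with 16 KiB pages
+ holds 8192 pages, so srv_lock_table_size == 5 * 8192 == 40960. */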
+ DBUG_RETURN(0);
+}
+
+/** Initialize the InnoDB storage engine plugin.
+@param[in,out] p InnoDB handlerton
+@return error code
+@retval 0 on success */
+static int innodb_init(void* p)
+{
+ DBUG_ENTER("innodb_init");
+ handlerton* innobase_hton= static_cast<handlerton*>(p);
+ innodb_hton_ptr = innobase_hton;
+
+ innobase_hton->db_type = DB_TYPE_INNODB;
+ innobase_hton->savepoint_offset = sizeof(trx_named_savept_t);
+ innobase_hton->close_connection = innobase_close_connection;
+ innobase_hton->kill_query = innobase_kill_query;
+ innobase_hton->savepoint_set = innobase_savepoint;
+ innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint;
+
+ innobase_hton->savepoint_rollback_can_release_mdl =
+ innobase_rollback_to_savepoint_can_release_mdl;
+
+ innobase_hton->savepoint_release = innobase_release_savepoint;
+ innobase_hton->prepare_ordered= NULL;
+ innobase_hton->commit_ordered= innobase_commit_ordered;
+ innobase_hton->commit = innobase_commit;
+ innobase_hton->rollback = innobase_rollback;
+ innobase_hton->prepare = innobase_xa_prepare;
+ innobase_hton->recover = innobase_xa_recover;
+ innobase_hton->commit_by_xid = innobase_commit_by_xid;
+ innobase_hton->rollback_by_xid = innobase_rollback_by_xid;
+ innobase_hton->commit_checkpoint_request = innodb_log_flush_request;
+ innobase_hton->create = innobase_create_handler;
+
+ innobase_hton->drop_database = innobase_drop_database;
+ innobase_hton->panic = innobase_end;
+ innobase_hton->pre_shutdown = innodb_preshutdown;
+
+ innobase_hton->start_consistent_snapshot =
+ innobase_start_trx_and_assign_read_view;
+
+ innobase_hton->flush_logs = innobase_flush_logs;
+ innobase_hton->show_status = innobase_show_status;
+ innobase_hton->notify_tabledef_changed= innodb_notify_tabledef_changed;
+ innobase_hton->flags =
+ HTON_SUPPORTS_EXTENDED_KEYS | HTON_SUPPORTS_FOREIGN_KEYS |
+ HTON_NATIVE_SYS_VERSIONING |
+ HTON_WSREP_REPLICATION |
+ HTON_REQUIRES_CLOSE_AFTER_TRUNCATE;
+
+#ifdef WITH_WSREP
+ innobase_hton->abort_transaction=wsrep_abort_transaction;
+ innobase_hton->set_checkpoint=innobase_wsrep_set_checkpoint;
+ innobase_hton->get_checkpoint=innobase_wsrep_get_checkpoint;
+#endif /* WITH_WSREP */
+
+ innobase_hton->tablefile_extensions = ha_innobase_exts;
+ innobase_hton->table_options = innodb_table_option_list;
+
+ /* System Versioning */
+ innobase_hton->prepare_commit_versioned
+ = innodb_prepare_commit_versioned;
+
+ innodb_remember_check_sysvar_funcs();
+
+ compile_time_assert(DATA_MYSQL_TRUE_VARCHAR == MYSQL_TYPE_VARCHAR);
+
+#ifndef DBUG_OFF
+ static const char test_filename[] = "-@";
+ char test_tablename[sizeof test_filename
+ + sizeof(srv_mysql50_table_name_prefix) - 1];
+ DBUG_ASSERT(sizeof test_tablename - 1
+ == filename_to_tablename(test_filename,
+ test_tablename,
+ sizeof test_tablename, true));
+ DBUG_ASSERT(!strncmp(test_tablename,
+ srv_mysql50_table_name_prefix,
+ sizeof srv_mysql50_table_name_prefix - 1));
+ DBUG_ASSERT(!strcmp(test_tablename
+ + sizeof srv_mysql50_table_name_prefix - 1,
+ test_filename));
+#endif /* DBUG_OFF */
+
+ os_file_set_umask(my_umask);
+
+ /* Setup the memory alloc/free tracing mechanisms before calling
+ any functions that could possibly allocate memory. */
+ ut_new_boot();
+
+ if (int error = innodb_init_params()) {
+ DBUG_RETURN(error);
+ }
+
+ /* After this point, error handling has to use
+ innodb_init_abort(). */
+
+#ifdef HAVE_PSI_INTERFACE
+ /* Register keys with MySQL performance schema */
+ int count;
+
+# ifdef UNIV_PFS_MUTEX
+ count = array_elements(all_innodb_mutexes);
+ mysql_mutex_register("innodb", all_innodb_mutexes, count);
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+ count = array_elements(all_innodb_rwlocks);
+ mysql_rwlock_register("innodb", all_innodb_rwlocks, count);
+# endif /* UNIV_PFS_RWLOCK */
+
+# ifdef UNIV_PFS_THREAD
+ count = array_elements(all_innodb_threads);
+ mysql_thread_register("innodb", all_innodb_threads, count);
+# endif /* UNIV_PFS_THREAD */
+
+# ifdef UNIV_PFS_IO
+ count = array_elements(all_innodb_files);
+ mysql_file_register("innodb", all_innodb_files, count);
+# endif /* UNIV_PFS_IO */
+#endif /* HAVE_PSI_INTERFACE */
+
+ bool create_new_db = false;
+
+ /* Check whether the data files exist. */
+ dberr_t err = srv_sys_space.check_file_spec(&create_new_db, 5U << 20);
+
+ if (err != DB_SUCCESS) {
+ DBUG_RETURN(innodb_init_abort());
+ }
+
+ err = srv_start(create_new_db);
+
+ if (err != DB_SUCCESS) {
+ innodb_shutdown();
+ DBUG_RETURN(innodb_init_abort());
+ }
+
+ srv_was_started = true;
+ innodb_params_adjust();
+
+ innobase_old_blocks_pct = buf_LRU_old_ratio_update(
+ innobase_old_blocks_pct, true);
+
+ ibuf_max_size_update(srv_change_buffer_max_size);
+
+ mysql_mutex_init(pending_checkpoint_mutex_key,
+ &log_requests.mutex,
+ MY_MUTEX_INIT_FAST);
+#ifdef MYSQL_DYNAMIC_PLUGIN
+ if (innobase_hton != p) {
+ innobase_hton = reinterpret_cast<handlerton*>(p);
+ *innobase_hton = *innodb_hton_ptr;
+ }
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+ memset(innodb_counter_value, 0, sizeof innodb_counter_value);
+
+ /* Do this as late as possible, so that the server has fully
+ started up; we might get some initial stats if the user chooses
+ to turn on some counters at startup */
+ if (innobase_enable_monitor_counter) {
+ innodb_enable_monitor_at_startup(
+ innobase_enable_monitor_counter);
+ }
+
+ /* Turn on monitor counters that are default on */
+ srv_mon_default_on();
+
+ /* Unit Tests */
+#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
+ unit_test_os_file_get_parent_dir();
+#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
+
+#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
+ test_make_filepath();
+#endif /*UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */
+
+#ifdef UNIV_ENABLE_DICT_STATS_TEST
+ test_dict_stats_all();
+#endif /*UNIV_ENABLE_DICT_STATS_TEST */
+
+#ifdef UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT
+# ifdef HAVE_UT_CHRONO_T
+ test_row_raw_format_int();
+# endif /* HAVE_UT_CHRONO_T */
+#endif /* UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT */
+
+ DBUG_RETURN(0);
+}
+
+/** Shut down the InnoDB storage engine.
+@return 0 */
+static
+int
+innobase_end(handlerton*, ha_panic_function)
+{
+ DBUG_ENTER("innobase_end");
+
+ if (srv_was_started) {
+ THD *thd= current_thd;
+ if (thd) { // may be UNINSTALL PLUGIN statement
+ if (trx_t* trx = thd_to_trx(thd)) {
+ trx->free();
+ }
+ }
+
+ innodb_shutdown();
+
+ mysql_mutex_destroy(&log_requests.mutex);
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database. */
+void
+innobase_commit_low(
+/*================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+#ifdef WITH_WSREP
+ const char* tmp = 0;
+ const bool is_wsrep = trx->is_wsrep();
+ THD* thd = trx->mysql_thd;
+ if (is_wsrep) {
+ tmp = thd_proc_info(thd, "innobase_commit_low()");
+ }
+#endif /* WITH_WSREP */
+ if (trx_is_started(trx)) {
+ trx_commit_for_mysql(trx);
+ } else {
+ trx->will_lock = false;
+#ifdef WITH_WSREP
+ trx->wsrep = false;
+#endif /* WITH_WSREP */
+ }
+
+#ifdef WITH_WSREP
+ if (is_wsrep) {
+ thd_proc_info(thd, tmp);
+ }
+#endif /* WITH_WSREP */
+}
+
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started. And
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one.
+@return 0 */
+static
+int
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd) /*!< in: MySQL thread handle of the user for
+ whom the transaction should be committed */
+{
+ DBUG_ENTER("innobase_start_trx_and_assign_read_view");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ /* Create a new trx struct for thd, if it does not yet have one */
+
+ trx_t* trx = check_trx_exists(thd);
+
+ /* The transaction should not be active yet, start it */
+
+ ut_ad(!trx_is_started(trx));
+
+ trx_start_if_not_started_xa(trx, false);
+
+ /* Assign a read view if the transaction does not have it yet.
+ Do this only if transaction is using REPEATABLE READ isolation
+ level. */
+ trx->isolation_level = innobase_map_isolation_level(
+ thd_get_trx_isolation(thd));
+
+ if (trx->isolation_level == TRX_ISO_REPEATABLE_READ) {
+ trx->read_view.open(trx);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: WITH CONSISTENT SNAPSHOT"
+ " was ignored because this phrase"
+ " can only be used with"
+ " REPEATABLE READ isolation level.");
+ }
+
+ /* Set the MySQL flag to mark that there is an active transaction */
+
+ innobase_register_trx(hton, current_thd, trx);
+
+ DBUG_RETURN(0);
+}
+
+static
+void
+innobase_commit_ordered_2(
+/*======================*/
+ trx_t* trx, /*!< in: Innodb transaction */
+ THD* thd) /*!< in: MySQL thread handle */
+{
+ DBUG_ENTER("innobase_commit_ordered_2");
+
+ const bool read_only = trx->read_only || trx->id == 0;
+
+ if (!read_only) {
+ /* The following call reads the binary log position of
+ the transaction being committed.
+
+ Binary logging of other engines is not relevant to
+ InnoDB as all InnoDB requires is that committing
+ InnoDB transactions appear in the same order in the
+ MySQL binary log as they appear in InnoDB logs, which
+ is guaranteed by the server.
+
+ If the binary log is not enabled, or the transaction
+ is not written to the binary log, the file name will
+ be a NULL pointer. */
+ thd_binlog_pos(thd, &trx->mysql_log_file_name,
+ &trx->mysql_log_offset);
+
+ /* Don't do write + flush right now. For group commit
+ to work we want to do the flush later. */
+ trx->flush_log_later = true;
+ }
+
+#ifdef WITH_WSREP
+ /* If the transaction is not run in 2pc, we must assign the wsrep
+ XID here in order to get it written to the rollback segment. */
+ if (trx->is_wsrep()) {
+ thd_get_xid(thd, (MYSQL_XID*)trx->xid);
+ }
+#endif /* WITH_WSREP */
+
+ innobase_commit_low(trx);
+
+ if (!read_only) {
+ trx->mysql_log_file_name = NULL;
+ trx->flush_log_later = false;
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/*****************************************************************//**
+Perform the first, fast part of InnoDB commit.
+
+Doing it in this call ensures that we get the same commit order here
+as in binlog and any other participating transactional storage engines.
+
+Note that we want to do as little as really needed here, as we run
+under a global mutex. The expensive fsync() is done later, in
+innobase_commit(), without a lock so group commit can take place.
+
+Note also that this method can be called from a different thread than
+the one handling the rest of the transaction. */
+static
+void
+innobase_commit_ordered(
+/*====================*/
+ handlerton *hton, /*!< in: Innodb handlerton */
+ THD* thd, /*!< in: MySQL thread handle of the user for whom
+ the transaction should be committed */
+ bool all) /*!< in: TRUE - commit transaction
+ FALSE - the current SQL statement ended */
+{
+ trx_t* trx;
+ DBUG_ENTER("innobase_commit_ordered");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx = check_trx_exists(thd);
+
+ if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
+ /* We cannot throw an error here; instead we will catch this error
+ again in innobase_commit() and report it from there. */
+ DBUG_VOID_RETURN;
+ }
+
+ /* commit_ordered is only called when committing the whole transaction
+ (or an SQL statement when autocommit is on). */
+ DBUG_ASSERT(all ||
+ (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)));
+
+ innobase_commit_ordered_2(trx, thd);
+ trx->active_commit_ordered = true;
+
+ DBUG_VOID_RETURN;
+}
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database or marks an SQL statement
+ended.
+@return 0 or deadlock error if the transaction was aborted by another
+ higher priority transaction. */
+static
+int
+innobase_commit(
+/*============*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd, /*!< in: MySQL thread handle of the
+ user for whom the transaction should
+ be committed */
+ bool commit_trx) /*!< in: true - commit transaction
+ false - the current SQL statement
+ ended */
+{
+ DBUG_ENTER("innobase_commit");
+ DBUG_PRINT("enter", ("commit_trx: %d", commit_trx));
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ DBUG_PRINT("trans", ("ending transaction"));
+
+ trx_t* trx = check_trx_exists(thd);
+
+ ut_ad(trx->dict_operation_lock_mode == 0);
+ ut_ad(trx->dict_operation == TRX_DICT_OP_NONE);
+
+ /* A transaction is deregistered only in a commit or a rollback. If
+ it is deregistered, we know there cannot be resources to be freed
+ and we could return immediately. For the time being, we play it safe
+ and do the cleanup even though there should be nothing to clean up. */
+
+ if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
+
+ sql_print_error("Transaction not registered for MariaDB 2PC,"
+ " but transaction is active");
+ }
+
+ bool read_only = trx->read_only || trx->id == 0;
+ DBUG_PRINT("info", ("readonly: %d", read_only));
+
+ if (commit_trx
+ || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+
+ DBUG_EXECUTE_IF("crash_innodb_before_commit",
+ DBUG_SUICIDE(););
+
+ /* Run the fast part of commit if we did not already. */
+ if (!trx->active_commit_ordered) {
+ innobase_commit_ordered_2(trx, thd);
+
+ }
+
+ /* We were instructed to commit the whole transaction, or
+ this is an SQL statement end and autocommit is on */
+
+ /* At this point the commit order is fixed and the transaction is
+ visible to others. So we can wake up other commits waiting for
+ this one, to allow them to group commit with us. */
+ thd_wakeup_subsequent_commits(thd, 0);
+
+ /* Now do a write + flush of logs. */
+ trx_commit_complete_for_mysql(trx);
+
+ trx_deregister_from_2pc(trx);
+ } else {
+ /* We just mark the SQL statement ended and do not do a
+ transaction commit */
+
+ /* If we had reserved the auto-inc lock for some
+ table in this SQL statement we release it now */
+
+ if (!read_only) {
+ lock_unlock_table_autoinc(trx);
+ }
+
+ /* Store the current undo_no of the transaction so that we
+ know where to roll back if we have to roll back the next
+ SQL statement */
+
+ trx_mark_sql_stat_end(trx);
+ }
+
+ /* Reset the number of AUTO-INC rows required */
+ trx->n_autoinc_rows = 0;
+
+ /* This is a statement level variable. */
+ trx->fts_next_doc_id = 0;
+
+ DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Rolls back a transaction or the latest SQL statement.
+@return 0 or error number */
+static
+int
+innobase_rollback(
+/*==============*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction should
+ be rolled back */
+ bool rollback_trx) /*!< in: TRUE - rollback entire
+ transaction FALSE - rollback the current
+ statement only */
+{
+ DBUG_ENTER("innobase_rollback");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ DBUG_PRINT("trans", ("aborting transaction"));
+
+ trx_t* trx = check_trx_exists(thd);
+
+ ut_ad(trx->dict_operation_lock_mode == 0);
+ ut_ad(trx->dict_operation == TRX_DICT_OP_NONE);
+
+ /* Reset the number of AUTO-INC rows required */
+
+ trx->n_autoinc_rows = 0;
+
+ /* If we had reserved the auto-inc lock for some table (if
+ we come here to roll back the latest SQL statement) we
+ release it now before a possibly lengthy rollback */
+ lock_unlock_table_autoinc(trx);
+
+ /* This is a statement level variable. */
+
+ trx->fts_next_doc_id = 0;
+
+ dberr_t error;
+
+#ifdef WITH_WSREP
+ /* If the trx was assigned a wsrep XID in the prepare phase and is
+ being rolled back due to a BF abort, clear the XID in order to
+ avoid writing it to the rollback segment out of order. The XID
+ will be reassigned when the transaction is replayed. */
+ if (trx->state != TRX_STATE_NOT_STARTED && wsrep_is_wsrep_xid(trx->xid)) {
+ trx->xid->null();
+ }
+#endif /* WITH_WSREP */
+ if (rollback_trx
+ || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ error = trx_rollback_for_mysql(trx);
+
+ trx_deregister_from_2pc(trx);
+ } else {
+
+ error = trx_rollback_last_sql_stat_for_mysql(trx);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, trx->mysql_thd));
+}
+
+/*****************************************************************//**
+Rolls back a transaction
+@return 0 or error number */
+static
+int
+innobase_rollback_trx(
+/*==================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ DBUG_ENTER("innobase_rollback_trx");
+ DBUG_PRINT("trans", ("aborting transaction"));
+
+ /* If we had reserved the auto-inc lock for some table (if
+ we come here to roll back the latest SQL statement) we
+ release it now before a possibly lengthy rollback */
+ lock_unlock_table_autoinc(trx);
+ trx_deregister_from_2pc(trx);
+
+ DBUG_RETURN(convert_error_code_to_mysql(trx_rollback_for_mysql(trx),
+ 0, trx->mysql_thd));
+}
+
+/** Invoke commit_checkpoint_notify_ha() on completed log flush requests.
+@param pending log_requests.start
+@param lsn log_sys.get_flushed_lsn() */
+static void log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn)
+{
+ mysql_mutex_assert_owner(&log_requests.mutex);
+ ut_ad(pending == log_requests.start.load(std::memory_order_relaxed));
+ log_flush_request *entry= pending, *last= nullptr;
+ /* Process the first requests that have been completed. Since
+ the list is not necessarily in ascending order of LSN, we may
+ fail to notify some requests that have already been completed.
+ But there is no harm in delaying those notifications a bit.
+ And in practice, the list is unlikely to have more than one
+ element anyway, because the redo log would be flushed every
+ srv_flush_log_at_timeout seconds (1 by default). */
+ for (; entry && entry->lsn <= lsn; last= entry, entry= entry->next);
+
+ if (!last)
+ {
+ mysql_mutex_unlock(&log_requests.mutex);
+ return;
+ }
+
+ /* Detach the head of the list that corresponds to persisted log writes. */
+ if (!entry)
+ log_requests.end= entry;
+ log_requests.start.store(entry, std::memory_order_relaxed);
+ mysql_mutex_unlock(&log_requests.mutex);
+
+ /* Now that we have released the mutex, notify the submitters
+ and free the head of the list. */
+ do
+ {
+ entry= pending;
+ pending= pending->next;
+ commit_checkpoint_notify_ha(entry->cookie);
+ my_free(entry);
+ }
+ while (entry != last);
+}
+
+/** Invoke commit_checkpoint_notify_ha() to notify that outstanding
+log writes have been completed. */
+void log_flush_notify(lsn_t flush_lsn)
+{
+ if (auto pending= log_requests.start.load(std::memory_order_acquire))
+ {
+ mysql_mutex_lock(&log_requests.mutex);
+ pending= log_requests.start.load(std::memory_order_relaxed);
+ log_flush_notify_and_unlock(pending, flush_lsn);
+ }
+}
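+
+/* Note (explanatory): log_flush_notify() uses a double-checked load:
+the acquire load may race with concurrent insertions, so the list head
+is reloaded (relaxed) under log_requests.mutex; that reloaded value is
+what log_flush_notify_and_unlock() asserts to be the current head. */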
+
+/** Handle a commit checkpoint request from server layer.
+We put the request in a queue, so that we can notify upper layer about
+checkpoint complete when we have flushed the redo log.
+If we have already flushed all relevant redo log, we notify immediately.*/
+static void innodb_log_flush_request(void *cookie)
+{
+ lsn_t flush_lsn= log_sys.get_flushed_lsn();
+ /* Load lsn relaxed after flush_lsn was loaded from the same cache line */
+ const lsn_t lsn= log_sys.get_lsn();
+
+ if (flush_lsn >= lsn)
+ /* All log is already persistent. */;
+ else if (UNIV_UNLIKELY(srv_force_recovery >= SRV_FORCE_NO_BACKGROUND))
+ /* Normally, srv_master_callback() should periodically invoke
+ srv_sync_log_buffer_in_background(), which should initiate a log
+ flush about once every srv_flush_log_at_timeout seconds. But,
+ starting with the innodb_force_recovery=2 level, that background
+ task will not run. */
+ log_write_up_to(flush_lsn= lsn, true);
+ else if (log_flush_request *req= static_cast<log_flush_request*>
+ (my_malloc(PSI_INSTRUMENT_ME, sizeof *req, MYF(MY_WME))))
+ {
+ req->next= nullptr;
+ req->cookie= cookie;
+ req->lsn= lsn;
+
+ log_flush_request *start= nullptr;
+
+ mysql_mutex_lock(&log_requests.mutex);
+ /* In order to prevent a race condition where log_flush_notify()
+ would skip a notification because it read a stale (empty) list head,
+ we must update log_requests.start from nullptr (empty) to the first
+ req using std::memory_order_release. */
+ if (log_requests.start.compare_exchange_strong(start, req,
+ std::memory_order_release,
+ std::memory_order_relaxed))
+ {
+ ut_ad(!log_requests.end);
+ start= req;
+ /* In case log_flush_notify() executed
+ log_requests.start.load(std::memory_order_acquire) right before
+ our successful compare_exchange, we must re-read flush_lsn to
+ ensure that our request will be notified immediately if applicable. */
+ flush_lsn= log_sys.get_flushed_lsn();
+ }
+ else
+ {
+ /* Append the entry to the list. Because we determined req->lsn before
+ acquiring the mutex, this list may not be ordered by req->lsn,
+ even though log_flush_notify_and_unlock() assumes so. */
+ log_requests.end->next= req;
+ }
+
+ log_requests.end= req;
+
+ /* This hopefully addresses the hang that was reported in MDEV-24302.
+ Upon receiving a new request, we will notify old requests of
+ completion. */
+ log_flush_notify_and_unlock(start, flush_lsn);
+ return;
+ }
+ else
+ sql_print_error("Failed to allocate %zu bytes."
+ " Commit checkpoint will be skipped.", sizeof *req);
+
+ /* This hopefully addresses the hang that was reported in MDEV-24302.
+ Upon receiving a new request to notify of log writes becoming
+ persistent, we will notify old requests of completion. Note:
+ log_flush_notify() may skip some notifications because it is
+ basically assuming that the list is in ascending order of LSN. */
+ log_flush_notify(flush_lsn);
+ commit_checkpoint_notify_ha(cookie);
+}
+
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback_to_savepoint(
+/*===========================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction should
+ be rolled back to savepoint */
+ void* savepoint) /*!< in: savepoint data */
+{
+
+ DBUG_ENTER("innobase_rollback_to_savepoint");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx_t* trx = check_trx_exists(thd);
+
+ /* TODO: use provided savepoint data area to store savepoint data */
+
+ char name[64];
+
+ longlong2str(longlong(savepoint), name, 36);
+
+ int64_t mysql_binlog_cache_pos;
+
+ dberr_t error = trx_rollback_to_savepoint_for_mysql(
+ trx, name, &mysql_binlog_cache_pos);
+
+ if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+ fts_savepoint_rollback(trx, name);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Check whether the InnoDB state allows MDL locks to be safely released
+after a rollback to savepoint.
+When the binlog is on, MDL locks acquired after the savepoint are not
+released if there are any locks held in InnoDB.
+@return true if it is safe, false if it is not. */
+static
+bool
+innobase_rollback_to_savepoint_can_release_mdl(
+/*===========================================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd) /*!< in: handle to the MySQL thread
+ of the user whose transaction should
+ be rolled back to savepoint */
+{
+ DBUG_ENTER("innobase_rollback_to_savepoint_can_release_mdl");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx_t* trx = check_trx_exists(thd);
+
+ /* If transaction has not acquired any locks then it is safe
+ to release MDL after rollback to savepoint */
+ if (UT_LIST_GET_LEN(trx->lock.trx_locks) == 0) {
+
+ DBUG_RETURN(true);
+ }
+
+ DBUG_RETURN(false);
+}
+
+/*****************************************************************//**
+Release transaction savepoint name.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_release_savepoint(
+/*=======================*/
+ handlerton* hton, /*!< in: handlerton for InnoDB */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction's
+ savepoint should be released */
+ void* savepoint) /*!< in: savepoint data */
+{
+ dberr_t error;
+ trx_t* trx;
+ char name[64];
+
+ DBUG_ENTER("innobase_release_savepoint");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx = check_trx_exists(thd);
+
+ /* TODO: use provided savepoint data area to store savepoint data */
+
+ longlong2str(longlong(savepoint), name, 36);
+
+ error = trx_release_savepoint_for_mysql(trx, name);
+
+ if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+ fts_savepoint_release(trx, name);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Sets a transaction savepoint.
+@return always 0, that is, always succeeds */
+static
+int
+innobase_savepoint(
+/*===============*/
+ handlerton* hton, /*!< in: handle to the InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread */
+ void* savepoint)/*!< in: savepoint data */
+{
+ DBUG_ENTER("innobase_savepoint");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ /* In autocommit mode it makes no sense to set a savepoint
+ (unless we are in a sub-statement), so the SQL layer ensures that
+ this method is never called in such a situation. */
+
+ trx_t* trx = check_trx_exists(thd);
+
+ /* Cannot happen outside of transaction */
+ DBUG_ASSERT(trx_is_registered_for_2pc(trx));
+
+ /* TODO: use provided savepoint data area to store savepoint data */
+ char name[64];
+
+ longlong2str(longlong(savepoint), name, 36);
+
+ dberr_t error = trx_savepoint_for_mysql(trx, name, 0);
+
+ if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+ fts_savepoint_take(trx->fts_trx, name);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+
+/**
+ Frees a possible InnoDB trx object associated with the current THD.
+
+ @param hton innobase handlerton
+  @param thd  server thread descriptor whose resources should be freed
+
+ @return 0 always
+*/
+static int innobase_close_connection(handlerton *hton, THD *thd)
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ if (auto trx= thd_to_trx(thd))
+ {
+ if (trx->state == TRX_STATE_PREPARED && trx->has_logged_persistent())
+ {
+ trx_disconnect_prepared(trx);
+ return 0;
+ }
+ innobase_rollback_trx(trx);
+ trx->free();
+ }
+ return 0;
+}
+
+void lock_cancel_waiting_and_release(lock_t *lock);
+
+/** Cancel any pending lock request associated with the current THD.
+@sa THD::awake() @sa ha_kill_query() */
+static void innobase_kill_query(handlerton*, THD *thd, enum thd_kill_levels)
+{
+ DBUG_ENTER("innobase_kill_query");
+
+ if (trx_t* trx= thd_to_trx(thd))
+ {
+ ut_ad(trx->mysql_thd == thd);
+#ifdef WITH_WSREP
+ if (trx->is_wsrep() && wsrep_thd_is_aborting(thd))
+      /* If the victim has already been signalled by the BF thread
+      or its abort is already in progress, no further query abort
+      is necessary. Also, the BF thread should own the trx mutex
+      for the victim. */
+ DBUG_VOID_RETURN;
+#endif /* WITH_WSREP */
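+    /* Cancel any pending lock wait. Both the lock_sys mutex and
+    the victim's trx mutex must be held while the waiting lock is
+    released. */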
+ lock_mutex_enter();
+ if (lock_t *lock= trx->lock.wait_lock)
+ {
+ trx_mutex_enter(trx);
+ if (trx->is_wsrep() && wsrep_thd_is_aborting(thd))
+ trx->lock.was_chosen_as_deadlock_victim= TRUE;
+ lock_cancel_waiting_and_release(lock);
+ trx_mutex_exit(trx);
+ }
+ lock_mutex_exit();
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+
+/*************************************************************************//**
+** InnoDB database tables
+*****************************************************************************/
+
+/** Get the record format from the data dictionary.
+@return one of ROW_TYPE_REDUNDANT, ROW_TYPE_COMPACT,
+ROW_TYPE_COMPRESSED, ROW_TYPE_DYNAMIC */
+
+enum row_type
+ha_innobase::get_row_type() const
+{
+ if (m_prebuilt && m_prebuilt->table) {
+ const ulint flags = m_prebuilt->table->flags;
+
+ switch (dict_tf_get_rec_format(flags)) {
+ case REC_FORMAT_REDUNDANT:
+ return(ROW_TYPE_REDUNDANT);
+ case REC_FORMAT_COMPACT:
+ return(ROW_TYPE_COMPACT);
+ case REC_FORMAT_COMPRESSED:
+ return(ROW_TYPE_COMPRESSED);
+ case REC_FORMAT_DYNAMIC:
+ return(ROW_TYPE_DYNAMIC);
+ }
+ }
+ ut_ad(0);
+ return(ROW_TYPE_NOT_USED);
+}
+
+/****************************************************************//**
+Get the table flags to use for the statement.
+@return table flags */
+
+handler::Table_flags
+ha_innobase::table_flags() const
+/*============================*/
+{
+ THD* thd = ha_thd();
+ handler::Table_flags flags = m_int_table_flags;
+
+ /* Need to use tx_isolation here since table flags is (also)
+ called before prebuilt is inited. */
+
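+	/* Statement-based binary logging is only advertised at
+	REPEATABLE READ or stronger isolation: below that, InnoDB
+	takes no gap locks, so a statement's effects need not be
+	deterministic on replay, and only row-based logging is safe. */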
+ if (thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
+ return(flags);
+ }
+
+ return(flags | HA_BINLOG_STMT_CAPABLE);
+}
+
+/****************************************************************//**
+Returns the table type (storage engine name).
+@return table type */
+
+const char*
+ha_innobase::table_type() const
+/*===========================*/
+{
+ return(innobase_hton_name);
+}
+
+/****************************************************************//**
+Returns the index type.
+@return index type */
+
+const char*
+ha_innobase::index_type(
+/*====================*/
+	uint	keynr)	/*!< in: index number */
+{
+ dict_index_t* index = innobase_get_index(keynr);
+
+ if (!index) {
+ return "Corrupted";
+ }
+
+ if (index->type & DICT_FTS) {
+ return("FULLTEXT");
+ }
+
+ if (dict_index_is_spatial(index)) {
+ return("SPATIAL");
+ }
+
+ return("BTREE");
+}
+
+/****************************************************************//**
+Returns the operations supported for indexes.
+@return flags of supported operations */
+
+ulong
+ha_innobase::index_flags(
+/*=====================*/
+ uint key,
+ uint,
+ bool) const
+{
+ if (table_share->key_info[key].algorithm == HA_KEY_ALG_FULLTEXT) {
+ return(0);
+ }
+
+	/* For spatial indexes, we do not support descending scans
+	or ICP so far. */
+ if (table_share->key_info[key].flags & HA_SPATIAL) {
+ return HA_READ_NEXT | HA_READ_ORDER| HA_READ_RANGE
+ | HA_KEYREAD_ONLY | HA_KEY_SCAN_NOT_ROR;
+ }
+
+ ulong flags= key == table_share->primary_key
+ ? HA_CLUSTERED_INDEX : 0;
+
+ flags |= HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER
+ | HA_READ_RANGE | HA_KEYREAD_ONLY
+ | HA_DO_INDEX_COND_PUSHDOWN
+ | HA_DO_RANGE_FILTER_PUSHDOWN;
+
+ return(flags);
+}
+
+/****************************************************************//**
+Returns the maximum number of keys.
+@return MAX_KEY */
+
+uint
+ha_innobase::max_supported_keys() const
+/*===================================*/
+{
+ return(MAX_KEY);
+}
+
+/****************************************************************//**
+Returns the maximum key length.
+@return maximum supported key length, in bytes */
+
+uint
+ha_innobase::max_supported_key_length() const
+/*=========================================*/
+{
+	/* An InnoDB page must store >= 2 keys; a secondary key record
+	must also contain the primary key value. Therefore, if both
+	the primary key and the secondary key are at this maximum length,
+	each must be less than 1/4th of the free space on a page,
+	including record overhead.
+
+	MySQL imposes its own limit on this number: MAX_KEY_LENGTH = 3072.
+
+	For page sizes >= 16k, InnoDB historically reported 3500 bytes here,
+	but the MySQL limit of 3072 was always used through the handler
+	interface.
+
+	Note: 16k and 32k pages are handled the same here since the limits
+	are higher than those imposed by MySQL. */
+
+ switch (srv_page_size) {
+ case 4096:
+ /* Hack: allow mysql.innodb_index_stats to be created. */
+ /* FIXME: rewrite this API, and in sql_table.cc consider
+ that in index-organized tables (such as InnoDB), secondary
+ index records will be padded with the PRIMARY KEY, instead
+ of some short ROWID or record heap address. */
+ return(1173);
+ case 8192:
+ return(1536);
+ default:
+ return(3500);
+ }
+}
+
+/****************************************************************//**
+Returns the key map of keys that are usable for scanning.
+@return key_map_full */
+
+const key_map*
+ha_innobase::keys_to_use_for_scanning()
+/*===================================*/
+{
+ return(&key_map_full);
+}
+
+/****************************************************************//**
+Ensures that if there is a concurrent in-place ADD INDEX, the virtual
+columns being indexed are computed. They are not marked as indexed in the
+old table, so the server will not add them to the read_set automatically. */
+void
+ha_innobase::column_bitmaps_signal()
+/*================================*/
+{
+ if (!table->vfield || table->current_lock != F_WRLCK) {
+ return;
+ }
+
+ dict_index_t* clust_index = dict_table_get_first_index(m_prebuilt->table);
+ uint num_v = 0;
+ for (uint j = 0; j < table->s->virtual_fields; j++) {
+ if (table->vfield[j]->stored_in_db()) {
+ continue;
+ }
+
+ dict_col_t* col = &m_prebuilt->table->v_cols[num_v].m_col;
+ if (col->ord_part ||
+ (dict_index_is_online_ddl(clust_index) &&
+ row_log_col_is_indexed(clust_index, num_v))) {
+ table->mark_virtual_column_with_deps(table->vfield[j]);
+ }
+ num_v++;
+ }
+}
+
+
+/****************************************************************//**
+Determines if table caching is supported.
+@return HA_CACHE_TBL_ASKTRANSACT */
+
+uint8
+ha_innobase::table_cache_type()
+/*===========================*/
+{
+ return(HA_CACHE_TBL_ASKTRANSACT);
+}
+
+/** Normalizes a table name string.
+A normalized name consists of the database name concatenated with '/'
+and the table name; for example, "./test/t1" normalizes to "test/t1".
+On Windows, normalization folds both the database name and the
+table name to lower case if "set_lower_case" is set to TRUE.
+@param[out]	norm_name	Normalized name, null-terminated.
+@param[in]	name		Name to normalize.
+@param[in]	set_lower_case	True if we also should fold to lower case. */
+void
+normalize_table_name_c_low(
+/*=======================*/
+ char* norm_name, /* out: normalized name as a
+ null-terminated string */
+ const char* name, /* in: table name string */
+ ibool set_lower_case) /* in: TRUE if we want to set
+ name to lower case */
+{
+ char* name_ptr;
+ ulint name_len;
+ char* db_ptr;
+ ulint db_len;
+ char* ptr;
+ ulint norm_len;
+
+ /* Scan name from the end */
+
+ ptr = strend(name) - 1;
+
+ /* seek to the last path separator */
+ while (ptr >= name && *ptr != '\\' && *ptr != '/') {
+ ptr--;
+ }
+
+ name_ptr = ptr + 1;
+ name_len = strlen(name_ptr);
+
+ /* skip any number of path separators */
+ while (ptr >= name && (*ptr == '\\' || *ptr == '/')) {
+ ptr--;
+ }
+
+ DBUG_ASSERT(ptr >= name);
+
+ /* seek to the last but one path separator or one char before
+ the beginning of name */
+ db_len = 0;
+ while (ptr >= name && *ptr != '\\' && *ptr != '/') {
+ ptr--;
+ db_len++;
+ }
+
+ db_ptr = ptr + 1;
+
+ norm_len = db_len + name_len + sizeof "/";
+ ut_a(norm_len < FN_REFLEN - 1);
+
+ memcpy(norm_name, db_ptr, db_len);
+
+ norm_name[db_len] = '/';
+
+ /* Copy the name and null-byte. */
+ memcpy(norm_name + db_len + 1, name_ptr, name_len + 1);
+
+ if (set_lower_case) {
+ innobase_casedn_str(norm_name);
+ }
+}
+
+create_table_info_t::create_table_info_t(
+ THD* thd,
+ const TABLE* form,
+ HA_CREATE_INFO* create_info,
+ char* table_name,
+ char* remote_path,
+ bool file_per_table,
+ trx_t* trx)
+ : m_thd(thd),
+ m_trx(trx),
+ m_form(form),
+ m_default_row_format(innodb_default_row_format),
+ m_create_info(create_info),
+ m_table_name(table_name), m_table(NULL),
+ m_drop_before_rollback(false),
+ m_remote_path(remote_path),
+ m_innodb_file_per_table(file_per_table)
+{
+}
+
+/** Normalizes a table name string; see normalize_table_name_c_low().
+@param[out]	norm_name	Normalized name, null-terminated.
+@param[in]	name		Name to normalize.
+@param[in]	set_lower_case	True if we also should fold to lower case. */
+void
+create_table_info_t::normalize_table_name_low(
+ char* norm_name,
+ const char* name,
+ ibool set_lower_case)
+{
+ normalize_table_name_c_low(norm_name, name, set_lower_case);
+}
+
+#if !defined(DBUG_OFF)
+/*********************************************************************
+Test normalize_table_name_low(). */
+static
+void
+test_normalize_table_name_low()
+/*===========================*/
+{
+ char norm_name[FN_REFLEN];
+ const char* test_data[][2] = {
+ /* input, expected result */
+ {"./mysqltest/t1", "mysqltest/t1"},
+ {"./test/#sql-842b_2", "test/#sql-842b_2"},
+ {"./test/#sql-85a3_10", "test/#sql-85a3_10"},
+ {"./test/#sql2-842b-2", "test/#sql2-842b-2"},
+ {"./test/bug29807", "test/bug29807"},
+ {"./test/foo", "test/foo"},
+ {"./test/innodb_bug52663", "test/innodb_bug52663"},
+ {"./test/t", "test/t"},
+ {"./test/t1", "test/t1"},
+ {"./test/t10", "test/t10"},
+ {"/a/b/db/table", "db/table"},
+ {"/a/b/db///////table", "db/table"},
+ {"/a/b////db///////table", "db/table"},
+ {"/var/tmp/mysqld.1/#sql842b_2_10", "mysqld.1/#sql842b_2_10"},
+ {"db/table", "db/table"},
+ {"ddd/t", "ddd/t"},
+ {"d/ttt", "d/ttt"},
+ {"d/t", "d/t"},
+ {".\\mysqltest\\t1", "mysqltest/t1"},
+ {".\\test\\#sql-842b_2", "test/#sql-842b_2"},
+ {".\\test\\#sql-85a3_10", "test/#sql-85a3_10"},
+ {".\\test\\#sql2-842b-2", "test/#sql2-842b-2"},
+ {".\\test\\bug29807", "test/bug29807"},
+ {".\\test\\foo", "test/foo"},
+ {".\\test\\innodb_bug52663", "test/innodb_bug52663"},
+ {".\\test\\t", "test/t"},
+ {".\\test\\t1", "test/t1"},
+ {".\\test\\t10", "test/t10"},
+ {"C:\\a\\b\\db\\table", "db/table"},
+ {"C:\\a\\b\\db\\\\\\\\\\\\\\table", "db/table"},
+ {"C:\\a\\b\\\\\\\\db\\\\\\\\\\\\\\table", "db/table"},
+ {"C:\\var\\tmp\\mysqld.1\\#sql842b_2_10", "mysqld.1/#sql842b_2_10"},
+ {"db\\table", "db/table"},
+ {"ddd\\t", "ddd/t"},
+ {"d\\ttt", "d/ttt"},
+ {"d\\t", "d/t"},
+ };
+
+ for (size_t i = 0; i < UT_ARR_SIZE(test_data); i++) {
+ printf("test_normalize_table_name_low():"
+ " testing \"%s\", expected \"%s\"... ",
+ test_data[i][0], test_data[i][1]);
+
+ create_table_info_t::normalize_table_name_low(
+ norm_name, test_data[i][0], FALSE);
+
+ if (strcmp(norm_name, test_data[i][1]) == 0) {
+ printf("ok\n");
+ } else {
+ printf("got \"%s\"\n", norm_name);
+ ut_error;
+ }
+ }
+}
+
+/*********************************************************************
+Test ut_format_name(). */
+static
+void
+test_ut_format_name()
+/*=================*/
+{
+ char buf[NAME_LEN * 3];
+
+ struct {
+ const char* name;
+ ulint buf_size;
+ const char* expected;
+ } test_data[] = {
+ {"test/t1", sizeof(buf), "`test`.`t1`"},
+ {"test/t1", 12, "`test`.`t1`"},
+ {"test/t1", 11, "`test`.`t1"},
+ {"test/t1", 10, "`test`.`t"},
+ {"test/t1", 9, "`test`.`"},
+ {"test/t1", 8, "`test`."},
+ {"test/t1", 7, "`test`"},
+ {"test/t1", 6, "`test"},
+ {"test/t1", 5, "`tes"},
+ {"test/t1", 4, "`te"},
+ {"test/t1", 3, "`t"},
+ {"test/t1", 2, "`"},
+ {"test/t1", 1, ""},
+ {"test/t1", 0, "BUF_NOT_CHANGED"},
+ {"table", sizeof(buf), "`table`"},
+ {"ta'le", sizeof(buf), "`ta'le`"},
+ {"ta\"le", sizeof(buf), "`ta\"le`"},
+ {"ta`le", sizeof(buf), "`ta``le`"},
+ };
+
+ for (size_t i = 0; i < UT_ARR_SIZE(test_data); i++) {
+
+ memcpy(buf, "BUF_NOT_CHANGED", strlen("BUF_NOT_CHANGED") + 1);
+
+ char* ret;
+
+ ret = ut_format_name(test_data[i].name,
+ buf,
+ test_data[i].buf_size);
+
+ ut_a(ret == buf);
+
+ if (strcmp(buf, test_data[i].expected) == 0) {
+ ib::info() << "ut_format_name(" << test_data[i].name
+ << ", buf, " << test_data[i].buf_size << "),"
+ " expected " << test_data[i].expected
+ << ", OK";
+ } else {
+ ib::error() << "ut_format_name(" << test_data[i].name
+ << ", buf, " << test_data[i].buf_size << "),"
+ " expected " << test_data[i].expected
+ << ", ERROR: got " << buf;
+ ut_error;
+ }
+ }
+}
+#endif /* !DBUG_OFF */
+
+/** Match index columns between MySQL and InnoDB.
+This function checks whether the index column information
+is consistent between KEY info from mysql and that from innodb index.
+@param[in] key_info Index info from mysql
+@param[in] index_info Index info from InnoDB
+@return true if all column types match. */
+static
+bool
+innobase_match_index_columns(
+ const KEY* key_info,
+ const dict_index_t* index_info)
+{
+ const KEY_PART_INFO* key_part;
+ const KEY_PART_INFO* key_end;
+ const dict_field_t* innodb_idx_fld;
+ const dict_field_t* innodb_idx_fld_end;
+
+ DBUG_ENTER("innobase_match_index_columns");
+
+ /* Check whether user defined index column count matches */
+ if (key_info->user_defined_key_parts !=
+ index_info->n_user_defined_cols) {
+ DBUG_RETURN(FALSE);
+ }
+
+ key_part = key_info->key_part;
+ key_end = key_part + key_info->user_defined_key_parts;
+ innodb_idx_fld = index_info->fields;
+ innodb_idx_fld_end = index_info->fields + index_info->n_fields;
+
+	/* Check each index column's datatype. We do not check
+	the column name, because there are cases where an index
+	column name was modified in MySQL without the change being
+	propagated to InnoDB.
+	One hidden assumption here is that the index column sequences
+	are matched up between those in MySQL and InnoDB. */
+ for (; key_part != key_end; ++key_part) {
+ unsigned is_unsigned;
+ auto mtype = innodb_idx_fld->col->mtype;
+
+ /* Need to translate to InnoDB column type before
+ comparison. */
+ auto col_type = get_innobase_type_from_mysql_type(
+ &is_unsigned, key_part->field);
+
+		/* Ignore InnoDB-specific system columns. */
+		while (mtype == DATA_SYS) {
+			innodb_idx_fld++;
+
+			if (innodb_idx_fld >= innodb_idx_fld_end) {
+				DBUG_RETURN(FALSE);
+			}
+
+			/* Re-read the type of the next field;
+			otherwise this loop would never terminate. */
+			mtype = innodb_idx_fld->col->mtype;
+		}
+
+ /* MariaDB-5.5 compatibility */
+ if ((key_part->field->real_type() == MYSQL_TYPE_ENUM ||
+ key_part->field->real_type() == MYSQL_TYPE_SET) &&
+ mtype == DATA_FIXBINARY) {
+ col_type= DATA_FIXBINARY;
+ }
+
+ if (col_type != mtype) {
+ /* If the col_type we get from mysql type is a geometry
+ data type, we should check if mtype is a legacy type
+ from 5.6, either upgraded to DATA_GEOMETRY or not.
+ This is indeed not an accurate check, but should be
+ safe, since DATA_BLOB would be upgraded once we create
+ spatial index on it and we intend to use DATA_GEOMETRY
+ for legacy GIS data types which are of var-length. */
+ switch (col_type) {
+ case DATA_GEOMETRY:
+ if (mtype == DATA_BLOB) {
+ break;
+ }
+ /* Fall through */
+ default:
+ /* Column type mismatches */
+ DBUG_RETURN(false);
+ }
+ }
+
+ innodb_idx_fld++;
+ }
+
+ DBUG_RETURN(TRUE);
+}
+
+/** Build a template for a base column for a virtual column
+@param[in] table MySQL TABLE
+@param[in] clust_index InnoDB clustered index
+@param[in] field field in MySQL table
+@param[in] col InnoDB column
+@param[in,out] templ template to fill
+@param[in] col_no field index for virtual col
+*/
+static
+void
+innobase_vcol_build_templ(
+ const TABLE* table,
+ dict_index_t* clust_index,
+ Field* field,
+ const dict_col_t* col,
+ mysql_row_templ_t* templ,
+ ulint col_no)
+{
+ templ->col_no = col_no;
+ templ->is_virtual = col->is_virtual();
+
+ if (templ->is_virtual) {
+ templ->clust_rec_field_no = ULINT_UNDEFINED;
+ templ->rec_field_no = col->ind;
+ } else {
+ templ->clust_rec_field_no = dict_col_get_clust_pos(
+ col, clust_index);
+ ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
+
+ templ->rec_field_no = templ->clust_rec_field_no;
+ }
+
+ if (field->real_maybe_null()) {
+ templ->mysql_null_byte_offset =
+ field->null_offset();
+
+ templ->mysql_null_bit_mask = (ulint) field->null_bit;
+ } else {
+ templ->mysql_null_bit_mask = 0;
+ }
+
+ templ->mysql_col_offset = static_cast<ulint>(
+ get_field_offset(table, field));
+ templ->mysql_col_len = static_cast<ulint>(field->pack_length());
+ templ->type = col->mtype;
+ templ->mysql_type = static_cast<ulint>(field->type());
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+ templ->mysql_length_bytes = static_cast<ulint>(
+ ((Field_varstring*) field)->length_bytes);
+ }
+
+ templ->charset = dtype_get_charset_coll(col->prtype);
+ templ->mbminlen = dict_col_get_mbminlen(col);
+ templ->mbmaxlen = dict_col_get_mbmaxlen(col);
+ templ->is_unsigned = col->prtype & DATA_UNSIGNED;
+}
+
+/** Build a template for the virtual columns and their base columns. This
+is done when the table is first opened.
+@param[in] table MySQL TABLE
+@param[in] ib_table InnoDB dict_table_t
+@param[in,out] s_templ InnoDB template structure
+@param[in] add_v new virtual columns added along with
+ add index call
+@param[in] locked true if dict_sys mutex is held */
+void
+innobase_build_v_templ(
+ const TABLE* table,
+ const dict_table_t* ib_table,
+ dict_vcol_templ_t* s_templ,
+ const dict_add_v_col_t* add_v,
+ bool locked)
+{
+ ulint ncol = unsigned(ib_table->n_cols) - DATA_N_SYS_COLS;
+ ulint n_v_col = ib_table->n_v_cols;
+ bool marker[REC_MAX_N_FIELDS];
+
+ DBUG_ENTER("innobase_build_v_templ");
+ ut_ad(ncol < REC_MAX_N_FIELDS);
+
+ if (add_v != NULL) {
+ n_v_col += add_v->n_v_col;
+ }
+
+ ut_ad(n_v_col > 0);
+
+ if (!locked) {
+ mutex_enter(&dict_sys.mutex);
+ }
+
+ if (s_templ->vtempl) {
+ if (!locked) {
+ mutex_exit(&dict_sys.mutex);
+ }
+ DBUG_VOID_RETURN;
+ }
+
+ memset(marker, 0, sizeof(bool) * ncol);
+
+ s_templ->vtempl = static_cast<mysql_row_templ_t**>(
+ ut_zalloc_nokey((ncol + n_v_col)
+ * sizeof *s_templ->vtempl));
+ s_templ->n_col = ncol;
+ s_templ->n_v_col = n_v_col;
+ s_templ->rec_len = table->s->reclength;
+ s_templ->default_rec = UT_NEW_ARRAY_NOKEY(uchar, s_templ->rec_len);
+ memcpy(s_templ->default_rec, table->s->default_values, s_templ->rec_len);
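+	/* vtempl[] holds base-column templates at offsets [0, n_col)
+	and virtual-column templates at [n_col, n_col + n_v_col);
+	see the loop over table->s->fields below. */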
+
+	/* Mark the columns that could be base columns of virtual columns */
+ for (ulint i = 0; i < ib_table->n_v_cols; i++) {
+ const dict_v_col_t* vcol = dict_table_get_nth_v_col(
+ ib_table, i);
+
+ for (ulint j = vcol->num_base; j--; ) {
+ marker[vcol->base_col[j]->ind] = true;
+ }
+ }
+
+ if (add_v) {
+ for (ulint i = 0; i < add_v->n_v_col; i++) {
+ const dict_v_col_t* vcol = &add_v->v_col[i];
+
+ for (ulint j = vcol->num_base; j--; ) {
+ marker[vcol->base_col[j]->ind] = true;
+ }
+ }
+ }
+
+ ulint j = 0;
+ ulint z = 0;
+
+ dict_index_t* clust_index = dict_table_get_first_index(ib_table);
+
+ for (ulint i = 0; i < table->s->fields; i++) {
+ Field* field = table->field[i];
+
+ /* Build template for virtual columns */
+ if (!field->stored_in_db()) {
+#ifdef UNIV_DEBUG
+ const char* name;
+
+ if (z >= ib_table->n_v_def) {
+ name = add_v->v_col_name[z - ib_table->n_v_def];
+ } else {
+ name = dict_table_get_v_col_name(ib_table, z);
+ }
+
+ ut_ad(!my_strcasecmp(system_charset_info, name,
+ field->field_name.str));
+#endif
+ const dict_v_col_t* vcol;
+
+ if (z >= ib_table->n_v_def) {
+ vcol = &add_v->v_col[z - ib_table->n_v_def];
+ } else {
+ vcol = dict_table_get_nth_v_col(ib_table, z);
+ }
+
+ s_templ->vtempl[z + s_templ->n_col]
+ = static_cast<mysql_row_templ_t*>(
+ ut_malloc_nokey(
+ sizeof *s_templ->vtempl[j]));
+
+ innobase_vcol_build_templ(
+ table, clust_index, field,
+ &vcol->m_col,
+ s_templ->vtempl[z + s_templ->n_col],
+ z);
+ z++;
+ continue;
+ }
+
+ ut_ad(j < ncol);
+
+ /* Build template for base columns */
+ if (marker[j]) {
+ dict_col_t* col = dict_table_get_nth_col(
+ ib_table, j);
+
+ ut_ad(!my_strcasecmp(system_charset_info,
+ dict_table_get_col_name(
+ ib_table, j),
+ field->field_name.str));
+
+ s_templ->vtempl[j] = static_cast<
+ mysql_row_templ_t*>(
+ ut_malloc_nokey(
+ sizeof *s_templ->vtempl[j]));
+
+ innobase_vcol_build_templ(
+ table, clust_index, field, col,
+ s_templ->vtempl[j], j);
+ }
+
+ j++;
+ }
+
+ if (!locked) {
+ mutex_exit(&dict_sys.mutex);
+ }
+
+ s_templ->db_name = table->s->db.str;
+ s_templ->tb_name = table->s->table_name.str;
+ DBUG_VOID_RETURN;
+}
+
+/** Check consistency between .frm indexes and InnoDB indexes.
+@param[in] table table object formed from .frm
+@param[in] ib_table InnoDB table definition
+@retval true if no errors were found */
+static bool
+check_index_consistency(const TABLE* table, const dict_table_t* ib_table)
+{
+ ulint mysql_num_index = table->s->keys;
+ ulint ib_num_index = UT_LIST_GET_LEN(ib_table->indexes);
+ bool ret = true;
+
+	/* If there is an inconsistency between the MySQL and InnoDB
+	dictionary (metadata) information, the number of indexes defined
+	in MySQL could exceed that in InnoDB; return an error */
+ if (ib_num_index < mysql_num_index) {
+ ret = false;
+ goto func_exit;
+ }
+
+ /* For each index in the mysql key_info array, fetch its
+ corresponding InnoDB index pointer into index_mapping
+ array. */
+ for (ulint count = 0; count < mysql_num_index; count++) {
+ const dict_index_t* index = dict_table_get_index_on_name(
+ ib_table, table->key_info[count].name.str);
+
+ if (index == NULL) {
+ sql_print_error("Cannot find index %s in InnoDB"
+ " index dictionary.",
+ table->key_info[count].name.str);
+ ret = false;
+ goto func_exit;
+ }
+
+ /* Double check fetched index has the same
+ column info as those in mysql key_info. */
+ if (!innobase_match_index_columns(&table->key_info[count],
+ index)) {
+ sql_print_error("Found index %s whose column info"
+ " does not match that of MariaDB.",
+ table->key_info[count].name.str);
+ ret = false;
+ goto func_exit;
+ }
+ }
+
+func_exit:
+ return ret;
+}
+
+/********************************************************************//**
+Get the upper limit of a MySQL integral or floating-point type.
+@return maximum allowed value for the field */
+UNIV_INTERN
+ulonglong
+innobase_get_int_col_max_value(
+/*===========================*/
+ const Field* field) /*!< in: MySQL field */
+{
+ ulonglong max_value = 0;
+
+ switch (field->key_type()) {
+ /* TINY */
+ case HA_KEYTYPE_BINARY:
+ max_value = 0xFFULL;
+ break;
+ case HA_KEYTYPE_INT8:
+ max_value = 0x7FULL;
+ break;
+ /* SHORT */
+ case HA_KEYTYPE_USHORT_INT:
+ max_value = 0xFFFFULL;
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ max_value = 0x7FFFULL;
+ break;
+ /* MEDIUM */
+ case HA_KEYTYPE_UINT24:
+ max_value = 0xFFFFFFULL;
+ break;
+ case HA_KEYTYPE_INT24:
+ max_value = 0x7FFFFFULL;
+ break;
+ /* LONG */
+ case HA_KEYTYPE_ULONG_INT:
+ max_value = 0xFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ max_value = 0x7FFFFFFFULL;
+ break;
+ /* BIG */
+ case HA_KEYTYPE_ULONGLONG:
+ max_value = 0xFFFFFFFFFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_LONGLONG:
+ max_value = 0x7FFFFFFFFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_FLOAT:
+		/* The maximum integer exactly representable in an
+		IEEE 754-2008 binary32 float is 2^24 */
+ max_value = 0x1000000ULL;
+ break;
+ case HA_KEYTYPE_DOUBLE:
+		/* The maximum integer exactly representable in an
+		IEEE 754-2008 binary64 double is 2^53 */
+ max_value = 0x20000000000000ULL;
+ break;
+ default:
+ ut_error;
+ }
+
+ return(max_value);
+}
+
+/** Initialize the AUTO_INCREMENT column metadata.
+
+Since a partial table definition for a persistent table can already be
+present in the InnoDB dict_sys cache before it is accessed from SQL,
+we have to initialize the AUTO_INCREMENT counter on the first
+ha_innobase::open().
+
+@param[in,out] table persistent table
+@param[in] field the AUTO_INCREMENT column */
+static
+void
+initialize_auto_increment(dict_table_t* table, const Field* field)
+{
+ ut_ad(!table->is_temporary());
+
+ const unsigned col_no = innodb_col_no(field);
+
+ table->autoinc_mutex.lock();
+
+ table->persistent_autoinc = static_cast<uint16_t>(
+ dict_table_get_nth_col_pos(table, col_no, NULL) + 1)
+ & dict_index_t::MAX_N_FIELDS;
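+	/* persistent_autoinc is the 1-based position of the
+	AUTO_INCREMENT column in the clustered index (hence the + 1);
+	the code below treats 0 as "no persistent counter". */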
+
+ if (table->autoinc) {
+ /* Already initialized. Our caller checked
+ table->persistent_autoinc without
+ autoinc_mutex protection, and there might be multiple
+ ha_innobase::open() executing concurrently. */
+ } else if (srv_force_recovery > SRV_FORCE_NO_IBUF_MERGE) {
+ /* If the recovery level is set so high that writes
+ are disabled we force the AUTOINC counter to 0
+ value effectively disabling writes to the table.
+ Secondly, we avoid reading the table in case the read
+ results in failure due to a corrupted table/index.
+
+ We will not return an error to the client, so that the
+ tables can be dumped with minimal hassle. If an error
+ were returned in this case, the first attempt to read
+ the table would fail and subsequent SELECTs would succeed. */
+ } else if (table->persistent_autoinc) {
+ table->autoinc = innobase_next_autoinc(
+ btr_read_autoinc_with_fallback(table, col_no),
+ 1 /* need */,
+ 1 /* auto_increment_increment */,
+ 0 /* auto_increment_offset */,
+ innobase_get_int_col_max_value(field));
+ }
+
+ table->autoinc_mutex.unlock();
+}
+
+/** Open an InnoDB table
+@param[in] name table name
+@return error code
+@retval 0 on success */
+int
+ha_innobase::open(const char* name, int, uint)
+{
+ char norm_name[FN_REFLEN];
+
+ DBUG_ENTER("ha_innobase::open");
+
+ normalize_table_name(norm_name, name);
+
+ m_user_thd = NULL;
+
+ /* Will be allocated if it is needed in ::update_row() */
+ m_upd_buf = NULL;
+ m_upd_buf_size = 0;
+
+ char* is_part = is_partition(norm_name);
+ THD* thd = ha_thd();
+ dict_table_t* ib_table = open_dict_table(name, norm_name, is_part,
+ DICT_ERR_IGNORE_FK_NOKEY);
+
+ DEBUG_SYNC(thd, "ib_open_after_dict_open");
+
+ if (NULL == ib_table) {
+
+ if (is_part) {
+ sql_print_error("Failed to open table %s.\n",
+ norm_name);
+ }
+ set_my_errno(ENOENT);
+
+ DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+ }
+
+ size_t n_fields = omits_virtual_cols(*table_share)
+ ? table_share->stored_fields : table_share->fields;
+ size_t n_cols = dict_table_get_n_user_cols(ib_table)
+ + dict_table_get_n_v_cols(ib_table)
+ - !!DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID);
+
+ if (UNIV_UNLIKELY(n_cols != n_fields)) {
+ ib::warn() << "Table " << norm_name << " contains "
+ << n_cols << " user"
+ " defined columns in InnoDB, but " << n_fields
+ << " columns in MariaDB. Please check"
+ " INFORMATION_SCHEMA.INNODB_SYS_COLUMNS and"
+ " https://mariadb.com/kb/en/innodb-data-dictionary-troubleshooting/"
+ " for how to resolve the issue.";
+
+		/* Mark this table as corrupted, so that DROP TABLE
+		or forced recovery can still use it, but other
+		operations cannot. */
+ ib_table->file_unreadable = true;
+ ib_table->corrupted = true;
+ dict_table_close(ib_table, FALSE, FALSE);
+ set_my_errno(ENOENT);
+ DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
+ }
+
+ innobase_copy_frm_flags_from_table_share(ib_table, table->s);
+
+ MONITOR_INC(MONITOR_TABLE_OPEN);
+
+ if ((ib_table->flags2 & DICT_TF2_DISCARDED)) {
+
+ ib_senderrf(thd,
+ IB_LOG_LEVEL_WARN, ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+		/* Allow the open: a proper DISCARD should have set all
+		the flags and index root page numbers to FIL_NULL, which
+		should prevent any DML from running while still allowing
+		DDL operations. */
+ } else if (!ib_table->is_readable()) {
+ const fil_space_t* space = ib_table->space;
+ if (!space) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN,
+ ER_TABLESPACE_MISSING, norm_name);
+ }
+
+ if (!thd_tablespace_op(thd)) {
+ set_my_errno(ENOENT);
+ int ret_err = HA_ERR_TABLESPACE_MISSING;
+
+ if (space && space->crypt_data
+ && space->crypt_data->is_encrypted()) {
+ push_warning_printf(
+ thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_DECRYPTION_FAILED,
+ "Table %s in file %s is encrypted"
+ " but encryption service or"
+ " used key_id %u is not available. "
+					" used key_id %u is not available."
+					" Can't continue reading table.",
+ space->chain.start->name,
+ space->crypt_data->key_id);
+ ret_err = HA_ERR_DECRYPTION_FAILED;
+ }
+
+ dict_table_close(ib_table, FALSE, FALSE);
+ DBUG_RETURN(ret_err);
+ }
+ }
+
+ m_prebuilt = row_create_prebuilt(ib_table, table->s->reclength);
+
+ m_prebuilt->default_rec = table->s->default_values;
+ ut_ad(m_prebuilt->default_rec);
+
+ m_prebuilt->m_mysql_table = table;
+
+ /* Looks like MySQL-3.23 sometimes has primary key number != 0 */
+ m_primary_key = table->s->primary_key;
+
+ key_used_on_scan = m_primary_key;
+
+ if (ib_table->n_v_cols) {
+ mutex_enter(&dict_sys.mutex);
+ if (ib_table->vc_templ == NULL) {
+ ib_table->vc_templ = UT_NEW_NOKEY(dict_vcol_templ_t());
+ innobase_build_v_templ(
+ table, ib_table, ib_table->vc_templ, NULL,
+ true);
+ }
+
+ mutex_exit(&dict_sys.mutex);
+ }
+
+ if (!check_index_consistency(table, ib_table)) {
+		sql_print_error("InnoDB indexes are inconsistent with what is"
+				" defined in the .frm file for table %s",
+				name);
+ }
+
+ /* Allocate a buffer for a 'row reference'. A row reference is
+ a string of bytes of length ref_length which uniquely specifies
+ a row in our table. Note that MySQL may also compare two row
+ references for equality by doing a simple memcmp on the strings
+ of length ref_length! */
+ if (!(m_prebuilt->clust_index_was_generated
+ = dict_index_is_auto_gen_clust(ib_table->indexes.start))) {
+ if (m_primary_key >= MAX_KEY) {
+ ib_table->dict_frm_mismatch = DICT_FRM_NO_PK;
+
+			/* This mismatch could cause further problems
+			if left unattended; bring it to the user's attention
+			by printing a warning in addition to logging a
+			message in the error log */
+
+ ib_push_frm_error(thd, ib_table, table, 0, true);
+
+			/* If m_primary_key >= MAX_KEY, its value would be
+			out of bounds if we continued to index into the
+			key_info[] array. Find the InnoDB primary index,
+			and assign its key_length to ref_length.
+			In addition, since MySQL indexes are sorted starting
+			with the primary index, unique indexes etc.,
+			initialize ref_length to the first index key length
+			in case we fail to find the InnoDB clustered index.
+
+			Please note that this will not resolve the primary
+			index mismatch problem; other side effects are
+			possible if users continue to use the table.
+			However, we allow this table to be opened so
+			that the user can take the necessary measures for
+			the mismatch while the table data remains
+			accessible. */
+ if (!table->key_info) {
+ ut_ad(!table->s->keys);
+ ref_length = 0;
+ } else {
+ ref_length = table->key_info[0].key_length;
+ }
+
+			/* Find the corresponding clustered index
+			key length in MySQL's key_info[] array */
+ for (uint i = 0; i < table->s->keys; i++) {
+ dict_index_t* index;
+ index = innobase_get_index(i);
+ if (dict_index_is_clust(index)) {
+ ref_length =
+ table->key_info[i].key_length;
+ }
+ }
+ } else {
+ /* MySQL allocates the buffer for ref.
+ key_info->key_length includes space for all key
+ columns + one byte for each column that may be
+ NULL. ref_length must be as exact as possible to
+ save space, because all row reference buffers are
+ allocated based on ref_length. */
+
+ ref_length = table->key_info[m_primary_key].key_length;
+ }
+ } else {
+ if (m_primary_key != MAX_KEY) {
+
+ ib_table->dict_frm_mismatch = DICT_NO_PK_FRM_HAS;
+
+			/* This mismatch could cause further problems
+			if left unattended; bring it to the user's attention
+			by printing a warning in addition to logging a
+			message in the error log */
+ ib_push_frm_error(thd, ib_table, table, 0, true);
+ }
+
+ ref_length = DATA_ROW_ID_LEN;
+
+ /* If we automatically created the clustered index, then
+ MySQL does not know about it, and MySQL must NOT be aware
+ of the index used on scan, to make it avoid checking if we
+ update the column of the index. That is why we assert below
+ that key_used_on_scan is the undefined value MAX_KEY.
+		The column is the row id in the automatic generation case,
+ and it will never be updated anyway. */
+
+ if (key_used_on_scan != MAX_KEY) {
+ sql_print_warning(
+ "Table %s key_used_on_scan is %u even "
+ "though there is no primary key inside "
+ "InnoDB.", name, key_used_on_scan);
+ }
+ }
+
+ /* Index block size in InnoDB: used by MySQL in query optimization */
+ stats.block_size = static_cast<uint>(srv_page_size);
+
+ const my_bool for_vc_purge = THDVAR(thd, background_thread);
+
+ if (for_vc_purge || !m_prebuilt->table
+ || m_prebuilt->table->is_temporary()
+ || m_prebuilt->table->persistent_autoinc
+ || !m_prebuilt->table->is_readable()) {
+ } else if (const Field* ai = table->found_next_number_field) {
+ initialize_auto_increment(m_prebuilt->table, ai);
+ }
+
+ /* Set plugin parser for fulltext index */
+ for (uint i = 0; i < table->s->keys; i++) {
+ if (table->key_info[i].flags & HA_USES_PARSER) {
+ dict_index_t* index = innobase_get_index(i);
+ plugin_ref parser = table->key_info[i].parser;
+
+ ut_ad(index->type & DICT_FTS);
+ index->parser =
+ static_cast<st_mysql_ftparser *>(
+ plugin_decl(parser)->info);
+
+ DBUG_EXECUTE_IF("fts_instrument_use_default_parser",
+ index->parser = &fts_default_parser;);
+ }
+ }
+
+ ut_ad(!m_prebuilt->table
+ || table->versioned() == m_prebuilt->table->versioned());
+
+ if (!for_vc_purge) {
+ info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST
+ | HA_STATUS_OPEN);
+ }
+
+ DBUG_RETURN(0);
+}
+
+/** Convert MySQL column number to dict_table_t::cols[] offset.
+@param[in] field non-virtual column
+@return column number relative to dict_table_t::cols[] */
+unsigned
+innodb_col_no(const Field* field)
+{
+ ut_ad(!innobase_is_s_fld(field));
+ const TABLE* table = field->table;
+ unsigned col_no = 0;
+ ut_ad(field == table->field[field->field_index]);
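+	/* Count only the stored columns that precede this field. For
+	example, given t(a INT, v INT AS (a) VIRTUAL, b INT), field
+	'b' has field_index 2 but maps to cols[] offset 1, because
+	the virtual column 'v' occupies no slot in cols[]. */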
+ for (unsigned i = 0; i < field->field_index; i++) {
+ if (table->field[i]->stored_in_db()) {
+ col_no++;
+ }
+ }
+ return(col_no);
+}
+
+/** Opens a dictionary table object using the table name. For a partition,
+we need to try alternative lower/upper case names to support moving data
+files across platforms.
+@param[in] table_name name of the table/partition
+@param[in] norm_name normalized name of the table/partition
+@param[in] is_partition if this is a partition of a table
+@param[in] ignore_err error to ignore for loading dictionary object
+@return dictionary table object or NULL if not found */
+dict_table_t*
+ha_innobase::open_dict_table(
+ const char*
+#ifdef _WIN32
+ table_name
+#endif
+ ,
+ const char* norm_name,
+ bool is_partition,
+ dict_err_ignore_t ignore_err)
+{
+ DBUG_ENTER("ha_innobase::open_dict_table");
+ dict_table_t* ib_table = dict_table_open_on_name(norm_name, FALSE,
+ TRUE, ignore_err);
+
+ if (NULL == ib_table && is_partition) {
+		/* The MySQL partition engine hard-codes the file name
+		separator as "#P#". The text case is fixed even if
+		lower_case_table_names is set to 1 or 2. This is true
+		for sub-partition names as well. InnoDB always
+		normalises file names to lower case on Windows; this
+		can potentially cause problems when copying/moving
+		tables between platforms.
+
+		1) If we boot against an installation from a Windows
+		platform, its partition table names could be stored
+		in lower case in the system tables, so we need to
+		check the lower-case name when loading the table.
+
+		2) If we boot an installation from another
+		case-sensitive platform on Windows, we might need to
+		check for the existence of the table name without
+		lower-casing it in the system tables. */
+ if (innobase_get_lower_case_table_names() == 1) {
+ char par_case_name[FN_REFLEN];
+
+#ifndef _WIN32
+ /* Check for the table using lower
+ case name, including the partition
+ separator "P" */
+ strcpy(par_case_name, norm_name);
+ innobase_casedn_str(par_case_name);
+#else
+			/* On the Windows platform, check
+			whether there exists a table name in
+			the system tables whose name is
+			not normalized to lower case */
+ create_table_info_t::
+ normalize_table_name_low(
+ par_case_name,
+ table_name, FALSE);
+#endif
+ ib_table = dict_table_open_on_name(
+ par_case_name, FALSE, TRUE,
+ ignore_err);
+ }
+
+ if (ib_table != NULL) {
+#ifndef _WIN32
+ sql_print_warning("Partition table %s opened"
+ " after converting to lower"
+ " case. The table may have"
+					  " been moved from a"
+					  " case-insensitive file system."
+					  " Please recreate the table in"
+					  " the current file system\n",
+ norm_name);
+#else
+ sql_print_warning("Partition table %s opened"
+ " after skipping the step to"
+ " lower case the table name."
+ " The table may have been"
+					  " moved from a case-sensitive"
+					  " file system. Please"
+					  " recreate the table in the"
+					  " current file system\n",
+ norm_name);
+#endif
+ }
+ }
+
+ DBUG_RETURN(ib_table);
+}
+
+handler*
+ha_innobase::clone(
+/*===============*/
+ const char* name, /*!< in: table name */
+ MEM_ROOT* mem_root) /*!< in: memory context */
+{
+ DBUG_ENTER("ha_innobase::clone");
+
+ ha_innobase* new_handler = static_cast<ha_innobase*>(
+ handler::clone(m_prebuilt->table->name.m_name, mem_root));
+
+ if (new_handler != NULL) {
+ DBUG_ASSERT(new_handler->m_prebuilt != NULL);
+
+ new_handler->m_prebuilt->select_lock_type
+ = m_prebuilt->select_lock_type;
+ }
+
+ DBUG_RETURN(new_handler);
+}
+
+
+uint
+ha_innobase::max_supported_key_part_length() const
+/*==============================================*/
+{
+ /* A table format specific index column length check will be performed
+ at ha_innobase::add_index() and row_create_index_for_mysql() */
+ return(REC_VERSION_56_MAX_INDEX_COL_LEN);
+}
+
+/******************************************************************//**
+Closes a handle to an InnoDB table.
+@return 0 */
+
+int
+ha_innobase::close()
+/*================*/
+{
+ DBUG_ENTER("ha_innobase::close");
+
+ row_prebuilt_free(m_prebuilt, FALSE);
+
+ if (m_upd_buf != NULL) {
+ ut_ad(m_upd_buf_size != 0);
+ my_free(m_upd_buf);
+ m_upd_buf = NULL;
+ m_upd_buf_size = 0;
+ }
+
+ MONITOR_INC(MONITOR_TABLE_CLOSE);
+
+ DBUG_RETURN(0);
+}
+
+/* The following accessor functions should really be inside MySQL code! */
+
+#ifdef WITH_WSREP
+UNIV_INTERN
+ulint
+wsrep_innobase_mysql_sort(
+/*======================*/
+ /* out: str contains sort string */
+ int mysql_type, /* in: MySQL type */
+ uint charset_number, /* in: number of the charset */
+ unsigned char* str, /* in: data field */
+ ulint str_length, /* in: data field length,
+ not UNIV_SQL_NULL */
+ ulint buf_length) /* in: total str buffer length */
+
+{
+ CHARSET_INFO* charset;
+ enum_field_types mysql_tp;
+ ulint ret_length = str_length;
+
+ DBUG_ASSERT(str_length != UNIV_SQL_NULL);
+
+ mysql_tp = (enum_field_types) mysql_type;
+
+ switch (mysql_tp) {
+
+ case MYSQL_TYPE_BIT:
+ case MYSQL_TYPE_STRING:
+ case MYSQL_TYPE_VAR_STRING:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ case MYSQL_TYPE_VARCHAR:
+ {
+ uchar *tmp_str;
+ ulint tmp_length;
+
+ /* Use the charset number to pick the right charset struct for
+ the comparison. Since the MySQL function get_charset may be
+ slow before Bar removes the mutex operation there, we first
+ look at 2 common charsets directly. */
+
+ if (charset_number == default_charset_info->number) {
+ charset = default_charset_info;
+ } else if (charset_number == my_charset_latin1.number) {
+ charset = &my_charset_latin1;
+ } else {
+ charset = get_charset(charset_number, MYF(MY_WME));
+
+ if (charset == NULL) {
+ sql_print_error("InnoDB needs charset %lu for doing "
+ "a comparison, but MariaDB cannot "
+ "find that charset.",
+ (ulong) charset_number);
+ ut_a(0);
+ }
+ }
+
+		// Note that strnxfrm may change the length of the string
+ tmp_length= charset->coll->strnxfrmlen(charset, str_length);
+ tmp_length= ut_max(str_length, tmp_length) + 1;
+ tmp_str= static_cast<uchar *>(ut_malloc_nokey(tmp_length));
+ ut_ad(str_length <= tmp_length);
+ memcpy(tmp_str, str, str_length);
+
+ if (wsrep_protocol_version < 3) {
+ tmp_length = charset->strnxfrm(
+ str, str_length,
+ uint(str_length), tmp_str, tmp_length, 0);
+ DBUG_ASSERT(tmp_length <= str_length);
+ } else {
+			/* strnxfrm will expand the destination string:
+			protocols < 3 truncated the sorted string,
+			protocols >= 3 get the full sorted string
+			*/
+ tmp_length = charset->strnxfrm(
+ str, buf_length,
+ uint(str_length), tmp_str, str_length, 0);
+ DBUG_ASSERT(tmp_length <= buf_length);
+ ret_length = tmp_length;
+ }
+
+ ut_free(tmp_str);
+ break;
+ }
+ case MYSQL_TYPE_DECIMAL :
+ case MYSQL_TYPE_TINY :
+ case MYSQL_TYPE_SHORT :
+ case MYSQL_TYPE_LONG :
+ case MYSQL_TYPE_FLOAT :
+ case MYSQL_TYPE_DOUBLE :
+ case MYSQL_TYPE_NULL :
+ case MYSQL_TYPE_TIMESTAMP :
+ case MYSQL_TYPE_LONGLONG :
+ case MYSQL_TYPE_INT24 :
+ case MYSQL_TYPE_DATE :
+ case MYSQL_TYPE_TIME :
+ case MYSQL_TYPE_DATETIME :
+ case MYSQL_TYPE_YEAR :
+ case MYSQL_TYPE_NEWDATE :
+ case MYSQL_TYPE_NEWDECIMAL :
+ case MYSQL_TYPE_ENUM :
+ case MYSQL_TYPE_SET :
+ case MYSQL_TYPE_GEOMETRY :
+ break;
+ default:
+ break;
+ }
+
+ return ret_length;
+}
+#endif /* WITH_WSREP */
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+int
+innobase_fts_text_cmp(
+/*==================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: key */
+ const void* p2) /*!< in: node */
+{
+ const CHARSET_INFO* charset = (const CHARSET_INFO*) cs;
+ const fts_string_t* s1 = (const fts_string_t*) p1;
+ const fts_string_t* s2 = (const fts_string_t*) p2;
+
+ return(ha_compare_text(
+ charset, s1->f_str, static_cast<uint>(s1->f_len),
+ s2->f_str, static_cast<uint>(s2->f_len), 0));
+}
+
+/******************************************************************//**
+Compare two character strings case-insensitively according to their charset. */
+int
+innobase_fts_text_case_cmp(
+/*=======================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: key */
+ const void* p2) /*!< in: node */
+{
+ const CHARSET_INFO* charset = (const CHARSET_INFO*) cs;
+ const fts_string_t* s1 = (const fts_string_t*) p1;
+ const fts_string_t* s2 = (const fts_string_t*) p2;
+ ulint newlen;
+
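+	/* Note: the node string s2 is folded to lower case in place
+	before the comparison. */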
+ my_casedn_str(charset, (char*) s2->f_str);
+
+ newlen = strlen((const char*) s2->f_str);
+
+ return(ha_compare_text(
+ charset, s1->f_str, static_cast<uint>(s1->f_len),
+ s2->f_str, static_cast<uint>(newlen), 0));
+}
+
+/******************************************************************//**
+Get the first character's code position for FTS index partition. */
+ulint
+innobase_strnxfrm(
+/*==============*/
+ const CHARSET_INFO*
+ cs, /*!< in: Character set */
+ const uchar* str, /*!< in: string */
+ const ulint len) /*!< in: string length */
+{
+ uchar mystr[2];
+ ulint value;
+
+ if (!str || len == 0) {
+ return(0);
+ }
+
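+	/* Transform the beginning of the string into its collation
+	sort key; the leading weight selects the FTS index partition. */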
+ cs->strnxfrm((uchar*) mystr, 2, str, len);
+
+ value = mach_read_from_2(mystr);
+
+ if (value > 255) {
+ value = value / 256;
+ }
+
+ return(value);
+}
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: prefix key */
+ const void* p2) /*!< in: value to compare */
+{
+ const CHARSET_INFO* charset = (const CHARSET_INFO*) cs;
+ const fts_string_t* s1 = (const fts_string_t*) p1;
+ const fts_string_t* s2 = (const fts_string_t*) p2;
+ int result;
+
+ result = ha_compare_text(
+ charset, s2->f_str, static_cast<uint>(s2->f_len),
+ s1->f_str, static_cast<uint>(s1->f_len), 1);
+
+ /* We switched s1, s2 position in ha_compare_text. So we need
+ to negate the result */
+ return(-result);
+}
+
+/******************************************************************//**
+Makes all characters in a string lower case. */
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+ CHARSET_INFO* cs, /*!< in: Character set */
+ char* src, /*!< in: string to put in lower case */
+ size_t src_len,/*!< in: input string length */
+ char* dst, /*!< in: buffer for result string */
+ size_t dst_len)/*!< in: buffer size */
+{
+ if (cs->casedn_multiply == 1) {
+ memcpy(dst, src, src_len);
+ dst[src_len] = 0;
+ my_casedn_str(cs, dst);
+
+ return(strlen(dst));
+ } else {
+ return(cs->casedn(src, src_len, dst, dst_len));
+ }
+}
+
+#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
+
+#define misc_word_char(X) 0
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token.
+It is mostly copied from MyISAM's doc parsing function ft_simple_get_word()
+@return length of string processed */
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+ CHARSET_INFO* cs, /*!< in: Character set */
+ const byte* start, /*!< in: start of text */
+ const byte* end, /*!< in: one character past end of
+ text */
+ fts_string_t* token) /*!< out: token's text */
+{
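+	/* Skip leading non-word characters, then collect word
+	characters into *token. For example, scanning "  hello, world"
+	sets *token to "hello" and returns 7, the number of bytes
+	consumed through the end of that token. */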
+ int mbl;
+ const uchar* doc = start;
+
+ ut_a(cs);
+
+ token->f_n_char = token->f_len = 0;
+ token->f_str = NULL;
+
+ for (;;) {
+
+ if (doc >= end) {
+ return ulint(doc - start);
+ }
+
+ int ctype;
+
+ mbl = cs->ctype(&ctype, doc, (const uchar*) end);
+
+ if (true_word_char(ctype, *doc)) {
+ break;
+ }
+
+ doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
+ }
+
+ ulint mwc = 0;
+ ulint length = 0;
+
+ token->f_str = const_cast<byte*>(doc);
+
+ while (doc < end) {
+
+ int ctype;
+
+ mbl = cs->ctype(&ctype, (uchar*) doc, (uchar*) end);
+ if (true_word_char(ctype, *doc)) {
+ mwc = 0;
+ } else if (!misc_word_char(*doc) || mwc) {
+ break;
+ } else {
+ ++mwc;
+ }
+
+ ++length;
+
+ doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
+ }
+
+ token->f_len = (uint) (doc - token->f_str) - mwc;
+ token->f_n_char = length;
+
+ return ulint(doc - start);
+}
+
+/** Converts a MySQL type to an InnoDB type. Note that this function returns
+the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
+VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
+@param[out] unsigned_flag DATA_UNSIGNED if an 'unsigned type'; at least
+ENUM and SET, and unsigned integer types are 'unsigned types'
+@param[in]	field	MySQL Field
+@return DATA_BINARY, DATA_VARCHAR, ... */
+uint8_t
+get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field)
+{
+ /* The following asserts try to check that the MySQL type code fits in
+ 8 bits: this is used in ibuf and also when DATA_NOT_NULL is ORed to
+ the type */
+
+ static_assert(MYSQL_TYPE_STRING < 256, "compatibility");
+ static_assert(MYSQL_TYPE_VAR_STRING < 256, "compatibility");
+ static_assert(MYSQL_TYPE_DOUBLE < 256, "compatibility");
+ static_assert(MYSQL_TYPE_FLOAT < 256, "compatibility");
+ static_assert(MYSQL_TYPE_DECIMAL < 256, "compatibility");
+
+ if (field->flags & UNSIGNED_FLAG) {
+
+ *unsigned_flag = DATA_UNSIGNED;
+ } else {
+ *unsigned_flag = 0;
+ }
+
+ if (field->real_type() == MYSQL_TYPE_ENUM
+ || field->real_type() == MYSQL_TYPE_SET) {
+
+		/* MySQL reports field->type() as a string type for these, but the
+ data is actually internally stored as an unsigned integer
+ code! */
+
+ *unsigned_flag = DATA_UNSIGNED; /* MySQL has its own unsigned
+ flag set to zero, even though
+ internally this is an unsigned
+ integer type */
+ return(DATA_INT);
+ }
+
+ switch (field->type()) {
+ /* NOTE that we only allow string types in DATA_MYSQL and
+ DATA_VARMYSQL */
+ case MYSQL_TYPE_VAR_STRING: /* old <= 4.1 VARCHAR */
+ case MYSQL_TYPE_VARCHAR: /* new >= 5.0.3 true VARCHAR */
+ if (field->binary()) {
+ return(DATA_BINARY);
+ } else if (field->charset() == &my_charset_latin1) {
+ return(DATA_VARCHAR);
+ } else {
+ return(DATA_VARMYSQL);
+ }
+ case MYSQL_TYPE_BIT:
+ case MYSQL_TYPE_STRING:
+ if (field->binary() || field->key_type() == HA_KEYTYPE_BINARY) {
+ return(DATA_FIXBINARY);
+ } else if (field->charset() == &my_charset_latin1) {
+ return(DATA_CHAR);
+ } else {
+ return(DATA_MYSQL);
+ }
+ case MYSQL_TYPE_NEWDECIMAL:
+ return(DATA_FIXBINARY);
+ case MYSQL_TYPE_LONG:
+ case MYSQL_TYPE_LONGLONG:
+ case MYSQL_TYPE_TINY:
+ case MYSQL_TYPE_SHORT:
+ case MYSQL_TYPE_INT24:
+ case MYSQL_TYPE_DATE:
+ case MYSQL_TYPE_YEAR:
+ case MYSQL_TYPE_NEWDATE:
+ return(DATA_INT);
+ case MYSQL_TYPE_TIME:
+ case MYSQL_TYPE_DATETIME:
+ case MYSQL_TYPE_TIMESTAMP:
+ if (field->key_type() == HA_KEYTYPE_BINARY) {
+ return(DATA_FIXBINARY);
+ } else {
+ return(DATA_INT);
+ }
+ case MYSQL_TYPE_FLOAT:
+ return(DATA_FLOAT);
+ case MYSQL_TYPE_DOUBLE:
+ return(DATA_DOUBLE);
+ case MYSQL_TYPE_DECIMAL:
+ return(DATA_DECIMAL);
+ case MYSQL_TYPE_GEOMETRY:
+ return(DATA_GEOMETRY);
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ return(DATA_BLOB);
+ case MYSQL_TYPE_NULL:
+		/* MySQL currently accepts the "NULL" datatype, but will
+		reject such a datatype in the next release. We cope
+		with it and do not trigger an assertion failure in 5.1 */
+ break;
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Reads an unsigned integer value < 64k from 2 bytes, in the little-endian
+storage format.
+@return value */
+static inline
+uint
+innobase_read_from_2_little_endian(
+/*===============================*/
+ const uchar* buf) /*!< in: from where to read */
+{
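+	/* For example, buf[] = {0x34, 0x12} decodes to 0x1234. */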
+ return((uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1]))));
+}
+
+#ifdef WITH_WSREP
+/*******************************************************************//**
+Stores a key value for a row to a buffer.
+@return key value length as stored in buff */
+static
+uint16_t
+wsrep_store_key_val_for_row(
+/*=========================*/
+ THD* thd,
+ TABLE* table,
+ uint keynr, /*!< in: key number */
+ uchar* buff, /*!< in/out: buffer for the key value (in MySQL
+ format) */
+ uint buff_len,/*!< in: buffer length */
+ const uchar* record,
+ bool* key_is_null)/*!< out: full key was null */
+{
+ KEY* key_info = table->key_info + keynr;
+ KEY_PART_INFO* key_part = key_info->key_part;
+ KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
+ uchar* buff_start = buff;
+ enum_field_types mysql_type;
+ Field* field;
+ ulint buff_space = buff_len;
+
+ DBUG_ENTER("wsrep_store_key_val_for_row");
+
+ memset(buff, 0, buff_len);
+ *key_is_null = true;
+
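+	/* The output layout for each key part is an optional NULL
+	indicator byte followed by the (possibly truncated and
+	collation-sorted) column value. */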
+ for (; key_part != end; key_part++) {
+ uchar *sorted = nullptr;
+ bool part_is_null = false;
+
+ if (key_part->null_bit) {
+ if (buff_space > 0) {
+ if (record[key_part->null_offset]
+ & key_part->null_bit) {
+ *buff = 1;
+ part_is_null = true;
+ } else {
+ *buff = 0;
+ }
+ buff++;
+ buff_space--;
+ } else {
+ fprintf (stderr, "WSREP: key truncated: %s\n",
+ wsrep_thd_query(thd));
+ }
+ }
+ if (!part_is_null) *key_is_null = false;
+
+ field = key_part->field;
+ mysql_type = field->type();
+
+ if (mysql_type == MYSQL_TYPE_VARCHAR) {
+ /* >= 5.0.3 true VARCHAR */
+ ulint lenlen;
+ ulint len;
+ const byte* data;
+ ulint key_len;
+ ulint true_len;
+ const CHARSET_INFO* cs;
+ int error=0;
+
+ key_len = key_part->length;
+
+ if (part_is_null) {
+ true_len = key_len + 2;
+ if (true_len > buff_space) {
+ fprintf (stderr,
+ "WSREP: key truncated: %s\n",
+ wsrep_thd_query(thd));
+ true_len = buff_space;
+ }
+ buff += true_len;
+ buff_space -= true_len;
+ continue;
+ }
+ cs = field->charset();
+
+ lenlen = (ulint)
+ (((Field_varstring*)field)->length_bytes);
+
+ data = row_mysql_read_true_varchar(&len,
+ (byte*) (record
+ + (ulint)get_field_offset(table, field)),
+ lenlen);
+
+ true_len = len;
+
+ /* For multi byte character sets we need to calculate
+ the true length of the key */
+
+ if (len > 0 && cs->mbmaxlen > 1) {
+ true_len = (ulint) my_well_formed_length(cs,
+ (const char *) data,
+ (const char *) data + len,
+ (uint) (key_len /
+ cs->mbmaxlen),
+ &error);
+ }
+
+ /* In a column prefix index, we may need to truncate
+ the stored value: */
+
+ if (true_len > key_len) {
+ true_len = key_len;
+ }
+
+ const ulint max_len = true_len;
+ sorted= static_cast<uchar *>(ut_malloc_nokey(max_len+1));
+ memcpy(sorted, data, true_len);
+ true_len = wsrep_innobase_mysql_sort(
+ mysql_type, cs->number, sorted, true_len,
+ max_len);
+ ut_ad(true_len <= max_len);
+
+ if (wsrep_protocol_version > 1) {
+ /* Note that we always reserve the maximum possible
+ length of the true VARCHAR in the key value, though
+ only len first bytes after the 2 length bytes contain
+				actual data. The rest of the space was reset to zero
+				in the memset() call above. */
+ if (true_len > buff_space) {
+ fprintf (stderr,
+ "WSREP: key truncated: %s\n",
+ wsrep_thd_query(thd));
+ true_len = buff_space;
+ }
+ memcpy(buff, sorted, true_len);
+ buff += true_len;
+ buff_space -= true_len;
+ } else {
+ buff += key_len;
+ }
+ } else if (mysql_type == MYSQL_TYPE_TINY_BLOB
+ || mysql_type == MYSQL_TYPE_MEDIUM_BLOB
+ || mysql_type == MYSQL_TYPE_BLOB
+ || mysql_type == MYSQL_TYPE_LONG_BLOB
+ /* MYSQL_TYPE_GEOMETRY data is treated
+ as BLOB data in innodb. */
+ || mysql_type == MYSQL_TYPE_GEOMETRY) {
+
+ const CHARSET_INFO* cs;
+ ulint key_len;
+ ulint true_len;
+ int error=0;
+ ulint blob_len;
+ const byte* blob_data;
+
+ ut_a(key_part->key_part_flag & HA_PART_KEY_SEG);
+
+ key_len = key_part->length;
+
+ if (part_is_null) {
+ true_len = key_len + 2;
+ if (true_len > buff_space) {
+ fprintf (stderr,
+ "WSREP: key truncated: %s\n",
+ wsrep_thd_query(thd));
+ true_len = buff_space;
+ }
+ buff += true_len;
+ buff_space -= true_len;
+
+ continue;
+ }
+
+ cs = field->charset();
+
+ blob_data = row_mysql_read_blob_ref(&blob_len,
+ (byte*) (record
+ + (ulint)get_field_offset(table, field)),
+ (ulint) field->pack_length());
+
+ true_len = blob_len;
+
+ ut_a(get_field_offset(table, field)
+ == key_part->offset);
+
+ /* For multi byte character sets we need to calculate
+ the true length of the key */
+
+ if (blob_len > 0 && cs->mbmaxlen > 1) {
+ true_len = (ulint) my_well_formed_length(cs,
+ (const char *) blob_data,
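+	/* A hidden FTS_DOC_ID column exists only inside InnoDB, so
+	it is subtracted before comparing with the .frm column count. */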
+ (const char *) blob_data
+ + blob_len,
+ (uint) (key_len /
+ cs->mbmaxlen),
+ &error);
+ }
+
+ /* All indexes on BLOB and TEXT are column prefix
+ indexes, and we may need to truncate the data to be
+ stored in the key value: */
+
+ if (true_len > key_len) {
+ true_len = key_len;
+ }
+
+ const ulint max_len= true_len;
+ sorted= static_cast<uchar *>(ut_malloc_nokey(max_len+1));
+ memcpy(sorted, blob_data, true_len);
+ true_len = wsrep_innobase_mysql_sort(
+ mysql_type, cs->number, sorted, true_len,
+ max_len);
+ ut_ad(true_len <= max_len);
+
+ /* Note that we always reserve the maximum possible
+ length of the BLOB prefix in the key value. */
+			if (wsrep_protocol_version > 1) {
+				if (true_len > buff_space) {
+					fprintf (stderr,
+						 "WSREP: key truncated: %s\n",
+						 wsrep_thd_query(thd));
+					true_len = buff_space;
+				}
+				/* Copy the sorted prefix, then advance
+				the write position. */
+				memcpy(buff, sorted, true_len);
+				buff += true_len;
+				buff_space -= true_len;
+			} else {
+				buff += key_len;
+			}
+ } else {
+ /* Here we handle all other data types except the
+ true VARCHAR, BLOB and TEXT. Note that the column
+ value we store may be also in a column prefix
+ index. */
+
+ const CHARSET_INFO* cs = NULL;
+ ulint true_len;
+ ulint key_len;
+ const uchar* src_start;
+ int error=0;
+ enum_field_types real_type;
+
+ key_len = key_part->length;
+
+ if (part_is_null) {
+ true_len = key_len;
+ if (true_len > buff_space) {
+ fprintf (stderr,
+ "WSREP: key truncated: %s\n",
+ wsrep_thd_query(thd));
+ true_len = buff_space;
+ }
+ buff += true_len;
+ buff_space -= true_len;
+
+ continue;
+ }
+
+ src_start = record + key_part->offset;
+ real_type = field->real_type();
+ true_len = key_len;
+
+ /* Character set for the field is defined only
+ to fields whose type is string and real field
+ type is not enum or set. For these fields check
+ if character set is multi byte. */
+
+ if (real_type != MYSQL_TYPE_ENUM
+ && real_type != MYSQL_TYPE_SET
+ && ( mysql_type == MYSQL_TYPE_VAR_STRING
+ || mysql_type == MYSQL_TYPE_STRING)) {
+
+ cs = field->charset();
+
+ /* For multi-byte character sets we need to
+ calculate the true length of the key */
+
+ if (key_len > 0 && cs->mbmaxlen > 1) {
+
+ true_len = (ulint)
+ my_well_formed_length(cs,
+ (const char *)src_start,
+ (const char *)src_start
+ + key_len,
+ (uint) (key_len /
+ cs->mbmaxlen),
+ &error);
+ }
+
+ const ulint max_len = true_len;
+ sorted= static_cast<uchar *>(ut_malloc_nokey(max_len+1));
+ memcpy(sorted, src_start, true_len);
+ true_len = wsrep_innobase_mysql_sort(
+ mysql_type, cs->number, sorted, true_len,
+ max_len);
+ ut_ad(true_len <= max_len);
+
+ if (true_len > buff_space) {
+ fprintf (stderr,
+ "WSREP: key truncated: %s\n",
+ wsrep_thd_query(thd));
+ true_len = buff_space;
+ }
+ memcpy(buff, sorted, true_len);
+ } else {
+ memcpy(buff, src_start, true_len);
+ }
+ buff += true_len;
+ buff_space -= true_len;
+ }
+
+ if (sorted) {
+ ut_free(sorted);
+ sorted= NULL;
+ }
+ }
+
+ ut_a(buff <= buff_start + buff_len);
+
+ DBUG_RETURN(static_cast<uint16_t>(buff - buff_start));
+}
+#endif /* WITH_WSREP */
+/**************************************************************//**
+Determines if a field is needed in a m_prebuilt struct 'template'.
+@return field to use, or NULL if the field is not needed */
+static
+const Field*
+build_template_needs_field(
+/*=======================*/
+ bool index_contains, /*!< in:
+ dict_index_t::contains_col_or_prefix(
+ i) */
+ bool read_just_key, /*!< in: TRUE when MySQL calls
+ ha_innobase::extra with the
+ argument HA_EXTRA_KEYREAD; it is enough
+ to read just columns defined in
+ the index (i.e., no read of the
+ clustered index record necessary) */
+ bool fetch_all_in_key,
+ /*!< in: true=fetch all fields in
+ the index */
+ bool fetch_primary_key_cols,
+ /*!< in: true=fetch the
+ primary key columns */
+ dict_index_t* index, /*!< in: InnoDB index to use */
+ const TABLE* table, /*!< in: MySQL table object */
+ ulint i, /*!< in: field index in InnoDB table */
+ ulint num_v) /*!< in: number of virtual columns so far */
+{
+ const Field* field = table->field[i];
+
+ if (!field->stored_in_db()
+ && ha_innobase::omits_virtual_cols(*table->s)) {
+ return NULL;
+ }
+
+ if (!index_contains) {
+ if (read_just_key) {
+ /* If this is a 'key read', we do not need
+ columns that are not in the key */
+
+ return(NULL);
+ }
+ } else if (fetch_all_in_key) {
+ /* This field is needed in the query */
+
+ return(field);
+ }
+
+ if (bitmap_is_set(table->read_set, static_cast<uint>(i))
+ || bitmap_is_set(table->write_set, static_cast<uint>(i))) {
+ /* This field is needed in the query */
+
+ return(field);
+ }
+
+ ut_ad(i >= num_v);
+ if (fetch_primary_key_cols
+ && dict_table_col_in_clustered_key(index->table, i - num_v)) {
+ /* This field is needed in the query */
+ return(field);
+ }
+
+ /* This field is not needed in the query, skip it */
+
+ return(NULL);
+}
+
+/**************************************************************//**
+Determines if a field is needed in a m_prebuilt struct 'template'.
+@return whether the field is needed for index condition pushdown */
+inline
+bool
+build_template_needs_field_in_icp(
+/*==============================*/
+ const dict_index_t* index, /*!< in: InnoDB index */
+ const row_prebuilt_t* prebuilt,/*!< in: row fetch template */
+ bool contains,/*!< in: whether the index contains
+ column i */
+ ulint i, /*!< in: column number */
+ bool is_virtual)
+ /*!< in: a virtual column or not */
+{
+ ut_ad(contains == index->contains_col_or_prefix(i, is_virtual));
+
+ return(index == prebuilt->index
+ ? contains
+ : prebuilt->index->contains_col_or_prefix(i, is_virtual));
+}
+
+/**************************************************************//**
+Adds a field to a m_prebuilt struct 'template'.
+@return the field template */
+static
+mysql_row_templ_t*
+build_template_field(
+/*=================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: template */
+ dict_index_t* clust_index, /*!< in: InnoDB clustered index */
+ dict_index_t* index, /*!< in: InnoDB index to use */
+ TABLE* table, /*!< in: MySQL table object */
+ const Field* field, /*!< in: field in MySQL table */
+ ulint i, /*!< in: field index in InnoDB table */
+ ulint v_no) /*!< in: field index for virtual col */
+{
+ mysql_row_templ_t* templ;
+ const dict_col_t* col;
+
+ ut_ad(clust_index->table == index->table);
+
+ templ = prebuilt->mysql_template + prebuilt->n_template++;
+ MEM_UNDEFINED(templ, sizeof *templ);
+ templ->rec_field_is_prefix = FALSE;
+ templ->rec_prefix_field_no = ULINT_UNDEFINED;
+ templ->is_virtual = !field->stored_in_db();
+
+ if (!templ->is_virtual) {
+ templ->col_no = i;
+ col = dict_table_get_nth_col(index->table, i);
+ templ->clust_rec_field_no = dict_col_get_clust_pos(
+ col, clust_index);
+ /* If the clustered index record field is not found, let us print
+ out the field names and related info to understand why. */
+ if (templ->clust_rec_field_no == ULINT_UNDEFINED) {
+ const char* tb_col_name = dict_table_get_col_name(clust_index->table, i);
+ dict_field_t* field=NULL;
+ size_t size = 0;
+
+ for(ulint j=0; j < clust_index->n_user_defined_cols; j++) {
+ dict_field_t* ifield = &(clust_index->fields[j]);
+ if (ifield && !memcmp(tb_col_name, ifield->name,
+ strlen(tb_col_name))) {
+ field = ifield;
+ break;
+ }
+ }
+
+ ib::info() << "Looking for field " << i << " name "
+ << (tb_col_name ? tb_col_name : "NULL")
+ << " from table " << clust_index->table->name;
+
+
+ for(ulint j=0; j < clust_index->n_user_defined_cols; j++) {
+ dict_field_t* ifield = &(clust_index->fields[j]);
+ ib::info() << "InnoDB Table "
+ << clust_index->table->name
+ << "field " << j << " name "
+ << (ifield ? ifield->name() : "NULL");
+ }
+
+ for(ulint j=0; j < table->s->stored_fields; j++) {
+ ib::info() << "MySQL table "
+ << table->s->table_name.str
+ << " field " << j << " name "
+ << table->field[j]->field_name.str;
+ }
+
+ ib::fatal() << "Clustered record field for column " << i
+ << " not found table n_user_defined "
+ << clust_index->n_user_defined_cols
+ << " index n_user_defined "
+ << clust_index->table->n_cols - DATA_N_SYS_COLS
+ << " InnoDB table "
+ << clust_index->table->name
+ << " field name "
+ << (field ? field->name() : "NULL")
+ << " MySQL table "
+ << table->s->table_name.str
+ << " field name "
+ << (tb_col_name ? tb_col_name : "NULL")
+ << " n_fields "
+ << table->s->stored_fields
+ << " query "
+ << innobase_get_stmt_unsafe(current_thd, &size);
+ }
+
+ if (dict_index_is_clust(index)) {
+ templ->rec_field_no = templ->clust_rec_field_no;
+ } else {
+ /* If we're in a secondary index, keep track
+ of the original index position even if this
+ is just a prefix index; we will use this
+ later to avoid a clustered index lookup in
+ some cases. */
+
+ templ->rec_field_no = dict_index_get_nth_col_pos(index, i,
+ &templ->rec_prefix_field_no);
+ }
+ } else {
+ DBUG_ASSERT(!ha_innobase::omits_virtual_cols(*table->s));
+ col = &dict_table_get_nth_v_col(index->table, v_no)->m_col;
+ templ->clust_rec_field_no = v_no;
+
+ if (dict_index_is_clust(index)) {
+ templ->rec_field_no = templ->clust_rec_field_no;
+ } else {
+ templ->rec_field_no
+ = dict_index_get_nth_col_or_prefix_pos(
+ index, v_no, FALSE, true,
+ &templ->rec_prefix_field_no);
+ }
+ templ->icp_rec_field_no = ULINT_UNDEFINED;
+ }
+
+ if (field->real_maybe_null()) {
+ templ->mysql_null_byte_offset =
+ field->null_offset();
+
+ templ->mysql_null_bit_mask = (ulint) field->null_bit;
+ } else {
+ templ->mysql_null_bit_mask = 0;
+ }
+
+
+ templ->mysql_col_offset = (ulint) get_field_offset(table, field);
+ templ->mysql_col_len = (ulint) field->pack_length();
+ templ->type = col->mtype;
+ templ->mysql_type = (ulint) field->type();
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+ templ->mysql_length_bytes = (ulint)
+ (((Field_varstring*) field)->length_bytes);
+ } else {
+ templ->mysql_length_bytes = 0;
+ }
+
+ templ->charset = dtype_get_charset_coll(col->prtype);
+ templ->mbminlen = dict_col_get_mbminlen(col);
+ templ->mbmaxlen = dict_col_get_mbmaxlen(col);
+ templ->is_unsigned = col->prtype & DATA_UNSIGNED;
+
+ if (!dict_index_is_clust(index)
+ && templ->rec_field_no == ULINT_UNDEFINED) {
+ prebuilt->need_to_access_clustered = TRUE;
+
+ if (templ->rec_prefix_field_no != ULINT_UNDEFINED) {
+ dict_field_t* field = dict_index_get_nth_field(
+ index,
+ templ->rec_prefix_field_no);
+ templ->rec_field_is_prefix = (field->prefix_len != 0);
+ }
+ }
+
+ /* For spatial index, we need to access cluster index. */
+ if (dict_index_is_spatial(index)) {
+ prebuilt->need_to_access_clustered = TRUE;
+ }
+
+ if (prebuilt->mysql_prefix_len < templ->mysql_col_offset
+ + templ->mysql_col_len) {
+ prebuilt->mysql_prefix_len = templ->mysql_col_offset
+ + templ->mysql_col_len;
+ }
+
+ if (DATA_LARGE_MTYPE(templ->type)) {
+ prebuilt->templ_contains_blob = TRUE;
+ }
+
+ return(templ);
+}
+
+/**************************************************************//**
+Builds a 'template' in the m_prebuilt struct. The template is used for fast
+retrieval of just those column values MySQL needs in its processing. */
+
+void
+ha_innobase::build_template(
+/*========================*/
+ bool whole_row) /*!< in: true=ROW_MYSQL_WHOLE_ROW,
+ false=ROW_MYSQL_REC_FIELDS */
+{
+ dict_index_t* index;
+ dict_index_t* clust_index;
+ ibool fetch_all_in_key = FALSE;
+ ibool fetch_primary_key_cols = FALSE;
+
+ if (m_prebuilt->select_lock_type == LOCK_X || m_prebuilt->table->no_rollback()) {
+ /* We always retrieve the whole clustered index record if we
+ use exclusive row level locks, for example, if the read is
+ done in an UPDATE statement or if we are using a no rollback
+ table */
+
+ whole_row = true;
+ } else if (!whole_row) {
+ if (m_prebuilt->hint_need_to_fetch_extra_cols
+ == ROW_RETRIEVE_ALL_COLS) {
+
+ /* We know we must at least fetch all columns in the
+ key, or all columns in the table */
+
+ if (m_prebuilt->read_just_key) {
+ /* MySQL has instructed us that it is enough
+ to fetch the columns in the key; looks like
+ MySQL can set this flag also when there is
+ only a prefix of the column in the key: in
+ that case we retrieve the whole column from
+ the clustered index */
+
+ fetch_all_in_key = TRUE;
+ } else {
+ whole_row = true;
+ }
+ } else if (m_prebuilt->hint_need_to_fetch_extra_cols
+ == ROW_RETRIEVE_PRIMARY_KEY) {
+ /* We must at least fetch all primary key cols. Note
+ that if the clustered index was internally generated
+ by InnoDB on the row id (no primary key was
+ defined), then row_search_for_mysql() will always
+ retrieve the row id to a special buffer in the
+ m_prebuilt struct. */
+
+ fetch_primary_key_cols = TRUE;
+ }
+ }
+
+ clust_index = dict_table_get_first_index(m_prebuilt->table);
+
+ index = whole_row ? clust_index : m_prebuilt->index;
+
+ m_prebuilt->versioned_write = table->versioned_write(VERS_TRX_ID);
+ m_prebuilt->need_to_access_clustered = (index == clust_index);
+
+ /* Either m_prebuilt->index should be a secondary index, or it
+ should be the clustered index. */
+ ut_ad(dict_index_is_clust(index) == (index == clust_index));
+
+ /* Below we check column by column if we need to access
+ the clustered index. */
+
+ if (pushed_rowid_filter && rowid_filter_is_active) {
+ fetch_primary_key_cols = TRUE;
+ m_prebuilt->pk_filter = this;
+ } else {
+ m_prebuilt->pk_filter = NULL;
+ }
+
+ const bool skip_virtual = omits_virtual_cols(*table_share);
+ const ulint n_fields = table_share->fields;
+
+ if (!m_prebuilt->mysql_template) {
+ m_prebuilt->mysql_template = (mysql_row_templ_t*)
+ ut_malloc_nokey(n_fields * sizeof(mysql_row_templ_t));
+ }
+
+ m_prebuilt->template_type = whole_row
+ ? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS;
+ m_prebuilt->null_bitmap_len = table->s->null_bytes
+ & dict_index_t::MAX_N_FIELDS;
+
+ /* Prepare to build m_prebuilt->mysql_template[]. */
+ m_prebuilt->templ_contains_blob = FALSE;
+ m_prebuilt->mysql_prefix_len = 0;
+ m_prebuilt->n_template = 0;
+ m_prebuilt->idx_cond_n_cols = 0;
+
+ /* Note that in InnoDB, i is the column number in the table.
+ MySQL calls columns 'fields'. */
+
+ ulint num_v = 0;
+
+ if ((active_index != MAX_KEY
+ && active_index == pushed_idx_cond_keyno)
+ || (pushed_rowid_filter && rowid_filter_is_active)) {
+ /* Push down an index condition or an end_range check. */
+ for (ulint i = 0; i < n_fields; i++) {
+ const Field* field = table->field[i];
+ const bool is_v = !field->stored_in_db();
+ if (is_v && skip_virtual) {
+ num_v++;
+ continue;
+ }
+ bool index_contains = index->contains_col_or_prefix(
+ is_v ? num_v : i - num_v, is_v);
+ if (is_v && index_contains) {
+ m_prebuilt->n_template = 0;
+ num_v = 0;
+ goto no_icp;
+ }
+
+ /* Test if an end_range or an index condition
+ refers to the field. Note that "index" and
+ "index_contains" may refer to the clustered index.
+ Index condition pushdown is relative to
+ m_prebuilt->index (the index that is being
+ looked up first). */
+
+ /* When join_read_always_key() invokes this
+ code via handler::ha_index_init() and
+ ha_innobase::index_init(), end_range is not
+ yet initialized. Because of that, we must
+ always check for index_contains, instead of
+ the subset
+ field->part_of_key.is_set(active_index)
+ which would be acceptable if end_range==NULL. */
+ if (build_template_needs_field_in_icp(
+ index, m_prebuilt, index_contains,
+ is_v ? num_v : i - num_v, is_v)) {
+ if (!whole_row) {
+ field = build_template_needs_field(
+ index_contains,
+ m_prebuilt->read_just_key,
+ fetch_all_in_key,
+ fetch_primary_key_cols,
+ index, table, i, num_v);
+ if (!field) {
+ if (is_v) {
+ num_v++;
+ }
+ continue;
+ }
+ }
+
+ ut_ad(!is_v);
+
+ mysql_row_templ_t* templ= build_template_field(
+ m_prebuilt, clust_index, index,
+ table, field, i - num_v, 0);
+
+ ut_ad(!templ->is_virtual);
+
+ m_prebuilt->idx_cond_n_cols++;
+ ut_ad(m_prebuilt->idx_cond_n_cols
+ == m_prebuilt->n_template);
+
+ if (index == m_prebuilt->index) {
+ templ->icp_rec_field_no
+ = templ->rec_field_no;
+ } else {
+ templ->icp_rec_field_no
+ = dict_index_get_nth_col_pos(
+ m_prebuilt->index,
+ i - num_v,
+ &templ->rec_prefix_field_no);
+ }
+
+ if (dict_index_is_clust(m_prebuilt->index)) {
+ ut_ad(templ->icp_rec_field_no
+ != ULINT_UNDEFINED);
+ /* If the primary key includes
+ a column prefix, use it in
+ index condition pushdown,
+ because the condition is
+ evaluated before fetching any
+ off-page (externally stored)
+ columns. */
+ if (templ->icp_rec_field_no
+ < m_prebuilt->index->n_uniq) {
+ /* This is a key column;
+ all set. */
+ continue;
+ }
+ } else if (templ->icp_rec_field_no
+ != ULINT_UNDEFINED) {
+ continue;
+ }
+
+ /* This is a column prefix index.
+ The column prefix can be used in
+ an end_range comparison. */
+
+ templ->icp_rec_field_no
+ = dict_index_get_nth_col_or_prefix_pos(
+ m_prebuilt->index, i - num_v,
+ true, false,
+ &templ->rec_prefix_field_no);
+ ut_ad(templ->icp_rec_field_no
+ != ULINT_UNDEFINED);
+
+ /* Index condition pushdown can be used on
+ all columns of a secondary index, and on
+ the PRIMARY KEY columns. On the clustered
+ index, it must never be used on other than
+ PRIMARY KEY columns, because those columns
+ may be stored off-page, and we will not
+ fetch externally stored columns before
+ checking the index condition. */
+ /* TODO: test the above with an assertion
+ like this. Note that index conditions are
+ currently pushed down as part of the
+ "optimizer phase" while end_range is done
+ as part of the execution phase. Therefore,
+ we were unable to use an accurate condition
+ for end_range in the "if" condition above,
+ and the following assertion would fail.
+ ut_ad(!dict_index_is_clust(m_prebuilt->index)
+ || templ->rec_field_no
+ < m_prebuilt->index->n_uniq);
+ */
+ }
+
+ if (is_v) {
+ num_v++;
+ }
+ }
+
+ ut_ad(m_prebuilt->idx_cond_n_cols > 0);
+ ut_ad(m_prebuilt->idx_cond_n_cols == m_prebuilt->n_template);
+
+ num_v = 0;
+
+ /* Include the fields that are not needed in index condition
+ pushdown. */
+ for (ulint i = 0; i < n_fields; i++) {
+ const Field* field = table->field[i];
+ const bool is_v = !field->stored_in_db();
+ if (is_v && skip_virtual) {
+ num_v++;
+ continue;
+ }
+
+ bool index_contains = index->contains_col_or_prefix(
+ is_v ? num_v : i - num_v, is_v);
+
+ if (!build_template_needs_field_in_icp(
+ index, m_prebuilt, index_contains,
+ is_v ? num_v : i - num_v, is_v)) {
+ /* Not needed in ICP */
+ if (!whole_row) {
+ field = build_template_needs_field(
+ index_contains,
+ m_prebuilt->read_just_key,
+ fetch_all_in_key,
+ fetch_primary_key_cols,
+ index, table, i, num_v);
+ if (!field) {
+ if (is_v) {
+ num_v++;
+ }
+ continue;
+ }
+ }
+
+ ut_d(mysql_row_templ_t* templ =)
+ build_template_field(
+ m_prebuilt, clust_index, index,
+ table, field, i - num_v, num_v);
+ ut_ad(templ->is_virtual == (ulint)is_v);
+
+ if (is_v) {
+ num_v++;
+ }
+ }
+ }
+ if (active_index == pushed_idx_cond_keyno) {
+ m_prebuilt->idx_cond = this;
+ }
+ } else {
+no_icp:
+ /* No index condition pushdown */
+ m_prebuilt->idx_cond = NULL;
+ ut_ad(num_v == 0);
+
+ for (ulint i = 0; i < n_fields; i++) {
+ const Field* field = table->field[i];
+ const bool is_v = !field->stored_in_db();
+
+ if (whole_row) {
+ if (is_v && skip_virtual) {
+ num_v++;
+ continue;
+ }
+ /* Even if this is whole_row: when the search
+ is on a virtual column, read_just_key is
+ set, and the field is not in this index, we
+ will not try to fill in the value, since it
+ is stored neither in this index nor in the
+ clustered index. */
+ if (is_v
+ && m_prebuilt->read_just_key
+ && !m_prebuilt->index->contains_col_or_prefix(
+ num_v, true))
+ {
+ /* Turn off ROW_MYSQL_WHOLE_ROW */
+ m_prebuilt->template_type =
+ ROW_MYSQL_REC_FIELDS;
+ num_v++;
+ continue;
+ }
+ } else {
+ if (is_v
+ && (skip_virtual || index->is_primary())) {
+ num_v++;
+ continue;
+ }
+
+ bool contain = index->contains_col_or_prefix(
+ is_v ? num_v: i - num_v, is_v);
+
+ field = build_template_needs_field(
+ contain,
+ m_prebuilt->read_just_key,
+ fetch_all_in_key,
+ fetch_primary_key_cols,
+ index, table, i, num_v);
+ if (!field) {
+ if (is_v) {
+ num_v++;
+ }
+ continue;
+ }
+ }
+
+ ut_d(mysql_row_templ_t* templ =)
+ build_template_field(
+ m_prebuilt, clust_index, index,
+ table, field, i - num_v, num_v);
+ ut_ad(templ->is_virtual == (ulint)is_v);
+ if (is_v) {
+ num_v++;
+ }
+ }
+ }
+
+ if (index != clust_index && m_prebuilt->need_to_access_clustered) {
+ /* Change rec_field_no's to correspond to the clustered index
+ record */
+ for (ulint i = 0; i < m_prebuilt->n_template; i++) {
+ mysql_row_templ_t* templ
+ = &m_prebuilt->mysql_template[i];
+
+ templ->rec_field_no = templ->clust_rec_field_no;
+ }
+ }
+}
+
+/********************************************************************//**
+This special handling is really to overcome the limitations of MySQL's
+binlogging. We need to eliminate the non-determinism that will arise in
+INSERT ... SELECT type of statements, since MySQL binlog only stores the
+min value of the autoinc interval. Once that is fixed we can get rid of
+the special lock handling.
+@return DB_SUCCESS if all OK else error code */
+
+dberr_t
+ha_innobase::innobase_lock_autoinc(void)
+/*====================================*/
+{
+ DBUG_ENTER("ha_innobase::innobase_lock_autoinc");
+ dberr_t error = DB_SUCCESS;
+
+ ut_ad(!srv_read_only_mode);
+
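+ /* A sketch of the three lock modes (assuming the usual
+ mapping of innodb_autoinc_lock_mode values 0/1/2):
+ 0 = AUTOINC_OLD_STYLE_LOCKING: take a table-level AUTO-INC
+ lock that is held to the end of the statement;
+ 1 = AUTOINC_NEW_STYLE_LOCKING: use only the mutex for simple
+ INSERTs, falling back to the table lock when another
+ transaction already holds the AUTO-INC lock;
+ 2 = AUTOINC_NO_LOCKING: use only the mutex, never the
+ table lock. */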
+ switch (innobase_autoinc_lock_mode) {
+ case AUTOINC_NO_LOCKING:
+ /* Acquire only the AUTOINC mutex. */
+ m_prebuilt->table->autoinc_mutex.lock();
+ break;
+
+ case AUTOINC_NEW_STYLE_LOCKING:
+ /* For simple (single/multi) row INSERTs/REPLACEs and RBR
+ events, we fallback to the old style only if another
+ transaction has already acquired the AUTOINC lock on
+ behalf of a LOAD FILE or INSERT ... SELECT etc. type of
+ statement. */
+ switch (thd_sql_command(m_user_thd)) {
+ case SQLCOM_INSERT:
+ case SQLCOM_REPLACE:
+ case SQLCOM_END: // RBR event
+ /* Acquire the AUTOINC mutex. */
+ m_prebuilt->table->autoinc_mutex.lock();
+ /* We need to check that another transaction isn't
+ already holding the AUTOINC lock on the table. */
+ if (!m_prebuilt->table->n_waiting_or_granted_auto_inc_locks) {
+ /* Do not fall back to old style locking. */
+ DBUG_RETURN(error);
+ }
+ m_prebuilt->table->autoinc_mutex.unlock();
+ }
+ /* Use old style locking. */
+ /* fall through */
+ case AUTOINC_OLD_STYLE_LOCKING:
+ DBUG_EXECUTE_IF("die_if_autoinc_old_lock_style_used",
+ ut_ad(0););
+ error = row_lock_table_autoinc_for_mysql(m_prebuilt);
+
+ if (error == DB_SUCCESS) {
+
+ /* Acquire the AUTOINC mutex. */
+ m_prebuilt->table->autoinc_mutex.lock();
+ }
+ break;
+
+ default:
+ ut_error;
+ }
+
+ DBUG_RETURN(error);
+}
+
+/********************************************************************//**
+Store the autoinc value in the table. The autoinc value is only set if
+it's greater than the existing autoinc value in the table.
+@return DB_SUCCESS if all went well else error code */
+
+dberr_t
+ha_innobase::innobase_set_max_autoinc(
+/*==================================*/
+ ulonglong auto_inc) /*!< in: value to store */
+{
+ dberr_t error;
+
+ error = innobase_lock_autoinc();
+
+ if (error == DB_SUCCESS) {
+
+ dict_table_autoinc_update_if_greater(m_prebuilt->table, auto_inc);
+ m_prebuilt->table->autoinc_mutex.unlock();
+ }
+
+ return(error);
+}
+
+/********************************************************************//**
+Stores a row in an InnoDB database, to the table specified in this
+handle.
+@return error code */
+
+int
+ha_innobase::write_row(
+/*===================*/
+ const uchar* record) /*!< in: a row in MySQL format */
+{
+ dberr_t error;
+#ifdef WITH_WSREP
+ bool wsrep_auto_inc_inserted= false;
+#endif
+ int error_result = 0;
+ bool auto_inc_used = false;
+
+ DBUG_ENTER("ha_innobase::write_row");
+
+ trx_t* trx = thd_to_trx(m_user_thd);
+
+ /* Validation checks before we commence the write_row operation. */
+ if (high_level_read_only) {
+ ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ }
+
+ ut_a(m_prebuilt->trx == trx);
+
+ if (!trx_is_started(trx)) {
+ trx->will_lock = true;
+ }
+
+ ins_mode_t vers_set_fields;
+ /* Handling of Auto-Increment Columns. */
+ if (table->next_number_field && record == table->record[0]) {
+
+ /* Reset the error code before calling
+ innobase_get_auto_increment(). */
+ m_prebuilt->autoinc_error = DB_SUCCESS;
+
+#ifdef WITH_WSREP
+ wsrep_auto_inc_inserted = trx->is_wsrep()
+ && wsrep_drupal_282555_workaround
+ && table->next_number_field->val_int() == 0;
+#endif
+
+ if ((error_result = update_auto_increment())) {
+ /* We don't want to mask autoinc overflow errors. */
+
+ /* Handle the case where the AUTOINC sub-system
+ failed during initialization. */
+ if (m_prebuilt->autoinc_error == DB_UNSUPPORTED) {
+ error_result = ER_AUTOINC_READ_FAILED;
+ /* Set the error message to report too. */
+ my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+ goto func_exit;
+ } else if (m_prebuilt->autoinc_error != DB_SUCCESS) {
+ error = m_prebuilt->autoinc_error;
+ goto report_error;
+ }
+
+ /* MySQL errors are passed straight back. */
+ goto func_exit;
+ }
+
+ auto_inc_used = true;
+ }
+
+ /* Prepare the INSERT graph that will be executed for the actual
+ INSERT (this is a one-time operation) */
+ if (m_prebuilt->mysql_template == NULL
+ || m_prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) {
+
+ /* Build the template used in converting quickly between
+ the two database formats */
+
+ build_template(true);
+ }
+
+ vers_set_fields = table->versioned_write(VERS_TRX_ID) ?
+ ROW_INS_VERSIONED : ROW_INS_NORMAL;
+
+ /* Execute insert graph that will result in actual insert. */
+ error = row_insert_for_mysql((byte*) record, m_prebuilt, vers_set_fields);
+
+ DEBUG_SYNC(m_user_thd, "ib_after_row_insert");
+
+ /* Handling of errors related to auto-increment. */
+ if (auto_inc_used) {
+ ulonglong auto_inc;
+
+ /* Note the number of rows processed for this statement, used
+ by get_auto_increment() to determine the number of AUTO-INC
+ values to reserve. This is only useful for a multi-value INSERT
+ and is a statement-level counter. */
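+ /* For example, a multi-value
+ INSERT ... VALUES (...),(...),(...) reserves three
+ AUTO-INC values up front, and each row processed here
+ decrements the counter. */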
+ if (trx->n_autoinc_rows > 0) {
+ --trx->n_autoinc_rows;
+ }
+
+ /* Get the value that MySQL attempted to store in the table.*/
+ auto_inc = table->next_number_field->val_uint();
+
+ switch (error) {
+ case DB_DUPLICATE_KEY:
+
+ /* A REPLACE command and LOAD DATA INFILE REPLACE
+ handle a duplicate key error themselves, but we
+ must update the autoinc counter if we are performing
+ those statements. */
+
+ switch (thd_sql_command(m_user_thd)) {
+ case SQLCOM_LOAD:
+ if (!trx->duplicates) {
+ break;
+ }
+
+ case SQLCOM_REPLACE:
+ case SQLCOM_INSERT_SELECT:
+ case SQLCOM_REPLACE_SELECT:
+ goto set_max_autoinc;
+
+#ifdef WITH_WSREP
+ /* workaround for LP bug #355000, retrying the insert */
+ case SQLCOM_INSERT:
+
+ WSREP_DEBUG("DUPKEY error for autoinc\n"
+ "THD %ld, value %llu, off %llu inc %llu",
+ thd_get_thread_id(m_user_thd),
+ auto_inc,
+ m_prebuilt->autoinc_offset,
+ m_prebuilt->autoinc_increment);
+
+ if (wsrep_auto_inc_inserted &&
+ wsrep_thd_retry_counter(m_user_thd) == 0 &&
+ !thd_test_options(m_user_thd,
+ OPTION_NOT_AUTOCOMMIT |
+ OPTION_BEGIN)) {
+ WSREP_DEBUG(
+ "retrying insert: %s",
+ wsrep_thd_query(m_user_thd));
+ error= DB_SUCCESS;
+ wsrep_thd_self_abort(m_user_thd);
+ /* jump straight to func exit over
+ * later wsrep hooks */
+ goto func_exit;
+ }
+ break;
+#endif /* WITH_WSREP */
+
+ default:
+ break;
+ }
+
+ break;
+
+ case DB_SUCCESS:
+ /* If the actual value inserted is greater than
+ the upper limit of the interval, we try to
+ update the table's upper limit. Note: last_value
+ will be 0 if get_auto_increment() was not called. */
+
+ if (auto_inc >= m_prebuilt->autoinc_last_value) {
+set_max_autoinc:
+ /* We need the upper limit of the column type to check
+ whether we should update the table autoinc counter. */
+ ulonglong col_max_value =
+ table->next_number_field->get_max_int_value();
+
+ /* This should filter out the negative
+ values set explicitly by the user. */
+ if (auto_inc <= col_max_value) {
+ ut_ad(m_prebuilt->autoinc_increment > 0);
+
+ ulonglong offset;
+ ulonglong increment;
+ dberr_t err;
+
+ offset = m_prebuilt->autoinc_offset;
+ increment = m_prebuilt->autoinc_increment;
+
+ auto_inc = innobase_next_autoinc(
+ auto_inc, 1, increment, offset,
+ col_max_value);
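+ /* For example, with
+ autoinc_offset=1 and
+ autoinc_increment=5 the series is
+ 1, 6, 11, 16, ...; after a row is
+ inserted with the value 12, the
+ counter advances to 16 (assuming
+ the usual offset/increment
+ semantics). */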
+
+ err = innobase_set_max_autoinc(
+ auto_inc);
+
+ if (err != DB_SUCCESS) {
+ error = err;
+ }
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+report_error:
+ /* Cleanup and exit. */
+ if (error == DB_TABLESPACE_DELETED) {
+ ib_senderrf(
+ trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+ }
+
+ error_result = convert_error_code_to_mysql(
+ error, m_prebuilt->table->flags, m_user_thd);
+
+#ifdef WITH_WSREP
+ if (!error_result && trx->is_wsrep()
+ && wsrep_thd_is_local(m_user_thd)
+ && !wsrep_thd_ignore_table(m_user_thd)
+ && !wsrep_consistency_check(m_user_thd)
+ && (thd_sql_command(m_user_thd) != SQLCOM_CREATE_TABLE)
+ && (thd_sql_command(m_user_thd) != SQLCOM_LOAD ||
+ thd_binlog_format(m_user_thd) == BINLOG_FORMAT_ROW)) {
+ if (wsrep_append_keys(m_user_thd, WSREP_SERVICE_KEY_EXCLUSIVE,
+ record,
+ NULL)) {
+ DBUG_PRINT("wsrep", ("row key failed"));
+ error_result = HA_ERR_INTERNAL_ERROR;
+ goto func_exit;
+ }
+ }
+#endif /* WITH_WSREP */
+
+ if (error_result == HA_FTS_INVALID_DOCID) {
+ my_error(HA_FTS_INVALID_DOCID, MYF(0));
+ }
+
+func_exit:
+ DBUG_RETURN(error_result);
+}
+
+/** Fill the update vector's "old_vrow" field for those non-updated,
+but indexed columns. Such columns could still be present in the virtual
+index rec fields even if they are not updated (when some other fields
+are), so they need to be logged.
+@param[in] prebuilt InnoDB prebuilt struct
+@param[in,out] vfield field to be filled
+@param[in] o_len actual column length
+@param[in,out] col column to be filled
+@param[in] old_mysql_row_col MySQL old field ptr
+@param[in] col_pack_len MySQL field col length
+@param[in,out] buf buffer for a converted integer value
+@return used buffer ptr from row_mysql_store_col_in_innobase_format() */
+static
+byte*
+innodb_fill_old_vcol_val(
+ row_prebuilt_t* prebuilt,
+ dfield_t* vfield,
+ ulint o_len,
+ dict_col_t* col,
+ const byte* old_mysql_row_col,
+ ulint col_pack_len,
+ byte* buf)
+{
+ dict_col_copy_type(
+ col, dfield_get_type(vfield));
+ if (o_len != UNIV_SQL_NULL) {
+
+ buf = row_mysql_store_col_in_innobase_format(
+ vfield,
+ buf,
+ TRUE,
+ old_mysql_row_col,
+ col_pack_len,
+ dict_table_is_comp(prebuilt->table));
+ } else {
+ dfield_set_null(vfield);
+ }
+
+ return(buf);
+}
+
+/** Calculate an update vector corresponding to the changes
+between old_row and new_row.
+@param[out] uvect update vector
+@param[in] old_row current row in MySQL format
+@param[in] new_row intended updated row in MySQL format
+@param[in] table MySQL table handle
+@param[in,out] upd_buff buffer to use for converted values
+@param[in] buff_len length of upd_buff
+@param[in,out] prebuilt InnoDB execution context
+@param[out] auto_inc updated AUTO_INCREMENT value, or 0 if none
+@return DB_SUCCESS or error code */
+static
+dberr_t
+calc_row_difference(
+ upd_t* uvect,
+ const uchar* old_row,
+ const uchar* new_row,
+ TABLE* table,
+ uchar* upd_buff,
+ ulint buff_len,
+ row_prebuilt_t* prebuilt,
+ ib_uint64_t& auto_inc)
+{
+ uchar* original_upd_buff = upd_buff;
+ Field* field;
+ enum_field_types field_mysql_type;
+ ulint o_len;
+ ulint n_len;
+ ulint col_pack_len;
+ const byte* new_mysql_row_col;
+ const byte* old_mysql_row_col;
+ const byte* o_ptr;
+ const byte* n_ptr;
+ byte* buf;
+ upd_field_t* ufield;
+ ulint col_type;
+ ulint n_changed = 0;
+ dfield_t dfield;
+ dict_index_t* clust_index;
+ ibool changes_fts_column = FALSE;
+ ibool changes_fts_doc_col = FALSE;
+ trx_t* const trx = prebuilt->trx;
+ doc_id_t doc_id = FTS_NULL_DOC_ID;
+ uint16_t num_v = 0;
+ const bool skip_virtual = ha_innobase::omits_virtual_cols(*table->s);
+
+ ut_ad(!srv_read_only_mode);
+
+ clust_index = dict_table_get_first_index(prebuilt->table);
+ auto_inc = 0;
+
+ /* We use upd_buff to convert changed fields */
+ buf = (byte*) upd_buff;
+
+ for (uint i = 0; i < table->s->fields; i++) {
+ field = table->field[i];
+ const bool is_virtual = !field->stored_in_db();
+ if (is_virtual && skip_virtual) {
+ num_v++;
+ continue;
+ }
+ dict_col_t* col = is_virtual
+ ? &prebuilt->table->v_cols[num_v].m_col
+ : &prebuilt->table->cols[i - num_v];
+
+ o_ptr = (const byte*) old_row + get_field_offset(table, field);
+ n_ptr = (const byte*) new_row + get_field_offset(table, field);
+
+ /* Use new_mysql_row_col and col_pack_len to save the values */
+
+ new_mysql_row_col = n_ptr;
+ old_mysql_row_col = o_ptr;
+ col_pack_len = field->pack_length();
+
+ o_len = col_pack_len;
+ n_len = col_pack_len;
+
+ /* We use o_ptr and n_ptr to dig up the actual data for
+ comparison. */
+
+ field_mysql_type = field->type();
+
+ col_type = col->mtype;
+
+ switch (col_type) {
+
+ case DATA_BLOB:
+ case DATA_GEOMETRY:
+ o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len);
+ n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len);
+
+ break;
+
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_VARMYSQL:
+ if (field_mysql_type == MYSQL_TYPE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR where
+ the real payload data length is stored in
+ 1 or 2 bytes */
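+ /* For instance, with a single-byte
+ character set a VARCHAR(100) column
+ stores its length in one byte, while
+ a VARCHAR(300) column needs two,
+ since 300 > 255. */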
+
+ o_ptr = row_mysql_read_true_varchar(
+ &o_len, o_ptr,
+ (ulint)
+ (((Field_varstring*) field)->length_bytes));
+
+ n_ptr = row_mysql_read_true_varchar(
+ &n_len, n_ptr,
+ (ulint)
+ (((Field_varstring*) field)->length_bytes));
+ }
+
+ break;
+ default:
+ ;
+ }
+
+ if (field_mysql_type == MYSQL_TYPE_LONGLONG
+ && prebuilt->table->fts
+ && innobase_strcasecmp(
+ field->field_name.str, FTS_DOC_ID_COL_NAME) == 0) {
+ doc_id = (doc_id_t) mach_read_from_n_little_endian(
+ n_ptr, 8);
+ if (doc_id == 0) {
+ return(DB_FTS_INVALID_DOCID);
+ }
+ }
+
+ if (field->real_maybe_null()) {
+ if (field->is_null_in_record(old_row)) {
+ o_len = UNIV_SQL_NULL;
+ }
+
+ if (field->is_null_in_record(new_row)) {
+ n_len = UNIV_SQL_NULL;
+ }
+ }
+
+#ifdef UNIV_DEBUG
+ bool online_ord_part = false;
+#endif
+
+ if (is_virtual) {
+ /* If the virtual column is not indexed,
+ we shall ignore it for update */
+ if (!col->ord_part) {
+ /* Check whether there is a table-rebuilding
+ online ALTER TABLE in progress, and this
+ virtual column could be newly indexed and thus
+ will be materialized. Then we will have
+ to log its update.
+ Note: we support neither dropping a virtual
+ column online while adding a new index, nor
+ changing the column order online while adding
+ an index, so the virtual column sequence must
+ not change during an online operation */
+ if (dict_index_is_online_ddl(clust_index)
+ && row_log_col_is_indexed(clust_index,
+ num_v)) {
+#ifdef UNIV_DEBUG
+ online_ord_part = true;
+#endif
+ } else {
+ num_v++;
+ continue;
+ }
+ }
+
+ if (!uvect->old_vrow) {
+ uvect->old_vrow = dtuple_create_with_vcol(
+ uvect->heap, 0, prebuilt->table->n_v_cols);
+ }
+
+ ulint max_field_len = DICT_MAX_FIELD_LEN_BY_FORMAT(
+ prebuilt->table);
+
+ /* For virtual columns, we only materialize
+ their index fields, and an index field length
+ will not exceed max_field_len. So skip the
+ column if the first max_field_len bytes match. */
+ if (o_len != UNIV_SQL_NULL
+ && n_len != UNIV_SQL_NULL
+ && o_len >= max_field_len
+ && n_len >= max_field_len
+ && memcmp(o_ptr, n_ptr, max_field_len) == 0) {
+ dfield_t* vfield = dtuple_get_nth_v_field(
+ uvect->old_vrow, num_v);
+ buf = innodb_fill_old_vcol_val(
+ prebuilt, vfield, o_len,
+ col, old_mysql_row_col,
+ col_pack_len, buf);
+ num_v++;
+ continue;
+ }
+ }
+
+ if (o_len != n_len || (o_len != 0 && o_len != UNIV_SQL_NULL
+ && 0 != memcmp(o_ptr, n_ptr, o_len))) {
+ /* The field has changed */
+
+ ufield = uvect->fields + n_changed;
+ MEM_UNDEFINED(ufield, sizeof *ufield);
+
+ /* Let us use a dummy dfield to make the conversion
+ from the MySQL column format to the InnoDB format */
+
+
+ /* If the length of the new geometry object is 0,
+ the object is an invalid geometry object, and we
+ must reject it. */
+ if (DATA_GEOMETRY_MTYPE(col_type)
+ && o_len != 0 && n_len == 0) {
+ return(DB_CANT_CREATE_GEOMETRY_OBJECT);
+ }
+
+ if (n_len != UNIV_SQL_NULL) {
+ dict_col_copy_type(
+ col, dfield_get_type(&dfield));
+
+ buf = row_mysql_store_col_in_innobase_format(
+ &dfield,
+ (byte*) buf,
+ TRUE,
+ new_mysql_row_col,
+ col_pack_len,
+ dict_table_is_comp(prebuilt->table));
+ dfield_copy(&ufield->new_val, &dfield);
+ } else {
+ dict_col_copy_type(
+ col, dfield_get_type(&ufield->new_val));
+ dfield_set_null(&ufield->new_val);
+ }
+
+ ufield->exp = NULL;
+ ufield->orig_len = 0;
+ if (is_virtual) {
+ dfield_t* vfield = dtuple_get_nth_v_field(
+ uvect->old_vrow, num_v);
+ upd_fld_set_virtual_col(ufield);
+ ufield->field_no = num_v;
+
+ ut_ad(col->ord_part || online_ord_part);
+ ufield->old_v_val = static_cast<dfield_t*>(
+ mem_heap_alloc(
+ uvect->heap,
+ sizeof *ufield->old_v_val));
+
+ if (!field->is_null_in_record(old_row)) {
+ if (n_len == UNIV_SQL_NULL) {
+ dict_col_copy_type(
+ col, dfield_get_type(
+ &dfield));
+ }
+
+ buf = row_mysql_store_col_in_innobase_format(
+ &dfield,
+ (byte*) buf,
+ TRUE,
+ old_mysql_row_col,
+ col_pack_len,
+ dict_table_is_comp(
+ prebuilt->table));
+ dfield_copy(ufield->old_v_val,
+ &dfield);
+ dfield_copy(vfield, &dfield);
+ } else {
+ dict_col_copy_type(
+ col, dfield_get_type(
+ ufield->old_v_val));
+ dfield_set_null(ufield->old_v_val);
+ dfield_set_null(vfield);
+ }
+ num_v++;
+ ut_ad(field != table->found_next_number_field);
+ } else {
+ ufield->field_no = static_cast<uint16_t>(
+ dict_col_get_clust_pos(
+ &prebuilt->table->cols
+ [i - num_v],
+ clust_index));
+ ufield->old_v_val = NULL;
+ if (field == table->found_next_number_field
+ && !dfield_is_null(&ufield->new_val)) {
+ auto_inc = field->val_uint();
+ }
+ }
+ n_changed++;
+
+ /* If an FTS indexed column was changed by this
+ UPDATE then we need to inform the FTS sub-system.
+
+ NOTE: Currently we re-index all FTS indexed columns
+ even if only a subset of the FTS indexed columns
+ have been updated. That is the reason we are
+ checking only once here. Later we will need to
+ note which columns have been updated and do
+ selective processing. */
+ if (prebuilt->table->fts != NULL && !is_virtual) {
+ ulint offset;
+ dict_table_t* innodb_table;
+
+ innodb_table = prebuilt->table;
+
+ if (!changes_fts_column) {
+ offset = row_upd_changes_fts_column(
+ innodb_table, ufield);
+
+ if (offset != ULINT_UNDEFINED) {
+ changes_fts_column = TRUE;
+ }
+ }
+
+ if (!changes_fts_doc_col) {
+ changes_fts_doc_col =
+ row_upd_changes_doc_id(
+ innodb_table, ufield);
+ }
+ }
+ } else if (is_virtual) {
+ dfield_t* vfield = dtuple_get_nth_v_field(
+ uvect->old_vrow, num_v);
+ buf = innodb_fill_old_vcol_val(
+ prebuilt, vfield, o_len,
+ col, old_mysql_row_col,
+ col_pack_len, buf);
+ ut_ad(col->ord_part || online_ord_part);
+ num_v++;
+ }
+ }
+
+ /* If the update changes a column with an FTS index on it, we
+ then add an update column node with a new document id to the
+ other changes. We piggy back our changes on the normal UPDATE
+ to reduce processing and IO overhead. */
+ if (!prebuilt->table->fts) {
+ trx->fts_next_doc_id = 0;
+ } else if (changes_fts_column || changes_fts_doc_col) {
+ dict_table_t* innodb_table = prebuilt->table;
+
+ ufield = uvect->fields + n_changed;
+
+ if (!DICT_TF2_FLAG_IS_SET(
+ innodb_table, DICT_TF2_FTS_HAS_DOC_ID)) {
+
+ /* If the Doc ID is managed by the user and any
+ FTS indexed column has been updated, the
+ corresponding Doc ID must also be updated.
+ Otherwise, return an error. */
+ if (changes_fts_column && !changes_fts_doc_col) {
+ ib::warn() << "A new Doc ID must be supplied"
+ " while updating FTS indexed columns.";
+ return(DB_FTS_INVALID_DOCID);
+ }
+
+ /* Doc ID must monotonically increase */
+ ut_ad(innodb_table->fts->cache);
+ if (doc_id < prebuilt->table->fts->cache->next_doc_id) {
+
+ ib::warn() << "FTS Doc ID must be larger than "
+ << innodb_table->fts->cache->next_doc_id
+ - 1 << " for table "
+ << innodb_table->name;
+
+ return(DB_FTS_INVALID_DOCID);
+ } else if ((doc_id
+ - prebuilt->table->fts->cache->next_doc_id)
+ >= FTS_DOC_ID_MAX_STEP) {
+
+ ib::warn() << "Doc ID " << doc_id << " is too"
+ " big. Its difference with largest"
+ " Doc ID used " << prebuilt->table->fts
+ ->cache->next_doc_id - 1
+ << " cannot exceed or equal to "
+ << FTS_DOC_ID_MAX_STEP;
+ }
+
+
+ trx->fts_next_doc_id = doc_id;
+ } else {
+ /* If the Doc ID is a hidden column, it cannot be
+ changed by the user */
+ ut_ad(!changes_fts_doc_col);
+
+ /* The Doc ID column is hidden; a new Doc ID will be
+ generated by the following fts_update_doc_id() call */
+ trx->fts_next_doc_id = 0;
+ }
+
+ fts_update_doc_id(
+ innodb_table, ufield, &trx->fts_next_doc_id);
+
+ ++n_changed;
+ } else {
+ /* We have a Doc ID column, but neither any FTS
+ indexed column nor the Doc ID column itself was
+ touched, so set fts_next_doc_id to UINT64_UNDEFINED,
+ which means: do not update the Doc ID column */
+ trx->fts_next_doc_id = UINT64_UNDEFINED;
+ }
+
+ uvect->n_fields = n_changed;
+ uvect->info_bits = 0;
+
+ ut_a(buf <= (byte*) original_upd_buff + buff_len);
+
+ ut_ad(uvect->validate());
+ return(DB_SUCCESS);
+}
+
+#ifdef WITH_WSREP
+static
+int
+wsrep_calc_row_hash(
+/*================*/
+ byte* digest, /*!< in/out: md5 sum */
+ const uchar* row, /*!< in: row in MySQL format */
+ TABLE* table, /*!< in: table in MySQL data
+ dictionary */
+ row_prebuilt_t* prebuilt) /*!< in: InnoDB prebuilt struct */
+{
+ void *ctx = alloca(my_md5_context_size());
+ my_md5_init(ctx);
+
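+ /* The digest is fed one flag byte per stored column (0 for
+ SQL NULL, 1 otherwise) followed by the column bytes, so two
+ rows hash equal only if they agree on both NULLness and
+ content. */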
+ for (uint i = 0; i < table->s->fields; i++) {
+ byte null_byte=0;
+ byte true_byte=1;
+ unsigned is_unsigned;
+
+ const Field* field = table->field[i];
+ if (!field->stored_in_db()) {
+ continue;
+ }
+
+ auto ptr = row + get_field_offset(table, field);
+ ulint len = field->pack_length();
+
+ switch (get_innobase_type_from_mysql_type(&is_unsigned,
+ field)) {
+ case DATA_BLOB:
+ ptr = row_mysql_read_blob_ref(&len, ptr, len);
+
+ break;
+
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_VARMYSQL:
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR where
+ the real payload data length is stored in
+ 1 or 2 bytes */
+
+ ptr = row_mysql_read_true_varchar(
+ &len, ptr,
+ (ulint)
+ (((Field_varstring*)field)->length_bytes));
+
+ }
+
+ break;
+ default:
+ ;
+ }
+
+ if (field->is_null_in_record(row)) {
+ my_md5_input(ctx, &null_byte, 1);
+ } else {
+ my_md5_input(ctx, &true_byte, 1);
+ my_md5_input(ctx, ptr, len);
+ }
+ }
+
+ my_md5_result(ctx, digest);
+
+ return(0);
+}
+#endif /* WITH_WSREP */
+
+/**
+Updates a row given as a parameter to a new value. Note that we are given
+whole rows, not just the fields which are updated: this incurs some
+CPU overhead when we check which fields were actually updated.
+TODO: currently InnoDB does not prevent the 'Halloween problem':
+in a searched update a single row can get updated several times
+if its index columns are updated!
+@param[in] old_row Old row contents in MySQL format
+@param[out] new_row Updated row contents in MySQL format
+@return error number or 0 */
+
+int
+ha_innobase::update_row(
+ const uchar* old_row,
+ const uchar* new_row)
+{
+ int err;
+
+ dberr_t error;
+ trx_t* trx = thd_to_trx(m_user_thd);
+
+ DBUG_ENTER("ha_innobase::update_row");
+
+ ut_a(m_prebuilt->trx == trx);
+
+ if (high_level_read_only) {
+ ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ } else if (!trx_is_started(trx)) {
+ trx->will_lock = true;
+ }
+
+ if (m_upd_buf == NULL) {
+ ut_ad(m_upd_buf_size == 0);
+
+ /* Create a buffer for packing the fields of a record. Why
+ did table->reclength not work here? Because CHAR fields,
+ when packed, can become one byte longer when we also store
+ the string length as the first byte. */
+
+ m_upd_buf_size = table->s->reclength + table->s->max_key_length
+ + MAX_REF_PARTS * 3;
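+ /* A rough sizing sketch (our reading, not an exact
+ bound): reclength covers the packed row,
+ max_key_length the key values, and MAX_REF_PARTS * 3
+ leaves slack for per-key-part length bytes. */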
+
+ m_upd_buf = reinterpret_cast<uchar*>(
+ my_malloc(PSI_INSTRUMENT_ME,
+ m_upd_buf_size,
+ MYF(MY_WME)));
+
+ if (m_upd_buf == NULL) {
+ m_upd_buf_size = 0;
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+ }
+ }
+
+ upd_t* uvect = row_get_prebuilt_update_vector(m_prebuilt);
+ ib_uint64_t autoinc;
+
+ /* Build an update vector from the modified fields in the rows
+ (uses m_upd_buf of the handle) */
+
+ error = calc_row_difference(
+ uvect, old_row, new_row, table, m_upd_buf, m_upd_buf_size,
+ m_prebuilt, autoinc);
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (!uvect->n_fields) {
+ /* This is the same as success, but instructs
+ MySQL that the row is not really updated and it
+ should not increase the count of updated rows.
+ This is a fix for http://bugs.mysql.com/29157 */
+ if (m_prebuilt->versioned_write
+ && thd_sql_command(m_user_thd) != SQLCOM_ALTER_TABLE
+ /* Multiple UPDATE of same rows in single transaction create
+ historical rows only once. */
+ && trx->id != table->vers_start_id()) {
+ error = row_insert_for_mysql((byte*) old_row,
+ m_prebuilt,
+ ROW_INS_HISTORICAL);
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+ DBUG_RETURN(HA_ERR_RECORD_IS_THE_SAME);
+ } else {
+ const bool vers_set_fields = m_prebuilt->versioned_write
+ && m_prebuilt->upd_node->update->affects_versioned();
+ const bool vers_ins_row = vers_set_fields
+ && thd_sql_command(m_user_thd) != SQLCOM_ALTER_TABLE;
+
+ /* This is not a delete */
+ m_prebuilt->upd_node->is_delete =
+ (vers_set_fields && !vers_ins_row) ||
+ (thd_sql_command(m_user_thd) == SQLCOM_DELETE &&
+ table->versioned(VERS_TIMESTAMP))
+ ? VERSIONED_DELETE
+ : NO_DELETE;
+
+ error = row_update_for_mysql(m_prebuilt);
+
+ if (error == DB_SUCCESS && vers_ins_row
+ /* Multiple UPDATE of same rows in single transaction create
+ historical rows only once. */
+ && trx->id != table->vers_start_id()) {
+ error = row_insert_for_mysql((byte*) old_row,
+ m_prebuilt,
+ ROW_INS_HISTORICAL);
+ }
+ }
+
+ if (error == DB_SUCCESS && autoinc) {
+ /* A value for an AUTO_INCREMENT column
+ was specified in the UPDATE statement. */
+
+ /* We need the upper limit of the column type to check
+ whether we should update the table autoinc counter. */
+ ulonglong col_max_value =
+ table->found_next_number_field->get_max_int_value();
+
+ /* This should filter out the negative
+ values set explicitly by the user. */
+ if (autoinc <= col_max_value) {
+ ulonglong offset;
+ ulonglong increment;
+
+ offset = m_prebuilt->autoinc_offset;
+ increment = m_prebuilt->autoinc_increment;
+
+ autoinc = innobase_next_autoinc(
+ autoinc, 1, increment, offset,
+ col_max_value);
+
+ error = innobase_set_max_autoinc(autoinc);
+
+ if (m_prebuilt->table->persistent_autoinc) {
+ /* Update the PAGE_ROOT_AUTO_INC. Yes, we do
+ this even if dict_table_t::autoinc already was
+ greater than autoinc, because we cannot know
+ if any INSERT actually used (and wrote to
+ PAGE_ROOT_AUTO_INC) a value bigger than our
+ autoinc. */
+ btr_write_autoinc(dict_table_get_first_index(
+ m_prebuilt->table),
+ autoinc);
+ }
+ }
+ }
+
+func_exit:
+ if (error == DB_FTS_INVALID_DOCID) {
+ err = HA_FTS_INVALID_DOCID;
+ my_error(HA_FTS_INVALID_DOCID, MYF(0));
+ } else {
+ err = convert_error_code_to_mysql(
+ error, m_prebuilt->table->flags, m_user_thd);
+ }
+
+#ifdef WITH_WSREP
+ if (error == DB_SUCCESS && trx->is_wsrep()
+ && wsrep_thd_is_local(m_user_thd)
+ && !wsrep_thd_ignore_table(m_user_thd)) {
+ DBUG_PRINT("wsrep", ("update row key"));
+
+ if (wsrep_append_keys(m_user_thd,
+ wsrep_protocol_version >= 4
+ ? WSREP_SERVICE_KEY_UPDATE
+ : WSREP_SERVICE_KEY_EXCLUSIVE,
+ old_row, new_row)){
+ WSREP_DEBUG("WSREP: UPDATE_ROW_KEY FAILED");
+ DBUG_PRINT("wsrep", ("row key failed"));
+ DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
+ }
+ }
+#endif /* WITH_WSREP */
+
+ DBUG_RETURN(err);
+}
+
+/**********************************************************************//**
+Deletes a row given as the parameter.
+@return error number or 0 */
+
+int
+ha_innobase::delete_row(
+/*====================*/
+ const uchar* record) /*!< in: a row in MySQL format */
+{
+ dberr_t error;
+ trx_t* trx = thd_to_trx(m_user_thd);
+
+ DBUG_ENTER("ha_innobase::delete_row");
+
+ ut_a(m_prebuilt->trx == trx);
+
+ if (high_level_read_only) {
+ ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ } else if (!trx_is_started(trx)) {
+ trx->will_lock = true;
+ }
+
+ if (!m_prebuilt->upd_node) {
+ row_get_prebuilt_update_vector(m_prebuilt);
+ }
+
+ /* This is a delete */
+ m_prebuilt->upd_node->is_delete = table->versioned_write(VERS_TRX_ID)
+ && table->vers_end_field()->is_max()
+ && trx->id != table->vers_start_id()
+ ? VERSIONED_DELETE
+ : PLAIN_DELETE;
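+ /* With system versioning, a DELETE of the current row
+ version becomes an update of the row-end column
+ (VERSIONED_DELETE) rather than a physical delete. */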
+
+ error = row_update_for_mysql(m_prebuilt);
+
+#ifdef WITH_WSREP
+ if (error == DB_SUCCESS && trx->is_wsrep()
+ && wsrep_thd_is_local(m_user_thd)
+ && !wsrep_thd_ignore_table(m_user_thd)) {
+ if (wsrep_append_keys(m_user_thd, WSREP_SERVICE_KEY_EXCLUSIVE,
+ record,
+ NULL)) {
+ DBUG_PRINT("wsrep", ("delete fail"));
+ DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
+ }
+ }
+#endif /* WITH_WSREP */
+ DBUG_RETURN(convert_error_code_to_mysql(
+ error, m_prebuilt->table->flags, m_user_thd));
+}
+
+/** Delete all rows from the table.
+@return error number or 0 */
+
+int
+ha_innobase::delete_all_rows()
+{
+ DBUG_ENTER("ha_innobase::delete_all_rows");
+ DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+}
+
+/**********************************************************************//**
+Removes a newly set lock on a row if the row was not read optimistically.
+This can be called after a row has been read during the processing of an
+UPDATE or a DELETE query. */
+
+void
+ha_innobase::unlock_row(void)
+/*=========================*/
+{
+ DBUG_ENTER("ha_innobase::unlock_row");
+
+ if (m_prebuilt->select_lock_type == LOCK_NONE) {
+ DBUG_VOID_RETURN;
+ }
+
+ ut_ad(trx_state_eq(m_prebuilt->trx, TRX_STATE_ACTIVE, true));
+
+ switch (m_prebuilt->row_read_type) {
+ case ROW_READ_WITH_LOCKS:
+ if (m_prebuilt->trx->isolation_level > TRX_ISO_READ_COMMITTED)
+ break;
+ /* fall through */
+ case ROW_READ_TRY_SEMI_CONSISTENT:
+ row_unlock_for_mysql(m_prebuilt, FALSE);
+ break;
+ case ROW_READ_DID_SEMI_CONSISTENT:
+ m_prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ break;
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/* See handler.h and row0mysql.h for docs on this function. */
+
+bool
+ha_innobase::was_semi_consistent_read(void)
+/*=======================================*/
+{
+ return(m_prebuilt->row_read_type == ROW_READ_DID_SEMI_CONSISTENT);
+}
+
+/* See handler.h and row0mysql.h for docs on this function. */
+void ha_innobase::try_semi_consistent_read(bool yes)
+{
+ ut_ad(m_prebuilt->trx == thd_to_trx(ha_thd()));
+ /* Row read type is set to semi consistent read if this was
+ requested by the SQL layer and the transaction isolation level is
+ READ UNCOMMITTED or READ COMMITTED. */
+ m_prebuilt->row_read_type = yes
+ && m_prebuilt->trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ ? ROW_READ_TRY_SEMI_CONSISTENT
+ : ROW_READ_WITH_LOCKS;
+}
+
+/******************************************************************//**
+Initializes a handle to use an index.
+@return 0 or error number */
+
+int
+ha_innobase::index_init(
+/*====================*/
+ uint keynr, /*!< in: key (index) number */
+ bool)
+{
+ DBUG_ENTER("index_init");
+
+ DBUG_RETURN(change_active_index(keynr));
+}
+
+/******************************************************************//**
+Currently does nothing.
+@return 0 */
+
+int
+ha_innobase::index_end(void)
+/*========================*/
+{
+ DBUG_ENTER("index_end");
+
+ active_index = MAX_KEY;
+
+ in_range_check_pushed_down = FALSE;
+
+ m_ds_mrr.dsmrr_close();
+
+ DBUG_RETURN(0);
+}
+
+/*********************************************************************//**
+Converts a search mode flag understood by MySQL to a flag understood
+by InnoDB. */
+page_cur_mode_t
+convert_search_mode_to_innobase(
+/*============================*/
+ ha_rkey_function find_flag)
+{
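+ /* For example, a range scan for "col > 5" probes with
+ HA_READ_AFTER_KEY, which maps to PAGE_CUR_G (position on the
+ first record strictly greater than the key), while "col >= 5"
+ uses HA_READ_KEY_OR_NEXT, i.e. PAGE_CUR_GE. */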
+ switch (find_flag) {
+ case HA_READ_KEY_EXACT:
+ /* this does not require the index to be UNIQUE */
+ case HA_READ_KEY_OR_NEXT:
+ return(PAGE_CUR_GE);
+ case HA_READ_AFTER_KEY:
+ return(PAGE_CUR_G);
+ case HA_READ_BEFORE_KEY:
+ return(PAGE_CUR_L);
+ case HA_READ_KEY_OR_PREV:
+ case HA_READ_PREFIX_LAST:
+ case HA_READ_PREFIX_LAST_OR_PREV:
+ return(PAGE_CUR_LE);
+ case HA_READ_MBR_CONTAIN:
+ return(PAGE_CUR_CONTAIN);
+ case HA_READ_MBR_INTERSECT:
+ return(PAGE_CUR_INTERSECT);
+ case HA_READ_MBR_WITHIN:
+ return(PAGE_CUR_WITHIN);
+ case HA_READ_MBR_DISJOINT:
+ return(PAGE_CUR_DISJOINT);
+ case HA_READ_MBR_EQUAL:
+ return(PAGE_CUR_MBR_EQUAL);
+ case HA_READ_PREFIX:
+ return(PAGE_CUR_UNSUPP);
+ /* do not use "default:" in order to produce a gcc warning:
+ enumeration value '...' not handled in switch
+ (if -Wswitch or -Wall is used) */
+ }
+
+ my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "this functionality");
+
+ return(PAGE_CUR_UNSUPP);
+}
+
+/*
+ BACKGROUND INFO: HOW A SELECT SQL QUERY IS EXECUTED
+ ---------------------------------------------------
+The following does not cover all the details, but explains how we determine
+the start of a new SQL statement, and what is associated with it.
+
+For each table in the database the MySQL interpreter may have several
+table handle instances in use, also in a single SQL query. For each table
+handle instance there is an InnoDB 'm_prebuilt' struct which contains most
+of the InnoDB data associated with this table handle instance.
+
+ A) if the user has not explicitly set any MySQL table level locks:
+
+ 1) MySQL calls ::external_lock to set an 'intention' table level lock on
+the table of the handle instance. There we set
+m_prebuilt->sql_stat_start = TRUE. The flag sql_stat_start should be set
+true if we are taking this table handle instance to use in a new SQL
+statement issued by the user. We also increment trx->n_mysql_tables_in_use.
+
+ 2) If m_prebuilt->sql_stat_start == TRUE we 'pre-compile' the MySQL search
+instructions to m_prebuilt->template of the table handle instance in
+::index_read. The template is used to save CPU time in large joins.
+
+ 3) In row_search_for_mysql, if m_prebuilt->sql_stat_start is true, we
+allocate a new consistent read view for the trx if it does not yet have one,
+or in the case of a locking read, set an InnoDB 'intention' table level
+lock on the table.
+
+ 4) We do the SELECT. MySQL may repeatedly call ::index_read for the
+same table handle instance, if it is a join.
+
+ 5) When the SELECT ends, MySQL removes its intention table level locks
+in ::external_lock. When trx->n_mysql_tables_in_use drops to zero,
+ (a) we execute a COMMIT there if the autocommit is on,
+ (b) we also release possible 'SQL statement level resources' InnoDB may
+have for this SQL statement. The MySQL interpreter does NOT execute
+autocommit for pure read transactions, though it should. That is why the
+table handler in that case has to execute the COMMIT in ::external_lock.
+
+ B) If the user has explicitly set MySQL table level locks, then MySQL
+does NOT call ::external_lock at the start of the statement. To determine
+when we are at the start of a new SQL statement, at the start of
+::index_read we also compare the query id to the latest query id in which
+the table handle instance was used. If it has changed, we know we are at
+the start of a new SQL statement. Since the query id can theoretically
+wrap around, we use this test only as a secondary way of determining the
+start of a new SQL statement. */
+
+
+/**********************************************************************//**
+Positions an index cursor to the index specified in the handle. Fetches the
+row if any.
+@return 0, HA_ERR_KEY_NOT_FOUND, or error number */
+
+int
+ha_innobase::index_read(
+/*====================*/
+ uchar* buf, /*!< in/out: buffer for the returned
+ row */
+ const uchar* key_ptr, /*!< in: key value; if this is NULL
+ we position the cursor at the
+ start or end of index; this can
+ also contain an InnoDB row id, in
+ which case key_len is the InnoDB
+ row id length; the key value can
+ also be a prefix of a full key value,
+ and the last column can be a prefix
+ of a full column */
+ uint key_len,/*!< in: key value length */
+ enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */
+{
+ DBUG_ENTER("index_read");
+ DEBUG_SYNC_C("ha_innobase_index_read_begin");
+
+ ut_a(m_prebuilt->trx == thd_to_trx(m_user_thd));
+ ut_ad(key_len != 0 || find_flag != HA_READ_KEY_EXACT);
+
+ dict_index_t* index = m_prebuilt->index;
+
+ if (index == NULL || index->is_corrupted()) {
+ m_prebuilt->index_usable = FALSE;
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ if (!m_prebuilt->index_usable) {
+ DBUG_RETURN(index->is_corrupted()
+ ? HA_ERR_INDEX_CORRUPT
+ : HA_ERR_TABLE_DEF_CHANGED);
+ }
+
+ if (index->type & DICT_FTS) {
+ DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
+ }
+
+ /* For R-Tree index, we will always place the page lock to
+ pages being searched */
+ if (index->is_spatial() && !m_prebuilt->trx->will_lock) {
+ if (trx_is_started(m_prebuilt->trx)) {
+ DBUG_RETURN(HA_ERR_READ_ONLY_TRANSACTION);
+ } else {
+ m_prebuilt->trx->will_lock = true;
+ }
+ }
+
+ /* Note that the index for which the search template is built is
+ not necessarily m_prebuilt->index; it can also be the clustered
+ index */
+
+ if (m_prebuilt->sql_stat_start) {
+ build_template(false);
+ }
+
+ if (key_ptr != NULL) {
+ /* Convert the search key value to InnoDB format into
+ m_prebuilt->search_tuple */
+
+ row_sel_convert_mysql_key_to_innobase(
+ m_prebuilt->search_tuple,
+ m_prebuilt->srch_key_val1,
+ m_prebuilt->srch_key_val_len,
+ index,
+ (byte*) key_ptr,
+ (ulint) key_len);
+
+ DBUG_ASSERT(m_prebuilt->search_tuple->n_fields > 0);
+ } else {
+ /* We position the cursor to the last or the first entry
+ in the index */
+
+ dtuple_set_n_fields(m_prebuilt->search_tuple, 0);
+ }
+
+ page_cur_mode_t mode = convert_search_mode_to_innobase(find_flag);
+
+ ulint match_mode = 0;
+
+ if (find_flag == HA_READ_KEY_EXACT) {
+
+ match_mode = ROW_SEL_EXACT;
+
+ } else if (find_flag == HA_READ_PREFIX_LAST) {
+
+ match_mode = ROW_SEL_EXACT_PREFIX;
+ }
+
+ m_last_match_mode = (uint) match_mode;
+
+ dberr_t ret = mode == PAGE_CUR_UNSUPP ? DB_UNSUPPORTED
+ : row_search_mvcc(buf, mode, m_prebuilt, match_mode, 0);
+
+ DBUG_EXECUTE_IF("ib_select_query_failure", ret = DB_ERROR;);
+
+ int error;
+
+ switch (ret) {
+ case DB_SUCCESS:
+ error = 0;
+ table->status = 0;
+ if (m_prebuilt->table->is_system_db) {
+ srv_stats.n_system_rows_read.add(
+ thd_get_thread_id(m_prebuilt->trx->mysql_thd), 1);
+ } else {
+ srv_stats.n_rows_read.add(
+ thd_get_thread_id(m_prebuilt->trx->mysql_thd), 1);
+ }
+ break;
+
+ case DB_RECORD_NOT_FOUND:
+ error = HA_ERR_KEY_NOT_FOUND;
+ table->status = STATUS_NOT_FOUND;
+ break;
+
+ case DB_END_OF_INDEX:
+ error = HA_ERR_KEY_NOT_FOUND;
+ table->status = STATUS_NOT_FOUND;
+ break;
+
+ case DB_TABLESPACE_DELETED:
+ ib_senderrf(
+ m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_TABLESPACE_MISSING;
+ break;
+
+ case DB_TABLESPACE_NOT_FOUND:
+
+ ib_senderrf(
+ m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_MISSING,
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_TABLESPACE_MISSING;
+ break;
+
+ default:
+ error = convert_error_code_to_mysql(
+ ret, m_prebuilt->table->flags, m_user_thd);
+
+ table->status = STATUS_NOT_FOUND;
+ break;
+ }
+
+ DBUG_RETURN(error);
+}
+
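+/* Usage sketch (illustrative, not a verbatim call site): for
+ SELECT * FROM t WHERE pk = 42;
+the server positions the cursor roughly as
+ error = h->index_read(buf, key_image, key_len, HA_READ_KEY_EXACT);
+and fetches any further matches with index_next_same(); key_image and
+key_len are hypothetical names for the key tuple in MySQL key format. */
+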
+/*******************************************************************//**
+The following function works like index_read, but it finds the last
+row with the current key value or prefix.
+@return 0, HA_ERR_KEY_NOT_FOUND, or an error code */
+
+int
+ha_innobase::index_read_last(
+/*=========================*/
+ uchar* buf, /*!< out: fetched row */
+ const uchar* key_ptr,/*!< in: key value, or a prefix of a full
+ key value */
+ uint key_len)/*!< in: length of the key val or prefix
+ in bytes */
+{
+ return(index_read(buf, key_ptr, key_len, HA_READ_PREFIX_LAST));
+}
+
+/********************************************************************//**
+Get the index for a handle. Does not change active index.
+@return NULL or index instance. */
+
+dict_index_t*
+ha_innobase::innobase_get_index(
+/*============================*/
+ uint keynr) /*!< in: use this index; MAX_KEY means always
+ clustered index, even if it was internally
+ generated by InnoDB */
+{
+ KEY* key = NULL;
+ dict_table_t* ib_table = m_prebuilt->table;
+ dict_index_t* index;
+
+ DBUG_ENTER("innobase_get_index");
+
+ if (keynr != MAX_KEY && table->s->keys > 0) {
+ key = &table->key_info[keynr];
+ index = dict_table_get_index_on_name(ib_table, key->name.str);
+ } else {
+ index = dict_table_get_first_index(ib_table);
+ }
+
+ if (index == NULL) {
+ sql_print_error(
+ "InnoDB could not find key no %u with name %s"
+ " from dict cache for table %s",
+ keynr, key ? key->name.str : "NULL",
+ ib_table->name.m_name);
+ }
+
+ DBUG_RETURN(index);
+}
+
+/********************************************************************//**
+Changes the active index of a handle.
+@return 0 or error code */
+
+int
+ha_innobase::change_active_index(
+/*=============================*/
+ uint keynr) /*!< in: use this index; MAX_KEY means always clustered
+ index, even if it was internally generated by
+ InnoDB */
+{
+ DBUG_ENTER("change_active_index");
+
+ ut_ad(m_user_thd == ha_thd());
+ ut_a(m_prebuilt->trx == thd_to_trx(m_user_thd));
+
+ active_index = keynr;
+
+ m_prebuilt->index = innobase_get_index(keynr);
+
+ if (m_prebuilt->index == NULL) {
+ sql_print_warning("InnoDB: change_active_index(%u) failed",
+ keynr);
+ m_prebuilt->index_usable = FALSE;
+ DBUG_RETURN(1);
+ }
+
+ m_prebuilt->index_usable = row_merge_is_index_usable(
+ m_prebuilt->trx, m_prebuilt->index);
+
+ if (!m_prebuilt->index_usable) {
+ if (m_prebuilt->index->is_corrupted()) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof table_name,
+ m_prebuilt->index->table->name.m_name);
+
+ if (m_prebuilt->index->is_primary()) {
+ ut_ad(m_prebuilt->index->table->corrupted);
+ push_warning_printf(
+ m_user_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_TABLE_CORRUPT,
+ "InnoDB: Table %s is corrupted.",
+ table_name);
+ DBUG_RETURN(ER_TABLE_CORRUPT);
+ } else {
+ push_warning_printf(
+ m_user_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_INDEX_CORRUPT,
+ "InnoDB: Index %s for table %s is"
+ " marked as corrupted",
+ m_prebuilt->index->name(),
+ table_name);
+ DBUG_RETURN(HA_ERR_INDEX_CORRUPT);
+ }
+ } else {
+ push_warning_printf(
+ m_user_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_TABLE_DEF_CHANGED,
+ "InnoDB: insufficient history for index %u",
+ keynr);
+ }
+
+ /* The caller seems to ignore this. Thus, we must check
+ this again in row_search_for_mysql(). */
+ DBUG_RETURN(convert_error_code_to_mysql(DB_MISSING_HISTORY,
+ 0, NULL));
+ }
+
+ ut_a(m_prebuilt->search_tuple != 0);
+
+ /* Initialization of search_tuple is not needed for FT index
+ since FT search returns rank only. In addition, the engine should
+ be able to retrieve the FTS_DOC_ID column value if necessary. */
+ if (m_prebuilt->index->type & DICT_FTS) {
+ for (uint i = 0; i < table->s->fields; i++) {
+ if (m_prebuilt->read_just_key
+ && bitmap_is_set(table->read_set, i)
+ && !strcmp(table->s->field[i]->field_name.str,
+ FTS_DOC_ID_COL_NAME)) {
+ m_prebuilt->fts_doc_id_in_read_set = true;
+ break;
+ }
+ }
+ } else {
+ ulint n_fields = dict_index_get_n_unique_in_tree(
+ m_prebuilt->index);
+
+ dtuple_set_n_fields(m_prebuilt->search_tuple, n_fields);
+
+ dict_index_copy_types(
+ m_prebuilt->search_tuple, m_prebuilt->index,
+ n_fields);
+
+ /* For an FTS query, if a FTS_DOC_ID column exists, the
+ FTS_DOC_ID field is always added to read_set. */
+ m_prebuilt->fts_doc_id_in_read_set = m_prebuilt->in_fts_query
+ && m_prebuilt->read_just_key
+ && m_prebuilt->index->contains_col_or_prefix(
+ m_prebuilt->table->fts->doc_col, false);
+ }
+
+ /* MySQL changes the active index for a handle also during some
+ queries, for example SELECT MAX(a), SUM(a) first retrieves the MAX()
+ and then calculates the sum. Previously we played safe and used
+ the flag ROW_MYSQL_WHOLE_ROW below, but that caused unnecessary
+ copying. Starting from MySQL-4.1 we use a more efficient flag here. */
+
+ build_template(false);
+
+ DBUG_RETURN(0);
+}
+
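+/* Call sequence sketch (hypothetical; the server core wraps these calls):
+a secondary-index lookup is typically driven as
+ h->change_active_index(keynr);
+ h->index_read(buf, key, key_len, HA_READ_KEY_EXACT);
+so the search tuple sized above matches the index selected here. */
+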
+/***********************************************************************//**
+Reads the next or previous row from a cursor, which must have previously been
+positioned using index_read.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+
+int
+ha_innobase::general_fetch(
+/*=======================*/
+ uchar* buf, /*!< in/out: buffer for next row in MySQL
+ format */
+ uint direction, /*!< in: ROW_SEL_NEXT or ROW_SEL_PREV */
+ uint match_mode) /*!< in: 0, ROW_SEL_EXACT, or
+ ROW_SEL_EXACT_PREFIX */
+{
+ DBUG_ENTER("general_fetch");
+
+ const trx_t* trx = m_prebuilt->trx;
+
+ ut_ad(trx == thd_to_trx(m_user_thd));
+
+ if (m_prebuilt->table->is_readable()) {
+ } else if (m_prebuilt->table->corrupted) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ } else {
+ DBUG_RETURN(m_prebuilt->table->space
+ ? HA_ERR_DECRYPTION_FAILED
+ : HA_ERR_NO_SUCH_TABLE);
+ }
+
+ int error;
+
+ switch (dberr_t ret = row_search_mvcc(buf, PAGE_CUR_UNSUPP, m_prebuilt,
+ match_mode, direction)) {
+ case DB_SUCCESS:
+ error = 0;
+ table->status = 0;
+ if (m_prebuilt->table->is_system_db) {
+ srv_stats.n_system_rows_read.add(
+ thd_get_thread_id(trx->mysql_thd), 1);
+ } else {
+ srv_stats.n_rows_read.add(
+ thd_get_thread_id(trx->mysql_thd), 1);
+ }
+ break;
+ case DB_RECORD_NOT_FOUND:
+ error = HA_ERR_END_OF_FILE;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_END_OF_INDEX:
+ error = HA_ERR_END_OF_FILE;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_TABLESPACE_DELETED:
+ ib_senderrf(
+ trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_TABLESPACE_MISSING;
+ break;
+ case DB_TABLESPACE_NOT_FOUND:
+
+ ib_senderrf(
+ trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_MISSING,
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_TABLESPACE_MISSING;
+ break;
+ default:
+ error = convert_error_code_to_mysql(
+ ret, m_prebuilt->table->flags, m_user_thd);
+
+ table->status = STATUS_NOT_FOUND;
+ break;
+ }
+
+ DBUG_RETURN(error);
+}
+
+/***********************************************************************//**
+Reads the next row from a cursor, which must have previously been
+positioned using index_read.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+
+int
+ha_innobase::index_next(
+/*====================*/
+ uchar* buf) /*!< in/out: buffer for next row in MySQL
+ format */
+{
+ return(general_fetch(buf, ROW_SEL_NEXT, 0));
+}
+
+/*******************************************************************//**
+Reads the next row matching to the key value given as the parameter.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+
+int
+ha_innobase::index_next_same(
+/*=========================*/
+ uchar* buf, /*!< in/out: buffer for the row */
+ const uchar*, uint)
+{
+ return(general_fetch(buf, ROW_SEL_NEXT, m_last_match_mode));
+}
+
+/***********************************************************************//**
+Reads the previous row from a cursor, which must have previously been
+positioned using index_read.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+
+int
+ha_innobase::index_prev(
+/*====================*/
+ uchar* buf) /*!< in/out: buffer for previous row in MySQL format */
+{
+ return(general_fetch(buf, ROW_SEL_PREV, 0));
+}
+
+/********************************************************************//**
+Positions a cursor on the first record in an index and reads the
+corresponding row to buf.
+@return 0, HA_ERR_END_OF_FILE, or error code */
+
+int
+ha_innobase::index_first(
+/*=====================*/
+ uchar* buf) /*!< in/out: buffer for the row */
+{
+ DBUG_ENTER("index_first");
+
+ int error = index_read(buf, NULL, 0, HA_READ_AFTER_KEY);
+
+ /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */
+
+ if (error == HA_ERR_KEY_NOT_FOUND) {
+ error = HA_ERR_END_OF_FILE;
+ }
+
+ DBUG_RETURN(error);
+}
+
+/********************************************************************//**
+Positions a cursor on the last record in an index and reads the
+corresponding row to buf.
+@return 0, HA_ERR_END_OF_FILE, or error code */
+
+int
+ha_innobase::index_last(
+/*====================*/
+ uchar* buf) /*!< in/out: buffer for the row */
+{
+ DBUG_ENTER("index_last");
+
+ int error = index_read(buf, NULL, 0, HA_READ_BEFORE_KEY);
+
+ /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */
+
+ if (error == HA_ERR_KEY_NOT_FOUND) {
+ error = HA_ERR_END_OF_FILE;
+ }
+
+ DBUG_RETURN(error);
+}
+
+/****************************************************************//**
+Initialize a table scan.
+@return 0 or error number */
+
+int
+ha_innobase::rnd_init(
+/*==================*/
+ bool scan) /*!< in: true if table/index scan, false otherwise */
+{
+ int err;
+
+ /* Store the active index value so that we can restore the original
+ value after a scan */
+
+ if (m_prebuilt->clust_index_was_generated) {
+ err = change_active_index(MAX_KEY);
+ } else {
+ err = change_active_index(m_primary_key);
+ }
+
+ /* Don't use semi-consistent read in random row reads (by position).
+ This means we must disable semi_consistent_read if scan is false */
+
+ if (!scan) {
+ try_semi_consistent_read(0);
+ }
+
+ m_start_of_scan = true;
+
+ return(err);
+}
+
+/*****************************************************************//**
+Ends a table scan.
+@return 0 or error number */
+
+int
+ha_innobase::rnd_end(void)
+/*======================*/
+{
+ return(index_end());
+}
+
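+/* Table scan protocol (sketch of how the server drives these methods;
+not a verbatim call site):
+ h->rnd_init(true);
+ while ((err = h->rnd_next(buf)) == 0) {
+ ... process the row in buf ...
+ }
+ h->rnd_end();
+err == HA_ERR_END_OF_FILE is the normal loop terminator. */
+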
+/*****************************************************************//**
+Reads the next row in a table scan (also used to read the FIRST row
+in a table scan).
+@return 0, HA_ERR_END_OF_FILE, or error number */
+
+int
+ha_innobase::rnd_next(
+/*==================*/
+ uchar* buf) /*!< in/out: returns the row in this buffer,
+ in MySQL format */
+{
+ int error;
+
+ DBUG_ENTER("rnd_next");
+
+ if (m_start_of_scan) {
+ error = index_first(buf);
+
+ if (error == HA_ERR_KEY_NOT_FOUND) {
+ error = HA_ERR_END_OF_FILE;
+ }
+
+ m_start_of_scan = false;
+ } else {
+ error = general_fetch(buf, ROW_SEL_NEXT, 0);
+ }
+
+ DBUG_RETURN(error);
+}
+
+/**********************************************************************//**
+Fetches a row from the table based on a row reference.
+@return 0, HA_ERR_KEY_NOT_FOUND, or error code */
+
+int
+ha_innobase::rnd_pos(
+/*=================*/
+ uchar* buf, /*!< in/out: buffer for the row */
+ uchar* pos) /*!< in: primary key value of the row in the
+ MySQL format, or the row id if the clustered
+ index was internally generated by InnoDB; the
+ length of data in pos has to be ref_length */
+{
+ DBUG_ENTER("rnd_pos");
+ DBUG_DUMP("key", pos, ref_length);
+
+ ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));
+
+ /* Note that we assume the length of the row reference is fixed
+ for the table, and it is == ref_length */
+
+ int error = index_read(buf, pos, (uint)ref_length, HA_READ_KEY_EXACT);
+
+ if (error != 0) {
+ DBUG_PRINT("error", ("Got error: %d", error));
+ }
+
+ DBUG_RETURN(error);
+}
+
+/**********************************************************************//**
+Initialize FT index scan
+@return 0 or error number */
+
+int
+ha_innobase::ft_init()
+/*==================*/
+{
+ DBUG_ENTER("ft_init");
+
+ trx_t* trx = check_trx_exists(ha_thd());
+
+ /* FTS queries are not treated as autocommit non-locking selects.
+ This is because the FTS implementation can acquire locks behind
+ the scenes. This has not been verified but it is safer to treat
+ them as regular read only transactions for now. */
+
+ if (!trx_is_started(trx)) {
+ trx->will_lock = true;
+ }
+
+ DBUG_RETURN(rnd_init(false));
+}
+
+/**********************************************************************//**
+Initialize FT index scan
+@return FT_INFO structure if successful or NULL */
+
+FT_INFO*
+ha_innobase::ft_init_ext(
+/*=====================*/
+ uint flags, /* in: FT search mode flags, e.g. FT_BOOL */
+ uint keynr, /* in: key number of the FT index to use */
+ String* key) /* in: FT search query string */
+{
+ NEW_FT_INFO* fts_hdl = NULL;
+ dict_index_t* index;
+ fts_result_t* result;
+ char buf_tmp[8192];
+ ulint buf_tmp_used;
+ uint num_errors;
+ ulint query_len = key->length();
+ const CHARSET_INFO* char_set = key->charset();
+ const char* query = key->ptr();
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ {
+ ib::info out;
+ out << "keynr=" << keynr << ", '";
+ out.write(key->ptr(), key->length());
+ }
+
+ if (flags & FT_BOOL) {
+ ib::info() << "BOOL search";
+ } else {
+ ib::info() << "NL search";
+ }
+ }
+
+ /* FIXME: utf32 and utf16 are not compatible with some of the
+ string functions used. So we convert them to utf8 before
+ we proceed. */
+ if (strcmp(char_set->csname, "utf32") == 0
+ || strcmp(char_set->csname, "utf16") == 0) {
+
+ buf_tmp_used = innobase_convert_string(
+ buf_tmp, sizeof(buf_tmp) - 1,
+ &my_charset_utf8mb3_general_ci,
+ query, query_len, (CHARSET_INFO*) char_set,
+ &num_errors);
+
+ buf_tmp[buf_tmp_used] = 0;
+ query = buf_tmp;
+ query_len = buf_tmp_used;
+ }
+
+ trx_t* trx = m_prebuilt->trx;
+
+ /* FTS queries are not treated as autocommit non-locking selects.
+ This is because the FTS implementation can acquire locks behind
+ the scenes. This has not been verified but it is safer to treat
+ them as regular read only transactions for now. */
+
+ if (!trx_is_started(trx)) {
+ trx->will_lock = true;
+ }
+
+ dict_table_t* ft_table = m_prebuilt->table;
+
+ /* Table does not have an FTS index */
+ if (!ft_table->fts || ib_vector_is_empty(ft_table->fts->indexes)) {
+ my_error(ER_TABLE_HAS_NO_FT, MYF(0));
+ return(NULL);
+ }
+
+ /* If tablespace is discarded, we should return here */
+ if (!ft_table->space) {
+ my_error(ER_TABLESPACE_MISSING, MYF(0), table->s->db.str,
+ table->s->table_name.str);
+ return(NULL);
+ }
+
+ if (keynr == NO_SUCH_KEY) {
+ /* FIXME: Investigate the NO_SUCH_KEY usage */
+ index = reinterpret_cast<dict_index_t*>
+ (ib_vector_getp(ft_table->fts->indexes, 0));
+ } else {
+ index = innobase_get_index(keynr);
+ }
+
+ if (index == NULL || index->type != DICT_FTS) {
+ my_error(ER_TABLE_HAS_NO_FT, MYF(0));
+ return(NULL);
+ }
+
+ if (!(ft_table->fts->added_synced)) {
+ fts_init_index(ft_table, FALSE);
+
+ ft_table->fts->added_synced = true;
+ }
+
+ const byte* q = reinterpret_cast<const byte*>(
+ const_cast<char*>(query));
+
+ // FIXME: support ft_init_ext_with_hints(), pass LIMIT
+ dberr_t error = fts_query(trx, index, flags, q, query_len, &result);
+
+ if (error != DB_SUCCESS) {
+ my_error(convert_error_code_to_mysql(error, 0, NULL), MYF(0));
+ return(NULL);
+ }
+
+ /* Allocate FTS handler, and instantiate it before return */
+ fts_hdl = reinterpret_cast<NEW_FT_INFO*>(
+ my_malloc(PSI_INSTRUMENT_ME, sizeof(NEW_FT_INFO), MYF(0)));
+
+ fts_hdl->please = const_cast<_ft_vft*>(&ft_vft_result);
+ fts_hdl->could_you = const_cast<_ft_vft_ext*>(&ft_vft_ext_result);
+ fts_hdl->ft_prebuilt = m_prebuilt;
+ fts_hdl->ft_result = result;
+
+ /* FIXME: Re-evaluate the condition when Bug 14469540 is resolved */
+ m_prebuilt->in_fts_query = true;
+
+ return(reinterpret_cast<FT_INFO*>(fts_hdl));
+}
+
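+/* Example statements that reach ft_init_ext() (illustrative):
+ SELECT * FROM t WHERE MATCH(txt) AGAINST('word');
+ SELECT * FROM t WHERE MATCH(txt) AGAINST('+word -other' IN BOOLEAN MODE);
+The BOOLEAN MODE form arrives with FT_BOOL set in flags. */
+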
+/*****************************************************************//**
+Set up search tuple for a query through FTS_DOC_ID_INDEX on
+supplied Doc ID. This is used by MySQL to retrieve the documents
+once the search result (Doc IDs) is available */
+static
+void
+innobase_fts_create_doc_id_key(
+/*===========================*/
+ dtuple_t* tuple, /* in/out: m_prebuilt->search_tuple */
+ const dict_index_t*
+ index, /* in: index (FTS_DOC_ID_INDEX) */
+ doc_id_t* doc_id) /* in/out: doc id to search, value
+ could be changed to storage format
+ used for search. */
+{
+ doc_id_t temp_doc_id;
+ dfield_t* dfield = dtuple_get_nth_field(tuple, 0);
+
+ ut_a(dict_index_get_n_unique(index) == 1);
+
+ dtuple_set_n_fields(tuple, index->n_fields);
+ dict_index_copy_types(tuple, index, index->n_fields);
+
+#ifdef UNIV_DEBUG
+ /* The unique Doc ID field should be an eight-byte integer */
+ dict_field_t* field = dict_index_get_nth_field(index, 0);
+ ut_a(field->col->mtype == DATA_INT);
+ ut_ad(sizeof(*doc_id) == field->fixed_len);
+ ut_ad(!strcmp(index->name, FTS_DOC_ID_INDEX_NAME));
+#endif /* UNIV_DEBUG */
+
+ /* Convert to storage byte order */
+ mach_write_to_8(reinterpret_cast<byte*>(&temp_doc_id), *doc_id);
+ *doc_id = temp_doc_id;
+ dfield_set_data(dfield, doc_id, sizeof(*doc_id));
+
+ dtuple_set_n_fields_cmp(tuple, 1);
+
+ for (ulint i = 1; i < index->n_fields; i++) {
+ dfield = dtuple_get_nth_field(tuple, i);
+ dfield_set_null(dfield);
+ }
+}
+
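+/* Note on the conversion above: mach_write_to_8() writes the value in
+big-endian (storage) byte order, so e.g. Doc ID 0x0102030405060708 is
+stored as the byte sequence 01 02 03 04 05 06 07 08; byte-wise
+comparison in the index then agrees with the numeric order of Doc IDs. */
+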
+/**********************************************************************//**
+Fetch next result from the FT result set
+@return error code */
+
+int
+ha_innobase::ft_read(
+/*=================*/
+ uchar* buf) /*!< in/out: buf contain result row */
+{
+ row_prebuilt_t* ft_prebuilt;
+
+ ft_prebuilt = reinterpret_cast<NEW_FT_INFO*>(ft_handler)->ft_prebuilt;
+
+ ut_a(ft_prebuilt == m_prebuilt);
+
+ fts_result_t* result;
+
+ result = reinterpret_cast<NEW_FT_INFO*>(ft_handler)->ft_result;
+
+ if (result->current == NULL) {
+ /* This is the case where the FTS query did not
+ return any matching documents. */
+ if (result->rankings_by_id != NULL) {
+ /* Now that we have the complete result, we
+ need to sort the document ids on their rank
+ calculation. */
+
+ fts_query_sort_result_on_rank(result);
+
+ result->current = const_cast<ib_rbt_node_t*>(
+ rbt_first(result->rankings_by_rank));
+ } else {
+ ut_a(result->current == NULL);
+ }
+ } else {
+ result->current = const_cast<ib_rbt_node_t*>(
+ rbt_next(result->rankings_by_rank, result->current));
+ }
+
+next_record:
+
+ if (result->current != NULL) {
+ doc_id_t search_doc_id;
+ dtuple_t* tuple = m_prebuilt->search_tuple;
+
+ /* If we only need information from the result we can return
+ without fetching the table row */
+ if (ft_prebuilt->read_just_key) {
+#ifdef MYSQL_STORE_FTS_DOC_ID
+ if (m_prebuilt->fts_doc_id_in_read_set) {
+ fts_ranking_t* ranking;
+ ranking = rbt_value(fts_ranking_t,
+ result->current);
+ innobase_fts_store_docid(
+ table, ranking->doc_id);
+ }
+#endif
+ table->status= 0;
+ return(0);
+ }
+
+ dict_index_t* index;
+
+ index = m_prebuilt->table->fts_doc_id_index;
+
+ /* Must find the index */
+ ut_a(index != NULL);
+
+ /* Switch to the FTS doc id index */
+ m_prebuilt->index = index;
+
+ fts_ranking_t* ranking = rbt_value(
+ fts_ranking_t, result->current);
+
+ search_doc_id = ranking->doc_id;
+
+ /* We pass a pointer of search_doc_id because it will be
+ converted to storage byte order used in the search
+ tuple. */
+ innobase_fts_create_doc_id_key(tuple, index, &search_doc_id);
+
+ int error;
+
+ switch (dberr_t ret = row_search_for_mysql(buf, PAGE_CUR_GE,
+ m_prebuilt,
+ ROW_SEL_EXACT, 0)) {
+ case DB_SUCCESS:
+ error = 0;
+ table->status = 0;
+ break;
+ case DB_RECORD_NOT_FOUND:
+ result->current = const_cast<ib_rbt_node_t*>(
+ rbt_next(result->rankings_by_rank,
+ result->current));
+
+ if (!result->current) {
+ /* The result set is exhausted: return
+ HA_ERR_END_OF_FILE just like
+ ha_innobase::general_fetch() and/or
+ ha_innobase::index_first() etc. */
+ error = HA_ERR_END_OF_FILE;
+ table->status = STATUS_NOT_FOUND;
+ } else {
+ goto next_record;
+ }
+ break;
+ case DB_END_OF_INDEX:
+ error = HA_ERR_END_OF_FILE;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_TABLESPACE_DELETED:
+
+ ib_senderrf(
+ m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_TABLESPACE_MISSING;
+ break;
+ case DB_TABLESPACE_NOT_FOUND:
+
+ ib_senderrf(
+ m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_MISSING,
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_TABLESPACE_MISSING;
+ break;
+ default:
+ error = convert_error_code_to_mysql(
+ ret, 0, m_user_thd);
+
+ table->status = STATUS_NOT_FOUND;
+ break;
+ }
+
+ return(error);
+ }
+
+ return(HA_ERR_END_OF_FILE);
+}
+
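+/* Fetch loop sketch (illustrative description of the calling
+convention): after ft_init_ext() has produced the result set, the server
+calls ft_read(buf) repeatedly until it returns HA_ERR_END_OF_FILE; each
+successful call maps one ranked Doc ID back to a table row through
+FTS_DOC_ID_INDEX as implemented above. */
+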
+#ifdef WITH_WSREP
+inline
+const char*
+wsrep_key_type_to_str(Wsrep_service_key_type type)
+{
+ switch (type) {
+ case WSREP_SERVICE_KEY_SHARED:
+ return "shared";
+ case WSREP_SERVICE_KEY_REFERENCE:
+ return "reference";
+ case WSREP_SERVICE_KEY_UPDATE:
+ return "update";
+ case WSREP_SERVICE_KEY_EXCLUSIVE:
+ return "exclusive";
+ };
+ return "unknown";
+}
+
+extern dberr_t
+wsrep_append_foreign_key(
+/*===========================*/
+ trx_t* trx, /*!< in: trx */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ const rec_t* rec, /*!<in: clustered index record */
+ dict_index_t* index, /*!<in: clustered index */
+ ibool referenced, /*!<in: is check for referenced table */
+ Wsrep_service_key_type key_type) /*!< in: access type of this key
+ (shared, exclusive, reference...) */
+{
+ if (!trx->is_wsrep() || !wsrep_thd_is_local(trx->mysql_thd)) {
+ return DB_SUCCESS;
+ }
+
+ THD* thd = trx->mysql_thd;
+
+ if (!foreign ||
+ (!foreign->referenced_table && !foreign->foreign_table)) {
+ WSREP_INFO("FK: %s missing in: %s",
+ (!foreign ? "constraint" :
+ (!foreign->referenced_table ?
+ "referenced table" : "foreign table")),
+ wsrep_thd_query(thd));
+ return DB_ERROR;
+ }
+
+ ulint rcode = DB_SUCCESS;
+ char cache_key[513] = {'\0'};
+ size_t cache_key_len = 0;
+
+ if ( !((referenced) ?
+ foreign->referenced_table : foreign->foreign_table)) {
+ WSREP_DEBUG("pulling %s table into cache",
+ (referenced) ? "referenced" : "foreign");
+ mutex_enter(&dict_sys.mutex);
+
+ if (referenced) {
+ foreign->referenced_table =
+ dict_table_get_low(
+ foreign->referenced_table_name_lookup);
+ if (foreign->referenced_table) {
+ foreign->referenced_index =
+ dict_foreign_find_index(
+ foreign->referenced_table, NULL,
+ foreign->referenced_col_names,
+ foreign->n_fields,
+ foreign->foreign_index,
+ TRUE, FALSE);
+ }
+ } else {
+ foreign->foreign_table =
+ dict_table_get_low(
+ foreign->foreign_table_name_lookup);
+
+ if (foreign->foreign_table) {
+ foreign->foreign_index =
+ dict_foreign_find_index(
+ foreign->foreign_table, NULL,
+ foreign->foreign_col_names,
+ foreign->n_fields,
+ foreign->referenced_index,
+ TRUE, FALSE);
+ }
+ }
+ mutex_exit(&dict_sys.mutex);
+ }
+
+ if ( !((referenced) ?
+ foreign->referenced_table : foreign->foreign_table)) {
+ WSREP_WARN("FK: %s missing in query: %s",
+ (!foreign->referenced_table) ?
+ "referenced table" : "foreign table",
+ (wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void");
+ return DB_ERROR;
+ }
+
+ byte key[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'};
+ ulint len = WSREP_MAX_SUPPORTED_KEY_LENGTH;
+
+ dict_index_t *idx_target = (referenced) ?
+ foreign->referenced_index : index;
+ dict_index_t *idx = (referenced) ?
+ UT_LIST_GET_FIRST(foreign->referenced_table->indexes) :
+ UT_LIST_GET_FIRST(foreign->foreign_table->indexes);
+ int i = 0;
+
+ while (idx != NULL && idx != idx_target) {
+ if (innobase_strcasecmp (idx->name, innobase_index_reserve_name) != 0) {
+ i++;
+ }
+ idx = UT_LIST_GET_NEXT(indexes, idx);
+ }
+
+ ut_a(idx);
+ key[0] = byte(i);
+
+ rcode = wsrep_rec_get_foreign_key(
+ &key[1], &len, rec, index, idx,
+ wsrep_protocol_version > 1);
+
+ if (rcode != DB_SUCCESS) {
+ WSREP_ERROR(
+ "FK key set failed: " ULINTPF
+ " (" ULINTPF "%s), index: %s %s, %s",
+ rcode, referenced, wsrep_key_type_to_str(key_type),
+ (index) ? index->name() : "void index",
+ (index && index->table) ? index->table->name.m_name :
+ "void table",
+ wsrep_thd_query(thd));
+ return DB_ERROR;
+ }
+
+ strncpy(cache_key,
+ (wsrep_protocol_version > 1) ?
+ ((referenced) ?
+ foreign->referenced_table->name.m_name :
+ foreign->foreign_table->name.m_name) :
+ foreign->foreign_table->name.m_name, sizeof(cache_key) - 1);
+ cache_key_len = strlen(cache_key);
+
+#ifdef WSREP_DEBUG_PRINT
+ ulint j;
+ fprintf(stderr, "FK parent key, table: %s %s len: %lu ",
+ cache_key, wsrep_key_type_to_str(key_type), len+1);
+ for (j=0; j<len+1; j++) {
+ fprintf(stderr, " %hhX, ", key[j]);
+ }
+ fprintf(stderr, "\n");
+#endif
+ char *p = strchr(cache_key, '/');
+
+ if (p) {
+ *p = '\0';
+ } else {
+ WSREP_WARN("unexpected foreign key table %s %s",
+ foreign->referenced_table->name.m_name,
+ foreign->foreign_table->name.m_name);
+ }
+
+ wsrep_buf_t wkey_part[3];
+ wsrep_key_t wkey = {wkey_part, 3};
+
+ if (!wsrep_prepare_key_for_innodb(
+ thd,
+ (const uchar*)cache_key,
+ cache_key_len + 1,
+ (const uchar*)key, len+1,
+ wkey_part,
+ (size_t*)&wkey.key_parts_num)) {
+ WSREP_WARN("key prepare failed for cascaded FK: %s",
+ (wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void");
+ return DB_ERROR;
+ }
+ rcode = wsrep_thd_append_key(thd, &wkey, 1, key_type);
+ if (rcode) {
+ DBUG_PRINT("wsrep", ("row key failed: " ULINTPF, rcode));
+ WSREP_ERROR("Appending cascaded fk row key failed: %s, "
+ ULINTPF,
+ (wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void", rcode);
+ return DB_ERROR;
+ }
+
+ return DB_SUCCESS;
+}
+
+static int
+wsrep_append_key(
+/*=============*/
+ THD *thd,
+ trx_t *trx,
+ TABLE_SHARE *table_share,
+ const uchar* key,
+ uint16_t key_len,
+ Wsrep_service_key_type key_type /*!< in: access type of this key
+ (shared, exclusive, semi...) */
+)
+{
+ DBUG_ENTER("wsrep_append_key");
+ DBUG_PRINT("enter",
+ ("thd: %lu trx: %lld", thd_get_thread_id(thd),
+ (long long)trx->id));
+#ifdef WSREP_DEBUG_PRINT
+ fprintf(stderr, "%s conn %lu, trx " TRX_ID_FMT ", keylen %d, key %s.%s\n",
+ wsrep_key_type_to_str(key_type),
+ thd_get_thread_id(thd), trx->id, key_len,
+ table_share->table_name.str, key);
+ for (int i=0; i<key_len; i++) {
+ fprintf(stderr, "%hhX, ", key[i]);
+ }
+ fprintf(stderr, "\n");
+#endif
+ wsrep_buf_t wkey_part[3];
+ wsrep_key_t wkey = {wkey_part, 3};
+
+ if (!wsrep_prepare_key_for_innodb(
+ thd,
+ (const uchar*)table_share->table_cache_key.str,
+ table_share->table_cache_key.length,
+ (const uchar*)key, key_len,
+ wkey_part,
+ (size_t*)&wkey.key_parts_num)) {
+ WSREP_WARN("key prepare failed for: %s",
+ (wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void");
+ DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
+ }
+
+ int rcode = wsrep_thd_append_key(thd, &wkey, 1, key_type);
+ if (rcode) {
+ DBUG_PRINT("wsrep", ("row key failed: %d", rcode));
+ WSREP_WARN("Appending row key failed: %s, %d",
+ (wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void", rcode);
+ DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
+ }
+
+ DBUG_RETURN(0);
+}
+
+static bool
+referenced_by_foreign_key2(
+/*=======================*/
+ dict_table_t* table,
+ dict_index_t* index)
+{
+ ut_ad(table != NULL);
+ ut_ad(index != NULL);
+
+ const dict_foreign_set* fks = &table->referenced_set;
+
+ for (dict_foreign_set::const_iterator it = fks->begin();
+ it != fks->end();
+ ++it) {
+ dict_foreign_t* foreign = *it;
+
+ if (foreign->referenced_index != index) {
+ continue;
+ }
+ ut_ad(table == foreign->referenced_table);
+ return true;
+ }
+ return false;
+}
+
+int
+ha_innobase::wsrep_append_keys(
+/*===========================*/
+ THD *thd,
+ Wsrep_service_key_type key_type, /*!< in: access type of this row
+ operation:
+ (shared, exclusive, reference...) */
+ const uchar* record0, /* in: row in MySQL format */
+ const uchar* record1) /* in: row in MySQL format */
+{
+ /* Sanity check: newly inserted records should always be passed with
+ EXCLUSIVE key type; all the rest are expected to carry a pre-image
+ */
+ ut_a(record1 != NULL || key_type == WSREP_SERVICE_KEY_EXCLUSIVE);
+
+ int rcode;
+ DBUG_ENTER("wsrep_append_keys");
+
+ bool key_appended = false;
+ trx_t *trx = thd_to_trx(thd);
+
+#ifdef WSREP_DEBUG_PRINT
+ fprintf(stderr, "%s conn %lu, trx " TRX_ID_FMT ", table %s\nSQL: %s\n",
+ wsrep_key_type_to_str(key_type),
+ thd_get_thread_id(thd), trx->id,
+ table_share->table_name.str, wsrep_thd_query(thd));
+#endif
+
+ if (table_share && table_share->tmp_table != NO_TMP_TABLE) {
+ WSREP_DEBUG("skipping tmp table DML: THD: %lu tmp: %d SQL: %s",
+ thd_get_thread_id(thd),
+ table_share->tmp_table,
+ (wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void");
+ DBUG_RETURN(0);
+ }
+
+ if (wsrep_protocol_version == 0) {
+ uchar keyval[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'};
+ uchar *key = &keyval[0];
+ bool is_null;
+
+ auto len = wsrep_store_key_val_for_row(
+ thd, table, 0, key, WSREP_MAX_SUPPORTED_KEY_LENGTH,
+ record0, &is_null);
+
+ if (!is_null) {
+ rcode = wsrep_append_key(
+ thd, trx, table_share, keyval,
+ len, key_type);
+
+ if (rcode) {
+ DBUG_RETURN(rcode);
+ }
+ } else {
+ WSREP_DEBUG("NULL key skipped (proto 0): %s",
+ wsrep_thd_query(thd));
+ }
+ } else {
+ ut_a(table->s->keys <= 256);
+ uint i;
+ bool hasPK= false;
+
+ for (i=0; i<table->s->keys; ++i) {
+ KEY* key_info = table->key_info + i;
+ if (key_info->flags & HA_NOSAME) {
+ hasPK = true;
+ break;
+ }
+ }
+
+ for (i=0; i<table->s->keys; ++i) {
+ KEY* key_info = table->key_info + i;
+
+ dict_index_t* idx = innobase_get_index(i);
+ dict_table_t* tab = (idx) ? idx->table : NULL;
+
+ /* keyval[] shall contain an ordinal number at byte 0
+ and the actual key data shall be written at byte 1.
+ Hence the total data length is the key length + 1 */
+ uchar keyval0[WSREP_MAX_SUPPORTED_KEY_LENGTH+1]= {'\0'};
+ uchar keyval1[WSREP_MAX_SUPPORTED_KEY_LENGTH+1]= {'\0'};
+ keyval0[0] = (uchar)i;
+ keyval1[0] = (uchar)i;
+ uchar* key0 = &keyval0[1];
+ uchar* key1 = &keyval1[1];
+
+ if (!tab) {
+ WSREP_WARN("MariaDB-InnoDB key mismatch %s %s",
+ table->s->table_name.str,
+ key_info->name.str);
+ }
+ /* !hasPK == table with no PK,
+ must append all non-unique keys */
+ if (!hasPK || key_info->flags & HA_NOSAME ||
+ ((tab &&
+ referenced_by_foreign_key2(tab, idx)) ||
+ (!tab && referenced_by_foreign_key()))) {
+
+ bool is_null0;
+ auto len0 = wsrep_store_key_val_for_row(
+ thd, table, i, key0,
+ WSREP_MAX_SUPPORTED_KEY_LENGTH,
+ record0, &is_null0);
+
+ if (record1) {
+ bool is_null1;
+ auto len1= wsrep_store_key_val_for_row(
+ thd, table, i, key1,
+ WSREP_MAX_SUPPORTED_KEY_LENGTH,
+ record1, &is_null1);
+
+ if (is_null0 != is_null1 ||
+ len0 != len1 ||
+ memcmp(key0, key1, len0)) {
+ /* This key has changed. If it
+ is unique, this is an exclusive
+ operation -> upgrade key type */
+ if (key_info->flags & HA_NOSAME) {
+ key_type = WSREP_SERVICE_KEY_EXCLUSIVE;
+ }
+
+ if (!is_null1) {
+ rcode = wsrep_append_key(
+ thd, trx, table_share,
+ keyval1,
+ /* for len1+1 see keyval1
+ initialization comment */
+ uint16_t(len1+1),
+ key_type);
+ if (rcode)
+ DBUG_RETURN(rcode);
+ }
+ }
+ }
+
+ if (!is_null0) {
+ rcode = wsrep_append_key(
+ thd, trx, table_share,
+ /* for len0+1 see keyval0
+ initialization comment */
+ keyval0, uint16_t(len0+1),
+ key_type);
+ if (rcode)
+ DBUG_RETURN(rcode);
+
+ if (key_info->flags & HA_NOSAME ||
+ key_type == WSREP_SERVICE_KEY_SHARED||
+ key_type == WSREP_SERVICE_KEY_REFERENCE)
+ key_appended = true;
+ } else {
+ WSREP_DEBUG("NULL key skipped: %s",
+ wsrep_thd_query(thd));
+ }
+ }
+ }
+ }
+
+ /* if no PK, calculate hash of full row, to be the key value */
+ if (!key_appended && wsrep_certify_nonPK) {
+ uchar digest[16];
+
+ wsrep_calc_row_hash(digest, record0, table, m_prebuilt);
+
+ if (int rcode = wsrep_append_key(thd, trx, table_share,
+ digest, 16, key_type)) {
+ DBUG_RETURN(rcode);
+ }
+
+ if (record1) {
+ wsrep_calc_row_hash(
+ digest, record1, table, m_prebuilt);
+ if (int rcode = wsrep_append_key(thd, trx, table_share,
+ digest, 16,
+ key_type)) {
+ DBUG_RETURN(rcode);
+ }
+ }
+ DBUG_RETURN(0);
+ }
+
+ DBUG_RETURN(0);
+}
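+
+/* Key layout recap (informal): for wsrep protocol versions > 0 every
+appended key consists of the index ordinal in byte 0 followed by the key
+image, hence the len+1 lengths passed to wsrep_append_key() above;
+protocol version 0 sends only the first key image without the ordinal
+byte. */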
+#endif /* WITH_WSREP */
+
+/*********************************************************************//**
+Stores a reference to the current row to 'ref' field of the handle. Note
+that in the case where we have generated the clustered index for the
+table, the function parameter is illogical: we MUST ASSUME that 'record'
+is the current 'position' of the handle, because if row ref is actually
+the row id internally generated in InnoDB, then 'record' does not contain
+it. We just guess that the row id must be for the record where the handle
+was positioned the last time. */
+
+void
+ha_innobase::position(
+/*==================*/
+ const uchar* record) /*!< in: row in MySQL format */
+{
+ uint len;
+
+ ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));
+
+ if (m_prebuilt->clust_index_was_generated) {
+ /* No primary key was defined for the table and we
+ generated the clustered index from row id: the
+ row reference will be the row id, not any key value
+ that MySQL knows of */
+
+ len = DATA_ROW_ID_LEN;
+
+ memcpy(ref, m_prebuilt->row_id, len);
+ } else {
+
+ /* Copy primary key as the row reference */
+ KEY* key_info = table->key_info + m_primary_key;
+ key_copy(ref, (uchar*)record, key_info, key_info->key_length);
+ len = key_info->key_length;
+ }
+
+ ut_ad(len == ref_length);
+}
+
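+/* Pairing sketch (illustrative; the server core owns this bookkeeping):
+ h->position(record); // fills h->ref with ref_length bytes
+ memcpy(saved, h->ref, h->ref_length);
+ ...
+ h->rnd_pos(buf, saved); // later re-reads the same row
+saved is a hypothetical buffer of at least ref_length bytes. */
+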
+/*****************************************************************//**
+Check whether there exists a column named "FTS_DOC_ID", which is
+reserved for the InnoDB FTS Doc ID
+@return true if there exists a "FTS_DOC_ID" column */
+static
+bool
+create_table_check_doc_id_col(
+/*==========================*/
+ trx_t* trx, /*!< in: InnoDB transaction handle */
+ const TABLE* form, /*!< in: information on table
+ columns and indexes */
+ ulint* doc_id_col) /*!< out: Doc ID column number if
+ there exists a FTS_DOC_ID column,
+ ULINT_UNDEFINED if column is of the
+ wrong type/name/size */
+{
+ for (ulint i = 0; i < form->s->fields; i++) {
+ const Field* field = form->field[i];
+ if (!field->stored_in_db()) {
+ continue;
+ }
+
+ unsigned unsigned_type;
+
+ auto col_type = get_innobase_type_from_mysql_type(
+ &unsigned_type, field);
+
+ auto col_len = field->pack_length();
+
+ if (innobase_strcasecmp(field->field_name.str,
+ FTS_DOC_ID_COL_NAME) == 0) {
+
+ /* Note the name is case sensitive due to
+ our internal query parser */
+ if (col_type == DATA_INT
+ && !field->real_maybe_null()
+ && col_len == sizeof(doc_id_t)
+ && (strcmp(field->field_name.str,
+ FTS_DOC_ID_COL_NAME) == 0)) {
+ *doc_id_col = i;
+ } else {
+ push_warning_printf(
+ trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: FTS_DOC_ID column must be"
+ " of BIGINT NOT NULL type, and named"
+ " in all capitalized characters");
+ my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+ field->field_name.str);
+ *doc_id_col = ULINT_UNDEFINED;
+ }
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
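+/* Example of a user-supplied Doc ID column that passes the check above:
+ CREATE TABLE t (
+ FTS_DOC_ID BIGINT UNSIGNED NOT NULL,
+ txt TEXT,
+ FULLTEXT INDEX ft (txt)
+ ) ENGINE=InnoDB;
+Any other type, nullability, or letter case of the name is reported via
+ER_WRONG_COLUMN_NAME. */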
+
+/** Finds all base columns needed to compute a given generated column.
+This is returned as a bitmap, in field->table->tmp_set.
+Works for both dict_v_col_t and dict_s_col_t columns.
+@param[in] table InnoDB table
+@param[in] field MySQL field
+@param[in,out] col virtual or stored column */
+template <typename T>
+void
+prepare_vcol_for_base_setup(
+/*========================*/
+ const dict_table_t* table,
+ const Field* field,
+ T* col)
+{
+ ut_ad(col->num_base == 0);
+ ut_ad(col->base_col == NULL);
+
+ MY_BITMAP *old_read_set = field->table->read_set;
+
+ field->table->read_set = &field->table->tmp_set;
+
+ bitmap_clear_all(&field->table->tmp_set);
+ field->vcol_info->expr->walk(
+ &Item::register_field_in_read_map, 1, field->table);
+ col->num_base= bitmap_bits_set(&field->table->tmp_set)
+ & dict_index_t::MAX_N_FIELDS;
+ if (col->num_base != 0) {
+ col->base_col = static_cast<dict_col_t**>(mem_heap_zalloc(
+ table->heap, col->num_base * sizeof(
+ * col->base_col)));
+ }
+ field->table->read_set= old_read_set;
+}
+
+
+/** Set up base columns for virtual column
+@param[in] table InnoDB table
+@param[in] field MySQL field
+@param[in,out] v_col virtual column */
+void
+innodb_base_col_setup(
+ dict_table_t* table,
+ const Field* field,
+ dict_v_col_t* v_col)
+{
+ uint16_t n = 0;
+
+ prepare_vcol_for_base_setup(table, field, v_col);
+
+ for (uint i= 0; i < field->table->s->fields; ++i) {
+ const Field* base_field = field->table->field[i];
+ if (base_field->stored_in_db()
+ && bitmap_is_set(&field->table->tmp_set, i)) {
+ ulint z;
+
+ for (z = 0; z < table->n_cols; z++) {
+ const char* name = dict_table_get_col_name(table, z);
+ if (!innobase_strcasecmp(name,
+ base_field->field_name.str)) {
+ break;
+ }
+ }
+
+ ut_ad(z != table->n_cols);
+
+ v_col->base_col[n] = dict_table_get_nth_col(table, z);
+ ut_ad(v_col->base_col[n]->ind == z);
+ n++;
+ }
+ }
+ v_col->num_base= n & dict_index_t::MAX_N_FIELDS;
+}
+
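+/* Example: for
+ CREATE TABLE t (a INT, b INT, c INT AS (a + b) VIRTUAL) ENGINE=InnoDB;
+walking the value expression of c marks a and b in tmp_set, so
+v_col->base_col ends up pointing at the dict_col_t objects of a and b
+and v_col->num_base becomes 2. */
+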
+/** Set up base columns for stored column
+@param[in] table InnoDB table
+@param[in] field MySQL field
+@param[in,out] s_col stored column */
+void
+innodb_base_col_setup_for_stored(
+ const dict_table_t* table,
+ const Field* field,
+ dict_s_col_t* s_col)
+{
+ ulint n = 0;
+
+ prepare_vcol_for_base_setup(table, field, s_col);
+
+ for (uint i= 0; i < field->table->s->fields; ++i) {
+ const Field* base_field = field->table->field[i];
+
+ if (base_field->stored_in_db()
+ && bitmap_is_set(&field->table->tmp_set, i)) {
+ ulint z;
+ for (z = 0; z < table->n_cols; z++) {
+ const char* name = dict_table_get_col_name(
+ table, z);
+ if (!innobase_strcasecmp(
+ name, base_field->field_name.str)) {
+ break;
+ }
+ }
+
+ ut_ad(z != table->n_cols);
+
+ s_col->base_col[n] = dict_table_get_nth_col(table, z);
+ n++;
+
+ if (n == s_col->num_base) {
+ break;
+ }
+ }
+ }
+ s_col->num_base= n;
+}
+
+/** Create a table definition to an InnoDB database.
+@return ER_* level error */
+inline MY_ATTRIBUTE((warn_unused_result))
+int
+create_table_info_t::create_table_def()
+{
+ dict_table_t* table;
+ ulint nulls_allowed;
+ unsigned unsigned_type;
+ ulint binary_type;
+ ulint long_true_varchar;
+ ulint charset_no;
+ ulint doc_id_col = 0;
+ ibool has_doc_id_col = FALSE;
+ mem_heap_t* heap;
+ ha_table_option_struct *options= m_form->s->option_struct;
+ dberr_t err = DB_SUCCESS;
+
+ DBUG_ENTER("create_table_def");
+ DBUG_PRINT("enter", ("table_name: %s", m_table_name));
+
+ DBUG_ASSERT(m_trx->mysql_thd == m_thd);
+ DBUG_ASSERT(!m_drop_before_rollback);
+
+ /* MySQL does the name length check. But we do additional check
+ on the name length here */
+ const size_t table_name_len = strlen(m_table_name);
+ if (table_name_len > MAX_FULL_NAME_LEN) {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_TABLE_NAME,
+ "InnoDB: Table Name or Database Name is too long");
+
+ DBUG_RETURN(ER_TABLE_NAME);
+ }
+
+ if (m_table_name[table_name_len - 1] == '/') {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_TABLE_NAME,
+ "InnoDB: Table name is empty");
+
+ DBUG_RETURN(ER_WRONG_TABLE_NAME);
+ }
+
+ /* Find out the number of virtual columns. */
+ ulint num_v = 0;
+ const bool omit_virtual = ha_innobase::omits_virtual_cols(*m_form->s);
+ const ulint n_cols = omit_virtual
+ ? m_form->s->stored_fields : m_form->s->fields;
+
+ if (!omit_virtual) {
+ for (ulint i = 0; i < n_cols; i++) {
+ num_v += !m_form->field[i]->stored_in_db();
+ }
+ }
+
+ /* Check whether there already exists a FTS_DOC_ID column */
+ if (create_table_check_doc_id_col(m_trx, m_form, &doc_id_col)){
+
+ /* Raise error if the Doc ID column is of wrong type or name */
+ if (doc_id_col == ULINT_UNDEFINED) {
+ DBUG_RETURN(HA_ERR_GENERIC);
+ } else {
+ has_doc_id_col = TRUE;
+ }
+ }
+
+ /* Adjust the number of columns for the FTS hidden field */
+ const ulint actual_n_cols = n_cols
+ + (m_flags2 & DICT_TF2_FTS && !has_doc_id_col);
+
+ table = dict_mem_table_create(m_table_name, NULL,
+ actual_n_cols, num_v, m_flags, m_flags2);
+
+ /* Set the hidden doc_id column. */
+ if (m_flags2 & DICT_TF2_FTS) {
+ table->fts->doc_col = has_doc_id_col
+ ? doc_id_col : n_cols - num_v;
+ }
+
+ if (DICT_TF_HAS_DATA_DIR(m_flags)) {
+ ut_a(strlen(m_remote_path));
+
+ table->data_dir_path = mem_heap_strdup(
+ table->heap, m_remote_path);
+
+ } else {
+ table->data_dir_path = NULL;
+ }
+
+ heap = mem_heap_create(1000);
+ auto _ = make_scope_exit([heap]() { mem_heap_free(heap); });
+
+ ut_d(bool have_vers_start = false);
+ ut_d(bool have_vers_end = false);
+
+ for (ulint i = 0, j = 0; j < n_cols; i++) {
+ Field* field = m_form->field[i];
+ ulint vers_row = 0;
+
+ if (m_form->versioned()) {
+ if (i == m_form->s->vers.start_fieldno) {
+ vers_row = DATA_VERS_START;
+ ut_d(have_vers_start = true);
+ } else if (i == m_form->s->vers.end_fieldno) {
+ vers_row = DATA_VERS_END;
+ ut_d(have_vers_end = true);
+ } else if (!(field->flags
+ & VERS_UPDATE_UNVERSIONED_FLAG)) {
+ vers_row = DATA_VERSIONED;
+ }
+ }
+
+ auto col_type = get_innobase_type_from_mysql_type(
+ &unsigned_type, field);
+
+ if (!col_type) {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_CREATE_TABLE,
+ "Error creating table '%s' with"
+ " column '%s'. Please check its"
+ " column type and try to re-create"
+ " the table with an appropriate"
+ " column type.",
+ table->name.m_name, field->field_name.str);
+err_col:
+ dict_mem_table_free(table);
+ ut_ad(trx_state_eq(m_trx, TRX_STATE_NOT_STARTED));
+ DBUG_RETURN(HA_ERR_GENERIC);
+ }
+
+ nulls_allowed = field->real_maybe_null() ? 0 : DATA_NOT_NULL;
+ binary_type = field->binary() ? DATA_BINARY_TYPE : 0;
+
+ charset_no = 0;
+
+ if (dtype_is_string_type(col_type)) {
+
+ charset_no = (ulint) field->charset()->number;
+
+ DBUG_EXECUTE_IF("simulate_max_char_col",
+ charset_no = MAX_CHAR_COLL_NUM + 1;
+ );
+
+ if (charset_no > MAX_CHAR_COLL_NUM) {
+ /* in data0type.h we assume that the
+ number fits in one byte in prtype */
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_CREATE_TABLE,
+ "In InnoDB, charset-collation codes"
+ " must be below 256."
+ " Unsupported code " ULINTPF ".",
+ charset_no);
+ dict_mem_table_free(table);
+
+ DBUG_RETURN(ER_CANT_CREATE_TABLE);
+ }
+ }
+
+ auto col_len = field->pack_length();
+
+ /* The MySQL pack length contains 1 or 2 bytes length field
+ for a true VARCHAR. Let us subtract that, so that the InnoDB
+ column length in the InnoDB data dictionary is the real
+ maximum byte length of the actual data. */
+
+ long_true_varchar = 0;
+
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+ col_len -= ((Field_varstring*) field)->length_bytes;
+
+ if (((Field_varstring*) field)->length_bytes == 2) {
+ long_true_varchar = DATA_LONG_TRUE_VARCHAR;
+ }
+ }
+
+ /* First check whether the column to be added has a
+ system reserved name. */
+ if (dict_col_name_is_reserved(field->field_name.str)){
+ my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+ field->field_name.str);
+ goto err_col;
+ }
+
+ ulint is_virtual = !field->stored_in_db() ? DATA_VIRTUAL : 0;
+
+ if (!is_virtual) {
+ dict_mem_table_add_col(table, heap,
+ field->field_name.str, col_type,
+ dtype_form_prtype(
+ (ulint) field->type()
+ | nulls_allowed | unsigned_type
+ | binary_type | long_true_varchar
+ | vers_row,
+ charset_no),
+ col_len);
+ } else if (!omit_virtual) {
+ dict_mem_table_add_v_col(table, heap,
+ field->field_name.str, col_type,
+ dtype_form_prtype(
+ (ulint) field->type()
+ | nulls_allowed | unsigned_type
+ | binary_type | long_true_varchar
+ | vers_row
+ | is_virtual,
+ charset_no),
+ col_len, i, 0);
+ }
+
+ if (innobase_is_s_fld(field)) {
+ ut_ad(!is_virtual);
+ /* Added stored column in m_s_cols list. */
+ dict_mem_table_add_s_col(
+ table, 0);
+ }
+
+ if (is_virtual && omit_virtual) {
+ continue;
+ }
+
+ j++;
+ }
+
+ ut_ad(have_vers_start == have_vers_end);
+ ut_ad(table->versioned() == have_vers_start);
+ ut_ad(!table->versioned() || table->vers_start != table->vers_end);
+
+ if (num_v) {
+ for (ulint i = 0, j = 0; i < n_cols; i++) {
+ dict_v_col_t* v_col;
+
+ const Field* field = m_form->field[i];
+
+ if (field->stored_in_db()) {
+ continue;
+ }
+
+ v_col = dict_table_get_nth_v_col(table, j);
+
+ j++;
+
+ innodb_base_col_setup(table, field, v_col);
+ }
+ }
+
+ /** Fill base columns for the stored columns present in the list. */
+ if (table->s_cols && !table->s_cols->empty()) {
+ for (ulint i = 0; i < n_cols; i++) {
+ Field* field = m_form->field[i];
+
+ if (!innobase_is_s_fld(field)) {
+ continue;
+ }
+
+ dict_s_col_list::iterator it;
+ for (it = table->s_cols->begin();
+ it != table->s_cols->end(); ++it) {
+ dict_s_col_t s_col = *it;
+
+ if (s_col.s_pos == i) {
+ innodb_base_col_setup_for_stored(
+ table, field, &s_col);
+ break;
+ }
+ }
+ }
+ }
+
+ /* Add the FTS doc_id hidden column. */
+ if (m_flags2 & DICT_TF2_FTS && !has_doc_id_col) {
+ fts_add_doc_id_column(table, heap);
+ }
+
+ dict_table_add_system_columns(table, heap);
+
+ if (table->is_temporary()) {
+ if ((options->encryption == 1
+ && !innodb_encrypt_temporary_tables)
+ || (options->encryption == 2
+ && innodb_encrypt_temporary_tables)) {
+ push_warning_printf(m_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "Ignoring encryption parameter during "
+ "temporary table creation.");
+ }
+
+ m_trx->table_id = table->id
+ = dict_sys.get_temporary_table_id();
+ ut_ad(dict_tf_get_rec_format(table->flags)
+ != REC_FORMAT_COMPRESSED);
+ table->space_id = SRV_TMP_SPACE_ID;
+ table->space = fil_system.temp_space;
+ table->add_to_cache();
+ } else {
+ if (err == DB_SUCCESS) {
+ err = row_create_table_for_mysql(
+ table, m_trx,
+ fil_encryption_t(options->encryption),
+ uint32_t(options->encryption_key_id));
+ m_drop_before_rollback = (err == DB_SUCCESS);
+ }
+
+ DBUG_EXECUTE_IF("ib_crash_during_create_for_encryption",
+ DBUG_SUICIDE(););
+ }
+
+ DBUG_EXECUTE_IF("ib_create_err_tablespace_exist",
+ err = DB_TABLESPACE_EXISTS;);
+
+ switch (err) {
+ case DB_SUCCESS:
+ ut_ad(table);
+ m_table = table;
+ DBUG_RETURN(0);
+ default:
+ break;
+ case DB_DUPLICATE_KEY:
+ case DB_TABLESPACE_EXISTS:
+ char display_name[FN_REFLEN];
+ char* buf_end = innobase_convert_identifier(
+ display_name, sizeof(display_name) - 1,
+ m_table_name, strlen(m_table_name),
+ m_thd);
+
+ *buf_end = '\0';
+
+ my_error(err == DB_DUPLICATE_KEY
+ ? ER_TABLE_EXISTS_ERROR
+ : ER_TABLESPACE_EXISTS, MYF(0), display_name);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(err, m_flags, m_thd));
+}
+
+/*****************************************************************//**
+Creates an index in an InnoDB database. */
+inline
+int
+create_index(
+/*=========*/
+ trx_t* trx, /*!< in: InnoDB transaction handle */
+ const TABLE* form, /*!< in: information on table
+ columns and indexes */
+ dict_table_t* table, /*!< in,out: table */
+ uint key_num) /*!< in: index number */
+{
+ dict_index_t* index;
+ int error;
+ const KEY* key;
+ ulint* field_lengths;
+
+ DBUG_ENTER("create_index");
+
+ key = form->key_info + key_num;
+
+ /* Assert that "GEN_CLUST_INDEX" cannot be used as non-primary index */
+ ut_a(innobase_strcasecmp(key->name.str, innobase_index_reserve_name) != 0);
+
+ if (key->flags & (HA_SPATIAL | HA_FULLTEXT)) {
+ /* Only one of these can be specified at a time. */
+ ut_ad(~key->flags & (HA_SPATIAL | HA_FULLTEXT));
+ ut_ad(!(key->flags & HA_NOSAME));
+ index = dict_mem_index_create(table, key->name.str,
+ (key->flags & HA_SPATIAL)
+ ? DICT_SPATIAL : DICT_FTS,
+ key->user_defined_key_parts);
+
+ for (ulint i = 0; i < key->user_defined_key_parts; i++) {
+ const Field* field = key->key_part[i].field;
+
+ /* We do not support special (Fulltext or Spatial)
+ index on virtual columns */
+ if (!field->stored_in_db()) {
+ ut_ad(0);
+ DBUG_RETURN(HA_ERR_UNSUPPORTED);
+ }
+
+ dict_mem_index_add_field(index, field->field_name.str,
+ 0);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(
+ row_create_index_for_mysql(
+ index, trx, NULL),
+ table->flags, NULL));
+ }
+
+ ulint ind_type = 0;
+
+ if (key_num == form->s->primary_key) {
+ ind_type |= DICT_CLUSTERED;
+ }
+
+ if (key->flags & HA_NOSAME) {
+ ind_type |= DICT_UNIQUE;
+ }
+
+ field_lengths = (ulint*) my_malloc(PSI_INSTRUMENT_ME,
+ key->user_defined_key_parts * sizeof *
+ field_lengths, MYF(MY_FAE));
+
+ /* We pass 0 as the space id, and determine at a lower level the space
+ id where to store the table */
+
+ index = dict_mem_index_create(table, key->name.str,
+ ind_type, key->user_defined_key_parts);
+
+ for (ulint i = 0; i < key->user_defined_key_parts; i++) {
+ KEY_PART_INFO* key_part = key->key_part + i;
+ ulint prefix_len;
+ unsigned is_unsigned;
+
+
+ /* (The flag HA_PART_KEY_SEG denotes in MySQL a
+ column prefix field in an index: we only store a
+ specified number of first bytes of the column to
+ the index field.) The flag does not seem to be
+ properly set by MySQL. Let us fall back on testing
+ the length of the key part versus the column.
+ We first reach for the table's column; if the index is on a
+ prefix, key_part->field is not the table's column (it is a
+ "fake" field forged in open_table_from_share() with length
+ equal to the length of the prefix); so we have to go to
+ form->field. */
+ Field* field= form->field[key_part->field->field_index];
+ if (field == NULL)
+ ut_error;
+
+ const char* field_name = key_part->field->field_name.str;
+
+ auto col_type = get_innobase_type_from_mysql_type(
+ &is_unsigned, key_part->field);
+
+ if (DATA_LARGE_MTYPE(col_type)
+ || (key_part->length < field->pack_length()
+ && field->type() != MYSQL_TYPE_VARCHAR)
+ || (field->type() == MYSQL_TYPE_VARCHAR
+ && key_part->length < field->pack_length()
+ - ((Field_varstring*) field)->length_bytes)) {
+
+ switch (col_type) {
+ default:
+ prefix_len = key_part->length;
+ break;
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_DECIMAL:
+ sql_print_error(
+ "MariaDB is trying to create a column"
+ " prefix index field, on an"
+ " inappropriate data type. Table"
+ " name %s, column name %s.",
+ form->s->table_name.str,
+ key_part->field->field_name.str);
+
+ prefix_len = 0;
+ }
+ } else {
+ prefix_len = 0;
+ }
+
+ field_lengths[i] = key_part->length;
+
+ if (!key_part->field->stored_in_db()) {
+ index->type |= DICT_VIRTUAL;
+ }
+
+ dict_mem_index_add_field(index, field_name, prefix_len);
+ }
+
+ ut_ad(key->flags & HA_FULLTEXT || !(index->type & DICT_FTS));
+
+ /* Even though we've defined max_supported_key_part_length, we
+ still do our own checking using field_lengths to be absolutely
+ sure we don't create too long indexes. */
+ ulint flags = table->flags;
+
+ error = convert_error_code_to_mysql(
+ row_create_index_for_mysql(index, trx, field_lengths),
+ flags, NULL);
+
+ my_free(field_lengths);
+
+ DBUG_RETURN(error);
+}
+
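+/* Example: for
+ CREATE TABLE t (s VARCHAR(100), KEY k (s(10))) ENGINE=InnoDB;
+the key part length (10 characters times the charset's maximum byte
+width) is smaller than the column's maximum data length, so prefix_len
+is set and only that many leading bytes are stored in the index. */
+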
+/** Return a display name for the row format
+@param[in] row_format Row Format
+@return row format name */
+static
+const char*
+get_row_format_name(
+ enum row_type row_format)
+{
+ switch (row_format) {
+ case ROW_TYPE_COMPACT:
+ return("COMPACT");
+ case ROW_TYPE_COMPRESSED:
+ return("COMPRESSED");
+ case ROW_TYPE_DYNAMIC:
+ return("DYNAMIC");
+ case ROW_TYPE_REDUNDANT:
+ return("REDUNDANT");
+ case ROW_TYPE_DEFAULT:
+ return("DEFAULT");
+ case ROW_TYPE_FIXED:
+ return("FIXED");
+ case ROW_TYPE_PAGE:
+ case ROW_TYPE_NOT_USED:
+ break;
+ }
+ return("NOT USED");
+}
+
+/** Validate DATA DIRECTORY option.
+@return true if valid, false if not. */
+bool
+create_table_info_t::create_option_data_directory_is_valid()
+{
+ bool is_valid = true;
+
+ ut_ad(m_create_info->data_file_name
+ && m_create_info->data_file_name[0] != '\0');
+
+ /* Use DATA DIRECTORY only with file-per-table. */
+ if (!m_allow_file_per_table) {
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: DATA DIRECTORY requires"
+ " innodb_file_per_table.");
+ is_valid = false;
+ }
+
+ /* Do not use DATA DIRECTORY with TEMPORARY TABLE. */
+ if (m_create_info->tmp_table()) {
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: DATA DIRECTORY cannot be used"
+ " for TEMPORARY tables.");
+ is_valid = false;
+ }
+
+ /* We check for a DATA DIRECTORY mixed with TABLESPACE in
+ create_option_tablespace_is_valid(), no need to here. */
+
+ return(is_valid);
+}
+
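+/* Example rejected by the check above when innodb_file_per_table=OFF
+(the path is illustrative):
+ CREATE TABLE t (a INT) ENGINE=InnoDB DATA DIRECTORY='/data/fast';
+*/
+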
+/** Validate the create options. Check that the options KEY_BLOCK_SIZE,
+ROW_FORMAT, DATA DIRECTORY, TEMPORARY are compatible with
+each other and other settings. These CREATE OPTIONS are not validated
+here unless innodb_strict_mode is on. With strict mode, this function
+will report each problem it finds using a custom message with error
+code ER_ILLEGAL_HA_CREATE_OPTION, not its built-in message.
+@return NULL if valid, string name of bad option if not. */
+const char*
+create_table_info_t::create_options_are_invalid()
+{
+ bool has_key_block_size = (m_create_info->key_block_size != 0);
+
+ const char* ret = NULL;
+ enum row_type row_format = m_create_info->row_type;
+ const bool is_temp = m_create_info->tmp_table();
+
+ ut_ad(m_thd != NULL);
+
+ /* If innodb_strict_mode is not set don't do any more validation. */
+ if (!THDVAR(m_thd, strict_mode)) {
+ return(NULL);
+ }
+
+ /* Check if a non-zero KEY_BLOCK_SIZE was specified. */
+ if (has_key_block_size) {
+ if (is_temp) {
+ my_error(ER_UNSUPPORT_COMPRESSED_TEMPORARY_TABLE,
+ MYF(0));
+ return("KEY_BLOCK_SIZE");
+ }
+
+ switch (m_create_info->key_block_size) {
+ ulint kbs_max;
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ case 16:
+ /* The maximum KEY_BLOCK_SIZE (KBS) is
+ UNIV_PAGE_SIZE_MAX. But if srv_page_size is
+ smaller than UNIV_PAGE_SIZE_MAX, the maximum
+ KBS is also smaller. */
+ kbs_max = ut_min(
+ 1U << (UNIV_PAGE_SSIZE_MAX - 1),
+ 1U << (PAGE_ZIP_SSIZE_MAX - 1));
+ if (m_create_info->key_block_size > kbs_max) {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: KEY_BLOCK_SIZE=%ld"
+ " cannot be larger than %ld.",
+ m_create_info->key_block_size,
+ kbs_max);
+ ret = "KEY_BLOCK_SIZE";
+ }
+
+ /* Valid KEY_BLOCK_SIZE, check its dependencies. */
+ if (!m_allow_file_per_table) {
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: KEY_BLOCK_SIZE requires"
+ " innodb_file_per_table.");
+ ret = "KEY_BLOCK_SIZE";
+ }
+ break;
+ default:
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: invalid KEY_BLOCK_SIZE = %u."
+ " Valid values are [1, 2, 4, 8, 16]",
+ (uint) m_create_info->key_block_size);
+ ret = "KEY_BLOCK_SIZE";
+ break;
+ }
+ }
+
+ /* Check for a valid InnoDB ROW_FORMAT specifier and
+ other incompatibilities. */
+ switch (row_format) {
+ case ROW_TYPE_COMPRESSED:
+ if (is_temp) {
+ my_error(ER_UNSUPPORT_COMPRESSED_TEMPORARY_TABLE,
+ MYF(0));
+ return("ROW_FORMAT");
+ }
+ if (!m_allow_file_per_table) {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ROW_FORMAT=%s requires"
+ " innodb_file_per_table.",
+ get_row_format_name(row_format));
+ ret = "ROW_FORMAT";
+ }
+ break;
+ case ROW_TYPE_DYNAMIC:
+ case ROW_TYPE_COMPACT:
+ case ROW_TYPE_REDUNDANT:
+ if (has_key_block_size) {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: cannot specify ROW_FORMAT = %s"
+ " with KEY_BLOCK_SIZE.",
+ get_row_format_name(row_format));
+ ret = "KEY_BLOCK_SIZE";
+ }
+ break;
+ case ROW_TYPE_DEFAULT:
+ break;
+ case ROW_TYPE_FIXED:
+ case ROW_TYPE_PAGE:
+ case ROW_TYPE_NOT_USED:
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: invalid ROW_FORMAT specifier.");
+ ret = "ROW_TYPE";
+ break;
+ }
+
+ if (m_create_info->data_file_name
+ && m_create_info->data_file_name[0] != '\0'
+ && !create_option_data_directory_is_valid()) {
+ ret = "DATA DIRECTORY";
+ }
+
+ /* Do not allow INDEX DIRECTORY. */
+ if (m_create_info->index_file_name) {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: INDEX DIRECTORY is not supported");
+ ret = "INDEX DIRECTORY";
+ }
+
+ /* Don't support compressed table when page size > 16k. */
+ if ((has_key_block_size || row_format == ROW_TYPE_COMPRESSED)
+ && srv_page_size > UNIV_PAGE_SIZE_DEF) {
+ push_warning(m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: Cannot create a COMPRESSED table"
+ " when innodb_page_size > 16k.");
+
+ if (has_key_block_size) {
+ ret = "KEY_BLOCK_SIZE";
+ } else {
+ ret = "ROW_TYPE";
+ }
+ }
+
+ return(ret);
+}
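+
+/* A sketch of how this validation surfaces to the user (statement and
+messages illustrative, assuming innodb_strict_mode=ON):
+
+  SET SESSION innodb_strict_mode=ON;
+  CREATE TABLE t (a INT) ENGINE=InnoDB KEY_BLOCK_SIZE=5;
+
+fails with ER_ILLEGAL_HA_CREATE_OPTION, and the pushed warning explains
+that valid KEY_BLOCK_SIZE values are [1, 2, 4, 8, 16]. With
+innodb_strict_mode=OFF the statement succeeds and the invalid option
+is ignored with a warning. */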
+
+/*****************************************************************//**
+Check engine specific table options not handled by SQL-parser.
+@return NULL if valid, string if not */
+const char*
+create_table_info_t::check_table_options()
+{
+ enum row_type row_format = m_create_info->row_type;
+ const ha_table_option_struct *options= m_form->s->option_struct;
+
+ switch (options->encryption) {
+ case FIL_ENCRYPTION_OFF:
+ if (options->encryption_key_id != FIL_DEFAULT_ENCRYPTION_KEY) {
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: ENCRYPTED=NO implies"
+ " ENCRYPTION_KEY_ID=1");
+ compile_time_assert(FIL_DEFAULT_ENCRYPTION_KEY == 1);
+ }
+ if (srv_encrypt_tables != 2) {
+ break;
+ }
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: ENCRYPTED=NO cannot be used with"
+ " innodb_encrypt_tables=FORCE");
+ return "ENCRYPTED";
+ case FIL_ENCRYPTION_DEFAULT:
+ if (!srv_encrypt_tables) {
+ break;
+ }
+ /* fall through */
+ case FIL_ENCRYPTION_ON:
+ const uint32_t key_id = uint32_t(options->encryption_key_id);
+ if (!encryption_key_id_exists(key_id)) {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: ENCRYPTION_KEY_ID %u not available",
+ key_id);
+ return "ENCRYPTION_KEY_ID";
+ }
+
+ /* We do not support encryption for spatial indexes,
+ except if innodb_checksum_algorithm=full_crc32.
+ Do not allow ENCRYPTED=YES if any SPATIAL INDEX exists. */
+ if (options->encryption != FIL_ENCRYPTION_ON
+ || srv_checksum_algorithm
+ >= SRV_CHECKSUM_ALGORITHM_FULL_CRC32) {
+ break;
+ }
+ for (ulint i = 0; i < m_form->s->keys; i++) {
+ if (m_form->key_info[i].flags & HA_SPATIAL) {
+ push_warning(m_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: ENCRYPTED=YES is not"
+ " supported for SPATIAL INDEX");
+ return "ENCRYPTED";
+ }
+ }
+ }
+
+ if (!m_allow_file_per_table
+ && options->encryption != FIL_ENCRYPTION_DEFAULT) {
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: ENCRYPTED requires innodb_file_per_table");
+ return "ENCRYPTED";
+ }
+
+ /* Check page compression requirements */
+ if (options->page_compressed) {
+
+ if (row_format == ROW_TYPE_COMPRESSED) {
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: PAGE_COMPRESSED table can't have"
+ " ROW_TYPE=COMPRESSED");
+ return "PAGE_COMPRESSED";
+ }
+
+ switch (row_format) {
+ default:
+ break;
+ case ROW_TYPE_DEFAULT:
+ if (m_default_row_format
+ != DEFAULT_ROW_FORMAT_REDUNDANT) {
+ break;
+ }
+ /* fall through */
+ case ROW_TYPE_REDUNDANT:
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: PAGE_COMPRESSED table can't have"
+ " ROW_TYPE=REDUNDANT");
+ return "PAGE_COMPRESSED";
+ }
+
+ if (!m_allow_file_per_table) {
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: PAGE_COMPRESSED requires"
+ " innodb_file_per_table.");
+ return "PAGE_COMPRESSED";
+ }
+
+ if (m_create_info->key_block_size) {
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: PAGE_COMPRESSED table can't have"
+ " key_block_size");
+ return "PAGE_COMPRESSED";
+ }
+ }
+
+ /* Check page compression level requirements, some of them are
+ already checked above */
+ if (options->page_compression_level != 0) {
+ if (options->page_compressed == false) {
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: PAGE_COMPRESSION_LEVEL requires"
+ " PAGE_COMPRESSED");
+ return "PAGE_COMPRESSION_LEVEL";
+ }
+
+ if (options->page_compression_level < 1 || options->page_compression_level > 9) {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu."
+ " Valid values are [1, 2, 3, 4, 5, 6, 7, 8, 9]",
+ options->page_compression_level);
+ return "PAGE_COMPRESSION_LEVEL";
+ }
+ }
+
+ return NULL;
+}
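+
+/* Illustrative combinations that this function rejects (the SQL is a
+sketch; the returned option name is shown as a trailing comment):
+
+  CREATE TABLE t (a INT) ENGINE=InnoDB
+    PAGE_COMPRESSED=1 ROW_FORMAT=COMPRESSED;  -- "PAGE_COMPRESSED"
+  CREATE TABLE t (a INT) ENGINE=InnoDB
+    PAGE_COMPRESSION_LEVEL=5;        -- requires PAGE_COMPRESSED
+  CREATE TABLE t (a INT) ENGINE=InnoDB
+    ENCRYPTED=NO;  -- "ENCRYPTED" when innodb_encrypt_tables=FORCE */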
+
+/*****************************************************************//**
+Update create_info. Used in SHOW CREATE TABLE et al. */
+
+void
+ha_innobase::update_create_info(
+/*============================*/
+ HA_CREATE_INFO* create_info) /*!< in/out: create info */
+{
+ if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
+ info(HA_STATUS_AUTO);
+ create_info->auto_increment_value = stats.auto_increment_value;
+ }
+
+ if (m_prebuilt->table->is_temporary()) {
+ return;
+ }
+
+ /* Update the DATA DIRECTORY name from SYS_DATAFILES. */
+ dict_get_and_save_data_dir_path(m_prebuilt->table, false);
+
+ if (m_prebuilt->table->data_dir_path) {
+ create_info->data_file_name = m_prebuilt->table->data_dir_path;
+ }
+}
+
+/*****************************************************************//**
+Initialize the table FTS stopword list.
+@return TRUE on success */
+ibool
+innobase_fts_load_stopword(
+/*=======================*/
+ dict_table_t* table, /*!< in: table with FTS index(es) */
+ trx_t* trx, /*!< in: transaction */
+ THD* thd) /*!< in: current thread */
+{
+ const char *stopword_table= THDVAR(thd, ft_user_stopword_table);
+ if (!stopword_table)
+ {
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ if (innobase_server_stopword_table)
+ stopword_table= thd_strdup(thd, innobase_server_stopword_table);
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ }
+
+ return fts_load_stopword(table, trx, stopword_table,
+ THDVAR(thd, ft_enable_stopword), false);
+}
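+
+/* For illustration: the lookup order above is the session variable
+innodb_ft_user_stopword_table first, then the global
+innodb_ft_server_stopword_table, e.g.
+
+  SET GLOBAL innodb_ft_server_stopword_table = 'db1/my_stopwords';
+
+where db1/my_stopwords is (by the FTS stopword convention) an InnoDB
+table with a single VARCHAR column named "value". */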
+
+/** Parse the table name into normal name and remote path if needed.
+@param[in] name Table name (db/table or full path).
+@return 0 if successful, otherwise, error number */
+int
+create_table_info_t::parse_table_name(
+ const char*
+#ifdef _WIN32
+ name
+#endif
+ )
+{
+ DBUG_ENTER("parse_table_name");
+
+#ifdef _WIN32
+ /* Names passed in from the server are in two formats:
+ 1. <database_name>/<table_name>: for normal table creation
+ 2. full path: for temp table creation, or DATA DIRECTORY.
+
+ When srv_file_per_table is on and mysqld_embedded is off,
+ check for the full-path pattern, i.e.
+ X:\dir\..., where X is a drive letter, or
+ \\dir1\dir2\..., a UNC path,
+ and return an error if the name is in full-path format but we are
+ not creating a temporary table. Currently, InnoDB does not support
+ symbolic links on Windows. */
+
+ if (m_innodb_file_per_table
+ && !mysqld_embedded
+ && !m_create_info->tmp_table()) {
+
+ if ((name[1] == ':')
+ || (name[0] == '\\' && name[1] == '\\')) {
+ sql_print_error("Cannot create table %s\n", name);
+ DBUG_RETURN(HA_ERR_GENERIC);
+ }
+ }
+#endif
+
+ m_remote_path[0] = '\0';
+
+ /* Make sure DATA DIRECTORY is compatible with other options
+ and set the remote path. In the case of either:
+ CREATE TEMPORARY TABLE ... DATA DIRECTORY={path} ... ;
+ CREATE TABLE ... DATA DIRECTORY={path} TABLESPACE={name}... ;
+ we ignore the DATA DIRECTORY. */
+ if (m_create_info->data_file_name
+ && m_create_info->data_file_name[0] != '\0') {
+ if (!create_option_data_directory_is_valid()) {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ WARN_OPTION_IGNORED,
+ ER_DEFAULT(WARN_OPTION_IGNORED),
+ "DATA DIRECTORY");
+
+ m_flags &= ~DICT_TF_MASK_DATA_DIR;
+ } else {
+ strncpy(m_remote_path,
+ m_create_info->data_file_name,
+ FN_REFLEN - 1);
+ }
+ }
+
+ if (m_create_info->index_file_name) {
+ my_error(WARN_OPTION_IGNORED, ME_WARNING,
+ "INDEX DIRECTORY");
+ }
+
+ DBUG_RETURN(0);
+}
+
+/** @return whether innodb_strict_mode is active */
+bool ha_innobase::is_innodb_strict_mode(THD *thd)
+{
+ return THDVAR(thd, strict_mode);
+}
+
+/** Determine InnoDB table flags.
+If innodb_strict_mode=OFF, this will adjust the flags to the values that
+should be assumed.
+@retval true on success
+@retval false on error */
+bool create_table_info_t::innobase_table_flags()
+{
+ DBUG_ENTER("innobase_table_flags");
+
+ const char* fts_doc_id_index_bad = NULL;
+ ulint zip_ssize = 0;
+ enum row_type row_type;
+ rec_format_t innodb_row_format =
+ get_row_format(m_default_row_format);
+ const bool is_temp = m_create_info->tmp_table();
+ bool zip_allowed = !is_temp;
+
+ const ulint zip_ssize_max =
+ ut_min(static_cast<ulint>(UNIV_PAGE_SSIZE_MAX),
+ static_cast<ulint>(PAGE_ZIP_SSIZE_MAX));
+
+ /* Cache the value of innodb_compression_level (page_zip_level),
+ in case it is modified by another thread while the table is
+ being created. */
+ const ulint default_compression_level = page_zip_level;
+
+ ha_table_option_struct *options= m_form->s->option_struct;
+
+ m_flags = 0;
+ m_flags2 = 0;
+
+ /* Check if there are any FTS indexes defined on this table. */
+ for (uint i = 0; i < m_form->s->keys; i++) {
+ const KEY* key = &m_form->key_info[i];
+
+ if (key->flags & HA_FULLTEXT) {
+ m_flags2 |= DICT_TF2_FTS;
+
+ /* We don't support FTS indexes in temporary
+ tables. */
+ if (is_temp) {
+ my_error(ER_INNODB_NO_FT_TEMP_TABLE, MYF(0));
+ DBUG_RETURN(false);
+ }
+
+ if (fts_doc_id_index_bad) {
+ goto index_bad;
+ }
+ }
+
+ if (innobase_strcasecmp(key->name.str, FTS_DOC_ID_INDEX_NAME)) {
+ continue;
+ }
+
+ /* Do a pre-check on FTS DOC ID index */
+ if (!(key->flags & HA_NOSAME)
+ || strcmp(key->name.str, FTS_DOC_ID_INDEX_NAME)
+ || strcmp(key->key_part[0].field->field_name.str,
+ FTS_DOC_ID_COL_NAME)) {
+ fts_doc_id_index_bad = key->name.str;
+ }
+
+ if (fts_doc_id_index_bad && (m_flags2 & DICT_TF2_FTS)) {
+index_bad:
+ my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0),
+ fts_doc_id_index_bad);
+ DBUG_RETURN(false);
+ }
+ }
+
+ if (m_create_info->key_block_size > 0) {
+ /* The requested compressed page size (key_block_size)
+ is given in kilobytes. If it is a valid number, store
+ that value as the number of log2 shifts from 512 in
+ zip_ssize. Zero means it is not compressed. */
+ ulint zssize; /* Zip Shift Size */
+ ulint kbsize; /* Key Block Size */
+ for (zssize = kbsize = 1;
+ zssize <= zip_ssize_max;
+ zssize++, kbsize <<= 1) {
+ if (kbsize == m_create_info->key_block_size) {
+ zip_ssize = zssize;
+ break;
+ }
+ }
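+
+ /* The loop above yields the following mapping
+ (physical compressed page size is 512 << zip_ssize):
+ KEY_BLOCK_SIZE=1  -> zip_ssize=1 (1k)
+ KEY_BLOCK_SIZE=2  -> zip_ssize=2 (2k)
+ KEY_BLOCK_SIZE=4  -> zip_ssize=3 (4k)
+ KEY_BLOCK_SIZE=8  -> zip_ssize=4 (8k)
+ KEY_BLOCK_SIZE=16 -> zip_ssize=5 (16k) */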
+
+ /* Make sure compressed row format is allowed. */
+ if (is_temp) {
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: KEY_BLOCK_SIZE is ignored"
+ " for TEMPORARY TABLE.");
+ zip_allowed = false;
+ } else if (!m_allow_file_per_table) {
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: KEY_BLOCK_SIZE requires"
+ " innodb_file_per_table.");
+ zip_allowed = false;
+ }
+
+ if (!zip_allowed
+ || zssize > zip_ssize_max) {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ignoring KEY_BLOCK_SIZE=%u.",
+ (uint) m_create_info->key_block_size);
+ }
+ }
+
+ row_type = m_create_info->row_type;
+
+ if (zip_ssize && zip_allowed) {
+ /* if ROW_FORMAT is set to default,
+ automatically change it to COMPRESSED. */
+ if (row_type == ROW_TYPE_DEFAULT) {
+ row_type = ROW_TYPE_COMPRESSED;
+ } else if (row_type != ROW_TYPE_COMPRESSED) {
+ /* ROW_FORMAT other than COMPRESSED
+ ignores KEY_BLOCK_SIZE. It does not
+ make sense to reject conflicting
+ KEY_BLOCK_SIZE and ROW_FORMAT, because
+ such combinations can be obtained
+ with ALTER TABLE anyway. */
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ignoring KEY_BLOCK_SIZE=%u"
+ " unless ROW_FORMAT=COMPRESSED.",
+ (uint) m_create_info->key_block_size);
+ zip_allowed = false;
+ }
+ } else {
+ /* zip_ssize == 0 means no KEY_BLOCK_SIZE. */
+ if (row_type == ROW_TYPE_COMPRESSED && zip_allowed) {
+ /* ROW_FORMAT=COMPRESSED without KEY_BLOCK_SIZE
+ implies half the maximum KEY_BLOCK_SIZE (in KiB)
+ or srv_page_size, whichever is less. */
+ zip_ssize = zip_ssize_max - 1;
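+ /* For example, with innodb_page_size=16k,
+ zip_ssize_max is 5, so this selects
+ zip_ssize=4, i.e. 8k compressed pages. */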
+ }
+ }
+
+ /* Validate the row format. Correct it if necessary */
+
+ switch (row_type) {
+ case ROW_TYPE_REDUNDANT:
+ innodb_row_format = REC_FORMAT_REDUNDANT;
+ break;
+ case ROW_TYPE_COMPACT:
+ innodb_row_format = REC_FORMAT_COMPACT;
+ break;
+ case ROW_TYPE_COMPRESSED:
+ if (is_temp) {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ROW_FORMAT=%s is ignored for"
+ " TEMPORARY TABLE.",
+ get_row_format_name(row_type));
+ } else if (!m_allow_file_per_table) {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ROW_FORMAT=COMPRESSED requires"
+ " innodb_file_per_table.");
+ } else {
+ innodb_row_format = REC_FORMAT_COMPRESSED;
+ break;
+ }
+ zip_allowed = false;
+ /* Fall through to set ROW_FORMAT = DYNAMIC. */
+ /* fall through */
+ case ROW_TYPE_NOT_USED:
+ case ROW_TYPE_FIXED:
+ case ROW_TYPE_PAGE:
+ push_warning(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: assuming ROW_FORMAT=DYNAMIC.");
+ /* fall through */
+ case ROW_TYPE_DYNAMIC:
+ innodb_row_format = REC_FORMAT_DYNAMIC;
+ break;
+ case ROW_TYPE_DEFAULT:
+ ;
+ }
+
+ /* Don't support compressed table when page size > 16k. */
+ if (zip_allowed && zip_ssize && srv_page_size > UNIV_PAGE_SIZE_DEF) {
+ push_warning(m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: Cannot create a COMPRESSED table"
+ " when innodb_page_size > 16k."
+ " Assuming ROW_FORMAT=DYNAMIC.");
+ zip_allowed = false;
+ }
+
+ ut_ad(!is_temp || !zip_allowed);
+ ut_ad(!is_temp || innodb_row_format != REC_FORMAT_COMPRESSED);
+
+ /* Set the table flags */
+ if (!zip_allowed) {
+ zip_ssize = 0;
+ }
+
+ if (is_temp) {
+ m_flags2 |= DICT_TF2_TEMPORARY;
+ } else if (m_use_file_per_table) {
+ m_flags2 |= DICT_TF2_USE_FILE_PER_TABLE;
+ }
+
+ /* Set the table flags */
+ dict_tf_set(&m_flags, innodb_row_format, zip_ssize,
+ m_use_data_dir,
+ options->page_compressed,
+ options->page_compression_level == 0 ?
+ default_compression_level : ulint(options->page_compression_level));
+
+ if (m_form->s->table_type == TABLE_TYPE_SEQUENCE) {
+ m_flags |= DICT_TF_MASK_NO_ROLLBACK;
+ }
+
+ /* Set the flags2 when create table or alter tables */
+ m_flags2 |= DICT_TF2_FTS_AUX_HEX_NAME;
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ m_flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;);
+
+ DBUG_RETURN(true);
+}
+
+/** Parse MERGE_THRESHOLD value from the string.
+@param[in] thd connection
+@param[in] str string which might include 'MERGE_THRESHOLD='
+@return value parsed. 0 means not found or invalid value. */
+static
+unsigned
+innobase_parse_merge_threshold(
+ THD* thd,
+ const char* str)
+{
+ static const char* label = "MERGE_THRESHOLD=";
+ static const size_t label_len = strlen(label);
+ const char* pos = str;
+
+ pos = strstr(str, label);
+
+ if (pos == NULL) {
+ return(0);
+ }
+
+ pos += label_len;
+
+ lint ret = atoi(pos);
+
+ if (ret > 0 && ret <= 50) {
+ return(static_cast<unsigned>(ret));
+ }
+
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: Invalid value for MERGE_THRESHOLD in the CREATE TABLE"
+ " statement. The value is ignored.");
+
+ return(0);
+}
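+
+/* For illustration: innobase_parse_merge_threshold(thd,
+"MERGE_THRESHOLD=45") returns 45; "MERGE_THRESHOLD=90" is outside
+(0, 50], pushes the warning above and returns 0; a comment without
+the MERGE_THRESHOLD= label also returns 0. */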
+
+/** Parse hint for table and its indexes, and update the information
+in dictionary.
+@param[in] thd connection
+@param[in,out] table target table
+@param[in] table_share table definition */
+void
+innobase_parse_hint_from_comment(
+ THD* thd,
+ dict_table_t* table,
+ const TABLE_SHARE* table_share)
+{
+ unsigned merge_threshold_table;
+ unsigned merge_threshold_index[MAX_KEY];
+ bool is_found[MAX_KEY];
+
+ if (table_share->comment.str != NULL) {
+ merge_threshold_table
+ = innobase_parse_merge_threshold(
+ thd, table_share->comment.str);
+ } else {
+ merge_threshold_table = DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
+ }
+
+ if (merge_threshold_table == 0) {
+ merge_threshold_table = DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
+ }
+
+ for (uint i = 0; i < table_share->keys; i++) {
+ KEY* key_info = &table_share->key_info[i];
+
+ ut_ad(i < sizeof(merge_threshold_index)
+ / sizeof(merge_threshold_index[0]));
+
+ if (key_info->flags & HA_USES_COMMENT
+ && key_info->comment.str != NULL) {
+ merge_threshold_index[i]
+ = innobase_parse_merge_threshold(
+ thd, key_info->comment.str);
+ } else {
+ merge_threshold_index[i] = merge_threshold_table;
+ }
+
+ if (merge_threshold_index[i] == 0) {
+ merge_threshold_index[i] = merge_threshold_table;
+ }
+ }
+
+ /* update SYS_INDEX table */
+ if (!table->is_temporary()) {
+ for (uint i = 0; i < table_share->keys; i++) {
+ is_found[i] = false;
+ }
+
+ for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index != NULL;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ if (dict_index_is_auto_gen_clust(index)) {
+
+ /* GEN_CLUST_INDEX should use
+ merge_threshold_table */
+ dict_index_set_merge_threshold(
+ index, merge_threshold_table);
+ continue;
+ }
+
+ for (uint i = 0; i < table_share->keys; i++) {
+ if (is_found[i]) {
+ continue;
+ }
+
+ KEY* key_info = &table_share->key_info[i];
+
+ if (innobase_strcasecmp(
+ index->name, key_info->name.str) == 0) {
+
+ dict_index_set_merge_threshold(
+ index,
+ merge_threshold_index[i]);
+ is_found[i] = true;
+ break;
+ }
+ }
+ }
+ }
+
+ for (uint i = 0; i < table_share->keys; i++) {
+ is_found[i] = false;
+ }
+
+ /* update in memory */
+ for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index != NULL;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ if (dict_index_is_auto_gen_clust(index)) {
+
+ /* GEN_CLUST_INDEX should use merge_threshold_table */
+
+ /* x-lock index is needed to exclude concurrent
+ pessimistic tree operations */
+ rw_lock_x_lock(dict_index_get_lock(index));
+ index->merge_threshold = merge_threshold_table
+ & ((1U << 6) - 1);
+ rw_lock_x_unlock(dict_index_get_lock(index));
+
+ continue;
+ }
+
+ for (uint i = 0; i < table_share->keys; i++) {
+ if (is_found[i]) {
+ continue;
+ }
+
+ KEY* key_info = &table_share->key_info[i];
+
+ if (innobase_strcasecmp(
+ index->name, key_info->name.str) == 0) {
+
+ /* x-lock index is needed to exclude concurrent
+ pessimistic tree operations */
+ rw_lock_x_lock(dict_index_get_lock(index));
+ index->merge_threshold
+ = merge_threshold_index[i]
+ & ((1U << 6) - 1);
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ is_found[i] = true;
+
+ break;
+ }
+ }
+ }
+}
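+
+/* A sketch of the comment-based hint in SQL (identifiers and values
+illustrative):
+
+  CREATE TABLE t1 (a INT PRIMARY KEY, b INT,
+                   KEY idx_b (b) COMMENT 'MERGE_THRESHOLD=45')
+  ENGINE=InnoDB COMMENT='MERGE_THRESHOLD=40';
+
+Here idx_b gets merge_threshold=45 from its index comment, the primary
+key inherits 40 from the table comment, and without any hint both
+would use DICT_INDEX_MERGE_THRESHOLD_DEFAULT (50). */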
+
+/** Set m_use_* flags. */
+void
+create_table_info_t::set_tablespace_type(
+ bool table_being_altered_is_file_per_table)
+{
+ /** Allow file_per_table for this table either because:
+ 1) the setting innodb_file_per_table=on, or
+ 2) the table being altered is currently file_per_table. */
+ m_allow_file_per_table =
+ m_innodb_file_per_table
+ || table_being_altered_is_file_per_table;
+
+ /* Ignore the current innodb-file-per-table setting if we are
+ creating a temporary table. */
+ m_use_file_per_table = m_allow_file_per_table
+ && !m_create_info->tmp_table();
+
+ /* DATA DIRECTORY requires m_use_file_per_table and cannot be
+ used with TEMPORARY tables. */
+ m_use_data_dir =
+ m_use_file_per_table
+ && (m_create_info->data_file_name != NULL)
+ && (m_create_info->data_file_name[0] != '\0');
+}
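+
+/* For illustration: with innodb_file_per_table=ON a normal table gets
+m_use_file_per_table=true, but a TEMPORARY table does not; with
+innodb_file_per_table=OFF, altering a table that is already
+file-per-table still sets m_allow_file_per_table; and m_use_data_dir
+is set only when a non-empty DATA DIRECTORY accompanies
+m_use_file_per_table. */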
+
+/** Initialize the create_table_info_t object.
+@return error number */
+int
+create_table_info_t::initialize()
+{
+ DBUG_ENTER("create_table_info_t::initialize");
+
+ ut_ad(m_thd != NULL);
+ ut_ad(m_create_info != NULL);
+
+ if (m_form->s->fields > REC_MAX_N_USER_FIELDS) {
+ DBUG_RETURN(HA_ERR_TOO_MANY_FIELDS);
+ }
+
+ /* Check for name conflicts (with reserved name) for
+ any user indices to be created. */
+ if (innobase_index_name_is_reserved(m_thd, m_form->key_info,
+ m_form->s->keys)) {
+ DBUG_RETURN(HA_ERR_WRONG_INDEX);
+ }
+
+ /* Get the transaction associated with the current thd, or create one
+ if not yet created */
+
+ check_trx_exists(m_thd);
+
+ DBUG_RETURN(0);
+}
+
+
+/** Check if a virtual column is part of a fulltext or spatial index. */
+bool
+create_table_info_t::gcols_in_fulltext_or_spatial()
+{
+ for (ulint i = 0; i < m_form->s->keys; i++) {
+ const KEY* key = m_form->key_info + i;
+ if (!(key->flags & (HA_SPATIAL | HA_FULLTEXT))) {
+ continue;
+ }
+ for (ulint j = 0; j < key->user_defined_key_parts; j++) {
+ /* We do not support special (FULLTEXT or
+ SPATIAL) indexes on virtual columns. */
+ if (!key->key_part[j].field->stored_in_db()) {
+ my_error(ER_UNSUPPORTED_ACTION_ON_GENERATED_COLUMN, MYF(0));
+ return true;
+ }
+ }
+ }
+ return false;
+}
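+
+/* A sketch of a definition rejected here (hypothetical columns):
+
+  CREATE TABLE t (a VARCHAR(100),
+                  v VARCHAR(100) AS (UPPER(a)) VIRTUAL,
+                  FULLTEXT INDEX(v)) ENGINE=InnoDB;
+
+fails with ER_UNSUPPORTED_ACTION_ON_GENERATED_COLUMN, because the
+FULLTEXT key part v is not stored in the database. */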
+
+
+/** Prepare to create a new table to an InnoDB database.
+@param[in] name Table name
+@return error number */
+int create_table_info_t::prepare_create_table(const char* name, bool strict)
+{
+ DBUG_ENTER("prepare_create_table");
+
+ ut_ad(m_thd != NULL);
+ ut_ad(m_create_info != NULL);
+
+ set_tablespace_type(false);
+
+ normalize_table_name(m_table_name, name);
+
+ /* Validate table options not handled by the SQL-parser */
+ if (check_table_options()) {
+ DBUG_RETURN(HA_WRONG_CREATE_OPTION);
+ }
+
+ /* Validate the create options if innodb_strict_mode is set.
+ Do not use the regular message for ER_ILLEGAL_HA_CREATE_OPTION
+ because InnoDB might actually support the option, but not under
+ the current conditions. The messages revealing the specific
+ problems are reported inside this function. */
+ if (strict && create_options_are_invalid()) {
+ DBUG_RETURN(HA_WRONG_CREATE_OPTION);
+ }
+
+ /* Create the table flags and flags2 */
+ if (!innobase_table_flags()) {
+ DBUG_RETURN(HA_WRONG_CREATE_OPTION);
+ }
+
+ if (high_level_read_only) {
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ }
+
+ if (gcols_in_fulltext_or_spatial()) {
+ DBUG_RETURN(HA_ERR_UNSUPPORTED);
+ }
+
+ for (uint i = 0; i < m_form->s->keys; i++) {
+ const size_t max_field_len
+ = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(m_flags);
+ const KEY& key = m_form->key_info[i];
+
+ if (key.algorithm == HA_KEY_ALG_FULLTEXT) {
+ continue;
+ }
+
+ if (too_big_key_part_length(max_field_len, key)) {
+ DBUG_RETURN(convert_error_code_to_mysql(
+ DB_TOO_BIG_INDEX_COL, m_flags, NULL));
+ }
+ }
+
+ DBUG_RETURN(parse_table_name(name));
+}
+
+/** Push warning message to SQL-layer based on foreign key constraint index
+match error.
+@param[in] trx Current transaction
+@param[in] operation Operation ("Create" or "Alter")
+@param[in] create_name Table name as specified in SQL
+@param[in] columns Foreign key column names array
+@param[in] index_error Index error code
+@param[in] err_col Column where error happened
+@param[in] err_index Index where error happened
+@param[in] table Table object */
+static void
+foreign_push_index_error(trx_t* trx, const char* operation,
+ const char* create_name, const char* fk_text,
+ const char** columns, fkerr_t index_error,
+ ulint err_col, dict_index_t* err_index,
+ dict_table_t* table)
+{
+ switch (index_error) {
+ case FK_SUCCESS:
+ break;
+ case FK_INDEX_NOT_FOUND:
+ ib_foreign_warn(trx, DB_CANNOT_ADD_CONSTRAINT, create_name,
+ "%s table %s with foreign key %s constraint"
+ " failed. There is no index in the referenced"
+ " table where the referenced columns appear"
+ " as the first columns.",
+ operation, create_name, fk_text);
+ return;
+ case FK_IS_PREFIX_INDEX:
+ ib_foreign_warn(
+ trx, DB_CANNOT_ADD_CONSTRAINT, create_name,
+ "%s table %s with foreign key %s constraint"
+ " failed. There is only prefix index in the referenced"
+ " table where the referenced columns appear"
+ " as the first columns.",
+ operation, create_name, fk_text);
+ return;
+ case FK_COL_NOT_NULL:
+ ib_foreign_warn(
+ trx, DB_CANNOT_ADD_CONSTRAINT, create_name,
+ "%s table %s with foreign key %s constraint"
+ " failed. You have defined a SET NULL condition but "
+ "column '%s' on index is defined as NOT NULL.",
+ operation, create_name, fk_text, columns[err_col]);
+ return;
+ case FK_COLS_NOT_EQUAL:
+ dict_field_t* field;
+ const char* col_name;
+ field = dict_index_get_nth_field(err_index, err_col);
+
+ col_name = field->col->is_virtual()
+ ? "(null)"
+ : dict_table_get_col_name(
+ table, dict_col_get_no(field->col));
+ ib_foreign_warn(
+ trx, DB_CANNOT_ADD_CONSTRAINT, create_name,
+ "%s table %s with foreign key %s constraint"
+ " failed. Field type or character set for column '%s' "
+ "does not mach referenced column '%s'.",
+ operation, create_name, fk_text, columns[err_col],
+ col_name);
+ return;
+ }
+ DBUG_ASSERT("unknown error" == 0);
+}
+
+/** Find column or virtual column in table by its name.
+@param[in] table Table where column is searched
+@param[in] name Name to search for
+@retval true if found
+@retval false if not found */
+static bool
+find_col(dict_table_t* table, const char** name)
+{
+ ulint i;
+ for (i = 0; i < dict_table_get_n_cols(table); i++) {
+
+ const char* col_name = dict_table_get_col_name(table, i);
+
+ if (0 == innobase_strcasecmp(col_name, *name)) {
+ /* Found */
+ strcpy((char*)*name, col_name);
+ return true;
+ }
+ }
+
+ for (i = 0; i < dict_table_get_n_v_cols(table); i++) {
+
+ const char* col_name = dict_table_get_v_col_name(table, i);
+
+ if (0 == innobase_strcasecmp(col_name, *name)) {
+ /* Found */
+ strcpy((char*)*name, col_name);
+ return true;
+ }
+ }
+ return false;
+}
+
+/** Foreign key printer for error messages. Prints FK name if it exists or
+key part list in the form (col1, col2, col3, ...) */
+class key_text
+{
+ static const size_t MAX_TEXT = 48;
+ char buf[MAX_TEXT + 1];
+
+public:
+ key_text(Key* key)
+ {
+ char* ptr = buf;
+ if (key->name.str) {
+ size_t len = std::min(key->name.length, MAX_TEXT - 2);
+ *(ptr++) = '`';
+ memcpy(ptr, key->name.str, len);
+ ptr += len;
+ *(ptr++) = '`';
+ *ptr = '\0';
+ return;
+ }
+ *(ptr++) = '(';
+ List_iterator_fast<Key_part_spec> it(key->columns);
+ while (Key_part_spec* k = it++) {
+ /* 3 bytes for the "..." continuation;
+ 2 for the ", " separator if another column follows;
+ 1 for the terminating ')' */
+ if (MAX_TEXT - (size_t)(ptr - buf)
+ >= (it.peek() ? 3 + 2 + 1 : 3 + 1)
+ + k->field_name.length) {
+ memcpy(ptr, k->field_name.str,
+ k->field_name.length);
+ ptr += k->field_name.length;
+ if (it.peek()) {
+ *(ptr++) = ',';
+ *(ptr++) = ' ';
+ }
+ } else {
+ ut_ad((size_t)(ptr - buf) <= MAX_TEXT - 4);
+ memcpy(ptr, "...", 3);
+ ptr += 3;
+ break;
+ }
+ }
+ *(ptr++) = ')';
+ *ptr = '\0';
+ }
+ const char* str() { return buf; }
+};
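+
+/* For illustration: a named constraint prints as `fk_name`; an
+unnamed FOREIGN KEY (col1, col2) prints as "(col1, col2)"; when the
+column list does not fit into MAX_TEXT bytes, the tail is replaced
+with "...", e.g. "(col1, col2, ...)". */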
+
+/** Create InnoDB foreign keys from MySQL alter_info. Collect all
+dict_foreign_t items into local_fk_set and then add into system table.
+@return DB_SUCCESS or specific error code */
+dberr_t
+create_table_info_t::create_foreign_keys()
+{
+ dict_foreign_set local_fk_set;
+ dict_foreign_set_free local_fk_set_free(local_fk_set);
+ dberr_t error;
+ ulint number = 1;
+ static const unsigned MAX_COLS_PER_FK = 500;
+ const char* column_names[MAX_COLS_PER_FK];
+ const char* ref_column_names[MAX_COLS_PER_FK];
+ char create_name[MAX_DATABASE_NAME_LEN + 1 +
+ MAX_TABLE_NAME_LEN + 1];
+ dict_index_t* index = NULL;
+ fkerr_t index_error = FK_SUCCESS;
+ dict_index_t* err_index = NULL;
+ ulint err_col;
+ const bool tmp_table = m_flags2 & DICT_TF2_TEMPORARY;
+ const CHARSET_INFO* cs = thd_charset(m_thd);
+ const char* operation = "Create ";
+ const char* name = m_table_name;
+
+ enum_sql_command sqlcom = enum_sql_command(thd_sql_command(m_thd));
+
+ if (sqlcom == SQLCOM_ALTER_TABLE) {
+ dict_table_t* table_to_alter;
+ mem_heap_t* heap = mem_heap_create(10000);
+ ulint highest_id_so_far;
+ char* n = dict_get_referenced_table(
+ name, LEX_STRING_WITH_LEN(m_form->s->db),
+ LEX_STRING_WITH_LEN(m_form->s->table_name),
+ &table_to_alter, heap, cs);
+
+ /* Starting from 4.0.18 and 4.1.2, we generate foreign key ids
+ in the format databasename/tablename_ibfk_[number], where
+ [number] is local to the table; look for the highest [number]
+ for table_to_alter, so that we can assign higher numbers to
+ the new constraints. */
+
+ /* If we are altering a temporary table, the table name after
+ ALTER TABLE does not correspond to the internal table name, and
+ table_to_alter is NULL. TODO: should we fix this somehow? */
+
+ if (table_to_alter) {
+ n = table_to_alter->name.m_name;
+ highest_id_so_far = dict_table_get_highest_foreign_id(
+ table_to_alter);
+ } else {
+ highest_id_so_far = 0;
+ }
+
+ char* bufend = innobase_convert_name(
+ create_name, sizeof create_name, n, strlen(n), m_thd);
+ create_name[bufend - create_name] = '\0';
+ number = highest_id_so_far + 1;
+ mem_heap_free(heap);
+ operation = "Alter ";
+ } else if (strstr(name, "#P#") || strstr(name, "#p#")) {
+ /* Partitioned table */
+ create_name[0] = '\0';
+ } else {
+ char* bufend = innobase_convert_name(create_name,
+ sizeof create_name,
+ name,
+ strlen(name), m_thd);
+ create_name[bufend - create_name] = '\0';
+ }
+
+ Alter_info* alter_info = m_create_info->alter_info;
+ ut_ad(alter_info);
+ List_iterator_fast<Key> key_it(alter_info->key_list);
+
+ dict_table_t* table = dict_table_get_low(name);
+ if (!table) {
+ ib_foreign_warn(m_trx, DB_CANNOT_ADD_CONSTRAINT, create_name,
+ "%s table %s foreign key constraint"
+ " failed. Table not found.",
+ operation, create_name);
+
+ return (DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ while (Key* key = key_it++) {
+ if (key->type != Key::FOREIGN_KEY)
+ continue;
+
+ if (tmp_table) {
+ ib_foreign_warn(m_trx, DB_CANNOT_ADD_CONSTRAINT,
+ create_name,
+ "%s table `%s`.`%s` with foreign key "
+ "constraint failed. "
+ "Temporary tables can't have "
+ "foreign key constraints.",
+ operation, m_form->s->db.str,
+ m_form->s->table_name.str);
+
+ return (DB_CANNOT_ADD_CONSTRAINT);
+ } else if (!*create_name) {
+ ut_ad("should be unreachable" == 0);
+ return DB_CANNOT_ADD_CONSTRAINT;
+ }
+
+ Foreign_key* fk = static_cast<Foreign_key*>(key);
+ Key_part_spec* col;
+ bool success;
+
+ dict_foreign_t* foreign = dict_mem_foreign_create();
+ if (!foreign) {
+ return (DB_OUT_OF_MEMORY);
+ }
+
+ List_iterator_fast<Key_part_spec> col_it(fk->columns);
+ unsigned i = 0, j = 0;
+ while ((col = col_it++)) {
+ column_names[i] = mem_heap_strdupl(
+ foreign->heap, col->field_name.str,
+ col->field_name.length);
+ success = find_col(table, column_names + i);
+ if (!success) {
+ key_text k(fk);
+ ib_foreign_warn(
+ m_trx, DB_CANNOT_ADD_CONSTRAINT,
+ create_name,
+ "%s table %s foreign key %s constraint"
+ " failed. Column %s was not found.",
+ operation, create_name, k.str(),
+ column_names[i]);
+ dict_foreign_free(foreign);
+ return (DB_CANNOT_ADD_CONSTRAINT);
+ }
+ ++i;
+ if (i >= MAX_COLS_PER_FK) {
+ key_text k(fk);
+ ib_foreign_warn(
+ m_trx, DB_CANNOT_ADD_CONSTRAINT,
+ create_name,
+ "%s table %s foreign key %s constraint"
+ " failed. Too many columns: %u (%u "
+ "allowed).",
+ operation, create_name, k.str(), i,
+ MAX_COLS_PER_FK);
+ dict_foreign_free(foreign);
+ return (DB_CANNOT_ADD_CONSTRAINT);
+ }
+ }
+
+ index = dict_foreign_find_index(
+ table, NULL, column_names, i, NULL, TRUE, FALSE,
+ &index_error, &err_col, &err_index);
+
+ if (!index) {
+ key_text k(fk);
+ foreign_push_index_error(m_trx, operation, create_name,
+ k.str(), column_names,
+ index_error, err_col,
+ err_index, table);
+ dict_foreign_free(foreign);
+ return (DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ if (fk->constraint_name.str) {
+ ulint db_len;
+
+ /* Concatenate 'databasename/' and the constraint name
+ specified by the user: we conceive the constraint as
+ belonging to the same MySQL 'database' as the table
+ itself. We store the name in foreign->id. */
+
+ db_len = dict_get_db_name_len(table->name.m_name);
+
+ foreign->id = static_cast<char*>(mem_heap_alloc(
+ foreign->heap,
+ db_len + fk->constraint_name.length + 2));
+
+ memcpy(foreign->id, table->name.m_name, db_len);
+ foreign->id[db_len] = '/';
+ strcpy(foreign->id + db_len + 1,
+ fk->constraint_name.str);
+ }
+
+ if (foreign->id == NULL) {
+ error = dict_create_add_foreign_id(
+ &number, table->name.m_name, foreign);
+ if (error != DB_SUCCESS) {
+ dict_foreign_free(foreign);
+ return (error);
+ }
+ }
+
+ std::pair<dict_foreign_set::iterator, bool> ret
+ = local_fk_set.insert(foreign);
+
+ if (!ret.second) {
+ /* A duplicate foreign key name has been found */
+ dict_foreign_free(foreign);
+ return (DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ foreign->foreign_table = table;
+ foreign->foreign_table_name
+ = mem_heap_strdup(foreign->heap, table->name.m_name);
+ if (!foreign->foreign_table_name) {
+ return (DB_OUT_OF_MEMORY);
+ }
+
+ dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+
+ foreign->foreign_index = index;
+ foreign->n_fields = i & dict_index_t::MAX_N_FIELDS;
+
+ foreign->foreign_col_names = static_cast<const char**>(
+ mem_heap_alloc(foreign->heap, i * sizeof(void*)));
+ if (!foreign->foreign_col_names) {
+ return (DB_OUT_OF_MEMORY);
+ }
+
+ memcpy(foreign->foreign_col_names, column_names,
+ i * sizeof(void*));
+
+ foreign->referenced_table_name = dict_get_referenced_table(
+ name, LEX_STRING_WITH_LEN(fk->ref_db),
+ LEX_STRING_WITH_LEN(fk->ref_table),
+ &foreign->referenced_table, foreign->heap, cs);
+
+ if (!foreign->referenced_table_name) {
+ return (DB_OUT_OF_MEMORY);
+ }
+
+ if (!foreign->referenced_table && m_trx->check_foreigns) {
+ char buf[MAX_TABLE_NAME_LEN + 1] = "";
+ char* bufend;
+
+ bufend = innobase_convert_name(
+ buf, MAX_TABLE_NAME_LEN,
+ foreign->referenced_table_name,
+ strlen(foreign->referenced_table_name), m_thd);
+ buf[bufend - buf] = '\0';
+ key_text k(fk);
+ ib_foreign_warn(m_trx, DB_CANNOT_ADD_CONSTRAINT,
+ create_name,
+ "%s table %s with foreign key %s "
+ "constraint failed. Referenced table "
+ "%s not found in the data dictionary.",
+ operation, create_name, k.str(), buf);
+ return (DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ /* Don't allow foreign keys on partitioned tables yet. */
+ if (foreign->referenced_table
+ && dict_table_is_partition(foreign->referenced_table)) {
+ /* How could one make a referenced table a partition? */
+ ut_ad(0);
+ my_error(ER_FEATURE_NOT_SUPPORTED_WITH_PARTITIONING,
+ MYF(0), "FOREIGN KEY");
+ return (DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ col_it.init(fk->ref_columns);
+ while ((col = col_it++)) {
+ ref_column_names[j] = mem_heap_strdupl(
+ foreign->heap, col->field_name.str,
+ col->field_name.length);
+ if (foreign->referenced_table) {
+ success = find_col(foreign->referenced_table,
+ ref_column_names + j);
+ if (!success) {
+ key_text k(fk);
+ ib_foreign_warn(
+ m_trx,
+ DB_CANNOT_ADD_CONSTRAINT,
+ create_name,
+ "%s table %s foreign key %s "
+ "constraint failed. "
+ "Column %s was not found.",
+ operation, create_name,
+ k.str(), ref_column_names[j]);
+
+ return (DB_CANNOT_ADD_CONSTRAINT);
+ }
+ }
+ ++j;
+ }
+ /* See ER_WRONG_FK_DEF in mysql_prepare_create_table() */
+ ut_ad(i == j);
+
+ /* Try to find an index which contains the columns as the first
+ fields and in the right order, and the types are the same as in
+ foreign->foreign_index */
+
+ if (foreign->referenced_table) {
+ index = dict_foreign_find_index(
+ foreign->referenced_table, NULL,
+ ref_column_names, i, foreign->foreign_index,
+ TRUE, FALSE, &index_error, &err_col,
+ &err_index);
+
+ if (!index) {
+ key_text k(fk);
+ foreign_push_index_error(
+ m_trx, operation, create_name, k.str(),
+ column_names, index_error, err_col,
+ err_index, foreign->referenced_table);
+
+ return (DB_CANNOT_ADD_CONSTRAINT);
+ }
+ } else {
+ ut_a(m_trx->check_foreigns == FALSE);
+ index = NULL;
+ }
+
+ foreign->referenced_index = index;
+ dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
+
+ foreign->referenced_col_names = static_cast<const char**>(
+ mem_heap_alloc(foreign->heap, i * sizeof(void*)));
+ if (!foreign->referenced_col_names) {
+ return (DB_OUT_OF_MEMORY);
+ }
+
+ memcpy(foreign->referenced_col_names, ref_column_names,
+ i * sizeof(void*));
+
+ if (fk->delete_opt == FK_OPTION_SET_NULL
+ || fk->update_opt == FK_OPTION_SET_NULL) {
+ for (j = 0; j < foreign->n_fields; j++) {
+ if ((dict_index_get_nth_col(
+ foreign->foreign_index, j)
+ ->prtype)
+ & DATA_NOT_NULL) {
+ const dict_col_t* col
+ = dict_index_get_nth_col(
+ foreign->foreign_index,
+ j);
+ const char* col_name
+ = dict_table_get_col_name(
+ foreign->foreign_index
+ ->table,
+ dict_col_get_no(col));
+
+ /* It is not sensible to define
+ SET NULL if the column is not
+ allowed to be NULL! */
+ key_text k(fk);
+ ib_foreign_warn(
+ m_trx,
+ DB_CANNOT_ADD_CONSTRAINT,
+ create_name,
+ "%s table %s with foreign key "
+ "%s constraint failed. You have"
+ " defined a SET NULL condition "
+ "but column '%s' is defined as "
+ "NOT NULL.",
+ operation, create_name,
+ k.str(), col_name);
+
+ return (DB_CANNOT_ADD_CONSTRAINT);
+ }
+ }
+ }
+
+ switch (fk->delete_opt) {
+ case FK_OPTION_UNDEF:
+ case FK_OPTION_RESTRICT:
+ break;
+ case FK_OPTION_CASCADE:
+ foreign->type |= DICT_FOREIGN_ON_DELETE_CASCADE;
+ break;
+ case FK_OPTION_SET_NULL:
+ foreign->type |= DICT_FOREIGN_ON_DELETE_SET_NULL;
+ break;
+ case FK_OPTION_NO_ACTION:
+ foreign->type |= DICT_FOREIGN_ON_DELETE_NO_ACTION;
+ break;
+ case FK_OPTION_SET_DEFAULT:
+ // TODO: MDEV-10393 Foreign keys SET DEFAULT action
+ break;
+ default:
+ ut_ad(0);
+ break;
+ }
+
+ switch (fk->update_opt) {
+ case FK_OPTION_UNDEF:
+ case FK_OPTION_RESTRICT:
+ break;
+ case FK_OPTION_CASCADE:
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE;
+ break;
+ case FK_OPTION_SET_NULL:
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL;
+ break;
+ case FK_OPTION_NO_ACTION:
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION;
+ break;
+ case FK_OPTION_SET_DEFAULT:
+ // TODO: MDEV-10393 Foreign keys SET DEFAULT action
+ break;
+ default:
+ ut_ad(0);
+ break;
+ }
+ }
+
+ if (dict_foreigns_has_s_base_col(local_fk_set, table)) {
+ return (DB_NO_FK_ON_S_BASE_COL);
+ }
+
+ /**********************************************************/
+ /* The following call adds the foreign key constraints
+ to the data dictionary system tables on disk */
+ m_trx->op_info = "adding foreign keys";
+
+ trx_start_if_not_started_xa(m_trx, true);
+
+ trx_set_dict_operation(m_trx, TRX_DICT_OP_TABLE);
+
+ error = dict_create_add_foreigns_to_dictionary(local_fk_set, table,
+ m_trx);
+
+ if (error == DB_SUCCESS) {
+
+ table->foreign_set.insert(local_fk_set.begin(),
+ local_fk_set.end());
+ std::for_each(local_fk_set.begin(), local_fk_set.end(),
+ dict_foreign_add_to_referenced_table());
+ local_fk_set.clear();
+
+ dict_mem_table_fill_foreign_vcol_set(table);
+ }
+ return (error);
+}
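+
+/* A sketch of the resulting constraint ids (identifiers
+illustrative): for
+
+  ALTER TABLE db1.t1 ADD FOREIGN KEY (b) REFERENCES db1.t2 (a);
+
+the unnamed constraint is assigned an id of the form "db1/t1_ibfk_N",
+where N is one larger than the highest existing _ibfk_ number on t1;
+a constraint explicitly named fk1 is stored as "db1/fk1". */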
+
+/** Create the internal innodb table.
+@param create_fk whether to add FOREIGN KEY constraints */
+int create_table_info_t::create_table(bool create_fk)
+{
+ int error;
+ int primary_key_no;
+ uint i;
+
+ DBUG_ENTER("create_table");
+
+ /* Look for a primary key */
+ primary_key_no = (m_form->s->primary_key != MAX_KEY ?
+ (int) m_form->s->primary_key : -1);
+
+ /* Our function innobase_get_mysql_key_number_for_index assumes
+ the primary key is always number 0, if it exists */
+ ut_a(primary_key_no == -1 || primary_key_no == 0);
+
+ error = create_table_def();
+
+ if (error) {
+ DBUG_RETURN(error);
+ }
+
+ DBUG_ASSERT(m_drop_before_rollback
+ == !(m_flags2 & DICT_TF2_TEMPORARY));
+
+ /* Create the keys */
+
+ if (m_form->s->keys == 0 || primary_key_no == -1) {
+ /* Create an index which is used as the clustered index;
+ order the rows by their row id which is internally generated
+ by InnoDB */
+ ulint flags = m_table->flags;
+ dict_index_t* index = dict_mem_index_create(
+ m_table, innobase_index_reserve_name,
+ DICT_CLUSTERED, 0);
+ error = convert_error_code_to_mysql(
+ row_create_index_for_mysql(index, m_trx, NULL),
+ flags, m_thd);
+ if (error) {
+ DBUG_RETURN(error);
+ }
+ }
+
+ if (primary_key_no != -1) {
+ /* In InnoDB the clustered index must always be created
+ first */
+ if ((error = create_index(m_trx, m_form, m_table,
+ (uint) primary_key_no))) {
+ DBUG_RETURN(error);
+ }
+ }
+
+ /* Create the ancillary tables that are common to all FTS indexes on
+ this table. */
+ if (m_flags2 & DICT_TF2_FTS) {
+ fts_doc_id_index_enum ret;
+
+ /* Check whether there already exists FTS_DOC_ID_INDEX */
+ ret = innobase_fts_check_doc_id_index_in_def(
+ m_form->s->keys, m_form->key_info);
+
+ switch (ret) {
+ case FTS_INCORRECT_DOC_ID_INDEX:
+ push_warning_printf(m_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_NAME_FOR_INDEX,
+ " InnoDB: Index name %s is reserved"
+ " for the unique index on"
+ " FTS_DOC_ID column for FTS"
+ " Document ID indexing"
+ " on table %s. Please check"
+ " the index definition to"
+ " make sure it is of correct"
+ " type\n",
+ FTS_DOC_ID_INDEX_NAME,
+ m_table->name.m_name);
+
+ if (m_table->fts) {
+ fts_free(m_table);
+ }
+
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ FTS_DOC_ID_INDEX_NAME);
+ DBUG_RETURN(-1);
+ case FTS_EXIST_DOC_ID_INDEX:
+ case FTS_NOT_EXIST_DOC_ID_INDEX:
+ break;
+ }
+
+ dberr_t err = fts_create_common_tables(
+ m_trx, m_table,
+ (ret == FTS_EXIST_DOC_ID_INDEX));
+
+ error = convert_error_code_to_mysql(err, 0, NULL);
+
+ if (error) {
+ DBUG_RETURN(error);
+ }
+ }
+
+ for (i = 0; i < m_form->s->keys; i++) {
+ if (i != uint(primary_key_no)
+ && (error = create_index(m_trx, m_form, m_table, i))) {
+ DBUG_RETURN(error);
+ }
+ }
+
+ /* Cache all the FTS indexes on this table in the FTS specific
+ structure. They are used for FTS indexed column update handling. */
+ if (m_flags2 & DICT_TF2_FTS) {
+ fts_t* fts = m_table->fts;
+
+ ut_a(fts != NULL);
+
+ dict_table_get_all_fts_indexes(m_table, fts->indexes);
+ }
+
+ dberr_t err = create_fk ? create_foreign_keys() : DB_SUCCESS;
+
+ if (err == DB_SUCCESS) {
+ /* Check that also referencing constraints are ok */
+ dict_names_t fk_tables;
+ err = dict_load_foreigns(m_table_name, NULL,
+ false, true,
+ DICT_ERR_IGNORE_NONE,
+ fk_tables);
+ while (err == DB_SUCCESS && !fk_tables.empty()) {
+ dict_load_table(fk_tables.front(),
+ DICT_ERR_IGNORE_NONE);
+ fk_tables.pop_front();
+ }
+ }
+
+ switch (err) {
+ case DB_PARENT_NO_INDEX:
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_CANNOT_ADD_FOREIGN,
+ "Create table '%s' with foreign key constraint"
+ " failed. There is no index in the referenced"
+ " table where the referenced columns appear"
+ " as the first columns.\n", m_table_name);
+ break;
+
+ case DB_CHILD_NO_INDEX:
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_CANNOT_ADD_FOREIGN,
+ "Create table '%s' with foreign key constraint"
+ " failed. There is no index in the referencing"
+ " table where referencing columns appear"
+ " as the first columns.\n", m_table_name);
+ break;
+ case DB_NO_FK_ON_S_BASE_COL:
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_CANNOT_ADD_FOREIGN,
+ "Create table '%s' with foreign key constraint"
+ " failed. Cannot add foreign key constraint"
+ " placed on the base column of stored"
+ " column. \n",
+ m_table_name);
+ default:
+ break;
+ }
+
+ if (err != DB_SUCCESS) {
+ DBUG_RETURN(convert_error_code_to_mysql(
+ err, m_flags, NULL));
+ }
+
+ /* In TRUNCATE TABLE, we will merely warn about the maximum
+ row size being too large. */
+ if (!row_size_is_acceptable(*m_table, create_fk)) {
+ DBUG_RETURN(convert_error_code_to_mysql(
+ DB_TOO_BIG_RECORD, m_flags, NULL));
+ }
+
+ DBUG_RETURN(0);
+}
+
+bool create_table_info_t::row_size_is_acceptable(
+ const dict_table_t &table, bool strict) const
+{
+ for (dict_index_t *index= dict_table_get_first_index(&table); index;
+ index= dict_table_get_next_index(index))
+ if (!row_size_is_acceptable(*index, strict))
+ return false;
+ return true;
+}
+
+/* FIXME: row size check has some flaws and should be improved */
+dict_index_t::record_size_info_t dict_index_t::record_size_info() const
+{
+ ut_ad(!(type & DICT_FTS));
+
+ /* maximum allowed size of a node pointer record */
+ ulint page_ptr_max;
+ const bool comp= table->not_redundant();
+ /* table->space == NULL after DISCARD TABLESPACE */
+ const ulint zip_size= dict_tf_get_zip_size(table->flags);
+ record_size_info_t result;
+
+ if (zip_size && zip_size < srv_page_size)
+ {
+ /* On a ROW_FORMAT=COMPRESSED page, two records must fit in the
+ uncompressed page modification log. On compressed pages
+ with size.physical() == univ_page_size.physical(),
+ this limit will never be reached. */
+ ut_ad(comp);
+ /* The maximum allowed record size is the size of
+ an empty page, minus a byte for recording the heap
+ number in the page modification log. The maximum
+ allowed node pointer size is half that. */
+ result.max_leaf_size= page_zip_empty_size(n_fields, zip_size);
+ if (result.max_leaf_size)
+ {
+ result.max_leaf_size--;
+ }
+ page_ptr_max= result.max_leaf_size / 2;
+ /* On a compressed page, there is a two-byte entry in
+ the dense page directory for every record. But there
+ is no record header. */
+ result.shortest_size= 2;
+ }
+ else
+ {
+ /* The maximum allowed record size is half a B-tree
+ page (16k for a 64k page size). No additional sparse
+ page directory entry will be generated for the first
+ few user records. */
+ result.max_leaf_size= (comp || srv_page_size < UNIV_PAGE_SIZE_MAX)
+ ? page_get_free_space_of_empty(comp) / 2
+ : REDUNDANT_REC_MAX_DATA_SIZE;
+
+ page_ptr_max= result.max_leaf_size;
+ /* Each record has a header. */
+ result.shortest_size= comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES;
+ }
+
+ if (comp)
+ {
+ /* Include the "null" flags in the
+ maximum possible record size. */
+ result.shortest_size+= UT_BITS_IN_BYTES(n_nullable);
+ }
+ else
+ {
+ /* For each column, include a 2-byte offset and a
+ "null" flag. The 1-byte format is only used in short
+ records that do not contain externally stored columns.
+ Such records could never exceed the page limit, even
+ when using the 2-byte format. */
+ result.shortest_size+= 2 * n_fields;
+ }
+
+ const ulint max_local_len= table->get_overflow_field_local_len();
+
+ /* Compute the maximum possible record size. */
+ for (unsigned i= 0; i < n_fields; i++)
+ {
+ const dict_field_t &f= fields[i];
+ const dict_col_t &col= *f.col;
+
+ /* In dtuple_convert_big_rec(), variable-length columns
+ that are longer than BTR_EXTERN_LOCAL_STORED_MAX_SIZE
+ may be chosen for external storage.
+
+ Fixed-length columns, and all columns of secondary
+ index records are always stored inline. */
+
+ /* Determine the maximum length of the index field.
+ The field_ext_max_size should be computed as the worst
+ case in rec_get_converted_size_comp() for
+ REC_STATUS_ORDINARY records. */
+
+ size_t field_max_size= dict_col_get_fixed_size(&col, comp);
+ if (field_max_size && f.fixed_len != 0)
+ {
+ /* dict_index_add_col() should guarantee this */
+ ut_ad(!f.prefix_len || f.fixed_len == f.prefix_len);
+ /* Fixed lengths are not encoded
+ in ROW_FORMAT=COMPACT. */
+ goto add_field_size;
+ }
+
+ field_max_size= dict_col_get_max_size(&col);
+
+ if (f.prefix_len)
+ {
+ if (f.prefix_len < field_max_size)
+ {
+ field_max_size= f.prefix_len;
+ }
+
+ /* These conditions were copied from dtuple_convert_big_rec(). */
+ }
+ else if (field_max_size > max_local_len &&
+ field_max_size > BTR_EXTERN_LOCAL_STORED_MAX_SIZE &&
+ DATA_BIG_COL(&col) && dict_index_is_clust(this))
+ {
+
+ /* In the worst case, we have a locally stored
+ column of BTR_EXTERN_LOCAL_STORED_MAX_SIZE bytes.
+ The length can be stored in one byte. If the
+ column were stored externally, the lengths in
+ the clustered index page would be
+ BTR_EXTERN_FIELD_REF_SIZE and 2. */
+ field_max_size= max_local_len;
+ }
+
+ if (comp)
+ {
+ /* Add the extra size for ROW_FORMAT=COMPACT.
+ For ROW_FORMAT=REDUNDANT, these bytes were
+ added to result.shortest_size before this loop. */
+ result.shortest_size+= field_max_size < 256 ? 1 : 2;
+ }
+ add_field_size:
+ result.shortest_size+= field_max_size;
+
+ /* Check the size limit on leaf pages. */
+ if (result.shortest_size >= result.max_leaf_size)
+ {
+ result.set_too_big(i);
+ }
+
+ /* Check the size limit on non-leaf pages. Records
+ stored in non-leaf B-tree pages consist of the unique
+ columns of the record (the key columns of the B-tree)
+ and a node pointer field. When we have processed the
+ unique columns, result.shortest_size equals the size of the
+ node pointer record minus the node pointer column. */
+ if (i + 1 == dict_index_get_n_unique_in_tree(this) &&
+ result.shortest_size + REC_NODE_PTR_SIZE >= page_ptr_max)
+ {
+ result.set_too_big(i);
+ }
+ }
+
+ return result;
+}
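+
+/* Worked example (numbers approximate, they depend on the page size
+and build): with the default innodb_page_size=16k and
+ROW_FORMAT=DYNAMIC, page_get_free_space_of_empty(TRUE) / 2 is about
+8126 bytes, so an index whose computed shortest_size reaches that
+limit calls set_too_big(); for ROW_FORMAT=COMPRESSED with
+KEY_BLOCK_SIZE=8, the leaf limit is page_zip_empty_size(n_fields,
+8192) - 1, and node pointer records get half of that. */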
+
+/** Issue a warning that the row is too big. */
+static void ib_warn_row_too_big(THD *thd, const dict_table_t *table)
+{
+ /* FIXME: this row size check should be improved */
+ /* If prefix is true then a 768-byte prefix is stored
+ locally for BLOB fields. Refer to dict_table_get_format() */
+ const bool prefix= !dict_table_has_atomic_blobs(table);
+
+ const ulint free_space=
+ page_get_free_space_of_empty(table->flags & DICT_TF_COMPACT) / 2;
+
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_TO_BIG_ROW,
+ "Row size too large (> " ULINTPF "). Changing some columns to TEXT"
+ " or BLOB %smay help. In current row format, BLOB prefix of"
+ " %d bytes is stored inline.",
+ free_space,
+ prefix ? "or using ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED " : "",
+ prefix ? DICT_MAX_FIXED_COL_LEN : 0);
+}
+
+bool create_table_info_t::row_size_is_acceptable(
+ const dict_index_t &index, bool strict) const
+{
+ if ((index.type & DICT_FTS) || index.table->is_system_db)
+ {
+ /* Skip the check for system tables, because the maximum
+ row size of innodb_table_stats cannot fit on a 4k page. */
+ return true;
+ }
+
+ const bool innodb_strict_mode= THDVAR(m_thd, strict_mode);
+ dict_index_t::record_size_info_t info= index.record_size_info();
+
+ if (info.row_is_too_big())
+ {
+ ut_ad(info.get_overrun_size() != 0);
+ ut_ad(info.max_leaf_size != 0);
+
+ const size_t idx= info.get_first_overrun_field_index();
+ const dict_field_t *field= dict_index_get_nth_field(&index, idx);
+
+ ut_ad((!field->name) == field->col->is_dropped());
+ if (innodb_strict_mode || global_system_variables.log_warnings > 2)
+ {
+ ib::error_or_warn eow(strict && innodb_strict_mode);
+ if (field->name)
+ eow << "Cannot add field " << field->name << " in table ";
+ else
+ eow << "Cannot add an instantly dropped column in table ";
+ eow << index.table->name << " because after adding it, the row size is "
+ << info.get_overrun_size()
+ << " which is greater than maximum allowed size ("
+ << info.max_leaf_size << " bytes) for a record on index leaf page.";
+ }
+
+ if (strict && innodb_strict_mode)
+ return false;
+
+ ib_warn_row_too_big(m_thd, index.table);
+ }
+
+ return true;
+}
+
+/** Update the data dictionary and in-memory caches for a newly created table.
+@return error number */
+int
+create_table_info_t::create_table_update_dict()
+{
+ dict_table_t* innobase_table;
+
+ DBUG_ENTER("create_table_update_dict");
+
+ innobase_table = dict_table_open_on_name(
+ m_table_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+ DBUG_ASSERT(innobase_table != 0);
+ if (innobase_table->fts != NULL) {
+ if (innobase_table->fts_doc_id_index == NULL) {
+ innobase_table->fts_doc_id_index
+ = dict_table_get_index_on_name(
+ innobase_table, FTS_DOC_ID_INDEX_NAME);
+ DBUG_ASSERT(innobase_table->fts_doc_id_index != NULL);
+ } else {
+ DBUG_ASSERT(innobase_table->fts_doc_id_index
+ == dict_table_get_index_on_name(
+ innobase_table,
+ FTS_DOC_ID_INDEX_NAME));
+ }
+ }
+
+ DBUG_ASSERT((innobase_table->fts == NULL)
+ == (innobase_table->fts_doc_id_index == NULL));
+
+ innobase_copy_frm_flags_from_create_info(innobase_table, m_create_info);
+
+ dict_stats_update(innobase_table, DICT_STATS_EMPTY_TABLE);
+
+ /* Load the server stopword list into the FTS cache */
+ if (m_flags2 & DICT_TF2_FTS) {
+ if (!innobase_fts_load_stopword(innobase_table, NULL, m_thd)) {
+ dict_table_close(innobase_table, FALSE, FALSE);
+ DBUG_RETURN(-1);
+ }
+
+ mutex_enter(&dict_sys.mutex);
+ fts_optimize_add_table(innobase_table);
+ mutex_exit(&dict_sys.mutex);
+ }
+
+ if (const Field* ai = m_form->found_next_number_field) {
+ ut_ad(ai->stored_in_db());
+
+ ib_uint64_t autoinc = m_create_info->auto_increment_value;
+
+ if (autoinc == 0) {
+ autoinc = 1;
+ }
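+ /* For example, CREATE TABLE ... AUTO_INCREMENT=100
+ initializes the in-memory counter to 100 and, for a
+ persistent table, writes 99 (the "last used" value)
+ to the clustered index root page below. */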
+
+ innobase_table->autoinc_mutex.lock();
+ dict_table_autoinc_initialize(innobase_table, autoinc);
+
+ if (innobase_table->is_temporary()) {
+ /* AUTO_INCREMENT is not persistent for
+ TEMPORARY TABLE. Temporary tables are never
+ evicted. Keep the counter in memory only. */
+ } else {
+ const unsigned col_no = innodb_col_no(ai);
+
+ innobase_table->persistent_autoinc
+ = static_cast<uint16_t>(
+ dict_table_get_nth_col_pos(
+ innobase_table, col_no, NULL)
+ + 1)
+ & dict_index_t::MAX_N_FIELDS;
+
+ /* Persist the "last used" value, which
+ typically is AUTO_INCREMENT - 1.
+ In btr_create(), the value 0 was already written. */
+ if (--autoinc) {
+ btr_write_autoinc(
+ dict_table_get_first_index(
+ innobase_table),
+ autoinc);
+ }
+ }
+
+ innobase_table->autoinc_mutex.unlock();
+ }
+
+ innobase_parse_hint_from_comment(m_thd, innobase_table, m_form->s);
+
+ dict_table_close(innobase_table, FALSE, FALSE);
+ DBUG_RETURN(0);
+}
+
+/** Allocate a new trx. */
+void
+create_table_info_t::allocate_trx()
+{
+ m_trx = innobase_trx_allocate(m_thd);
+
+ m_trx->will_lock = true;
+ m_trx->ddl = true;
+}
+
+/** Create a new table to an InnoDB database.
+@param[in] name Table name, format: "db/table_name".
+@param[in] form Table format; columns and index information.
+@param[in] create_info Create info (including create statement string).
+@param[in] file_per_table whether to create .ibd file
+@param[in,out] trx dictionary transaction, or NULL to create new
+@return 0 if success else error number. */
+inline int
+ha_innobase::create(
+ const char* name,
+ TABLE* form,
+ HA_CREATE_INFO* create_info,
+ bool file_per_table,
+ trx_t* trx)
+{
+ int error;
+ char norm_name[FN_REFLEN]; /* {database}/{tablename} */
+ char remote_path[FN_REFLEN]; /* Absolute path of table */
+
+ DBUG_ENTER("ha_innobase::create");
+
+ DBUG_ASSERT(form->s == table_share);
+ DBUG_ASSERT(table_share->table_type == TABLE_TYPE_SEQUENCE
+ || table_share->table_type == TABLE_TYPE_NORMAL);
+
+ create_table_info_t info(ha_thd(),
+ form,
+ create_info,
+ norm_name,
+ remote_path,
+ file_per_table, trx);
+
+ if ((error = info.initialize())
+ || (error = info.prepare_create_table(name, !trx))) {
+ if (trx) {
+ trx_rollback_for_mysql(trx);
+ row_mysql_unlock_data_dictionary(trx);
+ }
+ DBUG_RETURN(error);
+ }
+
+ const bool own_trx = !trx;
+
+ if (own_trx) {
+ info.allocate_trx();
+ trx = info.trx();
+ /* Latch the InnoDB data dictionary exclusively so that no deadlocks
+ or lock waits can happen in it during a table create operation.
+ Drop table etc. do this latching in row0mysql.cc. */
+ row_mysql_lock_data_dictionary(trx);
+ DBUG_ASSERT(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+ }
+
+ if ((error = info.create_table(own_trx))) {
+ /* Drop the partially created table before rollback,
+ so that rollback can possibly rename back a table
+ that could have been renamed before the failed creation. */
+ if (info.drop_before_rollback()) {
+ trx->error_state = DB_SUCCESS;
+ row_drop_table_for_mysql(info.table_name(),
+ trx, SQLCOM_TRUNCATE, true,
+ false);
+ }
+ trx_rollback_for_mysql(trx);
+ row_mysql_unlock_data_dictionary(trx);
+ goto func_exit;
+ }
+
+ innobase_commit_low(trx);
+ row_mysql_unlock_data_dictionary(trx);
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+ log_buffer_flush_to_disk();
+
+ ut_ad(!srv_read_only_mode);
+
+ error = info.create_table_update_dict();
+
+func_exit:
+ if (own_trx) {
+ trx->free();
+ }
+
+ DBUG_RETURN(error);
+}
+
+/** Create a new table to an InnoDB database.
+@param[in] name Table name, format: "db/table_name".
+@param[in] form Table format; columns and index information.
+@param[in] create_info Create info (including create statement string).
+@return 0 if success else error number. */
+int
+ha_innobase::create(
+ const char* name,
+ TABLE* form,
+ HA_CREATE_INFO* create_info)
+{
+ return create(name, form, create_info, srv_file_per_table);
+}
+
+/*****************************************************************//**
+Discards or imports an InnoDB tablespace.
+@return 0 == success, -1 == error */
+
+int
+ha_innobase::discard_or_import_tablespace(
+/*======================================*/
+ my_bool discard) /*!< in: TRUE if discard, else import */
+{
+
+ DBUG_ENTER("ha_innobase::discard_or_import_tablespace");
+
+ ut_a(m_prebuilt->trx != NULL);
+ ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N);
+ ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));
+
+ if (high_level_read_only) {
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ }
+
+ if (m_prebuilt->table->is_temporary()) {
+ ib_senderrf(
+ m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_CANNOT_DISCARD_TEMPORARY_TABLE);
+
+ DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE);
+ }
+
+ if (m_prebuilt->table->space == fil_system.sys_space) {
+ ib_senderrf(
+ m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_IN_SYSTEM_TABLESPACE,
+ m_prebuilt->table->name.m_name);
+
+ DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE);
+ }
+
+ trx_start_if_not_started(m_prebuilt->trx, true);
+
+ /* Obtain an exclusive lock on the table. */
+ dberr_t err = row_mysql_lock_table(
+ m_prebuilt->trx, m_prebuilt->table, LOCK_X,
+ discard ? "setting table lock for DISCARD TABLESPACE"
+ : "setting table lock for IMPORT TABLESPACE");
+
+ if (err != DB_SUCCESS) {
+ /* unable to lock the table: do nothing */
+ } else if (discard) {
+
+ /* Discarding an already discarded tablespace should be an
+ idempotent operation. Also, if the .ibd file is missing the
+ user may want to set the DISCARD flag in order to IMPORT
+ a new tablespace. */
+
+ if (!m_prebuilt->table->is_readable()) {
+ ib_senderrf(
+ m_prebuilt->trx->mysql_thd,
+ IB_LOG_LEVEL_WARN, ER_TABLESPACE_MISSING,
+ m_prebuilt->table->name.m_name);
+ }
+
+ err = row_discard_tablespace_for_mysql(
+ m_prebuilt->table->name.m_name, m_prebuilt->trx);
+
+ } else if (m_prebuilt->table->is_readable()) {
+ /* Commit the transaction in order to
+ release the table lock. */
+ trx_commit_for_mysql(m_prebuilt->trx);
+
+ ib::error() << "Unable to import tablespace "
+ << m_prebuilt->table->name << " because it already"
+ " exists. Please DISCARD the tablespace"
+ " before IMPORT.";
+ ib_senderrf(
+ m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_EXISTS, m_prebuilt->table->name.m_name);
+
+ DBUG_RETURN(HA_ERR_TABLE_EXIST);
+ } else {
+ err = row_import_for_mysql(m_prebuilt->table, m_prebuilt);
+
+ if (err == DB_SUCCESS) {
+
+ info(HA_STATUS_TIME
+ | HA_STATUS_CONST
+ | HA_STATUS_VARIABLE
+ | HA_STATUS_AUTO);
+
+ fil_crypt_set_encrypt_tables(srv_encrypt_tables);
+ }
+ }
+
+ /* Commit the transaction in order to release the table lock. */
+ trx_commit_for_mysql(m_prebuilt->trx);
+
+ if (discard || err != DB_SUCCESS) {
+ DBUG_RETURN(convert_error_code_to_mysql(
+ err, m_prebuilt->table->flags, NULL));
+ }
+
+ /* Evict and reload the table definition in order to invoke
+ btr_cur_instant_init(). */
+ table_id_t id = m_prebuilt->table->id;
+ ut_ad(id);
+ mutex_enter(&dict_sys.mutex);
+ dict_table_close(m_prebuilt->table, TRUE, FALSE);
+ dict_sys.remove(m_prebuilt->table);
+ m_prebuilt->table = dict_table_open_on_id(id, TRUE,
+ DICT_TABLE_OP_NORMAL);
+ mutex_exit(&dict_sys.mutex);
+ if (!m_prebuilt->table) {
+ err = DB_TABLE_NOT_FOUND;
+ } else {
+ if (const Field* ai = table->found_next_number_field) {
+ initialize_auto_increment(m_prebuilt->table, ai);
+ }
+ dict_stats_init(m_prebuilt->table);
+ }
+
+ if (dict_stats_is_persistent_enabled(m_prebuilt->table)) {
+ dberr_t ret;
+
+ /* Adjust the persistent statistics. */
+ ret = dict_stats_update(m_prebuilt->table,
+ DICT_STATS_RECALC_PERSISTENT);
+
+ if (ret != DB_SUCCESS) {
+ push_warning_printf(
+ ha_thd(),
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_ALTER_INFO,
+ "Error updating stats for table '%s'"
+ " after table rebuild: %s",
+ m_prebuilt->table->name.m_name,
+ ut_strerr(ret));
+ }
+ }
+
+ DBUG_RETURN(0);
+}
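+
+/* Illustrative usage of the above (the table name 't' is hypothetical):
+the tablespace is first detached, and later re-attached after a new .ibd
+file has been copied into place, for example:
+
+	ALTER TABLE t DISCARD TABLESPACE;
+	-- copy t.ibd (and t.cfg, if present) into the database directory
+	ALTER TABLE t IMPORT TABLESPACE;
+*/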
+
+/**
+ @return true if the .frm file exists
+ @return false if it does not exist
+*/
+
+static bool frm_file_exists(const char *path)
+{
+ char buff[FN_REFLEN];
+ strxnmov(buff, FN_REFLEN, path, reg_ext, NullS);
+ return !access(buff, F_OK);
+}
+
+
+/**
+Drops a table from an InnoDB database. Before calling this function,
+MySQL calls innobase_commit to commit the transaction of the current user.
+Then the current user cannot have any locks set on the table. The drop
+table operation inside InnoDB will remove all locks that any user has
+on the table inside InnoDB.
+@param[in] name table name
+@param[in] sqlcom SQLCOM_DROP_DB, SQLCOM_TRUNCATE, ...
+@return error number */
+inline int ha_innobase::delete_table(const char* name, enum_sql_command sqlcom)
+{
+ dberr_t err;
+ THD* thd = ha_thd();
+ char norm_name[FN_REFLEN];
+
+ DBUG_ENTER("ha_innobase::delete_table");
+
+ DBUG_EXECUTE_IF(
+ "test_normalize_table_name_low",
+ test_normalize_table_name_low();
+ );
+ DBUG_EXECUTE_IF(
+ "test_ut_format_name",
+ test_ut_format_name();
+ );
+
+ /* Strangely, MySQL passes the table name without the '.frm'
+ extension, in contrast to ::create */
+ normalize_table_name(norm_name, name);
+
+ if (high_level_read_only) {
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ }
+
+ trx_t* parent_trx = check_trx_exists(thd);
+
+ /* Remove the to-be-dropped table from the list of modified tables
+ by parent_trx. Otherwise we may end up with an orphaned pointer to
+ the table object from parent_trx::mod_tables. This could happen in:
+ SET AUTOCOMMIT=0;
+ CREATE TABLE t (PRIMARY KEY (a)) ENGINE=INNODB SELECT 1 AS a UNION
+ ALL SELECT 1 AS a; */
+ trx_mod_tables_t::const_iterator iter;
+
+ for (iter = parent_trx->mod_tables.begin();
+ iter != parent_trx->mod_tables.end();
+ ++iter) {
+
+ dict_table_t* table_to_drop = iter->first;
+
+ if (strcmp(norm_name, table_to_drop->name.m_name) == 0) {
+ parent_trx->mod_tables.erase(table_to_drop);
+ break;
+ }
+ }
+
+ trx_t* trx = innobase_trx_allocate(thd);
+
+ ulint name_len = strlen(name);
+
+ ut_a(name_len < 1000);
+
+ trx->will_lock = true;
+
+ /* Drop the table in InnoDB */
+
+ err = row_drop_table_for_mysql(norm_name, trx, sqlcom);
+
+ if (err == DB_TABLE_NOT_FOUND
+ && innobase_get_lower_case_table_names() == 1) {
+ char* is_part = is_partition(norm_name);
+
+ if (is_part) {
+ char par_case_name[FN_REFLEN];
+
+#ifndef _WIN32
+ /* Check for the table using the lower
+ case name, including the partition
+ separator "P" */
+ strcpy(par_case_name, norm_name);
+ innobase_casedn_str(par_case_name);
+#else
+ /* On the Windows platform, check
+ whether there exists a table name in
+ the system tables whose name was
+ not normalized to lower case */
+ normalize_table_name_c_low(
+ par_case_name, name, FALSE);
+#endif /* _WIN32 */
+ err = row_drop_table_for_mysql(
+ par_case_name, trx, sqlcom);
+ }
+ }
+
+ if (err == DB_TABLE_NOT_FOUND &&
+ frm_file_exists(name))
+ {
+ /* Try to drop all tables matching db/tablename + '#'.
+ Only partitions can have '#' as a non-first character in
+ the table name!
+
+ Temporary table names always start with '#', partitions are
+ the only 'tables' that can have '#' after the first character,
+ and the table name must have length > 0. User tables cannot
+ contain '#', since it would be translated to @0023. Therefore
+ this should only match partitions. */
+ uint len = (uint) strlen(norm_name);
+ ulint num_partitions;
+ ut_a(len < FN_REFLEN);
+ norm_name[len] = '#';
+ norm_name[len + 1] = 0;
+ err = row_drop_database_for_mysql(norm_name, trx,
+ &num_partitions);
+ norm_name[len] = 0;
+ table_name_t tbl_name(norm_name);
+ if (num_partitions == 0 && !tbl_name.is_temporary()) {
+ ib::error() << "Table " << tbl_name <<
+ " does not exist in the InnoDB"
+ " internal data dictionary though MariaDB is"
+ " trying to drop it. Have you copied the .frm"
+ " file of the table to the MariaDB database"
+ " directory from another database? "
+ << TROUBLESHOOTING_MSG;
+ }
+ if (num_partitions == 0) {
+ err = DB_TABLE_NOT_FOUND;
+ }
+ }
+
+ if (err == DB_TABLE_NOT_FOUND
+ && innobase_get_lower_case_table_names() == 1) {
+ char* is_part = is_partition(norm_name);
+
+ if (is_part != NULL) {
+ char par_case_name[FN_REFLEN];
+
+#ifndef _WIN32
+ /* Check for the table using lower
+ case name, including the partition
+ separator "P" */
+ strcpy(par_case_name, norm_name);
+ innobase_casedn_str(par_case_name);
+#else
+ /* On the Windows platform, check
+ whether there exists a table name in
+ the system tables whose name was
+ not normalized to lower case */
+ create_table_info_t::normalize_table_name_low(
+ par_case_name, name, FALSE);
+#endif /* _WIN32 */
+ err = row_drop_table_for_mysql(
+ par_case_name, trx, sqlcom, true);
+ }
+ }
+
+ ut_ad(!srv_read_only_mode);
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ innobase_commit_low(trx);
+
+ trx->free();
+
+ DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL));
+}
+
+/** Drop an InnoDB table.
+@param[in] name table name
+@return error number */
+int ha_innobase::delete_table(const char* name)
+{
+ enum_sql_command sqlcom = enum_sql_command(thd_sql_command(ha_thd()));
+ /* SQLCOM_TRUNCATE should be passed via ha_innobase::truncate() only.
+
+ On client disconnect, when dropping temporary tables, the
+ previous sqlcom would not be overwritten. In such a case, we
+ will have thd_kill_level() != NOT_KILLED, !m_prebuilt can
+ hold, and sqlcom could be anything, including TRUNCATE.
+
+ The sqlcom only matters for persistent tables; no persistent
+ metadata or FOREIGN KEY metadata is kept for temporary
+ tables. Therefore, we relax the assertion. If there is a bug
+ that slips through this assertion due to !m_prebuilt, the
+ worst impact should be that on DROP TABLE of a persistent
+ table, FOREIGN KEY constraints will be ignored and their
+ metadata will not be removed. */
+ DBUG_ASSERT(sqlcom != SQLCOM_TRUNCATE
+ || (thd_kill_level(ha_thd()) != THD_IS_NOT_KILLED
+ && (!m_prebuilt
+ || m_prebuilt->table->is_temporary())));
+ return delete_table(name, sqlcom);
+}
+
+/** Remove all tables in the named database inside InnoDB.
+@param[in] hton handlerton from InnoDB
+@param[in] path Database path; Inside InnoDB the name of the last
+directory in the path is used as the database name.
+For example, in 'mysql/data/test' the database name is 'test'. */
+
+static
+void
+innobase_drop_database(
+ handlerton* hton,
+ char* path)
+{
+ char* namebuf;
+
+ /* Get the transaction associated with the current thd, or create one
+ if not yet created */
+
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (high_level_read_only) {
+ return;
+ }
+
+ THD* thd = current_thd;
+
+ ulint len = 0;
+ char* ptr = strend(path) - 2;
+
+ while (ptr >= path && *ptr != '\\' && *ptr != '/') {
+ ptr--;
+ len++;
+ }
+
+ ptr++;
+ namebuf = (char*) my_malloc(PSI_INSTRUMENT_ME, (uint) len + 2, MYF(0));
+
+ memcpy(namebuf, ptr, len);
+ namebuf[len] = '/';
+ namebuf[len + 1] = '\0';
+
+#ifdef _WIN32
+ innobase_casedn_str(namebuf);
+#endif /* _WIN32 */
+
+ trx_t* trx = innobase_trx_allocate(thd);
+ trx->will_lock = true;
+
+ ulint dummy;
+
+ row_drop_database_for_mysql(namebuf, trx, &dummy);
+
+ my_free(namebuf);
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ innobase_commit_low(trx);
+
+ trx->free();
+}
+
+/** Rename an InnoDB table.
+@param[in,out] trx InnoDB data dictionary transaction
+@param[in] from old table name
+@param[in] to new table name
+@param[in] commit whether to commit trx (and to enforce FOREIGN KEY)
+@return DB_SUCCESS or error code */
+inline dberr_t innobase_rename_table(trx_t *trx, const char *from,
+ const char *to, bool commit)
+{
+ dberr_t error;
+ char norm_to[FN_REFLEN];
+ char norm_from[FN_REFLEN];
+
+ DBUG_ENTER("innobase_rename_table");
+ DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX
+ || trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE);
+
+ ut_ad(!srv_read_only_mode);
+
+ normalize_table_name(norm_to, to);
+ normalize_table_name(norm_from, from);
+
+ DEBUG_SYNC_C("innodb_rename_table_ready");
+
+ trx_start_if_not_started(trx, true);
+ ut_ad(trx->will_lock);
+
+ if (commit) {
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations. */
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ error = row_rename_table_for_mysql(norm_from, norm_to, trx, commit,
+ commit);
+
+ if (error != DB_SUCCESS) {
+ if (error == DB_TABLE_NOT_FOUND
+ && innobase_get_lower_case_table_names() == 1) {
+ char* is_part = is_partition(norm_from);
+
+ if (is_part) {
+ char par_case_name[FN_REFLEN];
+#ifndef _WIN32
+ /* Check for the table using lower
+ case name, including the partition
+ separator "P" */
+ strcpy(par_case_name, norm_from);
+ innobase_casedn_str(par_case_name);
+#else
+ /* On the Windows platform, check
+ whether there exists a table name in
+ the system tables whose name was
+ not normalized to lower case */
+ create_table_info_t::normalize_table_name_low(
+ par_case_name, from, FALSE);
+#endif /* _WIN32 */
+ trx_start_if_not_started(trx, true);
+ error = row_rename_table_for_mysql(
+ par_case_name, norm_to, trx,
+ true, false);
+ }
+ }
+
+ if (error == DB_SUCCESS) {
+#ifndef _WIN32
+ sql_print_warning("Rename partition table %s"
+ " succeeds after converting to lower"
+ " case. The table may have"
+ " been moved from a case"
+ " in-sensitive file system.\n",
+ norm_from);
+#else
+ sql_print_warning("Rename partition table %s"
+ " succeeds after skipping the step to"
+ " lower case the table name."
+ " The table may have been"
+ " moved from a case sensitive"
+ " file system.\n",
+ norm_from);
+#endif /* _WIN32 */
+ }
+ }
+
+ if (commit) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ /* Flush the log to reduce probability that the .frm
+ files and the InnoDB data dictionary get out-of-sync
+ if the user runs with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ DBUG_RETURN(error);
+}
+
+/** TRUNCATE TABLE
+@return error code
+@retval 0 on success */
+int ha_innobase::truncate()
+{
+ DBUG_ENTER("ha_innobase::truncate");
+
+ if (high_level_read_only) {
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ }
+
+ update_thd();
+
+ HA_CREATE_INFO info;
+ mem_heap_t* heap = mem_heap_create(1000);
+ dict_table_t* ib_table = m_prebuilt->table;
+ const auto update_time = ib_table->update_time;
+ const auto stored_lock = m_prebuilt->stored_select_lock_type;
+ info.init();
+ update_create_info_from_table(&info, table);
+
+ if (ib_table->is_temporary()) {
+ info.options|= HA_LEX_CREATE_TMP_TABLE;
+ } else {
+ dict_get_and_save_data_dir_path(ib_table, false);
+ }
+
+ char* data_file_name = ib_table->data_dir_path;
+
+ if (data_file_name) {
+ info.data_file_name = data_file_name
+ = mem_heap_strdup(heap, data_file_name);
+ }
+
+ const char* temp_name = dict_mem_create_temporary_tablename(
+ heap, ib_table->name.m_name, ib_table->id);
+ const char* name = mem_heap_strdup(heap, ib_table->name.m_name);
+ trx_t* trx = innobase_trx_allocate(m_user_thd);
+ trx->will_lock = true;
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ row_mysql_lock_data_dictionary(trx);
+ dict_stats_wait_bg_to_stop_using_table(ib_table, trx);
+
+ int err = convert_error_code_to_mysql(
+ innobase_rename_table(trx, ib_table->name.m_name, temp_name,
+ false),
+ ib_table->flags, m_user_thd);
+ if (err) {
+ trx_rollback_for_mysql(trx);
+ row_mysql_unlock_data_dictionary(trx);
+ } else {
+ switch (dict_tf_get_rec_format(ib_table->flags)) {
+ case REC_FORMAT_REDUNDANT:
+ info.row_type = ROW_TYPE_REDUNDANT;
+ break;
+ case REC_FORMAT_COMPACT:
+ info.row_type = ROW_TYPE_COMPACT;
+ break;
+ case REC_FORMAT_COMPRESSED:
+ info.row_type = ROW_TYPE_COMPRESSED;
+ break;
+ case REC_FORMAT_DYNAMIC:
+ info.row_type = ROW_TYPE_DYNAMIC;
+ break;
+ }
+
+ err = create(name, table, &info,
+ ib_table->is_temporary()
+ || dict_table_is_file_per_table(ib_table), trx);
+ }
+
+ trx->free();
+
+ if (!err) {
+ /* Reopen the newly created table, and drop the
+ original table that was renamed to temp_name. */
+
+ row_prebuilt_t* prebuilt = m_prebuilt;
+ uchar* upd_buf = m_upd_buf;
+ ulint upd_buf_size = m_upd_buf_size;
+ /* Mimic ha_innobase::close(). */
+ m_prebuilt = NULL;
+ m_upd_buf = NULL;
+ m_upd_buf_size = 0;
+ err = open(name, 0, 0);
+ if (!err) {
+ m_prebuilt->stored_select_lock_type = stored_lock;
+ m_prebuilt->table->update_time = update_time;
+ row_prebuilt_free(prebuilt, FALSE);
+ delete_table(temp_name, SQLCOM_TRUNCATE);
+ my_free(upd_buf);
+ } else {
+ /* Revert to the old table before truncation. */
+ m_prebuilt = prebuilt;
+ m_upd_buf = upd_buf;
+ m_upd_buf_size = upd_buf_size;
+ }
+ }
+
+ mem_heap_free(heap);
+ DBUG_RETURN(err);
+}
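+
+/* A rough sketch of the TRUNCATE sequence implemented above (the names
+are illustrative; the temporary name actually comes from
+dict_mem_create_temporary_tablename()): the table is renamed out of the
+way, re-created empty, and the renamed original is dropped only after the
+new table opens cleanly:
+
+	rename	db/t -> db/#sql-ib<id>	(innobase_rename_table)
+	create	db/t			(ha_innobase::create)
+	open	db/t			(ha_innobase::open)
+	drop	db/#sql-ib<id>		(delete_table, SQLCOM_TRUNCATE)
+
+If the open fails, the handler restores the previous prebuilt struct
+instead. */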
+
+/*********************************************************************//**
+Renames an InnoDB table.
+@return 0 or error code */
+
+int
+ha_innobase::rename_table(
+/*======================*/
+ const char* from, /*!< in: old name of the table */
+ const char* to) /*!< in: new name of the table */
+{
+ THD* thd = ha_thd();
+
+ DBUG_ENTER("ha_innobase::rename_table");
+
+ if (high_level_read_only) {
+ ib_senderrf(thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ }
+
+ trx_t* trx = innobase_trx_allocate(thd);
+ trx->will_lock = true;
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+ dberr_t error = innobase_rename_table(trx, from, to, true);
+
+ DEBUG_SYNC(thd, "after_innobase_rename_table");
+
+ innobase_commit_low(trx);
+
+ trx->free();
+
+ if (error == DB_SUCCESS) {
+ char norm_from[MAX_FULL_NAME_LEN];
+ char norm_to[MAX_FULL_NAME_LEN];
+ char errstr[512];
+ dberr_t ret;
+
+ normalize_table_name(norm_from, from);
+ normalize_table_name(norm_to, to);
+
+ ret = dict_stats_rename_table(norm_from, norm_to,
+ errstr, sizeof(errstr));
+
+ if (ret != DB_SUCCESS) {
+ ib::error() << errstr;
+
+ push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_LOCK_WAIT_TIMEOUT, errstr);
+ }
+ }
+
+ /* Add a special case to handle the Duplicated Key error
+ and return DB_ERROR instead.
+ This is to avoid a possible SIGSEGV error from mysql error
+ handling code. Currently, mysql handles the Duplicated Key
+ error by re-entering the storage layer and getting dup key
+ info by calling get_dup_key(). This operation requires a valid
+ table handle ('row_prebuilt_t' structure) which could no
+ longer be available in the error handling stage. The suggested
+ solution is to report a 'table exists' error message (since
+ the dup key error here is due to an existing table whose name
+ is the one we are trying to rename to) and return the generic
+ error code. */
+ if (error == DB_DUPLICATE_KEY) {
+ my_error(ER_TABLE_EXISTS_ERROR, MYF(0), to);
+
+ error = DB_ERROR;
+ } else if (error == DB_LOCK_WAIT_TIMEOUT) {
+ my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0), to);
+
+ error = DB_LOCK_WAIT;
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*********************************************************************//**
+Estimates the number of index records in a range.
+@return estimated number of rows */
+
+ha_rows
+ha_innobase::records_in_range(
+/*==========================*/
+ uint keynr, /*!< in: index number */
+ const key_range *min_key, /*!< in: start key value of the
+ range, may also be 0 */
+ const key_range *max_key, /*!< in: range end key val, may
+ also be 0 */
+ page_range *pages)
+{
+ KEY* key;
+ dict_index_t* index;
+ dtuple_t* range_start;
+ dtuple_t* range_end;
+ ha_rows n_rows;
+ page_cur_mode_t mode1;
+ page_cur_mode_t mode2;
+ mem_heap_t* heap;
+
+ DBUG_ENTER("records_in_range");
+
+ ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));
+
+ m_prebuilt->trx->op_info = "estimating records in index range";
+
+ active_index = keynr;
+
+ key = table->key_info + active_index;
+
+ index = innobase_get_index(keynr);
+
+ /* It is possible that the requested index cannot be found, due
+ to an inconsistency between the MySQL and InnoDB dictionary info.
+ The necessary message should have been printed in innobase_get_index() */
+ if (!m_prebuilt->table->space) {
+ n_rows = HA_POS_ERROR;
+ goto func_exit;
+ }
+ if (!index) {
+ n_rows = HA_POS_ERROR;
+ goto func_exit;
+ }
+ if (index->is_corrupted()) {
+ n_rows = HA_ERR_INDEX_CORRUPT;
+ goto func_exit;
+ }
+ if (!row_merge_is_index_usable(m_prebuilt->trx, index)) {
+ n_rows = HA_ERR_TABLE_DEF_CHANGED;
+ goto func_exit;
+ }
+
+ heap = mem_heap_create(2 * (key->ext_key_parts * sizeof(dfield_t)
+ + sizeof(dtuple_t)));
+
+ range_start = dtuple_create(heap, key->ext_key_parts);
+ dict_index_copy_types(range_start, index, key->ext_key_parts);
+
+ range_end = dtuple_create(heap, key->ext_key_parts);
+ dict_index_copy_types(range_end, index, key->ext_key_parts);
+
+ row_sel_convert_mysql_key_to_innobase(
+ range_start,
+ m_prebuilt->srch_key_val1,
+ m_prebuilt->srch_key_val_len,
+ index,
+ (byte*) (min_key ? min_key->key : (const uchar*) 0),
+ (ulint) (min_key ? min_key->length : 0));
+
+ DBUG_ASSERT(min_key
+ ? range_start->n_fields > 0
+ : range_start->n_fields == 0);
+
+ row_sel_convert_mysql_key_to_innobase(
+ range_end,
+ m_prebuilt->srch_key_val2,
+ m_prebuilt->srch_key_val_len,
+ index,
+ (byte*) (max_key ? max_key->key : (const uchar*) 0),
+ (ulint) (max_key ? max_key->length : 0));
+
+ DBUG_ASSERT(max_key
+ ? range_end->n_fields > 0
+ : range_end->n_fields == 0);
+
+ mode1 = convert_search_mode_to_innobase(
+ min_key ? min_key->flag : HA_READ_KEY_EXACT);
+
+ mode2 = convert_search_mode_to_innobase(
+ max_key ? max_key->flag : HA_READ_KEY_EXACT);
+
+ if (mode1 != PAGE_CUR_UNSUPP && mode2 != PAGE_CUR_UNSUPP) {
+
+ if (dict_index_is_spatial(index)) {
+ /* Only min_key is used for a spatial index. */
+ n_rows = rtr_estimate_n_rows_in_range(
+ index, range_start, mode1);
+ } else {
+ btr_pos_t tuple1(range_start, mode1, pages->first_page);
+ btr_pos_t tuple2(range_end, mode2, pages->last_page);
+ n_rows = btr_estimate_n_rows_in_range(
+ index, &tuple1, &tuple2);
+ pages->first_page= tuple1.page_id.raw();
+ pages->last_page= tuple2.page_id.raw();
+ }
+ } else {
+
+ n_rows = HA_POS_ERROR;
+ }
+
+ mem_heap_free(heap);
+
+ DBUG_EXECUTE_IF(
+ "print_btr_estimate_n_rows_in_range_return_value",
+ push_warning_printf(
+ ha_thd(), Sql_condition::WARN_LEVEL_WARN,
+ ER_NO_DEFAULT,
+ "btr_estimate_n_rows_in_range(): %lld",
+ (longlong) n_rows);
+ );
+
+func_exit:
+
+ m_prebuilt->trx->op_info = (char*)"";
+
+ /* The MySQL optimizer seems to believe an estimate of 0 rows is
+ always accurate and may return the result 'Empty set' based on that.
+ The accuracy is not guaranteed, and even if it were, for a locking
+ read we should anyway perform the search to set the next-key lock.
+ Add 1 to the value to make sure MySQL does not make the assumption! */
+
+ if (n_rows == 0) {
+ n_rows = 1;
+ }
+
+ DBUG_RETURN((ha_rows) n_rows);
+}
+
+/*********************************************************************//**
+Gives an UPPER BOUND on the number of rows in a table. This is used in
+filesort.cc.
+@return upper bound of rows */
+
+ha_rows
+ha_innobase::estimate_rows_upper_bound()
+/*====================================*/
+{
+ const dict_index_t* index;
+ ulonglong estimate;
+ ulonglong local_data_file_length;
+
+ DBUG_ENTER("estimate_rows_upper_bound");
+
+ /* We do not know if MySQL can call this function before calling
+ external_lock(). To be safe, update the thd of the current table
+ handle. */
+
+ update_thd(ha_thd());
+
+ m_prebuilt->trx->op_info = "calculating upper bound for table rows";
+
+ index = dict_table_get_first_index(m_prebuilt->table);
+
+ ulint stat_n_leaf_pages = index->stat_n_leaf_pages;
+
+ ut_a(stat_n_leaf_pages > 0);
+
+ local_data_file_length = ulonglong(stat_n_leaf_pages)
+ << srv_page_size_shift;
+
+ /* Calculate a minimum length for a clustered index record and from
+ that an upper bound for the number of rows. Since we only calculate
+ new statistics in row0mysql.cc when a table has grown by a threshold
+ factor, we must add a safety factor 2 in front of the formula below. */
+
+ estimate = 2 * local_data_file_length
+ / dict_index_calc_min_rec_len(index);
+
+ m_prebuilt->trx->op_info = "";
+
+ /* Set num_rows less than MERGEBUFF to simulate the case where we do
+ not have enough space to merge the externally sorted file blocks. */
+ DBUG_EXECUTE_IF("set_num_rows_lt_MERGEBUFF",
+ estimate = 2;
+ DBUG_SET("-d,set_num_rows_lt_MERGEBUFF");
+ );
+
+ DBUG_RETURN((ha_rows) estimate);
+}
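+
+/* Worked example of the bound above (numbers are illustrative): with
+stat_n_leaf_pages = 100 and a 16 KiB page size (srv_page_size_shift = 14),
+local_data_file_length = 100 << 14 = 1638400 bytes; if
+dict_index_calc_min_rec_len() returned 20, the estimate would be
+2 * 1638400 / 20 = 163840 rows. */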
+
+/*********************************************************************//**
+How many seeks it will take to read through the table. This is to be
+comparable to the number returned by records_in_range so that we can
+decide if we should scan the table or use keys.
+@return estimated time measured in disk seeks */
+
+double
+ha_innobase::scan_time()
+/*====================*/
+{
+ /* Since MySQL seems to favor table scans too much over index
+ searches, we pretend that a sequential read takes the same time
+ as a random disk read, that is, we do not divide the following
+ by 10, which would be physically realistic. */
+
+ /* The locking below is disabled for performance reasons. Without
+ it we could end up returning an uninitialized value to the caller,
+ which in the worst case could make some query plan go bogus or
+ issue a Valgrind warning. */
+ if (m_prebuilt == NULL) {
+ /* In the case of a derived table, the optimizer will try to
+ fetch statistics for the table even before it is created or
+ opened. In such cases, return a default value of 1.
+ TODO: This could be improved to return an approximate
+ estimate, but that would also need pre-population of the stats
+ structure. As of now the approach is in sync with MyISAM. */
+ return(ulonglong2double(stats.data_file_length) / IO_SIZE + 2);
+ }
+
+ ulint stat_clustered_index_size;
+
+ ut_a(m_prebuilt->table->stat_initialized);
+
+ stat_clustered_index_size =
+ m_prebuilt->table->stat_clustered_index_size;
+
+ return((double) stat_clustered_index_size);
+}
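+
+/* For example (illustrative numbers): a clustered index of
+stat_clustered_index_size = 1000 pages yields scan_time() = 1000.0, that
+is, one "seek" per page, with no discount for sequential reads. */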
+
+/******************************************************************//**
+Calculate the time it takes to read a set of ranges through an index.
+This enables us to optimize reads for clustered indexes.
+@return estimated time measured in disk seeks */
+
+double
+ha_innobase::read_time(
+/*===================*/
+ uint index, /*!< in: key number */
+ uint ranges, /*!< in: how many ranges */
+ ha_rows rows) /*!< in: estimated number of rows in the ranges */
+{
+ ha_rows total_rows;
+
+ if (index != table->s->primary_key) {
+ /* Not clustered */
+ return(handler::read_time(index, ranges, rows));
+ }
+
+ /* Assume that the read time is proportional to the scan time for all
+ rows + at most one seek per range. */
+
+ double time_for_scan = scan_time();
+
+ if ((total_rows = estimate_rows_upper_bound()) < rows) {
+
+ return(time_for_scan);
+ }
+
+ return(ranges + (double) rows / (double) total_rows * time_for_scan);
+}
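+
+/* Worked example (illustrative numbers): for a clustered-index read with
+ranges = 2, rows = 500, total_rows = 10000 and scan_time() = 100, the
+estimate is 2 + (500 / 10000) * 100 = 7 seeks. */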
+
+/** Update the system variable with the given value of the InnoDB
+buffer pool size.
+@param[in] buf_pool_size given value of buffer pool size.*/
+void
+innodb_set_buf_pool_size(ulonglong buf_pool_size)
+{
+ innobase_buffer_pool_size = buf_pool_size;
+}
+
+/*********************************************************************//**
+Calculates the key number used inside MySQL for an Innobase index.
+@return the key number used inside MySQL */
+static
+unsigned
+innobase_get_mysql_key_number_for_index(
+/*====================================*/
+ const TABLE* table, /*!< in: table in MySQL data
+ dictionary */
+ dict_table_t* ib_table,/*!< in: table in InnoDB data
+ dictionary */
+ const dict_index_t* index) /*!< in: index */
+{
+ const dict_index_t* ind;
+ unsigned int i;
+
+ /* If the index does not belong to the table object of the share
+ structure (ib_table comes from the share structure), search the
+ index->table object instead */
+ if (index->table != ib_table) {
+ i = 0;
+ ind = dict_table_get_first_index(index->table);
+
+ while (index != ind) {
+ ind = dict_table_get_next_index(ind);
+ i++;
+ }
+
+ if (dict_index_is_auto_gen_clust(index)) {
+ ut_a(i > 0);
+ i--;
+ }
+
+ return(i);
+ }
+
+ /* Directly find matching index with information from mysql TABLE
+ structure and InnoDB dict_index_t list */
+ for (i = 0; i < table->s->keys; i++) {
+ ind = dict_table_get_index_on_name(
+ ib_table, table->key_info[i].name.str);
+
+ if (index == ind) {
+ return(i);
+ }
+ }
+
+ /* Loop through each index of the table */
+ for (ind = dict_table_get_first_index(ib_table);
+ ind != NULL;
+ ind = dict_table_get_next_index(ind)) {
+ if (index == ind) {
+ /* Temp index is internal to InnoDB, that is
+ not present in the MySQL index list, so no
+ need to print such mismatch warning. */
+ if (index->is_committed()) {
+ sql_print_warning(
+ "Found index %s in InnoDB index list"
+ " but not its MariaDB index number."
+ " It could be an InnoDB internal"
+ " index.",
+ index->name());
+ }
+ return(~0U);
+ }
+ }
+
+ ut_error;
+
+ return(~0U);
+}
+
+/*********************************************************************//**
+Calculate the records-per-key value. NULL values must be excluded if
+innodb_stats_method is set to "nulls_ignored".
+@return estimated record per key value */
+rec_per_key_t
+innodb_rec_per_key(
+/*===============*/
+ dict_index_t* index, /*!< in: dict_index_t structure */
+ ulint i, /*!< in: the column we are
+ calculating rec per key */
+ ha_rows records) /*!< in: estimated total records */
+{
+ rec_per_key_t rec_per_key;
+ ib_uint64_t n_diff;
+
+ ut_a(index->table->stat_initialized);
+
+ ut_ad(i < dict_index_get_n_unique(index));
+ ut_ad(!dict_index_is_spatial(index));
+
+ if (records == 0) {
+ /* "Records per key" is meaningless for empty tables.
+ Return 1.0 because that is most convenient to the Optimizer. */
+ return(1.0);
+ }
+
+ n_diff = index->stat_n_diff_key_vals[i];
+
+ if (n_diff == 0) {
+
+ rec_per_key = static_cast<rec_per_key_t>(records);
+ } else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) {
+ ib_uint64_t n_null;
+ ib_uint64_t n_non_null;
+
+ n_non_null = index->stat_n_non_null_key_vals[i];
+
+ /* In theory, index->stat_n_non_null_key_vals[i]
+ should always be less than the number of records.
+ Since this is a statistics value, it could have a
+ slight discrepancy, but we make sure that the
+ number of NULL values is not negative. */
+ if (records < n_non_null) {
+ n_null = 0;
+ } else {
+ n_null = records - n_non_null;
+ }
+
+ /* If the number of NULL values is the same as or
+ larger than the number of distinct values, we can
+ consider that the table consists mostly of NULL
+ values. Set rec_per_key to 1. */
+ if (n_diff <= n_null) {
+ rec_per_key = 1.0;
+ } else {
+ /* Need to exclude rows with NULL values from
+ rec_per_key calculation */
+ rec_per_key
+ = static_cast<rec_per_key_t>(records - n_null)
+ / static_cast<rec_per_key_t>(n_diff - n_null);
+ }
+ } else {
+ DEBUG_SYNC_C("after_checking_for_0");
+ rec_per_key = static_cast<rec_per_key_t>(records)
+ / static_cast<rec_per_key_t>(n_diff);
+ }
+
+ if (rec_per_key < 1.0) {
+ /* Values below 1.0 are meaningless and must be due to the
+ stats being imprecise. */
+ rec_per_key = 1.0;
+ }
+
+ return(rec_per_key);
+}
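+
+/* Worked example of the "nulls_ignored" branch above (illustrative
+numbers): with records = 1000, n_diff = 400 and n_non_null = 900, we get
+n_null = 100; since n_diff > n_null,
+rec_per_key = (1000 - 100) / (400 - 100) = 3.0. */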
+
+/** Calculate how many KiB of new data we will be able to insert into the
+tablespace without running out of space. The caller must have acquired the
+tablespace object with fil_space_acquire() and must hold it for the
+duration of the calculation.
+@param[in] space tablespace object from fil_space_acquire()
+@return available space in KiB */
+static uintmax_t
+fsp_get_available_space_in_free_extents(const fil_space_t& space)
+{
+ ulint size_in_header = space.size_in_header;
+ if (size_in_header < FSP_EXTENT_SIZE) {
+ return 0; /* TODO: count free frag pages and
+ return a value based on that */
+ }
+
+ /* Below we play safe when counting free extents above the free limit:
+ some of them will contain extent descriptor pages, and therefore
+ will not be free extents */
+ ut_ad(size_in_header >= space.free_limit);
+ ulint n_free_up =
+ (size_in_header - space.free_limit) / FSP_EXTENT_SIZE;
+
+ const ulint size = space.physical_size();
+ if (n_free_up > 0) {
+ n_free_up--;
+ n_free_up -= n_free_up / (size / FSP_EXTENT_SIZE);
+ }
+
+ /* We reserve 1 extent + 0.5 % of the space size to undo logs
+ and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+ code is duplicated in the function above! */
+
+ ulint reserve = 2 + ((size_in_header / FSP_EXTENT_SIZE) * 2) / 200;
+ ulint n_free = space.free_len + n_free_up;
+
+ if (reserve > n_free) {
+ return(0);
+ }
+
+ return(static_cast<uintmax_t>(n_free - reserve)
+ * FSP_EXTENT_SIZE * (size / 1024));
+}
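+
+/* Worked example (illustrative numbers; with 16 KiB pages,
+FSP_EXTENT_SIZE = 64): for size_in_header = 6400 pages,
+free_limit = 6400 and free_len = 10, we get n_free_up = 0 and
+reserve = 2 + ((6400 / 64) * 2) / 200 = 3, so the function returns
+(10 - 3) * 64 * (16384 / 1024) = 7168 KiB. */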
+
+/*********************************************************************//**
+Returns statistics information of the table to the MySQL interpreter,
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
+
+int
+ha_innobase::info_low(
+/*==================*/
+ uint flag, /*!< in: what information is requested */
+ bool is_analyze)
+{
+ dict_table_t* ib_table;
+ ib_uint64_t n_rows;
+ char path[FN_REFLEN];
+ os_file_stat_t stat_info;
+
+ DBUG_ENTER("info");
+
+ DEBUG_SYNC_C("ha_innobase_info_low");
+
+ ut_ad(!mutex_own(&dict_sys.mutex));
+
+ /* If we are forcing recovery at a high level, we will suppress
+ statistics calculation on tables, because that may crash the
+ server if an index is badly corrupted. */
+
+ /* We do not know if MySQL can call this function before calling
+ external_lock(). To be safe, update the thd of the current table
+ handle. */
+
+ update_thd(ha_thd());
+
+ m_prebuilt->trx->op_info = "returning various info to MariaDB";
+
+ ib_table = m_prebuilt->table;
+ DBUG_ASSERT(ib_table->get_ref_count() > 0);
+
+ if (!ib_table->is_readable()) {
+ ib_table->stat_initialized = true;
+ }
+
+ if (flag & HA_STATUS_TIME) {
+ if (is_analyze || innobase_stats_on_metadata) {
+
+ dict_stats_upd_option_t opt;
+ dberr_t ret;
+
+ m_prebuilt->trx->op_info = "updating table statistics";
+
+ if (dict_stats_is_persistent_enabled(ib_table)) {
+
+ if (is_analyze) {
+ row_mysql_lock_data_dictionary(
+ m_prebuilt->trx);
+ dict_stats_recalc_pool_del(ib_table);
+ dict_stats_wait_bg_to_stop_using_table(
+ ib_table, m_prebuilt->trx);
+ row_mysql_unlock_data_dictionary(
+ m_prebuilt->trx);
+ opt = DICT_STATS_RECALC_PERSISTENT;
+ } else {
+ /* This is e.g. 'SHOW INDEXES', fetch
+ the persistent stats from disk. */
+ opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY;
+ }
+ } else {
+ opt = DICT_STATS_RECALC_TRANSIENT;
+ }
+
+ ret = dict_stats_update(ib_table, opt);
+
+ if (opt == DICT_STATS_RECALC_PERSISTENT) {
+ mutex_enter(&dict_sys.mutex);
+ ib_table->stats_bg_flag
+ &= byte(~BG_STAT_SHOULD_QUIT);
+ mutex_exit(&dict_sys.mutex);
+ }
+
+ if (ret != DB_SUCCESS) {
+ m_prebuilt->trx->op_info = "";
+ DBUG_RETURN(HA_ERR_GENERIC);
+ }
+
+ m_prebuilt->trx->op_info =
+ "returning various info to MariaDB";
+ }
+
+
+ stats.update_time = (ulong) ib_table->update_time;
+ }
+
+ DBUG_EXECUTE_IF("dict_sys_mutex_avoid", goto func_exit;);
+
+ dict_stats_init(ib_table);
+
+ if (flag & HA_STATUS_VARIABLE) {
+
+ ulint stat_clustered_index_size;
+ ulint stat_sum_of_other_index_sizes;
+
+ mutex_enter(&dict_sys.mutex);
+
+ ut_a(ib_table->stat_initialized);
+
+ n_rows = ib_table->stat_n_rows;
+
+ stat_clustered_index_size
+ = ib_table->stat_clustered_index_size;
+
+ stat_sum_of_other_index_sizes
+ = ib_table->stat_sum_of_other_index_sizes;
+
+ mutex_exit(&dict_sys.mutex);
+
+ /*
+ The MySQL optimizer seems to assume in a left join that n_rows
+ is an accurate estimate if it is zero. Of course, it is not,
+ since we do not have any locks on the rows yet at this phase.
+ Since SHOW TABLE STATUS seems to call this function with the
+ HA_STATUS_TIME flag set, while the left join optimizer does not
+ set that flag, we add one to a zero value if the flag is not
+ set. That way SHOW TABLE STATUS will show the best estimate,
+ while the optimizer never sees the table empty. */
+
+ if (n_rows == 0 && !(flag & (HA_STATUS_TIME | HA_STATUS_OPEN))) {
+ n_rows++;
+ }
+
+ /* Fix bug#40386: Not flushing query cache after truncate.
+ n_rows cannot be 0 unless the table is empty; set it to 1
+ instead. The original problem of bug#29507 is actually
+ fixed in the server code. */
+ if (thd_sql_command(m_user_thd) == SQLCOM_TRUNCATE) {
+
+ n_rows = 1;
+
+ /* We need to reset the m_prebuilt value too, otherwise
+ checks for values greater than the last value written
+ to the table will fail and the autoinc counter will
+ not be updated. This will force write_row() into
+ attempting an update of the table's AUTOINC counter. */
+
+ m_prebuilt->autoinc_last_value = 0;
+ }
+
+ stats.records = (ha_rows) n_rows;
+ stats.deleted = 0;
+ if (fil_space_t* space = ib_table->space) {
+ const ulint size = space->physical_size();
+ stats.data_file_length
+ = ulonglong(stat_clustered_index_size)
+ * size;
+ stats.index_file_length
+ = ulonglong(stat_sum_of_other_index_sizes)
+ * size;
+ stats.delete_length = 1024
+ * fsp_get_available_space_in_free_extents(
+ *space);
+ }
+ stats.check_time = 0;
+ stats.mrr_length_per_rec= (uint)ref_length + 8; // 8 = max(sizeof(void *));
+
+ if (stats.records == 0) {
+ stats.mean_rec_length = 0;
+ } else {
+ stats.mean_rec_length = (ulong)
+ (stats.data_file_length / stats.records);
+ }
+ }
+
+ if (flag & HA_STATUS_CONST) {
+ /* Verify that the number of indexes in InnoDB and MySQL
+ matches. If m_prebuilt->clust_index_was_generated
+ holds, InnoDB defines GEN_CLUST_INDEX internally */
+ ulint num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes)
+ - m_prebuilt->clust_index_was_generated;
+ if (table->s->keys < num_innodb_index) {
+ /* If there are too many indexes defined
+ inside InnoDB, ignore those that are being
+ created, because MySQL will only consider
+ the fully built indexes here. */
+
+ for (const dict_index_t* index
+ = UT_LIST_GET_FIRST(ib_table->indexes);
+ index != NULL;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ /* First, online index creation is
+ completed inside InnoDB, and then
+ MySQL attempts to upgrade the
+ meta-data lock so that it can rebuild
+ the .frm file. If we get here in that
+ time frame, dict_index_is_online_ddl()
+ would not hold and the index would
+ still not be included in TABLE_SHARE. */
+ if (!index->is_committed()) {
+ num_innodb_index--;
+ }
+ }
+
+ if (table->s->keys < num_innodb_index
+ && innobase_fts_check_doc_id_index(
+ ib_table, NULL, NULL)
+ == FTS_EXIST_DOC_ID_INDEX) {
+ num_innodb_index--;
+ }
+ }
+
+ if (table->s->keys != num_innodb_index) {
+ ib_table->dict_frm_mismatch = DICT_FRM_INCONSISTENT_KEYS;
+ ib_push_frm_error(m_user_thd, ib_table, table, num_innodb_index, true);
+ }
+
+ snprintf(path, sizeof(path), "%s/%s%s",
+ mysql_data_home, table->s->normalized_path.str,
+ reg_ext);
+
+ unpack_filename(path,path);
+
+ /* Note that we do not know the access time of the table,
+ nor the CHECK TABLE time, nor the UPDATE or INSERT time. */
+
+ if (os_file_get_status(
+ path, &stat_info, false,
+ srv_read_only_mode) == DB_SUCCESS) {
+ stats.create_time = (ulong) stat_info.ctime;
+ }
+
+ struct Locking {
+ Locking() { mutex_enter(&dict_sys.mutex); }
+ ~Locking() { mutex_exit(&dict_sys.mutex); }
+ } locking;
+
+ ut_a(ib_table->stat_initialized);
+
+ for (uint i = 0; i < table->s->keys; i++) {
+ ulong j;
+
+ dict_index_t* index = innobase_get_index(i);
+
+ if (index == NULL) {
+ ib_table->dict_frm_mismatch = DICT_FRM_INCONSISTENT_KEYS;
+ ib_push_frm_error(m_user_thd, ib_table, table, num_innodb_index, true);
+ break;
+ }
+
+ KEY* key = &table->key_info[i];
+
+ for (j = 0; j < key->ext_key_parts; j++) {
+
+ if ((key->flags & HA_FULLTEXT)
+ || (key->flags & HA_SPATIAL)) {
+
+ /* The records-per-key value does not apply
+ to FTS or spatial indexes. */
+ /*
+ key->rec_per_key[j] = 1;
+ key->set_records_per_key(j, 1.0);
+ */
+ continue;
+ }
+
+ if (j + 1 > index->n_uniq) {
+ sql_print_error(
+ "Index %s of %s has %u columns"
+ " unique inside InnoDB, but"
+ " MySQL is asking statistics for"
+ " %lu columns. Have you mixed"
+ " up .frm files from different"
+ " installations? %s",
+ index->name(),
+ ib_table->name.m_name,
+ index->n_uniq, j + 1,
+ TROUBLESHOOTING_MSG);
+ break;
+ }
+
+ /* innodb_rec_per_key() will use
+ index->stat_n_diff_key_vals[] and the value we
+ pass index->table->stat_n_rows. Both are
+ calculated by ANALYZE and by the background
+ stats gathering thread (which kicks in when too
+ much of the table has been changed). In
+ addition table->stat_n_rows is adjusted with
+ each DML (e.g. ++ on row insert). Those
+ adjustments are not MVCC'ed and not even
+ reversed on rollback. So,
+ index->stat_n_diff_key_vals[] and
+ index->table->stat_n_rows could have been
+ calculated at different times. This is
+ acceptable. */
+
+ ulong rec_per_key_int = static_cast<ulong>(
+ innodb_rec_per_key(index, j,
+ stats.records));
+
+ /* Since MySQL seems to favor table scans
+ too much over index searches, we pretend
+ index selectivity is 2 times better than
+ our estimate: */
+
+ rec_per_key_int = rec_per_key_int / 2;
+
+ if (rec_per_key_int == 0) {
+ rec_per_key_int = 1;
+ }
+
+ key->rec_per_key[j] = rec_per_key_int;
+ }
+ }
+ }
+
+ if (srv_force_recovery > SRV_FORCE_NO_IBUF_MERGE) {
+
+ goto func_exit;
+
+ } else if (flag & HA_STATUS_ERRKEY) {
+ const dict_index_t* err_index;
+
+ ut_a(m_prebuilt->trx);
+ ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N);
+
+ err_index = trx_get_error_info(m_prebuilt->trx);
+
+ if (err_index) {
+ errkey = innobase_get_mysql_key_number_for_index(
+ table, ib_table, err_index);
+ } else {
+ errkey = (unsigned int) (
+ (m_prebuilt->trx->error_key_num
+ == ULINT_UNDEFINED)
+ ? ~0U
+ : m_prebuilt->trx->error_key_num);
+ }
+ }
+
+ if ((flag & HA_STATUS_AUTO) && table->found_next_number_field) {
+ stats.auto_increment_value = innobase_peek_autoinc();
+ }
+
+func_exit:
+ m_prebuilt->trx->op_info = (char*)"";
+
+ DBUG_RETURN(0);
+}
+
+/*********************************************************************//**
+Returns statistics information of the table to the MySQL interpreter,
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
+
+int
+ha_innobase::info(
+/*==============*/
+ uint flag) /*!< in: what information is requested */
+{
+ return(info_low(flag, false /* not ANALYZE */));
+}
+
+/*
+Updates index cardinalities of the table, based on random dives into
+each index tree. This does NOT calculate exact statistics on the table.
+@return HA_ADMIN_* error code or HA_ADMIN_OK */
+
+int
+ha_innobase::analyze(THD*, HA_CHECK_OPT*)
+{
+ /* Simply call info_low() with all the flags
+ and request recalculation of the statistics */
+ int ret = info_low(
+ HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE,
+ true /* this is ANALYZE */);
+
+ if (ret != 0) {
+ return(HA_ADMIN_FAILED);
+ }
+
+ return(HA_ADMIN_OK);
+}
+
+/*****************************************************************//**
+Defragment table.
+@return error number */
+inline int ha_innobase::defragment_table(const char *name)
+{
+ char norm_name[FN_REFLEN];
+ dict_table_t* table = NULL;
+ dict_index_t* index = NULL;
+ int ret = 0;
+ dberr_t err = DB_SUCCESS;
+
+ normalize_table_name(norm_name, name);
+
+ table = dict_table_open_on_name(norm_name, FALSE,
+ FALSE, DICT_ERR_IGNORE_FK_NOKEY);
+
+ for (index = dict_table_get_first_index(table); index;
+ index = dict_table_get_next_index(index)) {
+
+ if (index->is_corrupted()) {
+ continue;
+ }
+
+ if (dict_index_is_spatial(index)) {
+ /* Do not try to defragment spatial indexes,
+ because doing it properly would require
+ appropriate logic around the SSN (split
+ sequence number). */
+ continue;
+ }
+
+ if (index->page == FIL_NULL) {
+ /* Do not defragment auxiliary tables related
+ to FULLTEXT INDEX. */
+ ut_ad(index->type & DICT_FTS);
+ continue;
+ }
+
+ if (btr_defragment_find_index(index)) {
+ // We borrow this error code. When the same index is
+ // already in the defragmentation queue, issuing another
+ // defragmentation only introduces overhead. We return
+ // an error here to let the user know this is not
+ // necessary. Note that this will fail a query that is
+ // trying to defragment a full table if one of the
+ // indices in that table is already being defragmented.
+ // We choose this behavior so the user is aware of this,
+ // rather than silently defragmenting the other indices
+ // of that table.
+ ret = ER_SP_ALREADY_EXISTS;
+ break;
+ }
+
+ os_event_t event = btr_defragment_add_index(index, &err);
+
+ if (err != DB_SUCCESS) {
+ push_warning_printf(
+ current_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_NO_SUCH_TABLE,
+ "Table %s is encrypted but encryption service or"
+ " used key_id is not available. "
+ " Can't continue checking table.",
+ index->table->name.m_name);
+
+ ret = convert_error_code_to_mysql(err, 0, current_thd);
+ break;
+ }
+
+ if (event) {
+ while(os_event_wait_time(event, 1000000)) {
+ if (thd_killed(current_thd)) {
+ btr_defragment_remove_index(index);
+ ret = ER_QUERY_INTERRUPTED;
+ break;
+ }
+ }
+ os_event_destroy(event);
+ }
+
+ if (ret) {
+ break;
+ }
+ }
+
+ dict_table_close(table, FALSE, FALSE);
+ return ret;
+}
+
+/**********************************************************************//**
+This is mapped to "ALTER TABLE tablename ENGINE=InnoDB", which rebuilds
+the table in MySQL. */
+
+int
+ha_innobase::optimize(
+/*==================*/
+ THD* thd, /*!< in: connection thread handle */
+ HA_CHECK_OPT*)
+{
+
+ /* FTS-FIXME: Since MySQL doesn't support engine-specific commands,
+ we have to hijack some existing command in order to be able to test
+ the new admin commands added in InnoDB's FTS support. For now, we
+ use MySQL's OPTIMIZE command, normally mapped to ALTER TABLE in
+ InnoDB (so that it recreates the table anew), and map it to the FTS
+ optimize operation instead.
+
+ This works OK otherwise, but MySQL locks the entire table during
+ calls to OPTIMIZE, which is undesirable. */
+ bool try_alter = true;
+
+ if (!m_prebuilt->table->is_temporary() && srv_defragment) {
+ int err = defragment_table(m_prebuilt->table->name.m_name);
+
+ if (err == 0) {
+ try_alter = false;
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ uint(err),
+ "InnoDB: Cannot defragment table %s: returned error code %d\n",
+ m_prebuilt->table->name.m_name, err);
+
+ if (err == ER_SP_ALREADY_EXISTS) {
+ try_alter = false;
+ }
+ }
+ }
+
+ if (innodb_optimize_fulltext_only) {
+ if (m_prebuilt->table->fts && m_prebuilt->table->fts->cache
+ && m_prebuilt->table->space) {
+ fts_sync_table(m_prebuilt->table);
+ fts_optimize_table(m_prebuilt->table);
+ }
+ try_alter = false;
+ }
+
+ return try_alter ? HA_ADMIN_TRY_ALTER : HA_ADMIN_OK;
+}
+
+/*******************************************************************//**
+Tries to check that an InnoDB table is not corrupted. If corruption is
+noticed, prints information about it to stderr. In case of corruption,
+this may also raise an assertion failure and crash the server.
+@return HA_ADMIN_CORRUPT or HA_ADMIN_OK */
+
+int
+ha_innobase::check(
+/*===============*/
+ THD* thd, /*!< in: user thread handle */
+ HA_CHECK_OPT* check_opt) /*!< in: check options */
+{
+ dict_index_t* index;
+ ulint n_rows;
+ ulint n_rows_in_table = ULINT_UNDEFINED;
+ bool is_ok = true;
+ dberr_t ret;
+
+ DBUG_ENTER("ha_innobase::check");
+ DBUG_ASSERT(thd == ha_thd());
+ ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N);
+ ut_a(m_prebuilt->trx == thd_to_trx(thd));
+
+ if (m_prebuilt->mysql_template == NULL) {
+ /* Build the template; we will use a dummy template
+ in index scans done in checking */
+
+ build_template(true);
+ }
+
+ if (!m_prebuilt->table->space) {
+
+ ib_senderrf(
+ thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+ DBUG_RETURN(HA_ADMIN_CORRUPT);
+
+ } else if (!m_prebuilt->table->is_readable() &&
+ !m_prebuilt->table->space) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_MISSING,
+ table->s->table_name.str);
+
+ DBUG_RETURN(HA_ADMIN_CORRUPT);
+ }
+
+ m_prebuilt->trx->op_info = "checking table";
+
+ if (m_prebuilt->table->corrupted) {
+ /* If some previous operation has marked the table as
+ corrupted in memory, and has not propagated that to the
+ clustered index, we will do so here */
+ index = dict_table_get_first_index(m_prebuilt->table);
+
+ if (!index->is_corrupted()) {
+ dict_set_corrupted(
+ index, m_prebuilt->trx, "CHECK TABLE");
+ }
+
+ push_warning_printf(m_user_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_INDEX_CORRUPT,
+ "InnoDB: Index %s is marked as"
+ " corrupted",
+ index->name());
+
+ /* Now that the table is already marked as corrupted,
+ there is no need to check any index of this table */
+ m_prebuilt->trx->op_info = "";
+
+ DBUG_RETURN(HA_ADMIN_CORRUPT);
+ }
+
+ uint old_isolation_level = m_prebuilt->trx->isolation_level;
+
+ /* We must run the index record counts at an isolation level
+ >= READ COMMITTED, because a dirty read can see a wrong number
+ of records in some index; to play safe, we normally use
+ REPEATABLE READ here */
+ m_prebuilt->trx->isolation_level = high_level_read_only
+ ? TRX_ISO_READ_UNCOMMITTED
+ : TRX_ISO_REPEATABLE_READ;
+
+ ut_ad(!m_prebuilt->table->corrupted);
+
+ for (index = dict_table_get_first_index(m_prebuilt->table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ /* If this is an index being created or dropped, skip */
+ if (!index->is_committed()) {
+ continue;
+ }
+
+ if (!(check_opt->flags & T_QUICK)
+ && !index->is_corrupted()) {
+
+ dberr_t err = btr_validate_index(
+ index, m_prebuilt->trx);
+
+ if (err != DB_SUCCESS) {
+ is_ok = false;
+
+ if (err == DB_DECRYPTION_FAILED) {
+ push_warning_printf(
+ thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_NO_SUCH_TABLE,
+ "Table %s is encrypted but encryption service or"
+ " used key_id is not available. "
+ " Can't continue checking table.",
+ index->table->name.m_name);
+ } else {
+ push_warning_printf(
+ thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The B-tree of"
+ " index %s is corrupted.",
+ index->name());
+ }
+
+ continue;
+ }
+ }
+
+ /* Instead of invoking change_active_index(), set up
+ a dummy template for non-locking reads, disabling
+ access to the clustered index. */
+ m_prebuilt->index = index;
+
+ m_prebuilt->index_usable = row_merge_is_index_usable(
+ m_prebuilt->trx, m_prebuilt->index);
+
+ DBUG_EXECUTE_IF(
+ "dict_set_index_corrupted",
+ if (!index->is_primary()) {
+ m_prebuilt->index_usable = FALSE;
+ // row_mysql_lock_data_dictionary(m_prebuilt->trx);
+ dict_set_corrupted(index, m_prebuilt->trx, "dict_set_index_corrupted");
+ // row_mysql_unlock_data_dictionary(m_prebuilt->trx);
+ });
+
+ if (UNIV_UNLIKELY(!m_prebuilt->index_usable)) {
+ if (index->is_corrupted()) {
+ push_warning_printf(
+ m_user_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_INDEX_CORRUPT,
+ "InnoDB: Index %s is marked as"
+ " corrupted",
+ index->name());
+ is_ok = false;
+ } else {
+ push_warning_printf(
+ m_user_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_TABLE_DEF_CHANGED,
+ "InnoDB: Insufficient history for"
+ " index %s",
+ index->name());
+ }
+ continue;
+ }
+
+ m_prebuilt->sql_stat_start = TRUE;
+ m_prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE;
+ m_prebuilt->n_template = 0;
+ m_prebuilt->need_to_access_clustered = FALSE;
+
+ dtuple_set_n_fields(m_prebuilt->search_tuple, 0);
+
+ m_prebuilt->select_lock_type = LOCK_NONE;
+
+ /* Scan this index. */
+ if (dict_index_is_spatial(index)) {
+ ret = row_count_rtree_recs(m_prebuilt, &n_rows);
+ } else {
+ ret = row_scan_index_for_mysql(
+ m_prebuilt, index, &n_rows);
+ }
+
+ DBUG_EXECUTE_IF(
+ "dict_set_index_corrupted",
+ if (!index->is_primary()) {
+ ret = DB_CORRUPTION;
+ });
+
+ if (ret == DB_INTERRUPTED || thd_killed(m_user_thd)) {
+ /* Do not report error since this could happen
+ during shutdown */
+ break;
+ }
+ if (ret != DB_SUCCESS) {
+ /* Assume some kind of corruption. */
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The B-tree of"
+ " index %s is corrupted.",
+ index->name());
+ is_ok = false;
+ dict_set_corrupted(
+ index, m_prebuilt->trx, "CHECK TABLE-check index");
+ }
+
+
+ if (index == dict_table_get_first_index(m_prebuilt->table)) {
+ n_rows_in_table = n_rows;
+ } else if (!(index->type & DICT_FTS)
+ && (n_rows != n_rows_in_table)) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: Index '%-.200s' contains " ULINTPF
+ " entries, should be " ULINTPF ".",
+ index->name(), n_rows, n_rows_in_table);
+ is_ok = false;
+ dict_set_corrupted(
+ index, m_prebuilt->trx,
+ "CHECK TABLE; Wrong count");
+ }
+ }
+
+ /* Restore the original isolation level */
+ m_prebuilt->trx->isolation_level = old_isolation_level;
+#ifdef BTR_CUR_HASH_ADAPT
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ /* We validate the whole adaptive hash index for all tables
+ at every CHECK TABLE only when QUICK flag is not present. */
+
+ if (!(check_opt->flags & T_QUICK) && !btr_search_validate()) {
+ push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The adaptive hash index is corrupted.");
+ is_ok = false;
+ }
+# endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+#endif /* BTR_CUR_HASH_ADAPT */
+ m_prebuilt->trx->op_info = "";
+
+ DBUG_RETURN(is_ok ? HA_ADMIN_OK : HA_ADMIN_CORRUPT);
+}
+
+/*******************************************************************//**
+Gets the foreign key create info for a table stored in InnoDB.
+@return own: character string in a form that can be inserted into a
+CREATE TABLE statement; MUST be freed with
+ha_innobase::free_foreign_key_create_info */
+
+char*
+ha_innobase::get_foreign_key_create_info(void)
+/*==========================================*/
+{
+ ut_a(m_prebuilt != NULL);
+
+ /* We do not know if MySQL can call this function before calling
+ external_lock(). To be safe, update the thd of the current table
+ handle. */
+
+ update_thd(ha_thd());
+
+ m_prebuilt->trx->op_info = "getting info on foreign keys";
+
+ /* Output the data to a temporary string */
+ std::string str = dict_print_info_on_foreign_keys(
+ TRUE, m_prebuilt->trx,
+ m_prebuilt->table);
+
+ m_prebuilt->trx->op_info = "";
+
+ /* Allocate buffer for the string */
+ char *fk_str = reinterpret_cast<char*>(
+ my_malloc(PSI_INSTRUMENT_ME, str.length() + 1, MYF(0)));
+
+ if (fk_str) {
+ memcpy(fk_str, str.c_str(), str.length());
+ fk_str[str.length()]='\0';
+ }
+
+ return(fk_str);
+}
+
+
+/***********************************************************************//**
+Maps an InnoDB foreign key constraint to an equivalent MySQL foreign key info.
+@return pointer to foreign key info */
+static
+FOREIGN_KEY_INFO*
+get_foreign_key_info(
+/*=================*/
+ THD* thd, /*!< in: user thread handle */
+ dict_foreign_t* foreign)/*!< in: foreign key constraint */
+{
+ FOREIGN_KEY_INFO f_key_info;
+ FOREIGN_KEY_INFO* pf_key_info;
+ uint i = 0;
+ size_t len;
+ char tmp_buff[NAME_LEN+1];
+ char name_buff[NAME_LEN+1];
+ const char* ptr;
+ LEX_CSTRING* referenced_key_name;
+ LEX_CSTRING* name = NULL;
+
+ if (dict_table_t::is_temporary_name(foreign->foreign_table_name)) {
+ return NULL;
+ }
+
+ ptr = dict_remove_db_name(foreign->id);
+ f_key_info.foreign_id = thd_make_lex_string(
+ thd, 0, ptr, strlen(ptr), 1);
+
+ /* Name format: database name, '/', table name, '\0' */
+
+ /* Referenced (parent) database name */
+ len = dict_get_db_name_len(foreign->referenced_table_name);
+ ut_a(len < sizeof(tmp_buff));
+ memcpy(tmp_buff, foreign->referenced_table_name, len);
+ tmp_buff[len] = 0;
+
+ len = filename_to_tablename(tmp_buff, name_buff, sizeof(name_buff));
+ f_key_info.referenced_db = thd_make_lex_string(
+ thd, 0, name_buff, len, 1);
+
+ /* Referenced (parent) table name */
+ ptr = dict_remove_db_name(foreign->referenced_table_name);
+ len = filename_to_tablename(ptr, name_buff, sizeof(name_buff), 1);
+ f_key_info.referenced_table = thd_make_lex_string(
+ thd, 0, name_buff, len, 1);
+
+ /* Dependent (child) database name */
+ len = dict_get_db_name_len(foreign->foreign_table_name);
+ ut_a(len < sizeof(tmp_buff));
+ memcpy(tmp_buff, foreign->foreign_table_name, len);
+ tmp_buff[len] = 0;
+
+ len = filename_to_tablename(tmp_buff, name_buff, sizeof(name_buff));
+ f_key_info.foreign_db = thd_make_lex_string(
+ thd, 0, name_buff, len, 1);
+
+ /* Dependent (child) table name */
+ ptr = dict_remove_db_name(foreign->foreign_table_name);
+ len = filename_to_tablename(ptr, name_buff, sizeof(name_buff), 1);
+ f_key_info.foreign_table = thd_make_lex_string(
+ thd, 0, name_buff, len, 1);
+
+ do {
+ ptr = foreign->foreign_col_names[i];
+ name = thd_make_lex_string(thd, name, ptr,
+ strlen(ptr), 1);
+ f_key_info.foreign_fields.push_back(name);
+ ptr = foreign->referenced_col_names[i];
+ name = thd_make_lex_string(thd, name, ptr,
+ strlen(ptr), 1);
+ f_key_info.referenced_fields.push_back(name);
+ } while (++i < foreign->n_fields);
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) {
+ f_key_info.delete_method = FK_OPTION_CASCADE;
+ } else if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) {
+ f_key_info.delete_method = FK_OPTION_SET_NULL;
+ } else if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) {
+ f_key_info.delete_method = FK_OPTION_NO_ACTION;
+ } else {
+ f_key_info.delete_method = FK_OPTION_RESTRICT;
+ }
+
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) {
+ f_key_info.update_method = FK_OPTION_CASCADE;
+ } else if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) {
+ f_key_info.update_method = FK_OPTION_SET_NULL;
+ } else if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) {
+ f_key_info.update_method = FK_OPTION_NO_ACTION;
+ } else {
+ f_key_info.update_method = FK_OPTION_RESTRICT;
+ }
+
+ /* Load referenced table to update FK referenced key name. */
+ if (foreign->referenced_table == NULL) {
+
+ dict_table_t* ref_table;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+ ref_table = dict_table_open_on_name(
+ foreign->referenced_table_name_lookup,
+ TRUE, FALSE, DICT_ERR_IGNORE_NONE);
+
+ if (ref_table == NULL) {
+
+ if (!thd_test_options(
+ thd, OPTION_NO_FOREIGN_KEY_CHECKS)) {
+ ib::info()
+ << "Foreign Key referenced table "
+ << foreign->referenced_table_name
+ << " not found for foreign table "
+ << foreign->foreign_table_name;
+ }
+ } else {
+
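+			/* Opening the table pulled its definition into
+			the dictionary cache; as a side effect this
+			fills in foreign->referenced_table and
+			foreign->referenced_index. That is all that was
+			needed, so close the table again. */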
+ dict_table_close(ref_table, TRUE, FALSE);
+ }
+ }
+
+ if (foreign->referenced_index
+ && foreign->referenced_index->name != NULL) {
+ referenced_key_name = thd_make_lex_string(
+ thd,
+ f_key_info.referenced_key_name,
+ foreign->referenced_index->name,
+ strlen(foreign->referenced_index->name),
+ 1);
+ } else {
+ referenced_key_name = NULL;
+ }
+
+ f_key_info.referenced_key_name = referenced_key_name;
+
+ pf_key_info = (FOREIGN_KEY_INFO*) thd_memdup(thd, &f_key_info,
+ sizeof(FOREIGN_KEY_INFO));
+
+ return(pf_key_info);
+}
+
+/*******************************************************************//**
+Gets the list of foreign keys in this table.
+@return always 0, that is, always succeeds */
+
+int
+ha_innobase::get_foreign_key_list(
+/*==============================*/
+ THD* thd, /*!< in: user thread handle */
+ List<FOREIGN_KEY_INFO>* f_key_list) /*!< out: foreign key list */
+{
+ update_thd(ha_thd());
+
+ m_prebuilt->trx->op_info = "getting list of foreign keys";
+
+ mutex_enter(&dict_sys.mutex);
+
+ for (dict_foreign_set::iterator it
+ = m_prebuilt->table->foreign_set.begin();
+ it != m_prebuilt->table->foreign_set.end();
+ ++it) {
+
+ FOREIGN_KEY_INFO* pf_key_info;
+ dict_foreign_t* foreign = *it;
+
+ pf_key_info = get_foreign_key_info(thd, foreign);
+
+ if (pf_key_info != NULL) {
+ f_key_list->push_back(pf_key_info);
+ }
+ }
+
+ mutex_exit(&dict_sys.mutex);
+
+ m_prebuilt->trx->op_info = "";
+
+ return(0);
+}
+
+/*******************************************************************//**
+Gets the set of foreign keys where this table is the referenced table.
+@return always 0, that is, always succeeds */
+
+int
+ha_innobase::get_parent_foreign_key_list(
+/*=====================================*/
+ THD* thd, /*!< in: user thread handle */
+ List<FOREIGN_KEY_INFO>* f_key_list) /*!< out: foreign key list */
+{
+ update_thd(ha_thd());
+
+ m_prebuilt->trx->op_info = "getting list of referencing foreign keys";
+
+ mutex_enter(&dict_sys.mutex);
+
+ for (dict_foreign_set::iterator it
+ = m_prebuilt->table->referenced_set.begin();
+ it != m_prebuilt->table->referenced_set.end();
+ ++it) {
+
+ FOREIGN_KEY_INFO* pf_key_info;
+ dict_foreign_t* foreign = *it;
+
+ pf_key_info = get_foreign_key_info(thd, foreign);
+
+ if (pf_key_info != NULL) {
+ f_key_list->push_back(pf_key_info);
+ }
+ }
+
+ mutex_exit(&dict_sys.mutex);
+
+ m_prebuilt->trx->op_info = "";
+
+ return(0);
+}
+
+/** Table list item structure is used to store only the table
+and name. It is used by get_cascade_foreign_key_table_list to store
+the intermediate result for fetching the table set. */
+struct table_list_item {
+ /** InnoDB table object */
+ const dict_table_t* table;
+ /** Table name */
+ const char* name;
+};
+
+/*****************************************************************//**
+Checks if ALTER TABLE may change the storage engine of the table.
+Changing storage engines is not allowed for tables for which there
+are foreign key constraints (parent or child tables).
+@return TRUE if can switch engines */
+
+bool
+ha_innobase::can_switch_engines(void)
+/*=================================*/
+{
+ DBUG_ENTER("ha_innobase::can_switch_engines");
+
+ update_thd();
+
+ m_prebuilt->trx->op_info =
+ "determining if there are foreign key constraints";
+
+ row_mysql_freeze_data_dictionary(m_prebuilt->trx);
+
+ bool can_switch = m_prebuilt->table->referenced_set.empty()
+ && m_prebuilt->table->foreign_set.empty();
+
+ row_mysql_unfreeze_data_dictionary(m_prebuilt->trx);
+ m_prebuilt->trx->op_info = "";
+
+ DBUG_RETURN(can_switch);
+}
+
+/*******************************************************************//**
+Checks if a table is referenced by a foreign key. The MySQL manual states that
+a REPLACE is either equivalent to an INSERT, or DELETE(s) + INSERT. Only a
+delete is then allowed internally to resolve a duplicate key conflict in
+REPLACE, not an update.
+@return > 0 if referenced by a FOREIGN KEY */
+
+uint
+ha_innobase::referenced_by_foreign_key(void)
+/*========================================*/
+{
+ if (dict_table_is_referenced_by_foreign_key(m_prebuilt->table)) {
+
+ return(1);
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Frees the foreign key create info for a table stored in InnoDB, if it is
+non-NULL. */
+
+void
+ha_innobase::free_foreign_key_create_info(
+/*======================================*/
+ char* str) /*!< in, own: create info string to free */
+{
+ if (str != NULL) {
+ my_free(str);
+ }
+}
+
+/*******************************************************************//**
+Tells something additional to the handler about how to do things.
+@return 0 or error number */
+
+int
+ha_innobase::extra(
+/*===============*/
+ enum ha_extra_function operation)
+ /*!< in: HA_EXTRA_FLUSH or some other flag */
+{
+ check_trx_exists(ha_thd());
+
+ /* Warning: since it is not sure that MySQL calls external_lock
+ before calling this function, the trx field in m_prebuilt can be
+ obsolete! */
+
+ switch (operation) {
+ case HA_EXTRA_FLUSH:
+ if (m_prebuilt->blob_heap) {
+ row_mysql_prebuilt_free_blob_heap(m_prebuilt);
+ }
+ break;
+ case HA_EXTRA_RESET_STATE:
+ reset_template();
+ thd_to_trx(ha_thd())->duplicates = 0;
+ break;
+ case HA_EXTRA_NO_KEYREAD:
+ m_prebuilt->read_just_key = 0;
+ break;
+ case HA_EXTRA_KEYREAD:
+ m_prebuilt->read_just_key = 1;
+ break;
+ case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
+ m_prebuilt->keep_other_fields_on_keyread = 1;
+ break;
+
+ /* IMPORTANT: m_prebuilt->trx can be obsolete in
+ this method, because it is not sure that MySQL
+ calls external_lock before this method with the
+ parameters below. We must not invoke update_thd()
+ either, because the calling threads may change.
+ CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR! */
+ case HA_EXTRA_INSERT_WITH_UPDATE:
+ thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE;
+ break;
+ case HA_EXTRA_NO_IGNORE_DUP_KEY:
+ thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_IGNORE;
+ break;
+ case HA_EXTRA_WRITE_CAN_REPLACE:
+ thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE;
+ break;
+ case HA_EXTRA_WRITE_CANNOT_REPLACE:
+ thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE;
+ break;
+ case HA_EXTRA_BEGIN_ALTER_COPY:
+ m_prebuilt->table->skip_alter_undo = 1;
+ if (m_prebuilt->table->is_temporary()
+ || !m_prebuilt->table->versioned_by_id()) {
+ break;
+ }
+ trx_start_if_not_started(m_prebuilt->trx, true);
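+		/* Register the table in trx->mod_tables (creating the
+		entry if needed) and mark the entry as having modified
+		system-versioned data from undo record 0 onwards. */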
+ m_prebuilt->trx->mod_tables.insert(
+ trx_mod_tables_t::value_type(
+ const_cast<dict_table_t*>(m_prebuilt->table),
+ 0))
+ .first->second.set_versioned(0);
+ break;
+ case HA_EXTRA_END_ALTER_COPY:
+ m_prebuilt->table->skip_alter_undo = 0;
+ break;
+	default:
+		/* Do nothing */
+		break;
+ }
+
+ return(0);
+}
+
+/**
+MySQL calls this method at the end of each statement */
+int
+ha_innobase::reset()
+{
+ if (m_prebuilt->blob_heap) {
+ row_mysql_prebuilt_free_blob_heap(m_prebuilt);
+ }
+
+ reset_template();
+
+ m_ds_mrr.dsmrr_close();
+
+ /* TODO: This should really be reset in reset_template() but for now
+ it's safer to do it explicitly here. */
+
+ /* This is a statement level counter. */
+ m_prebuilt->autoinc_last_value = 0;
+
+ return(0);
+}
+
+/******************************************************************//**
+MySQL calls this function at the start of each SQL statement inside LOCK
+TABLES. Inside LOCK TABLES the ::external_lock method does not work to
+mark SQL statement borders. Note also a special case: if a temporary table
+is created inside LOCK TABLES, MySQL has not called external_lock() at all
+on that table.
+MySQL-5.0 also calls this before each statement in an execution of a stored
+procedure. To make the execution more deterministic for binlogging, MySQL-5.0
+locks all tables involved in a stored procedure with full explicit table
+locks (thd_in_lock_tables(thd) holds in store_lock()) before executing the
+procedure.
+@return 0 or error code */
+
+int
+ha_innobase::start_stmt(
+/*====================*/
+ THD* thd, /*!< in: handle to the user thread */
+ thr_lock_type lock_type)
+{
+	DBUG_ENTER("ha_innobase::start_stmt");
+
+	update_thd(thd);
+
+	ut_ad(m_prebuilt->table != NULL);
+
+	trx_t*	trx = m_prebuilt->trx;
+
+ /* Reset the AUTOINC statement level counter for multi-row INSERTs. */
+ trx->n_autoinc_rows = 0;
+
+ m_prebuilt->sql_stat_start = TRUE;
+ m_prebuilt->hint_need_to_fetch_extra_cols = 0;
+ reset_template();
+
+ if (m_prebuilt->table->is_temporary()
+ && m_mysql_has_locked
+ && m_prebuilt->select_lock_type == LOCK_NONE) {
+ dberr_t error;
+
+ switch (thd_sql_command(thd)) {
+ case SQLCOM_INSERT:
+ case SQLCOM_UPDATE:
+ case SQLCOM_DELETE:
+ case SQLCOM_REPLACE:
+ init_table_handle_for_HANDLER();
+ m_prebuilt->select_lock_type = LOCK_X;
+ m_prebuilt->stored_select_lock_type = LOCK_X;
+ error = row_lock_table(m_prebuilt);
+
+ if (error != DB_SUCCESS) {
+ int st = convert_error_code_to_mysql(
+ error, 0, thd);
+ DBUG_RETURN(st);
+ }
+ break;
+ }
+ }
+
+ if (!m_mysql_has_locked) {
+ /* This handle is for a temporary table created inside
+ this same LOCK TABLES; since MySQL does NOT call external_lock
+ in this case, we must use x-row locks inside InnoDB to be
+ prepared for an update of a row */
+
+ m_prebuilt->select_lock_type = LOCK_X;
+
+ } else if (trx->isolation_level != TRX_ISO_SERIALIZABLE
+ && thd_sql_command(thd) == SQLCOM_SELECT
+ && lock_type == TL_READ) {
+
+ /* For other than temporary tables, we obtain
+ no lock for consistent read (plain SELECT). */
+
+ m_prebuilt->select_lock_type = LOCK_NONE;
+ } else {
+ /* Not a consistent read: restore the
+ select_lock_type value. The value of
+ stored_select_lock_type was decided in:
+ 1) ::store_lock(),
+ 2) ::external_lock(),
+ 3) ::init_table_handle_for_HANDLER(). */
+
+ ut_a(m_prebuilt->stored_select_lock_type != LOCK_NONE_UNSET);
+
+ m_prebuilt->select_lock_type =
+ m_prebuilt->stored_select_lock_type;
+ }
+
+ *trx->detailed_error = 0;
+
+ innobase_register_trx(ht, thd, trx);
+
+ if (!trx_is_started(trx)) {
+ trx->will_lock = true;
+ }
+
+ DBUG_RETURN(0);
+}
+
+/******************************************************************//**
+Maps a MySQL trx isolation level code to the InnoDB isolation level code
+@return InnoDB isolation level */
+static inline
+uint
+innobase_map_isolation_level(
+/*=========================*/
+ enum_tx_isolation iso) /*!< in: MySQL isolation level code */
+{
+ if (UNIV_UNLIKELY(srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN)
+ || UNIV_UNLIKELY(srv_read_only_mode)) {
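+		/* Undo logs cannot be relied on in these modes, so
+		consistent (MVCC) read views are not possible; read the
+		newest available row versions instead. */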
+ return TRX_ISO_READ_UNCOMMITTED;
+ }
+ switch (iso) {
+ case ISO_REPEATABLE_READ: return(TRX_ISO_REPEATABLE_READ);
+ case ISO_READ_COMMITTED: return(TRX_ISO_READ_COMMITTED);
+ case ISO_SERIALIZABLE: return(TRX_ISO_SERIALIZABLE);
+ case ISO_READ_UNCOMMITTED: return(TRX_ISO_READ_UNCOMMITTED);
+ }
+
+ ut_error;
+
+ return(0);
+}
+
+/******************************************************************//**
+As MySQL will execute an external lock for every new table it uses when it
+starts to process an SQL statement (an exception is when MySQL calls
+start_stmt for the handle) we can use this function to store the pointer to
+the THD in the handle. We will also use this function to communicate
+to InnoDB that a new SQL statement has started and that we must store a
+savepoint to our transaction handle, so that we are able to roll back
+the SQL statement in case of an error.
+@return 0 */
+
+int
+ha_innobase::external_lock(
+/*=======================*/
+ THD* thd, /*!< in: handle to the user thread */
+ int lock_type) /*!< in: lock type */
+{
+ DBUG_ENTER("ha_innobase::external_lock");
+ DBUG_PRINT("enter",("lock_type: %d", lock_type));
+
+ update_thd(thd);
+ trx_t* trx = m_prebuilt->trx;
+ ut_ad(m_prebuilt->table);
+
+ /* Statement based binlogging does not work in isolation level
+ READ UNCOMMITTED and READ COMMITTED since the necessary
+ locks cannot be taken. In this case, we print an
+ informative error message and return with an error.
+ Note: decide_logging_format would give the same error message,
+ except it cannot give the extra details. */
+
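+	/* For example (illustrative scenario): with
+	binlog_format=STATEMENT and tx_isolation=READ-COMMITTED, a write
+	statement such as INSERT INTO t1 VALUES (1) is refused here with
+	ER_BINLOG_STMT_MODE_AND_ROW_ENGINE. */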
+ if (lock_type == F_WRLCK
+ && !(table_flags() & HA_BINLOG_STMT_CAPABLE)
+ && thd_binlog_format(thd) == BINLOG_FORMAT_STMT
+ && thd_binlog_filter_ok(thd)
+ && thd_sqlcom_can_generate_row_events(thd)) {
+ bool skip = false;
+#ifdef WITH_WSREP
+ skip = trx->is_wsrep() && !wsrep_thd_is_local(thd);
+#endif /* WITH_WSREP */
+ /* used by test case */
+ DBUG_EXECUTE_IF("no_innodb_binlog_errors", skip = true;);
+
+ if (!skip) {
+ my_error(ER_BINLOG_STMT_MODE_AND_ROW_ENGINE, MYF(0),
+ " InnoDB is limited to row-logging when"
+ " transaction isolation level is"
+ " READ COMMITTED or READ UNCOMMITTED.");
+
+ DBUG_RETURN(HA_ERR_LOGGING_IMPOSSIBLE);
+ }
+ }
+
+ /* Check for UPDATEs in read-only mode. */
+ if (srv_read_only_mode) {
+ switch (thd_sql_command(thd)) {
+ case SQLCOM_CREATE_TABLE:
+ if (lock_type != F_WRLCK) {
+ break;
+ }
+ /* fall through */
+ case SQLCOM_UPDATE:
+ case SQLCOM_INSERT:
+ case SQLCOM_REPLACE:
+ case SQLCOM_DROP_TABLE:
+ case SQLCOM_ALTER_TABLE:
+ case SQLCOM_OPTIMIZE:
+ case SQLCOM_CREATE_INDEX:
+ case SQLCOM_DROP_INDEX:
+ case SQLCOM_CREATE_SEQUENCE:
+ case SQLCOM_DROP_SEQUENCE:
+ case SQLCOM_DELETE:
+ ib_senderrf(thd, IB_LOG_LEVEL_WARN,
+ ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ }
+ }
+
+ m_prebuilt->sql_stat_start = TRUE;
+ m_prebuilt->hint_need_to_fetch_extra_cols = 0;
+
+ reset_template();
+
+ switch (m_prebuilt->table->quiesce) {
+ case QUIESCE_START:
+ /* Check for FLUSH TABLE t WITH READ LOCK; */
+ if (!srv_read_only_mode
+ && thd_sql_command(thd) == SQLCOM_FLUSH
+ && lock_type == F_RDLCK) {
+
+ if (!m_prebuilt->table->space) {
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+ DBUG_RETURN(HA_ERR_TABLESPACE_MISSING);
+ }
+
+ row_quiesce_table_start(m_prebuilt->table, trx);
+
+			/* Use the transaction instance to track UNLOCK
+			TABLES; note that UNLOCK TABLES can also happen
+			implicitly, e.g. via START TRANSACTION. */
+
+ ++trx->flush_tables;
+ }
+ break;
+
+ case QUIESCE_COMPLETE:
+ /* Check for UNLOCK TABLES; implicit or explicit
+ or trx interruption. */
+ if (trx->flush_tables > 0
+ && (lock_type == F_UNLCK || trx_is_interrupted(trx))) {
+
+ row_quiesce_table_complete(m_prebuilt->table, trx);
+
+ ut_a(trx->flush_tables > 0);
+ --trx->flush_tables;
+ }
+
+ break;
+
+ case QUIESCE_NONE:
+ break;
+ }
+
+ if (lock_type == F_WRLCK) {
+
+		/* If this is a SELECT, then it is part of an
+		UPDATE of a table or a SELECT ... FOR UPDATE */
+ m_prebuilt->select_lock_type = LOCK_X;
+ m_prebuilt->stored_select_lock_type = LOCK_X;
+ }
+
+ if (lock_type != F_UNLCK) {
+ /* MySQL is setting a new table lock */
+
+ *trx->detailed_error = 0;
+
+ innobase_register_trx(ht, thd, trx);
+
+ if (trx->isolation_level == TRX_ISO_SERIALIZABLE
+ && m_prebuilt->select_lock_type == LOCK_NONE
+ && thd_test_options(
+ thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ /* To get serializable execution, we let InnoDB
+ conceptually add 'LOCK IN SHARE MODE' to all SELECTs
+ which otherwise would have been consistent reads. An
+ exception is consistent reads in the AUTOCOMMIT=1 mode:
+ we know that they are read-only transactions, and they
+ can be serialized also if performed as consistent
+ reads. */
+
+ m_prebuilt->select_lock_type = LOCK_S;
+ m_prebuilt->stored_select_lock_type = LOCK_S;
+ }
+
+ /* Starting from 4.1.9, no InnoDB table lock is taken in LOCK
+ TABLES if AUTOCOMMIT=1. It does not make much sense to acquire
+ an InnoDB table lock if it is released immediately at the end
+		of LOCK TABLES, and InnoDB's table locks in that case very
+		easily cause deadlocks.
+
+ We do not set InnoDB table locks if user has not explicitly
+ requested a table lock. Note that thd_in_lock_tables(thd)
+ can hold in some cases, e.g., at the start of a stored
+ procedure call (SQLCOM_CALL). */
+
+ if (m_prebuilt->select_lock_type != LOCK_NONE) {
+
+ if (thd_sql_command(thd) == SQLCOM_LOCK_TABLES
+ && THDVAR(thd, table_locks)
+ && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT)
+ && thd_in_lock_tables(thd)) {
+
+ dberr_t error = row_lock_table(m_prebuilt);
+
+ if (error != DB_SUCCESS) {
+
+ DBUG_RETURN(
+ convert_error_code_to_mysql(
+ error, 0, thd));
+ }
+ }
+
+ trx->mysql_n_tables_locked++;
+ }
+
+ trx->n_mysql_tables_in_use++;
+ m_mysql_has_locked = true;
+
+ if (!trx_is_started(trx)
+ && (m_prebuilt->select_lock_type != LOCK_NONE
+ || m_prebuilt->stored_select_lock_type != LOCK_NONE)) {
+
+ trx->will_lock = true;
+ }
+
+ DBUG_RETURN(0);
+ } else {
+ DEBUG_SYNC_C("ha_innobase_end_statement");
+ }
+
+ /* MySQL is releasing a table lock */
+
+ trx->n_mysql_tables_in_use--;
+ m_mysql_has_locked = false;
+
+ /* If the MySQL lock count drops to zero we know that the current SQL
+ statement has ended */
+
+ if (trx->n_mysql_tables_in_use == 0) {
+
+ trx->mysql_n_tables_locked = 0;
+ m_prebuilt->used_in_HANDLER = FALSE;
+
+ if (!thd_test_options(
+ thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ if (trx_is_started(trx)) {
+
+ innobase_commit(ht, thd, TRUE);
+ }
+
+ } else if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+ trx->read_view.close();
+ }
+ }
+
+ if (!trx_is_started(trx)
+ && lock_type != F_UNLCK
+ && (m_prebuilt->select_lock_type != LOCK_NONE
+ || m_prebuilt->stored_select_lock_type != LOCK_NONE)) {
+
+ trx->will_lock = true;
+ }
+
+ DBUG_RETURN(0);
+}
+
+/************************************************************************//**
+Here we export InnoDB status variables to MySQL. */
+static
+void
+innodb_export_status()
+/*==================*/
+{
+ if (srv_was_started) {
+ srv_export_innodb_status();
+ }
+}
+
+/************************************************************************//**
+Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the
+InnoDB Monitor to the client.
+@return 0 on success */
+static
+int
+innodb_show_status(
+/*===============*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread of the caller */
+ stat_print_fn* stat_print)
+{
+ static const char truncated_msg[] = "... truncated...\n";
+	const size_t MAX_STATUS_SIZE = 1048576;
+ ulint trx_list_start = ULINT_UNDEFINED;
+ ulint trx_list_end = ULINT_UNDEFINED;
+ bool ret_val;
+
+ DBUG_ENTER("innodb_show_status");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ /* We don't create the temp files or associated
+ mutexes in read-only-mode */
+
+ if (srv_read_only_mode) {
+ DBUG_RETURN(0);
+ }
+
+ srv_wake_purge_thread_if_not_active();
+
+	/* We let the InnoDB Monitor output at most MAX_STATUS_SIZE
+	bytes of text. */
+
+ char* str;
+ size_t flen;
+
+ mutex_enter(&srv_monitor_file_mutex);
+ rewind(srv_monitor_file);
+
+ srv_printf_innodb_monitor(srv_monitor_file, FALSE,
+ &trx_list_start, &trx_list_end);
+
+ os_file_set_eof(srv_monitor_file);
+
+ flen = size_t(ftell(srv_monitor_file));
+ if (ssize_t(flen) < 0) {
+ flen = 0;
+ }
+
+ size_t usable_len;
+
+ if (flen > MAX_STATUS_SIZE) {
+ usable_len = MAX_STATUS_SIZE;
+ srv_truncated_status_writes++;
+ } else {
+ usable_len = flen;
+ }
+
+ /* allocate buffer for the string, and
+ read the contents of the temporary file */
+
+ if (!(str = (char*) my_malloc(PSI_INSTRUMENT_ME,
+ usable_len + 1, MYF(0)))) {
+ mutex_exit(&srv_monitor_file_mutex);
+ DBUG_RETURN(1);
+ }
+
+ rewind(srv_monitor_file);
+
+ if (flen < MAX_STATUS_SIZE) {
+ /* Display the entire output. */
+ flen = fread(str, 1, flen, srv_monitor_file);
+ } else if (trx_list_end < flen
+ && trx_list_start < trx_list_end
+ && trx_list_start + flen - trx_list_end
+ < MAX_STATUS_SIZE - sizeof truncated_msg - 1) {
+
+ /* Omit the beginning of the list of active transactions. */
+ size_t len = fread(str, 1, trx_list_start, srv_monitor_file);
+
+ memcpy(str + len, truncated_msg, sizeof truncated_msg - 1);
+ len += sizeof truncated_msg - 1;
+ usable_len = (MAX_STATUS_SIZE - 1) - len;
+ fseek(srv_monitor_file, long(flen - usable_len), SEEK_SET);
+ len += fread(str + len, 1, usable_len, srv_monitor_file);
+ flen = len;
+ } else {
+ /* Omit the end of the output. */
+ flen = fread(str, 1, MAX_STATUS_SIZE - 1, srv_monitor_file);
+ }
+
+ mutex_exit(&srv_monitor_file_mutex);
+
+	ret_val = stat_print(
+ thd, innobase_hton_name,
+ static_cast<uint>(strlen(innobase_hton_name)),
+ STRING_WITH_LEN(""), str, static_cast<uint>(flen));
+
+ my_free(str);
+
+ DBUG_RETURN(ret_val);
+}
+
+/** Callback for collecting mutex statistics */
+struct ShowStatus {
+
+ /** For tracking the mutex metrics */
+ struct Value {
+
+ /** Constructor
+ @param[in] name Name of the mutex
+ @param[in] spins Number of spins
+ @param[in] os_waits OS waits so far
+ @param[in] calls Number of calls to enter() */
+ Value(const char* name,
+ ulint spins,
+ uint64_t waits,
+ uint64_t calls)
+ :
+ m_name(name),
+ m_spins(spins),
+ m_waits(waits),
+ m_calls(calls)
+ {
+ /* No op */
+ }
+
+ /** Mutex name */
+ std::string m_name;
+
+ /** Spins so far */
+ ulint m_spins;
+
+ /** Waits so far */
+ uint64_t m_waits;
+
+ /** Number of calls so far */
+ uint64_t m_calls;
+ };
+
+ /** Order by m_waits, in descending order. */
+	struct OrderByWaits
+ {
+ /** @return true if rhs < lhs */
+ bool operator()(
+ const Value& lhs,
+ const Value& rhs) const
+ UNIV_NOTHROW
+ {
+ return(rhs.m_waits < lhs.m_waits);
+ }
+ };
+
+ typedef std::vector<Value, ut_allocator<Value> > Values;
+
+ /** Collect the individual latch counts */
+ struct GetCount {
+ typedef latch_meta_t::CounterType::Count Count;
+
+ /** Constructor
+ @param[in] name Latch name
+ @param[in,out] values Put the values here */
+ GetCount(
+ const char* name,
+ Values* values)
+ UNIV_NOTHROW
+ :
+ m_name(name),
+ m_values(values)
+ {
+ /* No op */
+ }
+
+ /** Collect the latch metrics. Ignore entries where the
+ spins and waits are zero.
+ @param[in] count The latch metrics */
+ void operator()(Count* count) const UNIV_NOTHROW
+ {
+ if (count->m_spins > 0 || count->m_waits > 0) {
+
+ m_values->push_back(Value(
+ m_name,
+ count->m_spins,
+ count->m_waits,
+ count->m_calls));
+ }
+ }
+
+ /** The latch name */
+ const char* m_name;
+
+ /** For collecting the active mutex stats. */
+ Values* m_values;
+ };
+
+ /** Constructor */
+ ShowStatus() { }
+
+ /** Callback for collecting the stats
+ @param[in] latch_meta Latch meta data
+ @return always returns true */
+ bool operator()(latch_meta_t& latch_meta)
+ UNIV_NOTHROW
+ {
+ latch_meta.get_counter()->iterate(
+ GetCount(latch_meta.get_name(), &m_values));
+
+ return(true);
+ }
+
+ /** Implements the SHOW MUTEX STATUS command, for mutexes.
+ The table structure is like so: Engine | Mutex Name | Status
+ We store the metrics in the "Status" column as:
+
+		spins=N,waits=N,calls=N
+
+	The user has to parse the data, unfortunately
+ @param[in,out] thd the MySQL query thread of the caller
+ @param[in,out] stat_print function for printing statistics
+ @return true on success. */
+ bool to_string(
+ THD* thd,
+ stat_print_fn* stat_print)
+ UNIV_NOTHROW;
+
+ /** For collecting the active mutex stats. */
+ Values m_values;
+};
+
+/** Implements the SHOW MUTEX STATUS command, for mutexes.
+The table structure is like so: Engine | Mutex Name | Status
+We store the metrics in the "Status" column as:
+
+	spins=N,waits=N,calls=N
+
+The user has to parse the data, unfortunately
+@param[in,out] thd the MySQL query thread of the caller
+@param[in,out] stat_print function for printing statistics
+@return true on success. */
+bool
+ShowStatus::to_string(
+ THD* thd,
+ stat_print_fn* stat_print)
+ UNIV_NOTHROW
+{
+ uint hton_name_len = (uint) strlen(innobase_hton_name);
+
+ std::sort(m_values.begin(), m_values.end(), OrderByWaits());
+
+ Values::iterator end = m_values.end();
+
+ for (Values::iterator it = m_values.begin(); it != end; ++it) {
+
+ int name_len;
+ char name_buf[IO_SIZE];
+
+ name_len = snprintf(
+ name_buf, sizeof(name_buf), "%s", it->m_name.c_str());
+
+ int status_len;
+ char status_buf[IO_SIZE];
+
+ status_len = snprintf(
+ status_buf, sizeof(status_buf),
+ "spins=%lu,waits=%lu,calls=%llu",
+ static_cast<ulong>(it->m_spins),
+			static_cast<ulong>(it->m_waits),
+ (ulonglong) it->m_calls);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len,
+ name_buf, static_cast<uint>(name_len),
+ status_buf, static_cast<uint>(status_len))) {
+
+ return(false);
+ }
+ }
+
+ return(true);
+}
+
+/** Implements the SHOW MUTEX STATUS command, for mutexes.
+@param[in,out] hton the innodb handlerton
+@param[in,out] thd the MySQL query thread of the caller
+@param[in,out] stat_print function for printing statistics
+@return 0 on success. */
+static
+int
+innodb_show_mutex_status(
+ handlerton*
+#ifdef DBUG_ASSERT_EXISTS
+ hton
+#endif
+ ,
+ THD* thd,
+ stat_print_fn* stat_print)
+{
+ DBUG_ENTER("innodb_show_mutex_status");
+
+ ShowStatus collector;
+
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ mutex_monitor.iterate(collector);
+
+ if (!collector.to_string(thd, stat_print)) {
+ DBUG_RETURN(1);
+ }
+
+ DBUG_RETURN(0);
+}
+
+/** Implements the SHOW MUTEX STATUS command, for rw-locks.
+@param[in,out] hton the innodb handlerton
+@param[in,out] thd the MySQL query thread of the caller
+@param[in,out] stat_print function for printing statistics
+@return 0 on success. */
+static
+int
+innodb_show_rwlock_status(
+ handlerton*
+#ifdef DBUG_ASSERT_EXISTS
+ hton
+#endif
+ ,
+ THD* thd,
+ stat_print_fn* stat_print)
+{
+ DBUG_ENTER("innodb_show_rwlock_status");
+
+ const rw_lock_t* block_rwlock= nullptr;
+ ulint block_rwlock_oswait_count = 0;
+ uint hton_name_len = (uint) strlen(innobase_hton_name);
+
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ mutex_enter(&rw_lock_list_mutex);
+
+ for (const rw_lock_t& rw_lock : rw_lock_list) {
+
+ if (rw_lock.count_os_wait == 0) {
+ continue;
+ }
+
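+		/* rw-locks on buffer pool blocks are too numerous to
+		report one row each; they are aggregated into the single
+		"sum rwlock" row printed after this loop. */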
+		if (rw_lock.is_block_lock) {
+
+			block_rwlock = &rw_lock;
+			block_rwlock_oswait_count += rw_lock.count_os_wait;
+
+			continue;
+		}
+
+		int	buf1len;
+		char	buf1[IO_SIZE];
+
+ buf1len = snprintf(
+ buf1, sizeof buf1, "rwlock: %s:%u",
+ innobase_basename(rw_lock.cfile_name),
+ rw_lock.cline);
+
+ int buf2len;
+ char buf2[IO_SIZE];
+
+ buf2len = snprintf(
+ buf2, sizeof buf2, "waits=%u",
+ rw_lock.count_os_wait);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len,
+ buf1, static_cast<uint>(buf1len),
+ buf2, static_cast<uint>(buf2len))) {
+
+ mutex_exit(&rw_lock_list_mutex);
+
+ DBUG_RETURN(1);
+ }
+ }
+
+ if (block_rwlock != NULL) {
+
+ int buf1len;
+ char buf1[IO_SIZE];
+
+ buf1len = snprintf(
+ buf1, sizeof buf1, "sum rwlock: %s:%u",
+ innobase_basename(block_rwlock->cfile_name),
+ block_rwlock->cline);
+
+ int buf2len;
+ char buf2[IO_SIZE];
+
+ buf2len = snprintf(
+ buf2, sizeof buf2, "waits=" ULINTPF,
+ block_rwlock_oswait_count);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len,
+ buf1, static_cast<uint>(buf1len),
+ buf2, static_cast<uint>(buf2len))) {
+
+ mutex_exit(&rw_lock_list_mutex);
+
+ DBUG_RETURN(1);
+ }
+ }
+
+ mutex_exit(&rw_lock_list_mutex);
+
+ DBUG_RETURN(0);
+}
+
+/** Implements the SHOW MUTEX STATUS command for both mutexes and rw-locks.
+@param[in,out] hton the innodb handlerton
+@param[in,out] thd the MySQL query thread of the caller
+@param[in,out] stat_print function for printing statistics
+@return 0 on success. */
+static
+int
+innodb_show_latch_status(
+ handlerton* hton,
+ THD* thd,
+ stat_print_fn* stat_print)
+{
+ int ret = innodb_show_mutex_status(hton, thd, stat_print);
+
+ if (ret != 0) {
+ return(ret);
+ }
+
+ return(innodb_show_rwlock_status(hton, thd, stat_print));
+}
+
+/************************************************************************//**
+Return 0 on success and non-zero on failure. Note: the bool return type
+seems to be abused here, should be an int. */
+static
+bool
+innobase_show_status(
+/*=================*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread
+ of the caller */
+ stat_print_fn* stat_print,
+ enum ha_stat_type stat_type)
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ switch (stat_type) {
+ case HA_ENGINE_STATUS:
+ /* Non-zero return value means there was an error. */
+ return(innodb_show_status(hton, thd, stat_print) != 0);
+
+ case HA_ENGINE_MUTEX:
+ return(innodb_show_latch_status(hton, thd, stat_print) != 0);
+
+ case HA_ENGINE_LOGS:
+ /* Not handled */
+ break;
+ }
+
+ /* Success */
+ return(false);
+}
+
+/*********************************************************************//**
+Returns number of THR_LOCK locks used for one instance of InnoDB table.
+InnoDB no longer relies on THR_LOCK locks, so 0 is returned.
+Instead of THR_LOCK locks, InnoDB relies on a combination of metadata locks
+(e.g. for LOCK TABLES and DDL) and its own locking subsystem.
+Note that even though this method returns 0, SQL-layer still calls
+::store_lock(), ::start_stmt() and ::external_lock() methods for InnoDB
+tables. */
+
+uint
+ha_innobase::lock_count(void) const
+/*===============================*/
+{
+ return 0;
+}
+
+/*****************************************************************//**
+Supposed to convert a MySQL table lock stored in the 'lock' field of the
+handle to a proper type before storing pointer to the lock into an array
+of pointers.
+In practice, since InnoDB no longer relies on THR_LOCK locks and its
+lock_count() method returns 0 it just informs storage engine about type
+of THR_LOCK which SQL-layer would have acquired for this specific statement
+on this specific table.
+MySQL also calls this if it wants to reset some table locks to a not-locked
+state during the processing of an SQL query. An example is that during a
+SELECT the read lock is released early on the 'const' tables where we only
+fetch one row. MySQL does not call this when it releases all locks at the
+end of an SQL statement.
+@return pointer to the current element in the 'to' array. */
+
+THR_LOCK_DATA**
+ha_innobase::store_lock(
+/*====================*/
+ THD* thd, /*!< in: user thread handle */
+ THR_LOCK_DATA** to, /*!< in: pointer to the current
+ element in an array of pointers
+ to lock structs;
+ only used as return value */
+ thr_lock_type lock_type) /*!< in: lock type to store in
+ 'lock'; this may also be
+ TL_IGNORE */
+{
+ /* Note that trx in this function is NOT necessarily m_prebuilt->trx
+ because we call update_thd() later, in ::external_lock()! Failure to
+ understand this caused a serious memory corruption bug in 5.1.11. */
+
+ trx_t* trx = check_trx_exists(thd);
+
+ /* NOTE: MySQL can call this function with lock 'type' TL_IGNORE!
+ Be careful to ignore TL_IGNORE if we are going to do something with
+ only 'real' locks! */
+
+ /* If no MySQL table is in use, we need to set the isolation level
+ of the transaction. */
+
+ if (lock_type != TL_IGNORE
+ && trx->n_mysql_tables_in_use == 0) {
+ trx->isolation_level = innobase_map_isolation_level(
+ (enum_tx_isolation) thd_tx_isolation(thd));
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+
+ /* At low transaction isolation levels we let
+ each consistent read set its own snapshot */
+ trx->read_view.close();
+ }
+ }
+
+ DBUG_ASSERT(EQ_CURRENT_THD(thd));
+ const bool in_lock_tables = thd_in_lock_tables(thd);
+ const int sql_command = thd_sql_command(thd);
+
+ if (srv_read_only_mode
+ && (sql_command == SQLCOM_UPDATE
+ || sql_command == SQLCOM_INSERT
+ || sql_command == SQLCOM_REPLACE
+ || sql_command == SQLCOM_DROP_TABLE
+ || sql_command == SQLCOM_ALTER_TABLE
+ || sql_command == SQLCOM_OPTIMIZE
+ || (sql_command == SQLCOM_CREATE_TABLE
+ && (lock_type >= TL_WRITE_CONCURRENT_INSERT
+ && lock_type <= TL_WRITE))
+ || sql_command == SQLCOM_CREATE_INDEX
+ || sql_command == SQLCOM_DROP_INDEX
+ || sql_command == SQLCOM_CREATE_SEQUENCE
+ || sql_command == SQLCOM_DROP_SEQUENCE
+ || sql_command == SQLCOM_DELETE)) {
+
+ ib_senderrf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+
+ } else if (sql_command == SQLCOM_FLUSH
+ && lock_type == TL_READ_NO_INSERT) {
+
+ /* Check for FLUSH TABLES ... WITH READ LOCK */
+
+ /* Note: This call can fail, but there is no way to return
+ the error to the caller. We simply ignore it for now here
+ and push the error code to the caller where the error is
+ detected in the function. */
+
+ dberr_t err = row_quiesce_set_state(
+ m_prebuilt->table, QUIESCE_START, trx);
+
+ ut_a(err == DB_SUCCESS || err == DB_UNSUPPORTED);
+
+ if (trx->isolation_level == TRX_ISO_SERIALIZABLE) {
+ m_prebuilt->select_lock_type = LOCK_S;
+ m_prebuilt->stored_select_lock_type = LOCK_S;
+ } else {
+ m_prebuilt->select_lock_type = LOCK_NONE;
+ m_prebuilt->stored_select_lock_type = LOCK_NONE;
+ }
+
+ /* Check for DROP TABLE */
+	} else if (sql_command == SQLCOM_DROP_TABLE
+		   || sql_command == SQLCOM_DROP_SEQUENCE) {
+
+ /* MySQL calls this function in DROP TABLE though this table
+ handle may belong to another thd that is running a query. Let
+ us in that case skip any changes to the m_prebuilt struct. */
+
+ /* Check for LOCK TABLE t1,...,tn WITH SHARED LOCKS */
+ } else if ((lock_type == TL_READ && in_lock_tables)
+ || (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables)
+ || lock_type == TL_READ_WITH_SHARED_LOCKS
+ || lock_type == TL_READ_NO_INSERT
+ || (lock_type != TL_IGNORE
+ && sql_command != SQLCOM_SELECT)) {
+
+ /* The OR cases above are in this order:
+ 1) MySQL is doing LOCK TABLES ... READ LOCAL, or we
+ are processing a stored procedure or function, or
+ 2) (we do not know when TL_READ_HIGH_PRIORITY is used), or
+ 3) this is a SELECT ... IN SHARE MODE, or
+ 4) we are doing a complex SQL statement like
+ INSERT INTO ... SELECT ... and the logical logging (MySQL
+ binlog) requires the use of a locking read, or
+ MySQL is doing LOCK TABLES ... READ.
+ 5) we let InnoDB do locking reads for all SQL statements that
+ are not simple SELECTs; note that select_lock_type in this
+ case may get strengthened in ::external_lock() to LOCK_X.
+ Note that we MUST use a locking read in all data modifying
+ SQL statements, because otherwise the execution would not be
+ serializable, and also the results from the update could be
+ unexpected if an obsolete consistent read view would be
+ used. */
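+		/* For example (illustrative): the source table of
+		INSERT INTO t1 SELECT * FROM t2 typically arrives here
+		with lock_type == TL_READ_NO_INSERT and ends up with
+		either LOCK_NONE or LOCK_S below, depending on the
+		isolation level. */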
+
+ /* Use consistent read for checksum table */
+
+ if (sql_command == SQLCOM_CHECKSUM
+ || sql_command == SQLCOM_CREATE_SEQUENCE
+ || (sql_command == SQLCOM_ANALYZE && lock_type == TL_READ)
+ || (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && (lock_type == TL_READ
+ || lock_type == TL_READ_NO_INSERT)
+ && (sql_command == SQLCOM_INSERT_SELECT
+ || sql_command == SQLCOM_REPLACE_SELECT
+ || sql_command == SQLCOM_UPDATE
+ || sql_command == SQLCOM_CREATE_SEQUENCE
+ || sql_command == SQLCOM_CREATE_TABLE))) {
+
+ /* If the transaction isolation level is
+ READ UNCOMMITTED or READ COMMITTED and we are executing
+ INSERT INTO...SELECT or REPLACE INTO...SELECT
+ or UPDATE ... = (SELECT ...) or CREATE ...
+ SELECT... without FOR UPDATE or IN SHARE
+ MODE in select, then we use consistent read
+ for select. */
+
+ m_prebuilt->select_lock_type = LOCK_NONE;
+ m_prebuilt->stored_select_lock_type = LOCK_NONE;
+ } else {
+ m_prebuilt->select_lock_type = LOCK_S;
+ m_prebuilt->stored_select_lock_type = LOCK_S;
+ }
+
+ } else if (lock_type != TL_IGNORE) {
+
+ /* We set possible LOCK_X value in external_lock, not yet
+ here even if this would be SELECT ... FOR UPDATE */
+
+ m_prebuilt->select_lock_type = LOCK_NONE;
+ m_prebuilt->stored_select_lock_type = LOCK_NONE;
+ }
+
+ if (!trx_is_started(trx)
+ && (m_prebuilt->select_lock_type != LOCK_NONE
+ || m_prebuilt->stored_select_lock_type != LOCK_NONE)) {
+
+ trx->will_lock = true;
+ }
+
+ return(to);
+}
+
+/*********************************************************************//**
+Read the next autoinc value. Acquire the relevant locks before reading
+the AUTOINC value. If SUCCESS then the table AUTOINC mutex will be locked
+on return and all relevant locks acquired.
+@return DB_SUCCESS or error code */
+
+dberr_t
+ha_innobase::innobase_get_autoinc(
+/*==============================*/
+ ulonglong* value) /*!< out: autoinc value */
+{
+ *value = 0;
+
+ m_prebuilt->autoinc_error = innobase_lock_autoinc();
+
+ if (m_prebuilt->autoinc_error == DB_SUCCESS) {
+
+ /* Determine the first value of the interval */
+ *value = dict_table_autoinc_read(m_prebuilt->table);
+
+ /* It should have been initialized during open. */
+ if (*value == 0) {
+ m_prebuilt->autoinc_error = DB_UNSUPPORTED;
+ m_prebuilt->table->autoinc_mutex.unlock();
+ }
+ }
+
+ return(m_prebuilt->autoinc_error);
+}
+
+/*******************************************************************//**
+This function reads the global auto-inc counter. It doesn't use the
+AUTOINC lock even if the lock mode is set to TRADITIONAL.
+@return the autoinc value */
+
+ulonglong
+ha_innobase::innobase_peek_autoinc(void)
+/*====================================*/
+{
+ ulonglong auto_inc;
+ dict_table_t* innodb_table;
+
+ ut_a(m_prebuilt != NULL);
+ ut_a(m_prebuilt->table != NULL);
+
+ innodb_table = m_prebuilt->table;
+
+ innodb_table->autoinc_mutex.lock();
+
+ auto_inc = dict_table_autoinc_read(innodb_table);
+
+ if (auto_inc == 0) {
+ ib::info() << "AUTOINC next value generation is disabled for"
+ " '" << innodb_table->name << "'";
+ }
+
+ innodb_table->autoinc_mutex.unlock();
+
+ return(auto_inc);
+}
+
+/*********************************************************************//**
+Returns the value of the auto-inc counter in *first_value and ~0 on failure. */
+
+void
+ha_innobase::get_auto_increment(
+/*============================*/
+ ulonglong offset, /*!< in: table autoinc offset */
+ ulonglong increment, /*!< in: table autoinc
+ increment */
+ ulonglong nb_desired_values, /*!< in: number of values
+ reqd */
+ ulonglong* first_value, /*!< out: the autoinc value */
+ ulonglong* nb_reserved_values) /*!< out: count of reserved
+ values */
+{
+ trx_t* trx;
+ dberr_t error;
+ ulonglong autoinc = 0;
+
+ /* Prepare m_prebuilt->trx in the table handle */
+ update_thd(ha_thd());
+
+ error = innobase_get_autoinc(&autoinc);
+
+ if (error != DB_SUCCESS) {
+ *first_value = (~(ulonglong) 0);
+ return;
+ }
+
+ /* This is a hack, since nb_desired_values seems to be accurate only
+ for the first call to get_auto_increment() for multi-row INSERT and
+	meaningless for other statements, e.g., LOAD. Subsequent calls to
+	this method for the same statement result in different values which
+ don't make sense. Therefore we store the value the first time we are
+ called and count down from that as rows are written (see write_row()).
+ */
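+	/* For example (illustrative): INSERT INTO t1 VALUES
+	(1), (2), (3) typically passes nb_desired_values=3 on the first
+	call, while later calls for the same statement pass values that
+	are not meaningful. */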
+
+ trx = m_prebuilt->trx;
+
+ /* Note: We can't rely on *first_value since some MySQL engines,
+ in particular the partition engine, don't initialize it to 0 when
+ invoking this method. So we are not sure if it's guaranteed to
+ be 0 or not. */
+
+ /* We need the upper limit of the col type to check for
+ whether we update the table autoinc counter or not. */
+ ulonglong col_max_value =
+ table->next_number_field->get_max_int_value();
+
+ /** The following logic is needed to avoid duplicate key error
+ for autoincrement column.
+
+ (1) InnoDB gives the current autoincrement value with respect
+ to increment and offset value.
+
+ (2) Basically it does compute_next_insert_id() logic inside InnoDB
+ to avoid the current auto increment value changed by handler layer.
+
+ (3) It is restricted only for insert operations. */
+
+ if (increment > 1 && thd_sql_command(m_user_thd) != SQLCOM_ALTER_TABLE
+ && autoinc < col_max_value) {
+
+ ulonglong prev_auto_inc = autoinc;
+
+		autoinc = ((autoinc - 1) + increment - offset) / increment;
+
+ autoinc = autoinc * increment + offset;
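+		/* For example (values for illustration only): with
+		offset=1, increment=5 and autoinc=3, this computes
+		((3 - 1) + 5 - 1) / 5 = 1 and then 1 * 5 + 1 = 6, the
+		next value >= 3 in the sequence 1, 6, 11, ... */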
+
+ /* If autoinc exceeds the col_max_value then reset
+ to old autoinc value. Because in case of non-strict
+ sql mode, boundary value is not considered as error. */
+
+ if (autoinc >= col_max_value) {
+ autoinc = prev_auto_inc;
+ }
+
+ ut_ad(autoinc > 0);
+ }
+
+ /* Called for the first time ? */
+ if (trx->n_autoinc_rows == 0) {
+
+ trx->n_autoinc_rows = (ulint) nb_desired_values;
+
+ /* It's possible for nb_desired_values to be 0:
+ e.g., INSERT INTO T1(C) SELECT C FROM T2; */
+ if (nb_desired_values == 0) {
+
+ trx->n_autoinc_rows = 1;
+ }
+
+ set_if_bigger(*first_value, autoinc);
+		/* Not in the middle of a multi-row INSERT. */
+ } else if (m_prebuilt->autoinc_last_value == 0) {
+ set_if_bigger(*first_value, autoinc);
+ }
+
+ if (*first_value > col_max_value) {
+ /* Out of range number. Let handler::update_auto_increment()
+ take care of this */
+ m_prebuilt->autoinc_last_value = 0;
+ m_prebuilt->table->autoinc_mutex.unlock();
+ *nb_reserved_values= 0;
+ return;
+ }
+
+ *nb_reserved_values = trx->n_autoinc_rows;
+
+ /* With old style AUTOINC locking we only update the table's
+ AUTOINC counter after attempting to insert the row. */
+ if (innobase_autoinc_lock_mode != AUTOINC_OLD_STYLE_LOCKING) {
+ ulonglong current;
+ ulonglong next_value;
+
+ current = *first_value;
+
+ /* Compute the last value in the interval */
+ next_value = innobase_next_autoinc(
+ current, *nb_reserved_values, increment, offset,
+ col_max_value);
+
+ m_prebuilt->autoinc_last_value = next_value;
+
+ if (m_prebuilt->autoinc_last_value < *first_value) {
+ *first_value = (~(ulonglong) 0);
+ } else {
+ /* Update the table autoinc variable */
+ dict_table_autoinc_update_if_greater(
+ m_prebuilt->table,
+ m_prebuilt->autoinc_last_value);
+ }
+ } else {
+ /* This will force write_row() into attempting an update
+ of the table's AUTOINC counter. */
+ m_prebuilt->autoinc_last_value = 0;
+ }
+
+	/* Store the increment used to advance the AUTOINC value; we use
+	it in write_row() and update_row() to increase the autoinc counter
+	for columns that are filled by the user. We need both the offset
+	and the increment. */
+ m_prebuilt->autoinc_offset = offset;
+ m_prebuilt->autoinc_increment = increment;
+
+ m_prebuilt->table->autoinc_mutex.unlock();
+}
+
+/*******************************************************************//**
+See comment in handler.cc */
+
+bool
+ha_innobase::get_error_message(
+/*===========================*/
+ int error,
+ String* buf)
+{
+ trx_t* trx = check_trx_exists(ha_thd());
+
+ if (error == HA_ERR_DECRYPTION_FAILED) {
+		const char *msg =
+			"Table encrypted but decryption failed. This"
+			" could be because the correct encryption"
+			" management plugin is not loaded, the used"
+			" encryption key is not available or the"
+			" encryption method does not match.";
+ buf->copy(msg, (uint)strlen(msg), system_charset_info);
+ } else {
+ buf->copy(trx->detailed_error, (uint) strlen(trx->detailed_error),
+ system_charset_info);
+ }
+
+ return(FALSE);
+}
+
+/** Retrieves the names of the table and the key for which there was a
+duplicate entry in the case of HA_ERR_FOREIGN_DUPLICATE_KEY.
+
+If either name is not available, this method will return
+false and will not modify child_table_name or child_key_name.
+
+@param[out] child_table_name Table name
+@param[in] child_table_name_len Table name buffer size
+@param[out] child_key_name Key name
+@param[in] child_key_name_len Key name buffer size
+
+@retval true table and key names were available and were written into the
+corresponding out parameters.
+@retval false table and key names were not available, the out parameters
+were not touched. */
+bool
+ha_innobase::get_foreign_dup_key(
+/*=============================*/
+ char* child_table_name,
+ uint child_table_name_len,
+ char* child_key_name,
+ uint child_key_name_len)
+{
+ const dict_index_t* err_index;
+
+ ut_a(m_prebuilt->trx != NULL);
+ ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N);
+
+ err_index = trx_get_error_info(m_prebuilt->trx);
+
+ if (err_index == NULL) {
+ return(false);
+ }
+ /* else */
+
+ /* copy table name (and convert from filename-safe encoding to
+ system_charset_info) */
+ char* p = strchr(err_index->table->name.m_name, '/');
+
+ /* strip ".../" prefix if any */
+ if (p != NULL) {
+ p++;
+ } else {
+ p = err_index->table->name.m_name;
+ }
+
+ size_t len;
+
+ len = filename_to_tablename(p, child_table_name, child_table_name_len);
+
+ child_table_name[len] = '\0';
+
+ /* copy index name */
+ snprintf(child_key_name, child_key_name_len, "%s",
+ err_index->name());
+
+ return(true);
+}
+
+/*******************************************************************//**
+Compares two 'refs'. A 'ref' is the (internal) primary key value of the row.
+If there is no explicitly declared non-null unique key or a primary key, then
+InnoDB internally uses the row id as the primary key.
+@return < 0 if ref1 < ref2, 0 if equal, else > 0 */
+
+int
+ha_innobase::cmp_ref(
+/*=================*/
+ const uchar* ref1, /*!< in: an (internal) primary key value in the
+ MySQL key value format */
+ const uchar* ref2) /*!< in: an (internal) primary key value in the
+ MySQL key value format */
+{
+ enum_field_types mysql_type;
+ Field* field;
+ KEY_PART_INFO* key_part;
+ KEY_PART_INFO* key_part_end;
+ uint len1;
+ uint len2;
+ int result;
+
+ if (m_prebuilt->clust_index_was_generated) {
+ /* The 'ref' is an InnoDB row id */
+
+ return(memcmp(ref1, ref2, DATA_ROW_ID_LEN));
+ }
+
+ /* Do a type-aware comparison of primary key fields. PK fields
+ are always NOT NULL, so no checks for NULL are performed. */
+
+ key_part = table->key_info[table->s->primary_key].key_part;
+
+ key_part_end = key_part
+ + table->key_info[table->s->primary_key].user_defined_key_parts;
+
+ for (; key_part != key_part_end; ++key_part) {
+ field = key_part->field;
+ mysql_type = field->type();
+
+ if (mysql_type == MYSQL_TYPE_TINY_BLOB
+ || mysql_type == MYSQL_TYPE_MEDIUM_BLOB
+ || mysql_type == MYSQL_TYPE_BLOB
+ || mysql_type == MYSQL_TYPE_LONG_BLOB) {
+
+ /* In the MySQL key value format, a column prefix of
+ a BLOB is preceded by a 2-byte length field */
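+			/* For example, a 300-byte prefix value is
+			preceded by the bytes 0x2C 0x01 (300 in
+			little-endian; illustrative value). */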
+
+ len1 = innobase_read_from_2_little_endian(ref1);
+ len2 = innobase_read_from_2_little_endian(ref2);
+
+ result = ((Field_blob*) field)->cmp(
+ ref1 + 2, len1, ref2 + 2, len2);
+ } else {
+ result = field->key_cmp(ref1, ref2);
+ }
+
+ if (result) {
+
+ return(result);
+ }
+
+ ref1 += key_part->store_length;
+ ref2 += key_part->store_length;
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Ask InnoDB if a query to a table can be cached.
+@return TRUE if query caching of the table is permitted */
+
+my_bool
+ha_innobase::register_query_cache_table(
+/*====================================*/
+ THD* thd, /*!< in: user thread handle */
+ const char* table_key, /*!< in: normalized path to the
+ table */
+ uint key_length, /*!< in: length of the normalized
+ path to the table */
+ qc_engine_callback*
+ call_back, /*!< out: pointer to function for
+ checking if query caching
+ is permitted */
+ ulonglong *engine_data) /*!< in/out: data to call_back */
+{
+ *engine_data = 0;
+ *call_back = innobase_query_caching_of_table_permitted;
+
+ return(innobase_query_caching_of_table_permitted(
+ thd, table_key,
+ static_cast<uint>(key_length),
+ engine_data));
+}
+
+/******************************************************************//**
+This function is used to find the storage length in bytes of the first n
+characters for prefix indexes using a multibyte character set. The function
+looks up the charset information and returns the length in bytes of the
+first n characters of the index field.
+@return number of bytes occupied by the first n characters */
+ulint
+innobase_get_at_most_n_mbchars(
+/*===========================*/
+ ulint charset_id, /*!< in: character set id */
+ ulint prefix_len, /*!< in: prefix length in bytes of the index
+ (this has to be divided by mbmaxlen to get the
+ number of CHARACTERS n in the prefix) */
+ ulint data_len, /*!< in: length of the string in bytes */
+ const char* str) /*!< in: character string */
+{
+ ulint char_length; /*!< character length in bytes */
+ ulint n_chars; /*!< number of characters in prefix */
+ CHARSET_INFO* charset; /*!< charset used in the field */
+
+ charset = get_charset((uint) charset_id, MYF(MY_WME));
+
+ ut_ad(charset);
+ ut_ad(charset->mbmaxlen);
+
+ /* Calculate how many characters at most the prefix index contains */
+
+ n_chars = prefix_len / charset->mbmaxlen;
+
+ /* If the charset is multi-byte, then we must find the length of the
+	first at most n chars in the string. If the string contains fewer
+	characters than n, then we return the length up to the end of the last
+ character. */
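+	/* For example (illustrative): with utf8mb3 (mbmaxlen=3) and
+	prefix_len=9, n_chars = 3; for the 3-byte ASCII string "abc",
+	charpos() returns 3, so 3 bytes are returned rather than the
+	full 9-byte prefix length. */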
+
+ if (charset->mbmaxlen > 1) {
+ /* charpos() returns the byte length of the first n_chars
+ characters, or a value bigger than the length of str, if
+ there were not enough full characters in str.
+
+ Why does the code below work:
+ Suppose that we are looking for n UTF-8 characters.
+
+ 1) If the string is long enough, then the prefix contains at
+ least n complete UTF-8 characters + maybe some extra
+ characters + an incomplete UTF-8 character. No problem in
+ this case. The function returns the pointer to the
+ end of the nth character.
+
+ 2) If the string is not long enough, then the string contains
+ the complete value of a column, that is, only complete UTF-8
+ characters, and we can store in the column prefix index the
+ whole string. */
+
+ char_length= charset->charpos(str, str + data_len, n_chars);
+ if (char_length > data_len) {
+ char_length = data_len;
+ }
+ } else if (data_len < prefix_len) {
+
+ char_length = data_len;
+
+ } else {
+
+ char_length = prefix_len;
+ }
+
+ return(char_length);
+}
+
+/*******************************************************************//**
+This function is used to prepare an X/Open XA distributed transaction.
+@return 0 or error number */
+static
+int
+innobase_xa_prepare(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of
+ the user whose XA transaction should
+ be prepared */
+ bool prepare_trx) /*!< in: true - prepare transaction
+ false - the current SQL statement
+ ended */
+{
+ trx_t* trx = check_trx_exists(thd);
+
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ thd_get_xid(thd, (MYSQL_XID*) trx->xid);
+
+ if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
+
+ sql_print_error("Transaction not registered for MariaDB 2PC,"
+ " but transaction is active");
+ }
+
+ if (prepare_trx
+ || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+
+ /* We were instructed to prepare the whole transaction, or
+ this is an SQL statement end and autocommit is on */
+
+ ut_ad(trx_is_registered_for_2pc(trx));
+
+ trx_prepare_for_mysql(trx);
+ } else {
+ /* We just mark the SQL statement ended and do not do a
+ transaction prepare */
+
+ /* If we had reserved the auto-inc lock for some
+ table in this SQL statement we release it now */
+
+ lock_unlock_table_autoinc(trx);
+
+ /* Store the current undo_no of the transaction so that we
+ know where to roll back if we have to roll back the next
+ SQL statement */
+
+ trx_mark_sql_stat_end(trx);
+ }
+
+ if (thd_sql_command(thd) != SQLCOM_XA_PREPARE
+ && (prepare_trx
+ || !thd_test_options(
+ thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+
+ /* For mysqlbackup to work the order of transactions in binlog
+ and InnoDB must be the same. Consider the situation
+
+ thread1> prepare; write to binlog; ...
+ <context switch>
+ thread2> prepare; write to binlog; commit
+ thread1> ... commit
+
+ The server guarantees that writes to the binary log
+ and commits are in the same order, so we do not have
+ to handle this case. */
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+This function is used to recover X/Open XA distributed transactions.
+@return number of prepared transactions stored in xid_list */
+static
+int
+innobase_xa_recover(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid_list,/*!< in/out: prepared transactions */
+ uint len) /*!< in: number of slots in xid_list */
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (len == 0 || xid_list == NULL) {
+
+ return(0);
+ }
+
+ return(trx_recover_for_mysql(xid_list, len));
+}
+
+/*******************************************************************//**
+This function is used to commit one X/Open XA distributed transaction
+which is in the prepared state
+@return 0 or error number */
+static
+int
+innobase_commit_by_xid(
+/*===================*/
+ handlerton* hton,
+ XID* xid) /*!< in: X/Open XA transaction identification */
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ DBUG_EXECUTE_IF("innobase_xa_fail",
+ return XAER_RMFAIL;);
+
+ if (high_level_read_only) {
+ return(XAER_RMFAIL);
+ }
+
+ if (trx_t* trx = trx_get_trx_by_xid(xid)) {
+ /* use cases are: disconnected xa, slave xa, recovery */
+ innobase_commit_low(trx);
+ ut_ad(trx->mysql_thd == NULL);
+ trx_deregister_from_2pc(trx);
+ ut_ad(!trx->will_lock); /* trx cache requirement */
+ trx->free();
+
+ return(XA_OK);
+ } else {
+ return(XAER_NOTA);
+ }
+}
+
+/** This function is used to rollback one X/Open XA distributed transaction
+which is in the prepared state
+
+@param[in] hton InnoDB handlerton
+@param[in] xid X/Open XA transaction identification
+
+@return 0 or error number */
+int innobase_rollback_by_xid(handlerton* hton, XID* xid)
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ DBUG_EXECUTE_IF("innobase_xa_fail",
+ return XAER_RMFAIL;);
+
+ if (high_level_read_only) {
+ return(XAER_RMFAIL);
+ }
+
+ if (trx_t* trx = trx_get_trx_by_xid(xid)) {
+#ifdef WITH_WSREP
+ /* If a wsrep transaction is being rolled back during
+ the recovery, we must clear the xid in order to avoid
+ writing serialisation history for rolled back transaction. */
+ if (wsrep_is_wsrep_xid(trx->xid)) {
+ trx->xid->null();
+ }
+#endif /* WITH_WSREP */
+ int ret = innobase_rollback_trx(trx);
+ ut_ad(!trx->will_lock);
+ trx->free();
+
+ return(ret);
+ } else {
+ return(XAER_NOTA);
+ }
+}
+
+bool
+ha_innobase::check_if_incompatible_data(
+/*====================================*/
+ HA_CREATE_INFO* info,
+ uint table_changes)
+{
+ ha_table_option_struct *param_old, *param_new;
+
+ /* Cache engine specific options */
+ param_new = info->option_struct;
+ param_old = table->s->option_struct;
+
+ innobase_copy_frm_flags_from_create_info(m_prebuilt->table, info);
+
+ if (table_changes != IS_EQUAL_YES) {
+
+ return(COMPATIBLE_DATA_NO);
+ }
+
+ /* Check that auto_increment value was not changed */
+ if ((info->used_fields & HA_CREATE_USED_AUTO)
+ && info->auto_increment_value != 0) {
+
+ return(COMPATIBLE_DATA_NO);
+ }
+
+ /* Check that row format didn't change */
+ if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT)
+ && info->row_type != get_row_type()) {
+
+ return(COMPATIBLE_DATA_NO);
+ }
+
+ /* Specifying KEY_BLOCK_SIZE requests a rebuild of the table. */
+ if (info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE) {
+ return(COMPATIBLE_DATA_NO);
+ }
+
+	/* Changes to engine-specific table options request a rebuild of the table. */
+ if (param_new->page_compressed != param_old->page_compressed ||
+ param_new->page_compression_level != param_old->page_compression_level)
+ {
+ return(COMPATIBLE_DATA_NO);
+ }
+
+ return(COMPATIBLE_DATA_YES);
+}
+
+/****************************************************************//**
+Update the system variable innodb_io_capacity_max using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_io_capacity_max_update(
+/*===========================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*, void*,
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ ulong in_val = *static_cast<const ulong*>(save);
+
+ if (in_val < srv_io_capacity) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Setting innodb_io_capacity_max %lu"
+ " lower than innodb_io_capacity %lu.",
+ in_val, srv_io_capacity);
+
+ srv_io_capacity = in_val;
+
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Setting innodb_io_capacity to %lu",
+ srv_io_capacity);
+ }
+
+ srv_max_io_capacity = in_val;
+}
+
+/****************************************************************//**
+Update the system variable innodb_io_capacity using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_io_capacity_update(
+/*======================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*, void*,
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ ulong in_val = *static_cast<const ulong*>(save);
+
+ if (in_val > srv_max_io_capacity) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Setting innodb_io_capacity to %lu"
+ " higher than innodb_io_capacity_max %lu",
+ in_val, srv_max_io_capacity);
+
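+		/* Raise the maximum to twice the new value, unless the
+		most significant bit of in_val is already set, in which
+		case doubling would overflow; then use in_val itself as
+		the new maximum. */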
+ srv_max_io_capacity = (in_val & ~(~0UL >> 1))
+ ? in_val : in_val * 2;
+
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+				    "Setting innodb_io_capacity_max to %lu",
+ srv_max_io_capacity);
+ }
+
+ srv_io_capacity = in_val;
+}
+
+/****************************************************************//**
+Update the system variable innodb_max_dirty_pages_pct using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_max_dirty_pages_pct_update(
+/*==============================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*, void*,
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ double in_val = *static_cast<const double*>(save);
+ if (in_val < srv_max_dirty_pages_pct_lwm) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "innodb_max_dirty_pages_pct cannot be"
+ " set lower than"
+ " innodb_max_dirty_pages_pct_lwm.");
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Lowering"
+				    " innodb_max_dirty_pages_pct_lwm to %lf",
+ in_val);
+
+ srv_max_dirty_pages_pct_lwm = in_val;
+ }
+
+ srv_max_buf_pool_modified_pct = in_val;
+ pthread_cond_signal(&buf_pool.do_flush_list);
+}
+
+/****************************************************************//**
+Update the system variable innodb_max_dirty_pages_pct_lwm using the
+"saved" value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_max_dirty_pages_pct_lwm_update(
+/*==================================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*, void*,
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ double in_val = *static_cast<const double*>(save);
+ if (in_val > srv_max_buf_pool_modified_pct) {
+ in_val = srv_max_buf_pool_modified_pct;
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "innodb_max_dirty_pages_pct_lwm"
+ " cannot be set higher than"
+ " innodb_max_dirty_pages_pct.");
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+				    "Setting innodb_max_dirty_pages_pct_lwm"
+ " to %lf",
+ in_val);
+ }
+
+ srv_max_dirty_pages_pct_lwm = in_val;
+ pthread_cond_signal(&buf_pool.do_flush_list);
+}
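+
+/* Example (hypothetical session): with innodb_max_dirty_pages_pct = 90,
+  SET GLOBAL innodb_max_dirty_pages_pct_lwm = 95;
+is clamped to 90 with a warning, because the low-water mark may not
+exceed innodb_max_dirty_pages_pct. */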
+
+/*************************************************************//**
+Do not allow innodb_fast_shutdown=0 to be set if the purge threads
+have already exited.
+@return 0 if innodb_fast_shutdown can be set */
+static
+int
+fast_shutdown_validate(
+/*=============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ if (check_sysvar_int(thd, var, save, value)) {
+ return(1);
+ }
+
+ uint new_val = *reinterpret_cast<uint*>(save);
+
+ if (srv_fast_shutdown && !new_val
+ && !srv_read_only_mode && abort_loop) {
+ return(1);
+ }
+
+ return(0);
+}
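+
+/* Example (hypothetical session): once a shutdown is in progress
+(abort_loop is set), the purge tasks may already have exited, so changing
+innodb_fast_shutdown from a nonzero value to 0, as in
+  SET GLOBAL innodb_fast_shutdown = 0;
+is rejected by this validation function. */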
+
+/*************************************************************//**
+Check whether a valid argument was given to innobase_*_stopword_table.
+This function is registered as a callback with MySQL.
+@return 0 for valid stopword table */
+static
+int
+innodb_stopword_table_validate(
+/*===========================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*,
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* stopword_table_name;
+ char buff[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof(buff);
+ trx_t* trx;
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ stopword_table_name = value->val_str(value, buff, &len);
+
+ trx = check_trx_exists(thd);
+
+ row_mysql_lock_data_dictionary(trx);
+
+	/* If a stopword table name was supplied, validate that the
+	table exists and is of the right format */
+ int ret = stopword_table_name && !fts_valid_stopword_table(
+ stopword_table_name);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ if (!ret) {
+ if (stopword_table_name == buff) {
+ ut_ad(static_cast<size_t>(len) < sizeof buff);
+ stopword_table_name = thd_strmake(thd,
+ stopword_table_name,
+ len);
+ }
+
+ *static_cast<const char**>(save) = stopword_table_name;
+ }
+
+ return(ret);
+}
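+
+/* Example (hypothetical session): assuming a table test.my_stopwords with
+a single VARCHAR column named "value" exists, the following passes the
+validation above:
+  SET GLOBAL innodb_ft_server_stopword_table = 'test/my_stopwords'; */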
+
+extern void buf_resize_start();
+
+/** Update the system variable innodb_buffer_pool_size using the "saved"
+value. This function is registered as a callback with MySQL.
+@param[in] save immediate result from check function */
+static
+void
+innodb_buffer_pool_size_update(THD*,st_mysql_sys_var*,void*, const void* save)
+{
+ longlong in_val = *static_cast<const longlong*>(save);
+
+ snprintf(export_vars.innodb_buffer_pool_resize_status,
+ sizeof(export_vars.innodb_buffer_pool_resize_status),
+ "Requested to resize buffer pool.");
+
+ buf_resize_start();
+
+ ib::info() << export_vars.innodb_buffer_pool_resize_status
+ << " (new size: " << in_val << " bytes)";
+}
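+
+/* Example (hypothetical session): request a 256 MiB buffer pool; the
+actual resizing happens asynchronously after buf_resize_start():
+  SET GLOBAL innodb_buffer_pool_size = 268435456; */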
+
+/** The latest assigned innodb_ft_aux_table name */
+static char* innodb_ft_aux_table;
+
+/** Update innodb_ft_aux_table_id on SET GLOBAL innodb_ft_aux_table.
+@param[in,out] thd connection
+@param[out] save new value of innodb_ft_aux_table
+@param[in] value user-specified value */
+static int innodb_ft_aux_table_validate(THD *thd, st_mysql_sys_var*,
+ void* save, st_mysql_value* value)
+{
+ char buf[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof buf;
+
+ if (const char* table_name = value->val_str(value, buf, &len)) {
+ if (dict_table_t* table = dict_table_open_on_name(
+ table_name, FALSE, TRUE, DICT_ERR_IGNORE_NONE)) {
+ const table_id_t id = dict_table_has_fts_index(table)
+ ? table->id : 0;
+ dict_table_close(table, FALSE, FALSE);
+ if (id) {
+ innodb_ft_aux_table_id = id;
+ if (table_name == buf) {
+ ut_ad(static_cast<size_t>(len)
+ < sizeof buf);
+ table_name = thd_strmake(thd,
+ table_name,
+ len);
+ }
+
+ *static_cast<const char**>(save) = table_name;
+ return 0;
+ }
+ }
+
+ return 1;
+ } else {
+ *static_cast<char**>(save) = NULL;
+ innodb_ft_aux_table_id = 0;
+ return 0;
+ }
+}
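+
+/* Example (hypothetical session): assuming test.articles has a FULLTEXT
+index, this selects its auxiliary tables for the INFORMATION_SCHEMA
+INNODB_FT_* views:
+  SET GLOBAL innodb_ft_aux_table = 'test/articles'; */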
+
+#ifdef BTR_CUR_HASH_ADAPT
+/****************************************************************//**
+Update the system variable innodb_adaptive_hash_index using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_adaptive_hash_index_update(THD*, st_mysql_sys_var*, void*,
+ const void* save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ if (*(my_bool*) save) {
+ btr_search_enable();
+ } else {
+ btr_search_disable();
+ }
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/****************************************************************//**
+Update the system variable innodb_cmp_per_index using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_cmp_per_index_update(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+ /* Reset the stats whenever we enable the table
+ INFORMATION_SCHEMA.innodb_cmp_per_index. */
+ if (!srv_cmp_per_index_enabled && *(my_bool*) save) {
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ page_zip_reset_stat_per_index();
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ }
+
+ srv_cmp_per_index_enabled = !!(*(my_bool*) save);
+}
+
+/****************************************************************//**
+Update the system variable innodb_old_blocks_pct using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_old_blocks_pct_update(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ uint ratio = buf_LRU_old_ratio_update(*static_cast<const uint*>(save),
+ true);
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ innobase_old_blocks_pct = ratio;
+}
+
+/****************************************************************//**
+Update the system variable innodb_change_buffer_max_size using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_change_buffer_max_size_update(THD*, st_mysql_sys_var*, void*,
+ const void* save)
+{
+ srv_change_buffer_max_size = *static_cast<const uint*>(save);
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ ibuf_max_size_update(srv_change_buffer_max_size);
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+#ifdef UNIV_DEBUG
+static uint srv_fil_make_page_dirty_debug = 0;
+static uint srv_saved_page_number_debug;
+
+/****************************************************************//**
+Make the first page of given user tablespace dirty. */
+static
+void
+innodb_make_page_dirty(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+ mtr_t mtr;
+ uint space_id = *static_cast<const uint*>(save);
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ fil_space_t* space = fil_space_t::get(space_id);
+
+ if (space == NULL) {
+func_exit_no_space:
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ return;
+ }
+
+ if (srv_saved_page_number_debug >= space->size) {
+func_exit:
+ space->release();
+ goto func_exit_no_space;
+ }
+
+ mtr.start();
+ mtr.set_named_space(space);
+
+ buf_block_t* block = buf_page_get(
+ page_id_t(space_id, srv_saved_page_number_debug),
+ space->zip_size(), RW_X_LATCH, &mtr);
+
+ if (block != NULL) {
+ ib::info() << "Dirtying page: " << block->page.id();
+ mtr.write<1,mtr_t::FORCED>(*block,
+ block->frame + FIL_PAGE_SPACE_ID,
+ block->frame[FIL_PAGE_SPACE_ID]);
+ }
+ mtr.commit();
+ log_write_up_to(mtr.commit_lsn(), true);
+ goto func_exit;
+}
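+
+/* Example (hypothetical debug session), assuming the MYSQL_SYSVAR
+registrations elsewhere in this file expose the two variables above as
+innodb_saved_page_number_debug and innodb_fil_make_page_dirty_debug:
+  SET GLOBAL innodb_saved_page_number_debug = 0;
+  SET GLOBAL innodb_fil_make_page_dirty_debug = 42;
+would dirty page 0 of tablespace 42. */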
+#endif // UNIV_DEBUG
+
+/****************************************************************//**
+Update the monitor counter according to the "set_option", turn
+on/off or reset specified monitor counter. */
+static
+void
+innodb_monitor_set_option(
+/*======================*/
+ const monitor_info_t* monitor_info,/*!< in: monitor info for the monitor
+ to set */
+ mon_option_t set_option) /*!< in: Turn on/off reset the
+ counter */
+{
+ monitor_id_t monitor_id = monitor_info->monitor_id;
+
+	/* If the module type is MONITOR_GROUP_MODULE, it cannot be
+	turned on/off individually. This function must never be
+	used to set options for it */
+ ut_a(!(monitor_info->monitor_type & MONITOR_GROUP_MODULE));
+
+ switch (set_option) {
+ case MONITOR_TURN_ON:
+ MONITOR_ON(monitor_id);
+ MONITOR_INIT(monitor_id);
+ MONITOR_SET_START(monitor_id);
+
+		/* If the monitor to be turned on uses an
+		existing monitor counter (status variable),
+		do special processing to remember the existing
+		counter value. */
+ if (monitor_info->monitor_type & MONITOR_EXISTING) {
+ srv_mon_process_existing_counter(
+ monitor_id, MONITOR_TURN_ON);
+ }
+
+ if (MONITOR_IS_ON(MONITOR_LATCHES)) {
+
+ mutex_monitor.enable();
+ }
+ break;
+
+ case MONITOR_TURN_OFF:
+ if (monitor_info->monitor_type & MONITOR_EXISTING) {
+ srv_mon_process_existing_counter(
+ monitor_id, MONITOR_TURN_OFF);
+ }
+
+ MONITOR_OFF(monitor_id);
+ MONITOR_SET_OFF(monitor_id);
+
+ if (!MONITOR_IS_ON(MONITOR_LATCHES)) {
+
+ mutex_monitor.disable();
+ }
+ break;
+
+ case MONITOR_RESET_VALUE:
+ srv_mon_reset(monitor_id);
+
+ if (monitor_id == (MONITOR_LATCHES)) {
+
+ mutex_monitor.reset();
+ }
+ break;
+
+ case MONITOR_RESET_ALL_VALUE:
+ srv_mon_reset_all(monitor_id);
+ mutex_monitor.reset();
+ break;
+
+ default:
+ ut_error;
+ }
+}
+
+/****************************************************************//**
+Find matching InnoDB monitor counters and update their status
+according to the "set_option", turn on/off or reset specified
+monitor counter. */
+static
+void
+innodb_monitor_update_wildcard(
+/*===========================*/
+ const char* name, /*!< in: monitor name to match */
+ mon_option_t set_option) /*!< in: the set option, whether
+ to turn on/off or reset the counter */
+{
+ ut_a(name);
+
+ for (ulint use = 0; use < NUM_MONITOR; use++) {
+ ulint type;
+ monitor_id_t monitor_id = static_cast<monitor_id_t>(use);
+ monitor_info_t* monitor_info;
+
+ if (!innobase_wildcasecmp(
+ srv_mon_get_name(monitor_id), name)) {
+ monitor_info = srv_mon_get_info(monitor_id);
+
+ type = monitor_info->monitor_type;
+
+ /* If the monitor counter is of MONITOR_MODULE
+ type, skip it. Except for those also marked with
+ MONITOR_GROUP_MODULE flag, which can be turned
+ on only as a module. */
+ if (!(type & MONITOR_MODULE)
+ && !(type & MONITOR_GROUP_MODULE)) {
+ innodb_monitor_set_option(monitor_info,
+ set_option);
+ }
+
+			/* Counters marked with MONITOR_GROUP_MODULE need
+			special handling: turn on the whole module if any
+			one of them gets here. Currently, only
+			"module_buf_page" is marked with MONITOR_GROUP_MODULE */
+ if (type & MONITOR_GROUP_MODULE) {
+ if ((monitor_id >= MONITOR_MODULE_BUF_PAGE)
+ && (monitor_id < MONITOR_MODULE_OS)) {
+ if (set_option == MONITOR_TURN_ON
+ && MONITOR_IS_ON(
+ MONITOR_MODULE_BUF_PAGE)) {
+ continue;
+ }
+
+ srv_mon_set_module_control(
+ MONITOR_MODULE_BUF_PAGE,
+ set_option);
+ } else {
+					/* If a new monitor is added with
+					MONITOR_GROUP_MODULE, it needs
+					to be handled here. */
+ ut_ad(0);
+ }
+ }
+ }
+ }
+}
+
+/*************************************************************//**
+Given a configuration variable name, find corresponding monitor counter
+and return its monitor ID if found.
+@return monitor ID if found, MONITOR_NO_MATCH if there is no match */
+static
+ulint
+innodb_monitor_id_by_name_get(
+/*==========================*/
+	const char*	name)	/*!< in: monitor counter name */
+{
+ ut_a(name);
+
+	/* Search for the wildcard character '%' in the name; if it is
+	found, we treat the name as a wildcard match. We do not search for
+	the single-character wildcard '_' since our monitor names already
+	contain that character. To avoid confusion, we require the user to
+	include at least one '%' character to activate the wildcard search. */
+ if (strchr(name, '%')) {
+ return(MONITOR_WILDCARD_MATCH);
+ }
+
+ /* Not wildcard match, check for an exact match */
+ for (ulint i = 0; i < NUM_MONITOR; i++) {
+ if (!innobase_strcasecmp(
+ name, srv_mon_get_name(static_cast<monitor_id_t>(i)))) {
+ return(i);
+ }
+ }
+
+ return(MONITOR_NO_MATCH);
+}
+/*************************************************************//**
+Validate that the passed in monitor name matches at least one
+monitor counter name with wildcard compare.
+@return TRUE if at least one monitor name matches */
+static
+ibool
+innodb_monitor_validate_wildcard_name(
+/*==================================*/
+	const char*	name)	/*!< in: monitor counter name */
+{
+ for (ulint i = 0; i < NUM_MONITOR; i++) {
+ if (!innobase_wildcasecmp(
+ srv_mon_get_name(static_cast<monitor_id_t>(i)), name)) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+/*************************************************************//**
+Validate the passed in monitor name, find and save the
+corresponding monitor name in the function parameter "save".
+@return 0 if monitor name is valid */
+static
+int
+innodb_monitor_valid_byname(
+/*========================*/
+ void* save, /*!< out: immediate result
+ for update function */
+ const char* name) /*!< in: incoming monitor name */
+{
+ ulint use;
+ monitor_info_t* monitor_info;
+
+ if (!name) {
+ return(1);
+ }
+
+ use = innodb_monitor_id_by_name_get(name);
+
+	/* No monitor name matches, nor is it a wildcard match */
+ if (use == MONITOR_NO_MATCH) {
+ return(1);
+ }
+
+ if (use < NUM_MONITOR) {
+ monitor_info = srv_mon_get_info((monitor_id_t) use);
+
+ /* If the monitor counter is marked with
+ MONITOR_GROUP_MODULE flag, then this counter
+ cannot be turned on/off individually, instead
+ it shall be turned on/off as a group using
+ its module name */
+ if ((monitor_info->monitor_type & MONITOR_GROUP_MODULE)
+ && (!(monitor_info->monitor_type & MONITOR_MODULE))) {
+ sql_print_warning(
+ "Monitor counter '%s' cannot"
+ " be turned on/off individually."
+ " Please use its module name"
+ " to turn on/off the counters"
+ " in the module as a group.\n",
+ name);
+
+ return(1);
+ }
+
+ } else {
+ ut_a(use == MONITOR_WILDCARD_MATCH);
+
+ /* For wildcard match, if there is not a single monitor
+ counter name that matches, treat it as an invalid
+ value for the system configuration variables */
+ if (!innodb_monitor_validate_wildcard_name(name)) {
+ return(1);
+ }
+ }
+
+ /* Save the configure name for innodb_monitor_update() */
+ *static_cast<const char**>(save) = name;
+
+ return(0);
+}
+/*************************************************************//**
+Validate passed-in "value" is a valid monitor counter name.
+This function is registered as a callback with MySQL.
+@return 0 for valid name */
+static
+int
+innodb_monitor_validate(
+/*====================*/
+ THD*, st_mysql_sys_var*,
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* name;
+ char* monitor_name;
+ char buff[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof(buff);
+ int ret;
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ name = value->val_str(value, buff, &len);
+
+	/* The returned name could point to memory owned by MySQL
+	or to buff[]. Always dup the name into memory allocated
+	by the server, so that we can access it in the other
+	callback function innodb_monitor_update() and free it
+	appropriately */
+ if (name) {
+ monitor_name = my_strdup(PSI_INSTRUMENT_ME,
+ name, MYF(0));
+ } else {
+ return(1);
+ }
+
+ ret = innodb_monitor_valid_byname(save, monitor_name);
+
+ if (ret) {
+ /* Validation failed */
+ my_free(monitor_name);
+ } else {
+		/* monitor_name will be freed in the separate callback
+		function innodb_monitor_update(). Assert that "save"
+		points to the "monitor_name" variable */
+ ut_ad(*static_cast<char**>(save) == monitor_name);
+ }
+
+ return(ret);
+}
+
+/****************************************************************//**
+Update the system variable innodb_enable(disable/reset/reset_all)_monitor
+according to the "set_option" and turn on/off or reset specified monitor
+counter. */
+static
+void
+innodb_monitor_update(
+/*==================*/
+ THD* thd, /*!< in: thread handle */
+ void* var_ptr, /*!< out: where the
+ formal string goes */
+ const void* save, /*!< in: immediate result
+ from check function */
+ mon_option_t set_option, /*!< in: the set option,
+ whether to turn on/off or
+ reset the counter */
+ ibool free_mem) /*!< in: whether we will
+ need to free the memory */
+{
+ monitor_info_t* monitor_info;
+ ulint monitor_id;
+ ulint err_monitor = 0;
+ const char* name;
+
+ ut_a(save != NULL);
+
+ name = *static_cast<const char*const*>(save);
+
+ if (!name) {
+ monitor_id = MONITOR_DEFAULT_START;
+ } else {
+ monitor_id = innodb_monitor_id_by_name_get(name);
+
+ /* Double check we have a valid monitor ID */
+ if (monitor_id == MONITOR_NO_MATCH) {
+ return;
+ }
+ }
+
+ if (monitor_id == MONITOR_DEFAULT_START) {
+		/* If the user set the variable to "default", we
+		print a message and make this set operation a no-op.
+		The check is made here because "SET ... = DEFAULT"
+		does not go through the validation function */
+ if (thd) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_NO_DEFAULT,
+ "Default value is not defined for"
+ " this set option. Please specify"
+ " correct counter or module name.");
+ } else {
+ sql_print_error(
+ "Default value is not defined for"
+ " this set option. Please specify"
+ " correct counter or module name.\n");
+ }
+
+ if (var_ptr) {
+ *(const char**) var_ptr = NULL;
+ }
+ } else if (monitor_id == MONITOR_WILDCARD_MATCH) {
+ innodb_monitor_update_wildcard(name, set_option);
+ } else {
+ monitor_info = srv_mon_get_info(
+ static_cast<monitor_id_t>(monitor_id));
+
+ ut_a(monitor_info);
+
+		/* If the monitor is already turned on, someone could
+		already be collecting monitor data; exit and ask the user
+		to turn off the monitor before turning it on again. */
+ if (set_option == MONITOR_TURN_ON
+ && MONITOR_IS_ON(monitor_id)) {
+ err_monitor = monitor_id;
+ goto exit;
+ }
+
+ if (var_ptr) {
+ *(const char**) var_ptr = monitor_info->monitor_name;
+ }
+
+		/* Depending on whether the monitor name refers to a
+		module or a counter, process all counters in the
+		module or the individual counter. */
+ if (monitor_info->monitor_type & MONITOR_MODULE) {
+ srv_mon_set_module_control(
+ static_cast<monitor_id_t>(monitor_id),
+ set_option);
+ } else {
+ innodb_monitor_set_option(monitor_info, set_option);
+ }
+ }
+exit:
+	/* err_monitor is only set when we are trying to turn on a
+	monitor that has already been turned on. Print related
+	information */
+ if (err_monitor) {
+ sql_print_warning("InnoDB: Monitor %s is already enabled.",
+ srv_mon_get_name((monitor_id_t) err_monitor));
+ }
+
+ if (free_mem && name) {
+ my_free((void*) name);
+ }
+
+ return;
+}
+
+/** Validate SET GLOBAL innodb_buffer_pool_filename.
+On Windows, file names with colon (:) are not allowed.
+@param thd connection
+@param save &srv_buf_dump_filename
+@param value new value to be validated
+@return 0 for valid name */
+static int innodb_srv_buf_dump_filename_validate(THD *thd, st_mysql_sys_var*,
+ void *save,
+ st_mysql_value *value)
+{
+ char buff[OS_FILE_MAX_PATH];
+ int len= sizeof buff;
+
+ if (const char *buf_name= value->val_str(value, buff, &len))
+ {
+#ifdef _WIN32
+ if (!is_filename_allowed(buf_name, len, FALSE))
+ {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "InnoDB: innodb_buffer_pool_filename "
+ "cannot have colon (:) in the file name.");
+ return 1;
+ }
+#endif /* _WIN32 */
+ if (buf_name == buff)
+ {
+ ut_ad(static_cast<size_t>(len) < sizeof buff);
+ buf_name= thd_strmake(thd, buf_name, len);
+ }
+
+ *static_cast<const char**>(save)= buf_name;
+ return 0;
+ }
+
+ return 1;
+}
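+
+/* Example (hypothetical session): this passes the validation on all
+platforms, whereas a name containing a colon, such as 'C:pool.dump',
+would be rejected on Windows:
+  SET GLOBAL innodb_buffer_pool_filename = 'ib_buffer_pool.bak'; */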
+
+#ifdef UNIV_DEBUG
+static char* srv_buffer_pool_evict;
+
+/****************************************************************//**
+Evict all uncompressed pages of compressed tables from the buffer pool.
+Keep the compressed pages in the buffer pool.
+@return whether all uncompressed pages were evicted */
+static bool innodb_buffer_pool_evict_uncompressed()
+{
+ bool all_evicted = true;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+ block != NULL; ) {
+ buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
+ ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->in_unzip_LRU_list);
+ ut_ad(block->page.in_LRU_list);
+
+ if (!buf_LRU_free_page(&block->page, false)) {
+ all_evicted = false;
+ block = prev_block;
+ } else {
+ /* Because buf_LRU_free_page() may release
+ and reacquire buf_pool.mutex, prev_block
+ may be invalid. */
+ block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return(all_evicted);
+}
+
+/****************************************************************//**
+Called on SET GLOBAL innodb_buffer_pool_evict=...
+Handles some values specially, to evict pages from the buffer pool.
+SET GLOBAL innodb_buffer_pool_evict='uncompressed'
+evicts all uncompressed page frames of compressed tablespaces. */
+static
+void
+innodb_buffer_pool_evict_update(THD*, st_mysql_sys_var*, void*,
+ const void* save)
+{
+ if (const char* op = *static_cast<const char*const*>(save)) {
+ if (!strcmp(op, "uncompressed")) {
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ for (uint tries = 0; tries < 10000; tries++) {
+ if (innodb_buffer_pool_evict_uncompressed()) {
+ mysql_mutex_lock(
+ &LOCK_global_system_variables);
+ return;
+ }
+
+ os_thread_sleep(10000);
+ }
+
+ /* We failed to evict all uncompressed pages. */
+ ut_ad(0);
+ }
+ }
+}
+#endif /* UNIV_DEBUG */
+
+/****************************************************************//**
+Update the system variable innodb_monitor_enable and enable
+specified monitor counter.
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_enable_monitor_update(
+/*=========================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*,
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_ON, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_disable and turn
+off specified monitor counter. */
+static
+void
+innodb_disable_monitor_update(
+/*==========================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*,
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_OFF, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_reset and reset
+specified monitor counter(s).
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_reset_monitor_update(
+/*========================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*,
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_VALUE, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_reset_all and reset
+all value-related monitor counters.
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_reset_all_monitor_update(
+/*============================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*,
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_ALL_VALUE,
+ TRUE);
+}
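+
+/* Example (hypothetical session): the four update callbacks above are
+driven by the corresponding system variables, e.g.:
+  SET GLOBAL innodb_monitor_enable    = 'module_buf_page';
+  SET GLOBAL innodb_monitor_reset     = 'buffer%';
+  SET GLOBAL innodb_monitor_disable   = 'module_buf_page';
+  SET GLOBAL innodb_monitor_reset_all = '%'; */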
+
+static
+void
+innodb_defragment_frequency_update(THD*, st_mysql_sys_var*, void*,
+ const void* save)
+{
+ srv_defragment_frequency = (*static_cast<const uint*>(save));
+ srv_defragment_interval = 1000000000ULL / srv_defragment_frequency;
+}
+
+static inline char *my_strtok_r(char *str, const char *delim, char **saveptr)
+{
+#if defined _WIN32
+ return strtok_s(str, delim, saveptr);
+#else
+ return strtok_r(str, delim, saveptr);
+#endif
+}
+
+/****************************************************************//**
+Parse and enable InnoDB monitor counters during server startup.
+The user can list the monitor counters/groups to be enabled by specifying
+"loose-innodb_monitor_enable=monitor_name1;monitor_name2..."
+in the server configuration file or on the command line. The string
+separator can be ";", "," or a space. */
+static
+void
+innodb_enable_monitor_at_startup(
+/*=============================*/
+ char* str) /*!< in/out: monitor counter enable list */
+{
+ static const char* sep = " ;,";
+ char* last;
+
+ ut_a(str);
+
+	/* Walk through the string, separate each monitor counter
+	and/or counter group name, and call innodb_monitor_update()
+	on each name that validates. Please note that "str" is
+	modified by strtok_r() as it walks through it. */
+ for (char* option = my_strtok_r(str, sep, &last);
+ option;
+ option = my_strtok_r(NULL, sep, &last)) {
+ char* option_name;
+ if (!innodb_monitor_valid_byname(&option_name, option)) {
+ innodb_monitor_update(NULL, NULL, &option,
+ MONITOR_TURN_ON, FALSE);
+ } else {
+ sql_print_warning("Invalid monitor counter"
+ " name: '%s'", option);
+ }
+ }
+}
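+
+/* Example (hypothetical server invocation): enable the buffer page module
+and all lock_row% counters at startup:
+  mysqld --loose-innodb-monitor-enable="module_buf_page;lock_row%" */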
+
+/****************************************************************//**
+Callback function for accessing the InnoDB variables from MySQL:
+SHOW VARIABLES. */
+static int show_innodb_vars(THD*, SHOW_VAR* var, char*)
+{
+ innodb_export_status();
+ var->type = SHOW_ARRAY;
+ var->value = (char*) &innodb_status_variables;
+ //var->scope = SHOW_SCOPE_GLOBAL;
+
+ return(0);
+}
+
+/****************************************************************//**
+This function checks each index name for a table against the reserved
+system default primary index name 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes a warning message to the client
+and returns true.
+@return true if the index name matches the reserved name */
+bool
+innobase_index_name_is_reserved(
+/*============================*/
+ THD* thd, /*!< in/out: MySQL connection */
+ const KEY* key_info, /*!< in: Indexes to be created */
+ ulint num_of_keys) /*!< in: Number of indexes to
+ be created. */
+{
+ const KEY* key;
+ uint key_num; /* index number */
+
+ for (key_num = 0; key_num < num_of_keys; key_num++) {
+ key = &key_info[key_num];
+
+ if (innobase_strcasecmp(key->name.str,
+ innobase_index_reserve_name) == 0) {
+ /* Push warning to mysql */
+ push_warning_printf(thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_NAME_FOR_INDEX,
+ "Cannot Create Index with name"
+ " '%s'. The name is reserved"
+ " for the system default primary"
+ " index.",
+ innobase_index_reserve_name);
+
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ innobase_index_reserve_name);
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Retrieve the FTS Relevance Ranking result for doc with doc_id
+of m_prebuilt->fts_doc_id
+@param[in,out] fts_hdl FTS handler
+@return the relevance ranking value */
+static
+float
+innobase_fts_retrieve_ranking(
+ FT_INFO* fts_hdl)
+{
+ fts_result_t* result;
+ row_prebuilt_t* ft_prebuilt;
+
+ result = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_result;
+
+ ft_prebuilt = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_prebuilt;
+
+ fts_ranking_t* ranking = rbt_value(fts_ranking_t, result->current);
+ ft_prebuilt->fts_doc_id= ranking->doc_id;
+
+ return(ranking->rank);
+}
+
+/** Free the memory for the FTS handler
+@param[in,out] fts_hdl FTS handler */
+static
+void
+innobase_fts_close_ranking(
+ FT_INFO* fts_hdl)
+{
+ fts_result_t* result;
+
+ result = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_result;
+
+ fts_query_free_result(result);
+
+ my_free((uchar*) fts_hdl);
+}
+
+/** Find and Retrieve the FTS Relevance Ranking result for doc with doc_id
+of m_prebuilt->fts_doc_id
+@param[in,out] fts_hdl FTS handler
+@return the relevance ranking value */
+static
+float
+innobase_fts_find_ranking(FT_INFO* fts_hdl, uchar*, uint)
+{
+ fts_result_t* result;
+ row_prebuilt_t* ft_prebuilt;
+
+ ft_prebuilt = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_prebuilt;
+ result = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_result;
+
+ /* Retrieve the ranking value for doc_id with value of
+ m_prebuilt->fts_doc_id */
+ return(fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id));
+}
+
+#ifdef UNIV_DEBUG
+static my_bool innodb_background_drop_list_empty = TRUE;
+static my_bool innodb_log_checkpoint_now = TRUE;
+static my_bool innodb_buf_flush_list_now = TRUE;
+static uint innodb_merge_threshold_set_all_debug
+ = DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
+
+/** Wait for the background drop list to become empty. */
+static
+void
+wait_background_drop_list_empty(THD*, st_mysql_sys_var*, void*, const void*)
+{
+ row_wait_for_background_drop_list_empty();
+}
+
+/****************************************************************//**
+Force innodb to checkpoint. */
+static
+void
+checkpoint_now_set(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+ if (*(my_bool*) save) {
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+
+ lsn_t lsn;
+
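+		/* Keep writing checkpoints until the checkpoint LSN
+		has caught up with the current LSN, i.e. until the
+		two are separated by no more than a FILE_CHECKPOINT
+		record. */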
+ while (log_sys.last_checkpoint_lsn.load(
+ std::memory_order_acquire)
+ + SIZE_OF_FILE_CHECKPOINT
+ < (lsn= log_sys.get_lsn(std::memory_order_acquire))) {
+ log_make_checkpoint();
+ log_sys.log.flush();
+ }
+
+ if (dberr_t err = fil_write_flushed_lsn(lsn)) {
+ ib::warn() << "Checkpoint set failed " << err;
+ }
+
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ }
+}
+
+/****************************************************************//**
+Force a dirty pages flush now. */
+static
+void
+buf_flush_list_now_set(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+ if (*(my_bool*) save) {
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ buf_flush_sync();
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ }
+}
+
+/** Override the current MERGE_THRESHOLD setting for all indexes in the
+dictionary cache now.
+@param[in] save immediate result from check function */
+static
+void
+innodb_merge_threshold_set_all_debug_update(THD*, st_mysql_sys_var*, void*,
+ const void* save)
+{
+ innodb_merge_threshold_set_all_debug
+ = (*static_cast<const uint*>(save));
+ dict_set_merge_threshold_all_debug(
+ innodb_merge_threshold_set_all_debug);
+}
+#endif /* UNIV_DEBUG */
+
+/** Find and Retrieve the FTS doc_id for the current result row
+@param[in,out] fts_hdl FTS handler
+@return the document ID */
+static
+ulonglong
+innobase_fts_retrieve_docid(
+ FT_INFO_EXT* fts_hdl)
+{
+ fts_result_t* result;
+ row_prebuilt_t* ft_prebuilt;
+
+ ft_prebuilt = reinterpret_cast<NEW_FT_INFO *>(fts_hdl)->ft_prebuilt;
+ result = reinterpret_cast<NEW_FT_INFO *>(fts_hdl)->ft_result;
+
+ if (ft_prebuilt->read_just_key) {
+
+ fts_ranking_t* ranking =
+ rbt_value(fts_ranking_t, result->current);
+
+ return(ranking->doc_id);
+ }
+
+ return(ft_prebuilt->fts_doc_id);
+}
+
+/* These variables are never read or changed by InnoDB. They are dummy
+placeholders needed by the MySQL infrastructure so that the user can
+invoke buffer_pool_dump_now(), buffer_pool_load_now() and
+buffer_pool_load_abort() by doing:
+ SET GLOBAL innodb_buffer_pool_dump_now=ON;
+ SET GLOBAL innodb_buffer_pool_load_now=ON;
+ SET GLOBAL innodb_buffer_pool_load_abort=ON;
+Their values are read by MySQL and displayed to the user when the variables
+are queried, e.g.:
+ SELECT @@innodb_buffer_pool_dump_now;
+ SELECT @@innodb_buffer_pool_load_now;
+ SELECT @@innodb_buffer_pool_load_abort; */
+static my_bool innodb_buffer_pool_dump_now = FALSE;
+static my_bool innodb_buffer_pool_load_now = FALSE;
+static my_bool innodb_buffer_pool_load_abort = FALSE;
+
+/****************************************************************//**
+Trigger a dump of the buffer pool if innodb_buffer_pool_dump_now is set
+to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_dump_now(
+/*=================*/
+ THD* thd /*!< in: thread handle */
+ MY_ATTRIBUTE((unused)),
+ struct st_mysql_sys_var* var /*!< in: pointer to system
+ variable */
+ MY_ATTRIBUTE((unused)),
+ void* var_ptr /*!< out: where the formal
+ string goes */
+ MY_ATTRIBUTE((unused)),
+ const void* save) /*!< in: immediate result from
+ check function */
+{
+ if (*(my_bool*) save && !srv_read_only_mode) {
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ buf_dump_start();
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ }
+}
+
+/****************************************************************//**
+Trigger a load of the buffer pool if innodb_buffer_pool_load_now is set
+to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_load_now(
+/*=================*/
+ THD* thd /*!< in: thread handle */
+ MY_ATTRIBUTE((unused)),
+ struct st_mysql_sys_var* var /*!< in: pointer to system
+ variable */
+ MY_ATTRIBUTE((unused)),
+ void* var_ptr /*!< out: where the formal
+ string goes */
+ MY_ATTRIBUTE((unused)),
+ const void* save) /*!< in: immediate result from
+ check function */
+{
+ if (*(my_bool*) save && !srv_read_only_mode) {
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ buf_load_start();
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ }
+}
+
+/****************************************************************//**
+Abort a load of the buffer pool if innodb_buffer_pool_load_abort
+is set to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_load_abort(
+/*===================*/
+ THD* thd /*!< in: thread handle */
+ MY_ATTRIBUTE((unused)),
+ struct st_mysql_sys_var* var /*!< in: pointer to system
+ variable */
+ MY_ATTRIBUTE((unused)),
+ void* var_ptr /*!< out: where the formal
+ string goes */
+ MY_ATTRIBUTE((unused)),
+ const void* save) /*!< in: immediate result from
+ check function */
+{
+ if (*(my_bool*) save && !srv_read_only_mode) {
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ buf_load_abort();
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ }
+}
+
+/****************************************************************//**
+Update the system variable innodb_log_write_ahead_size using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_log_write_ahead_size_update(
+/*===============================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*, void*,
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ ulong val = OS_FILE_LOG_BLOCK_SIZE;
+ ulong in_val = *static_cast<const ulong*>(save);
+
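+	/* Round in_val up to the next power of two that is at least
+	OS_FILE_LOG_BLOCK_SIZE (512); for example, 3000 becomes 4096. */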
+ while (val < in_val) {
+ val = val * 2;
+ }
+
+ if (val > srv_page_size) {
+ val = srv_page_size;
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "innodb_log_write_ahead_size cannot"
+ " be set higher than innodb_page_size.");
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Setting innodb_log_write_ahead_size"
+ " to %lu",
+ srv_page_size);
+ } else if (val != in_val) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "innodb_log_write_ahead_size should be"
+				   " set to a 2^n value not smaller than 512.");
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Setting innodb_log_write_ahead_size"
+ " to %lu",
+ val);
+ }
+
+ srv_log_write_ahead_size = val;
+}
+
+/** Update innodb_status_output or innodb_status_output_locks,
+which control InnoDB "status monitor" output to the error log.
+@param[out] var current value
+@param[in] save to-be-assigned value */
+static
+void
+innodb_status_output_update(THD*,st_mysql_sys_var*,void*var,const void*save)
+{
+ if (srv_monitor_timer)
+ {
+ *static_cast<my_bool*>(var)= *static_cast<const my_bool*>(save);
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+    /* Wake up the server monitor. */
+ srv_monitor_timer_schedule_now();
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ }
+}
+
+/** Update the system variable innodb_encryption_threads.
+@param[in] save to-be-assigned value */
+static
+void
+innodb_encryption_threads_update(THD*,st_mysql_sys_var*,void*,const void*save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ fil_crypt_set_thread_cnt(*static_cast<const uint*>(save));
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/** Update the system variable innodb_encryption_rotate_key_age.
+@param[in] save to-be-assigned value */
+static
+void
+innodb_encryption_rotate_key_age_update(THD*, st_mysql_sys_var*, void*,
+ const void* save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ fil_crypt_set_rotate_key_age(*static_cast<const uint*>(save));
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/** Update the system variable innodb_encryption_rotation_iops.
+@param[in] save to-be-assigned value */
+static
+void
+innodb_encryption_rotation_iops_update(THD*, st_mysql_sys_var*, void*,
+ const void* save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ fil_crypt_set_rotation_iops(*static_cast<const uint*>(save));
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/** Update the system variable innodb_encrypt_tables.
+@param[in] save to-be-assigned value */
+static
+void
+innodb_encrypt_tables_update(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ fil_crypt_set_encrypt_tables(*static_cast<const ulong*>(save));
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/** Issue a deprecation warning for SET GLOBAL innodb_log_checksums.
+@param[in,out] thd client connection */
+static void
+innodb_log_checksums_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
+{
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ deprecated::innodb_log_checksums_msg);
+}
+
+/** Issue a deprecation warning for SET GLOBAL innodb_log_compressed_pages.
+@param[in,out] thd client connection */
+static void
+innodb_log_compressed_pages_warn(THD* thd, st_mysql_sys_var*, void*,
+ const void*)
+{
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ deprecated::innodb_log_compressed_pages_msg);
+}
+
+/** Issue a deprecation warning for SET GLOBAL innodb_log_optimize_ddl.
+@param[in,out] thd client connection */
+static void
+innodb_log_optimize_ddl_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
+{
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ deprecated::innodb_log_optimize_ddl_msg);
+}
+
+/** Issue a deprecation warning for SET GLOBAL innodb_page_cleaners.
+@param[in,out] thd client connection */
+static void
+innodb_page_cleaners_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
+{
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ deprecated::innodb_page_cleaners_msg);
+}
+
+/** Issue a deprecation warning for SET GLOBAL innodb_undo_logs.
+@param[in,out] thd client connection */
+static void
+innodb_undo_logs_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
+{
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ deprecated::innodb_undo_logs_msg);
+}
+
+/** Issue a deprecation warning for SET GLOBAL innodb_scrub_log_speed.
+@param[in,out] thd client connection */
+static void
+innodb_scrub_log_speed_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
+{
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ deprecated::innodb_scrub_log_speed_msg);
+}
+
+static void
+innodb_background_scrub_data_uncompressed_warn(THD* thd, st_mysql_sys_var*,
+ void*, const void*)
+{
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ deprecated::innodb_background_scrub_data_uncompressed_msg);
+}
+
+static void
+innodb_background_scrub_data_compressed_warn(THD* thd, st_mysql_sys_var*,
+ void*, const void*)
+{
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ deprecated::innodb_background_scrub_data_compressed_msg);
+}
+
+static void
+innodb_background_scrub_data_check_interval_warn(
+ THD* thd, st_mysql_sys_var*, void*, const void*)
+{
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ deprecated::innodb_background_scrub_data_check_interval_msg);
+}
+
+static void
+innodb_background_scrub_data_interval_warn(
+ THD* thd, st_mysql_sys_var*, void*, const void*)
+{
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ deprecated::innodb_background_scrub_data_interval_msg);
+}
+
+static SHOW_VAR innodb_status_variables_export[]= {
+ {"Innodb", (char*) &show_innodb_vars, SHOW_FUNC},
+ {NullS, NullS, SHOW_LONG}
+};
+
+static struct st_mysql_storage_engine innobase_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+#ifdef WITH_WSREP
+
+struct bg_wsrep_kill_trx_arg {
+ my_thread_id thd_id, bf_thd_id;
+ trx_id_t trx_id, bf_trx_id;
+ bool signal;
+};
+
+/** Kill one transaction from a background manager thread
+
+wsrep_innobase_kill_one_trx() is invoked while lock_sys.mutex and the trx
+mutex are held; wsrep_thd_bf_abort() cannot be used there, as it takes THD
+mutexes that must be acquired before lock_sys.mutex and the trx mutex.
+That is why wsrep_innobase_kill_one_trx() only posts the killing task to
+the manager thread, and the actual killing happens asynchronously here.
+
+Because no mutexes are held when this task runs, we do not know whether
+the THD or trx pointers are still valid, so we pass thread/trx ids and
+perform a lookup.
+*/
+static void bg_wsrep_kill_trx(void *void_arg)
+{
+ bg_wsrep_kill_trx_arg *arg= (bg_wsrep_kill_trx_arg *)void_arg;
+ THD *thd, *bf_thd;
+ trx_t *victim_trx;
+ bool aborting= false;
+
+ if ((bf_thd= find_thread_by_id(arg->bf_thd_id)))
+ wsrep_thd_LOCK(bf_thd);
+ if ((thd= find_thread_by_id(arg->thd_id)))
+ wsrep_thd_LOCK(thd);
+
+ if (!thd || !bf_thd || !(victim_trx= thd_to_trx(thd)))
+ goto ret0;
+
+ lock_mutex_enter();
+ trx_mutex_enter(victim_trx);
+ if (victim_trx->id != arg->trx_id
+ || victim_trx->state == TRX_STATE_COMMITTED_IN_MEMORY)
+ {
+ /* Apparently victim trx was meanwhile rolled back or
+ committed. Tell bf thd not to wait, in case it already
+ started to. */
+ trx_t *trx= thd_to_trx(bf_thd);
+ if (!trx) {
+ /* bf_thd might not be associated with a
+ transaction, in case of MDL conflict */
+ } else if (lock_t *lock = trx->lock.wait_lock) {
+ trx_mutex_enter(trx);
+ lock_cancel_waiting_and_release(lock);
+ trx_mutex_exit(trx);
+ }
+ goto ret1;
+ }
+
+ DBUG_ASSERT(wsrep_on(bf_thd));
+
+ WSREP_LOG_CONFLICT(bf_thd, thd, TRUE);
+
+ WSREP_DEBUG("Aborter %s trx_id: " TRX_ID_FMT " thread: %ld "
+ "seqno: %lld client_state: %s client_mode: %s transaction_mode: %s "
+ "query: %s",
+ wsrep_thd_is_BF(bf_thd, false) ? "BF" : "normal",
+ arg->bf_trx_id,
+ thd_get_thread_id(bf_thd),
+ wsrep_thd_trx_seqno(bf_thd),
+ wsrep_thd_client_state_str(bf_thd),
+ wsrep_thd_client_mode_str(bf_thd),
+ wsrep_thd_transaction_state_str(bf_thd),
+ wsrep_thd_query(bf_thd));
+
+ WSREP_DEBUG("Victim %s trx_id: " TRX_ID_FMT " thread: %ld "
+ "seqno: %lld client_state: %s client_mode: %s transaction_mode: %s "
+ "query: %s",
+ wsrep_thd_is_BF(thd, false) ? "BF" : "normal",
+ victim_trx->id,
+ thd_get_thread_id(thd),
+ wsrep_thd_trx_seqno(thd),
+ wsrep_thd_client_state_str(thd),
+ wsrep_thd_client_mode_str(thd),
+ wsrep_thd_transaction_state_str(thd),
+ wsrep_thd_query(thd));
+
+ /* Mark transaction as a victim for Galera abort */
+ victim_trx->lock.was_chosen_as_wsrep_victim= true;
+ if (wsrep_thd_set_wsrep_aborter(bf_thd, thd))
+ {
+ WSREP_DEBUG("innodb kill transaction skipped due to wsrep_aborter set");
+ goto ret1;
+ }
+
+ aborting= true;
+
+ret1:
+ trx_mutex_exit(victim_trx);
+ lock_mutex_exit();
+ret0:
+ if (thd) {
+ wsrep_thd_UNLOCK(thd);
+ if (aborting) {
+ DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort");
+ wsrep_thd_bf_abort(bf_thd, thd, arg->signal);
+ }
+ wsrep_thd_kill_UNLOCK(thd);
+ }
+ if (bf_thd) {
+ wsrep_thd_UNLOCK(bf_thd);
+ wsrep_thd_kill_UNLOCK(bf_thd);
+ }
+ free(arg);
+}
+
+/** This function is used to kill one transaction.
+
+This transaction was open on this node (not-yet-committed), and a
+conflicting writeset from some other node that was being applied
+caused a locking conflict. First committed (from other node)
+wins, thus open transaction is rolled back. BF stands for
+brute-force: any transaction can get aborted by galera any time
+it is necessary.
+
+This conflict can happen only when the replicated writeset (from
+other node) is being applied, not when it’s waiting in the queue.
+If our local transaction reached its COMMIT and this conflicting
+writeset was in the queue, then it should fail the local
+certification test instead.
+
+A brute force abort is only triggered by a locking conflict
+between a writeset being applied by an applier thread (slave thread)
+and an open transaction on the node, not by a Galera writeset
+comparison as in the local certification failure.
+
+@param[in] bf_thd Brute force (BF) thread
+@param[in,out] victim_trx Transaction to be killed
+@param[in] signal Should victim be signaled */
+void
+wsrep_innobase_kill_one_trx(
+ THD* bf_thd,
+ trx_t *victim_trx,
+ bool signal)
+{
+ ut_ad(bf_thd);
+ ut_ad(victim_trx);
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(victim_trx));
+
+ DBUG_ENTER("wsrep_innobase_kill_one_trx");
+
+ DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort",
+ {
+ const char act[]=
+ "now "
+ "SIGNAL sync.before_wsrep_thd_abort_reached "
+ "WAIT_FOR signal.before_wsrep_thd_abort";
+ DBUG_ASSERT(!debug_sync_set_action(bf_thd,
+ STRING_WITH_LEN(act)));
+ };);
+
+ trx_t* bf_trx= thd_to_trx(bf_thd);
+ bg_wsrep_kill_trx_arg *arg = (bg_wsrep_kill_trx_arg*)malloc(sizeof(*arg));
+ arg->thd_id = thd_get_thread_id(victim_trx->mysql_thd);
+ arg->trx_id = victim_trx->id;
+ arg->bf_thd_id = thd_get_thread_id(bf_thd);
+ arg->bf_trx_id = bf_trx ? bf_trx->id : TRX_ID_MAX;
+ arg->signal = signal;
+ mysql_manager_submit(bg_wsrep_kill_trx, arg);
+
+ DBUG_VOID_RETURN;
+}
+
+/** This function forces the victim transaction to abort. Aborting the
+  transaction does NOT end it; it still has to be rolled back.
+
+  @param bf_thd brute force THD asking for the abort
+  @param victim_thd victim THD to be aborted
+  @param signal whether the victim should be signaled
+*/
+static
+void
+wsrep_abort_transaction(
+ handlerton*,
+ THD *bf_thd,
+ THD *victim_thd,
+ my_bool signal)
+{
+ DBUG_ENTER("wsrep_abort_transaction");
+ ut_ad(bf_thd);
+ ut_ad(victim_thd);
+
+ trx_t* victim_trx = thd_to_trx(victim_thd);
+
+ WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %s",
+ wsrep_thd_query(bf_thd),
+ wsrep_thd_query(victim_thd),
+ wsrep_thd_transaction_state_str(victim_thd));
+
+ if (victim_trx) {
+ lock_mutex_enter();
+ trx_mutex_enter(victim_trx);
+ victim_trx->lock.was_chosen_as_wsrep_victim= true;
+ trx_mutex_exit(victim_trx);
+ lock_mutex_exit();
+
+ wsrep_thd_kill_LOCK(victim_thd);
+ wsrep_thd_LOCK(victim_thd);
+ bool aborting= !wsrep_thd_set_wsrep_aborter(bf_thd, victim_thd);
+ wsrep_thd_UNLOCK(victim_thd);
+ if (aborting) {
+ DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort");
+ DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort",
+ {
+ const char act[]=
+ "now "
+ "SIGNAL sync.before_wsrep_thd_abort_reached "
+ "WAIT_FOR signal.before_wsrep_thd_abort";
+ DBUG_ASSERT(!debug_sync_set_action(bf_thd,
+ STRING_WITH_LEN(act)));
+ };);
+ wsrep_thd_bf_abort(bf_thd, victim_thd, signal);
+ }
+ wsrep_thd_kill_UNLOCK(victim_thd);
+ DBUG_VOID_RETURN;
+ } else {
+ DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort",
+ {
+ const char act[]=
+ "now "
+ "SIGNAL sync.before_wsrep_thd_abort_reached "
+ "WAIT_FOR signal.before_wsrep_thd_abort";
+ DBUG_ASSERT(!debug_sync_set_action(bf_thd,
+ STRING_WITH_LEN(act)));
+ };);
+ wsrep_thd_kill_LOCK(victim_thd);
+ wsrep_thd_bf_abort(bf_thd, victim_thd, signal);
+ wsrep_thd_kill_UNLOCK(victim_thd);
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+static
+int
+innobase_wsrep_set_checkpoint(
+/*==========================*/
+ handlerton* hton,
+ const XID* xid)
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (wsrep_is_wsrep_xid(xid)) {
+
+ trx_rseg_update_wsrep_checkpoint(xid);
+ innobase_flush_logs(hton, false);
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+static
+int
+innobase_wsrep_get_checkpoint(
+/*==========================*/
+ handlerton* hton,
+ XID* xid)
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ trx_rseg_read_wsrep_checkpoint(*xid);
+ return 0;
+}
+#endif /* WITH_WSREP */
+
+/* plugin options */
+
+static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm,
+ PLUGIN_VAR_RQCMDARG,
+ "The algorithm InnoDB uses for page checksumming. Possible values are"
+ " FULL_CRC32"
+ " for new files, always use CRC-32C; for old, see CRC32 below;"
+ " STRICT_FULL_CRC32"
+ " for new files, always use CRC-32C; for old, see STRICT_CRC32 below;"
+ " CRC32"
+ " write crc32, allow any of the other checksums to match when reading;"
+ " STRICT_CRC32"
+ " write crc32, do not allow other algorithms to match when reading;"
+ " INNODB"
+ " write a software calculated checksum, allow any other checksums"
+ " to match when reading;"
+ " STRICT_INNODB"
+ " write a software calculated checksum, do not allow other algorithms"
+ " to match when reading;"
+ " NONE"
+ " write a constant magic number, do not do any checksum verification"
+ " when reading (same as innodb_checksums=OFF);"
+ " STRICT_NONE"
+ " write a constant magic number, do not allow values other than that"
+ " magic number when reading;"
+ " Files updated when this option is set to crc32 or strict_crc32 will"
+ " not be readable by MariaDB versions older than 10.0.4;"
+ " new files created with full_crc32 are readable by MariaDB 10.4.3+",
+ NULL, innodb_checksum_algorithm_update, SRV_CHECKSUM_ALGORITHM_FULL_CRC32,
+ &innodb_checksum_algorithm_typelib);
+
+/** Description of deprecated and ignored parameters */
+static const char* innodb_deprecated_ignored
+= "Deprecated parameter with no effect.";
+
+static MYSQL_SYSVAR_BOOL(log_checksums, deprecated::innodb_log_checksums,
+ PLUGIN_VAR_RQCMDARG,
+ innodb_deprecated_ignored, NULL, innodb_log_checksums_warn, TRUE);
+
+static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir,
+ PLUGIN_VAR_READONLY,
+ "The common part for InnoDB table spaces.",
+ NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_BOOL(doublewrite, srv_use_doublewrite_buf,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Enable InnoDB doublewrite buffer (enabled by default)."
+ " Disable with --skip-innodb-doublewrite.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(use_atomic_writes, srv_use_atomic_writes,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Enable atomic writes, instead of using the doublewrite buffer, for files "
+  "on devices that support atomic writes. "
+ "This option only works on Linux with either FusionIO cards using "
+ "the directFS filesystem or with Shannon cards using any file system.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(stats_include_delete_marked,
+ srv_stats_include_delete_marked,
+ PLUGIN_VAR_OPCMDARG,
+ "Include delete marked records when calculating persistent statistics",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ENUM(instant_alter_column_allowed,
+ innodb_instant_alter_column_allowed,
+ PLUGIN_VAR_RQCMDARG,
+ "File format constraint for ALTER TABLE", NULL, NULL, 2/*add_drop_reorder*/,
+ &innodb_instant_alter_column_allowed_typelib);
+
+static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of IOPs the server can do. Tunes the background IO rate",
+ NULL, innodb_io_capacity_update, 200, 100, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity,
+ PLUGIN_VAR_RQCMDARG,
+ "Limit to which innodb_io_capacity can be inflated.",
+ NULL, innodb_io_capacity_max_update,
+ SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT, 100,
+ SRV_MAX_IO_CAPACITY_LIMIT, 0);
+
+#ifdef UNIV_DEBUG
+static MYSQL_SYSVAR_BOOL(background_drop_list_empty,
+ innodb_background_drop_list_empty,
+ PLUGIN_VAR_OPCMDARG,
+ "Wait for the background drop list to become empty",
+ NULL, wait_background_drop_list_empty, FALSE);
+
+static MYSQL_SYSVAR_BOOL(log_checkpoint_now, innodb_log_checkpoint_now,
+ PLUGIN_VAR_OPCMDARG,
+ "Force checkpoint now",
+ NULL, checkpoint_now_set, FALSE);
+
+static MYSQL_SYSVAR_BOOL(buf_flush_list_now, innodb_buf_flush_list_now,
+ PLUGIN_VAR_OPCMDARG,
+ "Force dirty page flush now",
+ NULL, buf_flush_list_now_set, FALSE);
+
+static MYSQL_SYSVAR_UINT(merge_threshold_set_all_debug,
+ innodb_merge_threshold_set_all_debug,
+ PLUGIN_VAR_RQCMDARG,
+  "Override the current MERGE_THRESHOLD setting for all indexes in the"
+  " dictionary cache with the specified value, dynamically and immediately.",
+ NULL, innodb_merge_threshold_set_all_debug_update,
+ DICT_INDEX_MERGE_THRESHOLD_DEFAULT, 1, 50, 0);
+#endif /* UNIV_DEBUG */
+
+static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size,
+ PLUGIN_VAR_OPCMDARG,
+ "Number of UNDO log pages to purge in one batch from the history list.",
+ NULL, NULL,
+ 300, /* Default setting */
+ 1, /* Minimum value */
+ 5000, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_UINT(purge_threads, srv_n_purge_threads,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Number of tasks for purging transaction history",
+ NULL, NULL, 4, 1, innodb_purge_threads_MAX, 0);
+
+static MYSQL_SYSVAR_ULONG(sync_array_size, srv_sync_array_size,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Size of the mutex/lock wait array.",
+ NULL, NULL,
+ 1, /* Default setting */
+ 1, /* Minimum value */
+ 1024, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_UINT(fast_shutdown, srv_fast_shutdown,
+ PLUGIN_VAR_OPCMDARG,
+ "Speeds up the shutdown process of the InnoDB storage engine. Possible"
+ " values are 0, 1 (faster), 2 (crash-like), 3 (fastest clean).",
+ fast_shutdown_validate, NULL, 1, 0, 3, 0);
+
+static MYSQL_SYSVAR_BOOL(file_per_table, srv_file_per_table,
+ PLUGIN_VAR_NOCMDARG,
+ "Stores each InnoDB table to an .ibd file in the database dir.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_STR(ft_server_stopword_table, innobase_server_stopword_table,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC,
+ "The user supplied stopword table name.",
+ innodb_stopword_table_validate,
+ NULL,
+ NULL);
+
+static MYSQL_SYSVAR_UINT(flush_log_at_timeout, srv_flush_log_at_timeout,
+ PLUGIN_VAR_OPCMDARG,
+ "Write and flush logs every (n) second.",
+ NULL, NULL, 1, 0, 2700, 0);
+
+static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
+ PLUGIN_VAR_OPCMDARG,
+ "Controls the durability/speed trade-off for commits."
+ " Set to 0 (write and flush redo log to disk only once per second),"
+ " 1 (flush to disk at each commit),"
+ " 2 (write to log at commit but flush to disk only once per second)"
+ " or 3 (flush to disk at prepare and at commit, slower and usually redundant)."
+ " 1 and 3 guarantees that after a crash, committed transactions will"
+ " not be lost and will be consistent with the binlog and other transactional"
+ " engines. 2 can get inconsistent and lose transactions if there is a"
+ " power failure or kernel crash but not if mysqld crashes. 0 has no"
+ " guarantees in case of crash. 0 and 2 can be faster than 1 or 3.",
+ NULL, NULL, 1, 0, 3, 0);
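+
+/* Illustrative example: since this variable is dynamic, the trade-off can
+ be changed at runtime, e.g. SET GLOBAL innodb_flush_log_at_trx_commit=2;
+ as described above, this risks losing up to about one second of committed
+ transactions on a power failure or kernel crash. */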
+
+static MYSQL_SYSVAR_ENUM(flush_method, innodb_flush_method,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "With which method to flush data.",
+ NULL, NULL, IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_FSYNC),
+ &innodb_flush_method_typelib);
+
+static MYSQL_SYSVAR_STR(file_format, deprecated::innodb_file_format,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ innodb_deprecated_ignored, NULL, NULL, NULL);
+static MYSQL_SYSVAR_STR(large_prefix, deprecated::innodb_large_prefix,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ innodb_deprecated_ignored, NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_BOOL(force_load_corrupted, srv_load_corrupted,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Force InnoDB to load metadata of corrupted table.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Path to InnoDB log files.", NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_ULONG(page_cleaners, deprecated::innodb_page_cleaners,
+ PLUGIN_VAR_RQCMDARG,
+ innodb_deprecated_ignored, NULL, innodb_page_cleaners_warn, 0, 0, 64, 0);
+
+static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct, srv_max_buf_pool_modified_pct,
+ PLUGIN_VAR_RQCMDARG,
+ "Percentage of dirty pages allowed in bufferpool.",
+ NULL, innodb_max_dirty_pages_pct_update, 90.0, 0, 99.999, 0);
+
+static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct_lwm,
+ srv_max_dirty_pages_pct_lwm,
+ PLUGIN_VAR_RQCMDARG,
+ "Percentage of dirty pages at which flushing kicks in. "
+ "The value 0 (default) means 'refer to innodb_max_dirty_pages_pct'.",
+ NULL, innodb_max_dirty_pages_pct_lwm_update, 0, 0, 99.999, 0);
+
+static MYSQL_SYSVAR_DOUBLE(adaptive_flushing_lwm,
+ srv_adaptive_flushing_lwm,
+ PLUGIN_VAR_RQCMDARG,
+ "Percentage of log capacity below which no adaptive flushing happens.",
+ NULL, NULL, 10.0, 0.0, 70.0, 0);
+
+static MYSQL_SYSVAR_BOOL(adaptive_flushing, srv_adaptive_flushing,
+ PLUGIN_VAR_NOCMDARG,
+ "Attempt flushing dirty pages to avoid IO bursts at checkpoints.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(flush_sync, srv_flush_sync,
+ PLUGIN_VAR_NOCMDARG,
+ "Allow IO bursts at the checkpoints ignoring io_capacity setting.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_ULONG(flushing_avg_loops,
+ srv_flushing_avg_loops,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of iterations over which the background flushing is averaged.",
+ NULL, NULL, 30, 1, 1000, 0);
+
+static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag,
+ PLUGIN_VAR_RQCMDARG,
+ "Desired maximum length of the purge queue (0 = no limit)",
+ NULL, NULL, 0, 0, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(max_purge_lag_delay, srv_max_purge_lag_delay,
+ PLUGIN_VAR_RQCMDARG,
+ "Maximum delay of user threads in micro-seconds",
+ NULL, NULL,
+ 0L, /* Default setting */
+ 0L, /* Minimum value */
+ 10000000UL, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_UINT(max_purge_lag_wait, innodb_max_purge_lag_wait,
+ PLUGIN_VAR_RQCMDARG,
+ "Wait until History list length is below the specified limit",
+ NULL, innodb_max_purge_lag_wait_update, UINT_MAX, 0, UINT_MAX, 0);
+
+static MYSQL_SYSVAR_BOOL(rollback_on_timeout, innobase_rollback_on_timeout,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Roll back the complete transaction on lock wait timeout, for 4.x compatibility (disabled by default)",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(status_file, innobase_create_status_file,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_NOSYSVAR,
+ "Enable SHOW ENGINE INNODB STATUS output in the innodb_status.<pid> file",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(stats_on_metadata, innobase_stats_on_metadata,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable statistics gathering for metadata commands such as"
+ " SHOW TABLE STATUS for tables that use transient statistics (off by default)",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONGLONG(stats_transient_sample_pages,
+ srv_stats_transient_sample_pages,
+ PLUGIN_VAR_RQCMDARG,
+ "The number of leaf index pages to sample when calculating transient"
+ " statistics (if persistent statistics are not used, default 8)",
+ NULL, NULL, 8, 1, ~0ULL, 0);
+
+static MYSQL_SYSVAR_BOOL(stats_persistent, srv_stats_persistent,
+ PLUGIN_VAR_OPCMDARG,
+ "InnoDB persistent statistics enabled for all tables unless overridden"
+ " at table level",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(stats_auto_recalc, srv_stats_auto_recalc,
+ PLUGIN_VAR_OPCMDARG,
+ "InnoDB automatic recalculation of persistent statistics enabled for all"
+ " tables unless overridden at table level (automatic recalculation is only"
+ " done when InnoDB decides that the table has changed too much and needs a"
+ " new statistics)",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_ULONGLONG(stats_persistent_sample_pages,
+ srv_stats_persistent_sample_pages,
+ PLUGIN_VAR_RQCMDARG,
+ "The number of leaf index pages to sample when calculating persistent"
+ " statistics (by ANALYZE, default 20)",
+ NULL, NULL, 20, 1, ~0ULL, 0);
+
+static MYSQL_SYSVAR_ULONGLONG(stats_modified_counter, srv_stats_modified_counter,
+ PLUGIN_VAR_RQCMDARG,
+ "The number of rows modified before we calculate new statistics (default 0 = current limits)",
+ NULL, NULL, 0, 0, ~0ULL, 0);
+
+static MYSQL_SYSVAR_BOOL(stats_traditional, srv_stats_sample_traditional,
+ PLUGIN_VAR_RQCMDARG,
+ "Enable traditional statistic calculation based on number of configured pages (default true)",
+ NULL, NULL, TRUE);
+
+#ifdef BTR_CUR_HASH_ADAPT
+static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable InnoDB adaptive hash index (disabled by default).",
+ NULL, innodb_adaptive_hash_index_update, false);
+
+/** Number of distinct partitions of the adaptive hash index (AHI).
+Each partition is protected by its own latch, so there are btr_ahi_parts
+latches protecting the complete search system. */
+static MYSQL_SYSVAR_ULONG(adaptive_hash_index_parts, btr_ahi_parts,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Number of InnoDB Adaptive Hash Index Partitions (default 8)",
+ NULL, NULL, 8, 1, 512, 0);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+static MYSQL_SYSVAR_UINT(replication_delay, deprecated::replication_delay,
+ PLUGIN_VAR_RQCMDARG,
+ innodb_deprecated_ignored, nullptr, deprecated::replication_delay_warn,
+ 0, 0, ~0U, 0);
+
+static MYSQL_SYSVAR_UINT(compression_level, page_zip_level,
+ PLUGIN_VAR_RQCMDARG,
+ "Compression level used for zlib compression. 0 is no compression"
+ ", 1 is fastest, 9 is best compression and default is 6.",
+ NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0);
+
+static MYSQL_SYSVAR_BOOL(log_compressed_pages,
+ deprecated::innodb_log_compressed_pages,
+ PLUGIN_VAR_OPCMDARG,
+ innodb_deprecated_ignored, NULL, innodb_log_compressed_pages_warn, TRUE);
+
+static MYSQL_SYSVAR_BOOL(log_optimize_ddl, deprecated::innodb_log_optimize_ddl,
+ PLUGIN_VAR_OPCMDARG,
+ innodb_deprecated_ignored, NULL, innodb_log_optimize_ddl_warn, FALSE);
+
+static MYSQL_SYSVAR_UINT(autoextend_increment,
+ sys_tablespace_auto_extend_increment,
+ PLUGIN_VAR_RQCMDARG,
+ "Data file autoextend increment in megabytes",
+ NULL, NULL, 64, 1, 1000, 0);
+
+/** Validate the requested buffer pool size. Also, reserve the necessary
+memory needed for buffer pool resize.
+@param[in] thd thread handle
+@param[in] var pointer to system variable
+@param[out] save immediate result for update function
+@param[in] value incoming string
+@return 0 on success, 1 on failure.
+*/
+static
+int
+innodb_buffer_pool_size_validate(
+ THD* thd,
+ struct st_mysql_sys_var* var,
+ void* save,
+ struct st_mysql_value* value);
+
+static MYSQL_SYSVAR_ULONGLONG(buffer_pool_size, innobase_buffer_pool_size,
+ PLUGIN_VAR_RQCMDARG,
+ "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
+ innodb_buffer_pool_size_validate,
+ innodb_buffer_pool_size_update,
+ srv_buf_pool_def_size,
+ srv_buf_pool_min_size,
+ LLONG_MAX, 1024*1024L);
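+
+/* Note: the final macro argument (1024*1024L) is the block size of the
+ variable; the plugin-variable framework rounds requested values of
+ innodb_buffer_pool_size to a multiple of 1MiB. */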
+
+static MYSQL_SYSVAR_ULONG(buffer_pool_chunk_size, srv_buf_pool_chunk_unit,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Size of a single memory chunk"
+ " for resizing buffer pool. Online buffer pool resizing happens"
+ " at this granularity. 0 means disable resizing buffer pool.",
+ NULL, NULL,
+ 128 * 1024 * 1024, 1024 * 1024, LONG_MAX, 1024 * 1024);
+
+static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "The algorithm Innodb uses for deciding which locks to grant next when"
+ " a lock is released. Possible values are"
+ " FCFS"
+ " grant the locks in First-Come-First-Served order;"
+ " VATS"
+ " use the Variance-Aware-Transaction-Scheduling algorithm, which"
+ " uses an Eldest-Transaction-First heuristic.",
+ NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
+ &innodb_lock_schedule_algorithm_typelib);
+
+static MYSQL_SYSVAR_ULONG(buffer_pool_instances,
+ deprecated::innodb_buffer_pool_instances,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ innodb_deprecated_ignored, NULL, NULL, 0, 0, 64, 0);
+
+static MYSQL_SYSVAR_STR(buffer_pool_filename, srv_buf_dump_filename,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
+ "Filename to/from which to dump/load the InnoDB buffer pool",
+ innodb_srv_buf_dump_filename_validate, NULL, SRV_BUF_DUMP_FILENAME_DEFAULT);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_dump_now, innodb_buffer_pool_dump_now,
+ PLUGIN_VAR_RQCMDARG,
+ "Trigger an immediate dump of the buffer pool into a file named @@innodb_buffer_pool_filename",
+ NULL, buffer_pool_dump_now, FALSE);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_dump_at_shutdown, srv_buffer_pool_dump_at_shutdown,
+ PLUGIN_VAR_RQCMDARG,
+ "Dump the buffer pool into a file named @@innodb_buffer_pool_filename",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_ULONG(buffer_pool_dump_pct, srv_buf_pool_dump_pct,
+ PLUGIN_VAR_RQCMDARG,
+ "Dump only the hottest N% of each buffer pool, defaults to 25",
+ NULL, NULL, 25, 1, 100, 0);
+
+#ifdef UNIV_DEBUG
+/* Added to test the innodb_buffer_pool_load_incomplete status variable. */
+static MYSQL_SYSVAR_ULONG(buffer_pool_load_pages_abort, srv_buf_pool_load_pages_abort,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of pages during a buffer pool load to process before signaling innodb_buffer_pool_load_abort=1",
+ NULL, NULL, LONG_MAX, 1, LONG_MAX, 0);
+
+static MYSQL_SYSVAR_STR(buffer_pool_evict, srv_buffer_pool_evict,
+ PLUGIN_VAR_RQCMDARG,
+ "Evict pages from the buffer pool",
+ NULL, innodb_buffer_pool_evict_update, "");
+#endif /* UNIV_DEBUG */
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_load_now, innodb_buffer_pool_load_now,
+ PLUGIN_VAR_RQCMDARG,
+ "Trigger an immediate load of the buffer pool from a file named @@innodb_buffer_pool_filename",
+ NULL, buffer_pool_load_now, FALSE);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_load_abort, innodb_buffer_pool_load_abort,
+ PLUGIN_VAR_RQCMDARG,
+ "Abort a currently running load of the buffer pool",
+ NULL, buffer_pool_load_abort, FALSE);
+
+/* there is no point in changing this during runtime, thus readonly */
+static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_startup,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Load the buffer pool from a file named @@innodb_buffer_pool_filename",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(defragment, srv_defragment,
+ PLUGIN_VAR_RQCMDARG,
+ "Enable/disable InnoDB defragmentation (default FALSE). When set to FALSE, all existing "
+ "defragmentation will be paused. And new defragmentation command will fail."
+ "Paused defragmentation commands will resume when this variable is set to "
+ "true again.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_UINT(defragment_n_pages, srv_defragment_n_pages,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of pages considered at once when merging multiple pages to "
+ "defragment",
+ NULL, NULL, 7, 2, 32, 0);
+
+static MYSQL_SYSVAR_UINT(defragment_stats_accuracy,
+ srv_defragment_stats_accuracy,
+ PLUGIN_VAR_RQCMDARG,
+ "How many defragment stats changes there are before the stats "
+ "are written to persistent storage. Set to 0 meaning disable "
+ "defragment stats tracking.",
+ NULL, NULL, 0, 0, ~0U, 0);
+
+static MYSQL_SYSVAR_UINT(defragment_fill_factor_n_recs,
+ srv_defragment_fill_factor_n_recs,
+ PLUGIN_VAR_RQCMDARG,
+ "How many records of space defragmentation should leave on the page. "
+ "This variable, together with innodb_defragment_fill_factor, is introduced "
+ "so defragmentation won't pack the page too full and cause page split on "
+ "the next insert on every page. The variable indicating more defragmentation"
+ " gain is the one effective.",
+ NULL, NULL, 20, 1, 100, 0);
+
+static MYSQL_SYSVAR_DOUBLE(defragment_fill_factor, srv_defragment_fill_factor,
+ PLUGIN_VAR_RQCMDARG,
+ "A number between [0.7, 1] that tells defragmentation how full it should "
+ "fill a page. Default is 0.9. Number below 0.7 won't make much sense."
+ "This variable, together with innodb_defragment_fill_factor_n_recs, is "
+ "introduced so defragmentation won't pack the page too full and cause "
+ "page split on the next insert on every page. The variable indicating more "
+ "defragmentation gain is the one effective.",
+ NULL, NULL, 0.9, 0.7, 1, 0);
+
+static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency,
+ PLUGIN_VAR_RQCMDARG,
+ "Do not defragment a single index more than this number of time per second."
+ "This controls the number of time defragmentation thread can request X_LOCK "
+ "on an index. Defragmentation thread will check whether "
+ "1/defragment_frequency (s) has passed since it worked on this index last "
+ "time, and put the index back to the queue if not enough time has passed. "
+ "The actual frequency can only be lower than this given number.",
+ NULL, innodb_defragment_frequency_update,
+ SRV_DEFRAGMENT_FREQUENCY_DEFAULT, 1, 1000, 0);
+
+
+static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth,
+ PLUGIN_VAR_RQCMDARG,
+ "How deep to scan LRU to keep it clean",
+ NULL, NULL, 1536, 100, ~0UL, 0);
+
+static MYSQL_SYSVAR_SIZE_T(lru_flush_size, innodb_lru_flush_size,
+ PLUGIN_VAR_RQCMDARG,
+ "How many pages to flush on LRU eviction",
+ NULL, NULL, 32, 1, SIZE_T_MAX, 0);
+
+static MYSQL_SYSVAR_ULONG(flush_neighbors, srv_flush_neighbors,
+ PLUGIN_VAR_OPCMDARG,
+ "Set to 0 (don't flush neighbors from buffer pool),"
+ " 1 (flush contiguous neighbors from buffer pool)"
+ " or 2 (flush neighbors from buffer pool),"
+ " when flushing a block",
+ NULL, NULL, 1, 0, 2, 0);
+
+static MYSQL_SYSVAR_UINT(commit_concurrency, deprecated::commit_concurrency,
+ PLUGIN_VAR_RQCMDARG,
+ innodb_deprecated_ignored, nullptr, deprecated::commit_concurrency_warn,
+ 0, 0, 1000, 0);
+
+static MYSQL_SYSVAR_UINT(concurrency_tickets, deprecated::concurrency_tickets,
+ PLUGIN_VAR_RQCMDARG,
+ innodb_deprecated_ignored, nullptr, deprecated::concurrency_tickets_warn,
+ 0, 0, ~0U, 0);
+
+static MYSQL_SYSVAR_BOOL(deadlock_detect, innobase_deadlock_detect,
+ PLUGIN_VAR_NOCMDARG,
+ "Enable/disable InnoDB deadlock detector (default ON)."
+ " if set to OFF, deadlock detection is skipped,"
+ " and we rely on innodb_lock_wait_timeout in case of deadlock.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_UINT(fill_factor, innobase_fill_factor,
+ PLUGIN_VAR_RQCMDARG,
+ "Percentage of B-tree page filled during bulk insert",
+ NULL, NULL, 100, 10, 100, 0);
+
+static MYSQL_SYSVAR_BOOL(ft_enable_diag_print, fts_enable_diag_print,
+ PLUGIN_VAR_OPCMDARG,
+ "Whether to enable additional FTS diagnostic printout ",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(disable_sort_file_cache, srv_disable_sort_file_cache,
+ PLUGIN_VAR_OPCMDARG,
+ "Whether to disable OS system file cache for sort I/O",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_STR(ft_aux_table, innodb_ft_aux_table,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
+ "FTS internal auxiliary table to be checked",
+ innodb_ft_aux_table_validate, NULL, NULL);
+
+static MYSQL_SYSVAR_ULONG(ft_cache_size, fts_max_cache_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "InnoDB Fulltext search cache size in bytes",
+ NULL, NULL, 8000000, 1600000, 80000000, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_total_cache_size, fts_max_total_cache_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Total memory allocated for InnoDB Fulltext Search cache",
+ NULL, NULL, 640000000, 32000000, 1600000000, 0);
+
+static MYSQL_SYSVAR_SIZE_T(ft_result_cache_limit, fts_result_cache_limit,
+ PLUGIN_VAR_RQCMDARG,
+ "InnoDB Fulltext search query result cache limit in bytes",
+ NULL, NULL, 2000000000L, 1000000L, SIZE_T_MAX, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_min_token_size, fts_min_token_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "InnoDB Fulltext search minimum token size in characters",
+ NULL, NULL, 3, 0, 16, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_max_token_size, fts_max_token_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "InnoDB Fulltext search maximum token size in characters",
+ NULL, NULL, FTS_MAX_WORD_LEN_IN_CHAR, 10, FTS_MAX_WORD_LEN_IN_CHAR, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_num_word_optimize, fts_num_word_optimize,
+ PLUGIN_VAR_OPCMDARG,
+ "InnoDB Fulltext search number of words to optimize for each optimize table call ",
+ NULL, NULL, 2000, 1000, 10000, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_sort_pll_degree, fts_sort_pll_degree,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "InnoDB Fulltext search parallel sort degree, will round up to nearest power of 2 number",
+ NULL, NULL, 2, 1, 16, 0);
+
+static MYSQL_SYSVAR_ULONG(sort_buffer_size, srv_sort_buf_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Memory buffer size for index creation",
+ NULL, NULL, 1048576, 65536, 64<<20, 0);
+
+static MYSQL_SYSVAR_ULONGLONG(online_alter_log_max_size, srv_online_max_size,
+ PLUGIN_VAR_RQCMDARG,
+ "Maximum modification log file size for online index creation",
+ NULL, NULL, 128<<20, 65536, ~0ULL, 0);
+
+static MYSQL_SYSVAR_BOOL(optimize_fulltext_only, innodb_optimize_fulltext_only,
+ PLUGIN_VAR_NOCMDARG,
+ "Only optimize the Fulltext index of the table",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_UINT(read_io_threads, srv_n_read_io_threads,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of background read I/O threads in InnoDB.",
+ NULL, NULL, 4, 1, 64, 0);
+
+static MYSQL_SYSVAR_UINT(write_io_threads, srv_n_write_io_threads,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of background write I/O threads in InnoDB.",
+ NULL, NULL, 4, 2, 64, 0);
+
+static MYSQL_SYSVAR_ULONG(force_recovery, srv_force_recovery,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Helps to save your data in case the disk image of the database becomes corrupt. Value 5 can return bogus data, and 6 can permanently corrupt data.",
+ NULL, NULL, 0, 0, 6, 0);
+
+static MYSQL_SYSVAR_ULONG(page_size, srv_page_size,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Page size to use for all InnoDB tablespaces.",
+ NULL, NULL, UNIV_PAGE_SIZE_DEF,
+ UNIV_PAGE_SIZE_MIN, UNIV_PAGE_SIZE_MAX, 0);
+
+static MYSQL_SYSVAR_ULONG(log_buffer_size, srv_log_buffer_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "The size of the buffer which InnoDB uses to write log to the log files on disk.",
+ NULL, NULL, 16L << 20, 256L << 10, LONG_MAX, 1024);
+
+static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Size of each log file in a log group.",
+ NULL, NULL, 96 << 20, 1 << 20, std::numeric_limits<ulonglong>::max(),
+ UNIV_PAGE_SIZE_MAX);
+
+static MYSQL_SYSVAR_ULONG(log_files_in_group, deprecated::srv_n_log_files,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ innodb_deprecated_ignored, NULL, NULL, 1, 1, 100, 0);
+
+static MYSQL_SYSVAR_ULONG(log_write_ahead_size, srv_log_write_ahead_size,
+ PLUGIN_VAR_RQCMDARG,
+ "Redo log write ahead unit size to avoid read-on-write,"
+ " it should match the OS cache block IO size",
+ NULL, innodb_log_write_ahead_size_update,
+ 8*1024L, OS_FILE_LOG_BLOCK_SIZE, UNIV_PAGE_SIZE_DEF, OS_FILE_LOG_BLOCK_SIZE);
+
+static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct,
+ PLUGIN_VAR_RQCMDARG,
+ "Percentage of the buffer pool to reserve for 'old' blocks.",
+ NULL, innodb_old_blocks_pct_update, 100 * 3 / 8, 5, 95, 0);
+
+static MYSQL_SYSVAR_UINT(old_blocks_time, buf_LRU_old_threshold_ms,
+ PLUGIN_VAR_RQCMDARG,
+ "Move blocks to the 'new' end of the buffer pool if the first access"
+ " was at least this many milliseconds ago."
+ " The timeout is disabled if 0.",
+ NULL, NULL, 1000, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_ULONG(open_files, innobase_open_files,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "How many files at the maximum InnoDB keeps open at the same time.",
+ NULL, NULL, 0, 0, LONG_MAX, 0);
+
+static MYSQL_SYSVAR_ULONG(sync_spin_loops, srv_n_spin_wait_rounds,
+ PLUGIN_VAR_RQCMDARG,
+ "Count of spin-loop rounds in InnoDB mutexes (30 by default)",
+ NULL, NULL, 30L, 0L, ~0UL, 0);
+
+static MYSQL_SYSVAR_UINT(spin_wait_delay, srv_spin_wait_delay,
+ PLUGIN_VAR_OPCMDARG,
+ "Maximum delay between polling for a spin lock (4 by default)",
+ NULL, NULL, 4, 0, 6000, 0);
+
+static MYSQL_SYSVAR_UINT(thread_concurrency, deprecated::thread_concurrency,
+ PLUGIN_VAR_RQCMDARG,
+ innodb_deprecated_ignored, nullptr, deprecated::thread_concurrency_warn,
+ 0, 0, 1000, 0);
+
+static MYSQL_SYSVAR_UINT(
+ adaptive_max_sleep_delay, deprecated::adaptive_max_sleep_delay,
+ PLUGIN_VAR_RQCMDARG,
+ innodb_deprecated_ignored,
+ nullptr, deprecated::adaptive_max_sleep_delay_warn, 0, 0, 1000000, 0);
+
+static MYSQL_SYSVAR_BOOL(prefix_index_cluster_optimization,
+ srv_prefix_index_cluster_optimization,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable prefix optimization to sometimes avoid cluster index lookups.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_UINT(thread_sleep_delay, deprecated::thread_sleep_delay,
+ PLUGIN_VAR_RQCMDARG,
+ innodb_deprecated_ignored, nullptr, deprecated::thread_sleep_delay_warn,
+ 0, 0, 1000000, 0);
+
+static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Path to individual files and their sizes.",
+ NULL, NULL, "ibdata1:12M:autoextend");
+
+static MYSQL_SYSVAR_STR(temp_data_file_path, innobase_temp_data_file_path,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Path to files and their sizes making temp-tablespace.",
+ NULL, NULL, "ibtmp1:12M:autoextend");
+
+static MYSQL_SYSVAR_STR(undo_directory, srv_undo_dir,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Directory where undo tablespace files live, this path can be absolute.",
+ NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_ULONG(undo_tablespaces, srv_undo_tablespaces,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of undo tablespaces to use.",
+ NULL, NULL,
+ 0L, /* Default setting */
+ 0L, /* Minimum value */
+ TRX_SYS_MAX_UNDO_SPACES, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(undo_logs, deprecated::innodb_undo_logs,
+ PLUGIN_VAR_OPCMDARG,
+ innodb_deprecated_ignored, NULL, innodb_undo_logs_warn,
+ TRX_SYS_N_RSEGS, 0, TRX_SYS_N_RSEGS, 0);
+
+static MYSQL_SYSVAR_ULONGLONG(max_undo_log_size, srv_max_undo_log_size,
+ PLUGIN_VAR_OPCMDARG,
+ "Desired maximum UNDO tablespace size in bytes",
+ NULL, NULL,
+ 10 << 20, 10 << 20,
+ 1ULL << (32 + UNIV_PAGE_SIZE_SHIFT_MAX), 0);
+
+static MYSQL_SYSVAR_ULONG(purge_rseg_truncate_frequency,
+ srv_purge_rseg_truncate_frequency,
+ PLUGIN_VAR_OPCMDARG,
+ "Dictates rate at which UNDO records are purged. Value N means"
+ " purge rollback segment(s) on every Nth iteration of purge invocation",
+ NULL, NULL, 128, 1, 128, 0);
+
+static MYSQL_SYSVAR_BOOL(undo_log_truncate, srv_undo_log_truncate,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable or Disable Truncate of UNDO tablespace.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "The AUTOINC lock modes supported by InnoDB:"
+ " 0 => Old style AUTOINC locking (for backward compatibility);"
+ " 1 => New style AUTOINC locking;"
+ " 2 => No AUTOINC locking (unsafe for SBR)",
+ NULL, NULL,
+ AUTOINC_NEW_STYLE_LOCKING, /* Default setting */
+ AUTOINC_OLD_STYLE_LOCKING, /* Minimum value */
+ AUTOINC_NO_LOCKING, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_STR(version, innodb_version_str,
+ PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
+ "InnoDB version", NULL, NULL, INNODB_VERSION_STR);
+
+static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Use native AIO if supported on this platform.",
+ NULL, NULL, TRUE);
+
+#ifdef HAVE_LIBNUMA
+static MYSQL_SYSVAR_BOOL(numa_interleave, srv_numa_interleave,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Use NUMA interleave memory policy to allocate InnoDB buffer pool.",
+ NULL, NULL, FALSE);
+#endif /* HAVE_LIBNUMA */
+
+static MYSQL_SYSVAR_ENUM(change_buffering, innodb_change_buffering,
+ PLUGIN_VAR_RQCMDARG,
+ "Buffer changes to secondary indexes.",
+ NULL, NULL, IBUF_USE_ALL, &innodb_change_buffering_typelib);
+
+static MYSQL_SYSVAR_UINT(change_buffer_max_size,
+ srv_change_buffer_max_size,
+ PLUGIN_VAR_RQCMDARG,
+ "Maximum on-disk size of change buffer in terms of percentage"
+ " of the buffer pool.",
+ NULL, innodb_change_buffer_max_size_update,
+ CHANGE_BUFFER_DEFAULT_SIZE, 0, 50, 0);
+
+static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies how InnoDB index statistics collection code should"
+ " treat NULLs. Possible values are NULLS_EQUAL (default),"
+ " NULLS_UNEQUAL and NULLS_IGNORED",
+ NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib);
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+static MYSQL_SYSVAR_BOOL(change_buffer_dump, ibuf_dump,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Dump the change buffer at startup.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug,
+ PLUGIN_VAR_RQCMDARG,
+ "Debug flags for InnoDB change buffering (0=none, 1=try to buffer)",
+ NULL, NULL, 0, 0, 1, 0);
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+static MYSQL_SYSVAR_ULONG(buf_dump_status_frequency, srv_buf_dump_status_frequency,
+ PLUGIN_VAR_RQCMDARG,
+ "A number between [0, 100] that tells how oftern buffer pool dump status "
+ "in percentages should be printed. E.g. 10 means that buffer pool dump "
+ "status is printed when every 10% of number of buffer pool pages are "
+ "dumped. Default is 0 (only start and end status is printed).",
+ NULL, NULL, 0, 0, 100, 0);
+
+#ifdef WITH_INNODB_DISALLOW_WRITES
+/*******************************************************
+ * innobase_disallow_writes variable definition *
+ *******************************************************/
+
+/* Must always init to FALSE. */
+static my_bool innobase_disallow_writes = FALSE;
+
+/**************************************************************************
+An "update" method for innobase_disallow_writes variable. */
+static
+void
+innobase_disallow_writes_update(THD*, st_mysql_sys_var*,
+ void* var_ptr, const void* save)
+{
+ const my_bool val = *static_cast<const my_bool*>(save);
+ *static_cast<my_bool*>(var_ptr) = val;
+ ut_a(srv_allow_writes_event);
+ mysql_mutex_unlock(&LOCK_global_system_variables);
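+ /* Toggle the event with LOCK_global_system_variables released:
+ disallowing writes resets srv_allow_writes_event so that writers
+ block on it; allowing writes sets the event to release them. */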
+ if (val) {
+ os_event_reset(srv_allow_writes_event);
+ } else {
+ os_event_set(srv_allow_writes_event);
+ }
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+static MYSQL_SYSVAR_BOOL(disallow_writes, innobase_disallow_writes,
+ PLUGIN_VAR_NOCMDOPT,
+ "Tell InnoDB to stop any writes to disk",
+ NULL, innobase_disallow_writes_update, FALSE);
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+
+static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead,
+ PLUGIN_VAR_NOCMDARG,
+ "Whether to use read ahead for random access within an extent.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of pages that must be accessed sequentially for InnoDB to"
+ " trigger a readahead.",
+ NULL, NULL, 56, 0, 64, 0);
+
+static MYSQL_SYSVAR_STR(monitor_enable, innobase_enable_monitor_counter,
+ PLUGIN_VAR_RQCMDARG,
+ "Turn on a monitor counter",
+ innodb_monitor_validate,
+ innodb_enable_monitor_update, NULL);
+
+static MYSQL_SYSVAR_STR(monitor_disable, innobase_disable_monitor_counter,
+ PLUGIN_VAR_RQCMDARG,
+ "Turn off a monitor counter",
+ innodb_monitor_validate,
+ innodb_disable_monitor_update, NULL);
+
+static MYSQL_SYSVAR_STR(monitor_reset, innobase_reset_monitor_counter,
+ PLUGIN_VAR_RQCMDARG,
+ "Reset a monitor counter",
+ innodb_monitor_validate,
+ innodb_reset_monitor_update, NULL);
+
+static MYSQL_SYSVAR_STR(monitor_reset_all, innobase_reset_all_monitor_counter,
+ PLUGIN_VAR_RQCMDARG,
+ "Reset all values for a monitor counter",
+ innodb_monitor_validate,
+ innodb_reset_all_monitor_update, NULL);
+
+static MYSQL_SYSVAR_BOOL(status_output, srv_print_innodb_monitor,
+ PLUGIN_VAR_OPCMDARG, "Enable InnoDB monitor output to the error log.",
+ NULL, innodb_status_output_update, FALSE);
+
+static MYSQL_SYSVAR_BOOL(status_output_locks, srv_print_innodb_lock_monitor,
+ PLUGIN_VAR_OPCMDARG, "Enable InnoDB lock monitor output to the error log."
+ " Requires innodb_status_output=ON.",
+ NULL, innodb_status_output_update, FALSE);
+
+static MYSQL_SYSVAR_BOOL(print_all_deadlocks, srv_print_all_deadlocks,
+ PLUGIN_VAR_OPCMDARG,
+ "Print all deadlocks to MariaDB error log (off by default)",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(compression_failure_threshold_pct,
+ zip_failure_threshold_pct, PLUGIN_VAR_OPCMDARG,
+ "If the compression failure rate of a table is greater than this number"
+ " more padding is added to the pages to reduce the failures. A value of"
+ " zero implies no padding",
+ NULL, NULL, 5, 0, 100, 0);
+
+static MYSQL_SYSVAR_ULONG(compression_pad_pct_max,
+ zip_pad_max, PLUGIN_VAR_OPCMDARG,
+ "Percentage of empty space on a data page that can be reserved"
+ " to make the page compressible.",
+ NULL, NULL, 50, 0, 75, 0);
+
+static MYSQL_SYSVAR_BOOL(read_only, srv_read_only_mode,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Start InnoDB in read only mode (off by default)",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(cmp_per_index_enabled, srv_cmp_per_index_enabled,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable INFORMATION_SCHEMA.innodb_cmp_per_index,"
+ " may have negative impact on performance (off by default)",
+ NULL, innodb_cmp_per_index_update, FALSE);
+
+static MYSQL_SYSVAR_ENUM(default_row_format, innodb_default_row_format,
+ PLUGIN_VAR_RQCMDARG,
+ "The default ROW FORMAT for all innodb tables created without explicit"
+ " ROW_FORMAT. Possible values are REDUNDANT, COMPACT, and DYNAMIC."
+ " The ROW_FORMAT value COMPRESSED is not allowed",
+ NULL, NULL, DEFAULT_ROW_FORMAT_DYNAMIC,
+ &innodb_default_row_format_typelib);
+
+#ifdef UNIV_DEBUG
+static MYSQL_SYSVAR_UINT(trx_rseg_n_slots_debug, trx_rseg_n_slots_debug,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_NOCMDOPT,
+ "Debug flags for InnoDB to limit TRX_RSEG_N_SLOTS for trx_rsegf_undo_find_free()",
+ NULL, NULL, 0, 0, 1024, 0);
+
+static MYSQL_SYSVAR_UINT(limit_optimistic_insert_debug,
+ btr_cur_limit_optimistic_insert_debug, PLUGIN_VAR_RQCMDARG,
+ "Artificially limit the number of records per B-tree page (0=unlimited).",
+ NULL, NULL, 0, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_BOOL(trx_purge_view_update_only_debug,
+ srv_purge_view_update_only_debug, PLUGIN_VAR_NOCMDOPT,
+ "Pause actual purging any delete-marked records, but merely update the purge view."
+ " It is to create artificially the situation the purge view have been updated"
+ " but the each purges were not done yet.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(evict_tables_on_commit_debug,
+ innodb_evict_tables_on_commit_debug, PLUGIN_VAR_OPCMDARG,
+ "On transaction commit, try to evict tables from the data dictionary cache.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_UINT(data_file_size_debug,
+ srv_sys_space_size_debug,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "InnoDB system tablespace size to be set in recovery.",
+ NULL, NULL, 0, 0, 256U << 20, 0);
+
+static MYSQL_SYSVAR_UINT(fil_make_page_dirty_debug,
+ srv_fil_make_page_dirty_debug, PLUGIN_VAR_OPCMDARG,
+ "Make the first page of the given tablespace dirty.",
+ NULL, innodb_make_page_dirty, 0, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_UINT(saved_page_number_debug,
+ srv_saved_page_number_debug, PLUGIN_VAR_OPCMDARG,
+ "An InnoDB page number.",
+ NULL, NULL, 0, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_BOOL(disable_resize_buffer_pool_debug,
+ buf_disable_resize_buffer_pool_debug, PLUGIN_VAR_NOCMDARG,
+ "Disable resizing buffer pool to make assertion code not expensive.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(page_cleaner_disabled_debug,
+ innodb_page_cleaner_disabled_debug, PLUGIN_VAR_OPCMDARG,
+ "Disable page cleaner",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(sync_debug, srv_sync_debug,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Enable the sync debug checks",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(dict_stats_disabled_debug,
+ innodb_dict_stats_disabled_debug,
+ PLUGIN_VAR_OPCMDARG,
+ "Disable dict_stats thread",
+ NULL, dict_stats_disabled_debug_update, FALSE);
+
+static MYSQL_SYSVAR_BOOL(master_thread_disabled_debug,
+ srv_master_thread_disabled_debug,
+ PLUGIN_VAR_OPCMDARG,
+ "Disable master thread",
+ NULL, srv_master_thread_disabled_debug_update, FALSE);
+#endif /* UNIV_DEBUG */
+
+static MYSQL_SYSVAR_BOOL(force_primary_key,
+ srv_force_primary_key,
+ PLUGIN_VAR_OPCMDARG,
+ "Do not allow creating a table without primary key (off by default)",
+ NULL, NULL, FALSE);
+
+static const char *page_compression_algorithms[]= { "none", "zlib", "lz4", "lzo", "lzma", "bzip2", "snappy", 0 };
+static TYPELIB page_compression_algorithms_typelib=
+{
+ array_elements(page_compression_algorithms) - 1, 0,
+ page_compression_algorithms, 0
+};
+static MYSQL_SYSVAR_ENUM(compression_algorithm, innodb_compression_algorithm,
+ PLUGIN_VAR_OPCMDARG,
+ "Compression algorithm used on page compression. One of: none, zlib, lz4, lzo, lzma, bzip2, or snappy",
+ innodb_compression_algorithm_validate, NULL,
+ /* We use the largest number of the supported compression methods here
+ so that all available methods are enabled. The availability of a
+ compression method is verified in innodb_compression_algorithm_validate(). */
+ PAGE_ZLIB_ALGORITHM,
+ &page_compression_algorithms_typelib);
+
+static MYSQL_SYSVAR_ULONG(fatal_semaphore_wait_threshold, srv_fatal_semaphore_wait_threshold,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Maximum number of seconds that semaphore times out in InnoDB.",
+ NULL, NULL,
+ DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT, /* Default setting */
+ 1, /* Minimum setting */
+ UINT_MAX32, /* Maximum setting */
+ 0);
+
+static const char* srv_encrypt_tables_names[] = { "OFF", "ON", "FORCE", 0 };
+static TYPELIB srv_encrypt_tables_typelib = {
+ array_elements(srv_encrypt_tables_names)-1, 0, srv_encrypt_tables_names,
+ NULL
+};
+static MYSQL_SYSVAR_ENUM(encrypt_tables, srv_encrypt_tables,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable encryption for tables. "
+ "Don't forget to enable --innodb-encrypt-log too",
+ innodb_encrypt_tables_validate,
+ innodb_encrypt_tables_update,
+ 0,
+ &srv_encrypt_tables_typelib);
+
+static MYSQL_SYSVAR_UINT(encryption_threads, srv_n_fil_crypt_threads,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of threads performing background key rotation ",
+ NULL,
+ innodb_encryption_threads_update,
+ 0, 0, 255, 0);
+
+static MYSQL_SYSVAR_UINT(encryption_rotate_key_age,
+ srv_fil_crypt_rotate_key_age,
+ PLUGIN_VAR_RQCMDARG,
+ "Key rotation - re-encrypt in background "
+ "all pages that were encrypted with a key that "
+ "many (or more) versions behind. Value 0 indicates "
+ "that key rotation is disabled.",
+ NULL,
+ innodb_encryption_rotate_key_age_update,
+ 1, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_UINT(encryption_rotation_iops, srv_n_fil_crypt_iops,
+ PLUGIN_VAR_RQCMDARG,
+ "Use this many iops for background key rotation",
+ NULL,
+ innodb_encryption_rotation_iops_update,
+ srv_n_fil_crypt_iops, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_BOOL(scrub_log, deprecated::innodb_scrub_log,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ innodb_deprecated_ignored,
+ 0, 0, 0);
+
+static MYSQL_SYSVAR_ULONGLONG(scrub_log_speed, deprecated::innodb_scrub_log_speed,
+ PLUGIN_VAR_OPCMDARG,
+ innodb_deprecated_ignored, NULL, innodb_scrub_log_speed_warn,
+ 256, 1, 50000, 0);
+
+static MYSQL_SYSVAR_BOOL(encrypt_log, srv_encrypt_log,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Enable redo log encryption",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(immediate_scrub_data_uncompressed,
+ srv_immediate_scrub_data_uncompressed,
+ 0,
+ "Enable scrubbing of data",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(background_scrub_data_uncompressed,
+ deprecated::innodb_background_scrub_data_uncompressed,
+ PLUGIN_VAR_OPCMDARG, innodb_deprecated_ignored, NULL,
+ innodb_background_scrub_data_uncompressed_warn, FALSE);
+
+static MYSQL_SYSVAR_BOOL(background_scrub_data_compressed,
+ deprecated::innodb_background_scrub_data_compressed,
+ PLUGIN_VAR_OPCMDARG, innodb_deprecated_ignored, NULL,
+ innodb_background_scrub_data_compressed_warn, FALSE);
+
+static MYSQL_SYSVAR_UINT(background_scrub_data_check_interval,
+ deprecated::innodb_background_scrub_data_check_interval,
+ 0, innodb_deprecated_ignored, NULL,
+ innodb_background_scrub_data_check_interval_warn, 0, 0, 0, 0);
+
+static MYSQL_SYSVAR_UINT(background_scrub_data_interval,
+ deprecated::innodb_background_scrub_data_interval,
+ 0, innodb_deprecated_ignored, NULL,
+ innodb_background_scrub_data_interval_warn, 0, 0, 0, 0);
+
+static MYSQL_SYSVAR_BOOL(encrypt_temporary_tables, innodb_encrypt_temporary_tables,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Enrypt the temporary table data.",
+ NULL, NULL, false);
+
+static struct st_mysql_sys_var* innobase_system_variables[]= {
+ MYSQL_SYSVAR(autoextend_increment),
+ MYSQL_SYSVAR(buffer_pool_size),
+ MYSQL_SYSVAR(buffer_pool_chunk_size),
+ MYSQL_SYSVAR(buffer_pool_instances),
+ MYSQL_SYSVAR(buffer_pool_filename),
+ MYSQL_SYSVAR(buffer_pool_dump_now),
+ MYSQL_SYSVAR(buffer_pool_dump_at_shutdown),
+ MYSQL_SYSVAR(buffer_pool_dump_pct),
+#ifdef UNIV_DEBUG
+ MYSQL_SYSVAR(buffer_pool_evict),
+#endif /* UNIV_DEBUG */
+ MYSQL_SYSVAR(buffer_pool_load_now),
+ MYSQL_SYSVAR(buffer_pool_load_abort),
+#ifdef UNIV_DEBUG
+ MYSQL_SYSVAR(buffer_pool_load_pages_abort),
+#endif /* UNIV_DEBUG */
+ MYSQL_SYSVAR(buffer_pool_load_at_startup),
+ MYSQL_SYSVAR(defragment),
+ MYSQL_SYSVAR(defragment_n_pages),
+ MYSQL_SYSVAR(defragment_stats_accuracy),
+ MYSQL_SYSVAR(defragment_fill_factor),
+ MYSQL_SYSVAR(defragment_fill_factor_n_recs),
+ MYSQL_SYSVAR(defragment_frequency),
+ MYSQL_SYSVAR(lru_scan_depth),
+ MYSQL_SYSVAR(lru_flush_size),
+ MYSQL_SYSVAR(flush_neighbors),
+ MYSQL_SYSVAR(checksum_algorithm),
+ MYSQL_SYSVAR(log_checksums),
+ MYSQL_SYSVAR(commit_concurrency),
+ MYSQL_SYSVAR(concurrency_tickets),
+ MYSQL_SYSVAR(compression_level),
+ MYSQL_SYSVAR(data_file_path),
+ MYSQL_SYSVAR(temp_data_file_path),
+ MYSQL_SYSVAR(data_home_dir),
+ MYSQL_SYSVAR(doublewrite),
+ MYSQL_SYSVAR(stats_include_delete_marked),
+ MYSQL_SYSVAR(use_atomic_writes),
+ MYSQL_SYSVAR(fast_shutdown),
+ MYSQL_SYSVAR(read_io_threads),
+ MYSQL_SYSVAR(write_io_threads),
+ MYSQL_SYSVAR(file_per_table),
+ MYSQL_SYSVAR(file_format), /* deprecated in MariaDB 10.2; no effect */
+ MYSQL_SYSVAR(flush_log_at_timeout),
+ MYSQL_SYSVAR(flush_log_at_trx_commit),
+ MYSQL_SYSVAR(flush_method),
+ MYSQL_SYSVAR(force_recovery),
+ MYSQL_SYSVAR(fill_factor),
+ MYSQL_SYSVAR(ft_cache_size),
+ MYSQL_SYSVAR(ft_total_cache_size),
+ MYSQL_SYSVAR(ft_result_cache_limit),
+ MYSQL_SYSVAR(ft_enable_stopword),
+ MYSQL_SYSVAR(ft_max_token_size),
+ MYSQL_SYSVAR(ft_min_token_size),
+ MYSQL_SYSVAR(ft_num_word_optimize),
+ MYSQL_SYSVAR(ft_sort_pll_degree),
+ MYSQL_SYSVAR(large_prefix), /* deprecated in MariaDB 10.2; no effect */
+ MYSQL_SYSVAR(force_load_corrupted),
+ MYSQL_SYSVAR(lock_schedule_algorithm),
+ MYSQL_SYSVAR(lock_wait_timeout),
+ MYSQL_SYSVAR(deadlock_detect),
+ MYSQL_SYSVAR(page_size),
+ MYSQL_SYSVAR(log_buffer_size),
+ MYSQL_SYSVAR(log_file_size),
+ MYSQL_SYSVAR(log_files_in_group),
+ MYSQL_SYSVAR(log_write_ahead_size),
+ MYSQL_SYSVAR(log_group_home_dir),
+ MYSQL_SYSVAR(log_compressed_pages),
+ MYSQL_SYSVAR(log_optimize_ddl),
+ MYSQL_SYSVAR(max_dirty_pages_pct),
+ MYSQL_SYSVAR(max_dirty_pages_pct_lwm),
+ MYSQL_SYSVAR(adaptive_flushing_lwm),
+ MYSQL_SYSVAR(adaptive_flushing),
+ MYSQL_SYSVAR(flush_sync),
+ MYSQL_SYSVAR(flushing_avg_loops),
+ MYSQL_SYSVAR(max_purge_lag),
+ MYSQL_SYSVAR(max_purge_lag_delay),
+ MYSQL_SYSVAR(max_purge_lag_wait),
+ MYSQL_SYSVAR(old_blocks_pct),
+ MYSQL_SYSVAR(old_blocks_time),
+ MYSQL_SYSVAR(open_files),
+ MYSQL_SYSVAR(optimize_fulltext_only),
+ MYSQL_SYSVAR(rollback_on_timeout),
+ MYSQL_SYSVAR(ft_aux_table),
+ MYSQL_SYSVAR(ft_enable_diag_print),
+ MYSQL_SYSVAR(ft_server_stopword_table),
+ MYSQL_SYSVAR(ft_user_stopword_table),
+ MYSQL_SYSVAR(disable_sort_file_cache),
+ MYSQL_SYSVAR(stats_on_metadata),
+ MYSQL_SYSVAR(stats_transient_sample_pages),
+ MYSQL_SYSVAR(stats_persistent),
+ MYSQL_SYSVAR(stats_persistent_sample_pages),
+ MYSQL_SYSVAR(stats_auto_recalc),
+ MYSQL_SYSVAR(stats_modified_counter),
+ MYSQL_SYSVAR(stats_traditional),
+#ifdef BTR_CUR_HASH_ADAPT
+ MYSQL_SYSVAR(adaptive_hash_index),
+ MYSQL_SYSVAR(adaptive_hash_index_parts),
+#endif /* BTR_CUR_HASH_ADAPT */
+ MYSQL_SYSVAR(stats_method),
+ MYSQL_SYSVAR(replication_delay),
+ MYSQL_SYSVAR(status_file),
+ MYSQL_SYSVAR(strict_mode),
+ MYSQL_SYSVAR(sort_buffer_size),
+ MYSQL_SYSVAR(online_alter_log_max_size),
+ MYSQL_SYSVAR(sync_spin_loops),
+ MYSQL_SYSVAR(spin_wait_delay),
+ MYSQL_SYSVAR(table_locks),
+ MYSQL_SYSVAR(thread_concurrency),
+ MYSQL_SYSVAR(adaptive_max_sleep_delay),
+ MYSQL_SYSVAR(prefix_index_cluster_optimization),
+ MYSQL_SYSVAR(thread_sleep_delay),
+ MYSQL_SYSVAR(tmpdir),
+ MYSQL_SYSVAR(autoinc_lock_mode),
+ MYSQL_SYSVAR(version),
+ MYSQL_SYSVAR(use_native_aio),
+#ifdef HAVE_LIBNUMA
+ MYSQL_SYSVAR(numa_interleave),
+#endif /* HAVE_LIBNUMA */
+ MYSQL_SYSVAR(change_buffering),
+ MYSQL_SYSVAR(change_buffer_max_size),
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ MYSQL_SYSVAR(change_buffer_dump),
+ MYSQL_SYSVAR(change_buffering_debug),
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+#ifdef WITH_INNODB_DISALLOW_WRITES
+ MYSQL_SYSVAR(disallow_writes),
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+ MYSQL_SYSVAR(random_read_ahead),
+ MYSQL_SYSVAR(read_ahead_threshold),
+ MYSQL_SYSVAR(read_only),
+ MYSQL_SYSVAR(instant_alter_column_allowed),
+ MYSQL_SYSVAR(io_capacity),
+ MYSQL_SYSVAR(io_capacity_max),
+ MYSQL_SYSVAR(page_cleaners),
+ MYSQL_SYSVAR(monitor_enable),
+ MYSQL_SYSVAR(monitor_disable),
+ MYSQL_SYSVAR(monitor_reset),
+ MYSQL_SYSVAR(monitor_reset_all),
+ MYSQL_SYSVAR(purge_threads),
+ MYSQL_SYSVAR(purge_batch_size),
+#ifdef UNIV_DEBUG
+ MYSQL_SYSVAR(background_drop_list_empty),
+ MYSQL_SYSVAR(log_checkpoint_now),
+ MYSQL_SYSVAR(buf_flush_list_now),
+ MYSQL_SYSVAR(merge_threshold_set_all_debug),
+#endif /* UNIV_DEBUG */
+ MYSQL_SYSVAR(status_output),
+ MYSQL_SYSVAR(status_output_locks),
+ MYSQL_SYSVAR(print_all_deadlocks),
+ MYSQL_SYSVAR(cmp_per_index_enabled),
+ MYSQL_SYSVAR(undo_logs),
+ MYSQL_SYSVAR(max_undo_log_size),
+ MYSQL_SYSVAR(purge_rseg_truncate_frequency),
+ MYSQL_SYSVAR(undo_log_truncate),
+ MYSQL_SYSVAR(undo_directory),
+ MYSQL_SYSVAR(undo_tablespaces),
+ MYSQL_SYSVAR(sync_array_size),
+ MYSQL_SYSVAR(compression_failure_threshold_pct),
+ MYSQL_SYSVAR(compression_pad_pct_max),
+ MYSQL_SYSVAR(default_row_format),
+#ifdef UNIV_DEBUG
+ MYSQL_SYSVAR(trx_rseg_n_slots_debug),
+ MYSQL_SYSVAR(limit_optimistic_insert_debug),
+ MYSQL_SYSVAR(trx_purge_view_update_only_debug),
+ MYSQL_SYSVAR(evict_tables_on_commit_debug),
+ MYSQL_SYSVAR(data_file_size_debug),
+ MYSQL_SYSVAR(fil_make_page_dirty_debug),
+ MYSQL_SYSVAR(saved_page_number_debug),
+ MYSQL_SYSVAR(disable_resize_buffer_pool_debug),
+ MYSQL_SYSVAR(page_cleaner_disabled_debug),
+ MYSQL_SYSVAR(dict_stats_disabled_debug),
+ MYSQL_SYSVAR(master_thread_disabled_debug),
+ MYSQL_SYSVAR(sync_debug),
+#endif /* UNIV_DEBUG */
+ MYSQL_SYSVAR(force_primary_key),
+ MYSQL_SYSVAR(fatal_semaphore_wait_threshold),
+ /* Table page compression feature */
+ MYSQL_SYSVAR(compression_default),
+ MYSQL_SYSVAR(compression_algorithm),
+ /* Encryption feature */
+ MYSQL_SYSVAR(encrypt_tables),
+ MYSQL_SYSVAR(encryption_threads),
+ MYSQL_SYSVAR(encryption_rotate_key_age),
+ MYSQL_SYSVAR(encryption_rotation_iops),
+ MYSQL_SYSVAR(scrub_log),
+ MYSQL_SYSVAR(scrub_log_speed),
+ MYSQL_SYSVAR(encrypt_log),
+ MYSQL_SYSVAR(default_encryption_key_id),
+ /* Scrubbing feature */
+ MYSQL_SYSVAR(immediate_scrub_data_uncompressed),
+ MYSQL_SYSVAR(background_scrub_data_uncompressed),
+ MYSQL_SYSVAR(background_scrub_data_compressed),
+ MYSQL_SYSVAR(background_scrub_data_interval),
+ MYSQL_SYSVAR(background_scrub_data_check_interval),
+ MYSQL_SYSVAR(buf_dump_status_frequency),
+ MYSQL_SYSVAR(background_thread),
+ MYSQL_SYSVAR(encrypt_temporary_tables),
+
+ NULL
+};
+
+maria_declare_plugin(innobase)
+{
+ MYSQL_STORAGE_ENGINE_PLUGIN,
+ &innobase_storage_engine,
+ innobase_hton_name,
+ plugin_author,
+ "Supports transactions, row-level locking, foreign keys and encryption for tables",
+ PLUGIN_LICENSE_GPL,
+ innodb_init, /* Plugin Init */
+ NULL, /* Plugin Deinit */
+ INNODB_VERSION_SHORT,
+ innodb_status_variables_export,/* status variables */
+ innobase_system_variables, /* system variables */
+ INNODB_VERSION_STR, /* string version */
+ MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+},
+i_s_innodb_trx,
+i_s_innodb_locks,
+i_s_innodb_lock_waits,
+i_s_innodb_cmp,
+i_s_innodb_cmp_reset,
+i_s_innodb_cmpmem,
+i_s_innodb_cmpmem_reset,
+i_s_innodb_cmp_per_index,
+i_s_innodb_cmp_per_index_reset,
+i_s_innodb_buffer_page,
+i_s_innodb_buffer_page_lru,
+i_s_innodb_buffer_stats,
+i_s_innodb_metrics,
+i_s_innodb_ft_default_stopword,
+i_s_innodb_ft_deleted,
+i_s_innodb_ft_being_deleted,
+i_s_innodb_ft_config,
+i_s_innodb_ft_index_cache,
+i_s_innodb_ft_index_table,
+i_s_innodb_sys_tables,
+i_s_innodb_sys_tablestats,
+i_s_innodb_sys_indexes,
+i_s_innodb_sys_columns,
+i_s_innodb_sys_fields,
+i_s_innodb_sys_foreign,
+i_s_innodb_sys_foreign_cols,
+i_s_innodb_sys_tablespaces,
+i_s_innodb_sys_datafiles,
+i_s_innodb_sys_virtual,
+i_s_innodb_mutexes,
+i_s_innodb_sys_semaphore_waits,
+i_s_innodb_tablespaces_encryption
+maria_declare_plugin_end;
+
+/** @brief Adjust some InnoDB startup parameters based on file contents
+or innodb_page_size. */
+static
+void
+innodb_params_adjust()
+{
+ /* The default value and the max value of
+ innodb_undo_logs must be equal to the available undo logs. */
+ MYSQL_SYSVAR_NAME(undo_logs).max_val
+ = MYSQL_SYSVAR_NAME(undo_logs).def_val
+ = srv_available_undo_logs;
+ MYSQL_SYSVAR_NAME(max_undo_log_size).max_val
+ = 1ULL << (32U + srv_page_size_shift);
+ MYSQL_SYSVAR_NAME(max_undo_log_size).min_val
+ = MYSQL_SYSVAR_NAME(max_undo_log_size).def_val
+ = ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES)
+ << srv_page_size_shift;
+}
+
+/****************************************************************************
+ * DS-MRR implementation
+ ***************************************************************************/
+
+/**
+Multi Range Read interface, DS-MRR calls */
+int
+ha_innobase::multi_range_read_init(
+ RANGE_SEQ_IF* seq,
+ void* seq_init_param,
+ uint n_ranges,
+ uint mode,
+ HANDLER_BUFFER* buf)
+{
+ return(m_ds_mrr.dsmrr_init(this, seq, seq_init_param,
+ n_ranges, mode, buf));
+}
+
+int
+ha_innobase::multi_range_read_next(
+ range_id_t* range_info)
+{
+ return(m_ds_mrr.dsmrr_next(range_info));
+}
+
+ha_rows
+ha_innobase::multi_range_read_info_const(
+ uint keyno,
+ RANGE_SEQ_IF* seq,
+ void* seq_init_param,
+ uint n_ranges,
+ uint* bufsz,
+ uint* flags,
+ Cost_estimate* cost)
+{
+ /* See comments in ha_myisam::multi_range_read_info_const */
+ m_ds_mrr.init(this, table);
+
+ if (m_prebuilt->select_lock_type != LOCK_NONE) {
+ *flags |= HA_MRR_USE_DEFAULT_IMPL;
+ }
+
+ ha_rows res= m_ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, n_ranges,
+ bufsz, flags, cost);
+ return res;
+}
+
+ha_rows
+ha_innobase::multi_range_read_info(
+ uint keyno,
+ uint n_ranges,
+ uint keys,
+ uint key_parts,
+ uint* bufsz,
+ uint* flags,
+ Cost_estimate* cost)
+{
+ m_ds_mrr.init(this, table);
+ ha_rows res= m_ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz,
+ flags, cost);
+ return res;
+}
+
+int
+ha_innobase::multi_range_read_explain_info(
+ uint mrr_mode,
+ char *str,
+ size_t size)
+{
+ return m_ds_mrr.dsmrr_explain_info(mrr_mode, str, size);
+}
+
+/** Find or open a table handle for the virtual column template
+@param[in] thd thread handle
+@param[in,out] table InnoDB table whose virtual column template
+ is to be updated
+@return table handle
+@retval NULL if the table has been dropped, or is inaccessible or
+corrupted (for the purge thread) */
+static TABLE* innodb_find_table_for_vc(THD* thd, dict_table_t* table)
+{
+ TABLE *mysql_table;
+ const bool bg_thread = THDVAR(thd, background_thread);
+
+ if (bg_thread) {
+ if ((mysql_table = get_purge_table(thd))) {
+ return mysql_table;
+ }
+ } else {
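+ /* For non-purge threads, reuse the TABLE handle that is cached
+ in the virtual column template if it was opened by the current
+ query (same query id). */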
+ if (table->vc_templ->mysql_table_query_id
+ == thd_get_query_id(thd)) {
+ return table->vc_templ->mysql_table;
+ }
+ }
+
+ char db_buf[NAME_LEN + 1];
+ char tbl_buf[NAME_LEN + 1];
+ ulint db_buf_len, tbl_buf_len;
+
+ if (!table->parse_name(db_buf, tbl_buf, &db_buf_len, &tbl_buf_len)) {
+ return NULL;
+ }
+
+ if (bg_thread) {
+ return open_purge_table(thd, db_buf, db_buf_len,
+ tbl_buf, tbl_buf_len);
+ }
+
+ mysql_table = find_fk_open_table(thd, db_buf, db_buf_len,
+ tbl_buf, tbl_buf_len);
+ table->vc_templ->mysql_table = mysql_table;
+ table->vc_templ->mysql_table_query_id = thd_get_query_id(thd);
+ return mysql_table;
+}
+
+/** Build the virtual column template for a table.
+@param[in,out] table table whose virtual column
+ template is to be built
+@return the MariaDB table used to build the template, or NULL */
+TABLE* innobase_init_vc_templ(dict_table_t* table)
+{
+ if (table->vc_templ != NULL) {
+ return NULL;
+ }
+ DBUG_ENTER("innobase_init_vc_templ");
+
+ table->vc_templ = UT_NEW_NOKEY(dict_vcol_templ_t());
+
+ TABLE *mysql_table= innodb_find_table_for_vc(current_thd, table);
+
+ ut_ad(mysql_table);
+ if (!mysql_table) {
+ DBUG_RETURN(NULL);
+ }
+
+ mutex_enter(&dict_sys.mutex);
+ innobase_build_v_templ(mysql_table, table, table->vc_templ, NULL, true);
+ mutex_exit(&dict_sys.mutex);
+ DBUG_RETURN(mysql_table);
+}
+
+/** Change the database and table names in table->vc_templ.
+@param[in,out] table the table whose virtual column template
+database and table names are to be renamed */
+void
+innobase_rename_vc_templ(
+ dict_table_t* table)
+{
+ char dbname[MAX_DATABASE_NAME_LEN + 1];
+ char tbname[MAX_DATABASE_NAME_LEN + 1];
+ char* name = table->name.m_name;
+ ulint dbnamelen = dict_get_db_name_len(name);
+ ulint tbnamelen = strlen(name) - dbnamelen - 1;
+ char t_dbname[MAX_DATABASE_NAME_LEN + 1];
+ char t_tbname[MAX_TABLE_NAME_LEN + 1];
+
+ strncpy(dbname, name, dbnamelen);
+ dbname[dbnamelen] = 0;
+ strncpy(tbname, name + dbnamelen + 1, tbnamelen);
+ tbname[tbnamelen] = 0;
+
+ /* For partition table, remove the partition name and use the
+ "main" table name to build the template */
+ char* is_part = is_partition(tbname);
+
+ if (is_part != NULL) {
+ *is_part = '\0';
+ tbnamelen = ulint(is_part - tbname);
+ }
+
+ dbnamelen = filename_to_tablename(dbname, t_dbname,
+ MAX_DATABASE_NAME_LEN + 1);
+ tbnamelen = filename_to_tablename(tbname, t_tbname,
+ MAX_TABLE_NAME_LEN + 1);
+
+ table->vc_templ->db_name = t_dbname;
+ table->vc_templ->tb_name = t_tbname;
+}
+
+/** Get the updated parent field value from the update vector for the
+given col_no.
+@param[in] foreign foreign key information
+@param[in] update updated parent vector.
+@param[in] col_no base column position of the child table to check
+@return updated field from the parent update vector, else NULL */
+static
+dfield_t*
+innobase_get_field_from_update_vector(
+ dict_foreign_t* foreign,
+ upd_t* update,
+ ulint col_no)
+{
+ dict_table_t* parent_table = foreign->referenced_table;
+ dict_index_t* parent_index = foreign->referenced_index;
+ ulint parent_field_no;
+ ulint parent_col_no;
+ ulint prefix_col_no;
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ if (dict_index_get_nth_col_no(foreign->foreign_index, i)
+ != col_no) {
+ continue;
+ }
+
+ parent_col_no = dict_index_get_nth_col_no(parent_index, i);
+ parent_field_no = dict_table_get_nth_col_pos(
+ parent_table, parent_col_no, &prefix_col_no);
+
+ for (ulint j = 0; j < update->n_fields; j++) {
+ upd_field_t* parent_ufield
+ = &update->fields[j];
+
+ if (parent_ufield->field_no == parent_field_no) {
+ return(&parent_ufield->new_val);
+ }
+ }
+ }
+
+ return (NULL);
+}
+
+
+/**
+ Allocate a heap and record for calculating virtual fields
+ Used mainly for virtual fields in indexes
+
+@param[in] thd MariaDB THD
+@param[in] index Index in use
+@param[out] heap Heap that holds temporary row
+@param[in,out] table MariaDB table
+@param[out] storage Internal storage for blobs etc.; also receives
+ the pointer to the allocated MariaDB record
+
+@retval true on success
+@retval false on malloc failure, or if the MariaDB table could not
+ be opened for the purge thread.
+*/
+
+bool innobase_allocate_row_for_vcol(THD *thd, dict_index_t *index,
+ mem_heap_t **heap, TABLE **table,
+ VCOL_STORAGE *storage)
+{
+ TABLE *maria_table;
+ String *blob_value_storage;
+ if (!*table)
+ *table = innodb_find_table_for_vc(thd, index->table);
+
+ /* For the purge thread, there is a possibility that the table could
+ have been dropped, or be corrupted or inaccessible. */
+ if (!*table)
+ return false;
+ maria_table = *table;
+ if (!*heap && !(*heap = mem_heap_create(srv_page_size)))
+ return false;
+
+ uchar *record = static_cast<byte *>(mem_heap_alloc(*heap,
+ maria_table->s->reclength));
+
+ size_t len = maria_table->s->virtual_not_stored_blob_fields * sizeof(String);
+ blob_value_storage = static_cast<String *>(mem_heap_alloc(*heap, len));
+
+ if (!record || !blob_value_storage)
+ return false;
+
+ storage->maria_table = maria_table;
+ storage->innobase_record = record;
+ storage->maria_record = maria_table->field[0]->record_ptr();
+ storage->blob_value_storage = blob_value_storage;
+
+ maria_table->move_fields(maria_table->field, record, storage->maria_record);
+ maria_table->remember_blob_values(blob_value_storage);
+
+ return true;
+}
+
+
+/** Free memory allocated by innobase_allocate_row_for_vcol() */
+
+void innobase_free_row_for_vcol(VCOL_STORAGE *storage)
+{
+ TABLE *maria_table= storage->maria_table;
+ maria_table->move_fields(maria_table->field, storage->maria_record,
+ storage->innobase_record);
+ maria_table->restore_blob_values(storage->blob_value_storage);
+}
+
+
+void innobase_report_computed_value_failed(dtuple_t *row)
+{
+ ib::error() << "Compute virtual column values failed for "
+ << rec_printer(row).str();
+}
+
+
+/** Get the computed value by supplying the base column values.
+@param[in,out] row the data row
+@param[in] col virtual column
+@param[in] index index
+@param[in,out] local_heap heap memory for processing large data etc.
+@param[in,out] heap memory heap that copies the actual index row
+@param[in] ifield index field
+@param[in] thd MariaDB thread handle
+@param[in,out] mysql_table MariaDB table object
+@param[in,out] mysql_rec MariaDB record buffer used for computing the value
+@param[in] old_table during ALTER TABLE, this is the old table
+ or NULL.
+@param[in] parent_update update vector for the parent row
+@param[in] foreign foreign key information
+@return the field filled with the computed value, or NULL if the
+computation fails */
+dfield_t*
+innobase_get_computed_value(
+ dtuple_t* row,
+ const dict_v_col_t* col,
+ const dict_index_t* index,
+ mem_heap_t** local_heap,
+ mem_heap_t* heap,
+ const dict_field_t* ifield,
+ THD* thd,
+ TABLE* mysql_table,
+ byte* mysql_rec,
+ const dict_table_t* old_table,
+ upd_t* parent_update,
+ dict_foreign_t* foreign)
+{
+ byte rec_buf2[REC_VERSION_56_MAX_INDEX_COL_LEN];
+ byte* buf;
+ dfield_t* field;
+ ulint len;
+
+ const ulint zip_size = old_table
+ ? old_table->space->zip_size()
+ : dict_tf_get_zip_size(index->table->flags);
+
+ ulint ret = 0;
+
+ ut_ad(index->table->vc_templ);
+ ut_ad(thd != NULL);
+ ut_ad(mysql_table);
+
+ DBUG_ENTER("innobase_get_computed_value");
+ const mysql_row_templ_t*
+ vctempl = index->table->vc_templ->vtempl[
+ index->table->vc_templ->n_col + col->v_pos];
+
+ if (!heap || index->table->vc_templ->rec_len
+ >= REC_VERSION_56_MAX_INDEX_COL_LEN) {
+ if (*local_heap == NULL) {
+ *local_heap = mem_heap_create(srv_page_size);
+ }
+
+ buf = static_cast<byte*>(mem_heap_alloc(
+ *local_heap, index->table->vc_templ->rec_len));
+ } else {
+ buf = rec_buf2;
+ }
+
+ for (ulint i = 0; i < unsigned{col->num_base}; i++) {
+ dict_col_t* base_col = col->base_col[i];
+ const dfield_t* row_field = NULL;
+ ulint col_no = base_col->ind;
+ const mysql_row_templ_t* templ
+ = index->table->vc_templ->vtempl[col_no];
+ const byte* data;
+
+ if (parent_update != NULL) {
+ /** Get the updated field from update vector
+ of the parent table. */
+ row_field = innobase_get_field_from_update_vector(
+ foreign, parent_update, col_no);
+ }
+
+ if (row_field == NULL) {
+ row_field = dtuple_get_nth_field(row, col_no);
+ }
+
+ data = static_cast<const byte*>(row_field->data);
+ len = row_field->len;
+
+ if (row_field->ext) {
+ if (*local_heap == NULL) {
+ *local_heap = mem_heap_create(srv_page_size);
+ }
+
+ data = btr_copy_externally_stored_field(
+ &len, data, zip_size,
+ dfield_get_len(row_field), *local_heap);
+ }
+
+ if (len == UNIV_SQL_NULL) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+ mysql_rec[templ->mysql_null_byte_offset]
+ |= (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ memcpy(mysql_rec + templ->mysql_col_offset,
+ static_cast<const byte*>(
+ index->table->vc_templ->default_rec
+ + templ->mysql_col_offset),
+ templ->mysql_col_len);
+ } else {
+
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ, index, templ->clust_rec_field_no,
+ (const byte*)data, len);
+
+ if (templ->mysql_null_bit_mask) {
+ /* It is a nullable column with a
+ non-NULL value */
+ mysql_rec[templ->mysql_null_byte_offset]
+ &= static_cast<byte>(
+ ~templ->mysql_null_bit_mask);
+ }
+ }
+ }
+
+ field = dtuple_get_nth_v_field(row, col->v_pos);
+
+ MY_BITMAP *old_write_set = dbug_tmp_use_all_columns(mysql_table, &mysql_table->write_set);
+ MY_BITMAP *old_read_set = dbug_tmp_use_all_columns(mysql_table, &mysql_table->read_set);
+ ret = mysql_table->update_virtual_field(mysql_table->field[col->m_col.ind]);
+ dbug_tmp_restore_column_map(&mysql_table->read_set, old_read_set);
+ dbug_tmp_restore_column_map(&mysql_table->write_set, old_write_set);
+
+ if (ret != 0) {
+ DBUG_RETURN(NULL);
+ }
+
+ if (vctempl->mysql_null_bit_mask
+ && (mysql_rec[vctempl->mysql_null_byte_offset]
+ & vctempl->mysql_null_bit_mask)) {
+ dfield_set_null(field);
+ field->type.prtype |= DATA_VIRTUAL;
+ DBUG_RETURN(field);
+ }
+
+ row_mysql_store_col_in_innobase_format(
+ field, buf,
+ TRUE, mysql_rec + vctempl->mysql_col_offset,
+ vctempl->mysql_col_len, dict_table_is_comp(index->table));
+ field->type.prtype |= DATA_VIRTUAL;
+
+ ulint max_prefix = col->m_col.max_prefix;
+
+ if (max_prefix && ifield
+ && (ifield->prefix_len == 0
+ || ifield->prefix_len > col->m_col.max_prefix)) {
+ max_prefix = ifield->prefix_len;
+ }
+
+ /* If this is a prefix index, we only need a portion of the field */
+ if (max_prefix) {
+ len = dtype_get_at_most_n_mbchars(
+ col->m_col.prtype,
+ col->m_col.mbminlen, col->m_col.mbmaxlen,
+ max_prefix,
+ field->len,
+ static_cast<char*>(dfield_get_data(field)));
+ dfield_set_len(field, len);
+ }
+
+ if (heap) {
+ dfield_dup(field, heap);
+ }
+
+ DBUG_RETURN(field);
+}
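The SQL NULL handling above follows the MariaDB record convention: every nullable column owns one bit in a null bitmap inside the row buffer, set for NULL and cleared when a value is present. A standalone sketch of that convention (illustrative, not InnoDB code):

	#include <cstdint>
	#include <cstddef>

	/* Mark a column NULL in a MariaDB-style record buffer, given the
	column's null byte offset and bit mask from the row template. */
	static void set_col_null(uint8_t *rec, size_t null_byte, uint8_t mask)
	{
		rec[null_byte] |= mask;		/* SQL NULL */
	}

	/* Clear the bit when the column holds a real value. */
	static void set_col_not_null(uint8_t *rec, size_t null_byte,
				     uint8_t mask)
	{
		rec[null_byte] &= uint8_t(~mask);
	}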
+
+
+/** Attempt to push down an index condition.
+@param[in] keyno MySQL key number
+@param[in] idx_cond Index condition to be checked
+@return Part of idx_cond which the handler will not evaluate */
+
+class Item*
+ha_innobase::idx_cond_push(
+ uint keyno,
+ class Item* idx_cond)
+{
+ DBUG_ENTER("ha_innobase::idx_cond_push");
+ DBUG_ASSERT(keyno != MAX_KEY);
+ DBUG_ASSERT(idx_cond != NULL);
+
+ /* We can only evaluate the condition if all columns are stored.*/
+ dict_index_t* idx = innobase_get_index(keyno);
+ if (idx && dict_index_has_virtual(idx)) {
+ DBUG_RETURN(idx_cond);
+ }
+
+ pushed_idx_cond = idx_cond;
+ pushed_idx_cond_keyno = keyno;
+ in_range_check_pushed_down = TRUE;
+ /* We will evaluate the condition entirely */
+ DBUG_RETURN(NULL);
+}
+
+
+/** Push a primary key filter.
+@param[in] pk_filter filter against which primary keys
+ are to be checked
+@retval false if pushed (always) */
+bool ha_innobase::rowid_filter_push(Rowid_filter* pk_filter)
+{
+ DBUG_ENTER("ha_innobase::rowid_filter_push");
+ DBUG_ASSERT(pk_filter != NULL);
+ pushed_rowid_filter= pk_filter;
+ DBUG_RETURN(false);
+}
+
+static bool is_part_of_a_key_prefix(const Field_longstr *field)
+{
+ const TABLE_SHARE *s= field->table->s;
+
+ for (uint i= 0; i < s->keys; i++)
+ {
+ const KEY &key= s->key_info[i];
+ for (uint j= 0; j < key.user_defined_key_parts; j++)
+ {
+ const KEY_PART_INFO &info= key.key_part[j];
+      // When a field is fully part of a key, the key part and the field
+      // have the same length. The lengths differ only when a prefix of the
+      // field is indexed (e.g. KEY (c(10)) on a VARCHAR(20) column), and
+      // that is what we detect here.
+ if (info.field->field_index == field->field_index &&
+ info.length != field->field_length)
+ {
+ DBUG_ASSERT(info.length < field->field_length);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+static bool
+is_part_of_a_primary_key(const Field* field)
+{
+ const TABLE_SHARE* s = field->table->s;
+
+ return s->primary_key != MAX_KEY
+ && field->part_of_key.is_set(s->primary_key);
+}
+
+bool ha_innobase::can_convert_string(const Field_string *field,
+ const Column_definition &new_type) const
+{
+ DBUG_ASSERT(!field->compression_method());
+ if (new_type.type_handler() != field->type_handler())
+ return false;
+
+ if (new_type.char_length != field->char_length())
+ return false;
+
+ const Charset field_cs(field->charset());
+
+ if (new_type.length != field->max_display_length() &&
+ (!m_prebuilt->table->not_redundant() ||
+ field_cs.mbminlen() == field_cs.mbmaxlen()))
+ return false;
+
+ if (new_type.charset != field->charset())
+ {
+ if (!field_cs.encoding_allows_reinterpret_as(new_type.charset))
+ return false;
+
+ if (!field_cs.eq_collation_specific_names(new_type.charset))
+ return !is_part_of_a_primary_key(field);
+
+    // A fully indexed column could be handled instantly, like
+    // Compare_keys::EqualButKeyPartLength, but the prefix case is not
+    // implemented, so refuse the conversion.
+ if (is_part_of_a_key_prefix(field))
+ return false;
+
+ return true;
+ }
+
+ return true;
+}
+
+static bool
+supports_enlarging(const dict_table_t* table, const Field_varstring* field,
+ const Column_definition& new_type)
+{
+ return field->field_length <= 127 || new_type.length <= 255
+ || field->field_length > 255 || !table->not_redundant();
+}
+
+bool ha_innobase::can_convert_varstring(
+ const Field_varstring *field, const Column_definition &new_type) const
+{
+ if (new_type.length < field->field_length)
+ return false;
+
+ if (new_type.char_length < field->char_length())
+ return false;
+
+ if (!new_type.compression_method() != !field->compression_method())
+ return false;
+
+ if (new_type.type_handler() != field->type_handler())
+ return false;
+
+ if (new_type.charset != field->charset())
+ {
+ if (!supports_enlarging(m_prebuilt->table, field, new_type))
+ return false;
+
+ Charset field_cs(field->charset());
+ if (!field_cs.encoding_allows_reinterpret_as(new_type.charset))
+ return false;
+
+ if (!field_cs.eq_collation_specific_names(new_type.charset))
+ return !is_part_of_a_primary_key(field);
+
+    // A fully indexed column could be handled instantly, like
+    // Compare_keys::EqualButKeyPartLength, but the prefix case is not
+    // implemented, so refuse the conversion.
+ if (is_part_of_a_key_prefix(field))
+ return false;
+
+ return true;
+ }
+
+ if (new_type.length != field->field_length)
+ {
+ if (!supports_enlarging(m_prebuilt->table, field, new_type))
+ return false;
+
+ return true;
+ }
+
+ return true;
+}
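The thresholds in supports_enlarging() appear to correspond to the COMPACT record format's variable-length encoding, where a stored value longer than 127 bytes needs two length bytes once the column's maximum exceeds 255 bytes; treat this reading as an assumption, the function above is authoritative. A standalone sketch of that encoding rule:

	/* Illustrative only: number of length bytes used for one stored
	value in a COMPACT-style record. */
	static unsigned compact_len_bytes(unsigned actual_len,
					  unsigned max_byte_len)
	{
		return (actual_len > 127 && max_byte_len > 255) ? 2 : 1;
	}

	/* With an old maximum of 200 bytes, a stored value of length 150
	takes one length byte; raising the maximum above 255 would make the
	same value need two, so existing records could no longer be parsed
	in place, which is why such enlarging is refused. */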
+
+static bool is_part_of_a_key(const Field_blob *field)
+{
+ const TABLE_SHARE *s= field->table->s;
+
+ for (uint i= 0; i < s->keys; i++)
+ {
+ const KEY &key= s->key_info[i];
+ for (uint j= 0; j < key.user_defined_key_parts; j++)
+ {
+ const KEY_PART_INFO &info= key.key_part[j];
+ if (info.field->field_index == field->field_index)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool ha_innobase::can_convert_blob(const Field_blob *field,
+ const Column_definition &new_type) const
+{
+ if (new_type.type_handler() != field->type_handler())
+ return false;
+
+ if (!new_type.compression_method() != !field->compression_method())
+ return false;
+
+ if (new_type.pack_length != field->pack_length())
+ return false;
+
+ if (new_type.charset != field->charset())
+ {
+ Charset field_cs(field->charset());
+ if (!field_cs.encoding_allows_reinterpret_as(new_type.charset))
+ return false;
+
+ if (!field_cs.eq_collation_specific_names(new_type.charset))
+ return !is_part_of_a_key(field);
+
+    // A fully indexed column could be handled instantly, like
+    // Compare_keys::EqualButKeyPartLength, but the prefix case is not
+    // implemented, so refuse the conversion.
+ if (is_part_of_a_key_prefix(field))
+ return false;
+
+ return true;
+ }
+
+ return true;
+}
+
+Compare_keys ha_innobase::compare_key_parts(
+ const Field &old_field, const Column_definition &new_field,
+ const KEY_PART_INFO &old_part, const KEY_PART_INFO &new_part) const
+{
+ const bool is_equal= old_field.is_equal(new_field);
+ const CHARSET_INFO *old_cs= old_field.charset();
+ const CHARSET_INFO *new_cs= new_field.charset;
+
+ if (!is_equal)
+ {
+ if (!old_field.can_be_converted_by_engine(new_field))
+ return Compare_keys::NotEqual;
+
+ if (!Charset(old_cs).eq_collation_specific_names(new_cs))
+ return Compare_keys::NotEqual;
+ }
+
+ if (old_part.length / old_cs->mbmaxlen != new_part.length / new_cs->mbmaxlen)
+ {
+ if (old_part.length != old_field.field_length)
+ return Compare_keys::NotEqual;
+
+ if (old_part.length >= new_part.length)
+ return Compare_keys::NotEqual;
+
+ return Compare_keys::EqualButKeyPartLength;
+ }
+
+ return Compare_keys::Equal;
+}
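A worked example of the length branch above, assuming a utf8mb4 column (mbmaxlen = 4) and hypothetical sizes:

	const unsigned mbmaxlen      = 4;	/* utf8mb4 */
	const unsigned old_part_len  = 400;	/* VARCHAR(100), fully indexed */
	const unsigned old_field_len = 400;
	const unsigned new_part_len  = 800;	/* VARCHAR(200), fully indexed */

	/* 400 / 4 = 100 differs from 800 / 4 = 200, the old key part covered
	the whole field (400 == old_field_len), and 400 < 800, so
	compare_key_parts() returns Compare_keys::EqualButKeyPartLength:
	the index survives with a longer key part instead of a rebuild. */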
+
+/******************************************************************//**
+Use this when the args are passed to the format string from
+errmsg-utf8.txt directly as is.
+
+Push a warning message to the client; this is a wrapper around:
+
+void push_warning_printf(
+ THD *thd, Sql_condition::enum_condition_level level,
+ uint code, const char *format, ...);
+*/
+void
+ib_senderrf(
+/*========*/
+ THD* thd, /*!< in/out: session */
+ ib_log_level_t level, /*!< in: warning level */
+ ib_uint32_t code, /*!< MySQL error code */
+ ...) /*!< Args */
+{
+ va_list args;
+ const char* format = my_get_err_msg(code);
+
+ /* If the caller wants to push a message to the client then
+ the caller must pass a valid session handle. */
+
+ ut_a(thd != 0);
+
+ /* The error code must exist in the errmsg-utf8.txt file. */
+ ut_a(format != 0);
+
+ va_start(args, code);
+
+ myf l;
+
+ switch (level) {
+ case IB_LOG_LEVEL_INFO:
+ l = ME_NOTE;
+ break;
+ case IB_LOG_LEVEL_WARN:
+ l = ME_WARNING;
+ break;
+ default:
+ l = 0;
+ break;
+ }
+
+ my_printv_error(code, format, MYF(l), args);
+
+ va_end(args);
+
+ if (level == IB_LOG_LEVEL_FATAL) {
+ ut_error;
+ }
+}
+
+/******************************************************************//**
+Use this when the args are first converted to a formatted string and then
+passed to the format string from errmsg-utf8.txt. The error message format
+must be: "Some string ... %s".
+
+Push a warning message to the client; this is a wrapper around:
+
+void push_warning_printf(
+ THD *thd, Sql_condition::enum_condition_level level,
+ uint code, const char *format, ...);
+*/
+void
+ib_errf(
+/*====*/
+ THD* thd, /*!< in/out: session */
+ ib_log_level_t level, /*!< in: warning level */
+ ib_uint32_t code, /*!< MySQL error code */
+ const char* format, /*!< printf format */
+ ...) /*!< Args */
+{
+ char* str = NULL;
+ va_list args;
+
+ /* If the caller wants to push a message to the client then
+ the caller must pass a valid session handle. */
+
+ ut_a(thd != 0);
+ ut_a(format != 0);
+
+ va_start(args, format);
+
+#ifdef _WIN32
+ int size = _vscprintf(format, args) + 1;
+ if (size > 0) {
+ str = static_cast<char*>(malloc(size));
+ }
+ if (str == NULL) {
+ va_end(args);
+ return; /* Watch for Out-Of-Memory */
+ }
+ str[size - 1] = 0x0;
+ vsnprintf(str, size, format, args);
+#elif HAVE_VASPRINTF
+ if (vasprintf(&str, format, args) == -1) {
+ /* In case of failure use a fixed length string */
+ str = static_cast<char*>(malloc(BUFSIZ));
+ vsnprintf(str, BUFSIZ, format, args);
+ }
+#else
+ /* Use a fixed length string. */
+ str = static_cast<char*>(malloc(BUFSIZ));
+ if (str == NULL) {
+ va_end(args);
+ return; /* Watch for Out-Of-Memory */
+ }
+ vsnprintf(str, BUFSIZ, format, args);
+#endif /* _WIN32 */
+
+ ib_senderrf(thd, level, code, str);
+
+ va_end(args);
+ free(str);
+}
+
+/* Keep the first 16 characters as-is, since the URL is sometimes used
+as an offset from this. */
+const char* TROUBLESHOOTING_MSG =
+ "Please refer to https://mariadb.com/kb/en/innodb-troubleshooting/"
+ " for how to resolve the issue.";
+
+const char* TROUBLESHOOT_DATADICT_MSG =
+ "Please refer to https://mariadb.com/kb/en/innodb-data-dictionary-troubleshooting/"
+ " for how to resolve the issue.";
+
+const char* BUG_REPORT_MSG =
+ "Submit a detailed bug report to https://jira.mariadb.org/";
+
+const char* FORCE_RECOVERY_MSG =
+ "Please refer to "
+ "https://mariadb.com/kb/en/library/innodb-recovery-modes/"
+ " for information about forcing recovery.";
+
+const char* OPERATING_SYSTEM_ERROR_MSG =
+ "Some operating system error numbers are described at"
+ " https://mariadb.com/kb/en/library/operating-system-error-codes/";
+
+const char* FOREIGN_KEY_CONSTRAINTS_MSG =
+ "Please refer to https://mariadb.com/kb/en/library/foreign-keys/"
+ " for correct foreign key definition.";
+
+const char* SET_TRANSACTION_MSG =
+ "Please refer to https://mariadb.com/kb/en/library/set-transaction/";
+
+const char* INNODB_PARAMETERS_MSG =
+ "Please refer to https://mariadb.com/kb/en/library/innodb-system-variables/";
+
+/**********************************************************************
+Converts an identifier from UTF-8 (the system charset) to my_charset_filename.
+@return result string length, as returned by strconvert() */
+uint
+innobase_convert_to_filename_charset(
+/*=================================*/
+ char* to, /* out: converted identifier */
+ const char* from, /* in: identifier to convert */
+ ulint len) /* in: length of 'to', in bytes */
+{
+ uint errors;
+ CHARSET_INFO* cs_to = &my_charset_filename;
+ CHARSET_INFO* cs_from = system_charset_info;
+
+ return(static_cast<uint>(strconvert(
+ cs_from, from, uint(strlen(from)),
+ cs_to, to, static_cast<uint>(len), &errors)));
+}
+
+/**********************************************************************
+Converts an identifier from my_charset_filename to UTF-8 charset.
+@return result string length, as returned by strconvert() */
+uint
+innobase_convert_to_system_charset(
+/*===============================*/
+ char* to, /* out: converted identifier */
+ const char* from, /* in: identifier to convert */
+ ulint len, /* in: length of 'to', in bytes */
+ uint* errors) /* out: error return */
+{
+ CHARSET_INFO* cs1 = &my_charset_filename;
+ CHARSET_INFO* cs2 = system_charset_info;
+
+ return(static_cast<uint>(strconvert(
+ cs1, from, static_cast<uint>(strlen(from)),
+ cs2, to, static_cast<uint>(len), errors)));
+}
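A hypothetical usage sketch of the two converters; the "@0023" escape shown is assumed to be how my_charset_filename encodes '#':

	char	name[FN_REFLEN];
	uint	errors;

	/* System charset -> filename-safe charset, e.g. to build a path. */
	innobase_convert_to_filename_charset(name, "t#1", sizeof name);

	/* Filename-safe charset -> system charset, e.g. for messages:
	"t@0023" is expected to decode back to "t#". */
	innobase_convert_to_system_charset(name, "t@0023", sizeof name,
					   &errors);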
+
+/** Validate the requested buffer pool size. Also, reserve the necessary
+memory needed for buffer pool resize.
+@param[in] thd thread handle
+@param[out] save immediate result for update function
+@param[in] value incoming string
+@return 0 on success, 1 on failure.
+*/
+static
+int
+innodb_buffer_pool_size_validate(
+ THD* thd,
+ st_mysql_sys_var*,
+ void* save,
+ struct st_mysql_value* value)
+{
+ longlong intbuf;
+ value->val_int(value, &intbuf);
+
+ if (!srv_was_started) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Cannot update innodb_buffer_pool_size,"
+ " because InnoDB is not started.");
+ return(1);
+ }
+
+#ifdef UNIV_DEBUG
+ if (buf_disable_resize_buffer_pool_debug == TRUE) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Cannot update innodb_buffer_pool_size,"
+ " because innodb_disable_resize_buffer_pool_debug"
+ " is set.");
+ ib::warn() << "Cannot update innodb_buffer_pool_size,"
+ " because innodb_disable_resize_buffer_pool_debug"
+ " is set.";
+ return(1);
+ }
+#endif /* UNIV_DEBUG */
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ if (srv_buf_pool_old_size != srv_buf_pool_size) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ my_printf_error(ER_WRONG_ARGUMENTS,
+ "Another buffer pool resize is already in progress.", MYF(0));
+ return(1);
+ }
+
+ ulint requested_buf_pool_size = buf_pool_size_align(ulint(intbuf));
+
+ *static_cast<ulonglong*>(save) = requested_buf_pool_size;
+
+ if (srv_buf_pool_size == ulint(intbuf)) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ /* nothing to do */
+ return(0);
+ }
+
+ if (srv_buf_pool_size == requested_buf_pool_size) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "innodb_buffer_pool_size must be at least"
+ " innodb_buffer_pool_chunk_size=%lu",
+ srv_buf_pool_chunk_unit);
+ /* nothing to do */
+ return(0);
+ }
+
+ srv_buf_pool_size = requested_buf_pool_size;
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (intbuf != static_cast<longlong>(requested_buf_pool_size)) {
+ char buf[64];
+ int len = 64;
+ value->val_str(value, buf, &len);
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_TRUNCATED_WRONG_VALUE,
+ "Truncated incorrect %-.32s value: '%-.128s'",
+ mysql_sysvar_buffer_pool_size.name,
+ value->val_str(value, buf, &len));
+ }
+
+ return(0);
+}
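buf_pool_size_align() is defined elsewhere; as a hedged sketch, aligning means rounding the request to a whole number of buffer pool chunks (the rounding direction and any minimum enforcement are assumptions here):

	/* Illustrative only: round a requested size up to a multiple of
	the chunk unit, the granularity of buffer pool resizing. */
	static unsigned long long
	align_to_chunks(unsigned long long requested, unsigned long long chunk)
	{
		return ((requested + chunk - 1) / chunk) * chunk;
	}

	/* With 128 MiB chunks, a request of 1000 MiB becomes 1024 MiB:
	align_to_chunks(1000ULL << 20, 128ULL << 20) == 1024ULL << 20. */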
+
+/*************************************************************//**
+Check for a valid value of innodb_compression_algorithm.
+@return 0 for valid innodb_compression_algorithm. */
+static
+int
+innodb_compression_algorithm_validate(
+/*==================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ ulong compression_algorithm;
+	DBUG_ENTER("innodb_compression_algorithm_validate");
+
+ if (check_sysvar_enum(thd, var, save, value)) {
+ DBUG_RETURN(1);
+ }
+
+ compression_algorithm = *reinterpret_cast<ulong*>(save);
+ (void)compression_algorithm;
+
+#ifndef HAVE_LZ4
+ if (compression_algorithm == PAGE_LZ4_ALGORITHM) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: liblz4 is not installed. \n",
+ compression_algorithm);
+ DBUG_RETURN(1);
+ }
+#endif
+
+#ifndef HAVE_LZO
+ if (compression_algorithm == PAGE_LZO_ALGORITHM) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: liblzo is not installed. \n",
+ compression_algorithm);
+ DBUG_RETURN(1);
+ }
+#endif
+
+#ifndef HAVE_LZMA
+ if (compression_algorithm == PAGE_LZMA_ALGORITHM) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: liblzma is not installed. \n",
+ compression_algorithm);
+ DBUG_RETURN(1);
+ }
+#endif
+
+#ifndef HAVE_BZIP2
+ if (compression_algorithm == PAGE_BZIP2_ALGORITHM) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: libbz2 is not installed. \n",
+ compression_algorithm);
+ DBUG_RETURN(1);
+ }
+#endif
+
+#ifndef HAVE_SNAPPY
+ if (compression_algorithm == PAGE_SNAPPY_ALGORITHM) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: libsnappy is not installed. \n",
+ compression_algorithm);
+ DBUG_RETURN(1);
+ }
+#endif
+ DBUG_RETURN(0);
+}
+
+static
+int
+innodb_encrypt_tables_validate(
+/*=================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ if (check_sysvar_enum(thd, var, save, value)) {
+ return 1;
+ }
+
+ ulong encrypt_tables = *(ulong*)save;
+
+ if (encrypt_tables
+ && !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: cannot enable encryption, "
+ "encryption plugin is not available");
+ return 1;
+ }
+
+ return 0;
+}
+
+static void innodb_remember_check_sysvar_funcs()
+{
+	/* remember the built-in sysvar check functions */
+ ut_ad((MYSQL_SYSVAR_NAME(checksum_algorithm).flags & 0x1FF) == PLUGIN_VAR_ENUM);
+ check_sysvar_enum = MYSQL_SYSVAR_NAME(checksum_algorithm).check;
+
+ ut_ad((MYSQL_SYSVAR_NAME(flush_log_at_timeout).flags & 15) == PLUGIN_VAR_INT);
+ check_sysvar_int = MYSQL_SYSVAR_NAME(flush_log_at_timeout).check;
+}
+
+static const size_t MAX_BUF_SIZE = 4 * 1024;
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to SQL-layer. */
+UNIV_INTERN
+void
+ib_push_warning(
+ trx_t* trx, /*!< in: trx */
+ dberr_t error, /*!< in: error code to push as warning */
+ const char *format,/*!< in: warning message */
+ ...)
+{
+ if (trx && trx->mysql_thd) {
+ THD *thd = (THD *)trx->mysql_thd;
+ va_list args;
+ char *buf;
+
+		va_start(args, format);
+		buf = (char *)my_malloc(PSI_INSTRUMENT_ME, MAX_BUF_SIZE, MYF(MY_WME));
+		if (!buf) {
+			va_end(args);
+			return;
+		}
+		buf[MAX_BUF_SIZE - 1] = 0;
+		vsnprintf(buf, MAX_BUF_SIZE - 1, format, args);
+
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ uint(convert_error_code_to_mysql(error, 0, thd)), buf);
+ my_free(buf);
+ va_end(args);
+ }
+}
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to SQL-layer. */
+UNIV_INTERN
+void
+ib_push_warning(
+ void* ithd, /*!< in: thd */
+ dberr_t error, /*!< in: error code to push as warning */
+ const char *format,/*!< in: warning message */
+ ...)
+{
+ va_list args;
+ THD *thd = (THD *)ithd;
+ char *buf;
+
+ if (ithd == NULL) {
+ thd = current_thd;
+ }
+
+ if (thd) {
+		va_start(args, format);
+		buf = (char *)my_malloc(PSI_INSTRUMENT_ME, MAX_BUF_SIZE, MYF(MY_WME));
+		if (!buf) {
+			va_end(args);
+			return;
+		}
+		buf[MAX_BUF_SIZE - 1] = 0;
+		vsnprintf(buf, MAX_BUF_SIZE - 1, format, args);
+
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ uint(convert_error_code_to_mysql(error, 0, thd)), buf);
+ my_free(buf);
+ va_end(args);
+ }
+}
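Both overloads format into a fixed 4 KiB buffer and hand the result to push_warning_printf(). A hypothetical call site (the error code exists in InnoDB, but the message and space_name are illustrative):

	const char *space_name = "test/t1";	/* illustrative */

	ib_push_warning(trx, DB_OUT_OF_FILE_SPACE,
			"InnoDB: cannot extend tablespace %s", space_name);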
+
+/** Helper function to push warnings from InnoDB internals to SQL-layer.
+@param[in] trx
+@param[in] error Error code to push as warning
+@param[in] table_name Table name
+@param[in] format Warning message
+@param[in] ... Message arguments */
+UNIV_INTERN
+void
+ib_foreign_warn(trx_t* trx, /*!< in: trx */
+ dberr_t error, /*!< in: error code to push as warning */
+ const char* table_name,
+ const char* format, /*!< in: warning message */
+ ...)
+{
+ va_list args;
+ char* buf;
+ static FILE* ef = dict_foreign_err_file;
+ static const size_t MAX_BUF_SIZE = 4 * 1024;
+ buf = (char*)my_malloc(PSI_INSTRUMENT_ME, MAX_BUF_SIZE, MYF(MY_WME));
+ if (!buf) {
+ return;
+ }
+
+ va_start(args, format);
+	vsnprintf(buf, MAX_BUF_SIZE, format, args);
+ va_end(args);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fprintf(ef, " Error in foreign key constraint of table %s:\n",
+ table_name);
+ fputs(buf, ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ if (trx && trx->mysql_thd) {
+ THD* thd = (THD*)trx->mysql_thd;
+
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ uint(convert_error_code_to_mysql(error, 0, thd)), buf);
+ }
+
+ my_free(buf);
+}
+
+/********************************************************************//**
+Helper function to push frm mismatch error to error log and
+if needed to sql-layer. */
+UNIV_INTERN
+void
+ib_push_frm_error(
+/*==============*/
+ THD* thd, /*!< in: MySQL thd */
+ dict_table_t* ib_table, /*!< in: InnoDB table */
+ TABLE* table, /*!< in: MySQL table */
+ ulint n_keys, /*!< in: InnoDB #keys */
+ bool push_warning) /*!< in: print warning ? */
+{
+ switch (ib_table->dict_frm_mismatch) {
+ case DICT_FRM_NO_PK:
+ sql_print_error("Table %s has a primary key in "
+ "InnoDB data dictionary, but not "
+ "in MariaDB!"
+ " Have you mixed up "
+ ".frm files from different "
+ "installations? See "
+ "https://mariadb.com/kb/en/innodb-troubleshooting/\n",
+ ib_table->name.m_name);
+
+ if (push_warning) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_NO_SUCH_INDEX,
+ "InnoDB: Table %s has a "
+ "primary key in InnoDB data "
+ "dictionary, but not in "
+ "MariaDB!", ib_table->name.m_name);
+ }
+ break;
+ case DICT_NO_PK_FRM_HAS:
+ sql_print_error(
+ "Table %s has no primary key in InnoDB data "
+ "dictionary, but has one in MariaDB! If you "
+ "created the table with a MariaDB version < "
+ "3.23.54 and did not define a primary key, "
+ "but defined a unique key with all non-NULL "
+ "columns, then MariaDB internally treats that "
+ "key as the primary key. You can fix this "
+ "error by dump + DROP + CREATE + reimport "
+ "of the table.", ib_table->name.m_name);
+
+ if (push_warning) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_NO_SUCH_INDEX,
+ "InnoDB: Table %s has no "
+ "primary key in InnoDB data "
+ "dictionary, but has one in "
+ "MariaDB!",
+ ib_table->name.m_name);
+ }
+ break;
+
+ case DICT_FRM_INCONSISTENT_KEYS:
+		sql_print_error("InnoDB: Table %s contains " ULINTPF " "
+				"indexes inside InnoDB, which "
+				"is different from the number of "
+				"indexes %u defined in the MariaDB "
+				".frm file! Have you mixed up "
+				".frm files from different "
+				"installations? See "
+				"https://mariadb.com/kb/en/innodb-troubleshooting/\n",
+				ib_table->name.m_name, n_keys,
+				table->s->keys);
+
+		if (push_warning) {
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_NO_SUCH_INDEX,
+					    "InnoDB: Table %s contains " ULINTPF " "
+					    "indexes inside InnoDB, which "
+					    "is different from the number of "
+					    "indexes %u defined in the MariaDB "
+					    ".frm file!",
+					    ib_table->name.m_name, n_keys,
+					    table->s->keys);
+		}
+ break;
+
+ case DICT_FRM_CONSISTENT:
+ default:
+		sql_print_error("InnoDB: Table %s is consistent "
+				"between the InnoDB data dictionary and "
+				"the MariaDB FRM file.",
+				ib_table->name.m_name);
+ ut_error;
+ break;
+ }
+}
+
+/** Writes 8 bytes to the col-th tuple field
+@param[in,out]	tuple	tuple to write to
+@param[in]	col	field index in the tuple
+@param[in]	data	integer value to write
+@param[in]	buf	field data buffer, used if the field was SQL NULL */
+static void set_tuple_col_8(dtuple_t *tuple, int col, uint64_t data, byte *buf)
+{
+ dfield_t *dfield= dtuple_get_nth_field(tuple, col);
+ ut_ad(dfield->type.len == 8);
+ if (dfield->len == UNIV_SQL_NULL)
+ {
+ dfield_set_data(dfield, buf, 8);
+ }
+ ut_ad(dfield->len == dfield->type.len && dfield->data);
+ mach_write_to_8(dfield->data, data);
+}
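mach_write_to_8() stores the value in InnoDB's on-disk byte order, which is big-endian; a standalone equivalent for illustration:

	#include <cstdint>

	/* Big-endian encoding of a 64-bit value, as mach_write_to_8() does. */
	static void write_be64(unsigned char *b, uint64_t v)
	{
		for (int i = 0; i < 8; i++) {
			b[i] = (unsigned char) (v >> (56 - 8 * i));
		}
	}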
+
+void ins_node_t::vers_update_end(row_prebuilt_t *prebuilt, bool history_row)
+{
+ ut_ad(prebuilt->ins_node == this);
+ trx_t *trx= prebuilt->trx;
+#ifndef DBUG_OFF
+ ut_ad(table->vers_start != table->vers_end);
+ const mysql_row_templ_t *t= prebuilt->get_template_by_col(table->vers_end);
+ ut_ad(t);
+ ut_ad(t->mysql_col_len == 8);
+#endif
+
+ if (history_row)
+ {
+ set_tuple_col_8(row, table->vers_end, trx->id, vers_end_buf);
+ }
+ else /* ROW_INS_VERSIONED */
+ {
+ set_tuple_col_8(row, table->vers_end, TRX_ID_MAX, vers_end_buf);
+#ifndef DBUG_OFF
+ t= prebuilt->get_template_by_col(table->vers_start);
+ ut_ad(t);
+ ut_ad(t->mysql_col_len == 8);
+#endif
+ set_tuple_col_8(row, table->vers_start, trx->id, vers_start_buf);
+ }
+ dict_index_t *clust_index= dict_table_get_first_index(table);
+ THD *thd= trx->mysql_thd;
+ TABLE *mysql_table= prebuilt->m_mysql_table;
+ mem_heap_t *local_heap= NULL;
+ for (ulint col_no= 0; col_no < dict_table_get_n_v_cols(table); col_no++)
+ {
+
+ const dict_v_col_t *v_col= dict_table_get_nth_v_col(table, col_no);
+ for (ulint i= 0; i < unsigned(v_col->num_base); i++)
+ {
+ dict_col_t *base_col= v_col->base_col[i];
+ if (base_col->ind == table->vers_end)
+ {
+ innobase_get_computed_value(row, v_col, clust_index, &local_heap,
+ table->heap, NULL, thd, mysql_table,
+ mysql_table->record[0], NULL, NULL, NULL);
+ }
+ }
+ }
+ if (local_heap)
+ {
+ mem_heap_free(local_heap);
+ }
+}
diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h
new file mode 100644
index 00000000..e1fdbe90
--- /dev/null
+++ b/storage/innobase/handler/ha_innodb.h
@@ -0,0 +1,973 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+#ifdef WITH_WSREP
+#include "wsrep_api.h"
+#include <mysql/service_wsrep.h>
+#endif /* WITH_WSREP */
+
+#include "table.h"
+
+/* The InnoDB handler: the interface between MySQL and InnoDB. */
+
+/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default
+system clustered index when there is no primary key. */
+extern const char innobase_index_reserve_name[];
+
+/** Prebuilt structures in an InnoDB table handle used within MySQL */
+struct row_prebuilt_t;
+
+/** InnoDB transaction */
+struct trx_t;
+
+/** Engine specific table options are defined using this struct */
+struct ha_table_option_struct
+{
+ bool page_compressed; /*!< Table is using page compression
+ if this option is true. */
+ ulonglong page_compression_level; /*!< Table page compression level
+ 0-9. */
+	uint atomic_writes;	/*!< Use atomic writes for this
+				table if this option is ON, or in
+				DEFAULT if innodb_use_atomic_writes
+				is set. Atomic writes are not used
+				if the value is OFF.*/
+ uint encryption; /*!< DEFAULT, ON, OFF */
+ ulonglong encryption_key_id; /*!< encryption key id */
+};
+
+/** The class defining a handle to an Innodb table */
+class ha_innobase final : public handler
+{
+public:
+ ha_innobase(handlerton* hton, TABLE_SHARE* table_arg);
+ ~ha_innobase() override;
+
+ /** Get the row type from the storage engine. If this method returns
+ ROW_TYPE_NOT_USED, the information in HA_CREATE_INFO should be used. */
+ enum row_type get_row_type() const override;
+
+ const char* table_type() const;
+
+ const char* index_type(uint key_number) override;
+
+ Table_flags table_flags() const override;
+
+ ulong index_flags(uint idx, uint part, bool all_parts) const override;
+
+ uint max_supported_keys() const override;
+
+ uint max_supported_key_length() const override;
+
+ uint max_supported_key_part_length() const override;
+
+ const key_map* keys_to_use_for_scanning() override;
+
+ void column_bitmaps_signal() override;
+
+	/** Opens a dictionary table object using the table name. For a
+	partition, we need to try alternative lower/upper-case names to
+	support moving data files across platforms.
+ @param[in] table_name name of the table/partition
+ @param[in] norm_name normalized name of the table/partition
+ @param[in] is_partition if this is a partition of a table
+ @param[in] ignore_err error to ignore for loading dictionary object
+ @return dictionary table object or NULL if not found */
+ static dict_table_t* open_dict_table(
+ const char* table_name,
+ const char* norm_name,
+ bool is_partition,
+ dict_err_ignore_t ignore_err);
+
+ int open(const char *name, int mode, uint test_if_locked) override;
+
+ handler* clone(const char *name, MEM_ROOT *mem_root) override;
+
+ int close(void) override;
+
+ double scan_time() override;
+
+ double read_time(uint index, uint ranges, ha_rows rows) override;
+
+ int delete_all_rows() override;
+
+ int write_row(const uchar * buf) override;
+
+ int update_row(const uchar * old_data, const uchar * new_data) override;
+
+ int delete_row(const uchar * buf) override;
+
+ bool was_semi_consistent_read() override;
+
+ void try_semi_consistent_read(bool yes) override;
+
+ void unlock_row() override;
+
+ int index_init(uint index, bool sorted) override;
+
+ int index_end() override;
+
+ int index_read(
+ uchar* buf,
+ const uchar* key,
+ uint key_len,
+ ha_rkey_function find_flag) override;
+
+ int index_read_last(uchar * buf, const uchar * key,
+ uint key_len) override;
+
+ int index_next(uchar * buf) override;
+
+ int index_next_same(uchar * buf, const uchar * key,
+ uint keylen) override;
+
+ int index_prev(uchar * buf) override;
+
+ int index_first(uchar * buf) override;
+
+ int index_last(uchar * buf) override;
+
+ /* Copy a cached MySQL row. If requested, also avoids
+ overwriting non-read columns. */
+ void copy_cached_row(uchar *to_rec, const uchar *from_rec,
+ uint rec_length);
+ int rnd_init(bool scan) override;
+
+ int rnd_end() override;
+
+ int rnd_next(uchar *buf) override;
+
+ int rnd_pos(uchar * buf, uchar *pos) override;
+
+ int ft_init() override;
+ void ft_end() override { rnd_end(); }
+ FT_INFO *ft_init_ext(uint flags, uint inx, String* key) override;
+ int ft_read(uchar* buf) override;
+
+ void position(const uchar *record) override;
+
+ int info(uint) override;
+
+ int analyze(THD* thd,HA_CHECK_OPT* check_opt) override;
+
+ int optimize(THD* thd,HA_CHECK_OPT* check_opt) override;
+
+ int discard_or_import_tablespace(my_bool discard) override;
+
+ int extra(ha_extra_function operation) override;
+
+ int reset() override;
+
+ int external_lock(THD *thd, int lock_type) override;
+
+ int start_stmt(THD *thd, thr_lock_type lock_type) override;
+
+ ha_rows records_in_range(
+ uint inx,
+ const key_range* min_key,
+ const key_range* max_key,
+ page_range* pages) override;
+
+ ha_rows estimate_rows_upper_bound() override;
+
+ void update_create_info(HA_CREATE_INFO* create_info) override;
+
+ inline int create(
+ const char* name,
+ TABLE* form,
+ HA_CREATE_INFO* create_info,
+ bool file_per_table,
+ trx_t* trx = NULL);
+
+ int create(
+ const char* name,
+ TABLE* form,
+ HA_CREATE_INFO* create_info) override;
+
+ inline int delete_table(const char* name, enum_sql_command sqlcom);
+
+ int truncate() override;
+
+ int delete_table(const char *name) override;
+
+ int rename_table(const char* from, const char* to) override;
+ inline int defragment_table(const char* name);
+ int check(THD* thd, HA_CHECK_OPT* check_opt) override;
+
+ inline void reload_statistics();
+
+ char* get_foreign_key_create_info() override;
+
+ int get_foreign_key_list(THD *thd,
+ List<FOREIGN_KEY_INFO> *f_key_list) override;
+
+ int get_parent_foreign_key_list(
+ THD* thd,
+ List<FOREIGN_KEY_INFO>* f_key_list) override;
+
+ bool can_switch_engines() override;
+
+ uint referenced_by_foreign_key() override;
+
+ void free_foreign_key_create_info(char* str) override;
+
+ uint lock_count(void) const override;
+
+ THR_LOCK_DATA** store_lock(
+ THD* thd,
+ THR_LOCK_DATA** to,
+ thr_lock_type lock_type) override;
+
+ void init_table_handle_for_HANDLER() override;
+
+ void get_auto_increment(
+ ulonglong offset,
+ ulonglong increment,
+ ulonglong nb_desired_values,
+ ulonglong* first_value,
+ ulonglong* nb_reserved_values) override;
+ int reset_auto_increment(ulonglong value) override;
+
+ bool get_error_message(int error, String *buf) override;
+
+ bool get_foreign_dup_key(char*, uint, char*, uint) override;
+
+ uint8 table_cache_type() override;
+
+ /**
+ Ask handler about permission to cache table during query registration
+ */
+ my_bool register_query_cache_table(
+ THD* thd,
+ const char* table_key,
+ uint key_length,
+ qc_engine_callback* call_back,
+ ulonglong* engine_data) override;
+
+ int cmp_ref(const uchar* ref1, const uchar* ref2) override;
+
+ /** On-line ALTER TABLE interface @see handler0alter.cc @{ */
+
+ /** Check if InnoDB supports a particular alter table in-place
+ @param altered_table TABLE object for new version of table.
+ @param ha_alter_info Structure describing changes to be done
+ by ALTER TABLE and holding data used during in-place alter.
+
+ @retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported
+ @retval HA_ALTER_INPLACE_INSTANT
+ MDL_EXCLUSIVE is needed for executing prepare_inplace_alter_table()
+ and commit_inplace_alter_table(). inplace_alter_table()
+ will not be called.
+ @retval HA_ALTER_INPLACE_COPY_NO_LOCK
+ MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded
+ to LOCK=NONE for rebuilding the table in inplace_alter_table()
+ @retval HA_ALTER_INPLACE_COPY_LOCK
+ MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded
+ to LOCK=SHARED for rebuilding the table in inplace_alter_table()
+ @retval HA_ALTER_INPLACE_NOCOPY_NO_LOCK
+ MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded
+ to LOCK=NONE for inplace_alter_table() which will not rebuild the table
+ @retval HA_ALTER_INPLACE_NOCOPY_LOCK
+ MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded
+ to LOCK=SHARED for inplace_alter_table() which will not rebuild
+ the table. */
+
+ enum_alter_inplace_result check_if_supported_inplace_alter(
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info) override;
+
+ /** Allows InnoDB to update internal structures with concurrent
+ writes blocked (provided that check_if_supported_inplace_alter()
+ did not return HA_ALTER_INPLACE_NO_LOCK).
+ This will be invoked before inplace_alter_table().
+
+ @param altered_table TABLE object for new version of table.
+ @param ha_alter_info Structure describing changes to be done
+ by ALTER TABLE and holding data used during in-place alter.
+
+ @retval true Failure
+ @retval false Success
+ */
+ bool prepare_inplace_alter_table(
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info) override;
+
+ /** Alter the table structure in-place with operations
+ specified using HA_ALTER_FLAGS and Alter_inplace_information.
+ The level of concurrency allowed during this operation depends
+ on the return value from check_if_supported_inplace_alter().
+
+ @param altered_table TABLE object for new version of table.
+ @param ha_alter_info Structure describing changes to be done
+ by ALTER TABLE and holding data used during in-place alter.
+
+ @retval true Failure
+ @retval false Success
+ */
+ bool inplace_alter_table(
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info) override;
+
+ /** Commit or rollback the changes made during
+ prepare_inplace_alter_table() and inplace_alter_table() inside
+ the storage engine. Note that the allowed level of concurrency
+ during this operation will be the same as for
+ inplace_alter_table() and thus might be higher than during
+	prepare_inplace_alter_table(). (E.g. concurrent writes were
+ blocked during prepare, but might not be during commit).
+ @param altered_table TABLE object for new version of table.
+ @param ha_alter_info Structure describing changes to be done
+ by ALTER TABLE and holding data used during in-place alter.
+ @param commit true => Commit, false => Rollback.
+ @retval true Failure
+ @retval false Success
+ */
+ bool commit_inplace_alter_table(
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info,
+ bool commit) override;
+ /** @} */
+
+ bool check_if_incompatible_data(
+ HA_CREATE_INFO* info,
+ uint table_changes) override;
+
+ /** @name Multi Range Read interface @{ */
+
+ /** Initialize multi range read @see DsMrr_impl::dsmrr_init
+ @param seq
+ @param seq_init_param
+ @param n_ranges
+ @param mode
+ @param buf */
+ int multi_range_read_init(
+ RANGE_SEQ_IF* seq,
+ void* seq_init_param,
+ uint n_ranges,
+ uint mode,
+ HANDLER_BUFFER* buf) override;
+
+ /** Process next multi range read @see DsMrr_impl::dsmrr_next
+ @param range_info */
+ int multi_range_read_next(range_id_t *range_info) override;
+
+ /** Initialize multi range read and get information.
+ @see ha_myisam::multi_range_read_info_const
+ @see DsMrr_impl::dsmrr_info_const
+ @param keyno
+ @param seq
+ @param seq_init_param
+ @param n_ranges
+ @param bufsz
+ @param flags
+ @param cost */
+ ha_rows multi_range_read_info_const(
+ uint keyno,
+ RANGE_SEQ_IF* seq,
+ void* seq_init_param,
+ uint n_ranges,
+ uint* bufsz,
+ uint* flags,
+ Cost_estimate* cost) override;
+
+ /** Initialize multi range read and get information.
+ @see DsMrr_impl::dsmrr_info
+ @param keyno
+ @param seq
+ @param seq_init_param
+ @param n_ranges
+ @param bufsz
+ @param flags
+ @param cost */
+ ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+ uint key_parts, uint* bufsz, uint* flags,
+ Cost_estimate* cost) override;
+
+ int multi_range_read_explain_info(uint mrr_mode,
+ char *str, size_t size) override;
+
+ /** Attempt to push down an index condition.
+ @param[in] keyno MySQL key number
+ @param[in] idx_cond Index condition to be checked
+ @return idx_cond if pushed; NULL if not pushed */
+ Item* idx_cond_push(uint keyno, Item* idx_cond) override;
+ /* @} */
+
+ /** Check if InnoDB is not storing virtual column metadata for a table.
+ @param s table definition (based on .frm file)
+ @return whether InnoDB will omit virtual column metadata */
+ static bool omits_virtual_cols(const TABLE_SHARE& s)
+ {
+ return s.frm_version<FRM_VER_EXPRESSSIONS && s.virtual_fields;
+ }
+
+ /** Push a primary key filter.
+ @param[in] pk_filter filter against which primary keys
+ are to be checked
+ @retval false if pushed (always) */
+ bool rowid_filter_push(Rowid_filter *rowid_filter) override;
+
+ bool
+ can_convert_string(const Field_string* field,
+ const Column_definition& new_field) const override;
+ bool can_convert_varstring(
+ const Field_varstring* field,
+ const Column_definition& new_field) const override;
+ bool
+ can_convert_blob(const Field_blob* field,
+ const Column_definition& new_field) const override;
+
+ /** @return whether innodb_strict_mode is active */
+ static bool is_innodb_strict_mode(THD* thd);
+
+ /** @return whether innodb_strict_mode is active */
+ bool is_innodb_strict_mode()
+ { return is_innodb_strict_mode(m_user_thd); }
+ Compare_keys
+ compare_key_parts(const Field& old_field,
+ const Column_definition& new_field,
+ const KEY_PART_INFO& old_part,
+ const KEY_PART_INFO& new_part) const override;
+
+protected:
+ dberr_t innobase_get_autoinc(ulonglong* value);
+ dberr_t innobase_lock_autoinc();
+ ulonglong innobase_peek_autoinc();
+ dberr_t innobase_set_max_autoinc(ulonglong auto_inc);
+ dberr_t innobase_reset_autoinc(ulonglong auto_inc);
+
+ /** Resets a query execution 'template'.
+ @see build_template() */
+ void reset_template();
+
+ inline void update_thd(THD* thd);
+ void update_thd();
+
+ int general_fetch(uchar* buf, uint direction, uint match_mode);
+ int change_active_index(uint keynr);
+ dict_index_t* innobase_get_index(uint keynr);
+
+#ifdef WITH_WSREP
+ int wsrep_append_keys(
+ THD *thd,
+ Wsrep_service_key_type key_type,
+ const uchar* record0,
+ const uchar* record1);
+#endif
+ /** Builds a 'template' to the prebuilt struct.
+
+ The template is used in fast retrieval of just those column
+ values MySQL needs in its processing.
+ @param whole_row true if access is needed to a whole row,
+ false if accessing individual fields is enough */
+ void build_template(bool whole_row);
+
+ int info_low(uint, bool);
+
+ /** The multi range read session object */
+ DsMrr_impl m_ds_mrr;
+
+ /** Save CPU time with prebuilt/cached data structures */
+ row_prebuilt_t* m_prebuilt;
+
+ /** Thread handle of the user currently using the handler;
+ this is set in external_lock function */
+ THD* m_user_thd;
+
+ /** buffer used in updates */
+ uchar* m_upd_buf;
+
+ /** the size of upd_buf in bytes */
+ ulint m_upd_buf_size;
+
+	/** Flags that specify the handler instance (table) capability. */
+ Table_flags m_int_table_flags;
+
+	/** Index into the server's primary key meta-data table->key_info{} */
+ uint m_primary_key;
+
+	/** set to true when we are starting a table scan but have
+	not yet fetched any row, otherwise false */
+ bool m_start_of_scan;
+
+ /*!< match mode of the latest search: ROW_SEL_EXACT,
+ ROW_SEL_EXACT_PREFIX, or undefined */
+ uint m_last_match_mode;
+
+ /** If mysql has locked with external_lock() */
+ bool m_mysql_has_locked;
+};
+
+
+/* Some accessor functions which the InnoDB plugin needs, but which
+cannot be added to mysql/plugin.h as part of the public interface;
+the definitions are bracketed with #ifdef INNODB_COMPATIBILITY_HOOKS */
+
+#ifndef INNODB_COMPATIBILITY_HOOKS
+#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS
+#endif
+
+extern "C" {
+
+/** Check if a user thread is a replication slave thread
+@param thd user thread
+@retval 0 the user thread is not a replication slave thread
+@retval 1 the user thread is a replication slave thread */
+int thd_slave_thread(const MYSQL_THD thd);
+
+/** Check if a user thread is running a non-transactional update
+@param thd user thread
+@retval 0 the user thread is not running a non-transactional update
+@retval 1 the user thread is running a non-transactional update */
+int thd_non_transactional_update(const MYSQL_THD thd);
+
+/** Get high resolution timestamp for the current query start time.
+The timestamp is not anchored to any specific point in time,
+but can be used for comparison.
+@param thd user thread
+@return timestamp with microsecond precision
+*/
+unsigned long long thd_start_utime(const MYSQL_THD thd);
+
+/** Get the user thread's binary logging format
+@param thd user thread
+@return Value to be used as index into the binlog_format_names array */
+int thd_binlog_format(const MYSQL_THD thd);
+
+/** Check if binary logging is filtered for thread's current db.
+@param thd Thread handle
+@retval 1 the query is not filtered, 0 otherwise. */
+bool thd_binlog_filter_ok(const MYSQL_THD thd);
+
+/** Check if the query may generate row changes which may end up in the binary log.
+@param thd Thread handle
+@retval 1 the query may generate row changes, 0 otherwise.
+*/
+bool thd_sqlcom_can_generate_row_events(const MYSQL_THD thd);
+
+/** Is strict sql_mode set.
+@param thd Thread object
+@return True if sql_mode has strict mode (all or trans), false otherwise. */
+bool thd_is_strict_mode(const MYSQL_THD thd);
+
+} /* extern "C" */
+
+/** Get the file name and position of the MySQL binlog corresponding to the
+current commit. */
+extern void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file);
+
+struct trx_t;
+#ifdef WITH_WSREP
+#include <mysql/service_wsrep.h>
+#endif /* WITH_WSREP */
+
+extern const struct _ft_vft ft_vft_result;
+
+/** Structure returned by ha_innobase::ft_init_ext() */
+typedef struct new_ft_info
+{
+ struct _ft_vft *please;
+ struct _ft_vft_ext *could_you;
+ row_prebuilt_t* ft_prebuilt;
+ fts_result_t* ft_result;
+} NEW_FT_INFO;
+
+/**
+Allocates an InnoDB transaction for a MySQL handler object.
+@return InnoDB transaction handle */
+trx_t*
+innobase_trx_allocate(
+ MYSQL_THD thd); /*!< in: user thread handle */
+
+/*********************************************************************//**
+This function checks each index name for a table against reserved
+system default primary index name 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes a warning message to the client,
+and returns true.
+@return true if the index name matches the reserved name */
+bool
+innobase_index_name_is_reserved(
+ THD* thd, /*!< in/out: MySQL connection */
+ const KEY* key_info, /*!< in: Indexes to be created */
+ ulint num_of_keys) /*!< in: Number of indexes to
+ be created. */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Parse hint for table and its indexes, and update the information
+in dictionary.
+@param[in] thd Connection thread
+@param[in,out] table Target table
+@param[in] table_share Table definition */
+void
+innobase_parse_hint_from_comment(
+ THD* thd,
+ dict_table_t* table,
+ const TABLE_SHARE* table_share);
+
+/** Class for handling create table information. */
+class create_table_info_t
+{
+public:
+ /** Constructor.
+ Used in two ways:
+ - all but file_per_table is used, when creating the table.
+ - all but name/path is used, when validating options and using flags. */
+ create_table_info_t(
+ THD* thd,
+ const TABLE* form,
+ HA_CREATE_INFO* create_info,
+ char* table_name,
+ char* remote_path,
+ bool file_per_table,
+ trx_t* trx = NULL);
+
+ /** Initialize the object. */
+ int initialize();
+
+ /** Set m_tablespace_type. */
+ void set_tablespace_type(bool table_being_altered_is_file_per_table);
+
+ /** Create InnoDB foreign keys from MySQL alter_info. */
+ dberr_t create_foreign_keys();
+
+ /** Create the internal innodb table.
+ @param create_fk whether to add FOREIGN KEY constraints */
+ int create_table(bool create_fk = true);
+
+ /** Update the internal data dictionary. */
+ int create_table_update_dict();
+
+ /** Validates the create options. Checks that the options
+ KEY_BLOCK_SIZE, ROW_FORMAT, DATA DIRECTORY, TEMPORARY & TABLESPACE
+ are compatible with each other and other settings.
+ These CREATE OPTIONS are not validated here unless innodb_strict_mode
+ is on. With strict mode, this function will report each problem it
+ finds using a custom message with error code
+ ER_ILLEGAL_HA_CREATE_OPTION, not its built-in message.
+ @return NULL if valid, string name of bad option if not. */
+ const char* create_options_are_invalid();
+
+ bool gcols_in_fulltext_or_spatial();
+
+ /** Validates engine specific table options not handled by
+ SQL-parser.
+ @return NULL if valid, string name of bad option if not. */
+ const char* check_table_options();
+
+ /** Validate DATA DIRECTORY option. */
+ bool create_option_data_directory_is_valid();
+
+ /** Validate TABLESPACE option. */
+ bool create_option_tablespace_is_valid();
+
+ /** Prepare to create a table. */
+ int prepare_create_table(const char* name, bool strict = true);
+
+ void allocate_trx();
+
+	/** Checks that every index has a sane size. Depends on strict mode. */
+ bool row_size_is_acceptable(const dict_table_t& table,
+ bool strict) const;
+	/** Checks that the given index has a sane size. Depends on strict mode. */
+ bool row_size_is_acceptable(const dict_index_t& index,
+ bool strict) const;
+
+ /** Determines InnoDB table flags.
+ If strict_mode=OFF, this will adjust the flags to what should be assumed.
+ @retval true if successful, false if error */
+ bool innobase_table_flags();
+
+ /** Set flags and append '/' to remote path if necessary. */
+ void set_remote_path_flags();
+
+ /** Get table flags. */
+ ulint flags() const
+ { return(m_flags); }
+
+ /** Update table flags. */
+ void flags_set(ulint flags) { m_flags |= flags; }
+
+ /** Get table flags2. */
+ ulint flags2() const
+ { return(m_flags2); }
+
+ /** Get trx. */
+ trx_t* trx() const
+ { return(m_trx); }
+
+ /** Return table name. */
+ const char* table_name() const
+ { return(m_table_name); }
+
+ /** @return whether the table needs to be dropped on rollback */
+ bool drop_before_rollback() const { return m_drop_before_rollback; }
+
+ THD* thd() const
+ { return(m_thd); }
+
+ /** Normalizes a table name string.
+	/** Normalizes a table name string.
+	A normalized name consists of the database name concatenated with '/'
+	and the table name. An example: test/mytable. On Windows,
+	normalization always converts both the database name and the table
+	name to lower case if "set_lower_case" is true.
+ @param[in,out] norm_name Buffer to return the normalized name in.
+ @param[in] name Table name string.
+ @param[in] set_lower_case True if we want to set name to lower
+ case. */
+ static void normalize_table_name_low(
+ char* norm_name,
+ const char* name,
+ ibool set_lower_case);
+
+private:
+	/** Parses the table name into a normalized name and either a temp
+	path or a remote path, if needed. */
+ int
+ parse_table_name(
+ const char* name);
+
+ /** Create the internal innodb table definition. */
+ int create_table_def();
+
+ /** Connection thread handle. */
+ THD* m_thd;
+
+ /** InnoDB transaction handle. */
+ trx_t* m_trx;
+
+ /** Information on table columns and indexes. */
+ const TABLE* m_form;
+
+ /** Value of innodb_default_row_format */
+ const ulong m_default_row_format;
+
+ /** Create options. */
+ HA_CREATE_INFO* m_create_info;
+
+ /** Table name */
+ char* m_table_name;
+ /** Table */
+ dict_table_t* m_table;
+ /** Whether the table needs to be dropped before rollback */
+ bool m_drop_before_rollback;
+
+ /** Remote path (DATA DIRECTORY) or zero length-string */
+ char* m_remote_path;
+
+ /** Local copy of srv_file_per_table. */
+ bool m_innodb_file_per_table;
+
+	/** Allow file_per_table for this table because either:
+	1) the setting innodb_file_per_table=on,
+	2) it was explicitly requested by tablespace=innodb_file_per_table, or
+	3) the table being altered is currently file_per_table. */
+ bool m_allow_file_per_table;
+
+ /** After all considerations, this shows whether we will actually
+ create a table and tablespace using file-per-table. */
+ bool m_use_file_per_table;
+
+ /** Using DATA DIRECTORY */
+ bool m_use_data_dir;
+
+ /** Table flags */
+ ulint m_flags;
+
+ /** Table flags2 */
+ ulint m_flags2;
+};
+
+/**
+Initialize the table FTS stopword list
+@return TRUE if success */
+ibool
+innobase_fts_load_stopword(
+/*=======================*/
+ dict_table_t* table, /*!< in: Table has the FTS */
+ trx_t* trx, /*!< in: transaction */
+ THD* thd) /*!< in: current thread */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Some defines for innobase_fts_check_doc_id_index() return value */
+enum fts_doc_id_index_enum {
+ FTS_INCORRECT_DOC_ID_INDEX,
+ FTS_EXIST_DOC_ID_INDEX,
+ FTS_NOT_EXIST_DOC_ID_INDEX
+};
+
+/**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column.
+@return the status of the FTS_DOC_ID index */
+fts_doc_id_index_enum
+innobase_fts_check_doc_id_index(
+ const dict_table_t* table, /*!< in: table definition */
+ const TABLE* altered_table, /*!< in: MySQL table
+ that is being altered */
+ ulint* fts_doc_col_no) /*!< out: The column number for
+ Doc ID */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column in MySQL create index definition.
+@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index,
+FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */
+fts_doc_id_index_enum
+innobase_fts_check_doc_id_index_in_def(
+ ulint n_key, /*!< in: Number of keys */
+ const KEY* key_info) /*!< in: Key definitions */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/**
+Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object.
+Those flags are stored in .frm file and end up in the MySQL table object,
+but are frequently used inside InnoDB, so we keep copies of them in the
+InnoDB table object. */
+void
+innobase_copy_frm_flags_from_table_share(
+ dict_table_t* innodb_table, /*!< in/out: InnoDB table */
+ const TABLE_SHARE* table_share); /*!< in: table share */
+
+/** Set up base columns for virtual column
+@param[in] table the InnoDB table
+@param[in] field MySQL field
+@param[in,out] v_col virtual column to be set up */
+void
+innodb_base_col_setup(
+ dict_table_t* table,
+ const Field* field,
+ dict_v_col_t* v_col);
+
+/** Set up base columns for stored column
+@param[in] table InnoDB table
+@param[in] field MySQL field
+@param[in,out] s_col stored column */
+void
+innodb_base_col_setup_for_stored(
+ const dict_table_t* table,
+ const Field* field,
+ dict_s_col_t* s_col);
+
+/** whether this is a stored generated column */
+#define innobase_is_s_fld(field) ((field)->vcol_info && (field)->stored_in_db())
+
+/** Always normalize table name to lower case on Windows */
+#ifdef _WIN32
+#define normalize_table_name(norm_name, name) \
+ create_table_info_t::normalize_table_name_low(norm_name, name, TRUE)
+#else
+#define normalize_table_name(norm_name, name) \
+ create_table_info_t::normalize_table_name_low(norm_name, name, FALSE)
+#endif /* _WIN32 */
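A hypothetical usage sketch of the macro; the platform difference follows the normalize_table_name_low() contract described above:

	char	norm_name[FN_REFLEN];

	normalize_table_name(norm_name, "./test/MyTable");
	/* norm_name becomes "test/MyTable" on POSIX systems and
	"test/mytable" on Windows, where set_lower_case is TRUE. */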
+
+/** Converts a search mode flag understood by MySQL to a flag understood
+by InnoDB.
+@param[in] find_flag MySQL search mode flag.
+@return InnoDB search mode flag. */
+page_cur_mode_t
+convert_search_mode_to_innobase(
+ enum ha_rkey_function find_flag);
+
+/** Commits a transaction in an InnoDB database.
+@param[in] trx Transaction handle. */
+void
+innobase_commit_low(
+ trx_t* trx);
+
+extern my_bool innobase_stats_on_metadata;
+
+/** Calculate the records-per-key value.
+NULL values must be excluded if innodb_stats_method is set to "nulls_ignored".
+@param[in]	index	InnoDB index.
+@param[in]	i	the column for which records per key is calculated.
+@param[in] records Estimated total records.
+@return estimated record per key value */
+/* JAN: TODO: MySQL 5.7 */
+typedef float rec_per_key_t;
+rec_per_key_t
+innodb_rec_per_key(
+ dict_index_t* index,
+ ulint i,
+ ha_rows records);
+
+/** Build template for the virtual columns and their base columns
+@param[in] table MySQL TABLE
+@param[in] ib_table InnoDB dict_table_t
+@param[in,out] s_templ InnoDB template structure
+@param[in] add_v new virtual columns added along with
+ add index call
+@param[in] locked true if innobase_share_mutex is held */
+void
+innobase_build_v_templ(
+ const TABLE* table,
+ const dict_table_t* ib_table,
+ dict_vcol_templ_t* s_templ,
+ const dict_add_v_col_t* add_v,
+ bool locked);
+
+/** Callback used by the MySQL server layer to initialize
+the table's virtual column template.
+@param[in] table MySQL TABLE
+@param[in,out] ib_table InnoDB dict_table_t */
+void
+innobase_build_v_templ_callback(
+ const TABLE* table,
+ void* ib_table);
+
+/** Callback function definition, used by the MySQL server layer to
+initialize the table's virtual column template */
+typedef void (*my_gcolumn_templatecallback_t)(const TABLE*, void*);
+
+/** Convert MySQL column number to dict_table_t::cols[] offset.
+@param[in] field non-virtual column
+@return column number relative to dict_table_t::cols[] */
+unsigned
+innodb_col_no(const Field* field)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Helper function to push an .frm mismatch error to the error log and,
+if needed, to the SQL layer. */
+UNIV_INTERN
+void
+ib_push_frm_error(
+/*==============*/
+ THD* thd, /*!< in: MySQL thd */
+ dict_table_t* ib_table, /*!< in: InnoDB table */
+ TABLE* table, /*!< in: MySQL table */
+ ulint n_keys, /*!< in: InnoDB #keys */
+	bool		push_warning);	/*!< in: whether to push a warning */
+
+/** Check whether any index key part length exceeds the maximum limit
+@param[in] max_field_len maximum allowed key part length
+@param[in] key MariaDB key definition
+@return true if index column length exceeds limit */
+MY_ATTRIBUTE((warn_unused_result))
+bool too_big_key_part_length(size_t max_field_len, const KEY& key);
+
+/** This function is used to roll back one X/Open XA distributed
+transaction which is in the prepared state
+
+@param[in] hton InnoDB handlerton
+@param[in] xid X/Open XA transaction identification
+
+@return 0 or error number */
+int innobase_rollback_by_xid(handlerton* hton, XID* xid);
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
new file mode 100644
index 00000000..f10d534f
--- /dev/null
+++ b/storage/innobase/handler/handler0alter.cc
@@ -0,0 +1,11565 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/handler0alter.cc
+Smart ALTER TABLE
+*******************************************************/
+
+/* Include necessary SQL headers */
+#include "univ.i"
+#include <debug_sync.h>
+#include <log.h>
+#include <sql_lex.h>
+#include <sql_class.h>
+#include <sql_table.h>
+#include <mysql/plugin.h>
+
+/* Include necessary InnoDB headers */
+#include "btr0sea.h"
+#include "dict0crea.h"
+#include "dict0dict.h"
+#include "dict0priv.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "log0log.h"
+#include "rem0types.h"
+#include "row0log.h"
+#include "row0merge.h"
+#include "row0ins.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "trx0trx.h"
+#include "handler0alter.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "fts0priv.h"
+#include "fts0plugin.h"
+#include "pars0pars.h"
+#include "row0sel.h"
+#include "ha_innodb.h"
+#include "ut0stage.h"
+#include "span.h"
+#include <thread>
+#include <sstream>
+
+using st_::span;
+/** File format constraint for ALTER TABLE */
+extern ulong innodb_instant_alter_column_allowed;
+
+static const char *MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN=
+ "INPLACE ADD or DROP of virtual columns cannot be "
+ "combined with other ALTER TABLE actions";
+
+/** Operations for creating secondary indexes (no rebuild needed) */
+static const alter_table_operations INNOBASE_ONLINE_CREATE
+ = ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX
+ | ALTER_ADD_UNIQUE_INDEX;
+
+/** Operations that require filling in default values for columns */
+static const alter_table_operations INNOBASE_DEFAULTS
+ = ALTER_COLUMN_NOT_NULLABLE
+ | ALTER_ADD_STORED_BASE_COLUMN;
+
+
+/** Operations that require knowledge about row_start, row_end values */
+static const alter_table_operations INNOBASE_ALTER_VERSIONED_REBUILD
+ = ALTER_ADD_SYSTEM_VERSIONING
+ | ALTER_DROP_SYSTEM_VERSIONING;
+
+/** Operations for rebuilding a table in place */
+static const alter_table_operations INNOBASE_ALTER_REBUILD
+ = ALTER_ADD_PK_INDEX
+ | ALTER_DROP_PK_INDEX
+ | ALTER_OPTIONS
+ /* ALTER_OPTIONS needs to check alter_options_need_rebuild() */
+ | ALTER_COLUMN_NULLABLE
+ | INNOBASE_DEFAULTS
+ | ALTER_STORED_COLUMN_ORDER
+ | ALTER_DROP_STORED_COLUMN
+ | ALTER_RECREATE_TABLE
+ /*
+ | ALTER_STORED_COLUMN_TYPE
+ */
+ | INNOBASE_ALTER_VERSIONED_REBUILD
+ ;
+
+/** Operations that require changes to data */
+static const alter_table_operations INNOBASE_ALTER_DATA
+ = INNOBASE_ONLINE_CREATE | INNOBASE_ALTER_REBUILD;
+
+/** Operations for altering a table that InnoDB does not care about */
+static const alter_table_operations INNOBASE_INPLACE_IGNORE
+ = ALTER_COLUMN_DEFAULT
+ | ALTER_PARTITIONED
+ | ALTER_COLUMN_COLUMN_FORMAT
+ | ALTER_COLUMN_STORAGE_TYPE
+ | ALTER_CONVERT_TO
+ | ALTER_VIRTUAL_GCOL_EXPR
+ | ALTER_DROP_CHECK_CONSTRAINT
+ | ALTER_RENAME
+ | ALTER_COLUMN_INDEX_LENGTH
+ | ALTER_CHANGE_INDEX_COMMENT;
+
+/** Operations on foreign key definitions (changing the schema only) */
+static const alter_table_operations INNOBASE_FOREIGN_OPERATIONS
+ = ALTER_DROP_FOREIGN_KEY
+ | ALTER_ADD_FOREIGN_KEY;
+
+/** Operations that InnoDB cares about and can perform without creating data */
+static const alter_table_operations INNOBASE_ALTER_NOCREATE
+ = ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX
+ | ALTER_DROP_UNIQUE_INDEX;
+
+/** Operations that InnoDB cares about and can perform without validation */
+static const alter_table_operations INNOBASE_ALTER_NOVALIDATE
+ = INNOBASE_ALTER_NOCREATE
+ | ALTER_VIRTUAL_COLUMN_ORDER
+ | ALTER_COLUMN_NAME
+ | INNOBASE_FOREIGN_OPERATIONS
+ | ALTER_COLUMN_UNVERSIONED
+ | ALTER_DROP_VIRTUAL_COLUMN;
+
+/** Operations that InnoDB cares about and can perform without rebuild */
+static const alter_table_operations INNOBASE_ALTER_NOREBUILD
+ = INNOBASE_ONLINE_CREATE
+ | INNOBASE_ALTER_NOCREATE;
+
+/** Operations that can be performed instantly, without inplace_alter_table() */
+static const alter_table_operations INNOBASE_ALTER_INSTANT
+ = ALTER_VIRTUAL_COLUMN_ORDER
+ | ALTER_COLUMN_NAME
+ | ALTER_ADD_VIRTUAL_COLUMN
+ | INNOBASE_FOREIGN_OPERATIONS
+ | ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE
+ | ALTER_COLUMN_UNVERSIONED
+ | ALTER_RENAME_INDEX
+ | ALTER_DROP_VIRTUAL_COLUMN;
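+
+/* Example (illustrative): ALTER TABLE t RENAME INDEX i1 TO i2 maps to
+ALTER_RENAME_INDEX, which is part of INNOBASE_ALTER_INSTANT, so the
+statement completes without ever calling inplace_alter_table(). */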
+
+/** Initialize instant->field_map.
+@param[in] table table definition to copy from */
+inline void dict_table_t::init_instant(const dict_table_t& table)
+{
+ const dict_index_t& oindex __attribute__((unused))= *table.indexes.start;
+ dict_index_t& index = *indexes.start;
+ const unsigned u = index.first_user_field();
+ DBUG_ASSERT(u == oindex.first_user_field());
+ DBUG_ASSERT(index.n_fields >= oindex.n_fields);
+
+ field_map_element_t* field_map_it = static_cast<field_map_element_t*>(
+ mem_heap_zalloc(heap, (index.n_fields - u)
+ * sizeof *field_map_it));
+ instant->field_map = field_map_it;
+
+ ut_d(unsigned n_drop = 0);
+ ut_d(unsigned n_nullable = 0);
+ for (unsigned i = u; i < index.n_fields; i++) {
+ auto& f = index.fields[i];
+ ut_d(n_nullable += f.col->is_nullable());
+
+ if (!f.col->is_dropped()) {
+ (*field_map_it++).set_ind(f.col->ind);
+ continue;
+ }
+
+ auto fixed_len = dict_col_get_fixed_size(
+ f.col, not_redundant());
+ field_map_it->set_dropped();
+ if (!f.col->is_nullable()) {
+ field_map_it->set_not_null();
+ }
+ field_map_it->set_ind(fixed_len
+ ? uint16_t(fixed_len + 1)
+ : DATA_BIG_COL(f.col));
+ field_map_it++;
+ ut_ad(f.col >= table.instant->dropped);
+ ut_ad(f.col < table.instant->dropped
+ + table.instant->n_dropped);
+ ut_d(n_drop++);
+ size_t d = f.col - table.instant->dropped;
+ ut_ad(f.col == &table.instant->dropped[d]);
+ ut_ad(d <= instant->n_dropped);
+ f.col = &instant->dropped[d];
+ }
+ ut_ad(n_drop == n_dropped());
+ ut_ad(field_map_it == &instant->field_map[index.n_fields - u]);
+ ut_ad(index.n_nullable == n_nullable);
+}
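+
+/* Worked example (an illustrative reading of the loop above): a
+dropped NOT NULL column of fixed size 4 is encoded in its field_map
+element with set_dropped(), set_not_null() and set_ind(5), that is,
+fixed_len + 1; a dropped variable-length column stores
+DATA_BIG_COL(col) instead, recording only whether its length would
+need two bytes, so records remain parseable without the dropped
+column's full definition. */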
+
+/** Set is_instant() before instant_column().
+@param[in] old previous table definition
+@param[in] col_map map from old.cols[] and old.v_cols[] to this
+@param[out] first_alter_pos 0, or 1 + first changed column position */
+inline void dict_table_t::prepare_instant(const dict_table_t& old,
+ const ulint* col_map,
+ unsigned& first_alter_pos)
+{
+ DBUG_ASSERT(!is_instant());
+ DBUG_ASSERT(n_dropped() == 0);
+ DBUG_ASSERT(old.n_cols == old.n_def);
+ DBUG_ASSERT(n_cols == n_def);
+ DBUG_ASSERT(old.supports_instant());
+ DBUG_ASSERT(not_redundant() == old.not_redundant());
+ DBUG_ASSERT(DICT_TF_HAS_ATOMIC_BLOBS(flags)
+ == DICT_TF_HAS_ATOMIC_BLOBS(old.flags));
+ DBUG_ASSERT(!persistent_autoinc
+ || persistent_autoinc == old.persistent_autoinc);
+ /* supports_instant() does not necessarily hold here,
+ in case ROW_FORMAT=COMPRESSED according to the
+ MariaDB data dictionary, and ALTER_OPTIONS was not set.
+ If that is the case, the instant ALTER TABLE would keep
+ the InnoDB table in its current format. */
+
+ const dict_index_t& oindex = *old.indexes.start;
+ dict_index_t& index = *indexes.start;
+ first_alter_pos = 0;
+
+ for (unsigned i = 0; i + DATA_N_SYS_COLS < old.n_cols; i++) {
+ if (col_map[i] != i) {
+ first_alter_pos = 1 + i;
+ goto add_metadata;
+ }
+ }
+
+ if (!old.instant) {
+ /* Columns were not dropped or reordered.
+ Therefore columns must have been added at the end,
+ or modified instantly in place. */
+ DBUG_ASSERT(index.n_fields >= oindex.n_fields);
+ DBUG_ASSERT(index.n_fields > oindex.n_fields
+ || !not_redundant());
+#ifdef UNIV_DEBUG
+ if (index.n_fields == oindex.n_fields) {
+ ut_ad(!not_redundant());
+ for (unsigned i = index.n_fields; i--; ) {
+ ut_ad(index.fields[i].col->same_format(
+ *oindex.fields[i].col));
+ }
+ }
+#endif
+set_core_fields:
+ index.n_core_fields = oindex.n_core_fields;
+ index.n_core_null_bytes = oindex.n_core_null_bytes;
+ } else {
+add_metadata:
+ const unsigned n_old_drop = old.n_dropped();
+ unsigned n_drop = n_old_drop;
+ for (unsigned i = old.n_cols; i--; ) {
+ if (col_map[i] == ULINT_UNDEFINED) {
+ DBUG_ASSERT(i + DATA_N_SYS_COLS
+ < uint(old.n_cols));
+ n_drop++;
+ }
+ }
+
+ instant = new (mem_heap_alloc(heap, sizeof(dict_instant_t)))
+ dict_instant_t();
+ instant->n_dropped = n_drop;
+ if (n_drop) {
+ instant->dropped
+ = static_cast<dict_col_t*>(
+ mem_heap_alloc(heap, n_drop
+ * sizeof(dict_col_t)));
+ if (n_old_drop) {
+ memcpy(instant->dropped, old.instant->dropped,
+ n_old_drop * sizeof(dict_col_t));
+ }
+ } else {
+ instant->dropped = NULL;
+ }
+
+ for (unsigned i = 0, d = n_old_drop; i < old.n_cols; i++) {
+ if (col_map[i] == ULINT_UNDEFINED) {
+ (new (&instant->dropped[d++])
+ dict_col_t(old.cols[i]))->set_dropped();
+ }
+ }
+#ifndef DBUG_OFF
+ for (unsigned i = 0; i < n_drop; i++) {
+ DBUG_ASSERT(instant->dropped[i].is_dropped());
+ }
+#endif
+ const unsigned n_fields = index.n_fields + n_dropped();
+
+ DBUG_ASSERT(n_fields >= oindex.n_fields);
+ dict_field_t* fields = static_cast<dict_field_t*>(
+ mem_heap_zalloc(heap, n_fields * sizeof *fields));
+ unsigned i = 0, j = 0, n_nullable = 0;
+ ut_d(uint core_null = 0);
+ for (; i < oindex.n_fields; i++) {
+ DBUG_ASSERT(j <= i);
+ dict_field_t&f = fields[i] = oindex.fields[i];
+ if (f.col->is_dropped()) {
+ /* The column has been instantly
+ dropped earlier. */
+ DBUG_ASSERT(f.col >= old.instant->dropped);
+ {
+ size_t d = f.col
+ - old.instant->dropped;
+ DBUG_ASSERT(d < n_old_drop);
+ DBUG_ASSERT(&old.instant->dropped[d]
+ == f.col);
+ DBUG_ASSERT(!f.name);
+ f.col = instant->dropped + d;
+ }
+ if (f.col->is_nullable()) {
+found_nullable:
+ n_nullable++;
+ ut_d(core_null
+ += i < oindex.n_core_fields);
+ }
+ continue;
+ }
+
+ const ulint col_ind = col_map[f.col->ind];
+ if (col_ind != ULINT_UNDEFINED) {
+ if (index.fields[j].col->ind != col_ind) {
+ /* The fields for instantly
+ added columns must be placed
+ last in the clustered index.
+ Keep pre-existing fields in
+ the same position. */
+ uint k;
+ for (k = j + 1; k < index.n_fields;
+ k++) {
+ if (index.fields[k].col->ind
+ == col_ind) {
+ goto found_j;
+ }
+ }
+ DBUG_ASSERT("no such col" == 0);
+found_j:
+ std::swap(index.fields[j],
+ index.fields[k]);
+ }
+ DBUG_ASSERT(index.fields[j].col->ind
+ == col_ind);
+ fields[i] = index.fields[j++];
+ DBUG_ASSERT(!fields[i].col->is_dropped());
+ DBUG_ASSERT(fields[i].name
+ == fields[i].col->name(*this));
+ if (fields[i].col->is_nullable()) {
+ goto found_nullable;
+ }
+ continue;
+ }
+
+ /* This column is being dropped. */
+ unsigned d = n_old_drop;
+ for (unsigned c = 0; c < f.col->ind; c++) {
+ d += col_map[c] == ULINT_UNDEFINED;
+ }
+ DBUG_ASSERT(d < n_drop);
+ f.col = &instant->dropped[d];
+ f.name = NULL;
+ if (f.col->is_nullable()) {
+ goto found_nullable;
+ }
+ }
+ /* The n_core_null_bytes only matters for
+ ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables. */
+ ut_ad(UT_BITS_IN_BYTES(core_null) == oindex.n_core_null_bytes
+ || !not_redundant());
+ DBUG_ASSERT(i >= oindex.n_core_fields);
+ DBUG_ASSERT(j <= i);
+ DBUG_ASSERT(n_fields - (i - j) == index.n_fields);
+ std::sort(index.fields + j, index.fields + index.n_fields,
+ [](const dict_field_t& a, const dict_field_t& b)
+ { return a.col->ind < b.col->ind; });
+ for (; i < n_fields; i++) {
+ fields[i] = index.fields[j++];
+ n_nullable += fields[i].col->is_nullable();
+ DBUG_ASSERT(!fields[i].col->is_dropped());
+ DBUG_ASSERT(fields[i].name
+ == fields[i].col->name(*this));
+ }
+ DBUG_ASSERT(j == index.n_fields);
+ index.n_fields = index.n_def = n_fields
+ & dict_index_t::MAX_N_FIELDS;
+ index.fields = fields;
+ DBUG_ASSERT(n_nullable >= index.n_nullable);
+ DBUG_ASSERT(n_nullable >= oindex.n_nullable);
+ index.n_nullable = n_nullable & dict_index_t::MAX_N_FIELDS;
+ goto set_core_fields;
+ }
+
+ DBUG_ASSERT(n_cols + n_dropped() >= old.n_cols + old.n_dropped());
+ DBUG_ASSERT(n_dropped() >= old.n_dropped());
+ DBUG_ASSERT(index.n_core_fields == oindex.n_core_fields);
+ DBUG_ASSERT(index.n_core_null_bytes == oindex.n_core_null_bytes);
+}
+
+/** Adjust index metadata for instant ADD/DROP/reorder COLUMN.
+@param[in] clustered index definition after instant ALTER TABLE */
+inline void dict_index_t::instant_add_field(const dict_index_t& instant)
+{
+ DBUG_ASSERT(is_primary());
+ DBUG_ASSERT(instant.is_primary());
+ DBUG_ASSERT(!has_virtual());
+ DBUG_ASSERT(!instant.has_virtual());
+ DBUG_ASSERT(instant.n_core_fields <= instant.n_fields);
+ DBUG_ASSERT(n_def == n_fields);
+ DBUG_ASSERT(instant.n_def == instant.n_fields);
+ DBUG_ASSERT(type == instant.type);
+ DBUG_ASSERT(trx_id_offset == instant.trx_id_offset);
+ DBUG_ASSERT(n_user_defined_cols == instant.n_user_defined_cols);
+ DBUG_ASSERT(n_uniq == instant.n_uniq);
+ DBUG_ASSERT(instant.n_fields >= n_fields);
+ DBUG_ASSERT(instant.n_nullable >= n_nullable);
+ DBUG_ASSERT(instant.n_core_fields == n_core_fields);
+ DBUG_ASSERT(instant.n_core_null_bytes == n_core_null_bytes);
+
+ /* instant will have all fields (including ones for columns
+ that have been or are being instantly dropped) in the same position
+ as this index. Fields for any added columns are appended at the end. */
+#ifndef DBUG_OFF
+ for (unsigned i = 0; i < n_fields; i++) {
+ DBUG_ASSERT(fields[i].same(instant.fields[i]));
+ DBUG_ASSERT(instant.fields[i].col->same_format(*fields[i]
+ .col));
+ /* Instant conversion from NULL to NOT NULL is not allowed. */
+ DBUG_ASSERT(!fields[i].col->is_nullable()
+ || instant.fields[i].col->is_nullable());
+ DBUG_ASSERT(fields[i].col->is_nullable()
+ == instant.fields[i].col->is_nullable()
+ || !table->not_redundant());
+ }
+#endif
+ n_fields = instant.n_fields;
+ n_def = instant.n_def;
+ n_nullable = instant.n_nullable;
+ fields = static_cast<dict_field_t*>(
+ mem_heap_dup(heap, instant.fields, n_fields * sizeof *fields));
+
+ ut_d(unsigned n_null = 0);
+ ut_d(unsigned n_dropped = 0);
+
+ for (unsigned i = 0; i < n_fields; i++) {
+ const dict_col_t* icol = instant.fields[i].col;
+ dict_field_t& f = fields[i];
+ ut_d(n_null += icol->is_nullable());
+ DBUG_ASSERT(!icol->is_virtual());
+ if (icol->is_dropped()) {
+ ut_d(n_dropped++);
+ f.col->set_dropped();
+ f.name = NULL;
+ } else {
+ f.col = &table->cols[icol - instant.table->cols];
+ f.name = f.col->name(*table);
+ }
+ }
+
+ ut_ad(n_null == n_nullable);
+ ut_ad(n_dropped == instant.table->n_dropped());
+}
+
+/** Adjust table metadata for instant ADD/DROP/reorder COLUMN.
+@param[in] table altered table (with dropped columns)
+@param[in] col_map mapping from cols[] and v_cols[] to table
+@return whether the metadata record must be updated */
+inline bool dict_table_t::instant_column(const dict_table_t& table,
+ const ulint* col_map)
+{
+ DBUG_ASSERT(!table.cached);
+ DBUG_ASSERT(table.n_def == table.n_cols);
+ DBUG_ASSERT(table.n_t_def == table.n_t_cols);
+ DBUG_ASSERT(n_def == n_cols);
+ DBUG_ASSERT(n_t_def == n_t_cols);
+ DBUG_ASSERT(n_v_def == n_v_cols);
+ DBUG_ASSERT(table.n_v_def == table.n_v_cols);
+ DBUG_ASSERT(table.n_cols + table.n_dropped() >= n_cols + n_dropped());
+ DBUG_ASSERT(!table.persistent_autoinc
+ || persistent_autoinc == table.persistent_autoinc);
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ {
+ const char* end = table.col_names;
+ for (unsigned i = table.n_cols; i--; ) end += strlen(end) + 1;
+
+ col_names = static_cast<char*>(
+ mem_heap_dup(heap, table.col_names,
+ ulint(end - table.col_names)));
+ }
+ const dict_col_t* const old_cols = cols;
+ cols = static_cast<dict_col_t*>(mem_heap_dup(heap, table.cols,
+ table.n_cols
+ * sizeof *cols));
+
+ /* Preserve the default values of previously instantly added
+ columns, or copy the new default values to this->heap. */
+ for (uint16_t i = 0; i < table.n_cols; i++) {
+ dict_col_t& c = cols[i];
+
+ if (const dict_col_t* o = find(old_cols, col_map, n_cols, i)) {
+ c.def_val = o->def_val;
+ DBUG_ASSERT(!((c.prtype ^ o->prtype)
+ & ~(DATA_NOT_NULL | DATA_VERSIONED
+ | CHAR_COLL_MASK << 16
+ | DATA_LONG_TRUE_VARCHAR)));
+ DBUG_ASSERT(c.same_type(*o));
+ DBUG_ASSERT(c.len >= o->len);
+
+ if (o->vers_sys_start()) {
+ ut_ad(o->ind == vers_start);
+ vers_start = i & dict_index_t::MAX_N_FIELDS;
+ } else if (o->vers_sys_end()) {
+ ut_ad(o->ind == vers_end);
+ vers_end = i & dict_index_t::MAX_N_FIELDS;
+ }
+ continue;
+ }
+
+ DBUG_ASSERT(c.is_added());
+ if (c.def_val.len <= UNIV_PAGE_SIZE_MAX
+ && (!c.def_val.len
+ || !memcmp(c.def_val.data, field_ref_zero,
+ c.def_val.len))) {
+ c.def_val.data = field_ref_zero;
+ } else if (const void*& d = c.def_val.data) {
+ d = mem_heap_dup(heap, d, c.def_val.len);
+ } else {
+ DBUG_ASSERT(c.def_val.len == UNIV_SQL_NULL);
+ }
+ }
+
+ n_t_def = (n_t_def + (table.n_cols - n_cols))
+ & dict_index_t::MAX_N_FIELDS;
+ n_t_cols = (n_t_cols + (table.n_cols - n_cols))
+ & dict_index_t::MAX_N_FIELDS;
+ n_def = table.n_cols;
+
+ const dict_v_col_t* const old_v_cols = v_cols;
+
+ if (const char* end = table.v_col_names) {
+ for (unsigned i = table.n_v_cols; i--; ) {
+ end += strlen(end) + 1;
+ }
+
+ v_col_names = static_cast<char*>(
+ mem_heap_dup(heap, table.v_col_names,
+ ulint(end - table.v_col_names)));
+ v_cols = static_cast<dict_v_col_t*>(
+ mem_heap_alloc(heap, table.n_v_cols * sizeof(*v_cols)));
+ for (ulint i = table.n_v_cols; i--; ) {
+ new (&v_cols[i]) dict_v_col_t(table.v_cols[i]);
+ v_cols[i].v_indexes.clear();
+ }
+ } else {
+ ut_ad(table.n_v_cols == 0);
+ v_col_names = NULL;
+ v_cols = NULL;
+ }
+
+ n_t_def = (n_t_def + (table.n_v_cols - n_v_cols))
+ & dict_index_t::MAX_N_FIELDS;
+ n_t_cols = (n_t_cols + (table.n_v_cols - n_v_cols))
+ & dict_index_t::MAX_N_FIELDS;
+ n_v_def = table.n_v_cols;
+
+ for (unsigned i = 0; i < n_v_def; i++) {
+ dict_v_col_t& v = v_cols[i];
+ DBUG_ASSERT(v.v_indexes.empty());
+ v.base_col = static_cast<dict_col_t**>(
+ mem_heap_dup(heap, v.base_col,
+ v.num_base * sizeof *v.base_col));
+
+ for (ulint n = v.num_base; n--; ) {
+ dict_col_t*& base = v.base_col[n];
+ if (base->is_virtual()) {
+ } else if (base >= table.cols
+ && base < table.cols + table.n_cols) {
+ /* The base column was instantly added. */
+ size_t c = base - table.cols;
+ DBUG_ASSERT(base == &table.cols[c]);
+ base = &cols[c];
+ } else {
+ DBUG_ASSERT(base >= old_cols);
+ size_t c = base - old_cols;
+ DBUG_ASSERT(c + DATA_N_SYS_COLS < n_cols);
+ DBUG_ASSERT(base == &old_cols[c]);
+ DBUG_ASSERT(col_map[c] + DATA_N_SYS_COLS
+ < n_cols);
+ base = &cols[col_map[c]];
+ }
+ }
+ }
+
+ dict_index_t* index = dict_table_get_first_index(this);
+ bool metadata_changed;
+ {
+ const dict_index_t& i = *dict_table_get_first_index(&table);
+ metadata_changed = i.n_fields > index->n_fields;
+ ut_ad(i.n_fields >= index->n_fields);
+ index->instant_add_field(i);
+ }
+
+ if (instant || table.instant) {
+ const auto old_instant = instant;
+ /* FIXME: add instant->heap, and transfer ownership here */
+ if (!instant) {
+ instant = new (mem_heap_zalloc(heap, sizeof *instant))
+ dict_instant_t();
+ goto dup_dropped;
+ } else if (n_dropped() < table.n_dropped()) {
+dup_dropped:
+ instant->dropped = static_cast<dict_col_t*>(
+ mem_heap_dup(heap, table.instant->dropped,
+ table.instant->n_dropped
+ * sizeof *instant->dropped));
+ instant->n_dropped = table.instant->n_dropped;
+ } else if (table.instant->n_dropped) {
+ memcpy(instant->dropped, table.instant->dropped,
+ table.instant->n_dropped
+ * sizeof *instant->dropped);
+ }
+
+ const field_map_element_t* field_map = old_instant
+ ? old_instant->field_map : NULL;
+
+ init_instant(table);
+
+ if (!metadata_changed) {
+ metadata_changed = !field_map
+ || memcmp(field_map,
+ instant->field_map,
+ (index->n_fields
+ - index->first_user_field())
+ * sizeof *field_map);
+ }
+ }
+
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ if (index->to_be_dropped) {
+ continue;
+ }
+ for (unsigned i = 0; i < index->n_fields; i++) {
+ dict_field_t& f = index->fields[i];
+ if (f.col >= table.cols
+ && f.col < table.cols + table.n_cols) {
+ /* This is an instantly added column
+ in a newly added index. */
+ DBUG_ASSERT(!f.col->is_virtual());
+ size_t c = f.col - table.cols;
+ DBUG_ASSERT(f.col == &table.cols[c]);
+ f.col = &cols[c];
+ } else if (f.col >= &table.v_cols->m_col
+ && f.col < &table.v_cols[n_v_cols].m_col) {
+ /* This is an instantly added virtual column
+ in a newly added index. */
+ DBUG_ASSERT(f.col->is_virtual());
+ size_t c = reinterpret_cast<dict_v_col_t*>(
+ f.col) - table.v_cols;
+ DBUG_ASSERT(f.col == &table.v_cols[c].m_col);
+ f.col = &v_cols[c].m_col;
+ } else if (f.col < old_cols
+ || f.col >= old_cols + n_cols) {
+ DBUG_ASSERT(f.col->is_virtual());
+ f.col = &v_cols[col_map[
+ reinterpret_cast<dict_v_col_t*>(
+ f.col)
+ - old_v_cols + n_cols]].m_col;
+ } else {
+ f.col = &cols[col_map[f.col - old_cols]];
+ DBUG_ASSERT(!f.col->is_virtual());
+ }
+ f.name = f.col->name(*this);
+ if (f.col->is_virtual()) {
+ dict_v_col_t* v_col = reinterpret_cast
+ <dict_v_col_t*>(f.col);
+ v_col->v_indexes.push_front(
+ dict_v_idx_t(index, i));
+ }
+ }
+ }
+
+ n_cols = table.n_cols;
+ n_v_cols = table.n_v_cols;
+ return metadata_changed;
+}
+
+/** Find the old column number for the given new column position.
+@param[in] col_map column map from old column to new column
+@param[in] pos new column position
+@param[in] n number of columns present in the column map
+@return old column position for the given new column position. */
+static ulint find_old_col_no(const ulint* col_map, ulint pos, ulint n)
+{
+ do {
+ ut_ad(n);
+ } while (col_map[--n] != pos);
+ return n;
+}
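+
+/* Worked example (illustrative): with col_map[] = {2, 0, 1},
+find_old_col_no(col_map, 1, 3) starts from the end, finds
+col_map[2] == 1 and returns 2: old column 2 was mapped to new
+position 1. */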
+
+/** Roll back instant_column().
+@param[in] old_n_cols original n_cols
+@param[in] old_cols original cols
+@param[in] old_col_names original col_names
+@param[in] old_instant original instant structure
+@param[in] old_fields original fields
+@param[in] old_n_fields original number of fields
+@param[in] old_n_core_fields original number of core fields
+@param[in] old_n_v_cols original n_v_cols
+@param[in] old_v_cols original v_cols
+@param[in] old_v_col_names original v_col_names
+@param[in] col_map column map */
+inline void dict_table_t::rollback_instant(
+ unsigned old_n_cols,
+ dict_col_t* old_cols,
+ const char* old_col_names,
+ dict_instant_t* old_instant,
+ dict_field_t* old_fields,
+ unsigned old_n_fields,
+ unsigned old_n_core_fields,
+ unsigned old_n_v_cols,
+ dict_v_col_t* old_v_cols,
+ const char* old_v_col_names,
+ const ulint* col_map)
+{
+ ut_d(dict_sys.assert_locked());
+
+ if (cols == old_cols) {
+		/* The ALTER TABLE failed before any instant operation
+		took place, so there is nothing to roll back. */
+ return;
+ }
+
+ dict_index_t* index = indexes.start;
+ /* index->is_instant() does not necessarily hold here, because
+ the table may have been emptied */
+ DBUG_ASSERT(old_n_cols >= DATA_N_SYS_COLS);
+ DBUG_ASSERT(n_cols == n_def);
+ DBUG_ASSERT(index->n_def == index->n_fields);
+ DBUG_ASSERT(index->n_core_fields <= index->n_fields);
+ DBUG_ASSERT(old_n_core_fields <= old_n_fields);
+ DBUG_ASSERT(instant || !old_instant);
+
+ instant = old_instant;
+
+ index->n_nullable = 0;
+
+ for (unsigned i = old_n_fields; i--; ) {
+ if (old_fields[i].col->is_nullable()) {
+ index->n_nullable++;
+ }
+ }
+
+ for (unsigned i = n_v_cols; i--; ) {
+ v_cols[i].~dict_v_col_t();
+ }
+
+ index->n_core_fields = ((index->n_fields == index->n_core_fields)
+ ? old_n_fields
+ : old_n_core_fields)
+ & dict_index_t::MAX_N_FIELDS;
+ index->n_def = index->n_fields = old_n_fields
+ & dict_index_t::MAX_N_FIELDS;
+ index->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(index->get_n_nullable(index->n_core_fields)));
+
+ const dict_col_t* const new_cols = cols;
+ const dict_col_t* const new_cols_end __attribute__((unused)) = cols + n_cols;
+ const dict_v_col_t* const new_v_cols = v_cols;
+ const dict_v_col_t* const new_v_cols_end __attribute__((unused))= v_cols + n_v_cols;
+
+ cols = old_cols;
+ col_names = old_col_names;
+ v_cols = old_v_cols;
+ v_col_names = old_v_col_names;
+ n_def = n_cols = old_n_cols & dict_index_t::MAX_N_FIELDS;
+ n_v_def = n_v_cols = old_n_v_cols & dict_index_t::MAX_N_FIELDS;
+ n_t_def = n_t_cols = (n_cols + n_v_cols) & dict_index_t::MAX_N_FIELDS;
+
+ if (versioned()) {
+ for (unsigned i = 0; i < n_cols; ++i) {
+ if (cols[i].vers_sys_start()) {
+ vers_start = i & dict_index_t::MAX_N_FIELDS;
+ } else if (cols[i].vers_sys_end()) {
+ vers_end = i & dict_index_t::MAX_N_FIELDS;
+ }
+ }
+ }
+
+ index->fields = old_fields;
+
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ if (index->to_be_dropped) {
+ /* instant_column() did not adjust these indexes. */
+ continue;
+ }
+
+ for (unsigned i = 0; i < index->n_fields; i++) {
+ dict_field_t& f = index->fields[i];
+ if (f.col->is_virtual()) {
+ DBUG_ASSERT(f.col >= &new_v_cols->m_col);
+ DBUG_ASSERT(f.col < &new_v_cols_end->m_col);
+ size_t n = size_t(
+ reinterpret_cast<dict_v_col_t*>(f.col)
+ - new_v_cols);
+ DBUG_ASSERT(n <= n_v_cols);
+
+ ulint old_col_no = find_old_col_no(
+ col_map + n_cols, n, n_v_cols);
+ DBUG_ASSERT(old_col_no <= n_v_cols);
+ f.col = &v_cols[old_col_no].m_col;
+ DBUG_ASSERT(f.col->is_virtual());
+ } else {
+ DBUG_ASSERT(f.col >= new_cols);
+ DBUG_ASSERT(f.col < new_cols_end);
+ size_t n = size_t(f.col - new_cols);
+ DBUG_ASSERT(n <= n_cols);
+
+ ulint old_col_no = find_old_col_no(col_map,
+ n, n_cols);
+ DBUG_ASSERT(old_col_no < n_cols);
+ f.col = &cols[old_col_no];
+ DBUG_ASSERT(!f.col->is_virtual());
+ }
+ f.name = f.col->name(*this);
+ }
+ }
+}
+
+struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
+{
+ /** Dummy query graph */
+ que_thr_t* thr;
+ /** The prebuilt struct of the creating instance */
+ row_prebuilt_t*& prebuilt;
+ /** InnoDB indexes being created */
+ dict_index_t** add_index;
+ /** MySQL key numbers for the InnoDB indexes that are being created */
+ const ulint* add_key_numbers;
+ /** number of InnoDB indexes being created */
+ ulint num_to_add_index;
+ /** InnoDB indexes being dropped */
+ dict_index_t** drop_index;
+ /** number of InnoDB indexes being dropped */
+ const ulint num_to_drop_index;
+ /** InnoDB foreign key constraints being dropped */
+ dict_foreign_t** drop_fk;
+ /** number of InnoDB foreign key constraints being dropped */
+ const ulint num_to_drop_fk;
+ /** InnoDB foreign key constraints being added */
+ dict_foreign_t** add_fk;
+	/** number of InnoDB foreign key constraints being added */
+ const ulint num_to_add_fk;
+ /** whether to create the indexes online */
+ bool online;
+ /** memory heap */
+ mem_heap_t* heap;
+ /** dictionary transaction */
+ trx_t* trx;
+ /** original table (if rebuilt, differs from indexed_table) */
+ dict_table_t* old_table;
+ /** table where the indexes are being created or dropped */
+ dict_table_t* new_table;
+ /** table definition for instant ADD/DROP/reorder COLUMN */
+ dict_table_t* instant_table;
+ /** mapping of old column numbers to new ones, or NULL */
+ const ulint* col_map;
+ /** new column names, or NULL if nothing was renamed */
+ const char** col_names;
+ /** added AUTO_INCREMENT column position, or ULINT_UNDEFINED */
+ const ulint add_autoinc;
+ /** default values of ADD and CHANGE COLUMN, or NULL */
+ const dtuple_t* defaults;
+ /** autoinc sequence to use */
+ ib_sequence_t sequence;
+ /** temporary table name to use for old table when renaming tables */
+ const char* tmp_name;
+ /** whether the order of the clustered index is unchanged */
+ bool skip_pk_sort;
+ /** number of virtual columns to be added */
+ unsigned num_to_add_vcol;
+ /** virtual columns to be added */
+ dict_v_col_t* add_vcol;
+ const char** add_vcol_name;
+ /** number of virtual columns to be dropped */
+ unsigned num_to_drop_vcol;
+ /** virtual columns to be dropped */
+ dict_v_col_t* drop_vcol;
+ const char** drop_vcol_name;
+ /** ALTER TABLE stage progress recorder */
+ ut_stage_alter_t* m_stage;
+ /** original number of user columns in the table */
+ const unsigned old_n_cols;
+ /** original columns of the table */
+ dict_col_t* const old_cols;
+ /** original column names of the table */
+ const char* const old_col_names;
+ /** original instantly dropped or reordered columns */
+ dict_instant_t* const old_instant;
+ /** original index fields */
+ dict_field_t* const old_fields;
+ /** size of old_fields */
+ const unsigned old_n_fields;
+ /** original old_table->n_core_fields */
+ const unsigned old_n_core_fields;
+ /** original number of virtual columns in the table */
+ const unsigned old_n_v_cols;
+ /** original virtual columns of the table */
+ dict_v_col_t* const old_v_cols;
+ /** original virtual column names of the table */
+ const char* const old_v_col_names;
+ /** 0, or 1 + first column whose position changes in instant ALTER */
+ unsigned first_alter_pos;
+	/** Whether to allow conversion of NULL values to NOT NULL:
+	(1) ALTER IGNORE allows the conversion
+	irrespective of sql_mode;
+	(2) in strict mode, the conversion is not allowed;
+	(3) in non-strict mode, the conversion is allowed. */
+ const bool allow_not_null;
+
+ /** The page_compression_level attribute, or 0 */
+ const uint page_compression_level;
+
+ ha_innobase_inplace_ctx(row_prebuilt_t*& prebuilt_arg,
+ dict_index_t** drop_arg,
+ ulint num_to_drop_arg,
+ dict_foreign_t** drop_fk_arg,
+ ulint num_to_drop_fk_arg,
+ dict_foreign_t** add_fk_arg,
+ ulint num_to_add_fk_arg,
+ bool online_arg,
+ mem_heap_t* heap_arg,
+ dict_table_t* new_table_arg,
+ const char** col_names_arg,
+ ulint add_autoinc_arg,
+ ulonglong autoinc_col_min_value_arg,
+ ulonglong autoinc_col_max_value_arg,
+ bool allow_not_null_flag,
+ bool page_compressed,
+ ulonglong page_compression_level_arg) :
+ inplace_alter_handler_ctx(),
+ prebuilt (prebuilt_arg),
+ add_index (0), add_key_numbers (0), num_to_add_index (0),
+ drop_index (drop_arg), num_to_drop_index (num_to_drop_arg),
+ drop_fk (drop_fk_arg), num_to_drop_fk (num_to_drop_fk_arg),
+ add_fk (add_fk_arg), num_to_add_fk (num_to_add_fk_arg),
+ online (online_arg), heap (heap_arg), trx (0),
+ old_table (prebuilt_arg->table),
+ new_table (new_table_arg), instant_table (0),
+ col_map (0), col_names (col_names_arg),
+ add_autoinc (add_autoinc_arg),
+ defaults (0),
+ sequence(prebuilt->trx->mysql_thd,
+ autoinc_col_min_value_arg, autoinc_col_max_value_arg),
+ tmp_name (0),
+ skip_pk_sort(false),
+ num_to_add_vcol(0),
+ add_vcol(0),
+ add_vcol_name(0),
+ num_to_drop_vcol(0),
+ drop_vcol(0),
+ drop_vcol_name(0),
+ m_stage(NULL),
+ old_n_cols(prebuilt_arg->table->n_cols),
+ old_cols(prebuilt_arg->table->cols),
+ old_col_names(prebuilt_arg->table->col_names),
+ old_instant(prebuilt_arg->table->instant),
+ old_fields(prebuilt_arg->table->indexes.start->fields),
+ old_n_fields(prebuilt_arg->table->indexes.start->n_fields),
+ old_n_core_fields(prebuilt_arg->table->indexes.start
+ ->n_core_fields),
+ old_n_v_cols(prebuilt_arg->table->n_v_cols),
+ old_v_cols(prebuilt_arg->table->v_cols),
+ old_v_col_names(prebuilt_arg->table->v_col_names),
+ first_alter_pos(0),
+ allow_not_null(allow_not_null_flag),
+ page_compression_level(page_compressed
+ ? (page_compression_level_arg
+ ? uint(page_compression_level_arg)
+ : page_zip_level)
+ : 0)
+ {
+ ut_ad(old_n_cols >= DATA_N_SYS_COLS);
+ ut_ad(page_compression_level <= 9);
+#ifdef UNIV_DEBUG
+ for (ulint i = 0; i < num_to_add_index; i++) {
+ ut_ad(!add_index[i]->to_be_dropped);
+ }
+ for (ulint i = 0; i < num_to_drop_index; i++) {
+ ut_ad(drop_index[i]->to_be_dropped);
+ }
+#endif /* UNIV_DEBUG */
+
+ thr = pars_complete_graph_for_exec(NULL, prebuilt->trx, heap,
+ prebuilt);
+ }
+
+ ~ha_innobase_inplace_ctx()
+ {
+ UT_DELETE(m_stage);
+ if (instant_table) {
+ ut_ad(!instant_table->id);
+ while (dict_index_t* index
+ = UT_LIST_GET_LAST(instant_table->indexes)) {
+ UT_LIST_REMOVE(instant_table->indexes, index);
+ rw_lock_free(&index->lock);
+ dict_mem_index_free(index);
+ }
+ for (unsigned i = old_n_v_cols; i--; ) {
+ old_v_cols[i].~dict_v_col_t();
+ }
+ if (instant_table->fts) {
+ fts_free(instant_table);
+ }
+ dict_mem_table_free(instant_table);
+ }
+ mem_heap_free(heap);
+ }
+
+ /** Determine if the table will be rebuilt.
+ @return whether the table will be rebuilt */
+ bool need_rebuild () const { return(old_table != new_table); }
+
+ /** Convert table-rebuilding ALTER to instant ALTER. */
+ void prepare_instant()
+ {
+ DBUG_ASSERT(need_rebuild());
+ DBUG_ASSERT(!is_instant());
+ DBUG_ASSERT(old_table->n_cols == old_n_cols);
+
+ instant_table = new_table;
+ new_table = old_table;
+ export_vars.innodb_instant_alter_column++;
+
+ instant_table->prepare_instant(*old_table, col_map,
+ first_alter_pos);
+ }
+
+ /** Adjust table metadata for instant ADD/DROP/reorder COLUMN.
+ @return whether the metadata record must be updated */
+ bool instant_column()
+ {
+ DBUG_ASSERT(is_instant());
+ DBUG_ASSERT(old_n_fields
+ == old_table->indexes.start->n_fields);
+ return old_table->instant_column(*instant_table, col_map);
+ }
+
+ /** Revert prepare_instant() if the transaction is rolled back. */
+ void rollback_instant()
+ {
+ if (!is_instant()) return;
+ old_table->rollback_instant(old_n_cols,
+ old_cols, old_col_names,
+ old_instant,
+ old_fields, old_n_fields,
+ old_n_core_fields,
+ old_n_v_cols, old_v_cols,
+ old_v_col_names,
+ col_map);
+ }
+
+ /** @return whether this is instant ALTER TABLE */
+ bool is_instant() const
+ {
+ DBUG_ASSERT(!instant_table || !instant_table->can_be_evicted);
+ return instant_table;
+ }
+
+	/** Create index definitions, ordered as follows:
+
+ IF a new primary key is defined for the table THEN
+
+ 1) New primary key
+ 2) The remaining keys in key_info
+
+ ELSE
+
+ 1) All new indexes in the order they arrive from MySQL
+
+ ENDIF
+
+ @return key definitions */
+ MY_ATTRIBUTE((nonnull, warn_unused_result, malloc))
+ inline index_def_t*
+ create_key_defs(
+ const Alter_inplace_info* ha_alter_info,
+ /*!< in: alter operation */
+ const TABLE* altered_table,
+ /*!< in: MySQL table that is being altered */
+ ulint& n_fts_add,
+ /*!< out: number of FTS indexes to be created */
+ ulint& fts_doc_id_col,
+ /*!< in: The column number for Doc ID */
+ bool& add_fts_doc_id,
+ /*!< in: whether we need to add new DOC ID
+ column for FTS index */
+ bool& add_fts_doc_idx,
+ /*!< in: whether we need to add new DOC ID
+ index for FTS index */
+ const TABLE* table);
+		/*!< in: MySQL table before the ALTER operation */
+
+ /** Share context between partitions.
+ @param[in] ctx context from another partition of the table */
+ void set_shared_data(const inplace_alter_handler_ctx& ctx)
+ {
+ if (add_autoinc != ULINT_UNDEFINED) {
+ const ha_innobase_inplace_ctx& ha_ctx =
+ static_cast<const ha_innobase_inplace_ctx&>
+ (ctx);
+ /* When adding an AUTO_INCREMENT column to a
+ partitioned InnoDB table, we must share the
+ sequence for all partitions. */
+ ut_ad(ha_ctx.add_autoinc == add_autoinc);
+ ut_ad(ha_ctx.sequence.last());
+ sequence = ha_ctx.sequence;
+ }
+ }
+
+ /** @return whether the given column is being added */
+ bool is_new_vcol(const dict_v_col_t &v_col) const
+ {
+ for (ulint i= 0; i < num_to_add_vcol; i++)
+ if (&add_vcol[i] == &v_col)
+ return true;
+ return false;
+ }
+
+ /** During rollback, make newly added indexes point to
+ newly added virtual columns. */
+ void clean_new_vcol_index()
+ {
+ ut_ad(old_table == new_table);
+ const dict_index_t *index= dict_table_get_first_index(old_table);
+ while ((index= dict_table_get_next_index(index)) != NULL)
+ {
+ if (!index->has_virtual() || index->is_committed())
+ continue;
+ ulint n_drop_new_vcol= index->get_new_n_vcol();
+ for (ulint i= 0; n_drop_new_vcol && i < index->n_fields; i++)
+ {
+ dict_col_t *col= index->fields[i].col;
+ /* Skip the non-virtual and old virtual columns */
+ if (!col->is_virtual())
+ continue;
+ dict_v_col_t *vcol= reinterpret_cast<dict_v_col_t*>(col);
+ if (!is_new_vcol(*vcol))
+ continue;
+
+ index->fields[i].col= &index->new_vcol_info->
+ add_drop_v_col(index->heap, vcol, --n_drop_new_vcol)->m_col;
+ }
+ }
+ }
+
+private:
+ // Disable copying
+ ha_innobase_inplace_ctx(const ha_innobase_inplace_ctx&);
+ ha_innobase_inplace_ctx& operator=(const ha_innobase_inplace_ctx&);
+};
+
+/********************************************************************//**
+Get the upper limit of a MySQL integral or floating-point type.
+@return maximum allowed value for the field */
+UNIV_INTERN
+ulonglong
+innobase_get_int_col_max_value(
+/*===========================*/
+ const Field* field); /*!< in: MySQL field */
+
+/* Report an InnoDB error to the client by invoking my_error(). */
+static ATTRIBUTE_COLD __attribute__((nonnull))
+void
+my_error_innodb(
+/*============*/
+ dberr_t error, /*!< in: InnoDB error code */
+ const char* table, /*!< in: table name */
+ ulint flags) /*!< in: table flags */
+{
+ switch (error) {
+ case DB_MISSING_HISTORY:
+ my_error(ER_TABLE_DEF_CHANGED, MYF(0));
+ break;
+ case DB_RECORD_NOT_FOUND:
+ my_error(ER_KEY_NOT_FOUND, MYF(0), table);
+ break;
+ case DB_DEADLOCK:
+ my_error(ER_LOCK_DEADLOCK, MYF(0));
+ break;
+ case DB_LOCK_WAIT_TIMEOUT:
+ my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
+ break;
+ case DB_INTERRUPTED:
+ my_error(ER_QUERY_INTERRUPTED, MYF(0));
+ break;
+ case DB_OUT_OF_MEMORY:
+ my_error(ER_OUT_OF_RESOURCES, MYF(0));
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ my_error(ER_RECORD_FILE_FULL, MYF(0), table);
+ break;
+ case DB_TEMP_FILE_WRITE_FAIL:
+ my_error(ER_TEMP_FILE_WRITE_FAILURE, MYF(0));
+ break;
+ case DB_TOO_BIG_INDEX_COL:
+ my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+ (ulong) DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
+ break;
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0));
+ break;
+ case DB_LOCK_TABLE_FULL:
+ my_error(ER_LOCK_TABLE_FULL, MYF(0));
+ break;
+ case DB_UNDO_RECORD_TOO_BIG:
+ my_error(ER_UNDO_RECORD_TOO_BIG, MYF(0));
+ break;
+ case DB_CORRUPTION:
+ my_error(ER_NOT_KEYFILE, MYF(0), table);
+ break;
+ case DB_TOO_BIG_RECORD: {
+ /* Note that in page0zip.ic page_zip_rec_needs_ext() rec_size
+ is limited to COMPRESSED_REC_MAX_DATA_SIZE (16K) or
+ REDUNDANT_REC_MAX_DATA_SIZE (16K-1). */
+ bool comp = !!(flags & DICT_TF_COMPACT);
+ ulint free_space = page_get_free_space_of_empty(comp) / 2;
+
+ if (free_space >= ulint(comp ? COMPRESSED_REC_MAX_DATA_SIZE :
+ REDUNDANT_REC_MAX_DATA_SIZE)) {
+ free_space = (comp ? COMPRESSED_REC_MAX_DATA_SIZE :
+ REDUNDANT_REC_MAX_DATA_SIZE) - 1;
+ }
+
+ my_error(ER_TOO_BIG_ROWSIZE, MYF(0), free_space);
+ break;
+ }
+ case DB_INVALID_NULL:
+ /* TODO: report the row, as we do for DB_DUPLICATE_KEY */
+ my_error(ER_INVALID_USE_OF_NULL, MYF(0));
+ break;
+ case DB_CANT_CREATE_GEOMETRY_OBJECT:
+ my_error(ER_CANT_CREATE_GEOMETRY_OBJECT, MYF(0));
+ break;
+ case DB_TABLESPACE_EXISTS:
+ my_error(ER_TABLESPACE_EXISTS, MYF(0), table);
+ break;
+
+#ifdef UNIV_DEBUG
+ case DB_SUCCESS:
+ case DB_DUPLICATE_KEY:
+ case DB_ONLINE_LOG_TOO_BIG:
+ /* These codes should not be passed here. */
+ ut_error;
+#endif /* UNIV_DEBUG */
+ default:
+ my_error(ER_GET_ERRNO, MYF(0), error, "InnoDB");
+ break;
+ }
+}
+
+/** Count the fulltext indexes in a given table.
+@param table MySQL table
+@return number of fulltext indexes */
+static uint innobase_fulltext_exist(const TABLE* table)
+{
+ uint count = 0;
+
+ for (uint i = 0; i < table->s->keys; i++) {
+ if (table->key_info[i].flags & HA_FULLTEXT) {
+ count++;
+ }
+ }
+
+ return count;
+}
+
+/** Determine whether indexed virtual columns exist in a table.
+@param[in] table table definition
+@return whether indexes exist on virtual columns */
+static bool innobase_indexed_virtual_exist(const TABLE* table)
+{
+ const KEY* const end = &table->key_info[table->s->keys];
+
+ for (const KEY* key = table->key_info; key < end; key++) {
+ const KEY_PART_INFO* const key_part_end = key->key_part
+ + key->user_defined_key_parts;
+ for (const KEY_PART_INFO* key_part = key->key_part;
+ key_part < key_part_end; key_part++) {
+ if (!key_part->field->stored_in_db())
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/** Determine if spatial indexes exist in a given table.
+@param table MySQL table
+@return whether spatial indexes exist on the table */
+static
+bool
+innobase_spatial_exist(
+/*===================*/
+ const TABLE* table)
+{
+ for (uint i = 0; i < table->s->keys; i++) {
+ if (table->key_info[i].flags & HA_SPATIAL) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Determine if ALTER_OPTIONS requires rebuilding the table.
+@param[in] ha_alter_info the ALTER TABLE operation
+@param[in] table metadata before ALTER TABLE
+@return whether it is mandatory to rebuild the table */
+static bool alter_options_need_rebuild(
+ const Alter_inplace_info* ha_alter_info,
+ const TABLE* table)
+{
+ DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_OPTIONS);
+
+ if (ha_alter_info->create_info->used_fields
+ & (HA_CREATE_USED_ROW_FORMAT
+ | HA_CREATE_USED_KEY_BLOCK_SIZE)) {
+ /* Specifying ROW_FORMAT or KEY_BLOCK_SIZE requires
+ rebuilding the table. (These attributes in the .frm
+ file may disagree with the InnoDB data dictionary, and
+		the interpretation of these attributes depends on
+		InnoDB parameters. That is why, for now, we always
+ require a rebuild when these attributes are specified.) */
+ return true;
+ }
+
+ const ha_table_option_struct& alt_opt=
+ *ha_alter_info->create_info->option_struct;
+ const ha_table_option_struct& opt= *table->s->option_struct;
+
+ /* Allow an instant change to enable page_compressed,
+ and any change of page_compression_level. */
+ if ((!alt_opt.page_compressed && opt.page_compressed)
+ || alt_opt.encryption != opt.encryption
+ || alt_opt.encryption_key_id != opt.encryption_key_id) {
+ return(true);
+ }
+
+ return false;
+}
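+
+/* Examples (illustrative): on a table t that is not page_compressed,
+ALTER TABLE t page_compressed=1 does not force a rebuild, while
+disabling page_compressed on a compressed table does, as does any
+change of ROW_FORMAT, KEY_BLOCK_SIZE, encryption or
+encryption_key_id. */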
+
+/** Determine if ALTER TABLE needs to rebuild the table
+(or perform instant operation).
+@param[in] ha_alter_info the ALTER TABLE operation
+@param[in] table metadata before ALTER TABLE
+@return whether it is necessary to rebuild the table or to alter columns */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_need_rebuild(
+ const Alter_inplace_info* ha_alter_info,
+ const TABLE* table)
+{
+ if ((ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE
+ | INNOBASE_ALTER_NOREBUILD
+ | INNOBASE_ALTER_INSTANT))
+ == ALTER_OPTIONS) {
+ return alter_options_need_rebuild(ha_alter_info, table);
+ }
+
+ return !!(ha_alter_info->handler_flags & INNOBASE_ALTER_REBUILD);
+}
+
+/** Check whether the virtual columns in the old and new table are in the
+same order, excluding dropped columns. This is needed because when a virtual
+column is dropped, ALTER_VIRTUAL_COLUMN_ORDER is also turned on, so we cannot
+tell whether this is a real ORDER change or just a DROP COLUMN.
+@param[in]	table		old TABLE
+@param[in]	altered_table	new TABLE
+@param[in]	ha_alter_info	Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+@return true if all columns are in order, false otherwise. */
+static
+bool
+check_v_col_in_order(
+ const TABLE* table,
+ const TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info)
+{
+ ulint j = 0;
+
+	/* We do not support adding a new virtual column before
+	an existing virtual column. */
+ if (ha_alter_info->handler_flags
+ & ALTER_ADD_VIRTUAL_COLUMN) {
+ bool has_new = false;
+
+ for (const Create_field& new_field :
+ ha_alter_info->alter_info->create_list) {
+ if (new_field.stored_in_db()) {
+ continue;
+ }
+
+			/* Found a newly added virtual column. */
+ if (!new_field.field) {
+ has_new = true;
+ continue;
+ }
+
+			/* If there is any old virtual column
+			after the newly added virtual column,
+			the order must have changed. */
+ if (has_new) {
+ return(false);
+ }
+ }
+ }
+
+ /* directly return true if ALTER_VIRTUAL_COLUMN_ORDER is not on */
+ if (!(ha_alter_info->handler_flags
+ & ALTER_VIRTUAL_COLUMN_ORDER)) {
+ return(true);
+ }
+
+ for (ulint i = 0; i < table->s->fields; i++) {
+ Field* field = table->field[i];
+
+ if (field->stored_in_db()) {
+ continue;
+ }
+
+ if (field->flags & FIELD_IS_DROPPED) {
+ continue;
+ }
+
+ /* Now check if the next virtual column in altered table
+ matches this column */
+ while (j < altered_table->s->fields) {
+ Field* new_field = altered_table->s->field[j];
+
+ if (new_field->stored_in_db()) {
+ j++;
+ continue;
+ }
+
+ if (my_strcasecmp(system_charset_info,
+ field->field_name.str,
+ new_field->field_name.str) != 0) {
+ /* different column */
+ return(false);
+ } else {
+ j++;
+ break;
+ }
+ }
+
+ if (j > altered_table->s->fields) {
+			/* There should not be fewer columns in the new
+			table unless they are in the drop list. */
+ ut_ad(0);
+ return(false);
+ }
+ }
+
+ return(true);
+}
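+
+/* Example (illustrative): if the old table has virtual columns
+(v1, v2) and the ALTER drops v1, ALTER_VIRTUAL_COLUMN_ORDER is set
+even though the relative order of the remaining virtual columns is
+unchanged; the scan above skips the dropped column and returns true. */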
+
+/** Determine if an instant operation is possible for altering columns.
+@param[in] ib_table InnoDB table definition
+@param[in] ha_alter_info the ALTER TABLE operation
+@param[in] table table definition before ALTER TABLE
+@param[in] altered_table table definition after ALTER TABLE
+@param[in]	strict		whether to ensure that user records fit
+@return whether the column alteration can be performed instantly */
+static
+bool
+instant_alter_column_possible(
+ const dict_table_t& ib_table,
+ const Alter_inplace_info* ha_alter_info,
+ const TABLE* table,
+ const TABLE* altered_table,
+ bool strict)
+{
+ const dict_index_t* const pk = ib_table.indexes.start;
+ ut_ad(pk->is_primary());
+ ut_ad(!pk->has_virtual());
+
+ if (ha_alter_info->handler_flags
+ & (ALTER_STORED_COLUMN_ORDER | ALTER_DROP_STORED_COLUMN
+ | ALTER_ADD_STORED_BASE_COLUMN)) {
+#if 1 // MDEV-17459: adjust fts_fetch_doc_from_rec() and friends; remove this
+ if (ib_table.fts || innobase_fulltext_exist(altered_table))
+ return false;
+#endif
+#if 1 // MDEV-17468: fix bugs with indexed virtual columns & remove this
+ for (const dict_index_t* index = ib_table.indexes.start;
+ index; index = index->indexes.next) {
+ if (index->has_virtual()) {
+ ut_ad(ib_table.n_v_cols
+ || index->is_corrupted());
+ return false;
+ }
+ }
+#endif
+ uint n_add = 0, n_nullable = 0, lenlen = 0;
+ const uint blob_prefix = dict_table_has_atomic_blobs(&ib_table)
+ ? 0
+ : REC_ANTELOPE_MAX_INDEX_COL_LEN;
+ const uint min_local_len = blob_prefix
+ ? blob_prefix + FIELD_REF_SIZE
+ : 2 * FIELD_REF_SIZE;
+ size_t min_size = 0, max_size = 0;
+ Field** af = altered_table->field;
+ Field** const end = altered_table->field
+ + altered_table->s->fields;
+ List_iterator_fast<Create_field> cf_it(
+ ha_alter_info->alter_info->create_list);
+
+ for (; af < end; af++) {
+ const Create_field* cf = cf_it++;
+ if (!(*af)->stored_in_db() || cf->field) {
+ /* Virtual or pre-existing column */
+ continue;
+ }
+ const bool nullable = (*af)->real_maybe_null();
+ const bool is_null = (*af)->is_real_null();
+ ut_ad(!is_null || nullable);
+ n_nullable += nullable;
+ n_add++;
+ uint l;
+ switch ((*af)->type()) {
+ case MYSQL_TYPE_VARCHAR:
+ l = reinterpret_cast<const Field_varstring*>
+ (*af)->get_length();
+ variable_length:
+ if (l >= min_local_len) {
+ max_size += blob_prefix
+ + FIELD_REF_SIZE;
+ if (!is_null) {
+ min_size += blob_prefix
+ + FIELD_REF_SIZE;
+ }
+ lenlen += 2;
+ } else {
+ if (!is_null) {
+ min_size += l;
+ }
+ l = (*af)->pack_length();
+ max_size += l;
+ lenlen += l > 255 ? 2 : 1;
+ }
+ break;
+ case MYSQL_TYPE_GEOMETRY:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ l = reinterpret_cast<const Field_blob*>
+ ((*af))->get_length();
+ goto variable_length;
+ default:
+ l = (*af)->pack_length();
+ if (l > 255 && ib_table.not_redundant()) {
+ goto variable_length;
+ }
+ max_size += l;
+ if (!is_null) {
+ min_size += l;
+ }
+ }
+ }
+
+ ulint n_fields = pk->n_fields + n_add;
+
+ if (n_fields >= REC_MAX_N_USER_FIELDS + DATA_N_SYS_COLS) {
+ return false;
+ }
+
+ if (pk->is_gen_clust()) {
+ min_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN
+ + DATA_ROW_ID_LEN;
+ max_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN
+ + DATA_ROW_ID_LEN;
+ } else {
+ min_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ max_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ }
+
+ uint i = pk->n_fields;
+ while (i-- > pk->n_core_fields) {
+ const dict_field_t& f = pk->fields[i];
+ if (f.col->is_nullable()) {
+ n_nullable++;
+ if (!f.col->is_dropped()
+ && f.col->def_val.data) {
+ goto instantly_added_column;
+ }
+ } else if (f.fixed_len
+ && (f.fixed_len <= 255
+ || !ib_table.not_redundant())) {
+ if (ib_table.not_redundant()
+ || !f.col->is_dropped()) {
+ min_size += f.fixed_len;
+ max_size += f.fixed_len;
+ }
+ } else if (f.col->is_dropped() || !f.col->is_added()) {
+ lenlen++;
+ goto set_max_size;
+ } else {
+instantly_added_column:
+ ut_ad(f.col->is_added());
+ if (f.col->def_val.len >= min_local_len) {
+ min_size += blob_prefix
+ + FIELD_REF_SIZE;
+ lenlen += 2;
+ } else {
+ min_size += f.col->def_val.len;
+ lenlen += f.col->def_val.len
+ > 255 ? 2 : 1;
+ }
+set_max_size:
+ if (f.fixed_len
+ && (f.fixed_len <= 255
+ || !ib_table.not_redundant())) {
+ max_size += f.fixed_len;
+ } else if (f.col->len >= min_local_len) {
+ max_size += blob_prefix
+ + FIELD_REF_SIZE;
+ } else {
+ max_size += f.col->len;
+ }
+ }
+ }
+
+ do {
+ const dict_field_t& f = pk->fields[i];
+ if (f.col->is_nullable()) {
+ n_nullable++;
+ } else if (f.fixed_len) {
+ min_size += f.fixed_len;
+ } else {
+ lenlen++;
+ }
+ } while (i--);
+
+ if (ib_table.instant
+ || (ha_alter_info->handler_flags
+ & (ALTER_STORED_COLUMN_ORDER
+ | ALTER_DROP_STORED_COLUMN))) {
+ n_fields++;
+ lenlen += 2;
+ min_size += FIELD_REF_SIZE;
+ }
+
+ if (ib_table.not_redundant()) {
+ min_size += REC_N_NEW_EXTRA_BYTES
+ + UT_BITS_IN_BYTES(n_nullable)
+ + lenlen;
+ } else {
+ min_size += (n_fields > 255 || min_size > 255)
+ ? n_fields * 2 : n_fields;
+ min_size += REC_N_OLD_EXTRA_BYTES;
+ }
+
+ if (page_zip_rec_needs_ext(min_size, ib_table.not_redundant(),
+ 0, 0)) {
+ return false;
+ }
+
+ if (strict && page_zip_rec_needs_ext(max_size,
+ ib_table.not_redundant(),
+ 0, 0)) {
+ return false;
+ }
+ }
+	// Making a table system-versioned instantly is not implemented yet.
+ if (ha_alter_info->handler_flags & ALTER_ADD_SYSTEM_VERSIONING) {
+ return false;
+ }
+
+ static constexpr alter_table_operations avoid_rebuild
+ = ALTER_ADD_STORED_BASE_COLUMN
+ | ALTER_DROP_STORED_COLUMN
+ | ALTER_STORED_COLUMN_ORDER
+ | ALTER_COLUMN_NULLABLE;
+
+ if (!(ha_alter_info->handler_flags & avoid_rebuild)) {
+ alter_table_operations flags = ha_alter_info->handler_flags
+ & ~avoid_rebuild;
+ /* None of the flags are set that we can handle
+ specially to avoid rebuild. In this case, we can
+ allow ALGORITHM=INSTANT, except if some requested
+ operation requires that the table be rebuilt. */
+ if (flags & INNOBASE_ALTER_REBUILD) {
+ return false;
+ }
+ if ((flags & ALTER_OPTIONS)
+ && alter_options_need_rebuild(ha_alter_info, table)) {
+ return false;
+ }
+ } else if (!ib_table.supports_instant()) {
+ return false;
+ }
+
+ /* At the moment, we disallow ADD [UNIQUE] INDEX together with
+ instant ADD COLUMN.
+
+ The main reason is that the work of instant ADD must be done
+ in commit_inplace_alter_table(). For the rollback_instant()
+ to work, we must add the columns to dict_table_t beforehand,
+ and roll back those changes in case the transaction is rolled
+ back.
+
+ If we added the columns to the dictionary cache already in the
+ prepare_inplace_alter_table(), we would have to deal with
+ column number mismatch in ha_innobase::open(), write_row() and
+ other functions. */
+
+ /* FIXME: allow instant ADD COLUMN together with
+ INNOBASE_ONLINE_CREATE (ADD [UNIQUE] INDEX) on pre-existing
+ columns. */
+ if (ha_alter_info->handler_flags
+ & ((INNOBASE_ALTER_REBUILD | INNOBASE_ONLINE_CREATE)
+ & ~ALTER_DROP_STORED_COLUMN
+ & ~ALTER_STORED_COLUMN_ORDER
+ & ~ALTER_ADD_STORED_BASE_COLUMN
+ & ~ALTER_COLUMN_NULLABLE
+ & ~ALTER_OPTIONS)) {
+ return false;
+ }
+
+ if ((ha_alter_info->handler_flags & ALTER_OPTIONS)
+ && alter_options_need_rebuild(ha_alter_info, table)) {
+ return false;
+ }
+
+ if (ha_alter_info->handler_flags & ALTER_COLUMN_NULLABLE) {
+ if (ib_table.not_redundant()) {
+ /* Instantaneous removal of NOT NULL is
+ only supported for ROW_FORMAT=REDUNDANT. */
+ return false;
+ }
+ if (ib_table.fts_doc_id_index
+ && !innobase_fulltext_exist(altered_table)) {
+ /* Removing hidden FTS_DOC_ID_INDEX(FTS_DOC_ID)
+ requires that the table be rebuilt. */
+ return false;
+ }
+
+ Field** af = altered_table->field;
+ Field** const end = altered_table->field
+ + altered_table->s->fields;
+ for (unsigned c = 0; af < end; af++) {
+ if (!(*af)->stored_in_db()) {
+ continue;
+ }
+
+ const dict_col_t* col = dict_table_get_nth_col(
+ &ib_table, c++);
+
+ if (!col->ord_part || col->is_nullable()
+ || !(*af)->real_maybe_null()) {
+ continue;
+ }
+
+ /* The column would be changed from NOT NULL.
+ Ensure that it is not a clustered index key. */
+ for (auto i = pk->n_uniq; i--; ) {
+ if (pk->fields[i].col == col) {
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
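+
+/* Example (illustrative): ALTER TABLE t MODIFY c INT NULL can be
+instant only if t is ROW_FORMAT=REDUNDANT; that format stores an
+offset (with a NULL flag bit) for every field of every record, so
+relaxing NOT NULL does not change how existing records are parsed,
+whereas COMPACT or DYNAMIC would need a wider NULL bitmap. */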
+
+/** Check whether the field has a non-constant default value.
+@param[in]	field	field which could be added or changed
+@return true if a non-constant default is present. */
+static bool is_non_const_value(Field* field)
+{
+ return field->default_value
+ && field->default_value->flags
+ & uint(~(VCOL_SESSION_FUNC | VCOL_TIME_FUNC));
+}
+
+/** Set default value for the field.
+@param[in] field field which could be added or changed
+@return true if the default value is set. */
+static bool set_default_value(Field* field)
+{
+ /* The added/changed NOT NULL column lacks a DEFAULT value,
+ or the DEFAULT is the same for all rows.
+ (Time functions, such as CURRENT_TIMESTAMP(),
+ are evaluated from a timestamp that is assigned
+ at the start of the statement. Session
+ functions, such as USER(), always evaluate the
+ same within a statement.) */
+
+ ut_ad(!is_non_const_value(field));
+
+ /* Compute the DEFAULT values of non-constant columns
+ (VCOL_SESSION_FUNC | VCOL_TIME_FUNC). */
+ switch (field->set_default()) {
+ case 0: /* OK */
+ case 3: /* DATETIME to TIME or DATE conversion */
+ return true;
+ case -1: /* OOM, or GEOMETRY type mismatch */
+ case 1: /* A number adjusted to the min/max value */
+ case 2: /* String truncation, or conversion problem */
+ break;
+ }
+
+ return false;
+}
+
+/** Check whether the table has the FTS_DOC_ID column
+@param[in] table InnoDB table with fulltext index
+@param[in] altered_table MySQL table with fulltext index
+@param[out] fts_doc_col_no The column number for Doc ID,
+			or ULINT_UNDEFINED if it is of the wrong type
+@param[out]	num_v		Number of virtual columns
+@param[in]	check_only	check only whether an FTS_DOC_ID column exists
+@return whether there exists an FTS_DOC_ID column */
+static
+bool
+innobase_fts_check_doc_id_col(
+ const dict_table_t* table,
+ const TABLE* altered_table,
+ ulint* fts_doc_col_no,
+ ulint* num_v,
+ bool check_only=false)
+{
+ *fts_doc_col_no = ULINT_UNDEFINED;
+
+ const uint n_cols = altered_table->s->fields;
+ ulint i;
+ int err = 0;
+ *num_v = 0;
+
+ for (i = 0; i < n_cols; i++) {
+ const Field* field = altered_table->field[i];
+
+ if (!field->stored_in_db()) {
+ (*num_v)++;
+ }
+
+ if (my_strcasecmp(system_charset_info,
+ field->field_name.str, FTS_DOC_ID_COL_NAME)) {
+ continue;
+ }
+
+ if (strcmp(field->field_name.str, FTS_DOC_ID_COL_NAME)) {
+ err = ER_WRONG_COLUMN_NAME;
+ } else if (field->type() != MYSQL_TYPE_LONGLONG
+ || field->pack_length() != 8
+ || field->real_maybe_null()
+ || !(field->flags & UNSIGNED_FLAG)
+ || !field->stored_in_db()) {
+ err = ER_INNODB_FT_WRONG_DOCID_COLUMN;
+ } else {
+ *fts_doc_col_no = i - *num_v;
+ }
+
+ if (err && !check_only) {
+ my_error(err, MYF(0), field->field_name.str);
+ }
+
+ return(true);
+ }
+
+ if (!table) {
+ return(false);
+ }
+
+	/* Exclude the virtual columns from the count. */
+ i -= *num_v;
+
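+	/* Scan the columns that exist only in the InnoDB dictionary
+	(beyond the .frm columns, before the DATA_N_SYS_COLS system
+	columns) for a hidden FTS_DOC_ID column. */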
+ for (; i + DATA_N_SYS_COLS < (uint) table->n_cols; i++) {
+ const char* name = dict_table_get_col_name(table, i);
+
+ if (strcmp(name, FTS_DOC_ID_COL_NAME) == 0) {
+#ifdef UNIV_DEBUG
+ const dict_col_t* col;
+
+ col = dict_table_get_nth_col(table, i);
+
+ /* Because the FTS_DOC_ID does not exist in
+ the MySQL data dictionary, this must be the
+ internally created FTS_DOC_ID column. */
+ ut_ad(col->mtype == DATA_INT);
+ ut_ad(col->len == 8);
+ ut_ad(col->prtype & DATA_NOT_NULL);
+ ut_ad(col->prtype & DATA_UNSIGNED);
+#endif /* UNIV_DEBUG */
+ *fts_doc_col_no = i;
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Check whether the table is empty.
+@param[in] table table to be checked
+@return true if table is empty */
+static bool innobase_table_is_empty(const dict_table_t *table)
+{
+ dict_index_t *clust_index= dict_table_get_first_index(table);
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ buf_block_t *block;
+ page_cur_t *cur;
+ const rec_t *rec;
+ bool next_page= false;
+
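+  /* Scan the leaf pages of the clustered index from the left.
+  The table is empty if every user record is delete-marked. */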
+ mtr.start();
+ btr_pcur_open_at_index_side(true, clust_index, BTR_SEARCH_LEAF,
+ &pcur, true, 0, &mtr);
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
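+  /* If the first user record is the hidden metadata record of
+  instant ALTER TABLE, keep the cursor on it so that the scan below
+  will start after it; otherwise, step back to the page infimum so
+  that the scan starts from the first record. */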
+ if (!rec_is_metadata(btr_pcur_get_rec(&pcur), *clust_index))
+ btr_pcur_move_to_prev_on_page(&pcur);
+scan_leaf:
+ cur= btr_pcur_get_page_cur(&pcur);
+ page_cur_move_to_next(cur);
+next_page:
+ if (next_page)
+ {
+ uint32_t next_page_no= btr_page_get_next(page_cur_get_page(cur));
+ if (next_page_no == FIL_NULL)
+ {
+ mtr.commit();
+ return true;
+ }
+
+ next_page= false;
+    block= btr_block_get(*clust_index, next_page_no, BTR_SEARCH_LEAF, false,
+ &mtr);
+ btr_leaf_page_release(page_cur_get_block(cur), BTR_SEARCH_LEAF, &mtr);
+ page_cur_set_before_first(block, cur);
+ page_cur_move_to_next(cur);
+ }
+
+ rec= page_cur_get_rec(cur);
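+  /* Delete-marked records do not count as user data; skip them. */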
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(table)));
+ else if (!page_rec_is_supremum(rec))
+ {
+ mtr.commit();
+ return false;
+ }
+ else
+ {
+ next_page= true;
+ goto next_page;
+ }
+ goto scan_leaf;
+}
+
+/** Check if InnoDB supports a particular alter table in-place
+@param altered_table TABLE object for new version of table.
+@param ha_alter_info Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+
+@retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported
+@retval HA_ALTER_INPLACE_INSTANT
+MDL_EXCLUSIVE is needed for executing prepare_inplace_alter_table()
+and commit_inplace_alter_table(). inplace_alter_table() will not be called.
+@retval HA_ALTER_INPLACE_COPY_NO_LOCK
+MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to
+LOCK=NONE for rebuilding the table in inplace_alter_table()
+@retval HA_ALTER_INPLACE_COPY_LOCK
+MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to
+LOCK=SHARED for rebuilding the table in inplace_alter_table()
+@retval HA_ALTER_INPLACE_NOCOPY_NO_LOCK
+MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to
+LOCK=NONE for inplace_alter_table() which will not rebuild the table
+@retval HA_ALTER_INPLACE_NOCOPY_LOCK
+MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to
+LOCK=SHARED for inplace_alter_table() which will not rebuild the table
+*/
+
+enum_alter_inplace_result
+ha_innobase::check_if_supported_inplace_alter(
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info)
+{
+ DBUG_ENTER("check_if_supported_inplace_alter");
+
+ if ((ha_alter_info->handler_flags
+ & INNOBASE_ALTER_VERSIONED_REBUILD)
+ && altered_table->versioned(VERS_TIMESTAMP)) {
+ ha_alter_info->unsupported_reason =
+ "Not implemented for system-versioned timestamp tables";
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ /* Before 10.2.2 information about virtual columns was not stored in
+ system tables. We need to do a full alter to rebuild proper 10.2.2+
+ metadata with the information about virtual columns */
+ if (omits_virtual_cols(*table_share)) {
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ if (high_level_read_only) {
+ ha_alter_info->unsupported_reason =
+ my_get_err_msg(ER_READ_ONLY_MODE);
+
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ if (altered_table->s->fields > REC_MAX_N_USER_FIELDS) {
+ /* Deny the inplace ALTER TABLE. MySQL will try to
+ re-create the table and ha_innobase::create() will
+ return an error too. This is how we effectively
+ deny adding too many columns to a table. */
+ ha_alter_info->unsupported_reason =
+ my_get_err_msg(ER_TOO_MANY_FIELDS);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ update_thd();
+
+ if (ha_alter_info->handler_flags
+ & ~(INNOBASE_INPLACE_IGNORE
+ | INNOBASE_ALTER_INSTANT
+ | INNOBASE_ALTER_NOREBUILD
+ | INNOBASE_ALTER_REBUILD)) {
+
+ if (ha_alter_info->handler_flags
+ & ALTER_STORED_COLUMN_TYPE) {
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_COLUMN_TYPE);
+ }
+
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+	/* Online ADD FOREIGN KEY is only supported when
+	check_foreigns is turned off. */
+ if ((ha_alter_info->handler_flags & ALTER_ADD_FOREIGN_KEY)
+ && m_prebuilt->trx->check_foreigns) {
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FK_CHECK);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ const char* reason_rebuild = NULL;
+
+ switch (innodb_instant_alter_column_allowed) {
+ case 0: /* never */
+ if ((ha_alter_info->handler_flags
+ & (ALTER_ADD_STORED_BASE_COLUMN
+ | ALTER_STORED_COLUMN_ORDER
+ | ALTER_DROP_STORED_COLUMN))
+ || m_prebuilt->table->is_instant()) {
+ reason_rebuild =
+ "innodb_instant_alter_column_allowed=never";
+innodb_instant_alter_column_allowed_reason:
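+			/* If the operation already recreates the
+			table, there is nothing to report; otherwise,
+			force a rebuild and remember the reason for
+			refusing ALGORITHM=INSTANT. */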
+ if (ha_alter_info->handler_flags
+ & ALTER_RECREATE_TABLE) {
+ reason_rebuild = NULL;
+ } else {
+ ha_alter_info->handler_flags
+ |= ALTER_RECREATE_TABLE;
+ ha_alter_info->unsupported_reason
+ = reason_rebuild;
+ }
+ }
+ break;
+ case 1: /* add_last */
+ if ((ha_alter_info->handler_flags
+ & (ALTER_STORED_COLUMN_ORDER | ALTER_DROP_STORED_COLUMN))
+ || m_prebuilt->table->instant) {
+			reason_rebuild = "innodb_instant_alter_column_allowed="
+ "add_last";
+ goto innodb_instant_alter_column_allowed_reason;
+ }
+ }
+
+ switch (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) {
+ case ALTER_OPTIONS:
+ if (alter_options_need_rebuild(ha_alter_info, table)) {
+ reason_rebuild = my_get_err_msg(
+ ER_ALTER_OPERATION_TABLE_OPTIONS_NEED_REBUILD);
+ ha_alter_info->unsupported_reason = reason_rebuild;
+ break;
+ }
+ /* fall through */
+ case 0:
+ DBUG_RETURN(HA_ALTER_INPLACE_INSTANT);
+ }
+
+ /* InnoDB cannot IGNORE when creating unique indexes. IGNORE
+ should silently delete some duplicate rows. Our inplace_alter
+ code will not delete anything from existing indexes. */
+ if (ha_alter_info->ignore
+ && (ha_alter_info->handler_flags
+ & (ALTER_ADD_PK_INDEX | ALTER_ADD_UNIQUE_INDEX))) {
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_IGNORE);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ /* DROP PRIMARY KEY is only allowed in combination with ADD
+ PRIMARY KEY. */
+ if ((ha_alter_info->handler_flags
+ & (ALTER_ADD_PK_INDEX | ALTER_DROP_PK_INDEX))
+ == ALTER_DROP_PK_INDEX) {
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOPK);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ if (ha_alter_info->handler_flags & ALTER_COLUMN_NULLABLE) {
+ /* If a NOT NULL attribute is going to be removed and
+ a UNIQUE INDEX on the column had been promoted to an
+ implicit PRIMARY KEY, the table should be rebuilt by
+ ALGORITHM=COPY. (Theoretically, we could support
+ rebuilding by ALGORITHM=INPLACE if a PRIMARY KEY is
+ going to be added, either explicitly or by promoting
+ another UNIQUE KEY.) */
+ const uint my_primary_key = altered_table->s->primary_key;
+
+ if (UNIV_UNLIKELY(my_primary_key >= MAX_KEY)
+ && !dict_index_is_auto_gen_clust(
+ dict_table_get_first_index(m_prebuilt->table))) {
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_PRIMARY_CANT_HAVE_NULL);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+ }
+
+ /*
+ InnoDB in different MariaDB versions was generating different mtype
+ codes for certain types. In some cases the signed/unsigned bit was
+ generated differently too.
+
+ Inplace ALTER would change the mtype/unsigned_flag (to what the
+ current code generates) without changing the underlying data
+	representation, and it might result in data corruption.
+
+ Don't do inplace ALTER if mtype/unsigned_flag are wrong.
+ */
+ for (ulint i = 0, icol= 0; i < table->s->fields; i++) {
+ const Field* field = table->field[i];
+ const dict_col_t* col = dict_table_get_nth_col(
+ m_prebuilt->table, icol);
+ unsigned unsigned_flag;
+
+ if (!field->stored_in_db()) {
+ continue;
+ }
+
+ icol++;
+
+ if (col->mtype != get_innobase_type_from_mysql_type(
+ &unsigned_flag, field)) {
+
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ if ((col->prtype & DATA_UNSIGNED) != unsigned_flag) {
+
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+ }
+
+ ulint n_indexes = UT_LIST_GET_LEN((m_prebuilt->table)->indexes);
+
+	/* If the InnoDB dictionary and the MySQL .frm file are not
+	consistent, use the "Copy" method. */
+ if (m_prebuilt->table->dict_frm_mismatch) {
+
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_NO_SUCH_INDEX);
+ ib_push_frm_error(m_user_thd, m_prebuilt->table, altered_table,
+ n_indexes, true);
+
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+	/* A '0000-00-00' value is not allowed for the DATETIME type
+	in a newly added column when the table is not empty. */
+ if (ha_alter_info->error_if_not_empty
+ && !innobase_table_is_empty(m_prebuilt->table)) {
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ const bool add_drop_v_cols = !!(ha_alter_info->handler_flags
+ & (ALTER_ADD_VIRTUAL_COLUMN
+ | ALTER_DROP_VIRTUAL_COLUMN
+ | ALTER_VIRTUAL_COLUMN_ORDER));
+
+ /* We should be able to do the operation in-place.
+ See if we can do it online (LOCK=NONE) or without rebuild. */
+ bool online = true, need_rebuild = false;
+ const uint fulltext_indexes = innobase_fulltext_exist(altered_table);
+
+ /* Fix the key parts. */
+ for (KEY* new_key = ha_alter_info->key_info_buffer;
+ new_key < ha_alter_info->key_info_buffer
+ + ha_alter_info->key_count;
+ new_key++) {
+
+		/* Do not support adding/dropping a virtual column while
+		a table rebuild is caused by adding a new FTS_DOC_ID. */
+ if ((new_key->flags & HA_FULLTEXT) && add_drop_v_cols
+ && !DICT_TF2_FLAG_IS_SET(m_prebuilt->table,
+ DICT_TF2_FTS_HAS_DOC_ID)) {
+ ha_alter_info->unsupported_reason =
+ MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN;
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ for (KEY_PART_INFO* key_part = new_key->key_part;
+ key_part < (new_key->key_part
+ + new_key->user_defined_key_parts);
+ key_part++) {
+ DBUG_ASSERT(key_part->fieldnr
+ < altered_table->s->fields);
+
+ const Create_field* new_field
+ = ha_alter_info->alter_info->create_list.elem(
+ key_part->fieldnr);
+
+ DBUG_ASSERT(new_field);
+
+ key_part->field = altered_table->field[
+ key_part->fieldnr];
+
+ /* In some special cases InnoDB emits "false"
+ duplicate key errors with NULL key values. Let
+ us play safe and ensure that we can correctly
+ print key values even in such cases. */
+ key_part->null_offset = key_part->field->null_offset();
+ key_part->null_bit = key_part->field->null_bit;
+
+ if (new_field->field) {
+ /* This is an existing column. */
+ continue;
+ }
+
+ /* This is an added column. */
+ DBUG_ASSERT(ha_alter_info->handler_flags
+ & ALTER_ADD_COLUMN);
+
+ /* We cannot replace a hidden FTS_DOC_ID
+ with a user-visible FTS_DOC_ID. */
+ if (fulltext_indexes && m_prebuilt->table->fts
+ && !my_strcasecmp(
+ system_charset_info,
+ key_part->field->field_name.str,
+ FTS_DOC_ID_COL_NAME)) {
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_HIDDEN_FTS);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ DBUG_ASSERT((MTYP_TYPENR(key_part->field->unireg_check)
+ == Field::NEXT_NUMBER)
+ == !!(key_part->field->flags
+ & AUTO_INCREMENT_FLAG));
+
+ if (key_part->field->flags & AUTO_INCREMENT_FLAG) {
+ /* We cannot assign AUTO_INCREMENT values
+ during online or instant ALTER. */
+ DBUG_ASSERT(key_part->field == altered_table
+ -> found_next_number_field);
+
+ if (ha_alter_info->online) {
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_AUTOINC);
+ }
+
+ online = false;
+ need_rebuild = true;
+ }
+
+ if (!key_part->field->stored_in_db()) {
+				/* Do not support adding an index on a
+				newly added virtual column while a
+				virtual column is also being dropped in
+				the same clause. */
+ if (ha_alter_info->handler_flags
+ & ALTER_DROP_VIRTUAL_COLUMN) {
+ ha_alter_info->unsupported_reason =
+ MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN;
+
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ if (ha_alter_info->online
+ && !ha_alter_info->unsupported_reason) {
+ ha_alter_info->unsupported_reason =
+ MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN;
+ }
+
+ online = false;
+ }
+ }
+ }
+
+ DBUG_ASSERT(!m_prebuilt->table->fts
+ || (m_prebuilt->table->fts->doc_col <= table->s->fields));
+
+ DBUG_ASSERT(!m_prebuilt->table->fts
+ || (m_prebuilt->table->fts->doc_col
+ < dict_table_get_n_user_cols(m_prebuilt->table)));
+
+ if (fulltext_indexes && m_prebuilt->table->fts) {
+ /* FULLTEXT indexes are supposed to remain. */
+ /* Disallow DROP INDEX FTS_DOC_ID_INDEX */
+
+ for (uint i = 0; i < ha_alter_info->index_drop_count; i++) {
+ if (!my_strcasecmp(
+ system_charset_info,
+ ha_alter_info->index_drop_buffer[i]->name.str,
+ FTS_DOC_ID_INDEX_NAME)) {
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+ }
+
+ /* InnoDB can have a hidden FTS_DOC_ID_INDEX on a
+ visible FTS_DOC_ID column as well. Prevent dropping or
+ renaming the FTS_DOC_ID. */
+
+ for (Field** fp = table->field; *fp; fp++) {
+ if (!((*fp)->flags
+ & (FIELD_IS_RENAMED | FIELD_IS_DROPPED))) {
+ continue;
+ }
+
+ if (!my_strcasecmp(
+ system_charset_info,
+ (*fp)->field_name.str,
+ FTS_DOC_ID_COL_NAME)) {
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+ }
+ }
+
+ m_prebuilt->trx->will_lock = true;
+
+ /* When changing a NULL column to NOT NULL and specifying a
+ DEFAULT value, ensure that the DEFAULT expression is a constant.
+ Also, in ADD COLUMN, for now we only support a
+ constant DEFAULT expression. */
+ Field **af = altered_table->field;
+ bool fts_need_rebuild = false;
+ need_rebuild = need_rebuild
+ || innobase_need_rebuild(ha_alter_info, table);
+
+ for (Create_field& cf : ha_alter_info->alter_info->create_list) {
+ DBUG_ASSERT(cf.field
+ || (ha_alter_info->handler_flags
+ & ALTER_ADD_COLUMN));
+
+ if (const Field* f = cf.field) {
+ /* An AUTO_INCREMENT attribute can only
+ be added to an existing column by ALGORITHM=COPY,
+ but we can remove the attribute. */
+ ut_ad((MTYP_TYPENR((*af)->unireg_check)
+ != Field::NEXT_NUMBER)
+ || (MTYP_TYPENR(f->unireg_check)
+ == Field::NEXT_NUMBER));
+ if (!f->real_maybe_null() || (*af)->real_maybe_null())
+ goto next_column;
+ /* We are changing an existing column
+ from NULL to NOT NULL. */
+ DBUG_ASSERT(ha_alter_info->handler_flags
+ & ALTER_COLUMN_NOT_NULLABLE);
+ /* Virtual columns are never NOT NULL. */
+ DBUG_ASSERT(f->stored_in_db());
+ switch ((*af)->type()) {
+ case MYSQL_TYPE_TIMESTAMP:
+ case MYSQL_TYPE_TIMESTAMP2:
+ /* Inserting NULL into a TIMESTAMP column
+ would cause the DEFAULT value to be
+ replaced. Ensure that the DEFAULT
+ expression is not changing during
+ ALTER TABLE. */
+ if (!(*af)->default_value
+ && (*af)->is_real_null()) {
+ /* No DEFAULT value is
+ specified. We can report
+ errors for any NULL values for
+ the TIMESTAMP. */
+ goto next_column;
+ }
+ break;
+ default:
+ /* For any other data type, NULL
+ values are not converted. */
+ goto next_column;
+ }
+
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOT_NULL);
+ } else if (!is_non_const_value(*af)
+ && set_default_value(*af)) {
+ if (fulltext_indexes > 1
+ && !my_strcasecmp(system_charset_info,
+ (*af)->field_name.str,
+ FTS_DOC_ID_COL_NAME)) {
+ /* If a hidden FTS_DOC_ID column exists
+ (because of FULLTEXT INDEX), it cannot
+ be replaced with a user-created one
+ except when using ALGORITHM=COPY. */
+ ha_alter_info->unsupported_reason =
+ my_get_err_msg(ER_INNODB_FT_LIMIT);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+ goto next_column;
+ }
+
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+
+next_column:
+ af++;
+ }
+
+ const bool supports_instant = instant_alter_column_possible(
+ *m_prebuilt->table, ha_alter_info, table, altered_table,
+ is_innodb_strict_mode());
+ if (add_drop_v_cols) {
+ ulonglong flags = ha_alter_info->handler_flags;
+
+ /* TODO: uncomment the flags below, once we start to
+ support them */
+
+ flags &= ~(ALTER_ADD_VIRTUAL_COLUMN
+ | ALTER_DROP_VIRTUAL_COLUMN
+ | ALTER_VIRTUAL_COLUMN_ORDER
+ | ALTER_VIRTUAL_GCOL_EXPR
+ | ALTER_COLUMN_VCOL
+ /*
+ | ALTER_ADD_STORED_BASE_COLUMN
+ | ALTER_DROP_STORED_COLUMN
+ | ALTER_STORED_COLUMN_ORDER
+ | ALTER_ADD_UNIQUE_INDEX
+ */
+ | ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX
+ | ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX);
+ if (supports_instant) {
+ flags &= ~(ALTER_DROP_STORED_COLUMN
+#if 0 /* MDEV-17468: remove check_v_col_in_order() and fix the code */
+ | ALTER_ADD_STORED_BASE_COLUMN
+#endif
+ | ALTER_STORED_COLUMN_ORDER);
+ }
+ if (flags != 0
+ || IF_PARTITIONING((altered_table->s->partition_info_str
+ && altered_table->s->partition_info_str_len), 0)
+ || (!check_v_col_in_order(
+ this->table, altered_table, ha_alter_info))) {
+ ha_alter_info->unsupported_reason =
+ MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN;
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+ }
+
+ if (supports_instant && !(ha_alter_info->handler_flags
+ & INNOBASE_ALTER_NOREBUILD)) {
+ DBUG_RETURN(HA_ALTER_INPLACE_INSTANT);
+ }
+
+ if (need_rebuild
+ && (fulltext_indexes
+ || innobase_spatial_exist(altered_table)
+ || innobase_indexed_virtual_exist(altered_table))) {
+ /* If the table already contains fulltext indexes,
+ refuse to rebuild the table natively altogether. */
+ if (fulltext_indexes > 1) {
+cannot_create_many_fulltext_index:
+ ha_alter_info->unsupported_reason =
+ my_get_err_msg(ER_INNODB_FT_LIMIT);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ if (!online || !ha_alter_info->online
+ || ha_alter_info->unsupported_reason != reason_rebuild) {
+			/* Either LOCK=NONE was not requested, or we already
+			gave a specific reason to refuse it. */
+ } else if (fulltext_indexes) {
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS);
+ } else if (innobase_spatial_exist(altered_table)) {
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_GIS);
+ } else {
+ /* MDEV-14341 FIXME: Remove this limitation. */
+ ha_alter_info->unsupported_reason =
+ "online rebuild with indexed virtual columns";
+ }
+
+ online = false;
+ }
+
+ if (ha_alter_info->handler_flags
+ & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) {
+ /* ADD FULLTEXT|SPATIAL INDEX requires a lock.
+
+ We could do ADD FULLTEXT INDEX without a lock if the
+ table already contains an FTS_DOC_ID column, but in
+ that case we would have to apply the modification log
+ to the full-text indexes.
+
+ We could also do ADD SPATIAL INDEX by implementing
+ row_log_apply() for it. */
+ bool add_fulltext = false;
+
+ for (uint i = 0; i < ha_alter_info->index_add_count; i++) {
+ const KEY* key =
+ &ha_alter_info->key_info_buffer[
+ ha_alter_info->index_add_buffer[i]];
+ if (key->flags & HA_FULLTEXT) {
+ DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK
+ & ~(HA_FULLTEXT
+ | HA_PACK_KEY
+ | HA_GENERATED_KEY
+ | HA_BINARY_PACK_KEY)));
+ if (add_fulltext) {
+ goto cannot_create_many_fulltext_index;
+ }
+
+ add_fulltext = true;
+ if (ha_alter_info->online
+ && !ha_alter_info->unsupported_reason) {
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS);
+ }
+
+ online = false;
+
+				/* A fulltext index exists; check
+				whether the table already has a DOC ID
+				column. If not, InnoDB has to rebuild
+				the table to add the hidden Doc ID
+				column and change the primary index. */
+ ulint fts_doc_col_no;
+ ulint num_v = 0;
+
+ fts_need_rebuild =
+ !innobase_fts_check_doc_id_col(
+ m_prebuilt->table,
+ altered_table,
+ &fts_doc_col_no, &num_v, true);
+ }
+
+ if (online && (key->flags & HA_SPATIAL)) {
+
+ if (ha_alter_info->online) {
+ ha_alter_info->unsupported_reason = my_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_GIS);
+ }
+
+ online = false;
+ }
+ }
+ }
+
+ // FIXME: implement Online DDL for system-versioned operations
+ if (ha_alter_info->handler_flags & INNOBASE_ALTER_VERSIONED_REBUILD) {
+
+ if (ha_alter_info->online) {
+ ha_alter_info->unsupported_reason =
+ "Not implemented for system-versioned operations";
+ }
+
+ online = false;
+ }
+
+ if ((need_rebuild && !supports_instant) || fts_need_rebuild) {
+ ha_alter_info->handler_flags |= ALTER_RECREATE_TABLE;
+ DBUG_RETURN(online
+ ? HA_ALTER_INPLACE_COPY_NO_LOCK
+ : HA_ALTER_INPLACE_COPY_LOCK);
+ }
+
+ if (ha_alter_info->unsupported_reason) {
+ } else if (ha_alter_info->handler_flags & INNOBASE_ONLINE_CREATE) {
+ ha_alter_info->unsupported_reason = "ADD INDEX";
+ } else {
+ ha_alter_info->unsupported_reason = "DROP INDEX";
+ }
+
+ DBUG_RETURN(online
+ ? HA_ALTER_INPLACE_NOCOPY_NO_LOCK
+ : HA_ALTER_INPLACE_NOCOPY_LOCK);
+}
+
+/*************************************************************//**
+Initialize the dict_foreign_t structure with supplied info
+@return true if added, false if duplicate foreign->id */
+static MY_ATTRIBUTE((nonnull(1,3,5,7)))
+bool
+innobase_init_foreign(
+/*==================*/
+ dict_foreign_t* foreign, /*!< in/out: structure to
+ initialize */
+ const char* constraint_name, /*!< in/out: constraint name if
+ exists */
+ dict_table_t* table, /*!< in: foreign table */
+ dict_index_t* index, /*!< in: foreign key index */
+ const char** column_names, /*!< in: foreign key column
+ names */
+ ulint num_field, /*!< in: number of columns */
+ const char* referenced_table_name, /*!< in: referenced table
+ name */
+ dict_table_t* referenced_table, /*!< in: referenced table */
+ dict_index_t* referenced_index, /*!< in: referenced index */
+ const char** referenced_column_names,/*!< in: referenced column
+ names */
+ ulint referenced_num_field) /*!< in: number of referenced
+ columns */
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ if (constraint_name) {
+ ulint db_len;
+
+ /* Catenate 'databasename/' to the constraint name specified
+ by the user: we conceive the constraint as belonging to the
+		same MySQL 'database' as the table itself. We store the name
+		in foreign->id. */
+
+ db_len = dict_get_db_name_len(table->name.m_name);
+
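+		/* Allocate db_len + strlen(constraint_name) + 2 bytes:
+		one byte for the '/' separator and one for the
+		terminating NUL. */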
+ foreign->id = static_cast<char*>(mem_heap_alloc(
+ foreign->heap, db_len + strlen(constraint_name) + 2));
+
+ memcpy(foreign->id, table->name.m_name, db_len);
+ foreign->id[db_len] = '/';
+ strcpy(foreign->id + db_len + 1, constraint_name);
+
+ /* Check if any existing foreign key has the same id,
+ this is needed only if user supplies the constraint name */
+
+ if (table->foreign_set.find(foreign)
+ != table->foreign_set.end()) {
+ return(false);
+ }
+ }
+
+ foreign->foreign_table = table;
+ foreign->foreign_table_name = mem_heap_strdup(
+ foreign->heap, table->name.m_name);
+ dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+
+ foreign->foreign_index = index;
+ foreign->n_fields = static_cast<unsigned>(num_field)
+ & dict_index_t::MAX_N_FIELDS;
+
+ foreign->foreign_col_names = static_cast<const char**>(
+ mem_heap_alloc(foreign->heap, num_field * sizeof(void*)));
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ foreign->foreign_col_names[i] = mem_heap_strdup(
+ foreign->heap, column_names[i]);
+ }
+
+ foreign->referenced_index = referenced_index;
+ foreign->referenced_table = referenced_table;
+
+ foreign->referenced_table_name = mem_heap_strdup(
+ foreign->heap, referenced_table_name);
+ dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
+
+ foreign->referenced_col_names = static_cast<const char**>(
+ mem_heap_alloc(foreign->heap,
+ referenced_num_field * sizeof(void*)));
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ foreign->referenced_col_names[i]
+ = mem_heap_strdup(foreign->heap,
+ referenced_column_names[i]);
+ }
+
+ return(true);
+}
+
+/*************************************************************//**
+Check whether the foreign key options are valid
+@return true if they are */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_check_fk_option(
+/*=====================*/
+ const dict_foreign_t* foreign) /*!< in: foreign key */
+{
+ if (!foreign->foreign_index) {
+ return(true);
+ }
+
+ if (foreign->type & (DICT_FOREIGN_ON_UPDATE_SET_NULL
+ | DICT_FOREIGN_ON_DELETE_SET_NULL)) {
+
+ for (ulint j = 0; j < foreign->n_fields; j++) {
+ if ((dict_index_get_nth_col(
+ foreign->foreign_index, j)->prtype)
+ & DATA_NOT_NULL) {
+
+ /* It is not sensible to define
+ SET NULL if the column is not
+ allowed to be NULL! */
+ return(false);
+ }
+ }
+ }
+
+ return(true);
+}
+
+/*************************************************************//**
+Set foreign key options
+@return true if successfully set */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_set_foreign_key_option(
+/*============================*/
+	dict_foreign_t*	foreign,	/*!< in: InnoDB foreign key */
+ Foreign_key* fk_key) /*!< in: Foreign key info from
+ MySQL */
+{
+ ut_ad(!foreign->type);
+
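+	/* InnoDB maps RESTRICT, NO ACTION and SET DEFAULT to the same
+	behaviour (NO ACTION): SET DEFAULT is not implemented, and
+	foreign key checks are immediate, so RESTRICT and NO ACTION
+	are equivalent here. */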
+ switch (fk_key->delete_opt) {
+ case FK_OPTION_NO_ACTION:
+ case FK_OPTION_RESTRICT:
+ case FK_OPTION_SET_DEFAULT:
+ foreign->type = DICT_FOREIGN_ON_DELETE_NO_ACTION;
+ break;
+ case FK_OPTION_CASCADE:
+ foreign->type = DICT_FOREIGN_ON_DELETE_CASCADE;
+ break;
+ case FK_OPTION_SET_NULL:
+ foreign->type = DICT_FOREIGN_ON_DELETE_SET_NULL;
+ break;
+ case FK_OPTION_UNDEF:
+ break;
+ }
+
+ switch (fk_key->update_opt) {
+ case FK_OPTION_NO_ACTION:
+ case FK_OPTION_RESTRICT:
+ case FK_OPTION_SET_DEFAULT:
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION;
+ break;
+ case FK_OPTION_CASCADE:
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE;
+ break;
+ case FK_OPTION_SET_NULL:
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL;
+ break;
+ case FK_OPTION_UNDEF:
+ break;
+ }
+
+ return(innobase_check_fk_option(foreign));
+}
+
+/*******************************************************************//**
+Check if a foreign key constraint can make use of an index
+that is being created.
+@param[in] col_names column names
+@param[in] n_cols number of columns
+@param[in] keys index information
+@param[in] add indexes being created
+@return usable index, or NULL if none found */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+const KEY*
+innobase_find_equiv_index(
+ const char*const* col_names,
+ uint n_cols,
+ const KEY* keys,
+ span<uint> add)
+{
+ for (span<uint>::iterator it = add.begin(), end = add.end(); it != end;
+ ++it) {
+ const KEY* key = &keys[*it];
+
+ if (key->user_defined_key_parts < n_cols
+ || key->flags & HA_SPATIAL) {
+no_match:
+ continue;
+ }
+
+ for (uint j = 0; j < n_cols; j++) {
+ const KEY_PART_INFO& key_part = key->key_part[j];
+ uint32 col_len
+ = key_part.field->pack_length();
+
+			/* An index on virtual columns cannot be used
+			for a referential constraint. */
+ if (!key_part.field->stored_in_db()) {
+ goto no_match;
+ }
+
+			/* The MySQL pack length includes a 1- or 2-byte
+			length field for a true VARCHAR. */
+
+ if (key_part.field->type() == MYSQL_TYPE_VARCHAR) {
+ col_len -= static_cast<const Field_varstring*>(
+ key_part.field)->length_bytes;
+ }
+
+ if (key_part.length < col_len) {
+
+ /* Column prefix indexes cannot be
+ used for FOREIGN KEY constraints. */
+ goto no_match;
+ }
+
+ if (innobase_strcasecmp(col_names[j],
+ key_part.field->field_name.str)) {
+ /* Name mismatch */
+ goto no_match;
+ }
+ }
+
+ return(key);
+ }
+
+ return(NULL);
+}
+
+/*************************************************************//**
+Find an index whose first fields are the columns in the array
+in the same order and is not marked for deletion
+@return matching index, NULL if not found */
+static MY_ATTRIBUTE((nonnull(1,4), warn_unused_result))
+dict_index_t*
+innobase_find_fk_index(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ span<dict_index_t*> drop_index,
+ /*!< in: indexes to be dropped */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols) /*!< in: number of columns */
+{
+ dict_index_t* index;
+
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ if (dict_foreign_qualify_index(table, col_names, columns,
+ n_cols, index, NULL, true, 0,
+ NULL, NULL, NULL)
+ && std::find(drop_index.begin(), drop_index.end(), index)
+ == drop_index.end()) {
+ return index;
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ return(NULL);
+}
+
+/** Check whether the given column is a base column of a stored column.
+@param[in]	col_name	column name
+@param[in]	table		table
+@param[in]	s_cols		list of stored columns
+@return true if the given column is a base column of a stored column,
+else false. */
+static
+bool
+innobase_col_check_fk(
+ const char* col_name,
+ const dict_table_t* table,
+ dict_s_col_list* s_cols)
+{
+ dict_s_col_list::const_iterator it;
+
+ for (it = s_cols->begin(); it != s_cols->end(); ++it) {
+ for (ulint j = it->num_base; j--; ) {
+ if (!strcmp(col_name, dict_table_get_col_name(
+ table, it->base_col[j]->ind))) {
+ return(true);
+ }
+ }
+ }
+
+ return(false);
+}
+
+/** Check whether the foreign key constraint is defined on a base
+column of any stored column.
+@param[in]	foreign		foreign key constraint information
+@param[in]	table		table to which the foreign key objects
+are to be added
+@param[in] s_cols list of stored column information in the table.
+@return true if yes, otherwise false. */
+static
+bool
+innobase_check_fk_stored(
+ const dict_foreign_t* foreign,
+ const dict_table_t* table,
+ dict_s_col_list* s_cols)
+{
+ ulint type = foreign->type;
+
+ type &= ~(DICT_FOREIGN_ON_DELETE_NO_ACTION
+ | DICT_FOREIGN_ON_UPDATE_NO_ACTION);
+
+ if (type == 0 || s_cols == NULL) {
+ return(false);
+ }
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ if (innobase_col_check_fk(
+ foreign->foreign_col_names[i], table, s_cols)) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Create InnoDB foreign key structure from MySQL alter_info
+@param[in] ha_alter_info alter table info
+@param[in] table_share TABLE_SHARE
+@param[in] table table object
+@param[in] col_names column names, or NULL to use
+table->col_names
+@param[in] drop_index indexes to be dropped
+@param[in] n_drop_index size of drop_index
+@param[out] add_fk foreign constraint added
+@param[out] n_add_fk number of foreign constraints
+added
+@param[in] trx user transaction
+@param[in] s_cols list of stored column information
+@retval true if successful
+@retval false on error (will call my_error()) */
+static MY_ATTRIBUTE((nonnull(1,2,3,7,8), warn_unused_result))
+bool
+innobase_get_foreign_key_info(
+ Alter_inplace_info*
+ ha_alter_info,
+ const TABLE_SHARE*
+ table_share,
+ dict_table_t* table,
+ const char** col_names,
+ dict_index_t** drop_index,
+ ulint n_drop_index,
+ dict_foreign_t**add_fk,
+ ulint* n_add_fk,
+ const trx_t* trx,
+ dict_s_col_list*s_cols)
+{
+ dict_table_t* referenced_table = NULL;
+ char* referenced_table_name = NULL;
+ ulint num_fk = 0;
+ Alter_info* alter_info = ha_alter_info->alter_info;
+ const CHARSET_INFO* cs = thd_charset(trx->mysql_thd);
+
+ DBUG_ENTER("innobase_get_foreign_key_info");
+
+ *n_add_fk = 0;
+
+ for (Key& key : alter_info->key_list) {
+ if (key.type != Key::FOREIGN_KEY) {
+ continue;
+ }
+
+ const char* column_names[MAX_NUM_FK_COLUMNS];
+ dict_index_t* index = NULL;
+ const char* referenced_column_names[MAX_NUM_FK_COLUMNS];
+ dict_index_t* referenced_index = NULL;
+ ulint num_col = 0;
+ ulint referenced_num_col = 0;
+ bool correct_option;
+
+ Foreign_key* fk_key = static_cast<Foreign_key*>(&key);
+
+ if (fk_key->columns.elements > 0) {
+ ulint i = 0;
+
+ /* Get all the foreign key column info for the
+ current table */
+ for (const Key_part_spec& column : fk_key->columns) {
+ column_names[i] = column.field_name.str;
+ ut_ad(i < MAX_NUM_FK_COLUMNS);
+ i++;
+ }
+
+ index = innobase_find_fk_index(
+ table, col_names,
+ span<dict_index_t*>(drop_index, n_drop_index),
+ column_names, i);
+
+			/* MySQL would add an index to the creation
+			list if no such index exists for the foreign
+			table, so we have to use DBUG_EXECUTE_IF to
+			simulate the scenario. */
+ DBUG_EXECUTE_IF("innodb_test_no_foreign_idx",
+ index = NULL;);
+
+			/* Check whether such an index exists in
+			the index create clause. */
+ if (!index && !innobase_find_equiv_index(
+ column_names, static_cast<uint>(i),
+ ha_alter_info->key_info_buffer,
+ span<uint>(ha_alter_info->index_add_buffer,
+ ha_alter_info->index_add_count))) {
+ my_error(
+ ER_FK_NO_INDEX_CHILD,
+ MYF(0),
+ fk_key->name.str
+ ? fk_key->name.str : "",
+ table_share->table_name.str);
+ goto err_exit;
+ }
+
+ num_col = i;
+ }
+
+ add_fk[num_fk] = dict_mem_foreign_create();
+
+ mutex_enter(&dict_sys.mutex);
+
+ referenced_table_name = dict_get_referenced_table(
+ table->name.m_name,
+ LEX_STRING_WITH_LEN(fk_key->ref_db),
+ LEX_STRING_WITH_LEN(fk_key->ref_table),
+ &referenced_table,
+ add_fk[num_fk]->heap, cs);
+
+		/* Test the case when referenced_table failed to
+		open: if trx->check_foreigns is not set, we should
+		still be able to add the foreign key. */
+ DBUG_EXECUTE_IF("innodb_test_open_ref_fail",
+ referenced_table = NULL;);
+
+ if (!referenced_table && trx->check_foreigns) {
+ mutex_exit(&dict_sys.mutex);
+ my_error(ER_FK_CANNOT_OPEN_PARENT,
+ MYF(0), fk_key->ref_table.str);
+
+ goto err_exit;
+ }
+
+ if (fk_key->ref_columns.elements > 0) {
+ ulint i = 0;
+
+ for (Key_part_spec &column : fk_key->ref_columns) {
+ referenced_column_names[i] =
+ column.field_name.str;
+ ut_ad(i < MAX_NUM_FK_COLUMNS);
+ i++;
+ }
+
+ if (referenced_table) {
+ referenced_index =
+ dict_foreign_find_index(
+ referenced_table, 0,
+ referenced_column_names,
+ i, index,
+ TRUE, FALSE,
+ NULL, NULL, NULL);
+
+ DBUG_EXECUTE_IF(
+ "innodb_test_no_reference_idx",
+ referenced_index = NULL;);
+
+				/* Check whether such an index exists
+				in the index create clause. */
+ if (!referenced_index) {
+ mutex_exit(&dict_sys.mutex);
+ my_error(ER_FK_NO_INDEX_PARENT, MYF(0),
+ fk_key->name.str
+ ? fk_key->name.str : "",
+ fk_key->ref_table.str);
+ goto err_exit;
+ }
+ } else {
+ ut_a(!trx->check_foreigns);
+ }
+
+ referenced_num_col = i;
+ } else {
+ /* Not possible to add a foreign key without a
+ referenced column */
+ mutex_exit(&dict_sys.mutex);
+ my_error(ER_CANNOT_ADD_FOREIGN, MYF(0),
+ fk_key->ref_table.str);
+ goto err_exit;
+ }
+
+ if (!innobase_init_foreign(
+ add_fk[num_fk], fk_key->name.str,
+ table, index, column_names,
+ num_col, referenced_table_name,
+ referenced_table, referenced_index,
+ referenced_column_names, referenced_num_col)) {
+ mutex_exit(&dict_sys.mutex);
+ my_error(
+ ER_DUP_CONSTRAINT_NAME,
+ MYF(0),
+ "FOREIGN KEY", add_fk[num_fk]->id);
+ goto err_exit;
+ }
+
+ mutex_exit(&dict_sys.mutex);
+
+ correct_option = innobase_set_foreign_key_option(
+ add_fk[num_fk], fk_key);
+
+ DBUG_EXECUTE_IF("innodb_test_wrong_fk_option",
+ correct_option = false;);
+
+ if (!correct_option) {
+ my_error(ER_FK_INCORRECT_OPTION,
+ MYF(0),
+ table_share->table_name.str,
+ add_fk[num_fk]->id);
+ goto err_exit;
+ }
+
+ if (innobase_check_fk_stored(
+ add_fk[num_fk], table, s_cols)) {
+ my_printf_error(
+ HA_ERR_UNSUPPORTED,
+ "Cannot add foreign key on the base column "
+ "of stored column", MYF(0));
+ goto err_exit;
+ }
+
+ num_fk++;
+ }
+
+ *n_add_fk = num_fk;
+
+ DBUG_RETURN(true);
+err_exit:
+ for (ulint i = 0; i <= num_fk; i++) {
+ if (add_fk[i]) {
+ dict_foreign_free(add_fk[i]);
+ }
+ }
+
+ DBUG_RETURN(false);
+}
+
+/*************************************************************//**
+Copies an InnoDB column to a MySQL field. This function is
+adapted from row_sel_field_store_in_mysql_format(). */
+static
+void
+innobase_col_to_mysql(
+/*==================*/
+ const dict_col_t* col, /*!< in: InnoDB column */
+ const uchar* data, /*!< in: InnoDB column data */
+ ulint len, /*!< in: length of data, in bytes */
+ Field* field) /*!< in/out: MySQL field */
+{
+ uchar* ptr;
+ uchar* dest = field->ptr;
+ ulint flen = field->pack_length();
+
+ switch (col->mtype) {
+ case DATA_INT:
+ ut_ad(len == flen);
+
+ /* Convert integer data from Innobase to little-endian
+ format, sign bit restored to normal */
+
+ for (ptr = dest + len; ptr != dest; ) {
+ *--ptr = *data++;
+ }
+
+ if (!(col->prtype & DATA_UNSIGNED)) {
+ ((byte*) dest)[len - 1] ^= 0x80;
+ }
+
+ break;
+
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ case DATA_BINARY:
+ field->reset();
+
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+			/* This is a >= 5.0.3 type true VARCHAR. Store
+			the length of the data in the first byte or
+			the first two bytes of dest. */
+
+ dest = row_mysql_store_true_var_len(
+ dest, len, flen - field->key_length());
+ }
+
+ /* Copy the actual data */
+ memcpy(dest, data, len);
+ break;
+
+ case DATA_GEOMETRY:
+ case DATA_BLOB:
+ /* Skip MySQL BLOBs when reporting an erroneous row
+ during index creation or table rebuild. */
+ field->set_null();
+ break;
+
+#ifdef UNIV_DEBUG
+ case DATA_MYSQL:
+ ut_ad(flen >= len);
+ ut_ad(col->mbmaxlen >= col->mbminlen);
+ memcpy(dest, data, len);
+ break;
+
+ default:
+ case DATA_SYS_CHILD:
+ case DATA_SYS:
+ /* These column types should never be shipped to MySQL. */
+ ut_ad(0);
+ /* fall through */
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_DECIMAL:
+ /* Above are the valid column types for MySQL data. */
+ ut_ad(flen == len);
+ /* fall through */
+ case DATA_FIXBINARY:
+ case DATA_CHAR:
+ /* We may have flen > len when there is a shorter
+ prefix on the CHAR and BINARY column. */
+ ut_ad(flen >= len);
+#else /* UNIV_DEBUG */
+ default:
+#endif /* UNIV_DEBUG */
+ memcpy(dest, data, len);
+ }
+}
+
+/*************************************************************//**
+Copies an InnoDB record to table->record[0]. */
+void
+innobase_rec_to_mysql(
+/*==================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(
+ rec, index, ...) */
+{
+ uint n_fields = table->s->fields;
+
+ ut_ad(n_fields == dict_table_get_n_user_cols(index->table)
+ - !!(DICT_TF2_FLAG_IS_SET(index->table,
+ DICT_TF2_FTS_HAS_DOC_ID)));
+
+ for (uint i = 0; i < n_fields; i++) {
+ Field* field = table->field[i];
+ ulint ipos;
+ ulint ilen;
+ const uchar* ifield;
+ ulint prefix_col;
+
+ field->reset();
+
+ ipos = dict_index_get_nth_col_or_prefix_pos(
+ index, i, true, false, &prefix_col);
+
+ if (ipos == ULINT_UNDEFINED
+ || rec_offs_nth_extern(offsets, ipos)) {
+null_field:
+ field->set_null();
+ continue;
+ }
+
+ ifield = rec_get_nth_cfield(rec, index, offsets, ipos, &ilen);
+
+ /* Assign the NULL flag */
+ if (ilen == UNIV_SQL_NULL) {
+ ut_ad(field->real_maybe_null());
+ goto null_field;
+ }
+
+ field->set_notnull();
+
+ innobase_col_to_mysql(
+ dict_field_get_col(
+ dict_index_get_nth_field(index, ipos)),
+ ifield, ilen, field);
+ }
+}
+
+/*************************************************************//**
+Copies an InnoDB index entry to table->record[0].
+This is used in preparation for print_keydup_error() from
+inline add index */
+void
+innobase_fields_to_mysql(
+/*=====================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const dict_index_t* index, /*!< in: InnoDB index */
+ const dfield_t* fields) /*!< in: InnoDB index fields */
+{
+ uint n_fields = table->s->fields;
+ ulint num_v = 0;
+
+ ut_ad(n_fields == dict_table_get_n_user_cols(index->table)
+ + dict_table_get_n_v_cols(index->table)
+ - !!(DICT_TF2_FLAG_IS_SET(index->table,
+ DICT_TF2_FTS_HAS_DOC_ID)));
+
+ for (uint i = 0; i < n_fields; i++) {
+ Field* field = table->field[i];
+ ulint ipos;
+ ulint prefix_col;
+
+ field->reset();
+
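+		/* Map the MySQL field number to an InnoDB column
+		number; virtual and stored columns are numbered
+		separately. */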
+ const bool is_v = !field->stored_in_db();
+ const ulint col_n = is_v ? num_v++ : i - num_v;
+
+ ipos = dict_index_get_nth_col_or_prefix_pos(
+ index, col_n, true, is_v, &prefix_col);
+
+ if (ipos == ULINT_UNDEFINED
+ || dfield_is_ext(&fields[ipos])
+ || dfield_is_null(&fields[ipos])) {
+
+ field->set_null();
+ } else {
+ field->set_notnull();
+
+ const dfield_t* df = &fields[ipos];
+
+ innobase_col_to_mysql(
+ dict_field_get_col(
+ dict_index_get_nth_field(index, ipos)),
+ static_cast<const uchar*>(dfield_get_data(df)),
+ dfield_get_len(df), field);
+ }
+ }
+}
+
+/*************************************************************//**
+Copies an InnoDB row to table->record[0].
+This is used in preparation for print_keydup_error() from
+row_log_table_apply() */
+void
+innobase_row_to_mysql(
+/*==================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const dict_table_t* itab, /*!< in: InnoDB table */
+ const dtuple_t* row) /*!< in: InnoDB row */
+{
+ uint n_fields = table->s->fields;
+ ulint num_v = 0;
+
+ /* The InnoDB row may contain an extra FTS_DOC_ID column at the end. */
+ ut_ad(row->n_fields == dict_table_get_n_cols(itab));
+ ut_ad(n_fields == row->n_fields - DATA_N_SYS_COLS
+ + dict_table_get_n_v_cols(itab)
+ - !!(DICT_TF2_FLAG_IS_SET(itab, DICT_TF2_FTS_HAS_DOC_ID)));
+
+ for (uint i = 0; i < n_fields; i++) {
+ Field* field = table->field[i];
+
+ field->reset();
+
+ if (!field->stored_in_db()) {
+			/* Virtual columns are not stored in the InnoDB
+			table, so skip them. */
+ num_v++;
+ continue;
+ }
+
+ const dfield_t* df = dtuple_get_nth_field(row, i - num_v);
+
+ if (dfield_is_ext(df) || dfield_is_null(df)) {
+ field->set_null();
+ } else {
+ field->set_notnull();
+
+ innobase_col_to_mysql(
+ dict_table_get_nth_col(itab, i - num_v),
+ static_cast<const uchar*>(dfield_get_data(df)),
+ dfield_get_len(df), field);
+ }
+ }
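+	/* Recompute the virtual column values from the base column
+	values that were just copied. */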
+ if (table->vfield) {
+ MY_BITMAP* old_read_set = tmp_use_all_columns(table, &table->read_set);
+ table->update_virtual_fields(table->file, VCOL_UPDATE_FOR_READ);
+ tmp_restore_column_map(&table->read_set, old_read_set);
+ }
+}
+
+/*******************************************************************//**
+This function checks that index keys are sensible.
+@return 0 or error number */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+int
+innobase_check_index_keys(
+/*======================*/
+ const Alter_inplace_info* info,
+ /*!< in: indexes to be created or dropped */
+ const dict_table_t* innodb_table)
+ /*!< in: Existing indexes */
+{
+ for (uint key_num = 0; key_num < info->index_add_count;
+ key_num++) {
+ const KEY& key = info->key_info_buffer[
+ info->index_add_buffer[key_num]];
+
+ /* Check that the same index name does not appear
+ twice in indexes to be created. */
+
+ for (ulint i = 0; i < key_num; i++) {
+ const KEY& key2 = info->key_info_buffer[
+ info->index_add_buffer[i]];
+
+ if (0 == strcmp(key.name.str, key2.name.str)) {
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ key.name.str);
+
+ return(ER_WRONG_NAME_FOR_INDEX);
+ }
+ }
+
+ /* Check that the same index name does not already exist. */
+
+ const dict_index_t* index;
+
+ for (index = dict_table_get_first_index(innodb_table);
+ index; index = dict_table_get_next_index(index)) {
+
+ if (index->is_committed()
+ && !strcmp(key.name.str, index->name)) {
+ break;
+ }
+ }
+
+ /* Now we are in a situation where we have "ADD INDEX x"
+ and an index by the same name already exists. We have 4
+ possible cases:
+ 1. No further clauses for an index x are given. Should reject
+ the operation.
+ 2. "DROP INDEX x" is given. Should allow the operation.
+ 3. "RENAME INDEX x TO y" is given. Should allow the operation.
+ 4. "DROP INDEX x, RENAME INDEX x TO y" is given. Should allow
+ the operation, since no name clash occurs. In this particular
+ case MySQL cancels the operation without calling InnoDB
+ methods. */
+
+ if (index) {
+ /* If a key by the same name is being created and
+ dropped, the name clash is OK. */
+ for (uint i = 0; i < info->index_drop_count;
+ i++) {
+ const KEY* drop_key
+ = info->index_drop_buffer[i];
+
+ if (0 == strcmp(key.name.str,
+ drop_key->name.str)) {
+ goto name_ok;
+ }
+ }
+
+ for (const Alter_inplace_info::Rename_key_pair& pair :
+ info->rename_keys) {
+ if (0 == strcmp(key.name.str,
+ pair.old_key->name.str)) {
+ goto name_ok;
+ }
+ }
+
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ key.name.str);
+ return(ER_WRONG_NAME_FOR_INDEX);
+ }
+
+name_ok:
+ for (ulint i = 0; i < key.user_defined_key_parts; i++) {
+ const KEY_PART_INFO& key_part1
+ = key.key_part[i];
+ const Field* field
+ = key_part1.field;
+ unsigned is_unsigned;
+
+ switch (get_innobase_type_from_mysql_type(
+ &is_unsigned, field)) {
+ default:
+ break;
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_DECIMAL:
+ /* Check that MySQL does not try to
+ create a column prefix index field on
+ an inappropriate data type. */
+
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+ if (key_part1.length
+ >= field->pack_length()
+ - ((Field_varstring*) field)
+ ->length_bytes) {
+ break;
+ }
+ } else {
+ if (key_part1.length
+ >= field->pack_length()) {
+ break;
+ }
+ }
+
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB",
+ field->field_name.str);
+ return(ER_WRONG_KEY_COLUMN);
+ }
+
+ /* Check that the same column does not appear
+ twice in the index. */
+
+ for (ulint j = 0; j < i; j++) {
+ const KEY_PART_INFO& key_part2
+ = key.key_part[j];
+
+ if (key_part1.fieldnr != key_part2.fieldnr) {
+ continue;
+ }
+
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB",
+ field->field_name.str);
+ return(ER_WRONG_KEY_COLUMN);
+ }
+ }
+ }
+
+ return(0);
+}
+
+/** Create index field definition for key part
+@param[in] new_clustered true if alter is generating a new clustered
+index
+@param[in] altered_table MySQL table that is being altered
+@param[in] key_part MySQL key definition
+@param[out] index_field index field definition for key_part */
+static MY_ATTRIBUTE((nonnull))
+void
+innobase_create_index_field_def(
+ bool new_clustered,
+ const TABLE* altered_table,
+ const KEY_PART_INFO* key_part,
+ index_field_t* index_field)
+{
+ const Field* field;
+ unsigned is_unsigned;
+ unsigned num_v = 0;
+
+ DBUG_ENTER("innobase_create_index_field_def");
+
+ field = new_clustered
+ ? altered_table->field[key_part->fieldnr]
+ : key_part->field;
+
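+	/* Count the virtual columns that precede this field, so that
+	the MySQL field number can be mapped to an InnoDB column
+	number. */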
+ for (ulint i = 0; i < key_part->fieldnr; i++) {
+ if (!altered_table->field[i]->stored_in_db()) {
+ num_v++;
+ }
+ }
+
+ auto col_type = get_innobase_type_from_mysql_type(
+ &is_unsigned, field);
+
+ if ((index_field->is_v_col = !field->stored_in_db())) {
+ index_field->col_no = num_v;
+ } else {
+ index_field->col_no = key_part->fieldnr - num_v;
+ }
+
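+	/* Large column types always use an index prefix. For other
+	types, a key part length shorter than the column's byte length
+	(net of any VARCHAR length bytes) indicates a prefix index. */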
+ if (DATA_LARGE_MTYPE(col_type)
+ || (key_part->length < field->pack_length()
+ && field->type() != MYSQL_TYPE_VARCHAR)
+ || (field->type() == MYSQL_TYPE_VARCHAR
+ && key_part->length < field->pack_length()
+ - ((Field_varstring*) field)->length_bytes)) {
+
+ index_field->prefix_len = key_part->length;
+ } else {
+ index_field->prefix_len = 0;
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/** Create index definition for key
+@param[in] altered_table MySQL table that is being altered
+@param[in] keys key definitions
+@param[in] key_number MySQL key number
+@param[in] new_clustered true if generating a new clustered
+index on the table
+@param[in] key_clustered true if this is the new clustered index
+@param[out] index index definition
+@param[in] heap heap where memory is allocated */
+static MY_ATTRIBUTE((nonnull))
+void
+innobase_create_index_def(
+ const TABLE* altered_table,
+ const KEY* keys,
+ ulint key_number,
+ bool new_clustered,
+ bool key_clustered,
+ index_def_t* index,
+ mem_heap_t* heap)
+{
+ const KEY* key = &keys[key_number];
+ ulint i;
+ ulint n_fields = key->user_defined_key_parts;
+
+ DBUG_ENTER("innobase_create_index_def");
+ DBUG_ASSERT(!key_clustered || new_clustered);
+
+ index->fields = static_cast<index_field_t*>(
+ mem_heap_alloc(heap, n_fields * sizeof *index->fields));
+
+ index->parser = NULL;
+ index->key_number = key_number;
+ index->n_fields = n_fields;
+ index->name = mem_heap_strdup(heap, key->name.str);
+ index->rebuild = new_clustered;
+
+ if (key_clustered) {
+ DBUG_ASSERT(!(key->flags & (HA_FULLTEXT | HA_SPATIAL)));
+ DBUG_ASSERT(key->flags & HA_NOSAME);
+ index->ind_type = DICT_CLUSTERED | DICT_UNIQUE;
+ } else if (key->flags & HA_FULLTEXT) {
+ DBUG_ASSERT(!(key->flags & (HA_SPATIAL | HA_NOSAME)));
+ DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK
+ & ~(HA_FULLTEXT
+ | HA_PACK_KEY
+ | HA_BINARY_PACK_KEY)));
+ index->ind_type = DICT_FTS;
+
+		/* Note: key->parser is only the parser name; we need
+		to get the parser itself from altered_table instead. */
+
+ if (key->flags & HA_USES_PARSER) {
+ for (ulint j = 0; j < altered_table->s->keys; j++) {
+ if (!strcmp(altered_table->key_info[j].name.str,
+ key->name.str)) {
+ ut_ad(altered_table->key_info[j].flags
+ & HA_USES_PARSER);
+
+ plugin_ref parser =
+ altered_table->key_info[j].parser;
+ index->parser =
+ static_cast<st_mysql_ftparser*>(
+ plugin_decl(parser)->info);
+
+ break;
+ }
+ }
+
+ DBUG_EXECUTE_IF("fts_instrument_use_default_parser",
+ index->parser = &fts_default_parser;);
+ ut_ad(index->parser);
+ }
+ } else if (key->flags & HA_SPATIAL) {
+ DBUG_ASSERT(!(key->flags & HA_NOSAME));
+ index->ind_type = DICT_SPATIAL;
+ ut_ad(n_fields == 1);
+ ulint num_v = 0;
+
+		/* Count the virtual fields that precede this
+		spatially indexed field. */
+ for (ulint i = 0; i < key->key_part->fieldnr; i++) {
+ num_v += !altered_table->field[i]->stored_in_db();
+ }
+ index->fields[0].col_no = key->key_part[0].fieldnr - num_v;
+ index->fields[0].prefix_len = 0;
+ index->fields[0].is_v_col = false;
+
+ /* Currently, the spatial index cannot be created
+ on virtual columns. It is blocked in the SQL layer. */
+ DBUG_ASSERT(key->key_part[0].field->stored_in_db());
+ } else {
+ index->ind_type = (key->flags & HA_NOSAME) ? DICT_UNIQUE : 0;
+ }
+
+ if (!(key->flags & HA_SPATIAL)) {
+ for (i = 0; i < n_fields; i++) {
+ innobase_create_index_field_def(
+ new_clustered, altered_table,
+ &key->key_part[i], &index->fields[i]);
+
+ if (index->fields[i].is_v_col) {
+ index->ind_type |= DICT_VIRTUAL;
+ }
+ }
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/*******************************************************************//**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column.
+@return the status of the FTS_DOC_ID index */
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index(
+/*============================*/
+ const dict_table_t* table, /*!< in: table definition */
+ const TABLE* altered_table, /*!< in: MySQL table
+ that is being altered */
+ ulint* fts_doc_col_no) /*!< out: The column number for
+ Doc ID, or ULINT_UNDEFINED
+ if it is being created in
+ ha_alter_info */
+{
+ const dict_index_t* index;
+ const dict_field_t* field;
+
+ if (altered_table) {
+ /* Check if a unique index with the name of
+ FTS_DOC_ID_INDEX_NAME is being created. */
+
+ for (uint i = 0; i < altered_table->s->keys; i++) {
+ const KEY& key = altered_table->key_info[i];
+
+ if (innobase_strcasecmp(
+ key.name.str, FTS_DOC_ID_INDEX_NAME)) {
+ continue;
+ }
+
+ if ((key.flags & HA_NOSAME)
+ && key.user_defined_key_parts == 1
+ && !strcmp(key.name.str, FTS_DOC_ID_INDEX_NAME)
+ && !strcmp(key.key_part[0].field->field_name.str,
+ FTS_DOC_ID_COL_NAME)) {
+ if (fts_doc_col_no) {
+ *fts_doc_col_no = ULINT_UNDEFINED;
+ }
+ return(FTS_EXIST_DOC_ID_INDEX);
+ } else {
+ return(FTS_INCORRECT_DOC_ID_INDEX);
+ }
+ }
+ }
+
+ if (!table) {
+ return(FTS_NOT_EXIST_DOC_ID_INDEX);
+ }
+
+ for (index = dict_table_get_first_index(table);
+ index; index = dict_table_get_next_index(index)) {
+
+ /* Check if there exists a unique index with the name of
+ FTS_DOC_ID_INDEX_NAME and ignore the corrupted index */
+ if (index->type & DICT_CORRUPT
+ || innobase_strcasecmp(index->name, FTS_DOC_ID_INDEX_NAME)) {
+ continue;
+ }
+
+ if (!dict_index_is_unique(index)
+ || dict_index_get_n_unique(index) > 1
+ || strcmp(index->name, FTS_DOC_ID_INDEX_NAME)) {
+ return(FTS_INCORRECT_DOC_ID_INDEX);
+ }
+
+ /* Check whether the index has FTS_DOC_ID as its
+ first column */
+ field = dict_index_get_nth_field(index, 0);
+
+ /* The column would be of a BIGINT data type */
+ if (strcmp(field->name, FTS_DOC_ID_COL_NAME) == 0
+ && field->col->mtype == DATA_INT
+ && field->col->len == 8
+ && field->col->prtype & DATA_NOT_NULL
+ && !field->col->is_virtual()) {
+ if (fts_doc_col_no) {
+ *fts_doc_col_no = dict_col_get_no(field->col);
+ }
+ return(FTS_EXIST_DOC_ID_INDEX);
+ } else {
+ return(FTS_INCORRECT_DOC_ID_INDEX);
+ }
+ }
+
+ /* Not found */
+ return(FTS_NOT_EXIST_DOC_ID_INDEX);
+}
+/*******************************************************************//**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column in MySQL create index definition.
+@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index,
+FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index_in_def(
+/*===================================*/
+ ulint n_key, /*!< in: Number of keys */
+ const KEY* key_info) /*!< in: Key definition */
+{
+	/* Check whether there is an "FTS_DOC_ID_INDEX" in the list
+	of indexes to be built. */
+ for (ulint j = 0; j < n_key; j++) {
+ const KEY* key = &key_info[j];
+
+ if (innobase_strcasecmp(key->name.str, FTS_DOC_ID_INDEX_NAME)) {
+ continue;
+ }
+
+		/* Check the FTS_DOC_ID_INDEX: it must be unique,
+		named "FTS_DOC_ID_INDEX", and defined on the column
+		"FTS_DOC_ID". */
+ if (!(key->flags & HA_NOSAME)
+ || key->user_defined_key_parts != 1
+ || strcmp(key->name.str, FTS_DOC_ID_INDEX_NAME)
+ || strcmp(key->key_part[0].field->field_name.str,
+ FTS_DOC_ID_COL_NAME)) {
+ return(FTS_INCORRECT_DOC_ID_INDEX);
+ }
+
+ return(FTS_EXIST_DOC_ID_INDEX);
+ }
+
+ return(FTS_NOT_EXIST_DOC_ID_INDEX);
+}
+
+/** Build the array of index definitions, ordered as follows:
+
+IF a new primary key is defined for the table THEN
+
+ 1) New primary key
+ 2) The remaining keys in key_info
+
+ELSE
+
+ 1) All new indexes in the order they arrive from MySQL
+
+ENDIF
+
+@return key definitions */
+MY_ATTRIBUTE((nonnull, warn_unused_result, malloc))
+inline index_def_t*
+ha_innobase_inplace_ctx::create_key_defs(
+ const Alter_inplace_info* ha_alter_info,
+ /*!< in: alter operation */
+ const TABLE* altered_table,
+ /*!< in: MySQL table that is being altered */
+ ulint& n_fts_add,
+ /*!< out: number of FTS indexes to be created */
+ ulint& fts_doc_id_col,
+ /*!< in/out: the column number for Doc ID */
+ bool& add_fts_doc_id,
+ /*!< in/out: whether we need to add a new
+ Doc ID column for the FTS index */
+ bool& add_fts_doc_idx,
+ /*!< in/out: whether we need to add a new
+ Doc ID index for the FTS index */
+ const TABLE* table)
+ /*!< in: MySQL table that is being altered */
+{
+ ulint& n_add = num_to_add_index;
+ const bool got_default_clust = new_table->indexes.start->is_gen_clust();
+
+ index_def_t* indexdef;
+ index_def_t* indexdefs;
+ bool new_primary;
+ const uint*const add
+ = ha_alter_info->index_add_buffer;
+ const KEY*const key_info
+ = ha_alter_info->key_info_buffer;
+
+ DBUG_ENTER("ha_innobase_inplace_ctx::create_key_defs");
+ DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_idx);
+ DBUG_ASSERT(ha_alter_info->index_add_count == n_add);
+
+ /* If there is a primary key, it is always the first index
+ defined for the innodb_table. */
+
+ new_primary = n_add > 0
+ && !my_strcasecmp(system_charset_info,
+ key_info[*add].name.str, "PRIMARY");
+ n_fts_add = 0;
+
+ /* If there is a UNIQUE INDEX consisting entirely of NOT NULL
+ columns and the index does not contain any column prefix(es)
+ (where only a prefix/part of a column is indexed), MySQL will
+ treat the index as a PRIMARY KEY unless the table already has
+ one. */
+
+ ut_ad(altered_table->s->primary_key == 0
+ || altered_table->s->primary_key == MAX_KEY);
+
+ if (got_default_clust && !new_primary) {
+ new_primary = (altered_table->s->primary_key != MAX_KEY);
+ }
+
+ const bool rebuild = new_primary || add_fts_doc_id
+ || innobase_need_rebuild(ha_alter_info, table);
+
+ /* Reserve one more slot if the table is rebuilt, and another
+ for the generated clustered index; we might also need to add
+ the FTS_DOC_ID_INDEX */
+ indexdef = indexdefs = static_cast<index_def_t*>(
+ mem_heap_alloc(
+ heap, sizeof *indexdef
+ * (ha_alter_info->key_count
+ + rebuild
+ + got_default_clust)));
+
+ if (rebuild) {
+ ulint primary_key_number;
+
+ if (new_primary) {
+ DBUG_ASSERT(n_add || got_default_clust);
+ DBUG_ASSERT(n_add || !altered_table->s->primary_key);
+ primary_key_number = altered_table->s->primary_key;
+ } else if (got_default_clust) {
+ /* Create the GEN_CLUST_INDEX */
+ index_def_t* index = indexdef++;
+
+ index->fields = NULL;
+ index->n_fields = 0;
+ index->ind_type = DICT_CLUSTERED;
+ index->name = innobase_index_reserve_name;
+ index->rebuild = true;
+ index->key_number = ~0U;
+ primary_key_number = ULINT_UNDEFINED;
+ goto created_clustered;
+ } else {
+ primary_key_number = 0;
+ }
+
+ /* Create the PRIMARY key index definition */
+ innobase_create_index_def(
+ altered_table, key_info, primary_key_number,
+ true, true, indexdef++, heap);
+
+created_clustered:
+ n_add = 1;
+
+ for (ulint i = 0; i < ha_alter_info->key_count; i++) {
+ if (i == primary_key_number) {
+ continue;
+ }
+ /* Copy the index definitions. */
+ innobase_create_index_def(
+ altered_table, key_info, i, true,
+ false, indexdef, heap);
+
+ if (indexdef->ind_type & DICT_FTS) {
+ n_fts_add++;
+ }
+
+ indexdef++;
+ n_add++;
+ }
+
+ if (n_fts_add > 0) {
+ ulint num_v = 0;
+
+ if (!add_fts_doc_id
+ && !innobase_fts_check_doc_id_col(
+ NULL, altered_table,
+ &fts_doc_id_col, &num_v)) {
+ fts_doc_id_col = altered_table->s->fields - num_v;
+ add_fts_doc_id = true;
+ }
+
+ if (!add_fts_doc_idx) {
+ fts_doc_id_index_enum ret;
+ ulint doc_col_no;
+
+ ret = innobase_fts_check_doc_id_index(
+ NULL, altered_table, &doc_col_no);
+
+ /* This should have been checked before */
+ ut_ad(ret != FTS_INCORRECT_DOC_ID_INDEX);
+
+ if (ret == FTS_NOT_EXIST_DOC_ID_INDEX) {
+ add_fts_doc_idx = true;
+ } else {
+ ut_ad(ret == FTS_EXIST_DOC_ID_INDEX);
+ ut_ad(doc_col_no == ULINT_UNDEFINED
+ || doc_col_no == fts_doc_id_col);
+ }
+ }
+ }
+ } else {
+ /* Create definitions for added secondary indexes. */
+
+ for (ulint i = 0; i < n_add; i++) {
+ innobase_create_index_def(
+ altered_table, key_info, add[i],
+ false, false, indexdef, heap);
+
+ if (indexdef->ind_type & DICT_FTS) {
+ n_fts_add++;
+ }
+
+ indexdef++;
+ }
+ }
+
+ DBUG_ASSERT(indexdefs + n_add == indexdef);
+
+ if (add_fts_doc_idx) {
+ index_def_t* index = indexdef++;
+
+ index->fields = static_cast<index_field_t*>(
+ mem_heap_alloc(heap, sizeof *index->fields));
+ index->n_fields = 1;
+ index->fields->col_no = fts_doc_id_col;
+ index->fields->prefix_len = 0;
+ index->fields->is_v_col = false;
+ index->ind_type = DICT_UNIQUE;
+ ut_ad(!rebuild
+ || !add_fts_doc_id
+ || fts_doc_id_col <= altered_table->s->fields);
+
+ index->name = FTS_DOC_ID_INDEX_NAME;
+ index->rebuild = rebuild;
+
+ /* TODO: assign a real MySQL key number for this */
+ index->key_number = ULINT_UNDEFINED;
+ n_add++;
+ }
+
+ DBUG_ASSERT(indexdef > indexdefs);
+ DBUG_ASSERT((ulint) (indexdef - indexdefs)
+ <= ha_alter_info->key_count
+ + add_fts_doc_idx + got_default_clust);
+ DBUG_ASSERT(ha_alter_info->index_add_count <= n_add);
+ DBUG_RETURN(indexdefs);
+}
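+
+/* For illustration (hypothetical names): if table t already has
+INDEX(c) and the table is rebuilt by ALTER TABLE t ADD PRIMARY KEY(a),
+the definitions above would be emitted as PRIMARY(a) followed by
+INDEX(c); without a rebuild, only the added secondary indexes are
+emitted, in the order they arrive from MySQL. */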
+
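+/** Check whether any key part exceeds the maximum field length.
+@param max_field_len maximum allowed key part length, in bytes
+@param key MySQL key definition
+@return whether some key part is longer than max_field_len */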
+MY_ATTRIBUTE((warn_unused_result))
+bool too_big_key_part_length(size_t max_field_len, const KEY& key)
+{
+ for (ulint i = 0; i < key.user_defined_key_parts; i++) {
+ if (key.key_part[i].length > max_field_len) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/********************************************************************//**
+Drop any indexes that we were not able to free previously due to
+open table handles. */
+static
+void
+online_retry_drop_indexes_low(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+
+ /* We can have table->n_ref_count > 1, because other threads
+ may have prebuilt->table pointing to the table. However, these
+ other threads should be between statements, waiting for the
+ next statement to execute, or for a meta-data lock. */
+ ut_ad(table->get_ref_count() >= 1);
+
+ if (table->drop_aborted) {
+ row_merge_drop_indexes(trx, table, true);
+ }
+}
+
+/********************************************************************//**
+Drop any indexes that we were not able to free previously due to
+open table handles. */
+static MY_ATTRIBUTE((nonnull))
+void
+online_retry_drop_indexes(
+/*======================*/
+ dict_table_t* table, /*!< in/out: table */
+ THD* user_thd) /*!< in/out: MySQL connection */
+{
+ if (table->drop_aborted) {
+ trx_t* trx = innobase_trx_allocate(user_thd);
+
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+
+ row_mysql_lock_data_dictionary(trx);
+ online_retry_drop_indexes_low(table, trx);
+ trx_commit_for_mysql(trx);
+ row_mysql_unlock_data_dictionary(trx);
+ trx->free();
+ }
+
+ ut_d(mutex_enter(&dict_sys.mutex));
+ ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE));
+ ut_d(mutex_exit(&dict_sys.mutex));
+ ut_ad(!table->drop_aborted);
+}
+
+/********************************************************************//**
+Commit a dictionary transaction and drop any indexes that we were not
+able to free previously due to open table handles. */
+static MY_ATTRIBUTE((nonnull))
+void
+online_retry_drop_indexes_with_trx(
+/*===============================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* Now that the dictionary is being locked, check if we can
+ drop any incompletely created indexes that may have been left
+ behind in rollback_inplace_alter_table() earlier. */
+ if (table->drop_aborted) {
+
+ trx->table_id = 0;
+
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+
+ online_retry_drop_indexes_low(table, trx);
+ trx_commit_for_mysql(trx);
+ }
+}
+
+/** Determines if InnoDB is dropping a foreign key constraint.
+@param foreign the constraint
+@param drop_fk constraints being dropped
+@param n_drop_fk number of constraints that are being dropped
+@return whether the constraint is being dropped */
+MY_ATTRIBUTE((pure, nonnull(1), warn_unused_result))
+inline
+bool
+innobase_dropping_foreign(
+ const dict_foreign_t* foreign,
+ dict_foreign_t** drop_fk,
+ ulint n_drop_fk)
+{
+ while (n_drop_fk--) {
+ if (*drop_fk++ == foreign) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Determines if an InnoDB FOREIGN KEY constraint depends on a
+column that is being dropped or modified to NOT NULL.
+@param user_table InnoDB table as it is before the ALTER operation
+@param col_name Name of the column being altered
+@param drop_fk constraints being dropped
+@param n_drop_fk number of constraints that are being dropped
+@param drop true=drop column, false=set NOT NULL
+@retval true Not allowed (will call my_error())
+@retval false Allowed
+*/
+MY_ATTRIBUTE((pure, nonnull(1,4), warn_unused_result))
+static
+bool
+innobase_check_foreigns_low(
+ const dict_table_t* user_table,
+ dict_foreign_t** drop_fk,
+ ulint n_drop_fk,
+ const char* col_name,
+ bool drop)
+{
+ dict_foreign_t* foreign;
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ /* Check if any FOREIGN KEY constraints are defined on this
+ column. */
+
+ for (dict_foreign_set::const_iterator it = user_table->foreign_set.begin();
+ it != user_table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (!drop && !(foreign->type
+ & (DICT_FOREIGN_ON_DELETE_SET_NULL
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+ continue;
+ }
+
+ if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) {
+ continue;
+ }
+
+ for (unsigned f = 0; f < foreign->n_fields; f++) {
+ if (!strcmp(foreign->foreign_col_names[f],
+ col_name)) {
+ my_error(drop
+ ? ER_FK_COLUMN_CANNOT_DROP
+ : ER_FK_COLUMN_NOT_NULL, MYF(0),
+ col_name, foreign->id);
+ return(true);
+ }
+ }
+ }
+
+ if (!drop) {
+ /* SET NULL clauses on foreign key constraints of
+ child tables affect the child tables, not the parent table.
+ The column can be NOT NULL in the parent table. */
+ return(false);
+ }
+
+ /* Check if any FOREIGN KEY constraints in other tables are
+ referring to the column that is being dropped. */
+ for (dict_foreign_set::const_iterator it
+ = user_table->referenced_set.begin();
+ it != user_table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) {
+ continue;
+ }
+
+ for (unsigned f = 0; f < foreign->n_fields; f++) {
+ char display_name[FN_REFLEN];
+
+ if (strcmp(foreign->referenced_col_names[f],
+ col_name)) {
+ continue;
+ }
+
+ char* buf_end = innobase_convert_name(
+ display_name, (sizeof display_name) - 1,
+ foreign->foreign_table_name,
+ strlen(foreign->foreign_table_name),
+ NULL);
+ *buf_end = '\0';
+ my_error(ER_FK_COLUMN_CANNOT_DROP_CHILD,
+ MYF(0), col_name, foreign->id,
+ display_name);
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Determines if an InnoDB FOREIGN KEY constraint depends on a
+column that is being dropped or modified to NOT NULL.
+@param ha_alter_info Data used during in-place alter
+@param altered_table MySQL table that is being altered
+@param old_table MySQL table as it is before the ALTER operation
+@param user_table InnoDB table as it is before the ALTER operation
+@param drop_fk constraints being dropped
+@param n_drop_fk number of constraints that are being dropped
+@retval true Not allowed (will call my_error())
+@retval false Allowed
+*/
+MY_ATTRIBUTE((pure, nonnull(1,2,3), warn_unused_result))
+static
+bool
+innobase_check_foreigns(
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* old_table,
+ const dict_table_t* user_table,
+ dict_foreign_t** drop_fk,
+ ulint n_drop_fk)
+{
+ for (Field** fp = old_table->field; *fp; fp++) {
+ ut_ad(!(*fp)->real_maybe_null()
+ == !!((*fp)->flags & NOT_NULL_FLAG));
+
+ auto end = ha_alter_info->alter_info->create_list.end();
+ auto it = std::find_if(
+ ha_alter_info->alter_info->create_list.begin(), end,
+ [fp](const Create_field& field) {
+ return field.field == *fp;
+ });
+
+ if (it == end || (it->flags & NOT_NULL_FLAG)) {
+ if (innobase_check_foreigns_low(
+ user_table, drop_fk, n_drop_fk,
+ (*fp)->field_name.str, it == end)) {
+ return(true);
+ }
+ }
+ }
+
+ return(false);
+}
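+
+/* For illustration (hypothetical tables): given child.c with
+FOREIGN KEY (c) REFERENCES parent(p), ALTER TABLE parent DROP COLUMN p
+would fail the check above with ER_FK_COLUMN_CANNOT_DROP_CHILD, and
+ALTER TABLE child MODIFY c INT NOT NULL would fail with
+ER_FK_COLUMN_NOT_NULL if the constraint has an ON ... SET NULL
+clause. */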
+
+/** Convert a default value for ADD COLUMN.
+@param[in,out] heap Memory heap where allocated
+@param[out] dfield InnoDB data field to copy to
+@param[in] field MySQL value for the column
+@param[in] old_field Old column if altering; NULL for ADD COLUMN
+@param[in] comp nonzero if in compact format. */
+static void innobase_build_col_map_add(
+ mem_heap_t* heap,
+ dfield_t* dfield,
+ const Field* field,
+ const Field* old_field,
+ ulint comp)
+{
+ if (old_field && old_field->real_maybe_null()
+ && field->real_maybe_null()) {
+ return;
+ }
+
+ if (field->is_real_null()) {
+ dfield_set_null(dfield);
+ return;
+ }
+
+ const Field& from = old_field ? *old_field : *field;
+ ulint size = from.pack_length();
+
+ byte* buf = static_cast<byte*>(mem_heap_alloc(heap, size));
+
+ row_mysql_store_col_in_innobase_format(
+ dfield, buf, true, from.ptr, size, comp);
+}
+
+/** Construct the translation table for reordering, dropping or
+adding columns.
+
+@param ha_alter_info Data used during in-place alter
+@param altered_table MySQL table that is being altered
+@param table MySQL table as it is before the ALTER operation
+@param new_table InnoDB table corresponding to MySQL altered_table
+@param old_table InnoDB table corresponding to MYSQL table
+@param defaults Default values for ADD COLUMN, or NULL if no ADD COLUMN
+@param heap Memory heap where allocated
+@return array of integers, mapping column numbers in the table
+to column numbers in altered_table */
+static MY_ATTRIBUTE((nonnull(1,2,3,4,5,7), warn_unused_result))
+const ulint*
+innobase_build_col_map(
+/*===================*/
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* altered_table,
+ const TABLE* table,
+ dict_table_t* new_table,
+ const dict_table_t* old_table,
+ dtuple_t* defaults,
+ mem_heap_t* heap)
+{
+ DBUG_ENTER("innobase_build_col_map");
+ DBUG_ASSERT(altered_table != table);
+ DBUG_ASSERT(new_table != old_table);
+ DBUG_ASSERT(dict_table_get_n_cols(new_table)
+ + dict_table_get_n_v_cols(new_table)
+ >= altered_table->s->fields + DATA_N_SYS_COLS);
+ DBUG_ASSERT(dict_table_get_n_cols(old_table)
+ + dict_table_get_n_v_cols(old_table)
+ >= table->s->fields + DATA_N_SYS_COLS
+ || ha_innobase::omits_virtual_cols(*table->s));
+ DBUG_ASSERT(!!defaults == !!(ha_alter_info->handler_flags
+ & INNOBASE_DEFAULTS));
+ DBUG_ASSERT(!defaults || dtuple_get_n_fields(defaults)
+ == dict_table_get_n_cols(new_table));
+
+ const uint old_n_v_cols = uint(table->s->fields
+ - table->s->stored_fields);
+ DBUG_ASSERT(old_n_v_cols == old_table->n_v_cols
+ || table->s->frm_version < FRM_VER_EXPRESSSIONS);
+ DBUG_ASSERT(!old_n_v_cols || table->s->virtual_fields);
+
+ ulint* col_map = static_cast<ulint*>(
+ mem_heap_alloc(
+ heap, (size_t(old_table->n_cols) + old_n_v_cols)
+ * sizeof *col_map));
+
+ uint i = 0;
+ uint num_v = 0;
+
+ /* Any dropped columns will map to ULINT_UNDEFINED. */
+ for (uint old_i = 0; old_i + DATA_N_SYS_COLS < old_table->n_cols;
+ old_i++) {
+ col_map[old_i] = ULINT_UNDEFINED;
+ }
+
+ for (uint old_i = 0; old_i < old_n_v_cols; old_i++) {
+ col_map[old_i + old_table->n_cols] = ULINT_UNDEFINED;
+ }
+
+ const bool omits_virtual = ha_innobase::omits_virtual_cols(*table->s);
+
+ for (const Create_field& new_field :
+ ha_alter_info->alter_info->create_list) {
+ bool is_v = !new_field.stored_in_db();
+ ulint num_old_v = 0;
+
+ for (uint old_i = 0; table->field[old_i]; old_i++) {
+ const Field* field = table->field[old_i];
+ if (!field->stored_in_db()) {
+ if (is_v && new_field.field == field) {
+ if (!omits_virtual) {
+ col_map[old_table->n_cols
+ + num_v]
+ = num_old_v;
+ }
+ num_old_v++;
+ goto found_col;
+ }
+ num_old_v++;
+ continue;
+ }
+
+ if (new_field.field == field) {
+
+ const Field* altered_field =
+ altered_table->field[i + num_v];
+
+ if (defaults) {
+ innobase_build_col_map_add(
+ heap,
+ dtuple_get_nth_field(
+ defaults, i),
+ altered_field,
+ field,
+ dict_table_is_comp(
+ new_table));
+ }
+
+ col_map[old_i - num_old_v] = i;
+ if (!old_table->versioned()
+ || !altered_table->versioned()) {
+ } else if (old_i == old_table->vers_start) {
+ new_table->vers_start = (i + num_v)
+ & dict_index_t::MAX_N_FIELDS;
+ } else if (old_i == old_table->vers_end) {
+ new_table->vers_end = (i + num_v)
+ & dict_index_t::MAX_N_FIELDS;
+ }
+ goto found_col;
+ }
+ }
+
+ if (!is_v) {
+ innobase_build_col_map_add(
+ heap, dtuple_get_nth_field(defaults, i),
+ altered_table->field[i + num_v],
+ NULL,
+ dict_table_is_comp(new_table));
+ }
+found_col:
+ if (is_v) {
+ num_v++;
+ } else {
+ i++;
+ }
+ }
+
+ DBUG_ASSERT(i == altered_table->s->fields - num_v);
+
+ i = table->s->fields - old_n_v_cols;
+
+ /* Add the InnoDB hidden FTS_DOC_ID column, if any. */
+ if (i + DATA_N_SYS_COLS < old_table->n_cols) {
+ /* There should be exactly one extra field,
+ the FTS_DOC_ID. */
+ DBUG_ASSERT(DICT_TF2_FLAG_IS_SET(old_table,
+ DICT_TF2_FTS_HAS_DOC_ID));
+ DBUG_ASSERT(i + DATA_N_SYS_COLS + 1 == old_table->n_cols);
+ DBUG_ASSERT(!strcmp(dict_table_get_col_name(
+ old_table, i),
+ FTS_DOC_ID_COL_NAME));
+ if (altered_table->s->fields + DATA_N_SYS_COLS
+ - new_table->n_v_cols
+ < new_table->n_cols) {
+ DBUG_ASSERT(DICT_TF2_FLAG_IS_SET(
+ new_table,
+ DICT_TF2_FTS_HAS_DOC_ID));
+ DBUG_ASSERT(altered_table->s->fields
+ + DATA_N_SYS_COLS + 1
+ == static_cast<ulint>(
+ new_table->n_cols
+ + new_table->n_v_cols));
+ col_map[i] = altered_table->s->fields
+ - new_table->n_v_cols;
+ } else {
+ DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET(
+ new_table,
+ DICT_TF2_FTS_HAS_DOC_ID));
+ col_map[i] = ULINT_UNDEFINED;
+ }
+
+ i++;
+ } else {
+ DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET(
+ old_table,
+ DICT_TF2_FTS_HAS_DOC_ID));
+ }
+
+ for (; i < old_table->n_cols; i++) {
+ col_map[i] = i + new_table->n_cols - old_table->n_cols;
+ }
+
+ DBUG_RETURN(col_map);
+}
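+
+/* For illustration (hypothetical table): for t(a,b,c) and
+ALTER TABLE t DROP COLUMN b, the map built above would contain
+col_map[0] = 0, col_map[1] = ULINT_UNDEFINED (dropped) and
+col_map[2] = 1, with the trailing system columns mapped to
+correspondingly shifted positions. */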
+
+/** Drop the auxiliary tables of newly created FTS indexes during the
+fast index creation process, before fts_add_index is called.
+@param table table that was being rebuilt online
+@param trx transaction
+@return DB_SUCCESS if successful, otherwise the last error code
+*/
+static
+dberr_t
+innobase_drop_fts_index_table(
+/*==========================*/
+ dict_table_t* table,
+ trx_t* trx)
+{
+ dberr_t ret_err = DB_SUCCESS;
+
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ if (index->type & DICT_FTS) {
+ dberr_t err;
+
+ err = fts_drop_index_tables(trx, index);
+
+ if (err != DB_SUCCESS) {
+ ret_err = err;
+ }
+ }
+ }
+
+ return(ret_err);
+}
+
+/** Get the new non-virtual column names if any columns were renamed
+@param ha_alter_info Data used during in-place alter
+@param altered_table MySQL table that is being altered
+@param table MySQL table as it is before the ALTER operation
+@param user_table InnoDB table as it is before the ALTER operation
+@param heap Memory heap for the allocation
+@return array of new column names in rebuilt_table, or NULL if not renamed */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+const char**
+innobase_get_col_names(
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* altered_table,
+ const TABLE* table,
+ const dict_table_t* user_table,
+ mem_heap_t* heap)
+{
+ const char** cols;
+ uint i;
+
+ DBUG_ENTER("innobase_get_col_names");
+ DBUG_ASSERT(user_table->n_t_def > table->s->fields);
+ DBUG_ASSERT(ha_alter_info->handler_flags
+ & ALTER_COLUMN_NAME);
+
+ cols = static_cast<const char**>(
+ mem_heap_zalloc(heap, user_table->n_def * sizeof *cols));
+
+ i = 0;
+ for (const Create_field& new_field :
+ ha_alter_info->alter_info->create_list) {
+ ulint num_v = 0;
+ DBUG_ASSERT(i < altered_table->s->fields);
+
+ if (!new_field.stored_in_db()) {
+ continue;
+ }
+
+ for (uint old_i = 0; table->field[old_i]; old_i++) {
+ num_v += !table->field[old_i]->stored_in_db();
+
+ if (new_field.field == table->field[old_i]) {
+ cols[old_i - num_v] = new_field.field_name.str;
+ break;
+ }
+ }
+
+ i++;
+ }
+
+ /* Copy the internal column names. */
+ i = table->s->fields - user_table->n_v_def;
+ cols[i] = dict_table_get_col_name(user_table, i);
+
+ while (++i < user_table->n_def) {
+ cols[i] = cols[i - 1] + strlen(cols[i - 1]) + 1;
+ }
+
+ DBUG_RETURN(cols);
+}
+
+/** Check whether the column prefix is increased, decreased, or unchanged.
+@param[in] new_prefix_len new prefix length
+@param[in] old_prefix_len old prefix length
+@retval 1 prefix is increased
+@retval 0 prefix is unchanged
+@retval -1 prefix is decreased */
+static inline
+lint
+innobase_pk_col_prefix_compare(
+ ulint new_prefix_len,
+ ulint old_prefix_len)
+{
+ ut_ad(new_prefix_len < COMPRESSED_REC_MAX_DATA_SIZE);
+ ut_ad(old_prefix_len < COMPRESSED_REC_MAX_DATA_SIZE);
+
+ if (new_prefix_len == old_prefix_len) {
+ return(0);
+ }
+
+ if (new_prefix_len == 0) {
+ new_prefix_len = ULINT_MAX;
+ }
+
+ if (old_prefix_len == 0) {
+ old_prefix_len = ULINT_MAX;
+ }
+
+ if (new_prefix_len > old_prefix_len) {
+ return(1);
+ } else {
+ return(-1);
+ }
+}
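+
+/* For illustration: a prefix length of 0 means that the full column
+is indexed and is treated as ULINT_MAX above, so changing a prefix of
+10 to the full column (0) compares as an increase (1), and changing
+the full column (0) to a prefix of 10 as a decrease (-1). */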
+
+/** Check whether the column exists in the old table.
+@param[in] new_col_no new column number
+@param[in] col_map mapping of old column numbers to new ones
+@param[in] col_map_size the column map size
+@return true if the column exists, otherwise false. */
+static inline
+bool
+innobase_pk_col_is_existing(
+ const ulint new_col_no,
+ const ulint* col_map,
+ const ulint col_map_size)
+{
+ for (ulint i = 0; i < col_map_size; i++) {
+ if (col_map[i] == new_col_no) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Determine whether both indexes have the same set of primary key
+fields arranged in the same order.
+
+Rules when we cannot skip sorting:
+(1) Removing existing PK columns somewhere else than at the end of the PK;
+(2) Adding existing columns to the PK, except at the end of the PK when no
+columns are removed from the PK;
+(3) Changing the order of existing PK columns;
+(4) Decreasing the prefix length is treated like removing existing PK
+columns and follows rule (1); increasing the prefix length is treated
+like adding existing PK columns and follows rule (2).
+@param[in] col_map mapping of old column numbers to new ones
+@param[in] ha_alter_info Data used during in-place alter
+@param[in] old_clust_index index to be compared
+@param[in] new_clust_index index to be compared
+@retval true if both indexes have the same order
+@retval false otherwise */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+innobase_pk_order_preserved(
+ const ulint* col_map,
+ const dict_index_t* old_clust_index,
+ const dict_index_t* new_clust_index)
+{
+ ulint old_n_uniq
+ = dict_index_get_n_ordering_defined_by_user(
+ old_clust_index);
+ ulint new_n_uniq
+ = dict_index_get_n_ordering_defined_by_user(
+ new_clust_index);
+
+ ut_ad(dict_index_is_clust(old_clust_index));
+ ut_ad(dict_index_is_clust(new_clust_index));
+ ut_ad(old_clust_index->table != new_clust_index->table);
+ ut_ad(col_map != NULL);
+
+ if (old_n_uniq == 0) {
+ /* There was no PRIMARY KEY in the table.
+ If there is no PRIMARY KEY after the ALTER either,
+ no sorting is needed. */
+ return(new_n_uniq == old_n_uniq);
+ }
+
+ /* DROP PRIMARY KEY is only allowed in combination with
+ ADD PRIMARY KEY. */
+ ut_ad(new_n_uniq > 0);
+
+ /* The order of the last processed new_clust_index key field,
+ not counting ADD COLUMN fields, which are constant. */
+ lint last_field_order = -1;
+ ulint existing_field_count = 0;
+ ulint old_n_cols = dict_table_get_n_cols(old_clust_index->table);
+ for (ulint new_field = 0; new_field < new_n_uniq; new_field++) {
+ ulint new_col_no =
+ new_clust_index->fields[new_field].col->ind;
+
+ /* Check if there is a match in old primary key. */
+ ulint old_field = 0;
+ while (old_field < old_n_uniq) {
+ ulint old_col_no =
+ old_clust_index->fields[old_field].col->ind;
+
+ if (col_map[old_col_no] == new_col_no) {
+ break;
+ }
+
+ old_field++;
+ }
+
+ /* The order of key field in the new primary key.
+ 1. old PK column: idx in old primary key
+ 2. existing column: old_n_uniq + sequence no
+ 3. newly added column: no order */
+ lint new_field_order;
+ const bool old_pk_column = old_field < old_n_uniq;
+
+ if (old_pk_column) {
+ new_field_order = lint(old_field);
+ } else if (innobase_pk_col_is_existing(new_col_no, col_map,
+ old_n_cols)
+ || new_clust_index->table->persistent_autoinc
+ == new_field + 1) {
+ /* Adding an existing column or an AUTO_INCREMENT
+ column may change the existing ordering. */
+ new_field_order = lint(old_n_uniq
+ + existing_field_count++);
+ } else {
+ /* Skip newly added column. */
+ continue;
+ }
+
+ if (last_field_order + 1 != new_field_order) {
+ /* Old PK order is not kept, or existing column
+ is not added at the end of old PK. */
+ return(false);
+ }
+
+ last_field_order = new_field_order;
+
+ if (!old_pk_column) {
+ continue;
+ }
+
+ /* Check prefix length change. */
+ const lint prefix_change = innobase_pk_col_prefix_compare(
+ new_clust_index->fields[new_field].prefix_len,
+ old_clust_index->fields[old_field].prefix_len);
+
+ if (prefix_change < 0) {
+ /* If a column's prefix length is decreased, it should
+ be the last old PK column in the new PK.
+ Note: we set last_field_order to -2, so that if there
+ are any old PK columns or existing columns after it in
+ the new PK, the comparison to new_field_order will fail in
+ the next round. */
+ last_field_order = -2;
+ } else if (prefix_change > 0) {
+ /* If a column's prefix length is increased, it should
+ be the last PK column in old PK. */
+ if (old_field != old_n_uniq - 1) {
+ return(false);
+ }
+ }
+ }
+
+ return(true);
+}
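+
+/* For illustration (hypothetical keys): changing PRIMARY KEY(a,b) to
+PRIMARY KEY(a,b,c), where c is an existing column appended at the end,
+preserves the order, so sorting can be skipped; changing it to
+PRIMARY KEY(b,a) or PRIMARY KEY(c,a,b) does not. */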
+
+/** Update the mtype from DATA_BLOB to DATA_GEOMETRY for a specified
+GIS column of a table. This is used when we want to create a spatial
+index on legacy GIS columns coming from 5.6, where GIS data was stored
+as DATA_BLOB in the InnoDB layer.
+@param[in] table_id table id
+@param[in] col_name column name
+@param[in] trx data dictionary transaction
+@retval true Failure
+@retval false Success */
+static
+bool
+innobase_update_gis_column_type(
+ table_id_t table_id,
+ const char* col_name,
+ trx_t* trx)
+{
+ pars_info_t* info;
+ dberr_t error;
+
+ DBUG_ENTER("innobase_update_gis_column_type");
+
+ DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_d(dict_sys.assert_locked());
+
+ info = pars_info_create();
+
+ pars_info_add_ull_literal(info, "tableid", table_id);
+ pars_info_add_str_literal(info, "name", col_name);
+ pars_info_add_int4_literal(info, "mtype", DATA_GEOMETRY);
+
+ trx->op_info = "update column type to DATA_GEOMETRY";
+
+ error = que_eval_sql(
+ info,
+ "PROCEDURE UPDATE_SYS_COLUMNS_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_COLUMNS SET MTYPE=:mtype\n"
+ "WHERE TABLE_ID=:tableid AND NAME=:name;\n"
+ "END;\n",
+ false, trx);
+
+ trx->error_state = DB_SUCCESS;
+ trx->op_info = "";
+
+ DBUG_RETURN(error != DB_SUCCESS);
+}
+
+/** Check if we are creating spatial indexes on GIS columns, which are
+legacy columns from earlier MySQL, such as 5.6. If so, we have to update
+the mtypes of the old GIS columns to DATA_GEOMETRY.
+In 5.6, GIS columns were stored as DATA_BLOB in the InnoDB layer, which
+would cause confusion when running the latest server on older data.
+That is why we need to do the upgrade.
+@param[in] ha_alter_info Data used during in-place alter
+@param[in] table Table on which we want to add indexes
+@param[in] trx Transaction
+@return DB_SUCCESS if the update succeeded or no columns needed to be
+updated; otherwise DB_ERROR, which means the mtype of some column could
+not be updated, and creating a spatial index on it would be dangerous */
+static
+dberr_t
+innobase_check_gis_columns(
+ Alter_inplace_info* ha_alter_info,
+ dict_table_t* table,
+ trx_t* trx)
+{
+ DBUG_ENTER("innobase_check_gis_columns");
+
+ for (uint key_num = 0;
+ key_num < ha_alter_info->index_add_count;
+ key_num++) {
+
+ const KEY& key = ha_alter_info->key_info_buffer[
+ ha_alter_info->index_add_buffer[key_num]];
+
+ if (!(key.flags & HA_SPATIAL)) {
+ continue;
+ }
+
+ ut_ad(key.user_defined_key_parts == 1);
+ const KEY_PART_INFO& key_part = key.key_part[0];
+
+ /* Spatial indexes are not supported on virtual columns */
+ if (!key_part.field->stored_in_db()) {
+ DBUG_RETURN(DB_UNSUPPORTED);
+ }
+
+ ulint col_nr = dict_table_has_column(
+ table,
+ key_part.field->field_name.str,
+ key_part.fieldnr);
+ ut_ad(col_nr != table->n_def);
+ dict_col_t* col = &table->cols[col_nr];
+
+ if (col->mtype != DATA_BLOB) {
+ ut_ad(DATA_GEOMETRY_MTYPE(col->mtype));
+ continue;
+ }
+
+ const char* col_name = dict_table_get_col_name(
+ table, col_nr);
+
+ if (innobase_update_gis_column_type(
+ table->id, col_name, trx)) {
+
+ DBUG_RETURN(DB_ERROR);
+ } else {
+ col->mtype = DATA_GEOMETRY;
+
+ ib::info() << "Updated mtype of column " << col_name
+ << " in table " << table->name
+ << ", whose id is " << table->id
+ << " to DATA_GEOMETRY";
+ }
+ }
+
+ DBUG_RETURN(DB_SUCCESS);
+}
+
+/** Collect virtual column info for its addition
+@param[in] ha_alter_info Data used during in-place alter
+@param[in] altered_table MySQL table that is being altered to
+@param[in] table MySQL table as it is before the ALTER operation
+@retval true Failure
+@retval false Success */
+static
+bool
+prepare_inplace_add_virtual(
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* altered_table,
+ const TABLE* table)
+{
+ ha_innobase_inplace_ctx* ctx;
+ uint16_t i = 0, j = 0;
+
+ ctx = static_cast<ha_innobase_inplace_ctx*>
+ (ha_alter_info->handler_ctx);
+
+ ctx->num_to_add_vcol = altered_table->s->virtual_fields
+ + ctx->num_to_drop_vcol - table->s->virtual_fields;
+
+ ctx->add_vcol = static_cast<dict_v_col_t*>(
+ mem_heap_zalloc(ctx->heap, ctx->num_to_add_vcol
+ * sizeof *ctx->add_vcol));
+ ctx->add_vcol_name = static_cast<const char**>(
+ mem_heap_alloc(ctx->heap, ctx->num_to_add_vcol
+ * sizeof *ctx->add_vcol_name));
+
+ for (const Create_field& new_field :
+ ha_alter_info->alter_info->create_list) {
+ const Field* field = altered_table->field[i++];
+
+ if (new_field.field || field->stored_in_db()) {
+ continue;
+ }
+
+ unsigned is_unsigned;
+ auto col_type = get_innobase_type_from_mysql_type(
+ &is_unsigned, field);
+
+ auto col_len = field->pack_length();
+ unsigned field_type = field->type() | is_unsigned;
+
+ if (!field->real_maybe_null()) {
+ field_type |= DATA_NOT_NULL;
+ }
+
+ if (field->binary()) {
+ field_type |= DATA_BINARY_TYPE;
+ }
+
+ unsigned charset_no;
+
+ if (dtype_is_string_type(col_type)) {
+ charset_no = field->charset()->number;
+
+ DBUG_EXECUTE_IF(
+ "ib_alter_add_virtual_fail",
+ charset_no += MAX_CHAR_COLL_NUM;);
+
+ if (charset_no > MAX_CHAR_COLL_NUM) {
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB",
+ field->field_name.str);
+ return(true);
+ }
+ } else {
+ charset_no = 0;
+ }
+
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+ uint32 length_bytes
+ = static_cast<const Field_varstring*>(
+ field)->length_bytes;
+
+ col_len -= length_bytes;
+
+ if (length_bytes == 2) {
+ field_type |= DATA_LONG_TRUE_VARCHAR;
+ }
+ }
+
+ new (&ctx->add_vcol[j]) dict_v_col_t();
+ ctx->add_vcol[j].m_col.prtype = dtype_form_prtype(
+ field_type, charset_no);
+
+ ctx->add_vcol[j].m_col.prtype |= DATA_VIRTUAL;
+
+ ctx->add_vcol[j].m_col.mtype = col_type;
+
+ ctx->add_vcol[j].m_col.len = static_cast<uint16_t>(col_len);
+
+ ctx->add_vcol[j].m_col.ind = (i - 1)
+ & dict_index_t::MAX_N_FIELDS;
+ ctx->add_vcol[j].num_base = 0;
+ ctx->add_vcol_name[j] = field->field_name.str;
+ ctx->add_vcol[j].base_col = NULL;
+ ctx->add_vcol[j].v_pos = (ctx->old_table->n_v_cols
+ - ctx->num_to_drop_vcol + j)
+ & dict_index_t::MAX_N_FIELDS;
+
+ /* MDEV-17468: Do this on ctx->instant_table later */
+ innodb_base_col_setup(ctx->old_table, field, &ctx->add_vcol[j]);
+ j++;
+ }
+
+ return(false);
+}
+
+/** Collect virtual column info for its removal
+@param[in] ha_alter_info Data used during in-place alter
+@param[in] table MySQL table as it is before the ALTER operation
+@retval true Failure
+@retval false Success */
+static
+bool
+prepare_inplace_drop_virtual(
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* table)
+{
+ ha_innobase_inplace_ctx* ctx;
+ unsigned i = 0, j = 0;
+
+ ctx = static_cast<ha_innobase_inplace_ctx*>
+ (ha_alter_info->handler_ctx);
+
+ ctx->num_to_drop_vcol = 0;
+ for (i = 0; table->field[i]; i++) {
+ const Field* field = table->field[i];
+ if (field->flags & FIELD_IS_DROPPED && !field->stored_in_db()) {
+ ctx->num_to_drop_vcol++;
+ }
+ }
+
+ ctx->drop_vcol = static_cast<dict_v_col_t*>(
+ mem_heap_alloc(ctx->heap, ctx->num_to_drop_vcol
+ * sizeof *ctx->drop_vcol));
+ ctx->drop_vcol_name = static_cast<const char**>(
+ mem_heap_alloc(ctx->heap, ctx->num_to_drop_vcol
+ * sizeof *ctx->drop_vcol_name));
+
+ for (i = 0; table->field[i]; i++) {
+ Field *field = table->field[i];
+ if (!(field->flags & FIELD_IS_DROPPED) || field->stored_in_db()) {
+ continue;
+ }
+
+ unsigned is_unsigned;
+
+ auto col_type = get_innobase_type_from_mysql_type(
+ &is_unsigned, field);
+
+ auto col_len = field->pack_length();
+ unsigned field_type = field->type() | is_unsigned;
+
+ if (!field->real_maybe_null()) {
+ field_type |= DATA_NOT_NULL;
+ }
+
+ if (field->binary()) {
+ field_type |= DATA_BINARY_TYPE;
+ }
+
+ unsigned charset_no = 0;
+
+ if (dtype_is_string_type(col_type)) {
+ charset_no = field->charset()->number;
+
+ DBUG_EXECUTE_IF(
+ "ib_alter_add_virtual_fail",
+ charset_no += MAX_CHAR_COLL_NUM;);
+
+ if (charset_no > MAX_CHAR_COLL_NUM) {
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB",
+ field->field_name.str);
+ return(true);
+ }
+ } else {
+ charset_no = 0;
+ }
+
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+ uint32 length_bytes
+ = static_cast<const Field_varstring*>(
+ field)->length_bytes;
+
+ col_len -= length_bytes;
+
+ if (length_bytes == 2) {
+ field_type |= DATA_LONG_TRUE_VARCHAR;
+ }
+ }
+
+ ctx->drop_vcol[j].m_col.prtype = dtype_form_prtype(
+ field_type, charset_no);
+
+ ctx->drop_vcol[j].m_col.prtype |= DATA_VIRTUAL;
+
+ ctx->drop_vcol[j].m_col.mtype = col_type;
+
+ ctx->drop_vcol[j].m_col.len = static_cast<uint16_t>(col_len);
+
+ ctx->drop_vcol[j].m_col.ind = i & dict_index_t::MAX_N_FIELDS;
+
+ ctx->drop_vcol_name[j] = field->field_name.str;
+
+ dict_v_col_t* v_col = dict_table_get_nth_v_col_mysql(
+ ctx->old_table, i);
+ ctx->drop_vcol[j].v_pos = v_col->v_pos;
+ j++;
+ }
+
+ return(false);
+}
+
+/** Insert a new record into the InnoDB SYS_VIRTUAL table
+@param[in] table InnoDB table
+@param[in] pos virtual column position
+@param[in] base_pos base column pos
+@param[in] trx transaction
+@retval false on success
+@retval true on failure (my_error() will have been called) */
+static bool innobase_insert_sys_virtual(
+ const dict_table_t* table,
+ ulint pos,
+ ulint base_pos,
+ trx_t* trx)
+{
+ pars_info_t* info = pars_info_create();
+ pars_info_add_ull_literal(info, "id", table->id);
+ pars_info_add_int4_literal(info, "pos", pos);
+ pars_info_add_int4_literal(info, "base_pos", base_pos);
+
+ if (DB_SUCCESS != que_eval_sql(
+ info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "INSERT INTO SYS_VIRTUAL VALUES (:id, :pos, :base_pos);\n"
+ "END;\n",
+ FALSE, trx)) {
+ my_error(ER_INTERNAL_ERROR, MYF(0),
+ "InnoDB: ADD COLUMN...VIRTUAL");
+ return true;
+ }
+
+ return false;
+}
+
+/** Insert a record to the SYS_COLUMNS dictionary table.
+@param[in] table_id table id
+@param[in] pos position of the column
+@param[in] field_name field name
+@param[in] mtype main type
+@param[in] prtype precise type
+@param[in] len fixed length in bytes, or 0
+@param[in] n_base number of base columns of virtual columns, or 0
+@param[in] update whether to update instead of inserting
+@retval false on success
+@retval true on failure (my_error() will have been called) */
+static bool innodb_insert_sys_columns(
+ table_id_t table_id,
+ ulint pos,
+ const char* field_name,
+ ulint mtype,
+ ulint prtype,
+ ulint len,
+ ulint n_base,
+ trx_t* trx,
+ bool update = false)
+{
+ pars_info_t* info = pars_info_create();
+ pars_info_add_ull_literal(info, "id", table_id);
+ pars_info_add_int4_literal(info, "pos", pos);
+ pars_info_add_str_literal(info, "name", field_name);
+ pars_info_add_int4_literal(info, "mtype", mtype);
+ pars_info_add_int4_literal(info, "prtype", prtype);
+ pars_info_add_int4_literal(info, "len", len);
+ pars_info_add_int4_literal(info, "base", n_base);
+
+ if (update) {
+ if (DB_SUCCESS != que_eval_sql(
+ info,
+ "PROCEDURE UPD_COL () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_COLUMNS SET\n"
+ "NAME=:name, MTYPE=:mtype, PRTYPE=:prtype, "
+ "LEN=:len, PREC=:base\n"
+ "WHERE TABLE_ID=:id AND POS=:pos;\n"
+ "END;\n", FALSE, trx)) {
+ my_error(ER_INTERNAL_ERROR, MYF(0),
+ "InnoDB: Updating SYS_COLUMNS failed");
+ return true;
+ }
+
+ return false;
+ }
+
+ if (DB_SUCCESS != que_eval_sql(
+ info,
+ "PROCEDURE ADD_COL () IS\n"
+ "BEGIN\n"
+ "INSERT INTO SYS_COLUMNS VALUES"
+ "(:id,:pos,:name,:mtype,:prtype,:len,:base);\n"
+ "END;\n", FALSE, trx)) {
+ my_error(ER_INTERNAL_ERROR, MYF(0),
+ "InnoDB: Insert into SYS_COLUMNS failed");
+ return true;
+ }
+
+ return false;
+}
+
+/** Update the InnoDB SYS_COLUMNS and SYS_VIRTUAL tables with a new virtual column
+@param[in] table InnoDB table
+@param[in] col_name column name
+@param[in] vcol virtual column
+@param[in] trx transaction
+@retval false on success
+@retval true on failure (my_error() will have been called) */
+static bool innobase_add_one_virtual(
+ const dict_table_t* table,
+ const char* col_name,
+ dict_v_col_t* vcol,
+ trx_t* trx)
+{
+ ulint pos = dict_create_v_col_pos(vcol->v_pos,
+ vcol->m_col.ind);
+
+ if (innodb_insert_sys_columns(table->id, pos, col_name,
+ vcol->m_col.mtype, vcol->m_col.prtype,
+ vcol->m_col.len, vcol->num_base, trx)) {
+ return true;
+ }
+
+ for (unsigned i = 0; i < vcol->num_base; i++) {
+ if (innobase_insert_sys_virtual(
+ table, pos, vcol->base_col[i]->ind, trx)) {
+ return true;
+ }
+ }
+
+ return false;
+}
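+
+/* For illustration: dict_create_v_col_pos() packs the virtual column
+sequence number and the overall column position into the single
+SYS_COLUMNS.POS value inserted above, and SYS_VIRTUAL then receives
+one row for each base column that the virtual column depends on. */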
+
+/** Update SYS_TABLES.N_COLS in the data dictionary.
+@param[in] table InnoDB table
+@param[in] n the new value of SYS_TABLES.N_COLS
+@param[in] trx transaction
+@return whether the operation failed */
+static bool innodb_update_cols(const dict_table_t* table, ulint n, trx_t* trx)
+{
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_int4_literal(info, "n", n);
+ pars_info_add_ull_literal(info, "id", table->id);
+
+ if (DB_SUCCESS != que_eval_sql(info,
+ "PROCEDURE UPDATE_N_COLS () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES SET N_COLS = :n"
+ " WHERE ID = :id;\n"
+ "END;\n", FALSE, trx)) {
+ my_error(ER_INTERNAL_ERROR, MYF(0),
+ "InnoDB: Updating SYS_TABLES.N_COLS failed");
+ return true;
+ }
+
+ return false;
+}
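+
+/* For illustration: the value bound to :n above is not a plain column
+count; as in the caller below, it is dict_table_encode_n_col(n_cols,
+n_v_cols) with the DICT_TF_COMPACT flag shifted into the most
+significant bit, matching the encoding of SYS_TABLES.N_COLS. */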
+
+/** Update system table for adding virtual column(s)
+@param[in] ha_alter_info Data used during in-place alter
+@param[in] user_table InnoDB table
+@param[in] trx transaction
+@retval true Failure
+@retval false Success */
+static
+bool
+innobase_add_virtual_try(
+ const Alter_inplace_info* ha_alter_info,
+ const dict_table_t* user_table,
+ trx_t* trx)
+{
+ ha_innobase_inplace_ctx* ctx = static_cast<ha_innobase_inplace_ctx*>(
+ ha_alter_info->handler_ctx);
+
+ for (ulint i = 0; i < ctx->num_to_add_vcol; i++) {
+ if (innobase_add_one_virtual(
+ user_table, ctx->add_vcol_name[i],
+ &ctx->add_vcol[i], trx)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/** Delete metadata from SYS_COLUMNS and SYS_VIRTUAL.
+@param[in] id table id
+@param[in] pos first SYS_COLUMNS.POS
+@param[in,out] trx data dictionary transaction
+@retval true Failure
+@retval false Success. */
+static bool innobase_instant_drop_cols(table_id_t id, ulint pos, trx_t* trx)
+{
+ pars_info_t* info = pars_info_create();
+ pars_info_add_ull_literal(info, "id", id);
+ pars_info_add_int4_literal(info, "pos", pos);
+
+ dberr_t err = que_eval_sql(
+ info,
+ "PROCEDURE DELETE_COL () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_COLUMNS WHERE\n"
+ "TABLE_ID = :id AND POS >= :pos;\n"
+ "DELETE FROM SYS_VIRTUAL WHERE TABLE_ID = :id;\n"
+ "END;\n", FALSE, trx);
+ if (err != DB_SUCCESS) {
+ my_error(ER_INTERNAL_ERROR, MYF(0),
+ "InnoDB: DELETE from SYS_COLUMNS/SYS_VIRTUAL failed");
+ return true;
+ }
+
+ return false;
+}
+
+/** Update the InnoDB SYS_COLUMNS table with a virtual column's new position
+@param[in] table InnoDB table
+@param[in] old_pos old position
+@param[in] new_pos new position
+@param[in] trx transaction
+@return DB_SUCCESS if successful, otherwise error code */
+static
+dberr_t
+innobase_update_v_pos_sys_columns(
+ const dict_table_t* table,
+ ulint old_pos,
+ ulint new_pos,
+ trx_t* trx)
+{
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_int4_literal(info, "pos", old_pos);
+ pars_info_add_int4_literal(info, "val", new_pos);
+ pars_info_add_ull_literal(info, "id", table->id);
+
+ dberr_t error = que_eval_sql(
+ info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_COLUMNS\n"
+ "SET POS = :val\n"
+ "WHERE POS = :pos\n"
+ "AND TABLE_ID = :id;\n"
+ "END;\n",
+ FALSE, trx);
+
+ return(error);
+}
+
+/** Update the InnoDB SYS_VIRTUAL table with a virtual column's new position
+@param[in] table InnoDB table
+@param[in] old_pos old position
+@param[in] new_pos new position
+@param[in] trx transaction
+@return DB_SUCCESS if successful, otherwise error code */
+static
+dberr_t
+innobase_update_v_pos_sys_virtual(
+ const dict_table_t* table,
+ ulint old_pos,
+ ulint new_pos,
+ trx_t* trx)
+{
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_int4_literal(info, "pos", old_pos);
+ pars_info_add_int4_literal(info, "val", new_pos);
+ pars_info_add_ull_literal(info, "id", table->id);
+
+ dberr_t error = que_eval_sql(
+ info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_VIRTUAL\n"
+ "SET POS = :val\n"
+ "WHERE POS = :pos\n"
+ "AND TABLE_ID = :id;\n"
+ "END;\n",
+ FALSE, trx);
+
+ return(error);
+}
+
+/** Update InnoDB system tables on dropping a virtual column
+@param[in] table InnoDB table
+@param[in] col_name name of the column being dropped
+@param[in] drop_col column information for the column being dropped
+@param[in] n_prev_dropped number of previously dropped columns in the
+ same ALTER clause
+@param[in] trx transaction
+@return DB_SUCCESS if successful, otherwise error code */
+static
+dberr_t
+innobase_drop_one_virtual_sys_columns(
+ const dict_table_t* table,
+ const char* col_name,
+ dict_col_t* drop_col,
+ ulint n_prev_dropped,
+ trx_t* trx)
+{
+ pars_info_t* info = pars_info_create();
+ pars_info_add_ull_literal(info, "id", table->id);
+
+ pars_info_add_str_literal(info, "name", col_name);
+
+ dberr_t error = que_eval_sql(
+ info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_COLUMNS\n"
+ "WHERE TABLE_ID = :id\n"
+ "AND NAME = :name;\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ return(error);
+ }
+
+ dict_v_col_t* v_col = dict_table_get_nth_v_col_mysql(
+ table, drop_col->ind);
+
+ /* Adjust column positions for all subsequent columns */
+ for (ulint i = v_col->v_pos + 1; i < table->n_v_cols; i++) {
+ dict_v_col_t* t_col = dict_table_get_nth_v_col(table, i);
+ ulint old_p = dict_create_v_col_pos(
+ t_col->v_pos - n_prev_dropped,
+ t_col->m_col.ind - n_prev_dropped);
+ ulint new_p = dict_create_v_col_pos(
+ t_col->v_pos - 1 - n_prev_dropped,
+ ulint(t_col->m_col.ind) - 1 - n_prev_dropped);
+
+ error = innobase_update_v_pos_sys_columns(
+ table, old_p, new_p, trx);
+ if (error != DB_SUCCESS) {
+ return(error);
+ }
+ error = innobase_update_v_pos_sys_virtual(
+ table, old_p, new_p, trx);
+ if (error != DB_SUCCESS) {
+ return(error);
+ }
+ }
+
+ return(error);
+}
+
+/** Delete a virtual column's info from the InnoDB SYS_VIRTUAL table
+@param[in] table InnoDB table
+@param[in] pos position of the virtual column to be deleted
+@param[in] trx transaction
+@return DB_SUCCESS if successful, otherwise error code */
+static
+dberr_t
+innobase_drop_one_virtual_sys_virtual(
+ const dict_table_t* table,
+ ulint pos,
+ trx_t* trx)
+{
+ pars_info_t* info = pars_info_create();
+ pars_info_add_ull_literal(info, "id", table->id);
+
+ pars_info_add_int4_literal(info, "pos", pos);
+
+ dberr_t error = que_eval_sql(
+ info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_VIRTUAL\n"
+ "WHERE TABLE_ID = :id\n"
+ "AND POS = :pos;\n"
+ "END;\n",
+ FALSE, trx);
+
+ return(error);
+}
+
+/** Update system table for dropping virtual column(s)
+@param[in] ha_alter_info Data used during in-place alter
+@param[in] user_table InnoDB table
+@param[in] trx transaction
+@retval true Failure
+@retval false Success */
+static
+bool
+innobase_drop_virtual_try(
+ const Alter_inplace_info* ha_alter_info,
+ const dict_table_t* user_table,
+ trx_t* trx)
+{
+ ha_innobase_inplace_ctx* ctx;
+ dberr_t err = DB_SUCCESS;
+
+ ctx = static_cast<ha_innobase_inplace_ctx*>
+ (ha_alter_info->handler_ctx);
+
+ for (unsigned i = 0; i < ctx->num_to_drop_vcol; i++) {
+
+ ulint pos = dict_create_v_col_pos(
+ ctx->drop_vcol[i].v_pos - i,
+ ctx->drop_vcol[i].m_col.ind - i);
+ err = innobase_drop_one_virtual_sys_virtual(
+ user_table, pos, trx);
+
+ if (err != DB_SUCCESS) {
+ my_error(ER_INTERNAL_ERROR, MYF(0),
+ "InnoDB: DROP COLUMN...VIRTUAL");
+ return(true);
+ }
+
+ err = innobase_drop_one_virtual_sys_columns(
+ user_table, ctx->drop_vcol_name[i],
+ &(ctx->drop_vcol[i].m_col), i, trx);
+
+ if (err != DB_SUCCESS) {
+ my_error(ER_INTERNAL_ERROR, MYF(0),
+ "InnoDB: DROP COLUMN...VIRTUAL");
+ return(true);
+ }
+ }
+
+ return false;
+}
+
+/** Serialise metadata of dropped or reordered columns.
+@param[in,out] heap memory heap for allocation
+@param[out] field data field with the metadata */
+inline
+void dict_table_t::serialise_columns(mem_heap_t* heap, dfield_t* field) const
+{
+ DBUG_ASSERT(instant);
+ const dict_index_t& index = *UT_LIST_GET_FIRST(indexes);
+ unsigned n_fixed = index.first_user_field();
+ unsigned num_non_pk_fields = index.n_fields - n_fixed;
+
+ ulint len = 4 + num_non_pk_fields * 2;
+
+ byte* data = static_cast<byte*>(mem_heap_alloc(heap, len));
+
+ dfield_set_data(field, data, len);
+
+ mach_write_to_4(data, num_non_pk_fields);
+
+ data += 4;
+
+ for (ulint i = n_fixed; i < index.n_fields; i++) {
+ mach_write_to_2(data, instant->field_map[i - n_fixed]);
+ data += 2;
+ }
+}
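+
+/* For illustration: the metadata BLOB written above consists of a
+4-byte big-endian count of the non-PK fields followed by one 2-byte
+field_map entry per such field, so its length is always
+4 + 2 * num_non_pk_fields bytes. */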
+
+/** Construct the metadata record for instant ALTER TABLE.
+@param[in] row dummy or default values for existing columns
+@param[in,out] heap memory heap for allocations
+@return metadata record */
+inline
+dtuple_t*
+dict_index_t::instant_metadata(const dtuple_t& row, mem_heap_t* heap) const
+{
+ ut_ad(is_primary());
+ dtuple_t* entry;
+
+ if (!table->instant) {
+ entry = row_build_index_entry(&row, NULL, this, heap);
+ entry->info_bits = REC_INFO_METADATA_ADD;
+ return entry;
+ }
+
+ entry = dtuple_create(heap, n_fields + 1);
+ entry->n_fields_cmp = n_uniq;
+ entry->info_bits = REC_INFO_METADATA_ALTER;
+
+ const dict_field_t* field = fields;
+
+ for (uint i = 0; i <= n_fields; i++, field++) {
+ dfield_t* dfield = dtuple_get_nth_field(entry, i);
+
+ if (i == first_user_field()) {
+ table->serialise_columns(heap, dfield);
+ dfield->type.metadata_blob_init();
+ field--;
+ continue;
+ }
+
+ ut_ad(!field->col->is_virtual());
+
+ if (field->col->is_dropped()) {
+ dict_col_copy_type(field->col, &dfield->type);
+ if (field->col->is_nullable()) {
+ dfield_set_null(dfield);
+ } else {
+ dfield_set_data(dfield, field_ref_zero,
+ field->fixed_len);
+ }
+ continue;
+ }
+
+ const dfield_t* s = dtuple_get_nth_field(&row, field->col->ind);
+ ut_ad(dict_col_type_assert_equal(field->col, &s->type));
+ *dfield = *s;
+
+ if (dfield_is_null(dfield)) {
+ continue;
+ }
+
+ if (dfield_is_ext(dfield)) {
+ ut_ad(i > first_user_field());
+ ut_ad(!field->prefix_len);
+ ut_ad(dfield->len >= FIELD_REF_SIZE);
+ dfield_set_len(dfield, dfield->len - FIELD_REF_SIZE);
+ }
+
+ if (!field->prefix_len) {
+ continue;
+ }
+
+ ut_ad(field->col->ord_part);
+ ut_ad(i < n_uniq);
+
+ ulint len = dtype_get_at_most_n_mbchars(
+ field->col->prtype,
+ field->col->mbminlen, field->col->mbmaxlen,
+ field->prefix_len, dfield->len,
+ static_cast<char*>(dfield_get_data(dfield)));
+ dfield_set_len(dfield, len);
+ }
+
+ return entry;
+}
+
+/** Insert or update SYS_COLUMNS and the hidden metadata record
+for instant ALTER TABLE.
+@param[in] ha_alter_info ALTER TABLE context
+@param[in,out] ctx ALTER TABLE context for the current partition
+@param[in] altered_table MySQL table that is being altered
+@param[in] table MySQL table as it is before the ALTER operation
+@param[in,out] trx dictionary transaction
+@retval true failure
+@retval false success */
+static bool innobase_instant_try(
+ const Alter_inplace_info* ha_alter_info,
+ ha_innobase_inplace_ctx* ctx,
+ const TABLE* altered_table,
+ const TABLE* table,
+ trx_t* trx)
+{
+ DBUG_ASSERT(!ctx->need_rebuild());
+ DBUG_ASSERT(ctx->is_instant());
+
+ dict_table_t* user_table = ctx->old_table;
+
+ dict_index_t* index = dict_table_get_first_index(user_table);
+ const unsigned n_old_fields = index->n_fields;
+ const dict_col_t* old_cols = user_table->cols;
+ DBUG_ASSERT(user_table->n_cols == ctx->old_n_cols);
+
+ const bool metadata_changed = ctx->instant_column();
+
+ DBUG_ASSERT(index->n_fields >= n_old_fields);
+ /* The table may have been emptied and may have lost its
+ 'instantness' during this ALTER TABLE. */
+
+ /* Construct a table row of default values for the stored columns. */
+ dtuple_t* row = dtuple_create(ctx->heap, user_table->n_cols);
+ dict_table_copy_types(row, user_table);
+ Field** af = altered_table->field;
+ Field** const end = altered_table->field + altered_table->s->fields;
+ ut_d(List_iterator_fast<Create_field> cf_it(
+ ha_alter_info->alter_info->create_list));
+ if (ctx->first_alter_pos
+ && innobase_instant_drop_cols(user_table->id,
+ ctx->first_alter_pos - 1, trx)) {
+ return true;
+ }
+ for (uint i = 0; af < end; af++) {
+ if (!(*af)->stored_in_db()) {
+ ut_d(cf_it++);
+ continue;
+ }
+
+ const dict_col_t* old = dict_table_t::find(old_cols,
+ ctx->col_map,
+ ctx->old_n_cols, i);
+ DBUG_ASSERT(!old || i >= ctx->old_n_cols - DATA_N_SYS_COLS
+ || old->ind == i
+ || (ctx->first_alter_pos
+ && old->ind >= ctx->first_alter_pos - 1));
+
+ dfield_t* d = dtuple_get_nth_field(row, i);
+ const dict_col_t* col = dict_table_get_nth_col(user_table, i);
+ DBUG_ASSERT(!col->is_virtual());
+ DBUG_ASSERT(!col->is_dropped());
+ DBUG_ASSERT(col->mtype != DATA_SYS);
+ DBUG_ASSERT(!strcmp((*af)->field_name.str,
+ dict_table_get_col_name(user_table, i)));
+ DBUG_ASSERT(old || col->is_added());
+
+ ut_d(const Create_field* new_field = cf_it++);
+ /* new_field->field would point to an existing column.
+ If it is NULL, the column was added by this ALTER TABLE. */
+ ut_ad(!new_field->field == !old);
+
+ if (col->is_added()) {
+ dfield_set_data(d, col->def_val.data,
+ col->def_val.len);
+ } else if ((*af)->real_maybe_null()) {
+ /* Store NULL for nullable 'core' columns. */
+ dfield_set_null(d);
+ } else {
+ switch ((*af)->type()) {
+ case MYSQL_TYPE_VARCHAR:
+ case MYSQL_TYPE_GEOMETRY:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ variable_length:
+ /* Store the empty string for 'core'
+ variable-length NOT NULL columns. */
+ dfield_set_data(d, field_ref_zero, 0);
+ break;
+ case MYSQL_TYPE_STRING:
+ if (col->mbminlen != col->mbmaxlen
+ && user_table->not_redundant()) {
+ goto variable_length;
+ }
+ /* fall through */
+ default:
+ /* For fixed-length NOT NULL 'core' columns,
+ get a dummy default value from SQL. Note that
+ we will preserve the old values of these
+ columns when updating the metadata
+ record, to avoid unnecessary updates. */
+ ulint len = (*af)->pack_length();
+ DBUG_ASSERT(d->type.mtype != DATA_INT
+ || len <= 8);
+ row_mysql_store_col_in_innobase_format(
+ d, d->type.mtype == DATA_INT
+ ? static_cast<byte*>(
+ mem_heap_alloc(ctx->heap, len))
+ : NULL, true, (*af)->ptr, len,
+ dict_table_is_comp(user_table));
+ ut_ad(new_field->field->pack_length() == len);
+ }
+ }
+
+ bool update = old && (!ctx->first_alter_pos
+ || i < ctx->first_alter_pos - 1);
+ DBUG_ASSERT(!old || col->same_format(*old));
+ if (update
+ && old->prtype == d->type.prtype) {
+ /* The record is already present in SYS_COLUMNS. */
+ } else if (innodb_insert_sys_columns(user_table->id, i,
+ (*af)->field_name.str,
+ d->type.mtype,
+ d->type.prtype,
+ d->type.len, 0, trx,
+ update)) {
+ return true;
+ }
+
+ i++;
+ }
+
+ if (innodb_update_cols(user_table, dict_table_encode_n_col(
+ unsigned(user_table->n_cols)
+ - DATA_N_SYS_COLS,
+ user_table->n_v_cols)
+ | (user_table->flags & DICT_TF_COMPACT) << 31,
+ trx)) {
+ return true;
+ }
+
+ if (ctx->first_alter_pos) {
+add_all_virtual:
+ for (uint i = 0; i < user_table->n_v_cols; i++) {
+ if (innobase_add_one_virtual(
+ user_table,
+ dict_table_get_v_col_name(user_table, i),
+ &user_table->v_cols[i], trx)) {
+ return true;
+ }
+ }
+ } else if (ha_alter_info->handler_flags & ALTER_DROP_VIRTUAL_COLUMN) {
+ if (innobase_instant_drop_cols(user_table->id, 65536, trx)) {
+ return true;
+ }
+ goto add_all_virtual;
+ } else if ((ha_alter_info->handler_flags & ALTER_ADD_VIRTUAL_COLUMN)
+ && innobase_add_virtual_try(ha_alter_info, user_table,
+ trx)) {
+ return true;
+ }
+
+ if (!user_table->space) {
+ /* In case of ALTER TABLE...DISCARD TABLESPACE,
+ update only the metadata and transform the dictionary
+ cache entry to the canonical format. */
+ index->clear_instant_alter();
+ return false;
+ }
+
+ unsigned i = unsigned(user_table->n_cols) - DATA_N_SYS_COLS;
+ DBUG_ASSERT(i >= altered_table->s->stored_fields);
+ DBUG_ASSERT(i <= altered_table->s->stored_fields + 1);
+ if (i > altered_table->s->fields) {
+ const dict_col_t& fts_doc_id = user_table->cols[i - 1];
+ DBUG_ASSERT(!strcmp(fts_doc_id.name(*user_table),
+ FTS_DOC_ID_COL_NAME));
+ DBUG_ASSERT(!fts_doc_id.is_nullable());
+ DBUG_ASSERT(fts_doc_id.len == 8);
+ dfield_set_data(dtuple_get_nth_field(row, i - 1),
+ field_ref_zero, fts_doc_id.len);
+ }
+ byte trx_id[DATA_TRX_ID_LEN], roll_ptr[DATA_ROLL_PTR_LEN];
+ dfield_set_data(dtuple_get_nth_field(row, i++), field_ref_zero,
+ DATA_ROW_ID_LEN);
+ dfield_set_data(dtuple_get_nth_field(row, i++), trx_id, sizeof trx_id);
+ dfield_set_data(dtuple_get_nth_field(row, i),roll_ptr,sizeof roll_ptr);
+ DBUG_ASSERT(i + 1 == user_table->n_cols);
+
+ trx_write_trx_id(trx_id, trx->id);
+ /* The DB_ROLL_PTR will be assigned later, when allocating undo log.
+ Silence a Valgrind warning in dtuple_validate() when
+ row_ins_clust_index_entry_low() searches for the insert position. */
+ memset(roll_ptr, 0, sizeof roll_ptr);
+
+ dtuple_t* entry = index->instant_metadata(*row, ctx->heap);
+ mtr_t mtr;
+ mtr.start();
+ index->set_modified(mtr);
+ btr_pcur_t pcur;
+ btr_pcur_open_at_index_side(true, index, BTR_MODIFY_TREE, &pcur, true,
+ 0, &mtr);
+ ut_ad(btr_pcur_is_before_first_on_page(&pcur));
+ btr_pcur_move_to_next_on_page(&pcur);
+
+ buf_block_t* block = btr_pcur_get_block(&pcur);
+ ut_ad(page_is_leaf(block->frame));
+ ut_ad(!page_has_prev(block->frame));
+ ut_ad(!buf_block_get_page_zip(block));
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ que_thr_t* thr = pars_complete_graph_for_exec(
+ NULL, trx, ctx->heap, NULL);
+ const bool is_root = block->page.id().page_no() == index->page;
+
+ dberr_t err = DB_SUCCESS;
+ if (rec_is_metadata(rec, *index)) {
+ ut_ad(page_rec_is_user_rec(rec));
+ if (is_root
+ && !rec_is_alter_metadata(rec, *index)
+ && !index->table->instant
+ && !page_has_next(block->frame)
+ && page_rec_is_last(rec, block->frame)) {
+ goto empty_table;
+ }
+
+ if (!metadata_changed) {
+ goto func_exit;
+ }
+
+ /* Ensure that the root page is in the correct format. */
+ buf_block_t* root = btr_root_block_get(index, RW_X_LATCH,
+ &mtr);
+ DBUG_ASSERT(root);
+ if (fil_page_get_type(root->frame) != FIL_PAGE_TYPE_INSTANT) {
+ DBUG_ASSERT("wrong page type" == 0);
+ err = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ btr_set_instant(root, *index, &mtr);
+
+ /* Extend the record with any added columns. */
+ uint n = uint(index->n_fields) - n_old_fields;
+ /* Reserve room for DB_TRX_ID,DB_ROLL_PTR and any
+ non-updated off-page columns in case they are moved off
+ page as a result of the update. */
+ const uint16_t f = user_table->instant != NULL;
+ upd_t* update = upd_create(index->n_fields + f, ctx->heap);
+ update->n_fields = n + f;
+ update->info_bits = f
+ ? REC_INFO_METADATA_ALTER
+ : REC_INFO_METADATA_ADD;
+ if (f) {
+ upd_field_t* uf = upd_get_nth_field(update, 0);
+ uf->field_no = index->first_user_field();
+ uf->new_val = entry->fields[uf->field_no];
+ DBUG_ASSERT(!dfield_is_ext(&uf->new_val));
+ DBUG_ASSERT(!dfield_is_null(&uf->new_val));
+ }
+
+ /* Add the default values for instantly added columns */
+ unsigned j = f;
+
+ for (unsigned k = n_old_fields; k < index->n_fields; k++) {
+ upd_field_t* uf = upd_get_nth_field(update, j++);
+ uf->field_no = static_cast<uint16_t>(k + f);
+ uf->new_val = entry->fields[k + f];
+
+ ut_ad(j <= n + f);
+ }
+
+ ut_ad(j == n + f);
+
+ rec_offs* offsets = NULL;
+ mem_heap_t* offsets_heap = NULL;
+ big_rec_t* big_rec;
+ err = btr_cur_pessimistic_update(
+ BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG,
+ btr_pcur_get_btr_cur(&pcur),
+ &offsets, &offsets_heap, ctx->heap,
+ &big_rec, update, UPD_NODE_NO_ORD_CHANGE,
+ thr, trx->id, &mtr);
+
+ offsets = rec_get_offsets(
+ btr_pcur_get_rec(&pcur), index, offsets,
+ index->n_core_fields, ULINT_UNDEFINED, &offsets_heap);
+ if (big_rec) {
+ if (err == DB_SUCCESS) {
+ err = btr_store_big_rec_extern_fields(
+ &pcur, offsets, big_rec, &mtr,
+ BTR_STORE_UPDATE);
+ }
+
+ dtuple_big_rec_free(big_rec);
+ }
+ if (offsets_heap) {
+ mem_heap_free(offsets_heap);
+ }
+ btr_pcur_close(&pcur);
+ goto func_exit;
+ } else if (is_root && page_rec_is_supremum(rec)
+ && !index->table->instant) {
+empty_table:
+ /* The table is empty. */
+ ut_ad(fil_page_index_page_check(block->frame));
+ ut_ad(!page_has_siblings(block->frame));
+ ut_ad(block->page.id().page_no() == index->page);
+ /* MDEV-17383: free metadata BLOBs! */
+ btr_page_empty(block, NULL, index, 0, &mtr);
+ if (index->is_instant()) {
+ index->clear_instant_add();
+ }
+ goto func_exit;
+ } else if (!user_table->is_instant()) {
+ ut_ad(!user_table->not_redundant());
+ goto func_exit;
+ }
+
+ /* Convert the table to the instant ALTER TABLE format. */
+ mtr.commit();
+ mtr.start();
+ index->set_modified(mtr);
+ if (buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr)) {
+ if (fil_page_get_type(root->frame) != FIL_PAGE_INDEX) {
+ DBUG_ASSERT("wrong page type" == 0);
+ goto err_exit;
+ }
+
+ btr_set_instant(root, *index, &mtr);
+ mtr.commit();
+ mtr.start();
+ index->set_modified(mtr);
+ err = row_ins_clust_index_entry_low(
+ BTR_NO_LOCKING_FLAG, BTR_MODIFY_TREE, index,
+ index->n_uniq, entry, 0, thr);
+ } else {
+err_exit:
+ err = DB_CORRUPTION;
+ }
+
+func_exit:
+ mtr.commit();
+
+ if (err != DB_SUCCESS) {
+ my_error_innodb(err, table->s->table_name.str,
+ user_table->flags);
+ return true;
+ }
+
+ return false;
+}
+
+/** Adjust the create-index column number from the "new table" to the
+"old InnoDB table" while dropping a virtual column, because we do not
+create a separate new table for dropping/adding virtual columns.
+To correctly find the indexed column, we need to look up its col_no
+in the "old table", not the "new table".
+@param[in] ha_alter_info Data used during in-place alter
+@param[in] old_table MySQL table as it is before the ALTER operation
+@param[in] num_v_dropped number of virtual column dropped
+@param[in,out] index_def index definition */
+static
+void
+innodb_v_adjust_idx_col(
+ const Alter_inplace_info* ha_alter_info,
+ const TABLE* old_table,
+ ulint num_v_dropped,
+ index_def_t* index_def)
+{
+ for (ulint i = 0; i < index_def->n_fields; i++) {
+#ifdef UNIV_DEBUG
+ bool col_found = false;
+#endif /* UNIV_DEBUG */
+ ulint num_v = 0;
+
+ index_field_t* index_field = &index_def->fields[i];
+
+ /* Only adjust virtual column col_no, since non-virtual
+ column position (in non-vcol list) won't change unless
+ table rebuild */
+ if (!index_field->is_v_col) {
+ continue;
+ }
+
+ const Field* field = NULL;
+
+ /* Find the field in the new table */
+ for (const Create_field& new_field :
+ ha_alter_info->alter_info->create_list) {
+ if (new_field.stored_in_db()) {
+ continue;
+ }
+
+ field = new_field.field;
+
+ if (num_v == index_field->col_no) {
+ break;
+ }
+ num_v++;
+ }
+
+ if (!field) {
+ /* The field is a newly added one. Adding and
+ dropping virtual columns in the same statement
+ should have been blocked earlier. */
+ ut_ad(num_v_dropped > 0);
+ ut_a(0);
+ }
+
+ ut_ad(!field->stored_in_db());
+
+ num_v = 0;
+
+ /* Look for its position in old table */
+ for (uint old_i = 0; old_table->field[old_i]; old_i++) {
+ if (old_table->field[old_i] == field) {
+ /* Found it, adjust its col_no to its position
+ in old table */
+ index_def->fields[i].col_no = num_v;
+ ut_d(col_found = true);
+ break;
+ }
+
+ num_v += !old_table->field[old_i]->stored_in_db();
+ }
+
+ ut_ad(col_found);
+ }
+}
+
+/** Create index metadata in the data dictionary.
+@param[in,out] trx dictionary transaction
+@param[in,out] index index being created
+@param[in] add_v virtual columns that are being added, or NULL
+@return the created index */
+MY_ATTRIBUTE((nonnull(1,2), warn_unused_result))
+static
+dict_index_t*
+create_index_dict(
+ trx_t* trx,
+ dict_index_t* index,
+ const dict_add_v_col_t* add_v)
+{
+ DBUG_ENTER("create_index_dict");
+
+ mem_heap_t* heap = mem_heap_create(512);
+ ind_node_t* node = ind_create_graph_create(
+ index, index->table->name.m_name, heap, add_v);
+ que_thr_t* thr = pars_complete_graph_for_exec(node, trx, heap, NULL);
+
+ que_fork_start_command(
+ static_cast<que_fork_t*>(que_node_get_parent(thr)));
+
+ que_run_threads(thr);
+
+ DBUG_ASSERT(trx->error_state != DB_SUCCESS || index != node->index);
+ DBUG_ASSERT(trx->error_state != DB_SUCCESS || node->index);
+ index = node->index;
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ DBUG_RETURN(index);
+}
+
+/** Update internal structures with concurrent writes blocked,
+while preparing ALTER TABLE.
+
+@param ha_alter_info Data used during in-place alter
+@param altered_table MySQL table that is being altered
+@param old_table MySQL table as it is before the ALTER operation
+@param table_name Table name in MySQL
+@param flags Table and tablespace flags
+@param flags2 Additional table flags
+@param fts_doc_id_col The column number of FTS_DOC_ID
+@param add_fts_doc_id Flag: add column FTS_DOC_ID?
+@param add_fts_doc_id_idx Flag: add index FTS_DOC_ID_INDEX (FTS_DOC_ID)?
+
+@retval true Failure
+@retval false Success
+*/
+static MY_ATTRIBUTE((warn_unused_result, nonnull(1,2,3,4)))
+bool
+prepare_inplace_alter_table_dict(
+/*=============================*/
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* altered_table,
+ const TABLE* old_table,
+ const char* table_name,
+ ulint flags,
+ ulint flags2,
+ ulint fts_doc_id_col,
+ bool add_fts_doc_id,
+ bool add_fts_doc_id_idx)
+{
+ bool dict_locked = false;
+ ulint* add_key_nums; /* MySQL key numbers */
+ index_def_t* index_defs; /* index definitions */
+ dict_table_t* user_table;
+ dict_index_t* fts_index = NULL;
+ bool new_clustered = false;
+ dberr_t error;
+ ulint num_fts_index;
+ dict_add_v_col_t* add_v = NULL;
+ ha_innobase_inplace_ctx*ctx;
+
+ DBUG_ENTER("prepare_inplace_alter_table_dict");
+
+ ctx = static_cast<ha_innobase_inplace_ctx*>
+ (ha_alter_info->handler_ctx);
+
+ DBUG_ASSERT((ctx->add_autoinc != ULINT_UNDEFINED)
+ == (ctx->sequence.max_value() > 0));
+ DBUG_ASSERT(!ctx->num_to_drop_index == !ctx->drop_index);
+ DBUG_ASSERT(!ctx->num_to_drop_fk == !ctx->drop_fk);
+ DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_id_idx);
+ DBUG_ASSERT(!add_fts_doc_id_idx
+ || innobase_fulltext_exist(altered_table));
+ DBUG_ASSERT(!ctx->defaults);
+ DBUG_ASSERT(!ctx->add_index);
+ DBUG_ASSERT(!ctx->add_key_numbers);
+ DBUG_ASSERT(!ctx->num_to_add_index);
+
+ user_table = ctx->new_table;
+
+ switch (ha_alter_info->inplace_supported) {
+ default: break;
+ case HA_ALTER_INPLACE_INSTANT:
+ case HA_ALTER_INPLACE_NOCOPY_LOCK:
+ case HA_ALTER_INPLACE_NOCOPY_NO_LOCK:
+ /* If we promised ALGORITHM=NOCOPY or ALGORITHM=INSTANT,
+ we must retain the original ROW_FORMAT of the table. */
+ flags = (user_table->flags & (DICT_TF_MASK_COMPACT
+ | DICT_TF_MASK_ATOMIC_BLOBS))
+ | (flags & ~(DICT_TF_MASK_COMPACT
+ | DICT_TF_MASK_ATOMIC_BLOBS));
+ }
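+ /* Illustrative example (not in the original source): if the
+ old table is ROW_FORMAT=COMPACT and the ALTER requests
+ ROW_FORMAT=DYNAMIC, the expression above keeps the old
+ DICT_TF_MASK_COMPACT and DICT_TF_MASK_ATOMIC_BLOBS bits and
+ takes all other bits from the new flags, so the table keeps
+ ROW_FORMAT=COMPACT as promised for NOCOPY/INSTANT. */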
+
+ trx_start_if_not_started_xa(ctx->prebuilt->trx, true);
+
+ if (ha_alter_info->handler_flags
+ & ALTER_DROP_VIRTUAL_COLUMN) {
+ if (prepare_inplace_drop_virtual(ha_alter_info, old_table)) {
+ DBUG_RETURN(true);
+ }
+ }
+
+ if (ha_alter_info->handler_flags
+ & ALTER_ADD_VIRTUAL_COLUMN) {
+ if (prepare_inplace_add_virtual(
+ ha_alter_info, altered_table, old_table)) {
+ DBUG_RETURN(true);
+ }
+
+ /* Collect information about the newly added virtual
+ columns, which is needed for creating indexes on them */
+
+ if (ha_alter_info->handler_flags
+ & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) {
+ for (ulint i = 0; i < ctx->num_to_add_vcol; i++) {
+ /* Set mbminmax for newly added column */
+ dict_col_t& col = ctx->add_vcol[i].m_col;
+ unsigned mbminlen, mbmaxlen;
+ dtype_get_mblen(col.mtype, col.prtype,
+ &mbminlen, &mbmaxlen);
+ col.mbminlen = mbminlen & 7;
+ col.mbmaxlen = mbmaxlen & 7;
+ }
+ add_v = static_cast<dict_add_v_col_t*>(
+ mem_heap_alloc(ctx->heap, sizeof *add_v));
+ add_v->n_v_col = ctx->num_to_add_vcol;
+ add_v->v_col = ctx->add_vcol;
+ add_v->v_col_name = ctx->add_vcol_name;
+ }
+ }
+
+ /* There should be no order change for virtual columns coming in
+ here */
+ ut_ad(check_v_col_in_order(old_table, altered_table, ha_alter_info));
+
+ /* Create table containing all indexes to be built in this
+ ALTER TABLE ADD INDEX so that they are in the correct order
+ in the table. */
+
+ ctx->num_to_add_index = ha_alter_info->index_add_count;
+
+ ut_ad(ctx->prebuilt->trx->mysql_thd != NULL);
+ const char* path = thd_innodb_tmpdir(
+ ctx->prebuilt->trx->mysql_thd);
+
+ index_defs = ctx->create_key_defs(
+ ha_alter_info, altered_table,
+ num_fts_index,
+ fts_doc_id_col, add_fts_doc_id, add_fts_doc_id_idx,
+ old_table);
+
+ new_clustered = (DICT_CLUSTERED & index_defs[0].ind_type) != 0;
+
+ create_table_info_t info(ctx->prebuilt->trx->mysql_thd, altered_table,
+ ha_alter_info->create_info, NULL, NULL,
+ srv_file_per_table);
+ ut_d(bool stats_wait = false);
+
+ /* The primary index would be rebuilt if a FTS_DOC_ID
+ column is to be added; the primary index definition
+ is just copied from the old table and stored in index_defs[0] */
+ DBUG_ASSERT(!add_fts_doc_id || new_clustered);
+ DBUG_ASSERT(!!new_clustered ==
+ (innobase_need_rebuild(ha_alter_info, old_table)
+ || add_fts_doc_id));
+
+ /* Allocate memory for dictionary index definitions */
+
+ ctx->add_index = static_cast<dict_index_t**>(
+ mem_heap_zalloc(ctx->heap, ctx->num_to_add_index
+ * sizeof *ctx->add_index));
+ ctx->add_key_numbers = add_key_nums = static_cast<ulint*>(
+ mem_heap_alloc(ctx->heap, ctx->num_to_add_index
+ * sizeof *ctx->add_key_numbers));
+
+ /* Acquire a lock on the table before creating any indexes. */
+
+ if (ctx->online) {
+ error = DB_SUCCESS;
+ } else {
+ error = row_merge_lock_table(
+ ctx->prebuilt->trx, ctx->new_table, LOCK_S);
+
+ if (error != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+ }
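+ /* In the online case, no shared table lock is taken here:
+ concurrent DML stays possible, and rows modified while the
+ indexes are being built are applied later from the row log. */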
+
+ /* Create a background transaction for the operations on
+ the data dictionary tables. */
+ ctx->trx = innobase_trx_allocate(ctx->prebuilt->trx->mysql_thd);
+
+ trx_start_for_ddl(ctx->trx, TRX_DICT_OP_INDEX);
+
+ /* Latch the InnoDB data dictionary exclusively so that no deadlocks
+ or lock waits can happen in it during an index create operation. */
+
+ row_mysql_lock_data_dictionary(ctx->trx);
+ dict_locked = true;
+
+ /* Wait for background stats processing to stop using the table that
+ we are going to alter. We know bg stats will not start using it again
+ until we are holding the data dict locked and we are holding it here
+ at least until checking ut_ad(user_table->n_ref_count == 1) below.
+ XXX what may happen if bg stats opens the table after we
+ have unlocked data dictionary below? */
+ dict_stats_wait_bg_to_stop_using_table(user_table, ctx->trx);
+ ut_d(stats_wait = true);
+
+ online_retry_drop_indexes_low(ctx->new_table, ctx->trx);
+
+ ut_d(dict_table_check_for_dup_indexes(
+ ctx->new_table, CHECK_ABORTED_OK));
+
+ DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter",
+ error = DB_OUT_OF_MEMORY;
+ goto error_handling;);
+
+ /* If a new clustered index is defined for the table we need
+ to rebuild the table with a temporary name. */
+
+ if (new_clustered) {
+ if (innobase_check_foreigns(
+ ha_alter_info, old_table,
+ user_table, ctx->drop_fk, ctx->num_to_drop_fk)) {
+new_clustered_failed:
+ DBUG_ASSERT(ctx->trx != ctx->prebuilt->trx);
+ ctx->trx->rollback();
+
+ ut_ad(user_table->get_ref_count() == 1);
+
+ online_retry_drop_indexes_with_trx(
+ user_table, ctx->trx);
+
+ if (ctx->need_rebuild()) {
+ if (ctx->new_table) {
+ ut_ad(!ctx->new_table->cached);
+ dict_mem_table_free(ctx->new_table);
+ }
+ ctx->new_table = ctx->old_table;
+ }
+
+ while (ctx->num_to_add_index--) {
+ if (dict_index_t*& i = ctx->add_index[
+ ctx->num_to_add_index]) {
+ dict_mem_index_free(i);
+ i = NULL;
+ }
+ }
+
+ goto err_exit;
+ }
+
+ size_t prefixlen= strlen(mysql_data_home);
+ if (mysql_data_home[prefixlen-1] != FN_LIBCHAR)
+ prefixlen++;
+ size_t tablen = altered_table->s->path.length - prefixlen;
+ const char* part = ctx->old_table->name.part();
+ size_t partlen = part ? strlen(part) : 0;
+ char* new_table_name = static_cast<char*>(
+ mem_heap_alloc(ctx->heap, tablen + partlen + 1));
+ memcpy(new_table_name,
+ altered_table->s->path.str + prefixlen, tablen);
+#ifdef _WIN32
+ {
+ char *sep= strchr(new_table_name, FN_LIBCHAR);
+ sep[0]= '/';
+ }
+#endif
+ memcpy(new_table_name + tablen, part ? part : "", partlen + 1);
+ ulint n_cols = 0;
+ ulint n_v_cols = 0;
+ dtuple_t* defaults;
+ ulint z = 0;
+
+ for (uint i = 0; i < altered_table->s->fields; i++) {
+ const Field* field = altered_table->field[i];
+
+ if (!field->stored_in_db()) {
+ n_v_cols++;
+ } else {
+ n_cols++;
+ }
+ }
+
+ ut_ad(n_cols + n_v_cols == altered_table->s->fields);
+
+ if (add_fts_doc_id) {
+ n_cols++;
+ DBUG_ASSERT(flags2 & DICT_TF2_FTS);
+ DBUG_ASSERT(add_fts_doc_id_idx);
+ flags2 |= DICT_TF2_FTS_ADD_DOC_ID
+ | DICT_TF2_FTS_HAS_DOC_ID
+ | DICT_TF2_FTS;
+ }
+
+ DBUG_ASSERT(!add_fts_doc_id_idx || (flags2 & DICT_TF2_FTS));
+
+ ctx->new_table = dict_mem_table_create(
+ new_table_name, NULL, n_cols + n_v_cols, n_v_cols,
+ flags, flags2);
+
+ /* The rebuilt indexed_table will use the renamed
+ column names. */
+ ctx->col_names = NULL;
+
+ if (DICT_TF_HAS_DATA_DIR(flags)) {
+ ctx->new_table->data_dir_path =
+ mem_heap_strdup(ctx->new_table->heap,
+ user_table->data_dir_path);
+ }
+
+ for (uint i = 0; i < altered_table->s->fields; i++) {
+ const Field* field = altered_table->field[i];
+ unsigned is_unsigned;
+ auto col_type = get_innobase_type_from_mysql_type(
+ &is_unsigned, field);
+ unsigned field_type = field->type() | is_unsigned;
+ const bool is_virtual = !field->stored_in_db();
+
+ /* we assume in dtype_form_prtype() that this
+ fits in two bytes */
+ ut_a(field_type <= MAX_CHAR_COLL_NUM);
+
+ if (!field->real_maybe_null()) {
+ field_type |= DATA_NOT_NULL;
+ }
+
+ if (field->binary()) {
+ field_type |= DATA_BINARY_TYPE;
+ }
+
+ if (altered_table->versioned()) {
+ if (i == altered_table->s->vers.start_fieldno) {
+ field_type |= DATA_VERS_START;
+ } else if (i ==
+ altered_table->s->vers.end_fieldno) {
+ field_type |= DATA_VERS_END;
+ } else if (!(field->flags
+ & VERS_UPDATE_UNVERSIONED_FLAG)) {
+ field_type |= DATA_VERSIONED;
+ }
+ }
+
+ unsigned charset_no;
+
+ if (dtype_is_string_type(col_type)) {
+ charset_no = field->charset()->number;
+
+ if (charset_no > MAX_CHAR_COLL_NUM) {
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB",
+ field->field_name.str);
+ goto new_clustered_failed;
+ }
+ } else {
+ charset_no = 0;
+ }
+
+ auto col_len = field->pack_length();
+
+ /* The MySQL pack length contains 1 or 2 bytes
+ length field for a true VARCHAR. Let us
+ subtract that, so that the InnoDB column
+ length in the InnoDB data dictionary is the
+ real maximum byte length of the actual data. */
+
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+ uint32 length_bytes
+ = static_cast<const Field_varstring*>(
+ field)->length_bytes;
+
+ col_len -= length_bytes;
+
+ if (length_bytes == 2) {
+ field_type |= DATA_LONG_TRUE_VARCHAR;
+ }
+
+ }
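+ /* Worked example (illustrative): for VARCHAR(300)
+ CHARACTER SET latin1, pack_length() is 302 and
+ length_bytes is 2, so col_len becomes 300 and
+ DATA_LONG_TRUE_VARCHAR is set. */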
+
+ if (dict_col_name_is_reserved(field->field_name.str)) {
+wrong_column_name:
+ dict_mem_table_free(ctx->new_table);
+ ctx->new_table = ctx->old_table;
+ my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+ field->field_name.str);
+ goto new_clustered_failed;
+ }
+
+ /* Note that the FTS_DOC_ID name is case sensitive
+ because of the internal query parser.
+ The FTS_DOC_ID column must be of BIGINT NOT NULL type
+ and its name must be all uppercase */
+ if (!innobase_strcasecmp(field->field_name.str,
+ FTS_DOC_ID_COL_NAME)) {
+ if (col_type != DATA_INT
+ || field->real_maybe_null()
+ || col_len != sizeof(doc_id_t)
+ || strcmp(field->field_name.str,
+ FTS_DOC_ID_COL_NAME)) {
+ goto wrong_column_name;
+ }
+ }
+
+ if (is_virtual) {
+ dict_mem_table_add_v_col(
+ ctx->new_table, ctx->heap,
+ field->field_name.str,
+ col_type,
+ dtype_form_prtype(
+ field_type, charset_no)
+ | DATA_VIRTUAL,
+ col_len, i, 0);
+ } else {
+ dict_mem_table_add_col(
+ ctx->new_table, ctx->heap,
+ field->field_name.str,
+ col_type,
+ dtype_form_prtype(
+ field_type, charset_no),
+ col_len);
+ }
+ }
+
+ if (n_v_cols) {
+ for (uint i = 0; i < altered_table->s->fields; i++) {
+ dict_v_col_t* v_col;
+ const Field* field = altered_table->field[i];
+
+ if (field->stored_in_db()) {
+ continue;
+ }
+ v_col = dict_table_get_nth_v_col(
+ ctx->new_table, z);
+ z++;
+ innodb_base_col_setup(
+ ctx->new_table, field, v_col);
+ }
+ }
+
+ if (add_fts_doc_id) {
+ fts_add_doc_id_column(ctx->new_table, ctx->heap);
+ ctx->new_table->fts->doc_col = fts_doc_id_col;
+ ut_ad(fts_doc_id_col
+ == altered_table->s->fields - n_v_cols);
+ } else if (ctx->new_table->fts) {
+ ctx->new_table->fts->doc_col = fts_doc_id_col;
+ }
+
+ dict_table_add_system_columns(ctx->new_table, ctx->heap);
+
+ if (ha_alter_info->handler_flags & INNOBASE_DEFAULTS) {
+ defaults = dtuple_create_with_vcol(
+ ctx->heap,
+ dict_table_get_n_cols(ctx->new_table),
+ dict_table_get_n_v_cols(ctx->new_table));
+
+ dict_table_copy_types(defaults, ctx->new_table);
+ } else {
+ defaults = NULL;
+ }
+
+ ctx->col_map = innobase_build_col_map(
+ ha_alter_info, altered_table, old_table,
+ ctx->new_table, user_table, defaults, ctx->heap);
+ ctx->defaults = defaults;
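+ /* ctx->col_map[] now maps each column position in the old
+ table to its position in the rebuilt table, and
+ ctx->defaults holds the values to use for columns that
+ have no counterpart in the old table. */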
+ } else {
+ DBUG_ASSERT(!innobase_need_rebuild(ha_alter_info, old_table));
+ DBUG_ASSERT(old_table->s->primary_key
+ == altered_table->s->primary_key);
+
+ for (dict_index_t* index
+ = dict_table_get_first_index(user_table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ if (!index->to_be_dropped && index->is_corrupted()) {
+ my_error(ER_CHECK_NO_SUCH_TABLE, MYF(0));
+ goto error_handled;
+ }
+ }
+
+ if (!ctx->new_table->fts
+ && innobase_fulltext_exist(altered_table)) {
+ ctx->new_table->fts = fts_create(
+ ctx->new_table);
+ ctx->new_table->fts->doc_col = fts_doc_id_col;
+ }
+
+ /* Check if we need to update mtypes of legacy GIS columns.
+ This check is only needed when we don't have to rebuild
+ the table, since rebuild would update all mtypes for GIS
+ columns */
+ error = innobase_check_gis_columns(
+ ha_alter_info, ctx->new_table, ctx->trx);
+ if (error != DB_SUCCESS) {
+ ut_ad(error == DB_ERROR);
+ error = DB_UNSUPPORTED;
+ goto error_handling;
+ }
+ }
+
+ ut_ad(new_clustered == ctx->need_rebuild());
+
+ /* Create the index metadata. */
+ for (ulint a = 0; a < ctx->num_to_add_index; a++) {
+ if (index_defs[a].ind_type & DICT_VIRTUAL
+ && ctx->num_to_drop_vcol > 0 && !new_clustered) {
+ innodb_v_adjust_idx_col(ha_alter_info, old_table,
+ ctx->num_to_drop_vcol,
+ &index_defs[a]);
+ }
+
+ ctx->add_index[a] = row_merge_create_index(
+ ctx->new_table, &index_defs[a], add_v);
+
+ add_key_nums[a] = index_defs[a].key_number;
+
+ DBUG_ASSERT(ctx->add_index[a]->is_committed()
+ == !!new_clustered);
+ }
+
+ DBUG_ASSERT(!ctx->need_rebuild()
+ || !ctx->new_table->persistent_autoinc);
+
+ if (ctx->need_rebuild() && instant_alter_column_possible(
+ *user_table, ha_alter_info, old_table, altered_table,
+ ha_innobase::is_innodb_strict_mode(ctx->trx->mysql_thd))) {
+ for (uint a = 0; a < ctx->num_to_add_index; a++) {
+ ctx->add_index[a]->table = ctx->new_table;
+ error = dict_index_add_to_cache(
+ ctx->add_index[a], FIL_NULL, add_v);
+ ut_a(error == DB_SUCCESS);
+ }
+
+ DBUG_ASSERT(ha_alter_info->key_count
+ /* hidden GEN_CLUST_INDEX in InnoDB */
+ + dict_index_is_auto_gen_clust(
+ dict_table_get_first_index(ctx->new_table))
+ /* hidden FTS_DOC_ID_INDEX in InnoDB */
+ + (ctx->old_table->fts_doc_id_index
+ && innobase_fts_check_doc_id_index_in_def(
+ altered_table->s->keys,
+ altered_table->key_info)
+ != FTS_EXIST_DOC_ID_INDEX)
+ == ctx->num_to_add_index);
+
+ ctx->num_to_add_index = 0;
+ ctx->add_index = NULL;
+
+ uint i = 0; // index of stored columns ctx->new_table->cols[]
+ Field **af = altered_table->field;
+
+ for (const Create_field& new_field :
+ ha_alter_info->alter_info->create_list) {
+ DBUG_ASSERT(!new_field.field
+ || std::find(old_table->field,
+ old_table->field
+ + old_table->s->fields,
+ new_field.field) !=
+ old_table->field + old_table->s->fields);
+ DBUG_ASSERT(new_field.field
+ || !strcmp(new_field.field_name.str,
+ (*af)->field_name.str));
+
+ if (!(*af)->stored_in_db()) {
+ af++;
+ continue;
+ }
+
+ dict_col_t* col = dict_table_get_nth_col(
+ ctx->new_table, i);
+ DBUG_ASSERT(!strcmp((*af)->field_name.str,
+ dict_table_get_col_name(ctx->new_table,
+ i)));
+ DBUG_ASSERT(!col->is_added());
+
+ if (new_field.field) {
+ /* This is a pre-existing column,
+ possibly at a different position. */
+ } else if ((*af)->is_real_null()) {
+ /* DEFAULT NULL */
+ col->def_val.len = UNIV_SQL_NULL;
+ } else {
+ switch ((*af)->type()) {
+ case MYSQL_TYPE_VARCHAR:
+ col->def_val.len = reinterpret_cast
+ <const Field_varstring*>
+ ((*af))->get_length();
+ col->def_val.data = reinterpret_cast
+ <const Field_varstring*>
+ ((*af))->get_data();
+ break;
+ case MYSQL_TYPE_GEOMETRY:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ col->def_val.len = reinterpret_cast
+ <const Field_blob*>
+ ((*af))->get_length();
+ col->def_val.data = reinterpret_cast
+ <const Field_blob*>
+ ((*af))->get_ptr();
+ break;
+ default:
+ dfield_t d;
+ dict_col_copy_type(col, &d.type);
+ ulint len = (*af)->pack_length();
+ DBUG_ASSERT(len <= 8
+ || d.type.mtype
+ != DATA_INT);
+ row_mysql_store_col_in_innobase_format(
+ &d,
+ d.type.mtype == DATA_INT
+ ? static_cast<byte*>(
+ mem_heap_alloc(
+ ctx->heap,
+ len))
+ : NULL,
+ true, (*af)->ptr, len,
+ dict_table_is_comp(
+ user_table));
+ col->def_val.len = d.len;
+ col->def_val.data = d.data;
+ }
+ }
+
+ i++;
+ af++;
+ }
+
+ DBUG_ASSERT(af == altered_table->field
+ + altered_table->s->fields);
+ /* There might exist a hidden FTS_DOC_ID column for
+ FULLTEXT INDEX. If it exists, the column should have
+ been implicitly added by ADD FULLTEXT INDEX together
+ with instant ADD COLUMN. (If a hidden FTS_DOC_ID pre-existed,
+ then the ctx->col_map[] check should have prevented
+ adding visible user columns after that.) */
+ DBUG_ASSERT(DATA_N_SYS_COLS + i == ctx->new_table->n_cols
+ || (1 + DATA_N_SYS_COLS + i
+ == ctx->new_table->n_cols
+ && !strcmp(dict_table_get_col_name(
+ ctx->new_table, i),
+ FTS_DOC_ID_COL_NAME)));
+
+ if (altered_table->found_next_number_field) {
+ ctx->new_table->persistent_autoinc
+ = ctx->old_table->persistent_autoinc;
+ }
+
+ ctx->prepare_instant();
+ }
+
+ if (ctx->need_rebuild()) {
+ DBUG_ASSERT(ctx->need_rebuild());
+ DBUG_ASSERT(!ctx->is_instant());
+ DBUG_ASSERT(num_fts_index <= 1);
+ DBUG_ASSERT(!ctx->online || num_fts_index == 0);
+ DBUG_ASSERT(!ctx->online
+ || ctx->add_autoinc == ULINT_UNDEFINED);
+ DBUG_ASSERT(!ctx->online
+ || !innobase_need_rebuild(ha_alter_info, old_table)
+ || !innobase_fulltext_exist(altered_table));
+
+ uint32_t key_id = FIL_DEFAULT_ENCRYPTION_KEY;
+ fil_encryption_t mode = FIL_ENCRYPTION_DEFAULT;
+
+ if (fil_space_t* s = user_table->space) {
+ if (const fil_space_crypt_t* c = s->crypt_data) {
+ key_id = c->key_id;
+ mode = c->encryption;
+ }
+ }
+
+ if (ha_alter_info->handler_flags & ALTER_OPTIONS) {
+ const ha_table_option_struct& alt_opt=
+ *ha_alter_info->create_info->option_struct;
+ const ha_table_option_struct& opt=
+ *old_table->s->option_struct;
+ if (alt_opt.encryption != opt.encryption
+ || alt_opt.encryption_key_id
+ != opt.encryption_key_id) {
+ key_id = uint32_t(alt_opt.encryption_key_id);
+ mode = fil_encryption_t(alt_opt.encryption);
+ }
+ }
+
+ if (dict_table_get_low(ctx->new_table->name.m_name)) {
+ my_error(ER_TABLE_EXISTS_ERROR, MYF(0),
+ ctx->new_table->name.m_name);
+ goto new_clustered_failed;
+ }
+
+ /* Create the table. */
+ trx_set_dict_operation(ctx->trx, TRX_DICT_OP_TABLE);
+
+ error = row_create_table_for_mysql(
+ ctx->new_table, ctx->trx, mode, key_id);
+
+ switch (error) {
+ dict_table_t* temp_table;
+ case DB_SUCCESS:
+ /* We need to bump up the table ref count, and
+ before we can use the table we need to open it.
+ The new_table must be in the data dictionary
+ cache, because we are still holding
+ the dict_sys.mutex. */
+ ut_ad(mutex_own(&dict_sys.mutex));
+ temp_table = dict_table_open_on_name(
+ ctx->new_table->name.m_name, TRUE, FALSE,
+ DICT_ERR_IGNORE_NONE);
+ ut_a(ctx->new_table == temp_table);
+ /* n_ref_count must be 1, because purge cannot
+ be executing on this very table as we are
+ holding dict_sys.latch X-latch. */
+ DBUG_ASSERT(ctx->new_table->get_ref_count() == 1);
+ DBUG_ASSERT(ctx->new_table->id != 0);
+ DBUG_ASSERT(ctx->new_table->id == ctx->trx->table_id);
+ break;
+ case DB_TABLESPACE_EXISTS:
+ my_error(ER_TABLESPACE_EXISTS, MYF(0),
+ altered_table->s->table_name.str);
+ goto new_table_failed;
+ case DB_DUPLICATE_KEY:
+ my_error(HA_ERR_TABLE_EXIST, MYF(0),
+ altered_table->s->table_name.str);
+ goto new_table_failed;
+ case DB_UNSUPPORTED:
+ my_error(ER_UNSUPPORTED_EXTENSION, MYF(0),
+ altered_table->s->table_name.str);
+ goto new_table_failed;
+ default:
+ my_error_innodb(error, table_name, flags);
+new_table_failed:
+ DBUG_ASSERT(ctx->trx != ctx->prebuilt->trx);
+ ctx->new_table = NULL;
+ goto new_clustered_failed;
+ }
+
+ for (ulint a = 0; a < ctx->num_to_add_index; a++) {
+ dict_index_t* index = ctx->add_index[a];
+ const ulint n_v_col = index->get_new_n_vcol();
+ index = create_index_dict(ctx->trx, index, add_v);
+ error = ctx->trx->error_state;
+ if (error != DB_SUCCESS) {
+ if (index) {
+ dict_mem_index_free(index);
+ }
+error_handling_drop_uncached_1:
+ while (++a < ctx->num_to_add_index) {
+ dict_mem_index_free(ctx->add_index[a]);
+ }
+ goto error_handling;
+ } else {
+ DBUG_ASSERT(index != ctx->add_index[a]);
+ }
+
+ ctx->add_index[a] = index;
+ /* For ALTER TABLE...FORCE or OPTIMIZE TABLE,
+ we may only issue warnings, because there will
+ be no schema change from the user perspective. */
+ if (!info.row_size_is_acceptable(
+ *index,
+ !!(ha_alter_info->handler_flags
+ & ~(INNOBASE_INPLACE_IGNORE
+ | INNOBASE_ALTER_NOVALIDATE
+ | ALTER_RECREATE_TABLE)))) {
+ error = DB_TOO_BIG_RECORD;
+ goto error_handling_drop_uncached_1;
+ }
+ index->parser = index_defs[a].parser;
+ if (n_v_col) {
+ index->assign_new_v_col(n_v_col);
+ }
+ /* Note the id of the transaction that created this
+ index; we use it to restrict readers from accessing
+ this index, to ensure read consistency. */
+ ut_ad(index->trx_id == ctx->trx->id);
+
+ if (index->type & DICT_FTS) {
+ DBUG_ASSERT(num_fts_index == 1);
+ DBUG_ASSERT(!fts_index);
+ DBUG_ASSERT(index->type == DICT_FTS);
+ fts_index = ctx->add_index[a];
+ }
+ }
+
+ dict_index_t* clust_index = dict_table_get_first_index(
+ user_table);
+ dict_index_t* new_clust_index = dict_table_get_first_index(
+ ctx->new_table);
+ ut_ad(!new_clust_index->is_instant());
+ /* row_merge_build_index() depends on the correct value */
+ ut_ad(new_clust_index->n_core_null_bytes
+ == UT_BITS_IN_BYTES(new_clust_index->n_nullable));
+
+ if (const Field* ai = altered_table->found_next_number_field) {
+ const unsigned col_no = innodb_col_no(ai);
+
+ ctx->new_table->persistent_autoinc =
+ (dict_table_get_nth_col_pos(
+ ctx->new_table, col_no, NULL) + 1)
+ & dict_index_t::MAX_N_FIELDS;
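+ /* A sketch of the convention used here:
+ persistent_autoinc is 0 if there is no persistent
+ AUTO_INCREMENT counter, and otherwise 1 + the position
+ of the counter column in the clustered index,
+ truncated to the field-number width. */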
+
+ /* Initialize the AUTO_INCREMENT sequence
+ to the rebuilt table from the old one. */
+ if (!old_table->found_next_number_field
+ || !user_table->space) {
+ } else if (ib_uint64_t autoinc
+ = btr_read_autoinc(clust_index)) {
+ btr_write_autoinc(new_clust_index, autoinc);
+ }
+ }
+
+ ctx->skip_pk_sort = innobase_pk_order_preserved(
+ ctx->col_map, clust_index, new_clust_index);
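+ /* If the column mapping preserves the PRIMARY KEY order,
+ rows read from the old clustered index arrive already
+ sorted for the new one, and the external merge sort can
+ be skipped. */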
+
+ DBUG_EXECUTE_IF("innodb_alter_table_pk_assert_no_sort",
+ DBUG_ASSERT(ctx->skip_pk_sort););
+
+ if (ctx->online) {
+ /* Allocate a log for online table rebuild. */
+ rw_lock_x_lock(&clust_index->lock);
+ bool ok = row_log_allocate(
+ ctx->prebuilt->trx,
+ clust_index, ctx->new_table,
+ !(ha_alter_info->handler_flags
+ & ALTER_ADD_PK_INDEX),
+ ctx->defaults, ctx->col_map, path,
+ old_table,
+ ctx->allow_not_null);
+ rw_lock_x_unlock(&clust_index->lock);
+
+ if (!ok) {
+ error = DB_OUT_OF_MEMORY;
+ goto error_handling;
+ }
+ }
+ } else if (ctx->num_to_add_index) {
+ ut_ad(!ctx->is_instant());
+ ctx->trx->table_id = user_table->id;
+
+ for (ulint a = 0; a < ctx->num_to_add_index; a++) {
+ dict_index_t* index = ctx->add_index[a];
+ const ulint n_v_col = index->get_new_n_vcol();
+ DBUG_EXECUTE_IF(
+ "create_index_metadata_fail",
+ if (a + 1 == ctx->num_to_add_index) {
+ ctx->trx->error_state =
+ DB_OUT_OF_FILE_SPACE;
+ goto index_created;
+ });
+ index = create_index_dict(ctx->trx, index, add_v);
+#ifndef DBUG_OFF
+index_created:
+#endif
+ error = ctx->trx->error_state;
+ if (error != DB_SUCCESS) {
+ if (index) {
+ dict_mem_index_free(index);
+ }
+error_handling_drop_uncached:
+ while (++a < ctx->num_to_add_index) {
+ dict_mem_index_free(ctx->add_index[a]);
+ }
+ goto error_handling;
+ } else {
+ DBUG_ASSERT(index != ctx->add_index[a]);
+ }
+ ctx->add_index[a]= index;
+ if (!info.row_size_is_acceptable(*index, true)) {
+ error = DB_TOO_BIG_RECORD;
+ goto error_handling_drop_uncached;
+ }
+
+ index->parser = index_defs[a].parser;
+ if (n_v_col) {
+ index->assign_new_v_col(n_v_col);
+ }
+ /* Note the id of the transaction that created this
+ index; we use it to restrict readers from accessing
+ this index, to ensure read consistency. */
+ ut_ad(index->trx_id == ctx->trx->id);
+
+ /* If ADD INDEX with LOCK=NONE has been
+ requested, allocate a modification log. */
+ if (index->type & DICT_FTS) {
+ DBUG_ASSERT(num_fts_index == 1);
+ DBUG_ASSERT(!fts_index);
+ DBUG_ASSERT(index->type == DICT_FTS);
+ fts_index = ctx->add_index[a];
+ /* Fulltext indexes are not covered
+ by a modification log. */
+ } else if (!ctx->online
+ || !user_table->is_readable()
+ || !user_table->space) {
+ /* No need to allocate a modification log. */
+ DBUG_ASSERT(!index->online_log);
+ } else {
+ rw_lock_x_lock(&ctx->add_index[a]->lock);
+
+ bool ok = row_log_allocate(
+ ctx->prebuilt->trx,
+ index,
+ NULL, true, NULL, NULL,
+ path, old_table,
+ ctx->allow_not_null);
+
+ rw_lock_x_unlock(&index->lock);
+
+ DBUG_EXECUTE_IF(
+ "innodb_OOM_prepare_add_index",
+ if (ok && a == 1) {
+ row_log_free(
+ index->online_log);
+ index->online_log = NULL;
+ ok = false;
+ });
+
+ if (!ok) {
+ error = DB_OUT_OF_MEMORY;
+ goto error_handling_drop_uncached;
+ }
+ }
+ }
+ } else if (ctx->is_instant()
+ && !info.row_size_is_acceptable(*user_table, true)) {
+ error = DB_TOO_BIG_RECORD;
+ goto error_handling;
+ }
+
+ if (ctx->online && ctx->num_to_add_index) {
+ /* Assign a consistent read view for
+ row_merge_read_clustered_index(). */
+ ctx->prebuilt->trx->read_view.open(ctx->prebuilt->trx);
+ }
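+ /* The read view pins a snapshot for reading the old table;
+ changes made by transactions after this point will be
+ applied from the row log instead. */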
+
+ if (fts_index) {
+ /* Ensure that the dictionary operation mode will
+ not change while creating the auxiliary tables. */
+ trx_dict_op_t op = trx_get_dict_operation(ctx->trx);
+
+#ifdef UNIV_DEBUG
+ switch (op) {
+ case TRX_DICT_OP_NONE:
+ break;
+ case TRX_DICT_OP_TABLE:
+ case TRX_DICT_OP_INDEX:
+ goto op_ok;
+ }
+ ut_error;
+op_ok:
+#endif /* UNIV_DEBUG */
+ ut_ad(ctx->trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_d(dict_sys.assert_locked());
+
+ DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS);
+ if (ctx->need_rebuild()) {
+ /* For !ctx->need_rebuild(), this will be set at
+ commit_cache_norebuild(). */
+ ctx->new_table->fts_doc_id_index
+ = dict_table_get_index_on_name(
+ ctx->new_table, FTS_DOC_ID_INDEX_NAME);
+ DBUG_ASSERT(ctx->new_table->fts_doc_id_index != NULL);
+ }
+
+ error = fts_create_index_tables(ctx->trx, fts_index,
+ ctx->new_table->id);
+
+ DBUG_EXECUTE_IF("innodb_test_fail_after_fts_index_table",
+ error = DB_LOCK_WAIT_TIMEOUT;
+ goto error_handling;);
+
+ if (error != DB_SUCCESS) {
+ goto error_handling;
+ }
+
+ ctx->trx->commit();
+ trx_start_for_ddl(ctx->trx, op);
+
+ if (!ctx->new_table->fts
+ || ib_vector_size(ctx->new_table->fts->indexes) == 0) {
+ error = fts_create_common_tables(
+ ctx->trx, ctx->new_table, true);
+
+ DBUG_EXECUTE_IF(
+ "innodb_test_fail_after_fts_common_table",
+ error = DB_LOCK_WAIT_TIMEOUT;);
+
+ if (error != DB_SUCCESS) {
+ goto error_handling;
+ }
+
+ ctx->new_table->fts->dict_locked = true;
+
+ error = innobase_fts_load_stopword(
+ ctx->new_table, ctx->trx,
+ ctx->prebuilt->trx->mysql_thd)
+ ? DB_SUCCESS : DB_ERROR;
+ ctx->new_table->fts->dict_locked = false;
+
+ if (error != DB_SUCCESS) {
+ goto error_handling;
+ }
+ }
+
+ ut_ad(trx_get_dict_operation(ctx->trx) == op);
+ }
+
+ DBUG_ASSERT(error == DB_SUCCESS);
+
+ /* Commit the data dictionary transaction in order to release
+ the table locks on the system tables. This means that if
+ MySQL crashes while creating a new primary key inside
+ row_merge_build_indexes(), ctx->new_table will not be dropped
+ by trx_rollback_active(). It will have to be recovered or
+ dropped by the database administrator. */
+ trx_commit_for_mysql(ctx->trx);
+
+ row_mysql_unlock_data_dictionary(ctx->trx);
+ dict_locked = false;
+
+ ut_ad(!ctx->trx->lock.n_active_thrs);
+
+ if (ctx->old_table->fts) {
+ fts_sync_during_ddl(ctx->old_table);
+ }
+
+error_handling:
+ /* In case of an error, remove from the data dictionary
+ all those index definitions that were created above. */
+
+ switch (error) {
+ case DB_SUCCESS:
+ ut_a(!dict_locked);
+
+ ut_d(mutex_enter(&dict_sys.mutex));
+ ut_d(dict_table_check_for_dup_indexes(
+ user_table, CHECK_PARTIAL_OK));
+ ut_d(mutex_exit(&dict_sys.mutex));
+ DBUG_RETURN(false);
+ case DB_TABLESPACE_EXISTS:
+ my_error(ER_TABLESPACE_EXISTS, MYF(0), "(unknown)");
+ break;
+ case DB_DUPLICATE_KEY:
+ my_error(ER_DUP_KEY, MYF(0), "SYS_INDEXES");
+ break;
+ case DB_UNSUPPORTED:
+ my_error(ER_TABLE_CANT_HANDLE_SPKEYS, MYF(0), "SYS_COLUMNS");
+ break;
+ default:
+ my_error_innodb(error, table_name, user_table->flags);
+ }
+
+error_handled:
+
+ ctx->prebuilt->trx->error_info = NULL;
+
+ if (!ctx->trx) {
+ goto err_exit;
+ }
+
+ ctx->trx->error_state = DB_SUCCESS;
+
+ if (!dict_locked) {
+ row_mysql_lock_data_dictionary(ctx->trx);
+ }
+
+ if (new_clustered) {
+ if (ctx->need_rebuild()) {
+
+ if (DICT_TF2_FLAG_IS_SET(
+ ctx->new_table, DICT_TF2_FTS)) {
+ innobase_drop_fts_index_table(
+ ctx->new_table, ctx->trx);
+ }
+
+ dict_table_close_and_drop(ctx->trx, ctx->new_table);
+
+ /* Free the log for online table rebuild, if
+ one was allocated. */
+
+ dict_index_t* clust_index = dict_table_get_first_index(
+ user_table);
+
+ rw_lock_x_lock(&clust_index->lock);
+
+ if (clust_index->online_log) {
+ ut_ad(ctx->online);
+ row_log_abort_sec(clust_index);
+ clust_index->online_status
+ = ONLINE_INDEX_COMPLETE;
+ }
+
+ rw_lock_x_unlock(&clust_index->lock);
+ }
+
+ trx_commit_for_mysql(ctx->trx);
+ /* n_ref_count must be 1, because purge cannot
+ be executing on this very table as we are
+ holding dict_sys.latch X-latch. */
+ ut_ad(!stats_wait || ctx->online
+ || user_table->get_ref_count() == 1);
+
+ online_retry_drop_indexes_with_trx(user_table, ctx->trx);
+ } else {
+ ut_ad(!ctx->need_rebuild());
+ row_merge_drop_indexes(ctx->trx, user_table, true);
+ trx_commit_for_mysql(ctx->trx);
+ }
+
+ ut_d(dict_table_check_for_dup_indexes(user_table, CHECK_ALL_COMPLETE));
+ ut_ad(!user_table->drop_aborted);
+
+err_exit:
+ /* Clear the to_be_dropped flag in the data dictionary cache. */
+ for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+ DBUG_ASSERT(ctx->drop_index[i]->is_committed());
+ DBUG_ASSERT(ctx->drop_index[i]->to_be_dropped);
+ ctx->drop_index[i]->to_be_dropped = 0;
+ }
+
+ if (ctx->trx) {
+ row_mysql_unlock_data_dictionary(ctx->trx);
+
+ ctx->trx->free();
+ }
+ trx_commit_for_mysql(ctx->prebuilt->trx);
+
+ for (uint i = 0; i < ctx->num_to_add_fk; i++) {
+ if (ctx->add_fk[i]) {
+ dict_foreign_free(ctx->add_fk[i]);
+ }
+ }
+
+ delete ctx;
+ ha_alter_info->handler_ctx = NULL;
+
+ DBUG_RETURN(true);
+}
+
+/** Check whether an index is needed for a foreign key constraint.
+If it is needed but is being dropped, check whether an equivalent
+index can take over its role.
+@return true if the index is needed and cannot be dropped */
+static MY_ATTRIBUTE((nonnull(1,2,3,5), warn_unused_result))
+bool
+innobase_check_foreign_key_index(
+/*=============================*/
+ Alter_inplace_info* ha_alter_info, /*!< in: Structure describing
+ changes to be done by ALTER
+ TABLE */
+ dict_index_t* index, /*!< in: index to check */
+ dict_table_t* indexed_table, /*!< in: table that owns the
+ foreign keys */
+ const char** col_names, /*!< in: column names, or NULL
+ for indexed_table->col_names */
+ trx_t* trx, /*!< in/out: transaction */
+ dict_foreign_t** drop_fk, /*!< in: Foreign key constraints
+ to drop */
+ ulint n_drop_fk) /*!< in: Number of foreign keys
+ to drop */
+{
+ const dict_foreign_set* fks = &indexed_table->referenced_set;
+
+ /* Check for all FK references from other tables to the index. */
+ for (dict_foreign_set::const_iterator it = fks->begin();
+ it != fks->end(); ++it) {
+
+ dict_foreign_t* foreign = *it;
+ if (foreign->referenced_index != index) {
+ continue;
+ }
+ ut_ad(indexed_table == foreign->referenced_table);
+
+ if (NULL == dict_foreign_find_index(
+ indexed_table, col_names,
+ foreign->referenced_col_names,
+ foreign->n_fields, index,
+ /*check_charsets=*/TRUE,
+ /*check_null=*/FALSE,
+ NULL, NULL, NULL)
+ && NULL == innobase_find_equiv_index(
+ foreign->referenced_col_names,
+ foreign->n_fields,
+ ha_alter_info->key_info_buffer,
+ span<uint>(ha_alter_info->index_add_buffer,
+ ha_alter_info->index_add_count))) {
+
+ /* Index cannot be dropped. */
+ trx->error_info = index;
+ return(true);
+ }
+ }
+
+ fks = &indexed_table->foreign_set;
+
+ /* Check for all FK references in current table using the index. */
+ for (dict_foreign_set::const_iterator it = fks->begin();
+ it != fks->end(); ++it) {
+
+ dict_foreign_t* foreign = *it;
+ if (foreign->foreign_index != index) {
+ continue;
+ }
+
+ ut_ad(indexed_table == foreign->foreign_table);
+
+ if (!innobase_dropping_foreign(
+ foreign, drop_fk, n_drop_fk)
+ && NULL == dict_foreign_find_index(
+ indexed_table, col_names,
+ foreign->foreign_col_names,
+ foreign->n_fields, index,
+ /*check_charsets=*/TRUE,
+ /*check_null=*/FALSE,
+ NULL, NULL, NULL)
+ && NULL == innobase_find_equiv_index(
+ foreign->foreign_col_names,
+ foreign->n_fields,
+ ha_alter_info->key_info_buffer,
+ span<uint>(ha_alter_info->index_add_buffer,
+ ha_alter_info->index_add_count))) {
+
+ /* Index cannot be dropped. */
+ trx->error_info = index;
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/**
+Rename a given index in the InnoDB data dictionary.
+
+@param index index to rename
+@param new_name new name of the index
+@param[in,out] trx dict transaction to use, not going to be committed here
+
+@retval true Failure
+@retval false Success */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+rename_index_try(
+ const dict_index_t* index,
+ const char* new_name,
+ trx_t* trx)
+{
+ DBUG_ENTER("rename_index_try");
+ ut_d(dict_sys.assert_locked());
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ pars_info_t* pinfo;
+ dberr_t err;
+
+ pinfo = pars_info_create();
+
+ pars_info_add_ull_literal(pinfo, "table_id", index->table->id);
+ pars_info_add_ull_literal(pinfo, "index_id", index->id);
+ pars_info_add_str_literal(pinfo, "new_name", new_name);
+
+ trx->op_info = "Renaming an index in SYS_INDEXES";
+
+ DBUG_EXECUTE_IF(
+ "ib_rename_index_fail1",
+ DBUG_SET("+d,innodb_report_deadlock");
+ );
+
+ err = que_eval_sql(
+ pinfo,
+ "PROCEDURE RENAME_INDEX_IN_SYS_INDEXES () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_INDEXES SET\n"
+ "NAME = :new_name\n"
+ "WHERE\n"
+ "ID = :index_id AND\n"
+ "TABLE_ID = :table_id;\n"
+ "END;\n",
+ FALSE, trx); /* pinfo is freed by que_eval_sql() */
+
+ DBUG_EXECUTE_IF(
+ "ib_rename_index_fail1",
+ DBUG_SET("-d,innodb_report_deadlock");
+ );
+
+ trx->op_info = "";
+
+ if (err != DB_SUCCESS) {
+ my_error_innodb(err, index->table->name.m_name, 0);
+ DBUG_RETURN(true);
+ }
+
+ DBUG_RETURN(false);
+}
+
+
+/**
+Rename a given index in the InnoDB data dictionary cache.
+
+@param[in,out] index index to rename
+@param new_name new index name
+*/
+static
+void
+innobase_rename_index_cache(dict_index_t* index, const char* new_name)
+{
+ DBUG_ENTER("innobase_rename_index_cache");
+ ut_d(dict_sys.assert_locked());
+
+ size_t old_name_len = strlen(index->name);
+ size_t new_name_len = strlen(new_name);
+
+ if (old_name_len < new_name_len) {
+ index->name = static_cast<char*>(
+ mem_heap_alloc(index->heap, new_name_len + 1));
+ }
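+ /* If the new name fits into the buffer holding the old
+ name, reuse it; individual allocations from index->heap
+ cannot be freed anyway. */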
+
+ memcpy(const_cast<char*>(index->name()), new_name, new_name_len + 1);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/** Rename the indexes in the InnoDB data dictionary cache.
+@param[in] ctx alter context
+@param[in] ha_alter_info Data used during inplace alter. */
+static void
+innobase_rename_indexes_cache(const ha_innobase_inplace_ctx *ctx,
+ const Alter_inplace_info *ha_alter_info)
+{
+ DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_RENAME_INDEX);
+
+ std::vector<std::pair<dict_index_t *, const char *>> rename_info;
+ rename_info.reserve(ha_alter_info->rename_keys.size());
+
+ for (const Alter_inplace_info::Rename_key_pair &pair :
+ ha_alter_info->rename_keys)
+ {
+ dict_index_t *index=
+ dict_table_get_index_on_name(ctx->old_table, pair.old_key->name.str);
+ ut_ad(index);
+
+ rename_info.emplace_back(index, pair.new_key->name.str);
+ }
+
+ for (const auto &pair : rename_info)
+ innobase_rename_index_cache(pair.first, pair.second);
+}
+
+/** Fill the stored column information into the s_cols list.
+@param[in] altered_table mysql table object
+@param[in] table innodb table object
+@param[out] s_cols list of stored columns
+@param[out] s_heap heap for storing stored
+column information. */
+static
+void
+alter_fill_stored_column(
+ const TABLE* altered_table,
+ dict_table_t* table,
+ dict_s_col_list** s_cols,
+ mem_heap_t** s_heap)
+{
+ ulint n_cols = altered_table->s->fields;
+ ulint stored_col_no = 0;
+
+ for (ulint i = 0; i < n_cols; i++) {
+ Field* field = altered_table->field[i];
+ dict_s_col_t s_col;
+
+ if (field->stored_in_db()) {
+ stored_col_no++;
+ }
+
+ if (!innobase_is_s_fld(field)) {
+ continue;
+ }
+
+ ulint num_base = 0;
+ dict_col_t* col = dict_table_get_nth_col(table,
+ stored_col_no);
+
+ s_col.m_col = col;
+ s_col.s_pos = i;
+
+ if (*s_cols == NULL) {
+ *s_cols = UT_NEW_NOKEY(dict_s_col_list());
+ *s_heap = mem_heap_create(1000);
+ }
+
+ if (num_base != 0) {
+ s_col.base_col = static_cast<dict_col_t**>(mem_heap_zalloc(
+ *s_heap, num_base * sizeof(dict_col_t*)));
+ } else {
+ s_col.base_col = NULL;
+ }
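+ /* num_base is always 0 at this point; the base column
+ information is expected to be filled in by
+ innodb_base_col_setup_for_stored() below. */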
+
+ s_col.num_base = num_base;
+ innodb_base_col_setup_for_stored(table, field, &s_col);
+ (*s_cols)->push_front(s_col);
+ }
+}
+
+static bool alter_templ_needs_rebuild(const TABLE* altered_table,
+ const Alter_inplace_info* ha_alter_info,
+ const dict_table_t* table);
+
+
+/** Allows InnoDB to update internal structures with concurrent
+writes blocked (provided that check_if_supported_inplace_alter()
+did not return HA_ALTER_INPLACE_NO_LOCK).
+This will be invoked before inplace_alter_table().
+
+@param altered_table TABLE object for new version of table.
+@param ha_alter_info Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+
+@retval true Failure
+@retval false Success
+*/
+
+bool
+ha_innobase::prepare_inplace_alter_table(
+/*=====================================*/
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info)
+{
+ dict_index_t** drop_index; /*!< Index to be dropped */
+ ulint n_drop_index; /*!< Number of indexes to drop */
+ dict_foreign_t**drop_fk; /*!< Foreign key constraints to drop */
+ ulint n_drop_fk; /*!< Number of foreign keys to drop */
+ dict_foreign_t**add_fk = NULL; /*!< Foreign key constraints to add */
+ ulint n_add_fk; /*!< Number of foreign keys to add */
+ dict_table_t* indexed_table; /*!< Table where indexes are created */
+ mem_heap_t* heap;
+ const char** col_names;
+ int error;
+ ulint add_autoinc_col_no = ULINT_UNDEFINED;
+ ulonglong autoinc_col_max_value = 0;
+ ulint fts_doc_col_no = ULINT_UNDEFINED;
+ bool add_fts_doc_id = false;
+ bool add_fts_doc_id_idx = false;
+ bool add_fts_idx = false;
+ dict_s_col_list*s_cols = NULL;
+ mem_heap_t* s_heap = NULL;
+
+ DBUG_ENTER("prepare_inplace_alter_table");
+ DBUG_ASSERT(!ha_alter_info->handler_ctx);
+ DBUG_ASSERT(ha_alter_info->create_info);
+ DBUG_ASSERT(!srv_read_only_mode);
+
+ /* Init online ddl status variables */
+ onlineddl_rowlog_rows = 0;
+ onlineddl_rowlog_pct_used = 0;
+ onlineddl_pct_progress = 0;
+
+ MONITOR_ATOMIC_INC(MONITOR_PENDING_ALTER_TABLE);
+
+#ifdef UNIV_DEBUG
+ for (dict_index_t* index = dict_table_get_first_index(m_prebuilt->table);
+ index;
+ index = dict_table_get_next_index(index)) {
+ ut_ad(!index->to_be_dropped);
+ }
+#endif /* UNIV_DEBUG */
+
+ ut_d(mutex_enter(&dict_sys.mutex));
+ ut_d(dict_table_check_for_dup_indexes(
+ m_prebuilt->table, CHECK_ABORTED_OK));
+ ut_d(mutex_exit(&dict_sys.mutex));
+
+ if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) {
+ /* Nothing to do */
+ DBUG_ASSERT(m_prebuilt->trx->dict_operation_lock_mode == 0);
+ DBUG_RETURN(false);
+ }
+
+ indexed_table = m_prebuilt->table;
+
+ /* ALTER TABLE will not implicitly move a table from a single-table
+ tablespace to the system tablespace when innodb_file_per_table=OFF.
+ But it will implicitly move a table from the system tablespace to a
+ single-table tablespace if innodb_file_per_table = ON. */
+
+ create_table_info_t info(m_user_thd,
+ altered_table,
+ ha_alter_info->create_info,
+ NULL,
+ NULL,
+ srv_file_per_table);
+
+ info.set_tablespace_type(indexed_table->space != fil_system.sys_space);
+
+ if (ha_alter_info->handler_flags & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) {
+ if (info.gcols_in_fulltext_or_spatial()) {
+ goto err_exit_no_heap;
+ }
+ }
+
+ if (indexed_table->is_readable()) {
+ } else {
+ if (indexed_table->corrupted) {
+ /* Handled below */
+ } else {
+ if (const fil_space_t* space = indexed_table->space) {
+ String str;
+ const char* engine= table_type();
+
+ push_warning_printf(
+ m_user_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_DECRYPTION_FAILED,
+ "Table %s in file %s is encrypted but encryption service or"
+ " used key_id is not available. "
+ " Can't continue reading table.",
+ table_share->table_name.str,
+ space->chain.start->name);
+
+ my_error(ER_GET_ERRMSG, MYF(0), HA_ERR_DECRYPTION_FAILED, str.c_ptr(), engine);
+ DBUG_RETURN(true);
+ }
+ }
+ }
+
+ if (indexed_table->corrupted
+ || dict_table_get_first_index(indexed_table) == NULL
+ || dict_table_get_first_index(indexed_table)->is_corrupted()) {
+ /* The clustered index is corrupted. */
+ my_error(ER_CHECK_NO_SUCH_TABLE, MYF(0));
+ DBUG_RETURN(true);
+ } else {
+ const char* invalid_opt = info.create_options_are_invalid();
+
+ /* Check engine specific table options */
+ if (const char* invalid_tbopt = info.check_table_options()) {
+ my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0),
+ table_type(), invalid_tbopt);
+ goto err_exit_no_heap;
+ }
+
+ if (invalid_opt) {
+ my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0),
+ table_type(), invalid_opt);
+ goto err_exit_no_heap;
+ }
+ }
+
+ /* Check if any index name is reserved. */
+ if (innobase_index_name_is_reserved(
+ m_user_thd,
+ ha_alter_info->key_info_buffer,
+ ha_alter_info->key_count)) {
+err_exit_no_heap:
+ DBUG_ASSERT(m_prebuilt->trx->dict_operation_lock_mode == 0);
+ online_retry_drop_indexes(m_prebuilt->table, m_user_thd);
+ DBUG_RETURN(true);
+ }
+
+ indexed_table = m_prebuilt->table;
+
+ /* Check that index keys are sensible */
+ error = innobase_check_index_keys(ha_alter_info, indexed_table);
+
+ if (error) {
+ goto err_exit_no_heap;
+ }
+
+ /* Prohibit renaming a column to something that the table
+ already contains. */
+ if (ha_alter_info->handler_flags
+ & ALTER_COLUMN_NAME) {
+ for (Field** fp = table->field; *fp; fp++) {
+ if (!((*fp)->flags & FIELD_IS_RENAMED)) {
+ continue;
+ }
+
+ const char* name = 0;
+
+ for (const Create_field& cf :
+ ha_alter_info->alter_info->create_list) {
+ if (cf.field == *fp) {
+ name = cf.field_name.str;
+ goto check_if_ok_to_rename;
+ }
+ }
+
+ ut_error;
+check_if_ok_to_rename:
+ /* Prohibit renaming a column from FTS_DOC_ID
+ if full-text indexes exist. */
+ if (!my_strcasecmp(system_charset_info,
+ (*fp)->field_name.str,
+ FTS_DOC_ID_COL_NAME)
+ && innobase_fulltext_exist(altered_table)) {
+ my_error(ER_INNODB_FT_WRONG_DOCID_COLUMN,
+ MYF(0), name);
+ goto err_exit_no_heap;
+ }
+
+ /* Prohibit renaming a column to an internal column. */
+ const char* s = m_prebuilt->table->col_names;
+ unsigned j;
+ /* Skip user columns.
+ MySQL should have checked these already.
+ We want to allow renaming of c1 to c2, c2 to c1. */
+ for (j = 0; j < table->s->fields; j++) {
+ if (table->field[j]->stored_in_db()) {
+ s += strlen(s) + 1;
+ }
+ }
+
+ for (; j < m_prebuilt->table->n_def; j++) {
+ if (!my_strcasecmp(
+ system_charset_info, name, s)) {
+ my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+ s);
+ goto err_exit_no_heap;
+ }
+
+ s += strlen(s) + 1;
+ }
+ }
+ }
+
+ if (!info.innobase_table_flags()) {
+ goto err_exit_no_heap;
+ }
+
+ if (info.flags2() & DICT_TF2_USE_FILE_PER_TABLE) {
+ /* Preserve the DATA DIRECTORY attribute, because it
+ currently cannot be changed during ALTER TABLE. */
+ info.flags_set(m_prebuilt->table->flags
+ & 1U << DICT_TF_POS_DATA_DIR);
+ }
+
+
+ /* ALGORITHM=INPLACE without rebuild (10.3+ ALGORITHM=NOCOPY)
+ must use the current ROW_FORMAT of the table. */
+ const ulint max_col_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(
+ innobase_need_rebuild(ha_alter_info, this->table)
+ ? info.flags()
+ : m_prebuilt->table->flags);
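+ /* For example, ROW_FORMAT=REDUNDANT and COMPACT limit index
+ key parts to 767 bytes, while DYNAMIC and COMPRESSED allow
+ up to 3072 bytes. */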
+
+ /* Check each index's column length to make sure they do not
+ exceed limit */
+ for (ulint i = 0; i < ha_alter_info->key_count; i++) {
+ const KEY* key = &ha_alter_info->key_info_buffer[i];
+
+ if (key->flags & HA_FULLTEXT) {
+ /* The column length does not matter for
+ fulltext search indexes. But, UNIQUE
+ fulltext indexes are not supported. */
+ DBUG_ASSERT(!(key->flags & HA_NOSAME));
+ DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK
+ & ~(HA_FULLTEXT
+ | HA_PACK_KEY
+ | HA_BINARY_PACK_KEY)));
+ add_fts_idx = true;
+ continue;
+ }
+
+ if (too_big_key_part_length(max_col_len, *key)) {
+ my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+ max_col_len);
+ goto err_exit_no_heap;
+ }
+ }
+
+ /* Adding an FTS index is not allowed on a table that
+ already has FTS indexes but lacks the AUX_HEX_NAME flag.
+ That would mean the table's existing aux tables failed to
+ be renamed to the hex format while newly created aux
+ tables would be in hex format, which is contradictory. */
+ if (!DICT_TF2_FLAG_IS_SET(indexed_table, DICT_TF2_FTS_AUX_HEX_NAME)
+ && indexed_table->fts != NULL && add_fts_idx) {
+ my_error(ER_INNODB_FT_AUX_NOT_HEX_ID, MYF(0));
+ goto err_exit_no_heap;
+ }
+
+ /* Check existing index definitions for too-long column
+ prefixes as well, in case max_col_len shrunk. */
+ for (const dict_index_t* index
+ = dict_table_get_first_index(indexed_table);
+ index;
+ index = dict_table_get_next_index(index)) {
+ if (index->type & DICT_FTS) {
+ DBUG_ASSERT(index->type == DICT_FTS
+ || (index->type & DICT_CORRUPT));
+
+ /* We need to drop any corrupted fts indexes
+ before we add a new fts index. */
+ if (add_fts_idx && index->type & DICT_CORRUPT) {
+ ib_errf(m_user_thd, IB_LOG_LEVEL_ERROR,
+ ER_INNODB_INDEX_CORRUPT,
+ "Fulltext index '%s' is corrupt. "
+ "you should drop this index first.",
+ index->name());
+
+ goto err_exit_no_heap;
+ }
+
+ continue;
+ }
+
+ for (ulint i = 0; i < dict_index_get_n_fields(index); i++) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(index, i);
+ if (field->prefix_len > max_col_len) {
+ my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+ max_col_len);
+ goto err_exit_no_heap;
+ }
+ }
+ }
+
+ n_drop_index = 0;
+ n_drop_fk = 0;
+
+ if (ha_alter_info->handler_flags
+ & (INNOBASE_ALTER_NOREBUILD | INNOBASE_ALTER_REBUILD
+ | INNOBASE_ALTER_INSTANT)) {
+ heap = mem_heap_create(1024);
+
+ if (ha_alter_info->handler_flags
+ & ALTER_COLUMN_NAME) {
+ col_names = innobase_get_col_names(
+ ha_alter_info, altered_table, table,
+ indexed_table, heap);
+ } else {
+ col_names = NULL;
+ }
+ } else {
+ heap = NULL;
+ col_names = NULL;
+ }
+
+ if (ha_alter_info->handler_flags
+ & ALTER_DROP_FOREIGN_KEY) {
+ DBUG_ASSERT(ha_alter_info->alter_info->drop_list.elements > 0);
+
+ drop_fk = static_cast<dict_foreign_t**>(
+ mem_heap_alloc(
+ heap,
+ ha_alter_info->alter_info->drop_list.elements
+ * sizeof(dict_foreign_t*)));
+
+ for (Alter_drop& drop : ha_alter_info->alter_info->drop_list) {
+ if (drop.type != Alter_drop::FOREIGN_KEY) {
+ continue;
+ }
+
+ dict_foreign_t* foreign;
+
+ for (dict_foreign_set::iterator it
+ = m_prebuilt->table->foreign_set.begin();
+ it != m_prebuilt->table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+ const char* fid = strchr(foreign->id, '/');
+
+ DBUG_ASSERT(fid);
+ /* If no database/ prefix was present in
+ the FOREIGN KEY constraint name, compare
+ to the full constraint name. */
+ fid = fid ? fid + 1 : foreign->id;
+
+ if (!my_strcasecmp(system_charset_info,
+ fid, drop.name)) {
+ goto found_fk;
+ }
+ }
+
+ my_error(ER_CANT_DROP_FIELD_OR_KEY, MYF(0),
+ drop.type_name(), drop.name);
+ goto err_exit;
+found_fk:
+ for (ulint i = n_drop_fk; i--; ) {
+ if (drop_fk[i] == foreign) {
+ goto dup_fk;
+ }
+ }
+ drop_fk[n_drop_fk++] = foreign;
+dup_fk:
+ continue;
+ }
+
+ DBUG_ASSERT(n_drop_fk > 0);
+
+ DBUG_ASSERT(n_drop_fk
+ <= ha_alter_info->alter_info->drop_list.elements);
+ } else {
+ drop_fk = NULL;
+ }
+
+ if (ha_alter_info->index_drop_count) {
+ dict_index_t* drop_primary = NULL;
+
+ DBUG_ASSERT(ha_alter_info->handler_flags
+ & (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX
+ | ALTER_DROP_UNIQUE_INDEX
+ | ALTER_DROP_PK_INDEX));
+ /* Check which indexes to drop. */
+ drop_index = static_cast<dict_index_t**>(
+ mem_heap_alloc(
+ heap, (ha_alter_info->index_drop_count + 1)
+ * sizeof *drop_index));
+
+ for (uint i = 0; i < ha_alter_info->index_drop_count; i++) {
+ const KEY* key
+ = ha_alter_info->index_drop_buffer[i];
+ dict_index_t* index
+ = dict_table_get_index_on_name(
+ indexed_table, key->name.str);
+
+ if (!index) {
+ push_warning_printf(
+ m_user_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_WRONG_INDEX,
+ "InnoDB could not find key"
+ " with name %s", key->name.str);
+ } else {
+ ut_ad(!index->to_be_dropped);
+ if (!index->is_primary()) {
+ drop_index[n_drop_index++] = index;
+ } else {
+ drop_primary = index;
+ }
+ }
+ }
+
+ /* If all FULLTEXT indexes were removed, drop an
+ internal FTS_DOC_ID_INDEX as well, unless it exists in
+ the table. */
+
+ if (innobase_fulltext_exist(table)
+ && !innobase_fulltext_exist(altered_table)
+ && !DICT_TF2_FLAG_IS_SET(
+ indexed_table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ dict_index_t* fts_doc_index
+ = indexed_table->fts_doc_id_index;
+ ut_ad(fts_doc_index);
+
+ // Add some fault tolerance for non-debug builds.
+ if (fts_doc_index == NULL) {
+ goto check_if_can_drop_indexes;
+ }
+
+ DBUG_ASSERT(!fts_doc_index->to_be_dropped);
+
+ for (uint i = 0; i < table->s->keys; i++) {
+ if (!my_strcasecmp(
+ system_charset_info,
+ FTS_DOC_ID_INDEX_NAME,
+ table->key_info[i].name.str)) {
+ /* The index exists in the MySQL
+ data dictionary. Do not drop it,
+ even though it is no longer needed
+ by InnoDB fulltext search. */
+ goto check_if_can_drop_indexes;
+ }
+ }
+
+ drop_index[n_drop_index++] = fts_doc_index;
+ }
+
+check_if_can_drop_indexes:
+ /* Check if the indexes can be dropped. */
+
+ /* Prevent a race condition between DROP INDEX and
+ CREATE TABLE adding FOREIGN KEY constraints. */
+ row_mysql_lock_data_dictionary(m_prebuilt->trx);
+
+ if (!n_drop_index) {
+ drop_index = NULL;
+ } else {
+ /* Flag all indexes that are to be dropped. */
+ for (ulint i = 0; i < n_drop_index; i++) {
+ ut_ad(!drop_index[i]->to_be_dropped);
+ drop_index[i]->to_be_dropped = 1;
+ }
+ }
+
+ if (m_prebuilt->trx->check_foreigns) {
+ for (uint i = 0; i < n_drop_index; i++) {
+ dict_index_t* index = drop_index[i];
+
+ if (innobase_check_foreign_key_index(
+ ha_alter_info, index,
+ indexed_table, col_names,
+ m_prebuilt->trx, drop_fk, n_drop_fk)) {
+ row_mysql_unlock_data_dictionary(
+ m_prebuilt->trx);
+ m_prebuilt->trx->error_info = index;
+ print_error(HA_ERR_DROP_INDEX_FK,
+ MYF(0));
+ goto err_exit;
+ }
+ }
+
+ /* If the primary index is to be dropped, check
+ whether any dependent foreign key constraints
+ would be affected */
+ if (drop_primary
+ && innobase_check_foreign_key_index(
+ ha_alter_info, drop_primary,
+ indexed_table, col_names,
+ m_prebuilt->trx, drop_fk, n_drop_fk)) {
+ row_mysql_unlock_data_dictionary(m_prebuilt->trx);
+ print_error(HA_ERR_DROP_INDEX_FK, MYF(0));
+ goto err_exit;
+ }
+ }
+
+ row_mysql_unlock_data_dictionary(m_prebuilt->trx);
+ } else {
+ drop_index = NULL;
+ }
+
+	/* If any of the existing indexes is marked as corrupted,
+	refuse to add more indexes. */
+ if (ha_alter_info->handler_flags & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) {
+ for (dict_index_t* index = dict_table_get_first_index(indexed_table);
+ index != NULL; index = dict_table_get_next_index(index)) {
+
+ if (!index->to_be_dropped && index->is_committed()
+ && index->is_corrupted()) {
+ my_error(ER_INDEX_CORRUPT, MYF(0), index->name());
+ goto err_exit;
+ }
+ }
+ }
+
+ n_add_fk = 0;
+
+ if (ha_alter_info->handler_flags
+ & ALTER_ADD_FOREIGN_KEY) {
+ ut_ad(!m_prebuilt->trx->check_foreigns);
+
+ alter_fill_stored_column(altered_table, m_prebuilt->table,
+ &s_cols, &s_heap);
+
+ add_fk = static_cast<dict_foreign_t**>(
+ mem_heap_zalloc(
+ heap,
+ ha_alter_info->alter_info->key_list.elements
+ * sizeof(dict_foreign_t*)));
+
+ if (!innobase_get_foreign_key_info(
+ ha_alter_info, table_share,
+ m_prebuilt->table, col_names,
+ drop_index, n_drop_index,
+ add_fk, &n_add_fk, m_prebuilt->trx, s_cols)) {
+err_exit:
+ if (n_drop_index) {
+ row_mysql_lock_data_dictionary(m_prebuilt->trx);
+
+ /* Clear the to_be_dropped flags, which might
+ have been set at this point. */
+ for (ulint i = 0; i < n_drop_index; i++) {
+ ut_ad(drop_index[i]->is_committed());
+ drop_index[i]->to_be_dropped = 0;
+ }
+
+ row_mysql_unlock_data_dictionary(
+ m_prebuilt->trx);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ if (s_cols != NULL) {
+ UT_DELETE(s_cols);
+ mem_heap_free(s_heap);
+ }
+
+ goto err_exit_no_heap;
+ }
+
+ if (s_cols != NULL) {
+ UT_DELETE(s_cols);
+ mem_heap_free(s_heap);
+ }
+ }
+
+ if (ha_alter_info->handler_flags & ALTER_RENAME_INDEX) {
+ for (const Alter_inplace_info::Rename_key_pair& pair :
+ ha_alter_info->rename_keys) {
+ dict_index_t* index = dict_table_get_index_on_name(
+ indexed_table, pair.old_key->name.str);
+
+ if (!index || index->is_corrupted()) {
+ my_error(ER_INDEX_CORRUPT, MYF(0),
+ index->name());
+ goto err_exit;
+ }
+ }
+ }
+
+ const ha_table_option_struct& alt_opt=
+ *ha_alter_info->create_info->option_struct;
+
+ if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA)
+ || ((ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE
+ | INNOBASE_ALTER_NOCREATE
+ | INNOBASE_ALTER_INSTANT))
+ == ALTER_OPTIONS
+ && !alter_options_need_rebuild(ha_alter_info, table))) {
+
+ ha_innobase_inplace_ctx *ctx = NULL;
+ if (heap) {
+ ctx = new ha_innobase_inplace_ctx(
+ m_prebuilt,
+ drop_index, n_drop_index,
+ drop_fk, n_drop_fk,
+ add_fk, n_add_fk,
+ ha_alter_info->online,
+ heap, indexed_table,
+ col_names, ULINT_UNDEFINED, 0, 0,
+ (ha_alter_info->ignore
+ || !thd_is_strict_mode(m_user_thd)),
+ alt_opt.page_compressed,
+ alt_opt.page_compression_level);
+ ha_alter_info->handler_ctx = ctx;
+ }
+
+ DBUG_ASSERT(m_prebuilt->trx->dict_operation_lock_mode == 0);
+ online_retry_drop_indexes(m_prebuilt->table, m_user_thd);
+
+ if ((ha_alter_info->handler_flags
+ & ALTER_DROP_VIRTUAL_COLUMN)
+ && prepare_inplace_drop_virtual(ha_alter_info, table)) {
+ DBUG_RETURN(true);
+ }
+
+ if ((ha_alter_info->handler_flags
+ & ALTER_ADD_VIRTUAL_COLUMN)
+ && prepare_inplace_add_virtual(
+ ha_alter_info, altered_table, table)) {
+ DBUG_RETURN(true);
+ }
+
+ if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA)
+ && alter_templ_needs_rebuild(altered_table, ha_alter_info,
+ ctx->new_table)
+ && ctx->new_table->n_v_cols > 0) {
+			/* Changing the MariaDB record structure may
+			end up here only if virtual columns were
+			altered. In this case, however, vc_templ
+			should be rebuilt. Since we do not actually
+			change any stored data, we can simply dispose
+			of vc_templ; it will be recreated on the next
+			ha_innobase::open(). */
+
+ DBUG_ASSERT(ctx->new_table == ctx->old_table);
+
+ dict_free_vc_templ(ctx->new_table->vc_templ);
+ UT_DELETE(ctx->new_table->vc_templ);
+
+ ctx->new_table->vc_templ = NULL;
+ }
+
+ DBUG_RETURN(false);
+ }
+
+ /* If we are to build a full-text search index, check whether
+ the table already has a DOC ID column. If not, we will need to
+	add a hidden Doc ID column and rebuild the primary index. */
+ if (innobase_fulltext_exist(altered_table)) {
+ ulint doc_col_no;
+ ulint num_v = 0;
+
+ if (!innobase_fts_check_doc_id_col(
+ m_prebuilt->table,
+ altered_table, &fts_doc_col_no, &num_v)) {
+
+ fts_doc_col_no = altered_table->s->fields - num_v;
+ add_fts_doc_id = true;
+ add_fts_doc_id_idx = true;
+
+ } else if (fts_doc_col_no == ULINT_UNDEFINED) {
+ goto err_exit;
+ }
+
+ switch (innobase_fts_check_doc_id_index(
+ m_prebuilt->table, altered_table,
+ &doc_col_no)) {
+ case FTS_NOT_EXIST_DOC_ID_INDEX:
+ add_fts_doc_id_idx = true;
+ break;
+ case FTS_INCORRECT_DOC_ID_INDEX:
+ my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0),
+ FTS_DOC_ID_INDEX_NAME);
+ goto err_exit;
+ case FTS_EXIST_DOC_ID_INDEX:
+ DBUG_ASSERT(
+ doc_col_no == fts_doc_col_no
+ || doc_col_no == ULINT_UNDEFINED
+ || (ha_alter_info->handler_flags
+ & (ALTER_STORED_COLUMN_ORDER
+ | ALTER_DROP_STORED_COLUMN
+ | ALTER_ADD_STORED_BASE_COLUMN)));
+ }
+ }
+
+ /* See if an AUTO_INCREMENT column was added. */
+ uint i = 0;
+ ulint num_v = 0;
+ for (const Create_field& new_field :
+ ha_alter_info->alter_info->create_list) {
+ const Field* field;
+
+ DBUG_ASSERT(i < altered_table->s->fields);
+
+ for (uint old_i = 0; table->field[old_i]; old_i++) {
+ if (new_field.field == table->field[old_i]) {
+ goto found_col;
+ }
+ }
+
+ /* This is an added column. */
+ DBUG_ASSERT(!new_field.field);
+ DBUG_ASSERT(ha_alter_info->handler_flags
+ & ALTER_ADD_COLUMN);
+
+ field = altered_table->field[i];
+
+ DBUG_ASSERT((MTYP_TYPENR(field->unireg_check)
+ == Field::NEXT_NUMBER)
+ == !!(field->flags & AUTO_INCREMENT_FLAG));
+
+ if (field->flags & AUTO_INCREMENT_FLAG) {
+ if (add_autoinc_col_no != ULINT_UNDEFINED) {
+ /* This should have been blocked earlier. */
+ ut_ad(0);
+ my_error(ER_WRONG_AUTO_KEY, MYF(0));
+ goto err_exit;
+ }
+
+			/* Get the column number in the non-virtual
+			column array. */
+ add_autoinc_col_no = i - num_v;
+
+ autoinc_col_max_value = innobase_get_int_col_max_value(field);
+ }
+found_col:
+ num_v += !new_field.stored_in_db();
+ i++;
+ }
+
+ DBUG_ASSERT(heap);
+ DBUG_ASSERT(m_user_thd == m_prebuilt->trx->mysql_thd);
+ DBUG_ASSERT(!ha_alter_info->handler_ctx);
+
+ ha_alter_info->handler_ctx = new ha_innobase_inplace_ctx(
+ m_prebuilt,
+ drop_index, n_drop_index,
+ drop_fk, n_drop_fk, add_fk, n_add_fk,
+ ha_alter_info->online,
+ heap, m_prebuilt->table, col_names,
+ add_autoinc_col_no,
+ ha_alter_info->create_info->auto_increment_value,
+ autoinc_col_max_value,
+ ha_alter_info->ignore || !thd_is_strict_mode(m_user_thd),
+ alt_opt.page_compressed, alt_opt.page_compression_level);
+
+ DBUG_RETURN(prepare_inplace_alter_table_dict(
+ ha_alter_info, altered_table, table,
+ table_share->table_name.str,
+ info.flags(), info.flags2(),
+ fts_doc_col_no, add_fts_doc_id,
+ add_fts_doc_id_idx));
+}
+
+/** Check whether a column length change in an ALTER TABLE operation
+requires the template to be rebuilt.
+@param[in]	altered_table	TABLE object for new version of table.
+@param[in]	ha_alter_info	Structure describing changes to be done
+				by ALTER TABLE and holding data used
+				during in-place alter.
+@param[in]	table		table being altered
+@return true if the template needs to be rebuilt */
+static
+bool
+alter_templ_needs_rebuild(
+ const TABLE* altered_table,
+ const Alter_inplace_info* ha_alter_info,
+ const dict_table_t* table)
+{
+ ulint i = 0;
+
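+	/* Note: this check is conservative. It requests a rebuild
+	whenever any new field definition is longer than any existing
+	column, without matching fields to their columns. */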
+ for (Field** fp = altered_table->field; *fp; fp++, i++) {
+ for (const Create_field& cf :
+ ha_alter_info->alter_info->create_list) {
+ for (ulint j=0; j < table->n_cols; j++) {
+ dict_col_t* cols
+ = dict_table_get_nth_col(table, j);
+ if (cf.length > cols->len) {
+ return(true);
+ }
+ }
+ }
+ }
+
+ return(false);
+}
+
+/** Get the name of an erroneous key.
+@param[in]	error_key_num	InnoDB number of the erroneous key
+@param[in] ha_alter_info changes that were being performed
+@param[in] table InnoDB table
+@return the name of the erroneous key */
+static
+const char*
+get_error_key_name(
+ ulint error_key_num,
+ const Alter_inplace_info* ha_alter_info,
+ const dict_table_t* table)
+{
+ if (error_key_num == ULINT_UNDEFINED) {
+ return(FTS_DOC_ID_INDEX_NAME);
+ } else if (ha_alter_info->key_count == 0) {
+ return(dict_table_get_first_index(table)->name);
+ } else {
+ return(ha_alter_info->key_info_buffer[error_key_num].name.str);
+ }
+}
+
+/** Alter the table structure in-place with operations
+specified using Alter_inplace_info.
+The level of concurrency allowed during this operation depends
+on the return value from check_if_supported_inplace_alter().
+
+@param altered_table TABLE object for new version of table.
+@param ha_alter_info Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+
+@retval true Failure
+@retval false Success
+*/
+
+bool
+ha_innobase::inplace_alter_table(
+/*=============================*/
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info)
+{
+ dberr_t error;
+ dict_add_v_col_t* add_v = NULL;
+ dict_vcol_templ_t* s_templ = NULL;
+ dict_vcol_templ_t* old_templ = NULL;
+ struct TABLE* eval_table = altered_table;
+ bool rebuild_templ = false;
+ DBUG_ENTER("inplace_alter_table");
+ DBUG_ASSERT(!srv_read_only_mode);
+ ut_ad(!sync_check_iterate(sync_check()));
+ ut_ad(!rw_lock_own_flagged(&dict_sys.latch,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+
+ DEBUG_SYNC(m_user_thd, "innodb_inplace_alter_table_enter");
+
+ if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA)) {
+ok_exit:
+ DEBUG_SYNC(m_user_thd, "innodb_after_inplace_alter_table");
+ DBUG_RETURN(false);
+ }
+
+ if ((ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE
+ | INNOBASE_ALTER_NOCREATE
+ | INNOBASE_ALTER_INSTANT))
+ == ALTER_OPTIONS
+ && !alter_options_need_rebuild(ha_alter_info, table)) {
+ goto ok_exit;
+ }
+
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>
+ (ha_alter_info->handler_ctx);
+
+ DBUG_ASSERT(ctx);
+ DBUG_ASSERT(ctx->trx);
+ DBUG_ASSERT(ctx->prebuilt == m_prebuilt);
+
+ if (ctx->is_instant()) goto ok_exit;
+
+ dict_index_t* pk = dict_table_get_first_index(m_prebuilt->table);
+ ut_ad(pk != NULL);
+
+	/* For partitioned tables this could already be allocated from a
+	previous partition invocation. For normal tables this is NULL. */
+ UT_DELETE(ctx->m_stage);
+
+ ctx->m_stage = UT_NEW_NOKEY(ut_stage_alter_t(pk));
+
+ if (!m_prebuilt->table->is_readable()) {
+ goto all_done;
+ }
+
+	/* If we are rebuilding the table or adding virtual columns
+	in the same statement, we need to build a table template that
+	carries translation information between the MySQL TABLE and
+	the InnoDB table, describing the virtual columns and their
+	base columns. This is used by the computation callback, so
+	that the data in the base columns can be extracted and sent
+	to the server. If a column length changes and the column is
+	part of an index on virtual columns, the template must also
+	be rebuilt. */
+ rebuild_templ
+ = ctx->need_rebuild()
+ || ((ha_alter_info->handler_flags
+ & ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE)
+ && alter_templ_needs_rebuild(
+ altered_table, ha_alter_info, ctx->new_table));
+
+ if ((ctx->new_table->n_v_cols > 0) && rebuild_templ) {
+		/* Save the template if it isn't NULL, so that the
+		original state can be restored if the ALTER operation
+		fails. */
+ if (ctx->new_table->vc_templ != NULL && !ctx->need_rebuild()) {
+ old_templ = ctx->new_table->vc_templ;
+ }
+ s_templ = UT_NEW_NOKEY(dict_vcol_templ_t());
+
+ innobase_build_v_templ(
+ altered_table, ctx->new_table, s_templ, NULL, false);
+
+ ctx->new_table->vc_templ = s_templ;
+ } else if (ctx->num_to_add_vcol > 0 && ctx->num_to_drop_vcol == 0) {
+		/* If a virtual column is being dropped, an in-place
+		ADD INDEX on a newly added virtual column is
+		disallowed, so there is no need to rebuild the
+		template with add_v here. See also the assertion in
+		innodb_v_adjust_idx_col(). */
+
+ s_templ = UT_NEW_NOKEY(dict_vcol_templ_t());
+
+ add_v = static_cast<dict_add_v_col_t*>(
+ mem_heap_alloc(ctx->heap, sizeof *add_v));
+ add_v->n_v_col = ctx->num_to_add_vcol;
+ add_v->v_col = ctx->add_vcol;
+ add_v->v_col_name = ctx->add_vcol_name;
+
+ innobase_build_v_templ(
+ altered_table, ctx->new_table, s_templ, add_v, false);
+ old_templ = ctx->new_table->vc_templ;
+ ctx->new_table->vc_templ = s_templ;
+ }
+
+	/* Dropping a virtual column without a rebuild leaves the
+	dict table unchanged; use the old table to evaluate virtual
+	column values in innobase_get_computed_value(). */
+ if (!ctx->need_rebuild() && ctx->num_to_drop_vcol > 0) {
+ eval_table = table;
+ }
+
+ /* Read the clustered index of the table and build
+ indexes based on this information using temporary
+ files and merge sort. */
+ DBUG_EXECUTE_IF("innodb_OOM_inplace_alter",
+ error = DB_OUT_OF_MEMORY; goto oom;);
+
+ error = row_merge_build_indexes(
+ m_prebuilt->trx,
+ m_prebuilt->table, ctx->new_table,
+ ctx->online,
+ ctx->add_index, ctx->add_key_numbers, ctx->num_to_add_index,
+ altered_table, ctx->defaults, ctx->col_map,
+ ctx->add_autoinc, ctx->sequence, ctx->skip_pk_sort,
+ ctx->m_stage, add_v, eval_table, ctx->allow_not_null);
+
+#ifndef DBUG_OFF
+oom:
+#endif /* !DBUG_OFF */
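+	/* For an online table rebuild, apply the row log that
+	recorded concurrent changes while the indexes were being
+	built. */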
+ if (error == DB_SUCCESS && ctx->online && ctx->need_rebuild()) {
+ DEBUG_SYNC_C("row_log_table_apply1_before");
+ error = row_log_table_apply(
+ ctx->thr, m_prebuilt->table, altered_table,
+ ctx->m_stage, ctx->new_table);
+ }
+
+ /* Init online ddl status variables */
+ onlineddl_rowlog_rows = 0;
+ onlineddl_rowlog_pct_used = 0;
+ onlineddl_pct_progress = 0;
+
+ if (s_templ) {
+ ut_ad(ctx->need_rebuild() || ctx->num_to_add_vcol > 0
+ || rebuild_templ);
+ dict_free_vc_templ(s_templ);
+ UT_DELETE(s_templ);
+
+ ctx->new_table->vc_templ = old_templ;
+ }
+
+ DEBUG_SYNC_C("inplace_after_index_build");
+
+ DBUG_EXECUTE_IF("create_index_fail",
+ error = DB_DUPLICATE_KEY;
+ m_prebuilt->trx->error_key_num = ULINT_UNDEFINED;);
+
+ /* After an error, remove all those index definitions
+ from the dictionary which were defined. */
+
+ switch (error) {
+ KEY* dup_key;
+ all_done:
+ case DB_SUCCESS:
+ ut_d(mutex_enter(&dict_sys.mutex));
+ ut_d(dict_table_check_for_dup_indexes(
+ m_prebuilt->table, CHECK_PARTIAL_OK));
+ ut_d(mutex_exit(&dict_sys.mutex));
+ /* prebuilt->table->n_ref_count can be anything here,
+ given that we hold at most a shared lock on the table. */
+ goto ok_exit;
+ case DB_DUPLICATE_KEY:
+ if (m_prebuilt->trx->error_key_num == ULINT_UNDEFINED
+ || ha_alter_info->key_count == 0) {
+ /* This should be the hidden index on
+ FTS_DOC_ID, or there is no PRIMARY KEY in the
+ table. Either way, we should be seeing and
+ reporting a bogus duplicate key error. */
+ dup_key = NULL;
+ } else {
+ DBUG_ASSERT(m_prebuilt->trx->error_key_num
+ < ha_alter_info->key_count);
+ dup_key = &ha_alter_info->key_info_buffer[
+ m_prebuilt->trx->error_key_num];
+ }
+ print_keydup_error(altered_table, dup_key, MYF(0));
+ break;
+ case DB_ONLINE_LOG_TOO_BIG:
+ DBUG_ASSERT(ctx->online);
+ my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0),
+ get_error_key_name(m_prebuilt->trx->error_key_num,
+ ha_alter_info, m_prebuilt->table));
+ break;
+ case DB_INDEX_CORRUPT:
+ my_error(ER_INDEX_CORRUPT, MYF(0),
+ get_error_key_name(m_prebuilt->trx->error_key_num,
+ ha_alter_info, m_prebuilt->table));
+ break;
+ case DB_DECRYPTION_FAILED: {
+ String str;
+ const char* engine= table_type();
+ get_error_message(HA_ERR_DECRYPTION_FAILED, &str);
+ my_error(ER_GET_ERRMSG, MYF(0), HA_ERR_DECRYPTION_FAILED, str.c_ptr(), engine);
+ break;
+ }
+ default:
+ my_error_innodb(error,
+ table_share->table_name.str,
+ m_prebuilt->table->flags);
+ }
+
+ /* prebuilt->table->n_ref_count can be anything here, given
+ that we hold at most a shared lock on the table. */
+ m_prebuilt->trx->error_info = NULL;
+ ctx->trx->error_state = DB_SUCCESS;
+
+ DBUG_RETURN(true);
+}
+
+/** Free the modification log for online table rebuild.
+@param table table that was being rebuilt online */
+static
+void
+innobase_online_rebuild_log_free(
+/*=============================*/
+ dict_table_t* table)
+{
+ dict_index_t* clust_index = dict_table_get_first_index(table);
+ ut_d(dict_sys.assert_locked());
+ rw_lock_x_lock(&clust_index->lock);
+
+ if (clust_index->online_log) {
+ ut_ad(dict_index_get_online_status(clust_index)
+ == ONLINE_INDEX_CREATION);
+ clust_index->online_status = ONLINE_INDEX_COMPLETE;
+ row_log_free(clust_index->online_log);
+ clust_index->online_log = NULL;
+ DEBUG_SYNC_C("innodb_online_rebuild_log_free_aborted");
+ }
+
+ DBUG_ASSERT(dict_index_get_online_status(clust_index)
+ == ONLINE_INDEX_COMPLETE);
+ rw_lock_x_unlock(&clust_index->lock);
+}
+
+/** Check whether a column is a system column or is part of at least
+one index that is not going to be dropped.
+@param[in]	table		table
+@param[in]	col_no		column number
+@param[in]	is_v		whether this is a virtual column
+@param[in]	only_committed	whether to consider only committed indexes
+@retval true if the column is a system column or is part of such an index
+@retval false otherwise */
+static
+bool
+check_col_exists_in_indexes(
+ const dict_table_t* table,
+ ulint col_no,
+ bool is_v,
+ bool only_committed = false)
+{
+ /* This function does not check system columns */
+ if (!is_v && dict_table_get_nth_col(table, col_no)->mtype == DATA_SYS) {
+ return(true);
+ }
+
+ for (const dict_index_t* index = dict_table_get_first_index(table);
+ index;
+ index = dict_table_get_next_index(index)) {
+
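+		/* When only_committed is set, skip uncommitted
+		indexes; otherwise, skip indexes that are about to be
+		dropped. */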
+ if (only_committed
+ ? !index->is_committed()
+ : index->to_be_dropped) {
+ continue;
+ }
+
+ for (ulint i = 0; i < index->n_user_defined_cols; i++) {
+ const dict_col_t* idx_col
+ = dict_index_get_nth_col(index, i);
+
+ if (is_v && idx_col->is_virtual()) {
+ const dict_v_col_t* v_col = reinterpret_cast<
+ const dict_v_col_t*>(idx_col);
+ if (v_col->v_pos == col_no) {
+ return(true);
+ }
+ }
+
+ if (!is_v && !idx_col->is_virtual()
+ && dict_col_get_no(idx_col) == col_no) {
+ return(true);
+ }
+ }
+ }
+
+ return(false);
+}
+
+/** Roll back the creation of secondary indexes; drop the indexes with
+the temporary index prefix
+@param user_table InnoDB table
+@param table the TABLE
+@param locked TRUE=table locked, FALSE=may need to do a lazy drop
+@param trx the transaction
+@param alter_trx transaction which takes S-lock on the table
+ while creating the index */
+static
+void
+innobase_rollback_sec_index(
+ dict_table_t* user_table,
+ const TABLE* table,
+ bool locked,
+ trx_t* trx,
+ const trx_t* alter_trx=NULL)
+{
+ row_merge_drop_indexes(trx, user_table, locked, alter_trx);
+
+	/* Free table->fts only if there is no FTS_DOC_ID
+	column in the table */
+ if (user_table->fts
+ && !DICT_TF2_FLAG_IS_SET(user_table,
+ DICT_TF2_FTS_HAS_DOC_ID)
+ && !innobase_fulltext_exist(table)) {
+ fts_free(user_table);
+ }
+}
+
+/** Get the number of uncommitted FULLTEXT indexes during a rollback
+operation.
+@param[in]	table	table whose ALTER TABLE is being rolled back
+@return number of uncommitted FULLTEXT indexes */
+static
+ulint innobase_get_uncommitted_fts_indexes(const dict_table_t* table)
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+ dict_index_t* index = dict_table_get_first_index(table);
+ ulint n_uncommitted_fts = 0;
+
+ for (; index ; index = dict_table_get_next_index(index))
+ {
+ if (index->type & DICT_FTS && !index->is_committed())
+ n_uncommitted_fts++;
+ }
+
+ return n_uncommitted_fts;
+}
+
+/** Roll back the changes made during prepare_inplace_alter_table()
+and inplace_alter_table() inside the storage engine. Note that the
+allowed level of concurrency during this operation will be the same as
+for inplace_alter_table() and thus might be higher than during
+prepare_inplace_alter_table(). (E.g. concurrent writes were blocked
+during prepare, but might not be during commit).
+
+@param ha_alter_info Data used during in-place alter.
+@param table the TABLE
+@param prebuilt the prebuilt struct
+@retval true Failure
+@retval false Success
+*/
+inline MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+rollback_inplace_alter_table(
+/*=========================*/
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* table,
+ row_prebuilt_t* prebuilt)
+{
+ bool fail = false;
+
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>
+ (ha_alter_info->handler_ctx);
+
+ DBUG_ENTER("rollback_inplace_alter_table");
+
+ if (!ctx || !ctx->trx) {
+ /* If we have not started a transaction yet,
+ (almost) nothing has been or needs to be done. */
+ goto func_exit;
+ }
+
+ trx_start_for_ddl(ctx->trx, ctx->need_rebuild()
+ ? TRX_DICT_OP_TABLE : TRX_DICT_OP_INDEX);
+ row_mysql_lock_data_dictionary(ctx->trx);
+
+ if (ctx->need_rebuild()) {
+ /* DML threads can access ctx->new_table via the
+ online rebuild log. Free it first. */
+ innobase_online_rebuild_log_free(prebuilt->table);
+ }
+
+ if (!ctx->new_table) {
+ ut_ad(ctx->need_rebuild());
+ } else if (ctx->need_rebuild()) {
+ dberr_t err= DB_SUCCESS;
+ ulint flags = ctx->new_table->flags;
+
+		/* Since the FTS-index-specific auxiliary tables have
+		not yet been registered with "table->fts" by
+		fts_add_index(), we must delete them explicitly here. */
+ if (dict_table_has_fts_index(ctx->new_table)) {
+
+ err = innobase_drop_fts_index_table(
+ ctx->new_table, ctx->trx);
+
+ if (err != DB_SUCCESS) {
+ my_error_innodb(
+ err, table->s->table_name.str,
+ flags);
+ fail = true;
+ }
+ }
+
+ dict_table_close_and_drop(ctx->trx, ctx->new_table);
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ default:
+ my_error_innodb(err, table->s->table_name.str,
+ flags);
+ fail = true;
+ }
+ } else {
+ DBUG_ASSERT(!(ha_alter_info->handler_flags
+ & ALTER_ADD_PK_INDEX));
+ DBUG_ASSERT(ctx->new_table == prebuilt->table);
+
+		/* Remove the table from fts_optimize_wq if there is
+		only one FULLTEXT index. */
+ if (prebuilt->table->fts
+ && innobase_get_uncommitted_fts_indexes(
+ prebuilt->table) == 1
+ && (ib_vector_is_empty(prebuilt->table->fts->indexes)
+ || ib_vector_size(prebuilt->table->fts->indexes)
+ == 1)) {
+ row_mysql_unlock_data_dictionary(ctx->trx);
+ fts_optimize_remove_table(prebuilt->table);
+ row_mysql_lock_data_dictionary(ctx->trx);
+ }
+
+ innobase_rollback_sec_index(
+ prebuilt->table, table,
+ (ha_alter_info->alter_info->requested_lock
+ == Alter_info::ALTER_TABLE_LOCK_EXCLUSIVE),
+ ctx->trx, prebuilt->trx);
+
+ ctx->clean_new_vcol_index();
+ }
+
+ trx_commit_for_mysql(ctx->trx);
+ row_mysql_unlock_data_dictionary(ctx->trx);
+ ctx->trx->free();
+ ctx->trx = NULL;
+
+func_exit:
+#ifndef DBUG_OFF
+ dict_index_t* clust_index = dict_table_get_first_index(
+ prebuilt->table);
+ DBUG_ASSERT(!clust_index->online_log);
+ DBUG_ASSERT(dict_index_get_online_status(clust_index)
+ == ONLINE_INDEX_COMPLETE);
+#endif /* !DBUG_OFF */
+
+ if (ctx) {
+ DBUG_ASSERT(ctx->prebuilt == prebuilt);
+
+ if (ctx->num_to_add_fk) {
+ for (ulint i = 0; i < ctx->num_to_add_fk; i++) {
+ dict_foreign_free(ctx->add_fk[i]);
+ }
+ }
+
+ if (ctx->num_to_drop_index) {
+ row_mysql_lock_data_dictionary(prebuilt->trx);
+
+ /* Clear the to_be_dropped flags
+ in the data dictionary cache.
+ The flags may already have been cleared,
+ in case an error was detected in
+ commit_inplace_alter_table(). */
+ for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+ dict_index_t* index = ctx->drop_index[i];
+ DBUG_ASSERT(index->is_committed());
+ index->to_be_dropped = 0;
+ }
+
+ row_mysql_unlock_data_dictionary(prebuilt->trx);
+ }
+ }
+
+	/* Reset dict_col_t::ord_part for the columns that failed to
+	be indexed. We do this by checking every existing column to
+	see whether any remaining index still covers it. */
+ for (ulint i = 0; i < dict_table_get_n_cols(prebuilt->table); i++) {
+ dict_col_t& col = prebuilt->table->cols[i];
+ if (!col.ord_part) {
+ continue;
+ }
+ if (!check_col_exists_in_indexes(prebuilt->table, i, false,
+ true)) {
+ col.ord_part = 0;
+ }
+ }
+
+ for (ulint i = 0; i < dict_table_get_n_v_cols(prebuilt->table); i++) {
+ dict_col_t& col = prebuilt->table->v_cols[i].m_col;
+ if (!col.ord_part) {
+ continue;
+ }
+ if (!check_col_exists_in_indexes(prebuilt->table, i, true,
+ true)) {
+ col.ord_part = 0;
+ }
+ }
+
+ trx_commit_for_mysql(prebuilt->trx);
+ MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
+ DBUG_RETURN(fail);
+}
+
+/** Drop a FOREIGN KEY constraint from the data dictionary tables.
+@param trx data dictionary transaction
+@param table_name Table name in MySQL
+@param foreign_id Foreign key constraint identifier
+@retval true Failure
+@retval false Success */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_drop_foreign_try(
+/*======================*/
+ trx_t* trx,
+ const char* table_name,
+ const char* foreign_id)
+{
+ DBUG_ENTER("innobase_drop_foreign_try");
+
+ DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_d(dict_sys.assert_locked());
+
+ /* Drop the constraint from the data dictionary. */
+ static const char sql[] =
+ "PROCEDURE DROP_FOREIGN_PROC () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_FOREIGN WHERE ID=:id;\n"
+ "DELETE FROM SYS_FOREIGN_COLS WHERE ID=:id;\n"
+ "END;\n";
+
+ dberr_t error;
+ pars_info_t* info;
+
+ info = pars_info_create();
+ pars_info_add_str_literal(info, "id", foreign_id);
+
+ trx->op_info = "dropping foreign key constraint from dictionary";
+ error = que_eval_sql(info, sql, FALSE, trx);
+ trx->op_info = "";
+
+ DBUG_EXECUTE_IF("ib_drop_foreign_error",
+ error = DB_OUT_OF_FILE_SPACE;);
+
+ if (error != DB_SUCCESS) {
+ my_error_innodb(error, table_name, 0);
+ trx->error_state = DB_SUCCESS;
+ DBUG_RETURN(true);
+ }
+
+ DBUG_RETURN(false);
+}
+
+/** Rename a column in the data dictionary tables.
+@param[in] ctx ALTER TABLE context
+@param[in,out] trx Data dictionary transaction
+@param[in] table_name Table name in MySQL
+@param[in] from old column name
+@param[in] to new column name
+@retval true Failure
+@retval false Success */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_rename_column_try(
+ const ha_innobase_inplace_ctx& ctx,
+ trx_t* trx,
+ const char* table_name,
+ const char* from,
+ const char* to)
+{
+ dberr_t error;
+ bool clust_has_prefixes = false;
+
+ DBUG_ENTER("innobase_rename_column_try");
+
+ DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_d(dict_sys.assert_locked());
+
+ if (ctx.need_rebuild()) {
+ goto rename_foreign;
+ }
+
+ error = DB_SUCCESS;
+
+ trx->op_info = "renaming column in SYS_FIELDS";
+
+ for (const dict_index_t* index = dict_table_get_first_index(
+ ctx.old_table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ bool has_prefixes = false;
+ for (size_t i = 0; i < dict_index_get_n_fields(index); i++) {
+ if (dict_index_get_nth_field(index, i)->prefix_len) {
+ has_prefixes = true;
+ break;
+ }
+ }
+
+ for (ulint i = 0; i < dict_index_get_n_fields(index); i++) {
+ const dict_field_t& f = index->fields[i];
+ DBUG_ASSERT(!f.name == f.col->is_dropped());
+
+ if (!f.name || my_strcasecmp(system_charset_info,
+ f.name, from)) {
+ continue;
+ }
+
+ pars_info_t* info = pars_info_create();
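+			/* When any field of the index has a column
+			prefix, SYS_FIELDS.POS encodes the field
+			position and prefix length as
+			(i << 16 | prefix_len). */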
+ ulint pos = has_prefixes ? i << 16 | f.prefix_len : i;
+
+ pars_info_add_ull_literal(info, "indexid", index->id);
+ pars_info_add_int4_literal(info, "nth", pos);
+ pars_info_add_str_literal(info, "new", to);
+
+ error = que_eval_sql(
+ info,
+ "PROCEDURE RENAME_SYS_FIELDS_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_FIELDS SET COL_NAME=:new\n"
+ "WHERE INDEX_ID=:indexid\n"
+ "AND POS=:nth;\n"
+ "END;\n",
+ FALSE, trx);
+ DBUG_EXECUTE_IF("ib_rename_column_error",
+ error = DB_OUT_OF_FILE_SPACE;);
+
+ if (error != DB_SUCCESS) {
+ goto err_exit;
+ }
+
+ if (!has_prefixes || !clust_has_prefixes
+ || f.prefix_len) {
+ continue;
+ }
+
+ /* For secondary indexes, the
+ has_prefixes check can be 'polluted'
+ by PRIMARY KEY column prefix. Try also
+ the simpler encoding of SYS_FIELDS.POS. */
+ info = pars_info_create();
+
+ pars_info_add_ull_literal(info, "indexid", index->id);
+ pars_info_add_int4_literal(info, "nth", i);
+ pars_info_add_str_literal(info, "new", to);
+
+ error = que_eval_sql(
+ info,
+ "PROCEDURE RENAME_SYS_FIELDS_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_FIELDS SET COL_NAME=:new\n"
+ "WHERE INDEX_ID=:indexid\n"
+ "AND POS=:nth;\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ goto err_exit;
+ }
+ }
+
+ if (index == dict_table_get_first_index(ctx.old_table)) {
+ clust_has_prefixes = has_prefixes;
+ }
+ }
+
+ if (error != DB_SUCCESS) {
+err_exit:
+ my_error_innodb(error, table_name, 0);
+ trx->error_state = DB_SUCCESS;
+ trx->op_info = "";
+ DBUG_RETURN(true);
+ }
+
+rename_foreign:
+ trx->op_info = "renaming column in SYS_FOREIGN_COLS";
+
+ std::set<dict_foreign_t*> fk_evict;
+ bool foreign_modified;
+
+ for (dict_foreign_set::const_iterator it = ctx.old_table->foreign_set.begin();
+ it != ctx.old_table->foreign_set.end();
+ ++it) {
+
+ dict_foreign_t* foreign = *it;
+ foreign_modified = false;
+
+ for (unsigned i = 0; i < foreign->n_fields; i++) {
+ if (my_strcasecmp(system_charset_info,
+ foreign->foreign_col_names[i],
+ from)) {
+ continue;
+ }
+
+ /* Ignore the foreign key rename if fk info
+ is being dropped. */
+ if (innobase_dropping_foreign(
+ foreign, ctx.drop_fk,
+ ctx.num_to_drop_fk)) {
+ continue;
+ }
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", foreign->id);
+ pars_info_add_int4_literal(info, "nth", i);
+ pars_info_add_str_literal(info, "new", to);
+
+ error = que_eval_sql(
+ info,
+ "PROCEDURE RENAME_SYS_FOREIGN_F_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_FOREIGN_COLS\n"
+ "SET FOR_COL_NAME=:new\n"
+ "WHERE ID=:id AND POS=:nth;\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ goto err_exit;
+ }
+ foreign_modified = true;
+ }
+
+ if (foreign_modified) {
+ fk_evict.insert(foreign);
+ }
+ }
+
+ for (dict_foreign_set::const_iterator it
+ = ctx.old_table->referenced_set.begin();
+ it != ctx.old_table->referenced_set.end();
+ ++it) {
+
+ foreign_modified = false;
+ dict_foreign_t* foreign = *it;
+
+ for (unsigned i = 0; i < foreign->n_fields; i++) {
+ if (my_strcasecmp(system_charset_info,
+ foreign->referenced_col_names[i],
+ from)) {
+ continue;
+ }
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", foreign->id);
+ pars_info_add_int4_literal(info, "nth", i);
+ pars_info_add_str_literal(info, "new", to);
+
+ error = que_eval_sql(
+ info,
+ "PROCEDURE RENAME_SYS_FOREIGN_R_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_FOREIGN_COLS\n"
+ "SET REF_COL_NAME=:new\n"
+ "WHERE ID=:id AND POS=:nth;\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ goto err_exit;
+ }
+ foreign_modified = true;
+ }
+
+ if (foreign_modified) {
+ fk_evict.insert(foreign);
+ }
+ }
+
+ /* Reload the foreign key info for instant table too. */
+ if (ctx.need_rebuild() || ctx.is_instant()) {
+ std::for_each(fk_evict.begin(), fk_evict.end(),
+ dict_foreign_remove_from_cache);
+ }
+
+ trx->op_info = "";
+ DBUG_RETURN(false);
+}
+
+/** Rename columns in the data dictionary tables.
+@param ha_alter_info Data used during in-place alter.
+@param ctx In-place ALTER TABLE context
+@param table the TABLE
+@param trx data dictionary transaction
+@param table_name Table name in MySQL
+@retval true Failure
+@retval false Success */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_rename_columns_try(
+/*========================*/
+ Alter_inplace_info* ha_alter_info,
+ ha_innobase_inplace_ctx*ctx,
+ const TABLE* table,
+ trx_t* trx,
+ const char* table_name)
+{
+ uint i = 0;
+ ulint num_v = 0;
+
+ DBUG_ASSERT(ctx->need_rebuild());
+ DBUG_ASSERT(ha_alter_info->handler_flags
+ & ALTER_COLUMN_NAME);
+
+ for (Field** fp = table->field; *fp; fp++, i++) {
+ const bool is_virtual = !(*fp)->stored_in_db();
+ if (!((*fp)->flags & FIELD_IS_RENAMED)) {
+ goto processed_field;
+ }
+
+ for (const Create_field& cf :
+ ha_alter_info->alter_info->create_list) {
+ if (cf.field == *fp) {
+ if (innobase_rename_column_try(
+ *ctx, trx, table_name,
+ cf.field->field_name.str,
+ cf.field_name.str)) {
+ return(true);
+ }
+ goto processed_field;
+ }
+ }
+
+ ut_error;
+processed_field:
+ if (is_virtual) {
+ num_v++;
+ }
+
+ continue;
+ }
+
+ return(false);
+}
+
+/** Convert field type and length to InnoDB format */
+static void get_type(const Field& f, uint& prtype, uint8_t& mtype,
+ uint16_t& len)
+{
+ mtype = get_innobase_type_from_mysql_type(&prtype, &f);
+ len = static_cast<uint16_t>(f.pack_length());
+ prtype |= f.type();
+ if (f.type() == MYSQL_TYPE_VARCHAR) {
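+		/* The MySQL pack length of a VARCHAR includes the
+		length prefix bytes; InnoDB stores the maximum data
+		length without them. */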
+ auto l = static_cast<const Field_varstring&>(f).length_bytes;
+ len = static_cast<uint16_t>(len - l);
+ if (l == 2) prtype |= DATA_LONG_TRUE_VARCHAR;
+ }
+ if (!f.real_maybe_null()) prtype |= DATA_NOT_NULL;
+ if (f.binary()) prtype |= DATA_BINARY_TYPE;
+ if (f.table->versioned()) {
+ if (&f == f.table->field[f.table->s->vers.start_fieldno]) {
+ prtype |= DATA_VERS_START;
+ } else if (&f == f.table->field[f.table->s->vers.end_fieldno]) {
+ prtype |= DATA_VERS_END;
+ } else if (!(f.flags & VERS_UPDATE_UNVERSIONED_FLAG)) {
+ prtype |= DATA_VERSIONED;
+ }
+ }
+ if (!f.stored_in_db()) prtype |= DATA_VIRTUAL;
+
+ if (dtype_is_string_type(mtype)) {
+ prtype |= f.charset()->number << 16;
+ }
+}
+
+/** Rename and/or enlarge a column in the data dictionary tables.
+@param ctx In-place ALTER TABLE context
+@param trx data dictionary transaction
+@param table_name Table name in MySQL
+@param pos 0-based index to user_table->cols[] or user_table->v_cols[]
+@param f new column
+@param is_v if it's a virtual column
+@retval true Failure
+@retval false Success */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_rename_or_enlarge_column_try(
+ ha_innobase_inplace_ctx*ctx,
+ trx_t* trx,
+ const char* table_name,
+ ulint pos,
+ const Field& f,
+ bool is_v)
+{
+ dict_col_t* col;
+ dict_table_t* user_table = ctx->old_table;
+
+ DBUG_ENTER("innobase_rename_or_enlarge_column_try");
+ DBUG_ASSERT(!ctx->need_rebuild());
+
+ DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_d(dict_sys.assert_locked());
+
+ ulint n_base;
+
+ if (is_v) {
+ dict_v_col_t* v_col= dict_table_get_nth_v_col(user_table, pos);
+ pos = dict_create_v_col_pos(v_col->v_pos, v_col->m_col.ind);
+ col = &v_col->m_col;
+ n_base = v_col->num_base;
+ } else {
+ col = dict_table_get_nth_col(user_table, pos);
+ n_base = 0;
+ }
+
+ unsigned prtype;
+ uint8_t mtype;
+ uint16_t len;
+ get_type(f, prtype, mtype, len);
+ DBUG_ASSERT(!dtype_is_string_type(col->mtype)
+ || col->mbminlen == f.charset()->mbminlen);
+ DBUG_ASSERT(col->len <= len);
+
+#ifdef UNIV_DEBUG
+ ut_ad(col->mbminlen <= col->mbmaxlen);
+ switch (mtype) {
+ case DATA_MYSQL:
+ if (!(prtype & DATA_BINARY_TYPE) || user_table->not_redundant()
+ || col->mbminlen != col->mbmaxlen) {
+ /* NOTE: we could allow this when !(prtype &
+ DATA_BINARY_TYPE) and ROW_FORMAT is not REDUNDANT and
+ mbminlen<mbmaxlen. That is, we treat a UTF-8 CHAR(n)
+ column somewhat like a VARCHAR. */
+ break;
+ }
+ /* fall through */
+ case DATA_FIXBINARY:
+ case DATA_CHAR:
+ ut_ad(col->len == len);
+ break;
+ case DATA_BINARY:
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ case DATA_DECIMAL:
+ case DATA_BLOB:
+ break;
+ default:
+ ut_ad(!((col->prtype ^ prtype) & ~DATA_VERSIONED));
+ ut_ad(col->mtype == mtype);
+ ut_ad(col->len == len);
+ }
+#endif /* UNIV_DEBUG */
+
+ const char* col_name = col->name(*user_table);
+ const bool same_name = !strcmp(col_name, f.field_name.str);
+
+ if (!same_name
+ && innobase_rename_column_try(*ctx, trx, table_name,
+ col_name, f.field_name.str)) {
+ DBUG_RETURN(true);
+ }
+
+ if (same_name
+ && col->prtype == prtype && col->mtype == mtype
+ && col->len == len) {
+ DBUG_RETURN(false);
+ }
+
+ DBUG_RETURN(innodb_insert_sys_columns(user_table->id, pos,
+ f.field_name.str,
+ mtype, prtype, len,
+ n_base, trx, true));
+}
+
+/** Rename or enlarge columns in the data dictionary cache
+as part of commit_try_norebuild().
+@param ha_alter_info Data used during in-place alter.
+@param ctx In-place ALTER TABLE context
+@param altered_table metadata after ALTER TABLE
+@param table metadata before ALTER TABLE
+@param trx data dictionary transaction
+@param table_name Table name in MySQL
+@retval true Failure
+@retval false Success */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_rename_or_enlarge_columns_try(
+ Alter_inplace_info* ha_alter_info,
+ ha_innobase_inplace_ctx*ctx,
+ const TABLE* altered_table,
+ const TABLE* table,
+ trx_t* trx,
+ const char* table_name)
+{
+ DBUG_ENTER("innobase_rename_or_enlarge_columns_try");
+
+ if (!(ha_alter_info->handler_flags
+ & (ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE
+ | ALTER_COLUMN_NAME))) {
+ DBUG_RETURN(false);
+ }
+
+ ulint i = 0;
+ ulint num_v = 0;
+
+ for (Field** fp = table->field; *fp; fp++, i++) {
+ const bool is_v = !(*fp)->stored_in_db();
+ ulint idx = is_v ? num_v++ : i - num_v;
+
+ Field** af = altered_table->field;
+ for (const Create_field& cf :
+ ha_alter_info->alter_info->create_list) {
+ if (cf.field == *fp) {
+ if (innobase_rename_or_enlarge_column_try(
+ ctx, trx, table_name,
+ idx, **af, is_v)) {
+ DBUG_RETURN(true);
+ }
+ break;
+ }
+ af++;
+ }
+ }
+
+ DBUG_RETURN(false);
+}
+
+/** Rename or enlarge columns in the data dictionary cache
+as part of commit_cache_norebuild().
+@param ha_alter_info Data used during in-place alter.
+@param altered_table metadata after ALTER TABLE
+@param table metadata before ALTER TABLE
+@param user_table InnoDB table that was being altered */
+static MY_ATTRIBUTE((nonnull))
+void
+innobase_rename_or_enlarge_columns_cache(
+/*=====================================*/
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* altered_table,
+ const TABLE* table,
+ dict_table_t* user_table)
+{
+ if (!(ha_alter_info->handler_flags
+ & (ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE
+ | ALTER_COLUMN_NAME))) {
+ return;
+ }
+
+ uint i = 0;
+ ulint num_v = 0;
+
+ for (Field** fp = table->field; *fp; fp++, i++) {
+ const bool is_virtual = !(*fp)->stored_in_db();
+
+ Field** af = altered_table->field;
+ for (Create_field& cf :
+ ha_alter_info->alter_info->create_list) {
+ if (cf.field != *fp) {
+ af++;
+ continue;
+ }
+
+ ulint col_n = is_virtual ? num_v : i - num_v;
+ dict_col_t *col = is_virtual
+ ? &dict_table_get_nth_v_col(user_table, col_n)
+ ->m_col
+ : dict_table_get_nth_col(user_table, col_n);
+ const bool is_string= dtype_is_string_type(col->mtype);
+ DBUG_ASSERT(col->mbminlen
+ == (is_string
+ ? (*af)->charset()->mbminlen : 0));
+ unsigned prtype;
+ uint8_t mtype;
+ uint16_t len;
+ get_type(**af, prtype, mtype, len);
+ DBUG_ASSERT(is_string == dtype_is_string_type(mtype));
+
+ col->prtype = prtype;
+ col->mtype = mtype;
+ col->len = len;
+ col->mbmaxlen = is_string
+ ? (*af)->charset()->mbmaxlen & 7: 0;
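+			/* dict_col_t::mbmaxlen is a 3-bit field,
+			hence the & 7. */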
+
+ if ((*fp)->flags & FIELD_IS_RENAMED) {
+ dict_mem_table_col_rename(
+ user_table, col_n,
+ cf.field->field_name.str,
+ (*af)->field_name.str, is_virtual);
+ }
+
+ break;
+ }
+
+ if (is_virtual) {
+ num_v++;
+ }
+ }
+}
+
+/** Set the auto-increment value of the table on commit.
+@param ha_alter_info Data used during in-place alter
+@param ctx In-place ALTER TABLE context
+@param altered_table MySQL table that is being altered
+@param old_table MySQL table as it is before the ALTER operation
+@return whether the operation failed (and my_error() was called) */
+static MY_ATTRIBUTE((nonnull))
+bool
+commit_set_autoinc(
+ Alter_inplace_info* ha_alter_info,
+ ha_innobase_inplace_ctx*ctx,
+ const TABLE* altered_table,
+ const TABLE* old_table)
+{
+ DBUG_ENTER("commit_set_autoinc");
+
+ if (!altered_table->found_next_number_field) {
+ /* There is no AUTO_INCREMENT column in the table
+ after the ALTER operation. */
+ } else if (ctx->add_autoinc != ULINT_UNDEFINED) {
+ ut_ad(ctx->need_rebuild());
+ /* An AUTO_INCREMENT column was added. Get the last
+ value from the sequence, which may be based on a
+ supplied AUTO_INCREMENT value. */
+ ib_uint64_t autoinc = ctx->sequence.last();
+ ctx->new_table->autoinc = autoinc;
+ /* Bulk index creation does not update
+ PAGE_ROOT_AUTO_INC, so we must persist the "last used"
+ value here. */
+ btr_write_autoinc(dict_table_get_first_index(ctx->new_table),
+ autoinc - 1, true);
+ } else if ((ha_alter_info->handler_flags
+ & ALTER_CHANGE_CREATE_OPTION)
+ && (ha_alter_info->create_info->used_fields
+ & HA_CREATE_USED_AUTO)) {
+
+ if (!ctx->old_table->space) {
+ my_error(ER_TABLESPACE_DISCARDED, MYF(0),
+ old_table->s->table_name.str);
+ DBUG_RETURN(true);
+ }
+
+ /* An AUTO_INCREMENT value was supplied by the user.
+ It must be persisted to the data file. */
+ const Field* ai = old_table->found_next_number_field;
+ ut_ad(!strcmp(dict_table_get_col_name(ctx->old_table,
+ innodb_col_no(ai)),
+ ai->field_name.str));
+
+ ib_uint64_t autoinc
+ = ha_alter_info->create_info->auto_increment_value;
+ if (autoinc == 0) {
+ autoinc = 1;
+ }
+
+ if (autoinc >= ctx->old_table->autoinc) {
+ /* Persist the predecessor of the
+ AUTO_INCREMENT value as the last used one. */
+ ctx->new_table->autoinc = autoinc--;
+ } else {
+ /* Mimic ALGORITHM=COPY in the following scenario:
+
+ CREATE TABLE t (a SERIAL);
+ INSERT INTO t SET a=100;
+ ALTER TABLE t AUTO_INCREMENT = 1;
+ INSERT INTO t SET a=NULL;
+ SELECT * FROM t;
+
+ By default, ALGORITHM=INPLACE would reset the
+ sequence to 1, while after ALGORITHM=COPY, the
+ last INSERT would use a value larger than 100.
+
+			We can only search the tree for the current
+			maximum counter in the table and compare. */
+ const dict_col_t* autoinc_col
+ = dict_table_get_nth_col(ctx->old_table,
+ innodb_col_no(ai));
+ dict_index_t* index
+ = dict_table_get_first_index(ctx->old_table);
+ while (index != NULL
+ && index->fields[0].col != autoinc_col) {
+ index = dict_table_get_next_index(index);
+ }
+
+ ut_ad(index);
+
+ ib_uint64_t max_in_table = index
+ ? row_search_max_autoinc(index)
+ : 0;
+
+ if (autoinc <= max_in_table) {
+ ctx->new_table->autoinc = innobase_next_autoinc(
+ max_in_table, 1,
+ ctx->prebuilt->autoinc_increment,
+ ctx->prebuilt->autoinc_offset,
+ innobase_get_int_col_max_value(ai));
+ /* Persist the maximum value as the
+ last used one. */
+ autoinc = max_in_table;
+ } else {
+ /* Persist the predecessor of the
+ AUTO_INCREMENT value as the last used one. */
+ ctx->new_table->autoinc = autoinc--;
+ }
+ }
+
+ btr_write_autoinc(dict_table_get_first_index(ctx->new_table),
+ autoinc, true);
+ } else if (ctx->need_rebuild()) {
+ /* No AUTO_INCREMENT value was specified.
+ Copy it from the old table. */
+ ctx->new_table->autoinc = ctx->old_table->autoinc;
+ /* The persistent value was already copied in
+ prepare_inplace_alter_table_dict() when ctx->new_table
+ was created. If this was a LOCK=NONE operation, the
+ AUTO_INCREMENT values would be updated during
+ row_log_table_apply(). If this was LOCK!=NONE,
+ the table contents could not possibly have changed
+ between prepare_inplace and commit_inplace. */
+ }
+
+ DBUG_RETURN(false);
+}
+
+/** Add or drop foreign key constraints to the data dictionary tables,
+but do not touch the data dictionary cache.
+@param ha_alter_info Data used during in-place alter
+@param ctx In-place ALTER TABLE context
+@param trx Data dictionary transaction
+@param table_name Table name in MySQL
+@retval true Failure
+@retval false Success
+*/
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_update_foreign_try(
+/*========================*/
+ ha_innobase_inplace_ctx*ctx,
+ trx_t* trx,
+ const char* table_name)
+{
+ ulint foreign_id;
+ ulint i;
+
+ DBUG_ENTER("innobase_update_foreign_try");
+
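+	/* Number any new constraints after the highest existing
+	foreign key ID. */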
+ foreign_id = dict_table_get_highest_foreign_id(ctx->new_table);
+
+ foreign_id++;
+
+ for (i = 0; i < ctx->num_to_add_fk; i++) {
+ dict_foreign_t* fk = ctx->add_fk[i];
+
+ ut_ad(fk->foreign_table == ctx->new_table
+ || fk->foreign_table == ctx->old_table);
+
+ dberr_t error = dict_create_add_foreign_id(
+ &foreign_id, ctx->old_table->name.m_name, fk);
+
+ if (error != DB_SUCCESS) {
+ my_error(ER_TOO_LONG_IDENT, MYF(0),
+ fk->id);
+ DBUG_RETURN(true);
+ }
+
+ if (!fk->foreign_index) {
+ fk->foreign_index = dict_foreign_find_index(
+ ctx->new_table, ctx->col_names,
+ fk->foreign_col_names,
+ fk->n_fields, fk->referenced_index, TRUE,
+ fk->type
+ & (DICT_FOREIGN_ON_DELETE_SET_NULL
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL),
+ NULL, NULL, NULL);
+ if (!fk->foreign_index) {
+ my_error(ER_FK_INCORRECT_OPTION,
+ MYF(0), table_name, fk->id);
+ DBUG_RETURN(true);
+ }
+ }
+
+ /* The fk->foreign_col_names[] uses renamed column
+ names, while the columns in ctx->old_table have not
+ been renamed yet. */
+ error = dict_create_add_foreign_to_dictionary(
+ ctx->old_table->name.m_name, fk, trx);
+
+ DBUG_EXECUTE_IF(
+ "innodb_test_cannot_add_fk_system",
+ error = DB_ERROR;);
+
+ if (error != DB_SUCCESS) {
+ my_error(ER_FK_FAIL_ADD_SYSTEM, MYF(0),
+ fk->id);
+ DBUG_RETURN(true);
+ }
+ }
+
+ for (i = 0; i < ctx->num_to_drop_fk; i++) {
+ dict_foreign_t* fk = ctx->drop_fk[i];
+
+ DBUG_ASSERT(fk->foreign_table == ctx->old_table);
+
+ if (innobase_drop_foreign_try(trx, table_name, fk->id)) {
+ DBUG_RETURN(true);
+ }
+ }
+
+ DBUG_RETURN(false);
+}
+
+/** Update the foreign key constraint definitions in the data dictionary cache
+after the changes to data dictionary tables were committed.
+@param ctx In-place ALTER TABLE context
+@param user_thd MySQL connection
+@return InnoDB error code (should always be DB_SUCCESS) */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+innobase_update_foreign_cache(
+/*==========================*/
+ ha_innobase_inplace_ctx* ctx,
+ THD* user_thd)
+{
+ dict_table_t* user_table;
+ dberr_t err = DB_SUCCESS;
+
+ DBUG_ENTER("innobase_update_foreign_cache");
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ user_table = ctx->old_table;
+
+ /* Discard the added foreign keys, because we will
+ load them from the data dictionary. */
+ for (ulint i = 0; i < ctx->num_to_add_fk; i++) {
+ dict_foreign_t* fk = ctx->add_fk[i];
+ dict_foreign_free(fk);
+ }
+
+ if (ctx->need_rebuild()) {
+ /* The rebuilt table is already using the renamed
+ column names. No need to pass col_names or to drop
+ constraints from the data dictionary cache. */
+ DBUG_ASSERT(!ctx->col_names);
+ DBUG_ASSERT(user_table->foreign_set.empty());
+ DBUG_ASSERT(user_table->referenced_set.empty());
+ user_table = ctx->new_table;
+ } else {
+ /* Drop the foreign key constraints if the
+ table was not rebuilt. If the table is rebuilt,
+	there would not be any foreign key constraints for
+ it yet in the data dictionary cache. */
+ for (ulint i = 0; i < ctx->num_to_drop_fk; i++) {
+ dict_foreign_t* fk = ctx->drop_fk[i];
+ dict_foreign_remove_from_cache(fk);
+ }
+ }
+
+ /* Load the old or added foreign keys from the data dictionary
+ and prevent the table from being evicted from the data
+ dictionary cache (work around the lack of WL#6049). */
+ dict_names_t fk_tables;
+
+ err = dict_load_foreigns(user_table->name.m_name,
+ ctx->col_names, false, true,
+ DICT_ERR_IGNORE_NONE,
+ fk_tables);
+
+ if (err == DB_CANNOT_ADD_CONSTRAINT) {
+ fk_tables.clear();
+
+		/* It is possible that existing foreign keys were
+		loaded with "foreign_key_checks" off, so retry the
+		loading with the charset check off. */
+ err = dict_load_foreigns(user_table->name.m_name,
+ ctx->col_names, false, false,
+ DICT_ERR_IGNORE_NONE,
+ fk_tables);
+
+		/* The load with "charset_check" off succeeded; warn
+		the user that the foreign keys were loaded with
+		mismatched character sets. */
+ if (err == DB_SUCCESS) {
+ push_warning_printf(
+ user_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_ALTER_INFO,
+ "Foreign key constraints for table '%s'"
+ " are loaded with charset check off",
+ user_table->name.m_name);
+ }
+ }
+
+ /* For complete loading of foreign keys, all associated tables must
+ also be loaded. */
+ while (err == DB_SUCCESS && !fk_tables.empty()) {
+ dict_table_t* table = dict_load_table(
+ fk_tables.front(), DICT_ERR_IGNORE_NONE);
+
+ if (table == NULL) {
+ err = DB_TABLE_NOT_FOUND;
+ ib::error()
+ << "Failed to load table '"
+ << table_name_t(const_cast<char*>
+ (fk_tables.front()))
+ << "' which has a foreign key constraint with"
+ << " table '" << user_table->name << "'.";
+ break;
+ }
+
+ fk_tables.pop_front();
+ }
+
+ DBUG_RETURN(err);
+}
+
+/** Changes SYS_COLUMNS.PRTYPE for one column.
+@param[in,out] trx transaction
+@param[in] table_name table name
+@param[in] tableid table ID as in SYS_TABLES
+@param[in] pos column position
+@param[in] prtype new precise type
+@retval true on failure
+@retval false on success */
+static
+bool
+vers_change_field_try(
+ trx_t* trx,
+ const char* table_name,
+ const table_id_t tableid,
+ const ulint pos,
+ const ulint prtype)
+{
+ DBUG_ENTER("vers_change_field_try");
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_int4_literal(info, "prtype", prtype);
+ pars_info_add_ull_literal(info,"tableid", tableid);
+ pars_info_add_int4_literal(info, "pos", pos);
+
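+	/* Note: despite the procedure name, the statement updates
+	PRTYPE, not MTYPE. */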
+ dberr_t error = que_eval_sql(info,
+ "PROCEDURE CHANGE_COLUMN_MTYPE () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_COLUMNS SET PRTYPE=:prtype\n"
+ "WHERE TABLE_ID=:tableid AND POS=:pos;\n"
+ "END;\n",
+ false, trx);
+
+ if (error != DB_SUCCESS) {
+ my_error_innodb(error, table_name, 0);
+ trx->error_state = DB_SUCCESS;
+ trx->op_info = "";
+ DBUG_RETURN(true);
+ }
+
+ DBUG_RETURN(false);
+}
+
+/** Changes the WITH/WITHOUT SYSTEM VERSIONING property of fields
+in SYS_COLUMNS.
+@param[in]	ha_alter_info	alter info
+@param[in]	ctx		alter inplace context
+@param[in]	trx		transaction
+@param[in]	table		old table
+@retval true on failure
+@retval false on success */
+static
+bool
+vers_change_fields_try(
+ const Alter_inplace_info* ha_alter_info,
+ const ha_innobase_inplace_ctx* ctx,
+ trx_t* trx,
+ const TABLE* table)
+{
+ DBUG_ENTER("vers_change_fields_try");
+
+ DBUG_ASSERT(ha_alter_info);
+ DBUG_ASSERT(ctx);
+
+ for (const Create_field& create_field : ha_alter_info->alter_info->create_list) {
+ if (!create_field.field) {
+ continue;
+ }
+ if (create_field.versioning
+ == Column_definition::VERSIONING_NOT_SET) {
+ continue;
+ }
+
+ const dict_table_t* new_table = ctx->new_table;
+ const uint pos = innodb_col_no(create_field.field);
+ const dict_col_t* col = dict_table_get_nth_col(new_table, pos);
+
+ DBUG_ASSERT(!col->vers_sys_start());
+ DBUG_ASSERT(!col->vers_sys_end());
+
+ ulint new_prtype
+ = create_field.versioning
+ == Column_definition::WITHOUT_VERSIONING
+ ? col->prtype & ~DATA_VERSIONED
+ : col->prtype | DATA_VERSIONED;
+
+ if (vers_change_field_try(trx, table->s->table_name.str,
+ new_table->id, pos,
+ new_prtype)) {
+ DBUG_RETURN(true);
+ }
+ }
+
+ DBUG_RETURN(false);
+}
+
+/** Changes WITH/WITHOUT SYSTEM VERSIONING for fields
+in the data dictionary cache.
+@param ha_alter_info Data used during in-place alter
+@param ctx In-place ALTER TABLE context
+@param table MySQL table as it is before the ALTER operation */
+static
+void
+vers_change_fields_cache(
+ Alter_inplace_info* ha_alter_info,
+ const ha_innobase_inplace_ctx* ctx,
+ const TABLE* table)
+{
+ DBUG_ENTER("vers_change_fields_cache");
+
+ DBUG_ASSERT(ha_alter_info);
+ DBUG_ASSERT(ctx);
+ DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED);
+
+ for (const Create_field& create_field :
+ ha_alter_info->alter_info->create_list) {
+ if (!create_field.field || create_field.field->vcol_info) {
+ continue;
+ }
+ dict_col_t* col = dict_table_get_nth_col(
+ ctx->new_table, innodb_col_no(create_field.field));
+
+ if (create_field.versioning
+ == Column_definition::WITHOUT_VERSIONING) {
+
+ DBUG_ASSERT(!col->vers_sys_start());
+ DBUG_ASSERT(!col->vers_sys_end());
+ col->prtype &= ~DATA_VERSIONED;
+ } else if (create_field.versioning
+ == Column_definition::WITH_VERSIONING) {
+
+ DBUG_ASSERT(!col->vers_sys_start());
+ DBUG_ASSERT(!col->vers_sys_end());
+ col->prtype |= DATA_VERSIONED;
+ }
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/** Commit the changes made during prepare_inplace_alter_table()
+and inplace_alter_table() inside the data dictionary tables,
+when rebuilding the table.
+@param ha_alter_info Data used during in-place alter
+@param ctx In-place ALTER TABLE context
+@param altered_table MySQL table that is being altered
+@param old_table MySQL table as it is before the ALTER operation
+@param trx Data dictionary transaction
+@param table_name Table name in MySQL
+@retval true Failure
+@retval false Success
+*/
+inline MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+commit_try_rebuild(
+/*===============*/
+ Alter_inplace_info* ha_alter_info,
+ ha_innobase_inplace_ctx*ctx,
+ TABLE* altered_table,
+ const TABLE* old_table,
+ trx_t* trx,
+ const char* table_name)
+{
+ dict_table_t* rebuilt_table = ctx->new_table;
+ dict_table_t* user_table = ctx->old_table;
+
+ DBUG_ENTER("commit_try_rebuild");
+ DBUG_ASSERT(ctx->need_rebuild());
+ DBUG_ASSERT(trx->dict_operation_lock_mode == RW_X_LATCH);
+ DBUG_ASSERT(!(ha_alter_info->handler_flags
+ & ALTER_DROP_FOREIGN_KEY)
+ || ctx->num_to_drop_fk > 0);
+ DBUG_ASSERT(ctx->num_to_drop_fk
+ <= ha_alter_info->alter_info->drop_list.elements);
+
+ for (dict_index_t* index = dict_table_get_first_index(rebuilt_table);
+ index;
+ index = dict_table_get_next_index(index)) {
+ DBUG_ASSERT(dict_index_get_online_status(index)
+ == ONLINE_INDEX_COMPLETE);
+ DBUG_ASSERT(index->is_committed());
+ if (index->is_corrupted()) {
+ my_error(ER_INDEX_CORRUPT, MYF(0), index->name());
+ DBUG_RETURN(true);
+ }
+ }
+
+ if (innobase_update_foreign_try(ctx, trx, table_name)) {
+ DBUG_RETURN(true);
+ }
+
+ dberr_t error;
+
+ /* Clear the to_be_dropped flag in the data dictionary cache
+ of user_table. */
+ for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+ dict_index_t* index = ctx->drop_index[i];
+ DBUG_ASSERT(index->table == user_table);
+ DBUG_ASSERT(index->is_committed());
+ DBUG_ASSERT(index->to_be_dropped);
+ index->to_be_dropped = 0;
+ }
+
+ if ((ha_alter_info->handler_flags
+ & ALTER_COLUMN_NAME)
+ && innobase_rename_columns_try(ha_alter_info, ctx, old_table,
+ trx, table_name)) {
+ DBUG_RETURN(true);
+ }
+
+ DBUG_EXECUTE_IF("ib_ddl_crash_before_rename", DBUG_SUICIDE(););
+
+	/* The rebuilt table must inherit the tablespace-discarded
+	state of the original table. */
+ if (!user_table->space) {
+ rebuilt_table->file_unreadable = true;
+ rebuilt_table->flags2 |= DICT_TF2_DISCARDED;
+ }
+
+ /* We can now rename the old table as a temporary table,
+ rename the new temporary table as the old table and drop the
+ old table. */
+ char* old_name= mem_heap_strdup(ctx->heap, user_table->name.m_name);
+
+ error = row_rename_table_for_mysql(user_table->name.m_name,
+ ctx->tmp_name, trx, false, false);
+ if (error == DB_SUCCESS) {
+ error = row_rename_table_for_mysql(rebuilt_table->name.m_name,
+ old_name, trx,
+ false, false);
+ }
+
+ /* We must be still holding a table handle. */
+ DBUG_ASSERT(user_table->get_ref_count() == 1);
+
+ DBUG_EXECUTE_IF("ib_ddl_crash_after_rename", DBUG_SUICIDE(););
+ DBUG_EXECUTE_IF("ib_rebuild_cannot_rename", error = DB_ERROR;);
+
+ switch (error) {
+ case DB_SUCCESS:
+ DBUG_RETURN(false);
+ case DB_TABLESPACE_EXISTS:
+ ut_a(rebuilt_table->get_ref_count() == 1);
+ my_error(ER_TABLESPACE_EXISTS, MYF(0), ctx->tmp_name);
+ DBUG_RETURN(true);
+ case DB_DUPLICATE_KEY:
+ ut_a(rebuilt_table->get_ref_count() == 1);
+ my_error(ER_TABLE_EXISTS_ERROR, MYF(0), ctx->tmp_name);
+ DBUG_RETURN(true);
+ default:
+ my_error_innodb(error, table_name, user_table->flags);
+ DBUG_RETURN(true);
+ }
+}
+
+/** Rename indexes in the data dictionary.
+@param[in]	ctx		alter info context
+@param[in]	ha_alter_info	operation used during inplace alter
+@param[out]	trx		transaction to change the index names
+				in the dictionary
+@retval true Failure
+@retval false Success */
+static
+bool
+rename_indexes_try(
+ const ha_innobase_inplace_ctx* ctx,
+ const Alter_inplace_info* ha_alter_info,
+ trx_t* trx)
+{
+ DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_RENAME_INDEX);
+
+ for (const Alter_inplace_info::Rename_key_pair& pair :
+ ha_alter_info->rename_keys) {
+ dict_index_t* index = dict_table_get_index_on_name(
+ ctx->old_table, pair.old_key->name.str);
+ // This was checked previously in
+ // ha_innobase::prepare_inplace_alter_table()
+ ut_ad(index);
+
+ if (rename_index_try(index, pair.new_key->name.str, trx)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/** Set of column numbers */
+typedef std::set<ulint, std::less<ulint>, ut_allocator<ulint> > col_set;
+
+/** Collect (not instantly dropped) columns from dropped indexes
+@param[in]	ctx		In-place ALTER TABLE context
+@param[in, out]	drop_col_list	list which will be set, containing columns
+				that are part of indexes being dropped
+@param[in, out]	drop_v_col_list	list which will be set, containing
+				virtual columns that are part of indexes
+				being dropped */
+static
+void
+collect_columns_from_dropped_indexes(
+ const ha_innobase_inplace_ctx* ctx,
+ col_set& drop_col_list,
+ col_set& drop_v_col_list)
+{
+ for (ulint index_count = 0; index_count < ctx->num_to_drop_index;
+ index_count++) {
+ const dict_index_t* index = ctx->drop_index[index_count];
+
+ for (ulint col = 0; col < index->n_user_defined_cols; col++) {
+ const dict_col_t* idx_col
+ = dict_index_get_nth_col(index, col);
+
+ if (idx_col->is_virtual()) {
+ const dict_v_col_t* v_col
+ = reinterpret_cast<
+ const dict_v_col_t*>(idx_col);
+ drop_v_col_list.insert(v_col->v_pos);
+
+ } else {
+ ulint col_no = dict_col_get_no(idx_col);
+ if (ctx->col_map
+ && ctx->col_map[col_no]
+ == ULINT_UNDEFINED) {
+ // this column was instantly dropped
+ continue;
+ }
+ drop_col_list.insert(col_no);
+ }
+ }
+ }
+}
+
+/** Change PAGE_COMPRESSED to ON or change the PAGE_COMPRESSION_LEVEL.
+@param[in] level PAGE_COMPRESSION_LEVEL
+@param[in] table table before the change
+@param[in,out] trx data dictionary transaction
+@param[in] table_name table name in MariaDB
+@return whether the operation succeeded */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+static
+bool
+innobase_page_compression_try(
+ uint level,
+ const dict_table_t* table,
+ trx_t* trx,
+ const char* table_name)
+{
+ DBUG_ENTER("innobase_page_compression_try");
+ DBUG_ASSERT(level >= 1);
+ DBUG_ASSERT(level <= 9);
+
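+	/* Clear any old PAGE_COMPRESSION_LEVEL bits in the table
+	flags, then set the PAGE_COMPRESSED flag together with the
+	new level. */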
+ unsigned flags = table->flags
+ & ~(0xFU << DICT_TF_POS_PAGE_COMPRESSION_LEVEL);
+ flags |= 1U << DICT_TF_POS_PAGE_COMPRESSION
+ | level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL;
+
+ if (table->flags == flags) {
+ DBUG_RETURN(false);
+ }
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_ull_literal(info, "id", table->id);
+ pars_info_add_int4_literal(info, "type",
+ dict_tf_to_sys_tables_type(flags));
+
+ dberr_t error = que_eval_sql(info,
+ "PROCEDURE CHANGE_COMPRESSION () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES SET TYPE=:type\n"
+ "WHERE ID=:id;\n"
+ "END;\n",
+ false, trx);
+
+ if (error != DB_SUCCESS) {
+ my_error_innodb(error, table_name, 0);
+ trx->error_state = DB_SUCCESS;
+ trx->op_info = "";
+ DBUG_RETURN(true);
+ }
+
+ DBUG_RETURN(false);
+}
+
+static
+void
+dict_stats_try_drop_table(THD *thd, const table_name_t &name,
+ const LEX_CSTRING &table_name)
+{
+ char errstr[1024];
+ if (dict_stats_drop_table(name.m_name, errstr, sizeof(errstr)) != DB_SUCCESS)
+ {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_ALTER_INFO,
+ "Deleting persistent statistics"
+ " for table '%s' in InnoDB failed: %s",
+ table_name.str,
+ errstr);
+ }
+}
+
+/** Evict the table from cache and reopen it. Drop outdated statistics.
+@param thd MariaDB connection handle
+@param table InnoDB table
+@param table_name user-friendly table name for errors
+@param ctx ALTER TABLE context
+@return newly opened table */
+static dict_table_t *innobase_reload_table(THD *thd, dict_table_t *table,
+ const LEX_CSTRING &table_name,
+ ha_innobase_inplace_ctx &ctx)
+{
+ char *tb_name= strdup(table->name.m_name);
+ dict_table_close(table, true, false);
+
+ if (ctx.is_instant())
+ {
+    for (auto i = ctx.old_n_v_cols; i--; )
+      ctx.old_v_cols[i].~dict_v_col_t();
+    const_cast<unsigned&>(ctx.old_n_v_cols) = 0;
+ }
+
+ dict_sys.remove(table);
+ table= dict_table_open_on_name(tb_name, TRUE, TRUE,
+ DICT_ERR_IGNORE_FK_NOKEY);
+
+ /* Drop outdated table stats. */
+ dict_stats_try_drop_table(thd, table->name, table_name);
+ free(tb_name);
+ return table;
+}
+
+/** Commit the changes made during prepare_inplace_alter_table()
+and inplace_alter_table() inside the data dictionary tables,
+when not rebuilding the table.
+@param ha_alter_info Data used during in-place alter
+@param ctx In-place ALTER TABLE context
+@param altered_table MySQL table that is being altered
+@param old_table MySQL table as it is before the ALTER operation
+@param trx Data dictionary transaction
+@param table_name Table name in MySQL
+@retval true Failure
+@retval false Success
+*/
+inline MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+commit_try_norebuild(
+/*=================*/
+ Alter_inplace_info* ha_alter_info,
+ ha_innobase_inplace_ctx*ctx,
+ TABLE* altered_table,
+ const TABLE* old_table,
+ trx_t* trx,
+ const char* table_name)
+{
+ DBUG_ENTER("commit_try_norebuild");
+ DBUG_ASSERT(!ctx->need_rebuild());
+ DBUG_ASSERT(trx->dict_operation_lock_mode == RW_X_LATCH);
+ DBUG_ASSERT(!(ha_alter_info->handler_flags
+ & ALTER_DROP_FOREIGN_KEY)
+ || ctx->num_to_drop_fk > 0);
+ DBUG_ASSERT(ctx->num_to_drop_fk
+ <= ha_alter_info->alter_info->drop_list.elements
+ || ctx->num_to_drop_vcol
+ == ha_alter_info->alter_info->drop_list.elements);
+
+ if (ctx->page_compression_level
+ && innobase_page_compression_try(ctx->page_compression_level,
+ ctx->new_table, trx,
+ table_name)) {
+ DBUG_RETURN(true);
+ }
+
+ for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+ dict_index_t* index = ctx->add_index[i];
+ DBUG_ASSERT(dict_index_get_online_status(index)
+ == ONLINE_INDEX_COMPLETE);
+ DBUG_ASSERT(!index->is_committed());
+ if (index->is_corrupted()) {
+ /* Report a duplicate key
+ error for the index that was
+ flagged corrupted, most likely
+ because a duplicate value was
+ inserted (directly or by
+ rollback) after
+ ha_innobase::inplace_alter_table()
+ completed.
+ TODO: report this as a corruption
+ with a detailed reason once
+ WL#6379 has been implemented. */
+ my_error(ER_DUP_UNKNOWN_IN_INDEX,
+ MYF(0), index->name());
+ DBUG_RETURN(true);
+ }
+ }
+
+ if (innobase_update_foreign_try(ctx, trx, table_name)) {
+ DBUG_RETURN(true);
+ }
+
+ if ((ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED)
+ && vers_change_fields_try(ha_alter_info, ctx, trx, old_table)) {
+ DBUG_RETURN(true);
+ }
+
+ dberr_t error;
+
+ /* We altered the table in place. Mark the indexes as committed. */
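+	/* The indexes were created with names starting with
+	TEMP_INDEX_PREFIX_STR, which crash recovery would drop;
+	renaming them in SYS_INDEXES makes them permanent. */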
+ for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+ dict_index_t* index = ctx->add_index[i];
+ DBUG_ASSERT(dict_index_get_online_status(index)
+ == ONLINE_INDEX_COMPLETE);
+ DBUG_ASSERT(!index->is_committed());
+ error = row_merge_rename_index_to_add(
+ trx, ctx->new_table->id, index->id);
+ switch (error) {
+ case DB_SUCCESS:
+ break;
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+			/* If we wrote some undo log here, then the
+			persistent data dictionary for this table would
+			probably be corrupted. This is because a
+			'trigger' on SYS_INDEXES could already have invoked
+			btr_free_if_exists(), which cannot be rolled back. */
+ DBUG_ASSERT(trx->undo_no == 0);
+ my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0));
+ DBUG_RETURN(true);
+ default:
+ sql_print_error(
+ "InnoDB: rename index to add: %lu\n",
+ (ulong) error);
+ DBUG_ASSERT(0);
+ my_error(ER_INTERNAL_ERROR, MYF(0),
+ "rename index to add");
+ DBUG_RETURN(true);
+ }
+ }
+
+ /* Drop any indexes that were requested to be dropped.
+ Flag them in the data dictionary first. */
+
+ for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+ dict_index_t* index = ctx->drop_index[i];
+ DBUG_ASSERT(index->is_committed());
+ DBUG_ASSERT(index->table == ctx->new_table);
+ DBUG_ASSERT(index->to_be_dropped);
+
+ error = row_merge_rename_index_to_drop(
+ trx, index->table->id, index->id);
+ if (error != DB_SUCCESS) {
+ sql_print_error(
+ "InnoDB: rename index to drop: %lu\n",
+ (ulong) error);
+ DBUG_ASSERT(0);
+ my_error(ER_INTERNAL_ERROR, MYF(0),
+ "rename index to drop");
+ DBUG_RETURN(true);
+ }
+ }
+
+ if (innobase_rename_or_enlarge_columns_try(ha_alter_info, ctx,
+ altered_table, old_table,
+ trx, table_name)) {
+ DBUG_RETURN(true);
+ }
+
+ if ((ha_alter_info->handler_flags & ALTER_RENAME_INDEX)
+ && rename_indexes_try(ctx, ha_alter_info, trx)) {
+ DBUG_RETURN(true);
+ }
+
+ if (ctx->is_instant()) {
+ DBUG_RETURN(innobase_instant_try(ha_alter_info, ctx,
+ altered_table, old_table,
+ trx));
+ }
+
+ if (ha_alter_info->handler_flags
+ & (ALTER_DROP_VIRTUAL_COLUMN | ALTER_ADD_VIRTUAL_COLUMN)) {
+ if ((ha_alter_info->handler_flags & ALTER_DROP_VIRTUAL_COLUMN)
+ && innobase_drop_virtual_try(ha_alter_info, ctx->old_table,
+ trx)) {
+ DBUG_RETURN(true);
+ }
+
+ if ((ha_alter_info->handler_flags & ALTER_ADD_VIRTUAL_COLUMN)
+ && innobase_add_virtual_try(ha_alter_info, ctx->old_table,
+ trx)) {
+ DBUG_RETURN(true);
+ }
+
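+		/* SYS_TABLES.N_COLS encodes both the stored and the
+		virtual column counts; its most significant bit is set
+		for the COMPACT (non-REDUNDANT) row format. */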
+ unsigned n_col = ctx->old_table->n_cols
+ - DATA_N_SYS_COLS;
+ unsigned n_v_col = ctx->old_table->n_v_cols
+ + ctx->num_to_add_vcol - ctx->num_to_drop_vcol;
+
+ if (innodb_update_cols(
+ ctx->old_table,
+ dict_table_encode_n_col(n_col, n_v_col)
+ | unsigned(ctx->old_table->flags & DICT_TF_COMPACT)
+ << 31, trx)) {
+ DBUG_RETURN(true);
+ }
+ }
+
+ DBUG_RETURN(false);
+}
+
+/** Commit the changes to the data dictionary cache
+after a successful commit_try_norebuild() call.
+@param ha_alter_info algorithm=inplace context
+@param ctx In-place ALTER TABLE context for the current partition
+@param altered_table the TABLE after the ALTER
+@param table the TABLE before the ALTER
+@param trx Data dictionary transaction
+(will be started and committed, for DROP INDEX)
+@return whether all replacements were found for dropped indexes */
+inline MY_ATTRIBUTE((nonnull))
+bool
+commit_cache_norebuild(
+/*===================*/
+ Alter_inplace_info* ha_alter_info,
+ ha_innobase_inplace_ctx*ctx,
+ const TABLE* altered_table,
+ const TABLE* table,
+ trx_t* trx)
+{
+ DBUG_ENTER("commit_cache_norebuild");
+ DBUG_ASSERT(!ctx->need_rebuild());
+ DBUG_ASSERT(ctx->new_table->space != fil_system.temp_space);
+ DBUG_ASSERT(!ctx->new_table->is_temporary());
+
+ bool found = true;
+
+ if (ctx->page_compression_level) {
+ DBUG_ASSERT(ctx->new_table->space != fil_system.sys_space);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+ ctx->new_table->flags
+ = static_cast<uint16_t>(
+ (ctx->new_table->flags
+ & ~(0xFU
+ << DICT_TF_POS_PAGE_COMPRESSION_LEVEL))
+ | 1 << DICT_TF_POS_PAGE_COMPRESSION
+ | (ctx->page_compression_level & 0xF)
+ << DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+ & ((1U << DICT_TF_BITS) - 1);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+
+ if (fil_space_t* space = ctx->new_table->space) {
+ bool update = !(space->flags
+ & FSP_FLAGS_MASK_PAGE_COMPRESSION);
+ mutex_enter(&fil_system.mutex);
+ space->flags &= ~FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL;
+ space->flags |= ctx->page_compression_level
+ << FSP_FLAGS_MEM_COMPRESSION_LEVEL;
+ if (!space->full_crc32()) {
+ space->flags
+ |= FSP_FLAGS_MASK_PAGE_COMPRESSION;
+ } else if (!space->is_compressed()) {
+ space->flags
+ |= innodb_compression_algorithm
+ << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
+ }
+ mutex_exit(&fil_system.mutex);
+
+ if (update) {
+ /* Maybe we should introduce an undo
+ log record for updating tablespace
+ flags, and perform the update already
+ in innobase_page_compression_try().
+
+ If the server is killed before the
+ following mini-transaction commit
+ becomes durable, fsp_flags_try_adjust()
+ will perform the equivalent adjustment
+ and warn "adjusting FSP_SPACE_FLAGS". */
+ mtr_t mtr;
+ mtr.start();
+ if (buf_block_t* b = buf_page_get(
+ page_id_t(space->id, 0),
+ space->zip_size(),
+ RW_X_LATCH, &mtr)) {
+ byte* f = FSP_HEADER_OFFSET
+ + FSP_SPACE_FLAGS + b->frame;
+ const auto sf = space->flags
+ & ~FSP_FLAGS_MEM_MASK;
+ if (mach_read_from_4(f) != sf) {
+ mtr.set_named_space(space);
+ mtr.write<4,mtr_t::FORCED>(
+ *b, f, sf);
+ }
+ }
+ mtr.commit();
+ }
+ }
+ }
+
+ col_set drop_list;
+ col_set v_drop_list;
+
+	/* For each column that was part of an index being dropped,
+	check whether it is still part of some index that is not
+	being dropped. If it is not, clear the column's ord_part. */
+ collect_columns_from_dropped_indexes(ctx, drop_list, v_drop_list);
+
+ for (ulint col : drop_list) {
+ if (!check_col_exists_in_indexes(ctx->new_table, col, false)) {
+ ctx->new_table->cols[col].ord_part = 0;
+ }
+ }
+
+ for (ulint col : v_drop_list) {
+ if (!check_col_exists_in_indexes(ctx->new_table, col, true)) {
+ ctx->new_table->v_cols[col].m_col.ord_part = 0;
+ }
+ }
+
+ for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+ dict_index_t* index = ctx->add_index[i];
+ DBUG_ASSERT(dict_index_get_online_status(index)
+ == ONLINE_INDEX_COMPLETE);
+ DBUG_ASSERT(!index->is_committed());
+ index->set_committed(true);
+ }
+
+ if (ctx->num_to_drop_index) {
+ /* Really drop the indexes that were dropped.
+ The transaction had to be committed first
+ (after renaming the indexes), so that in the
+ event of a crash, crash recovery will drop the
+ indexes, because it drops all indexes whose
+ names start with TEMP_INDEX_PREFIX_STR. Once we
+ have started dropping an index tree, there is
+ no way to roll it back. */
+
+ for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+ dict_index_t* index = ctx->drop_index[i];
+ DBUG_ASSERT(index->is_committed());
+ DBUG_ASSERT(index->table == ctx->new_table);
+ DBUG_ASSERT(index->to_be_dropped);
+
+ /* Replace the indexes in foreign key
+ constraints if needed. */
+
+ if (!dict_foreign_replace_index(
+ index->table, ctx->col_names, index)) {
+ found = false;
+ }
+
+ /* Mark the index dropped
+ in the data dictionary cache. */
+ rw_lock_x_lock(dict_index_get_lock(index));
+ index->page = FIL_NULL;
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ }
+
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+ row_merge_drop_indexes_dict(trx, ctx->new_table->id);
+
+ for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+ dict_index_t* index = ctx->drop_index[i];
+ DBUG_ASSERT(index->is_committed());
+ DBUG_ASSERT(index->table == ctx->new_table);
+
+ if (index->type & DICT_FTS) {
+ DBUG_ASSERT(index->type == DICT_FTS
+ || (index->type
+ & DICT_CORRUPT));
+ DBUG_ASSERT(index->table->fts);
+ DEBUG_SYNC_C("norebuild_fts_drop");
+ fts_drop_index(index->table, index, trx);
+ }
+
+ dict_index_remove_from_cache(index->table, index);
+ }
+
+ fts_clear_all(ctx->old_table, trx);
+ trx_commit_for_mysql(trx);
+ }
+
+ if (!ctx->is_instant()) {
+ innobase_rename_or_enlarge_columns_cache(
+ ha_alter_info, altered_table, table, ctx->new_table);
+ } else {
+ ut_ad(ctx->col_map);
+
+ if (fts_t* fts = ctx->new_table->fts) {
+ ut_ad(fts->doc_col != ULINT_UNDEFINED);
+ ut_ad(ctx->new_table->n_cols > DATA_N_SYS_COLS);
+ const ulint c = ctx->col_map[fts->doc_col];
+ ut_ad(c < ulint(ctx->new_table->n_cols)
+ - DATA_N_SYS_COLS);
+ ut_d(const dict_col_t& col = ctx->new_table->cols[c]);
+ ut_ad(!col.is_nullable());
+ ut_ad(!col.is_virtual());
+ ut_ad(!col.is_added());
+ ut_ad(col.prtype & DATA_UNSIGNED);
+ ut_ad(col.mtype == DATA_INT);
+ ut_ad(col.len == 8);
+ ut_ad(col.ord_part);
+ fts->doc_col = c;
+ }
+
+ if (ha_alter_info->handler_flags & ALTER_DROP_STORED_COLUMN) {
+ const dict_index_t* index = ctx->new_table->indexes.start;
+
+ for (const dict_field_t* f = index->fields,
+ * const end = f + index->n_fields;
+ f != end; f++) {
+ dict_col_t& c = *f->col;
+ if (c.is_dropped()) {
+ c.set_dropped(!c.is_nullable(),
+ DATA_LARGE_MTYPE(c.mtype)
+ || (!f->fixed_len
+ && c.len > 255),
+ f->fixed_len);
+ }
+ }
+ }
+
+ if (!ctx->instant_table->persistent_autoinc) {
+ ctx->new_table->persistent_autoinc = 0;
+ }
+ }
+
+ if (ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED) {
+ vers_change_fields_cache(ha_alter_info, ctx, table);
+ }
+
+ if (ha_alter_info->handler_flags & ALTER_RENAME_INDEX) {
+ innobase_rename_indexes_cache(ctx, ha_alter_info);
+ }
+
+ ctx->new_table->fts_doc_id_index
+ = ctx->new_table->fts
+ ? dict_table_get_index_on_name(
+ ctx->new_table, FTS_DOC_ID_INDEX_NAME)
+ : NULL;
+ DBUG_ASSERT((ctx->new_table->fts == NULL)
+ == (ctx->new_table->fts_doc_id_index == NULL));
+ DBUG_RETURN(found);
+}
+
+/** Adjust the persistent statistics after non-rebuilding ALTER TABLE.
+Remove statistics for dropped indexes, add statistics for created indexes
+and rename statistics for renamed indexes.
+@param ha_alter_info Data used during in-place alter
+@param ctx In-place ALTER TABLE context
+@param thd MySQL connection
+*/
+static
+void
+alter_stats_norebuild(
+/*==================*/
+ Alter_inplace_info* ha_alter_info,
+ ha_innobase_inplace_ctx* ctx,
+ THD* thd)
+{
+ ulint i;
+
+ DBUG_ENTER("alter_stats_norebuild");
+ DBUG_ASSERT(!ctx->need_rebuild());
+
+ if (!dict_stats_is_persistent_enabled(ctx->new_table)) {
+ DBUG_VOID_RETURN;
+ }
+
+ /* Delete corresponding rows from the stats table. We do this
+ in a separate transaction from trx, because lock waits are not
+ allowed in a data dictionary transaction. (Lock waits are possible
+ on the statistics table, because it is directly accessible by users,
+ not covered by the dict_sys.latch.)
+
+ Because the data dictionary changes were already committed, orphaned
+ rows may be left in the statistics table if the system crashes.
+
+ FIXME: each change to the statistics tables is being committed in a
+ separate transaction, meaning that the operation is not atomic
+
+	FIXME: This will not drop the (unused) statistics for
+	FTS_DOC_ID_INDEX if it was a hidden index, dropped together
+	with the last remaining FULLTEXT index. */
+ for (i = 0; i < ha_alter_info->index_drop_count; i++) {
+ const KEY* key = ha_alter_info->index_drop_buffer[i];
+
+ if (key->flags & HA_FULLTEXT) {
+ /* There are no index cardinality
+ statistics for FULLTEXT indexes. */
+ continue;
+ }
+
+ char errstr[1024];
+
+ if (dict_stats_drop_index(
+ ctx->new_table->name.m_name, key->name.str,
+ errstr, sizeof errstr) != DB_SUCCESS) {
+ push_warning(thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_LOCK_WAIT_TIMEOUT, errstr);
+ }
+ }
+
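+	/* Rename the statistics rows in two passes via unique
+	temporary names, so that swapped or chained index renames
+	(e.g. a->b together with b->a) cannot collide. */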
+ for (size_t i = 0; i < ha_alter_info->rename_keys.size(); i++) {
+ const Alter_inplace_info::Rename_key_pair& pair
+ = ha_alter_info->rename_keys[i];
+
+ std::stringstream ss;
+ ss << TEMP_FILE_PREFIX_INNODB << std::this_thread::get_id()
+ << i;
+ auto tmp_name = ss.str();
+
+ dberr_t err = dict_stats_rename_index(ctx->new_table,
+ pair.old_key->name.str,
+ tmp_name.c_str());
+
+ if (err != DB_SUCCESS) {
+ push_warning_printf(
+ thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_ERROR_ON_RENAME,
+ "Error renaming an index of table '%s'"
+ " from '%s' to '%s' in InnoDB persistent"
+ " statistics storage: %s",
+ ctx->new_table->name.m_name,
+ pair.old_key->name.str,
+ tmp_name.c_str(),
+ ut_strerr(err));
+ }
+ }
+
+ for (size_t i = 0; i < ha_alter_info->rename_keys.size(); i++) {
+ const Alter_inplace_info::Rename_key_pair& pair
+ = ha_alter_info->rename_keys[i];
+
+ std::stringstream ss;
+ ss << TEMP_FILE_PREFIX_INNODB << std::this_thread::get_id()
+ << i;
+ auto tmp_name = ss.str();
+
+ dberr_t err = dict_stats_rename_index(ctx->new_table,
+ tmp_name.c_str(),
+ pair.new_key->name.str);
+
+ if (err != DB_SUCCESS) {
+ push_warning_printf(
+ thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_ERROR_ON_RENAME,
+ "Error renaming an index of table '%s'"
+ " from '%s' to '%s' in InnoDB persistent"
+ " statistics storage: %s",
+ ctx->new_table->name.m_name,
+ tmp_name.c_str(),
+ pair.new_key->name.str,
+ ut_strerr(err));
+ }
+ }
+
+ for (i = 0; i < ctx->num_to_add_index; i++) {
+ dict_index_t* index = ctx->add_index[i];
+ DBUG_ASSERT(index->table == ctx->new_table);
+
+ if (!(index->type & DICT_FTS)) {
+ dict_stats_init(ctx->new_table);
+ dict_stats_update_for_index(index);
+ }
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/** Adjust the persistent statistics after rebuilding ALTER TABLE.
+Remove statistics for dropped indexes, add statistics for created indexes
+and rename statistics for renamed indexes.
+@param table InnoDB table that was rebuilt by ALTER TABLE
+@param table_name Table name in MySQL
+@param thd MySQL connection
+*/
+static
+void
+alter_stats_rebuild(
+/*================*/
+ dict_table_t* table,
+ const char* table_name,
+ THD* thd)
+{
+ DBUG_ENTER("alter_stats_rebuild");
+
+ if (!table->space
+ || !dict_stats_is_persistent_enabled(table)) {
+ DBUG_VOID_RETURN;
+ }
+
+ dberr_t ret = dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT);
+
+ if (ret != DB_SUCCESS) {
+ push_warning_printf(
+ thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_ALTER_INFO,
+ "Error updating stats for table '%s'"
+ " after table rebuild: %s",
+ table_name, ut_strerr(ret));
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+#ifndef DBUG_OFF
+# define DBUG_INJECT_CRASH(prefix, count) \
+do { \
+ char buf[32]; \
+ snprintf(buf, sizeof buf, prefix "_%u", count); \
+ DBUG_EXECUTE_IF(buf, DBUG_SUICIDE();); \
+} while (0)
+#else
+# define DBUG_INJECT_CRASH(prefix, count)
+#endif
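+
+/* In debug builds, a test can crash the server at the Nth injection
+point of a given prefix by enabling the keyword "<prefix>_<N>",
+e.g. SET debug_dbug='+d,ib_commit_inplace_crash_1';
+(the counter value to use depends on the execution path). */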
+
+/** Apply the log for the table rebuild operation.
+@param[in] ctx In-place ALTER TABLE context
+@param[in] ha_alter_info ALTER TABLE information
+@param[in] altered_table MySQL table that is being altered
+@return true on failure, else false. */
+static bool alter_rebuild_apply_log(
+ ha_innobase_inplace_ctx* ctx,
+ Alter_inplace_info* ha_alter_info,
+ TABLE* altered_table)
+{
+ DBUG_ENTER("alter_rebuild_apply_log");
+
+ if (!ctx->online) {
+ DBUG_RETURN(false);
+ }
+
+ /* We copied the table. Any indexes that were requested to be
+ dropped were not created in the copy of the table. Apply any
+ last bit of the rebuild log and then rename the tables. */
+ dict_table_t* user_table = ctx->old_table;
+ dict_table_t* rebuilt_table = ctx->new_table;
+
+ DEBUG_SYNC_C("row_log_table_apply2_before");
+
+ dict_vcol_templ_t* s_templ = NULL;
+
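+	/* If the table has virtual columns, build a template so that
+	row_log_table_apply() can evaluate the virtual column values
+	through the server-side TABLE object. */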
+ if (ctx->new_table->n_v_cols > 0) {
+ s_templ = UT_NEW_NOKEY(
+ dict_vcol_templ_t());
+ s_templ->vtempl = NULL;
+
+ innobase_build_v_templ(altered_table, ctx->new_table, s_templ,
+ NULL, true);
+ ctx->new_table->vc_templ = s_templ;
+ }
+
+ dberr_t error = row_log_table_apply(
+ ctx->thr, user_table, altered_table,
+ static_cast<ha_innobase_inplace_ctx*>(
+ ha_alter_info->handler_ctx)->m_stage,
+ ctx->new_table);
+
+ if (s_templ) {
+ ut_ad(ctx->need_rebuild());
+ dict_free_vc_templ(s_templ);
+ UT_DELETE(s_templ);
+ ctx->new_table->vc_templ = NULL;
+ }
+
+ ulint err_key = thr_get_trx(ctx->thr)->error_key_num;
+
+ switch (error) {
+ KEY* dup_key;
+ case DB_SUCCESS:
+ break;
+ case DB_DUPLICATE_KEY:
+ if (err_key == ULINT_UNDEFINED) {
+ /* This should be the hidden index on
+ FTS_DOC_ID. */
+ dup_key = NULL;
+ } else {
+ DBUG_ASSERT(err_key < ha_alter_info->key_count);
+ dup_key = &ha_alter_info->key_info_buffer[err_key];
+ }
+
+ print_keydup_error(altered_table, dup_key, MYF(0));
+ DBUG_RETURN(true);
+ case DB_ONLINE_LOG_TOO_BIG:
+ my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0),
+ get_error_key_name(err_key, ha_alter_info,
+ rebuilt_table));
+ DBUG_RETURN(true);
+ case DB_INDEX_CORRUPT:
+ my_error(ER_INDEX_CORRUPT, MYF(0),
+ get_error_key_name(err_key, ha_alter_info,
+ rebuilt_table));
+ DBUG_RETURN(true);
+ default:
+ my_error_innodb(error, ctx->old_table->name.m_name,
+ user_table->flags);
+ DBUG_RETURN(true);
+ }
+
+ DBUG_RETURN(false);
+}
+
+/** Commit or rollback the changes made during
+prepare_inplace_alter_table() and inplace_alter_table() inside
+the storage engine. Note that the allowed level of concurrency
+during this operation will be the same as for
+inplace_alter_table() and thus might be higher than during
+prepare_inplace_alter_table(). (E.g. concurrent writes were
+blocked during prepare, but might not be during commit).
+@param altered_table TABLE object for new version of table.
+@param ha_alter_info Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+@param commit true => Commit, false => Rollback.
+@retval true Failure
+@retval false Success
+*/
+
+bool
+ha_innobase::commit_inplace_alter_table(
+/*====================================*/
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info,
+ bool commit)
+{
+ ha_innobase_inplace_ctx*ctx0;
+
+ ctx0 = static_cast<ha_innobase_inplace_ctx*>
+ (ha_alter_info->handler_ctx);
+
+#ifndef DBUG_OFF
+ uint crash_inject_count = 1;
+ uint crash_fail_inject_count = 1;
+ uint failure_inject_count = 1;
+#endif /* DBUG_OFF */
+
+ DBUG_ENTER("commit_inplace_alter_table");
+ DBUG_ASSERT(!srv_read_only_mode);
+ DBUG_ASSERT(!ctx0 || ctx0->prebuilt == m_prebuilt);
+ DBUG_ASSERT(!ctx0 || ctx0->old_table == m_prebuilt->table);
+
+ DEBUG_SYNC_C("innodb_commit_inplace_alter_table_enter");
+
+ DEBUG_SYNC_C("innodb_commit_inplace_alter_table_wait");
+
+ if (ctx0 != NULL && ctx0->m_stage != NULL) {
+ ctx0->m_stage->begin_phase_end();
+ }
+
+ if (!commit) {
+ /* A rollback is being requested. So far we may at
+ most have created some indexes. If any indexes were to
+ be dropped, they would actually be dropped in this
+ method if commit=true. */
+ const bool ret = rollback_inplace_alter_table(
+ ha_alter_info, table, m_prebuilt);
+ DBUG_RETURN(ret);
+ }
+
+ if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) {
+ DBUG_ASSERT(!ctx0);
+ MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
+ ha_alter_info->group_commit_ctx = NULL;
+ DBUG_RETURN(false);
+ }
+
+ DBUG_ASSERT(ctx0);
+
+ inplace_alter_handler_ctx** ctx_array;
+ inplace_alter_handler_ctx* ctx_single[2];
+
+ if (ha_alter_info->group_commit_ctx) {
+ ctx_array = ha_alter_info->group_commit_ctx;
+ } else {
+ ctx_single[0] = ctx0;
+ ctx_single[1] = NULL;
+ ctx_array = ctx_single;
+ }
+
+ DBUG_ASSERT(ctx0 == ctx_array[0]);
+ ut_ad(m_prebuilt->table == ctx0->old_table);
+ ha_alter_info->group_commit_ctx = NULL;
+
+ trx_start_if_not_started_xa(m_prebuilt->trx, true);
+
+ for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+ DBUG_ASSERT(ctx->prebuilt->trx == m_prebuilt->trx);
+
+ /* If decryption failed for old table or new table
+ fail here. */
+ if ((!ctx->old_table->is_readable()
+ && ctx->old_table->space)
+ || (!ctx->new_table->is_readable()
+ && ctx->new_table->space)) {
+ String str;
+ const char* engine= table_type();
+ get_error_message(HA_ERR_DECRYPTION_FAILED, &str);
+ my_error(ER_GET_ERRMSG, MYF(0), HA_ERR_DECRYPTION_FAILED, str.c_ptr(), engine);
+ DBUG_RETURN(true);
+ }
+
+ /* Exclusively lock the table, to ensure that no other
+ transaction is holding locks on the table while we
+ change the table definition. The MySQL meta-data lock
+ should normally guarantee that no conflicting locks
+ exist. However, FOREIGN KEY constraints checks and any
+ transactions collected during crash recovery could be
+ holding InnoDB locks only, not MySQL locks. */
+
+ dberr_t error = row_merge_lock_table(
+ m_prebuilt->trx, ctx->old_table, LOCK_X);
+
+ if (error != DB_SUCCESS) {
+ my_error_innodb(
+ error, table_share->table_name.str, 0);
+ DBUG_RETURN(true);
+ }
+ }
+
+ DEBUG_SYNC(m_user_thd, "innodb_alter_commit_after_lock_table");
+
+ const bool new_clustered = ctx0->need_rebuild();
+ trx_t* trx = ctx0->trx;
+ bool fail = false;
+
+ /* Stop background FTS operations. */
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+ DBUG_ASSERT(new_clustered == ctx->need_rebuild());
+
+ if (new_clustered) {
+ if (ctx->old_table->fts) {
+ ut_ad(!ctx->old_table->fts->add_wq);
+ fts_optimize_remove_table(ctx->old_table);
+ }
+ }
+
+ if (ctx->new_table->fts) {
+ ut_ad(!ctx->new_table->fts->add_wq);
+ fts_optimize_remove_table(ctx->new_table);
+ fts_sync_during_ddl(ctx->new_table);
+ }
+
+		/* Apply the online log of the table before acquiring
+		data dictionary latches. At this point the ALTER thread
+		already holds MDL_EXCLUSIVE on the table, so no further
+		DDL or DML can be started on it. By applying the log here,
+		InnoDB ensures that concurrent DDL, the purge thread and
+		other background threads do not have to wait long for
+		the dict_operation_lock. */
+ if (new_clustered && commit
+ && alter_rebuild_apply_log(
+ ctx, ha_alter_info, altered_table)) {
+ DBUG_RETURN(true);
+ }
+ }
+
+ if (!trx) {
+ DBUG_ASSERT(!new_clustered);
+ trx = innobase_trx_allocate(m_user_thd);
+ }
+
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+ /* Latch the InnoDB data dictionary exclusively so that no deadlocks
+ or lock waits can happen in it during the data dictionary operation. */
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Prevent the background statistics collection from accessing
+ the tables. */
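+	/* If stopping is refused because an operation is in progress,
+	DICT_BG_YIELD below releases the data dictionary latch and
+	sleeps briefly before this loop retries. */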
+ for (;;) {
+ bool retry = false;
+
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+ DBUG_ASSERT(new_clustered == ctx->need_rebuild());
+
+ if (new_clustered
+ && !dict_stats_stop_bg(ctx->old_table)) {
+ retry = true;
+ }
+
+ if (!dict_stats_stop_bg(ctx->new_table)) {
+ retry = true;
+ }
+ }
+
+ if (!retry) {
+ break;
+ }
+
+ DICT_BG_YIELD(trx);
+ }
+
+ /* Apply the changes to the data dictionary tables, for all
+ partitions. */
+
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx && !fail; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+ DBUG_ASSERT(new_clustered == ctx->need_rebuild());
+ if (ctx->need_rebuild() && !ctx->old_table->space) {
+ my_error(ER_TABLESPACE_DISCARDED, MYF(0),
+ table->s->table_name.str);
+ fail = true;
+ } else {
+ fail = commit_set_autoinc(ha_alter_info, ctx,
+ altered_table, table);
+ }
+
+ if (fail) {
+ } else if (ctx->need_rebuild()) {
+ ctx->tmp_name = dict_mem_create_temporary_tablename(
+ ctx->heap, ctx->new_table->name.m_name,
+ ctx->new_table->id);
+
+ fail = commit_try_rebuild(
+ ha_alter_info, ctx, altered_table, table,
+ trx, table_share->table_name.str);
+ } else {
+ fail = commit_try_norebuild(
+ ha_alter_info, ctx, altered_table, table, trx,
+ table_share->table_name.str);
+ }
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+ crash_inject_count++);
+#ifndef DBUG_OFF
+ {
+ /* Generate a dynamic dbug text. */
+ char buf[32];
+
+ snprintf(buf, sizeof buf,
+ "ib_commit_inplace_fail_%u",
+ failure_inject_count++);
+
+ DBUG_EXECUTE_IF(buf,
+ my_error(ER_INTERNAL_ERROR, MYF(0),
+ "Injected error!");
+ fail = true;
+ );
+ }
+#endif
+ }
+
+ /* Commit or roll back the changes to the data dictionary. */
+ DEBUG_SYNC(m_user_thd, "innodb_alter_inplace_before_commit");
+
+ if (fail) {
+ trx_rollback_for_mysql(trx);
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+ ctx->rollback_instant();
+ }
+ } else if (!new_clustered) {
+ trx_commit_for_mysql(trx);
+ } else {
+ /* Test what happens on crash if the redo logs
+ are flushed to disk here. The log records
+ about the rename should not be committed, and
+ the data dictionary transaction should be
+ rolled back, restoring the old table. */
+ DBUG_EXECUTE_IF("innodb_alter_commit_crash_before_commit",
+ log_buffer_flush_to_disk();
+ DBUG_SUICIDE(););
+ ut_ad(!trx->fts_trx);
+
+		/* fail is known to be false in this branch; the
+		failure case was already handled above. */
+		ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+		ut_ad(trx->has_logged());
+		trx->commit();
+
+ /* If server crashes here, the dictionary in
+ InnoDB and MySQL will differ. The .ibd files
+ and the .frm files must be swapped manually by
+ the administrator. No loss of data. */
+ DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit",
+ log_buffer_flush_to_disk();
+ DBUG_SUICIDE(););
+ }
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ /* At this point, the changes to the persistent storage have
+ been committed or rolled back. What remains to be done is to
+ update the in-memory structures, close some handles, release
+ temporary files, and (unless we rolled back) update persistent
+ statistics. */
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+ DBUG_ASSERT(ctx->need_rebuild() == new_clustered);
+
+ if (new_clustered) {
+ innobase_online_rebuild_log_free(ctx->old_table);
+ }
+
+ if (fail) {
+ if (new_clustered) {
+ trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
+
+ dict_table_close_and_drop(trx, ctx->new_table);
+
+ trx_commit_for_mysql(trx);
+ ctx->new_table = NULL;
+ } else {
+ /* We failed, but did not rebuild the table.
+ Roll back any ADD INDEX, or get rid of garbage
+ ADD INDEX that was left over from a previous
+ ALTER TABLE statement. */
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+ innobase_rollback_sec_index(
+ ctx->new_table, table, TRUE, trx);
+ trx_commit_for_mysql(trx);
+ }
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash_fail",
+ crash_fail_inject_count++);
+
+ continue;
+ }
+
+ innobase_copy_frm_flags_from_table_share(
+ ctx->new_table, altered_table->s);
+
+ if (new_clustered) {
+ /* We will reload and refresh the
+ in-memory foreign key constraint
+ metadata. This is a rename operation
+ in preparing for dropping the old
+ table. Set the table to_be_dropped bit
+			here, to make sure the DML foreign key
+ constraint check does not use the
+ stale dict_foreign_t. This is done
+ because WL#6049 (FK MDL) has not been
+ implemented yet. */
+ ctx->old_table->to_be_dropped = true;
+
+ DBUG_PRINT("to_be_dropped",
+ ("table: %s", ctx->old_table->name.m_name));
+
+ if (innobase_update_foreign_cache(ctx, m_user_thd)
+ != DB_SUCCESS
+ && m_prebuilt->trx->check_foreigns) {
+foreign_fail:
+ push_warning_printf(
+ m_user_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_ALTER_INFO,
+ "failed to load FOREIGN KEY"
+ " constraints");
+ }
+ } else {
+ bool fk_fail = innobase_update_foreign_cache(
+ ctx, m_user_thd) != DB_SUCCESS;
+
+ if (!commit_cache_norebuild(ha_alter_info, ctx,
+ altered_table, table,
+ trx)) {
+ fk_fail = true;
+ }
+
+ if (fk_fail && m_prebuilt->trx->check_foreigns) {
+ goto foreign_fail;
+ }
+ }
+
+ dict_mem_table_free_foreign_vcol_set(ctx->new_table);
+ dict_mem_table_fill_foreign_vcol_set(ctx->new_table);
+
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+ crash_inject_count++);
+ }
+
+ if (fail) {
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>
+ (*pctx);
+ DBUG_ASSERT(ctx->need_rebuild() == new_clustered);
+
+ ut_d(dict_table_check_for_dup_indexes(
+ ctx->old_table,
+ CHECK_ABORTED_OK));
+ ut_a(fts_check_cached_index(ctx->old_table));
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash_fail",
+ crash_fail_inject_count++);
+
+ /* Restart the FTS background operations. */
+ if (ctx->old_table->fts) {
+ fts_optimize_add_table(ctx->old_table);
+ }
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+ if (trx != ctx0->trx) {
+ trx->free();
+ }
+ DBUG_RETURN(true);
+ }
+
+ if (trx == ctx0->trx) {
+ ctx0->trx = NULL;
+ }
+
+ /* Free the ctx->trx of other partitions, if any. We will only
+ use the ctx0->trx here. Others may have been allocated in
+ the prepare stage. */
+
+ for (inplace_alter_handler_ctx** pctx = &ctx_array[1]; *pctx;
+ pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+ if (ctx->trx) {
+ ctx->trx->free();
+ ctx->trx = NULL;
+ }
+ }
+
+ /* MDEV-17468: Avoid this at least when ctx->is_instant().
+ Currently dict_load_column_low() is the only place where
+	num_base for virtual columns is assigned a nonzero value. */
+ if (ctx0->num_to_drop_vcol || ctx0->num_to_add_vcol
+ || (ctx0->new_table->n_v_cols && !new_clustered
+ && (ha_alter_info->alter_info->drop_list.elements
+ || ha_alter_info->alter_info->create_list.elements))
+ || (ctx0->is_instant()
+ && m_prebuilt->table->n_v_cols
+ && ha_alter_info->handler_flags & ALTER_STORED_COLUMN_ORDER)) {
+ DBUG_ASSERT(ctx0->old_table->get_ref_count() == 1);
+ ut_ad(ctx0->prebuilt == m_prebuilt);
+ trx_commit_for_mysql(m_prebuilt->trx);
+
+ for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx;
+ pctx++) {
+ auto ctx= static_cast<ha_innobase_inplace_ctx*>(*pctx);
+ ctx->prebuilt->table = innobase_reload_table(
+ m_user_thd, ctx->prebuilt->table,
+ table->s->table_name, *ctx);
+ innobase_copy_frm_flags_from_table_share(
+ ctx->prebuilt->table, altered_table->s);
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+ trx->free();
+ MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
+ DBUG_RETURN(false);
+ }
+
+ /* Release the table locks. */
+ trx_commit_for_mysql(m_prebuilt->trx);
+
+ DBUG_EXECUTE_IF("ib_ddl_crash_after_user_trx_commit", DBUG_SUICIDE(););
+
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>
+ (*pctx);
+ DBUG_ASSERT(ctx->need_rebuild() == new_clustered);
+
+ /* Publish the created fulltext index, if any.
+ Note that a fulltext index can be created without
+ creating the clustered index, if there already exists
+ a suitable FTS_DOC_ID column. If not, one will be
+ created, implying new_clustered */
+ for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+ dict_index_t* index = ctx->add_index[i];
+
+ if (index->type & DICT_FTS) {
+ DBUG_ASSERT(index->type == DICT_FTS);
+				/* We reset DICT_TF2_FTS here because the bit
+				is left unset when a drop precedes the add. */
+ DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS);
+ fts_add_index(index, ctx->new_table);
+ }
+ }
+
+ ut_d(dict_table_check_for_dup_indexes(
+ ctx->new_table, CHECK_ALL_COMPLETE));
+
+ /* Start/Restart the FTS background operations. */
+ if (ctx->new_table->fts) {
+ fts_optimize_add_table(ctx->new_table);
+ }
+
+ ut_d(dict_table_check_for_dup_indexes(
+ ctx->new_table, CHECK_ABORTED_OK));
+
+#ifdef UNIV_DEBUG
+ if (!(ctx->new_table->fts != NULL
+ && ctx->new_table->fts->cache->sync->in_progress)) {
+ ut_a(fts_check_cached_index(ctx->new_table));
+ }
+#endif
+ if (new_clustered) {
+ /* Since the table has been rebuilt, we remove
+ all persistent statistics corresponding to the
+ old copy of the table (which was renamed to
+ ctx->tmp_name). */
+
+ DBUG_ASSERT(0 == strcmp(ctx->old_table->name.m_name,
+ ctx->tmp_name));
+
+ dict_stats_try_drop_table(m_user_thd,
+ ctx->new_table->name,
+ table->s->table_name);
+
+ DBUG_EXECUTE_IF("ib_ddl_crash_before_commit",
+ DBUG_SUICIDE(););
+
+ ut_ad(m_prebuilt != ctx->prebuilt
+ || ctx == ctx0);
+ bool update_own_prebuilt =
+ (m_prebuilt == ctx->prebuilt);
+ trx_t* const user_trx = m_prebuilt->trx;
+
+ row_prebuilt_free(ctx->prebuilt, TRUE);
+
+ /* Drop the copy of the old table, which was
+ renamed to ctx->tmp_name at the atomic DDL
+ transaction commit. If the system crashes
+ before this is completed, some orphan tables
+ with ctx->tmp_name may be recovered. */
+ trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
+ dberr_t error = row_merge_drop_table(trx, ctx->old_table);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ib::error() << "Inplace alter table " << ctx->old_table->name
+ << " dropping copy of the old table failed error "
+ << error
+ << ". tmp_name " << (ctx->tmp_name ? ctx->tmp_name : "N/A")
+ << " new_table " << ctx->new_table->name;
+ }
+
+ trx_commit_for_mysql(trx);
+
+ /* Rebuild the prebuilt object. */
+ ctx->prebuilt = row_create_prebuilt(
+ ctx->new_table, altered_table->s->reclength);
+ if (update_own_prebuilt) {
+ m_prebuilt = ctx->prebuilt;
+ }
+ trx_start_if_not_started(user_trx, true);
+ m_prebuilt->trx = user_trx;
+ }
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+ crash_inject_count++);
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+ trx->free();
+
+ /* TODO: The following code could be executed
+ while allowing concurrent access to the table
+ (MDL downgrade). */
+
+ if (new_clustered) {
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>
+ (*pctx);
+ DBUG_ASSERT(ctx->need_rebuild());
+
+ alter_stats_rebuild(
+ ctx->new_table, table->s->table_name.str,
+ m_user_thd);
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+ crash_inject_count++);
+ }
+ } else {
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>
+ (*pctx);
+ DBUG_ASSERT(!ctx->need_rebuild());
+
+ alter_stats_norebuild(ha_alter_info, ctx, m_user_thd);
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+ crash_inject_count++);
+ }
+ }
+
+ innobase_parse_hint_from_comment(
+ m_user_thd, m_prebuilt->table, altered_table->s);
+
+ /* TODO: Also perform DROP TABLE and DROP INDEX after
+ the MDL downgrade. */
+
+#ifndef DBUG_OFF
+ dict_index_t* clust_index = dict_table_get_first_index(
+ ctx0->prebuilt->table);
+ DBUG_ASSERT(!clust_index->online_log);
+ DBUG_ASSERT(dict_index_get_online_status(clust_index)
+ == ONLINE_INDEX_COMPLETE);
+
+ for (dict_index_t* index = clust_index;
+ index;
+ index = dict_table_get_next_index(index)) {
+ DBUG_ASSERT(!index->to_be_dropped);
+ }
+#endif /* DBUG_OFF */
+ MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
+ DBUG_RETURN(false);
+}
+
+/**
+Construct a sequence of auto-increment values.
+@param thd the session
+@param start_value the lower bound
+@param max_value the upper bound (inclusive) */
+
+ib_sequence_t::ib_sequence_t(
+ THD* thd,
+ ulonglong start_value,
+ ulonglong max_value)
+ :
+ m_max_value(max_value),
+ m_increment(0),
+ m_offset(0),
+ m_next_value(start_value),
+ m_eof(false)
+{
+ if (thd != 0 && m_max_value > 0) {
+
+ thd_get_autoinc(thd, &m_offset, &m_increment);
+
+ if (m_increment > 1 || m_offset > 1) {
+
+ /* If there is an offset or increment specified
+ then we need to work out the exact next value. */
+
+ m_next_value = innobase_next_autoinc(
+ start_value, 1,
+ m_increment, m_offset, m_max_value);
+
+ } else if (start_value == 0) {
+ /* The next value can never be 0. */
+ m_next_value = 1;
+ }
+ } else {
+ m_eof = true;
+ }
+}
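+
+/* Illustration (assumed session settings): with
+auto_increment_offset=3 and auto_increment_increment=5, a sequence
+constructed with start_value=0 will yield 3, 8, 13, ... from
+operator++, until m_max_value is reached and m_eof is set. */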
+
+/**
+Postfix increment
+@return the next value to insert */
+
+ulonglong
+ib_sequence_t::operator++(int) UNIV_NOTHROW
+{
+ ulonglong current = m_next_value;
+
+ ut_ad(!m_eof);
+ ut_ad(m_max_value > 0);
+
+ m_next_value = innobase_next_autoinc(
+ current, 1, m_increment, m_offset, m_max_value);
+
+ if (m_next_value == m_max_value && current == m_next_value) {
+ m_eof = true;
+ }
+
+ return(current);
+}
diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc
new file mode 100644
index 00000000..420e7eac
--- /dev/null
+++ b/storage/innobase/handler/i_s.cc
@@ -0,0 +1,7461 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/i_s.cc
+InnoDB INFORMATION SCHEMA tables interface to MySQL.
+
+Created July 18, 2007 Vasil Dimov
+Modified Dec 29, 2014 Jan Lindström (Added sys_semaphore_waits)
+*******************************************************/
+
+#include "univ.i"
+#include <mysql_version.h>
+#include <field.h>
+
+#include <sql_acl.h>
+#include <sql_show.h>
+#include <sql_time.h>
+
+#include "i_s.h"
+#include "btr0pcur.h"
+#include "btr0types.h"
+#include "dict0dict.h"
+#include "dict0load.h"
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "ibuf0ibuf.h"
+#include "dict0mem.h"
+#include "dict0types.h"
+#include "srv0start.h"
+#include "trx0i_s.h"
+#include "trx0trx.h"
+#include "srv0mon.h"
+#include "fut0fut.h"
+#include "pars0pars.h"
+#include "fts0types.h"
+#include "fts0opt.h"
+#include "fts0priv.h"
+#include "btr0btr.h"
+#include "page0zip.h"
+#include "sync0arr.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "dict0crea.h"
+
+/** The latest successfully looked up innodb_fts_aux_table */
+UNIV_INTERN table_id_t innodb_ft_aux_table_id;
+
+/** This structure associates a name string with a file page type and/or
+buffer page state. */
+struct buf_page_desc_t{
+	const char*	type_str;	/*!< String describing the page
+					type/state */
+ ulint type_value; /*!< Page type or page state */
+};
+
+/** We also define I_S_PAGE_TYPE_INDEX as the index page's position
+in the i_s_page_type[] array */
+#define I_S_PAGE_TYPE_INDEX 1
+
+/** Any unassigned FIL_PAGE_TYPE will be treated as unknown. */
+#define I_S_PAGE_TYPE_UNKNOWN FIL_PAGE_TYPE_UNKNOWN
+
+/** R-tree index page */
+#define I_S_PAGE_TYPE_RTREE (FIL_PAGE_TYPE_LAST + 1)
+
+/** Change buffer B-tree page */
+#define I_S_PAGE_TYPE_IBUF (FIL_PAGE_TYPE_LAST + 2)
+
+#define I_S_PAGE_TYPE_LAST I_S_PAGE_TYPE_IBUF
+
+#define I_S_PAGE_TYPE_BITS 4
+
+/** Name string for File Page Types */
+static buf_page_desc_t i_s_page_type[] = {
+ {"ALLOCATED", FIL_PAGE_TYPE_ALLOCATED},
+ {"INDEX", FIL_PAGE_INDEX},
+ {"UNDO_LOG", FIL_PAGE_UNDO_LOG},
+ {"INODE", FIL_PAGE_INODE},
+ {"IBUF_FREE_LIST", FIL_PAGE_IBUF_FREE_LIST},
+ {"IBUF_BITMAP", FIL_PAGE_IBUF_BITMAP},
+ {"SYSTEM", FIL_PAGE_TYPE_SYS},
+ {"TRX_SYSTEM", FIL_PAGE_TYPE_TRX_SYS},
+ {"FILE_SPACE_HEADER", FIL_PAGE_TYPE_FSP_HDR},
+ {"EXTENT_DESCRIPTOR", FIL_PAGE_TYPE_XDES},
+ {"BLOB", FIL_PAGE_TYPE_BLOB},
+ {"COMPRESSED_BLOB", FIL_PAGE_TYPE_ZBLOB},
+ {"COMPRESSED_BLOB2", FIL_PAGE_TYPE_ZBLOB2},
+ {"UNKNOWN", I_S_PAGE_TYPE_UNKNOWN},
+ {"RTREE_INDEX", I_S_PAGE_TYPE_RTREE},
+ {"IBUF_INDEX", I_S_PAGE_TYPE_IBUF},
+ {"PAGE COMPRESSED", FIL_PAGE_PAGE_COMPRESSED},
+ {"PAGE COMPRESSED AND ENCRYPTED", FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED},
+};
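+
+/* Note: the position of the "INDEX" entry above must stay in sync
+with I_S_PAGE_TYPE_INDEX. */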
+
+/** This structure defines information we will fetch from pages
+currently cached in the buffer pool. It will be used to populate
+table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE */
+struct buf_page_info_t{
+ ulint block_id; /*!< Buffer Pool block ID */
+ /** page identifier */
+ page_id_t id;
+ unsigned access_time:32; /*!< Time of first access */
+ unsigned io_fix:2; /*!< type of pending I/O operation */
+	uint32_t	fix_count;	/*!< Count of how many times this
+					block has been buffer-fixed */
+#ifdef BTR_CUR_HASH_ADAPT
+ unsigned hashed:1; /*!< Whether hash index has been
+ built on this page */
+#endif /* BTR_CUR_HASH_ADAPT */
+ unsigned is_old:1; /*!< TRUE if the block is in the old
+ blocks in buf_pool.LRU_old */
+ unsigned freed_page_clock:31; /*!< the value of
+ buf_pool.freed_page_clock */
+ unsigned zip_ssize:PAGE_ZIP_SSIZE_BITS;
+ /*!< Compressed page size */
+ unsigned page_state:3; /*!< Page state */
+ unsigned page_type:I_S_PAGE_TYPE_BITS; /*!< Page type */
+ unsigned num_recs:UNIV_PAGE_SIZE_SHIFT_MAX-2;
+ /*!< Number of records on Page */
+ unsigned data_size:UNIV_PAGE_SIZE_SHIFT_MAX;
+ /*!< Sum of the sizes of the records */
+ lsn_t newest_mod; /*!< Log sequence number of
+ the youngest modification */
+ lsn_t oldest_mod; /*!< Log sequence number of
+ the oldest modification */
+	index_id_t	index_id;	/*!< Index ID if an index page */
+};
+
+/*
+Use the following type mapping:
+
+C type ST_FIELD_INFO::field_type
+---------------------------------
+long MYSQL_TYPE_LONGLONG
+(field_length=MY_INT64_NUM_DECIMAL_DIGITS)
+
+long unsigned MYSQL_TYPE_LONGLONG
+(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED)
+
+char* MYSQL_TYPE_STRING
+(field_length=n)
+
+float MYSQL_TYPE_FLOAT
+(field_length=0 is ignored)
+
+void* MYSQL_TYPE_LONGLONG
+(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED)
+
+boolean (if else) MYSQL_TYPE_LONG
+(field_length=1)
+
+time_t MYSQL_TYPE_DATETIME
+(field_length=0 ignored)
+---------------------------------
+*/
+
+/** Implemented in sync0arr.cc */
+/*******************************************************************//**
+Function to populate the INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS table.
+Loop through each item in the sync array, extract the column
+information and fill the table.
+@return 0 on success */
+UNIV_INTERN
+int
+sync_arr_fill_sys_semphore_waits_table(
+/*===================================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ); /*!< in: condition (not used) */
+
+/*******************************************************************//**
+Common function to fill any of the dynamic tables:
+INFORMATION_SCHEMA.innodb_trx
+INFORMATION_SCHEMA.innodb_locks
+INFORMATION_SCHEMA.innodb_lock_waits
+@return 0 on success */
+static
+int
+trx_i_s_common_fill_table(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ); /*!< in: condition (not used) */
+
+/*******************************************************************//**
+Unbind a dynamic INFORMATION_SCHEMA table.
+@return 0 on success */
+static
+int
+i_s_common_deinit(
+/*==============*/
+ void* p); /*!< in/out: table schema object */
+/*******************************************************************//**
+Auxiliary function to store time_t value in MYSQL_TYPE_DATETIME
+field.
+@return 0 on success */
+static
+int
+field_store_time_t(
+/*===============*/
+ Field* field, /*!< in/out: target field for storage */
+ time_t time) /*!< in: value to store */
+{
+ MYSQL_TIME my_time;
+ struct tm tm_time;
+
+ if (time) {
+#if 0
+ /* use this if you are sure that `variables' and `time_zone'
+ are always initialized */
+ thd->variables.time_zone->gmt_sec_to_TIME(
+ &my_time, (my_time_t) time);
+#else
+ localtime_r(&time, &tm_time);
+ localtime_to_TIME(&my_time, &tm_time);
+ my_time.time_type = MYSQL_TIMESTAMP_DATETIME;
+#endif
+ } else {
+ memset(&my_time, 0, sizeof(my_time));
+ }
+
+ /* JAN: TODO: MySQL 5.7
+ return(field->store_time(&my_time, MYSQL_TIMESTAMP_DATETIME));
+ */
+ return(field->store_time(&my_time));
+}
+
+/*******************************************************************//**
+Auxiliary function to store char* value in MYSQL_TYPE_STRING field.
+@return 0 on success */
+int
+field_store_string(
+/*===============*/
+ Field* field, /*!< in/out: target field for storage */
+ const char* str) /*!< in: NUL-terminated utf-8 string,
+ or NULL */
+{
+ if (!str) {
+ field->set_null();
+ return 0;
+ }
+
+ field->set_notnull();
+ return field->store(str, uint(strlen(str)), system_charset_info);
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+# define I_S_AHI 1 /* Include the IS_HASHED column */
+#else
+# define I_S_AHI 0 /* Omit the IS_HASHED column */
+#endif
+
+static const LEX_CSTRING isolation_level_values[] =
+{
+ { STRING_WITH_LEN("READ UNCOMMITTED") },
+ { STRING_WITH_LEN("READ COMMITTED") },
+ { STRING_WITH_LEN("REPEATABLE READ") },
+ { STRING_WITH_LEN("SERIALIZABLE") }
+};
+
+static TypelibBuffer<4> isolation_level_values_typelib(isolation_level_values);
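+/* ENUM members are numbered from 1; this is why
+fill_innodb_trx_from_cache() stores 1 + row->trx_isolation_level. */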
+
+namespace Show {
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_trx */
+static ST_FIELD_INFO innodb_trx_fields_info[] =
+{
+#define IDX_TRX_ID 0
+ Column("trx_id", ULonglong(), NOT_NULL),
+
+#define IDX_TRX_STATE 1
+ Column("trx_state", Varchar(TRX_QUE_STATE_STR_MAX_LEN + 1), NOT_NULL),
+
+#define IDX_TRX_STARTED 2
+ Column("trx_started", Datetime(0), NOT_NULL),
+
+#define IDX_TRX_REQUESTED_LOCK_ID 3
+ Column("trx_requested_lock_id",
+ Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NULLABLE),
+
+#define IDX_TRX_WAIT_STARTED 4
+ Column("trx_wait_started", Datetime(0), NULLABLE),
+
+#define IDX_TRX_WEIGHT 5
+ Column("trx_weight", ULonglong(), NOT_NULL),
+
+#define IDX_TRX_MYSQL_THREAD_ID 6
+ Column("trx_mysql_thread_id", ULonglong(), NOT_NULL),
+
+#define IDX_TRX_QUERY 7
+ Column("trx_query", Varchar(TRX_I_S_TRX_QUERY_MAX_LEN), NULLABLE),
+
+#define IDX_TRX_OPERATION_STATE 8
+ Column("trx_operation_state", Varchar(64), NULLABLE),
+
+#define IDX_TRX_TABLES_IN_USE 9
+ Column("trx_tables_in_use", ULonglong(), NOT_NULL),
+
+#define IDX_TRX_TABLES_LOCKED 10
+ Column("trx_tables_locked", ULonglong(), NOT_NULL),
+
+#define IDX_TRX_LOCK_STRUCTS 11
+ Column("trx_lock_structs", ULonglong(), NOT_NULL),
+
+#define IDX_TRX_LOCK_MEMORY_BYTES 12
+ Column("trx_lock_memory_bytes", ULonglong(), NOT_NULL),
+
+#define IDX_TRX_ROWS_LOCKED 13
+ Column("trx_rows_locked", ULonglong(), NOT_NULL),
+
+#define IDX_TRX_ROWS_MODIFIED 14
+ Column("trx_rows_modified", ULonglong(), NOT_NULL),
+
+#define IDX_TRX_CONNCURRENCY_TICKETS 15
+ Column("trx_concurrency_tickets", ULonglong(), NOT_NULL),
+
+#define IDX_TRX_ISOLATION_LEVEL 16
+ Column("trx_isolation_level",
+ Enum(&isolation_level_values_typelib), NOT_NULL, DEFAULT_NONE),
+
+#define IDX_TRX_UNIQUE_CHECKS 17
+ Column("trx_unique_checks", SLong(1), NOT_NULL),
+
+#define IDX_TRX_FOREIGN_KEY_CHECKS 18
+ Column("trx_foreign_key_checks", SLong(1), NOT_NULL),
+
+#define IDX_TRX_LAST_FOREIGN_KEY_ERROR 19
+ Column("trx_last_foreign_key_error",
+ Varchar(TRX_I_S_TRX_FK_ERROR_MAX_LEN),NULLABLE),
+
+#define IDX_TRX_READ_ONLY 20
+ Column("trx_is_read_only", SLong(1), NOT_NULL),
+
+#define IDX_TRX_AUTOCOMMIT_NON_LOCKING 21
+ Column("trx_autocommit_non_locking", SLong(1), NOT_NULL),
+
+ CEnd()
+};
+
+} // namespace Show
+
+/*******************************************************************//**
+Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_trx
+table with it.
+@return 0 on success */
+static
+int
+fill_innodb_trx_from_cache(
+/*=======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache to read from */
+ THD* thd, /*!< in: used to call
+ schema_table_store_record() */
+ TABLE* table) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint rows_num;
+ char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+ ulint i;
+
+ DBUG_ENTER("fill_innodb_trx_from_cache");
+
+ fields = table->field;
+
+ rows_num = trx_i_s_cache_get_rows_used(cache,
+ I_S_INNODB_TRX);
+
+ for (i = 0; i < rows_num; i++) {
+
+ i_s_trx_row_t* row;
+
+ row = (i_s_trx_row_t*)
+ trx_i_s_cache_get_nth_row(
+ cache, I_S_INNODB_TRX, i);
+
+ /* trx_id */
+ OK(fields[IDX_TRX_ID]->store(row->trx_id, true));
+
+ /* trx_state */
+ OK(field_store_string(fields[IDX_TRX_STATE],
+ row->trx_state));
+
+ /* trx_started */
+ OK(field_store_time_t(fields[IDX_TRX_STARTED],
+ (time_t) row->trx_started));
+
+ /* trx_requested_lock_id */
+ /* trx_wait_started */
+ if (row->trx_wait_started != 0) {
+
+ OK(field_store_string(
+ fields[IDX_TRX_REQUESTED_LOCK_ID],
+ trx_i_s_create_lock_id(
+ row->requested_lock_row,
+ lock_id, sizeof(lock_id))));
+			/* field_store_string() sets it to notnull */
+
+ OK(field_store_time_t(
+ fields[IDX_TRX_WAIT_STARTED],
+ (time_t) row->trx_wait_started));
+ fields[IDX_TRX_WAIT_STARTED]->set_notnull();
+ } else {
+
+ fields[IDX_TRX_REQUESTED_LOCK_ID]->set_null();
+ fields[IDX_TRX_WAIT_STARTED]->set_null();
+ }
+
+ /* trx_weight */
+ OK(fields[IDX_TRX_WEIGHT]->store(row->trx_weight, true));
+
+ /* trx_mysql_thread_id */
+ OK(fields[IDX_TRX_MYSQL_THREAD_ID]->store(
+ row->trx_mysql_thread_id, true));
+
+ /* trx_query */
+ if (row->trx_query) {
+ /* store will do appropriate character set
+ conversion check */
+ fields[IDX_TRX_QUERY]->store(
+ row->trx_query,
+ static_cast<uint>(strlen(row->trx_query)),
+ row->trx_query_cs);
+ fields[IDX_TRX_QUERY]->set_notnull();
+ } else {
+ fields[IDX_TRX_QUERY]->set_null();
+ }
+
+ /* trx_operation_state */
+ OK(field_store_string(fields[IDX_TRX_OPERATION_STATE],
+ row->trx_operation_state));
+
+ /* trx_tables_in_use */
+ OK(fields[IDX_TRX_TABLES_IN_USE]->store(
+ row->trx_tables_in_use, true));
+
+ /* trx_tables_locked */
+ OK(fields[IDX_TRX_TABLES_LOCKED]->store(
+ row->trx_tables_locked, true));
+
+ /* trx_lock_structs */
+ OK(fields[IDX_TRX_LOCK_STRUCTS]->store(
+ row->trx_lock_structs, true));
+
+ /* trx_lock_memory_bytes */
+ OK(fields[IDX_TRX_LOCK_MEMORY_BYTES]->store(
+ row->trx_lock_memory_bytes, true));
+
+ /* trx_rows_locked */
+ OK(fields[IDX_TRX_ROWS_LOCKED]->store(
+ row->trx_rows_locked, true));
+
+ /* trx_rows_modified */
+ OK(fields[IDX_TRX_ROWS_MODIFIED]->store(
+ row->trx_rows_modified, true));
+
+ /* trx_concurrency_tickets */
+ OK(fields[IDX_TRX_CONNCURRENCY_TICKETS]->store(0, true));
+
+ /* trx_isolation_level */
+ OK(fields[IDX_TRX_ISOLATION_LEVEL]->store(
+ 1 + row->trx_isolation_level, true));
+
+ /* trx_unique_checks */
+ OK(fields[IDX_TRX_UNIQUE_CHECKS]->store(
+ row->trx_unique_checks, true));
+
+ /* trx_foreign_key_checks */
+ OK(fields[IDX_TRX_FOREIGN_KEY_CHECKS]->store(
+ row->trx_foreign_key_checks, true));
+
+ /* trx_last_foreign_key_error */
+ OK(field_store_string(fields[IDX_TRX_LAST_FOREIGN_KEY_ERROR],
+ row->trx_foreign_key_error));
+
+		/* trx_is_read_only */
+ OK(fields[IDX_TRX_READ_ONLY]->store(
+ row->trx_is_read_only, true));
+
+ /* trx_is_autocommit_non_locking */
+ OK(fields[IDX_TRX_AUTOCOMMIT_NON_LOCKING]->store(
+ row->trx_is_autocommit_non_locking, true));
+
+ OK(schema_table_store_record(thd, table));
+ }
+
+ DBUG_RETURN(0);
+}
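+
+/* Example: with the PROCESS privilege (enforced in
+trx_i_s_common_fill_table() below) the rows stored above can be read
+with a plain query such as
+
+  SELECT trx_id, trx_state, trx_started, trx_mysql_thread_id, trx_query
+  FROM information_schema.innodb_trx;
+
+Only transactions that are waiting for a lock report
+trx_requested_lock_id and trx_wait_started; for all others those two
+columns are NULL, as set above. */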
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_trx
+@return 0 on success */
+static
+int
+innodb_trx_init(
+/*============*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_trx_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_trx_fields_info;
+ schema->fill_table = trx_i_s_common_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+static struct st_mysql_information_schema i_s_info =
+{
+ MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_trx =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_TRX"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB transactions"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_trx_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+static const LEX_CSTRING lock_mode_values[] =
+{
+ { STRING_WITH_LEN("S") },
+ { STRING_WITH_LEN("S,GAP") },
+ { STRING_WITH_LEN("X") },
+ { STRING_WITH_LEN("X,GAP") },
+ { STRING_WITH_LEN("IS") },
+ { STRING_WITH_LEN("IS,GAP") },
+ { STRING_WITH_LEN("IX") },
+ { STRING_WITH_LEN("IX,GAP") },
+ { STRING_WITH_LEN("AUTO_INC") }
+};
+
+static TypelibBuffer<9> lock_mode_values_typelib(lock_mode_values);
+
+static const LEX_CSTRING lock_type_values[] =
+{
+ { STRING_WITH_LEN("RECORD") },
+ { STRING_WITH_LEN("TABLE") }
+};
+
+static TypelibBuffer<2> lock_type_values_typelib(lock_type_values);
+
+namespace Show {
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_locks */
+static ST_FIELD_INFO innodb_locks_fields_info[] =
+{
+#define IDX_LOCK_ID 0
+ Column("lock_id", Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NOT_NULL),
+
+#define IDX_LOCK_TRX_ID 1
+ Column("lock_trx_id", ULonglong(), NOT_NULL),
+
+#define IDX_LOCK_MODE 2
+ Column("lock_mode", Enum(&lock_mode_values_typelib), NOT_NULL, DEFAULT_NONE),
+
+#define IDX_LOCK_TYPE 3
+ Column("lock_type", Enum(&lock_type_values_typelib), NOT_NULL, DEFAULT_NONE),
+
+#define IDX_LOCK_TABLE 4
+ Column("lock_table", Varchar(1024), NOT_NULL),
+
+#define IDX_LOCK_INDEX 5
+ Column("lock_index", Varchar(1024), NULLABLE),
+
+#define IDX_LOCK_SPACE 6
+ Column("lock_space", ULong(), NULLABLE),
+
+#define IDX_LOCK_PAGE 7
+ Column("lock_page", ULong(), NULLABLE),
+
+#define IDX_LOCK_REC 8
+ Column("lock_rec", ULong(), NULLABLE),
+
+#define IDX_LOCK_DATA 9
+ Column("lock_data", Varchar(TRX_I_S_LOCK_DATA_MAX_LEN), NULLABLE),
+ CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_locks
+table with it.
+@return 0 on success */
+static
+int
+fill_innodb_locks_from_cache(
+/*=========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache to read from */
+ THD* thd, /*!< in: MySQL client connection */
+ TABLE* table) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint rows_num;
+ char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+ ulint i;
+
+ DBUG_ENTER("fill_innodb_locks_from_cache");
+
+ fields = table->field;
+
+ rows_num = trx_i_s_cache_get_rows_used(cache,
+ I_S_INNODB_LOCKS);
+
+ for (i = 0; i < rows_num; i++) {
+
+ i_s_locks_row_t* row;
+ char buf[MAX_FULL_NAME_LEN + 1];
+ const char* bufend;
+
+ row = (i_s_locks_row_t*)
+ trx_i_s_cache_get_nth_row(
+ cache, I_S_INNODB_LOCKS, i);
+
+ /* lock_id */
+ trx_i_s_create_lock_id(row, lock_id, sizeof(lock_id));
+ OK(field_store_string(fields[IDX_LOCK_ID],
+ lock_id));
+
+ /* lock_trx_id */
+ OK(fields[IDX_LOCK_TRX_ID]->store(row->lock_trx_id, true));
+
+ /* lock_mode */
+ OK(fields[IDX_LOCK_MODE]->store(row->lock_mode, true));
+
+ /* lock_type */
+ OK(fields[IDX_LOCK_TYPE]->store(
+ row->lock_index ? 1 : 2, true));
+
+ /* lock_table */
+ bufend = innobase_convert_name(buf, sizeof(buf),
+ row->lock_table,
+ strlen(row->lock_table),
+ thd);
+ OK(fields[IDX_LOCK_TABLE]->store(
+ buf, uint(bufend - buf), system_charset_info));
+
+ if (row->lock_index) {
+ /* record lock */
+ OK(field_store_string(fields[IDX_LOCK_INDEX],
+ row->lock_index));
+ OK(fields[IDX_LOCK_SPACE]->store(
+ row->lock_page.space(), true));
+ fields[IDX_LOCK_SPACE]->set_notnull();
+ OK(fields[IDX_LOCK_PAGE]->store(
+ row->lock_page.page_no(), true));
+ fields[IDX_LOCK_PAGE]->set_notnull();
+ OK(fields[IDX_LOCK_REC]->store(
+ row->lock_rec, true));
+ fields[IDX_LOCK_REC]->set_notnull();
+ OK(field_store_string(fields[IDX_LOCK_DATA],
+ row->lock_data));
+ } else {
+ fields[IDX_LOCK_INDEX]->set_null();
+ fields[IDX_LOCK_SPACE]->set_null();
+ fields[IDX_LOCK_PAGE]->set_null();
+ fields[IDX_LOCK_REC]->set_null();
+ fields[IDX_LOCK_DATA]->set_null();
+ }
+
+ OK(schema_table_store_record(thd, table));
+ }
+
+ DBUG_RETURN(0);
+}
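+
+/* Example: this table only reports locks that take part in a conflict
+(hence the plugin description "InnoDB conflicting locks" below), so on
+a server with no lock waits the following normally returns an empty
+set:
+
+  SELECT lock_id, lock_trx_id, lock_mode, lock_type, lock_table
+  FROM information_schema.innodb_locks;
+*/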
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_locks
+@return 0 on success */
+static
+int
+innodb_locks_init(
+/*==============*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_locks_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_locks_fields_info;
+ schema->fill_table = trx_i_s_common_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_locks =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_LOCKS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB conflicting locks"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_locks_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+
+namespace Show {
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_lock_waits */
+static ST_FIELD_INFO innodb_lock_waits_fields_info[] =
+{
+#define IDX_REQUESTING_TRX_ID 0
+ Column("requesting_trx_id", ULonglong(), NOT_NULL),
+
+#define IDX_REQUESTED_LOCK_ID 1
+ Column("requested_lock_id", Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NOT_NULL),
+
+#define IDX_BLOCKING_TRX_ID 2
+ Column("blocking_trx_id", ULonglong(), NOT_NULL),
+
+#define IDX_BLOCKING_LOCK_ID 3
+ Column("blocking_lock_id", Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NOT_NULL),
+ CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Read data from cache buffer and fill the
+INFORMATION_SCHEMA.innodb_lock_waits table with it.
+@return 0 on success */
+static
+int
+fill_innodb_lock_waits_from_cache(
+/*==============================*/
+ trx_i_s_cache_t* cache, /*!< in: cache to read from */
+ THD* thd, /*!< in: used to call
+ schema_table_store_record() */
+ TABLE* table) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint rows_num;
+ char requested_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+ char blocking_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+ ulint i;
+
+ DBUG_ENTER("fill_innodb_lock_waits_from_cache");
+
+ fields = table->field;
+
+ rows_num = trx_i_s_cache_get_rows_used(cache,
+ I_S_INNODB_LOCK_WAITS);
+
+ for (i = 0; i < rows_num; i++) {
+
+ i_s_lock_waits_row_t* row;
+
+ row = (i_s_lock_waits_row_t*)
+ trx_i_s_cache_get_nth_row(
+ cache, I_S_INNODB_LOCK_WAITS, i);
+
+ /* requesting_trx_id */
+ OK(fields[IDX_REQUESTING_TRX_ID]->store(
+ row->requested_lock_row->lock_trx_id, true));
+
+ /* requested_lock_id */
+ OK(field_store_string(
+ fields[IDX_REQUESTED_LOCK_ID],
+ trx_i_s_create_lock_id(
+ row->requested_lock_row,
+ requested_lock_id,
+ sizeof(requested_lock_id))));
+
+ /* blocking_trx_id */
+ OK(fields[IDX_BLOCKING_TRX_ID]->store(
+ row->blocking_lock_row->lock_trx_id, true));
+
+ /* blocking_lock_id */
+ OK(field_store_string(
+ fields[IDX_BLOCKING_LOCK_ID],
+ trx_i_s_create_lock_id(
+ row->blocking_lock_row,
+ blocking_lock_id,
+ sizeof(blocking_lock_id))));
+
+ OK(schema_table_store_record(thd, table));
+ }
+
+ DBUG_RETURN(0);
+}
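+
+/* Example: the usual "who is blocking whom" query joins this table
+twice with INFORMATION_SCHEMA.INNODB_TRX, once per transaction id
+column:
+
+  SELECT r.trx_id AS waiting_trx, r.trx_query AS waiting_query,
+         b.trx_id AS blocking_trx, b.trx_query AS blocking_query
+  FROM information_schema.innodb_lock_waits w
+  JOIN information_schema.innodb_trx b ON b.trx_id = w.blocking_trx_id
+  JOIN information_schema.innodb_trx r ON r.trx_id = w.requesting_trx_id;
+*/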
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_lock_waits
+@return 0 on success */
+static
+int
+innodb_lock_waits_init(
+/*===================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_lock_waits_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_lock_waits_fields_info;
+ schema->fill_table = trx_i_s_common_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_lock_waits =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_LOCK_WAITS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB which lock is blocking which"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_lock_waits_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/*******************************************************************//**
+Common function to fill any of the dynamic tables:
+INFORMATION_SCHEMA.innodb_trx
+INFORMATION_SCHEMA.innodb_locks
+INFORMATION_SCHEMA.innodb_lock_waits
+@return 0 on success */
+static
+int
+trx_i_s_common_fill_table(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ LEX_CSTRING table_name;
+ int ret;
+ trx_i_s_cache_t* cache;
+
+ DBUG_ENTER("trx_i_s_common_fill_table");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ /* minimize the number of places where global variables are
+ referenced */
+ cache = trx_i_s_cache;
+
+ /* Which table do we have to fill? */
+ table_name = tables->schema_table_name;
+ /* or table_name = tables->schema_table->table_name; */
+
+ RETURN_IF_INNODB_NOT_STARTED(table_name.str);
+
+ /* update the cache */
+ trx_i_s_cache_start_write(cache);
+ trx_i_s_possibly_fetch_data_into_cache(cache);
+ trx_i_s_cache_end_write(cache);
+
+ if (trx_i_s_cache_is_truncated(cache)) {
+
+ ib::warn() << "Data in " << table_name.str << " truncated due to"
+ " memory limit of " << TRX_I_S_MEM_LIMIT << " bytes";
+ }
+
+ ret = 0;
+
+ trx_i_s_cache_start_read(cache);
+
+ if (innobase_strcasecmp(table_name.str, "innodb_trx") == 0) {
+
+ if (fill_innodb_trx_from_cache(
+ cache, thd, tables->table) != 0) {
+
+ ret = 1;
+ }
+
+ } else if (innobase_strcasecmp(table_name.str, "innodb_locks") == 0) {
+
+ if (fill_innodb_locks_from_cache(
+ cache, thd, tables->table) != 0) {
+
+ ret = 1;
+ }
+
+ } else if (innobase_strcasecmp(table_name.str, "innodb_lock_waits") == 0) {
+
+ if (fill_innodb_lock_waits_from_cache(
+ cache, thd, tables->table) != 0) {
+
+ ret = 1;
+ }
+
+ } else {
+ ib::error() << "trx_i_s_common_fill_table() was"
+ " called to fill unknown table: " << table_name.str << "."
+ " This function only knows how to fill"
+ " innodb_trx, innodb_locks and"
+ " innodb_lock_waits tables.";
+
+ ret = 1;
+ }
+
+ trx_i_s_cache_end_read(cache);
+
+#if 0
+ DBUG_RETURN(ret);
+#else
+ /* if this function returns anything other than 0, a
+ deadlock occurs between the mysqld server and the mysql
+ client, see http://bugs.mysql.com/29900 ; when that bug is
+ resolved we can enable the DBUG_RETURN(ret) above */
+ ret++; // silence a gcc46 warning
+ DBUG_RETURN(0);
+#endif
+}
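+
+/* Note that the check_global_access() call above makes all three
+tables appear empty, rather than raising an error, for users without
+the PROCESS privilege. Example ('monitor'@'localhost' is a placeholder
+account):
+
+  GRANT PROCESS ON *.* TO 'monitor'@'localhost';
+  SELECT COUNT(*) FROM information_schema.innodb_trx;
+*/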
+
+namespace Show {
+/* Fields of the dynamic table information_schema.innodb_cmp. */
+static ST_FIELD_INFO i_s_cmp_fields_info[] =
+{
+ Column("page_size", SLong(5),NOT_NULL, "Compressed Page Size"),
+ Column("compress_ops", SLong(), NOT_NULL, "Total Number of Compressions"),
+ Column("compress_ops_ok",SLong(), NOT_NULL, "Total Number of "
+ "Successful Compressions"),
+ Column("compress_time", SLong(), NOT_NULL, "Total Duration of "
+ "Compressions, in Seconds"),
+ Column("uncompress_ops", SLong(), NOT_NULL, "Total Number of Decompressions"),
+ Column("uncompress_time",SLong(), NOT_NULL, "Total Duration of "
+ "Decompressions, in Seconds"),
+ CEnd(),
+};
+} // namespace Show
+
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp or
+innodb_cmp_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_fill_low(
+/*=============*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* , /*!< in: condition (ignored) */
+ ibool reset) /*!< in: TRUE=reset cumulated counts */
+{
+ TABLE* table = (TABLE*) tables->table;
+ int status = 0;
+
+ DBUG_ENTER("i_s_cmp_fill_low");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+ for (uint i = 0; i < PAGE_ZIP_SSIZE_MAX; i++) {
+ page_zip_stat_t* zip_stat = &page_zip_stat[i];
+
+ table->field[0]->store(UNIV_ZIP_SIZE_MIN << i);
+
+ /* The cumulated counts are not protected by any
+ mutex. Thus, some operation in page0zip.cc could
+ increment a counter between the time we read it and
+ clear it. We could introduce mutex protection, but it
+ could cause a measurable performance hit in
+ page0zip.cc. */
+ table->field[1]->store(zip_stat->compressed, true);
+ table->field[2]->store(zip_stat->compressed_ok, true);
+ table->field[3]->store(zip_stat->compressed_usec / 1000000,
+ true);
+ table->field[4]->store(zip_stat->decompressed, true);
+ table->field[5]->store(zip_stat->decompressed_usec / 1000000,
+ true);
+
+ if (reset) {
+ new (zip_stat) page_zip_stat_t();
+ }
+
+ if (schema_table_store_record(thd, table)) {
+ status = 1;
+ break;
+ }
+ }
+
+ DBUG_RETURN(status);
+}
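+
+/* Example: the compression efficiency per page size can be read off
+these cumulated counts; a compress_ops_ok/compress_ops ratio close to 1
+means that few compression attempts failed:
+
+  SELECT page_size, compress_ops, compress_ops_ok, compress_time
+  FROM information_schema.innodb_cmp;
+*/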
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_fill(
+/*=========*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmp_fill_low(thd, tables, cond, FALSE));
+}
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_reset_fill(
+/*===============*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmp_fill_low(thd, tables, cond, TRUE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmp.
+@return 0 on success */
+static
+int
+i_s_cmp_init(
+/*=========*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmp_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::i_s_cmp_fields_info;
+ schema->fill_table = i_s_cmp_fill;
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmp_reset.
+@return 0 on success */
+static
+int
+i_s_cmp_reset_init(
+/*===============*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmp_reset_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::i_s_cmp_fields_info;
+ schema->fill_table = i_s_cmp_reset_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_reset =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP_RESET"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression;"
+ " reset cumulated counts"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_reset_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+
+namespace Show {
+/* Fields of the dynamic tables
+information_schema.innodb_cmp_per_index and
+information_schema.innodb_cmp_per_index_reset. */
+static ST_FIELD_INFO i_s_cmp_per_index_fields_info[] =
+{
+#define IDX_DATABASE_NAME 0
+ Column("database_name", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
+#define IDX_TABLE_NAME 1 /* FIXME: this is in my_charset_filename! */
+ Column("table_name", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
+#define IDX_INDEX_NAME 2
+ Column("index_name", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
+#define IDX_COMPRESS_OPS 3
+ Column("compress_ops", SLong(), NOT_NULL),
+
+#define IDX_COMPRESS_OPS_OK 4
+ Column("compress_ops_ok", SLong(), NOT_NULL),
+
+#define IDX_COMPRESS_TIME 5
+ Column("compress_time", SLong(), NOT_NULL),
+
+#define IDX_UNCOMPRESS_OPS 6
+ Column("uncompress_ops", SLong(), NOT_NULL),
+
+#define IDX_UNCOMPRESS_TIME 7
+ Column("uncompress_time", SLong(), NOT_NULL),
+
+ CEnd()
+};
+
+} // namespace Show
+
+/*******************************************************************//**
+Fill the dynamic table
+information_schema.innodb_cmp_per_index or
+information_schema.innodb_cmp_per_index_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_per_index_fill_low(
+/*=======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* , /*!< in: condition (ignored) */
+ ibool reset) /*!< in: TRUE=reset cumulated counts */
+{
+ TABLE* table = tables->table;
+ Field** fields = table->field;
+ int status = 0;
+
+ DBUG_ENTER("i_s_cmp_per_index_fill_low");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+ /* Create a snapshot of the stats so we do not bump into lock
+ order violations with dict_sys.mutex below. */
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index_t snap (page_zip_stat_per_index);
+ mutex_exit(&page_zip_stat_per_index_mutex);
+
+ mutex_enter(&dict_sys.mutex);
+
+ page_zip_stat_per_index_t::iterator iter;
+ ulint i;
+
+ for (iter = snap.begin(), i = 0; iter != snap.end(); iter++, i++) {
+
+ dict_index_t* index = dict_index_find_on_id_low(iter->first);
+
+ if (index != NULL) {
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+
+ dict_fs2utf8(index->table->name.m_name,
+ db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ status = field_store_string(fields[IDX_DATABASE_NAME],
+ db_utf8)
+ || field_store_string(fields[IDX_TABLE_NAME],
+ table_utf8)
+ || field_store_string(fields[IDX_INDEX_NAME],
+ index->name);
+ } else {
+ /* index not found */
+ char name[MY_INT64_NUM_DECIMAL_DIGITS
+ + sizeof "index_id: "];
+ fields[IDX_DATABASE_NAME]->set_null();
+ fields[IDX_TABLE_NAME]->set_null();
+ fields[IDX_INDEX_NAME]->set_notnull();
+ status = fields[IDX_INDEX_NAME]->store(
+ name,
+ uint(snprintf(name, sizeof name,
+ "index_id: " IB_ID_FMT,
+ iter->first)),
+ system_charset_info);
+ }
+
+ if (status
+ || fields[IDX_COMPRESS_OPS]->store(
+ iter->second.compressed, true)
+ || fields[IDX_COMPRESS_OPS_OK]->store(
+ iter->second.compressed_ok, true)
+ || fields[IDX_COMPRESS_TIME]->store(
+ iter->second.compressed_usec / 1000000, true)
+ || fields[IDX_UNCOMPRESS_OPS]->store(
+ iter->second.decompressed, true)
+ || fields[IDX_UNCOMPRESS_TIME]->store(
+ iter->second.decompressed_usec / 1000000, true)
+ || schema_table_store_record(thd, table)) {
+ status = 1;
+ break;
+ }
+ /* Release and reacquire the dict mutex to allow other
+ threads to proceed. This could eventually result in the
+ contents of INFORMATION_SCHEMA.innodb_cmp_per_index being
+ inconsistent, but it is an acceptable compromise. */
+ if (i == 1000) {
+ mutex_exit(&dict_sys.mutex);
+ i = 0;
+ mutex_enter(&dict_sys.mutex);
+ }
+ }
+
+ mutex_exit(&dict_sys.mutex);
+
+ if (reset) {
+ page_zip_reset_stat_per_index();
+ }
+
+ DBUG_RETURN(status);
+}
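+
+/* Example: per-index statistics are only collected while the
+innodb_cmp_per_index_enabled system variable is ON, since gathering
+them has a cost:
+
+  SET GLOBAL innodb_cmp_per_index_enabled = ON;
+  SELECT database_name, table_name, index_name, compress_ops_ok
+  FROM information_schema.innodb_cmp_per_index;
+*/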
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp_per_index.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_per_index_fill(
+/*===================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmp_per_index_fill_low(thd, tables, cond, FALSE));
+}
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp_per_index_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_per_index_reset_fill(
+/*=========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmp_per_index_fill_low(thd, tables, cond, TRUE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmp_per_index.
+@return 0 on success */
+static
+int
+i_s_cmp_per_index_init(
+/*===================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmp_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::i_s_cmp_per_index_fields_info;
+ schema->fill_table = i_s_cmp_per_index_fill;
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmp_per_index_reset.
+@return 0 on success */
+static
+int
+i_s_cmp_per_index_reset_init(
+/*=========================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmp_reset_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::i_s_cmp_per_index_fields_info;
+ schema->fill_table = i_s_cmp_per_index_reset_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_per_index =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP_PER_INDEX"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression (per index)"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_per_index_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_per_index_reset =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP_PER_INDEX_RESET"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression (per index);"
+ " reset cumulated counts"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_per_index_reset_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+
+namespace Show {
+/* Fields of the dynamic table information_schema.innodb_cmpmem. */
+static ST_FIELD_INFO i_s_cmpmem_fields_info[] =
+{
+ Column("page_size", SLong(5), NOT_NULL, "Buddy Block Size"),
+ Column("buffer_pool_instance", SLong(), NOT_NULL, "Buffer Pool Id"),
+ Column("pages_used", SLong(), NOT_NULL, "Currently in Use"),
+ Column("pages_free", SLong(), NOT_NULL, "Currently Available"),
+ Column("relocation_ops", SLonglong(), NOT_NULL, "Total Number of Relocations"),
+ Column("relocation_time", SLong(), NOT_NULL, "Total Duration of Relocations,"
+ " in Seconds"),
+ CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmpmem or
+innodb_cmpmem_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmpmem_fill_low(
+/*================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* , /*!< in: condition (ignored) */
+ ibool reset) /*!< in: TRUE=reset cumulated counts */
+{
+ TABLE* table = (TABLE*) tables->table;
+
+ DBUG_ENTER("i_s_cmpmem_fill_low");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+ ulint zip_free_len_local[BUF_BUDDY_SIZES_MAX + 1];
+ buf_buddy_stat_t buddy_stat_local[BUF_BUDDY_SIZES_MAX + 1];
+
+ /* Save buddy stats for buffer pool in local variables. */
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) {
+ zip_free_len_local[x] = (x < BUF_BUDDY_SIZES) ?
+ UT_LIST_GET_LEN(buf_pool.zip_free[x]) : 0;
+
+ buddy_stat_local[x] = buf_pool.buddy_stat[x];
+
+ if (reset) {
+ /* This is protected by buf_pool.mutex. */
+ buf_pool.buddy_stat[x].relocated = 0;
+ buf_pool.buddy_stat[x].relocated_usec = 0;
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) {
+ buf_buddy_stat_t* buddy_stat = &buddy_stat_local[x];
+
+ Field **field = table->field;
+
+ (*field++)->store(BUF_BUDDY_LOW << x);
+ (*field++)->store(0, true);
+ (*field++)->store(buddy_stat->used, true);
+ (*field++)->store(zip_free_len_local[x], true);
+ (*field++)->store(buddy_stat->relocated, true);
+ (*field)->store(buddy_stat->relocated_usec / 1000000, true);
+
+ if (schema_table_store_record(thd, table)) {
+ DBUG_RETURN(1);
+ }
+ }
+
+ DBUG_RETURN(0);
+}
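+
+/* Example: a quick view of the compressed-page buddy allocator; a
+large pages_free relative to pages_used for a block size can indicate
+fragmentation of the compressed memory:
+
+  SELECT page_size, pages_used, pages_free, relocation_ops
+  FROM information_schema.innodb_cmpmem;
+*/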
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmpmem.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmpmem_fill(
+/*============*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmpmem_fill_low(thd, tables, cond, FALSE));
+}
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmpmem_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmpmem_reset_fill(
+/*==================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmpmem_fill_low(thd, tables, cond, TRUE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmpmem.
+@return 0 on success */
+static
+int
+i_s_cmpmem_init(
+/*============*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmpmem_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::i_s_cmpmem_fields_info;
+ schema->fill_table = i_s_cmpmem_fill;
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmpmem_reset.
+@return 0 on success */
+static
+int
+i_s_cmpmem_reset_init(
+/*==================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmpmem_reset_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::i_s_cmpmem_fields_info;
+ schema->fill_table = i_s_cmpmem_reset_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMPMEM"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmpmem_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem_reset =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMPMEM_RESET"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool;"
+ " reset cumulated counts"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmpmem_reset_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+
+static const LEX_CSTRING metric_type_values[] =
+{
+ { STRING_WITH_LEN("value") },
+ { STRING_WITH_LEN("status_counter") },
+ { STRING_WITH_LEN("set_owner") },
+ { STRING_WITH_LEN("set_member") },
+ { STRING_WITH_LEN("counter") }
+};
+
+static TypelibBuffer<5> metric_type_values_typelib(metric_type_values);
+
+namespace Show {
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_metrics */
+static ST_FIELD_INFO innodb_metrics_fields_info[] =
+{
+#define METRIC_NAME 0
+ Column("NAME", Varchar(NAME_LEN + 1), NOT_NULL),
+
+#define METRIC_SUBSYS 1
+ Column("SUBSYSTEM", Varchar(NAME_LEN + 1), NOT_NULL),
+
+#define METRIC_VALUE_START 2
+ Column("COUNT", SLonglong(), NOT_NULL),
+
+#define METRIC_MAX_VALUE_START 3
+ Column("MAX_COUNT", SLonglong(), NULLABLE),
+
+#define METRIC_MIN_VALUE_START 4
+ Column("MIN_COUNT", SLonglong(), NULLABLE),
+
+#define METRIC_AVG_VALUE_START 5
+ Column("AVG_COUNT", Float(MAX_FLOAT_STR_LENGTH), NULLABLE),
+
+#define METRIC_VALUE_RESET 6
+ Column("COUNT_RESET", SLonglong(), NOT_NULL),
+
+#define METRIC_MAX_VALUE_RESET 7
+ Column("MAX_COUNT_RESET", SLonglong(), NULLABLE),
+
+#define METRIC_MIN_VALUE_RESET 8
+ Column("MIN_COUNT_RESET", SLonglong(), NULLABLE),
+
+#define METRIC_AVG_VALUE_RESET 9
+ Column("AVG_COUNT_RESET", Float(MAX_FLOAT_STR_LENGTH), NULLABLE),
+
+#define METRIC_START_TIME 10
+ Column("TIME_ENABLED", Datetime(0), NULLABLE),
+
+#define METRIC_STOP_TIME 11
+ Column("TIME_DISABLED", Datetime(0), NULLABLE),
+
+#define METRIC_TIME_ELAPSED 12
+ Column("TIME_ELAPSED", SLonglong(), NULLABLE),
+
+#define METRIC_RESET_TIME 13
+ Column("TIME_RESET", Datetime(0), NULLABLE),
+
+#define METRIC_STATUS 14
+ Column("ENABLED", SLong(1), NOT_NULL),
+
+#define METRIC_TYPE 15
+ Column("TYPE", Enum(&metric_type_values_typelib), NOT_NULL, DEFAULT_NONE),
+
+#define METRIC_DESC 16
+ Column("COMMENT", Varchar(NAME_LEN + 1), NOT_NULL),
+ CEnd()
+};
+} // namespace Show
+
+/**********************************************************************//**
+Fill the information schema metrics table.
+@return 0 on success */
+static
+int
+i_s_metrics_fill(
+/*=============*/
+ THD* thd, /*!< in: thread */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ int count;
+ Field** fields;
+ double time_diff = 0;
+ monitor_info_t* monitor_info;
+ mon_type_t min_val;
+ mon_type_t max_val;
+
+ DBUG_ENTER("i_s_metrics_fill");
+ fields = table_to_fill->field;
+
+ for (count = 0; count < NUM_MONITOR; count++) {
+ monitor_info = srv_mon_get_info((monitor_id_t) count);
+
+ /* A good place to sanity check the Monitor ID */
+ ut_a(count == monitor_info->monitor_id);
+
+ /* If the item refers to a Module, there is nothing
+ to fill; continue. */
+ if ((monitor_info->monitor_type & MONITOR_MODULE)
+ || (monitor_info->monitor_type & MONITOR_HIDDEN)) {
+ continue;
+ }
+
+ /* If this is an existing "status variable", and
+ its corresponding counter is still on, we need
+ to calculate the result from its corresponding
+ counter. */
+ if (monitor_info->monitor_type & MONITOR_EXISTING
+ && MONITOR_IS_ON(count)) {
+ srv_mon_process_existing_counter((monitor_id_t) count,
+ MONITOR_GET_VALUE);
+ }
+
+ /* Fill in counter's basic information */
+ OK(field_store_string(fields[METRIC_NAME],
+ monitor_info->monitor_name));
+
+ OK(field_store_string(fields[METRIC_SUBSYS],
+ monitor_info->monitor_module));
+
+ OK(field_store_string(fields[METRIC_DESC],
+ monitor_info->monitor_desc));
+
+ /* Fill in counter values */
+ OK(fields[METRIC_VALUE_RESET]->store(
+ MONITOR_VALUE(count), FALSE));
+
+ OK(fields[METRIC_VALUE_START]->store(
+ MONITOR_VALUE_SINCE_START(count), FALSE));
+
+ /* If the max value is MAX_RESERVED, counter max
+ value has not been updated. Set the column value
+ to NULL. */
+ if (MONITOR_MAX_VALUE(count) == MAX_RESERVED
+ || MONITOR_MAX_MIN_NOT_INIT(count)) {
+ fields[METRIC_MAX_VALUE_RESET]->set_null();
+ } else {
+ OK(fields[METRIC_MAX_VALUE_RESET]->store(
+ MONITOR_MAX_VALUE(count), FALSE));
+ fields[METRIC_MAX_VALUE_RESET]->set_notnull();
+ }
+
+ /* If the min value is MIN_RESERVED, counter min
+ value has not been updated. Set the column value
+ to NULL. */
+ if (MONITOR_MIN_VALUE(count) == MIN_RESERVED
+ || MONITOR_MAX_MIN_NOT_INIT(count)) {
+ fields[METRIC_MIN_VALUE_RESET]->set_null();
+ } else {
+ OK(fields[METRIC_MIN_VALUE_RESET]->store(
+ MONITOR_MIN_VALUE(count), FALSE));
+ fields[METRIC_MIN_VALUE_RESET]->set_notnull();
+ }
+
+ /* Calculate the max value since counter started */
+ max_val = srv_mon_calc_max_since_start((monitor_id_t) count);
+
+ if (max_val == MAX_RESERVED
+ || MONITOR_MAX_MIN_NOT_INIT(count)) {
+ fields[METRIC_MAX_VALUE_START]->set_null();
+ } else {
+ OK(fields[METRIC_MAX_VALUE_START]->store(
+ max_val, FALSE));
+ fields[METRIC_MAX_VALUE_START]->set_notnull();
+ }
+
+ /* Calculate the min value since counter started */
+ min_val = srv_mon_calc_min_since_start((monitor_id_t) count);
+
+ if (min_val == MIN_RESERVED
+ || MONITOR_MAX_MIN_NOT_INIT(count)) {
+ fields[METRIC_MIN_VALUE_START]->set_null();
+ } else {
+ OK(fields[METRIC_MIN_VALUE_START]->store(
+ min_val, FALSE));
+
+ fields[METRIC_MIN_VALUE_START]->set_notnull();
+ }
+
+ /* If the monitor has ever been enabled (whether or not it
+ is currently disabled), fill the METRIC_START_TIME and
+ METRIC_TIME_ELAPSED fields */
+ if (MONITOR_FIELD(count, mon_start_time)) {
+ OK(field_store_time_t(fields[METRIC_START_TIME],
+ (time_t)MONITOR_FIELD(count, mon_start_time)));
+ fields[METRIC_START_TIME]->set_notnull();
+
+ /* If the monitor is enabled, TIME_ELAPSED is the
+ difference between the current time and the time the
+ monitor was enabled. Otherwise, it is the difference
+ between the times the monitor was enabled and
+ disabled */
+ if (MONITOR_IS_ON(count)) {
+ time_diff = difftime(time(NULL),
+ MONITOR_FIELD(count, mon_start_time));
+ } else {
+ time_diff = difftime(
+ MONITOR_FIELD(count, mon_stop_time),
+ MONITOR_FIELD(count, mon_start_time));
+ }
+
+ OK(fields[METRIC_TIME_ELAPSED]->store(
+ time_diff));
+ fields[METRIC_TIME_ELAPSED]->set_notnull();
+ } else {
+ fields[METRIC_START_TIME]->set_null();
+ fields[METRIC_TIME_ELAPSED]->set_null();
+ time_diff = 0;
+ }
+
+ /* Unless MONITOR_NO_AVERAGE is set, we must
+ calculate the average value. If this is a monitor set
+ owner marked by MONITOR_SET_OWNER, divide
+ the value by another counter (the number of calls)
+ designated by monitor_info->monitor_related_id.
+ Otherwise average the counter value over the time between
+ when the counter was enabled and when it was disabled
+ or sampled. */
+ if ((monitor_info->monitor_type
+ & (MONITOR_NO_AVERAGE | MONITOR_SET_OWNER))
+ == MONITOR_SET_OWNER
+ && monitor_info->monitor_related_id) {
+ mon_type_t value_start
+ = MONITOR_VALUE_SINCE_START(
+ monitor_info->monitor_related_id);
+
+ if (value_start) {
+ OK(fields[METRIC_AVG_VALUE_START]->store(
+ MONITOR_VALUE_SINCE_START(count)
+ / value_start, FALSE));
+
+ fields[METRIC_AVG_VALUE_START]->set_notnull();
+ } else {
+ fields[METRIC_AVG_VALUE_START]->set_null();
+ }
+
+ if (mon_type_t related_value =
+ MONITOR_VALUE(monitor_info->monitor_related_id)) {
+ OK(fields[METRIC_AVG_VALUE_RESET]
+ ->store(MONITOR_VALUE(count)
+ / related_value, false));
+ fields[METRIC_AVG_VALUE_RESET]->set_notnull();
+ } else {
+ fields[METRIC_AVG_VALUE_RESET]->set_null();
+ }
+ } else if (!(monitor_info->monitor_type
+ & (MONITOR_NO_AVERAGE
+ | MONITOR_DISPLAY_CURRENT))) {
+ if (time_diff != 0) {
+ OK(fields[METRIC_AVG_VALUE_START]->store(
+ (double) MONITOR_VALUE_SINCE_START(
+ count) / time_diff));
+ fields[METRIC_AVG_VALUE_START]->set_notnull();
+ } else {
+ fields[METRIC_AVG_VALUE_START]->set_null();
+ }
+
+ if (MONITOR_FIELD(count, mon_reset_time)) {
+ /* calculate the time difference since last
+ reset */
+ if (MONITOR_IS_ON(count)) {
+ time_diff = difftime(
+ time(NULL), MONITOR_FIELD(
+ count, mon_reset_time));
+ } else {
+ time_diff = difftime(
+ MONITOR_FIELD(count, mon_stop_time),
+ MONITOR_FIELD(count, mon_reset_time));
+ }
+ } else {
+ time_diff = 0;
+ }
+
+ if (time_diff != 0) {
+ OK(fields[METRIC_AVG_VALUE_RESET]->store(
+ static_cast<double>(
+ MONITOR_VALUE(count))
+ / time_diff));
+ fields[METRIC_AVG_VALUE_RESET]->set_notnull();
+ } else {
+ fields[METRIC_AVG_VALUE_RESET]->set_null();
+ }
+ } else {
+ fields[METRIC_AVG_VALUE_START]->set_null();
+ fields[METRIC_AVG_VALUE_RESET]->set_null();
+ }
+
+ if (MONITOR_IS_ON(count)) {
+ /* If the monitor is on, the stop time is set to NULL */
+ fields[METRIC_STOP_TIME]->set_null();
+
+ /* Display latest Monitor Reset Time only if Monitor
+ counter is on. */
+ if (MONITOR_FIELD(count, mon_reset_time)) {
+ OK(field_store_time_t(
+ fields[METRIC_RESET_TIME],
+ (time_t)MONITOR_FIELD(
+ count, mon_reset_time)));
+ fields[METRIC_RESET_TIME]->set_notnull();
+ } else {
+ fields[METRIC_RESET_TIME]->set_null();
+ }
+
+ OK(fields[METRIC_STATUS]->store(1, true));
+ } else {
+ if (MONITOR_FIELD(count, mon_stop_time)) {
+ OK(field_store_time_t(fields[METRIC_STOP_TIME],
+ (time_t)MONITOR_FIELD(count, mon_stop_time)));
+ fields[METRIC_STOP_TIME]->set_notnull();
+ } else {
+ fields[METRIC_STOP_TIME]->set_null();
+ }
+
+ fields[METRIC_RESET_TIME]->set_null();
+
+ OK(fields[METRIC_STATUS]->store(0, true));
+ }
+
+ uint metric_type;
+
+ if (monitor_info->monitor_type & MONITOR_DISPLAY_CURRENT) {
+ metric_type = 1; /* "value" */
+ } else if (monitor_info->monitor_type & MONITOR_EXISTING) {
+ metric_type = 2; /* "status_counter" */
+ } else if (monitor_info->monitor_type & MONITOR_SET_OWNER) {
+ metric_type = 3; /* "set_owner" */
+ } else if (monitor_info->monitor_type & MONITOR_SET_MEMBER) {
+ metric_type = 4; /* "set_member" */
+ } else {
+ metric_type = 5; /* "counter" */
+ }
+
+ OK(fields[METRIC_TYPE]->store(metric_type, true));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+ }
+
+ DBUG_RETURN(0);
+}
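+
+/* Example: most counters are disabled by default and are switched on
+via the innodb_monitor_enable system variable (the value is a counter
+or module name pattern; '%' matches all), after which the columns
+filled above carry data:
+
+  SET GLOBAL innodb_monitor_enable = '%';
+  SELECT name, subsystem, count, time_elapsed
+  FROM information_schema.innodb_metrics
+  WHERE enabled;
+*/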
+
+/*******************************************************************//**
+Function to fill information schema metrics tables.
+@return 0 on success */
+static
+int
+i_s_metrics_fill_table(
+/*===================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ DBUG_ENTER("i_s_metrics_fill_table");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ i_s_metrics_fill(thd, tables->table);
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_metrics
+@return 0 on success */
+static
+int
+innodb_metrics_init(
+/*================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_metrics_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_metrics_fields_info;
+ schema->fill_table = i_s_metrics_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_metrics =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_METRICS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB Metrics Info"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_metrics_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_ft_default_stopword */
+static ST_FIELD_INFO i_s_stopword_fields_info[] =
+{
+#define STOPWORD_VALUE 0
+ Column("value", Varchar(TRX_ID_MAX_LEN + 1), NOT_NULL),
+ CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_ft_default_stopword.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_stopword_fill(
+/*==============*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ Field** fields;
+ ulint i = 0;
+ TABLE* table = (TABLE*) tables->table;
+
+ DBUG_ENTER("i_s_stopword_fill");
+
+ fields = table->field;
+
+ /* Fill the table with the server's default stopword list,
+ stored in the array fts_default_stopword */
+ while (fts_default_stopword[i]) {
+ OK(field_store_string(fields[STOPWORD_VALUE],
+ fts_default_stopword[i]));
+
+ OK(schema_table_store_record(thd, table));
+ i++;
+ }
+
+ DBUG_RETURN(0);
+}
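+
+/* Example: unlike the other tables in this file, this one performs no
+check_global_access() call, so any user can list the built-in
+stopwords:
+
+  SELECT value FROM information_schema.innodb_ft_default_stopword;
+*/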
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_ft_default_stopword.
+@return 0 on success */
+static
+int
+i_s_stopword_init(
+/*==============*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_stopword_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::i_s_stopword_fields_info;
+ schema->fill_table = i_s_stopword_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_default_stopword =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_FT_DEFAULT_STOPWORD"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Default stopword list for InnoDB Full Text Search"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_stopword_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED */
+static ST_FIELD_INFO i_s_fts_doc_fields_info[] =
+{
+#define I_S_FTS_DOC_ID 0
+ Column("DOC_ID", ULonglong(), NOT_NULL),
+ CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED or
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_deleted_generic_fill(
+/*=========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ ibool being_deleted) /*!< in: BEING_DELETED table */
+{
+ Field** fields;
+ TABLE* table = (TABLE*) tables->table;
+ trx_t* trx;
+ fts_table_t fts_table;
+ fts_doc_ids_t* deleted;
+ dict_table_t* user_table;
+
+ DBUG_ENTER("i_s_fts_deleted_generic_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+ /* Prevent DROP of the internal tables for fulltext indexes.
+ FIXME: acquire DDL-blocking MDL on the user table name! */
+ rw_lock_s_lock(&dict_sys.latch);
+
+ user_table = dict_table_open_on_id(
+ innodb_ft_aux_table_id, FALSE, DICT_TABLE_OP_NORMAL);
+
+ if (!user_table) {
+ rw_lock_s_unlock(&dict_sys.latch);
+ DBUG_RETURN(0);
+ } else if (!dict_table_has_fts_index(user_table)
+ || !user_table->is_readable()) {
+ dict_table_close(user_table, FALSE, FALSE);
+ rw_lock_s_unlock(&dict_sys.latch);
+ DBUG_RETURN(0);
+ }
+
+ deleted = fts_doc_ids_create();
+
+ trx = trx_create();
+ trx->op_info = "Select for FTS DELETE TABLE";
+
+ FTS_INIT_FTS_TABLE(&fts_table,
+ (being_deleted) ? "BEING_DELETED" : "DELETED",
+ FTS_COMMON_TABLE, user_table);
+
+ fts_table_fetch_doc_ids(trx, &fts_table, deleted);
+
+ dict_table_close(user_table, FALSE, FALSE);
+
+ rw_lock_s_unlock(&dict_sys.latch);
+
+ trx->free();
+
+ fields = table->field;
+
+ int ret = 0;
+
+ for (ulint j = 0; j < ib_vector_size(deleted->doc_ids); ++j) {
+ doc_id_t doc_id;
+
+ doc_id = *(doc_id_t*) ib_vector_get_const(deleted->doc_ids, j);
+
+ BREAK_IF(ret = fields[I_S_FTS_DOC_ID]->store(doc_id, true));
+
+ BREAK_IF(ret = schema_table_store_record(thd, table));
+ }
+
+ fts_doc_ids_free(deleted);
+
+ DBUG_RETURN(ret);
+}
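+
+/* Example: the table is bound to the table named by the
+innodb_ft_aux_table system variable (innodb_ft_aux_table_id above);
+while it is unset the fill functions return an empty result.
+'test/articles' below is a placeholder for an existing table with a
+fulltext index:
+
+  SET GLOBAL innodb_ft_aux_table = 'test/articles';
+  SELECT doc_id FROM information_schema.innodb_ft_deleted;
+*/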
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_deleted_fill(
+/*=================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ DBUG_ENTER("i_s_fts_deleted_fill");
+
+ DBUG_RETURN(i_s_fts_deleted_generic_fill(thd, tables, FALSE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED
+@return 0 on success */
+static
+int
+i_s_fts_deleted_init(
+/*=================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_fts_deleted_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::i_s_fts_doc_fields_info;
+ schema->fill_table = i_s_fts_deleted_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_deleted =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_FT_DELETED"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "INNODB AUXILIARY FTS DELETED TABLE"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_fts_deleted_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_being_deleted_fill(
+/*=======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ DBUG_ENTER("i_s_fts_being_deleted_fill");
+
+ DBUG_RETURN(i_s_fts_deleted_generic_fill(thd, tables, TRUE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return 0 on success */
+static
+int
+i_s_fts_being_deleted_init(
+/*=======================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_fts_deleted_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::i_s_fts_doc_fields_info;
+ schema->fill_table = i_s_fts_being_deleted_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_being_deleted =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_FT_BEING_DELETED"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "INNODB AUXILIARY FTS BEING DELETED TABLE"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_fts_being_deleted_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+
+namespace Show {
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED and
+INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE */
+static ST_FIELD_INFO i_s_fts_index_fields_info[] =
+{
+#define I_S_FTS_WORD 0
+ Column("WORD", Varchar(FTS_MAX_WORD_LEN + 1), NOT_NULL),
+
+#define I_S_FTS_FIRST_DOC_ID 1
+ Column("FIRST_DOC_ID", ULonglong(), NOT_NULL),
+
+#define I_S_FTS_LAST_DOC_ID 2
+ Column("LAST_DOC_ID", ULonglong(), NOT_NULL),
+
+#define I_S_FTS_DOC_COUNT 3
+ Column("DOC_COUNT", ULonglong(), NOT_NULL),
+
+#define I_S_FTS_ILIST_DOC_ID 4
+ Column("DOC_ID", ULonglong(), NOT_NULL),
+
+#define I_S_FTS_ILIST_DOC_POS 5
+ Column("POSITION", ULonglong(), NOT_NULL),
+ CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Go through the Doc Node and its ilist, fill the dynamic table
+INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE for one FTS index on the table.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill_one_index(
+/*===============================*/
+ fts_index_cache_t* index_cache, /*!< in: FTS index cache */
+ THD* thd, /*!< in: thread */
+ fts_string_t* conv_str, /*!< in/out: buffer */
+ TABLE_LIST* tables) /*!< in/out: tables to fill */
+{
+ TABLE* table = (TABLE*) tables->table;
+ Field** fields;
+ CHARSET_INFO* index_charset;
+ const ib_rbt_node_t* rbt_node;
+ uint dummy_errors;
+ char* word_str;
+
+ DBUG_ENTER("i_s_fts_index_cache_fill_one_index");
+
+ fields = table->field;
+
+ index_charset = index_cache->charset;
+ conv_str->f_n_char = 0;
+
+ int ret = 0;
+
+ /* Go through each word in the index cache */
+ for (rbt_node = rbt_first(index_cache->words);
+ rbt_node;
+ rbt_node = rbt_next(index_cache->words, rbt_node)) {
+ fts_tokenizer_word_t* word;
+
+ word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+ /* Convert word from index charset to system_charset_info */
+ if (index_charset->cset != system_charset_info->cset) {
+ conv_str->f_n_char = my_convert(
+ reinterpret_cast<char*>(conv_str->f_str),
+ static_cast<uint32>(conv_str->f_len),
+ system_charset_info,
+ reinterpret_cast<char*>(word->text.f_str),
+ static_cast<uint32>(word->text.f_len),
+ index_charset, &dummy_errors);
+ ut_ad(conv_str->f_n_char <= conv_str->f_len);
+ conv_str->f_str[conv_str->f_n_char] = 0;
+ word_str = reinterpret_cast<char*>(conv_str->f_str);
+ } else {
+ word_str = reinterpret_cast<char*>(word->text.f_str);
+ }
+
+		/* Decode the ilist, and display Doc ID and word position */
+ for (ulint i = 0; i < ib_vector_size(word->nodes); i++) {
+ fts_node_t* node;
+ byte* ptr;
+ ulint decoded = 0;
+ doc_id_t doc_id = 0;
+
+ node = static_cast<fts_node_t*> (ib_vector_get(
+ word->nodes, i));
+
+ ptr = node->ilist;
+
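+			/* The ilist is a sequence of variable-length
+			encoded integers: each entry is a doc id delta
+			relative to the previous doc id, followed by a
+			list of word positions terminated by a 0 byte. */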
+ while (decoded < node->ilist_size) {
+ ulint pos = fts_decode_vlc(&ptr);
+
+ doc_id += pos;
+
+ /* Get position info */
+ while (*ptr) {
+ pos = fts_decode_vlc(&ptr);
+
+ OK(field_store_string(
+ fields[I_S_FTS_WORD],
+ word_str));
+
+ OK(fields[I_S_FTS_FIRST_DOC_ID]->store(
+ node->first_doc_id,
+ true));
+
+ OK(fields[I_S_FTS_LAST_DOC_ID]->store(
+ node->last_doc_id,
+ true));
+
+ OK(fields[I_S_FTS_DOC_COUNT]->store(
+ node->doc_count, true));
+
+ OK(fields[I_S_FTS_ILIST_DOC_ID]->store(
+ doc_id, true));
+
+ OK(fields[I_S_FTS_ILIST_DOC_POS]->store(
+ pos, true));
+
+ OK(schema_table_store_record(
+ thd, table));
+ }
+
+ ++ptr;
+
+ decoded = ptr - (byte*) node->ilist;
+ }
+ }
+ }
+
+ DBUG_RETURN(ret);
+}
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill(
+/*=====================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ dict_table_t* user_table;
+ fts_cache_t* cache;
+
+ DBUG_ENTER("i_s_fts_index_cache_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+ /* Prevent DROP of the internal tables for fulltext indexes.
+ FIXME: acquire DDL-blocking MDL on the user table name! */
+ rw_lock_s_lock(&dict_sys.latch);
+
+ user_table = dict_table_open_on_id(
+ innodb_ft_aux_table_id, FALSE, DICT_TABLE_OP_NORMAL);
+
+ if (!user_table) {
+no_fts:
+ rw_lock_s_unlock(&dict_sys.latch);
+ DBUG_RETURN(0);
+ }
+
+ if (!user_table->fts || !user_table->fts->cache) {
+ dict_table_close(user_table, FALSE, FALSE);
+ goto no_fts;
+ }
+
+ cache = user_table->fts->cache;
+
+ int ret = 0;
+ fts_string_t conv_str;
+ byte word[HA_FT_MAXBYTELEN + 1];
+ conv_str.f_len = sizeof word;
+ conv_str.f_str = word;
+
+ rw_lock_s_lock(&cache->lock);
+
+ for (ulint i = 0; i < ib_vector_size(cache->indexes); i++) {
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*> (
+ ib_vector_get(cache->indexes, i));
+
+ BREAK_IF(ret = i_s_fts_index_cache_fill_one_index(
+ index_cache, thd, &conv_str, tables));
+ }
+
+ rw_lock_s_unlock(&cache->lock);
+ dict_table_close(user_table, FALSE, FALSE);
+ rw_lock_s_unlock(&dict_sys.latch);
+
+ DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE
+@return 0 on success */
+static
+int
+i_s_fts_index_cache_init(
+/*=====================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_fts_index_cache_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::i_s_fts_index_fields_info;
+ schema->fill_table = i_s_fts_index_cache_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_index_cache =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_FT_INDEX_CACHE"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "INNODB AUXILIARY FTS INDEX CACHED"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_fts_index_cache_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/*******************************************************************//**
+Go through an FTS index auxiliary table, fetch its rows, and fill
+the FTS word cache structure.
+@return DB_SUCCESS on success, otherwise error code */
+static
+dberr_t
+i_s_fts_index_table_fill_selected(
+/*==============================*/
+ dict_index_t* index, /*!< in: FTS index */
+ ib_vector_t* words, /*!< in/out: vector to hold
+ fetched words */
+ ulint selected, /*!< in: selected FTS index */
+ fts_string_t* word) /*!< in: word to select */
+{
+ pars_info_t* info;
+ fts_table_t fts_table;
+ trx_t* trx;
+ que_t* graph;
+ dberr_t error;
+ fts_fetch_t fetch;
+ char table_name[MAX_FULL_NAME_LEN];
+
+ info = pars_info_create();
+
+ fetch.read_arg = words;
+ fetch.read_record = fts_optimize_index_fetch_node;
+ fetch.total_memory = 0;
+
+ DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
+ fts_result_cache_limit = 8192;
+ );
+
+ trx = trx_create();
+
+ trx->op_info = "fetching FTS index nodes";
+
+ pars_info_bind_function(info, "my_func", fetch.read_record, &fetch);
+ pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+ FTS_INIT_INDEX_TABLE(&fts_table, fts_get_suffix(selected),
+ FTS_INDEX_TABLE, index);
+ fts_get_table_name(&fts_table, table_name);
+ pars_info_bind_id(info, true, "table_name", table_name);
+
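+	/* Fetch all nodes from the selected aux index table whose word
+	is >= :word; each fetched row is passed to my_func(), that is,
+	fts_optimize_index_fetch_node(), which appends it to 'words'. */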
+ graph = fts_parse_sql(
+ &fts_table, info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT word, doc_count, first_doc_id, last_doc_id,"
+ " ilist\n"
+ " FROM $table_name WHERE word >= :word;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
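+	/* Execute the fetch, retrying as long as we only hit lock wait
+	timeouts; any other error aborts the loop. */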
+ for (;;) {
+ error = fts_eval_sql(trx, graph);
+
+ if (UNIV_LIKELY(error == DB_SUCCESS)) {
+ fts_sql_commit(trx);
+
+ break;
+ } else {
+ fts_sql_rollback(trx);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ ib::warn() << "Lock wait timeout reading"
+ " FTS index. Retrying!";
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ ib::error() << "Error occurred while reading"
+ " FTS index: " << error;
+ break;
+ }
+ }
+ }
+
+ mutex_enter(&dict_sys.mutex);
+ que_graph_free(graph);
+ mutex_exit(&dict_sys.mutex);
+
+ trx->free();
+
+ if (fetch.total_memory >= fts_result_cache_limit) {
+ error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+ }
+
+ return(error);
+}
+
+/*******************************************************************//**
+Free words. */
+static
+void
+i_s_fts_index_table_free_one_fetch(
+/*===============================*/
+ ib_vector_t* words) /*!< in: words fetched */
+{
+ for (ulint i = 0; i < ib_vector_size(words); i++) {
+ fts_word_t* word;
+
+ word = static_cast<fts_word_t*>(ib_vector_get(words, i));
+
+ for (ulint j = 0; j < ib_vector_size(word->nodes); j++) {
+ fts_node_t* node;
+
+ node = static_cast<fts_node_t*> (ib_vector_get(
+ word->nodes, j));
+ ut_free(node->ilist);
+ }
+
+ fts_word_free(word);
+ }
+
+ ib_vector_reset(words);
+}
+
+/*******************************************************************//**
+Go through words, fill INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_table_fill_one_fetch(
+/*===============================*/
+ CHARSET_INFO* index_charset, /*!< in: FTS index charset */
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ ib_vector_t* words, /*!< in: words fetched */
+ fts_string_t* conv_str, /*!< in: string for conversion*/
+ bool has_more) /*!< in: has more to fetch */
+{
+ TABLE* table = (TABLE*) tables->table;
+ Field** fields;
+ uint dummy_errors;
+ char* word_str;
+ ulint words_size;
+ int ret = 0;
+
+ DBUG_ENTER("i_s_fts_index_table_fill_one_fetch");
+
+ fields = table->field;
+
+ words_size = ib_vector_size(words);
+ if (has_more) {
+ /* the last word is not fetched completely. */
+ ut_ad(words_size > 1);
+ words_size -= 1;
+ }
+
+	/* Go through each word fetched from the index table */
+ for (ulint i = 0; i < words_size; i++) {
+ fts_word_t* word;
+
+ word = static_cast<fts_word_t*>(ib_vector_get(words, i));
+
+ word->text.f_str[word->text.f_len] = 0;
+
+ /* Convert word from index charset to system_charset_info */
+ if (index_charset->cset != system_charset_info->cset) {
+ conv_str->f_n_char = my_convert(
+ reinterpret_cast<char*>(conv_str->f_str),
+ static_cast<uint32>(conv_str->f_len),
+ system_charset_info,
+ reinterpret_cast<char*>(word->text.f_str),
+ static_cast<uint32>(word->text.f_len),
+ index_charset, &dummy_errors);
+ ut_ad(conv_str->f_n_char <= conv_str->f_len);
+ conv_str->f_str[conv_str->f_n_char] = 0;
+ word_str = reinterpret_cast<char*>(conv_str->f_str);
+ } else {
+ word_str = reinterpret_cast<char*>(word->text.f_str);
+ }
+
+		/* Decode the ilist, and display Doc ID and word position */
+ for (ulint i = 0; i < ib_vector_size(word->nodes); i++) {
+ fts_node_t* node;
+ byte* ptr;
+ ulint decoded = 0;
+ doc_id_t doc_id = 0;
+
+ node = static_cast<fts_node_t*> (ib_vector_get(
+ word->nodes, i));
+
+ ptr = node->ilist;
+
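+			/* Same delta-encoded ilist format as in
+			i_s_fts_index_cache_fill_one_index() above. */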
+ while (decoded < node->ilist_size) {
+ ulint pos = fts_decode_vlc(&ptr);
+
+ doc_id += pos;
+
+ /* Get position info */
+ while (*ptr) {
+ pos = fts_decode_vlc(&ptr);
+
+ OK(field_store_string(
+ fields[I_S_FTS_WORD],
+ word_str));
+
+ OK(fields[I_S_FTS_FIRST_DOC_ID]->store(
+ longlong(node->first_doc_id), true));
+
+ OK(fields[I_S_FTS_LAST_DOC_ID]->store(
+ longlong(node->last_doc_id), true));
+
+ OK(fields[I_S_FTS_DOC_COUNT]->store(
+ node->doc_count, true));
+
+ OK(fields[I_S_FTS_ILIST_DOC_ID]->store(
+ longlong(doc_id), true));
+
+ OK(fields[I_S_FTS_ILIST_DOC_POS]->store(
+ pos, true));
+
+ OK(schema_table_store_record(
+ thd, table));
+ }
+
+ ++ptr;
+
+ decoded = ptr - (byte*) node->ilist;
+ }
+ }
+ }
+
+ DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+Go through an FTS index and its auxiliary tables, fetch rows in each table
+and fill INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_table_fill_one_index(
+/*===============================*/
+ dict_index_t* index, /*!< in: FTS index */
+ THD* thd, /*!< in: thread */
+ fts_string_t* conv_str, /*!< in/out: buffer */
+ TABLE_LIST* tables) /*!< in/out: tables to fill */
+{
+ ib_vector_t* words;
+ mem_heap_t* heap;
+ CHARSET_INFO* index_charset;
+ dberr_t error;
+ int ret = 0;
+
+ DBUG_ENTER("i_s_fts_index_table_fill_one_index");
+ DBUG_ASSERT(!dict_index_is_online_ddl(index));
+
+ heap = mem_heap_create(1024);
+
+ words = ib_vector_create(ib_heap_allocator_create(heap),
+ sizeof(fts_word_t), 256);
+
+ index_charset = fts_index_get_charset(index);
+
+ /* Iterate through each auxiliary table as described in
+ fts_index_selector */
+ for (ulint selected = 0; selected < FTS_NUM_AUX_INDEX; selected++) {
+ fts_string_t word;
+ bool has_more = false;
+
+ word.f_str = NULL;
+ word.f_len = 0;
+ word.f_n_char = 0;
+
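+		/* Fetch in batches: if the result cache limit is hit,
+		the last word of the batch may be incomplete, so it is
+		excluded from the output and re-fetched as the starting
+		point of the next batch. */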
+ do {
+ /* Fetch from index */
+ error = i_s_fts_index_table_fill_selected(
+ index, words, selected, &word);
+
+ if (error == DB_SUCCESS) {
+ has_more = false;
+ } else if (error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT) {
+ has_more = true;
+ } else {
+ i_s_fts_index_table_free_one_fetch(words);
+ ret = 1;
+ goto func_exit;
+ }
+
+ if (has_more) {
+ fts_word_t* last_word;
+
+ /* Prepare start point for next fetch */
+ last_word = static_cast<fts_word_t*>(ib_vector_last(words));
+ ut_ad(last_word != NULL);
+ fts_string_dup(&word, &last_word->text, heap);
+ }
+
+ /* Fill into tables */
+ ret = i_s_fts_index_table_fill_one_fetch(
+ index_charset, thd, tables, words, conv_str,
+ has_more);
+ i_s_fts_index_table_free_one_fetch(words);
+
+ if (ret != 0) {
+ goto func_exit;
+ }
+ } while (has_more);
+ }
+
+func_exit:
+ mem_heap_free(heap);
+
+ DBUG_RETURN(ret);
+}
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_table_fill(
+/*=====================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ dict_table_t* user_table;
+ dict_index_t* index;
+
+ DBUG_ENTER("i_s_fts_index_table_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+ /* Prevent DROP of the internal tables for fulltext indexes.
+ FIXME: acquire DDL-blocking MDL on the user table name! */
+ rw_lock_s_lock(&dict_sys.latch);
+
+ user_table = dict_table_open_on_id(
+ innodb_ft_aux_table_id, FALSE, DICT_TABLE_OP_NORMAL);
+
+ if (!user_table) {
+ rw_lock_s_unlock(&dict_sys.latch);
+ DBUG_RETURN(0);
+ }
+
+ int ret = 0;
+ fts_string_t conv_str;
+ conv_str.f_len = system_charset_info->mbmaxlen
+ * FTS_MAX_WORD_LEN_IN_CHAR;
+ conv_str.f_str = static_cast<byte*>(ut_malloc_nokey(conv_str.f_len));
+
+ for (index = dict_table_get_first_index(user_table);
+ index; index = dict_table_get_next_index(index)) {
+ if (index->type & DICT_FTS) {
+ BREAK_IF(ret = i_s_fts_index_table_fill_one_index(
+ index, thd, &conv_str, tables));
+ }
+ }
+
+ dict_table_close(user_table, FALSE, FALSE);
+
+ rw_lock_s_unlock(&dict_sys.latch);
+
+ ut_free(conv_str.f_str);
+
+ DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE
+@return 0 on success */
+static
+int
+i_s_fts_index_table_init(
+/*=====================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_fts_index_table_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::i_s_fts_index_fields_info;
+ schema->fill_table = i_s_fts_index_table_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_index_table =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_FT_INDEX_TABLE"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "INNODB AUXILIARY FTS INDEX TABLE"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_fts_index_table_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+
+namespace Show {
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG */
+static ST_FIELD_INFO i_s_fts_config_fields_info[] =
+{
+#define FTS_CONFIG_KEY 0
+ Column("KEY", Varchar(NAME_LEN + 1), NOT_NULL),
+
+#define FTS_CONFIG_VALUE 1
+ Column("VALUE", Varchar(NAME_LEN + 1), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+static const char* fts_config_key[] = {
+ FTS_OPTIMIZE_LIMIT_IN_SECS,
+ FTS_SYNCED_DOC_ID,
+ FTS_STOPWORD_TABLE_NAME,
+ FTS_USE_STOPWORD,
+ NULL
+};
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_config_fill(
+/*================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ Field** fields;
+ TABLE* table = (TABLE*) tables->table;
+ trx_t* trx;
+ fts_table_t fts_table;
+ dict_table_t* user_table;
+ ulint i = 0;
+ dict_index_t* index = NULL;
+ unsigned char str[FTS_MAX_CONFIG_VALUE_LEN + 1];
+
+ DBUG_ENTER("i_s_fts_config_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+ /* Prevent DROP of the internal tables for fulltext indexes.
+ FIXME: acquire DDL-blocking MDL on the user table name! */
+ rw_lock_s_lock(&dict_sys.latch);
+
+ user_table = dict_table_open_on_id(
+ innodb_ft_aux_table_id, FALSE, DICT_TABLE_OP_NORMAL);
+
+ if (!user_table) {
+no_fts:
+ rw_lock_s_unlock(&dict_sys.latch);
+ DBUG_RETURN(0);
+ }
+
+ if (!dict_table_has_fts_index(user_table)) {
+ dict_table_close(user_table, FALSE, FALSE);
+ goto no_fts;
+ }
+
+ fields = table->field;
+
+ trx = trx_create();
+ trx->op_info = "Select for FTS CONFIG TABLE";
+
+ FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, user_table);
+
+ if (!ib_vector_is_empty(user_table->fts->indexes)) {
+ index = (dict_index_t*) ib_vector_getp_const(
+ user_table->fts->indexes, 0);
+ DBUG_ASSERT(!dict_index_is_online_ddl(index));
+ }
+
+ int ret = 0;
+
+ while (fts_config_key[i]) {
+ fts_string_t value;
+ char* key_name;
+ ulint allocated = FALSE;
+
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+
+ value.f_str = str;
+
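+		/* Note: FTS_TOTAL_WORD_COUNT is not among the keys in
+		fts_config_key[], so this branch appears to be
+		unreachable with the current key list. */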
+ if (index
+ && strcmp(fts_config_key[i], FTS_TOTAL_WORD_COUNT) == 0) {
+ key_name = fts_config_create_index_param_name(
+ fts_config_key[i], index);
+ allocated = TRUE;
+ } else {
+ key_name = (char*) fts_config_key[i];
+ }
+
+ fts_config_get_value(trx, &fts_table, key_name, &value);
+
+ if (allocated) {
+ ut_free(key_name);
+ }
+
+ BREAK_IF(ret = field_store_string(
+ fields[FTS_CONFIG_KEY], fts_config_key[i]));
+
+ BREAK_IF(ret = field_store_string(
+ fields[FTS_CONFIG_VALUE],
+ reinterpret_cast<const char*>(value.f_str)));
+
+ BREAK_IF(ret = schema_table_store_record(thd, table));
+
+ i++;
+ }
+
+ fts_sql_commit(trx);
+
+ dict_table_close(user_table, FALSE, FALSE);
+
+ rw_lock_s_unlock(&dict_sys.latch);
+
+ trx->free();
+
+ DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG
+@return 0 on success */
+static
+int
+i_s_fts_config_init(
+/*=================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_fts_config_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::i_s_fts_config_fields_info;
+ schema->fill_table = i_s_fts_config_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_config =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_FT_CONFIG"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "INNODB AUXILIARY FTS CONFIG TABLE"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_fts_config_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+/* Fields of the dynamic table INNODB_BUFFER_POOL_STATS. */
+static ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[] =
+{
+#define IDX_BUF_STATS_POOL_ID 0
+ Column("POOL_ID", ULong(), NOT_NULL),
+
+#define IDX_BUF_STATS_POOL_SIZE 1
+ Column("POOL_SIZE", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_FREE_BUFFERS 2
+ Column("FREE_BUFFERS", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_LRU_LEN 3
+ Column("DATABASE_PAGES", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_OLD_LRU_LEN 4
+ Column("OLD_DATABASE_PAGES", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_FLUSH_LIST_LEN 5
+ Column("MODIFIED_DATABASE_PAGES", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_PENDING_ZIP 6
+ Column("PENDING_DECOMPRESS", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_PENDING_READ	7
+  Column("PENDING_READS", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_FLUSH_LRU		8
+  Column("PENDING_FLUSH_LRU", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_FLUSH_LIST 9
+ Column("PENDING_FLUSH_LIST", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_PAGE_YOUNG	10
+  Column("PAGES_MADE_YOUNG", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_PAGE_NOT_YOUNG	11
+  Column("PAGES_NOT_MADE_YOUNG", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_PAGE_YOUNG_RATE 12
+ Column("PAGES_MADE_YOUNG_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
+#define IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE 13
+ Column("PAGES_MADE_NOT_YOUNG_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
+#define IDX_BUF_STATS_PAGE_READ		14
+  Column("NUMBER_PAGES_READ", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_PAGE_CREATED	15
+  Column("NUMBER_PAGES_CREATED", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_PAGE_WRITTEN	16
+  Column("NUMBER_PAGES_WRITTEN", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_PAGE_READ_RATE 17
+ Column("PAGES_READ_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
+#define IDX_BUF_STATS_PAGE_CREATE_RATE 18
+ Column("PAGES_CREATE_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
+#define IDX_BUF_STATS_PAGE_WRITTEN_RATE 19
+  Column("PAGES_WRITTEN_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
+#define IDX_BUF_STATS_GET 20
+ Column("NUMBER_PAGES_GET", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_HIT_RATE 21
+ Column("HIT_RATE", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_MADE_YOUNG_PCT 22
+ Column("YOUNG_MAKE_PER_THOUSAND_GETS", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_NOT_MADE_YOUNG_PCT 23
+ Column("NOT_YOUNG_MAKE_PER_THOUSAND_GETS", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_READ_AHEAD 24
+ Column("NUMBER_PAGES_READ_AHEAD", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_READ_AHEAD_EVICTED 25
+ Column("NUMBER_READ_AHEAD_EVICTED", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_READ_AHEAD_RATE 26
+ Column("READ_AHEAD_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
+#define IDX_BUF_STATS_READ_AHEAD_EVICT_RATE 27
+  Column("READ_AHEAD_EVICTED_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL),
+
+#define IDX_BUF_STATS_LRU_IO_SUM 28
+ Column("LRU_IO_TOTAL", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_LRU_IO_CUR 29
+ Column("LRU_IO_CURRENT", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_UNZIP_SUM 30
+  Column("UNCOMPRESS_TOTAL", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_STATS_UNZIP_CUR 31
+ Column("UNCOMPRESS_CURRENT", ULonglong(), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/** Fill INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS
+@param[in,out] thd connection
+@param[in,out] tables tables to fill
+@return 0 on success, 1 on failure */
+static int i_s_innodb_stats_fill(THD *thd, TABLE_LIST * tables, Item *)
+{
+ TABLE* table;
+ Field** fields;
+ buf_pool_info_t info;
+
+ DBUG_ENTER("i_s_innodb_stats_fill");
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+ /* Only allow the PROCESS privilege holder to access the stats */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ buf_stats_get_pool_info(&info);
+
+ table = tables->table;
+
+ fields = table->field;
+
+ OK(fields[IDX_BUF_STATS_POOL_ID]->store(0, true));
+
+ OK(fields[IDX_BUF_STATS_POOL_SIZE]->store(info.pool_size, true));
+
+ OK(fields[IDX_BUF_STATS_LRU_LEN]->store(info.lru_len, true));
+
+ OK(fields[IDX_BUF_STATS_OLD_LRU_LEN]->store(info.old_lru_len, true));
+
+ OK(fields[IDX_BUF_STATS_FREE_BUFFERS]->store(
+ info.free_list_len, true));
+
+ OK(fields[IDX_BUF_STATS_FLUSH_LIST_LEN]->store(
+ info.flush_list_len, true));
+
+ OK(fields[IDX_BUF_STATS_PENDING_ZIP]->store(info.n_pend_unzip, true));
+
+ OK(fields[IDX_BUF_STATS_PENDING_READ]->store(info.n_pend_reads, true));
+
+ OK(fields[IDX_BUF_STATS_FLUSH_LRU]->store(
+ info.n_pending_flush_lru, true));
+
+ OK(fields[IDX_BUF_STATS_FLUSH_LIST]->store(
+ info.n_pending_flush_list, true));
+
+ OK(fields[IDX_BUF_STATS_PAGE_YOUNG]->store(
+ info.n_pages_made_young, true));
+
+ OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG]->store(
+ info.n_pages_not_made_young, true));
+
+ OK(fields[IDX_BUF_STATS_PAGE_YOUNG_RATE]->store(
+ info.page_made_young_rate));
+
+ OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE]->store(
+ info.page_not_made_young_rate));
+
+ OK(fields[IDX_BUF_STATS_PAGE_READ]->store(info.n_pages_read, true));
+
+ OK(fields[IDX_BUF_STATS_PAGE_CREATED]->store(
+ info.n_pages_created, true));
+
+ OK(fields[IDX_BUF_STATS_PAGE_WRITTEN]->store(
+ info.n_pages_written, true));
+
+ OK(fields[IDX_BUF_STATS_GET]->store(info.n_page_gets, true));
+
+ OK(fields[IDX_BUF_STATS_PAGE_READ_RATE]->store(
+ info.pages_read_rate));
+
+ OK(fields[IDX_BUF_STATS_PAGE_CREATE_RATE]->store(
+ info.pages_created_rate));
+
+ OK(fields[IDX_BUF_STATS_PAGE_WRITTEN_RATE]->store(
+ info.pages_written_rate));
+
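+	/* HIT_RATE and the young-making statistics are expressed per
+	thousand page gets since the last poll, not as percentages. */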
+ if (info.n_page_get_delta) {
+ if (info.page_read_delta <= info.n_page_get_delta) {
+ OK(fields[IDX_BUF_STATS_HIT_RATE]->store(
+ static_cast<double>(
+ 1000 - (1000 * info.page_read_delta
+ / info.n_page_get_delta))));
+ } else {
+ OK(fields[IDX_BUF_STATS_HIT_RATE]->store(0));
+ }
+
+ OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store(
+ 1000 * info.young_making_delta
+ / info.n_page_get_delta, true));
+
+ OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store(
+ 1000 * info.not_young_making_delta
+ / info.n_page_get_delta, true));
+ } else {
+ OK(fields[IDX_BUF_STATS_HIT_RATE]->store(0, true));
+ OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store(0, true));
+ OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store(0, true));
+ }
+
+ OK(fields[IDX_BUF_STATS_READ_AHEAD]->store(
+ info.n_ra_pages_read, true));
+
+ OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICTED]->store(
+ info.n_ra_pages_evicted, true));
+
+ OK(fields[IDX_BUF_STATS_READ_AHEAD_RATE]->store(
+ info.pages_readahead_rate));
+
+ OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICT_RATE]->store(
+ info.pages_evicted_rate));
+
+ OK(fields[IDX_BUF_STATS_LRU_IO_SUM]->store(info.io_sum, true));
+
+ OK(fields[IDX_BUF_STATS_LRU_IO_CUR]->store(info.io_cur, true));
+
+ OK(fields[IDX_BUF_STATS_UNZIP_SUM]->store(info.unzip_sum, true));
+
+ OK(fields[IDX_BUF_STATS_UNZIP_CUR]->store(info.unzip_cur, true));
+
+ DBUG_RETURN(schema_table_store_record(thd, table));
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_pool_stats_init(
+/*==============================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("i_s_innodb_buffer_pool_stats_init");
+
+ schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p);
+
+ schema->fields_info = Show::i_s_innodb_buffer_stats_fields_info;
+ schema->fill_table = i_s_innodb_stats_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_stats =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_BUFFER_POOL_STATS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+	STRUCT_FLD(descr, "InnoDB Buffer Pool Statistics Information"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_buffer_pool_stats_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/** These must correspond to the first values of buf_page_state */
+static const LEX_CSTRING page_state_values[] =
+{
+ { STRING_WITH_LEN("NOT_USED") },
+ { STRING_WITH_LEN("MEMORY") },
+ { STRING_WITH_LEN("REMOVE_HASH") },
+ { STRING_WITH_LEN("FILE_PAGE") },
+};
+
+static const TypelibBuffer<4> page_state_values_typelib(page_state_values);
+
+static const LEX_CSTRING io_values[] =
+{
+ { STRING_WITH_LEN("IO_NONE") },
+ { STRING_WITH_LEN("IO_READ") },
+ { STRING_WITH_LEN("IO_WRITE") },
+ { STRING_WITH_LEN("IO_PIN") }
+};
+
+
+static const TypelibBuffer<4> io_values_typelib(io_values);
+
+namespace Show {
+/* Fields of the dynamic table INNODB_BUFFER_POOL_PAGE. */
+static ST_FIELD_INFO i_s_innodb_buffer_page_fields_info[] =
+{
+#define IDX_BUFFER_POOL_ID 0
+ Column("POOL_ID", ULong(), NOT_NULL),
+
+#define IDX_BUFFER_BLOCK_ID 1
+ Column("BLOCK_ID", ULonglong(), NOT_NULL),
+
+#define IDX_BUFFER_PAGE_SPACE 2
+ Column("SPACE", ULong(), NOT_NULL),
+
+#define IDX_BUFFER_PAGE_NUM 3
+ Column("PAGE_NUMBER", ULong(), NOT_NULL),
+
+#define IDX_BUFFER_PAGE_TYPE 4
+ Column("PAGE_TYPE", Varchar(64), NULLABLE),
+
+#define IDX_BUFFER_PAGE_FLUSH_TYPE 5
+ Column("FLUSH_TYPE", ULong(), NOT_NULL),
+
+#define IDX_BUFFER_PAGE_FIX_COUNT 6
+ Column("FIX_COUNT", ULong(), NOT_NULL),
+
+#ifdef BTR_CUR_HASH_ADAPT
+#define IDX_BUFFER_PAGE_HASHED 7
+ Column("IS_HASHED", SLong(1), NOT_NULL),
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#define IDX_BUFFER_PAGE_NEWEST_MOD 7 + I_S_AHI
+ Column("NEWEST_MODIFICATION", ULonglong(), NOT_NULL),
+
+#define IDX_BUFFER_PAGE_OLDEST_MOD 8 + I_S_AHI
+ Column("OLDEST_MODIFICATION", ULonglong(), NOT_NULL),
+
+#define IDX_BUFFER_PAGE_ACCESS_TIME 9 + I_S_AHI
+ Column("ACCESS_TIME", ULonglong(), NOT_NULL),
+
+#define IDX_BUFFER_PAGE_TABLE_NAME 10 + I_S_AHI
+ Column("TABLE_NAME", Varchar(1024), NULLABLE),
+
+#define IDX_BUFFER_PAGE_INDEX_NAME 11 + I_S_AHI
+ Column("INDEX_NAME", Varchar(NAME_CHAR_LEN), NULLABLE),
+
+#define IDX_BUFFER_PAGE_NUM_RECS 12 + I_S_AHI
+ Column("NUMBER_RECORDS", ULonglong(), NOT_NULL),
+
+#define IDX_BUFFER_PAGE_DATA_SIZE 13 + I_S_AHI
+ Column("DATA_SIZE", ULonglong(), NOT_NULL),
+
+#define IDX_BUFFER_PAGE_ZIP_SIZE 14 + I_S_AHI
+ Column("COMPRESSED_SIZE", ULonglong(), NOT_NULL),
+
+#define IDX_BUFFER_PAGE_STATE 15 + I_S_AHI
+ Column("PAGE_STATE", Enum(&page_state_values_typelib), NOT_NULL, DEFAULT_NONE),
+
+#define IDX_BUFFER_PAGE_IO_FIX 16 + I_S_AHI
+ Column("IO_FIX", Enum(&io_values_typelib), NOT_NULL, DEFAULT_NONE),
+
+#define IDX_BUFFER_PAGE_IS_OLD 17 + I_S_AHI
+ Column("IS_OLD", SLong(1), NOT_NULL),
+
+#define IDX_BUFFER_PAGE_FREE_CLOCK 18 + I_S_AHI
+ Column("FREE_PAGE_CLOCK", ULonglong(), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Fill Information Schema table INNODB_BUFFER_PAGE with information
+cached in the buf_page_info_t array
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_page_fill(
+/*========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ const buf_page_info_t* info_array, /*!< in: array cached page
+ info */
+ ulint num_page) /*!< in: number of page info
+ cached */
+{
+ TABLE* table;
+ Field** fields;
+
+ compile_time_assert(I_S_PAGE_TYPE_LAST < 1 << I_S_PAGE_TYPE_BITS);
+
+ DBUG_ENTER("i_s_innodb_buffer_page_fill");
+
+ table = tables->table;
+
+ fields = table->field;
+
+ /* Iterate through the cached array and fill the I_S table rows */
+ for (ulint i = 0; i < num_page; i++) {
+ const buf_page_info_t* page_info;
+ char table_name[MAX_FULL_NAME_LEN + 1];
+ const char* table_name_end = NULL;
+
+ page_info = info_array + i;
+
+ OK(fields[IDX_BUFFER_POOL_ID]->store(0, true));
+
+ OK(fields[IDX_BUFFER_BLOCK_ID]->store(
+ page_info->block_id, true));
+
+ OK(fields[IDX_BUFFER_PAGE_SPACE]->store(
+ page_info->id.space(), true));
+
+ OK(fields[IDX_BUFFER_PAGE_NUM]->store(
+ page_info->id.page_no(), true));
+
+ OK(field_store_string(
+ fields[IDX_BUFFER_PAGE_TYPE],
+ i_s_page_type[page_info->page_type].type_str));
+
+ OK(fields[IDX_BUFFER_PAGE_FLUSH_TYPE]->store(0, true));
+
+ OK(fields[IDX_BUFFER_PAGE_FIX_COUNT]->store(
+ page_info->fix_count, true));
+
+#ifdef BTR_CUR_HASH_ADAPT
+ OK(fields[IDX_BUFFER_PAGE_HASHED]->store(
+ page_info->hashed, true));
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ OK(fields[IDX_BUFFER_PAGE_NEWEST_MOD]->store(
+ page_info->newest_mod, true));
+
+ OK(fields[IDX_BUFFER_PAGE_OLDEST_MOD]->store(
+ page_info->oldest_mod, true));
+
+ OK(fields[IDX_BUFFER_PAGE_ACCESS_TIME]->store(
+ page_info->access_time, true));
+
+ fields[IDX_BUFFER_PAGE_TABLE_NAME]->set_null();
+
+ fields[IDX_BUFFER_PAGE_INDEX_NAME]->set_null();
+
+ /* If this is an index page, fetch the index name
+ and table name */
+ if (page_info->page_type == I_S_PAGE_TYPE_INDEX) {
+ bool ret = false;
+
+ mutex_enter(&dict_sys.mutex);
+
+ const dict_index_t* index =
+ dict_index_get_if_in_cache_low(
+ page_info->index_id);
+
+ if (index) {
+ table_name_end = innobase_convert_name(
+ table_name, sizeof(table_name),
+ index->table->name.m_name,
+ strlen(index->table->name.m_name),
+ thd);
+
+ ret = fields[IDX_BUFFER_PAGE_TABLE_NAME]
+ ->store(table_name,
+ static_cast<uint>(
+ table_name_end
+ - table_name),
+ system_charset_info)
+ || fields[IDX_BUFFER_PAGE_INDEX_NAME]
+ ->store(index->name,
+ uint(strlen(index->name)),
+ system_charset_info);
+ }
+
+ mutex_exit(&dict_sys.mutex);
+
+ OK(ret);
+
+ if (index) {
+ fields[IDX_BUFFER_PAGE_TABLE_NAME]
+ ->set_notnull();
+ fields[IDX_BUFFER_PAGE_INDEX_NAME]
+ ->set_notnull();
+ }
+ }
+
+ OK(fields[IDX_BUFFER_PAGE_NUM_RECS]->store(
+ page_info->num_recs, true));
+
+ OK(fields[IDX_BUFFER_PAGE_DATA_SIZE]->store(
+ page_info->data_size, true));
+
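+		/* zip_ssize is the compressed page size shift; 0 means
+		the page is not compressed. UNIV_ZIP_SIZE_MIN >> 1 is
+		512, so the size works out to 512 << zip_ssize. */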
+ OK(fields[IDX_BUFFER_PAGE_ZIP_SIZE]->store(
+ page_info->zip_ssize
+ ? (UNIV_ZIP_SIZE_MIN >> 1) << page_info->zip_ssize
+ : 0, true));
+
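+		/* ENUM columns are stored 1-based; clamp the state to
+		BUF_BLOCK_FILE_PAGE so that it stays within
+		page_state_values[]. The same 1-based adjustment
+		applies to IO_FIX below. */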
+ OK(fields[IDX_BUFFER_PAGE_STATE]->store(
+ 1 + std::min<unsigned>(page_info->page_state,
+ BUF_BLOCK_FILE_PAGE), true));
+
+ OK(fields[IDX_BUFFER_PAGE_IO_FIX]->store(
+ 1 + page_info->io_fix, true));
+
+ OK(fields[IDX_BUFFER_PAGE_IS_OLD]->store(
+ page_info->is_old, true));
+
+ OK(fields[IDX_BUFFER_PAGE_FREE_CLOCK]->store(
+ page_info->freed_page_clock, true));
+
+ OK(schema_table_store_record(thd, table));
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Set the appropriate page type in a buf_page_info_t structure */
+static
+void
+i_s_innodb_set_page_type(
+/*=====================*/
+ buf_page_info_t*page_info, /*!< in/out: structure to fill with
+ scanned info */
+ const byte* frame) /*!< in: buffer frame */
+{
+ uint16_t page_type = fil_page_get_type(frame);
+
+ if (fil_page_type_is_index(page_type)) {
+ const page_t* page = (const page_t*) frame;
+
+ page_info->index_id = btr_page_get_index_id(page);
+
+		/* FIL_PAGE_INDEX and FIL_PAGE_RTREE are a bit special:
+		their values are defined as 17855 and 17854, so they
+		cannot be used to index into the i_s_page_type[] array
+		directly. Instead, such pages are reported as
+		I_S_PAGE_TYPE_RTREE for R-tree pages, I_S_PAGE_TYPE_IBUF
+		for change buffer index pages, or I_S_PAGE_TYPE_INDEX (1)
+		for all other index pages */
+ if (page_type == FIL_PAGE_RTREE) {
+ page_info->page_type = I_S_PAGE_TYPE_RTREE;
+ } else if (page_info->index_id
+ == static_cast<index_id_t>(DICT_IBUF_ID_MIN
+ + IBUF_SPACE_ID)) {
+ page_info->page_type = I_S_PAGE_TYPE_IBUF;
+ } else {
+ ut_ad(page_type == FIL_PAGE_INDEX
+ || page_type == FIL_PAGE_TYPE_INSTANT);
+ page_info->page_type = I_S_PAGE_TYPE_INDEX;
+ }
+
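+		/* Data size: bytes from the end of the supremum record
+		up to the heap top, minus the garbage (free, previously
+		deleted record) space. */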
+ page_info->data_size = uint16_t(page_header_get_field(
+ page, PAGE_HEAP_TOP) - (page_is_comp(page)
+ ? PAGE_NEW_SUPREMUM_END
+ : PAGE_OLD_SUPREMUM_END)
+ - page_header_get_field(page, PAGE_GARBAGE));
+
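+		/* Mask to 14 bits, presumably matching the width of
+		the num_recs bit-field in buf_page_info_t. */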
+ page_info->num_recs = page_get_n_recs(page) & ((1U << 14) - 1);
+ } else if (page_type > FIL_PAGE_TYPE_LAST) {
+ /* Encountered an unknown page type */
+ page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+ } else {
+ /* Make sure we get the right index into the
+ i_s_page_type[] array */
+ ut_a(page_type == i_s_page_type[page_type].type_value);
+
+ page_info->page_type = page_type & 0xf;
+ }
+}
+/*******************************************************************//**
+Scan a page in the buffer pool, and collect its general information
+into a buf_page_info_t structure that the caller has zero-filled, so
+any fields that are not initialized here default to 0 */
+static
+void
+i_s_innodb_buffer_page_get_info(
+/*============================*/
+ const buf_page_t*bpage, /*!< in: buffer pool page to scan */
+ ulint pos, /*!< in: buffer block position in
+ buffer pool or in the LRU list */
+ buf_page_info_t*page_info) /*!< in: zero filled info structure;
+ out: structure filled with scanned
+ info */
+{
+ page_info->block_id = pos;
+
+ compile_time_assert(BUF_BLOCK_NOT_USED == 0);
+ compile_time_assert(BUF_BLOCK_MEMORY == 1);
+ compile_time_assert(BUF_BLOCK_REMOVE_HASH == 2);
+ compile_time_assert(BUF_BLOCK_FILE_PAGE == 3);
+ compile_time_assert(BUF_BLOCK_ZIP_PAGE == 4);
+
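+	/* The compile-time asserts above guarantee that every state
+	value fits in the 3-bit page_state field. */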
+ auto state = bpage->state();
+	page_info->page_state = int{state} & 7;
+
+ switch (state) {
+ default:
+ page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ case BUF_BLOCK_ZIP_PAGE:
+ const byte* frame;
+
+ page_info->id = bpage->id();
+
+ page_info->fix_count = bpage->buf_fix_count();
+
+ page_info->oldest_mod = bpage->oldest_modification();
+
+ page_info->access_time = bpage->access_time;
+
+ page_info->zip_ssize = bpage->zip.ssize;
+
+ page_info->io_fix = bpage->io_fix() & 3;
+
+ page_info->is_old = bpage->old;
+
+ page_info->freed_page_clock = bpage->freed_page_clock;
+
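+		/* If a read is in progress, the page frame is not yet
+		valid, so report the page type as unknown. */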
+ switch (bpage->io_fix()) {
+ case BUF_IO_NONE:
+ case BUF_IO_WRITE:
+ case BUF_IO_PIN:
+ break;
+ case BUF_IO_READ:
+ page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+ page_info->newest_mod = 0;
+ return;
+ }
+
+ if (state == BUF_BLOCK_FILE_PAGE) {
+ const buf_block_t*block;
+
+ block = reinterpret_cast<const buf_block_t*>(bpage);
+ frame = block->frame;
+#ifdef BTR_CUR_HASH_ADAPT
+ /* Note: this may be a false positive, that
+ is, block->index will not always be set to
+ NULL when the last adaptive hash index
+ reference is dropped. */
+ page_info->hashed = (block->index != NULL);
+#endif /* BTR_CUR_HASH_ADAPT */
+ } else {
+ ut_ad(page_info->zip_ssize);
+ frame = bpage->zip.data;
+ }
+
+ page_info->newest_mod = mach_read_from_8(FIL_PAGE_LSN + frame);
+ i_s_innodb_set_page_type(page_info, frame);
+ }
+}
+
+/*******************************************************************//**
+Go through each block of the buffer pool and fetch information for the
+INFORMATION_SCHEMA.INNODB_BUFFER_PAGE table.
+@param[in,out] thd connection
+@param[in,out] tables tables to fill
+@return 0 on success, 1 on failure */
+static int i_s_innodb_buffer_page_fill(THD *thd, TABLE_LIST *tables, Item *)
+{
+ int status = 0;
+ mem_heap_t* heap;
+
+ DBUG_ENTER("i_s_innodb_buffer_page_fill");
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to users without the PROCESS privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(10000);
+
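+	/* Limit the scan to chunks that are present both before and
+	after any buffer pool resize that may be in progress. */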
+ for (ulint n = 0;
+ n < ut_min(buf_pool.n_chunks, buf_pool.n_chunks_new); n++) {
+ const buf_block_t* block;
+ ulint n_blocks;
+ buf_page_info_t* info_buffer;
+ ulint num_page;
+ ulint mem_size;
+ ulint chunk_size;
+ ulint num_to_process = 0;
+ ulint block_id = 0;
+
+ /* Get buffer block of the nth chunk */
+ block = buf_pool.chunks[n].blocks;
+ chunk_size = buf_pool.chunks[n].size;
+ num_page = 0;
+
+ while (chunk_size > 0) {
+			/* We cache at most MAX_BUF_INFO_CACHED
+			buffer page info structures at a time */
+ num_to_process = ut_min(chunk_size,
+ (ulint)MAX_BUF_INFO_CACHED);
+
+ mem_size = num_to_process * sizeof(buf_page_info_t);
+
+			/* For each chunk, pre-allocate information
+			structures to cache the page information read
+			from the buffer pool. Do this before acquiring
+			any mutex */
+ info_buffer = (buf_page_info_t*) mem_heap_zalloc(
+ heap, mem_size);
+
+			/* Obtain the buffer pool mutex. Since this is
+			a diagnostic printout of buffer pool info, we
+			need not preserve overall consistency, so the
+			mutex can be released periodically */
+ mysql_mutex_lock(&buf_pool.mutex);
+
+			/* Go through each block in the chunk */
+ for (n_blocks = num_to_process; n_blocks--; block++) {
+ i_s_innodb_buffer_page_get_info(
+ &block->page, block_id,
+ info_buffer + num_page);
+ block_id++;
+ num_page++;
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ /* Fill in information schema table with information
+ just collected from the buffer chunk scan */
+ status = i_s_innodb_buffer_page_fill(
+ thd, tables, info_buffer,
+ num_page);
+
+ /* If something goes wrong, break and return */
+ if (status) {
+ break;
+ }
+
+ mem_heap_empty(heap);
+ chunk_size -= num_to_process;
+ num_page = 0;
+ }
+ }
+
+ mem_heap_free(heap);
+
+ DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_page_init(
+/*========================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("i_s_innodb_buffer_page_init");
+
+ schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p);
+
+ schema->fields_info = Show::i_s_innodb_buffer_page_fields_info;
+ schema->fill_table = i_s_innodb_buffer_page_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_page =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_BUFFER_PAGE"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB Buffer Page Information"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_buffer_page_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+static ST_FIELD_INFO i_s_innodb_buf_page_lru_fields_info[] =
+{
+#define IDX_BUF_LRU_POOL_ID 0
+ Column("POOL_ID", ULong(), NOT_NULL),
+
+#define IDX_BUF_LRU_POS 1
+ Column("LRU_POSITION", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_LRU_PAGE_SPACE 2
+ Column("SPACE", ULong(), NOT_NULL),
+
+#define IDX_BUF_LRU_PAGE_NUM 3
+ Column("PAGE_NUMBER", ULong(), NOT_NULL),
+
+#define IDX_BUF_LRU_PAGE_TYPE 4
+ Column("PAGE_TYPE", Varchar(64), NULLABLE),
+
+#define IDX_BUF_LRU_PAGE_FLUSH_TYPE 5
+ Column("FLUSH_TYPE", ULong(), NOT_NULL),
+
+#define IDX_BUF_LRU_PAGE_FIX_COUNT 6
+ Column("FIX_COUNT", ULong(), NOT_NULL),
+
+#ifdef BTR_CUR_HASH_ADAPT
+#define IDX_BUF_LRU_PAGE_HASHED 7
+ Column("IS_HASHED", SLong(1), NOT_NULL),
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#define IDX_BUF_LRU_PAGE_NEWEST_MOD	7 + I_S_AHI
+  Column("NEWEST_MODIFICATION", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_LRU_PAGE_OLDEST_MOD	8 + I_S_AHI
+  Column("OLDEST_MODIFICATION", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_LRU_PAGE_ACCESS_TIME	9 + I_S_AHI
+  Column("ACCESS_TIME", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_LRU_PAGE_TABLE_NAME 10 + I_S_AHI
+ Column("TABLE_NAME", Varchar(1024), NULLABLE),
+
+#define IDX_BUF_LRU_PAGE_INDEX_NAME 11 + I_S_AHI
+ Column("INDEX_NAME", Varchar(NAME_CHAR_LEN), NULLABLE),
+
+#define IDX_BUF_LRU_PAGE_NUM_RECS 12 + I_S_AHI
+ Column("NUMBER_RECORDS", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_LRU_PAGE_DATA_SIZE 13 + I_S_AHI
+ Column("DATA_SIZE", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_LRU_PAGE_ZIP_SIZE 14 + I_S_AHI
+  Column("COMPRESSED_SIZE", ULonglong(), NOT_NULL),
+
+#define IDX_BUF_LRU_PAGE_STATE 15 + I_S_AHI
+ Column("COMPRESSED", SLong(1), NOT_NULL),
+
+#define IDX_BUF_LRU_PAGE_IO_FIX 16 + I_S_AHI
+ Column("IO_FIX", Enum(&io_values_typelib), NOT_NULL, DEFAULT_NONE),
+
+#define IDX_BUF_LRU_PAGE_IS_OLD 17 + I_S_AHI
+ Column("IS_OLD", SLong(1), NULLABLE),
+
+#define IDX_BUF_LRU_PAGE_FREE_CLOCK 18 + I_S_AHI
+ Column("FREE_PAGE_CLOCK", ULonglong(), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Fill Information Schema table INNODB_BUFFER_PAGE_LRU with information
+cached in the buf_page_info_t array
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buf_page_lru_fill(
+/*=========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ const buf_page_info_t* info_array, /*!< in: array cached page
+ info */
+ ulint num_page) /*!< in: number of page info
+ cached */
+{
+ DBUG_ENTER("i_s_innodb_buf_page_lru_fill");
+
+ TABLE* table = tables->table;
+ Field** fields = table->field;
+
+ /* Iterate through the cached array and fill the I_S table rows */
+ for (ulint i = 0; i < num_page; i++) {
+ const buf_page_info_t* page_info;
+ char table_name[MAX_FULL_NAME_LEN + 1];
+ const char* table_name_end = NULL;
+
+ page_info = info_array + i;
+
+ OK(fields[IDX_BUF_LRU_POOL_ID]->store(0, true));
+
+ OK(fields[IDX_BUF_LRU_POS]->store(
+ page_info->block_id, true));
+
+ OK(fields[IDX_BUF_LRU_PAGE_SPACE]->store(
+ page_info->id.space(), true));
+
+ OK(fields[IDX_BUF_LRU_PAGE_NUM]->store(
+ page_info->id.page_no(), true));
+
+ OK(field_store_string(
+ fields[IDX_BUF_LRU_PAGE_TYPE],
+ i_s_page_type[page_info->page_type].type_str));
+
+ OK(fields[IDX_BUF_LRU_PAGE_FLUSH_TYPE]->store(0, true));
+
+ OK(fields[IDX_BUF_LRU_PAGE_FIX_COUNT]->store(
+ page_info->fix_count, true));
+
+#ifdef BTR_CUR_HASH_ADAPT
+ OK(fields[IDX_BUF_LRU_PAGE_HASHED]->store(
+ page_info->hashed, true));
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ OK(fields[IDX_BUF_LRU_PAGE_NEWEST_MOD]->store(
+ page_info->newest_mod, true));
+
+ OK(fields[IDX_BUF_LRU_PAGE_OLDEST_MOD]->store(
+ page_info->oldest_mod, true));
+
+ OK(fields[IDX_BUF_LRU_PAGE_ACCESS_TIME]->store(
+ page_info->access_time, true));
+
+ fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->set_null();
+
+ fields[IDX_BUF_LRU_PAGE_INDEX_NAME]->set_null();
+
+ /* If this is an index page, fetch the index name
+ and table name */
+ if (page_info->page_type == I_S_PAGE_TYPE_INDEX) {
+ bool ret = false;
+
+ mutex_enter(&dict_sys.mutex);
+
+ const dict_index_t* index =
+ dict_index_get_if_in_cache_low(
+ page_info->index_id);
+
+ if (index) {
+ table_name_end = innobase_convert_name(
+ table_name, sizeof(table_name),
+ index->table->name.m_name,
+ strlen(index->table->name.m_name),
+ thd);
+
+ ret = fields[IDX_BUF_LRU_PAGE_TABLE_NAME]
+ ->store(table_name,
+ static_cast<uint>(
+ table_name_end
+ - table_name),
+ system_charset_info)
+ || fields[IDX_BUF_LRU_PAGE_INDEX_NAME]
+ ->store(index->name,
+ uint(strlen(index->name)),
+ system_charset_info);
+ }
+
+ mutex_exit(&dict_sys.mutex);
+
+ OK(ret);
+
+ if (index) {
+ fields[IDX_BUF_LRU_PAGE_TABLE_NAME]
+ ->set_notnull();
+ fields[IDX_BUF_LRU_PAGE_INDEX_NAME]
+ ->set_notnull();
+ }
+ }
+
+ OK(fields[IDX_BUF_LRU_PAGE_NUM_RECS]->store(
+ page_info->num_recs, true));
+
+ OK(fields[IDX_BUF_LRU_PAGE_DATA_SIZE]->store(
+ page_info->data_size, true));
+
+ OK(fields[IDX_BUF_LRU_PAGE_ZIP_SIZE]->store(
+ page_info->zip_ssize
+ ? 512 << page_info->zip_ssize : 0, true));
+
+ OK(fields[IDX_BUF_LRU_PAGE_STATE]->store(
+ page_info->page_state == BUF_BLOCK_ZIP_PAGE,
+ true));
+
+ OK(fields[IDX_BUF_LRU_PAGE_IO_FIX]->store(
+ 1 + page_info->io_fix, true));
+
+ OK(fields[IDX_BUF_LRU_PAGE_IS_OLD]->store(
+ page_info->is_old, true));
+
+ OK(fields[IDX_BUF_LRU_PAGE_FREE_CLOCK]->store(
+ page_info->freed_page_clock, true));
+
+ OK(schema_table_store_record(thd, table));
+ }
+
+ DBUG_RETURN(0);
+}
+
+/** Fill the table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU.
+@param[in] thd thread
+@param[in,out] tables tables to fill
+@return 0 on success, 1 on failure */
+static int i_s_innodb_fill_buffer_lru(THD *thd, TABLE_LIST *tables, Item *)
+{
+ int status = 0;
+ buf_page_info_t* info_buffer;
+ ulint lru_pos = 0;
+ const buf_page_t* bpage;
+ ulint lru_len;
+
+ DBUG_ENTER("i_s_innodb_fill_buffer_lru");
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+ /* deny access to any users that do not hold PROCESS_ACL */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+	/* Acquire the mutex before allocating info_buffer, since
+	UT_LIST_GET_LEN(buf_pool.LRU) could change */
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+	/* MY_WME makes my_malloc() print an error message if the
+	allocation fails */
+ info_buffer = (buf_page_info_t*) my_malloc(PSI_INSTRUMENT_ME,
+ lru_len * sizeof *info_buffer, MYF(MY_WME | MY_ZEROFILL));
+
+ if (!info_buffer) {
+ status = 1;
+ goto exit;
+ }
+
+	/* Walk through the buffer pool's LRU list and collect the
+	buffer page information */
+ bpage = UT_LIST_GET_LAST(buf_pool.LRU);
+
+ while (bpage != NULL) {
+		/* Use the same function that collects buffer page
+		info for INNODB_BUFFER_PAGE */
+ i_s_innodb_buffer_page_get_info(bpage, lru_pos,
+ (info_buffer + lru_pos));
+
+ bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+ lru_pos++;
+ }
+
+ ut_ad(lru_pos == lru_len);
+ ut_ad(lru_pos == UT_LIST_GET_LEN(buf_pool.LRU));
+
+exit:
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (info_buffer) {
+ status = i_s_innodb_buf_page_lru_fill(
+ thd, tables, info_buffer, lru_len);
+
+ my_free(info_buffer);
+ }
+
+ DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_page_lru_init(
+/*============================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("i_s_innodb_buffer_page_lru_init");
+
+ schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p);
+
+ schema->fields_info = Show::i_s_innodb_buf_page_lru_fields_info;
+ schema->fill_table = i_s_innodb_fill_buffer_lru;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_page_lru =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_BUFFER_PAGE_LRU"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB Buffer Page in LRU"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_buffer_page_lru_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+/*******************************************************************//**
+Unbind a dynamic INFORMATION_SCHEMA table.
+@return 0 */
+static int i_s_common_deinit(void*)
+{
+ DBUG_ENTER("i_s_common_deinit");
+
+ /* Do nothing */
+
+ DBUG_RETURN(0);
+}
+
+static const LEX_CSTRING row_format_values[] =
+{
+ { STRING_WITH_LEN("Redundant") },
+ { STRING_WITH_LEN("Compact") },
+ { STRING_WITH_LEN("Compressed") },
+ { STRING_WITH_LEN("Dynamic") }
+};
+
+static TypelibBuffer<4> row_format_values_typelib(row_format_values);
+
+static const LEX_CSTRING space_type_values[] =
+{
+ { STRING_WITH_LEN("Single") },
+ { STRING_WITH_LEN("System") }
+};
+
+static TypelibBuffer<2> space_type_values_typelib(space_type_values);
+
+namespace Show {
+/** SYS_TABLES ***************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLES */
+static ST_FIELD_INFO innodb_sys_tables_fields_info[] =
+{
+#define SYS_TABLES_ID 0
+ Column("TABLE_ID", ULonglong(), NOT_NULL),
+
+#define SYS_TABLES_NAME 1
+ Column("NAME", Varchar(MAX_FULL_NAME_LEN + 1), NOT_NULL),
+
+#define SYS_TABLES_FLAG 2
+ Column("FLAG", SLong(), NOT_NULL),
+
+#define SYS_TABLES_NUM_COLUMN 3
+ Column("N_COLS", ULong(), NOT_NULL),
+
+#define SYS_TABLES_SPACE 4
+ Column("SPACE", ULong(), NOT_NULL),
+
+#define SYS_TABLES_ROW_FORMAT 5
+ Column("ROW_FORMAT", Enum(&row_format_values_typelib), NULLABLE),
+
+#define SYS_TABLES_ZIP_PAGE_SIZE 6
+ Column("ZIP_PAGE_SIZE", ULong(), NOT_NULL),
+
+#define SYS_TABLES_SPACE_TYPE 7
+ Column("SPACE_TYPE", Enum(&space_type_values_typelib), NULLABLE),
+
+ CEnd()
+};
+} // namespace Show
+
+/**********************************************************************//**
+Populate information_schema.innodb_sys_tables table with information
+from SYS_TABLES.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_tables(
+/*=====================*/
+ THD* thd, /*!< in: thread */
+ dict_table_t* table, /*!< in: table */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint compact = DICT_TF_GET_COMPACT(table->flags);
+ ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(
+ table->flags);
+ const ulint zip_size = dict_tf_get_zip_size(table->flags);
+ const char* row_format;
+
+ if (!compact) {
+ row_format = "Redundant";
+ } else if (!atomic_blobs) {
+ row_format = "Compact";
+ } else if (DICT_TF_GET_ZIP_SSIZE(table->flags)) {
+ row_format = "Compressed";
+ } else {
+ row_format = "Dynamic";
+ }
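+	/* The strings above must match the row_format_values typelib
+	that backs the ROW_FORMAT Enum column. */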
+
+ DBUG_ENTER("i_s_dict_fill_sys_tables");
+
+ fields = table_to_fill->field;
+
+ OK(fields[SYS_TABLES_ID]->store(longlong(table->id), TRUE));
+
+ OK(field_store_string(fields[SYS_TABLES_NAME], table->name.m_name));
+
+ OK(fields[SYS_TABLES_FLAG]->store(table->flags));
+
+ OK(fields[SYS_TABLES_NUM_COLUMN]->store(table->n_cols));
+
+ OK(fields[SYS_TABLES_SPACE]->store(table->space_id, true));
+
+ OK(field_store_string(fields[SYS_TABLES_ROW_FORMAT], row_format));
+
+ OK(fields[SYS_TABLES_ZIP_PAGE_SIZE]->store(zip_size, true));
+
+ OK(field_store_string(fields[SYS_TABLES_SPACE_TYPE],
+ table->space_id ? "Single" : "System"));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to go through each record in SYS_TABLES table, and fill the
+information_schema.innodb_sys_tables table with related table information
+@return 0 on success */
+static
+int
+i_s_sys_tables_fill_table(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_tables_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to users without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
+
+ while (rec) {
+ const char* err_msg;
+ dict_table_t* table_rec;
+
+		/* Create and populate a dict_table_t structure with
+		information from a SYS_TABLES row */
+ err_msg = dict_process_sys_tables_rec_and_mtr_commit(
+ heap, rec, &table_rec, false, &mtr);
+
+ mutex_exit(&dict_sys.mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_tables(thd, table_rec,
+ tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ if (table_rec) {
+ dict_mem_table_free(table_rec);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
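+
+/* Note: dict_getnext_system() resumes the scan via the persistent
+cursor pcur after the mini-transaction was committed and dict_sys.mutex
+was released while the previous row was being copied into the I_S
+table; the same scan pattern is repeated by the fill functions below. */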
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_tables
+@return 0 on success */
+static
+int
+innodb_sys_tables_init(
+/*===================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_tables_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_sys_tables_fields_info;
+ schema->fill_table = i_s_sys_tables_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tables =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_TABLES"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_TABLES"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_tables_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+/** SYS_TABLESTATS ***********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESTATS */
+static ST_FIELD_INFO innodb_sys_tablestats_fields_info[] =
+{
+#define SYS_TABLESTATS_ID 0
+ Column("TABLE_ID", ULonglong(), NOT_NULL),
+
+#define SYS_TABLESTATS_NAME 1
+ Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
+#define SYS_TABLESTATS_INIT 2
+ Column("STATS_INITIALIZED", SLong(1), NOT_NULL),
+
+#define SYS_TABLESTATS_NROW 3
+ Column("NUM_ROWS", ULonglong(), NOT_NULL),
+
+#define SYS_TABLESTATS_CLUST_SIZE 4
+ Column("CLUST_INDEX_SIZE", ULonglong(), NOT_NULL),
+
+#define SYS_TABLESTATS_INDEX_SIZE 5
+ Column("OTHER_INDEX_SIZE", ULonglong(), NOT_NULL),
+
+#define SYS_TABLESTATS_MODIFIED 6
+ Column("MODIFIED_COUNTER", ULonglong(), NOT_NULL),
+
+#define SYS_TABLESTATS_AUTONINC 7
+ Column("AUTOINC", ULonglong(), NOT_NULL),
+
+#define SYS_TABLESTATS_TABLE_REF_COUNT 8
+ Column("REF_COUNT", SLong(), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/** Populate information_schema.innodb_sys_tablestats table with information
+from SYS_TABLES.
+@param[in]	thd		thread
+@param[in,out] table table
+@param[in] ref_count table reference count
+@param[in,out] table_to_fill fill this table
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_tablestats(
+ THD* thd,
+ dict_table_t* table,
+ ulint ref_count,
+ TABLE* table_to_fill)
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_tablestats");
+
+ fields = table_to_fill->field;
+
+ OK(fields[SYS_TABLESTATS_ID]->store(longlong(table->id), TRUE));
+
+ OK(field_store_string(fields[SYS_TABLESTATS_NAME],
+ table->name.m_name));
+
+ {
+ struct Locking
+ {
+ Locking() { mutex_enter(&dict_sys.mutex); }
+ ~Locking() { mutex_exit(&dict_sys.mutex); }
+ } locking;
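+
+		/* The statistics fields are read under dict_sys.mutex;
+		the RAII guard above releases the mutex even when an
+		OK() macro returns early on a store failure. */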
+
+ OK(fields[SYS_TABLESTATS_INIT]->store(table->stat_initialized,
+ true));
+
+ if (table->stat_initialized) {
+ OK(fields[SYS_TABLESTATS_NROW]->store(
+ table->stat_n_rows, true));
+
+ OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(
+ table->stat_clustered_index_size, true));
+
+ OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(
+ table->stat_sum_of_other_index_sizes,
+ true));
+
+ OK(fields[SYS_TABLESTATS_MODIFIED]->store(
+ table->stat_modified_counter, true));
+ } else {
+ OK(fields[SYS_TABLESTATS_NROW]->store(0, true));
+
+ OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(0, true));
+
+ OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(0, true));
+
+ OK(fields[SYS_TABLESTATS_MODIFIED]->store(0, true));
+ }
+ }
+
+ OK(fields[SYS_TABLESTATS_AUTONINC]->store(table->autoinc, true));
+
+ OK(fields[SYS_TABLESTATS_TABLE_REF_COUNT]->store(ref_count, true));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Function to go through each record in SYS_TABLES table, and fill the
+information_schema.innodb_sys_tablestats table with table statistics
+related information
+@return 0 on success */
+static
+int
+i_s_sys_tables_fill_table_stats(
+/*============================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_tables_fill_table_stats");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to users without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ rw_lock_s_lock(&dict_sys.latch);
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
+
+ while (rec) {
+ const char* err_msg;
+ dict_table_t* table_rec;
+
+ /* Fetch the dict_table_t structure corresponding to
+ this SYS_TABLES record */
+ err_msg = dict_process_sys_tables_rec_and_mtr_commit(
+ heap, rec, &table_rec, true, &mtr);
+
+ ulint ref_count = table_rec ? table_rec->get_ref_count() : 0;
+ mutex_exit(&dict_sys.mutex);
+
+		DBUG_EXECUTE_IF("test_sys_tablestats", {
+			if (table_rec != NULL
+			    && strcmp("test/t1",
+				      table_rec->name.m_name) == 0) {
+				DEBUG_SYNC_C("dict_table_not_protected");
+			}});
+
+ if (table_rec != NULL) {
+ ut_ad(err_msg == NULL);
+ i_s_dict_fill_sys_tablestats(thd, table_rec, ref_count,
+ tables->table);
+ } else {
+ ut_ad(err_msg != NULL);
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ rw_lock_s_unlock(&dict_sys.latch);
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ rw_lock_s_lock(&dict_sys.latch);
+ mutex_enter(&dict_sys.mutex);
+
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+ rw_lock_s_unlock(&dict_sys.latch);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_tablestats
+@return 0 on success */
+static
+int
+innodb_sys_tablestats_init(
+/*=======================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_tablestats_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_sys_tablestats_fields_info;
+ schema->fill_table = i_s_sys_tables_fill_table_stats;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tablestats =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_TABLESTATS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_TABLESTATS"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_tablestats_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+/** SYS_INDEXES **************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_INDEXES */
+static ST_FIELD_INFO innodb_sysindex_fields_info[] =
+{
+#define SYS_INDEX_ID 0
+ Column("INDEX_ID", ULonglong(), NOT_NULL),
+
+#define SYS_INDEX_NAME 1
+ Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
+#define SYS_INDEX_TABLE_ID 2
+ Column("TABLE_ID", ULonglong(), NOT_NULL),
+
+#define SYS_INDEX_TYPE 3
+ Column("TYPE", SLong(), NOT_NULL),
+
+#define SYS_INDEX_NUM_FIELDS 4
+ Column("N_FIELDS", SLong(), NOT_NULL),
+
+#define SYS_INDEX_PAGE_NO 5
+ Column("PAGE_NO", SLong(), NOT_NULL),
+
+#define SYS_INDEX_SPACE 6
+ Column("SPACE", SLong(), NOT_NULL),
+
+#define SYS_INDEX_MERGE_THRESHOLD 7
+ Column("MERGE_THRESHOLD", SLong(), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/**********************************************************************//**
+Function to populate the information_schema.innodb_sys_indexes table with
+collected index information
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_indexes(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ table_id_t table_id, /*!< in: table id */
+ ulint space_id, /*!< in: tablespace id */
+ dict_index_t* index, /*!< in: populated dict_index_t
+ struct with index info */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_indexes");
+
+ fields = table_to_fill->field;
+
+ if (*index->name == *TEMP_INDEX_PREFIX_STR) {
+ /* Since TEMP_INDEX_PREFIX_STR is not valid UTF-8, we
+ need to convert it to something else. */
+ *const_cast<char*>(index->name()) = '?';
+ }
+
+ OK(fields[SYS_INDEX_NAME]->store(index->name,
+ uint(strlen(index->name)),
+ system_charset_info));
+
+ OK(fields[SYS_INDEX_ID]->store(longlong(index->id), true));
+
+ OK(fields[SYS_INDEX_TABLE_ID]->store(longlong(table_id), true));
+
+ OK(fields[SYS_INDEX_TYPE]->store(index->type, true));
+
+ OK(fields[SYS_INDEX_NUM_FIELDS]->store(index->n_fields));
+
+ /* FIL_NULL is ULINT32_UNDEFINED */
+ if (index->page == FIL_NULL) {
+ fields[SYS_INDEX_PAGE_NO]->set_null();
+ } else {
+ OK(fields[SYS_INDEX_PAGE_NO]->store(index->page, true));
+ }
+
+ if (space_id == ULINT_UNDEFINED) {
+ fields[SYS_INDEX_SPACE]->set_null();
+ } else {
+ OK(fields[SYS_INDEX_SPACE]->store(space_id, true));
+ }
+
+ OK(fields[SYS_INDEX_MERGE_THRESHOLD]->store(index->merge_threshold,
+ true));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to go through each record in SYS_INDEXES table, and fill the
+information_schema.innodb_sys_indexes table with related index information
+@return 0 on success */
+static
+int
+i_s_sys_indexes_fill_table(
+/*=======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_indexes_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to users without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+
+	/* Start scanning the SYS_INDEXES table */
+ rec = dict_startscan_system(&pcur, &mtr, SYS_INDEXES);
+
+ /* Process each record in the table */
+ while (rec) {
+ const char* err_msg;
+ table_id_t table_id;
+ ulint space_id;
+ dict_index_t index_rec;
+
+ /* Populate a dict_index_t structure with information from
+ a SYS_INDEXES row */
+ err_msg = dict_process_sys_indexes_rec(heap, rec, &index_rec,
+ &table_id);
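+		/* rec_get_nth_field_old() returns a pointer to the
+		SPACE field and writes the field length into space_id;
+		a valid SPACE column is 4 bytes, so any other length is
+		mapped to ULINT_UNDEFINED below. */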
+ const byte* field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__SPACE, &space_id);
+ space_id = space_id == 4 ? mach_read_from_4(field)
+ : ULINT_UNDEFINED;
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+
+ if (!err_msg) {
+ if (int err = i_s_dict_fill_sys_indexes(
+ thd, table_id, space_id, &index_rec,
+ tables->table)) {
+ mem_heap_free(heap);
+ DBUG_RETURN(err);
+ }
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_indexes
+@return 0 on success */
+static
+int
+innodb_sys_indexes_init(
+/*====================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_indexes_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_sysindex_fields_info;
+ schema->fill_table = i_s_sys_indexes_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_indexes =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_INDEXES"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_INDEXES"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_indexes_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+/** SYS_COLUMNS **************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_COLUMNS */
+static ST_FIELD_INFO innodb_sys_columns_fields_info[] =
+{
+#define SYS_COLUMN_TABLE_ID 0
+ Column("TABLE_ID", ULonglong(), NOT_NULL),
+
+#define SYS_COLUMN_NAME 1
+ Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
+#define SYS_COLUMN_POSITION 2
+ Column("POS", ULonglong(), NOT_NULL),
+
+#define SYS_COLUMN_MTYPE 3
+ Column("MTYPE", SLong(), NOT_NULL),
+
+#define SYS_COLUMN__PRTYPE 4
+ Column("PRTYPE", SLong(), NOT_NULL),
+
+#define SYS_COLUMN_COLUMN_LEN 5
+ Column("LEN", SLong(), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/**********************************************************************//**
+Function to populate the information_schema.innodb_sys_columns table with
+related column information
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_columns(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ table_id_t table_id, /*!< in: table ID */
+ const char* col_name, /*!< in: column name */
+ dict_col_t* column, /*!< in: dict_col_t struct holding
+ more column information */
+ ulint nth_v_col, /*!< in: virtual column, its
+ sequence number (nth virtual col) */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_columns");
+
+ fields = table_to_fill->field;
+
+ OK(fields[SYS_COLUMN_TABLE_ID]->store((longlong) table_id, TRUE));
+
+ OK(field_store_string(fields[SYS_COLUMN_NAME], col_name));
+
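+	/* For a virtual column, POS packs both the sequence number of
+	the virtual column (nth_v_col) and its original position in the
+	table (column->ind) into a single value via
+	dict_create_v_col_pos(). */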
+ if (column->is_virtual()) {
+ ulint pos = dict_create_v_col_pos(nth_v_col, column->ind);
+ OK(fields[SYS_COLUMN_POSITION]->store(pos, true));
+ } else {
+ OK(fields[SYS_COLUMN_POSITION]->store(column->ind, true));
+ }
+
+ OK(fields[SYS_COLUMN_MTYPE]->store(column->mtype));
+
+ OK(fields[SYS_COLUMN__PRTYPE]->store(column->prtype));
+
+ OK(fields[SYS_COLUMN_COLUMN_LEN]->store(column->len));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to fill information_schema.innodb_sys_columns with information
+collected by scanning SYS_COLUMNS table.
+@return 0 on success */
+static
+int
+i_s_sys_columns_fill_table(
+/*=======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ const char* col_name;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_columns_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to users without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_COLUMNS);
+
+ while (rec) {
+ const char* err_msg;
+ dict_col_t column_rec;
+ table_id_t table_id;
+ ulint nth_v_col;
+
+ /* populate a dict_col_t structure with information from
+ a SYS_COLUMNS row */
+ err_msg = dict_process_sys_columns_rec(heap, rec, &column_rec,
+ &table_id, &col_name,
+ &nth_v_col);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_columns(thd, table_id, col_name,
+ &column_rec, nth_v_col,
+ tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_columns
+@return 0 on success */
+static
+int
+innodb_sys_columns_init(
+/*====================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_columns_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_sys_columns_fields_info;
+ schema->fill_table = i_s_sys_columns_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_columns =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_COLUMNS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_COLUMNS"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_columns_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+/** SYS_VIRTUAL **************************************************/
+/** Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_VIRTUAL */
+static ST_FIELD_INFO innodb_sys_virtual_fields_info[] =
+{
+#define SYS_VIRTUAL_TABLE_ID 0
+ Column("TABLE_ID", ULonglong(), NOT_NULL),
+
+#define SYS_VIRTUAL_POS 1
+ Column("POS", ULong(), NOT_NULL),
+
+#define SYS_VIRTUAL_BASE_POS 2
+ Column("BASE_POS", ULong(), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/** Function to populate the information_schema.innodb_sys_virtual table
+with related information
+@param[in]	thd		thread
+@param[in]	table_id	table ID
+@param[in]	pos		virtual column position
+@param[in]	base_pos	base column position
+@param[in,out]	table_to_fill	fill this table
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_virtual(
+ THD* thd,
+ table_id_t table_id,
+ ulint pos,
+ ulint base_pos,
+ TABLE* table_to_fill)
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_virtual");
+
+ fields = table_to_fill->field;
+
+ OK(fields[SYS_VIRTUAL_TABLE_ID]->store(table_id, true));
+
+ OK(fields[SYS_VIRTUAL_POS]->store(pos, true));
+
+ OK(fields[SYS_VIRTUAL_BASE_POS]->store(base_pos, true));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+
+/** Function to fill information_schema.innodb_sys_virtual with information
+collected by scanning SYS_VIRTUAL table.
+@param[in]	thd		thread
+@param[in,out]	tables		tables to fill
+@param[in]	item		condition (not used)
+@return 0 on success */
+static
+int
+i_s_sys_virtual_fill_table(
+ THD* thd,
+ TABLE_LIST* tables,
+ Item* )
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ ulint pos;
+ ulint base_pos;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_virtual_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to users without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_VIRTUAL);
+
+ while (rec) {
+ const char* err_msg;
+ table_id_t table_id;
+
+		/* Extract the table ID, virtual column position and
+		base column position from a SYS_VIRTUAL row */
+ err_msg = dict_process_sys_virtual_rec(rec,
+ &table_id, &pos,
+ &base_pos);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_virtual(thd, table_id, pos, base_pos,
+ tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ /* Get the next record */
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+
+ DBUG_RETURN(0);
+}
+
+/** Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_virtual
+@param[in,out]	p	table schema object
+@return 0 on success */
+static
+int
+innodb_sys_virtual_init(
+ void* p)
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_virtual_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_sys_virtual_fields_info;
+ schema->fill_table = i_s_sys_virtual_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+struct st_maria_plugin i_s_innodb_sys_virtual =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_VIRTUAL"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_VIRTUAL"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_virtual_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+
+namespace Show {
+/** SYS_FIELDS ***************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FIELDS */
+static ST_FIELD_INFO innodb_sys_fields_fields_info[] =
+{
+#define SYS_FIELD_INDEX_ID 0
+ Column("INDEX_ID", ULonglong(), NOT_NULL),
+
+#define SYS_FIELD_NAME 1
+ Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
+#define SYS_FIELD_POS 2
+ Column("POS", ULong(), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/**********************************************************************//**
+Function to fill information_schema.innodb_sys_fields with information
+collected by scanning SYS_FIELDS table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_fields(
+/*=====================*/
+ THD* thd, /*!< in: thread */
+ index_id_t index_id, /*!< in: index id for the field */
+ dict_field_t* field, /*!< in: table */
+ ulint pos, /*!< in: Field position */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_fields");
+
+ fields = table_to_fill->field;
+
+ OK(fields[SYS_FIELD_INDEX_ID]->store(index_id, true));
+
+ OK(field_store_string(fields[SYS_FIELD_NAME], field->name));
+
+ OK(fields[SYS_FIELD_POS]->store(pos, true));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to go through each record in SYS_FIELDS table, and fill the
+information_schema.innodb_sys_fields table with related index field
+information
+@return 0 on success */
+static
+int
+i_s_sys_fields_fill_table(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ index_id_t last_id;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_fields_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to users without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+
+	/* Save the last index id so that we know when we move on to
+	the next index; this is used to calculate the prefix length */
+ last_id = 0;
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_FIELDS);
+
+ while (rec) {
+ ulint pos;
+ const char* err_msg;
+ index_id_t index_id;
+ dict_field_t field_rec;
+
+ /* Populate a dict_field_t structure with information from
+ a SYS_FIELDS row */
+ err_msg = dict_process_sys_fields_rec(heap, rec, &field_rec,
+ &pos, &index_id, last_id);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_fields(thd, index_id, &field_rec,
+ pos, tables->table);
+ last_id = index_id;
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_fields
+@return 0 on success */
+static
+int
+innodb_sys_fields_init(
+/*===================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+	DBUG_ENTER("innodb_sys_fields_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_sys_fields_fields_info;
+ schema->fill_table = i_s_sys_fields_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_fields =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_FIELDS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_FIELDS"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_fields_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+/** SYS_FOREIGN ********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN */
+static ST_FIELD_INFO innodb_sys_foreign_fields_info[] =
+{
+#define SYS_FOREIGN_ID 0
+ Column("ID", Varchar(NAME_LEN + 1), NOT_NULL),
+
+#define SYS_FOREIGN_FOR_NAME 1
+ Column("FOR_NAME", Varchar(NAME_LEN + 1), NOT_NULL),
+
+#define SYS_FOREIGN_REF_NAME 2
+ Column("REF_NAME", Varchar(NAME_LEN + 1), NOT_NULL),
+
+#define SYS_FOREIGN_NUM_COL 3
+ Column("N_COLS", ULong(), NOT_NULL),
+
+#define SYS_FOREIGN_TYPE 4
+ Column("TYPE", ULong(), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/**********************************************************************//**
+Function to fill information_schema.innodb_sys_foreign with information
+collected by scanning SYS_FOREIGN table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_foreign(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ dict_foreign_t* foreign, /*!< in: table */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_foreign");
+
+ fields = table_to_fill->field;
+
+ OK(field_store_string(fields[SYS_FOREIGN_ID], foreign->id));
+
+ OK(field_store_string(fields[SYS_FOREIGN_FOR_NAME],
+ foreign->foreign_table_name));
+
+ OK(field_store_string(fields[SYS_FOREIGN_REF_NAME],
+ foreign->referenced_table_name));
+
+ OK(fields[SYS_FOREIGN_NUM_COL]->store(foreign->n_fields));
+
+ OK(fields[SYS_FOREIGN_TYPE]->store(foreign->type));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.innodb_sys_foreign table. Loop
+through each record in SYS_FOREIGN, and extract the foreign key
+information.
+@return 0 on success */
+static
+int
+i_s_sys_foreign_fill_table(
+/*=======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_foreign_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to users without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN);
+
+ while (rec) {
+ const char* err_msg;
+ dict_foreign_t foreign_rec;
+
+ /* Populate a dict_foreign_t structure with information from
+ a SYS_FOREIGN row */
+ err_msg = dict_process_sys_foreign_rec(heap, rec, &foreign_rec);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_foreign(thd, &foreign_rec,
+ tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mtr_start(&mtr);
+ mutex_enter(&dict_sys.mutex);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign
+@return 0 on success */
+static
+int
+innodb_sys_foreign_init(
+/*====================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_foreign_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_sys_foreign_fields_info;
+ schema->fill_table = i_s_sys_foreign_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_foreign =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_FOREIGN"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_FOREIGN"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_foreign_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+/** SYS_FOREIGN_COLS ********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN_COLS */
+static ST_FIELD_INFO innodb_sys_foreign_cols_fields_info[] =
+{
+#define SYS_FOREIGN_COL_ID 0
+ Column("ID", Varchar(NAME_LEN + 1), NOT_NULL),
+
+#define SYS_FOREIGN_COL_FOR_NAME 1
+ Column("FOR_COL_NAME", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
+#define SYS_FOREIGN_COL_REF_NAME 2
+ Column("REF_COL_NAME", Varchar(NAME_CHAR_LEN), NOT_NULL),
+
+#define SYS_FOREIGN_COL_POS 3
+ Column("POS", ULong(), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/**********************************************************************//**
+Function to fill information_schema.innodb_sys_foreign_cols with information
+collected by scanning SYS_FOREIGN_COLS table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_foreign_cols(
+/*==========================*/
+ THD* thd, /*!< in: thread */
+ const char* name, /*!< in: foreign key constraint name */
+	const char*	for_col_name,	/*!< in: referencing column name */
+ const char* ref_col_name, /*!< in: referenced column
+ name */
+ ulint pos, /*!< in: column position */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_foreign_cols");
+
+ fields = table_to_fill->field;
+
+ OK(field_store_string(fields[SYS_FOREIGN_COL_ID], name));
+
+ OK(field_store_string(fields[SYS_FOREIGN_COL_FOR_NAME], for_col_name));
+
+ OK(field_store_string(fields[SYS_FOREIGN_COL_REF_NAME], ref_col_name));
+
+ OK(fields[SYS_FOREIGN_COL_POS]->store(pos, true));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.innodb_sys_foreign_cols table. Loop
+through each record in SYS_FOREIGN_COLS, and extract the foreign key column
+information and fill the INFORMATION_SCHEMA.innodb_sys_foreign_cols table.
+@return 0 on success */
+static
+int
+i_s_sys_foreign_cols_fill_table(
+/*============================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_foreign_cols_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to users without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN_COLS);
+
+ while (rec) {
+ const char* err_msg;
+ const char* name;
+ const char* for_col_name;
+ const char* ref_col_name;
+ ulint pos;
+
+ /* Extract necessary information from a SYS_FOREIGN_COLS row */
+ err_msg = dict_process_sys_foreign_col_rec(
+ heap, rec, &name, &for_col_name, &ref_col_name, &pos);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_foreign_cols(
+ thd, name, for_col_name, ref_col_name, pos,
+ tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign_cols
+@return 0 on success */
+static
+int
+innodb_sys_foreign_cols_init(
+/*========================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_foreign_cols_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_sys_foreign_cols_fields_info;
+ schema->fill_table = i_s_sys_foreign_cols_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_foreign_cols =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_FOREIGN_COLS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_FOREIGN_COLS"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_foreign_cols_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+/** SYS_TABLESPACES ********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES */
+static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[] =
+{
+#define SYS_TABLESPACES_SPACE 0
+ Column("SPACE", ULong(), NOT_NULL),
+
+#define SYS_TABLESPACES_NAME 1
+ Column("NAME", Varchar(MAX_FULL_NAME_LEN + 1), NOT_NULL),
+
+#define SYS_TABLESPACES_FLAGS 2
+ Column("FLAG", ULong(), NOT_NULL),
+
+#define SYS_TABLESPACES_ROW_FORMAT 3
+ Column("ROW_FORMAT", Varchar(22), NULLABLE),
+
+#define SYS_TABLESPACES_PAGE_SIZE 4
+ Column("PAGE_SIZE", ULong(), NOT_NULL),
+
+#define SYS_TABLESPACES_ZIP_PAGE_SIZE 5
+ Column("ZIP_PAGE_SIZE", ULong(), NOT_NULL),
+
+#define SYS_TABLESPACES_FS_BLOCK_SIZE 6
+ Column("FS_BLOCK_SIZE", ULong(),NOT_NULL),
+
+#define SYS_TABLESPACES_FILE_SIZE 7
+ Column("FILE_SIZE", ULonglong(), NOT_NULL),
+
+#define SYS_TABLESPACES_ALLOC_SIZE 8
+ Column("ALLOCATED_SIZE", ULonglong(), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/**********************************************************************//**
+Function to fill INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES with information
+collected by scanning the SYS_TABLESPACES table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_tablespaces(
+/*==========================*/
+ THD* thd, /*!< in: thread */
+ uint32_t space, /*!< in: space ID */
+ const char* name, /*!< in: tablespace name */
+ ulint flags, /*!< in: tablespace flags */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags);
+ const char* row_format;
+
+ DBUG_ENTER("i_s_dict_fill_sys_tablespaces");
+
+ if (fil_space_t::full_crc32(flags)) {
+ row_format = NULL;
+ } else if (is_system_tablespace(space)) {
+ row_format = "Compact, Redundant or Dynamic";
+ } else if (FSP_FLAGS_GET_ZIP_SSIZE(flags)) {
+ row_format = "Compressed";
+ } else if (atomic_blobs) {
+ row_format = "Dynamic";
+ } else {
+ row_format = "Compact or Redundant";
+ }
+
+ fields = table_to_fill->field;
+
+ OK(fields[SYS_TABLESPACES_SPACE]->store(space, true));
+
+ OK(field_store_string(fields[SYS_TABLESPACES_NAME], name));
+
+ OK(fields[SYS_TABLESPACES_FLAGS]->store(flags, true));
+
+ OK(field_store_string(fields[SYS_TABLESPACES_ROW_FORMAT], row_format));
+
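+	/* The flags may still use the MariaDB 10.1 format;
+	fsp_flags_convert_from_101() returns ULINT_UNDEFINED when they
+	cannot be interpreted, in which case the size columns are
+	reported as NULL below. */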
+ ulint cflags = fil_space_t::is_valid_flags(flags, space)
+ ? flags : fsp_flags_convert_from_101(flags);
+ if (cflags == ULINT_UNDEFINED) {
+ fields[SYS_TABLESPACES_PAGE_SIZE]->set_null();
+ fields[SYS_TABLESPACES_ZIP_PAGE_SIZE]->set_null();
+ fields[SYS_TABLESPACES_FS_BLOCK_SIZE]->set_null();
+ fields[SYS_TABLESPACES_FILE_SIZE]->set_null();
+ fields[SYS_TABLESPACES_ALLOC_SIZE]->set_null();
+ OK(schema_table_store_record(thd, table_to_fill));
+ DBUG_RETURN(0);
+ }
+
+ OK(fields[SYS_TABLESPACES_PAGE_SIZE]->store(
+ fil_space_t::logical_size(cflags), true));
+
+ OK(fields[SYS_TABLESPACES_ZIP_PAGE_SIZE]->store(
+ fil_space_t::physical_size(cflags), true));
+
+ os_file_stat_t stat;
+ os_file_size_t file;
+
+ memset(&file, 0xff, sizeof(file));
+ memset(&stat, 0x0, sizeof(stat));
+
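+	/* file was filled with 0xff above, so m_total_size remains ~0
+	unless os_file_get_size() succeeds; the check below then zeroes
+	the sizes when the data file could not be read. */
+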
+ if (fil_space_t* s = fil_space_t::get(space)) {
+ const char *filepath = s->chain.start
+ ? s->chain.start->name : NULL;
+ if (!filepath) {
+ goto file_done;
+ }
+
+ file = os_file_get_size(filepath);
+
+ /* Get the file system (or Volume) block size. */
+ switch (dberr_t err = os_file_get_status(filepath, &stat,
+ false, false)) {
+ case DB_FAIL:
+			ib::warn() << "File '" << filepath
+				   << "', failed to get stats";
+ break;
+
+ case DB_SUCCESS:
+ case DB_NOT_FOUND:
+ break;
+
+ default:
+ ib::error() << "File '" << filepath << "' " << err;
+ break;
+ }
+
+file_done:
+ s->release();
+ }
+
+ if (file.m_total_size == os_offset_t(~0)) {
+ stat.block_size = 0;
+ file.m_total_size = 0;
+ file.m_alloc_size = 0;
+ }
+
+ OK(fields[SYS_TABLESPACES_FS_BLOCK_SIZE]->store(stat.block_size, true));
+
+ OK(fields[SYS_TABLESPACES_FILE_SIZE]->store(file.m_total_size, true));
+
+ OK(fields[SYS_TABLESPACES_ALLOC_SIZE]->store(file.m_alloc_size, true));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES table.
+Loop through each record in SYS_TABLESPACES, and extract the column
+information and fill the INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES table.
+@return 0 on success */
+static
+int
+i_s_sys_tablespaces_fill_table(
+/*===========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_tablespaces_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to users without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+
+ for (rec = dict_startscan_system(&pcur, &mtr, SYS_TABLESPACES);
+ rec != NULL;
+ rec = dict_getnext_system(&pcur, &mtr)) {
+
+ const char* err_msg;
+ uint32_t space;
+ const char* name;
+ ulint flags;
+
+ /* Extract necessary information from a SYS_TABLESPACES row */
+ err_msg = dict_process_sys_tablespaces(
+ heap, rec, &space, &name, &flags);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_tablespaces(
+ thd, space, name, flags,
+ tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES
+@return 0 on success */
+static
+int
+innodb_sys_tablespaces_init(
+/*========================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_tablespaces_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_sys_tablespaces_fields_info;
+ schema->fill_table = i_s_sys_tablespaces_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tablespaces =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_TABLESPACES"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_TABLESPACES"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_tablespaces_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+/** SYS_DATAFILES ************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_DATAFILES */
+static ST_FIELD_INFO innodb_sys_datafiles_fields_info[] =
+{
+#define SYS_DATAFILES_SPACE 0
+ Column("SPACE", ULong(), NOT_NULL),
+
+#define SYS_DATAFILES_PATH 1
+ Column("PATH", Varchar(OS_FILE_MAX_PATH), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/**********************************************************************//**
+Function to fill INFORMATION_SCHEMA.INNODB_SYS_DATAFILES with information
+collected by scanning the SYS_DATAFILES table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_datafiles(
+/*========================*/
+ THD* thd, /*!< in: thread */
+ uint32_t space, /*!< in: space ID */
+ const char* path, /*!< in: absolute path */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_datafiles");
+
+ fields = table_to_fill->field;
+
+ OK(fields[SYS_DATAFILES_SPACE]->store(space, true));
+
+ OK(field_store_string(fields[SYS_DATAFILES_PATH], path));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.INNODB_SYS_DATAFILES table.
+Loop through each record in SYS_DATAFILES, and extract the column
+information and fill the INFORMATION_SCHEMA.INNODB_SYS_DATAFILES table.
+@return 0 on success */
+static
+int
+i_s_sys_datafiles_fill_table(
+/*=========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_datafiles_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to users without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_DATAFILES);
+
+ while (rec) {
+ const char* err_msg;
+ uint32_t space;
+ const char* path;
+
+ /* Extract necessary information from a SYS_DATAFILES row */
+ err_msg = dict_process_sys_datafiles(
+ heap, rec, &space, &path);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_datafiles(
+ thd, space, path, tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys.mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys.mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_DATAFILES
+@return 0 on success */
+static
+int
+innodb_sys_datafiles_init(
+/*======================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_datafiles_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_sys_datafiles_fields_info;
+ schema->fill_table = i_s_sys_datafiles_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_datafiles =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_DATAFILES"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_DATAFILES"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_datafiles_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+/** TABLESPACES_ENCRYPTION ********************************************/
+/* Fields of the table INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION */
+static ST_FIELD_INFO innodb_tablespaces_encryption_fields_info[] =
+{
+#define TABLESPACES_ENCRYPTION_SPACE 0
+ Column("SPACE", ULong(), NOT_NULL),
+
+#define TABLESPACES_ENCRYPTION_NAME 1
+ Column("NAME", Varchar(MAX_FULL_NAME_LEN + 1), NULLABLE),
+
+#define TABLESPACES_ENCRYPTION_ENCRYPTION_SCHEME 2
+ Column("ENCRYPTION_SCHEME", ULong(), NOT_NULL),
+
+#define TABLESPACES_ENCRYPTION_KEYSERVER_REQUESTS 3
+ Column("KEYSERVER_REQUESTS", ULong(), NOT_NULL),
+
+#define TABLESPACES_ENCRYPTION_MIN_KEY_VERSION 4
+ Column("MIN_KEY_VERSION", ULong(), NOT_NULL),
+
+#define TABLESPACES_ENCRYPTION_CURRENT_KEY_VERSION 5
+ Column("CURRENT_KEY_VERSION", ULong(), NOT_NULL),
+
+#define TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER 6
+ Column("KEY_ROTATION_PAGE_NUMBER", ULonglong(), NULLABLE),
+
+#define TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER 7
+ Column("KEY_ROTATION_MAX_PAGE_NUMBER", ULonglong(), NULLABLE),
+
+#define TABLESPACES_ENCRYPTION_CURRENT_KEY_ID 8
+ Column("CURRENT_KEY_ID", ULong(), NOT_NULL),
+
+#define TABLESPACES_ENCRYPTION_ROTATING_OR_FLUSHING 9
+ Column("ROTATING_OR_FLUSHING", SLong(1), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/**********************************************************************//**
+Function to fill INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION
+with the encryption status of a single tablespace, obtained via
+fil_space_crypt_get_status().
+@param[in] thd thread handle
+@param[in] space Tablespace
+@param[in] table_to_fill I_S table to fill
+@return 0 on success */
+static
+int
+i_s_dict_fill_tablespaces_encryption(
+ THD* thd,
+ fil_space_t* space,
+ TABLE* table_to_fill)
+{
+ Field** fields;
+ struct fil_space_crypt_status_t status;
+
+ DBUG_ENTER("i_s_dict_fill_tablespaces_encryption");
+
+ fields = table_to_fill->field;
+
+ fil_space_crypt_get_status(space, &status);
+
+	/* If the tablespace id does not match, we did not find
+	encryption information for this tablespace. */
+ if (!space->crypt_data || space->id != status.space) {
+ goto skip;
+ }
+
+ OK(fields[TABLESPACES_ENCRYPTION_SPACE]->store(space->id, true));
+
+ OK(field_store_string(fields[TABLESPACES_ENCRYPTION_NAME],
+ space->name));
+
+ OK(fields[TABLESPACES_ENCRYPTION_ENCRYPTION_SCHEME]->store(
+ status.scheme, true));
+ OK(fields[TABLESPACES_ENCRYPTION_KEYSERVER_REQUESTS]->store(
+ status.keyserver_requests, true));
+ OK(fields[TABLESPACES_ENCRYPTION_MIN_KEY_VERSION]->store(
+ status.min_key_version, true));
+ OK(fields[TABLESPACES_ENCRYPTION_CURRENT_KEY_VERSION]->store(
+ status.current_key_version, true));
+ OK(fields[TABLESPACES_ENCRYPTION_CURRENT_KEY_ID]->store(
+ status.key_id, true));
+ OK(fields[TABLESPACES_ENCRYPTION_ROTATING_OR_FLUSHING]->store(
+ status.rotating || status.flushing, true));
+
+ if (status.rotating) {
+ fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER]->set_notnull();
+ OK(fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER]->store(
+ status.rotate_next_page_number, true));
+ fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER]->set_notnull();
+ OK(fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER]->store(
+ status.rotate_max_page_number, true));
+ } else {
+ fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER]
+ ->set_null();
+ fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER]
+ ->set_null();
+ }
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+skip:
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to populate the INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION table.
+Loops through each tablespace in fil_system.space_list, extracts the
+encryption status and fills the INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION table.
+@return 0 on success */
+static
+int
+i_s_tablespaces_encryption_fill_table(
+/*===========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ DBUG_ENTER("i_s_tablespaces_encryption_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+ /* deny access to user without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ int err = 0;
+ mutex_enter(&fil_system.mutex);
+ fil_system.freeze_space_list++;
+
+ for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list);
+ space; space = UT_LIST_GET_NEXT(space_list, space)) {
+ if (space->purpose == FIL_TYPE_TABLESPACE
+ && !space->is_stopping()) {
+ space->reacquire();
+ mutex_exit(&fil_system.mutex);
+ err = i_s_dict_fill_tablespaces_encryption(
+ thd, space, tables->table);
+ mutex_enter(&fil_system.mutex);
+ space->release();
+ if (err) {
+ break;
+ }
+ }
+ }
+
+ fil_system.freeze_space_list--;
+ mutex_exit(&fil_system.mutex);
+ DBUG_RETURN(err);
+}
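+
+/* The loop above relies on a pin-then-unlock pattern: space->reacquire()
+pins the tablespace so that fil_system.mutex can be released while the
+I_S row is built, after which the mutex is re-entered and the pin dropped;
+the real loop additionally bumps fil_system.freeze_space_list so that the
+list itself cannot change while unlatched. A minimal stand-alone sketch of
+the same idea (illustrative only; Space and fill_row() are hypothetical
+stand-ins for fil_space_t and the fill function):
+
+	void visit_all(std::mutex& m, std::list<Space>& spaces)
+	{
+		std::unique_lock<std::mutex> lock(m);
+		for (Space& s : spaces) {
+			s.reacquire();	// pin: keep the object alive
+			lock.unlock();	// do slow work without the mutex
+			fill_row(s);
+			lock.lock();	// re-latch before advancing
+			s.release();	// unpin
+		}
+	}
+*/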
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION
+@return 0 on success */
+static
+int
+innodb_tablespaces_encryption_init(
+/*========================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_tablespaces_encryption_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_tablespaces_encryption_fields_info;
+ schema->fill_table = i_s_tablespaces_encryption_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_tablespaces_encryption =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_TABLESPACES_ENCRYPTION"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, "Google Inc"),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB TABLESPACES_ENCRYPTION"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_BSD),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_tablespaces_encryption_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+namespace Show {
+/** INNODB_MUTEXES *********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_MUTEXES */
+static ST_FIELD_INFO innodb_mutexes_fields_info[] =
+{
+#define MUTEXES_NAME 0
+ Column("NAME", Varchar(OS_FILE_MAX_PATH), NOT_NULL),
+
+#define MUTEXES_CREATE_FILE 1
+ Column("CREATE_FILE", Varchar(OS_FILE_MAX_PATH), NOT_NULL),
+
+#define MUTEXES_CREATE_LINE 2
+ Column("CREATE_LINE", ULong(), NOT_NULL),
+
+#define MUTEXES_OS_WAITS 3
+ Column("OS_WAITS", ULonglong(), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Function to populate the INFORMATION_SCHEMA.INNODB_MUTEXES table.
+Loops through each rw-lock in rw_lock_list, extracts the column
+information and fills the INFORMATION_SCHEMA.INNODB_MUTEXES table.
+@return 0 on success */
+static
+int
+i_s_innodb_mutexes_fill_table(
+/*==========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ ulint block_lock_oswait_count = 0;
+ const rw_lock_t* block_lock= nullptr;
+ Field** fields = tables->table->field;
+
+ DBUG_ENTER("i_s_innodb_mutexes_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+ /* deny access to user without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ } else {
+ struct Locking
+ {
+ Locking() { mutex_enter(&rw_lock_list_mutex); }
+ ~Locking() { mutex_exit(&rw_lock_list_mutex); }
+ } locking;
+
+ char lock_name[sizeof "buf0dump.cc:12345"];
+
+ for (const rw_lock_t& lock : rw_lock_list) {
+ if (lock.count_os_wait == 0) {
+ continue;
+ }
+
+ if (buf_pool.is_block_lock(&lock)) {
+ block_lock = &lock;
+ block_lock_oswait_count += lock.count_os_wait;
+ continue;
+ }
+
+ const char* basename = innobase_basename(
+ lock.cfile_name);
+
+ snprintf(lock_name, sizeof lock_name, "%s:%u",
+ basename, lock.cline);
+
+ OK(field_store_string(fields[MUTEXES_NAME],
+ lock_name));
+ OK(field_store_string(fields[MUTEXES_CREATE_FILE],
+ basename));
+ OK(fields[MUTEXES_CREATE_LINE]->store(lock.cline,
+ true));
+ fields[MUTEXES_CREATE_LINE]->set_notnull();
+ OK(fields[MUTEXES_OS_WAITS]->store(lock.count_os_wait,
+ true));
+ fields[MUTEXES_OS_WAITS]->set_notnull();
+ OK(schema_table_store_record(thd, tables->table));
+ }
+
+ if (block_lock) {
+ char buf1[IO_SIZE];
+
+ snprintf(buf1, sizeof buf1, "combined %s",
+ innobase_basename(block_lock->cfile_name));
+
+ OK(field_store_string(fields[MUTEXES_NAME],
+ "buf_block_t::lock"));
+ OK(field_store_string(fields[MUTEXES_CREATE_FILE],
+ buf1));
+ OK(fields[MUTEXES_CREATE_LINE]->store(block_lock->cline,
+ true));
+ fields[MUTEXES_CREATE_LINE]->set_notnull();
+ OK(fields[MUTEXES_OS_WAITS]->store(
+ block_lock_oswait_count, true));
+ fields[MUTEXES_OS_WAITS]->set_notnull();
+ OK(schema_table_store_record(thd, tables->table));
+ }
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_MUTEXES
+@return 0 on success */
+static
+int
+innodb_mutexes_init(
+/*================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_mutexes_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_mutexes_fields_info;
+ schema->fill_table = i_s_innodb_mutexes_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_mutexes =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_MUTEXES"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+	STRUCT_FLD(descr, "InnoDB MUTEXES"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_mutexes_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
+
+namespace Show {
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS */
+static ST_FIELD_INFO innodb_sys_semaphore_waits_fields_info[] =
+{
+ // SYS_SEMAPHORE_WAITS_THREAD_ID 0
+ Column("THREAD_ID", ULonglong(), NOT_NULL),
+
+ // SYS_SEMAPHORE_WAITS_OBJECT_NAME 1
+ Column("OBJECT_NAME", Varchar(OS_FILE_MAX_PATH), NULLABLE),
+
+ // SYS_SEMAPHORE_WAITS_FILE 2
+ Column("FILE", Varchar(OS_FILE_MAX_PATH), NULLABLE),
+
+ // SYS_SEMAPHORE_WAITS_LINE 3
+ Column("LINE", ULong(), NOT_NULL),
+
+ // SYS_SEMAPHORE_WAITS_WAIT_TIME 4
+ Column("WAIT_TIME", ULonglong(), NOT_NULL),
+
+ // SYS_SEMAPHORE_WAITS_WAIT_OBJECT 5
+ Column("WAIT_OBJECT", ULonglong(), NOT_NULL),
+
+ // SYS_SEMAPHORE_WAITS_WAIT_TYPE 6
+ Column("WAIT_TYPE", Varchar(16), NULLABLE),
+
+ // SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID 7
+ Column("HOLDER_THREAD_ID", ULonglong(), NOT_NULL),
+
+ // SYS_SEMAPHORE_WAITS_HOLDER_FILE 8
+ Column("HOLDER_FILE", Varchar(OS_FILE_MAX_PATH), NULLABLE),
+
+ // SYS_SEMAPHORE_WAITS_HOLDER_LINE 9
+ Column("HOLDER_LINE", ULong(), NOT_NULL),
+
+ // SYS_SEMAPHORE_WAITS_CREATED_FILE 10
+ Column("CREATED_FILE", Varchar(OS_FILE_MAX_PATH), NULLABLE),
+
+ // SYS_SEMAPHORE_WAITS_CREATED_LINE 11
+ Column("CREATED_LINE", ULong(), NOT_NULL),
+
+ // SYS_SEMAPHORE_WAITS_WRITER_THREAD 12
+ Column("WRITER_THREAD", ULonglong(), NOT_NULL),
+
+ // SYS_SEMAPHORE_WAITS_RESERVATION_MODE 13
+ Column("RESERVATION_MODE", Varchar(16), NULLABLE),
+
+ // SYS_SEMAPHORE_WAITS_READERS 14
+ Column("READERS", ULong(), NOT_NULL),
+
+ // SYS_SEMAPHORE_WAITS_WAITERS_FLAG 15
+ Column("WAITERS_FLAG", ULonglong(), NOT_NULL),
+
+ // SYS_SEMAPHORE_WAITS_LOCK_WORD 16
+ Column("LOCK_WORD", ULonglong(), NOT_NULL),
+
+ // SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE 17
+ Column("LAST_WRITER_FILE", Varchar(OS_FILE_MAX_PATH), NULLABLE),
+
+ // SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE 18
+ Column("LAST_WRITER_LINE", ULong(), NOT_NULL),
+
+ // SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT 19
+ Column("OS_WAIT_COUNT", ULong(), NOT_NULL),
+
+ CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS
+@return 0 on success */
+static
+int
+innodb_sys_semaphore_waits_init(
+/*============================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_semaphore_waits_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = Show::innodb_sys_semaphore_waits_fields_info;
+ schema->fill_table = sync_arr_fill_sys_semphore_waits_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_semaphore_waits =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_SEMAPHORE_WAITS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, maria_plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_SEMAPHORE_WAITS"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_semaphore_waits_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+};
diff --git a/storage/innobase/handler/i_s.h b/storage/innobase/handler/i_s.h
new file mode 100644
index 00000000..385c249d
--- /dev/null
+++ b/storage/innobase/handler/i_s.h
@@ -0,0 +1,147 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/i_s.h
+InnoDB INFORMATION SCHEMA tables interface to MySQL.
+
+Created July 18, 2007 Vasil Dimov
+Modified Dec 29, 2014 Jan Lindström
+*******************************************************/
+
+#ifndef i_s_h
+#define i_s_h
+#include "dict0types.h"
+
+const char plugin_author[] = "Oracle Corporation";
+const char maria_plugin_author[] = "MariaDB Corporation";
+
+extern struct st_maria_plugin i_s_innodb_trx;
+extern struct st_maria_plugin i_s_innodb_locks;
+extern struct st_maria_plugin i_s_innodb_lock_waits;
+extern struct st_maria_plugin i_s_innodb_cmp;
+extern struct st_maria_plugin i_s_innodb_cmp_reset;
+extern struct st_maria_plugin i_s_innodb_cmp_per_index;
+extern struct st_maria_plugin i_s_innodb_cmp_per_index_reset;
+extern struct st_maria_plugin i_s_innodb_cmpmem;
+extern struct st_maria_plugin i_s_innodb_cmpmem_reset;
+extern struct st_maria_plugin i_s_innodb_metrics;
+extern struct st_maria_plugin i_s_innodb_ft_default_stopword;
+extern struct st_maria_plugin i_s_innodb_ft_deleted;
+extern struct st_maria_plugin i_s_innodb_ft_being_deleted;
+extern struct st_maria_plugin i_s_innodb_ft_index_cache;
+extern struct st_maria_plugin i_s_innodb_ft_index_table;
+extern struct st_maria_plugin i_s_innodb_ft_config;
+extern struct st_maria_plugin i_s_innodb_buffer_page;
+extern struct st_maria_plugin i_s_innodb_buffer_page_lru;
+extern struct st_maria_plugin i_s_innodb_buffer_stats;
+extern struct st_maria_plugin i_s_innodb_sys_tables;
+extern struct st_maria_plugin i_s_innodb_sys_tablestats;
+extern struct st_maria_plugin i_s_innodb_sys_indexes;
+extern struct st_maria_plugin i_s_innodb_sys_columns;
+extern struct st_maria_plugin i_s_innodb_sys_fields;
+extern struct st_maria_plugin i_s_innodb_sys_foreign;
+extern struct st_maria_plugin i_s_innodb_sys_foreign_cols;
+extern struct st_maria_plugin i_s_innodb_sys_tablespaces;
+extern struct st_maria_plugin i_s_innodb_sys_datafiles;
+extern struct st_maria_plugin i_s_innodb_mutexes;
+extern struct st_maria_plugin i_s_innodb_sys_virtual;
+extern struct st_maria_plugin i_s_innodb_tablespaces_encryption;
+extern struct st_maria_plugin i_s_innodb_sys_semaphore_waits;
+
+/** The latest successfully looked up innodb_fts_aux_table */
+extern table_id_t innodb_ft_aux_table_id;
+
+/** Maximum number of buffer page info structures that we cache. */
+#define MAX_BUF_INFO_CACHED 10000
+
+#define OK(expr) \
+ if ((expr) != 0) { \
+ DBUG_RETURN(1); \
+ }
+
+#define BREAK_IF(expr) if ((expr)) break
+
+#define RETURN_IF_INNODB_NOT_STARTED(plugin_name) \
+do { \
+ if (!srv_was_started) { \
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, \
+ ER_CANT_FIND_SYSTEM_REC, \
+ "InnoDB: SELECTing from " \
+ "INFORMATION_SCHEMA.%s but " \
+ "the InnoDB storage engine " \
+ "is not installed", plugin_name); \
+ DBUG_RETURN(0); \
+ } \
+} while (0)
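+
+/* A minimal usage sketch for the macros above (illustrative only;
+fill_example and its single column are hypothetical):
+
+	static int fill_example(THD* thd, TABLE_LIST* tables, Item*)
+	{
+		DBUG_ENTER("fill_example");
+		RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+		Field** fields = tables->table->field;
+		OK(fields[0]->store(42, true));	// DBUG_RETURN(1) on failure
+		OK(schema_table_store_record(thd, tables->table));
+		DBUG_RETURN(0);
+	}
+*/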
+
+#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 && !defined __INTEL_COMPILER && !defined __clang__
+#ifdef HAVE_C99_INITIALIZERS
+#define STRUCT_FLD(name, value) .name = value
+#else
+#define STRUCT_FLD(name, value) name: value
+#endif /* HAVE_C99_INITIALIZERS */
+#else
+#define STRUCT_FLD(name, value) value
+#endif
+
+/* Don't use a static const variable here, as some C++ compilers (notably
+HPUX aCC: HP ANSI C++ B3910B A.03.65) can't handle it. */
+#define END_OF_ST_FIELD_INFO \
+ {STRUCT_FLD(field_name, NULL), \
+ STRUCT_FLD(field_length, 0), \
+ STRUCT_FLD(field_type, MYSQL_TYPE_NULL), \
+ STRUCT_FLD(value, 0), \
+ STRUCT_FLD(field_flags, 0), \
+ STRUCT_FLD(old_name, ""), \
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}
+
+/** Fields of the INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS table */
+#define SYS_SEMAPHORE_WAITS_THREAD_ID 0
+#define SYS_SEMAPHORE_WAITS_OBJECT_NAME 1
+#define SYS_SEMAPHORE_WAITS_FILE 2
+#define SYS_SEMAPHORE_WAITS_LINE 3
+#define SYS_SEMAPHORE_WAITS_WAIT_TIME 4
+#define SYS_SEMAPHORE_WAITS_WAIT_OBJECT 5
+#define SYS_SEMAPHORE_WAITS_WAIT_TYPE 6
+#define SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID 7
+#define SYS_SEMAPHORE_WAITS_HOLDER_FILE 8
+#define SYS_SEMAPHORE_WAITS_HOLDER_LINE 9
+#define SYS_SEMAPHORE_WAITS_CREATED_FILE 10
+#define SYS_SEMAPHORE_WAITS_CREATED_LINE 11
+#define SYS_SEMAPHORE_WAITS_WRITER_THREAD 12
+#define SYS_SEMAPHORE_WAITS_RESERVATION_MODE 13
+#define SYS_SEMAPHORE_WAITS_READERS 14
+#define SYS_SEMAPHORE_WAITS_WAITERS_FLAG 15
+#define SYS_SEMAPHORE_WAITS_LOCK_WORD 16
+#define SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE 17
+#define SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE 18
+#define SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT 19
+
+/*******************************************************************//**
+Auxiliary function to store char* value in MYSQL_TYPE_STRING field.
+@return 0 on success */
+int
+field_store_string(
+/*===============*/
+ Field* field, /*!< in/out: target field for storage */
+ const char* str); /*!< in: NUL-terminated utf-8 string,
+ or NULL */
+
+#endif /* i_s_h */
diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc
new file mode 100644
index 00000000..9288a496
--- /dev/null
+++ b/storage/innobase/ibuf/ibuf0ibuf.cc
@@ -0,0 +1,4811 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ibuf/ibuf0ibuf.cc
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "ibuf0ibuf.h"
+#include "sync0sync.h"
+#include "btr0sea.h"
+
+/** Number of bits describing a single page */
+#define IBUF_BITS_PER_PAGE 4
+/** The start address of the bitmap within an insert buffer bitmap page */
+#define IBUF_BITMAP PAGE_DATA
+
+#include "buf0buf.h"
+#include "buf0rea.h"
+#include "fsp0fsp.h"
+#include "trx0sys.h"
+#include "fil0fil.h"
+#include "rem0rec.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "row0upd.h"
+#include "dict0boot.h"
+#include "fut0lst.h"
+#include "lock0lock.h"
+#include "log0recv.h"
+#include "que0que.h"
+#include "srv0start.h" /* srv_shutdown_state */
+#include "rem0cmp.h"
+
+/* STRUCTURE OF AN INSERT BUFFER RECORD
+
+In versions < 4.1.x:
+
+1. The first field is the page number.
+2. The second field is an array which stores type info for each subsequent
+ field. We store the information which affects the ordering of records, and
+ also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it
+ is 10 bytes.
+3. Next we have the fields of the actual index record.
+
+In versions >= 4.1.x:
+
+Note that contrary to what we planned in the 1990s, there is only one
+insert buffer tree, and it resides in the InnoDB system tablespace.
+
+1. The first field is the space id.
+2. The second field is a one-byte marker (0) which differentiates records from
+ the < 4.1.x storage format.
+3. The third field is the page number.
+4. The fourth field contains the type info, where we have also added 2 bytes to
+ store the charset. In the compressed table format of 5.0.x we must add more
+ information here so that we can build a dummy 'index' struct which 5.0.x
+ can use in the binary search on the index page in the ibuf merge phase.
+5. The rest of the fields contain the fields of the actual index record.
+
+In versions >= 5.0.3:
+
+The first byte of the fourth field is an additional marker (0) if the record
+is in the compact format. The presence of this marker can be detected by
+looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE.
+
+The high-order bit of the character set field in the type info is the
+"nullable" flag for the field.
+
+In versions >= 5.5:
+
+The optional marker byte at the start of the fourth field is replaced by
+mandatory 3 fields, totaling 4 bytes:
+
+ 1. 2 bytes: Counter field, used to sort records within a (space id, page
+ no) in the order they were added. This is needed so that for example the
+ sequence of operations "INSERT x, DEL MARK x, INSERT x" is handled
+ correctly.
+
+ 2. 1 byte: Operation type (see ibuf_op_t).
+
+ 3. 1 byte: Flags. Currently only one flag exists, IBUF_REC_COMPACT.
+
+To ensure older records, which do not have counters to enforce correct
+sorting, are merged before any new records, ibuf_insert checks if we're
+trying to insert to a position that contains old-style records, and if so,
+refuses the insert. Thus, ibuf pages are gradually converted to the new
+format as their corresponding buffer pool pages are read into memory.
+*/
+
+
+/* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM
+
+If an OS thread performs any operation that brings in disk pages from
+non-system tablespaces into the buffer pool, or creates such a page there,
+then the operation may have as a side effect an insert buffer index tree
+compression. Thus, the tree latch of the insert buffer tree may be acquired
+in the x-mode, and also the file space latch of the system tablespace may
+be acquired in the x-mode.
+
+Also, an insert to an index in a non-system tablespace can have the same
+effect. How do we know this cannot lead to a deadlock of OS threads? There
+is a problem with the i/o-handler threads: they break the latching order
+because they own x-latches to pages which are on a lower level than the
+insert buffer tree latch, its page latches, and the tablespace latch an
+insert buffer operation can reserve.
+
+The solution is the following: Let all the tree and page latches connected
+with the insert buffer be later in the latching order than the fsp latch and
+fsp page latches.
+
+Insert buffer pages must be such that the insert buffer is never invoked
+when these pages are accessed as this would result in a recursion violating
+the latching order. We let a special i/o-handler thread take care of i/o to
+the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap
+pages and the first inode page, which contains the inode of the ibuf tree: let
+us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead
+access both non-ibuf and ibuf pages.
+
+Then an i/o-handler for the insert buffer never needs to access recursively the
+insert buffer tree and thus obeys the latching order. On the other hand, other
+i/o-handlers for other tablespaces may require access to the insert buffer,
+but because all kinds of latches they need to access there are later in the
+latching order, no violation of the latching order occurs in this case,
+either.
+
+A problem is how to grow and contract an insert buffer tree. As it is later
+in the latching order than the fsp management, we have to reserve the fsp
+latch first, before adding or removing pages from the insert buffer tree.
+We let the insert buffer tree have its own file space management: a free
+list of pages linked to the tree root. To prevent recursive use of the
+insert buffer when adding pages to the tree, we must first load these pages
+into memory, obtaining a latch on them, and only after that add them to the
+free list of the insert buffer tree. Removing pages from the free list is
+more difficult. If there is an excess of pages in the free list of the
+ibuf tree, they might be needed if some thread reserves the fsp latch,
+intending to allocate more file space. So we do the following: if a thread
+reserves the fsp latch, we check the writer count field of the latch. If
+this field has value 1, it means that the thread did not own the latch
+before entering the fsp system, and the mtr of the thread contains no
+modifications to the fsp pages. Now we are free to reserve the ibuf latch,
+and check if there is an excess of pages in the free list. We can then, in a
+separate mini-transaction, take them out of the free list and free them to
+the fsp system.
+
+To avoid deadlocks in the ibuf system, we divide file pages into three levels:
+
+(1) non-ibuf pages,
+(2) ibuf tree pages and the pages in the ibuf tree free list, and
+(3) ibuf bitmap pages.
+
+No OS thread is allowed to access higher level pages if it has latches to
+lower level pages; even if the thread owns a B-tree latch it must not access
+the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead
+is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle
+exclusively level 1 i/o. A dedicated i/o handler thread handles exclusively
+level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e.,
+it uses synchronous aio, it can access any pages, as long as it obeys the
+access order rules. */
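+
+/* A sketch of the level rule above (illustrative only; this enum is not
+part of the source):
+
+	enum ibuf_page_level {
+		LEVEL_NON_IBUF = 1,	// ordinary index pages
+		LEVEL_IBUF_TREE = 2,	// ibuf tree and free-list pages
+		LEVEL_IBUF_BITMAP = 3	// ibuf bitmap pages
+	};
+
+A thread holding latches on pages of some level must not access pages of a
+higher level, and read-ahead is allowed only for levels 1 and 2. */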
+
+/** Operations that can currently be buffered. */
+ulong innodb_change_buffering;
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/** Dump the change buffer at startup */
+my_bool ibuf_dump;
+/** Flag to control insert buffer debugging. */
+uint ibuf_debug;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+/** The insert buffer control structure */
+ibuf_t ibuf;
+
+/** @name Offsets to the per-page bits in the insert buffer bitmap */
+/* @{ */
+#define IBUF_BITMAP_FREE 0 /*!< Bits indicating the
+ amount of free space */
+#define IBUF_BITMAP_BUFFERED 2 /*!< TRUE if there are buffered
+ changes for the page */
+#define IBUF_BITMAP_IBUF 3 /*!< TRUE if page is a part of
+ the ibuf tree, excluding the
+ root page, or is in the free
+ list of the ibuf */
+/* @} */
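+
+/* A worked example of the layout above (illustrative numbers): every page
+is described by IBUF_BITS_PER_PAGE = 4 bits of its bitmap page, so with
+srv_page_size = 16384, page_no 5 starts at bit (5 % 16384) * 4 = 20, that
+is, byte 2, bit 4 of the array beginning at IBUF_BITMAP. In sketch form
+(the real computation is in ibuf_bitmap_page_get_bits_low() below):
+
+	ulint bit_offset = (page_no % page_size) * IBUF_BITS_PER_PAGE + bit;
+	ulint byte_offset = bit_offset / 8;	// byte within the bitmap
+	bit_offset %= 8;			// bit within that byte
+*/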
+
+#define IBUF_REC_FIELD_SPACE 0 /*!< in the pre-4.1 format,
+ the page number. later, the space_id */
+#define IBUF_REC_FIELD_MARKER 1 /*!< starting with 4.1, a marker
+ consisting of 1 byte that is 0 */
+#define IBUF_REC_FIELD_PAGE 2 /*!< starting with 4.1, the
+ page number */
+#define IBUF_REC_FIELD_METADATA 3 /* the metadata field */
+#define IBUF_REC_FIELD_USER 4 /* first user field */
+
+/* Various constants for checking the type of an ibuf record and extracting
+data from it. For details, see the description of the record format at the
+top of this file. */
+
+/** @name Format of the IBUF_REC_FIELD_METADATA of an insert buffer record
+The fourth column in the MySQL 5.5 format contains an operation
+type, counter, and some flags. */
+/* @{ */
+#define IBUF_REC_INFO_SIZE 4 /*!< Combined size of info fields at
+ the beginning of the fourth field */
+
+/* Offsets for the fields at the beginning of the fourth field */
+#define IBUF_REC_OFFSET_COUNTER 0 /*!< Operation counter */
+#define IBUF_REC_OFFSET_TYPE 2 /*!< Type of operation */
+#define IBUF_REC_OFFSET_FLAGS 3 /*!< Additional flags */
+
+/* Record flag masks */
+#define IBUF_REC_COMPACT 0x1 /*!< Set in
+ IBUF_REC_OFFSET_FLAGS if the
+ user index is in COMPACT
+ format or later */
+
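+/* A sketch (not part of the original source; the helper name is
+hypothetical) of decoding the 4-byte info prefix described above from the
+IBUF_REC_FIELD_METADATA field:
+
+	static void ibuf_example_decode_info(const byte* metadata,
+					     ulint* counter, ulint* op,
+					     ulint* flags)
+	{
+		*counter = mach_read_from_2(metadata + IBUF_REC_OFFSET_COUNTER);
+		*op = mach_read_from_1(metadata + IBUF_REC_OFFSET_TYPE);
+		*flags = mach_read_from_1(metadata + IBUF_REC_OFFSET_FLAGS);
+	}
+*/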
+
+/** The mutex used to block pessimistic inserts to ibuf trees */
+static ib_mutex_t ibuf_pessimistic_insert_mutex;
+
+/** The mutex protecting the insert buffer structs */
+static ib_mutex_t ibuf_mutex;
+
+/** The mutex protecting the insert buffer bitmaps */
+static ib_mutex_t ibuf_bitmap_mutex;
+
+/** The area, in pages, within which ibuf_contract looks for page numbers
+to merge */
+const ulint IBUF_MERGE_AREA = 8;
+
+/** Inside the merge area, pages whose number of buffered entries is
+within 1/IBUF_MERGE_THRESHOLD of the maximum volume that can be buffered
+for a single page are merged along with the page whose buffer became
+full */
+const ulint IBUF_MERGE_THRESHOLD = 4;
+
+/** In ibuf_contract, at most this number of pages is read into memory in
+one batch, in order to merge the buffered entries for them */
+const ulint IBUF_MAX_N_PAGES_MERGED = IBUF_MERGE_AREA;
+
+/** If the combined size of the ibuf trees exceeds ibuf.max_size by this
+many pages, we start to contract the tree synchronously and stop buffering
+new entries */
+const ulint IBUF_CONTRACT_DO_NOT_INSERT = 10;
+
+/* TODO: how do we cope with DROP TABLE if there are records in the insert
+buffer for the indexes of the table? Is there actually any problem? Ibuf
+merge is applied to a page when it is read in, and the page is still
+physically like an index page even if the index has been dropped, so there
+seems to be no problem. */
+
+/******************************************************************//**
+Sets the flag in the current mini-transaction record indicating we're
+inside an insert buffer routine. */
+UNIV_INLINE
+void
+ibuf_enter(
+/*=======*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(!mtr->is_inside_ibuf());
+ mtr->enter_ibuf();
+}
+
+/******************************************************************//**
+Sets the flag in the current mini-transaction record indicating we're
+exiting an insert buffer routine. */
+UNIV_INLINE
+void
+ibuf_exit(
+/*======*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mtr->is_inside_ibuf());
+ mtr->exit_ibuf();
+}
+
+/**************************************************************//**
+Commits an insert buffer mini-transaction and sets the persistent
+cursor latch mode to BTR_NO_LATCHES, that is, detaches the cursor. */
+UNIV_INLINE
+void
+ibuf_btr_pcur_commit_specify_mtr(
+/*=============================*/
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_d(ibuf_exit(mtr));
+ btr_pcur_commit_specify_mtr(pcur, mtr);
+}
+
+/******************************************************************//**
+Gets the ibuf header page and x-latches it.
+@return insert buffer header page */
+static
+page_t*
+ibuf_header_page_get(
+/*=================*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ buf_block_t* block;
+
+ ut_ad(!ibuf_inside(mtr));
+ page_t* page = NULL;
+
+ block = buf_page_get(
+ page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO),
+ 0, RW_X_LATCH, mtr);
+
+ if (block) {
+ buf_block_dbg_add_level(block, SYNC_IBUF_HEADER);
+ page = buf_block_get_frame(block);
+ }
+
+ return page;
+}
+
+/** Acquire the change buffer root page.
+@param[in,out] mtr mini-transaction
+@return change buffer root page, SX-latched */
+static buf_block_t *ibuf_tree_root_get(mtr_t *mtr)
+{
+ buf_block_t* block;
+
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(mutex_own(&ibuf_mutex));
+
+ mtr_sx_lock_index(ibuf.index, mtr);
+
+	/* Only access to the segment list needs mutual exclusion. */
+ block = buf_page_get(
+ page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO),
+ 0, RW_SX_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
+
+ ut_ad(page_get_space_id(block->frame) == IBUF_SPACE_ID);
+ ut_ad(page_get_page_no(block->frame) == FSP_IBUF_TREE_ROOT_PAGE_NO);
+ ut_ad(ibuf.empty == page_is_empty(block->frame));
+
+ return block;
+}
+
+/******************************************************************//**
+Closes insert buffer and frees the data structures. */
+void
+ibuf_close(void)
+/*============*/
+{
+ if (!ibuf.index) {
+ return;
+ }
+
+ mutex_free(&ibuf_pessimistic_insert_mutex);
+
+ mutex_free(&ibuf_mutex);
+
+ mutex_free(&ibuf_bitmap_mutex);
+
+ dict_table_t* ibuf_table = ibuf.index->table;
+ rw_lock_free(&ibuf.index->lock);
+ dict_mem_index_free(ibuf.index);
+ dict_mem_table_free(ibuf_table);
+ ibuf.index = NULL;
+}
+
+/******************************************************************//**
+Updates the size information of the ibuf, assuming the segment size has not
+changed. */
+static
+void
+ibuf_size_update(
+/*=============*/
+ const page_t* root) /*!< in: ibuf tree root */
+{
+ ut_ad(mutex_own(&ibuf_mutex));
+
+ ibuf.free_list_len = flst_get_len(root + PAGE_HEADER
+ + PAGE_BTR_IBUF_FREE_LIST);
+
+ ibuf.height = 1 + btr_page_get_level(root);
+
+ /* the '1 +' is the ibuf header page */
+ ibuf.size = ibuf.seg_size - (1 + ibuf.free_list_len);
+}
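+
+/* For example (illustrative numbers): if the ibuf segment contains
+ibuf.seg_size = 100 pages and ibuf.free_list_len = 9 of them are on the
+free list, then ibuf.size = 100 - (1 + 9) = 90 pages are in actual use,
+the '1' being the ibuf header page. */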
+
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup and initializes
+the data structures for the insert buffer.
+@return DB_SUCCESS or failure */
+dberr_t
+ibuf_init_at_db_start(void)
+/*=======================*/
+{
+ page_t* root;
+ ulint n_used;
+
+ ut_ad(!ibuf.index);
+ mtr_t mtr;
+ mtr.start();
+ compile_time_assert(IBUF_SPACE_ID == TRX_SYS_SPACE);
+ compile_time_assert(IBUF_SPACE_ID == 0);
+ mtr_x_lock_space(fil_system.sys_space, &mtr);
+ buf_block_t* header_page = buf_page_get(
+ page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO),
+ 0, RW_X_LATCH, &mtr);
+
+ if (!header_page) {
+ mtr.commit();
+ return DB_DECRYPTION_FAILED;
+ }
+
+	/* At startup we initialize ibuf to a maximum of
+	CHANGE_BUFFER_DEFAULT_SIZE per cent of the buffer pool
+	size. Once the ibuf struct is initialized, this value is
+	updated with the user-supplied size by calling
+	ibuf_max_size_update(). */
+ ibuf.max_size = ((buf_pool_get_curr_size() >> srv_page_size_shift)
+ * CHANGE_BUFFER_DEFAULT_SIZE) / 100;
+
+ mutex_create(LATCH_ID_IBUF, &ibuf_mutex);
+
+ mutex_create(LATCH_ID_IBUF_BITMAP, &ibuf_bitmap_mutex);
+
+ mutex_create(LATCH_ID_IBUF_PESSIMISTIC_INSERT,
+ &ibuf_pessimistic_insert_mutex);
+
+ mutex_enter(&ibuf_mutex);
+
+ fseg_n_reserved_pages(*header_page,
+ IBUF_HEADER + IBUF_TREE_SEG_HEADER
+ + header_page->frame, &n_used, &mtr);
+
+ ut_ad(n_used >= 2);
+
+ ibuf.seg_size = n_used;
+
+ {
+ buf_block_t* block;
+
+ block = buf_page_get(
+ page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO),
+ 0, RW_X_LATCH, &mtr);
+
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
+
+ root = buf_block_get_frame(block);
+ }
+
+ ibuf_size_update(root);
+ mutex_exit(&ibuf_mutex);
+
+ ibuf.empty = page_is_empty(root);
+ mtr.commit();
+
+ ibuf.index = dict_mem_index_create(
+ dict_mem_table_create("innodb_change_buffer",
+ fil_system.sys_space, 1, 0, 0, 0),
+ "CLUST_IND",
+ DICT_CLUSTERED | DICT_IBUF, 1);
+ ibuf.index->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID;
+ ibuf.index->n_uniq = REC_MAX_N_FIELDS;
+ rw_lock_create(index_tree_rw_lock_key, &ibuf.index->lock,
+ SYNC_IBUF_INDEX_TREE);
+#ifdef BTR_CUR_ADAPT
+ ibuf.index->search_info = btr_search_info_create(ibuf.index->heap);
+#endif /* BTR_CUR_ADAPT */
+ ibuf.index->page = FSP_IBUF_TREE_ROOT_PAGE_NO;
+ ut_d(ibuf.index->cached = TRUE);
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ if (!ibuf_dump) {
+ return DB_SUCCESS;
+ }
+ ib::info() << "Dumping the change buffer";
+ ibuf_mtr_start(&mtr);
+ btr_pcur_t pcur;
+ if (DB_SUCCESS == btr_pcur_open_at_index_side(
+ true, ibuf.index, BTR_SEARCH_LEAF, &pcur,
+ true, 0, &mtr)) {
+ while (btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
+ rec_print_old(stderr, btr_pcur_get_rec(&pcur));
+ }
+ }
+ ibuf_mtr_commit(&mtr);
+ ib::info() << "Dumped the change buffer";
+#endif
+
+ return DB_SUCCESS;
+}
+
+/*********************************************************************//**
+Updates the max_size value for ibuf. */
+void
+ibuf_max_size_update(
+/*=================*/
+ ulint new_val) /*!< in: new value in terms of
+ percentage of the buffer pool size */
+{
+ ulint new_size = ((buf_pool_get_curr_size() >> srv_page_size_shift)
+ * new_val) / 100;
+ mutex_enter(&ibuf_mutex);
+ ibuf.max_size = new_size;
+ mutex_exit(&ibuf_mutex);
+}
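+
+/* For example (illustrative numbers): with a 128 MiB buffer pool and
+16 KiB pages, buf_pool_get_curr_size() >> srv_page_size_shift = 8192
+pages, so new_val = 25 yields ibuf.max_size = 8192 * 25 / 100 = 2048
+pages. */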
+
+# ifdef UNIV_DEBUG
+/** Gets the desired bits for a given page from a bitmap page.
+@param[in] page bitmap page
+@param[in] page_id page id whose bits to get
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
+@param[in,out] mtr mini-transaction holding an x-latch on the
+bitmap page
+@return value of bits */
+# define ibuf_bitmap_page_get_bits(page, page_id, zip_size, bit, mtr) \
+ ibuf_bitmap_page_get_bits_low(page, page_id, zip_size, \
+ MTR_MEMO_PAGE_X_FIX, mtr, bit)
+# else /* UNIV_DEBUG */
+/** Gets the desired bits for a given page from a bitmap page.
+@param[in] page bitmap page
+@param[in] page_id page id whose bits to get
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
+@param[in,out] mtr mini-transaction holding an x-latch on the
+bitmap page
+@return value of bits */
+# define ibuf_bitmap_page_get_bits(page, page_id, zip_size, bit, mtr) \
+ ibuf_bitmap_page_get_bits_low(page, page_id, zip_size, bit)
+# endif /* UNIV_DEBUG */
+
+/** Gets the desired bits for a given page from a bitmap page.
+@param[in] page bitmap page
+@param[in] page_id page id whose bits to get
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] latch_type MTR_MEMO_PAGE_X_FIX, MTR_MEMO_BUF_FIX, ...
+@param[in,out] mtr mini-transaction holding latch_type on the
+bitmap page
+@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
+@return value of bits */
+UNIV_INLINE
+ulint
+ibuf_bitmap_page_get_bits_low(
+ const page_t* page,
+ const page_id_t page_id,
+ ulint zip_size,
+#ifdef UNIV_DEBUG
+ ulint latch_type,
+ mtr_t* mtr,
+#endif /* UNIV_DEBUG */
+ ulint bit)
+{
+ ulint byte_offset;
+ ulint bit_offset;
+ ulint map_byte;
+ ulint value;
+ const ulint size = zip_size ? zip_size : srv_page_size;
+
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(bit < IBUF_BITS_PER_PAGE);
+ compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
+ ut_ad(mtr->memo_contains_page_flagged(page, latch_type));
+
+ bit_offset = (page_id.page_no() & (size - 1))
+ * IBUF_BITS_PER_PAGE + bit;
+
+ byte_offset = bit_offset / 8;
+ bit_offset = bit_offset % 8;
+
+ ut_ad(byte_offset + IBUF_BITMAP < srv_page_size);
+
+ map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset);
+
+ value = ut_bit_get_nth(map_byte, bit_offset);
+
+ if (bit == IBUF_BITMAP_FREE) {
+ ut_ad(bit_offset + 1 < 8);
+
+ value = value * 2 + ut_bit_get_nth(map_byte, bit_offset + 1);
+ }
+
+ return(value);
+}
+
+/** Sets the desired bit for a given page in a bitmap page.
+@tparam bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
+@param[in,out] block bitmap page
+@param[in] page_id page id whose bits to set
+@param[in] physical_size page size
+@param[in] val value to set
+@param[in,out] mtr mtr containing an x-latch to the bitmap page */
+template<ulint bit>
+static void
+ibuf_bitmap_page_set_bits(
+ buf_block_t* block,
+ const page_id_t page_id,
+ ulint physical_size,
+ ulint val,
+ mtr_t* mtr)
+{
+ ulint byte_offset;
+ ulint bit_offset;
+
+ static_assert(bit < IBUF_BITS_PER_PAGE, "wrong bit");
+ compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr->is_named_space(page_id.space()));
+
+ bit_offset = (page_id.page_no() % physical_size)
+ * IBUF_BITS_PER_PAGE + bit;
+
+ byte_offset = bit_offset / 8;
+ bit_offset = bit_offset % 8;
+
+ ut_ad(byte_offset + IBUF_BITMAP < srv_page_size);
+
+ byte* map_byte = &block->frame[IBUF_BITMAP + byte_offset];
+ byte b = *map_byte;
+
+ if (bit == IBUF_BITMAP_FREE) {
+ ut_ad(bit_offset + 1 < 8);
+ ut_ad(val <= 3);
+ b &= static_cast<byte>(~(3U << bit_offset));
+ b |= static_cast<byte>(((val & 2) >> 1) << bit_offset
+ | (val & 1) << (bit_offset + 1));
+ } else {
+ ut_ad(val <= 1);
+ b &= static_cast<byte>(~(1U << bit_offset));
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+ b |= static_cast<byte>(val << bit_offset);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ }
+
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, map_byte, b);
+}
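+
+/* A worked example of the IBUF_BITMAP_FREE encoding above (illustrative):
+the two free-space bits are stored high-order bit first, so val = 2
+(binary 10) sets the bit at bit_offset and clears the one at
+bit_offset + 1; ibuf_bitmap_page_get_bits_low() reverses this by
+computing value * 2 + the next bit. */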
+
+/** Calculates the bitmap page number for a given page number.
+@param[in] page_id page id
+@param[in] size page size
+@return the bitmap page id where the file page is mapped */
+inline page_id_t ibuf_bitmap_page_no_calc(const page_id_t page_id, ulint size)
+{
+ if (!size)
+ size= srv_page_size;
+
+ return page_id_t(page_id.space(), FSP_IBUF_BITMAP_OFFSET
+ + uint32_t(page_id.page_no() & ~(size - 1)));
+}
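+
+/* For example (illustrative numbers, and assuming FSP_IBUF_BITMAP_OFFSET
+is 1): with 16 KiB pages, page_no 40000 belongs to the group starting at
+40000 & ~16383 = 32768, so its descriptor bits live on bitmap page 32769
+of the same tablespace. */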
+
+/** Gets the ibuf bitmap page where the bits describing a given file page are
+stored.
+@param[in] page_id page id of the file page
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] file file name
+@param[in] line line where called
+@param[in,out] mtr mini-transaction
+@return bitmap page where the file page is mapped, that is, the bitmap
+page containing the descriptor bits for the file page; the bitmap page
+is x-latched */
+static
+buf_block_t*
+ibuf_bitmap_get_map_page_func(
+ const page_id_t page_id,
+ ulint zip_size,
+ const char* file,
+ unsigned line,
+ mtr_t* mtr)
+{
+ buf_block_t* block = buf_page_get_gen(
+ ibuf_bitmap_page_no_calc(page_id, zip_size),
+ zip_size, RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED,
+ file, line, mtr);
+
+ if (block) {
+ buf_block_dbg_add_level(block, SYNC_IBUF_BITMAP);
+ }
+
+ return block;
+}
+
+/** Gets the ibuf bitmap page where the bits describing a given file page are
+stored.
+@param[in] page_id page id of the file page
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction
+@return bitmap page where the file page is mapped, that is, the bitmap
+page containing the descriptor bits for the file page; the bitmap page
+is x-latched */
+#define ibuf_bitmap_get_map_page(page_id, zip_size, mtr) \
+ ibuf_bitmap_get_map_page_func(page_id, zip_size, \
+ __FILE__, __LINE__, mtr)
+
+/************************************************************************//**
+Sets the free bits of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+UNIV_INLINE
+void
+ibuf_set_free_bits_low(
+/*===================*/
+ const buf_block_t* block, /*!< in: index page; free bits are set if
+ the index is non-clustered and page
+ level is 0 */
+ ulint val, /*!< in: value to set: < 4 */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ ut_ad(mtr->is_named_space(block->page.id().space()));
+ if (!page_is_leaf(block->frame)) {
+ return;
+ }
+
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(val <= ibuf_index_page_calc_free(block));
+#endif /* UNIV_IBUF_DEBUG */
+ const page_id_t id(block->page.id());
+
+ if (buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+ id, block->zip_size(), mtr)) {
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
+ bitmap_page, id, block->physical_size(),
+ val, mtr);
+ }
+}
+
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+void
+ibuf_set_free_bits_func(
+/*====================*/
+ buf_block_t* block, /*!< in: index page of a non-clustered index;
+ free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+ ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum
+ value which the bits must have before
+ setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+ ulint val) /*!< in: value to set: < 4 */
+{
+ if (!page_is_leaf(block->frame)) {
+ return;
+ }
+
+ mtr_t mtr;
+ mtr.start();
+ const page_id_t id(block->page.id());
+
+ const fil_space_t* space = mtr.set_named_space_id(id.space());
+
+ buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(id,
+ block->zip_size(),
+ &mtr);
+
+ if (space->purpose != FIL_TYPE_TABLESPACE) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+#ifdef UNIV_IBUF_DEBUG
+ if (max_val != ULINT_UNDEFINED) {
+ ulint old_val;
+
+ old_val = ibuf_bitmap_page_get_bits(
+ bitmap_page, id,
+ IBUF_BITMAP_FREE, &mtr);
+ ut_a(old_val <= max_val);
+ }
+
+ ut_a(val <= ibuf_index_page_calc_free(block));
+#endif /* UNIV_IBUF_DEBUG */
+
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
+ bitmap_page, id, block->physical_size(),
+ val, &mtr);
+
+ mtr.commit();
+}
+
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+void
+ibuf_reset_free_bits(
+/*=================*/
+ buf_block_t* block) /*!< in: index page; free bits are set to 0
+ if the index is a non-clustered
+ non-unique, and page level is 0 */
+{
+ ibuf_set_free_bits(block, 0, ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_low(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ ulint max_ins_size, /*!< in: value of
+ maximum insert size
+ with reorganize before
+ the latest operation
+ performed to the page */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ ulint before;
+ ulint after;
+
+ ut_a(!is_buf_block_get_page_zip(block));
+ ut_ad(mtr->is_named_space(block->page.id().space()));
+
+ before = ibuf_index_page_calc_free_bits(srv_page_size,
+ max_ins_size);
+
+ after = ibuf_index_page_calc_free(block);
+
+ /* This approach cannot be used on compressed pages, since the
+ computed value of "before" often does not match the current
+ state of the bitmap. This is because the free space may
+ increase or decrease when a compressed page is reorganized. */
+ if (before != after) {
+ ibuf_set_free_bits_low(block, after, mtr);
+ }
+}
+
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+ buf_block_t* block, /*!< in/out: index page */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ ut_ad(page_is_leaf(block->frame));
+ ut_ad(block->zip_size());
+
+ ulint after = ibuf_index_page_calc_free_zip(block);
+
+ if (after == 0) {
+		/* Move the page to the front of the buffer pool LRU
+		list, to prevent pages to which we can no longer buffer
+		inserts from slipping out of the buffer pool. */
+
+ buf_page_make_young(&block->page);
+ }
+
+ if (buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+ block->page.id(), block->zip_size(), mtr)) {
+
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
+ bitmap_page, block->page.id(),
+ block->physical_size(), after, mtr);
+ }
+}
+
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page. It is safe to set the free bits in the same
+mini-transaction that updated the pages. */
+void
+ibuf_update_free_bits_for_two_pages_low(
+/*====================================*/
+ buf_block_t* block1, /*!< in: index page */
+ buf_block_t* block2, /*!< in: index page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint state;
+
+ ut_ad(mtr->is_named_space(block1->page.id().space()));
+ ut_ad(block1->page.id().space() == block2->page.id().space());
+
+ /* As we have to x-latch two random bitmap pages, we have to acquire
+ the bitmap mutex to prevent a deadlock with a similar operation
+ performed by another OS thread. */
+
+ mutex_enter(&ibuf_bitmap_mutex);
+
+ state = ibuf_index_page_calc_free(block1);
+
+ ibuf_set_free_bits_low(block1, state, mtr);
+
+ state = ibuf_index_page_calc_free(block2);
+
+ ibuf_set_free_bits_low(block2, state, mtr);
+
+ mutex_exit(&ibuf_bitmap_mutex);
+}
+
+/** Returns TRUE if the page is one of the fixed address ibuf pages.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return TRUE if a fixed address ibuf i/o page */
+inline bool ibuf_fixed_addr_page(const page_id_t page_id, ulint zip_size)
+{
+ return(page_id == page_id_t(IBUF_SPACE_ID, IBUF_TREE_ROOT_PAGE_NO)
+ || ibuf_bitmap_page(page_id, zip_size));
+}
+
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==true.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] x_latch FALSE if relaxed check (avoid latching the
+bitmap page)
+@param[in] file file name
+@param[in] line line where called
+@param[in,out] mtr mtr which will contain an x-latch to the
+bitmap page if the page is not one of the fixed address ibuf pages, or NULL,
+in which case a temporary mini-transaction is created and committed internally.
+@return TRUE if level 2 or level 3 page */
+bool
+ibuf_page_low(
+ const page_id_t page_id,
+ ulint zip_size,
+#ifdef UNIV_DEBUG
+ bool x_latch,
+#endif /* UNIV_DEBUG */
+ const char* file,
+ unsigned line,
+ mtr_t* mtr)
+{
+ ibool ret;
+ mtr_t local_mtr;
+
+ ut_ad(!recv_no_ibuf_operations);
+ ut_ad(x_latch || mtr == NULL);
+
+ if (ibuf_fixed_addr_page(page_id, zip_size)) {
+ return(true);
+ } else if (page_id.space() != IBUF_SPACE_ID) {
+ return(false);
+ }
+
+ compile_time_assert(IBUF_SPACE_ID == 0);
+ ut_ad(fil_system.sys_space->purpose == FIL_TYPE_TABLESPACE);
+
+#ifdef UNIV_DEBUG
+ if (!x_latch) {
+ mtr_start(&local_mtr);
+
+ /* Get the bitmap page without a page latch, so that
+ we will not be violating the latching order when
+ another bitmap page has already been latched by this
+ thread. The page will be buffer-fixed, and thus it
+ cannot be removed or relocated while we are looking at
+ it. The contents of the page could change, but the
+ IBUF_BITMAP_IBUF bit that we are interested in should
+ not be modified by any other thread. Nobody should be
+ calling ibuf_add_free_page() or ibuf_remove_free_page()
+ while the page is linked to the insert buffer b-tree. */
+ dberr_t err = DB_SUCCESS;
+
+ buf_block_t* block = buf_page_get_gen(
+ ibuf_bitmap_page_no_calc(page_id, zip_size),
+ zip_size, RW_NO_LATCH, NULL, BUF_GET_NO_LATCH,
+ file, line, &local_mtr, &err);
+
+ ret = ibuf_bitmap_page_get_bits_low(
+ block->frame, page_id, zip_size,
+ MTR_MEMO_BUF_FIX, &local_mtr, IBUF_BITMAP_IBUF);
+
+ mtr_commit(&local_mtr);
+ return(ret);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (mtr == NULL) {
+ mtr = &local_mtr;
+ mtr_start(mtr);
+ }
+
+ ret = ibuf_bitmap_page_get_bits(ibuf_bitmap_get_map_page_func(
+ page_id, zip_size, file, line,
+ mtr)->frame,
+ page_id, zip_size,
+ IBUF_BITMAP_IBUF, mtr);
+
+ if (mtr == &local_mtr) {
+ mtr_commit(mtr);
+ }
+
+ return(ret);
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(mtr,rec)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(rec)
+#endif /* UNIV_DEBUG */
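+
+/* In debug builds the wrapper macro passes the mini-transaction so that
+the function can assert that the caller still holds a latch on the page
+containing the record; in release builds the parameter is compiled away.
+The same pattern is used for the other ibuf record accessors below. */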
+
+/********************************************************************//**
+Returns the page number field of an ibuf record.
+@return page number */
+static
+uint32_t
+ibuf_rec_get_page_no_func(
+/*======================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* rec) /*!< in: ibuf record */
+{
+ const byte* field;
+ ulint len;
+
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(rec_get_n_fields_old(rec) > 2);
+
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
+
+ ut_a(len == 1);
+
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len);
+
+ ut_a(len == 4);
+
+ return(mach_read_from_4(field));
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(mtr,rec)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(rec)
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Returns the space id field of an ibuf record. For < 4.1.x format records
+returns 0.
+@return space id */
+static
+uint32_t
+ibuf_rec_get_space_func(
+/*====================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* rec) /*!< in: ibuf record */
+{
+ const byte* field;
+ ulint len;
+
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(rec_get_n_fields_old(rec) > 2);
+
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
+
+ ut_a(len == 1);
+
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
+
+ ut_a(len == 4);
+
+ return(mach_read_from_4(field));
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \
+ ibuf_rec_get_info_func(mtr,rec,op,comp,info_len,counter)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \
+ ibuf_rec_get_info_func(rec,op,comp,info_len,counter)
+#endif
+/****************************************************************//**
+Get various information about an ibuf record in >= 4.1.x format. */
+static
+void
+ibuf_rec_get_info_func(
+/*===================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* rec, /*!< in: ibuf record */
+ ibuf_op_t* op, /*!< out: operation type, or NULL */
+ ibool* comp, /*!< out: compact flag, or NULL */
+ ulint* info_len, /*!< out: length of info fields at the
+ start of the fourth field, or
+ NULL */
+ ulint* counter) /*!< out: counter value, or NULL */
+{
+ const byte* types;
+ ulint fields;
+ ulint len;
+
+ /* Local variables to shadow arguments. */
+ ibuf_op_t op_local;
+ ibool comp_local;
+ ulint info_len_local;
+ ulint counter_local;
+
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+ fields = rec_get_n_fields_old(rec);
+ ut_a(fields > IBUF_REC_FIELD_USER);
+
+ types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
+
+ info_len_local = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+ compile_time_assert(IBUF_REC_INFO_SIZE
+ < DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
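+
+ /* The metadata field consists of an optional info prefix followed
+ by one DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE entry per user field.
+ Because the prefix is strictly shorter than one entry (asserted
+ above), the remainder computed above recovers the prefix length
+ unambiguously. */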
+
+ switch (info_len_local) {
+ case 0:
+ case 1:
+ op_local = IBUF_OP_INSERT;
+ comp_local = info_len_local;
+ ut_ad(!counter);
+ counter_local = ULINT_UNDEFINED;
+ break;
+
+ case IBUF_REC_INFO_SIZE:
+ op_local = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE];
+ comp_local = types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT;
+ counter_local = mach_read_from_2(
+ types + IBUF_REC_OFFSET_COUNTER);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ ut_a(op_local < IBUF_OP_COUNT);
+ ut_a((len - info_len_local) ==
+ (fields - IBUF_REC_FIELD_USER)
+ * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ if (op) {
+ *op = op_local;
+ }
+
+ if (comp) {
+ *comp = comp_local;
+ }
+
+ if (info_len) {
+ *info_len = info_len_local;
+ }
+
+ if (counter) {
+ *counter = counter_local;
+ }
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(mtr,rec)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(rec)
+#endif
+
+/****************************************************************//**
+Returns the operation type field of an ibuf record.
+@return operation type */
+static
+ibuf_op_t
+ibuf_rec_get_op_type_func(
+/*======================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* rec) /*!< in: ibuf record */
+{
+ ulint len;
+
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(rec_get_n_fields_old(rec) > 2);
+
+ (void) rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
+
+ if (len > 1) {
+ /* This is a < 4.1.x format record */
+
+ return(IBUF_OP_INSERT);
+ } else {
+ ibuf_op_t op;
+
+ ibuf_rec_get_info(mtr, rec, &op, NULL, NULL, NULL);
+
+ return(op);
+ }
+}
+
+/****************************************************************//**
+Read the first two bytes from a record's fourth field (counter field in new
+records; something else in older records).
+@return "counter" field, or ULINT_UNDEFINED if for some reason it
+can't be read */
+ulint
+ibuf_rec_get_counter(
+/*=================*/
+ const rec_t* rec) /*!< in: ibuf record */
+{
+ const byte* ptr;
+ ulint len;
+
+ if (rec_get_n_fields_old(rec) <= IBUF_REC_FIELD_METADATA) {
+
+ return(ULINT_UNDEFINED);
+ }
+
+ ptr = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
+
+ if (len >= 2) {
+
+ return(mach_read_from_2(ptr));
+ } else {
+
+ return(ULINT_UNDEFINED);
+ }
+}
+
+
+/**
+ Add accumulated operation counts to a permanent array.
+ Both arrays must be of size IBUF_OP_COUNT.
+*/
+static void ibuf_add_ops(Atomic_counter<ulint> *out, const ulint *in)
+{
+ for (auto i = 0; i < IBUF_OP_COUNT; i++)
+ out[i]+= in[i];
+}
+
+
+/****************************************************************//**
+Print operation counts. The array must be of size IBUF_OP_COUNT. */
+static
+void
+ibuf_print_ops(
+/*===========*/
+ const Atomic_counter<ulint>* ops, /*!< in: operation counts */
+ FILE* file) /*!< in: file where to print */
+{
+ static const char* op_names[] = {
+ "insert",
+ "delete mark",
+ "delete"
+ };
+ ulint i;
+
+ ut_a(UT_ARR_SIZE(op_names) == IBUF_OP_COUNT);
+
+ for (i = 0; i < IBUF_OP_COUNT; i++) {
+ fprintf(file, "%s " ULINTPF "%s", op_names[i],
+ ulint{ops[i]}, (i < (IBUF_OP_COUNT - 1)) ? ", " : "");
+ }
+
+ putc('\n', file);
+}
+
+/********************************************************************//**
+Creates a dummy index for inserting a record to a non-clustered index.
+@return dummy index */
+static
+dict_index_t*
+ibuf_dummy_index_create(
+/*====================*/
+ ulint n, /*!< in: number of fields */
+ ibool comp) /*!< in: TRUE=use compact record format */
+{
+ dict_table_t* table;
+ dict_index_t* index;
+
+ table = dict_mem_table_create("IBUF_DUMMY", NULL, n, 0,
+ comp ? DICT_TF_COMPACT : 0, 0);
+
+ index = dict_mem_index_create(table, "IBUF_DUMMY", 0, n);
+
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ index->cached = TRUE;
+ ut_d(index->is_dummy = true);
+
+ return(index);
+}
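+
+/* The dummy index exists only so that generic record routines such as
+rec_get_converted_size() can be applied to entries whose column types
+are decoded from the buffered record itself, without consulting the
+data dictionary. */
+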
+/********************************************************************//**
+Add a column to the dummy index */
+static
+void
+ibuf_dummy_index_add_col(
+/*=====================*/
+ dict_index_t* index, /*!< in: dummy index */
+ const dtype_t* type, /*!< in: the data type of the column */
+ ulint len) /*!< in: length of the column */
+{
+ ulint i = index->table->n_def;
+ dict_mem_table_add_col(index->table, NULL, NULL,
+ dtype_get_mtype(type),
+ dtype_get_prtype(type),
+ dtype_get_len(type));
+ dict_index_add_col(index, index->table,
+ dict_table_get_nth_col(index->table, i), len);
+}
+/********************************************************************//**
+Deallocates a dummy index for inserting a record to a non-clustered index. */
+static
+void
+ibuf_dummy_index_free(
+/*==================*/
+ dict_index_t* index) /*!< in, own: dummy index */
+{
+ dict_table_t* table = index->table;
+
+ dict_mem_index_free(index);
+ dict_mem_table_free(table);
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \
+ ibuf_build_entry_from_ibuf_rec_func(mtr,ibuf_rec,heap,pindex)
+#else /* UNIV_DEBUG */
+# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \
+ ibuf_build_entry_from_ibuf_rec_func(ibuf_rec,heap,pindex)
+#endif
+
+/*********************************************************************//**
+Builds the entry used to
+
+1) IBUF_OP_INSERT: insert into a non-clustered index
+
+2) IBUF_OP_DELETE_MARK: find the record whose delete-mark flag we need to
+ activate
+
+3) IBUF_OP_DELETE: find the record we need to delete
+
+when we have the corresponding record in an ibuf index.
+
+NOTE that as we copy pointers to fields in ibuf_rec, the caller must
+hold a latch to the ibuf_rec page as long as the entry is used!
+
+@return own: entry to insert to a non-clustered index */
+static
+dtuple_t*
+ibuf_build_entry_from_ibuf_rec_func(
+/*================================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* ibuf_rec, /*!< in: record in an insert buffer */
+ mem_heap_t* heap, /*!< in: heap where built */
+ dict_index_t** pindex) /*!< out, own: dummy index that
+ describes the entry */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ ulint n_fields;
+ const byte* types;
+ const byte* data;
+ ulint len;
+ ulint info_len;
+ ulint i;
+ ulint comp;
+ dict_index_t* index;
+
+ ut_ad(mtr->memo_contains_page_flagged(ibuf_rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+
+ data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
+
+ ut_a(len == 1);
+ ut_a(*data == 0);
+ ut_a(rec_get_n_fields_old(ibuf_rec) > IBUF_REC_FIELD_USER);
+
+ n_fields = rec_get_n_fields_old(ibuf_rec) - IBUF_REC_FIELD_USER;
+
+ tuple = dtuple_create(heap, n_fields);
+
+ types = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
+
+ ibuf_rec_get_info(mtr, ibuf_rec, NULL, &comp, &info_len, NULL);
+
+ index = ibuf_dummy_index_create(n_fields, comp);
+
+ len -= info_len;
+ types += info_len;
+
+ ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = rec_get_nth_field_old(
+ ibuf_rec, i + IBUF_REC_FIELD_USER, &len);
+
+ dfield_set_data(field, data, len);
+
+ dtype_new_read_for_order_and_null_size(
+ dfield_get_type(field),
+ types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ ibuf_dummy_index_add_col(index, dfield_get_type(field), len);
+ }
+
+ index->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
+
+ /* Prevent an ut_ad() failure in page_zip_write_rec() by
+ adding system columns to the dummy table pointed to by the
+ dummy secondary index. The insert buffer is only used for
+ secondary indexes, whose records never contain any system
+ columns, such as DB_TRX_ID. */
+ ut_d(dict_table_add_system_columns(index->table, index->table->heap));
+
+ *pindex = index;
+
+ return(tuple);
+}
+
+/******************************************************************//**
+Get the data size.
+@return size of fields */
+UNIV_INLINE
+ulint
+ibuf_rec_get_size(
+/*==============*/
+ const rec_t* rec, /*!< in: ibuf record */
+ const byte* types, /*!< in: fields */
+ ulint n_fields, /*!< in: number of fields */
+ ulint comp) /*!< in: 0=ROW_FORMAT=REDUNDANT,
+ nonzero=ROW_FORMAT=COMPACT */
+{
+ ulint i;
+ ulint field_offset;
+ ulint types_offset;
+ ulint size = 0;
+
+ field_offset = IBUF_REC_FIELD_USER;
+ types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+
+ for (i = 0; i < n_fields; i++) {
+ ulint len;
+ dtype_t dtype;
+
+ rec_get_nth_field_offs_old(rec, i + field_offset, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ size += len;
+ } else {
+ dtype_new_read_for_order_and_null_size(&dtype, types);
+
+ size += dtype_get_sql_null_size(&dtype, comp);
+ }
+
+ types += types_offset;
+ }
+
+ return(size);
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(mtr,rec)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(rec)
+#endif
+
+/********************************************************************//**
+Returns the space taken by a stored non-clustered index entry if converted to
+an index record.
+@return size of index record in bytes + an upper limit of the space
+taken in the page directory */
+static
+ulint
+ibuf_rec_get_volume_func(
+/*=====================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* ibuf_rec)/*!< in: ibuf record */
+{
+ ulint len;
+ const byte* data;
+ const byte* types;
+ ulint n_fields;
+ ulint data_size;
+ ulint comp;
+ ibuf_op_t op;
+ ulint info_len;
+
+ ut_ad(mtr->memo_contains_page_flagged(ibuf_rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(rec_get_n_fields_old(ibuf_rec) > 2);
+
+ data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
+ ut_a(len == 1);
+ ut_a(*data == 0);
+
+ types = rec_get_nth_field_old(
+ ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
+
+ ibuf_rec_get_info(mtr, ibuf_rec, &op, &comp, &info_len, NULL);
+
+ if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) {
+ /* Delete-marking a record doesn't take any
+ additional space, and while deleting a record
+ actually frees up space, we have to play it safe and
+ pretend it takes no additional space (the record
+ might not exist, etc.). */
+
+ return(0);
+ } else if (comp) {
+ dtuple_t* entry;
+ ulint volume;
+ dict_index_t* dummy_index;
+ mem_heap_t* heap = mem_heap_create(500);
+
+ entry = ibuf_build_entry_from_ibuf_rec(mtr, ibuf_rec,
+ heap, &dummy_index);
+
+ volume = rec_get_converted_size(dummy_index, entry, 0);
+
+ ibuf_dummy_index_free(dummy_index);
+ mem_heap_free(heap);
+
+ return(volume + page_dir_calc_reserved_space(1));
+ }
+
+ types += info_len;
+ n_fields = rec_get_n_fields_old(ibuf_rec)
+ - IBUF_REC_FIELD_USER;
+
+ data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, comp);
+
+ return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0)
+ + page_dir_calc_reserved_space(1));
+}
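+
+/* For ROW_FORMAT=COMPACT the converted record size depends on the
+variable-length record header, so above we build a dummy entry and let
+rec_get_converted_size() compute it; for ROW_FORMAT=REDUNDANT the size
+can be computed directly from the stored type information. */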
+
+/*********************************************************************//**
+Builds the tuple to insert to an ibuf tree when we have an entry for a
+non-clustered index.
+
+NOTE that the original entry must be kept because we copy pointers to
+its fields.
+
+@return own: entry to insert into an ibuf index tree */
+static
+dtuple_t*
+ibuf_entry_build(
+/*=============*/
+ ibuf_op_t op, /*!< in: operation type */
+ dict_index_t* index, /*!< in: non-clustered index */
+ const dtuple_t* entry, /*!< in: entry for a non-clustered index */
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: index page number where entry should
+ be inserted */
+ ulint counter,/*!< in: counter value;
+ ULINT_UNDEFINED=not used */
+ mem_heap_t* heap) /*!< in: heap into which to build */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ const dfield_t* entry_field;
+ ulint n_fields;
+ byte* buf;
+ byte* ti;
+ byte* type_info;
+ ulint i;
+
+ ut_ad(counter != ULINT_UNDEFINED || op == IBUF_OP_INSERT);
+ ut_ad(counter == ULINT_UNDEFINED || counter <= 0xFFFF);
+ ut_ad(op < IBUF_OP_COUNT);
+
+ /* We have to build a tuple with the following fields:
+
+ 1-4) These are described at the top of this file.
+
+ 5) The rest of the fields are copied from the entry.
+
+ All fields in the tuple are ordered like the type binary in our
+ insert buffer tree. */
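+
+ /* The resulting tuple layout, in field order:
+ IBUF_REC_FIELD_SPACE: 4-byte big-endian space id
+ IBUF_REC_FIELD_MARKER: one zero byte marking the new format
+ IBUF_REC_FIELD_PAGE: 4-byte big-endian page number
+ IBUF_REC_FIELD_METADATA: optional info bytes followed by the
+ per-field type information
+ IBUF_REC_FIELD_USER...: the fields copied from entry */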
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ tuple = dtuple_create(heap, n_fields + IBUF_REC_FIELD_USER);
+
+ /* 1) Space Id */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ mach_write_to_4(buf, space);
+
+ dfield_set_data(field, buf, 4);
+
+ /* 2) Marker byte */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
+
+ /* We set the marker byte to zero */
+
+ mach_write_to_1(buf, 0);
+
+ dfield_set_data(field, buf, 1);
+
+ /* 3) Page number */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ mach_write_to_4(buf, page_no);
+
+ dfield_set_data(field, buf, 4);
+
+ /* 4) Type info, part #1 */
+
+ if (counter == ULINT_UNDEFINED) {
+ i = dict_table_is_comp(index->table) ? 1 : 0;
+ } else {
+ ut_ad(counter <= 0xFFFF);
+ i = IBUF_REC_INFO_SIZE;
+ }
+
+ ti = type_info = static_cast<byte*>(
+ mem_heap_alloc(
+ heap,
+ i + n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE));
+
+ switch (i) {
+ default:
+ ut_error;
+ break;
+ case 1:
+ /* set the flag for ROW_FORMAT=COMPACT */
+ *ti++ = 0;
+ /* fall through */
+ case 0:
+ /* the old format does not allow delete buffering */
+ ut_ad(op == IBUF_OP_INSERT);
+ break;
+ case IBUF_REC_INFO_SIZE:
+ mach_write_to_2(ti + IBUF_REC_OFFSET_COUNTER, counter);
+
+ ti[IBUF_REC_OFFSET_TYPE] = (byte) op;
+ ti[IBUF_REC_OFFSET_FLAGS] = dict_table_is_comp(index->table)
+ ? IBUF_REC_COMPACT : 0;
+ ti += IBUF_REC_INFO_SIZE;
+ break;
+ }
+
+ /* 5+) Fields from the entry */
+
+ for (i = 0; i < n_fields; i++) {
+ ulint fixed_len;
+ const dict_field_t* ifield;
+
+ field = dtuple_get_nth_field(tuple, i + IBUF_REC_FIELD_USER);
+ entry_field = dtuple_get_nth_field(entry, i);
+ dfield_copy(field, entry_field);
+
+ ifield = dict_index_get_nth_field(index, i);
+ /* Prefix index columns of fixed-length columns are of
+ fixed length. However, in the function call below,
+ dfield_get_type(entry_field) contains the fixed length
+ of the column in the clustered index. Replace it with
+ the fixed length of the secondary index column. */
+ fixed_len = ifield->fixed_len;
+
+#ifdef UNIV_DEBUG
+ if (fixed_len) {
+ /* dict_index_add_col() should guarantee these */
+ ut_ad(fixed_len <= (ulint)
+ dfield_get_type(entry_field)->len);
+ if (ifield->prefix_len) {
+ ut_ad(ifield->prefix_len == fixed_len);
+ } else {
+ ut_ad(fixed_len == (ulint)
+ dfield_get_type(entry_field)->len);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+ dtype_new_store_for_order_and_null_size(
+ ti, dfield_get_type(entry_field), fixed_len);
+ ti += DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+ }
+
+ /* 4) Type info, part #2 */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_METADATA);
+
+ dfield_set_data(field, type_info, ulint(ti - type_info));
+
+ /* Set all the types in the new tuple binary */
+
+ dtuple_set_types_binary(tuple, n_fields + IBUF_REC_FIELD_USER);
+
+ return(tuple);
+}
+
+/*********************************************************************//**
+Builds a search tuple used to search buffered inserts for an index page.
+This is for >= 4.1.x format records.
+@return own: search tuple */
+static
+dtuple_t*
+ibuf_search_tuple_build(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: index page number */
+ mem_heap_t* heap) /*!< in: heap into which to build */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ byte* buf;
+
+ tuple = dtuple_create(heap, IBUF_REC_FIELD_METADATA);
+
+ /* Store the space id in tuple */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ mach_write_to_4(buf, space);
+
+ dfield_set_data(field, buf, 4);
+
+ /* Store the new format record marker byte */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
+
+ mach_write_to_1(buf, 0);
+
+ dfield_set_data(field, buf, 1);
+
+ /* Store the page number in tuple */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ mach_write_to_4(buf, page_no);
+
+ dfield_set_data(field, buf, 4);
+
+ dtuple_set_types_binary(tuple, IBUF_REC_FIELD_METADATA);
+
+ return(tuple);
+}
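+
+/* The search tuple contains only the first IBUF_REC_FIELD_METADATA
+fields (space id, marker, page number), so a PAGE_CUR_GE search
+positions the cursor at the first buffered entry for the given page,
+regardless of the contents of the buffered records. */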
+
+/*********************************************************************//**
+Checks if there are enough pages in the free list of the ibuf tree that we
+dare to start a pessimistic insert to the insert buffer.
+@return whether enough free pages in list */
+static inline bool ibuf_data_enough_free_for_insert()
+{
+ ut_ad(mutex_own(&ibuf_mutex));
+
+ /* We want a big margin of free pages, because a B-tree can
+ sometimes grow in size even when records are deleted from it, as
+ the node pointers can change. We must also make sure that we are
+ able to delete the inserts buffered for pages that we read to the
+ buffer pool, without any risk of running out of free space in the
+ insert buffer. */
+
+ return(ibuf.free_list_len >= (ibuf.size / 2) + 3 * ibuf.height);
+}
+
+/*********************************************************************//**
+Checks whether there are so many pages in the free list of the ibuf tree
+that we should remove some of them and return them to the file space
+management.
+@return TRUE if there are excess free pages in the list */
+UNIV_INLINE
+ibool
+ibuf_data_too_much_free(void)
+/*=========================*/
+{
+ ut_ad(mutex_own(&ibuf_mutex));
+
+ return(ibuf.free_list_len >= 3 + (ibuf.size / 2) + 3 * ibuf.height);
+}
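+
+/* Note the hysteresis between the two thresholds above:
+ibuf_data_too_much_free() requires three more free pages than
+ibuf_data_enough_free_for_insert(), so the free list does not
+oscillate between allocating and freeing pages at the boundary. */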
+
+/** Allocate a change buffer page.
+@retval true on success
+@retval false if no space left */
+static bool ibuf_add_free_page()
+{
+ mtr_t mtr;
+ page_t* header_page;
+ buf_block_t* block;
+
+ mtr.start();
+ /* Acquire the fsp latch before the ibuf header, obeying the latching
+ order */
+ mtr_x_lock_space(fil_system.sys_space, &mtr);
+ header_page = ibuf_header_page_get(&mtr);
+
+ /* Allocate a new page: NOTE that if the page has been a part of a
+ non-clustered index which has subsequently been dropped, then the
+ page may have buffered inserts in the insert buffer, and these
+ should be deleted from there. These get deleted when the page
+ allocation creates the page in buffer. Thus the call below may end
+ up calling the insert buffer routines and, as we do not yet hold any
+ latches on insert buffer tree pages, these routines can run without a risk
+ of a deadlock. This is the reason why we created a special ibuf
+ header page apart from the ibuf tree. */
+
+ block = fseg_alloc_free_page(
+ header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP,
+ &mtr);
+
+ if (block == NULL) {
+ mtr.commit();
+ return false;
+ }
+
+ ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
+ ibuf_enter(&mtr);
+ mutex_enter(&ibuf_mutex);
+
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
+
+ mtr.write<2>(*block, block->frame + FIL_PAGE_TYPE,
+ FIL_PAGE_IBUF_FREE_LIST);
+
+ /* Add the page to the free list and update the ibuf size data */
+
+ flst_add_last(ibuf_tree_root_get(&mtr),
+ PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
+
+ ibuf.seg_size++;
+ ibuf.free_list_len++;
+
+ /* Set the bit indicating that this page is now an ibuf tree page
+ (level 2 page) */
+
+ const page_id_t page_id(block->page.id());
+ buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
+
+ mutex_exit(&ibuf_mutex);
+
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(bitmap_page, page_id,
+ srv_page_size, true,
+ &mtr);
+
+ ibuf_mtr_commit(&mtr);
+
+ return true;
+}
+
+/*********************************************************************//**
+Removes a page from the free list and frees it to the fsp system. */
+static
+void
+ibuf_remove_free_page(void)
+/*=======================*/
+{
+ mtr_t mtr;
+ mtr_t mtr2;
+ page_t* header_page;
+
+ log_free_check();
+
+ mtr_start(&mtr);
+ /* Acquire the fsp latch before the ibuf header, obeying the latching
+ order */
+
+ mtr_x_lock_space(fil_system.sys_space, &mtr);
+ header_page = ibuf_header_page_get(&mtr);
+
+ /* Prevent pessimistic inserts into insert buffer trees for a while */
+ ibuf_enter(&mtr);
+ mutex_enter(&ibuf_pessimistic_insert_mutex);
+ mutex_enter(&ibuf_mutex);
+
+ if (!ibuf_data_too_much_free()) {
+
+ mutex_exit(&ibuf_mutex);
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+
+ ibuf_mtr_commit(&mtr);
+
+ return;
+ }
+
+ ibuf_mtr_start(&mtr2);
+
+ buf_block_t* root = ibuf_tree_root_get(&mtr2);
+
+ mutex_exit(&ibuf_mutex);
+
+ uint32_t page_no = flst_get_last(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST
+ + root->frame).page;
+
+ /* NOTE that we must release the latch on the ibuf tree root
+ because in fseg_free_page we access level 1 pages, and the root
+ is a level 2 page. */
+
+ ibuf_mtr_commit(&mtr2);
+ ibuf_exit(&mtr);
+
+ /* Since pessimistic inserts were prevented, we know that the
+ page is still in the free list. NOTE that deletes may also take
+ pages from the free list, but they take them from the start, and
+ the free list was so long that they cannot have taken the last
+ page from it. */
+
+ compile_time_assert(IBUF_SPACE_ID == 0);
+ fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
+ fil_system.sys_space, page_no, &mtr);
+
+ const page_id_t page_id(IBUF_SPACE_ID, page_no);
+
+ ibuf_enter(&mtr);
+
+ mutex_enter(&ibuf_mutex);
+
+ root = ibuf_tree_root_get(&mtr);
+
+ ut_ad(page_no == flst_get_last(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST
+ + root->frame).page);
+
+ buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
+
+ /* Remove the page from the free list and update the ibuf size data */
+
+ flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
+
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+
+ ibuf.seg_size--;
+ ibuf.free_list_len--;
+
+ /* Set the bit indicating that this page is no longer an ibuf tree page
+ (level 2 page) */
+
+ buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
+
+ mutex_exit(&ibuf_mutex);
+
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(
+ bitmap_page, page_id, srv_page_size, false, &mtr);
+
+ buf_page_free(fil_system.sys_space, page_no, &mtr, __FILE__, __LINE__);
+
+ ibuf_mtr_commit(&mtr);
+}
+
+/***********************************************************************//**
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call. */
+void
+ibuf_free_excess_pages(void)
+/*========================*/
+{
+ /* Free at most a few pages at a time, so that we do not delay the
+ requested service too much */
+
+ for (ulint i = 0; i < 4; i++) {
+
+ ibool too_much_free;
+
+ mutex_enter(&ibuf_mutex);
+ too_much_free = ibuf_data_too_much_free();
+ mutex_exit(&ibuf_mutex);
+
+ if (!too_much_free) {
+ return;
+ }
+
+ ibuf_remove_free_page();
+ }
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_get_merge_page_nos(contract,rec,mtr,ids,pages,n_stored) \
+ ibuf_get_merge_page_nos_func(contract,rec,mtr,ids,pages,n_stored)
+#else /* UNIV_DEBUG */
+# define ibuf_get_merge_page_nos(contract,rec,mtr,ids,pages,n_stored) \
+ ibuf_get_merge_page_nos_func(contract,rec,ids,pages,n_stored)
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Reads page numbers from a leaf in an ibuf tree.
+@return a lower limit for the combined volume of records which will be
+merged */
+static
+ulint
+ibuf_get_merge_page_nos_func(
+/*=========================*/
+ ibool contract,/*!< in: TRUE if this function is called to
+ contract the tree, FALSE if this is called
+ when a single page becomes full and we check
+ whether it pays to also read nearby pages */
+ const rec_t* rec, /*!< in: insert buffer record */
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction holding rec */
+#endif /* UNIV_DEBUG */
+ uint32_t* space_ids,/*!< in/out: space id's of the pages */
+ uint32_t* page_nos,/*!< in/out: buffer for at least
+ IBUF_MAX_N_PAGES_MERGED many page numbers;
+ the page numbers are in an ascending order */
+ ulint* n_stored)/*!< out: number of page numbers stored to
+ page_nos in this function */
+{
+ uint32_t prev_page_no;
+ uint32_t prev_space_id;
+ uint32_t first_page_no;
+ uint32_t first_space_id;
+ uint32_t rec_page_no;
+ uint32_t rec_space_id;
+ ulint sum_volumes;
+ ulint volume_for_page;
+ ulint rec_volume;
+ ulint limit;
+ ulint n_pages;
+
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+
+ *n_stored = 0;
+
+ limit = ut_min(IBUF_MAX_N_PAGES_MERGED,
+ buf_pool_get_curr_size() / 4);
+
+ if (page_rec_is_supremum(rec)) {
+
+ rec = page_rec_get_prev_const(rec);
+ }
+
+ if (page_rec_is_infimum(rec)) {
+
+ rec = page_rec_get_next_const(rec);
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ return(0);
+ }
+
+ first_page_no = ibuf_rec_get_page_no(mtr, rec);
+ first_space_id = ibuf_rec_get_space(mtr, rec);
+ n_pages = 0;
+ prev_page_no = 0;
+ prev_space_id = 0;
+
+ /* Go backwards from the first rec until we reach the border of the
+ 'merge area', or the page start, or the limit of storable pages is
+ reached */
+
+ while (!page_rec_is_infimum(rec) && UNIV_LIKELY(n_pages < limit)) {
+
+ rec_page_no = ibuf_rec_get_page_no(mtr, rec);
+ rec_space_id = ibuf_rec_get_space(mtr, rec);
+
+ if (rec_space_id != first_space_id
+ || (rec_page_no / IBUF_MERGE_AREA)
+ != (first_page_no / IBUF_MERGE_AREA)) {
+
+ break;
+ }
+
+ if (rec_page_no != prev_page_no
+ || rec_space_id != prev_space_id) {
+ n_pages++;
+ }
+
+ prev_page_no = rec_page_no;
+ prev_space_id = rec_space_id;
+
+ rec = page_rec_get_prev_const(rec);
+ }
+
+ rec = page_rec_get_next_const(rec);
+
+ /* At the loop start there is no prev page; we mark this with a pair
+ of space id, page no (0, 0) for which there can never be entries in
+ the insert buffer */
+
+ prev_page_no = 0;
+ prev_space_id = 0;
+ sum_volumes = 0;
+ volume_for_page = 0;
+
+ while (*n_stored < limit) {
+ if (page_rec_is_supremum(rec)) {
+ /* When no more records are available, mark this with
+ another 'impossible' pair of space id, page no */
+ rec_page_no = 1;
+ rec_space_id = 0;
+ } else {
+ rec_page_no = ibuf_rec_get_page_no(mtr, rec);
+ rec_space_id = ibuf_rec_get_space(mtr, rec);
+ /* In the system tablespace the smallest
+ possible secondary index leaf page number is
+ bigger than FSP_DICT_HDR_PAGE_NO (7).
+ In all tablespaces, pages 0 and 1 are reserved
+ for the allocation bitmap and the change
+ buffer bitmap. In file-per-table tablespaces,
+ a file segment inode page will be created at
+ page 2 and the clustered index tree is created
+ at page 3. So for file-per-table tablespaces,
+ page 4 is the smallest possible secondary
+ index leaf page. CREATE TABLESPACE also initially
+ uses pages 2 and 3 for the first created table,
+ but that table may be dropped, allowing page 2
+ to be reused for a secondary index leaf page.
+ To keep this assertion simple, just
+ make sure the page is >= 2. */
+ ut_ad(rec_page_no >= FSP_FIRST_INODE_PAGE_NO);
+ }
+
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED);
+#endif
+ if ((rec_space_id != prev_space_id
+ || rec_page_no != prev_page_no)
+ && (prev_space_id != 0 || prev_page_no != 0)) {
+
+ if (contract
+ || (prev_page_no == first_page_no
+ && prev_space_id == first_space_id)
+ || (volume_for_page
+ > ((IBUF_MERGE_THRESHOLD - 1)
+ * 4U << srv_page_size_shift
+ / IBUF_PAGE_SIZE_PER_FREE_SPACE)
+ / IBUF_MERGE_THRESHOLD)) {
+
+ space_ids[*n_stored] = prev_space_id;
+ page_nos[*n_stored] = prev_page_no;
+
+ (*n_stored)++;
+
+ sum_volumes += volume_for_page;
+ }
+
+ if (rec_space_id != first_space_id
+ || rec_page_no / IBUF_MERGE_AREA
+ != first_page_no / IBUF_MERGE_AREA) {
+
+ break;
+ }
+
+ volume_for_page = 0;
+ }
+
+ if (rec_page_no == 1 && rec_space_id == 0) {
+ /* Supremum record */
+
+ break;
+ }
+
+ rec_volume = ibuf_rec_get_volume(mtr, rec);
+
+ volume_for_page += rec_volume;
+
+ prev_page_no = rec_page_no;
+ prev_space_id = rec_space_id;
+
+ rec = page_rec_get_next_const(rec);
+ }
+
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(*n_stored <= IBUF_MAX_N_PAGES_MERGED);
+#endif
+#if 0
+ fprintf(stderr, "Ibuf merge batch %lu pages %lu volume\n",
+ *n_stored, sum_volumes);
+#endif
+ return(sum_volumes);
+}
+
+/*******************************************************************//**
+Get the matching records for space id.
+@return current rec or NULL */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+const rec_t*
+ibuf_get_user_rec(
+/*===============*/
+ btr_pcur_t* pcur, /*!< in: the current cursor */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ do {
+ const rec_t* rec = btr_pcur_get_rec(pcur);
+
+ if (page_rec_is_user_rec(rec)) {
+ return(rec);
+ }
+ } while (btr_pcur_move_to_next(pcur, mtr));
+
+ return(NULL);
+}
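+
+/* Note that if the cursor is already positioned on a user record, it is
+returned without advancing; the cursor is moved forward only past
+infimum, supremum and other non-user records. */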
+
+/*********************************************************************//**
+Reads page numbers for a space id from an ibuf tree.
+@return a lower limit for the combined volume of records which will be
+merged */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+ulint
+ibuf_get_merge_pages(
+/*=================*/
+ btr_pcur_t* pcur, /*!< in/out: cursor */
+ uint32_t space, /*!< in: space for which to merge */
+ ulint limit, /*!< in: max page numbers to read */
+ uint32_t* pages, /*!< out: pages read */
+ uint32_t* spaces, /*!< out: spaces read */
+ ulint* n_pages,/*!< out: number of pages read */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ const rec_t* rec;
+ ulint volume = 0;
+
+ *n_pages = 0;
+
+ while ((rec = ibuf_get_user_rec(pcur, mtr)) != 0
+ && ibuf_rec_get_space(mtr, rec) == space
+ && *n_pages < limit) {
+
+ uint32_t page_no = ibuf_rec_get_page_no(mtr, rec);
+
+ if (*n_pages == 0 || pages[*n_pages - 1] != page_no) {
+ spaces[*n_pages] = space;
+ pages[*n_pages] = page_no;
+ ++*n_pages;
+ }
+
+ volume += ibuf_rec_get_volume(mtr, rec);
+
+ btr_pcur_move_to_next(pcur, mtr);
+ }
+
+ return(volume);
+}
+
+/**
+Delete a change buffer record.
+@param[in] page_id page identifier
+@param[in,out] pcur persistent cursor positioned on the record
+@param[in] search_tuple search key for (space,page_no)
+@param[in,out] mtr mini-transaction
+@return whether mtr was committed (due to pessimistic operation) */
+static MY_ATTRIBUTE((warn_unused_result, nonnull))
+bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur,
+ const dtuple_t* search_tuple, mtr_t* mtr);
+
+/** Merge the change buffer to some pages. */
+static void ibuf_read_merge_pages(const uint32_t* space_ids,
+ const uint32_t* page_nos, ulint n_stored)
+{
+#ifndef DBUG_OFF
+ mem_heap_t* heap = mem_heap_create(512);
+ ulint dops[IBUF_OP_COUNT];
+ memset(dops, 0, sizeof(dops));
+#endif
+
+ for (ulint i = 0; i < n_stored; i++) {
+ const ulint space_id = space_ids[i];
+ fil_space_t* s = fil_space_t::get(space_id);
+ if (!s) {
+tablespace_deleted:
+ /* The tablespace was not found: remove all
+ entries for it */
+ ibuf_delete_for_discarded_space(space_id);
+ while (i + 1 < n_stored
+ && space_ids[i + 1] == space_id) {
+ i++;
+ }
+ continue;
+ }
+
+ const ulint zip_size = s->zip_size(), size = s->size;
+ s->release();
+ mtr_t mtr;
+
+ if (UNIV_LIKELY(page_nos[i] < size)) {
+ mtr.start();
+ dberr_t err;
+ buf_page_get_gen(page_id_t(space_id, page_nos[i]),
+ zip_size, RW_X_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED,
+ __FILE__, __LINE__, &mtr, &err, true);
+ mtr.commit();
+ if (err == DB_TABLESPACE_DELETED) {
+ goto tablespace_deleted;
+ }
+ }
+#ifndef DBUG_OFF
+ DBUG_EXECUTE_IF("ibuf_merge_corruption", goto work_around;);
+ continue;
+
+ /* The following code works around a hang when the
+ change buffer is corrupted, likely due to the race
+ condition in crash recovery that was fixed in
+ MDEV-24449. But, it also introduces corruption by
+ itself in the following scenario:
+
+ (1) We merged buffered changes in buf_page_get_gen()
+ (2) We committed the mini-transaction
+ (3) The redo log and the page with the merged changes are written
+ (4) A write completion callback thread evicts the page.
+ (5) Other threads buffer changes for that page.
+ (6) We will wrongly discard those newly buffered changes below.
+
+ This code will be available in debug builds, so that
+ users may try to fix a shutdown hang that occurs due
+ to a corrupted change buffer. */
+
+work_around:
+ /* Prevent an infinite loop by removing entries from
+ the change buffer even when the bitmap bits were
+ wrongly clear although buffered changes exist. */
+ const dtuple_t* tuple = ibuf_search_tuple_build(
+ space_id, page_nos[i], heap);
+loop:
+ btr_pcur_t pcur;
+ ibuf_mtr_start(&mtr);
+ btr_pcur_open(ibuf.index, tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ ut_ad(btr_pcur_is_after_last_on_page(&pcur));
+ goto done;
+ }
+
+ for (;;) {
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+
+ const rec_t* ibuf_rec = btr_pcur_get_rec(&pcur);
+ if (ibuf_rec_get_space(&mtr, ibuf_rec) != space_id
+ || ibuf_rec_get_page_no(&mtr, ibuf_rec)
+ != page_nos[i]) {
+ break;
+ }
+
+ dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++;
+ /* Delete the record from ibuf */
+ if (ibuf_delete_rec(page_id_t(space_id, page_nos[i]),
+ &pcur, tuple, &mtr)) {
+ /* Deletion was pessimistic and mtr
+ was committed: we start from the
+ beginning again */
+ ut_ad(mtr.has_committed());
+ goto loop;
+ }
+
+ if (btr_pcur_is_after_last_on_page(&pcur)) {
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+ goto loop;
+ }
+ }
+done:
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+ mem_heap_empty(heap);
+#endif
+ }
+
+#ifndef DBUG_OFF
+ ibuf_add_ops(ibuf.n_discarded_ops, dops);
+ mem_heap_free(heap);
+#endif
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+static
+ulint
+ibuf_merge_pages(
+/*=============*/
+ ulint* n_pages) /*!< out: number of pages to which merged */
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ ulint sum_sizes;
+ uint32_t page_nos[IBUF_MAX_N_PAGES_MERGED];
+ uint32_t space_ids[IBUF_MAX_N_PAGES_MERGED];
+
+ *n_pages = 0;
+
+ ibuf_mtr_start(&mtr);
+
+ /* Open a cursor to a randomly chosen leaf of the tree, at a random
+ position within the leaf */
+ bool available;
+
+ available = btr_pcur_open_at_rnd_pos(ibuf.index, BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ /* No one should make this index unavailable while the server is running */
+ ut_a(available);
+
+ ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
+
+ if (page_is_empty(btr_pcur_get_page(&pcur))) {
+ /* If a B-tree page is empty, it must be the root page
+ and the whole B-tree must be empty. InnoDB does not
+ allow empty B-tree pages other than the root. */
+ ut_ad(ibuf.empty);
+ ut_ad(btr_pcur_get_block(&pcur)->page.id()
+ == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
+
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ return(0);
+ }
+
+ sum_sizes = ibuf_get_merge_page_nos(TRUE,
+ btr_pcur_get_rec(&pcur), &mtr,
+ space_ids,
+ page_nos, n_pages);
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ ibuf_read_merge_pages(space_ids, page_nos, *n_pages);
+
+ return(sum_sizes + 1);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages referring to space_id
+to the buffer pool.
+@return number of pages merged */
+ulint
+ibuf_merge_space(
+/*=============*/
+ ulint space) /*!< in: tablespace id to merge */
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ mem_heap_t* heap = mem_heap_create(512);
+ dtuple_t* tuple = ibuf_search_tuple_build(space, 0, heap);
+ ulint n_pages = 0;
+
+ ut_ad(space < SRV_SPACE_ID_UPPER_BOUND);
+
+ ibuf_mtr_start(&mtr);
+
+ /* Position the cursor on the first matching record. */
+
+ btr_pcur_open(
+ ibuf.index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur,
+ &mtr);
+
+ mem_heap_free(heap);
+
+ ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
+
+ ulint sum_sizes = 0;
+ uint32_t pages[IBUF_MAX_N_PAGES_MERGED];
+ uint32_t spaces[IBUF_MAX_N_PAGES_MERGED];
+
+ if (page_is_empty(btr_pcur_get_page(&pcur))) {
+ /* If a B-tree page is empty, it must be the root page
+ and the whole B-tree must be empty. InnoDB does not
+ allow empty B-tree pages other than the root. */
+ ut_ad(ibuf.empty);
+ ut_ad(btr_pcur_get_block(&pcur)->page.id()
+ == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
+ } else {
+
+ sum_sizes = ibuf_get_merge_pages(
+ &pcur, uint32_t(space), IBUF_MAX_N_PAGES_MERGED,
+ &pages[0], &spaces[0], &n_pages,
+ &mtr);
+ ib::info() << "Size of pages merged " << sum_sizes;
+ }
+
+ ibuf_mtr_commit(&mtr);
+
+ btr_pcur_close(&pcur);
+
+ if (n_pages > 0) {
+ ut_ad(n_pages <= UT_ARR_SIZE(pages));
+
+#ifdef UNIV_DEBUG
+ for (ulint i = 0; i < n_pages; ++i) {
+ ut_ad(spaces[i] == space);
+ }
+#endif /* UNIV_DEBUG */
+
+ ibuf_read_merge_pages(spaces, pages, n_pages);
+ }
+
+ return(n_pages);
+}
+
+/** Contract the change buffer by reading pages to the buffer pool.
+@param[out] n_pages number of pages merged
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+MY_ATTRIBUTE((warn_unused_result))
+static ulint ibuf_merge(ulint* n_pages)
+{
+ *n_pages = 0;
+
+ /* We perform a dirty read of ibuf.empty, without latching
+ the insert buffer root page. We trust this dirty read except
+ when a slow shutdown is being executed. During a slow
+ shutdown, the insert buffer merge must be completed. */
+
+ if (ibuf.empty && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
+ return(0);
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ } else if (ibuf_debug) {
+ return(0);
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+ } else {
+ return ibuf_merge_pages(n_pages);
+ }
+}
+
+/** Contract the change buffer by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is empty */
+static ulint ibuf_contract()
+{
+ ulint n_pages;
+ return ibuf_merge_pages(&n_pages);
+}
+
+/** Contract the change buffer by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+ulint ibuf_merge_all()
+{
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ if (ibuf_debug) {
+ return(0);
+ }
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+ ulint sum_bytes = 0;
+ ulint n_pages = srv_io_capacity;
+
+ for (ulint sum_pages = 0; sum_pages < n_pages; ) {
+ ulint n_pag2;
+ ulint n_bytes = ibuf_merge(&n_pag2);
+
+ if (n_bytes == 0) {
+ break;
+ }
+
+ sum_bytes += n_bytes;
+ /* Advance the loop variable, so that the loop terminates
+ once about srv_io_capacity pages have been merged. */
+ sum_pages += n_pag2;
+ }
+
+ return sum_bytes;
+}
+
+/*********************************************************************//**
+Contract insert buffer trees after insert if they are too big. */
+UNIV_INLINE
+void
+ibuf_contract_after_insert(
+/*=======================*/
+ ulint entry_size) /*!< in: size of a record which was inserted
+ into an ibuf tree */
+{
+ /* Perform dirty reads of ibuf.size and ibuf.max_size, to
+ reduce ibuf_mutex contention. ibuf.max_size remains constant
+ after ibuf_init_at_db_start(), but ibuf.size should be
+ protected by ibuf_mutex. Given that ibuf.size fits in a
+ machine word, this should be OK; at worst we are doing some
+ excessive ibuf_contract() or occasionally skipping a
+ ibuf_contract(). */
+ if (ibuf.size < ibuf.max_size) {
+ return;
+ }
+
+ /* Contract at least entry_size bytes */
+ ulint sum_sizes = 0;
+ ulint size;
+
+ do {
+ size = ibuf_contract();
+ sum_sizes += size;
+ } while (size > 0 && sum_sizes < entry_size);
+}
+
+/** Determine if a change buffer record has been encountered already.
+@param rec change buffer record in the MySQL 5.5 format
+@param hash hash table of encountered records
+@param size number of elements in hash
+@retval true if a distinct record
+@retval false if this may be duplicating an earlier record */
+static bool ibuf_get_volume_buffered_hash(const rec_t *rec, ulint *hash,
+ ulint size)
+{
+ ut_ad(rec_get_n_fields_old(rec) > IBUF_REC_FIELD_USER);
+ const ulint start= rec_get_field_start_offs(rec, IBUF_REC_FIELD_USER);
+ const ulint len= rec_get_data_size_old(rec) - start;
+ const uint32_t fold= ut_crc32(rec + start, len);
+ hash+= (fold / (CHAR_BIT * sizeof *hash)) % size;
+ ulint bitmask= static_cast<ulint>(1) << (fold % (CHAR_BIT * sizeof(*hash)));
+
+ if (*hash & bitmask)
+ return false;
+
+ /* We have not seen this record yet. Remember it. */
+ *hash|= bitmask;
+ return true;
+}
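+
+/* This is in effect a single-hash Bloom filter: a hash collision makes
+us treat a distinct record as a possible duplicate, which can only make
+the estimate of *n_recs smaller. Since n_recs is used as a lower bound
+on the number of records remaining on the page, undercounting errs on
+the safe side. */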
+
+#ifdef UNIV_DEBUG
+# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \
+ ibuf_get_volume_buffered_count_func(mtr,rec,hash,size,n_recs)
+#else /* UNIV_DEBUG */
+# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \
+ ibuf_get_volume_buffered_count_func(rec,hash,size,n_recs)
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Update the estimate of the number of records on a page, and
+get the space taken by merging the buffered record to the index page.
+@return size of index record in bytes + an upper limit of the space
+taken in the page directory */
+static
+ulint
+ibuf_get_volume_buffered_count_func(
+/*================================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* rec, /*!< in: insert buffer record */
+ ulint* hash, /*!< in/out: hash array */
+ ulint size, /*!< in: number of elements in hash array */
+ lint* n_recs) /*!< in/out: estimated number of records
+ on the page that rec points to */
+{
+ ulint len;
+ ibuf_op_t ibuf_op;
+ const byte* types;
+ ulint n_fields;
+
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+
+ n_fields = rec_get_n_fields_old(rec);
+ ut_ad(n_fields > IBUF_REC_FIELD_USER);
+ n_fields -= IBUF_REC_FIELD_USER;
+
+ rec_get_nth_field_offs_old(rec, 1, &len);
+ /* This function is only invoked when buffering new
+ operations. All pre-4.1 records should have been merged
+ when the database was started up. */
+ ut_a(len == 1);
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ /* This record has been merged already,
+ but apparently the system crashed before
+ the change was discarded from the buffer.
+ Pretend that the record does not exist. */
+ return(0);
+ }
+
+ types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
+
+ switch (UNIV_EXPECT(int(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE),
+ IBUF_REC_INFO_SIZE)) {
+ default:
+ ut_error;
+ case 0:
+ /* This ROW_FORMAT=REDUNDANT record does not include an
+ operation counter. Exclude it from the *n_recs,
+ because deletes cannot be buffered if there are
+ old-style inserts buffered for the page. */
+
+ len = ibuf_rec_get_size(rec, types, n_fields, 0);
+
+ return(len
+ + rec_get_converted_extra_size(len, n_fields, 0)
+ + page_dir_calc_reserved_space(1));
+ case 1:
+ /* This ROW_FORMAT=COMPACT record does not include an
+ operation counter. Exclude it from the *n_recs,
+ because deletes cannot be buffered if there are
+ old-style inserts buffered for the page. */
+ goto get_volume_comp;
+
+ case IBUF_REC_INFO_SIZE:
+ ibuf_op = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE];
+ break;
+ }
+
+ switch (ibuf_op) {
+ case IBUF_OP_INSERT:
+ /* Inserts can be done by updating a delete-marked record.
+ Because delete-mark and insert operations can be pointing to
+ the same records, we must not count duplicates. */
+ case IBUF_OP_DELETE_MARK:
+ /* There must be a record to delete-mark.
+ See if this record has been already buffered. */
+ if (n_recs && ibuf_get_volume_buffered_hash(rec, hash, size)) {
+ (*n_recs)++;
+ }
+
+ if (ibuf_op == IBUF_OP_DELETE_MARK) {
+ /* Setting the delete-mark flag does not
+ affect the available space on the page. */
+ return(0);
+ }
+ break;
+ case IBUF_OP_DELETE:
+ /* A record will be removed from the page. */
+ if (n_recs) {
+ (*n_recs)--;
+ }
+ /* While deleting a record actually frees up space,
+ we have to play it safe and pretend that it takes no
+ additional space (the record might not exist, etc.). */
+ return(0);
+ default:
+ ut_error;
+ }
+
+ ut_ad(ibuf_op == IBUF_OP_INSERT);
+
+get_volume_comp:
+ {
+ dtuple_t* entry;
+ ulint volume;
+ dict_index_t* dummy_index;
+ mem_heap_t* heap = mem_heap_create(500);
+
+ entry = ibuf_build_entry_from_ibuf_rec(
+ mtr, rec, heap, &dummy_index);
+
+ volume = rec_get_converted_size(dummy_index, entry, 0);
+
+ ibuf_dummy_index_free(dummy_index);
+ mem_heap_free(heap);
+
+ return(volume + page_dir_calc_reserved_space(1));
+ }
+}
+
+/*********************************************************************//**
+Gets an upper limit for the combined size of entries buffered in the insert
+buffer for a given page.
+@return upper limit for the volume of buffered inserts for the index
+page, in bytes; srv_page_size, if the entries for the index page span
+several pages in the insert buffer */
+static
+ulint
+ibuf_get_volume_buffered(
+/*=====================*/
+ const btr_pcur_t*pcur, /*!< in: pcur positioned at a place in an
+ insert buffer tree where we would insert an
+ entry for the index page whose number is
+ page_no, latch mode has to be BTR_MODIFY_PREV
+ or BTR_MODIFY_TREE */
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: page number of an index page */
+ lint* n_recs, /*!< in/out: minimum number of records on the
+ page after the buffered changes have been
+ applied, or NULL to disable the counting */
+ mtr_t* mtr) /*!< in: mini-transaction of pcur */
+{
+ ulint volume;
+ const rec_t* rec;
+ const page_t* page;
+ const page_t* prev_page;
+ const page_t* next_page;
+ /* bitmap of buffered recs */
+ ulint hash_bitmap[128 / sizeof(ulint)];
+
+ ut_ad((pcur->latch_mode == BTR_MODIFY_PREV)
+ || (pcur->latch_mode == BTR_MODIFY_TREE));
+
+ /* Count the volume of inserts earlier in the alphabetical order than
+ pcur */
+
+ volume = 0;
+
+ if (n_recs) {
+ memset(hash_bitmap, 0, sizeof hash_bitmap);
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ page = page_align(rec);
+ ut_ad(page_validate(page, ibuf.index));
+
+ if (page_rec_is_supremum(rec)) {
+ rec = page_rec_get_prev_const(rec);
+ }
+
+ uint32_t prev_page_no;
+
+ for (; !page_rec_is_infimum(rec);
+ rec = page_rec_get_prev_const(rec)) {
+ ut_ad(page_align(rec) == page);
+
+ if (page_no != ibuf_rec_get_page_no(mtr, rec)
+ || space != ibuf_rec_get_space(mtr, rec)) {
+
+ goto count_later;
+ }
+
+ volume += ibuf_get_volume_buffered_count(
+ mtr, rec,
+ hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
+ }
+
+ /* Look at the previous page */
+
+ prev_page_no = btr_page_get_prev(page);
+
+ if (prev_page_no == FIL_NULL) {
+
+ goto count_later;
+ }
+
+ {
+ buf_block_t* block;
+
+ block = buf_page_get(
+ page_id_t(IBUF_SPACE_ID, prev_page_no),
+ 0, RW_X_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
+
+ prev_page = buf_block_get_frame(block);
+ ut_ad(page_validate(prev_page, ibuf.index));
+ }
+
+#ifdef UNIV_BTR_DEBUG
+ static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+ ut_a(!memcmp_aligned<4>(prev_page + FIL_PAGE_NEXT,
+ page + FIL_PAGE_OFFSET, 4));
+#endif /* UNIV_BTR_DEBUG */
+
+ rec = page_get_supremum_rec(prev_page);
+ rec = page_rec_get_prev_const(rec);
+
+ for (;; rec = page_rec_get_prev_const(rec)) {
+ ut_ad(page_align(rec) == prev_page);
+
+ if (page_rec_is_infimum(rec)) {
+
+ /* We cannot look at yet another previous page, because
+ we do not hold the x-latch on it, and cannot acquire one
+ without violating the latching order: we have to give up */
+
+ return(srv_page_size);
+ }
+
+ if (page_no != ibuf_rec_get_page_no(mtr, rec)
+ || space != ibuf_rec_get_space(mtr, rec)) {
+
+ goto count_later;
+ }
+
+ volume += ibuf_get_volume_buffered_count(
+ mtr, rec,
+ hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
+ }
+
+count_later:
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!page_rec_is_supremum(rec)) {
+ rec = page_rec_get_next_const(rec);
+ }
+
+ for (; !page_rec_is_supremum(rec);
+ rec = page_rec_get_next_const(rec)) {
+ if (page_no != ibuf_rec_get_page_no(mtr, rec)
+ || space != ibuf_rec_get_space(mtr, rec)) {
+
+ return(volume);
+ }
+
+ volume += ibuf_get_volume_buffered_count(
+ mtr, rec,
+ hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
+ }
+
+ /* Look at the next page */
+
+ uint32_t next_page_no = btr_page_get_next(page);
+
+ if (next_page_no == FIL_NULL) {
+
+ return(volume);
+ }
+
+ {
+ buf_block_t* block;
+
+ block = buf_page_get(
+ page_id_t(IBUF_SPACE_ID, next_page_no),
+ 0, RW_X_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
+
+ next_page = buf_block_get_frame(block);
+ ut_ad(page_validate(next_page, ibuf.index));
+ }
+
+#ifdef UNIV_BTR_DEBUG
+ static_assert(FIL_PAGE_PREV % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+ ut_a(!memcmp_aligned<4>(next_page + FIL_PAGE_PREV,
+ page + FIL_PAGE_OFFSET, 4));
+#endif /* UNIV_BTR_DEBUG */
+
+ rec = page_get_infimum_rec(next_page);
+ rec = page_rec_get_next_const(rec);
+
+ for (;; rec = page_rec_get_next_const(rec)) {
+ ut_ad(page_align(rec) == next_page);
+
+ if (page_rec_is_supremum(rec)) {
+
+ /* We give up */
+
+ return(srv_page_size);
+ }
+
+ if (page_no != ibuf_rec_get_page_no(mtr, rec)
+ || space != ibuf_rec_get_space(mtr, rec)) {
+
+ return(volume);
+ }
+
+ volume += ibuf_get_volume_buffered_count(
+ mtr, rec,
+ hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
+ }
+}
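+
+/* The scan above never crosses more than one leaf page in either
+direction. If the entries for (space, page_no) extend beyond the
+neighbouring pages, srv_page_size is returned as a safe upper bound
+instead of an exact volume. */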
+
+/*********************************************************************//**
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+void
+ibuf_update_max_tablespace_id(void)
+/*===============================*/
+{
+ ulint max_space_id;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+
+ ut_a(!dict_table_is_comp(ibuf.index->table));
+
+ ibuf_mtr_start(&mtr);
+
+ btr_pcur_open_at_index_side(
+ false, ibuf.index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+
+ ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
+
+ btr_pcur_move_to_prev(&pcur, &mtr);
+
+ if (btr_pcur_is_before_first_on_page(&pcur)) {
+ /* The tree is empty */
+
+ max_space_id = 0;
+ } else {
+ rec = btr_pcur_get_rec(&pcur);
+
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
+
+ ut_a(len == 4);
+
+ max_space_id = mach_read_from_4(field);
+ }
+
+ ibuf_mtr_commit(&mtr);
+
+ /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */
+
+ fil_set_max_space_id_if_bigger(max_space_id);
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \
+ ibuf_get_entry_counter_low_func(mtr,rec,space,page_no)
+#else /* UNIV_DEBUG */
+# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \
+ ibuf_get_entry_counter_low_func(rec,space,page_no)
+#endif
+/****************************************************************//**
+Helper function for ibuf_get_entry_counter_func. Checks if rec is for
+(space, page_no), and if so, reads counter value from it and returns
+that + 1.
+@retval ULINT_UNDEFINED if the record does not contain any counter
+@retval 0 if the record is not for (space, page_no)
+@retval 1 + previous counter value, otherwise */
+static
+ulint
+ibuf_get_entry_counter_low_func(
+/*============================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction of rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* rec, /*!< in: insert buffer record */
+ ulint space, /*!< in: space id */
+ ulint page_no) /*!< in: page number */
+{
+ ulint counter;
+ const byte* field;
+ ulint len;
+
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+ ut_ad(rec_get_n_fields_old(rec) > 2);
+
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
+
+ ut_a(len == 1);
+
+ /* Check the tablespace identifier. */
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
+
+ ut_a(len == 4);
+
+ if (mach_read_from_4(field) != space) {
+
+ return(0);
+ }
+
+ /* Check the page offset. */
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len);
+ ut_a(len == 4);
+
+ if (mach_read_from_4(field) != page_no) {
+
+ return(0);
+ }
+
+ /* Check if the record contains a counter field. */
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
+
+ switch (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) {
+ default:
+ ut_error;
+ case 0: /* ROW_FORMAT=REDUNDANT */
+ case 1: /* ROW_FORMAT=COMPACT */
+ return(ULINT_UNDEFINED);
+
+ case IBUF_REC_INFO_SIZE:
+ counter = mach_read_from_2(field + IBUF_REC_OFFSET_COUNTER);
+ ut_a(counter < 0xFFFF);
+ return(counter + 1);
+ }
+}
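+
+/* Example of the counter sequence implied above: if three operations
+have been buffered for (space, page_no), their records carry the
+counters 0, 1 and 2.  A fourth buffered operation finds the last of
+them, reads 2 from IBUF_REC_OFFSET_COUNTER and stores 2 + 1 = 3, so
+that a merge replays the operations in the order in which they were
+buffered.  The ut_a(counter < 0xFFFF) assertion keeps the sequence
+below 0xFFFF, which is reserved as the search pattern for finding the
+last record. */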
+
+#ifdef UNIV_DEBUG
+# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \
+ ibuf_get_entry_counter_func(space,page_no,rec,mtr,exact_leaf)
+#else /* UNIV_DEBUG */
+# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \
+ ibuf_get_entry_counter_func(space,page_no,rec,exact_leaf)
+#endif /* UNIV_DEBUG */
+
+/****************************************************************//**
+Calculate the counter field for an entry based on the current
+last record in ibuf for (space, page_no).
+@return the counter field, or ULINT_UNDEFINED
+if we should abort this insertion to ibuf */
+static
+ulint
+ibuf_get_entry_counter_func(
+/*========================*/
+ ulint space, /*!< in: space id of entry */
+ ulint page_no, /*!< in: page number of entry */
+ const rec_t* rec, /*!< in: the record preceding the
+ insertion point */
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction */
+#endif /* UNIV_DEBUG */
+ ibool only_leaf) /*!< in: TRUE if this is the only
+ leaf page that can contain entries
+ for (space,page_no), that is, there
+ was no exact match for (space,page_no)
+ in the node pointer */
+{
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(page_validate(page_align(rec), ibuf.index));
+
+ if (page_rec_is_supremum(rec)) {
+ /* This is just for safety. The record should be a
+ page infimum or a user record. */
+ ut_ad(0);
+ return(ULINT_UNDEFINED);
+ } else if (!page_rec_is_infimum(rec)) {
+ return(ibuf_get_entry_counter_low(mtr, rec, space, page_no));
+ } else if (only_leaf || !page_has_prev(page_align(rec))) {
+ /* The parent node pointer did not contain the
+ searched for (space, page_no), which means that the
+ search ended on the correct page regardless of the
+ counter value, and since we're at the infimum record,
+ there are no existing records. */
+ return(0);
+ } else {
+ /* We used to read the previous page here. It would
+ break the latching order, because the caller has
+ buffer-fixed an insert buffer bitmap page. */
+ return(ULINT_UNDEFINED);
+ }
+}
+
+/** Translates the ibuf free bits to the free space on a page in bytes.
+@param[in]	physical_size	physical page size in bytes
+@param[in]	bits		value of the ibuf bitmap free bits
+@return maximum insert size for the page after reorganization */
+inline ulint
+ibuf_index_page_calc_free_from_bits(ulint physical_size, ulint bits)
+{
+ ut_ad(bits < 4);
+ ut_ad(physical_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+
+ if (bits == 3) {
+ bits = 4;
+ }
+
+ return bits * physical_size / IBUF_PAGE_SIZE_PER_FREE_SPACE;
+}
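+
+/* Worked example, assuming the usual IBUF_PAGE_SIZE_PER_FREE_SPACE
+value of 32: on a 16384-byte page the bit patterns map to
+0 -> 0, 1 -> 512, 2 -> 1024 and 3 -> 2048 bytes of free space after
+reorganization, because the value 3 is widened to 4 before the
+division. */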
+
+/** Buffer an operation in the insert/delete buffer, instead of applying
+it directly to the disk page, if this is possible.
+@param[in] mode BTR_MODIFY_PREV or BTR_MODIFY_TREE
+@param[in] op operation type
+@param[in] no_counter TRUE=use 5.0.3 format; FALSE=allow delete
+buffering
+@param[in] entry index entry to insert
+@param[in] entry_size rec_get_converted_size(index, entry)
+@param[in,out] index index where to insert; must not be unique
+or clustered
+@param[in] page_id page id where to insert
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] thr query thread
+@return DB_SUCCESS, DB_STRONG_FAIL or other error */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+ibuf_insert_low(
+ ulint mode,
+ ibuf_op_t op,
+ ibool no_counter,
+ const dtuple_t* entry,
+ ulint entry_size,
+ dict_index_t* index,
+ const page_id_t page_id,
+ ulint zip_size,
+ que_thr_t* thr)
+{
+ big_rec_t* dummy_big_rec;
+ btr_pcur_t pcur;
+ btr_cur_t* cursor;
+ dtuple_t* ibuf_entry;
+ mem_heap_t* offsets_heap = NULL;
+ mem_heap_t* heap;
+ rec_offs* offsets = NULL;
+ ulint buffered;
+ lint min_n_recs;
+ rec_t* ins_rec;
+ buf_block_t* bitmap_page;
+ buf_block_t* block;
+ page_t* root;
+ dberr_t err;
+ ibool do_merge;
+ uint32_t space_ids[IBUF_MAX_N_PAGES_MERGED];
+ uint32_t page_nos[IBUF_MAX_N_PAGES_MERGED];
+ ulint n_stored;
+ mtr_t mtr;
+ mtr_t bitmap_mtr;
+
+ ut_a(!dict_index_is_clust(index));
+ ut_ad(!dict_index_is_spatial(index));
+ ut_ad(dtuple_check_typed(entry));
+ ut_ad(!no_counter || op == IBUF_OP_INSERT);
+ ut_ad(page_id.space() == index->table->space_id);
+ ut_a(op < IBUF_OP_COUNT);
+
+ do_merge = FALSE;
+
+ /* Perform dirty reads of ibuf.size and ibuf.max_size, to
+ reduce ibuf_mutex contention. Given that ibuf.max_size and
+ ibuf.size fit in a machine word, this should be OK; at worst
+ we are doing some excessive ibuf_contract() or occasionally
+ skipping an ibuf_contract(). */
+ if (ibuf.max_size == 0) {
+ return(DB_STRONG_FAIL);
+ }
+
+ if (ibuf.size >= ibuf.max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
+ /* Insert buffer is now too big, contract it but do not try
+ to insert */
+
+#ifdef UNIV_IBUF_DEBUG
+ fputs("Ibuf too big\n", stderr);
+#endif
+ ibuf_contract();
+
+ return(DB_STRONG_FAIL);
+ }
+
+ heap = mem_heap_create(1024);
+
+ /* Build the entry which contains the space id and the page number
+ as the first fields and the type information for other fields, and
+ which will be inserted to the insert buffer. Using a counter value
+ of 0xFFFF we find the last record for (space, page_no), from which
+ we can then read the counter value N and use N + 1 in the record we
+ insert. (We patch the ibuf_entry's counter field to the correct
+ value just before actually inserting the entry.) */
+
+ ibuf_entry = ibuf_entry_build(
+ op, index, entry, page_id.space(), page_id.page_no(),
+ no_counter ? ULINT_UNDEFINED : 0xFFFF, heap);
+
+ /* Open a cursor to the insert buffer tree to calculate if we can add
+ the new entry to it without exceeding the free space limit for the
+ page. */
+
+ if (BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) {
+ for (;;) {
+ mutex_enter(&ibuf_pessimistic_insert_mutex);
+ mutex_enter(&ibuf_mutex);
+
+ if (UNIV_LIKELY(ibuf_data_enough_free_for_insert())) {
+
+ break;
+ }
+
+ mutex_exit(&ibuf_mutex);
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+
+ if (!ibuf_add_free_page()) {
+
+ mem_heap_free(heap);
+ return(DB_STRONG_FAIL);
+ }
+ }
+ }
+
+ ibuf_mtr_start(&mtr);
+
+ btr_pcur_open(ibuf.index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
+ ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
+
+ /* Find out the volume of already buffered inserts for the same index
+ page */
+ min_n_recs = 0;
+ buffered = ibuf_get_volume_buffered(&pcur,
+ page_id.space(),
+ page_id.page_no(),
+ op == IBUF_OP_DELETE
+ ? &min_n_recs
+ : NULL, &mtr);
+
+ const ulint physical_size = zip_size ? zip_size : srv_page_size;
+
+ if (op == IBUF_OP_DELETE
+ && (min_n_recs < 2 || buf_pool.watch_occurred(page_id))) {
+ /* The page could become empty after the record is
+ deleted, or the page has been read in to the buffer
+ pool. Refuse to buffer the operation. */
+
+ /* The buffer pool watch is needed for IBUF_OP_DELETE
+ because of latching order considerations. We can
+ check buf_pool_watch_occurred() only after latching
+ the insert buffer B-tree pages that contain buffered
+ changes for the page. We never buffer IBUF_OP_DELETE,
+ unless some IBUF_OP_INSERT or IBUF_OP_DELETE_MARK have
+ been previously buffered for the page. Because there
+ are buffered operations for the page, the insert
+ buffer B-tree page latches held by mtr will guarantee
+ that no changes for the user page will be merged
+ before mtr_commit(&mtr). We must not mtr_commit(&mtr)
+ until after the IBUF_OP_DELETE has been buffered. */
+
+fail_exit:
+ if (BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) {
+ mutex_exit(&ibuf_mutex);
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+ }
+
+ err = DB_STRONG_FAIL;
+ goto func_exit;
+ }
+
+ /* After this point, the page could still be loaded to the
+ buffer pool, but we do not have to care about it, since we are
+ holding a latch on the insert buffer leaf page that contains
+ buffered changes for (space, page_no). If the page enters the
+ buffer pool, buf_page_read_complete() for (space, page_no) will
+ have to acquire a latch on the same insert buffer leaf page,
+ which it cannot do until we have buffered the IBUF_OP_DELETE
+ and done mtr_commit(&mtr) to release the latch. */
+
+ ibuf_mtr_start(&bitmap_mtr);
+
+ bitmap_page = ibuf_bitmap_get_map_page(page_id, zip_size, &bitmap_mtr);
+
+ /* We check if the index page is suitable for buffered entries */
+
+ if (buf_pool.page_hash_contains(page_id)) {
+commit_exit:
+ ibuf_mtr_commit(&bitmap_mtr);
+ goto fail_exit;
+ } else {
+ lock_mutex_enter();
+ const auto lock_exists = lock_sys.get_first(page_id);
+ lock_mutex_exit();
+ if (lock_exists) {
+ goto commit_exit;
+ }
+ }
+
+ if (op == IBUF_OP_INSERT) {
+ ulint bits = ibuf_bitmap_page_get_bits(
+ bitmap_page->frame, page_id, physical_size,
+ IBUF_BITMAP_FREE, &bitmap_mtr);
+
+ if (buffered + entry_size + page_dir_calc_reserved_space(1)
+ > ibuf_index_page_calc_free_from_bits(physical_size,
+ bits)) {
+ /* Release the bitmap page latch early. */
+ ibuf_mtr_commit(&bitmap_mtr);
+
+ /* It may not fit */
+ do_merge = TRUE;
+
+ ibuf_get_merge_page_nos(FALSE,
+ btr_pcur_get_rec(&pcur), &mtr,
+ space_ids,
+ page_nos, &n_stored);
+
+ goto fail_exit;
+ }
+ }
+
+ if (!no_counter) {
+		/* Patch the correct counter value into the entry to
+		insert. This can change the insert position, which can
+ result in the need to abort in some cases. */
+ ulint counter = ibuf_get_entry_counter(
+ page_id.space(), page_id.page_no(),
+ btr_pcur_get_rec(&pcur), &mtr,
+ btr_pcur_get_btr_cur(&pcur)->low_match
+ < IBUF_REC_FIELD_METADATA);
+ dfield_t* field;
+
+ if (counter == ULINT_UNDEFINED) {
+ goto commit_exit;
+ }
+
+ field = dtuple_get_nth_field(
+ ibuf_entry, IBUF_REC_FIELD_METADATA);
+ mach_write_to_2(
+ (byte*) dfield_get_data(field)
+ + IBUF_REC_OFFSET_COUNTER, counter);
+ }
+
+ /* Set the bitmap bit denoting that the insert buffer contains
+ buffered entries for this index page, if the bit is not set yet */
+ index->set_modified(bitmap_mtr);
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(
+ bitmap_page, page_id, physical_size, true, &bitmap_mtr);
+ ibuf_mtr_commit(&bitmap_mtr);
+
+ cursor = btr_pcur_get_btr_cur(&pcur);
+
+ if (mode == BTR_MODIFY_PREV) {
+ err = btr_cur_optimistic_insert(
+ BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
+ cursor, &offsets, &offsets_heap,
+ ibuf_entry, &ins_rec,
+ &dummy_big_rec, 0, thr, &mtr);
+ block = btr_cur_get_block(cursor);
+ ut_ad(block->page.id().space() == IBUF_SPACE_ID);
+
+ /* If this is the root page, update ibuf.empty. */
+ if (block->page.id().page_no() == FSP_IBUF_TREE_ROOT_PAGE_NO) {
+ const page_t* root = buf_block_get_frame(block);
+
+ ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
+ ut_ad(page_get_page_no(root)
+ == FSP_IBUF_TREE_ROOT_PAGE_NO);
+
+ ibuf.empty = page_is_empty(root);
+ }
+ } else {
+ ut_ad(BTR_LATCH_MODE_WITHOUT_INTENTION(mode)
+ == BTR_MODIFY_TREE);
+
+ /* We acquire an sx-latch to the root page before the insert,
+ because a pessimistic insert releases the tree x-latch,
+ which would cause the sx-latching of the root after that to
+ break the latching order. */
+
+ root = ibuf_tree_root_get(&mtr)->frame;
+
+ err = btr_cur_optimistic_insert(
+ BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
+ cursor, &offsets, &offsets_heap,
+ ibuf_entry, &ins_rec,
+ &dummy_big_rec, 0, thr, &mtr);
+
+ if (err == DB_FAIL) {
+ err = btr_cur_pessimistic_insert(
+ BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
+ cursor, &offsets, &offsets_heap,
+ ibuf_entry, &ins_rec,
+ &dummy_big_rec, 0, thr, &mtr);
+ }
+
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+ ibuf_size_update(root);
+ mutex_exit(&ibuf_mutex);
+ ibuf.empty = page_is_empty(root);
+
+ block = btr_cur_get_block(cursor);
+ ut_ad(block->page.id().space() == IBUF_SPACE_ID);
+ }
+
+ if (offsets_heap) {
+ mem_heap_free(offsets_heap);
+ }
+
+ if (err == DB_SUCCESS && op != IBUF_OP_DELETE) {
+ /* Update the page max trx id field */
+ page_update_max_trx_id(block, NULL,
+ thr_get_trx(thr)->id, &mtr);
+ }
+
+func_exit:
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ mem_heap_free(heap);
+
+ if (err == DB_SUCCESS
+ && BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) {
+ ibuf_contract_after_insert(entry_size);
+ }
+
+ if (do_merge) {
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED);
+#endif
+ ibuf_read_merge_pages(space_ids, page_nos, n_stored);
+ }
+
+ return(err);
+}
+
+/** Buffer an operation in the change buffer, instead of applying it
+directly to the file page, if this is possible. Does not do it if the index
+is clustered or unique.
+@param[in] op operation type
+@param[in] entry index entry to insert
+@param[in,out] index index where to insert
+@param[in] page_id page id where to insert
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] thr query thread
+@return true on success */
+bool
+ibuf_insert(
+ ibuf_op_t op,
+ const dtuple_t* entry,
+ dict_index_t* index,
+ const page_id_t page_id,
+ ulint zip_size,
+ que_thr_t* thr)
+{
+ dberr_t err;
+ ulint entry_size;
+ ibool no_counter;
+ /* Read the settable global variable only once in
+ this function, so that we will have a consistent view of it. */
+ ibuf_use_t use = ibuf_use_t(innodb_change_buffering);
+ DBUG_ENTER("ibuf_insert");
+
+ DBUG_PRINT("ibuf", ("op: %d, space: " UINT32PF ", page_no: " UINT32PF,
+ op, page_id.space(), page_id.page_no()));
+
+ ut_ad(dtuple_check_typed(entry));
+ ut_ad(page_id.space() != SRV_TMP_SPACE_ID);
+
+ ut_a(!dict_index_is_clust(index));
+ ut_ad(!index->table->is_temporary());
+
+ no_counter = use <= IBUF_USE_INSERT;
+
+ switch (op) {
+ case IBUF_OP_INSERT:
+ switch (use) {
+ case IBUF_USE_NONE:
+ case IBUF_USE_DELETE:
+ case IBUF_USE_DELETE_MARK:
+ DBUG_RETURN(false);
+ case IBUF_USE_INSERT:
+ case IBUF_USE_INSERT_DELETE_MARK:
+ case IBUF_USE_ALL:
+ goto check_watch;
+ }
+ break;
+ case IBUF_OP_DELETE_MARK:
+ switch (use) {
+ case IBUF_USE_NONE:
+ case IBUF_USE_INSERT:
+ DBUG_RETURN(false);
+ case IBUF_USE_DELETE_MARK:
+ case IBUF_USE_DELETE:
+ case IBUF_USE_INSERT_DELETE_MARK:
+ case IBUF_USE_ALL:
+ ut_ad(!no_counter);
+ goto check_watch;
+ }
+ break;
+ case IBUF_OP_DELETE:
+ switch (use) {
+ case IBUF_USE_NONE:
+ case IBUF_USE_INSERT:
+ case IBUF_USE_INSERT_DELETE_MARK:
+ DBUG_RETURN(false);
+ case IBUF_USE_DELETE_MARK:
+ case IBUF_USE_DELETE:
+ case IBUF_USE_ALL:
+ ut_ad(!no_counter);
+ goto skip_watch;
+ }
+ break;
+ case IBUF_OP_COUNT:
+ break;
+ }
+
+ /* unknown op or use */
+ ut_error;
+
+check_watch:
+ /* If a thread attempts to buffer an insert on a page while a
+ purge is in progress on the same page, the purge must not be
+ buffered, because it could remove a record that was
+ re-inserted later. For simplicity, we block the buffering of
+ all operations on a page that has a purge pending.
+
+ We do not check this in the IBUF_OP_DELETE case, because that
+ would always trigger the buffer pool watch during purge and
+ thus prevent the buffering of delete operations. We assume
+ that the issuer of IBUF_OP_DELETE has called
+ buf_pool_t::watch_set(). */
+
+ if (buf_pool.page_hash_contains<true>(page_id)) {
+ /* A buffer pool watch has been set or the
+ page has been read into the buffer pool.
+ Do not buffer the request. If a purge operation
+ is being buffered, have this request executed
+ directly on the page in the buffer pool after the
+ buffered entries for this page have been merged. */
+ DBUG_RETURN(false);
+ }
+
+skip_watch:
+ entry_size = rec_get_converted_size(index, entry, 0);
+
+ if (entry_size
+ >= page_get_free_space_of_empty(dict_table_is_comp(index->table))
+ / 2) {
+
+ DBUG_RETURN(false);
+ }
+
+ err = ibuf_insert_low(BTR_MODIFY_PREV, op, no_counter,
+ entry, entry_size,
+ index, page_id, zip_size, thr);
+ if (err == DB_FAIL) {
+ err = ibuf_insert_low(BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT,
+ op, no_counter, entry, entry_size,
+ index, page_id, zip_size, thr);
+ }
+
+ ut_a(err == DB_SUCCESS || err == DB_STRONG_FAIL
+ || err == DB_TOO_BIG_RECORD);
+
+ DBUG_RETURN(err == DB_SUCCESS);
+}
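+
+/* Usage sketch (the actual call sites are in btr0cur.cc, not in this
+file): a caller that does not have the secondary index leaf page in
+the buffer pool first tries to buffer the operation and reads the
+page only if buffering is refused:
+
+	if (!ibuf_insert(IBUF_OP_INSERT, entry, index, page_id,
+			 zip_size, thr)) {
+		// fall back: fetch the leaf page and apply directly
+	}
+*/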
+
+/********************************************************************//**
+During merge, inserts into an index page a secondary index entry
+extracted from the insert buffer.
+@return newly inserted record */
+static MY_ATTRIBUTE((nonnull))
+rec_t*
+ibuf_insert_to_index_page_low(
+/*==========================*/
+ const dtuple_t* entry, /*!< in: buffered entry to insert */
+ buf_block_t* block, /*!< in/out: index page where the buffered
+ entry should be placed */
+ dict_index_t* index, /*!< in: record descriptor */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ mtr_t* mtr, /*!< in/out: mtr */
+ page_cur_t* page_cur)/*!< in/out: cursor positioned on the record
+ after which to insert the buffered entry */
+{
+ rec_t* rec;
+ DBUG_ENTER("ibuf_insert_to_index_page_low");
+
+ rec = page_cur_tuple_insert(page_cur, entry, index,
+ offsets, &heap, 0, mtr);
+ if (rec != NULL) {
+ DBUG_RETURN(rec);
+ }
+
+ /* Page reorganization or recompression should already have
+ been attempted by page_cur_tuple_insert(). Besides, per
+ ibuf_index_page_calc_free_zip() the page should not have been
+ recompressed or reorganized. */
+ ut_ad(!is_buf_block_get_page_zip(block));
+
+ /* If the record did not fit, reorganize */
+
+ btr_page_reorganize(page_cur, index, mtr);
+
+ /* This time the record must fit */
+
+ rec = page_cur_tuple_insert(page_cur, entry, index,
+ offsets, &heap, 0, mtr);
+ if (rec != NULL) {
+ DBUG_RETURN(rec);
+ }
+
+ ib::error() << "Insert buffer insert fails; page free "
+ << page_get_max_insert_size(block->frame, 1)
+ << ", dtuple size "
+ << rec_get_converted_size(index, entry, 0);
+
+ fputs("InnoDB: Cannot insert index record ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\nInnoDB: The table where this index record belongs\n"
+ "InnoDB: is now probably corrupt. Please run CHECK TABLE on\n"
+ "InnoDB: that table.\n", stderr);
+
+ if (buf_block_t *bitmap_page = ibuf_bitmap_get_map_page(
+ block->page.id(), block->zip_size(), mtr)) {
+
+ ib::error() << "page " << block->page.id() << ", size "
+ << block->physical_size() << ", bitmap bits "
+ << ibuf_bitmap_page_get_bits(bitmap_page->frame,
+ block->page.id(), block->zip_size(),
+ IBUF_BITMAP_FREE, mtr);
+ }
+
+ ib::error() << BUG_REPORT_MSG;
+
+ ut_ad(0);
+ DBUG_RETURN(NULL);
+}
+
+/***********************************************************//**
+During merge, inserts into an index page a secondary index entry
+extracted from the insert buffer. */
+static
+void
+ibuf_insert_to_index_page(
+/*======================*/
+ const dtuple_t* entry, /*!< in: buffered entry to insert */
+ buf_block_t* block, /*!< in/out: index page where the buffered entry
+ should be placed */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t page_cur;
+ ulint low_match;
+ page_t* page = buf_block_get_frame(block);
+ rec_t* rec;
+ rec_offs* offsets;
+ mem_heap_t* heap;
+
+ DBUG_ENTER("ibuf_insert_to_index_page");
+
+ DBUG_PRINT("ibuf", ("page " UINT32PF ":" UINT32PF,
+ block->page.id().space(),
+ block->page.id().page_no()));
+
+	ut_ad(!dict_index_is_online_ddl(index)); // this is an ibuf_dummy index
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(dtuple_check_typed(entry));
+#ifdef BTR_CUR_HASH_ADAPT
+ /* A change buffer merge must occur before users are granted
+ any access to the page. No adaptive hash index entries may
+ point to a freshly read page. */
+ ut_ad(!block->index);
+ assert_block_ahi_empty(block);
+#endif /* BTR_CUR_HASH_ADAPT */
+ ut_ad(mtr->is_named_space(block->page.id().space()));
+
+ if (UNIV_UNLIKELY(dict_table_is_comp(index->table)
+ != (ibool)!!page_is_comp(page))) {
+ ib::warn() << "Trying to insert a record from the insert"
+ " buffer to an index page but the 'compact' flag does"
+ " not match!";
+ goto dump;
+ }
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+
+ if (page_rec_is_supremum(rec)) {
+ ib::warn() << "Trying to insert a record from the insert"
+ " buffer to an index page but the index page"
+ " is empty!";
+ goto dump;
+ }
+
+ if (!rec_n_fields_is_sane(index, rec, entry)) {
+ ib::warn() << "Trying to insert a record from the insert"
+ " buffer to an index page but the number of fields"
+ " does not match!";
+ rec_print(stderr, rec, index);
+dump:
+ dtuple_print(stderr, entry);
+ ut_ad(0);
+
+ ib::warn() << "The table where this index record belongs"
+ " is now probably corrupt. Please run CHECK TABLE on"
+ " your tables. " << BUG_REPORT_MSG;
+
+ DBUG_VOID_RETURN;
+ }
+
+ low_match = page_cur_search(block, index, entry, &page_cur);
+
+ heap = mem_heap_create(
+ sizeof(upd_t)
+ + REC_OFFS_HEADER_SIZE * sizeof(*offsets)
+ + dtuple_get_n_fields(entry)
+ * (sizeof(upd_field_t) + sizeof *offsets));
+
+ if (UNIV_UNLIKELY(low_match == dtuple_get_n_fields(entry))) {
+ upd_t* update;
+
+ rec = page_cur_get_rec(&page_cur);
+
+ /* This is based on
+ row_ins_sec_index_entry_by_modify(BTR_MODIFY_LEAF). */
+ ut_ad(rec_get_deleted_flag(rec, page_is_comp(page)));
+
+ offsets = rec_get_offsets(rec, index, NULL, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+ update = row_upd_build_sec_rec_difference_binary(
+ rec, index, offsets, entry, heap);
+
+ if (update->n_fields == 0) {
+ /* The records only differ in the delete-mark.
+ Clear the delete-mark, like we did before
+ Bug #56680 was fixed. */
+ btr_rec_set_deleted<false>(block, rec, mtr);
+ goto updated_in_place;
+ }
+
+ /* Copy the info bits. Clear the delete-mark. */
+ update->info_bits = rec_get_info_bits(rec, page_is_comp(page));
+ update->info_bits &= byte(~REC_INFO_DELETED_FLAG);
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+
+ /* We cannot invoke btr_cur_optimistic_update() here,
+ because we do not have a btr_cur_t or que_thr_t,
+ as the insert buffer merge occurs at a very low level. */
+ if (!row_upd_changes_field_size_or_external(index, offsets,
+ update)
+ && (!page_zip || btr_cur_update_alloc_zip(
+ page_zip, &page_cur, index, offsets,
+ rec_offs_size(offsets), false, mtr))) {
+ /* This is the easy case. Do something similar
+ to btr_cur_update_in_place(). */
+ rec = page_cur_get_rec(&page_cur);
+ btr_cur_upd_rec_in_place(rec, index, offsets,
+ update, block, mtr);
+
+ DBUG_EXECUTE_IF(
+ "crash_after_log_ibuf_upd_inplace",
+ log_buffer_flush_to_disk();
+ ib::info() << "Wrote log record for ibuf"
+ " update in place operation";
+ DBUG_SUICIDE();
+ );
+
+ goto updated_in_place;
+ }
+
+ /* btr_cur_update_alloc_zip() may have changed this */
+ rec = page_cur_get_rec(&page_cur);
+
+ /* A collation may identify values that differ in
+ storage length.
+ Some examples (1 or 2 bytes):
+ utf8_turkish_ci: I = U+0131 LATIN SMALL LETTER DOTLESS I
+ utf8_general_ci: S = U+00DF LATIN SMALL LETTER SHARP S
+ utf8_general_ci: A = U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
+
+ latin1_german2_ci: SS = U+00DF LATIN SMALL LETTER SHARP S
+
+ Examples of a character (3-byte UTF-8 sequence)
+ identified with 2 or 4 characters (1-byte UTF-8 sequences):
+
+ utf8_unicode_ci: 'II' = U+2171 SMALL ROMAN NUMERAL TWO
+ utf8_unicode_ci: '(10)' = U+247D PARENTHESIZED NUMBER TEN
+ */
+
+ /* Delete the different-length record, and insert the
+ buffered one. */
+
+ lock_rec_store_on_page_infimum(block, rec);
+ page_cur_delete_rec(&page_cur, index, offsets, mtr);
+ page_cur_move_to_prev(&page_cur);
+ rec = ibuf_insert_to_index_page_low(entry, block, index,
+ &offsets, heap, mtr,
+ &page_cur);
+
+ ut_ad(!cmp_dtuple_rec(entry, rec, offsets));
+ lock_rec_restore_from_page_infimum(block, rec, block);
+ } else {
+ offsets = NULL;
+ ibuf_insert_to_index_page_low(entry, block, index,
+ &offsets, heap, mtr,
+ &page_cur);
+ }
+updated_in_place:
+ mem_heap_free(heap);
+
+ DBUG_VOID_RETURN;
+}
+
+/****************************************************************//**
+During merge, sets the delete mark on a record for a secondary index
+entry. */
+static
+void
+ibuf_set_del_mark(
+/*==============*/
+ const dtuple_t* entry, /*!< in: entry */
+ buf_block_t* block, /*!< in/out: block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t page_cur;
+ ulint low_match;
+
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(dtuple_check_typed(entry));
+
+ low_match = page_cur_search(block, index, entry, &page_cur);
+
+ if (low_match == dtuple_get_n_fields(entry)) {
+ rec_t* rec = page_cur_get_rec(&page_cur);
+
+ /* Delete mark the old index record. According to a
+ comment in row_upd_sec_index_entry(), it can already
+ have been delete marked if a lock wait occurred in
+ row_ins_sec_index_entry() in a previous invocation of
+ row_upd_sec_index_entry(). */
+
+ if (UNIV_LIKELY
+ (!rec_get_deleted_flag(
+ rec, dict_table_is_comp(index->table)))) {
+ btr_rec_set_deleted<true>(block, rec, mtr);
+ }
+ } else {
+ const page_t* page
+ = page_cur_get_page(&page_cur);
+ const buf_block_t* block
+ = page_cur_get_block(&page_cur);
+
+ ib::error() << "Unable to find a record to delete-mark";
+ fputs("InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, page_cur_get_rec(&page_cur), index);
+
+ ib::error() << "page " << block->page.id() << " ("
+ << page_get_n_recs(page) << " records, index id "
+ << btr_page_get_index_id(page) << ").";
+
+ ib::error() << BUG_REPORT_MSG;
+ ut_ad(0);
+ }
+}
+
+/****************************************************************//**
+During merge, deletes a record for a secondary index entry. */
+static
+void
+ibuf_delete(
+/*========*/
+ const dtuple_t* entry, /*!< in: entry */
+ buf_block_t* block, /*!< in/out: block */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in/out: mtr; must be committed
+ before latching any further pages */
+{
+ page_cur_t page_cur;
+ ulint low_match;
+
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(dtuple_check_typed(entry));
+ ut_ad(!index->is_spatial());
+ ut_ad(!index->is_clust());
+
+ low_match = page_cur_search(block, index, entry, &page_cur);
+
+ if (low_match == dtuple_get_n_fields(entry)) {
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+ page_t* page = buf_block_get_frame(block);
+ rec_t* rec = page_cur_get_rec(&page_cur);
+
+ /* TODO: the below should probably be a separate function,
+ it's a bastardized version of btr_cur_optimistic_delete. */
+
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ mem_heap_t* heap = NULL;
+ ulint max_ins_size = 0;
+
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (page_get_n_recs(page) <= 1
+ || !(REC_INFO_DELETED_FLAG
+ & rec_get_info_bits(rec, page_is_comp(page)))) {
+ /* Refuse to purge the last record or a
+ record that has not been marked for deletion. */
+ ib::error() << "Unable to purge a record";
+ fputs("InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ fprintf(stderr, "\nspace " UINT32PF " offset " UINT32PF
+ " (%u records, index id %llu)\n"
+ "InnoDB: Submit a detailed bug report"
+ " to https://jira.mariadb.org/\n",
+ block->page.id().space(),
+ block->page.id().page_no(),
+ (unsigned) page_get_n_recs(page),
+ (ulonglong) btr_page_get_index_id(page));
+
+ ut_ad(0);
+ return;
+ }
+
+ lock_update_delete(block, rec);
+
+ if (!page_zip) {
+ max_ins_size
+ = page_get_max_insert_size_after_reorganize(
+ page, 1);
+ }
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cur_delete_rec(&page_cur, index, offsets, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_zip) {
+ ibuf_update_free_bits_zip(block, mtr);
+ } else {
+ ibuf_update_free_bits_low(block, max_ins_size, mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ } else {
+ /* The record must have been purged already. */
+ }
+}
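+
+/* Note on the free-bits update above: for an uncompressed page,
+max_ins_size is sampled before page_cur_delete_rec(), which lets
+ibuf_update_free_bits_low() detect whether the bitmap bits actually
+changed and skip a redundant bitmap write.  For a compressed page the
+free space depends on the compressed stream, so
+ibuf_update_free_bits_zip() must recompute it after the deletion. */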
+
+/*********************************************************************//**
+Restores the insert buffer tree cursor position.
+@return whether the position was restored */
+static MY_ATTRIBUTE((nonnull))
+bool
+ibuf_restore_pos(
+/*=============*/
+ const page_id_t page_id,/*!< in: page identifier */
+ const dtuple_t* search_tuple,
+ /*!< in: search tuple for entries of page_no */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor whose
+ position is to be restored */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mode == BTR_MODIFY_LEAF
+ || BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE);
+
+ if (UNIV_LIKELY(btr_pcur_restore_position(mode, pcur, mtr))) {
+ return true;
+ }
+
+ if (fil_space_t* s = fil_space_t::get(page_id.space())) {
+ ib::error() << "ibuf cursor restoration fails!"
+ " ibuf record inserted to page "
+ << page_id
+ << " in file " << s->chain.start->name;
+ s->release();
+
+ ib::error() << BUG_REPORT_MSG;
+
+ rec_print_old(stderr, btr_pcur_get_rec(pcur));
+ rec_print_old(stderr, pcur->old_rec);
+ dtuple_print(stderr, search_tuple);
+
+ rec_print_old(stderr,
+ page_rec_get_next(btr_pcur_get_rec(pcur)));
+ }
+
+ ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
+ return false;
+}
+
+/**
+Delete a change buffer record.
+@param[in] page_id page identifier
+@param[in,out] pcur persistent cursor positioned on the record
+@param[in] search_tuple search key for (space,page_no)
+@param[in,out] mtr mini-transaction
+@return whether mtr was committed (due to pessimistic operation) */
+static MY_ATTRIBUTE((warn_unused_result, nonnull))
+bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur,
+ const dtuple_t* search_tuple, mtr_t* mtr)
+{
+ ibool success;
+ page_t* root;
+ dberr_t err;
+
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur)));
+ ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur))
+ == page_id.page_no());
+ ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur))
+ == page_id.space());
+
+ success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur),
+ 0, mtr);
+
+ if (success) {
+ if (page_is_empty(btr_pcur_get_page(pcur))) {
+ /* If a B-tree page is empty, it must be the root page
+ and the whole B-tree must be empty. InnoDB does not
+ allow empty B-tree pages other than the root. */
+ root = btr_pcur_get_page(pcur);
+
+ ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
+ ut_ad(page_get_page_no(root)
+ == FSP_IBUF_TREE_ROOT_PAGE_NO);
+
+ /* ibuf.empty is protected by the root page latch.
+ Before the deletion, it had to be FALSE. */
+ ut_ad(!ibuf.empty);
+ ibuf.empty = true;
+ }
+
+		return(false);
+ }
+
+ /* We have to resort to a pessimistic delete from ibuf.
+ Delete-mark the record so that it will not be applied again,
+ in case the server crashes before the pessimistic delete is
+ made persistent. */
+ btr_rec_set_deleted<true>(btr_pcur_get_block(pcur),
+ btr_pcur_get_rec(pcur), mtr);
+
+ btr_pcur_store_position(pcur, mtr);
+ ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
+
+ ibuf_mtr_start(mtr);
+ mutex_enter(&ibuf_mutex);
+
+ if (!ibuf_restore_pos(page_id, search_tuple,
+ BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+ pcur, mtr)) {
+
+ mutex_exit(&ibuf_mutex);
+ ut_ad(mtr->has_committed());
+ goto func_exit;
+ }
+
+ root = ibuf_tree_root_get(mtr)->frame;
+
+ btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), 0,
+ false, mtr);
+ ut_a(err == DB_SUCCESS);
+
+ ibuf_size_update(root);
+ mutex_exit(&ibuf_mutex);
+
+ ibuf.empty = page_is_empty(root);
+ ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
+
+func_exit:
+ ut_ad(mtr->has_committed());
+ btr_pcur_close(pcur);
+
+	return(true);
+}
+
+/** Check whether buffered changes exist for a page.
+@param[in] id page identifier
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return whether buffered changes exist */
+bool ibuf_page_exists(const page_id_t id, ulint zip_size)
+{
+ ut_ad(!fsp_is_system_temporary(id.space()));
+
+ const ulint physical_size = zip_size ? zip_size : srv_page_size;
+
+ if (ibuf_fixed_addr_page(id, physical_size)
+ || fsp_descr_page(id, physical_size)) {
+ return false;
+ }
+
+ mtr_t mtr;
+ bool bitmap_bits = false;
+
+ ibuf_mtr_start(&mtr);
+ if (const buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+ id, zip_size, &mtr)) {
+ bitmap_bits = ibuf_bitmap_page_get_bits(
+ bitmap_page->frame, id, zip_size,
+ IBUF_BITMAP_BUFFERED, &mtr) != 0;
+ }
+ ibuf_mtr_commit(&mtr);
+ return bitmap_bits;
+}
+
+/** Reset the bits in the bitmap page for the given block and page id.
+@param b X-latched secondary index page (nullptr to discard changes)
+@param page_id page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param mtr mini-transaction */
+static void ibuf_reset_bitmap(buf_block_t *b, page_id_t page_id,
+ ulint zip_size, mtr_t *mtr)
+{
+ buf_block_t *bitmap= ibuf_bitmap_get_map_page(page_id, zip_size, mtr);
+ if (!bitmap)
+ return;
+
+ const ulint physical_size = zip_size ? zip_size : srv_page_size;
+ /* FIXME: update the bitmap byte only once! */
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(bitmap, page_id,
+ physical_size, false, mtr);
+
+ if (b)
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(bitmap, page_id, physical_size,
+ ibuf_index_page_calc_free(b),
+ mtr);
+}
+
+/** When an index page is read from a disk to the buffer pool, this function
+applies any buffered operations to the page and deletes the entries from the
+insert buffer. If the page is not read, but created in the buffer pool, this
+function deletes its buffered entries from the insert buffer; there can
+exist entries for such a page if the page belonged to an index which
+subsequently was dropped.
+@param block X-latched page to try to apply changes to, or NULL to discard
+@param page_id page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 */
+void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
+ ulint zip_size)
+{
+ if (trx_sys_hdr_page(page_id)) {
+ return;
+ }
+
+ btr_pcur_t pcur;
+#ifdef UNIV_IBUF_DEBUG
+ ulint volume = 0;
+#endif /* UNIV_IBUF_DEBUG */
+ bool corruption_noticed = false;
+ mtr_t mtr;
+
+ /* Counts for merged & discarded operations. */
+ ulint mops[IBUF_OP_COUNT];
+ ulint dops[IBUF_OP_COUNT];
+
+ ut_ad(!block || page_id == block->page.id());
+ ut_ad(!block || block->page.state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(!block || block->page.status == buf_page_t::NORMAL);
+ ut_ad(!trx_sys_hdr_page(page_id));
+ ut_ad(page_id < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
+
+ const ulint physical_size = zip_size ? zip_size : srv_page_size;
+
+ if (ibuf_fixed_addr_page(page_id, physical_size)
+ || fsp_descr_page(page_id, physical_size)) {
+ return;
+ }
+
+ fil_space_t* space = fil_space_t::get(page_id.space());
+
+ if (UNIV_UNLIKELY(!space)) {
+ block = NULL;
+ } else {
+ ulint bitmap_bits = 0;
+
+ ibuf_mtr_start(&mtr);
+
+ buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+ page_id, zip_size, &mtr);
+
+ if (bitmap_page
+ && fil_page_get_type(bitmap_page->frame)
+ != FIL_PAGE_TYPE_ALLOCATED) {
+ bitmap_bits = ibuf_bitmap_page_get_bits(
+ bitmap_page->frame, page_id, zip_size,
+ IBUF_BITMAP_BUFFERED, &mtr);
+ }
+
+ ibuf_mtr_commit(&mtr);
+
+ if (bitmap_bits && fseg_page_is_free(
+ space, page_id.page_no())) {
+ ibuf_mtr_start(&mtr);
+ mtr.set_named_space(space);
+ ibuf_reset_bitmap(block, page_id, zip_size, &mtr);
+ ibuf_mtr_commit(&mtr);
+ bitmap_bits = 0;
+ }
+
+ if (!bitmap_bits) {
+ /* No changes are buffered for this page. */
+ space->release();
+ return;
+ }
+ }
+
+ mem_heap_t* heap = mem_heap_create(512);
+
+ const dtuple_t* search_tuple = ibuf_search_tuple_build(
+ page_id.space(), page_id.page_no(), heap);
+
+ if (block != NULL) {
+ /* Move the ownership of the x-latch on the page to this OS
+ thread, so that we can acquire a second x-latch on it. This
+ is needed for the insert operations to the index page to pass
+ the debug checks. */
+
+ rw_lock_x_lock_move_ownership(&(block->lock));
+
+ if (!fil_page_index_page_check(block->frame)
+ || !page_is_leaf(block->frame)) {
+
+ corruption_noticed = true;
+
+ ib::error() << "Corruption in the tablespace. Bitmap"
+ " shows insert buffer records to page "
+ << page_id << " though the page type is "
+ << fil_page_get_type(block->frame)
+ << ", which is not an index leaf page. We try"
+ " to resolve the problem by skipping the"
+ " insert buffer merge for this page. Please"
+ " run CHECK TABLE on your tables to determine"
+ " if they are corrupt after this.";
+ ut_ad(0);
+ }
+ }
+
+ memset(mops, 0, sizeof(mops));
+ memset(dops, 0, sizeof(dops));
+
+loop:
+ ibuf_mtr_start(&mtr);
+
+ /* Position pcur in the insert buffer at the first entry for this
+ index page */
+ btr_pcur_open_on_user_rec(
+ ibuf.index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+
+ if (block) {
+ ut_ad(rw_lock_own(&block->lock, RW_LOCK_X));
+ buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+ rw_lock_x_lock(&block->lock);
+
+ mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
+ /* This is a user page (secondary index leaf page),
+ but we pretend that it is a change buffer page in
+ order to obey the latching order. This should be OK,
+ because buffered changes are applied immediately while
+ the block is io-fixed. Other threads must not try to
+ latch an io-fixed block. */
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
+ }
+
+ if (space) {
+ mtr.set_named_space(space);
+ }
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ ut_ad(btr_pcur_is_after_last_on_page(&pcur));
+ goto reset_bit;
+ }
+
+ for (;;) {
+ rec_t* rec;
+
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ /* Check if the entry is for this index page */
+ if (ibuf_rec_get_page_no(&mtr, rec) != page_id.page_no()
+ || ibuf_rec_get_space(&mtr, rec) != page_id.space()) {
+
+ if (block != NULL) {
+ page_header_reset_last_insert(block, &mtr);
+ }
+
+ goto reset_bit;
+ }
+
+ if (corruption_noticed) {
+ fputs("InnoDB: Discarding record\n ", stderr);
+ rec_print_old(stderr, rec);
+ fputs("\nInnoDB: from the insert buffer!\n\n", stderr);
+ } else if (block != NULL && !rec_get_deleted_flag(rec, 0)) {
+			/* pcur is now positioned on a record which should
+			be applied to the index page; NOTE that the call below
+ copies pointers to fields in rec, and we must
+ keep the latch to the rec page until the
+ insertion is finished! */
+ dtuple_t* entry;
+ trx_id_t max_trx_id;
+ dict_index_t* dummy_index;
+ ibuf_op_t op = ibuf_rec_get_op_type(&mtr, rec);
+
+ max_trx_id = page_get_max_trx_id(page_align(rec));
+ page_update_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ max_trx_id, &mtr);
+
+ ut_ad(page_validate(page_align(rec), ibuf.index));
+
+ entry = ibuf_build_entry_from_ibuf_rec(
+ &mtr, rec, heap, &dummy_index);
+ ut_ad(!dummy_index->table->space);
+ dummy_index->table->space = space;
+ dummy_index->table->space_id = space->id;
+
+ ut_ad(page_validate(block->frame, dummy_index));
+
+ switch (op) {
+ case IBUF_OP_INSERT:
+#ifdef UNIV_IBUF_DEBUG
+ volume += rec_get_converted_size(
+ dummy_index, entry, 0);
+
+ volume += page_dir_calc_reserved_space(1);
+
+ ut_a(volume <= (4U << srv_page_size_shift)
+ / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+#endif
+ ibuf_insert_to_index_page(
+ entry, block, dummy_index, &mtr);
+ break;
+
+ case IBUF_OP_DELETE_MARK:
+ ibuf_set_del_mark(
+ entry, block, dummy_index, &mtr);
+ break;
+
+ case IBUF_OP_DELETE:
+ ibuf_delete(entry, block, dummy_index, &mtr);
+ /* Because ibuf_delete() will latch an
+ insert buffer bitmap page, commit mtr
+ before latching any further pages.
+ Store and restore the cursor position. */
+ ut_ad(rec == btr_pcur_get_rec(&pcur));
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(ibuf_rec_get_page_no(&mtr, rec)
+ == page_id.page_no());
+ ut_ad(ibuf_rec_get_space(&mtr, rec)
+ == page_id.space());
+
+ /* Mark the change buffer record processed,
+ so that it will not be merged again in case
+ the server crashes between the following
+ mtr_commit() and the subsequent mtr_commit()
+ of deleting the change buffer record. */
+ btr_rec_set_deleted<true>(
+ btr_pcur_get_block(&pcur),
+ btr_pcur_get_rec(&pcur), &mtr);
+
+ btr_pcur_store_position(&pcur, &mtr);
+ ibuf_btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ ibuf_mtr_start(&mtr);
+ mtr.set_named_space(space);
+
+ ut_ad(rw_lock_own(&block->lock, RW_LOCK_X));
+ buf_block_buf_fix_inc(block,
+ __FILE__, __LINE__);
+ rw_lock_x_lock(&block->lock);
+ mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
+
+ /* This is a user page (secondary
+ index leaf page), but it should be OK
+ to use too low latching order for it,
+ as the block is io-fixed. */
+ buf_block_dbg_add_level(
+ block, SYNC_IBUF_TREE_NODE);
+
+ if (!ibuf_restore_pos(page_id, search_tuple,
+ BTR_MODIFY_LEAF,
+ &pcur, &mtr)) {
+
+ ut_ad(mtr.has_committed());
+ mops[op]++;
+ ibuf_dummy_index_free(dummy_index);
+ goto loop;
+ }
+
+ break;
+ default:
+ ut_error;
+ }
+
+ mops[op]++;
+
+ ibuf_dummy_index_free(dummy_index);
+ } else {
+ dops[ibuf_rec_get_op_type(&mtr, rec)]++;
+ }
+
+ /* Delete the record from ibuf */
+ if (ibuf_delete_rec(page_id, &pcur, search_tuple, &mtr)) {
+ /* Deletion was pessimistic and mtr was committed:
+ we start from the beginning again */
+
+ ut_ad(mtr.has_committed());
+ goto loop;
+ } else if (btr_pcur_is_after_last_on_page(&pcur)) {
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ goto loop;
+ }
+ }
+
+reset_bit:
+ if (space) {
+ ibuf_reset_bitmap(block, page_id, zip_size, &mtr);
+ }
+
+ ibuf_mtr_commit(&mtr);
+
+ if (space) {
+ space->release();
+ }
+
+ btr_pcur_close(&pcur);
+ mem_heap_free(heap);
+
+ ibuf.n_merges++;
+ ibuf_add_ops(ibuf.n_merged_ops, mops);
+ ibuf_add_ops(ibuf.n_discarded_ops, dops);
+}
+
+/** Delete all change buffer entries for a tablespace,
+in DISCARD TABLESPACE, IMPORT TABLESPACE, or crash recovery.
+@param[in] space missing or to-be-discarded tablespace */
+void ibuf_delete_for_discarded_space(ulint space)
+{
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ dtuple_t* search_tuple;
+ const rec_t* ibuf_rec;
+ mtr_t mtr;
+
+ /* Counts for discarded operations. */
+ ulint dops[IBUF_OP_COUNT];
+
+ heap = mem_heap_create(512);
+
+ /* Use page number 0 to build the search tuple so that we get the
+ cursor positioned at the first entry for this space id */
+
+ search_tuple = ibuf_search_tuple_build(space, 0, heap);
+
+ memset(dops, 0, sizeof(dops));
+loop:
+ ibuf_mtr_start(&mtr);
+
+ /* Position pcur in the insert buffer at the first entry for the
+ space */
+ btr_pcur_open_on_user_rec(
+ ibuf.index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ ut_ad(btr_pcur_is_after_last_on_page(&pcur));
+ goto leave_loop;
+ }
+
+ for (;;) {
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+
+ ibuf_rec = btr_pcur_get_rec(&pcur);
+
+ /* Check if the entry is for this space */
+ if (ibuf_rec_get_space(&mtr, ibuf_rec) != space) {
+
+ goto leave_loop;
+ }
+
+ uint32_t page_no = ibuf_rec_get_page_no(&mtr, ibuf_rec);
+
+ dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++;
+
+ /* Delete the record from ibuf */
+ if (ibuf_delete_rec(page_id_t(space, page_no),
+ &pcur, search_tuple, &mtr)) {
+ /* Deletion was pessimistic and mtr was committed:
+ we start from the beginning again */
+
+ ut_ad(mtr.has_committed());
+ goto loop;
+ }
+
+ if (btr_pcur_is_after_last_on_page(&pcur)) {
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ goto loop;
+ }
+ }
+
+leave_loop:
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ ibuf_add_ops(ibuf.n_discarded_ops, dops);
+
+ mem_heap_free(heap);
+}
+
+/******************************************************************//**
+Checks whether the insert buffer is empty.
+@return true if empty */
+bool
+ibuf_is_empty(void)
+/*===============*/
+{
+ mtr_t mtr;
+
+ ibuf_mtr_start(&mtr);
+
+ ut_d(mutex_enter(&ibuf_mutex));
+ const buf_block_t* root = ibuf_tree_root_get(&mtr);
+ bool is_empty = page_is_empty(root->frame);
+ ut_a(is_empty == ibuf.empty);
+ ut_d(mutex_exit(&ibuf_mutex));
+ ibuf_mtr_commit(&mtr);
+
+ return(is_empty);
+}
+
+/******************************************************************//**
+Prints information about the insert buffer. */
+void
+ibuf_print(
+/*=======*/
+ FILE* file) /*!< in: file where to print */
+{
+ mutex_enter(&ibuf_mutex);
+
+ fprintf(file,
+ "Ibuf: size " ULINTPF ", free list len " ULINTPF ","
+ " seg size " ULINTPF ", " ULINTPF " merges\n",
+ ibuf.size,
+ ibuf.free_list_len,
+ ibuf.seg_size,
+ ulint{ibuf.n_merges});
+
+ fputs("merged operations:\n ", file);
+ ibuf_print_ops(ibuf.n_merged_ops, file);
+
+ fputs("discarded operations:\n ", file);
+ ibuf_print_ops(ibuf.n_discarded_ops, file);
+
+ mutex_exit(&ibuf_mutex);
+}
+
+/** Check the insert buffer bitmaps on IMPORT TABLESPACE.
+@param[in] trx transaction
+@param[in,out] space tablespace being imported
+@return DB_SUCCESS or error code */
+dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
+{
+ ut_ad(trx->mysql_thd);
+ ut_ad(space->purpose == FIL_TYPE_IMPORT);
+
+ const unsigned zip_size = space->zip_size();
+ const unsigned physical_size = space->physical_size();
+
+ uint32_t size= std::min(space->free_limit, space->size);
+
+ if (size == 0) {
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ mtr_t mtr;
+
+ mutex_enter(&ibuf_mutex);
+
+ /* The two bitmap pages (allocation bitmap and ibuf bitmap) repeat
+ every page_size pages. For example if page_size is 16 KiB, then the
+ two bitmap pages repeat every 16 KiB * 16384 = 256 MiB. In the loop
+ below page_no is measured in number of pages since the beginning of
+ the space, as usual. */
+
+ for (uint32_t page_no = 0; page_no < size; page_no += physical_size) {
+ if (trx_is_interrupted(trx)) {
+ mutex_exit(&ibuf_mutex);
+ return(DB_INTERRUPTED);
+ }
+
+ mtr_start(&mtr);
+
+ mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+
+ ibuf_enter(&mtr);
+
+ buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+ page_id_t(space->id, page_no), zip_size, &mtr);
+ if (!bitmap_page) {
+ mutex_exit(&ibuf_mutex);
+ mtr.commit();
+ return DB_CORRUPTION;
+ }
+
+ if (buf_is_zeroes(span<const byte>(bitmap_page->frame,
+ physical_size))) {
+			/* This means we got an all-zero page instead of
+			an ibuf bitmap page. The subsequent pages up to the
+			next bitmap page should be all-zero as well. */
+#ifdef UNIV_DEBUG
+			for (uint32_t curr_page = page_no + 1;
+			     curr_page < std::min<uint32_t>(
+				     page_no + physical_size, size);
+			     curr_page++) {
+
+ buf_block_t* block = buf_page_get(
+ page_id_t(space->id, curr_page),
+ zip_size, RW_S_LATCH, &mtr);
+ page_t* page = buf_block_get_frame(block);
+ ut_ad(buf_is_zeroes(span<const byte>(
+ page,
+ physical_size)));
+ }
+#endif /* UNIV_DEBUG */
+ ibuf_exit(&mtr);
+ mtr_commit(&mtr);
+ continue;
+ }
+
+ for (uint32_t i = FSP_IBUF_BITMAP_OFFSET + 1; i < physical_size;
+ i++) {
+ const uint32_t offset = page_no + i;
+ const page_id_t cur_page_id(space->id, offset);
+
+ if (ibuf_bitmap_page_get_bits(
+ bitmap_page->frame, cur_page_id, zip_size,
+ IBUF_BITMAP_IBUF, &mtr)) {
+
+ mutex_exit(&ibuf_mutex);
+ ibuf_exit(&mtr);
+ mtr_commit(&mtr);
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_INNODB_INDEX_CORRUPT,
+ "File %s page %u"
+ " is wrongly flagged to belong to the"
+ " insert buffer",
+ space->chain.start->name, offset);
+ return(DB_CORRUPTION);
+ }
+
+ if (ibuf_bitmap_page_get_bits(
+ bitmap_page->frame, cur_page_id, zip_size,
+ IBUF_BITMAP_BUFFERED, &mtr)) {
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN,
+ ER_INNODB_INDEX_CORRUPT,
+ "Buffered changes"
+ " for file %s page %u are lost",
+ space->chain.start->name, offset);
+
+ /* Tolerate this error, so that
+ slightly corrupted tables can be
+ imported and dumped. Clear the bit. */
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(
+ bitmap_page, cur_page_id,
+ physical_size, false, &mtr);
+ }
+ }
+
+ ibuf_exit(&mtr);
+ mtr_commit(&mtr);
+ }
+
+ mutex_exit(&ibuf_mutex);
+ return(DB_SUCCESS);
+}
+
+/** Updates the free bits and buffered bits for a bulk loaded page.
+@param[in]	block	index page
+@param[in]	reset	whether to reset the free bits to zero */
+void
+ibuf_set_bitmap_for_bulk_load(
+ buf_block_t* block,
+ bool reset)
+{
+ mtr_t mtr;
+ ulint free_val;
+
+ ut_a(page_is_leaf(buf_block_get_frame(block)));
+
+ mtr.start();
+ fil_space_t* space = mtr.set_named_space_id(block->page.id().space());
+
+ buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(block->page.id(),
+ space->zip_size(),
+ &mtr);
+
+ free_val = reset ? 0 : ibuf_index_page_calc_free(block);
+ /* FIXME: update the bitmap byte only once! */
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
+ bitmap_page, block->page.id(), block->physical_size(),
+ free_val, &mtr);
+
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(
+ bitmap_page, block->page.id(), block->physical_size(),
+ false, &mtr);
+
+ mtr.commit();
+}
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
new file mode 100644
index 00000000..7fae1ad1
--- /dev/null
+++ b/storage/innobase/include/btr0btr.h
@@ -0,0 +1,760 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2014, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.h
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0btr_h
+#define btr0btr_h
+
+#include "dict0dict.h"
+#include "data0data.h"
+#include "rem0types.h"
+#include "page0cur.h"
+#include "btr0types.h"
+#include "gis0type.h"
+
+#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level
+ (not really a hard limit).
+ Used in debug assertions
+ in btr_page_set_level and
+ btr_page_get_level */
+
+/** Maximum record size which can be stored on a page, without using the
+special big record storage structure */
+#define BTR_PAGE_MAX_REC_SIZE (srv_page_size / 2 - 200)
+
+/** @brief Maximum depth of a B-tree in InnoDB.
+
+Note that this isn't a maximum as such; none of the tree operations
+avoid producing trees bigger than this. It is instead a "max depth
+that other code must work with", useful for e.g. fixed-size arrays
+that must store some information about each level in a tree. In other
+words: if a B-tree with bigger depth than this is encountered, it is
+not acceptable for it to lead to mysterious memory corruption, but it
+is acceptable for the program to die with a clear assert failure. */
+#define BTR_MAX_LEVELS 100
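+
+/* For example, btr_cur_search_to_nth_level() in btr0cur.cc keeps
+per-level state in fixed-size arrays such as
+buf_block_t* tree_blocks[BTR_MAX_LEVELS]; a deeper tree is expected to
+fail an assertion rather than overrun them. */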
+
+/** Latching modes for btr_cur_search_to_nth_level(). */
+enum btr_latch_mode {
+ /** Search a record on a leaf page and S-latch it. */
+ BTR_SEARCH_LEAF = RW_S_LATCH,
+ /** (Prepare to) modify a record on a leaf page and X-latch it. */
+ BTR_MODIFY_LEAF = RW_X_LATCH,
+ /** Obtain no latches. */
+ BTR_NO_LATCHES = RW_NO_LATCH,
+ /** Start modifying the entire B-tree. */
+ BTR_MODIFY_TREE = 33,
+ /** Continue modifying the entire B-tree. */
+ BTR_CONT_MODIFY_TREE = 34,
+ /** Search the previous record. */
+ BTR_SEARCH_PREV = 35,
+ /** Modify the previous record. */
+ BTR_MODIFY_PREV = 36,
+ /** Start searching the entire B-tree. */
+ BTR_SEARCH_TREE = 37,
+ /** Continue searching the entire B-tree. */
+ BTR_CONT_SEARCH_TREE = 38,
+
+ /* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually
+ exclusive. */
+ /** The search tuple will be inserted to the secondary index
+ at the searched position. When the leaf page is not in the
+ buffer pool, try to use the change buffer. */
+ BTR_INSERT = 512,
+
+ /** Try to delete mark a secondary index leaf page record at
+ the searched position using the change buffer when the page is
+ not in the buffer pool. */
+ BTR_DELETE_MARK = 4096,
+
+ /** Try to purge the record using the change buffer when the
+ secondary index leaf page is not in the buffer pool. */
+ BTR_DELETE = 8192,
+
+ /** The caller is already holding dict_index_t::lock S-latch. */
+ BTR_ALREADY_S_LATCHED = 16384,
+ /** Search and S-latch a leaf page, assuming that the
+ dict_index_t::lock S-latch is being held. */
+ BTR_SEARCH_LEAF_ALREADY_S_LATCHED = BTR_SEARCH_LEAF
+ | BTR_ALREADY_S_LATCHED,
+ /** Search the entire index tree, assuming that the
+ dict_index_t::lock S-latch is being held. */
+ BTR_SEARCH_TREE_ALREADY_S_LATCHED = BTR_SEARCH_TREE
+ | BTR_ALREADY_S_LATCHED,
+ /** Search and X-latch a leaf page, assuming that the
+ dict_index_t::lock S-latch is being held. */
+ BTR_MODIFY_LEAF_ALREADY_S_LATCHED = BTR_MODIFY_LEAF
+ | BTR_ALREADY_S_LATCHED,
+
+ /** Attempt to delete-mark a secondary index record. */
+ BTR_DELETE_MARK_LEAF = BTR_MODIFY_LEAF | BTR_DELETE_MARK,
+ /** Attempt to delete-mark a secondary index record
+ while holding the dict_index_t::lock S-latch. */
+ BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED = BTR_DELETE_MARK_LEAF
+ | BTR_ALREADY_S_LATCHED,
+ /** Attempt to purge a secondary index record. */
+ BTR_PURGE_LEAF = BTR_MODIFY_LEAF | BTR_DELETE,
+ /** Attempt to purge a secondary index record
+ while holding the dict_index_t::lock S-latch. */
+ BTR_PURGE_LEAF_ALREADY_S_LATCHED = BTR_PURGE_LEAF
+ | BTR_ALREADY_S_LATCHED,
+
+	/** In the case of BTR_MODIFY_TREE, the caller specifies
+	the intention to delete records only. It is used to optimize
+	the block->lock range. */
+ BTR_LATCH_FOR_DELETE = 65536,
+
+ /** Attempt to purge a secondary index record in the tree. */
+ BTR_PURGE_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE
+};
+
+/** This flag ORed to btr_latch_mode says that we do the search in query
+optimization */
+#define BTR_ESTIMATE 1024U
+
+/** This flag ORed to BTR_INSERT says that we can ignore possible
+UNIQUE definition on secondary indexes when we decide if we can use
+the insert buffer to speed up inserts */
+#define BTR_IGNORE_SEC_UNIQUE 2048U
+
+/** In the case of BTR_MODIFY_TREE, the caller specifies the intention
+to insert records only. It is used to optimize the block->lock range. */
+#define BTR_LATCH_FOR_INSERT 32768U
+
+/** This flag is for undo insert of rtree. For rtree, we need this flag
+to find proper rec to undo insert.*/
+#define BTR_RTREE_UNDO_INS 131072U
+
+/** In the case of BTR_MODIFY_LEAF, the caller intends to allocate or
+free the pages of externally stored fields. */
+#define BTR_MODIFY_EXTERNAL 262144U
+
+/** Try to delete mark the record at the searched position when the
+record is in spatial index */
+#define BTR_RTREE_DELETE_MARK 524288U
+
+#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \
+ ((latch_mode) & ulint(~(BTR_INSERT \
+ | BTR_DELETE_MARK \
+ | BTR_RTREE_UNDO_INS \
+ | BTR_RTREE_DELETE_MARK \
+ | BTR_DELETE \
+ | BTR_ESTIMATE \
+ | BTR_IGNORE_SEC_UNIQUE \
+ | BTR_ALREADY_S_LATCHED \
+ | BTR_LATCH_FOR_INSERT \
+ | BTR_LATCH_FOR_DELETE \
+ | BTR_MODIFY_EXTERNAL)))
+
+#define BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode) \
+ ((latch_mode) & ulint(~(BTR_LATCH_FOR_INSERT \
+ | BTR_LATCH_FOR_DELETE \
+ | BTR_MODIFY_EXTERNAL)))
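+
+/* Example: ibuf_insert_low() is invoked with
+BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT and uses
+BTR_LATCH_MODE_WITHOUT_INTENTION() to strip the hint again, so
+BTR_LATCH_MODE_WITHOUT_INTENTION(BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT)
+== BTR_MODIFY_TREE. */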
+
+/** Report that an index page is corrupted.
+@param[in]	block	buffer block containing the corrupted page
+@param[in]	index	index tree */
+ATTRIBUTE_COLD ATTRIBUTE_NORETURN __attribute__((nonnull))
+void btr_corruption_report(const buf_block_t* block, const dict_index_t* index);
+
+/** Assert that a B-tree page is not corrupted.
+@param block buffer block containing a B-tree page
+@param index the B-tree index */
+#define btr_assert_not_corrupted(block, index) \
+ if (!!page_is_comp(buf_block_get_frame(block)) \
+ != index->table->not_redundant()) \
+ btr_corruption_report(block, index)
+
+/**************************************************************//**
+Gets the root node of a tree and sx-latches it for segment access.
+@return root page, sx-latched */
+page_t*
+btr_root_get(
+/*=========*/
+ const dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull));
+
+/**************************************************************//**
+Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
+@return error code, or DB_SUCCESS */
+dberr_t
+btr_root_adjust_on_import(
+/*======================*/
+ const dict_index_t* index) /*!< in: index tree */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/**************************************************************//**
+Gets the height of the B-tree (the level of the root, when the leaf
+level is assumed to be 0). The caller must hold an S or X latch on
+the index.
+@return tree height (level of the root) */
+ulint
+btr_height_get(
+/*===========*/
+ const dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Get an index page and declare its latching order level.
+@param[in] index index tree
+@param[in] page page number
+@param[in] mode latch mode
+@param[in] merge whether change buffer merge should be attempted
+@param[in] file file name
+@param[in] line line where called
+@param[in,out] mtr mini-transaction
+@return block */
+inline buf_block_t* btr_block_get_func(const dict_index_t& index,
+ uint32_t page, ulint mode, bool merge,
+ const char* file, unsigned line,
+ mtr_t* mtr)
+{
+ dberr_t err;
+
+ if (buf_block_t* block = buf_page_get_gen(
+ page_id_t(index.table->space->id, page),
+ index.table->space->zip_size(), mode, NULL, BUF_GET,
+ file, line, mtr, &err, merge && !index.is_clust())) {
+ ut_ad(err == DB_SUCCESS);
+ if (mode != RW_NO_LATCH) {
+ buf_block_dbg_add_level(block, index.is_ibuf()
+ ? SYNC_IBUF_TREE_NODE
+ : SYNC_TREE_NODE);
+ }
+ return block;
+ } else {
+ ut_ad(err != DB_SUCCESS);
+
+ if (err == DB_DECRYPTION_FAILED) {
+ if (index.table) {
+ index.table->file_unreadable = true;
+ }
+ }
+
+ return NULL;
+ }
+}
+
+/** Gets a buffer page and declares its latching order level.
+@param index index tree
+@param page page number
+@param mode latch mode
+@param merge whether change buffer merge should be attempted
+@param mtr mini-transaction handle
+@return the block descriptor */
+# define btr_block_get(index, page, mode, merge, mtr) \
+ btr_block_get_func(index, page, mode, merge, __FILE__, __LINE__, mtr)
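+
+/* Illustrative call of btr_block_get() (a sketch, not part of the
+original header; it assumes a started mini-transaction mtr, a
+dict_index_t* index, and a valid page_no):
+
+	buf_block_t* block = btr_block_get(*index, page_no,
+					   RW_S_LATCH, true, &mtr);
+	if (!block) {
+		// the page could not be read, e.g. decryption failed
+	}
+*/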
+/**************************************************************//**
+Gets the index id field of a page.
+@return index id */
+UNIV_INLINE
+index_id_t
+btr_page_get_index_id(
+/*==================*/
+ const page_t* page) /*!< in: index page */
+ MY_ATTRIBUTE((warn_unused_result));
+/** Read the B-tree or R-tree PAGE_LEVEL.
+@param page B-tree or R-tree page
+@return number of child page links to reach the leaf level
+@retval 0 for leaf pages */
+inline uint16_t btr_page_get_level(const page_t *page)
+{
+ uint16_t level= mach_read_from_2(my_assume_aligned<2>
+ (PAGE_HEADER + PAGE_LEVEL + page));
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+ return level;
+} MY_ATTRIBUTE((warn_unused_result))
+
+/** Read FIL_PAGE_NEXT.
+@param page buffer pool page
+@return next page number */
+inline uint32_t btr_page_get_next(const page_t* page)
+{
+ return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
+}
+
+/** Read FIL_PAGE_PREV.
+@param page buffer pool page
+@return previous page number */
+inline uint32_t btr_page_get_prev(const page_t* page)
+{
+ return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
+}
+
+/**************************************************************//**
+Releases the latch on a leaf page and buffer-unfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull));
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+uint32_t
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Create the root node for a new index tree.
+@param[in] type type of the index
+@param[in,out] space tablespace where created
+@param[in] index_id index id
+@param[in] index index, or NULL to create a system table
+@param[in,out] mtr mini-transaction
+@return page number of the created root
+@retval FIL_NULL if did not succeed */
+uint32_t
+btr_create(
+ ulint type,
+ fil_space_t* space,
+ index_id_t index_id,
+ dict_index_t* index,
+ mtr_t* mtr);
+
+/** Free a persistent index tree if it exists.
+@param[in] page_id root page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] index_id PAGE_INDEX_ID contents
+@param[in,out] mtr mini-transaction */
+void
+btr_free_if_exists(
+ const page_id_t page_id,
+ ulint zip_size,
+ index_id_t index_id,
+ mtr_t* mtr);
+
+/** Free an index tree in a temporary tablespace.
+@param[in] page_id root page id */
+void btr_free(const page_id_t page_id);
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
+@param[in,out] index clustered index
+@return the last used AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc(dict_index_t* index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
+or fall back to MAX(auto_increment_column).
+@param[in] table table containing an AUTO_INCREMENT column
+@param[in] col_no index of the AUTO_INCREMENT column
+@return the AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC.
+@param[in,out] index clustered index
+@param[in] autoinc the AUTO_INCREMENT value
+@param[in] reset whether to reset the AUTO_INCREMENT
+ to a possibly smaller value than currently
+ exists in the page */
+void
+btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false)
+ MY_ATTRIBUTE((nonnull));
+
+/** Write instant ALTER TABLE metadata to a root page.
+@param[in,out] root clustered index root page
+@param[in] index clustered index with instant ALTER TABLE
+@param[in,out] mtr mini-transaction */
+void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr);
+
+/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
+@param[in] index clustered index with instant ALTER TABLE
+@param[in] all whether to reset FIL_PAGE_TYPE as well
+@param[in,out] mtr mini-transaction */
+ATTRIBUTE_COLD __attribute__((nonnull))
+void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr);
+
+/*************************************************************//**
+Makes the tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed; we
+cannot reverse it. Therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((warn_unused_result));
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+bool
+btr_page_reorganize(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+/** Decide if the page should be split at the convergence point of inserts
+converging to the left.
+@param[in] cursor insert position
+@return the first record to be moved to the right half page
+@retval NULL if no split is recommended */
+rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor);
+/** Decide if the page should be split at the convergence point of inserts
+converging to the right.
+@param[in] cursor insert position
+@param[out] split_rec if split recommended, the first record
+ on the right half page, or
+ NULL if the to-be-inserted record
+ should be first
+@return whether split is recommended */
+bool
+btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec);
+
+/*************************************************************//**
+Splits an index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed; we cannot reverse it. Therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+
+@return inserted record */
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************//**
+Inserts a data tuple to a tree on a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree. */
+void
+btr_insert_on_non_leaf_level_func(
+/*==============================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level, must be > 0 */
+ dtuple_t* tuple, /*!< in: the record to be inserted */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_insert_on_non_leaf_level(f,i,l,t,m) \
+ btr_insert_on_non_leaf_level_func(f,i,l,t,__FILE__,__LINE__,m)
+
+/** Set a child page pointer record as the predefined minimum record.
+@tparam has_prev whether the page is supposed to have a left sibling
+@param[in,out] rec leftmost record on a leftmost non-leaf page
+@param[in,out] block buffer pool block
+@param[in,out] mtr mini-transaction */
+template<bool has_prev= false>
+inline void btr_set_min_rec_mark(rec_t *rec, const buf_block_t &block,
+ mtr_t *mtr)
+{
+ ut_ad(block.frame == page_align(rec));
+ ut_ad(!page_is_leaf(block.frame));
+ ut_ad(has_prev == page_has_prev(block.frame));
+
+ rec-= page_rec_is_comp(rec) ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS;
+
+ if (block.page.zip.data)
+ /* This flag is computed from other contents on a ROW_FORMAT=COMPRESSED
+ page. We are not modifying the compressed page frame at all. */
+ *rec|= REC_INFO_MIN_REC_FLAG;
+ else
+ mtr->write<1>(block, rec, *rec | REC_INFO_MIN_REC_FLAG);
+}
+
+/** Seek to the parent page of a B-tree page.
+@param[in,out] index b-tree
+@param[in] block child page
+@param[in,out] mtr mini-transaction
+@param[out] cursor cursor pointing to the x-latched parent page */
+void btr_page_get_father(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
+ btr_cur_t* cursor)
+ MY_ATTRIBUTE((nonnull));
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Checks that the node pointer to a page is appropriate.
+@return TRUE */
+ibool
+btr_check_node_ptr(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: index page */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((warn_unused_result));
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Tries to merge the page first to the left immediate brother if such a
+brother exists, and the node pointers to the current page and to the
+brother reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level, lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches to
+the brothers, if they exist.
+@return TRUE on success */
+ibool
+btr_compress(
+/*=========*/
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to merge
+ or lift; the page must not be empty:
+ when deleting records, use btr_discard_page()
+ if the page would become empty */
+ ibool adjust, /*!< in: TRUE if should adjust the
+ cursor position even if compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty. */
+void
+btr_discard_page(
+/*=============*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
+ the root page */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Gets the number of pages in a B-tree.
+@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
+ulint
+btr_get_size(
+/*=========*/
+ const dict_index_t* index, /*!< in: index */
+ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+ mtr_t* mtr) /*!< in/out: mini-transaction where index
+ is s-latched */
+ MY_ATTRIBUTE((warn_unused_result));
+/**************************************************************//**
+Gets the number of reserved and used pages in a B-tree.
+@return number of pages reserved, or ULINT_UNDEFINED if the index
+is unavailable */
+UNIV_INTERN
+ulint
+btr_get_size_and_reserved(
+/*======================*/
+ dict_index_t* index, /*!< in: index */
+ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr) /*!< in/out: mini-transaction where index
+ is s-latched */
+ __attribute__((nonnull));
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@retval NULL if no page could be allocated */
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+ dict_index_t* index, /*!< in: index tree */
+ uint32_t hint_page_no, /*!< in: hint of a good page */
+ byte file_direction, /*!< in: direction where a possible
+ page split is made */
+ ulint level, /*!< in: level where the page is placed
+ in the tree */
+ mtr_t* mtr, /*!< in/out: mini-transaction
+ for the allocation */
+ mtr_t* init_mtr) /*!< in/out: mini-transaction
+ for x-latching and initializing
+ the page */
+ MY_ATTRIBUTE((warn_unused_result));
+/** Empty an index page (possibly the root page). @see btr_page_create().
+@param[in,out] block page to be emptied
+@param[in,out] page_zip compressed page frame, or NULL
+@param[in] index index of the page
+@param[in] level B-tree level of the page (0=leaf)
+@param[in,out] mtr mini-transaction */
+void
+btr_page_empty(
+ buf_block_t* block,
+ page_zip_des_t* page_zip,
+ dict_index_t* index,
+ ulint level,
+ mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull(1, 3, 5)));
+/**************************************************************//**
+Creates a new index page (not the root, and also not
+used in page reorganization). @see btr_page_empty(). */
+void
+btr_page_create(
+/*============*/
+ buf_block_t* block, /*!< in/out: page to be created */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr); /*!< in: mtr */
+
+/** Free an index page.
+@param[in,out] index index tree
+@param[in,out] block block to be freed
+@param[in,out] mtr mini-transaction
+@param[in] blob whether this is freeing a BLOB page */
+MY_ATTRIBUTE((nonnull))
+void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
+ bool blob = false);
+
+/**************************************************************//**
+Gets the root node of a tree and x- or s-latches it.
+@return root page, x- or s-latched */
+buf_block_t*
+btr_root_block_get(
+/*===============*/
+ const dict_index_t* index, /*!< in: index tree */
+ rw_lock_type_t mode, /*!< in: either RW_S_LATCH
+ or RW_X_LATCH */
+ mtr_t* mtr); /*!< in: mtr */
+
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+bool btr_page_reorganize_block(
+ ulint z_level,/*!< in: compression level to be used
+ if dealing with compressed page */
+ buf_block_t* block, /*!< in/out: B-tree page */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree. */
+void
+btr_print_size(
+/*===========*/
+ dict_index_t* index) /*!< in: index tree */
+ MY_ATTRIBUTE((nonnull));
+/**************************************************************//**
+Prints directories and other info of all nodes in the index. */
+void
+btr_print_index(
+/*============*/
+ dict_index_t* index, /*!< in: index */
+ ulint width) /*!< in: print this many entries from start
+ and end */
+ MY_ATTRIBUTE((nonnull));
+#endif /* UNIV_BTR_PRINT */
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+@return TRUE if ok */
+ibool
+btr_index_rec_validate(
+/*===================*/
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index, /*!< in: index */
+ ibool dump_on_error) /*!< in: TRUE if the function
+ should print hex dump of record
+ and page on error */
+ MY_ATTRIBUTE((warn_unused_result));
+/**************************************************************//**
+Checks the consistency of an index tree.
+@return DB_SUCCESS if ok, error code if not */
+dberr_t
+btr_validate_index(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ const trx_t* trx) /*!< in: transaction or 0 */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Remove a page from the level list of pages.
+@param[in] block page to remove
+@param[in] index index tree
+@param[in,out] mtr mini-transaction */
+void btr_level_list_remove(const buf_block_t& block, const dict_index_t& index,
+ mtr_t* mtr);
+
+/*************************************************************//**
+If the page is the only one on its level, this function moves its
+records to the father page, thus reducing the tree height.
+@return father block */
+UNIV_INTERN
+buf_block_t*
+btr_lift_page_up(
+/*=============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page which is the only one on its level;
+ must not be empty: use
+ btr_discard_only_page_on_level if the last
+ record from the page should be removed */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+
+#define BTR_N_LEAF_PAGES 1
+#define BTR_TOTAL_SIZE 2
+
+#include "btr0btr.ic"
+
+/****************************************************************
+Global variable controlling whether scrubbing should be performed */
+extern my_bool srv_immediate_scrub_data_uncompressed;
+extern Atomic_counter<uint32_t> btr_validate_index_running;
+
+#endif
diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic
new file mode 100644
index 00000000..89826e8f
--- /dev/null
+++ b/storage/innobase/include/btr0btr.ic
@@ -0,0 +1,149 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.ic
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "page0zip.h"
+
+/**************************************************************//**
+Gets the index id field of a page.
+@return index id */
+UNIV_INLINE
+index_id_t
+btr_page_get_index_id(
+/*==================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID));
+}
+
+/** Set PAGE_LEVEL.
+@param[in,out] block buffer block
+@param[in] level page level
+@param[in,out] mtr mini-transaction */
+inline
+void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr)
+{
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+ constexpr uint16_t field= PAGE_HEADER + PAGE_LEVEL;
+ byte *b= my_assume_aligned<2>(&block->frame[field]);
+ if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, level) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<2>(&block->page.zip.data[field], b, 2);
+}
+
+/** Set FIL_PAGE_NEXT.
+@param[in,out] block buffer block
+@param[in] next number of successor page
+@param[in,out] mtr mini-transaction */
+inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr)
+{
+ constexpr uint16_t field= FIL_PAGE_NEXT;
+ byte *b= my_assume_aligned<4>(&block->frame[field]);
+ if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, next) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
+}
+
+/** Set FIL_PAGE_PREV.
+@param[in,out] block buffer block
+@param[in] prev number of predecessor page
+@param[in,out] mtr mini-transaction */
+inline void btr_page_set_prev(buf_block_t *block, ulint prev, mtr_t *mtr)
+{
+ constexpr uint16_t field= FIL_PAGE_PREV;
+ byte *b= my_assume_aligned<4>(&block->frame[field]);
+ if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, prev) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
+}
+
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+uint32_t
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ const byte* field;
+ ulint len;
+
+ ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+ /* The child address is in the last field */
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1, &len);
+
+ ut_ad(len == 4);
+
+ uint32_t page_no = mach_read_from_4(field);
+ ut_ad(page_no > 1);
+
+ return(page_no);
+}
+
+/**************************************************************//**
+Releases the latch on a leaf page and buffer-unfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(latch_mode == BTR_SEARCH_LEAF
+ || latch_mode == BTR_MODIFY_LEAF
+ || latch_mode == BTR_NO_LATCHES);
+
+ ut_ad(!mtr->memo_contains_flagged(block, MTR_MEMO_MODIFY));
+
+ mtr_memo_type_t mode;
+ switch (latch_mode) {
+ case BTR_SEARCH_LEAF:
+ mode = MTR_MEMO_PAGE_S_FIX;
+ break;
+ case BTR_MODIFY_LEAF:
+ mode = MTR_MEMO_PAGE_X_FIX;
+ break;
+ case BTR_NO_LATCHES:
+ mode = MTR_MEMO_BUF_FIX;
+ break;
+ default:
+ ut_a(0);
+ }
+
+ mtr->memo_release(block, mode);
+}
diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h
new file mode 100644
index 00000000..943836f8
--- /dev/null
+++ b/storage/innobase/include/btr0bulk.h
@@ -0,0 +1,371 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0bulk.h
+The B-tree bulk load
+
+Created 03/11/2014 Shaohua Wang
+*************************************************************************/
+
+#ifndef btr0bulk_h
+#define btr0bulk_h
+
+#include "dict0dict.h"
+#include "rem0types.h"
+#include "page0cur.h"
+
+#include <vector>
+
+/** InnoDB B-tree index fill factor for bulk load. */
+extern uint innobase_fill_factor;
+
+/*
+The proper function call sequence of PageBulk is as follows:
+-- PageBulk::init
+-- PageBulk::insert
+-- PageBulk::finish
+-- PageBulk::compress(COMPRESSED table only)
+-- PageBulk::pageSplit(COMPRESSED table only)
+-- PageBulk::commit
+*/
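+
+/*
+A sketch of that sequence for an uncompressed page (illustrative only;
+index, trx_id, page_no, level, rec and offsets are assumed to be
+prepared by the caller):
+
+	PageBulk page_bulk(index, trx_id, page_no, level);
+	dberr_t err = page_bulk.init();
+	if (err == DB_SUCCESS) {
+		page_bulk.insert(rec, offsets);
+		page_bulk.finish();
+		page_bulk.commit(true);
+	}
+*/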
+
+class PageBulk
+{
+public:
+ /** Constructor
+ @param[in] index B-tree index
+ @param[in] trx_id transaction id
+ @param[in] page_no page number
+ @param[in] level page level */
+ PageBulk(
+ dict_index_t* index,
+ trx_id_t trx_id,
+ uint32_t page_no,
+ ulint level)
+ :
+ m_heap(NULL),
+ m_index(index),
+ m_mtr(),
+ m_trx_id(trx_id),
+ m_block(NULL),
+ m_page(NULL),
+ m_page_zip(NULL),
+ m_cur_rec(NULL),
+ m_page_no(page_no),
+ m_level(level),
+ m_is_comp(dict_table_is_comp(index->table)),
+ m_heap_top(NULL),
+ m_rec_no(0),
+ m_free_space(0),
+ m_reserved_space(0),
+#ifdef UNIV_DEBUG
+ m_total_data(0),
+#endif /* UNIV_DEBUG */
+ m_modify_clock(0),
+ m_err(DB_SUCCESS)
+ {
+ ut_ad(!dict_index_is_spatial(m_index));
+ ut_ad(!m_index->table->is_temporary());
+ }
+
+ /** Destructor */
+ ~PageBulk()
+ {
+ mem_heap_free(m_heap);
+ }
+
+ /** Initialize members, allocate the page if needed, and start the mtr.
+ Note: must be called exactly once, right after the constructor.
+ @return error code */
+ dberr_t init();
+
+ /** Insert a record in the page.
+ @param[in] rec record
+ @param[in] offsets record offsets */
+ inline void insert(const rec_t* rec, rec_offs* offsets);
+private:
+ /** Page format */
+ enum format { REDUNDANT, DYNAMIC, COMPRESSED };
+ /** Mark the end of insertion to the page. Scan all records to build
+ the page directory, and set the page header members.
+ @tparam format the page format */
+ template<format> inline void finishPage();
+ /** Insert a record in the page.
+ @tparam format the page format
+ @param[in,out] rec record
+ @param[in] offsets record offsets */
+ template<format> inline void insertPage(rec_t* rec, rec_offs* offsets);
+
+public:
+ /** Mark the end of insertion to the page. Scan all records to build
+ the page directory, and set the page header members. */
+ inline void finish();
+
+ /** @return whether finish() actually needs to do something */
+ inline bool needs_finish() const;
+
+ /** Commit the mtr for a page.
+ @param[in] success whether all inserts succeeded */
+ void commit(bool success);
+
+ /** Compress the page if the table is in COMPRESSED format.
+ @return true if compression succeeded or was not needed
+ @return false if compression failed */
+ bool compress();
+
+ /** Check whether the record needs to be stored externally.
+ @return whether the record must be stored externally */
+ bool needExt(const dtuple_t* tuple, ulint rec_size);
+
+ /** Store an externally stored record.
+ @param[in] big_rec external record
+ @param[in] offsets record offsets
+ @return error code */
+ dberr_t storeExt(const big_rec_t* big_rec, rec_offs* offsets);
+
+ /** Get node pointer
+ @return node pointer */
+ dtuple_t* getNodePtr();
+
+ /** Get the split rec in the page. We split a page in half when compression
+ fails, and the split rec should be copied to the new page.
+ @return split rec */
+ rec_t* getSplitRec();
+
+ /** Copy in all records after the split rec, including itself.
+ @param[in] split_rec split rec */
+ void copyIn(rec_t* split_rec);
+
+ /** Remove all records after the split rec, including itself.
+ @param[in] split_rec split rec */
+ void copyOut(rec_t* split_rec);
+
+ /** Set next page
+ @param[in] next_page_no next page no */
+ inline void setNext(ulint next_page_no);
+
+ /** Set previous page
+ @param[in] prev_page_no previous page no */
+ inline void setPrev(ulint prev_page_no);
+
+ /** Release the block by committing the mtr */
+ inline void release();
+
+ /** Start mtr and latch block */
+ inline dberr_t latch();
+
+ /** Check if the required space is available in the page for the rec
+ to be inserted. We check the fill factor and padding here.
+ @param[in] rec_size required length
+ @return true if space is available */
+ inline bool isSpaceAvailable(ulint rec_size);
+
+ /** Get page no */
+ uint32_t getPageNo() const { return m_page_no; }
+
+ /** Get page level */
+ ulint getLevel()
+ {
+ return(m_level);
+ }
+
+ /** Get record no */
+ ulint getRecNo()
+ {
+ return(m_rec_no);
+ }
+
+ /** Get page */
+ page_t* getPage()
+ {
+ return(m_page);
+ }
+
+ /** Get page zip */
+ page_zip_des_t* getPageZip()
+ {
+ return(m_page_zip);
+ }
+
+ dberr_t getError()
+ {
+ return(m_err);
+ }
+
+ void set_modified() { m_mtr.set_modified(*m_block); }
+
+ /* Memory heap for internal allocation */
+ mem_heap_t* m_heap;
+
+private:
+ /** The index B-tree */
+ dict_index_t* m_index;
+
+ /** The mini-transaction */
+ mtr_t m_mtr;
+
+ /** The transaction id */
+ trx_id_t m_trx_id;
+
+ /** The buffer block */
+ buf_block_t* m_block;
+
+ /** The page */
+ page_t* m_page;
+
+ /** The page zip descriptor */
+ page_zip_des_t* m_page_zip;
+
+ /** The current rec, just before the next insert rec */
+ rec_t* m_cur_rec;
+
+ /** The page no */
+ uint32_t m_page_no;
+
+ /** The page level in B-tree */
+ ulint m_level;
+
+ /** Flag: is page in compact format */
+ const bool m_is_comp;
+
+ /** The heap top in page for next insert */
+ byte* m_heap_top;
+
+ /** User record no */
+ ulint m_rec_no;
+
+ /** The free space left in the page */
+ ulint m_free_space;
+
+ /** The reserved space for fill factor */
+ ulint m_reserved_space;
+
+ /** The padding space for compressed page */
+ ulint m_padding_space;
+
+#ifdef UNIV_DEBUG
+ /** Total data in the page */
+ ulint m_total_data;
+#endif /* UNIV_DEBUG */
+
+ /** The modify clock value of the buffer block
+ when the block is re-pinned */
+ ib_uint64_t m_modify_clock;
+
+ /** Operation result DB_SUCCESS or error code */
+ dberr_t m_err;
+};
+
+typedef std::vector<PageBulk*, ut_allocator<PageBulk*> >
+ page_bulk_vector;
+
+class BtrBulk
+{
+public:
+ /** Constructor
+ @param[in] index B-tree index
+ @param[in] trx transaction */
+ BtrBulk(
+ dict_index_t* index,
+ const trx_t* trx)
+ :
+ m_index(index),
+ m_trx(trx)
+ {
+ ut_ad(!dict_index_is_spatial(index));
+ }
+
+ /** Insert a tuple
+ @param[in] tuple tuple to insert.
+ @return error code */
+ dberr_t insert(dtuple_t* tuple)
+ {
+ return(insert(tuple, 0));
+ }
+
+ /** Finish the B-tree bulk load. We commit the last page in each level
+ and copy the last page in the top level to the root page of the index
+ if no error occurs.
+ @param[in] err whether bulk load was successful until now
+ @return error code */
+ dberr_t finish(dberr_t err);
+
+ /** Release all latches */
+ void release();
+
+ /** Re-latch all latches */
+ void latch();
+
+ table_name_t table_name() { return m_index->table->name; }
+
+private:
+ /** Insert a tuple to a page in a level
+ @param[in] tuple tuple to insert
+ @param[in] level B-tree level
+ @return error code */
+ dberr_t insert(dtuple_t* tuple, ulint level);
+
+ /** Split a page
+ @param[in] page_bulk page to split
+ @param[in] next_page_bulk next page
+ @return error code */
+ dberr_t pageSplit(PageBulk* page_bulk,
+ PageBulk* next_page_bulk);
+
+ /** Commit (finish) a page. We set the next/prev page numbers,
+ compress the page if the table is compressed and split the page if
+ compression fails, insert a node pointer to the father page if
+ needed, and commit the mini-transaction.
+ @param[in] page_bulk page to commit
+ @param[in] next_page_bulk next page
+ @param[in] insert_father whether to insert a node pointer
+ @return error code */
+ dberr_t pageCommit(PageBulk* page_bulk,
+ PageBulk* next_page_bulk,
+ bool insert_father);
+
+ /** Abort a page when an error occurs.
+ @param[in] page_bulk page bulk object
+ Note: pageAbort should be called for a PageBulk object that is no
+ longer in m_page_bulks after pageCommit; PageBulk objects still in
+ m_page_bulks are committed or aborted in finish(). */
+ void pageAbort(PageBulk* page_bulk)
+ {
+ page_bulk->commit(false);
+ }
+
+ /** Log free check */
+ inline void logFreeCheck();
+
+private:
+ /** B-tree index */
+ dict_index_t*const m_index;
+
+ /** Transaction */
+ const trx_t*const m_trx;
+
+ /** Root page level */
+ ulint m_root_level;
+
+ /** Page cursor vector for all levels */
+ page_bulk_vector m_page_bulks;
+};
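+
+/* Illustrative BtrBulk driver loop (a sketch; fetch_next_tuple() is a
+hypothetical source of index entries, not part of this API):
+
+	BtrBulk bulk(index, trx);
+	dberr_t err = DB_SUCCESS;
+	while (dtuple_t* tuple = fetch_next_tuple()) {
+		err = bulk.insert(tuple);
+		if (err != DB_SUCCESS) {
+			break;
+		}
+	}
+	err = bulk.finish(err);
+*/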
+
+#endif
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
new file mode 100644
index 00000000..7136d726
--- /dev/null
+++ b/storage/innobase/include/btr0cur.h
@@ -0,0 +1,1010 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.h
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0cur_h
+#define btr0cur_h
+
+#include "dict0dict.h"
+#include "page0cur.h"
+#include "btr0types.h"
+#include "rem0types.h"
+#include "gis0type.h"
+#include "my_base.h"
+
+/** Mode flags for btr_cur operations; these can be ORed */
+enum {
+ /** do no undo logging */
+ BTR_NO_UNDO_LOG_FLAG = 1,
+ /** do no record lock checking */
+ BTR_NO_LOCKING_FLAG = 2,
+ /** sys fields will be found in the update vector or inserted
+ entry */
+ BTR_KEEP_SYS_FLAG = 4,
+
+ /** no rollback */
+ BTR_NO_ROLLBACK = BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG,
+
+ /** btr_cur_pessimistic_update() must keep cursor position
+ when moving columns to big_rec */
+ BTR_KEEP_POS_FLAG = 8,
+ /** the caller is creating the index or wants to bypass the
+ index->info.online creation log */
+ BTR_CREATE_FLAG = 16,
+ /** the caller of btr_cur_optimistic_update() or
+ btr_cur_update_in_place() will take care of
+ updating IBUF_BITMAP_FREE */
+ BTR_KEEP_IBUF_BITMAP = 32
+};
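+
+/* Example composition (illustrative only): an internal operation that
+must bypass both undo logging and record lock checking would pass
+
+	ulint flags = BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG;
+
+to the btr_cur_* functions declared below. */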
+
+/* btr_cur_latch_leaves() returns latched blocks and savepoints. */
+struct btr_latch_leaves_t {
+ /* left block, target block and right block */
+ buf_block_t* blocks[3];
+ ulint savepoints[3];
+};
+
+#include "que0types.h"
+#include "row0types.h"
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the page cursor component of a tree cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_cur_get_page_cur(
+/*=================*/
+ const btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the buffer block on which the tree cursor is positioned.
+@return pointer to buffer block */
+UNIV_INLINE
+buf_block_t*
+btr_cur_get_block(
+/*==============*/
+ const btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the record pointer of a tree cursor.
+@return pointer to record */
+UNIV_INLINE
+rec_t*
+btr_cur_get_rec(
+/*============*/
+ const btr_cur_t* cursor);/*!< in: tree cursor */
+#else /* UNIV_DEBUG */
+# define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
+# define btr_cur_get_block(cursor) ((cursor)->page_cur.block)
+# define btr_cur_get_rec(cursor) ((cursor)->page_cur.rec)
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the compressed page on which the tree cursor is positioned.
+@return pointer to compressed page, or NULL if the page is not compressed */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+/*=================*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the page of a tree cursor.
+@return pointer to page */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the index of a cursor.
+@param cursor b-tree cursor
+@return index */
+#define btr_cur_get_index(cursor) ((cursor)->index)
+/*********************************************************//**
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+ dict_index_t* index, /*!< in: index */
+ rec_t* rec, /*!< in: record in tree */
+ buf_block_t* block, /*!< in: buffer block of rec */
+ btr_cur_t* cursor);/*!< in: cursor */
+
+/** Load the instant ALTER TABLE metadata from the clustered index
+when loading a table definition.
+@param[in,out] table table definition from the data dictionary
+@return error code
+@retval DB_SUCCESS if no error occurred */
+dberr_t
+btr_cur_instant_init(dict_table_t* table)
+ ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result));
+
+/** Initialize the n_core_null_bytes on first access to a clustered
+index root page.
+@param[in] index clustered index that is on its first access
+@param[in] page clustered index root page
+@return whether the page is corrupted */
+bool
+btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
+ ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result));
+
+/** Optimistically latches the leaf page or pages requested.
+@param[in] block guessed buffer block
+@param[in] modify_clock modify clock value
+@param[in,out] latch_mode BTR_SEARCH_LEAF, ...
+@param[in,out] cursor cursor
+@param[in] file file name
+@param[in] line line where called
+@param[in] mtr mini-transaction
+@return true if success */
+bool
+btr_cur_optimistic_latch_leaves(
+ buf_block_t* block,
+ ib_uint64_t modify_clock,
+ ulint* latch_mode,
+ btr_cur_t* cursor,
+ const char* file,
+ unsigned line,
+ mtr_t* mtr);
+
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+Note that if mode is PAGE_CUR_LE, which is used in inserts, then
+cursor->up_match and cursor->low_match both will have sensible values.
+If mode is PAGE_CUR_GE, then up_match will have a sensible value. */
+dberr_t
+btr_cur_search_to_nth_level_func(
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the tree level of search */
+ const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
+ tuple must be set so that it cannot get
+ compared to the node ptr page number field! */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be PAGE_CUR_LE,
+ not PAGE_CUR_GE, as the latter may end up on
+ the previous page of the record! Inserts
+ should always be made using PAGE_CUR_LE to
+ search the position! */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
+ at most one of BTR_INSERT, BTR_DELETE_MARK,
+ BTR_DELETE, or BTR_ESTIMATE;
+ cursor->left_block is used to store a pointer
+ to the left neighbor page, in the cases
+ BTR_SEARCH_PREV and BTR_MODIFY_PREV;
+ NOTE that if ahi_latch, we might not have a
+ cursor page latch, we assume that ahi_latch
+ protects the record! */
+ btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is
+ s- or x-latched, but see also above! */
+#ifdef BTR_CUR_HASH_ADAPT
+ rw_lock_t* ahi_latch,
+ /*!< in: currently held btr_search_latch
+ (in RW_S_LATCH mode), or NULL */
+#endif /* BTR_CUR_HASH_ADAPT */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ ib_uint64_t autoinc = 0);
+ /*!< in: PAGE_ROOT_AUTO_INC to be written
+ (0 if none) */
+#ifdef BTR_CUR_HASH_ADAPT
+# define btr_cur_search_to_nth_level(i,l,t,m,lm,c,a,fi,li,mtr) \
+ btr_cur_search_to_nth_level_func(i,l,t,m,lm,c,a,fi,li,mtr)
+#else /* BTR_CUR_HASH_ADAPT */
+# define btr_cur_search_to_nth_level(i,l,t,m,lm,c,a,fi,li,mtr) \
+ btr_cur_search_to_nth_level_func(i,l,t,m,lm,c,fi,li,mtr)
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/*****************************************************************//**
+Opens a cursor at either end of an index.
+@return DB_SUCCESS or error code */
+dberr_t
+btr_cur_open_at_index_side_func(
+/*============================*/
+ bool from_left, /*!< in: true if open to the low end,
+ false if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_cur_t* cursor, /*!< in/out: cursor */
+ ulint level, /*!< in: level to search for
+ (0=leaf) */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+
+#define btr_cur_open_at_index_side(f,i,l,c,lv,m) \
+ btr_cur_open_at_index_side_func(f,i,l,c,lv,__FILE__,__LINE__,m)
+
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree.
+@return true if the index is available and we have put the cursor, false
+if the index is unavailable */
+bool
+btr_cur_open_at_rnd_pos_func(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /*!< in/out: B-tree cursor */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_cur_open_at_rnd_pos(i,l,c,m) \
+ btr_cur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
+@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
+dberr_t
+btr_cur_optimistic_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameters index and thr should be
+ specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
+ cursor stays valid */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in/out: query thread; can be NULL if
+ !(~flags
+ & (BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG)) */
+ mtr_t* mtr) /*!< in/out: mini-transaction;
+ if this function returns DB_SUCCESS on
+ a leaf page of a secondary index in a
+ compressed tablespace, the caller must
+ mtr_commit(mtr) before latching
+ any further pages */
+ MY_ATTRIBUTE((nonnull(2,3,4,5,6,7,10), warn_unused_result));
+/*************************************************************//**
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also own x-latches
+to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error number */
+dberr_t
+btr_cur_pessimistic_insert(
+/*=======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameter thr should be
+ specified; if no undo logging is specified,
+ then the caller must have reserved enough
+ free extents in the file space so that the
+ insertion will certainly succeed */
+ btr_cur_t* cursor, /*!< in: cursor after which to insert;
+ cursor stays valid */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in/out: query thread; can be NULL if
+ !(~flags
+ & (BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG)) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull(2,3,4,5,6,7,10), warn_unused_result));
+/*************************************************************//**
+See if there is enough place in the page modification log to log
+an update-in-place.
+
+@retval false if out of space; IBUF_BITMAP_FREE will be reset
+outside mtr if the page was recompressed
+@retval true if there is enough space;
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
+a secondary index leaf page. This has to be done either within the
+same mini-transaction, or by invoking ibuf_reset_free_bits() before
+mtr_commit(mtr). */
+bool
+btr_cur_update_alloc_zip_func(
+/*==========================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ page_cur_t* cursor, /*!< in/out: B-tree page cursor */
+ dict_index_t* index, /*!< in: the index corresponding to cursor */
+#ifdef UNIV_DEBUG
+ rec_offs* offsets,/*!< in/out: offsets of the cursor record */
+#endif /* UNIV_DEBUG */
+ ulint length, /*!< in: size needed */
+ bool create, /*!< in: true=delete-and-insert,
+ false=update-in-place */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr) \
+ btr_cur_update_alloc_zip_func(page_zip,cursor,index,offsets,len,cr,mtr)
+#else /* UNIV_DEBUG */
+# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr) \
+ btr_cur_update_alloc_zip_func(page_zip,cursor,index,len,cr,mtr)
+#endif /* UNIV_DEBUG */
+
+/** Apply an update vector to a record. No field size changes are allowed.
+
+This is usually invoked on a clustered index. The only use case for a
+secondary index is row_ins_sec_index_entry_by_modify() or its
+counterpart in ibuf_insert_to_index_page().
+@param[in,out] rec index record
+@param[in] index the index of the record
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] update update vector
+@param[in,out] block index page
+@param[in,out] mtr mini-transaction */
+void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index,
+ const rec_offs *offsets, const upd_t *update,
+ buf_block_t *block, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Updates a record when the update causes no size changes in its fields.
+@return locking or undo log related error code, or
+@retval DB_SUCCESS on success
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+dberr_t
+btr_cur_update_in_place(
+/*====================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ rec_offs* offsets,/*!< in/out: offsets on cursor->page_cur.rec */
+ const upd_t* update, /*!< in: update vector */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; if this
+ is a secondary index, the caller must
+ mtr_commit(mtr) before latching any
+ further pages */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+/*************************************************************//**
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended.
+@return error code, including
+@retval DB_SUCCESS on success
+@retval DB_OVERFLOW if the updated record does not fit
+@retval DB_UNDERFLOW if the page would become too empty
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page */
+dberr_t
+btr_cur_optimistic_update(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */
+ const upd_t* update, /*!< in: update vector; this must also
+ contain trx id and roll ptr fields */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; if this
+ is a secondary index, the caller must
+ mtr_commit(mtr) before latching any
+ further pages */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+/*************************************************************//**
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error code */
+dberr_t
+btr_cur_pessimistic_update(
+/*=======================*/
+ ulint flags, /*!< in: undo logging, locking, and rollback
+ flags */
+ btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
+ cursor may become invalid if *big_rec == NULL
+ || !(flags & BTR_KEEP_POS_FLAG) */
+ rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: pointer to memory heap
+ that can be emptied */
+ mem_heap_t* entry_heap,
+ /*!< in/out: memory heap for allocating
+ big_rec and the index tuple */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller */
+ upd_t* update, /*!< in/out: update vector; this is allowed to
+ also contain trx id and roll ptr fields.
+ Non-updated columns that are moved offpage will
+ be appended to this. */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; must be committed
+ before latching any further pages */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+/***********************************************************//**
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field a pointer to the
+undo log record created.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+dberr_t
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+ buf_block_t* block, /*!< in/out: buffer block of the record */
+ rec_t* rec, /*!< in/out: record */
+ dict_index_t* index, /*!< in: clustered index of the record */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec) */
+ que_thr_t* thr, /*!< in: query thread */
+ const dtuple_t* entry, /*!< in: dtuple for the deleting record */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Tries to compress a page of the tree if it seems useful. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done!
+@return TRUE if compression occurred */
+ibool
+btr_cur_compress_if_useful(
+/*=======================*/
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
+ cursor does not stay valid if compression
+ occurs */
+ ibool adjust, /*!< in: TRUE if should adjust the
+ cursor position even if compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned. It is assumed
+that the mtr has an x-latch on the page where the cursor is positioned,
+but no latch on the whole tree.
+@return TRUE if success, i.e., the page did not become too empty */
+ibool
+btr_cur_optimistic_delete_func(
+/*===========================*/
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ cursor stays valid: if deletion succeeds,
+ on function exit it points to the successor
+ of the deleted record */
+# ifdef UNIV_DEBUG
+ ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
+# endif /* UNIV_DEBUG */
+ mtr_t* mtr) /*!< in: mtr; if this function returns
+ TRUE on a leaf page of a secondary
+ index, the mtr must be committed
+ before latching any further pages */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+# ifdef UNIV_DEBUG
+# define btr_cur_optimistic_delete(cursor, flags, mtr) \
+ btr_cur_optimistic_delete_func(cursor, flags, mtr)
+# else /* UNIV_DEBUG */
+# define btr_cur_optimistic_delete(cursor, flags, mtr) \
+ btr_cur_optimistic_delete_func(cursor, mtr)
+# endif /* UNIV_DEBUG */
+/*************************************************************//**
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page. To avoid deadlocks,
+mtr must also own x-latches to brothers of page, if those brothers
+exist.
+@return TRUE if compression occurred */
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+ dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+ the latter may occur because we may have
+ to update node pointers on upper levels,
+ and in the case of variable length keys
+ these may actually grow in size */
+ ibool has_reserved_extents, /*!< in: TRUE if the
+ caller has already reserved enough free
+ extents so that he knows that the operation
+ will succeed */
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ if compression does not occur, the cursor
+ stays valid: it points to successor of
+ deleted record on function exit */
+ ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
+ bool rollback,/*!< in: performing rollback? */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull));
+/** Delete the node pointer in a parent page.
+@param[in,out] parent cursor pointing to parent record
+@param[in,out] mtr mini-transaction */
+void btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull));
+/***********************************************************//**
+Parses a redo log record of updating a record in-place.
+@return end of log record or NULL */
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index); /*!< in: index corresponding to page */
+/** Arguments to btr_estimate_n_rows_in_range */
+struct btr_pos_t
+{
+ btr_pos_t(dtuple_t *arg_tuple,
+ page_cur_mode_t arg_mode,
+ page_id_t arg_page_id)
+ :tuple(arg_tuple), mode(arg_mode), page_id(arg_page_id)
+ {}
+
+  dtuple_t*	tuple;	 /*!< in: range start or end; may be NULL */
+  page_cur_mode_t mode;	 /*!< in: search mode for the range */
+  page_id_t	page_id; /*!< out: page where we found the tuple */
+};
+
+/** Estimates the number of rows in a given index range.
+@param[in] index index
+@param[in,out]	range_start	start of the range
+@param[in,out]	range_end	end of the range
+@return estimated number of rows */
+ha_rows
+btr_estimate_n_rows_in_range(
+ dict_index_t* index,
+ btr_pos_t* range_start,
+ btr_pos_t* range_end);
+
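+/* A minimal usage sketch for btr_estimate_n_rows_in_range(); `index` and the
+search tuples `low` and `high` are hypothetical and must be built by the
+caller, and the page_id_t members are filled in by the callee:
+
+	btr_pos_t	range_start(low, PAGE_CUR_GE, page_id_t(0, 0));
+	btr_pos_t	range_end(high, PAGE_CUR_LE, page_id_t(0, 0));
+	ha_rows		n_rows = btr_estimate_n_rows_in_range(
+		index, &range_start, &range_end);
+*/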
+
+/** Statistics for one field of an index. */
+struct index_field_stats_t
+{
+ ib_uint64_t n_diff_key_vals;
+ ib_uint64_t n_sample_sizes;
+ ib_uint64_t n_non_null_key_vals;
+
+ index_field_stats_t(ib_uint64_t n_diff_key_vals= 0,
+ ib_uint64_t n_sample_sizes= 0,
+ ib_uint64_t n_non_null_key_vals= 0)
+ : n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes),
+ n_non_null_key_vals(n_non_null_key_vals)
+ {
+ }
+};
+
+/** Estimates the number of different key values in a given index, for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
+0..n_uniq-1) and the number of pages that were sampled is saved in
+index->stat_n_sample_sizes[].
+If innodb_stats_method is nulls_ignored, we also record the number of
+non-null values for each prefix and store the estimates in the
+array index->stat_n_non_null_key_vals[].
+@param[in] index index
+@return the vector of statistics if the index is available and the estimates
+were computed; an empty vector if the index is unavailable */
+std::vector<index_field_stats_t>
+btr_estimate_number_of_different_key_vals(dict_index_t* index);
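+
+/* A sketch of consuming the result above; `index` is assumed to be a valid
+dict_index_t whose tree is available:
+
+	std::vector<index_field_stats_t> stats
+		= btr_estimate_number_of_different_key_vals(index);
+	for (const index_field_stats_t& s : stats) {
+		// s.n_diff_key_vals, s.n_sample_sizes,
+		// s.n_non_null_key_vals for each n-column prefix
+	}
+*/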
+
+/** Gets the externally stored size of a record, in units of a database page.
+@param[in] rec record
+@param[in] offsets array returned by rec_get_offsets()
+@return externally stored part, in units of a database page */
+ulint
+btr_rec_get_externally_stored_len(
+ const rec_t* rec,
+ const rec_offs* offsets);
+
+/*******************************************************************//**
+Marks non-updated off-page fields as disowned by this record. The ownership
+must be transferred to the updated record which is inserted elsewhere in the
+index tree. In purge only the owner of externally stored field is allowed
+to free the field. */
+void
+btr_cur_disown_inherited_fields(
+/*============================*/
+ buf_block_t* block, /*!< in/out: index page */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull(2,3,4,5,6)));
+
+/** Operation code for btr_store_big_rec_extern_fields(). */
+enum blob_op {
+ /** Store off-page columns for a freshly inserted record */
+ BTR_STORE_INSERT = 0,
+ /** Store off-page columns for an insert by update */
+ BTR_STORE_INSERT_UPDATE,
+ /** Store off-page columns for an update */
+ BTR_STORE_UPDATE,
+ /** Store off-page columns for a freshly inserted record by bulk */
+ BTR_STORE_INSERT_BULK
+};
+
+/*******************************************************************//**
+Determine if an operation on off-page columns is an update.
+@return TRUE if op != BTR_STORE_INSERT */
+UNIV_INLINE
+ibool
+btr_blob_op_is_update(
+/*==================*/
+ enum blob_op op) /*!< in: operation */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from leaf node
+file segment of the index tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+btr_store_big_rec_extern_fields(
+/*============================*/
+ btr_pcur_t* pcur, /*!< in/out: a persistent cursor. if
+ btr_mtr is restarted, then this can
+ be repositioned. */
+ rec_offs* offsets, /*!< in/out: rec_get_offsets() on
+ pcur. the "external storage" flags
+ in offsets will correctly correspond
+ to rec when this function returns */
+ const big_rec_t*big_rec_vec, /*!< in: vector containing fields
+ to be stored externally */
+ mtr_t* btr_mtr, /*!< in/out: mtr containing the
+ latches to the clustered index. can be
+ committed and restarted. */
+	enum blob_op	op)		/*!< in: operation code */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management if the field in data is owned by the externally stored field;
+in a rollback we may have the additional condition that the field must
+not be inherited. */
+void
+btr_free_externally_stored_field(
+/*=============================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched; if the tree
+ height is 1, then also the root page
+ must be X-latched! (this is relevant
+ in the case this function is called
+ from purge where 'data' is located on
+ an undo log page, not an index
+ page) */
+ byte* field_ref, /*!< in/out: field reference */
+ const rec_t* rec, /*!< in: record containing field_ref, for
+ page_zip_write_blob_ptr(), or NULL */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index),
+ or NULL */
+ buf_block_t* block, /*!< in/out: page of field_ref */
+ ulint i, /*!< in: field number of field_ref;
+ ignored if rec == NULL */
+ bool rollback, /*!< in: performing rollback? */
+ mtr_t* local_mtr) /*!< in: mtr containing the latch */
+ MY_ATTRIBUTE((nonnull(1,2,5,8)));
+
+/** Copies the prefix of an externally stored field of a record.
+The clustered index record must be protected by a lock or a page latch.
+@param[out] buf the field, or a prefix of it
+@param[in] len length of buf, in bytes
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] data 'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in] local_len length of data, in bytes
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+ulint
+btr_copy_externally_stored_field_prefix(
+ byte* buf,
+ ulint len,
+ ulint zip_size,
+ const byte* data,
+ ulint local_len);
+
+/** Copies an externally stored field of a record to mem heap.
+The clustered index record must be protected by a lock or a page latch.
+@param[out] len length of the whole field
+@param[in] data 'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] local_len length of data
+@param[in,out] heap mem heap
+@return the whole field copied to heap */
+byte*
+btr_copy_externally_stored_field(
+ ulint* len,
+ const byte* data,
+ ulint zip_size,
+ ulint local_len,
+ mem_heap_t* heap);
+
+/** Copies an externally stored field of a record to mem heap.
+@param[in] rec record in a clustered index; must be
+protected by a lock or a page latch
+@param[in] offset array returned by rec_get_offsets()
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] no field number
+@param[out] len length of the field
+@param[in,out] heap mem heap
+@return the field copied to heap, or NULL if the field is incomplete */
+byte*
+btr_rec_copy_externally_stored_field(
+ const rec_t* rec,
+ const rec_offs* offsets,
+ ulint zip_size,
+ ulint no,
+ ulint* len,
+ mem_heap_t* heap);
+
+/** Latches the leaf page or pages requested.
+@param[in] block leaf page where the search converged
+@param[in] latch_mode BTR_SEARCH_LEAF, ...
+@param[in] cursor cursor
+@param[in] mtr mini-transaction
+@return blocks and savepoints which actually latched. */
+btr_latch_leaves_t
+btr_cur_latch_leaves(
+ buf_block_t* block,
+ ulint latch_mode,
+ btr_cur_t* cursor,
+ mtr_t* mtr);
+
+/*######################################################################*/
+
+/** In the pessimistic delete, if the page data size drops below this
+limit, merging it to a neighbor is tried */
+#define BTR_CUR_PAGE_COMPRESS_LIMIT(index) \
+ ((srv_page_size * (ulint)((index)->merge_threshold)) / 100)
+
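+/* For example, with the default srv_page_size of 16384 bytes and the
+default merge_threshold of 50, BTR_CUR_PAGE_COMPRESS_LIMIT evaluates to
+16384 * 50 / 100 = 8192 bytes. */
+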
+/** A slot in the path array. We store here info on a search path down the
+tree. Each slot contains data on a single level of the tree. */
+struct btr_path_t {
+ /* Assume a page like:
+ records: (inf, a, b, c, d, sup)
+ index of the record: 0, 1, 2, 3, 4, 5
+ */
+
+ /** Index of the record where the page cursor stopped on this level
+ (index in alphabetical order). Value ULINT_UNDEFINED denotes array
+ end. In the above example, if the search stopped on record 'c', then
+ nth_rec will be 3. */
+ ulint nth_rec;
+
+	/** Number of records on the page, not counting inf and sup.
+ In the above example n_recs will be 4. */
+ ulint n_recs;
+
+ /** Number of the page containing the record. */
+ uint32_t page_no;
+
+	/** Level of the page. If we later fetch the page under page_no
+	and it is on a different level, then we know that the tree has
+	been reorganized. */
+ ulint page_level;
+};
+
+#define BTR_PATH_ARRAY_N_SLOTS 250 /*!< size of path array (in slots) */
+
+/** Values for the flag documenting the used search method */
+enum btr_cur_method {
+ BTR_CUR_HASH = 1, /*!< successful shortcut using
+ the hash index */
+ BTR_CUR_HASH_FAIL, /*!< failure using hash, success using
+ binary search: the misleading hash
+ reference is stored in the field
+ hash_node, and might be necessary to
+ update */
+ BTR_CUR_BINARY, /*!< success using the binary search */
+ BTR_CUR_INSERT_TO_IBUF, /*!< performed the intended insert to
+ the insert buffer */
+ BTR_CUR_DEL_MARK_IBUF, /*!< performed the intended delete
+ mark in the insert/delete buffer */
+ BTR_CUR_DELETE_IBUF, /*!< performed the intended delete in
+ the insert/delete buffer */
+ BTR_CUR_DELETE_REF /*!< row_purge_poss_sec() failed */
+};
+
+/** The tree cursor: the definition appears here only for the compiler
+to know struct size! */
+struct btr_cur_t {
+ dict_index_t* index; /*!< index where positioned */
+ page_cur_t page_cur; /*!< page cursor */
+ purge_node_t* purge_node; /*!< purge node, for BTR_DELETE */
+ buf_block_t* left_block; /*!< this field is used to store
+ a pointer to the left neighbor
+ page, in the cases
+ BTR_SEARCH_PREV and
+ BTR_MODIFY_PREV */
+ /*------------------------------*/
+ que_thr_t* thr; /*!< this field is only used
+ when btr_cur_search_to_nth_level
+ is called for an index entry
+ insertion: the calling query
+ thread is passed here to be
+ used in the insert buffer */
+ /*------------------------------*/
+ /** The following fields are used in
+ btr_cur_search_to_nth_level to pass information: */
+ /* @{ */
+ enum btr_cur_method flag; /*!< Search method used */
+ ulint tree_height; /*!< Tree height if the search is done
+ for a pessimistic insert or update
+ operation */
+ ulint up_match; /*!< If the search mode was PAGE_CUR_LE,
+ the number of matched fields to the
+ the first user record to the right of
+ the cursor record after
+ btr_cur_search_to_nth_level;
+ for the mode PAGE_CUR_GE, the matched
+ fields to the first user record AT THE
+ CURSOR or to the right of it;
+ NOTE that the up_match and low_match
+ values may exceed the correct values
+ for comparison to the adjacent user
+ record if that record is on a
+ different leaf page! (See the note in
+ row_ins_duplicate_error_in_clust.) */
+ ulint up_bytes; /*!< number of matched bytes to the
+ right at the time cursor positioned;
+ only used internally in searches: not
+ defined after the search */
+ ulint low_match; /*!< if search mode was PAGE_CUR_LE,
+ the number of matched fields to the
+ first user record AT THE CURSOR or
+ to the left of it after
+ btr_cur_search_to_nth_level;
+ NOT defined for PAGE_CUR_GE or any
+ other search modes; see also the NOTE
+ in up_match! */
+ ulint low_bytes; /*!< number of matched bytes to the
+ left at the time cursor positioned;
+ only used internally in searches: not
+ defined after the search */
+ ulint n_fields; /*!< prefix length used in a hash
+ search if hash_node != NULL */
+ ulint n_bytes; /*!< hash prefix bytes if hash_node !=
+ NULL */
+ ulint fold; /*!< fold value used in the search if
+ flag is BTR_CUR_HASH */
+ /* @} */
+ btr_path_t* path_arr; /*!< in estimating the number of
+ rows in range, we store in this array
+ information of the path through
+ the tree */
+ rtr_info_t* rtr_info; /*!< rtree search info */
+ btr_cur_t():thr(NULL), rtr_info(NULL) {}
+ /* default values */
+ /** Zero-initialize all fields */
+ void init()
+ {
+ index = NULL;
+ memset(&page_cur, 0, sizeof page_cur);
+ purge_node = NULL;
+ left_block = NULL;
+ thr = NULL;
+ flag = btr_cur_method(0);
+ tree_height = 0;
+ up_match = 0;
+ up_bytes = 0;
+ low_match = 0;
+ low_bytes = 0;
+ n_fields = 0;
+ n_bytes = 0;
+ fold = 0;
+ path_arr = NULL;
+ rtr_info = NULL;
+ }
+};
+
+/** Modify the delete-mark flag of a record.
+@tparam flag the value of the delete-mark flag
+@param[in,out] block buffer block
+@param[in,out] rec record on a physical index page
+@param[in,out] mtr mini-transaction */
+template<bool flag>
+void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/** If a pessimistic delete fails because of a lack of file space, there
+is still a good chance of success a little later. Try this many
+times. */
+#define BTR_CUR_RETRY_DELETE_N_TIMES	100
+/** If a pessimistic delete fails because of a lack of file space, there
+is still a good chance of success a little later. Sleep this many
+microseconds between retries. */
+#define BTR_CUR_RETRY_SLEEP_TIME 50000
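+
+/* A sketch of the retry pattern these constants support; between attempts
+the caller must reposition the cursor and restart the mini-transaction
+(omitted here), as the row operation code does:
+
+	dberr_t	err = DB_SUCCESS;
+	for (ulint i = 0; i < BTR_CUR_RETRY_DELETE_N_TIMES; i++) {
+		btr_cur_pessimistic_delete(&err, FALSE, cursor, 0,
+					   false, mtr);
+		if (err != DB_OUT_OF_FILE_SPACE) {
+			break;
+		}
+		// reposition the cursor, restart the mtr, then retry
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+	}
+*/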
+
+/** The reference in a field for which data is stored on a different page.
+The reference is at the end of the 'locally' stored part of the field.
+'Locally' means storage in the index record.
+We store locally a long enough prefix of each column so that we can determine
+the ordering parts of each index record without looking into the externally
+stored part. */
+/*-------------------------------------- @{ */
+#define BTR_EXTERN_SPACE_ID 0U /*!< space id where stored */
+#define BTR_EXTERN_PAGE_NO 4U /*!< page no where stored */
+#define BTR_EXTERN_OFFSET 8U /*!< offset of BLOB header
+ on that page */
+#define BTR_EXTERN_LEN 12U /*!< 8 bytes containing the
+ length of the externally
+ stored part of the BLOB.
+ The 2 highest bits are
+ reserved to the flags below. */
+/*-------------------------------------- @} */
+/* #define BTR_EXTERN_FIELD_REF_SIZE 20 // moved to btr0types.h */
+
+/** The most significant bit of BTR_EXTERN_LEN (i.e., the most
+significant bit of the byte at smallest address) is set to 1 if this
+field does not 'own' the externally stored field; only the owner field
+is allowed to free the field in purge! */
+#define BTR_EXTERN_OWNER_FLAG 128U
+/** If the second most significant bit of BTR_EXTERN_LEN (i.e., the
+second most significant bit of the byte at smallest address) is 1 then
+it means that the externally stored field was inherited from an
+earlier version of the row. In rollback we are not allowed to free an
+inherited external field. */
+#define BTR_EXTERN_INHERITED_FLAG 64U
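+
+/* A sketch of decoding a BTR_EXTERN_FIELD_REF_SIZE-byte field reference at
+`ref` (e.g. obtained via btr_rec_get_field_ref()), using the byte readers
+from mach0data.h; the top byte of the length carries the two flags above:
+
+	uint32_t    space_id = mach_read_from_4(ref + BTR_EXTERN_SPACE_ID);
+	uint32_t    page_no = mach_read_from_4(ref + BTR_EXTERN_PAGE_NO);
+	uint32_t    offset = mach_read_from_4(ref + BTR_EXTERN_OFFSET);
+	ib_uint64_t len = mach_read_from_8(ref + BTR_EXTERN_LEN);
+	bool        owned = !(ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG);
+	len &= ~(ib_uint64_t(BTR_EXTERN_OWNER_FLAG | BTR_EXTERN_INHERITED_FLAG)
+		 << 56);
+*/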
+
+/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
+extern Atomic_counter<ulint> btr_cur_n_non_sea;
+/** Old value of btr_cur_n_non_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint btr_cur_n_non_sea_old;
+#ifdef BTR_CUR_HASH_ADAPT
+/** Number of successful adaptive hash index lookups in
+btr_cur_search_to_nth_level(). */
+extern ulint btr_cur_n_sea;
+/** Old value of btr_cur_n_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint btr_cur_n_sea_old;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#ifdef UNIV_DEBUG
+/* Flag to limit optimistic insert records */
+extern uint btr_cur_limit_optimistic_insert_debug;
+#endif /* UNIV_DEBUG */
+
+#include "btr0cur.ic"
+
+#endif
diff --git a/storage/innobase/include/btr0cur.ic b/storage/innobase/include/btr0cur.ic
new file mode 100644
index 00000000..8a45b714
--- /dev/null
+++ b/storage/innobase/include/btr0cur.ic
@@ -0,0 +1,211 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.ic
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0btr.h"
+
+#ifdef UNIV_DEBUG
+# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)\
+if (btr_cur_limit_optimistic_insert_debug > 1\
+ && (NREC) >= btr_cur_limit_optimistic_insert_debug) {\
+ CODE;\
+}
+#else
+# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the page cursor component of a tree cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_cur_get_page_cur(
+/*=================*/
+ const btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(&((btr_cur_t*) cursor)->page_cur);
+}
+
+/*********************************************************//**
+Returns the buffer block on which the tree cursor is positioned.
+@return pointer to buffer block */
+UNIV_INLINE
+buf_block_t*
+btr_cur_get_block(
+/*==============*/
+ const btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(page_cur_get_block(btr_cur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the record pointer of a tree cursor.
+@return pointer to record */
+UNIV_INLINE
+rec_t*
+btr_cur_get_rec(
+/*============*/
+ const btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(page_cur_get_rec(btr_cur_get_page_cur(cursor)));
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************//**
+Returns the compressed page on which the tree cursor is positioned.
+@return pointer to compressed page, or NULL if the page is not compressed */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+/*=================*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(buf_block_get_page_zip(btr_cur_get_block(cursor)));
+}
+
+/*********************************************************//**
+Returns the page of a tree cursor.
+@return pointer to page */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(page_align(page_cur_get_rec(&(cursor->page_cur))));
+}
+
+/*********************************************************//**
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+ dict_index_t* index, /*!< in: index */
+ rec_t* rec, /*!< in: record in tree */
+ buf_block_t* block, /*!< in: buffer block of rec */
+ btr_cur_t* cursor) /*!< out: cursor */
+{
+ ut_ad(page_align(rec) == block->frame);
+
+ page_cur_position(rec, block, btr_cur_get_page_cur(cursor));
+
+ cursor->index = index;
+}
+
+/*********************************************************************//**
+Checks if compressing an index page where a btr cursor is placed makes
+sense.
+@return TRUE if compression is recommended */
+UNIV_INLINE
+ibool
+btr_cur_compress_recommendation(
+/*============================*/
+ btr_cur_t* cursor, /*!< in: btr cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const page_t* page;
+
+ ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+ page = btr_cur_get_page(cursor);
+
+ LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page) * 2U,
+ return(FALSE));
+
+ if (!page_has_siblings(page)
+ || page_get_data_size(page)
+ < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)) {
+
+ /* The page fillfactor has dropped below a predefined
+ minimum value OR the level in the B-tree contains just
+ one page: we recommend compression if this is not the
+ root page. */
+
+ return cursor->index->page
+ != btr_cur_get_block(cursor)->page.id().page_no();
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Checks if the record on which the cursor is placed can be deleted without
+making tree compression necessary (or, recommended).
+@return TRUE if can be deleted without recommended compression */
+UNIV_INLINE
+ibool
+btr_cur_can_delete_without_compress(
+/*================================*/
+ btr_cur_t* cursor, /*!< in: btr cursor */
+ ulint rec_size,/*!< in: rec_get_size(btr_cur_get_rec(cursor))*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+
+ ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+ page = btr_cur_get_page(cursor);
+
+ if (!page_has_siblings(page) || page_get_n_recs(page) < 2
+ || page_get_data_size(page) - rec_size
+ < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)) {
+
+ /* The page fillfactor will drop below a predefined
+ minimum value, OR the level in the B-tree contains just
+ one page, OR the page will become empty: we recommend
+ compression if this is not the root page. */
+
+ return cursor->index->page
+ == btr_cur_get_block(cursor)->page.id().page_no();
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Determine if an operation on off-page columns is an update.
+@return TRUE if op != BTR_STORE_INSERT */
+UNIV_INLINE
+ibool
+btr_blob_op_is_update(
+/*==================*/
+ enum blob_op op) /*!< in: operation */
+{
+ switch (op) {
+ case BTR_STORE_INSERT:
+ case BTR_STORE_INSERT_BULK:
+ return(FALSE);
+ case BTR_STORE_INSERT_UPDATE:
+ case BTR_STORE_UPDATE:
+ return(TRUE);
+ }
+
+ ut_ad(0);
+ return(FALSE);
+}
diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h
new file mode 100644
index 00000000..a9212db0
--- /dev/null
+++ b/storage/innobase/include/btr0defragment.h
@@ -0,0 +1,75 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
+Copyright (C) 2014, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef btr0defragment_h
+#define btr0defragment_h
+
+#include "btr0pcur.h"
+
+/* Max number of pages to consider at once during defragmentation. */
+#define BTR_DEFRAGMENT_MAX_N_PAGES 32
+
+/** stats in btr_defragment */
+extern Atomic_counter<ulint> btr_defragment_compression_failures;
+extern Atomic_counter<ulint> btr_defragment_failures;
+extern Atomic_counter<ulint> btr_defragment_count;
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init(void);
+/******************************************************************//**
+Shutdown defragmentation. */
+void
+btr_defragment_shutdown();
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. */
+bool
+btr_defragment_find_index(
+ dict_index_t* index); /*!< Index to find. */
+/******************************************************************//**
+Add an index to btr_defragment_wq. Return a pointer to os_event if this
+is a synchronized defragmentation. */
+os_event_t
+btr_defragment_add_index(
+ dict_index_t* index, /*!< index to be added */
+ dberr_t* err); /*!< out: error code */
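+
+/* A sketch of a synchronous defragmentation request built on the call
+above; event lifecycle handling by the caller is elided:
+
+	dberr_t	err = DB_SUCCESS;
+	if (os_event_t event = btr_defragment_add_index(index, &err)) {
+		os_event_wait(event);	// block until the work item is done
+	}
+*/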
+/******************************************************************//**
+When a table is dropped, this function is called to mark the table as removed
+in btr_defragment_wq. The difference from btr_defragment_remove_index() is
+that this function does not clear the event. */
+void
+btr_defragment_remove_table(
+	dict_table_t*	table);	/*!< in: table to be removed */
+/******************************************************************//**
+Mark an index as removed from btr_defragment_wq. */
+void
+btr_defragment_remove_index(
+ dict_index_t* index); /*!< Index to be removed. */
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent
+storage. */
+UNIV_INTERN
+void
+btr_defragment_save_defrag_stats_if_needed(
+ dict_index_t* index); /*!< in: index */
+
+/* Stop defragmentation. */
+void btr_defragment_end();
+extern bool btr_defragment_active;
+#endif
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
new file mode 100644
index 00000000..7facea7b
--- /dev/null
+++ b/storage/innobase/include/btr0pcur.h
@@ -0,0 +1,546 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.h
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0pcur_h
+#define btr0pcur_h
+
+#include "dict0dict.h"
+#include "btr0cur.h"
+#include "buf0block_hint.h"
+#include "btr0btr.h"
+#include "gis0rtree.h"
+
+/* Relative positions for a stored cursor position */
+enum btr_pcur_pos_t {
+ BTR_PCUR_ON = 1,
+ BTR_PCUR_BEFORE = 2,
+ BTR_PCUR_AFTER = 3,
+/* Note that if the tree is not empty, btr_pcur_store_position does not
+use the following, but only uses the above three alternatives, where the
+position is stored relative to a specific record: this makes implementation
+of a scroll cursor easier */
+ BTR_PCUR_BEFORE_FIRST_IN_TREE = 4, /* in an empty tree */
+ BTR_PCUR_AFTER_LAST_IN_TREE = 5 /* in an empty tree */
+};
+
+/**************************************************************//**
+Allocates memory for a persistent cursor object and initializes the cursor.
+@return own: persistent cursor */
+btr_pcur_t*
+btr_pcur_create_for_mysql(void);
+/*============================*/
+
+/**************************************************************//**
+Resets a persistent cursor object, freeing ::old_rec_buf if it is
+allocated and resetting the other members to their initial values. */
+void
+btr_pcur_reset(
+/*===========*/
+ btr_pcur_t* cursor);/*!< in, out: persistent cursor */
+
+/**************************************************************//**
+Frees the memory for a persistent cursor object. */
+void
+btr_pcur_free_for_mysql(
+/*====================*/
+ btr_pcur_t* cursor); /*!< in, own: persistent cursor */
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+ btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the
+ position info */
+ btr_pcur_t* pcur_donate); /*!< in: pcur from which the info is
+ copied */
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+ btr_pcur_t* pcur); /*!< in: persistent cursor */
+
+/** Free old_rec_buf.
+@param[in] pcur Persistent cursor holding old_rec to be freed. */
+UNIV_INLINE
+void
+btr_pcur_free(
+ btr_pcur_t* pcur);
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+UNIV_INLINE
+dberr_t
+btr_pcur_open_low(
+/*==============*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level in the btree */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page from the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ ib_uint64_t autoinc,/*!< in: PAGE_ROOT_AUTO_INC to be written
+ (0 if none) */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open(i,t,md,l,c,m) \
+ btr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,0,m)
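+
+/* A minimal open/read/close sketch using the macro above; `index` and the
+search `tuple` are assumed to have been prepared by the caller:
+
+	mtr_t		mtr;
+	btr_pcur_t	pcur;
+	mtr.start();
+	btr_pcur_open(index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF,
+		      &pcur, &mtr);
+	if (btr_pcur_is_on_user_rec(&pcur)) {
+		const rec_t* rec = btr_pcur_get_rec(&pcur);
+		// use rec while the mtr still holds the page latch
+	}
+	btr_pcur_close(&pcur);
+	mtr.commit();
+*/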
+/**************************************************************//**
+Opens a persistent cursor to an index tree without initializing the
+cursor. */
+UNIV_INLINE
+dberr_t
+btr_pcur_open_with_no_init_func(
+/*============================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page of the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...;
+ NOTE that if ahi_latch then we might not
+ acquire a cursor page latch, but assume
+ that the ahi_latch protects the record! */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+#ifdef BTR_CUR_HASH_ADAPT
+ rw_lock_t* ahi_latch,
+ /*!< in: adaptive hash index latch held
+ by the caller, or NULL if none */
+#endif /* BTR_CUR_HASH_ADAPT */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#ifdef BTR_CUR_HASH_ADAPT
+# define btr_pcur_open_with_no_init(ix,t,md,l,cur,ahi,m) \
+ btr_pcur_open_with_no_init_func(ix,t,md,l,cur,ahi,__FILE__,__LINE__,m)
+#else /* BTR_CUR_HASH_ADAPT */
+# define btr_pcur_open_with_no_init(ix,t,md,l,cur,ahi,m) \
+ btr_pcur_open_with_no_init_func(ix,t,md,l,cur,__FILE__,__LINE__,m)
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/*****************************************************************//**
+Opens a persistent cursor at either end of an index. */
+UNIV_INLINE
+dberr_t
+btr_pcur_open_at_index_side(
+/*========================*/
+ bool from_left, /*!< in: true if open to the low end,
+ false if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_pcur_t* pcur, /*!< in/out: cursor */
+ bool init_pcur, /*!< in: whether to initialize pcur */
+ ulint level, /*!< in: level to search for
+ (0=leaf) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first
+user record satisfying the search condition, in the case PAGE_CUR_L or
+PAGE_CUR_LE, on the last user record. If no such user record exists, then
+in the first case sets the cursor after last in tree, and in the latter case
+before first in tree. The latching mode must be BTR_SEARCH_LEAF or
+BTR_MODIFY_LEAF. */
+void
+btr_pcur_open_on_user_rec_func(
+/*===========================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ... */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent
+ cursor */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_on_user_rec(i,t,md,l,c,m) \
+ btr_pcur_open_on_user_rec_func(i,t,md,l,c,__FILE__,__LINE__,m)
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree.
+@return true if the index is available and we have put the cursor, false
+if the index is unavailable */
+UNIV_INLINE
+bool
+btr_pcur_open_at_rnd_pos_func(
+/*==========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_at_rnd_pos(i,l,c,m) \
+ btr_pcur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
+/**************************************************************//**
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by committing the mini-transaction right after btr_pcur_close().
+A subsequent attempt to crawl the same page in the same mtr would cause
+an assertion failure. */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+ btr_pcur_t* cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+void
+btr_pcur_store_position(
+/*====================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Restores the stored position of a persistent cursor, buffer-fixing the page and
+obtaining the specified latches. If the cursor position was saved when the
+(1) cursor was positioned on a user record: this function restores the position
+to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the position to
+the last record LESS than the user record which was the successor of the page
+infimum;
+(3) cursor was positioned on the page supremum: restores to the first record
+GREATER than the user record which was the predecessor of the supremum.
+(4) cursor was positioned before the first or after the last in an empty tree:
+restores to before first or after the last in the tree.
+@return TRUE if the cursor position was stored when it was on a user
+record and it can be restored on a user record whose ordering fields
+are identical to the ones of the original user record */
+ibool
+btr_pcur_restore_position_func(
+/*===========================*/
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: detached persistent cursor */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_restore_position(l,cur,mtr) \
+ btr_pcur_restore_position_func(l,cur,__FILE__,__LINE__,mtr)
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+ btr_pcur_t* pcur, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr to commit */
+
+/** Commits the mtr and sets the clustered index pcur and secondary index
+pcur latch mode to BTR_NO_LATCHES, that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used for both cursor before
+calling this, if restoration of cursor is wanted later.
+@param[in] pcur persistent cursor
+@param[in] sec_pcur secondary index persistent cursor
+@param[in] mtr mtr to commit */
+UNIV_INLINE
+void
+btr_pcurs_commit_specify_mtr(
+ btr_pcur_t* pcur,
+ btr_pcur_t* sec_pcur,
+ mtr_t* mtr);
+
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
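+
+/* A sketch of a forward scan built on this; the pcur is assumed to have
+been opened with BTR_SEARCH_LEAF inside the running mtr:
+
+	while (btr_pcur_is_on_user_rec(&pcur)) {
+		const rec_t* rec = btr_pcur_get_rec(&pcur);
+		// process rec
+		if (!btr_pcur_move_to_next(&pcur, &mtr)) {
+			break;	// now after last in tree
+		}
+	}
+*/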
+/*********************************************************//**
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'.
+@return TRUE if the cursor was not before first in tree */
+ibool
+btr_pcur_move_to_prev(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the first record on the next page.
+Releases the latch on the current page, and buffer-unfixes it.
+Note that there must not be modifications on the current page,
+as then the x-latch can be released only in mtr_commit. */
+void
+btr_pcur_move_to_next_page(
+/*=======================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the
+ last record of the current page */
+ mtr_t* mtr); /*!< in: mtr */
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the btr cursor component of a persistent cursor.
+@return pointer to btr cursor component */
+UNIV_INLINE
+btr_cur_t*
+btr_pcur_get_btr_cur(
+/*=================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+/*********************************************************//**
+Returns the page cursor component of a persistent cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_pcur_get_page_cur(
+/*==================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+/*********************************************************//**
+Returns the page of a persistent cursor.
+@return pointer to the page */
+UNIV_INLINE
+page_t*
+btr_pcur_get_page(
+/*==============*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Returns the buffer block of a persistent cursor.
+@return pointer to the block */
+UNIV_INLINE
+buf_block_t*
+btr_pcur_get_block(
+/*===============*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Returns the record of a persistent cursor.
+@return pointer to the record */
+UNIV_INLINE
+rec_t*
+btr_pcur_get_rec(
+/*=============*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+#else /* UNIV_DEBUG */
+# define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur)
+# define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur)
+# define btr_pcur_get_page(cursor) ((cursor)->btr_cur.page_cur.block->frame)
+# define btr_pcur_get_block(cursor) ((cursor)->btr_cur.page_cur.block)
+# define btr_pcur_get_rec(cursor) ((cursor)->btr_cur.page_cur.rec)
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor);
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor);
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+ btr_pcur_t* cursor);/*!< in/out: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+ btr_pcur_t* cursor);/*!< in/out: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the infimum record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_before_first_on_page(
+/*===============================*/
+ btr_pcur_t* cursor); /*!< in/out: persistent cursor */
+
+/** Position state of persistent B-tree cursor. */
+enum pcur_pos_t {
+ /** The persistent cursor is not positioned. */
+ BTR_PCUR_NOT_POSITIONED = 0,
+ /** The persistent cursor was previously positioned.
+ TODO: currently, the state can be BTR_PCUR_IS_POSITIONED,
+ though it really should be BTR_PCUR_WAS_POSITIONED,
+ because we have no obligation to commit the cursor with
+ mtr; similarly latch_mode may be out of date. This can
+ lead to problems if btr_pcur is not used the right way;
+ all current code should be ok. */
+ BTR_PCUR_WAS_POSITIONED,
+ /** The persistent cursor is positioned by optimistic get to the same
+ record as it was positioned at. Not used for rel_pos == BTR_PCUR_ON.
+ It may need adjustment depending on previous/current search direction
+ and rel_pos. */
+ BTR_PCUR_IS_POSITIONED_OPTIMISTIC,
+ /** The persistent cursor is positioned by index search.
+ Or optimistic get for rel_pos == BTR_PCUR_ON. */
+ BTR_PCUR_IS_POSITIONED
+};
+
+/* The persistent B-tree cursor structure. This is used mainly for SQL
+selects, updates, and deletes. */
+
+struct btr_pcur_t{
+ /** a B-tree cursor */
+ btr_cur_t btr_cur;
+	/** see the TODO note in pcur_pos_t above!
+ BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, BTR_MODIFY_TREE or BTR_NO_LATCHES,
+ depending on the latching state of the page and tree where the cursor
+ is positioned; BTR_NO_LATCHES means that the cursor is not currently
+ positioned:
+ we say then that the cursor is detached; it can be restored to
+ attached if the old position was stored in old_rec */
+ ulint latch_mode;
+ /** true if old_rec is stored */
+ bool old_stored;
+ /** if cursor position is stored, contains an initial segment of the
+ latest record cursor was positioned either on, before or after */
+ rec_t* old_rec;
+ /** btr_cur.index->n_core_fields when old_rec was copied */
+ uint16 old_n_core_fields;
+ /** number of fields in old_rec */
+ uint16 old_n_fields;
+ /** BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on
+ whether cursor was on, before, or after the old_rec record */
+ enum btr_pcur_pos_t rel_pos;
+ /** buffer block when the position was stored */
+ buf::Block_hint block_when_stored;
+ /** the modify clock value of the buffer block when the cursor position
+ was stored */
+ ib_uint64_t modify_clock;
+ /** btr_pcur_store_position() and btr_pcur_restore_position() state. */
+ enum pcur_pos_t pos_state;
+ /** PAGE_CUR_G, ... */
+ page_cur_mode_t search_mode;
+ /** the transaction, if we know it; otherwise this field is not defined;
+ can ONLY BE USED in error prints in fatal assertion failures! */
+ trx_t* trx_if_known;
+ /*-----------------------------*/
+ /* NOTE that the following fields may possess dynamically allocated
+ memory which should be freed if not needed anymore! */
+
+ /** NULL, or a dynamically allocated buffer for old_rec */
+ byte* old_rec_buf;
+ /** old_rec_buf size if old_rec_buf is not NULL */
+ ulint buf_size;
+
+ btr_pcur_t() :
+ btr_cur(), latch_mode(RW_NO_LATCH),
+		old_stored(false), old_rec(NULL), old_n_core_fields(0),
+		old_n_fields(0), rel_pos(btr_pcur_pos_t(0)),
+ block_when_stored(),
+ modify_clock(0), pos_state(BTR_PCUR_NOT_POSITIONED),
+ search_mode(PAGE_CUR_UNSUPP), trx_if_known(NULL),
+ old_rec_buf(NULL), buf_size(0)
+ {
+ btr_cur.init();
+ }
+
+ /** Return the index of this persistent cursor */
+ dict_index_t* index() const { return(btr_cur.index); }
+};
+
+#include "btr0pcur.ic"
+
+#endif
diff --git a/storage/innobase/include/btr0pcur.ic b/storage/innobase/include/btr0pcur.ic
new file mode 100644
index 00000000..d93da475
--- /dev/null
+++ b/storage/innobase/include/btr0pcur.ic
@@ -0,0 +1,645 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.ic
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor);
+ ut_ad(cursor->old_rec);
+ ut_ad(cursor->old_stored);
+ ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED
+ || cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(cursor->rel_pos);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the btr cursor component of a persistent cursor.
+@return pointer to btr cursor component */
+UNIV_INLINE
+btr_cur_t*
+btr_pcur_get_btr_cur(
+/*=================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ const btr_cur_t* btr_cur = &cursor->btr_cur;
+ return((btr_cur_t*) btr_cur);
+}
+
+/*********************************************************//**
+Returns the page cursor component of a persistent cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_pcur_get_page_cur(
+/*==================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ return(btr_cur_get_page_cur(btr_pcur_get_btr_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the page of a persistent cursor.
+@return pointer to the page */
+UNIV_INLINE
+page_t*
+btr_pcur_get_page(
+/*==============*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(btr_cur_get_page(btr_pcur_get_btr_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the buffer block of a persistent cursor.
+@return pointer to the block */
+UNIV_INLINE
+buf_block_t*
+btr_pcur_get_block(
+/*===============*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(btr_cur_get_block(btr_pcur_get_btr_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the record of a persistent cursor.
+@return pointer to the record */
+UNIV_INLINE
+rec_t*
+btr_pcur_get_rec(
+/*=============*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(btr_cur_get_rec(btr_pcur_get_btr_cur(cursor)));
+}
+#endif /* UNIV_DEBUG */
+
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ const btr_cur_t* btr_cursor;
+
+ ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+ || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ ut_ad(btr_cursor->up_match != ULINT_UNDEFINED);
+
+ return(btr_cursor->up_match);
+}
+
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor if search mode was
+PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ const btr_cur_t* btr_cursor;
+
+ ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+ || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+ ut_ad(btr_cursor->low_match != ULINT_UNDEFINED);
+
+ return(btr_cursor->low_match);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (btr_pcur_is_before_first_on_page(cursor)
+ || btr_pcur_is_after_last_on_page(cursor)) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor)
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return !page_has_prev(btr_pcur_get_page(cursor))
+ && page_cur_is_before_first(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor)
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return !page_has_next(btr_pcur_get_page(cursor))
+ && page_cur_is_after_last(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+ btr_pcur_t* cursor) /*!< in/out: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = false;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+ btr_pcur_t* cursor) /*!< in/out: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = false;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ cursor->old_stored = false;
+loop:
+ if (btr_pcur_is_after_last_on_page(cursor)) {
+ if (btr_pcur_is_after_last_in_tree(cursor)) {
+ return(FALSE);
+ }
+
+ btr_pcur_move_to_next_page(cursor, mtr);
+ } else {
+ btr_pcur_move_to_next_on_page(cursor);
+ }
+
+ if (btr_pcur_is_on_user_rec(cursor)) {
+
+ return(TRUE);
+ }
+
+ goto loop;
+}
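+
+/* A minimal usage sketch (illustrative only; my_index is hypothetical
+and error handling is omitted): scanning all user records of an index
+at the leaf level with a persistent cursor.
+
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+
+	mtr.start();
+	btr_pcur_open_at_index_side(true, my_index, BTR_SEARCH_LEAF,
+				    &pcur, true, 0, &mtr);
+	while (btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
+		const rec_t*	rec = btr_pcur_get_rec(&pcur);
+		// process rec while the leaf page latch is held
+	}
+	btr_pcur_close(&pcur);
+	mtr.commit();
+*/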
+
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ cursor->old_stored = false;
+
+ if (btr_pcur_is_after_last_on_page(cursor)) {
+ if (btr_pcur_is_after_last_in_tree(cursor)) {
+ return(FALSE);
+ }
+
+ btr_pcur_move_to_next_page(cursor, mtr);
+ return(TRUE);
+ }
+
+ btr_pcur_move_to_next_on_page(cursor);
+ return(TRUE);
+}
+
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+ btr_pcur_t* pcur, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr to commit */
+{
+ ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ pcur->latch_mode = BTR_NO_LATCHES;
+
+ mtr_commit(mtr);
+
+ pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
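+
+/* Sketch of the detach/re-attach pattern this function supports
+(illustrative only; error handling omitted):
+
+	btr_pcur_store_position(&pcur, &mtr);
+	btr_pcur_commit_specify_mtr(&pcur, &mtr);
+	// ... do work that must not hold any page latches ...
+	mtr.start();
+	btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+*/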
+
+/** Commits the mtr and sets the clustered index pcur and secondary index
+pcur latch mode to BTR_NO_LATCHES, that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used on both cursors before
+calling this, if restoration of the cursors is wanted later.
+@param[in] pcur persistent cursor
+@param[in] sec_pcur secondary index persistent cursor
+@param[in] mtr mtr to commit */
+UNIV_INLINE
+void
+btr_pcurs_commit_specify_mtr(
+ btr_pcur_t* pcur,
+ btr_pcur_t* sec_pcur,
+ mtr_t* mtr)
+{
+ ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(sec_pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ pcur->latch_mode = BTR_NO_LATCHES;
+ sec_pcur->latch_mode = BTR_NO_LATCHES;
+
+ mtr_commit(mtr);
+
+ pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+ sec_pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/**************************************************************//**
+Resets a persistent cursor: clears old_stored, old_rec_buf, old_rec
+and the R-tree search info. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+ btr_pcur_t* pcur) /*!< in: persistent cursor */
+{
+ pcur->old_stored = false;
+ pcur->old_rec_buf = NULL;
+ pcur->old_rec = NULL;
+
+ pcur->btr_cur.rtr_info = NULL;
+}
+
+/** Free old_rec_buf.
+@param[in]	pcur	Persistent cursor whose old_rec_buf is to be freed. */
+UNIV_INLINE
+void
+btr_pcur_free(
+ btr_pcur_t* pcur)
+{
+ ut_free(pcur->old_rec_buf);
+}
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+UNIV_INLINE
+dberr_t
+btr_pcur_open_low(
+/*==============*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level in the btree */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page from the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ ib_uint64_t autoinc,/*!< in: PAGE_ROOT_AUTO_INC to be written
+ (0 if none) */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_cur_t* btr_cursor;
+ dberr_t err = DB_SUCCESS;
+
+ /* Initialize the cursor */
+
+ btr_pcur_init(cursor);
+
+ cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+ cursor->search_mode = mode;
+
+ /* Search with the tree cursor */
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ ut_ad(!dict_index_is_spatial(index));
+
+ err = btr_cur_search_to_nth_level_func(
+ index, level, tuple, mode, latch_mode, btr_cursor,
+#ifdef BTR_CUR_HASH_ADAPT
+ NULL,
+#endif /* BTR_CUR_HASH_ADAPT */
+ file, line, mtr, autoinc);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ ib::warn() << "btr_pcur_open_low"
+ << " level: " << level
+ << " called from file: "
+ << file << " line: " << line
+ << " table: " << index->table->name
+ << " index: " << index->name
+ << " error: " << err;
+ }
+
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ cursor->trx_if_known = NULL;
+
+ return(err);
+}
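+
+/* Sketch (illustrative only; tuple construction and error handling are
+omitted): positioning a cursor on the greatest record <= tuple at the
+leaf level, per the PAGE_CUR_LE note above.
+
+	dberr_t	err = btr_pcur_open_low(index, 0, tuple, PAGE_CUR_LE,
+					BTR_SEARCH_LEAF, &pcur,
+					__FILE__, __LINE__, 0, &mtr);
+*/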
+
+/**************************************************************//**
+Opens a persistent cursor to an index tree without initializing the
+cursor. */
+UNIV_INLINE
+dberr_t
+btr_pcur_open_with_no_init_func(
+/*============================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page of the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...;
+ NOTE that if ahi_latch then we might not
+ acquire a cursor page latch, but assume
+ that the ahi_latch protects the record! */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+#ifdef BTR_CUR_HASH_ADAPT
+ rw_lock_t* ahi_latch,
+ /*!< in: adaptive hash index latch held
+ by the caller, or NULL if none */
+#endif /* BTR_CUR_HASH_ADAPT */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_cur_t* btr_cursor;
+ dberr_t err = DB_SUCCESS;
+
+ cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode);
+ cursor->search_mode = mode;
+
+ /* Search with the tree cursor */
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ err = btr_cur_search_to_nth_level_func(
+ index, 0, tuple, mode, latch_mode, btr_cursor,
+#ifdef BTR_CUR_HASH_ADAPT
+ ahi_latch,
+#endif /* BTR_CUR_HASH_ADAPT */
+ file, line, mtr);
+
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ cursor->old_stored = false;
+
+ cursor->trx_if_known = NULL;
+ return err;
+}
+
+/*****************************************************************//**
+Opens a persistent cursor at either end of an index. */
+UNIV_INLINE
+dberr_t
+btr_pcur_open_at_index_side(
+/*========================*/
+ bool from_left, /*!< in: true if open to the low end,
+ false if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_pcur_t* pcur, /*!< in/out: cursor */
+ bool init_pcur, /*!< in: whether to initialize pcur */
+ ulint level, /*!< in: level to search for
+ (0=leaf) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dberr_t err = DB_SUCCESS;
+
+ pcur->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+
+ pcur->search_mode = from_left ? PAGE_CUR_G : PAGE_CUR_L;
+
+ if (init_pcur) {
+ btr_pcur_init(pcur);
+ }
+
+ err = btr_cur_open_at_index_side(
+ from_left, index, latch_mode,
+ btr_pcur_get_btr_cur(pcur), level, mtr);
+ pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ pcur->old_stored = false;
+
+ pcur->trx_if_known = NULL;
+
+ return (err);
+}
+
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree.
+@return true if the index is available and we have put the cursor, false
+if the index is unavailable */
+UNIV_INLINE
+bool
+btr_pcur_open_at_rnd_pos_func(
+/*==========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ /* Initialize the cursor */
+
+ cursor->latch_mode = latch_mode;
+ cursor->search_mode = PAGE_CUR_G;
+
+ btr_pcur_init(cursor);
+
+ bool available;
+
+ available = btr_cur_open_at_rnd_pos_func(index, latch_mode,
+ btr_pcur_get_btr_cur(cursor),
+ file, line, mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+ cursor->old_stored = false;
+
+ cursor->trx_if_known = NULL;
+
+ return(available);
+}
+
+/**************************************************************//**
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by committing the mini-transaction right after btr_pcur_close().
+A subsequent attempt to crawl the same page in the same mtr would cause
+an assertion failure. */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+ btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_free(cursor->old_rec_buf);
+
+ if (cursor->btr_cur.rtr_info) {
+ rtr_clean_rtr_info(cursor->btr_cur.rtr_info, true);
+ cursor->btr_cur.rtr_info = NULL;
+ }
+
+ cursor->old_rec = NULL;
+ cursor->old_rec_buf = NULL;
+ cursor->btr_cur.page_cur.rec = NULL;
+ cursor->btr_cur.page_cur.block = NULL;
+
+ cursor->old_stored = false;
+
+ cursor->latch_mode = BTR_NO_LATCHES;
+ cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+
+ cursor->trx_if_known = NULL;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the infimum record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_before_first_on_page(
+/*===============================*/
+ btr_pcur_t* cursor) /*!< in/out: persistent cursor */
+{
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_set_before_first(btr_pcur_get_block(cursor),
+ btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = false;
+}
diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h
new file mode 100644
index 00000000..1e6b667c
--- /dev/null
+++ b/storage/innobase/include/btr0sea.h
@@ -0,0 +1,392 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.h
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0sea_h
+#define btr0sea_h
+
+#include "dict0dict.h"
+#ifdef BTR_CUR_HASH_ADAPT
+#include "ha0ha.h"
+#include "sync0sync.h"
+
+#define btr_search_sys_create() btr_search_sys.create()
+#define btr_search_sys_free() btr_search_sys.free()
+
+/** Disable the adaptive hash search system and empty the index. */
+void btr_search_disable();
+
+/** Enable the adaptive hash search system.
+@param resize whether buf_pool_t::resize() is the caller */
+void btr_search_enable(bool resize= false);
+
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INLINE
+void
+btr_search_info_update(
+/*===================*/
+ dict_index_t* index, /*!< in: index of the cursor */
+ btr_cur_t* cursor);/*!< in: cursor which was just positioned */
+
+/** Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values.
+@param[in,out] index index
+@param[in,out] info index search info
+@param[in] tuple logical record
+@param[in] mode PAGE_CUR_L, ....
+@param[in]	latch_mode	BTR_SEARCH_LEAF, ...;
+		NOTE that only if ahi_latch is NULL will we
+		have a latch set on the cursor page; otherwise
+		we assume that the caller's ahi_latch
+		protects the record!
+@param[out] cursor tree cursor
+@param[in] ahi_latch the adaptive hash index latch being held,
+ or NULL
+@param[in]	mtr	mini-transaction
+@return whether the search succeeded */
+bool
+btr_search_guess_on_hash(
+ dict_index_t* index,
+ btr_search_t* info,
+ const dtuple_t* tuple,
+ ulint mode,
+ ulint latch_mode,
+ btr_cur_t* cursor,
+ rw_lock_t* ahi_latch,
+ mtr_t* mtr);
+
+/** Move or delete hash entries for moved records, usually in a page split.
+If new_block is already hashed, then any hash index for block is dropped.
+If new_block is not hashed, and block is hashed, then a new hash index is
+built to new_block with the same parameters as block.
+@param[in,out] new_block destination page
+@param[in,out] block source page (subject to deletion later) */
+void
+btr_search_move_or_delete_hash_entries(
+ buf_block_t* new_block,
+ buf_block_t* block);
+
+/** Drop any adaptive hash index entries that point to an index page.
+@param[in,out] block block containing index page, s- or x-latched, or an
+ index page for which we know that
+ block->buf_fix_count == 0 or it is an index page which
+ has already been removed from the buf_pool.page_hash
+ i.e.: it is in state BUF_BLOCK_REMOVE_HASH */
+void btr_search_drop_page_hash_index(buf_block_t* block);
+
+/** Drop possible adaptive hash index entries when a page is evicted
+from the buffer pool or freed in a file, or the index is being dropped.
+@param[in] page_id page id */
+void btr_search_drop_page_hash_when_freed(const page_id_t page_id);
+
+/** Updates the page hash index when a single record is inserted on a page.
+@param[in] cursor cursor which was positioned to the place to insert
+ using btr_cur_search_, and the new record has been
+ inserted next to the cursor.
+@param[in] ahi_latch the adaptive hash index latch */
+void
+btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch);
+
+/** Updates the page hash index when a single record is inserted on a page.
+@param[in,out] cursor cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor
+@param[in] ahi_latch the adaptive hash index latch */
+void
+btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch);
+
+/** Updates the page hash index when a single record is deleted from a page.
+@param[in] cursor cursor which was positioned on the record to delete
+ using btr_cur_search_, the record is not yet deleted.*/
+void btr_search_update_hash_on_delete(btr_cur_t* cursor);
+
+/** Validates the search system.
+@return true if ok */
+bool btr_search_validate();
+
+/** Lock all search latches in exclusive mode. */
+static inline void btr_search_x_lock_all();
+
+/** Unlock all search latches from exclusive mode. */
+static inline void btr_search_x_unlock_all();
+
+/** Lock all search latches in shared mode. */
+static inline void btr_search_s_lock_all();
+
+#ifdef UNIV_DEBUG
+/** Check if thread owns all the search latches.
+@param[in] mode lock mode check
+@retval true if owns all of them
+@retval false if does not own some of them */
+static inline bool btr_search_own_all(ulint mode);
+
+/** Check if thread owns any of the search latches.
+@param[in] mode lock mode check
+@retval true if owns any of them
+@retval false if owns no search latch */
+static inline bool btr_search_own_any(ulint mode);
+
+/** @return whether this thread holds any of the search latches */
+static inline bool btr_search_own_any();
+#endif /* UNIV_DEBUG */
+
+/** Unlock all search latches from shared mode. */
+static inline void btr_search_s_unlock_all();
+
+#else /* BTR_CUR_HASH_ADAPT */
+# define btr_search_sys_create()
+# define btr_search_sys_free()
+# define btr_search_drop_page_hash_index(block)
+# define btr_search_s_lock_all(index)
+# define btr_search_s_unlock_all(index)
+# define btr_search_info_update(index, cursor)
+# define btr_search_move_or_delete_hash_entries(new_block, block)
+# define btr_search_update_hash_on_insert(cursor, ahi_latch)
+# define btr_search_update_hash_on_delete(cursor)
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#ifdef BTR_CUR_ADAPT
+/** Create and initialize search info.
+@param[in,out] heap heap where created
+@return own: search info struct */
+static inline btr_search_t* btr_search_info_create(mem_heap_t* heap)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** @return the search info of an index */
+static inline btr_search_t* btr_search_get_info(dict_index_t* index)
+{
+ return(index->search_info);
+}
+#endif /* BTR_CUR_ADAPT */
+
+/** The search info struct in an index */
+struct btr_search_t{
+ /* @{ The following fields are not protected by any latch.
+ Unfortunately, this means that they must be aligned to
+ the machine word, i.e., they cannot be turned into bit-fields. */
+ buf_block_t* root_guess;/*!< the root page frame when it was last time
+ fetched, or NULL */
+#ifdef BTR_CUR_HASH_ADAPT
+ ulint hash_analysis; /*!< when this exceeds
+ BTR_SEARCH_HASH_ANALYSIS, the hash
+ analysis starts; this is reset if no
+ success noticed */
+ ibool last_hash_succ; /*!< TRUE if the last search would have
+ succeeded, or did succeed, using the hash
+ index; NOTE that the value here is not exact:
+ it is not calculated for every search, and the
+ calculation itself is not always accurate! */
+ ulint n_hash_potential;
+ /*!< number of consecutive searches
+ which would have succeeded, or did succeed,
+ using the hash index;
+ the range is 0 .. BTR_SEARCH_BUILD_LIMIT + 5 */
+ /* @} */
+ ulint ref_count; /*!< Number of blocks in this index tree
+ that have a search index built,
+ i.e. block->index points to this index.
+ Protected by the search latch except
+ during initialization in
+ btr_search_info_create(). */
+
+ /*---------------------- @{ */
+ uint16_t n_fields; /*!< recommended prefix length for hash search:
+ number of full fields */
+ uint16_t n_bytes; /*!< recommended prefix: number of bytes in
+ an incomplete field
+ @see BTR_PAGE_MAX_REC_SIZE */
+ bool left_side; /*!< true or false, depending on whether
+ the leftmost record of several records with
+ the same prefix should be indexed in the
+ hash index */
+ /*---------------------- @} */
+#ifdef UNIV_SEARCH_PERF_STAT
+ ulint n_hash_succ; /*!< number of successful hash searches thus
+ far */
+ ulint n_hash_fail; /*!< number of failed hash searches */
+ ulint n_patt_succ; /*!< number of successful pattern searches thus
+ far */
+ ulint n_searches; /*!< number of searches */
+#endif /* UNIV_SEARCH_PERF_STAT */
+#endif /* BTR_CUR_HASH_ADAPT */
+#ifdef UNIV_DEBUG
+ ulint magic_n; /*!< magic number @see BTR_SEARCH_MAGIC_N */
+/** value of btr_search_t::magic_n, used in assertions */
+# define BTR_SEARCH_MAGIC_N 1112765
+#endif /* UNIV_DEBUG */
+};
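+
+/* Illustrative reading of the recommended-prefix fields above: on an
+index of (a, b), n_fields == 1 && n_bytes == 2 recommends hashing on the
+full value of a plus the first 2 bytes of b. This is a heuristic
+recommendation only, not a guarantee of what gets hashed. */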
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** The hash index system */
+struct btr_search_sys_t
+{
+ /** Partition of the hash table */
+ struct partition
+ {
+ /** latches protecting hash_table */
+ rw_lock_t latch;
+ /** mapping of dtuple_fold() to rec_t* in buf_block_t::frame */
+ hash_table_t table;
+ /** memory heap for table */
+ mem_heap_t *heap;
+
+ char pad[(CPU_LEVEL1_DCACHE_LINESIZE - sizeof(rw_lock_t) -
+ sizeof(hash_table_t) - sizeof(mem_heap_t)) &
+ (CPU_LEVEL1_DCACHE_LINESIZE - 1)];
+
+ void init()
+ {
+ memset((void*) this, 0, sizeof *this);
+ rw_lock_create(btr_search_latch_key, &latch, SYNC_SEARCH_SYS);
+ }
+
+ void alloc(ulint hash_size)
+ {
+ table.create(hash_size);
+ heap= mem_heap_create_typed(std::min<ulong>(4096,
+ MEM_MAX_ALLOC_IN_BUF / 2
+ - MEM_BLOCK_HEADER_SIZE
+ - MEM_SPACE_NEEDED(0)),
+ MEM_HEAP_FOR_BTR_SEARCH);
+ }
+
+ void clear()
+ {
+ mem_heap_free(heap);
+ heap= nullptr;
+ ut_free(table.array);
+ }
+
+ void free()
+ {
+ rw_lock_free(&latch);
+ if (heap)
+ clear();
+ }
+ };
+
+ /** Partitions of the adaptive hash index */
+ partition *parts;
+
+ /** Get an adaptive hash index partition */
+ partition *get_part(index_id_t id, ulint space_id) const
+ {
+ return parts + ut_fold_ulint_pair(ulint(id), space_id) % btr_ahi_parts;
+ }
+
+ /** Get an adaptive hash index partition */
+ partition *get_part(const dict_index_t &index) const
+ {
+ ut_ad(!index.table->space ||
+ index.table->space->id == index.table->space_id);
+ return get_part(ulint(index.id), index.table->space_id);
+ }
+
+ /** Get the search latch for the adaptive hash index partition */
+ rw_lock_t *get_latch(const dict_index_t &index) const
+ { return &get_part(index)->latch; }
+
+ /** Create and initialize at startup */
+ void create()
+ {
+ parts= static_cast<partition*>(ut_malloc(btr_ahi_parts * sizeof *parts,
+ mem_key_ahi));
+ for (ulong i= 0; i < btr_ahi_parts; ++i)
+ parts[i].init();
+ if (btr_search_enabled)
+ btr_search_enable();
+ }
+
+ void alloc(ulint hash_size)
+ {
+ hash_size/= btr_ahi_parts;
+ for (ulong i= 0; i < btr_ahi_parts; ++i)
+ parts[i].alloc(hash_size);
+ }
+
+ /** Clear when disabling the adaptive hash index */
+ void clear() { for (ulong i= 0; i < btr_ahi_parts; ++i) parts[i].clear(); }
+
+ /** Free at shutdown */
+ void free()
+ {
+ if (parts)
+ {
+ for (ulong i= 0; i < btr_ahi_parts; ++i)
+ parts[i].free();
+ ut_free(parts);
+ parts= nullptr;
+ }
+ }
+};
+
+/** The adaptive hash index */
+extern btr_search_sys_t btr_search_sys;
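+
+/* Sketch (illustrative): a reader maps an index to its partition latch
+before consulting that partition's hash table, just as n_ahi_pages()
+below does.
+
+	rw_lock_t* latch = btr_search_sys.get_latch(*index);
+	rw_lock_s_lock(latch);
+	// ... look up in btr_search_sys.get_part(*index)->table ...
+	rw_lock_s_unlock(latch);
+*/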
+
+/** @return number of leaf pages pointed to by the adaptive hash index */
+inline ulint dict_index_t::n_ahi_pages() const
+{
+ if (!btr_search_enabled)
+ return 0;
+ rw_lock_t *latch = &btr_search_sys.get_part(*this)->latch;
+ rw_lock_s_lock(latch);
+ ulint ref_count= search_info->ref_count;
+ rw_lock_s_unlock(latch);
+ return ref_count;
+}
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+extern ulint btr_search_n_succ;
+/** Number of failed adaptive hash index lookups */
+extern ulint btr_search_n_hash_fail;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+/** After change in n_fields or n_bytes in info, this many rounds are waited
+before starting the hash analysis again: this is to save CPU time when there
+is no hope of building a hash index. */
+#define BTR_SEARCH_HASH_ANALYSIS 17
+
+/** Limit of consecutive searches for trying a search shortcut on the search
+pattern */
+#define BTR_SEARCH_ON_PATTERN_LIMIT 3
+
+/** Limit of consecutive searches for trying a search shortcut using
+the hash index */
+#define BTR_SEARCH_ON_HASH_LIMIT 3
+
+/** We do this many searches before trying to keep the search latch
+over calls from MySQL. If we notice someone waiting for the latch, we
+set this timeout again. This is to reduce contention. */
+#define BTR_SEA_TIMEOUT 10000
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#include "btr0sea.ic"
+
+#endif
diff --git a/storage/innobase/include/btr0sea.ic b/storage/innobase/include/btr0sea.ic
new file mode 100644
index 00000000..40eb5d86
--- /dev/null
+++ b/storage/innobase/include/btr0sea.ic
@@ -0,0 +1,160 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.ic
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "dict0mem.h"
+#include "btr0cur.h"
+#include "buf0buf.h"
+
+/** Create and initialize search info.
+@param[in,out] heap heap where created
+@return own: search info struct */
+static inline btr_search_t* btr_search_info_create(mem_heap_t* heap)
+{
+ btr_search_t* info = static_cast<btr_search_t*>(
+ mem_heap_zalloc(heap, sizeof(btr_search_t)));
+ ut_d(info->magic_n = BTR_SEARCH_MAGIC_N);
+#ifdef BTR_CUR_HASH_ADAPT
+ info->n_fields = 1;
+ info->left_side = TRUE;
+#endif /* BTR_CUR_HASH_ADAPT */
+ return(info);
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Updates the search info.
+@param[in,out] info search info
+@param[in,out] cursor cursor which was just positioned */
+void
+btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor);
+
+/*********************************************************************//**
+Updates the search info. */
+static inline
+void
+btr_search_info_update(
+/*===================*/
+ dict_index_t* index, /*!< in: index of the cursor */
+ btr_cur_t* cursor) /*!< in: cursor which was just positioned */
+{
+ ut_ad(!btr_search_own_any(RW_LOCK_S));
+ ut_ad(!btr_search_own_any(RW_LOCK_X));
+
+ if (dict_index_is_spatial(index) || !btr_search_enabled) {
+ return;
+ }
+
+ btr_search_t* info;
+ info = btr_search_get_info(index);
+
+ info->hash_analysis++;
+
+ if (info->hash_analysis < BTR_SEARCH_HASH_ANALYSIS) {
+
+ /* Do nothing */
+
+ return;
+
+ }
+
+ ut_ad(cursor->flag != BTR_CUR_HASH);
+
+ btr_search_info_update_slow(info, cursor);
+}
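+
+/* For example, with BTR_SEARCH_HASH_ANALYSIS == 17, the first 16 calls
+after the counter was reset return early above; the 17th call falls
+through to btr_search_info_update_slow(). */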
+
+/** Lock all search latches in exclusive mode. */
+static inline void btr_search_x_lock_all()
+{
+ for (ulint i = 0; i < btr_ahi_parts; ++i) {
+ rw_lock_x_lock(&btr_search_sys.parts[i].latch);
+ }
+}
+
+/** Unlock all search latches from exclusive mode. */
+static inline void btr_search_x_unlock_all()
+{
+ for (ulint i = 0; i < btr_ahi_parts; ++i) {
+ rw_lock_x_unlock(&btr_search_sys.parts[i].latch);
+ }
+}
+
+/** Lock all search latches in shared mode. */
+static inline void btr_search_s_lock_all()
+{
+ for (ulint i = 0; i < btr_ahi_parts; ++i) {
+ rw_lock_s_lock(&btr_search_sys.parts[i].latch);
+ }
+}
+
+/** Unlock all search latches from shared mode. */
+static inline void btr_search_s_unlock_all()
+{
+ for (ulint i = 0; i < btr_ahi_parts; ++i) {
+ rw_lock_s_unlock(&btr_search_sys.parts[i].latch);
+ }
+}
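+
+/* Sketch (illustrative): global operations, such as disabling the
+adaptive hash index, bracket their work with the all-partition latches.
+
+	btr_search_x_lock_all();
+	// ... e.g. empty every partition's hash table ...
+	btr_search_x_unlock_all();
+*/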
+
+#ifdef UNIV_DEBUG
+/** Check if thread owns all the search latches.
+@param[in] mode lock mode check
+@retval true if owns all of them
+@retval false if does not own some of them */
+static inline bool btr_search_own_all(ulint mode)
+{
+ for (ulint i = 0; i < btr_ahi_parts; ++i) {
+ if (!rw_lock_own(&btr_search_sys.parts[i].latch, mode)) {
+ return(false);
+ }
+ }
+ return(true);
+}
+
+/** Check if thread owns any of the search latches.
+@param[in] mode lock mode check
+@retval true if owns any of them
+@retval false if owns no search latch */
+static inline bool btr_search_own_any(ulint mode)
+{
+ for (ulint i = 0; i < btr_ahi_parts; ++i) {
+ if (rw_lock_own(&btr_search_sys.parts[i].latch, mode)) {
+ return(true);
+ }
+ }
+ return(false);
+}
+
+/** @return whether this thread holds any of the search latches */
+static inline bool btr_search_own_any()
+{
+ for (ulint i = btr_ahi_parts; i--; ) {
+ if (rw_lock_own_flagged(&btr_search_sys.parts[i].latch,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)) {
+ return true;
+ }
+ }
+ return false;
+}
+#endif /* UNIV_DEBUG */
+#endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h
new file mode 100644
index 00000000..83c374e2
--- /dev/null
+++ b/storage/innobase/include/btr0types.h
@@ -0,0 +1,59 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0types.h
+The index tree general types
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0types_h
+#define btr0types_h
+
+#include "page0types.h"
+#include "rem0types.h"
+
+/** Persistent cursor */
+struct btr_pcur_t;
+/** B-tree cursor */
+struct btr_cur_t;
+/** B-tree search information for the adaptive hash index */
+struct btr_search_t;
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Whether the adaptive hash index search system is enabled.
+The search system is protected by an array of latches. */
+extern char btr_search_enabled;
+
+/** Number of adaptive hash index partitions. */
+extern ulong btr_ahi_parts;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/** The size of a reference to data stored on a different page.
+The reference is stored at the end of the prefix of the field
+in the index record. */
+#define FIELD_REF_SIZE 20U
+#define BTR_EXTERN_FIELD_REF_SIZE FIELD_REF_SIZE
+
+/** If the data do not exceed this size, they are stored locally. */
+#define BTR_EXTERN_LOCAL_STORED_MAX_SIZE \
+ (BTR_EXTERN_FIELD_REF_SIZE * 2)
+
+#endif
diff --git a/storage/innobase/include/buf0block_hint.h b/storage/innobase/include/buf0block_hint.h
new file mode 100644
index 00000000..ee48e7ce
--- /dev/null
+++ b/storage/innobase/include/buf0block_hint.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+
+Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License, version 2.0, as published by the
+Free Software Foundation.
+
+This program is also distributed with certain software (including but not
+limited to OpenSSL) that is licensed under separate terms, as designated in a
+particular file or component or in included license documentation. The authors
+of MySQL hereby grant you an additional permission to link the program and
+your derivative works with the separately licensed software that they have
+included with MySQL.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
+for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+#pragma once
+#include "buf0buf.h"
+
+namespace buf {
+class Block_hint {
+public:
+ /** Stores the pointer to the block, which is currently buffer-fixed.
+ @param block a pointer to a buffer-fixed block to be stored */
+ inline void store(buf_block_t *block)
+ {
+ ut_ad(block->page.buf_fix_count());
+ m_block= block;
+ m_page_id= block->page.id();
+ }
+
+ /** Clears currently stored pointer. */
+ inline void clear() { m_block= nullptr; }
+
+ /** Invoke f on m_block (which may be null).
+ @param f The function to be executed. It will be passed the pointer.
+ If you wish to use the block pointer subsequently,
+ you need to ensure you buffer-fix it before returning from f.
+ @return the return value of f
+ */
+ template <typename F>
+ bool run_with_hint(const F &f)
+ {
+ buffer_fix_block_if_still_valid();
+ /* m_block could be changed during f() call, so we use local
+ variable to remember which block we need to unfix */
+ buf_block_t *block= m_block;
+ bool res= f(block);
+ if (block)
+ buf_block_buf_fix_dec(block);
+ return res;
+ }
+
+ buf_block_t *block() const { return m_block; }
+
+ private:
+ /** The block pointer stored by store(). */
+ buf_block_t *m_block= nullptr;
+ /** If m_block is non-null, the m_block->page.id at time it was stored. */
+ page_id_t m_page_id{0, 0};
+
+ /** A helper function which checks that m_block is not a dangling pointer
+ and still points to a block holding the page with m_page_id; if so, it
+ buffer-fixes the block, otherwise it clear()s the hint. */
+ void buffer_fix_block_if_still_valid();
+};
+} // namespace buf
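+
+/* Usage sketch (illustrative only; `block` is a buffer-fixed block the
+caller obtained earlier):
+
+	buf::Block_hint hint;
+	hint.store(block);
+	// ... later, after the block may have been evicted:
+	bool ok= hint.run_with_hint([](buf_block_t *b) {
+		// b is buffer-fixed for the duration of this call,
+		// or nullptr if the hint was no longer valid
+		return b != nullptr;
+	});
+*/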
diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h
new file mode 100644
index 00000000..cba31074
--- /dev/null
+++ b/storage/innobase/include/buf0buddy.h
@@ -0,0 +1,92 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buddy.h
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#ifndef buf0buddy_h
+#define buf0buddy_h
+
+#include "buf0types.h"
+
+/** Compute the zip_free[] index for a given block size.
+@param[in]	size	block size in bytes
+@return index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+inline
+ulint
+buf_buddy_get_slot(ulint size)
+{
+ ulint i;
+ ulint s;
+
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size <= srv_page_size);
+
+ for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) {
+ }
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ return i;
+}
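+
+/* For example, assuming BUF_BUDDY_LOW == 1024: buf_buddy_get_slot(1024)
+returns 0, buf_buddy_get_slot(2048) returns 1, and with a 16KiB page size
+buf_buddy_get_slot(16384) returns 4 == BUF_BUDDY_SIZES. */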
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param i index of buf_pool.zip_free[] or BUF_BUDDY_SIZES
+@param lru assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+byte *buf_buddy_alloc_low(ulint i, bool *lru) MY_ATTRIBUTE((malloc));
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param size compressed page size in bytes
+@param lru assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+inline byte *buf_buddy_alloc(ulint size, bool *lru= nullptr)
+{
+ return buf_buddy_alloc_low(buf_buddy_get_slot(size), lru);
+}
+
+/** Deallocate a block.
+@param[in] buf block to be freed, must not be pointed to
+ by the buffer pool
+@param[in] i index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+void buf_buddy_free_low(void* buf, ulint i);
+
+/** Deallocate a block.
+@param[in] buf block to be freed, must not be pointed to
+ by the buffer pool
+@param[in] size block size in bytes */
+inline void buf_buddy_free(void* buf, ulint size)
+{
+ buf_buddy_free_low(buf, buf_buddy_get_slot(size));
+}
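+
+/* Pairing sketch (illustrative; zip_size is hypothetical): a
+ROW_FORMAT=COMPRESSED frame is allocated and later freed with the same
+size class.
+
+	bool	lru = false;
+	byte*	frame = buf_buddy_alloc(zip_size, &lru);
+	// ... use frame as the compressed page image ...
+	buf_buddy_free(frame, zip_size);
+*/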
+
+/** Try to reallocate a block.
+@param[in] buf block to be reallocated, must be pointed
+to by the buffer pool
+@param[in] size block size, up to srv_page_size
+@retval false if failed because of no free blocks. */
+bool buf_buddy_realloc(void* buf, ulint size);
+
+/** Combine all pairs of free buddies. */
+void buf_buddy_condense_free();
+
+#endif /* buf0buddy_h */
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
new file mode 100644
index 00000000..5a118df4
--- /dev/null
+++ b/storage/innobase/include/buf0buf.h
@@ -0,0 +1,2456 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.h
+The database buffer pool high-level routines
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0buf_h
+#define buf0buf_h
+
+/** Magic value to use instead of checksums when they are disabled */
+#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
+
+#include "fil0fil.h"
+#include "mtr0types.h"
+#include "buf0types.h"
+#include "span.h"
+#include "assume_aligned.h"
+#ifndef UNIV_INNOCHECKSUM
+#include "hash0hash.h"
+#include "ut0byte.h"
+#include "page0types.h"
+#include "log0log.h"
+#include "srv0srv.h"
+#include <ostream>
+
+// Forward declaration
+struct fil_addr_t;
+
+/** @name Modes for buf_page_get_gen */
+/* @{ */
+#define BUF_GET 10 /*!< get always */
+#define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */
+#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make
+ the block young in the LRU list */
+#define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but
+ set no latch; we have
+ separated this case, because
+ it is error-prone programming
+ not to set a latch, and it
+ should be used with care */
+#define BUF_GET_IF_IN_POOL_OR_WATCH 15
+ /*!< Get the page only if it's in the
+ buffer pool, if not then set a watch
+ on the page. */
+#define BUF_GET_POSSIBLY_FREED 16
+ /*!< Like BUF_GET, but do not mind
+ if the file page has been freed. */
+#define BUF_EVICT_IF_IN_POOL 20 /*!< evict a clean block if found */
+/* @} */
+
+# ifdef UNIV_DEBUG
+extern my_bool buf_disable_resize_buffer_pool_debug; /*!< if TRUE, resizing
+ buffer pool is not allowed. */
+# endif /* UNIV_DEBUG */
+
+/** buf_page_t::state() values, distinguishing buf_page_t and buf_block_t */
+enum buf_page_state
+{
+ /** available in buf_pool.free or buf_pool.watch */
+ BUF_BLOCK_NOT_USED,
+ /** allocated for something else than a file page */
+ BUF_BLOCK_MEMORY,
+ /** a previously allocated file page, in transit to NOT_USED */
+ BUF_BLOCK_REMOVE_HASH,
+ /** a buf_block_t that is also in buf_pool.LRU */
+ BUF_BLOCK_FILE_PAGE,
+ /** the buf_page_t of a ROW_FORMAT=COMPRESSED page
+ whose uncompressed page frame has been evicted */
+ BUF_BLOCK_ZIP_PAGE
+};
+
+/** This structure defines information we will fetch from each buffer pool. It
+will be used to print table IO stats */
+struct buf_pool_info_t
+{
+ /* General buffer pool info */
+ ulint pool_size; /*!< Buffer Pool size in pages */
+ ulint lru_len; /*!< Length of buf_pool.LRU */
+ ulint old_lru_len; /*!< buf_pool.LRU_old_len */
+ ulint free_list_len; /*!< Length of buf_pool.free list */
+ ulint flush_list_len; /*!< Length of buf_pool.flush_list */
+ ulint n_pend_unzip; /*!< buf_pool.n_pend_unzip, pages
+ pending decompress */
+ ulint n_pend_reads; /*!< buf_pool.n_pend_reads, pages
+ pending read */
+ ulint n_pending_flush_lru; /*!< Pages pending flush in LRU */
+ ulint n_pending_flush_list; /*!< Pages pending flush in FLUSH
+ LIST */
+ ulint n_pages_made_young; /*!< number of pages made young */
+ ulint n_pages_not_made_young; /*!< number of pages not made young */
+ ulint n_pages_read; /*!< buf_pool.n_pages_read */
+ ulint n_pages_created; /*!< buf_pool.n_pages_created */
+ ulint n_pages_written; /*!< buf_pool.n_pages_written */
+ ulint n_page_gets; /*!< buf_pool.n_page_gets */
+ ulint n_ra_pages_read_rnd; /*!< buf_pool.n_ra_pages_read_rnd,
+ number of pages readahead */
+ ulint n_ra_pages_read; /*!< buf_pool.n_ra_pages_read, number
+ of pages readahead */
+ ulint n_ra_pages_evicted; /*!< buf_pool.n_ra_pages_evicted,
+ number of readahead pages evicted
+ without access */
+ ulint n_page_get_delta; /*!< num of buffer pool page gets since
+ last printout */
+
+ /* Buffer pool access stats */
+ double page_made_young_rate; /*!< page made young rate in pages
+ per second */
+ double page_not_made_young_rate;/*!< page not made young rate
+ in pages per second */
+ double pages_read_rate; /*!< num of pages read per second */
+ double pages_created_rate; /*!< num of pages create per second */
+ double pages_written_rate; /*!< num of pages written per second */
+ ulint page_read_delta; /*!< num of pages read since last
+ printout */
+ ulint young_making_delta; /*!< num of pages made young since
+ last printout */
+ ulint not_young_making_delta; /*!< num of pages not make young since
+ last printout */
+
+ /* Statistics about read ahead algorithm. */
+ double pages_readahead_rnd_rate;/*!< random readahead rate in pages per
+ second */
+ double pages_readahead_rate; /*!< readahead rate in pages per
+ second */
+ double pages_evicted_rate; /*!< rate of readahead page evicted
+ without access, in pages per second */
+
+ /* Stats about LRU eviction */
+ ulint unzip_lru_len; /*!< length of buf_pool.unzip_LRU
+ list */
+ /* Counters for LRU policy */
+ ulint io_sum; /*!< buf_LRU_stat_sum.io */
+ ulint io_cur; /*!< buf_LRU_stat_cur.io, num of IO
+ for current interval */
+ ulint unzip_sum; /*!< buf_LRU_stat_sum.unzip */
+ ulint unzip_cur; /*!< buf_LRU_stat_cur.unzip, num
+ pages decompressed in current
+ interval */
+};
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Print the given page_id_t object.
+@param[in,out] out the output stream
+@param[in] page_id the page_id_t object to be printed
+@return the output stream */
+std::ostream&
+operator<<(
+ std::ostream& out,
+ const page_id_t page_id);
+
+#ifndef UNIV_INNOCHECKSUM
+/*********************************************************************//**
+Gets the current size of buffer buf_pool in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+buf_pool_get_curr_size(void);
+/*========================*/
+
+/********************************************************************//**
+Allocates a buf_page_t descriptor. This function must succeed. In case
+of failure we assert in this function. */
+UNIV_INLINE
+buf_page_t*
+buf_page_alloc_descriptor(void)
+/*===========================*/
+ MY_ATTRIBUTE((malloc));
+/********************************************************************//**
+Free a buf_page_t descriptor. */
+UNIV_INLINE
+void
+buf_page_free_descriptor(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: bpage descriptor to free. */
+ MY_ATTRIBUTE((nonnull));
+
+/** Allocate a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+inline buf_block_t *buf_block_alloc();
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+ buf_block_t* block); /*!< in, own: block to be freed */
+
+/**************************************************************//**
+NOTE! The following macros should be used instead of buf_page_get_gen,
+to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed
+in LA! */
+#define buf_page_get(ID, SIZE, LA, MTR) \
+ buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, __FILE__, __LINE__, MTR)
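+
+/* Usage sketch (illustrative only; space_id and page_no are hypothetical
+and error handling is omitted):
+
+	mtr_t	mtr;
+	mtr.start();
+	buf_block_t*	block = buf_page_get(
+		page_id_t(space_id, page_no), 0, RW_S_LATCH, &mtr);
+	if (block) {
+		const page_t*	page = buf_block_get_frame(block);
+		// read the page contents while the s-latch is held
+	}
+	mtr.commit();
+*/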
+
+/**************************************************************//**
+Use these macros to bufferfix a page with no latching. Remember not to
+read the contents of the page unless you know it is safe. Do not modify
+the contents of the page! We have separated this case, because it is
+error-prone programming not to set a latch, and it should be used
+with care. */
+#define buf_page_get_with_no_latch(ID, SIZE, MTR) \
+ buf_page_get_gen(ID, SIZE, RW_NO_LATCH, NULL, BUF_GET_NO_LATCH, \
+ __FILE__, __LINE__, MTR)
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
+@return TRUE if success */
+ibool
+buf_page_optimistic_get(
+/*====================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: guessed block */
+ ib_uint64_t modify_clock,/*!< in: modify clock value */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mini-transaction */
+
+/** Given a tablespace id and page number, tries to get that page. If the
+page is not in the buffer pool, it is not loaded and NULL is returned.
+Suitable for use while holding lock_sys_t::mutex.
+@param[in] page_id page id
+@param[in] file file name
+@param[in] line line where called
+@param[in] mtr mini-transaction
+@return pointer to a page or NULL */
+buf_block_t*
+buf_page_try_get_func(
+ const page_id_t page_id,
+ const char* file,
+ unsigned line,
+ mtr_t* mtr);
+
+/** Tries to get a page.
+If the page is not in the buffer pool, it is not loaded. Suitable for use
+while holding lock_sys_t::mutex.
+@param[in] page_id page identifier
+@param[in] mtr mini-transaction
+@return the page if in buffer pool, NULL if not */
+#define buf_page_try_get(page_id, mtr) \
+ buf_page_try_get_func((page_id), __FILE__, __LINE__, mtr)
+
+/** Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with buf_page_release_zip().
+NOTE: the page is not protected by any latch. Mutual exclusion has to
+be implemented at a higher level. In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size
+@return pointer to the block */
+buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size);
+
+/** Get access to a database page. Buffered redo log may be applied.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in] guess guessed block or NULL
+@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in] file file name
+@param[in] line line where called
+@param[in] mtr mini-transaction
+@param[out] err DB_SUCCESS or error code
+@param[in] allow_ibuf_merge Allow change buffer merge while
+reading the pages from file.
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_gen(
+ const page_id_t page_id,
+ ulint zip_size,
+ ulint rw_latch,
+ buf_block_t* guess,
+ ulint mode,
+ const char* file,
+ unsigned line,
+ mtr_t* mtr,
+ dberr_t* err = NULL,
+ bool allow_ibuf_merge = false);
+
+/** This is the low level function used to get access to a database page.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in] guess guessed block or NULL
+@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in] file file name
+@param[in] line line where called
+@param[in] mtr mini-transaction
+@param[out] err DB_SUCCESS or error code
+@param[in]	allow_ibuf_merge	Allow change buffer merge while
+reading the page from file
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_low(
+ const page_id_t page_id,
+ ulint zip_size,
+ ulint rw_latch,
+ buf_block_t* guess,
+ ulint mode,
+ const char* file,
+ unsigned line,
+ mtr_t* mtr,
+ dberr_t* err,
+ bool allow_ibuf_merge);
+
+/** Initialize a page in the buffer pool. The page is usually not read
+from a file, even if it cannot be found in the buffer pool. This is one
+of the functions which perform a state transition NOT_USED => FILE_PAGE
+on a block (the other is buf_page_get_gen).
+@param[in,out] space space object
+@param[in] offset offset of the tablespace
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction
+@param[in,out] free_block pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create(fil_space_t *space, uint32_t offset,
+ ulint zip_size, mtr_t *mtr, buf_block_t *free_block);
+
+/********************************************************************//**
+Releases a compressed-only page acquired with buf_page_get_zip(). */
+UNIV_INLINE
+void
+buf_page_release_zip(
+/*=================*/
+ buf_page_t* bpage); /*!< in: buffer block */
+/********************************************************************//**
+Releases a latch, if specified. */
+UNIV_INLINE
+void
+buf_page_release_latch(
+/*=====================*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint rw_latch); /*!< in: RW_S_LATCH, RW_X_LATCH,
+ RW_NO_LATCH */
+/** Move a block to the start of the LRU list. */
+void buf_page_make_young(buf_page_t *bpage);
+/** Mark the page status as FREED for the given tablespace id and
+page number. If the page is not in buffer pool then ignore it.
+@param[in,out] space tablespace
+@param[in] page page number
+@param[in,out] mtr mini-transaction
+@param[in] file file name
+@param[in] line line where called */
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr,
+ const char *file, unsigned line);
+
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+unsigned
+buf_page_get_freed_page_clock(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: block */
+ MY_ATTRIBUTE((warn_unused_result));
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+unsigned
+buf_block_get_freed_page_clock(
+/*===========================*/
+ const buf_block_t* block) /*!< in: block */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Determine if a block is still close enough to the MRU end of the LRU list,
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+Note that this is for heuristics only and does not reserve the buffer pool
+mutex.
+@param[in] bpage buffer pool page
+@return whether bpage is close to MRU end of LRU */
+inline bool buf_page_peek_if_young(const buf_page_t *bpage);
+
+/** Determine if a block should be moved to the start of the LRU list if
+there is danger of dropping from the buffer pool.
+@param[in] bpage buffer pool page
+@return true if bpage should be made younger */
+inline bool buf_page_peek_if_too_old(const buf_page_t *bpage);
+
+/** Move a page to the start of the buffer pool LRU list if it is too old.
+@param[in,out] bpage buffer pool page */
+inline void buf_page_make_young_if_needed(buf_page_t *bpage)
+{
+ if (UNIV_UNLIKELY(buf_page_peek_if_too_old(bpage))) {
+ buf_page_make_young(bpage);
+ }
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must either
+(1) own the buf_pool.mutex while the block bufferfix count is zero, or
+(2) own an x-lock on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+ buf_block_t* block); /*!< in: block */
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+ buf_block_t* block); /*!< in: block */
+/*******************************************************************//**
+Increments the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_inc_func(
+/*=======================*/
+# ifdef UNIV_DEBUG
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line */
+# endif /* UNIV_DEBUG */
+ buf_block_t* block) /*!< in/out: block to bufferfix */
+ MY_ATTRIBUTE((nonnull));
+
+# ifdef UNIV_DEBUG
+/** Increments the bufferfix count.
+@param[in,out] b block to bufferfix
+@param[in] f file name where requested
+@param[in] l line number where requested */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b)
+# else /* UNIV_DEBUG */
+/** Increments the bufferfix count.
+@param[in,out] b block to bufferfix
+@param[in] f file name where requested
+@param[in] l line number where requested */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b)
+# endif /* UNIV_DEBUG */
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Check if a buffer is all zeroes.
+@param[in] buf data to check
+@return whether the buffer is all zeroes */
+bool buf_is_zeroes(st_::span<const byte> buf);
+
+/** Checks if the page is in crc32 checksum format.
+@param[in] read_buf database page
+@param[in] checksum_field1 new checksum field
+@param[in] checksum_field2 old checksum field
+@return true if the page is in crc32 checksum format. */
+bool
+buf_page_is_checksum_valid_crc32(
+ const byte* read_buf,
+ ulint checksum_field1,
+ ulint checksum_field2)
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Checks if the page is in innodb checksum format.
+@param[in] read_buf database page
+@param[in] checksum_field1 new checksum field
+@param[in] checksum_field2 old checksum field
+@return true if the page is in innodb checksum format. */
+bool
+buf_page_is_checksum_valid_innodb(
+ const byte* read_buf,
+ ulint checksum_field1,
+ ulint checksum_field2)
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Checks if the page is in the "none" checksum format.
+@param[in] read_buf database page
+@param[in] checksum_field1 new checksum field
+@param[in] checksum_field2 old checksum field
+@return true if the page is in the "none" checksum format. */
+bool
+buf_page_is_checksum_valid_none(
+ const byte* read_buf,
+ ulint checksum_field1,
+ ulint checksum_field2)
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Check if a page is corrupt.
+@param[in] check_lsn whether the LSN should be checked
+@param[in] read_buf database page
+@param[in] fsp_flags tablespace flags
+@return whether the page is corrupted */
+bool
+buf_page_is_corrupted(
+ bool check_lsn,
+ const byte* read_buf,
+ ulint fsp_flags)
+ MY_ATTRIBUTE((warn_unused_result));
+
+inline void *aligned_malloc(size_t size, size_t align)
+{
+#ifdef _MSC_VER
+ return _aligned_malloc(size, align);
+#else
+ void *result;
+ if (posix_memalign(&result, align, size))
+ result= NULL;
+ return result;
+#endif
+}
+
+inline void aligned_free(void *ptr)
+{
+#ifdef _MSC_VER
+ _aligned_free(ptr);
+#else
+ free(ptr);
+#endif
+}
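+
+/* Illustrative sketch: the two helpers above form a pair; memory
+returned by aligned_malloc() must be released with aligned_free(),
+because on Windows _aligned_malloc() memory cannot be passed to plain
+free(). For example, a page-aligned scratch buffer:
+
+  byte *frame= static_cast<byte*>(
+    aligned_malloc(srv_page_size, srv_page_size));
+  if (frame)
+  {
+    // ... use the srv_page_size-aligned buffer ...
+    aligned_free(frame);
+  }
+*/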
+
+/** Read the key version from the page. In the full_crc32 format,
+the key version is stored in bytes 0..3. In other formats, it is
+stored at byte offset 26 (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION).
+@param[in] read_buf database page
+@param[in] fsp_flags tablespace flags
+@return key version of the page. */
+inline uint32_t buf_page_get_key_version(const byte* read_buf, ulint fsp_flags)
+{
+ static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "compatibility");
+ return fil_space_t::full_crc32(fsp_flags)
+ ? mach_read_from_4(my_assume_aligned<4>(read_buf))
+ : mach_read_from_4(my_assume_aligned<2>
+ (read_buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION));
+}
+
+/** Read the compression info from the page. In the full_crc32 format,
+the compression flag is the most significant bit of the page type.
+In other formats, a compressed page is indicated by the page type
+FIL_PAGE_PAGE_COMPRESSED.
+@param[in] read_buf database page
+@param[in] fsp_flags tablespace flags
+@return true if page is compressed. */
+inline bool buf_page_is_compressed(const byte* read_buf, ulint fsp_flags)
+{
+ uint16_t page_type= fil_page_get_type(read_buf);
+ return fil_space_t::full_crc32(fsp_flags)
+ ? !!(page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)
+ : page_type == FIL_PAGE_PAGE_COMPRESSED;
+}
+
+/** Get the compressed or uncompressed size of a full_crc32 page.
+@param[in] buf page_compressed or uncompressed page
+@param[out] comp whether the page could be compressed
+@param[out] cr whether the page could be corrupted
+@return the payload size in the file page */
+inline uint buf_page_full_crc32_size(const byte* buf, bool* comp, bool* cr)
+{
+ uint t = fil_page_get_type(buf);
+ uint page_size = uint(srv_page_size);
+
+ if (!(t & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)) {
+ return page_size;
+ }
+
+ t &= ~(1U << FIL_PAGE_COMPRESS_FCRC32_MARKER);
+ t <<= 8;
+
+ if (t < page_size) {
+ page_size = t;
+ if (comp) {
+ *comp = true;
+ }
+ } else if (cr) {
+ *cr = true;
+ }
+
+ return page_size;
+}
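+
+/* Worked example (assuming FIL_PAGE_COMPRESS_FCRC32_MARKER denotes the
+most significant bit of the 16-bit page type and srv_page_size == 16384):
+a stored page type of 0x8010 has the marker set; clearing it yields
+0x0010, and 0x0010 << 8 == 4096 < 16384, so the function returns a
+4096-byte payload and sets *comp. Had the shifted value been >=
+srv_page_size, *cr would have been set instead. */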
+
+#ifndef UNIV_INNOCHECKSUM
+/** Dump a page to stderr.
+@param[in] read_buf database page
+@param[in] zip_size compressed page size, or 0 */
+void buf_page_print(const byte* read_buf, ulint zip_size = 0)
+ ATTRIBUTE_COLD __attribute__((nonnull));
+/********************************************************************//**
+Decompress a block.
+@return TRUE if successful */
+ibool
+buf_zip_decompress(
+/*===============*/
+ buf_block_t* block, /*!< in/out: block */
+ ibool check); /*!< in: TRUE=verify the page checksum */
+
+#ifdef UNIV_DEBUG
+/** @return the number of latched pages in the buffer pool */
+ulint buf_get_latched_pages_number();
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+void
+buf_print_io(
+/*=========*/
+ FILE* file); /*!< in: file where to print */
+/** Collect buffer pool metadata.
+@param[out] pool_info buffer pool metadata */
+void buf_stats_get_pool_info(buf_pool_info_t *pool_info);
+
+/** Refresh the statistics used to print per-second averages. */
+void buf_refresh_io_stats();
+
+/** Invalidate all pages in the buffer pool.
+All pages must be in a replaceable state (not modified or latched). */
+void buf_pool_invalidate();
+
+/*========================================================================
+--------------------------- LOWER LEVEL ROUTINES -------------------------
+=========================================================================*/
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. */
+UNIV_INLINE
+void
+buf_block_dbg_add_level(
+/*====================*/
+ buf_block_t* block, /*!< in: buffer page
+ where we have acquired latch */
+ latch_level_t level); /*!< in: latching order level */
+#else /* UNIV_DEBUG */
+# define buf_block_dbg_add_level(block, level) /* nothing */
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets a pointer to the memory frame of a block.
+@return pointer to the frame */
+UNIV_INLINE
+buf_frame_t*
+buf_block_get_frame(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ MY_ATTRIBUTE((warn_unused_result));
+#else /* UNIV_DEBUG */
+# define buf_block_get_frame(block) (block)->frame
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable. */
+#define buf_block_get_page_zip(block) \
+ (UNIV_LIKELY_NULL((block)->page.zip.data) ? &(block)->page.zip : NULL)
+#define is_buf_block_get_page_zip(block) \
+ UNIV_LIKELY_NULL((block)->page.zip.data)
+
+/** Monitor the buffer page read/write activity, and increment corresponding
+counter value in MONITOR_MODULE_BUF_PAGE.
+@param bpage buffer page whose read or write was completed
+@param io_type BUF_IO_READ or BUF_IO_WRITE */
+ATTRIBUTE_COLD __attribute__((nonnull))
+void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type);
+
+/** Complete a read request of a file page to buf_pool.
+@param bpage recently read page
+@param node data file
+@return whether the operation succeeded
+@retval DB_SUCCESS if the read page was OK
+@retval DB_PAGE_CORRUPTED if the checksum fails on a page read
+@retval DB_DECRYPTION_FAILED if the page cannot be decrypted */
+dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node);
+
+/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit,
+if needed.
+@param[in] size size in bytes
+@return aligned size */
+UNIV_INLINE
+ulint
+buf_pool_size_align(
+ ulint size);
+
+/** Verify that the post-encryption checksum matches the calculated checksum.
+This function should be called only if the tablespace contains crypt data metadata.
+@param[in] page page frame
+@param[in] fsp_flags tablespace flags
+@return true if page is encrypted and OK, false otherwise */
+bool buf_page_verify_crypt_checksum(
+ const byte* page,
+ ulint fsp_flags);
+
+/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
+@param[in,out] page page to update
+@param[in] size compressed page size */
+void buf_flush_update_zip_checksum(buf_frame_t* page, ulint size);
+
+/** @brief The temporary memory structure.
+
+NOTE! The definition appears here only for other modules of this
+directory (buf) to see it. Do not use from outside! */
+
+class buf_tmp_buffer_t
+{
+ /** whether this slot is reserved */
+ std::atomic<bool> reserved;
+public:
+  /** For encryption, the data needs to be copied to a separate buffer
+  before it is encrypted and written. The buffer block itself can be
+  replaced while a write of crypt_buf to file is in progress. */
+ byte *crypt_buf;
+ /** buffer for fil_page_compress(), for flushing page_compressed pages */
+ byte *comp_buf;
+ /** pointer to resulting buffer after encryption or compression;
+ not separately allocated memory */
+ byte *out_buf;
+
+ /** Release the slot */
+ void release() { reserved.store(false, std::memory_order_relaxed); }
+
+ /** Acquire the slot
+ @return whether the slot was acquired */
+ bool acquire() { return !reserved.exchange(true, std::memory_order_relaxed);}
+
+ /** Allocate a buffer for encryption, decryption or decompression. */
+ void allocate()
+ {
+ if (!crypt_buf)
+ crypt_buf= static_cast<byte*>
+ (aligned_malloc(srv_page_size, srv_page_size));
+ }
+};
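+
+/* Illustrative sketch of the intended slot life cycle (see also
+buf_pool_t::io_buf_reserve() declared further below): a slot is claimed
+with acquire(), its buffer is allocated lazily, and the slot is
+returned with release() once the I/O has completed.
+
+  if (buf_tmp_buffer_t *slot= buf_pool.io_buf_reserve())
+  {
+    slot->allocate(); // ensure crypt_buf exists
+    // ... encrypt into slot->crypt_buf and write it out ...
+    slot->release();  // make the slot reusable
+  }
+*/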
+
+/** The common buffer control block structure
+for compressed and uncompressed frames */
+
+class buf_pool_t;
+
+class buf_page_t
+{
+ friend buf_pool_t;
+ friend buf_block_t;
+ /** @name General fields */
+ /* @{ */
+
+public: // FIXME: fix fil_iterate()
+ /** Page id. Protected by buf_pool.hash_lock_get(id) when
+ the page is in buf_pool.page_hash. */
+ page_id_t id_;
+private:
+  /** Count of how many times this block is currently bufferfixed. */
+ Atomic_counter<uint32_t> buf_fix_count_;
+
+ /** log sequence number of the START of the log entry written of the
+ oldest modification to this block which has not yet been written
+ to the data file;
+
+ 0 if no modifications are pending;
+ 1 if no modifications are pending, but the block is in buf_pool.flush_list;
+ 2 if modifications are pending, but the block is not in buf_pool.flush_list
+ (because id().space() is the temporary tablespace). */
+ Atomic_relaxed<lsn_t> oldest_modification_;
+
+ /** type of pending I/O operation; protected by buf_pool.mutex
+ if in_LRU_list */
+ Atomic_relaxed<buf_io_fix> io_fix_;
+ /** Block state. @see in_file().
+ State transitions between in_file() states and to
+ BUF_BLOCK_REMOVE_HASH are protected by buf_pool.hash_lock_get(id)
+ when the block is in buf_pool.page_hash.
+ Other transitions when in_LRU_list are protected by buf_pool.mutex. */
+ buf_page_state state_;
+
+public:
+ /** buf_pool.page_hash link; protected by buf_pool.hash_lock_get(id) */
+ buf_page_t *hash;
+ /* @} */
+ page_zip_des_t zip; /*!< compressed page; zip.data
+ (but not the data it points to) is
+ also protected by buf_pool.mutex;
+ state == BUF_BLOCK_ZIP_PAGE and
+ zip.data == NULL means an active
+ buf_pool.watch */
+
+ buf_tmp_buffer_t* slot; /*!< Slot for temporary memory
+ used for encryption/compression
+ or NULL */
+#ifdef UNIV_DEBUG
+ /** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */
+ bool in_zip_hash;
+ /** whether this->LRU is in buf_pool.LRU (in_file() holds);
+ protected by buf_pool.mutex */
+ bool in_LRU_list;
+ /** whether this is in buf_pool.page_hash (in_file() holds);
+ protected by buf_pool.mutex */
+ bool in_page_hash;
+ /** whether this->list is in buf_pool.free (state() == BUF_BLOCK_NOT_USED);
+ protected by buf_pool.flush_list_mutex */
+ bool in_free_list;
+#endif /* UNIV_DEBUG */
+ /** list member in one of the lists of buf_pool; protected by
+ buf_pool.mutex or buf_pool.flush_list_mutex
+
+ state() == BUF_BLOCK_NOT_USED: buf_pool.free or buf_pool.withdraw
+
+ in_file() && oldest_modification():
+ buf_pool.flush_list (protected by buf_pool.flush_list_mutex)
+
+ The contents is undefined if in_file() && !oldest_modification(),
+ or if state() is BUF_BLOCK_MEMORY or BUF_BLOCK_REMOVE_HASH. */
+ UT_LIST_NODE_T(buf_page_t) list;
+
+ /** @name LRU replacement algorithm fields.
+ Protected by buf_pool.mutex. */
+ /* @{ */
+
+ UT_LIST_NODE_T(buf_page_t) LRU;
+ /*!< node of the LRU list */
+ unsigned old:1; /*!< TRUE if the block is in the old
+ blocks in buf_pool.LRU_old */
+ unsigned freed_page_clock:31;/*!< the value of
+ buf_pool.freed_page_clock
+ when this block was last
+ placed at the head of the
+ LRU list; a thread is allowed
+ to read this for heuristic
+ purposes without holding any
+ mutex or latch */
+ /* @} */
+ Atomic_counter<unsigned> access_time; /*!< time of first access, or
+ 0 if the block was never accessed
+ in the buffer pool.
+
+ For state==BUF_BLOCK_MEMORY
+ blocks, this field can be repurposed
+ for something else.
+
+ When this field counts log records
+ and bytes allocated for recv_sys.pages,
+ the field is protected by
+ recv_sys_t::mutex. */
+ /** Change buffer entries for the page exist.
+ Protected by io_fix()==BUF_IO_READ or by buf_block_t::lock. */
+ bool ibuf_exist;
+
+ /** Block initialization status. Can be modified while holding io_fix()
+ or buf_block_t::lock X-latch */
+ enum {
+ /** the page was read normally and should be flushed normally */
+ NORMAL = 0,
+ /** the page was (re)initialized, and the doublewrite buffer can be
+ skipped on the next flush */
+ INIT_ON_FLUSH,
+  /** the page was freed and needs to be flushed.
+  For page_compressed, the page flush will punch a hole to free space.
+  Else, if innodb_immediate_scrub_data_uncompressed is set, the page
+  will be overwritten with zeroes. */
+ FREED
+ } status;
+
+ buf_page_t() : id_(0)
+ {
+ static_assert(BUF_BLOCK_NOT_USED == 0, "compatibility");
+ memset((void*) this, 0, sizeof *this);
+ }
+
+ /** Initialize some fields */
+ void init()
+ {
+ io_fix_= BUF_IO_NONE;
+ buf_fix_count_= 0;
+ old= 0;
+ freed_page_clock= 0;
+ access_time= 0;
+ oldest_modification_= 0;
+ slot= nullptr;
+ ibuf_exist= false;
+ status= NORMAL;
+ ut_d(in_zip_hash= false);
+ ut_d(in_free_list= false);
+ ut_d(in_LRU_list= false);
+ ut_d(in_page_hash= false);
+ HASH_INVALIDATE(this, hash);
+ }
+
+ /** Initialize some more fields */
+ void init(buf_page_state state, page_id_t id, uint32_t buf_fix_count= 0)
+ {
+ init();
+ state_= state;
+ id_= id;
+ buf_fix_count_= buf_fix_count;
+ }
+
+ /** Initialize some more fields */
+ void init(page_id_t id, uint32_t buf_fix_count= 0)
+ {
+ init();
+ id_= id;
+ buf_fix_count_= buf_fix_count;
+ }
+
+public:
+ const page_id_t &id() const { return id_; }
+ buf_page_state state() const { return state_; }
+ uint32_t buf_fix_count() const { return buf_fix_count_; }
+ buf_io_fix io_fix() const { return io_fix_; }
+ void io_unfix()
+ {
+ ut_d(const auto old_io_fix= io_fix());
+ ut_ad(old_io_fix == BUF_IO_READ || old_io_fix == BUF_IO_PIN);
+ io_fix_= BUF_IO_NONE;
+ }
+
+  /** @return whether this belongs to buf_pool.unzip_LRU */
+ bool belongs_to_unzip_LRU() const
+ {
+ return zip.data && state() != BUF_BLOCK_ZIP_PAGE;
+ }
+
+ inline void add_buf_fix_count(uint32_t count);
+ inline void set_buf_fix_count(uint32_t count);
+ inline void set_state(buf_page_state state);
+ inline void set_io_fix(buf_io_fix io_fix);
+ inline void set_corrupt_id();
+
+ /** @return the log sequence number of the oldest pending modification
+ @retval 0 if the block is being removed from (or not in) buf_pool.flush_list
+ @retval 1 if the block is in buf_pool.flush_list but not modified
+ @retval 2 if the block belongs to the temporary tablespace and
+ has unwritten changes */
+ lsn_t oldest_modification() const { return oldest_modification_; }
+  /** @return the log sequence number of the oldest pending modification
+ @retval 0 if the block is definitely not in buf_pool.flush_list
+ @retval 1 if the block is in buf_pool.flush_list but not modified
+ @retval 2 if the block belongs to the temporary tablespace and
+ has unwritten changes */
+ lsn_t oldest_modification_acquire() const
+ { return oldest_modification_.load(std::memory_order_acquire); }
+ /** Set oldest_modification when adding to buf_pool.flush_list */
+ inline void set_oldest_modification(lsn_t lsn);
+ /** Clear oldest_modification after removing from buf_pool.flush_list */
+ inline void clear_oldest_modification();
+ /** Note that a block is no longer dirty, while not removing
+ it from buf_pool.flush_list */
+ inline void clear_oldest_modification(bool temporary);
+
+ /** Notify that a page in a temporary tablespace has been modified. */
+ void set_temp_modified()
+ {
+ ut_ad(fsp_is_system_temporary(id().space()));
+ ut_ad(state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(!oldest_modification());
+ oldest_modification_= 2;
+ }
+
+ /** Prepare to release a file page to buf_pool.free. */
+ void free_file_page()
+ {
+ ut_ad(state() == BUF_BLOCK_REMOVE_HASH);
+ /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
+ ut_d(oldest_modification_= 0;)
+ set_corrupt_id();
+ ut_d(set_state(BUF_BLOCK_MEMORY));
+ }
+
+ void fix() { buf_fix_count_++; }
+ uint32_t unfix()
+ {
+ uint32_t count= buf_fix_count_--;
+ ut_ad(count != 0);
+ return count - 1;
+ }
+
+ /** @return the physical size, in bytes */
+ ulint physical_size() const
+ {
+ return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : srv_page_size;
+ }
+
+ /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes
+ @retval 0 if not compressed */
+ ulint zip_size() const
+ {
+ return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0;
+ }
+
+ /** @return the byte offset of the page within a file */
+ os_offset_t physical_offset() const
+ {
+ os_offset_t o= id().page_no();
+ return zip.ssize
+ ? o << (zip.ssize + (UNIV_ZIP_SIZE_SHIFT_MIN - 1))
+ : o << srv_page_size_shift;
+ }
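+
+  /* Worked example (assuming UNIV_ZIP_SIZE_MIN == 1024 and
+  UNIV_ZIP_SIZE_SHIFT_MIN == 10): with zip.ssize == 4, zip_size() is
+  (1024 >> 1) << 4 == 8192 bytes, and page number 3 maps to byte
+  offset 3 << (4 + 9) == 24576 == 3 * 8192. */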
+
+ /** @return whether the block is mapped to a data file */
+ bool in_file() const
+ {
+ switch (state_) {
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_FILE_PAGE:
+ return true;
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ return false;
+ }
+
+ ut_error;
+ return false;
+ }
+
+ /** @return whether the block is modified and ready for flushing */
+ inline bool ready_for_flush() const;
+ /** @return whether the state can be changed to BUF_BLOCK_NOT_USED */
+ bool ready_for_replace() const
+ { return !oldest_modification() && can_relocate(); }
+ /** @return whether the block can be relocated in memory.
+ The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
+ inline bool can_relocate() const;
+ /** @return whether the block has been flagged old in buf_pool.LRU */
+ inline bool is_old() const;
+ /** Set whether a block is old in buf_pool.LRU */
+ inline void set_old(bool old);
+ /** Flag a page accessed in buf_pool
+ @return whether this is not the first access */
+ bool set_accessed()
+ {
+ if (is_accessed()) return true;
+ access_time= static_cast<uint32_t>(ut_time_ms());
+ return false;
+ }
+ /** @return ut_time_ms() at the time of first access of a block in buf_pool
+ @retval 0 if not accessed */
+ unsigned is_accessed() const { ut_ad(in_file()); return access_time; }
+};
+
+/** The buffer control block structure */
+
+struct buf_block_t{
+
+ /** @name General fields */
+ /* @{ */
+
+ buf_page_t page; /*!< page information; this must
+ be the first field, so that
+ buf_pool.page_hash can point
+ to buf_page_t or buf_block_t */
+ byte* frame; /*!< pointer to buffer frame which
+ is of size srv_page_size, and
+ aligned to an address divisible by
+ srv_page_size */
+ rw_lock_t lock; /*!< read-write lock of the buffer
+ frame */
+#ifdef UNIV_DEBUG
+ /** whether page.list is in buf_pool.withdraw
+  (state() == BUF_BLOCK_NOT_USED) and the buffer pool is being shrunk;
+ protected by buf_pool.mutex */
+ bool in_withdraw_list;
+ /** whether unzip_LRU is in buf_pool.unzip_LRU
+ (state() == BUF_BLOCK_FILE_PAGE and zip.data != nullptr);
+ protected by buf_pool.mutex */
+ bool in_unzip_LRU_list;
+#endif
+ UT_LIST_NODE_T(buf_block_t) unzip_LRU;
+ /*!< node of the decompressed LRU list;
+ a block is in the unzip_LRU list
+ if page.state() == BUF_BLOCK_FILE_PAGE
+ and page.zip.data != NULL */
+ /* @} */
+ /** @name Optimistic search field */
+ /* @{ */
+
+ ib_uint64_t modify_clock; /*!< this clock is incremented every
+ time a pointer to a record on the
+ page may become obsolete; this is
+ used in the optimistic cursor
+ positioning: if the modify clock has
+ not changed, we know that the pointer
+ is still valid; this field may be
+ changed if the thread (1) owns the
+ pool mutex and the page is not
+ bufferfixed, or (2) the thread has an
+ x-latch on the block */
+ /* @} */
+#ifdef BTR_CUR_HASH_ADAPT
+ /** @name Hash search fields (unprotected)
+ NOTE that these fields are NOT protected by any semaphore! */
+ /* @{ */
+
+ volatile uint16_t n_bytes; /*!< recommended prefix length for hash
+ search: number of bytes in
+ an incomplete last field */
+ volatile uint16_t n_fields; /*!< recommended prefix length for hash
+ search: number of full fields */
+ uint16_t n_hash_helps; /*!< counter which controls building
+ of a new hash index for the page */
+ volatile bool left_side; /*!< true or false, depending on
+ whether the leftmost record of several
+ records with the same prefix should be
+ indexed in the hash index */
+ /* @} */
+
+ /** @name Hash search fields
+ These 5 fields may only be modified when:
+ we are holding the appropriate x-latch in btr_search_latches[], and
+ one of the following holds:
+ (1) the block state is BUF_BLOCK_FILE_PAGE, and
+ we are holding an s-latch or x-latch on buf_block_t::lock, or
+ (2) buf_block_t::buf_fix_count == 0, or
+ (3) the block state is BUF_BLOCK_REMOVE_HASH.
+
+ An exception to this is when we init or create a page
+ in the buffer pool in buf0buf.cc.
+
+ Another exception for buf_pool_t::clear_hash_index() is that
+ assigning block->index = NULL (and block->n_pointers = 0)
+ is allowed whenever btr_search_own_all(RW_LOCK_X).
+
+ Another exception is that ha_insert_for_fold() may
+ decrement n_pointers without holding the appropriate latch
+ in btr_search_latches[]. Thus, n_pointers must be
+ protected by atomic memory access.
+
+ This implies that the fields may be read without race
+ condition whenever any of the following hold:
+ - the btr_search_latches[] s-latch or x-latch is being held, or
+ - the block state is not BUF_BLOCK_FILE_PAGE or BUF_BLOCK_REMOVE_HASH,
+ and holding some latch prevents the state from changing to that.
+
+ Some use of assert_block_ahi_empty() or assert_block_ahi_valid()
+ is prone to race conditions while buf_pool_t::clear_hash_index() is
+ executing (the adaptive hash index is being disabled). Such use
+ is explicitly commented. */
+
+ /* @{ */
+
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ Atomic_counter<ulint>
+ n_pointers; /*!< used in debugging: the number of
+ pointers in the adaptive hash index
+ pointing to this frame;
+ protected by atomic memory access
+ or btr_search_own_all(). */
+# define assert_block_ahi_empty(block) \
+ ut_a((block)->n_pointers == 0)
+# define assert_block_ahi_empty_on_init(block) do { \
+ MEM_MAKE_DEFINED(&(block)->n_pointers, sizeof (block)->n_pointers); \
+ assert_block_ahi_empty(block); \
+} while (0)
+# define assert_block_ahi_valid(block) \
+ ut_a((block)->index || (block)->n_pointers == 0)
+# else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+# define assert_block_ahi_empty(block) /* nothing */
+# define assert_block_ahi_empty_on_init(block) /* nothing */
+# define assert_block_ahi_valid(block) /* nothing */
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ unsigned curr_n_fields:10;/*!< prefix length for hash indexing:
+ number of full fields */
+ unsigned curr_n_bytes:15;/*!< number of bytes in hash
+ indexing */
+ unsigned curr_left_side:1;/*!< TRUE or FALSE in hash indexing */
+ dict_index_t* index; /*!< Index for which the
+ adaptive hash index has been
+ created, or NULL if the page
+ does not exist in the
+ index. Note that it does not
+ guarantee that the index is
+ complete, though: there may
+ have been hash collisions,
+ record deletions, etc. */
+ /* @} */
+#else /* BTR_CUR_HASH_ADAPT */
+# define assert_block_ahi_empty(block) /* nothing */
+# define assert_block_ahi_empty_on_init(block) /* nothing */
+# define assert_block_ahi_valid(block) /* nothing */
+#endif /* BTR_CUR_HASH_ADAPT */
+# ifdef UNIV_DEBUG
+ /** @name Debug fields */
+ /* @{ */
+ rw_lock_t* debug_latch; /*!< in the debug version, each thread
+ which bufferfixes the block acquires
+ an s-latch here; so we can use the
+ debug utilities in sync0rw */
+ /* @} */
+# endif
+ void fix() { page.fix(); }
+ uint32_t unfix()
+ {
+ ut_ad(page.buf_fix_count() || page.io_fix() != BUF_IO_NONE ||
+ page.state() == BUF_BLOCK_ZIP_PAGE ||
+ !rw_lock_own_flagged(&lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S |
+ RW_LOCK_FLAG_SX));
+ return page.unfix();
+ }
+
+ /** @return the physical size, in bytes */
+ ulint physical_size() const { return page.physical_size(); }
+
+ /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes
+ @retval 0 if not compressed */
+ ulint zip_size() const { return page.zip_size(); }
+
+ /** Initialize the block.
+ @param page_id page identifier
+ @param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+ @param fix initial buf_fix_count() */
+ void initialise(const page_id_t page_id, ulint zip_size, uint32_t fix= 0);
+};
+
+/**********************************************************************//**
+Compute the hash fold value for blocks in buf_pool.zip_hash. */
+/* @{ */
+#define BUF_POOL_ZIP_FOLD_PTR(ptr) (ulint(ptr) >> srv_page_size_shift)
+#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame)
+#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
+/* @} */
+
+/** A "Hazard Pointer" class used to iterate over page lists
+inside the buffer pool. A hazard pointer is a buf_page_t pointer
+which we intend to iterate over next and we want it remain valid
+even after we release the buffer pool mutex. */
+class HazardPointer
+{
+public:
+ virtual ~HazardPointer() {}
+
+ /** @return current value */
+ buf_page_t *get() const { mysql_mutex_assert_owner(m_mutex); return m_hp; }
+
+ /** Set current value
+ @param bpage buffer block to be set as hp */
+ void set(buf_page_t *bpage)
+ {
+ mysql_mutex_assert_owner(m_mutex);
+ ut_ad(!bpage || bpage->in_file());
+ m_hp= bpage;
+ }
+
+ /** Checks if a bpage is the hp
+ @param bpage buffer block to be compared
+ @return true if it is hp */
+ bool is_hp(const buf_page_t *bpage) const
+ { mysql_mutex_assert_owner(m_mutex); return bpage == m_hp; }
+
+ /** Adjust the value of hp. This happens when some
+ other thread working on the same list attempts to
+ remove the hp from the list. */
+ virtual void adjust(const buf_page_t*) = 0;
+
+#ifdef UNIV_DEBUG
+ /** mutex that protects access to the m_hp. */
+ const mysql_mutex_t *m_mutex= nullptr;
+#endif /* UNIV_DEBUG */
+
+protected:
+ /** hazard pointer */
+ buf_page_t *m_hp= nullptr;
+};
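+
+/* Illustrative sketch of the hazard-pointer pattern (simplified; not a
+verbatim excerpt from the flushing code): the scanner parks the next
+element in the hazard pointer before releasing the mutex; any thread
+that removes that element calls adjust(), keeping the pointer valid.
+
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  for (buf_page_t *p= UT_LIST_GET_LAST(buf_pool.flush_list); p; )
+  {
+    buf_pool.flush_hp.set(UT_LIST_GET_PREV(list, p));
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+    // ... process p without holding the mutex ...
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    p= buf_pool.flush_hp.get(); // adjusted if that element was removed
+  }
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+*/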
+
+/** Class implementing buf_pool.flush_list hazard pointer */
+class FlushHp : public HazardPointer
+{
+public:
+ ~FlushHp() override {}
+
+ /** Adjust the value of hp. This happens when some
+ other thread working on the same list attempts to
+ remove the hp from the list.
+ @param bpage buffer block to be compared */
+ void adjust(const buf_page_t *bpage) override
+ {
+ ut_ad(bpage != NULL);
+
+ /* We only support reverse traversal for now. */
+ if (is_hp(bpage))
+ m_hp= UT_LIST_GET_PREV(list, m_hp);
+
+ ut_ad(!m_hp || m_hp->oldest_modification());
+ }
+};
+
+/** Class implementing buf_pool.LRU hazard pointer */
+class LRUHp : public HazardPointer {
+public:
+ ~LRUHp() override {}
+
+ /** Adjust the value of hp. This happens when some
+ other thread working on the same list attempts to
+ remove the hp from the list.
+ @param bpage buffer block to be compared */
+ void adjust(const buf_page_t *bpage) override
+ {
+ ut_ad(bpage);
+    /* We only support reverse traversal for now. */
+ if (is_hp(bpage))
+ m_hp= UT_LIST_GET_PREV(LRU, m_hp);
+
+ ut_ad(!m_hp || m_hp->in_LRU_list);
+ }
+};
+
+/** Special purpose iterators to be used when scanning the LRU list.
+The idea is that when one thread finishes its scan, it leaves the
+iterator at that position, and another thread can start its scan
+from there. */
+class LRUItr : public LRUHp {
+public:
+ LRUItr() : LRUHp() {}
+ ~LRUItr() override {}
+
+ /** Select from where to start a scan. If we have scanned
+ too deep into the LRU list it resets the value to the tail
+ of the LRU list.
+ @return buf_page_t from where to start scan. */
+ inline buf_page_t *start();
+};
+
+/** Struct that is embedded in the free zip blocks */
+struct buf_buddy_free_t {
+ union {
+ ulint size; /*!< size of the block */
+ byte bytes[FIL_PAGE_DATA];
+ /*!< stamp[FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID]
+ == BUF_BUDDY_FREE_STAMP denotes a free
+ block. If the space_id field of buddy
+ block != BUF_BUDDY_FREE_STAMP, the block
+ is not in any zip_free list. If the
+ space_id is BUF_BUDDY_FREE_STAMP then
+ stamp[0] will contain the
+ buddy block size. */
+ } stamp;
+
+ buf_page_t bpage; /*!< Embedded bpage descriptor */
+ UT_LIST_NODE_T(buf_buddy_free_t) list;
+ /*!< Node of zip_free list */
+};
+
+/** @brief The buffer pool statistics structure. */
+struct buf_pool_stat_t{
+ ulint n_page_gets; /*!< number of page gets performed;
+ also successful searches through
+ the adaptive hash index are
+ counted as page gets; this field
+ is NOT protected by the buffer
+ pool mutex */
+ ulint n_pages_read; /*!< number of read operations */
+ ulint n_pages_written;/*!< number of write operations */
+ ulint n_pages_created;/*!< number of pages created
+ in the pool with no read */
+ ulint n_ra_pages_read_rnd;/*!< number of pages read in
+ as part of random read ahead */
+ ulint n_ra_pages_read;/*!< number of pages read in
+ as part of read ahead */
+ ulint n_ra_pages_evicted;/*!< number of read ahead
+ pages that are evicted without
+ being accessed */
+ ulint n_pages_made_young; /*!< number of pages made young, in
+ buf_page_make_young() */
+ ulint n_pages_not_made_young; /*!< number of pages not made
+ young because the first access
+ was not long enough ago, in
+ buf_page_peek_if_too_old() */
+ /** number of waits for eviction; writes protected by buf_pool.mutex */
+ ulint LRU_waits;
+ ulint LRU_bytes; /*!< LRU size in bytes */
+ ulint flush_list_bytes;/*!< flush_list size in bytes */
+};
+
+/** Statistics of buddy blocks of a given size. */
+struct buf_buddy_stat_t {
+ /** Number of blocks allocated from the buddy system. */
+ ulint used;
+ /** Number of blocks relocated by the buddy system. */
+ ib_uint64_t relocated;
+ /** Total duration of block relocations, in microseconds. */
+ ib_uint64_t relocated_usec;
+};
+
+/** The buffer pool */
+class buf_pool_t
+{
+ /** A chunk of buffers */
+ struct chunk_t
+ {
+ /** number of elements in blocks[] */
+ size_t size;
+ /** memory allocated for the page frames */
+ unsigned char *mem;
+ /** descriptor of mem */
+ ut_new_pfx_t mem_pfx;
+ /** array of buffer control blocks */
+ buf_block_t *blocks;
+
+ /** Map of first page frame address to chunks[] */
+ using map= std::map<const void*, chunk_t*, std::less<const void*>,
+ ut_allocator<std::pair<const void* const,chunk_t*>>>;
+ /** Chunk map that may be under construction by buf_resize_thread() */
+ static map *map_reg;
+ /** Current chunk map for lookup only */
+ static map *map_ref;
+
+    /** @return the memory size in bytes. */
+ size_t mem_size() const { return mem_pfx.m_size; }
+
+ /** Register the chunk */
+ void reg() { map_reg->emplace(map::value_type(blocks->frame, this)); }
+
+ /** Allocate a chunk of buffer frames.
+ @param bytes requested size
+ @return whether the allocation succeeded */
+ inline bool create(size_t bytes);
+
+#ifdef UNIV_DEBUG
+ /** Find a block that points to a ROW_FORMAT=COMPRESSED page
+ @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame
+ @return the block
+ @retval nullptr if not found */
+ const buf_block_t *contains_zip(const void *data) const
+ {
+ const buf_block_t *block= blocks;
+ for (auto i= size; i--; block++)
+ if (block->page.zip.data == data)
+ return block;
+ return nullptr;
+ }
+
+ /** Check that all blocks are in a replaceable state.
+ @return address of a non-free block
+ @retval nullptr if all freed */
+ inline const buf_block_t *not_freed() const;
+#endif /* UNIV_DEBUG */
+ };
+
+ /** Withdraw blocks from the buffer pool until meeting withdraw_target.
+ @return whether retry is needed */
+ inline bool withdraw_blocks();
+
+ /** Determine if a pointer belongs to a buf_block_t. It can be a pointer to
+ the buf_block_t itself or a member of it.
+ @param ptr a pointer that will not be dereferenced
+ @return whether the ptr belongs to a buf_block_t struct */
+ bool is_block_field(const void *ptr) const
+ {
+ const chunk_t *chunk= chunks;
+ const chunk_t *const echunk= chunk + ut_min(n_chunks, n_chunks_new);
+
+ /* TODO: protect chunks with a mutex (the older pointer will
+ currently remain during resize()) */
+ for (; chunk < echunk; chunk++)
+ if (ptr >= reinterpret_cast<const void*>(chunk->blocks) &&
+ ptr < reinterpret_cast<const void*>(chunk->blocks + chunk->size))
+ return true;
+ return false;
+ }
+
+ /** Try to reallocate a control block.
+ @param block control block to reallocate
+ @return whether the reallocation succeeded */
+ inline bool realloc(buf_block_t *block);
+
+public:
+ bool is_initialised() const { return chunks != nullptr; }
+
+ /** Create the buffer pool.
+ @return whether the creation failed */
+ bool create();
+
+ /** Clean up after successful create() */
+ void close();
+
+ /** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
+ inline void resize();
+
+ /** @return whether resize() is in progress */
+ bool resize_in_progress() const
+ {
+ return UNIV_UNLIKELY(resizing.load(std::memory_order_relaxed));
+ }
+
+ /** @return the current size in blocks */
+ size_t get_n_pages() const
+ {
+ ut_ad(is_initialised());
+ size_t size= 0;
+ for (auto j= n_chunks; j--; )
+ size+= chunks[j].size;
+ return size;
+ }
+
+ /** Determine whether a frame is intended to be withdrawn during resize().
+ @param ptr pointer within a buf_block_t::frame
+ @return whether the frame will be withdrawn */
+ bool will_be_withdrawn(const byte *ptr) const
+ {
+ ut_ad(curr_size < old_size);
+#ifdef SAFE_MUTEX
+ if (resizing.load(std::memory_order_relaxed))
+ mysql_mutex_assert_owner(&mutex);
+#endif /* SAFE_MUTEX */
+
+ for (const chunk_t *chunk= chunks + n_chunks_new,
+ * const echunk= chunks + n_chunks;
+ chunk != echunk; chunk++)
+ if (ptr >= chunk->blocks->frame &&
+ ptr < (chunk->blocks + chunk->size - 1)->frame + srv_page_size)
+ return true;
+ return false;
+ }
+
+ /** Determine whether a block is intended to be withdrawn during resize().
+ @param bpage buffer pool block
+ @return whether the frame will be withdrawn */
+ bool will_be_withdrawn(const buf_page_t &bpage) const
+ {
+ ut_ad(curr_size < old_size);
+#ifdef SAFE_MUTEX
+ if (resizing.load(std::memory_order_relaxed))
+ mysql_mutex_assert_owner(&mutex);
+#endif /* SAFE_MUTEX */
+
+ for (const chunk_t *chunk= chunks + n_chunks_new,
+ * const echunk= chunks + n_chunks;
+ chunk != echunk; chunk++)
+ if (&bpage >= &chunk->blocks->page &&
+ &bpage < &chunk->blocks[chunk->size].page)
+ return true;
+ return false;
+ }
+
+ /** Release and evict a corrupted page.
+ @param bpage page that was being read */
+ ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage);
+
+ /** Release a memory block to the buffer pool. */
+ ATTRIBUTE_COLD void free_block(buf_block_t *block);
+
+#ifdef UNIV_DEBUG
+ /** Find a block that points to a ROW_FORMAT=COMPRESSED page
+ @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame
+ @return the block
+ @retval nullptr if not found */
+ const buf_block_t *contains_zip(const void *data) const
+ {
+ mysql_mutex_assert_owner(&mutex);
+ for (const chunk_t *chunk= chunks, * const end= chunks + n_chunks;
+ chunk != end; chunk++)
+ if (const buf_block_t *block= chunk->contains_zip(data))
+ return block;
+ return nullptr;
+ }
+
+ /** Assert that all buffer pool pages are in a replaceable state */
+ void assert_all_freed();
+#endif /* UNIV_DEBUG */
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /** Clear the adaptive hash index on all pages in the buffer pool. */
+ inline void clear_hash_index();
+
+ /** Get a buffer block from an adaptive hash index pointer.
+ This function does not return if the block is not identified.
+ @param ptr pointer to within a page frame
+ @return pointer to block, never NULL */
+ inline buf_block_t *block_from_ahi(const byte *ptr) const;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ bool is_block_lock(const rw_lock_t *l) const
+ { return is_block_field(static_cast<const void*>(l)); }
+
+ /**
+ @return the smallest oldest_modification lsn for any page
+ @retval empty_lsn if all modified persistent pages have been flushed */
+ lsn_t get_oldest_modification(lsn_t empty_lsn)
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ while (buf_page_t *bpage= UT_LIST_GET_LAST(flush_list))
+ {
+ ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+ lsn_t lsn= bpage->oldest_modification();
+ if (lsn != 1)
+ {
+ ut_ad(lsn > 2);
+ return lsn;
+ }
+ delete_from_flush_list(bpage);
+ }
+ return empty_lsn;
+ }
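+
+  /* Illustrative sketch (simplified): a checkpoint computation could
+  use this to find the LSN below which all persistent changes have been
+  flushed; current_end_lsn is a hypothetical placeholder for the
+  current end of the redo log.
+
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    const lsn_t oldest= buf_pool.get_oldest_modification(current_end_lsn);
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  */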
+
+ /** Determine if a buffer block was created by chunk_t::create().
+ @param block block descriptor (not dereferenced)
+ @return whether block has been created by chunk_t::create() */
+ bool is_uncompressed(const buf_block_t *block) const
+ {
+ return is_block_field(reinterpret_cast<const void*>(block));
+ }
+
+ /** Get the page_hash latch for a page */
+ page_hash_latch *hash_lock_get(const page_id_t id) const
+ {
+ return page_hash.lock_get(id.fold());
+ }
+
+ /** Look up a block descriptor.
+ @param id page identifier
+ @param fold id.fold()
+ @return block descriptor, possibly in watch[]
+ @retval nullptr if not found*/
+ buf_page_t *page_hash_get_low(const page_id_t id, const ulint fold)
+ {
+ ut_ad(id.fold() == fold);
+#ifdef SAFE_MUTEX
+ DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
+ page_hash.lock_get(fold)->is_locked());
+#endif /* SAFE_MUTEX */
+ buf_page_t *bpage;
+ /* Look for the page in the hash table */
+ HASH_SEARCH(hash, &page_hash, fold, buf_page_t*, bpage,
+ ut_ad(bpage->in_page_hash), id == bpage->id());
+ return bpage;
+ }
+private:
+ /** Look up a block descriptor.
+ @tparam exclusive whether the latch is to be acquired exclusively
+ @tparam watch whether to allow watch_is_sentinel()
+ @param page_id page identifier
+ @param fold page_id.fold()
+ @param hash_lock pointer to the acquired latch (to be released by caller)
+ @return pointer to the block
+  @retval nullptr if no block was found; !hash_lock || !*hash_lock will also hold */
+ template<bool exclusive,bool watch>
+ buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
+ page_hash_latch **hash_lock)
+ {
+ ut_ad(hash_lock || !exclusive);
+ page_hash_latch *latch= page_hash.lock<exclusive>(fold);
+ buf_page_t *bpage= page_hash_get_low(page_id, fold);
+ if (!bpage || watch_is_sentinel(*bpage))
+ {
+ latch->release<exclusive>();
+ if (hash_lock)
+ *hash_lock= nullptr;
+ return watch ? bpage : nullptr;
+ }
+
+ ut_ad(bpage->in_file());
+ ut_ad(page_id == bpage->id());
+
+ if (hash_lock)
+ *hash_lock= latch; /* to be released by the caller */
+ else
+ latch->release<exclusive>();
+ return bpage;
+ }
+public:
+ /** Look up a block descriptor.
+ @tparam exclusive whether the latch is to be acquired exclusively
+ @param page_id page identifier
+ @param fold page_id.fold()
+ @param hash_lock pointer to the acquired latch (to be released by caller)
+ @return pointer to the block
+  @retval nullptr if no block was found; !hash_lock || !*hash_lock will also hold */
+ template<bool exclusive>
+ buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
+ page_hash_latch **hash_lock)
+ { return page_hash_get_locked<exclusive,false>(page_id, fold, hash_lock); }
+
+ /** @return whether the buffer pool contains a page
+ @tparam watch whether to allow watch_is_sentinel()
+ @param page_id page identifier */
+ template<bool watch= false>
+ bool page_hash_contains(const page_id_t page_id)
+ {
+ return page_hash_get_locked<false,watch>(page_id, page_id.fold(), nullptr);
+ }
+
+ /** Determine if a block is a sentinel for a buffer pool watch.
+ @param bpage page descriptor
+ @return whether bpage a sentinel for a buffer pool watch */
+ bool watch_is_sentinel(const buf_page_t &bpage)
+ {
+#ifdef SAFE_MUTEX
+ DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
+ hash_lock_get(bpage.id())->is_locked());
+#endif /* SAFE_MUTEX */
+ ut_ad(bpage.in_file());
+
+ if (&bpage < &watch[0] || &bpage >= &watch[UT_ARR_SIZE(watch)])
+ {
+ ut_ad(bpage.state() != BUF_BLOCK_ZIP_PAGE || bpage.zip.data);
+ return false;
+ }
+
+ ut_ad(bpage.state() == BUF_BLOCK_ZIP_PAGE);
+ ut_ad(!bpage.in_zip_hash);
+ ut_ad(!bpage.zip.data);
+ return true;
+ }
+
+ /** Check if a watched page has been read.
+ This may only be called after !watch_set() and before invoking watch_unset().
+ @param id page identifier
+ @return whether the page was read to the buffer pool */
+ bool watch_occurred(const page_id_t id)
+ {
+ const ulint fold= id.fold();
+ page_hash_latch *hash_lock= page_hash.lock<false>(fold);
+ /* The page must exist because watch_set() increments buf_fix_count. */
+ buf_page_t *bpage= page_hash_get_low(id, fold);
+ const bool is_sentinel= watch_is_sentinel(*bpage);
+ hash_lock->read_unlock();
+ return !is_sentinel;
+ }
+
+ /** Register a watch for a page identifier. The caller must hold an
+ exclusive page hash latch. The *hash_lock may be released,
+ relocated, and reacquired.
+ @param id page identifier
+ @param hash_lock exclusively held page_hash latch
+ @return a buffer pool block corresponding to id
+ @retval nullptr if the block was not present, and a watch was installed */
+ inline buf_page_t *watch_set(const page_id_t id,
+ page_hash_latch **hash_lock);
+
+ /** Stop watching whether a page has been read in.
+ watch_set(id) must have returned nullptr before.
+ @param id page identifier */
+ void watch_unset(const page_id_t id)
+ {
+ const ulint fold= id.fold();
+ page_hash_latch *hash_lock= page_hash.lock<true>(fold);
+ /* The page must exist because watch_set() increments buf_fix_count. */
+ buf_page_t *watch= page_hash_get_low(id, fold);
+ if (watch->unfix() == 0 && watch_is_sentinel(*watch))
+ {
+ /* The following is based on watch_remove(). */
+ ut_ad(watch->in_page_hash);
+ ut_d(watch->in_page_hash= false);
+ HASH_DELETE(buf_page_t, hash, &page_hash, fold, watch);
+ hash_lock->write_unlock();
+ // Now that the watch is detached from page_hash, release it to watch[].
+ mysql_mutex_lock(&mutex);
+ /* It is possible that watch_remove() already removed the watch. */
+ if (watch->id_ == id)
+ {
+ ut_ad(!watch->buf_fix_count());
+ ut_ad(watch->state() == BUF_BLOCK_ZIP_PAGE);
+ watch->set_state(BUF_BLOCK_NOT_USED);
+ }
+ mysql_mutex_unlock(&mutex);
+ }
+ else
+ hash_lock->write_unlock();
+ }
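+
+  /* Illustrative sketch of the watch protocol (simplified; the real
+  callers live in the change buffer code). watch_set() either returns
+  an existing block or installs a sentinel; only in the latter case may
+  watch_occurred() and watch_unset() be used.
+
+    page_hash_latch *hash_lock= page_hash.lock<true>(id.fold());
+    if (buf_page_t *bpage= watch_set(id, &hash_lock))
+    {
+      bpage->fix(); // keep the block around after releasing the latch
+      hash_lock->write_unlock();
+      // ... the page is already in the pool; use bpage ...
+    }
+    else
+    {
+      hash_lock->write_unlock();
+      // ... perform the buffered operation, then ...
+      if (watch_occurred(id))
+      {
+        // the page was read into the pool concurrently
+      }
+      watch_unset(id);
+    }
+  */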
+
+ /** Remove the sentinel block for the watch before replacing it with a
+ real block. watch_unset() or watch_occurred() will notice
+ that the block has been replaced with the real block.
+ @param watch sentinel */
+ inline void watch_remove(buf_page_t *watch);
+
+ /** @return whether less than 1/4 of the buffer pool is available */
+ bool running_out() const
+ {
+ return !recv_recovery_is_on() &&
+ UNIV_UNLIKELY(UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) <
+ std::min(curr_size, old_size) / 4);
+ }
+
+#ifdef UNIV_DEBUG
+ /** Validate the buffer pool. */
+ void validate();
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+ /** Write information of the buf_pool to the error log. */
+ void print();
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+ /** Remove a block from the LRU list.
+ @return the predecessor in the LRU list */
+ buf_page_t *LRU_remove(buf_page_t *bpage)
+ {
+ mysql_mutex_assert_owner(&mutex);
+ ut_ad(bpage->in_LRU_list);
+ ut_ad(bpage->in_page_hash);
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(bpage->in_file());
+ lru_hp.adjust(bpage);
+ lru_scan_itr.adjust(bpage);
+ ut_d(bpage->in_LRU_list= false);
+ buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
+ UT_LIST_REMOVE(LRU, bpage);
+ return prev;
+ }
+
+ /** Number of pages to read ahead */
+ static constexpr uint32_t READ_AHEAD_PAGES= 64;
+
+ /** Buffer pool mutex */
+ MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+ /** Number of pending LRU flush; protected by mutex. */
+ ulint n_flush_LRU_;
+ /** broadcast when n_flush_LRU reaches 0; protected by mutex */
+ pthread_cond_t done_flush_LRU;
+ /** Number of pending flush_list flush; protected by mutex */
+ ulint n_flush_list_;
+ /** broadcast when n_flush_list reaches 0; protected by mutex */
+ pthread_cond_t done_flush_list;
+
+ TPOOL_SUPPRESS_TSAN ulint n_flush_LRU() const { return n_flush_LRU_; }
+ TPOOL_SUPPRESS_TSAN ulint n_flush_list() const { return n_flush_list_; }
+
+ /** @name General fields */
+ /* @{ */
+ ulint curr_pool_size; /*!< Current pool size in bytes */
+ ulint LRU_old_ratio; /*!< Reserve this much of the buffer
+ pool for "old" blocks */
+#ifdef UNIV_DEBUG
+ ulint buddy_n_frames; /*!< Number of frames allocated from
+ the buffer pool to the buddy system */
+ ulint mutex_exit_forbidden; /*!< Forbid release mutex */
+#endif
+ ut_allocator<unsigned char> allocator; /*!< Allocator used for
+ allocating memory for the "chunks"
+ member. */
+ volatile ulint n_chunks; /*!< number of buffer pool chunks */
+ volatile ulint n_chunks_new; /*!< new number of buffer pool chunks */
+ chunk_t* chunks; /*!< buffer pool chunks */
+ chunk_t* chunks_old; /*!< old buffer pool chunks to be freed
+ after resizing buffer pool */
+ /** current pool size in pages */
+ Atomic_counter<ulint> curr_size;
+ /** previous pool size in pages */
+ Atomic_counter<ulint> old_size;
+ /** read-ahead request size in pages */
+ Atomic_counter<uint32_t> read_ahead_area;
+
+ /** Hash table with singly-linked overflow lists. @see hash_table_t */
+ struct page_hash_table
+ {
+ /** Number of array[] elements per page_hash_latch.
+ Must be one less than a power of 2. */
+ static constexpr size_t ELEMENTS_PER_LATCH= CPU_LEVEL1_DCACHE_LINESIZE /
+ sizeof(void*) - 1;
+
+ /** number of payload elements in array[] */
+ Atomic_relaxed<ulint> n_cells;
+ /** the hash table, with pad(n_cells) elements, aligned to L1 cache size */
+ hash_cell_t *array;
+
+ /** Create the hash table.
+ @param n the lower bound of n_cells */
+ void create(ulint n);
+
+ /** Free the hash table. */
+ void free() { aligned_free(array); array= nullptr; }
+
+ /** @return the index of an array element */
+ ulint calc_hash(ulint fold) const { return calc_hash(fold, n_cells); }
+ /** @return raw array index converted to padded index */
+ static ulint pad(ulint h) { return 1 + (h / ELEMENTS_PER_LATCH) + h; }
+ private:
+ /** @return the hash value before any ELEMENTS_PER_LATCH padding */
+ static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
+
+ /** @return the index of an array element */
+ static ulint calc_hash(ulint fold, ulint n_cells)
+ {
+ return pad(hash(fold, n_cells));
+ }
+ /** Get a page_hash latch. */
+ page_hash_latch *lock_get(ulint fold, ulint n) const
+ {
+ static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH),
+ "must be one less than a power of 2");
+ return reinterpret_cast<page_hash_latch*>
+ (&array[calc_hash(fold, n) & ~ELEMENTS_PER_LATCH]);
+ }
+ public:
+ /** Get a page_hash latch. */
+ page_hash_latch *lock_get(ulint fold) const
+ { return lock_get(fold, n_cells); }
+
+ /** Acquire an array latch.
+ @tparam exclusive whether the latch is to be acquired exclusively
+ @param fold hash bucket key */
+ template<bool exclusive> page_hash_latch *lock(ulint fold)
+ {
+ page_hash_latch *latch= lock_get(fold, n_cells);
+ latch->acquire<exclusive>();
+ return latch;
+ }
+
+  /** Exclusively acquire all latches */
+ inline void write_lock_all();
+
+ /** Release all latches */
+ inline void write_unlock_all();
+ };
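+
+  /* Worked example of the latch padding (assuming 64-byte cache lines
+  and 8-byte pointers, so ELEMENTS_PER_LATCH == 64/8 - 1 == 7):
+  pad(h) == 1 + h/7 + h places one latch cell before every run of 7
+  payload cells, so array[0], array[8], array[16], ... each hold a
+  page_hash_latch, and lock_get() masks the padded index with
+  ~ELEMENTS_PER_LATCH to locate the latch covering a given cell. */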
+
+ /** Hash table of file pages (buf_page_t::in_file() holds),
+ indexed by page_id_t. Protected by both mutex and page_hash.lock_get(). */
+ page_hash_table page_hash;
+
+ /** map of block->frame to buf_block_t blocks that belong
+ to buf_buddy_alloc(); protected by buf_pool.mutex */
+ hash_table_t zip_hash;
+ /** number of pending read operations */
+ Atomic_counter<ulint> n_pend_reads;
+ Atomic_counter<ulint>
+ n_pend_unzip; /*!< number of pending decompressions */
+
+ time_t last_printout_time;
+ /*!< when buf_print_io was last time
+ called */
+ buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
+ /*!< Statistics of buddy system,
+ indexed by block size */
+ buf_pool_stat_t stat; /*!< current statistics */
+ buf_pool_stat_t old_stat; /*!< old statistics */
+
+ /* @} */
+
+ /** @name Page flushing algorithm fields */
+ /* @{ */
+
+ /** mutex protecting flush_list, buf_page_t::set_oldest_modification()
+ and buf_page_t::list pointers when !oldest_modification() */
+ MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex;
+ /** "hazard pointer" for flush_list scans; protected by flush_list_mutex */
+ FlushHp flush_hp;
+ /** modified blocks (a subset of LRU) */
+ UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
+private:
+ /** whether the page cleaner needs wakeup from indefinite sleep */
+ bool page_cleaner_is_idle;
+ /** track server activity count for signaling idle flushing */
+ ulint last_activity_count;
+public:
+ /** signalled to wake up the page_cleaner; protected by flush_list_mutex */
+ pthread_cond_t do_flush_list;
+
+ /** @return whether the page cleaner must sleep due to being idle */
+ bool page_cleaner_idle() const
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ return page_cleaner_is_idle;
+ }
+ /** Wake up the page cleaner if needed */
+ inline void page_cleaner_wakeup();
+
+ /** Register whether an explicit wakeup of the page cleaner is needed */
+ void page_cleaner_set_idle(bool deep_sleep)
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ page_cleaner_is_idle= deep_sleep;
+ }
+
+ /** Update server last activity count */
+ void update_last_activity_count(ulint activity_count)
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ last_activity_count= activity_count;
+ }
+
+ // n_flush_LRU() + n_flush_list()
+ // is approximately COUNT(io_fix()==BUF_IO_WRITE) in flush_list
+
+ unsigned freed_page_clock;/*!< a sequence number used
+ to count the number of buffer
+ blocks removed from the end of
+ the LRU list; NOTE that this
+ counter may wrap around at 4
+ billion! A thread is allowed
+ to read this for heuristic
+ purposes without holding any
+ mutex or latch */
+ bool try_LRU_scan; /*!< Cleared when an LRU
+ scan for a free block fails. This
+ flag is used to avoid repeated
+ scans of the LRU list when we know
+ that there is no free block
+ available in the scan depth for
+ eviction. Set whenever
+ we flush a batch from the
+ buffer pool. Protected by the
+ buf_pool.mutex */
+ /* @} */
+
+ /** @name LRU replacement algorithm fields */
+ /* @{ */
+
+ UT_LIST_BASE_NODE_T(buf_page_t) free;
+ /*!< base node of the free
+ block list */
+ /** signaled each time when the free list grows; protected by mutex */
+ pthread_cond_t done_free;
+
+ UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
+ /*!< base node of the withdraw
+ block list. It is only used while
+ shrinking the buffer pool, to hold
+ blocks that will be removed and
+ must not be reused */
+
+ ulint withdraw_target;/*!< target length of withdraw
+ block list, when withdrawing */
+
+ /** "hazard pointer" used during scan of LRU while doing
+ LRU list batch. Protected by buf_pool_t::mutex. */
+ LRUHp lru_hp;
+
+ /** Iterator used to scan the LRU list when searching for
+  a replaceable victim. Protected by buf_pool_t::mutex. */
+ LRUItr lru_scan_itr;
+
+ UT_LIST_BASE_NODE_T(buf_page_t) LRU;
+ /*!< base node of the LRU list */
+
+ buf_page_t* LRU_old; /*!< pointer to the about
+ LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+ oldest blocks in the LRU list;
+ NULL if LRU length less than
+ BUF_LRU_OLD_MIN_LEN;
+ NOTE: when LRU_old != NULL, its length
+ should always equal LRU_old_len */
+ ulint LRU_old_len; /*!< length of the LRU list from
+ the block to which LRU_old points
+ onward, including that block;
+ see buf0lru.cc for the restrictions
+ on this value; 0 if LRU_old == NULL;
+ NOTE: LRU_old_len must be adjusted
+ whenever LRU_old shrinks or grows! */
+
+ UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
+ /*!< base node of the
+ unzip_LRU list */
+
+ /* @} */
+ /** free ROW_FORMAT=COMPRESSED page frames */
+ UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX];
+#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN
+# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN"
+#endif
+
+ /** Sentinels to detect if pages are read into the buffer pool while
+ a delete-buffering operation is pending. Protected by mutex. */
+ buf_page_t watch[innodb_purge_threads_MAX + 1];
+ /** Reserve a buffer. */
+ buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); }
+
+ /** @return whether any I/O is pending */
+ bool any_io_pending() const
+ {
+ return n_pend_reads || n_flush_LRU() || n_flush_list();
+ }
+ /** @return total amount of pending I/O */
+ ulint io_pending() const
+ {
+ return n_pend_reads + n_flush_LRU() + n_flush_list();
+ }
+
+private:
+ /** Remove a block from the flush list. */
+ inline void delete_from_flush_list_low(buf_page_t *bpage);
+ /** Remove a block from flush_list.
+ @param bpage buffer pool page
+ @param clear whether to invoke buf_page_t::clear_oldest_modification() */
+ void delete_from_flush_list(buf_page_t *bpage, bool clear);
+public:
+ /** Remove a block from flush_list.
+ @param bpage buffer pool page */
+ void delete_from_flush_list(buf_page_t *bpage)
+ { delete_from_flush_list(bpage, true); }
+
+ /** Insert a modified block into the flush list.
+ @param block modified block
+ @param lsn start LSN of the mini-transaction that modified the block */
+ void insert_into_flush_list(buf_block_t *block, lsn_t lsn);
+
+ /** Free a page whose underlying file page has been freed. */
+ inline void release_freed_page(buf_page_t *bpage);
+
+private:
+ /** Temporary memory for page_compressed and encrypted I/O */
+ struct io_buf_t
+ {
+ /** number of elements in slots[] */
+ ulint n_slots;
+ /** array of slots */
+ buf_tmp_buffer_t *slots;
+
+ void create(ulint n_slots)
+ {
+ this->n_slots= n_slots;
+ slots= static_cast<buf_tmp_buffer_t*>
+ (ut_malloc_nokey(n_slots * sizeof *slots));
+ memset((void*) slots, 0, n_slots * sizeof *slots);
+ }
+
+ void close()
+ {
+ for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
+ {
+ aligned_free(s->crypt_buf);
+ aligned_free(s->comp_buf);
+ }
+ ut_free(slots);
+ slots= nullptr;
+ n_slots= 0;
+ }
+
+ /** Reserve a buffer */
+ buf_tmp_buffer_t *reserve()
+ {
+ for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
+ if (s->acquire())
+ return s;
+ return nullptr;
+ }
+ } io_buf;
+
+ /** whether resize() is in the critical path */
+ std::atomic<bool> resizing;
+};
+
+/** The InnoDB buffer pool */
+extern buf_pool_t buf_pool;
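+
+/* A minimal sketch of the per-slot protocol that io_buf_t::reserve() and
+close() above rely on: acquire() is assumed to be an atomic test-and-set
+on the slot and release() its counterpart. The struct shape below is an
+illustration, not the actual buf_tmp_buffer_t definition. */
+struct buf_tmp_slot_sketch
+{
+  std::atomic<bool> reserved{false};
+  byte *crypt_buf= nullptr; /* temporary buffer for encrypted I/O */
+  byte *comp_buf= nullptr; /* temporary buffer for page_compressed I/O */
+  /* claim the slot; true if it was free */
+  bool acquire() { return !reserved.exchange(true, std::memory_order_relaxed); }
+  /* return the slot to the pool */
+  void release() { reserved.store(false, std::memory_order_relaxed); }
+};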
+
+inline void page_hash_latch::read_lock()
+{
+ mysql_mutex_assert_not_owner(&buf_pool.mutex);
+ if (!read_trylock())
+ read_lock_wait();
+}
+
+inline void page_hash_latch::write_lock()
+{
+ if (!write_trylock())
+ write_lock_wait();
+}
+
+inline void buf_page_t::add_buf_fix_count(uint32_t count)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ buf_fix_count_+= count;
+}
+
+inline void buf_page_t::set_buf_fix_count(uint32_t count)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ buf_fix_count_= count;
+}
+
+inline void buf_page_t::set_state(buf_page_state state)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+#ifdef UNIV_DEBUG
+ switch (state) {
+ case BUF_BLOCK_REMOVE_HASH:
+ /* buf_pool_t::corrupted_evict() invokes set_corrupt_id()
+ before buf_LRU_free_one_page(), so we cannot assert that
+ we are holding the hash_lock. */
+ break;
+ case BUF_BLOCK_MEMORY:
+ if (!in_file()) break;
+ /* fall through */
+ case BUF_BLOCK_FILE_PAGE:
+ ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
+ break;
+ case BUF_BLOCK_NOT_USED:
+ if (!in_file()) break;
+ /* fall through */
+ case BUF_BLOCK_ZIP_PAGE:
+ ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked() ||
+ (this >= &buf_pool.watch[0] &&
+ this <= &buf_pool.watch[UT_ARR_SIZE(buf_pool.watch)]));
+ break;
+ }
+#endif
+ state_= state;
+}
+
+inline void buf_page_t::set_io_fix(buf_io_fix io_fix)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ io_fix_= io_fix;
+}
+
+inline void buf_page_t::set_corrupt_id()
+{
+#ifdef UNIV_DEBUG
+ switch (oldest_modification()) {
+ case 0:
+ break;
+ case 2:
+ ut_ad(fsp_is_system_temporary(id().space()));
+ /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
+ ut_d(oldest_modification_= 0;)
+ break;
+ default:
+ ut_ad("block is dirty" == 0);
+ }
+ switch (state()) {
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_FILE_PAGE:
+ ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
+ break;
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_MEMORY:
+ ut_ad("invalid state" == 0);
+ }
+#endif
+ id_= page_id_t(~0ULL);
+}
+
+/** Set oldest_modification when adding to buf_pool.flush_list */
+inline void buf_page_t::set_oldest_modification(lsn_t lsn)
+{
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+ ut_ad(oldest_modification() <= 1);
+ ut_ad(lsn > 2);
+ oldest_modification_= lsn;
+}
+
+/** Clear oldest_modification after removing from buf_pool.flush_list */
+inline void buf_page_t::clear_oldest_modification()
+{
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+ ut_d(const auto state= state_);
+ ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_ZIP_PAGE ||
+ state == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(oldest_modification());
+ ut_ad(!list.prev);
+ ut_ad(!list.next);
+ /* We must use release memory order to guarantee that callers of
+ oldest_modification_acquire() will observe the block as
+ being detached from buf_pool.flush_list, after reading the value 0. */
+ oldest_modification_.store(0, std::memory_order_release);
+}
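+
+/* For reference: the reader side is assumed to pair an acquire load with
+the release store above, so that a thread observing 0 also observes the
+block as detached from buf_pool.flush_list. A sketch of such an accessor
+(the name and shape are assumptions): */
+inline lsn_t oldest_modification_acquire_sketch(const std::atomic<lsn_t> &om)
+{
+  return om.load(std::memory_order_acquire);
+}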
+
+/** Note that a block is no longer dirty, while not removing
+it from buf_pool.flush_list */
+inline void buf_page_t::clear_oldest_modification(bool temporary)
+{
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+ ut_ad(temporary == fsp_is_system_temporary(id().space()));
+ ut_ad(io_fix_ == BUF_IO_WRITE);
+ if (temporary)
+ {
+ ut_ad(oldest_modification() == 2);
+ oldest_modification_= 0;
+ }
+ else
+ {
+ /* We use release memory order to guarantee that callers of
+ oldest_modification_acquire() will observe the block as
+ being detached from buf_pool.flush_list, after reading the value 0. */
+ ut_ad(oldest_modification() > 2);
+ oldest_modification_.store(1, std::memory_order_release);
+ }
+}
+
+/** @return whether the block is modified and ready for flushing */
+inline bool buf_page_t::ready_for_flush() const
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(in_LRU_list);
+ ut_a(in_file());
+ ut_ad(fsp_is_system_temporary(id().space())
+ ? oldest_modification() == 2
+ : oldest_modification() > 2);
+ return io_fix_ == BUF_IO_NONE;
+}
+
+/** @return whether the block can be relocated in memory.
+The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
+inline bool buf_page_t::can_relocate() const
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(in_file());
+ ut_ad(in_LRU_list);
+ return io_fix_ == BUF_IO_NONE && !buf_fix_count_;
+}
+
+/** @return whether the block has been flagged old in buf_pool.LRU */
+inline bool buf_page_t::is_old() const
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(in_file());
+ ut_ad(in_LRU_list);
+ return old;
+}
+
+/** Set whether a block is old in buf_pool.LRU */
+inline void buf_page_t::set_old(bool old)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(in_LRU_list);
+
+#ifdef UNIV_LRU_DEBUG
+ ut_a((buf_pool.LRU_old_len == 0) == (buf_pool.LRU_old == nullptr));
+ /* If a block is flagged "old", the LRU_old list must exist. */
+ ut_a(!old || buf_pool.LRU_old);
+
+ if (UT_LIST_GET_PREV(LRU, this) && UT_LIST_GET_NEXT(LRU, this))
+ {
+ const buf_page_t *prev= UT_LIST_GET_PREV(LRU, this);
+ const buf_page_t *next = UT_LIST_GET_NEXT(LRU, this);
+ if (prev->old == next->old)
+ ut_a(prev->old == old);
+ else
+ {
+ ut_a(!prev->old);
+ ut_a(buf_pool.LRU_old == (old ? this : next));
+ }
+ }
+#endif /* UNIV_LRU_DEBUG */
+
+ this->old= old;
+}
+
+#ifdef UNIV_DEBUG
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() do { \
+ mysql_mutex_assert_owner(&buf_pool.mutex); \
+ buf_pool.mutex_exit_forbidden++; \
+} while (0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() do { \
+ mysql_mutex_assert_owner(&buf_pool.mutex); \
+ ut_ad(buf_pool.mutex_exit_forbidden--); \
+} while (0)
+#else
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() ((void) 0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() ((void) 0)
+#endif
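+
+/* Hypothetical usage sketch of the guards above: they bracket a region in
+which releasing buf_pool.mutex would invalidate pointers held by the
+caller; in debug builds any such release would trip the counter. */
+inline void buf_pool_mutex_exit_forbid_sketch()
+{
+  mysql_mutex_lock(&buf_pool.mutex);
+  buf_pool_mutex_exit_forbid();
+  /* ... code that must not release buf_pool.mutex ... */
+  buf_pool_mutex_exit_allow();
+  mysql_mutex_unlock(&buf_pool.mutex);
+}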
+
+/**********************************************************************
+Let us list the consistency conditions for different control block states.
+
+NOT_USED: is in free list, not in LRU list, not in flush list, nor
+ page hash table
+MEMORY: is not in free list, LRU list, or flush list, nor page
+ hash table
+FILE_PAGE: space and offset are defined, is in page hash table
+ if io_fix == BUF_IO_WRITE,
+ buf_pool.n_flush_LRU() || buf_pool.n_flush_list()
+
+ (1) if buf_fix_count == 0, then
+ is in LRU list, not in free list
+ is in flush list,
+ if and only if oldest_modification > 0
+ is x-locked,
+ if and only if io_fix == BUF_IO_READ
+ is s-locked,
+ if and only if io_fix == BUF_IO_WRITE
+
+ (2) if buf_fix_count > 0, then
+ is not in LRU list, not in free list
+ is in flush list,
+ if and only if oldest_modification > 0
+ if io_fix == BUF_IO_READ,
+ is x-locked
+ if io_fix == BUF_IO_WRITE,
+ is s-locked
+
+State transitions:
+
+NOT_USED => MEMORY
+MEMORY => FILE_PAGE
+MEMORY => NOT_USED
+FILE_PAGE => NOT_USED NOTE: This transition is allowed if and only if
+ (1) buf_fix_count == 0,
+ (2) oldest_modification == 0, and
+ (3) io_fix == 0.
+*/
+
+/** Select from where to start a scan. If we have scanned
+too deep into the LRU list, the scan restarts from the tail
+of the LRU list.
+@return buf_page_t from which to start the scan */
+inline buf_page_t *LRUItr::start()
+{
+ mysql_mutex_assert_owner(m_mutex);
+
+ if (!m_hp || m_hp->old)
+ m_hp= UT_LIST_GET_LAST(buf_pool.LRU);
+
+ return m_hp;
+}
+
+#ifdef UNIV_DEBUG
+/** Functor to validate the LRU list. */
+struct CheckInLRUList {
+ void operator()(const buf_page_t* elem) const
+ {
+ ut_a(elem->in_LRU_list);
+ }
+
+ static void validate()
+ {
+ ut_list_validate(buf_pool.LRU, CheckInLRUList());
+ }
+};
+
+/** Functor to validate the free list. */
+struct CheckInFreeList {
+ void operator()(const buf_page_t* elem) const
+ {
+ ut_a(elem->in_free_list);
+ }
+
+ static void validate()
+ {
+ ut_list_validate(buf_pool.free, CheckInFreeList());
+ }
+};
+
+struct CheckUnzipLRUAndLRUList {
+ void operator()(const buf_block_t* elem) const
+ {
+ ut_a(elem->page.in_LRU_list);
+ ut_a(elem->in_unzip_LRU_list);
+ }
+
+ static void validate()
+ {
+ ut_list_validate(buf_pool.unzip_LRU,
+ CheckUnzipLRUAndLRUList());
+ }
+};
+#endif /* UNIV_DEBUG */
+
+#include "buf0buf.ic"
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic
new file mode 100644
index 00000000..4d8cef4c
--- /dev/null
+++ b/storage/innobase/include/buf0buf.ic
@@ -0,0 +1,422 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2014, 2020, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.ic
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0mtr.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+#include "fsp0types.h"
+
+/*********************************************************************//**
+Gets the current size of buffer buf_pool in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+buf_pool_get_curr_size(void)
+/*========================*/
+{
+ return(srv_buf_pool_curr_size);
+}
+
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+unsigned
+buf_page_get_freed_page_clock(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: block */
+{
+ /* This is sometimes read without holding buf_pool.mutex. */
+ return(bpage->freed_page_clock);
+}
+
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+unsigned
+buf_block_get_freed_page_clock(
+/*===========================*/
+ const buf_block_t* block) /*!< in: block */
+{
+ return(buf_page_get_freed_page_clock(&block->page));
+}
+
+/** Determine if a block is still close enough to the MRU end of the LRU
+list that it is not in danger of getting evicted, which also implies
+that it has been accessed recently.
+The page must be either buffer-fixed, or its page hash must be locked.
+@param[in] bpage buffer pool page
+@return whether bpage is close to MRU end of LRU */
+inline bool buf_page_peek_if_young(const buf_page_t *bpage)
+{
+ /* FIXME: bpage->freed_page_clock is 31 bits */
+ return((buf_pool.freed_page_clock & ((1UL << 31) - 1))
+ < (bpage->freed_page_clock
+ + (buf_pool.curr_size
+ * (BUF_LRU_OLD_RATIO_DIV - buf_pool.LRU_old_ratio)
+ / (BUF_LRU_OLD_RATIO_DIV * 4))));
+}
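+
+/* Worked example with illustrative numbers: for curr_size = 65536 pages
+(a 1 GiB pool of 16 KiB pages) and LRU_old_ratio = 384 (37.5% of
+BUF_LRU_OLD_RATIO_DIV = 1024), the window is
+65536 * (1024 - 384) / (1024 * 4) = 10240 pages; a block counts as young
+while fewer than about 10240 pages have been evicted since its
+freed_page_clock was stamped. */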
+
+/** Determine if a block should be moved to the start of the LRU list if
+it is in danger of being evicted from the buffer pool.
+@param[in] bpage buffer pool page
+@return true if bpage should be made younger */
+inline bool buf_page_peek_if_too_old(const buf_page_t *bpage)
+{
+ if (buf_pool.freed_page_clock == 0) {
+ /* If eviction has not started yet, do not update the
+ statistics or move blocks in the LRU list. This is
+ either the warm-up phase or an in-memory workload. */
+ return(false);
+ } else if (buf_LRU_old_threshold_ms && bpage->old) {
+ uint32_t access_time = bpage->is_accessed();
+
+ /* It is possible that the below comparison returns an
+ unexpected result. 2^32 milliseconds pass in about 50 days,
+ so if the difference between ut_time_ms() and access_time
+ is e.g. 50 days + 15 ms, then the below will behave as if
+ it is 15 ms. This is known and fixing it would require to
+ increase buf_page_t::access_time from 32 to 64 bits. */
+ if (access_time
+ && ((ib_uint32_t) (ut_time_ms() - access_time))
+ >= buf_LRU_old_threshold_ms) {
+ return(true);
+ }
+
+ buf_pool.stat.n_pages_not_made_young++;
+ return false;
+ } else {
+ return !buf_page_peek_if_young(bpage);
+ }
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets a pointer to the memory frame of a block.
+@return pointer to the frame */
+UNIV_INLINE
+buf_frame_t*
+buf_block_get_frame(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ if (!block) {
+ return NULL;
+ }
+
+ switch (block->page.state()) {
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_NOT_USED:
+ ut_error;
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ ut_a(block->page.buf_fix_count());
+ /* fall through */
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ goto ok;
+ }
+ ut_error;
+ok:
+ return((buf_frame_t*) block->frame);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Allocates a buf_page_t descriptor. This function must succeed. In case
+of failure we assert in this function.
+@return the allocated descriptor */
+UNIV_INLINE
+buf_page_t*
+buf_page_alloc_descriptor(void)
+/*===========================*/
+{
+ buf_page_t* bpage;
+
+ bpage = (buf_page_t*) ut_zalloc_nokey(sizeof *bpage);
+ ut_ad(bpage);
+ MEM_UNDEFINED(bpage, sizeof *bpage);
+
+ return(bpage);
+}
+
+/********************************************************************//**
+Free a buf_page_t descriptor. */
+UNIV_INLINE
+void
+buf_page_free_descriptor(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: bpage descriptor to free. */
+{
+ ut_free(bpage);
+}
+
+/** Allocate a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+inline buf_block_t *buf_block_alloc()
+{
+ return buf_LRU_get_free_block(false);
+}
+
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+ buf_block_t* block) /*!< in, own: block to be freed */
+{
+ mysql_mutex_lock(&buf_pool.mutex);
+ buf_LRU_block_free_non_file_page(block);
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must either
+(1) own the buf_pool.mutex while the block's bufferfix count is zero, or
+(2) own an X-lock or SX-lock on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+ buf_block_t* block) /*!< in: block */
+{
+#ifdef SAFE_MUTEX
+ /* No latch is acquired for the shared temporary tablespace. */
+ ut_ad(fsp_is_system_temporary(block->page.id().space())
+ || (mysql_mutex_is_owner(&buf_pool.mutex)
+ && !block->page.buf_fix_count())
+ || rw_lock_own_flagged(&block->lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+#else /* SAFE_MUTEX */
+ /* No latch is acquired for the shared temporary tablespace. */
+ ut_ad(fsp_is_system_temporary(block->page.id().space())
+ || !block->page.buf_fix_count()
+ || rw_lock_own_flagged(&block->lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+#endif /* SAFE_MUTEX */
+ assert_block_ahi_valid(block);
+
+ block->modify_clock++;
+}
+
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+ buf_block_t* block) /*!< in: block */
+{
+#ifdef UNIV_DEBUG
+ /* No latch is acquired for the shared temporary tablespace. */
+ if (!fsp_is_system_temporary(block->page.id().space())) {
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_S)
+ || rw_lock_own(&(block->lock), RW_LOCK_X)
+ || rw_lock_own(&(block->lock), RW_LOCK_SX));
+ }
+#endif /* UNIV_DEBUG */
+
+ return(block->modify_clock);
+}
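+
+/* A typical consumer of the modify clock is optimistic position restore:
+save the clock while holding a latch, release, and later revalidate after
+re-latching. Schematic sketch only; the real logic lives in btr0pcur. */
+UNIV_INLINE
+bool
+buf_block_unchanged_since_sketch(
+/*=============================*/
+	buf_block_t*	block,		/*!< in: block, re-latched S or X */
+	ib_uint64_t	saved_clock)	/*!< in: previously saved clock */
+{
+	return(buf_block_get_modify_clock(block) == saved_clock);
+}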
+
+/*******************************************************************//**
+Increments the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_inc_func(
+/*=======================*/
+#ifdef UNIV_DEBUG
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line */
+#endif /* UNIV_DEBUG */
+ buf_block_t* block) /*!< in/out: block to bufferfix */
+{
+#ifdef UNIV_DEBUG
+ /* No debug latch is acquired if block belongs to system temporary.
+ Debug latch is not of much help if access to block is single
+ threaded. */
+ if (!fsp_is_system_temporary(block->page.id().space())) {
+ ibool ret;
+ ret = rw_lock_s_lock_nowait(block->debug_latch, file, line);
+ ut_a(ret);
+ }
+#endif /* UNIV_DEBUG */
+
+ block->fix();
+}
+
+/*******************************************************************//**
+Decrements the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_dec(
+/*==================*/
+ buf_block_t* block) /*!< in/out: block to bufferunfix */
+{
+#ifdef UNIV_DEBUG
+ /* No debug latch is acquired if block belongs to system temporary.
+ Debug latch is not of much help if access to block is single
+ threaded. */
+ if (!fsp_is_system_temporary(block->page.id().space())) {
+ rw_lock_s_unlock(block->debug_latch);
+ }
+#endif /* UNIV_DEBUG */
+
+ block->unfix();
+}
+
+/********************************************************************//**
+Releases a compressed-only page acquired with buf_page_get_zip(). */
+UNIV_INLINE
+void
+buf_page_release_zip(
+/*=================*/
+ buf_page_t* bpage) /*!< in: buffer block */
+{
+ ut_ad(bpage);
+ ut_a(bpage->buf_fix_count());
+
+ switch (bpage->state()) {
+ case BUF_BLOCK_FILE_PAGE:
+#ifdef UNIV_DEBUG
+ {
+ /* No debug latch is acquired if block belongs to system
+ temporary. Debug latch is not of much help if access to block
+ is single threaded. */
+ buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
+ if (!fsp_is_system_temporary(block->page.id().space())) {
+ rw_lock_s_unlock(block->debug_latch);
+ }
+ }
+#endif /* UNIV_DEBUG */
+ /* Fall through */
+ case BUF_BLOCK_ZIP_PAGE:
+ reinterpret_cast<buf_block_t*>(bpage)->unfix();
+ return;
+
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ }
+
+ ut_error;
+}
+
+/********************************************************************//**
+Releases a latch, if specified. */
+UNIV_INLINE
+void
+buf_page_release_latch(
+/*===================*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint rw_latch) /*!< in: RW_S_LATCH, RW_X_LATCH,
+ RW_NO_LATCH */
+{
+#ifdef UNIV_DEBUG
+ /* No debug latch is acquired if block belongs to system
+ temporary. Debug latch is not of much help if access to block
+ is single threaded. */
+ if (!fsp_is_system_temporary(block->page.id().space())) {
+ rw_lock_s_unlock(block->debug_latch);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (rw_latch == RW_S_LATCH) {
+ rw_lock_s_unlock(&block->lock);
+ } else if (rw_latch == RW_SX_LATCH) {
+ rw_lock_sx_unlock(&block->lock);
+ } else if (rw_latch == RW_X_LATCH) {
+ rw_lock_x_unlock(&block->lock);
+ }
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. */
+UNIV_INLINE
+void
+buf_block_dbg_add_level(
+/*====================*/
+ buf_block_t* block, /*!< in: buffer page
+ where we have acquired latch */
+ latch_level_t level) /*!< in: latching order level */
+{
+ sync_check_lock(&block->lock, level);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Get buf frame. */
+UNIV_INLINE
+void *
+buf_page_get_frame(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: buffer pool page */
+{
+ /* For encrypted or page_compressed pages, the buffer pool page may
+ carry an extra buffer where the result is stored. */
+ if (bpage->slot && bpage->slot->out_buf) {
+ return bpage->slot->out_buf;
+ } else if (bpage->zip.data) {
+ return bpage->zip.data;
+ } else {
+ return ((buf_block_t*) bpage)->frame;
+ }
+}
+
+/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit,
+if needed.
+@param[in] size size in bytes
+@return aligned size */
+UNIV_INLINE
+ulint
+buf_pool_size_align(
+ ulint size)
+{
+ const ulong m = srv_buf_pool_chunk_unit;
+ size = ut_max(size, srv_buf_pool_min_size);
+
+ if (size % m == 0) {
+ return(size);
+ } else {
+ return (ulint)((size / m + 1) * m);
+ }
+}
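+
+/* Worked example with illustrative numbers: for srv_buf_pool_chunk_unit =
+128 MiB, a requested size of 200 MiB is not a multiple of the unit, so the
+result is (200 / 128 + 1) * 128 = 256 MiB; a request of exactly 256 MiB is
+returned unchanged. */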
diff --git a/storage/innobase/include/buf0checksum.h b/storage/innobase/include/buf0checksum.h
new file mode 100644
index 00000000..8dc25f91
--- /dev/null
+++ b/storage/innobase/include/buf0checksum.h
@@ -0,0 +1,67 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0checksum.h
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0checksum_h
+#define buf0checksum_h
+
+#include "buf0types.h"
+
+/** Calculate the CRC32 checksum of a page. The value is stored to the page
+when it is written to a file and also checked for a match when reading from
+the file. Note that we must be careful to calculate the same value on all
+architectures.
+@param[in] page buffer page (srv_page_size bytes)
+@return CRC-32C */
+uint32_t buf_calc_page_crc32(const byte* page);
+
+/** Calculate a checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@param[in] page file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_new_checksum(const byte* page);
+
+/** In MySQL before 4.0.14 or 4.1.1 there was an InnoDB bug that
+the checksum only looked at the first few bytes of the page.
+This calculates that old checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@param[in] page file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_old_checksum(const byte* page);
+
+/** Return a printable string describing the checksum algorithm.
+@param[in] algo algorithm
+@return algorithm name */
+const char*
+buf_checksum_algorithm_name(srv_checksum_algorithm_t algo);
+
+extern ulong srv_checksum_algorithm;
+
+#endif /* buf0checksum_h */
diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h
new file mode 100644
index 00000000..fb9df555
--- /dev/null
+++ b/storage/innobase/include/buf0dblwr.h
@@ -0,0 +1,170 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0dblwr.h
+Doublewrite buffer module
+
+Created 2011/12/19 Inaam Rana
+*******************************************************/
+
+#pragma once
+
+#include "os0file.h"
+#include "buf0types.h"
+
+/** Doublewrite control struct */
+class buf_dblwr_t
+{
+ struct element
+ {
+ /** asynchronous write request */
+ IORequest request;
+ /** payload size in bytes */
+ size_t size;
+ };
+
+ struct slot
+ {
+ /** first free position in write_buf, measured in units of
+ srv_page_size */
+ ulint first_free;
+ /** number of slots reserved for the current write batch */
+ ulint reserved;
+ /** the doublewrite buffer, aligned to srv_page_size */
+ byte* write_buf;
+ /** buffer blocks to be written via write_buf */
+ element* buf_block_arr;
+ };
+
+ /** the page number of the first doublewrite block (block_size() pages) */
+ page_id_t block1= page_id_t(0, 0);
+ /** the page number of the second doublewrite block (block_size() pages) */
+ page_id_t block2= page_id_t(0, 0);
+
+ /** mutex protecting the data members below */
+ mysql_mutex_t mutex;
+ /** condition variable for !batch_running */
+ pthread_cond_t cond;
+ /** whether a batch is being written from the doublewrite buffer */
+ bool batch_running;
+ /** number of expected flush_buffered_writes_completed() calls */
+ unsigned flushing_buffered_writes;
+ /** pages submitted to flush_buffered_writes() */
+ ulint pages_submitted;
+ /** number of flush_buffered_writes_completed() calls */
+ ulint writes_completed;
+ /** number of pages written by flush_buffered_writes_completed() */
+ ulint pages_written;
+
+ slot slots[2];
+ slot *active_slot= &slots[0];
+
+ /** Initialize the doublewrite buffer data structure.
+ @param header doublewrite page header in the TRX_SYS page */
+ inline void init(const byte *header);
+
+ /** Flush possible buffered writes to persistent storage. */
+ bool flush_buffered_writes(const ulint size);
+
+public:
+ /** Create or restore the doublewrite buffer in the TRX_SYS page.
+ @return whether the operation succeeded */
+ bool create();
+ /** Free the doublewrite buffer. */
+ void close();
+
+ /** Acquire the mutex */
+ void lock() { mysql_mutex_lock(&mutex); }
+ /** @return the number of submitted page writes */
+ ulint submitted() const
+ { mysql_mutex_assert_owner(&mutex); return pages_submitted; }
+ /** @return the number of completed batches */
+ ulint batches() const
+ { mysql_mutex_assert_owner(&mutex); return writes_completed; }
+ /** @return the number of final pages written */
+ ulint written() const
+ { mysql_mutex_assert_owner(&mutex); return pages_written; }
+ /** Release the mutex */
+ void unlock() { mysql_mutex_unlock(&mutex); }
+
+ /** Initialize the doublewrite buffer memory structure on recovery.
+ If we are upgrading from a version before MySQL 4.1, then this
+ function performs the necessary update operations to support
+ innodb_file_per_table. If we are in a crash recovery, this function
+ loads the pages from double write buffer into memory.
+ @param file File handle
+ @param path Path name of file
+ @return DB_SUCCESS or error code */
+ dberr_t init_or_load_pages(pfs_os_file_t file, const char *path);
+
+ /** Process and remove the double write buffer pages for all tablespaces. */
+ void recover();
+
+ /** Update the doublewrite buffer on data page write completion. */
+ void write_completed();
+ /** Flush possible buffered writes to persistent storage.
+ It is very important to call this function after a batch of writes has been
+ posted, and also when we may have to wait for a page latch!
+ Otherwise a deadlock of threads can occur. */
+ void flush_buffered_writes();
+ /** Update the doublewrite buffer on write batch completion
+ @param request the completed batch write request */
+ void flush_buffered_writes_completed(const IORequest &request);
+
+ /** Size of the doublewrite block in pages */
+ uint32_t block_size() const { return FSP_EXTENT_SIZE; }
+
+ /** Schedule a page write. If the doublewrite memory buffer is full,
+ flush_buffered_writes() will be invoked to make space.
+ @param request asynchronous write request
+ @param size payload size in bytes */
+ void add_to_batch(const IORequest &request, size_t size);
+
+ /** Determine whether the doublewrite buffer is initialized */
+ bool is_initialised() const
+ { return UNIV_LIKELY(block1 != page_id_t(0, 0)); }
+
+ /** @return whether a page identifier is part of the doublewrite buffer */
+ bool is_inside(const page_id_t id) const
+ {
+ if (!is_initialised())
+ return false;
+ ut_ad(block1 < block2);
+ if (id < block1)
+ return false;
+ const uint32_t size= block_size();
+ return id < block1 + size || (id >= block2 && id < block2 + size);
+ }
+
+ /** Wait for flush_buffered_writes() to be fully completed */
+ void wait_flush_buffered_writes()
+ {
+ if (is_initialised())
+ {
+ mysql_mutex_lock(&mutex);
+ while (batch_running)
+ my_cond_wait(&cond, &mutex.m_mutex);
+ mysql_mutex_unlock(&mutex);
+ }
+ }
+};
+
+/** The doublewrite buffer */
+extern buf_dblwr_t buf_dblwr;
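+
+/* Minimal usage sketch (hypothetical caller; error handling omitted):
+a page write is queued into the doublewrite memory buffer, and the batch
+is forced out before the thread may wait on any page latch. */
+inline void buf_dblwr_write_sketch(const IORequest &request, size_t size)
+{
+  /* copy the page into the active slot; flushes implicitly when full */
+  buf_dblwr.add_to_batch(request, size);
+  /* post the batch: doublewrite area first, then the final page locations */
+  buf_dblwr.flush_buffered_writes();
+}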
diff --git a/storage/innobase/include/buf0dump.h b/storage/innobase/include/buf0dump.h
new file mode 100644
index 00000000..48586900
--- /dev/null
+++ b/storage/innobase/include/buf0dump.h
@@ -0,0 +1,44 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0dump.h
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0dump_h
+#define buf0dump_h
+
+/** Start the buffer pool dump/load task and instruct it to start a dump. */
+void buf_dump_start();
+/** Start the buffer pool dump/load task and instruct it to start a load. */
+void buf_load_start();
+
+/** Abort a currently running buffer pool load. */
+void buf_load_abort();
+
+/** Start an async buffer pool load, if srv_buffer_pool_load_at_startup was set. */
+void buf_load_at_startup();
+
+/** Wait for any currently running load/dump to finish. */
+void buf_load_dump_end();
+
+#endif /* buf0dump_h */
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
new file mode 100644
index 00000000..8d45cf2b
--- /dev/null
+++ b/storage/innobase/include/buf0flu.h
@@ -0,0 +1,153 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.h
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0flu_h
+#define buf0flu_h
+
+#include "ut0byte.h"
+#include "log0log.h"
+#include "buf0types.h"
+
+/** Number of pages flushed. Protected by buf_pool.mutex. */
+extern ulint buf_flush_page_count;
+/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
+Also included in buf_flush_page_count. */
+extern ulint buf_lru_flush_page_count;
+
+/** Flag indicating if the page_cleaner is in active state. */
+extern bool buf_page_cleaner_is_active;
+
+#ifdef UNIV_DEBUG
+
+/** Value of MySQL global variable used to disable page cleaner. */
+extern my_bool innodb_page_cleaner_disabled_debug;
+
+#endif /* UNIV_DEBUG */
+
+/** Remove all dirty pages belonging to a given tablespace when we are
+deleting the data file of that tablespace.
+The pages remain in the LRU list and are evicted from it
+as they age towards its tail.
+@param id tablespace identifier */
+void buf_flush_remove_pages(ulint id);
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage. */
+ATTRIBUTE_COLD
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage); /*!< in/out: destination block */
+
+/** Complete write of a file page from buf_pool.
+@param request write request */
+void buf_page_write_complete(const IORequest &request);
+
+/** Assign the full crc32 checksum for non-compressed page.
+@param[in,out] page page to be updated */
+void buf_flush_assign_full_crc32_checksum(byte* page);
+
+/** Initialize a page for writing to the tablespace.
+@param[in] block buffer block; NULL if bypassing the buffer pool
+@param[in,out] page page frame
+@param[in,out] page_zip_ compressed page, or NULL if uncompressed
+@param[in] use_full_checksum whether tablespace uses full checksum */
+void
+buf_flush_init_for_writing(
+ const buf_block_t* block,
+ byte* page,
+ void* page_zip_,
+ bool use_full_checksum);
+
+/** Write out dirty blocks from buf_pool.flush_list.
+@param max_n desired maximum number of blocks flushed
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@return the number of processed pages
+@retval 0 if a buf_pool.flush_list batch is already running */
+ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX);
+
+/** Try to flush dirty pages that belong to a given tablespace.
+@param space tablespace
+@param n_flushed number of pages written
+@return whether the flush for some pages might not have been initiated */
+bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Write out dirty blocks from buf_pool.LRU.
+@param max_n desired maximum number of blocks flushed
+@return the number of processed pages
+@retval 0 if a buf_pool.LRU batch is already running */
+ulint buf_flush_LRU(ulint max_n);
+
+/** Wait until a flush batch ends.
+@param lru true=buf_pool.LRU; false=buf_pool.flush_list */
+void buf_flush_wait_batch_end(bool lru);
+/** Wait until all persistent pages are flushed up to a limit.
+@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
+ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn);
+/** Initiate more eager page flushing if the log checkpoint age is too old.
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@param furious true=furious flushing, false=limit to innodb_io_capacity */
+ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious);
+
+/********************************************************************//**
+This function should be called at a mini-transaction commit, if a page was
+modified in it. Puts the block on the list of modified blocks, if it is
+not already in it. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ lsn_t start_lsn, /*!< in: start lsn of the first mtr in a
+ set of mtr's */
+ lsn_t end_lsn); /*!< in: end lsn of the last mtr in the
+ set of mtr's */
+
+/** Initialize page_cleaner. */
+ATTRIBUTE_COLD void buf_flush_page_cleaner_init();
+
+/** Wait for pending flushes to complete. */
+void buf_flush_wait_batch_end_acquiring_mutex(bool lru);
+
+/** Flush the buffer pool on shutdown. */
+ATTRIBUTE_COLD void buf_flush_buffer_pool();
+
+#ifdef UNIV_DEBUG
+/** Validate the flush list. */
+void buf_flush_validate();
+#endif /* UNIV_DEBUG */
+
+/** Synchronously flush dirty blocks.
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync();
+
+#include "buf0flu.ic"
+
+#endif
diff --git a/storage/innobase/include/buf0flu.ic b/storage/innobase/include/buf0flu.ic
new file mode 100644
index 00000000..b8a9b6d1
--- /dev/null
+++ b/storage/innobase/include/buf0flu.ic
@@ -0,0 +1,66 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.ic
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "assume_aligned.h"
+#include "buf0buf.h"
+#include "srv0srv.h"
+
+/********************************************************************//**
+This function should be called at a mini-transaction commit, if a page was
+modified in it. Puts the block on the list of modified blocks, if it is not
+already in it. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ lsn_t start_lsn, /*!< in: start lsn of the mtr that
+ modified this block */
+ lsn_t end_lsn) /*!< in: end lsn of the mtr that
+ modified this block */
+{
+ ut_ad(!srv_read_only_mode);
+ ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.buf_fix_count());
+ ut_ad(mach_read_from_8(block->frame + FIL_PAGE_LSN) <= end_lsn);
+ mach_write_to_8(block->frame + FIL_PAGE_LSN, end_lsn);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ memcpy_aligned<8>(FIL_PAGE_LSN + block->page.zip.data,
+ FIL_PAGE_LSN + block->frame, 8);
+ }
+
+ const lsn_t oldest_modification = block->page.oldest_modification();
+
+ if (oldest_modification > 1) {
+ ut_ad(oldest_modification <= start_lsn);
+ } else if (fsp_is_system_temporary(block->page.id().space())) {
+ block->page.set_temp_modified();
+ } else {
+ buf_pool.insert_into_flush_list(block, start_lsn);
+ }
+
+ srv_stats.buf_pool_write_requests.inc();
+}
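+
+/* Usage sketch (hypothetical driver; the real bookkeeping is in mtr0mtr):
+at mini-transaction commit, each block dirtied by the mtr is stamped with
+the LSN range that the mtr generated. */
+UNIV_INLINE
+void
+buf_flush_note_all_sketch(
+/*======================*/
+	buf_block_t**	blocks,		/*!< in: blocks dirtied by the mtr */
+	ulint		n,		/*!< in: number of blocks */
+	lsn_t		start_lsn,	/*!< in: start LSN of the mtr */
+	lsn_t		end_lsn)	/*!< in: end LSN of the mtr */
+{
+	for (ulint i = 0; i < n; i++) {
+		buf_flush_note_modification(blocks[i], start_lsn, end_lsn);
+	}
+}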
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
new file mode 100644
index 00000000..540c14a4
--- /dev/null
+++ b/storage/innobase/include/buf0lru.h
@@ -0,0 +1,204 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0lru.h
+The database buffer pool LRU replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0lru_h
+#define buf0lru_h
+
+#include "ut0byte.h"
+#include "buf0types.h"
+
+// Forward declaration
+struct trx_t;
+struct fil_space_t;
+
+/** Flush this many pages in buf_LRU_get_free_block() */
+extern size_t innodb_lru_flush_size;
+
+/*#######################################################################
+These are low-level functions
+#########################################################################*/
+
+/** Minimum LRU list length for which the LRU_old pointer is defined */
+#define BUF_LRU_OLD_MIN_LEN 512 /* 8 megabytes of 16k pages */
+
+/** Try to free a block. If bpage is a descriptor of a compressed-only
+ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well.
+The caller must hold buf_pool.mutex.
+@param bpage block to be freed
+@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page
+@retval true if freed and buf_pool.mutex may have been temporarily released
+@retval false if the page was not freed */
+bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
+ MY_ATTRIBUTE((nonnull));
+
+/** Try to free a replaceable block.
+@param limit maximum number of blocks to scan
+@return true if found and freed */
+bool buf_LRU_scan_and_free_block(ulint limit= ULINT_UNDEFINED);
+
+/** @return a buffer block from the buf_pool.free list
+@retval NULL if the free list is empty */
+buf_block_t* buf_LRU_get_free_only();
+
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
+
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list; even when we flush a page or free one in an LRU scan,
+we put it on the free list before it is used.
+* iteration 0:
+ * get a block from the buf_pool.free list, success:done
+ * if buf_pool.try_LRU_scan is set
+ * scan LRU up to 100 pages to free a clean block
+ * success:retry the free list
+ * flush up to innodb_lru_flush_size LRU blocks to data files
+ (until UT_LIST_GET_LEN(buf_pool.free) < innodb_lru_scan_depth)
+ * on buf_page_write_complete() the blocks will be put on the buf_pool.free list
+ * success: retry the free list
+* subsequent iterations: same as iteration 0 except:
+ * scan whole LRU list
+ * scan LRU list even if buf_pool.try_LRU_scan is not set
+
+@param have_mutex whether buf_pool.mutex is already being held
+@return the free control block, in state BUF_BLOCK_MEMORY */
+buf_block_t* buf_LRU_get_free_block(bool have_mutex)
+ MY_ATTRIBUTE((malloc,warn_unused_result));
+
+/** @return whether the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list */
+bool buf_LRU_evict_from_unzip_LRU();
+
+/** Puts a block back to the free list.
+@param[in] block block; not containing a file page */
+void
+buf_LRU_block_free_non_file_page(buf_block_t* block);
+/******************************************************************//**
+Adds a block to the LRU list. Please make sure that the page_size is
+already set when invoking the function, so that we can get correct
+page_size from the buffer page when adding a block into LRU */
+void
+buf_LRU_add_block(
+/*==============*/
+ buf_page_t* bpage, /*!< in: control block */
+ bool old); /*!< in: true if the block should be put
+ among the old blocks in the LRU list, else put
+ to the start; if the LRU list is very short,
+ it is added to the start regardless of this
+ parameter */
+/******************************************************************//**
+Adds a block to the LRU list of decompressed zip pages. */
+void
+buf_unzip_LRU_add_block(
+/*====================*/
+ buf_block_t* block, /*!< in: control block */
+ ibool old); /*!< in: TRUE if the block should be put to
+ the end of the list, else put to the start */
+
+/** Update buf_pool.LRU_old_ratio.
+@param[in] old_pct Reserve this percentage of
+ the buffer pool for "old" blocks
+@param[in] adjust true=adjust the LRU list;
+ false=just assign buf_pool.LRU_old_ratio
+ during the initialization of InnoDB
+@return updated old_pct */
+uint buf_LRU_old_ratio_update(uint old_pct, bool adjust);
+/********************************************************************//**
+Update the historical stats that we are collecting for LRU eviction
+policy at the end of each interval. */
+void
+buf_LRU_stat_update();
+
+/** Remove one page from LRU list and put it to free list.
+@param bpage file page to be freed
+@param id page identifier
+@param hash_lock buf_pool.page_hash latch (will be released here) */
+void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
+ page_hash_latch *hash_lock)
+ MY_ATTRIBUTE((nonnull));
+
+#ifdef UNIV_DEBUG
+/** Validate the LRU list. */
+void buf_LRU_validate();
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Dump the LRU list to stderr. */
+void buf_LRU_print();
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+/** @name Heuristics for detecting index scan @{ */
+/** The denominator of buf_pool.LRU_old_ratio. */
+#define BUF_LRU_OLD_RATIO_DIV 1024
+/** Maximum value of buf_pool.LRU_old_ratio.
+@see buf_LRU_old_adjust_len
+@see buf_pool.LRU_old_ratio_update */
+#define BUF_LRU_OLD_RATIO_MAX BUF_LRU_OLD_RATIO_DIV
+/** Minimum value of buf_pool.LRU_old_ratio.
+@see buf_LRU_old_adjust_len
+@see buf_pool.LRU_old_ratio_update
+The minimum must exceed
+(BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */
+#define BUF_LRU_OLD_RATIO_MIN 51
+
+#if BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX
+# error "BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX"
+#endif
+#if BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV
+# error "BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV"
+#endif
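+
+/* Sanity check of the bound stated above, assuming BUF_LRU_OLD_TOLERANCE
+is 20 (its value in buf0lru.cc): (20 + 5) * 1024 / 512 = 50, which the
+minimum value 51 indeed exceeds. */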
+
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago. Not protected by any mutex or latch. */
+extern uint buf_LRU_old_threshold_ms;
+/* @} */
+
+/** @brief Statistics for selecting the LRU list for eviction.
+
+These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O
+and page_zip_decompress() operations. Based on the statistics we decide
+if we want to evict from buf_pool.unzip_LRU or buf_pool.LRU. */
+struct buf_LRU_stat_t
+{
+ ulint io; /**< Counter of buffer pool I/O operations. */
+ ulint unzip; /**< Counter of page_zip_decompress operations. */
+};
+
+/** Current operation counters. Not protected by any mutex.
+Cleared by buf_LRU_stat_update(). */
+extern buf_LRU_stat_t buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update(). Protected by buf_pool.mutex. */
+extern buf_LRU_stat_t buf_LRU_stat_sum;
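+
+/* Sketch of how these counters could drive the eviction choice; the
+weight factor and averaging interval are placeholders here, and the real
+policy is buf_LRU_evict_from_unzip_LRU() in buf0lru.cc. */
+inline bool lru_workload_is_io_bound_sketch(ulint io_weight, ulint n_intervals)
+{
+  const ulint io_avg =
+      buf_LRU_stat_sum.io / n_intervals + buf_LRU_stat_cur.io;
+  const ulint unzip_avg =
+      buf_LRU_stat_sum.unzip / n_intervals + buf_LRU_stat_cur.unzip;
+  /* I/O-bound: decompression is cheap relative to I/O, so prefer evicting
+     uncompressed frames from unzip_LRU and keep the compressed copies. */
+  return unzip_avg <= io_avg * io_weight;
+}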
+
+/********************************************************************//**
+Increments the I/O counter in buf_LRU_stat_cur. */
+#define buf_LRU_stat_inc_io() buf_LRU_stat_cur.io++
+/********************************************************************//**
+Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */
+#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++
+
+#endif
diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
new file mode 100644
index 00000000..87c6b5d7
--- /dev/null
+++ b/storage/innobase/include/buf0rea.h
@@ -0,0 +1,119 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0rea.h
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0rea_h
+#define buf0rea_h
+
+#include "buf0buf.h"
+
+/** High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@retval DB_SUCCESS if the page was read and is not corrupted,
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
+@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
+after decryption normal page checksum does not match.
+@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
+dberr_t buf_read_page(const page_id_t page_id, ulint zip_size);
+
+/** High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param[in,out] space tablespace
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] sync true if synchronous aio is desired */
+void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
+ ulint zip_size, bool sync)
+ MY_ATTRIBUTE((nonnull));
+
+/** Applies a random read-ahead in buf_pool if there are at least a threshold
+value of accessed pages from the random read-ahead area. Does not read any
+page, not even the one at the position (space, offset), if the read-ahead
+mechanism is not activated. NOTE 1: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches! NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@param[in] page_id page id of a page which the current thread
+wants to access
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] ibuf whether we are inside ibuf routine
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value! */
+ulint
+buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf);
+
+/** Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens, if these are not initialized to any
+sensible value? No problem, before applying read-ahead we check that the
+area to read is within the span of the space, if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@param[in] page_id page id; see NOTE 3 above
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] ibuf whether we are inside an ibuf routine
+@return number of page read requests issued */
+ulint
+buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf);
+
+/** Issues read requests for pages which recovery wants to read in.
+@param[in] space_id tablespace id
+@param[in] page_nos array of page numbers to read, with the
+highest page number the last in the array
+@param[in] n number of page numbers in the array */
+void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n);
+
+/** @name Modes used in read-ahead @{ */
+/** read only pages belonging to the insert buffer tree */
+#define BUF_READ_IBUF_PAGES_ONLY 131
+/** read any page */
+#define BUF_READ_ANY_PAGE 132
+/* @} */
+
+#endif
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
new file mode 100644
index 00000000..5dd58109
--- /dev/null
+++ b/storage/innobase/include/buf0types.h
@@ -0,0 +1,225 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2019, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0types.h
+The database buffer pool global types for the directory
+
+Created 11/17/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0types_h
+#define buf0types_h
+
+#include "univ.i"
+
+/** Buffer page (uncompressed or compressed) */
+class buf_page_t;
+/** Buffer block for which an uncompressed page exists */
+struct buf_block_t;
+/** Buffer pool statistics struct */
+struct buf_pool_stat_t;
+/** Buffer pool buddy statistics struct */
+struct buf_buddy_stat_t;
+
+/** A buffer frame. @see page_t */
+typedef byte buf_frame_t;
+
+/** Flags for io_fix types */
+enum buf_io_fix {
+	BUF_IO_NONE = 0,	/**< no pending I/O */
+	BUF_IO_READ,		/**< read pending */
+	BUF_IO_WRITE,		/**< write pending */
+	BUF_IO_PIN		/**< disallow relocation of the
+				block and its removal from
+				the flush_list */
+};
+
+/** Alternatives for srv_checksum_algorithm, which can be changed by
+setting innodb_checksum_algorithm */
+enum srv_checksum_algorithm_t {
+ SRV_CHECKSUM_ALGORITHM_CRC32, /*!< Write crc32, allow crc32,
+ innodb or none when reading */
+ SRV_CHECKSUM_ALGORITHM_STRICT_CRC32, /*!< Write crc32, allow crc32
+ when reading */
+ SRV_CHECKSUM_ALGORITHM_INNODB, /*!< Write innodb, allow crc32,
+ innodb or none when reading */
+ SRV_CHECKSUM_ALGORITHM_STRICT_INNODB, /*!< Write innodb, allow
+ innodb when reading */
+ SRV_CHECKSUM_ALGORITHM_NONE, /*!< Write none, allow crc32,
+ innodb or none when reading */
+ SRV_CHECKSUM_ALGORITHM_STRICT_NONE, /*!< Write none, allow none
+ when reading */
+
+ /** For new files, always compute CRC-32C for the whole page.
+ For old files, allow crc32, innodb or none when reading. */
+ SRV_CHECKSUM_ALGORITHM_FULL_CRC32,
+
+ /** For new files, always compute CRC-32C for the whole page.
+ For old files, allow crc32 when reading. */
+ SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
+};
+
+inline
+bool
+is_checksum_strict(srv_checksum_algorithm_t algo)
+{
+ return(algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32
+ || algo == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB
+ || algo == SRV_CHECKSUM_ALGORITHM_STRICT_NONE);
+}
+
+inline
+bool
+is_checksum_strict(ulint algo)
+{
+ return(algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32
+ || algo == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB
+ || algo == SRV_CHECKSUM_ALGORITHM_STRICT_NONE);
+}
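+
+/* Illustrative example: only the three legacy STRICT_* settings satisfy
+this predicate; the non-strict settings tolerate pages written with another
+algorithm, and the FULL_CRC32 variants are not covered by it at all.
+
+	ut_ad(!is_checksum_strict(SRV_CHECKSUM_ALGORITHM_CRC32));
+	ut_ad(is_checksum_strict(SRV_CHECKSUM_ALGORITHM_STRICT_CRC32));
+	ut_ad(!is_checksum_strict(SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32));
+*/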
+
+/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
+/* @{ */
+/** Zip shift value for the smallest page size */
+#define BUF_BUDDY_LOW_SHIFT UNIV_ZIP_SIZE_SHIFT_MIN
+
+/** Smallest buddy page size */
+#define BUF_BUDDY_LOW (1U << BUF_BUDDY_LOW_SHIFT)
+
+/** Actual number of buddy sizes based on current page size */
+#define BUF_BUDDY_SIZES (srv_page_size_shift - BUF_BUDDY_LOW_SHIFT)
+
+/** Maximum number of buddy sizes based on the max page size */
+#define BUF_BUDDY_SIZES_MAX (UNIV_PAGE_SIZE_SHIFT_MAX \
+ - BUF_BUDDY_LOW_SHIFT)
+
+/** twice the maximum block size of the buddy system;
+the underlying memory is aligned by this amount:
+this must be equal to srv_page_size */
+#define BUF_BUDDY_HIGH (BUF_BUDDY_LOW << BUF_BUDDY_SIZES)
+/* @} */
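+
+/* Worked example (assuming UNIV_ZIP_SIZE_SHIFT_MIN == 10 and the default
+16KiB page size, i.e. srv_page_size_shift == 14):
+	BUF_BUDDY_LOW	= 1 << 10 = 1024 bytes (smallest buddy block)
+	BUF_BUDDY_SIZES	= 14 - 10 = 4 (block sizes 1K, 2K, 4K, 8K)
+	BUF_BUDDY_HIGH	= 1024 << 4 = 16384 = srv_page_size
+*/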
+
+/** Page identifier. */
+class page_id_t
+{
+public:
+ /** Constructor from (space, page_no).
+ @param[in] space tablespace id
+ @param[in] page_no page number */
+ page_id_t(ulint space, uint32_t page_no) : m_id(uint64_t{space} << 32 | page_no)
+ {
+ ut_ad(space <= 0xFFFFFFFFU);
+ }
+
+ page_id_t(uint64_t id) : m_id(id) {}
+ bool operator==(const page_id_t& rhs) const { return m_id == rhs.m_id; }
+ bool operator!=(const page_id_t& rhs) const { return m_id != rhs.m_id; }
+ bool operator<(const page_id_t& rhs) const { return m_id < rhs.m_id; }
+ bool operator>(const page_id_t& rhs) const { return m_id > rhs.m_id; }
+ bool operator<=(const page_id_t& rhs) const { return m_id <= rhs.m_id; }
+ bool operator>=(const page_id_t& rhs) const { return m_id >= rhs.m_id; }
+ page_id_t &operator--() { ut_ad(page_no()); m_id--; return *this; }
+ page_id_t &operator++()
+ {
+ ut_ad(page_no() < 0xFFFFFFFFU);
+ m_id++;
+ return *this;
+ }
+ page_id_t operator-(uint32_t i) const
+ {
+ ut_ad(page_no() >= i);
+ return page_id_t(m_id - i);
+ }
+ page_id_t operator+(uint32_t i) const
+ {
+ ut_ad(page_no() < ~i);
+ return page_id_t(m_id + i);
+ }
+
+ /** Retrieve the tablespace id.
+ @return tablespace id */
+ uint32_t space() const { return static_cast<uint32_t>(m_id >> 32); }
+
+ /** Retrieve the page number.
+ @return page number */
+ uint32_t page_no() const { return static_cast<uint32_t>(m_id); }
+
+ /** Retrieve the fold value.
+ @return fold value */
+ ulint fold() const { return (space() << 20) + space() + page_no(); }
+
+ /** Reset the page number only.
+ @param[in] page_no page number */
+ void set_page_no(uint32_t page_no)
+ {
+ m_id= (m_id & ~uint64_t{0} << 32) | page_no;
+ }
+
+ ulonglong raw() { return m_id; }
+private:
+ /** The page identifier */
+ uint64_t m_id;
+};
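+
+/* Illustrative example: the identifier is a single 64-bit key with the
+tablespace id in the high 32 bits and the page number in the low 32 bits,
+so the comparison operators above order first by space, then by page number:
+
+	page_id_t	id(5, 100);		// space 5, page 100
+	ut_ad(id.space() == 5 && id.page_no() == 100);
+	ut_ad(page_id_t(5, 101) > id);		// same space, later page
+	ut_ad(page_id_t(6, 0) > id);		// higher space dominates
+*/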
+
+/** A 64KiB buffer of NUL bytes, for use in assertions and checks,
+and dummy default values of instantly dropped columns.
+Initially, BLOB field references are set to NUL bytes, in
+dtuple_convert_big_rec(). */
+extern const byte *field_ref_zero;
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "ut0mutex.h"
+#include "sync0rw.h"
+#include "rw_lock.h"
+
+class page_hash_latch : public rw_lock
+{
+public:
+ /** Wait for a shared lock */
+ void read_lock_wait();
+ /** Wait for an exclusive lock */
+ void write_lock_wait();
+
+ /** Acquire a shared lock */
+ inline void read_lock();
+ /** Acquire an exclusive lock */
+ inline void write_lock();
+
+ /** Acquire a lock */
+ template<bool exclusive> void acquire()
+ {
+ if (exclusive)
+ write_lock();
+ else
+ read_lock();
+ }
+ /** Release a lock */
+ template<bool exclusive> void release()
+ {
+ if (exclusive)
+ write_unlock();
+ else
+ read_unlock();
+ }
+};
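+
+/* Illustrative sketch (hypothetical helper): the template form selects the
+lock mode at compile time, which is convenient in code parameterized on
+exclusivity:
+
+	template<bool exclusive> void with_latch(page_hash_latch& l)
+	{
+		l.acquire<exclusive>();
+		// ... access the protected hash cell ...
+		l.release<exclusive>();
+	}
+*/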
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif /* buf0types.h */
diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h
new file mode 100644
index 00000000..fc774b6e
--- /dev/null
+++ b/storage/innobase/include/data0data.h
@@ -0,0 +1,710 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, 2020 MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.h
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0data_h
+#define data0data_h
+
+#include "data0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "dict0types.h"
+#include "btr0types.h"
+#include <vector>
+
+#include <ostream>
+
+/** Storage for overflow data in a big record, that is, a clustered
+index record which needs external storage of data fields */
+struct big_rec_t;
+struct upd_t;
+
+/** Dummy variable to catch access to uninitialized fields. In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+ut_d(extern byte data_error);
+
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+ dfield_t* field, /*!< in: SQL data field */
+ const dtype_t* type); /*!< in: pointer to data type struct */
+
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+ dfield_t* field, /*!< in: field */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+ MY_ATTRIBUTE((nonnull));
+
+/** Gets spatial status for "external storage"
+@param[in]	field	field
+@return spatial status */
+UNIV_INLINE
+spatial_status_t
+dfield_get_spatial_status(
+ const dfield_t* field);
+
+/** Sets spatial status for "external storage"
+@param[in,out] field field
+@param[in] spatial_status spatial status */
+UNIV_INLINE
+void
+dfield_set_spatial_status(
+ dfield_t* field,
+ spatial_status_t spatial_status);
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /*!< in: field */
+ const void* data, /*!< in: data */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+ MY_ATTRIBUTE((nonnull(1)));
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_write_mbr(
+/*=============*/
+ dfield_t* field, /*!< in: field */
+ const double* mbr) /*!< in: data */
+ MY_ATTRIBUTE((nonnull(1)));
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+ dfield_t* field) /*!< in/out: field */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /*!< in: pointer to a buffer of size len */
+ ulint len) /*!< in: SQL null size in bytes */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2); /*!< in: field to copy from */
+
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+ dfield_t* field, /*!< in/out: data field */
+ mem_heap_t* heap) /*!< in: memory heap where allocated */
+ MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Tests if two data fields are equal.
+If len==0, tests the data length and content for equality.
+If len>0, tests the first len bytes of the content for equality.
+@return TRUE if both fields are NULL or if they are equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+ const dfield_t* field1, /*!< in: field */
+ const dfield_t* field2, /*!< in: field */
+ ulint len) /*!< in: maximum prefix to compare,
+ or 0 to compare the whole field length */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+ const dfield_t* field, /*!< in: field */
+ ulint len, /*!< in: data length or UNIV_SQL_NULL */
+ const byte* data) /*!< in: data */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint info_bits) /*!< in: info bits */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields_cmp) /*!< in: number of fields used in
+ comparisons in rem0cmp.* */
+ MY_ATTRIBUTE((nonnull));
+
+/* Estimate the number of bytes that are going to be allocated when
+creating a new dtuple_t object */
+#define DTUPLE_EST_ALLOC(n_fields) \
+ (sizeof(dtuple_t) + (n_fields) * sizeof(dfield_t))
+
+/** Creates a data tuple from an already allocated chunk of memory.
+The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields).
+The default value for number of fields used in record comparisons
+for this tuple is n_fields.
+@param[in,out] buf buffer to use
+@param[in] buf_size buffer size
+@param[in]	n_fields	number of fields
+@param[in] n_v_fields number of fields on virtual columns
+@return created tuple (inside buf) */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_from_mem(
+ void* buf,
+ ulint buf_size,
+ ulint n_fields,
+ ulint n_v_fields)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
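+
+/* Illustrative sketch (hypothetical caller): because the chunk only needs
+to be DTUPLE_EST_ALLOC() bytes for the total field count, a small tuple can
+be built in a stack buffer instead of a memory heap:
+
+	byte		buf[DTUPLE_EST_ALLOC(2)];
+	dtuple_t*	tuple = dtuple_create_from_mem(buf, sizeof buf, 2, 0);
+*/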
+
+/**********************************************************//**
+Creates a data tuple to a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ mem_heap_t* heap, /*!< in: memory heap where the tuple
+ is created, DTUPLE_EST_ALLOC(n_fields)
+ bytes will be allocated from this heap */
+ ulint n_fields)/*!< in: number of fields */
+ MY_ATTRIBUTE((nonnull, malloc));
+
+/** Initialize the virtual field data in a dtuple_t
+@param[in,out]	vrow	dtuple containing the virtual fields */
+UNIV_INLINE void dtuple_init_v_fld(dtuple_t* vrow);
+
+/** Duplicate the virtual field data in a dtuple_t
+@param[in,out]	vrow	dtuple containing the virtual fields
+@param[in] heap heap memory to use */
+UNIV_INLINE void dtuple_dup_v_fld(dtuple_t* vrow, mem_heap_t* heap);
+
+/** Creates a data tuple with possible virtual columns in a memory heap.
+@param[in] heap memory heap where the tuple is created
+@param[in] n_fields number of fields
+@param[in] n_v_fields number of fields on virtual col
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_with_vcol(
+ mem_heap_t* heap,
+ ulint n_fields,
+ ulint n_v_fields);
+
+/*********************************************************************//**
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you later want to set it smaller, you can use this. */
+void
+dtuple_set_n_fields(
+/*================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields) /*!< in: number of fields */
+ MY_ATTRIBUTE((nonnull));
+/** Copies a data tuple's virtual fields to another. This is a shallow copy.
+@param[in,out] d_tuple destination tuple
+@param[in] s_tuple source tuple */
+UNIV_INLINE
+void
+dtuple_copy_v_fields(
+ dtuple_t* d_tuple,
+ const dtuple_t* s_tuple);
+/*********************************************************************//**
+Copies a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+ const dtuple_t* tuple, /*!< in: tuple to copy from */
+ mem_heap_t* heap) /*!< in: memory heap
+ where the tuple is created */
+ MY_ATTRIBUTE((nonnull, malloc));
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted.
+@return sum of data lengths */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: typed data tuple */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ MY_ATTRIBUTE((nonnull));
+/** Compare two data tuples.
+@param[in] tuple1 first data tuple
+@param[in] tuple2 second data tuple
+@return positive, 0, negative if tuple1 is greater, equal, less, than tuple2,
+respectively */
+int
+dtuple_coll_cmp(
+ const dtuple_t* tuple1,
+ const dtuple_t* tuple2)
+ MY_ATTRIBUTE((warn_unused_result));
+/** Fold a prefix given as the number of fields of a tuple.
+@param[in] tuple index record
+@param[in] n_fields number of complete fields to fold
+@param[in] n_bytes number of bytes to fold in the last field
+@param[in]	tree_id		index tree ID
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+ const dtuple_t* tuple,
+ ulint n_fields,
+ ulint n_bytes,
+ index_id_t tree_id)
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************************//**
+Sets the types of fields in a tuple to binary. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+ dtuple_t* tuple, /*!< in: data tuple */
+ ulint n) /*!< in: number of fields to set */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: dtuple */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data field is typed. Asserts an error if not.
+@return TRUE if ok */
+ibool
+dfield_check_typed(
+/*===============*/
+ const dfield_t* field) /*!< in: data field */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data tuple is typed. Asserts an error if not.
+@return TRUE if ok */
+ibool
+dtuple_check_typed(
+/*===============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Validates the consistency of a tuple which must be complete, i.e.,
+all fields must have been set.
+@return TRUE if ok */
+ibool
+dtuple_validate(
+/*============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. */
+void
+dfield_print(
+/*=========*/
+ const dfield_t* dfield) /*!< in: dfield */
+ MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. The hex string is
+also printed if the string contains non-printable characters. */
+void
+dfield_print_also_hex(
+/*==================*/
+ const dfield_t* dfield) /*!< in: dfield */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************//**
+The following function prints the contents of a tuple. */
+void
+dtuple_print(
+/*=========*/
+ FILE* f, /*!< in: output stream */
+ const dtuple_t* tuple) /*!< in: tuple */
+ MY_ATTRIBUTE((nonnull));
+
+/** Print the contents of a tuple.
+@param[out] o output stream
+@param[in] field array of data fields
+@param[in] n number of data fields */
+void
+dfield_print(
+ std::ostream& o,
+ const dfield_t* field,
+ ulint n);
+/** Print the contents of a tuple.
+@param[out] o output stream
+@param[in] tuple data tuple */
+void
+dtuple_print(
+ std::ostream& o,
+ const dtuple_t* tuple);
+
+/** Print the contents of a tuple.
+@param[out] o output stream
+@param[in] tuple data tuple */
+inline
+std::ostream&
+operator<<(std::ostream& o, const dtuple_t& tuple)
+{
+ dtuple_print(o, &tuple);
+ return(o);
+}
+
+/**************************************************************//**
+Moves parts of long fields in entry to the big record vector so that
+the size of tuple drops below the maximum record size allowed in the
+database. Moves data only from those fields which are not necessary
+to determine uniquely the insertion place of the tuple in the index.
+@return own: created big record vector, NULL if we are not able to
+shorten the entry enough, i.e., if there are too many fixed-length or
+short fields in entry or the index is clustered */
+big_rec_t*
+dtuple_convert_big_rec(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ upd_t* upd, /*!< in/out: update vector */
+ dtuple_t* entry, /*!< in/out: index entry */
+ ulint* n_ext) /*!< in/out: number of
+ externally stored columns */
+ MY_ATTRIBUTE((malloc, warn_unused_result));
+/**************************************************************//**
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: entry whose data was put to vector */
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+ MY_ATTRIBUTE((nonnull));
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+ MY_ATTRIBUTE((nonnull));
+
+/*######################################################################*/
+
+/** Structure for an SQL data field */
+struct dfield_t{
+ void* data; /*!< pointer to data */
+ unsigned ext:1; /*!< TRUE=externally stored, FALSE=local */
+ unsigned spatial_status:2;
+ /*!< spatial status of externally stored field
+ in undo log for purge */
+ unsigned len; /*!< data length; UNIV_SQL_NULL if SQL null */
+ dtype_t type; /*!< type of data */
+
+ /** Create a deep copy of this object.
+ @param[in,out] heap memory heap in which the clone will be created
+ @return the cloned object */
+ dfield_t* clone(mem_heap_t* heap) const;
+
+	/** @return whether the system field indicates a history row */
+ bool vers_history_row() const
+ {
+ ut_ad(type.vers_sys_end());
+ if (type.mtype == DATA_FIXBINARY) {
+ ut_ad(len == sizeof timestamp_max_bytes);
+ return 0 != memcmp(data, timestamp_max_bytes, len);
+ } else {
+ ut_ad(type.mtype == DATA_INT);
+ ut_ad(len == sizeof trx_id_max_bytes);
+ return 0 != memcmp(data, trx_id_max_bytes, len);
+ }
+ }
+};
+
+/** Structure for an SQL data tuple of fields (logical record) */
+struct dtuple_t {
+ ulint info_bits; /*!< info bits of an index record:
+ the default is 0; this field is used
+ if an index record is built from
+ a data tuple */
+ ulint n_fields; /*!< number of fields in dtuple */
+ ulint n_fields_cmp; /*!< number of fields which should
+ be used in comparison services
+ of rem0cmp.*; the index search
+ is performed by comparing only these
+ fields, others are ignored; the
+ default value in dtuple creation is
+ the same value as n_fields */
+ dfield_t* fields; /*!< fields */
+ ulint n_v_fields; /*!< number of virtual fields */
+ dfield_t* v_fields; /*!< fields on virtual column */
+#ifdef UNIV_DEBUG
+ ulint magic_n; /*!< magic number, used in
+ debug assertions */
+/** Value of dtuple_t::magic_n */
+# define DATA_TUPLE_MAGIC_N 65478679
+#endif /* UNIV_DEBUG */
+
+ /** Trim the tail of an index tuple before insert or update.
+ After instant ADD COLUMN, if the last fields of a clustered index tuple
+ match the default values that were explicitly specified or implied
+ during ADD COLUMN, there will be no need to store them.
+ NOTE: A page latch in the index must be held, so that the index
+ may not lose 'instantness' before the trimmed tuple has been
+ inserted or updated.
+ @param[in] index index possibly with instantly added columns */
+ void trim(const dict_index_t& index);
+
+ bool vers_history_row() const
+ {
+ for (ulint i = 0; i < n_fields; i++) {
+ const dfield_t* field = &fields[i];
+ if (field->type.vers_sys_end()) {
+ return field->vers_history_row();
+ }
+ }
+ return false;
+ }
+
+ /**
+ @param info_bits the info_bits of a data tuple
+	@return whether this is a hidden metadata record
+	for instant ALTER TABLE (not only ADD COLUMN) */
+ static bool is_alter_metadata(ulint info_bits)
+ {
+ return UNIV_UNLIKELY(info_bits == REC_INFO_METADATA_ALTER);
+ }
+
+ /**
+ @param info_bits the info_bits of a data tuple
+ @return whether this is a hidden metadata record
+ for instant ADD COLUMN or ALTER TABLE */
+ static bool is_metadata(ulint info_bits)
+ {
+ return UNIV_UNLIKELY((info_bits & ~REC_INFO_DELETED_FLAG)
+ == REC_INFO_METADATA_ADD);
+ }
+
+ /** @return whether this is a hidden metadata record
+ for instant ALTER TABLE (not only ADD COLUMN) */
+ bool is_alter_metadata() const { return is_alter_metadata(info_bits); }
+
+ /** @return whether this is a hidden metadata record
+ for instant ADD COLUMN or ALTER TABLE */
+ bool is_metadata() const { return is_metadata(info_bits); }
+};
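+
+/* Illustrative note (assuming REC_INFO_METADATA_ALTER ==
+REC_INFO_METADATA_ADD | REC_INFO_DELETED_FLAG, which the delete-mark masking
+above implies): is_metadata() accepts both metadata variants, while
+is_alter_metadata() accepts only the ALTER one.
+
+	ut_ad(dtuple_t::is_metadata(REC_INFO_METADATA_ADD));
+	ut_ad(dtuple_t::is_metadata(REC_INFO_METADATA_ALTER));
+	ut_ad(!dtuple_t::is_alter_metadata(REC_INFO_METADATA_ADD));
+*/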
+
+inline ulint dtuple_get_n_fields(const dtuple_t* tuple)
+{ return tuple->n_fields; }
+inline dtype_t* dfield_get_type(dfield_t* field) { return &field->type; }
+inline const dtype_t* dfield_get_type(const dfield_t* field)
+{ return &field->type; }
+inline void* dfield_get_data(dfield_t* field)
+{
+ ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error);
+ return field->data;
+}
+inline const void* dfield_get_data(const dfield_t* field)
+{
+ ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error);
+ return field->data;
+}
+inline ulint dfield_get_len(const dfield_t* field) {
+ ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error);
+ ut_ad(field->len != UNIV_SQL_DEFAULT);
+ return field->len;
+}
+inline bool dfield_is_null(const dfield_t* field)
+{ return field->len == UNIV_SQL_NULL; }
+/** @return whether a column is to be stored off-page */
+inline bool dfield_is_ext(const dfield_t* field)
+{
+ ut_ad(!field->ext || field->len >= BTR_EXTERN_FIELD_REF_SIZE);
+ return static_cast<bool>(field->ext);
+}
+/** Set the "external storage" flag */
+inline void dfield_set_ext(dfield_t* field) { field->ext = 1; }
+
+/** Gets number of virtual fields in a data tuple.
+@param[in] tuple dtuple to check
+@return number of fields */
+inline ulint
+dtuple_get_n_v_fields(const dtuple_t* tuple) { return tuple->n_v_fields; }
+
+inline const dfield_t* dtuple_get_nth_field(const dtuple_t* tuple, ulint n)
+{
+ ut_ad(n < tuple->n_fields);
+ return &tuple->fields[n];
+}
+inline dfield_t* dtuple_get_nth_field(dtuple_t* tuple, ulint n)
+{
+ ut_ad(n < tuple->n_fields);
+ return &tuple->fields[n];
+}
+
+/** Get a virtual column in a table row or an extended clustered index record.
+@param[in] tuple tuple
+@param[in]	n	the nth virtual field to get
+@return nth virtual field */
+inline const dfield_t* dtuple_get_nth_v_field(const dtuple_t* tuple, ulint n)
+{
+ ut_ad(n < tuple->n_v_fields);
+ return &tuple->v_fields[n];
+}
+/** Get a virtual column in a table row or an extended clustered index record.
+@param[in] tuple tuple
+@param[in]	n	the nth virtual field to get
+@return nth virtual field */
+inline dfield_t* dtuple_get_nth_v_field(dtuple_t* tuple, ulint n)
+{
+ ut_ad(n < tuple->n_v_fields);
+ return &tuple->v_fields[n];
+}
+
+/** A slot for a field in a big rec vector */
+struct big_rec_field_t {
+
+ /** Constructor.
+ @param[in] field_no_ the field number
+ @param[in] len_ the data length
+ @param[in] data_ the data */
+ big_rec_field_t(ulint field_no_, ulint len_, const void* data_)
+ : field_no(field_no_),
+ len(len_),
+ data(data_)
+ {}
+
+ ulint field_no; /*!< field number in record */
+ ulint len; /*!< stored data length, in bytes */
+ const void* data; /*!< stored data */
+};
+
+/** Storage format for overflow data in a big record, that is, a
+clustered index record which needs external storage of data fields */
+struct big_rec_t {
+ mem_heap_t* heap; /*!< memory heap from which
+ allocated */
+ const ulint capacity; /*!< fields array size */
+ ulint n_fields; /*!< number of stored fields */
+ big_rec_field_t*fields; /*!< stored fields */
+
+ /** Constructor.
+ @param[in] max the capacity of the array of fields. */
+ explicit big_rec_t(const ulint max)
+ : heap(0),
+ capacity(max),
+ n_fields(0),
+ fields(0)
+ {}
+
+ /** Append one big_rec_field_t object to the end of array of fields */
+ void append(const big_rec_field_t& field)
+ {
+ ut_ad(n_fields < capacity);
+ fields[n_fields] = field;
+ n_fields++;
+ }
+
+ /** Allocate a big_rec_t object in the given memory heap, and for
+ storing n_fld number of fields.
+ @param[in] heap memory heap in which this object is allocated
+ @param[in] n_fld maximum number of fields that can be stored in
+ this object
+ @return the allocated object */
+ static big_rec_t* alloc(
+ mem_heap_t* heap,
+ ulint n_fld);
+};
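+
+/* Illustrative sketch (hypothetical caller; heap, len and data are assumed
+to exist): allocate a vector sized for the fields it may receive, then
+append slots one by one; append() asserts that capacity is never exceeded.
+
+	big_rec_t*	vec = big_rec_t::alloc(heap, 3);
+	vec->append(big_rec_field_t(0, len, data));
+*/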
+
+#include "data0data.ic"
+
+#endif
diff --git a/storage/innobase/include/data0data.ic b/storage/innobase/include/data0data.ic
new file mode 100644
index 00000000..2d1bf5a2
--- /dev/null
+++ b/storage/innobase/include/data0data.ic
@@ -0,0 +1,633 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.ic
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0rnd.h"
+
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+ dfield_t* field, /*!< in: SQL data field */
+ const dtype_t* type) /*!< in: pointer to data type struct */
+{
+ ut_ad(field != NULL);
+ ut_ad(type != NULL);
+
+ field->type = *type;
+}
+
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+ dfield_t* field, /*!< in: field */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+{
+ ut_ad(len != UNIV_SQL_DEFAULT);
+ field->ext = 0;
+ field->len = static_cast<unsigned int>(len);
+}
+
+/** Gets spatial status for "external storage"
+@param[in]	field	field
+@return spatial status */
+UNIV_INLINE
+spatial_status_t
+dfield_get_spatial_status(
+ const dfield_t* field)
+{
+ ut_ad(dfield_is_ext(field));
+
+ return(static_cast<spatial_status_t>(field->spatial_status));
+}
+
+/** Sets spatial status for "external storage"
+@param[in,out] field field
+@param[in] spatial_status spatial status */
+UNIV_INLINE
+void
+dfield_set_spatial_status(
+ dfield_t* field,
+ spatial_status_t spatial_status)
+{
+ field->spatial_status = spatial_status & 3;
+ ut_ad(dfield_get_spatial_status(field) == spatial_status);
+}
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /*!< in: field */
+ const void* data, /*!< in: data */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+{
+ field->data = (void*) data;
+ field->ext = 0;
+ field->len = static_cast<unsigned int>(len);
+}
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_write_mbr(
+/*=============*/
+ dfield_t* field, /*!< in: field */
+ const double* mbr) /*!< in: data */
+{
+	MEM_CHECK_DEFINED(mbr, SPDIMS * 2 * sizeof *mbr);
+ field->ext = 0;
+
+ for (unsigned i = 0; i < SPDIMS * 2; i++) {
+ mach_double_write(static_cast<byte*>(field->data)
+ + i * sizeof(double), mbr[i]);
+ }
+
+ field->len = DATA_MBR_LEN;
+}
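+
+/* Illustrative example (assuming SPDIMS == 2 and the min/max coordinate
+order used by the GIS code; the values are hypothetical):
+
+	double	mbr[4] = {xmin, xmax, ymin, ymax};
+	dfield_write_mbr(field, mbr);	// field->len becomes DATA_MBR_LEN
+*/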
+
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+ dfield_t* field) /*!< in/out: field */
+{
+ dfield_set_data(field, NULL, UNIV_SQL_NULL);
+}
+
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ ut_ad(field1 != NULL);
+ ut_ad(field2 != NULL);
+
+ field1->data = field2->data;
+ field1->len = field2->len;
+ field1->ext = field2->ext;
+ field1->spatial_status = field2->spatial_status;
+}
+
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ *field1 = *field2;
+}
+
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+ dfield_t* field, /*!< in/out: data field */
+ mem_heap_t* heap) /*!< in: memory heap where allocated */
+{
+ if (!dfield_is_null(field)) {
+ MEM_CHECK_DEFINED(field->data, field->len);
+ field->data = mem_heap_dup(heap, field->data, field->len);
+ }
+}
+
+/*********************************************************************//**
+Tests if two data fields are equal.
+If len==0, tests the data length and content for equality.
+If len>0, tests the first len bytes of the content for equality.
+@return TRUE if both fields are NULL or if they are equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+ const dfield_t* field1, /*!< in: field */
+ const dfield_t* field2, /*!< in: field */
+ ulint len) /*!< in: maximum prefix to compare,
+ or 0 to compare the whole field length */
+{
+ ulint len2 = len;
+
+ if (field1->len == UNIV_SQL_NULL || len == 0 || field1->len < len) {
+ len = field1->len;
+ }
+
+ if (field2->len == UNIV_SQL_NULL || len2 == 0 || field2->len < len2) {
+ len2 = field2->len;
+ }
+
+ return(len == len2
+ && (len == UNIV_SQL_NULL
+ || !memcmp(field1->data, field2->data, len)));
+}
+
+/*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+ const dfield_t* field, /*!< in: field */
+ ulint len, /*!< in: data length or UNIV_SQL_NULL */
+ const byte* data) /*!< in: data */
+{
+ ut_ad(len != UNIV_SQL_DEFAULT);
+ return(len == dfield_get_len(field)
+ && (!len || len == UNIV_SQL_NULL
+ || !memcmp(dfield_get_data(field), data, len)));
+}
+
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ return(tuple->info_bits);
+}
+
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint info_bits) /*!< in: info bits */
+{
+ tuple->info_bits = info_bits;
+}
+
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ return(tuple->n_fields_cmp);
+}
+
+/*********************************************************************//**
+Sets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields_cmp) /*!< in: number of fields used in
+ comparisons in rem0cmp.* */
+{
+ ut_ad(n_fields_cmp <= tuple->n_fields);
+ tuple->n_fields_cmp = n_fields_cmp;
+}
+
+/** Creates a data tuple from an already allocated chunk of memory.
+The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields).
+The default value for number of fields used in record comparisons
+for this tuple is n_fields.
+@param[in,out] buf buffer to use
+@param[in] buf_size buffer size
+@param[in]	n_fields	number of fields
+@param[in] n_v_fields number of fields on virtual columns
+@return created tuple (inside buf) */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_from_mem(
+ void* buf,
+ ulint buf_size,
+ ulint n_fields,
+ ulint n_v_fields)
+{
+ dtuple_t* tuple;
+ ulint n_t_fields = n_fields + n_v_fields;
+
+ ut_a(buf_size >= DTUPLE_EST_ALLOC(n_t_fields));
+
+ tuple = (dtuple_t*) buf;
+ tuple->info_bits = 0;
+ tuple->n_fields = n_fields;
+ tuple->n_v_fields = n_v_fields;
+ tuple->n_fields_cmp = n_fields;
+ tuple->fields = (dfield_t*) &tuple[1];
+ if (n_v_fields > 0) {
+ tuple->v_fields = &tuple->fields[n_fields];
+ } else {
+ tuple->v_fields = NULL;
+ }
+
+#ifdef UNIV_DEBUG
+ tuple->magic_n = DATA_TUPLE_MAGIC_N;
+
+ { /* In the debug version, initialize fields to an error value */
+ ulint i;
+
+ for (i = 0; i < n_t_fields; i++) {
+ dfield_t* field;
+
+ if (i >= n_fields) {
+ field = dtuple_get_nth_v_field(
+ tuple, i - n_fields);
+ } else {
+ field = dtuple_get_nth_field(tuple, i);
+ }
+
+ dfield_set_len(field, UNIV_SQL_NULL);
+ field->data = &data_error;
+ dfield_get_type(field)->mtype = DATA_ERROR;
+ dfield_get_type(field)->prtype = DATA_ERROR;
+ }
+ }
+#endif
+ MEM_CHECK_ADDRESSABLE(tuple->fields, n_t_fields
+ * sizeof *tuple->fields);
+ MEM_UNDEFINED(tuple->fields, n_t_fields * sizeof *tuple->fields);
+ return(tuple);
+}
+
+/** Duplicate the virtual field data in a dtuple_t
+@param[in,out]	vrow	dtuple containing the virtual fields
+@param[in,out] heap heap memory to use */
+UNIV_INLINE
+void
+dtuple_dup_v_fld(dtuple_t* vrow, mem_heap_t* heap)
+{
+ for (ulint i = 0; i < vrow->n_v_fields; i++) {
+ dfield_t* dfield = dtuple_get_nth_v_field(vrow, i);
+ dfield_dup(dfield, heap);
+ }
+}
+
+/** Initialize the virtual field data in a dtuple_t
+@param[in,out]	vrow	dtuple containing the virtual fields */
+UNIV_INLINE
+void
+dtuple_init_v_fld(dtuple_t* vrow)
+{
+ for (ulint i = 0; i < vrow->n_v_fields; i++) {
+ dfield_t* dfield = dtuple_get_nth_v_field(vrow, i);
+ dfield_get_type(dfield)->mtype = DATA_MISSING;
+ dfield_set_len(dfield, UNIV_SQL_NULL);
+ }
+}
+
+/**********************************************************//**
+Creates a data tuple to a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ mem_heap_t* heap, /*!< in: memory heap where the tuple
+ is created, DTUPLE_EST_ALLOC(n_fields)
+ bytes will be allocated from this heap */
+ ulint n_fields) /*!< in: number of fields */
+{
+ return(dtuple_create_with_vcol(heap, n_fields, 0));
+}
+
+/** Creates a data tuple with virtual columns in a memory heap.
+@param[in] heap memory heap where the tuple is created
+@param[in] n_fields number of fields
+@param[in] n_v_fields number of fields on virtual col
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_with_vcol(
+ mem_heap_t* heap,
+ ulint n_fields,
+ ulint n_v_fields)
+{
+ void* buf;
+ ulint buf_size;
+ dtuple_t* tuple;
+
+ ut_ad(heap);
+
+ buf_size = DTUPLE_EST_ALLOC(n_fields + n_v_fields);
+ buf = mem_heap_alloc(heap, buf_size);
+
+ tuple = dtuple_create_from_mem(buf, buf_size, n_fields, n_v_fields);
+
+ return(tuple);
+}
+
+/** Copies a data tuple's virtual fields to another. This is a shallow copy.
+@param[in,out] d_tuple destination tuple
+@param[in] s_tuple source tuple */
+UNIV_INLINE
+void
+dtuple_copy_v_fields(
+ dtuple_t* d_tuple,
+ const dtuple_t* s_tuple)
+{
+
+ ulint n_v_fields = dtuple_get_n_v_fields(d_tuple);
+ ut_ad(n_v_fields == dtuple_get_n_v_fields(s_tuple));
+
+ for (ulint i = 0; i < n_v_fields; i++) {
+ dfield_copy(dtuple_get_nth_v_field(d_tuple, i),
+ dtuple_get_nth_v_field(s_tuple, i));
+ }
+}
+
+/*********************************************************************//**
+Copies a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+ const dtuple_t* tuple, /*!< in: tuple to copy from */
+ mem_heap_t* heap) /*!< in: memory heap
+ where the tuple is created */
+{
+ ulint n_fields = dtuple_get_n_fields(tuple);
+ ulint n_v_fields = dtuple_get_n_v_fields(tuple);
+ dtuple_t* new_tuple = dtuple_create_with_vcol(
+ heap, n_fields, n_v_fields);
+ ulint i;
+
+ for (i = 0; i < n_fields; i++) {
+ dfield_copy(dtuple_get_nth_field(new_tuple, i),
+ dtuple_get_nth_field(tuple, i));
+ }
+
+ for (i = 0; i < n_v_fields; i++) {
+ dfield_copy(dtuple_get_nth_v_field(new_tuple, i),
+ dtuple_get_nth_v_field(tuple, i));
+ }
+
+ return(new_tuple);
+}
+
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted. Nor is
+the space possibly occupied by externally stored parts of the field.
+@return sum of data lengths */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: typed data tuple */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ const dfield_t* field;
+ ulint n_fields;
+ ulint len;
+ ulint i;
+ ulint sum = 0;
+
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+ n_fields = tuple->n_fields;
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+ len = dfield_get_len(field);
+
+ if (len == UNIV_SQL_NULL) {
+ len = dtype_get_sql_null_size(dfield_get_type(field),
+ comp);
+ }
+
+ sum += len;
+ }
+
+ return(sum);
+}
+
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ulint n_ext = 0;
+ ulint n_fields = tuple->n_fields;
+ ulint i;
+
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+ for (i = 0; i < n_fields; i++) {
+ n_ext += dtuple_get_nth_field(tuple, i)->ext;
+ }
+
+ return(n_ext);
+}
+
+/*******************************************************************//**
+Sets the types of fields in a tuple to binary. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+ dtuple_t* tuple, /*!< in: data tuple */
+ ulint n) /*!< in: number of fields to set */
+{
+ dtype_t* dfield_type;
+ ulint i;
+
+ for (i = 0; i < n; i++) {
+ dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i));
+ dtype_set(dfield_type, DATA_BINARY, 0, 0);
+ }
+}
+
+/** Fold a prefix given as the number of fields of a tuple.
+@param[in] tuple index record
+@param[in] n_fields number of complete fields to fold
+@param[in] n_bytes number of bytes to fold in the last field
+@param[in]	tree_id		index tree ID
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+ const dtuple_t* tuple,
+ ulint n_fields,
+ ulint n_bytes,
+ index_id_t tree_id)
+{
+ const dfield_t* field;
+ ulint i;
+ const byte* data;
+ ulint len;
+ ulint fold;
+
+ ut_ad(tuple);
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(dtuple_check_typed(tuple));
+
+ fold = ut_fold_ull(tree_id);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ if (n_bytes > 0) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len > n_bytes) {
+ len = n_bytes;
+ }
+
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ return(fold);
+}
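+
+/* Illustrative example (hypothetical caller): fold the first two complete
+fields of a search tuple plus the first 4 bytes of the third field, the
+kind of prefix fold used for the adaptive hash index:
+
+	ulint	fold = dtuple_fold(tuple, 2, 4, index->id);
+*/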
+
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /*!< in: pointer to a buffer of size len */
+ ulint len) /*!< in: SQL null size in bytes */
+{
+ memset(data, 0, len);
+}
+
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: dtuple */
+{
+ ulint n;
+ ulint i;
+
+ n = dtuple_get_n_fields(tuple);
+
+ for (i = 0; i < n; i++) {
+ if (dfield_is_null(dtuple_get_nth_field(tuple, i))) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+{
+ mem_heap_free(vector->heap);
+}
diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h
new file mode 100644
index 00000000..750c3534
--- /dev/null
+++ b/storage/innobase/include/data0type.h
@@ -0,0 +1,606 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.h
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef data0type_h
+#define data0type_h
+
+#include "univ.i"
+
+/** Special length indicating a missing instantly added column */
+#define UNIV_SQL_DEFAULT (UNIV_SQL_NULL - 1)
+
+/** @return whether a length is actually stored in a field */
+#define len_is_stored(len) (len != UNIV_SQL_NULL && len != UNIV_SQL_DEFAULT)
+
+extern ulint data_mysql_default_charset_coll;
+#define DATA_MYSQL_BINARY_CHARSET_COLL 63
+
+/* SQL data type struct */
+struct dtype_t;
+
+/** SQL Like operator comparison types */
+enum ib_like_t {
+ IB_LIKE_EXACT, /**< e.g. STRING */
+ IB_LIKE_PREFIX /**< e.g., STRING% */
+};
+
+/*-------------------------------------------*/
+/* The 'MAIN TYPE' of a column */
+#define DATA_MISSING 0 /* missing column */
+#define DATA_VARCHAR 1 /* character varying of the
+ latin1_swedish_ci charset-collation; note
+ that the MySQL format for this, DATA_BINARY,
+ DATA_VARMYSQL, is also affected by whether the
+ 'precise type' contains
+ DATA_MYSQL_TRUE_VARCHAR */
+#define DATA_CHAR 2 /* fixed length character of the
+ latin1_swedish_ci charset-collation */
+#define DATA_FIXBINARY 3 /* binary string of fixed length */
+#define DATA_BINARY 4 /* binary string */
+#define DATA_BLOB 5 /* binary large object, or a TEXT type;
+ if prtype & DATA_BINARY_TYPE == 0, then this is
+ actually a TEXT column (or a BLOB created
+ with < 4.0.14; since column prefix indexes
+ came only in 4.0.14, the missing flag in BLOBs
+ created before that does not cause any harm) */
+#define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */
+#define DATA_SYS_CHILD 7 /* address of the child page in node pointer */
+#define DATA_SYS 8 /* system column */
+
+/* Data types >= DATA_FLOAT must be compared using the whole field, not as
+binary strings */
+
+#define DATA_FLOAT 9
+#define DATA_DOUBLE 10
+#define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */
+#define DATA_VARMYSQL 12 /* any charset varying length char */
+#define DATA_MYSQL 13 /* any charset fixed length char */
+ /* NOTE that 4.1.1 used DATA_MYSQL and
+ DATA_VARMYSQL for all character sets, and the
+ charset-collation for tables created with it
+ can also be latin1_swedish_ci */
+
+/* DATA_GEOMETRY includes all standard geometry datatypes as described in
+OGC standard(point, line_string, polygon, multi_point, multi_polygon,
+multi_line_string, geometry_collection, geometry).
+Currently, geometry data is stored in the standard Well-Known Binary(WKB)
+format (http://www.opengeospatial.org/standards/sfa).
+We use BLOB as the underlying datatype. */
+#define DATA_GEOMETRY 14 /* geometry datatype of variable length */
+#define DATA_MTYPE_MAX 63 /* dtype_store_for_order_and_null_size()
+ requires the values are <= 63 */
+
+#define DATA_MTYPE_CURRENT_MIN DATA_VARCHAR /* minimum value of mtype */
+#define DATA_MTYPE_CURRENT_MAX DATA_GEOMETRY /* maximum value of mtype */
+/*-------------------------------------------*/
+/* The 'PRECISE TYPE' of a column */
+/*
+Tables created by a MySQL user have the following convention:
+
+- In the least significant byte in the precise type we store the MySQL type
+code (not applicable for system columns).
+
+- In the second least significant byte we OR flags DATA_NOT_NULL,
+DATA_UNSIGNED, DATA_BINARY_TYPE.
+
+- In the third least significant byte of the precise type of string types we
+store the MySQL charset-collation code. In DATA_BLOB columns created with
+< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there
+are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no
+problem, though.
+
+Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the
+precise type, since the charset was always the default charset of the MySQL
+installation. If the stored charset code is 0 in the system table SYS_COLUMNS
+of InnoDB, that means that the default charset of this MySQL installation
+should be used.
+
+When loading a table definition from the system tables to the InnoDB data
+dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check
+if the stored charset-collation is 0, and if that is the case and the type is
+a non-binary string, replace that 0 by the default charset-collation code of
+this MySQL installation. In short, in old tables, the charset-collation code
+in the system tables on disk can be 0, but in in-memory data structures
+(dtype_t), the charset-collation code is always != 0 for non-binary string
+types.
+
+In new tables, in binary string types, the charset-collation code is the
+MySQL code for the 'binary charset', that is, != 0.
+
+For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those
+DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci,
+InnoDB performs all comparisons internally, without resorting to the MySQL
+comparison functions. This is to save CPU time.
+
+InnoDB's own internal system tables have different precise types for their
+columns, and for them the precise type is usually not used at all.
+*/
+
+#define DATA_ENGLISH 4 /* English language character string: this
+ is a relic from pre-MySQL time and only used
+ for InnoDB's own system tables */
+#define DATA_ERROR 111 /* another relic from pre-MySQL time */
+
+#define DATA_MYSQL_TYPE_MASK	255U	/* AND with this mask to extract the MySQL
+ type from the precise type */
+#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3
+ format true VARCHAR */
+
+/* Precise data types for system columns and the length of those columns;
+NOTE: the values must run from 0 up in the order given! All codes must
+be less than 256 */
+#define DATA_ROW_ID 0 /* row id: a 48-bit integer */
+#define DATA_ROW_ID_LEN 6 /* stored length for row id */
+
+#define DATA_TRX_ID 1 /* transaction id: 6 bytes */
+#define DATA_TRX_ID_LEN 6
+
+#define DATA_ROLL_PTR 2 /* rollback data pointer: 7 bytes */
+#define DATA_ROLL_PTR_LEN 7
+
+#define DATA_N_SYS_COLS 3 /* number of system columns defined above */
+
+#define DATA_FTS_DOC_ID 3 /* Used as FTS DOC ID column */
+
+#define DATA_SYS_PRTYPE_MASK 0xFU /* mask to extract the above from prtype */
+
+/* Flags ORed to the precise data type */
+#define DATA_NOT_NULL 256U /* this is ORed to the precise type when
+ the column is declared as NOT NULL */
+#define DATA_UNSIGNED	512U	/* this is ORed to the precise type when
+ we have an unsigned integer type */
+#define DATA_BINARY_TYPE 1024U /* if the data type is a binary character
+ string, this is ORed to the precise type:
+ this only holds for tables created with
+ >= MySQL-4.0.14 */
+/* #define DATA_NONLATIN1 2048 This is a relic from < 4.1.2 and < 5.0.1.
+ In earlier versions this was set for some
+ BLOB columns.
+*/
+#define DATA_GIS_MBR 2048U /* Used as GIS MBR column */
+/** the size of a GIS maximum bounding rectangle */
+constexpr uint8_t DATA_MBR_LEN= uint8_t(SPDIMS * 2 * sizeof(double));
+
+#define DATA_LONG_TRUE_VARCHAR 4096U /* this is ORed to the precise data
+ type when the column is true VARCHAR where
+ MySQL uses 2 bytes to store the data len;
+ for shorter VARCHARs MySQL uses only 1 byte */
+#define DATA_VIRTUAL 8192U /* Virtual column */
+
+/** System Versioning */
+#define DATA_VERS_START 16384U /* start system field */
+#define DATA_VERS_END 32768U /* end system field */
+/** system-versioned user data column */
+#define DATA_VERSIONED (DATA_VERS_START|DATA_VERS_END)
+
+/** Check whether locking is disabled (never). */
+#define dict_table_is_locking_disabled(table) false
+
+/*-------------------------------------------*/
+
+/* We need this many bytes to store the type information affecting the
+alphabetical order of a single field, and to decide the storage size of
+an SQL null */
+#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4
+/* In the >= 4.1.x storage format we add 2 bytes more so that we can also
+store the charset-collation number; one byte is left unused, though */
+#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6
+
+/* Maximum multi-byte character length in bytes, plus 1 */
+#define DATA_MBMAX 8
+
+/* For checking if mtype is GEOMETRY datatype */
+#define DATA_GEOMETRY_MTYPE(mtype) ((mtype) == DATA_GEOMETRY)
+
+/* For checking if mtype is BLOB or GEOMETRY, since we use BLOB as
+the underlying datatype of GEOMETRY data. */
+#define DATA_LARGE_MTYPE(mtype) ((mtype) == DATA_BLOB \
+ || (mtype) == DATA_GEOMETRY)
+
+/* For checking if data type is big length data type. */
+#define DATA_BIG_LEN_MTYPE(len, mtype) ((len) > 255 || DATA_LARGE_MTYPE(mtype))
+
+/* For checking if the column is a big length column. */
+#define DATA_BIG_COL(col) DATA_BIG_LEN_MTYPE((col)->len, (col)->mtype)
+
+/* For checking if data type is large binary data type. */
+#define DATA_LARGE_BINARY(mtype,prtype) ((mtype) == DATA_GEOMETRY || \
+ ((mtype) == DATA_BLOB && !((prtype) & DATA_BINARY_TYPE)))
+
+/* We now support 15-bit (up to 32767) collation numbers */
+#define MAX_CHAR_COLL_NUM 32767
+
+/* Mask to get the Charset Collation number (0x7fff) */
+#define CHAR_COLL_MASK MAX_CHAR_COLL_NUM
+
+/*********************************************************************//**
+Gets the MySQL type code from a dtype.
+@return MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+ const dtype_t* type); /*!< in: type struct */
+/*********************************************************************//**
+Determine how many bytes the first n characters of the given string occupy.
+If the string is shorter than n characters, returns the number of bytes
+the characters in the string occupy.
+@return length of the prefix, in bytes */
+ulint
+dtype_get_at_most_n_mbchars(
+/*========================*/
+ ulint prtype, /*!< in: precise type */
+ ulint mbminlen, /*!< in: minimum length of
+ a multi-byte character, in bytes */
+ ulint mbmaxlen, /*!< in: maximum length of
+ a multi-byte character, in bytes */
+ ulint prefix_len, /*!< in: length of the requested
+ prefix, in characters, multiplied by
+ dtype_get_mbmaxlen(dtype) */
+ ulint data_len, /*!< in: length of str (in bytes) */
+ const char* str); /*!< in: the string whose prefix
+ length is being determined */
+/** @return whether main type is a string type */
+inline bool dtype_is_string_type(ulint mtype)
+{
+ return mtype <= DATA_BLOB
+ || mtype == DATA_MYSQL || mtype == DATA_VARMYSQL;
+}
+
+/** @return whether a type is a binary string type */
+inline bool dtype_is_binary_string_type(ulint mtype, ulint prtype)
+{
+ /* Note that for tables created before MySQL 4.0.14,
+ we do not know if a DATA_BLOB column is a BLOB or a TEXT column.
+ For those DATA_BLOB columns we return false. */
+
+ return mtype == DATA_FIXBINARY || mtype == DATA_BINARY
+ || (mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE));
+}
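+/* Example (added illustration, not original source): BLOB and TEXT columns
+share mtype DATA_BLOB and are told apart only by the DATA_BINARY_TYPE flag
+in prtype:
+
+   ut_ad(dtype_is_binary_string_type(DATA_BLOB, DATA_BINARY_TYPE));
+   ut_ad(!dtype_is_binary_string_type(DATA_BLOB, 0));  // a TEXT column
+*/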
+
+/** @return whether a type is a non-binary string type */
+inline bool dtype_is_non_binary_string_type(ulint mtype, ulint prtype)
+{
+ return dtype_is_string_type(mtype)
+ && !dtype_is_binary_string_type(mtype, prtype);
+}
+
+/*********************************************************************//**
+Sets a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /*!< in: type struct to init */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint len); /*!< in: precision of type */
+/*********************************************************************//**
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /*!< in: type struct to copy to */
+ const dtype_t* type2); /*!< in: type struct to copy from */
+/*********************************************************************//**
+Gets the SQL main data type.
+@return SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ const dtype_t* type); /*!< in: data type */
+/*********************************************************************//**
+Gets the precise data type.
+@return precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ const dtype_t* type); /*!< in: data type */
+
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+UNIV_INLINE
+void
+dtype_get_mblen(
+/*============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type (and collation) */
+ unsigned* mbminlen, /*!< out: minimum length of a
+ multi-byte character */
+ unsigned* mbmaxlen); /*!< out: maximum length of a
+ multi-byte character */
+/**
+Get the charset-collation code for string types.
+@param prtype InnoDB precise type
+@return charset-collation code */
+inline uint16_t dtype_get_charset_coll(ulint prtype)
+{
+ return static_cast<uint16_t>(prtype >> 16) & CHAR_COLL_MASK;
+}
+
+/** Form a precise type from the < 4.1.2 format precise type plus the
+charset-collation code.
+@param[in] old_prtype MySQL type code and the flags
+ DATA_BINARY_TYPE etc.
+@param[in] charset_coll character-set collation code
+@return precise type, including the charset-collation code */
+UNIV_INLINE
+uint32_t
+dtype_form_prtype(ulint old_prtype, ulint charset_coll)
+{
+ ut_ad(old_prtype < 256 * 256);
+ ut_ad(charset_coll <= MAX_CHAR_COLL_NUM);
+ return(uint32_t(old_prtype + (charset_coll << 16)));
+}
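+/* Sketch (not original source): dtype_form_prtype() and
+dtype_get_charset_coll() invert each other on the collation bits:
+
+   ulint old = DATA_MYSQL_TRUE_VARCHAR | DATA_NOT_NULL;  // < 4.1.2 format
+   uint32_t prtype = dtype_form_prtype(old, 63);         // 63 = binary charset
+   ut_ad(dtype_get_charset_coll(prtype) == 63);
+   ut_ad((prtype & 0xFFFF) == old);
+*/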
+
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8. This function
+may return false negatives, in case further character-set collation
+codes are introduced in MySQL later.
+@return whether a subset of UTF-8 */
+UNIV_INLINE
+bool
+dtype_is_utf8(
+/*==========*/
+ ulint prtype);/*!< in: precise data type */
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type); /*!< in: data type */
+
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a
+ multibyte character, in bytes */
+ ulint mbmaxlen, /*!< in: maximum length of a
+ multibyte character, in bytes */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a character */
+ ulint mbmaxlen); /*!< in: maximum length of a character */
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint len); /*!< in: length */
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+ const dtype_t* type, /*!< in: type */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf); /*!< in: buffer for the stored order info */
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /*!< in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ bytes where we store the info */
+ const dtype_t* type, /*!< in: type struct */
+ ulint prefix_len);/*!< in: prefix length to
+ replace type->len, or 0 */
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf); /*!< in: buffer for stored type order info */
+
+/*********************************************************************//**
+Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype,prtype,len
+@return the SQL type name */
+UNIV_INLINE
+char*
+dtype_sql_name(
+/*===========*/
+ unsigned mtype, /*!< in: mtype */
+ unsigned prtype, /*!< in: prtype */
+ unsigned len, /*!< in: len */
+ char* name, /*!< out: SQL name */
+ unsigned name_sz);/*!< in: size of the name buffer */
+
+/*********************************************************************//**
+Validates a data type structure.
+@return TRUE if ok */
+ibool
+dtype_validate(
+/*===========*/
+ const dtype_t* type); /*!< in: type struct to validate */
+#ifdef UNIV_DEBUG
+/** Print a data type structure.
+@param[in] type data type */
+void
+dtype_print(
+ const dtype_t* type);
+#endif /* UNIV_DEBUG */
+
+/* Structure for an SQL data type.
+If you add fields to this structure, be sure to initialize them everywhere.
+This structure is initialized in the following functions:
+dtype_set()
+dtype_read_for_order_and_null_size()
+dtype_new_read_for_order_and_null_size()
+sym_tab_add_null_lit() */
+
+struct dtype_t{
+ unsigned prtype:32; /*!< precise type; MySQL data
+ type, charset code, flags to
+ indicate nullability,
+ signedness, whether this is a
+ binary string, whether this is
+ a true VARCHAR where MySQL
+ uses 2 bytes to store the length */
+ unsigned mtype:8; /*!< main data type */
+
+ /* the remaining fields do not affect alphabetical ordering: */
+
+ unsigned len:16; /*!< length; for MySQL data this
+ is field->pack_length(),
+ except that for a >= 5.0.3
+ type true VARCHAR this is the
+ maximum byte length of the
+ string data (in addition to
+ the string, MySQL uses 1 or 2
+ bytes to store the string length) */
+ unsigned mbminlen:3; /*!< minimum length of a character,
+ in bytes */
+ unsigned mbmaxlen:3; /*!< maximum length of a character,
+ in bytes */
+
+ /** @return whether this is a system-versioned user field */
+ bool is_versioned() const { return !(~prtype & DATA_VERSIONED); }
+ /** @return whether this is the system field start */
+ bool vers_sys_start() const
+ {
+ return (prtype & DATA_VERSIONED) == DATA_VERS_START;
+ }
+ /** @return whether this is the system field end */
+ bool vers_sys_end() const
+ {
+ return (prtype & DATA_VERSIONED) == DATA_VERS_END;
+ }
+
+ /** Set the type of the BLOB in the hidden metadata record. */
+ void metadata_blob_init()
+ {
+ prtype = DATA_NOT_NULL;
+ mtype = DATA_BLOB;
+ len = 0;
+ mbminlen = 0;
+ mbmaxlen = 0;
+ }
+};
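+/* Usage sketch for the versioning predicates above (illustrative only):
+
+   dtype_t t;
+   t.prtype = DATA_NOT_NULL | DATA_VERS_START;
+   ut_ad(t.vers_sys_start() && !t.is_versioned());
+   t.prtype = DATA_NOT_NULL | DATA_VERSIONED;  // both bits set
+   ut_ad(t.is_versioned() && !t.vers_sys_start() && !t.vers_sys_end());
+*/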
+
+/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */
+extern const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+
+/** Info bit denoting the predefined minimum record: this bit is set
+if and only if the record is the first user record on a non-leaf
+B-tree page that is the leftmost page on its level
+(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */
+#define REC_INFO_MIN_REC_FLAG 0x10UL
+/** The delete-mark flag in info bits */
+#define REC_INFO_DELETED_FLAG 0x20UL
+
+/** Record status values for ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED */
+enum rec_comp_status_t {
+ /** User record (PAGE_LEVEL=0, heap>=PAGE_HEAP_NO_USER_LOW) */
+ REC_STATUS_ORDINARY = 0,
+ /** Node pointer record (PAGE_LEVEL>=0, heap>=PAGE_HEAP_NO_USER_LOW) */
+ REC_STATUS_NODE_PTR = 1,
+ /** The page infimum pseudo-record (heap=PAGE_HEAP_NO_INFIMUM) */
+ REC_STATUS_INFIMUM = 2,
+ /** The page supremum pseudo-record (heap=PAGE_HEAP_NO_SUPREMUM) */
+ REC_STATUS_SUPREMUM = 3,
+ /** Clustered index record that has been inserted or updated
+ after instant ADD COLUMN (more than dict_index_t::n_core_fields) */
+ REC_STATUS_INSTANT = 4
+};
+
+/** The dtuple_t::info_bits of the hidden metadata of instant ADD COLUMN.
+@see rec_is_metadata()
+@see rec_is_alter_metadata() */
+static const byte REC_INFO_METADATA_ADD
+ = REC_INFO_MIN_REC_FLAG | REC_STATUS_INSTANT;
+
+/** The dtuple_t::info_bits of the hidden metadata of instant ALTER TABLE.
+@see rec_is_metadata() */
+static const byte REC_INFO_METADATA_ALTER
+ = REC_INFO_METADATA_ADD | REC_INFO_DELETED_FLAG;
+
+#include "data0type.ic"
+
+#endif
diff --git a/storage/innobase/include/data0type.ic b/storage/innobase/include/data0type.ic
new file mode 100644
index 00000000..b81b68e6
--- /dev/null
+++ b/storage/innobase/include/data0type.ic
@@ -0,0 +1,618 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.ic
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#include "ha_prototypes.h"
+
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8. This function
+may return false negatives, in case further character-set collation
+codes are introduced in MySQL later.
+@return whether a subset of UTF-8 */
+UNIV_INLINE
+bool
+dtype_is_utf8(
+/*==========*/
+ ulint prtype) /*!< in: precise data type */
+{
+ /* These codes have been copied from strings/ctype-extra.c
+ and strings/ctype-utf8.c. */
+ switch (dtype_get_charset_coll(prtype)) {
+ case 11: /* ascii_general_ci */
+ case 65: /* ascii_bin */
+ case 33: /* utf8_general_ci */
+ case 83: /* utf8_bin */
+ case 254: /* utf8_general_cs */
+ return true;
+ }
+
+ return false;
+}
+
+/*********************************************************************//**
+Gets the MySQL type code from a dtype.
+@return MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+ const dtype_t* type) /*!< in: type struct */
+{
+ return(type->prtype & 0xFFUL);
+}
+
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+UNIV_INLINE
+void
+dtype_get_mblen(
+/*============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type (and collation) */
+ unsigned*mbminlen, /*!< out: minimum length of a
+ multi-byte character */
+ unsigned*mbmaxlen) /*!< out: maximum length of a
+ multi-byte character */
+{
+ if (dtype_is_string_type(mtype)) {
+ innobase_get_cset_width(dtype_get_charset_coll(prtype),
+ mbminlen, mbmaxlen);
+ ut_ad(*mbminlen <= *mbmaxlen);
+ ut_ad(*mbminlen < DATA_MBMAX);
+ ut_ad(*mbmaxlen < DATA_MBMAX);
+ } else {
+ *mbminlen = *mbmaxlen = 0;
+ }
+}
+
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+UNIV_INLINE
+void
+dtype_set_mblen(
+/*============*/
+ dtype_t* type) /*!< in/out: type */
+{
+ unsigned mbminlen, mbmaxlen;
+
+ dtype_get_mblen(type->mtype, type->prtype, &mbminlen, &mbmaxlen);
+ type->mbminlen = mbminlen & 7;
+ type->mbmaxlen = mbmaxlen & 7;
+
+ ut_ad(dtype_validate(type));
+}
+
+/*********************************************************************//**
+Sets a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /*!< in: type struct to init */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision of type */
+{
+ ut_ad(type);
+ ut_ad(mtype <= DATA_MTYPE_MAX);
+
+ type->mtype = static_cast<byte>(mtype);
+ type->prtype = static_cast<unsigned>(prtype);
+ type->len = static_cast<uint16_t>(len);
+
+ dtype_set_mblen(type);
+}
+
+/*********************************************************************//**
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /*!< in: type struct to copy to */
+ const dtype_t* type2) /*!< in: type struct to copy from */
+{
+ *type1 = *type2;
+
+ ut_ad(dtype_validate(type1));
+}
+
+/*********************************************************************//**
+Gets the SQL main data type.
+@return SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->mtype);
+}
+
+/*********************************************************************//**
+Gets the precise data type.
+@return precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->prtype);
+}
+
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->len);
+}
+
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ return type->mbminlen;
+}
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ return type->mbmaxlen;
+}
+
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /*!< in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ bytes where we store the info */
+ const dtype_t* type, /*!< in: type struct */
+ ulint prefix_len)/*!< in: prefix length to
+ replace type->len, or 0 */
+{
+ compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ ulint len;
+
+ ut_ad(type);
+ ut_ad(type->mtype >= DATA_VARCHAR);
+ ut_ad(type->mtype <= DATA_MTYPE_MAX);
+
+ buf[0] = (byte)(type->mtype & 0xFFUL);
+
+ if (type->prtype & DATA_BINARY_TYPE) {
+ buf[0] |= 128;
+ }
+
+ /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) {
+ buf[0] |= 64;
+ }
+ */
+
+ buf[1] = (byte)(type->prtype & 0xFFUL);
+
+ len = prefix_len ? prefix_len : type->len;
+
+ mach_write_to_2(buf + 2, len & 0xFFFFUL);
+
+ ut_ad(dtype_get_charset_coll(type->prtype) <= MAX_CHAR_COLL_NUM);
+ mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype));
+
+ if (type->prtype & DATA_NOT_NULL) {
+ buf[4] |= 128;
+ }
+}
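+/* Resulting buffer layout (a reading aid, not original source):
+
+   buf[0]     mtype, with bit 7 set for DATA_BINARY_TYPE
+   buf[1]     low byte of prtype (the MySQL type code)
+   buf[2..3]  len or prefix_len, big-endian
+   buf[4..5]  charset-collation code, big-endian; since the code is at
+              most 15 bits, bit 7 of buf[4] is free and carries the
+              DATA_NOT_NULL flag
+*/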
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the < 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+ compile_time_assert(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype |= DATA_BINARY_TYPE;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ type->prtype = dtype_form_prtype(type->prtype,
+ data_mysql_default_charset_coll);
+ dtype_set_mblen(type);
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+ compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype |= DATA_BINARY_TYPE;
+ }
+
+ if (buf[4] & 128) {
+ type->prtype |= DATA_NOT_NULL;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ ulint charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK;
+
+ if (dtype_is_string_type(type->mtype)) {
+ ut_a(charset_coll <= MAX_CHAR_COLL_NUM);
+
+ if (charset_coll == 0) {
+ /* This insert buffer record was inserted with MySQL
+ version < 4.1.2, and the charset-collation code was not
+ explicitly stored to dtype->prtype at that time. It
+ must be the default charset-collation of this MySQL
+ installation. */
+
+ charset_coll = data_mysql_default_charset_coll;
+ }
+
+ type->prtype = dtype_form_prtype(type->prtype, charset_coll);
+ }
+ dtype_set_mblen(type);
+}
+
+/*********************************************************************//**
+Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype,prtype,len
+@return the SQL type name */
+UNIV_INLINE
+char*
+dtype_sql_name(
+/*===========*/
+ unsigned mtype, /*!< in: mtype */
+ unsigned prtype, /*!< in: prtype */
+ unsigned len, /*!< in: len */
+ char* name, /*!< out: SQL name */
+ unsigned name_sz)/*!< in: size of the name buffer */
+{
+
+#define APPEND_UNSIGNED() \
+ do { \
+ if (prtype & DATA_UNSIGNED) { \
+ snprintf(name + strlen(name), \
+ name_sz - strlen(name), \
+ " UNSIGNED"); \
+ } \
+ } while (0)
+
+ snprintf(name, name_sz, "UNKNOWN");
+
+ switch (mtype) {
+ case DATA_INT:
+ switch (len) {
+ case 1:
+ snprintf(name, name_sz, "TINYINT");
+ break;
+ case 2:
+ snprintf(name, name_sz, "SMALLINT");
+ break;
+ case 3:
+ snprintf(name, name_sz, "MEDIUMINT");
+ break;
+ case 4:
+ snprintf(name, name_sz, "INT");
+ break;
+ case 8:
+ snprintf(name, name_sz, "BIGINT");
+ break;
+ }
+ APPEND_UNSIGNED();
+ break;
+ case DATA_FLOAT:
+ snprintf(name, name_sz, "FLOAT");
+ APPEND_UNSIGNED();
+ break;
+ case DATA_DOUBLE:
+ snprintf(name, name_sz, "DOUBLE");
+ APPEND_UNSIGNED();
+ break;
+ case DATA_FIXBINARY:
+ snprintf(name, name_sz, "BINARY(%u)", len);
+ break;
+ case DATA_CHAR:
+ case DATA_MYSQL:
+ snprintf(name, name_sz, "CHAR(%u)", len);
+ break;
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ snprintf(name, name_sz, "VARCHAR(%u)", len);
+ break;
+ case DATA_BINARY:
+ snprintf(name, name_sz, "VARBINARY(%u)", len);
+ break;
+ case DATA_GEOMETRY:
+ snprintf(name, name_sz, "GEOMETRY");
+ break;
+ case DATA_BLOB:
+ switch (len) {
+ case 9:
+ snprintf(name, name_sz, "TINYBLOB");
+ break;
+ case 10:
+ snprintf(name, name_sz, "BLOB");
+ break;
+ case 11:
+ snprintf(name, name_sz, "MEDIUMBLOB");
+ break;
+ case 12:
+ snprintf(name, name_sz, "LONGBLOB");
+ break;
+ }
+ }
+
+ if (prtype & DATA_NOT_NULL) {
+ snprintf(name + strlen(name),
+ name_sz - strlen(name),
+ " NOT NULL");
+ }
+
+ return(name);
+}
+
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a
+ multibyte character, in bytes */
+ ulint mbmaxlen, /*!< in: maximum length of a
+ multibyte character, in bytes */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ /* fall through */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return static_cast<unsigned>(len);
+ case DATA_MYSQL:
+ if (prtype & DATA_BINARY_TYPE) {
+ return static_cast<unsigned>(len);
+ } else if (!comp) {
+ return static_cast<unsigned>(len);
+ } else {
+#ifdef UNIV_DEBUG
+ unsigned i_mbminlen, i_mbmaxlen;
+
+ innobase_get_cset_width(
+ dtype_get_charset_coll(prtype),
+ &i_mbminlen, &i_mbmaxlen);
+
+ ut_ad(i_mbminlen == mbminlen);
+ ut_ad(i_mbmaxlen == mbmaxlen);
+#endif /* UNIV_DEBUG */
+ if (mbminlen == mbmaxlen) {
+ return static_cast<unsigned>(len);
+ }
+ }
+ /* Treat as variable-length. */
+ /* fall through */
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_GEOMETRY:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
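+/* Worked example for the DATA_MYSQL branch above (illustrative): a CHAR(10)
+column in a utf8-like charset has mbminlen=1, mbmaxlen=3 and len=30. In
+ROW_FORMAT=REDUNDANT (comp=0) it is stored at its fixed 30 bytes; in COMPACT
+it falls through and is treated as variable-length, so the function
+returns 0. */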
+
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a character */
+ ulint mbmaxlen) /*!< in: maximum length of a character */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ /* fall through */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return static_cast<unsigned>(len);
+ case DATA_MYSQL:
+ if (prtype & DATA_BINARY_TYPE) {
+ return static_cast<unsigned>(len);
+ } else {
+ if (mbminlen == mbmaxlen) {
+ return static_cast<unsigned>(len);
+ }
+
+ /* this is a variable-length character set */
+ ut_a(mbminlen > 0);
+ ut_a(mbmaxlen > mbminlen);
+ ut_a(len % mbmaxlen == 0);
+ return static_cast<unsigned>(
+ len * mbminlen / mbmaxlen);
+ }
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_GEOMETRY:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint len) /*!< in: length */
+{
+ switch (mtype) {
+ case DATA_SYS:
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_MYSQL:
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ return(len);
+ case DATA_GEOMETRY:
+ case DATA_BLOB:
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ULINT_MAX);
+}
+
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+ const dtype_t* type, /*!< in: type */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len,
+ type->mbminlen, type->mbmaxlen, comp));
+}
diff --git a/storage/innobase/include/data0types.h b/storage/innobase/include/data0types.h
new file mode 100644
index 00000000..bcd6b8bc
--- /dev/null
+++ b/storage/innobase/include/data0types.h
@@ -0,0 +1,36 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0types.h
+Some type definitions
+
+Created 9/21/2000 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0types_h
+#define data0types_h
+
+/* SQL data field struct */
+struct dfield_t;
+
+/* SQL data tuple struct */
+struct dtuple_t;
+
+#endif
+
diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h
new file mode 100644
index 00000000..6cfc63f4
--- /dev/null
+++ b/storage/innobase/include/db0err.h
@@ -0,0 +1,178 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2018, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/db0err.h
+Global error codes for the database
+
+Created 5/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef db0err_h
+#define db0err_h
+
+/* Do not include univ.i because univ.i includes this. */
+
+enum dberr_t {
+ DB_SUCCESS,
+
+ DB_SUCCESS_LOCKED_REC = 9, /*!< like DB_SUCCESS, but a new
+ explicit record lock was created */
+
+ /* The following are error codes */
+ DB_ERROR = 11,
+ DB_INTERRUPTED,
+ DB_OUT_OF_MEMORY,
+ DB_OUT_OF_FILE_SPACE,
+ DB_LOCK_WAIT,
+ DB_DEADLOCK,
+ DB_ROLLBACK,
+ DB_DUPLICATE_KEY,
+ DB_MISSING_HISTORY, /*!< required history data has been
+ deleted due to lack of space in
+ rollback segment */
+ DB_CLUSTER_NOT_FOUND = 30,
+ DB_TABLE_NOT_FOUND,
+ DB_MUST_GET_MORE_FILE_SPACE, /*!< the database has to be stopped
+ and restarted with more file space */
+ DB_TABLE_IS_BEING_USED,
+ DB_TOO_BIG_RECORD, /*!< a record in an index would not fit
+ on a compressed page, or it would
+ become bigger than 1/2 free space in
+ an uncompressed page frame */
+ DB_LOCK_WAIT_TIMEOUT, /*!< lock wait lasted too long */
+ DB_NO_REFERENCED_ROW, /*!< referenced key value not found
+ for a foreign key in an insert or
+ update of a row */
+ DB_ROW_IS_REFERENCED, /*!< cannot delete or update a row
+ because it contains a key value
+ which is referenced */
+ DB_CANNOT_ADD_CONSTRAINT, /*!< adding a foreign key constraint
+ to a table failed */
+ DB_CORRUPTION, /*!< data structure corruption
+ noticed */
+ DB_CANNOT_DROP_CONSTRAINT, /*!< dropping a foreign key constraint
+ from a table failed */
+ DB_NO_SAVEPOINT, /*!< no savepoint exists with the given
+ name */
+ DB_TABLESPACE_EXISTS, /*!< we cannot create a new single-table
+ tablespace because a file of the same
+ name already exists */
+ DB_TABLESPACE_DELETED, /*!< tablespace was deleted or is
+ being dropped right now */
+ DB_TABLESPACE_NOT_FOUND, /*!< Attempt to delete a tablespace
+ instance that was not found in the
+ tablespace hash table */
+ DB_LOCK_TABLE_FULL, /*!< lock structs have exhausted the
+ buffer pool (for big transactions,
+ InnoDB stores the lock structs in the
+ buffer pool) */
+ DB_FOREIGN_DUPLICATE_KEY, /*!< foreign key constraints
+ activated by the operation would
+ lead to a duplicate key in some
+ table */
+ DB_TOO_MANY_CONCURRENT_TRXS, /*!< when InnoDB runs out of the
+ preconfigured undo slots, this can
+ only happen when there are too many
+ concurrent transactions */
+ DB_UNSUPPORTED, /*!< when InnoDB sees any artefact or
+ feature that it cannot recognize or
+ work with, e.g. FT indexes created by
+ a later version of the engine */
+
+ DB_INVALID_NULL, /*!< a NOT NULL column was found to
+ be NULL during table rebuild */
+
+ DB_STATS_DO_NOT_EXIST, /*!< an operation that requires the
+ persistent storage, used for recording
+ table and index statistics, was
+ requested but this storage does not
+ exist itself or the stats for a given
+ table do not exist */
+ DB_FOREIGN_EXCEED_MAX_CASCADE, /*!< Foreign key constraint related
+ cascading delete/update exceeds
+ maximum allowed depth */
+ DB_CHILD_NO_INDEX, /*!< the child (foreign) table does
+ not have an index that contains the
+ foreign keys as its prefix columns */
+ DB_PARENT_NO_INDEX, /*!< the parent table does not
+ have an index that contains the
+ foreign keys as its prefix columns */
+ DB_TOO_BIG_INDEX_COL, /*!< index column size exceeds
+ maximum limit */
+ DB_INDEX_CORRUPT, /*!< we have corrupted index */
+ DB_UNDO_RECORD_TOO_BIG, /*!< the undo log record is too big */
+ DB_READ_ONLY, /*!< Update operation attempted in
+ a read-only transaction */
+ DB_FTS_INVALID_DOCID, /*!< FTS Doc ID cannot be zero */
+ DB_TABLE_IN_FK_CHECK, /*!< table is being used in foreign
+ key check */
+ DB_ONLINE_LOG_TOO_BIG, /*!< Modification log grew too big
+ during online index creation */
+
+ DB_IDENTIFIER_TOO_LONG, /*!< Identifier name too long */
+ DB_FTS_EXCEED_RESULT_CACHE_LIMIT, /*!< FTS query memory
+ exceeds result cache limit */
+ DB_TEMP_FILE_WRITE_FAIL, /*!< Temp file write failure */
+ DB_CANT_CREATE_GEOMETRY_OBJECT, /*!< Cannot create specified Geometry
+ data object */
+ DB_CANNOT_OPEN_FILE, /*!< Cannot open a file */
+ DB_FTS_TOO_MANY_WORDS_IN_PHRASE,
+ /*!< Too many words in a phrase */
+
+ DB_DECRYPTION_FAILED, /*!< Tablespace is encrypted and
+ the decrypt operation failed
+ because of a missing key
+ management plugin, or a missing
+ or incorrect key, or an
+ incorrect AES method or algorithm */
+
+ DB_IO_ERROR = 100, /*!< Generic IO error */
+
+ DB_IO_PARTIAL_FAILED, /*!< Partial IO request failed */
+
+ DB_FORCED_ABORT, /*!< Transaction was forced to rollback
+ by a higher priority transaction */
+
+ DB_TABLE_CORRUPT, /*!< Table/clustered index is
+ corrupted */
+
+ DB_COMPUTE_VALUE_FAILED, /*!< Compute generated value failed */
+
+ DB_NO_FK_ON_S_BASE_COL, /*!< Cannot add a foreign key
+ constraint on the base column
+ of a stored column */
+
+ DB_IO_NO_PUNCH_HOLE, /*!< Punch hole not supported by
+ file system. */
+
+ DB_PAGE_CORRUPTED, /*!< Page read from tablespace is
+ corrupted */
+ /* The following are partial failure codes */
+ DB_FAIL = 1000,
+ DB_OVERFLOW,
+ DB_UNDERFLOW,
+ DB_STRONG_FAIL,
+ DB_ZIP_OVERFLOW,
+ DB_RECORD_NOT_FOUND = 1500,
+ DB_END_OF_INDEX,
+ DB_NOT_FOUND, /*!< Generic error code for "Not found"
+ type of errors */
+};
+
+#endif
diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h
new file mode 100644
index 00000000..0f96df8f
--- /dev/null
+++ b/storage/innobase/include/dict0boot.h
@@ -0,0 +1,330 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0boot.h
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0boot_h
+#define dict0boot_h
+
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "buf0buf.h"
+#include "dict0dict.h"
+
+/** @return the DICT_HDR block, x-latched */
+buf_block_t *dict_hdr_get(mtr_t* mtr);
+/**********************************************************************//**
+Returns a new table, index, or space id. */
+void
+dict_hdr_get_new_id(
+/*================*/
+ table_id_t* table_id, /*!< out: table id
+ (not assigned if NULL) */
+ index_id_t* index_id, /*!< out: index id
+ (not assigned if NULL) */
+ ulint* space_id); /*!< out: space id
+ (not assigned if NULL) */
+/**********************************************************************//**
+Writes the current value of the row id counter to the dictionary header file
+page. */
+void
+dict_hdr_flush_row_id(void);
+/*=======================*/
+/**********************************************************************//**
+Returns a new row id.
+@return the new id */
+UNIV_INLINE
+row_id_t
+dict_sys_get_new_row_id(void);
+/*=========================*/
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+UNIV_INLINE
+void
+dict_sys_write_row_id(
+/*==================*/
+ byte* field, /*!< in: record field */
+ row_id_t row_id);/*!< in: row id */
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created.
+@return DB_SUCCESS or error code. */
+dberr_t
+dict_boot(void)
+/*===========*/
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*****************************************************************//**
+Creates and initializes the data dictionary at the server bootstrap.
+@return DB_SUCCESS or error code. */
+dberr_t
+dict_create(void)
+/*=============*/
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Check if a table id belongs to a system table.
+@return true if the table id belongs to a system table. */
+UNIV_INLINE
+bool
+dict_is_sys_table(
+/*==============*/
+ table_id_t id) /*!< in: table id to check */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/* Space id and page no where the dictionary header resides */
+#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
+#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
+
+/* The ids for the basic system tables and their indexes */
+#define DICT_TABLES_ID 1
+#define DICT_COLUMNS_ID 2
+#define DICT_INDEXES_ID dict_index_t::DICT_INDEXES_ID /* 3 */
+#define DICT_FIELDS_ID 4
+/* The following is a secondary index on SYS_TABLES */
+#define DICT_TABLE_IDS_ID 5
+
+/* The offset of the dictionary header on the page */
+#define DICT_HDR FSEG_PAGE_DATA
+
+/*-------------------------------------------------------------*/
+/* Dictionary header offsets */
+#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */
+#define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */
+#define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */
+#define DICT_HDR_MAX_SPACE_ID 24 /* The latest assigned space id, or 0 */
+#define DICT_HDR_MIX_ID_LOW 28 /* Obsolete, always DICT_HDR_FIRST_ID */
+#define DICT_HDR_TABLES 32 /* Root of SYS_TABLES clust index */
+#define DICT_HDR_TABLE_IDS 36 /* Root of SYS_TABLE_IDS sec index */
+#define DICT_HDR_COLUMNS 40 /* Root of SYS_COLUMNS clust index */
+#define DICT_HDR_INDEXES 44 /* Root of SYS_INDEXES clust index */
+#define DICT_HDR_FIELDS 48 /* Root of SYS_FIELDS clust index */
+
+#define DICT_HDR_FSEG_HEADER 56 /* Segment header for the tablespace
+ segment into which the dictionary
+ header is created */
+/*-------------------------------------------------------------*/
+
+/* The columns in SYS_TABLES */
+enum dict_col_sys_tables_enum {
+ DICT_COL__SYS_TABLES__NAME = 0,
+ DICT_COL__SYS_TABLES__ID = 1,
+ DICT_COL__SYS_TABLES__N_COLS = 2,
+ DICT_COL__SYS_TABLES__TYPE = 3,
+ DICT_COL__SYS_TABLES__MIX_ID = 4,
+ DICT_COL__SYS_TABLES__MIX_LEN = 5,
+ DICT_COL__SYS_TABLES__CLUSTER_ID = 6,
+ DICT_COL__SYS_TABLES__SPACE = 7,
+ DICT_NUM_COLS__SYS_TABLES = 8
+};
+/* The field numbers in the SYS_TABLES clustered index */
+enum dict_fld_sys_tables_enum {
+ DICT_FLD__SYS_TABLES__NAME = 0,
+ DICT_FLD__SYS_TABLES__DB_TRX_ID = 1,
+ DICT_FLD__SYS_TABLES__DB_ROLL_PTR = 2,
+ DICT_FLD__SYS_TABLES__ID = 3,
+ DICT_FLD__SYS_TABLES__N_COLS = 4,
+ DICT_FLD__SYS_TABLES__TYPE = 5,
+ DICT_FLD__SYS_TABLES__MIX_ID = 6,
+ DICT_FLD__SYS_TABLES__MIX_LEN = 7,
+ DICT_FLD__SYS_TABLES__CLUSTER_ID = 8,
+ DICT_FLD__SYS_TABLES__SPACE = 9,
+ DICT_NUM_FIELDS__SYS_TABLES = 10
+};
+/* The field numbers in the SYS_TABLE_IDS index */
+enum dict_fld_sys_table_ids_enum {
+ DICT_FLD__SYS_TABLE_IDS__ID = 0,
+ DICT_FLD__SYS_TABLE_IDS__NAME = 1,
+ DICT_NUM_FIELDS__SYS_TABLE_IDS = 2
+};
+/* The columns in SYS_COLUMNS */
+enum dict_col_sys_columns_enum {
+ DICT_COL__SYS_COLUMNS__TABLE_ID = 0,
+ DICT_COL__SYS_COLUMNS__POS = 1,
+ DICT_COL__SYS_COLUMNS__NAME = 2,
+ DICT_COL__SYS_COLUMNS__MTYPE = 3,
+ DICT_COL__SYS_COLUMNS__PRTYPE = 4,
+ DICT_COL__SYS_COLUMNS__LEN = 5,
+ DICT_COL__SYS_COLUMNS__PREC = 6,
+ DICT_NUM_COLS__SYS_COLUMNS = 7
+};
+/* The field numbers in the SYS_COLUMNS clustered index */
+enum dict_fld_sys_columns_enum {
+ DICT_FLD__SYS_COLUMNS__TABLE_ID = 0,
+ DICT_FLD__SYS_COLUMNS__POS = 1,
+ DICT_FLD__SYS_COLUMNS__DB_TRX_ID = 2,
+ DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_COLUMNS__NAME = 4,
+ DICT_FLD__SYS_COLUMNS__MTYPE = 5,
+ DICT_FLD__SYS_COLUMNS__PRTYPE = 6,
+ DICT_FLD__SYS_COLUMNS__LEN = 7,
+ DICT_FLD__SYS_COLUMNS__PREC = 8,
+ DICT_NUM_FIELDS__SYS_COLUMNS = 9
+};
+/* The columns in SYS_INDEXES */
+enum dict_col_sys_indexes_enum {
+ DICT_COL__SYS_INDEXES__TABLE_ID = 0,
+ DICT_COL__SYS_INDEXES__ID = 1,
+ DICT_COL__SYS_INDEXES__NAME = 2,
+ DICT_COL__SYS_INDEXES__N_FIELDS = 3,
+ DICT_COL__SYS_INDEXES__TYPE = 4,
+ DICT_COL__SYS_INDEXES__SPACE = 5,
+ DICT_COL__SYS_INDEXES__PAGE_NO = 6,
+ DICT_COL__SYS_INDEXES__MERGE_THRESHOLD = 7,
+ DICT_NUM_COLS__SYS_INDEXES = 8
+};
+/* The field numbers in the SYS_INDEXES clustered index */
+enum dict_fld_sys_indexes_enum {
+ DICT_FLD__SYS_INDEXES__TABLE_ID = 0,
+ DICT_FLD__SYS_INDEXES__ID = 1,
+ DICT_FLD__SYS_INDEXES__DB_TRX_ID = 2,
+ DICT_FLD__SYS_INDEXES__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_INDEXES__NAME = 4,
+ DICT_FLD__SYS_INDEXES__N_FIELDS = 5,
+ DICT_FLD__SYS_INDEXES__TYPE = 6,
+ DICT_FLD__SYS_INDEXES__SPACE = 7,
+ DICT_FLD__SYS_INDEXES__PAGE_NO = 8,
+ DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD = 9,
+ DICT_NUM_FIELDS__SYS_INDEXES = 10
+};
+/* The columns in SYS_FIELDS */
+enum dict_col_sys_fields_enum {
+ DICT_COL__SYS_FIELDS__INDEX_ID = 0,
+ DICT_COL__SYS_FIELDS__POS = 1,
+ DICT_COL__SYS_FIELDS__COL_NAME = 2,
+ DICT_NUM_COLS__SYS_FIELDS = 3
+};
+/* The field numbers in the SYS_FIELDS clustered index */
+enum dict_fld_sys_fields_enum {
+ DICT_FLD__SYS_FIELDS__INDEX_ID = 0,
+ DICT_FLD__SYS_FIELDS__POS = 1,
+ DICT_FLD__SYS_FIELDS__DB_TRX_ID = 2,
+ DICT_FLD__SYS_FIELDS__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_FIELDS__COL_NAME = 4,
+ DICT_NUM_FIELDS__SYS_FIELDS = 5
+};
+/* The columns in SYS_FOREIGN */
+enum dict_col_sys_foreign_enum {
+ DICT_COL__SYS_FOREIGN__ID = 0,
+ DICT_COL__SYS_FOREIGN__FOR_NAME = 1,
+ DICT_COL__SYS_FOREIGN__REF_NAME = 2,
+ DICT_COL__SYS_FOREIGN__N_COLS = 3,
+ DICT_NUM_COLS__SYS_FOREIGN = 4
+};
+/* The field numbers in the SYS_FOREIGN clustered index */
+enum dict_fld_sys_foreign_enum {
+ DICT_FLD__SYS_FOREIGN__ID = 0,
+ DICT_FLD__SYS_FOREIGN__DB_TRX_ID = 1,
+ DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR = 2,
+ DICT_FLD__SYS_FOREIGN__FOR_NAME = 3,
+ DICT_FLD__SYS_FOREIGN__REF_NAME = 4,
+ DICT_FLD__SYS_FOREIGN__N_COLS = 5,
+ DICT_NUM_FIELDS__SYS_FOREIGN = 6
+};
+/* The field numbers in the SYS_FOREIGN_FOR_NAME secondary index */
+enum dict_fld_sys_foreign_for_name_enum {
+ DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME = 0,
+ DICT_FLD__SYS_FOREIGN_FOR_NAME__ID = 1,
+ DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME = 2
+};
+/* The columns in SYS_FOREIGN_COLS */
+enum dict_col_sys_foreign_cols_enum {
+ DICT_COL__SYS_FOREIGN_COLS__ID = 0,
+ DICT_COL__SYS_FOREIGN_COLS__POS = 1,
+ DICT_COL__SYS_FOREIGN_COLS__FOR_COL_NAME = 2,
+ DICT_COL__SYS_FOREIGN_COLS__REF_COL_NAME = 3,
+ DICT_NUM_COLS__SYS_FOREIGN_COLS = 4
+};
+/* The field numbers in the SYS_FOREIGN_COLS clustered index */
+enum dict_fld_sys_foreign_cols_enum {
+ DICT_FLD__SYS_FOREIGN_COLS__ID = 0,
+ DICT_FLD__SYS_FOREIGN_COLS__POS = 1,
+ DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID = 2,
+ DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME = 4,
+ DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME = 5,
+ DICT_NUM_FIELDS__SYS_FOREIGN_COLS = 6
+};
+/* The columns in SYS_TABLESPACES */
+enum dict_col_sys_tablespaces_enum {
+ DICT_COL__SYS_TABLESPACES__SPACE = 0,
+ DICT_COL__SYS_TABLESPACES__NAME = 1,
+ DICT_COL__SYS_TABLESPACES__FLAGS = 2,
+ DICT_NUM_COLS__SYS_TABLESPACES = 3
+};
+/* The field numbers in the SYS_TABLESPACES clustered index */
+enum dict_fld_sys_tablespaces_enum {
+ DICT_FLD__SYS_TABLESPACES__SPACE = 0,
+ DICT_FLD__SYS_TABLESPACES__DB_TRX_ID = 1,
+ DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR = 2,
+ DICT_FLD__SYS_TABLESPACES__NAME = 3,
+ DICT_FLD__SYS_TABLESPACES__FLAGS = 4,
+ DICT_NUM_FIELDS__SYS_TABLESPACES = 5
+};
+/* The columns in SYS_DATAFILES */
+enum dict_col_sys_datafiles_enum {
+ DICT_COL__SYS_DATAFILES__SPACE = 0,
+ DICT_COL__SYS_DATAFILES__PATH = 1,
+ DICT_NUM_COLS__SYS_DATAFILES = 2
+};
+/* The field numbers in the SYS_DATAFILES clustered index */
+enum dict_fld_sys_datafiles_enum {
+ DICT_FLD__SYS_DATAFILES__SPACE = 0,
+ DICT_FLD__SYS_DATAFILES__DB_TRX_ID = 1,
+ DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR = 2,
+ DICT_FLD__SYS_DATAFILES__PATH = 3,
+ DICT_NUM_FIELDS__SYS_DATAFILES = 4
+};
+
+/* The columns in SYS_VIRTUAL */
+enum dict_col_sys_virtual_enum {
+ DICT_COL__SYS_VIRTUAL__TABLE_ID = 0,
+ DICT_COL__SYS_VIRTUAL__POS = 1,
+ DICT_COL__SYS_VIRTUAL__BASE_POS = 2,
+ DICT_NUM_COLS__SYS_VIRTUAL = 3
+};
+/* The field numbers in the SYS_VIRTUAL clustered index */
+enum dict_fld_sys_virtual_enum {
+ DICT_FLD__SYS_VIRTUAL__TABLE_ID = 0,
+ DICT_FLD__SYS_VIRTUAL__POS = 1,
+ DICT_FLD__SYS_VIRTUAL__BASE_POS = 2,
+ DICT_FLD__SYS_VIRTUAL__DB_TRX_ID = 3,
+ DICT_FLD__SYS_VIRTUAL__DB_ROLL_PTR = 4,
+ DICT_NUM_FIELDS__SYS_VIRTUAL = 5
+};
+
+/* A number of the columns above occur in multiple tables. These are the
+lengths of those fields. */
+#define DICT_FLD_LEN_SPACE 4
+#define DICT_FLD_LEN_FLAGS 4
+
+/* When a row id which is zero modulo this number (which must be a power of
+two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
+updated */
+#define DICT_HDR_ROW_ID_WRITE_MARGIN 256
+
+#include "dict0boot.ic"
+
+#endif
diff --git a/storage/innobase/include/dict0boot.ic b/storage/innobase/include/dict0boot.ic
new file mode 100644
index 00000000..d920bdde
--- /dev/null
+++ b/storage/innobase/include/dict0boot.ic
@@ -0,0 +1,78 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0boot.ic
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+/**********************************************************************//**
+Returns a new row id.
+@return the new id */
+UNIV_INLINE
+row_id_t
+dict_sys_get_new_row_id(void)
+/*=========================*/
+{
+ row_id_t id;
+
+ mutex_enter(&dict_sys.mutex);
+
+ id = dict_sys.row_id;
+
+ if (0 == (id % DICT_HDR_ROW_ID_WRITE_MARGIN)) {
+
+ dict_hdr_flush_row_id();
+ }
+
+ dict_sys.row_id++;
+
+ mutex_exit(&dict_sys.mutex);
+
+ return(id);
+}
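+/* Note on the margin (explanatory, not original source): because the counter
+is flushed only every DICT_HDR_ROW_ID_WRITE_MARGIN allocations, the on-disk
+DICT_HDR_ROW_ID may lag the in-memory value by up to the margin after a
+crash. Recovery must therefore resume from a value rounded up past the
+stored one, conceptually:
+
+   row_id = (stored_row_id rounded down to the margin)
+            + DICT_HDR_ROW_ID_WRITE_MARGIN;
+*/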
+
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+UNIV_INLINE
+void
+dict_sys_write_row_id(
+/*==================*/
+ byte* field, /*!< in: record field */
+ row_id_t row_id) /*!< in: row id */
+{
+ compile_time_assert(DATA_ROW_ID_LEN == 6);
+ mach_write_to_6(field, row_id);
+}
+
+/*********************************************************************//**
+Check if a table id belongs to a system table.
+@return true if the table id belongs to a system table. */
+UNIV_INLINE
+bool
+dict_is_sys_table(
+/*==============*/
+ table_id_t id) /*!< in: table id to check */
+{
+ return(id < DICT_HDR_FIRST_ID);
+}
+
+
diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h
new file mode 100644
index 00000000..13706d6b
--- /dev/null
+++ b/storage/innobase/include/dict0crea.h
@@ -0,0 +1,324 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.h
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0crea_h
+#define dict0crea_h
+
+#include "dict0dict.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+#include "fil0crypt.h"
+
+/*********************************************************************//**
+Creates a table create graph.
+@return own: table create node */
+tab_node_t*
+tab_create_graph_create(
+/*====================*/
+ dict_table_t* table, /*!< in: table to create, built as
+ a memory data structure */
+ mem_heap_t* heap, /*!< in: heap where created */
+ fil_encryption_t mode, /*!< in: encryption mode */
+ uint32_t key_id); /*!< in: encryption key_id */
+
+/** Creates an index create graph.
+@param[in] index index to create, built as a memory data structure
+@param[in] table table name
+@param[in,out] heap heap where created
+@param[in] add_v new virtual columns added in the same clause with
+ add index
+@return own: index create node */
+ind_node_t*
+ind_create_graph_create(
+ dict_index_t* index,
+ const char* table,
+ mem_heap_t* heap,
+ const dict_add_v_col_t* add_v = NULL);
+
+/***********************************************************//**
+Creates a table. This is a high-level function used in SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+dict_create_table_step(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/***********************************************************//**
+Creates an index. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+dict_create_index_step(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/***************************************************************//**
+Builds an index definition but does not update SYS_TABLES. */
+void
+dict_build_index_def(
+/*=================*/
+ const dict_table_t* table, /*!< in: table */
+ dict_index_t* index, /*!< in/out: index */
+ trx_t* trx); /*!< in/out: InnoDB transaction
+ handle */
+/***************************************************************//**
+Creates an index tree for the index if it is not a member of a cluster.
+Don't update SYSTEM TABLES.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+dict_create_index_tree(
+/*===================*/
+ dict_index_t* index, /*!< in/out: index */
+ const trx_t* trx); /*!< in: InnoDB transaction handle */
+
+/** Drop the index tree associated with a row in SYS_INDEXES table.
+@param[in,out] pcur persistent cursor on rec
+@param[in,out] trx dictionary transaction
+@param[in,out] mtr mini-transaction */
+void dict_drop_index_tree(btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/***************************************************************//**
+Creates an index tree for the index if it is not a member of a cluster.
+Don't update SYSTEM TABLES.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+dict_create_index_tree_in_mem(
+/*==========================*/
+ dict_index_t* index, /*!< in/out: index */
+ const trx_t* trx); /*!< in: InnoDB transaction handle */
+
+/****************************************************************//**
+Creates the foreign key constraints system tables inside InnoDB
+at server bootstrap or server start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_create_or_check_foreign_constraint_tables(void);
+/*================================================*/
+
+/********************************************************************//**
+Generate a foreign key constraint name when it was not named by the user.
+A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER,
+where the numbers start from 1, and are given locally for this table, that is,
+the number is not global, as it used to be before MySQL 4.0.18. */
+UNIV_INLINE
+dberr_t
+dict_create_add_foreign_id(
+/*=======================*/
+ ulint* id_nr, /*!< in/out: number to use in id
+ generation; incremented if used */
+ const char* name, /*!< in: table name */
+ dict_foreign_t* foreign); /*!< in/out: foreign key */
+
+/** Adds the given set of foreign key objects to the dictionary tables
+in the database. This function does not modify the dictionary cache. The
+caller must ensure that all foreign key objects contain a valid constraint
+name in foreign->id.
+@param[in] local_fk_set set of foreign key objects, to be added to
+the dictionary tables
+@param[in]	table		table to which the foreign key objects in
+local_fk_set belong
+@param[in,out] trx transaction
+@return error code or DB_SUCCESS */
+dberr_t
+dict_create_add_foreigns_to_dictionary(
+/*===================================*/
+ const dict_foreign_set& local_fk_set,
+ const dict_table_t* table,
+ trx_t* trx)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check if a foreign key constraint is on columns serving as base
+columns of any stored column. This is to prevent creating SET NULL or
+CASCADE constraints on such columns.
+@param[in]	local_fk_set	set of foreign key objects, to be added to
+the dictionary tables
+@param[in]	table		table to which the foreign key objects in
+local_fk_set belong
+@return true if any such constraint exists, false otherwise */
+bool
+dict_foreigns_has_s_base_col(
+ const dict_foreign_set& local_fk_set,
+ const dict_table_t* table);
+
+/****************************************************************//**
+Creates the tablespaces and datafiles system tables inside InnoDB
+at server bootstrap or server start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_create_or_check_sys_tablespace(void);
+/*=====================================*/
+/** Creates the virtual column system tables inside InnoDB
+at server bootstrap or server start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_create_or_check_sys_virtual();
+
+/** Put a tablespace definition into the data dictionary,
+replacing what was there previously.
+@param[in]	space_id	Tablespace id
+@param[in] name Tablespace name
+@param[in] flags Tablespace flags
+@param[in] path Tablespace path
+@param[in] trx Transaction
+@return error code or DB_SUCCESS */
+dberr_t
+dict_replace_tablespace_in_dictionary(
+ ulint space_id,
+ const char* name,
+ ulint flags,
+ const char* path,
+ trx_t* trx);
+
+/********************************************************************//**
+Add a foreign key definition to the data dictionary tables.
+@return error code or DB_SUCCESS */
+dberr_t
+dict_create_add_foreign_to_dictionary(
+/*==================================*/
+ const char* name, /*!< in: table name */
+ const dict_foreign_t* foreign,/*!< in: foreign key */
+ trx_t* trx) /*!< in/out: dictionary transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Construct foreign key constraint definition from data dictionary information.
+*/
+UNIV_INTERN
+char*
+dict_foreign_def_get(
+/*=================*/
+ dict_foreign_t* foreign,/*!< in: foreign */
+ trx_t* trx); /*!< in: trx */
+
+/* Table create node structure */
+struct tab_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_TABLE_CREATE */
+ dict_table_t* table; /*!< table to create, built as a
+ memory data structure with
+ dict_mem_... functions */
+ ins_node_t* tab_def; /*!< child node which does the insert of
+ the table definition; the row to be
+ inserted is built by the parent node */
+ ins_node_t* col_def; /*!< child node which does the inserts
+ of the column definitions; the row to
+ be inserted is built by the parent
+ node */
+ ins_node_t* v_col_def; /*!< child node which does the inserts
+ of the sys_virtual row definitions;
+ the row to be inserted is built by
+ the parent node */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+ ulint col_no; /*!< next column definition to insert */
+ uint key_id; /*!< encryption key_id */
+ fil_encryption_t mode; /*!< encryption mode */
+ ulint base_col_no; /*!< next base column to insert */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary
+ storage */
+};
+
+/* Table create node states */
+#define TABLE_BUILD_TABLE_DEF 1
+#define TABLE_BUILD_COL_DEF 2
+#define TABLE_BUILD_V_COL_DEF 3
+#define TABLE_ADD_TO_CACHE 4
+#define TABLE_COMPLETED 5
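+
+/* A sketch of the expected progression through these states in
+dict_create_table_step(): TABLE_BUILD_TABLE_DEF -> TABLE_BUILD_COL_DEF
+-> TABLE_BUILD_V_COL_DEF -> TABLE_ADD_TO_CACHE -> TABLE_COMPLETED
+(editorial note; see dict0crea.cc for the normative state machine). */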
+
+/* Index create node struct */
+
+struct ind_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_INDEX_CREATE */
+ dict_index_t* index; /*!< index to create, built as a
+ memory data structure with
+ dict_mem_... functions */
+ const char* table_name; /*!< table name */
+ ins_node_t* ind_def; /*!< child node which does the insert of
+ the index definition; the row to be
+ inserted is built by the parent node */
+ ins_node_t* field_def; /*!< child node which does the inserts
+ of the field definitions; the row to
+ be inserted is built by the parent
+ node */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+	uint32_t	page_no;	/*!< root page number of the index */
+	dict_table_t*	table;		/*!< table which owns the index */
+	dtuple_t*	ind_row;	/*!< index definition row built */
+	ulint		field_no;	/*!< next field definition to insert */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary
+ storage */
+ const dict_add_v_col_t*
+ add_v; /*!< new virtual columns that being
+ added along with an add index call */
+};
+
+/** Compose a column number for a virtual column, stored in the "POS" field
+of Sys_columns. The column number includes both its virtual column sequence
+(the "nth" virtual column) and its actual column position in original table
+@param[in] v_pos virtual column sequence
+@param[in] col_pos column position in original table definition
+@return composed column position number */
+UNIV_INLINE
+ulint
+dict_create_v_col_pos(
+ ulint v_pos,
+ ulint col_pos);
+
+/** Get the column number for a virtual column (the column position in
+original table), stored in the "POS" field of Sys_columns
+@param[in] pos virtual column position
+@return column position in original table */
+UNIV_INLINE
+ulint
+dict_get_v_col_mysql_pos(
+ ulint pos);
+
+/** Get a virtual column sequence (the "nth" virtual column) for a
+virtual column, stored in the "POS" field of Sys_columns
+@param[in] pos virtual column position
+@return virtual column sequence */
+UNIV_INLINE
+ulint
+dict_get_v_col_pos(
+ ulint pos);
+
+/* Index create node states */
+#define INDEX_BUILD_INDEX_DEF 1
+#define INDEX_BUILD_FIELD_DEF 2
+#define INDEX_CREATE_INDEX_TREE 3
+#define INDEX_ADD_TO_CACHE 4
+
+#include "dict0crea.ic"
+
+#endif
diff --git a/storage/innobase/include/dict0crea.ic b/storage/innobase/include/dict0crea.ic
new file mode 100644
index 00000000..5641206d
--- /dev/null
+++ b/storage/innobase/include/dict0crea.ic
@@ -0,0 +1,136 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.ic
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#include "ha_prototypes.h"
+
+#include "mem0mem.h"
+
+/********************************************************************//**
+Generate a foreign key constraint name when it was not named by the user.
+A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER,
+where the numbers start from 1, and are given locally for this table, that is,
+the number is not global, as it used to be before MySQL 4.0.18. */
+UNIV_INLINE
+dberr_t
+dict_create_add_foreign_id(
+/*=======================*/
+ ulint* id_nr, /*!< in/out: number to use in id generation;
+ incremented if used */
+ const char* name, /*!< in: table name */
+ dict_foreign_t* foreign)/*!< in/out: foreign key */
+{
+ DBUG_ENTER("dict_create_add_foreign_id");
+
+ if (foreign->id == NULL) {
+ /* Generate a new constraint id */
+ ulint namelen = strlen(name);
+ char* id = static_cast<char*>(
+ mem_heap_alloc(foreign->heap,
+ namelen + 20));
+
+ if (dict_table_t::is_temporary_name(name)) {
+
+ /* no overflow if number < 1e13 */
+ sprintf(id, "%s_ibfk_%lu", name,
+ (ulong) (*id_nr)++);
+ } else {
+ char table_name[MAX_TABLE_NAME_LEN + 21];
+ uint errors = 0;
+
+ strncpy(table_name, name, (sizeof table_name) - 1);
+ table_name[(sizeof table_name) - 1] = '\0';
+
+ innobase_convert_to_system_charset(
+ strchr(table_name, '/') + 1,
+ strchr(name, '/') + 1,
+ MAX_TABLE_NAME_LEN, &errors);
+
+ if (errors) {
+ strncpy(table_name, name,
+ (sizeof table_name) - 1);
+ table_name[(sizeof table_name) - 1] = '\0';
+ }
+
+ /* no overflow if number < 1e13 */
+ sprintf(id, "%s_ibfk_%lu", table_name,
+ (ulong) (*id_nr)++);
+
+ if (innobase_check_identifier_length(
+ strchr(id,'/') + 1)) {
+ DBUG_RETURN(DB_IDENTIFIER_TOO_LONG);
+ }
+ }
+ foreign->id = id;
+
+ DBUG_PRINT("dict_create_add_foreign_id",
+ ("generated foreign id: %s", id));
+ }
+
+ DBUG_RETURN(DB_SUCCESS);
+}
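+
+/* A worked example of the naming scheme above (table name assumed for
+illustration): for name == "test/child", *id_nr == 1 and an unnamed
+constraint, the code generates foreign->id == "test/child_ibfk_1" and
+leaves *id_nr == 2, matching the dbname/tablename_ibfk_NUMBER format
+described in the comment. */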
+
+/** Compose a column number for a virtual column, stored in the "POS" field
+of Sys_columns. The column number includes both its virtual column sequence
+(the "nth" virtual column) and its actual column position in original table
+@param[in] v_pos virtual column sequence
+@param[in] col_pos column position in original table definition
+@return composed column position number */
+UNIV_INLINE
+ulint
+dict_create_v_col_pos(
+ ulint v_pos,
+ ulint col_pos)
+{
+ ut_ad(v_pos <= REC_MAX_N_FIELDS);
+ ut_ad(col_pos <= REC_MAX_N_FIELDS);
+
+ return(((v_pos + 1) << 16) + col_pos);
+}
+
+/** Get the column number for a virtual column (the column position in
+original table), stored in the "POS" field of Sys_columns
+@param[in] pos virtual column position
+@return column position in original table */
+UNIV_INLINE
+ulint
+dict_get_v_col_mysql_pos(
+ ulint pos)
+{
+ return(pos & 0xFFFF);
+}
+
+/** Get a virtual column sequence (the "nth" virtual column) for a
+virtual column, stored in the "POS" field of Sys_columns
+@param[in] pos virtual column position
+@return virtual column sequence */
+UNIV_INLINE
+ulint
+dict_get_v_col_pos(
+ ulint pos)
+{
+ return((pos >> 16) - 1);
+}
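+
+/* A worked example of the POS encoding implemented above (values
+chosen for illustration): the third virtual column (v_pos == 2) at
+original column position 5 is stored as dict_create_v_col_pos(2, 5)
+== ((2 + 1) << 16) + 5 == 196613; dict_get_v_col_mysql_pos(196613)
+== 5 and dict_get_v_col_pos(196613) == 2 recover the two components. */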
diff --git a/storage/innobase/include/dict0defrag_bg.h b/storage/innobase/include/dict0defrag_bg.h
new file mode 100644
index 00000000..3aea41b0
--- /dev/null
+++ b/storage/innobase/include/dict0defrag_bg.h
@@ -0,0 +1,106 @@
+/*****************************************************************************
+
+Copyright (c) 2016, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0defrag_bg.h
+Code used for background table and index
+defragmentation
+
+Created 25/08/2016 Jan Lindström
+*******************************************************/
+
+#ifndef dict0defrag_bg_h
+#define dict0defrag_bg_h
+
+#include "dict0types.h"
+
+/** Indices whose defrag stats need to be saved to persistent storage.*/
+struct defrag_pool_item_t {
+ table_id_t table_id;
+ index_id_t index_id;
+};
+
+/** Allocator type, used by std::vector */
+typedef ut_allocator<defrag_pool_item_t>
+ defrag_pool_allocator_t;
+
+/** The set of tables to be defragmented, stored in an STL vector */
+typedef std::vector<defrag_pool_item_t, defrag_pool_allocator_t>
+ defrag_pool_t;
+
+/** Pool where we store information on which tables are to be processed
+by background defragmentation. */
+extern defrag_pool_t defrag_pool;
+
+/*****************************************************************//**
+Initialize the defrag pool, called once during thread initialization. */
+void
+dict_defrag_pool_init(void);
+/*========================*/
+
+/*****************************************************************//**
+Free the resources occupied by the defrag pool, called once during
+thread de-initialization. */
+void
+dict_defrag_pool_deinit(void);
+/*==========================*/
+
+/*****************************************************************//**
+Add an index in a table to the defrag pool, which is processed by the
+background stats gathering thread. Only the table id and index id are
+added to the list, so the table can be closed after being enqueued and
+it will be opened when needed. If the table or index does not exist later
+(has been DROPped), then it will be removed from the pool and skipped. */
+void
+dict_stats_defrag_pool_add(
+/*=======================*/
+	const dict_index_t*	index);	/*!< in: index to add */
+
+/*****************************************************************//**
+Delete a given index from the auto defrag pool. */
+void
+dict_stats_defrag_pool_del(
+/*=======================*/
+	const dict_table_t*	table,	/*!< in: if given, remove
+ all entries for the table */
+ const dict_index_t* index); /*!< in: index to remove */
+
+/*****************************************************************//**
+Get the first index that has been added for updating persistent defrag
+stats and eventually save its stats. */
+void
+dict_defrag_process_entries_from_defrag_pool();
+/*===========================================*/
+
+/*********************************************************************//**
+Save defragmentation result.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_save_defrag_summary(
+/*============================*/
+ dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Save defragmentation stats for a given index.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_save_defrag_stats(
+/*============================*/
+ dict_index_t* index); /*!< in: index */
+#endif /* dict0defrag_bg_h */
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
new file mode 100644
index 00000000..e17da733
--- /dev/null
+++ b/storage/innobase/include/dict0dict.h
@@ -0,0 +1,1804 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0dict.h
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0dict_h
+#define dict0dict_h
+
+#include "data0data.h"
+#include "dict0mem.h"
+#include "fsp0fsp.h"
+#include <deque>
+
+class MDL_ticket;
+extern bool innodb_table_stats_not_found;
+extern bool innodb_index_stats_not_found;
+
+/** the first table or index ID assigned to anything other than the
+hard-coded system tables */
+constexpr uint8_t DICT_HDR_FIRST_ID= 10;
+
+
+/** Get the database name length in a table name.
+@param name filename-safe encoded table name "dbname/tablename"
+@return database name length */
+inline size_t dict_get_db_name_len(const char *name)
+{
+ /* table_name_t::dblen() would assert that '/' is contained */
+ if (const char* s= strchr(name, '/'))
+ return size_t(s - name);
+
+ return 0;
+}
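+
+/* Example (name assumed for illustration):
+dict_get_db_name_len("test/t1") returns 4, the length of "test";
+a name containing no '/' yields 0. */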
+
+
+/*********************************************************************//**
+Open a table from its database and table name. This is currently used by
+the foreign constraint parser to get the referenced table.
+@return complete table name with database and table name, allocated from
+heap memory passed in */
+char*
+dict_get_referenced_table(
+/*======================*/
+ const char* name, /*!< in: foreign key table name */
+ const char* database_name, /*!< in: table db name */
+ ulint database_name_len,/*!< in: db name length */
+ const char* table_name, /*!< in: table name */
+ ulint table_name_len, /*!< in: table name length */
+ dict_table_t** table, /*!< out: table object or NULL */
+ mem_heap_t* heap, /*!< in: heap memory */
+ CHARSET_INFO* from_cs); /*!< in: table name charset */
+/*********************************************************************//**
+Frees a foreign key struct. */
+void
+dict_foreign_free(
+/*==============*/
+ dict_foreign_t* foreign); /*!< in, own: foreign key struct */
+/*********************************************************************//**
+Finds the highest [number] for foreign key constraints of the table. Looks
+only at the >= 4.0.18-format id's, which are of the form
+databasename/tablename_ibfk_[number].
+@return highest number, 0 if table has no new format foreign key constraints */
+ulint
+dict_table_get_highest_foreign_id(
+/*==============================*/
+ dict_table_t* table); /*!< in: table in the dictionary
+ memory cache */
+/** Check whether the dict_table_t is a partition.
+A partitioned table on the SQL level is composed of InnoDB tables,
+where each InnoDB table is a [sub]partition including its secondary indexes
+which belongs to the partition.
+@param[in] table Table to check.
+@return true if the dict_table_t is a partition else false. */
+UNIV_INLINE
+bool
+dict_table_is_partition(const dict_table_t* table)
+{
+ /* Check both P and p on all platforms in case it was moved to/from
+ WIN. */
+ return (strstr(table->name.m_name, "#p#")
+ || strstr(table->name.m_name, "#P#"));
+}
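+
+/* Example (names assumed for illustration): "db/t1#p#p0" and
+"db/t1#P#P0" are both treated as [sub]partitions, while "db/t1"
+is not. */
+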
+/********************************************************************//**
+Return the end of table name where we have removed dbname and '/'.
+@return table name */
+const char*
+dict_remove_db_name(
+/*================*/
+ const char* name) /*!< in: table name in the form
+ dbname '/' tablename */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Operation to perform when opening a table */
+enum dict_table_op_t {
+ /** Expect the tablespace to exist. */
+ DICT_TABLE_OP_NORMAL = 0,
+ /** Drop any orphan indexes after an aborted online index creation */
+ DICT_TABLE_OP_DROP_ORPHAN,
+ /** Silently load the tablespace if it does not exist,
+ and do not load the definitions of incomplete indexes. */
+ DICT_TABLE_OP_LOAD_TABLESPACE,
+ /** Open the table only if it's in table cache. */
+ DICT_TABLE_OP_OPEN_ONLY_IF_CACHED
+};
+
+/** Acquire MDL shared for the table name.
+@tparam trylock whether to use non-blocking operation
+@param[in,out] table table object
+@param[in,out] thd background thread
+@param[out] mdl mdl ticket
+@param[in] table_op operation to perform when opening
+@return table object after locking MDL shared
+@retval NULL if the table is not readable, or if trylock && MDL blocked */
+template<bool trylock>
+dict_table_t*
+dict_acquire_mdl_shared(dict_table_t *table,
+ THD *thd,
+ MDL_ticket **mdl,
+ dict_table_op_t table_op= DICT_TABLE_OP_NORMAL);
+
+/** Look up a table by numeric identifier.
+@param[in] table_id table identifier
+@param[in] dict_locked data dictionary locked
+@param[in] table_op operation to perform when opening
+@param[in,out] thd background thread, or NULL to not acquire MDL
+@param[out] mdl mdl ticket, or NULL
+@return table, NULL if does not exist */
+dict_table_t*
+dict_table_open_on_id(table_id_t table_id, bool dict_locked,
+ dict_table_op_t table_op, THD *thd= nullptr,
+ MDL_ticket **mdl= nullptr)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Decrements the count of open handles of a table.
+@param[in,out] table table
+@param[in] dict_locked data dictionary locked
+@param[in] try_drop try to drop any orphan indexes after
+ an aborted online index creation
+@param[in] thd thread to release MDL
+@param[in] mdl metadata lock or NULL if the thread is a
+ foreground one. */
+void
+dict_table_close(
+ dict_table_t* table,
+ bool dict_locked,
+ bool try_drop,
+ THD* thd = NULL,
+ MDL_ticket* mdl = NULL);
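+
+/* A usage sketch for the open/close pair above (caller context and
+error handling assumed):
+
+	dict_table_t*	t = dict_table_open_on_id(
+		id, false, DICT_TABLE_OP_NORMAL);
+	if (t) {
+		...
+		dict_table_close(t, false, false);
+	}
+*/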
+
+/*********************************************************************//**
+Closes the only open handle to a table and drops a table while assuring
+that dict_sys.mutex is held the whole time. This assures that the table
+is not evicted after the close when the count of open handles goes to zero.
+Because dict_sys.mutex is held, we do not need to call prevent_eviction(). */
+void
+dict_table_close_and_drop(
+/*======================*/
+ trx_t* trx, /*!< in: data dictionary transaction */
+ dict_table_t* table); /*!< in/out: table */
+
+/*********************************************************************//**
+Gets the minimum number of bytes per character.
+@return minimum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbminlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the maximum number of bytes per character.
+@return maximum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbmaxlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+ const dict_col_t* col, /*!< in: column */
+ dtype_t* type); /*!< out: data type */
+
+/**********************************************************************//**
+Determine bytes of column prefix to be stored in the undo log. Please
+note that if !dict_table_has_atomic_blobs(table), no prefix
+needs to be stored in the undo log.
+@return bytes of column prefix to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_field_len_store_undo(
+/*==========================*/
+ dict_table_t* table, /*!< in: table */
+ const dict_col_t* col) /*!< in: column which index prefix
+ is based on */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Determine maximum bytes of a virtual column need to be stored
+in the undo log.
+@param[in] table dict_table_t for the table
+@param[in] col_no virtual column number
+@return maximum bytes of virtual column to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_v_field_len_store_undo(
+ dict_table_t* table,
+ ulint col_no);
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ const dtype_t* type) /*!< in: data type */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dict_col_get_min_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dict_col_get_fixed_size(
+/*====================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+unsigned
+dict_col_get_sql_null_size(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the column number.
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+unsigned
+dict_col_get_no(
+/*============*/
+ const dict_col_t* col) /*!< in: column */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+ const dict_col_t* col, /*!< in: table column */
+ const dict_index_t* clust_index) /*!< in: clustered index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Gets the column position in the given index.
+@param[in] col table column
+@param[in] index index to be searched for column
+@return position of column in the given index. */
+UNIV_INLINE
+ulint
+dict_col_get_index_pos(
+ const dict_col_t* col,
+ const dict_index_t* index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/****************************************************************//**
+If the given column name is reserved for InnoDB system columns, return
+TRUE.
+@return TRUE if name is reserved */
+ibool
+dict_col_name_is_reserved(
+/*======================*/
+ const char* name) /*!< in: column name */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Unconditionally set the AUTO_INCREMENT counter.
+@param[in,out] table table or partition
+@param[in] value next available AUTO_INCREMENT value */
+MY_ATTRIBUTE((nonnull))
+UNIV_INLINE
+void
+dict_table_autoinc_initialize(dict_table_t* table, ib_uint64_t value)
+{
+ table->autoinc = value;
+}
+
+/**
+@param[in] table table or partition
+@return the next AUTO_INCREMENT counter value
+@retval 0 if AUTO_INCREMENT is not yet initialized */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+UNIV_INLINE
+ib_uint64_t
+dict_table_autoinc_read(const dict_table_t* table)
+{
+ return(table->autoinc);
+}
+
+/** Update the AUTO_INCREMENT sequence if the value supplied is greater
+than the current value.
+@param[in,out] table table or partition
+@param[in] value AUTO_INCREMENT value that was assigned to a row
+@return whether the AUTO_INCREMENT sequence was updated */
+MY_ATTRIBUTE((nonnull))
+UNIV_INLINE
+bool
+dict_table_autoinc_update_if_greater(dict_table_t* table, ib_uint64_t value)
+{
+ if (value > table->autoinc) {
+
+ table->autoinc = value;
+ return(true);
+ }
+
+ return(false);
+}
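+
+/* A usage sketch (exact caller convention assumed): after assigning
+the AUTO_INCREMENT value v to an inserted row, a caller could advance
+the counter with
+	dict_table_autoinc_update_if_greater(table, v + 1);
+so that the next value handed out is greater than v. */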
+
+/**********************************************************************//**
+Adds system columns to a table object. */
+void
+dict_table_add_system_columns(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ mem_heap_t* heap) /*!< in: temporary heap */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Renames a table object.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_table_rename_in_cache(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ const char* new_name, /*!< in: new name */
+ bool rename_also_foreigns,
+ /*!< in: in ALTER TABLE we want
+ to preserve the original table name
+ in constraints which reference it */
+ bool replace_new_file = false)
+ /*!< in: whether to replace the
+ file with the new name
+ (as part of rolling back TRUNCATE) */
+ MY_ATTRIBUTE((nonnull));
+
+/** Removes an index from the dictionary cache.
+@param[in,out] table table whose index to remove
+@param[in,out] index index to remove, this object is destroyed and must not
+be accessed by the caller afterwards */
+void
+dict_index_remove_from_cache(
+ dict_table_t* table,
+ dict_index_t* index);
+
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table object already in cache */
+ table_id_t new_id) /*!< in: new id to set */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Removes a foreign constraint struct from the dictionary cache. */
+void
+dict_foreign_remove_from_cache(
+/*===========================*/
+ dict_foreign_t* foreign) /*!< in, own: foreign constraint */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Adds a foreign key constraint object to the dictionary cache. May free
+the object if an object with the same identifier already exists in the
+cache. At least one of the foreign table and the referenced table must
+already be in the dictionary cache!
+@return DB_SUCCESS or error code */
+dberr_t
+dict_foreign_add_to_cache(
+/*======================*/
+ dict_foreign_t* foreign,
+ /*!< in, own: foreign key constraint */
+ const char** col_names,
+ /*!< in: column names, or NULL to use
+ foreign->foreign_table->col_names */
+ bool check_charsets,
+ /*!< in: whether to check charset
+ compatibility */
+ dict_err_ignore_t ignore_err)
+ /*!< in: error to be ignored */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+/*********************************************************************//**
+Checks if a table is referenced by foreign keys.
+@return TRUE if table is referenced by a foreign key */
+ibool
+dict_table_is_referenced_by_foreign_key(
+/*====================================*/
+ const dict_table_t* table) /*!< in: InnoDB table */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************************//**
+Replace the index passed in with another equivalent index in the
+foreign key lists of the table.
+@return whether all replacements were found */
+bool
+dict_foreign_replace_index(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const dict_index_t* index) /*!< in: index to be replaced */
+ MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
+/**********************************************************************//**
+Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
+@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
+constraint id does not match */
+dberr_t
+dict_foreign_parse_drop_constraints(
+/*================================*/
+ mem_heap_t* heap, /*!< in: heap from which we can
+ allocate memory */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: table */
+ ulint* n, /*!< out: number of constraints
+ to drop */
+ const char*** constraints_to_drop) /*!< out: id's of the
+ constraints to drop */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************************//**
+Returns a table object and increments its open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low
+is usually the appropriate function.
+@param[in] table_name Table name
+@param[in] dict_locked TRUE=data dictionary locked
+@param[in] try_drop TRUE=try to drop any orphan indexes after
+ an aborted online index creation
+@param[in] ignore_err error to be ignored when loading the table
+@return table, NULL if does not exist */
+dict_table_t*
+dict_table_open_on_name(
+ const char* table_name,
+ ibool dict_locked,
+ ibool try_drop,
+ dict_err_ignore_t ignore_err)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Outcome of dict_foreign_find_index() or dict_foreign_qualify_index() */
+enum fkerr_t
+{
+ /** A backing index was found for a FOREIGN KEY constraint */
+ FK_SUCCESS = 0,
+ /** There is no index that covers the columns in the constraint. */
+ FK_INDEX_NOT_FOUND,
+  /** The index is defined on a column prefix, not the full column. */
+  FK_IS_PREFIX_INDEX,
+ /** A condition of SET NULL conflicts with a NOT NULL column. */
+ FK_COL_NOT_NULL,
+ /** The column types do not match */
+ FK_COLS_NOT_EQUAL
+};
+
+/*********************************************************************//**
+Tries to find an index whose first fields are the columns in the array,
+in the same order and is not marked for deletion and is not the same
+as types_idx.
+@return matching index, NULL if not found */
+dict_index_t*
+dict_foreign_find_index(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols, /*!< in: number of columns */
+ const dict_index_t* types_idx,
+ /*!< in: NULL or an index
+ whose types the column types
+ must match */
+ bool check_charsets,
+ /*!< in: whether to check
+ charsets. only has an effect
+ if types_idx != NULL */
+ ulint check_null,
+ /*!< in: nonzero if none of
+ the columns must be declared
+ NOT NULL */
+ fkerr_t* error = NULL, /*!< out: error code */
+ ulint* err_col_no = NULL,
+ /*!< out: column number where
+ error happened */
+ dict_index_t** err_index = NULL)
+ /*!< out: index where error
+ happened */
+
+ MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
+
+/** Returns a virtual column's name.
+@param[in] table table object
+@param[in]	col_nr	virtual column number (nth virtual column)
+@return column name. */
+const char*
+dict_table_get_v_col_name(
+ const dict_table_t* table,
+ ulint col_nr);
+
+/** Check if the table has a given column.
+@param[in] table table object
+@param[in] col_name column name
+@param[in] col_nr column number guessed, 0 as default
+@return column number if the table has the specified column,
+otherwise table->n_def */
+ulint
+dict_table_has_column(
+ const dict_table_t* table,
+ const char* col_name,
+ ulint col_nr = 0);
+
+/**********************************************************************//**
+Outputs info on foreign keys of a table. */
+std::string
+dict_print_info_on_foreign_keys(
+/*============================*/
+ ibool create_table_format, /*!< in: if TRUE then print in
+ a format suitable to be inserted into
+ a CREATE TABLE, otherwise in the format
+ of SHOW TABLE STATUS */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table); /*!< in: table */
+
+/**********************************************************************//**
+Outputs info on a foreign key of a table in a format suitable for
+CREATE TABLE. */
+std::string
+dict_print_info_on_foreign_key_in_create_format(
+/*============================================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ ibool add_newline); /*!< in: whether to add a newline */
+
+/*********************************************************************//**
+Checks if the given index qualifies as a candidate for a foreign key
+constraint: its first fields must be the given columns in the same
+order, it must not be marked for deletion, and it must not be the same
+as types_idx.
+@return true if the index qualifies, false otherwise */
+bool
+dict_foreign_qualify_index(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols, /*!< in: number of columns */
+ const dict_index_t* index, /*!< in: index to check */
+ const dict_index_t* types_idx,
+ /*!< in: NULL or an index
+ whose types the column types
+ must match */
+ bool check_charsets,
+ /*!< in: whether to check
+ charsets. only has an effect
+ if types_idx != NULL */
+ ulint check_null,
+ /*!< in: nonzero if none of
+ the columns must be declared
+ NOT NULL */
+ fkerr_t* error, /*!< out: error code */
+ ulint* err_col_no,
+ /*!< out: column number where
+ error happened */
+ dict_index_t** err_index)
+ /*!< out: index where error
+ happened */
+ MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the last index on the table.
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_last_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+# define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes)
+# define dict_table_get_last_index(table) UT_LIST_GET_LAST((table)->indexes)
+# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index)
+#endif /* UNIV_DEBUG */
+
+/* Skip corrupted index */
+#define dict_table_skip_corrupt_index(index) \
+ while (index && index->is_corrupted()) { \
+ index = dict_table_get_next_index(index); \
+ }
+
+/* Get the next non-corrupt index */
+#define dict_table_next_uncorrupted_index(index) \
+do { \
+ index = dict_table_get_next_index(index); \
+ dict_table_skip_corrupt_index(index); \
+} while (0)
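+
+/* A usage sketch for the two macros above (locking requirements of
+the caller assumed): iterate over all non-corrupted indexes of a table:
+
+	dict_index_t*	index = dict_table_get_first_index(table);
+	dict_table_skip_corrupt_index(index);
+	while (index) {
+		...
+		dict_table_next_uncorrupted_index(index);
+	}
+*/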
+
+#define dict_index_is_clust(index) (index)->is_clust()
+#define dict_index_is_auto_gen_clust(index) (index)->is_gen_clust()
+#define dict_index_is_unique(index) (index)->is_unique()
+#define dict_index_is_spatial(index) (index)->is_spatial()
+#define dict_index_is_ibuf(index) (index)->is_ibuf()
+#define dict_index_is_sec_or_ibuf(index) !(index)->is_primary()
+#define dict_index_has_virtual(index) (index)->has_virtual()
+
+/** Get all the FTS indexes on a table.
+@param[in] table table
+@param[out] indexes all FTS indexes on this table
+@return number of FTS indexes */
+ulint
+dict_table_get_all_fts_indexes(
+ const dict_table_t* table,
+ ib_vector_t* indexes);
+
+/********************************************************************//**
+Gets the number of user-defined non-virtual columns in a table in the
+dictionary cache.
+@return number of user-defined (e.g., not ROW_ID) non-virtual
+columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_user_cols(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((warn_unused_result));
+/********************************************************************//**
+Gets the number of all non-virtual columns (including system columns)
+in a table in the dictionary cache.
+@return number of columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_cols(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Gets the number of virtual columns in a table in the dictionary cache.
+@param[in] table the table to check
+@return number of virtual columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_v_cols(
+ const dict_table_t* table);
+
+/** Check if a table has indexed virtual columns
+@param[in] table the table to check
+@return true if the table has indexed virtual columns */
+UNIV_INLINE
+bool
+dict_table_has_indexed_v_cols(
+ const dict_table_t* table);
+
+/********************************************************************//**
+Gets the approximate (estimated) number of rows in the table.
+@return estimated number of rows */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((warn_unused_result));
+/********************************************************************//**
+Increment the number of rows in the table by one.
+Notice that this operation is not protected by any latch; the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+ MY_ATTRIBUTE((nonnull));
+/********************************************************************//**
+Decrement the number of rows in the table by one.
+Notice that this operation is not protected by any latch; the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_dec(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+ MY_ATTRIBUTE((nonnull));
+
+/** Get nth virtual column
+@param[in] table target table
+@param[in] col_nr column number in MySQL Table definition
+@return dict_v_col_t ptr */
+dict_v_col_t*
+dict_table_get_nth_v_col_mysql(
+ const dict_table_t* table,
+ ulint col_nr);
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint pos) /*!< in: position of column */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Gets the nth virtual column of a table.
+@param[in] table table
+@param[in] pos position of virtual column
+@return pointer to virtual column object */
+UNIV_INLINE
+dict_v_col_t*
+dict_table_get_nth_v_col(
+ const dict_table_t* table,
+ ulint pos);
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ unsigned sys) /*!< in: DATA_ROW_ID, ... */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+#define dict_table_get_nth_col(table, pos) (&(table)->cols[pos])
+#define dict_table_get_sys_col(table, sys) \
+ &(table)->cols[(table)->n_cols + (sys) - DATA_N_SYS_COLS]
+/* Get nth virtual columns */
+#define dict_table_get_nth_v_col(table, pos) (&(table)->v_cols[pos])
+#endif /* UNIV_DEBUG */
+/** Wrapper function.
+@see dict_col_t::name()
+@param[in] table table
+@param[in] col_nr column number in table
+@return column name */
+inline
+const char*
+dict_table_get_col_name(const dict_table_t* table, ulint col_nr)
+{
+ return(dict_table_get_nth_col(table, col_nr)->name(*table));
+}
+
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+unsigned
+dict_table_get_sys_col_no(
+/*======================*/
+ const dict_table_t* table, /*!< in: table */
+ unsigned sys) /*!< in: DATA_ROW_ID, ... */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+unsigned
+dict_index_get_min_size(
+/*====================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#define dict_table_is_comp(table) (table)->not_redundant()
+
+/** Determine if a table uses atomic BLOBs (no locally stored prefix).
+@param[in] table InnoDB table
+@return whether BLOBs are atomic */
+inline
+bool
+dict_table_has_atomic_blobs(const dict_table_t* table)
+{
+ return(DICT_TF_HAS_ATOMIC_BLOBS(table->flags));
+}
+
+/** @return potential max length stored inline for externally stored fields */
+inline size_t dict_table_t::get_overflow_field_local_len() const
+{
+ if (dict_table_has_atomic_blobs(this)) {
+ /* ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED: do not
+ store any BLOB prefix locally */
+ return BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ /* up to MySQL 5.1: store a 768-byte prefix locally */
+ return BTR_EXTERN_FIELD_REF_SIZE + DICT_ANTELOPE_MAX_INDEX_COL_LEN;
+}
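+
+/* Illustration, assuming the usual InnoDB constants
+BTR_EXTERN_FIELD_REF_SIZE == 20 and DICT_ANTELOPE_MAX_INDEX_COL_LEN
+== 768: tables with atomic BLOBs store only the 20-byte field
+reference locally, while ROW_FORMAT=REDUNDANT or COMPACT tables store
+768 + 20 == 788 bytes of each overflow field inline. */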
+
+/** Set the various values in a dict_table_t::flags pointer.
+@param[in,out]	flags		pointer to a 4-byte table flags field
+@param[in]	format		file format
+@param[in]	zip_ssize	zip shift size
+@param[in]	use_data_dir	table uses DATA DIRECTORY
+@param[in]	page_compressed	table uses page compression
+@param[in]	page_compression_level	page compression level */
+UNIV_INLINE
+void
+dict_tf_set(
+ ulint* flags,
+ rec_format_t format,
+ ulint zip_ssize,
+ bool use_data_dir,
+ bool page_compressed,
+ ulint page_compression_level);
+
+/** Convert a 32 bit integer table flags to the 32 bit FSP Flags.
+Fsp Flags are written into the tablespace header at the offset
+FSP_SPACE_FLAGS and are also stored in the fil_space_t::flags field.
+The following chart shows the translation of the low order bit.
+Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags | 0 | 1 | 1 | 1
+fil_space_t::flags | 0 | 0 | 1 | 1
+==================================================================
+@param[in] table_flags dict_table_t::flags
+@return tablespace flags (fil_space_t::flags) */
+UNIV_INLINE
+ulint
+dict_tf_to_fsp_flags(ulint table_flags)
+ MY_ATTRIBUTE((const));
+
+
+/** Extract the ROW_FORMAT=COMPRESSED page size from table flags.
+@param[in] flags flags
+@return ROW_FORMAT=COMPRESSED page size
+@retval 0 if not compressed */
+inline ulint dict_tf_get_zip_size(ulint flags)
+{
+ flags &= DICT_TF_MASK_ZIP_SSIZE;
+ return flags
+ ? (UNIV_ZIP_SIZE_MIN >> 1)
+ << (FSP_FLAGS_GET_ZIP_SSIZE(flags >> DICT_TF_POS_ZIP_SSIZE
+ << FSP_FLAGS_POS_ZIP_SSIZE))
+ : 0;
+}
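+
+/* A worked example, assuming UNIV_ZIP_SIZE_MIN == 1024: a table
+created with KEY_BLOCK_SIZE=4 carries zip_ssize == 3 in its flags, and
+the expression above evaluates to (1024 >> 1) << 3 == 4096 bytes;
+flags with a zero ZIP_SSIZE field yield 0 (not compressed). */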
+
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Column prefixes are treated like whole columns.
+@return TRUE if the column, or its prefix, is in the clustered key */
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n) /*!< in: column number */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*******************************************************************//**
+Check if the table has an FTS index.
+@return TRUE if table has an FTS index */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+ dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Copies types of virtual columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value. This function should
+be called right after dtuple_create().
+@param[in,out] tuple data tuple
+@param[in] table table
+*/
+void
+dict_table_copy_v_types(
+ dtuple_t* tuple,
+ const dict_table_t* table);
+
+/*******************************************************************//**
+Copies types of columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value. This function should
+be called right after dtuple_create(). */
+void
+dict_table_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Looks for an index with the given id. NOTE that we do not reserve
+the dictionary mutex: this function is for emergency purposes like
+printing info of a corrupt database page!
+@return index or NULL if not found from cache */
+dict_index_t*
+dict_index_find_on_id_low(
+/*======================*/
+ index_id_t id) /*!< in: index id */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************************//**
+Make room in the table cache by evicting an unused table. The unused table
+should not be part of FK relationship and currently not used in any user
+transaction. There is no guarantee that it will remove a table.
+@return number of tables evicted. */
+ulint
+dict_make_room_in_cache(
+/*====================*/
+ ulint max_tables, /*!< in: max tables allowed in cache */
+ ulint pct_check); /*!< in: max percent to check */
+
+/** Adds an index to the dictionary cache, possibly indexing newly
+added virtual columns.
+@param[in,out] index index; NOTE! The index memory
+ object is freed in this function!
+@param[in] page_no root page number of the index
+@param[in] add_v virtual columns being added along with ADD INDEX
+@return DB_SUCCESS, or DB_CORRUPTION */
+dberr_t
+dict_index_add_to_cache(
+ dict_index_t*& index,
+ ulint page_no,
+ const dict_add_v_col_t* add_v = NULL)
+ MY_ATTRIBUTE((warn_unused_result));
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_fields(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal
+ representation of index (in
+ the dictionary cache) */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** The number of fields in a nonleaf page of a spatial index,
+excluding the page number field. */
+#define DICT_INDEX_SPATIAL_NODEPTR_SIZE 1
+/**
+Gets the number of fields on the nonleaf page level in the internal
+representation of an index which uniquely determine the position of an
+index entry in the index, if we also take multiversioning into account.
+Note that this does not include the page number field.
+@param[in] index index
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree_nonleaf(
+ const dict_index_t* index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation we add the row id to the ordering fields to make all indexes
+unique, but this function returns the number of fields the user defined
+in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of field */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+# define dict_index_get_nth_field(index, pos) ((index)->fields + (pos))
+#endif /* UNIV_DEBUG */
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Looks for column n in an index.
+@param[in]	index		index
+@param[in]	n		column number
+@param[in]	inc_prefix	true=consider column prefixes too
+@param[in]	is_virtual	true=virtual column
+@param[out]	prefix_col_pos	col num if prefix
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+ulint
+dict_index_get_nth_col_or_prefix_pos(
+	const dict_index_t*	index,
+	ulint			n,
+	bool			inc_prefix,
+	bool			is_virtual,
+	ulint*			prefix_col_pos)
+	MY_ATTRIBUTE((warn_unused_result));
+/********************************************************************//**
+Looks for a matching field in an index. The column has to be the same. The
+column in index must be complete, or must contain a prefix longer than the
+column in index2. That is, we must be able to construct the prefix in index2
+from the prefix in index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+ulint
+dict_index_get_nth_field_pos(
+/*=========================*/
+ const dict_index_t* index, /*!< in: index from which to search */
+ const dict_index_t* index2, /*!< in: index */
+ ulint n) /*!< in: field number in index2 */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Looks for the position of column n in the clustered index.
+@return position in internal representation of the clustered index */
+unsigned
+dict_table_get_nth_col_pos(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+/*******************************************************************//**
+Adds a column to index. */
+void
+dict_index_add_col(
+/*===============*/
+ dict_index_t* index, /*!< in/out: index */
+ const dict_table_t* table, /*!< in: table */
+ dict_col_t* col, /*!< in: column */
+ ulint prefix_len) /*!< in: column prefix length */
+ MY_ATTRIBUTE((nonnull));
+
+/*******************************************************************//**
+Copies types of fields contained in index to tuple. */
+void
+dict_index_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_index_t* index, /*!< in: index */
+ ulint n_fields) /*!< in: number of
+ field types to copy */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field) /*!< in: index field */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+Assumes that dict_sys.mutex is already held.
+@return index, NULL if not found */
+dict_index_t*
+dict_index_get_if_in_cache_low(
+/*===========================*/
+ index_id_t index_id) /*!< in: index id */
+ MY_ATTRIBUTE((warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+ index_id_t index_id) /*!< in: index id */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************************//**
+Checks that a tuple's n_fields_cmp value is in a sensible range, so that
+no comparison can occur with the page number field in a node pointer.
+@return TRUE if ok */
+ibool
+dict_index_check_search_tuple(
+/*==========================*/
+ const dict_index_t* index, /*!< in: index tree */
+ const dtuple_t* tuple) /*!< in: tuple used in a search */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Whether and when to allow temporary index names */
+enum check_name {
+ /** Require all indexes to be complete. */
+ CHECK_ALL_COMPLETE,
+ /** Allow aborted online index creation. */
+ CHECK_ABORTED_OK,
+ /** Allow partial indexes to exist. */
+ CHECK_PARTIAL_OK
+};
+/**********************************************************************//**
+Check for duplicate index entries in a table (matching by index name) */
+void
+dict_table_check_for_dup_indexes(
+/*=============================*/
+ const dict_table_t* table, /*!< in: Check for dup indexes
+ in this table */
+ enum check_name check) /*!< in: whether and when to allow
+ temporary index names */
+ MY_ATTRIBUTE((nonnull));
+#endif /* UNIV_DEBUG */
+/**********************************************************************//**
+Builds a node pointer out of a physical record and a page number.
+@return own: node pointer */
+dtuple_t*
+dict_index_build_node_ptr(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to build node
+ pointer */
+ ulint page_no,/*!< in: page number to put in node
+ pointer */
+ mem_heap_t* heap, /*!< in: memory heap where pointer
+ created */
+ ulint level) /*!< in: level of rec in tree:
+ 0 means leaf level */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Convert a physical record into a search tuple.
+@param[in] rec index record (not necessarily in an index page)
+@param[in] index index
+@param[in] leaf whether rec is in a leaf page
+@param[in] n_fields number of data fields
+@param[in,out] heap memory heap for allocation
+@return own: data tuple */
+dtuple_t*
+dict_index_build_data_tuple(
+ const rec_t* rec,
+ const dict_index_t* index,
+ bool leaf,
+ ulint n_fields,
+ mem_heap_t* heap)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+uint32_t
+dict_index_get_page(
+/*================*/
+ const dict_index_t* tree) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the read-write lock of the index tree.
+@return read-write lock */
+UNIV_INLINE
+rw_lock_t*
+dict_index_get_lock(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void);
+/*==============================*/
+
+/* Online index creation @{ */
+/********************************************************************//**
+Gets the status of online index creation.
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+ const dict_index_t* index) /*!< in: secondary index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Sets the status of online index creation. */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+/*=========================*/
+ dict_index_t* index, /*!< in/out: index */
+ enum online_index_status status) /*!< in: status */
+ MY_ATTRIBUTE((nonnull));
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Calculates the minimum record length in an index. */
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#define dict_mutex_enter_for_mysql() mutex_enter(&dict_sys.mutex)
+#define dict_mutex_exit_for_mysql() mutex_exit(&dict_sys.mutex)
+
+/********************************************************************//**
+Checks if the database name in two table names is the same.
+@return TRUE if same db name */
+ibool
+dict_tables_have_same_db(
+/*=====================*/
+ const char* name1, /*!< in: table name in the form
+ dbname '/' tablename */
+ const char* name2) /*!< in: table name in the form
+ dbname '/' tablename */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Get an index by name.
+@param[in] table the table where to look for the index
+@param[in] name the index name to look for
+@return index, NULL if does not exist */
+dict_index_t*
+dict_table_get_index_on_name(dict_table_t* table, const char* name)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Get an index by name.
+@param[in] table the table where to look for the index
+@param[in] name the index name to look for
+@return index, NULL if does not exist */
+inline
+const dict_index_t*
+dict_table_get_index_on_name(const dict_table_t* table, const char* name)
+{
+ return dict_table_get_index_on_name(const_cast<dict_table_t*>(table),
+ name);
+}
+
+/***************************************************************
+Check whether a column exists in an FTS index.
+@return ULINT_UNDEFINED if no match else the offset within the vector */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+	ib_vector_t*	indexes,/*!< in: vector containing only FTS indexes */
+	ulint		col_no,	/*!< in: col number to search for */
+	bool		is_virtual)/*!< in: whether it is a virtual column */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Looks for an index with the given id given a table instance.
+@param[in] table table instance
+@param[in] id index id
+@return index or NULL */
+dict_index_t*
+dict_table_find_index_on_id(
+ const dict_table_t* table,
+ index_id_t id)
+ MY_ATTRIBUTE((nonnull(1)));
+
+/** Maximum number of columns in a foreign key constraint. Please note
+that MySQL has a much lower limit on the number of columns allowed in a
+foreign key constraint. */
+#define MAX_NUM_FK_COLUMNS 500
+
+/* Buffers for storing detailed information about the latest foreign key
+and unique key errors */
+extern FILE* dict_foreign_err_file;
+extern ib_mutex_t dict_foreign_err_mutex; /* mutex protecting the
+ foreign key error messages */
+
+/** InnoDB data dictionary cache */
+class dict_sys_t
+{
+public:
+ DictSysMutex mutex; /*!< mutex protecting the data
+ dictionary; protects also the
+ disk-based dictionary system tables;
+ this mutex serializes CREATE TABLE
+ and DROP TABLE, as well as reading
+ the dictionary data for a table from
+ system tables */
+ /** @brief the data dictionary rw-latch protecting dict_sys
+
+	Table create, drop, etc. reserve this in X-mode; implicit or
+	background operations (purge, rollback, foreign key checks) reserve
+	this in S-mode; not all internal InnoDB operations are covered by MDL.
+
+ This latch also prevents lock waits when accessing the InnoDB
+ data dictionary tables. @see trx_t::dict_operation_lock_mode */
+ rw_lock_t latch;
+ row_id_t row_id; /*!< the next row id to assign;
+ NOTE that at a checkpoint this
+ must be written to the dict system
+ header and flushed to a file; in
+ recovery this must be derived from
+ the log records */
+ hash_table_t table_hash; /*!< hash table of the tables, based
+ on name */
+ /** hash table of persistent table IDs */
+ hash_table_t table_id_hash;
+ dict_table_t* sys_tables; /*!< SYS_TABLES table */
+ dict_table_t* sys_columns; /*!< SYS_COLUMNS table */
+ dict_table_t* sys_indexes; /*!< SYS_INDEXES table */
+ dict_table_t* sys_fields; /*!< SYS_FIELDS table */
+ dict_table_t* sys_virtual; /*!< SYS_VIRTUAL table */
+
+ /*=============================*/
+ UT_LIST_BASE_NODE_T(dict_table_t)
+ table_LRU; /*!< List of tables that can be evicted
+ from the cache */
+ UT_LIST_BASE_NODE_T(dict_table_t)
+ table_non_LRU; /*!< List of tables that can't be
+ evicted from the cache */
+private:
+ bool m_initialised;
+ /** the sequence of temporary table IDs */
+ std::atomic<table_id_t> temp_table_id;
+ /** hash table of temporary table IDs */
+ hash_table_t temp_id_hash;
+public:
+ /** @return a new temporary table ID */
+ table_id_t get_temporary_table_id() {
+ return temp_table_id.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ /** Look up a temporary table.
+ @param id temporary table ID
+ @return temporary table
+ @retval NULL if the table does not exist
+ (should only happen during the rollback of CREATE...SELECT) */
+ dict_table_t* get_temporary_table(table_id_t id)
+ {
+ ut_ad(mutex_own(&mutex));
+ dict_table_t* table;
+ ulint fold = ut_fold_ull(id);
+ HASH_SEARCH(id_hash, &temp_id_hash, fold, dict_table_t*, table,
+ ut_ad(table->cached), table->id == id);
+ if (UNIV_LIKELY(table != NULL)) {
+ DBUG_ASSERT(table->is_temporary());
+ DBUG_ASSERT(table->id >= DICT_HDR_FIRST_ID);
+ table->acquire();
+ }
+ return table;
+ }
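+
+	/* Example (an illustrative sketch, not part of the API): looking
+	up a temporary table while holding the dictionary mutex, as the
+	assertion above requires. The calling context and the use() call
+	are assumptions made up for illustration.
+
+		mutex_enter(&dict_sys.mutex);
+		if (dict_table_t* t = dict_sys.get_temporary_table(id)) {
+			use(t);		// reference already acquired
+			t->release();
+		}
+		mutex_exit(&dict_sys.mutex);
+	*/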
+
+ /** Look up a persistent table.
+ @param id table ID
+ @return table
+ @retval NULL if not cached */
+ dict_table_t* get_table(table_id_t id)
+ {
+ ut_ad(mutex_own(&mutex));
+ dict_table_t* table;
+ ulint fold = ut_fold_ull(id);
+ HASH_SEARCH(id_hash, &table_id_hash, fold, dict_table_t*,
+ table,
+ ut_ad(table->cached), table->id == id);
+ DBUG_ASSERT(!table || !table->is_temporary());
+ return table;
+ }
+
+ /**
+ Constructor. Further initialisation happens in create().
+ */
+
+ dict_sys_t() : m_initialised(false), temp_table_id(DICT_HDR_FIRST_ID) {}
+
+ bool is_initialised() const { return m_initialised; }
+
+ /** Initialise the data dictionary cache. */
+ void create();
+
+ /** Close the data dictionary cache on shutdown. */
+ void close();
+
+ /** Resize the hash tables based on the current buffer pool size. */
+ void resize();
+
+ /** Add a table definition to the data dictionary cache */
+ inline void add(dict_table_t* table);
+ /** Remove a table definition from the data dictionary cache.
+ @param[in,out] table cached table definition to be evicted
+	@param[in]	lru	whether this is part of least-recently-used eviction
+ @param[in] keep whether to keep (not free) the object */
+ void remove(dict_table_t* table, bool lru = false, bool keep = false);
+
+#ifdef UNIV_DEBUG
+ /** Find a table */
+ template <bool in_lru> bool find(dict_table_t* table)
+ {
+ ut_ad(table);
+ ut_ad(table->can_be_evicted == in_lru);
+ ut_ad(mutex_own(&mutex));
+ for (const dict_table_t* t = UT_LIST_GET_FIRST(in_lru
+ ? table_LRU : table_non_LRU);
+ t; t = UT_LIST_GET_NEXT(table_LRU, t))
+ {
+ if (t == table) return true;
+ ut_ad(t->can_be_evicted == in_lru);
+ }
+ return false;
+ }
+ /** Find a table */
+ bool find(dict_table_t* table)
+ {
+ return table->can_be_evicted ? find<true>(table) : find<false>(table);
+ }
+#endif
+
+ /** Move a table to the non-LRU list from the LRU list. */
+ void prevent_eviction(dict_table_t* table)
+ {
+ ut_ad(find(table));
+ if (table->can_be_evicted)
+ {
+ table->can_be_evicted = FALSE;
+ UT_LIST_REMOVE(table_LRU, table);
+ UT_LIST_ADD_LAST(table_non_LRU, table);
+ }
+ }
+ /** Acquire a reference to a cached table. */
+ inline void acquire(dict_table_t* table);
+
+#ifdef UNIV_DEBUG
+ /** Assert that the data dictionary is locked */
+ void assert_locked()
+ {
+ ut_ad(mutex_own(&mutex));
+ ut_ad(rw_lock_own(&latch, RW_LOCK_X));
+ }
+#endif
+ /** Lock the data dictionary cache. */
+ void lock(const char* file, unsigned line)
+ {
+ rw_lock_x_lock_func(&latch, 0, file, line);
+ mutex_enter_loc(&mutex, file, line);
+ }
+
+ /** Unlock the data dictionary cache. */
+ void unlock()
+ {
+ mutex_exit(&mutex);
+ rw_lock_x_unlock(&latch);
+ }
+
+	/** Estimate the memory occupied by the data dictionary
+ table and index objects.
+ @return number of bytes occupied */
+ ulint rough_size() const
+ {
+ /* No mutex; this is a very crude approximation anyway */
+ ulint size = UT_LIST_GET_LEN(table_LRU) + UT_LIST_GET_LEN(table_non_LRU);
+ size *= sizeof(dict_table_t)
+ + sizeof(dict_index_t) * 2
+ + (sizeof(dict_col_t) + sizeof(dict_field_t)) * 10
+ + sizeof(dict_field_t) * 5 /* total number of key fields */
+ + 200; /* arbitrary, covering names and overhead */
+ size += (table_hash.n_cells + table_id_hash.n_cells
+ + temp_id_hash.n_cells) * sizeof(hash_cell_t);
+ return size;
+ }
+};
+
+/** the data dictionary cache */
+extern dict_sys_t dict_sys;
+
+#define dict_table_prevent_eviction(table) dict_sys.prevent_eviction(table)
+#define dict_sys_lock() dict_sys.lock(__FILE__, __LINE__)
+#define dict_sys_unlock() dict_sys.unlock()
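+
+/* Example (an illustrative sketch): guarding a dictionary cache
+operation with the wrappers above; the body is a placeholder.
+
+	dict_sys_lock();
+	// ... read or modify the data dictionary cache in X-mode ...
+	dict_sys_unlock();
+*/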
+
+/* Auxiliary structs for checking a table definition @{ */
+
+/* This struct is used to specify the name and type that a column must
+have when checking a table's schema. */
+struct dict_col_meta_t {
+ const char* name; /* column name */
+ ulint mtype; /* required column main type */
+ ulint prtype_mask; /* required column precise type mask;
+ if this is non-zero then all the
+ bits it has set must also be set
+ in the column's prtype */
+ ulint len; /* required column length */
+};
+
+/* This struct is used for checking whether a given table exists and
+whether it has a predefined schema (number of columns and column names
+and types) */
+struct dict_table_schema_t {
+ const char* table_name; /* the name of the table whose
+ structure we are checking */
+ ulint n_cols; /* the number of columns the
+ table must have */
+ dict_col_meta_t* columns; /* metadata for the columns;
+ this array has n_cols
+ elements */
+ ulint n_foreign; /* number of foreign keys this
+ table has, pointing to other
+ tables (where this table is
+ FK child) */
+ ulint n_referenced; /* number of foreign keys other
+ tables have, pointing to this
+ table (where this table is
+ parent) */
+};
+/* @} */
+
+/*********************************************************************//**
+Checks whether a table exists and whether it has the given structure.
+The table must have the same number of columns with the same names and
+types. The order of the columns does not matter.
+The caller must own the dictionary mutex.
+dict_table_schema_check() @{
+@return DB_SUCCESS if the table exists and contains the necessary columns */
+dberr_t
+dict_table_schema_check(
+/*====================*/
+ dict_table_schema_t* req_schema, /*!< in/out: required table
+ schema */
+ char* errstr, /*!< out: human readable error
+ message if != DB_SUCCESS and
+ != DB_TABLE_NOT_FOUND is
+ returned */
+ size_t errstr_sz) /*!< in: errstr size */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/* @} */
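+
+/* Example (an illustrative sketch): validating a required schema with
+dict_table_schema_check(), while holding the dictionary mutex as
+documented above. The table name, column names and types below are
+assumptions made up for illustration.
+
+	dict_col_meta_t		cols[] = {
+		{"id",   DATA_INT,     DATA_NOT_NULL, 4},
+		{"name", DATA_VARCHAR, 0,             0}
+	};
+	dict_table_schema_t	schema = {"db/example", 2, cols, 0, 0};
+	char			errstr[512];
+	dberr_t			err = dict_table_schema_check(
+		&schema, errstr, sizeof errstr);
+*/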
+
+/*********************************************************************//**
+Converts a database and table name from filesystem encoding
+(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) into two
+strings in UTF-8 encoding (e.g. dцb and aюbØc). The output buffers must be
+at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */
+void
+dict_fs2utf8(
+/*=========*/
+ const char* db_and_table, /*!< in: database and table names,
+ e.g. d@i1b/a@q1b@1Kc */
+ char* db_utf8, /*!< out: database name, e.g. dцb */
+ size_t db_utf8_size, /*!< in: dbname_utf8 size */
+ char* table_utf8, /*!< out: table name, e.g. aюbØc */
+ size_t table_utf8_size)/*!< in: table_utf8 size */
+ MY_ATTRIBUTE((nonnull));
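+
+/* Example (an illustrative sketch): decoding a filesystem-encoded name
+into UTF-8, using the documented minimum buffer sizes.
+
+	char	db[MAX_DB_UTF8_LEN];
+	char	tbl[MAX_TABLE_UTF8_LEN];
+	dict_fs2utf8("d@i1b/a@q1b@1Kc", db, sizeof db, tbl, sizeof tbl);
+*/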
+
+/**********************************************************************//**
+Check whether the table is corrupted.
+@return nonzero for corrupted table, zero for valid tables */
+UNIV_INLINE
+ulint
+dict_table_is_corrupted(
+/*====================*/
+ const dict_table_t* table) /*!< in: table */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Flags an index and table corrupted both in the data dictionary cache
+and in the system table SYS_INDEXES. */
+void
+dict_set_corrupted(
+/*===============*/
+ dict_index_t* index, /*!< in/out: index */
+ trx_t* trx, /*!< in/out: transaction */
+ const char* ctx) /*!< in: context */
+ ATTRIBUTE_COLD __attribute__((nonnull));
+
+/** Flags an index corrupted in the data dictionary cache only. This
+is used mostly to mark a corrupted index when the index's own dictionary
+information is corrupted, and we force the index to be loaded for repair
+purposes.
+void
+dict_set_corrupted_index_cache_only(
+ dict_index_t* index);
+
+/**********************************************************************//**
+Flags a table with specified space_id corrupted in the table dictionary
+cache.
+@return true if successful */
+bool dict_set_corrupted_by_space(const fil_space_t* space);
+
+/** Flag a table encrypted in the data dictionary cache. */
+void dict_set_encrypted_by_space(const fil_space_t* space);
+
+/** Sets merge_threshold in the SYS_INDEXES
+@param[in,out] index index
+@param[in] merge_threshold value to set */
+void
+dict_index_set_merge_threshold(
+ dict_index_t* index,
+ ulint merge_threshold);
+
+#ifdef UNIV_DEBUG
+/** Sets merge_threshold for all indexes in dictionary cache for debug.
+@param[in] merge_threshold_all value to set for all indexes */
+void
+dict_set_merge_threshold_all_debug(
+ uint merge_threshold_all);
+#endif /* UNIV_DEBUG */
+
+/** Validate the table flags.
+@param[in] flags Table flags
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+ ulint flags);
+
+/** Validate both table flags and table flags2 and make sure they
+are compatible.
+@param[in] flags Table flags
+@param[in] flags2 Table flags2
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf2_is_valid(
+ ulint flags,
+ ulint flags2);
+
+/*********************************************************************//**
+This function should be called whenever a page is successfully
+compressed. Updates the compression padding information. */
+void
+dict_index_zip_success(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+This function should be called whenever a page compression attempt
+fails. Updates the compression padding information. */
+void
+dict_index_zip_failure(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Return the optimal page size, for which the page will likely compress.
+@return page size beyond which the page may not compress */
+ulint
+dict_index_zip_pad_optimal_page_size(
+/*=================================*/
+ dict_index_t* index) /*!< in: index for which page size
+ is requested */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Convert table flags to a row format string.
+@return row format name */
+const char*
+dict_tf_to_row_format_string(
+/*=========================*/
+	ulint	table_flag);	/*!< in: table flags */
+
+/** Encode the number of columns and number of virtual columns in one
+4-byte value. We can do this because the number of columns in
+InnoDB is limited to 1017.
+@param[in] n_col number of non-virtual column
+@param[in] n_v_col number of virtual column
+@return encoded value */
+UNIV_INLINE
+ulint
+dict_table_encode_n_col(
+ ulint n_col,
+ ulint n_v_col);
+
+/** Decode the number of virtual and non-virtual columns in one 4-byte value.
+@param[in] encoded encoded value
+@param[in,out] n_col number of non-virtual column
+@param[in,out] n_v_col number of virtual column */
+UNIV_INLINE
+void
+dict_table_decode_n_col(
+ ulint encoded,
+ ulint* n_col,
+ ulint* n_v_col);
+
+/** Free the virtual column template
+@param[in,out] vc_templ virtual column template */
+UNIV_INLINE
+void
+dict_free_vc_templ(
+ dict_vcol_templ_t* vc_templ);
+
+/** Check whether the table has a virtual index.
+@param[in]	table	InnoDB table
+@return true if the table has a virtual index, false otherwise. */
+UNIV_INLINE
+bool
+dict_table_have_virtual_index(
+ dict_table_t* table);
+
+#include "dict0dict.ic"
+
+#endif
diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic
new file mode 100644
index 00000000..eda639ba
--- /dev/null
+++ b/storage/innobase/include/dict0dict.ic
@@ -0,0 +1,1248 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0dict.ic
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "fsp0sysspace.h"
+#include "dict0pagecompress.h"
+
+/*********************************************************************//**
+Gets the minimum number of bytes per character.
+@return minimum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbminlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return col->mbminlen;
+}
+/*********************************************************************//**
+Gets the maximum number of bytes per character.
+@return maximum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbmaxlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return col->mbmaxlen;
+}
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+ const dict_col_t* col, /*!< in: column */
+ dtype_t* type) /*!< out: data type */
+{
+ ut_ad(col != NULL);
+ ut_ad(type != NULL);
+
+ type->mtype = col->mtype;
+ type->prtype = col->prtype;
+ type->len = col->len;
+ type->mbminlen = col->mbminlen;
+ type->mbmaxlen = col->mbmaxlen;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(col->mtype == type->mtype);
+ ut_ad(col->prtype == type->prtype);
+ //ut_ad(col->len == type->len);
+ ut_ad(col->mbminlen == type->mbminlen);
+ ut_ad(col->mbmaxlen == type->mbmaxlen);
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dict_col_get_min_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_min_size_low(col->mtype, col->prtype, col->len,
+ col->mbminlen, col->mbmaxlen));
+}
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_max_size_low(col->mtype, col->len));
+}
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dict_col_get_fixed_size(
+/*====================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len,
+ col->mbminlen, col->mbmaxlen, comp));
+}
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+unsigned
+dict_col_get_sql_null_size(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dict_col_get_fixed_size(col, comp));
+}
+
+/*********************************************************************//**
+Gets the column number.
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+unsigned
+dict_col_get_no(
+/*============*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(col->ind);
+}
+
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+ const dict_col_t* col, /*!< in: table column */
+ const dict_index_t* clust_index) /*!< in: clustered index */
+{
+ ulint i;
+
+ ut_ad(dict_index_is_clust(clust_index));
+
+ for (i = 0; i < clust_index->n_def; i++) {
+ const dict_field_t* field = &clust_index->fields[i];
+
+ if (!field->prefix_len && field->col == col) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Gets the column position in the given index.
+@param[in] col table column
+@param[in] index index to be searched for column
+@return position of column in the given index. */
+UNIV_INLINE
+ulint
+dict_col_get_index_pos(
+ const dict_col_t* col,
+ const dict_index_t* index)
+{
+ ulint i;
+
+ for (i = 0; i < index->n_def; i++) {
+ const dict_field_t* field = &index->fields[i];
+
+ if (!field->prefix_len && field->col == col) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(UT_LIST_GET_FIRST(((dict_table_t*) table)->indexes));
+}
+
+/********************************************************************//**
+Gets the last index on the table.
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_last_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ return(UT_LIST_GET_LAST((const_cast<dict_table_t*>(table))
+ ->indexes));
+}
+
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ return(UT_LIST_GET_NEXT(indexes, (dict_index_t*) index));
+}
+#endif /* UNIV_DEBUG */
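+
+/* The accessors above support the usual iteration over the indexes of a
+table, clustered index first (a sketch; assumes equivalent accessors are
+available outside UNIV_DEBUG builds):
+
+	for (dict_index_t* index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+		// ... inspect each index ...
+	}
+*/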
+
+/********************************************************************//**
+Gets the number of user-defined non-virtual columns in a table in the
+dictionary cache.
+@return number of user-defined (e.g., not ROW_ID) non-virtual
+columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_user_cols(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ /* n_cols counts stored columns only. A table may contain
+ virtual columns and no user-specified stored columns at all. */
+ ut_ad(table->n_cols >= DATA_N_SYS_COLS);
+ return unsigned(table->n_cols) - DATA_N_SYS_COLS;
+}
+
+/********************************************************************//**
+Gets the number of all non-virtual columns (also system) in a table
+in the dictionary cache.
+@return number of non-virtual columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_cols(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ return(table->n_cols);
+}
+
+/** Gets the number of virtual columns in a table in the dictionary cache.
+@param[in] table the table to check
+@return number of virtual columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_v_cols(
+ const dict_table_t* table)
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_v_cols);
+}
+
+/** Check if a table has indexed virtual columns
+@param[in] table the table to check
+@return true if the table has indexed virtual columns */
+UNIV_INLINE
+bool
+dict_table_has_indexed_v_cols(
+ const dict_table_t* table)
+{
+ for (unsigned i = 0; i < table->n_v_cols; i++) {
+ const dict_v_col_t* col = dict_table_get_nth_v_col(table, i);
+ if (col->m_col.ord_part) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/********************************************************************//**
+Gets the estimated number of rows in the table.
+@return estimated number of rows */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->stat_initialized);
+
+ return(table->stat_n_rows);
+}
+
+/********************************************************************//**
+Increment the number of rows in the table by one.
+Notice that this operation is not protected by any latch; the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ if (table->stat_initialized) {
+ ib_uint64_t n_rows = table->stat_n_rows;
+ if (n_rows < 0xFFFFFFFFFFFFFFFFULL) {
+ table->stat_n_rows = n_rows + 1;
+ }
+ }
+}
+
+/********************************************************************//**
+Decrement the number of rows in the table by one.
+Notice that this operation is not protected by any latch; the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_dec(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ if (table->stat_initialized) {
+ ib_uint64_t n_rows = table->stat_n_rows;
+ if (n_rows > 0) {
+ table->stat_n_rows = n_rows - 1;
+ }
+ }
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint pos) /*!< in: position of column */
+{
+ ut_ad(pos < table->n_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return((dict_col_t*) (table->cols) + pos);
+}
+
+/** Gets the nth virtual column of a table.
+@param[in] table table
+@param[in] pos position of virtual column
+@return pointer to virtual column object */
+UNIV_INLINE
+dict_v_col_t*
+dict_table_get_nth_v_col(
+ const dict_table_t* table,
+ ulint pos)
+{
+ ut_ad(table);
+ ut_ad(pos < table->n_v_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(!table->v_cols[pos].m_col.is_added());
+ ut_ad(!table->v_cols[pos].m_col.is_dropped());
+ return &table->v_cols[pos];
+}
+
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ unsigned sys) /*!< in: DATA_ROW_ID, ... */
+{
+ dict_col_t* col;
+ col = dict_table_get_nth_col(table,
+ dict_table_get_sys_col_no(table, sys));
+ ut_ad(col->mtype == DATA_SYS);
+ ut_ad(col->prtype == (sys | DATA_NOT_NULL));
+
+ return(col);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+unsigned
+dict_table_get_sys_col_no(
+/*======================*/
+ const dict_table_t* table, /*!< in: table */
+ unsigned sys) /*!< in: DATA_ROW_ID, ... */
+{
+ ut_ad(sys < DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ return unsigned(table->n_cols) + (sys - DATA_N_SYS_COLS);
+}
+
+/************************************************************************
+Check if the table has an FTS index.
+@return TRUE if table has an FTS index */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+	dict_table_t*	table)	/*!< in: table */
+{
+ return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS));
+}
+
+/** Validate the flags for tables that are not ROW_FORMAT=REDUNDANT.
+@param[in] flags table flags
+@return whether the flags are valid */
+inline
+bool
+dict_tf_is_valid_not_redundant(ulint flags)
+{
+ const bool atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags);
+
+ ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags);
+
+ if (!zip_ssize) {
+ /* Not ROW_FORMAT=COMPRESSED */
+ } else if (!atomic_blobs) {
+ /* ROW_FORMAT=COMPRESSED implies ROW_FORMAT=DYNAMIC
+ for the uncompressed page format */
+ return(false);
+ } else if (zip_ssize > PAGE_ZIP_SSIZE_MAX
+ || zip_ssize > srv_page_size_shift
+ || srv_page_size_shift > UNIV_ZIP_SIZE_SHIFT_MAX) {
+ /* KEY_BLOCK_SIZE is out of bounds, or
+ ROW_FORMAT=COMPRESSED is not supported with this
+ innodb_page_size (only up to 16KiB) */
+ return(false);
+ }
+
+ switch (DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags)) {
+ case 0:
+ /* PAGE_COMPRESSION_LEVEL=0 should imply PAGE_COMPRESSED=NO */
+ return(!DICT_TF_GET_PAGE_COMPRESSION(flags));
+ case 1: case 2: case 3: case 4: case 5: case 6: case 7: case 8: case 9:
+ /* PAGE_COMPRESSION_LEVEL requires
+ ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC
+ (not ROW_FORMAT=COMPRESSED or ROW_FORMAT=REDUNDANT)
+ and PAGE_COMPRESSED=YES */
+ return(!zip_ssize && DICT_TF_GET_PAGE_COMPRESSION(flags));
+ default:
+ /* Invalid PAGE_COMPRESSION_LEVEL value */
+ return(false);
+ }
+}
+
+/** Validate the table flags.
+@param[in] flags Table flags
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+ ulint flags)
+{
+ ut_ad(flags < 1U << DICT_TF_BITS);
+ /* The DATA_DIRECTORY flag can be assigned fully independently
+ of all other persistent table flags. */
+ flags &= ~DICT_TF_MASK_DATA_DIR;
+ if (!(flags & 1)) {
+ /* Only ROW_FORMAT=REDUNDANT has 0 in the least significant
+ bit. For ROW_FORMAT=REDUNDANT, only the DATA_DIR flag
+ (which we cleared above) can be set. If any other flags
+ are set, the flags are invalid. */
+ return(flags == 0 || flags == DICT_TF_MASK_NO_ROLLBACK);
+ }
+
+ return(dict_tf_is_valid_not_redundant(flags));
+}
+
+/** Validate both table flags and table flags2 and make sure they
+are compatible.
+@param[in] flags Table flags
+@param[in] flags2 Table flags2
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf2_is_valid(
+ ulint flags,
+ ulint flags2)
+{
+ if (!dict_tf_is_valid(flags)) {
+ return(false);
+ }
+
+ if ((flags2 & DICT_TF2_UNUSED_BIT_MASK) != 0) {
+ return(false);
+ }
+
+ return(true);
+}
+
+/********************************************************************//**
+Determine the record format from dict_table_t::flags.
+The low order bit will be zero for REDUNDANT and 1 for COMPACT. For any
+other row format, DICT_TF_COMPACT will also be set.
+@return record format */
+UNIV_INLINE
+rec_format_t
+dict_tf_get_rec_format(
+/*===================*/
+ ulint flags) /*!< in: dict_table_t::flags */
+{
+ ut_a(dict_tf_is_valid(flags));
+
+ if (!DICT_TF_GET_COMPACT(flags)) {
+ return(REC_FORMAT_REDUNDANT);
+ }
+
+ if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
+ return(REC_FORMAT_COMPACT);
+ }
+
+ if (DICT_TF_GET_ZIP_SSIZE(flags)) {
+ return(REC_FORMAT_COMPRESSED);
+ }
+
+ return(REC_FORMAT_DYNAMIC);
+}
+
+/** Set the various values in a dict_table_t::flags pointer.
+@param[in,out]	flags			pointer to 4-byte table flags
+@param[in]	format			record format
+@param[in]	zip_ssize		zip shift size
+@param[in]	use_data_dir		table uses DATA DIRECTORY
+@param[in]	page_compressed		table uses page compression
+@param[in]	page_compression_level	page compression level */
+UNIV_INLINE
+void
+dict_tf_set(
+/*========*/
+ ulint* flags,
+ rec_format_t format,
+ ulint zip_ssize,
+ bool use_data_dir,
+ bool page_compressed,
+ ulint page_compression_level)
+{
+ *flags = use_data_dir ? 1 << DICT_TF_POS_DATA_DIR : 0;
+
+ switch (format) {
+ case REC_FORMAT_REDUNDANT:
+ ut_ad(zip_ssize == 0);
+ /* no other options are allowed */
+ ut_ad(!page_compressed);
+ return;
+ case REC_FORMAT_COMPACT:
+ *flags |= DICT_TF_COMPACT;
+ ut_ad(zip_ssize == 0);
+ break;
+ case REC_FORMAT_COMPRESSED:
+ *flags |= DICT_TF_COMPACT
+ | (1 << DICT_TF_POS_ATOMIC_BLOBS)
+ | (zip_ssize << DICT_TF_POS_ZIP_SSIZE);
+ break;
+ case REC_FORMAT_DYNAMIC:
+ *flags |= DICT_TF_COMPACT
+ | (1 << DICT_TF_POS_ATOMIC_BLOBS);
+ ut_ad(zip_ssize == 0);
+ break;
+ }
+
+ if (page_compressed) {
+ *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS)
+ | (1 << DICT_TF_POS_PAGE_COMPRESSION)
+ | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL);
+
+ ut_ad(zip_ssize == 0);
+ ut_ad(dict_tf_get_page_compression(*flags) == TRUE);
+ ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level);
+ }
+}
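+
+/* Example (an illustrative sketch): composing flags for a
+ROW_FORMAT=DYNAMIC table without DATA DIRECTORY or page compression.
+
+	ulint	flags;
+	dict_tf_set(&flags, REC_FORMAT_DYNAMIC, 0, false, false, 0);
+	ut_ad(dict_tf_is_valid(flags));
+*/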
+
+/** Convert 32-bit table flags to the 32-bit FSP flags.
+FSP flags are written into the tablespace header at the offset
+FSP_SPACE_FLAGS and are also stored in the fil_space_t::flags field.
+The following chart shows the translation of the low order bit.
+Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags | 0 | 1 | 1 | 1
+fil_space_t::flags | 0 | 0 | 1 | 1
+==================================================================
+@param[in] table_flags dict_table_t::flags
+@return tablespace flags (fil_space_t::flags) */
+UNIV_INLINE
+ulint
+dict_tf_to_fsp_flags(ulint table_flags)
+{
+ ulint fsp_flags;
+ ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(
+ table_flags);
+
+ ut_ad((DICT_TF_GET_PAGE_COMPRESSION(table_flags) == 0)
+ == (page_compression_level == 0));
+
+ DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure",
+ return(ULINT_UNDEFINED););
+
+ /* No ROW_FORMAT=COMPRESSED for innodb_checksum_algorithm=full_crc32 */
+ if ((srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
+ || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_FULL_CRC32)
+ && !(table_flags & DICT_TF_MASK_ZIP_SSIZE)) {
+
+ fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER
+ | FSP_FLAGS_FCRC32_PAGE_SSIZE();
+
+ if (page_compression_level) {
+ fsp_flags |= innodb_compression_algorithm
+ << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
+ }
+ } else {
+ /* Adjust bit zero. */
+ fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 1 : 0;
+
+ /* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */
+ fsp_flags |= table_flags
+ & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS);
+
+ fsp_flags |= FSP_FLAGS_PAGE_SSIZE();
+
+ if (page_compression_level) {
+ fsp_flags |= FSP_FLAGS_MASK_PAGE_COMPRESSION;
+ }
+ }
+
+ ut_a(fil_space_t::is_valid_flags(fsp_flags, false));
+
+ if (DICT_TF_HAS_DATA_DIR(table_flags)) {
+ fsp_flags |= 1U << FSP_FLAGS_MEM_DATA_DIR;
+ }
+
+ fsp_flags |= page_compression_level << FSP_FLAGS_MEM_COMPRESSION_LEVEL;
+
+ return(fsp_flags);
+}
+
+/********************************************************************//**
+Convert 32-bit table flags to the 32-bit integer that is written
+to the SYS_TABLES.TYPE field. The following chart shows the translation of
+the low order bit. Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+dict_table_t::flags | 0 | 1 | 1
+SYS_TABLES.TYPE | 1 | 1 | 1
+==================================================================
+@return ulint containing SYS_TABLES.TYPE */
+UNIV_INLINE
+ulint
+dict_tf_to_sys_tables_type(
+/*=======================*/
+ ulint flags) /*!< in: dict_table_t::flags */
+{
+ ulint type;
+
+ ut_a(dict_tf_is_valid(flags));
+
+ /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */
+ type = 1;
+
+ /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION,
+ PAGE_COMPRESSION_LEVEL are the same. */
+ type |= flags & (DICT_TF_MASK_ZIP_SSIZE
+ | DICT_TF_MASK_ATOMIC_BLOBS
+ | DICT_TF_MASK_DATA_DIR
+ | DICT_TF_MASK_PAGE_COMPRESSION
+ | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL
+ | DICT_TF_MASK_NO_ROLLBACK);
+
+ return(type);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_fields(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal
+ representation of index (in
+ the dictionary cache) */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ return(index->n_fields);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree, use the value
+returned by dict_index_get_n_unique_in_tree.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+ return(index->n_uniq);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ if (dict_index_is_clust(index)) {
+
+ return(dict_index_get_n_unique(index));
+ }
+
+ return(dict_index_get_n_fields(index));
+}
+
+/**
+Gets the number of fields on nonleaf page level in the internal representation
+of an index which uniquely determine the position of an index entry in the
+index, if we also take multiversioning into account. Note that it does
+not include the page number field.
+@param[in] index index
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree_nonleaf(
+ const dict_index_t* index)
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ if (dict_index_is_spatial(index)) {
+		/* For a spatial index, a non-leaf page has only two
+		fields (mbr + page_no). So, excluding the page number
+		field, one field remains. */
+ return(DICT_INDEX_SPATIAL_NODEPTR_SIZE);
+ } else {
+ return(dict_index_get_n_unique_in_tree(index));
+ }
+}
+
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation of clustered indexes we add the row id to the ordering fields
+to make a clustered index unique, but this function returns the number of
+fields the user defined in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ return(index->n_user_defined_cols);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of field */
+{
+ ut_ad(pos < index->n_def);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return((dict_field_t*) (index->fields) + pos);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field) /*!< in: index field */
+{
+ return(field->col);
+}
+
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_field_get_col(dict_index_get_nth_field(index, pos)));
+}
+
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_col_get_no(dict_index_get_nth_col(index, pos)));
+}
+
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
+{
+ return(dict_index_get_nth_col_or_prefix_pos(index, n, false, false,
+ prefix_col_pos));
+}
+
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+unsigned
+dict_index_get_min_size(
+/*====================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ unsigned n= dict_index_get_n_fields(index);
+ unsigned size= 0;
+
+ while (n--)
+ size+= dict_col_get_min_size(dict_index_get_nth_col(index, n));
+
+ return size;
+}
+
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+uint32_t
+dict_index_get_page(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->page);
+}
+
+/*********************************************************************//**
+Gets the read-write lock of the index tree.
+@return read-write lock */
+UNIV_INLINE
+rw_lock_t*
+dict_index_get_lock(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(&(index->lock));
+}
+
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void)
+/*==============================*/
+{
+ return(srv_page_size / 16);
+}
+
+/********************************************************************//**
+Gets the status of online index creation.
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+ const dict_index_t* index) /*!< in: secondary index */
+{
+ enum online_index_status status;
+
+ status = (enum online_index_status) index->online_status;
+
+	/* Without the index->lock protection, the online
+	status can change from ONLINE_INDEX_CREATION to
+	ONLINE_INDEX_COMPLETE (or ONLINE_INDEX_ABORTED) in
+	row_log_apply() once log application is done. So, to be
+	sure that the status is ONLINE_INDEX_CREATION or
+	ONLINE_INDEX_COMPLETE, always recheck after acquiring
+	index->lock. */
+
+#ifdef UNIV_DEBUG
+ switch (status) {
+ case ONLINE_INDEX_COMPLETE:
+ case ONLINE_INDEX_CREATION:
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ return(status);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(status);
+}
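+
+/* Example (an illustrative sketch): the recheck pattern described in
+the comment above. The S-latch mode shown is an assumption made up for
+illustration.
+
+	rw_lock_s_lock(dict_index_get_lock(index));
+	if (dict_index_get_online_status(index) == ONLINE_INDEX_CREATION) {
+		// the status cannot change while index->lock is held
+	}
+	rw_lock_s_unlock(dict_index_get_lock(index));
+*/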
+
+/********************************************************************//**
+Sets the status of online index creation. */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+/*=========================*/
+ dict_index_t* index, /*!< in/out: index */
+ enum online_index_status status) /*!< in: status */
+{
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+
+#ifdef UNIV_DEBUG
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_COMPLETE:
+ case ONLINE_INDEX_CREATION:
+ break;
+ case ONLINE_INDEX_ABORTED:
+ ut_ad(status == ONLINE_INDEX_ABORTED_DROPPED);
+ break;
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ index->online_status = status & 3;
+ ut_ad(dict_index_get_online_status(index) == status);
+}
+
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+ const dict_index_t* index) /*!< in: index */
+{
+#ifdef UNIV_DEBUG
+ if (dict_index_is_clust(index)) {
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_CREATION:
+ return(true);
+ case ONLINE_INDEX_COMPLETE:
+ return(false);
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ break;
+ }
+ ut_ad(0);
+ return(false);
+ }
+#endif /* UNIV_DEBUG */
+
+ return(UNIV_UNLIKELY(dict_index_get_online_status(index)
+ != ONLINE_INDEX_COMPLETE));
+}
+
+/**********************************************************************//**
+Check whether a column exists in an FTS index.
+@return ULINT_UNDEFINED if no match else the offset within the vector */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+ ib_vector_t* indexes,/*!< in: vector containing only FTS indexes */
+ ulint col_no, /*!< in: col number to search for */
+ bool is_virtual) /*!< in: whether it is a virtual column */
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(indexes); ++i) {
+ dict_index_t* index;
+
+ index = (dict_index_t*) ib_vector_getp(indexes, i);
+
+ if (index->contains_col_or_prefix(col_no, is_virtual)) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Determine bytes of column prefix to be stored in the undo log. Please
+note that if !dict_table_has_atomic_blobs(table), no prefix
+needs to be stored in the undo log.
+@return bytes of column prefix to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_field_len_store_undo(
+/*==========================*/
+ dict_table_t* table, /*!< in: table */
+ const dict_col_t* col) /*!< in: column which index prefix
+ is based on */
+{
+ if (!dict_table_has_atomic_blobs(table)) {
+ return(0);
+ }
+
+ if (col->max_prefix != 0) {
+ return(col->max_prefix);
+ }
+
+ return(REC_VERSION_56_MAX_INDEX_COL_LEN);
+}
+
+/** Determine the maximum number of bytes of a virtual column that
+need to be stored in the undo log.
+@param[in] table dict_table_t for the table
+@param[in] col_no virtual column number
+@return maximum bytes of virtual column to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_v_field_len_store_undo(
+ dict_table_t* table,
+ ulint col_no)
+{
+ const dict_col_t* col
+ = &dict_table_get_nth_v_col(table, col_no)->m_col;
+ ulint max_log_len;
+
+ /* This calculation conforms to the non-virtual column
+ maximum log length calculation:
+	1) if there is no atomic BLOB, up to
+	REC_ANTELOPE_MAX_INDEX_COL_LEN;
+	2) if there are atomic BLOBs, up to col->max_prefix or
+	REC_VERSION_56_MAX_INDEX_COL_LEN, whichever is less */
+ if (dict_table_has_atomic_blobs(table)) {
+ if (DATA_BIG_COL(col) && col->max_prefix > 0) {
+ max_log_len = col->max_prefix;
+ } else {
+ max_log_len = DICT_MAX_FIELD_LEN_BY_FORMAT(table);
+ }
+ } else {
+ max_log_len = REC_ANTELOPE_MAX_INDEX_COL_LEN;
+ }
+
+ return(max_log_len);
+}
+
+/********************************************************************//**
+Check whether the table is corrupted.
+@return nonzero for corrupted table, zero for valid tables */
+UNIV_INLINE
+ulint
+dict_table_is_corrupted(
+/*====================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ return(table->corrupted);
+}
+
+/** Check if the table is in a file-per-table tablespace.
+This test does not use table flags2 since some REDUNDANT tables in the
+system tablespace may have garbage in the MIX_LEN field where flags2 is
+stored. These garbage MIX_LEN fields were written before v3.23.52.
+A patch was added to v3.23.52 which initializes the MIX_LEN field to 0.
+Since file-per-table tablespaces were added in 4.1, any SYS_TABLES
+record with a non-zero space ID will have a reliable MIX_LEN field.
+Instead, assume that if the tablespace is not a predefined system
+tablespace, then it must be file-per-table.
+Also, during ALTER TABLE, the DICT_TF2_USE_FILE_PER_TABLE flag may not be
+set on one of the file-per-table tablespaces.
+This test cannot be done on a table in the process of being created
+because the space_id will be zero until the tablespace is created.
+@param[in] table An existing open table to check
+@return true if this table was created as a file-per-table tablespace. */
+UNIV_INLINE
+bool
+dict_table_is_file_per_table(
+ const dict_table_t* table) /*!< in: table to check */
+{
+ return table->space != fil_system.sys_space
+ && table->space != fil_system.temp_space;
+}
+
+/** Acquire the table handle. */
+inline
+void
+dict_table_t::acquire()
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+ n_ref_count++;
+}
+
+/** Release the table handle.
+@return whether the last handle was released */
+inline
+bool
+dict_table_t::release()
+{
+ auto n = n_ref_count--;
+ ut_ad(n > 0);
+ return n == 1;
+}
+
+/** Encode the number of columns and the number of virtual columns in a
+4-byte value. We can do this because the number of columns in
+InnoDB is limited to 1017.
+@param[in] n_col number of non-virtual column
+@param[in] n_v_col number of virtual column
+@return encoded value */
+UNIV_INLINE
+ulint
+dict_table_encode_n_col(
+ ulint n_col,
+ ulint n_v_col)
+{
+ return(n_col + (n_v_col<<16));
+}
+
+/** Decode the number of virtual and non-virtual columns from one
+4-byte value.
+@param[in]	encoded	encoded value
+@param[out]	n_col	number of non-virtual columns
+@param[out]	n_v_col	number of virtual columns */
+UNIV_INLINE
+void
+dict_table_decode_n_col(
+ ulint encoded,
+ ulint* n_col,
+ ulint* n_v_col)
+{
+	ulint	num = encoded & ~DICT_N_COLS_COMPACT;
+ *n_v_col = num >> 16;
+ *n_col = num & 0xFFFF;
+}
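+
+/* A worked example of the encoding above (illustrative values): a table
+with 20 non-virtual and 3 virtual columns is encoded as
+20 + (3 << 16) = 0x30014. SYS_TABLES.N_COLS may additionally carry the
+DICT_N_COLS_COMPACT bit, which the decoder masks off:
+
+	ulint	n_col, n_v_col;
+	dict_table_decode_n_col(dict_table_encode_n_col(20, 3)
+				| DICT_N_COLS_COMPACT, &n_col, &n_v_col);
+	ut_ad(n_col == 20 && n_v_col == 3);
+*/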
+
+/** Free the virtual column template
+@param[in,out] vc_templ virtual column template */
+void
+dict_free_vc_templ(
+ dict_vcol_templ_t* vc_templ)
+{
+ UT_DELETE_ARRAY(vc_templ->default_rec);
+ vc_templ->default_rec = NULL;
+
+ if (vc_templ->vtempl != NULL) {
+ ut_ad(vc_templ->n_v_col > 0);
+ for (ulint i = 0; i < vc_templ->n_col
+ + vc_templ->n_v_col; i++) {
+ if (vc_templ->vtempl[i] != NULL) {
+ ut_free(vc_templ->vtempl[i]);
+ }
+ }
+ ut_free(vc_templ->vtempl);
+ vc_templ->vtempl = NULL;
+ }
+}
+
+/** Check whether the table has a virtual index.
+@param[in]	table	InnoDB table
+@return true if the table has a virtual index, false otherwise. */
+UNIV_INLINE
+bool
+dict_table_have_virtual_index(
+ dict_table_t* table)
+{
+ for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(table);
+ col_no++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+ if (col->m_col.ord_part) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
new file mode 100644
index 00000000..f067571c
--- /dev/null
+++ b/storage/innobase/include/dict0load.h
@@ -0,0 +1,309 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.h
+Loads database object definitions from the dictionary tables
+into the memory cache
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0load_h
+#define dict0load_h
+
+#include "dict0types.h"
+#include "trx0types.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "btr0types.h"
+
+#include <deque>
+
+/** A stack of table names related through foreign key constraints */
+typedef std::deque<const char*, ut_allocator<const char*> > dict_names_t;
+
+/** enum that defines all system table IDs. @see SYSTEM_TABLE_NAME[] */
+enum dict_system_id_t {
+ SYS_TABLES = 0,
+ SYS_INDEXES,
+ SYS_COLUMNS,
+ SYS_FIELDS,
+ SYS_FOREIGN,
+ SYS_FOREIGN_COLS,
+ SYS_TABLESPACES,
+ SYS_DATAFILES,
+ SYS_VIRTUAL,
+
+ /* This must be last item. Defines the number of system tables. */
+ SYS_NUM_SYSTEM_TABLES
+};
+
+/** Check each tablespace found in the data dictionary.
+Look at each table defined in SYS_TABLES that has a space_id > 0.
+If the tablespace is not yet in the fil_system cache, look up the
+tablespace in SYS_DATAFILES to ensure the correct path.
+
+During crash recovery we already have some tablespace objects created
+from processing the redo log. Any other tablespace in SYS_TABLESPACES not
+previously used in recovery will be opened here. We will compare the
+space_id information in the data dictionary to what we find in the
+tablespace file. In addition, more validation will be done if recovery
+was needed and force_recovery is not set.
+
+We also scan for the biggest space id, and store it in fil_system. */
+void dict_check_tablespaces_and_store_max_id();
+
+/********************************************************************//**
+Finds the first table name in the given database.
+@return own: table name, NULL if it does not exist; the caller must free
+the memory in the string! */
+char*
+dict_get_first_table_name_in_db(
+/*============================*/
+	const char*	name);	/*!< in: database name which ends in '/' */
+
+/** Make sure the data_file_name is saved in dict_table_t if needed.
+Try to read it from the fil_system first, then from SYS_DATAFILES.
+@param[in] table Table object
+@param[in] dict_mutex_own true if dict_sys.mutex is owned already */
+void
+dict_get_and_save_data_dir_path(
+ dict_table_t* table,
+ bool dict_mutex_own);
+
+/** Loads a table definition and all its index definitions, as well as
+the cluster definition if the table is a member of a cluster. Also loads
+all foreign key constraints where the foreign key is in the table or where
+a foreign key references columns in this table.
+@param[in] name Table name in the dbname/tablename format
+@param[in] ignore_err Error to be ignored when loading
+ table and its index definition
+@return table, NULL if does not exist; if the table is stored in an
+.ibd file, but the file does not exist, then we set the file_unreadable
+flag in the table object we return. */
+dict_table_t* dict_load_table(const char* name, dict_err_ignore_t ignore_err);
+
+/***********************************************************************//**
+Loads a table object based on the table id.
+@return table; NULL if table does not exist */
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+ table_id_t table_id, /*!< in: table id */
+ dict_err_ignore_t ignore_err); /*!< in: errors to ignore
+ when loading the table */
+/********************************************************************//**
+This function is called when the database is booted.
+Loads system table index definitions except for the clustered index which
+is added to the dictionary cache at booting before calling this function. */
+void
+dict_load_sys_table(
+/*================*/
+ dict_table_t* table); /*!< in: system table */
+/***********************************************************************//**
+Loads foreign key constraints where the table is either the foreign key
+holder or where the table is referenced by a foreign key. Adds these
+constraints to the data dictionary.
+
+The foreign key constraint is loaded only if the referenced table is also
+in the dictionary cache. If the referenced table is not in the
+dictionary cache, then it is added to the output parameter (fk_tables).
+
+@return DB_SUCCESS or error code */
+dberr_t
+dict_load_foreigns(
+/*===============*/
+ const char* table_name, /*!< in: table name */
+ const char** col_names, /*!< in: column names, or NULL
+ to use table->col_names */
+ bool check_recursive,/*!< in: Whether to check
+ recursive load of tables
+ chained by FK */
+ bool check_charsets, /*!< in: whether to check
+ charset compatibility */
+ dict_err_ignore_t ignore_err, /*!< in: error to be ignored */
+ dict_names_t& fk_tables) /*!< out: stack of table names
+ which must be loaded
+ subsequently to load all the
+ foreign key constraints. */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
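+
+/* A sketch of how a caller drains the fk_tables output (modelled on
+dict_load_table(); error handling and dict_sys locking omitted): each
+name pushed by dict_load_foreigns() is loaded in turn, which may push
+further names onto the stack:
+
+	dict_names_t	fk_tables;
+	dict_load_foreigns(name, NULL, true, true, ignore_err, fk_tables);
+	while (!fk_tables.empty()) {
+		dict_load_table(fk_tables.front(), ignore_err);
+		fk_tables.pop_front();
+	}
+*/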
+
+/********************************************************************//**
+This function opens a system table, and returns the first record.
+@return first record of the system table */
+const rec_t*
+dict_startscan_system(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor to
+ the record */
+ mtr_t* mtr, /*!< in: the mini-transaction */
+ dict_system_id_t system_id); /*!< in: which system table to open */
+/********************************************************************//**
+This function gets the next system table record as we scan the table.
+@return the record if found, NULL if end of scan. */
+const rec_t*
+dict_getnext_system(
+/*================*/
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor
+ to the record */
+ mtr_t* mtr); /*!< in: the mini-transaction */
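+
+/* A minimal sketch of a full scan using the two functions above
+(modelled on the INFORMATION_SCHEMA handlers; dict_sys.mutex handling
+and record processing omitted). The mini-transaction is committed and
+restarted between records; dict_getnext_system() restores the saved
+cursor position:
+
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+
+	mtr.start();
+	const rec_t*	rec = dict_startscan_system(&pcur, &mtr,
+						    SYS_TABLES);
+	while (rec) {
+		...	process rec	...
+		mtr.commit();
+		mtr.start();
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+	mtr.commit();
+*/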
+/********************************************************************//**
+This function processes one SYS_TABLES record and populates the
+dict_table_t struct for the table.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_tables_rec_and_mtr_commit(
+/*=======================================*/
+ mem_heap_t* heap, /*!< in: temporary memory heap */
+ const rec_t* rec, /*!< in: SYS_TABLES record */
+ dict_table_t** table, /*!< out: dict_table_t to fill */
+ bool cached, /*!< in: whether to load from cache */
+ mtr_t* mtr); /*!< in/out: mini-transaction,
+ will be committed */
+/********************************************************************//**
+This function parses a SYS_INDEXES record and populates a dict_index_t
+structure with the information from the record. For detailed information
+about SYS_INDEXES fields, please refer to the dict_boot() function.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_indexes_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_INDEXES rec */
+ dict_index_t* index, /*!< out: dict_index_t to be
+ filled */
+ table_id_t* table_id); /*!< out: table id */
+/********************************************************************//**
+This function parses a SYS_COLUMNS record and populates a dict_col_t
+structure with the information from the record.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_columns_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_COLUMNS rec */
+ dict_col_t* column, /*!< out: dict_col_t to be filled */
+ table_id_t* table_id, /*!< out: table id */
+ const char** col_name, /*!< out: column name */
+	ulint*		nth_v_col);	/*!< out: if a virtual column, this
+					records its sequence number */
+
+/** This function parses a SYS_VIRTUAL record and extracts virtual column
+information.
+@param[in]	rec		current SYS_VIRTUAL rec
+@param[out]	table_id	table id
+@param[out]	pos		virtual column position
+@param[out]	base_pos	base column position
+@return error message, or NULL on success */
+const char*
+dict_process_sys_virtual_rec(
+ const rec_t* rec,
+ table_id_t* table_id,
+ ulint* pos,
+ ulint* base_pos);
+/********************************************************************//**
+This function parses a SYS_FIELDS record and populates a dict_field_t
+structure with the information from the record.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_fields_rec(
+/*========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FIELDS rec */
+ dict_field_t* sys_field, /*!< out: dict_field_t to be
+ filled */
+ ulint* pos, /*!< out: Field position */
+ index_id_t* index_id, /*!< out: current index id */
+ index_id_t last_id); /*!< in: previous index id */
+/********************************************************************//**
+This function parses a SYS_FOREIGN record and populates a dict_foreign_t
+structure with the information from the record. For detailed information
+about SYS_FOREIGN fields, please refer to the dict_load_foreign() function.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_foreign_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FOREIGN rec */
+ dict_foreign_t* foreign); /*!< out: dict_foreign_t to be
+ filled */
+/********************************************************************//**
+This function parses a SYS_FOREIGN_COLS record and extracts necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_foreign_col_rec(
+/*=============================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FOREIGN_COLS rec */
+ const char** name, /*!< out: foreign key constraint name */
+ const char** for_col_name, /*!< out: referencing column name */
+ const char** ref_col_name, /*!< out: referenced column name
+ in referenced table */
+ ulint* pos); /*!< out: column position */
+/********************************************************************//**
+This function parses a SYS_TABLESPACES record, extracts necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_tablespaces(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_TABLESPACES rec */
+ uint32_t* space, /*!< out: tablespace identifier */
+ const char** name, /*!< out: tablespace name */
+ ulint* flags); /*!< out: tablespace flags */
+/********************************************************************//**
+This function parses a SYS_DATAFILES record, extracts necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_datafiles(
+/*=======================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_DATAFILES rec */
+ uint32_t* space, /*!< out: tablespace identifier */
+ const char** path); /*!< out: datafile path */
+
+/** Update the record for space_id in SYS_TABLESPACES to this filepath.
+@param[in] space_id Tablespace ID
+@param[in] filepath Tablespace filepath
+@return DB_SUCCESS if OK, another dberr_t code if the insert failed */
+dberr_t
+dict_update_filepath(
+ ulint space_id,
+ const char* filepath);
+
+/** Replace records in SYS_TABLESPACES and SYS_DATAFILES associated with
+the given space_id using an independent transaction.
+@param[in] space_id Tablespace ID
+@param[in] name Tablespace name
+@param[in] filepath First filepath
+@param[in] fsp_flags Tablespace flags
+@return DB_SUCCESS if OK, another dberr_t code if the insert failed */
+dberr_t
+dict_replace_tablespace_and_filepath(
+ ulint space_id,
+ const char* name,
+ const char* filepath,
+ ulint fsp_flags);
+
+#endif
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
new file mode 100644
index 00000000..9d7dcf47
--- /dev/null
+++ b/storage/innobase/include/dict0mem.h
@@ -0,0 +1,2542 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0mem.h
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0mem_h
+#define dict0mem_h
+
+#include "data0type.h"
+#include "mem0mem.h"
+#include "row0types.h"
+#include "rem0types.h"
+#include "btr0types.h"
+#include "lock0types.h"
+#include "que0types.h"
+#include "sync0rw.h"
+#include "ut0mem.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+#include "fts0fts.h"
+#include "buf0buf.h"
+#include "gis0type.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "mysql_com.h"
+#include <sql_const.h>
+#include <set>
+#include <algorithm>
+#include <iterator>
+#include <ostream>
+#include <mutex>
+
+/* Forward declaration. */
+struct ib_rbt_t;
+
+/** Type flags of an index: OR'ing of the flags is allowed to define a
+combination of types */
+/* @{ */
+#define DICT_CLUSTERED 1 /*!< clustered index; for other than
+ auto-generated clustered indexes,
+ also DICT_UNIQUE will be set */
+#define DICT_UNIQUE 2 /*!< unique index */
+#define DICT_IBUF 8 /*!< insert buffer tree */
+#define DICT_CORRUPT 16 /*!< bit to store the corrupted flag
+ in SYS_INDEXES.TYPE */
+#define DICT_FTS 32 /* FTS index; can't be combined with the
+ other flags */
+#define DICT_SPATIAL 64 /* SPATIAL index; can't be combined with the
+ other flags */
+#define DICT_VIRTUAL 128 /* Index on Virtual column */
+
+#define DICT_IT_BITS 8 /*!< number of bits used for
+ SYS_INDEXES.TYPE */
+/* @} */
+
+#if 0 /* not implemented, retained for history */
+/** Types for a table object */
+#define DICT_TABLE_ORDINARY 1 /*!< ordinary table */
+#define DICT_TABLE_CLUSTER_MEMBER 2
+#define DICT_TABLE_CLUSTER 3 /* this means that the table is
+ really a cluster definition */
+#endif
+
+/* Table and tablespace flags are generally not used for the Antelope file
+format except for the low order bit, which is used differently depending on
+where the flags are stored.
+
+==================== Low order flags bit =========================
+ | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+SYS_TABLES.TYPE | 1 | 1 | 1
+dict_table_t::flags | 0 | 1 | 1
+FSP_SPACE_FLAGS | 0 | 0 | 1
+fil_space_t::flags | 0 | 0 | 1
+
+Before the 5.1 plugin, SYS_TABLES.TYPE was always DICT_TABLE_ORDINARY (1)
+and the tablespace flags field was always 0. In the 5.1 plugin, these fields
+were repurposed to identify compressed and dynamic row formats.
+
+The following types and constants describe the flags found in dict_table_t
+and SYS_TABLES.TYPE. Similar flags found in fil_space_t and FSP_SPACE_FLAGS
+are described in fsp0fsp.h. */
+
+/* @{ */
+/** dict_table_t::flags bit 0 is equal to 0 if the row format = Redundant */
+#define DICT_TF_REDUNDANT 0 /*!< Redundant row format. */
+/** dict_table_t::flags bit 0 is equal to 1 if the row format = Compact */
+#define DICT_TF_COMPACT 1U /*!< Compact row format. */
+
+/** This bitmask is used in SYS_TABLES.N_COLS to set and test whether
+the Compact page format is used, i.e ROW_FORMAT != REDUNDANT */
+#define DICT_N_COLS_COMPACT 0x80000000UL
+
+/** Width of the COMPACT flag */
+#define DICT_TF_WIDTH_COMPACT 1
+
+/** Width of the ZIP_SSIZE flag */
+#define DICT_TF_WIDTH_ZIP_SSIZE 4
+
+/** Width of the ATOMIC_BLOBS flag. The ROW_FORMAT=REDUNDANT and
+ROW_FORMAT=COMPACT broke up BLOB and TEXT fields, storing the first 768 bytes
+in the clustered index. ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED
+store the whole blob or text field off-page atomically.
+Secondary indexes are created from this external data using row_ext_t
+to cache the BLOB prefixes. */
+#define DICT_TF_WIDTH_ATOMIC_BLOBS 1
+
+/** If a table is created with the MYSQL option DATA DIRECTORY and
+innodb-file-per-table, an older engine will not be able to find that table.
+This flag prevents older engines from attempting to open the table and
+allows InnoDB to update_create_info() accordingly. */
+#define DICT_TF_WIDTH_DATA_DIR 1
+
+/**
+Width of the page compression flag
+*/
+#define DICT_TF_WIDTH_PAGE_COMPRESSION 1
+#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4
+
+/**
+The NO_ROLLBACK flag (3=yes; the values 1,2 used stand for
+ATOMIC_WRITES=ON and ATOMIC_WRITES=OFF between MariaDB 10.1.0 and 10.2.3)
+*/
+#define DICT_TF_WIDTH_NO_ROLLBACK 2
+
+/** Width of all the currently known table flags */
+#define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \
+ + DICT_TF_WIDTH_ZIP_SSIZE \
+ + DICT_TF_WIDTH_ATOMIC_BLOBS \
+ + DICT_TF_WIDTH_DATA_DIR \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \
+ + DICT_TF_WIDTH_NO_ROLLBACK)
+
+/** Zero relative shift position of the COMPACT field */
+#define DICT_TF_POS_COMPACT 0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define DICT_TF_POS_ZIP_SSIZE (DICT_TF_POS_COMPACT \
+ + DICT_TF_WIDTH_COMPACT)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define DICT_TF_POS_ATOMIC_BLOBS (DICT_TF_POS_ZIP_SSIZE \
+ + DICT_TF_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the DATA_DIR field */
+#define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \
+ + DICT_TF_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define DICT_TF_POS_PAGE_COMPRESSION (DICT_TF_POS_DATA_DIR \
+ + DICT_TF_WIDTH_DATA_DIR)
+/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION)
+/** Zero relative shift position of the NO_ROLLBACK field */
+#define DICT_TF_POS_NO_ROLLBACK (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)
+#define DICT_TF_POS_UNUSED (DICT_TF_POS_NO_ROLLBACK \
+ + DICT_TF_WIDTH_NO_ROLLBACK)
+
+/** Bit mask of the COMPACT field */
+#define DICT_TF_MASK_COMPACT \
+ ((~(~0U << DICT_TF_WIDTH_COMPACT)) \
+ << DICT_TF_POS_COMPACT)
+/** Bit mask of the ZIP_SSIZE field */
+#define DICT_TF_MASK_ZIP_SSIZE \
+ ((~(~0U << DICT_TF_WIDTH_ZIP_SSIZE)) \
+ << DICT_TF_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define DICT_TF_MASK_ATOMIC_BLOBS \
+ ((~(~0U << DICT_TF_WIDTH_ATOMIC_BLOBS)) \
+ << DICT_TF_POS_ATOMIC_BLOBS)
+/** Bit mask of the DATA_DIR field */
+#define DICT_TF_MASK_DATA_DIR \
+ ((~(~0U << DICT_TF_WIDTH_DATA_DIR)) \
+ << DICT_TF_POS_DATA_DIR)
+/** Bit mask of the PAGE_COMPRESSION field */
+#define DICT_TF_MASK_PAGE_COMPRESSION \
+ ((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION)) \
+ << DICT_TF_POS_PAGE_COMPRESSION)
+/** Bit mask of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \
+ ((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \
+ << DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+/** Bit mask of the NO_ROLLBACK field */
+#define DICT_TF_MASK_NO_ROLLBACK \
+ ((~(~0U << DICT_TF_WIDTH_NO_ROLLBACK)) \
+ << DICT_TF_POS_NO_ROLLBACK)
+
+/** Return the value of the COMPACT field */
+#define DICT_TF_GET_COMPACT(flags) \
+ ((flags & DICT_TF_MASK_COMPACT) \
+ >> DICT_TF_POS_COMPACT)
+/** Return the value of the ZIP_SSIZE field */
+#define DICT_TF_GET_ZIP_SSIZE(flags) \
+ ((flags & DICT_TF_MASK_ZIP_SSIZE) \
+ >> DICT_TF_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define DICT_TF_HAS_ATOMIC_BLOBS(flags) \
+ ((flags & DICT_TF_MASK_ATOMIC_BLOBS) \
+ >> DICT_TF_POS_ATOMIC_BLOBS)
+/** Return the value of the DATA_DIR field */
+#define DICT_TF_HAS_DATA_DIR(flags) \
+ ((flags & DICT_TF_MASK_DATA_DIR) \
+ >> DICT_TF_POS_DATA_DIR)
+/** Return the value of the PAGE_COMPRESSION field */
+#define DICT_TF_GET_PAGE_COMPRESSION(flags) \
+ ((flags & DICT_TF_MASK_PAGE_COMPRESSION) \
+ >> DICT_TF_POS_PAGE_COMPRESSION)
+/** Return the value of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \
+ ((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \
+ >> DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+
+/* @} */
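+
+/* A worked example of the packing above (a sketch; the values follow
+from the POS/WIDTH definitions): ROW_FORMAT=COMPRESSED with
+KEY_BLOCK_SIZE=8 implies zip_ssize=4 (512 << 4 = 8192) and
+ATOMIC_BLOBS, so
+
+	ulint	flags = DICT_TF_COMPACT
+		| 4U << DICT_TF_POS_ZIP_SSIZE
+		| 1U << DICT_TF_POS_ATOMIC_BLOBS;	// == 0x29
+
+	ut_ad(DICT_TF_GET_ZIP_SSIZE(flags) == 4);
+	ut_ad(DICT_TF_HAS_ATOMIC_BLOBS(flags));
+*/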
+
+/** @brief Table Flags set number 2.
+
+These flags will be stored in SYS_TABLES.MIX_LEN. All unused flags
+will be written as 0. The column may contain garbage for tables
+created with old versions of InnoDB that only implemented
+ROW_FORMAT=REDUNDANT. InnoDB engines do not check these flags
+for unknown bits in order to protect backward compatibility. */
+/* @{ */
+/** Total number of bits in table->flags2. */
+#define DICT_TF2_BITS 7
+#define DICT_TF2_UNUSED_BIT_MASK (~0U << DICT_TF2_BITS)
+#define DICT_TF2_BIT_MASK ~DICT_TF2_UNUSED_BIT_MASK
+
+/** TEMPORARY; TRUE for tables from CREATE TEMPORARY TABLE. */
+#define DICT_TF2_TEMPORARY 1U
+
+/** The table has an internal defined DOC ID column */
+#define DICT_TF2_FTS_HAS_DOC_ID 2U
+
+/** The table has an FTS index */
+#define DICT_TF2_FTS 4U
+
+/** Need to add Doc ID column for FTS index build.
+This is a transient bit for index build */
+#define DICT_TF2_FTS_ADD_DOC_ID 8U
+
+/** This bit is used during table creation to indicate that it will
+use its own tablespace instead of the system tablespace. */
+#define DICT_TF2_USE_FILE_PER_TABLE 16U
+
+/** Set when we discard/detach the tablespace */
+#define DICT_TF2_DISCARDED 32U
+
+/** This bit is set if all aux table names (both common tables and
+index tables) of a FTS table are in HEX format. */
+#define DICT_TF2_FTS_AUX_HEX_NAME 64U
+
+/* @} */
+
+#define DICT_TF2_FLAG_SET(table, flag) \
+ (table->flags2 |= (flag))
+
+#define DICT_TF2_FLAG_IS_SET(table, flag) \
+ (table->flags2 & (flag))
+
+#define DICT_TF2_FLAG_UNSET(table, flag) \
+ (table->flags2 &= ~(flag) & ((1U << DICT_TF2_BITS) - 1))
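+
+/* Example use of the accessors above (a sketch): marking a tablespace
+as discarded and later clearing the flag:
+
+	DICT_TF2_FLAG_SET(table, DICT_TF2_DISCARDED);
+	ut_ad(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_DISCARDED));
+	DICT_TF2_FLAG_UNSET(table, DICT_TF2_DISCARDED);
+*/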
+
+/** Tables can be chained together with foreign key constraints. When
+the parent table is first loaded, we would load all of its descendants.
+This could eventually result in recursive calls and a stack overflow.
+DICT_FK_MAX_RECURSIVE_LOAD defines the maximum number of recursive loads;
+when it is exceeded, the child table will not be loaded. It will be loaded
+when the foreign key constraint check needs to be run. */
+#define DICT_FK_MAX_RECURSIVE_LOAD 20
+
+/** Similarly, when tables are chained together with foreign key
+constraints with ON DELETE/UPDATE CASCADE clauses, a delete from the
+parent table can result in recursive cascading calls. This defines the
+maximum number of such cascading deletes/updates allowed. When it is
+exceeded, the delete from the parent table will fail, and the user has
+to drop the excessive foreign key constraints before proceeding. */
+#define FK_MAX_CASCADE_DEL 15
+
+/** Create a table memory object.
+@param name table name
+@param space tablespace
+@param n_cols total number of columns (both virtual and non-virtual)
+@param n_v_cols number of virtual columns
+@param flags table flags
+@param flags2 table flags2
+@return own: table object */
+dict_table_t *dict_mem_table_create(const char *name, fil_space_t *space,
+ ulint n_cols, ulint n_v_cols, ulint flags,
+ ulint flags2);
+/****************************************************************/ /**
+ Free a table memory object. */
+void
+dict_mem_table_free(
+/*================*/
+ dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Adds a column definition to a table. */
+void
+dict_mem_table_add_col(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */
+ const char* name, /*!< in: column name, or NULL */
+ ulint mtype, /*!< in: main datatype */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision */
+ MY_ATTRIBUTE((nonnull(1)));
+/** Adds a virtual column definition to a table.
+@param[in,out] table table
+@param[in] heap temporary memory heap, or NULL. It is
+ used to store name when we have not finished
+ adding all columns. When all columns are
+ added, the whole name will copy to memory from
+ table->heap
+@param[in] name column name
+@param[in] mtype main datatype
+@param[in] prtype precise type
+@param[in] len length
+@param[in] pos position in a table
+@param[in] num_base number of base columns
+@return the virtual column definition */
+dict_v_col_t*
+dict_mem_table_add_v_col(
+ dict_table_t* table,
+ mem_heap_t* heap,
+ const char* name,
+ ulint mtype,
+ ulint prtype,
+ ulint len,
+ ulint pos,
+ ulint num_base);
+
+/** Adds a stored column definition to a table.
+@param[in] table table
+@param[in] num_base number of base columns. */
+void
+dict_mem_table_add_s_col(
+ dict_table_t* table,
+ ulint num_base);
+
+/**********************************************************************//**
+Renames a column of a table in the data dictionary cache. */
+void
+dict_mem_table_col_rename(
+/*======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ulint nth_col,/*!< in: column index */
+ const char* from, /*!< in: old column name */
+ const char* to, /*!< in: new column name */
+ bool is_virtual);
+ /*!< in: if this is a virtual column */
+/**********************************************************************//**
+This function populates a dict_col_t memory structure with
+supplied information. */
+void
+dict_mem_fill_column_struct(
+/*========================*/
+ dict_col_t* column, /*!< out: column struct to be
+ filled */
+ ulint col_pos, /*!< in: column position */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint col_len); /*!< in: column length */
+/**********************************************************************//**
+This function populates a dict_index_t index memory structure with
+supplied information. */
+UNIV_INLINE
+void
+dict_mem_fill_index_struct(
+/*=======================*/
+ dict_index_t* index, /*!< out: index to be filled */
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* index_name, /*!< in: index name */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields); /*!< in: number of fields */
+/**********************************************************************//**
+Creates an index memory object.
+@return own: index object */
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+ dict_table_t* table, /*!< in: table */
+ const char* index_name, /*!< in: index name */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields); /*!< in: number of fields */
+/**********************************************************************//**
+Adds a field definition to an index. NOTE: does not take a copy
+of the column name if the field is a column. The memory occupied
+by the column name may be released only after publishing the index. */
+void
+dict_mem_index_add_field(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ const char* name, /*!< in: column name */
+ ulint prefix_len); /*!< in: 0 or the column prefix length
+ in a MySQL index like
+ INDEX (textcol(25)) */
+/**********************************************************************//**
+Frees an index memory object. */
+void
+dict_mem_index_free(
+/*================*/
+ dict_index_t* index); /*!< in: index */
+/**********************************************************************//**
+Creates and initializes a foreign constraint memory object.
+@return own: foreign constraint struct */
+dict_foreign_t*
+dict_mem_foreign_create(void);
+/*=========================*/
+
+/**********************************************************************//**
+Sets the foreign_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup
+will point to foreign_table_name. If 2, then another string is
+allocated from the heap and set to lower case. */
+void
+dict_mem_foreign_table_name_lookup_set(
+/*===================================*/
+ dict_foreign_t* foreign, /*!< in/out: foreign struct */
+ ibool do_alloc); /*!< in: is an alloc needed */
+
+/**********************************************************************//**
+Sets the referenced_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup
+will point to referenced_table_name. If 2, then another string is
+allocated from the heap and set to lower case. */
+void
+dict_mem_referenced_table_name_lookup_set(
+/*======================================*/
+ dict_foreign_t* foreign, /*!< in/out: foreign struct */
+ ibool do_alloc); /*!< in: is an alloc needed */
+
+/** Fills the dependent virtual columns in a set.
+Reasons for being dependent are:
+1) an FK can be present on a base column of virtual columns
+2) an FK can be present on a column which is part of a virtual index
+@param[in,out] foreign foreign key information. */
+void
+dict_mem_foreign_fill_vcol_set(
+ dict_foreign_t* foreign);
+
+/** Fill virtual columns set in each fk constraint present in the table.
+@param[in,out] table innodb table object. */
+void
+dict_mem_table_fill_foreign_vcol_set(
+ dict_table_t* table);
+
+/** Free the vcol_set from all foreign key constraint on the table.
+@param[in,out] table innodb table object. */
+void
+dict_mem_table_free_foreign_vcol_set(
+ dict_table_t* table);
+
+/** Create a temporary tablename like "#sql-ibNNN".
+@param[in] heap A memory heap
+@param[in] dbtab Table name in the form database/table name
+@param[in] id Table id
+@return A unique temporary tablename suitable for InnoDB use */
+char*
+dict_mem_create_temporary_tablename(
+ mem_heap_t* heap,
+ const char* dbtab,
+ table_id_t id);
+
+/** SQL identifier name wrapper for pretty-printing */
+class id_name_t
+{
+public:
+ /** Default constructor */
+ id_name_t()
+ : m_name()
+ {}
+ /** Constructor
+ @param[in] name identifier to assign */
+ explicit id_name_t(
+ const char* name)
+ : m_name(name)
+ {}
+
+ /** Assignment operator
+ @param[in] name identifier to assign */
+ id_name_t& operator=(
+ const char* name)
+ {
+ m_name = name;
+ return(*this);
+ }
+
+ /** Implicit type conversion
+ @return the name */
+ operator const char*() const
+ {
+ return(m_name);
+ }
+
+ /** Explicit type conversion
+ @return the name */
+ const char* operator()() const
+ {
+ return(m_name);
+ }
+
+private:
+ /** The name in internal representation */
+ const char* m_name;
+};
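+
+/* A short usage sketch: both conversions yield the wrapped C string,
+so an id_name_t can be streamed or passed to C APIs directly:
+
+	id_name_t	n("PRIMARY");
+	const char*	s = n;			// implicit conversion
+	ut_ad(!strcmp(n(), s));			// explicit conversion
+*/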
+
+/** Data structure for a column in a table */
+struct dict_col_t{
+ /*----------------------*/
+ /** The following are copied from dtype_t,
+ so that all bit-fields can be packed tightly. */
+ /* @{ */
+ unsigned prtype:32; /*!< precise type; MySQL data
+ type, charset code, flags to
+ indicate nullability,
+ signedness, whether this is a
+ binary string, whether this is
+ a true VARCHAR where MySQL
+ uses 2 bytes to store the length */
+ unsigned mtype:8; /*!< main data type */
+
+ /* the remaining fields do not affect alphabetical ordering: */
+
+ unsigned len:16; /*!< length; for MySQL data this
+ is field->pack_length(),
+ except that for a >= 5.0.3
+ type true VARCHAR this is the
+ maximum byte length of the
+ string data (in addition to
+ the string, MySQL uses 1 or 2
+ bytes to store the string length) */
+
+ unsigned mbminlen:3; /*!< minimum length of a
+ character, in bytes */
+ unsigned mbmaxlen:3; /*!< maximum length of a
+ character, in bytes */
+ /*----------------------*/
+ /* End of definitions copied from dtype_t */
+ /* @} */
+
+ unsigned ind:10; /*!< table column position
+ (starting from 0) */
+ unsigned ord_part:1; /*!< nonzero if this column
+ appears in the ordering fields
+ of an index */
+ unsigned max_prefix:12; /*!< maximum index prefix length on
+ this column. Our current max limit is
+ 3072 (REC_VERSION_56_MAX_INDEX_COL_LEN)
+ bytes. */
+private:
+ /** Special value of ind for a dropped column */
+ static const unsigned DROPPED = 1023;
+public:
+
+ /** Detach a virtual column from an index.
+ @param index being-freed index */
+ inline void detach(const dict_index_t &index);
+
+ /** Data for instantly added columns */
+ struct def_t
+ {
+ /** original default value of instantly added column */
+ const void *data;
+ /** len of data, or UNIV_SQL_DEFAULT if unavailable */
+ ulint len;
+ } def_val;
+
+ /** Retrieve the column name.
+ @param table the table of this column */
+ const char *name(const dict_table_t &table) const;
+
+ /** @return whether this is a virtual column */
+ bool is_virtual() const { return prtype & DATA_VIRTUAL; }
+ /** @return whether NULL is an allowed value for this column */
+ bool is_nullable() const { return !(prtype & DATA_NOT_NULL); }
+
+ /** @return whether table of this system field is TRX_ID-based */
+ bool vers_native() const
+ {
+ ut_ad(vers_sys_start() || vers_sys_end());
+ ut_ad(mtype == DATA_INT || mtype == DATA_FIXBINARY);
+ return mtype == DATA_INT;
+ }
+ /** @return whether this user column (not row_start, row_end)
+ has System Versioning property */
+ bool is_versioned() const { return !(~prtype & DATA_VERSIONED); }
+ /** @return whether this is the system version start */
+ bool vers_sys_start() const
+ {
+ return (prtype & DATA_VERSIONED) == DATA_VERS_START;
+ }
+ /** @return whether this is the system version end */
+ bool vers_sys_end() const
+ {
+ return (prtype & DATA_VERSIONED) == DATA_VERS_END;
+ }
+
+ /** @return whether this is an instantly-added column */
+ bool is_added() const
+ {
+ DBUG_ASSERT(def_val.len != UNIV_SQL_DEFAULT || !def_val.data);
+ return def_val.len != UNIV_SQL_DEFAULT;
+ }
+ /** Flag the column instantly dropped */
+ void set_dropped() { ind = DROPPED; }
+ /** Flag the column instantly dropped.
+ @param not_null whether the column was NOT NULL
+ @param len2 whether the length exceeds 255 bytes
+	@param fixed	the fixed length in bytes, or 0 */
+ void set_dropped(bool not_null, bool len2, unsigned fixed)
+ {
+ DBUG_ASSERT(!len2 || !fixed);
+ prtype= not_null ? DATA_NOT_NULL | DATA_BINARY_TYPE : DATA_BINARY_TYPE;
+ if (fixed)
+ {
+ mtype= DATA_FIXBINARY;
+ len= static_cast<uint16_t>(fixed);
+ }
+ else
+ {
+ mtype= DATA_BINARY;
+ len= len2 ? 65535 : 255;
+ }
+ mbminlen= mbmaxlen= 0;
+ ind= DROPPED;
+ ord_part= 0;
+ max_prefix= 0;
+ }
+ /** @return whether the column was instantly dropped */
+ bool is_dropped() const { return ind == DROPPED; }
+ /** @return whether the column was instantly dropped
+ @param index the clustered index */
+ inline bool is_dropped(const dict_index_t &index) const;
+
+ /** Get the default value of an instantly-added column.
+ @param[out] len value length (in bytes), or UNIV_SQL_NULL
+ @return default value
+ @retval NULL if the default value is SQL NULL (len=UNIV_SQL_NULL) */
+ const byte *instant_value(ulint *len) const
+ {
+ DBUG_ASSERT(is_added());
+ *len= def_val.len;
+ return static_cast<const byte*>(def_val.data);
+ }
+
+ /** Remove the 'instant ADD' status of the column */
+ void clear_instant()
+ {
+ def_val.len= UNIV_SQL_DEFAULT;
+ def_val.data= NULL;
+ }
+
+ /** @return whether two columns have compatible data type encoding */
+ bool same_type(const dict_col_t &other) const
+ {
+ if (mtype != other.mtype)
+ {
+ /* For latin1_swedish_ci, DATA_CHAR and DATA_VARCHAR
+ will be used instead of DATA_MYSQL and DATA_VARMYSQL.
+ As long as mtype,prtype are being written to InnoDB
+ data dictionary tables, we cannot simplify this. */
+ switch (mtype) {
+ default:
+ return false;
+ case DATA_VARCHAR:
+ if (other.mtype != DATA_VARMYSQL)
+ return false;
+ goto check_encoding;
+ case DATA_VARMYSQL:
+ if (other.mtype != DATA_VARCHAR)
+ return false;
+ goto check_encoding;
+ case DATA_CHAR:
+ if (other.mtype != DATA_MYSQL)
+ return false;
+ goto check_encoding;
+ case DATA_MYSQL:
+ if (other.mtype != DATA_CHAR)
+ return false;
+ goto check_encoding;
+ }
+ }
+ else if (dtype_is_string_type(mtype))
+ {
+ check_encoding:
+ const uint16_t cset= dtype_get_charset_coll(prtype);
+ const uint16_t ocset= dtype_get_charset_coll(other.prtype);
+ return cset == ocset || dict_col_t::same_encoding(cset, ocset);
+ }
+
+ return true;
+ }
+
+ /** @return whether two collations codes have the same character encoding */
+ static bool same_encoding(uint16_t a, uint16_t b);
+
+ /** Determine if the columns have the same format
+ except for is_nullable() and is_versioned().
+ @param other column to compare to
+ @return whether the columns have the same format */
+ bool same_format(const dict_col_t &other) const
+ {
+ return same_type(other) && len >= other.len &&
+ mbminlen == other.mbminlen && mbmaxlen >= other.mbmaxlen &&
+ !((prtype ^ other.prtype) & ~(DATA_NOT_NULL | DATA_VERSIONED |
+ CHAR_COLL_MASK << 16 |
+ DATA_LONG_TRUE_VARCHAR));
+ }
+
+ /** @return whether the column values are comparable by memcmp() */
+ bool is_binary() const { return prtype & DATA_BINARY_TYPE; }
+};
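+
+/* A sketch of reading an instantly-added column's default value (the
+locals are illustrative):
+
+	if (col.is_added()) {
+		ulint		len;
+		const byte*	d = col.instant_value(&len);
+		...	use d and len; len == UNIV_SQL_NULL
+			means that the default is SQL NULL	...
+	}
+*/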
+
+/** Index information kept in a list in the virtual column structure.
+The index id and the virtual column position in the index will be logged.
+There can be multiple entries for a given index, each with a different
+position. */
+struct dict_v_idx_t {
+ /** active index on the column */
+ dict_index_t* index;
+
+ /** position in this index */
+ ulint nth_field;
+
+ dict_v_idx_t(dict_index_t* index, ulint nth_field)
+ : index(index), nth_field(nth_field) {}
+};
+
+/** Data structure for a virtual column in a table */
+struct dict_v_col_t{
+ /** column structure */
+ dict_col_t m_col;
+
+	/** array of base column pointers */
+ dict_col_t** base_col;
+
+	/** number of base columns */
+ unsigned num_base:10;
+
+ /** column pos in table */
+ unsigned v_pos:10;
+
+ /** Virtual index list, and column position in the index */
+ std::forward_list<dict_v_idx_t, ut_allocator<dict_v_idx_t> >
+ v_indexes;
+
+ /** Detach the column from an index.
+ @param index index to be detached from */
+ void detach(const dict_index_t &index)
+ {
+ if (v_indexes.empty()) return;
+ auto i= v_indexes.before_begin();
+ do {
+ auto prev = i++;
+ if (i == v_indexes.end())
+ {
+ return;
+ }
+ if (i->index == &index)
+ {
+ v_indexes.erase_after(prev);
+ return;
+ }
+ }
+ while (i != v_indexes.end());
+ }
+};
+
+/** Data structure for a newly added virtual column in an index.
+It is used only during rollback_inplace_alter_table() for the
+addition of an index that depends on newly added virtual columns,
+and it uses the index heap. It should be freed when the index is
+removed from the cache. */
+struct dict_add_v_col_info
+{
+ ulint n_v_col;
+ dict_v_col_t *v_col;
+
+	/** Add the newly added virtual column while rolling back
+	the index which contains new virtual columns.
+	@param heap	memory heap for the duplicated columns
+	@param col	virtual column to be duplicated
+	@param offset	offset at which to duplicate the virtual column */
+ dict_v_col_t* add_drop_v_col(mem_heap_t *heap, dict_v_col_t *col,
+ ulint offset)
+ {
+ ut_ad(n_v_col);
+ ut_ad(offset < n_v_col);
+ if (!v_col)
+ v_col= static_cast<dict_v_col_t*>
+ (mem_heap_alloc(heap, n_v_col * sizeof *v_col));
+ new (&v_col[offset]) dict_v_col_t();
+ v_col[offset].m_col= col->m_col;
+ v_col[offset].v_pos= col->v_pos;
+ return &v_col[offset];
+ }
+};
+
+/** Data structure for newly added virtual column in a table */
+struct dict_add_v_col_t{
+	/** number of new virtual columns */
+ ulint n_v_col;
+
+ /** column structures */
+ const dict_v_col_t* v_col;
+
+	/** new column names */
+ const char** v_col_name;
+};
+
+/** Data structure for a stored column in a table. */
+struct dict_s_col_t {
+ /** Stored column ptr */
+ dict_col_t* m_col;
+ /** array of base col ptr */
+ dict_col_t** base_col;
+ /** number of base columns */
+ ulint num_base;
+ /** column pos in table */
+ ulint s_pos;
+};
+
+/** list to put stored column for create_table_info_t */
+typedef std::forward_list<dict_s_col_t, ut_allocator<dict_s_col_t> >
+dict_s_col_list;
+
+/** @brief DICT_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and
+is the maximum indexed column length (or indexed prefix length) in
+ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. Also, in any format,
+any fixed-length field that is longer than this will be encoded as
+a variable-length field.
+
+It is set to 3*256, so that one can create a column prefix index on
+256 characters of a TEXT or VARCHAR column also in the UTF-8
+charset. In that charset, a character may take at most 3 bytes. This
+constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define DICT_ANTELOPE_MAX_INDEX_COL_LEN REC_ANTELOPE_MAX_INDEX_COL_LEN
+
+/** Find out maximum indexed column length by its table format.
+For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, the maximum
+field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767). For
+ROW_FORMAT=COMPRESSED and ROW_FORMAT=DYNAMIC, the length could
+be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */
+#define DICT_MAX_FIELD_LEN_BY_FORMAT(table) \
+ (dict_table_has_atomic_blobs(table) \
+ ? REC_VERSION_56_MAX_INDEX_COL_LEN \
+ : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)
+
+#define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags) \
+ (DICT_TF_HAS_ATOMIC_BLOBS(flags) \
+ ? REC_VERSION_56_MAX_INDEX_COL_LEN \
+ : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)
+
+/** Defines the maximum fixed length column size */
+#define DICT_MAX_FIXED_COL_LEN DICT_ANTELOPE_MAX_INDEX_COL_LEN
+
+#ifdef WITH_WSREP
+#define WSREP_MAX_SUPPORTED_KEY_LENGTH 3500
+#endif /* WITH_WSREP */
+
+/** Data structure for a field in an index */
+struct dict_field_t{
+ dict_col_t* col; /*!< pointer to the table column */
+ id_name_t name; /*!< name of the column */
+ unsigned prefix_len:12; /*!< 0 or the length of the column
+ prefix in bytes in a MySQL index of
+ type, e.g., INDEX (textcol(25));
+ must be smaller than
+ DICT_MAX_FIELD_LEN_BY_FORMAT;
+ NOTE that in the UTF-8 charset, MySQL
+ sets this to (mbmaxlen * the prefix len)
+ in UTF-8 chars */
+ unsigned fixed_len:10; /*!< 0 or the fixed length of the
+ column if smaller than
+ DICT_ANTELOPE_MAX_INDEX_COL_LEN */
+
+ /** Zero-initialize all fields */
+ dict_field_t() : col(NULL), name(NULL), prefix_len(0), fixed_len(0) {}
+
+	/** Check whether two index fields are equivalent.
+	@param[in]	other	the other index field
+	@return whether the index fields are equivalent */
+ bool same(const dict_field_t& other) const
+ {
+ return(prefix_len == other.prefix_len
+ && fixed_len == other.fixed_len);
+ }
+};
+
+/**********************************************************************//**
+PADDING HEURISTIC BASED ON LINEAR INCREASE OF PADDING TO AVOID
+COMPRESSION FAILURES
+(Note: this is relevant only for compressed indexes)
+GOAL: Avoid compression failures by maintaining information about the
+compressibility of data. If data is not very compressible then leave
+some extra space 'padding' in the uncompressed page making it more
+likely that compression of less than fully packed uncompressed page will
+succeed.
+
+This padding heuristic works by increasing the pad linearly until the
+desired failure rate is reached. A "round" is a fixed number of
+compression operations.
+After each round, the compression failure rate for that round is
+computed. If the failure rate is too high, then padding is incremented
+by a fixed value, otherwise it's left intact.
+If the compression failure rate is lower than the desired rate for a
+fixed number of consecutive rounds, then the padding is decreased by a
+fixed value. This is done to prevent overshooting the padding value,
+and to accommodate possible changes in data compressibility (see the
+sketch after the constants below). */
+
+/** Number of zip ops in one round. */
+#define ZIP_PAD_ROUND_LEN (128)
+
+/** Number of successful rounds after which the padding is decreased */
+#define ZIP_PAD_SUCCESSFUL_ROUND_LIMIT (5)
+
+/** Amount by which padding is increased. */
+#define ZIP_PAD_INCR (128)
+
+/** Percentage of compression failures that are allowed in a single
+round */
+extern ulong zip_failure_threshold_pct;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+extern ulong zip_pad_max;
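+
+/* A sketch of one round of the heuristic described above (illustrative
+pseudo-code; the real bookkeeping lives in dict0dict.cc, and the pad is
+additionally capped by zip_pad_max percent of the page):
+
+	if (ops_this_round == ZIP_PAD_ROUND_LEN) {
+		if (failure_pct > zip_failure_threshold_pct) {
+			pad += ZIP_PAD_INCR;		// failing too often
+			n_good_rounds = 0;
+		} else if (++n_good_rounds
+			   >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT) {
+			pad -= ZIP_PAD_INCR;		// safe to shrink
+			n_good_rounds = 0;
+		}
+	}
+*/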
+
+/** Data structure to hold information about how much space in
+an uncompressed page should be left as padding to avoid compression
+failures. This estimate is based on a self-adapting heuristic. */
+struct zip_pad_info_t {
+ /** Dummy assignment operator for dict_index_t::clone() */
+ zip_pad_info_t &operator=(const zip_pad_info_t&) { return *this; }
+ std::mutex mutex; /*!< mutex protecting the info */
+ Atomic_relaxed<ulint>
+ pad; /*!< number of bytes used as pad */
+ ulint success;/*!< successful compression ops during
+ current round */
+ ulint failure;/*!< failed compression ops during
+ current round */
+ ulint n_rounds;/*!< number of currently successful
+ rounds */
+};
+
+/** Number of samples of data size kept when page compression fails for
+a certain index.*/
+#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10
+
+/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default
+system clustered index when there is no primary key. */
+const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX";
+
+/** Data structure for an index. Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_index_create(). */
+struct dict_index_t {
+ /** Maximum number of fields */
+ static constexpr unsigned MAX_N_FIELDS= (1U << 10) - 1;
+
+ index_id_t id; /*!< id of the index */
+ mem_heap_t* heap; /*!< memory heap */
+ id_name_t name; /*!< index name */
+ dict_table_t* table; /*!< back pointer to table */
+ /** root page number, or FIL_NULL if the index has been detached
+ from storage (DISCARD TABLESPACE or similar),
+ or 1 if the index is in table->freed_indexes */
+ unsigned page:32;
+ unsigned merge_threshold:6;
+ /*!< In the pessimistic delete, if the page
+ data size drops below this limit in percent,
+ merging it to a neighbor is tried */
+# define DICT_INDEX_MERGE_THRESHOLD_DEFAULT 50
+ unsigned type:DICT_IT_BITS;
+ /*!< index type (DICT_CLUSTERED, DICT_UNIQUE,
+ DICT_IBUF, DICT_CORRUPT) */
+#define MAX_KEY_LENGTH_BITS 12
+ unsigned trx_id_offset:MAX_KEY_LENGTH_BITS;
+ /*!< position of the trx id column
+ in a clustered index record, if the fields
+ before it are known to be of a fixed size,
+ 0 otherwise */
+#if (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH
+# error (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH
+#endif
+ unsigned n_user_defined_cols:10;
+ /*!< number of columns the user defined to
+ be in the index: in the internal
+ representation we add more columns */
+ unsigned nulls_equal:1;
+ /*!< if true, SQL NULL == SQL NULL */
+#ifdef BTR_CUR_HASH_ADAPT
+#ifdef MYSQL_INDEX_DISABLE_AHI
+ unsigned disable_ahi:1;
+ /*!< whether to disable the
+ adaptive hash index.
+ Maybe this could be disabled for
+ temporary tables? */
+#endif
+#endif /* BTR_CUR_HASH_ADAPT */
+ unsigned n_uniq:10;/*!< number of fields from the beginning
+ which are enough to determine an index
+ entry uniquely */
+ unsigned n_def:10;/*!< number of fields defined so far */
+ unsigned n_fields:10;/*!< number of fields in the index */
+ unsigned n_nullable:10;/*!< number of nullable fields */
+ unsigned n_core_fields:10;/*!< number of fields in the index
+ (before the first time of instant add columns) */
+ /** number of bytes of null bits in ROW_FORMAT!=REDUNDANT node pointer
+ records; usually equal to UT_BITS_IN_BYTES(n_nullable), but
+ can be less in clustered indexes with instant ADD COLUMN */
+ unsigned n_core_null_bytes:8;
+ /** magic value signalling that n_core_null_bytes was not
+ initialized yet */
+ static const unsigned NO_CORE_NULL_BYTES = 0xff;
+ /** The clustered index ID of the hard-coded SYS_INDEXES table. */
+ static const unsigned DICT_INDEXES_ID = 3;
+ unsigned cached:1;/*!< TRUE if the index object is in the
+ dictionary cache */
+ unsigned to_be_dropped:1;
+ /*!< TRUE if the index is to be dropped;
+ protected by dict_sys.latch */
+ unsigned online_status:2;
+ /*!< enum online_index_status.
+ Transitions from ONLINE_INDEX_COMPLETE (to
+ ONLINE_INDEX_CREATION) are protected
+ by dict_sys.latch and
+ dict_sys.mutex. Other changes are
+ protected by index->lock. */
+ unsigned uncommitted:1;
+ /*!< a flag that is set for secondary indexes
+ that have not been committed to the
+ data dictionary yet */
+
+#ifdef UNIV_DEBUG
+ /** whether this is a dummy index object */
+ bool is_dummy;
+ /** whether btr_cur_instant_init() is in progress */
+ bool in_instant_init;
+ uint32_t magic_n;/*!< magic number */
+/** Value of dict_index_t::magic_n */
+# define DICT_INDEX_MAGIC_N 76789786
+#endif
+ dict_field_t* fields; /*!< array of field descriptions */
+ st_mysql_ftparser*
+ parser; /*!< fulltext parser plugin */
+
+	/** Information about virtual columns that were newly added
+	during ALTER TABLE. The columns are stored here in case the
+	ALTER operation fails. The memory comes from the dict_index_t
+	heap and should be freed when the index is removed from the
+	table. */
+ dict_add_v_col_info* new_vcol_info;
+ UT_LIST_NODE_T(dict_index_t)
+ indexes;/*!< list of indexes of the table */
+#ifdef BTR_CUR_ADAPT
+ btr_search_t* search_info;
+ /*!< info used in optimistic searches */
+#endif /* BTR_CUR_ADAPT */
+ row_log_t* online_log;
+ /*!< the log of modifications
+ during online index creation;
+ valid when online_status is
+ ONLINE_INDEX_CREATION */
+ /*----------------------*/
+ /** Statistics for query optimization */
+ /* @{ */
+ ib_uint64_t* stat_n_diff_key_vals;
+ /*!< approximate number of different
+ key values for this index, for each
+ n-column prefix where 1 <= n <=
+ dict_get_n_unique(index) (the array is
+ indexed from 0 to n_uniq-1); we
+ periodically calculate new
+ estimates */
+ ib_uint64_t* stat_n_sample_sizes;
+ /*!< number of pages that were sampled
+ to calculate each of stat_n_diff_key_vals[],
+ e.g. stat_n_sample_sizes[3] pages were sampled
+ to get the number stat_n_diff_key_vals[3]. */
+ ib_uint64_t* stat_n_non_null_key_vals;
+				/*!< approximate number of non-null key values
+				for this index, for each column where
+				1 <= n <= dict_get_n_unique(index) (the array
+				is indexed from 0 to n_uniq-1); this is
+				used when innodb_stats_method is
+				"nulls_ignored". */
+ ulint stat_index_size;
+ /*!< approximate index size in
+ database pages */
+ ulint stat_n_leaf_pages;
+ /*!< approximate number of leaf pages in the
+ index tree */
+ bool stats_error_printed;
+				/*!< whether a persistent statistics error has
+				been printed for this index */
+ /* @} */
+ /** Statistics for defragmentation, these numbers are estimations and
+ could be very inaccurate at certain times, e.g. right after restart,
+ during defragmentation, etc. */
+ /* @{ */
+ ulint stat_defrag_modified_counter;
+ ulint stat_defrag_n_pages_freed;
+ /* number of pages freed by defragmentation. */
+ ulint stat_defrag_n_page_split;
+ /* number of page splits since last full index
+ defragmentation. */
+ ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE];
+ /* data size when compression failure happened
+ the most recent 10 times. */
+ ulint stat_defrag_sample_next_slot;
+ /* in which slot the next sample should be
+ saved. */
+ /* @} */
+private:
+ /** R-tree split sequence number */
+ Atomic_relaxed<node_seq_t> rtr_ssn;
+public:
+ void set_ssn(node_seq_t ssn) { rtr_ssn= ssn; }
+ node_seq_t assign_ssn() { return rtr_ssn.fetch_add(1) + 1; }
+ node_seq_t ssn() const { return rtr_ssn; }
+
+ rtr_info_track_t*
+ rtr_track;/*!< tracking all R-Tree search cursors */
+ trx_id_t trx_id; /*!< id of the transaction that created this
+ index, or 0 if the index existed
+ when InnoDB was started up */
+ zip_pad_info_t zip_pad;/*!< Information about state of
+ compression failures and successes */
+ mutable rw_lock_t lock; /*!< read-write lock protecting the
+ upper levels of the index tree */
+
+ /** Determine if the index has been committed to the
+ data dictionary.
+ @return whether the index definition has been committed */
+ bool is_committed() const
+ {
+ ut_ad(!uncommitted || !(type & DICT_CLUSTERED));
+ return(UNIV_LIKELY(!uncommitted));
+ }
+
+ /** Flag an index committed or uncommitted.
+ @param[in] committed whether the index is committed */
+ void set_committed(bool committed)
+ {
+ ut_ad(!to_be_dropped);
+ ut_ad(committed || !(type & DICT_CLUSTERED));
+ uncommitted = !committed;
+ }
+
+ /** Notify that the index pages are going to be modified.
+ @param[in,out] mtr mini-transaction */
+ inline void set_modified(mtr_t& mtr) const;
+
+ /** @return whether this index is readable
+ @retval true normally
+ @retval false if this is a single-table tablespace
+ and the .ibd file is missing, or a
+ page cannot be read or decrypted */
+ inline bool is_readable() const;
+
+ /** @return whether instant ALTER TABLE is in effect */
+ inline bool is_instant() const;
+
+ /** @return whether the index is the primary key index
+ (not the clustered index of the change buffer) */
+ bool is_primary() const
+ {
+ return DICT_CLUSTERED == (type & (DICT_CLUSTERED | DICT_IBUF));
+ }
+
+ /** @return whether this is a generated clustered index */
+ bool is_gen_clust() const { return type == DICT_CLUSTERED; }
+
+ /** @return whether this is a clustered index */
+ bool is_clust() const { return type & DICT_CLUSTERED; }
+
+ /** @return whether this is a unique index */
+ bool is_unique() const { return type & DICT_UNIQUE; }
+
+ /** @return whether this is a spatial index */
+ bool is_spatial() const { return UNIV_UNLIKELY(type & DICT_SPATIAL); }
+
+ /** @return whether this is the change buffer */
+ bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); }
+
+ /** @return whether the index includes virtual columns */
+ bool has_virtual() const { return type & DICT_VIRTUAL; }
+
+ /** @return the position of DB_TRX_ID */
+ uint16_t db_trx_id() const {
+ DBUG_ASSERT(is_primary());
+ DBUG_ASSERT(n_uniq);
+ DBUG_ASSERT(n_uniq <= MAX_REF_PARTS);
+ return n_uniq;
+ }
+ /** @return the position of DB_ROLL_PTR */
+ uint16_t db_roll_ptr() const
+ {
+ return static_cast<uint16_t>(db_trx_id() + 1);
+ }
+
+	/** @return the offset of the metadata BLOB field, or of the first
+	user field after PRIMARY KEY, DB_TRX_ID, DB_ROLL_PTR */
+ uint16_t first_user_field() const
+ {
+ return static_cast<uint16_t>(db_trx_id() + 2);
+ }
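+
+	/* Worked example (sketch, not part of the original header): for
+	a clustered index on PRIMARY KEY(a, b), n_uniq == 2, so
+	db_trx_id() == 2, db_roll_ptr() == 3 and first_user_field() == 4
+	(the metadata BLOB in an instant ALTER TABLE metadata record, or
+	otherwise the first non-PK user column). */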
+
+ /** @return whether the index is corrupted */
+ inline bool is_corrupted() const;
+
+ /** Detach the virtual columns from the index that is to be removed. */
+ void detach_columns()
+ {
+ if (!has_virtual() || !cached)
+ return;
+ for (unsigned i= 0; i < n_fields; i++)
+ {
+ dict_col_t* col= fields[i].col;
+ if (!col || !col->is_virtual())
+ continue;
+ col->detach(*this);
+ }
+ }
+
+ /** Determine how many fields of a given prefix can be set NULL.
+ @param[in] n_prefix number of fields in the prefix
+ @return number of fields 0..n_prefix-1 that can be set NULL */
+ unsigned get_n_nullable(ulint n_prefix) const
+ {
+ DBUG_ASSERT(n_prefix > 0);
+ DBUG_ASSERT(n_prefix <= n_fields);
+ unsigned n = n_nullable;
+ for (; n_prefix < n_fields; n_prefix++) {
+ const dict_col_t* col = fields[n_prefix].col;
+ DBUG_ASSERT(!col->is_virtual());
+ n -= col->is_nullable();
+ }
+ DBUG_ASSERT(n < n_def);
+ return n;
+ }
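+
+	/* Worked example (sketch, not part of the original header): for
+	an index on (a NOT NULL, b NULL, c NULL), n_fields = 3 and
+	n_nullable = 2. get_n_nullable(2) subtracts the nullable field c
+	that lies outside the 2-field prefix, giving 1: of the first two
+	fields, only b can be set NULL. */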
+
+ /** Get the default value of an instantly-added clustered index field.
+ @param[in] n instantly added field position
+ @param[out] len value length (in bytes), or UNIV_SQL_NULL
+ @return default value
+ @retval NULL if the default value is SQL NULL (len=UNIV_SQL_NULL) */
+ const byte* instant_field_value(ulint n, ulint* len) const
+ {
+ DBUG_ASSERT(is_instant() || id == DICT_INDEXES_ID);
+ DBUG_ASSERT(n + (id == DICT_INDEXES_ID) >= n_core_fields);
+ DBUG_ASSERT(n < n_fields);
+ return fields[n].col->instant_value(len);
+ }
+
+	/** Adjust index metadata for instant ADD/DROP/reorder COLUMN.
+	@param[in]	instant	clustered index definition after
+				instant ALTER TABLE */
+ inline void instant_add_field(const dict_index_t& instant);
+ /** Remove instant ADD COLUMN metadata. */
+ inline void clear_instant_add();
+ /** Remove instant ALTER TABLE metadata. */
+ inline void clear_instant_alter();
+
+ /** Construct the metadata record for instant ALTER TABLE.
+ @param[in] row dummy or default values for existing columns
+ @param[in,out] heap memory heap for allocations
+ @return metadata record */
+ inline dtuple_t*
+ instant_metadata(const dtuple_t& row, mem_heap_t* heap) const;
+
+ /** Check if record in clustered index is historical row.
+ @param[in] rec clustered row
+ @param[in] offsets offsets
+ @return true if row is historical */
+ bool
+ vers_history_row(const rec_t* rec, const rec_offs* offsets);
+
+ /** Check if record in secondary index is historical row.
+ @param[in] rec record in a secondary index
+ @param[out] history_row true if row is historical
+ @return true on error */
+ bool
+ vers_history_row(const rec_t* rec, bool &history_row);
+
+	/** Assign the number of new virtual columns to be added
+	as part of the index
+	@param	n_vcol	number of virtual columns to be added */
+ void assign_new_v_col(ulint n_vcol)
+ {
+ new_vcol_info= static_cast<dict_add_v_col_info*>
+ (mem_heap_zalloc(heap, sizeof *new_vcol_info));
+ new_vcol_info->n_v_col= n_vcol;
+ }
+
+	/** @return whether the index has a newly added virtual column */
+ bool has_new_v_col() const { return new_vcol_info; }
+
+	/** @return the number of newly added virtual columns */
+ ulint get_new_n_vcol() const
+ { return new_vcol_info ? new_vcol_info->n_v_col : 0; }
+
+ /** Reconstruct the clustered index fields. */
+ inline void reconstruct_fields();
+
+ /** Check if the index contains a column or a prefix of that column.
+ @param[in] n column number
+ @param[in] is_virtual whether it is a virtual col
+ @return whether the index contains the column or its prefix */
+ bool contains_col_or_prefix(ulint n, bool is_virtual) const
+ MY_ATTRIBUTE((warn_unused_result));
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /** @return a clone of this */
+ dict_index_t* clone() const;
+ /** Clone this index for lazy dropping of the adaptive hash index.
+ @return this or a clone */
+ dict_index_t* clone_if_needed();
+ /** @return number of leaf pages pointed to by the adaptive hash index */
+ inline ulint n_ahi_pages() const;
+ /** @return whether mark_freed() had been invoked */
+ bool freed() const { return UNIV_UNLIKELY(page == 1); }
+ /** Note that the index is waiting for btr_search_lazy_free() */
+ void set_freed() { ut_ad(!freed()); page= 1; }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /** @return whether it is forbidden to invoke clear_instant_add() */
+ bool must_avoid_clear_instant_add() const
+ {
+ if (is_instant())
+ for (auto i= this; (i= UT_LIST_GET_NEXT(indexes, i)) != nullptr; )
+ if (i->to_be_dropped /* || i->online_log*/)
+ return true;
+ return false;
+ }
+
+ /** This ad-hoc class is used by record_size_info only. */
+ class record_size_info_t {
+ public:
+ record_size_info_t()
+ : max_leaf_size(0), shortest_size(0), too_big(false),
+ first_overrun_field_index(SIZE_T_MAX), overrun_size(0)
+ {
+ }
+
+ /** Mark row potentially too big for page and set up first
+ overflow field index. */
+ void set_too_big(size_t field_index)
+ {
+ ut_ad(field_index != SIZE_T_MAX);
+
+ too_big = true;
+ if (first_overrun_field_index > field_index) {
+ first_overrun_field_index = field_index;
+ overrun_size = shortest_size;
+ }
+ }
+
+		/** @return the overrun field index, or SIZE_T_MAX if
+		nothing overflowed */
+ size_t get_first_overrun_field_index() const
+ {
+ ut_ad(row_is_too_big());
+ ut_ad(first_overrun_field_index != SIZE_T_MAX);
+ return first_overrun_field_index;
+ }
+
+ size_t get_overrun_size() const
+ {
+ ut_ad(row_is_too_big());
+ return overrun_size;
+ }
+
+ bool row_is_too_big() const { return too_big; }
+
+		size_t max_leaf_size; /*!< biggest row size this index
+					 can produce */
+		size_t shortest_size; /*!< shortest possible row size,
+					 counting everything as stored
+					 in overflow pages */
+
+	private:
+		bool too_big; /*!< true when the maximum row size this
+				 index can produce is bigger than the
+				 maximum row size the given page can
+				 hold */
+		size_t first_overrun_field_index; /*!< field index after
+						      whose addition the
+						      row exceeded the
+						      maximum allowed
+						      size; useful for
+						      reporting back to
+						      the user */
+		size_t overrun_size; /*!< row size at the point of
+					overrun */
+ };
+
+	/** Returns, for this index, the maximum possible record size,
+	the size of the shortest possible row (counting everything as
+	stored in overflow pages), and the index of the field which made
+	the records too big to fit on a page. */
+ inline record_size_info_t record_size_info() const;
+};
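+
+/* Usage sketch (illustrative, not part of the original header):
+
+	dict_index_t::record_size_info_t info = index->record_size_info();
+	if (info.row_is_too_big()) {
+		size_t field = info.get_first_overrun_field_index();
+		// report that field made the row exceed the maximum
+		// size (info.get_overrun_size() bytes)
+	}
+*/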
+
+/** Detach a virtual column from an index.
+@param index being-freed index */
+inline void dict_col_t::detach(const dict_index_t &index)
+{
+ if (is_virtual())
+ reinterpret_cast<dict_v_col_t*>(this)->detach(index);
+}
+
+/** The status of online index creation */
+enum online_index_status {
+ /** the index is complete and ready for access */
+ ONLINE_INDEX_COMPLETE = 0,
+ /** the index is being created, online
+ (allowing concurrent modifications) */
+ ONLINE_INDEX_CREATION,
+ /** secondary index creation was aborted and the index
+ should be dropped as soon as index->table->n_ref_count reaches 0,
+ or online table rebuild was aborted and the clustered index
+ of the original table should soon be restored to
+ ONLINE_INDEX_COMPLETE */
+ ONLINE_INDEX_ABORTED,
+ /** the online index creation was aborted, the index was
+ dropped from the data dictionary and the tablespace, and it
+ should be dropped from the data dictionary cache as soon as
+ index->table->n_ref_count reaches 0. */
+ ONLINE_INDEX_ABORTED_DROPPED
+};
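+
+/* Typical transitions between these states (a sketch inferred from the
+descriptions above, not part of the original header):
+
+	ONLINE_INDEX_COMPLETE -> ONLINE_INDEX_CREATION  (creation starts)
+	ONLINE_INDEX_CREATION -> ONLINE_INDEX_COMPLETE  (success)
+	ONLINE_INDEX_CREATION -> ONLINE_INDEX_ABORTED   (failure)
+	ONLINE_INDEX_ABORTED  -> ONLINE_INDEX_ABORTED_DROPPED
+	(once the index has been dropped from the data dictionary and the
+	tablespace, pending eviction from the dictionary cache) */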
+
+/** Set to store the virtual columns which are affected by Foreign
+key constraint. */
+typedef std::set<dict_v_col_t*, std::less<dict_v_col_t*>,
+ ut_allocator<dict_v_col_t*> > dict_vcol_set;
+
+/** Data structure for a foreign key constraint; an example:
+FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D). Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_foreign_create(). */
+struct dict_foreign_t{
+ mem_heap_t* heap; /*!< this object is allocated from
+ this memory heap */
+ char* id; /*!< id of the constraint as a
+ null-terminated string */
+ unsigned n_fields:10; /*!< number of indexes' first fields
+ for which the foreign key
+ constraint is defined: we allow the
+ indexes to contain more fields than
+ mentioned in the constraint, as long
+ as the first fields are as mentioned */
+	unsigned	type:6;		/*!< 0, or a combination of the
+					DICT_FOREIGN_ON_DELETE_* and
+					DICT_FOREIGN_ON_UPDATE_* flags */
+ char* foreign_table_name;/*!< foreign table name */
+ char* foreign_table_name_lookup;
+ /*!< foreign table name used for dict lookup */
+ dict_table_t* foreign_table; /*!< table where the foreign key is */
+ const char** foreign_col_names;/*!< names of the columns in the
+ foreign key */
+ char* referenced_table_name;/*!< referenced table name */
+ char* referenced_table_name_lookup;
+ /*!< referenced table name for dict lookup*/
+ dict_table_t* referenced_table;/*!< table where the referenced key
+ is */
+ const char** referenced_col_names;/*!< names of the referenced
+ columns in the referenced table */
+ dict_index_t* foreign_index; /*!< foreign index; we require that
+ both tables contain explicitly defined
+ indexes for the constraint: InnoDB
+ does not generate new indexes
+ implicitly */
+ dict_index_t* referenced_index;/*!< referenced index */
+
+ dict_vcol_set* v_cols; /*!< set of virtual columns affected
+ by foreign key constraint. */
+
+ /** Check whether the fulltext index gets affected by
+ foreign key constraint */
+ bool affects_fulltext() const;
+};
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_t& foreign);
+
+struct dict_foreign_print {
+
+ dict_foreign_print(std::ostream& out)
+ : m_out(out)
+ {}
+
+ void operator()(const dict_foreign_t* foreign) {
+ m_out << *foreign;
+ }
+private:
+ std::ostream& m_out;
+};
+
+/** Compare two dict_foreign_t objects using their ids. Used in the ordering
+of dict_table_t::foreign_set and dict_table_t::referenced_set. It returns
+true if the first argument is considered to go before the second in the
+strict weak ordering it defines, and false otherwise. */
+struct dict_foreign_compare {
+
+ bool operator()(
+ const dict_foreign_t* lhs,
+ const dict_foreign_t* rhs) const
+ {
+ return strcmp(lhs->id, rhs->id) < 0;
+ }
+};
+
+/** A function object to find a foreign key with the given index as the
+referenced index. Return the foreign key with matching criteria or NULL */
+struct dict_foreign_with_index {
+
+ dict_foreign_with_index(const dict_index_t* index)
+ : m_index(index)
+ {}
+
+ bool operator()(const dict_foreign_t* foreign) const
+ {
+ return(foreign->referenced_index == m_index);
+ }
+
+ const dict_index_t* m_index;
+};
+
+#ifdef WITH_WSREP
+/** A function object to find a foreign key with the given index as the
+foreign index. Return the foreign key with matching criteria or NULL */
+struct dict_foreign_with_foreign_index {
+
+ dict_foreign_with_foreign_index(const dict_index_t* index)
+ : m_index(index)
+ {}
+
+ bool operator()(const dict_foreign_t* foreign) const
+ {
+ return(foreign->foreign_index == m_index);
+ }
+
+ const dict_index_t* m_index;
+};
+#endif
+
+/** A function object to check if the foreign key constraint is between
+different tables. Returns true if the foreign key constraint is between
+different tables, false otherwise. */
+struct dict_foreign_different_tables {
+
+ bool operator()(const dict_foreign_t* foreign) const
+ {
+ return(foreign->foreign_table != foreign->referenced_table);
+ }
+};
+
+/** A function object to check if the foreign key constraint has the same
+name as given. If the full name of the foreign key constraint doesn't
+match, check whether it matches after stripping the database name from
+the foreign key constraint. Return true if it matches, false otherwise. */
+struct dict_foreign_matches_id {
+
+ dict_foreign_matches_id(const char* id)
+ : m_id(id)
+ {}
+
+ bool operator()(const dict_foreign_t* foreign) const
+ {
+ if (0 == innobase_strcasecmp(foreign->id, m_id)) {
+ return(true);
+ }
+ if (const char* pos = strchr(foreign->id, '/')) {
+ if (0 == innobase_strcasecmp(m_id, pos + 1)) {
+ return(true);
+ }
+ }
+ return(false);
+ }
+
+ const char* m_id;
+};
+
+typedef std::set<
+ dict_foreign_t*,
+ dict_foreign_compare,
+ ut_allocator<dict_foreign_t*> > dict_foreign_set;
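+
+/* Usage sketch (illustrative, not part of the original header): find a
+constraint in a table's referenced_set by the index that it references,
+combining the set with the dict_foreign_with_index function object
+defined above:
+
+	dict_foreign_set::const_iterator it = std::find_if(
+		table->referenced_set.begin(),
+		table->referenced_set.end(),
+		dict_foreign_with_index(index));
+
+	if (it != table->referenced_set.end()) {
+		// *it is a dict_foreign_t* whose referenced_index
+		// is the given index
+	}
+*/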
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_set& fk_set);
+
+/** Function object to check if a foreign key object is there
+in the given foreign key set or not. It returns true if the
+foreign key is not found, false otherwise */
+struct dict_foreign_not_exists {
+ dict_foreign_not_exists(const dict_foreign_set& obj_)
+ : m_foreigns(obj_)
+ {}
+
+ /* Return true if the given foreign key is not found */
+ bool operator()(dict_foreign_t* const & foreign) const {
+ return(m_foreigns.find(foreign) == m_foreigns.end());
+ }
+private:
+ const dict_foreign_set& m_foreigns;
+};
+
+/** Validate the search order in the foreign key set.
+@param[in] fk_set the foreign key set to be validated
+@return true if search order is fine in the set, false otherwise. */
+bool
+dict_foreign_set_validate(
+ const dict_foreign_set& fk_set);
+
+/** Validate the search order in the foreign key sets of the table
+(foreign_set and referenced_set).
+@param[in] table table whose foreign key sets are to be validated
+@return true if foreign key sets are fine, false otherwise. */
+bool
+dict_foreign_set_validate(
+ const dict_table_t& table);
+
+/*********************************************************************//**
+Frees a foreign key struct. */
+inline
+void
+dict_foreign_free(
+/*==============*/
+ dict_foreign_t* foreign) /*!< in, own: foreign key struct */
+{
+ if (foreign->v_cols != NULL) {
+ UT_DELETE(foreign->v_cols);
+ }
+
+ mem_heap_free(foreign->heap);
+}
+
+/** The destructor will free all the foreign key constraints in the set
+by calling dict_foreign_free() on each of the foreign key constraints.
+This is used to free the allocated memory when a local set goes out
+of scope. */
+struct dict_foreign_set_free {
+
+ dict_foreign_set_free(const dict_foreign_set& foreign_set)
+ : m_foreign_set(foreign_set)
+ {}
+
+ ~dict_foreign_set_free()
+ {
+ std::for_each(m_foreign_set.begin(),
+ m_foreign_set.end(),
+ dict_foreign_free);
+ }
+
+ const dict_foreign_set& m_foreign_set;
+};
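+
+/* Usage sketch (illustrative, not part of the original header): free
+the constraints of a local set automatically when it goes out of scope:
+
+	void example()
+	{
+		dict_foreign_set	local_fk_set;
+		dict_foreign_set_free	local_fk_set_free(local_fk_set);
+
+		// ... insert objects created by dict_mem_foreign_create()
+		// into local_fk_set ...
+
+	}	// ~dict_foreign_set_free() invokes dict_foreign_free()
+		// on every element of local_fk_set
+*/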
+
+/** The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that
+a foreign key constraint is enforced, therefore RESTRICT just means no flag */
+/* @{ */
+#define DICT_FOREIGN_ON_DELETE_CASCADE 1U /*!< ON DELETE CASCADE */
+#define DICT_FOREIGN_ON_DELETE_SET_NULL	2U	/*!< ON DELETE SET NULL */
+#define DICT_FOREIGN_ON_UPDATE_CASCADE	4U	/*!< ON UPDATE CASCADE */
+#define DICT_FOREIGN_ON_UPDATE_SET_NULL 8U /*!< ON UPDATE SET NULL */
+#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16U /*!< ON DELETE NO ACTION */
+#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32U /*!< ON UPDATE NO ACTION */
+/* @} */
+
+/** Display an identifier.
+@param[in,out] s output stream
+@param[in] id_name SQL identifier (other than table name)
+@return the output stream */
+std::ostream&
+operator<<(
+ std::ostream& s,
+ const id_name_t& id_name);
+
+/** Display a table name.
+@param[in,out] s output stream
+@param[in] table_name table name
+@return the output stream */
+std::ostream&
+operator<<(
+ std::ostream& s,
+ const table_name_t& table_name);
+
+/** List of locks that different transactions have acquired on a table. This
+list has a list node that is embedded in a nested union/structure. We have to
+generate a specific template for it. */
+
+typedef ut_list_base<lock_t, ut_list_node<lock_t> lock_table_t::*>
+ table_lock_list_t;
+
+/** mysql template structure defined in row0mysql.cc */
+struct mysql_row_templ_t;
+
+/** Structure defines template related to virtual columns and
+their base columns */
+struct dict_vcol_templ_t {
+ /** number of regular columns */
+ ulint n_col;
+
+ /** number of virtual columns */
+ ulint n_v_col;
+
+ /** array of templates for virtual col and their base columns */
+ mysql_row_templ_t** vtempl;
+
+ /** table's database name */
+ std::string db_name;
+
+ /** table name */
+ std::string tb_name;
+
+ /** MySQL record length */
+ ulint rec_len;
+
+ /** default column value if any */
+ byte* default_rec;
+
+ /** cached MySQL TABLE object */
+ TABLE* mysql_table;
+
+ /** when mysql_table was cached */
+ uint64_t mysql_table_query_id;
+
+ dict_vcol_templ_t() : vtempl(0), mysql_table_query_id(~0ULL) {}
+};
+
+/** Metadata on clustered index fields starting from first_user_field() */
+class field_map_element_t
+{
+ /** Number of bits for representing a column number */
+ static constexpr uint16_t IND_BITS = 10;
+
+ /** Set if the column of the field has been instantly dropped */
+ static constexpr uint16_t DROPPED = 1U << (IND_BITS + 5);
+
+ /** Set if the column was dropped and originally declared NOT NULL */
+ static constexpr uint16_t NOT_NULL = 1U << (IND_BITS + 4);
+
+ /** Column index (if !(data & DROPPED)): table->cols[data & IND],
+ or field length (if (data & DROPPED)):
+ (data & IND) = 0 if variable-length with max_len < 256 bytes;
+ (data & IND) = 1 if variable-length with max_len > 255 bytes;
+ (data & IND) = 1 + L otherwise, with L=fixed length of the column */
+ static constexpr uint16_t IND = (1U << IND_BITS) - 1;
+
+ /** Field metadata */
+ uint16_t data;
+
+ void clear_not_null() { data &= uint16_t(~NOT_NULL); }
+public:
+ bool is_dropped() const { return data & DROPPED; }
+ void set_dropped() { data |= DROPPED; }
+ bool is_not_null() const { return data & NOT_NULL; }
+ void set_not_null() { ut_ad(is_dropped()); data |= NOT_NULL; }
+ uint16_t ind() const { return data & IND; }
+ void set_ind(uint16_t i)
+ {
+ DBUG_ASSERT(i <= IND);
+ DBUG_ASSERT(!ind());
+ data |= i;
+ }
+ field_map_element_t& operator= (uint16_t value)
+ {
+ data = value;
+ return *this;
+ }
+ operator uint16_t() { return data; }
+};
+
+static_assert(sizeof(field_map_element_t) == 2,
+ "Size mismatch for a persistent data item!");
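+
+/* Worked example (sketch, not part of the original header): encoding
+the metadata of an instantly dropped NOT NULL column of fixed length 4
+(e.g. INT): (data & IND) = 1 + 4 = 5, plus the DROPPED and NOT_NULL
+bits:
+
+	field_map_element_t e;
+	e = 0;			// clear the element
+	e.set_dropped();	// the column was instantly dropped
+	e.set_not_null();	// it was declared NOT NULL
+	e.set_ind(1 + 4);	// fixed length 4 -> (data & IND) == 5
+	ut_ad(e.is_dropped() && e.is_not_null() && e.ind() == 5);
+*/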
+
+/** Instantly dropped or reordered columns */
+struct dict_instant_t
+{
+ /** Number of dropped columns */
+ unsigned n_dropped;
+ /** Dropped columns */
+ dict_col_t* dropped;
+ /** Map of clustered index non-PK fields[i - first_user_field()]
+ to table columns */
+ field_map_element_t* field_map;
+};
+
+/** These are used when the MySQL FRM and the InnoDB data dictionary are
+in an inconsistent state. */
+typedef enum {
+	DICT_FRM_CONSISTENT = 0,	/*!< Consistent state */
+	DICT_FRM_NO_PK = 1,		/*!< MySQL has no primary key
+					but the InnoDB dictionary has
+					a non-generated one. */
+	DICT_NO_PK_FRM_HAS = 2,		/*!< MySQL has a primary key but
+					the InnoDB dictionary does not. */
+ DICT_FRM_INCONSISTENT_KEYS = 3 /*!< Key count mismatch */
+} dict_frm_t;
+
+/** Data structure for a database table. Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_table_create(). */
+struct dict_table_t {
+
+ /** Get reference count.
+ @return current value of n_ref_count */
+ inline uint32_t get_ref_count() const { return n_ref_count; }
+
+ /** Acquire the table handle. */
+ inline void acquire();
+
+ /** Release the table handle.
+ @return whether the last handle was released */
+ inline bool release();
+
+ /** @return whether the table supports transactions */
+ bool no_rollback() const
+ {
+ return !(~unsigned(flags) & DICT_TF_MASK_NO_ROLLBACK);
+ }
+ /** @return whether this is a temporary table */
+ bool is_temporary() const
+ {
+ return flags2 & DICT_TF2_TEMPORARY;
+ }
+
+ /** @return whether the table is not in ROW_FORMAT=REDUNDANT */
+ bool not_redundant() const { return flags & DICT_TF_COMPACT; }
+
+ /** @return whether this table is readable
+ @retval true normally
+ @retval false if this is a single-table tablespace
+ and the .ibd file is missing, or a
+ page cannot be read or decrypted */
+ bool is_readable() const
+ {
+ ut_ad(file_unreadable || space);
+ return(UNIV_LIKELY(!file_unreadable));
+ }
+
+ /** @return whether the table is accessible */
+ bool is_accessible() const
+ {
+ return UNIV_LIKELY(is_readable() && !corrupted && space)
+ && !space->is_stopping();
+ }
+
+ /** Check if a table name contains the string "/#sql"
+ which denotes temporary or intermediate tables in MariaDB. */
+ static bool is_temporary_name(const char* name)
+ {
+ return strstr(name, "/" TEMP_FILE_PREFIX) != NULL;
+ }
+
+ /** @return whether instant ALTER TABLE is in effect */
+ bool is_instant() const
+ {
+ return(UT_LIST_GET_FIRST(indexes)->is_instant());
+ }
+
+ /** @return whether the table supports instant ALTER TABLE */
+ bool supports_instant() const
+ {
+ return(!(flags & DICT_TF_MASK_ZIP_SSIZE));
+ }
+
+ /** @return the number of instantly dropped columns */
+ unsigned n_dropped() const { return instant ? instant->n_dropped : 0; }
+
+ /** Look up an old column.
+ @param[in] cols the old columns of the table
+ @param[in] col_map map from old table columns to altered ones
+ @param[in] n_cols number of old columns
+ @param[in] i the number of the new column
+ @return old column
+ @retval NULL if column i was added to the table */
+ static const dict_col_t* find(const dict_col_t* cols,
+ const ulint* col_map, ulint n_cols,
+ ulint i)
+ {
+ for (ulint o = n_cols; o--; ) {
+ if (col_map[o] == i) {
+ return &cols[o];
+ }
+ }
+ return NULL;
+ }
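+
+	/* Usage sketch (illustrative, not part of the original header):
+	look up which old column corresponds to new column i after
+	ALTER TABLE:
+
+		const dict_col_t* old_col = dict_table_t::find(
+			old_table->cols, col_map, old_table->n_cols, i);
+		if (!old_col) {
+			// column i was added by the ALTER operation
+		}
+	*/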
+
+ /** Serialise metadata of dropped or reordered columns.
+ @param[in,out] heap memory heap for allocation
+ @param[out] field data field with the metadata */
+ inline void serialise_columns(mem_heap_t* heap, dfield_t* field) const;
+
+ /** Reconstruct dropped or reordered columns.
+ @param[in] metadata data from serialise_columns()
+ @param[in] len length of the metadata, in bytes
+ @return whether parsing the metadata failed */
+ bool deserialise_columns(const byte* metadata, ulint len);
+
+ /** Set is_instant() before instant_column().
+ @param[in] old previous table definition
+ @param[in] col_map map from old.cols[]
+ and old.v_cols[] to this
+ @param[out] first_alter_pos 0, or
+ 1 + first changed column position */
+ inline void prepare_instant(const dict_table_t& old,
+ const ulint* col_map,
+ unsigned& first_alter_pos);
+
+ /** Adjust table metadata for instant ADD/DROP/reorder COLUMN.
+ @param[in] table table on which prepare_instant() was invoked
+ @param[in] col_map mapping from cols[] and v_cols[] to table
+ @return whether the metadata record must be updated */
+ inline bool instant_column(const dict_table_t& table,
+ const ulint* col_map);
+
+ /** Roll back instant_column().
+ @param[in] old_n_cols original n_cols
+ @param[in] old_cols original cols
+ @param[in] old_col_names original col_names
+ @param[in] old_instant original instant structure
+ @param[in] old_fields original fields
+ @param[in] old_n_fields original number of fields
+ @param[in] old_n_core_fields original number of core fields
+ @param[in] old_n_v_cols original n_v_cols
+ @param[in] old_v_cols original v_cols
+ @param[in] old_v_col_names original v_col_names
+ @param[in] col_map column map */
+ inline void rollback_instant(
+ unsigned old_n_cols,
+ dict_col_t* old_cols,
+ const char* old_col_names,
+ dict_instant_t* old_instant,
+ dict_field_t* old_fields,
+ unsigned old_n_fields,
+ unsigned old_n_core_fields,
+ unsigned old_n_v_cols,
+ dict_v_col_t* old_v_cols,
+ const char* old_v_col_names,
+ const ulint* col_map);
+
+ /** Add the table definition to the data dictionary cache */
+ void add_to_cache();
+
+ /** @return whether the table is versioned.
+	It is assumed that both vers_start and vers_end are set to 0
+	iff the table is not versioned. In any other case,
+	these fields correspond to actual positions in cols[]. */
+ bool versioned() const { return vers_start || vers_end; }
+ bool versioned_by_id() const
+ {
+ return versioned() && cols[vers_start].mtype == DATA_INT;
+ }
+
+ void inc_fk_checks()
+ {
+#ifdef UNIV_DEBUG
+ int32_t fk_checks=
+#endif
+ n_foreign_key_checks_running++;
+ ut_ad(fk_checks >= 0);
+ }
+ void dec_fk_checks()
+ {
+#ifdef UNIV_DEBUG
+ int32_t fk_checks=
+#endif
+ n_foreign_key_checks_running--;
+ ut_ad(fk_checks > 0);
+ }
+
+	/** For overflow (externally stored) fields, returns the
+	potential maximum length stored inline */
+ inline size_t get_overflow_field_local_len() const;
+
+ /** Parse the table file name into table name and database name.
+ @tparam dict_locked whether dict_sys.mutex is being held
+ @param[in,out] db_name database name buffer
+ @param[in,out] tbl_name table name buffer
+ @param[out] db_name_len database name length
+ @param[out] tbl_name_len table name length
+ @return whether the table name is visible to SQL */
+ template<bool dict_locked= false>
+ bool parse_name(char (&db_name)[NAME_LEN + 1],
+ char (&tbl_name)[NAME_LEN + 1],
+ size_t *db_name_len, size_t *tbl_name_len) const;
+
+private:
+ /** Initialize instant->field_map.
+ @param[in] table table definition to copy from */
+ inline void init_instant(const dict_table_t& table);
+public:
+ /** Id of the table. */
+ table_id_t id;
+ /** Hash chain node. */
+ hash_node_t id_hash;
+ /** Table name. */
+ table_name_t name;
+ /** Hash chain node. */
+ hash_node_t name_hash;
+
+ /** Memory heap */
+ mem_heap_t* heap;
+
+ /** NULL or the directory path specified by DATA DIRECTORY. */
+ char* data_dir_path;
+
+ /** The tablespace of the table */
+ fil_space_t* space;
+ /** Tablespace ID */
+ ulint space_id;
+
+ /** Stores information about:
+ 1 row format (redundant or compact),
+ 2 compressed page size (zip shift size),
+ 3 whether using atomic blobs,
+ 4 whether the table has been created with the option DATA DIRECTORY.
+ Use DICT_TF_GET_COMPACT(), DICT_TF_GET_ZIP_SSIZE(),
+ DICT_TF_HAS_ATOMIC_BLOBS() and DICT_TF_HAS_DATA_DIR() to parse this
+ flag. */
+ unsigned flags:DICT_TF_BITS;
+
+ /** Stores information about:
+ 1 whether the table has been created using CREATE TEMPORARY TABLE,
+ 2 whether the table has an internally defined DOC ID column,
+ 3 whether the table has a FTS index,
+	4 whether the DOC ID column needs to be added to the FTS index,
+	5 whether the table is being created in its own tablespace,
+	6 whether the table has been DISCARDed,
+	7 whether the aux FTS table names are in hex.
+ Use DICT_TF2_FLAG_IS_SET() to parse this flag. */
+ unsigned flags2:DICT_TF2_BITS;
+
+	/** TRUE if the table is an intermediate table during a copy alter
+	operation or a partition/subpartition which is required for copying
+	data, in which case the undo log is skipped when inserting rows
+	into the table. This variable is set and unset during extra(), or
+	during the process of altering partitions */
+ unsigned skip_alter_undo:1;
+
+	/** whether this is in a single-table tablespace and the .ibd
+	file is missing, or page decryption failed and the page is
+	corrupted */
+	unsigned				file_unreadable:1;
+
+ /** TRUE if the table object has been added to the dictionary cache. */
+ unsigned cached:1;
+
+	/** TRUE if the table is to be dropped, but not yet actually dropped
+	(it could be in the background drop list). It is turned on at the
+	beginning of row_drop_table_for_mysql() and turned off just before
+	we start to update system tables for the drop. It is protected by
+	dict_sys.latch. */
+ unsigned to_be_dropped:1;
+
+ /** Number of non-virtual columns defined so far. */
+ unsigned n_def:10;
+
+ /** Number of non-virtual columns. */
+ unsigned n_cols:10;
+
+	/** Number of total columns (including virtual and non-virtual). */
+ unsigned n_t_cols:10;
+
+ /** Number of total columns defined so far. */
+ unsigned n_t_def:10;
+
+ /** Number of virtual columns defined so far. */
+ unsigned n_v_def:10;
+
+ /** Number of virtual columns. */
+ unsigned n_v_cols:10;
+
+ /** 1 + the position of autoinc counter field in clustered
+ index, or 0 if there is no persistent AUTO_INCREMENT column in
+ the table. */
+ unsigned persistent_autoinc:10;
+
+	/** TRUE if the table is neither an InnoDB system table nor a
+	table with FK relationships, and can therefore be evicted from
+	the dictionary cache. */
+ unsigned can_be_evicted:1;
+
+ /** TRUE if table is corrupted. */
+ unsigned corrupted:1;
+
+ /** TRUE if some indexes should be dropped after ONLINE_INDEX_ABORTED
+ or ONLINE_INDEX_ABORTED_DROPPED. */
+ unsigned drop_aborted:1;
+
+ /** Array of column descriptions. */
+ dict_col_t* cols;
+
+ /** Array of virtual column descriptions. */
+ dict_v_col_t* v_cols;
+
+	/** List of stored column descriptions. It is used only for foreign
+	key checks during CREATE TABLE and copy ALTER operations.
+	During a copy ALTER, the s_cols list is filled during the CREATE
+	TABLE operation and needs to be preserved until the rename table
+	operation; that is why s_cols is part of dict_table_t. */
+ dict_s_col_list* s_cols;
+
+ /** Instantly dropped or reordered columns, or NULL if none */
+ dict_instant_t* instant;
+
+ /** Column names packed in a character string
+ "name1\0name2\0...nameN\0". Until the string contains n_cols, it will
+ be allocated from a temporary heap. The final string will be allocated
+ from table->heap. */
+ const char* col_names;
+
+ /** Virtual column names */
+ const char* v_col_names;
+ unsigned vers_start:10;
+ /*!< System Versioning: row start col index */
+ unsigned vers_end:10;
+ /*!< System Versioning: row end col index */
+ bool is_system_db;
+ /*!< True if the table belongs to a system
+ database (mysql, information_schema or
+ performance_schema) */
+ dict_frm_t dict_frm_mismatch;
+			/*!< not DICT_FRM_CONSISTENT (== 0) if the
+			data dictionary information and the
+			MySQL FRM information mismatch. */
+ /** The FTS_DOC_ID_INDEX, or NULL if no fulltext indexes exist */
+ dict_index_t* fts_doc_id_index;
+
+ /** List of indexes of the table. */
+ UT_LIST_BASE_NODE_T(dict_index_t) indexes;
+#ifdef BTR_CUR_HASH_ADAPT
+ /** List of detached indexes that are waiting to be freed along with
+ the last adaptive hash index entry */
+ UT_LIST_BASE_NODE_T(dict_index_t) freed_indexes;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /** List of foreign key constraints in the table. These refer to
+ columns in other tables. */
+ UT_LIST_BASE_NODE_T(dict_foreign_t) foreign_list;
+
+ /** List of foreign key constraints which refer to this table. */
+ UT_LIST_BASE_NODE_T(dict_foreign_t) referenced_list;
+
+ /** Node of the LRU list of tables. */
+ UT_LIST_NODE_T(dict_table_t) table_LRU;
+
+	/** Maximum recursion level we support when loading tables chained
+	together with FK constraints. If this level is exceeded, we stop
+	loading child tables into memory along with their parent table. */
+ unsigned fk_max_recusive_level:8;
+
+ /** Count of how many foreign key check operations are currently being
+ performed on the table. We cannot drop the table while there are
+ foreign key checks running on it. */
+ Atomic_counter<int32_t> n_foreign_key_checks_running;
+
+ /** Transactions whose view low limit is greater than this number are
+ not allowed to store to the MySQL query cache or retrieve from it.
+ When a trx with undo logs commits, it sets this to the value of the
+ transaction id. */
+ trx_id_t query_cache_inv_trx_id;
+
+ /** Transaction id that last touched the table definition. Either when
+ loading the definition or CREATE TABLE, or ALTER TABLE (prepare,
+ commit, and rollback phases). */
+ trx_id_t def_trx_id;
+
+ /*!< set of foreign key constraints in the table; these refer to
+ columns in other tables */
+ dict_foreign_set foreign_set;
+
+ /*!< set of foreign key constraints which refer to this table */
+ dict_foreign_set referenced_set;
+
+ /** Statistics for query optimization. Mostly protected by
+ dict_sys.mutex. @{ */
+
+ /** TRUE if statistics have been calculated the first time after
+ database startup or table creation. */
+ unsigned stat_initialized:1;
+
+ /** Timestamp of last recalc of the stats. */
+ time_t stats_last_recalc;
+
+ /** The two bits below are set in the 'stat_persistent' member. They
+ have the following meaning:
+ 1. _ON=0, _OFF=0, no explicit persistent stats setting for this table,
+ the value of the global srv_stats_persistent is used to determine
+ whether the table has persistent stats enabled or not
+ 2. _ON=0, _OFF=1, persistent stats are explicitly disabled for this
+ table, regardless of the value of the global srv_stats_persistent
+ 3. _ON=1, _OFF=0, persistent stats are explicitly enabled for this
+ table, regardless of the value of the global srv_stats_persistent
+ 4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
+ #define DICT_STATS_PERSISTENT_ON (1 << 1)
+ #define DICT_STATS_PERSISTENT_OFF (1 << 2)
+
+ /** Indicates whether the table uses persistent stats or not. See
+ DICT_STATS_PERSISTENT_ON and DICT_STATS_PERSISTENT_OFF. */
+ ib_uint32_t stat_persistent;
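+
+	/* Interpretation sketch (not part of the original header): the
+	effective setting could be derived from these bits as follows:
+
+		if (stat_persistent & DICT_STATS_PERSISTENT_ON) {
+			// persistent stats explicitly enabled
+		} else if (stat_persistent & DICT_STATS_PERSISTENT_OFF) {
+			// persistent stats explicitly disabled
+		} else {
+			// fall back to the global srv_stats_persistent
+		}
+	*/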
+
+ /** The two bits below are set in the 'stats_auto_recalc' member. They
+ have the following meaning:
+ 1. _ON=0, _OFF=0, no explicit auto recalc setting for this table, the
+ value of the global srv_stats_persistent_auto_recalc is used to
+ determine whether the table has auto recalc enabled or not
+ 2. _ON=0, _OFF=1, auto recalc is explicitly disabled for this table,
+ regardless of the value of the global srv_stats_persistent_auto_recalc
+ 3. _ON=1, _OFF=0, auto recalc is explicitly enabled for this table,
+ regardless of the value of the global srv_stats_persistent_auto_recalc
+ 4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
+ #define DICT_STATS_AUTO_RECALC_ON (1 << 1)
+ #define DICT_STATS_AUTO_RECALC_OFF (1 << 2)
+
+ /** Indicates whether the table uses automatic recalc for persistent
+ stats or not. See DICT_STATS_AUTO_RECALC_ON and
+ DICT_STATS_AUTO_RECALC_OFF. */
+ ib_uint32_t stats_auto_recalc;
+
+ /** The number of pages to sample for this table during persistent
+ stats estimation. If this is 0, then the value of the global
+ srv_stats_persistent_sample_pages will be used instead. */
+ ulint stats_sample_pages;
+
+ /** Approximate number of rows in the table. We periodically calculate
+ new estimates. */
+ ib_uint64_t stat_n_rows;
+
+ /** Approximate clustered index size in database pages. */
+ ulint stat_clustered_index_size;
+
+ /** Approximate size of other indexes in database pages. */
+ ulint stat_sum_of_other_index_sizes;
+
+ /** How many rows are modified since last stats recalc. When a row is
+ inserted, updated, or deleted, we add 1 to this number; we calculate
+ new estimates for the table and the indexes if the table has changed
+ too much, see dict_stats_update_if_needed(). The counter is reset
+ to zero at statistics calculation. This counter is not protected by
+ any latch, because this is only used for heuristics. */
+ ib_uint64_t stat_modified_counter;
+
+ /** Background stats thread is not working on this table. */
+ #define BG_STAT_NONE 0
+
+ /** Set in 'stats_bg_flag' when the background stats code is working
+ on this table. The DROP TABLE code waits for this to be cleared before
+ proceeding. */
+ #define BG_STAT_IN_PROGRESS (1 << 0)
+
+ /** Set in 'stats_bg_flag' when DROP TABLE starts waiting on
+ BG_STAT_IN_PROGRESS to be cleared. The background stats thread will
+ detect this and will eventually quit sooner. */
+ #define BG_STAT_SHOULD_QUIT (1 << 1)
+
+ /** The state of the background stats thread wrt this table.
+ See BG_STAT_NONE, BG_STAT_IN_PROGRESS and BG_STAT_SHOULD_QUIT.
+ Writes are covered by dict_sys.mutex. Dirty reads are possible. */
+ byte stats_bg_flag;
+
+	bool		stats_error_printed;
+			/*!< whether a persistent stats error has
+			already been printed for this table */
+ /* @} */
+
+ /** AUTOINC related members. @{ */
+
+	/* The actual collection of tables locked during AUTOINC read/write is
+	kept in trx_t. In order to quickly determine whether a transaction
+	holds the AUTOINC lock we keep a pointer to the transaction here in
+	the 'autoinc_trx' member. This is to avoid acquiring the
+	lock_sys_t::mutex and scanning the vector in trx_t.
+	When an AUTOINC lock has to wait, the corresponding lock instance is
+	created on the trx lock heap rather than using the pre-allocated
+	instance in autoinc_lock below. */
+
+ /** A buffer for an AUTOINC lock for this table. We allocate the
+ memory here so that individual transactions can get it and release it
+ without a need to allocate space from the lock heap of the trx:
+ otherwise the lock heap would grow rapidly if we do a large insert
+ from a select. */
+ lock_t* autoinc_lock;
+
+ /** Mutex protecting the autoincrement counter. */
+ std::mutex autoinc_mutex;
+
+ /** Autoinc counter value to give to the next inserted row. */
+ ib_uint64_t autoinc;
+
+ /** This counter is used to track the number of granted and pending
+ autoinc locks on this table. This value is set after acquiring the
+ lock_sys_t::mutex but we peek the contents to determine whether other
+ transactions have acquired the AUTOINC lock or not. Of course only one
+ transaction can be granted the lock but there can be multiple
+ waiters. */
+ ulong n_waiting_or_granted_auto_inc_locks;
+
+	/** The transaction that currently holds the AUTOINC lock on this
+	table. Protected by lock_sys.mutex. */
+ const trx_t* autoinc_trx;
+
+ /* @} */
+
+ /** FTS specific state variables. */
+ fts_t* fts;
+
+	/** Quiescing states, protected by dict_index_t::lock, i.e. we can
+	only change the state if we acquire all of this table's index
+	latches (dict_index_t::lock) in X mode. */
+ ib_quiesce_t quiesce;
+
+ /** Count of the number of record locks on this table. We use this to
+ determine whether we can evict the table from the dictionary cache.
+ It is protected by lock_sys.mutex. */
+ ulint n_rec_locks;
+
+private:
+ /** Count of how many handles are opened to this table. Dropping of the
+ table is NOT allowed until this count gets to zero. MySQL does NOT
+ itself check the number of open handles at DROP. */
+ Atomic_counter<uint32_t> n_ref_count;
+
+public:
+ /** List of locks on the table. Protected by lock_sys.mutex. */
+ table_lock_list_t locks;
+
+ /** Timestamp of the last modification of this table. */
+ time_t update_time;
+
+#ifdef UNIV_DEBUG
+ /** Value of 'magic_n'. */
+ #define DICT_TABLE_MAGIC_N 76333786
+
+ /** Magic number. */
+ ulint magic_n;
+#endif /* UNIV_DEBUG */
+	/** mysql_row_templ_t for base columns used to compute the virtual
+	columns */
+ dict_vcol_templ_t* vc_templ;
+
+	/** @return whether the table has a lock held by any transaction
+	other than the given one */
+ bool has_lock_other_than(const trx_t *trx) const
+ {
+ for (lock_t *lock= UT_LIST_GET_FIRST(locks); lock;
+ lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+ if (lock->trx != trx)
+ return true;
+ return false;
+ }
+
+	/** Check whether the table name is the same as
+	mysql/innodb_table_stats or mysql/innodb_index_stats.
+	@return true if the table name is the same as a stats table name */
+ bool is_stats_table() const;
+};
+
+inline void dict_index_t::set_modified(mtr_t& mtr) const
+{
+ mtr.set_named_space(table->space);
+}
+
+inline bool table_name_t::is_temporary() const
+{
+ return dict_table_t::is_temporary_name(m_name);
+}
+
+inline bool dict_index_t::is_readable() const { return table->is_readable(); }
+
+inline bool dict_index_t::is_instant() const
+{
+ ut_ad(n_core_fields > 0);
+ ut_ad(n_core_fields <= n_fields || table->n_dropped());
+ ut_ad(n_core_fields == n_fields
+ || (type & ~(DICT_UNIQUE | DICT_CORRUPT)) == DICT_CLUSTERED);
+ ut_ad(n_core_fields == n_fields || table->supports_instant());
+ ut_ad(n_core_fields == n_fields || !table->is_temporary());
+ ut_ad(!table->instant || !table->is_temporary());
+
+ return n_core_fields != n_fields
+ || (is_primary() && table->instant);
+}
+
+inline bool dict_index_t::is_corrupted() const
+{
+ return UNIV_UNLIKELY(online_status >= ONLINE_INDEX_ABORTED
+ || (type & DICT_CORRUPT)
+ || (table && table->corrupted));
+}
+
+inline void dict_index_t::clear_instant_add()
+{
+ DBUG_ASSERT(is_primary());
+ DBUG_ASSERT(is_instant());
+ DBUG_ASSERT(!table->instant);
+ for (unsigned i= n_core_fields; i < n_fields; i++)
+ fields[i].col->clear_instant();
+ n_core_fields= n_fields;
+ n_core_null_bytes= static_cast<byte>
+ (UT_BITS_IN_BYTES(static_cast<unsigned>(n_nullable)));
+}
+
+inline void dict_index_t::clear_instant_alter()
+{
+ DBUG_ASSERT(is_primary());
+ DBUG_ASSERT(n_fields == n_def);
+
+ if (!table->instant) {
+ if (is_instant()) {
+ clear_instant_add();
+ }
+ return;
+ }
+
+#ifndef DBUG_OFF
+ for (unsigned i = first_user_field(); i--; ) {
+ DBUG_ASSERT(!fields[i].col->is_dropped());
+ DBUG_ASSERT(!fields[i].col->is_nullable());
+ }
+#endif
+ const dict_col_t* ai_col = table->persistent_autoinc
+ ? fields[table->persistent_autoinc - 1].col
+ : NULL;
+ dict_field_t* const begin = &fields[first_user_field()];
+ dict_field_t* end = &fields[n_fields];
+
+ for (dict_field_t* d = begin; d < end; ) {
+ /* Move fields for dropped columns to the end. */
+ if (!d->col->is_dropped()) {
+ d++;
+ } else {
+ if (d->col->is_nullable()) {
+ n_nullable--;
+ }
+
+ std::swap(*d, *--end);
+ }
+ }
+
+ DBUG_ASSERT(&fields[n_fields - table->n_dropped()] == end);
+ n_core_fields = n_fields = n_def
+ = static_cast<unsigned>(end - fields) & MAX_N_FIELDS;
+ n_core_null_bytes = static_cast<byte>(UT_BITS_IN_BYTES(n_nullable));
+ std::sort(begin, end, [](const dict_field_t& a, const dict_field_t& b)
+ { return a.col->ind < b.col->ind; });
+ table->instant = NULL;
+ if (ai_col) {
+ auto a = std::find_if(begin, end,
+ [ai_col](const dict_field_t& f)
+ { return f.col == ai_col; });
+ table->persistent_autoinc = (a == end)
+ ? 0
+ : (1 + static_cast<unsigned>(a - fields))
+ & MAX_N_FIELDS;
+ }
+}
+
+/** @return whether the column was instantly dropped
+@param[in] index the clustered index */
+inline bool dict_col_t::is_dropped(const dict_index_t& index) const
+{
+ DBUG_ASSERT(index.is_primary());
+ DBUG_ASSERT(!is_dropped() == !index.table->instant);
+ DBUG_ASSERT(!is_dropped() || (this >= index.table->instant->dropped
+ && this < index.table->instant->dropped
+ + index.table->instant->n_dropped));
+ return is_dropped();
+}
+
+/*******************************************************************//**
+Initialise the table lock list. */
+void
+lock_table_lock_list_init(
+/*======================*/
+ table_lock_list_t* locks); /*!< List to initialise */
+
+/** A function object to add the foreign key constraint to the referenced set
+of the referenced table, if it exists in the dictionary cache. */
+struct dict_foreign_add_to_referenced_table {
+ void operator()(dict_foreign_t* foreign) const
+ {
+ if (dict_table_t* table = foreign->referenced_table) {
+ std::pair<dict_foreign_set::iterator, bool> ret
+ = table->referenced_set.insert(foreign);
+ ut_a(ret.second);
+ }
+ }
+};
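+
+/* Usage sketch (illustrative, not part of the original header): after
+loading a table's foreign key constraints, register each of them with
+the tables they reference:
+
+	std::for_each(table->foreign_set.begin(),
+		      table->foreign_set.end(),
+		      dict_foreign_add_to_referenced_table());
+*/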
+
+/** Check whether the column is used in a spatial index or a regular index.
+@param[in] col column to check
+@return spatial status */
+inline
+spatial_status_t
+dict_col_get_spatial_status(
+ const dict_col_t* col)
+{
+ spatial_status_t spatial_status = SPATIAL_NONE;
+
+ /* Column is not a part of any index. */
+ if (!col->ord_part) {
+ return(spatial_status);
+ }
+
+ if (DATA_GEOMETRY_MTYPE(col->mtype)) {
+ if (col->max_prefix == 0) {
+ spatial_status = SPATIAL_ONLY;
+ } else {
+ /* Any regular index on a geometry column
+ should have a prefix. */
+ spatial_status = SPATIAL_MIXED;
+ }
+ }
+
+ return(spatial_status);
+}
+
+/** Clear defragmentation summary. */
+inline void dict_stats_empty_defrag_summary(dict_index_t* index)
+{
+ index->stat_defrag_n_pages_freed = 0;
+}
+
+/** Clear defragmentation related index stats. */
+inline void dict_stats_empty_defrag_stats(dict_index_t* index)
+{
+ index->stat_defrag_modified_counter = 0;
+ index->stat_defrag_n_page_split = 0;
+}
+
+#include "dict0mem.ic"
+
+#endif /* dict0mem_h */
diff --git a/storage/innobase/include/dict0mem.ic b/storage/innobase/include/dict0mem.ic
new file mode 100644
index 00000000..0a554a54
--- /dev/null
+++ b/storage/innobase/include/dict0mem.ic
@@ -0,0 +1,73 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0mem.ic
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "data0type.h"
+#include "dict0mem.h"
+#include "fil0fil.h"
+
+/**********************************************************************//**
+This function populates a dict_index_t index memory structure with
+supplied information. */
+UNIV_INLINE
+void
+dict_mem_fill_index_struct(
+/*=======================*/
+ dict_index_t* index, /*!< out: index to be filled */
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* index_name, /*!< in: index name */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields) /*!< in: number of fields */
+{
+
+ if (heap) {
+ index->heap = heap;
+ index->name = mem_heap_strdup(heap, index_name);
+		/* The '1 +' below prevents allocation
+		of an empty mem block. */
+		index->fields = (dict_field_t*) mem_heap_alloc(
+			heap, 1 + n_fields * sizeof(dict_field_t));
+ } else {
+ index->name = index_name;
+ index->heap = NULL;
+ index->fields = NULL;
+ }
+
+ index->type = type & ((1U << DICT_IT_BITS) - 1);
+ index->page = FIL_NULL;
+ index->merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
+ index->n_fields = static_cast<unsigned>(n_fields)
+ & index->MAX_N_FIELDS;
+ index->n_core_fields = static_cast<unsigned>(n_fields)
+ & index->MAX_N_FIELDS;
+ index->nulls_equal = false;
+#ifdef BTR_CUR_HASH_ADAPT
+#ifdef MYSQL_INDEX_DISABLE_AHI
+ index->disable_ahi = false;
+#endif
+#endif /* BTR_CUR_HASH_ADAPT */
+ ut_d(index->magic_n = DICT_INDEX_MAGIC_N);
+}
diff --git a/storage/innobase/include/dict0pagecompress.h b/storage/innobase/include/dict0pagecompress.h
new file mode 100644
index 00000000..dfa6f2a2
--- /dev/null
+++ b/storage/innobase/include/dict0pagecompress.h
@@ -0,0 +1,61 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0pagecompress.h
+Helper functions for extracting/storing page compression information
+to dictionary.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+#ifndef dict0pagecompress_h
+#define dict0pagecompress_h
+
+/********************************************************************//**
+Extract the page compression level from table flags.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_page_compression_level(
+/*===============================*/
+ ulint flags) /*!< in: flags */
+ __attribute__((const));
+/********************************************************************//**
+Extract the page compression flag from table flags
+@return page compression flag, or false if not compressed */
+UNIV_INLINE
+ibool
+dict_tf_get_page_compression(
+/*==========================*/
+ ulint flags) /*!< in: flags */
+ __attribute__((const));
+
+/********************************************************************//**
+Check whether the table uses the page compressed page format.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_page_compression_level(
+/*==============================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((const));
+
+#include "dict0pagecompress.ic"
+
+#endif
diff --git a/storage/innobase/include/dict0pagecompress.ic b/storage/innobase/include/dict0pagecompress.ic
new file mode 100644
index 00000000..c959f9ca
--- /dev/null
+++ b/storage/innobase/include/dict0pagecompress.ic
@@ -0,0 +1,81 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0pagecompress.ic
+Inline implementation for helper functions for extracting/storing
+page compression and atomic writes information to dictionary.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/********************************************************************//**
+Extract the page compression level from dict_table_t::flags.
+These flags are in memory, so assert that they are valid.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_page_compression_level(
+/*===============================*/
+ ulint flags) /*!< in: flags */
+{
+ ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags);
+
+ ut_ad(page_compression_level <= 9);
+
+ return(page_compression_level);
+}
+
+/********************************************************************//**
+Check whether the table uses the page compressed page format.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_page_compression_level(
+/*==============================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(dict_tf_get_page_compression(table->flags));
+
+ return(dict_tf_get_page_compression_level(table->flags));
+}
+
+/********************************************************************//**
+Check whether the table flags indicate the page compressed page format.
+@return true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_tf_get_page_compression(
+/*=========================*/
+ ulint flags) /*!< in: flags */
+{
+ return(DICT_TF_GET_PAGE_COMPRESSION(flags));
+}
+
+/********************************************************************//**
+Check whether the table uses the page compressed page format.
+@return true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_table_is_page_compressed(
+/*==========================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ return (dict_tf_get_page_compression(table->flags));
+}
diff --git a/storage/innobase/include/dict0priv.h b/storage/innobase/include/dict0priv.h
new file mode 100644
index 00000000..3f279205
--- /dev/null
+++ b/storage/innobase/include/dict0priv.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0priv.h
+Data dictionary private functions
+
+Created Fri 2 Jul 2010 13:30:38 EST - Sunny Bains
+*******************************************************/
+
+#ifndef dict0priv_h
+#define dict0priv_h
+
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function. Note: Not to be called from outside dict0*c functions.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+ const char* table_name); /*!< in: table name */
+
+/**********************************************************************//**
+Checks if a table is in the dictionary cache.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+ const char* table_name); /*!< in: table name */
+
+#include "dict0priv.ic"
+
+#endif /* dict0priv.h */
diff --git a/storage/innobase/include/dict0priv.ic b/storage/innobase/include/dict0priv.ic
new file mode 100644
index 00000000..2fcadc05
--- /dev/null
+++ b/storage/innobase/include/dict0priv.ic
@@ -0,0 +1,91 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0priv.ic
+Data dictionary system private include file
+
+Created Wed 13 Oct 2010 16:10:14 EST Sunny Bains
+***********************************************************************/
+
+#include "dict0dict.h"
+#include "dict0load.h"
+
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+ const char* table_name) /*!< in: table name */
+{
+ dict_table_t* table;
+
+ ut_ad(table_name);
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ table = dict_table_check_if_in_cache_low(table_name);
+
+ if (table && table->corrupted) {
+ ib::error error;
+ error << "Table " << table->name << "is corrupted";
+ if (srv_load_corrupted) {
+ error << ", but innodb_force_load_corrupted is set";
+ } else {
+ return(NULL);
+ }
+ }
+
+ if (table == NULL) {
+ table = dict_load_table(table_name, DICT_ERR_IGNORE_NONE);
+ }
+
+ ut_ad(!table || table->cached);
+
+ return(table);
+}
+
+/**********************************************************************//**
+Checks if a table is in the dictionary cache.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+ const char* table_name) /*!< in: table name */
+{
+ dict_table_t* table;
+ ulint table_fold;
+
+ DBUG_ENTER("dict_table_check_if_in_cache_low");
+ DBUG_PRINT("dict_table_check_if_in_cache_low",
+ ("table: '%s'", table_name));
+
+ ut_ad(table_name);
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ /* Look for the table name in the hash table */
+ table_fold = ut_fold_string(table_name);
+
+ HASH_SEARCH(name_hash, &dict_sys.table_hash, table_fold,
+ dict_table_t*, table, ut_ad(table->cached),
+ !strcmp(table->name.m_name, table_name));
+ DBUG_RETURN(table);
+}
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
new file mode 100644
index 00000000..cf0e2ada
--- /dev/null
+++ b/storage/innobase/include/dict0stats.h
@@ -0,0 +1,251 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.h
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_h
+#define dict0stats_h
+
+#include "dict0types.h"
+#include "trx0types.h"
+
+#define TABLE_STATS_NAME "mysql/innodb_table_stats"
+#define INDEX_STATS_NAME "mysql/innodb_index_stats"
+
+enum dict_stats_upd_option_t {
+ DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the
+ statistics using a precise and slow
+ algo and save them to the persistent
+ storage, if the persistent storage is
+ not present then emit a warning and
+ fall back to transient stats */
+ DICT_STATS_RECALC_TRANSIENT,/* (re) calculate the statistics
+ using an imprecise quick algo
+ without saving the results
+ persistently */
+ DICT_STATS_EMPTY_TABLE, /* Write all zeros (or 1 where it makes sense)
+ into a table and its indexes' statistics
+ members. The resulting stats correspond to an
+ empty table. If the table is using persistent
+ statistics, then they are saved on disk. */
+ DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* fetch the stats
+ from the persistent storage if the in-memory
+ structures have not been initialized yet,
+ otherwise do nothing */
+};
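+
+/* Usage sketch (illustrative only): the option is passed to
+dict_stats_update(), declared below; e.g. ANALYZE TABLE on a table with
+persistent statistics enabled boils down to roughly:
+
+	dberr_t err = dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT);
+*/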
+
+/*********************************************************************//**
+Set the persistent statistics flag for a given table. This is set only
+in the in-memory table object and is not saved on disk. It will be read
+from the .frm file upon first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_set_persistent(
+/*======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool ps_on, /*!< in: persistent stats explicitly enabled */
+ ibool ps_off) /*!< in: persistent stats explicitly disabled */
+ MY_ATTRIBUTE((nonnull));
+
+/** @return whether persistent statistics is enabled for a given table */
+UNIV_INLINE
+bool
+dict_stats_is_persistent_enabled(const dict_table_t* table)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Set the auto recalc flag for a given table (only honored for a persistent
+stats enabled table). The flag is set only in the in-memory table object
+and is not saved in InnoDB files. It will be read from the .frm file upon
+first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_auto_recalc_set(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool auto_recalc_on, /*!< in: explicitly enabled */
+ ibool auto_recalc_off); /*!< in: explicitly disabled */
+
+/** @return whether auto recalc is enabled for a given table*/
+UNIV_INLINE
+bool
+dict_stats_auto_recalc_is_enabled(const dict_table_t* table)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Initialize table's stats for the first time when opening a table. */
+UNIV_INLINE
+void
+dict_stats_init(
+/*============*/
+ dict_table_t* table); /*!< in/out: table */
+
+/*********************************************************************//**
+Deinitialize table's stats after the last close of the table. This is
+used to detect "FLUSH TABLE" and refresh the stats upon next open. */
+UNIV_INLINE
+void
+dict_stats_deinit(
+/*==============*/
+ dict_table_t* table) /*!< in/out: table */
+ MY_ATTRIBUTE((nonnull));
+
+#ifdef WITH_WSREP
+/** Update the table modification counter and if necessary,
+schedule new estimates for table and index statistics to be calculated.
+@param[in,out] table persistent or temporary table
+@param[in] trx transaction */
+void dict_stats_update_if_needed(dict_table_t *table, const trx_t &trx)
+ MY_ATTRIBUTE((nonnull));
+#else
+/** Update the table modification counter and if necessary,
+schedule new estimates for table and index statistics to be calculated.
+@param[in,out] table persistent or temporary table */
+void dict_stats_update_if_needed_func(dict_table_t *table)
+ MY_ATTRIBUTE((nonnull));
+# define dict_stats_update_if_needed(t,trx) dict_stats_update_if_needed_func(t)
+#endif
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization.
+@return DB_* error code or DB_SUCCESS */
+dberr_t
+dict_stats_update(
+/*==============*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_stats_upd_option_t stats_upd_option);
+ /*!< in: whether to (re) calc
+ the stats or to fetch them from
+ the persistent storage */
+
+/** Remove the information for a particular index's stats from the persistent
+storage if it exists and if there is data stored for this index.
+This function creates its own trx and commits it.
+
+We must modify system tables in a separate transaction in order to
+adhere to the InnoDB design constraint that dict_sys.latch prevents
+lock waits on system tables. If we modified system and user tables in
+the same transaction, we should exclusively hold dict_sys.latch until
+the transaction is committed, and effectively block other transactions
+that will attempt to open any InnoDB tables. Because we have no
+guarantee that user transactions will be committed fast, we cannot
+afford to keep the system tables locked in a user transaction.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_drop_index(
+/*==================*/
+ const char* tname, /*!< in: table name */
+ const char* iname, /*!< in: index name */
+ char* errstr, /*!< out: error message if != DB_SUCCESS
+ is returned */
+ ulint errstr_sz);/*!< in: size of the errstr buffer */
+
+/*********************************************************************//**
+Removes the statistics for a table and all of its indexes from the
+persistent storage if it exists and if there is data stored for the table.
+This function creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_drop_table(
+/*==================*/
+ const char* table_name, /*!< in: table name */
+ char* errstr, /*!< out: error message
+ if != DB_SUCCESS is returned */
+ ulint errstr_sz); /*!< in: size of errstr buffer */
+
+/*********************************************************************//**
+Fetches or calculates new estimates for index statistics. */
+void
+dict_stats_update_for_index(
+/*========================*/
+ dict_index_t* index) /*!< in/out: index */
+ MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Renames a table in InnoDB persistent stats storage.
+This function creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_rename_table(
+/*====================*/
+ const char* old_name, /*!< in: old table name */
+ const char* new_name, /*!< in: new table name */
+ char* errstr, /*!< out: error string if != DB_SUCCESS
+ is returned */
+ size_t errstr_sz); /*!< in: errstr size */
+/*********************************************************************//**
+Renames an index in InnoDB persistent stats storage.
+This function creates its own transaction and commits it.
+@return DB_SUCCESS or error code. DB_STATS_DO_NOT_EXIST will be returned
+if the persistent stats do not exist. */
+dberr_t
+dict_stats_rename_index(
+/*====================*/
+ const dict_table_t* table, /*!< in: table whose index
+ is renamed */
+ const char* old_index_name, /*!< in: old index name */
+ const char* new_index_name) /*!< in: new index name */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Save an individual index's statistic into the persistent statistics
+storage.
+@param[in] index index to be updated
+@param[in] last_update timestamp of the stat
+@param[in] stat_name name of the stat
+@param[in] stat_value value of the stat
+@param[in] sample_size n pages sampled or NULL
+@param[in] stat_description description of the stat
+@param[in,out] trx in case of NULL the function will
+allocate and free the trx object. If it is not NULL then it will be
+rolled back only in the case of error, but not freed.
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_save_index_stat(
+ dict_index_t* index,
+ time_t last_update,
+ const char* stat_name,
+ ib_uint64_t stat_value,
+ ib_uint64_t* sample_size,
+ const char* stat_description,
+ trx_t* trx);
+
+/** Report an error if updating table statistics failed because
+.ibd file is missing, table decryption failed or table is corrupted.
+@param[in,out] table Table
+@param[in] defragment true if statistics is for defragment
+@retval DB_DECRYPTION_FAILED if decryption of the table failed
+@retval DB_TABLESPACE_DELETED if .ibd file is missing
+@retval DB_CORRUPTION if table is marked as corrupted */
+dberr_t
+dict_stats_report_error(dict_table_t* table, bool defragment = false)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#include "dict0stats.ic"
+
+#ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS
+void test_dict_stats_all();
+#endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */
+
+#endif /* dict0stats_h */
diff --git a/storage/innobase/include/dict0stats.ic b/storage/innobase/include/dict0stats.ic
new file mode 100644
index 00000000..4972efe8
--- /dev/null
+++ b/storage/innobase/include/dict0stats.ic
@@ -0,0 +1,221 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.ic
+Code used for calculating and manipulating table statistics.
+
+Created Jan 23, 2012 Vasil Dimov
+*******************************************************/
+
+#include "dict0dict.h"
+#include "srv0srv.h"
+
+/*********************************************************************//**
+Set the persistent statistics flag for a given table. This is set only
+in the in-memory table object and is not saved on disk. It will be read
+from the .frm file upon first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_set_persistent(
+/*======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool ps_on, /*!< in: persistent stats explicitly enabled */
+ ibool ps_off) /*!< in: persistent stats explicitly disabled */
+{
+ /* Not allowed to have both flags set, but a CREATE or ALTER
+ statement that contains "STATS_PERSISTENT=0 STATS_PERSISTENT=1" would
+ end up having both set. In this case we clear the OFF flag. */
+ if (ps_on && ps_off) {
+ ps_off = FALSE;
+ }
+
+ ib_uint32_t stat_persistent = 0;
+
+ if (ps_on) {
+ stat_persistent |= DICT_STATS_PERSISTENT_ON;
+ }
+
+ if (ps_off) {
+ stat_persistent |= DICT_STATS_PERSISTENT_OFF;
+ }
+
+ /* we rely on this assignment to be atomic */
+ table->stat_persistent = stat_persistent;
+}
+
+/** @return whether persistent statistics is enabled for a given table */
+UNIV_INLINE
+bool
+dict_stats_is_persistent_enabled(const dict_table_t* table)
+{
+ /* Because of the nature of this check (non-locking) it is possible
+ that a table becomes:
+ * PS-disabled immediately after this function has returned TRUE or
+ * PS-enabled immediately after this function has returned FALSE.
+ This means that it is possible that we do:
+ + dict_stats_update(DICT_STATS_RECALC_PERSISTENT) on a table that has
+ just been PS-disabled or
+ + dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has
+ just been PS-enabled.
+ This is acceptable. Avoiding this would mean that we would have to
+ protect stat_persistent with dict_sys.mutex like the
+ other ::stat_ members, which would be too big a performance penalty,
+ especially when this function is called from
+ dict_stats_update_if_needed(). */
+
+ /* we rely on this read to be atomic */
+ ib_uint32_t stat_persistent = table->stat_persistent;
+
+ if (stat_persistent & DICT_STATS_PERSISTENT_ON) {
+ ut_ad(!(stat_persistent & DICT_STATS_PERSISTENT_OFF));
+ return(true);
+ } else if (stat_persistent & DICT_STATS_PERSISTENT_OFF) {
+ return(false);
+ } else {
+ return(srv_stats_persistent);
+ }
+}
+
+/*********************************************************************//**
+Set the auto recalc flag for a given table (only honored for a persistent
+stats enabled table). The flag is set only in the in-memory table object
+and is not saved in InnoDB files. It will be read from the .frm file upon
+first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_auto_recalc_set(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool auto_recalc_on, /*!< in: explicitly enabled */
+ ibool auto_recalc_off) /*!< in: explicitly disabled */
+{
+ ut_ad(!auto_recalc_on || !auto_recalc_off);
+
+ ib_uint32_t stats_auto_recalc = 0;
+
+ if (auto_recalc_on) {
+ stats_auto_recalc |= DICT_STATS_AUTO_RECALC_ON;
+ }
+
+ if (auto_recalc_off) {
+ stats_auto_recalc |= DICT_STATS_AUTO_RECALC_OFF;
+ }
+
+ /* we rely on this assignment to be atomic */
+ table->stats_auto_recalc = stats_auto_recalc;
+}
+
+/** @return whether auto recalc is enabled for a given table*/
+UNIV_INLINE
+bool
+dict_stats_auto_recalc_is_enabled(const dict_table_t* table)
+{
+ /* we rely on this read to be atomic */
+ ib_uint32_t stats_auto_recalc = table->stats_auto_recalc;
+
+ if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_ON) {
+ ut_ad(!(stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF));
+ return(true);
+ } else if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF) {
+ return(false);
+ } else {
+ return(srv_stats_auto_recalc);
+ }
+}
+
+/*********************************************************************//**
+Initialize table's stats for the first time when opening a table. */
+UNIV_INLINE
+void
+dict_stats_init(
+/*============*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ ut_ad(!mutex_own(&dict_sys.mutex));
+
+ if (table->stat_initialized) {
+ return;
+ }
+
+ dict_stats_upd_option_t opt;
+
+ if (dict_stats_is_persistent_enabled(table)) {
+ opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY;
+ } else {
+ opt = DICT_STATS_RECALC_TRANSIENT;
+ }
+
+ dict_stats_update(table, opt);
+}
+
+/*********************************************************************//**
+Deinitialize table's stats after the last close of the table. This is
+used to detect "FLUSH TABLE" and refresh the stats upon next open. */
+UNIV_INLINE
+void
+dict_stats_deinit(
+/*==============*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ ut_a(table->get_ref_count() == 0);
+
+ if (!table->stat_initialized) {
+ return;
+ }
+
+ table->stat_initialized = FALSE;
+
+#ifdef HAVE_valgrind
+ MEM_UNDEFINED(&table->stat_n_rows, sizeof table->stat_n_rows);
+ MEM_UNDEFINED(&table->stat_clustered_index_size,
+ sizeof table->stat_clustered_index_size);
+ MEM_UNDEFINED(&table->stat_sum_of_other_index_sizes,
+ sizeof table->stat_sum_of_other_index_sizes);
+ MEM_UNDEFINED(&table->stat_modified_counter,
+ sizeof table->stat_modified_counter);
+
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ MEM_UNDEFINED(
+ index->stat_n_diff_key_vals,
+ index->n_uniq
+ * sizeof index->stat_n_diff_key_vals[0]);
+ MEM_UNDEFINED(
+ index->stat_n_sample_sizes,
+ index->n_uniq
+ * sizeof index->stat_n_sample_sizes[0]);
+ MEM_UNDEFINED(
+ index->stat_n_non_null_key_vals,
+ index->n_uniq
+ * sizeof index->stat_n_non_null_key_vals[0]);
+ MEM_UNDEFINED(
+ &index->stat_index_size,
+ sizeof(index->stat_index_size));
+ MEM_UNDEFINED(
+ &index->stat_n_leaf_pages,
+ sizeof(index->stat_n_leaf_pages));
+ }
+#endif /* HAVE_valgrind */
+}
diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h
new file mode 100644
index 00000000..b210a2ec
--- /dev/null
+++ b/storage/innobase/include/dict0stats_bg.h
@@ -0,0 +1,122 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats_bg.h
+Code used for background table and index stats gathering.
+
+Created Apr 26, 2012 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_bg_h
+#define dict0stats_bg_h
+
+#include "dict0types.h"
+#include "os0thread.h"
+
+#ifdef HAVE_PSI_INTERFACE
+extern mysql_pfs_key_t dict_stats_recalc_pool_mutex_key;
+#endif /* HAVE_PSI_INTERFACE */
+
+#ifdef UNIV_DEBUG
+/** Value of MySQL global used to disable dict_stats thread. */
+extern my_bool innodb_dict_stats_disabled_debug;
+#endif /* UNIV_DEBUG */
+
+/*****************************************************************//**
+Delete a given table from the auto recalc pool. */
+void
+dict_stats_recalc_pool_del(
+/*=======================*/
+ const dict_table_t* table); /*!< in: table to remove */
+
+/** Yield the data dictionary latch when waiting
+for the background thread to stop accessing a table.
+@param trx transaction holding the data dictionary locks */
+#define DICT_BG_YIELD(trx) do { \
+ row_mysql_unlock_data_dictionary(trx); \
+ os_thread_sleep(250000); \
+ row_mysql_lock_data_dictionary(trx); \
+} while (0)
+
+/*****************************************************************//**
+Request the background collection of statistics to stop for a table.
+@retval true when no background process is active
+@retval false when it is not safe to modify the table definition */
+UNIV_INLINE
+bool
+dict_stats_stop_bg(
+/*===============*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ if (!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)) {
+ return(true);
+ }
+
+ table->stats_bg_flag |= BG_STAT_SHOULD_QUIT;
+ return(false);
+}
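+
+/* Usage sketch (illustrative; this is essentially what
+dict_stats_wait_bg_to_stop_using_table() below does): a DDL thread that
+holds the data dictionary lock keeps yielding until the background
+thread has honoured BG_STAT_SHOULD_QUIT:
+
+	while (!dict_stats_stop_bg(table)) {
+		DICT_BG_YIELD(trx);
+	}
+*/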
+
+/*****************************************************************//**
+Wait until background stats thread has stopped using the specified table.
+The caller must have locked the data dictionary using
+row_mysql_lock_data_dictionary() and this function may unlock it temporarily
+and restore the lock before it exits.
+The background stats thread is guaranteed not to start using the specified
+table after this function returns and before the caller unlocks the data
+dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag
+under dict_sys.mutex. */
+void
+dict_stats_wait_bg_to_stop_using_table(
+/*===================================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx); /*!< in/out: transaction to use for
+ unlocking/locking the data dict */
+/*****************************************************************//**
+Initialize the global variables needed for dict_stats operation.
+Must be called before the dict_stats task is started. */
+void dict_stats_init();
+
+/*****************************************************************//**
+Free resources allocated by dict_stats_init(); must be called
+after the dict_stats task has exited. */
+void dict_stats_deinit();
+
+#ifdef UNIV_DEBUG
+/** Enable or disable the dict stats background task. Used by:
+ SET GLOBAL innodb_dict_stats_disabled_debug = 1 (0).
+@param[in] save immediate result from check function */
+void dict_stats_disabled_debug_update(THD*, st_mysql_sys_var*, void*,
+ const void* save);
+#endif /* UNIV_DEBUG */
+
+/** Start the dict stats timer. */
+void dict_stats_start();
+
+/** Shut down the dict_stats timer. */
+void dict_stats_shutdown();
+
+/** Reschedule dict stats timer to run now. */
+void dict_stats_schedule_now();
+
+#endif /* dict0stats_bg_h */
diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h
new file mode 100644
index 00000000..d0da45ab
--- /dev/null
+++ b/storage/innobase/include/dict0types.h
@@ -0,0 +1,177 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0types.h
+Data dictionary global types
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0types_h
+#define dict0types_h
+
+#include <ut0mutex.h>
+#include <rem0types.h>
+
+struct dict_col_t;
+struct dict_field_t;
+struct dict_index_t;
+struct dict_table_t;
+struct dict_foreign_t;
+struct dict_v_col_t;
+
+struct ind_node_t;
+struct tab_node_t;
+struct dict_add_v_col_t;
+
+/* Space id and page no where the dictionary header resides */
+#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
+#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
+
+/* The ibuf table and index IDs are assigned as the number
+DICT_IBUF_ID_MIN plus the space id */
+#define DICT_IBUF_ID_MIN 0xFFFFFFFF00000000ULL
+
+typedef ib_id_t table_id_t;
+typedef ib_id_t index_id_t;
+
+/** Maximum transaction identifier */
+#define TRX_ID_MAX IB_ID_MAX
+
+/** The bit pattern corresponding to TRX_ID_MAX */
+extern const byte trx_id_max_bytes[8];
+extern const byte timestamp_max_bytes[7];
+
+/** Error to ignore when we load table dictionary into memory. However,
+the table and index will be marked as "corrupted", and caller will
+be responsible to deal with corrupted table or index.
+Note: please define the IGNORE_ERR_* as bits, so their value can
+be or-ed together */
+enum dict_err_ignore_t {
+ DICT_ERR_IGNORE_NONE = 0, /*!< no error to ignore */
+ DICT_ERR_IGNORE_FK_NOKEY = 1, /*!< ignore error if any foreign
+ key is missing */
+ DICT_ERR_IGNORE_INDEX_ROOT = 2, /*!< ignore error if index root
+ page is FIL_NULL or incorrect value */
+ DICT_ERR_IGNORE_CORRUPT = 4, /*!< skip corrupted indexes */
+ DICT_ERR_IGNORE_RECOVER_LOCK = 8,
+ /*!< Used when recovering table locks
+ for resurrected transactions.
+ Silently load a missing
+ tablespace, and do not load
+ incomplete index definitions. */
+ /** ignore all errors above */
+ DICT_ERR_IGNORE_ALL = 15,
+ /** prepare to drop the table; do not attempt to load tablespace */
+ DICT_ERR_IGNORE_DROP = 31
+};
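+
+/* Because the values are bit flags, callers may OR them together
+(illustrative sketch):
+
+	dict_table_t* t = dict_load_table(
+		name,
+		dict_err_ignore_t(DICT_ERR_IGNORE_FK_NOKEY
+				  | DICT_ERR_IGNORE_CORRUPT));
+*/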
+
+/** Quiescing states for flushing tables to disk. */
+enum ib_quiesce_t {
+ QUIESCE_NONE,
+ QUIESCE_START, /*!< Initialise, prepare to start */
+ QUIESCE_COMPLETE /*!< All done */
+};
+
+#ifndef UNIV_INNOCHECKSUM
+typedef ib_mutex_t DictSysMutex;
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Prefix for tmp tables, adopted from sql/table.h */
+#define TEMP_FILE_PREFIX "#sql"
+#define TEMP_FILE_PREFIX_LENGTH 4
+#define TEMP_FILE_PREFIX_INNODB "#sql-ib"
+
+#define TEMP_TABLE_PREFIX "#sql"
+#define TEMP_TABLE_PATH_PREFIX "/" TEMP_TABLE_PREFIX
+
+/** Table name wrapper for pretty-printing */
+struct table_name_t
+{
+ /** The name in internal representation */
+ char* m_name;
+
+ /** Default constructor */
+ table_name_t() {}
+ /** Constructor */
+ table_name_t(char* name) : m_name(name) {}
+
+ /** @return the end of the schema name */
+ const char* dbend() const
+ {
+ const char* sep = strchr(m_name, '/');
+ ut_ad(sep);
+ return sep;
+ }
+
+ /** @return the length of the schema name, in bytes */
+ size_t dblen() const { return size_t(dbend() - m_name); }
+
+ /** Determine the filename-safe encoded table name.
+ @return the filename-safe encoded table name */
+ const char* basename() const { return dbend() + 1; }
+
+ /** The start of the table basename suffix for partitioned tables */
+ static const char part_suffix[4];
+
+ /** Determine the partition or subpartition name suffix.
+ @return the partition name
+ @retval NULL if the table is not partitioned */
+ const char* part() const { return strstr(basename(), part_suffix); }
+
+ /** @return whether this is a temporary or intermediate table name */
+ inline bool is_temporary() const;
+};
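+
+/* Illustrative example: for the internal name "test/t1#P#p0" of a
+partitioned table, dbend() points at the '/', dblen() is 4 ("test"),
+basename() is "t1#P#p0" and part() is "#P#p0" (assuming part_suffix
+is "#P#"). */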
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/** Dump the change buffer at startup */
+extern my_bool ibuf_dump;
+/** Flag to control insert buffer debugging. */
+extern uint ibuf_debug;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+/** Shift for spatial status */
+#define SPATIAL_STATUS_SHIFT 12
+
+/** Mask to encode/decode spatial status. */
+#define SPATIAL_STATUS_MASK (3U << SPATIAL_STATUS_SHIFT)
+
+#if SPATIAL_STATUS_MASK < REC_VERSION_56_MAX_INDEX_COL_LEN
+# error SPATIAL_STATUS_MASK < REC_VERSION_56_MAX_INDEX_COL_LEN
+#endif
+
+/** Whether a column is used in a spatial index or a regular index.
+Note: the spatial status is part of the persistent undo log,
+so we should not modify the values in MySQL 5.7 */
+enum spatial_status_t {
+ /** Unknown status (undo format in 5.7.9) */
+ SPATIAL_UNKNOWN = 0,
+
+ /** Not used in gis index. */
+ SPATIAL_NONE = 1,
+
+ /** Used in both spatial index and regular index. */
+ SPATIAL_MIXED = 2,
+
+ /** Only used in spatial index. */
+ SPATIAL_ONLY = 3
+};
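+
+/* Encode/decode sketch (illustrative only) for the bits selected by
+SPATIAL_STATUS_MASK:
+
+	ulint encode(ulint len, spatial_status_t s)
+	{ return len | (ulint(s) << SPATIAL_STATUS_SHIFT); }
+
+	spatial_status_t decode(ulint v)
+	{ return spatial_status_t((v & SPATIAL_STATUS_MASK)
+				  >> SPATIAL_STATUS_SHIFT); }
+*/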
+
+#endif
diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h
new file mode 100644
index 00000000..cb8b998f
--- /dev/null
+++ b/storage/innobase/include/dyn0buf.h
@@ -0,0 +1,496 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0buf.h
+The dynamically allocated buffer implementation
+
+Created 2013-03-16 Sunny Bains
+*******************************************************/
+
+#ifndef dyn0buf_h
+#define dyn0buf_h
+
+#include "mem0mem.h"
+#include "dyn0types.h"
+#include "ilist.h"
+
+
+/** Class that manages dynamic buffers. It uses an intrusive list of
+mtr_buf_t::block_t instances. We don't use STL containers in
+order to avoid the overhead of heap calls. Using a custom memory
+allocator doesn't solve the problem either because we have to get
+the memory from somewhere. We can't use the block_t::m_data as the
+backend for the custom allocator because we would like the data in
+the blocks to be contiguous. */
+class mtr_buf_t {
+public:
+ /** SIZE - sizeof(m_node) + sizeof(m_used) */
+ enum { MAX_DATA_SIZE = DYN_ARRAY_DATA_SIZE
+ - sizeof(ilist_node<>) + sizeof(uint32_t) };
+
+ class block_t : public ilist_node<> {
+ public:
+
+ block_t()
+ {
+ compile_time_assert(MAX_DATA_SIZE <= (2 << 15));
+ init();
+ }
+
+ /**
+ Gets the number of used bytes in a block.
+ @return number of bytes used */
+ ulint used() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(static_cast<ulint>(m_used & ~DYN_BLOCK_FULL_FLAG));
+ }
+
+ /**
+ Gets pointer to the start of data.
+ @return pointer to data */
+ byte* start()
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(m_data);
+ }
+
+ /**
+ @return start of data - non const version */
+ byte* begin()
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(m_data);
+ }
+
+ /**
+ @return end of used data - non const version */
+ byte* end()
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(begin() + m_used);
+ }
+
+ /**
+ @return start of data - const version */
+ const byte* begin() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(m_data);
+ }
+
+ /**
+ @return end of used data - const version */
+ const byte* end() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(begin() + m_used);
+ }
+
+ private:
+ /**
+ @return pointer to start of reserved space */
+ template <typename Type>
+ Type push(uint32_t size)
+ {
+ Type ptr = reinterpret_cast<Type>(end());
+
+ m_used += size;
+ ut_ad(m_used <= uint32_t(MAX_DATA_SIZE));
+
+ return(ptr);
+ }
+
+ /**
+ Grow the stack. */
+ void close(const byte* ptr)
+ {
+ /* Check that it is within bounds */
+ ut_ad(ptr >= begin());
+ ut_ad(ptr <= begin() + m_buf_end);
+
+ /* We have done the boundary check above */
+ m_used = uint32_t(ptr - begin());
+
+ ut_ad(m_used <= MAX_DATA_SIZE);
+ ut_d(m_buf_end = 0);
+ }
+
+ /**
+ Initialise the block */
+ void init()
+ {
+ m_used = 0;
+ ut_d(m_buf_end = 0);
+ ut_d(m_magic_n = DYN_BLOCK_MAGIC_N);
+ }
+ private:
+#ifdef UNIV_DEBUG
+ /** If opened then this is the buffer end offset, else 0 */
+ ulint m_buf_end;
+
+ /** Magic number (DYN_BLOCK_MAGIC_N) */
+ ulint m_magic_n;
+#endif /* UNIV_DEBUG */
+
+ /** Storage */
+ byte m_data[MAX_DATA_SIZE];
+
+ /** number of data bytes used in this block;
+ DYN_BLOCK_FULL_FLAG is set when the block becomes full */
+ uint32_t m_used;
+
+ friend class mtr_buf_t;
+ };
+
+ typedef sized_ilist<block_t> list_t;
+
+ /** Default constructor */
+ mtr_buf_t()
+ :
+ m_heap(),
+ m_size()
+ {
+ push_back(&m_first_block);
+ }
+
+ /** Destructor */
+ ~mtr_buf_t()
+ {
+ erase();
+ }
+
+ /** Reset the buffer vector */
+ void erase()
+ {
+ if (m_heap != NULL) {
+ mem_heap_free(m_heap);
+ m_heap = NULL;
+
+ /* Initialise the list and add the first block. */
+ m_list.clear();
+ m_list.push_back(m_first_block);
+ } else {
+ m_first_block.init();
+ ut_ad(m_list.size() == 1);
+ }
+
+ m_size = 0;
+ }
+
+ /**
+ Makes room on top and returns a pointer to a buffer in it. After
+ copying the elements, the caller must close the buffer using close().
+ @param size in bytes of the buffer; MUST be <= MAX_DATA_SIZE!
+ @return pointer to the buffer */
+ byte* open(ulint size)
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ ut_ad(size > 0);
+ ut_ad(size <= MAX_DATA_SIZE);
+
+ block_t* block;
+
+ block = has_space(size) ? back() : add_block();
+
+ ut_ad(block->m_used <= MAX_DATA_SIZE);
+ ut_d(block->m_buf_end = block->m_used + size);
+
+ return(block->end());
+ }
+
+ /**
+ Closes the buffer returned by open.
+ @param ptr end of used space */
+ void close(const byte* ptr)
+ {
+ ut_ad(!m_list.empty());
+ block_t* block = back();
+
+ m_size -= block->used();
+
+ block->close(ptr);
+
+ m_size += block->used();
+ }
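+
+	/* Usage sketch (illustrative only): open() and close() bracket an
+	in-place write, consuming at most the reserved size, e.g.
+
+		byte*	ptr = buf.open(4);
+		mach_write_to_4(ptr, val);
+		buf.close(ptr + 4);
+	*/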
+
+ /**
+ Makes room on top and returns a pointer to the added element.
+ The caller must copy the element to the pointer returned.
+ @param size in bytes of the element
+ @return pointer to the element */
+ template <typename Type>
+ Type push(uint32_t size)
+ {
+ ut_ad(size > 0);
+ ut_ad(size <= MAX_DATA_SIZE);
+
+ block_t* block;
+
+ block = has_space(size) ? back() : add_block();
+
+ m_size += size;
+
+ /* See ISO C++03 14.2/4 for why "template" is required. */
+
+ return(block->template push<Type>(size));
+ }
+
+ /**
+ Pushes len bytes to the buffer.
+ @param ptr bytes to write
+ @param len number of bytes to write */
+ void push(const byte* ptr, uint32_t len)
+ {
+ while (len > 0) {
+ uint32_t n_copied = std::min(len,
+ uint32_t(MAX_DATA_SIZE));
+ ::memmove(push<byte*>(n_copied), ptr, n_copied);
+
+ ptr += n_copied;
+ len -= n_copied;
+ }
+ }
+
+ /**
+ Returns a pointer to an element in the buffer. const version.
+ @param pos position of element in bytes from start
+ @return pointer to element */
+ template <typename Type>
+ const Type at(ulint pos) const
+ {
+ block_t* block = const_cast<block_t*>(
+ const_cast<mtr_buf_t*>(this)->find(pos));
+
+ return(reinterpret_cast<Type>(block->begin() + pos));
+ }
+
+ /**
+ Returns a pointer to an element in the buffer. non const version.
+ @param pos position of element in bytes from start
+ @return pointer to element */
+ template <typename Type>
+ Type at(ulint pos)
+ {
+ block_t* block = const_cast<block_t*>(find(pos));
+
+ return(reinterpret_cast<Type>(block->begin() + pos));
+ }
+
+ /**
+ Returns the size of the total stored data.
+ @return data size in bytes */
+ ulint size() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+#ifdef UNIV_DEBUG
+ ulint total_size = 0;
+
+ for (list_t::iterator it = m_list.begin(), end = m_list.end();
+ it != end; ++it) {
+ total_size += it->used();
+ }
+
+ ut_ad(total_size == m_size);
+#endif /* UNIV_DEBUG */
+ return(m_size);
+ }
+
+ /**
+ Iterate over each block and call the functor.
+ @return false if iteration was terminated. */
+ template <typename Functor>
+ bool for_each_block(Functor& functor) const
+ {
+ for (list_t::iterator it = m_list.begin(), end = m_list.end();
+ it != end; ++it) {
+
+ if (!functor(&*it)) {
+ return false;
+ }
+ }
+
+ return(true);
+ }
+
+ /**
+ Iterate over each block and call the functor.
+ @return false if iteration was terminated. */
+ template <typename Functor>
+ bool for_each_block(const Functor& functor) const
+ {
+ for (typename list_t::iterator it = m_list.begin(),
+ end = m_list.end();
+ it != end; ++it) {
+
+ if (!functor(&*it)) {
+ return false;
+ }
+ }
+
+ return(true);
+ }
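+
+	/* Functor sketch (illustrative only): flattening the buffer into
+	one contiguous area, block by block:
+
+		struct Copier {
+			byte* dst;
+			bool operator()(const block_t* b) {
+				memcpy(dst, b->begin(), b->used());
+				dst += b->used();
+				return true;
+			}
+		};
+	*/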
+
+ /**
+ Iterate over all the blocks in reverse and call the iterator
+ @return false if iteration was terminated. */
+ template <typename Functor>
+ bool for_each_block_in_reverse(Functor& functor) const
+ {
+ for (list_t::reverse_iterator it = m_list.rbegin(),
+ end = m_list.rend();
+ it != end; ++it) {
+
+ if (!functor(&*it)) {
+ return false;
+ }
+ }
+
+ return(true);
+ }
+
+ /**
+ Iterate over all the blocks in reverse and call the iterator
+ @return false if iteration was terminated. */
+ template <typename Functor>
+ bool for_each_block_in_reverse(const Functor& functor) const
+ {
+ for (list_t::reverse_iterator it = m_list.rbegin(),
+ end = m_list.rend();
+ it != end; ++it) {
+
+ if (!functor(&*it)) {
+ return false;
+ }
+ }
+
+ return(true);
+ }
+
+ /**
+ @return the first block */
+ block_t* front()
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return &m_list.front();
+ }
+
+ /**
+ @return true if all the data fits in m_first_block (no heap blocks allocated) */
+ bool is_small() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(m_heap == NULL);
+ }
+
+ /** @return whether the buffer is empty */
+ bool empty() const { return !back()->m_used; }
+
+private:
+ // Disable copying
+ mtr_buf_t(const mtr_buf_t&);
+ mtr_buf_t& operator=(const mtr_buf_t&);
+
+ /**
+ Add the block to the end of the list*/
+ void push_back(block_t* block)
+ {
+ block->init();
+ m_list.push_back(*block);
+ }
+
+ /** @return the last block in the list */
+ block_t* back() const
+ {
+ return &const_cast<block_t&>(m_list.back());
+ }
+
+ /*
+ @return true if the request can be fulfilled */
+ bool has_space(ulint size) const
+ {
+ return(back()->m_used + size <= MAX_DATA_SIZE);
+ }
+
+ /*
+ @return true if the request can be fulfilled */
+ bool has_space(ulint size)
+ {
+ return(back()->m_used + size <= MAX_DATA_SIZE);
+ }
+
+ /** Find the block that contains the pos.
+ @param pos absolute offset, it is updated to make it relative
+ to the block
+ @return the block containing the pos. */
+ block_t* find(ulint& pos)
+ {
+ ut_ad(!m_list.empty());
+
+ for (list_t::iterator it = m_list.begin(), end = m_list.end();
+ it != end; ++it) {
+
+ if (pos < it->used()) {
+ ut_ad(it->used() >= pos);
+
+ return &*it;
+ }
+
+ pos -= it->used();
+ }
+
+ return NULL;
+ }
+
+ /**
+ Allocate and add a new block to m_list */
+ block_t* add_block()
+ {
+ block_t* block;
+
+ if (m_heap == NULL) {
+ m_heap = mem_heap_create(sizeof(*block));
+ }
+
+ block = reinterpret_cast<block_t*>(
+ mem_heap_alloc(m_heap, sizeof(*block)));
+
+ push_back(block);
+
+ return(block);
+ }
+
+private:
+ /** Heap to use for memory allocation */
+ mem_heap_t* m_heap;
+
+ /** Allocated blocks */
+ list_t m_list;
+
+ /** Total size used by all blocks */
+ ulint m_size;
+
+ /** The default block, should always be the first element. This
+ is for backwards compatibility and to avoid an extra heap allocation
+ for small REDO log records */
+ block_t m_first_block;
+};
+
+#endif /* dyn0buf_h */
diff --git a/storage/innobase/include/dyn0types.h b/storage/innobase/include/dyn0types.h
new file mode 100644
index 00000000..83d0b0d6
--- /dev/null
+++ b/storage/innobase/include/dyn0types.h
@@ -0,0 +1,39 @@
+/*****************************************************************************
+
+Copyright (c) 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0types.h
+The dynamically allocated buffer types and constants
+
+Created 2013-03-16 Sunny Bains
+*******************************************************/
+
+#ifndef dyn0types_h
+#define dyn0types_h
+
+/** Value of dyn_block_t::magic_n */
+#define DYN_BLOCK_MAGIC_N 375767
+
+/** This is the initial 'payload' size of a dynamic array */
+#define DYN_ARRAY_DATA_SIZE 512
+
+/** Flag for dyn_block_t::used that indicates a full block */
+#define DYN_BLOCK_FULL_FLAG 0x1000000UL
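+
+/* Note (illustrative): the flag shares one word with the byte count, so
+readers mask it out, as mtr_buf_t::block_t::used() in dyn0buf.h does:
+	used = m_used & ~DYN_BLOCK_FULL_FLAG */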
+
+#endif /* dyn0types_h */
diff --git a/storage/innobase/include/eval0eval.h b/storage/innobase/include/eval0eval.h
new file mode 100644
index 00000000..ebd40924
--- /dev/null
+++ b/storage/innobase/include/eval0eval.h
@@ -0,0 +1,109 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.h
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0eval_h
+#define eval0eval_h
+
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated in the above function. The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+void
+eval_node_free_val_buf(
+/*===================*/
+ que_node_t* node); /*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+ sym_node_t* sym_node); /*!< in: symbol table node */
+/*****************************************************************//**
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+ que_node_t* exp_node); /*!< in: expression */
+/*****************************************************************//**
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+ que_node_t* node, /*!< in: expression node */
+ lint val); /*!< in: value to set */
+/*****************************************************************//**
+Gets an integer value from an expression node.
+@return integer value */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+ que_node_t* node); /*!< in: expression node */
+/*****************************************************************//**
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+ que_node_t* node, /*!< in: query graph node */
+ const byte* str, /*!< in: binary string */
+ ulint len); /*!< in: string length or UNIV_SQL_NULL */
+/*****************************************************************//**
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+ que_node_t* node1, /*!< in: node to copy to */
+ que_node_t* node2); /*!< in: node to copy from */
+/*****************************************************************//**
+Gets an iboolean value from a query node.
+@return iboolean value */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+ que_node_t* node); /*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a comparison node.
+@return the result of the comparison */
+ibool
+eval_cmp(
+/*=====*/
+ func_node_t* cmp_node); /*!< in: comparison node */
+
+
+#include "eval0eval.ic"
+
+#endif
diff --git a/storage/innobase/include/eval0eval.ic b/storage/innobase/include/eval0eval.ic
new file mode 100644
index 00000000..0ea4057f
--- /dev/null
+++ b/storage/innobase/include/eval0eval.ic
@@ -0,0 +1,254 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.ic
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "pars0grm.h"
+
+/*****************************************************************//**
+Evaluates a function node. */
+void
+eval_func(
+/*======*/
+ func_node_t* func_node); /*!< in: function node */
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return pointer to allocated buffer */
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+ que_node_t* node, /*!< in: query graph node; sets the val field
+ data field to point to the new buffer, and
+ len field equal to size */
+ ulint size); /*!< in: buffer size */
+
+
+/*****************************************************************//**
+Allocates a new buffer if needed.
+@return pointer to buffer */
+UNIV_INLINE
+byte*
+eval_node_ensure_val_buf(
+/*=====================*/
+ que_node_t* node, /*!< in: query graph node; sets the val field
+ data field to point to the new buffer, and
+ len field equal to size */
+ ulint size) /*!< in: buffer size */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+ dfield_set_len(dfield, size);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (!data || que_node_get_val_buf_size(node) < size) {
+
+ data = eval_node_alloc_val_buf(node, size);
+ }
+
+ return(data);
+}
+
+/*****************************************************************//**
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+ sym_node_t* sym_node) /*!< in: symbol table node */
+{
+
+ ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
+
+ if (sym_node->indirection) {
+ /* The symbol table node is an alias for a variable or a
+ column */
+
+ dfield_copy_data(que_node_get_val(sym_node),
+ que_node_get_val(sym_node->indirection));
+ }
+}
+
+/*****************************************************************//**
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+ que_node_t* exp_node) /*!< in: expression */
+{
+ if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) {
+
+ eval_sym((sym_node_t*) exp_node);
+
+ return;
+ }
+
+ eval_func(static_cast<func_node_t*>(exp_node));
+}
+
+/*****************************************************************//**
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+ que_node_t* node, /*!< in: expression node */
+ lint val) /*!< in: value to set */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (data == NULL) {
+ data = eval_node_alloc_val_buf(node, 4);
+ }
+
+ ut_ad(dfield_get_len(dfield) == 4);
+
+ mach_write_to_4(data, (ulint) val);
+}
+
+/*****************************************************************//**
+Gets an integer non-SQL null value from an expression node.
+@return integer value */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+ que_node_t* node) /*!< in: expression node */
+{
+ const byte* ptr;
+ dfield_t* dfield;
+
+ dfield = que_node_get_val(node);
+ ptr = static_cast<byte*>(dfield_get_data(dfield));
+
+ ut_ad(dfield_get_len(dfield) == 4);
+
+ return((int) mach_read_from_4(ptr));
+}
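+
+/* Round-trip sketch (illustrative only): the value is stored as 4 bytes
+in the machine-independent big-endian format, so
+
+	eval_node_set_int_val(node, 42);
+	ut_ad(eval_node_get_int_val(node) == 42);
+
+holds for any value representable in 32 bits. */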
+
+/*****************************************************************//**
+Gets an iboolean value from a query node.
+@return iboolean value */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+ que_node_t* node) /*!< in: query graph node */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ ut_ad(data != NULL);
+
+ return(mach_read_from_1(data));
+}
+
+/*****************************************************************//**
+Sets an iboolean value as the value of a function node. */
+UNIV_INLINE
+void
+eval_node_set_ibool_val(
+/*====================*/
+ func_node_t* func_node, /*!< in: function node */
+ ibool val) /*!< in: value to set */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(func_node);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (data == NULL) {
+ /* Allocate 1 byte to hold the value */
+
+ data = eval_node_alloc_val_buf(func_node, 1);
+ }
+
+ ut_ad(dfield_get_len(dfield) == 1);
+
+ mach_write_to_1(data, val);
+}
+
+/*****************************************************************//**
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+ que_node_t* node, /*!< in: query graph node */
+ const byte* str, /*!< in: binary string */
+ ulint len) /*!< in: string length or UNIV_SQL_NULL */
+{
+ byte* data;
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_len(que_node_get_val(node), len);
+
+ return;
+ }
+
+ data = eval_node_ensure_val_buf(node, len);
+
+ memcpy(data, str, len);
+}
+
+/*****************************************************************//**
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+ que_node_t* node1, /*!< in: node to copy to */
+ que_node_t* node2) /*!< in: node to copy from */
+{
+ dfield_t* dfield2;
+
+ dfield2 = que_node_get_val(node2);
+
+ eval_node_copy_and_alloc_val(
+ node1,
+ static_cast<byte*>(dfield_get_data(dfield2)),
+ dfield_get_len(dfield2));
+}
diff --git a/storage/innobase/include/eval0proc.h b/storage/innobase/include/eval0proc.h
new file mode 100644
index 00000000..71700bb5
--- /dev/null
+++ b/storage/innobase/include/eval0proc.h
@@ -0,0 +1,94 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.h
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0proc_h
+#define eval0proc_h
+
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an if-statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+if_step(
+/*====*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a while-statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+while_step(
+/*=======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a for-loop node.
+@return query thread to run next or NULL */
+que_thr_t*
+for_step(
+/*=====*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an assignment statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+assign_step(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a procedure call node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an exit statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+exit_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a return-statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+return_step(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+#include "eval0proc.ic"
+
+#endif
diff --git a/storage/innobase/include/eval0proc.ic b/storage/innobase/include/eval0proc.ic
new file mode 100644
index 00000000..b0c5f75b
--- /dev/null
+++ b/storage/innobase/include/eval0proc.ic
@@ -0,0 +1,88 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.ic
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#include "pars0pars.h"
+#include "que0que.h"
+#include "eval0eval.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ proc_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<proc_node_t*>(thr->run_node);
+ ut_ad(que_node_get_type(node) == QUE_NODE_PROC);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ /* Start execution from the first statement in the statement
+ list */
+
+ thr->run_node = node->stat_list;
+ } else {
+ /* Move to the next statement */
+ ut_ad(que_node_get_next(thr->prev_node) == NULL);
+
+ thr->run_node = NULL;
+ }
+
+ if (thr->run_node == NULL) {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a procedure call node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ func_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<func_node_t*>(thr->run_node);
+ ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+ /* Evaluate the procedure */
+
+ eval_exp(node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h
new file mode 100644
index 00000000..872053dc
--- /dev/null
+++ b/storage/innobase/include/fil0crypt.h
@@ -0,0 +1,455 @@
+/*****************************************************************************
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (c) 2015, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0crypt.h
+The low-level file system encryption support functions
+
+Created 04/01/2015 Jan Lindström
+*******************************************************/
+
+#ifndef fil0crypt_h
+#define fil0crypt_h
+
+#include "os0event.h"
+#include "my_crypt.h"
+#include "fil0fil.h"
+
+/**
+* Magic pattern in start of crypt data on page 0
+*/
+#define MAGIC_SZ 6
+
+static const unsigned char CRYPT_MAGIC[MAGIC_SZ] = {
+ 's', 0xE, 0xC, 'R', 'E', 't' };
+
+/* This key will be used if nothing else is given */
+#define FIL_DEFAULT_ENCRYPTION_KEY ENCRYPTION_KEY_SYSTEM_DATA
+
+extern os_event_t fil_crypt_threads_event;
+
+/**
+ * CRYPT_SCHEME_UNENCRYPTED
+ *
+ * Used as an intermediate state when converting a space from unencrypted
+ * to encrypted
+ */
+/**
+ * CRYPT_SCHEME_1
+ *
+ * xxx is AES_CTR or AES_CBC (or another block cipher with the same key and IV lengths)
+ * L = AES_ECB(KEY, IV)
+ * CRYPT(PAGE) = xxx(KEY=L, IV=C, PAGE)
+ */
+
+#define CRYPT_SCHEME_1 1
+#define CRYPT_SCHEME_1_IV_LEN 16
+#define CRYPT_SCHEME_UNENCRYPTED 0
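+
+/* Illustrative sketch of CRYPT_SCHEME_1 (pseudocode only; the names
+below are descriptive, not real APIs -- the implementation lives in
+fil0crypt.cc and the my_crypt library):
+
+	L := AES_ECB(KEY, IV)	// derive the per-space key L once
+	C := counter derived from space id, page number and key version
+	dst := xxx(key=L, iv=C, src)	// encrypt one page with AES_CTR/CBC
+
+The key version is stored unencrypted in the page header, so that
+decryption can locate the matching key. */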
+
+/* Cached L or key for given key_version */
+struct key_struct
+{
+ uint key_version; /*!< Version of the key */
+ uint key_length; /*!< Key length */
+ unsigned char key[MY_AES_MAX_KEY_LENGTH]; /*!< Cached key
+ (that is L in CRYPT_SCHEME_1) */
+};
+
+/** whether encryption is enabled */
+extern ulong srv_encrypt_tables;
+
+/** Mutex helper for crypt_data->scheme
+@param[in,out]	scheme	encryption scheme
+@param[in]	exit	whether to exit (rather than enter) the mutex */
+void
+crypt_data_scheme_locker(
+ st_encryption_scheme* scheme,
+ int exit);
+
+struct fil_space_rotate_state_t
+{
+ time_t start_time; /*!< time when rotation started */
+ ulint active_threads; /*!< active threads in space */
+ uint32_t next_offset; /*!< next "free" offset */
+ uint32_t max_offset; /*!< max offset needing to be rotated */
+ uint min_key_version_found; /*!< min key version found but not
+ rotated */
+ lsn_t end_lsn; /*!< max lsn created when rotating this
+ space */
+ bool starting; /*!< initial write of IV */
+ bool flushing; /*!< space is being flushed at end of rotate */
+};
+
+#ifndef UNIV_INNOCHECKSUM
+
+struct fil_space_crypt_t : st_encryption_scheme
+{
+ public:
+ /** Constructor. Does not initialize the members!
+ The object is expected to be placed in a buffer that
+ has been zero-initialized. */
+ fil_space_crypt_t(
+ uint new_type,
+ uint new_min_key_version,
+ uint new_key_id,
+ fil_encryption_t new_encryption)
+ : st_encryption_scheme(),
+ min_key_version(new_min_key_version),
+ encryption(new_encryption),
+ key_found(0),
+ rotate_state()
+ {
+ key_id = new_key_id;
+ my_random_bytes(iv, sizeof(iv));
+ mutex_create(LATCH_ID_FIL_CRYPT_DATA_MUTEX, &mutex);
+ locker = crypt_data_scheme_locker;
+ type = new_type;
+
+ if (new_encryption == FIL_ENCRYPTION_OFF ||
+ (!srv_encrypt_tables &&
+ new_encryption == FIL_ENCRYPTION_DEFAULT)) {
+ type = CRYPT_SCHEME_UNENCRYPTED;
+ } else {
+ type = CRYPT_SCHEME_1;
+ min_key_version = key_get_latest_version();
+ }
+
+ key_found = min_key_version;
+ }
+
+ /** Destructor */
+ ~fil_space_crypt_t()
+ {
+ mutex_free(&mutex);
+ }
+
+	/** Get the latest key version from the encryption plugin.
+	@retval key_version on success
+	@retval ENCRYPTION_KEY_VERSION_INVALID if the used key_id
+	is not found in the encryption plugin. */
+ uint key_get_latest_version(void);
+
+	/** Returns true if the key was found in the encryption
+	plugin, false if not. */
+ bool is_key_found() const {
+ return key_found != ENCRYPTION_KEY_VERSION_INVALID;
+ }
+
+ /** Returns true if tablespace should be encrypted */
+ bool should_encrypt() const {
+ return ((encryption == FIL_ENCRYPTION_ON) ||
+ (srv_encrypt_tables &&
+ encryption == FIL_ENCRYPTION_DEFAULT));
+ }
+
+ /** Return true if tablespace is encrypted. */
+ bool is_encrypted() const {
+ return (encryption != FIL_ENCRYPTION_OFF);
+ }
+
+	/** Return true if default tablespace encryption is used. */
+ bool is_default_encryption() const {
+ return (encryption == FIL_ENCRYPTION_DEFAULT);
+ }
+
+ /** Return true if tablespace is not encrypted. */
+ bool not_encrypted() const {
+ return (encryption == FIL_ENCRYPTION_OFF);
+ }
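+
+	/* Summary of the predicates above, derived from the code
+	(illustrative only):
+
+	encryption		should_encrypt()	is_encrypted()
+	FIL_ENCRYPTION_ON	yes			yes
+	FIL_ENCRYPTION_OFF	no			no
+	FIL_ENCRYPTION_DEFAULT	srv_encrypt_tables	yes */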
+
+	/** Fill the crypt data information into the given page.
+	It should be called during .ibd file creation.
+ @param[in] flags tablespace flags
+ @param[in,out] page first page of the tablespace */
+ void fill_page0(ulint flags, byte* page);
+
+ /** Write encryption metadata to the first page.
+ @param[in,out] block first page of the tablespace
+ @param[in,out] mtr mini-transaction */
+ void write_page0(buf_block_t* block, mtr_t* mtr);
+
+ uint min_key_version; // min key version for this space
+ fil_encryption_t encryption; // Encryption setup
+
+ ib_mutex_t mutex; // mutex protecting following variables
+
+	/** Return code from encryption_key_get_latest_version.
+	If ENCRYPTION_KEY_VERSION_INVALID, the encryption plugin
+	could not find the key, and there is no need to ask the
+	plugin again, as keys are only read at startup. */
+ uint key_found;
+
+ fil_space_rotate_state_t rotate_state;
+};
+
+/** Status info about encryption */
+struct fil_space_crypt_status_t {
+ ulint space; /*!< tablespace id */
+ ulint scheme; /*!< encryption scheme */
+ uint min_key_version; /*!< min key version */
+ uint current_key_version;/*!< current key version */
+ uint keyserver_requests;/*!< no of key requests to key server */
+ uint key_id; /*!< current key_id */
+ bool rotating; /*!< is key rotation ongoing */
+ bool flushing; /*!< is flush at end of rotation ongoing */
+ ulint rotate_next_page_number; /*!< next page if key rotating */
+ ulint rotate_max_page_number; /*!< max page if key rotating */
+};
+
+/** Statistics about encryption key rotation */
+struct fil_crypt_stat_t {
+ ulint pages_read_from_cache;
+ ulint pages_read_from_disk;
+ ulint pages_modified;
+ ulint pages_flushed;
+ ulint estimated_iops;
+};
+
+/*********************************************************************
+Init space crypt */
+UNIV_INTERN
+void
+fil_space_crypt_init();
+
+/*********************************************************************
+Cleanup space crypt */
+UNIV_INTERN
+void
+fil_space_crypt_cleanup();
+
+/**
+Create a fil_space_crypt_t object
+@param[in] encrypt_mode FIL_ENCRYPTION_DEFAULT or
+ FIL_ENCRYPTION_ON or
+ FIL_ENCRYPTION_OFF
+
+@param[in] key_id Encryption key id
+@return crypt object */
+UNIV_INTERN
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+ fil_encryption_t encrypt_mode,
+ uint key_id)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************************
+Merge fil_space_crypt_t object
+@param[in,out]	dst	Destination crypt data
+@param[in] src Source crypt data */
+UNIV_INTERN
+void
+fil_space_merge_crypt_data(
+ fil_space_crypt_t* dst,
+ const fil_space_crypt_t* src);
+
+/** Initialize encryption parameters from a tablespace header page.
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] page first page of the tablespace
+@return crypt data from page 0
+@retval NULL if not present or not valid */
+fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**
+Free a crypt data object
+@param[in,out] crypt_data crypt data to be freed */
+UNIV_INTERN
+void
+fil_space_destroy_crypt_data(
+ fil_space_crypt_t **crypt_data);
+
+/** Amend encryption information from redo log.
+@param[in] space tablespace
+@param[in] data encryption metadata */
+void fil_crypt_parse(fil_space_t* space, const byte* data);
+
+/** Encrypt a buffer.
+@param[in,out] crypt_data Crypt data
+@param[in] space space_id
+@param[in] offset Page offset
+@param[in] src_frame Page to encrypt
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] dst_frame Output buffer
+@param[in] use_full_checksum full crc32 algo is used
+@return encrypted buffer or NULL */
+UNIV_INTERN
+byte*
+fil_encrypt_buf(
+ fil_space_crypt_t* crypt_data,
+ ulint space,
+ ulint offset,
+ const byte* src_frame,
+ ulint zip_size,
+ byte* dst_frame,
+ bool use_full_checksum)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/**
+Encrypt a page.
+
+@param[in] space Tablespace
+@param[in] offset Page offset
+@param[in] src_frame Page to encrypt
+@param[in,out] dst_frame Output buffer
+@return encrypted buffer or NULL */
+byte* fil_space_encrypt(
+ const fil_space_t* space,
+ ulint offset,
+ byte* src_frame,
+ byte* dst_frame)
+ MY_ATTRIBUTE((warn_unused_result));
+
+
+/** Decrypt a page.
+@param[in]	space_id	tablespace id
+@param[in] crypt_data crypt_data
+@param[in] tmp_frame Temporary buffer
+@param[in] physical_size page size
+@param[in] fsp_flags Tablespace flags
+@param[in,out] src_frame Page to decrypt
+@param[out] err DB_SUCCESS or DB_DECRYPTION_FAILED
+@return true if the page was decrypted, false if not. */
+UNIV_INTERN
+bool
+fil_space_decrypt(
+ ulint space_id,
+ fil_space_crypt_t* crypt_data,
+ byte* tmp_frame,
+ ulint physical_size,
+ ulint fsp_flags,
+ byte* src_frame,
+ dberr_t* err);
+
+/******************************************************************
+Decrypt a page
+@param[in] space Tablespace
+@param[in] tmp_frame Temporary buffer used for decrypting
+@param[in,out] src_frame Page to decrypt
+@return decrypted page, or the original page if decryption was
+not needed. */
+UNIV_INTERN
+byte*
+fil_space_decrypt(
+ const fil_space_t* space,
+ byte* tmp_frame,
+ byte* src_frame)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/**
+Calculate post encryption checksum
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] dst_frame Block where checksum is calculated
+@return page checksum */
+uint32_t
+fil_crypt_calculate_checksum(ulint zip_size, const byte* dst_frame)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************
+Adjust thread count for key rotation
+@param[in]	new_cnt	Number of threads to be used */
+UNIV_INTERN
+void
+fil_crypt_set_thread_cnt(
+ uint new_cnt);
+
+/*********************************************************************
+Adjust max key age
+@param[in] val New max key age */
+UNIV_INTERN
+void
+fil_crypt_set_rotate_key_age(
+ uint val);
+
+/*********************************************************************
+Adjust rotation iops
+@param[in]	val	New max rotation iops */
+UNIV_INTERN
+void
+fil_crypt_set_rotation_iops(
+ uint val);
+
+/*********************************************************************
+Adjust encrypt tables
+@param[in] val New setting for innodb-encrypt-tables */
+void fil_crypt_set_encrypt_tables(ulong val);
+
+/*********************************************************************
+Init threads for key rotation */
+UNIV_INTERN
+void
+fil_crypt_threads_init();
+
+/*********************************************************************
+Clean up key rotation threads resources */
+UNIV_INTERN
+void
+fil_crypt_threads_cleanup();
+
+/*********************************************************************
+Wait for crypt threads to stop accessing space
+@param[in] space Tablespace */
+UNIV_INTERN
+void
+fil_space_crypt_close_tablespace(
+ const fil_space_t* space);
+
+/*********************************************************************
+Get crypt status for a space (used by information_schema)
+@param[in] space Tablespace
+@param[out]	status	Crypt status */
+UNIV_INTERN
+void
+fil_space_crypt_get_status(
+ const fil_space_t* space,
+ struct fil_space_crypt_status_t* status);
+
+/*********************************************************************
+Return crypt statistics
+@param[out] stat Crypt statistics */
+UNIV_INTERN
+void
+fil_crypt_total_stat(
+ fil_crypt_stat_t *stat);
+
+#include "fil0crypt.ic"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/**
+Verify that the post-encryption checksum matches the calculated checksum.
+This function should be called only if the tablespace contains crypt_data
+metadata (a strong indication that the tablespace is encrypted).
+The function also verifies that the traditional checksum does not match
+the calculated checksum; if it did, the page could equally well be a
+valid unencrypted page, an encrypted page, or corrupted.
+
+@param[in,out] page page frame (checksum is temporarily modified)
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return true if page is encrypted AND OK, false otherwise */
+bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size)
+ MY_ATTRIBUTE((warn_unused_result));
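+
+/* Minimal sketch of the verification idea described above (illustrative
+only; "stored crypt checksum" is a placeholder for the header field, see
+fil0crypt.cc for the real implementation):
+
+	encrypted_ok = stored crypt checksum
+		       == fil_crypt_calculate_checksum(zip_size, page);
+	looks_plain  = the traditional page checksum also matches;
+	valid        = encrypted_ok && !looks_plain;
+*/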
+
+/** Add the tablespace to the rotation list if
+innodb_encrypt_rotate_key_age is 0 or the encryption plugin does
+not do key version rotation
+@return whether the tablespace should be added to the rotation list */
+bool fil_crypt_must_default_encrypt();
+
+#endif /* fil0crypt_h */
diff --git a/storage/innobase/include/fil0crypt.ic b/storage/innobase/include/fil0crypt.ic
new file mode 100644
index 00000000..cc59b394
--- /dev/null
+++ b/storage/innobase/include/fil0crypt.ic
@@ -0,0 +1,81 @@
+/*****************************************************************************
+
+Copyright (c) 2015, 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0crypt.ic
+The low-level file system encryption support functions
+
+Created 04/01/2015 Jan Lindström
+*******************************************************/
+
+/*******************************************************************//**
+Find out whether the page is encrypted
+@return true if the page is encrypted, false if not */
+UNIV_INLINE
+bool
+fil_page_is_encrypted(
+/*==================*/
+ const byte *buf) /*!< in: page */
+{
+ return(mach_read_from_4(buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) != 0);
+}
+
+/*******************************************************************//**
+Get current encryption mode from crypt_data.
+@return string representation */
+UNIV_INLINE
+const char *
+fil_crypt_get_mode(
+/*===============*/
+ const fil_space_crypt_t* crypt_data)
+{
+ switch (crypt_data->encryption) {
+ case FIL_ENCRYPTION_DEFAULT:
+ return("Default tablespace encryption mode");
+ case FIL_ENCRYPTION_ON:
+ return("Tablespace encrypted");
+ case FIL_ENCRYPTION_OFF:
+ return("Tablespace not encrypted");
+ }
+
+ ut_error;
+ return ("NULL");
+}
+
+/*******************************************************************//**
+Get current encryption type from crypt_data.
+@return string representation */
+UNIV_INLINE
+const char *
+fil_crypt_get_type(
+ const fil_space_crypt_t* crypt_data)
+{
+ ut_ad(crypt_data != NULL);
+	switch (crypt_data->type) {
+	case CRYPT_SCHEME_UNENCRYPTED:
+		return("scheme unencrypted");
+	case CRYPT_SCHEME_1:
+		return("scheme encrypted");
+	default:
+		ut_error;
+	}
+
+ return ("NULL");
+}
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
new file mode 100644
index 00000000..57b10351
--- /dev/null
+++ b/storage/innobase/include/fil0fil.h
@@ -0,0 +1,1799 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0fil.h
+The low-level file system
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fil0fil_h
+#define fil0fil_h
+
+#include "fsp0types.h"
+#include "mach0data.h"
+#include "assume_aligned.h"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "buf0dblwr.h"
+#include "hash0hash.h"
+#include "log0recv.h"
+#include "dict0types.h"
+#include "ilist.h"
+#include <set>
+#include <mutex>
+
+struct unflushed_spaces_tag_t;
+struct rotation_list_tag_t;
+
+// Forward declaration
+extern my_bool srv_use_doublewrite_buf;
+
+/** Possible values of innodb_flush_method */
+enum srv_flush_t
+{
+ /** fsync, the default */
+ SRV_FSYNC= 0,
+ /** open log files in O_DSYNC mode */
+ SRV_O_DSYNC,
+ /** do not call os_file_flush() when writing data files, but do flush
+ after writing to log files */
+ SRV_LITTLESYNC,
+ /** do not flush after writing */
+ SRV_NOSYNC,
+ /** invoke os_file_set_nocache() on data files. This implies using
+ non-buffered IO but still using fsync, the reason for which is that
+ some FS do not flush meta-data when unbuffered IO happens */
+ SRV_O_DIRECT,
+	/** do not use fsync() when using direct IO, i.e., it can be set
+	to avoid the fsync() call that we make when using SRV_O_DIRECT.
+	However, in this case the user/DBA should be sure about the
+	integrity of the meta-data */
+ SRV_O_DIRECT_NO_FSYNC
+#ifdef _WIN32
+	/** Traditional Windows approach to open all files without caching,
+	and do FlushFileBuffers() */
+ ,SRV_ALL_O_DIRECT_FSYNC
+#endif
+};
+
+/** innodb_flush_method */
+extern ulong srv_file_flush_method;
+
+/** The space_id of the first undo tablespace. */
+extern ulint srv_undo_space_id_start;
+/** The number of UNDO tablespaces that are open and ready to use. */
+extern ulint srv_undo_tablespaces_open;
+
+/** Check whether the given space id is an undo tablespace id
+@param[in]	space_id	space id to check
+@return true if it is an undo tablespace, else false */
+inline bool srv_is_undo_tablespace(ulint space_id)
+{
+ return srv_undo_space_id_start > 0 &&
+ space_id >= srv_undo_space_id_start &&
+ space_id < srv_undo_space_id_start + srv_undo_tablespaces_open;
+}
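+
+/* Example: with srv_undo_space_id_start == 1 and
+srv_undo_tablespaces_open == 3, space ids 1, 2 and 3 are undo
+tablespaces, while 0 (the system tablespace) and 4 are not. */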
+
+class page_id_t;
+
+/** Structure containing encryption specification */
+struct fil_space_crypt_t;
+
+/** File types */
+enum fil_type_t {
+ /** temporary tablespace (temporary undo log or tables) */
+ FIL_TYPE_TEMPORARY,
+ /** a tablespace that is being imported (no logging until finished) */
+ FIL_TYPE_IMPORT,
+ /** persistent tablespace (for system, undo log or tables) */
+ FIL_TYPE_TABLESPACE,
+};
+
+struct fil_node_t;
+
+/** Structure to store first and last value of range */
+struct range_t
+{
+ uint32_t first;
+ uint32_t last;
+};
+
+/** Sort the range based on first value of the range */
+struct range_compare
+{
+ bool operator() (const range_t lhs, const range_t rhs) const
+ {
+ return lhs.first < rhs.first;
+ }
+};
+
+using range_set_t= std::set<range_t, range_compare>;
+/** Range to store the set of ranges of integers */
+class range_set
+{
+private:
+ range_set_t ranges;
+
+  range_set_t::iterator find(uint32_t value) const
+  {
+    const auto r_end= ranges.end();
+    if (empty())
+      return r_end;
+    auto r_offset= ranges.lower_bound({value, value});
+    if (r_offset == r_end ||
+        (r_offset->first > value && r_offset != ranges.begin()))
+      /* The containing range, if any, starts before value;
+      check the preceding range (mirrors remove_value()). */
+      r_offset= std::prev(r_offset);
+    if (r_offset->first <= value && r_offset->last >= value)
+      return r_offset;
+    return r_end;
+  }
+public:
+  /** Merge the given range with the preceding range, if adjacent.
+  @param[in]	range		range to be merged
+  @param[in]	prev_range	preceding range to merge into */
+ void merge_range(range_set_t::iterator range,
+ range_set_t::iterator prev_range)
+ {
+ if (range->first != prev_range->last + 1)
+ return;
+
+ /* Merge the current range with previous range */
+ range_t new_range {prev_range->first, range->last};
+ ranges.erase(prev_range);
+ ranges.erase(range);
+ ranges.emplace(new_range);
+ }
+
+ /** Split the range and add two more ranges
+ @param[in] range range to be split
+ @param[in] value Value to be removed from range */
+ void split_range(range_set_t::iterator range, uint32_t value)
+ {
+ range_t split1{range->first, value - 1};
+ range_t split2{value + 1, range->last};
+
+ /* Remove the existing element */
+ ranges.erase(range);
+
+ /* Insert the two elements */
+ ranges.emplace(split1);
+ ranges.emplace(split2);
+ }
+
+  /** Remove the value from within the given range
+  @param[in,out]	range	range to be changed
+  @param[in]	value	value to be removed */
+ void remove_within_range(range_set_t::iterator range, uint32_t value)
+ {
+ range_t new_range{range->first, range->last};
+ if (value == range->first)
+ {
+ if (range->first == range->last)
+ {
+ ranges.erase(range);
+ return;
+ }
+ else
+ new_range.first++;
+ }
+ else if (value == range->last)
+ new_range.last--;
+ else if (range->first < value && range->last > value)
+ return split_range(range, value);
+
+ ranges.erase(range);
+ ranges.emplace(new_range);
+ }
+
+ /** Remove the value from the ranges.
+ @param[in] value Value to be removed. */
+ void remove_value(uint32_t value)
+ {
+ if (empty())
+ return;
+ range_t new_range {value, value};
+ range_set_t::iterator range= ranges.lower_bound(new_range);
+ if (range == ranges.end())
+ return remove_within_range(std::prev(range), value);
+
+ if (range->first > value && range != ranges.begin())
+ /* Iterate the previous ranges to delete */
+ return remove_within_range(std::prev(range), value);
+ return remove_within_range(range, value);
+ }
+ /** Add the value within the existing range
+ @param[in] range range to be modified
+ @param[in] value value to be added */
+ range_set_t::iterator add_within_range(range_set_t::iterator range,
+ uint32_t value)
+ {
+ if (range->first <= value && range->last >= value)
+ return range;
+
+ range_t new_range{range->first, range->last};
+ if (range->last + 1 == value)
+ new_range.last++;
+ else if (range->first - 1 == value)
+ new_range.first--;
+ else return ranges.end();
+ ranges.erase(range);
+ return ranges.emplace(new_range).first;
+ }
+  /** Add the given range to the set of ranges
+  @param[in]	new_range	range to be added */
+ void add_range(range_t new_range)
+ {
+ auto r_offset= ranges.lower_bound(new_range);
+ auto r_begin= ranges.begin();
+ auto r_end= ranges.end();
+ if (!ranges.size())
+ {
+new_range:
+ ranges.emplace(new_range);
+ return;
+ }
+
+ if (r_offset == r_end)
+ {
+ /* last range */
+ if (add_within_range(std::prev(r_offset), new_range.first) == r_end)
+ goto new_range;
+ }
+ else if (r_offset == r_begin)
+ {
+ /* First range */
+ if (add_within_range(r_offset, new_range.first) == r_end)
+ goto new_range;
+ }
+ else if (r_offset->first - 1 == new_range.first)
+ {
+ /* Change starting of the existing range */
+ auto r_value= add_within_range(r_offset, new_range.first);
+ if (r_value != ranges.begin())
+ merge_range(r_value, std::prev(r_value));
+ }
+ else
+ {
+ /* previous range last_value alone */
+ if (add_within_range(std::prev(r_offset), new_range.first) == r_end)
+ goto new_range;
+ }
+ }
+
+  /** Add the given value to the set of ranges
+  @param[in]	value	value to be added */
+ void add_value(uint32_t value)
+ {
+ range_t new_range{value, value};
+ add_range(new_range);
+ }
+
+ bool remove_if_exists(uint32_t value)
+ {
+ auto r_offset= find(value);
+ if (r_offset != ranges.end())
+ {
+ remove_within_range(r_offset, value);
+ return true;
+ }
+ return false;
+ }
+
+ bool contains(uint32_t value) const
+ {
+ return find(value) != ranges.end();
+ }
+
+ ulint size() { return ranges.size(); }
+ void clear() { ranges.clear(); }
+ bool empty() const { return ranges.empty(); }
+ typename range_set_t::iterator begin() { return ranges.begin(); }
+ typename range_set_t::iterator end() { return ranges.end(); }
+};
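+
+/* Usage sketch for range_set (illustrative only; the results were
+traced by hand against the member functions above):
+
+	range_set s;
+	s.add_value(10);	// {[10,10]}
+	s.add_value(11);	// merged into {[10,11]}
+	s.add_value(13);	// {[10,11], [13,13]}
+	s.add_value(12);	// bridges the gap: {[10,13]}
+	ut_ad(s.contains(12));
+	s.remove_value(11);	// split into {[10,10], [12,13]}
+	ut_ad(!s.contains(11));
+*/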
+#endif
+
+/** Tablespace or log data space */
+#ifndef UNIV_INNOCHECKSUM
+struct fil_io_t
+{
+ /** error code */
+ dberr_t err;
+ /** file; node->space->release() must follow IORequestRead call */
+ fil_node_t *node;
+};
+
+/** Tablespace encryption mode */
+enum fil_encryption_t
+{
+ /** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */
+ FIL_ENCRYPTION_DEFAULT,
+ /** Encrypted */
+ FIL_ENCRYPTION_ON,
+ /** Not encrypted */
+ FIL_ENCRYPTION_OFF
+};
+
+struct fil_space_t final :
+ ilist_node<unflushed_spaces_tag_t>, ilist_node<rotation_list_tag_t>
+#else
+struct fil_space_t final
+#endif
+{
+#ifndef UNIV_INNOCHECKSUM
+ friend fil_node_t;
+ ulint id; /*!< space id */
+ hash_node_t hash; /*!< hash chain node */
+ char* name; /*!< Tablespace name */
+ lsn_t max_lsn;
+ /*!< LSN of the most recent
+ fil_names_write_if_was_clean().
+ Reset to 0 by fil_names_clear().
+ Protected by log_sys.mutex.
+ If and only if this is nonzero, the
+ tablespace will be in named_spaces. */
+ /** whether undo tablespace truncation is in progress */
+ bool is_being_truncated;
+ fil_type_t purpose;/*!< purpose */
+ UT_LIST_BASE_NODE_T(fil_node_t) chain;
+ /*!< base node for the file chain */
+ uint32_t size; /*!< tablespace file size in pages;
+ 0 if not known yet */
+ uint32_t size_in_header;
+ /* FSP_SIZE in the tablespace header;
+ 0 if not known yet */
+ uint32_t free_len;
+ /*!< length of the FSP_FREE list */
+ uint32_t free_limit;
+ /*!< contents of FSP_FREE_LIMIT */
+ uint32_t recv_size;
+ /*!< recovered tablespace size in pages;
+ 0 if no size change was read from the redo log,
+ or if the size change was implemented */
+ uint32_t n_reserved_extents;
+ /*!< number of reserved free extents for
+ ongoing operations like B-tree page split */
+private:
+ /** the committed size of the tablespace in pages */
+ Atomic_relaxed<uint32_t> committed_size;
+ /** Number of pending operations on the file.
+ The tablespace cannot be freed while (n_pending & PENDING) != 0. */
+ std::atomic<uint32_t> n_pending;
+ /** Flag in n_pending that indicates that the tablespace is being
+ deleted, and no further operations should be performed */
+ static constexpr uint32_t STOPPING= 1U << 31;
+ /** Flag in n_pending that indicates that the tablespace is a candidate
+ for being closed, and fil_node_t::is_open() can only be trusted after
+ acquiring fil_system.mutex and resetting the flag */
+ static constexpr uint32_t CLOSING= 1U << 30;
+ /** Flag in n_pending that indicates that the tablespace needs fsync().
+ This must be the least significant flag bit; @see release_flush() */
+ static constexpr uint32_t NEEDS_FSYNC= 1U << 29;
+ /** The reference count */
+ static constexpr uint32_t PENDING= ~(STOPPING | CLOSING | NEEDS_FSYNC);
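+
+  /* Layout of n_pending, derived from the constants above:
+  bit 31	STOPPING	the tablespace is being dropped
+  bit 30	CLOSING		the file handle is a candidate for closing
+  bit 29	NEEDS_FSYNC	a flush of the file is pending
+  bits 0..28	PENDING		reference count of pending operations */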
+public:
+ rw_lock_t latch; /*!< latch protecting the file space storage
+ allocation */
+ UT_LIST_NODE_T(fil_space_t) named_spaces;
+ /*!< list of spaces for which FILE_MODIFY
+ records have been issued */
+ UT_LIST_NODE_T(fil_space_t) space_list;
+ /*!< list of all spaces */
+
+ /** MariaDB encryption data */
+ fil_space_crypt_t* crypt_data;
+
+	/** Whether this tablespace is in the list of unflushed tablespaces. */
+ bool is_in_unflushed_spaces;
+
+	/** Whether this tablespace is in the default-encryption
+	(key rotation) list. */
+ bool is_in_default_encrypt;
+
+ /** True if the device this filespace is on supports atomic writes */
+ bool atomic_write_supported;
+
+ /** True if file system storing this tablespace supports
+ punch hole */
+ bool punch_hole;
+
+ /** mutex to protect freed ranges */
+ std::mutex freed_range_mutex;
+
+	/** Freed page ranges. These can be used to write zeroes or
+	punch holes in the file. Protected by freed_range_mutex */
+ range_set freed_ranges;
+
+	/** LSN of the most recent page free. Protected by freed_range_mutex */
+ lsn_t last_freed_lsn;
+
+ ulint magic_n;/*!< FIL_SPACE_MAGIC_N */
+
+ /** @return whether doublewrite buffering is needed */
+ bool use_doublewrite() const
+ {
+ return !atomic_write_supported && srv_use_doublewrite_buf &&
+ buf_dblwr.is_initialised();
+ }
+
+ /** Append a file to the chain of files of a space.
+ @param[in] name file name of a file that is not open
+ @param[in] handle file handle, or OS_FILE_CLOSED
+ @param[in] size file size in entire database pages
+ @param[in] is_raw whether this is a raw device
+ @param[in] atomic_write true if atomic write could be enabled
+ @param[in] max_pages maximum number of pages in file,
+ or UINT32_MAX for unlimited
+ @return file object */
+ fil_node_t* add(const char* name, pfs_os_file_t handle,
+ uint32_t size, bool is_raw, bool atomic_write,
+ uint32_t max_pages = UINT32_MAX);
+#ifdef UNIV_DEBUG
+ /** Assert that the mini-transaction is compatible with
+ updating an allocation bitmap page.
+ @param[in] mtr mini-transaction */
+ void modify_check(const mtr_t& mtr) const;
+#endif /* UNIV_DEBUG */
+
+ /** Try to reserve free extents.
+ @param[in] n_free_now current number of free extents
+ @param[in] n_to_reserve number of extents to reserve
+ @return whether the reservation succeeded */
+ bool reserve_free_extents(uint32_t n_free_now, uint32_t n_to_reserve)
+ {
+ ut_ad(rw_lock_own(&latch, RW_LOCK_X));
+ if (n_reserved_extents + n_to_reserve > n_free_now) {
+ return false;
+ }
+
+ n_reserved_extents += n_to_reserve;
+ return true;
+ }
+
+ /** Release the reserved free extents.
+ @param[in] n_reserved number of reserved extents */
+ void release_free_extents(uint32_t n_reserved)
+ {
+ if (!n_reserved) return;
+ ut_ad(rw_lock_own(&latch, RW_LOCK_X));
+ ut_a(n_reserved_extents >= n_reserved);
+ n_reserved_extents -= n_reserved;
+ }
+
+ /** Rename a file.
+ @param[in] name table name after renaming
+ @param[in] path tablespace file name after renaming
+ @param[in] log whether to write redo log
+ @param[in] replace whether to ignore the existence of path
+ @return error code
+ @retval DB_SUCCESS on success */
+ dberr_t rename(const char* name, const char* path, bool log,
+ bool replace = false);
+
+ /** Note that the tablespace has been imported.
+ Initially, purpose=FIL_TYPE_IMPORT so that no redo log is
+ written while the space ID is being updated in each page. */
+ inline void set_imported();
+
+ /** @return whether the storage device is rotational (HDD, not SSD) */
+ inline bool is_rotational() const;
+
+ /** Open each file. Never invoked on .ibd files.
+ @param create_new_db whether to skip the call to fil_node_t::read_page0()
+ @return whether all files were opened */
+ bool open(bool create_new_db);
+ /** Close each file. Only invoked on fil_system.temp_space. */
+ void close();
+
+ /** Note that operations on the tablespace must stop or can resume */
+ inline void set_stopping(bool stopping);
+
+private:
+ MY_ATTRIBUTE((warn_unused_result))
+ /** Try to acquire a tablespace reference.
+ @return the old reference count (if STOPPING is set, it was not acquired) */
+ uint32_t acquire_low()
+ {
+ uint32_t n= 0;
+ while (!n_pending.compare_exchange_strong(n, n + 1,
+ std::memory_order_acquire,
+ std::memory_order_relaxed) &&
+ !(n & STOPPING));
+ return n;
+ }
+public:
+ MY_ATTRIBUTE((warn_unused_result))
+ /** Acquire a tablespace reference.
+ @return whether a tablespace reference was successfully acquired */
+ inline bool acquire_if_not_stopped();
+
+ MY_ATTRIBUTE((warn_unused_result))
+ /** Acquire a tablespace reference for I/O.
+ @return whether the file is usable */
+ bool acquire()
+ {
+ uint32_t n= acquire_low();
+ if (UNIV_LIKELY(!(n & (STOPPING | CLOSING))))
+ return true;
+ return UNIV_LIKELY(!(n & STOPPING)) && prepare();
+ }
+
+ /** Acquire another tablespace reference for I/O. */
+ inline void reacquire();
+
+ /** Release a tablespace reference.
+ @return whether this was the last reference */
+ bool release()
+ {
+ uint32_t n= n_pending.fetch_sub(1, std::memory_order_release);
+ ut_ad(n & PENDING);
+ return (n & PENDING) == 1;
+ }
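+
+  /* Typical reference-counting pattern (illustrative only):
+
+	if (fil_space_t *space= fil_space_t::get(space_id))
+	{
+	  // reference held: the tablespace cannot be freed meanwhile
+	  ... space->io(...) ...
+	  space->release();
+	}
+  */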
+
+ /** Clear the NEEDS_FSYNC flag */
+ void clear_flush()
+ { n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release); }
+
+private:
+ /** @return pending operations (and flags) */
+ uint32_t pending()const { return n_pending.load(std::memory_order_acquire); }
+public:
+ /** @return whether close() of the file handle has been requested */
+ bool is_closing() const { return pending() & CLOSING; }
+ /** @return whether the tablespace is going to be dropped */
+ bool is_stopping() const { return pending() & STOPPING; }
+  /** @return whether close() has been requested and
+  no operations are pending */
+ bool is_ready_to_close() const
+ { return (pending() & (PENDING | CLOSING)) == CLOSING; }
+ /** @return whether fsync() or similar is needed */
+ bool needs_flush() const { return pending() & NEEDS_FSYNC; }
+ /** @return whether fsync() or similar is needed, and the tablespace is
+ not being dropped */
+ bool needs_flush_not_stopping() const
+ { return (pending() & (NEEDS_FSYNC | STOPPING)) == NEEDS_FSYNC; }
+
+ uint32_t referenced() const { return pending() & PENDING; }
+private:
+ MY_ATTRIBUTE((warn_unused_result))
+ /** Prepare to close the file handle.
+ @return number of pending operations, possibly with NEEDS_FSYNC flag */
+ uint32_t set_closing()
+ {
+ return n_pending.fetch_or(CLOSING, std::memory_order_acquire) &
+ (PENDING | NEEDS_FSYNC);
+ }
+
+public:
+ /** Try to close a file to adhere to the innodb_open_files limit.
+ @param print_info whether to diagnose why a file cannot be closed
+ @return whether a file was closed */
+ static bool try_to_close(bool print_info);
+
+ /** Close all tablespace files at shutdown */
+ static void close_all();
+
+ /** @return last_freed_lsn */
+ lsn_t get_last_freed_lsn() { return last_freed_lsn; }
+ /** Update last_freed_lsn */
+ void update_last_freed_lsn(lsn_t lsn)
+ {
+ std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+ last_freed_lsn= lsn;
+ }
+
+ /** Note that the file will need fsync().
+ @return whether this needs to be added to fil_system.unflushed_spaces */
+ bool set_needs_flush()
+ {
+ uint32_t n= 1;
+ while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC,
+ std::memory_order_acquire,
+ std::memory_order_relaxed))
+ {
+ ut_ad(n & PENDING);
+ if (n & (NEEDS_FSYNC | STOPPING))
+ return false;
+ }
+
+ return true;
+ }
+
+ /** Clear all freed ranges for undo tablespace when InnoDB
+ encounters TRIM redo log record */
+ void clear_freed_ranges()
+ {
+ std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+ freed_ranges.clear();
+ }
+#endif /* !UNIV_INNOCHECKSUM */
+  /** FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags;
+  see fsp0types.h for more information about the flags. */
+ ulint flags;
+
+ /** Determine if full_crc32 is used for a data file
+ @param[in] flags tablespace flags (FSP_SPACE_FLAGS)
+ @return whether the full_crc32 algorithm is active */
+ static bool full_crc32(ulint flags) {
+ return flags & FSP_FLAGS_FCRC32_MASK_MARKER;
+ }
+ /** @return whether innodb_checksum_algorithm=full_crc32 is active */
+ bool full_crc32() const { return full_crc32(flags); }
+ /** Determine the logical page size.
+ @param flags tablespace flags (FSP_FLAGS)
+ @return the logical page size
+ @retval 0 if the flags are invalid */
+ static unsigned logical_size(ulint flags) {
+
+ ulint page_ssize = 0;
+
+ if (full_crc32(flags)) {
+ page_ssize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags);
+ } else {
+ page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags);
+ }
+
+ switch (page_ssize) {
+ case 3: return 4096;
+ case 4: return 8192;
+ case 5:
+ { ut_ad(full_crc32(flags)); return 16384; }
+ case 0:
+ { ut_ad(!full_crc32(flags)); return 16384; }
+ case 6: return 32768;
+ case 7: return 65536;
+ default: return 0;
+ }
+ }
+ /** Determine the ROW_FORMAT=COMPRESSED page size.
+ @param flags tablespace flags (FSP_FLAGS)
+ @return the ROW_FORMAT=COMPRESSED page size
+ @retval 0 if ROW_FORMAT=COMPRESSED is not used */
+ static unsigned zip_size(ulint flags) {
+
+ if (full_crc32(flags)) {
+ return 0;
+ }
+
+ ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags);
+ return zip_ssize
+ ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize : 0;
+ }
+ /** Determine the physical page size.
+ @param flags tablespace flags (FSP_FLAGS)
+ @return the physical page size */
+ static unsigned physical_size(ulint flags) {
+
+ if (full_crc32(flags)) {
+ return logical_size(flags);
+ }
+
+ ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags);
+ return zip_ssize
+ ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize
+ : unsigned(srv_page_size);
+ }
+ /** @return the ROW_FORMAT=COMPRESSED page size
+ @retval 0 if ROW_FORMAT=COMPRESSED is not used */
+ unsigned zip_size() const { return zip_size(flags); }
+ /** @return the physical page size */
+ unsigned physical_size() const { return physical_size(flags); }
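+	/* Worked example, derived from the functions above (assuming
+	UNIV_ZIP_SIZE_MIN == 1024): for a ROW_FORMAT=COMPRESSED table
+	with KEY_BLOCK_SIZE=8 (zip_ssize=4) and innodb_page_size=16k,
+	zip_size() returns 512 << 4 = 8192, physical_size() returns
+	8192, and logical_size() returns 16384. */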
+	/** Check whether page compression is enabled for the tablespace.
+	@param[in]	flags	tablespace flags */
+ static bool is_compressed(ulint flags) {
+
+ if (full_crc32(flags)) {
+ ulint algo = FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(
+ flags);
+ DBUG_ASSERT(algo <= PAGE_ALGORITHM_LAST);
+ return algo > 0;
+ }
+
+ return FSP_FLAGS_HAS_PAGE_COMPRESSION(flags);
+ }
+	/** @return whether page compression is enabled for the tablespace. */
+ bool is_compressed() const { return is_compressed(flags); }
+
+ /** Get the compression algorithm for full crc32 format.
+ @param[in] flags tablespace flags
+ @return algorithm type of tablespace */
+ static ulint get_compression_algo(ulint flags)
+ {
+ return full_crc32(flags)
+ ? FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags)
+ : 0;
+ }
+ /** @return the page_compressed algorithm
+ @retval 0 if not page_compressed */
+ ulint get_compression_algo() const {
+ return fil_space_t::get_compression_algo(flags);
+ }
+ /** Determine if the page_compressed page contains an extra byte
+ for exact compressed stream length
+ @param[in] flags tablespace flags
+ @return whether the extra byte is needed */
+ static bool full_crc32_page_compressed_len(ulint flags)
+ {
+ DBUG_ASSERT(full_crc32(flags));
+ switch (get_compression_algo(flags)) {
+ case PAGE_LZ4_ALGORITHM:
+ case PAGE_LZO_ALGORITHM:
+ case PAGE_SNAPPY_ALGORITHM:
+ return true;
+ }
+ return false;
+ }
+
+	/** Whether full_crc32 flags match the expected
+	(possibly non-full_crc32) flags.
+ @param[in] flags flags present
+ @param[in] expected expected flags
+ @return true if it is equivalent */
+ static bool is_flags_full_crc32_equal(ulint flags, ulint expected)
+ {
+ ut_ad(full_crc32(flags));
+ ulint fcrc32_psize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags);
+
+ if (full_crc32(expected)) {
+ /* The data file may have been created with a
+ different innodb_compression_algorithm. But
+ we only support one innodb_page_size for all files. */
+ return fcrc32_psize
+ == FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(expected);
+ }
+
+ ulint non_fcrc32_psize = FSP_FLAGS_GET_PAGE_SSIZE(expected);
+
+ if (!non_fcrc32_psize) {
+ if (fcrc32_psize != 5) {
+ return false;
+ }
+ } else if (fcrc32_psize != non_fcrc32_psize) {
+ return false;
+ }
+
+ return true;
+ }
+ /** Whether old tablespace flags match full_crc32 flags.
+ @param[in] flags flags present
+ @param[in] expected expected flags
+ @return true if it is equivalent */
+ static bool is_flags_non_full_crc32_equal(ulint flags, ulint expected)
+ {
+ ut_ad(!full_crc32(flags));
+
+ if (!full_crc32(expected)) {
+ return false;
+ }
+
+ ulint non_fcrc32_psize = FSP_FLAGS_GET_PAGE_SSIZE(flags);
+ ulint fcrc32_psize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(
+ expected);
+
+ if (!non_fcrc32_psize) {
+ if (fcrc32_psize != 5) {
+ return false;
+ }
+ } else if (fcrc32_psize != non_fcrc32_psize) {
+ return false;
+ }
+
+ return true;
+ }
+ /** Whether both fsp flags are equivalent */
+ static bool is_flags_equal(ulint flags, ulint expected)
+ {
+ if (!((flags ^ expected) & ~(1U << FSP_FLAGS_POS_RESERVED))) {
+ return true;
+ }
+
+ return full_crc32(flags)
+ ? is_flags_full_crc32_equal(flags, expected)
+ : is_flags_non_full_crc32_equal(flags, expected);
+ }
+ /** Validate the tablespace flags for full crc32 format.
+ @param[in] flags the content of FSP_SPACE_FLAGS
+ @return whether the flags are correct in full crc32 format */
+ static bool is_fcrc32_valid_flags(ulint flags)
+ {
+ ut_ad(flags & FSP_FLAGS_FCRC32_MASK_MARKER);
+ const ulint page_ssize = physical_size(flags);
+ if (page_ssize < 3 || page_ssize & 8) {
+ return false;
+ }
+
+ flags >>= FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
+
+ return flags <= PAGE_ALGORITHM_LAST;
+ }
+ /** Validate the tablespace flags.
+ @param[in] flags content of FSP_SPACE_FLAGS
+ @param[in] is_ibd whether this is an .ibd file
+ (not system tablespace)
+ @return whether the flags are correct. */
+ static bool is_valid_flags(ulint flags, bool is_ibd)
+ {
+ DBUG_EXECUTE_IF("fsp_flags_is_valid_failure",
+ return false;);
+
+ if (full_crc32(flags)) {
+ return is_fcrc32_valid_flags(flags);
+ }
+
+ if (flags == 0) {
+ return true;
+ }
+
+ if (flags & ~FSP_FLAGS_MASK) {
+ return false;
+ }
+
+ if ((flags & (FSP_FLAGS_MASK_POST_ANTELOPE
+ | FSP_FLAGS_MASK_ATOMIC_BLOBS))
+ == FSP_FLAGS_MASK_ATOMIC_BLOBS) {
+			/* If the "atomic blobs" flag (indicating
+			ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED)
+			is set, then the "post Antelope"
+			(ROW_FORMAT!=REDUNDANT) flag must also be set. */
+ return false;
+ }
+
+ /* Bits 10..14 should be 0b0000d where d is the DATA_DIR flag
+ of MySQL 5.6 and MariaDB 10.0, which we ignore.
+ In the buggy FSP_SPACE_FLAGS written by MariaDB 10.1.0 to 10.1.20,
+ bits 10..14 would be nonzero 0bsssaa where sss is
+ nonzero PAGE_SSIZE (3, 4, 6, or 7)
+ and aa is ATOMIC_WRITES (not 0b11). */
+ if (FSP_FLAGS_GET_RESERVED(flags) & ~1U) {
+ return false;
+ }
+
+ const ulint ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags);
+ if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8) {
+ /* the page_size is not between 4k and 64k;
+ 16k should be encoded as 0, not 5 */
+ return false;
+ }
+
+ const ulint zssize = FSP_FLAGS_GET_ZIP_SSIZE(flags);
+ if (zssize == 0) {
+ /* not ROW_FORMAT=COMPRESSED */
+ } else if (zssize > (ssize ? ssize : 5)) {
+ /* Invalid KEY_BLOCK_SIZE */
+ return false;
+ } else if (~flags & (FSP_FLAGS_MASK_POST_ANTELOPE
+ | FSP_FLAGS_MASK_ATOMIC_BLOBS)) {
+ /* both these flags should be set for
+ ROW_FORMAT=COMPRESSED */
+ return false;
+ }
+
+ /* The flags do look valid. But, avoid misinterpreting
+ buggy MariaDB 10.1 format flags for
+ PAGE_COMPRESSED=1 PAGE_COMPRESSION_LEVEL={0,2,3}
+ as valid-looking PAGE_SSIZE if this is known to be
+ an .ibd file and we are using the default innodb_page_size=16k. */
+ return(ssize == 0 || !is_ibd
+ || srv_page_size != UNIV_PAGE_SIZE_ORIG);
+ }
+
+#ifndef UNIV_INNOCHECKSUM
+ MY_ATTRIBUTE((warn_unused_result))
+ /** Create a tablespace in fil_system.
+ @param name tablespace name
+ @param id tablespace identifier
+ @param flags tablespace flags
+ @param purpose tablespace purpose
+ @param crypt_data encryption information
+ @param mode encryption mode
+ @return pointer to created tablespace, to be filled in with add()
+ @retval nullptr on failure (such as when the same tablespace exists) */
+ static fil_space_t *create(const char *name, ulint id, ulint flags,
+ fil_type_t purpose, fil_space_crypt_t *crypt_data,
+ fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT);
+
+ MY_ATTRIBUTE((warn_unused_result))
+ /** Acquire a tablespace reference.
+ @param id tablespace identifier
+ @return tablespace
+ @retval nullptr if the tablespace is missing or inaccessible */
+ static fil_space_t *get(ulint id);
+
+  /** Add or remove a freed page in the freed ranges.
+  @param[in]	offset	page number
+  @param[in]	add	true to mark the page freed, false to unmark it */
+ void free_page(uint32_t offset, bool add=true)
+ {
+ std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+ if (add)
+ return freed_ranges.add_value(offset);
+
+ if (freed_ranges.empty())
+ return;
+
+ return freed_ranges.remove_value(offset);
+ }
+
+  /** Replace the freed page ranges with the given set */
+ void add_free_ranges(range_set ranges)
+ {
+ std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+ freed_ranges= std::move(ranges);
+ }
+
+  /** Add a range of freed pages */
+ void add_free_range(const range_t range)
+ {
+ std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+ freed_ranges.add_range(range);
+ }
+
+ /** Set the tablespace size in pages */
+ void set_sizes(uint32_t s)
+ {
+ ut_ad(id ? !size : (size >= s));
+ size= s; committed_size= s;
+ }
+
+ /** Update committed_size in mtr_t::commit() */
+ void set_committed_size()
+ {
+ ut_ad(rw_lock_own(&latch, RW_LOCK_X));
+ committed_size= size;
+ }
+
+ /** @return the last persisted page number */
+ uint32_t last_page_number() const { return committed_size - 1; }
+
+ /** @return the size in pages (0 if unreadable) */
+ inline uint32_t get_size();
+
+ /** Read or write data.
+ @param type I/O context
+ @param offset offset in bytes
+ @param len number of bytes
+ @param buf the data to be read or written
+ @param bpage buffer block (for type.is_async() completion callback)
+ @return status and file descriptor */
+ fil_io_t io(const IORequest &type, os_offset_t offset, size_t len,
+ void *buf, buf_page_t *bpage= nullptr);
+ /** Flush pending writes from the file system cache to the file. */
+ template<bool have_reference> inline void flush();
+ /** Flush pending writes from the file system cache to the file. */
+ void flush_low();
+
+ /** Read the first page of a data file.
+ @return whether the page was found valid */
+ bool read_page0();
+
+ /** Determine the next tablespace for encryption key rotation.
+ @param space current tablespace (nullptr to start from the beginning)
+ @param recheck whether the removal condition needs to be rechecked after
+ encryption parameters were changed
+ @param encrypt expected state of innodb_encrypt_tables
+ @return the next tablespace
+ @retval nullptr upon reaching the end of the iteration */
+ static inline fil_space_t *next(fil_space_t *space, bool recheck,
+ bool encrypt);
+
+private:
+ /** @return whether the file is usable for io() */
+ ATTRIBUTE_COLD bool prepare(bool have_mutex= false);
+#endif /*!UNIV_INNOCHECKSUM */
+};
+
+#ifndef UNIV_INNOCHECKSUM
+/** Value of fil_space_t::magic_n */
+#define FIL_SPACE_MAGIC_N 89472
+
+/** File node of a tablespace or the log data space */
+struct fil_node_t final
+{
+ /** tablespace containing this file */
+ fil_space_t* space;
+ /** file name; protected by fil_system.mutex and log_sys.mutex. */
+ char* name;
+ /** file handle (valid if is_open) */
+ pfs_os_file_t handle;
+ /** whether the file actually is a raw device or disk partition */
+ bool is_raw_disk;
+ /** whether the file is on non-rotational media (SSD) */
+ bool on_ssd;
+ /** size of the file in database pages (0 if not known yet);
+ the possible last incomplete megabyte may be ignored
+ if space->id == 0 */
+ uint32_t size;
+ /** initial size of the file in database pages;
+ FIL_IBD_FILE_INITIAL_SIZE by default */
+ uint32_t init_size;
+ /** maximum size of the file in database pages (0 if unlimited) */
+ uint32_t max_size;
+ /** whether the file is currently being extended */
+ Atomic_relaxed<bool> being_extended;
+ /** link to other files in this tablespace */
+ UT_LIST_NODE_T(fil_node_t) chain;
+
+ /** whether this file could use atomic write (data file) */
+ bool atomic_write;
+
+ /** Filesystem block size */
+ ulint block_size;
+
+ /** FIL_NODE_MAGIC_N */
+ ulint magic_n;
+
+ /** @return whether this file is open */
+ bool is_open() const
+ {
+ return(handle != OS_FILE_CLOSED);
+ }
+
+ /** Read the first page of a data file.
+ @return whether the page was found valid */
+ bool read_page0();
+
+ /** Determine some file metadata when creating or reading the file.
+ @param file the file that is being created, or OS_FILE_CLOSED */
+ void find_metadata(os_file_t file = OS_FILE_CLOSED
+#ifndef _WIN32
+ , struct stat* statbuf = NULL
+#endif
+ );
+
+ /** Close the file handle. */
+ void close();
+ /** Same as close() but returns file handle instead of closing it. */
+ pfs_os_file_t detach() MY_ATTRIBUTE((warn_unused_result));
+ /** Prepare to free a file from fil_system.
+ @param detach_handle whether to detach instead of closing a handle
+ @return detached handle or OS_FILE_CLOSED */
+ inline pfs_os_file_t close_to_free(bool detach_handle= false);
+
+ /** Update the data structures on write completion */
+ inline void complete_write();
+
+private:
+	/** Performs the steps common to close() and detach() */
+ void prepare_to_close_or_detach();
+};
+
+/** Value of fil_node_t::magic_n */
+#define FIL_NODE_MAGIC_N 89389
+
+inline void fil_space_t::set_imported()
+{
+ ut_ad(purpose == FIL_TYPE_IMPORT);
+ purpose= FIL_TYPE_TABLESPACE;
+ UT_LIST_GET_FIRST(chain)->find_metadata();
+}
+
+inline bool fil_space_t::is_rotational() const
+{
+ for (const fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ if (!node->on_ssd)
+ return true;
+ return false;
+}
+
+/** Common InnoDB file extensions */
+enum ib_extention {
+ NO_EXT = 0,
+ IBD = 1,
+ ISL = 2,
+ CFG = 3
+};
+extern const char* dot_ext[];
+#define DOT_IBD dot_ext[IBD]
+#define DOT_ISL dot_ext[ISL]
+#define DOT_CFG dot_ext[CFG]
+
+/** When mysqld is run, the default directory "." is the mysqld datadir,
+but in the MySQL Embedded Server Library and mysqlbackup it is not the default
+directory, and we must set the base file path explicitly */
+extern const char* fil_path_to_mysql_datadir;
+#else
+# include "univ.i"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Initial size of a single-table tablespace in pages */
+#define FIL_IBD_FILE_INITIAL_SIZE 4U
+
+/** 'null' (undefined) page offset in the context of file spaces */
+#define FIL_NULL ULINT32_UNDEFINED
+
+
+#define FIL_ADDR_PAGE 0U /* first in address is the page offset */
+#define FIL_ADDR_BYTE 4U /* then comes 2-byte byte offset within page*/
+#define FIL_ADDR_SIZE 6U /* address size is 6 bytes */
+
+/** File space address */
+struct fil_addr_t {
+ /** page number within a tablespace */
+ uint32_t page;
+ /** byte offset within the page */
+ uint16_t boffset;
+};
+
+/** The byte offsets on a file page for various variables @{ */
+#define FIL_PAGE_SPACE_OR_CHKSUM 0 /*!< in < MySQL-4.0.14 space id the
+ page belongs to (== 0) but in later
+ versions the 'new' checksum of the
+ page */
+#define FIL_PAGE_OFFSET 4U /*!< page offset inside space */
+#define FIL_PAGE_PREV 8U /*!< if there is a 'natural'
+ predecessor of the page, its
+ offset. Otherwise FIL_NULL.
+ This field is not set on BLOB
+ pages, which are stored as a
+ singly-linked list. See also
+ FIL_PAGE_NEXT. */
+#define FIL_PAGE_NEXT 12U /*!< if there is a 'natural' successor
+ of the page, its offset.
+ Otherwise FIL_NULL.
+ B-tree index pages
+ (FIL_PAGE_TYPE contains FIL_PAGE_INDEX)
+ on the same PAGE_LEVEL are maintained
+ as a doubly linked list via
+ FIL_PAGE_PREV and FIL_PAGE_NEXT
+ in the collation order of the
+ smallest user record on each page. */
+#define FIL_PAGE_LSN 16U /*!< lsn of the end of the newest
+ modification log record to the page */
+#define FIL_PAGE_TYPE 24U /*!< file page type: FIL_PAGE_INDEX,...,
+ 2 bytes.
+
+ The contents of this field can only
+ be trusted in the following case:
+ if the page is an uncompressed
+ B-tree index page, then it is
+ guaranteed that the value is
+ FIL_PAGE_INDEX.
+ The opposite does not hold.
+
+ In tablespaces created by
+ MySQL/InnoDB 5.1.7 or later, the
+ contents of this field is valid
+ for all uncompressed pages. */
+
+/** For the first page in a system tablespace data file (ibdata*, not *.ibd):
+the file has been flushed to disk at least up to this LSN.
+For other pages: 32-bit key version used to encrypt the page + 32-bit checksum,
+or 64 bits of zero if no encryption */
+#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U
+
+/** This overloads FIL_PAGE_FILE_FLUSH_LSN for RTREE Split Sequence Number */
+#define FIL_RTREE_SPLIT_SEQ_NUM FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+
+/** Start of the page_compressed content */
+#define FIL_PAGE_COMP_ALGO FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+
+/** starting from 4.1.x this contains the space id of the page */
+#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34U
+
+#define FIL_PAGE_SPACE_ID FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
+
+#define FIL_PAGE_DATA 38U /*!< start of the data on the page */
+
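+/* Illustrative sketch: reading the fixed header fields of a page buffer
+with the big-endian mach_read_*() accessors; everything before
+FIL_PAGE_DATA (38) belongs to this header:
+
+  const uint32_t space_id = mach_read_from_4(page + FIL_PAGE_SPACE_ID);
+  const uint32_t page_no  = mach_read_from_4(page + FIL_PAGE_OFFSET);
+  const uint16_t type     = mach_read_from_2(page + FIL_PAGE_TYPE);
+  const lsn_t    lsn      = mach_read_from_8(page + FIL_PAGE_LSN);
+*/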
+/** 32-bit key version used to encrypt the page in full_crc32 format.
+For non-encrypted page, it contains 0. */
+#define FIL_PAGE_FCRC32_KEY_VERSION 0
+
+/** page_compressed without innodb_checksum_algorithm=full_crc32 @{ */
+/** Number of bytes used to store actual payload data size on
+page_compressed pages when not using full_crc32. */
+#define FIL_PAGE_COMP_SIZE 0
+
+/** Number of bytes for FIL_PAGE_COMP_SIZE */
+#define FIL_PAGE_COMP_METADATA_LEN 2
+
+/** Number of bytes used to store actual compression method
+for encrypted tables when not using full_crc32. */
+#define FIL_PAGE_ENCRYPT_COMP_ALGO 2
+
+/** Extra header size for encrypted page_compressed pages when
+not using full_crc32 */
+#define FIL_PAGE_ENCRYPT_COMP_METADATA_LEN 4
+/* @} */
+
+/** File page trailer @{ */
+#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used
+ to store the page checksum, the
+ last 4 bytes should be identical
+ to the last 4 bytes of FIL_PAGE_LSN */
+#define FIL_PAGE_DATA_END 8 /*!< size of the page trailer */
+
+/** Store the last 4 bytes of FIL_PAGE_LSN */
+#define FIL_PAGE_FCRC32_END_LSN 8
+
+/** Store crc32 checksum at the end of the page */
+#define FIL_PAGE_FCRC32_CHECKSUM 4
+/* @} */
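+/* Illustrative consistency check for the pre-full_crc32 trailer,
+assuming `page` is a buffer of srv_page_size bytes: the last 4 bytes of
+the trailer must equal the low 32 bits of FIL_PAGE_LSN in the header.
+
+  const byte* trailer = page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM;
+  ut_ad(mach_read_from_4(page + FIL_PAGE_LSN + 4)
+        == mach_read_from_4(trailer + 4));
+*/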
+
+/** File page types (values of FIL_PAGE_TYPE) @{ */
+/** page_compressed, encrypted=YES (not used for full_crc32) */
+constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED= 37401;
+/** page_compressed (not used for full_crc32) */
+constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED= 34354;
+/** B-tree index page */
+constexpr uint16_t FIL_PAGE_INDEX= 17855;
+/** R-tree index page (SPATIAL INDEX) */
+constexpr uint16_t FIL_PAGE_RTREE= 17854;
+/** Undo log page */
+constexpr uint16_t FIL_PAGE_UNDO_LOG= 2;
+/** Index node (of file-in-file metadata) */
+constexpr uint16_t FIL_PAGE_INODE= 3;
+/** Insert buffer free list */
+constexpr uint16_t FIL_PAGE_IBUF_FREE_LIST= 4;
+/** Freshly allocated page */
+constexpr uint16_t FIL_PAGE_TYPE_ALLOCATED= 0;
+/** Change buffer bitmap (pages n*innodb_page_size+1) */
+constexpr uint16_t FIL_PAGE_IBUF_BITMAP= 5;
+/** System page */
+constexpr uint16_t FIL_PAGE_TYPE_SYS= 6;
+/** Transaction system data */
+constexpr uint16_t FIL_PAGE_TYPE_TRX_SYS= 7;
+/** Tablespace header (page 0) */
+constexpr uint16_t FIL_PAGE_TYPE_FSP_HDR= 8;
+/** Extent descriptor page (pages n*innodb_page_size, except 0) */
+constexpr uint16_t FIL_PAGE_TYPE_XDES= 9;
+/** Uncompressed BLOB page */
+constexpr uint16_t FIL_PAGE_TYPE_BLOB= 10;
+/** First ROW_FORMAT=COMPRESSED BLOB page */
+constexpr uint16_t FIL_PAGE_TYPE_ZBLOB= 11;
+/** Subsequent ROW_FORMAT=COMPRESSED BLOB page */
+constexpr uint16_t FIL_PAGE_TYPE_ZBLOB2= 12;
+/** In old tablespaces, garbage in FIL_PAGE_TYPE is replaced with this
+value when flushing pages. */
+constexpr uint16_t FIL_PAGE_TYPE_UNKNOWN= 13;
+
+/* File page types introduced in MySQL 5.7, not supported in MariaDB */
+//constexpr uint16_t FIL_PAGE_COMPRESSED = 14;
+//constexpr uint16_t FIL_PAGE_ENCRYPTED = 15;
+//constexpr uint16_t FIL_PAGE_COMPRESSED_AND_ENCRYPTED = 16;
+//constexpr uint16_t FIL_PAGE_ENCRYPTED_RTREE = 17;
+/** Clustered index root page after instant ADD COLUMN */
+constexpr uint16_t FIL_PAGE_TYPE_INSTANT= 18;
+
+/** Used by i_s.cc to index into the text description.
+Note: FIL_PAGE_TYPE_INSTANT maps to the same as FIL_PAGE_INDEX. */
+constexpr uint16_t FIL_PAGE_TYPE_LAST= FIL_PAGE_TYPE_UNKNOWN;
+
+/** Set in FIL_PAGE_TYPE for full_crc32 pages in page_compressed format.
+If the flag is set, then the following holds for the remaining bits
+of FIL_PAGE_TYPE:
+Bits 0..7 will contain the compressed page size in bytes.
+Bits 8..14 are reserved and must be 0. */
+constexpr uint16_t FIL_PAGE_COMPRESS_FCRC32_MARKER= 15;
+/* @} */
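+/* Illustrative decoding of the full_crc32 page_compressed marker
+described above:
+
+  if (page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER) {
+    const uint16_t csize = page_type & 0xFF; // bits 0..7: compressed size
+    ut_ad(!(page_type & 0x7F00));            // bits 8..14 must be 0
+  }
+*/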
+
+/** @return whether the page type is B-tree or R-tree index */
+inline bool fil_page_type_is_index(uint16_t page_type)
+{
+ switch (page_type) {
+ case FIL_PAGE_TYPE_INSTANT:
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ return(true);
+ }
+ return(false);
+}
+
+/** Check whether the page is an index page (either a regular B-tree index
+or an R-tree index) */
+#define fil_page_index_page_check(page) \
+ fil_page_type_is_index(fil_page_get_type(page))
+
+/** Get the file page type.
+@param[in] page file page
+@return page type */
+inline uint16_t fil_page_get_type(const byte *page)
+{
+ return mach_read_from_2(my_assume_aligned<2>(page + FIL_PAGE_TYPE));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** Number of pending tablespace flushes */
+extern Atomic_counter<ulint> fil_n_pending_tablespace_flushes;
+
+/** Look up a tablespace.
+The caller should hold an InnoDB table lock or a MDL that prevents
+the tablespace from being dropped during the operation,
+or the caller should be in single-threaded crash recovery mode
+(no user connections that could drop tablespaces).
+Normally, fil_space_t::get() should be used instead.
+@param[in] id tablespace ID
+@return tablespace, or NULL if not found */
+fil_space_t*
+fil_space_get(
+ ulint id)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** The tablespace memory cache; the totality of logs (the log
+data space) is also stored here; below, "tablespace" refers to both */
+struct fil_system_t {
+ /**
+ Constructor.
+
+  Some members may require late initialisation, so we just mark the object
+  as uninitialised. Real initialisation happens in create().
+ */
+ fil_system_t(): m_initialised(false)
+ {
+ UT_LIST_INIT(space_list, &fil_space_t::space_list);
+ UT_LIST_INIT(named_spaces, &fil_space_t::named_spaces);
+ }
+
+ bool is_initialised() const { return m_initialised; }
+
+ /**
+ Create the file system interface at database start.
+
+ @param[in] hash_size hash table size
+ */
+ void create(ulint hash_size);
+
+ /** Close the file system interface at shutdown */
+ void close();
+
+private:
+ bool m_initialised;
+#ifdef UNIV_LINUX
+ /** available block devices that reside on non-rotational storage */
+ std::vector<dev_t> ssd;
+public:
+ /** @return whether a file system device is on non-rotational storage */
+ bool is_ssd(dev_t dev) const
+ {
+ /* Linux seems to allow up to 15 partitions per block device.
+ If the detected ssd carries "partition number 0" (it is the whole device),
+ compare the candidate file system number without the partition number. */
+ for (const auto s : ssd)
+ if (dev == s || (dev & ~15U) == s)
+ return true;
+ return false;
+ }
+#endif
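+  /* Worked example of the masking above, using classic Linux dev_t
+  numbering: the whole disk sda is major:minor 8:0 and sda2 is 8:2, so
+  clearing the low 4 bits maps 8:2 back to 8:0, the number recorded in
+  the ssd vector for the whole device. */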
+public:
+ /** Detach a tablespace from the cache and close the files.
+ @param space tablespace
+ @param detach_handle whether to detach or close handles
+ @return detached handles or empty vector */
+ std::vector<pfs_os_file_t> detach(fil_space_t *space,
+ bool detach_handle= false);
+
+ ib_mutex_t mutex; /*!< The mutex protecting the cache */
+ fil_space_t* sys_space; /*!< The innodb_system tablespace */
+ fil_space_t* temp_space; /*!< The innodb_temporary tablespace */
+ /** Map of fil_space_t::id to fil_space_t* */
+ hash_table_t spaces;
+ /** tablespaces for which fil_space_t::needs_flush() holds */
+ sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces;
+ /** number of currently open files; protected by mutex */
+ ulint n_open;
+ ulint max_assigned_id;/*!< maximum space id in the existing
+ tables, or assigned during the time
+ mysqld has been up; at an InnoDB
+ startup we scan the data dictionary
+ and set here the maximum of the
+ space id's of the tables there */
+ /** nonzero if fil_node_open_file_low() should avoid moving the tablespace
+ to the end of space_list, for FIFO policy of try_to_close() */
+ ulint freeze_space_list;
+ UT_LIST_BASE_NODE_T(fil_space_t) space_list;
+ /*!< list of all file spaces */
+ UT_LIST_BASE_NODE_T(fil_space_t) named_spaces;
+ /*!< list of all file spaces
+ for which a FILE_MODIFY
+ record has been written since
+ the latest redo log checkpoint.
+ Protected only by log_sys.mutex. */
+
+ /** List of all file spaces need key rotation */
+ ilist<fil_space_t, rotation_list_tag_t> default_encrypt_tables;
+
+ bool space_id_reuse_warned;
+ /*!< whether fil_space_t::create()
+ has issued a warning about
+ potential space_id reuse */
+
+ /** Return the next tablespace from default_encrypt_tables list.
+ @param space previous tablespace (nullptr to start from the start)
+ @param recheck whether the removal condition needs to be rechecked after
+ the encryption parameters were changed
+ @param encrypt expected state of innodb_encrypt_tables
+ @return the next tablespace to process (n_pending_ops incremented)
+ @retval fil_system.temp_space if there is no work to do
+ @retval nullptr upon reaching the end of the iteration */
+ inline fil_space_t* default_encrypt_next(fil_space_t *space, bool recheck,
+ bool encrypt);
+
+ /** Extend all open data files to the recovered size */
+ ATTRIBUTE_COLD void extend_to_recv_size();
+};
+
+/** The tablespace memory cache. */
+extern fil_system_t fil_system;
+
+inline void fil_space_t::reacquire()
+{
+ ut_d(uint32_t n=) n_pending.fetch_add(1, std::memory_order_relaxed);
+ ut_d(if (mutex_own(&fil_system.mutex)) return);
+ ut_ad(n & PENDING);
+ ut_ad(UT_LIST_GET_FIRST(chain)->is_open());
+}
+
+/** Note that operations on the tablespace must stop or can resume */
+inline void fil_space_t::set_stopping(bool stopping)
+{
+ ut_ad(mutex_own(&fil_system.mutex));
+ ut_d(auto n=) n_pending.fetch_xor(STOPPING, std::memory_order_relaxed);
+ ut_ad(!(n & STOPPING) == stopping);
+}
+
+/** Flush pending writes from the file system cache to the file. */
+template<bool have_reference> inline void fil_space_t::flush()
+{
+ ut_ad(!mutex_own(&fil_system.mutex));
+ ut_ad(!have_reference || (pending() & PENDING));
+ ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT);
+ if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
+ {
+ ut_ad(!is_in_unflushed_spaces);
+ ut_ad(!needs_flush());
+ }
+ else if (have_reference)
+ flush_low();
+ else if (!(acquire_low() & STOPPING))
+ {
+ flush_low();
+ release();
+ }
+}
+
+/** @return the size in pages (0 if unreadable) */
+inline uint32_t fil_space_t::get_size()
+{
+ if (!size)
+ {
+ mutex_enter(&fil_system.mutex);
+ read_page0();
+ mutex_exit(&fil_system.mutex);
+ }
+ return size;
+}
+
+#include "fil0crypt.h"
+
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion ids are not enough, we may need
+to recycle ids.
+@return true if assigned, false if not */
+bool
+fil_assign_new_space_id(
+/*====================*/
+ ulint* space_id); /*!< in/out: space id */
+
+/** Frees a space object from the tablespace memory cache.
+Closes the files in the chain but does not delete them.
+There must not be any pending i/o's or flushes on the files.
+@param[in] id tablespace identifier
+@param[in] x_latched whether the caller holds X-mode space->latch
+@return true if success */
+bool
+fil_space_free(
+ ulint id,
+ bool x_latched);
+
+/** Set the recovered size of a tablespace in pages.
+@param id tablespace ID
+@param size recovered size in pages
+@param flags tablespace flags */
+void fil_space_set_recv_size_and_flags(ulint id, uint32_t size,
+ uint32_t flags);
+
+/*******************************************************************//**
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+void
+fil_set_max_space_id_if_bigger(
+/*===========================*/
+ ulint max_id);/*!< in: maximum known id */
+
+/** Write the flushed LSN to the page header of the first page in the
+system tablespace.
+@param[in] lsn flushed LSN
+@return DB_SUCCESS or error number */
+dberr_t
+fil_write_flushed_lsn(
+ lsn_t lsn)
+MY_ATTRIBUTE((warn_unused_result));
+
+/** Delete a tablespace and associated .ibd file.
+@param[in] id tablespace identifier
+@param[in] if_exists whether to ignore missing tablespace
+@param[out] detached_handles return detached handles here
+@return DB_SUCCESS or error */
+dberr_t
+fil_delete_tablespace(ulint id, bool if_exists= false,
+ std::vector<pfs_os_file_t> *detached_handles= nullptr);
+
+/** Prepare to truncate an undo tablespace.
+@param[in] space_id undo tablespace id
+@return the tablespace
+@retval NULL if the tablespace does not exist */
+fil_space_t* fil_truncate_prepare(ulint space_id);
+
+/** Close a single-table tablespace on failed IMPORT TABLESPACE.
+The tablespace must be cached in the memory cache.
+Free all pages used by the tablespace. */
+void fil_close_tablespace(ulint id);
+
+/*******************************************************************//**
+Allocates and builds a file name from a path, a table or tablespace name
+and a suffix. The string must be freed by caller with ut_free().
+@param[in] path NULL or the directory path or the full path and filename.
+@param[in] name NULL if path is full, or Table/Tablespace name
+@param[in] suffix NULL or the file extension to use.
+@return own: file name */
+char*
+fil_make_filepath(
+ const char* path,
+ const char* name,
+ ib_extention suffix,
+ bool strip_name);
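+/* Usage sketch (hypothetical table name): build "db1/t1.ibd" relative
+to the default datadir; as documented above, the result must be freed
+with ut_free():
+
+  char* path = fil_make_filepath(NULL, "db1/t1", IBD, false);
+  if (path != NULL) {
+    // ... use the path ...
+    ut_free(path);
+  }
+*/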
+
+/** Create a tablespace file.
+@param[in] space_id Tablespace ID
+@param[in] name Tablespace name in dbname/tablename format.
+@param[in] path Path and filename of the datafile to create.
+@param[in] flags Tablespace flags
+@param[in] size Initial size of the tablespace file in pages,
+must be >= FIL_IBD_FILE_INITIAL_SIZE
+@param[in] mode MariaDB encryption mode
+@param[in] key_id MariaDB encryption key_id
+@param[out] err DB_SUCCESS or error code
+@return the created tablespace
+@retval NULL on error */
+fil_space_t*
+fil_ibd_create(
+ ulint space_id,
+ const char* name,
+ const char* path,
+ ulint flags,
+ uint32_t size,
+ fil_encryption_t mode,
+ uint32_t key_id,
+ dberr_t* err)
+ MY_ATTRIBUTE((nonnull(2,8), warn_unused_result));
+
+/** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations.
+(Typically when upgrading from MariaDB 10.1.0..10.1.20.)
+@param[in,out] space tablespace
+@param[in] flags desired tablespace flags */
+void fsp_flags_try_adjust(fil_space_t* space, ulint flags);
+
+/********************************************************************//**
+Tries to open a single-table tablespace and optionally checks the space id is
+right in it. If does not succeed, prints an error message to the .err log. This
+function is used to open a tablespace when we start up mysqld, and also in
+IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at the database startup
+or under the protection of the dictionary mutex, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+
+If the validate boolean is set, we read the first page of the file and
+check that the space id in the file is what we expect. We assume that
+this function runs much faster if no check is made, since accessing the
+file inode probably is much faster (the OS caches them) than accessing
+the first page of the file. This boolean may be initially false, but if
+a remote tablespace is found it will be changed to true.
+
+If the fix_dict boolean is set, then it is safe to use an internal SQL
+statement to update the dictionary tables if they are incorrect.
+
+@param[in] validate true if we should validate the tablespace
+@param[in] fix_dict true if the dictionary is available to be fixed
+@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY
+@param[in] id tablespace ID
+@param[in] flags expected FSP_SPACE_FLAGS
+@param[in] tablename table name
+If file-per-table, it is the table name in the databasename/tablename format
+@param[in] path_in expected filepath, usually read from dictionary
+@param[out] err DB_SUCCESS or error code
+@return tablespace
+@retval NULL if the tablespace could not be opened */
+fil_space_t*
+fil_ibd_open(
+ bool validate,
+ bool fix_dict,
+ fil_type_t purpose,
+ ulint id,
+ ulint flags,
+ const table_name_t& tablename,
+ const char* path_in,
+ dberr_t* err = NULL)
+ MY_ATTRIBUTE((warn_unused_result));
+
+enum fil_load_status {
+ /** The tablespace file(s) were found and valid. */
+ FIL_LOAD_OK,
+ /** The name no longer matches space_id */
+ FIL_LOAD_ID_CHANGED,
+ /** The file(s) were not found */
+ FIL_LOAD_NOT_FOUND,
+ /** The file(s) were not valid */
+ FIL_LOAD_INVALID
+};
+
+/** Open a single-file tablespace and add it to the InnoDB data structures.
+@param[in] space_id tablespace ID
+@param[in] filename path/to/databasename/tablename.ibd
+@param[out] space the tablespace, or NULL on error
+@return status of the operation */
+enum fil_load_status
+fil_ibd_load(
+ ulint space_id,
+ const char* filename,
+ fil_space_t*& space)
+ MY_ATTRIBUTE((warn_unused_result));
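+/* Illustrative handling of the fil_load_status codes (the file name is
+hypothetical):
+
+  fil_space_t* space;
+  switch (fil_ibd_load(space_id, "./db1/t1.ibd", space)) {
+  case FIL_LOAD_OK:         break; // space points to the tablespace
+  case FIL_LOAD_ID_CHANGED: break; // name no longer matches space_id
+  case FIL_LOAD_NOT_FOUND:  break; // file is missing
+  case FIL_LOAD_INVALID:    break; // file exists but is not usable
+  }
+*/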
+
+
+/** Determine if a matching tablespace exists in the InnoDB tablespace
+memory cache. Note that if we have not done a crash recovery at the database
+startup, there may be many tablespaces which are not yet in the memory cache.
+@param[in] id Tablespace ID
+@param[in] name Tablespace name used in fil_space_t::create().
+@param[in] table_flags table flags
+@return the tablespace
+@retval NULL if no matching tablespace exists in the memory cache */
+fil_space_t*
+fil_space_for_table_exists_in_mem(
+ ulint id,
+ const char* name,
+ ulint table_flags);
+
+/** Try to extend a tablespace if it is smaller than the specified size.
+@param[in,out] space tablespace
+@param[in] size desired size in pages
+@return whether the tablespace is at least as big as requested */
+bool fil_space_extend(fil_space_t *space, uint32_t size);
+
+/** Flush to disk the writes in file spaces possibly cached by the OS. */
+void fil_flush_file_spaces();
+/******************************************************************//**
+Checks the consistency of the tablespace cache.
+@return true if ok */
+bool fil_validate();
+/*********************************************************************//**
+Sets the file page type. */
+void
+fil_page_set_type(
+/*==============*/
+ byte* page, /*!< in/out: file page */
+ ulint type); /*!< in: type */
+
+/********************************************************************//**
+Delete the tablespace file and any related files like .cfg.
+This should not be called for temporary tables. */
+void
+fil_delete_file(
+/*============*/
+ const char* path); /*!< in: filepath of the ibd tablespace */
+
+/********************************************************************//**
+Looks for a pre-existing fil_space_t with the given tablespace ID
+and, if found, returns the name and filepath in newly allocated buffers
+that the caller must free.
+@param[in] space_id The tablespace ID to search for.
+@param[out] name Name of the tablespace found.
+@param[out] filepath The filepath of the first datafile for the tablespace found.
+@return true if tablespace is found, false if not. */
+bool
+fil_space_read_name_and_filepath(
+ ulint space_id,
+ char** name,
+ char** filepath);
+
+/** Convert a file name to a tablespace name.
+@param[in] filename directory/databasename/tablename.ibd
+@return database/tablename string, to be freed with ut_free() */
+char*
+fil_path_to_space_name(
+ const char* filename);
+
+/** Acquire the fil_system mutex. */
+#define fil_system_enter() mutex_enter(&fil_system.mutex)
+/** Release the fil_system mutex. */
+#define fil_system_exit() mutex_exit(&fil_system.mutex)
+
+/*******************************************************************//**
+Returns the table space by a given id, NULL if not found. */
+fil_space_t*
+fil_space_get_by_id(
+/*================*/
+ ulint id); /*!< in: space id */
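+/* Typical locking pattern (sketch): the cache lookup must happen under
+the fil_system mutex, using the macros defined above:
+
+  fil_system_enter();
+  fil_space_t* space = fil_space_get_by_id(id);
+  // ... inspect space while holding the mutex ...
+  fil_system_exit();
+*/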
+
+/** Note that a non-predefined persistent tablespace has been modified
+by redo log.
+@param[in,out] space tablespace */
+void
+fil_names_dirty(
+ fil_space_t* space);
+
+/** Write FILE_MODIFY records when a non-predefined persistent
+tablespace was modified for the first time since the latest
+fil_names_clear().
+@param[in,out] space tablespace */
+void fil_names_dirty_and_write(fil_space_t* space);
+
+/** Write FILE_MODIFY records if a persistent tablespace was modified
+for the first time since the latest fil_names_clear().
+@param[in,out] space tablespace
+@return whether any FILE_MODIFY record was written */
+inline bool fil_names_write_if_was_clean(fil_space_t* space)
+{
+ mysql_mutex_assert_owner(&log_sys.mutex);
+
+ if (space == NULL) {
+ return(false);
+ }
+
+ const bool was_clean = space->max_lsn == 0;
+ ut_ad(space->max_lsn <= log_sys.get_lsn());
+ space->max_lsn = log_sys.get_lsn();
+
+ if (was_clean) {
+ fil_names_dirty_and_write(space);
+ }
+
+ return(was_clean);
+}
+
+/** On a log checkpoint, reset fil_names_dirty_and_write() flags
+and write out FILE_MODIFY and FILE_CHECKPOINT if needed.
+@param[in] lsn checkpoint LSN
+@param[in] do_write whether to always write FILE_CHECKPOINT
+@return whether anything was written to the redo log
+@retval false if no flags were set and nothing written
+@retval true if anything was written to the redo log */
+bool
+fil_names_clear(
+ lsn_t lsn,
+ bool do_write);
+
+#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
+void test_make_filepath();
+#endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */
+
+/** Determine the block size of the data file.
+@param[in] space tablespace
+@param[in] offset page number
+@return block size */
+UNIV_INTERN
+ulint
+fil_space_get_block_size(const fil_space_t* space, unsigned offset);
+
+#include "fil0fil.ic"
+#endif /* UNIV_INNOCHECKSUM */
+
+#endif /* fil0fil_h */
diff --git a/storage/innobase/include/fil0fil.ic b/storage/innobase/include/fil0fil.ic
new file mode 100644
index 00000000..fd5f5bc1
--- /dev/null
+++ b/storage/innobase/include/fil0fil.ic
@@ -0,0 +1,144 @@
+/*****************************************************************************
+
+Copyright (c) 2015, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0fil.ic
+The low-level file system support functions
+
+Created 31/03/2015 Jan Lindström
+*******************************************************/
+
+#ifndef fil0fil_ic
+#define fil0fil_ic
+
+/*******************************************************************//**
+Return page type name */
+UNIV_INLINE
+const char*
+fil_get_page_type_name(
+/*===================*/
+ ulint page_type) /*!< in: FIL_PAGE_TYPE */
+{
+ switch(page_type) {
+ case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED:
+		return "PAGE_COMPRESSED_ENCRYPTED";
+ case FIL_PAGE_PAGE_COMPRESSED:
+ return "PAGE_COMPRESSED";
+ case FIL_PAGE_TYPE_INSTANT:
+ case FIL_PAGE_INDEX:
+ return "INDEX";
+ case FIL_PAGE_RTREE:
+ return "RTREE";
+ case FIL_PAGE_UNDO_LOG:
+ return "UNDO LOG";
+ case FIL_PAGE_INODE:
+ return "INODE";
+ case FIL_PAGE_IBUF_FREE_LIST:
+ return "IBUF_FREE_LIST";
+ case FIL_PAGE_TYPE_ALLOCATED:
+ return "ALLOCATED";
+ case FIL_PAGE_IBUF_BITMAP:
+ return "IBUF_BITMAP";
+ case FIL_PAGE_TYPE_SYS:
+ return "SYS";
+ case FIL_PAGE_TYPE_TRX_SYS:
+ return "TRX_SYS";
+ case FIL_PAGE_TYPE_FSP_HDR:
+ return "FSP_HDR";
+ case FIL_PAGE_TYPE_XDES:
+ return "XDES";
+ case FIL_PAGE_TYPE_BLOB:
+ return "BLOB";
+ case FIL_PAGE_TYPE_ZBLOB:
+ return "ZBLOB";
+ case FIL_PAGE_TYPE_ZBLOB2:
+ return "ZBLOB2";
+ case FIL_PAGE_TYPE_UNKNOWN:
+ return "OLD UNKNOWN PAGE TYPE";
+ default:
+ return "PAGE TYPE CORRUPTED";
+ }
+}
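+/* Usage sketch: log a human-readable page type for a page buffer,
+combining this function with fil_page_get_type() from fil0fil.h:
+
+  ib::info() << "page type: "
+             << fil_get_page_type_name(fil_page_get_type(page));
+*/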
+
+#ifdef UNIV_DEBUG
+/** Validate page type.
+@param[in] space Tablespace object
+@param[in] page page to validate
+@return true if valid, false if not */
+UNIV_INLINE
+bool
+fil_page_type_validate(
+ fil_space_t* space,
+ const byte* page)
+{
+ const uint16_t page_type = fil_page_get_type(page);
+
+ if ((page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)
+ && space->full_crc32()
+ && space->is_compressed()) {
+ return true;
+ }
+
+ /* Validate page type */
+ if (!((page_type == FIL_PAGE_PAGE_COMPRESSED ||
+ page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED ||
+ page_type == FIL_PAGE_INDEX ||
+ page_type == FIL_PAGE_TYPE_INSTANT ||
+ page_type == FIL_PAGE_RTREE ||
+ page_type == FIL_PAGE_UNDO_LOG ||
+ page_type == FIL_PAGE_INODE ||
+ page_type == FIL_PAGE_IBUF_FREE_LIST ||
+ page_type == FIL_PAGE_TYPE_ALLOCATED ||
+ page_type == FIL_PAGE_IBUF_BITMAP ||
+ page_type == FIL_PAGE_TYPE_SYS ||
+ page_type == FIL_PAGE_TYPE_TRX_SYS ||
+ page_type == FIL_PAGE_TYPE_FSP_HDR ||
+ page_type == FIL_PAGE_TYPE_XDES ||
+ page_type == FIL_PAGE_TYPE_BLOB ||
+ page_type == FIL_PAGE_TYPE_ZBLOB ||
+ page_type == FIL_PAGE_TYPE_ZBLOB2 ||
+ page_type == FIL_PAGE_TYPE_UNKNOWN))) {
+
+ ulint space_id = mach_read_from_4(
+ page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ ulint offset = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+ ulint key_version = mach_read_from_4(
+ page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+
+ if (space && space->full_crc32()) {
+ key_version = mach_read_from_4(
+ page + FIL_PAGE_FCRC32_KEY_VERSION);
+ }
+
+ /* Dump out the page info */
+ ib::fatal() << "Page " << space_id << ":" << offset
+ << " name " << (space ? space->name : "???")
+ << " page_type " << page_type
+ << " key_version " << key_version
+ << " lsn " << mach_read_from_8(page + FIL_PAGE_LSN)
+ << " compressed_len " << mach_read_from_2(page + FIL_PAGE_DATA);
+ return false;
+ }
+
+ return true;
+}
+#endif /* UNIV_DEBUG */
+
+#endif /* fil0fil_ic */
diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h
new file mode 100644
index 00000000..c6ba24fa
--- /dev/null
+++ b/storage/innobase/include/fil0pagecompress.h
@@ -0,0 +1,60 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2019 MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef fil0pagecompress_h
+#define fil0pagecompress_h
+
+#include "fsp0fsp.h"
+
+/******************************************************************//**
+@file include/fil0pagecompress.h
+Helper functions for extracting/storing page compression and
+atomic writes information to a tablespace.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/** Compress a page_compressed page before writing to a data file.
+@param[in] buf page to be compressed
+@param[out] out_buf compressed page
+@param[in] flags tablespace flags
+@param[in] block_size file system block size
+@param[in] encrypted whether the page will be subsequently encrypted
+@return actual length of compressed page
+@retval 0 if the page was not compressed */
+ulint fil_page_compress(
+ const byte* buf,
+ byte* out_buf,
+ ulint flags,
+ ulint block_size,
+ bool encrypted)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Decompress a page that may be subject to page_compressed compression.
+@param[in,out] tmp_buf temporary buffer (of innodb_page_size)
+@param[in,out] buf compressed page buffer
+@param[in] flags tablespace flags
+@return size of the compressed data
+@retval 0 if decompression failed
+@retval srv_page_size if the page was not compressed */
+ulint fil_page_decompress(
+ byte* tmp_buf,
+ byte* buf,
+ ulint flags)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif
diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h
new file mode 100644
index 00000000..7db85e87
--- /dev/null
+++ b/storage/innobase/include/fsp0file.h
@@ -0,0 +1,576 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0file.h
+Tablespace data file implementation.
+
+Created 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#ifndef fsp0file_h
+#define fsp0file_h
+
+#include "mem0mem.h"
+#include "os0file.h"
+#include "fil0fil.h"
+
+/** Types of raw partitions in innodb_data_file_path */
+enum device_t {
+ SRV_NOT_RAW = 0, /*!< Not a raw partition */
+ SRV_NEW_RAW, /*!< A 'newraw' partition, only to be
+ initialized */
+ SRV_OLD_RAW /*!< An initialized raw partition */
+};
+
+/** Data file control information. */
+class Datafile {
+
+ friend class Tablespace;
+ friend class SysTablespace;
+
+public:
+
+ Datafile()
+ :
+ m_name(),
+ m_filepath(),
+ m_filename(),
+ m_handle(),
+ m_open_flags(OS_FILE_OPEN),
+ m_size(),
+ m_order(),
+ m_type(SRV_NOT_RAW),
+ m_space_id(ULINT_UNDEFINED),
+ m_flags(),
+ m_exists(),
+ m_is_valid(),
+ m_first_page(),
+ m_last_os_error(),
+ m_file_info()
+ {
+ /* No op */
+ }
+
+ Datafile(const char* name, ulint flags, uint32_t size, ulint order)
+ :
+ m_name(mem_strdup(name)),
+ m_filepath(),
+ m_filename(),
+ m_handle(),
+ m_open_flags(OS_FILE_OPEN),
+ m_size(size),
+ m_order(order),
+ m_type(SRV_NOT_RAW),
+ m_space_id(ULINT_UNDEFINED),
+ m_flags(flags),
+ m_exists(),
+ m_is_valid(),
+ m_first_page(),
+ m_last_os_error(),
+ m_file_info()
+ {
+ ut_ad(m_name != NULL);
+ /* No op */
+ }
+
+ Datafile(const Datafile& file)
+ :
+ m_handle(file.m_handle),
+ m_open_flags(file.m_open_flags),
+ m_size(file.m_size),
+ m_order(file.m_order),
+ m_type(file.m_type),
+ m_space_id(file.m_space_id),
+ m_flags(file.m_flags),
+ m_exists(file.m_exists),
+ m_is_valid(file.m_is_valid),
+ m_first_page(),
+ m_last_os_error(),
+ m_file_info()
+ {
+ m_name = mem_strdup(file.m_name);
+ ut_ad(m_name != NULL);
+
+ if (file.m_filepath != NULL) {
+ m_filepath = mem_strdup(file.m_filepath);
+ ut_a(m_filepath != NULL);
+ set_filename();
+ } else {
+ m_filepath = NULL;
+ m_filename = NULL;
+ }
+ }
+
+ virtual ~Datafile()
+ {
+ shutdown();
+ }
+
+ Datafile& operator=(const Datafile& file)
+ {
+ ut_a(this != &file);
+
+ ut_ad(m_name == NULL);
+ m_name = mem_strdup(file.m_name);
+ ut_a(m_name != NULL);
+
+ m_size = file.m_size;
+ m_order = file.m_order;
+ m_type = file.m_type;
+
+ ut_a(m_handle == OS_FILE_CLOSED);
+ m_handle = file.m_handle;
+
+ m_exists = file.m_exists;
+ m_is_valid = file.m_is_valid;
+ m_open_flags = file.m_open_flags;
+ m_space_id = file.m_space_id;
+ m_flags = file.m_flags;
+ m_last_os_error = 0;
+
+ if (m_filepath != NULL) {
+ ut_free(m_filepath);
+ m_filepath = NULL;
+ m_filename = NULL;
+ }
+
+ if (file.m_filepath != NULL) {
+ m_filepath = mem_strdup(file.m_filepath);
+ ut_a(m_filepath != NULL);
+ set_filename();
+ }
+
+ /* Do not make a copy of the first page,
+ it should be reread if needed */
+ m_first_page = NULL;
+
+ return(*this);
+ }
+
+ /** Initialize the name and flags of this datafile.
+ @param[in] name tablespace name, will be copied
+ @param[in] flags tablespace flags */
+ void init(const char* name, ulint flags);
+
+ /** Release the resources. */
+ virtual void shutdown();
+
+ /** Open a data file in read-only mode to check if it exists
+ so that it can be validated.
+ @param[in] strict whether to issue error messages
+ @return DB_SUCCESS or error code */
+ virtual dberr_t open_read_only(bool strict);
+
+ /** Open a data file in read-write mode during start-up so that
+ doublewrite pages can be restored and then it can be validated.
+ @param[in] read_only_mode if true, then readonly mode checks
+ are enforced.
+ @return DB_SUCCESS or error code */
+ virtual dberr_t open_read_write(bool read_only_mode)
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Initialize OS specific file info. */
+ void init_file_info();
+
+ /** Close a data file.
+ @return DB_SUCCESS or error code */
+ dberr_t close();
+
+ /** Make a full filepath from a directory path and a filename.
+ Prepend the dirpath to filename using the extension given.
+ If dirpath is NULL, prepend the default datadir to filepath.
+ Store the result in m_filepath.
+ @param[in] dirpath directory path
+ @param[in] filename filename or filepath
+ @param[in] ext filename extension */
+ void make_filepath(
+ const char* dirpath,
+ const char* filename,
+ ib_extention ext);
+
+ /** Set the filepath by duplicating the filepath sent in */
+ void set_filepath(const char* filepath);
+
+ /** Allocate and set the datafile or tablespace name in m_name.
+ If a name is provided, use it; else extract a file-per-table
+ tablespace name from m_filepath. The value of m_name
+ will be freed in the destructor.
+ @param[in] name Tablespace Name if known, NULL if not */
+ void set_name(const char* name);
+
+ /** Validates the datafile and checks that it conforms with
+ the expected space ID and flags. The file should exist and be
+ successfully opened in order for this function to validate it.
+ @param[in] space_id The expected tablespace ID.
+ @param[in] flags The expected tablespace flags.
+ @retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+ m_is_valid is also set true on success, else false. */
+ dberr_t validate_to_dd(ulint space_id, ulint flags)
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Validates this datafile for the purpose of recovery.
+ The file should exist and be successfully opened. We initially
+ open it in read-only mode because we just want to read the SpaceID.
+ However, if the first page is corrupt and needs to be restored
+ from the doublewrite buffer, we will reopen it in write mode and
+	try to restore that page.
+ @retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+ m_is_valid is also set true on success, else false. */
+ dberr_t validate_for_recovery()
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Checks the consistency of the first page of a datafile when the
+ tablespace is opened. This occurs before the fil_space_t is created
+ so the Space ID found here must not already be open.
+ m_is_valid is set true on success, else false.
+ @param[out] flush_lsn contents of FIL_PAGE_FILE_FLUSH_LSN
+	@retval DB_SUCCESS if the datafile is valid
+ @retval DB_CORRUPTION if the datafile is not readable
+ @retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */
+ dberr_t validate_first_page(lsn_t* flush_lsn)
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Get Datafile::m_name.
+ @return m_name */
+ const char* name() const
+ {
+ return(m_name);
+ }
+
+ /** Get Datafile::m_filepath.
+ @return m_filepath */
+ const char* filepath() const
+ {
+ return(m_filepath);
+ }
+
+ /** Get Datafile::m_handle.
+ @return m_handle */
+ pfs_os_file_t handle() const
+ {
+ return(m_handle);
+ }
+
+ /** @return detached file handle */
+ pfs_os_file_t detach()
+ {
+ pfs_os_file_t detached = m_handle;
+ m_handle = OS_FILE_CLOSED;
+ return detached;
+ }
+
+ /** Get Datafile::m_order.
+ @return m_order */
+ ulint order() const
+ {
+ return(m_order);
+ }
+
+ /** Get Datafile::m_space_id.
+ @return m_space_id */
+ ulint space_id() const
+ {
+ return(m_space_id);
+ }
+
+ /** Get Datafile::m_flags.
+ @return m_flags */
+ ulint flags() const
+ {
+ return(m_flags);
+ }
+
+ /**
+ @return true if m_handle is open, false if not */
+ bool is_open() const
+ {
+ return(m_handle != OS_FILE_CLOSED);
+ }
+
+ /** Get Datafile::m_is_valid.
+ @return m_is_valid */
+ bool is_valid() const
+ {
+ return(m_is_valid);
+ }
+
+ /** Get the last OS error reported
+ @return m_last_os_error */
+ ulint last_os_error() const
+ {
+ return(m_last_os_error);
+ }
+
+ /** Check whether the file is empty.
+ @return true if file is empty */
+ bool is_empty_file() const
+ {
+#ifdef _WIN32
+ os_offset_t offset =
+ (os_offset_t) m_file_info.nFileSizeLow
+ | ((os_offset_t) m_file_info.nFileSizeHigh << 32);
+
+ return (offset == 0);
+#else
+ return (m_file_info.st_size == 0);
+#endif
+ }
+
+ /** Check if the file exist.
+ @return true if file exists. */
+ bool exists() const { return m_exists; }
+
+ /** Test if the filepath provided looks the same as this filepath
+ by string comparison. If they are two different paths to the same
+ file, same_as() will be used to show that after the files are opened.
+ @param[in] other filepath to compare with
+ @retval true if it is the same filename by char comparison
+ @retval false if it looks different */
+ bool same_filepath_as(const char* other) const;
+
+ /** Test if another opened datafile is the same file as this object.
+ @param[in] other Datafile to compare with
+ @return true if it is the same file, else false */
+ bool same_as(const Datafile& other) const;
+
+ /** Get access to the first data page.
+ It is valid after open_read_only() succeeded.
+ @return the first data page */
+ const byte* get_first_page() const { return(m_first_page); }
+
+private:
+ /** Free the filepath buffer. */
+ void free_filepath();
+
+ /** Set the filename pointer to the start of the file name
+ in the filepath. */
+ void set_filename()
+ {
+ if (m_filepath == NULL) {
+ return;
+ }
+
+ char* last_slash = strrchr(m_filepath, OS_PATH_SEPARATOR);
+
+ m_filename = last_slash ? last_slash + 1 : m_filepath;
+ }
+
+ /** Create/open a data file.
+ @param[in] read_only_mode if true, then readonly mode checks
+ are enforced.
+ @return DB_SUCCESS or error code */
+ dberr_t open_or_create(bool read_only_mode)
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Reads a few significant fields from the first page of the
+ datafile, which must already be open.
+ @param[in] read_only_mode if true, then readonly mode checks
+ are enforced.
+ @return DB_SUCCESS or DB_IO_ERROR if page cannot be read */
+ dberr_t read_first_page(bool read_only_mode)
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Free the first page from memory when it is no longer needed. */
+ void free_first_page();
+
+ /** Set the Datafile::m_open_flags.
+ @param open_flags The Open flags to set. */
+ void set_open_flags(os_file_create_t open_flags)
+ {
+ m_open_flags = open_flags;
+ };
+
+ /** Determine if this datafile is on a Raw Device
+ @return true if it is a RAW device. */
+ bool is_raw_device()
+ {
+ return(m_type != SRV_NOT_RAW);
+ }
+
+ /* DATA MEMBERS */
+
+ /** Datafile name at the tablespace location.
+ This is either the basename of the file if an absolute path
+ was entered, or it is the relative path to the datadir or
+ Tablespace::m_path. */
+ char* m_name;
+
+protected:
+ /** Physical file path with base name and extension */
+ char* m_filepath;
+
+private:
+ /** Determine the space id of the given file descriptor by reading
+ a few pages from the beginning of the .ibd file.
+ @return DB_SUCCESS if space id was successfully identified,
+ else DB_ERROR. */
+ dberr_t find_space_id();
+
+ /** Restore the first page of the tablespace from
+ the double write buffer.
+ @return whether the operation failed */
+ bool restore_from_doublewrite();
+
+ /** Points into m_filepath to the file name with extension */
+ char* m_filename;
+
+ /** Open file handle */
+ pfs_os_file_t m_handle;
+
+ /** Flags to use for opening the data file */
+ os_file_create_t m_open_flags;
+
+ /** size in megabytes or pages; converted from megabytes to
+ pages in SysTablespace::normalize_size() */
+ uint32_t m_size;
+
+ /** ordinal position of this datafile in the tablespace */
+ ulint m_order;
+
+ /** The type of the data file */
+ device_t m_type;
+
+ /** Tablespace ID. Contained in the datafile header.
+ If this is a system tablespace, FSP_SPACE_ID is only valid
+ in the first datafile. */
+ ulint m_space_id;
+
+ /** Tablespace flags. Contained in the datafile header.
+ If this is a system tablespace, FSP_SPACE_FLAGS are only valid
+ in the first datafile. */
+ ulint m_flags;
+
+ /** true if file already existed on startup */
+ bool m_exists;
+
+	/** true if the tablespace is valid */
+ bool m_is_valid;
+
+ /** Aligned buffer to hold first page */
+ byte* m_first_page;
+
+protected:
+ /** Last OS error received so it can be reported if needed. */
+ ulint m_last_os_error;
+
+public:
+ /** Use the following to determine the uniqueness of this datafile. */
+#ifdef _WIN32
+ /* Use fields dwVolumeSerialNumber, nFileIndexLow, nFileIndexHigh. */
+ BY_HANDLE_FILE_INFORMATION m_file_info;
+#else
+ /* Use field st_ino. */
+ struct stat m_file_info;
+#endif /* _WIN32 */
+};
+
+
+/** Data file control information. */
+class RemoteDatafile : public Datafile
+{
+private:
+ /** Link filename (full path) */
+ char* m_link_filepath;
+
+public:
+
+ RemoteDatafile()
+ :
+ m_link_filepath()
+ {
+ /* No op - base constructor is called. */
+ }
+
+ RemoteDatafile(const char*, ulint, ulint)
+ :
+ m_link_filepath()
+ {
+ /* No op - base constructor is called. */
+ }
+
+ ~RemoteDatafile() override
+ {
+ shutdown();
+ }
+
+ /** Release the resources. */
+ void shutdown() override;
+
+ /** Get the link filepath.
+ @return m_link_filepath */
+ const char* link_filepath() const
+ {
+ return(m_link_filepath);
+ }
+
+ /** Create a link filename based on the contents of m_name,
+ open that file, and read the contents into m_filepath.
+ @retval DB_SUCCESS if remote linked tablespace file is opened and read.
+ @retval DB_CANNOT_OPEN_FILE if the link file does not exist. */
+ dberr_t open_link_file();
+
+ /** Delete an InnoDB Symbolic Link (ISL) file. */
+ void delete_link_file(void);
+
+ /** Open a handle to the file linked to in an InnoDB Symbolic Link file
+ in read-only mode so that it can be validated.
+ @param[in] strict whether to issue error messages
+ @return DB_SUCCESS or error code */
+ dberr_t open_read_only(bool strict) override;
+
+ /** Opens a handle to the file linked to in an InnoDB Symbolic Link
+ file in read-write mode so that it can be restored from doublewrite
+ and validated.
+ @param[in] read_only_mode If true, then readonly mode checks
+ are enforced.
+ @return DB_SUCCESS or error code */
+ dberr_t open_read_write(bool read_only_mode) override
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /******************************************************************
+ Global Static Functions; Cannot refer to data members.
+ ******************************************************************/
+
+ /** Creates a new InnoDB Symbolic Link (ISL) file. It is always
+ created under the 'datadir' of MySQL. The datadir is the directory
+ of a running mysqld program. We can refer to it by simply using
+ the path ".".
+ @param[in] name tablespace name
+ @param[in] filepath remote filepath of tablespace datafile
+ @return DB_SUCCESS or error code */
+ static dberr_t create_link_file(
+ const char* name,
+ const char* filepath);
+
+ /** Delete an InnoDB Symbolic Link (ISL) file by name.
+ @param[in] name tablespace name */
+ static void delete_link_file(const char* name);
+
+ /** Read an InnoDB Symbolic Link (ISL) file by name.
+ It is always created under the datadir of MySQL.
+ For file-per-table tablespaces, the isl file is expected to be
+ in a 'database' directory and called 'tablename.isl'.
+ The caller must free the memory returned if it is not null.
+ @param[in] link_filepath filepath of the ISL file
+ @return Filepath of the IBD file read from the ISL file */
+ static char* read_link_file(
+ const char* link_filepath);
+};
+#endif /* fsp0file_h */
diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h
new file mode 100644
index 00000000..7245db39
--- /dev/null
+++ b/storage/innobase/include/fsp0fsp.h
@@ -0,0 +1,761 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0fsp.h
+File space management
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fsp0fsp_h
+#define fsp0fsp_h
+
+#include "assume_aligned.h"
+#include "fsp0types.h"
+#include "fut0lst.h"
+#include "ut0byte.h"
+
+#ifndef UNIV_INNOCHECKSUM
+#include "mtr0mtr.h"
+#include "page0types.h"
+#include "rem0types.h"
+#else
+# include "mach0data.h"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** @return the PAGE_SSIZE flags for the current innodb_page_size */
+#define FSP_FLAGS_PAGE_SSIZE() \
+ ((srv_page_size == UNIV_PAGE_SIZE_ORIG) ? \
+ 0U : (srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \
+ << FSP_FLAGS_POS_PAGE_SSIZE)
+
+/** @return the PAGE_SSIZE flags for the current innodb_page_size in
+full checksum format */
+#define FSP_FLAGS_FCRC32_PAGE_SSIZE() \
+ ((srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \
+ << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE)
+
+/* @defgroup Compatibility macros for MariaDB 10.1.0 through 10.1.20;
+see the table in fsp0types.h @{ */
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101 \
+ (FSP_FLAGS_POS_ATOMIC_BLOBS \
+ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101 \
+ (FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101 + 1)
+/** Zero relative shift position of the ATOMIC_WRITES field */
+#define FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101 \
+ (FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101 + 4)
+/** Zero relative shift position of the PAGE_SSIZE field */
+#define FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101 \
+ (FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101 + 2)
+
+/** Bit mask of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_MASK_PAGE_COMPRESSION_MARIADB101 \
+ (1U << FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101)
+/** Bit mask of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL_MARIADB101 \
+ (15U << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101)
+/** Bit mask of the ATOMIC_WRITES field */
+#define FSP_FLAGS_MASK_ATOMIC_WRITES_MARIADB101 \
+ (3U << FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101)
+/** Bit mask of the PAGE_SSIZE field */
+#define FSP_FLAGS_MASK_PAGE_SSIZE_MARIADB101 \
+ (15U << FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101)
+
+/** Return the value of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) \
+ ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_MARIADB101) \
+ >> FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101)
+/** Return the value of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL_MARIADB101(flags) \
+ ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL_MARIADB101) \
+ >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101)
+/** Return the value of the PAGE_SSIZE field */
+#define FSP_FLAGS_GET_PAGE_SSIZE_MARIADB101(flags) \
+ ((flags & FSP_FLAGS_MASK_PAGE_SSIZE_MARIADB101) \
+ >> FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101)
+
+/* @} */
+
+/* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */
+
+/** Offset of the space header within a file page */
+#define FSP_HEADER_OFFSET FIL_PAGE_DATA
+
+/* The data structures in files are defined just as byte strings in C */
+typedef byte xdes_t;
+
+/* SPACE HEADER
+ ============
+
+File space header data structure: this data structure is contained in the
+first page of a space. The space for this header is reserved in every extent
+descriptor page, but used only in the first. */
+
+/*-------------------------------------*/
+#define FSP_SPACE_ID 0 /* space id */
+#define FSP_NOT_USED 4 /* this field contained a value up to
+ which we know that the modifications
+ in the database have been flushed to
+ the file space; not used now */
+#define FSP_SIZE 8 /* Current size of the space in
+ pages */
+#define FSP_FREE_LIMIT 12 /* Minimum page number for which the
+ free list has not been initialized:
+ the pages >= this limit are, by
+ definition, free; note that in a
+ single-table tablespace where size
+ < 64 pages, this number is 64, i.e.,
+ we have initialized the space
+ about the first extent, but have not
+ physically allocated those pages to the
+ file */
+#define FSP_SPACE_FLAGS 16 /* fsp_space_t.flags, similar to
+ dict_table_t::flags */
+#define FSP_FRAG_N_USED 20 /* number of used pages in the
+ FSP_FREE_FRAG list */
+#define FSP_FREE 24 /* list of free extents */
+#define FSP_FREE_FRAG (24 + FLST_BASE_NODE_SIZE)
+ /* list of partially free extents not
+ belonging to any segment */
+#define FSP_FULL_FRAG (24 + 2 * FLST_BASE_NODE_SIZE)
+ /* list of full extents not belonging
+ to any segment */
+#define FSP_SEG_ID (24 + 3 * FLST_BASE_NODE_SIZE)
+ /* 8 bytes which give the first unused
+ segment id */
+#define FSP_SEG_INODES_FULL (32 + 3 * FLST_BASE_NODE_SIZE)
+ /* list of pages containing segment
+ headers, where all the segment inode
+ slots are reserved */
+#define FSP_SEG_INODES_FREE (32 + 4 * FLST_BASE_NODE_SIZE)
+ /* list of pages containing segment
+ headers, where not all the segment
+ header slots are reserved */
+/*-------------------------------------*/
+/* File space header size */
+#define FSP_HEADER_SIZE (32 + 5 * FLST_BASE_NODE_SIZE)
+
+#define FSP_FREE_ADD 4 /* this many free extents are added
+ to the free list from above
+ FSP_FREE_LIMIT at a time */
+/* @} */
+
+/* @defgroup File Segment Inode Constants (moved from fsp0fsp.c) @{ */
+
+/* FILE SEGMENT INODE
+ ==================
+
+Segment inode which is created for each segment in a tablespace. NOTE: in
+purge we assume that a segment having only one currently used page can be
+freed in a few steps, so that the freeing cannot fill the file buffer with
+bufferfixed file pages. */
+
+typedef byte fseg_inode_t;
+
+#define FSEG_INODE_PAGE_NODE FSEG_PAGE_DATA
+ /* the list node for linking
+ segment inode pages */
+
+#define FSEG_ARR_OFFSET (FSEG_PAGE_DATA + FLST_NODE_SIZE)
+/*-------------------------------------*/
+#define FSEG_ID 0 /* 8 bytes of segment id: if this is 0,
+ it means that the header is unused */
+#define FSEG_NOT_FULL_N_USED 8
+ /* number of used segment pages in
+ the FSEG_NOT_FULL list */
+#define FSEG_FREE 12
+ /* list of free extents of this
+ segment */
+#define FSEG_NOT_FULL (12 + FLST_BASE_NODE_SIZE)
+ /* list of partially free extents */
+#define FSEG_FULL (12 + 2 * FLST_BASE_NODE_SIZE)
+ /* list of full extents */
+#define FSEG_MAGIC_N (12 + 3 * FLST_BASE_NODE_SIZE)
+ /* magic number used in debugging */
+#define FSEG_FRAG_ARR (16 + 3 * FLST_BASE_NODE_SIZE)
+ /* array of individual pages
+ belonging to this segment in fsp
+ fragment extent lists */
+#define FSEG_FRAG_ARR_N_SLOTS (FSP_EXTENT_SIZE / 2)
+ /* number of slots in the array for
+ the fragment pages */
+#define FSEG_FRAG_SLOT_SIZE 4 /* a fragment page slot contains its
+ page number within space, FIL_NULL
+ means that the slot is not in use */
+/*-------------------------------------*/
+#define FSEG_INODE_SIZE \
+ (16 + 3 * FLST_BASE_NODE_SIZE \
+ + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE)
+
+static constexpr uint32_t FSEG_MAGIC_N_VALUE= 97937874;
+
+#define FSEG_FILLFACTOR 8 /* If this value is x, then if
+ the number of unused but reserved
+ pages in a segment is less than
+ reserved pages * 1/x, and there are
+ at least FSEG_FRAG_LIMIT used pages,
+ then we allow a new empty extent to
+ be added to the segment in
+ fseg_alloc_free_page. Otherwise, we
+ use unused pages of the segment. */
+
+#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS
+ /* If the segment has >= this many
+ used pages, it may be expanded by
+ allocating extents to the segment;
+ until that only individual fragment
+ pages are allocated from the space */
+
+#define FSEG_FREE_LIST_LIMIT 40 /* If the reserved size of a segment
+ is at least this many extents, we
+ allow extents to be put to the free
+ list of the extent: at most
+ FSEG_FREE_LIST_MAX_LEN many */
+#define FSEG_FREE_LIST_MAX_LEN 4
+/* @} */
+
+/* @defgroup Extent Descriptor Constants (moved from fsp0fsp.c) @{ */
+
+/* EXTENT DESCRIPTOR
+ =================
+
+File extent descriptor data structure: contains bits to tell which pages in
+the extent are free and which contain old tuple versions to clean. */
+
+/*-------------------------------------*/
+#define XDES_ID 0 /* The identifier of the segment
+ to which this extent belongs */
+#define XDES_FLST_NODE 8 /* The list node data structure
+ for the descriptors */
+#define XDES_STATE (FLST_NODE_SIZE + 8)
+ /* contains state information
+ of the extent */
+#define XDES_BITMAP (FLST_NODE_SIZE + 12)
+ /* Descriptor bitmap of the pages
+ in the extent */
+/*-------------------------------------*/
+
+#define XDES_BITS_PER_PAGE 2 /* How many bits are there per page */
+#define XDES_FREE_BIT 0 /* Index of the bit which tells if
+ the page is free */
+#define XDES_CLEAN_BIT 1 /* NOTE: currently not used!
+ Index of the bit which tells if
+ there are old versions of tuples
+ on the page */
+/* States of a descriptor */
+#define XDES_FREE 1 /* extent is in free list of space */
+#define XDES_FREE_FRAG 2 /* extent is in free fragment list of
+ space */
+#define XDES_FULL_FRAG 3 /* extent is in full fragment list of
+ space */
+#define XDES_FSEG 4 /* extent belongs to a segment */
+
+/** File extent data structure size in bytes. */
+#define XDES_SIZE \
+ (XDES_BITMAP \
+ + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
+
+/** File extent data structure size in bytes for MAX page size. */
+#define XDES_SIZE_MAX \
+ (XDES_BITMAP \
+ + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MAX * XDES_BITS_PER_PAGE))
+
+/** File extent data structure size in bytes for MIN page size. */
+#define XDES_SIZE_MIN \
+ (XDES_BITMAP \
+ + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MIN * XDES_BITS_PER_PAGE))
+
+/** Offset of the descriptor array on a descriptor page */
+#define XDES_ARR_OFFSET (FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
+
+/**
+Determine if a page is marked free.
+@param[in] descr extent descriptor
+@param[in] offset page offset within extent
+@return whether the page is free */
+inline bool xdes_is_free(const xdes_t *descr, ulint offset)
+{
+ ut_ad(offset < FSP_EXTENT_SIZE);
+ ulint index= XDES_FREE_BIT + XDES_BITS_PER_PAGE * offset;
+ return ut_bit_get_nth(descr[XDES_BITMAP + (index >> 3)], index & 7);
+}
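+
+/* For example, for the page at offset 5 within its extent, the free bit
+is bit index XDES_FREE_BIT + 2 * 5 = 10, that is, bit 2 of the byte
+descr[XDES_BITMAP + 1]. */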
+
+#ifndef UNIV_INNOCHECKSUM
+/* @} */
+
+/** Read a tablespace header field.
+@param[in] page first page of a tablespace
+@param[in] field the header field
+@return the contents of the header field */
+inline uint32_t fsp_header_get_field(const page_t* page, ulint field)
+{
+ return mach_read_from_4(FSP_HEADER_OFFSET + field +
+ my_assume_aligned<UNIV_ZIP_SIZE_MIN>(page));
+}
+
+/** Read the flags from the tablespace header page.
+@param[in] page first page of a tablespace
+@return the contents of FSP_SPACE_FLAGS */
+inline uint32_t fsp_header_get_flags(const page_t *page)
+{
+ return fsp_header_get_field(page, FSP_SPACE_FLAGS);
+}
+
+/** Get the byte offset of encryption information in page 0.
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return byte offset relative to FSP_HEADER_OFFSET */
+inline MY_ATTRIBUTE((pure, warn_unused_result))
+ulint fsp_header_get_encryption_offset(ulint zip_size)
+{
+ return zip_size
+ ? XDES_ARR_OFFSET + XDES_SIZE * zip_size / FSP_EXTENT_SIZE
+ : XDES_ARR_OFFSET + (XDES_SIZE << srv_page_size_shift)
+ / FSP_EXTENT_SIZE;
+}
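+
+/* A worked example, assuming the default innodb_page_size=16k
+(srv_page_size_shift=14, FSP_EXTENT_SIZE=64, XDES_SIZE=40) and an
+uncompressed tablespace: the offset is
+XDES_ARR_OFFSET + (40 << 14) / 64 = XDES_ARR_OFFSET + 10240 bytes. */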
+
+/** Check the encryption key from the first page of a tablespace.
+@param[in] fsp_flags tablespace flags
+@param[in] page first page of a tablespace
+@return true if success */
+bool
+fsp_header_check_encryption_key(
+ ulint fsp_flags,
+ page_t* page);
+
+/**********************************************************************//**
+Writes the space id and flags to a tablespace header. The flags contain
+row type, physical/compressed page size, and logical/uncompressed page
+size of the tablespace. */
+void
+fsp_header_init_fields(
+/*===================*/
+ page_t* page, /*!< in/out: first page in the space */
+ ulint space_id, /*!< in: space id */
+ ulint flags); /*!< in: tablespace flags (FSP_SPACE_FLAGS):
+ 0, or table->flags if newer than COMPACT */
+/** Initialize a tablespace header.
+@param[in,out] space tablespace
+@param[in] size current size in blocks
+@param[in,out] mtr mini-transaction */
+void fsp_header_init(fil_space_t* space, uint32_t size, mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/** Create a new segment.
+@param space tablespace
+@param byte_offset byte offset of the created segment header
+@param mtr mini-transaction
+@param has_done_reservation whether fsp_reserve_free_extents() was invoked
+@param block block where segment header is placed,
+ or NULL to allocate an additional page for that
+@return the block where the segment header is placed, x-latched
+@retval NULL if could not create segment because of lack of space */
+buf_block_t*
+fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr,
+ bool has_done_reservation= false, buf_block_t *block= NULL);
+
+/** Calculate the number of pages reserved by a segment,
+and how many pages are currently used.
+@param[in] block buffer block containing the file segment header
+@param[in] header file segment header
+@param[out] used number of pages that are used (not more than reserved)
+@param[in,out] mtr mini-transaction
+@return number of reserved pages */
+ulint fseg_n_reserved_pages(const buf_block_t &block,
+ const fseg_header_t *header, ulint *used,
+ mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize
+file space fragmentation.
+@param[in,out] seg_header segment header
+@param[in] hint hint of which page would be desirable
+@param[in] direction if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR
+@param[in,out] mtr mini-transaction
+@return X-latched block, or NULL if no page could be allocated */
+#define fseg_alloc_free_page(seg_header, hint, direction, mtr) \
+ fseg_alloc_free_page_general(seg_header, hint, direction, \
+ false, mtr, mtr)
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@retval NULL if no page could be allocated */
+buf_block_t*
+fseg_alloc_free_page_general(
+/*=========================*/
+ fseg_header_t* seg_header,/*!< in/out: segment header */
+ uint32_t hint, /*!< in: hint of which page would be
+ desirable */
+ byte direction,/*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ bool has_done_reservation, /*!< in: true if the caller has
+ already done the reservation for the page
+ with fsp_reserve_free_extents, then there
+ is no need to do the check for this individual
+ page */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction
+ in which the page should be initialized. */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_t::release_free_extents()!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid a dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < FSP_EXTENT_SIZE pages are a special
+case. In this function we would liberally reserve several extents for
+every page split or merge in a B-tree. But we do not want to waste disk space
+if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply
+different rules in that special case, just ensuring that there are n_pages
+free pages available.
+
+@param[out] n_reserved number of extents actually reserved; if we
+ return true and the tablespace size is <
+ FSP_EXTENT_SIZE pages, then this can be 0,
+ otherwise it is n_ext
+@param[in,out] space tablespace
+@param[in] n_ext number of extents to reserve
+@param[in] alloc_type page reservation type (FSP_BLOB, etc)
+@param[in,out] mtr the mini transaction
+@param[in] n_pages for small tablespaces (tablespace size is
+ less than FSP_EXTENT_SIZE), number of free
+ pages to reserve.
+@return true if we were able to make the reservation */
+bool
+fsp_reserve_free_extents(
+ uint32_t* n_reserved,
+ fil_space_t* space,
+ uint32_t n_ext,
+ fsp_reserve_t alloc_type,
+ mtr_t* mtr,
+ uint32_t n_pages = 2);
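+
+/* A minimal sketch of the reserve/allocate/release cycle; space,
+seg_header, hint and mtr stand in for the caller's context:
+
+ uint32_t n_reserved;
+ if (fsp_reserve_free_extents(&n_reserved, space, 2, FSP_NORMAL, mtr)) {
+ buf_block_t* block = fseg_alloc_free_page(seg_header, hint,
+ FSP_NO_DIR, mtr);
+ // ... use block; it may still be NULL if the tablespace cannot grow
+ space->release_free_extents(n_reserved);
+ }
+*/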
+
+/** Free a page in a file segment.
+@param[in,out] seg_header file segment header
+@param[in,out] space tablespace
+@param[in] offset page number
+@param[in,out] mtr mini-transaction */
+void
+fseg_free_page(
+ fseg_header_t* seg_header,
+ fil_space_t* space,
+ uint32_t offset,
+ mtr_t* mtr);
+/** Determine whether a page is free.
+@param[in,out] space tablespace
+@param[in] page page number
+@return whether the page is marked as free */
+bool
+fseg_page_is_free(fil_space_t* space, unsigned page)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************************//**
+Frees part of a segment. This function can be used to free a segment
+by repeatedly calling this function in different mini-transactions.
+Doing the freeing in a single mini-transaction might result in
+too big a mini-transaction.
+@return whether the freeing was completed */
+bool
+fseg_free_step(
+ fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header
+ resides on the first page of the frag list
+ of the segment, this pointer becomes obsolete
+ after the last freeing step */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************************//**
+Frees part of a segment. Differs from fseg_free_step because this function
+leaves the header page unfreed.
+@return whether the freeing was completed, except for the header page */
+bool
+fseg_free_step_not_header(
+ fseg_header_t* header, /*!< in: segment header which must reside on
+ the first fragment page of the segment */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((warn_unused_result));
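+
+/* A minimal sketch of freeing a whole segment in small steps, so that no
+single mini-transaction becomes too big (header must remain valid until
+the last step, per the note above):
+
+ bool finished;
+ do {
+ mtr_t mtr;
+ mtr.start();
+ finished = fseg_free_step(header, &mtr);
+ mtr.commit();
+ } while (!finished);
+*/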
+
+/** Reset the page type.
+Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE.
+In MySQL 3.23.53, only undo log pages and index pages were tagged.
+Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
+@param[in] block block with invalid FIL_PAGE_TYPE
+@param[in] type expected page type
+@param[in,out] mtr mini-transaction */
+ATTRIBUTE_COLD
+void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr);
+
+/** Check (and if needed, reset) the page type.
+Data files created before MySQL 5.1.48 may contain
+garbage in the FIL_PAGE_TYPE field.
+In MySQL 3.23.53, only undo log pages and index pages were tagged.
+Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
+@param[in] block block with possibly invalid FIL_PAGE_TYPE
+@param[in] type expected page type
+@param[in,out] mtr mini-transaction */
+inline void
+fil_block_check_type(
+ const buf_block_t& block,
+ ulint type,
+ mtr_t* mtr)
+{
+ if (UNIV_UNLIKELY(type != fil_page_get_type(block.frame))) {
+ fil_block_reset_type(block, type, mtr);
+ }
+}
+
+/** Checks if a page address is an extent descriptor page address.
+@param[in] page_id page id
+@param[in] physical_size page size
+@return whether a descriptor page */
+inline bool fsp_descr_page(const page_id_t page_id, ulint physical_size)
+{
+ return (page_id.page_no() & (physical_size - 1)) == FSP_XDES_OFFSET;
+}
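+
+/* Note that physical_size is a byte count, but it also equals
+XDES_DESCRIBED_PER_PAGE, the number of pages covered by one descriptor
+page; for example, with 16 KiB pages the descriptor pages are page
+numbers 0, 16384, 32768, and so on. */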
+
+/** Initialize a file page whose prior contents should be ignored.
+@param[in,out] block buffer pool block */
+void fsp_apply_init_file_page(buf_block_t *block);
+
+/** Initialize a file page.
+@param[in] space tablespace
+@param[in,out] block file page
+@param[in,out] mtr mini-transaction */
+inline void fsp_init_file_page(
+#ifdef UNIV_DEBUG
+ const fil_space_t* space,
+#endif
+ buf_block_t* block, mtr_t* mtr)
+{
+ ut_d(space->modify_check(*mtr));
+ ut_ad(space->id == block->page.id().space());
+ fsp_apply_init_file_page(block);
+ mtr->init(block);
+}
+
+#ifndef UNIV_DEBUG
+# define fsp_init_file_page(space, block, mtr) fsp_init_file_page(block, mtr)
+#endif
+
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment. */
+void
+fseg_print(
+/*=======*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+#endif /* UNIV_BTR_PRINT */
+
+/** Convert FSP_SPACE_FLAGS from the buggy MariaDB 10.1.0..10.1.20 format.
+@param[in] flags the contents of FSP_SPACE_FLAGS
+@return the flags corrected from the buggy MariaDB 10.1 format
+@retval ULINT_UNDEFINED if the flags are not in the buggy 10.1 format */
+MY_ATTRIBUTE((warn_unused_result, const))
+UNIV_INLINE
+ulint
+fsp_flags_convert_from_101(ulint flags)
+{
+ DBUG_EXECUTE_IF("fsp_flags_is_valid_failure",
+ return(ULINT_UNDEFINED););
+ if (flags == 0 || fil_space_t::full_crc32(flags)) {
+ return(flags);
+ }
+
+ if (flags >> 18) {
+ /* The most significant FSP_SPACE_FLAGS bit that was ever set
+ by MariaDB 10.1.0 to 10.1.20 was bit 17 (misplaced DATA_DIR flag).
+ The flags must be less than 1<<18 in order to be valid. */
+ return(ULINT_UNDEFINED);
+ }
+
+ if ((flags & (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS))
+ == FSP_FLAGS_MASK_ATOMIC_BLOBS) {
+ /* If the "atomic blobs" flag (indicating
+ ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED) flag
+ is set, then the "post Antelope" (ROW_FORMAT!=REDUNDANT) flag
+ must also be set. */
+ return(ULINT_UNDEFINED);
+ }
+
+ /* Bits 6..10 denote compression in MariaDB 10.1.0 to 10.1.20.
+ They must be either 0b00000 or 0b00011 through 0b10011.
+ In correct versions, these bits would be
+ 0bd0sss where d is the DATA_DIR flag (garbage bit) and
+ sss is the PAGE_SSIZE (3, 4, 6, or 7).
+
+ NOTE: MariaDB 10.1.0 to 10.1.20 can misinterpret
+ uncompressed data files with innodb_page_size=4k or 64k as
+ compressed innodb_page_size=16k files. Below is an exhaustive
+ state space analysis.
+
+ -0by1zzz: impossible (bit 4 must be clear; see above)
+ -0b101xx: DATA_DIR, innodb_page_size>4k: invalid (COMPRESSION_LEVEL>9)
+ +0bx0011: innodb_page_size=4k:
+ !!! Misinterpreted as COMPRESSION_LEVEL=9 or 1, COMPRESSION=1.
+ -0bx0010: impossible, because sss must be 0b011 or 0b1xx
+ -0bx0001: impossible, because sss must be 0b011 or 0b1xx
+ -0b10000: DATA_DIR, innodb_page_size=16:
+ invalid (COMPRESSION_LEVEL=8 but COMPRESSION=0)
+ +0b00111: no DATA_DIR, innodb_page_size=64k:
+ !!! Misinterpreted as COMPRESSION_LEVEL=3, COMPRESSION=1.
+ -0b00101: impossible, because sss must be 0 for 16k, not 0b101
+ -0b001x0: no DATA_DIR, innodb_page_size=32k or 8k:
+ invalid (COMPRESSION_LEVEL=3 but COMPRESSION=0)
+ +0b00000: innodb_page_size=16k (looks like COMPRESSION=0)
+ ??? Could actually be compressed; see PAGE_SSIZE below */
+ const ulint level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL_MARIADB101(
+ flags);
+ if (FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) != (level != 0)
+ || level > 9) {
+ /* The compression flags are not in the buggy MariaDB
+ 10.1 format. */
+ return(ULINT_UNDEFINED);
+ }
+ if (!(~flags & FSP_FLAGS_MASK_ATOMIC_WRITES_MARIADB101)) {
+ /* The ATOMIC_WRITES flags cannot be 0b11.
+ (The bits 11..12 should actually never be 0b11,
+ because in MySQL they would be SHARED|TEMPORARY.) */
+ return(ULINT_UNDEFINED);
+ }
+
+ /* Bits 13..16 are the wrong position for PAGE_SSIZE, and they
+ should contain one of the values 3,4,6,7, that is, be of the form
+ 0b0011 or 0b01xx (except 0b0101).
+ In correct versions, these bits should be 0bc0se
+ where c is the MariaDB COMPRESSED flag
+ and e is the MySQL 5.7 ENCRYPTION flag
+ and s is the MySQL 8.0 SDI flag. MariaDB can only support s=0, e=0.
+
+ Compressed innodb_page_size=16k tables with correct FSP_SPACE_FLAGS
+ will be properly rejected by older MariaDB 10.1.x because they
+ would read as PAGE_SSIZE>=8 which is not valid. */
+
+ const ulint ssize = FSP_FLAGS_GET_PAGE_SSIZE_MARIADB101(flags);
+ if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8) {
+ /* the page_size is not between 4k and 64k;
+ 16k should be encoded as 0, not 5 */
+ return(ULINT_UNDEFINED);
+ }
+ const ulint zssize = FSP_FLAGS_GET_ZIP_SSIZE(flags);
+ if (zssize == 0) {
+ /* not ROW_FORMAT=COMPRESSED */
+ } else if (zssize > (ssize ? ssize : 5)) {
+ /* invalid KEY_BLOCK_SIZE */
+ return(ULINT_UNDEFINED);
+ } else if (~flags & (FSP_FLAGS_MASK_POST_ANTELOPE
+ | FSP_FLAGS_MASK_ATOMIC_BLOBS)) {
+ /* both these flags should be set for
+ ROW_FORMAT=COMPRESSED */
+ return(ULINT_UNDEFINED);
+ }
+
+ flags = ((flags & 0x3f) | ssize << FSP_FLAGS_POS_PAGE_SSIZE
+ | FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags)
+ << FSP_FLAGS_POS_PAGE_COMPRESSION);
+ ut_ad(fil_space_t::is_valid_flags(flags, false));
+ return(flags);
+}
+
+/** Compare tablespace flags.
+@param[in] expected expected flags from dict_tf_to_fsp_flags()
+@param[in] actual flags read from FSP_SPACE_FLAGS
+@return whether the flags match */
+MY_ATTRIBUTE((warn_unused_result))
+UNIV_INLINE
+bool
+fsp_flags_match(ulint expected, ulint actual)
+{
+ expected &= ~FSP_FLAGS_MEM_MASK;
+ ut_ad(fil_space_t::is_valid_flags(expected, false));
+
+ if (actual == expected) {
+ return(true);
+ }
+
+ actual = fsp_flags_convert_from_101(actual);
+ return(actual == expected);
+}
+
+/** Determine the descriptor index within a descriptor page.
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] offset page offset
+@return descriptor index */
+inline ulint xdes_calc_descriptor_index(ulint zip_size, ulint offset)
+{
+ return ut_2pow_remainder<ulint>(offset,
+ zip_size ? zip_size : srv_page_size)
+ / FSP_EXTENT_SIZE;
+}
+
+/** Determine the descriptor page number for a page.
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] offset page offset
+@return descriptor page offset */
+inline uint32_t xdes_calc_descriptor_page(ulint zip_size, uint32_t offset)
+{
+ compile_time_assert(UNIV_PAGE_SIZE_MAX > XDES_ARR_OFFSET
+ + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX)
+ * XDES_SIZE_MAX);
+ compile_time_assert(UNIV_PAGE_SIZE_MIN > XDES_ARR_OFFSET
+ + (UNIV_PAGE_SIZE_MIN / FSP_EXTENT_SIZE_MIN)
+ * XDES_SIZE_MIN);
+
+ ut_ad(srv_page_size > XDES_ARR_OFFSET
+ + (srv_page_size / FSP_EXTENT_SIZE)
+ * XDES_SIZE);
+ ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET
+ + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE)
+ * XDES_SIZE);
+ ut_ad(!zip_size
+ || zip_size > XDES_ARR_OFFSET
+ + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE);
+ return ut_2pow_round(offset,
+ uint32_t(zip_size ? zip_size : srv_page_size));
+}
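+
+/* A worked example, assuming innodb_page_size=16k and zip_size=0: for
+page offset 100000, xdes_calc_descriptor_page() returns
+ut_2pow_round(100000, 16384) = 98304, and xdes_calc_descriptor_index()
+returns (100000 - 98304) / 64 = 26. */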
+
+#endif /* UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/fsp0space.h b/storage/innobase/include/fsp0space.h
new file mode 100644
index 00000000..c00c8d68
--- /dev/null
+++ b/storage/innobase/include/fsp0space.h
@@ -0,0 +1,242 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0space.h
+Shared tablespace interface
+
+Created 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#ifndef fsp0space_h
+#define fsp0space_h
+
+#include "fsp0file.h"
+#include "fsp0fsp.h"
+#include "fsp0types.h"
+
+#include <vector>
+
+/** Data structure that contains the information about shared tablespaces.
+Currently this can be the system tablespace or a temporary table tablespace */
+class Tablespace {
+
+public:
+ typedef std::vector<Datafile, ut_allocator<Datafile> > files_t;
+
+ /** Data file information - each Datafile can be accessed globally */
+ files_t m_files;
+ /** Data file iterator */
+ typedef files_t::iterator iterator;
+ /** Data file iterator */
+ typedef files_t::const_iterator const_iterator;
+
+ Tablespace()
+ :
+ m_files(),
+ m_name(),
+ m_space_id(ULINT_UNDEFINED),
+ m_path(),
+ m_flags(),
+ m_ignore_read_only(false)
+ {
+ /* No op */
+ }
+
+ virtual ~Tablespace()
+ {
+ shutdown();
+ ut_ad(m_files.empty());
+ ut_ad(m_space_id == ULINT_UNDEFINED);
+ }
+
+ // Disable copying
+ Tablespace(const Tablespace&);
+ Tablespace& operator=(const Tablespace&);
+
+ /** Data file iterator */
+ const_iterator begin() const { return m_files.begin(); }
+ /** Data file iterator */
+ const_iterator end() const { return m_files.end(); }
+ /** Data file iterator */
+ iterator begin() { return m_files.begin(); }
+ /** Data file iterator */
+ iterator end() { return m_files.end(); }
+
+ void set_name(const char* name) { m_name = name; }
+ const char* name() const { return m_name; }
+
+ /** Set tablespace path and filename members.
+ @param[in] path where tablespace file(s) resides
+ @param[in] len length of the file path */
+ void set_path(const char* path, size_t len)
+ {
+ ut_ad(m_path == NULL);
+ m_path = mem_strdupl(path, len);
+ ut_ad(m_path != NULL);
+
+ os_normalize_path(m_path);
+ }
+
+ /** Set tablespace path and filename members.
+ @param[in] path where tablespace file(s) resides */
+ void set_path(const char* path)
+ {
+ set_path(path, strlen(path));
+ }
+
+ /** Get tablespace path
+ @return tablespace path */
+ const char* path() const
+ {
+ return(m_path);
+ }
+
+ /** Set the space id of the tablespace
+ @param[in] space_id tablespace ID to set */
+ void set_space_id(ulint space_id)
+ {
+ ut_ad(m_space_id == ULINT_UNDEFINED);
+ m_space_id = space_id;
+ }
+
+ /** Get the space id of the tablespace
+ @return m_space_id space id of the tablespace */
+ ulint space_id() const
+ {
+ return(m_space_id);
+ }
+
+ /** Set the tablespace flags
+ @param[in] fsp_flags tablespace flags */
+ void set_flags(ulint fsp_flags)
+ {
+ ut_ad(fil_space_t::is_valid_flags(fsp_flags, false));
+ m_flags = fsp_flags;
+ }
+
+ /** Get the tablespace flags
+ @return m_flags tablespace flags */
+ ulint flags() const
+ {
+ return(m_flags);
+ }
+
+ /** Get the tablespace encryption mode
+ @return m_mode tablespace encryption mode */
+ fil_encryption_t encryption_mode() const
+ {
+ return (m_mode);
+ }
+
+ /** Get the tablespace encryption key_id
+ @return m_key_id tablespace encryption key_id */
+ uint32_t key_id() const
+ {
+ return (m_key_id);
+ }
+
+ /** Set Ignore Read Only Status for tablespace.
+ @param[in] read_only_status read only status indicator */
+ void set_ignore_read_only(bool read_only_status)
+ {
+ m_ignore_read_only = read_only_status;
+ }
+
+ /** Free the memory allocated by the Tablespace object */
+ void shutdown();
+
+ /** @return the sum of the file sizes of each Datafile */
+ uint32_t get_sum_of_sizes() const
+ {
+ uint32_t sum = 0;
+
+ for (const_iterator it = begin(); it != end(); ++it) {
+ sum += it->m_size;
+ }
+
+ return(sum);
+ }
+
+ /** Open or Create the data files if they do not exist.
+ @param[in] is_temp whether this is a temporary tablespace
+ @return DB_SUCCESS or error code */
+ dberr_t open_or_create(bool is_temp)
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Delete all the data files. */
+ void delete_files();
+
+ /** Check if two tablespaces have common data file names.
+ @param[in] other_space Tablespace to check against this.
+ @return true if they have the same data filenames and paths */
+ bool intersection(const Tablespace* other_space);
+
+ /** Use the ADD DATAFILE path to create a Datafile object and add
+ it to the front of m_files. Parse the datafile path into a path
+ and a basename with extension 'ibd'. This datafile_path provided
+ may be an absolute or relative path, but it must end with the
+ extension .ibd and have a basename of at least 1 byte.
+
+ Set tablespace m_path member and add a Datafile with the filename.
+ @param[in] datafile_path full path of the tablespace file
+ @return DB_SUCCESS or error code */
+ dberr_t add_datafile(
+ const char* datafile_path);
+
+ /** Return a pointer to the first Datafile for this Tablespace
+ @return pointer to the first Datafile for this Tablespace */
+ Datafile* first_datafile()
+ {
+ ut_a(!m_files.empty());
+ return(&m_files.front());
+ }
+private:
+ /**
+ @param[in] filename Name to lookup in the data files.
+ @return true if the filename exists in the data files */
+ bool find(const char* filename) const;
+
+ /** Note that the data file was found.
+ @param[in] file data file object */
+ void file_found(Datafile& file);
+
+ /* DATA MEMBERS */
+
+ /** Name of the tablespace. */
+ const char* m_name;
+
+ /** Tablespace ID */
+ ulint m_space_id;
+
+ /** Path where tablespace files will reside, not including a filename.*/
+ char* m_path;
+
+ /** Tablespace flags */
+ ulint m_flags;
+
+ /** Encryption mode and key_id */
+ fil_encryption_t m_mode;
+ uint32_t m_key_id;
+
+protected:
+ /** Ignore server read only configuration for this tablespace. */
+ bool m_ignore_read_only;
+};
+
+#endif /* fsp0space_h */
diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h
new file mode 100644
index 00000000..2e0a395f
--- /dev/null
+++ b/storage/innobase/include/fsp0sysspace.h
@@ -0,0 +1,289 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0sysspace.h
+Multi file, shared, system tablespace implementation.
+
+Created 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#ifndef fsp0sysspace_h
+#define fsp0sysspace_h
+
+#include "fsp0space.h"
+
+/** If the last data file is auto-extended, we add this many megabytes to it
+at a time. We have to make this public because it is a config variable. */
+extern uint sys_tablespace_auto_extend_increment;
+
+/** Data structure that contains the information about shared tablespaces.
+Currently this can be the system tablespace or a temporary table tablespace */
+class SysTablespace : public Tablespace
+{
+public:
+
+ SysTablespace()
+ :
+ m_auto_extend_last_file(),
+ m_last_file_size_max(),
+ m_created_new_raw(),
+ m_is_tablespace_full(false),
+ m_sanity_checks_done(false)
+ {
+ /* No op */
+ }
+
+ ~SysTablespace() override
+ {
+ shutdown();
+ }
+
+ /** Set tablespace full status
+ @param[in] is_full true if full */
+ void set_tablespace_full_status(bool is_full)
+ {
+ m_is_tablespace_full = is_full;
+ }
+
+ /** Get tablespace full status
+ @return true if the tablespace is full */
+ bool get_tablespace_full_status()
+ {
+ return(m_is_tablespace_full);
+ }
+
+ /** Set sanity check status
+ @param[in] status true if sanity checks are done */
+ void set_sanity_check_status(bool status)
+ {
+ m_sanity_checks_done = status;
+ }
+
+ /** Get sanity check status
+ @return true if sanity checks are done */
+ bool get_sanity_check_status()
+ {
+ return(m_sanity_checks_done);
+ }
+
+ /** Parse the input parameters and populate member variables.
+ @param filepath path to data files
+ @param supports_raw true if it supports raw devices
+ @return true on successful parse */
+ bool parse_params(const char* filepath, bool supports_raw);
+
+ /** Check the data file specification.
+ @param[out] create_new_db true if a new database
+ is to be created
+ @param[in] min_expected_tablespace_size expected tablespace
+ size in bytes
+ @return DB_SUCCESS if all OK else error code */
+ dberr_t check_file_spec(
+ bool* create_new_db,
+ ulint min_expected_tablespace_size);
+
+ /** Free the memory allocated by parse_params() */
+ void shutdown();
+
+ /** Normalize the file size, convert to extents. */
+ void normalize_size();
+
+ /**
+ @return true if a new raw device was created. */
+ bool created_new_raw() const
+ {
+ return(m_created_new_raw);
+ }
+
+ /**
+ @return auto_extend value setting */
+ ulint can_auto_extend_last_file() const
+ {
+ return(m_auto_extend_last_file);
+ }
+
+ /** Set the last file size.
+ @param[in] size the size to set */
+ void set_last_file_size(uint32_t size)
+ {
+ ut_ad(!m_files.empty());
+ m_files.back().m_size = size;
+ }
+
+ /** Get the size of the last data file in the tablespace
+ @return the size of the last data file in the array */
+ uint32_t last_file_size() const
+ {
+ ut_ad(!m_files.empty());
+ return(m_files.back().m_size);
+ }
+
+ /**
+ @return the autoextend increment in pages. */
+ uint32_t get_autoextend_increment() const
+ {
+ return sys_tablespace_auto_extend_increment
+ << (20 - srv_page_size_shift);
+ }
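+
+ /* For example, with the default innodb_page_size=16k
+ (srv_page_size_shift=14) one megabyte is 1 << (20 - 14) = 64
+ pages, so an increment setting of 64 (megabytes) yields
+ 64 * 64 = 4096 pages. */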
+
+ /**
+ @return next increment size */
+ uint32_t get_increment() const;
+
+ /** Open or create the data files
+ @param[in] is_temp whether this is a temporary tablespace
+ @param[in] create_new_db whether we are creating a new database
+ @param[out] sum_new_sizes sum of sizes of the new files added
+ @param[out] flush_lsn FIL_PAGE_FILE_FLUSH_LSN of first file
+ @return DB_SUCCESS or error code */
+ dberr_t open_or_create(
+ bool is_temp,
+ bool create_new_db,
+ ulint* sum_new_sizes,
+ lsn_t* flush_lsn)
+ MY_ATTRIBUTE((warn_unused_result));
+
+private:
+ /** Check the tablespace header for this tablespace.
+ @param[out] flushed_lsn the value of FIL_PAGE_FILE_FLUSH_LSN
+ @return DB_SUCCESS or error code */
+ dberr_t read_lsn_and_check_flags(lsn_t* flushed_lsn);
+
+ /**
+ @return true if the last file size is valid. */
+ bool is_valid_size() const
+ {
+ return(m_last_file_size_max >= last_file_size());
+ }
+
+ /**
+ @return true if configured to use raw devices */
+ bool has_raw_device();
+
+ /** Note that the data file was not found.
+ @param[in] file data file object
+ @param[out] create_new_db true if a new instance to be created
+ @return DB_SUCCESS or error code */
+ dberr_t file_not_found(Datafile& file, bool* create_new_db);
+
+ /** Note that the data file was found.
+ @param[in,out] file data file object
+ @return true if a new instance to be created */
+ bool file_found(Datafile& file);
+
+ /** Create a data file.
+ @param[in,out] file data file object
+ @return DB_SUCCESS or error code */
+ dberr_t create(Datafile& file);
+
+ /** Create a data file.
+ @param[in,out] file data file object
+ @return DB_SUCCESS or error code */
+ dberr_t create_file(Datafile& file);
+
+ /** Open a data file.
+ @param[in,out] file data file object
+ @return DB_SUCCESS or error code */
+ dberr_t open_file(Datafile& file);
+
+ /** Set the size of the file.
+ @param[in,out] file data file object
+ @return DB_SUCCESS or error code */
+ dberr_t set_size(Datafile& file);
+
+ /** Convert a numeric string that optionally ends in G or M to a
+ number of megabytes.
+ @param[in] ptr string with a quantity in bytes
+ @param[out] megs the number in megabytes
+ @return next character in string */
+ static char* parse_units(char* ptr, ulint* megs);
+
+private:
+ enum file_status_t {
+ FILE_STATUS_VOID = 0, /*!< status not set */
+ FILE_STATUS_RW_PERMISSION_ERROR,/*!< permission error */
+ FILE_STATUS_READ_WRITE_ERROR, /*!< not readable/writable */
+ FILE_STATUS_NOT_REGULAR_FILE_ERROR /*!< not a regular file */
+ };
+
+ /** Verify the size of the physical file
+ @param[in] file data file object
+ @return DB_SUCCESS if OK else error code. */
+ dberr_t check_size(Datafile& file);
+
+ /** Check if a file can be opened in the correct mode.
+ @param[in,out] file data file object
+ @param[out] reason exact reason if file_status check failed.
+ @return DB_SUCCESS or error code. */
+ dberr_t check_file_status(
+ const Datafile& file,
+ file_status_t& reason);
+
+ /* DATA MEMBERS */
+
+ /** if true, then we auto-extend the last data file */
+ bool m_auto_extend_last_file;
+
+ /** maximum size of the last data file (0=unlimited) */
+ ulint m_last_file_size_max;
+
+ /** If the following is true we do not allow
+ inserts etc. This protects the user from forgetting
+ the 'newraw' keyword in my.cnf */
+ bool m_created_new_raw;
+
+ /** Tablespace full status */
+ bool m_is_tablespace_full;
+
+ /** if false, then sanity checks are still pending */
+ bool m_sanity_checks_done;
+};
+
+/* GLOBAL OBJECTS */
+
+/** The control info of the system tablespace. */
+extern SysTablespace srv_sys_space;
+
+/** The control info of a temporary table shared tablespace. */
+extern SysTablespace srv_tmp_space;
+
+/** Check if the space_id is for a system-tablespace (shared + temp).
+@param[in] id Space ID to check
+@return true if id is a system tablespace, false if not. */
+UNIV_INLINE
+bool
+is_system_tablespace(ulint id)
+{
+ return(id == TRX_SYS_SPACE || id == SRV_TMP_SPACE_ID);
+}
+
+/** Check if predefined shared tablespace.
+@return true if predefined shared tablespace */
+UNIV_INLINE
+bool
+is_predefined_tablespace(
+ ulint id)
+{
+ ut_ad(srv_sys_space.space_id() == TRX_SYS_SPACE);
+ ut_ad(TRX_SYS_SPACE == 0);
+ return(id == TRX_SYS_SPACE
+ || id == SRV_TMP_SPACE_ID
+ || srv_is_undo_tablespace(id));
+}
+#endif /* fsp0sysspace_h */
diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h
new file mode 100644
index 00000000..f8e4c06b
--- /dev/null
+++ b/storage/innobase/include/fsp0types.h
@@ -0,0 +1,405 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************
+@file include/fsp0types.h
+File space management types
+
+Created May 26, 2009 Vasil Dimov
+*******************************************************/
+
+#pragma once
+#include <cstddef>
+
+/** The fil_space_t::id of the redo log. All persistent tablespaces
+have a smaller fil_space_t::id. */
+static constexpr size_t SRV_SPACE_ID_UPPER_BOUND= 0xFFFFFFF0;
+/** The fil_space_t::id of the innodb_temporary tablespace. */
+#define SRV_TMP_SPACE_ID 0xFFFFFFFEU
+
+#include "ut0byte.h"
+
+/* Possible values of innodb_compression_algorithm */
+#define PAGE_UNCOMPRESSED 0
+#define PAGE_ZLIB_ALGORITHM 1
+#define PAGE_LZ4_ALGORITHM 2
+#define PAGE_LZO_ALGORITHM 3
+#define PAGE_LZMA_ALGORITHM 4
+#define PAGE_BZIP2_ALGORITHM 5
+#define PAGE_SNAPPY_ALGORITHM 6
+#define PAGE_ALGORITHM_LAST PAGE_SNAPPY_ALGORITHM
+
+/** @name Flags for inserting records in order
+If records are inserted in order, there are the following
+flags to tell this (their type is made byte for the compiler
+to warn if direction and hint parameters are switched in
+fseg_alloc_free_page) */
+/* @{ */
+#define FSP_UP ((byte)111) /*!< alphabetically upwards */
+#define FSP_DOWN ((byte)112) /*!< alphabetically downwards */
+#define FSP_NO_DIR ((byte)113) /*!< no order */
+/* @} */
+
+/** File space extent size in pages
+page size | file space extent size
+----------+-----------------------
+ 4 KiB | 256 pages = 1 MiB
+ 8 KiB | 128 pages = 1 MiB
+ 16 KiB | 64 pages = 1 MiB
+ 32 KiB | 64 pages = 2 MiB
+ 64 KiB | 64 pages = 4 MiB
+*/
+#define FSP_EXTENT_SIZE (srv_page_size_shift < 14 ? \
+ (1048576U >> srv_page_size_shift) : 64U)
+
+/** File space extent size (four megabyte) in pages for MAX page size */
+#define FSP_EXTENT_SIZE_MAX (4194304 / UNIV_PAGE_SIZE_MAX)
+
+/** File space extent size (one megabyte) in pages for MIN page size */
+#define FSP_EXTENT_SIZE_MIN (1048576 / UNIV_PAGE_SIZE_MIN)
+
+/** On a page of any file segment, data may be put starting from this
+offset */
+#define FSEG_PAGE_DATA FIL_PAGE_DATA
+
+/** @name File segment header
+The file segment header points to the inode describing the file segment. */
+/* @{ */
+/** Data type for file segment header */
+typedef byte fseg_header_t;
+
+#define FSEG_HDR_SPACE 0 /*!< space id of the inode */
+#define FSEG_HDR_PAGE_NO 4 /*!< page number of the inode */
+#define FSEG_HDR_OFFSET 8 /*!< byte offset of the inode */
+
+#define FSEG_HEADER_SIZE 10 /*!< Length of the file system
+ header, in bytes */
+/* @} */
+
+#ifndef UNIV_INNOCHECKSUM
+#ifdef UNIV_DEBUG
+
+struct mtr_t;
+
+/** A wrapper class to print the file segment header information. */
+class fseg_header
+{
+public:
+ /** Constructor of fseg_header.
+ @param[in] header the underlying file segment header object
+ @param[in] mtr the mini-transaction. No redo logs are
+ generated, only latches are checked within
+ mini-transaction */
+ fseg_header(
+ const fseg_header_t* header,
+ mtr_t* mtr)
+ :
+ m_header(header),
+ m_mtr(mtr)
+ {}
+
+ /** Print the file segment header to the given output stream.
+ @param[in,out] out the output stream into which the object
+ is printed.
+ @return the output stream into which the object was printed. */
+ std::ostream&
+ to_stream(std::ostream& out) const;
+private:
+ /** The underlying file segment header */
+ const fseg_header_t* m_header;
+
+ /** The mini transaction, which is used mainly to check whether
+ appropriate latches have been taken by the calling thread. */
+ mtr_t* m_mtr;
+};
+
+/** Overload the global output operator to print a file segment header
+@param[in,out] out the output stream into which object will be printed
+@param[in] header the file segment header to be printed
+@return the output stream */
+inline
+std::ostream&
+operator<<(
+ std::ostream& out,
+ const fseg_header& header)
+{
+ return(header.to_stream(out));
+}
+#endif /* UNIV_DEBUG */
+
+/** Flags for fsp_reserve_free_extents */
+enum fsp_reserve_t {
+ FSP_NORMAL, /* reservation during normal B-tree operations */
+ FSP_UNDO, /* reservation done for undo logging */
+ FSP_CLEANING, /* reservation done during purge operations */
+ FSP_BLOB /* reservation being done for BLOB insertion */
+};
+
+/* Number of pages described in a single descriptor page: currently each page
+description takes less than 1 byte; a descriptor page is repeated every
+this many file pages */
+/* #define XDES_DESCRIBED_PER_PAGE srv_page_size */
+/* This has been replaced with either srv_page_size or page_zip->size. */
+
+/** @name The space low address page map
+The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated
+every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */
+/* @{ */
+/*--------------------------------------*/
+#define FSP_XDES_OFFSET 0U /*!< extent descriptor */
+#define FSP_IBUF_BITMAP_OFFSET 1U /*!< insert buffer bitmap */
+ /* The ibuf bitmap pages are the ones whose
+ page number is the number above plus a
+ multiple of XDES_DESCRIBED_PER_PAGE */
+
+#define FSP_FIRST_INODE_PAGE_NO 2U /*!< in every tablespace */
+ /* The following pages exist
+ in the system tablespace (space 0). */
+#define FSP_IBUF_HEADER_PAGE_NO 3U /*!< insert buffer
+ header page, in
+ tablespace 0 */
+#define FSP_IBUF_TREE_ROOT_PAGE_NO 4U /*!< insert buffer
+ B-tree root page in
+ tablespace 0 */
+ /* The ibuf tree root page number in
+ tablespace 0; its fseg inode is on the page
+ number FSP_FIRST_INODE_PAGE_NO */
+#define FSP_TRX_SYS_PAGE_NO 5U /*!< transaction
+ system header, in
+ tablespace 0 */
+#define FSP_FIRST_RSEG_PAGE_NO 6U /*!< first rollback segment
+ page, in tablespace 0 */
+#define FSP_DICT_HDR_PAGE_NO 7U /*!< data dictionary header
+ page, in tablespace 0 */
+/*--------------------------------------*/
+/* @} */
+
+/** Check if a tablespace is the system temporary tablespace.
+@param[in] space_id tablespace ID to check
+@return true if the tablespace is system temporary. */
+inline
+bool
+fsp_is_system_temporary(ulint space_id)
+{
+ return(space_id == SRV_TMP_SPACE_ID);
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/* @defgroup fsp_flags InnoDB Tablespace Flag Constants @{ */
+
+/** Width of the POST_ANTELOPE flag */
+#define FSP_FLAGS_WIDTH_POST_ANTELOPE 1
+/** Number of flag bits used to indicate the tablespace zip page size */
+#define FSP_FLAGS_WIDTH_ZIP_SSIZE 4
+/** Width of the ATOMIC_BLOBS flag. The ability to break up a long
+column into an in-record prefix and an externally stored part is available
+to ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. */
+#define FSP_FLAGS_WIDTH_ATOMIC_BLOBS 1
+/** Number of flag bits used to indicate the tablespace page size */
+#define FSP_FLAGS_WIDTH_PAGE_SSIZE 4
+/** Number of reserved bits */
+#define FSP_FLAGS_WIDTH_RESERVED 6
+/** Number of flag bits used to indicate the page compression */
+#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION 1
+
+/** Width of all the currently known persistent tablespace flags */
+#define FSP_FLAGS_WIDTH (FSP_FLAGS_WIDTH_POST_ANTELOPE \
+ + FSP_FLAGS_WIDTH_ZIP_SSIZE \
+ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS \
+ + FSP_FLAGS_WIDTH_PAGE_SSIZE \
+ + FSP_FLAGS_WIDTH_RESERVED \
+ + FSP_FLAGS_WIDTH_PAGE_COMPRESSION)
+
+/** A mask of all the known/used bits in FSP_SPACE_FLAGS */
+#define FSP_FLAGS_MASK (~(~0U << FSP_FLAGS_WIDTH))
+
+/** Number of flag bits used to indicate the tablespace page size */
+#define FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE 4
+
+/** Marker to indicate whether tablespace is in full checksum format. */
+#define FSP_FLAGS_FCRC32_WIDTH_MARKER 1
+
+/** Stores the compressed algo for full checksum format. */
+#define FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO 3
+
+/* FSP_SPACE_FLAGS position and name in MySQL 5.6/MariaDB 10.0 or older,
+in MariaDB 10.1.0 through 10.1.20, and in MariaDB 10.1.21 or newer.
+MySQL 5.6 MariaDB 10.1.x MariaDB 10.1.21
+====================================================================
+Below flags in same offset
+====================================================================
+0: POST_ANTELOPE 0:POST_ANTELOPE 0: POST_ANTELOPE
+1..4: ZIP_SSIZE(0..5) 1..4:ZIP_SSIZE(0..5) 1..4: ZIP_SSIZE(0..5)
+(NOTE: bit 4 is always 0)
+5: ATOMIC_BLOBS 5:ATOMIC_BLOBS 5: ATOMIC_BLOBS
+=====================================================================
+Below note the order difference:
+=====================================================================
+6..9: PAGE_SSIZE(3..7) 6: COMPRESSION 6..9: PAGE_SSIZE(3..7)
+10: DATA_DIR 7..10: COMP_LEVEL(0..9) 10: RESERVED (5.6 DATA_DIR)
+=====================================================================
+The flags below were in incorrect position in MariaDB 10.1,
+or have been introduced in MySQL 5.7 or 8.0:
+=====================================================================
+11: UNUSED 11..12:ATOMIC_WRITES 11: RESERVED (5.7 SHARED)
+ 12: RESERVED (5.7 TEMPORARY)
+ 13..15:PAGE_SSIZE(3..7) 13: RESERVED (5.7 ENCRYPTION)
+ 14: RESERVED (8.0 SDI)
+ 15: RESERVED
+ 16: PAGE_SSIZE_msb(0) 16: COMPRESSION
+ 17: DATA_DIR 17: UNUSED
+ 18: UNUSED
+=====================================================================
+The flags below only exist in fil_space_t::flags, not in FSP_SPACE_FLAGS:
+=====================================================================
+ 27: DATA_DIR
+ 28..31: COMPRESSION_LEVEL
+*/
+
+/** A mask of the memory-only flags in fil_space_t::flags */
+#define FSP_FLAGS_MEM_MASK (~0U << FSP_FLAGS_MEM_DATA_DIR)
+
+/** Zero relative shift position of the DATA_DIR flag */
+#define FSP_FLAGS_MEM_DATA_DIR 27
+/** Zero relative shift position of the COMPRESSION_LEVEL field */
+#define FSP_FLAGS_MEM_COMPRESSION_LEVEL 28
+
+/** Zero relative shift position of the POST_ANTELOPE field */
+#define FSP_FLAGS_POS_POST_ANTELOPE 0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define FSP_FLAGS_POS_ZIP_SSIZE (FSP_FLAGS_POS_POST_ANTELOPE \
+ + FSP_FLAGS_WIDTH_POST_ANTELOPE)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_POS_ATOMIC_BLOBS (FSP_FLAGS_POS_ZIP_SSIZE \
+ + FSP_FLAGS_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the start of the PAGE_SSIZE bits */
+#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_BLOBS \
+ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the start of the RESERVED bits;
+these are only used in MySQL 5.7 and are kept for compatibility. */
+#define FSP_FLAGS_POS_RESERVED (FSP_FLAGS_POS_PAGE_SSIZE \
+ + FSP_FLAGS_WIDTH_PAGE_SSIZE)
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_POS_PAGE_COMPRESSION (FSP_FLAGS_POS_RESERVED \
+ + FSP_FLAGS_WIDTH_RESERVED)
+
+/** Zero relative shift position of the PAGE_SIZE field
+in full crc32 format */
+#define FSP_FLAGS_FCRC32_POS_PAGE_SSIZE 0
+
+/** Zero relative shift position of the MARKER field in full crc32 format. */
+#define FSP_FLAGS_FCRC32_POS_MARKER (FSP_FLAGS_FCRC32_POS_PAGE_SSIZE \
+ + FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE)
+
+/** Zero relative shift position of the compressed algorithm stored
+in full crc32 format. */
+#define FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO (FSP_FLAGS_FCRC32_POS_MARKER \
+ + FSP_FLAGS_FCRC32_WIDTH_MARKER)
+
+/** Bit mask of the POST_ANTELOPE field */
+#define FSP_FLAGS_MASK_POST_ANTELOPE \
+ ((~(~0U << FSP_FLAGS_WIDTH_POST_ANTELOPE)) \
+ << FSP_FLAGS_POS_POST_ANTELOPE)
+/** Bit mask of the ZIP_SSIZE field */
+#define FSP_FLAGS_MASK_ZIP_SSIZE \
+ ((~(~0U << FSP_FLAGS_WIDTH_ZIP_SSIZE)) \
+ << FSP_FLAGS_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_MASK_ATOMIC_BLOBS \
+ ((~(~0U << FSP_FLAGS_WIDTH_ATOMIC_BLOBS)) \
+ << FSP_FLAGS_POS_ATOMIC_BLOBS)
+/** Bit mask of the PAGE_SSIZE field */
+#define FSP_FLAGS_MASK_PAGE_SSIZE \
+ ((~(~0U << FSP_FLAGS_WIDTH_PAGE_SSIZE)) \
+ << FSP_FLAGS_POS_PAGE_SSIZE)
+/** Bit mask of the RESERVED1 field */
+#define FSP_FLAGS_MASK_RESERVED \
+ ((~(~0U << FSP_FLAGS_WIDTH_RESERVED)) \
+ << FSP_FLAGS_POS_RESERVED)
+/** Bit mask of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_MASK_PAGE_COMPRESSION \
+ ((~(~0U << FSP_FLAGS_WIDTH_PAGE_COMPRESSION)) \
+ << FSP_FLAGS_POS_PAGE_COMPRESSION)
+
+/** Bit mask of the in-memory COMPRESSION_LEVEL field */
+#define FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL \
+ (15U << FSP_FLAGS_MEM_COMPRESSION_LEVEL)
+
+/** Bit mask of the PAGE_SIZE field in full crc32 format */
+#define FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE \
+ ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE)) \
+ << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE)
+
+/** Bit mask of the MARKER field in full crc32 format */
+#define FSP_FLAGS_FCRC32_MASK_MARKER \
+ ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_MARKER)) \
+ << FSP_FLAGS_FCRC32_POS_MARKER)
+
+/** Bit mask of the COMPRESSED ALGO field in full crc32 format */
+#define FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO \
+ ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO)) \
+ << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO)
+
+/** Return the value of the POST_ANTELOPE field */
+#define FSP_FLAGS_GET_POST_ANTELOPE(flags) \
+ ((flags & FSP_FLAGS_MASK_POST_ANTELOPE) \
+ >> FSP_FLAGS_POS_POST_ANTELOPE)
+/** Return the value of the ZIP_SSIZE field */
+#define FSP_FLAGS_GET_ZIP_SSIZE(flags) \
+ ((flags & FSP_FLAGS_MASK_ZIP_SSIZE) \
+ >> FSP_FLAGS_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_HAS_ATOMIC_BLOBS(flags) \
+ ((flags & FSP_FLAGS_MASK_ATOMIC_BLOBS) \
+ >> FSP_FLAGS_POS_ATOMIC_BLOBS)
+/** Return the value of the PAGE_SSIZE field */
+#define FSP_FLAGS_GET_PAGE_SSIZE(flags) \
+ ((flags & FSP_FLAGS_MASK_PAGE_SSIZE) \
+ >> FSP_FLAGS_POS_PAGE_SSIZE)
+/** @return the RESERVED flags */
+#define FSP_FLAGS_GET_RESERVED(flags) \
+ ((flags & FSP_FLAGS_MASK_RESERVED) \
+ >> FSP_FLAGS_POS_RESERVED)
+/** @return the PAGE_COMPRESSION flag */
+#define FSP_FLAGS_HAS_PAGE_COMPRESSION(flags) \
+ ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \
+ >> FSP_FLAGS_POS_PAGE_COMPRESSION)
+/** @return the PAGE_SSIZE flags in full crc32 format */
+#define FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags) \
+ ((flags & FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE) \
+ >> FSP_FLAGS_FCRC32_POS_PAGE_SSIZE)
+/** @return the COMPRESSED_ALGO flags in full crc32 format */
+#define FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags) \
+ ((flags & FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO) \
+ >> FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO)
+
+/** @return the value of the DATA_DIR field */
+#define FSP_FLAGS_HAS_DATA_DIR(flags) \
+ (flags & 1U << FSP_FLAGS_MEM_DATA_DIR)
+/** @return the COMPRESSION_LEVEL field */
+#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \
+ ((flags & FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL) \
+ >> FSP_FLAGS_MEM_COMPRESSION_LEVEL)
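+
+/* A worked example of decoding the non-full_crc32 format: flags=41
+(0b101001) has POST_ANTELOPE=1, ZIP_SSIZE=4 (KEY_BLOCK_SIZE 512 << 4
+= 8 KiB) and ATOMIC_BLOBS=1, i.e. ROW_FORMAT=COMPRESSED with
+KEY_BLOCK_SIZE=8 and the default innodb_page_size=16k (PAGE_SSIZE=0). */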
+
+/* @} */
+
+struct fil_node_t;
+struct fil_space_t;
+class buf_page_t;
diff --git a/storage/innobase/include/fts0ast.h b/storage/innobase/include/fts0ast.h
new file mode 100644
index 00000000..15bf30bc
--- /dev/null
+++ b/storage/innobase/include/fts0ast.h
@@ -0,0 +1,340 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0ast.h
+The FTS query parser (AST) abstract syntax tree routines
+
+Created 2007/03/16/03 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FST0AST_H
+#define INNOBASE_FST0AST_H
+
+#include "mem0mem.h"
+
+/* The type of AST Node */
+enum fts_ast_type_t {
+ FTS_AST_OPER, /*!< Operator */
+ FTS_AST_NUMB, /*!< Number */
+ FTS_AST_TERM, /*!< Term (or word) */
+ FTS_AST_TEXT, /*!< Text string */
+ FTS_AST_PARSER_PHRASE_LIST, /*!< Phrase for the plugin parser.
+ The difference from the text type
+ is that we tokenize the text into
+ a term list */
+ FTS_AST_LIST, /*!< Expression list */
+ FTS_AST_SUBEXP_LIST /*!< Sub-Expression list */
+};
+
+/* The FTS query operators that we support */
+enum fts_ast_oper_t {
+ FTS_NONE, /*!< No operator */
+
+ FTS_IGNORE, /*!< Ignore rows that contain
+ this word */
+
+ FTS_EXIST, /*!< Include rows that contain
+ this word */
+
+ FTS_NEGATE, /*!< Include rows that contain
+ this word but rank them
+ lower */
+
+ FTS_INCR_RATING, /*!< Increase the rank for this
+ word */
+
+ FTS_DECR_RATING, /*!< Decrease the rank for this
+ word */
+
+ FTS_DISTANCE, /*!< Proximity distance */
+ FTS_IGNORE_SKIP, /*!< Transient node operator
+ signifies that this is an
+ FTS_IGNORE node, and is ignored
+ in the first pass of
+ fts_ast_visit() */
+ FTS_EXIST_SKIP /*!< Transient node operator
+ signifies that this is an
+ FTS_EXIST node, and is ignored
+ in the first pass of
+ fts_ast_visit() */
+};
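+
+/* These operators correspond roughly to InnoDB boolean full-text query
+syntax ('+word' -> FTS_EXIST, '-word' -> FTS_IGNORE, '~word' -> FTS_NEGATE,
+'>word' -> FTS_INCR_RATING, '<word' -> FTS_DECR_RATING, '"a b" @n' ->
+FTS_DISTANCE); the exact grammar lives in the fts0pars.y and fts0blex.l
+parser sources, not in this header. */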
+
+/* Data types used by the FTS parser */
+struct fts_lexer_t;
+struct fts_ast_node_t;
+struct fts_ast_state_t;
+struct fts_ast_string_t;
+
+typedef dberr_t (*fts_ast_callback)(fts_ast_oper_t, fts_ast_node_t*, void*);
+
+/********************************************************************
+Parse the string using the lexer setup within state.*/
+int
+fts_parse(
+/*======*/
+ /* out: 0 on OK, 1 on error */
+ fts_ast_state_t* state); /*!< in: ast state instance.*/
+
+/********************************************************************
+Create an AST operator node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+ void* arg, /*!< in: ast state */
+ fts_ast_oper_t oper); /*!< in: ast operator */
+/********************************************************************
+Create an AST term node, makes a copy of ptr */
+extern
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+ void* arg, /*!< in: ast state */
+ const fts_ast_string_t* ptr); /*!< in: term string */
+/********************************************************************
+Create an AST text node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_text(
+/*=====================*/
+ void* arg, /*!< in: ast state */
+ const fts_ast_string_t* ptr); /*!< in: text string */
+/********************************************************************
+Create an AST expr list node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_list(
+/*=====================*/
+ void* arg, /*!< in: ast state */
+ fts_ast_node_t* expr); /*!< in: ast expr */
+/********************************************************************
+Create a sub-expression list node. This function takes ownership of
+expr and is responsible for deleting it. */
+extern
+fts_ast_node_t*
+fts_ast_create_node_subexp_list(
+/*============================*/
+ /* out: new node */
+ void* arg, /*!< in: ast state instance */
+ fts_ast_node_t* expr); /*!< in: ast expr instance */
+/********************************************************************
+Set the wildcard attribute of a term.*/
+extern
+void
+fts_ast_term_set_wildcard(
+/*======================*/
+ fts_ast_node_t* node); /*!< in: term to change */
+/********************************************************************
+Set the proximity attribute of a text node. */
+void
+fts_ast_text_set_distance(
+/*======================*/
+ fts_ast_node_t* node, /*!< in/out: text node */
+ ulint distance); /*!< in: the text proximity
+ distance */
+/********************************************************************//**
+Free a fts_ast_node_t instance.
+@return next node to free */
+fts_ast_node_t*
+fts_ast_free_node(
+/*==============*/
+ fts_ast_node_t* node); /*!< in: node to free */
+/********************************************************************
+Add a sub-expression to an AST*/
+extern
+fts_ast_node_t*
+fts_ast_add_node(
+/*=============*/
+ fts_ast_node_t* list, /*!< in: list node instance */
+ fts_ast_node_t* node); /*!< in: (sub) expr to add */
+/********************************************************************
+Print the AST node recursively.*/
+extern
+void
+fts_ast_node_print(
+/*===============*/
+ fts_ast_node_t* node); /*!< in: ast node to print */
+/********************************************************************
+Free node and expr allocations.*/
+extern
+void
+fts_ast_state_free(
+/*===============*/
+ fts_ast_state_t*state); /*!< in: state instance
+ to free */
+/** Check only union operation involved in the node
+@param[in] node ast node to check
+@return true if the node contains only union else false. */
+bool
+fts_ast_node_check_union(
+ fts_ast_node_t* node);
+
+/******************************************************************//**
+Traverse the AST - in-order traversal.
+@return DB_SUCCESS if all went well */
+dberr_t
+fts_ast_visit(
+/*==========*/
+ fts_ast_oper_t oper, /*!< in: FTS operator */
+ fts_ast_node_t* node, /*!< in: instance to traverse*/
+ fts_ast_callback visitor, /*!< in: callback */
+ void* arg, /*!< in: callback arg */
+ bool* has_ignore) /*!< out: whether we encountered
+ and ignored an operator;
+ currently we only ignore the
+ FTS_IGNORE operator */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************
+Create a lex instance.*/
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+ ibool boolean_mode, /*!< in: query type */
+ const byte* query, /*!< in: query string */
+ ulint query_len) /*!< in: query string len */
+ MY_ATTRIBUTE((nonnull, malloc, warn_unused_result));
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+fts_lexer_free(
+/*===========*/
+ fts_lexer_t* fts_lexer) /*!< in: lexer instance to
+ free */
+ MY_ATTRIBUTE((nonnull));
+
+/**
+Create an ast string object with a NUL terminator, so the string
+occupies one more byte than len
+@param[in] str pointer to string
+@param[in] len length of the string
+@return ast string with NUL-terminator */
+fts_ast_string_t*
+fts_ast_string_create(
+ const byte* str,
+ ulint len);
+
+/**
+Free an ast string instance
+@param[in,out] ast_str string to free */
+void
+fts_ast_string_free(
+ fts_ast_string_t* ast_str);
+
+/**
+Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul
+@param[in]	ast_str	string to translate
+@param[in] base the base
+@return translated number */
+ulint
+fts_ast_string_to_ul(
+ const fts_ast_string_t* ast_str,
+ int base);
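+/* Illustrative lifecycle sketch, not part of the original header: the
+extra byte allocated by fts_ast_string_create() holds the NUL
+terminator, and fts_ast_string_to_ul() behaves like strtoul():
+
+	fts_ast_string_t* s = fts_ast_string_create(
+		reinterpret_cast<const byte*>("42"), 2);
+
+	ulint	val = fts_ast_string_to_ul(s, 10);	// val == 42
+
+	fts_ast_string_free(s);
+*/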
+
+/* String of length len.
+We always store the string of length len with a terminating '\0',
+regardless of whether there is any 0x00 in the string itself */
+struct fts_ast_string_t {
+	byte*		str;	/*!< Pointer to string. */
+
+	ulint		len;	/*!< Length of the string. */
+};
+
+/* Query term type */
+struct fts_ast_term_t {
+ fts_ast_string_t* ptr; /*!< Pointer to term string.*/
+ ibool wildcard; /*!< TRUE if wild card set.*/
+};
+
+/* Query text type */
+struct fts_ast_text_t {
+ fts_ast_string_t* ptr; /*!< Pointer to text string.*/
+ ulint distance; /*!< > 0 if proximity distance
+ set */
+};
+
+/* The list of nodes in an expr list */
+struct fts_ast_list_t {
+ fts_ast_node_t* head; /*!< Children list head */
+ fts_ast_node_t* tail; /*!< Children list tail */
+};
+
+/* FTS AST node to store the term, text, operator and sub-expressions.*/
+struct fts_ast_node_t {
+ fts_ast_type_t type; /*!< The type of node */
+ fts_ast_text_t text; /*!< Text node */
+ fts_ast_term_t term; /*!< Term node */
+ fts_ast_oper_t oper; /*!< Operator value */
+ fts_ast_list_t list; /*!< Expression list */
+ fts_ast_node_t* next; /*!< Link for expr list */
+ fts_ast_node_t* next_alloc; /*!< For tracking allocations */
+ bool visited; /*!< whether this node is
+ already processed */
+ /** current transaction */
+ const trx_t* trx;
+ /* Used by plugin parser */
+ fts_ast_node_t* up_node; /*!< Direct up node */
+ bool go_up; /*!< Flag if go one level up */
+};
+
+/* To track state during parsing */
+struct fts_ast_state_t {
+ mem_heap_t* heap; /*!< Heap to use for alloc */
+ fts_ast_node_t* root; /*!< If all goes OK, then this
+ will point to the root.*/
+
+ fts_ast_list_t list; /*!< List of nodes allocated */
+
+ fts_lexer_t* lexer; /*!< Lexer callback + arg */
+ CHARSET_INFO* charset; /*!< charset used for
+ tokenization */
+ /* Used by plugin parser */
+ fts_ast_node_t* cur_node; /*!< Current node into which
+ we add new node */
+ int depth; /*!< Depth of parsing state */
+};
+
+/******************************************************************//**
+Create an AST term node, making a copy of ptr, for the plugin parser
+@return node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_term_for_parser(
+/*===================================*/
+ void* arg, /*!< in: ast state */
+ const char* ptr, /*!< in: term string */
+ const ulint len); /*!< in: term string length */
+
+/******************************************************************//**
+Create an AST phrase list node for plugin parser
+@return node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_phrase_list(
+/*============================*/
+ void* arg); /*!< in: ast state */
+
+#ifdef UNIV_DEBUG
+const char*
+fts_ast_node_type_get(fts_ast_type_t type);
+#endif /* UNIV_DEBUG */
+
+#endif /* INNOBASE_FSTS0AST_H */
diff --git a/storage/innobase/include/fts0blex.h b/storage/innobase/include/fts0blex.h
new file mode 100644
index 00000000..b16e7f2c
--- /dev/null
+++ b/storage/innobase/include/fts0blex.h
@@ -0,0 +1,702 @@
+#ifndef fts0bHEADER_H
+#define fts0bHEADER_H 1
+#define fts0bIN_HEADER 1
+
+#line 6 "../include/fts0blex.h"
+
+#line 8 "../include/fts0blex.h"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 6
+#define YY_FLEX_SUBMINOR_VERSION 4
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+#ifdef yy_create_buffer
+#define fts0b_create_buffer_ALREADY_DEFINED
+#else
+#define yy_create_buffer fts0b_create_buffer
+#endif
+
+#ifdef yy_delete_buffer
+#define fts0b_delete_buffer_ALREADY_DEFINED
+#else
+#define yy_delete_buffer fts0b_delete_buffer
+#endif
+
+#ifdef yy_scan_buffer
+#define fts0b_scan_buffer_ALREADY_DEFINED
+#else
+#define yy_scan_buffer fts0b_scan_buffer
+#endif
+
+#ifdef yy_scan_string
+#define fts0b_scan_string_ALREADY_DEFINED
+#else
+#define yy_scan_string fts0b_scan_string
+#endif
+
+#ifdef yy_scan_bytes
+#define fts0b_scan_bytes_ALREADY_DEFINED
+#else
+#define yy_scan_bytes fts0b_scan_bytes
+#endif
+
+#ifdef yy_init_buffer
+#define fts0b_init_buffer_ALREADY_DEFINED
+#else
+#define yy_init_buffer fts0b_init_buffer
+#endif
+
+#ifdef yy_flush_buffer
+#define fts0b_flush_buffer_ALREADY_DEFINED
+#else
+#define yy_flush_buffer fts0b_flush_buffer
+#endif
+
+#ifdef yy_load_buffer_state
+#define fts0b_load_buffer_state_ALREADY_DEFINED
+#else
+#define yy_load_buffer_state fts0b_load_buffer_state
+#endif
+
+#ifdef yy_switch_to_buffer
+#define fts0b_switch_to_buffer_ALREADY_DEFINED
+#else
+#define yy_switch_to_buffer fts0b_switch_to_buffer
+#endif
+
+#ifdef yypush_buffer_state
+#define fts0bpush_buffer_state_ALREADY_DEFINED
+#else
+#define yypush_buffer_state fts0bpush_buffer_state
+#endif
+
+#ifdef yypop_buffer_state
+#define fts0bpop_buffer_state_ALREADY_DEFINED
+#else
+#define yypop_buffer_state fts0bpop_buffer_state
+#endif
+
+#ifdef yyensure_buffer_stack
+#define fts0bensure_buffer_stack_ALREADY_DEFINED
+#else
+#define yyensure_buffer_stack fts0bensure_buffer_stack
+#endif
+
+#ifdef yylex
+#define fts0blex_ALREADY_DEFINED
+#else
+#define yylex fts0blex
+#endif
+
+#ifdef yyrestart
+#define fts0brestart_ALREADY_DEFINED
+#else
+#define yyrestart fts0brestart
+#endif
+
+#ifdef yylex_init
+#define fts0blex_init_ALREADY_DEFINED
+#else
+#define yylex_init fts0blex_init
+#endif
+
+#ifdef yylex_init_extra
+#define fts0blex_init_extra_ALREADY_DEFINED
+#else
+#define yylex_init_extra fts0blex_init_extra
+#endif
+
+#ifdef yylex_destroy
+#define fts0blex_destroy_ALREADY_DEFINED
+#else
+#define yylex_destroy fts0blex_destroy
+#endif
+
+#ifdef yyget_debug
+#define fts0bget_debug_ALREADY_DEFINED
+#else
+#define yyget_debug fts0bget_debug
+#endif
+
+#ifdef yyset_debug
+#define fts0bset_debug_ALREADY_DEFINED
+#else
+#define yyset_debug fts0bset_debug
+#endif
+
+#ifdef yyget_extra
+#define fts0bget_extra_ALREADY_DEFINED
+#else
+#define yyget_extra fts0bget_extra
+#endif
+
+#ifdef yyset_extra
+#define fts0bset_extra_ALREADY_DEFINED
+#else
+#define yyset_extra fts0bset_extra
+#endif
+
+#ifdef yyget_in
+#define fts0bget_in_ALREADY_DEFINED
+#else
+#define yyget_in fts0bget_in
+#endif
+
+#ifdef yyset_in
+#define fts0bset_in_ALREADY_DEFINED
+#else
+#define yyset_in fts0bset_in
+#endif
+
+#ifdef yyget_out
+#define fts0bget_out_ALREADY_DEFINED
+#else
+#define yyget_out fts0bget_out
+#endif
+
+#ifdef yyset_out
+#define fts0bset_out_ALREADY_DEFINED
+#else
+#define yyset_out fts0bset_out
+#endif
+
+#ifdef yyget_leng
+#define fts0bget_leng_ALREADY_DEFINED
+#else
+#define yyget_leng fts0bget_leng
+#endif
+
+#ifdef yyget_text
+#define fts0bget_text_ALREADY_DEFINED
+#else
+#define yyget_text fts0bget_text
+#endif
+
+#ifdef yyget_lineno
+#define fts0bget_lineno_ALREADY_DEFINED
+#else
+#define yyget_lineno fts0bget_lineno
+#endif
+
+#ifdef yyset_lineno
+#define fts0bset_lineno_ALREADY_DEFINED
+#else
+#define yyset_lineno fts0bset_lineno
+#endif
+
+#ifdef yyget_column
+#define fts0bget_column_ALREADY_DEFINED
+#else
+#define yyget_column fts0bget_column
+#endif
+
+#ifdef yyset_column
+#define fts0bset_column_ALREADY_DEFINED
+#else
+#define yyset_column fts0bset_column
+#endif
+
+#ifdef yywrap
+#define fts0bwrap_ALREADY_DEFINED
+#else
+#define yywrap fts0bwrap
+#endif
+
+#ifdef yyalloc
+#define fts0balloc_ALREADY_DEFINED
+#else
+#define yyalloc fts0balloc
+#endif
+
+#ifdef yyrealloc
+#define fts0brealloc_ALREADY_DEFINED
+#else
+#define yyrealloc fts0brealloc
+#endif
+
+#ifdef yyfree
+#define fts0bfree_ALREADY_DEFINED
+#else
+#define yyfree fts0bfree
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#ifndef SIZE_MAX
+#define SIZE_MAX (~(size_t)0)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+/* begin standard C++ headers. */
+
+/* TODO: this is always defined, so inline it */
+#define yyconst const
+
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define yynoreturn __attribute__((__noreturn__))
+#else
+#define yynoreturn
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+ are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+ {
+ FILE *yy_input_file;
+
+ char *yy_ch_buf; /* input buffer */
+ char *yy_buf_pos; /* current position in input buffer */
+
+ /* Size of input buffer in bytes, not including room for EOB
+ * characters.
+ */
+ int yy_buf_size;
+
+ /* Number of characters read into yy_ch_buf, not including EOB
+ * characters.
+ */
+ int yy_n_chars;
+
+ /* Whether we "own" the buffer - i.e., we know we created it,
+ * and can realloc() it to grow it, and should free() it to
+ * delete it.
+ */
+ int yy_is_our_buffer;
+
+ /* Whether this is an "interactive" input source; if so, and
+ * if we're using stdio for input, then we want to use getc()
+ * instead of fread(), to make sure we stop fetching input after
+ * each newline.
+ */
+ int yy_is_interactive;
+
+ /* Whether we're considered to be at the beginning of a line.
+ * If so, '^' rules will be active on the next match, otherwise
+ * not.
+ */
+ int yy_at_bol;
+
+ int yy_bs_lineno; /**< The line count. */
+ int yy_bs_column; /**< The column count. */
+
+ /* Whether to try to fill the input buffer when we reach the
+ * end of it.
+ */
+ int yy_fill_buffer;
+
+ int yy_buffer_status;
+
+ };
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void yyrestart ( FILE *input_file , yyscan_t yyscanner );
+void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner );
+void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+void yypop_buffer_state ( yyscan_t yyscanner );
+
+YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner );
+
+void *yyalloc ( yy_size_t , yyscan_t yyscanner );
+void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner );
+void yyfree ( void * , yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define fts0bwrap(yyscanner) (/*CONSTCOND*/1)
+#define YY_SKIP_YYWRAP
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int yylex_init (yyscan_t* scanner);
+
+int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner);
+
+/* Accessor methods to globals.
+ These are made visible to non-reentrant scanners for convenience. */
+
+int yylex_destroy ( yyscan_t yyscanner );
+
+int yyget_debug ( yyscan_t yyscanner );
+
+void yyset_debug ( int debug_flag , yyscan_t yyscanner );
+
+YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner );
+
+void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner );
+
+FILE *yyget_in ( yyscan_t yyscanner );
+
+void yyset_in ( FILE * _in_str , yyscan_t yyscanner );
+
+FILE *yyget_out ( yyscan_t yyscanner );
+
+void yyset_out ( FILE * _out_str , yyscan_t yyscanner );
+
+ int yyget_leng ( yyscan_t yyscanner );
+
+char *yyget_text ( yyscan_t yyscanner );
+
+int yyget_lineno ( yyscan_t yyscanner );
+
+void yyset_lineno ( int _line_number , yyscan_t yyscanner );
+
+int yyget_column ( yyscan_t yyscanner );
+
+void yyset_column ( int _column_no , yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap ( yyscan_t yyscanner );
+#else
+extern int yywrap ( yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen ( const char * , yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int yylex (yyscan_t yyscanner);
+
+#define YY_DECL int yylex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#ifndef fts0b_create_buffer_ALREADY_DEFINED
+#undef yy_create_buffer
+#endif
+#ifndef fts0b_delete_buffer_ALREADY_DEFINED
+#undef yy_delete_buffer
+#endif
+#ifndef fts0b_scan_buffer_ALREADY_DEFINED
+#undef yy_scan_buffer
+#endif
+#ifndef fts0b_scan_string_ALREADY_DEFINED
+#undef yy_scan_string
+#endif
+#ifndef fts0b_scan_bytes_ALREADY_DEFINED
+#undef yy_scan_bytes
+#endif
+#ifndef fts0b_init_buffer_ALREADY_DEFINED
+#undef yy_init_buffer
+#endif
+#ifndef fts0b_flush_buffer_ALREADY_DEFINED
+#undef yy_flush_buffer
+#endif
+#ifndef fts0b_load_buffer_state_ALREADY_DEFINED
+#undef yy_load_buffer_state
+#endif
+#ifndef fts0b_switch_to_buffer_ALREADY_DEFINED
+#undef yy_switch_to_buffer
+#endif
+#ifndef fts0bpush_buffer_state_ALREADY_DEFINED
+#undef yypush_buffer_state
+#endif
+#ifndef fts0bpop_buffer_state_ALREADY_DEFINED
+#undef yypop_buffer_state
+#endif
+#ifndef fts0bensure_buffer_stack_ALREADY_DEFINED
+#undef yyensure_buffer_stack
+#endif
+#ifndef fts0blex_ALREADY_DEFINED
+#undef yylex
+#endif
+#ifndef fts0brestart_ALREADY_DEFINED
+#undef yyrestart
+#endif
+#ifndef fts0blex_init_ALREADY_DEFINED
+#undef yylex_init
+#endif
+#ifndef fts0blex_init_extra_ALREADY_DEFINED
+#undef yylex_init_extra
+#endif
+#ifndef fts0blex_destroy_ALREADY_DEFINED
+#undef yylex_destroy
+#endif
+#ifndef fts0bget_debug_ALREADY_DEFINED
+#undef yyget_debug
+#endif
+#ifndef fts0bset_debug_ALREADY_DEFINED
+#undef yyset_debug
+#endif
+#ifndef fts0bget_extra_ALREADY_DEFINED
+#undef yyget_extra
+#endif
+#ifndef fts0bset_extra_ALREADY_DEFINED
+#undef yyset_extra
+#endif
+#ifndef fts0bget_in_ALREADY_DEFINED
+#undef yyget_in
+#endif
+#ifndef fts0bset_in_ALREADY_DEFINED
+#undef yyset_in
+#endif
+#ifndef fts0bget_out_ALREADY_DEFINED
+#undef yyget_out
+#endif
+#ifndef fts0bset_out_ALREADY_DEFINED
+#undef yyset_out
+#endif
+#ifndef fts0bget_leng_ALREADY_DEFINED
+#undef yyget_leng
+#endif
+#ifndef fts0bget_text_ALREADY_DEFINED
+#undef yyget_text
+#endif
+#ifndef fts0bget_lineno_ALREADY_DEFINED
+#undef yyget_lineno
+#endif
+#ifndef fts0bset_lineno_ALREADY_DEFINED
+#undef yyset_lineno
+#endif
+#ifndef fts0bget_column_ALREADY_DEFINED
+#undef yyget_column
+#endif
+#ifndef fts0bset_column_ALREADY_DEFINED
+#undef yyset_column
+#endif
+#ifndef fts0bwrap_ALREADY_DEFINED
+#undef yywrap
+#endif
+#ifndef fts0bget_lval_ALREADY_DEFINED
+#undef yyget_lval
+#endif
+#ifndef fts0bset_lval_ALREADY_DEFINED
+#undef yyset_lval
+#endif
+#ifndef fts0bget_lloc_ALREADY_DEFINED
+#undef yyget_lloc
+#endif
+#ifndef fts0bset_lloc_ALREADY_DEFINED
+#undef yyset_lloc
+#endif
+#ifndef fts0balloc_ALREADY_DEFINED
+#undef yyalloc
+#endif
+#ifndef fts0brealloc_ALREADY_DEFINED
+#undef yyrealloc
+#endif
+#ifndef fts0bfree_ALREADY_DEFINED
+#undef yyfree
+#endif
+#ifndef fts0btext_ALREADY_DEFINED
+#undef yytext
+#endif
+#ifndef fts0bleng_ALREADY_DEFINED
+#undef yyleng
+#endif
+#ifndef fts0bin_ALREADY_DEFINED
+#undef yyin
+#endif
+#ifndef fts0bout_ALREADY_DEFINED
+#undef yyout
+#endif
+#ifndef fts0b_flex_debug_ALREADY_DEFINED
+#undef yy_flex_debug
+#endif
+#ifndef fts0blineno_ALREADY_DEFINED
+#undef yylineno
+#endif
+#ifndef fts0btables_fload_ALREADY_DEFINED
+#undef yytables_fload
+#endif
+#ifndef fts0btables_destroy_ALREADY_DEFINED
+#undef yytables_destroy
+#endif
+#ifndef fts0bTABLES_NAME_ALREADY_DEFINED
+#undef yyTABLES_NAME
+#endif
+
+#line 74 "fts0blex.l"
+
+
+#line 701 "../include/fts0blex.h"
+#undef fts0bIN_HEADER
+#endif /* fts0bHEADER_H */
diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h
new file mode 100644
index 00000000..906ece2e
--- /dev/null
+++ b/storage/innobase/include/fts0fts.h
@@ -0,0 +1,976 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0fts.h
+Full text search header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#pragma once
+
+#include "data0type.h"
+#include "data0types.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+#include "ut0rbt.h"
+#include "ut0wqueue.h"
+#include "que0types.h"
+#include "ft_global.h"
+#include "mysql/plugin_ftparser.h"
+
+/** "NULL" value of a document id. */
+#define FTS_NULL_DOC_ID 0
+
+/** FTS hidden column that is used to map to and from the row */
+#define FTS_DOC_ID_COL_NAME "FTS_DOC_ID"
+
+/** The name of the index created by FTS */
+#define FTS_DOC_ID_INDEX_NAME "FTS_DOC_ID_INDEX"
+
+#define FTS_DOC_ID_INDEX_NAME_LEN 16
+
+/** Doc ID is a 8 byte value */
+#define FTS_DOC_ID_LEN 8
+
+/** The number of fields to sort when we build an FT index with
+FIC. Three fields are sorted: (word, doc_id, position) */
+#define FTS_NUM_FIELDS_SORT 3
+
+/** Maximum number of rows in a table, smaller than which, we will
+optimize using a 4 byte Doc ID for FIC merge sort to reduce sort size */
+#define MAX_DOC_ID_OPT_VAL 1073741824
+
+/** Document id type. */
+typedef ib_id_t doc_id_t;
+
+/** doc_id_t printf format */
+#define FTS_DOC_ID_FORMAT IB_ID_FMT
+
+/** Convert document id to the InnoDB (BIG ENDIAN) storage format. */
+#define fts_write_doc_id(d, s) mach_write_to_8(d, s)
+
+/** Read a document id to internal format. */
+#define fts_read_doc_id(s) mach_read_from_8(s)
+
+/** Bind the doc id to a variable */
+#define fts_bind_doc_id(i, n, v) pars_info_bind_int8_literal(i, n, v)
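+
+/* Illustrative round trip (a sketch, not part of the original header):
+the storage format is big endian, so a write followed by a read is the
+identity:
+
+	byte		buf[FTS_DOC_ID_LEN];
+	doc_id_t	doc_id = 42;
+
+	fts_write_doc_id(buf, doc_id);
+	ut_ad(fts_read_doc_id(buf) == doc_id);
+*/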
+
+/** Defines for FTS query mode, they have the same values as
+those defined in mysql file ft_global.h */
+#define FTS_NL 0
+#define FTS_BOOL 1
+#define FTS_SORTED 2
+#define FTS_EXPAND 4
+#define FTS_NO_RANKING 8
+#define FTS_PROXIMITY 16
+#define FTS_PHRASE 32
+#define FTS_OPT_RANKING 64
+
+#define FTS_INDEX_TABLE_IND_NAME "FTS_INDEX_TABLE_IND"
+
+/** The number of FTS index partitions for a fulltext index */
+#define FTS_NUM_AUX_INDEX 6
+
+/** Threshold where our optimize thread automatically kicks in */
+#define FTS_OPTIMIZE_THRESHOLD 10000000
+
+/** Threshold to avoid exhausting doc ids. The difference between
+consecutive doc ids should not exceed FTS_DOC_ID_MAX_STEP */
+#define FTS_DOC_ID_MAX_STEP 65535
+
+/** Maximum possible Fulltext word length in bytes (assuming mbmaxlen=4) */
+#define FTS_MAX_WORD_LEN (HA_FT_MAXCHARLEN * 4)
+
+/** Maximum possible Fulltext word length (in characters) */
+#define FTS_MAX_WORD_LEN_IN_CHAR HA_FT_MAXCHARLEN
+
+/** Number of columns in FTS AUX Tables */
+#define FTS_DELETED_TABLE_NUM_COLS 1
+#define FTS_CONFIG_TABLE_NUM_COLS 2
+#define FTS_AUX_INDEX_TABLE_NUM_COLS 5
+
+/** DELETED_TABLE(doc_id BIGINT UNSIGNED) */
+#define FTS_DELETED_TABLE_COL_LEN 8
+/** CONFIG_TABLE(key CHAR(50), value CHAR(200)) */
+#define FTS_CONFIG_TABLE_KEY_COL_LEN 50
+#define FTS_CONFIG_TABLE_VALUE_COL_LEN 200
+
+#define FTS_INDEX_FIRST_DOC_ID_LEN 8
+#define FTS_INDEX_LAST_DOC_ID_LEN 8
+#define FTS_INDEX_DOC_COUNT_LEN 4
+/* BLOB COLUMN, 0 means VARIABLE SIZE */
+#define FTS_INDEX_ILIST_LEN 0
+
+
+/** Variable specifying the FTS parallel sort degree */
+extern ulong fts_sort_pll_degree;
+
+/** Variable specifying the number of words to optimize for each optimize
+table call */
+extern ulong fts_num_word_optimize;
+
+/** Variable specifying whether we do additional FTS diagnostic printout
+in the log */
+extern char fts_enable_diag_print;
+
+/** FTS rank type, which will be between 0 .. 1 inclusive */
+typedef float fts_rank_t;
+
+/** Type of a row during a transaction. FTS_NOTHING means the row can be
+forgotten from the FTS system's POV; FTS_INVALID is an internal value used
+to mark invalid states.
+
+NOTE: Do not change the order or value of these, fts_trx_row_get_new_state
+depends on them being exactly as they are. */
+enum fts_row_state {
+ FTS_INSERT = 0,
+ FTS_MODIFY,
+ FTS_DELETE,
+ FTS_NOTHING,
+ FTS_INVALID
+};
+
+/** The FTS table types. */
+enum fts_table_type_t {
+ FTS_INDEX_TABLE, /*!< FTS auxiliary table that is
+ specific to a particular FTS index
+ on a table */
+
+	FTS_COMMON_TABLE	/*!< FTS auxiliary table that is common
+				for all FTS indexes on a table */
+};
+
+struct fts_doc_t;
+struct fts_cache_t;
+struct fts_token_t;
+struct fts_doc_ids_t;
+struct fts_index_cache_t;
+
+
+/** Initialize the "fts_table" for internal query into FTS auxiliary
+tables */
+#define FTS_INIT_FTS_TABLE(fts_table, m_suffix, m_type, m_table)\
+do { \
+ (fts_table)->suffix = m_suffix; \
+ (fts_table)->type = m_type; \
+ (fts_table)->table_id = m_table->id; \
+ (fts_table)->table = m_table; \
+} while (0)
+
+#define FTS_INIT_INDEX_TABLE(fts_table, m_suffix, m_type, m_index)\
+do { \
+ (fts_table)->suffix = m_suffix; \
+ (fts_table)->type = m_type; \
+ (fts_table)->table_id = m_index->table->id; \
+ (fts_table)->table = m_index->table; \
+ (fts_table)->index_id = m_index->id; \
+} while (0)
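+
+/* Illustrative usage sketch (assumes a dict_table_t* "table" with FTS
+indexes; "CONFIG" is assumed here as one of the common table suffixes):
+
+	fts_table_t	fts_table;
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, table);
+*/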
+
+/** Information about changes in a single transaction affecting
+the FTS system. */
+struct fts_trx_t {
+ trx_t* trx; /*!< InnoDB transaction */
+
+ ib_vector_t* savepoints; /*!< Active savepoints, must have at
+ least one element, the implied
+ savepoint */
+	ib_vector_t*	last_stmt;	/*!< savepoints for the last
+					statement */
+
+ mem_heap_t* heap; /*!< heap */
+};
+
+/** Information required for transaction savepoint handling. */
+struct fts_savepoint_t {
+ char* name; /*!< First entry is always NULL, the
+ default instance. Otherwise the name
+ of the savepoint */
+
+ ib_rbt_t* tables; /*!< Modified FTS tables */
+};
+
+/** Information about changed rows in a transaction for a single table. */
+struct fts_trx_table_t {
+ dict_table_t* table; /*!< table */
+
+ fts_trx_t* fts_trx; /*!< link to parent */
+
+ ib_rbt_t* rows; /*!< rows changed; indexed by doc-id,
+ cells are fts_trx_row_t* */
+
+ fts_doc_ids_t* added_doc_ids; /*!< list of added doc ids (NULL until
+ the first addition) */
+
+	que_t*		docs_added_graph;
+					/*!< query graph for adding
+					doc ids */
+};
+
+/** Information about one changed row in a transaction. */
+struct fts_trx_row_t {
+ doc_id_t doc_id; /*!< Id of the ins/upd/del document */
+
+ fts_row_state state; /*!< state of the row */
+
+ ib_vector_t* fts_indexes; /*!< The indexes that are affected */
+};
+
+/** List of document ids that were added during a transaction. This
+list is passed on to a background 'Add' thread and OPTIMIZE, so it
+needs its own memory heap. */
+struct fts_doc_ids_t {
+ ib_vector_t* doc_ids; /*!< document ids (each element is
+ of type doc_id_t). */
+
+ ib_alloc_t* self_heap; /*!< Allocator used to create an
+ instance of this type and the
+ doc_ids vector */
+};
+
+// FIXME: Get rid of this if possible.
+/** Since MySQL's character set support for Unicode is woefully inadequate
+(it supports basic operations like isalpha etc. only for 8-bit characters),
+we have to implement our own. We use UTF-16 without surrogate processing
+as our in-memory format. This typedef is a single such character. */
+typedef unsigned short ib_uc_t;
+
+/** A UTF-16 or UTF-8 string. */
+struct fts_string_t {
+	byte*		f_str;		/*!< string, not necessarily
+					terminated in any way */
+ ulint f_len; /*!< Length of the string in bytes */
+ ulint f_n_char; /*!< Number of characters */
+};
+
+/** Query ranked doc ids. */
+struct fts_ranking_t {
+ doc_id_t doc_id; /*!< Document id */
+
+ fts_rank_t rank; /*!< Rank is between 0 .. 1 */
+
+ byte* words; /*!< this contains the words
+ that were queried
+ and found in this document */
+ ulint words_len; /*!< words len */
+};
+
+/** Query result. */
+struct fts_result_t {
+ ib_rbt_node_t* current; /*!< Current element */
+
+ ib_rbt_t* rankings_by_id; /*!< RB tree of type fts_ranking_t
+ indexed by doc id */
+ ib_rbt_t* rankings_by_rank;/*!< RB tree of type fts_ranking_t
+ indexed by rank */
+};
+
+/** This is used to generate the FTS auxiliary table name; we need the
+table id and the index id to generate the column specific FTS auxiliary
+table name. */
+struct fts_table_t {
+ fts_table_type_t
+ type; /*!< The auxiliary table type */
+
+ table_id_t table_id; /*!< The table id */
+
+ index_id_t index_id; /*!< The index id */
+
+ const char* suffix; /*!< The suffix of the fts auxiliary
+ table name, can be NULL, not used
+ everywhere (yet) */
+ const dict_table_t*
+ table; /*!< Parent table */
+ CHARSET_INFO* charset; /*!< charset info if it is for FTS
+ index auxiliary table */
+};
+
+/** The state of the FTS sub system. */
+class fts_t {
+public:
+ /** fts_t constructor.
+ @param[in] table table with FTS indexes
+ @param[in,out] heap memory heap where 'this' is stored */
+ fts_t(
+ const dict_table_t* table,
+ mem_heap_t* heap);
+
+ /** fts_t destructor. */
+ ~fts_t();
+
+	/** Whether the ADDED table record was synced after crash recovery */
+ unsigned added_synced:1;
+ /** Whether the table holds dict_sys.mutex */
+ unsigned dict_locked:1;
+
+ /** Work queue for scheduling jobs for the FTS 'Add' thread, or NULL
+ if the thread has not yet been created. Each work item is a
+ fts_trx_doc_ids_t*. */
+ ib_wqueue_t* add_wq;
+
+ /** FTS memory buffer for this table, or NULL if the table has no FTS
+ index. */
+ fts_cache_t* cache;
+
+ /** FTS doc id hidden column number in the CLUSTERED index. */
+ ulint doc_col;
+
+ /** Vector of FTS indexes, this is mainly for caching purposes. */
+ ib_vector_t* indexes;
+
+ /** Whether the table exists in fts_optimize_wq;
+ protected by fts_optimize_wq mutex */
+ bool in_queue;
+
+ /** Whether the sync message exists in fts_optimize_wq;
+ protected by fts_optimize_wq mutex */
+ bool sync_message;
+
+ /** Heap for fts_t allocation. */
+ mem_heap_t* fts_heap;
+};
+
+struct fts_stopword_t;
+
+/** status bits for fts_stopword_t status field. */
+#define STOPWORD_NOT_INIT 0x1
+#define STOPWORD_OFF 0x2
+#define STOPWORD_FROM_DEFAULT 0x4
+#define STOPWORD_USER_TABLE 0x8
+
+extern const char* fts_default_stopword[];
+
+/** Variable specifying the maximum FTS cache size for each table */
+extern ulong fts_max_cache_size;
+
+/** Variable specifying the total memory allocated for FTS cache */
+extern ulong fts_max_total_cache_size;
+
+/** Variable specifying the FTS result cache limit for each query */
+extern size_t fts_result_cache_limit;
+
+/** Variable specifying the maximum FTS max token size */
+extern ulong fts_max_token_size;
+
+/** Variable specifying the minimum FTS max token size */
+extern ulong fts_min_token_size;
+
+/** Whether the total memory used for FTS cache is exhausted, and we will
+need a sync to free some memory */
+extern bool fts_need_sync;
+
+#define fts_que_graph_free(graph) \
+do { \
+ mutex_enter(&dict_sys.mutex); \
+ que_graph_free(graph); \
+ mutex_exit(&dict_sys.mutex); \
+} while (0)
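+
+/* Illustrative sketch: the wrapper acquires dict_sys.mutex around the
+free, so the caller must not hold it already; a caller that might hold
+it can use fts_que_graph_free_check_lock(), declared further below.
+Here "info" is an assumed pars_info_t* and the SQL text is a
+placeholder:
+
+	que_t*	graph = pars_sql(info, "BEGIN ... END;");
+	// ... execute the graph ...
+	fts_que_graph_free(graph);
+*/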
+
+/******************************************************************//**
+Create a FTS cache. */
+fts_cache_t*
+fts_cache_create(
+/*=============*/
+ dict_table_t* table); /*!< table owns the FTS cache */
+
+/******************************************************************//**
+Create a FTS index cache.
+@return Index Cache */
+fts_index_cache_t*
+fts_cache_index_cache_create(
+/*=========================*/
+ dict_table_t* table, /*!< in: table with FTS index */
+ dict_index_t* index); /*!< in: FTS index */
+
+/******************************************************************//**
+Get the next available document id. This function creates a new
+transaction to generate the document id.
+@return DB_SUCCESS if OK */
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t* doc_id);/*!< out: new document id */
+/*********************************************************************//**
+Update the next and last Doc ID in the CONFIG table to be the input
+"doc_id" value (+ 1). We would do so after each FTS index build or
+table truncate */
+void
+fts_update_next_doc_id(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t doc_id) /*!< in: DOC ID to set */
+ MY_ATTRIBUTE((nonnull(2)));
+
+/******************************************************************//**
+Create a new fts_doc_ids_t.
+@return new fts_doc_ids_t. */
+fts_doc_ids_t*
+fts_doc_ids_create(void);
+/*=====================*/
+
+/** Free fts_doc_ids_t */
+inline void fts_doc_ids_free(fts_doc_ids_t* doc_ids)
+{
+ mem_heap_free(static_cast<mem_heap_t*>(doc_ids->self_heap->arg));
+}
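+
+/* Illustrative pairing (sketch): the fts_doc_ids_t and its doc_ids
+vector live on the same self-owned heap, so the single heap free above
+releases both:
+
+	fts_doc_ids_t*	ids = fts_doc_ids_create();
+	// ... collect ids into ids->doc_ids ...
+	fts_doc_ids_free(ids);
+*/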
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. */
+void
+fts_trx_add_op(
+/*===========*/
+ trx_t* trx, /*!< in: InnoDB transaction */
+ dict_table_t* table, /*!< in: table */
+ doc_id_t doc_id, /*!< in: doc id */
+ fts_row_state state, /*!< in: state of the row */
+ ib_vector_t* fts_indexes); /*!< in: FTS indexes affected
+ (NULL=all) */
+
+/******************************************************************//**
+Free an FTS trx. */
+void
+fts_trx_free(
+/*=========*/
+ fts_trx_t* fts_trx); /*!< in, own: FTS trx */
+
+/** Creates the common auxiliary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been called
+before this.
+The following tables are created.
+CREATE TABLE $FTS_PREFIX_DELETED
+ (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_DELETED_CACHE
+ (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_BEING_DELETED
+ (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_BEING_DELETED_CACHE
+ (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_CONFIG
+ (key CHAR(50), value CHAR(200), UNIQUE CLUSTERED INDEX on key)
+@param[in,out] trx transaction
+@param[in] table table with FTS index
+@param[in] skip_doc_id_index Skip index on doc id
+@return DB_SUCCESS if succeed */
+dberr_t
+fts_create_common_tables(
+ trx_t* trx,
+ dict_table_t* table,
+ bool skip_doc_id_index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table. row_mysql_lock_data_dictionary must have
+been called before this.
+
+All FTS AUX Index tables have the following schema.
+CREATE TABLE $FTS_PREFIX_INDEX_[1-6](
+ word VARCHAR(FTS_MAX_WORD_LEN),
+ first_doc_id INT NOT NULL,
+ last_doc_id UNSIGNED NOT NULL,
+ doc_count UNSIGNED INT NOT NULL,
+ ilist VARBINARY NOT NULL,
+ UNIQUE CLUSTERED INDEX ON (word, first_doc_id))
+@param[in,out] trx dictionary transaction
+@param[in] index fulltext index
+@param[in] id table id
+@return DB_SUCCESS or error code */
+dberr_t
+fts_create_index_tables(trx_t* trx, const dict_index_t* index, table_id_t id)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Add the FTS document id hidden column. */
+void
+fts_add_doc_id_column(
+/*==================*/
+ dict_table_t* table, /*!< in/out: Table with FTS index */
+ mem_heap_t* heap); /*!< in: temporary memory heap, or NULL */
+
+/*********************************************************************//**
+Drops the ancillary tables needed for supporting an FTS index on the
+given table. row_mysql_lock_data_dictionary must have been called before
+this.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_drop_tables(
+/*============*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table); /*!< in: table has the FTS
+ index */
+/******************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_commit(
+/*=======*/
+ trx_t* trx) /*!< in: transaction */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** FTS Query entry point.
+@param[in,out] trx transaction
+@param[in] index fts index to search
+@param[in] flags FTS search mode
+@param[in] query_str FTS query
+@param[in] query_len FTS query string len in bytes
+@param[in,out] result result doc ids
+@return DB_SUCCESS if successful otherwise error code */
+dberr_t
+fts_query(
+ trx_t* trx,
+ dict_index_t* index,
+ uint flags,
+ const byte* query_str,
+ ulint query_len,
+ fts_result_t** result)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+@return the relevance ranking value. */
+float
+fts_retrieve_ranking(
+/*=================*/
+ fts_result_t* result, /*!< in: FTS result structure */
+	doc_id_t	doc_id);	/*!< in: doc id of the document
+					of interest */
+
+/******************************************************************//**
+FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
+void
+fts_query_sort_result_on_rank(
+/*==========================*/
+ fts_result_t* result); /*!< out: result instance
+ to sort.*/
+
+/******************************************************************//**
+FTS Query free result, returned by fts_query(). */
+void
+fts_query_free_result(
+/*==================*/
+ fts_result_t* result); /*!< in: result instance
+ to free.*/
+
+/******************************************************************//**
+Extract the doc id from the FTS hidden column. */
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ dtuple_t* row); /*!< in: row whose FTS doc id we
+ want to extract.*/
+
+/** Extract the doc id from the record that belongs to index.
+@param[in] rec record containing FTS_DOC_ID
+@param[in] index index of rec
+@param[in] offsets rec_get_offsets(rec,index)
+@return doc id that was extracted from rec */
+doc_id_t
+fts_get_doc_id_from_rec(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets);
+
+/** Add new fts doc id to the update vector.
+@param[in] table the table that contains the FTS index.
+@param[in,out] ufield the fts doc id field in the update vector.
+ No new memory is allocated for this in this
+ function.
+@param[in,out] next_doc_id the fts doc id that has been added to the
+ update vector. If 0, a new fts doc id is
+ automatically generated. The memory provided
+ for this argument will be used by the update
+ vector. Ensure that the life time of this
+ memory matches that of the update vector.
+@return the fts doc id used in the update vector */
+doc_id_t
+fts_update_doc_id(
+ dict_table_t* table,
+ upd_field_t* ufield,
+ doc_id_t* next_doc_id);
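+
+/* Illustrative sketch with hypothetical names "update" and "pos"; as
+noted above, next_doc_id is caller-owned memory whose lifetime must
+match that of the update vector:
+
+	doc_id_t	next_doc_id = 0;	// 0 => generate a new id
+	upd_field_t*	ufield = upd_get_nth_field(update, pos);
+
+	doc_id_t	used = fts_update_doc_id(
+		table, ufield, &next_doc_id);
+*/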
+
+/******************************************************************//**
+FTS initialize. */
+void
+fts_startup(void);
+/*==============*/
+
+/******************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+fts_t*
+fts_create(
+/*=======*/
+ dict_table_t* table); /*!< out: table with FTS
+ indexes */
+
+/**********************************************************************//**
+Free the FTS resources. */
+void
+fts_free(
+/*=====*/
+ dict_table_t* table); /*!< in/out: table with
+ FTS indexes */
+
+/*********************************************************************//**
+Run OPTIMIZE on the given table.
+@return DB_SUCCESS if all OK */
+dberr_t
+fts_optimize_table(
+/*===============*/
+	dict_table_t*	table);	/*!< in: table to optimize */
+
+/**********************************************************************//**
+Startup the optimize thread and create the work queue. */
+void
+fts_optimize_init(void);
+/*====================*/
+
+/****************************************************************//**
+Drops index ancillary tables for a FTS index
+@return DB_SUCCESS or error code */
+dberr_t
+fts_drop_index_tables(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index) /*!< in: Index to drop */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Add the given table to the OPTIMIZER's list.
+@param[in]	table	table to add */
+void
+fts_optimize_add_table(
+ dict_table_t* table);
+
+/******************************************************************//**
+Remove the table from the OPTIMIZER's list. We do wait for
+acknowledgement from the consumer of the message. */
+void
+fts_optimize_remove_table(
+/*======================*/
+ dict_table_t* table); /*!< in: table to remove */
+
+/** Shutdown fts optimize thread. */
+void
+fts_optimize_shutdown();
+
+/** Request a sync of the FTS cache for the table.
+@param[in]	table	table to sync */
+void
+fts_optimize_request_sync_table(
+ dict_table_t* table);
+
+/**********************************************************************//**
+Take a FTS savepoint. */
+void
+fts_savepoint_take(
+/*===============*/
+ fts_trx_t* fts_trx, /*!< in: fts transaction */
+ const char* name); /*!< in: savepoint name */
+
+/**********************************************************************//**
+Refresh last statement savepoint. */
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+ trx_t* trx); /*!< in: transaction */
+
+/**********************************************************************//**
+Release the savepoint data identified by name. */
+void
+fts_savepoint_release(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name); /*!< in: savepoint name */
+
+/** Clear cache.
+@param[in,out] cache fts cache */
+void
+fts_cache_clear(
+ fts_cache_t* cache);
+
+/*********************************************************************//**
+Initialize things in cache. */
+void
+fts_cache_init(
+/*===========*/
+ fts_cache_t* cache); /*!< in: cache */
+
+/*********************************************************************//**
+Roll back to and including the savepoint identified by name. */
+void
+fts_savepoint_rollback(
+/*===================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name); /*!< in: savepoint name */
+
+/*********************************************************************//**
+Roll back to and including the savepoint identified by name. */
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+ trx_t* trx); /*!< in: transaction */
+
+/** Drop all orphaned FTS auxiliary tables, those that don't have a parent
+table or FTS index defined on them. */
+void fts_drop_orphaned_tables();
+
+/** Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@param[in,out] table fts table
+@param[in] wait whether to wait for existing sync to finish
+@return DB_SUCCESS on success, error code on failure. */
+dberr_t fts_sync_table(dict_table_t* table, bool wait = true);
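+
+/* Illustrative call (sketch): a blocking sync that writes the cache
+out to the auxiliary INDEX tables:
+
+	dberr_t	err = fts_sync_table(table);	// wait defaults to true
+*/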
+
+/****************************************************************//**
+Free the query graph but check whether dict_sys.mutex is already
+held */
+void
+fts_que_graph_free_check_lock(
+/*==========================*/
+ fts_table_t* fts_table, /*!< in: FTS table */
+ const fts_index_cache_t*index_cache, /*!< in: FTS index cache */
+ que_t* graph); /*!< in: query graph */
+
+/****************************************************************//**
+Get the charset of an FTS index. */
+CHARSET_INFO*
+fts_index_get_charset(
+/*==================*/
+ dict_index_t* index); /*!< in: FTS index */
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the CONFIG table
+@return initial Doc ID */
+doc_id_t
+fts_init_doc_id(
+/*============*/
+ const dict_table_t* table); /*!< in: table */
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+extern
+int
+innobase_fts_text_cmp(
+/*==================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: key */
+ const void* p2); /*!< in: node */
+
+/******************************************************************//**
+Makes all characters in a string lower case. */
+extern
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+ CHARSET_INFO* cs, /*!< in: Character set */
+ char* src, /*!< in: string to put in
+ lower case */
+ size_t src_len, /*!< in: input string length */
+ char* dst, /*!< in: buffer for result
+ string */
+ size_t dst_len); /*!< in: buffer size */
+
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+extern
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: key */
+ const void* p2); /*!< in: node */
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token. */
+extern
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+ CHARSET_INFO* charset, /*!< in: Character set */
+ const byte* start, /*!< in: start of text */
+ const byte* end, /*!< in: one character past
+ end of text */
+ fts_string_t* token); /*!< out: token's text */
+
+/*************************************************************//**
+Get the token's character count for the given charset
+@return the number of characters in the token */
+ulint
+fts_get_token_size(
+/*===============*/
+ const CHARSET_INFO* cs, /*!< in: Character set */
+ const char* token, /*!< in: token */
+ ulint len); /*!< in: token length */
+
+/*************************************************************//**
+FULLTEXT tokenizer internal in MYSQL_FTPARSER_SIMPLE_MODE
+@return 0 if tokenized successfully */
+int
+fts_tokenize_document_internal(
+/*===========================*/
+ MYSQL_FTPARSER_PARAM* param, /*!< in: parser parameter */
+ const char* doc, /*!< in: document to tokenize */
+ int len); /*!< in: document length */
+
+/*********************************************************************//**
+Fetch COUNT(*) from specified table.
+@return the number of rows in the table */
+ulint
+fts_get_rows_count(
+/*===============*/
+ fts_table_t* fts_table); /*!< in: fts table to read */
+
+/*************************************************************//**
+Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists
+@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */
+doc_id_t
+fts_get_max_doc_id(
+/*===============*/
+ dict_table_t* table); /*!< in: user table */
+
+/******************************************************************//**
+Check whether the user-supplied stopword table exists and is of
+the right format.
+@return the stopword column charset if it qualifies */
+CHARSET_INFO*
+fts_valid_stopword_table(
+/*=====================*/
+ const char* stopword_table_name); /*!< in: Stopword table
+ name */
+/****************************************************************//**
+This function loads the specified stopword table into the FTS cache
+@return true if successful */
+bool
+fts_load_stopword(
+/*==============*/
+ const dict_table_t*
+ table, /*!< in: Table with FTS */
+ trx_t* trx, /*!< in: Transaction */
+ const char* session_stopword_table, /*!< in: Session stopword table
+ name */
+ bool stopword_is_on, /*!< in: Whether stopword
+ option is turned on/off */
+ bool reload); /*!< in: Whether it is during
+ reload of FTS table */
+
+/****************************************************************//**
+Read the rows from the FTS index
+@return DB_SUCCESS if OK */
+dberr_t
+fts_table_fetch_doc_ids(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: aux table */
+ fts_doc_ids_t* doc_ids); /*!< in: For collecting
+ doc ids */
+/****************************************************************//**
+This function brings the FTS index in sync when the FTS index is first
+used. There may be documents that have not yet been synced to the
+auxiliary tables since the last abnormal server shutdown; we need to
+bring such documents into the FTS cache before any further operations
+@return TRUE if all OK */
+ibool
+fts_init_index(
+/*===========*/
+ dict_table_t* table, /*!< in: Table with FTS */
+ ibool has_cache_lock); /*!< in: Whether we already
+ have cache lock */
+/*******************************************************************//**
+Add a newly created index to the FTS cache */
+void
+fts_add_index(
+/*==========*/
+ dict_index_t* index, /*!< FTS index to be added */
+ dict_table_t* table); /*!< table */
+
+/*******************************************************************//**
+Drop auxiliary tables related to an FTS index
+@return DB_SUCCESS or error number */
+dberr_t
+fts_drop_index(
+/*===========*/
+ dict_table_t* table, /*!< in: Table where indexes are dropped */
+ dict_index_t* index, /*!< in: Index to be dropped */
+ trx_t* trx); /*!< in: Transaction for the drop */
+
+/****************************************************************//**
+Rename auxiliary tables for all fts index for a table
+@return DB_SUCCESS or error code */
+dberr_t
+fts_rename_aux_tables(
+/*==================*/
+ dict_table_t* table, /*!< in: user Table */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx); /*!< in: transaction */
+
+/*******************************************************************//**
+Check that the indexes in fts->indexes are also present in the index
+cache and the table->indexes list
+@return TRUE if all indexes match */
+ibool
+fts_check_cached_index(
+/*===================*/
+ dict_table_t* table); /*!< in: Table where indexes are dropped */
+
+/** Fetch the document from the tuple, tokenize the text data and
+insert the text data into the FTS auxiliary table and
+its cache. Note that the tuple's fields do not contain any information
+about externally stored fields; the tuple contains data directly
+converted from MySQL.
+@param[in] ftt FTS transaction table
+@param[in] doc_id doc id
+@param[in] tuple tuple from where data can be retrieved
+ and tuple should be arranged in table
+ schema order. */
+void
+fts_add_doc_from_tuple(
+ fts_trx_table_t*ftt,
+ doc_id_t doc_id,
+ const dtuple_t* tuple);
+
+/** Create an FTS trx.
+@param[in,out] trx InnoDB Transaction
+@return FTS transaction. */
+fts_trx_t*
+fts_trx_create(
+ trx_t* trx);
+
+/** Clear all FTS resources when there is no internal DOC_ID
+and there are no new FTS indexes to add.
+@param[in,out] table table where fts is to be freed
+@param[in] trx transaction to drop all fts tables */
+void fts_clear_all(dict_table_t *table, trx_t *trx);
+
+/** Check whether the given name is an FTS auxiliary table,
+and fetch the parent table id and index id
+@param[in]	name		table name
+@param[in,out]	table_id	parent table id
+@param[in,out]	index_id	index id
+@return true if it is an auxiliary table */
+bool fts_check_aux_table(const char *name,
+ table_id_t *table_id,
+ index_id_t *index_id);
+
+/** Sync the table during commit phase
+@param[in] table table to be synced */
+void fts_sync_during_ddl(dict_table_t* table);
diff --git a/storage/innobase/include/fts0opt.h b/storage/innobase/include/fts0opt.h
new file mode 100644
index 00000000..c527ad8e
--- /dev/null
+++ b/storage/innobase/include/fts0opt.h
@@ -0,0 +1,39 @@
+/*****************************************************************************
+
+Copyright (c) 2001, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0opt.h
+Full Text Search optimize thread
+
+Created 2011-02-15 Jimmy Yang
+***********************************************************************/
+#ifndef INNODB_FTS0OPT_H
+#define INNODB_FTS0OPT_H
+
+/** The FTS optimize thread's work queue. */
+extern ib_wqueue_t* fts_optimize_wq;
+
+/********************************************************************
+Callback function to fetch the rows in an FTS INDEX record. */
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+ /* out: always returns non-NULL */
+ void* row, /* in: sel_node_t* */
+ void* user_arg); /* in: pointer to ib_vector_t */
+#endif
diff --git a/storage/innobase/include/fts0pars.h b/storage/innobase/include/fts0pars.h
new file mode 100644
index 00000000..8108e811
--- /dev/null
+++ b/storage/innobase/include/fts0pars.h
@@ -0,0 +1,72 @@
+/* A Bison parser, made by GNU Bison 2.5. */
+
+/* Bison interface for Yacc-like parsers in C
+
+ Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* As a special exception, you may create a larger work that contains
+ part or all of the Bison parser skeleton and distribute that work
+ under terms of your choice, so long as that work isn't itself a
+ parser generator using the skeleton or a modified version thereof
+ as a parser skeleton. Alternatively, if you modify or redistribute
+ the parser skeleton itself, you may (at your option) remove this
+ special exception, which will cause the skeleton and the resulting
+ Bison output files to be licensed under the GNU General Public
+ License without this special exception.
+
+ This special exception was added by the Free Software Foundation in
+ version 2.2 of Bison. */
+
+
+/* Tokens. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+ /* Put the tokens into the symbol table, so that GDB and other debuggers
+ know about them. */
+ enum yytokentype {
+ FTS_OPER = 258,
+ FTS_TEXT = 259,
+ FTS_TERM = 260,
+ FTS_NUMB = 261
+ };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 2068 of yacc.c */
+#line 61 "fts0pars.y"
+
+ int oper;
+ fts_ast_string_t* token;
+ fts_ast_node_t* node;
+
+
+
+/* Line 2068 of yacc.c */
+#line 64 "fts0pars.hh"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+
+
diff --git a/storage/innobase/include/fts0plugin.h b/storage/innobase/include/fts0plugin.h
new file mode 100644
index 00000000..18ec2d6d
--- /dev/null
+++ b/storage/innobase/include/fts0plugin.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0plugin.h
+Full text search plugin header file
+
+Created 2013/06/04 Shaohua Wang
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PLUGIN_H
+#define INNOBASE_FTS0PLUGIN_H
+
+#include "univ.i"
+
+extern struct st_mysql_ftparser fts_default_parser;
+
+struct fts_ast_state_t;
+
+#define PARSER_INIT(parser, arg) if (parser->init) { parser->init(arg); }
+#define PARSER_DEINIT(parser, arg) if (parser->deinit) { parser->deinit(arg); }
+
+/******************************************************************//**
+Parse an FTS query with a plugin parser.
+@return 0 if parsed successfully, non-zero otherwise. */
+int
+fts_parse_by_parser(
+/*================*/
+ ibool mode, /*!< in: query boolean mode */
+ uchar* query, /*!< in: query string */
+ ulint len, /*!< in: query string length */
+ st_mysql_ftparser* parse, /*!< in: fts plugin parser */
+ fts_ast_state_t* state); /*!< in: query parser state */
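+
+/* A minimal usage sketch (illustrative only; the query buffer, plugin
+parser and AST state are assumed to have been set up by the caller):
+
+	fts_ast_state_t	state;
+
+	if (fts_parse_by_parser(TRUE, query, len, parser, &state)) {
+		... handle parse failure (non-zero return) ...
+	}
+*/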
+
+#endif /* INNOBASE_FTS0PLUGIN_H */
diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h
new file mode 100644
index 00000000..4261fc25
--- /dev/null
+++ b/storage/innobase/include/fts0priv.h
@@ -0,0 +1,502 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.h
+Full text search internal header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PRIV_H
+#define INNOBASE_FTS0PRIV_H
+
+#include "dict0dict.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "que0types.h"
+#include "fts0types.h"
+
+/* The various states of the FTS sub system pertaining to a table with
+FTS indexes defined on it. */
+enum fts_table_state_enum {
+	/*!< This must be 0 since we insert
+	a hard-coded '0' into the config
+	table at create time */
+
+ FTS_TABLE_STATE_RUNNING = 0, /*!< Auxiliary tables created OK */
+
+ FTS_TABLE_STATE_OPTIMIZING, /*!< This is a substate of RUNNING */
+
+ FTS_TABLE_STATE_DELETED /*!< All aux tables to be dropped when
+ it's safe to do so */
+};
+
+typedef enum fts_table_state_enum fts_table_state_t;
+
+/** The default time to wait for the background thread (in microseconds). */
+#define FTS_MAX_BACKGROUND_THREAD_WAIT 10000
+
+/** Maximum number of iterations to wait before we complain */
+#define FTS_BACKGROUND_THREAD_WAIT_COUNT 1000
+
+/** The maximum length of the config table's value column in bytes */
+#define FTS_MAX_CONFIG_NAME_LEN 64
+
+/** The maximum length of the config table's value column in bytes */
+#define FTS_MAX_CONFIG_VALUE_LEN 1024
+
+/** Approx. upper limit of ilist length in bytes. */
+#define FTS_ILIST_MAX_SIZE (64 * 1024)
+
+/** FTS config table parameter names */
+
+/** The number of seconds after which an OPTIMIZE run will stop */
+#define FTS_OPTIMIZE_LIMIT_IN_SECS "optimize_checkpoint_limit"
+
+/** The next doc id */
+#define FTS_SYNCED_DOC_ID "synced_doc_id"
+
+/** The last word that was OPTIMIZED */
+#define FTS_LAST_OPTIMIZED_WORD "last_optimized_word"
+
+/** Total number of documents that have been deleted. The next_doc_id
+minus this count gives us the total number of documents. */
+#define FTS_TOTAL_DELETED_COUNT "deleted_doc_count"
+
+/** Total number of words parsed from all documents */
+#define FTS_TOTAL_WORD_COUNT "total_word_count"
+
+/** Start of optimize of an FTS index */
+#define FTS_OPTIMIZE_START_TIME "optimize_start_time"
+
+/** End of optimize for an FTS index */
+#define FTS_OPTIMIZE_END_TIME "optimize_end_time"
+
+/** User specified stopword table name */
+#define FTS_STOPWORD_TABLE_NAME "stopword_table_name"
+
+/** Whether to use (turn on/off) stopword */
+#define FTS_USE_STOPWORD "use_stopword"
+
+/** State of the FTS system for this table. It can be one of
+ RUNNING, OPTIMIZING, DELETED. */
+#define FTS_TABLE_STATE "table_state"
+
+/** The minimum length of an FTS auxiliary table name's id component,
+e.g., for an auxiliary table name
+
+ FTS_<TABLE_ID>_SUFFIX
+
+This constant is for the minimum length required to store the <TABLE_ID>
+component.
+*/
+#define FTS_AUX_MIN_TABLE_ID_LENGTH 48
+
+/** Maximum length of an integer stored in the config table value column. */
+#define FTS_MAX_INT_LEN 32
+
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id.
+@return query graph */
+que_t*
+fts_parse_sql(
+/*==========*/
+ fts_table_t* fts_table, /*!< in: FTS aux table */
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql) /*!< in: SQL string to evaluate */
+ MY_ATTRIBUTE((nonnull(3), malloc, warn_unused_result));
+/******************************************************************//**
+Evaluate a parsed SQL statement
+@return DB_SUCCESS or error code */
+dberr_t
+fts_eval_sql(
+/*=========*/
+ trx_t* trx, /*!< in: transaction */
+ que_t* graph) /*!< in: Parsed statement */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
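+
+/* A hedged sketch of the parse/eval cycle built from these two functions
+(error handling trimmed; the SQL text is illustrative, with %s substituted
+as documented above; fts_sql_commit()/fts_sql_rollback() are defined later
+in this header):
+
+	pars_info_t*	info = pars_info_create();
+	que_t*		graph;
+	dberr_t		error;
+
+	graph = fts_parse_sql(fts_table, info, "BEGIN DELETE FROM %s;");
+	error = fts_eval_sql(trx, graph);
+
+	if (error == DB_SUCCESS) {
+		fts_sql_commit(trx);
+	} else {
+		fts_sql_rollback(trx);
+	}
+
+	que_graph_free(graph);
+*/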
+
+/** Construct the name of an internal FTS table for the given table.
+@param[in] fts_table metadata on fulltext-indexed table
+@param[out] table_name a name up to MAX_FULL_NAME_LEN
+@param[in] dict_locked whether dict_sys.mutex is being held */
+void fts_get_table_name(const fts_table_t* fts_table, char* table_name,
+ bool dict_locked = false)
+ MY_ATTRIBUTE((nonnull));
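+
+/* For illustration (hypothetical values): with a table id of 0x123 and the
+CONFIG suffix, the generated name follows the FTS_<TABLE_ID>_<SUFFIX>
+pattern used for auxiliary tables, e.g. "db/FTS_0000000000000123_CONFIG";
+see fts_write_object_id() in fts0priv.ic for the id formatting. */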
+/******************************************************************//**
+Construct the column specification part of the SQL string for selecting the
+indexed FTS columns for the given table. Adds the necessary bound
+ids to the given 'info' and returns the SQL string. Examples:
+
+One indexed column named "text":
+
+ "$sel0",
+ info/ids: sel0 -> "text"
+
+Two indexed columns named "subject" and "content":
+
+ "$sel0, $sel1",
+ info/ids: sel0 -> "subject", sel1 -> "content",
+@return heap-allocated WHERE string */
+const char*
+fts_get_select_columns_str(
+/*=======================*/
+ dict_index_t* index, /*!< in: FTS index */
+ pars_info_t* info, /*!< in/out: parser info */
+ mem_heap_t* heap) /*!< in: memory heap */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Values for the "option" parameter of fts_doc_fetch_by_doc_id(), selecting
+whether to fetch the document whose ID is equal to, greater than, or smaller
+than the supplied ID */
+#define FTS_FETCH_DOC_BY_ID_EQUAL 1
+#define FTS_FETCH_DOC_BY_ID_LARGE 2
+#define FTS_FETCH_DOC_BY_ID_SMALL 3
+
+/*************************************************************//**
+Fetch document (= a single row's indexed text) with the given
+document id.
+@return DB_SUCCESS if fetch is successful, else error */
+dberr_t
+fts_doc_fetch_by_doc_id(
+/*====================*/
+ fts_get_doc_t* get_doc, /*!< in: state */
+ doc_id_t doc_id, /*!< in: id of document to fetch */
+ dict_index_t* index_to_use, /*!< in: caller supplied FTS index,
+ or NULL */
+ ulint option, /*!< in: search option, if it is
+ greater than doc_id or equal */
+ fts_sql_callback
+ callback, /*!< in: callback to read
+ records */
+ void* arg) /*!< in: callback arg */
+ MY_ATTRIBUTE((nonnull(6)));
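+
+/* Illustrative call (my_fetch_callback and my_doc are hypothetical; a
+real caller supplies an fts_sql_callback that reads the selected rows):
+
+	error = fts_doc_fetch_by_doc_id(
+		get_doc, doc_id, NULL,
+		FTS_FETCH_DOC_BY_ID_EQUAL,
+		my_fetch_callback, &my_doc);
+*/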
+
+/*******************************************************************//**
+Callback function for fetch that stores the text of an FTS document,
+converting each column to UTF-16.
+@return always FALSE */
+ibool
+fts_query_expansion_fetch_doc(
+/*==========================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts_doc_t* */
+ MY_ATTRIBUTE((nonnull));
+/********************************************************************
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+dberr_t
+fts_write_node(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** graph, /*!< in: query graph */
+ fts_table_t* fts_table, /*!< in: the FTS aux index */
+ fts_string_t* word, /*!< in: word in UTF-8 */
+ fts_node_t* node) /*!< in: node columns */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check if a fts token is a stopword or less than fts_min_token_size
+or greater than fts_max_token_size.
+@param[in] token token string
+@param[in] stopwords stopwords rb tree
+@param[in] cs token charset
+@retval true if it is not a stopword and its length is in range
+@retval false if it is a stopword or its length is out of range */
+bool
+fts_check_token(
+ const fts_string_t* token,
+ const ib_rbt_t* stopwords,
+ const CHARSET_INFO* cs);
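+
+/* Sketch of a typical call (names are assumptions for illustration):
+a token is indexed only if fts_check_token() returns true, i.e. its
+length is within [fts_min_token_size, fts_max_token_size] and it is
+not found in the stopword rb tree:
+
+	if (fts_check_token(&str, cache->stopword_info.cached_stopword,
+			    doc->charset)) {
+		... add the token to the index cache ...
+	}
+*/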
+
+/******************************************************************//**
+Initialize a document. */
+void
+fts_doc_init(
+/*=========*/
+ fts_doc_t* doc) /*!< in: doc to initialize */
+ MY_ATTRIBUTE((nonnull));
+
+/******************************************************************//**
+Do a binary search for a doc id in the array
+@return positive index if found, negative index (the position where it
+	should be inserted) if not found */
+int
+fts_bsearch(
+/*========*/
+ doc_id_t* array, /*!< in: array to sort */
+ int lower, /*!< in: lower bound of array*/
+ int upper, /*!< in: upper bound of array*/
+ doc_id_t doc_id) /*!< in: doc id to lookup */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
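+
+/* Example of the return convention (values assumed for illustration):
+for array = {10, 20, 40}, lower = 0 and upper = 3, looking up 20 yields
+the positive index 1, while looking up 30 yields a negative value that
+encodes the position where 30 would have to be inserted. */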
+/******************************************************************//**
+Free document. */
+void
+fts_doc_free(
+/*=========*/
+ fts_doc_t* doc) /*!< in: document */
+ MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Free an fts_word_t instance. */
+void
+fts_word_free(
+/*==========*/
+	fts_word_t*	word)	/*!< in: instance to free. */
+ MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Read the rows from the FTS index.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_index_fetch_nodes(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** graph, /*!< in: prepared statement */
+ fts_table_t* fts_table, /*!< in: FTS aux table */
+ const fts_string_t*
+ word, /*!< in: the word to fetch */
+ fts_fetch_t* fetch) /*!< in: fetch callback.*/
+ MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Compare two fts_trx_table_t instances; we actually compare the
+table ids here.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+ const void* v1, /*!< in: id1 */
+ const void* v2) /*!< in: id2 */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Compare a table id with an fts_trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#define fts_sql_commit(trx) trx_commit_for_mysql(trx)
+#define fts_sql_rollback(trx) (trx)->rollback()
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id. Don't acquire
+the dict mutex.
+@return query graph */
+que_t*
+fts_parse_sql_no_dict_lock(
+/*=======================*/
+ pars_info_t* info, /*!< in: parser info */
+ const char* sql) /*!< in: SQL string to evaluate */
+ MY_ATTRIBUTE((nonnull(2), malloc, warn_unused_result));
+/******************************************************************//**
+Get value from config table. The caller must ensure that enough
+space is allocated for value to hold the column contents
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_value(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< out: value read from
+ config table */
+ MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Get value specific to an FTS index from the config table. The caller
+must ensure that enough space is allocated for value to hold the
+column contents.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_index_value(
+/*=======================*/
+ trx_t* trx, /*!< transaction */
+ dict_index_t* index, /*!< in: index */
+ const char* param, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< out: value read from
+ config table */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value in the config table for name.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_value(
+/*=================*/
+ trx_t* trx, /*!< transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: set config value for
+					this parameter name */
+ const fts_string_t*
+ value) /*!< in: value to update */
+ MY_ATTRIBUTE((nonnull));
+/****************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_set_ulint(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: param name */
+ ulint int_value) /*!< in: value */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_index_value(
+/*=======================*/
+ trx_t* trx, /*!< transaction */
+ dict_index_t* index, /*!< in: index */
+	const char*	param,		/*!< in: set config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< in: value to set in the
+					config table */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#ifdef FTS_OPTIMIZE_DEBUG
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_index_ulint(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: param name */
+ ulint* int_value) /*!< out: value */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* FTS_OPTIMIZE_DEBUG */
+
+/******************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_index_ulint(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: param name */
+ ulint int_value) /*!< in: value */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_ulint(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: param name */
+ ulint* int_value) /*!< out: value */
+ MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Search cache for word.
+@return the word node vector if found, else NULL */
+const ib_vector_t*
+fts_cache_find_word(
+/*================*/
+ const fts_index_cache_t*
+ index_cache, /*!< in: cache to search */
+ const fts_string_t*
+ text) /*!< in: word to search for */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/******************************************************************//**
+Append deleted doc ids to vector and sort the vector. */
+void
+fts_cache_append_deleted_doc_ids(
+/*=============================*/
+ const fts_cache_t*
+ cache, /*!< in: cache to use */
+ ib_vector_t* vector); /*!< in: append to this vector */
+/******************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index specific cache, else NULL */
+fts_index_cache_t*
+fts_find_index_cache(
+/*================*/
+ const fts_cache_t*
+ cache, /*!< in: cache to search */
+ const dict_index_t*
+ index) /*!< in: index to search for */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return number of bytes written */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+ ib_id_t id, /*!< in: a table/index id */
+ char* str); /*!< in: buffer to write the id to */
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return TRUE if parse successful */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+ ib_id_t* id, /*!< out: a table id */
+ const char* str) /*!< in: buffer to read from */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Get the table id.
+@return number of bytes written */
+int
+fts_get_table_id(
+/*=============*/
+ const fts_table_t*
+ fts_table, /*!< in: FTS Auxiliary table */
+ char* table_id) /*!< out: table id, must be at least
+ FTS_AUX_MIN_TABLE_ID_LENGTH bytes
+ long */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Construct the name prefix of an internal FTS table for the given table.
+@param[in]	fts_table	metadata on fulltext-indexed table
+@return the prefix, must be freed with ut_free() */
+char* fts_get_table_name_prefix(const fts_table_t* fts_table)
+ MY_ATTRIBUTE((nonnull, malloc, warn_unused_result));
+/******************************************************************//**
+Add node positions. */
+void
+fts_cache_node_add_positions(
+/*=========================*/
+ fts_cache_t* cache, /*!< in: cache */
+ fts_node_t* node, /*!< in: word node */
+ doc_id_t doc_id, /*!< in: doc id */
+ ib_vector_t* positions) /*!< in: fts_token_t::positions */
+ MY_ATTRIBUTE((nonnull(2,4)));
+
+/******************************************************************//**
+Create the config table name for retrieving index specific value.
+@return index config parameter name */
+char*
+fts_config_create_index_param_name(
+/*===============================*/
+ const char* param, /*!< in: base name of param */
+ const dict_index_t* index) /*!< in: index for config */
+ MY_ATTRIBUTE((nonnull, malloc, warn_unused_result));
+
+#include "fts0priv.ic"
+
+#endif /* INNOBASE_FTS0PRIV_H */
diff --git a/storage/innobase/include/fts0priv.ic b/storage/innobase/include/fts0priv.ic
new file mode 100644
index 00000000..da14cfcb
--- /dev/null
+++ b/storage/innobase/include/fts0priv.ic
@@ -0,0 +1,121 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.ic
+Full text search internal header file
+
+Created 2011/11/12 Sunny Bains
+***********************************************************************/
+
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return number of bytes written */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+ ib_id_t id, /* in: a table/index id */
+ char* str) /* in: buffer to write the id to */
+{
+
+#ifdef _WIN32
+
+ DBUG_EXECUTE_IF("innodb_test_wrong_non_windows_fts_aux_table_name",
+ return(sprintf(str, UINT64PFx, id)););
+
+	/* Use this to construct the old (5.6.14 and 5.7.3) ambiguous
+	Windows aux table names */
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ return(sprintf(str, "%016llu", (ulonglong) id)););
+
+#else /* _WIN32 */
+
+	/* Use this to construct the old (5.6.14 and 5.7.3) ambiguous
+	Windows aux table names */
+ DBUG_EXECUTE_IF("innodb_test_wrong_windows_fts_aux_table_name",
+ return(sprintf(str, "%016llu", (ulonglong) id)););
+
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ return(sprintf(str, "%016llx", (ulonglong) id)););
+
+#endif /* _WIN32 */
+
+ return(sprintf(str, "%016llx", (ulonglong) id));
+}
+
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return TRUE if parse successful */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+ ib_id_t* id, /* out: an id */
+ const char* str) /* in: buffer to read from */
+{
+	/* NOTE: this function doesn't care whether the current table
+	is set with HEX_NAME; the user of the id read here will check
+	whether the id is HEX or DEC and do the right thing with it. */
+ return(sscanf(str, UINT64PFx, id) == 1);
+}
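+
+/* Round-trip sketch (buffer sized per FTS_AUX_MIN_TABLE_ID_LENGTH from
+fts0priv.h; values are illustrative):
+
+	char	buf[FTS_AUX_MIN_TABLE_ID_LENGTH];
+	ib_id_t	id = 0x123;
+	ib_id_t	parsed;
+
+	fts_write_object_id(id, buf);	writes "0000000000000123"
+	ut_a(fts_read_object_id(&parsed, buf) && parsed == id);
+*/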
+
+/******************************************************************//**
+Compare two fts_trx_table_t instances.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const dict_table_t* table1
+ = (*static_cast<const fts_trx_table_t* const*>(p1))->table;
+
+ const dict_table_t* table2
+ = (*static_cast<const fts_trx_table_t* const*>(p2))->table;
+
+ return((table1->id > table2->id)
+ ? 1
+ : (table1->id == table2->id)
+ ? 0
+ : -1);
+}
+
+/******************************************************************//**
+Compare a table id with a fts_trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const uintmax_t* table_id = static_cast<const uintmax_t*>(p1);
+ const dict_table_t* table2
+ = (*static_cast<const fts_trx_table_t* const*>(p2))->table;
+
+ return((*table_id > table2->id)
+ ? 1
+ : (*table_id == table2->id)
+ ? 0
+ : -1);
+}
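+
+/* These comparators are intended for rb-tree keyed containers; a hedged
+sketch using the ut0rbt.h constructor:
+
+	ib_rbt_t*	tables = rbt_create(
+		sizeof(fts_trx_table_t*), fts_trx_table_cmp);
+*/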
diff --git a/storage/innobase/include/fts0tlex.h b/storage/innobase/include/fts0tlex.h
new file mode 100644
index 00000000..89655ca1
--- /dev/null
+++ b/storage/innobase/include/fts0tlex.h
@@ -0,0 +1,702 @@
+#ifndef fts0tHEADER_H
+#define fts0tHEADER_H 1
+#define fts0tIN_HEADER 1
+
+#line 6 "../include/fts0tlex.h"
+
+#line 8 "../include/fts0tlex.h"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 6
+#define YY_FLEX_SUBMINOR_VERSION 4
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+#ifdef yy_create_buffer
+#define fts0t_create_buffer_ALREADY_DEFINED
+#else
+#define yy_create_buffer fts0t_create_buffer
+#endif
+
+#ifdef yy_delete_buffer
+#define fts0t_delete_buffer_ALREADY_DEFINED
+#else
+#define yy_delete_buffer fts0t_delete_buffer
+#endif
+
+#ifdef yy_scan_buffer
+#define fts0t_scan_buffer_ALREADY_DEFINED
+#else
+#define yy_scan_buffer fts0t_scan_buffer
+#endif
+
+#ifdef yy_scan_string
+#define fts0t_scan_string_ALREADY_DEFINED
+#else
+#define yy_scan_string fts0t_scan_string
+#endif
+
+#ifdef yy_scan_bytes
+#define fts0t_scan_bytes_ALREADY_DEFINED
+#else
+#define yy_scan_bytes fts0t_scan_bytes
+#endif
+
+#ifdef yy_init_buffer
+#define fts0t_init_buffer_ALREADY_DEFINED
+#else
+#define yy_init_buffer fts0t_init_buffer
+#endif
+
+#ifdef yy_flush_buffer
+#define fts0t_flush_buffer_ALREADY_DEFINED
+#else
+#define yy_flush_buffer fts0t_flush_buffer
+#endif
+
+#ifdef yy_load_buffer_state
+#define fts0t_load_buffer_state_ALREADY_DEFINED
+#else
+#define yy_load_buffer_state fts0t_load_buffer_state
+#endif
+
+#ifdef yy_switch_to_buffer
+#define fts0t_switch_to_buffer_ALREADY_DEFINED
+#else
+#define yy_switch_to_buffer fts0t_switch_to_buffer
+#endif
+
+#ifdef yypush_buffer_state
+#define fts0tpush_buffer_state_ALREADY_DEFINED
+#else
+#define yypush_buffer_state fts0tpush_buffer_state
+#endif
+
+#ifdef yypop_buffer_state
+#define fts0tpop_buffer_state_ALREADY_DEFINED
+#else
+#define yypop_buffer_state fts0tpop_buffer_state
+#endif
+
+#ifdef yyensure_buffer_stack
+#define fts0tensure_buffer_stack_ALREADY_DEFINED
+#else
+#define yyensure_buffer_stack fts0tensure_buffer_stack
+#endif
+
+#ifdef yylex
+#define fts0tlex_ALREADY_DEFINED
+#else
+#define yylex fts0tlex
+#endif
+
+#ifdef yyrestart
+#define fts0trestart_ALREADY_DEFINED
+#else
+#define yyrestart fts0trestart
+#endif
+
+#ifdef yylex_init
+#define fts0tlex_init_ALREADY_DEFINED
+#else
+#define yylex_init fts0tlex_init
+#endif
+
+#ifdef yylex_init_extra
+#define fts0tlex_init_extra_ALREADY_DEFINED
+#else
+#define yylex_init_extra fts0tlex_init_extra
+#endif
+
+#ifdef yylex_destroy
+#define fts0tlex_destroy_ALREADY_DEFINED
+#else
+#define yylex_destroy fts0tlex_destroy
+#endif
+
+#ifdef yyget_debug
+#define fts0tget_debug_ALREADY_DEFINED
+#else
+#define yyget_debug fts0tget_debug
+#endif
+
+#ifdef yyset_debug
+#define fts0tset_debug_ALREADY_DEFINED
+#else
+#define yyset_debug fts0tset_debug
+#endif
+
+#ifdef yyget_extra
+#define fts0tget_extra_ALREADY_DEFINED
+#else
+#define yyget_extra fts0tget_extra
+#endif
+
+#ifdef yyset_extra
+#define fts0tset_extra_ALREADY_DEFINED
+#else
+#define yyset_extra fts0tset_extra
+#endif
+
+#ifdef yyget_in
+#define fts0tget_in_ALREADY_DEFINED
+#else
+#define yyget_in fts0tget_in
+#endif
+
+#ifdef yyset_in
+#define fts0tset_in_ALREADY_DEFINED
+#else
+#define yyset_in fts0tset_in
+#endif
+
+#ifdef yyget_out
+#define fts0tget_out_ALREADY_DEFINED
+#else
+#define yyget_out fts0tget_out
+#endif
+
+#ifdef yyset_out
+#define fts0tset_out_ALREADY_DEFINED
+#else
+#define yyset_out fts0tset_out
+#endif
+
+#ifdef yyget_leng
+#define fts0tget_leng_ALREADY_DEFINED
+#else
+#define yyget_leng fts0tget_leng
+#endif
+
+#ifdef yyget_text
+#define fts0tget_text_ALREADY_DEFINED
+#else
+#define yyget_text fts0tget_text
+#endif
+
+#ifdef yyget_lineno
+#define fts0tget_lineno_ALREADY_DEFINED
+#else
+#define yyget_lineno fts0tget_lineno
+#endif
+
+#ifdef yyset_lineno
+#define fts0tset_lineno_ALREADY_DEFINED
+#else
+#define yyset_lineno fts0tset_lineno
+#endif
+
+#ifdef yyget_column
+#define fts0tget_column_ALREADY_DEFINED
+#else
+#define yyget_column fts0tget_column
+#endif
+
+#ifdef yyset_column
+#define fts0tset_column_ALREADY_DEFINED
+#else
+#define yyset_column fts0tset_column
+#endif
+
+#ifdef yywrap
+#define fts0twrap_ALREADY_DEFINED
+#else
+#define yywrap fts0twrap
+#endif
+
+#ifdef yyalloc
+#define fts0talloc_ALREADY_DEFINED
+#else
+#define yyalloc fts0talloc
+#endif
+
+#ifdef yyrealloc
+#define fts0trealloc_ALREADY_DEFINED
+#else
+#define yyrealloc fts0trealloc
+#endif
+
+#ifdef yyfree
+#define fts0tfree_ALREADY_DEFINED
+#else
+#define yyfree fts0tfree
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#ifndef SIZE_MAX
+#define SIZE_MAX (~(size_t)0)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+/* begin standard C++ headers. */
+
+/* TODO: this is always defined, so inline it */
+#define yyconst const
+
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define yynoreturn __attribute__((__noreturn__))
+#else
+#define yynoreturn
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+ are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+ {
+ FILE *yy_input_file;
+
+ char *yy_ch_buf; /* input buffer */
+ char *yy_buf_pos; /* current position in input buffer */
+
+ /* Size of input buffer in bytes, not including room for EOB
+ * characters.
+ */
+ int yy_buf_size;
+
+ /* Number of characters read into yy_ch_buf, not including EOB
+ * characters.
+ */
+ int yy_n_chars;
+
+ /* Whether we "own" the buffer - i.e., we know we created it,
+ * and can realloc() it to grow it, and should free() it to
+ * delete it.
+ */
+ int yy_is_our_buffer;
+
+ /* Whether this is an "interactive" input source; if so, and
+ * if we're using stdio for input, then we want to use getc()
+ * instead of fread(), to make sure we stop fetching input after
+ * each newline.
+ */
+ int yy_is_interactive;
+
+ /* Whether we're considered to be at the beginning of a line.
+ * If so, '^' rules will be active on the next match, otherwise
+ * not.
+ */
+ int yy_at_bol;
+
+ int yy_bs_lineno; /**< The line count. */
+ int yy_bs_column; /**< The column count. */
+
+ /* Whether to try to fill the input buffer when we reach the
+ * end of it.
+ */
+ int yy_fill_buffer;
+
+ int yy_buffer_status;
+
+ };
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void yyrestart ( FILE *input_file , yyscan_t yyscanner );
+void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner );
+void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner );
+void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner );
+void yypop_buffer_state ( yyscan_t yyscanner );
+
+YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner );
+YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner );
+
+void *yyalloc ( yy_size_t , yyscan_t yyscanner );
+void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner );
+void yyfree ( void * , yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define fts0twrap(yyscanner) (/*CONSTCOND*/1)
+#define YY_SKIP_YYWRAP
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int yylex_init (yyscan_t* scanner);
+
+int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner);
+
+/* Accessor methods to globals.
+ These are made visible to non-reentrant scanners for convenience. */
+
+int yylex_destroy ( yyscan_t yyscanner );
+
+int yyget_debug ( yyscan_t yyscanner );
+
+void yyset_debug ( int debug_flag , yyscan_t yyscanner );
+
+YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner );
+
+void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner );
+
+FILE *yyget_in ( yyscan_t yyscanner );
+
+void yyset_in ( FILE * _in_str , yyscan_t yyscanner );
+
+FILE *yyget_out ( yyscan_t yyscanner );
+
+void yyset_out ( FILE * _out_str , yyscan_t yyscanner );
+
+ int yyget_leng ( yyscan_t yyscanner );
+
+char *yyget_text ( yyscan_t yyscanner );
+
+int yyget_lineno ( yyscan_t yyscanner );
+
+void yyset_lineno ( int _line_number , yyscan_t yyscanner );
+
+int yyget_column ( yyscan_t yyscanner );
+
+void yyset_column ( int _column_no , yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap ( yyscan_t yyscanner );
+#else
+extern int yywrap ( yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen ( const char * , yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int yylex (yyscan_t yyscanner);
+
+#define YY_DECL int yylex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#ifndef fts0t_create_buffer_ALREADY_DEFINED
+#undef yy_create_buffer
+#endif
+#ifndef fts0t_delete_buffer_ALREADY_DEFINED
+#undef yy_delete_buffer
+#endif
+#ifndef fts0t_scan_buffer_ALREADY_DEFINED
+#undef yy_scan_buffer
+#endif
+#ifndef fts0t_scan_string_ALREADY_DEFINED
+#undef yy_scan_string
+#endif
+#ifndef fts0t_scan_bytes_ALREADY_DEFINED
+#undef yy_scan_bytes
+#endif
+#ifndef fts0t_init_buffer_ALREADY_DEFINED
+#undef yy_init_buffer
+#endif
+#ifndef fts0t_flush_buffer_ALREADY_DEFINED
+#undef yy_flush_buffer
+#endif
+#ifndef fts0t_load_buffer_state_ALREADY_DEFINED
+#undef yy_load_buffer_state
+#endif
+#ifndef fts0t_switch_to_buffer_ALREADY_DEFINED
+#undef yy_switch_to_buffer
+#endif
+#ifndef fts0tpush_buffer_state_ALREADY_DEFINED
+#undef yypush_buffer_state
+#endif
+#ifndef fts0tpop_buffer_state_ALREADY_DEFINED
+#undef yypop_buffer_state
+#endif
+#ifndef fts0tensure_buffer_stack_ALREADY_DEFINED
+#undef yyensure_buffer_stack
+#endif
+#ifndef fts0tlex_ALREADY_DEFINED
+#undef yylex
+#endif
+#ifndef fts0trestart_ALREADY_DEFINED
+#undef yyrestart
+#endif
+#ifndef fts0tlex_init_ALREADY_DEFINED
+#undef yylex_init
+#endif
+#ifndef fts0tlex_init_extra_ALREADY_DEFINED
+#undef yylex_init_extra
+#endif
+#ifndef fts0tlex_destroy_ALREADY_DEFINED
+#undef yylex_destroy
+#endif
+#ifndef fts0tget_debug_ALREADY_DEFINED
+#undef yyget_debug
+#endif
+#ifndef fts0tset_debug_ALREADY_DEFINED
+#undef yyset_debug
+#endif
+#ifndef fts0tget_extra_ALREADY_DEFINED
+#undef yyget_extra
+#endif
+#ifndef fts0tset_extra_ALREADY_DEFINED
+#undef yyset_extra
+#endif
+#ifndef fts0tget_in_ALREADY_DEFINED
+#undef yyget_in
+#endif
+#ifndef fts0tset_in_ALREADY_DEFINED
+#undef yyset_in
+#endif
+#ifndef fts0tget_out_ALREADY_DEFINED
+#undef yyget_out
+#endif
+#ifndef fts0tset_out_ALREADY_DEFINED
+#undef yyset_out
+#endif
+#ifndef fts0tget_leng_ALREADY_DEFINED
+#undef yyget_leng
+#endif
+#ifndef fts0tget_text_ALREADY_DEFINED
+#undef yyget_text
+#endif
+#ifndef fts0tget_lineno_ALREADY_DEFINED
+#undef yyget_lineno
+#endif
+#ifndef fts0tset_lineno_ALREADY_DEFINED
+#undef yyset_lineno
+#endif
+#ifndef fts0tget_column_ALREADY_DEFINED
+#undef yyget_column
+#endif
+#ifndef fts0tset_column_ALREADY_DEFINED
+#undef yyset_column
+#endif
+#ifndef fts0twrap_ALREADY_DEFINED
+#undef yywrap
+#endif
+#ifndef fts0tget_lval_ALREADY_DEFINED
+#undef yyget_lval
+#endif
+#ifndef fts0tset_lval_ALREADY_DEFINED
+#undef yyset_lval
+#endif
+#ifndef fts0tget_lloc_ALREADY_DEFINED
+#undef yyget_lloc
+#endif
+#ifndef fts0tset_lloc_ALREADY_DEFINED
+#undef yyset_lloc
+#endif
+#ifndef fts0talloc_ALREADY_DEFINED
+#undef yyalloc
+#endif
+#ifndef fts0trealloc_ALREADY_DEFINED
+#undef yyrealloc
+#endif
+#ifndef fts0tfree_ALREADY_DEFINED
+#undef yyfree
+#endif
+#ifndef fts0ttext_ALREADY_DEFINED
+#undef yytext
+#endif
+#ifndef fts0tleng_ALREADY_DEFINED
+#undef yyleng
+#endif
+#ifndef fts0tin_ALREADY_DEFINED
+#undef yyin
+#endif
+#ifndef fts0tout_ALREADY_DEFINED
+#undef yyout
+#endif
+#ifndef fts0t_flex_debug_ALREADY_DEFINED
+#undef yy_flex_debug
+#endif
+#ifndef fts0tlineno_ALREADY_DEFINED
+#undef yylineno
+#endif
+#ifndef fts0ttables_fload_ALREADY_DEFINED
+#undef yytables_fload
+#endif
+#ifndef fts0ttables_destroy_ALREADY_DEFINED
+#undef yytables_destroy
+#endif
+#ifndef fts0tTABLES_NAME_ALREADY_DEFINED
+#undef yyTABLES_NAME
+#endif
+
+#line 69 "fts0tlex.l"
+
+
+#line 701 "../include/fts0tlex.h"
+#undef fts0tIN_HEADER
+#endif /* fts0tHEADER_H */
diff --git a/storage/innobase/include/fts0tokenize.h b/storage/innobase/include/fts0tokenize.h
new file mode 100644
index 00000000..1cddaf5b
--- /dev/null
+++ b/storage/innobase/include/fts0tokenize.h
@@ -0,0 +1,189 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0tokenize.h
+Full Text Search plugin tokenizer, adapted from MyISAM
+
+Created 2014/11/17 Shaohua Wang
+***********************************************************************/
+
+#include "ft_global.h"
+#include "mysql/plugin_ftparser.h"
+#include "m_ctype.h"
+
+/* Macros and structs below are from ftdefs.h in MyISAM */
+/** Check whether a char is a true word character */
+#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
+
+/** Check whether a char is a misc word character */
+#define misc_word_char(X) 0
+
+/** Boolean search syntax */
+static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX;
+
+#define FTB_YES (fts_boolean_syntax[0])
+#define FTB_EGAL (fts_boolean_syntax[1])
+#define FTB_NO (fts_boolean_syntax[2])
+#define FTB_INC (fts_boolean_syntax[3])
+#define FTB_DEC (fts_boolean_syntax[4])
+#define FTB_LBR (fts_boolean_syntax[5])
+#define FTB_RBR (fts_boolean_syntax[6])
+#define FTB_NEG (fts_boolean_syntax[7])
+#define FTB_TRUNC (fts_boolean_syntax[8])
+#define FTB_LQUOT (fts_boolean_syntax[10])
+#define FTB_RQUOT (fts_boolean_syntax[11])
+
+/** FTS query token */
+typedef struct st_ft_word {
+ uchar* pos; /*!< word start pointer */
+ uint len; /*!< word len */
+ double weight; /*!< word weight, unused in innodb */
+} FT_WORD;
+
+/** Tokenizer for ngram, based on ft_get_word() (ft_parser.c) in MyISAM.
+Differences: a. code format changed; b. stopword processing removed.
+@param[in] cs charset
+@param[in,out] start doc start pointer
+@param[in,out] end doc end pointer
+@param[in,out] word token
+@param[in,out] info token info
+@retval 0 eof
+@retval 1 word found
+@retval 2 left bracket
+@retval 3 right bracket
+@retval 4 stopword found */
+inline
+uchar
+fts_get_word(
+ const CHARSET_INFO* cs,
+ uchar** start,
+ uchar* end,
+ FT_WORD* word,
+ MYSQL_FTPARSER_BOOLEAN_INFO*
+ info)
+{
+ uchar* doc = *start;
+ int ctype;
+ uint mwc;
+ uint length;
+ int mbl;
+
+	info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
+ info->weight_adjust = info->wasign = 0;
+ info->type = FT_TOKEN_EOF;
+
+ while (doc < end) {
+ for (; doc < end;
+ doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+ mbl = cs->ctype(&ctype, doc, end);
+
+ if (true_word_char(ctype, *doc)) {
+ break;
+ }
+
+ if (*doc == FTB_RQUOT && info->quot) {
+ *start = doc + 1;
+ info->type = FT_TOKEN_RIGHT_PAREN;
+
+ return(info->type);
+ }
+
+ if (!info->quot) {
+ if (*doc == FTB_LBR
+ || *doc == FTB_RBR
+ || *doc == FTB_LQUOT) {
+ /* param->prev=' '; */
+ *start = doc + 1;
+ if (*doc == FTB_LQUOT) {
+ info->quot = (char*)1;
+ }
+
+ info->type = (*doc == FTB_RBR ?
+ FT_TOKEN_RIGHT_PAREN :
+ FT_TOKEN_LEFT_PAREN);
+
+ return(info->type);
+ }
+
+ if (info->prev == ' ') {
+ if (*doc == FTB_YES) {
+ info->yesno = +1;
+ continue;
+ } else if (*doc == FTB_EGAL) {
+ info->yesno = 0;
+ continue;
+ } else if (*doc == FTB_NO) {
+ info->yesno = -1;
+ continue;
+ } else if (*doc == FTB_INC) {
+ info->weight_adjust++;
+ continue;
+ } else if (*doc == FTB_DEC) {
+ info->weight_adjust--;
+ continue;
+ } else if (*doc == FTB_NEG) {
+ info->wasign = !info->wasign;
+ continue;
+ }
+ }
+ }
+
+ info->prev = char(*doc);
+ info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
+ info->weight_adjust = info->wasign = 0;
+ }
+
+ mwc = length = 0;
+ for (word->pos = doc;
+ doc < end;
+ length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+ mbl = cs->ctype(&ctype, doc, end);
+
+ if (true_word_char(ctype, *doc)) {
+ mwc = 0;
+ } else if (!misc_word_char(*doc) || mwc) {
+ break;
+ } else {
+ mwc++;
+ }
+ }
+
+ /* Be sure *prev is true_word_char. */
+ info->prev = 'A';
+		word->len = (uint)(doc - word->pos) - mwc;
+
+ if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
+ doc++;
+ }
+
+ /* We don't check stopword here. */
+ *start = doc;
+ info->type = FT_TOKEN_WORD;
+
+ return(info->type);
+ }
+
+ if (info->quot) {
+ *start = doc;
+ info->type = FT_TOKEN_RIGHT_PAREN;
+ }
+
+ return(info->type);
+}
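+
+/* Hedged driver sketch: tokenize a document by calling fts_get_word()
+until it reports FT_TOKEN_EOF (cs, doc and end are assumed inputs):
+
+	uchar*				start = doc;
+	FT_WORD				word;
+	MYSQL_FTPARSER_BOOLEAN_INFO	info;
+
+	memset(&info, 0, sizeof(info));
+	info.prev = ' ';
+
+	while (fts_get_word(cs, &start, end, &word, &info)
+	       != FT_TOKEN_EOF) {
+		if (info.type == FT_TOKEN_WORD) {
+			... consume word.pos / word.len ...
+		}
+	}
+*/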
diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h
new file mode 100644
index 00000000..f5760a16
--- /dev/null
+++ b/storage/innobase/include/fts0types.h
@@ -0,0 +1,386 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.h
+Full text search types file
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_H
+#define INNOBASE_FTS0TYPES_H
+
+#include "fts0fts.h"
+#include "fut0fut.h"
+#include "pars0pars.h"
+#include "que0types.h"
+#include "ut0byte.h"
+#include "ut0rbt.h"
+
+/** Types used within FTS. */
+struct fts_que_t;
+struct fts_node_t;
+
+/** Callbacks used within FTS. */
+typedef pars_user_func_cb_t fts_sql_callback;
+typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
+
+/** Statistics relevant to a particular document, used during retrieval. */
+struct fts_doc_stats_t {
+ doc_id_t doc_id; /*!< Document id */
+ ulint word_count; /*!< Total words in the document */
+};
+
+/** Its main purpose is to store the prepared SQL statements that
+are required to retrieve a document from the database. */
+struct fts_get_doc_t {
+ fts_index_cache_t*
+ index_cache; /*!< The index cache instance */
+
+ /*!< Parsed sql statement */
+ que_t* get_document_graph;
+ fts_cache_t* cache; /*!< The parent cache */
+};
+
+/** Since we can have multiple FTS indexes on a table, we keep a
+per index cache of words etc. */
+struct fts_index_cache_t {
+ dict_index_t* index; /*!< The FTS index instance */
+
+ ib_rbt_t* words; /*!< Nodes; indexed by fts_string_t*,
+ cells are fts_tokenizer_word_t*.*/
+
+ ib_vector_t* doc_stats; /*!< Array of the fts_doc_stats_t
+ contained in the memory buffer.
+ Must be in sorted order (ascending).
+ The ideal choice is an rb tree but
+ the rb tree imposes a space overhead
+ that we can do without */
+
+ que_t** ins_graph; /*!< Insert query graphs */
+
+ que_t** sel_graph; /*!< Select query graphs */
+ CHARSET_INFO* charset; /*!< charset */
+};
+
+/** Stop word control information. */
+struct fts_stopword_t {
+ ulint status; /*!< Status of the stopword tree */
+ ib_alloc_t* heap; /*!< The memory allocator to use */
+ ib_rbt_t* cached_stopword;/*!< This stores all active stopwords */
+ CHARSET_INFO* charset; /*!< charset for stopword */
+};
+
+/** The SYNC state of the cache. There is one instance of this struct
+associated with each ADD thread. */
+struct fts_sync_t {
+ trx_t* trx; /*!< The transaction used for SYNCing
+ the cache to disk */
+ dict_table_t* table; /*!< Table with FTS index(es) */
+ ulint max_cache_size; /*!< Max size in bytes of the cache */
+ ibool cache_full; /*!< flag, when true it indicates that
+ we need to sync the cache to disk */
+ ulint lower_index; /*!< the start index of the doc id
+ vector from where to start adding
+ documents to the FTS cache */
+ ulint upper_index; /*!< max index of the doc id vector to
+ add to the FTS cache */
+ ibool interrupted; /*!< TRUE if SYNC was interrupted */
+	doc_id_t	min_doc_id;	/*!< The smallest doc id added to the
+					cache. It should be equal to
+					doc_ids[lower_index] */
+ doc_id_t max_doc_id; /*!< The doc id at which the cache was
+ noted as being full, we use this to
+ set the upper_limit field */
+ time_t start_time; /*!< SYNC start time; only used if
+ fts_enable_diag_print */
+ bool in_progress; /*!< flag whether sync is in progress.*/
+ bool unlock_cache; /*!< flag whether unlock cache when
+ write fts node */
+ os_event_t event; /*!< sync finish event;
+ only os_event_set() and os_event_wait()
+ are used */
+};
+
+/** The cache for the FTS system. It is a memory-based inverted index
+that new entries are added to, until it grows over the configured maximum
+size, at which time its contents are written to the INDEX table. */
+struct fts_cache_t {
+ rw_lock_t lock; /*!< lock protecting all access to the
+ memory buffer. FIXME: this needs to
+ be our new upgrade-capable rw-lock */
+
+	rw_lock_t	init_lock;	/*!< lock used for the cache
+					initialization; it has a different
+					SYNC level from the cache lock above */
+
+ ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */
+
+ ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */
+
+ ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each
+ element is of type fts_update_t */
+
+ ib_vector_t* indexes; /*!< We store the stats and inverted
+ index for the individual FTS indexes
+ in this vector. Each element is
+ an instance of fts_index_cache_t */
+
+ ib_vector_t* get_docs; /*!< information required to read
+ the document from the table. Each
+ element is of type fts_doc_t */
+
+ size_t total_size; /*!< total size consumed by the ilist
+ field of all nodes. SYNC is run
+ whenever this gets too big */
+ fts_sync_t* sync; /*!< sync structure to sync data to
+ disk */
+ ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes
+					and deleted_doc_ids, i.e. transient
+ objects, they are recreated after
+ a SYNC is completed */
+
+ ib_alloc_t* self_heap; /*!< This heap is the heap out of
+ which an instance of the cache itself
+ was created. Objects created using
+ this heap will last for the lifetime
+ of the cache */
+
+ doc_id_t next_doc_id; /*!< Next doc id */
+
+ doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */
+
+ doc_id_t first_doc_id; /*!< first doc id since this table
+ was opened */
+
+ ulint deleted; /*!< Number of doc ids deleted since
+ last optimized. This variable is
+ covered by deleted_lock */
+
+ ulint added; /*!< Number of doc ids added since last
+ optimized. This variable is covered by
+ the deleted lock */
+
+ fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */
+ mem_heap_t* cache_heap; /*!< Cache Heap */
+};
+
+/** Columns of the FTS auxiliary INDEX table */
+struct fts_node_t {
+ doc_id_t first_doc_id; /*!< First document id in ilist. */
+
+ doc_id_t last_doc_id; /*!< Last document id in ilist. */
+
+ byte* ilist; /*!< Binary list of documents & word
+ positions the token appears in.
+ TODO: For now, these are simply
+ ut_malloc'd, but if testing shows
+ that they waste memory unacceptably, a
+ special memory allocator will have
+ to be written */
+
+ ulint doc_count; /*!< Number of doc ids in ilist */
+
+ ulint ilist_size; /*!< Used size of ilist in bytes. */
+
+ ulint ilist_size_alloc;
+ /*!< Allocated size of ilist in
+ bytes */
+ bool synced; /*!< flag whether the node is synced */
+};
+
+/** A tokenizer word. Contains information about one word. */
+struct fts_tokenizer_word_t {
+ fts_string_t text; /*!< Token text. */
+
+ ib_vector_t* nodes; /*!< Word node ilists, each element is
+ of type fts_node_t */
+};
+
+/** Word text plus its array of nodes, as stored on disk in the FTS index */
+struct fts_word_t {
+ fts_string_t text; /*!< Word value in UTF-8 */
+ ib_vector_t* nodes; /*!< Nodes read from disk */
+
+ ib_alloc_t* heap_alloc; /*!< For handling all allocations */
+};
+
+/** Callback for reading and filtering nodes that are read from FTS index */
+struct fts_fetch_t {
+ void* read_arg; /*!< Arg for the sql_callback */
+
+ fts_sql_callback
+ read_record; /*!< Callback for reading index
+ record */
+ size_t total_memory; /*!< Total memory used */
+};
+
+/** For horizontally splitting an FTS auxiliary index */
+struct fts_index_selector_t {
+ ulint value; /*!< Character value at which
+ to split */
+
+ const char* suffix; /*!< FTS aux index suffix */
+};
+
+/** This type represents a single document. */
+struct fts_doc_t {
+ fts_string_t text; /*!< document text */
+
+ ibool found; /*!< TRUE if the document was found
+ successfully in the database */
+
+ ib_rbt_t* tokens; /*!< This is filled when the document
+ is tokenized. Tokens; indexed by
+ fts_string_t*, cells are of type
+ fts_token_t* */
+
+ ib_alloc_t* self_heap; /*!< An instance of this type is
+ allocated from this heap along
+ with any objects that have the
+ same lifespan, most notably
+ the vector of token positions */
+ CHARSET_INFO* charset; /*!< Document's charset info */
+
+ st_mysql_ftparser* parser; /*!< fts plugin parser */
+
+ ib_rbt_t* stopwords; /*!< Stopwords */
+};
+
+/** A token and its positions within a document. */
+struct fts_token_t {
+ fts_string_t text; /*!< token text */
+
+ ib_vector_t* positions; /*!< an array of the positions the
+ token is found in; each item is
+ actually an ulint. */
+};
+
+/** Defined in fts/fts0fts.cc */
+extern const fts_index_selector_t fts_index_selector[];
+
+/******************************************************************//**
+Compare the doc_ids of two fts_trx_row_t instances. */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: id1 */
+ const void* p2); /*!< in: id2 */
+
+/******************************************************************//**
+Compare the doc_ids of two fts_ranking_t instances. */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: id1 */
+ const void* p2); /*!< in: id2 */
+
+/******************************************************************//**
+Compare two doc_ids. */
+UNIV_INLINE
+int fts_doc_id_cmp(
+/*==================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: id1 */
+ const void* p2); /*!< in: id2 */
+
+/******************************************************************//**
+Decode and return the integer that was encoded using our VLC scheme. */
+UNIV_INLINE
+ulint
+fts_decode_vlc(
+/*===========*/
+ /*!< out: value decoded */
+ byte** ptr); /*!< in: ptr to decode from, this ptr is
+ incremented by the number of bytes decoded */
+
+/******************************************************************//**
+Duplicate a string. */
+UNIV_INLINE
+void
+fts_string_dup(
+/*===========*/
+ fts_string_t* dst, /*!< in: dup to here */
+ const fts_string_t* src, /*!< in: src string */
+ mem_heap_t* heap); /*!< in: heap to use */
+
+/******************************************************************//**
+Return length of val if it were encoded using our VLC scheme. */
+UNIV_INLINE
+ulint
+fts_get_encoded_len(
+/*================*/
+ /*!< out: length of value
+ encoded, in bytes */
+ ulint val); /*!< in: value to encode */
+
+/******************************************************************//**
+Encode an integer using our VLC scheme and return the length in bytes. */
+UNIV_INLINE
+ulint
+fts_encode_int(
+/*===========*/
+ /*!< out: length of value
+ encoded, in bytes */
+ ulint val, /*!< in: value to encode */
+ byte* buf); /*!< in: buffer, must have
+ enough space */
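+
+/* Worked example (byte layout per fts0vlc.ic: 7 data bits per byte,
+high bit set on the final byte -- stated here as an assumption for
+illustration): val = 1000 = binary 0000111 1101000 encodes as the two
+bytes 0x07 0xE8, so fts_get_encoded_len(1000) == 2, and fts_decode_vlc()
+returns 1000 while advancing the pointer by those 2 bytes. */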
+
+/******************************************************************//**
+Get the selected FTS aux INDEX suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+ ulint selected); /*!< in: selected index */
+
+/** Select the FTS auxiliary index for the given character.
+@param[in] cs charset
+@param[in] str string
+@param[in] len string length in bytes
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+ const CHARSET_INFO* cs,
+ const byte* str,
+ ulint len);
+
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+
+#endif /* INNOBASE_FTS0TYPES_H */
diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic
new file mode 100644
index 00000000..facc1e5c
--- /dev/null
+++ b/storage/innobase/include/fts0types.ic
@@ -0,0 +1,231 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.ic
+Full text search types.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_IC
+#define INNOBASE_FTS0TYPES_IC
+
+/******************************************************************//**
+Duplicate a string. */
+UNIV_INLINE
+void
+fts_string_dup(
+/*===========*/
+ fts_string_t* dst, /*!< in: dup to here */
+ const fts_string_t* src, /*!< in: src string */
+ mem_heap_t* heap) /*!< in: heap to use */
+{
+ dst->f_str = (byte*)mem_heap_alloc(heap, src->f_len + 1);
+ memcpy(dst->f_str, src->f_str, src->f_len);
+
+ dst->f_len = src->f_len;
+ dst->f_str[src->f_len] = 0;
+ dst->f_n_char = src->f_n_char;
+}
+
+/******************************************************************//**
+Compare two fts_trx_row_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const fts_trx_row_t* tr1 = (const fts_trx_row_t*) p1;
+ const fts_trx_row_t* tr2 = (const fts_trx_row_t*) p2;
+
+ return((int)(tr1->doc_id - tr2->doc_id));
+}
+
+/******************************************************************//**
+Compare two fts_ranking_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const fts_ranking_t* rk1 = (const fts_ranking_t*) p1;
+ const fts_ranking_t* rk2 = (const fts_ranking_t*) p2;
+
+ return((int)(rk1->doc_id - rk2->doc_id));
+}
+
+/******************************************************************//**
+Compare two doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int fts_doc_id_cmp(
+/*==================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const doc_id_t* up1 = static_cast<const doc_id_t*>(p1);
+ const doc_id_t* up2 = static_cast<const doc_id_t*>(p2);
+
+ return static_cast<int>(*up1 - *up2);
+}
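
These three comparators use the subtraction idiom, which truncates a
64-bit doc_id difference to int and can report the wrong sign for very
distant ids. An overflow-safe three-way comparison would look like this
(editorial sketch, not part of the patch):

	return (*up1 > *up2) ? 1 : (*up1 < *up2) ? -1 : 0;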
+
+/******************************************************************//**
+Get the first character's code position for FTS index partition */
+extern
+ulint
+innobase_strnxfrm(
+/*==============*/
+ const CHARSET_INFO* cs, /*!< in: Character set */
+ const uchar* p2, /*!< in: string */
+ const ulint len2); /*!< in: string length */
+
+/** Check if the FTS index charset is CJK
+@param[in] cs charset
+@retval true if the charset is cjk
+@retval false if not. */
+inline bool fts_is_charset_cjk(const CHARSET_INFO* cs)
+{
+ switch (cs->number) {
+ case 24: /* my_charset_gb2312_chinese_ci */
+ case 28: /* my_charset_gbk_chinese_ci */
+ case 1: /* my_charset_big5_chinese_ci */
+ case 12: /* my_charset_ujis_japanese_ci */
+ case 13: /* my_charset_sjis_japanese_ci */
+ case 95: /* my_charset_cp932_japanese_ci */
+ case 97: /* my_charset_eucjpms_japanese_ci */
+ case 19: /* my_charset_euckr_korean_ci */
+ return true;
+ default:
+ return false;
+ }
+}
+
+/** Select the FTS auxiliary index for the given character by range.
+@param[in] cs charset
+@param[in] str string
+@param[in] len string length
+@retval the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index_by_range(
+ const CHARSET_INFO* cs,
+ const byte* str,
+ ulint len)
+{
+ ulint selected = 0;
+ ulint value = innobase_strnxfrm(cs, str, len);
+
+ while (fts_index_selector[selected].value != 0) {
+
+ if (fts_index_selector[selected].value == value) {
+
+ return(selected);
+
+ } else if (fts_index_selector[selected].value > value) {
+
+ return(selected > 0 ? selected - 1 : 0);
+ }
+
+ ++selected;
+ }
+
+ ut_ad(selected > 1);
+
+ return(selected - 1);
+}
+
+/** Select the FTS auxiliary index for the given character by hash.
+@param[in] cs charset
+@param[in] str string
+@param[in] len string length
+@retval the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index_by_hash(
+ const CHARSET_INFO* cs,
+ const byte* str,
+ ulint len)
+{
+ ulong nr1 = 1;
+ ulong nr2 = 4;
+
+ ut_ad(!(str == NULL && len > 0));
+
+ if (str == NULL || len == 0) {
+ return 0;
+ }
+
+ /* Get the first char */
+ /* JAN: TODO: MySQL 5.7 had
+ char_len = my_mbcharlen_ptr(cs, reinterpret_cast<const char*>(str),
+ reinterpret_cast<const char*>(str + len));
+ */
+ size_t char_len = size_t(cs->charlen(str, str + len));
+
+ ut_ad(char_len <= len);
+
+ /* Get collation hash code */
+ my_ci_hash_sort(cs, str, char_len, &nr1, &nr2);
+
+ return(nr1 % FTS_NUM_AUX_INDEX);
+}
+
+/** Select the FTS auxiliary index for the given character.
+@param[in] cs charset
+@param[in] str string
+@param[in] len string length in bytes
+@retval the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+ const CHARSET_INFO* cs,
+ const byte* str,
+ ulint len)
+{
+ ulint selected;
+
+ if (fts_is_charset_cjk(cs)) {
+ selected = fts_select_index_by_hash(cs, str, len);
+ } else {
+ selected = fts_select_index_by_range(cs, str, len);
+ }
+
+ return(selected);
+}
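
Usage sketch (editorial, not part of the patch): mapping a token to the
auxiliary-index suffix under which it would be stored; cs, word and
word_len are hypothetical placeholders.

	ulint		selected = fts_select_index(cs, word, word_len);
	const char*	suffix = fts_get_suffix(selected);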
+
+/******************************************************************//**
+Return the selected FTS aux index suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+ ulint selected) /*!< in: selected index */
+{
+ return(fts_index_selector[selected].suffix);
+}
+
+#endif /* INNOBASE_FTS0TYPES_IC */
diff --git a/storage/innobase/include/fts0vlc.ic b/storage/innobase/include/fts0vlc.ic
new file mode 100644
index 00000000..75d85350
--- /dev/null
+++ b/storage/innobase/include/fts0vlc.ic
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0vlc.ic
+Full text variable length integer encoding/decoding.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0VLC_IC
+#define INNOBASE_FTS0VLC_IC
+
+#include "fts0types.h"
+
+/******************************************************************//**
+Return length of val if it were encoded using our VLC scheme.
+FIXME: We will need to be able to encode 8-byte values
+@return length of value encoded, in bytes */
+UNIV_INLINE
+ulint
+fts_get_encoded_len(
+/*================*/
+ ulint val) /* in: value to encode */
+{
+ if (val <= 127) {
+ return(1);
+ } else if (val <= 16383) {
+ return(2);
+ } else if (val <= 2097151) {
+ return(3);
+ } else if (val <= 268435455) {
+ return(4);
+ } else {
+ /* Possibly we should care that on 64-bit machines ulint can
+ contain values that we can't encode in 5 bytes, but
+ fts_encode_int doesn't handle them either so it doesn't much
+ matter. */
+
+ return(5);
+ }
+}
+
+/******************************************************************//**
+Encode an integer using our VLC scheme and return the length in bytes.
+@return length of value encoded, in bytes */
+UNIV_INLINE
+ulint
+fts_encode_int(
+/*===========*/
+ ulint val, /* in: value to encode */
+ byte* buf) /* in: buffer, must have enough space */
+{
+ ulint len;
+
+ if (val <= 127) {
+ *buf = (byte) val;
+
+ len = 1;
+ } else if (val <= 16383) {
+ *buf++ = (byte)(val >> 7);
+ *buf = (byte)(val & 0x7F);
+
+ len = 2;
+ } else if (val <= 2097151) {
+ *buf++ = (byte)(val >> 14);
+ *buf++ = (byte)((val >> 7) & 0x7F);
+ *buf = (byte)(val & 0x7F);
+
+ len = 3;
+ } else if (val <= 268435455) {
+ *buf++ = (byte)(val >> 21);
+ *buf++ = (byte)((val >> 14) & 0x7F);
+ *buf++ = (byte)((val >> 7) & 0x7F);
+ *buf = (byte)(val & 0x7F);
+
+ len = 4;
+ } else {
+ /* Best to keep the limitations of the 32/64 bit versions
+ identical, at least for the time being. */
+ ut_ad(val <= 4294967295u);
+
+ *buf++ = (byte)(val >> 28);
+ *buf++ = (byte)((val >> 21) & 0x7F);
+ *buf++ = (byte)((val >> 14) & 0x7F);
+ *buf++ = (byte)((val >> 7) & 0x7F);
+ *buf = (byte)(val & 0x7F);
+
+ len = 5;
+ }
+
+ /* High-bit on means "last byte in the encoded integer". */
+ *buf |= 0x80;
+
+ return(len);
+}
+
+/******************************************************************//**
+Decode and return the integer that was encoded using our VLC scheme.
+@return value decoded */
+UNIV_INLINE
+ulint
+fts_decode_vlc(
+/*===========*/
+ byte** ptr) /* in: ptr to decode from, this ptr is
+ incremented by the number of bytes decoded */
+{
+ ulint val = 0;
+
+ for (;;) {
+ byte b = **ptr;
+
+ ++*ptr;
+ val |= (b & 0x7F);
+
+ /* High-bit on means "last byte in the encoded integer". */
+ if (b & 0x80) {
+ break;
+ } else {
+ val <<= 7;
+ }
+ }
+
+ return(val);
+}
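
Worked round trip (editorial, not part of the patch): 300 <= 16383, so
fts_encode_int() emits two bytes, 0x02 and 0x2C | 0x80 = 0xAC, and
fts_decode_vlc() folds them back into (2 << 7) | 0x2C = 300.

	byte	buf[5];
	byte*	p = buf;

	ut_ad(fts_encode_int(300, buf) == 2);	/* buf = {0x02, 0xAC} */
	ut_ad(fts_decode_vlc(&p) == 300);	/* p now points at buf + 2 */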
+
+#endif
diff --git a/storage/innobase/include/fut0fut.h b/storage/innobase/include/fut0fut.h
new file mode 100644
index 00000000..a52fc256
--- /dev/null
+++ b/storage/innobase/include/fut0fut.h
@@ -0,0 +1,74 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0fut.h
+File-based utilities
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+
+#ifndef fut0fut_h
+#define fut0fut_h
+
+#include "mtr0mtr.h"
+
+/** Gets a pointer to a file address and latches the page.
+@param[in] space space id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] addr file address
+@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_SX_LATCH
+@param[out] ptr_block file page
+@param[in,out] mtr mini-transaction
+@return pointer to a byte in (*ptr_block)->frame; the *ptr_block is
+buffer-fixed and latched */
+UNIV_INLINE
+byte*
+fut_get_ptr(
+ ulint space,
+ ulint zip_size,
+ fil_addr_t addr,
+ rw_lock_type_t rw_latch,
+ mtr_t* mtr,
+ buf_block_t** ptr_block = NULL)
+{
+ buf_block_t* block;
+ byte* ptr = NULL;
+
+ ut_ad(addr.boffset < srv_page_size);
+ ut_ad((rw_latch == RW_S_LATCH)
+ || (rw_latch == RW_X_LATCH)
+ || (rw_latch == RW_SX_LATCH));
+
+ block = buf_page_get(page_id_t(space, addr.page), zip_size,
+ rw_latch, mtr);
+
+ ptr = buf_block_get_frame(block) + addr.boffset;
+
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ if (ptr_block != NULL) {
+ *ptr_block = block;
+ }
+
+ return(ptr);
+}
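
Typical use (editorial sketch; space_id and node_addr are hypothetical):
the caller brackets the access in a mini-transaction, which keeps the
page buffer-fixed and latched until the commit.

	mtr_t	mtr;

	mtr.start();
	byte*	ptr = fut_get_ptr(space_id, 0, node_addr, RW_S_LATCH, &mtr);
	/* ... read the bytes at ptr while the latch is held ... */
	mtr.commit();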
+
+#endif /* fut0fut_h */
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
new file mode 100644
index 00000000..1ade24cd
--- /dev/null
+++ b/storage/innobase/include/fut0lst.h
@@ -0,0 +1,163 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0lst.h
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef fut0lst_h
+#define fut0lst_h
+
+#ifdef UNIV_INNOCHECKSUM
+# include "fil0fil.h"
+#else
+#include "fut0fut.h"
+#include "mtr0log.h"
+
+/* The C 'types' of base node and list node: these should be used to
+write self-documenting code. Of course, the sizeof macro cannot be
+applied to these types! */
+
+typedef byte flst_base_node_t;
+typedef byte flst_node_t;
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/* The physical size of a list base node in bytes */
+#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE)
+/* The physical size of a list node in bytes */
+#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE)
+
+#ifndef UNIV_INNOCHECKSUM
+/* We define the field offsets of a node for the list */
+#define FLST_PREV 0 /* 6-byte address of the previous list element;
+ the page part of address is FIL_NULL, if no
+ previous element */
+#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next
+ list element; the page part of address
+ is FIL_NULL, if no next element */
+
+/* We define the field offsets of a base node for the list */
+#define FLST_LEN 0 /* 32-bit list length field */
+#define FLST_FIRST 4 /* 6-byte address of the first element
+ of the list; undefined if empty list */
+#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the
+ last element of the list; undefined
+ if empty list */
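
For reference, the resulting byte layouts (FIL_ADDR_SIZE is 6: a 4-byte
page number followed by a 2-byte offset):

	base node (FLST_BASE_NODE_SIZE = 16 bytes):
	  0..3    FLST_LEN    list length
	  4..9    FLST_FIRST  fil_addr_t of the first node
	  10..15  FLST_LAST   fil_addr_t of the last node
	list node (FLST_NODE_SIZE = 12 bytes):
	  0..5    FLST_PREV   fil_addr_t of the previous node
	  6..11   FLST_NEXT   fil_addr_t of the next node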
+
+/** Initialize a zero-initialized list base node.
+@param[in,out] block file page
+@param[in] ofs byte offset of the list base node
+@param[in,out] mtr mini-transaction */
+inline void flst_init(const buf_block_t* block, uint16_t ofs, mtr_t* mtr)
+{
+ ut_ad(!mach_read_from_2(FLST_LEN + ofs + block->frame));
+ ut_ad(!mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs + block->frame));
+ ut_ad(!mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs + block->frame));
+ compile_time_assert(FIL_NULL == 0xffU * 0x1010101U);
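+	/* FIL_NULL is 0xFFFFFFFF; overwriting the two 4-byte page
+	numbers with 0xff marks FIRST and LAST as null addresses. */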
+ mtr->memset(block, FLST_FIRST + FIL_ADDR_PAGE + ofs, 4, 0xff);
+ mtr->memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff);
+}
+
+/** Initialize a list base node.
+@param[in] block file page
+@param[in,out] base base node
+@param[in,out] mtr mini-transaction */
+void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/** Append a file list node to a list.
+@param[in,out] base base node block
+@param[in] boffset byte offset of the base node
+@param[in,out] add block to be added
+@param[in] aoffset byte offset of the node to be added
+@param[in,out]	mtr	mini-transaction */
+void flst_add_last(buf_block_t *base, uint16_t boffset,
+ buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+/** Prepend a file list node to a list.
+@param[in,out] base base node block
+@param[in] boffset byte offset of the base node
+@param[in,out] add block to be added
+@param[in] aoffset byte offset of the node to be added
+@param[in,out]	mtr	mini-transaction */
+void flst_add_first(buf_block_t *base, uint16_t boffset,
+ buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+/** Remove a file list node.
+@param[in,out] base base node block
+@param[in] boffset byte offset of the base node
+@param[in,out] cur block to be removed
+@param[in] coffset byte offset of the current record to be removed
+@param[in,out]	mtr	mini-transaction */
+void flst_remove(buf_block_t *base, uint16_t boffset,
+ buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/** @return the length of a list */
+inline uint32_t flst_get_len(const flst_base_node_t *base)
+{
+ return mach_read_from_4(base + FLST_LEN);
+}
+
+/** @return a file address */
+inline fil_addr_t flst_read_addr(const byte *faddr)
+{
+ fil_addr_t addr= { mach_read_from_4(faddr + FIL_ADDR_PAGE),
+ mach_read_from_2(faddr + FIL_ADDR_BYTE) };
+ ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
+ ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA);
+ return addr;
+}
+
+/** @return list first node address */
+inline fil_addr_t flst_get_first(const flst_base_node_t *base)
+{
+ return flst_read_addr(base + FLST_FIRST);
+}
+
+/** @return list last node address */
+inline fil_addr_t flst_get_last(const flst_base_node_t *base)
+{
+ return flst_read_addr(base + FLST_LAST);
+}
+
+/** @return list next node address */
+inline fil_addr_t flst_get_next_addr(const flst_node_t* node)
+{
+ return flst_read_addr(node + FLST_NEXT);
+}
+
+/** @return list prev node address */
+inline fil_addr_t flst_get_prev_addr(const flst_node_t *node)
+{
+ return flst_read_addr(node + FLST_PREV);
+}
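
Putting the accessors together (editorial sketch; space_id, base and mtr
are assumed to have been set up by the caller):

	for (fil_addr_t addr = flst_get_first(base);
	     addr.page != FIL_NULL; ) {
		const flst_node_t* node = fut_get_ptr(
			space_id, 0, addr, RW_S_LATCH, &mtr);
		/* ... process node ... */
		addr = flst_get_next_addr(node);
	}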
+
+#ifdef UNIV_DEBUG
+/** Validate a file-based list. */
+void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr);
+#endif
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/gis0geo.h b/storage/innobase/include/gis0geo.h
new file mode 100644
index 00000000..3fd01a3a
--- /dev/null
+++ b/storage/innobase/include/gis0geo.h
@@ -0,0 +1,122 @@
+/*****************************************************************************
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software Foundation,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*****************************************************************************/
+
+/**************************************************//**
+@file gis0geo.h
+The R-tree definitions from MyISAM
+*******************************************************/
+
+#ifndef _gis0geo_h
+#define _gis0geo_h
+
+#include "my_global.h"
+#include "string.h"
+
+#define SPTYPE HA_KEYTYPE_DOUBLE
+#define SPLEN 8
+
+/* Since the mbr could be a point or a linestring, its area can be 0. So
+we define this weight macro, used when calculating the area increase
+needed to enlarge the mbr. */
+#define LINE_MBR_WEIGHTS 0.001
+
+/* Types of "well-known binary representation" (wkb) format. */
+enum wkbType
+{
+ wkbPoint = 1,
+ wkbLineString = 2,
+ wkbPolygon = 3,
+ wkbMultiPoint = 4,
+ wkbMultiLineString = 5,
+ wkbMultiPolygon = 6,
+ wkbGeometryCollection = 7
+};
+
+/* Byte order of "well-known binary representation" (wkb) format. */
+enum wkbByteOrder
+{
+ wkbXDR = 0, /* Big Endian */
+ wkbNDR = 1 /* Little Endian */
+};
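
For example, a little-endian (wkbNDR) WKB Point is laid out as one
byte-order byte, a 4-byte type, then one IEEE 754 double per dimension:

	byte 0       : 0x01                 (wkbNDR)
	bytes 1..4   : 0x01 0x00 0x00 0x00  (wkbPoint)
	bytes 5..12  : x coordinate, IEEE 754 double
	bytes 13..20 : y coordinate, IEEE 754 double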
+
+/*************************************************************//**
+Calculate minimal bounding rectangle (mbr) of the spatial object
+stored in "well-known binary representation" (wkb) format.
+@return 0 if ok */
+int
+rtree_mbr_from_wkb(
+/*===============*/
+ const uchar* wkb, /*!< in: pointer to wkb. */
+ uint size, /*!< in: size of wkb. */
+ uint n_dims, /*!< in: dimensions. */
+ double* mbr); /*!< in/out: mbr. */
+
+/* Rtree split node structure. */
+struct rtr_split_node_t
+{
+ double square; /* square of the mbr.*/
+ int n_node; /* which group in.*/
+ uchar* key; /* key. */
+ double* coords; /* mbr. */
+};
+
+/*************************************************************//**
+Inline function for reserving coords */
+inline
+static
+double*
+reserve_coords(double **d_buffer, /*!< in/out: buffer. */
+ int n_dim) /*!< in: dimensions. */
+/*===========*/
+{
+ double *coords = *d_buffer;
+ (*d_buffer) += n_dim * 2;
+ return coords;
+}
+
+/*************************************************************//**
+Split rtree nodes.
+Return which group the first rec is in. */
+int
+split_rtree_node(
+/*=============*/
+ rtr_split_node_t* node, /*!< in: split nodes.*/
+ int n_entries, /*!< in: entries number.*/
+ int all_size, /*!< in: total key's size.*/
+ int key_size, /*!< in: key's size.*/
+ int min_size, /*!< in: minimal group size.*/
+ int size1, /*!< in: size of group.*/
+ int size2, /*!< in: initial group sizes */
+ double** d_buffer, /*!< in/out: buffer.*/
+ int n_dim, /*!< in: dimensions. */
+ uchar* first_rec); /*!< in: the first rec. */
+
+/** Compare two minimum bounding rectangles.
+@param mode comparison operator
+ MBR_INTERSECT(a,b) a overlaps b
+ MBR_CONTAIN(a,b) a contains b
+ MBR_DISJOINT(a,b) a disjoint b
+ MBR_WITHIN(a,b) a within b
+ MBR_EQUAL(a,b) All coordinates of MBRs are equal
+ MBR_DATA(a,b) Data reference is the same
+@param b first MBR
+@param a second MBR
+@retval 0 if the predicate holds
+@retval 1 if the predicate does not hold */
+int rtree_key_cmp(page_cur_mode_t mode, const void *b, const void *a);
+#endif
diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h
new file mode 100644
index 00000000..f7a2d6cd
--- /dev/null
+++ b/storage/innobase/include/gis0rtree.h
@@ -0,0 +1,494 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/gis0rtree.h
+R-tree header file
+
+Created 2013/03/27 Jimmy Yang and Allen Lai
+***********************************************************************/
+
+#ifndef gis0rtree_h
+#define gis0rtree_h
+
+#include "btr0cur.h"
+#include "rem0types.h"
+
+/* Whether MBR 'a' contains 'b' */
+#define MBR_CONTAIN_CMP(a, b) \
+ ((((b)->xmin >= (a)->xmin) && ((b)->xmax <= (a)->xmax) \
+ && ((b)->ymin >= (a)->ymin) && ((b)->ymax <= (a)->ymax)))
+
+/* Whether MBR 'a' equals to 'b' */
+#define MBR_EQUAL_CMP(a, b) \
+ ((((b)->xmin == (a)->xmin) && ((b)->xmax == (a)->xmax)) \
+ && (((b)->ymin == (a)->ymin) && ((b)->ymax == (a)->ymax)))
+
+/* Whether MBR 'a' intersects 'b' */
+#define MBR_INTERSECT_CMP(a, b) \
+ ((((b)->xmin <= (a)->xmax) || ((b)->xmax >= (a)->xmin)) \
+ && (((b)->ymin <= (a)->ymax) || ((b)->ymax >= (a)->ymin)))
+
+/* Whether MBR 'a' and 'b' disjoint */
+#define MBR_DISJOINT_CMP(a, b) (!MBR_INTERSECT_CMP(a, b))
+
+/* Whether MBR 'a' within 'b' */
+#define MBR_WITHIN_CMP(a, b) \
+ ((((b)->xmin <= (a)->xmin) && ((b)->xmax >= (a)->xmax)) \
+ && (((b)->ymin <= (a)->ymin) && ((b)->ymax >= (a)->ymax)))
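
Worked example (editorial sketch; rtr_mbr_t is {xmin, xmax, ymin, ymax},
as defined in gis0type.h):

	rtr_mbr_t big   = {0, 4, 0, 4};
	rtr_mbr_t small = {1, 2, 1, 2};

	ut_ad(MBR_CONTAIN_CMP(&big, &small));	/* big contains small */
	ut_ad(MBR_WITHIN_CMP(&small, &big));	/* small lies within big */
	ut_ad(!MBR_DISJOINT_CMP(&big, &small));	/* they overlap */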
+
+/* Define it for rtree search mode checking. */
+#define RTREE_SEARCH_MODE(mode) \
+ (((mode) >= PAGE_CUR_CONTAIN) && ((mode <= PAGE_CUR_RTREE_GET_FATHER)))
+
+/* Geometry data header */
+#define GEO_DATA_HEADER_SIZE 4
+/**********************************************************************//**
+Builds a Rtree node pointer out of a physical record and a page number.
+@return own: node pointer */
+dtuple_t*
+rtr_index_build_node_ptr(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ const rtr_mbr_t* mbr, /*!< in: mbr of lower page */
+ const rec_t* rec, /*!< in: record for which to build node
+ pointer */
+ ulint page_no,/*!< in: page number to put in node
+ pointer */
+ mem_heap_t* heap); /*!< in: memory heap where pointer
+ created */
+
+/*************************************************************//**
+Splits an R-tree index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed, we cannot reverse it: therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+@return inserted record */
+rec_t*
+rtr_page_split_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in/out: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr); /*!< in: mtr */
+
+/**************************************************************//**
+Sets the child node mbr in a node pointer. */
+UNIV_INLINE
+void
+rtr_page_cal_mbr(
+/*=============*/
+ const dict_index_t* index, /*!< in: index */
+ const buf_block_t* block, /*!< in: buffer block */
+ rtr_mbr_t* mbr, /*!< out: MBR encapsulates the page */
+ mem_heap_t* heap); /*!< in: heap for the memory
+ allocation */
+/*************************************************************//**
+Find the next matching record. This function will first exhaust
+the copied records listed in the rtr_info->matches vector before
+moving to the next page.
+@return true if a next qualified record is found, false if
+exhausted */
+bool
+rtr_pcur_move_to_next(
+/*==================*/
+ const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
+ tuple must be set so that it cannot get
+ compared to the node ptr page number field! */
+ page_cur_mode_t mode, /*!< in: cursor search mode */
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ ulint cur_level,
+ /*!< in: current level */
+ mtr_t* mtr); /*!< in: mtr */
+
+/****************************************************************//**
+Searches the right position in rtree for a page cursor. */
+bool
+rtr_cur_search_with_match(
+/*======================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ dict_index_t* index, /*!< in: index descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ rtr_info_t* rtr_info);/*!< in/out: search stack */
+
+/****************************************************************//**
+Calculate the area increased for a new record
+@return area increased */
+double
+rtr_rec_cal_increase(
+/*=================*/
+ const dtuple_t* dtuple, /*!< in: data tuple to insert, which
+ cause area increase */
+ const rec_t* rec, /*!< in: physical record which differs from
+ dtuple in some of the common fields, or which
+ has an equal number or more fields than
+ dtuple */
+ double* area); /*!< out: increased area */
+
+/****************************************************************//**
+Follow the right link to find the proper block for the insert.
+@return the proper block. */
+dberr_t
+rtr_ins_enlarge_mbr(
+/*=================*/
+ btr_cur_t* cursor, /*!< in: btr cursor */
+ mtr_t* mtr); /*!< in: mtr */
+
+/**************************************************************//**
+push a nonleaf index node to the search path */
+UNIV_INLINE
+void
+rtr_non_leaf_stack_push(
+/*====================*/
+ rtr_node_path_t* path, /*!< in/out: search path */
+ uint32_t pageno, /*!< in: pageno to insert */
+ node_seq_t seq_no, /*!< in: Node sequence num */
+ ulint level, /*!< in: index level */
+ uint32_t child_no, /*!< in: child page no */
+ btr_pcur_t* cursor, /*!< in: position cursor */
+ double mbr_inc); /*!< in: MBR needs to be
+ enlarged */
+
+/**************************************************************//**
+push a nonleaf index node to the search path for insertion */
+void
+rtr_non_leaf_insert_stack_push(
+/*===========================*/
+ dict_index_t* index, /*!< in: index descriptor */
+ rtr_node_path_t* path, /*!< in/out: search path */
+ ulint level, /*!< in: index level */
+ const buf_block_t* block, /*!< in: block of the page */
+ const rec_t* rec, /*!< in: positioned record */
+ double mbr_inc); /*!< in: MBR needs to be
+ enlarged */
+
+#define rtr_get_new_ssn_id(index) (index)->assign_ssn()
+#define rtr_get_current_ssn_id(index) (index)->ssn()
+
+/********************************************************************//**
+Create a RTree search info structure */
+rtr_info_t*
+rtr_create_rtr_info(
+/******************/
+ bool need_prdt, /*!< in: Whether predicate lock is
+ needed */
+ bool init_matches, /*!< in: Whether to initiate the
+ "matches" structure for collecting
+ matched leaf records */
+ btr_cur_t* cursor, /*!< in: tree search cursor */
+ dict_index_t* index); /*!< in: index struct */
+
+/********************************************************************//**
+Update a btr_cur_t with rtr_info */
+void
+rtr_info_update_btr(
+/******************/
+ btr_cur_t* cursor, /*!< in/out: tree cursor */
+ rtr_info_t* rtr_info); /*!< in: rtr_info to set to the
+ cursor */
+
+/********************************************************************//**
+Update a btr_cur_t with rtr_info */
+void
+rtr_init_rtr_info(
+/****************/
+ rtr_info_t* rtr_info, /*!< in: rtr_info to set to the
+ cursor */
+ bool need_prdt, /*!< in: Whether predicate lock is
+ needed */
+ btr_cur_t* cursor, /*!< in: tree search cursor */
+ dict_index_t* index, /*!< in: index structure */
+ bool reinit); /*!< in: Whether this is a reinit */
+
+/**************************************************************//**
+Clean up Rtree cursor */
+void
+rtr_clean_rtr_info(
+/*===============*/
+ rtr_info_t* rtr_info, /*!< in: RTree search info */
+ bool free_all); /*!< in: need to free rtr_info itself */
+
+/****************************************************************//**
+Get the bounding box content from an index record */
+void
+rtr_get_mbr_from_rec(
+/*=================*/
+ const rec_t* rec, /*!< in: data tuple */
+ const rec_offs* offsets,/*!< in: offsets array */
+	rtr_mbr_t*	mbr);	/*!< out: MBR */
+
+/****************************************************************//**
+Get the bounding box content from an MBR data record */
+void
+rtr_get_mbr_from_tuple(
+/*===================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ rtr_mbr* mbr); /*!< out: mbr to fill */
+
+/* Get the rtree page father.
+@param[in]	index	rtree index
+@param[in]	block	child page in the index
+@param[in]	mtr	mtr
+@param[in]	sea_cur	search cursor, contains information
+			about parent nodes in search
+@param[in]	cursor	cursor on node pointer record,
+			its page x-latched */
+void
+rtr_page_get_father(
+ dict_index_t* index,
+ buf_block_t* block,
+ mtr_t* mtr,
+ btr_cur_t* sea_cur,
+ btr_cur_t* cursor);
+
+/************************************************************//**
+Returns the father block to a page. It is assumed that mtr holds
+an X or SX latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+rec_offs*
+rtr_page_get_father_block(
+/*======================*/
+ rec_offs* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ dict_index_t* index, /*!< in: b-tree index */
+ buf_block_t* block, /*!< in: child page in the index */
+ mtr_t* mtr, /*!< in: mtr */
+ btr_cur_t* sea_cur,/*!< in: search cursor, contains information
+ about parent nodes in search */
+ btr_cur_t* cursor);/*!< out: cursor on node pointer record,
+ its page x-latched */
+/**************************************************************//**
+Store the parent path cursor
+@return number of cursor stored */
+ulint
+rtr_store_parent_path(
+/*==================*/
+ const buf_block_t* block, /*!< in: block of the page */
+ btr_cur_t* btr_cur,/*!< in/out: persistent cursor */
+ ulint latch_mode,
+ /*!< in: latch_mode */
+ ulint level, /*!< in: index level */
+ mtr_t* mtr); /*!< in: mtr */
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+void
+rtr_pcur_open_low(
+/*==============*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level in the btree */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page from the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+
+#define rtr_pcur_open(i,t,md,l,c,m) \
+ rtr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,m)
+
+struct btr_cur_t;
+
+/*********************************************************//**
+Returns the R-Tree node stored in the parent search path
+@return pointer to R-Tree cursor component */
+UNIV_INLINE
+node_visit_t*
+rtr_get_parent_node(
+/*================*/
+ btr_cur_t* btr_cur, /*!< in: persistent cursor */
+ ulint level, /*!< in: index level of buffer page */
+ ulint is_insert); /*!< in: whether it is insert */
+
+/*********************************************************//**
+Returns the R-Tree cursor stored in the parent search path
+@return pointer to R-Tree cursor component */
+UNIV_INLINE
+btr_pcur_t*
+rtr_get_parent_cursor(
+/*==================*/
+ btr_cur_t* btr_cur, /*!< in: persistent cursor */
+ ulint level, /*!< in: index level of buffer page */
+ ulint is_insert); /*!< in: whether insert operation */
+
+/*************************************************************//**
+Copy records from a page to new_block of an rtree. */
+void
+rtr_page_copy_rec_list_end_no_locks(
+/*================================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ rtr_rec_move_t* rec_move, /*!< in: recording records moved */
+ ulint max_move, /*!< in: num of rec to move */
+ ulint* num_moved, /*!< out: num of rec to move */
+ mtr_t* mtr); /*!< in: mtr */
+
+/*************************************************************//**
+Copy records up to a specified record from a page to new_block of an rtree. */
+void
+rtr_page_copy_rec_list_start_no_locks(
+/*==================================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ rtr_rec_move_t* rec_move, /*!< in: recording records moved */
+ ulint max_move, /*!< in: num of rec to move */
+ ulint* num_moved, /*!< out: num of rec to move */
+ mtr_t* mtr); /*!< in: mtr */
+
+/****************************************************************//**
+Merge two MBRs and update the MBR that the cursor is on. */
+dberr_t
+rtr_merge_and_update_mbr(
+/*=====================*/
+ btr_cur_t* cursor, /*!< in/out: cursor */
+ btr_cur_t* cursor2, /*!< in: the other cursor */
+ rec_offs* offsets, /*!< in: rec offsets */
+ rec_offs* offsets2, /*!< in: rec offsets */
+ page_t* child_page, /*!< in: the child page. */
+ mtr_t* mtr); /*!< in: mtr */
+
+/*************************************************************//**
+Deletes on the upper level the node pointer to a page. */
+void
+rtr_node_ptr_delete(
+/*================*/
+ btr_cur_t* cursor, /*!< in: search cursor, contains information
+ about parent nodes in search */
+ mtr_t* mtr); /*!< in: mtr */
+
+/****************************************************************//**
+Check whether two MBRs are identical or need to be merged */
+bool
+rtr_merge_mbr_changed(
+/*==================*/
+ btr_cur_t* cursor, /*!< in: cursor */
+ btr_cur_t* cursor2, /*!< in: the other cursor */
+ rec_offs* offsets, /*!< in: rec offsets */
+ rec_offs* offsets2, /*!< in: rec offsets */
+ rtr_mbr_t* new_mbr); /*!< out: MBR to update */
+
+
+/**************************************************************//**
+Update the mbr field of a spatial index row.
+@return true if successful */
+bool
+rtr_update_mbr_field(
+/*=================*/
+ btr_cur_t* cursor, /*!< in: cursor pointed to rec.*/
+ rec_offs* offsets, /*!< in: offsets on rec. */
+ btr_cur_t* cursor2, /*!< in/out: cursor pointed to rec
+ that should be deleted.
+ this cursor is for btr_compress to
+ delete the merged page's father rec.*/
+ page_t* child_page, /*!< in: child page. */
+ rtr_mbr_t* new_mbr, /*!< in: the new mbr. */
+ rec_t* new_rec, /*!< in: rec to use */
+ mtr_t* mtr); /*!< in: mtr */
+
+/**************************************************************//**
+Check whether an R-tree page is a child of a parent page
+@return true if there is child/parent relationship */
+bool
+rtr_check_same_block(
+/*=================*/
+ dict_index_t* index, /*!< in: index tree */
+ btr_cur_t* cur, /*!< in/out: position at the parent entry
+ pointing to the child if successful */
+ buf_block_t* parentb,/*!< in: parent page to check */
+ buf_block_t* childb, /*!< in: child Page */
+ mem_heap_t* heap); /*!< in: memory heap */
+
+/*********************************************************************//**
+Write an MBR to a data buffer. */
+UNIV_INLINE
+void
+rtr_write_mbr(
+/*==========*/
+ byte* data, /*!< out: data */
+ const rtr_mbr_t* mbr); /*!< in: data */
+
+/*********************************************************************//**
+Read an MBR from a data buffer. */
+UNIV_INLINE
+void
+rtr_read_mbr(
+/*==========*/
+ const byte* data, /*!< in: data */
+ rtr_mbr_t* mbr); /*!< out: data */
+
+/**************************************************************//**
+Check whether a discarding page is in anyone's search path */
+void
+rtr_check_discard_page(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
+ the root page */
+ buf_block_t* block); /*!< in: block of page to be discarded */
+
+/********************************************************************//**
+Reinitialize an R-tree search info */
+UNIV_INLINE
+void
+rtr_info_reinit_in_cursor(
+/************************/
+ btr_cur_t* cursor, /*!< in/out: tree cursor */
+ dict_index_t* index, /*!< in: index struct */
+ bool need_prdt); /*!< in: Whether predicate lock is
+ needed */
+
+/** Estimates the number of rows in a given area.
+@param[in] index index
+@param[in] tuple range tuple containing mbr, may also be empty tuple
+@param[in] mode search mode
+@return estimated number of rows */
+ha_rows
+rtr_estimate_n_rows_in_range(
+ dict_index_t* index,
+ const dtuple_t* tuple,
+ page_cur_mode_t mode);
+
+#include "gis0rtree.ic"
+#endif /* gis0rtree_h */
diff --git a/storage/innobase/include/gis0rtree.ic b/storage/innobase/include/gis0rtree.ic
new file mode 100644
index 00000000..1b53caa3
--- /dev/null
+++ b/storage/innobase/include/gis0rtree.ic
@@ -0,0 +1,242 @@
+/*****************************************************************************
+
+Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/gis0rtree.ic
+R-tree inline code
+
+Created 2013/03/27 Jimmy Yang and Allen Lai
+***********************************************************************/
+
+/**************************************************************//**
+Sets the child node mbr in a node pointer. */
+UNIV_INLINE
+void
+rtr_page_cal_mbr(
+/*=============*/
+ const dict_index_t* index, /*!< in: index */
+ const buf_block_t* block, /*!< in: buffer block */
+ rtr_mbr_t* rtr_mbr,/*!< out: MBR encapsulates the page */
+ mem_heap_t* heap) /*!< in: heap for the memory
+ allocation */
+{
+ page_t* page;
+ rec_t* rec;
+ const byte* field;
+ ulint len;
+ rec_offs* offsets = NULL;
+ double bmin, bmax;
+ double* amin;
+ double* amax;
+ ulint inc = 0;
+ double* mbr;
+
+ rtr_mbr->xmin = DBL_MAX;
+ rtr_mbr->ymin = DBL_MAX;
+ rtr_mbr->xmax = -DBL_MAX;
+ rtr_mbr->ymax = -DBL_MAX;
+
+ mbr = reinterpret_cast<double*>(rtr_mbr);
+
+ page = buf_block_get_frame(block);
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page)
+ ? index->n_fields : 0,
+ ULINT_UNDEFINED, &heap);
+
+ do {
+ /* The mbr address is in the first field. */
+ field = rec_get_nth_field(rec, offsets, 0, &len);
+
+ ut_ad(len == DATA_MBR_LEN);
+ inc = 0;
+ for (unsigned i = 0; i < SPDIMS; i++) {
+ bmin = mach_double_read(field + inc);
+ bmax = mach_double_read(field + inc + sizeof(double));
+
+ amin = mbr + i * SPDIMS;
+ amax = mbr + i * SPDIMS + 1;
+
+ if (*amin > bmin)
+ *amin = bmin;
+ if (*amax < bmax)
+ *amax = bmax;
+
+ inc += 2 * sizeof(double);
+ }
+
+ rec = page_rec_get_next(rec);
+
+ if (rec == NULL) {
+ break;
+ }
+ } while (!page_rec_is_supremum(rec));
+}
+
+/**************************************************************//**
+push a nonleaf index node to the search path */
+UNIV_INLINE
+void
+rtr_non_leaf_stack_push(
+/*====================*/
+ rtr_node_path_t* path, /*!< in/out: search path */
+ uint32_t pageno, /*!< in: pageno to insert */
+ node_seq_t seq_no, /*!< in: Node sequence num */
+ ulint level, /*!< in: index page level */
+ uint32_t child_no, /*!< in: child page no */
+ btr_pcur_t* cursor, /*!< in: position cursor */
+ double mbr_inc) /*!< in: MBR needs to be
+ enlarged */
+{
+ node_visit_t insert_val;
+
+ insert_val.page_no = pageno;
+ insert_val.seq_no = seq_no;
+ insert_val.level = level;
+ insert_val.child_no = child_no;
+ insert_val.cursor = cursor;
+ insert_val.mbr_inc = mbr_inc;
+
+ path->push_back(insert_val);
+
+#ifdef RTR_SEARCH_DIAGNOSTIC
+ fprintf(stderr, "INNODB_RTR: Push page %d, level %d, seq %d"
+ " to search stack \n",
+ static_cast<int>(pageno), static_cast<int>(level),
+ static_cast<int>(seq_no));
+#endif /* RTR_SEARCH_DIAGNOSTIC */
+}
+
+/*********************************************************************//**
+Write an MBR to a data buffer. */
+UNIV_INLINE
+void
+rtr_write_mbr(
+/*==========*/
+ byte* data, /*!< out: data */
+ const rtr_mbr_t* mbr) /*!< in: data */
+{
+ const double* my_mbr = reinterpret_cast<const double*>(mbr);
+
+ for (unsigned i = 0; i < SPDIMS * 2; i++) {
+ mach_double_write(data + i * sizeof(double), my_mbr[i]);
+ }
+}
+
+/*********************************************************************//**
+Read an MBR from a data buffer. */
+UNIV_INLINE
+void
+rtr_read_mbr(
+/*==========*/
+ const byte* data, /*!< in: data */
+ rtr_mbr_t* mbr) /*!< out: MBR */
+{
+ for (unsigned i = 0; i < SPDIMS * 2; i++) {
+ (reinterpret_cast<double*>(mbr))[i] = mach_double_read(
+ data
+ + i * sizeof(double));
+ }
+}
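
Round-trip sketch (editorial; SPDIMS is 2, so an MBR serialises into
4 doubles, i.e. 32 bytes):

	byte		buf[SPDIMS * 2 * sizeof(double)];
	rtr_mbr_t	mbr = {0, 4, 0, 4};
	rtr_mbr_t	copy;

	rtr_write_mbr(buf, &mbr);
	rtr_read_mbr(buf, &copy);	/* copy now equals mbr */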
+
+/*********************************************************//**
+Returns the R-Tree node stored in the parent search path
+@return pointer to R-Tree cursor component in the parent path,
+NULL if the parent path is empty or the level is out of range */
+UNIV_INLINE
+node_visit_t*
+rtr_get_parent_node(
+/*================*/
+ btr_cur_t* btr_cur, /*!< in: persistent cursor */
+ ulint level, /*!< in: index level of buffer page */
+ ulint is_insert) /*!< in: whether it is insert */
+{
+ ulint num;
+ ulint tree_height = btr_cur->tree_height;
+ node_visit_t* found_node = NULL;
+
+ if (level >= tree_height) {
+ return(NULL);
+ }
+
+ mutex_enter(&btr_cur->rtr_info->rtr_path_mutex);
+
+ num = btr_cur->rtr_info->parent_path->size();
+
+ if (!num) {
+ mutex_exit(&btr_cur->rtr_info->rtr_path_mutex);
+ return(NULL);
+ }
+
+ if (is_insert) {
+ ulint idx = tree_height - level - 1;
+ ut_ad(idx < num);
+
+ found_node = &(*btr_cur->rtr_info->parent_path)[idx];
+ } else {
+ node_visit_t* node;
+
+ while (num > 0) {
+ node = &(*btr_cur->rtr_info->parent_path)[num - 1];
+
+ if (node->level == level) {
+ found_node = node;
+ break;
+ }
+ num--;
+ }
+ }
+
+ mutex_exit(&btr_cur->rtr_info->rtr_path_mutex);
+
+ return(found_node);
+}
+
+/*********************************************************//**
+Returns the R-Tree cursor stored in the parent search path
+@return pointer to R-Tree cursor component */
+UNIV_INLINE
+btr_pcur_t*
+rtr_get_parent_cursor(
+/*==================*/
+ btr_cur_t* btr_cur, /*!< in: persistent cursor */
+ ulint level, /*!< in: index level of buffer page */
+ ulint is_insert) /*!< in: whether insert operation */
+{
+ node_visit_t* found_node = rtr_get_parent_node(
+ btr_cur, level, is_insert);
+
+ return((found_node) ? found_node->cursor : NULL);
+}
+
+/********************************************************************//**
+Reinitialize an R-tree search info in btr_cur_t */
+UNIV_INLINE
+void
+rtr_info_reinit_in_cursor(
+/************************/
+ btr_cur_t* cursor, /*!< in/out: tree cursor */
+ dict_index_t* index, /*!< in: index struct */
+ bool need_prdt) /*!< in: Whether predicate lock is
+ needed */
+{
+ rtr_clean_rtr_info(cursor->rtr_info, false);
+ rtr_init_rtr_info(cursor->rtr_info, need_prdt, cursor, index, true);
+}
diff --git a/storage/innobase/include/gis0type.h b/storage/innobase/include/gis0type.h
new file mode 100644
index 00000000..55944bfc
--- /dev/null
+++ b/storage/innobase/include/gis0type.h
@@ -0,0 +1,152 @@
+/*****************************************************************************
+
+Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/gis0type.h
+R-tree header file
+
+Created 2013/03/27 Jimmy Yang
+***********************************************************************/
+
+#ifndef gis0type_h
+#define gis0type_h
+
+#include "buf0buf.h"
+#include "data0type.h"
+#include "data0types.h"
+#include "dict0types.h"
+#include "ut0vec.h"
+#include "gis0geo.h"
+
+#include <vector>
+#include <forward_list>
+
+/** Node Sequence Number. Only updated when page splits */
+typedef uint32_t node_seq_t;
+
+/* RTree internal non-leaf Nodes to be searched, from root to leaf */
+struct node_visit_t {
+ uint32_t page_no; /*!< the page number */
+	node_seq_t	seq_no;		/*!< the SSN (split sequence number) */
+ ulint level; /*!< the page's index level */
+	uint32_t	child_no;	/*!< child page number, used when
+					recording parent nodes */
+ btr_pcur_t* cursor; /*!< cursor structure if we positioned
+ FIXME: there is no need to use whole
+ btr_pcur_t, just the position related
+ members */
+ double mbr_inc; /*!< whether this node needs to be
+ enlarged for insertion */
+};
+
+typedef std::vector<node_visit_t, ut_allocator<node_visit_t> > rtr_node_path_t;
+
+typedef struct rtr_rec {
+ rec_t* r_rec; /*!< matched record */
+	bool		locked;		/*!< whether the record is locked */
+} rtr_rec_t;
+
+typedef std::vector<rtr_rec_t, ut_allocator<rtr_rec_t> > rtr_rec_vector;
+
+/* Structure for matched records on the leaf page */
+typedef struct matched_rec {
+	byte*		bufp;		/*!< aligned buffer pointer */
+ byte rec_buf[UNIV_PAGE_SIZE_MAX * 2];
+ /*!< buffer used to copy matching rec */
+ buf_block_t block; /*!< the shadow buffer block */
+ ulint used; /*!< memory used */
+ rtr_rec_vector* matched_recs; /*!< vector holding the matching rec */
+ ib_mutex_t rtr_match_mutex;/*!< mutex protect the match_recs
+ vector */
+	bool		valid;		/*!< whether the result in matched_recs
+					or this search is valid (the page
+					was not dropped) */
+ bool locked; /*!< whether these recs locked */
+} matched_rec_t;
+
+/* In memory representation of a minimum bounding rectangle */
+typedef struct rtr_mbr {
+ double xmin; /*!< minimum on x */
+ double xmax; /*!< maximum on x */
+ double ymin; /*!< minimum on y */
+ double ymax; /*!< maximum on y */
+} rtr_mbr_t;
+
+/* Maximum index level for R-Tree, this is consistent with BTR_MAX_LEVELS */
+#define RTR_MAX_LEVELS 100
+
+/* Number of pages we latch at leaf level when there is possible Tree
+modification (split, shrink), we always latch left, current
+and right pages */
+#define RTR_LEAF_LATCH_NUM 3
+
+/** Vectors holding the matching internal pages/nodes and leaf records */
+typedef struct rtr_info{
+ rtr_node_path_t*path; /*!< vector holding matching pages */
+ rtr_node_path_t*parent_path;
+ /*!< vector holding parent pages during
+ search */
+ matched_rec_t* matches;/*!< struct holding matching leaf records */
+ ib_mutex_t rtr_path_mutex;
+ /*!< mutex protect the "path" vector */
+ buf_block_t* tree_blocks[RTR_MAX_LEVELS + RTR_LEAF_LATCH_NUM];
+ /*!< tracking pages that would be locked
+ at leaf level, for future free */
+ ulint tree_savepoints[RTR_MAX_LEVELS + RTR_LEAF_LATCH_NUM];
+ /*!< savepoint used to release latches/blocks
+ on each level and leaf level */
+ rtr_mbr_t mbr; /*!< the search MBR */
+ que_thr_t* thr; /*!< the search thread */
+ mem_heap_t* heap; /*!< memory heap */
+ btr_cur_t* cursor; /*!< cursor used for search */
+ dict_index_t* index; /*!< index it is searching */
+	bool		need_prdt_lock;
+					/*!< whether we will need a predicate
+					lock on the tree */
+	bool		need_page_lock;
+					/*!< whether we will need a predicate
+					page lock on the tree */
+	bool		allocated;/*!< whether this structure was allocated
+					or is on the stack */
+ bool mbr_adj;/*!< whether mbr will need to be enlarged
+ for an insertion operation */
+ bool fd_del; /*!< found deleted row */
+ const dtuple_t* search_tuple;
+ /*!< search tuple being used */
+ page_cur_mode_t search_mode;
+ /*!< current search mode */
+} rtr_info_t;
+
+/* Tracking structure for all ongoing search for an index */
+struct rtr_info_track_t {
+ /** Active search info */
+ std::forward_list<rtr_info_t*, ut_allocator<rtr_info_t*> > rtr_active;
+ ib_mutex_t rtr_active_mutex;
+ /*!< mutex to protect
+ rtr_active */
+};
+
+/* This is to record the record movement between pages. Used for corresponding
+lock movement */
+typedef struct rtr_rec_move {
+ rec_t* old_rec; /*!< record being moved in old page */
+ rec_t* new_rec; /*!< new record location */
+ bool moved; /*!< whether lock are moved too */
+} rtr_rec_move_t;
+#endif /* gis0type_h */
diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h
new file mode 100644
index 00000000..561c3225
--- /dev/null
+++ b/storage/innobase/include/ha0ha.h
@@ -0,0 +1,60 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0ha.h
+The hash table interface for the adaptive hash index
+
+Created 8/18/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef ha0ha_h
+#define ha0ha_h
+
+#include "hash0hash.h"
+#include "page0types.h"
+#include "buf0types.h"
+#include "rem0types.h"
+
+#ifdef BTR_CUR_HASH_ADAPT
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain
+having the fold number, NULL if not found */
+UNIV_INLINE
+const rec_t*
+ha_search_and_get_data(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: folded value of the searched data */
+
+/** The hash table external chain node */
+struct ha_node_t {
+ ulint fold; /*!< fold value for the data */
+ ha_node_t* next; /*!< next chain node or NULL if none */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block; /*!< buffer block containing the data, or NULL */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t* data; /*!< pointer to the data */
+};
+
+#include "ha0ha.ic"
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#endif
diff --git a/storage/innobase/include/ha0ha.ic b/storage/innobase/include/ha0ha.ic
new file mode 100644
index 00000000..0b256257
--- /dev/null
+++ b/storage/innobase/include/ha0ha.ic
@@ -0,0 +1,154 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/ha0ha.ic
+The hash table interface for the adaptive hash index
+
+Created 8/18/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef BTR_CUR_HASH_ADAPT
+#include "btr0types.h"
+
+/******************************************************************//**
+Gets a hash node data.
+@return pointer to the data */
+UNIV_INLINE
+const rec_t*
+ha_node_get_data(
+/*=============*/
+ const ha_node_t* node) /*!< in: hash chain node */
+{
+ return(node->data);
+}
+
+/******************************************************************//**
+Sets hash node data. */
+UNIV_INLINE
+void
+ha_node_set_data_func(
+/*==================*/
+ ha_node_t* node, /*!< in: hash chain node */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block, /*!< in: buffer block containing the data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t* data) /*!< in: pointer to the data */
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ node->block = block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ node->data = data;
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Sets hash node data.
+@param n in: hash chain node
+@param b in: buffer block containing the data
+@param d in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,b,d)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/** Sets hash node data.
+@param n in: hash chain node
+@param b in: buffer block containing the data
+@param d in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,d)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+/******************************************************************//**
+Gets the next node in a hash chain.
+@return next node, NULL if none */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_next(
+/*==============*/
+ const ha_node_t* node) /*!< in: hash chain node */
+{
+ return(node->next);
+}
+
+/******************************************************************//**
+Gets the first node in a hash chain.
+@return first node, NULL if none */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_first(
+/*===============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold value determining the chain */
+{
+ return static_cast<ha_node_t*>(table->array[table->calc_hash(fold)].node);
+}
+
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain
+having the fold number, NULL if not found */
+UNIV_INLINE
+const rec_t*
+ha_search_and_get_data(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: folded value of the searched data */
+{
+ ut_ad(btr_search_enabled);
+
+ for (const ha_node_t* node = ha_chain_get_first(table, fold);
+ node != NULL;
+ node = ha_chain_get_next(node)) {
+
+ if (node->fold == fold) {
+
+ return(node->data);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data.
+@return pointer to the hash table node, NULL if not found in the table */
+UNIV_INLINE
+ha_node_t*
+ha_search_with_data(
+/*================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ const rec_t* data) /*!< in: pointer to the data */
+{
+ ha_node_t* node;
+
+ ut_ad(btr_search_enabled);
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (node->data == data) {
+
+ return(node);
+ }
+
+ node = ha_chain_get_next(node);
+ }
+
+ return(NULL);
+}
+
+#endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/include/ha0storage.h b/storage/innobase/include/ha0storage.h
new file mode 100644
index 00000000..db23ddc6
--- /dev/null
+++ b/storage/innobase/include/ha0storage.h
@@ -0,0 +1,137 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.h
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 22, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef ha0storage_h
+#define ha0storage_h
+
+#include "univ.i"
+
+/** This value is used by default by ha_storage_create(). More memory
+is allocated later when/if it is needed. */
+#define HA_STORAGE_DEFAULT_HEAP_BYTES 1024
+
+/** This value is used by default by ha_storage_create(). It is a
+constant per ha_storage's lifetime. */
+#define HA_STORAGE_DEFAULT_HASH_CELLS 4096
+
+/** Hash storage */
+struct ha_storage_t;
+
+/*******************************************************************//**
+Creates a hash storage. If any of the parameters is 0, then a default
+value is used.
+@return own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+ ulint initial_heap_bytes, /*!< in: initial heap's size */
+ ulint initial_hash_cells); /*!< in: initial number of cells
+ in the hash table */
+
+/*******************************************************************//**
+Copies data into the storage and returns a pointer to the copy. If the
+same data chunk is already present, then a pointer to it is returned.
+Data chunks are considered to be equal if len1 == len2 and
+memcmp(data1, data2, len1) == 0. If "data" is not present (and thus
+data_len bytes need to be allocated) and the size of the storage would
+become more than "memlim", then "data" is not added and NULL is returned.
+To disable this behavior, "memlim" can be set to 0, which stands for
+"no limit".
+@return pointer to the copy */
+const void*
+ha_storage_put_memlim(
+/*==================*/
+ ha_storage_t* storage, /*!< in/out: hash storage */
+ const void* data, /*!< in: data to store */
+ ulint data_len, /*!< in: data length */
+ ulint memlim); /*!< in: memory limit to obey */
+
+/*******************************************************************//**
+Same as ha_storage_put_memlim() but without memory limit.
+@param storage in/out: hash storage
+@param data in: data to store
+@param data_len in: data length
+@return pointer to the copy of the string */
+#define ha_storage_put(storage, data, data_len) \
+ ha_storage_put_memlim((storage), (data), (data_len), 0)
+
+/*******************************************************************//**
+Copies a string into the storage and returns a pointer to the copy. If the
+same string is already present, then a pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage in/out: hash storage
+@param str in: string to put
+@return pointer to the copy of the string */
+#define ha_storage_put_str(storage, str) \
+ ((const char*) ha_storage_put((storage), (str), strlen(str) + 1))
+
+/*******************************************************************//**
+Copies a string into the storage and returns a pointer to the copy, obeying
+a memory limit.
+If the same string is already present, then a pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage in/out: hash storage
+@param str in: string to put
+@param memlim in: memory limit to obey
+@return pointer to the copy of the string */
+#define ha_storage_put_str_memlim(storage, str, memlim) \
+ ((const char*) ha_storage_put_memlim((storage), (str), \
+ strlen(str) + 1, (memlim)))
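+
+/* Usage sketch (illustrative; the variable names are hypothetical).
+Duplicate chunks share a single copy:
+
+ ha_storage_t* storage = ha_storage_create(0, 0);
+ const char* p1 = ha_storage_put_str(storage, "hello");
+ const char* p2 = ha_storage_put_str(storage, "hello");
+ ut_a(p1 == p2); // the second put returns the first copy
+ ha_storage_free(storage);
+*/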
+
+/*******************************************************************//**
+Empties a hash storage, freeing memory occupied by data chunks.
+This invalidates any pointers previously returned by ha_storage_put().
+The hash storage is not invalidated itself and can be used again. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+ ha_storage_t** storage); /*!< in/out: hash storage */
+
+/*******************************************************************//**
+Frees a hash storage and everything it contains, it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put(). */
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+ ha_storage_t* storage); /*!< in, own: hash storage */
+
+/*******************************************************************//**
+Gets the size of the memory used by a storage.
+@return bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+ const ha_storage_t* storage); /*!< in: hash storage */
+
+#include "ha0storage.ic"
+
+#endif /* ha0storage_h */
diff --git a/storage/innobase/include/ha0storage.ic b/storage/innobase/include/ha0storage.ic
new file mode 100644
index 00000000..df9679cf
--- /dev/null
+++ b/storage/innobase/include/ha0storage.ic
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.ic
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 24, 2007 Vasil Dimov
+*******************************************************/
+
+#include "hash0hash.h"
+#include "mem0mem.h"
+
+/** Hash storage for strings */
+struct ha_storage_t {
+ mem_heap_t* heap; /*!< memory heap from which memory is
+ allocated */
+ hash_table_t hash; /*!< hash table used to avoid
+ duplicates */
+};
+
+/** Objects of this type are stored in ha_storage_t */
+struct ha_storage_node_t {
+ ulint data_len;/*!< length of the data */
+ const void* data; /*!< pointer to data */
+ ha_storage_node_t* next; /*!< next node in hash chain */
+};
+
+/*******************************************************************//**
+Creates a hash storage. If any of the parameters is 0, then a default
+value is used.
+@return own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+ ulint initial_heap_bytes, /*!< in: initial heap's size */
+ ulint initial_hash_cells) /*!< in: initial number of cells
+ in the hash table */
+{
+ ha_storage_t* storage;
+ mem_heap_t* heap;
+
+ if (initial_heap_bytes == 0) {
+
+ initial_heap_bytes = HA_STORAGE_DEFAULT_HEAP_BYTES;
+ }
+
+ if (initial_hash_cells == 0) {
+
+ initial_hash_cells = HA_STORAGE_DEFAULT_HASH_CELLS;
+ }
+
+ /* we put "storage" within "storage->heap" */
+
+ heap = mem_heap_create(sizeof(ha_storage_t)
+ + initial_heap_bytes);
+
+ storage = (ha_storage_t*) mem_heap_alloc(heap,
+ sizeof(ha_storage_t));
+
+ storage->heap = heap;
+ storage->hash.create(initial_hash_cells);
+
+ return(storage);
+}
+
+/*******************************************************************//**
+Empties a hash storage, freeing memory occupied by data chunks.
+This invalidates any pointers previously returned by ha_storage_put().
+The hash storage is not invalidated itself and can be used again. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+ ha_storage_t** storage) /*!< in/out: hash storage */
+{
+ ha_storage_t temp_storage;
+
+ temp_storage.heap = (*storage)->heap;
+ temp_storage.hash = (*storage)->hash;
+
+ temp_storage.hash.clear();
+ mem_heap_empty(temp_storage.heap);
+
+ *storage = (ha_storage_t*) mem_heap_alloc(temp_storage.heap,
+ sizeof(ha_storage_t));
+
+ (*storage)->heap = temp_storage.heap;
+ (*storage)->hash = temp_storage.hash;
+}
+
+/*******************************************************************//**
+Frees a hash storage and everything it contains, it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put(). */
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+ ha_storage_t* storage) /*!< in, own: hash storage */
+{
+ storage->hash.free();
+ mem_heap_free(storage->heap);
+}
+
+/*******************************************************************//**
+Gets the size of the memory used by a storage.
+@return bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+ const ha_storage_t* storage) /*!< in: hash storage */
+{
+ ulint ret;
+
+ ret = mem_heap_get_size(storage->heap);
+
+ /* this assumes hash->heap and hash->heaps are NULL */
+ ret += sizeof(hash_table_t);
+ ret += sizeof(hash_cell_t) * storage->hash.n_cells;
+
+ return(ret);
+}
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
new file mode 100644
index 00000000..453f9e02
--- /dev/null
+++ b/storage/innobase/include/ha_prototypes.h
@@ -0,0 +1,522 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ha_prototypes.h
+Prototypes for global functions in ha_innodb.cc that are called by
+InnoDB C code.
+
+NOTE: This header is intended to insulate InnoDB from SQL names and functions.
+Do not include any headers other than univ.i in this file unless they are
+very simple headers.
+************************************************************************/
+
+#ifndef HA_INNODB_PROTOTYPES_H
+#define HA_INNODB_PROTOTYPES_H
+
+#include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
+/* Forward declarations */
+class THD;
+class Field;
+
+// JAN: TODO missing features:
+#undef MYSQL_FT_INIT_EXT
+#undef MYSQL_PFS
+#undef MYSQL_STORE_FTS_DOC_ID
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
+the result to "buf". The result is converted to "system_charset_info".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+ulint
+innobase_raw_format(
+/*================*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint charset_coll, /*!< in: charset collation */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size); /*!< in: output buffer size
+ in bytes */
+
+/*****************************************************************//**
+Invalidates the MySQL query cache for the table. */
+void
+innobase_invalidate_query_cache(
+/*============================*/
+ trx_t* trx, /*!< in: transaction which
+ modifies the table */
+ const char* full_name); /*!< in: concatenation of
+ database name, path separator,
+ table name, null char NUL;
+ NOTE that in Windows this is
+ always in LOWER CASE! */
+
+/** Quote a standard SQL identifier like tablespace, index or column name.
+@param[in] file output stream
+@param[in] trx InnoDB transaction, or NULL
+@param[in] id identifier to quote */
+void
+innobase_quote_identifier(
+ FILE* file,
+ trx_t* trx,
+ const char* id);
+
+/** Quote a standard SQL identifier like tablespace, index or column name.
+Return the string as an std:string object.
+@param[in] trx InnoDB transaction, or NULL
+@param[in] id identifier to quote
+@return a std::string with id properly quoted. */
+std::string
+innobase_quote_identifier(
+ trx_t* trx,
+ const char* id);
+
+/*****************************************************************//**
+Convert a table name to the MySQL system_charset_info (UTF-8).
+@return pointer to the end of buf */
+char*
+innobase_convert_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* id, /*!< in: table name to convert */
+ ulint idlen, /*!< in: length of id, in bytes */
+ THD* thd); /*!< in: MySQL connection thread, or NULL */
+
+/******************************************************************//**
+Returns true if the thread is the replication thread on the slave
+server.
+@return true if thd is the replication thread */
+ibool
+thd_is_replication_slave_thread(
+/*============================*/
+ THD* thd); /*!< in: thread handle */
+
+/******************************************************************//**
+Returns true if the transaction this thread is processing has edited
+non-transactional tables. Used by the deadlock detector when deciding
+which transaction to roll back in case of a deadlock - we try to avoid
+rolling back transactions that have edited non-transactional tables.
+@return true if non-transactional tables have been edited */
+ibool
+thd_has_edited_nontrans_tables(
+/*===========================*/
+ THD* thd); /*!< in: thread handle */
+
+/**
+Get a high resolution timestamp for the current query start time.
+
+@return timestamp with microsecond precision
+*/
+unsigned long long thd_query_start_micro(const MYSQL_THD thd);
+
+/*************************************************************//**
+Prints info of a THD object (== user session thread) to the given file. */
+void
+innobase_mysql_print_thd(
+/*=====================*/
+ FILE* f, /*!< in: output stream */
+ THD* thd, /*!< in: pointer to a MySQL THD object */
+ uint max_query_len); /*!< in: max query length to print, or 0 to
+ use the default max length */
+
+/** Converts a MySQL type to an InnoDB type. Note that this function returns
+the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
+VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
+@param[out] unsigned_flag DATA_UNSIGNED if an 'unsigned type';
+at least ENUM and SET, and unsigned integer types are 'unsigned types'
+@param[in] field MySQL Field
+@return DATA_BINARY, DATA_VARCHAR, ... */
+uint8_t
+get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field);
+
+/******************************************************************//**
+Get the variable length bounds of the given character set. */
+void
+innobase_get_cset_width(
+/*====================*/
+ ulint cset, /*!< in: MySQL charset-collation code */
+ unsigned*mbminlen, /*!< out: minimum length of a char (in bytes) */
+ unsigned*mbmaxlen); /*!< out: maximum length of a char (in bytes) */
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return 0 if a=b, <0 if a<b, >0 if a>b */
+int
+innobase_strcasecmp(
+/*================*/
+ const char* a, /*!< in: first string to compare */
+ const char* b); /*!< in: second string to compare */
+
+/** Strip dir name from a full path name and return only the file name
+@param[in] path_name full path name
+@return file name or "null" if no file name */
+const char*
+innobase_basename(
+ const char* path_name);
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+void
+innobase_convert_from_table_id(
+/*===========================*/
+ CHARSET_INFO* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len); /*!< in: length of 'to', in bytes; should
+ be at least 5 * strlen(to) + 1 */
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+void
+innobase_convert_from_id(
+/*=====================*/
+ CHARSET_INFO* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len); /*!< in: length of 'to', in bytes;
+ should be at least 3 * strlen(to) + 1 */
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+void
+innobase_casedn_str(
+/*================*/
+ char* a); /*!< in/out: string to put in lower case */
+
+#ifdef WITH_WSREP
+void wsrep_innobase_kill_one_trx(THD *bf_thd, trx_t *victim_trx, bool signal);
+ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number,
+ unsigned char* str, ulint str_length,
+ ulint buf_length);
+#endif /* WITH_WSREP */
+
+extern "C" struct charset_info_st *thd_charset(THD *thd);
+
+/** Determines the current SQL statement.
+Thread unsafe, can only be called from the thread owning the THD.
+@param[in] thd MySQL thread handle
+@param[out] length Length of the SQL statement
+@return SQL statement string */
+const char*
+innobase_get_stmt_unsafe(
+ THD* thd,
+ size_t* length);
+
+/******************************************************************//**
+This function is used to find the storage length in bytes of the first n
+characters for prefix indexes using a multibyte character set. The function
+finds charset information and returns length of prefix_len characters in the
+index field in bytes.
+@return number of bytes occupied by the first n characters */
+ulint
+innobase_get_at_most_n_mbchars(
+/*===========================*/
+ ulint charset_id, /*!< in: character set id */
+ ulint prefix_len, /*!< in: prefix length in bytes of the index
+ (this has to be divided by mbmaxlen to get the
+ number of CHARACTERS n in the prefix) */
+ ulint data_len, /*!< in: length of the string in bytes */
+ const char* str); /*!< in: character string */
+
+/** Get status of innodb_tmpdir.
+@param[in] thd thread handle, or NULL to query
+ the global innodb_tmpdir.
+@retval NULL if innodb_tmpdir="" */
+UNIV_INTERN
+const char*
+thd_innodb_tmpdir(
+ THD* thd);
+
+/******************************************************************//**
+Returns the lock wait timeout for the current connection.
+@return the lock wait timeout, in seconds */
+ulong
+thd_lock_wait_timeout(
+/*==================*/
+ THD* thd); /*!< in: thread handle, or NULL to query
+ the global innodb_lock_wait_timeout */
+
+/**********************************************************************//**
+Get the current setting of the table_cache_size global parameter. We do
+a dirty read because for one thing there is no synchronization object and
+secondly there is little harm in doing so even if we get a torn read.
+@return value of table_cache_size */
+ulint
+innobase_get_table_cache_size(void);
+/*===============================*/
+
+/**********************************************************************//**
+Get the current setting of the lower_case_table_names global parameter from
+mysqld.cc. We do a dirty read because for one thing there is no synchronization
+object and secondly there is little harm in doing so even if we get a torn
+read.
+@return value of lower_case_table_names */
+ulint
+innobase_get_lower_case_table_names(void);
+/*=====================================*/
+
+/******************************************************************//**
+Compare two character strings case-insensitively according to their charset. */
+int
+innobase_fts_text_case_cmp(
+/*=======================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: key */
+ const void* p2); /*!< in: node */
+
+/******************************************************************//**
+Returns true if transaction should be flagged as read-only.
+@return true if the thd is marked as read-only */
+bool
+thd_trx_is_read_only(
+/*=================*/
+ THD* thd); /*!< in/out: thread handle */
+
+/******************************************************************//**
+Check if the transaction is an auto-commit transaction. TRUE also
+implies that it is a SELECT (read-only) transaction.
+@return true if the transaction is an auto commit read-only transaction. */
+ibool
+thd_trx_is_auto_commit(
+/*===================*/
+ THD* thd); /*!< in: thread handle, or NULL */
+
+/*****************************************************************//**
+A wrapper function of innobase_convert_name(), convert a table name
+to the MySQL system_charset_info (UTF-8) and quote it if needed.
+@return pointer to the end of buf */
+void
+innobase_format_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* name); /*!< in: table name to format */
+
+/** Corresponds to Sql_condition::enum_warning_level. */
+enum ib_log_level_t {
+ IB_LOG_LEVEL_INFO,
+ IB_LOG_LEVEL_WARN,
+ IB_LOG_LEVEL_ERROR,
+ IB_LOG_LEVEL_FATAL
+};
+
+/******************************************************************//**
+Use this when the args are first converted to a formatted string and then
+passed to the format string from errmsg-utf8.txt. The error message format
+must be: "Some string ... %s".
+
+Push a warning message to the client, it is a wrapper around:
+
+void push_warning_printf(
+ THD *thd, Sql_condition::enum_warning_level level,
+ uint code, const char *format, ...);
+*/
+void
+ib_errf(
+/*====*/
+ THD* thd, /*!< in/out: session */
+ ib_log_level_t level, /*!< in: warning level */
+ ib_uint32_t code, /*!< MySQL error code */
+ const char* format, /*!< printf format */
+ ...) /*!< Args */
+ MY_ATTRIBUTE((format(printf, 4, 5)));
+
+/******************************************************************//**
+Use this when the args are passed to the format string from
+errmsg-utf8.txt directly as is.
+
+Push a warning message to the client, it is a wrapper around:
+
+void push_warning_printf(
+ THD *thd, Sql_condition::enum_warning_level level,
+ uint code, const char *format, ...);
+*/
+void
+ib_senderrf(
+/*========*/
+ THD* thd, /*!< in/out: session */
+ ib_log_level_t level, /*!< in: warning level */
+ ib_uint32_t code, /*!< MySQL error code */
+ ...); /*!< Args */
+
+extern const char* TROUBLESHOOTING_MSG;
+extern const char* TROUBLESHOOT_DATADICT_MSG;
+extern const char* BUG_REPORT_MSG;
+extern const char* FORCE_RECOVERY_MSG;
+extern const char* OPERATING_SYSTEM_ERROR_MSG;
+extern const char* FOREIGN_KEY_CONSTRAINTS_MSG;
+extern const char* SET_TRANSACTION_MSG;
+extern const char* INNODB_PARAMETERS_MSG;
+
+/******************************************************************//**
+Returns the NUL-terminated value of glob_hostname.
+@return pointer to glob_hostname. */
+const char*
+server_get_hostname();
+/*=================*/
+
+/*********************************************************************//**
+Compute the next autoinc value.
+
+For MySQL replication the autoincrement values can be partitioned among
+the nodes. The offset is the start or origin of the autoincrement value
+for a particular node. For n nodes the increment will be n and the offset
+will be in the interval [1, n]. The formula tries to allocate the next
+value for a particular node.
+
+Note: This function is also called with increment set to the number of
+values we want to reserve for multi-value inserts e.g.,
+
+ INSERT INTO T VALUES(), (), ();
+
+innobase_next_autoinc() will be called with increment set to 3 where
+autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
+the multi-value INSERT above.
+@return the next value */
+ulonglong
+innobase_next_autoinc(
+/*==================*/
+ ulonglong current, /*!< in: Current value */
+ ulonglong need, /*!< in: count of values needed */
+ ulonglong step, /*!< in: AUTOINC increment step */
+ ulonglong offset, /*!< in: AUTOINC offset */
+ ulonglong max_value) /*!< in: max value for type */
+ MY_ATTRIBUTE((pure, warn_unused_result));
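+
+/* Worked example (illustrative, following the contract above): with
+3 nodes, step = 3 and this node's offset = 2, the node draws values
+from the progression 2, 5, 8, 11, ... If current = 10 and need = 1,
+the next value allocated for this node is 11; with need = 3 (the
+multi-value INSERT above), three successive values of the progression
+(11, 14 and 17) are reserved. */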
+
+/**********************************************************************
+Converts an identifier from my_charset_filename to UTF-8 charset. */
+uint
+innobase_convert_to_system_charset(
+/*===============================*/
+ char* to, /* out: converted identifier */
+ const char* from, /* in: identifier to convert */
+ ulint len, /* in: length of 'to', in bytes */
+ uint* errors); /* out: error return */
+/**********************************************************************
+Check if the length of the identifier exceeds the maximum allowed.
+The input to this function is an identifier in charset my_charset_filename.
+@return true when the length of the identifier is too long. */
+my_bool
+innobase_check_identifier_length(
+/*=============================*/
+ const char* id); /* in: identifier to check. it must belong
+ to charset my_charset_filename */
+
+/**********************************************************************
+Converts an identifier from UTF-8 to the my_charset_filename charset. */
+uint
+innobase_convert_to_filename_charset(
+/*=================================*/
+ char* to, /* out: converted identifier */
+ const char* from, /* in: identifier to convert */
+ ulint len); /* in: length of 'to', in bytes */
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to SQL-layer. */
+UNIV_INTERN
+void
+ib_push_warning(
+ trx_t* trx, /*!< in: trx */
+ dberr_t error, /*!< in: error code to push as warning */
+ const char *format,/*!< in: warning message */
+ ...);
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to SQL-layer. */
+UNIV_INTERN
+void
+ib_push_warning(
+ void* ithd, /*!< in: thd */
+ dberr_t error, /*!< in: error code to push as warning */
+ const char *format,/*!< in: warning message */
+ ...);
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to SQL-layer. */
+UNIV_INTERN
+void
+ib_foreign_warn(
+ trx_t* trx, /*!< in: trx */
+ dberr_t error, /*!< in: error code to push as warning */
+ const char *table_name,
+ const char *format,/*!< in: warning message */
+ ...);
+
+/*****************************************************************//**
+Normalizes a table name string. A normalized name consists of the
+database name concatenated with '/' and the table name. An example:
+test/mytable. On Windows, normalization always converts both the database
+name and the table name to lower case if "set_lower_case" is TRUE. */
+void
+normalize_table_name_c_low(
+/*=======================*/
+ char* norm_name, /*!< out: normalized name as a
+ null-terminated string */
+ const char* name, /*!< in: table name string */
+ ibool set_lower_case); /*!< in: TRUE if we want to set
+ name to lower case */
+/** Update the system variable with the given value of the InnoDB
+buffer pool size.
+@param[in] buf_pool_size given value of buffer pool size.*/
+void
+innodb_set_buf_pool_size(ulonglong buf_pool_size);
+
+/** Create a MYSQL_THD for a background thread and mark it as such.
+@param name thread info for SHOW PROCESSLIST
+@return new MYSQL_THD */
+MYSQL_THD
+innobase_create_background_thd(const char* name);
+
+/** Destroy a background purge thread THD.
+@param[in] thd MYSQL_THD to destroy */
+void
+innobase_destroy_background_thd(MYSQL_THD);
+
+/** Close opened tables, free memory, delete items for a MYSQL_THD.
+@param[in] thd MYSQL_THD to reset */
+void
+innobase_reset_background_thd(MYSQL_THD);
+
+#endif /* !UNIV_INNOCHECKSUM */
+#endif /* HA_INNODB_PROTOTYPES_H */
diff --git a/storage/innobase/include/handler0alter.h b/storage/innobase/include/handler0alter.h
new file mode 100644
index 00000000..add983a0
--- /dev/null
+++ b/storage/innobase/include/handler0alter.h
@@ -0,0 +1,108 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/handler0alter.h
+Smart ALTER TABLE
+*******************************************************/
+
+#include "rem0types.h"
+
+/*************************************************************//**
+Copies an InnoDB record to table->record[0]. */
+void
+innobase_rec_to_mysql(
+/*==================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(
+ rec, index, ...) */
+ MY_ATTRIBUTE((nonnull));
+
+/*************************************************************//**
+Copies an InnoDB index entry to table->record[0]. */
+void
+innobase_fields_to_mysql(
+/*=====================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const dict_index_t* index, /*!< in: InnoDB index */
+ const dfield_t* fields) /*!< in: InnoDB index fields */
+ MY_ATTRIBUTE((nonnull));
+
+/*************************************************************//**
+Copies an InnoDB row to table->record[0]. */
+void
+innobase_row_to_mysql(
+/*==================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const dict_table_t* itab, /*!< in: InnoDB table */
+ const dtuple_t* row) /*!< in: InnoDB row */
+ MY_ATTRIBUTE((nonnull));
+
+/** Generate the next autoinc based on a snapshot of the session
+auto_increment_increment and auto_increment_offset variables. */
+struct ib_sequence_t {
+
+ /**
+ @param thd the session
+ @param start_value the lower bound
+ @param max_value the upper bound (inclusive) */
+ ib_sequence_t(THD* thd, ulonglong start_value, ulonglong max_value);
+
+ /** Postfix increment
+ @return the value to insert */
+ ulonglong operator++(int) UNIV_NOTHROW;
+
+ /** Check if the autoinc "sequence" is exhausted.
+ @return true if the sequence is exhausted */
+ bool eof() const UNIV_NOTHROW
+ {
+ return(m_eof);
+ }
+
+ /**
+ @return the next value in the sequence */
+ ulonglong last() const UNIV_NOTHROW
+ {
+ ut_ad(m_next_value > 0);
+
+ return(m_next_value);
+ }
+
+ /** @return maximum column value
+ @retval 0 if not adding AUTO_INCREMENT column */
+ ulonglong max_value() const { return m_max_value; }
+
+private:
+ /** Maximum value if adding an AUTO_INCREMENT column, else 0 */
+ ulonglong m_max_value;
+
+ /** Value of auto_increment_increment */
+ ulong m_increment;
+
+ /** Value of auto_increment_offset */
+ ulong m_offset;
+
+ /** Next value in the sequence */
+ ulonglong m_next_value;
+
+ /** true if no more values left in the sequence */
+ bool m_eof;
+};
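+
+/* Usage sketch (illustrative; the bounds and names are hypothetical):
+
+ ib_sequence_t seq(thd, 1, 1000);
+
+ while (!seq.eof()) {
+ ulonglong value = seq++; // next autoinc value to insert
+ }
+*/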
diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
new file mode 100644
index 00000000..981ff5a0
--- /dev/null
+++ b/storage/innobase/include/hash0hash.h
@@ -0,0 +1,236 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/hash0hash.h
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "ut0rnd.h"
+
+struct hash_table_t;
+struct hash_cell_t{
+ void* node; /*!< hash chain node, NULL if none */
+};
+typedef void* hash_node_t;
+
+/*******************************************************************//**
+Inserts a struct to a hash table. */
+
+#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+ hash_cell_t* cell3333;\
+ TYPE* struct3333;\
+\
+ (DATA)->NAME = NULL;\
+\
+ cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
+\
+ if (cell3333->node == NULL) {\
+ cell3333->node = DATA;\
+ } else {\
+ struct3333 = (TYPE*) cell3333->node;\
+\
+ while (struct3333->NAME != NULL) {\
+\
+ struct3333 = (TYPE*) struct3333->NAME;\
+ }\
+\
+ struct3333->NAME = DATA;\
+ }\
+} while (0)
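+
+/* Usage sketch (illustrative; all identifiers are hypothetical). NAME
+is the name of the node's own "next" pointer, so a struct only needs
+such a member to be chainable:
+
+ struct item_t {
+ ulint key;
+ item_t* hash_next; // chain pointer passed as NAME
+ };
+
+ HASH_INSERT(item_t, hash_next, table, fold, item);
+*/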
+
+/*******************************************************************//**
+Inserts a struct to the head of hash table. */
+
+#define HASH_PREPEND(TYPE, NAME, TABLE, FOLD, DATA) \
+do { \
+ hash_cell_t* cell3333; \
+ TYPE* struct3333; \
+ \
+ (DATA)->NAME = NULL; \
+ \
+ cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
+ \
+ if (cell3333->node == NULL) { \
+ cell3333->node = DATA; \
+ DATA->NAME = NULL; \
+ } else { \
+ struct3333 = (TYPE*) cell3333->node; \
+ \
+ DATA->NAME = struct3333; \
+ \
+ cell3333->node = DATA; \
+ } \
+} while (0)
+#ifdef UNIV_HASH_DEBUG
+# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1)
+# define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1
+#else
+# define HASH_ASSERT_VALID(DATA) do {} while (0)
+# define HASH_INVALIDATE(DATA, NAME) do {} while (0)
+#endif
+
+/*******************************************************************//**
+Deletes a struct from a hash table. */
+
+#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+ hash_cell_t* cell3333;\
+ TYPE* struct3333;\
+\
+ cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
+\
+ if (cell3333->node == DATA) {\
+ HASH_ASSERT_VALID(DATA->NAME);\
+ cell3333->node = DATA->NAME;\
+ } else {\
+ struct3333 = (TYPE*) cell3333->node;\
+\
+ while (struct3333->NAME != DATA) {\
+\
+ struct3333 = (TYPE*) struct3333->NAME;\
+ ut_a(struct3333);\
+ }\
+\
+ struct3333->NAME = DATA->NAME;\
+ }\
+ HASH_INVALIDATE(DATA, NAME);\
+} while (0)
+
+#define HASH_REPLACE(TYPE, NAME, TABLE, FOLD, DATA_OLD, DATA_NEW) \
+ do { \
+ (DATA_NEW)->NAME = (DATA_OLD)->NAME; \
+ \
+ hash_cell_t& cell3333 \
+ = (TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
+ TYPE** struct3333 = (TYPE**)&cell3333.node; \
+ while (*struct3333 != DATA_OLD) { \
+ struct3333 = &((*struct3333)->NAME); \
+ } \
+ *struct3333 = DATA_NEW; \
+ } while (0)
+/*******************************************************************//**
+Gets the first struct in a hash chain, NULL if none. */
+
+#define HASH_GET_FIRST(TABLE, HASH_VAL) (TABLE)->array[HASH_VAL].node
+
+/*******************************************************************//**
+Gets the next struct in a hash chain, NULL if none. */
+
+#define HASH_GET_NEXT(NAME, DATA) ((DATA)->NAME)
+
+/********************************************************************//**
+Looks for a struct in a hash table. */
+#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\
+{\
+ (DATA) = (TYPE) HASH_GET_FIRST(TABLE, (TABLE)->calc_hash(FOLD)); \
+ HASH_ASSERT_VALID(DATA);\
+\
+ while ((DATA) != NULL) {\
+ ASSERTION;\
+ if (TEST) {\
+ break;\
+ } else {\
+ HASH_ASSERT_VALID(HASH_GET_NEXT(NAME, DATA));\
+ (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);\
+ }\
+ }\
+}
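+
+/* Usage sketch (illustrative, continuing the hypothetical item_t
+above). On exit, DATA points to the first node satisfying TEST, or is
+NULL if the chain was exhausted:
+
+ item_t* item;
+
+ HASH_SEARCH(hash_next, table, fold, item_t*, item,
+ ut_ad(item->key != 0), item->key == key);
+*/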
+
+/********************************************************************//**
+Looks for an item in all hash buckets. */
+#define HASH_SEARCH_ALL(NAME, TABLE, TYPE, DATA, ASSERTION, TEST) \
+do { \
+ ulint i3333; \
+ \
+ for (i3333 = (TABLE)->n_cells; i3333--; ) { \
+ (DATA) = (TYPE) HASH_GET_FIRST(TABLE, i3333); \
+ \
+ while ((DATA) != NULL) { \
+ HASH_ASSERT_VALID(DATA); \
+ ASSERTION; \
+ \
+ if (TEST) { \
+ break; \
+ } \
+ \
+ (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA); \
+ } \
+ \
+ if ((DATA) != NULL) { \
+ break; \
+ } \
+ } \
+} while (0)
+
+/****************************************************************//**
+Move all hash table entries from OLD_TABLE to NEW_TABLE. */
+
+#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \
+do {\
+ ulint i2222;\
+ ulint cell_count2222;\
+\
+ cell_count2222 = (OLD_TABLE)->n_cells; \
+\
+ for (i2222 = 0; i2222 < cell_count2222; i2222++) {\
+ NODE_TYPE* node2222 = static_cast<NODE_TYPE*>(\
+ HASH_GET_FIRST((OLD_TABLE), i2222));\
+\
+ while (node2222) {\
+ NODE_TYPE* next2222 = static_cast<NODE_TYPE*>(\
+ node2222->PTR_NAME);\
+ ulint fold2222 = FOLD_FUNC(node2222);\
+\
+ HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\
+ fold2222, node2222);\
+\
+ node2222 = next2222;\
+ }\
+ }\
+} while (0)
+
+/** Hash table with singly-linked overflow lists */
+struct hash_table_t
+{
+ /** number of elements in array (a prime number) */
+ ulint n_cells;
+ /** the hash array */
+ hash_cell_t *array;
+
+ /** Create the hash table.
+ @param n the lower bound of n_cells */
+ void create(ulint n)
+ {
+ n_cells= ut_find_prime(n);
+ array= static_cast<hash_cell_t*>(ut_zalloc_nokey(n_cells * sizeof *array));
+ }
+
+ /** Clear the hash table. */
+ void clear() { memset(array, 0, n_cells * sizeof *array); }
+
+ /** Free the hash table. */
+ void free() { ut_free(array); array= nullptr; }
+
+ ulint calc_hash(ulint fold) const { return ut_hash_ulint(fold, n_cells); }
+};
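+
+/* Usage sketch (illustrative; "fold" is a hypothetical fold value).
+create() rounds the cell count up to a prime, and calc_hash() maps a
+fold value to a cell index:
+
+ hash_table_t table;
+
+ table.create(1000); // n_cells becomes a prime >= 1000
+ ulint i = table.calc_hash(fold);
+ table.free();
+*/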
diff --git a/storage/innobase/include/ib0mutex.h b/storage/innobase/include/ib0mutex.h
new file mode 100644
index 00000000..81ab7566
--- /dev/null
+++ b/storage/innobase/include/ib0mutex.h
@@ -0,0 +1,773 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ib0mutex.h
+Policy based mutexes.
+
+Created 2013-03-26 Sunny Bains.
+***********************************************************************/
+
+#ifndef UNIV_INNOCHECKSUM
+
+#ifndef ib0mutex_h
+#define ib0mutex_h
+
+#include "my_cpu.h"
+#include "os0event.h"
+#include "sync0arr.h"
+
+/** OS mutex for tracking lock/unlock for debugging */
+template <template <typename> class Policy>
+struct OSTrackMutex {
+
+ typedef Policy<OSTrackMutex> MutexPolicy;
+
+ explicit OSTrackMutex(bool destroy_mutex_at_exit = true)
+ UNIV_NOTHROW
+ {
+ ut_d(m_freed = true);
+ ut_d(m_locked = false);
+ ut_d(m_destroy_at_exit = destroy_mutex_at_exit);
+ }
+
+ ~OSTrackMutex() UNIV_NOTHROW
+ {
+ ut_ad(!m_destroy_at_exit || !m_locked);
+ }
+
+ /** Initialise the mutex. */
+ void init(latch_id_t, const char*, uint32_t) UNIV_NOTHROW
+ {
+ ut_ad(m_freed);
+ ut_ad(!m_locked);
+
+ m_mutex.init();
+
+ ut_d(m_freed = false);
+ }
+
+ /** Destroy the mutex */
+ void destroy() UNIV_NOTHROW
+ {
+ ut_ad(!m_locked);
+ ut_ad(!m_freed);
+
+ m_mutex.destroy();
+
+ ut_d(m_freed = true);
+ }
+
+ /** Release the mutex. */
+ void exit() UNIV_NOTHROW
+ {
+ ut_ad(m_locked);
+ ut_d(m_locked = false);
+ ut_ad(!m_freed);
+
+ m_mutex.exit();
+ }
+
+ /** Acquire the mutex. */
+ void enter(uint32_t, uint32_t, const char*, uint32_t)
+ UNIV_NOTHROW
+ {
+ ut_ad(!m_freed);
+
+ m_mutex.enter();
+
+ ut_ad(!m_locked);
+ ut_d(m_locked = true);
+ }
+
+ /** @return true if locking succeeded */
+ bool try_lock() UNIV_NOTHROW
+ {
+ ut_ad(!m_freed);
+
+ bool locked = m_mutex.try_lock();
+
+ if (locked) {
+ ut_ad(!m_locked);
+ ut_d(m_locked = locked);
+ }
+
+ return(locked);
+ }
+
+ /** @return non-const version of the policy */
+ MutexPolicy& policy()
+ UNIV_NOTHROW
+ {
+ return(m_policy);
+ }
+
+ /** @return the const version of the policy */
+ const MutexPolicy& policy() const
+ UNIV_NOTHROW
+ {
+ return(m_policy);
+ }
+
+private:
+#ifdef UNIV_DEBUG
+ /** true if the mutex has not been initialized */
+ bool m_freed;
+
+ /** true if the mutex has been locked. */
+ bool m_locked;
+
+ /** Whether to destroy the mutex at exit */
+ bool m_destroy_at_exit;
+#endif /* UNIV_DEBUG */
+
+ /** OS Mutex instance */
+ OSMutex m_mutex;
+
+ /** Policy data */
+ MutexPolicy m_policy;
+};
+
+
+#ifdef __linux__
+
+#include <linux/futex.h>
+#include <sys/syscall.h>
+
+/** Mutex implementation that uses the Linux futex. */
+template <template <typename> class Policy>
+struct TTASFutexMutex {
+
+ typedef Policy<TTASFutexMutex> MutexPolicy;
+
+ TTASFutexMutex() UNIV_NOTHROW
+ :
+ m_lock_word(MUTEX_STATE_UNLOCKED)
+ {
+ /* Check that lock_word is aligned. */
+ ut_ad(!((ulint) &m_lock_word % sizeof(ulint)));
+ }
+
+ ~TTASFutexMutex()
+ {
+ ut_ad(m_lock_word.load(std::memory_order_relaxed)
+ == MUTEX_STATE_UNLOCKED);
+ }
+
+ /** Called when the mutex is "created". Note: Not from the constructor
+ but when the mutex is initialised. */
+ void init(latch_id_t, const char*, uint32_t) UNIV_NOTHROW
+ {
+ ut_ad(m_lock_word.load(std::memory_order_relaxed)
+ == MUTEX_STATE_UNLOCKED);
+ }
+
+ /** Destroy the mutex. */
+ void destroy() UNIV_NOTHROW
+ {
+ /* The destructor can be called at shutdown. */
+ ut_ad(m_lock_word.load(std::memory_order_relaxed)
+ == MUTEX_STATE_UNLOCKED);
+ }
+
+ /** Acquire the mutex.
+ @param[in] max_spins max number of spins
+ @param[in] max_delay max delay per spin */
+ void enter(uint32_t max_spins, uint32_t max_delay,
+ const char*, uint32_t) UNIV_NOTHROW
+ {
+ uint32_t n_spins, n_waits;
+
+ for (n_spins= 0; n_spins < max_spins; n_spins++) {
+ if (try_lock()) {
+ m_policy.add(n_spins, 0);
+ return;
+ }
+
+ ut_delay(max_delay);
+ }
+
+ for (n_waits= 0;; n_waits++) {
+ if (m_lock_word.exchange(MUTEX_STATE_WAITERS,
+ std::memory_order_acquire)
+ == MUTEX_STATE_UNLOCKED) {
+ break;
+ }
+
+ syscall(SYS_futex, &m_lock_word,
+ FUTEX_WAIT_PRIVATE, MUTEX_STATE_WAITERS,
+ 0, 0, 0);
+ }
+
+ m_policy.add(n_spins, n_waits);
+ }
+
+ /** Release the mutex. */
+ void exit() UNIV_NOTHROW
+ {
+ if (m_lock_word.exchange(MUTEX_STATE_UNLOCKED,
+ std::memory_order_release)
+ == MUTEX_STATE_WAITERS) {
+ syscall(SYS_futex, &m_lock_word, FUTEX_WAKE_PRIVATE,
+ 1, 0, 0, 0);
+ }
+ }
+
+ /** Try and lock the mutex.
+ @return true if successful */
+ bool try_lock() UNIV_NOTHROW
+ {
+ int32 oldval = MUTEX_STATE_UNLOCKED;
+ return m_lock_word.compare_exchange_strong(
+ oldval,
+ MUTEX_STATE_LOCKED,
+ std::memory_order_acquire,
+ std::memory_order_relaxed);
+ }
+
+ /** @return non-const version of the policy */
+ MutexPolicy& policy() UNIV_NOTHROW
+ {
+ return(m_policy);
+ }
+
+ /** @return const version of the policy */
+ const MutexPolicy& policy() const UNIV_NOTHROW
+ {
+ return(m_policy);
+ }
+private:
+ /** Policy data */
+ MutexPolicy m_policy;
+
+ /** lock_word is the target of the atomic test-and-set instruction
+ when atomic operations are enabled. */
+ std::atomic<int32> m_lock_word;
+};
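+
+/* State protocol of TTASFutexMutex, as implemented above: a successful
+try_lock() moves the word from UNLOCKED to LOCKED; enter() publishes
+MUTEX_STATE_WAITERS before sleeping in futex(FUTEX_WAIT_PRIVATE); and
+exit() issues futex(FUTEX_WAKE_PRIVATE) only when the previous state
+was WAITERS, so an uncontended unlock avoids the system call. */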
+
+#endif /* __linux__ */
+
+template <template <typename> class Policy>
+struct TTASMutex {
+
+ typedef Policy<TTASMutex> MutexPolicy;
+
+ TTASMutex() UNIV_NOTHROW
+ :
+ m_lock_word(MUTEX_STATE_UNLOCKED)
+ {
+ /* Check that lock_word is aligned. */
+ ut_ad(!((ulint) &m_lock_word % sizeof(ulint)));
+ }
+
+ ~TTASMutex()
+ {
+ ut_ad(m_lock_word.load(std::memory_order_relaxed)
+ == MUTEX_STATE_UNLOCKED);
+ }
+
+ /** Called when the mutex is "created". Note: Not from the constructor
+ but when the mutex is initialised. */
+ void init(latch_id_t) UNIV_NOTHROW
+ {
+ ut_ad(m_lock_word.load(std::memory_order_relaxed)
+ == MUTEX_STATE_UNLOCKED);
+ }
+
+ /** Destroy the mutex. */
+ void destroy() UNIV_NOTHROW
+ {
+ /* The destructor can be called at shutdown. */
+ ut_ad(m_lock_word.load(std::memory_order_relaxed)
+ == MUTEX_STATE_UNLOCKED);
+ }
+
+ /** Try and lock the mutex.
+ @return true on success */
+ bool try_lock() UNIV_NOTHROW
+ {
+ uint32_t oldval = MUTEX_STATE_UNLOCKED;
+ return m_lock_word.compare_exchange_strong(
+ oldval,
+ MUTEX_STATE_LOCKED,
+ std::memory_order_acquire,
+ std::memory_order_relaxed);
+ }
+
+ /** Release the mutex. */
+ void exit() UNIV_NOTHROW
+ {
+ ut_ad(m_lock_word.load(std::memory_order_relaxed)
+ == MUTEX_STATE_LOCKED);
+ m_lock_word.store(MUTEX_STATE_UNLOCKED,
+ std::memory_order_release);
+ }
+
+ /** Acquire the mutex.
+ @param max_spins max number of spins
+ @param max_delay max delay per spin */
+ void enter(uint32_t max_spins, uint32_t max_delay,
+ const char*, uint32_t) UNIV_NOTHROW
+ {
+ const uint32_t step = max_spins;
+ uint32_t n_spins = 0;
+
+ while (!try_lock()) {
+ ut_delay(max_delay);
+ if (++n_spins == max_spins) {
+ os_thread_yield();
+ max_spins+= step;
+ }
+ }
+
+ m_policy.add(n_spins, 0);
+ }
+
+ /** @return non-const version of the policy */
+ MutexPolicy& policy() UNIV_NOTHROW
+ {
+ return(m_policy);
+ }
+
+ /** @return const version of the policy */
+ const MutexPolicy& policy() const UNIV_NOTHROW
+ {
+ return(m_policy);
+ }
+
+private:
+ // Disable copying
+ TTASMutex(const TTASMutex&);
+ TTASMutex& operator=(const TTASMutex&);
+
+ /** Policy data */
+ MutexPolicy m_policy;
+
+ /** mutex state */
+ std::atomic<uint32_t> m_lock_word;
+};
+
+template <template <typename> class Policy>
+struct TTASEventMutex {
+
+ typedef Policy<TTASEventMutex> MutexPolicy;
+
+ TTASEventMutex()
+ UNIV_NOTHROW
+ :
+ m_lock_word(MUTEX_STATE_UNLOCKED),
+ m_event()
+ {
+ /* Check that lock_word is aligned. */
+ ut_ad(!((ulint) &m_lock_word % sizeof(ulint)));
+ }
+
+ ~TTASEventMutex()
+ UNIV_NOTHROW
+ {
+ ut_ad(state() == MUTEX_STATE_UNLOCKED);
+ }
+
+ /** Called when the mutex is "created". Note: Not from the constructor
+ but when the mutex is initialised.
+ @param[in] id Mutex ID */
+ void init(latch_id_t id, const char*, uint32_t) UNIV_NOTHROW
+ {
+ ut_a(m_event == 0);
+ ut_ad(state() == MUTEX_STATE_UNLOCKED);
+
+ m_event = os_event_create(sync_latch_get_name(id));
+ }
+
+ /** This is the real destructor. This mutex can be created in BSS and
+ its destructor will be called on exit(). We can't call
+ os_event_destroy() at that stage. */
+ void destroy()
+ UNIV_NOTHROW
+ {
+ ut_ad(state() == MUTEX_STATE_UNLOCKED);
+
+ /* We have to free the event before InnoDB shuts down. */
+ os_event_destroy(m_event);
+ m_event = 0;
+ }
+
+ /** Try and lock the mutex.
+ @return true on success */
+ bool try_lock()
+ UNIV_NOTHROW
+ {
+ uint32_t oldval = MUTEX_STATE_UNLOCKED;
+ return m_lock_word.compare_exchange_strong(
+ oldval,
+ MUTEX_STATE_LOCKED,
+ std::memory_order_acquire,
+ std::memory_order_relaxed);
+ }
+
+ /** Release the mutex. */
+ void exit()
+ UNIV_NOTHROW
+ {
+ if (m_lock_word.exchange(MUTEX_STATE_UNLOCKED,
+ std::memory_order_release)
+ == MUTEX_STATE_WAITERS) {
+ os_event_set(m_event);
+ sync_array_object_signalled();
+ }
+ }
+
+ /** Acquire the mutex.
+ @param[in] max_spins max number of spins
+ @param[in] max_delay max delay per spin
+ @param[in] filename from where called
+ @param[in] line within filename */
+ void enter(
+ uint32_t max_spins,
+ uint32_t max_delay,
+ const char* filename,
+ uint32_t line)
+ UNIV_NOTHROW
+ {
+ uint32_t n_spins = 0;
+ uint32_t n_waits = 0;
+ const uint32_t step = max_spins;
+
+ while (!try_lock()) {
+ if (n_spins++ == max_spins) {
+ max_spins += step;
+ n_waits++;
+ os_thread_yield();
+
+ sync_cell_t* cell;
+ sync_array_t *sync_arr = sync_array_get_and_reserve_cell(
+ this, SYNC_MUTEX,
+ filename, line, &cell);
+
+ uint32_t oldval = MUTEX_STATE_LOCKED;
+ m_lock_word.compare_exchange_strong(
+ oldval,
+ MUTEX_STATE_WAITERS,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed);
+
+ if (oldval == MUTEX_STATE_UNLOCKED) {
+ sync_array_free_cell(sync_arr, cell);
+ } else {
+ sync_array_wait_event(sync_arr, cell);
+ }
+ } else {
+ ut_delay(max_delay);
+ }
+ }
+
+ m_policy.add(n_spins, n_waits);
+ }
+
+ /** @return the lock state. */
+ int32 state() const
+ UNIV_NOTHROW
+ {
+ return m_lock_word.load(std::memory_order_relaxed);
+ }
+
+	/** The event on which the mutex waits in sync0arr.cc
+	@return event instance */
+ os_event_t event()
+ UNIV_NOTHROW
+ {
+ return(m_event);
+ }
+
+ /** @return non-const version of the policy */
+ MutexPolicy& policy()
+ UNIV_NOTHROW
+ {
+ return(m_policy);
+ }
+
+ /** @return const version of the policy */
+ const MutexPolicy& policy() const
+ UNIV_NOTHROW
+ {
+ return(m_policy);
+ }
+
+private:
+ /** Disable copying */
+ TTASEventMutex(const TTASEventMutex&);
+ TTASEventMutex& operator=(const TTASEventMutex&);
+
+ /** mutex state */
+ std::atomic<uint32_t> m_lock_word;
+
+ /** Used by sync0arr.cc for the wait queue */
+ os_event_t m_event;
+
+ /** Policy data */
+ MutexPolicy m_policy;
+};
+
+/** Mutex interface for all policy mutexes. This class handles the interfacing
+with the Performance Schema instrumentation. */
+template <typename MutexImpl>
+struct PolicyMutex
+{
+ typedef typename MutexImpl::MutexPolicy Policy;
+
+ PolicyMutex() UNIV_NOTHROW : m_impl()
+ {
+#ifdef UNIV_PFS_MUTEX
+ m_ptr = 0;
+#endif /* UNIV_PFS_MUTEX */
+ }
+
+ ~PolicyMutex() { }
+
+ /** @return non-const version of the policy */
+ Policy& policy() UNIV_NOTHROW
+ {
+ return(m_impl.policy());
+ }
+
+ /** @return const version of the policy */
+ const Policy& policy() const UNIV_NOTHROW
+ {
+ return(m_impl.policy());
+ }
+
+ /** Release the mutex. */
+ void exit() UNIV_NOTHROW
+ {
+#ifdef UNIV_PFS_MUTEX
+ pfs_exit();
+#endif /* UNIV_PFS_MUTEX */
+
+ ut_d(policy().context.release(m_impl));
+
+ m_impl.exit();
+ }
+
+ /** Acquire the mutex.
+ @param n_spins max number of spins
+ @param n_delay max delay per spin
+ @param name filename where locked
+ @param line line number where locked */
+ void enter(
+ uint32_t n_spins,
+ uint32_t n_delay,
+ const char* name,
+ uint32_t line) UNIV_NOTHROW
+ {
+#ifdef UNIV_PFS_MUTEX
+		/* Note: locker points into state; that is why state
+		must stay in scope until pfs_end(). */
+
+ PSI_mutex_locker_state state;
+ PSI_mutex_locker* locker;
+
+ locker = pfs_begin_lock(&state, name, line);
+#endif /* UNIV_PFS_MUTEX */
+
+ ut_d(policy().context.enter(m_impl, name, line));
+
+ m_impl.enter(n_spins, n_delay, name, line);
+
+ ut_d(policy().context.locked(m_impl, name, line));
+#ifdef UNIV_PFS_MUTEX
+ pfs_end(locker, 0);
+#endif /* UNIV_PFS_MUTEX */
+ }
+
+	/** Try to lock the mutex; return 0 on success and 1 otherwise.
+ @param name filename where locked
+ @param line line number where locked */
+ int trylock(const char* name, uint32_t line) UNIV_NOTHROW
+ {
+#ifdef UNIV_PFS_MUTEX
+		/* Note: locker points into state; that is why state
+		must stay in scope until pfs_end(). */
+
+ PSI_mutex_locker_state state;
+ PSI_mutex_locker* locker;
+
+ locker = pfs_begin_trylock(&state, name, line);
+#endif /* UNIV_PFS_MUTEX */
+
+		/* There is a subtlety here: we check the mutex ordering
+		after locking. This is done to avoid an add followed by
+		a remove if the trylock was unsuccessful. */
+
+ int ret = m_impl.try_lock() ? 0 : 1;
+
+ if (ret == 0) {
+
+ ut_d(policy().context.enter(m_impl, name, line));
+
+ ut_d(policy().context.locked(m_impl, name, line));
+ }
+
+#ifdef UNIV_PFS_MUTEX
+ pfs_end(locker, 0);
+#endif /* UNIV_PFS_MUTEX */
+
+ return(ret);
+ }
+
+#ifdef UNIV_DEBUG
+ /** @return true if the thread owns the mutex. */
+ bool is_owned() const UNIV_NOTHROW
+ {
+ return(policy().context.is_owned());
+ }
+#endif /* UNIV_DEBUG */
+
+ /**
+ Initialise the mutex.
+
+ @param[in] id Mutex ID
+ @param[in] filename file where created
+ @param[in] line line number in file where created */
+ void init(
+ latch_id_t id,
+ const char* filename,
+ uint32_t line)
+ UNIV_NOTHROW
+ {
+#ifdef UNIV_PFS_MUTEX
+ pfs_add(sync_latch_get_pfs_key(id));
+#endif /* UNIV_PFS_MUTEX */
+
+ m_impl.init(id, filename, line);
+ policy().init(m_impl, id, filename, line);
+ ut_d(policy().context.init(id));
+ }
+
+ /** Free resources (if any) */
+ void destroy() UNIV_NOTHROW
+ {
+#ifdef UNIV_PFS_MUTEX
+ pfs_del();
+#endif /* UNIV_PFS_MUTEX */
+ m_impl.destroy();
+ policy().destroy();
+ ut_d(policy().context.destroy());
+ }
+
+ /** Required for os_event_t */
+ operator sys_mutex_t*() UNIV_NOTHROW
+ {
+ return(m_impl.operator sys_mutex_t*());
+ }
+
+#ifdef UNIV_PFS_MUTEX
+ /** Performance schema monitoring - register mutex with PFS.
+
+	Note: This is public only to work around an issue with registering
+	a subset of buffer pool pages with PFS when PFS_GROUP_BUFFER_SYNC
+	is defined, which requires this to be called by external code
+	(see buf0buf.cc).
+
+ @param key - Performance Schema key. */
+ void pfs_add(mysql_pfs_key_t key) UNIV_NOTHROW
+ {
+ ut_ad(m_ptr == 0);
+ m_ptr = PSI_MUTEX_CALL(init_mutex)(key, this);
+ }
+
+private:
+
+ /** Performance schema monitoring.
+ @param state - PFS locker state
+ @param name - file name where locked
+ @param line - line number in file where locked */
+ PSI_mutex_locker* pfs_begin_lock(
+ PSI_mutex_locker_state* state,
+ const char* name,
+ uint32_t line) UNIV_NOTHROW
+ {
+ if (m_ptr != 0) {
+ return(PSI_MUTEX_CALL(start_mutex_wait)(
+ state, m_ptr,
+ PSI_MUTEX_LOCK, name, (uint) line));
+ }
+
+ return(0);
+ }
+
+ /** Performance schema monitoring.
+ @param state - PFS locker state
+ @param name - file name where locked
+ @param line - line number in file where locked */
+ PSI_mutex_locker* pfs_begin_trylock(
+ PSI_mutex_locker_state* state,
+ const char* name,
+ uint32_t line) UNIV_NOTHROW
+ {
+ if (m_ptr != 0) {
+ return(PSI_MUTEX_CALL(start_mutex_wait)(
+ state, m_ptr,
+ PSI_MUTEX_TRYLOCK, name, (uint) line));
+ }
+
+ return(0);
+ }
+
+ /** Performance schema monitoring
+ @param locker - PFS identifier
+ @param ret - 0 for success and 1 for failure */
+ void pfs_end(PSI_mutex_locker* locker, int ret) UNIV_NOTHROW
+ {
+ if (locker != 0) {
+ PSI_MUTEX_CALL(end_mutex_wait)(locker, ret);
+ }
+ }
+
+ /** Performance schema monitoring - register mutex release */
+ void pfs_exit()
+ {
+ if (m_ptr != 0) {
+ PSI_MUTEX_CALL(unlock_mutex)(m_ptr);
+ }
+ }
+
+ /** Performance schema monitoring - deregister */
+ void pfs_del()
+ {
+ if (m_ptr != 0) {
+ PSI_MUTEX_CALL(destroy_mutex)(m_ptr);
+ m_ptr = 0;
+ }
+ }
+#endif /* UNIV_PFS_MUTEX */
+
+private:
+ /** The mutex implementation */
+ MutexImpl m_impl;
+
+#ifdef UNIV_PFS_MUTEX
+ /** The performance schema instrumentation hook. */
+ PSI_mutex* m_ptr;
+#endif /* UNIV_PFS_MUTEX */
+
+};
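+
+/* Illustrative usage sketch (not part of this header): in practice a
+PolicyMutex is driven through the mutex_create()/mutex_enter()/mutex_exit()
+wrappers, but the raw interface looks roughly as below. LATCH_ID_EXAMPLE is
+a hypothetical placeholder, and GenericPolicy is assumed from sync0policy.h.
+
+	PolicyMutex<TTASEventMutex<GenericPolicy> >	m;
+	m.init(LATCH_ID_EXAMPLE, __FILE__, __LINE__);
+	m.enter(srv_n_spin_wait_rounds, srv_spin_wait_delay,
+		__FILE__, __LINE__);
+	// ... critical section ...
+	m.exit();
+	m.destroy();
+*/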
+
+#endif /* ib0mutex_h */
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h
new file mode 100644
index 00000000..cb418e57
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.h
@@ -0,0 +1,411 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.h
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0ibuf_h
+#define ibuf0ibuf_h
+
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "fsp0fsp.h"
+#include "ibuf0types.h"
+
+/** Default value for maximum on-disk size of change buffer in terms
+of percentage of the buffer pool. */
+#define CHANGE_BUFFER_DEFAULT_SIZE (25)
+
+/* Possible operations buffered in the insert/change buffer. See
+ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */
+typedef enum {
+ IBUF_OP_INSERT = 0,
+ IBUF_OP_DELETE_MARK = 1,
+ IBUF_OP_DELETE = 2,
+
+ /* Number of different operation types. */
+ IBUF_OP_COUNT = 3
+} ibuf_op_t;
+
+/** Combinations of operations that can be buffered.
+@see innodb_change_buffering_names */
+enum ibuf_use_t {
+ IBUF_USE_NONE = 0,
+ IBUF_USE_INSERT, /* insert */
+ IBUF_USE_DELETE_MARK, /* delete */
+ IBUF_USE_INSERT_DELETE_MARK, /* insert+delete */
+ IBUF_USE_DELETE, /* delete+purge */
+ IBUF_USE_ALL /* insert+delete+purge */
+};
+
+/** Operations that can currently be buffered. */
+extern ulong innodb_change_buffering;
+
+/** The insert buffer control structure */
+extern ibuf_t ibuf;
+
+/* The purpose of the insert buffer is to reduce random disk access.
+When we wish to insert a record into a non-unique secondary index and
+the B-tree leaf page where the record belongs to is not in the buffer
+pool, we insert the record into the insert buffer B-tree, indexed by
+(space_id, page_no). When the page is eventually read into the buffer
+pool, we look up the insert buffer B-tree for any modifications to the
+page, and apply these upon the completion of the read operation. This
+is called the insert buffer merge. */
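+
+/* Illustrative sketch (not part of this header): the decision a caller
+could make for a secondary-index insert, using the declarations below.
+Control flow only; real callers also handle latching and retries.
+
+	if (ibuf_should_try(index, 0)
+	    && ibuf_insert(IBUF_OP_INSERT, entry, index,
+			   page_id, zip_size, thr)) {
+		// Buffered: applied later, when
+		// ibuf_merge_or_delete_for_page() runs on page read.
+	} else {
+		// Read the leaf page and apply the insert directly.
+	}
+*/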
+
+/* The insert buffer merge must always succeed. To guarantee this,
+the insert buffer subsystem keeps track of the free space in pages for
+which it can buffer operations. Two bits per page in the insert
+buffer bitmap indicate the available space in coarse increments. The
+free bits in the insert buffer bitmap must never exceed the free space
+on a page. It is safe to decrement or reset the bits in the bitmap in
+a mini-transaction that is committed before the mini-transaction that
+affects the free space. It is unsafe to increment the bits in a
+separately committed mini-transaction, because in crash recovery, the
+free bits could momentarily be set too high. */
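+
+/* For example, the safe ordering is: first commit a mini-transaction
+that decrements the free bits, then commit the mini-transaction that
+consumes the space on the page. If the server crashes between the two
+commits, the bits merely underestimate the free space, which is
+harmless; the reverse order could overestimate it and break the
+guarantee that a merge always succeeds. */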
+
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup.
+@return DB_SUCCESS or failure */
+dberr_t
+ibuf_init_at_db_start(void);
+/*=======================*/
+/*********************************************************************//**
+Updates the max_size value for ibuf. */
+void
+ibuf_max_size_update(
+/*=================*/
+ ulint new_val); /*!< in: new value in terms of
+ percentage of the buffer pool size */
+/*********************************************************************//**
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+void
+ibuf_update_max_tablespace_id(void);
+/*===============================*/
+/***************************************************************//**
+Starts an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_start(
+/*===========*/
+ mtr_t* mtr) /*!< out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Commits an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_commit(
+/*============*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+void
+ibuf_reset_free_bits(
+/*=================*/
+ buf_block_t* block); /*!< in: index page; free bits are set to 0
+ if the index is a non-clustered
+ non-unique, and page level is 0 */
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free space on the page any more. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high. It is only safe to use this function for
+decrementing the free bits. Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+ buf_block_t* block, /*!< in: index page to which we have added new
+ records; the free bits are updated if the
+ index is non-clustered and non-unique and
+ the page level is 0, and the page becomes
+ fuller */
+ ulint max_ins_size,/*!< in: value of maximum insert size with
+ reorganize before the latest operation
+ performed to the page */
+ ulint increase);/*!< in: upper limit for the additional space
+ used in the latest operation, if known, or
+ ULINT_UNDEFINED */
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_low(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ ulint max_ins_size, /*!< in: value of
+ maximum insert size
+ with reorganize before
+ the latest operation
+ performed to the page */
+ mtr_t* mtr); /*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+ buf_block_t* block, /*!< in/out: index page */
+ mtr_t* mtr); /*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page. It is safe to set the free bits in the same
+mini-transaction that updated the pages. */
+void
+ibuf_update_free_bits_for_two_pages_low(
+/*====================================*/
+ buf_block_t* block1, /*!< in: index page */
+ buf_block_t* block2, /*!< in: index page */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+A quick partial test of whether buffering an insert in the insert buffer
+is possible and recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint ignore_sec_unique); /*!< in: if != 0, we should
+ ignore UNIQUE constraint on
+ a secondary index when we
+ decide */
+/******************************************************************//**
+Returns TRUE if the current OS thread is performing an insert buffer
+routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden by threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INLINE
+ibool
+ibuf_inside(
+/*========*/
+ const mtr_t* mtr) /*!< in: mini-transaction */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Checks if a page address is an ibuf bitmap page (level 3 page) address.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return TRUE if a bitmap page */
+inline bool ibuf_bitmap_page(const page_id_t page_id, ulint zip_size)
+{
+ ut_ad(ut_is_2pow(zip_size));
+ ulint size = zip_size ? zip_size : srv_page_size;
+ return (page_id.page_no() & (size - 1)) == FSP_IBUF_BITMAP_OFFSET;
+}
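+
+/* For example, with zip_size == 0 and srv_page_size == 16384, every page
+whose number is congruent to FSP_IBUF_BITMAP_OFFSET modulo 16384 is an
+ibuf bitmap page. */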
+
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==true.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] x_latch FALSE if relaxed check (avoid latching the
+bitmap page)
+@param[in] file file name
+@param[in] line line where called
+@param[in,out] mtr mtr which will contain an x-latch to the
+bitmap page if the page is not one of the fixed address ibuf pages, or NULL,
+in which case a new mini-transaction is created.
+@return true if level 2 or level 3 page */
+bool
+ibuf_page_low(
+ const page_id_t page_id,
+ ulint zip_size,
+#ifdef UNIV_DEBUG
+ bool x_latch,
+#endif /* UNIV_DEBUG */
+ const char* file,
+ unsigned line,
+ mtr_t* mtr)
+ MY_ATTRIBUTE((warn_unused_result));
+
+#ifdef UNIV_DEBUG
+
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==true.
+@param[in] page_id tablespace/page identifier
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction or NULL
+@return TRUE if level 2 or level 3 page */
+# define ibuf_page(page_id, zip_size, mtr) \
+ ibuf_page_low(page_id, zip_size, true, __FILE__, __LINE__, mtr)
+
+#else /* UNIV_DEBUG */
+
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==true.
+@param[in] page_id tablespace/page identifier
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction or NULL
+@return TRUE if level 2 or level 3 page */
+# define ibuf_page(page_id, zip_size, mtr) \
+ ibuf_page_low(page_id, zip_size, __FILE__, __LINE__, mtr)
+
+#endif /* UNIV_DEBUG */
+/***********************************************************************//**
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call. */
+void
+ibuf_free_excess_pages(void);
+/*========================*/
+
+/** Buffer an operation in the change buffer, instead of applying it
+directly to the file page, if this is possible. Does not do it if the index
+is clustered or unique.
+@param[in] op operation type
+@param[in] entry index entry to insert
+@param[in,out] index index where to insert
+@param[in] page_id page id where to insert
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] thr query thread
+@return true if success */
+bool
+ibuf_insert(
+ ibuf_op_t op,
+ const dtuple_t* entry,
+ dict_index_t* index,
+ const page_id_t page_id,
+ ulint zip_size,
+ que_thr_t* thr);
+
+/** Check whether buffered changes exist for a page.
+@param[in] id page identifier
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return whether buffered changes exist */
+bool ibuf_page_exists(const page_id_t id, ulint zip_size);
+
+/** When an index page is read from a disk to the buffer pool, this function
+applies any buffered operations to the page and deletes the entries from the
+insert buffer. If the page is not read, but created in the buffer pool, this
+function deletes its buffered entries from the insert buffer; there can
+exist entries for such a page if the page belonged to an index which
+subsequently was dropped.
+@param block X-latched page to try to apply changes to, or NULL to discard
+@param page_id page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 */
+void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
+ ulint zip_size);
+
+/** Delete all change buffer entries for a tablespace,
+in DISCARD TABLESPACE, IMPORT TABLESPACE, or crash recovery.
+@param[in] space missing or to-be-discarded tablespace */
+void ibuf_delete_for_discarded_space(ulint space);
+
+/** Contract the change buffer by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+ulint ibuf_merge_all();
+
+/** Contracts insert buffer trees by reading pages referring to space_id
+to the buffer pool.
+@return number of pages merged. */
+ulint
+ibuf_merge_space(
+/*=============*/
+ ulint space); /*!< in: space id */
+
+/******************************************************************//**
+Checks whether the insert buffer is empty.
+@return true if empty */
+bool
+ibuf_is_empty(void);
+/*===============*/
+/******************************************************************//**
+Prints info of ibuf. */
+void
+ibuf_print(
+/*=======*/
+ FILE* file); /*!< in: file where to print */
+/********************************************************************
+Read the first two bytes from a record's fourth field (counter field in new
+records; something else in older records).
+@return "counter" field, or ULINT_UNDEFINED if for some reason it can't be read */
+ulint
+ibuf_rec_get_counter(
+/*=================*/
+ const rec_t* rec); /*!< in: ibuf record */
+/******************************************************************//**
+Closes insert buffer and frees the data structures. */
+void
+ibuf_close(void);
+/*============*/
+
+/** Check the insert buffer bitmaps on IMPORT TABLESPACE.
+@param[in] trx transaction
+@param[in,out] space tablespace being imported
+@return DB_SUCCESS or error code */
+dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Updates free bits and buffered bits for bulk loaded page.
+@param[in]	block	index page
+@param[in]	reset	whether the free bits should be reset */
+void
+ibuf_set_bitmap_for_bulk_load(
+ buf_block_t* block,
+ bool reset);
+
+#define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO
+#define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO
+
+/* The ibuf header page currently contains only the file segment header
+for the file segment from which the pages for the ibuf tree are allocated */
+#define IBUF_HEADER PAGE_DATA
+#define IBUF_TREE_SEG_HEADER 0 /* fseg header for ibuf tree */
+
+/* The insert buffer tree itself is always located in space 0. */
+#define IBUF_SPACE_ID static_cast<ulint>(0)
+
+#include "ibuf0ibuf.ic"
+
+#endif
diff --git a/storage/innobase/include/ibuf0ibuf.ic b/storage/innobase/include/ibuf0ibuf.ic
new file mode 100644
index 00000000..2c262051
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.ic
@@ -0,0 +1,307 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.ic
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "page0page.h"
+#include "page0zip.h"
+#include "fsp0types.h"
+#include "buf0lru.h"
+
+/** An index page must contain at least srv_page_size /
+IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to
+buffer inserts to this page. If at least this much free space is
+available, the corresponding bits are set in the ibuf bitmap. */
+#define IBUF_PAGE_SIZE_PER_FREE_SPACE 32
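+
+/* For example, with the default srv_page_size of 16384 bytes this
+threshold is 16384 / 32 = 512 bytes, and the free-space bookkeeping
+below works in multiples of that 512-byte unit. */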
+
+/***************************************************************//**
+Starts an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_start(
+/*===========*/
+ mtr_t* mtr) /*!< out: mini-transaction */
+{
+ mtr_start(mtr);
+ mtr->enter_ibuf();
+
+ if (high_level_read_only || srv_read_only_mode) {
+ mtr_set_log_mode(mtr, MTR_LOG_NO_REDO);
+ }
+}
+/***************************************************************//**
+Commits an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_commit(
+/*============*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mtr->is_inside_ibuf());
+ ut_d(mtr->exit_ibuf());
+
+ mtr_commit(mtr);
+}
+
+/** Insert buffer struct */
+struct ibuf_t{
+ ulint size; /*!< current size of the ibuf index
+ tree, in pages */
+ ulint max_size; /*!< recommended maximum size of the
+ ibuf index tree, in pages */
+ ulint seg_size; /*!< allocated pages of the file
+ segment containing ibuf header and
+ tree */
+ bool empty; /*!< Protected by the page
+ latch of the root page of the
+ insert buffer tree
+ (FSP_IBUF_TREE_ROOT_PAGE_NO). true
+ if and only if the insert
+ buffer tree is empty. */
+ ulint free_list_len; /*!< length of the free list */
+ ulint height; /*!< tree height */
+ dict_index_t* index; /*!< insert buffer index */
+
+ /** number of pages merged */
+ Atomic_counter<ulint> n_merges;
+ Atomic_counter<ulint> n_merged_ops[IBUF_OP_COUNT];
+ /*!< number of operations of each type
+ merged to index pages */
+ Atomic_counter<ulint> n_discarded_ops[IBUF_OP_COUNT];
+ /*!< number of operations of each type
+ discarded without merging due to the
+ tablespace being deleted or the
+ index being dropped */
+};
+
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+void
+ibuf_set_free_bits_func(
+/*====================*/
+ buf_block_t* block, /*!< in: index page of a non-clustered index;
+ free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+ ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum
+ value which the bits must have before
+ setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+ ulint val); /*!< in: value to set: < 4 */
+#ifdef UNIV_IBUF_DEBUG
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v)
+#else /* UNIV_IBUF_DEBUG */
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v)
+#endif /* UNIV_IBUF_DEBUG */
+
+/**********************************************************************//**
+A quick partial test of whether buffering an insert in the insert buffer
+is possible and recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint ignore_sec_unique) /*!< in: if != 0, we should
+ ignore UNIQUE constraint on
+ a secondary index when we
+ decide */
+{
+ return(innodb_change_buffering
+ && ibuf.max_size != 0
+ && !dict_index_is_clust(index)
+ && !dict_index_is_spatial(index)
+ && index->table->quiesce == QUIESCE_NONE
+ && (ignore_sec_unique || !dict_index_is_unique(index)));
+}
+
+/******************************************************************//**
+Returns TRUE if the current OS thread is performing an insert buffer
+routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden by threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INLINE
+ibool
+ibuf_inside(
+/*========*/
+ const mtr_t* mtr) /*!< in: mini-transaction */
+{
+ return(mtr->is_inside_ibuf());
+}
+
+/** Translates the free space on a page to a value in the ibuf bitmap.
+@param[in] page_size page size in bytes
+@param[in] max_ins_size maximum insert size after reorganize for
+the page
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_bits(
+ ulint page_size,
+ ulint max_ins_size)
+{
+ ulint n;
+ ut_ad(ut_is_2pow(page_size));
+ ut_ad(page_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+
+ n = max_ins_size / (page_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+
+ if (n == 3) {
+ n = 2;
+ }
+
+ if (n > 3) {
+ n = 3;
+ }
+
+ return(n);
+}
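+
+/* Worked example, assuming srv_page_size == 16384: the unit is
+16384 / 32 = 512 bytes, so max_ins_size == 1400 maps to n == 2, a raw
+quotient of exactly 3 (1536..2047 bytes) is rounded down to 2, and only
+2048 bytes or more saturates at the maximum bitmap value 3. */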
+
+/*********************************************************************//**
+Translates the free space on a compressed page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_zip(
+/*==========================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ ulint max_ins_size;
+ const page_zip_des_t* page_zip;
+ lint zip_max_ins;
+
+ ut_ad(block->page.zip.data);
+
+ /* Consider the maximum insert size on the uncompressed page
+ without reorganizing the page. We must not assume anything
+ about the compression ratio. If zip_max_ins > max_ins_size and
+ there is 1/4 garbage on the page, recompression after the
+ reorganize could fail, in theory. So, let us guarantee that
+ merging a buffered insert to a compressed page will always
+ succeed without reorganizing or recompressing the page, just
+ by using the page modification log. */
+ max_ins_size = page_get_max_insert_size(
+ buf_block_get_frame(block), 1);
+
+ page_zip = buf_block_get_page_zip(block);
+ zip_max_ins = page_zip_max_ins_size(page_zip,
+ FALSE/* not clustered */);
+
+ if (zip_max_ins < 0) {
+ return(0);
+ } else if (max_ins_size > (ulint) zip_max_ins) {
+ max_ins_size = (ulint) zip_max_ins;
+ }
+
+ return(ibuf_index_page_calc_free_bits(block->physical_size(),
+ max_ins_size));
+}
+
+/*********************************************************************//**
+Translates the free space on a page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free(
+/*======================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ if (!block->page.zip.data) {
+ ulint max_ins_size;
+
+ max_ins_size = page_get_max_insert_size_after_reorganize(
+ buf_block_get_frame(block), 1);
+
+ return(ibuf_index_page_calc_free_bits(
+ block->physical_size(), max_ins_size));
+ } else {
+ return(ibuf_index_page_calc_free_zip(block));
+ }
+}
+
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free space on the page any more. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high. It is only safe to use this function for
+decrementing the free bits. Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+ buf_block_t* block, /*!< in: index page to which we have added new
+ records; the free bits are updated if the
+ index is non-clustered and non-unique and
+ the page level is 0, and the page becomes
+ fuller */
+ ulint max_ins_size,/*!< in: value of maximum insert size with
+ reorganize before the latest operation
+ performed to the page */
+ ulint increase)/*!< in: upper limit for the additional space
+ used in the latest operation, if known, or
+ ULINT_UNDEFINED */
+{
+ ulint before;
+ ulint after;
+
+ ut_ad(buf_block_get_page_zip(block) == NULL);
+
+ before = ibuf_index_page_calc_free_bits(
+ srv_page_size, max_ins_size);
+
+ if (max_ins_size >= increase) {
+ compile_time_assert(ULINT32_UNDEFINED > UNIV_PAGE_SIZE_MAX);
+ after = ibuf_index_page_calc_free_bits(
+ srv_page_size, max_ins_size - increase);
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(after <= ibuf_index_page_calc_free(block));
+#endif
+ } else {
+ after = ibuf_index_page_calc_free(block);
+ }
+
+ if (after == 0) {
+ /* We move the page to the front of the buffer pool LRU list:
+ the purpose of this is to prevent those pages to which we
+ cannot make inserts using the insert buffer from slipping
+ out of the buffer pool */
+
+ buf_page_make_young(&block->page);
+ }
+
+ if (before > after) {
+ ibuf_set_free_bits(block, after, before);
+ }
+}
diff --git a/storage/innobase/include/ibuf0types.h b/storage/innobase/include/ibuf0types.h
new file mode 100644
index 00000000..6b7c4720
--- /dev/null
+++ b/storage/innobase/include/ibuf0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0types.h
+Insert buffer global types
+
+Created 7/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0types_h
+#define ibuf0types_h
+
+struct ibuf_t;
+
+#endif
diff --git a/storage/innobase/include/lock0iter.h b/storage/innobase/include/lock0iter.h
new file mode 100644
index 00000000..a7e61395
--- /dev/null
+++ b/storage/innobase/include/lock0iter.h
@@ -0,0 +1,66 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0iter.h
+Lock queue iterator type and function prototypes.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0iter_h
+#define lock0iter_h
+
+#include "lock0types.h"
+
+struct lock_queue_iterator_t {
+ const lock_t* current_lock;
+ /* In case this is a record lock queue (not table lock queue)
+ then bit_no is the record number within the heap in which the
+ record is stored. */
+ ulint bit_no;
+};
+
+/*******************************************************************//**
+Initialize lock queue iterator so that it starts to iterate from
+"lock". bit_no specifies the record number within the heap where the
+record is stored. It can be undefined (ULINT_UNDEFINED) in two cases:
+1. If the lock is a table lock, thus we have a table lock queue;
+2. If the lock is a record lock and it is a wait lock. In this case
+ bit_no is calculated in this function by using
+ lock_rec_find_set_bit(). There is exactly one bit set in the bitmap
+ of a wait lock. */
+void
+lock_queue_iterator_reset(
+/*======================*/
+ lock_queue_iterator_t* iter, /*!< out: iterator */
+ const lock_t* lock, /*!< in: lock to start from */
+ ulint bit_no);/*!< in: record number in the
+ heap */
+
+/*******************************************************************//**
+Gets the previous lock in the lock queue, or returns NULL if there are no
+more locks (i.e. the current lock is the first one). The iterator is
+moved one step back (if a non-NULL lock is returned).
+@return previous lock or NULL */
+const lock_t*
+lock_queue_iterator_get_prev(
+/*=========================*/
+ lock_queue_iterator_t* iter); /*!< in/out: iterator */
+
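+/* Illustrative usage sketch (hypothetical caller): walk a record lock
+queue backwards, starting from a waiting lock, for which bit_no may be
+ULINT_UNDEFINED as described above.
+
+	lock_queue_iterator_t	iter;
+	lock_queue_iterator_reset(&iter, wait_lock, ULINT_UNDEFINED);
+
+	while (const lock_t* prev = lock_queue_iterator_get_prev(&iter)) {
+		// e.g. check lock_has_to_wait(wait_lock, prev)
+	}
+*/
+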
+#endif /* lock0iter_h */
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
new file mode 100644
index 00000000..3b63b06a
--- /dev/null
+++ b/storage/innobase/include/lock0lock.h
@@ -0,0 +1,990 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.h
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef lock0lock_h
+#define lock0lock_h
+
+#include "buf0types.h"
+#include "trx0types.h"
+#include "mtr0types.h"
+#include "rem0types.h"
+#include "que0types.h"
+#include "lock0types.h"
+#include "hash0hash.h"
+#include "srv0srv.h"
+#include "ut0vec.h"
+#include "gis0rtree.h"
+#include "lock0prdt.h"
+
+/** Alternatives for the innodb_lock_schedule_algorithm setting. */
+enum innodb_lock_schedule_algorithm_t {
+ /*!< First Come First Served */
+ INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
+ /*!< Variance-Aware-Transaction-Scheduling */
+ INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
+};
+
+extern ulong innodb_lock_schedule_algorithm;
+
+// Forward declaration
+class ReadView;
+
+/** The value of innodb_deadlock_detect */
+extern my_bool innobase_deadlock_detect;
+
+/*********************************************************************//**
+Gets the size of a lock struct.
+@return size in bytes */
+ulint
+lock_get_size(void);
+/*===============*/
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+ const buf_block_t* block); /*!< in: buffer block */
+/*************************************************************//**
+Updates the lock table when we have reorganized a page. NOTE: we also
+copy the locks set on the infimum of the page; the infimum may carry
+locks if an update of a record is occurring on the page, and its locks
+were temporarily stored on the infimum. */
+void
+lock_move_reorganize_page(
+/*======================*/
+ const buf_block_t* block, /*!< in: old index page, now
+ reorganized */
+ const buf_block_t* oblock);/*!< in: copy of the old, not
+ reorganized page */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list end is moved to another page. */
+void
+lock_move_rec_list_end(
+/*===================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec); /*!< in: record on page: this
+ is the first record moved */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+void
+lock_move_rec_list_start(
+/*=====================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec, /*!< in: record on page:
+ this is the first
+ record NOT copied */
+ const rec_t* old_end); /*!< in: old
+ previous-to-last
+ record on new_page
+ before the records
+ were copied */
+/*************************************************************//**
+Updates the lock table when a page is split to the right. */
+void
+lock_update_split_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block); /*!< in: left page */
+/*************************************************************//**
+Updates the lock table when a page is merged to the right. */
+void
+lock_update_merge_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page to
+ which merged */
+ const rec_t* orig_succ, /*!< in: original
+ successor of infimum
+ on the right page
+ before merge */
+ const buf_block_t* left_block); /*!< in: merged index
+ page which will be
+ discarded */
+/*************************************************************//**
+Updates the lock table when the root page is copied to another in
+btr_root_raise_and_insert. Note that we leave lock structs on the
+root page, even though they do not make sense on other than leaf
+pages: the reason is that in a pessimistic update the infimum record
+of the root page will act as a dummy carrier of the locks of the record
+to be updated. */
+void
+lock_update_root_raise(
+/*===================*/
+ const buf_block_t* block, /*!< in: index page to which copied */
+ const buf_block_t* root); /*!< in: root page */
+/*************************************************************//**
+Updates the lock table when a page is copied to another and the original page
+is removed from the chain of leaf pages, except if page is the root! */
+void
+lock_update_copy_and_discard(
+/*=========================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ which copied */
+ const buf_block_t* block); /*!< in: index page;
+ NOT the root! */
+/*************************************************************//**
+Updates the lock table when a page is split to the left. */
+void
+lock_update_split_left(
+/*===================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block); /*!< in: left page */
+/*************************************************************//**
+Updates the lock table when a page is merged to the left. */
+void
+lock_update_merge_left(
+/*===================*/
+ const buf_block_t* left_block, /*!< in: left page to
+ which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor
+ of supremum on the left page
+ before merge */
+ const buf_block_t* right_block); /*!< in: merged index page
+ which will be discarded */
+/*************************************************************//**
+Updates the lock table when a page is split and merged to
+two pages. */
+UNIV_INTERN
+void
+lock_update_split_and_merge(
+ const buf_block_t* left_block, /*!< in: left page to which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor of
+ supremum on the left page before merge*/
+ const buf_block_t* right_block);/*!< in: right page from which merged */
+/*************************************************************//**
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. */
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+ const buf_block_t* heir_block, /*!< in: block containing the
+ record which inherits */
+ const buf_block_t* block, /*!< in: block containing the
+ record from which inherited;
+ does NOT reset the locks on
+ this record */
+ ulint heir_heap_no, /*!< in: heap_no of the
+ inheriting record */
+ ulint heap_no); /*!< in: heap_no of the
+ donating record */
+/*************************************************************//**
+Updates the lock table when a page is discarded. */
+void
+lock_update_discard(
+/*================*/
+ const buf_block_t* heir_block, /*!< in: index page
+ which will inherit the locks */
+ ulint heir_heap_no, /*!< in: heap_no of the record
+ which will inherit the locks */
+ const buf_block_t* block); /*!< in: index page
+ which will be discarded */
+/*************************************************************//**
+Updates the lock table when a new user record is inserted. */
+void
+lock_update_insert(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: the inserted record */
+/*************************************************************//**
+Updates the lock table when a record is removed. */
+void
+lock_update_delete(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: the record to be removed */
+/*********************************************************************//**
+Stores on the page infimum record the explicit locks of another record.
+This function is used to store the lock state of a record when it is
+updated and the size of the record changes in the update. The record
+is in such an update moved, perhaps to another page. The infimum record
+acts as a dummy carrier record, taking care of lock releases while the
+actual record is being moved. */
+void
+lock_rec_store_on_page_infimum(
+/*===========================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: record whose lock state
+ is stored on the infimum
+ record of the same page; lock
+ bits are reset on the
+ record */
+/*********************************************************************//**
+Restores the state of explicit lock requests on a single record, where the
+state was stored on the infimum of the page. */
+void
+lock_rec_restore_from_page_infimum(
+/*===============================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record whose lock state
+ is restored */
+ const buf_block_t* donator);/*!< in: page (rec is not
+ necessarily on this page)
+ whose infimum stored the lock
+ state; lock bits are reset on
+ the infimum */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a record. If they do, first tests if the query thread should anyway
+be suspended for some reason; if not, then puts the transaction and
+the query thread to the lock wait state and inserts a waiting request
+for a gap x-lock to the lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_rec_insert_check_and_lock(
+/*===========================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
+ set, does nothing */
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+	bool*		inherit)/*!< out: set to true if the newly
+					inserted record may need to inherit
+ LOCK_GAP type locks from the successor
+ record */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified */
+ dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify
+(delete mark or delete unmark) of a secondary index record.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified; NOTE: as this is a secondary
+ index, we always have to modify the
+ clustered index record first: see the
+ comment below */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr, /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Like lock_clust_rec_read_check_and_lock(), but reads a
+secondary index record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. This is an alternative version of
+lock_clust_rec_read_check_and_lock() that does not require the parameter
+"offsets".
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_read_check_and_lock_alt(
+/*===================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Checks that a record is seen in a consistent read.
+@return true if the record is seen, or false if an earlier version of
+the record should be retrieved */
+bool
+lock_clust_rec_cons_read_sees(
+/*==========================*/
+ const rec_t* rec, /*!< in: user record which should be read or
+ passed over by a read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ReadView* view); /*!< in: consistent read view */
+/*********************************************************************//**
+Checks that a non-clustered index record is seen in a consistent read.
+
+NOTE that a non-clustered index page contains so little information on
+its modifications that even when this returns false, the present version
+of rec may still be the right one; this must be checked against the
+clustered index record.
+
+@return true if certainly sees, or false if an earlier version of the
+clustered index record might be needed */
+bool
+lock_sec_rec_cons_read_sees(
+/*========================*/
+ const rec_t* rec, /*!< in: user record which
+ should be read or passed over
+ by a read cursor */
+ const dict_index_t* index, /*!< in: index */
+ const ReadView* view) /*!< in: consistent read view */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Locks the specified database table in the mode given. If the lock cannot
+be granted immediately, the query thread is put to wait.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_table(
+/*=======*/
+ unsigned flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set,
+ does nothing */
+ dict_table_t* table, /*!< in/out: database table
+ in dictionary cache */
+ lock_mode mode, /*!< in: lock mode */
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Creates a table IX lock object for a resurrected transaction. */
+void
+lock_table_ix_resurrect(
+/*====================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx); /*!< in/out: transaction */
+
+/** Sets a lock on a table based on the given mode.
+@param[in] table table to lock
+@param[in,out] trx transaction
+@param[in] mode LOCK_X or LOCK_S
+@return error code or DB_SUCCESS. */
+dberr_t
+lock_table_for_trx(
+ dict_table_t* table,
+ trx_t* trx,
+ enum lock_mode mode)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*************************************************************//**
+Removes a granted record lock of a transaction from the queue and grants
+locks to other transactions waiting in the queue if they now are entitled
+to a lock. */
+void
+lock_rec_unlock(
+/*============*/
+ trx_t* trx, /*!< in/out: transaction that has
+ set a record lock */
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record */
+ lock_mode lock_mode);/*!< in: LOCK_S or LOCK_X */
+
+/** Release the explicit locks of a committing transaction,
+and release possible other transactions waiting because of these locks. */
+void lock_release(trx_t* trx);
+
+/*************************************************************//**
+Get the lock hash table */
+UNIV_INLINE
+hash_table_t*
+lock_hash_get(
+/*==========*/
+ ulint mode); /*!< in: lock mode */
+
+/**********************************************************************//**
+Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED,
+if none found.
+@return bit index == heap number of the record, or ULINT_UNDEFINED if
+none found */
+ulint
+lock_rec_find_set_bit(
+/*==================*/
+ const lock_t* lock); /*!< in: record lock with at least one
+ bit set */
+
+/*********************************************************************//**
+Checks if a lock request lock1 has to wait for request lock2.
+@return whether lock1 has to wait for lock2 to be removed */
+bool
+lock_has_to_wait(
+/*=============*/
+ const lock_t* lock1, /*!< in: waiting lock */
+ const lock_t* lock2); /*!< in: another lock; NOTE that it is
+ assumed that this has a lock bit set
+ on the same record as in lock1 if the
+ locks are record locks */
+/*********************************************************************//**
+Reports that a transaction id is not sensible, i.e., it is in the future. */
+ATTRIBUTE_COLD
+void
+lock_report_trx_id_insanity(
+/*========================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ trx_id_t max_trx_id); /*!< in: trx_sys.get_max_trx_id() */
+/*********************************************************************//**
+Prints info of locks for all transactions.
+@return FALSE if it was not able to obtain the lock mutex; in that case
+it exits without printing any info */
+ibool
+lock_print_info_summary(
+/*====================*/
+ FILE* file, /*!< in: file where to print */
+	ibool	nowait)	/*!< in: if TRUE, do not wait for the lock mutex */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Prints transaction lock wait and MVCC state.
+@param[in,out] file file where to print
+@param[in] trx transaction
+@param[in] now current time */
+void
+lock_trx_print_wait_and_mvcc_state(FILE* file, const trx_t* trx, time_t now);
+
+/*********************************************************************//**
+Prints info of locks for each transaction. This function assumes that the
+caller holds the lock mutex and, more importantly, it will release the lock
+mutex on behalf of the caller. (This should be fixed in the future). */
+void
+lock_print_info_all_transactions(
+/*=============================*/
+ FILE* file); /*!< in: file where to print */
+/*********************************************************************//**
+Return the approximate number of record locks (bits set in the bitmap) for
+this transaction. Since delete-marked records may be removed, the
+record count will not be precise.
+The caller must be holding lock_sys.mutex. */
+ulint
+lock_number_of_rows_locked(
+/*=======================*/
+ const trx_lock_t* trx_lock) /*!< in: transaction locks */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Return the number of table locks for a transaction.
+The caller must be holding lock_sys.mutex. */
+ulint
+lock_number_of_tables_locked(
+/*=========================*/
+ const trx_lock_t* trx_lock) /*!< in: transaction locks */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*******************************************************************//**
+Gets the type of a lock. Non-inline version for use outside of the
+lock module.
+@return LOCK_TABLE or LOCK_REC */
+ulint
+lock_get_type(
+/*==========*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the id of the table on which the lock is.
+@return id of the table */
+table_id_t
+lock_get_table_id(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/** Determine which table a lock is associated with.
+@param[in] lock the lock
+@return name of the table */
+const table_name_t&
+lock_get_table_name(
+ const lock_t* lock);
+
+/*******************************************************************//**
+For a record lock, gets the index on which the lock is.
+@return index */
+const dict_index_t*
+lock_rec_get_index(
+/*===============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the name of the index on which the lock is.
+The string should not be free()'d or modified.
+@return name of the index */
+const char*
+lock_rec_get_index_name(
+/*====================*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Check if there are any locks (table or rec) against table.
+@return TRUE if locks exist */
+bool
+lock_table_has_locks(
+/*=================*/
+ const dict_table_t* table); /*!< in: check if there are any locks
+ held on records in this table or on the
+ table itself */
+
+/** A task which wakes up threads whose lock wait may have lasted too long */
+void lock_wait_timeout_task(void*);
+
+/********************************************************************//**
+Releases a user OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+void
+lock_wait_release_thread_if_suspended(
+/*==================================*/
+ que_thr_t* thr); /*!< in: query thread associated with the
+ user OS thread */
+
+/***************************************************************//**
+Puts a user OS thread to wait for a lock to be released. If an error
+occurs during the wait, the trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+void
+lock_wait_suspend_thread(
+/*=====================*/
+ que_thr_t* thr); /*!< in: query thread associated with the
+ user OS thread */
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+void
+lock_unlock_table_autoinc(
+/*======================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Check whether the transaction has already been rolled back because it
+was selected as a deadlock victim, or, if it still has to wait, cancel
+its wait lock.
+@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */
+dberr_t
+lock_trx_handle_wait(
+/*=================*/
+ trx_t* trx); /*!< in/out: trx lock state */
+/*********************************************************************//**
+Get the number of locks on a table.
+@return number of locks */
+ulint
+lock_table_get_n_locks(
+/*===================*/
+ const dict_table_t* table); /*!< in: table */
+/*******************************************************************//**
+Initialise the trx lock list. */
+void
+lock_trx_lock_list_init(
+/*====================*/
+ trx_lock_list_t* lock_list); /*!< List to initialise */
+
+/*********************************************************************//**
+Checks whether a transaction id is sensible, i.e., not in the future.
+@return true if ok */
+bool
+lock_check_trx_id_sanity(
+/*=====================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets); /*!< in: rec_get_offsets(rec, index) */
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Check if the transaction holds any locks on the sys tables
+or their records.
+@return the strongest lock found on any sys table or 0 for none */
+const lock_t*
+lock_trx_has_sys_table_locks(
+/*=========================*/
+ const trx_t* trx) /*!< in: transaction to check */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check if the transaction holds an explicit exclusive lock on a record.
+@param[in] trx transaction
+@param[in] table table
+@param[in] block leaf page
+@param[in] heap_no heap number identifying the record
+@return whether an explicit X-lock is held */
+bool
+lock_trx_has_expl_x_lock(
+ const trx_t* trx, /*!< in: transaction to check */
+ const dict_table_t* table, /*!< in: table to check */
+ const buf_block_t* block, /*!< in: buffer block of the record */
+ ulint heap_no)/*!< in: record heap number */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+
+/** Lock operation struct */
+struct lock_op_t{
+ dict_table_t* table; /*!< table to be locked */
+ lock_mode mode; /*!< lock mode */
+};
+
+typedef ib_mutex_t LockMutex;
+
+/** The lock system struct */
+class lock_sys_t
+{
+ bool m_initialised;
+
+public:
+ MY_ALIGNED(CACHE_LINE_SIZE)
+ LockMutex mutex; /*!< Mutex protecting the
+ locks */
+ /** record locks */
+ hash_table_t rec_hash;
+ /** predicate locks for SPATIAL INDEX */
+ hash_table_t prdt_hash;
+ /** page locks for SPATIAL INDEX */
+ hash_table_t prdt_page_hash;
+
+ MY_ALIGNED(CACHE_LINE_SIZE)
+ LockMutex wait_mutex; /*!< Mutex protecting the
+ next two fields */
+ srv_slot_t* waiting_threads; /*!< Array of user threads
+ suspended while waiting for
+ locks within InnoDB, protected
+ by the lock_sys.wait_mutex;
+ os_event_set() and
+ os_event_reset() on
+ waiting_threads[]->event
+ are protected by
+ trx_t::mutex */
+ srv_slot_t* last_slot; /*!< highest slot ever used
+ in the waiting_threads array,
+ protected by
+ lock_sys.wait_mutex */
+
+ ulint n_lock_max_wait_time; /*!< Max wait time */
+
+ std::unique_ptr<tpool::timer> timeout_timer; /*!< Thread pool timer task */
+ bool timeout_timer_active;
+
+
+ /**
+ Constructor.
+
+  Some members may require late initialisation, thus we just mark the object as
+ uninitialised. Real initialisation happens in create().
+ */
+ lock_sys_t(): m_initialised(false) {}
+
+
+ bool is_initialised() { return m_initialised; }
+
+
+ /**
+ Creates the lock system at database start.
+
+ @param[in] n_cells number of slots in lock hash table
+ */
+ void create(ulint n_cells);
+
+
+ /**
+ Resize the lock hash table.
+
+ @param[in] n_cells number of slots in lock hash table
+ */
+ void resize(ulint n_cells);
+
+
+ /** Closes the lock system at database shutdown. */
+ void close();
+
+ /** @return the hash value for a page address */
+ ulint hash(const page_id_t id) const
+ { ut_ad(mutex_own(&mutex)); return rec_hash.calc_hash(id.fold()); }
+
+ /** Get the first lock on a page.
+ @param lock_hash hash table to look at
+ @param id page number
+ @return first lock
+ @retval nullptr if none exists */
+ lock_t *get_first(const hash_table_t &lock_hash, const page_id_t id) const
+ {
+ ut_ad(&lock_hash == &rec_hash || &lock_hash == &prdt_hash ||
+ &lock_hash == &prdt_page_hash);
+ for (lock_t *lock= static_cast<lock_t*>
+ (HASH_GET_FIRST(&lock_hash, hash(id)));
+ lock; lock= static_cast<lock_t*>(HASH_GET_NEXT(hash, lock)))
+ if (lock->un_member.rec_lock.page_id == id)
+ return lock;
+ return nullptr;
+ }
+
+ /** Get the first record lock on a page.
+ @param id page number
+ @return first lock
+ @retval nullptr if none exists */
+ lock_t *get_first(const page_id_t id) const
+ { return get_first(rec_hash, id); }
+ /** Get the first predicate lock on a SPATIAL INDEX page.
+ @param id page number
+ @return first lock
+ @retval nullptr if none exists */
+ lock_t *get_first_prdt(const page_id_t id) const
+ { return get_first(prdt_hash, id); }
+  /** Get the first predicate page lock on a SPATIAL INDEX page.
+ @param id page number
+ @return first lock
+ @retval nullptr if none exists */
+ lock_t *get_first_prdt_page(const page_id_t id) const
+ { return get_first(prdt_page_hash, id); }
+};
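+
+/* Usage sketch (illustrative only, not part of the original header;
+`page_id` is a hypothetical variable, and the loop simply mirrors the
+filtering done inside get_first() above): enumerating all record locks
+on a page is expected to look roughly like this, with lock_sys.mutex
+held for the whole traversal (the lock_mutex_enter() and
+lock_mutex_exit() macros are defined near the end of this file):
+
+	lock_mutex_enter();
+	for (lock_t* lock = lock_sys.get_first(page_id); lock;
+	     lock = static_cast<lock_t*>(HASH_GET_NEXT(hash, lock))) {
+		if (lock->un_member.rec_lock.page_id == page_id) {
+			// inspect the lock here; the hash chain may
+			// also contain locks for other pages whose
+			// page id folds into the same cell
+		}
+	}
+	lock_mutex_exit();
+*/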
+
+/*********************************************************************//**
+Creates a new record lock and inserts it into the lock queue. Does NOT check
+for deadlocks or lock compatibility!
+@return created lock */
+UNIV_INLINE
+lock_t*
+lock_rec_create(
+/*============*/
+#ifdef WITH_WSREP
+ lock_t* c_lock, /*!< conflicting lock */
+ que_thr_t* thr, /*!< thread owning trx */
+#endif
+ unsigned type_mode,/*!< in: lock mode and wait
+ flag, type is ignored and
+ replaced by LOCK_REC */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx, /*!< in,out: transaction */
+ bool caller_owns_trx_mutex);
+ /*!< in: true if caller owns
+ trx mutex */
+
+/*************************************************************//**
+Removes a record lock request, waiting or granted, from the queue. */
+void
+lock_rec_discard(
+/*=============*/
+ lock_t* in_lock); /*!< in: record lock object: all
+ record locks which are contained
+ in this lock object are removed */
+
+/** Create a new record lock and insert it into the lock queue,
+without checking for deadlocks or conflicts.
+@param[in] type_mode lock mode and wait flag; type will be replaced
+ with LOCK_REC
+@param[in] page_id index page number
+@param[in] page R-tree index page, or NULL
+@param[in] heap_no record heap number in the index page
+@param[in] index the index tree
+@param[in,out] trx transaction
+@param[in] holds_trx_mutex whether the caller holds trx->mutex
+@return created lock */
+lock_t*
+lock_rec_create_low(
+#ifdef WITH_WSREP
+ lock_t* c_lock, /*!< conflicting lock */
+ que_thr_t* thr, /*!< thread owning trx */
+#endif
+ unsigned type_mode,
+ const page_id_t page_id,
+ const page_t* page,
+ ulint heap_no,
+ dict_index_t* index,
+ trx_t* trx,
+ bool holds_trx_mutex);
+/** Enqueue a waiting request for a lock which cannot be granted immediately.
+Check for deadlocks.
+@param[in] type_mode the requested lock mode (LOCK_S or LOCK_X)
+ possibly ORed with LOCK_GAP or
+ LOCK_REC_NOT_GAP, ORed with
+ LOCK_INSERT_INTENTION if this
+ waiting lock request is set
+ when performing an insert of
+ an index record
+@param[in] block leaf page in the index
+@param[in] heap_no record heap number in the block
+@param[in] index index tree
+@param[in,out] thr query thread
+@param[in] prdt minimum bounding box (spatial index)
+@retval DB_LOCK_WAIT if the waiting lock was enqueued
+@retval DB_DEADLOCK if this transaction was chosen as the victim
+@retval DB_SUCCESS_LOCKED_REC if the other transaction was chosen as a victim
+ (or it happened to commit) */
+dberr_t
+lock_rec_enqueue_waiting(
+#ifdef WITH_WSREP
+ lock_t* c_lock, /*!< conflicting lock */
+#endif
+ unsigned type_mode,
+ const buf_block_t* block,
+ ulint heap_no,
+ dict_index_t* index,
+ que_thr_t* thr,
+ lock_prdt_t* prdt);
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+void
+lock_rtr_move_rec_list(
+/*===================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ move to */
+ const buf_block_t* block, /*!< in: index page */
+ rtr_rec_move_t* rec_move, /*!< in: recording records
+ moved */
+ ulint num_move); /*!< in: num of rec to move */
+
+/*************************************************************//**
+Removes record lock objects set on an index page which is discarded. This
+function does not move locks, or check for waiting locks, therefore the
+lock bitmaps must already be reset when this function is called. */
+void
+lock_rec_free_all_from_discard_page(
+/*================================*/
+ const buf_block_t* block); /*!< in: page to be discarded */
+
+/** The lock system */
+extern lock_sys_t lock_sys;
+
+/** Test if lock_sys.mutex can be acquired without waiting. */
+#define lock_mutex_enter_nowait() \
+ (lock_sys.mutex.trylock(__FILE__, __LINE__))
+
+/** Test if lock_sys.mutex is owned. */
+#define lock_mutex_own() (lock_sys.mutex.is_owned())
+
+/** Acquire the lock_sys.mutex. */
+#define lock_mutex_enter() do { \
+ mutex_enter(&lock_sys.mutex); \
+} while (0)
+
+/** Release the lock_sys.mutex. */
+#define lock_mutex_exit() do { \
+ lock_sys.mutex.exit(); \
+} while (0)
+
+/** Test if lock_sys.wait_mutex is owned. */
+#define lock_wait_mutex_own() (lock_sys.wait_mutex.is_owned())
+
+/** Acquire the lock_sys.wait_mutex. */
+#define lock_wait_mutex_enter() do { \
+ mutex_enter(&lock_sys.wait_mutex); \
+} while (0)
+
+/** Release the lock_sys.wait_mutex. */
+#define lock_wait_mutex_exit() do { \
+ lock_sys.wait_mutex.exit(); \
+} while (0)
+
+#ifdef WITH_WSREP
+/*********************************************************************//**
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+ lock_t* lock); /*!< in/out: waiting lock request */
+
+/*******************************************************************//**
+Get lock mode and table/index name
+@return string containing lock info */
+std::string
+lock_get_info(
+ const lock_t*);
+
+#endif /* WITH_WSREP */
+
+#include "lock0lock.ic"
+
+#endif
diff --git a/storage/innobase/include/lock0lock.ic b/storage/innobase/include/lock0lock.ic
new file mode 100644
index 00000000..2d5b6ff3
--- /dev/null
+++ b/storage/innobase/include/lock0lock.ic
@@ -0,0 +1,103 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.ic
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0dict.h"
+#include "buf0buf.h"
+#include "page0page.h"
+
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ const page_t* page = block->frame;
+
+ if (page_is_comp(page)) {
+ return(rec_get_heap_no_new(
+ page
+ + rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+ TRUE)));
+ } else {
+ return(rec_get_heap_no_old(
+ page
+ + rec_get_next_offs(page + PAGE_OLD_INFIMUM,
+ FALSE)));
+ }
+}
+
+/*************************************************************//**
+Get the lock hash table */
+UNIV_INLINE
+hash_table_t*
+lock_hash_get(
+/*==========*/
+ ulint mode) /*!< in: lock mode */
+{
+ if (mode & LOCK_PREDICATE) {
+ return &lock_sys.prdt_hash;
+ } else if (mode & LOCK_PRDT_PAGE) {
+ return &lock_sys.prdt_page_hash;
+ } else {
+ return &lock_sys.rec_hash;
+ }
+}
+
+/*********************************************************************//**
+Creates a new record lock and inserts it into the lock queue. Does NOT check
+for deadlocks or lock compatibility!
+@return created lock */
+UNIV_INLINE
+lock_t*
+lock_rec_create(
+/*============*/
+#ifdef WITH_WSREP
+ lock_t* c_lock, /*!< conflicting lock */
+ que_thr_t* thr, /*!< thread owning trx */
+#endif
+ unsigned type_mode,/*!< in: lock mode and wait
+ flag, type is ignored and
+ replaced by LOCK_REC */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx, /*!< in,out: transaction */
+ bool caller_owns_trx_mutex)
+ /*!< in: TRUE if caller owns
+ trx mutex */
+{
+ btr_assert_not_corrupted(block, index);
+ return lock_rec_create_low(
+#ifdef WITH_WSREP
+ c_lock, thr,
+#endif
+ type_mode, block->page.id(), block->frame, heap_no,
+ index, trx, caller_owns_trx_mutex);
+}
diff --git a/storage/innobase/include/lock0prdt.h b/storage/innobase/include/lock0prdt.h
new file mode 100644
index 00000000..43d68996
--- /dev/null
+++ b/storage/innobase/include/lock0prdt.h
@@ -0,0 +1,204 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0prdt.h
+The predicate lock system
+
+Created 9/7/2013 Jimmy Yang
+*******************************************************/
+#ifndef lock0prdt_h
+#define lock0prdt_h
+
+#include "lock0lock.h"
+
+/* Predicate lock data */
+typedef struct lock_prdt {
+ void* data; /* Predicate data */
+ uint16 op; /* Predicate operator */
+} lock_prdt_t;
+
+/*********************************************************************//**
+Acquire a predicate lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_lock(
+/*===========*/
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ lock_prdt_t* prdt, /*!< in: Predicate for the lock */
+ dict_index_t* index, /*!< in: secondary index */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned type_mode,
+ /*!< in: LOCK_PREDICATE or LOCK_PRDT_PAGE */
+ que_thr_t* thr); /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+
+/*********************************************************************//**
+Acquire a "Page" lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_place_prdt_page_lock(
+ const page_id_t page_id, /*!< in: page identifier */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr); /*!< in: query thread */
+
+/*********************************************************************//**
+Initialize a predicate lock from an MBR */
+void
+lock_init_prdt_from_mbr(
+/*====================*/
+	lock_prdt_t*	prdt,	/*!< in/out: predicate to be initialized */
+ rtr_mbr_t* mbr, /*!< in: Minimum Bounding Rectangle */
+ ulint mode, /*!< in: Search mode */
+ mem_heap_t* heap); /*!< in: heap for allocating memory */
+
+/*********************************************************************//**
+Get predicate lock's minimum bounding box
+@return the minimum bounding box */
+lock_prdt_t*
+lock_get_prdt_from_lock(
+/*====================*/
+ const lock_t* lock); /*!< in: the lock */
+
+/*********************************************************************//**
+Checks if a predicate lock request for a new lock has to wait for
+request lock2.
+@return true if new lock has to wait for lock2 to be removed */
+bool
+lock_prdt_has_to_wait(
+/*==================*/
+ const trx_t* trx, /*!< in: trx of new lock */
+ unsigned type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
+ LOCK_INSERT_INTENTION */
+ lock_prdt_t* prdt, /*!< in: lock predicate to check */
+ const lock_t* lock2); /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+
+/**************************************************************//**
+Update predicate lock when page splits */
+void
+lock_prdt_update_split(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: the new half page */
+ lock_prdt_t* prdt, /*!< in: MBR on the old page */
+ lock_prdt_t* new_prdt, /*!< in: MBR on the new page */
+ const page_id_t page_id); /*!< in: page number */
+
+/**************************************************************//**
+Adjust locks from an ancestor page of an R-tree on the appropriate level. */
+void
+lock_prdt_update_parent(
+/*====================*/
+ buf_block_t* left_block, /*!< in/out: page to be split */
+ buf_block_t* right_block, /*!< in/out: the new half page */
+ lock_prdt_t* left_prdt, /*!< in: MBR on the old page */
+ lock_prdt_t* right_prdt, /*!< in: MBR on the new page */
+ const page_id_t page_id); /*!< in: parent page */
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a predicate record.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_insert_check_and_lock(
+/*============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
+ set, does nothing */
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+	lock_prdt_t*	prdt);	/*!< in: Minimum Bounding Rectangle */
+
+/*********************************************************************//**
+Append a predicate to the lock */
+void
+lock_prdt_set_prdt(
+/*===============*/
+ lock_t* lock, /*!< in: lock */
+ const lock_prdt_t* prdt); /*!< in: Predicate */
+
+#if 0
+
+/*********************************************************************//**
+Checks if a predicate lock request for a new lock has to wait for
+request lock2.
+@return true if new lock has to wait for lock2 to be removed */
+UNIV_INLINE
+bool
+lock_prdt_has_to_wait(
+/*==================*/
+ const trx_t* trx, /*!< in: trx of new lock */
+ unsigned type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
+ LOCK_INSERT_INTENTION */
+ lock_prdt_t* prdt, /*!< in: lock predicate to check */
+ const lock_t* lock2); /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+
+/*********************************************************************//**
+Get predicate lock's minimum bounding box
+@return the minimum bounding box */
+UNIV_INLINE
+rtr_mbr_t*
+prdt_get_mbr_from_prdt(
+/*===================*/
+ const lock_prdt_t* prdt); /*!< in: the lock predicate */
+
+
+#endif
+/*************************************************************//**
+Moves the locks of a record to another record and resets the lock bits of
+the donating record. */
+void
+lock_prdt_rec_move(
+/*===============*/
+ const buf_block_t* receiver, /*!< in: buffer block containing
+ the receiving record */
+ const buf_block_t* donator); /*!< in: buffer block containing
+ the donating record */
+
+/** Check whether there is an R-tree page lock on a page
+@param[in] trx trx to test the lock
+@param[in] page_id page identifier
+@return true if there is none */
+bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id);
+
+/** Removes predicate lock objects set on an index page which is discarded.
+@param[in] block page to be discarded
+@param[in] lock_hash lock hash */
+void
+lock_prdt_page_free_from_discard(
+/*=============================*/
+ const buf_block_t* block,
+ hash_table_t* lock_hash);
+
+#endif
diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h
new file mode 100644
index 00000000..1b2f9d0f
--- /dev/null
+++ b/storage/innobase/include/lock0priv.h
@@ -0,0 +1,653 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2018, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.h
+Lock module internal structures and methods.
+
+Created July 12, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0priv_h
+#define lock0priv_h
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+/* If you need to access members of the structures defined in this
+file, please write appropriate functions that retrieve them and put
+those functions in lock/ */
+#error Do not include lock0priv.h outside of the lock/ module
+#endif
+
+#include "hash0hash.h"
+#include "rem0types.h"
+#include "trx0trx.h"
+
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+/** Print the table lock into the given output stream
+@param[in,out] out the output stream
+@return the given output stream. */
+inline
+std::ostream& lock_table_t::print(std::ostream& out) const
+{
+ out << "[lock_table_t: name=" << table->name << "]";
+ return(out);
+}
+
+/** The global output operator is overloaded to conveniently
+print the lock_table_t object into the given output stream.
+@param[in,out] out the output stream
+@param[in] lock the table lock
+@return the given output stream */
+inline
+std::ostream&
+operator<<(std::ostream& out, const lock_table_t& lock)
+{
+ return(lock.print(out));
+}
+
+/** Convert the member 'type_mode' into a human readable string.
+@return human readable string */
+inline
+std::string
+ib_lock_t::type_mode_string() const
+{
+ std::ostringstream sout;
+ sout << type_string();
+ sout << " | " << lock_mode_string(mode());
+
+ if (is_record_not_gap()) {
+ sout << " | LOCK_REC_NOT_GAP";
+ }
+
+ if (is_waiting()) {
+ sout << " | LOCK_WAIT";
+ }
+
+ if (is_gap()) {
+ sout << " | LOCK_GAP";
+ }
+
+ if (is_insert_intention()) {
+ sout << " | LOCK_INSERT_INTENTION";
+ }
+ return(sout.str());
+}
+
+inline
+std::ostream&
+ib_lock_t::print(std::ostream& out) const
+{
+ out << "[lock_t: type_mode=" << type_mode << "("
+ << type_mode_string() << ")";
+
+ if (is_record_lock()) {
+ out << un_member.rec_lock;
+ } else {
+ out << un_member.tab_lock;
+ }
+
+ out << "]";
+ return(out);
+}
+
+inline
+std::ostream&
+operator<<(std::ostream& out, const ib_lock_t& lock)
+{
+ return(lock.print(out));
+}
+
+#ifdef UNIV_DEBUG
+extern ibool lock_print_waits;
+#endif /* UNIV_DEBUG */
+
+/** Restricts the length of search we will do in the waits-for
+graph of transactions */
+static const ulint LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK = 1000000;
+
+/** Restricts the search depth we will do in the waits-for graph of
+transactions */
+static const ulint LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK = 200;
+
+/** When releasing transaction locks, this specifies how often we release
+the lock mutex for a moment, to also give other threads access to it */
+static const ulint LOCK_RELEASE_INTERVAL = 1000;
+
+/* Safety margin when creating a new record lock: this many extra records
+can be inserted into the page without the need to create a lock with a
+bigger bitmap */
+
+static const ulint LOCK_PAGE_BITMAP_MARGIN = 64;
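+
+/* Sizing sketch (illustrative; this is how lock_rec_create_low() is
+expected to apply the margin, assuming a `page` frame pointer in scope):
+the lock bitmap is dimensioned from the current page heap size plus the
+margin, rounded up to whole bytes, so this many further records can be
+inserted before a bigger lock object is needed:
+
+	ulint n_bits  = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN;
+	ulint n_bytes = 1 + n_bits / 8;
+*/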
+
+/* An explicit record lock affects both the record and the gap before it.
+An implicit x-lock does not affect the gap, it only locks the index
+record from read or update.
+
+If a transaction has modified or inserted an index record, then
+it owns an implicit x-lock on the record. On a secondary index record,
+a transaction has an implicit x-lock also if it has modified the
+clustered index record, the max trx id of the page where the secondary
+index record resides is >= trx id of the transaction (or database recovery
+is running), and there are no explicit non-gap lock requests on the
+secondary index record.
+
+This complicated definition for a secondary index comes from the
+implementation: we want to be able to determine if a secondary index
+record has an implicit x-lock, just by looking at the present clustered
+index record, not at the historical versions of the record. The
+complicated definition can be explained to the user as nondeterminism
+in the access path when a query is answered: we may, or may not, access
+the clustered index record and thus may, or may not, bump into an x-lock
+set there.
+
+Different transactions can have conflicting locks set on the gap at the
+same time. The locks on the gap are purely inhibitive: an insert cannot
+be made, or a select cursor may have to wait if a different transaction
+has a conflicting lock on the gap. An x-lock on the gap does not give
+the right to insert into the gap.
+
+An explicit lock can be placed on a user record or the supremum record of
+a page. The locks on the supremum record are always thought to be of the gap
+type, though the gap bit is not set. When we perform an update of a record
+where the size of the record changes, we may temporarily store its explicit
+locks on the infimum record of the page, though the infimum otherwise never
+carries locks.
+
+A waiting record lock can also be of the gap type. A waiting lock request
+can be granted when there is no conflicting mode lock request by another
+transaction ahead of it in the explicit lock queue.
+
+In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP.
+It only locks the record it is placed on, not the gap before the record.
+This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation
+level.
+
+-------------------------------------------------------------------------
+RULE 1: If there is an implicit x-lock on a record, and there are non-gap
+-------
+lock requests waiting in the queue, then the transaction holding the implicit
+x-lock also has an explicit non-gap record x-lock. Therefore, as locks are
+released, we can grant locks to waiting lock requests purely by looking at
+the explicit lock requests in the queue.
+
+RULE 3: Different transactions cannot have conflicting granted non-gap locks
+-------
+on a record at the same time. However, they can have conflicting granted gap
+locks.
+RULE 4: If there is a waiting lock request in a queue, no lock request,
+-------
+gap or not, can be inserted ahead of it in the queue. In record deletes
+and page splits new gap type locks can be created by the database manager
+for a transaction, and without rule 4, the waits-for graph of transactions
+might become cyclic without the database noticing it, as the deadlock check
+is only performed when a transaction itself requests a lock!
+-------------------------------------------------------------------------
+
+An insert is allowed to a gap if there are no explicit lock requests by
+other transactions on the next record. It does not matter if these lock
+requests are granted or waiting, gap bit set or not, with the exception
+that a gap type request set by another transaction to wait for
+its turn to do an insert is ignored. On the other hand, an
+implicit x-lock by another transaction does not prevent an insert, which
+allows for more concurrency when using an Oracle-style sequence number
+generator for the primary key with many transactions doing inserts
+concurrently.
+
+A modify of a record is allowed if the transaction has an x-lock on the
+record, or if other transactions do not have any non-gap lock requests on the
+record.
+
+A read of a single user record with a cursor is allowed if the transaction
+has a non-gap explicit, or an implicit lock on the record, or if the other
+transactions have no x-lock requests on the record. At a page supremum a
+read is always allowed.
+
+In summary, an implicit lock is seen as a granted x-lock only on the
+record, not on the gap. An explicit lock with no gap bit set is a lock
+both on the record and the gap. If the gap bit is set, the lock is only
+on the gap. Different transactions cannot own conflicting locks on the
+record at the same time, but they may own conflicting locks on the gap.
+Granted locks on a record give an access right to the record, but gap type
+locks just inhibit operations.
+
+NOTE: Finding out if some transaction has an implicit x-lock on a secondary
+index record can be cumbersome. We may have to look at previous versions of
+the corresponding clustered index record to find out if a delete marked
+secondary index record was delete marked by an active transaction, not by
+a committed one.
+
+FACT A: If a transaction has inserted a row, it can delete it any time
+without the need to wait for locks.
+
+PROOF: The transaction has an implicit x-lock on every index record inserted
+for the row, and can thus modify each record without the need to wait. Q.E.D.
+
+FACT B: If a transaction has read some result set with a cursor, it can read
+it again, and retrieve the same result set, if it has not modified the
+result set in the meantime. Hence, there is no phantom problem. If the
+biggest record, in the alphabetical order, touched by the cursor is removed,
+a lock wait may occur, otherwise not.
+
+PROOF: When a read cursor proceeds, it sets an s-lock on each user record
+it passes, and a gap type s-lock on each page supremum. The cursor must
+wait until it has these locks granted. Then no other transaction can
+have a granted x-lock on any of the user records, and therefore cannot
+modify the user records. Neither can any other transaction insert into
+the gaps which were passed over by the cursor. Page splits and merges,
+and removal of obsolete versions of records do not affect this, because
+when a user record or a page supremum is removed, the next record inherits
+its locks as gap type locks, and therefore blocks inserts to the same gap.
+Also, if a page supremum is inserted, it inherits its locks from the successor
+record. When the cursor is positioned again at the start of the result set,
+the records it will touch on its course are either records it touched
+during the last pass or newly inserted page supremums. It can immediately
+access all these records, and when it arrives at the biggest record, it
+notices that the result set is complete. If the biggest record was removed,
+lock wait can occur because the next record only inherits a gap type lock,
+and a wait may be needed. Q.E.D. */
+
+/* If an index record should be changed or a new one inserted, we must check
+the lock on the record or the next. When a read cursor starts reading,
+we will set a record level s-lock on each record it passes, except on the
+initial record on which the cursor is positioned before we start to fetch
+records. Our index tree search has the convention that the B-tree
+cursor is positioned BEFORE the first possibly matching record in
+the search. Optimizations are possible here: if the record is searched
+on an equality condition to a unique key, we could actually set a special
+lock on the record, a lock which would not prevent any insert before
+this record. In the next key locking an x-lock set on a record also
+prevents inserts just before that record.
+ There are special infimum and supremum records on each page.
+A supremum record can be locked by a read cursor. This record cannot be
+updated but the lock prevents insert of a user record to the end of
+the page.
+ Next key locks will prevent the phantom problem where new rows
+could appear to SELECT result sets after the select operation has been
+performed. Prevention of phantoms ensures the serializability of
+transactions.
+ What should we check if an insert of a new record is wanted?
+Only the lock on the next record on the same page, because also the
+supremum record can carry a lock. An s-lock prevents insertion, but
+what about an x-lock? If it was set by a searched update, then there
+is implicitly an s-lock, too, and the insert should be prevented.
+What if our transaction owns an x-lock to the next record, but there is
+a waiting s-lock request on the next record? If this s-lock was placed
+by a read cursor moving in the ascending order in the index, we cannot
+do the insert immediately, because when we finally commit our transaction,
+the read cursor should see also the new inserted record. So we should
+move the read cursor backward from the next record for it to pass over
+the new inserted record. This move backward may be too cumbersome to
+implement. If we in this situation just enqueue a second x-lock request
+for our transaction on the next record, then the deadlock mechanism
+notices a deadlock between our transaction and the s-lock request
+transaction. This seems to be an ok solution.
+ We could have the convention that granted explicit record locks,
+lock the corresponding records from changing, and also lock the gaps
+before them from inserting. A waiting explicit lock request locks the gap
+before from inserting. Implicit record x-locks, which we derive from the
+transaction id in the clustered index record, only lock the record itself
+from modification, not the gap before it from inserting.
+ How should we store update locks? If the search is done by a unique
+key, we could just modify the record trx id. Otherwise, we could put a record
+x-lock on the record. If the update changes ordering fields of the
+clustered index record, the inserted new record needs no record lock in
+lock table, the trx id is enough. The same holds for a secondary index
+record. Searched delete is similar to update.
+
+PROBLEM:
+What about waiting lock requests? If a transaction is waiting to make an
+update to a record which another modified, how does the other transaction
+know to send the end-lock-wait signal to the waiting transaction? If we have
+the convention that a transaction may wait for just one lock at a time, how
+do we preserve it if lock wait ends?
+
+PROBLEM:
+Checking the trx id label of a secondary index record. In the case of a
+modification, not an insert, is this necessary? A secondary index record
+is modified only by setting or resetting its deleted flag. A secondary index
+record contains fields to uniquely determine the corresponding clustered
+index record. A secondary index record is therefore only modified if we
+also modify the clustered index record, and the trx id checking is done
+on the clustered index record, before we come to modify the secondary index
+record. So, in the case of delete marking or unmarking a secondary index
+record, we do not have to care about trx ids, only the locks in the lock
+table must be checked. In the case of a select from a secondary index, the
+trx id is relevant, and in this case we may have to search the clustered
+index record.
+
+PROBLEM: How to update record locks when page is split or merged, or
+--------------------------------------------------------------------
+a record is deleted or updated?
+If the size of fields in a record changes, we perform the update by
+a delete followed by an insert. How can we retain the locks set or
+waiting on the record? Because a record lock is indexed in the bitmap
+by the heap number of the record, when we remove the record from the
+record list, it is possible still to keep the lock bits. If the page
+is reorganized, we could make a table of old and new heap numbers,
+and permute the bitmaps in the locks accordingly. We can add to the
+table a row telling where the updated record ended. If the update does
+not require a reorganization of the page, we can simply move the lock
+bits for the updated record to the position determined by its new heap
+number (we may have to allocate a new lock, if we run out of the bitmap
+in the old one).
+ A more complicated case is the one where the reinsertion of the
+updated record is done pessimistically, because the structure of the
+tree may change.
+
+PROBLEM: If a supremum record is removed in a page merge, or a record
+---------------------------------------------------------------------
+removed in a purge, what to do to the waiting lock requests? In a split to
+the right, we just move the lock requests to the new supremum. If a record
+is removed, we could move the waiting lock request to its inheritor, the
+next record in the index. But, the next record may already have lock
+requests on its own queue. A new deadlock check should be made then. Maybe
+it is easier just to release the waiting transactions. They can then enqueue
+new lock requests on appropriate records.
+
+PROBLEM: When a record is inserted, what locks should it inherit from the
+-------------------------------------------------------------------------
+upper neighbor? An insert of a new supremum record in a page split is
+always possible, but an insert of a new user record requires that the upper
+neighbor does not have any lock requests by other transactions, granted or
+waiting, in its lock queue. Solution: We can copy the locks as gap type
+locks, so that also the waiting locks are transformed to granted gap type
+locks on the inserted record. */
+
+/* LOCK COMPATIBILITY MATRIX
+ * IS IX S X AI
+ * IS + + + - +
+ * IX + + - - +
+ * S + - + - -
+ * X - - - - -
+ * AI + + - - -
+ *
+ * Note that for rows, InnoDB only acquires S or X locks.
+ * For tables, InnoDB normally acquires IS or IX locks.
+ * S or X table locks are only acquired for LOCK TABLES.
+ * Auto-increment (AI) locks are needed because of
+ * statement-level MySQL binlog.
+ * See also lock_mode_compatible().
+ */
+static const byte lock_compatibility_matrix[5][5] = {
+ /** IS IX S X AI */
+ /* IS */ { TRUE, TRUE, TRUE, FALSE, TRUE},
+ /* IX */ { TRUE, TRUE, FALSE, FALSE, TRUE},
+ /* S */ { TRUE, FALSE, TRUE, FALSE, FALSE},
+ /* X */ { FALSE, FALSE, FALSE, FALSE, FALSE},
+ /* AI */ { TRUE, TRUE, FALSE, FALSE, FALSE}
+};
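+
+/* Illustrative sketch (not a declaration): per the comment above,
+lock_mode_compatible() is expected to reduce to a lookup in this matrix,
+with both modes asserted to lie within its bounds:
+
+	ut_ad(mode1 < lock_types && mode2 < lock_types);
+	return(lock_compatibility_matrix[mode1][mode2]);
+
+For example, lock_compatibility_matrix[LOCK_IX][LOCK_S] is FALSE, so a
+transaction requesting IX on a table must wait while another transaction
+holds an S table lock. */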
+
+/* STRONGER-OR-EQUAL RELATION (mode1=row, mode2=column)
+ * IS IX S X AI
+ * IS + - - - -
+ * IX + + - - -
+ * S + - + - -
+ * X + + + + +
+ * AI - - - - +
+ * See lock_mode_stronger_or_eq().
+ */
+static const byte lock_strength_matrix[5][5] = {
+ /** IS IX S X AI */
+ /* IS */ { TRUE, FALSE, FALSE, FALSE, FALSE},
+ /* IX */ { TRUE, TRUE, FALSE, FALSE, FALSE},
+ /* S */ { TRUE, FALSE, TRUE, FALSE, FALSE},
+ /* X */ { TRUE, TRUE, TRUE, TRUE, TRUE},
+ /* AI */ { FALSE, FALSE, FALSE, FALSE, TRUE}
+};
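+
+/* Illustrative sketch: lock_mode_stronger_or_eq() is likewise expected
+to be a plain lookup in this matrix; for example,
+lock_strength_matrix[LOCK_X][LOCK_S] is TRUE, because an X lock grants
+everything an S lock does:
+
+	ut_ad(mode1 < lock_types && mode2 < lock_types);
+	return(lock_strength_matrix[mode1][mode2]);
+*/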
+
+/** Maximum depth of the DFS stack. */
+static const ulint MAX_STACK_SIZE = 4096;
+
+#define PRDT_HEAPNO PAGE_HEAP_NO_INFIMUM
+/** Record locking request status */
+enum lock_rec_req_status {
+ /** Failed to acquire a lock */
+ LOCK_REC_FAIL,
+ /** Succeeded in acquiring a lock (implicit or already acquired) */
+ LOCK_REC_SUCCESS,
+ /** Explicitly created a new lock */
+ LOCK_REC_SUCCESS_CREATED
+};
+
+#ifdef UNIV_DEBUG
+/** The count of the types of locks. */
+static const ulint lock_types = UT_ARR_SIZE(lock_compatibility_matrix);
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the type of a lock.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INLINE
+ulint
+lock_get_type_low(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*********************************************************************//**
+Gets the previous record lock set on a record.
+@return previous lock on the same record, NULL if none exists */
+const lock_t*
+lock_rec_get_prev(
+/*==============*/
+ const lock_t* in_lock,/*!< in: record lock */
+ ulint heap_no);/*!< in: heap number of the record */
+
+/*********************************************************************//**
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+ lock_t* lock); /*!< in/out: waiting lock request */
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_on_page_const(
+/*============================*/
+ const lock_t* lock); /*!< in: a record lock */
+
+/*********************************************************************//**
+Gets the nth bit of a record lock.
+@return TRUE if the bit is set, FALSE if not or if i == ULINT_UNDEFINED */
+UNIV_INLINE
+ibool
+lock_rec_get_nth_bit(
+/*=================*/
+ const lock_t* lock, /*!< in: record lock */
+ ulint i); /*!< in: index of the bit */
+
+/*********************************************************************//**
+Gets the number of bits in a record lock bitmap.
+@return number of bits */
+UNIV_INLINE
+ulint
+lock_rec_get_n_bits(
+/*================*/
+ const lock_t* lock); /*!< in: record lock */
+
+/**********************************************************************//**
+Sets the nth bit of a record lock to TRUE. */
+UNIV_INLINE
+void
+lock_rec_set_nth_bit(
+/*=================*/
+ lock_t* lock, /*!< in: record lock */
+ ulint i); /*!< in: index of the bit */
+
+/** Reset the nth bit of a record lock.
+@param[in,out] lock record lock
+@param[in] i index of the bit that will be reset
+@return previous value of the bit */
+inline byte lock_rec_reset_nth_bit(lock_t* lock, ulint i)
+{
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+ byte* b = reinterpret_cast<byte*>(&lock[1]) + (i >> 3);
+ byte mask = byte(1U << (i & 7));
+ byte bit = *b & mask;
+ *b &= byte(~mask);
+
+ if (bit != 0) {
+ ut_ad(lock->trx->lock.n_rec_locks > 0);
+ --lock->trx->lock.n_rec_locks;
+ }
+
+ return(bit);
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next_on_page(
+/*======================*/
+ lock_t* lock); /*!< in: a record lock */
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next(
+/*==============*/
+ ulint heap_no,/*!< in: heap number of the record */
+ lock_t* lock); /*!< in: lock */
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_const(
+/*====================*/
+ ulint heap_no,/*!< in: heap number of the record */
+ const lock_t* lock); /*!< in: lock */
+
+/*********************************************************************//**
+Gets the first explicit lock request on a record.
+@return first lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_first(
+/*===============*/
+	hash_table_t*		hash,	/*!< in: hash chain the lock is on */
+ const buf_block_t* block, /*!< in: block containing the record */
+ ulint heap_no);/*!< in: heap number of the record */
+
+/*********************************************************************//**
+Gets the mode of a lock.
+@return mode */
+UNIV_INLINE
+enum lock_mode
+lock_get_mode(
+/*==========*/
+ const lock_t* lock); /*!< in: lock */
+
+/*********************************************************************//**
+Calculates if lock mode 1 is compatible with lock mode 2.
+@return nonzero if mode1 compatible with mode2 */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2); /*!< in: lock mode */
+
+/*********************************************************************//**
+Calculates if lock mode 1 is stronger or equal to lock mode 2.
+@return nonzero if mode1 stronger or equal to mode2 */
+UNIV_INLINE
+ulint
+lock_mode_stronger_or_eq(
+/*=====================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2); /*!< in: lock mode */
+
+/*********************************************************************//**
+Gets the wait flag of a lock.
+@return LOCK_WAIT if waiting, 0 if not */
+UNIV_INLINE
+ulint
+lock_get_wait(
+/*==========*/
+ const lock_t* lock); /*!< in: lock */
+
+/*********************************************************************//**
+Checks if a transaction has the specified table lock, or stronger. This
+function should only be called by the thread that owns the transaction.
+@return lock or NULL */
+UNIV_INLINE
+const lock_t*
+lock_table_has(
+/*===========*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_table_t* table, /*!< in: table */
+ enum lock_mode mode); /*!< in: lock mode */
+
+/** Set the wait status of a lock.
+@param[in,out] lock lock that will be waited for
+@param[in,out] trx transaction that will wait for the lock */
+inline void lock_set_lock_and_trx_wait(lock_t* lock, trx_t* trx)
+{
+ ut_ad(lock);
+ ut_ad(lock->trx == trx);
+ ut_ad(trx->lock.wait_lock == NULL);
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(trx));
+
+ trx->lock.wait_lock = lock;
+ lock->type_mode |= LOCK_WAIT;
+}
+
+/** Reset the wait status of a lock.
+@param[in,out] lock lock that was possibly being waited for */
+inline void lock_reset_lock_and_trx_wait(lock_t* lock)
+{
+ ut_ad(lock_get_wait(lock));
+ ut_ad(lock_mutex_own());
+ ut_ad(lock->trx->lock.wait_lock == NULL
+ || lock->trx->lock.wait_lock == lock);
+ lock->trx->lock.wait_lock = NULL;
+ lock->type_mode &= ~LOCK_WAIT;
+}
+
+#include "lock0priv.ic"
+
+#endif /* lock0priv_h */
diff --git a/storage/innobase/include/lock0priv.ic b/storage/innobase/include/lock0priv.ic
new file mode 100644
index 00000000..e16949a4
--- /dev/null
+++ b/storage/innobase/include/lock0priv.ic
@@ -0,0 +1,321 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.ic
+Lock module internal inline methods.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+/* This file contains only methods that are used in the lock/lock0*
+files other than lock/lock0lock.cc. (lock/lock0lock.cc contains further
+internal inline methods, but those are used only in that file.) */
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+#error Do not include lock0priv.ic outside of the lock/ module
+#endif
+
+#include "row0row.h"
+
+/*********************************************************************//**
+Gets the type of a lock.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INLINE
+ulint
+lock_get_type_low(
+/*==============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_ad(lock);
+
+ return(lock->type_mode & LOCK_TYPE_MASK);
+}
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+
+ return(row_get_rec_trx_id(rec, index, offsets));
+}
+
+/*********************************************************************//**
+Gets the number of bits in a record lock bitmap.
+@return number of bits */
+UNIV_INLINE
+ulint
+lock_rec_get_n_bits(
+/*================*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ return(lock->un_member.rec_lock.n_bits);
+}
+
+/**********************************************************************//**
+Sets the nth bit of a record lock to TRUE. */
+UNIV_INLINE
+void
+lock_rec_set_nth_bit(
+/*=================*/
+ lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit */
+{
+ ulint byte_index;
+ ulint bit_index;
+
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+ byte_index = i / 8;
+ bit_index = i % 8;
+
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+ ((byte*) &lock[1])[byte_index] |= static_cast<byte>(1 << bit_index);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ ++lock->trx->lock.n_rec_locks;
+}
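+
+/* The bitmap addressed above is stored immediately after the lock_t
+struct itself (hence &lock[1]): heap number i lives in byte i / 8,
+bit i % 8. For example, heap_no = 10 maps to byte 1, bit 2, so setting
+it ORs 0x04 into ((byte*) &lock[1])[1]. */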
+
+/*********************************************************************//**
+Gets the next record lock on the same page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next_on_page(
+/*======================*/
+ lock_t* lock) /*!< in: a record lock */
+{
+ return((lock_t*) lock_rec_get_next_on_page_const(lock));
+}
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next(
+/*==============*/
+ ulint heap_no,/*!< in: heap number of the record */
+ lock_t* lock) /*!< in: lock */
+{
+ ut_ad(lock_mutex_own());
+
+ do {
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ lock = lock_rec_get_next_on_page(lock);
+ } while (lock && !lock_rec_get_nth_bit(lock, heap_no));
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_const(
+/*====================*/
+ ulint heap_no,/*!< in: heap number of the record */
+ const lock_t* lock) /*!< in: lock */
+{
+ return(lock_rec_get_next(heap_no, (lock_t*) lock));
+}
+
+/*********************************************************************//**
+Gets the first explicit lock request on a record.
+@return first lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_first(
+/*===============*/
+ hash_table_t* hash, /*!< in: hash table of locks */
+ const buf_block_t* block, /*!< in: block containing the record */
+ ulint heap_no) /*!< in: heap number of the record */
+{
+ for (lock_t *lock= lock_sys.get_first(*hash, block->page.id());
+ lock; lock= lock_rec_get_next_on_page(lock))
+ if (lock_rec_get_nth_bit(lock, heap_no))
+ return lock;
+ return nullptr;
+}
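+
+/* Illustrative sketch (not part of this header): together with
+lock_rec_get_next(), this enumerates every explicit lock on one record,
+assuming the lock_sys.rec_hash table and lock_sys.mutex being held:
+
+	for (lock_t* lock = lock_rec_get_first(&lock_sys.rec_hash,
+					       block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
+		// inspect lock_get_mode(lock), lock_get_wait(lock), ...
+	}
+*/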
+
+/*********************************************************************//**
+Gets the nth bit of a record lock.
+@return TRUE if the bit is set; FALSE if it is not set or if i is out
+of range, e.g. i == ULINT_UNDEFINED */
+UNIV_INLINE
+ibool
+lock_rec_get_nth_bit(
+/*=================*/
+ const lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit */
+{
+ const byte* b;
+
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ if (i >= lock->un_member.rec_lock.n_bits) {
+
+ return(FALSE);
+ }
+
+ b = ((const byte*) &lock[1]) + (i / 8);
+
+ return(1 & *b >> (i % 8));
+}
+
+/*********************************************************************//**
+Gets the next record lock on the same page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_on_page_const(
+/*============================*/
+ const lock_t* lock) /*!< in: a record lock */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ const page_id_t page_id(lock->un_member.rec_lock.page_id);
+
+ while (!!(lock= static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))))
+ if (lock->un_member.rec_lock.page_id == page_id)
+ break;
+ return lock;
+}
+
+/*********************************************************************//**
+Gets the mode of a lock.
+@return mode */
+UNIV_INLINE
+enum lock_mode
+lock_get_mode(
+/*==========*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_ad(lock);
+
+ return(static_cast<enum lock_mode>(lock->type_mode & LOCK_MODE_MASK));
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is compatible with lock mode 2.
+@return nonzero if mode1 compatible with mode2 */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2) /*!< in: lock mode */
+{
+ ut_ad((ulint) mode1 < lock_types);
+ ut_ad((ulint) mode2 < lock_types);
+
+ return(lock_compatibility_matrix[mode1][mode2]);
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is stronger or equal to lock mode 2.
+@return nonzero if mode1 stronger or equal to mode2 */
+UNIV_INLINE
+ulint
+lock_mode_stronger_or_eq(
+/*=====================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2) /*!< in: lock mode */
+{
+ ut_ad((ulint) mode1 < lock_types);
+ ut_ad((ulint) mode2 < lock_types);
+
+ return(lock_strength_matrix[mode1][mode2]);
+}
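+
+/* Both predicates are plain lookups into matrices defined in
+lock0lock.cc. With the standard InnoDB matrices, for example:
+
+	ut_ad(lock_mode_compatible(LOCK_IS, LOCK_IX));	// intention modes coexist
+	ut_ad(!lock_mode_compatible(LOCK_X, LOCK_S));	// X conflicts with S
+	ut_ad(lock_mode_stronger_or_eq(LOCK_X, LOCK_S));
+	ut_ad(!lock_mode_stronger_or_eq(LOCK_IS, LOCK_S));
+
+(illustrative assertions only). */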
+
+/*********************************************************************//**
+Gets the wait flag of a lock.
+@return LOCK_WAIT if waiting, 0 if not */
+UNIV_INLINE
+ulint
+lock_get_wait(
+/*==========*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_ad(lock);
+
+ return(lock->type_mode & LOCK_WAIT);
+}
+
+/*********************************************************************//**
+Checks if a transaction has the specified table lock, or stronger. This
+function should only be called by the thread that owns the transaction.
+@return lock or NULL */
+UNIV_INLINE
+const lock_t*
+lock_table_has(
+/*===========*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_table_t* table, /*!< in: table */
+ lock_mode in_mode)/*!< in: lock mode */
+{
+ /* Look for equal or stronger locks that the same trx already
+ has on the table */
+
+ for (lock_list::const_iterator it = trx->lock.table_locks.begin(),
+ end = trx->lock.table_locks.end(); it != end; ++it) {
+
+ const lock_t* lock = *it;
+
+ if (lock == NULL) {
+ continue;
+ }
+
+ lock_mode mode = lock_get_mode(lock);
+
+ ut_ad(trx == lock->trx);
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+ ut_ad(lock->un_member.tab_lock.table != NULL);
+
+ if (table == lock->un_member.tab_lock.table
+ && lock_mode_stronger_or_eq(mode, in_mode)) {
+
+ ut_ad(!lock_get_wait(lock));
+
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
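+
+/* Illustrative sketch (not part of this header): callers typically use
+this to verify a table-level intention lock before taking record locks,
+e.g.
+
+	if (lock_table_has(trx, table, LOCK_IX)) {
+		// trx already holds IX or something stronger on table
+	}
+
+The ut_ad(!lock_get_wait(lock)) above asserts that any match is a
+granted lock, never a waiting one. */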
+
+/* vim: set filetype=c: */
diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h
new file mode 100644
index 00000000..23307375
--- /dev/null
+++ b/storage/innobase/include/lock0types.h
@@ -0,0 +1,273 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0types.h
+The transaction lock system global types
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0types.h"
+#include "buf0types.h"
+#include "ut0lst.h"
+
+#ifndef lock0types_h
+#define lock0types_h
+
+#define lock_t ib_lock_t
+
+struct lock_t;
+struct lock_table_t;
+
+/* Basic lock modes */
+enum lock_mode {
+ LOCK_IS = 0, /* intention shared */
+ LOCK_IX, /* intention exclusive */
+ LOCK_S, /* shared */
+ LOCK_X, /* exclusive */
+ LOCK_AUTO_INC, /* locks the auto-inc counter of a table
+ in an exclusive mode */
+ LOCK_NONE, /* this is used elsewhere to denote a consistent read */
+ LOCK_NUM = LOCK_NONE, /* number of lock modes */
+ LOCK_NONE_UNSET = 255
+};
+
+/** Convert the given enum value into string.
+@param[in] mode the lock mode
+@return human readable string of the given enum value */
+inline
+const char* lock_mode_string(enum lock_mode mode)
+{
+ switch (mode) {
+ case LOCK_IS:
+ return("LOCK_IS");
+ case LOCK_IX:
+ return("LOCK_IX");
+ case LOCK_S:
+ return("LOCK_S");
+ case LOCK_X:
+ return("LOCK_X");
+ case LOCK_AUTO_INC:
+ return("LOCK_AUTO_INC");
+ case LOCK_NONE:
+ return("LOCK_NONE");
+ case LOCK_NONE_UNSET:
+ return("LOCK_NONE_UNSET");
+ default:
+ ut_error;
+ }
+}
+
+/** A table lock */
+struct lock_table_t {
+ dict_table_t* table; /*!< database table in dictionary
+ cache */
+ UT_LIST_NODE_T(ib_lock_t)
+ locks; /*!< list of locks on the same
+ table */
+ /** Print the table lock into the given output stream
+ @param[in,out] out the output stream
+ @return the given output stream. */
+ std::ostream& print(std::ostream& out) const;
+};
+
+/** Record lock for a page */
+struct lock_rec_t {
+ /** page identifier */
+ page_id_t page_id;
+ ib_uint32_t n_bits; /*!< number of bits in the lock
+ bitmap; NOTE: the lock bitmap is
+ placed immediately after the
+ lock struct */
+
+ /** Print the record lock into the given output stream
+ @param[in,out] out the output stream
+ @return the given output stream. */
+ std::ostream& print(std::ostream& out) const;
+};
+
+/** Print the record lock into the given output stream
+@param[in,out] out the output stream
+@return the given output stream. */
+inline std::ostream &lock_rec_t::print(std::ostream &out) const
+{
+ out << "[lock_rec_t: space=" << page_id.space()
+ << ", page_no=" << page_id.page_no()
+ << ", n_bits=" << n_bits << "]";
+ return out;
+}
+
+inline
+std::ostream&
+operator<<(std::ostream& out, const lock_rec_t& lock)
+{
+ return(lock.print(out));
+}
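+
+/* Usage sketch (illustrative): for a lock_rec_t r with page_id (5, 42)
+and n_bits = 72, `std::cout << r` prints
+"[lock_rec_t: space=5, page_no=42, n_bits=72]". */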
+
+#define LOCK_MODE_MASK 0xFUL /*!< mask used to extract mode from the
+ type_mode field in a lock */
+/** Lock types */
+/* @{ */
+#define LOCK_TABLE 16U /*!< table lock */
+#define LOCK_REC 32U /*!< record lock */
+#define LOCK_TYPE_MASK 0xF0UL /*!< mask used to extract lock type from the
+ type_mode field in a lock */
+#if LOCK_MODE_MASK & LOCK_TYPE_MASK
+# error "LOCK_MODE_MASK & LOCK_TYPE_MASK"
+#endif
+
+#define LOCK_WAIT 256U /*!< Waiting lock flag; when set, it
+ means that the lock has not yet been
+ granted, it is just waiting for its
+ turn in the wait queue */
+/* Precise modes */
+#define LOCK_ORDINARY 0 /*!< this flag denotes an ordinary
+ next-key lock in contrast to LOCK_GAP
+ or LOCK_REC_NOT_GAP */
+#define LOCK_GAP 512U /*!< when this bit is set, it means that the
+ lock holds only on the gap before the record;
+ for instance, an x-lock on the gap does not
+ give permission to modify the record on which
+ the bit is set; locks of this type are created
+ when records are removed from the index chain
+ of records */
+#define LOCK_REC_NOT_GAP 1024U /*!< this bit means that the lock is only on
+ the index record and does NOT block inserts
+ to the gap before the index record; this is
+ used in the case when we retrieve a record
+ with a unique key, and is also used in
+ locking plain SELECTs (not part of UPDATE
+ or DELETE) when the user has set the READ
+ COMMITTED isolation level */
+#define LOCK_INSERT_INTENTION 2048U/*!< this bit is set when we place a waiting
+ gap type record lock request in order to let
+ an insert of an index record to wait until
+ there are no conflicting locks by other
+ transactions on the gap; note that this flag
+ remains set when the waiting lock is granted,
+ or if the lock is inherited to a neighboring
+ record */
+#define LOCK_PREDICATE 8192U /*!< Predicate lock */
+#define LOCK_PRDT_PAGE 16384U /*!< Page lock */
+
+
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_PREDICATE|LOCK_PRDT_PAGE)&LOCK_MODE_MASK
+# error
+#endif
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_PREDICATE|LOCK_PRDT_PAGE)&LOCK_TYPE_MASK
+# error
+#endif
+/* @} */
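+
+/* The type_mode word is thus a bit field: the low 4 bits hold the
+lock_mode, bits 4-7 the lock type, and the higher bits the flags above.
+For example, a waiting gap X record lock decomposes as
+
+	type_mode = LOCK_REC | LOCK_X | LOCK_WAIT | LOCK_GAP
+	          = 32 + 3 + 256 + 512 = 803
+
+and lock_get_mode() recovers LOCK_X with LOCK_MODE_MASK. */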
+
+/** Lock struct; protected by lock_sys.mutex */
+struct ib_lock_t
+{
+ trx_t* trx; /*!< transaction owning the
+ lock */
+ UT_LIST_NODE_T(ib_lock_t)
+ trx_locks; /*!< list of the locks of the
+ transaction */
+
+ dict_index_t* index; /*!< index for a record lock */
+
+ ib_lock_t* hash; /*!< hash chain node for a record
+ lock. The link node in a singly linked
+ list, used during hashing. */
+
+ /** time(NULL) of the lock request creation.
+ Used for computing wait_time and diagnostics only.
+ Note: bogus durations may be reported
+ when the system time is adjusted! */
+ time_t requested_time;
+ /** Cumulated wait time in seconds.
+ Note: may be bogus when the system time is adjusted! */
+ ulint wait_time;
+
+ union {
+ lock_table_t tab_lock;/*!< table lock */
+ lock_rec_t rec_lock;/*!< record lock */
+ } un_member; /*!< lock details */
+
+ ib_uint32_t type_mode; /*!< lock type, mode, LOCK_GAP or
+ LOCK_REC_NOT_GAP,
+ LOCK_INSERT_INTENTION,
+ wait flag, ORed */
+
+ /** Determine if the lock object is a record lock.
+ @return true if record lock, false otherwise. */
+ bool is_record_lock() const
+ {
+ return(type() == LOCK_REC);
+ }
+
+ bool is_waiting() const
+ {
+ return(type_mode & LOCK_WAIT);
+ }
+
+ bool is_gap() const
+ {
+ return(type_mode & LOCK_GAP);
+ }
+
+ bool is_record_not_gap() const
+ {
+ return(type_mode & LOCK_REC_NOT_GAP);
+ }
+
+ bool is_insert_intention() const
+ {
+ return(type_mode & LOCK_INSERT_INTENTION);
+ }
+
+ ulint type() const {
+ return(type_mode & LOCK_TYPE_MASK);
+ }
+
+ enum lock_mode mode() const
+ {
+ return(static_cast<enum lock_mode>(type_mode & LOCK_MODE_MASK));
+ }
+
+ /** Print the lock object into the given output stream.
+ @param[in,out] out the output stream
+ @return the given output stream. */
+ std::ostream& print(std::ostream& out) const;
+
+ /** Convert the member 'type_mode' into a human readable string.
+ @return human readable string */
+ std::string type_mode_string() const;
+
+ const char* type_string() const
+ {
+ switch (type_mode & LOCK_TYPE_MASK) {
+ case LOCK_REC:
+ return("LOCK_REC");
+ case LOCK_TABLE:
+ return("LOCK_TABLE");
+ default:
+ ut_error;
+ }
+ }
+};
+
+typedef UT_LIST_BASE_NODE_T(ib_lock_t) trx_lock_list_t;
+
+#endif /* lock0types_h */
diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h
new file mode 100644
index 00000000..980a79d8
--- /dev/null
+++ b/storage/innobase/include/log0crypt.h
@@ -0,0 +1,125 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (C) 2014, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file include/log0crypt.h
+Innodb log encrypt/decrypt
+
+Created 11/25/2013 Minli Zhu
+Modified Jan Lindström jan.lindstrom@mariadb.com
+MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation.
+*******************************************************/
+#ifndef log0crypt_h
+#define log0crypt_h
+
+#include "log0log.h"
+
+/** innodb_encrypt_log: whether to encrypt the redo log */
+extern my_bool srv_encrypt_log;
+
+/** Initialize the redo log encryption key and random parameters
+when creating a new redo log.
+The random parameters will be persisted in the log checkpoint pages.
+@see log_crypt_write_checkpoint_buf()
+@see log_crypt_read_checkpoint_buf()
+@return whether the operation succeeded */
+UNIV_INTERN
+bool
+log_crypt_init();
+
+/*********************************************************************//**
+Writes the crypto (version, msg and iv) info, which has been used for
+log blocks with lsn <= this checkpoint's lsn, to a log header's
+checkpoint buf. */
+UNIV_INTERN
+void
+log_crypt_write_checkpoint_buf(
+/*===========================*/
+ byte* buf); /*!< in/out: checkpoint buffer */
+
+/** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info.
+@param[in] buf checkpoint buffer
+@return whether the operation was successful */
+ATTRIBUTE_COLD bool log_crypt_101_read_checkpoint(const byte* buf);
+
+/** Decrypt a MariaDB 10.1 redo log block.
+@param[in,out] buf log block
+@param[in] start_lsn server start LSN
+@return whether the decryption was successful */
+ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn);
+
+/** Read the checkpoint crypto (version, msg and iv) info.
+@param[in] buf checkpoint buffer
+@return whether the operation was successful */
+bool log_crypt_read_checkpoint_buf(const byte* buf);
+
+/** log_crypt() operation code */
+enum log_crypt_t {
+ /** encrypt a log block without rotating key */
+ LOG_ENCRYPT,
+ /** decrypt a log block */
+ LOG_DECRYPT,
+ /** attempt to rotate the key, and encrypt a log block */
+ LOG_ENCRYPT_ROTATE_KEY
+};
+
+/** Encrypt or decrypt log blocks.
+@param[in,out] buf log blocks to encrypt or decrypt
+@param[in] lsn log sequence number of the start of the buffer
+@param[in] size size of the buffer, in bytes
+@param[in] op whether to decrypt, encrypt, or rotate key and encrypt
+@return whether the operation succeeded (encrypt always does) */
+bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op = LOG_ENCRYPT);
+
+/** Encrypt or decrypt a temporary file block.
+@param[in] src block to encrypt or decrypt
+@param[in] size size of the block
+@param[out] dst destination block
+@param[in] offs offset to block
+@param[in] encrypt true=encrypt; false=decrypt
+@return whether the operation succeeded */
+UNIV_INTERN
+bool
+log_tmp_block_encrypt(
+ const byte* src,
+ ulint size,
+ byte* dst,
+ uint64_t offs,
+ bool encrypt = true)
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Decrypt a temporary file block.
+@param[in] src block to decrypt
+@param[in] size size of the block
+@param[out] dst destination block
+@param[in] offs offset to block
+@return whether the operation succeeded */
+inline
+bool
+log_tmp_block_decrypt(
+ const byte* src,
+ ulint size,
+ byte* dst,
+ uint64_t offs)
+{
+ return(log_tmp_block_encrypt(src, size, dst, offs, false));
+}
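+
+/* Illustrative round trip (not part of this header): encryption and
+decryption are the same primitive with the flag inverted, so for
+block-sized buffers and a file offset offs,
+
+	byte enc[4096], dec[4096];
+	if (log_tmp_block_encrypt(src, 4096, enc, offs)
+	    && log_tmp_block_decrypt(enc, 4096, dec, offs)) {
+		ut_ad(!memcmp(src, dec, 4096));
+	}
+
+(buffer size 4096 chosen only for the example). */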
+
+/** @return whether temporary files are encrypted */
+inline bool log_tmp_is_encrypted() { return srv_encrypt_log; }
+#endif // log0crypt_h
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
new file mode 100644
index 00000000..460acaf5
--- /dev/null
+++ b/storage/innobase/include/log0log.h
@@ -0,0 +1,751 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2009, Google Inc.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.h
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef log0log_h
+#define log0log_h
+
+#include "log0types.h"
+#include "os0file.h"
+#include "span.h"
+#include "my_atomic_wrapper.h"
+#include <vector>
+#include <string>
+
+using st_::span;
+
+static const char LOG_FILE_NAME_PREFIX[] = "ib_logfile";
+static const char LOG_FILE_NAME[] = "ib_logfile0";
+
+/** Composes full path for a redo log file
+@param[in] filename name of the redo log file
+@return path with log file name */
+std::string get_log_file_path(const char *filename= LOG_FILE_NAME);
+
+/** Returns paths for all existing log files */
+std::vector<std::string> get_existing_log_files_paths();
+
+/** Delete log file.
+@param[in] suffix suffix of the file name */
+static inline void delete_log_file(const char* suffix)
+{
+ auto path = get_log_file_path(LOG_FILE_NAME_PREFIX).append(suffix);
+ os_file_delete_if_exists(innodb_log_file_key, path.c_str(), nullptr);
+}
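+
+/* For example (illustrative only): delete_log_file("101") appends the
+suffix to the LOG_FILE_NAME_PREFIX path computed above and removes
+ib_logfile101 from the log directory, if it exists. */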
+
+/** Append a string to the log.
+@param[in] str string
+@param[in] len string length
+@param[out] start_lsn start LSN of the log record
+@return end lsn of the log record, zero if did not succeed */
+UNIV_INLINE
+lsn_t
+log_reserve_and_write_fast(
+ const void* str,
+ ulint len,
+ lsn_t* start_lsn);
+/***********************************************************************//**
+Checks if there is need for a log buffer flush or a new checkpoint, and does
+this if yes. Any database operation should call this when it has modified
+more than about 4 pages. NOTE that this function may only be called when the
+OS thread owns no synchronization objects except the dictionary mutex. */
+UNIV_INLINE
+void
+log_free_check(void);
+/*================*/
+
+/** Extends the log buffer.
+@param[in] len requested minimum size in bytes */
+void log_buffer_extend(ulong len);
+
+/** Calculate the recommended highest values for lsn - last_checkpoint_lsn
+and lsn - buf_pool.get_oldest_modification().
+@param[in] file_size requested innodb_log_file_size
+@retval true on success
+@retval false if the smallest log is too small to
+accommodate the number of OS threads in the database server */
+bool
+log_set_capacity(ulonglong file_size)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Ensure that the log has been written to the log file up to a given
+log entry (such as that of a transaction commit). Start a new write, or
+wait and check if an already running write is covering the request.
+@param[in] lsn log sequence number that should be
+included in the redo log file write
+@param[in] flush_to_disk whether the written log should also
+be flushed to the file system
+@param[in] rotate_key whether to rotate the encryption key */
+void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key = false);
+
+/** Write to the log file up to the last log entry.
+@param[in] sync whether we want the written log
+also to be flushed to disk. */
+void
+log_buffer_flush_to_disk(
+ bool sync = true);
+
+/** Make a checkpoint */
+ATTRIBUTE_COLD void log_make_checkpoint();
+
+/** Make a checkpoint at the latest lsn on shutdown. */
+ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown();
+
+/** Write checkpoint info to the log header and release log_sys.mutex.
+@param[in] end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */
+ATTRIBUTE_COLD void log_write_checkpoint_info(lsn_t end_lsn);
+
+/**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+ATTRIBUTE_COLD void log_check_margins();
+
+/************************************************************//**
+Gets a log block flush bit.
+@return TRUE if this block was the first to be written in a log flush */
+UNIV_INLINE
+ibool
+log_block_get_flush_bit(
+/*====================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Gets a log block number stored in the header.
+@return log block number stored in the block header */
+UNIV_INLINE
+ulint
+log_block_get_hdr_no(
+/*=================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Gets a log block data length.
+@return log block data length measured as a byte offset from the block start */
+UNIV_INLINE
+ulint
+log_block_get_data_len(
+/*===================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Sets the log block data length. */
+UNIV_INLINE
+void
+log_block_set_data_len(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint len); /*!< in: data length */
+/** Calculate the CRC-32C checksum of a log block.
+@param[in] block log block
+@return checksum */
+inline ulint log_block_calc_checksum_crc32(const byte* block);
+
+/************************************************************//**
+Gets a log block checksum field value.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_get_checksum(
+/*===================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Sets a log block checksum field value. */
+UNIV_INLINE
+void
+log_block_set_checksum(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint checksum); /*!< in: checksum */
+/************************************************************//**
+Gets a log block first mtr log record group offset.
+@return first mtr log record group byte offset from the block start, 0
+if none */
+UNIV_INLINE
+ulint
+log_block_get_first_rec_group(
+/*==========================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Sets the log block first mtr log record group offset. */
+UNIV_INLINE
+void
+log_block_set_first_rec_group(
+/*==========================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint offset); /*!< in: offset, 0 if none */
+/************************************************************//**
+Gets a log block checkpoint number field (4 lowest bytes).
+@return checkpoint no (4 lowest bytes) */
+UNIV_INLINE
+ulint
+log_block_get_checkpoint_no(
+/*========================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Initializes a log block in the log buffer. */
+UNIV_INLINE
+void
+log_block_init(
+/*===========*/
+ byte* log_block, /*!< in: pointer to the log buffer */
+ lsn_t lsn); /*!< in: lsn within the log block */
+/************************************************************//**
+Converts a lsn to a log block number.
+@return log block number, it is > 0 and <= 1G */
+UNIV_INLINE
+ulint
+log_block_convert_lsn_to_no(
+/*========================*/
+ lsn_t lsn); /*!< in: lsn of a byte within the block */
+/******************************************************//**
+Prints info of the log. */
+void
+log_print(
+/*======*/
+ FILE* file); /*!< in: file where to print */
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+void
+log_refresh_stats(void);
+/*===================*/
+
+/* The counting of lsn's starts from this value: this must be non-zero */
+#define LOG_START_LSN ((lsn_t) (16 * OS_FILE_LOG_BLOCK_SIZE))
+
+/* Offsets of a log block header */
+#define LOG_BLOCK_HDR_NO 0 /* block number which must be > 0 and
+ is allowed to wrap around at 2G; the
+ highest bit is set to 1 if this is the
+ first log block in a log flush write
+ segment */
+#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL
+ /* mask used to get the highest bit in
+ the preceding field */
+#define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to
+ this block */
+#define LOG_BLOCK_FIRST_REC_GROUP 6 /* offset of the first start of an
+ mtr log record group in this log block,
+ 0 if none; if the value is the same
+ as LOG_BLOCK_HDR_DATA_LEN, it means
+ that the first rec group has not yet
+ been catenated to this log block, but
+ if it will, it will start at this
+ offset; an archive recovery can
+ start parsing the log records starting
+ from this offset in this log block,
+ if value not 0 */
+#define LOG_BLOCK_CHECKPOINT_NO 8 /* 4 lower bytes of the value of
+ log_sys.next_checkpoint_no when the
+ log block was last written to: if the
+ block has not yet been written full,
+ this value is only updated before a
+ log buffer flush */
+#define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in
+ bytes */
+
+#define LOG_BLOCK_KEY 4 /* encryption key version
+ before LOG_BLOCK_CHECKSUM;
+ after log_t::FORMAT_ENC_10_4 only */
+#define LOG_BLOCK_CHECKSUM 4 /* 4 byte checksum of the log block
+ contents; in InnoDB versions
+ < 3.23.52 this did not contain the
+ checksum but the same value as
+ LOG_BLOCK_HDR_NO */
+
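+/* Taken together for a 512-byte OS_FILE_LOG_BLOCK_SIZE block: the
+12-byte header is followed by payload, and the block ends in a 4-byte
+checksum (preceded by a 4-byte key version when encryption key rotation
+is in use), leaving 512 - 12 - 4 = 496 payload bytes in the unencrypted
+case (illustrative arithmetic). */
+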
+/** Offsets inside the checkpoint pages (redo log format version 1) @{ */
+/** Checkpoint number */
+#define LOG_CHECKPOINT_NO 0
+/** Log sequence number up to which all changes have been flushed */
+#define LOG_CHECKPOINT_LSN 8
+/** Byte offset of the log record corresponding to LOG_CHECKPOINT_LSN */
+#define LOG_CHECKPOINT_OFFSET 16
+/** srv_log_buffer_size at the time of the checkpoint (not used) */
+#define LOG_CHECKPOINT_LOG_BUF_SIZE 24
+/** MariaDB 10.2.5 encrypted redo log encryption key version (32 bits)*/
+#define LOG_CHECKPOINT_CRYPT_KEY 32
+/** MariaDB 10.2.5 encrypted redo log random nonce (32 bits) */
+#define LOG_CHECKPOINT_CRYPT_NONCE 36
+/** MariaDB 10.2.5 encrypted redo log random message (MY_AES_BLOCK_SIZE) */
+#define LOG_CHECKPOINT_CRYPT_MESSAGE 40
+/** start LSN of the MLOG_CHECKPOINT mini-transaction corresponding
+to this checkpoint, or 0 if the information has not been written */
+#define LOG_CHECKPOINT_END_LSN (OS_FILE_LOG_BLOCK_SIZE - 16)
+
+/* @} */
+
+/** Offsets of a log file header */
+/* @{ */
+/** Log file header format identifier (32-bit unsigned big-endian integer).
+This used to be called LOG_GROUP_ID and always written as 0,
+because InnoDB never supported more than one copy of the redo log. */
+#define LOG_HEADER_FORMAT 0
+/** Redo log subformat (originally 0). In format version 0, the
+LOG_FILE_START_LSN started here, 4 bytes earlier than LOG_HEADER_START_LSN,
+which the LOG_FILE_START_LSN was renamed to.
+Subformat 1 is for the fully redo-logged TRUNCATE
+(no MLOG_TRUNCATE records or extra log checkpoints or log file) */
+#define LOG_HEADER_SUBFORMAT 4
+/** LSN of the start of data in this log file (with format version 1;
+in format version 0, it was called LOG_FILE_START_LSN and at offset 4). */
+#define LOG_HEADER_START_LSN 8
+/** A null-terminated string which will contain either the string 'ibbackup'
+and the creation time if the log file was created by mysqlbackup --restore,
+or the MySQL version that created the redo log file. */
+#define LOG_HEADER_CREATOR 16
+/** End of the log file creator field. */
+#define LOG_HEADER_CREATOR_END (LOG_HEADER_CREATOR + 32)
+/** Contents of the LOG_HEADER_CREATOR field */
+#define LOG_HEADER_CREATOR_CURRENT \
+ "MariaDB " \
+ IB_TO_STR(MYSQL_VERSION_MAJOR) "." \
+ IB_TO_STR(MYSQL_VERSION_MINOR) "." \
+ IB_TO_STR(MYSQL_VERSION_PATCH)
+
+/* @} */
+
+#define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE
+ /* first checkpoint field in the log
+ header; we write alternately to the
+ checkpoint fields when we make new
+ checkpoints; this field is only defined
+ in the first log file of a log */
+#define LOG_CHECKPOINT_2 (3 * OS_FILE_LOG_BLOCK_SIZE)
+ /* second checkpoint field in the log
+ header */
+#define LOG_FILE_HDR_SIZE (4 * OS_FILE_LOG_BLOCK_SIZE)
+
+/** Memory mapped file */
+class mapped_file_t
+{
+public:
+ mapped_file_t()= default;
+ mapped_file_t(const mapped_file_t &)= delete;
+ mapped_file_t &operator=(const mapped_file_t &)= delete;
+ mapped_file_t(mapped_file_t &&)= delete;
+ mapped_file_t &operator=(mapped_file_t &&)= delete;
+ ~mapped_file_t() noexcept;
+
+ dberr_t map(const char *path, bool read_only= false,
+ bool nvme= false) noexcept;
+ dberr_t unmap() noexcept;
+ byte *data() noexcept { return m_area.data(); }
+
+private:
+ span<byte> m_area;
+};
+
+/** Abstraction for reading, writing and flushing file cache to disk */
+class file_io
+{
+public:
+ file_io(bool durable_writes= false) : m_durable_writes(durable_writes) {}
+ virtual ~file_io() noexcept {}
+ virtual dberr_t open(const char *path, bool read_only) noexcept= 0;
+ virtual dberr_t rename(const char *old_path,
+ const char *new_path) noexcept= 0;
+ virtual dberr_t close() noexcept= 0;
+ virtual dberr_t read(os_offset_t offset, span<byte> buf) noexcept= 0;
+ virtual dberr_t write(const char *path, os_offset_t offset,
+ span<const byte> buf) noexcept= 0;
+ virtual dberr_t flush() noexcept= 0;
+
+ /** Durable writes don't require calling flush() */
+ bool writes_are_durable() const noexcept { return m_durable_writes; }
+
+protected:
+ bool m_durable_writes;
+};
+
+class file_os_io final: public file_io
+{
+public:
+ file_os_io()= default;
+ file_os_io(const file_os_io &)= delete;
+ file_os_io &operator=(const file_os_io &)= delete;
+ file_os_io(file_os_io &&rhs);
+ file_os_io &operator=(file_os_io &&rhs);
+ ~file_os_io() noexcept;
+
+ dberr_t open(const char *path, bool read_only) noexcept final;
+ bool is_opened() const noexcept { return m_fd != OS_FILE_CLOSED; }
+ dberr_t rename(const char *old_path, const char *new_path) noexcept final;
+ dberr_t close() noexcept final;
+ dberr_t read(os_offset_t offset, span<byte> buf) noexcept final;
+ dberr_t write(const char *path, os_offset_t offset,
+ span<const byte> buf) noexcept final;
+ dberr_t flush() noexcept final;
+
+private:
+ pfs_os_file_t m_fd{OS_FILE_CLOSED};
+};
+
+/** File abstraction + path */
+class log_file_t
+{
+public:
+ log_file_t(std::string path= "") noexcept : m_path{std::move(path)} {}
+
+ dberr_t open(bool read_only) noexcept;
+ bool is_opened() const noexcept;
+
+ const std::string &get_path() const noexcept { return m_path; }
+
+ dberr_t rename(std::string new_path) noexcept;
+ dberr_t close() noexcept;
+ dberr_t read(os_offset_t offset, span<byte> buf) noexcept;
+ bool writes_are_durable() const noexcept;
+ dberr_t write(os_offset_t offset, span<const byte> buf) noexcept;
+ dberr_t flush() noexcept;
+ void free()
+ {
+ m_path.clear();
+ m_path.shrink_to_fit();
+ }
+
+private:
+ std::unique_ptr<file_io> m_file;
+ std::string m_path;
+};
+
+/** Redo log buffer */
+struct log_t{
+ /** The original (not version-tagged) InnoDB redo log format */
+ static constexpr uint32_t FORMAT_3_23 = 0;
+ /** The MySQL 5.7.9/MariaDB 10.2.2 log format */
+ static constexpr uint32_t FORMAT_10_2 = 1;
+ /** The MariaDB 10.3.2 log format.
+ To prevent crash-downgrade to earlier 10.2 due to the inability to
+ roll back a retroactively introduced TRX_UNDO_RENAME_TABLE undo log record,
+ MariaDB 10.2.18 and later will use the 10.3 format, but LOG_HEADER_SUBFORMAT
+ 1 instead of 0. MariaDB 10.3 will use subformat 0 (5.7-style TRUNCATE) or 2
+ (MDEV-13564 backup-friendly TRUNCATE). */
+ static constexpr uint32_t FORMAT_10_3 = 103;
+ /** The MariaDB 10.4.0 log format. */
+ static constexpr uint32_t FORMAT_10_4 = 104;
+ /** Encrypted MariaDB redo log */
+ static constexpr uint32_t FORMAT_ENCRYPTED = 1U << 31;
+ /** The MariaDB 10.4.0 log format (only with innodb_encrypt_log=ON) */
+ static constexpr uint32_t FORMAT_ENC_10_4 = FORMAT_10_4 | FORMAT_ENCRYPTED;
+ /** The MariaDB 10.5 physical redo log format */
+ static constexpr uint32_t FORMAT_10_5 = 0x50485953;
+ /** The MariaDB 10.5 physical format (only with innodb_encrypt_log=ON) */
+ static constexpr uint32_t FORMAT_ENC_10_5 = FORMAT_10_5 | FORMAT_ENCRYPTED;
+
+private:
+ /** The log sequence number of the last change of durable InnoDB files */
+ MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE)
+ std::atomic<lsn_t> lsn;
+ /** the first guaranteed-durable log sequence number */
+ std::atomic<lsn_t> flushed_to_disk_lsn;
+ /** set when there may be need to flush the log buffer, or
+ preflush buffer pool pages, or initiate a log checkpoint.
+ This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */
+ std::atomic<bool> check_flush_or_checkpoint_;
+public:
+ /** mutex protecting the log */
+ MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+ /** first free offset within the log buffer in use */
+ size_t buf_free;
+ /** recommended maximum size of buf, after which the buffer is flushed */
+ size_t max_buf_free;
+ /** mutex to serialize access to the flush list when we are putting
+ dirty blocks in the list. The idea behind this mutex is to be able
+ to release log_sys.mutex during mtr_commit and still ensure that
+ insertions in the flush_list happen in the LSN order. */
+ MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_order_mutex;
+ /** log_buffer, append data here */
+ byte *buf;
+ /** log_buffer, writing data to file from this buffer.
+ Before flushing write_buf is swapped with flush_buf */
+ byte *flush_buf;
+ /** Log file stuff. Protected by mutex. */
+ struct file {
+ /** format of the redo log: e.g., FORMAT_10_5 */
+ uint32_t format;
+ /** redo log subformat: 0 with separately logged TRUNCATE,
+ 2 with fully redo-logged TRUNCATE (1 in MariaDB 10.2) */
+ uint32_t subformat;
+ /** individual log file size in bytes, including the header */
+ lsn_t file_size;
+ private:
+ /** lsn used to fix coordinates within the log group */
+ lsn_t lsn;
+ /** the byte offset of the above lsn */
+ lsn_t lsn_offset;
+ /** log file */
+ log_file_t fd;
+
+ public:
+ /** used only in recovery: recovery scan succeeded up to this
+ lsn in this log group */
+ lsn_t scanned_lsn;
+
+ /** opens the log file, which must be closed prior to this call */
+ void open_file(std::string path);
+ /** writes header */
+ void write_header_durable(lsn_t lsn);
+ /** renames the log file */
+ dberr_t rename(std::string path) { return fd.rename(path); }
+ /** reads buffer from log file
+ @param[in] offset offset in log file
+ @param[in] buf buffer to read into */
+ void read(os_offset_t offset, span<byte> buf);
+ /** Tells whether writes require calling flush() */
+ bool writes_are_durable() const noexcept;
+ /** writes buffer to log file
+ @param[in] offset offset in log file
+ @param[in] buf buffer from which to write */
+ void write(os_offset_t offset, span<byte> buf);
+ /** flushes OS page cache (excluding metadata!) for log file */
+ void flush();
+ /** closes log file */
+ void close_file();
+
+ /** @return whether the redo log is encrypted */
+ bool is_encrypted() const { return format & FORMAT_ENCRYPTED; }
+ /** @return whether the redo log is in the physical format */
+ bool is_physical() const
+ { return (format & ~FORMAT_ENCRYPTED) == FORMAT_10_5; }
+ /** @return capacity in bytes */
+ lsn_t capacity() const{ return file_size - LOG_FILE_HDR_SIZE; }
+ /** Calculate the offset of a log sequence number.
+ @param[in] lsn log sequence number
+ @return offset within the log */
+ inline lsn_t calc_lsn_offset(lsn_t lsn) const;
+ inline lsn_t calc_lsn_offset_old(lsn_t lsn) const;
+
+ /** Set the field values to correspond to a given lsn. */
+ void set_fields(lsn_t lsn)
+ {
+ lsn_t c_lsn_offset = calc_lsn_offset(lsn);
+ set_lsn(lsn);
+ set_lsn_offset(c_lsn_offset);
+ }
+
+ /** Read a log segment to log_sys.buf.
+ @param[in,out] start_lsn in: read area start,
+ out: the last read valid lsn
+ @param[in] end_lsn read area end
+ @return whether no invalid blocks (e.g. checksum mismatch) were found */
+ bool read_log_seg(lsn_t* start_lsn, lsn_t end_lsn);
+
+ /** Initialize the redo log buffer. */
+ void create();
+
+ /** Close the redo log buffer. */
+ void close() { close_file(); }
+ void set_lsn(lsn_t a_lsn);
+ lsn_t get_lsn() const { return lsn; }
+ void set_lsn_offset(lsn_t a_lsn);
+ lsn_t get_lsn_offset() const { return lsn_offset; }
+ } log;
+
+ /** The fields involved in the log buffer flush @{ */
+
+ size_t buf_next_to_write;/*!< first offset in the log buffer
+ whose byte content may not yet have
+ been written to file, e.g., the start
+ offset of a log record catenated
+ later; this is advanced when a flush
+ operation is completed */
+ lsn_t write_lsn; /*!< last written lsn */
+ lsn_t current_flush_lsn;/*!< end lsn for the current running
+ write + flush operation */
+ std::atomic<size_t> pending_flushes; /*!< system calls in progress */
+ std::atomic<size_t> flushes; /*!< system calls counter */
+
+ ulint n_log_ios; /*!< number of log i/os initiated thus
+ far */
+ ulint n_log_ios_old; /*!< number of log i/o's at the
+ previous printout */
+ time_t last_printout_time;/*!< when log_print was last time
+ called */
+ /* @} */
+
+ /** Fields involved in checkpoints @{ */
+ lsn_t log_capacity; /*!< capacity of the log; if
+ the checkpoint age exceeds this, it is
+ a serious error because it is possible
+ we will then overwrite log and spoil
+ crash recovery */
+ lsn_t max_modified_age_async;
+ /*!< when this recommended
+ value for lsn -
+ buf_pool.get_oldest_modification()
+ is exceeded, we start an
+ asynchronous preflush of pool pages */
+ lsn_t max_checkpoint_age;
+ /*!< this is the maximum allowed value
+ for lsn - last_checkpoint_lsn when a
+ new query step is started */
+ ib_uint64_t next_checkpoint_no;
+ /*!< next checkpoint number */
+ /** latest completed checkpoint (protected by log_sys.mutex) */
+ Atomic_relaxed<lsn_t> last_checkpoint_lsn;
+ lsn_t next_checkpoint_lsn;
+ /*!< next checkpoint lsn */
+ ulint n_pending_checkpoint_writes;
+ /*!< number of currently pending
+ checkpoint writes */
+
+ /** buffer for checkpoint header */
+ byte *checkpoint_buf;
+ /* @} */
+
+private:
+ bool m_initialised;
+public:
+ /**
+ Constructor.
+
+ Some members may require late initialisation, thus we just mark object as
+ uninitialised. Real initialisation happens in create().
+ */
+ log_t(): m_initialised(false) {}
+
+ /** @return whether the redo log is encrypted */
+ bool is_encrypted() const { return(log.is_encrypted()); }
+ /** @return whether the redo log is in the physical format */
+ bool is_physical() const { return log.is_physical(); }
+
+ bool is_initialised() const { return m_initialised; }
+
+ lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const
+ { return lsn.load(order); }
+ void set_lsn(lsn_t lsn) { this->lsn.store(lsn, std::memory_order_release); }
+
+ lsn_t get_flushed_lsn() const
+ { return flushed_to_disk_lsn.load(std::memory_order_acquire); }
+ void set_flushed_lsn(lsn_t lsn)
+ { flushed_to_disk_lsn.store(lsn, std::memory_order_release); }
+
+ bool check_flush_or_checkpoint() const
+ {
+ return UNIV_UNLIKELY
+ (check_flush_or_checkpoint_.load(std::memory_order_relaxed));
+ }
+ void set_check_flush_or_checkpoint(bool flag= true)
+ { check_flush_or_checkpoint_.store(flag, std::memory_order_relaxed); }
+
+ bool has_encryption_key_rotation() const {
+ return log.format == FORMAT_ENC_10_4 || log.format == FORMAT_ENC_10_5;
+ }
+
+ /** @return the log block header + trailer size */
+ unsigned framing_size() const
+ {
+ return has_encryption_key_rotation()
+ ? LOG_BLOCK_HDR_SIZE + LOG_BLOCK_KEY + LOG_BLOCK_CHECKSUM
+ : LOG_BLOCK_HDR_SIZE + LOG_BLOCK_CHECKSUM;
+ }
+ /** @return the log block payload size */
+ unsigned payload_size() const
+ {
+ return has_encryption_key_rotation()
+ ? OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM -
+ LOG_BLOCK_KEY
+ : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_CHECKSUM;
+ }
+ /** @return the log block trailer offset */
+ unsigned trailer_offset() const
+ {
+ return has_encryption_key_rotation()
+ ? OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM - LOG_BLOCK_KEY
+ : OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM;
+ }
+
+ size_t get_pending_flushes() const
+ {
+ return pending_flushes.load(std::memory_order_relaxed);
+ }
+
+ size_t get_flushes() const
+ {
+ return flushes.load(std::memory_order_relaxed);
+ }
+
+ /** Initialise the redo log subsystem. */
+ void create();
+
+ /** Shut down the redo log subsystem. */
+ void close();
+};
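+
+/* For the usual 512-byte OS_FILE_LOG_BLOCK_SIZE, the accessors above
+evaluate to framing_size() = 16, payload_size() = 496 and
+trailer_offset() = 508 without key rotation, or 20 / 492 / 504 when
+has_encryption_key_rotation() holds (illustrative arithmetic only). */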
+
+/** Redo log system */
+extern log_t log_sys;
+#ifdef UNIV_DEBUG
+extern bool log_write_lock_own();
+#endif
+
+/** Calculate the offset of a log sequence number.
+@param[in] lsn log sequence number
+@return offset within the log */
+inline lsn_t log_t::file::calc_lsn_offset(lsn_t lsn) const
+{
+ ut_ad(this == &log_sys.log);
+ /* The lsn parameters are updated while holding both the mutexes
+ and it is ok to have either of them while reading */
+#ifdef SAFE_MUTEX
+ ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own());
+#endif /* SAFE_MUTEX */
+ const lsn_t size = capacity();
+ lsn_t l= lsn - this->lsn;
+ if (longlong(l) < 0) {
+ l = lsn_t(-longlong(l)) % size;
+ l = size - l;
+ }
+
+ l+= lsn_offset - LOG_FILE_HDR_SIZE * (1 + lsn_offset / file_size);
+ l %= size;
+ return l + LOG_FILE_HDR_SIZE * (1 + l / (file_size - LOG_FILE_HDR_SIZE));
+}
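+
+/* Worked example (illustrative values): with a single 1 MiB log file,
+capacity() = 1048576 - 2048 = 1046528 bytes. The code takes the delta
+from the stored (lsn, lsn_offset) pair modulo that capacity and then
+adds back LOG_FILE_HDR_SIZE bytes of header slack, so the returned
+offset always lands in [LOG_FILE_HDR_SIZE, file_size). */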
+
+inline void log_t::file::set_lsn(lsn_t a_lsn)
+{
+#ifdef SAFE_MUTEX
+ ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own());
+#endif /* SAFE_MUTEX */
+ lsn= a_lsn;
+}
+
+inline void log_t::file::set_lsn_offset(lsn_t a_lsn)
+{
+#ifdef SAFE_MUTEX
+ ut_ad(mysql_mutex_is_owner(&log_sys.mutex) || log_write_lock_own());
+#endif /* SAFE_MUTEX */
+ ut_ad((lsn % OS_FILE_LOG_BLOCK_SIZE) == (a_lsn % OS_FILE_LOG_BLOCK_SIZE));
+ lsn_offset= a_lsn;
+}
+
+#include "log0log.ic"
+
+#endif
diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic
new file mode 100644
index 00000000..d503e3ff
--- /dev/null
+++ b/storage/innobase/include/log0log.ic
@@ -0,0 +1,326 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.ic
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#include "assume_aligned.h"
+#include "ut0crc32.h"
+
+extern ulong srv_log_buffer_size;
+
+/************************************************************//**
+Gets a log block flush bit.
+@return TRUE if this block was the first to be written in a log flush */
+UNIV_INLINE
+ibool
+log_block_get_flush_bit(
+/*====================*/
+ const byte* log_block) /*!< in: log block */
+{
+ static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility");
+ static_assert(LOG_BLOCK_FLUSH_BIT_MASK == 0x80000000, "compatibility");
+
+ return *log_block & 0x80;
+}
+
+/************************************************************//**
+Sets the log block flush bit. */
+UNIV_INLINE
+void
+log_block_set_flush_bit(
+/*====================*/
+ byte* log_block, /*!< in/out: log block */
+ ibool val) /*!< in: value to set */
+{
+ static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility");
+ static_assert(LOG_BLOCK_FLUSH_BIT_MASK == 0x80000000, "compatibility");
+
+ if (val)
+ *log_block|= 0x80;
+ else
+ *log_block&= 0x7f;
+}
+
+/************************************************************//**
+Gets a log block number stored in the header.
+@return log block number stored in the block header */
+UNIV_INLINE
+ulint
+log_block_get_hdr_no(
+/*=================*/
+ const byte* log_block) /*!< in: log block */
+{
+ static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility");
+ return mach_read_from_4(my_assume_aligned<4>(log_block)) &
+ ~LOG_BLOCK_FLUSH_BIT_MASK;
+}
+
+/************************************************************//**
+Sets the log block number stored in the header; NOTE that this must be set
+before the flush bit! */
+UNIV_INLINE
+void
+log_block_set_hdr_no(
+/*=================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint n) /*!< in: log block number: must be > 0 and
+ < LOG_BLOCK_FLUSH_BIT_MASK */
+{
+ static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility");
+ ut_ad(n > 0);
+ ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK);
+
+ mach_write_to_4(my_assume_aligned<4>(log_block), n);
+}
+
+/************************************************************//**
+Gets a log block data length.
+@return log block data length measured as a byte offset from the block start */
+UNIV_INLINE
+ulint
+log_block_get_data_len(
+/*===================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return mach_read_from_2(my_assume_aligned<2>
+ (log_block + LOG_BLOCK_HDR_DATA_LEN));
+}
+
+/************************************************************//**
+Sets the log block data length. */
+UNIV_INLINE
+void
+log_block_set_data_len(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint len) /*!< in: data length */
+{
+ mach_write_to_2(my_assume_aligned<2>(log_block + LOG_BLOCK_HDR_DATA_LEN),
+ len);
+}
+
+/************************************************************//**
+Gets a log block first mtr log record group offset.
+@return first mtr log record group byte offset from the block start, 0
+if none */
+UNIV_INLINE
+ulint
+log_block_get_first_rec_group(
+/*==========================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return mach_read_from_2(my_assume_aligned<2>
+ (log_block + LOG_BLOCK_FIRST_REC_GROUP));
+}
+
+/************************************************************//**
+Sets the log block first mtr log record group offset. */
+UNIV_INLINE
+void
+log_block_set_first_rec_group(
+/*==========================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint offset) /*!< in: offset, 0 if none */
+{
+ mach_write_to_2(my_assume_aligned<2>
+ (log_block + LOG_BLOCK_FIRST_REC_GROUP), offset);
+}
+
+/************************************************************//**
+Gets a log block checkpoint number field (4 lowest bytes).
+@return checkpoint no (4 lowest bytes) */
+UNIV_INLINE
+ulint
+log_block_get_checkpoint_no(
+/*========================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return mach_read_from_4(my_assume_aligned<4>
+ (log_block + LOG_BLOCK_CHECKPOINT_NO));
+}
+
+/************************************************************//**
+Sets a log block checkpoint number field (4 lowest bytes). */
+UNIV_INLINE
+void
+log_block_set_checkpoint_no(
+/*========================*/
+ byte* log_block, /*!< in/out: log block */
+ ib_uint64_t no) /*!< in: checkpoint no */
+{
+ mach_write_to_4(my_assume_aligned<4>(log_block + LOG_BLOCK_CHECKPOINT_NO),
+ static_cast<uint32_t>(no));
+}
+
+/************************************************************//**
+Converts a lsn to a log block number.
+@return log block number, it is > 0 and <= 1G */
+UNIV_INLINE
+ulint
+log_block_convert_lsn_to_no(
+/*========================*/
+ lsn_t lsn) /*!< in: lsn of a byte within the block */
+{
+ return(((ulint) (lsn / OS_FILE_LOG_BLOCK_SIZE) &
+ DBUG_EVALUATE_IF("innodb_small_log_block_no_limit",
+ 0xFUL, 0x3FFFFFFFUL)) + 1);
+}
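+
+/* Example: with OS_FILE_LOG_BLOCK_SIZE = 512, lsn 1024 lies in block
+1024 / 512 = 2, so this returns 2 + 1 = 3. The + 1 keeps block numbers
+strictly positive and the 0x3FFFFFFF mask wraps them at 1G, matching
+log_block_set_hdr_no()'s requirement of 0 < n < LOG_BLOCK_FLUSH_BIT_MASK. */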
+
+/** Calculate the CRC-32C checksum of a log block.
+@param[in] block log block
+@return checksum */
+inline ulint log_block_calc_checksum_crc32(const byte* block)
+{
+ return ut_crc32(block, OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM);
+}
+
+/************************************************************//**
+Gets a log block checksum field value.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_get_checksum(
+/*===================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return mach_read_from_4(my_assume_aligned<4>
+ (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM +
+ log_block));
+}
+
+/************************************************************//**
+Sets a log block checksum field value. */
+UNIV_INLINE
+void
+log_block_set_checksum(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint checksum) /*!< in: checksum */
+{
+ mach_write_to_4(my_assume_aligned<4>
+ (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM +
+ log_block), checksum);
+}
+
+/************************************************************//**
+Initializes a log block in the log buffer. */
+UNIV_INLINE
+void
+log_block_init(
+/*===========*/
+ byte* log_block, /*!< in: pointer to the log buffer */
+ lsn_t lsn) /*!< in: lsn within the log block */
+{
+ ulint no;
+
+ no = log_block_convert_lsn_to_no(lsn);
+
+ log_block_set_hdr_no(log_block, no);
+
+ log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
+ log_block_set_first_rec_group(log_block, 0);
+}
+
+/** Append a string to the log.
+@param[in] str string
+@param[in] len string length
+@param[out] start_lsn start LSN of the log record
+@return end lsn of the log record, zero if did not succeed */
+UNIV_INLINE
+lsn_t
+log_reserve_and_write_fast(
+ const void* str,
+ ulint len,
+ lsn_t* start_lsn)
+{
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ ut_ad(len > 0);
+
+ const ulint data_len = len
+ + log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE;
+
+ if (data_len >= log_sys.trailer_offset()) {
+
+ /* The string does not fit within the current log block
+ or the log block would become full */
+
+ return(0);
+ }
+
+ lsn_t lsn = log_sys.get_lsn();
+ *start_lsn = lsn;
+
+ memcpy(log_sys.buf + log_sys.buf_free, str, len);
+
+ log_block_set_data_len(
+ reinterpret_cast<byte*>(ut_align_down(
+ log_sys.buf + log_sys.buf_free,
+ OS_FILE_LOG_BLOCK_SIZE)),
+ data_len);
+
+ log_sys.buf_free += len;
+
+ ut_ad(log_sys.buf_free <= size_t{srv_log_buffer_size});
+
+ lsn += len;
+ log_sys.set_lsn(lsn);
+
+ return lsn;
+}
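
A hedged usage sketch of the fast path above: a zero return means the record would cross a block boundary, and the caller must fall back to a slower write path. The variables rec and rec_len are illustrative.

/* Hedged sketch: append a record while holding log_sys.mutex. */
mysql_mutex_lock(&log_sys.mutex);
lsn_t start_lsn;
if (!log_reserve_and_write_fast(rec, rec_len, &start_lsn)) {
        /* the record does not fit in the current block; a slower,
           block-splitting write path would be taken here (illustrative) */
}
mysql_mutex_unlock(&log_sys.mutex);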
+
+/***********************************************************************//**
+Checks whether a log buffer flush or a new checkpoint is needed, and performs
+it if so. Any database operation should call this when it has modified
+more than about 4 pages. NOTE that this function may only be called when the
+OS thread owns no synchronization objects except the dictionary mutex. */
+UNIV_INLINE
+void
+log_free_check(void)
+/*================*/
+{
+ /* During row_log_table_apply(), this function will be called while we
+ are holding some latches. This is OK, as long as we are not holding
+ any latches on buffer blocks. */
+
+#ifdef UNIV_DEBUG
+ static const latch_level_t latches[] = {
+ SYNC_DICT, /* dict_sys.mutex during
+ commit_try_rebuild() */
+ SYNC_DICT_OPERATION, /* dict_sys.latch X-latch during
+ commit_try_rebuild() */
+ SYNC_FTS_CACHE, /* fts_cache_t::lock */
+ SYNC_INDEX_TREE /* index->lock */
+ };
+#endif /* UNIV_DEBUG */
+
+ ut_ad(!sync_check_iterate(
+ sync_allowed_latches(latches,
+ latches + UT_ARR_SIZE(latches))));
+
+ if (log_sys.check_flush_or_checkpoint()) {
+
+ log_check_margins();
+ }
+}
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
new file mode 100644
index 00000000..f822a874
--- /dev/null
+++ b/storage/innobase/include/log0recv.h
@@ -0,0 +1,426 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0recv.h
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "ut0byte.h"
+#include "buf0types.h"
+#include "log0log.h"
+#include "mtr0types.h"
+
+#include <deque>
+
+/** @return whether recovery is currently running. */
+#define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on)
+
+/** Find the latest checkpoint in the log header.
+@param[out] max_field LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2
+@return error code or DB_SUCCESS */
+dberr_t
+recv_find_max_checkpoint(ulint* max_field)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Apply any buffered redo log to a page that was just read from a data file.
+@param[in,out] space tablespace
+@param[in,out] bpage buffer pool page */
+ATTRIBUTE_COLD void recv_recover_page(fil_space_t* space, buf_page_t* bpage)
+ MY_ATTRIBUTE((nonnull));
+
+/** Start recovering from a redo log checkpoint.
+@param[in] flush_lsn FIL_PAGE_FILE_FLUSH_LSN
+of first system tablespace page
+@return error code or DB_SUCCESS */
+dberr_t
+recv_recovery_from_checkpoint_start(
+ lsn_t flush_lsn);
+
+/** Whether to store redo log records in recv_sys.pages */
+enum store_t {
+ /** Do not store redo log records. */
+ STORE_NO,
+ /** Store redo log records. */
+ STORE_YES,
+ /** Store redo log records if the tablespace exists. */
+ STORE_IF_EXISTS
+};
+
+/** Adds data from a new log block to the parsing buffer of recv_sys if
+recv_sys.parse_start_lsn is non-zero.
+@param[in] log_block log block to add
+@param[in] scanned_lsn lsn up to which data was found in this log block
+@return true if more data added */
+bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn);
+
+/** Moves the remaining parsing buffer data to the start of the buffer */
+void recv_sys_justify_left_parsing_buf();
+
+/** Report an operation to create, delete, or rename a file during backup.
+@param[in] space_id tablespace identifier
+@param[in] create whether the file is being created
+@param[in] name file name (not NUL-terminated)
+@param[in] len length of name, in bytes
+@param[in] new_name new file name (NULL if not rename)
+@param[in] new_len length of new_name, in bytes (0 if NULL) */
+extern void (*log_file_op)(ulint space_id, bool create,
+ const byte* name, ulint len,
+ const byte* new_name, ulint new_len);
+
+/** Stored redo log record */
+struct log_rec_t
+{
+ log_rec_t(lsn_t lsn) : next(nullptr), lsn(lsn) { ut_ad(lsn); }
+ log_rec_t()= delete;
+ log_rec_t(const log_rec_t&)= delete;
+ log_rec_t &operator=(const log_rec_t&)= delete;
+
+ /** next record */
+ log_rec_t *next;
+ /** mtr_t::commit_lsn() of the mini-transaction */
+ const lsn_t lsn;
+};
+
+struct recv_dblwr_t
+{
+ /** Add a page frame to the doublewrite recovery buffer. */
+ void add(byte *page) { pages.push_front(page); }
+
+ /** Validate the page.
+ @param page_id page identifier
+ @param page page contents
+ @param space the tablespace of the page (not available for page 0)
+ @param tmp_buf buffer of 2*srv_page_size bytes for decrypting and
+ decompressing any page_compressed or encrypted pages
+ @return whether the page is valid */
+ bool validate_page(const page_id_t page_id, const byte *page,
+ const fil_space_t *space, byte *tmp_buf);
+
+ /** Find a doublewrite copy of a page.
+ @param page_id page identifier
+ @param space tablespace (not available for page_id.page_no()==0)
+ @param tmp_buf buffer of 2*srv_page_size bytes for decrypting and
+ decompressing any page_compressed or encrypted pages
+ @return page frame
+ @retval NULL if no valid page for page_id was found */
+ byte* find_page(const page_id_t page_id, const fil_space_t *space= NULL,
+ byte *tmp_buf= NULL);
+
+ typedef std::deque<byte*, ut_allocator<byte*> > list;
+
+ /** Recovered doublewrite buffer page frames */
+ list pages;
+};
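
For illustration, a minimal sketch of how a recovered doublewrite copy could replace a corrupted data page; page_id, space, tmp_buf and frame are hypothetical variables:

/* Hedged sketch: prefer a valid doublewrite copy of the page. */
if (byte* copy = recv_sys.dblwr.find_page(page_id, space, tmp_buf))
        memcpy(frame, copy, srv_page_size); /* restore the page frame */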
+
+/** The recovery state and buffered records for a page */
+struct page_recv_t
+{
+ /** Recovery state; protected by recv_sys.mutex */
+ enum
+ {
+ /** not yet processed */
+ RECV_NOT_PROCESSED,
+ /** not processed; the page will be reinitialized */
+ RECV_WILL_NOT_READ,
+ /** page is being read */
+ RECV_BEING_READ,
+ /** log records are being applied on the page */
+ RECV_BEING_PROCESSED
+ } state= RECV_NOT_PROCESSED;
+ /** Latest written byte offset when applying the log records.
+ @see mtr_t::m_last_offset */
+ uint16_t last_offset= 1;
+ /** log records for a page */
+ class recs_t
+ {
+ /** The first log record */
+ log_rec_t *head= nullptr;
+ /** The last log record */
+ log_rec_t *tail= nullptr;
+ friend struct page_recv_t;
+ public:
+ /** Append a redo log snippet for the page
+ @param recs log snippet */
+ void append(log_rec_t* recs)
+ {
+ if (tail)
+ tail->next= recs;
+ else
+ head= recs;
+ tail= recs;
+ }
+
+ /** @return the last log snippet */
+ const log_rec_t* last() const { return tail; }
+ /** @return the last log snippet */
+ log_rec_t* last() { return tail; }
+
+ class iterator
+ {
+ log_rec_t *cur;
+ public:
+ iterator(log_rec_t* rec) : cur(rec) {}
+ log_rec_t* operator*() const { return cur; }
+ iterator &operator++() { cur= cur->next; return *this; }
+ bool operator!=(const iterator& i) const { return cur != i.cur; }
+ };
+ iterator begin() { return head; }
+ iterator end() { return NULL; }
+ bool empty() const { ut_ad(!head == !tail); return !head; }
+ /** Clear and free the records; @see recv_sys_t::alloc() */
+ inline void clear();
+ } log;
+
+ /** Trim old log records for a page.
+ @param start_lsn oldest log sequence number to preserve
+ @return whether all the log for the page was trimmed */
+ inline bool trim(lsn_t start_lsn);
+ /** Ignore any earlier redo log records for this page. */
+ inline void will_not_read();
+ /** @return whether the log records for the page are being processed */
+ bool is_being_processed() const { return state == RECV_BEING_PROCESSED; }
+};
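
For illustration, recs_t is an intrusive singly-linked list threaded through log_rec_t::next, so the buffered snippets for one page can be walked in append order; p is a hypothetical page_recv_t and apply_snippet() is illustrative:

/* Hedged sketch: visit every buffered redo snippet for a page. */
for (log_rec_t* rec : p.log)
        apply_snippet(rec); /* rec->lsn is the mini-transaction commit LSN */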
+
+/** Recovery system data structure */
+struct recv_sys_t
+{
+ /** mutex protecting apply_log_recs and page_recv_t::state */
+ ib_mutex_t mutex;
+ /** whether we are applying redo log records during crash recovery */
+ bool recovery_on;
+ /** whether recv_recover_page(), invoked from buf_page_read_complete(),
+ should apply log records */
+ bool apply_log_recs;
+ /** whether recv_apply_hashed_log_recs() is running */
+ bool apply_batch_on;
+ byte* buf; /*!< buffer for parsing log records */
+ ulint len; /*!< amount of data in buf */
+ lsn_t parse_start_lsn;
+ /*!< this is the lsn from which we were able to
+ start parsing log records and adding them to
+ pages; zero if a suitable
+ start point not found yet */
+ lsn_t scanned_lsn;
+ /*!< the log data has been scanned up to this
+ lsn */
+ ulint scanned_checkpoint_no;
+ /*!< the log data has been scanned up to this
+ checkpoint number (lowest 4 bytes) */
+ ulint recovered_offset;
+ /*!< start offset of non-parsed log records in
+ buf */
+ lsn_t recovered_lsn;
+ /*!< the log records have been parsed up to
+ this lsn */
+ bool found_corrupt_log;
+ /*!< set when finding a corrupt log
+ block or record, or there is a log
+ parsing buffer overflow */
+ bool found_corrupt_fs;
+ /*!< set when an inconsistency with
+ the file system contents is detected
+ during log scan or apply */
+ lsn_t mlog_checkpoint_lsn;
+ /*!< the LSN of a FILE_CHECKPOINT
+ record, or 0 if none was parsed */
+ /** the time when progress was last reported */
+ time_t progress_time;
+
+ using map = std::map<const page_id_t, page_recv_t,
+ std::less<const page_id_t>,
+ ut_allocator<std::pair<const page_id_t, page_recv_t>>>;
+ /** buffered records waiting to be applied to pages */
+ map pages;
+
+private:
+ /** Process a record indicating that a tablespace size is being shrunk.
+ @param page_id first page that is not in the file
+ @param lsn log sequence number of the shrink operation */
+ inline void trim(const page_id_t page_id, lsn_t lsn);
+
+ /** Undo tablespaces for which truncate has been logged
+ (indexed by page_id_t::space() - srv_undo_space_id_start) */
+ struct trunc
+ {
+ /** log sequence number of FILE_CREATE, or 0 if none */
+ lsn_t lsn;
+ /** truncated size of the tablespace, or 0 if not truncated */
+ unsigned pages;
+ } truncated_undo_spaces[127];
+
+public:
+ /** The contents of the doublewrite buffer */
+ recv_dblwr_t dblwr;
+
+ /** LSN of the last record added to pages. */
+ lsn_t last_stored_lsn= 0;
+
+ void read(os_offset_t offset, span<byte> buf);
+ inline size_t files_size();
+ void close_files() { files.clear(); files.shrink_to_fit(); }
+
+private:
+ /** Attempt to initialize a page based on redo log records.
+ @param page_id page identifier
+ @param p iterator pointing to page_id
+ @param mtr mini-transaction
+ @param b pre-allocated buffer pool block
+ @return whether the page was successfully initialized */
+ inline buf_block_t *recover_low(const page_id_t page_id, map::iterator &p,
+ mtr_t &mtr, buf_block_t *b);
+ /** Attempt to initialize a page based on redo log records.
+ @param page_id page identifier
+ @return the recovered block
+ @retval nullptr if the page cannot be initialized based on log records */
+ buf_block_t *recover_low(const page_id_t page_id);
+
+ /** All found log files (multiple ones are possible if we are upgrading
+ from before MariaDB Server 10.5.1) */
+ std::vector<log_file_t> files;
+
+ void open_log_files_if_needed();
+
+ /** Base node of the redo block list.
+ List elements are linked via buf_block_t::unzip_LRU. */
+ UT_LIST_BASE_NODE_T(buf_block_t) blocks;
+public:
+ /** Check whether the number of read redo log blocks exceeds the maximum.
+ Store last_stored_lsn if the recovery is not in the last phase.
+ @param[in,out] store whether to store page operations
+ @return whether the memory is exhausted */
+ inline bool is_memory_exhausted(store_t *store);
+ /** Apply buffered log to persistent data pages.
+ @param last_batch whether it is possible to write more redo log */
+ void apply(bool last_batch);
+
+#ifdef UNIV_DEBUG
+ /** whether all redo log in the current batch has been applied */
+ bool after_apply= false;
+#endif
+ /** Initialize the redo log recovery subsystem. */
+ void create();
+
+ /** Free most recovery data structures. */
+ void debug_free();
+
+ /** Clean up after create() */
+ void close();
+
+ bool is_initialised() const { return last_stored_lsn != 0; }
+
+ /** Register a redo log snippet for a page.
+ @param page_id page identifier
+ @param start_lsn start LSN of the mini-transaction
+ @param lsn @see mtr_t::commit_lsn()
+ @param l redo log snippet @see log_t::FORMAT_10_5
+ @param len length of l, in bytes */
+ inline void add(const page_id_t page_id, lsn_t start_lsn, lsn_t lsn,
+ const byte *l, size_t len);
+
+ /** Parse and register one mini-transaction in log_t::FORMAT_10_5.
+ @param checkpoint_lsn the log sequence number of the latest checkpoint
+ @param store whether to store the records
+ @param apply whether to apply file-level log records
+ @return whether the FILE_CHECKPOINT record was seen for the first time,
+ or corruption was noticed */
+ bool parse(lsn_t checkpoint_lsn, store_t *store, bool apply);
+
+ /** Clear a fully processed set of stored redo log records. */
+ inline void clear();
+
+ /** Determine whether redo log recovery progress should be reported.
+ @param time the current time
+ @return whether progress should be reported
+ (the last report was at least 15 seconds ago) */
+ bool report(time_t time)
+ {
+ if (time - progress_time < 15)
+ return false;
+
+ progress_time= time;
+ return true;
+ }
+
+ /** The alloc() memory alignment, in bytes */
+ static constexpr size_t ALIGNMENT= sizeof(size_t);
+
+ /** Allocate memory for log_rec_t
+ @param len allocation size, in bytes
+ @return pointer to len bytes of memory (never NULL) */
+ inline void *alloc(size_t len);
+
+ /** Free a redo log snippet.
+ @param data buffer returned by alloc() */
+ inline void free(const void *data);
+
+ /** Remove records for a corrupted page.
+ This function should only be called when innodb_force_recovery is set.
+ @param page_id corrupted page identifier */
+ ATTRIBUTE_COLD void free_corrupted_page(page_id_t page_id);
+
+ /** Attempt to initialize a page based on redo log records.
+ @param page_id page identifier
+ @return the recovered block
+ @retval nullptr if the page cannot be initialized based on log records */
+ buf_block_t *recover(const page_id_t page_id)
+ {
+ return UNIV_UNLIKELY(recovery_on) ? recover_low(page_id) : nullptr;
+ }
+};
+
+/** The recovery system */
+extern recv_sys_t recv_sys;
+
+/** If the following is TRUE, the buffer pool file pages must be invalidated
+after recovery and no ibuf operations are allowed; this will be set if
+recv_sys.pages becomes too full and log records must be merged to file
+pages before recovery is finished: ibuf operations could otherwise modify
+pages in the buffer pool before those pages have been recovered to the
+up-to-date state.
+
+TRUE also means that recovery is still running and no operations on the
+log files are allowed yet; the variable name is misleading. */
+extern bool recv_no_ibuf_operations;
+/** TRUE when recv_init_crash_recovery() has been called. */
+extern bool recv_needed_recovery;
+#ifdef UNIV_DEBUG
+/** TRUE if writing to the redo log (mtr_commit) is forbidden.
+Protected by log_sys.mutex. */
+extern bool recv_no_log_write;
+#endif /* UNIV_DEBUG */
+
+/** TRUE if buf_page_is_corrupted() should check if the log sequence
+number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by
+recv_recovery_from_checkpoint_start(). */
+extern bool recv_lsn_checks_on;
+
+/** Size of the parsing buffer; it must be able to accommodate many
+RECV_SCAN_SIZE reads */
+#define RECV_PARSING_BUF_SIZE (2U << 20)
+
+/** Size of block reads when the log groups are scanned forward to do a
+roll-forward */
+#define RECV_SCAN_SIZE (4U << srv_page_size_shift)
diff --git a/storage/innobase/include/log0types.h b/storage/innobase/include/log0types.h
new file mode 100644
index 00000000..337fcd31
--- /dev/null
+++ b/storage/innobase/include/log0types.h
@@ -0,0 +1,44 @@
+/*****************************************************************************
+
+Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0types.h
+Log types
+
+Created 2013-03-15 Sunny Bains
+*******************************************************/
+
+#ifndef log0types_h
+#define log0types_h
+
+#include "univ.i"
+
+/* Type used for all log sequence number storage and arithmetic */
+typedef ib_uint64_t lsn_t;
+
+#define LSN_MAX IB_UINT64_MAX
+
+#define LSN_PF UINT64PF
+
+#endif /* log0types_h */
diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h
new file mode 100644
index 00000000..88317a73
--- /dev/null
+++ b/storage/innobase/include/mach0data.h
@@ -0,0 +1,353 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.h
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef mach0data_h
+#define mach0data_h
+
+#include "univ.i"
+#include "mtr0types.h"
+
+#ifndef UNIV_INNOCHECKSUM
+
+/* The data and all fields are always stored in a database file
+in the same format: ASCII, big-endian, and so on.
+All data in the files MUST be accessed using the functions in this
+module. */
+
+/*******************************************************//**
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+ byte* b, /*!< in: pointer to byte where to store */
+ ulint n); /*!< in: ulint integer to be stored, >= 0, < 256 */
+/** The following function is used to fetch data from one byte.
+@param[in] b pointer to a byte to read
+@return ulint integer, >= 0, < 256 */
+UNIV_INLINE
+uint8_t
+mach_read_from_1(
+ const byte* b)
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lower address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+ byte* b, /*!< in: pointer to two bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored, >= 0, < 64k */
+#endif /* !UNIV_INNOCHECKSUM */
+/** The following function is used to fetch data from 2 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in] b pointer to 2 bytes to read
+@return 2-byte integer, >= 0, < 64k */
+UNIV_INLINE
+uint16_t
+mach_read_from_2(
+ const byte* b)
+ MY_ATTRIBUTE((warn_unused_result));
+
+#ifndef UNIV_INNOCHECKSUM
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format, for fast bytewise equality test
+against memory.
+@return 16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+ ulint n) /*!< in: integer in machine-dependent format */
+ MY_ATTRIBUTE((const));
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format, for fast bytewise equality test
+against memory.
+@return integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+ uint16 n) /*!< in: 16-bit integer in canonical format */
+ MY_ATTRIBUTE((const));
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+ byte* b, /*!< in: pointer to 3 bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/** The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in] b pointer to 3 bytes to read
+@return 32-bit integer */
+UNIV_INLINE
+uint32_t
+mach_read_from_3(
+ const byte* b)
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+ byte* b, /*!< in: pointer to four bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/** The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in] b pointer to 4 bytes to read
+@return 32-bit integer */
+UNIV_INLINE
+uint32_t
+mach_read_from_4(
+ const byte* b)
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a ulint in a compressed form (1..5 bytes).
+@return stored size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/*********************************************************//**
+Returns the size of a ulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+ ulint n) /*!< in: ulint integer to be stored */
+ MY_ATTRIBUTE((const));
+/** Read a 32-bit integer in a compressed form.
+@param[in,out] b pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint32_t
+mach_read_next_compressed(
+ const byte** b);
+/*******************************************************//**
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+ byte* b, /*!< in: pointer to 6 bytes where to store */
+ ib_uint64_t id); /*!< in: 48-bit integer */
+/********************************************************//**
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 48-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_6(
+/*=============*/
+ const byte* b) /*!< in: pointer to 6 bytes */
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************//**
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+ byte* b, /*!< in: pointer to 7 bytes where to store */
+ ib_uint64_t n); /*!< in: 56-bit integer */
+/********************************************************//**
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 56-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_7(
+/*=============*/
+ const byte* b) /*!< in: pointer to 7 bytes */
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+ void* b, /*!< in: pointer to 8 bytes where to store */
+ ib_uint64_t n); /*!< in: 64-bit integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_8(
+/*=============*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (5..9 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_u64_write_compressed(
+/*======================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ib_uint64_t n); /*!< in: 64-bit integer to be stored */
+/** Read a 64-bit integer in a compressed form.
+@param[in,out] b pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint64_t
+mach_u64_read_next_compressed(
+ const byte** b);
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (1..11 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_u64_write_much_compressed(
+/*===========================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ib_uint64_t n); /*!< in: 64-bit integer to be stored */
+/*********************************************************//**
+Reads a 64-bit integer in a compressed form.
+@return the value read */
+UNIV_INLINE
+ib_uint64_t
+mach_u64_read_much_compressed(
+/*==========================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format.
+@return double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ double d); /*!< in: double */
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format.
+@return float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ float d); /*!< in: float */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+ const byte* buf, /*!< in: from where to read */
+ ulint buf_size) /*!< in: from how many bytes to read */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint dest_size, /*!< in: into how many bytes to write */
+ ulint n); /*!< in: unsigned long int to write */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+ const byte* buf) /*!< in: from where to read */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint n); /*!< in: unsigned long int to write */
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order.
+@return integer value */
+UNIV_INLINE
+ib_uint64_t
+mach_read_int_type(
+/*===============*/
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ ibool unsigned_type); /*!< in: signed or unsigned flag */
+
+/*************************************************************
+Convert a ulonglong integer from host byte order to (big-endian)
+storage byte order. */
+UNIV_INLINE
+void
+mach_write_ulonglong(
+/*=================*/
+ byte* dest, /*!< in: where to write */
+ ulonglong src, /*!< in: where to read from */
+ ulint len, /*!< in: length of dest */
+ bool usign); /*!< in: signed or unsigned flag */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#include "mach0data.ic"
+
+#endif
diff --git a/storage/innobase/include/mach0data.ic b/storage/innobase/include/mach0data.ic
new file mode 100644
index 00000000..bfccf611
--- /dev/null
+++ b/storage/innobase/include/mach0data.ic
@@ -0,0 +1,836 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.ic
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "mtr0types.h"
+
+/*******************************************************//**
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+ byte* b, /*!< in: pointer to byte where to store */
+ ulint n) /*!< in: ulint integer to be stored, >= 0, < 256 */
+{
+ ut_ad((n & ~0xFFUL) == 0);
+
+ b[0] = (byte) n;
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+ byte* b, /*!< in: pointer to two bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ ut_ad((n & ~0xFFFFUL) == 0);
+
+ b[0] = (byte)(n >> 8);
+ b[1] = (byte)(n);
+}
+
+/** The following function is used to fetch data from one byte.
+@param[in] b pointer to a byte to read
+@return ulint integer, >= 0, < 256 */
+UNIV_INLINE
+uint8_t
+mach_read_from_1(
+ const byte* b)
+{
+ return(uint8_t(*b));
+}
+
+/** The following function is used to fetch data from 2 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in] b pointer to 2 bytes to read
+@return 2-byte integer, >= 0, < 64k */
+UNIV_INLINE
+uint16_t
+mach_read_from_2(
+ const byte* b)
+{
+ return(uint16_t(uint16_t(b[0]) << 8 | b[1]));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format, for fast bytewise equality test
+against memory.
+@return 16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+ ulint n) /*!< in: integer in machine-dependent format */
+{
+ uint16 ret;
+ ut_ad(2 == sizeof ret);
+ mach_write_to_2((byte*) &ret, n);
+ return(ret);
+}
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format, for fast bytewise equality test
+against memory.
+@return integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+ uint16 n) /*!< in: 16-bit integer in canonical format */
+{
+ ut_ad(2 == sizeof n);
+ return(mach_read_from_2((const byte*) &n));
+}
+
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+ byte* b, /*!< in: pointer to 3 bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ ut_ad((n & ~0xFFFFFFUL) == 0);
+
+ b[0] = (byte)(n >> 16);
+ b[1] = (byte)(n >> 8);
+ b[2] = (byte)(n);
+}
+
+/** The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in] b pointer to 3 bytes to read
+@return 32-bit integer */
+UNIV_INLINE
+uint32_t
+mach_read_from_3(
+ const byte* b)
+{
+ return( (static_cast<uint32_t>(b[0]) << 16)
+ | (static_cast<uint32_t>(b[1]) << 8)
+ | static_cast<uint32_t>(b[2])
+ );
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+ byte* b, /*!< in: pointer to four bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ b[0] = (byte)(n >> 24);
+ b[1] = (byte)(n >> 16);
+ b[2] = (byte)(n >> 8);
+ b[3] = (byte) n;
+}
+
+/** The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@param[in] b pointer to 4 bytes to read
+@return 32-bit integer */
+UNIV_INLINE
+uint32_t
+mach_read_from_4(
+ const byte* b)
+{
+ return( (static_cast<uint32_t>(b[0]) << 24)
+ | (static_cast<uint32_t>(b[1]) << 16)
+ | (static_cast<uint32_t>(b[2]) << 8)
+ | static_cast<uint32_t>(b[3])
+ );
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/*********************************************************//**
+Writes a ulint in a compressed form where the first byte codes the
+length of the stored ulint. We look at the most significant bits of
+the byte. If the most significant bit is zero, it means 1-byte storage,
+else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0,
+it means 3-byte storage, else if 4th is 0, it means 4-byte storage,
+else the storage is 5-byte.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ulint n) /*!< in: ulint integer (< 2^32) to be stored */
+{
+ if (n < 0x80) {
+ /* 0nnnnnnn (7 bits) */
+ mach_write_to_1(b, n);
+ return(1);
+ } else if (n < 0x4000) {
+ /* 10nnnnnn nnnnnnnn (14 bits) */
+ mach_write_to_2(b, n | 0x8000);
+ return(2);
+ } else if (n < 0x200000) {
+ /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+ mach_write_to_3(b, n | 0xC00000);
+ return(3);
+ } else if (n < 0x10000000) {
+ /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+ mach_write_to_4(b, n | 0xE0000000);
+ return(4);
+ } else {
+ /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+ mach_write_to_1(b, 0xF0);
+ mach_write_to_4(b + 1, n);
+ return(5);
+ }
+}
+
+/*********************************************************//**
+Returns the size of a ulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+ ulint n) /*!< in: ulint integer (< 2^32) to be stored */
+{
+ if (n < 0x80) {
+ /* 0nnnnnnn (7 bits) */
+ return(1);
+ } else if (n < 0x4000) {
+ /* 10nnnnnn nnnnnnnn (14 bits) */
+ return(2);
+ } else if (n < 0x200000) {
+ /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+ return(3);
+ } else if (n < 0x10000000) {
+ /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+ return(4);
+ } else {
+ /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+ return(5);
+ }
+}
+
+/*********************************************************//**
+Reads a ulint in a compressed form.
+@return read integer (< 2^32) */
+UNIV_INLINE
+ulint
+mach_read_compressed(
+/*=================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ ulint val;
+
+ val = mach_read_from_1(b);
+
+ if (val < 0x80) {
+ /* 0nnnnnnn (7 bits) */
+ } else if (val < 0xC0) {
+ /* 10nnnnnn nnnnnnnn (14 bits) */
+ val = mach_read_from_2(b) & 0x3FFF;
+ ut_ad(val > 0x7F);
+ } else if (val < 0xE0) {
+ /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+ val = mach_read_from_3(b) & 0x1FFFFF;
+ ut_ad(val > 0x3FFF);
+ } else if (val < 0xF0) {
+ /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+ val = mach_read_from_4(b) & 0xFFFFFFF;
+ ut_ad(val > 0x1FFFFF);
+ } else {
+ /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+ ut_ad(val == 0xF0);
+ val = mach_read_from_4(b + 1);
+ ut_ad(val > 0xFFFFFFF);
+ }
+
+ return(val);
+}
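
A worked example of the encoding above; the byte values and return values are what mach_write_compressed() and mach_read_compressed() would produce:

/* Hedged sketch: compressed forms of a few sample values. */
byte buf[5];
ut_ad(mach_write_compressed(buf, 0x7F) == 1);       /* 7F */
ut_ad(mach_write_compressed(buf, 0x80) == 2);       /* 80 80 */
ut_ad(mach_write_compressed(buf, 0x4000) == 3);     /* C0 40 00 */
ut_ad(mach_write_compressed(buf, 0x12345678) == 5); /* F0 12 34 56 78 */
ut_ad(mach_read_compressed(buf) == 0x12345678);     /* round trip */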
+
+/** Read a 32-bit integer in a compressed form.
+@param[in,out] b pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint32_t
+mach_read_next_compressed(
+ const byte** b)
+{
+ ulint val = mach_read_from_1(*b);
+
+ if (val < 0x80) {
+ /* 0nnnnnnn (7 bits) */
+ ++*b;
+ } else if (val < 0xC0) {
+ /* 10nnnnnn nnnnnnnn (14 bits) */
+ val = mach_read_from_2(*b) & 0x3FFF;
+ ut_ad(val > 0x7F);
+ *b += 2;
+ } else if (val < 0xE0) {
+ /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+ val = mach_read_from_3(*b) & 0x1FFFFF;
+ ut_ad(val > 0x3FFF);
+ *b += 3;
+ } else if (val < 0xF0) {
+ /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+ val = mach_read_from_4(*b) & 0xFFFFFFF;
+ ut_ad(val > 0x1FFFFF);
+ *b += 4;
+ } else {
+ /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+ ut_ad(val == 0xF0);
+ val = mach_read_from_4(*b + 1);
+ ut_ad(val > 0xFFFFFFF);
+ *b += 5;
+ }
+
+ return(static_cast<ib_uint32_t>(val));
+}
+
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+ void* b, /*!< in: pointer to 8 bytes where to store */
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+{
+ mach_write_to_4(static_cast<byte*>(b), (ulint) (n >> 32));
+ mach_write_to_4(static_cast<byte*>(b) + 4, (ulint) n);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_8(
+/*=============*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+{
+ ib_uint64_t u64;
+
+ u64 = mach_read_from_4(b);
+ u64 <<= 32;
+ u64 |= mach_read_from_4(b + 4);
+
+ return(u64);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/*******************************************************//**
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+ byte* b, /*!< in: pointer to 7 bytes where to store */
+ ib_uint64_t n) /*!< in: 56-bit integer */
+{
+ mach_write_to_3(b, (ulint) (n >> 32));
+ mach_write_to_4(b + 3, (ulint) n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 56-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_7(
+/*=============*/
+ const byte* b) /*!< in: pointer to 7 bytes */
+{
+ return(ut_ull_create(mach_read_from_3(b), mach_read_from_4(b + 3)));
+}
+
+/*******************************************************//**
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+ byte* b, /*!< in: pointer to 6 bytes where to store */
+ ib_uint64_t n) /*!< in: 48-bit integer */
+{
+ mach_write_to_2(b, (ulint) (n >> 32));
+ mach_write_to_4(b + 2, (ulint) n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 48-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_6(
+/*=============*/
+ const byte* b) /*!< in: pointer to 6 bytes */
+{
+ return(ut_ull_create(mach_read_from_2(b), mach_read_from_4(b + 2)));
+}
+
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (5..9 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_u64_write_compressed(
+/*======================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+{
+ ulint size = mach_write_compressed(b, (ulint) (n >> 32));
+ mach_write_to_4(b + size, (ulint) n);
+
+ return(size + 4);
+}
+
+/** Read a 64-bit integer in a compressed form.
+@param[in,out] b pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint64_t
+mach_u64_read_next_compressed(
+ const byte** b)
+{
+ ib_uint64_t val;
+
+ val = mach_read_next_compressed(b);
+ val <<= 32;
+ val |= mach_read_from_4(*b);
+ *b += 4;
+ return(val);
+}
+
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (1..11 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_u64_write_much_compressed(
+/*===========================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+{
+ ulint size;
+
+ if (!(n >> 32)) {
+ return(mach_write_compressed(b, (ulint) n));
+ }
+
+ *b = (byte)0xFF;
+ size = 1 + mach_write_compressed(b + 1, (ulint) (n >> 32));
+
+ size += mach_write_compressed(b + size, (ulint) n & 0xFFFFFFFF);
+
+ return(size);
+}
+
+/*********************************************************//**
+Reads a 64-bit integer in a compressed form.
+@return the value read */
+UNIV_INLINE
+ib_uint64_t
+mach_u64_read_much_compressed(
+/*==========================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ ib_uint64_t n;
+
+ if (*b != 0xFF) {
+ return(mach_read_compressed(b));
+ }
+
+ b++;
+ n = mach_read_next_compressed(&b);
+ n <<= 32;
+ n |= mach_read_compressed(b);
+
+ return(n);
+}
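
A worked example of the "much compressed" format: values that fit in 32 bits use the plain compressed encoding, while larger values get a 0xFF marker followed by the compressed high word and then the compressed low word:

/* Hedged sketch: sample encodings and a round trip. */
byte buf[11];
ut_ad(mach_u64_write_much_compressed(buf, 0x7F) == 1);       /* 7F */
ut_ad(mach_u64_write_much_compressed(buf, 1ULL << 32) == 3); /* FF 01 00 */
ut_ad(mach_u64_read_much_compressed(buf) == (1ULL << 32));   /* round trip */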
+
+/** Read a 64-bit integer in a compressed form.
+@param[in,out] b pointer to memory where to read;
+advanced by the number of bytes consumed
+@return unsigned value */
+UNIV_INLINE
+ib_uint64_t
+mach_read_next_much_compressed(
+ const byte** b)
+{
+ ib_uint64_t val = mach_read_from_1(*b);
+
+ if (val < 0x80) {
+ /* 0nnnnnnn (7 bits) */
+ ++*b;
+ } else if (val < 0xC0) {
+ /* 10nnnnnn nnnnnnnn (14 bits) */
+ val = mach_read_from_2(*b) & 0x3FFF;
+ ut_ad(val > 0x7F);
+ *b += 2;
+ } else if (val < 0xE0) {
+ /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */
+ val = mach_read_from_3(*b) & 0x1FFFFF;
+ ut_ad(val > 0x3FFF);
+ *b += 3;
+ } else if (val < 0xF0) {
+ /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */
+ val = mach_read_from_4(*b) & 0xFFFFFFF;
+ ut_ad(val > 0x1FFFFF);
+ *b += 4;
+ } else if (val == 0xF0) {
+ /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */
+ val = mach_read_from_4(*b + 1);
+ ut_ad(val > 0xFFFFFFF);
+ *b += 5;
+ } else {
+ /* 11111111 followed by up to 64 bits */
+ ut_ad(val == 0xFF);
+ ++*b;
+ val = mach_read_next_compressed(b);
+ ut_ad(val > 0);
+ val <<= 32;
+ val |= mach_read_next_compressed(b);
+ }
+
+ return(val);
+}
+
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format.
+@return double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ double d;
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*) &d;
+
+ for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+ ptr[sizeof(double) - i - 1] = b[i];
+#else
+ ptr[i] = b[i];
+#endif
+ }
+
+ return(d);
+}
+
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ double d) /*!< in: double */
+{
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*) &d;
+
+ for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+ b[i] = ptr[sizeof(double) - i - 1];
+#else
+ b[i] = ptr[i];
+#endif
+ }
+}
+
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format.
+@return float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ float d;
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*) &d;
+
+ for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+ ptr[sizeof(float) - i - 1] = b[i];
+#else
+ ptr[i] = b[i];
+#endif
+ }
+
+ return(d);
+}
+
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ float d) /*!< in: float */
+{
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*) &d;
+
+ for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+ b[i] = ptr[sizeof(float) - i - 1];
+#else
+ b[i] = ptr[i];
+#endif
+ }
+}
+
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+ const byte* buf, /*!< in: from where to read */
+ ulint buf_size) /*!< in: from how many bytes to read */
+{
+ ulint n = 0;
+ const byte* ptr;
+
+ ut_ad(buf_size > 0);
+
+ ptr = buf + buf_size;
+
+ for (;;) {
+ ptr--;
+
+ n = n << 8;
+
+ n += (ulint)(*ptr);
+
+ if (ptr == buf) {
+ break;
+ }
+ }
+
+ return(n);
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint dest_size, /*!< in: into how many bytes to write */
+ ulint n) /*!< in: unsigned long int to write */
+{
+ byte* end;
+
+ ut_ad(dest_size <= sizeof(ulint));
+ ut_ad(dest_size > 0);
+
+ end = dest + dest_size;
+
+ for (;;) {
+ *dest = (byte)(n & 0xFF);
+
+ n = n >> 8;
+
+ dest++;
+
+ if (dest == end) {
+ break;
+ }
+ }
+
+ ut_ad(n == 0);
+}
+
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+ const byte* buf) /*!< in: from where to read */
+{
+ return((ulint)(buf[0]) | ((ulint)(buf[1]) << 8));
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint n) /*!< in: unsigned long int to write */
+{
+ ut_ad(n < 256 * 256);
+
+ *dest = (byte)(n & 0xFFUL);
+
+ n = n >> 8;
+ dest++;
+
+ *dest = (byte)(n & 0xFFUL);
+}
+
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order.
+@return integer value */
+UNIV_INLINE
+ib_uint64_t
+mach_read_int_type(
+/*===============*/
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ ibool unsigned_type) /*!< in: signed or unsigned flag */
+{
+ /* XXX this can be optimized on big-endian machines */
+
+ uintmax_t ret;
+ uint i;
+
+ if (unsigned_type || (src[0] & 0x80)) {
+
+ ret = 0x0000000000000000ULL;
+ } else {
+
+ ret = 0xFFFFFFFFFFFFFF00ULL;
+ }
+
+ if (unsigned_type) {
+
+ ret |= src[0];
+ } else {
+
+ ret |= src[0] ^ 0x80;
+ }
+
+ for (i = 1; i < len; i++) {
+ ret <<= 8;
+ ret |= src[i];
+ }
+
+ return(ret);
+}
+/*********************************************************//**
+Swap byte ordering. */
+UNIV_INLINE
+void
+mach_swap_byte_order(
+/*=================*/
+ byte* dest, /*!< out: where to write */
+ const byte* from, /*!< in: where to read from */
+ ulint len) /*!< in: length of src */
+{
+ ut_ad(len > 0);
+ ut_ad(len <= 8);
+
+ dest += len;
+
+ switch (len & 0x7) {
+ case 0: *--dest = *from++; /* fall through */
+ case 7: *--dest = *from++; /* fall through */
+ case 6: *--dest = *from++; /* fall through */
+ case 5: *--dest = *from++; /* fall through */
+ case 4: *--dest = *from++; /* fall through */
+ case 3: *--dest = *from++; /* fall through */
+ case 2: *--dest = *from++; /* fall through */
+ case 1: *--dest = *from;
+ }
+}
+
+/*************************************************************
+Convert a ulonglong integer from host byte order to (big-endian)
+storage byte order. */
+UNIV_INLINE
+void
+mach_write_ulonglong(
+/*=================*/
+ byte* dest, /*!< in: where to write */
+ ulonglong src, /*!< in: where to read from */
+ ulint len, /*!< in: length of dest */
+ bool usign) /*!< in: signed or unsigned flag */
+{
+ byte* ptr = reinterpret_cast<byte*>(&src);
+
+ ut_ad(len <= sizeof(ulonglong));
+
+#ifdef WORDS_BIGENDIAN
+ memcpy(dest, ptr + (sizeof(src) - len), len);
+#else
+ mach_swap_byte_order(dest, ptr, len);
+#endif /* WORDS_BIGENDIAN */
+
+ if (!usign) {
+ *dest ^= 0x80;
+ }
+}
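
A hedged round-trip sketch for the two helpers above: storing a signed value flips its sign bit, so the stored big-endian bytes compare with memcmp() in the same order as the signed values themselves.

/* Hedged sketch: round-trip a signed value through storage byte order. */
byte stored[4];
mach_write_ulonglong(stored, ulonglong(longlong(-5)), 4, false);
ut_ad(longlong(mach_read_int_type(stored, 4, false)) == -5);
/* stored(-5) < stored(0) < stored(5) holds bytewise as well */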
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h
new file mode 100644
index 00000000..b7fd9c09
--- /dev/null
+++ b/storage/innobase/include/mem0mem.h
@@ -0,0 +1,345 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0mem.h
+The memory management
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef mem0mem_h
+#define mem0mem_h
+
+#include "ut0mem.h"
+#include "ut0rnd.h"
+#include "mach0data.h"
+
+#include <memory>
+
+/* -------------------- MEMORY HEAPS ----------------------------- */
+
+/** A block of a memory heap consists of the info structure
+followed by an area of memory */
+typedef struct mem_block_info_t mem_block_t;
+
+/** A memory heap is a nonempty linear list of memory blocks */
+typedef mem_block_t mem_heap_t;
+
+/** Types of allocation for memory heaps: DYNAMIC means allocation from the
+dynamic memory pool of the C runtime, BUFFER means allocation from the
+buffer pool; the latter method is used for very big heaps */
+
+#define MEM_HEAP_DYNAMIC 0 /* the most common type */
+#define MEM_HEAP_BUFFER 1
+#define MEM_HEAP_BTR_SEARCH 2 /* this flag can optionally be
+ ORed with MEM_HEAP_BUFFER, in which
+ case heap->free_block is used in
+ some cases for memory allocations,
+ and if it's NULL, the memory
+ allocation functions can return
+ NULL. */
+
+/** Different types of heaps in terms of which data structure uses them */
+#define MEM_HEAP_FOR_BTR_SEARCH (MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER)
+#define MEM_HEAP_FOR_LOCK_HEAP (MEM_HEAP_BUFFER)
+
+/** The following start size is used for the first block in the memory heap if
+the size is not specified, i.e., 0 is given as the parameter in the call of
+create. The standard size is the maximum (payload) size of the blocks used for
+allocations of small buffers. */
+
+#define MEM_BLOCK_START_SIZE 64
+#define MEM_BLOCK_STANDARD_SIZE \
+ (srv_page_size >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF)
+
+/** If a memory heap is allowed to grow into the buffer pool, the following
+is the maximum size for a single allocated buffer: */
+#define MEM_MAX_ALLOC_IN_BUF (srv_page_size - 200 + REDZONE_SIZE)
+
+/** Space needed when allocating a field of length N for a user.
+The space is allocated only in multiples of UNIV_MEM_ALIGNMENT. */
+#define MEM_SPACE_NEEDED(N) UT_CALC_ALIGN((N), UNIV_MEM_ALIGNMENT)
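
For example, assuming the typical UNIV_MEM_ALIGNMENT of 8, the macro rounds each request up to the next multiple of the alignment:

/* Hedged sketch, assuming UNIV_MEM_ALIGNMENT == 8:
   MEM_SPACE_NEEDED(1)  == 8
   MEM_SPACE_NEEDED(8)  == 8
   MEM_SPACE_NEEDED(13) == 16 */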
+
+#ifdef UNIV_DEBUG
+/** Macro for memory heap creation.
+@param[in] size Desired start block size. */
+# define mem_heap_create(size) \
+ mem_heap_create_func((size), __FILE__, __LINE__, MEM_HEAP_DYNAMIC)
+
+/** Macro for memory heap creation.
+@param[in] size Desired start block size.
+@param[in] type Heap type */
+# define mem_heap_create_typed(size, type) \
+ mem_heap_create_func((size), __FILE__, __LINE__, (type))
+
+#else /* UNIV_DEBUG */
+/** Macro for memory heap creation.
+@param[in] size Desired start block size. */
+# define mem_heap_create(size) mem_heap_create_func((size), MEM_HEAP_DYNAMIC)
+
+/** Macro for memory heap creation.
+@param[in] size Desired start block size.
+@param[in] type Heap type */
+# define mem_heap_create_typed(size, type) \
+ mem_heap_create_func((size), (type))
+
+#endif /* UNIV_DEBUG */
+
+/** Creates a memory heap.
+NOTE: Use the corresponding macros instead of this function.
+A single user buffer of 'size' will fit in the block.
+0 creates a default size block.
+@param[in] size Desired start block size.
+@param[in] file_name File name where created
+@param[in] line Line where created
+@param[in] type Heap type
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+ ulint size,
+#ifdef UNIV_DEBUG
+ const char* file_name,
+ unsigned line,
+#endif /* UNIV_DEBUG */
+ ulint type);
+
+/** Frees the space occupied by a memory heap.
+NOTE: Use the corresponding macro instead of this function.
+@param[in] heap Heap to be freed */
+UNIV_INLINE
+void
+mem_heap_free(
+ mem_heap_t* heap);
+
+/** Allocates and zero-fills n bytes of memory from a memory heap.
+@param[in] heap memory heap
+@param[in] n number of bytes; if the heap is allowed to grow into
+the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+ mem_heap_t* heap,
+ ulint n);
+
+/** Allocates n bytes of memory from a memory heap.
+@param[in] heap memory heap
+@param[in] n number of bytes; if the heap is allowed to grow into
+the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+ mem_heap_t* heap,
+ ulint n);
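+
+/* A minimal sketch of the heap life cycle (illustrative only; the sizes
+are arbitrary):
+
+	mem_heap_t*	heap = mem_heap_create(1024);
+	byte*		b = static_cast<byte*>(mem_heap_alloc(heap, 100));
+	void*		z = mem_heap_zalloc(heap, 50);
+	// ... use b and z; individual allocations are never freed ...
+	mem_heap_free(heap);	// releases all blocks of the heap at once
+*/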
+
+/** Returns a pointer to the heap top.
+@param[in] heap memory heap
+@return pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+ mem_heap_t* heap);
+
+/** Frees the space in a memory heap exceeding the pointer given.
+The pointer must have been acquired from mem_heap_get_heap_top.
+The first memory block of the heap is not freed.
+@param[in] heap heap from which to free
+@param[in] old_top pointer to old top of heap */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+ mem_heap_t* heap,
+ byte* old_top);
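+
+/* A sketch of the mark/release pattern these two functions enable
+(illustrative only):
+
+	byte*	top = mem_heap_get_heap_top(heap);
+	// ... allocate temporary scratch memory from heap ...
+	mem_heap_free_heap_top(heap, top);	// roll the heap back to 'top'
+*/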
+
+/** Empties a memory heap.
+The first memory block of the heap is not freed.
+@param[in] heap heap to empty */
+UNIV_INLINE
+void
+mem_heap_empty(
+ mem_heap_t* heap);
+
+/** Returns a pointer to the topmost element in a memory heap.
+The size of the element must be given.
+@param[in] heap memory heap
+@param[in] n size of the topmost element
+@return pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+ mem_heap_t* heap,
+ ulint n);
+
+/*****************************************************************//**
+Frees the topmost element in a memory heap.
+The size of the element must be given. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: size of the topmost element */
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+ mem_heap_t* heap); /*!< in: heap */
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string.
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE
+char*
+mem_strdup(
+/*=======*/
+ const char* str); /*!< in: string to be copied */
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string.
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+ const char* str, /*!< in: string to be copied */
+ ulint len); /*!< in: length of str, in bytes */
+
+/** Duplicate a block of data, allocated from a memory heap.
+@param[in] heap memory heap where string is allocated
+@param[in] data block of data to be copied
+@param[in] len length of data, in bytes
+@return own: a copy of data */
+inline
+void*
+mem_heap_dup(mem_heap_t* heap, const void* data, size_t len)
+{
+ ut_ad(data || !len);
+ return UNIV_LIKELY(data != NULL)
+ ? memcpy(mem_heap_alloc(heap, len), data, len)
+ : NULL;
+}
+
+/** Duplicate a NUL-terminated string, allocated from a memory heap.
+@param[in] heap memory heap where string is allocated
+@param[in] str string to be copied
+@return own: a copy of the string */
+inline
+char*
+mem_heap_strdup(mem_heap_t* heap, const char* str)
+{
+ return(static_cast<char*>(mem_heap_dup(heap, str, strlen(str) + 1)));
+}
+
+/** Duplicate a string, allocated from a memory heap.
+@param[in] heap memory heap where string is allocated
+@param[in] str string to be copied
+@param[in] len length of str, in bytes
+@return own: a NUL-terminated copy of str */
+inline
+char*
+mem_heap_strdupl(mem_heap_t* heap, const char* str, size_t len)
+{
+ char* s = static_cast<char*>(mem_heap_alloc(heap, len + 1));
+ s[len] = 0;
+ return(static_cast<char*>(memcpy(s, str, len)));
+}
+
+/**********************************************************************//**
+Concatenate two strings and return the result, using a memory heap.
+@return own: the result */
+char*
+mem_heap_strcat(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* s1, /*!< in: string 1 */
+ const char* s2); /*!< in: string 2 */
+
+/****************************************************************//**
+A simple sprintf replacement that dynamically allocates the space for the
+formatted string from the given heap. This supports a very limited set of
+the printf syntax: types 's' and 'u' and length modifier 'l' (which is
+required for the 'u' type).
+@return heap-allocated formatted string */
+char*
+mem_heap_printf(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* format, /*!< in: format string */
+ ...) MY_ATTRIBUTE ((format (printf, 2, 3)));
+
+#ifdef UNIV_DEBUG
+/** Validates the contents of a memory heap.
+Asserts that the memory heap is consistent
+@param[in] heap Memory heap to validate */
+void
+mem_heap_validate(
+ const mem_heap_t* heap);
+
+#endif /* UNIV_DEBUG */
+
+/*#######################################################################*/
+
+/** The info structure stored at the beginning of a heap block */
+struct mem_block_info_t {
+#ifdef UNIV_DEBUG
+	char file_name[8]; /*!< file name where the mem heap was created */
+ unsigned line; /*!< line number where the mem heap was created */
+#endif /* UNIV_DEBUG */
+	UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block in the
+			list this is the base node of the list of blocks;
+			in subsequent blocks this is undefined */
+ UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next
+ and prev in the list. The first block allocated
+ to the heap is also the first block in this list,
+ though it also contains the base node of the list. */
+ ulint len; /*!< physical length of this block in bytes */
+ ulint total_size; /*!< physical length in bytes of all blocks
+ in the heap. This is defined only in the base
+ node and is set to ULINT_UNDEFINED in others. */
+	ulint	type;	/*!< type of heap: MEM_HEAP_DYNAMIC, or
+			MEM_HEAP_BUFFER possibly ORed to MEM_HEAP_BTR_SEARCH */
+ ulint free; /*!< offset in bytes of the first free position for
+ user data in the block */
+ ulint start; /*!< the value of the struct field 'free' at the
+ creation of the block */
+
+ void* free_block;
+ /* if the MEM_HEAP_BTR_SEARCH bit is set in type,
+ and this is the heap root, this can contain an
+ allocated buffer frame, which can be appended as a
+ free block to the heap, if we need more space;
+ otherwise, this is NULL */
+ void* buf_block;
+ /* if this block has been allocated from the buffer
+ pool, this contains the buf_block_t handle;
+ otherwise, this is NULL */
+};
+
+/* Header size for a memory heap block */
+#define MEM_BLOCK_HEADER_SIZE UT_CALC_ALIGN(sizeof(mem_block_info_t),\
+ UNIV_MEM_ALIGNMENT)
+
+#include "mem0mem.ic"
+#endif
diff --git a/storage/innobase/include/mem0mem.ic b/storage/innobase/include/mem0mem.ic
new file mode 100644
index 00000000..9236bbef
--- /dev/null
+++ b/storage/innobase/include/mem0mem.ic
@@ -0,0 +1,466 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0mem.ic
+Memory management
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef UNIV_DEBUG
+# define mem_heap_create_block(heap, n, type, file_name, line) \
+ mem_heap_create_block_func(heap, n, file_name, line, type)
+# define mem_heap_create_at(N, file_name, line) \
+ mem_heap_create_func(N, file_name, line, MEM_HEAP_DYNAMIC)
+#else /* UNIV_DEBUG */
+# define mem_heap_create_block(heap, n, type, file_name, line) \
+ mem_heap_create_block_func(heap, n, type)
+# define mem_heap_create_at(N, file_name, line) \
+ mem_heap_create_func(N, MEM_HEAP_DYNAMIC)
+#endif /* UNIV_DEBUG */
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+@return own: memory heap block, NULL if did not succeed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_create_block_func(
+/*=======================*/
+ mem_heap_t* heap, /*!< in: memory heap or NULL if first block
+ should be created */
+ ulint n, /*!< in: number of bytes needed for user data */
+#ifdef UNIV_DEBUG
+ const char* file_name,/*!< in: file name where created */
+ unsigned line, /*!< in: line where created */
+#endif /* UNIV_DEBUG */
+ ulint type); /*!< in: type of heap: MEM_HEAP_DYNAMIC or
+ MEM_HEAP_BUFFER */
+
+/******************************************************************//**
+Frees a block from a memory heap. */
+void
+mem_heap_block_free(
+/*================*/
+ mem_heap_t* heap, /*!< in: heap */
+ mem_block_t* block); /*!< in: block to free */
+
+/******************************************************************//**
+Frees the free_block field from a memory heap. */
+void
+mem_heap_free_block_free(
+/*=====================*/
+ mem_heap_t* heap); /*!< in: heap */
+
+/***************************************************************//**
+Adds a new block to a memory heap.
+@param[in] heap memory heap
+@param[in] n number of bytes needed
+@return created block, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_add_block(
+ mem_heap_t* heap,
+ ulint n);
+
+UNIV_INLINE
+void
+mem_block_set_len(mem_block_t* block, ulint len)
+{
+ ut_ad(len > 0);
+
+ block->len = len;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_len(mem_block_t* block)
+{
+ return(block->len);
+}
+
+UNIV_INLINE
+void
+mem_block_set_type(mem_block_t* block, ulint type)
+{
+ ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+ || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+
+ block->type = type;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_type(mem_block_t* block)
+{
+ return(block->type);
+}
+
+UNIV_INLINE
+void
+mem_block_set_free(mem_block_t* block, ulint free)
+{
+ ut_ad(free > 0);
+ ut_ad(free <= mem_block_get_len(block));
+
+ block->free = free;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_free(mem_block_t* block)
+{
+ return(block->free);
+}
+
+UNIV_INLINE
+void
+mem_block_set_start(mem_block_t* block, ulint start)
+{
+ ut_ad(start > 0);
+
+ block->start = start;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_start(mem_block_t* block)
+{
+ return(block->start);
+}
+
+/** Allocates and zero-fills n bytes of memory from a memory heap.
+@param[in] heap memory heap
+@param[in] n number of bytes; if the heap is allowed to grow into
+the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+ mem_heap_t* heap,
+ ulint n)
+{
+ ut_ad(heap);
+ ut_ad(!(heap->type & MEM_HEAP_BTR_SEARCH));
+ return(memset(mem_heap_alloc(heap, n), 0, n));
+}
+
+/** Allocates n bytes of memory from a memory heap.
+@param[in] heap memory heap
+@param[in] n number of bytes; if the heap is allowed to grow into
+the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+ mem_heap_t* heap,
+ ulint n)
+{
+ mem_block_t* block;
+ byte* buf;
+ ulint free;
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ n += REDZONE_SIZE;
+
+ ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF));
+
+	/* Check if there is enough space in the block. If not, add a new
+	block to the heap */
+
+ if (mem_block_get_len(block)
+ < mem_block_get_free(block) + MEM_SPACE_NEEDED(n)) {
+
+ block = mem_heap_add_block(heap, n);
+
+ if (block == NULL) {
+
+ return(NULL);
+ }
+ }
+
+ free = mem_block_get_free(block);
+
+ buf = (byte*) block + free;
+
+ mem_block_set_free(block, free + MEM_SPACE_NEEDED(n));
+
+ buf = buf + REDZONE_SIZE;
+ MEM_MAKE_ADDRESSABLE(buf, n - REDZONE_SIZE);
+ return(buf);
+}
+
+/** Returns a pointer to the heap top.
+@param[in] heap memory heap
+@return pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+ mem_heap_t* heap)
+{
+ mem_block_t* block;
+ byte* buf;
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ buf = (byte*) block + mem_block_get_free(block);
+
+ return(buf);
+}
+
+/** Frees the space in a memory heap exceeding the pointer given.
+The pointer must have been acquired from mem_heap_get_heap_top.
+The first memory block of the heap is not freed.
+@param[in] heap heap from which to free
+@param[in] old_top pointer to old top of heap */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+ mem_heap_t* heap,
+ byte* old_top)
+{
+ mem_block_t* block;
+ mem_block_t* prev_block;
+
+ ut_d(mem_heap_validate(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ while (block != NULL) {
+ if (((byte*) block + mem_block_get_free(block) >= old_top)
+ && ((byte*) block <= old_top)) {
+ /* Found the right block */
+
+ break;
+ }
+
+ /* Store prev_block value before freeing the current block
+ (the current block will be erased in freeing) */
+
+ prev_block = UT_LIST_GET_PREV(list, block);
+
+ mem_heap_block_free(heap, block);
+
+ block = prev_block;
+ }
+
+ ut_ad(block);
+
+ /* Set the free field of block */
+ mem_block_set_free(block,
+ ulint(old_top - reinterpret_cast<byte*>(block)));
+
+ ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+ MEM_NOACCESS(old_top, (byte*) block + block->len - old_top);
+
+ /* If free == start, we may free the block if it is not the first
+ one */
+
+ if ((heap != block) && (mem_block_get_free(block)
+ == mem_block_get_start(block))) {
+ mem_heap_block_free(heap, block);
+ }
+}
+
+/** Empties a memory heap.
+The first memory block of the heap is not freed.
+@param[in] heap heap to empty */
+UNIV_INLINE
+void
+mem_heap_empty(
+ mem_heap_t* heap)
+{
+ mem_heap_free_heap_top(heap, (byte*) heap + mem_block_get_start(heap));
+
+ if (heap->free_block) {
+ mem_heap_free_block_free(heap);
+ }
+}
+
+/** Returns a pointer to the topmost element in a memory heap.
+The size of the element must be given.
+@param[in] heap memory heap
+@param[in] n size of the topmost element
+@return pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+ mem_heap_t* heap,
+ ulint n)
+{
+ mem_block_t* block;
+ byte* buf;
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ buf = (byte*) block + mem_block_get_free(block) - MEM_SPACE_NEEDED(n);
+
+ return((void*) buf);
+}
+
+/*****************************************************************//**
+Frees the topmost element in a memory heap. The size of the element must be
+given. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: size of the topmost element */
+{
+ mem_block_t* block;
+
+ n += REDZONE_SIZE;
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ /* Subtract the free field of block */
+ mem_block_set_free(block, mem_block_get_free(block)
+ - MEM_SPACE_NEEDED(n));
+
+ /* If free == start, we may free the block if it is not the first
+ one */
+
+ if ((heap != block) && (mem_block_get_free(block)
+ == mem_block_get_start(block))) {
+ mem_heap_block_free(heap, block);
+ } else {
+ MEM_NOACCESS((byte*) block + mem_block_get_free(block), n);
+ }
+}
+
+/** Creates a memory heap.
+NOTE: Use the corresponding macros instead of this function.
+A single user buffer of 'size' will fit in the block.
+0 creates a default size block.
+@param[in] size Desired start block size.
+@param[in] file_name File name where created
+@param[in] line Line where created
+@param[in] type Heap type
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+ ulint size,
+#ifdef UNIV_DEBUG
+ const char* file_name,
+ unsigned line,
+#endif /* UNIV_DEBUG */
+ ulint type)
+{
+ mem_block_t* block;
+
+ if (!size) {
+ size = MEM_BLOCK_START_SIZE;
+ }
+
+ block = mem_heap_create_block(NULL, size, type, file_name, line);
+
+ if (block == NULL) {
+
+ return(NULL);
+ }
+
+	/* The first block should not be in the buffer pool, because it
+	might be relocated when the buffer pool is resized. */
+ ut_ad(block->buf_block == NULL);
+
+ UT_LIST_INIT(block->base, &mem_block_t::list);
+
+ /* Add the created block itself as the first block in the list */
+ UT_LIST_ADD_FIRST(block->base, block);
+
+ return(block);
+}
+
+/** Frees the space occupied by a memory heap.
+NOTE: Use the corresponding macro instead of this function.
+@param[in] heap Heap to be freed */
+UNIV_INLINE
+void
+mem_heap_free(
+ mem_heap_t* heap)
+{
+ mem_block_t* block;
+ mem_block_t* prev_block;
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ if (heap->free_block) {
+ mem_heap_free_block_free(heap);
+ }
+
+ while (block != NULL) {
+		/* Store the prev_block pointer before freeing the current
+		block (its contents are erased in freeing) */
+
+ prev_block = UT_LIST_GET_PREV(list, block);
+
+ mem_heap_block_free(heap, block);
+
+ block = prev_block;
+ }
+}
+
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+ mem_heap_t* heap) /*!< in: heap */
+{
+ ulint size = heap->total_size;
+
+ if (heap->free_block) {
+ size += srv_page_size;
+ }
+
+ return(size);
+}
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string.
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE
+char*
+mem_strdup(
+/*=======*/
+ const char* str) /*!< in: string to be copied */
+{
+ ulint len = strlen(str) + 1;
+ return(static_cast<char*>(memcpy(ut_malloc_nokey(len), str, len)));
+}
+
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string.
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+ const char* str, /*!< in: string to be copied */
+ ulint len) /*!< in: length of str, in bytes */
+{
+ char* s = static_cast<char*>(ut_malloc_nokey(len + 1));
+ s[len] = 0;
+ return(static_cast<char*>(memcpy(s, str, len)));
+}
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
new file mode 100644
index 00000000..0d83d83b
--- /dev/null
+++ b/storage/innobase/include/mtr0log.h
@@ -0,0 +1,673 @@
+/*****************************************************************************
+
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+@file include/mtr0log.h
+Mini-transaction log record encoding and decoding
+*******************************************************/
+
+#pragma once
+#include "mtr0mtr.h"
+
+/** The minimum 2-byte integer (0b10xxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_2BYTE= 1 << 7;
+/** The minimum 3-byte integer (0b110xxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_3BYTE= MIN_2BYTE + (1 << 14);
+/** The minimum 4-byte integer (0b1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_4BYTE= MIN_3BYTE + (1 << 21);
+/** Minimum 5-byte integer (0b11110000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_5BYTE= MIN_4BYTE + (1 << 28);
+
+/** Error from mlog_decode_varint() */
+constexpr uint32_t MLOG_DECODE_ERROR= ~0U;
+
+/** Decode the length of a variable-length encoded integer.
+@param first first byte of the encoded integer
+@return the length, in bytes */
+inline uint8_t mlog_decode_varint_length(byte first)
+{
+ uint8_t len= 1;
+ for (; first & 0x80; len++, first= static_cast<uint8_t>(first << 1));
+ return len;
+}
+
+/** Decode an integer in a redo log record.
+@param log redo log record buffer
+@return the decoded integer
+@retval MLOG_DECODE_ERROR on error */
+inline uint32_t mlog_decode_varint(const byte* log)
+{
+ uint32_t i= *log;
+ if (i < MIN_2BYTE)
+ return i;
+ if (i < 0xc0)
+ return MIN_2BYTE + ((i & ~0x80) << 8 | log[1]);
+ if (i < 0xe0)
+ return MIN_3BYTE + ((i & ~0xc0) << 16 | uint32_t{log[1]} << 8 | log[2]);
+ if (i < 0xf0)
+ return MIN_4BYTE + ((i & ~0xe0) << 24 | uint32_t{log[1]} << 16 |
+ uint32_t{log[2]} << 8 | log[3]);
+ if (i == 0xf0)
+ {
+ i= uint32_t{log[1]} << 24 | uint32_t{log[2]} << 16 |
+ uint32_t{log[3]} << 8 | log[4];
+ if (i <= ~MIN_5BYTE)
+ return MIN_5BYTE + i;
+ }
+ return MLOG_DECODE_ERROR;
+}
+
+/** Encode an integer in a redo log record.
+@param log redo log record buffer
+@param i the integer to encode
+@return end of the encoded integer */
+inline byte *mlog_encode_varint(byte *log, size_t i)
+{
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+ if (i < MIN_2BYTE)
+ {
+ }
+ else if (i < MIN_3BYTE)
+ {
+ i-= MIN_2BYTE;
+ static_assert(MIN_3BYTE - MIN_2BYTE == 1 << 14, "compatibility");
+ *log++= 0x80 | static_cast<byte>(i >> 8);
+ }
+ else if (i < MIN_4BYTE)
+ {
+ i-= MIN_3BYTE;
+ static_assert(MIN_4BYTE - MIN_3BYTE == 1 << 21, "compatibility");
+ *log++= 0xc0 | static_cast<byte>(i >> 16);
+ goto last2;
+ }
+ else if (i < MIN_5BYTE)
+ {
+ i-= MIN_4BYTE;
+ static_assert(MIN_5BYTE - MIN_4BYTE == 1 << 28, "compatibility");
+ *log++= 0xe0 | static_cast<byte>(i >> 24);
+ goto last3;
+ }
+ else
+ {
+ ut_ad(i < MLOG_DECODE_ERROR);
+ i-= MIN_5BYTE;
+ *log++= 0xf0;
+ *log++= static_cast<byte>(i >> 24);
+last3:
+ *log++= static_cast<byte>(i >> 16);
+last2:
+ *log++= static_cast<byte>(i >> 8);
+ }
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ *log++= static_cast<byte>(i);
+ return log;
+}
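+
+/* A worked example of the encoding (the value 300 is chosen for
+illustration): 300 >= MIN_2BYTE, so mlog_encode_varint() subtracts
+MIN_2BYTE, leaving 172 = 0x00ac, and writes the 0b10 tag into the first
+byte: 0x80 0xac. mlog_decode_varint() reverses this: 0x80 is below 0xc0,
+so it returns MIN_2BYTE + ((0x80 & ~0x80) << 8 | 0xac) = 128 + 172 = 300. */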
+
+/** Determine the length of a log record.
+@param log start of log record
+@param end end of the log record buffer
+@return the length of the record, in bytes
+@retval 0 if the log extends past the end
+@retval MLOG_DECODE_ERROR if the record is corrupted */
+inline uint32_t mlog_decode_len(const byte *log, const byte *end)
+{
+ ut_ad(log < end);
+ uint32_t i= *log;
+ if (!i)
+ return 0; /* end of mini-transaction */
+ if (~i & 15)
+ return (i & 15) + 1; /* 1..16 bytes */
+ if (UNIV_UNLIKELY(++log == end))
+ return 0; /* end of buffer */
+ i= *log;
+ if (UNIV_LIKELY(i < MIN_2BYTE)) /* 1 additional length byte: 16..143 bytes */
+ return 16 + i;
+ if (i < 0xc0) /* 2 additional length bytes: 144..16,527 bytes */
+ {
+ if (UNIV_UNLIKELY(log + 1 == end))
+ return 0; /* end of buffer */
+ return 16 + MIN_2BYTE + ((i & ~0xc0) << 8 | log[1]);
+ }
+  if (i < 0xe0) /* 3 additional length bytes: 16,528..1,065,103 bytes */
+ {
+ if (UNIV_UNLIKELY(log + 2 == end))
+ return 0; /* end of buffer */
+ return 16 + MIN_3BYTE + ((i & ~0xe0) << 16 |
+ static_cast<uint32_t>(log[1]) << 8 | log[2]);
+ }
+ /* 1,065,103 bytes per log record ought to be enough for everyone */
+ return MLOG_DECODE_ERROR;
+}
+
+/** Write 1, 2, 4, or 8 bytes to a file page.
+@param[in] block file page
+@param[in,out] ptr pointer in file page
+@param[in] val value to write
+@tparam l number of bytes to write
+@tparam w write request type
+@tparam V type of val
+@return whether any log was written */
+template<unsigned l,mtr_t::write_type w,typename V>
+inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val)
+{
+ ut_ad(ut_align_down(ptr, srv_page_size) == block.frame);
+ static_assert(l == 1 || l == 2 || l == 4 || l == 8, "wrong length");
+ byte buf[l];
+
+ switch (l) {
+ case 1:
+ ut_ad(val == static_cast<byte>(val));
+ buf[0]= static_cast<byte>(val);
+ break;
+ case 2:
+ ut_ad(val == static_cast<uint16_t>(val));
+ mach_write_to_2(buf, static_cast<uint16_t>(val));
+ break;
+ case 4:
+ ut_ad(val == static_cast<uint32_t>(val));
+ mach_write_to_4(buf, static_cast<uint32_t>(val));
+ break;
+ case 8:
+ mach_write_to_8(buf, val);
+ break;
+ }
+ byte *p= static_cast<byte*>(ptr);
+ const byte *const end= p + l;
+ if (w != FORCED && m_log_mode == MTR_LOG_ALL)
+ {
+ const byte *b= buf;
+ while (*p++ == *b++)
+ {
+ if (p == end)
+ {
+ ut_ad(w == MAYBE_NOP);
+ return false;
+ }
+ }
+ p--;
+ }
+ ::memcpy(ptr, buf, l);
+ memcpy_low(block, static_cast<uint16_t>
+ (ut_align_offset(p, srv_page_size)), p, end - p);
+ return true;
+}
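+
+/* Usage sketch (illustrative; 'mtr', 'block' and 'ptr' stand for a
+mini-transaction, a latched page and a pointer into its frame):
+
+	// log and apply a 2-byte write, but generate no log record (and
+	// do not touch the page) if the bytes already hold the value
+	mtr->write<2,mtr_t::MAYBE_NOP>(*block, ptr, 0x1234U);
+*/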
+
+/** Log an initialization of a string of bytes.
+@param[in] b buffer page
+@param[in] ofs byte offset from b->frame
+@param[in] len length of the data to write
+@param[in] val the data byte to write */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, ulint len, byte val)
+{
+ ut_ad(len);
+ set_modified(b);
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+
+ static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+ size_t lenlen= (len < MIN_2BYTE ? 1 + 1 : len < MIN_3BYTE ? 2 + 1 : 3 + 1);
+ byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen, true, ofs);
+ l= mlog_encode_varint(l, len);
+ *l++= val;
+ m_log.close(l);
+ m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
+/** Initialize a string of bytes.
+@param[in,out] b buffer page
+@param[in] ofs byte offset from block->frame
+@param[in] len length of the data to write
+@param[in] val the data byte to write */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, ulint len, byte val)
+{
+ ut_ad(ofs <= ulint(srv_page_size));
+ ut_ad(ofs + len <= ulint(srv_page_size));
+ ::memset(ofs + b->frame, val, len);
+ memset(*b, ofs, len, val);
+}
+
+/** Log an initialization of a repeating string of bytes.
+@param[in] b buffer page
+@param[in] ofs byte offset from b->frame
+@param[in] len length of the data to write, in bytes
+@param[in] str the string to write
+@param[in] size size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, size_t len,
+ const void *str, size_t size)
+{
+ ut_ad(size);
+ ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+ set_modified(b);
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+
+ static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+ size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
+ byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen + size, true, ofs);
+ l= mlog_encode_varint(l, len);
+ ::memcpy(l, str, size);
+ l+= size;
+ m_log.close(l);
+ m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
+/** Initialize a repeating string of bytes.
+@param[in,out] b buffer page
+@param[in] ofs byte offset from b->frame
+@param[in] len length of the data to write, in bytes
+@param[in] str the string to write
+@param[in] size size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, size_t len,
+ const void *str, size_t size)
+{
+ ut_ad(ofs <= ulint(srv_page_size));
+ ut_ad(ofs + len <= ulint(srv_page_size));
+ ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+ size_t s= 0;
+  while (s < len - size)
+  {
+    ::memcpy(ofs + s + b->frame, str, size);
+    s+= size;
+ }
+ ::memcpy(ofs + s + b->frame, str, len - s);
+ memset(*b, ofs, len, str, size);
+}
+
+/** Log a write of a byte string to a page.
+@param[in] b buffer page
+@param[in] offset byte offset from b->frame
+@param[in] len length of the data to write */
+inline void mtr_t::memcpy(const buf_block_t &b, ulint offset, ulint len)
+{
+ ut_ad(len);
+ ut_ad(offset <= ulint(srv_page_size));
+ ut_ad(offset + len <= ulint(srv_page_size));
+ memcpy_low(b, uint16_t(offset), &b.frame[offset], len);
+}
+
+/** Log a write of a byte string to a page.
+@param block page
+@param offset byte offset within page
+@param data data to be written
+@param len length of the data, in bytes */
+inline void mtr_t::memcpy_low(const buf_block_t &block, uint16_t offset,
+ const void *data, size_t len)
+{
+ ut_ad(len);
+ set_modified(block);
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+ if (len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5))
+ {
+ byte *end= log_write<WRITE>(block.page.id(), &block.page, len, true,
+ offset);
+ ::memcpy(end, data, len);
+ m_log.close(end + len);
+ }
+ else
+ {
+ m_log.close(log_write<WRITE>(block.page.id(), &block.page, len, false,
+ offset));
+ m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len));
+ }
+ m_last_offset= static_cast<uint16_t>(offset + len);
+}
+
+/** Log that a string of bytes was copied from the same page.
+@param[in] b buffer page
+@param[in] d destination offset within the page
+@param[in] s source offset within the page
+@param[in] len length of the data to copy */
+inline void mtr_t::memmove(const buf_block_t &b, ulint d, ulint s, ulint len)
+{
+ ut_ad(d >= 8);
+ ut_ad(s >= 8);
+ ut_ad(len);
+ ut_ad(s <= ulint(srv_page_size));
+ ut_ad(s + len <= ulint(srv_page_size));
+ ut_ad(s != d);
+ ut_ad(d <= ulint(srv_page_size));
+ ut_ad(d + len <= ulint(srv_page_size));
+
+ set_modified(b);
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+ static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+ size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
+ /* The source offset is encoded relative to the destination offset,
+ with the sign in the least significant bit. */
+ if (s > d)
+ s= (s - d) << 1;
+ else
+ s= (d - s) << 1 | 1;
+ /* The source offset 0 is not possible. */
+ s-= 1 << 1;
+ size_t slen= (s < MIN_2BYTE ? 1 : s < MIN_3BYTE ? 2 : 3);
+ byte *l= log_write<MEMMOVE>(b.page.id(), &b.page, lenlen + slen, true, d);
+ l= mlog_encode_varint(l, len);
+ l= mlog_encode_varint(l, s);
+ m_log.close(l);
+ m_last_offset= static_cast<uint16_t>(d + len);
+}
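+
+/* A worked example of the relative encoding above (offsets chosen for
+illustration): for d=100, s=50 the value stored is
+((d - s) << 1 | 1) - 2 = 99. Decoding adds the bias of 2 back, and the set
+least significant bit of 101 indicates s = d - (101 >> 1) = 50. */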
+
+/**
+Write a log record.
+@tparam type redo log record type
+@param id persistent page identifier
+@param bpage buffer pool page, or nullptr
+@param len number of additional bytes to write
+@param alloc whether to allocate the additional bytes
+@param offset byte offset, or 0 if the record type does not allow one
+@return end of mini-transaction log, minus len */
+template<byte type>
+inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
+ size_t len, bool alloc, size_t offset)
+{
+ static_assert(!(type & 15) && type != RESERVED && type != OPTION &&
+ type <= FILE_CHECKPOINT, "invalid type");
+ ut_ad(type >= FILE_CREATE || is_named_space(id.space()));
+ ut_ad(!bpage || bpage->id() == id);
+ constexpr bool have_len= type != INIT_PAGE && type != FREE_PAGE;
+ constexpr bool have_offset= type == WRITE || type == MEMSET ||
+ type == MEMMOVE;
+ static_assert(!have_offset || have_len, "consistency");
+ ut_ad(have_len || len == 0);
+ ut_ad(have_len || !alloc);
+ ut_ad(have_offset || offset == 0);
+ ut_ad(offset + len <= srv_page_size);
+ static_assert(MIN_4BYTE >= UNIV_PAGE_SIZE_MAX, "consistency");
+
+ size_t max_len;
+ if (!have_len)
+ max_len= 1 + 5 + 5;
+ else if (!have_offset)
+ max_len= bpage && m_last == bpage
+ ? 1 + 3
+ : 1 + 3 + 5 + 5;
+ else if (bpage && m_last == bpage && m_last_offset <= offset)
+ {
+ /* Encode the offset relative from m_last_offset. */
+ offset-= m_last_offset;
+ max_len= 1 + 3 + 3;
+ }
+ else
+ max_len= 1 + 3 + 5 + 5 + 3;
+ byte *const log_ptr= m_log.open(alloc ? max_len + len : max_len);
+ byte *end= log_ptr + 1;
+ const byte same_page= max_len < 1 + 5 + 5 ? 0x80 : 0;
+ if (!same_page)
+ {
+ end= mlog_encode_varint(end, id.space());
+ end= mlog_encode_varint(end, id.page_no());
+ m_last= bpage;
+ }
+ if (have_offset)
+ {
+ byte* oend= mlog_encode_varint(end, offset);
+ if (oend + len > &log_ptr[16])
+ {
+ len+= oend - log_ptr - 15;
+ if (len >= MIN_3BYTE - 1)
+ len+= 2;
+ else if (len >= MIN_2BYTE)
+ len++;
+
+ *log_ptr= type | same_page;
+ end= mlog_encode_varint(log_ptr + 1, len);
+ if (!same_page)
+ {
+ end= mlog_encode_varint(end, id.space());
+ end= mlog_encode_varint(end, id.page_no());
+ }
+ end= mlog_encode_varint(end, offset);
+ return end;
+ }
+ else
+ end= oend;
+ }
+ else if (len >= 3 && end + len > &log_ptr[16])
+ {
+ len+= end - log_ptr - 15;
+ if (len >= MIN_3BYTE - 1)
+ len+= 2;
+ else if (len >= MIN_2BYTE)
+ len++;
+
+ end= log_ptr;
+ *end++= type | same_page;
+ end= mlog_encode_varint(end, len);
+
+ if (!same_page)
+ {
+ end= mlog_encode_varint(end, id.space());
+ end= mlog_encode_varint(end, id.page_no());
+ }
+ return end;
+ }
+
+ ut_ad(end + len >= &log_ptr[1] + !same_page);
+ ut_ad(end + len <= &log_ptr[16]);
+ ut_ad(end <= &log_ptr[max_len]);
+ *log_ptr= type | same_page | static_cast<byte>(end + len - log_ptr - 1);
+ ut_ad(*log_ptr & 15);
+ return end;
+}
+
+/** Write a byte string to a page.
+@param[in] b buffer page
+@param[in] dest destination within b.frame
+@param[in] str the data to write
+@param[in] len length of the data to write
+@tparam w write request type */
+template<mtr_t::write_type w>
+inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str,
+ ulint len)
+{
+ ut_ad(ut_align_down(dest, srv_page_size) == b.frame);
+ char *d= static_cast<char*>(dest);
+ const char *s= static_cast<const char*>(str);
+ if (w != FORCED && m_log_mode == MTR_LOG_ALL)
+ {
+ ut_ad(len);
+ const char *const end= d + len;
+ while (*d++ == *s++)
+ {
+ if (d == end)
+ {
+ ut_ad(w == MAYBE_NOP);
+ return;
+ }
+ }
+ s--;
+ d--;
+ len= static_cast<ulint>(end - d);
+ }
+ ::memcpy(d, s, len);
+ memcpy(b, ut_align_offset(d, srv_page_size), len);
+}
+
+/** Initialize an entire page.
+@param[in,out] b buffer page */
+inline void mtr_t::init(buf_block_t *b)
+{
+ const page_id_t id{b->page.id()};
+ ut_ad(is_named_space(id.space()));
+ ut_ad(!m_freed_pages == !m_freed_space);
+
+ if (UNIV_LIKELY_NULL(m_freed_space) &&
+ m_freed_space->id == id.space() &&
+ m_freed_pages->remove_if_exists(b->page.id().page_no()) &&
+ m_freed_pages->empty())
+ {
+ delete m_freed_pages;
+ m_freed_pages= nullptr;
+ m_freed_space= nullptr;
+ }
+
+ b->page.status= buf_page_t::INIT_ON_FLUSH;
+
+ if (m_log_mode != MTR_LOG_ALL)
+ {
+ ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO);
+ return;
+ }
+
+ m_log.close(log_write<INIT_PAGE>(b->page.id(), &b->page));
+ m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Free a page.
+@param[in] space tablespace contains page to be freed
+@param[in] offset page offset to be freed */
+inline void mtr_t::free(fil_space_t &space, uint32_t offset)
+{
+ ut_ad(is_named_space(&space));
+ ut_ad(!m_freed_space || m_freed_space == &space);
+
+ if (m_log_mode == MTR_LOG_ALL)
+ m_log.close(log_write<FREE_PAGE>({space.id, offset}, nullptr));
+}
+
+/** Write an EXTENDED log record.
+@param block buffer pool page
+@param type extended record subtype; @see mrec_ext_t */
+inline void mtr_t::log_write_extended(const buf_block_t &block, byte type)
+{
+ set_modified(block);
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+ byte *l= log_write<EXTENDED>(block.page.id(), &block.page, 1, true);
+ *l++= type;
+ m_log.close(l);
+ m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for partly initializing a B-tree or R-tree page.
+@param block B-tree or R-tree page
+@param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
+inline void mtr_t::page_create(const buf_block_t &block, bool comp)
+{
+ static_assert(false == INIT_ROW_FORMAT_REDUNDANT, "encoding");
+ static_assert(true == INIT_ROW_FORMAT_DYNAMIC, "encoding");
+ log_write_extended(block, comp);
+}
+
+/** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT.
+@param block B-tree or R-tree page
+@param prev_rec byte offset of the predecessor of the record to delete,
+ starting from PAGE_OLD_INFIMUM */
+inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec)
+{
+ ut_ad(!block.zip_size());
+ ut_ad(prev_rec < block.physical_size());
+ set_modified(block);
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+ size_t len= (prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4);
+ byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true);
+ ut_d(byte *end= l + len);
+ *l++= DELETE_ROW_FORMAT_REDUNDANT;
+ l= mlog_encode_varint(l, prev_rec);
+ ut_ad(end == l);
+ m_log.close(l);
+ m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record.
+@param block B-tree or R-tree page
+@param prev_rec byte offset of the predecessor of the record to delete,
+ starting from PAGE_NEW_INFIMUM
+@param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size data payload size, in bytes */
+inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec,
+ size_t hdr_size, size_t data_size)
+{
+ ut_ad(!block.zip_size());
+ set_modified(block);
+ ut_ad(hdr_size < MIN_3BYTE);
+ ut_ad(prev_rec < block.physical_size());
+ ut_ad(data_size < block.physical_size());
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+ size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
+ len+= hdr_size < MIN_2BYTE ? 1 : 2;
+ len+= data_size < MIN_2BYTE ? 1 : data_size < MIN_3BYTE ? 2 : 3;
+ byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true);
+ ut_d(byte *end= l + len);
+ *l++= DELETE_ROW_FORMAT_DYNAMIC;
+ l= mlog_encode_varint(l, prev_rec);
+ l= mlog_encode_varint(l, hdr_size);
+ l= mlog_encode_varint(l, data_size);
+ ut_ad(end == l);
+ m_log.close(l);
+ m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for initializing an undo log page.
+@param block undo page */
+inline void mtr_t::undo_create(const buf_block_t &block)
+{
+ log_write_extended(block, UNDO_INIT);
+}
+
+/** Write log for appending an undo log record.
+@param block undo page
+@param data record within the undo page
+@param len length of the undo record, in bytes */
+inline void mtr_t::undo_append(const buf_block_t &block,
+ const void *data, size_t len)
+{
+ ut_ad(len > 2);
+ set_modified(block);
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+ const bool small= len + 1 < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
+ byte *end= log_write<EXTENDED>(block.page.id(), &block.page, len + 1, small);
+ if (UNIV_LIKELY(small))
+ {
+ *end++= UNDO_APPEND;
+ ::memcpy(end, data, len);
+ m_log.close(end + len);
+ }
+ else
+ {
+ m_log.close(end);
+ *m_log.push<byte*>(1)= UNDO_APPEND;
+ m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len));
+ }
+ m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Trim the end of a tablespace.
+@param id first page identifier that will not be in the file */
+inline void mtr_t::trim_pages(const page_id_t id)
+{
+ if (m_log_mode != MTR_LOG_ALL)
+ return;
+ byte *l= log_write<EXTENDED>(id, nullptr, 1, true);
+ *l++= TRIM_PAGES;
+ m_log.close(l);
+ set_trim_pages();
+}
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
new file mode 100644
index 00000000..f3db0008
--- /dev/null
+++ b/storage/innobase/include/mtr0mtr.h
@@ -0,0 +1,696 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.h
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0mtr_h
+#define mtr0mtr_h
+
+#include "fil0fil.h"
+#include "dyn0buf.h"
+
+/** Start a mini-transaction. */
+#define mtr_start(m) (m)->start()
+
+/** Commit a mini-transaction. */
+#define mtr_commit(m) (m)->commit()
+
+/** Set and return a savepoint in mtr.
+@return savepoint */
+#define mtr_set_savepoint(m) (m)->get_savepoint()
+
+/** Release the (index tree) s-latch stored in an mtr memo after a
+savepoint. */
+#define mtr_release_s_latch_at_savepoint(m, s, l) \
+ (m)->release_s_latch_at_savepoint((s), (l))
+
+/** Change the logging mode of a mini-transaction.
+@return old mode */
+#define mtr_set_log_mode(m, d) (m)->set_log_mode((d))
+
+/** Release an object in the memo stack.
+@return true if released */
+#define mtr_memo_release(m, o, t) \
+ (m)->memo_release((o), (t))
+
+/** Print info of an mtr handle. */
+#define mtr_print(m) (m)->print()
+
+/** Return the log object of a mini-transaction buffer.
+@return log */
+#define mtr_get_log(m) (m)->get_log()
+
+/** Push an object to an mtr memo stack. */
+#define mtr_memo_push(m, o, t) (m)->memo_push(o, t)
+
+#define mtr_x_lock_space(s, m) (m)->x_lock_space((s), __FILE__, __LINE__)
+#define mtr_sx_lock_space(s, m) (m)->sx_lock_space((s), __FILE__, __LINE__)
+
+#define mtr_s_lock_index(i, m) (m)->s_lock(&(i)->lock, __FILE__, __LINE__)
+#define mtr_x_lock_index(i, m) (m)->x_lock(&(i)->lock, __FILE__, __LINE__)
+#define mtr_sx_lock_index(i, m) (m)->sx_lock(&(i)->lock, __FILE__, __LINE__)
+
+#define mtr_release_block_at_savepoint(m, s, b) \
+ (m)->release_block_at_savepoint((s), (b))
+
+#define mtr_block_sx_latch_at_savepoint(m, s, b) \
+ (m)->sx_latch_at_savepoint((s), (b))
+
+#define mtr_block_x_latch_at_savepoint(m, s, b) \
+ (m)->x_latch_at_savepoint((s), (b))
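+
+/* A minimal sketch of the mini-transaction life cycle (illustrative only;
+real callers also acquire page latches, for example via buf_page_get_gen(),
+which is not shown here):
+
+	mtr_t	mtr;
+	mtr.start();
+	// ... latch pages and modify them via mtr.write<N>(), mtr.memcpy(),
+	// mtr.memset(), ...
+	mtr.commit();	// appends the redo log and releases all latches
+*/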
+
+/** Mini-transaction memo stack slot. */
+struct mtr_memo_slot_t {
+ /** pointer to the object */
+ void* object;
+
+ /** type of the stored object */
+ mtr_memo_type_t type;
+};
+
+/** Mini-transaction handle and buffer */
+struct mtr_t {
+ /** Start a mini-transaction. */
+ void start();
+
+ /** Commit the mini-transaction. */
+ void commit();
+
+ /** Commit a mini-transaction that did not modify any pages,
+ but generated some redo log on a higher level, such as
+ FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
+ The caller must hold log_sys.mutex.
+ This is to be used at log_checkpoint().
+ @param checkpoint_lsn the log sequence number of a checkpoint, or 0 */
+ void commit_files(lsn_t checkpoint_lsn= 0);
+
+ /** @return mini-transaction savepoint (current size of m_memo) */
+ ulint get_savepoint() const { ut_ad(is_active()); return m_memo.size(); }
+
+ /** Release the (index tree) s-latch stored in an mtr memo after a
+ savepoint.
+	@param savepoint value returned by get_savepoint()
+ @param lock latch to release */
+ inline void release_s_latch_at_savepoint(
+ ulint savepoint,
+ rw_lock_t* lock);
+
+ /** Release the block in an mtr memo after a savepoint. */
+ inline void release_block_at_savepoint(
+ ulint savepoint,
+ buf_block_t* block);
+
+ /** SX-latch a not yet latched block after a savepoint. */
+ inline void sx_latch_at_savepoint(ulint savepoint, buf_block_t* block);
+
+ /** X-latch a not yet latched block after a savepoint. */
+ inline void x_latch_at_savepoint(ulint savepoint, buf_block_t* block);
+
+ /** @return the logging mode */
+ mtr_log_t get_log_mode() const
+ {
+ static_assert(MTR_LOG_ALL == 0, "efficiency");
+ ut_ad(m_log_mode <= MTR_LOG_NO_REDO);
+ return static_cast<mtr_log_t>(m_log_mode);
+ }
+
+ /** Change the logging mode.
+ @param mode logging mode
+ @return old mode */
+ mtr_log_t set_log_mode(mtr_log_t mode)
+ {
+ const mtr_log_t old_mode= get_log_mode();
+ m_log_mode= mode & 3;
+ return old_mode;
+ }
+
+ /** Check if we are holding a block latch in exclusive mode
+ @param block buffer pool block to search for */
+ bool have_x_latch(const buf_block_t &block) const;
+
+ /** Copy the tablespaces associated with the mini-transaction
+ (needed for generating FILE_MODIFY records)
+ @param[in] mtr mini-transaction that may modify
+ the same set of tablespaces as this one */
+ void set_spaces(const mtr_t& mtr)
+ {
+ ut_ad(!m_user_space_id);
+ ut_ad(!m_user_space);
+
+ ut_d(m_user_space_id = mtr.m_user_space_id);
+ m_user_space = mtr.m_user_space;
+ }
+
+ /** Set the tablespace associated with the mini-transaction
+ (needed for generating a FILE_MODIFY record)
+ @param[in] space_id user or system tablespace ID
+ @return the tablespace */
+ fil_space_t* set_named_space_id(ulint space_id)
+ {
+ ut_ad(!m_user_space_id);
+ ut_d(m_user_space_id = static_cast<uint32_t>(space_id));
+ if (!space_id) {
+ return fil_system.sys_space;
+ } else {
+ ut_ad(m_user_space_id == space_id);
+ ut_ad(!m_user_space);
+ m_user_space = fil_space_get(space_id);
+ ut_ad(m_user_space);
+ return m_user_space;
+ }
+ }
+
+ /** Set the tablespace associated with the mini-transaction
+ (needed for generating a FILE_MODIFY record)
+ @param[in] space user or system tablespace */
+ void set_named_space(fil_space_t* space)
+ {
+ ut_ad(!m_user_space_id);
+ ut_d(m_user_space_id = static_cast<uint32_t>(space->id));
+ if (space->id) {
+ m_user_space = space;
+ }
+ }
+
+#ifdef UNIV_DEBUG
+ /** Check the tablespace associated with the mini-transaction
+ (needed for generating a FILE_MODIFY record)
+ @param[in] space tablespace
+ @return whether the mini-transaction is associated with the space */
+ bool is_named_space(ulint space) const;
+ /** Check the tablespace associated with the mini-transaction
+ (needed for generating a FILE_MODIFY record)
+ @param[in] space tablespace
+ @return whether the mini-transaction is associated with the space */
+ bool is_named_space(const fil_space_t* space) const;
+#endif /* UNIV_DEBUG */
+
+ /** Acquire a tablespace X-latch.
+ @param[in] space_id tablespace ID
+ @param[in] file file name from where called
+ @param[in] line line number in file
+ @return the tablespace object (never NULL) */
+ fil_space_t* x_lock_space(
+ ulint space_id,
+ const char* file,
+ unsigned line);
+
+ /** Acquire a shared rw-latch.
+ @param[in] lock rw-latch
+ @param[in] file file name from where called
+ @param[in] line line number in file */
+ void s_lock(rw_lock_t* lock, const char* file, unsigned line)
+ {
+ rw_lock_s_lock_inline(lock, 0, file, line);
+ memo_push(lock, MTR_MEMO_S_LOCK);
+ }
+
+ /** Acquire an exclusive rw-latch.
+ @param[in] lock rw-latch
+ @param[in] file file name from where called
+ @param[in] line line number in file */
+ void x_lock(rw_lock_t* lock, const char* file, unsigned line)
+ {
+ rw_lock_x_lock_inline(lock, 0, file, line);
+ memo_push(lock, MTR_MEMO_X_LOCK);
+ }
+
+	/** Acquire a shared/exclusive (SX) rw-latch.
+ @param[in] lock rw-latch
+ @param[in] file file name from where called
+ @param[in] line line number in file */
+ void sx_lock(rw_lock_t* lock, const char* file, unsigned line)
+ {
+ rw_lock_sx_lock_inline(lock, 0, file, line);
+ memo_push(lock, MTR_MEMO_SX_LOCK);
+ }
+
+ /** Acquire a tablespace X-latch.
+ @param[in] space tablespace
+ @param[in] file file name from where called
+ @param[in] line line number in file */
+ void x_lock_space(fil_space_t* space, const char* file, unsigned line)
+ {
+ ut_ad(space->purpose == FIL_TYPE_TEMPORARY
+ || space->purpose == FIL_TYPE_IMPORT
+ || space->purpose == FIL_TYPE_TABLESPACE);
+ memo_push(space, MTR_MEMO_SPACE_X_LOCK);
+ rw_lock_x_lock_inline(&space->latch, 0, file, line);
+ }
+
+ /** Acquire a tablespace SX-latch.
+ @param[in] space tablespace
+ @param[in] file file name from where called
+ @param[in] line line number in file */
+ void sx_lock_space(fil_space_t *space, const char *file, unsigned line)
+ {
+ ut_ad(space->purpose == FIL_TYPE_TEMPORARY
+ || space->purpose == FIL_TYPE_IMPORT
+ || space->purpose == FIL_TYPE_TABLESPACE);
+ sx_lock(&space->latch, file, line);
+ }
+
+ /** Release an object in the memo stack.
+ @param object object
+ @param type object type
+	@return whether the object was released */
+ bool memo_release(const void* object, ulint type);
+ /** Release a page latch.
+ @param[in] ptr pointer to within a page frame
+ @param[in] type object type: MTR_MEMO_PAGE_X_FIX, ... */
+ void release_page(const void* ptr, mtr_memo_type_t type);
+
+private:
+ /** Note that the mini-transaction will modify data. */
+ void flag_modified() { m_modifications = true; }
+ /** Mark the given latched page as modified.
+ @param block page that will be modified */
+ void modify(const buf_block_t& block);
+public:
+ /** Note that the mini-transaction will modify a block. */
+ void set_modified(const buf_block_t &block)
+ { flag_modified(); if (m_log_mode != MTR_LOG_NONE) modify(block); }
+
+  /** Set the state to not-modified. This is only used during redo log
+  apply, to avoid logging the changes again. */
+ void discard_modifications() { m_modifications = false; }
+
+ /** Get the LSN of commit().
+ @return the commit LSN
+ @retval 0 if the transaction only modified temporary tablespaces */
+ lsn_t commit_lsn() const { ut_ad(has_committed()); return m_commit_lsn; }
+
+ /** Note that we are inside the change buffer code. */
+ void enter_ibuf() { m_inside_ibuf= true; }
+
+ /** Note that we have exited from the change buffer code. */
+ void exit_ibuf() { m_inside_ibuf= false; }
+
+ /** @return true if we are inside the change buffer code */
+ bool is_inside_ibuf() const { return m_inside_ibuf; }
+
+  /** Note that pages have been trimmed */
+  void set_trim_pages() { m_trim_pages= true; }
+
+  /** @return true if pages have been trimmed */
+  bool is_trim_pages() { return m_trim_pages; }
+
+#ifdef UNIV_DEBUG
+ /** Check if we are holding an rw-latch in this mini-transaction
+ @param lock latch to search for
+ @param type held latch type
+ @return whether (lock,type) is contained */
+ bool memo_contains(const rw_lock_t &lock, mtr_memo_type_t type)
+ MY_ATTRIBUTE((warn_unused_result));
+ /** Check if we are holding exclusive tablespace latch
+ @param space tablespace to search for
+ @return whether space.latch is being held */
+ bool memo_contains(const fil_space_t& space)
+ MY_ATTRIBUTE((warn_unused_result));
+
+
+ /** Check if memo contains the given item.
+	@param ptr		pointer to the object to search
+	@param flags		specify types of object (can be ORed) of
+ MTR_MEMO_PAGE_S_FIX ... values
+ @return true if contains */
+ bool memo_contains_flagged(const void* ptr, ulint flags) const;
+
+ /** Check if memo contains the given page.
+ @param[in] ptr pointer to within buffer frame
+ @param[in] flags specify types of object with OR of
+ MTR_MEMO_PAGE_S_FIX... values
+ @return the block
+ @retval NULL if not found */
+ buf_block_t* memo_contains_page_flagged(
+ const byte* ptr,
+ ulint flags) const;
+
+ /** Print info of an mtr handle. */
+ void print() const;
+
+ /** @return true if mini-transaction contains modifications. */
+ bool has_modifications() const { return m_modifications; }
+
+ /** @return the memo stack */
+ const mtr_buf_t* get_memo() const { return &m_memo; }
+
+ /** @return the memo stack */
+ mtr_buf_t* get_memo() { return &m_memo; }
+#endif /* UNIV_DEBUG */
+
+	/** @return true if this mini-transaction made a clean page dirty */
+ bool is_dirty() const { return m_made_dirty; }
+
+ /** Get the buffered redo log of this mini-transaction.
+ @return redo log */
+ const mtr_buf_t* get_log() const { return &m_log; }
+
+ /** Get the buffered redo log of this mini-transaction.
+ @return redo log */
+ mtr_buf_t* get_log() { return &m_log; }
+
+ /** Push an object to an mtr memo stack.
+ @param object object
+ @param type object type: MTR_MEMO_S_LOCK, ... */
+ inline void memo_push(void* object, mtr_memo_type_t type);
+
+ /** Check if this mini-transaction is dirtying a clean page.
+ @param block block being x-fixed
+ @return true if the mtr is dirtying a clean page. */
+ static inline bool is_block_dirtied(const buf_block_t* block)
+ MY_ATTRIBUTE((warn_unused_result));
+
+ /** Write request types */
+ enum write_type
+ {
+ /** the page is guaranteed to always change */
+ NORMAL= 0,
+ /** optional: the page contents might not change */
+ MAYBE_NOP,
+    /** force a write, even if the page contents are not changing */
+ FORCED
+ };
+
+ /** Write 1, 2, 4, or 8 bytes to a file page.
+ @param[in] block file page
+ @param[in,out] ptr pointer in file page
+ @param[in] val value to write
+ @tparam l number of bytes to write
+ @tparam w write request type
+ @tparam V type of val
+ @return whether any log was written */
+ template<unsigned l,write_type w= NORMAL,typename V>
+ inline bool write(const buf_block_t &block, void *ptr, V val)
+ MY_ATTRIBUTE((nonnull));
+
+ /** Log a write of a byte string to a page.
+ @param[in] b buffer page
+ @param[in] ofs byte offset from b->frame
+ @param[in] len length of the data to write */
+ inline void memcpy(const buf_block_t &b, ulint ofs, ulint len);
+
+ /** Write a byte string to a page.
+ @param[in,out] b buffer page
+ @param[in] dest destination within b.frame
+ @param[in] str the data to write
+ @param[in] len length of the data to write
+ @tparam w write request type */
+ template<write_type w= NORMAL>
+ inline void memcpy(const buf_block_t &b, void *dest, const void *str,
+ ulint len);
+
+ /** Log a write of a byte string to a ROW_FORMAT=COMPRESSED page.
+ @param[in] b ROW_FORMAT=COMPRESSED index page
+ @param[in] offset byte offset from b.zip.data
+ @param[in] len length of the data to write */
+ inline void zmemcpy(const buf_block_t &b, ulint offset, ulint len);
+
+ /** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+ @param[in] b ROW_FORMAT=COMPRESSED index page
+ @param[in] dest destination within b.zip.data
+ @param[in] str the data to write
+ @param[in] len length of the data to write
+ @tparam w write request type */
+ template<write_type w= NORMAL>
+ inline void zmemcpy(const buf_block_t &b, void *dest, const void *str,
+ ulint len);
+
+ /** Log an initialization of a string of bytes.
+ @param[in] b buffer page
+ @param[in] ofs byte offset from b->frame
+ @param[in] len length of the data to write
+ @param[in] val the data byte to write */
+ inline void memset(const buf_block_t &b, ulint ofs, ulint len, byte val);
+
+ /** Initialize a string of bytes.
+ @param[in,out] b buffer page
+ @param[in] ofs byte offset from b->frame
+ @param[in] len length of the data to write
+ @param[in] val the data byte to write */
+ inline void memset(const buf_block_t *b, ulint ofs, ulint len, byte val);
+
+ /** Log an initialization of a repeating string of bytes.
+ @param[in] b buffer page
+ @param[in] ofs byte offset from b->frame
+ @param[in] len length of the data to write, in bytes
+ @param[in] str the string to write
+ @param[in] size size of str, in bytes */
+ inline void memset(const buf_block_t &b, ulint ofs, size_t len,
+ const void *str, size_t size);
+
+ /** Initialize a repeating string of bytes.
+ @param[in,out] b buffer page
+ @param[in] ofs byte offset from b->frame
+ @param[in] len length of the data to write, in bytes
+ @param[in] str the string to write
+ @param[in] size size of str, in bytes */
+ inline void memset(const buf_block_t *b, ulint ofs, size_t len,
+ const void *str, size_t size);
+
+ /** Log that a string of bytes was copied from the same page.
+ @param[in] b buffer page
+ @param[in] d destination offset within the page
+ @param[in] s source offset within the page
+ @param[in] len length of the data to copy */
+ inline void memmove(const buf_block_t &b, ulint d, ulint s, ulint len);
+
+ /** Initialize an entire page.
+ @param[in,out] b buffer page */
+ void init(buf_block_t *b);
+ /** Free a page.
+ @param[in] space tablespace contains page to be freed
+ @param[in] offset page offset to be freed */
+ inline void free(fil_space_t &space, uint32_t offset);
+ /** Write log for partly initializing a B-tree or R-tree page.
+ @param block B-tree or R-tree page
+ @param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
+ inline void page_create(const buf_block_t &block, bool comp);
+
+ /** Write log for inserting a B-tree or R-tree record in
+ ROW_FORMAT=REDUNDANT.
+ @param block B-tree or R-tree page
+ @param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+ @param prev_rec byte offset of the predecessor of the record to insert,
+ starting from PAGE_OLD_INFIMUM
+ @param info_bits info_bits of the record
+ @param n_fields_s number of fields << 1 | rec_get_1byte_offs_flag()
+ @param hdr_c number of common record header bytes with prev_rec
+ @param data_c number of common data bytes with prev_rec
+ @param hdr record header bytes to copy to the log
+ @param hdr_l number of copied record header bytes
+ @param data record payload bytes to copy to the log
+ @param data_l number of copied record data bytes */
+ inline void page_insert(const buf_block_t &block, bool reuse,
+ ulint prev_rec, byte info_bits,
+ ulint n_fields_s, size_t hdr_c, size_t data_c,
+ const byte *hdr, size_t hdr_l,
+ const byte *data, size_t data_l);
+ /** Write log for inserting a B-tree or R-tree record in
+ ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC.
+ @param block B-tree or R-tree page
+ @param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+ @param prev_rec byte offset of the predecessor of the record to insert,
+ starting from PAGE_NEW_INFIMUM
+ @param info_status rec_get_info_and_status_bits()
+ @param shift unless !reuse: number of bytes the PAGE_FREE is moving
+ @param hdr_c number of common record header bytes with prev_rec
+ @param data_c number of common data bytes with prev_rec
+ @param hdr record header bytes to copy to the log
+ @param hdr_l number of copied record header bytes
+ @param data record payload bytes to copy to the log
+ @param data_l number of copied record data bytes */
+ inline void page_insert(const buf_block_t &block, bool reuse,
+ ulint prev_rec, byte info_status,
+ ssize_t shift, size_t hdr_c, size_t data_c,
+ const byte *hdr, size_t hdr_l,
+ const byte *data, size_t data_l);
+ /** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT.
+ @param block B-tree or R-tree page
+ @param prev_rec byte offset of the predecessor of the record to delete,
+ starting from PAGE_OLD_INFIMUM */
+ inline void page_delete(const buf_block_t &block, ulint prev_rec);
+ /** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record.
+ @param block B-tree or R-tree page
+ @param prev_rec byte offset of the predecessor of the record to delete,
+ starting from PAGE_NEW_INFIMUM
+ @param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES
+ @param data_size data payload size, in bytes */
+ inline void page_delete(const buf_block_t &block, ulint prev_rec,
+ size_t hdr_size, size_t data_size);
+
+ /** Write log for initializing an undo log page.
+ @param block undo page */
+ inline void undo_create(const buf_block_t &block);
+ /** Write log for appending an undo log record.
+ @param block undo page
+ @param data record within the undo page
+ @param len length of the undo record, in bytes */
+ inline void undo_append(const buf_block_t &block,
+ const void *data, size_t len);
+ /** Trim the end of a tablespace.
+ @param id first page identifier that will not be in the file */
+ inline void trim_pages(const page_id_t id);
+
+ /** Write a log record about a file operation.
+ @param type file operation
+ @param space_id tablespace identifier
+ @param path file path
+ @param new_path new file path for type=FILE_RENAME */
+ inline void log_file_op(mfile_type_t type, ulint space_id,
+ const char *path,
+ const char *new_path= nullptr);
+
+ /** Add a freed page number to freed_pages.
+ @param space tablespace that contains the freed page
+ @param page freed page number */
+ void add_freed_offset(fil_space_t *space, uint32_t page)
+ {
+ ut_ad(is_named_space(space));
+ if (!m_freed_pages)
+ {
+ m_freed_pages= new range_set();
+ ut_ad(!m_freed_space);
+ m_freed_space= space;
+ }
+ else
+ ut_ad(m_freed_space == space);
+ m_freed_pages->add_value(page);
+ }
+
+ /** Determine the buffer-fix count added to a block by this mtr.
+ @param block block to be checked
+ @return number of buffer-fixes added by this mtr */
+ uint32_t get_fix_count(const buf_block_t *block) const;
+
+ /** type of page flushing is needed during commit() */
+ enum page_flush_ahead
+ {
+ /** no need to trigger page cleaner */
+ PAGE_FLUSH_NO= 0,
+ /** asynchronous flushing is needed */
+ PAGE_FLUSH_ASYNC,
+ /** furious flushing is needed */
+ PAGE_FLUSH_SYNC
+ };
+
+private:
+ /** Log a write of a byte string to a page.
+ @param block buffer page
+ @param offset byte offset within page
+ @param data data to be written
+ @param len length of the data, in bytes */
+ inline void memcpy_low(const buf_block_t &block, uint16_t offset,
+ const void *data, size_t len);
+ /**
+ Write a log record.
+ @tparam type redo log record type
+ @param id persistent page identifier
+ @param bpage buffer pool page, or nullptr
+ @param len number of additional bytes to write
+ @param alloc whether to allocate the additional bytes
+ @param offset byte offset, or 0 if the record type does not allow one
+ @return end of mini-transaction log, minus len */
+ template<byte type>
+ inline byte *log_write(const page_id_t id, const buf_page_t *bpage,
+ size_t len= 0, bool alloc= false, size_t offset= 0);
+
+ /** Write an EXTENDED log record.
+ @param block buffer pool page
+ @param type extended record subtype; @see mrec_ext_t */
+ inline void log_write_extended(const buf_block_t &block, byte type);
+
+ /** Prepare to write the mini-transaction log to the redo log buffer.
+ @return number of bytes to write in finish_write() */
+ inline ulint prepare_write();
+
+ /** Append the redo log records to the redo log buffer.
+ @param len number of bytes to write
+ @return {start_lsn,flush_ahead} */
+ inline std::pair<lsn_t,page_flush_ahead> finish_write(ulint len);
+
+ /** Release the resources */
+ inline void release_resources();
+
+#ifdef UNIV_DEBUG
+public:
+ /** @return whether the mini-transaction is active */
+ bool is_active() const
+ { ut_ad(!m_commit || m_start); return m_start && !m_commit; }
+ /** @return whether the mini-transaction has been committed */
+ bool has_committed() const { ut_ad(!m_commit || m_start); return m_commit; }
+private:
+ /** whether start() has been called */
+ bool m_start= false;
+ /** whether commit() has been called */
+ bool m_commit= false;
+#endif
+
+ /** The page of the most recent m_log record written, or NULL */
+ const buf_page_t* m_last;
+ /** The current byte offset in m_last, or 0 */
+ uint16_t m_last_offset;
+
+ /** specifies which operations should be logged; default MTR_LOG_ALL */
+ uint16_t m_log_mode:2;
+
+ /** whether at least one buffer pool page was written to */
+ uint16_t m_modifications:1;
+
+ /** whether at least one previously clean buffer pool page was written to */
+ uint16_t m_made_dirty:1;
+
+ /** whether change buffer is latched; only needed in non-debug builds
+ to suppress some read-ahead operations, @see ibuf_inside() */
+ uint16_t m_inside_ibuf:1;
+
+ /** whether pages have been trimmed */
+ uint16_t m_trim_pages:1;
+
+#ifdef UNIV_DEBUG
+ /** Persistent user tablespace associated with the
+ mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */
+ uint32_t m_user_space_id;
+#endif /* UNIV_DEBUG */
+
+ /** acquired dict_index_t::lock, fil_space_t::latch, buf_block_t */
+ mtr_buf_t m_memo;
+
+ /** mini-transaction log */
+ mtr_buf_t m_log;
+
+ /** user tablespace that is being modified by the mini-transaction */
+ fil_space_t* m_user_space;
+
+ /** LSN at commit time */
+ lsn_t m_commit_lsn;
+
+ /** tablespace where pages have been freed */
+ fil_space_t *m_freed_space= nullptr;
+ /** set of freed page ids */
+ range_set *m_freed_pages= nullptr;
+};
+
+#include "mtr0mtr.ic"
+
+#endif /* mtr0mtr_h */
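
A hedged usage sketch of the logging API declared above (assuming the
mtr_t::start() and commit() members from the earlier part of this header,
buf_page_get() from buf0buf.h, and page-layout constants from fil0fil.h;
illustrative only):

    void touch_page_sketch(uint32_t space_id, uint32_t page_no)
    {
      mtr_t mtr;
      mtr.start();
      buf_block_t *block= buf_page_get(page_id_t(space_id, page_no), 0,
                                       RW_X_LATCH, &mtr);
      /* Log a 2-byte write (a WRITE record); with w=MAYBE_NOP nothing
      would be logged if the bytes already hold this value. */
      mtr.write<2>(*block, block->frame + FIL_PAGE_TYPE, FIL_PAGE_INDEX);
      /* Zero 16 bytes of the payload (a MEMSET record). */
      mtr.memset(*block, FIL_PAGE_DATA, 16, 0);
      mtr.commit();
    }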
diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic
new file mode 100644
index 00000000..4a483379
--- /dev/null
+++ b/storage/innobase/include/mtr0mtr.ic
@@ -0,0 +1,173 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.ic
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0buf.h"
+
+/** Check if a mini-transaction is dirtying a clean page.
+@return true if the mtr is dirtying a clean page. */
+inline bool mtr_t::is_block_dirtied(const buf_block_t *block)
+{
+ ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.buf_fix_count());
+ return block->page.oldest_modification() <= 1;
+}
+
+/**
+Pushes an object to an mtr memo stack. */
+void
+mtr_t::memo_push(void* object, mtr_memo_type_t type)
+{
+ ut_ad(is_active());
+ ut_ad(object != NULL);
+ ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
+ ut_ad(type <= MTR_MEMO_SPACE_X_LOCK);
+ ut_ad(ut_is_2pow(type));
+
+ /* If this mtr has x-fixed a clean page then we set
+ the made_dirty flag. This tells us if we need to
+ grab log_flush_order_mutex at mtr_commit so that we
+ can insert the dirtied page to the flush list. */
+
+ if ((type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX)
+ && !m_made_dirty) {
+
+ m_made_dirty = is_block_dirtied(
+ reinterpret_cast<const buf_block_t*>(object));
+ }
+
+ mtr_memo_slot_t* slot = m_memo.push<mtr_memo_slot_t*>(sizeof(*slot));
+
+ slot->type = type;
+ slot->object = object;
+}
+
+/**
+Releases the (index tree) s-latch stored in an mtr memo after a
+savepoint. */
+void
+mtr_t::release_s_latch_at_savepoint(
+ ulint savepoint,
+ rw_lock_t* lock)
+{
+ ut_ad(is_active());
+ ut_ad(m_memo.size() > savepoint);
+
+ mtr_memo_slot_t* slot = m_memo.at<mtr_memo_slot_t*>(savepoint);
+
+ ut_ad(slot->object == lock);
+ ut_ad(slot->type == MTR_MEMO_S_LOCK);
+
+ rw_lock_s_unlock(lock);
+
+ slot->object = NULL;
+}
+
+/**
+SX-latches the not yet latched block after a savepoint. */
+
+void
+mtr_t::sx_latch_at_savepoint(
+ ulint savepoint,
+ buf_block_t* block)
+{
+ ut_ad(is_active());
+ ut_ad(m_memo.size() > savepoint);
+
+ ut_ad(!memo_contains_flagged(
+ block,
+ MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_SX_FIX));
+
+ mtr_memo_slot_t* slot = m_memo.at<mtr_memo_slot_t*>(savepoint);
+
+ ut_ad(slot->object == block);
+
+ /* == RW_NO_LATCH */
+ ut_a(slot->type == MTR_MEMO_BUF_FIX);
+
+ rw_lock_sx_lock(&block->lock);
+
+ if (!m_made_dirty) {
+ m_made_dirty = is_block_dirtied(block);
+ }
+
+ slot->type = MTR_MEMO_PAGE_SX_FIX;
+}
+
+/**
+X-latches the not yet latched block after a savepoint. */
+
+void
+mtr_t::x_latch_at_savepoint(
+ ulint savepoint,
+ buf_block_t* block)
+{
+ ut_ad(is_active());
+ ut_ad(m_memo.size() > savepoint);
+
+ ut_ad(!memo_contains_flagged(
+ block,
+ MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_SX_FIX));
+
+ mtr_memo_slot_t* slot = m_memo.at<mtr_memo_slot_t*>(savepoint);
+
+ ut_ad(slot->object == block);
+
+ /* == RW_NO_LATCH */
+ ut_a(slot->type == MTR_MEMO_BUF_FIX);
+
+ rw_lock_x_lock(&block->lock);
+
+ if (!m_made_dirty) {
+ m_made_dirty = is_block_dirtied(block);
+ }
+
+ slot->type = MTR_MEMO_PAGE_X_FIX;
+}
+
+/**
+Releases the block in an mtr memo after a savepoint. */
+
+void
+mtr_t::release_block_at_savepoint(
+ ulint savepoint,
+ buf_block_t* block)
+{
+ ut_ad(is_active());
+
+ mtr_memo_slot_t* slot = m_memo.at<mtr_memo_slot_t*>(savepoint);
+
+ ut_a(slot->object == block);
+
+ buf_page_release_latch(block, slot->type);
+
+ block->unfix();
+
+ slot->object = NULL;
+}
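
The savepoint functions above support a common pattern: buffer-fix a block
with RW_NO_LATCH first, and decide later whether to latch or release it.
A hedged sketch (the savepoint must have been taken with
mtr_t::get_savepoint(), declared in the earlier part of mtr0mtr.h,
immediately before the block was fixed):

    static void latch_or_release_sketch(mtr_t &mtr, ulint savepoint,
                                        buf_block_t *block, bool modify)
    {
      if (modify)
        /* upgrade MTR_MEMO_BUF_FIX to MTR_MEMO_PAGE_X_FIX */
        mtr.x_latch_at_savepoint(savepoint, block);
      else
        /* drop the buffer-fix; the memo slot is cleared */
        mtr.release_block_at_savepoint(savepoint, block);
    }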
diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h
new file mode 100644
index 00000000..d1b6784a
--- /dev/null
+++ b/storage/innobase/include/mtr0types.h
@@ -0,0 +1,347 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0types.h
+Mini-transaction buffer global types
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0types_h
+#define mtr0types_h
+
+#ifndef UNIV_INNOCHECKSUM
+#include "sync0rw.h"
+#else
+#include "univ.i"
+#endif /* UNIV_INNOCHECKSUM */
+
+struct mtr_t;
+
+/** Logging modes for a mini-transaction */
+enum mtr_log_t {
+ /** Default mode: log all operations modifying disk-based data */
+ MTR_LOG_ALL = 0,
+
+ /** Log no operations; dirty pages are not added to the flush list.
+ Set when attempting modification of a ROW_FORMAT=COMPRESSED page. */
+ MTR_LOG_NONE,
+
+ /** Don't generate REDO log but add dirty pages to flush list */
+ MTR_LOG_NO_REDO
+};
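
For example, temporary-tablespace pages are modified under
MTR_LOG_NO_REDO. A hedged sketch, assuming the mtr_t::set_log_mode()
member declared in the part of mtr0mtr.h not shown here:

    mtr_t mtr;
    mtr.start();
    mtr.set_log_mode(MTR_LOG_NO_REDO);
    /* Pages modified here are added to the flush list, but no redo log
    is written, so the changes do not survive a crash. */
    mtr.commit();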
+
+/*
+A mini-transaction is a stream of records that is always terminated by
+a NUL byte. The first byte of a mini-transaction record is never NUL,
+but NUL bytes can occur within mini-transaction records. The first
+bytes of each record will explicitly encode the length of the record.
+NUL bytes also act as padding in log blocks; that is, there can be
+multiple successive NUL bytes between mini-transactions in a redo log
+block.
+
+The first byte of a record contains the record type, flags, and part
+of the length. Optional subsequent bytes encode the remaining length
+(not needed for short records).
+
+Bit 7 of the first byte of a redo log record is the same_page flag.
+If same_page=1, the record is referring to the same page as the
+previous record. Records that do not refer to data pages but to file
+operations are identified by setting same_page=1 in the very first
+record(s) of the mini-transaction. A mini-transaction record that
+carries same_page=0 must only be followed by page-oriented records.
+
+Bits 6..4 of the first byte of a redo log record identify the redo log
+type. The following record types refer to data pages:
+
+ FREE_PAGE (0): corresponds to MLOG_INIT_FREE_PAGE
+ INIT_PAGE (1): corresponds to MLOG_INIT_FILE_PAGE2
+ EXTENDED (2): extended record; followed by subtype code @see mrec_ext_t
+ WRITE (3): replaces MLOG_nBYTES, MLOG_WRITE_STRING, MLOG_ZIP_*
+ MEMSET (4): extends the 10.4 MLOG_MEMSET record
+ MEMMOVE (5): copy data within the page (avoids logging redundant data)
+ RESERVED (6): reserved for future use; a subtype code
+ (encoded immediately after the length) would be written
+ to reserve code space for further extensions
+ OPTION (7): optional record that may be ignored; a subtype code
+ (encoded immediately after the length) would distinguish actual
+ usage, such as:
+ * MDEV-18976 page checksum record
+ * binlog record
+ * SQL statement (at the start of statement)
+
+Bits 3..0 indicate the redo log record length, excluding the first
+byte, but including additional length bytes and any other bytes,
+such as the optional tablespace identifier and page number.
+Values 1..15 represent lengths of 1 to 15 bytes. The special value 0
+indicates that 1 to 3 length bytes will follow, encoding the remaining
+length of a record that is 16 bytes or longer.
+
+Additional length bytes (1 to 3, present when the length bits are 0):
+0xxxxxxx for 0 to 127 (total: 16 to 143 bytes)
+10xxxxxx xxxxxxxx for 128 to 16511 (total: 144 to 16527)
+110xxxxx xxxxxxxx xxxxxxxx for 16512 to 2113663 (total: 16528 to 2113679)
+111xxxxx reserved (corrupted record, and file!)
+
+If same_page=0, the tablespace identifier and page number will use
+similar 1-to-5-byte variable-length encoding:
+0xxxxxxx for 0 to 127
+10xxxxxx xxxxxxxx for 128 to 16,511
+110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663
+1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx for 2,113,664 to 270,549,119
+11110xxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx for 270,549,120 to 34,630,287,487
+11111xxx reserved (corrupted record)
+Note: Some 5-byte values are reserved, because the tablespace identifier
+and page number can only be up to 4,294,967,295.
+
+If same_page=1 is set in a record that follows a same_page=0 record
+in a mini-transaction, the tablespace identifier and page number
+fields will be omitted.
+
+(For some file-oriented records (if same_page=1 for the first records
+of a mini-transaction), we will write the tablespace identifier using the
+same 1-to-5-byte encoding. TBD: describe the exact format of
+file-oriented records. With MDEV-14425, we could write file-level log
+records to a separate file, not interleaved with page-level redo log
+at all. We could reserve the file ib_logfile0 for checkpoint information
+and for file-level redo log records.)
+
+For FREE_PAGE or INIT_PAGE, if same_page=1, the record will be treated
+as corrupted (or reserved for future extension). The type code must
+be followed by 1+1 to 5+5 bytes (to encode the tablespace identifier
+and page number). If the record length does not match the encoded
+lengths of the tablespace identifier and page number, the record will
+be treated as corrupted. This allows future expansion of the format.
+
+If there is a FREE_PAGE record in a mini-transaction, it must be the
+only record for that page in the mini-transaction. If there is an
+INIT_PAGE record for a page in a mini-transaction, it must be the
+first record for that page in the mini-transaction.
+
+An EXTENDED record must be followed by 1+1 to 5+5 bytes for the page
+identifier (unless the same_page flag is set) and a subtype; @see mrec_ext_t
+
+For WRITE, MEMSET, MEMMOVE, the next 1 to 3 bytes are the byte offset
+on the page, relative from the previous offset. If same_page=0, the
+"previous offset" is 0. If same_page=1, the "previous offset" is where
+the previous operation ended (FIL_PAGE_TYPE for INIT_PAGE).
+0xxxxxxx for 0 to 127
+10xxxxxx xxxxxxxx for 128 to 16,511
+110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663
+111xxxxx reserved (corrupted record)
+If the sum of the "previous offset" and the current offset exceeds the
+page size, the record is treated as corrupted. Negative relative offsets
+cannot be written. Instead, a record with same_page=0 can be written.
+
+For MEMSET and MEMMOVE, the target length will follow, encoded in 1 to
+3 bytes. If the length+offset exceeds the page size, the record will
+be treated as corrupted.
+
+For MEMMOVE, the source offset will follow, encoded in 1 to 3 bytes,
+relative to the current offset. The offset 0 is not possible, and
+the sign bit is the least significant bit. That is,
++x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...) and
+-x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...).
+The source offset must be within the page size, or else the record
+will be treated as corrupted.
+
+For MEMSET or WRITE, the byte(s) to be written will follow. For
+MEMSET, it usually is a single byte, but it could also be a multi-byte
+string, which would be copied over and over until the target length is
+reached. The length of the remaining bytes is implied by the length
+bytes at the start of the record.
+
+For MEMMOVE, if any bytes follow, the record is treated as corrupted
+(future expansion).
+
+As mentioned at the start of this comment, the type byte 0 would be
+special, marking the end of a mini-transaction. We could use the
+corresponding value 0x80 (with same_page=1) for something special,
+such as a future extension when more type codes are needed, or for
+encoding rarely needed redo log records.
+
+Examples:
+
+INIT_PAGE could be logged as 0x12 0x34 0x56, meaning "type code 1 (INIT_PAGE), 2
+bytes to follow" and "tablespace ID 0x34", "page number 0x56".
+The first byte must be between 0x12 and 0x1a, and the total length of
+the record must match the lengths of the encoded tablespace ID and
+page number.
+
+WRITE could be logged as 0x36 0x40 0x57 0x60 0x12 0x34 0x56, meaning
+"type code 3 (WRITE), 6 bytes to follow" and "tablespace ID 0x40",
+"page number 0x57", "byte offset 0x60", data 0x34,0x56.
+
+A subsequent WRITE to the same page could be logged as 0xb5 0x7f 0x23
+0x34 0x56 0x78, meaning "same page, type code 3 (WRITE), 5 bytes to
+follow", "byte offset 0x7f"+0x60+2, bytes 0x23,0x34,0x56,0x78.
+
+The end of the mini-transaction would be indicated by a NUL byte.
+*/
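
To make the variable-length integer encoding above concrete, here is a
hedged decoder sketch for the first three classes (the real decoder lives
in the recovery code; this function name is illustrative):

    /* Decode a 1-to-3-byte variable-length integer as specified above.
    Returns the position after the encoded value, or nullptr if the
    first byte starts a longer (4- or 5-byte) or reserved encoding. */
    static const byte *decode_varint_sketch(const byte *b, uint32_t &v)
    {
      uint32_t first= *b++;
      if (first < 0x80)                /* 0xxxxxxx: 0 to 127 */
        v= first;
      else if (first < 0xc0)           /* 10xxxxxx: 128 to 16,511 */
        v= ((first & 0x3f) << 8 | *b++) + 128;
      else if (first < 0xe0)           /* 110xxxxx: 16,512 to 2,113,663 */
      {
        v= (first & 0x1f) << 16;
        v|= uint32_t{*b++} << 8;
        v|= *b++;
        v+= 16512;
      }
      else
        return nullptr;  /* the 4- and 5-byte forms continue the pattern */
      return b;
    }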
+
+/** Redo log record types. These bit patterns (3 bits) will be written
+to the redo log file, so the existing codes or their interpretation on
+crash recovery must not be changed. */
+enum mrec_type_t
+{
+ /** Free a page. On recovery, it is unnecessary to read the page.
+ The next record for the page (if any) must be INIT_PAGE.
+ After this record has been written, the page may be
+ overwritten with zeros, or discarded or trimmed. */
+ FREE_PAGE= 0,
+ /** Zero-initialize a page. The current byte offset (for subsequent
+ records) will be reset to FIL_PAGE_TYPE. */
+ INIT_PAGE= 0x10,
+ /** Extended record; @see mrec_ext_t for the subtype codes. */
+ EXTENDED= 0x20,
+ /** Write a string of bytes. Followed by the byte offset (unsigned,
+ relative to the current byte offset, encoded in 1 to 3 bytes) and
+ the bytes to write (at least one). The current byte offset will be
+ set after the last byte written. */
+ WRITE= 0x30,
+ /** Like WRITE, but before the bytes to write, the data_length-1
+ (encoded in 1 to 3 bytes) will be encoded, and it must be more
+ than the length of the following data bytes to write.
+ The data byte(s) will be repeatedly copied to the output until
+ the data_length is reached. */
+ MEMSET= 0x40,
+ /** Like MEMSET, but instead of the bytes to write, a source byte
+ offset (signed, nonzero, relative to the target byte offset, encoded
+ in 1 to 3 bytes, with the sign bit in the least significant bit)
+ will be written.
+ That is, +x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...)
+ and -x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...).
+ The source offset and data_length must be within the page size, or
+ else the record will be treated as corrupted. The data will be
+ copied from the page as it was at the start of the
+ mini-transaction. */
+ MEMMOVE= 0x50,
+ /** Reserved for future use. */
+ RESERVED= 0x60,
+ /** Optional record that may be ignored in crash recovery.
+ A subtype code will be encoded immediately after the length.
+ Possible subtypes would include a MDEV-18976 page checksum record,
+ a binlog record, or an SQL statement. */
+ OPTION= 0x70
+};
+
+
+/** Supported EXTENDED record subtypes. */
+enum mrec_ext_t
+{
+ /** Partly initialize a ROW_FORMAT=REDUNDANT B-tree or R-tree index page,
+ including writing the "infimum" and "supremum" pseudo-records.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ INIT_ROW_FORMAT_REDUNDANT= 0,
+ /** Partly initialize a ROW_FORMAT=COMPACT or DYNAMIC index page,
+ including writing the "infimum" and "supremum" pseudo-records.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ INIT_ROW_FORMAT_DYNAMIC= 1,
+ /** Initialize an undo log page.
+ This is roughly (not exactly) equivalent to the old MLOG_UNDO_INIT record.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ UNDO_INIT= 2,
+ /** Append a record to an undo log page.
+ This is equivalent to the old MLOG_UNDO_INSERT record.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ UNDO_APPEND= 3,
+ /** Insert a ROW_FORMAT=REDUNDANT record, extending PAGE_HEAP_TOP.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ INSERT_HEAP_REDUNDANT= 4,
+ /** Insert a ROW_FORMAT=REDUNDANT record, reusing PAGE_FREE.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ INSERT_REUSE_REDUNDANT= 5,
+ /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, extending PAGE_HEAP_TOP.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ INSERT_HEAP_DYNAMIC= 6,
+ /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, reusing PAGE_FREE.
+ The current byte offset will be reset to FIL_PAGE_TYPE. */
+ INSERT_REUSE_DYNAMIC= 7,
+ /** Delete a record on a ROW_FORMAT=REDUNDANT page.
+ We point to the predecessor of the record to be deleted.
+ The current byte offset will be reset to FIL_PAGE_TYPE.
+ This is similar to the old MLOG_REC_DELETE record. */
+ DELETE_ROW_FORMAT_REDUNDANT= 8,
+ /** Delete a record on a ROW_FORMAT=COMPACT or DYNAMIC page.
+ We point to the predecessor of the record to be deleted
+ and include the total size of the record being deleted.
+ The current byte offset will be reset to FIL_PAGE_TYPE.
+ This is similar to the old MLOG_COMP_REC_DELETE record. */
+ DELETE_ROW_FORMAT_DYNAMIC= 9,
+ /** Truncate a data file. */
+ TRIM_PAGES= 10
+};
+
+
+/** Redo log record types for file-level operations. These bit
+patterns will be written to redo log files, so the existing codes or
+their interpretation on crash recovery must not be changed. */
+enum mfile_type_t
+{
+ /** Create a file. Followed by tablespace ID and the file name. */
+ FILE_CREATE = 0x80,
+ /** Delete a file. Followed by tablespace ID and the file name. */
+ FILE_DELETE = 0x90,
+ /** Rename a file. Followed by tablespace ID and the old file name,
+ NUL, and the new file name. */
+ FILE_RENAME = 0xa0,
+ /** Modify a file. Followed by tablespace ID and the file name. */
+ FILE_MODIFY = 0xb0,
+#if 1 /* MDEV-14425 FIXME: Remove this! */
+ /** End-of-checkpoint marker. Followed by 2 dummy bytes of page identifier,
+ 8 bytes of LSN, and padded with a NUL; @see SIZE_OF_FILE_CHECKPOINT. */
+ FILE_CHECKPOINT = 0xf0
+#endif
+};
+
+#if 1 /* MDEV-14425 FIXME: Remove this! */
+/** Size of a FILE_CHECKPOINT record, including the trailing byte to
+terminate the mini-transaction. */
+constexpr byte SIZE_OF_FILE_CHECKPOINT= 3/*type,page_id*/ + 8/*LSN*/ + 1;
+#endif
+
+#ifndef UNIV_INNOCHECKSUM
+/** Types for the mlock objects to store in the mtr_t::m_memo */
+enum mtr_memo_type_t {
+ MTR_MEMO_PAGE_S_FIX = RW_S_LATCH,
+
+ MTR_MEMO_PAGE_X_FIX = RW_X_LATCH,
+
+ MTR_MEMO_PAGE_SX_FIX = RW_SX_LATCH,
+
+ MTR_MEMO_BUF_FIX = RW_NO_LATCH,
+
+ MTR_MEMO_MODIFY = 16,
+
+ MTR_MEMO_PAGE_X_MODIFY = MTR_MEMO_PAGE_X_FIX | MTR_MEMO_MODIFY,
+ MTR_MEMO_PAGE_SX_MODIFY = MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_MODIFY,
+
+ MTR_MEMO_S_LOCK = RW_S_LATCH << 5,
+
+ MTR_MEMO_X_LOCK = RW_X_LATCH << 5,
+
+ MTR_MEMO_SX_LOCK = RW_SX_LATCH << 5,
+
+ /** acquire X-latch on fil_space_t::latch */
+ MTR_MEMO_SPACE_X_LOCK = MTR_MEMO_SX_LOCK << 1
+};
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif /* mtr0types_h */
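
The memo types rely on the rw_lock_type_t values from sync0rw.h (assumed
here to be RW_S_LATCH=1, RW_X_LATCH=2, RW_SX_LATCH=4, RW_NO_LATCH=8). A
few hedged compile-time checks illustrate the intended bit layout:

    /* Page-fix types occupy the low bits, lock types the same bits
    shifted left by 5, and MTR_MEMO_MODIFY (16) sits between them. */
    static_assert(MTR_MEMO_PAGE_S_FIX == 1 && MTR_MEMO_BUF_FIX == 8,
                  "page-fix types mirror rw_lock_type_t");
    static_assert(MTR_MEMO_S_LOCK == 32 && MTR_MEMO_SPACE_X_LOCK == 256,
                  "lock types are shifted clear of the page-fix types");
    static_assert(!(MTR_MEMO_MODIFY
                    & (MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)),
                  "MODIFY can be OR-ed into the page-fix types");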
diff --git a/storage/innobase/include/os0event.h b/storage/innobase/include/os0event.h
new file mode 100644
index 00000000..52f6500a
--- /dev/null
+++ b/storage/innobase/include/os0event.h
@@ -0,0 +1,131 @@
+/*****************************************************************************
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0event.h
+The interface to the operating system condition variables
+
+Created 2012-09-23 Sunny Bains (split from os0sync.h)
+*******************************************************/
+
+#ifndef os0event_h
+#define os0event_h
+
+#include "univ.i"
+
+// Forward declaration.
+struct os_event;
+typedef struct os_event* os_event_t;
+
+/** Denotes an infinite delay for os_event_wait_time() */
+#define OS_SYNC_INFINITE_TIME ULINT_UNDEFINED
+
+/** Return value of os_event_wait_time() when the time is exceeded */
+#define OS_SYNC_TIME_EXCEEDED 1
+
+/**
+Creates an event semaphore, i.e., a semaphore which may just have two states:
+signaled and nonsignaled. The created event is manual reset: it must be reset
+explicitly by calling os_event_reset().
+@return the event handle */
+os_event_t os_event_create(const char*);
+
+/**
+Sets an event semaphore to the signaled state: lets waiting threads
+proceed. */
+void
+os_event_set(
+/*=========*/
+ os_event_t event); /*!< in/out: event to set */
+
+/**
+Check if the event is set.
+@return true if set */
+bool
+os_event_is_set(
+/*============*/
+ const os_event_t event); /*!< in: event to check */
+
+/**
+Resets an event semaphore to the nonsignaled state. Threads that
+subsequently wait for the event will block.
+The return value should be passed to os_event_wait_low() if it is desired
+that this thread should not wait in case of an intervening call to
+os_event_set() between this os_event_reset() and the
+os_event_wait_low() call. See comments for os_event_wait_low(). */
+int64_t
+os_event_reset(
+/*===========*/
+ os_event_t event); /*!< in/out: event to reset */
+
+/**
+Frees an event object. */
+void
+os_event_destroy(
+/*=============*/
+ os_event_t& event); /*!< in/own: event to free */
+
+/**
+Waits for an event object until it is in the signaled state.
+
+Typically, if the event has been signalled after the os_event_reset()
+we'll return immediately because event->is_set == TRUE.
+There are, however, situations (e.g.: sync_array code) where we may
+lose this information. For example:
+
+thread A calls os_event_reset()
+thread B calls os_event_set() [event->is_set == TRUE]
+thread C calls os_event_reset() [event->is_set == FALSE]
+thread A calls os_event_wait() [infinite wait!]
+thread C calls os_event_wait() [infinite wait!]
+
+Where such a scenario is possible, to avoid infinite wait, the
+value returned by os_event_reset() should be passed in as
+reset_sig_count. */
+void
+os_event_wait_low(
+/*==============*/
+ os_event_t event, /*!< in/out: event to wait */
+ int64_t reset_sig_count);/*!< in: zero or the value
+ returned by previous call of
+ os_event_reset(). */
+
+/** Blocking infinite wait on an event, until signalled.
+@param e - event to wait on. */
+#define os_event_wait(e) os_event_wait_low((e), 0)
+
+/**
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded. In Unix the timeout is always infinite.
+@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+ulint
+os_event_wait_time_low(
+/*===================*/
+ os_event_t event, /*!< in/out: event to wait */
+ ulint time_in_usec, /*!< in: timeout in
+ microseconds, or
+ OS_SYNC_INFINITE_TIME */
+ int64_t reset_sig_count); /*!< in: zero or the value
+ returned by previous call of
+ os_event_reset(). */
+
+/** Blocking timed wait on an event.
+@param e - event to wait on.
+@param t - timeout in microseconds */
+#define os_event_wait_time(e, t) os_event_wait_time_low((e), (t), 0)
+
+#endif /* !os0event_h */
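
The reset/wait race documented above is avoided by threading the value
returned by os_event_reset() into the wait, as in this sketch (io_done is
an illustrative condition, not part of this API):

    int64_t sig_count = os_event_reset(event);
    if (!io_done) {
        /* Even if another thread called os_event_set() after our reset
        and a third thread reset the event again, passing sig_count
        makes this wait return instead of blocking forever. */
        os_event_wait_low(event, sig_count);
    }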
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
new file mode 100644
index 00000000..9b5e5058
--- /dev/null
+++ b/storage/innobase/include/os0file.h
@@ -0,0 +1,1228 @@
+/***********************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/os0file.h
+The interface to the operating system file io
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0file_h
+#define os0file_h
+
+#include "fsp0types.h"
+#include "tpool.h"
+#include "my_counter.h"
+
+#ifndef _WIN32
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#endif /* !_WIN32 */
+
+extern bool os_has_said_disk_full;
+
+/** File offset in bytes */
+typedef ib_uint64_t os_offset_t;
+
+#ifdef _WIN32
+
+/** We always define WIN_ASYNC_IO, and check at run time whether
+the OS actually supports it: Win 95 does not, NT does. */
+# define WIN_ASYNC_IO
+
+/** Use unbuffered I/O */
+# define UNIV_NON_BUFFERED_IO
+
+/** File handle */
+typedef native_file_handle os_file_t;
+
+
+#else /* _WIN32 */
+
+/** File handle */
+typedef int os_file_t;
+
+#endif /* _WIN32 */
+
+static const os_file_t OS_FILE_CLOSED = IF_WIN(os_file_t(INVALID_HANDLE_VALUE),-1);
+
+/** File descriptor with optional PERFORMANCE_SCHEMA instrumentation */
+struct pfs_os_file_t
+{
+ /** Default constructor */
+ pfs_os_file_t(os_file_t file = OS_FILE_CLOSED) : m_file(file)
+#ifdef UNIV_PFS_IO
+ , m_psi(NULL)
+#endif
+ {}
+
+ /** The wrapped file handle */
+ os_file_t m_file;
+#ifdef UNIV_PFS_IO
+ /** PERFORMANCE_SCHEMA descriptor */
+ struct PSI_file *m_psi;
+#endif
+ /** Implicit type conversion.
+ @return the wrapped file handle */
+ operator os_file_t() const { return m_file; }
+ /** Assignment operator.
+ @param[in] file file handle to be assigned */
+ void operator=(os_file_t file) { m_file = file; }
+ bool operator==(os_file_t file) const { return m_file == file; }
+ bool operator!=(os_file_t file) const { return !(*this == file); }
+#ifndef DBUG_OFF
+ friend std::ostream& operator<<(std::ostream& os, pfs_os_file_t f){
+ os << os_file_t(f);
+ return os;
+ }
+#endif
+};
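
The conversion and comparison operators let instrumented and plain handles
mix freely; a brief hedged sketch (open_raw_handle() is an illustrative
helper, not part of this API):

    pfs_os_file_t f;                /* default-constructed: OS_FILE_CLOSED */
    if (f == OS_FILE_CLOSED)
        f = open_raw_handle();      /* assign a plain os_file_t */
    os_file_t raw = f;              /* implicit operator os_file_t() */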
+
+/** The next value should be smaller than or equal to the smallest sector size
+on any disk. A log block is required to be a portion of disk which is written
+so that if the start and the end of a block get written to disk, then the
+whole block gets written. This should be true even in most cases of a crash:
+if this fails for a log block, then it is equivalent to a media failure in the
+log. */
+
+#define OS_FILE_LOG_BLOCK_SIZE 512U
+
+/** Options for os_file_create_func @{ */
+enum os_file_create_t {
+ OS_FILE_OPEN = 51, /*!< to open an existing file (if
+ it doesn't exist, error) */
+ OS_FILE_CREATE, /*!< to create a new file (if it
+ exists, error) */
+ OS_FILE_OVERWRITE, /*!< to create a new file; if it
+ exists, overwrite the old file */
+ OS_FILE_OPEN_RAW, /*!< to open a raw device or disk
+ partition */
+ OS_FILE_CREATE_PATH, /*!< to create the directories */
+ OS_FILE_OPEN_RETRY, /*!< open with retry */
+
+ /** Flags that can be combined with the above values. Please ensure
+ that the above values stay below 128. */
+
+ OS_FILE_ON_ERROR_NO_EXIT = 128, /*!< do not exit on unknown errors */
+ OS_FILE_ON_ERROR_SILENT = 256 /*!< don't print diagnostic messages to
+ the log unless it is a fatal error,
+ this flag is only used if
+ ON_ERROR_NO_EXIT is set */
+};
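
Because the base values stay below 128, the error-handling flags can be
OR-ed onto any of them; a hedged sketch using the os_file_create() macro
defined later in this header (the path is illustrative):

    bool success;
    pfs_os_file_t file = os_file_create(
        innodb_data_file_key, "./test/t1.ibd",
        OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT,
        OS_FILE_NORMAL, OS_DATA_FILE, false, &success);
    if (!success) {
        /* diagnose via os_file_get_last_error(); no exit and no log
        message, because both flags were set */
    }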
+
+static const ulint OS_FILE_READ_ONLY = 333;
+static const ulint OS_FILE_READ_WRITE = 444;
+
+/** Used by MySQLBackup */
+static const ulint OS_FILE_READ_ALLOW_DELETE = 555;
+
+/* Options for file_create */
+static const ulint OS_FILE_AIO = 61;
+static const ulint OS_FILE_NORMAL = 62;
+/* @} */
+
+/** Types for file create @{ */
+static const ulint OS_DATA_FILE = 100;
+static const ulint OS_LOG_FILE = 101;
+static const ulint OS_DATA_FILE_NO_O_DIRECT = 103;
+/* @} */
+
+/** Error codes from os_file_get_last_error @{ */
+static const ulint OS_FILE_NAME_TOO_LONG = 36;
+static const ulint OS_FILE_NOT_FOUND = 71;
+static const ulint OS_FILE_DISK_FULL = 72;
+static const ulint OS_FILE_ALREADY_EXISTS = 73;
+static const ulint OS_FILE_PATH_ERROR = 74;
+
+/** wait for OS aio resources to become available again */
+static const ulint OS_FILE_AIO_RESOURCES_RESERVED = 75;
+
+static const ulint OS_FILE_SHARING_VIOLATION = 76;
+static const ulint OS_FILE_ERROR_NOT_SPECIFIED = 77;
+static const ulint OS_FILE_INSUFFICIENT_RESOURCE = 78;
+static const ulint OS_FILE_AIO_INTERRUPTED = 79;
+static const ulint OS_FILE_OPERATION_ABORTED = 80;
+static const ulint OS_FILE_ACCESS_VIOLATION = 81;
+static const ulint OS_FILE_OPERATION_NOT_SUPPORTED = 125;
+static const ulint OS_FILE_ERROR_MAX = 200;
+/* @} */
+
+/**
+The I/O context that is passed down to the low level IO code */
+class IORequest
+{
+public:
+ enum Type
+ {
+ /** Synchronous read */
+ READ_SYNC= 2,
+ /** Asynchronous read; some errors will be ignored */
+ READ_ASYNC= READ_SYNC | 1,
+ /** Possibly partial read; only used with
+ os_file_read_no_error_handling() */
+ READ_MAYBE_PARTIAL= READ_SYNC | 4,
+ /** Read for doublewrite buffer recovery */
+ DBLWR_RECOVER= READ_SYNC | 8,
+ /** Synchronous write */
+ WRITE_SYNC= 16,
+ /** Asynchronous write */
+ WRITE_ASYNC= WRITE_SYNC | 1,
+ /** A doublewrite batch */
+ DBLWR_BATCH= WRITE_ASYNC | 8,
+ /** Write data; evict the block on write completion */
+ WRITE_LRU= WRITE_ASYNC | 32,
+ /** Write data and punch hole for the rest */
+ PUNCH= WRITE_ASYNC | 64,
+ /** Write data and punch hole; evict the block on write completion */
+ PUNCH_LRU= PUNCH | WRITE_LRU,
+ /** Zero out a range of bytes in fil_space_t::io() */
+ PUNCH_RANGE= WRITE_SYNC | 128,
+ };
+
+ constexpr IORequest(buf_page_t *bpage, fil_node_t *node, Type type) :
+ bpage(bpage), node(node), type(type) {}
+
+ constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr) :
+ bpage(bpage), type(type) {}
+
+ bool is_read() const { return (type & READ_SYNC) != 0; }
+ bool is_write() const { return (type & WRITE_SYNC) != 0; }
+ bool is_LRU() const { return (type & (WRITE_LRU ^ WRITE_ASYNC)) != 0; }
+ bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; }
+
+ /** If requested, free storage space associated with a section of the file.
+ @param off byte offset from the start (SEEK_SET)
+ @param len size of the hole in bytes
+ @return DB_SUCCESS or error code */
+ dberr_t maybe_punch_hole(os_offset_t off, ulint len)
+ {
+ return off && len && node && (type & (PUNCH ^ WRITE_ASYNC))
+ ? punch_hole(off, len)
+ : DB_SUCCESS;
+ }
+
+private:
+ /** Free storage space associated with a section of the file.
+ @param off byte offset from the start (SEEK_SET)
+ @param len size of the hole in bytes
+ @return DB_SUCCESS or error code */
+ dberr_t punch_hole(os_offset_t off, ulint len) const;
+
+public:
+ /** Page to be written on write operation */
+ buf_page_t* const bpage= nullptr;
+
+ /** File descriptor */
+ fil_node_t *const node= nullptr;
+
+ /** Request type bit flags */
+ const Type type;
+};
+
+constexpr IORequest IORequestRead(IORequest::READ_SYNC);
+constexpr IORequest IORequestReadPartial(IORequest::READ_MAYBE_PARTIAL);
+constexpr IORequest IORequestWrite(IORequest::WRITE_SYNC);
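
The predicates above isolate the distinguishing bit of a derived type with
XOR; for instance WRITE_LRU ^ WRITE_ASYNC leaves exactly the bit (32) that
WRITE_LRU adds on top of WRITE_ASYNC. A brief sketch:

    IORequest req(IORequest::WRITE_LRU);
    bool evict = req.is_LRU();    /* true: bit 32 is set */
    bool async = req.is_async();  /* true: READ_SYNC ^ READ_ASYNC == 1,
                                  and WRITE_ASYNC also carries bit 1 */
    bool write = req.is_write();  /* true: the WRITE_SYNC bit (16) is set */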
+
+/** Sparse file size information. */
+struct os_file_size_t {
+ /** Total size of file in bytes */
+ os_offset_t m_total_size;
+
+ /** If it is a sparse file then this is the number of bytes
+ actually allocated for the file. */
+ os_offset_t m_alloc_size;
+};
+
+constexpr ulint OS_AIO_N_PENDING_IOS_PER_THREAD= 256;
+
+extern Atomic_counter<ulint> os_n_file_reads;
+extern ulint os_n_file_writes;
+extern ulint os_n_fsyncs;
+
+/* File types for directory entry data type */
+
+enum os_file_type_t {
+ OS_FILE_TYPE_UNKNOWN = 0,
+ OS_FILE_TYPE_FILE, /* regular file */
+ OS_FILE_TYPE_DIR, /* directory */
+ OS_FILE_TYPE_LINK, /* symbolic link */
+ OS_FILE_TYPE_BLOCK /* block device */
+};
+
+/* Maximum path string length in bytes when referring to tables in the
+'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers
+of this size from the thread stack; that is why this should not be made much
+bigger than 4000 bytes. The maximum path length used by any storage engine
+in the server must be at least this big. */
+
+/* MySQL 5.7 my_global.h */
+#ifndef FN_REFLEN_SE
+#define FN_REFLEN_SE 4000
+#endif
+
+#define OS_FILE_MAX_PATH 4000
+#if (FN_REFLEN_SE < OS_FILE_MAX_PATH)
+# error "(FN_REFLEN_SE < OS_FILE_MAX_PATH)"
+#endif
+
+/** Struct used in fetching information of a file in a directory */
+struct os_file_stat_t {
+ char name[OS_FILE_MAX_PATH]; /*!< path to a file */
+ os_file_type_t type; /*!< file type */
+ os_offset_t size; /*!< file size in bytes */
+ os_offset_t alloc_size; /*!< Allocated size for
+ sparse files in bytes */
+ size_t block_size; /*!< Block size to use for IO
+ in bytes*/
+ time_t ctime; /*!< creation time */
+ time_t mtime; /*!< modification time */
+ time_t atime; /*!< access time */
+ bool rw_perm; /*!< true if can be opened
+ in read-write mode. Only valid
+ if type == OS_FILE_TYPE_FILE */
+};
+
+/** Create a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the directory specified by the MySQL
+server configuration parameter (--tmpdir).
+@return temporary file handle, or NULL on error */
+FILE*
+os_file_create_tmpfile();
+
+/**
+This function attempts to create a directory named pathname. The new directory
+gets default permissions. On Unix, the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists argument is true.
+
+@param[in] pathname directory name as null-terminated string
+@param[in] fail_if_exists if true, pre-existing directory is treated
+ as an error.
+@return true if call succeeds, false on error */
+bool
+os_file_create_directory(
+ const char* pathname,
+ bool fail_if_exists);
+
+/** NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
+A simple function to open or create a file.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeed, false if error
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_func(
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success);
+
+/** NOTE! Use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A simple function to open or create a file.
+@param[in] name name of the file or path as a null-terminated string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option
+ is used by a backup program reading the file
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_no_error_handling_func(
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success)
+ MY_ATTRIBUTE((warn_unused_result));
+
+#ifdef _WIN32
+#define os_file_set_nocache(fd, file_name, operation_name) do{}while(0)
+#else
+/** Tries to disable OS caching on an opened file descriptor.
+@param[in] fd file descriptor to alter
+@param[in] file_name file name, used in the diagnostic message
+@param[in] name "open" or "create"; used in the diagnostic
+ message */
+void
+os_file_set_nocache(
+/*================*/
+ int fd, /*!< in: file descriptor to alter */
+ const char* file_name,
+ const char* operation_name);
+#endif
+
+/** NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
+Opens an existing file or creates a new.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
+ is desired, OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async I/O or unbuffered I/O: look in the
+ function source code for the exact rules
+@param[in] type OS_DATA_FILE or OS_LOG_FILE
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_func(
+ const char* name,
+ ulint create_mode,
+ ulint purpose,
+ ulint type,
+ bool read_only,
+ bool* success)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Deletes a file. The file has to be closed before calling this.
+@param[in] name file path as a null-terminated string
+@return true if success */
+bool
+os_file_delete_func(const char* name);
+
+/** Deletes a file if it exists. The file has to be closed before calling this.
+@param[in] name file path as a null-terminated string
+@param[out] exist indicate if file pre-exist
+@return true if success */
+bool
+os_file_delete_if_exists_func(const char* name, bool* exist);
+
+/** NOTE! Use the corresponding macro os_file_rename(), not directly
+this function!
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function.
+@param[in] oldpath old file path as a null-terminated string
+@param[in] newpath new file path
+@return true if success */
+bool
+os_file_rename_func(const char* oldpath, const char* newpath);
+
+/** NOTE! Use the corresponding macro os_file_close(), not directly this
+function!
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@param[in] file own: handle to a file
+@return true if success */
+bool os_file_close_func(os_file_t file);
+
+#ifdef UNIV_PFS_IO
+
+/* Keys to register InnoDB I/O with performance schema */
+extern mysql_pfs_key_t innodb_data_file_key;
+extern mysql_pfs_key_t innodb_log_file_key;
+extern mysql_pfs_key_t innodb_temp_file_key;
+
+/* The following macros are instrumentations to register
+various file I/O operations with performance schema.
+1) register_pfs_file_open_begin() and register_pfs_file_open_end() are
+used to register file creation, opening, closing and renaming.
+2) register_pfs_file_rename_begin() and register_pfs_file_rename_end()
+are used to register file renaming.
+3) register_pfs_file_io_begin() and register_pfs_file_io_end() are
+used to register actual file reads, writes and flushes.
+4) register_pfs_file_close_begin() and register_pfs_file_close_end()
+are used to register file deletion operations. */
+# define register_pfs_file_open_begin(state, locker, key, op, name, \
+ src_file, src_line) \
+do { \
+ locker = PSI_FILE_CALL(get_thread_file_name_locker)( \
+ state, key, op, name, &locker); \
+ if (locker != NULL) { \
+ PSI_FILE_CALL(start_file_open_wait)( \
+ locker, src_file, src_line); \
+ } \
+} while (0)
+
+# define register_pfs_file_open_end(locker, file, result) \
+do { \
+ if (locker != NULL) { \
+ file.m_psi = PSI_FILE_CALL(end_file_open_wait)( \
+ locker, result); \
+ } \
+} while (0)
+
+# define register_pfs_file_rename_begin(state, locker, key, op, name, \
+ src_file, src_line) \
+ register_pfs_file_open_begin(state, locker, key, op, name, \
+ src_file, src_line) \
+
+# define register_pfs_file_rename_end(locker, from, to, result) \
+do { \
+ if (locker != NULL) { \
+ PSI_FILE_CALL( \
+ end_file_rename_wait)( \
+ locker, from, to, result); \
+ } \
+} while (0)
+
+# define register_pfs_file_close_begin(state, locker, key, op, name, \
+ src_file, src_line) \
+do { \
+ locker = PSI_FILE_CALL(get_thread_file_name_locker)( \
+ state, key, op, name, &locker); \
+ if (locker != NULL) { \
+ PSI_FILE_CALL(start_file_close_wait)( \
+ locker, src_file, src_line); \
+ } \
+} while (0)
+
+# define register_pfs_file_close_end(locker, result) \
+do { \
+ if (locker != NULL) { \
+ PSI_FILE_CALL(end_file_close_wait)( \
+ locker, result); \
+ } \
+} while (0)
+
+# define register_pfs_file_io_begin(state, locker, file, count, op, \
+ src_file, src_line) \
+do { \
+ locker = PSI_FILE_CALL(get_thread_file_stream_locker)( \
+ state, file.m_psi, op); \
+ if (locker != NULL) { \
+ PSI_FILE_CALL(start_file_wait)( \
+ locker, count, src_file, src_line); \
+ } \
+} while (0)
+
+# define register_pfs_file_io_end(locker, count) \
+do { \
+ if (locker != NULL) { \
+ PSI_FILE_CALL(end_file_wait)(locker, count); \
+ } \
+} while (0)
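
The begin/end macros are always paired around the real call, as in this
hedged sketch of the pattern the pfs_ wrapper functions follow
(illustrative, not a verbatim copy of any wrapper):

    dberr_t read_instrumented_sketch(const IORequest &type,
                                     pfs_os_file_t file, void *buf,
                                     os_offset_t offset, ulint n)
    {
        PSI_file_locker_state state;
        struct PSI_file_locker *locker = NULL;
        register_pfs_file_io_begin(&state, locker, file, n,
                                   PSI_FILE_READ, __FILE__, __LINE__);
        dberr_t err = os_file_read_func(type, file, buf, offset, n);
        register_pfs_file_io_end(locker, n);
        return err;
    }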
+
+/* Following macros/functions are file I/O APIs that would be performance
+schema instrumented if "UNIV_PFS_IO" is defined. They would point to
+wrapper functions with performance schema instrumentation in such case.
+
+os_file_create
+os_file_create_simple
+os_file_create_simple_no_error_handling
+os_file_close
+os_file_rename
+os_aio
+os_file_read
+os_file_read_no_error_handling
+os_file_write
+
+The wrapper functions have the prefix "pfs_". */
+
+# define os_file_create(key, name, create, purpose, type, read_only, \
+ success) \
+ pfs_os_file_create_func(key, name, create, purpose, type, \
+ read_only, success, __FILE__, __LINE__)
+
+# define os_file_create_simple(key, name, create, access, \
+ read_only, success) \
+ pfs_os_file_create_simple_func(key, name, create, access, \
+ read_only, success, __FILE__, __LINE__)
+
+# define os_file_create_simple_no_error_handling( \
+ key, name, create_mode, access, read_only, success) \
+ pfs_os_file_create_simple_no_error_handling_func( \
+ key, name, create_mode, access, \
+ read_only, success, __FILE__, __LINE__)
+
+# define os_file_close(file) \
+ pfs_os_file_close_func(file, __FILE__, __LINE__)
+
+# define os_file_read(type, file, buf, offset, n) \
+ pfs_os_file_read_func(type, file, buf, offset, n, __FILE__, __LINE__)
+
+# define os_file_read_no_error_handling(type, file, buf, offset, n, o) \
+ pfs_os_file_read_no_error_handling_func( \
+ type, file, buf, offset, n, o, __FILE__, __LINE__)
+
+# define os_file_write(type, name, file, buf, offset, n) \
+ pfs_os_file_write_func(type, name, file, buf, offset, \
+ n, __FILE__, __LINE__)
+
+# define os_file_flush(file) \
+ pfs_os_file_flush_func(file, __FILE__, __LINE__)
+
+# define os_file_rename(key, oldpath, newpath) \
+ pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__)
+
+# define os_file_delete(key, name) \
+ pfs_os_file_delete_func(key, name, __FILE__, __LINE__)
+
+# define os_file_delete_if_exists(key, name, exist) \
+ pfs_os_file_delete_if_exists_func(key, name, exist, __FILE__, __LINE__)
+
+/** NOTE! Please use the corresponding macro os_file_create_simple(),
+not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple() which opens or creates a file.
+@param[in] key Performance Schema Key
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success,
+ const char* src_file,
+ uint src_line)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple_no_error_handling(). Add instrumentation to
+monitor file creation/open.
+@param[in] key Performance Schema Key
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_no_error_handling_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success,
+ const char* src_file,
+ uint src_line)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro os_file_create(), not directly
+this function!
+A performance schema wrapper function for os_file_create().
+Add instrumentation to monitor file creation/open.
+@param[in] key Performance Schema Key
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
+ is desired, OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async I/O or unbuffered I/O: look in the
+ function source code for the exact rules
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ ulint create_mode,
+ ulint purpose,
+ ulint type,
+ bool read_only,
+ bool* success,
+ const char* src_file,
+ uint src_line)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro os_file_close(), not directly
+this function!
+A performance schema instrumented wrapper function for os_file_close().
+@param[in] file handle to a file
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_close_func(
+ pfs_os_file_t file,
+ const char* src_file,
+ uint src_line);
+
+/** NOTE! Please use the corresponding macro os_file_read(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_read() which requests a synchronous read operation.
+@param[in] type IO request context
+@param[in] file Open file handle
+@param[out] buf buffer where to read
+@param[in] offset file offset where to read
+@param[in] n number of bytes to read
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_read_func(
+ const IORequest& type,
+ pfs_os_file_t file,
+ void* buf,
+ os_offset_t offset,
+ ulint n,
+ const char* src_file,
+ uint src_line);
+
+/** NOTE! Please use the corresponding macro os_file_read_no_error_handling(),
+not directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_read_no_error_handling_func() which requests a synchronous
+read operation.
+@param[in] type IO request context
+@param[in] file Open file handle
+@param[out] buf buffer where to read
+@param[in] offset file offset where to read
+@param[in] n number of bytes to read
+@param[out] o number of bytes actually read
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_read_no_error_handling_func(
+ const IORequest& type,
+ pfs_os_file_t file,
+ void* buf,
+ os_offset_t offset,
+ ulint n,
+ ulint* o,
+ const char* src_file,
+ uint src_line);
+
+/** NOTE! Please use the corresponding macro os_file_write(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_write() which requests a synchronous write operation.
+@param[in] type IO request context
+@param[in] name Name of the file or path as NUL terminated
+ string
+@param[in] file Open file handle
+@param[in]	buf	buffer from which to write
+@param[in]	offset	file offset where to write
+@param[in]	n	number of bytes to write
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_write_func(
+ const IORequest& type,
+ const char* name,
+ pfs_os_file_t file,
+ const void* buf,
+ os_offset_t offset,
+ ulint n,
+ const char* src_file,
+ uint src_line);
+
+/** NOTE! Please use the corresponding macro os_file_flush(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_flush() which flushes the write buffers of a given file to the disk.
+@param[in] file Open file handle
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_flush_func(
+ pfs_os_file_t file,
+ const char* src_file,
+ uint src_line);
+
+
+/** NOTE! Please use the corresponding macro os_file_rename(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_rename()
+@param[in] key Performance Schema Key
+@param[in] oldpath old file path as a null-terminated string
+@param[in] newpath new file path
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_rename_func(
+ mysql_pfs_key_t key,
+ const char* oldpath,
+ const char* newpath,
+ const char* src_file,
+ uint src_line);
+
+/**
+NOTE! Please use the corresponding macro os_file_delete(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete()
+@param[in] key Performance Schema Key
+@param[in]	name	file path as a null-terminated string
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ const char* src_file,
+ uint src_line);
+
+/**
+NOTE! Please use the corresponding macro os_file_delete_if_exists(), not
+directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete_if_exists()
+@param[in] key Performance Schema Key
+@param[in]	name	file path as a null-terminated string
+@param[in] exist indicate if file pre-exist
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_if_exists_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ bool* exist,
+ const char* src_file,
+ uint src_line);
+
+#else /* UNIV_PFS_IO */
+
+/* If UNIV_PFS_IO is not defined, these I/O APIs point
+to original un-instrumented file I/O APIs */
+# define os_file_create(key, name, create, purpose, type, read_only, \
+ success) \
+ os_file_create_func(name, create, purpose, type, read_only, \
+ success)
+
+# define os_file_create_simple(key, name, create_mode, access, \
+ read_only, success) \
+ os_file_create_simple_func(name, create_mode, access, \
+ read_only, success)
+
+# define os_file_create_simple_no_error_handling( \
+ key, name, create_mode, access, read_only, success) \
+ os_file_create_simple_no_error_handling_func( \
+ name, create_mode, access, read_only, success)
+
+# define os_file_close(file) os_file_close_func(file)
+
+# define os_file_read(type, file, buf, offset, n) \
+ os_file_read_func(type, file, buf, offset, n)
+
+# define os_file_read_no_error_handling(type, file, buf, offset, n, o) \
+ os_file_read_no_error_handling_func(type, file, buf, offset, n, o)
+
+# define os_file_write(type, name, file, buf, offset, n) \
+ os_file_write_func(type, name, file, buf, offset, n)
+
+# define os_file_flush(file) os_file_flush_func(file)
+
+# define os_file_rename(key, oldpath, newpath) \
+ os_file_rename_func(oldpath, newpath)
+
+# define os_file_delete(key, name) os_file_delete_func(name)
+
+# define os_file_delete_if_exists(key, name, exist) \
+ os_file_delete_if_exists_func(name, exist)
+
+#endif /* UNIV_PFS_IO */
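+
+/* Illustrative sketch (not part of the original header): because the
+macros above resolve either to the pfs_*_func() wrappers or to the plain
+*_func() variants, call sites look the same in both builds. Assuming a
+registered Performance Schema key such as innodb_data_file_key:
+
+	bool		success;
+	pfs_os_file_t	handle = os_file_create(
+		innodb_data_file_key, "ibdata1", OS_FILE_OPEN,
+		OS_FILE_NORMAL, OS_FILE_DATA_FILE, false, &success);
+
+	if (success) {
+		os_file_close(handle);
+	}
+*/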
+
+/** Gets a file size.
+@param[in]	filename	path to the file
+@return file size if OK; on failure, m_total_size is set to ~0 and
+	m_alloc_size to errno */
+os_file_size_t
+os_file_get_size(
+ const char* filename)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Gets a file size.
+@param[in] file handle to a file
+@return file size, or (os_offset_t) -1 on failure */
+os_offset_t
+os_file_get_size(
+ os_file_t file)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Extend a file.
+
+On Windows, extending a file allocates blocks for the file,
+unless the file is sparse.
+
+On Unix, we will extend the file with ftruncate() if the
+file needs to be sparse. Otherwise posix_fallocate() is used
+when available, and if not, binary zeroes are added to the end
+of the file.
+
+@param[in] name file name
+@param[in] file file handle
+@param[in] size desired file size
+@param[in]	is_sparse	whether to create a sparse file (no preallocating)
+@return whether the operation succeeded */
+bool
+os_file_set_size(
+ const char* name,
+ os_file_t file,
+ os_offset_t size,
+ bool is_sparse = false)
+ MY_ATTRIBUTE((warn_unused_result));
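+
+/* Illustrative sketch (not part of the original header): preallocate a
+10 MiB file, or merely reserve a sparse extent of the same size; "file"
+and "file2" are assumed to be handles obtained from os_file_create().
+
+	os_offset_t	size = 10 << 20;
+
+	if (!os_file_set_size("ib_logfile101", file, size)) {
+		ib::error() << "preallocation failed";
+	}
+
+	if (!os_file_set_size("undo001", file2, size, true)) {
+		ib::error() << "sparse extension failed";
+	}
+*/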
+
+/** Truncates a file at its current position.
+@param[in,out]	file	file to be truncated
+@return true if success */
+bool
+os_file_set_eof(
+ FILE* file); /*!< in: file to be truncated */
+
+/** Truncate a file to a specified size in bytes.
+@param[in] pathname file path
+@param[in] file file to be truncated
+@param[in] size size preserved in bytes
+@param[in] allow_shrink whether to allow the file to become smaller
+@return true if success */
+bool
+os_file_truncate(
+ const char* pathname,
+ os_file_t file,
+ os_offset_t size,
+ bool allow_shrink = false);
+
+/** NOTE! Use the corresponding macro os_file_flush(), not directly this
+function!
+Flushes the write buffers of a given file to the disk.
+@param[in] file handle to a file
+@return true if success */
+bool
+os_file_flush_func(
+ os_file_t file);
+
+/** Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned.
+@param[in] report true if we want an error message printed
+ for all errors
+@return error number, or OS error number + 100 */
+ulint
+os_file_get_last_error(
+ bool report);
+
+/** NOTE! Use the corresponding macro os_file_read(), not directly this
+function!
+Requests a synchronous read operation.
+@param[in] type IO request context
+@param[in] file Open file handle
+@param[out] buf buffer where to read
+@param[in] offset file offset where to read
+@param[in] n number of bytes to read
+@return DB_SUCCESS if request was successful */
+dberr_t
+os_file_read_func(
+ const IORequest& type,
+ os_file_t file,
+ void* buf,
+ os_offset_t offset,
+ ulint n)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files.
+@param[in,out] file file to read from
+@param[in,out] str buffer where to read
+@param[in] size size of buffer */
+void
+os_file_read_string(
+ FILE* file,
+ char* str,
+ ulint size);
+
+/** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
+not directly this function!
+Requests a synchronous positioned read operation. This function does not do
+any error handling. In case of error it returns an error code.
+@param[in] type IO request context
+@param[in] file Open file handle
+@param[out] buf buffer where to read
+@param[in] offset file offset where to read
+@param[in] n number of bytes to read
+@param[out] o number of bytes actually read
+@return DB_SUCCESS or error code */
+dberr_t
+os_file_read_no_error_handling_func(
+ const IORequest& type,
+ os_file_t file,
+ void* buf,
+ os_offset_t offset,
+ ulint n,
+ ulint* o)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Use the corresponding macro os_file_write(), not directly this
+function!
+Requests a synchronous write operation.
+@param[in]	type	IO request context
+@param[in]	name	name of the file or path as a null-terminated string
+@param[in]	file	Open file handle
+@param[in]	buf	buffer from which to write
+@param[in]	offset	file offset where to write
+@param[in]	n	number of bytes to write
+@return DB_SUCCESS if request was successful */
+dberr_t
+os_file_write_func(
+ const IORequest& type,
+ const char* name,
+ os_file_t file,
+ const void* buf,
+ os_offset_t offset,
+ ulint n)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Check the existence and type of the given file.
+@param[in] path pathname of the file
+@param[out] exists true if file exists
+@param[out] type type of the file (if it exists)
+@return true if call succeeded */
+bool
+os_file_status(
+ const char* path,
+ bool* exists,
+ os_file_type_t* type);
+
+/** This function returns a new path name after replacing the basename
+in an old path with a new basename. The old_path is a full path
+name including the extension. The tablename is in the normal
+form "databasename/tablename". The new base name is found after
+the forward slash. Both input strings are null terminated.
+
+This function allocates memory to be returned. It is the caller's
+responsibility to free the return value after it is no longer needed.
+
+@param[in] old_path pathname
+@param[in] new_name new file name
+@return own: new full pathname */
+char*
+os_file_make_new_pathname(
+ const char* old_path,
+ const char* new_name);
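+
+/* For example (illustrative): given old_path "/data/dir/db1/t1.ibd" and
+new_name "db1/t2", the returned string would be "/data/dir/db1/t2.ibd". */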
+
+/** This function reduces a null-terminated full remote path name into
+the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
+the 'databasename/tablename.ibd' found at the end of the path with just
+'tablename'.
+
+Since the result is always smaller than the path sent in, no new memory
+is allocated. The caller should allocate memory for the path sent in.
+This function manipulates that path in place.
+
+If the path format is not as expected, just return. The result is used
+to inform a SHOW CREATE TABLE command.
+@param[in,out] data_dir_path Full path/data_dir_path */
+void
+os_file_make_data_dir_path(
+ char* data_dir_path);
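+
+/* For example (illustrative): the buffer "/mnt/dir/db1/t1.ibd" would be
+rewritten in place to "/mnt/dir/t1". */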
+
+/** Create all missing subdirectories along the given path.
+@return DB_SUCCESS if OK, otherwise error code. */
+dberr_t
+os_file_create_subdirs_if_needed(
+ const char* path);
+
+#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
+/* Test the function os_file_get_parent_dir. */
+void
+unit_test_os_file_get_parent_dir();
+#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
+
+/**
+Initializes the asynchronous io system. */
+int os_aio_init();
+
+/**
+Frees the asynchronous io system. */
+void os_aio_free();
+
+/** Request a read or write.
+@param type I/O request
+@param buf buffer
+@param offset file offset
+@param n number of bytes
+@retval DB_SUCCESS if request was queued successfully
+@retval DB_IO_ERROR on I/O error */
+dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n);
+
+/** Wait until there are no pending asynchronous writes.
+Only used on FLUSH TABLES...FOR EXPORT. */
+void os_aio_wait_until_no_pending_writes();
+
+
+/** Prints info of the aio arrays.
+@param[in,out]	file	file where to print */
+void
+os_aio_print(FILE* file);
+
+/** Refreshes the statistics used to print per-second averages. */
+void
+os_aio_refresh_stats();
+
+/** Checks that all slots in the system have been freed, that is, there are
+no pending io operations. */
+bool
+os_aio_all_slots_free();
+
+
+/** This function returns information about the specified file
+@param[in] path pathname of the file
+@param[out]	stat_info	information of a file in a directory
+@param[in] check_rw_perm for testing whether the file can be opened
+ in RW mode
+@param[in] read_only if true read only mode checks are enforced
+@return DB_SUCCESS if all OK */
+dberr_t
+os_file_get_status(
+ const char* path,
+ os_file_stat_t* stat_info,
+ bool check_rw_perm,
+ bool read_only);
+
+/** Set the file create umask
+@param[in] umask The umask to use for file creation. */
+void
+os_file_set_umask(ulint umask);
+
+#ifdef _WIN32
+
+/**
+Make file sparse, on Windows.
+
+@param[in] file file handle
+@param[in] is_sparse if true, make file sparse,
+ otherwise "unsparse" the file
+@return true on success, false on error */
+bool os_file_set_sparse_win32(os_file_t file, bool is_sparse = true);
+
+/**
+Changes file size on Windows.
+
+If the file is extended, the bytes between the old and the new EOF
+are zeroed.
+
+If the file is sparse, a "virtual" block is added at the end of the
+allocated area.
+
+If the file is a normal file, the file system allocates storage.
+
+@param[in] pathname file path
+@param[in] file file handle
+@param[in] size size to preserve in bytes
+@return true if success */
+bool
+os_file_change_size_win32(
+ const char* pathname,
+ os_file_t file,
+ os_offset_t size);
+
+#endif /*_WIN32 */
+
+/** Free storage space associated with a section of the file.
+@param[in] fh Open file handle
+@param[in] off Starting offset (SEEK_SET)
+@param[in] len Size of the hole
+@return DB_SUCCESS or error code */
+dberr_t
+os_file_punch_hole(
+ os_file_t fh,
+ os_offset_t off,
+ os_offset_t len)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Normalizes a directory path for the current OS:
+On Windows, we convert '/' to '\', else we convert '\' to '/'.
+@param[in,out] str A null-terminated directory and file path */
+void os_normalize_path(char* str);
+
+/** Determine if a path is an absolute path or not.
+@param[in]	path	OS directory or file path to evaluate
+@retval true if an absolute path
+@retval false if a relative path */
+UNIV_INLINE
+bool
+is_absolute_path(
+ const char* path)
+{
+ if (path[0] == OS_PATH_SEPARATOR) {
+ return(true);
+ }
+
+#ifdef _WIN32
+ if (path[1] == ':' && path[2] == OS_PATH_SEPARATOR) {
+ return(true);
+ }
+#endif /* _WIN32 */
+
+ return(false);
+}
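+
+/* For example (illustrative): on Unix, is_absolute_path("/var/lib/mysql")
+returns true and is_absolute_path("data/ibdata1") returns false; on
+Windows, "C:\data" is also recognized as absolute. */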
+
+#include "os0file.ic"
+
+#endif /* os0file_h */
diff --git a/storage/innobase/include/os0file.ic b/storage/innobase/include/os0file.ic
new file mode 100644
index 00000000..e88f94b8
--- /dev/null
+++ b/storage/innobase/include/os0file.ic
@@ -0,0 +1,450 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0file.ic
+The interface to the operating system file io
+
+Created 2/20/2010 Jimmy Yang
+*******************************************************/
+
+#ifdef UNIV_PFS_IO
+/** NOTE! Please use the corresponding macro os_file_create_simple(),
+not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple() which opens or creates a file.
+@param[in] key Performance Schema Key
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ /* register a file open or creation depending on "create_mode" */
+ register_pfs_file_open_begin(
+ &state, locker, key,
+ (create_mode == OS_FILE_CREATE)
+ ? PSI_FILE_CREATE : PSI_FILE_OPEN,
+ name, src_file, src_line);
+
+ pfs_os_file_t file = os_file_create_simple_func(
+ name, create_mode, access_type, read_only, success);
+
+ /* Register psi value for the file */
+ register_pfs_file_open_end(locker, file,
+ (*success == TRUE ? success : 0));
+
+ return(file);
+}
+
+/** NOTE! Please use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple_no_error_handling(). Add instrumentation to
+monitor file creation/open.
+@param[in] key Performance Schema Key
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_no_error_handling_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ /* register a file open or creation depending on "create_mode" */
+ register_pfs_file_open_begin(
+ &state, locker, key,
+ create_mode == OS_FILE_CREATE
+ ? PSI_FILE_CREATE : PSI_FILE_OPEN,
+ name, src_file, src_line);
+
+ pfs_os_file_t file = os_file_create_simple_no_error_handling_func(
+ name, create_mode, access_type, read_only, success);
+
+ register_pfs_file_open_end(locker, file,
+ (*success == TRUE ? success : 0));
+
+ return(file);
+}
+
+/** NOTE! Please use the corresponding macro os_file_create(), not directly
+this function!
+A performance schema wrapper function for os_file_create().
+Add instrumentation to monitor file creation/open.
+@param[in] key Performance Schema Key
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in]	purpose	OS_FILE_AIO, if asynchronous, non-buffered I/O
+		is desired, OS_FILE_NORMAL, if any normal file;
+		NOTE that it also depends on type, os_aio_..
+		and srv_.. variables whether we really use
+		async I/O or unbuffered I/O: look in the
+		function source code for the exact rules
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ ulint create_mode,
+ ulint purpose,
+ ulint type,
+ bool read_only,
+ bool* success,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ /* register a file open or creation depending on "create_mode" */
+ register_pfs_file_open_begin(
+ &state, locker, key,
+ create_mode == OS_FILE_CREATE
+ ? PSI_FILE_CREATE : PSI_FILE_OPEN,
+ name, src_file, src_line);
+
+ pfs_os_file_t file = os_file_create_func(
+ name, create_mode, purpose, type, read_only, success);
+
+ register_pfs_file_open_end(locker, file,
+ (*success == TRUE ? success : 0));
+
+ return(file);
+}
+/**
+NOTE! Please use the corresponding macro os_file_close(), not directly
+this function!
+A performance schema instrumented wrapper function for os_file_close().
+@param[in] file handle to a file
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_close_func(
+ pfs_os_file_t file,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ /* register the file close */
+ register_pfs_file_io_begin(
+ &state, locker, file, 0, PSI_FILE_CLOSE, src_file, src_line);
+
+ bool result = os_file_close_func(file);
+
+ register_pfs_file_io_end(locker, 0);
+
+ return(result);
+}
+
+/** NOTE! Please use the corresponding macro os_file_read(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_read() which requests a synchronous read operation.
+@param[in] type IO request context
+@param[in] file Open file handle
+@param[out] buf buffer where to read
+@param[in] offset file offset where to read
+@param[in] n number of bytes to read
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_read_func(
+ const IORequest& type,
+ pfs_os_file_t file,
+ void* buf,
+ os_offset_t offset,
+ ulint n,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_io_begin(
+ &state, locker, file, n, PSI_FILE_READ, src_file, src_line);
+
+ dberr_t result;
+
+ result = os_file_read_func(type, file, buf, offset, n);
+
+ register_pfs_file_io_end(locker, n);
+
+ return(result);
+}
+
+/** NOTE! Please use the corresponding macro os_file_read_no_error_handling(),
+not directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_read_no_error_handling_func() which requests a synchronous
+read operation.
+@param[in] type IO request context
+@param[in] file Open file handle
+@param[out] buf buffer where to read
+@param[in] offset file offset where to read
+@param[in] n number of bytes to read
+@param[out] o number of bytes actually read
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_read_no_error_handling_func(
+ const IORequest& type,
+ pfs_os_file_t file,
+ void* buf,
+ os_offset_t offset,
+ ulint n,
+ ulint* o,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_io_begin(
+ &state, locker, file, n, PSI_FILE_READ, src_file, src_line);
+
+ dberr_t result = os_file_read_no_error_handling_func(
+ type, file, buf, offset, n, o);
+
+ register_pfs_file_io_end(locker, n);
+
+ return(result);
+}
+
+/** NOTE! Please use the corresponding macro os_file_write(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_write() which requests a synchronous write operation.
+@param[in] type IO request context
+@param[in] name Name of the file or path as NUL terminated
+ string
+@param[in] file Open file handle
+@param[in]	buf	buffer from which to write
+@param[in]	offset	file offset where to write
+@param[in]	n	number of bytes to write
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return error code
+@retval DB_SUCCESS if the request was successfully fulfilled */
+UNIV_INLINE
+dberr_t
+pfs_os_file_write_func(
+ const IORequest& type,
+ const char* name,
+ pfs_os_file_t file,
+ const void* buf,
+ os_offset_t offset,
+ ulint n,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_io_begin(
+ &state, locker, file, n, PSI_FILE_WRITE, src_file, src_line);
+
+ dberr_t result;
+
+ result = os_file_write_func(type, name, file, buf, offset, n);
+
+ register_pfs_file_io_end(locker, n);
+
+ return(result);
+}
+
+
+/** NOTE! Please use the corresponding macro os_file_flush(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_flush() which flushes the write buffers of a given file to the disk.
+@param[in] file Open file handle
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_flush_func(
+ pfs_os_file_t file,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_io_begin(
+ &state, locker, file, 0, PSI_FILE_SYNC, src_file, src_line);
+
+ bool result = os_file_flush_func(file);
+
+ register_pfs_file_io_end(locker, 0);
+
+ return(result);
+}
+
+/** NOTE! Please use the corresponding macro os_file_rename(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_rename()
+@param[in] key Performance Schema Key
+@param[in] oldpath old file path as a null-terminated string
+@param[in] newpath new file path
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_rename_func(
+ mysql_pfs_key_t key,
+ const char* oldpath,
+ const char* newpath,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_rename_begin(
+ &state, locker, key, PSI_FILE_RENAME, newpath,
+ src_file, src_line);
+
+ bool result = os_file_rename_func(oldpath, newpath);
+
+ register_pfs_file_rename_end(locker, oldpath, newpath, !result);
+
+ return(result);
+}
+
+/** NOTE! Please use the corresponding macro os_file_delete(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete()
+@param[in] key Performance Schema Key
+@param[in]	name	file path as a null-terminated string
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_close_begin(
+ &state, locker, key, PSI_FILE_DELETE, name, src_file, src_line);
+
+ bool result = os_file_delete_func(name);
+
+ register_pfs_file_close_end(locker, 0);
+
+ return(result);
+}
+
+/**
+NOTE! Please use the corresponding macro os_file_delete_if_exists(), not
+directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete_if_exists()
+@param[in] key Performance Schema Key
+@param[in]	name	file path as a null-terminated string
+@param[in] exist indicate if file pre-exist
+@param[in] src_file file name where func invoked
+@param[in] src_line line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_if_exists_func(
+ mysql_pfs_key_t key,
+ const char* name,
+ bool* exist,
+ const char* src_file,
+ uint src_line)
+{
+ PSI_file_locker_state state;
+ struct PSI_file_locker* locker = NULL;
+
+ register_pfs_file_close_begin(
+ &state, locker, key, PSI_FILE_DELETE, name, src_file, src_line);
+
+ bool result = os_file_delete_if_exists_func(name, exist);
+
+ register_pfs_file_close_end(locker, 0);
+
+ return(result);
+}
+#endif /* UNIV_PFS_IO */
diff --git a/storage/innobase/include/os0thread.h b/storage/innobase/include/os0thread.h
new file mode 100644
index 00000000..ed989045
--- /dev/null
+++ b/storage/innobase/include/os0thread.h
@@ -0,0 +1,98 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0thread.h
+The interface to the operating system
+process and thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0thread_h
+#define os0thread_h
+
+#include "univ.i"
+
+/* Possible fixed priorities for threads */
+#define OS_THREAD_PRIORITY_NONE 100
+#define OS_THREAD_PRIORITY_BACKGROUND 1
+#define OS_THREAD_PRIORITY_NORMAL 2
+#define OS_THREAD_PRIORITY_ABOVE_NORMAL 3
+
+#ifdef _WIN32
+typedef DWORD os_thread_t;
+typedef DWORD os_thread_id_t; /*!< In Windows the thread id
+ is an unsigned long int */
+extern "C" {
+typedef LPTHREAD_START_ROUTINE os_thread_func_t;
+}
+
+/** Macro for specifying a Windows thread start function. */
+#define DECLARE_THREAD(func) WINAPI func
+#else
+
+typedef pthread_t os_thread_t;
+typedef pthread_t os_thread_id_t; /*!< In Unix we use the thread
+ handle itself as the id of
+ the thread */
+extern "C" { typedef void* (*os_thread_func_t)(void*); }
+
+/** Macro for specifying a POSIX thread start function. */
+#define DECLARE_THREAD(func) func
+#endif /* _WIN32 */
+
+/* Define a function pointer type to use in a typecast */
+typedef void* (*os_posix_f_t) (void*);
+
+#ifdef HAVE_PSI_INTERFACE
+/* Define for performance schema registration key */
+typedef unsigned int mysql_pfs_key_t;
+#endif /* HAVE_PSI_INTERFACE */
+
+#ifndef _WIN32
+#define os_thread_eq(a,b) pthread_equal(a, b)
+#define os_thread_yield() sched_yield()
+#define os_thread_get_curr_id() pthread_self()
+#else
+bool os_thread_eq(os_thread_id_t a, os_thread_id_t b);
+void os_thread_yield();
+os_thread_id_t os_thread_get_curr_id();
+#endif
+
+/****************************************************************//**
+Creates a new thread of execution. The execution starts from
+the function given.
+NOTE: We count the number of threads in os_thread_exit(). A created
+thread should always use that to exit so that the thread count will be
+decremented.
+We do not return an error code because if there is one, we crash here. */
+os_thread_t os_thread_create(os_thread_func_t func, void *arg= nullptr);
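+
+/* Illustrative sketch (not part of the original header): a thread body
+wrapped in DECLARE_THREAD(); "my_worker" and "do_work" are hypothetical.
+With the POSIX typedef the body returns void* (on Windows it would be
+a DWORD WINAPI function).
+
+	extern "C" void* DECLARE_THREAD(my_worker)(void* arg)
+	{
+		do_work(arg);		// hypothetical payload
+		os_thread_exit();	// decrements the thread count
+	}
+
+	os_thread_create(my_worker, NULL);
+*/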
+
+/** Detach and terminate the current thread. */
+ATTRIBUTE_NORETURN void os_thread_exit();
+
+/*****************************************************************//**
+The thread sleeps at least the time given in microseconds. */
+void
+os_thread_sleep(
+/*============*/
+ ulint tm); /*!< in: time in microseconds */
+
+#endif
diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h
new file mode 100644
index 00000000..c0f3bf68
--- /dev/null
+++ b/storage/innobase/include/page0cur.h
@@ -0,0 +1,350 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.h
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef page0cur_h
+#define page0cur_h
+
+#include "page0page.h"
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return buffer block */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets pointer to the compressed page descriptor, if any, where the
+cursor is positioned.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets the record where the cursor is positioned.
+@return record */
+UNIV_INLINE
+rec_t*
+page_cur_get_rec(
+/*=============*/
+ page_cur_t* cur); /*!< in: page cursor */
+#else /* UNIV_DEBUG */
+# define page_cur_get_page(cur) page_align((cur)->rec)
+# define page_cur_get_block(cur) (cur)->block
+# define page_cur_get_page_zip(cur) buf_block_get_page_zip((cur)->block)
+# define page_cur_get_rec(cur) (cur)->rec
+#endif /* UNIV_DEBUG */
+# define is_page_cur_get_page_zip(cur) is_buf_block_get_page_zip((cur)->block)
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+ const page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+ const page_cur_t* cur); /*!< in: cursor */
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+ const rec_t* rec, /*!< in: record on a page */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ page_cur_t* cur); /*!< out: page cursor */
+/**********************************************************//**
+Moves the cursor to the next record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+ page_cur_t* cur); /*!< in/out: cursor; must not be after last */
+/**********************************************************//**
+Moves the cursor to the previous record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+ page_cur_t* cur); /*!< in/out: cursor; not before first */
+
+/***********************************************************//**
+Inserts a record next to the page cursor. Returns a pointer to the
+inserted record if it succeeds, i.e., if enough space is available,
+NULL otherwise. The cursor stays at the same logical position, but the
+physical position may change if it is pointing to a compressed page
+that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record if it succeeds, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dtuple_t* tuple, /*!< in: pointer to a data tuple */
+ dict_index_t* index, /*!< in: record descriptor */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Inserts a record next to page cursor on an uncompressed page.
+Returns a pointer to the inserted record if it succeeds, i.e., if
+enough space is available, NULL otherwise. The cursor stays at the
+same position.
+@return pointer to record if it succeeds, NULL otherwise */
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+ const page_cur_t*cur, /*!< in: page cursor */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: record to insert after cur */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/***********************************************************//**
+Inserts a record next to the page cursor on a compressed page,
+keeping the compressed and the uncompressed copy in sync. Returns a
+pointer to the inserted record if it succeeds, i.e., if enough space
+is available, NULL otherwise. The cursor stays at the same position.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record if it succeeds, NULL otherwise */
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the
+next record after the deleted one. */
+void
+page_cur_delete_rec(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(
+ cursor->rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+
+/** Apply an INSERT_HEAP_REDUNDANT or INSERT_REUSE_REDUNDANT record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=REDUNDANT page.
+@param block	B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@param enc_hdr encoded fixed-size header bits
+@param hdr_c number of common record header bytes with prev
+@param data_c number of common data bytes with prev
+@param data literal header and data bytes
+@param data_len length of the literal data, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_insert_redundant(const buf_block_t &block, bool reuse,
+ ulint prev, ulint enc_hdr,
+ size_t hdr_c, size_t data_c,
+ const void *data, size_t data_len);
+
+/** Apply an INSERT_HEAP_DYNAMIC or INSERT_REUSE_DYNAMIC record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param shift unless !reuse: number of bytes the PAGE_FREE is moving
+@param enc_hdr_l number of copied record header bytes, plus record type bits
+@param hdr_c number of common record header bytes with prev
+@param data_c number of common data bytes with prev
+@param data literal header and data bytes
+@param data_len length of the literal data, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_insert_dynamic(const buf_block_t &block, bool reuse,
+ ulint prev, ulint shift, ulint enc_hdr_l,
+ size_t hdr_c, size_t data_c,
+ const void *data, size_t data_len);
+
+/** Apply a DELETE_ROW_FORMAT_REDUNDANT record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=REDUNDANT page.
+@param block B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param prev byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_redundant(const buf_block_t &block, ulint prev);
+
+/** Apply a DELETE_ROW_FORMAT_DYNAMIC record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param prev byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size data payload size, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev,
+ size_t hdr_size, size_t data_size);
+
+/** Search the right position for a page cursor.
+@param[in] block buffer block
+@param[in] index index tree
+@param[in] tuple data tuple
+@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE
+@param[out] cursor page cursor
+@return number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+ const buf_block_t* block,
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ page_cur_mode_t mode,
+ page_cur_t* cursor);
+
+/** Search the right position for a page cursor.
+@param[in] block buffer block
+@param[in] index index tree
+@param[in] tuple data tuple
+@param[out] cursor page cursor
+@return number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+ const buf_block_t* block,
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ page_cur_t* cursor);
+
+/****************************************************************//**
+Searches the right position for a page cursor. */
+void
+page_cur_search_with_match(
+/*=======================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ ulint* iup_matched_fields,
+ /*!< in/out: already matched
+ fields in upper limit record */
+ ulint* ilow_matched_fields,
+ /*!< in/out: already matched
+ fields in lower limit record */
+ page_cur_t* cursor, /*!< out: page cursor */
+ rtr_info_t* rtr_info);/*!< in/out: rtree search stack */
+#ifdef BTR_CUR_HASH_ADAPT
+/** Search the right position for a page cursor.
+@param[in] block buffer block
+@param[in] index index tree
+@param[in] tuple key to be searched for
+@param[in] mode search mode
+@param[in,out] iup_matched_fields already matched fields in the
+upper limit record
+@param[in,out] iup_matched_bytes already matched bytes in the
+first partially matched field in the upper limit record
+@param[in,out] ilow_matched_fields already matched fields in the
+lower limit record
+@param[in,out] ilow_matched_bytes already matched bytes in the
+first partially matched field in the lower limit record
+@param[out] cursor page cursor */
+void
+page_cur_search_with_match_bytes(
+ const buf_block_t* block,
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ page_cur_mode_t mode,
+ ulint* iup_matched_fields,
+ ulint* iup_matched_bytes,
+ ulint* ilow_matched_fields,
+ ulint* ilow_matched_bytes,
+ page_cur_t* cursor);
+#endif /* BTR_CUR_HASH_ADAPT */
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record. */
+void
+page_cur_open_on_rnd_user_rec(
+/*==========================*/
+ buf_block_t* block, /*!< in: page */
+ page_cur_t* cursor);/*!< out: page cursor */
+
+/** Index page cursor */
+
+struct page_cur_t{
+ const dict_index_t* index;
+ rec_t* rec; /*!< pointer to a record on page */
+ rec_offs* offsets;
+ buf_block_t* block; /*!< pointer to the block containing rec */
+};
+
+#include "page0cur.ic"
+
+#endif
diff --git a/storage/innobase/include/page0cur.ic b/storage/innobase/include/page0cur.ic
new file mode 100644
index 00000000..828be684
--- /dev/null
+++ b/storage/innobase/include/page0cur.ic
@@ -0,0 +1,291 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.ic
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ ut_ad(cur);
+
+ if (cur->rec) {
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ }
+
+ return(page_align(cur->rec));
+}
+
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return buffer block */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ ut_ad(cur);
+
+ if (cur->rec) {
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ }
+
+ return(cur->block);
+}
+
+/*********************************************************//**
+Gets pointer to the compressed page descriptor, if any, where the
+cursor is positioned.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ return(buf_block_get_page_zip(page_cur_get_block(cur)));
+}
+
+/*********************************************************//**
+Gets the record where the cursor is positioned.
+@return record */
+UNIV_INLINE
+rec_t*
+page_cur_get_rec(
+/*=============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ ut_ad(cur);
+
+ if (cur->rec) {
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ }
+
+ return(cur->rec);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur) /*!< in: cursor */
+{
+ cur->block = (buf_block_t*) block;
+ cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur) /*!< in: cursor */
+{
+ cur->block = (buf_block_t*) block;
+ cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+ const page_cur_t* cur) /*!< in: cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ return(page_rec_is_infimum(cur->rec));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+ const page_cur_t* cur) /*!< in: cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ return(page_rec_is_supremum(cur->rec));
+}
+
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+ const rec_t* rec, /*!< in: record on a page */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ page_cur_t* cur) /*!< out: page cursor */
+{
+ ut_ad(rec && block && cur);
+ ut_ad(page_align(rec) == block->frame);
+
+ cur->rec = (rec_t*) rec;
+ cur->block = (buf_block_t*) block;
+}
+
+/**********************************************************//**
+Moves the cursor to the next record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+ page_cur_t* cur) /*!< in/out: cursor; must not be after last */
+{
+ ut_ad(!page_cur_is_after_last(cur));
+
+ cur->rec = page_rec_get_next(cur->rec);
+}
+
+/**********************************************************//**
+Moves the cursor to the previous record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+ page_cur_t* cur) /*!< in/out: page cursor, not before first */
+{
+ ut_ad(!page_cur_is_before_first(cur));
+
+ cur->rec = page_rec_get_prev(cur->rec);
+}
+
+/** Search the right position for a page cursor.
+@param[in] block buffer block
+@param[in] index index tree
+@param[in] tuple data tuple
+@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE
+@param[out] cursor page cursor
+@return number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+ const buf_block_t* block,
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ page_cur_mode_t mode,
+ page_cur_t* cursor)
+{
+ ulint low_match = 0;
+ ulint up_match = 0;
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ page_cur_search_with_match(block, index, tuple, mode,
+ &up_match, &low_match, cursor, NULL);
+ return(low_match);
+}
+
+/** Search the right position for a page cursor.
+@param[in] block buffer block
+@param[in] index index tree
+@param[in] tuple data tuple
+@param[out] cursor page cursor
+@return number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+ const buf_block_t* block,
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ page_cur_t* cursor)
+{
+ return(page_cur_search(block, index, tuple, PAGE_CUR_LE, cursor));
+}
+
+/***********************************************************//**
+Inserts a record next to the page cursor. Returns a pointer to the
+inserted record if it succeeds, i.e., if enough space is available,
+NULL otherwise. The cursor stays at the same logical position, but the
+physical position may change if it is pointing to a compressed page
+that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record if it succeeds, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dtuple_t* tuple, /*!< in: pointer to a data tuple */
+ dict_index_t* index, /*!< in: record descriptor */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ rec_t* rec;
+ ulint size = rec_get_converted_size(index, tuple, n_ext);
+
+ if (!*heap) {
+ *heap = mem_heap_create(size
+ + (4 + REC_OFFS_HEADER_SIZE
+ + dtuple_get_n_fields(tuple))
+ * sizeof **offsets);
+ }
+
+ rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(*heap, size),
+ index, tuple, n_ext);
+
+ *offsets = rec_get_offsets(rec, index, *offsets,
+ page_is_leaf(cursor->block->frame)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, heap);
+ ut_ad(size == rec_offs_size(*offsets));
+
+ if (is_buf_block_get_page_zip(cursor->block)) {
+ rec = page_cur_insert_rec_zip(
+ cursor, index, rec, *offsets, mtr);
+ } else {
+ rec = page_cur_insert_rec_low(cursor,
+ index, rec, *offsets, mtr);
+ }
+
+ ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, *offsets));
+ return(rec);
+}
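+
+/* Illustrative sketch (not part of the original header): a typical caller
+lets this function allocate the offsets array and memory heap on demand
+and frees the heap afterwards. "cursor", "tuple", "index" and "mtr" are
+assumed to be set up by the caller.
+
+	mem_heap_t*	heap = NULL;
+	rec_offs*	offsets = NULL;
+
+	rec_t*	rec = page_cur_tuple_insert(
+		&cursor, tuple, index, &offsets, &heap, 0, &mtr);
+
+	if (heap) {
+		mem_heap_free(heap);
+	}
+*/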
diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h
new file mode 100644
index 00000000..a73b9e48
--- /dev/null
+++ b/storage/innobase/include/page0page.h
@@ -0,0 +1,1171 @@
+/*****************************************************************************
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0page_h
+#define page0page_h
+
+#include "page0types.h"
+#include "fsp0fsp.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "rem0rec.h"
+#include "mach0data.h"
+#ifndef UNIV_INNOCHECKSUM
+#include "dict0dict.h"
+#include "data0data.h"
+#include "mtr0mtr.h"
+
+/* PAGE HEADER
+ ===========
+
+Index page header starts at the first offset left free by the FIL-module */
+
+typedef byte page_header_t;
+#endif /* !UNIV_INNOCHECKSUM */
+
+#define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this
+ offset */
+/*-----------------------------*/
+#define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */
+#define PAGE_HEAP_TOP 2 /* pointer to record heap top */
+#define PAGE_N_HEAP 4 /* number of records in the heap,
+ bit 15=flag: new-style compact page format */
+#define PAGE_FREE 6 /* pointer to start of page free record list */
+#define PAGE_GARBAGE 8 /* number of bytes in deleted records */
+#define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or
+ 0 if this info has been reset by a delete,
+ for example */
+
+/** This 10-bit field is usually 0. In B-tree index pages of
+ROW_FORMAT=REDUNDANT tables, this byte can contain garbage if the .ibd
+file was created in MySQL 4.1.0 or if the table resides in the system
+tablespace and was created before MySQL 4.1.1 or MySQL 4.0.14.
+In this case, the FIL_PAGE_TYPE would be FIL_PAGE_INDEX.
+
+In ROW_FORMAT=COMPRESSED tables, this field is always 0, because
+instant ADD COLUMN is not supported.
+
+In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables, this field is
+always 0, except in the root page of the clustered index after instant
+ADD COLUMN.
+
+Instant ADD COLUMN will change FIL_PAGE_TYPE to FIL_PAGE_TYPE_INSTANT
+and initialize the PAGE_INSTANT field to the original number of
+fields in the clustered index (dict_index_t::n_core_fields). The most
+significant bits are in the first byte, and the least significant 5
+bits are stored in the most significant 5 bits of PAGE_DIRECTION_B.
+
+These FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be assigned even if
+instant ADD COLUMN was not committed. Changes to these page header fields
+are not undo-logged, but changes to the hidden metadata record are.
+If the server is killed and restarted, the page header fields could
+remain set even though no metadata record is present.
+
+When the table becomes empty, the PAGE_INSTANT field and the
+FIL_PAGE_TYPE can be reset and any metadata record be removed. */
+#define PAGE_INSTANT 12
+
+/** last insert direction: PAGE_LEFT, ....
+In ROW_FORMAT=REDUNDANT tables created before MySQL 4.1.1 or MySQL 4.0.14,
+this byte can be garbage. */
+#define PAGE_DIRECTION_B 13
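+
+/* A decoding sketch (illustrative only, not part of this header): the
+10-bit PAGE_INSTANT value and the 3-bit direction share these two bytes,
+so with the accessors declared below they could be read as
+
+	uint16_t n_core = page_header_get_field(page, PAGE_INSTANT) >> 3;
+	byte	 dir	= page[PAGE_HEADER + PAGE_DIRECTION_B] & 7;
+*/
+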
+#define PAGE_N_DIRECTION 14 /* number of consecutive inserts to the same
+ direction */
+#define PAGE_N_RECS 16 /* number of user records on the page */
+/** The largest DB_TRX_ID that may have modified a record on the page.
+Defined only in secondary index leaf pages and in change buffer leaf pages.
+Otherwise written as 0. @see PAGE_ROOT_AUTO_INC */
+#define PAGE_MAX_TRX_ID 18
+/** The AUTO_INCREMENT value (on persistent clustered index root pages). */
+#define PAGE_ROOT_AUTO_INC PAGE_MAX_TRX_ID
+#define PAGE_HEADER_PRIV_END 26	/* end of the private data structure of the
+				page header; these fields are set when a
+				page is created */
+/*----*/
+#define PAGE_LEVEL 26 /* level of the node in an index tree; the
+				leaf level is level 0. This field should
+ not be written to after page creation. */
+#define PAGE_INDEX_ID 28 /* index id where the page belongs.
+ This field should not be written to after
+ page creation. */
+
+#define PAGE_BTR_SEG_LEAF 36 /* file segment header for the leaf pages in
+ a B-tree: defined only on the root page of a
+ B-tree, but not in the root of an ibuf tree */
+#define PAGE_BTR_IBUF_FREE_LIST PAGE_BTR_SEG_LEAF
+#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF
+ /* in the place of PAGE_BTR_SEG_LEAF and _TOP
+ there is a free list base node if the page is
+ the root page of an ibuf tree, and at the same
+ place is the free list node if the page is in
+ a free list */
+#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE)
+ /* file segment header for the non-leaf pages
+ in a B-tree: defined only on the root page of
+ a B-tree, but not in the root of an ibuf
+ tree */
+/*----*/
+#define PAGE_DATA (PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE)
+ /* start of data on the page */
+
+#define PAGE_OLD_INFIMUM (PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES)
+ /* offset of the page infimum record on an
+ old-style page */
+#define PAGE_OLD_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8)
+ /* offset of the page supremum record on an
+ old-style page */
+#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9)
+ /* offset of the page supremum record end on
+ an old-style page */
+#define PAGE_NEW_INFIMUM (PAGE_DATA + REC_N_NEW_EXTRA_BYTES)
+ /* offset of the page infimum record on a
+ new-style compact page */
+#define PAGE_NEW_SUPREMUM (PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8)
+ /* offset of the page supremum record on a
+ new-style compact page */
+#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8)
+ /* offset of the page supremum record end on
+ a new-style compact page */
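+
+/* Worked values (a sketch, assuming the usual constants FIL_PAGE_DATA = 38,
+FSEG_HEADER_SIZE = 10, REC_N_OLD_EXTRA_BYTES = 6, REC_N_NEW_EXTRA_BYTES = 5):
+PAGE_DATA = 38 + 36 + 20 = 94, hence PAGE_NEW_INFIMUM = 99,
+PAGE_NEW_SUPREMUM = 112, PAGE_NEW_SUPREMUM_END = 120, PAGE_OLD_INFIMUM = 101,
+PAGE_OLD_SUPREMUM = 116 and PAGE_OLD_SUPREMUM_END = 125. None of these
+offsets depend on srv_page_size. */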
+/*-----------------------------*/
+
+/* Heap numbers */
+#define PAGE_HEAP_NO_INFIMUM 0U /* page infimum */
+#define PAGE_HEAP_NO_SUPREMUM 1U /* page supremum */
+#define PAGE_HEAP_NO_USER_LOW 2U /* first user record in
+ creation (insertion) order,
+ not necessarily collation order;
+ this record may have been deleted */
+
+/* Directions of cursor movement (stored in PAGE_DIRECTION field) */
+constexpr uint16_t PAGE_LEFT= 1;
+constexpr uint16_t PAGE_RIGHT= 2;
+constexpr uint16_t PAGE_SAME_REC= 3;
+constexpr uint16_t PAGE_SAME_PAGE= 4;
+constexpr uint16_t PAGE_NO_DIRECTION= 5;
+
+#ifndef UNIV_INNOCHECKSUM
+
+/* PAGE DIRECTORY
+ ==============
+*/
+
+typedef byte page_dir_slot_t;
+
+/* Offset of the directory start down from the page end. We call the
+slot with the highest file address the directory start, as it points to
+the first record in the list of records. */
+#define PAGE_DIR FIL_PAGE_DATA_END
+
+/* We define a slot in the page directory as two bytes */
+constexpr uint16_t PAGE_DIR_SLOT_SIZE= 2;
+
+/* The offset of the physically lower end of the directory, counted from
+page end, when the page is empty */
+#define PAGE_EMPTY_DIR_START (PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE)
+
+/* The maximum and minimum number of records owned by a directory slot. The
+number may drop below the minimum in the first and the last slot in the
+directory. */
+#define PAGE_DIR_SLOT_MAX_N_OWNED 8
+#define PAGE_DIR_SLOT_MIN_N_OWNED 4
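+
+/* Addressing sketch (illustrative): since the directory grows downwards
+from the end of the page, slot n would be located at
+
+	page + srv_page_size - (PAGE_DIR + PAGE_DIR_SLOT_SIZE)
+	     - n * PAGE_DIR_SLOT_SIZE
+
+which is what page_dir_get_nth_slot() below computes. */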
+
+extern my_bool srv_immediate_scrub_data_uncompressed;
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Get the start of a page frame.
+@param[in] ptr pointer within a page frame
+@return start of the page frame */
+MY_ATTRIBUTE((const))
+inline page_t* page_align(void *ptr)
+{
+ return my_assume_aligned<UNIV_PAGE_SIZE_MIN>
+ (reinterpret_cast<page_t*>(ut_align_down(ptr, srv_page_size)));
+}
+inline const page_t *page_align(const void *ptr)
+{
+ return page_align(const_cast<void*>(ptr));
+}
+
+/** Gets the byte offset within a page frame.
+@param[in] ptr pointer within a page frame
+@return offset from the start of the page */
+MY_ATTRIBUTE((const))
+inline uint16_t page_offset(const void* ptr)
+{
+ return static_cast<uint16_t>(ut_align_offset(ptr, srv_page_size));
+}
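+
+/* A minimal sketch of the invariant relating the two helpers above: for any
+ptr within a page frame,
+
+	page_align(ptr) + page_offset(ptr) == ptr
+
+because both are defined in terms of the same srv_page_size. */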
+
+/** Determine whether an index page is not in ROW_FORMAT=REDUNDANT.
+@param[in] page index page
+@return nonzero if ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@retval 0 if ROW_FORMAT=REDUNDANT */
+inline
+byte
+page_is_comp(const page_t* page)
+{
+ ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+ return(page[PAGE_HEADER + PAGE_N_HEAP] & 0x80);
+}
+
+/** Determine whether an index page is empty.
+@param[in] page index page
+@return whether the page is empty (PAGE_N_RECS = 0) */
+inline
+bool
+page_is_empty(const page_t* page)
+{
+ ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+ return !*reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_N_RECS
+ + page);
+}
+
+/** Determine whether an index page contains garbage.
+@param[in] page index page
+@return whether the page contains garbage (PAGE_GARBAGE is not 0) */
+inline
+bool
+page_has_garbage(const page_t* page)
+{
+ ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+ return *reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_GARBAGE
+ + page);
+}
+
+/** Determine whether a B-tree or R-tree index page is a leaf page.
+@param[in] page index page
+@return true if the page is a leaf (PAGE_LEVEL = 0) */
+inline
+bool
+page_is_leaf(const page_t* page)
+{
+ ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+ return !*reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_LEVEL
+ + page);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/** Determine whether an index page record is not in ROW_FORMAT=REDUNDANT.
+@param[in] rec record in an index page frame (not a copy)
+@return nonzero if ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@retval 0 if ROW_FORMAT=REDUNDANT */
+inline
+byte
+page_rec_is_comp(const byte* rec)
+{
+ return(page_is_comp(page_align(rec)));
+}
+
+# ifdef UNIV_DEBUG
+/** Determine if the record is the metadata pseudo-record
+in the clustered index.
+@param[in] rec leaf page record on an index page
+@return whether the record is the metadata pseudo-record */
+inline bool page_rec_is_metadata(const rec_t* rec)
+{
+ return rec_get_info_bits(rec, page_rec_is_comp(rec))
+ & REC_INFO_MIN_REC_FLAG;
+}
+# endif /* UNIV_DEBUG */
+
+/** Determine the offset of the infimum record on the page.
+@param[in] page index page
+@return offset of the infimum record in record list, relative from page */
+inline
+unsigned
+page_get_infimum_offset(const page_t* page)
+{
+ ut_ad(!page_offset(page));
+ return page_is_comp(page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM;
+}
+
+/** Determine the offset of the supremum record on the page.
+@param[in] page index page
+@return offset of the supremum record in record list, relative from page */
+inline
+unsigned
+page_get_supremum_offset(const page_t* page)
+{
+ ut_ad(!page_offset(page));
+ return page_is_comp(page) ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM;
+}
+
+/** Determine whether an index page record is a user record.
+@param[in] offset record offset in the page
+@retval true if a user record
+@retval false if the infimum or supremum pseudo-record */
+inline
+bool
+page_rec_is_user_rec_low(ulint offset)
+{
+ compile_time_assert(PAGE_OLD_INFIMUM >= PAGE_NEW_INFIMUM);
+ compile_time_assert(PAGE_OLD_SUPREMUM >= PAGE_NEW_SUPREMUM);
+ compile_time_assert(PAGE_NEW_INFIMUM < PAGE_OLD_SUPREMUM);
+ compile_time_assert(PAGE_OLD_INFIMUM < PAGE_NEW_SUPREMUM);
+ compile_time_assert(PAGE_NEW_SUPREMUM < PAGE_OLD_SUPREMUM_END);
+ compile_time_assert(PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM_END);
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+ ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+
+ return(offset != PAGE_NEW_SUPREMUM
+ && offset != PAGE_NEW_INFIMUM
+ && offset != PAGE_OLD_INFIMUM
+ && offset != PAGE_OLD_SUPREMUM);
+}
+
+/** Determine if a record is the supremum record on an index page.
+@param[in] offset record offset in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum_low(ulint offset)
+{
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+ ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+ return(offset == PAGE_NEW_SUPREMUM || offset == PAGE_OLD_SUPREMUM);
+}
+
+/** Determine if a record is the infimum record on an index page.
+@param[in] offset record offset in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum_low(ulint offset)
+{
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+ ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+ return(offset == PAGE_NEW_INFIMUM || offset == PAGE_OLD_INFIMUM);
+}
+
+/** Determine whether a B-tree or R-tree index record is in a leaf page.
+@param[in] rec index record in an index page
+@return true if the record is in a leaf page */
+inline
+bool
+page_rec_is_leaf(const page_t* rec)
+{
+ const page_t* page = page_align(rec);
+ ut_ad(ulint(rec - page) >= page_get_infimum_offset(page));
+ bool leaf = page_is_leaf(page);
+ ut_ad(!page_rec_is_comp(rec)
+ || !page_rec_is_user_rec_low(ulint(rec - page))
+ || leaf == !rec_get_node_ptr_flag(rec));
+ return leaf;
+}
+
+/** Determine whether an index page record is a user record.
+@param[in] rec record in an index page
+@return true if a user record */
+inline
+bool
+page_rec_is_user_rec(const rec_t* rec);
+
+/** Determine whether an index page record is the supremum record.
+@param[in] rec record in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum(const rec_t* rec);
+
+/** Determine whether an index page record is the infimum record.
+@param[in] rec record in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum(const rec_t* rec);
+
+/** Read PAGE_MAX_TRX_ID.
+@param[in] page index page
+@return the value of PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline trx_id_t page_get_max_trx_id(const page_t *page)
+{
+ ut_ad(fil_page_index_page_check(page));
+ static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment");
+ const auto *p= my_assume_aligned<8>(page + PAGE_HEADER + PAGE_MAX_TRX_ID);
+ return mach_read_from_8(p);
+}
+
+/**
+Set the number of owned records.
+@tparam compressed whether to update any ROW_FORMAT=COMPRESSED page as well
+@param[in,out] block index page
+@param[in,out] rec record in block.frame
+@param[in] n_owned number of records skipped in the sparse page directory
+@param[in] comp whether ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@param[in,out] mtr mini-transaction */
+template<bool compressed>
+inline void page_rec_set_n_owned(buf_block_t *block, rec_t *rec, ulint n_owned,
+ bool comp, mtr_t *mtr)
+{
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(comp == (page_is_comp(block->frame) != 0));
+
+ if (page_zip_des_t *page_zip= compressed
+ ? buf_block_get_page_zip(block) : nullptr)
+ {
+ ut_ad(comp);
+ rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ if (rec_get_status(rec) != REC_STATUS_SUPREMUM)
+ page_zip_rec_set_owned(block, rec, n_owned, mtr);
+ }
+ else
+ {
+ rec-= comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, rec, (*rec & ~REC_N_OWNED_MASK) |
+ (n_owned << REC_N_OWNED_SHIFT));
+ }
+}
+
+/*************************************************************//**
+Sets the max trx id field value. */
+void
+page_set_max_trx_id(
+/*================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr); /*!< in/out: mini-transaction, or NULL */
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+
+/** Persist the AUTO_INCREMENT value on a clustered index root page.
+@param[in,out] block clustered index root page
+@param[in] autoinc next available AUTO_INCREMENT value
+@param[in,out] mtr mini-transaction
+@param[in] reset whether to reset the AUTO_INCREMENT
+ to a possibly smaller value than currently
+ exists in the page */
+void
+page_set_autoinc(
+ buf_block_t* block,
+ ib_uint64_t autoinc,
+ mtr_t* mtr,
+ bool reset)
+ MY_ATTRIBUTE((nonnull));
+
+/*************************************************************//**
+Returns the RTREE SPLIT SEQUENCE NUMBER (FIL_RTREE_SPLIT_SEQ_NUM).
+@return SPLIT SEQUENCE NUMBER */
+UNIV_INLINE
+node_seq_t
+page_get_ssn_id(
+/*============*/
+ const page_t* page); /*!< in: page */
+/*************************************************************//**
+Sets the RTREE SPLIT SEQUENCE NUMBER field value */
+UNIV_INLINE
+void
+page_set_ssn_id(
+/*============*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ node_seq_t ssn_id, /*!< in: split sequence id */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+
+#endif /* !UNIV_INNOCHECKSUM */
+/** Read a page header field. */
+inline uint16_t page_header_get_field(const page_t *page, ulint field)
+{
+ ut_ad(field <= PAGE_INDEX_ID);
+ ut_ad(!(field & 1));
+ return mach_read_from_2(my_assume_aligned<2>(PAGE_HEADER + field + page));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+uint16_t
+page_header_get_offs(
+/*=================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_FREE, ... */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*************************************************************//**
+Returns the pointer stored in the given header field, or NULL. */
+#define page_header_get_ptr(page, field) \
+ (page_header_get_offs(page, field) \
+ ? page + page_header_get_offs(page, field) : NULL)
+
+/**
+Reset PAGE_LAST_INSERT.
+@param[in,out] block file page
+@param[in,out] mtr mini-transaction */
+inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page))
+#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page))
+
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record */
+const rec_t*
+page_rec_get_nth_const(
+/*===================*/
+ const page_t* page, /*!< in: page */
+ ulint nth) /*!< in: nth record */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record */
+UNIV_INLINE
+rec_t*
+page_rec_get_nth(
+/*=============*/
+	page_t*	page,	/*!< in: page */
+ ulint nth) /*!< in: nth record */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
+rec_t*
+page_get_middle_rec(
+/*================*/
+ page_t* page) /*!< in: page */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+uint32_t
+page_get_page_no(
+/*=============*/
+ const page_t* page); /*!< in: page */
+
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+uint32_t
+page_get_space_id(
+/*==============*/
+ const page_t* page); /*!< in: page */
+
+/*************************************************************//**
+Gets the number of user records on page (the infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_get_n_recs(
+/*============*/
+ const page_t* page); /*!< in: index page */
+
+/***************************************************************//**
+Returns the number of records before the given record in chain.
+The number includes infimum and supremum records.
+This is the inverse function of page_rec_get_nth().
+@return number of records */
+ulint
+page_rec_get_n_recs_before(
+/*=======================*/
+ const rec_t* rec); /*!< in: the physical record */
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of records in the record heap */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_heap(
+/*================*/
+ const page_t* page); /*!< in: index page */
+/*************************************************************//**
+Gets the number of slots in the page directory.
+@return number of slots */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_slots(
+/*=================*/
+ const page_t* page); /*!< in: index page */
+/** Gets the pointer to a directory slot.
+@param n sparse directory slot number
+@return pointer to the sparse directory slot */
+inline page_dir_slot_t *page_dir_get_nth_slot(page_t *page, ulint n)
+{
+ ut_ad(page_dir_get_n_slots(page) > n);
+ static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+ return my_assume_aligned<2>(page + srv_page_size - (PAGE_DIR + 2) - n * 2);
+}
+inline const page_dir_slot_t *page_dir_get_nth_slot(const page_t *page,ulint n)
+{
+ return page_dir_get_nth_slot(const_cast<page_t*>(page), n);
+}
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE if succeed */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ const rec_t* rec); /*!< in: record */
+/** Get the record pointed to by a directory slot.
+@param[in] slot directory slot
+@return pointer to record */
+inline rec_t *page_dir_slot_get_rec(page_dir_slot_t *slot)
+{
+ return page_align(slot) + mach_read_from_2(my_assume_aligned<2>(slot));
+}
+inline const rec_t *page_dir_slot_get_rec(const page_dir_slot_t *slot)
+{
+ return page_dir_slot_get_rec(const_cast<rec_t*>(slot));
+}
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ const page_dir_slot_t* slot); /*!< in: page directory slot */
+/************************************************************//**
+Calculates the space reserved for directory slots of a given
+number of records. The exact value is a fraction number
+n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED, and it is
+rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+ ulint n_recs); /*!< in: number of records */
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return the directory slot number */
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+ const rec_t* rec); /*!< in: the physical record */
+
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+ const rec_t* rec); /*!< in: the physical record */
+/** Determine whether a page has any siblings.
+@param[in] page page frame
+@return true if the page has any siblings */
+inline bool page_has_siblings(const page_t* page)
+{
+ compile_time_assert(!(FIL_PAGE_PREV % 8));
+ compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ return *reinterpret_cast<const uint64_t*>(page + FIL_PAGE_PREV)
+ != ~uint64_t(0);
+}
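+
+/* Explanatory note: the function above reads FIL_PAGE_PREV and FIL_PAGE_NEXT
+as one aligned 64-bit word (the static assertions guarantee adjacency and
+alignment); the page has no siblings exactly when both fields are FIL_NULL,
+i.e. when the word equals ~uint64_t(0). */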
+
+/** Determine whether a page has a predecessor.
+@param[in] page page frame
+@return true if the page has a predecessor */
+inline bool page_has_prev(const page_t* page)
+{
+ return *reinterpret_cast<const uint32_t*>(page + FIL_PAGE_PREV)
+ != FIL_NULL;
+}
+
+/** Determine whether a page has a successor.
+@param[in] page page frame
+@return true if the page has a successor */
+inline bool page_has_next(const page_t* page)
+{
+ return *reinterpret_cast<const uint32_t*>(page + FIL_PAGE_NEXT)
+ != FIL_NULL;
+}
+
+/** Read the AUTO_INCREMENT value from a clustered index root page.
+@param[in] page clustered index root page
+@return the persisted AUTO_INCREMENT value */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline uint64_t page_get_autoinc(const page_t *page)
+{
+ ut_d(uint16_t page_type= fil_page_get_type(page));
+ ut_ad(page_type == FIL_PAGE_INDEX || page_type == FIL_PAGE_TYPE_INSTANT);
+ ut_ad(!page_has_siblings(page));
+ const auto *p= my_assume_aligned<8>(page + PAGE_HEADER + PAGE_ROOT_AUTO_INC);
+ return mach_read_from_8(p);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+ const rec_t* rec, /*!< in: pointer to record */
+ ulint comp); /*!< in: nonzero=compact page layout */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+ rec_t* rec); /*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+ const rec_t* rec); /*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the next non delete-marked record on the page.
+If all subsequent records are delete-marked, then this function
+will return the supremum record.
+@return pointer to next non delete-marked record or pointer to supremum */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_non_del_marked(
+/*=============================*/
+ const rec_t* rec); /*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+ const rec_t* rec); /*!< in: pointer to record, must not be page
+ infimum */
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+rec_t*
+page_rec_get_prev(
+/*==============*/
+ rec_t* rec); /*!< in: pointer to record,
+ must not be page infimum */
+
+/************************************************************//**
+true if the record is the first user record on a page.
+@return true if the first user record */
+UNIV_INLINE
+bool
+page_rec_is_first(
+/*==============*/
+ const rec_t* rec, /*!< in: record */
+ const page_t* page) /*!< in: page */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/************************************************************//**
+true if the record is the second user record on a page.
+@return true if the second user record */
+UNIV_INLINE
+bool
+page_rec_is_second(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ const page_t* page) /*!< in: page */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/************************************************************//**
+true if the record is the last user record on a page.
+@return true if the last user record */
+UNIV_INLINE
+bool
+page_rec_is_last(
+/*=============*/
+ const rec_t* rec, /*!< in: record */
+ const page_t* page) /*!< in: page */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/************************************************************//**
+true if the distance between the records (measured in the number of times
+we have to move to the next record) is at most the specified value
+@param[in]	left_rec	record on the left
+@param[in]	right_rec	record on the right
+@param[in]	val		the specified value to compare against
+@return true if the distance is at most val */
+UNIV_INLINE
+bool
+page_rec_distance_is_at_most(
+/*=========================*/
+ const rec_t* left_rec,
+ const rec_t* right_rec,
+ ulint val)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/************************************************************//**
+true if the record is the second last user record on a page.
+@return true if the second last user record */
+UNIV_INLINE
+bool
+page_rec_is_second_last(
+/*====================*/
+ const rec_t* rec, /*!< in: record */
+ const page_t* page) /*!< in: page */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs);/*!< in: number of records */
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap if page is first reorganized.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs);/*!< in: number of records */
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+ ulint comp) /*!< in: nonzero=compact page format */
+ MY_ATTRIBUTE((const));
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list
+excluding the infimum and supremum records.
+@return data in bytes */
+UNIV_INLINE
+uint16_t
+page_get_data_size(
+/*===============*/
+ const page_t* page); /*!< in: index page */
+/** Read the PAGE_DIRECTION field from a byte.
+@param[in] ptr pointer to PAGE_DIRECTION_B
+@return the value of the PAGE_DIRECTION field */
+inline
+byte
+page_ptr_get_direction(const byte* ptr);
+
+/** Read the PAGE_DIRECTION field.
+@param[in] page index page
+@return the value of the PAGE_DIRECTION field */
+inline
+byte
+page_get_direction(const page_t* page)
+{
+ return page_ptr_get_direction(PAGE_HEADER + PAGE_DIRECTION_B + page);
+}
+
+/** Read the PAGE_INSTANT field.
+@param[in] page index page
+@return the value of the PAGE_INSTANT field */
+inline
+uint16_t
+page_get_instant(const page_t* page);
+
+/** Create an uncompressed index page.
+@param[in,out] block buffer block
+@param[in,out] mtr mini-transaction
+@param[in] comp set unless ROW_FORMAT=REDUNDANT */
+void page_create(buf_block_t *block, mtr_t *mtr, bool comp);
+/**********************************************************//**
+Create a compressed B-tree index page. */
+void
+page_create_zip(
+/*============*/
+ buf_block_t* block, /*!< in/out: a buffer frame
+ where the page is created */
+ dict_index_t* index, /*!< in: the index of the
+ page */
+ ulint level, /*!< in: the B-tree level of
+ the page */
+ trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */
+ mtr_t* mtr); /*!< in/out: mini-transaction
+ handle */
+/**********************************************************//**
+Empty a previously created B-tree index page. */
+void
+page_create_empty(
+/*==============*/
+ buf_block_t* block, /*!< in/out: B-tree block */
+ dict_index_t* index, /*!< in: the index of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull(1,2)));
+/*************************************************************//**
+Differs from page_copy_rec_list_end, because this function does not
+touch the lock table and max trx id on page or compress the page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
+void
+page_copy_rec_list_end_no_locks(
+/*============================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Copies records from page to new_page, from the given record onward,
+including that record. Infimum and supremum records are not copied.
+The records are copied to the start of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the original successor of the infimum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+rec_t*
+page_copy_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Copies records from page to new_page, up to the given record, NOT
+including that record. Infimum and supremum records are not copied.
+The records are copied to the end of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the original predecessor of the supremum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+rec_t*
+page_copy_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Deletes records from a page from a given record onward, including that record.
+The infimum and supremum records are not deleted. */
+void
+page_delete_rec_list_end(
+/*=====================*/
+ rec_t* rec, /*!< in: pointer to record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n_recs, /*!< in: number of records to delete,
+ or ULINT_UNDEFINED if not known */
+ ulint size, /*!< in: the sum of the sizes of the
+ records in the end of the chain to
+ delete, or ULINT_UNDEFINED if not known */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Deletes records from page, up to the given record, NOT including
+that record. Infimum and supremum records are not deleted. */
+void
+page_delete_rec_list_start(
+/*=======================*/
+ rec_t* rec, /*!< in: record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Moves record list end to another page. Moved records include
+split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return TRUE on success; FALSE on compression failure (new_block will
+be decompressed) */
+ibool
+page_move_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in: index page from where to move */
+ rec_t* split_rec, /*!< in: first record to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull(1, 2, 4, 5)));
+/*************************************************************//**
+Moves record list start to another page. Moved records do not include
+split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return TRUE on success; FALSE on compression failure */
+ibool
+page_move_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in/out: page containing split_rec */
+ rec_t* split_rec, /*!< in: first record not to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull(1, 2, 4, 5)));
+/** Create an index page.
+@param[in,out] block buffer block
+@param[in] comp nonzero=compact page format */
+void page_create_low(const buf_block_t* block, bool comp);
+
+/************************************************************//**
+Prints record contents including the data relevant only in
+the index page context. */
+void
+page_rec_print(
+/*===========*/
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets);/*!< in: record descriptor */
+# ifdef UNIV_BTR_PRINT
+/***************************************************************//**
+This is used to print the contents of the directory for
+debugging purposes. */
+void
+page_dir_print(
+/*===========*/
+ page_t* page, /*!< in: index page */
+ ulint pr_n); /*!< in: print n first and n last entries */
+/***************************************************************//**
+This is used to print the contents of the page record list for
+debugging purposes. */
+void
+page_print_list(
+/*============*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint pr_n); /*!< in: print n first and n last entries */
+/***************************************************************//**
+Prints the info in a page header. */
+void
+page_header_print(
+/*==============*/
+ const page_t* page); /*!< in: index page */
+/***************************************************************//**
+This is used to print the contents of the page for
+debugging purposes. */
+void
+page_print(
+/*=======*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint dn, /*!< in: print dn first and last entries
+ in directory */
+ ulint rn); /*!< in: print rn first and last records
+				in the record list */
+# endif /* UNIV_BTR_PRINT */
+/***************************************************************//**
+The following is used to validate a record on a page. This function
+differs from rec_validate as it can also check the n_owned field and
+the heap_no field.
+@return TRUE if ok */
+ibool
+page_rec_validate(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */
+#ifdef UNIV_DEBUG
+/***************************************************************//**
+Checks that the first directory slot points to the infimum record and
+the last to the supremum. This function is intended to track if the
+bug fixed in 4.0.14 has caused corruption to users' databases. */
+void
+page_check_dir(
+/*===========*/
+ const page_t* page); /*!< in: index page */
+#endif /* UNIV_DEBUG */
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+ibool
+page_simple_validate_old(
+/*=====================*/
+ const page_t* page); /*!< in: index page in ROW_FORMAT=REDUNDANT */
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+ibool
+page_simple_validate_new(
+/*=====================*/
+ const page_t* page); /*!< in: index page in ROW_FORMAT!=REDUNDANT */
+/** Check the consistency of an index page.
+@param[in] page index page
+@param[in] index B-tree or R-tree index
+@return whether the page is valid */
+bool page_validate(const page_t* page, const dict_index_t* index)
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Looks in the page record list for a record with the given heap number.
+@return record, NULL if not found */
+const rec_t*
+page_find_rec_with_heap_no(
+/*=======================*/
+ const page_t* page, /*!< in: index page */
+ ulint heap_no);/*!< in: heap number */
+/** Get the last non-delete-marked record on a page.
+@param[in] page index tree leaf page
+@return the last record, not delete-marked
+@retval infimum record if all records are delete-marked */
+const rec_t*
+page_find_rec_max_not_deleted(
+ const page_t* page);
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#include "page0page.ic"
+
+#endif
diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic
new file mode 100644
index 00000000..6514886d
--- /dev/null
+++ b/storage/innobase/include/page0page.ic
@@ -0,0 +1,724 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.ic
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0page_ic
+#define page0page_ic
+
+#ifndef UNIV_INNOCHECKSUM
+#include "rem0cmp.h"
+#include "mtr0log.h"
+#include "page0zip.h"
+
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(block);
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(trx_id);
+ ut_ad(page_is_leaf(buf_block_get_frame(block)));
+
+ if (page_get_max_trx_id(buf_block_get_frame(block)) < trx_id) {
+
+ page_set_max_trx_id(block, page_zip, trx_id, mtr);
+ }
+}
+
+/*************************************************************//**
+Returns the RTREE SPLIT SEQUENCE NUMBER (FIL_RTREE_SPLIT_SEQ_NUM).
+@return SPLIT SEQUENCE NUMBER */
+UNIV_INLINE
+node_seq_t
+page_get_ssn_id(
+/*============*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page);
+
+ return(static_cast<node_seq_t>(
+ mach_read_from_8(page + FIL_RTREE_SPLIT_SEQ_NUM)));
+}
+
+/*************************************************************//**
+Sets the RTREE SPLIT SEQUENCE NUMBER field value */
+UNIV_INLINE
+void
+page_set_ssn_id(
+/*============*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+	node_seq_t	ssn_id,	/*!< in: split sequence id */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX |
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!page_zip || page_zip == &block->page.zip);
+ constexpr uint16_t field= FIL_RTREE_SPLIT_SEQ_NUM;
+ byte *b= my_assume_aligned<2>(&block->frame[field]);
+ if (mtr->write<8,mtr_t::MAYBE_NOP>(*block, b, ssn_id) &&
+ UNIV_LIKELY_NULL(page_zip))
+ memcpy_aligned<2>(&page_zip->data[field], b, 8);
+}
+
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+uint16_t
+page_header_get_offs(
+/*=================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_FREE, ... */
+{
+ ut_ad((field == PAGE_FREE)
+ || (field == PAGE_LAST_INSERT)
+ || (field == PAGE_HEAP_TOP));
+
+ uint16_t offs = page_header_get_field(page, field);
+
+ ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+ return(offs);
+}
+
+
+/**
+Reset PAGE_LAST_INSERT.
+@param[in,out] block file page
+@param[in,out] mtr mini-transaction */
+inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
+{
+ constexpr uint16_t field= PAGE_HEADER + PAGE_LAST_INSERT;
+ byte *b= my_assume_aligned<2>(&block->frame[field]);
+ if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, 0U) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memset_aligned<2>(&block->page.zip.data[field], 0, 2);
+}
+
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+ const rec_t* rec) /*!< in: the physical record */
+{
+ if (page_rec_is_comp(rec)) {
+ return(rec_get_heap_no_new(rec));
+ } else {
+ return(rec_get_heap_no_old(rec));
+ }
+}
+
+/** Determine whether an index page record is a user record.
+@param[in] rec record in an index page
+@return true if a user record */
+inline
+bool
+page_rec_is_user_rec(const rec_t* rec)
+{
+ ut_ad(page_rec_check(rec));
+ return(page_rec_is_user_rec_low(page_offset(rec)));
+}
+
+/** Determine whether an index page record is the supremum record.
+@param[in] rec record in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum(const rec_t* rec)
+{
+ ut_ad(page_rec_check(rec));
+ return(page_rec_is_supremum_low(page_offset(rec)));
+}
+
+/** Determine whether an index page record is the infimum record.
+@param[in] rec record in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum(const rec_t* rec)
+{
+ ut_ad(page_rec_check(rec));
+ return(page_rec_is_infimum_low(page_offset(rec)));
+}
+
+/************************************************************//**
+true if the record is the first user record on a page.
+@return true if the first user record */
+UNIV_INLINE
+bool
+page_rec_is_first(
+/*==============*/
+ const rec_t* rec, /*!< in: record */
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page_get_n_recs(page) > 0);
+
+ return(page_rec_get_next_const(page_get_infimum_rec(page)) == rec);
+}
+
+/************************************************************//**
+true if the record is the second user record on a page.
+@return true if the second user record */
+UNIV_INLINE
+bool
+page_rec_is_second(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page_get_n_recs(page) > 1);
+
+ return(page_rec_get_next_const(
+ page_rec_get_next_const(page_get_infimum_rec(page))) == rec);
+}
+
+/************************************************************//**
+true if the record is the last user record on a page.
+@return true if the last user record */
+UNIV_INLINE
+bool
+page_rec_is_last(
+/*=============*/
+ const rec_t* rec, /*!< in: record */
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page_get_n_recs(page) > 0);
+
+ return(page_rec_get_next_const(rec) == page_get_supremum_rec(page));
+}
+
+/************************************************************//**
+true if the distance between the records (measured in the number of times
+we have to move to the next record) is at most the specified value */
+UNIV_INLINE
+bool
+page_rec_distance_is_at_most(
+/*=========================*/
+ const rec_t* left_rec,
+ const rec_t* right_rec,
+ ulint val)
+{
+ for (ulint i = 0; i <= val; i++) {
+ if (left_rec == right_rec) {
+ return (true);
+ }
+ left_rec = page_rec_get_next_const(left_rec);
+ }
+ return (false);
+}
+
+/************************************************************//**
+true if the record is the second last user record on a page.
+@return true if the second last user record */
+UNIV_INLINE
+bool
+page_rec_is_second_last(
+/*====================*/
+ const rec_t* rec, /*!< in: record */
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page_get_n_recs(page) > 1);
+ ut_ad(!page_rec_is_last(rec, page));
+
+ return(page_rec_get_next_const(
+ page_rec_get_next_const(rec)) == page_get_supremum_rec(page));
+}
+
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record */
+UNIV_INLINE
+rec_t*
+page_rec_get_nth(
+/*=============*/
+ page_t* page, /*!< in: page */
+ ulint nth) /*!< in: nth record */
+{
+ return((rec_t*) page_rec_get_nth_const(page, nth));
+}
+
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
+rec_t*
+page_get_middle_rec(
+/*================*/
+ page_t* page) /*!< in: page */
+{
+ ulint middle = (ulint(page_get_n_recs(page))
+ + PAGE_HEAP_NO_USER_LOW) / 2;
+
+ return(page_rec_get_nth(page, middle));
+}
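+
+/* Worked example (illustrative): with 4 user records, middle
+= (4 + PAGE_HEAP_NO_USER_LOW) / 2 = 3, and page_rec_get_nth(page, 3)
+returns the third user record, the first record of the upper half-list
+(the infimum counts as the 0th record in the list). */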
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+uint32_t
+page_get_page_no(
+/*=============*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page == page_align((page_t*) page));
+ return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_OFFSET));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+uint32_t
+page_get_space_id(
+/*==============*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page == page_align((page_t*) page));
+ return mach_read_from_4(my_assume_aligned<2>
+ (page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Gets the number of user records on page (infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_get_n_recs(
+/*============*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_RECS));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Gets the number of slots in the page directory.
+@return number of slots */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_slots(
+/*=================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_DIR_SLOTS));
+}
+
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of records in the record heap */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_heap(
+/*================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff);
+}
+
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE if succeed */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ const rec_t* rec) /*!< in: record */
+{
+ const page_t* page = page_align(rec);
+
+ ut_a(rec);
+
+ ut_a(page_offset(rec) <= page_header_get_field(page, PAGE_HEAP_TOP));
+ ut_a(page_offset(rec) >= PAGE_DATA);
+
+ return(TRUE);
+}
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ const page_dir_slot_t* slot) /*!< in: page directory slot */
+{
+ const rec_t* rec = page_dir_slot_get_rec(slot);
+ if (page_rec_is_comp(slot)) {
+ return(rec_get_n_owned_new(rec));
+ } else {
+ return(rec_get_n_owned_old(rec));
+ }
+}
+
+/************************************************************//**
+Calculates the space reserved for directory slots of a given number of
+records. The exact value is a fraction number n * PAGE_DIR_SLOT_SIZE /
+PAGE_DIR_SLOT_MIN_N_OWNED, and it is rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+ ulint n_recs) /*!< in: number of records */
+{
+ return((PAGE_DIR_SLOT_SIZE * n_recs + PAGE_DIR_SLOT_MIN_N_OWNED - 1)
+ / PAGE_DIR_SLOT_MIN_N_OWNED);
+}
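+
+/* Worked example (illustrative): for n_recs = 100 this reserves
+(2 * 100 + 4 - 1) / 4 = 50 bytes, i.e. ceil(n_recs * PAGE_DIR_SLOT_SIZE
+/ PAGE_DIR_SLOT_MIN_N_OWNED), the worst case where every slot owns only
+PAGE_DIR_SLOT_MIN_N_OWNED records. */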
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+ const rec_t* rec, /*!< in: pointer to record */
+ ulint comp) /*!< in: nonzero=compact page layout */
+{
+ ulint offs;
+ const page_t* page;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
+
+ offs = rec_get_next_offs(rec, comp);
+
+ if (offs >= srv_page_size) {
+ fprintf(stderr,
+ "InnoDB: Next record offset is nonsensical %lu"
+ " in record at offset %lu\n"
+ "InnoDB: rec address %p, space id %lu, page %lu\n",
+ (ulong) offs, (ulong) page_offset(rec),
+ (void*) rec,
+ (ulong) page_get_space_id(page),
+ (ulong) page_get_page_no(page));
+ ut_error;
+ } else if (offs == 0) {
+
+ return(NULL);
+ }
+
+ ut_ad(page_rec_is_infimum(rec)
+ || (!page_is_leaf(page) && !page_has_prev(page))
+ || !(rec_get_info_bits(page + offs, comp)
+ & REC_INFO_MIN_REC_FLAG));
+
+ return(page + offs);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+ rec_t* rec) /*!< in: pointer to record */
+{
+ return((rec_t*) page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+ const rec_t* rec) /*!< in: pointer to record */
+{
+ return(page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+
+/************************************************************//**
+Gets the pointer to the next non delete-marked record on the page.
+If all subsequent records are delete-marked, then this function
+will return the supremum record.
+@return pointer to next non delete-marked record or pointer to supremum */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_non_del_marked(
+/*=============================*/
+ const rec_t* rec) /*!< in: pointer to record */
+{
+ const rec_t* r;
+ ulint page_is_compact = page_rec_is_comp(rec);
+
+ for (r = page_rec_get_next_const(rec);
+ !page_rec_is_supremum(r)
+ && rec_get_deleted_flag(r, page_is_compact);
+ r = page_rec_get_next_const(r)) {
+ /* noop */
+ }
+
+ return(r);
+}
+
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+ const rec_t* rec) /*!< in: pointer to record, must not be page
+ infimum */
+{
+ const page_dir_slot_t* slot;
+ ulint slot_no;
+ const rec_t* rec2;
+ const rec_t* prev_rec = NULL;
+ const page_t* page;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
+
+ ut_ad(!page_rec_is_infimum(rec));
+
+ slot_no = page_dir_find_owner_slot(rec);
+
+ ut_a(slot_no != 0);
+
+ slot = page_dir_get_nth_slot(page, slot_no - 1);
+
+ rec2 = page_dir_slot_get_rec(slot);
+
+ if (page_is_comp(page)) {
+ while (rec != rec2) {
+ prev_rec = rec2;
+ rec2 = page_rec_get_next_low(rec2, TRUE);
+ }
+ } else {
+ while (rec != rec2) {
+ prev_rec = rec2;
+ rec2 = page_rec_get_next_low(rec2, FALSE);
+ }
+ }
+
+ ut_a(prev_rec);
+
+ return(prev_rec);
+}
+
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+rec_t*
+page_rec_get_prev(
+/*==============*/
+ rec_t* rec) /*!< in: pointer to record, must not be page
+ infimum */
+{
+ return((rec_t*) page_rec_get_prev_const(rec));
+}
+
+#endif /* UNIV_INNOCHECKSUM */
+
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list, excluding
+the infimum and supremum records.
+@return data in bytes */
+UNIV_INLINE
+uint16_t
+page_get_data_size(
+/*===============*/
+ const page_t* page) /*!< in: index page */
+{
+ unsigned ret = page_header_get_field(page, PAGE_HEAP_TOP)
+ - (page_is_comp(page)
+ ? PAGE_NEW_SUPREMUM_END
+ : PAGE_OLD_SUPREMUM_END)
+ - page_header_get_field(page, PAGE_GARBAGE);
+ ut_ad(ret < srv_page_size);
+ return static_cast<uint16_t>(ret);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+ ulint comp) /*!< in: nonzero=compact page layout */
+{
+ if (comp) {
+ return((ulint)(srv_page_size
+ - PAGE_NEW_SUPREMUM_END
+ - PAGE_DIR
+ - 2 * PAGE_DIR_SLOT_SIZE));
+ }
+
+ return((ulint)(srv_page_size
+ - PAGE_OLD_SUPREMUM_END
+ - PAGE_DIR
+ - 2 * PAGE_DIR_SLOT_SIZE));
+}
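+
+/* For example, with srv_page_size == 16384 and the usual constants
+(PAGE_NEW_SUPREMUM_END == 120, PAGE_DIR == 8, PAGE_DIR_SLOT_SIZE == 2),
+the compact case above yields 16384 - 120 - 8 - 4 = 16252 bytes. */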
+
+/************************************************************//**
+Each user record on a page, including the deleted user records in the heap,
+takes its own size plus a fraction of a directory slot, PAGE_DIR_SLOT_SIZE /
+PAGE_DIR_SLOT_MIN_N_OWNED bytes. If the sum of these exceeds the value of
+page_get_free_space_of_empty(), the insert is impossible; otherwise it is
+allowed. This function returns the maximum combined size of records which
+can be inserted on top of the record heap.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs) /*!< in: number of records */
+{
+ ulint occupied;
+ ulint free_space;
+
+ if (page_is_comp(page)) {
+ occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+ - PAGE_NEW_SUPREMUM_END
+ + page_dir_calc_reserved_space(
+ n_recs + page_dir_get_n_heap(page) - 2);
+
+ free_space = page_get_free_space_of_empty(TRUE);
+ } else {
+ occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+ - PAGE_OLD_SUPREMUM_END
+ + page_dir_calc_reserved_space(
+ n_recs + page_dir_get_n_heap(page) - 2);
+
+ free_space = page_get_free_space_of_empty(FALSE);
+ }
+
+	/* Above, the 'n_recs +' part reserves directory space for the
+	newly inserted records; the '- 2' excludes the page infimum and
+	supremum records */
+
+ if (occupied > free_space) {
+
+ return(0);
+ }
+
+ return(free_space - occupied);
+}
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of the record heap if a page is first reorganized.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs) /*!< in: number of records */
+{
+ ulint occupied;
+ ulint free_space;
+
+ occupied = page_get_data_size(page)
+ + page_dir_calc_reserved_space(n_recs + page_get_n_recs(page));
+
+ free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+ if (occupied > free_space) {
+
+ return(0);
+ }
+
+ return(free_space - occupied);
+}
+
+/** Read the PAGE_DIRECTION field from a byte.
+@param[in] ptr pointer to PAGE_DIRECTION_B
+@return the value of the PAGE_DIRECTION field */
+inline
+byte
+page_ptr_get_direction(const byte* ptr)
+{
+ ut_ad(page_offset(ptr) == PAGE_HEADER + PAGE_DIRECTION_B);
+ return *ptr & ((1U << 3) - 1);
+}
+
+/** Read the PAGE_INSTANT field.
+@param[in] page index page
+@return the value of the PAGE_INSTANT field */
+inline
+uint16_t
+page_get_instant(const page_t* page)
+{
+ uint16_t i = page_header_get_field(page, PAGE_INSTANT);
+#ifdef UNIV_DEBUG
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_TYPE_INSTANT:
+ ut_ad(page_get_direction(page) <= PAGE_NO_DIRECTION);
+ ut_ad(i >> 3);
+ break;
+ case FIL_PAGE_INDEX:
+ ut_ad(i <= PAGE_NO_DIRECTION || !page_is_comp(page));
+ break;
+ case FIL_PAGE_RTREE:
+ ut_ad(i <= PAGE_NO_DIRECTION);
+ break;
+ default:
+ ut_ad("invalid page type" == 0);
+ break;
+ }
+#endif /* UNIV_DEBUG */
+ return static_cast<uint16_t>(i >> 3); /* i / 8 */
+}
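+
+/* The 16-bit PAGE_INSTANT field read above packs the instant
+ALTER TABLE metadata (a field count) in its 13 most significant bits
+and the PAGE_DIRECTION bits in the 3 least significant ones; for
+example (illustrative), the value (4 << 3) | PAGE_NO_DIRECTION would
+denote page_get_instant() == 4 with no dominant insert direction. */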
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h
new file mode 100644
index 00000000..6c5a681f
--- /dev/null
+++ b/storage/innobase/include/page0types.h
@@ -0,0 +1,161 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0types.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0types_h
+#define page0types_h
+
+#include "dict0types.h"
+#include "mtr0types.h"
+#include "rem0types.h"
+
+#include <map>
+
+/** Eliminates a name collision on HP-UX */
+#define page_t ib_page_t
+/** Type of the index page */
+typedef byte page_t;
+#ifndef UNIV_INNOCHECKSUM
+/** Index page cursor */
+struct page_cur_t;
+/** Buffer pool block */
+struct buf_block_t;
+
+/** Compressed index page */
+typedef byte page_zip_t;
+
+/* The following definitions would better belong to page0zip.h,
+but we cannot include page0zip.h from rem0rec.ic, because
+page0*.h includes rem0rec.h and may include rem0rec.ic. */
+
+/** Number of bits needed for representing different compressed page sizes */
+#define PAGE_ZIP_SSIZE_BITS 3
+
+/** Maximum compressed page shift size */
+#define PAGE_ZIP_SSIZE_MAX \
+ (UNIV_ZIP_SIZE_SHIFT_MAX - UNIV_ZIP_SIZE_SHIFT_MIN + 1)
+
+/* Make sure there are enough bits available to store the maximum zip
+ssize, which is the number of shifts from 512. */
+#if PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)
+# error "PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)"
+#endif
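+
+/* For example, with UNIV_ZIP_SIZE_SHIFT_MIN == 10 and
+UNIV_ZIP_SIZE_SHIFT_MAX == 14 (1 KiB to 16 KiB compressed page sizes),
+PAGE_ZIP_SSIZE_MAX is 14 - 10 + 1 = 5, well below 1 << 3 == 8. */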
+
+/* Page cursor search modes; the values must be in this order! */
+enum page_cur_mode_t {
+ PAGE_CUR_UNSUPP = 0,
+ PAGE_CUR_G = 1,
+ PAGE_CUR_GE = 2,
+ PAGE_CUR_L = 3,
+ PAGE_CUR_LE = 4,
+
+/* PAGE_CUR_LE_OR_EXTENDS = 5,*/ /* This is a search mode used in
+ "column LIKE 'abc%' ORDER BY column DESC";
+ we have to find strings which are <= 'abc' or
+ which extend it */
+
+/* These search modes are for searching an R-tree index. */
+ PAGE_CUR_CONTAIN = 7,
+ PAGE_CUR_INTERSECT = 8,
+ PAGE_CUR_WITHIN = 9,
+ PAGE_CUR_DISJOINT = 10,
+ PAGE_CUR_MBR_EQUAL = 11,
+ PAGE_CUR_RTREE_INSERT = 12,
+ PAGE_CUR_RTREE_LOCATE = 13,
+ PAGE_CUR_RTREE_GET_FATHER = 14
+};
+
+/** Compressed page descriptor */
+struct page_zip_des_t
+{
+ page_zip_t* data; /*!< compressed page data */
+
+#ifdef UNIV_DEBUG
+ unsigned m_start:16; /*!< start offset of modification log */
+ bool m_external; /*!< Allocated externally, not from the
+ buffer pool */
+#endif /* UNIV_DEBUG */
+ unsigned m_end:16; /*!< end offset of modification log */
+ unsigned m_nonempty:1; /*!< TRUE if the modification log
+ is not empty */
+ unsigned n_blobs:12; /*!< number of externally stored
+ columns on the page; the maximum
+ is 744 on a 16 KiB page */
+ unsigned ssize:PAGE_ZIP_SSIZE_BITS;
+ /*!< 0 or compressed page shift size;
+ the size in bytes is
+ (UNIV_ZIP_SIZE_MIN >> 1) << ssize. */
+};
+
+/** Compression statistics for a given page size */
+struct page_zip_stat_t {
+ /** Number of page compressions */
+ ulint compressed;
+ /** Number of successful page compressions */
+ ulint compressed_ok;
+ /** Number of page decompressions */
+ ulint decompressed;
+ /** Duration of page compressions in microseconds */
+ ib_uint64_t compressed_usec;
+ /** Duration of page decompressions in microseconds */
+ ib_uint64_t decompressed_usec;
+ page_zip_stat_t() :
+		/* Initialize members to 0 so that when we do
+		stlmap[key].compressed++ and an element with "key" does
+		not exist, it gets inserted with zeroed members. */
+ compressed(0),
+ compressed_ok(0),
+ decompressed(0),
+ compressed_usec(0),
+ decompressed_usec(0)
+ { }
+};
+
+/** Compression statistics types */
+typedef std::map<
+ index_id_t,
+ page_zip_stat_t,
+ std::less<index_id_t>,
+ ut_allocator<std::pair<const index_id_t, page_zip_stat_t> > >
+ page_zip_stat_per_index_t;
+
+/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
+extern page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX];
+/** Statistics on compression, indexed by dict_index_t::id */
+extern page_zip_stat_per_index_t page_zip_stat_per_index;
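+
+/* Illustrative usage sketch: thanks to the zero-initializing default
+constructor of page_zip_stat_t, callers holding the (elsewhere
+declared) page_zip_stat_per_index_mutex can simply write
+
+	page_zip_stat_per_index[index_id].compressed++;
+
+and a previously unseen index_id is inserted with all counters zero. */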
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+void
+page_zip_rec_set_owned(
+/*===================*/
+ buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag, /*!< in: the owned flag (nonzero=TRUE) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+#endif /* !UNIV_INNOCHECKSUM */
+#endif
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
new file mode 100644
index 00000000..5a70e995
--- /dev/null
+++ b/storage/innobase/include/page0zip.h
@@ -0,0 +1,392 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.h
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#ifndef page0zip_h
+#define page0zip_h
+
+#include "buf0types.h"
+
+#ifndef UNIV_INNOCHECKSUM
+#include "mtr0types.h"
+#include "page0types.h"
+#include "dict0types.h"
+#include "srv0srv.h"
+#include "trx0types.h"
+#include "mem0mem.h"
+
+/* Compression level to be used by zlib. Settable by user. */
+extern uint page_zip_level;
+
+/* Default compression level. */
+#define DEFAULT_COMPRESSION_LEVEL 6
+/** Start offset of the area that will be compressed */
+#define PAGE_ZIP_START PAGE_NEW_SUPREMUM_END
+/** Size of a compressed page directory entry */
+#define PAGE_ZIP_DIR_SLOT_SIZE 2
+/** Predefine the sum of DIR_SLOT, TRX_ID & ROLL_PTR */
+#define PAGE_ZIP_CLUST_LEAF_SLOT_SIZE \
+ (PAGE_ZIP_DIR_SLOT_SIZE \
+ + DATA_TRX_ID_LEN \
+ + DATA_ROLL_PTR_LEN)
+/** Mask of record offsets */
+#define PAGE_ZIP_DIR_SLOT_MASK 0x3fffU
+/** 'owned' flag */
+#define PAGE_ZIP_DIR_SLOT_OWNED 0x4000U
+/** 'deleted' flag */
+#define PAGE_ZIP_DIR_SLOT_DEL 0x8000U
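+
+/* A minimal sketch, for illustration only, of how a dense page
+directory entry decomposes under the three masks above: the low
+14 bits are the record offset, the top two bits the flags. */
+UNIV_INLINE
+void
+page_zip_dir_entry_decode_sketch(
+/*=============================*/
+	ulint	entry,	/*!< in: dense page directory entry */
+	ulint*	offs,	/*!< out: byte offset of the record */
+	bool*	owned,	/*!< out: whether the record ends a slot chain */
+	bool*	del)	/*!< out: delete-mark flag */
+{
+	*offs = entry & PAGE_ZIP_DIR_SLOT_MASK;
+	*owned = (entry & PAGE_ZIP_DIR_SLOT_OWNED) != 0;
+	*del = (entry & PAGE_ZIP_DIR_SLOT_DEL) != 0;
+}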
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint size); /*!< in: size in bytes */
+
+/** Determine if a record is so big that it needs to be stored externally.
+@param[in] rec_size length of the record in bytes
+@param[in] comp nonzero=compact format
+@param[in] n_fields number of fields in the record; ignored if
+tablespace is not compressed
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return false if the entire record can be stored locally on the page */
+inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields,
+ ulint zip_size)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return minimum payload size on the page */
+ulint
+page_zip_empty_size(
+/*================*/
+ ulint n_fields, /*!< in: number of columns in the index */
+ ulint zip_size) /*!< in: compressed page size in bytes */
+ MY_ATTRIBUTE((const));
+
+/** Check whether a tuple is too big for a compressed table
+@param[in] index dict index object
+@param[in] entry entry for the index
+@return true if it's too big, otherwise false */
+bool
+page_zip_is_too_big(
+ const dict_index_t* index,
+ const dtuple_t* entry);
+
+/**********************************************************************//**
+Initialize a compressed page descriptor. */
+UNIV_INLINE
+void
+page_zip_des_init(
+/*==============*/
+ page_zip_des_t* page_zip); /*!< in/out: compressed page
+ descriptor */
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+void
+page_zip_set_alloc(
+/*===============*/
+ void* stream, /*!< in/out: zlib stream */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/** Attempt to compress a ROW_FORMAT=COMPRESSED page.
+@retval true on success
+@retval false on failure; block->page.zip will be left intact. */
+bool
+page_zip_compress(
+ buf_block_t* block, /*!< in/out: buffer block */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+	ulint		level,	/*!< in: compression level */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write the index information for the compressed page.
+@return used size of buf */
+ulint
+page_zip_fields_encode(
+/*===================*/
+ ulint n, /*!< in: number of fields
+ to compress */
+ const dict_index_t* index, /*!< in: index comprising
+ at least n fields */
+ ulint trx_id_pos,
+ /*!< in: position of the trx_id column
+ in the index, or ULINT_UNDEFINED if
+ this is a non-leaf page */
+ byte* buf); /*!< out: buffer of (n + 1) * 2 bytes */
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+ibool
+page_zip_decompress(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page, /*!< out: uncompressed page, may be trashed */
+ ibool all) /*!< in: TRUE=decompress the whole page;
+ FALSE=verify but do not copy some
+ page header fields that should not change
+ after page creation */
+ MY_ATTRIBUTE((nonnull(1,2)));
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+ const page_zip_des_t* page_zip); /*!< in: compressed page
+ descriptor */
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+ibool
+page_zip_validate_low(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index, /*!< in: index of the page, if known */
+ ibool sloppy) /*!< in: FALSE=strict,
+ TRUE=ignore the MIN_REC_FLAG */
+ MY_ATTRIBUTE((nonnull(1,2)));
+/**********************************************************************//**
+Check that the compressed and decompressed pages match. */
+ibool
+page_zip_validate(
+/*==============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index) /*!< in: index of the page, if known */
+ MY_ATTRIBUTE((nonnull(1,2)));
+#endif /* UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if page_zip_write_rec() will succeed */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust,/*!< in: TRUE if clustered index */
+ ulint length, /*!< in: combined size of the record */
+ ulint create) /*!< in: nonzero=add the record to
+ the heap */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Write an entire record to the ROW_FORMAT=COMPRESSED page.
+The data must already have been written to the uncompressed page.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in] rec record in the uncompressed page
+@param[in] index the index that the page belongs to
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] create nonzero=insert, zero=update
+@param[in,out] mtr mini-transaction */
+void page_zip_write_rec(buf_block_t *block, const byte *rec,
+ const dict_index_t *index, const rec_offs *offsets,
+ ulint create, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+void
+page_zip_write_blob_ptr(
+/*====================*/
+ buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */
+ const byte* rec, /*!< in/out: record whose data is being
+ written */
+ dict_index_t* index, /*!< in: index of the page */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint n, /*!< in: column index */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+void
+page_zip_write_node_ptr(
+/*====================*/
+ buf_block_t* block, /*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ ulint size, /*!< in: data size of rec */
+ ulint ptr, /*!< in: node pointer */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull));
+
+/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in,out] rec record
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in]	trx_id_col	field number of DB_TRX_ID (number of PK fields)
+@param[in] trx_id DB_TRX_ID value (transaction identifier)
+@param[in] roll_ptr DB_ROLL_PTR value (undo log pointer)
+@param[in,out] mtr mini-transaction */
+void
+page_zip_write_trx_id_and_roll_ptr(
+ buf_block_t* block,
+ byte* rec,
+ const rec_offs* offsets,
+ ulint trx_id_col,
+ trx_id_t trx_id,
+ roll_ptr_t roll_ptr,
+ mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
+@param[in,out] block buffer block
+@param[in,out] rec record on a physical index page
+@param[in] flag the value of the delete-mark flag
+@param[in,out] mtr mini-transaction */
+void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
+ mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Insert a record to the dense page directory. */
+void
+page_zip_dir_insert(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ uint16_t free_rec,/*!< in: record from which rec was
+ allocated, or 0 */
+ byte* rec, /*!< in: record to insert */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ MY_ATTRIBUTE((nonnull(1,3,4)));
+
+/** Shift the dense page directory and the array of BLOB pointers
+when a record is deleted.
+@param[in,out] block index page
+@param[in,out] rec record being deleted
+@param[in] index the index that the page belongs to
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] free previous start of the free list
+@param[in,out] mtr mini-transaction */
+void page_zip_dir_delete(buf_block_t *block, byte *rec,
+ const dict_index_t *index, const rec_offs *offsets,
+ const byte *free, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull(1,2,3,4,6)));
+
+/**********************************************************************//**
+Reorganize and compress a page. This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, redo log will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@retval true on success
+@retval false on failure; the compressed page will be left intact */
+bool
+page_zip_reorganize(
+ buf_block_t* block, /*!< in/out: page with compressed page;
+ on the compressed page, in: size;
+ out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ ulint z_level,/*!< in: compression level */
+ mtr_t* mtr, /*!< in: mini-transaction */
+ bool restore = false)/*!< whether to restore on failure */
+ MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Copy the records of a page byte for byte. Do not copy the page header
+or trailer, except those B-tree header fields that are directly
+related to the storage of records. Also copy PAGE_MAX_TRX_ID.
+NOTE: The caller must update the lock table and the adaptive hash index. */
+void
+page_zip_copy_recs(
+ buf_block_t* block, /*!< in/out: buffer block */
+ const page_zip_des_t* src_zip, /*!< in: compressed page */
+ const page_t* src, /*!< in: page */
+ dict_index_t* index, /*!< in: index of the B-tree */
+ mtr_t* mtr); /*!< in: mini-transaction */
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Calculate the compressed page checksum.
+@param[in] data compressed page
+@param[in] size size of compressed page
+@param[in] algo algorithm to use
+@return page checksum */
+uint32_t
+page_zip_calc_checksum(
+ const void* data,
+ ulint size,
+ srv_checksum_algorithm_t algo);
+
+/** Validate the checksum on a ROW_FORMAT=COMPRESSED page.
+@param data ROW_FORMAT=COMPRESSED page
+@param size size of the page, in bytes
+@return whether the stored checksum matches innodb_checksum_algorithm */
+bool page_zip_verify_checksum(const byte *data, size_t size);
+
+#ifndef UNIV_INNOCHECKSUM
+/**********************************************************************//**
+Reset the counters used for filling
+INFORMATION_SCHEMA.innodb_cmp_per_index. */
+UNIV_INLINE
+void
+page_zip_reset_stat_per_index();
+/*===========================*/
+
+#include "page0zip.ic"
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif /* page0zip_h */
diff --git a/storage/innobase/include/page0zip.ic b/storage/innobase/include/page0zip.ic
new file mode 100644
index 00000000..ede61283
--- /dev/null
+++ b/storage/innobase/include/page0zip.ic
@@ -0,0 +1,334 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.ic
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#include "page0page.h"
+
+/* The format of compressed pages is as follows.
+
+The header and trailer of the uncompressed pages, excluding the page
+directory in the trailer, are copied as is to the header and trailer
+of the compressed page.
+
+At the end of the compressed page, there is a dense page directory
+pointing to every user record contained on the page, including deleted
+records on the free list. The dense directory is indexed in the
+collation order, i.e., in the order in which the record list is
+linked on the uncompressed page. The infimum and supremum records are
+excluded. The two most significant bits of the entries are allocated
+for the delete-mark and an n_owned flag indicating the last record in
+a chain of records pointed to from the sparse page directory on the
+uncompressed page.
+
+The data between PAGE_ZIP_START and the last page directory entry will
+be written in compressed format, starting at offset PAGE_DATA.
+Infimum and supremum records are not stored. We exclude the
+REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered
+from the dense page directory stored at the end of the compressed
+page.
+
+The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and
+roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of
+externally stored columns are stored separately, in ascending order of
+heap_no and column index, starting backwards from the dense page
+directory.
+
+The compressed data stream may be followed by a modification log
+covering the compressed portion of the page, as follows.
+
+MODIFICATION LOG ENTRY FORMAT
+- write record:
+ - (heap_no - 1) << 1 (1..2 bytes)
+ - extra bytes backwards
+ - data bytes
+- clear record:
+ - (heap_no - 1) << 1 | 1 (1..2 bytes)
+
+The integer values are stored in a variable-length format:
+- 0xxxxxxx: 0..127
+- 1xxxxxxx xxxxxxxx: 0..32767
+
+The end of the modification log is marked by a 0 byte.
+
+In summary, the compressed page looks like this:
+
+(1) Uncompressed page header (PAGE_DATA bytes)
+(2) Compressed index information
+(3) Compressed page data
+(4) Page modification log (page_zip->m_start..page_zip->m_end)
+(5) Empty zero-filled space
+(6) BLOB pointers (on leaf pages)
+ - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column
+ - in descending collation order
+(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes,
+ - indexed by heap_no
+ - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes
+ - REC_NODE_PTR_SIZE for non-leaf pages
+ - 0 otherwise
+(8) dense page directory, stored backwards
+ - n_dense = n_heap - 2
+ - existing records in ascending collation order
+ - deleted records (free list) in link order
+*/
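+
+/* A minimal sketch, for illustration only, of decoding the 1..2-byte
+variable-length integers described above; the real decoding is done by
+the modification log parser (page_zip_apply_log() in page0zip.cc). A
+set high bit in the first byte selects the two-byte, big-endian form. */
+UNIV_INLINE
+ulint
+page_zip_varint_decode_sketch(
+/*==========================*/
+	const byte**	ptr)	/*!< in/out: read position; advanced
+				past the decoded integer */
+{
+	ulint	val = *(*ptr)++;
+
+	if (val & 0x80) {
+		/* 1xxxxxxx xxxxxxxx: 0..32767 */
+		val = (val & 0x7f) << 8 | *(*ptr)++;
+	}
+
+	return(val);
+}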
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ ulint size;
+
+ if (!page_zip->ssize) {
+ return(0);
+ }
+
+ size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize;
+
+ ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size <= srv_page_size);
+
+ return(size);
+}
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint size) /*!< in: size in bytes */
+{
+ if (size) {
+ unsigned ssize;
+
+ ut_ad(ut_is_2pow(size));
+
+ for (ssize = 1; size > (512U << ssize); ssize++) {
+ }
+
+ page_zip->ssize = ssize & ((1U << PAGE_ZIP_SSIZE_BITS) - 1);
+ } else {
+ page_zip->ssize = 0;
+ }
+
+ ut_ad(page_zip_get_size(page_zip) == size);
+}
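+
+/* Worked example for the two functions above, assuming
+UNIV_ZIP_SIZE_MIN == 1024: size == 512 << ssize, so ssize values
+1, 2, 3, 4, 5 encode 1024, 2048, 4096, 8192 and 16384 bytes, and
+page_zip_set_size(page_zip, 8192) stores ssize == 4. */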
+
+/** Determine if a record is so big that it needs to be stored externally.
+@param[in] rec_size length of the record in bytes
+@param[in] comp nonzero=compact format
+@param[in] n_fields number of fields in the record; ignored if
+tablespace is not compressed
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return false if the entire record can be stored locally on the page */
+inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields,
+ ulint zip_size)
+{
+	/* FIXME: the row size check in this function seems to be the most
+	correct one. Put it in a separate function and use it in more
+	places of InnoDB */
+
+ ut_ad(rec_size
+ > ulint(comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES));
+ ut_ad(comp || !zip_size);
+
+#if UNIV_PAGE_SIZE_MAX > COMPRESSED_REC_MAX_DATA_SIZE
+ if (comp ? rec_size >= COMPRESSED_REC_MAX_DATA_SIZE :
+ rec_size >= REDUNDANT_REC_MAX_DATA_SIZE) {
+		return(true);
+ }
+#endif
+
+ if (zip_size) {
+ ut_ad(comp);
+ /* On a compressed page, there is a two-byte entry in
+ the dense page directory for every record. But there
+ is no record header. There should be enough room for
+ one record on an empty leaf page. Subtract 1 byte for
+ the encoded heap number. Check also the available space
+ on the uncompressed page. */
+ return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1)
+ >= page_zip_empty_size(n_fields, zip_size)
+ || rec_size >= page_get_free_space_of_empty(TRUE) / 2);
+ }
+
+ return(rec_size >= page_get_free_space_of_empty(comp) / 2);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+ const page_zip_des_t* page_zip)/*!< in: compressed page descriptor */
+{
+ ut_ad(page_zip);
+ ut_ad(page_zip->data);
+ ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX);
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);
+ ut_ad(page_zip->m_start <= page_zip->m_end);
+ ut_ad(page_zip->m_end < page_zip_get_size(page_zip));
+ ut_ad(page_zip->n_blobs
+ < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE);
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Determine the length of the page trailer.
+@return length of the page trailer, in bytes, not including the
+terminating zero byte of the modification log */
+UNIV_INLINE
+ulint
+page_zip_get_trailer_len(
+/*=====================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+{
+ ulint uncompressed_size;
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ if (!page_is_leaf(page_zip->data)) {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + REC_NODE_PTR_SIZE;
+ ut_ad(!page_zip->n_blobs);
+ } else if (is_clust) {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ } else {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE;
+ ut_ad(!page_zip->n_blobs);
+ }
+
+ return (ulint(page_dir_get_n_heap(page_zip->data)) - 2)
+ * uncompressed_size
+ + ulint(page_zip->n_blobs) * BTR_EXTERN_FIELD_REF_SIZE;
+}
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+{
+ ulint trailer_len;
+
+ trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
+
+ /* When a record is created, a pointer may be added to
+ the dense directory.
+ Likewise, space for the columns that will not be
+ compressed will be allocated from the page trailer.
+ Also the BLOB pointers will be allocated from there, but
+ we may as well count them in the length of the record. */
+
+ trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
+
+ return(lint(page_zip_get_size(page_zip)
+ - trailer_len - page_zip->m_end
+ - (REC_N_NEW_EXTRA_BYTES - 2)));
+}
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if enough space is available */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust,/*!< in: TRUE if clustered index */
+ ulint length, /*!< in: combined size of the record */
+ ulint create) /*!< in: nonzero=add the record to
+ the heap */
+{
+ ulint trailer_len;
+
+ ut_ad(length > REC_N_NEW_EXTRA_BYTES);
+
+ trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
+
+ /* Subtract the fixed extra bytes and add the maximum
+ space needed for identifying the record (encoded heap_no). */
+ length -= REC_N_NEW_EXTRA_BYTES - 2;
+
+ if (create > 0) {
+ /* When a record is created, a pointer may be added to
+ the dense directory.
+ Likewise, space for the columns that will not be
+ compressed will be allocated from the page trailer.
+ Also the BLOB pointers will be allocated from there, but
+ we may as well count them in the length of the record. */
+
+ trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+
+ return(length + trailer_len + page_zip->m_end
+ < page_zip_get_size(page_zip));
+}
+
+/**********************************************************************//**
+Initialize a compressed page descriptor. */
+UNIV_INLINE
+void
+page_zip_des_init(
+/*==============*/
+ page_zip_des_t* page_zip) /*!< in/out: compressed page
+ descriptor */
+{
+ memset(page_zip, 0, sizeof *page_zip);
+}
+
+/**********************************************************************//**
+Reset the counters used for filling
+INFORMATION_SCHEMA.innodb_cmp_per_index. */
+UNIV_INLINE
+void
+page_zip_reset_stat_per_index()
+/*===========================*/
+{
+ mutex_enter(&page_zip_stat_per_index_mutex);
+
+ page_zip_stat_per_index.erase(
+ page_zip_stat_per_index.begin(),
+ page_zip_stat_per_index.end());
+
+ mutex_exit(&page_zip_stat_per_index_mutex);
+}
diff --git a/storage/innobase/include/pars0grm.h b/storage/innobase/include/pars0grm.h
new file mode 100644
index 00000000..58d424ab
--- /dev/null
+++ b/storage/innobase/include/pars0grm.h
@@ -0,0 +1,145 @@
+/* A Bison parser, made by GNU Bison 3.4.2. */
+
+/* Bison interface for Yacc-like parsers in C
+
+ Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2019 Free Software Foundation,
+ Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* As a special exception, you may create a larger work that contains
+ part or all of the Bison parser skeleton and distribute that work
+ under terms of your choice, so long as that work isn't itself a
+ parser generator using the skeleton or a modified version thereof
+ as a parser skeleton. Alternatively, if you modify or redistribute
+ the parser skeleton itself, you may (at your option) remove this
+ special exception, which will cause the skeleton and the resulting
+ Bison output files to be licensed under the GNU General Public
+ License without this special exception.
+
+ This special exception was added by the Free Software Foundation in
+ version 2.2 of Bison. */
+
+/* Undocumented macros, especially those whose name start with YY_,
+ are private implementation details. Do not rely on them. */
+
+#ifndef YY_YY_PARS0GRM_TAB_H_INCLUDED
+# define YY_YY_PARS0GRM_TAB_H_INCLUDED
+/* Debug traces. */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+#if YYDEBUG
+extern int yydebug;
+#endif
+
+/* Token type. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+ enum yytokentype
+ {
+ PARS_INT_LIT = 258,
+ PARS_FLOAT_LIT = 259,
+ PARS_STR_LIT = 260,
+ PARS_NULL_LIT = 261,
+ PARS_ID_TOKEN = 262,
+ PARS_AND_TOKEN = 263,
+ PARS_OR_TOKEN = 264,
+ PARS_NOT_TOKEN = 265,
+ PARS_GE_TOKEN = 266,
+ PARS_LE_TOKEN = 267,
+ PARS_NE_TOKEN = 268,
+ PARS_PROCEDURE_TOKEN = 269,
+ PARS_IN_TOKEN = 270,
+ PARS_INT_TOKEN = 271,
+ PARS_CHAR_TOKEN = 272,
+ PARS_IS_TOKEN = 273,
+ PARS_BEGIN_TOKEN = 274,
+ PARS_END_TOKEN = 275,
+ PARS_IF_TOKEN = 276,
+ PARS_THEN_TOKEN = 277,
+ PARS_ELSE_TOKEN = 278,
+ PARS_ELSIF_TOKEN = 279,
+ PARS_LOOP_TOKEN = 280,
+ PARS_WHILE_TOKEN = 281,
+ PARS_RETURN_TOKEN = 282,
+ PARS_SELECT_TOKEN = 283,
+ PARS_COUNT_TOKEN = 284,
+ PARS_FROM_TOKEN = 285,
+ PARS_WHERE_TOKEN = 286,
+ PARS_FOR_TOKEN = 287,
+ PARS_DDOT_TOKEN = 288,
+ PARS_ORDER_TOKEN = 289,
+ PARS_BY_TOKEN = 290,
+ PARS_ASC_TOKEN = 291,
+ PARS_DESC_TOKEN = 292,
+ PARS_INSERT_TOKEN = 293,
+ PARS_INTO_TOKEN = 294,
+ PARS_VALUES_TOKEN = 295,
+ PARS_UPDATE_TOKEN = 296,
+ PARS_SET_TOKEN = 297,
+ PARS_DELETE_TOKEN = 298,
+ PARS_CURRENT_TOKEN = 299,
+ PARS_OF_TOKEN = 300,
+ PARS_CREATE_TOKEN = 301,
+ PARS_TABLE_TOKEN = 302,
+ PARS_INDEX_TOKEN = 303,
+ PARS_UNIQUE_TOKEN = 304,
+ PARS_CLUSTERED_TOKEN = 305,
+ PARS_ON_TOKEN = 306,
+ PARS_ASSIGN_TOKEN = 307,
+ PARS_DECLARE_TOKEN = 308,
+ PARS_CURSOR_TOKEN = 309,
+ PARS_SQL_TOKEN = 310,
+ PARS_OPEN_TOKEN = 311,
+ PARS_FETCH_TOKEN = 312,
+ PARS_CLOSE_TOKEN = 313,
+ PARS_NOTFOUND_TOKEN = 314,
+ PARS_TO_BINARY_TOKEN = 315,
+ PARS_SUBSTR_TOKEN = 316,
+ PARS_CONCAT_TOKEN = 317,
+ PARS_INSTR_TOKEN = 318,
+ PARS_LENGTH_TOKEN = 319,
+ PARS_COMMIT_TOKEN = 320,
+ PARS_ROLLBACK_TOKEN = 321,
+ PARS_WORK_TOKEN = 322,
+ PARS_EXIT_TOKEN = 323,
+ PARS_FUNCTION_TOKEN = 324,
+ PARS_LOCK_TOKEN = 325,
+ PARS_SHARE_TOKEN = 326,
+ PARS_MODE_TOKEN = 327,
+ PARS_LIKE_TOKEN = 328,
+ PARS_LIKE_TOKEN_EXACT = 329,
+ PARS_LIKE_TOKEN_PREFIX = 330,
+ PARS_LIKE_TOKEN_SUFFIX = 331,
+ PARS_LIKE_TOKEN_SUBSTR = 332,
+ PARS_TABLE_NAME_TOKEN = 333,
+ PARS_BIGINT_TOKEN = 334,
+ NEG = 335
+ };
+#endif
+
+/* Value type. */
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef int YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+extern YYSTYPE yylval;
+
+int yyparse (void);
+
+#endif /* !YY_YY_PARS0GRM_TAB_H_INCLUDED */
diff --git a/storage/innobase/include/pars0opt.h b/storage/innobase/include/pars0opt.h
new file mode 100644
index 00000000..07a726ea
--- /dev/null
+++ b/storage/innobase/include/pars0opt.h
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2018, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.h
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0opt_h
+#define pars0opt_h
+
+#include "que0types.h"
+#include "pars0sym.h"
+#include "row0sel.h"
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes to use for the tables. The tables
+are accessed in the order that they were written to the FROM part of the
+select statement. */
+void
+opt_search_plan(
+/*============*/
+ sel_node_t* sel_node); /*!< in: parsed select node */
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the column list if an occurrence of the same column does not
+already exist in the list. If the column is already in the list, sets up a
+value indirection pointing at the occurrence in the column list; however, if
+the occurrence being examined is itself in the column list, nothing is
+done. */
+void
+opt_find_all_cols(
+/*==============*/
+ ibool copy_val, /*!< in: if TRUE, new found columns are
+ added as columns to copy */
+ dict_index_t* index, /*!< in: index to use */
+ sym_node_list_t* col_list, /*!< in: base node of a list where
+ to add new found columns */
+ plan_t* plan, /*!< in: plan or NULL */
+ que_node_t* exp); /*!< in: expression or condition */
+#ifdef UNIV_SQL_DEBUG
+/********************************************************************//**
+Prints info of a query plan. */
+void
+opt_print_query_plan(
+/*=================*/
+ sel_node_t* sel_node); /*!< in: select node */
+#endif /* UNIV_SQL_DEBUG */
+
+#endif
diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h
new file mode 100644
index 00000000..03aa72d3
--- /dev/null
+++ b/storage/innobase/include/pars0pars.h
@@ -0,0 +1,724 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0pars.h
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0pars_h
+#define pars0pars_h
+
+#include "que0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+#include "row0mysql.h"
+
+/** Type of the user functions. The first argument is always InnoDB-supplied
+and varies in type, while 'user_arg' is a user-supplied argument. The
+meaning of the return type also varies. See the individual use cases, e.g.
+the FETCH statement, for details on them. */
+typedef ibool (*pars_user_func_cb_t)(void* arg, void* user_arg);
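+
+/* Illustrative sketch (hypothetical, not part of this interface) of a
+FETCH user callback matching pars_user_func_cb_t; in the FETCH use
+case, 'arg' is the sel_node_t* of the fetched row and 'user_arg' is
+the argument registered together with the function name:
+
+	static ibool my_fetch_step(void* arg, void* user_arg)
+	{
+		sel_node_t*	node = static_cast<sel_node_t*>(arg);
+		...		// read the fetched columns from node
+		return(TRUE);	// nonzero: continue fetching rows
+	}
+*/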
+
+/** If the following is set TRUE, the parser will emit debugging
+information */
+extern int yydebug;
+
+/* Global variable used while parsing a single procedure or query: the code
+is NOT re-entrant */
+extern sym_tab_t* pars_sym_tab_global;
+
+extern pars_res_word_t pars_to_binary_token;
+extern pars_res_word_t pars_substr_token;
+extern pars_res_word_t pars_concat_token;
+extern pars_res_word_t pars_length_token;
+extern pars_res_word_t pars_instr_token;
+extern pars_res_word_t pars_count_token;
+extern pars_res_word_t pars_int_token;
+extern pars_res_word_t pars_bigint_token;
+extern pars_res_word_t pars_char_token;
+extern pars_res_word_t pars_update_token;
+extern pars_res_word_t pars_asc_token;
+extern pars_res_word_t pars_desc_token;
+extern pars_res_word_t pars_open_token;
+extern pars_res_word_t pars_close_token;
+extern pars_res_word_t pars_share_token;
+extern pars_res_word_t pars_unique_token;
+extern pars_res_word_t pars_clustered_token;
+
+extern ulint pars_star_denoter;
+
+/* Procedure parameter types */
+#define PARS_INPUT 0
+#define PARS_OUTPUT 1
+#define PARS_NOT_PARAM 2
+
+int
+yyparse(void);
+
+/*************************************************************//**
+Parses an SQL string returning the query graph.
+@return own: the query graph */
+que_t*
+pars_sql(
+/*=====*/
+ pars_info_t* info, /*!< in: extra information, or NULL */
+ const char* str); /*!< in: SQL string */
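+
+/* A minimal usage sketch, assuming the pars_info_create() and
+pars_info_add_str_literal() helpers declared further down in this
+header; inside the server, the higher-level que_eval_sql() wrapper is
+normally used instead of calling pars_sql() directly:
+
+	pars_info_t*	info = pars_info_create();
+	pars_info_add_str_literal(info, "table_name", "SYS_TABLES");
+	que_t*	graph = pars_sql(info,
+		"PROCEDURE P () IS\n"
+		"n INT;\n"
+		"BEGIN\n"
+		"SELECT COUNT(*) INTO n FROM SYS_TABLES\n"
+		" WHERE NAME = :table_name;\n"
+		"END;");
+*/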
+/*************************************************************//**
+Retrieves characters for the lexical analyzer.
+@return number of characters copied, or 0 on EOF */
+int
+pars_get_lex_chars(
+/*===============*/
+ char* buf, /*!< in/out: buffer where to copy */
+ size_t max_size); /*!< in: maximum number of characters which fit
+ in the buffer */
+/*************************************************************//**
+Called by yyparse on error. */
+void
+yyerror(
+/*====*/
+ const char* s); /*!< in: error message string */
+/*********************************************************************//**
+Parses a variable declaration.
+@return own: symbol table node of type SYM_VAR */
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+ sym_node_t* node, /*!< in: symbol table node allocated for the
+ id of the variable */
+ pars_res_word_t* type); /*!< in: pointer to a type token */
+/*********************************************************************//**
+Parses a function expression.
+@return own: function node in a query tree */
+func_node_t*
+pars_func(
+/*======*/
+ que_node_t* res_word,/*!< in: function name reserved word */
+ que_node_t* arg); /*!< in: first argument in the argument list */
+/*************************************************************************
+Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded
+within the search string.
+@return own: function node in a query tree */
+int
+pars_like_rebind(
+/*=============*/
+ sym_node_t* node, /* in: The search string node.*/
+ const byte* ptr, /* in: literal to (re) bind */
+ ulint len); /* in: length of literal to (re) bind*/
+/*********************************************************************//**
+Parses an operator expression.
+@return own: function node in a query tree */
+func_node_t*
+pars_op(
+/*====*/
+ int func, /*!< in: operator token code */
+ que_node_t* arg1, /*!< in: first argument */
+ que_node_t* arg2); /*!< in: second argument or NULL for an unary
+ operator */
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+@return own: order-by node in a query tree */
+order_node_t*
+pars_order_by(
+/*==========*/
+ sym_node_t* column, /*!< in: column name */
+ pars_res_word_t* asc); /*!< in: &pars_asc_token or pars_desc_token */
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return own: select node in a query tree */
+sel_node_t*
+pars_select_list(
+/*=============*/
+ que_node_t* select_list, /*!< in: select list */
+ sym_node_t* into_list); /*!< in: variables list or NULL */
+/*********************************************************************//**
+Parses a cursor declaration.
+@return sym_node */
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+ sym_node_t* sym_node, /*!< in: cursor id node in the symbol
+ table */
+ sel_node_t* select_node); /*!< in: select node */
+/*********************************************************************//**
+Parses a function declaration.
+@return sym_node */
+que_node_t*
+pars_function_declaration(
+/*======================*/
+ sym_node_t* sym_node); /*!< in: function id node in the symbol
+ table */
+/*********************************************************************//**
+Parses a select statement.
+@return own: select node in a query tree */
+sel_node_t*
+pars_select_statement(
+/*==================*/
+ sel_node_t* select_node, /*!< in: select node already containing
+ the select list */
+ sym_node_t* table_list, /*!< in: table list */
+ que_node_t* search_cond, /*!< in: search condition or NULL */
+ pars_res_word_t* for_update, /*!< in: NULL or &pars_update_token */
+ pars_res_word_t* consistent_read,/*!< in: NULL or
+ &pars_consistent_token */
+ order_node_t* order_by); /*!< in: NULL or an order-by node */
+/*********************************************************************//**
+Parses a column assignment in an update.
+@return column assignment node */
+col_assign_node_t*
+pars_column_assignment(
+/*===================*/
+ sym_node_t* column, /*!< in: column to assign */
+ que_node_t* exp); /*!< in: value to assign */
+/*********************************************************************//**
+Parses a delete or update statement start.
+@return own: update node in a query tree */
+upd_node_t*
+pars_update_statement_start(
+/*========================*/
+ ibool is_delete, /*!< in: TRUE if delete */
+ sym_node_t* table_sym, /*!< in: table name node */
+ col_assign_node_t* col_assign_list);/*!< in: column assignment list, NULL
+ if delete */
+/*********************************************************************//**
+Parses an update or delete statement.
+@return own: update node in a query tree */
+upd_node_t*
+pars_update_statement(
+/*==================*/
+ upd_node_t* node, /*!< in: update node */
+ sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in
+ the symbol table or NULL */
+ que_node_t* search_cond); /*!< in: search condition or NULL */
+/*********************************************************************//**
+Parses an insert statement.
+@return own: update node in a query tree */
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+ sym_node_t* table_sym, /*!< in: table name node */
+ que_node_t* values_list, /*!< in: value expression list or NULL */
+ sel_node_t* select); /*!< in: select condition or NULL */
+/*********************************************************************//**
+Parses an elsif element.
+@return elsif node */
+elsif_node_t*
+pars_elsif_element(
+/*===============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses an if-statement.
+@return if-statement node */
+if_node_t*
+pars_if_statement(
+/*==============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list, /*!< in: statement list */
+ que_node_t* else_part); /*!< in: else-part statement list */
+/*********************************************************************//**
+Parses a for-loop-statement.
+@return for-statement node */
+for_node_t*
+pars_for_statement(
+/*===============*/
+ sym_node_t* loop_var, /*!< in: loop variable */
+ que_node_t* loop_start_limit,/*!< in: loop start expression */
+ que_node_t* loop_end_limit, /*!< in: loop end expression */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses a while-statement.
+@return while-statement node */
+while_node_t*
+pars_while_statement(
+/*=================*/
+ que_node_t* cond, /*!< in: while-condition */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses an exit statement.
+@return exit statement node */
+exit_node_t*
+pars_exit_statement(void);
+/*=====================*/
+/*********************************************************************//**
+Parses a return-statement.
+@return return-statement node */
+return_node_t*
+pars_return_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a procedure call.
+@return function node */
+func_node_t*
+pars_procedure_call(
+/*================*/
+ que_node_t* res_word,/*!< in: procedure name reserved word */
+ que_node_t* args); /*!< in: argument list */
+/*********************************************************************//**
+Parses an assignment statement.
+@return assignment statement node */
+assign_node_t*
+pars_assignment_statement(
+/*======================*/
+ sym_node_t* var, /*!< in: variable to assign */
+ que_node_t* val); /*!< in: value to assign */
+/*********************************************************************//**
+Parses a fetch statement. into_list or user_func (but not both) must be
+non-NULL.
+@return fetch statement node */
+fetch_node_t*
+pars_fetch_statement(
+/*=================*/
+ sym_node_t* cursor, /*!< in: cursor node */
+ sym_node_t* into_list, /*!< in: variables to set, or NULL */
+ sym_node_t* user_func); /*!< in: user function name, or NULL */
+/*********************************************************************//**
+Parses an open or close cursor statement.
+@return open or close statement node */
+open_node_t*
+pars_open_statement(
+/*================*/
+ ulint type, /*!< in: ROW_SEL_OPEN_CURSOR
+ or ROW_SEL_CLOSE_CURSOR */
+ sym_node_t* cursor); /*!< in: cursor node */
+/*********************************************************************//**
+Parses a row_printf-statement.
+@return row_printf-statement node */
+row_printf_node_t*
+pars_row_printf_statement(
+/*======================*/
+ sel_node_t* sel_node); /*!< in: select node */
+/*********************************************************************//**
+Parses a commit statement.
+@return own: commit node struct */
+commit_node_t*
+pars_commit_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a rollback statement.
+@return own: rollback node struct */
+roll_node_t*
+pars_rollback_statement(void);
+/*=========================*/
+/*********************************************************************//**
+Parses a column definition at a table creation.
+@return column sym table node */
+sym_node_t*
+pars_column_def(
+/*============*/
+ sym_node_t* sym_node, /*!< in: column node in the
+ symbol table */
+ pars_res_word_t* type, /*!< in: data type */
+ sym_node_t* len, /*!< in: length of column, or
+ NULL */
+ void* is_not_null); /*!< in: if not NULL, column
+ is of type NOT NULL. */
+/*********************************************************************//**
+Parses a table creation operation.
+@return table create subgraph */
+tab_node_t*
+pars_create_table(
+/*==============*/
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_defs); /*!< in: list of column names */
+/*********************************************************************//**
+Parses an index creation operation.
+@return index create subgraph */
+ind_node_t*
+pars_create_index(
+/*==============*/
+ pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */
+ pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */
+ sym_node_t* index_sym, /*!< in: index name node in the symbol
+ table */
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_list); /*!< in: list of column names */
+/*********************************************************************//**
+Parses a procedure definition.
+@return query fork node */
+que_fork_t*
+pars_procedure_definition(
+/*======================*/
+ sym_node_t* sym_node, /*!< in: procedure id node in the symbol
+ table */
+ que_node_t* stat_list); /*!< in: statement list */
+
+/*************************************************************//**
+Parses a stored procedure call, when this is not within another stored
+procedure, that is, when the client issues a procedure call directly.
+In MySQL/InnoDB, stored InnoDB procedures are invoked via the
+parsed procedure tree, not via InnoDB SQL, so this function is not used.
+@return query graph */
+que_fork_t*
+pars_stored_procedure_call(
+/*=======================*/
+ sym_node_t* sym_node); /*!< in: stored procedure name */
+/** Completes a query graph by adding query thread and fork nodes
+above it and prepares the graph for running. The fork created is of
+type QUE_FORK_MYSQL_INTERFACE.
+@param[in] node root node for an incomplete query
+ graph, or NULL for dummy graph
+@param[in] trx transaction handle
+@param[in] heap memory heap from which allocated
+@param[in] prebuilt row prebuilt structure
+@return query thread node to run */
+que_thr_t*
+pars_complete_graph_for_exec(
+ que_node_t* node,
+ trx_t* trx,
+ mem_heap_t* heap,
+ row_prebuilt_t* prebuilt)
+ MY_ATTRIBUTE((nonnull(2,3), warn_unused_result));
+
+/****************************************************************//**
+Create parser info struct.
+@return own: info struct */
+pars_info_t*
+pars_info_create(void);
+/*==================*/
+
+/****************************************************************//**
+Free info struct and everything it contains. */
+void
+pars_info_free(
+/*===========*/
+ pars_info_t* info); /*!< in, own: info struct */
+
+/****************************************************************//**
+Add bound literal. */
+void
+pars_info_add_literal(
+/*==================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const void* address, /*!< in: address */
+ ulint length, /*!< in: length of data */
+ ulint type, /*!< in: type, e.g. DATA_FIXBINARY */
+ ulint prtype); /*!< in: precise type, e.g.
+ DATA_UNSIGNED */
+
+/****************************************************************//**
+Equivalent to pars_info_add_literal(info, name, str, strlen(str),
+DATA_VARCHAR, DATA_ENGLISH). */
+void
+pars_info_add_str_literal(
+/*======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* str); /*!< in: string */
+/****************************************************************//**
+If the literal value already exists, rebinds it; otherwise creates a
+new entry. */
+void
+pars_info_bind_literal(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const void* address, /*!< in: address */
+ ulint length, /*!< in: length of data */
+ ulint type, /*!< in: type, e.g. DATA_FIXBINARY */
+ ulint prtype); /*!< in: precise type, e.g.
+ DATA_UNSIGNED */
+/****************************************************************//**
+If the literal value already exists, rebinds it; otherwise creates a
+new entry. */
+void
+pars_info_bind_varchar_literal(
+/*===========================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const byte* str, /*!< in: string */
+ ulint str_len); /*!< in: string length */
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+void
+pars_info_bind_int4_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const ib_uint32_t* val); /*!< in: value */
+/****************************************************************//**
+If the literal value already exists, rebinds it; otherwise creates a
+new entry. */
+void
+pars_info_bind_int8_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const ib_uint64_t* val); /*!< in: value */
+/****************************************************************//**
+Add user function. */
+void
+pars_info_bind_function(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: function name */
+ pars_user_func_cb_t func, /*!< in: function address */
+ void* arg); /*!< in: user-supplied argument */
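+
+/* A hedged usage sketch (the callback, the names and the SQL below are
+illustrative only, not part of this header): the callback has the
+pars_user_func_cb_t signature; in the FETCH use case the first argument
+is the cursor's sel_node_t, and existing fetch callbacks return TRUE:
+
+  static ibool my_row_cb(void* row, void* user_arg)
+  {
+    sel_node_t* node = static_cast<sel_node_t*>(row);
+    ... (read the fetched columns from node, update *user_arg) ...
+    return(TRUE);
+  }
+
+  pars_info_bind_function(info, "my_func", my_row_cb, &my_ctx);
+
+The bound name is then declared and used in the SQL text:
+
+  DECLARE FUNCTION my_func;
+  DECLARE CURSOR c IS SELECT ... ;
+  OPEN c;
+  FETCH c INTO my_func();
+*/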
+/****************************************************************//**
+Add bound id. */
+void
+pars_info_bind_id(
+/*=============*/
+ pars_info_t* info, /*!< in: info struct */
+ ibool copy_name, /*!< in: make a copy of name if TRUE */
+ const char* name, /*!< in: name */
+ const char* id); /*!< in: id */
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+void
+pars_info_add_int4_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ ulint val); /*!< in: value */
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[8];
+mach_write_to_8(buf, val);
+pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+void
+pars_info_add_ull_literal(
+/*======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ ib_uint64_t val); /*!< in: value */
+
+/****************************************************************//**
+If the literal value already exists, rebinds it; otherwise creates a
+new entry. */
+void
+pars_info_bind_ull_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const ib_uint64_t* val) /*!< in: value */
+ MY_ATTRIBUTE((nonnull));
+
+/****************************************************************//**
+Add bound id. */
+void
+pars_info_add_id(
+/*=============*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* id); /*!< in: id */
+
+/****************************************************************//**
+Get bound literal with the given name.
+@return bound literal, or NULL if not found */
+pars_bound_lit_t*
+pars_info_get_bound_lit(
+/*====================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name); /*!< in: bound literal name to find */
+
+/****************************************************************//**
+Get bound id with the given name.
+@return bound id, or NULL if not found */
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name); /*!< in: bound id name to find */
+
+/******************************************************************//**
+Release any resources used by the lexer. */
+void
+pars_lexer_close(void);
+/*==================*/
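+
+/* A minimal usage sketch (hedged; the table name, the literal names and
+the SQL text are hypothetical, and trx stands for an open transaction):
+a caller creates a pars_info_t, binds literals by name, and passes the
+struct to que_eval_sql() (declared in que0que.h). Since graph_owns_us
+defaults to TRUE, the info struct is freed together with the query
+graph:
+
+  pars_info_t* info = pars_info_create();
+
+  pars_info_add_str_literal(info, "name", "test/t1");
+  pars_info_add_int4_literal(info, "n_cols", 3);
+
+  dberr_t err = que_eval_sql(
+    info,
+    "PROCEDURE P () IS\n"
+    "BEGIN\n"
+    "UPDATE SYS_TABLES SET N_COLS = :n_cols WHERE NAME = :name;\n"
+    "END;\n",
+    FALSE, trx);
+*/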
+
+/** Extra information supplied for pars_sql(). */
+struct pars_info_t {
+ mem_heap_t* heap; /*!< our own memory heap */
+
+ ib_vector_t* funcs; /*!< user functions, or NULL
+ (pars_user_func_t*) */
+ ib_vector_t* bound_lits; /*!< bound literals, or NULL
+ (pars_bound_lit_t*) */
+ ib_vector_t* bound_ids; /*!< bound ids, or NULL
+ (pars_bound_id_t*) */
+
+ ibool graph_owns_us; /*!< if TRUE (which is the default),
+ que_graph_free() will free us */
+};
+
+/** User-supplied function and argument. */
+struct pars_user_func_t {
+ const char* name; /*!< function name */
+ pars_user_func_cb_t func; /*!< function address */
+ void* arg; /*!< user-supplied argument */
+};
+
+/** Bound literal. */
+struct pars_bound_lit_t {
+ const char* name; /*!< name */
+ const void* address; /*!< address */
+ ulint length; /*!< length of data */
+ ulint type; /*!< type, e.g. DATA_FIXBINARY */
+ ulint prtype; /*!< precise type, e.g. DATA_UNSIGNED */
+ sym_node_t* node; /*!< symbol node */
+};
+
+/** Bound identifier. */
+struct pars_bound_id_t {
+ const char* name; /*!< name */
+ const char* id; /*!< identifier */
+};
+
+/** Struct used to denote a reserved word in a parsing tree */
+struct pars_res_word_t{
+ int code; /*!< the token code for the reserved word from
+ pars0grm.h */
+};
+
+/** A predefined function or operator node in a parsing tree; this construct
+is also used for some non-functions like the assignment ':=' */
+struct func_node_t{
+ que_common_t common; /*!< type: QUE_NODE_FUNC */
+ int func; /*!< token code of the function name */
+ ulint fclass; /*!< class of the function */
+ que_node_t* args; /*!< argument(s) of the function */
+ UT_LIST_NODE_T(func_node_t) cond_list;
+ /*!< list of comparison conditions; defined
+ only for comparison operator nodes except,
+ presently, for OPT_SCROLL_TYPE ones */
+ UT_LIST_NODE_T(func_node_t) func_node_list;
+ /*!< list of function nodes in a parsed
+ query graph */
+};
+
+/** An order-by node in a select */
+struct order_node_t{
+ que_common_t common; /*!< type: QUE_NODE_ORDER */
+ sym_node_t* column; /*!< order-by column */
+ ibool asc; /*!< TRUE if ascending, FALSE if descending */
+};
+
+/** Procedure definition node */
+struct proc_node_t{
+ que_common_t common; /*!< type: QUE_NODE_PROC */
+ sym_node_t* proc_id; /*!< procedure name symbol in the symbol
+ table of this same procedure */
+ que_node_t* stat_list; /*!< statement list */
+ sym_tab_t* sym_tab; /*!< symbol table of this procedure */
+};
+
+/** elsif-element node */
+struct elsif_node_t{
+ que_common_t common; /*!< type: QUE_NODE_ELSIF */
+ que_node_t* cond; /*!< if condition */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** if-statement node */
+struct if_node_t{
+ que_common_t common; /*!< type: QUE_NODE_IF */
+ que_node_t* cond; /*!< if condition */
+ que_node_t* stat_list; /*!< statement list */
+ que_node_t* else_part; /*!< else-part statement list */
+ elsif_node_t* elsif_list; /*!< elsif element list */
+};
+
+/** while-statement node */
+struct while_node_t{
+ que_common_t common; /*!< type: QUE_NODE_WHILE */
+ que_node_t* cond; /*!< while condition */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** for-loop-statement node */
+struct for_node_t{
+ que_common_t common; /*!< type: QUE_NODE_FOR */
+ sym_node_t* loop_var; /*!< loop variable: this is the
+ dereferenced symbol from the
+ variable declarations, not the
+ symbol occurrence in the for loop
+ definition */
+ que_node_t* loop_start_limit;/*!< initial value of loop variable */
+ que_node_t* loop_end_limit; /*!< end value of loop variable */
+ lint loop_end_value; /*!< evaluated value for the end value:
+ it is calculated only when the loop
+ is entered, and will not change within
+ the loop */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** exit statement node */
+struct exit_node_t{
+ que_common_t common; /*!< type: QUE_NODE_EXIT */
+};
+
+/** return-statement node */
+struct return_node_t{
+ que_common_t common; /*!< type: QUE_NODE_RETURN */
+};
+
+/** Assignment statement node */
+struct assign_node_t{
+ que_common_t common; /*!< type: QUE_NODE_ASSIGNMENT */
+ sym_node_t* var; /*!< variable to set */
+ que_node_t* val; /*!< value to assign */
+};
+
+/** Column assignment node */
+struct col_assign_node_t{
+ que_common_t common; /*!< type: QUE_NODE_COL_ASSIGN */
+ sym_node_t* col; /*!< column to set */
+ que_node_t* val; /*!< value to assign */
+};
+
+/** Classes of functions */
+/* @{ */
+#define PARS_FUNC_ARITH 1 /*!< +, -, *, / */
+#define PARS_FUNC_LOGICAL 2 /*!< AND, OR, NOT */
+#define PARS_FUNC_CMP 3 /*!< comparison operators */
+#define PARS_FUNC_PREDEFINED 4 /*!< TO_NUMBER, SUBSTR, ... */
+#define PARS_FUNC_AGGREGATE 5 /*!< COUNT */
+#define PARS_FUNC_OTHER 6 /*!< these are not real functions,
+ e.g., := */
+/* @} */
+
+#endif
diff --git a/storage/innobase/include/pars0sym.h b/storage/innobase/include/pars0sym.h
new file mode 100644
index 00000000..59f6cc31
--- /dev/null
+++ b/storage/innobase/include/pars0sym.h
@@ -0,0 +1,243 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0sym.h
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0sym_h
+#define pars0sym_h
+
+#include "que0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+
+/******************************************************************//**
+Creates a symbol table for a single stored procedure or query.
+@return own: symbol table */
+sym_tab_t*
+sym_tab_create(
+/*===========*/
+ mem_heap_t* heap); /*!< in: memory heap where to create */
+/******************************************************************//**
+Frees the memory allocated dynamically AFTER parsing phase for variables
+etc. in the symbol table. Does not free the mem heap where the table was
+originally created. Frees also SQL explicit cursor definitions. */
+void
+sym_tab_free_private(
+/*=================*/
+ sym_tab_t* sym_tab); /*!< in, own: symbol table */
+/******************************************************************//**
+Adds an integer literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_int_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ ulint val); /*!< in: integer value */
+/******************************************************************//**
+Adds a string literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_str_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const byte* str, /*!< in: string with no quotes around
+ it */
+ ulint len); /*!< in: string length */
+/******************************************************************//**
+Add a bound literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_bound_lit(
+/*==================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name, /*!< in: name of bound literal */
+ ulint* lit_type); /*!< out: type of literal (PARS_*_LIT) */
+/******************************************************************//**
+Rebinds a literal to a node in the symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_rebind_lit(
+/*===============*/
+ sym_node_t* node, /*!< in: node that is bound to a literal */
+ const void* address, /*!< in: pointer to data */
+ ulint length); /*!< in: length of data */
+/******************************************************************//**
+Adds an SQL null literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_null_lit(
+/*=================*/
+ sym_tab_t* sym_tab); /*!< in: symbol table */
+/******************************************************************//**
+Adds an identifier to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_id(
+/*===========*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ byte* name, /*!< in: identifier name */
+ ulint len); /*!< in: identifier length */
+
+/******************************************************************//**
+Add a bound identifier to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_bound_id(
+/*=================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name); /*!< in: name of bound id */
+
+/** Index of sym_node_t::field_nos corresponding to the clustered index */
+#define SYM_CLUST_FIELD_NO 0
+/** Index of sym_node_t::field_nos corresponding to a secondary index */
+#define SYM_SEC_FIELD_NO 1
+
+/** Types of a symbol table node */
+enum sym_tab_entry {
+ SYM_UNSET, /*!< Unset entry. */
+ SYM_VAR = 91, /*!< declared parameter or local
+ variable of a procedure */
+ SYM_IMPLICIT_VAR, /*!< storage for an intermediate result
+ of a calculation */
+ SYM_LIT, /*!< literal */
+ SYM_TABLE_REF_COUNTED, /*!< database table name, ref counted. Must
+ be closed explicitly. */
+ SYM_TABLE, /*!< database table name */
+ SYM_COLUMN, /*!< database table column */
+ SYM_CURSOR, /*!< named cursor */
+ SYM_PROCEDURE_NAME, /*!< stored procedure name */
+ SYM_INDEX, /*!< database index name */
+ SYM_FUNCTION /*!< user function name */
+};
+
+/** Symbol table node */
+struct sym_node_t{
+ que_common_t common; /*!< node type:
+ QUE_NODE_SYMBOL */
+ /* NOTE: if the data field in 'common.val' is not NULL and the symbol
+ table node is not for a temporary column, the memory for the value has
+ been allocated from dynamic memory and it should be freed when the
+ symbol table is discarded */
+
+ /* 'alias' and 'indirection' are almost the same, but not quite.
+ 'alias' always points to the primary instance of the variable, while
+ 'indirection' does the same only if we should use the primary
+ instance's values for the node's data. This is usually the case, but
+ when initializing a cursor (e.g., "DECLARE CURSOR c IS SELECT * FROM
+ t WHERE id = x;"), we copy the values from the primary instance to
+ the cursor's instance so that they are fixed for the duration of the
+ cursor, and set 'indirection' to NULL. If we did not, the value of
+ 'x' could change between fetches and things would break horribly.
+
+ TODO: It would be cleaner to make 'indirection' a boolean field and
+ always use 'alias' to refer to the primary node. */
+
+ sym_node_t* indirection; /*!< pointer to
+ another symbol table
+ node which contains
+ the value for this
+ node, NULL otherwise */
+ sym_node_t* alias; /*!< pointer to
+ another symbol table
+ node for which this
+ node is an alias,
+ NULL otherwise */
+ UT_LIST_NODE_T(sym_node_t) col_var_list; /*!< list of table
+ columns or a list of
+ input variables for an
+ explicit cursor */
+ ibool copy_val; /*!< TRUE if a column
+ and its value should
+ be copied to dynamic
+ memory when fetched */
+ ulint field_nos[2]; /*!< if a column, in
+ the position
+ SYM_CLUST_FIELD_NO is
+ the field number in the
+ clustered index; in
+ the position
+ SYM_SEC_FIELD_NO
+ the field number in the
+ non-clustered index to
+ use first; if not found
+ from the index, then
+ ULINT_UNDEFINED */
+ ibool resolved; /*!< TRUE if the
+ meaning of a variable
+ or a column has been
+ resolved; for literals
+ this is always TRUE */
+ enum sym_tab_entry token_type; /*!< type of the
+ parsed token */
+ const char* name; /*!< name of an id */
+ ulint name_len; /*!< id name length */
+ dict_table_t* table; /*!< table definition
+ if a table id or a
+ column id */
+ ulint col_no; /*!< column number if a
+ column */
+ sel_buf_t* prefetch_buf; /*!< NULL, or a buffer
+ for cached column
+ values for prefetched
+ rows */
+ sel_node_t* cursor_def; /*!< cursor definition
+ select node if a
+ named cursor */
+ ulint param_type; /*!< PARS_INPUT,
+ PARS_OUTPUT, or
+ PARS_NOT_PARAM if not a
+ procedure parameter */
+ sym_tab_t* sym_table; /*!< back pointer to
+ the symbol table */
+ UT_LIST_NODE_T(sym_node_t) sym_list; /*!< list of symbol
+ nodes */
+ sym_node_t* like_node; /*!< LIKE operator node */
+};
+
+/** Symbol table */
+struct sym_tab_t{
+ que_t* query_graph;
+ /*!< query graph generated by the
+ parser */
+ const char* sql_string;
+ /*!< SQL string to parse */
+ size_t string_len;
+ /*!< SQL string length */
+ size_t next_char_pos;
+ /*!< position of the next character in
+ sql_string to give to the lexical
+ analyzer */
+ pars_info_t* info; /*!< extra information, or NULL */
+ sym_node_list_t sym_list;
+ /*!< list of symbol nodes in the symbol
+ table */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ func_node_list;
+ /*!< list of function nodes in the
+ parsed query graph */
+ mem_heap_t* heap; /*!< memory heap from which we can
+ allocate space */
+};
+
+#endif
diff --git a/storage/innobase/include/pars0types.h b/storage/innobase/include/pars0types.h
new file mode 100644
index 00000000..f5b69522
--- /dev/null
+++ b/storage/innobase/include/pars0types.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0types.h
+SQL parser global types
+
+Created 1/11/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0types_h
+#define pars0types_h
+
+struct pars_info_t;
+struct pars_user_func_t;
+struct pars_bound_lit_t;
+struct pars_bound_id_t;
+struct sym_node_t;
+struct sym_tab_t;
+struct pars_res_word_t;
+struct func_node_t;
+struct order_node_t;
+struct proc_node_t;
+struct elsif_node_t;
+struct if_node_t;
+struct while_node_t;
+struct for_node_t;
+struct exit_node_t;
+struct return_node_t;
+struct assign_node_t;
+struct col_assign_node_t;
+
+typedef UT_LIST_BASE_NODE_T(sym_node_t) sym_node_list_t;
+
+#endif
diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h
new file mode 100644
index 00000000..e77857f4
--- /dev/null
+++ b/storage/innobase/include/que0que.h
@@ -0,0 +1,435 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.h
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0que_h
+#define que0que_h
+
+#include "data0data.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "srv0srv.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "pars0types.h"
+
+/***********************************************************************//**
+Creates a query graph fork node.
+@return own: fork node */
+que_fork_t*
+que_fork_create(
+/*============*/
+ que_t* graph, /*!< in: graph, if NULL then this
+ fork node is assumed to be the
+ graph root */
+ que_node_t* parent, /*!< in: parent node */
+ ulint fork_type, /*!< in: fork type */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/***********************************************************************//**
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+ que_fork_t* fork); /*!< in: query fork */
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+ que_fork_t* fork); /*!< in: query fork */
+/***********************************************************************//**
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+ que_node_t* node, /*!< in: graph node */
+ que_node_t* parent);/*!< in: parent */
+/** Creates a query graph thread node.
+@param[in] parent parent node, i.e., a fork node
+@param[in] heap memory heap where created
+@param[in] prebuilt row prebuilt structure
+@return own: query thread node */
+que_thr_t*
+que_thr_create(
+ que_fork_t* parent,
+ mem_heap_t* heap,
+ row_prebuilt_t* prebuilt);
+/**********************************************************************//**
+Frees a query graph, but not the heap where it was created. Does not free
+explicit cursor declarations; they are freed in que_graph_free(). */
+void
+que_graph_free_recursive(
+/*=====================*/
+ que_node_t* node); /*!< in: query graph node */
+/**********************************************************************//**
+Frees a query graph. */
+void
+que_graph_free(
+/*===========*/
+ que_t* graph); /*!< in: query graph; we assume that the memory
+ heap where this graph was created is private
+ to this graph: if not, then use
+ que_graph_free_recursive and free the heap
+ afterwards! */
+/**********************************************************************//**
+Stops a query thread if graph or trx is in a state requiring it. The
+conditions are tested in the order (1) graph, (2) trx. The lock_sys_t::mutex
+has to be reserved.
+@return TRUE if stopped */
+ibool
+que_thr_stop(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread. The query thread
+is stopped and made inactive, except in the case where it was put to
+the lock wait state in lock0lock.cc, but the lock has already been
+granted or the transaction has been chosen as a victim in deadlock
+resolution. */
+void
+que_thr_stop_for_mysql(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Run a query thread. Handles lock waits. */
+void
+que_run_threads(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Moves a suspended query thread to the QUE_THR_RUNNING state and releases
+a worker thread to execute it. This function should be used to end
+the wait state of a query thread waiting for a lock or a stored procedure
+completion.
+@return query thread instance of the thread to wake up, or NULL */
+que_thr_t*
+que_thr_end_lock_wait(
+/*==================*/
+ trx_t* trx); /*!< in: transaction in the
+ QUE_THR_LOCK_WAIT state */
+/**********************************************************************//**
+Starts execution of a command in a query fork. Picks a query thread which
+is not in the QUE_THR_RUNNING state and moves it to that state. If none
+can be chosen, a situation which may arise in parallelized fetches, NULL
+is returned.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+que_thr_t*
+que_fork_start_command(
+/*===================*/
+ que_fork_t* fork); /*!< in: a query fork */
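+
+/* Illustrative call pattern (a sketch only; 'graph' stands for an
+already-built query graph): pick a free query thread from the fork and
+run it; que_run_threads() handles lock waits internally, so the caller
+does not resume the thread manually:
+
+  que_thr_t* thr = que_fork_start_command(graph);
+  ut_a(thr);
+  que_run_threads(thr);
+*/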
+/***********************************************************************//**
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************************//**
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+ const que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets the value buffer size of a graph node.
+@return val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+ que_node_t* node, /*!< in: graph node */
+ ulint size); /*!< in: size */
+/*********************************************************************//**
+Gets the next list node in a list of query graph nodes. */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+ que_node_t* node); /*!< in: node in a list */
+/*********************************************************************//**
+Gets the parent node of a query graph node.
+@return parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+ que_node_t* node); /*!< in: node */
+/****************************************************************//**
+Get the first containing loop node (e.g. while_node_t or for_node_t) for the
+given node, or NULL if the node is not within a loop.
+@return containing loop node, or NULL. */
+que_node_t*
+que_node_get_containing_loop_node(
+/*==============================*/
+ que_node_t* node); /*!< in: node */
+/*********************************************************************//**
+Catenates a query graph node to a list of them, possibly empty list.
+@return one-way list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+ que_node_t* node_list, /*!< in: node list, or NULL */
+ que_node_t* node); /*!< in: node */
+/*********************************************************************//**
+Gets the last node from the list.
+@return last node in the list */
+UNIV_INLINE
+que_node_t*
+que_node_list_get_last(
+/*===================*/
+ que_node_t* node_list); /*!< in: node list */
+/*********************************************************************//**
+Gets a query graph node list length.
+@return length, for NULL list 0 */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+ que_node_t* node_list); /*!< in: node list, or NULL */
+/**********************************************************************//**
+Checks if graph, trx, or session is in a state where the query thread should
+be stopped.
+@return TRUE if should be stopped; NOTE that if the peek is made
+without reserving the trx_t::mutex, then another peek with the mutex
+reserved is necessary before deciding the actual stopping */
+UNIV_INLINE
+ibool
+que_thr_peek_stop(
+/*==============*/
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************************//**
+Returns TRUE if the query graph is for a SELECT statement.
+@return TRUE if a select */
+UNIV_INLINE
+ibool
+que_graph_is_select(
+/*================*/
+ que_t* graph); /*!< in: graph */
+/**********************************************************************//**
+Prints info of an SQL query graph node. */
+void
+que_node_print_info(
+/*================*/
+ que_node_t* node); /*!< in: query graph node */
+/*********************************************************************//**
+Evaluate the given SQL
+@return error code or DB_SUCCESS */
+dberr_t
+que_eval_sql(
+/*=========*/
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql, /*!< in: SQL string */
+ bool reserve_dict_mutex,
+ /*!< in: whether to acquire/release
+ dict_sys.mutex around the call to pars_sql(). */
+ trx_t* trx); /*!< in: trx */
+
+/**********************************************************************//**
+Round robin scheduler.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+que_thr_t*
+que_fork_scheduler_round_robin(
+/*===========================*/
+ que_fork_t* fork, /*!< in: a query fork */
+ que_thr_t* thr); /*!< in: current pos */
+
+/** Query thread states */
+enum que_thr_state_t {
+ QUE_THR_RUNNING,
+ /** in selects this means that the thread is at the end of its
+ result set (or start, in case of a scroll cursor); in other
+ statements, this means the thread has done its task */
+ QUE_THR_COMPLETED,
+ QUE_THR_COMMAND_WAIT,
+ QUE_THR_LOCK_WAIT,
+ QUE_THR_SUSPENDED
+};
+
+/** Query thread lock states */
+enum que_thr_lock_t {
+ QUE_THR_LOCK_NOLOCK,
+ QUE_THR_LOCK_ROW,
+ QUE_THR_LOCK_TABLE
+};
+
+/* Query graph query thread node: the fields are protected by the
+trx_t::mutex with the exceptions named below */
+
+struct que_thr_t{
+ que_common_t common; /*!< type: QUE_NODE_THR */
+ que_node_t* child; /*!< graph child node */
+ que_t* graph; /*!< graph where this node belongs */
+ que_thr_state_t state; /*!< state of the query thread */
+ bool is_active; /*!< whether the thread is active */
+ /*------------------------------*/
+ /* The following fields are private to the OS thread executing the
+ query thread, and are not protected by any mutex: */
+
+ que_node_t* run_node; /*!< pointer to the node where the
+ subgraph down from this node is
+ currently executed */
+ que_node_t* prev_node; /*!< pointer to the node from which
+ the control came */
+ ulint resource; /*!< resource usage of the query thread
+ thus far */
+ ulint lock_state; /*!< lock state of thread (table or
+ row) */
+ struct srv_slot_t*
+ slot; /*!< the thread slot in the wait
+ array in srv_sys_t */
+ /*------------------------------*/
+ /* The following fields are links for the various lists that
+ this type can be on. */
+ UT_LIST_NODE_T(que_thr_t)
+ thrs; /*!< list of thread nodes of the fork
+ node */
+ UT_LIST_NODE_T(que_thr_t)
+ queue; /*!< list of runnable thread nodes in
+ the server task queue */
+ ulint fk_cascade_depth; /*!< maximum cascading call depth
+ supported for foreign key constraint
+ related delete/updates */
+ row_prebuilt_t* prebuilt; /*!< prebuilt structure processed by
+ the query thread */
+
+#ifdef UNIV_DEBUG
+ /** Change the 'active' status */
+ inline void set_active(bool active);
+#endif
+ /** Transition to the QUE_THR_RUNNING state. */
+ inline void start_running()
+ {
+ ut_d(if (!is_active) set_active(true));
+ is_active= true;
+ state= QUE_THR_RUNNING;
+ }
+
+ /** Stop query execution when there is no error or lock wait. */
+ void stop_no_error()
+ {
+ ut_ad(is_active);
+ ut_d(set_active(false));
+ state= QUE_THR_COMPLETED;
+ is_active= false;
+ }
+};
+
+/* Query graph fork node: its fields are protected by the query thread mutex */
+struct que_fork_t{
+ que_common_t common; /*!< type: QUE_NODE_FORK */
+ que_t* graph; /*!< query graph of this node */
+ ulint fork_type; /*!< fork type */
+#ifdef UNIV_DEBUG
+ /** For the query graph root, updated in set_active() */
+ ulint n_active_thrs;
+ /** Change the 'active' status */
+ void set_active(bool active);
+#endif
+ trx_t* trx; /*!< transaction: this is set only in
+ the root node */
+ ulint state; /*!< state of the fork node */
+ que_thr_t* caller; /*!< pointer to a possible calling query
+ thread */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ thrs; /*!< list of query threads */
+ /*------------------------------*/
+ /* The fields in this section are defined only in the root node */
+ sym_tab_t* sym_tab; /*!< symbol table of the query,
+ generated by the parser, or NULL
+ if the graph was created 'by hand' */
+ pars_info_t* info; /*!< info struct, or NULL */
+
+ sel_node_t* last_sel_node; /*!< last executed select node, or NULL
+ if none */
+ UT_LIST_NODE_T(que_fork_t)
+ graphs; /*!< list of query graphs of a session
+ or a stored procedure */
+ /*------------------------------*/
+ mem_heap_t* heap; /*!< memory heap where the fork was
+ created */
+
+};
+
+#ifdef UNIV_DEBUG
+inline void que_thr_t::set_active(bool active) { graph->set_active(active); }
+#endif
+
+/* Query fork (or graph) types */
+#define QUE_FORK_SELECT_NON_SCROLL 1 /* forward-only cursor */
+#define QUE_FORK_SELECT_SCROLL 2 /* scrollable cursor */
+#define QUE_FORK_INSERT 3
+#define QUE_FORK_UPDATE 4
+#define QUE_FORK_ROLLBACK 5
+ /* This is really the undo graph used in rollback,
+ no signal-sending roll_node in this graph */
+#define QUE_FORK_PURGE 6
+#define QUE_FORK_EXECUTE 7
+#define QUE_FORK_PROCEDURE 8
+#define QUE_FORK_PROCEDURE_CALL 9
+#define QUE_FORK_MYSQL_INTERFACE 10
+#define QUE_FORK_RECOVERY 11
+
+/* Query fork (or graph) states */
+#define QUE_FORK_ACTIVE 1
+#define QUE_FORK_COMMAND_WAIT 2
+#define QUE_FORK_INVALID 3
+#define QUE_FORK_BEING_FREED 4
+
+/* Flag which is ORed to control structure statement node types */
+#define QUE_NODE_CONTROL_STAT 1024
+
+#include "que0que.ic"
+
+#endif
diff --git a/storage/innobase/include/que0que.ic b/storage/innobase/include/que0que.ic
new file mode 100644
index 00000000..1c3ac242
--- /dev/null
+++ b/storage/innobase/include/que0que.ic
@@ -0,0 +1,293 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.ic
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+/***********************************************************************//**
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(thr);
+
+ return(thr->graph->trx);
+}
+
+/***********************************************************************//**
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+ que_fork_t* fork) /*!< in: query fork */
+{
+ return(UT_LIST_GET_FIRST(fork->thrs));
+}
+
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+ que_fork_t* fork) /*!< in: query fork */
+{
+ que_thr_t* thr;
+
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+
+ return(thr->child);
+}
+
+/***********************************************************************//**
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+ const que_node_t* node) /*!< in: graph node */
+{
+ return(reinterpret_cast<const que_common_t*>(node)->type);
+}
+
+/***********************************************************************//**
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(&(((que_common_t*) node)->val));
+}
+
+/***********************************************************************//**
+Gets the value buffer size of a graph node.
+@return val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(((que_common_t*) node)->val_buf_size);
+}
+
+/***********************************************************************//**
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+ que_node_t* node, /*!< in: graph node */
+ ulint size) /*!< in: size */
+{
+ ut_ad(node);
+
+ ((que_common_t*) node)->val_buf_size = size;
+}
+
+/***********************************************************************//**
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+ que_node_t* node, /*!< in: graph node */
+ que_node_t* parent) /*!< in: parent */
+{
+ ut_ad(node);
+
+ ((que_common_t*) node)->parent = parent;
+}
+
+/***********************************************************************//**
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(dfield_get_type(&((que_common_t*) node)->val));
+}
+
+/*********************************************************************//**
+Catenates a query graph node to a list of them, possibly empty list.
+@return one-way list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+ que_node_t* node_list, /*!< in: node list, or NULL */
+ que_node_t* node) /*!< in: node */
+{
+ que_common_t* cnode;
+ que_common_t* cnode2;
+
+ cnode = (que_common_t*) node;
+
+ cnode->brother = NULL;
+
+ if (node_list == NULL) {
+
+ return(node);
+ }
+
+ cnode2 = (que_common_t*) node_list;
+
+ while (cnode2->brother != NULL) {
+ cnode2 = (que_common_t*) cnode2->brother;
+ }
+
+ cnode2->brother = node;
+
+ return(node_list);
+}
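+
+/* For example (node names hypothetical), a one-way list is built by
+repeated calls, starting from an empty (NULL) list:
+
+  que_node_t* list = NULL;
+
+  list = que_node_list_add_last(list, node1);
+  list = que_node_list_add_last(list, node2);
+
+after which que_node_list_get_len(list) == 2 and
+que_node_list_get_last(list) == node2. */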
+
+/*********************************************************************//**
+Gets the last node from the list.
+@return last node in the list */
+UNIV_INLINE
+que_node_t*
+que_node_list_get_last(
+/*===================*/
+ que_node_t* node_list) /*!< in: node list */
+{
+ que_common_t* node;
+
+ ut_a(node_list != NULL);
+
+ node = (que_common_t*) node_list;
+
+ /* We need the last element */
+ while (node->brother != NULL) {
+ node = (que_common_t*) node->brother;
+ }
+
+ return(node);
+}
+
+/*********************************************************************//**
+Gets the next list node in a list of query graph nodes.
+@return next node in a list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+ que_node_t* node) /*!< in: node in a list */
+{
+ return(((que_common_t*) node)->brother);
+}
+
+/*********************************************************************//**
+Gets a query graph node list length.
+@return length, for NULL list 0 */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+ que_node_t* node_list) /*!< in: node list, or NULL */
+{
+ const que_common_t* cnode;
+ ulint len;
+
+ cnode = (const que_common_t*) node_list;
+ len = 0;
+
+ while (cnode != NULL) {
+ len++;
+ cnode = (const que_common_t*) cnode->brother;
+ }
+
+ return(len);
+}
+
+/*********************************************************************//**
+Gets the parent node of a query graph node.
+@return parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+ que_node_t* node) /*!< in: node */
+{
+ return(((que_common_t*) node)->parent);
+}
+
+/**********************************************************************//**
+Checks if graph, trx, or session is in a state where the query thread should
+be stopped.
+@return TRUE if should be stopped; NOTE that if the peek is made
+without reserving the trx mutex, then another peek with the mutex
+reserved is necessary before deciding the actual stopping */
+UNIV_INLINE
+ibool
+que_thr_peek_stop(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ que_t* graph;
+
+ graph = thr->graph;
+ trx = graph->trx;
+
+ if (graph->state != QUE_FORK_ACTIVE
+ || trx->lock.que_state == TRX_QUE_LOCK_WAIT
+ || (trx->lock.que_state != TRX_QUE_ROLLING_BACK
+ && trx->lock.que_state != TRX_QUE_RUNNING)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************************//**
+Returns TRUE if the query graph is for a SELECT statement.
+@return TRUE if a select */
+UNIV_INLINE
+ibool
+que_graph_is_select(
+/*================*/
+ que_t* graph) /*!< in: graph */
+{
+ if (graph->fork_type == QUE_FORK_SELECT_SCROLL
+ || graph->fork_type == QUE_FORK_SELECT_NON_SCROLL) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
diff --git a/storage/innobase/include/que0types.h b/storage/innobase/include/que0types.h
new file mode 100644
index 00000000..38f6e380
--- /dev/null
+++ b/storage/innobase/include/que0types.h
@@ -0,0 +1,97 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0types.h
+Query graph global types
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0types_h
+#define que0types_h
+
+#include "data0data.h"
+
+/* Pseudotype for all graph nodes */
+typedef void que_node_t;
+
+/* Query graph root is a fork node */
+typedef struct que_fork_t que_t;
+
+struct row_prebuilt_t;
+struct que_thr_t;
+
+/* Query graph node types */
+#define QUE_NODE_LOCK 1
+#define QUE_NODE_INSERT 2
+#define QUE_NODE_UPDATE 4
+#define QUE_NODE_CURSOR 5
+#define QUE_NODE_SELECT 6
+#define QUE_NODE_AGGREGATE 7
+#define QUE_NODE_FORK 8
+#define QUE_NODE_THR 9
+#define QUE_NODE_UNDO 10
+#define QUE_NODE_COMMIT 11
+#define QUE_NODE_ROLLBACK 12
+#define QUE_NODE_PURGE 13
+#define QUE_NODE_CREATE_TABLE 14
+#define QUE_NODE_CREATE_INDEX 15
+#define QUE_NODE_SYMBOL 16
+#define QUE_NODE_RES_WORD 17
+#define QUE_NODE_FUNC 18
+#define QUE_NODE_ORDER 19
+#define QUE_NODE_PROC (20 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_IF (21 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_WHILE (22 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_ASSIGNMENT 23
+#define QUE_NODE_FETCH 24
+#define QUE_NODE_OPEN 25
+#define QUE_NODE_COL_ASSIGNMENT 26
+#define QUE_NODE_FOR (27 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_RETURN 28
+#define QUE_NODE_ROW_PRINTF 29
+#define QUE_NODE_ELSIF 30
+#define QUE_NODE_CALL 31
+#define QUE_NODE_EXIT 32
+
+/* Common struct at the beginning of each query graph node; the name of this
+substruct must be 'common' */
+
+struct que_common_t{
+ ulint type; /*!< query node type */
+ que_node_t* parent; /*!< back pointer to parent node, or NULL */
+ que_node_t* brother;/*!< pointer to a possible brother node */
+ dfield_t val; /*!< evaluated value for an expression */
+ ulint val_buf_size;
+ /*!< buffer size for the evaluated value data,
+ if the buffer has been allocated dynamically:
+ if this field is != 0, and the node is a
+ symbol node or a function node, then we
+ have to free the data field in val
+ explicitly */
+
+ /** Constructor */
+ que_common_t(ulint type, que_node_t* parent) :
+ type(type), parent(parent), brother(NULL),
+ val(), val_buf_size(0)
+ {}
+};
+
+#endif
diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h
new file mode 100644
index 00000000..21143ab6
--- /dev/null
+++ b/storage/innobase/include/read0types.h
@@ -0,0 +1,293 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0types.h
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef read0types_h
+#define read0types_h
+
+#include "dict0mem.h"
+#include "trx0types.h"
+#include <algorithm>
+
+
+/**
+ Read view lists the trx ids of those transactions for which a consistent read
+ should not see the modifications to the database.
+*/
+class ReadViewBase
+{
+ /**
+ The read should not see any transaction with trx id >= this value.
+ In other words, this is the "high water mark".
+ */
+ trx_id_t m_low_limit_id;
+
+ /**
+ The read should see all trx ids which are strictly
+ smaller (<) than this value. In other words, this is the
+ "low water mark".
+ */
+ trx_id_t m_up_limit_id;
+
+ /** Set of RW transactions that were active when this snapshot was taken */
+ trx_ids_t m_ids;
+
+ /**
+ The view does not need to see the undo logs for transactions whose
+ transaction number is strictly smaller (<) than this value: they can be
+ removed in purge if not needed by other views.
+ */
+ trx_id_t m_low_limit_no;
+
+protected:
+ bool empty() { return m_ids.empty(); }
+
+ /** @return the up limit id */
+ trx_id_t up_limit_id() const { return m_up_limit_id; }
+
+public:
+ ReadViewBase(): m_low_limit_id(0) {}
+
+
+ /**
+ Append state from another view.
+
+ This method is used to find min(m_low_limit_no), min(m_low_limit_id) and
+ all transaction ids below min(m_low_limit_id). These values effectively
+ form the oldest view.
+
+ @param other view to copy from
+ */
+ void append(const ReadViewBase &other)
+ {
+ ut_ad(&other != this);
+ if (m_low_limit_no > other.m_low_limit_no)
+ m_low_limit_no= other.m_low_limit_no;
+ if (m_low_limit_id > other.m_low_limit_id)
+ m_low_limit_id= other.m_low_limit_id;
+
+ trx_ids_t::iterator dst= m_ids.begin();
+ for (const trx_id_t id : other.m_ids)
+ {
+ if (id >= m_low_limit_id)
+ break;
+loop:
+ if (dst == m_ids.end())
+ {
+ m_ids.push_back(id);
+ dst= m_ids.end();
+ continue;
+ }
+ if (*dst < id)
+ {
+ dst++;
+ goto loop;
+ }
+ else if (*dst > id)
+ dst= m_ids.insert(dst, id) + 1;
+ }
+ m_ids.erase(std::lower_bound(dst, m_ids.end(), m_low_limit_id),
+ m_ids.end());
+
+ m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front();
+ ut_ad(m_up_limit_id <= m_low_limit_id);
+ }
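+
+  /* Worked example with made-up ids: if this view has m_ids={5,9} and
+  m_low_limit_id=12, and 'other' has m_ids={5,7,15} and
+  m_low_limit_id=10, then after append() m_low_limit_id==10 and
+  m_ids=={5,7,9}: the id lists are merged in sorted order without
+  duplicates, ids >= the new low limit (here 15) are dropped, and
+  m_up_limit_id becomes m_ids.front()==5. */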
+
+
+ /**
+ Creates a snapshot where exactly the transactions serialized before this
+ point in time are seen in the view.
+
+ @param[in,out] trx transaction
+ */
+ inline void snapshot(trx_t *trx);
+
+
+ /**
+ Check whether transaction id is valid.
+ @param[in] id transaction id to check
+ @param[in] name table name
+
+ @todo changes_visible() was an unfortunate choice for this check.
+ It should be moved towards the functions that load trx id like
+ trx_read_trx_id(). No need to issue a warning; an error log message
+ should be enough, although the statement should ideally fail if it
+ sees corrupt data.
+ */
+ static void check_trx_id_sanity(trx_id_t id, const table_name_t &name);
+
+
+ /**
+ Check whether the changes by id are visible.
+ @param[in] id transaction id to check against the view
+ @param[in] name table name
+ @return whether the view sees the modifications of id.
+ */
+ bool changes_visible(trx_id_t id, const table_name_t &name) const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ if (id >= m_low_limit_id)
+ {
+ check_trx_id_sanity(id, name);
+ return false;
+ }
+ return id < m_up_limit_id ||
+ m_ids.empty() ||
+ !std::binary_search(m_ids.begin(), m_ids.end(), id);
+ }
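+
+  /* For instance (ids are hypothetical): with m_up_limit_id=10,
+  m_low_limit_id=20 and m_ids={12,15}, id=9 is visible (it committed
+  before the snapshot), id=12 is not (it was active at the snapshot),
+  id=13 is visible (not active at the snapshot, even though its id is
+  higher than an active one), and id=25 is not (it started after the
+  snapshot was taken). */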
+
+
+ /**
+ @param id transaction to check
+ @return true if view sees transaction id
+ */
+ bool sees(trx_id_t id) const { return id < m_up_limit_id; }
+
+ /** @return the low limit no */
+ trx_id_t low_limit_no() const { return m_low_limit_no; }
+
+ /** @return the low limit id */
+ trx_id_t low_limit_id() const { return m_low_limit_id; }
+};
+
+
+/** A ReadView with extra members required for trx_t::read_view. */
+class ReadView: public ReadViewBase
+{
+ /**
+ View state.
+
+ Implemented as atomic to allow mutex-free view close and re-use.
+ A non-owner thread is allowed to call is_open() alone, without mutex
+ protection. E.g. trx_sys.view_count() does this.
+
+ If a non-owner thread intends to access other members as well, both
+ the is_open() call and the other member accesses must be protected by
+ m_mutex. E.g. copy_to().
+ */
+ std::atomic<bool> m_open;
+
+ /** For synchronisation with purge coordinator. */
+ mutable ib_mutex_t m_mutex;
+
+ /**
+ trx id of creating transaction.
+ Used exclusively by the read view owner thread.
+ */
+ trx_id_t m_creator_trx_id;
+
+public:
+ ReadView(): m_open(false) { mutex_create(LATCH_ID_READ_VIEW, &m_mutex); }
+ ~ReadView() { mutex_free(&m_mutex); }
+
+
+ /**
+ Opens a read view where exactly the transactions serialized before this
+ point in time are seen in the view.
+
+ View becomes visible to purge thread. Intended to be called by the ReadView
+ owner thread.
+
+ @param[in,out] trx transaction
+ */
+ void open(trx_t *trx);
+
+
+ /**
+ Closes the view.
+
+ View becomes not visible to purge thread. Intended to be called by the
+ ReadView owner thread.
+ */
+ void close() { m_open.store(false, std::memory_order_relaxed); }
+
+
+ /** Returns true if view is open. */
+ bool is_open() const { return m_open.load(std::memory_order_relaxed); }
+
+
+ /**
+ Sets the creator transaction id.
+
+ This should be set only for views created by RW transactions.
+ Intended to be called by the ReadView owner thread.
+ */
+ void set_creator_trx_id(trx_id_t id)
+ {
+ ut_ad(id > 0);
+ ut_ad(m_creator_trx_id == 0);
+ m_creator_trx_id= id;
+ }
+
+
+ /**
+ Writes the limits to the file.
+ @param file file to write to
+ */
+ void print_limits(FILE *file) const
+ {
+ mutex_enter(&m_mutex);
+ if (is_open())
+ fprintf(file, "Trx read view will not see trx with"
+ " id >= " TRX_ID_FMT ", sees < " TRX_ID_FMT "\n",
+ low_limit_id(), up_limit_id());
+ mutex_exit(&m_mutex);
+ }
+
+
+ /**
+ A wrapper around ReadViewBase::changes_visible().
+ Intended to be called by the ReadView owner thread.
+ */
+ bool changes_visible(trx_id_t id, const table_name_t &name) const
+ { return id == m_creator_trx_id || ReadViewBase::changes_visible(id, name); }
+
+
+ /**
+ A wrapper around ReadViewBase::append().
+ Intended to be called by the purge coordinator task.
+ */
+ void append_to(ReadViewBase *to) const
+ {
+ mutex_enter(&m_mutex);
+ if (is_open())
+ to->append(*this);
+ mutex_exit(&m_mutex);
+ }
+
+
+ /**
+ Declare the object mostly inaccessible.
+ innodb_monitor_set_option also operates on freed transaction objects.
+ */
+ void mem_noaccess() const
+ {
+ MEM_NOACCESS(&m_open, sizeof m_open);
+ /* m_mutex is accessed by innodb_show_mutex_status()
+ and innodb_monitor_update() even after trx_t::free() */
+ MEM_NOACCESS(&m_creator_trx_id, sizeof m_creator_trx_id);
+ }
+};
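+
+/*
+  Minimal lifecycle sketch (illustrative only; error handling and purge
+  interaction omitted):
+
+    trx->read_view.open(trx);    // take a snapshot; visible to purge
+    ...
+    if (trx->read_view.changes_visible(id, name))
+      ...                        // id committed before the snapshot
+    ...
+    trx->read_view.close();      // on commit or rollback
+*/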
+#endif
diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h
new file mode 100644
index 00000000..8d770405
--- /dev/null
+++ b/storage/innobase/include/rem0cmp.h
@@ -0,0 +1,263 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.h
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef rem0cmp_h
+#define rem0cmp_h
+
+#include "data0data.h"
+#include "data0type.h"
+#include "rem0types.h"
+#include "page0types.h"
+
+/*************************************************************//**
+Returns TRUE if two columns are equal for comparison purposes.
+@return TRUE if the columns are considered equal in comparisons */
+ibool
+cmp_cols_are_equal(
+/*===============*/
+ const dict_col_t* col1, /*!< in: column 1 */
+ const dict_col_t* col2, /*!< in: column 2 */
+ ibool check_charsets); /*!< in: whether to check charsets */
+/** Compare two data fields.
+@param[in] mtype main type
+@param[in] prtype precise type
+@param[in] data1 data field
+@param[in] len1 length of data1 in bytes, or UNIV_SQL_NULL
+@param[in] data2 data field
+@param[in] len2 length of data2 in bytes, or UNIV_SQL_NULL
+@return the comparison result of data1 and data2
+@retval 0 if data1 is equal to data2
+@retval negative if data1 is less than data2
+@retval positive if data1 is greater than data2 */
+int
+cmp_data_data(
+ ulint mtype,
+ ulint prtype,
+ const byte* data1,
+ ulint len1,
+ const byte* data2,
+ ulint len2)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Compare two data fields.
+@param[in] dfield1 data field; must have type field set
+@param[in] dfield2 data field
+@return the comparison result of dfield1 and dfield2
+@retval 0 if dfield1 is equal to dfield2
+@retval negative if dfield1 is less than dfield2
+@retval positive if dfield1 is greater than dfield2 */
+UNIV_INLINE
+int
+cmp_dfield_dfield(
+/*==============*/
+ const dfield_t* dfield1,/*!< in: data field; must have type field set */
+ const dfield_t* dfield2);/*!< in: data field */
+
+#ifdef UNIV_DEBUG
+/** Compare a GIS data tuple to a physical record.
+@param[in] dtuple data tuple
+@param[in] rec R-tree record
+@param[in] mode compare mode
+@retval negative if dtuple is less than rec */
+int cmp_dtuple_rec_with_gis(const dtuple_t *dtuple, const rec_t *rec,
+ page_cur_mode_t mode)
+ MY_ATTRIBUTE((nonnull));
+#endif
+
+/** Compare two minimum bounding rectangles.
+@return 1, 0, or -1 if a is greater than, equal to, or less than b, respectively */
+inline int cmp_geometry_field(const void *a, const void *b)
+{
+ const byte *mbr1= static_cast<const byte*>(a);
+ const byte *mbr2= static_cast<const byte*>(b);
+
+ static_assert(SPDIMS == 2, "compatibility");
+ static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double), "compatibility");
+
+ /* Compare the lower left corners (xmin, ymin) first */
+ double x1= mach_double_read(mbr1);
+ double x2= mach_double_read(mbr2);
+ if (x1 > x2)
+ return 1;
+ if (x2 > x1)
+ return -1;
+
+ double y1= mach_double_read(mbr1 + sizeof(double) * SPDIMS);
+ double y2= mach_double_read(mbr2 + sizeof(double) * SPDIMS);
+
+ if (y1 > y2)
+ return 1;
+ if (y2 > y1)
+ return -1;
+
+ /* the lower left corners (xmin, ymin) are equal;
+ compare the upper right corners (xmax, ymax) */
+ x1= mach_double_read(mbr1 + sizeof(double));
+ x2= mach_double_read(mbr2 + sizeof(double));
+
+ if (x1 > x2)
+ return 1;
+ if (x2 > x1)
+ return -1;
+
+ y1= mach_double_read(mbr1 + sizeof(double) * 3);
+ y2= mach_double_read(mbr2 + sizeof(double) * 3);
+
+ if (y1 > y2)
+ return 1;
+ if (y2 > y1)
+ return -1;
+
+ return 0;
+}
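+
+/*
+  Illustrative sketch of the serialization the reads above assume: the
+  MBR is four doubles in the order xmin, xmax, ymin, ymax.  A writer
+  using mach_double_write(), the counterpart of mach_double_read(),
+  would fill the buffer like this:
+
+    byte mbr[DATA_MBR_LEN];
+    mach_double_write(mbr + 0 * sizeof(double), xmin);
+    mach_double_write(mbr + 1 * sizeof(double), xmax);
+    mach_double_write(mbr + 2 * sizeof(double), ymin);
+    mach_double_write(mbr + 3 * sizeof(double), ymax);
+*/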
+
+/** Compare a data tuple to a physical record.
+@param[in] dtuple data tuple
+@param[in] rec B-tree record
+@param[in] offsets rec_get_offsets(rec)
+@param[in] n_cmp number of fields to compare
+@param[in,out] matched_fields number of completely matched fields
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int
+cmp_dtuple_rec_with_match_low(
+ const dtuple_t* dtuple,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ ulint n_cmp,
+ ulint* matched_fields)
+ MY_ATTRIBUTE((nonnull));
+#define cmp_dtuple_rec_with_match(tuple,rec,offsets,fields) \
+ cmp_dtuple_rec_with_match_low( \
+ tuple,rec,offsets,dtuple_get_n_fields_cmp(tuple),fields)
+/** Compare a data tuple to a physical record.
+@param[in] dtuple data tuple
+@param[in] rec B-tree or R-tree index record
+@param[in] index index tree
+@param[in] offsets rec_get_offsets(rec)
+@param[in,out] matched_fields number of completely matched fields
+@param[in,out] matched_bytes number of matched bytes in the first
+field that is not matched
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int
+cmp_dtuple_rec_with_match_bytes(
+ const dtuple_t* dtuple,
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ ulint* matched_fields,
+ ulint* matched_bytes)
+ MY_ATTRIBUTE((warn_unused_result));
+/** Compare a data tuple to a physical record.
+@see cmp_dtuple_rec_with_match
+@param[in] dtuple data tuple
+@param[in] rec B-tree record
+@param[in] offsets rec_get_offsets(rec)
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int
+cmp_dtuple_rec(
+ const dtuple_t* dtuple,
+ const rec_t* rec,
+ const rec_offs* offsets);
+/**************************************************************//**
+Checks if a dtuple is a prefix of a record. The last field in dtuple
+is allowed to be a prefix of the corresponding field in the record.
+@return TRUE if prefix */
+ibool
+cmp_dtuple_is_prefix_of_rec(
+/*========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */
+/** Compare two physical records that contain the same number of columns,
+none of which are stored externally.
+@retval positive if rec1 (including non-ordering columns) is greater than rec2
+@retval negative if rec1 (including non-ordering columns) is less than rec2
+@retval 0 if rec1 is a duplicate of rec2 */
+int
+cmp_rec_rec_simple(
+/*===============*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const rec_offs* offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+ const rec_offs* offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+ const dict_index_t* index, /*!< in: data dictionary index */
+ struct TABLE* table) /*!< in: MySQL table, for reporting
+ duplicate key value if applicable,
+ or NULL */
+ MY_ATTRIBUTE((nonnull(1,2,3,4), warn_unused_result));
+
+/** Compare two B-tree or R-tree records.
+Only the common first fields are compared, and externally stored fields
+are treated as equal.
+@param[in] rec1 record (possibly not on an index page)
+@param[in] rec2 B-tree or R-tree record in an index page
+@param[in] offsets1 rec_get_offsets(rec1, index)
+@param[in] offsets2 rec_get_offsets(rec2, index)
+@param[in] nulls_unequal true if this is for index cardinality
+ statistics estimation with
+ innodb_stats_method=nulls_unequal
+ or innodb_stats_method=nulls_ignored
+@param[out] matched_fields number of completely matched fields,
+ i.e. the number of fields preceding the first field that differs
+@retval 0 if rec1 is equal to rec2
+@retval negative if rec1 is less than rec2
+@retval positive if rec1 is greater than rec2 */
+int
+cmp_rec_rec(
+ const rec_t* rec1,
+ const rec_t* rec2,
+ const rec_offs* offsets1,
+ const rec_offs* offsets2,
+ const dict_index_t* index,
+ bool nulls_unequal = false,
+ ulint* matched_fields = NULL)
+ MY_ATTRIBUTE((nonnull(1,2,3,4,5)));
+
+/** Compare two data fields.
+@param[in] dfield1 data field
+@param[in] dfield2 data field
+@return the comparison result of dfield1 and dfield2
+@retval 0 if dfield1 is equal to dfield2, or dfield2 is a prefix of dfield1
+@retval negative if dfield1 is less than dfield2
+@retval positive if dfield1 is greater than dfield2 */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_prefix(
+ const dfield_t* dfield1,
+ const dfield_t* dfield2);
+
+#include "rem0cmp.ic"
+
+#endif
diff --git a/storage/innobase/include/rem0cmp.ic b/storage/innobase/include/rem0cmp.ic
new file mode 100644
index 00000000..6e21382d
--- /dev/null
+++ b/storage/innobase/include/rem0cmp.ic
@@ -0,0 +1,107 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.ic
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#include <mysql_com.h>
+#include <my_sys.h>
+
+/** Compare two data fields.
+@param[in] dfield1 data field; must have type field set
+@param[in] dfield2 data field
+@return the comparison result of dfield1 and dfield2
+@retval 0 if dfield1 is equal to dfield2
+@retval negative if dfield1 is less than dfield2
+@retval positive if dfield1 is greater than dfield2 */
+UNIV_INLINE
+int
+cmp_dfield_dfield(
+ const dfield_t* dfield1,
+ const dfield_t* dfield2)
+{
+ const dtype_t* type;
+
+ ut_ad(dfield_check_typed(dfield1));
+
+ type = dfield_get_type(dfield1);
+
+ return(cmp_data_data(type->mtype, type->prtype,
+ (const byte*) dfield_get_data(dfield1),
+ dfield_get_len(dfield1),
+ (const byte*) dfield_get_data(dfield2),
+ dfield_get_len(dfield2)));
+}
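+
+/*
+  Minimal usage sketch (illustrative only): dfield1 must have its type
+  set; dfield2 is compared under that type.
+
+    if (cmp_dfield_dfield(dfield1, dfield2) == 0) {
+      // the two fields compare equal under dfield1's type
+    }
+*/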
+
+/** Compare two data fields.
+@param[in] dfield1 data field
+@param[in] dfield2 data field
+@return the comparison result of dfield1 and dfield2
+@retval 0 if dfield1 is equal to dfield2, or dfield2 is a prefix of dfield1
+@retval negative if dfield1 is less than dfield2
+@retval positive if dfield1 is greater than dfield2 */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_prefix(
+ const dfield_t* dfield1,
+ const dfield_t* dfield2)
+{
+ const dtype_t* type;
+
+ ut_ad(dfield_check_typed(dfield1));
+ ut_ad(dfield_check_typed(dfield2));
+
+ type = dfield_get_type(dfield1);
+
+#ifdef UNIV_DEBUG
+ switch (type->prtype & DATA_MYSQL_TYPE_MASK) {
+ case MYSQL_TYPE_BIT:
+ case MYSQL_TYPE_STRING:
+ case MYSQL_TYPE_VAR_STRING:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ case MYSQL_TYPE_VARCHAR:
+ break;
+ default:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ uint cs_num = (uint) dtype_get_charset_coll(type->prtype);
+
+ if (CHARSET_INFO* cs = get_charset(cs_num, MYF(MY_WME))) {
+ return(cs->strnncoll(
+ static_cast<const uchar*>(
+ dfield_get_data(dfield1)),
+ dfield_get_len(dfield1),
+ static_cast<const uchar*>(
+ dfield_get_data(dfield2)),
+ dfield_get_len(dfield2),
+ 1));
+ }
+
+ ib::fatal() << "Unable to find charset-collation " << cs_num;
+ return(0);
+}
diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h
new file mode 100644
index 00000000..dbcff3e6
--- /dev/null
+++ b/storage/innobase/include/rem0rec.h
@@ -0,0 +1,1299 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.h
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0rec_h
+#define rem0rec_h
+
+#ifndef UNIV_INNOCHECKSUM
+#include "data0data.h"
+#include "rem0types.h"
+#include "mtr0types.h"
+#include "page0types.h"
+#include "dict0dict.h"
+#include "trx0types.h"
+#endif /* !UNIV_INNOCHECKSUM */
+#include <ostream>
+#include <sstream>
+
+/* Number of extra bytes in an old-style record,
+in addition to the data and the offsets */
+#define REC_N_OLD_EXTRA_BYTES 6
+/* Number of extra bytes in a new-style record,
+in addition to the data and the offsets */
+#define REC_N_NEW_EXTRA_BYTES 5
+
+#define REC_NEW_STATUS 3 /* This is a single-byte bit-field */
+#define REC_NEW_STATUS_MASK 0x7UL
+#define REC_NEW_STATUS_SHIFT 0
+
+/* The following four constants are needed in page0zip.cc in order to
+efficiently compress and decompress pages. */
+
+/* The offset of heap_no in a compact record */
+#define REC_NEW_HEAP_NO 4
+/* The shift of heap_no in a compact record.
+The status is stored in the low-order bits. */
+#define REC_HEAP_NO_SHIFT 3
+
+/* Length of a B-tree node pointer, in bytes */
+#define REC_NODE_PTR_SIZE 4
+
+#ifndef UNIV_INNOCHECKSUM
+/** SQL null flag in a 1-byte offset of ROW_FORMAT=REDUNDANT records */
+constexpr rec_offs REC_1BYTE_SQL_NULL_MASK= 0x80;
+/** SQL null flag in a 2-byte offset of ROW_FORMAT=REDUNDANT records */
+constexpr rec_offs REC_2BYTE_SQL_NULL_MASK= 0x8000;
+
+/** In a 2-byte offset of ROW_FORMAT=REDUNDANT records, the second most
+significant bit denotes that the tail of a field is stored off-page. */
+constexpr rec_offs REC_2BYTE_EXTERN_MASK= 0x4000;
+
+constexpr size_t RECORD_OFFSET= 2;
+constexpr size_t INDEX_OFFSET=
+ RECORD_OFFSET + sizeof(rec_t *) / sizeof(rec_offs);
+#endif /* UNIV_INNOCHECKSUM */
+
+/* Length of the rec_get_offsets() header */
+constexpr size_t REC_OFFS_HEADER_SIZE=
+#ifdef UNIV_DEBUG
+#ifndef UNIV_INNOCHECKSUM
+ sizeof(rec_t *) / sizeof(rec_offs) +
+ sizeof(dict_index_t *) / sizeof(rec_offs) +
+#endif /* UNIV_INNOCHECKSUM */
+#endif /* UNIV_DEBUG */
+ 2;
+
+/* Number of elements that should be initially allocated for the
+offsets[] array, first passed to rec_get_offsets() */
+constexpr size_t REC_OFFS_NORMAL_SIZE= 300;
+constexpr size_t REC_OFFS_SMALL_SIZE= 18;
+constexpr size_t REC_OFFS_SEC_INDEX_SIZE=
+ /* PK max key parts */ 16 + /* sec idx max key parts */ 16 +
+ /* child page number for non-leaf pages */ 1;
+
+/** Get the base address of offsets. The extra_size is stored at
+this position, and following positions hold the end offsets of
+the fields. */
+#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE)
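+
+/*
+  Layout sketch of an offsets array, as implied by the definitions above
+  and the accessors below:
+
+    offsets[0]                     number of allocated elements
+    offsets[1]                     number of fields
+    ...                            debug-only rec and index pointers
+    rec_offs_base(offsets)[0]      extra_size, with the REC_OFFS_COMPACT,
+                                   REC_OFFS_EXTERNAL, REC_OFFS_DEFAULT
+                                   flags defined below
+    rec_offs_base(offsets)[1 + i]  end offset of field i, with type bits
+*/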
+
+#ifndef UNIV_INNOCHECKSUM
+/* An offset consists of two parts: the 2 upper bits are the type,
+and all other bits are the value */
+
+/** Only 4 different values are possible! */
+enum field_type_t
+{
+ /** normal field */
+ STORED_IN_RECORD= 0 << 14,
+ /** this field is stored off-page */
+ STORED_OFFPAGE= 1 << 14,
+ /** just an SQL NULL */
+ SQL_NULL= 2 << 14,
+ /** instantly added field */
+ DEFAULT= 3 << 14,
+};
+
+/** mask of the value part, without the 2 upper type bits */
+static constexpr rec_offs DATA_MASK= 0x3fff;
+/** mask of the 2 upper type bits */
+static constexpr rec_offs TYPE_MASK= ~DATA_MASK;
+inline field_type_t get_type(rec_offs n)
+{
+ return static_cast<field_type_t>(n & TYPE_MASK);
+}
+inline void set_type(rec_offs &n, field_type_t type)
+{
+ n= static_cast<rec_offs>((n & DATA_MASK) | type);
+}
+inline rec_offs get_value(rec_offs n) { return n & DATA_MASK; }
+inline rec_offs combine(rec_offs value, field_type_t type)
+{
+ return static_cast<rec_offs>(get_value(value) | type);
+}
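+
+/*
+  Worked example (illustrative only): with SQL_NULL == 2 << 14 == 0x8000,
+  the offset 0x8010 denotes an SQL NULL field whose end offset is 16
+  bytes into the record:
+
+    get_type(0x8010)        == SQL_NULL
+    get_value(0x8010)       == 0x10
+    combine(0x10, SQL_NULL) == 0x8010
+*/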
+
+/** Compact flag ORed to the extra size returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_COMPACT= rec_offs(~(rec_offs(~0) >> 1));
+/** External flag in offsets returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_EXTERNAL= REC_OFFS_COMPACT >> 1;
+/** Default value flag in offsets returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_DEFAULT= REC_OFFS_COMPACT >> 2;
+constexpr rec_offs REC_OFFS_MASK= REC_OFFS_DEFAULT - 1;
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+const rec_t*
+rec_get_next_ptr_const(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+rec_t*
+rec_get_next_ptr(
+/*=============*/
+ rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the offset of the
+next chained record on the same page.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint next) /*!< in: offset of the next record */
+ MY_ATTRIBUTE((nonnull));
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint next) /*!< in: offset of the next record */
+ MY_ATTRIBUTE((nonnull));
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index) /*!< in: record descriptor */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Confirm that the number of fields in the entry is sane
+by comparing the entry with another record on the same page.
+@param[in] index index
+@param[in] rec record on the same page
+@param[in] entry index entry
+@return true if n_fields is sane */
+UNIV_INLINE
+bool
+rec_n_fields_is_sane(
+ dict_index_t* index,
+ const rec_t* rec,
+ const dtuple_t* entry)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+ const rec_t* rec) /*!< in: old-style physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+ const rec_t* rec) /*!< in: new-style physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+The following function is used to retrieve the info bits of
+a record.
+@return info bits */
+UNIV_INLINE
+byte
+rec_get_info_bits(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Determine the status bits of a non-REDUNDANT record.
+@param[in] rec ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record
+@return status bits */
+inline
+rec_comp_status_t
+rec_get_status(const rec_t* rec)
+{
+ byte bits = rec[-REC_NEW_STATUS] & REC_NEW_STATUS_MASK;
+ ut_ad(bits <= REC_STATUS_INSTANT);
+ return static_cast<rec_comp_status_t>(bits);
+}
+
+/** Set the status bits of a non-REDUNDANT record.
+@param[in,out] rec ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record
+@param[in] bits status bits */
+inline void rec_set_status(rec_t *rec, byte bits)
+{
+ ut_ad(bits <= REC_STATUS_INSTANT);
+ rec[-REC_NEW_STATUS]= static_cast<byte>((rec[-REC_NEW_STATUS] &
+ ~REC_NEW_STATUS_MASK) | bits);
+}
+
+/** Get the length of added field count in a REC_STATUS_INSTANT record.
+@param[in] n_add_field number of added fields, minus one
+@return storage size of the field count, in bytes */
+inline unsigned rec_get_n_add_field_len(ulint n_add_field)
+{
+ ut_ad(n_add_field < REC_MAX_N_FIELDS);
+ return n_add_field < 0x80 ? 1 : 2;
+}
+
+/** Get the added field count in a REC_STATUS_INSTANT record.
+@param[in,out] header variable header of a REC_STATUS_INSTANT record
+@return number of added fields */
+inline unsigned rec_get_n_add_field(const byte*& header)
+{
+ unsigned n_fields_add = *--header;
+ if (n_fields_add < 0x80) {
+ ut_ad(rec_get_n_add_field_len(n_fields_add) == 1);
+ return n_fields_add;
+ }
+
+ n_fields_add &= 0x7f;
+ n_fields_add |= unsigned(*--header) << 7;
+ ut_ad(n_fields_add < REC_MAX_N_FIELDS);
+ ut_ad(rec_get_n_add_field_len(n_fields_add) == 2);
+ return n_fields_add;
+}
+
+/** Set the added field count in a REC_STATUS_INSTANT record.
+@param[in,out] header variable header of a REC_STATUS_INSTANT record;
+ on return, points just below the bytes written
+@param[in] n_add number of added fields, minus 1 */
+inline void rec_set_n_add_field(byte*& header, ulint n_add)
+{
+ ut_ad(n_add < REC_MAX_N_FIELDS);
+
+ if (n_add < 0x80) {
+ *header-- = byte(n_add);
+ } else {
+ *header-- = byte(byte(n_add) | 0x80);
+ *header-- = byte(n_add >> 7);
+ }
+}
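+
+/*
+  Worked encoding example (illustrative only).  The count (minus one) is
+  stored in one byte when it fits in 7 bits, otherwise in two bytes with
+  the first byte's top bit as a continuation marker and bits 7..14 in the
+  next byte, written backwards from the header:
+
+    n_add = 0x05  -> bytes: 0x05
+    n_add = 0x123 -> bytes: 0xA3, 0x02   (0x23 | 0x80, then 0x123 >> 7)
+*/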
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record. (Only compact records have status bits.)
+@return info and status bits */
+UNIV_INLINE
+byte
+rec_get_info_and_status_bits(
+/*=========================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record. (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+ rec_t* rec, /*!< in/out: compact physical record */
+ ulint bits) /*!< in: info bits */
+ MY_ATTRIBUTE((nonnull));
+
+/******************************************************//**
+The following function tells if record is delete marked.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return TRUE if node pointer */
+UNIV_INLINE
+bool
+rec_get_node_ptr_flag(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the order number
+of a new-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to test whether the data offsets
+in the record are stored in one-byte or two-byte format.
+@return TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+The following function is used to set the 1-byte offsets flag. */
+UNIV_INLINE
+void
+rec_set_1byte_offs_flag(
+/*====================*/
+ rec_t* rec, /*!< in: physical record */
+ ibool flag) /*!< in: TRUE if 1byte form */
+ MY_ATTRIBUTE((nonnull));
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the end of the field, SQL null flag ORed */
+UNIV_INLINE
+uint8_t
+rec_1_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the end of the field, SQL null flag and extern
+storage flag ORed */
+UNIV_INLINE
+uint16_t
+rec_2_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Returns nonzero if the field is stored off-page.
+@retval 0 if the field is stored in-page
+@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */
+UNIV_INLINE
+ulint
+rec_2_is_field_extern(
+/*==================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Determine how many of the first n columns in a compact
+physical record are stored externally.
+@return number of externally stored columns */
+ulint
+rec_get_n_extern_new(
+/*=================*/
+ const rec_t* rec, /*!< in: compact physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n) /*!< in: number of columns to scan */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Determine the offsets to each field in an index record.
+@param[in] rec physical record
+@param[in] index the index that the record belongs to
+@param[in,out] offsets array comprising offsets[0] allocated elements,
+ or an array from rec_get_offsets(), or NULL
+@param[in] n_core 0, or index->n_core_fields for leaf page
+@param[in] n_fields maximum number of offsets to compute
+ (ULINT_UNDEFINED to compute all offsets)
+@param[in,out] heap memory heap
+@return the new offsets */
+rec_offs*
+rec_get_offsets_func(
+ const rec_t* rec,
+ const dict_index_t* index,
+ rec_offs* offsets,
+ ulint n_core,
+ ulint n_fields,
+#ifdef UNIV_DEBUG
+ const char* file, /*!< in: file name where called */
+ unsigned line, /*!< in: line number where called */
+#endif /* UNIV_DEBUG */
+ mem_heap_t** heap) /*!< in/out: memory heap */
+#ifdef UNIV_DEBUG
+ MY_ATTRIBUTE((nonnull(1,2,6,8),warn_unused_result));
+#else /* UNIV_DEBUG */
+ MY_ATTRIBUTE((nonnull(1,2,6),warn_unused_result));
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+# define rec_get_offsets(rec, index, offsets, leaf, n, heap) \
+ rec_get_offsets_func(rec,index,offsets,leaf,n,__FILE__,__LINE__,heap)
+#else /* UNIV_DEBUG */
+# define rec_get_offsets(rec, index, offsets, leaf, n, heap) \
+ rec_get_offsets_func(rec, index, offsets, leaf, n, heap)
+#endif /* UNIV_DEBUG */
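+
+/*
+  Typical call pattern (illustrative sketch only).  Callers usually start
+  with a stack array of REC_OFFS_NORMAL_SIZE elements, initialized with
+  rec_offs_init() (defined below), and let rec_get_offsets() fall back to
+  a heap allocation when the array is too small:
+
+    mem_heap_t* heap = NULL;
+    rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+    rec_offs* offsets = offsets_;
+    rec_offs_init(offsets_);
+
+    offsets = rec_get_offsets(rec, index, offsets,
+                              index->n_core_fields,
+                              ULINT_UNDEFINED, &heap);
+    ...
+    if (heap) {
+      mem_heap_free(heap);
+    }
+*/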
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record. It can reuse a previously allocated array. */
+void
+rec_get_offsets_reverse(
+/*====================*/
+ const byte* extra, /*!< in: the extra bytes of a
+ compact record in reverse order,
+ excluding the fixed-size
+ REC_N_NEW_EXTRA_BYTES */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint node_ptr,/*!< in: nonzero=node pointer,
+ 0=leaf node */
+ rec_offs* offsets)/*!< in/out: array consisting of
+ offsets[0] allocated elements */
+ MY_ATTRIBUTE((nonnull));
+#ifdef UNIV_DEBUG
+/** Validate offsets returned by rec_get_offsets().
+@param[in] rec record, or NULL
+@param[in] index the index that the record belongs in, or NULL
+@param[in,out] offsets the offsets of the record
+@return true */
+bool
+rec_offs_validate(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets)
+ MY_ATTRIBUTE((nonnull(3), warn_unused_result));
+/** Update debug data in offsets, in order to tame rec_offs_validate().
+@param[in] rec record
+@param[in] index the index that the record belongs in
+@param[in] leaf whether the record resides in a leaf page
+@param[in,out] offsets offsets from rec_get_offsets() to adjust */
+void
+rec_offs_make_valid(
+ const rec_t* rec,
+ const dict_index_t* index,
+ bool leaf,
+ rec_offs* offsets)
+ MY_ATTRIBUTE((nonnull));
+#else
+# define rec_offs_make_valid(rec, index, leaf, offsets)
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+The following function is used to get the offset to the nth
+data field in an old-style record.
+@return offset to the field */
+ulint
+rec_get_nth_field_offs_old(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null */
+ MY_ATTRIBUTE((nonnull));
+#define rec_get_nth_field_old(rec, n, len) \
+((rec) + rec_get_nth_field_offs_old(rec, n, len))
+/************************************************************//**
+Gets the physical size of an old-style field.
+Also an SQL null may have a field of size > 0,
+if the data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: index of the field */
+ MY_ATTRIBUTE((warn_unused_result));
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return offset from the origin of rec */
+UNIV_INLINE
+rec_offs
+rec_get_nth_field_offs(
+/*===================*/
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null */
+ MY_ATTRIBUTE((nonnull));
+#define rec_get_nth_field(rec, offsets, n, len) \
+((rec) + rec_get_nth_field_offs(offsets, n, len))
+/******************************************************//**
+Determine if the offsets are for a record containing null BLOB pointers.
+@return first field containing a null BLOB pointer, or NULL if none found */
+UNIV_INLINE
+const byte*
+rec_offs_any_null_extern(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ const rec_offs* offsets) /*!< in: rec_get_offsets(rec) */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Mark the nth field as externally stored.
+@param[in] offsets array returned by rec_get_offsets()
+@param[in] n nth field */
+void
+rec_offs_make_nth_extern(
+ rec_offs* offsets,
+ const ulint n);
+
+MY_ATTRIBUTE((nonnull))
+/** Determine the number of allocated elements for an array of offsets.
+@param[in] offsets offsets after rec_offs_set_n_alloc()
+@return number of elements */
+inline ulint rec_offs_get_n_alloc(const rec_offs *offsets)
+{
+ ut_ad(offsets);
+ ulint n_alloc= offsets[0];
+ ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+ MEM_CHECK_ADDRESSABLE(offsets, n_alloc * sizeof *offsets);
+ return n_alloc;
+}
+
+/** Determine the number of fields for which offsets have been initialized.
+@param[in] offsets rec_get_offsets()
+@return number of fields */
+inline
+ulint
+rec_offs_n_fields(const rec_offs* offsets)
+{
+ ulint n_fields;
+ ut_ad(offsets);
+ n_fields = offsets[1];
+ ut_ad(n_fields > 0);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+ <= rec_offs_get_n_alloc(offsets));
+ return(n_fields);
+}
+
+/** Get a flag of a record field.
+@param[in] offsets rec_get_offsets()
+@param[in] n nth field
+@param[in] flag flag to extract
+@return type of the record field */
+inline field_type_t rec_offs_nth_type(const rec_offs *offsets, ulint n)
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ ut_ad(n < rec_offs_n_fields(offsets));
+ return get_type(rec_offs_base(offsets)[1 + n]);
+}
+
+/** Determine if a record field is missing
+(should be replaced by dict_index_t::instant_field_value()).
+@param[in] offsets rec_get_offsets()
+@param[in] n nth field
+@return nonzero if default bit is set */
+inline ulint rec_offs_nth_default(const rec_offs *offsets, ulint n)
+{
+ return rec_offs_nth_type(offsets, n) == DEFAULT;
+}
+
+/** Determine if a record field is SQL NULL
+(should be replaced by dict_index_t::instant_field_value()).
+@param[in] offsets rec_get_offsets()
+@param[in] n nth field
+@return nonzero if SQL NULL set */
+inline ulint rec_offs_nth_sql_null(const rec_offs *offsets, ulint n)
+{
+ return rec_offs_nth_type(offsets, n) == SQL_NULL;
+}
+
+/** Determine if a record field is stored off-page.
+@param[in] offsets rec_get_offsets()
+@param[in] n nth field
+@return nonzero if the extern bit is set in the nth field of rec */
+inline ulint rec_offs_nth_extern(const rec_offs *offsets, ulint n)
+{
+ return rec_offs_nth_type(offsets, n) == STORED_OFFPAGE;
+}
+
+/** Get a global flag of a record.
+@param[in] offsets rec_get_offsets()
+@param[in] flag flag to extract
+@return the flag of the record field */
+inline ulint rec_offs_any_flag(const rec_offs *offsets, ulint flag)
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ return *rec_offs_base(offsets) & flag;
+}
+
+/** Determine if the offsets are for a record containing off-page columns.
+@param[in] offsets rec_get_offsets()
+@return nonzero if any off-page columns exist */
+inline bool rec_offs_any_extern(const rec_offs *offsets)
+{
+ return rec_offs_any_flag(offsets, REC_OFFS_EXTERNAL);
+}
+
+/** Determine if the offsets are for a record that is missing fields.
+@param[in] offsets rec_get_offsets()
+@return nonzero if any fields need to be replaced with
+ dict_index_t::instant_field_value() */
+inline ulint rec_offs_any_default(const rec_offs *offsets)
+{
+ return rec_offs_any_flag(offsets, REC_OFFS_DEFAULT);
+}
+
+/** Determine if the offsets are for other than ROW_FORMAT=REDUNDANT.
+@param[in] offsets rec_get_offsets()
+@return nonzero if ROW_FORMAT is COMPACT,DYNAMIC or COMPRESSED
+@retval 0 if ROW_FORMAT=REDUNDANT */
+inline ulint rec_offs_comp(const rec_offs *offsets)
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ return (*rec_offs_base(offsets) & REC_OFFS_COMPACT);
+}
+
+/** Determine if the record is the metadata pseudo-record
+in the clustered index for instant ADD COLUMN or ALTER TABLE.
+@param[in] rec leaf page record
+@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero
+@return whether the record is the metadata pseudo-record */
+inline bool rec_is_metadata(const rec_t* rec, ulint comp)
+{
+ bool is = !!(rec_get_info_bits(rec, comp) & REC_INFO_MIN_REC_FLAG);
+ ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT);
+ return is;
+}
+
+/** Determine if the record is the metadata pseudo-record
+in the clustered index for instant ADD COLUMN or ALTER TABLE.
+@param[in] rec leaf page record
+@param[in] index index of the record
+@return whether the record is the metadata pseudo-record */
+inline bool rec_is_metadata(const rec_t* rec, const dict_index_t& index)
+{
+ bool is = rec_is_metadata(rec, dict_table_is_comp(index.table));
+ ut_ad(!is || index.is_instant());
+ return is;
+}
+
+/** Determine if the record is the metadata pseudo-record
+in the clustered index for instant ADD COLUMN (not other ALTER TABLE).
+@param[in] rec leaf page record
+@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero
+@return whether the record is the metadata pseudo-record */
+inline bool rec_is_add_metadata(const rec_t* rec, ulint comp)
+{
+ bool is = rec_get_info_bits(rec, comp) == REC_INFO_MIN_REC_FLAG;
+ ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT);
+ return is;
+}
+
+/** Determine if the record is the metadata pseudo-record
+in the clustered index for instant ADD COLUMN (not other ALTER TABLE).
+@param[in] rec leaf page record
+@param[in] index index of the record
+@return whether the record is the metadata pseudo-record */
+inline bool rec_is_add_metadata(const rec_t* rec, const dict_index_t& index)
+{
+ bool is = rec_is_add_metadata(rec, dict_table_is_comp(index.table));
+ ut_ad(!is || index.is_instant());
+ return is;
+}
+
+/** Determine if the record is the metadata pseudo-record
+in the clustered index for instant ALTER TABLE (not plain ADD COLUMN).
+@param[in] rec leaf page record
+@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero
+@return whether the record is the ALTER TABLE metadata pseudo-record */
+inline bool rec_is_alter_metadata(const rec_t* rec, ulint comp)
+{
+ bool is = !(~rec_get_info_bits(rec, comp)
+ & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG));
+ ut_ad(!is || rec_is_metadata(rec, comp));
+ return is;
+}
+
+/** Determine if the record is the metadata pseudo-record
+in the clustered index for instant ALTER TABLE (not plain ADD COLUMN).
+@param[in] rec leaf page record
+@param[in] index index of the record
+@return whether the record is the ALTER TABLE metadata pseudo-record */
+inline bool rec_is_alter_metadata(const rec_t* rec, const dict_index_t& index)
+{
+ bool is = rec_is_alter_metadata(rec, dict_table_is_comp(index.table));
+ ut_ad(!is || index.is_dummy || index.is_instant());
+ return is;
+}
+
+/** Determine if a record is delete-marked (not a metadata pseudo-record).
+@param[in] rec record
+@param[in] comp nonzero if ROW_FORMAT!=REDUNDANT
+@return whether the record is a delete-marked user record */
+inline bool rec_is_delete_marked(const rec_t* rec, ulint comp)
+{
+ return (rec_get_info_bits(rec, comp)
+ & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG))
+ == REC_INFO_DELETED_FLAG;
+}
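+
+/*
+  Summary of the info-bit combinations tested by the predicates above
+  (derived from the code):
+
+    REC_INFO_MIN_REC_FLAG only                     ADD COLUMN metadata
+    REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG  ALTER TABLE metadata
+    REC_INFO_DELETED_FLAG only                     delete-marked user record
+*/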
+
+/** Get the nth field from an index.
+@param[in] rec index record
+@param[in] index index
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] n field number
+@param[out] len length of the field in bytes, or UNIV_SQL_NULL
+@return a read-only copy of the index field */
+inline
+const byte*
+rec_get_nth_cfield(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ ulint n,
+ ulint* len)
+{
+ /* Because this function may be invoked by innobase_rec_to_mysql()
+ for reporting a duplicate key during ALTER TABLE or
+ CREATE UNIQUE INDEX, and in that case the rec omits the fixed-size
+ header of 5 or 6 bytes, the check
+ rec_offs_validate(rec, index, offsets) must be avoided here. */
+ if (!rec_offs_nth_default(offsets, n)) {
+ return rec_get_nth_field(rec, offsets, n, len);
+ }
+ return index->instant_field_value(n, len);
+}
+
+/******************************************************//**
+Gets the physical size of a field.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+The following function sets the number of allocated elements
+for an array of offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+ rec_offs*offsets, /*!< out: array for rec_get_offsets(),
+ must be allocated */
+ ulint n_alloc) /*!< in: number of elements */
+ MY_ATTRIBUTE((nonnull));
+#define rec_offs_init(offsets) \
+ rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets)
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns the total size of record minus data size of record.
+The value returned by the function is the distance from record
+start to record origin in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+ const rec_t* rec, /*!< in: pointer to record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+ const rec_t* rec, /*!< in: pointer to record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+#else /* UNIV_DEBUG */
+# define rec_get_start(rec, offsets) ((rec) - rec_offs_extra_size(offsets))
+# define rec_get_end(rec, offsets) ((rec) + rec_offs_data_size(offsets))
+#endif /* UNIV_DEBUG */
+
+/** Copy a physical record to a buffer.
+@param[in] buf buffer
+@param[in] rec physical record
+@param[in] offsets array returned by rec_get_offsets()
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+ void* buf,
+ const rec_t* rec,
+ const rec_offs* offsets);
+
+/** Determine the size of a data tuple prefix in a temporary file.
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format
+@param[in] index clustered or secondary index
+@param[in] fields data fields
+@param[in] n_fields number of data fields
+@param[out] extra record header size
+@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT
+@return total size, in bytes */
+template<bool redundant_temp>
+ulint
+rec_get_converted_size_temp(
+ const dict_index_t* index,
+ const dfield_t* fields,
+ ulint n_fields,
+ ulint* extra,
+ rec_comp_status_t status = REC_STATUS_ORDINARY)
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Determine the offset to each field in temporary file.
+@param[in] rec temporary file record
+@param[in] index index that the record belongs to
+@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets)
+@param[in] n_core number of core fields (index->n_core_fields)
+@param[in] def_val default values for non-core fields
+@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT */
+void
+rec_init_offsets_temp(
+ const rec_t* rec,
+ const dict_index_t* index,
+ rec_offs* offsets,
+ ulint n_core,
+ const dict_col_t::def_t* def_val,
+ rec_comp_status_t status = REC_STATUS_ORDINARY)
+ MY_ATTRIBUTE((nonnull(1,2,3)));
+/** Determine the offset to each field in temporary file.
+@param[in] rec temporary file record
+@param[in] index index that the record belongs to
+@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets) */
+void
+rec_init_offsets_temp(
+ const rec_t* rec,
+ const dict_index_t* index,
+ rec_offs* offsets)
+ MY_ATTRIBUTE((nonnull));
+
+/** Convert a data tuple prefix to the temporary file format.
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format
+@param[out] rec record in temporary file format
+@param[in] index clustered or secondary index
+@param[in] fields data fields
+@param[in] n_fields number of data fields
+@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT */
+template<bool redundant_temp>
+void
+rec_convert_dtuple_to_temp(
+ rec_t* rec,
+ const dict_index_t* index,
+ const dfield_t* fields,
+ ulint n_fields,
+ rec_comp_status_t status = REC_STATUS_ORDINARY)
+ MY_ATTRIBUTE((nonnull));
+
+/**************************************************************//**
+Copies the first n fields of a physical record to a new physical record in
+a buffer.
+@return own: copied record */
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n_fields, /*!< in: number of fields
+ to copy */
+ byte** buf, /*!< in/out: memory buffer
+ for the copied prefix,
+ or NULL */
+ ulint* buf_size) /*!< in/out: buffer size */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************//**
+Builds a physical record out of a data tuple and
+stores it into the given buffer.
+@return pointer to the origin of physical record */
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+ byte* buf, /*!< in: start address of the
+ physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of
+ externally stored columns */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+ ulint data_size, /*!< in: data size */
+ ulint n_fields, /*!< in: number of fields */
+ ulint n_ext) /*!< in: number of externally stored columns */
+ MY_ATTRIBUTE((const));
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return total size */
+ulint
+rec_get_converted_size_comp_prefix(
+/*===============================*/
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra) /*!< out: extra size */
+ MY_ATTRIBUTE((warn_unused_result, nonnull(1,2)));
+
+/** Determine the size of a record in ROW_FORMAT=COMPACT.
+@param[in] index record descriptor. dict_table_is_comp()
+ is assumed to hold, even if it doesn't
+@param[in] tuple logical record
+@param[out] extra extra size
+@return total size */
+ulint
+rec_get_converted_size_comp(
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ ulint* extra)
+ MY_ATTRIBUTE((nonnull(1,2)));
+
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+ dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of externally stored columns */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+/** Copy the first n fields of a (copy of a) physical record to a data tuple.
+The fields are copied into the memory heap.
+@param[out] tuple data tuple
+@param[in] rec index record, or a copy thereof
+@param[in] index index of rec
+@param[in] n_core index->n_core_fields at the time rec was
+ copied, or 0 if non-leaf page record
+@param[in] n_fields number of fields to copy
+@param[in,out] heap memory heap */
+void
+rec_copy_prefix_to_dtuple(
+ dtuple_t* tuple,
+ const rec_t* rec,
+ const dict_index_t* index,
+ ulint n_core,
+ ulint n_fields,
+ mem_heap_t* heap)
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Validates the consistency of a physical record.
+@return TRUE if ok */
+ibool
+rec_validate(
+/*=========*/
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Prints an old-style physical record. */
+void
+rec_print_old(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec) /*!< in: physical record */
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Prints a spatial index record. */
+void
+rec_print_mbr_rec(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Prints a physical record. */
+void
+rec_print_new(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Prints a physical record. */
+void
+rec_print(
+/*======*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index) /*!< in: record descriptor */
+ MY_ATTRIBUTE((nonnull));
+
+/** Pretty-print a record.
+@param[in,out] o output stream
+@param[in] rec physical record
+@param[in] info rec_get_info_bits(rec)
+@param[in] offsets rec_get_offsets(rec) */
+void
+rec_print(
+ std::ostream& o,
+ const rec_t* rec,
+ ulint info,
+ const rec_offs* offsets);
+
+/** Wrapper for pretty-printing a record */
+struct rec_index_print
+{
+ /** Constructor */
+ rec_index_print(const rec_t* rec, const dict_index_t* index) :
+ m_rec(rec), m_index(index)
+ {}
+
+ /** Record */
+ const rec_t* m_rec;
+ /** Index */
+ const dict_index_t* m_index;
+};
+
+/** Display a record.
+@param[in,out] o output stream
+@param[in] r record to display
+@return the output stream */
+std::ostream&
+operator<<(std::ostream& o, const rec_index_print& r);
+
+/** Wrapper for pretty-printing a record */
+struct rec_offsets_print
+{
+ /** Constructor */
+ rec_offsets_print(const rec_t* rec, const rec_offs* offsets) :
+ m_rec(rec), m_offsets(offsets)
+ {}
+
+ /** Record */
+ const rec_t* m_rec;
+ /** Offsets to each field */
+ const rec_offs* m_offsets;
+};
+
+/** Display a record.
+@param[in,out] o output stream
+@param[in] r record to display
+@return the output stream */
+ATTRIBUTE_COLD
+std::ostream&
+operator<<(std::ostream& o, const rec_offsets_print& r);
+
+/** Pretty-printer of records and tuples */
+class rec_printer : public std::ostringstream {
+public:
+ /** Construct a pretty-printed record.
+ @param rec record with header
+ @param offsets rec_get_offsets(rec, ...) */
+ ATTRIBUTE_COLD
+ rec_printer(const rec_t* rec, const rec_offs* offsets)
+ :
+ std::ostringstream ()
+ {
+ rec_print(*this, rec,
+ rec_get_info_bits(rec, rec_offs_comp(offsets)),
+ offsets);
+ }
+
+ /** Construct a pretty-printed record.
+ @param rec record, possibly lacking header
+ @param info rec_get_info_bits(rec)
+ @param offsets rec_get_offsets(rec, ...) */
+ ATTRIBUTE_COLD
+ rec_printer(const rec_t* rec, ulint info, const rec_offs* offsets)
+ :
+ std::ostringstream ()
+ {
+ rec_print(*this, rec, info, offsets);
+ }
+
+ /** Construct a pretty-printed tuple.
+ @param tuple data tuple */
+ ATTRIBUTE_COLD
+ rec_printer(const dtuple_t* tuple)
+ :
+ std::ostringstream ()
+ {
+ dtuple_print(*this, tuple);
+ }
+
+ /** Construct a pretty-printed tuple.
+ @param field array of data tuple fields
+ @param n number of fields */
+ ATTRIBUTE_COLD
+ rec_printer(const dfield_t* field, ulint n)
+ :
+ std::ostringstream ()
+ {
+ dfield_print(*this, field, n);
+ }
+
+ /** Destructor */
+ ~rec_printer() override {}
+
+private:
+ /** Copy constructor */
+ rec_printer(const rec_printer& other);
+ /** Assignment operator */
+ rec_printer& operator=(const rec_printer& other);
+};
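+
+/* Usage sketch (illustrative, not part of the original header): rec_printer
+is a one-shot std::ostringstream, so the typical pattern is to stream its
+str() into a diagnostic message. The std::cerr sink here is an assumption
+for the example only. */
+#if 0
+	std::cerr << "record: " << rec_printer(rec, offsets).str() << '\n';
+#endif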
+
+
+# ifdef UNIV_DEBUG
+/** Read the DB_TRX_ID of a clustered index record.
+@param[in] rec clustered index record
+@param[in] index clustered index
+@return the value of DB_TRX_ID */
+trx_id_t
+rec_get_trx_id(
+ const rec_t* rec,
+ const dict_index_t* index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+# endif /* UNIV_DEBUG */
+
+/* Maximum lengths for the data in a physical record if the offsets
+are given in one byte (resp. two byte) format. */
+#define REC_1BYTE_OFFS_LIMIT 0x7FUL
+#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL
+
+/* The data size of a record must not be larger than this in the
+REDUNDANT row format, because we reserve the two uppermost bits of a
+two-byte offset for special purposes */
+#define REDUNDANT_REC_MAX_DATA_SIZE (16383)
+
+/* The data size of a record must be smaller than this in the
+COMPRESSED row format, because we reserve the two uppermost bits of a
+two-byte offset for special purposes */
+#define COMPRESSED_REC_MAX_DATA_SIZE (16384)
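+
+/* Illustration (assumed sketch, not part of the original header): reserving
+the two uppermost bits of a 16-bit offset leaves 14 bits for the data size,
+which is exactly where the two limits above come from. */
+#if 0
+static_assert(REDUNDANT_REC_MAX_DATA_SIZE == (1 << 14) - 1,
+	      "inclusive limit: largest value expressible in 14 bits");
+static_assert(COMPRESSED_REC_MAX_DATA_SIZE == (1 << 14),
+	      "exclusive limit: one past the largest 14-bit value");
+#endif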
+
+#ifdef WITH_WSREP
+int wsrep_rec_get_foreign_key(
+ byte *buf, /* out: extracted key */
+ ulint *buf_len, /* in/out: length of buf */
+ const rec_t* rec, /* in: physical record */
+ dict_index_t* index_for, /* in: index for foreign table */
+ dict_index_t* index_ref, /* in: index for referenced table */
+ ibool new_protocol); /* in: protocol > 1 */
+#endif /* WITH_WSREP */
+
+#include "rem0rec.ic"
+
+#endif /* !UNIV_INNOCHECKSUM */
+#endif /* rem0rec_h */
diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic
new file mode 100644
index 00000000..30c72a74
--- /dev/null
+++ b/storage/innobase/include/rem0rec.ic
@@ -0,0 +1,1204 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.ic
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mach0data.h"
+#include "ut0byte.h"
+#include "dict0boot.h"
+#include "btr0types.h"
+
+/* Offsets of the bit-fields in an old-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+ (1) byte offset (2) bit usage within byte
+ downward from
+ origin -> 1 8 bits pointer to next record
+ 2 8 bits pointer to next record
+ 3 1 bit short flag
+ 7 bits number of fields
+ 4 3 bits number of fields
+ 5 bits heap number
+ 5 8 bits heap number
+ 6 4 bits n_owned
+ 4 bits info bits
+*/
+
+/* Offsets of the bit-fields in a new-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+ (1) byte offset (2) bit usage within byte
+ downward from
+ origin -> 1 8 bits relative offset of next record
+ 2 8 bits relative offset of next record
+ the relative offset is an unsigned 16-bit
+ integer:
+ (offset_of_next_record
+ - offset_of_this_record) mod 64Ki,
+ where mod is the modulo as a non-negative
+ number;
+ we can calculate the offset of the next
+ record with the formula:
+ (relative_offset + offset_of_this_record)
+ mod srv_page_size
+ 3 3 bits status:
+ 000=REC_STATUS_ORDINARY
+ 001=REC_STATUS_NODE_PTR
+ 010=REC_STATUS_INFIMUM
+ 011=REC_STATUS_SUPREMUM
+ 100=REC_STATUS_INSTANT
+ 101..111=reserved
+ 5 bits heap number
+ 4 8 bits heap number
+ 5 4 bits n_owned
+ 4 bits info bits
+*/
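+
+/* Decoding sketch (illustration only; the struct and function names below
+are assumptions for the example, the real accessors follow later in this
+file). It shows how the new-style header bits described above map onto the
+five bytes preceding the record origin, read in big-endian order: */
+#if 0
+#include <cstdint>
+struct rec_new_header_sketch {
+	uint16_t next_rel;	/* relative offset of the next record */
+	uint8_t	 status;	/* REC_STATUS_... value */
+	uint16_t heap_no;	/* heap number */
+	uint8_t	 n_owned;	/* number of owned records */
+	uint8_t	 info_bits;	/* info bits, kept in the high nibble */
+};
+inline rec_new_header_sketch decode_new_header(const unsigned char* rec)
+{
+	rec_new_header_sketch h;
+	h.next_rel  = uint16_t((rec[-2] << 8) | rec[-1]);
+	h.status    = uint8_t(rec[-3] & 0x07);
+	h.heap_no   = uint16_t((((rec[-4] << 8) | rec[-3]) & 0xFFF8) >> 3);
+	h.n_owned   = uint8_t(rec[-5] & 0x0F);
+	h.info_bits = uint8_t(rec[-5] & 0xF0);	/* shift is 0 by design */
+	return h;
+}
+#endif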
+
+/* We list the byte offsets from the origin of the record, the mask,
+and the shift needed to obtain each bit-field of the record. */
+
+#define REC_NEXT 2
+#define REC_NEXT_MASK 0xFFFFUL
+#define REC_NEXT_SHIFT 0
+
+#define REC_OLD_SHORT 3 /* This is a single-byte bit-field */
+#define REC_OLD_SHORT_MASK 0x1UL
+#define REC_OLD_SHORT_SHIFT 0
+
+#define REC_OLD_N_FIELDS 4
+#define REC_OLD_N_FIELDS_MASK 0x7FEUL
+#define REC_OLD_N_FIELDS_SHIFT 1
+
+#define REC_OLD_HEAP_NO 5
+#define REC_HEAP_NO_MASK 0xFFF8UL
+#if 0 /* defined in rem0rec.h for use of page0zip.cc */
+#define REC_NEW_HEAP_NO 4
+#define REC_HEAP_NO_SHIFT 3
+#endif
+
+#define REC_OLD_N_OWNED 6 /* This is a single-byte bit-field */
+#define REC_NEW_N_OWNED 5 /* This is a single-byte bit-field */
+#define REC_N_OWNED_MASK 0xFUL
+#define REC_N_OWNED_SHIFT 0
+
+#define REC_OLD_INFO_BITS 6 /* This is a single-byte bit-field */
+#define REC_NEW_INFO_BITS 5 /* This is a single-byte bit-field */
+#define REC_INFO_BITS_MASK 0xF0UL
+#define REC_INFO_BITS_SHIFT 0
+
+#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \
+ ^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \
+ ^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \
+ ^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \
+ ^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \
+ ^ 0xFFFFFFFFUL
+# error "sum of old-style masks != 0xFFFFFFFFUL"
+#endif
+#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \
+ ^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \
+ ^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \
+ ^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \
+ ^ 0xFFFFFFUL
+# error "sum of new-style masks != 0xFFFFFFUL"
+#endif
+
+/******************************************************//**
+Gets a bit field from within 1 byte. */
+UNIV_INLINE
+byte
+rec_get_bit_field_1(
+/*================*/
+ const rec_t* rec, /*!< in: pointer to record origin */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ return static_cast<byte>((*(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 1 byte. */
+UNIV_INLINE
+void
+rec_set_bit_field_1(
+/*================*/
+ rec_t* rec, /*!< in: pointer to record origin */
+ ulint val, /*!< in: value to set */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+ ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+ ut_ad(mask);
+ ut_ad(mask <= 0xFFUL);
+ ut_ad(((mask >> shift) << shift) == mask);
+ ut_ad(((val << shift) & mask) == (val << shift));
+
+ mach_write_to_1(rec - offs,
+ (mach_read_from_1(rec - offs) & ~mask)
+ | (val << shift));
+}
+
+/******************************************************//**
+Gets a bit field from within 2 bytes. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_2(
+/*================*/
+ const rec_t* rec, /*!< in: pointer to record origin */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+
+ return((mach_read_from_2(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 2 bytes. */
+UNIV_INLINE
+void
+rec_set_bit_field_2(
+/*================*/
+ rec_t* rec, /*!< in: pointer to record origin */
+ ulint val, /*!< in: value to set */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+ ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+ ut_ad(mask > 0xFFUL);
+ ut_ad(mask <= 0xFFFFUL);
+ ut_ad((mask >> shift) & 1);
+ ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1)));
+ ut_ad(((mask >> shift) << shift) == mask);
+ ut_ad(((val << shift) & mask) == (val << shift));
+
+ mach_write_to_2(rec - offs,
+ (mach_read_from_2(rec - offs) & ~mask)
+ | (val << shift));
+}
+
+/******************************************************//**
+The following function is used to get a pointer to the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+const rec_t*
+rec_get_next_ptr_const(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ ulint field_value;
+
+ compile_time_assert(REC_NEXT_MASK == 0xFFFFUL);
+ compile_time_assert(REC_NEXT_SHIFT == 0);
+
+ field_value = mach_read_from_2(rec - REC_NEXT);
+
+ if (field_value == 0) {
+
+ return(NULL);
+ }
+
+ if (comp) {
+#if UNIV_PAGE_SIZE_MAX <= 32768
+ /* Note that for 64 KiB pages, field_value can 'wrap around'
+ and the debug assertion is not valid */
+
+ /* In the following assertion, field_value is interpreted
+ as a signed 16-bit integer in two's complement arithmetic.
+ If all platforms defined int16_t in the standard headers,
+ the expression could be written more simply as
+ (int16_t) field_value + ut_align_offset(...) < srv_page_size
+ */
+ ut_ad((field_value >= 32768
+ ? field_value - 65536
+ : field_value)
+ + ut_align_offset(rec, srv_page_size)
+ < srv_page_size);
+#endif
+ /* There must be at least REC_N_NEW_EXTRA_BYTES + 1
+ bytes between records. */
+ ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
+ && field_value < 32768)
+ || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+ return((byte*) ut_align_down(rec, srv_page_size)
+ + ut_align_offset(rec + field_value, srv_page_size));
+ } else {
+ ut_ad(field_value < srv_page_size);
+
+ return((byte*) ut_align_down(rec, srv_page_size)
+ + field_value);
+ }
+}
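+
+/* Worked example (illustrative sketch; the helper below is an assumption,
+not part of this file): on a 16 KiB page, a record at page offset 0x3F70
+whose REC_NEXT field holds 0xC0A0 (i.e. -0x3F60 as int16_t) points to page
+offset (0x3F70 + 0xC0A0) mod 0x4000 = 0x0010. */
+#if 0
+constexpr unsigned next_rec_offset_sketch(unsigned this_offs, unsigned rel,
+					  unsigned page_size)
+{
+	/* (relative_offset + offset_of_this_record) mod srv_page_size;
+	page_size is a power of two */
+	return (this_offs + rel) & (page_size - 1);
+}
+static_assert(next_rec_offset_sketch(0x3F70, 0xC0A0, 0x4000) == 0x10,
+	      "the unsigned 16-bit value wraps around as described above");
+#endif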
+
+/******************************************************//**
+The following function is used to get a pointer to the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+rec_t*
+rec_get_next_ptr(
+/*=============*/
+ rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ return(const_cast<rec_t*>(rec_get_next_ptr_const(rec, comp)));
+}
+
+/******************************************************//**
+The following function is used to get the offset of the next chained record
+on the same page.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ ulint field_value;
+ compile_time_assert(REC_NEXT_MASK == 0xFFFFUL);
+ compile_time_assert(REC_NEXT_SHIFT == 0);
+
+ field_value = mach_read_from_2(rec - REC_NEXT);
+
+ if (comp) {
+#if UNIV_PAGE_SIZE_MAX <= 32768
+ /* Note that for 64 KiB pages, field_value can 'wrap around'
+ and the debug assertion is not valid */
+
+ /* In the following assertion, field_value is interpreted
+ as a signed 16-bit integer in two's complement arithmetic.
+ If all platforms defined int16_t in the standard headers,
+ the expression could be written more simply as
+ (int16_t) field_value + ut_align_offset(...) < srv_page_size
+ */
+ ut_ad((field_value >= 32768
+ ? field_value - 65536
+ : field_value)
+ + ut_align_offset(rec, srv_page_size)
+ < srv_page_size);
+#endif
+ if (field_value == 0) {
+
+ return(0);
+ }
+
+ /* There must be at least REC_N_NEW_EXTRA_BYTES + 1
+ bytes between records. */
+ ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
+ && field_value < 32768)
+ || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+ return(ut_align_offset(rec + field_value, srv_page_size));
+ } else {
+ ut_ad(field_value < srv_page_size);
+
+ return(field_value);
+ }
+}
+
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint next) /*!< in: offset of the next record */
+{
+ ut_ad(srv_page_size > next);
+ compile_time_assert(REC_NEXT_MASK == 0xFFFFUL);
+ compile_time_assert(REC_NEXT_SHIFT == 0);
+ mach_write_to_2(rec - REC_NEXT, next);
+}
+
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint next) /*!< in: offset of the next record */
+{
+ ulint field_value;
+
+ ut_ad(srv_page_size > next);
+
+ if (!next) {
+ field_value = 0;
+ } else {
+ /* The following two statements calculate
+ (next - offset_of_rec) mod 64Ki, where mod is the modulo
+ as a non-negative number */
+
+ field_value = (ulint)
+ ((lint) next
+ - (lint) ut_align_offset(rec, srv_page_size));
+ field_value &= REC_NEXT_MASK;
+ }
+
+ mach_write_to_2(rec - REC_NEXT, field_value);
+}
+
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ulint ret;
+
+ ut_ad(rec);
+
+ ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS,
+ REC_OLD_N_FIELDS_MASK,
+ REC_OLD_N_FIELDS_SHIFT);
+ ut_ad(ret <= REC_MAX_N_FIELDS);
+ ut_ad(ret > 0);
+
+ return(ret);
+}
+
+/******************************************************//**
+The following function is used to set the number of fields
+in an old-style record. */
+UNIV_INLINE
+void
+rec_set_n_fields_old(
+/*=================*/
+ rec_t* rec, /*!< in: physical record */
+ ulint n_fields) /*!< in: the number of fields */
+{
+ ut_ad(rec);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields > 0);
+
+ rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS,
+ REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index) /*!< in: record descriptor */
+{
+ ut_ad(rec);
+ ut_ad(index);
+
+ if (!dict_table_is_comp(index->table)) {
+ return(rec_get_n_fields_old(rec));
+ }
+
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_ORDINARY:
+ return(dict_index_get_n_fields(index));
+ case REC_STATUS_NODE_PTR:
+ return(dict_index_get_n_unique_in_tree(index) + 1);
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ return(1);
+ }
+
+ ut_error;
+ return(ULINT_UNDEFINED);
+}
+
+/** Confirms that the n_fields of the entry is sane by comparing it
+with another record on the same page
+@param[in] index index
+@param[in] rec record of the same page
+@param[in] entry index entry
+@return true if n_fields is sane */
+UNIV_INLINE
+bool
+rec_n_fields_is_sane(
+ dict_index_t* index,
+ const rec_t* rec,
+ const dtuple_t* entry)
+{
+ const ulint n_fields = rec_get_n_fields(rec, index);
+
+ return(n_fields == dtuple_get_n_fields(entry)
+ || (index->is_instant()
+ && n_fields >= index->n_core_fields)
+ /* a record of the older SYS_INDEXES table
+ (one missing the MERGE_THRESHOLD column) is acceptable. */
+ || (index->table->id == DICT_INDEXES_ID
+ && n_fields == dtuple_get_n_fields(entry) - 1));
+}
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+ const rec_t* rec) /*!< in: old-style physical record */
+{
+ return(rec_get_bit_field_1(rec, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+ const rec_t* rec) /*!< in: new-style physical record */
+{
+ return(rec_get_bit_field_1(rec, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to retrieve the info bits of a record.
+@return info bits */
+UNIV_INLINE
+byte
+rec_get_info_bits(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ return rec_get_bit_field_1(
+ rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record. (Only compact records have status bits.)
+@return info and status bits */
+UNIV_INLINE
+byte
+rec_get_info_and_status_bits(
+/*=========================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT)
+ & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+ if (comp)
+ return static_cast<byte>(rec_get_info_bits(rec, TRUE) |
+ rec_get_status(rec));
+ else
+ return rec_get_info_bits(rec, FALSE);
+}
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record. (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint bits) /*!< in: info and status bits */
+{
+ compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT)
+ & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+ rec_set_status(rec, bits & REC_NEW_STATUS_MASK);
+ rec_set_bit_field_1(rec, bits & ~REC_NEW_STATUS_MASK,
+ REC_NEW_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+The following function tells if record is delete marked.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ if (comp) {
+ return(rec_get_bit_field_1(rec, REC_NEW_INFO_BITS,
+ REC_INFO_DELETED_FLAG,
+ REC_INFO_BITS_SHIFT));
+ } else {
+ return(rec_get_bit_field_1(rec, REC_OLD_INFO_BITS,
+ REC_INFO_DELETED_FLAG,
+ REC_INFO_BITS_SHIFT));
+ }
+}
+
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return TRUE if node pointer */
+UNIV_INLINE
+bool
+rec_get_node_ptr_flag(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(REC_STATUS_NODE_PTR == rec_get_status(rec));
+}
+
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(rec_get_bit_field_2(rec, REC_OLD_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to get the order number
+of a new-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(rec_get_bit_field_2(rec, REC_NEW_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to test whether the data offsets in the record
+are stored in one-byte or two-byte format.
+@return TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK,
+ REC_OLD_SHORT_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the 1-byte offsets flag. */
+UNIV_INLINE
+void
+rec_set_1byte_offs_flag(
+/*====================*/
+ rec_t* rec, /*!< in: physical record */
+ ibool flag) /*!< in: TRUE if 1-byte form */
+{
+ ut_ad(flag <= 1);
+
+ rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK,
+ REC_OLD_SHORT_SHIFT);
+}
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the field, SQL null flag ORed */
+UNIV_INLINE
+uint8_t
+rec_1_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1)));
+}
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the field, SQL null flag and extern
+storage flag ORed */
+UNIV_INLINE
+uint16_t
+rec_2_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2)));
+}
+
+/******************************************************//**
+Returns nonzero if the field is stored off-page.
+@retval 0 if the field is stored in-page
+@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */
+UNIV_INLINE
+ulint
+rec_2_is_field_extern(
+/*==================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ return(rec_2_get_field_end_info(rec, n) & REC_2BYTE_EXTERN_MASK);
+}
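+
+/* Bit layout sketch (illustration only; the helper names are assumptions):
+in the 1-byte form, bit 7 is the SQL null flag and bits 0..6 hold the end
+offset (hence REC_1BYTE_OFFS_LIMIT = 0x7F). In the 2-byte form, bit 15 is
+the SQL null flag, bit 14 the extern flag, and bits 0..13 the end offset. */
+#if 0
+constexpr bool	   sketch_1byte_is_null(unsigned info) { return info & 0x80; }
+constexpr unsigned sketch_1byte_end_offs(unsigned info) { return info & 0x7F; }
+static_assert(sketch_1byte_is_null(0x83) && sketch_1byte_end_offs(0x83) == 3,
+	      "an SQL null field whose end offset is 3");
+#endif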
+
+/**********************************************************//**
+The following function sets the number of allocated elements
+for an array of offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+ rec_offs* offsets, /*!< out: array for rec_get_offsets(),
+ must be allocated */
+ ulint n_alloc) /*!< in: number of elements */
+{
+ ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+ MEM_UNDEFINED(offsets, n_alloc * sizeof *offsets);
+ offsets[0] = static_cast<rec_offs>(n_alloc);
+}
+
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return offset from the origin of rec */
+UNIV_INLINE
+rec_offs
+rec_get_nth_field_offs(
+/*===================*/
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null; UNIV_SQL_DEFAULT if default value */
+{
+ ut_ad(n < rec_offs_n_fields(offsets));
+
+ rec_offs offs = n == 0 ? 0 : get_value(rec_offs_base(offsets)[n]);
+ rec_offs next_offs = rec_offs_base(offsets)[1 + n];
+
+ if (get_type(next_offs) == SQL_NULL) {
+ *len = UNIV_SQL_NULL;
+ } else if (get_type(next_offs) == DEFAULT) {
+ *len = UNIV_SQL_DEFAULT;
+ } else {
+ *len = get_value(next_offs) - offs;
+ }
+
+ return(offs);
+}
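+
+/* Usage sketch (illustrative; assumes a caller that has already built the
+offsets array): the usual access path is the rec_get_nth_field() macro,
+which combines this function with pointer arithmetic on rec. */
+#if 0
+	ulint		len;
+	/* offsets must come from rec_get_offsets(rec, index, ...) */
+	const byte*	data = rec_get_nth_field(rec, offsets, 2, &len);
+	if (len == UNIV_SQL_NULL) {
+		/* field 2 is SQL NULL; data points at its zero-length start */
+	}
+#endif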
+
+/******************************************************//**
+Determine if the offsets are for a record containing null BLOB pointers.
+@return first field containing a null BLOB pointer, or NULL if none found */
+UNIV_INLINE
+const byte*
+rec_offs_any_null_extern(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ const rec_offs* offsets) /*!< in: rec_get_offsets(rec) */
+{
+ ulint i;
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(NULL);
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field
+ = rec_get_nth_field(rec, offsets, i, &len);
+
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ if (!memcmp(field + len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE)) {
+ return(field);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/******************************************************//**
+Gets the physical size of a field.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ ut_ad(n < rec_offs_n_fields(offsets));
+ if (!n) {
+ return get_value(rec_offs_base(offsets)[1 + n]);
+ }
+ return get_value((rec_offs_base(offsets)[1 + n]))
+ - get_value(rec_offs_base(offsets)[n]);
+}
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n = 0;
+
+ if (rec_offs_any_extern(offsets)) {
+ ulint i;
+
+ for (i = rec_offs_n_fields(offsets); i--; ) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ n++;
+ }
+ }
+ }
+
+ return(n);
+}
+
+/******************************************************//**
+Returns the offset of the (n - 1)th field end if the record is stored in the
+1-byte offsets form. If the field is SQL null, the flag is ORed in the
+returned value. This function and the 2-byte counterpart are defined here
+because the C compiler was not able to sum negative and positive constant
+offsets, and warned of constant arithmetic overflow within the compiler.
+@return offset of the start of the PREVIOUS field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_1_get_prev_field_end_info(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n)));
+}
+
+/******************************************************//**
+Returns the offset of the (n - 1)th field end if the record is stored in the
+2-byte offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the PREVIOUS field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_2_get_prev_field_end_info(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n)));
+}
+
+/******************************************************//**
+Sets the field end info for the nth field if the record is stored in the
+1-byte format. */
+UNIV_INLINE
+void
+rec_1_set_field_end_info(
+/*=====================*/
+ rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: field index */
+ ulint info) /*!< in: value to set */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info);
+}
+
+/******************************************************//**
+Sets the field end info for the nth field if the record is stored in the
+2-byte format. */
+UNIV_INLINE
+void
+rec_2_set_field_end_info(
+/*=====================*/
+ rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: field index */
+ ulint info) /*!< in: value to set */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info);
+}
+
+/******************************************************//**
+Returns the offset of nth field start if the record is stored in the 1-byte
+offsets form.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_1_get_field_start_offs(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ return(rec_1_get_prev_field_end_info(rec, n)
+ & ~REC_1BYTE_SQL_NULL_MASK);
+}
+
+/******************************************************//**
+Returns the offset of nth field start if the record is stored in the 2-byte
+offsets form.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_2_get_field_start_offs(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ return(rec_2_get_prev_field_end_info(rec, n)
+ & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK));
+}
+
+/******************************************************//**
+The following function is used to read the offset of the start of a data field
+in the record. The start of an SQL null field is the end offset of the
+previous non-null field, or 0, if none exists. If n is the number of the last
+field + 1, then the end offset of the last field is returned.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_get_field_start_offs(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec);
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ if (rec_get_1byte_offs_flag(rec)) {
+
+ return(rec_1_get_field_start_offs(rec, n));
+ }
+
+ return(rec_2_get_field_start_offs(rec, n));
+}
+
+/************************************************************//**
+Gets the physical size of an old-style field.
+An SQL null field may also have a size > 0,
+if its data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: index of the field */
+{
+ ulint os;
+ ulint next_os;
+
+ os = rec_get_field_start_offs(rec, n);
+ next_os = rec_get_field_start_offs(rec, n + 1);
+
+ ut_ad(next_os - os < srv_page_size);
+
+ return(next_os - os);
+}
+
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ut_ad(rec);
+
+ return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec)));
+}
+
+/**********************************************************//**
+The following function sets the number of fields in offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_fields(
+/*==================*/
+ rec_offs* offsets, /*!< in/out: array returned by
+ rec_get_offsets() */
+ ulint n_fields) /*!< in: number of fields */
+{
+ ut_ad(offsets);
+ ut_ad(n_fields > 0);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+ <= rec_offs_get_n_alloc(offsets));
+ offsets[1] = static_cast<rec_offs>(n_fields);
+}
+
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint size;
+
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ size = get_value(rec_offs_base(offsets)[rec_offs_n_fields(offsets)]);
+ ut_ad(size < srv_page_size);
+ return(size);
+}
+
+/**********************************************************//**
+Returns the total size of record minus data size of record. The value
+returned by the function is the distance from record start to record origin
+in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint size;
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ size = *rec_offs_base(offsets) & REC_OFFS_MASK;
+ ut_ad(size < srv_page_size);
+ return(size);
+}
+
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets));
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+ const rec_t* rec, /*!< in: pointer to record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ return(const_cast<rec_t*>(rec + rec_offs_data_size(offsets)));
+}
+
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+ const rec_t* rec, /*!< in: pointer to record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ return(const_cast<rec_t*>(rec - rec_offs_extra_size(offsets)));
+}
+#endif /* UNIV_DEBUG */
+
+/** Copy a physical record to a buffer.
+@param[in] buf buffer
+@param[in] rec physical record
+@param[in] offsets array returned by rec_get_offsets()
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+ void* buf,
+ const rec_t* rec,
+ const rec_offs* offsets)
+{
+ ulint extra_len;
+ ulint data_len;
+
+ ut_ad(rec != NULL);
+ ut_ad(buf != NULL);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_validate(rec, offsets));
+
+ extra_len = rec_offs_extra_size(offsets);
+ data_len = rec_offs_data_size(offsets);
+
+ memcpy(buf, rec - extra_len, extra_len + data_len);
+
+ return((byte*) buf + extra_len);
+}
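+
+/* Usage sketch (illustrative, assumed caller context): the destination
+buffer must hold the whole record, extra bytes included, so it is sized
+with rec_offs_size(). */
+#if 0
+	byte*	buf = static_cast<byte*>(
+		mem_heap_alloc(heap, rec_offs_size(offsets)));
+	rec_t*	copy = rec_copy(buf, rec, offsets);
+	/* copy points rec_offs_extra_size(offsets) bytes into buf */
+#endif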
+
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+ ulint data_size, /*!< in: data size */
+ ulint n_fields, /*!< in: number of fields */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) {
+
+ return(REC_N_OLD_EXTRA_BYTES + n_fields);
+ }
+
+ return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields);
+}
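+
+/* Worked example (illustrative sketch mirroring the function above;
+REC_N_OLD_EXTRA_BYTES is 6): 5 fields and 100 data bytes with no extern
+columns fit the 1-byte form, while 200 data bytes force the 2-byte form. */
+#if 0
+constexpr unsigned sketch_old_extra(unsigned data, unsigned n_fields,
+				    bool any_ext)
+{
+	return (!any_ext && data <= 0x7F /* REC_1BYTE_OFFS_LIMIT */)
+		? 6 + n_fields
+		: 6 + 2 * n_fields;
+}
+static_assert(sketch_old_extra(100, 5, false) == 11, "1-byte offsets form");
+static_assert(sketch_old_extra(200, 5, false) == 16, "2-byte offsets form");
+#endif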
+
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+ dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ ulint data_size;
+ ulint extra_size;
+
+ ut_ad(dtuple_check_typed(dtuple));
+#ifdef UNIV_DEBUG
+ if (dict_index_is_ibuf(index)) {
+ ut_ad(dtuple->n_fields > 1);
+ } else if ((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK)
+ == REC_STATUS_NODE_PTR) {
+ ut_ad(dtuple->n_fields - 1
+ == dict_index_get_n_unique_in_tree_nonleaf(index));
+ } else if (index->table->id == DICT_INDEXES_ID) {
+ /* The column SYS_INDEXES.MERGE_THRESHOLD was
+ instantly added in MariaDB 10.2.2 (MySQL 5.7). */
+ ut_ad(!index->table->is_temporary());
+ ut_ad(index->n_fields == DICT_NUM_FIELDS__SYS_INDEXES);
+ ut_ad(dtuple->n_fields == DICT_NUM_FIELDS__SYS_INDEXES
+ || dtuple->n_fields
+ == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD);
+ } else {
+ ut_ad(dtuple->n_fields >= index->n_core_fields);
+ ut_ad(dtuple->n_fields <= index->n_fields
+ || dtuple->is_alter_metadata());
+ }
+#endif
+
+ if (dict_table_is_comp(index->table)) {
+ return rec_get_converted_size_comp(index, dtuple, NULL);
+ }
+
+ data_size = dtuple_get_data_size(dtuple, 0);
+
+ /* If the primary key is being updated, then the new record
+ inherits externally stored fields from the delete-marked
+ old record. In that case, n_ext may be less than
+ dtuple_get_n_ext(dtuple). */
+ ut_ad(n_ext <= dtuple_get_n_ext(dtuple));
+ extra_size = rec_get_converted_extra_size(
+ data_size, dtuple_get_n_fields(dtuple), n_ext);
+
+ return(data_size + extra_size);
+}
diff --git a/storage/innobase/include/rem0types.h b/storage/innobase/include/rem0types.h
new file mode 100644
index 00000000..0e4075a9
--- /dev/null
+++ b/storage/innobase/include/rem0types.h
@@ -0,0 +1,78 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0types.h
+Record manager global types
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0types_h
+#define rem0types_h
+
+/* We define the physical record simply as an array of bytes */
+typedef byte rec_t;
+
+/** This type represents a field offset in a rec_t* */
+typedef unsigned short int rec_offs;
+
+/* Maximum values for various fields (for non-blob tuples) */
+#define REC_MAX_N_FIELDS (1024 - 1)
+#define REC_MAX_HEAP_NO (2 * 8192 - 1)
+#define REC_MAX_N_OWNED (16 - 1)
+
+/* Maximum number of user defined fields/columns. The reserved columns
+are the ones InnoDB adds internally: DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR.
+Before MariaDB Server 10.5, we needed "* 2" because mlog_parse_index()
+could create a dummy table object with some of the system columns
+already in it, and then added the 3 system columns (again) using
+dict_table_add_system_columns().
+For now, we will keep this limitation to maintain file format compatibility
+with older versions. */
+#define REC_MAX_N_USER_FIELDS (REC_MAX_N_FIELDS - DATA_N_SYS_COLS * 2)
+
+/* REC_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum
+indexed field length (or indexed prefix length) for indexes on tables of
+ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT format.
+Before we supported UTF-8 encodings with mbmaxlen = 4, a UTF-8 character
+took at most 3 bytes. So the limit was set to 3*256, so that one
+can create a column prefix index on 256 characters of a TEXT or VARCHAR
+column also in the UTF-8 charset.
+This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define REC_ANTELOPE_MAX_INDEX_COL_LEN 768
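+
+/* Illustration (assumed sketch): 768 = 3 bytes/character * 256 characters,
+matching the explanation above. */
+#if 0
+static_assert(REC_ANTELOPE_MAX_INDEX_COL_LEN == 3 * 256,
+	      "256-character UTF-8 prefix at 3 bytes per character");
+#endif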
+
+/** Maximum indexed field length for tables that have atomic BLOBs.
+This (3072) is the maximum index row length allowed, so we cannot create index
+prefix column longer than that. */
+#define REC_VERSION_56_MAX_INDEX_COL_LEN 3072
+
+/** InnoDB row types are a subset of the MySQL global enum row_type.
+They are made into their own enum so that switch statements can account
+for each of them. */
+enum rec_format_enum {
+ REC_FORMAT_REDUNDANT = 0, /*!< REDUNDANT row format */
+ REC_FORMAT_COMPACT = 1, /*!< COMPACT row format */
+ REC_FORMAT_COMPRESSED = 2, /*!< COMPRESSED row format */
+ REC_FORMAT_DYNAMIC = 3 /*!< DYNAMIC row format */
+};
+typedef enum rec_format_enum rec_format_t;
+
+#endif
diff --git a/storage/innobase/include/row0ext.h b/storage/innobase/include/row0ext.h
new file mode 100644
index 00000000..251f3125
--- /dev/null
+++ b/storage/innobase/include/row0ext.h
@@ -0,0 +1,101 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.h
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#ifndef row0ext_h
+#define row0ext_h
+
+#include "data0types.h"
+#include "mem0mem.h"
+#include "dict0types.h"
+#include "fsp0types.h"
+#include "row0types.h"
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return own: column prefix cache */
+row_ext_t*
+row_ext_create(
+/*===========*/
+ ulint n_ext, /*!< in: number of externally stored columns */
+ const ulint* ext, /*!< in: col_no's of externally stored columns
+ in the InnoDB table object, as reported by
+ dict_col_get_no(); NOT relative to the records
+ in the clustered index */
+ const dict_table_t& table, /*!< in: table */
+ const dtuple_t* tuple, /*!< in: data tuple containing the field
+ references of the externally stored
+ columns; must be indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch
+ to prevent deletion (rollback or purge). */
+ mem_heap_t* heap); /*!< in: heap where created */
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+ const row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint* len); /*!< out: length of prefix, in bytes,
+ at most the length determined by
+ DICT_MAX_FIELD_LEN_BY_FORMAT() */
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+ const row_ext_t* ext, /*!< in: column prefix cache */
+ ulint col, /*!< in: column number in the InnoDB
+ table object, as reported by
+ dict_col_get_no(); NOT relative to the
+ records in the clustered index */
+ ulint* len); /*!< out: length of prefix, in bytes,
+ at most the length determined by
+ DICT_MAX_FIELD_LEN_BY_FORMAT() */
+
+/** Prefixes of externally stored columns */
+struct row_ext_t{
+ ulint n_ext; /*!< number of externally stored columns */
+ const ulint* ext; /*!< col_no's of externally stored columns */
+ byte* buf; /*!< backing store of the column prefix cache */
+ ulint max_len;/*!< maximum prefix length, it could be
+ REC_ANTELOPE_MAX_INDEX_COL_LEN or
+ REC_VERSION_56_MAX_INDEX_COL_LEN depending
+ on row format */
+ ulint zip_size;/*!< ROW_FORMAT=COMPRESSED page size, or 0 */
+ ulint len[1]; /*!< prefix lengths; 0 if not cached */
+};
+
+#include "row0ext.ic"
+
+#endif
diff --git a/storage/innobase/include/row0ext.ic b/storage/innobase/include/row0ext.ic
new file mode 100644
index 00000000..913b51b3
--- /dev/null
+++ b/storage/innobase/include/row0ext.ic
@@ -0,0 +1,87 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.ic
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "rem0types.h"
+#include "btr0types.h"
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+ const row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint* len) /*!< out: length of prefix, in bytes,
+ at most ext->max_len */
+{
+ ut_ad(ext);
+ ut_ad(len);
+ ut_ad(i < ext->n_ext);
+
+ *len = ext->len[i];
+
+ ut_ad(*len <= ext->max_len);
+ ut_ad(ext->max_len > 0);
+
+ if (*len == 0) {
+ /* The BLOB could not be fetched to the cache. */
+ return(field_ref_zero);
+ } else {
+ return(ext->buf + i * ext->max_len);
+ }
+}
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+ const row_ext_t* ext, /*!< in: column prefix cache */
+ ulint col, /*!< in: column number in the InnoDB
+ table object, as reported by
+ dict_col_get_no(); NOT relative to the
+ records in the clustered index */
+ ulint* len) /*!< out: length of prefix, in bytes,
+ at most ext->max_len */
+{
+ ulint i;
+
+ ut_ad(ext);
+ ut_ad(len);
+
+ for (i = 0; i < ext->n_ext; i++) {
+ if (col == ext->ext[i]) {
+ return(row_ext_lookup_ith(ext, i, len));
+ }
+ }
+
+ return(NULL);
+}
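+
+/* Usage sketch (illustrative, assumed caller context): the three possible
+outcomes of a lookup are distinguished by the returned pointer. */
+#if 0
+	ulint		len;
+	const byte*	prefix = row_ext_lookup(ext, col_no, &len);
+	if (prefix == NULL) {
+		/* the column is not stored externally */
+	} else if (prefix == field_ref_zero) {
+		/* the BLOB pointer is unset; len == 0 */
+	} else {
+		/* len bytes of the column prefix are cached in prefix[] */
+	}
+#endif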
diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h
new file mode 100644
index 00000000..99c85601
--- /dev/null
+++ b/storage/innobase/include/row0ftsort.h
@@ -0,0 +1,265 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ftsort.h
+Create Full Text Index with (parallel) merge sort
+
+Created 10/13/2010 Jimmy Yang
+*******************************************************/
+
+#ifndef row0ftsort_h
+#define row0ftsort_h
+
+#include "data0data.h"
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "rem0types.h"
+#include "row0merge.h"
+#include "btr0bulk.h"
+#include "srv0srv.h"
+
+/** This structure defines the information that the scan thread fetches
+and puts into the linked list for the parallel tokenization/sort threads
+to process */
+typedef struct fts_doc_item fts_doc_item_t;
+
+/** A document item fetched by the scan thread for tokenization and sort */
+struct fts_doc_item {
+ dfield_t* field; /*!< field contains document string */
+ doc_id_t doc_id; /*!< document ID */
+ UT_LIST_NODE_T(fts_doc_item_t) doc_list;
+ /*!< list of doc items */
+};
+
+/** This defines the list type through which the scan thread feeds the
+parallel tokenization and sort threads. */
+typedef UT_LIST_BASE_NODE_T(fts_doc_item_t) fts_doc_list_t;
+
+#define FTS_PLL_MERGE 1
+
+/** Sort information passed to each individual parallel sort thread */
+struct fts_psort_t;
+
+/** Common info passed to each parallel sort thread */
+struct fts_psort_common_t {
+ row_merge_dup_t* dup; /*!< descriptor of the FTS index
+ being created */
+ dict_table_t* new_table; /*!< table where the FTS index
+ is being created */
+ /** Old table page size */
+ ulint old_zip_size;
+ trx_t* trx; /*!< transaction */
+ fts_psort_t* all_info; /*!< all parallel sort info */
+ os_event_t sort_event; /*!< sort event */
+ ibool opt_doc_id_size;/*!< whether to use a 4-byte
+ instead of an 8-byte integer to
+ store the Doc ID during sort,
+ when the Doc ID values are known
+ to fit in 4 bytes */
+};
+
+struct fts_psort_t {
+ ulint psort_id; /*!< Parallel sort ID */
+ row_merge_buf_t* merge_buf[FTS_NUM_AUX_INDEX];
+ /*!< sort buffer */
+ merge_file_t* merge_file[FTS_NUM_AUX_INDEX];
+ /*!< sort file */
+ row_merge_block_t* merge_block[FTS_NUM_AUX_INDEX];
+ /*!< buffer to write to file */
+ row_merge_block_t* crypt_block[FTS_NUM_AUX_INDEX];
+ /*!< buffer to crypt data */
+ ulint child_status; /*!< child task status */
+ ulint state; /*!< parent state */
+ fts_doc_list_t fts_doc_list; /*!< doc list to process */
+ fts_psort_common_t* psort_common; /*!< ptr to all psort info */
+ tpool::waitable_task* task; /*!< threadpool task */
+ dberr_t error; /*!< db error during psort */
+ ulint memory_used; /*!< memory used by fts_doc_list */
+ ib_mutex_t mutex; /*!< mutex for fts_doc_list */
+};
+
+/** Row fts token for plugin parser */
+struct row_fts_token_t {
+ fts_string_t* text; /*!< token */
+ UT_LIST_NODE_T(row_fts_token_t)
+ token_list; /*!< next token link */
+};
+
+typedef UT_LIST_BASE_NODE_T(row_fts_token_t) fts_token_list_t;
+
+/** Structure stores information from string tokenization operation */
+struct fts_tokenize_ctx {
+ ulint processed_len; /*!< processed string length */
+ ulint init_pos; /*!< doc start position */
+ ulint buf_used; /*!< the sort buffer (ID) in use
+ when tokenization stops, which
+ can happen when the sort buffer is full */
+ ulint rows_added[FTS_NUM_AUX_INDEX];
+ /*!< number of rows added for
+ each FTS index partition */
+ ib_rbt_t* cached_stopword;/*!< in: stopword list */
+ dfield_t sort_field[FTS_NUM_FIELDS_SORT];
+ /*!< in: sort field */
+ fts_token_list_t fts_token_list;
+
+ fts_tokenize_ctx() :
+ processed_len(0), init_pos(0), buf_used(0),
+ rows_added(), cached_stopword(NULL), sort_field(),
+ fts_token_list()
+ {
+ memset(rows_added, 0, sizeof rows_added);
+ memset(sort_field, 0, sizeof sort_field);
+ UT_LIST_INIT(fts_token_list, &row_fts_token_t::token_list);
+ }
+};
+
+typedef struct fts_tokenize_ctx fts_tokenize_ctx_t;
+
+/** Structure stores information needed for the insertion phase of FTS
+parallel sort. */
+struct fts_psort_insert {
+ CHARSET_INFO* charset; /*!< charset info */
+ mem_heap_t* heap; /*!< heap */
+ ibool opt_doc_id_size;/*!< Whether to use smaller (4 bytes)
+ integer for Doc ID */
+ BtrBulk* btr_bulk; /*!< Bulk load instance */
+ dtuple_t* tuple; /*!< Tuple to insert */
+
+#ifdef UNIV_DEBUG
+ ulint aux_index_id; /*!< Auxiliary index id */
+#endif
+};
+
+typedef struct fts_psort_insert fts_psort_insert_t;
+
+
+/** Status bits used for communication between the parent and child threads */
+#define FTS_PARENT_COMPLETE 1
+#define FTS_PARENT_EXITING 2
+#define FTS_CHILD_COMPLETE 1
+#define FTS_CHILD_EXITING 2
+
+/** Print some debug information */
+#define FTSORT_PRINT
+
+#ifdef FTSORT_PRINT
+#define DEBUG_FTS_SORT_PRINT(str) \
+ do { \
+ ut_print_timestamp(stderr); \
+ fprintf(stderr, str); \
+ } while (0)
+#else
+#define DEBUG_FTS_SORT_PRINT(str)
+#endif /* FTSORT_PRINT */
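+
+/* Usage sketch (illustrative): the do { } while (0) wrapper lets the macro
+behave like a single statement, e.g. in an unbraced if body. */
+#if 0
+	if (doc_id % 10000 == 0)
+		DEBUG_FTS_SORT_PRINT("FTS_SORT: batch processed\n");
+#endif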
+
+/*************************************************************//**
+Create a temporary "fts sort index" used to merge sort the
+tokenized doc string. The index has three "fields":
+
+1) Tokenized word,
+2) Doc ID,
+3) Word's position in the original 'doc'.
+
+@return dict_index_t structure for the fts sort index */
+dict_index_t*
+row_merge_create_fts_sort_index(
+/*============================*/
+ dict_index_t* index, /*!< in: Original FTS index
+ based on which this sort index
+ is created */
+ dict_table_t* table, /*!< in,out: table that FTS index
+ is being created on */
+ ibool* opt_doc_id_size);
+ /*!< out: whether to use a 4-byte
+ instead of an 8-byte integer to
+ store the Doc ID during sort */
+
+/** Initialize FTS parallel sort structures.
+@param[in] trx transaction
+@param[in,out] dup descriptor of FTS index being created
+@param[in] new_table table where indexes are created
+@param[in] opt_doc_id_size whether to use a 4-byte instead of an 8-byte
+ integer to store the Doc ID during sort
+@param[in] old_zip_size page size of the old table during alter
+@param[out] psort parallel sort info to be instantiated
+@param[out] merge parallel merge info to be instantiated
+@return true if all successful */
+bool
+row_fts_psort_info_init(
+ trx_t* trx,
+ row_merge_dup_t*dup,
+ dict_table_t* new_table,
+ bool opt_doc_id_size,
+ ulint old_zip_size,
+ fts_psort_t** psort,
+ fts_psort_t** merge)
+ MY_ATTRIBUTE((nonnull));
+
+/********************************************************************//**
+Clean up and deallocate FTS parallel sort structures, and close
+temporary merge sort files */
+void
+row_fts_psort_info_destroy(
+/*=======================*/
+ fts_psort_t* psort_info, /*!< parallel sort info */
+ fts_psort_t* merge_info); /*!< parallel merge info */
+/********************************************************************//**
+Free up merge buffers when merge sort is done */
+void
+row_fts_free_pll_merge_buf(
+/*=======================*/
+ fts_psort_t* psort_info); /*!< in: parallel sort info */
+
+/*********************************************************************//**
+Start the parallel tokenization and parallel merge sort */
+void
+row_fts_start_psort(
+/*================*/
+ fts_psort_t* psort_info); /*!< in: parallel sort info */
+/*********************************************************************//**
+Kick off the parallel merge and insert thread */
+void
+row_fts_start_parallel_merge(
+/*=========================*/
+ fts_psort_t* merge_info); /*!< in: parallel sort info */
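+
+/* Illustrative lifecycle sketch (not part of this header): the caller is
+expected to initialize the psort structures, launch the tokenization and
+merge tasks, and tear everything down afterwards. Assuming trx, dup,
+new_table, opt_doc_id_size and old_zip_size are already set up, the flow
+would roughly be:
+
+	fts_psort_t*	psort_info;
+	fts_psort_t*	merge_info;
+
+	if (row_fts_psort_info_init(trx, dup, new_table, opt_doc_id_size,
+				    old_zip_size, &psort_info, &merge_info)) {
+		row_fts_start_psort(psort_info);
+		row_fts_start_parallel_merge(merge_info);
+		...
+		row_fts_psort_info_destroy(psort_info, merge_info);
+	}
+*/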
+/********************************************************************//**
+Propagate a newly added record up one level in the selection tree
+@return the parent node to which this value was propagated */
+int
+row_merge_fts_sel_propagate(
+/*========================*/
+ int propagated, /*!< in: tree node propagated */
+ int* sel_tree, /*!< in: selection tree */
+ ulint level, /*!< in: selection tree level */
+ const mrec_t** mrec, /*!< in: sort record */
+ rec_offs** offsets, /*!< in: record offsets */
+ dict_index_t* index); /*!< in: FTS index */
+/********************************************************************//**
+Read sorted file containing index data tuples and insert these data
+tuples to the index
+@return DB_SUCCESS or error number */
+dberr_t
+row_fts_merge_insert(
+/*=================*/
+ dict_index_t* index, /*!< in: index */
+ dict_table_t* table, /*!< in: new table */
+ fts_psort_t* psort_info, /*!< parallel sort info */
+ ulint id) /*!< in: which auxiliary table's data
+ to insert to */
+ MY_ATTRIBUTE((nonnull));
+#endif /* row0ftsort_h */
diff --git a/storage/innobase/include/row0import.h b/storage/innobase/include/row0import.h
new file mode 100644
index 00000000..fd2651da
--- /dev/null
+++ b/storage/innobase/include/row0import.h
@@ -0,0 +1,67 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0import.h
+Header file for import tablespace functions.
+
+Created 2012-02-08 by Sunny Bains
+*******************************************************/
+
+#ifndef row0import_h
+#define row0import_h
+
+#include "dict0types.h"
+
+// Forward declarations
+struct trx_t;
+struct dict_table_t;
+struct row_prebuilt_t;
+
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+dberr_t
+row_import_for_mysql(
+/*=================*/
+ dict_table_t* table, /*!< in/out: table */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct
+ in MySQL */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Update the DICT_TF2_DISCARDED flag in SYS_TABLES.MIX_LEN.
+@param[in,out] trx dictionary transaction
+@param[in] table_id table identifier
+@param[in] discarded whether to set or clear the flag
+@return DB_SUCCESS or error code */
+dberr_t row_import_update_discarded_flag(trx_t* trx, table_id_t table_id,
+ bool discarded)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Update the root page numbers and tablespace ID of a table.
+@param[in,out] trx dictionary transaction
+@param[in,out] table persistent table
+@param[in] reset whether to reset the fields to FIL_NULL
+@return DB_SUCCESS or error code */
+dberr_t
+row_import_update_index_root(trx_t* trx, dict_table_t* table, bool reset)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#endif /* row0import_h */
diff --git a/storage/innobase/include/row0ins.h b/storage/innobase/include/row0ins.h
new file mode 100644
index 00000000..9a16394a
--- /dev/null
+++ b/storage/innobase/include/row0ins.h
@@ -0,0 +1,224 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ins.h
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0ins_h
+#define row0ins_h
+
+#include "data0data.h"
+#include "que0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include <vector>
+
+/***************************************************************//**
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_foreign_key_check_lock.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or
+DB_ROW_IS_REFERENCED */
+dberr_t
+row_ins_check_foreign_constraint(
+/*=============================*/
+ ibool check_ref,/*!< in: TRUE If we want to check that
+ the referenced table is ok, FALSE if we
+ want to check the foreign key table */
+ dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the
+ tables mentioned in it must be in the
+ dictionary cache if they exist at all */
+ dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign
+ table, else the referenced table */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; this
+function is quite slow. */
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /*!< in: insert node */
+ dtuple_t* row); /*!< in: new row (or first row) for the node */
+/***************************************************************//**
+Tries to insert an entry into a clustered index, ignoring foreign key
+constraints. If a record with the same unique key is found, the other
+record is necessarily marked deleted by a committed transaction, or a
+unique key violation error occurs. The delete marked record is then
+updated to an existing record, and we must write an undo log record on
+the delete marked record.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+dberr_t
+row_ins_clust_index_entry_low(
+/*==========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint n_uniq, /*!< in: 0 or index->n_uniq */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr) /*!< in: query thread or NULL */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/***************************************************************//**
+Tries to insert an entry into a secondary index. If a record with exactly the
+same fields is found, the other record is necessarily marked deleted.
+It is then unmarked. Otherwise, the entry is just inserted to the index.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+dberr_t
+row_ins_sec_index_entry_low(
+/*========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: secondary index */
+ mem_heap_t* offsets_heap,
+ /*!< in/out: memory heap that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during
+ row_log_table_apply(), or 0 */
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/***************************************************************//**
+Inserts an entry into a clustered index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+dberr_t
+row_ins_clust_index_entry(
+/*======================*/
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ ulint n_ext) /*!< in: number of externally stored columns */
+ MY_ATTRIBUTE((warn_unused_result));
+/***************************************************************//**
+Inserts an entry into a secondary index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+dberr_t
+row_ins_sec_index_entry(
+/*====================*/
+ dict_index_t* index, /*!< in: secondary index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ bool check_foreign = true) /*!< in: true if foreign key
+ checks are needed, false otherwise */
+ MY_ATTRIBUTE((warn_unused_result));
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in
+SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_ins_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/* Insert node types */
+#define INS_SEARCHED 0 /* INSERT INTO ... SELECT ... */
+#define INS_VALUES 1 /* INSERT INTO ... VALUES ... */
+#define INS_DIRECT 2 /* this is for internal use in dict0crea:
+ insert the row directly */
+
+/* Node execution states */
+#define INS_NODE_SET_IX_LOCK 1 /* we should set an IX lock on table */
+#define INS_NODE_ALLOC_ROW_ID 2 /* row id should be allocated */
+#define INS_NODE_INSERT_ENTRIES 3 /* index entries should be built and
+ inserted */
+
+struct row_prebuilt_t;
+
+/** Insert node structure */
+struct ins_node_t
+{
+ explicit ins_node_t(ulint ins_type, dict_table_t *table) :
+ common(QUE_NODE_INSERT, NULL),
+ ins_type(ins_type),
+ row(NULL), table(table), select(NULL), values_list(NULL),
+ state(INS_NODE_SET_IX_LOCK), index(NULL),
+ entry_list(), entry(entry_list.end()),
+ trx_id(0), entry_sys_heap(mem_heap_create(128))
+ {
+ }
+ que_common_t common; /*!< node type: QUE_NODE_INSERT */
+ ulint ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */
+ dtuple_t* row; /*!< row to insert */
+ dict_table_t* table; /*!< table where to insert */
+ sel_node_t* select; /*!< select in searched insert */
+ que_node_t* values_list;/* list of expressions to evaluate and
+ insert in an INS_VALUES insert */
+ ulint state; /*!< node execution state */
+ dict_index_t* index; /*!< NULL, or the next index where the index
+ entry should be inserted */
+ std::vector<dtuple_t*>
+ entry_list;/* list of entries, one for each index */
+ std::vector<dtuple_t*>::iterator
+ entry; /*!< entry_list.end(), or the entry to
+ insert in the index; after a successful
+ insert of the entry, this should be
+ reset to entry_list.end() */
+ /** buffer for the system columns */
+ byte sys_buf[DATA_ROW_ID_LEN
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+ trx_id_t trx_id; /*!< trx id or the last trx which executed the
+ node */
+ byte vers_start_buf[8]; /* buffer for the row_start system
+ versioning field */
+ byte vers_end_buf[8]; /* buffer for the row_end system
+ versioning field */
+ mem_heap_t* entry_sys_heap;
+ /* memory heap used as auxiliary storage;
+ entry_list and sys fields are stored here;
+ if this is NULL, entry list should be created
+ and buffers for sys fields in row allocated */
+ void vers_update_end(row_prebuilt_t *prebuilt, bool history_row);
+ bool vers_history_row() const; /* true if 'row' is historical */
+};
+
+/** Create an insert object.
+@param ins_type INS_VALUES, ...
+@param table table where to insert
+@param heap memory heap
+@return the created object */
+inline ins_node_t *ins_node_create(ulint ins_type, dict_table_t *table,
+ mem_heap_t *heap)
+{
+ return new (mem_heap_alloc(heap, sizeof(ins_node_t)))
+ ins_node_t(ins_type, table);
+}
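+
+/* Usage sketch (illustrative): an insert node is typically allocated on a
+query graph heap and, for the rare INS_DIRECT case, fed a separately built
+row via ins_node_set_new_row(). Assuming a valid table and row:
+
+	mem_heap_t*	heap = mem_heap_create(512);
+	ins_node_t*	node = ins_node_create(INS_DIRECT, table, heap);
+
+	ins_node_set_new_row(node, row);	// row constructed elsewhere
+*/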
+
+#endif
diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h
new file mode 100644
index 00000000..88fce314
--- /dev/null
+++ b/storage/innobase/include/row0log.h
@@ -0,0 +1,268 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0log.h
+Modification log for online index creation and online table rebuild
+
+Created 2011-05-26 Marko Makela
+*******************************************************/
+
+#ifndef row0log_h
+#define row0log_h
+
+#include "que0types.h"
+#include "mtr0types.h"
+#include "row0types.h"
+#include "rem0types.h"
+#include "data0types.h"
+#include "trx0types.h"
+
+class ut_stage_alter_t;
+
+extern Atomic_counter<ulint> onlineddl_rowlog_rows;
+extern ulint onlineddl_rowlog_pct_used;
+extern ulint onlineddl_pct_progress;
+
+/******************************************************//**
+Allocate the row log for an index and flag the index
+for online creation.
+@return true on success, false on failure */
+bool
+row_log_allocate(
+/*=============*/
+ const trx_t* trx, /*!< in: the ALTER TABLE transaction */
+ dict_index_t* index, /*!< in/out: index */
+ dict_table_t* table, /*!< in/out: new table being rebuilt,
+ or NULL when creating a secondary index */
+ bool same_pk,/*!< in: whether the definition of the
+ PRIMARY KEY has remained the same */
+ const dtuple_t* defaults,
+ /*!< in: default values of
+ added, changed columns, or NULL */
+ const ulint* col_map,/*!< in: mapping of old column
+ numbers to new ones, or NULL if !table */
+ const char* path, /*!< in: where to create temporary file */
+ const TABLE* old_table, /*!< in: table definition before alter */
+ bool allow_not_null) /*!< in: allow NULL to NOT NULL
+ conversion */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/******************************************************//**
+Free the row log for an index that was being created online. */
+void
+row_log_free(
+/*=========*/
+ row_log_t* log) /*!< in,own: row log */
+ MY_ATTRIBUTE((nonnull));
+
+/******************************************************//**
+Free the row log for an index on which online creation was aborted. */
+UNIV_INLINE
+void
+row_log_abort_sec(
+/*==============*/
+ dict_index_t* index) /*!< in/out: index (x-latched) */
+ MY_ATTRIBUTE((nonnull));
+
+/******************************************************//**
+Try to log an operation to a secondary index that is
+(or was) being created.
+@retval true if the operation was logged or can be ignored
+@retval false if online index creation is not taking place */
+UNIV_INLINE
+bool
+row_log_online_op_try(
+/*==================*/
+ dict_index_t* index, /*!< in/out: index, S or X latched */
+ const dtuple_t* tuple, /*!< in: index tuple */
+ trx_id_t trx_id) /*!< in: transaction ID for insert,
+ or 0 for delete */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************//**
+Logs an operation to a secondary index that is (or was) being created. */
+void
+row_log_online_op(
+/*==============*/
+ dict_index_t* index, /*!< in/out: index, S or X latched */
+ const dtuple_t* tuple, /*!< in: index tuple */
+ trx_id_t trx_id) /*!< in: transaction ID for insert,
+ or 0 for delete */
+ ATTRIBUTE_COLD __attribute__((nonnull));
+
+/******************************************************//**
+Gets the error status of the online index rebuild log.
+@return DB_SUCCESS or error code */
+dberr_t
+row_log_table_get_error(
+/*====================*/
+ const dict_index_t* index) /*!< in: clustered index of a table
+ that is being rebuilt online */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check whether a virtual column is indexed in the new table being
+created during alter table
+@param[in] index cluster index
+@param[in] v_no virtual column number
+@return true if it is indexed, else false */
+bool
+row_log_col_is_indexed(
+ const dict_index_t* index,
+ ulint v_no);
+
+/******************************************************//**
+Logs a delete operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_delete(). */
+void
+row_log_table_delete(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */
+ const byte* sys) /*!< in: DB_TRX_ID,DB_ROLL_PTR that should
+ be logged, or NULL to use those in rec */
+ ATTRIBUTE_COLD __attribute__((nonnull(1,2,3)));
+
+/******************************************************//**
+Logs an update operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_update(). */
+void
+row_log_table_update(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */
+ const dtuple_t* old_pk);/*!< in: row_log_table_get_pk()
+ before the update */
+
+/******************************************************//**
+Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR
+of a table that is being rebuilt.
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table,
+or NULL if the PRIMARY KEY definition does not change */
+const dtuple_t*
+row_log_table_get_pk(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index),
+ or NULL */
+ byte* sys, /*!< out: DB_TRX_ID,DB_ROLL_PTR for
+ row_log_table_delete(), or NULL */
+ mem_heap_t** heap) /*!< in/out: memory heap where allocated */
+ ATTRIBUTE_COLD __attribute__((nonnull(1,2,5), warn_unused_result));
+
+/******************************************************//**
+Logs an insert to a table that is being rebuilt.
+This will be merged in row_log_table_apply_insert(). */
+void
+row_log_table_insert(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets);/*!< in: rec_get_offsets(rec,index) */
+/******************************************************//**
+Notes that a BLOB is being freed during online ALTER TABLE. */
+void
+row_log_table_blob_free(
+/*====================*/
+ dict_index_t* index, /*!< in/out: clustered index, X-latched */
+ ulint page_no)/*!< in: starting page number of the BLOB */
+ ATTRIBUTE_COLD __attribute__((nonnull));
+/******************************************************//**
+Notes that a BLOB is being allocated during online ALTER TABLE. */
+void
+row_log_table_blob_alloc(
+/*=====================*/
+ dict_index_t* index, /*!< in/out: clustered index, X-latched */
+ ulint page_no)/*!< in: starting page number of the BLOB */
+ ATTRIBUTE_COLD __attribute__((nonnull));
+
+/** Apply the row_log_table log to a table upon completing rebuild.
+@param[in] thr query graph
+@param[in] old_table old table
+@param[in,out] table MySQL table (for reporting duplicates)
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_log_table() will be called initially and then
+stage->inc() will be called for each block of log that is applied.
+@param[in] new_table Altered table
+@return DB_SUCCESS, or error code on failure */
+dberr_t
+row_log_table_apply(
+ que_thr_t* thr,
+ dict_table_t* old_table,
+ struct TABLE* table,
+ ut_stage_alter_t* stage,
+ dict_table_t* new_table)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+Get the latest transaction ID that has invoked row_log_online_op()
+during online creation.
+@return latest transaction ID, or 0 if nothing was logged */
+trx_id_t
+row_log_get_max_trx(
+/*================*/
+ dict_index_t* index) /*!< in: index, must be locked */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Apply the row log to the index upon completing index creation.
+@param[in] trx transaction (for checking if the operation was
+interrupted)
+@param[in,out] index secondary index
+@param[in,out] table MySQL table (for reporting duplicates)
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_log_index() will be called initially and then
+stage->inc() will be called for each block of log that is applied.
+@return DB_SUCCESS, or error code on failure */
+dberr_t
+row_log_apply(
+ const trx_t* trx,
+ dict_index_t* index,
+ struct TABLE* table,
+ ut_stage_alter_t* stage)
+ MY_ATTRIBUTE((warn_unused_result));
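+
+/* Illustrative flow (not part of this header): for online creation of a
+secondary index, the log is allocated up front, concurrent DML is captured
+via row_log_online_op_try(), and the log is applied once the index build
+completes. Assuming path, old_table and allow_not_null are available:
+
+	if (row_log_allocate(trx, index, NULL, true, NULL, NULL,
+			     path, old_table, allow_not_null)) {
+		... // build the index; concurrent DML is logged
+		err = row_log_apply(trx, index, table, stage);
+	}
+*/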
+
+/** Get the n_core_fields of the online log for the index
+@param index index whose log's n_core_fields is to be accessed
+@return the n_core_fields value */
+unsigned row_log_get_n_core_fields(const dict_index_t *index);
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+/** Estimate how much work is to be done by the log apply phase
+of an ALTER TABLE for this index.
+@param[in] index index whose log to assess
+@return work to be done by log-apply in abstract units
+*/
+ulint
+row_log_estimate_work(
+ const dict_index_t* index);
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+#include "row0log.ic"
+
+#endif /* row0log_h */
diff --git a/storage/innobase/include/row0log.ic b/storage/innobase/include/row0log.ic
new file mode 100644
index 00000000..44d17bbc
--- /dev/null
+++ b/storage/innobase/include/row0log.ic
@@ -0,0 +1,84 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0log.ic
+Modification log for online index creation and online table rebuild
+
+Created 2012-10-18 Marko Makela
+*******************************************************/
+
+#include "dict0dict.h"
+
+/******************************************************//**
+Free the row log for an index on which online creation was aborted. */
+UNIV_INLINE
+void
+row_log_abort_sec(
+/*===============*/
+ dict_index_t* index) /*!< in/out: index (x-latched) */
+{
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+
+ ut_ad(!dict_index_is_clust(index));
+ dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+ row_log_free(index->online_log);
+ index->online_log = NULL;
+}
+
+/******************************************************//**
+Try to log an operation to a secondary index that is
+(or was) being created.
+@retval true if the operation was logged or can be ignored
+@retval false if online index creation is not taking place */
+UNIV_INLINE
+bool
+row_log_online_op_try(
+/*==================*/
+ dict_index_t* index, /*!< in/out: index, S or X latched */
+ const dtuple_t* tuple, /*!< in: index tuple */
+ trx_id_t trx_id) /*!< in: transaction ID for insert,
+ or 0 for delete */
+{
+ ut_ad(rw_lock_own_flagged(
+ dict_index_get_lock(index),
+ RW_LOCK_FLAG_S | RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_COMPLETE:
+ /* This is a normal index. Do not log anything.
+ The caller must perform the operation on the
+ index tree directly. */
+ return(false);
+ case ONLINE_INDEX_CREATION:
+ /* The index is being created online. Log the
+ operation. */
+ row_log_online_op(index, tuple, trx_id);
+ break;
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ /* The index was created online, but the operation was
+ aborted. Do not log the operation and tell the caller
+ to skip the operation. */
+ break;
+ }
+
+ return(true);
+}
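+
+/* Caller pattern (illustrative): a DML code path that maintains secondary
+indexes would first try to log the operation, and only touch the index
+tree when no online creation is in progress:
+
+	if (!row_log_online_op_try(index, tuple, trx_id)) {
+		... // apply the operation to the index tree directly
+	}
+*/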
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h
new file mode 100644
index 00000000..1d7f9bb1
--- /dev/null
+++ b/storage/innobase/include/row0merge.h
@@ -0,0 +1,464 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0merge.h
+Index build routines using a merge sort
+
+Created 13/06/2005 Jan Lindstrom
+*******************************************************/
+
+#ifndef row0merge_h
+#define row0merge_h
+
+#include "que0types.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "rem0rec.h"
+#include "btr0types.h"
+#include "row0mysql.h"
+#include "lock0types.h"
+#include "srv0srv.h"
+#include "ut0stage.h"
+
+/* Reserve free space from every block for key_version */
+#define ROW_MERGE_RESERVE_SIZE 4
+
+/* Clustered index read task is mandatory */
+#define COST_READ_CLUSTERED_INDEX 1.0
+
+/* Basic fixed cost to build any type of index */
+#define COST_BUILD_INDEX_STATIC 0.5
+/* Dynamic cost to build any type of index; the dynamic cost is
+re-distributed based on the page count ratio of each index */
+#define COST_BUILD_INDEX_DYNAMIC 0.5
+
+/* Sum of below two must be 1.0 */
+#define PCT_COST_MERGESORT_INDEX 0.4
+#define PCT_COST_INSERT_INDEX 0.6
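+
+/* Illustrative accounting (an assumption drawn from the constants above,
+not a statement of the exact implementation): for an index whose pages
+make up a fraction R of all index pages, the estimated build cost would
+combine roughly as
+
+	cost = COST_BUILD_INDEX_STATIC + COST_BUILD_INDEX_DYNAMIC * R;
+
+and within that, PCT_COST_MERGESORT_INDEX (0.4) of the work is attributed
+to the merge sort and PCT_COST_INSERT_INDEX (0.6) to the insert phase,
+summing to 1.0 as required. */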
+
+// Forward declaration
+struct ib_sequence_t;
+
+/** @brief Block size for I/O operations in merge sort.
+
+The minimum is srv_page_size, or page_get_free_space_of_empty()
+rounded to a power of 2.
+
+When not creating a PRIMARY KEY that contains column prefixes, this
+can be set as small as srv_page_size / 2. */
+typedef byte row_merge_block_t;
+
+/** @brief Secondary buffer for I/O operations of merge records.
+
+This buffer is used for writing or reading a record that spans two
+row_merge_block_t. Thus, it must be able to hold one merge record,
+whose maximum size is the same as the minimum size of
+row_merge_block_t. */
+typedef byte mrec_buf_t[UNIV_PAGE_SIZE_MAX];
+
+/** @brief Merge record in row_merge_block_t.
+
+The format is the same as a record in ROW_FORMAT=COMPACT with the
+exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
+typedef byte mrec_t;
+
+/** Merge record in row_merge_buf_t */
+struct mtuple_t {
+ dfield_t* fields; /*!< data fields */
+};
+
+/** Buffer for sorting in main memory. */
+struct row_merge_buf_t {
+ mem_heap_t* heap; /*!< memory heap where allocated */
+ dict_index_t* index; /*!< the index the tuples belong to */
+ ulint total_size; /*!< total amount of data bytes */
+ ulint n_tuples; /*!< number of data tuples */
+ ulint max_tuples; /*!< maximum number of data tuples */
+ mtuple_t* tuples; /*!< array of data tuples */
+ mtuple_t* tmp_tuples; /*!< temporary copy of tuples,
+ for sorting */
+};
+
+/** Information about temporary files used in merge sort */
+struct merge_file_t {
+ pfs_os_file_t fd; /*!< file descriptor */
+ ulint offset; /*!< file offset (end of file) */
+ ib_uint64_t n_rec; /*!< number of records in the file */
+};
+
+/** Index field definition */
+struct index_field_t {
+ ulint col_no; /*!< column offset */
+ ulint prefix_len; /*!< column prefix length, or 0
+ if indexing the whole column */
+ bool is_v_col; /*!< whether this is a virtual column */
+};
+
+/** Definition of an index being created */
+struct index_def_t {
+ const char* name; /*!< index name */
+ bool rebuild; /*!< whether the table is rebuilt */
+ ulint ind_type; /*!< 0, DICT_UNIQUE,
+ or DICT_CLUSTERED */
+ ulint key_number; /*!< MySQL key number,
+ or ULINT_UNDEFINED if none */
+ ulint n_fields; /*!< number of fields in index */
+ index_field_t* fields; /*!< field definitions */
+ st_mysql_ftparser*
+ parser; /*!< fulltext parser plugin */
+};
+
+/** Structure for reporting duplicate records. */
+struct row_merge_dup_t {
+ dict_index_t* index; /*!< index being sorted */
+ struct TABLE* table; /*!< MySQL table object */
+ const ulint* col_map;/*!< mapping of column numbers
+ in table to the rebuilt table
+ (index->table), or NULL if not
+ rebuilding table */
+ ulint n_dup; /*!< number of duplicates */
+};
+
+/*************************************************************//**
+Report a duplicate key. */
+void
+row_merge_dup_report(
+/*=================*/
+ row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
+ const dfield_t* entry) /*!< in: duplicate index entry */
+ MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Sets an exclusive lock on a table, for the duration of creating indexes.
+@return error code or DB_SUCCESS */
+dberr_t
+row_merge_lock_table(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */
+ MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
+
+/*********************************************************************//**
+Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+void
+row_merge_drop_indexes_dict(
+/*========================*/
+ trx_t* trx, /*!< in/out: dictionary transaction */
+ table_id_t table_id)/*!< in: table identifier */
+ MY_ATTRIBUTE((nonnull));
+
+/** Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@param trx dictionary transaction
+@param table table containing the indexes
+@param locked true if the table is locked;
+ if false, a lazy drop may be needed
+@param alter_trx Alter table transaction */
+void
+row_merge_drop_indexes(
+ trx_t* trx,
+ dict_table_t* table,
+ bool locked,
+ const trx_t* alter_trx=NULL);
+
+/*********************************************************************//**
+Drop all partially created indexes during crash recovery. */
+void
+row_merge_drop_temp_indexes(void);
+/*=============================*/
+
+/** Create a temporary merge file in the given path and, if UNIV_PFS_IO
+is defined, register the file descriptor with Performance Schema.
+@param[in] path location for creating temporary merge files, or NULL
+@return File descriptor */
+pfs_os_file_t
+row_merge_file_create_low(
+ const char* path)
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Destroy a merge file, and de-register it from Performance Schema
+if UNIV_PFS_IO is defined. */
+void
+row_merge_file_destroy_low(
+/*=======================*/
+ const pfs_os_file_t& fd); /*!< in: merge file descriptor */
+
+/*********************************************************************//**
+Rename an index in the dictionary that was created. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+dberr_t
+row_merge_rename_index_to_add(
+/*==========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ table_id_t table_id, /*!< in: table identifier */
+ index_id_t index_id) /*!< in: index identifier */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/*********************************************************************//**
+Rename an index in the dictionary that is to be dropped. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+dberr_t
+row_merge_rename_index_to_drop(
+/*===========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ table_id_t table_id, /*!< in: table identifier */
+ index_id_t index_id) /*!< in: index identifier */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Create the index and load in to the dictionary.
+@param[in,out] table the index is on this table
+@param[in] index_def the index definition
+@param[in] add_v new virtual columns added along with the
+ ADD INDEX call
+@return index, or NULL on error */
+dict_index_t*
+row_merge_create_index(
+ dict_table_t* table,
+ const index_def_t* index_def,
+ const dict_add_v_col_t* add_v)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Check if a transaction can use an index.
+@return whether the index can be used by the transaction */
+bool
+row_merge_is_index_usable(
+/*======================*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index) /*!< in: index to check */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Drop a table. The caller must have ensured that the background stats
+thread is not processing the table. This can be done by calling
+dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and
+before calling this function.
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_drop_table(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table) /*!< in: table instance to drop */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Build indexes on a table by reading a clustered index, creating a temporary
+file containing index entries, merge sorting these index entries and inserting
+sorted index entries to indexes.
+@param[in] trx transaction
+@param[in] old_table table where rows are read from
+@param[in] new_table table where indexes are created; identical to
+old_table unless creating a PRIMARY KEY
+@param[in] online true if creating indexes online
+@param[in] indexes indexes to be created
+@param[in] key_numbers MySQL key numbers
+@param[in] n_indexes size of indexes[]
+@param[in,out] table MySQL table, for reporting erroneous key value
+if applicable
+@param[in] defaults default values of added, changed columns, or NULL
+@param[in] col_map mapping of old column numbers to new ones, or
+NULL if old_table == new_table
+@param[in] add_autoinc number of added AUTO_INCREMENT columns, or
+ULINT_UNDEFINED if none is added
+@param[in,out] sequence autoinc sequence
+@param[in] skip_pk_sort whether the new PRIMARY KEY will follow
+existing order
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_read_pk() will be called at the beginning of
+this function and it will be passed to other functions for further accounting.
+@param[in] add_v new virtual columns added along with indexes
+@param[in] eval_table mysql table used to evaluate virtual column
+ value, see innobase_get_computed_value().
+@param[in] allow_non_null allow the conversion from null to not-null
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_build_indexes(
+ trx_t* trx,
+ dict_table_t* old_table,
+ dict_table_t* new_table,
+ bool online,
+ dict_index_t** indexes,
+ const ulint* key_numbers,
+ ulint n_indexes,
+ struct TABLE* table,
+ const dtuple_t* defaults,
+ const ulint* col_map,
+ ulint add_autoinc,
+ ib_sequence_t& sequence,
+ bool skip_pk_sort,
+ ut_stage_alter_t* stage,
+ const dict_add_v_col_t* add_v,
+ struct TABLE* eval_table,
+ bool allow_non_null)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Write a buffer to a block. */
+void
+row_merge_buf_write(
+/*================*/
+ const row_merge_buf_t* buf, /*!< in: sorted buffer */
+ const merge_file_t* of, /*!< in: output file */
+ row_merge_block_t* block) /*!< out: buffer for writing to file */
+ MY_ATTRIBUTE((nonnull));
+
+/********************************************************************//**
+Sort a buffer. */
+void
+row_merge_buf_sort(
+/*===============*/
+ row_merge_buf_t* buf, /*!< in/out: sort buffer */
+ row_merge_dup_t* dup) /*!< in/out: reporter of duplicates
+ (NULL if non-unique index) */
+ MY_ATTRIBUTE((nonnull(1)));
+
+/********************************************************************//**
+Write a merge block to the file system.
+@return whether the request was completed successfully
+@retval false on error
+@retval true on success */
+UNIV_INTERN
+bool
+row_merge_write(
+/*============*/
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to write,
+ in number of row_merge_block_t elements */
+ const void* buf, /*!< in: data */
+ void* crypt_buf, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Create a merge file in the given location.
+@param[out] merge_file merge file structure
+@param[in] path location for creating temporary file, or NULL
+@return file descriptor, or -1 on failure */
+pfs_os_file_t
+row_merge_file_create(
+ merge_file_t* merge_file,
+ const char* path)
+ MY_ATTRIBUTE((warn_unused_result, nonnull(1)));
+
+/** Merge disk files.
+@param[in] trx transaction
+@param[in] dup descriptor of index being created
+@param[in,out] file file containing index entries
+@param[in,out] block 3 buffers
+@param[in,out] tmpfd temporary file handle
+@param[in] update_progress true, if we should update progress status
+@param[in] pct_progress total progress percent until now
+@param[in] pct_cost current progress percent
+@param[in] crypt_block crypt buf or NULL
+@param[in] space space_id
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL, stage->begin_phase_sort() will be called initially
+and then stage->inc() will be called for each record processed.
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_sort(
+/*===========*/
+ trx_t* trx,
+ const row_merge_dup_t* dup,
+ merge_file_t* file,
+ row_merge_block_t* block,
+ pfs_os_file_t* tmpfd,
+ const bool update_progress,
+ const double pct_progress,
+ const double pct_cost,
+ row_merge_block_t* crypt_block,
+ ulint space,
+ ut_stage_alter_t* stage = NULL)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+ dict_index_t* index) /*!< in: secondary index */
+ MY_ATTRIBUTE((warn_unused_result, nonnull, malloc));
+
+/*********************************************************************//**
+Deallocate a sort buffer. */
+void
+row_merge_buf_free(
+/*===============*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */
+ MY_ATTRIBUTE((nonnull));
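+
+/* Lifecycle sketch (illustrative): a sort buffer is created per index,
+filled with tuples, sorted, written out, and either emptied for reuse or
+freed. Assuming dup describes the unique index being built, and file and
+block are a valid merge file and I/O buffer:
+
+	row_merge_buf_t* buf = row_merge_buf_create(index);
+	... // add tuples until the buffer is full
+	row_merge_buf_sort(buf, dup);
+	row_merge_buf_write(buf, file, block);
+	buf = row_merge_buf_empty(buf);	// reuse for the next batch
+	...
+	row_merge_buf_free(buf);
+*/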
+
+/*********************************************************************//**
+Destroy a merge file. */
+void
+row_merge_file_destroy(
+/*===================*/
+ merge_file_t* merge_file) /*!< in/out: merge file structure */
+ MY_ATTRIBUTE((nonnull));
+
+/** Read a merge block from the file system.
+@return whether the request was completed successfully */
+bool
+row_merge_read(
+/*===========*/
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to read
+ in number of row_merge_block_t
+ elements */
+ row_merge_block_t* buf, /*!< out: data */
+ row_merge_block_t* crypt_buf, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+const byte*
+row_merge_read_rec(
+/*===============*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ mrec_buf_t* buf, /*!< in/out: secondary buffer */
+ const byte* b, /*!< in: pointer to record */
+ const dict_index_t* index, /*!< in: index of the record */
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ const mrec_t** mrec, /*!< out: pointer to merge record,
+ or NULL on end of list
+ (non-NULL on I/O error) */
+ rec_offs* offsets,/*!< out: offsets of mrec */
+ row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+ MY_ATTRIBUTE((warn_unused_result));
+#endif /* row0merge.h */
diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h
new file mode 100644
index 00000000..73e96930
--- /dev/null
+++ b/storage/innobase/include/row0mysql.h
@@ -0,0 +1,975 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.h
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0mysql_h
+#define row0mysql_h
+
+#include "que0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "lock0types.h"
+#include "fil0fil.h"
+#include "fts0fts.h"
+#include "gis0type.h"
+
+#include "sql_list.h"
+#include "sql_cmd.h"
+
+extern ibool row_rollback_on_timeout;
+
+struct row_prebuilt_t;
+class ha_innobase;
+
+/*******************************************************************//**
+Frees the blob heap in prebuilt when no longer needed. */
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct of a
+ ha_innobase:: table handle */
+/*******************************************************************//**
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+ byte* dest, /*!< in: where to store */
+ ulint len, /*!< in: length, must fit in two bytes */
+ ulint lenlen);/*!< in: storage length of len: either 1 or 2 bytes */
+/*******************************************************************//**
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+ ulint* len, /*!< out: variable-length field length */
+ const byte* field, /*!< in: field in the MySQL format */
+ ulint lenlen);/*!< in: storage length of len: either 1
+ or 2 bytes */
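+
+/* Round-trip sketch (illustrative): for a VARCHAR whose length is stored
+in two bytes (lenlen == 2), storing and re-reading a value of len bytes
+(payload and len assumed to come from the caller) would look like:
+
+	byte	dest[2 + 512];	// length prefix + payload, len <= 512
+	byte*	data = row_mysql_store_true_var_len(dest, len, 2);
+	memcpy(data, payload, len);
+
+	ulint	read_len;
+	const byte* p = row_mysql_read_true_varchar(&read_len, dest, 2);
+	ut_ad(read_len == len);
+*/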
+/*******************************************************************//**
+Stores a reference to a BLOB in the MySQL format. */
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /*!< in: where to store */
+ ulint col_len,/*!< in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ the space for the length may vary from 1
+ to 4 bytes */
+ const void* data, /*!< in: BLOB data; if the value to store
+ is SQL NULL this should be NULL pointer */
+ ulint len); /*!< in: BLOB length; if the value to store
+ is SQL NULL this should be 0; remember
+ also to set the NULL bit in the MySQL record
+ header! */
+/*******************************************************************//**
+Reads a reference to a BLOB in the MySQL format.
+@return pointer to BLOB data */
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ ulint* len, /*!< out: BLOB length */
+ const byte* ref, /*!< in: BLOB reference in the
+ MySQL format */
+ ulint col_len); /*!< in: BLOB reference length
+ (not BLOB length) */
+/*******************************************************************//**
+Converts InnoDB geometry data format to MySQL data format. */
+void
+row_mysql_store_geometry(
+/*=====================*/
+ byte* dest, /*!< in/out: where to store */
+ ulint dest_len, /*!< in: dest buffer size: determines into
+ how many bytes the geometry length is stored,
+ the space for the length may vary from 1
+ to 4 bytes */
+ const byte* src, /*!< in: geometry data; if the value to store
+ is SQL NULL this should be NULL pointer */
+ ulint src_len); /*!< in: geometry length; if the value to store
+ is SQL NULL this should be 0; remember
+ also to set the NULL bit in the MySQL record
+ header! */
+/**************************************************************//**
+Pad a column with spaces. */
+void
+row_mysql_pad_col(
+/*==============*/
+ ulint mbminlen, /*!< in: minimum size of a character,
+ in bytes */
+ byte* pad, /*!< out: padded buffer */
+ ulint len); /*!< in: number of bytes to pad */
+
+/**************************************************************//**
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.cc.
+@return up to which byte we used buf in the conversion */
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+ dfield_t* dfield, /*!< in/out: dfield where dtype
+ information must be already set when
+ this function is called! */
+ byte* buf, /*!< in/out: buffer for a converted
+ integer value; this must be at least
+ col_len long then! NOTE that dfield
+ may also get a pointer to 'buf',
+ therefore do not discard this as long
+ as dfield is used! */
+ ibool row_format_col, /*!< TRUE if the mysql_data is from
+ a MySQL row, FALSE if from a MySQL
+ key value;
+ in MySQL, a true VARCHAR storage
+ format differs in a row and in a
+ key value: in a key value the length
+ is always stored in 2 bytes! */
+ const byte* mysql_data, /*!< in: MySQL column value, not
+ SQL NULL; NOTE that dfield may also
+ get a pointer to mysql_data,
+ therefore do not discard this as long
+ as dfield is used! */
+ ulint col_len, /*!< in: MySQL column length; NOTE that
+ this is the storage length of the
+ column in the MySQL format row, not
+ necessarily the length of the actual
+ payload data; if the column is a true
+ VARCHAR then this is irrelevant */
+ ulint comp); /*!< in: nonzero=compact format */
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return true if it was a lock wait and we should continue running the
+query thread */
+bool
+row_mysql_handle_errors(
+/*====================*/
+ dberr_t* new_err,/*!< out: possible new error encountered in
+ rollback, or the old error which was
+ during the function entry */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* thr, /*!< in: query thread, or NULL */
+ trx_savept_t* savept) /*!< in: savepoint, or NULL */
+ MY_ATTRIBUTE((nonnull(1,2)));
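+
+/* Caller pattern (illustrative): the handler layer typically runs a query
+thread in a loop, retrying while row_mysql_handle_errors() reports that
+the error was a lock wait. run_query_thread() is a hypothetical helper
+standing in for the actual execution step:
+
+	dberr_t	err;
+	do {
+		err = run_query_thread(thr);	// hypothetical helper
+	} while (err != DB_SUCCESS
+		 && row_mysql_handle_errors(&err, trx, thr, NULL));
+*/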
+/********************************************************************//**
+Create a prebuilt struct for a MySQL table handle.
+@return own: a prebuilt struct */
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ dict_table_t* table, /*!< in: Innobase table handle */
+ ulint mysql_row_len); /*!< in: length in bytes of a row in
+ the MySQL format */
+/********************************************************************//**
+Free a prebuilt struct for a MySQL table handle. */
+void
+row_prebuilt_free(
+/*==============*/
+ row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */
+ ibool dict_locked); /*!< in: TRUE=data dictionary locked */
+/*********************************************************************//**
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+void
+row_update_prebuilt_trx(
+/*====================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
+ in MySQL handle */
+ trx_t* trx); /*!< in: transaction handle */
+
+/*********************************************************************//**
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table.
+@return error code or DB_SUCCESS */
+dberr_t
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL
+ table handle */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Lock a table.
+@param[in,out] prebuilt table handle
+@return error code or DB_SUCCESS */
+dberr_t
+row_lock_table(row_prebuilt_t* prebuilt);
+
+/** System Versioning: row_insert_for_mysql() modes */
+enum ins_mode_t {
+ /* plain row (without versioning) */
+ ROW_INS_NORMAL = 0,
+ /* row_start = TRX_ID, row_end = MAX */
+ ROW_INS_VERSIONED,
+ /* row_end = TRX_ID */
+ ROW_INS_HISTORICAL
+};
+
+/** Does an insert for MySQL.
+@param[in] mysql_rec row in the MySQL format
+@param[in,out] prebuilt prebuilt struct in MySQL handle
+@param[in] ins_mode what row type we're inserting
+@return error code or DB_SUCCESS*/
+dberr_t
+row_insert_for_mysql(
+ const byte* mysql_rec,
+ row_prebuilt_t* prebuilt,
+ ins_mode_t ins_mode)
+ MY_ATTRIBUTE((warn_unused_result));
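+
+/* Usage sketch (illustrative): a plain (non-versioned) insert through the
+MySQL interface, assuming mysql_rec and prebuilt come from the handler:
+
+	dberr_t	err = row_insert_for_mysql(mysql_rec, prebuilt,
+					   ROW_INS_NORMAL);
+	if (err != DB_SUCCESS) {
+		... // map err to a MySQL error code
+	}
+*/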
+
+/*********************************************************************//**
+Builds a dummy query graph used in selects. */
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it.
+@return prebuilt update vector */
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/** Does an update or delete of a row for MySQL.
+@param[in,out] prebuilt prebuilt struct in MySQL handle
+@return error code or DB_SUCCESS */
+dberr_t
+row_update_for_mysql(
+ row_prebuilt_t* prebuilt)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** This can only be used when the current transaction is at
+READ COMMITTED or READ UNCOMMITTED isolation level.
+Before calling this function row_search_for_mysql() must have
+initialized prebuilt->new_rec_locks to store the information which new
+record locks really were set. This function removes a newly set
+clustered index record lock under prebuilt->pcur or
+prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that
+releases the latest clustered index record lock we set.
+@param[in,out] prebuilt prebuilt struct in MySQL handle
+@param[in] has_latches_on_recs TRUE if called so that we have the
+ latches on the records under pcur
+ and clust_pcur, and we do not need
+ to reposition the cursors. */
+void
+row_unlock_for_mysql(
+ row_prebuilt_t* prebuilt,
+ ibool has_latches_on_recs);
+
+/*********************************************************************//**
+Creates a query graph node of 'update' type to be used in the MySQL
+interface.
+@return own: update node */
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+ dict_table_t* table, /*!< in: table to update */
+ mem_heap_t* heap); /*!< in: mem heap from which allocated */
+
+/**********************************************************************//**
+Does a cascaded delete or set null in a foreign key operation.
+@return error code or DB_SUCCESS */
+dberr_t
+row_update_cascade_for_mysql(
+/*=========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ upd_node_t* node, /*!< in: update node used in the cascade
+ or set null operation */
+ dict_table_t* table) /*!< in: table where we do the operation */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Locks the data dictionary exclusively for performing a table create or other
+data dictionary modification operation. */
+void
+row_mysql_lock_data_dictionary_func(
+/*================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ unsigned line); /*!< in: line number */
+#define row_mysql_lock_data_dictionary(trx) \
+ row_mysql_lock_data_dictionary_func(trx, __FILE__, __LINE__)
+/*********************************************************************//**
+Unlocks the data dictionary exclusive lock. */
+void
+row_mysql_unlock_data_dictionary(
+/*=============================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Locks the data dictionary in shared mode from modifications, for performing
+foreign key check, rollback, or other operation invisible to MySQL. */
+void
+row_mysql_freeze_data_dictionary_func(
+/*==================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ unsigned line); /*!< in: line number */
+#define row_mysql_freeze_data_dictionary(trx) \
+ row_mysql_freeze_data_dictionary_func(trx, __FILE__, __LINE__)
+/*********************************************************************//**
+Unlocks the data dictionary shared lock. */
+void
+row_mysql_unfreeze_data_dictionary(
+/*===============================*/
+ trx_t* trx); /*!< in/out: transaction */
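+
+/* Illustrative sketch (not part of the original header; `trx' is assumed
+to be the caller's transaction): the exclusive and shared data dictionary
+latches above are always taken and released in matched pairs. */
+#if 0
+row_mysql_lock_data_dictionary(trx); /* exclusive, for DDL */
+/* ... modify the data dictionary ... */
+row_mysql_unlock_data_dictionary(trx);
+
+row_mysql_freeze_data_dictionary(trx); /* shared, e.g. for FK checks */
+/* ... read the data dictionary ... */
+row_mysql_unfreeze_data_dictionary(trx);
+#endif
+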
+/*********************************************************************//**
+Creates a table for MySQL. On failure the transaction will be rolled back
+and the 'table' object will be freed.
+@return error code or DB_SUCCESS */
+dberr_t
+row_create_table_for_mysql(
+/*=======================*/
+ dict_table_t* table, /*!< in, own: table definition
+ (will be freed, or on DB_SUCCESS
+ added to the data dictionary cache) */
+ trx_t* trx, /*!< in/out: transaction */
+ fil_encryption_t mode, /*!< in: encryption mode */
+ uint32_t key_id) /*!< in: encryption key_id */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Create an index when creating a table.
+On failure, the caller must drop the table!
+@return error number or DB_SUCCESS */
+dberr_t
+row_create_index_for_mysql(
+/*=======================*/
+ dict_index_t* index, /*!< in, own: index definition
+ (will be freed) */
+ trx_t* trx, /*!< in: transaction handle */
+ const ulint* field_lengths) /*!< in: if not NULL, must contain
+ dict_index_get_n_fields(index)
+ actual field lengths for the
+ index columns, which are
+ then checked for not being too
+ large. */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+The master thread in srv0srv.cc calls this regularly to drop tables which
+we must drop in the background after queries to them have ended. Such lazy
+dropping of tables is needed in ALTER TABLE on Unix.
+@return how many tables dropped + remaining tables in list */
+ulint
+row_drop_tables_for_mysql_in_background(void);
+/*=========================================*/
+/*********************************************************************//**
+Get the background drop list length. NOTE: the caller must own the kernel
+mutex!
+@return how many tables in list */
+ulint
+row_get_background_drop_list_len_low(void);
+/*======================================*/
+
+/** Drop garbage tables during recovery. */
+void
+row_mysql_drop_garbage_tables();
+
+/*********************************************************************//**
+Sets an exclusive lock on a table.
+@return error code or DB_SUCCESS */
+dberr_t
+row_mysql_lock_table(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */
+ const char* op_info) /*!< in: string for trx->op_info */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Drop a table.
+If the data dictionary was not already locked by the transaction,
+the transaction will be committed. Otherwise, the data dictionary
+will remain locked.
+@param[in] name Table name
+@param[in,out] trx Transaction handle
+@param[in] sqlcom type of SQL operation
+@param[in] create_failed true=create table failed
+ (e.g. because of a foreign key column)
+@param[in] nonatomic Whether it is permitted to release
+ and reacquire dict_sys.latch
+@return error code */
+dberr_t
+row_drop_table_for_mysql(
+ const char* name,
+ trx_t* trx,
+ enum_sql_command sqlcom,
+ bool create_failed = false,
+ bool nonatomic = true);
+
+/** Drop a table after failed CREATE TABLE. */
+dberr_t row_drop_table_after_create_fail(const char* name, trx_t* trx);
+
+/*********************************************************************//**
+Discards the tablespace of a table which is stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the file_unreadable flag is set.
+@return error code or DB_SUCCESS */
+dberr_t
+row_discard_tablespace_for_mysql(
+/*=============================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx) /*!< in: transaction handle */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+dberr_t
+row_import_tablespace_for_mysql(
+/*============================*/
+ dict_table_t* table, /*!< in/out: table */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Drop a database for MySQL.
+@param[in] name database name which ends at '/'
+@param[in] trx transaction handle
+@param[out] found number of dropped tables/partitions
+@return error code or DB_SUCCESS */
+dberr_t
+row_drop_database_for_mysql(
+ const char* name,
+ trx_t* trx,
+ ulint* found);
+
+/*********************************************************************//**
+Renames a table for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+row_rename_table_for_mysql(
+/*=======================*/
+ const char* old_name, /*!< in: old table name */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx, /*!< in/out: transaction */
+ bool commit, /*!< in: whether to commit trx */
+ bool use_fk) /*!< in: whether to parse and enforce
+ FOREIGN KEY constraints */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Scans an index for either COUNT(*) or CHECK TABLE.
+For CHECK TABLE, this checks that the index contains entries in ascending
+order, that the unique constraint is not broken, and calculates the number
+of index entries in the read view of the current transaction.
+@return DB_SUCCESS or other error */
+dberr_t
+row_scan_index_for_mysql(
+/*=====================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct
+ in MySQL handle */
+ const dict_index_t* index, /*!< in: index */
+ ulint* n_rows) /*!< out: number of entries
+ seen in the consistent read */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Initialize this module */
+void
+row_mysql_init(void);
+/*================*/
+
+/*********************************************************************//**
+Close this module */
+void
+row_mysql_close(void);
+/*=================*/
+
+/* A struct describing a place for an individual column in the MySQL
+row format which is presented to the table handler in ha_innobase.
+This template struct is used to speed up row transformations between
+Innobase and MySQL. */
+
+struct mysql_row_templ_t {
+ ulint col_no; /*!< column number of the column */
+ ulint rec_field_no; /*!< field number of the column in an
+ Innobase record in the current index;
+ not defined if template_type is
+ ROW_MYSQL_WHOLE_ROW */
+ ibool rec_field_is_prefix; /*!< is this field in a prefix index? */
+ ulint rec_prefix_field_no; /*!< record field, even if just a
+ prefix; same as rec_field_no when not a
+ prefix, otherwise rec_field_no is
+ ULINT_UNDEFINED but this is the true
+ field number */
+ ulint clust_rec_field_no; /*!< field number of the column in an
+ Innobase record in the clustered index;
+ not defined if template_type is
+ ROW_MYSQL_WHOLE_ROW */
+ ulint icp_rec_field_no; /*!< field number of the column in an
+ Innobase record in the current index;
+ not defined unless
+ index condition pushdown is used */
+ ulint mysql_col_offset; /*!< offset of the column in the MySQL
+ row format */
+ ulint mysql_col_len; /*!< length of the column in the MySQL
+ row format */
+ ulint mysql_null_byte_offset; /*!< MySQL NULL bit byte offset in a
+ MySQL record */
+ ulint mysql_null_bit_mask; /*!< bit mask to get the NULL bit,
+ zero if column cannot be NULL */
+ ulint type; /*!< column type in Innobase mtype
+ numbers DATA_CHAR... */
+ ulint mysql_type; /*!< MySQL type code; this is always
+ < 256 */
+ ulint mysql_length_bytes; /*!< if mysql_type
+ == DATA_MYSQL_TRUE_VARCHAR, this tells
+ whether we should use 1 or 2 bytes to
+ store the MySQL true VARCHAR data
+ length at the start of row in the MySQL
+ format (NOTE that the MySQL key value
+ format always uses 2 bytes for the data
+ len) */
+ ulint charset; /*!< MySQL charset-collation code
+ of the column, or zero */
+ ulint mbminlen; /*!< minimum length of a char, in bytes,
+ or zero if not a char type */
+ ulint mbmaxlen; /*!< maximum length of a char, in bytes,
+ or zero if not a char type */
+ ulint is_unsigned; /*!< if a column type is an integer
+ type and this field is != 0, then
+ it is an unsigned integer type */
+ ulint is_virtual; /*!< if a column is a virtual column */
+};
+
+#define MYSQL_FETCH_CACHE_SIZE 8
+/* After fetching this many rows, we start caching them in fetch_cache */
+#define MYSQL_FETCH_CACHE_THRESHOLD 4
+
+#define ROW_PREBUILT_ALLOCATED 78540783
+#define ROW_PREBUILT_FREED 26423527
+
+/** A struct for (sometimes lazily) prebuilt structures in an Innobase table
+handle used within MySQL; these are used to save CPU time. */
+
+struct row_prebuilt_t {
+ ulint magic_n; /*!< this magic number is set to
+ ROW_PREBUILT_ALLOCATED when created,
+ or ROW_PREBUILT_FREED when the
+ struct has been freed */
+ dict_table_t* table; /*!< Innobase table handle */
+ dict_index_t* index; /*!< current index for a search, if
+ any */
+ trx_t* trx; /*!< current transaction handle */
+ unsigned sql_stat_start:1;/*!< TRUE when we start processing of
+ an SQL statement: we may have to set
+ an intention lock on the table,
+ create a consistent read view etc. */
+ unsigned clust_index_was_generated:1;
+ /*!< if the user did not define a
+ primary key in MySQL, then Innobase
+ automatically generated a clustered
+ index where the ordering column is
+ the row id: in this case this flag
+ is set to TRUE */
+ unsigned index_usable:1; /*!< caches the value of
+ row_merge_is_index_usable(trx,index) */
+ unsigned read_just_key:1;/*!< set to 1 when MySQL calls
+ ha_innobase::extra with the
+ argument HA_EXTRA_KEYREAD; it is enough
+ to read just columns defined in
+ the index (i.e., no read of the
+ clustered index record necessary) */
+ unsigned used_in_HANDLER:1;/*!< TRUE if we have been using this
+ handle in a MySQL HANDLER low level
+ index cursor command: then we must
+ store the pcur position even in a
+ unique search from a clustered index,
+ because HANDLER allows NEXT and PREV
+ in such a situation */
+ unsigned template_type:2;/*!< ROW_MYSQL_WHOLE_ROW,
+ ROW_MYSQL_REC_FIELDS,
+ ROW_MYSQL_DUMMY_TEMPLATE, or
+ ROW_MYSQL_NO_TEMPLATE */
+ unsigned n_template:10; /*!< number of elements in the
+ template */
+ unsigned null_bitmap_len:10;/*!< number of bytes in the SQL NULL
+ bitmap at the start of a row in the
+ MySQL format */
+ unsigned need_to_access_clustered:1; /*!< if we are fetching
+ columns through a secondary index
+ and at least one column is not in
+ the secondary index, then this is
+ set to TRUE; note that sometimes this
+ is set but we later optimize out the
+ clustered index lookup */
+ unsigned templ_contains_blob:1;/*!< TRUE if the template contains
+ a column with DATA_LARGE_MTYPE(
+ get_innobase_type_from_mysql_type())
+ is TRUE;
+ not to be confused with InnoDB
+ externally stored columns
+ (VARCHAR can be off-page too) */
+ unsigned versioned_write:1;/*!< whether this is
+ a versioned write */
+ mysql_row_templ_t* mysql_template;/*!< template used to transform
+ rows fast between MySQL and Innobase
+ formats; memory for this template
+ is not allocated from 'heap' */
+ mem_heap_t* heap; /*!< memory heap from which
+ these auxiliary structures are
+ allocated when needed */
+ ins_node_t* ins_node; /*!< Innobase SQL insert node
+ used to perform inserts
+ to the table */
+ byte* ins_upd_rec_buff;/*!< buffer for storing data converted
+ to the Innobase format from the MySQL
+ format */
+ const byte* default_rec; /*!< the default values of all columns
+ (a "default row") in MySQL format */
+ ulint hint_need_to_fetch_extra_cols;
+ /*!< normally this is set to 0; if this
+ is set to ROW_RETRIEVE_PRIMARY_KEY,
+ then we should at least retrieve all
+ columns in the primary key; if this
+ is set to ROW_RETRIEVE_ALL_COLS, then
+ we must retrieve all columns in the
+ key (if read_just_key == 1), or all
+ columns in the table */
+ upd_node_t* upd_node; /*!< Innobase SQL update node used
+ to perform updates and deletes */
+ trx_id_t trx_id; /*!< The table->def_trx_id when
+ ins_graph was built */
+ que_fork_t* ins_graph; /*!< Innobase SQL query graph used
+ in inserts. Will be rebuilt on
+ trx_id or n_indexes mismatch. */
+ que_fork_t* upd_graph; /*!< Innobase SQL query graph used
+ in updates or deletes */
+ btr_pcur_t* pcur; /*!< persistent cursor used in selects
+ and updates */
+ btr_pcur_t* clust_pcur; /*!< persistent cursor used in
+ some selects and updates */
+ que_fork_t* sel_graph; /*!< dummy query graph used in
+ selects */
+ dtuple_t* search_tuple; /*!< prebuilt dtuple used in selects */
+ byte row_id[DATA_ROW_ID_LEN];
+ /*!< if the clustered index was
+ generated, the row id of the
+ last row fetched is stored
+ here */
+ doc_id_t fts_doc_id; /*!< if the table has an FTS index on
+ it then we fetch the doc_id.
+ FTS-FIXME: Currently we fetch it always
+ but in the future we must only fetch
+ it when FTS columns are being
+ updated */
+ dtuple_t* clust_ref; /*!< prebuilt dtuple used in
+ sel/upd/del */
+ lock_mode select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */
+ lock_mode stored_select_lock_type;/*!< this field is used to
+ remember the original select_lock_type
+ that was decided in ha_innodb.cc,
+ ::store_lock(), ::external_lock(),
+ etc. */
+ ulint row_read_type; /*!< ROW_READ_WITH_LOCKS if row locks
+ should be obtained for records
+ under an UPDATE or DELETE cursor.
+ At READ UNCOMMITTED or
+ READ COMMITTED isolation level,
+ this can be set to
+ ROW_READ_TRY_SEMI_CONSISTENT, so that
+ if the row under an UPDATE or DELETE
+ cursor was locked by another
+ transaction, InnoDB will resort
+ to reading the last committed value
+ ('semi-consistent read'). Then,
+ this field will be set to
+ ROW_READ_DID_SEMI_CONSISTENT to
+ indicate that. If the row does not
+ match the WHERE condition, MySQL will
+ invoke handler::unlock_row() to
+ clear the flag back to
+ ROW_READ_TRY_SEMI_CONSISTENT and
+ to simply skip the row. If
+ the row matches, the next call to
+ row_search_for_mysql() will lock
+ the row.
+ This eliminates lock waits in some
+ cases; note that this breaks
+ serializability. */
+ ulint new_rec_locks; /*!< normally 0; if
+ the session is using READ
+ COMMITTED or READ UNCOMMITTED
+ isolation level, set in
+ row_search_for_mysql() if we set a new
+ record lock on the secondary
+ or clustered index; this is
+ used in row_unlock_for_mysql()
+ when releasing the lock under
+ the cursor if we determine
+ after retrieving the row that
+ it does not need to be locked
+ ('mini-rollback') */
+ ulint mysql_prefix_len;/*!< byte offset of the end of
+ the last requested column */
+ ulint mysql_row_len; /*!< length in bytes of a row in the
+ MySQL format */
+ ulint n_rows_fetched; /*!< number of rows fetched after
+ positioning the current cursor */
+ ulint fetch_direction;/*!< ROW_SEL_NEXT or ROW_SEL_PREV */
+ byte* fetch_cache[MYSQL_FETCH_CACHE_SIZE];
+ /*!< a cache for fetched rows if we
+ fetch many rows from the same cursor:
+ it saves CPU time to fetch them in a
+ batch; we reserve mysql_row_len
+ bytes for each such row; these
+ pointers point 4 bytes past the
+ allocated mem buf start, because
+ there is a 4 byte magic number at the
+ start and at the end */
+ bool keep_other_fields_on_keyread; /*!< when using fetch
+ cache with HA_EXTRA_KEYREAD, don't
+ overwrite other fields in the MySQL
+ row buffer. */
+ ulint fetch_cache_first;/*!< position of the first not yet
+ fetched row in fetch_cache */
+ ulint n_fetch_cached; /*!< number of not yet fetched rows
+ in fetch_cache */
+ mem_heap_t* blob_heap; /*!< in SELECTS BLOB fields are copied
+ to this heap */
+ mem_heap_t* old_vers_heap; /*!< memory heap where a previous
+ version is built in consistent read */
+ bool in_fts_query; /*!< Whether we are in a FTS query */
+ bool fts_doc_id_in_read_set; /*!< true if table has externally
+ defined FTS_DOC_ID column. */
+ /*----------------------*/
+ ulonglong autoinc_last_value;
+ /*!< last value of AUTO-INC interval */
+ ulonglong autoinc_increment;/*!< The increment step of the auto
+ increment column. Value must be
+ greater than or equal to 1. Required to
+ calculate the next value */
+ ulonglong autoinc_offset; /*!< The offset passed to
+ get_auto_increment() by MySQL. Required
+ to calculate the next value */
+ dberr_t autoinc_error; /*!< The actual error code encountered
+ while trying to init or read the
+ autoinc value from the table. We
+ store it here so that we can return
+ it to MySQL */
+ /*----------------------*/
+
+ /** Argument of handler_rowid_filter_check(),
+ or NULL if no PRIMARY KEY filter is pushed */
+ ha_innobase* pk_filter;
+
+ /** Argument to handler_index_cond_check(),
+ or NULL if no index condition pushdown (ICP) is used. */
+ ha_innobase* idx_cond;
+ ulint idx_cond_n_cols;/*!< Number of fields in idx_cond_cols.
+ 0 if and only if idx_cond == NULL. */
+ /*----------------------*/
+
+ /*----------------------*/
+ rtr_info_t* rtr_info; /*!< R-tree Search Info */
+ /*----------------------*/
+
+ ulint magic_n2; /*!< this should be the same as
+ magic_n */
+
+ byte* srch_key_val1; /*!< buffer used in converting
+ search key values from MySQL format
+ to InnoDB format. */
+ byte* srch_key_val2; /*!< buffer used in converting
+ search key values from MySQL format
+ to InnoDB format. */
+ uint srch_key_val_len; /*!< Size of search key */
+ /** The MySQL table object */
+ TABLE* m_mysql_table;
+
+ /** Get template by dict_table_t::cols[] number */
+ const mysql_row_templ_t* get_template_by_col(ulint col) const
+ {
+ ut_ad(col < n_template);
+ ut_ad(mysql_template);
+ for (ulint i = col; i < n_template; ++i) {
+ const mysql_row_templ_t* templ = &mysql_template[i];
+ if (!templ->is_virtual && templ->col_no == col) {
+ return templ;
+ }
+ }
+ return NULL;
+ }
+};
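+
+/* Illustrative sketch (not part of the original header; `prebuilt' and
+`col_no' are assumed): get_template_by_col() maps a dict_table_t::cols[]
+position to the template entry that locates the column in the MySQL-format
+row buffer. */
+#if 0
+if (const mysql_row_templ_t* templ =
+	prebuilt->get_template_by_col(col_no)) {
+	/* the column occupies mysql_col_len bytes at mysql_col_offset */
+	ulint off = templ->mysql_col_offset;
+	ulint len = templ->mysql_col_len;
+}
+#endif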
+
+/** Callback for row_mysql_sys_index_iterate() */
+struct SysIndexCallback {
+ virtual ~SysIndexCallback() { }
+
+ /** Callback method
+ @param mtr current mini transaction
+ @param pcur persistent cursor. */
+ virtual void operator()(mtr_t* mtr, btr_pcur_t* pcur) throw() = 0;
+};
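+
+/* Illustrative sketch (not part of the original header; `CountingCallback'
+is a hypothetical name): a minimal SysIndexCallback implementation that
+merely counts the records it is invoked on. */
+#if 0
+struct CountingCallback : public SysIndexCallback {
+	ulint n_recs = 0;
+	void operator()(mtr_t*, btr_pcur_t*) throw() override { n_recs++; }
+};
+#endif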
+
+
+/** Storage for calculating virtual columns */
+
+class String;
+struct VCOL_STORAGE
+{
+ TABLE *maria_table;
+ byte *innobase_record;
+ byte *maria_record;
+ String *blob_value_storage;
+ VCOL_STORAGE(): maria_table(NULL), innobase_record(NULL),
+ maria_record(NULL), blob_value_storage(NULL) {}
+};
+
+/**
+ Allocate a heap and record for calculating virtual fields
+ Used mainly for virtual fields in indexes
+
+@param[in] thd MariaDB THD
+@param[in] index Index in use
+@param[out] heap Heap that holds the temporary row
+@param[in,out] table MariaDB table
+@param[out] storage Internal storage for blobs etc
+
+@return true ok
+@return false malloc failure
+*/
+
+bool innobase_allocate_row_for_vcol(
+ THD * thd,
+ dict_index_t* index,
+ mem_heap_t** heap,
+ TABLE** table,
+ VCOL_STORAGE* storage);
+
+/** Free memory allocated by innobase_allocate_row_for_vcol() */
+void innobase_free_row_for_vcol(VCOL_STORAGE *storage);
+
+class ib_vcol_row
+{
+ VCOL_STORAGE storage;
+public:
+ mem_heap_t *heap;
+
+ ib_vcol_row(mem_heap_t *heap) : heap(heap) {}
+
+ byte *record(THD *thd, dict_index_t *index, TABLE **table)
+ {
+ if (!storage.innobase_record)
+ {
+ bool ok = innobase_allocate_row_for_vcol(thd, index, &heap, table,
+ &storage);
+ if (!ok)
+ return NULL;
+ }
+ return storage.innobase_record;
+ };
+
+ ~ib_vcol_row()
+ {
+ if (heap)
+ {
+ if (storage.innobase_record)
+ innobase_free_row_for_vcol(&storage);
+ mem_heap_free(heap);
+ }
+ }
+};
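+
+/* Illustrative sketch (not part of the original header; `thd', `index'
+and `mysql_table' are assumed): ib_vcol_row is an RAII guard; record()
+allocates lazily on first use, and the destructor frees both the blob
+storage and the heap. */
+#if 0
+ib_vcol_row vc(NULL);
+byte* rec = vc.record(thd, index, &mysql_table);
+if (!rec) {
+	/* allocation failed */
+}
+/* ... compute virtual column values into rec ...;
+everything is released when vc goes out of scope */
+#endif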
+
+/** Report virtual value computation failure in ib::error
+@param[in] row the data row
+*/
+ATTRIBUTE_COLD
+void innobase_report_computed_value_failed(dtuple_t *row);
+
+/** Get the computed value by supplying the base column values.
+@param[in,out] row the data row
+@param[in] col virtual column
+@param[in] index index on the virtual column
+@param[in,out] local_heap heap memory for processing large data etc.
+@param[in,out] heap memory heap that copies the actual index row
+@param[in] ifield index field
+@param[in] thd MySQL thread handle
+@param[in,out] mysql_table mysql table object
+@param[in] old_table during ALTER TABLE, this is the old table
+ or NULL.
+@param[in] parent_update update vector for the parent row
+@param[in] foreign foreign key information
+@return the field filled with computed value */
+dfield_t*
+innobase_get_computed_value(
+ dtuple_t* row,
+ const dict_v_col_t* col,
+ const dict_index_t* index,
+ mem_heap_t** local_heap,
+ mem_heap_t* heap,
+ const dict_field_t* ifield,
+ THD* thd,
+ TABLE* mysql_table,
+ byte* mysql_rec,
+ const dict_table_t* old_table,
+ upd_t* parent_update,
+ dict_foreign_t* foreign);
+
+/** Initialize the virtual column template for a table.
+@param[in,out] table the table whose virtual column
+ template is to be built */
+TABLE* innobase_init_vc_templ(dict_table_t* table);
+
+/** Change dbname and table name in table->vc_templ.
+@param[in,out] table the table whose virtual column template
+dbname and tbname are to be renamed. */
+void
+innobase_rename_vc_templ(
+ dict_table_t* table);
+
+#define ROW_PREBUILT_FETCH_MAGIC_N 465765687
+
+#define ROW_MYSQL_WHOLE_ROW 0
+#define ROW_MYSQL_REC_FIELDS 1
+#define ROW_MYSQL_NO_TEMPLATE 2
+#define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in
+ row_scan_and_check_index */
+
+/* Values for hint_need_to_fetch_extra_cols */
+#define ROW_RETRIEVE_PRIMARY_KEY 1
+#define ROW_RETRIEVE_ALL_COLS 2
+
+/* Values for row_read_type */
+#define ROW_READ_WITH_LOCKS 0
+#define ROW_READ_TRY_SEMI_CONSISTENT 1
+#define ROW_READ_DID_SEMI_CONSISTENT 2
+
+#ifdef UNIV_DEBUG
+/** Wait for the background drop list to become empty. */
+void
+row_wait_for_background_drop_list_empty();
+#endif /* UNIV_DEBUG */
+
+#endif /* row0mysql.h */
diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h
new file mode 100644
index 00000000..091d80ad
--- /dev/null
+++ b/storage/innobase/include/row0purge.h
@@ -0,0 +1,268 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0purge.h
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0purge_h
+#define row0purge_h
+
+#include "que0types.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "row0mysql.h"
+#include "mysqld.h"
+#include <queue>
+
+class MDL_ticket;
+/** Determines if it is possible to remove a secondary index entry.
+Removal is possible if the secondary index entry does not refer to any
+non-delete-marked version of a clustered index record whose DB_TRX_ID
+is newer than the purge view.
+
+NOTE: This function should only be called by the purge thread, only
+while holding a latch on the leaf page of the secondary index entry
+(or keeping the buffer pool watch on the page). It is possible that
+this function first returns true and then false, if a user transaction
+inserts a record that the secondary index entry would refer to.
+However, in that case, the user transaction would also re-insert the
+secondary index entry after purge has removed it and released the leaf
+page latch.
+@param[in,out] node row purge node
+@param[in] index secondary index
+@param[in] entry secondary index entry
+@param[in,out] sec_pcur secondary index cursor or NULL
+ if it is called for purge buffering
+ operation.
+@param[in,out] sec_mtr mini-transaction which holds
+ secondary index entry or NULL if it is
+ called for purge buffering operation.
+@param[in] is_tree true=pessimistic purge,
+ false=optimistic (leaf-page only)
+@return true if the secondary index record can be purged */
+bool
+row_purge_poss_sec(
+ purge_node_t* node,
+ dict_index_t* index,
+ const dtuple_t* entry,
+ btr_pcur_t* sec_pcur=NULL,
+ mtr_t* sec_mtr=NULL,
+ bool is_tree=false);
+
+/***************************************************************
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph.
+@return query thread to run next or NULL */
+que_thr_t*
+row_purge_step(
+/*===========*/
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Info required to purge a record */
+struct trx_purge_rec_t
+{
+ /** Record to purge */
+ trx_undo_rec_t *undo_rec;
+ /** File pointer to undo record */
+ roll_ptr_t roll_ptr;
+};
+
+/* Purge node structure */
+
+struct purge_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_PURGE */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ roll_ptr_t roll_ptr;/*!< roll pointer to undo log record */
+
+ undo_no_t undo_no;/*!< undo number of the record */
+
+ ulint rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC,
+ ... */
+private:
+ /** latest unavailable table ID (do not bother looking up again) */
+ table_id_t unavailable_table_id;
+ /** the latest modification of the table definition identified by
+ unavailable_table_id, or TRX_ID_MAX */
+ trx_id_t def_trx_id;
+public:
+ dict_table_t* table; /*!< table where purge is done */
+
+ ulint cmpl_info;/*!< compiler analysis info of an update */
+
+ upd_t* update; /*!< update vector for a clustered index
+ record */
+ const dtuple_t* ref; /*!< NULL, or row reference to the next row to
+ handle */
+ dtuple_t* row; /*!< NULL, or a copy (also fields copied to
+ heap) of the indexed fields of the row to
+ handle */
+ dict_index_t* index; /*!< NULL, or the next index whose record should
+ be handled */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage for
+ row; this must be emptied after a successful
+ purge of a row */
+ ibool found_clust;/*!< whether the clustered index record
+ determined by ref was found in the clustered
+ index, and we were able to position pcur on
+ it */
+ btr_pcur_t pcur; /*!< persistent cursor used in searching the
+ clustered index record */
+#ifdef UNIV_DEBUG
+ /** whether the operation is in progress */
+ bool in_progress;
+#endif
+ trx_id_t trx_id; /*!< trx id for this purging record */
+
+ /** meta-data lock for the table name */
+ MDL_ticket* mdl_ticket;
+
+ /** table id of the previous undo log record */
+ table_id_t last_table_id;
+
+ /** purge thread */
+ THD* purge_thd;
+
+ /** metadata lock holds for this number of undo log recs */
+ int mdl_hold_recs;
+
+ /** Undo recs to purge */
+ std::queue<trx_purge_rec_t> undo_recs;
+
+ /** Constructor */
+ explicit purge_node_t(que_thr_t* parent) :
+ common(QUE_NODE_PURGE, parent),
+ unavailable_table_id(0),
+ table(NULL),
+ heap(mem_heap_create(256)),
+#ifdef UNIV_DEBUG
+ in_progress(false),
+#endif
+ mdl_ticket(NULL),
+ last_table_id(0),
+ purge_thd(NULL),
+ mdl_hold_recs(0)
+ {
+ }
+
+#ifdef UNIV_DEBUG
+ /***********************************************************//**
+ Validate the persistent cursor. The purge node has two references
+ to the clustered index record - one via the ref member, and the
+ other via the persistent cursor. These two references must match
+ each other if the found_clust flag is set.
+ @return true if the persistent cursor is consistent with
+ the ref member.*/
+ bool validate_pcur();
+#endif
+
+ /** Determine if a table should be skipped in purge.
+ @param[in] id table identifier
+ @return whether to skip the table lookup and processing */
+ bool is_skipped(table_id_t id) const
+ {
+ return id == unavailable_table_id && trx_id <= def_trx_id;
+ }
+
+ /** Remember that a table should be skipped in purge.
+ @param[in] id table identifier
+ @param[in] limit last transaction for which to skip */
+ void skip(table_id_t id, trx_id_t limit)
+ {
+ DBUG_ASSERT(limit >= trx_id);
+ unavailable_table_id = id;
+ def_trx_id = limit;
+ }
+
+ /** Start processing an undo log record. */
+ void start()
+ {
+ ut_ad(in_progress);
+ DBUG_ASSERT(common.type == QUE_NODE_PURGE);
+
+ row= nullptr;
+ ref= nullptr;
+ index= nullptr;
+ update= nullptr;
+ found_clust= FALSE;
+ rec_type= ULINT_UNDEFINED;
+ cmpl_info= ULINT_UNDEFINED;
+ if (!purge_thd)
+ purge_thd= current_thd;
+ }
+
+
+ /** Close the existing table and release the MDL for it. */
+ void close_table()
+ {
+ last_table_id= 0;
+ if (!table)
+ {
+ ut_ad(!mdl_ticket);
+ return;
+ }
+
+ innobase_reset_background_thd(purge_thd);
+ dict_table_close(table, false, false, purge_thd, mdl_ticket);
+ table= nullptr;
+ mdl_ticket= nullptr;
+ }
+
+
+ /** Retain the MDL for the table id.
+ @param[in] table_id table id to be processed
+ @return true if the MDL is retained */
+ bool retain_mdl(table_id_t table_id)
+ {
+ ut_ad(table_id);
+ if (last_table_id == table_id && mdl_hold_recs < 100)
+ {
+ ut_ad(table);
+ mdl_hold_recs++;
+ return true;
+ }
+
+ mdl_hold_recs= 0;
+ close_table();
+ return false;
+ }
+
+
+ /** Reset the state at the end
+ @return the query graph parent */
+ que_node_t* end()
+ {
+ DBUG_ASSERT(common.type == QUE_NODE_PURGE);
+ close_table();
+ ut_ad(undo_recs.empty());
+ ut_d(in_progress= false);
+ purge_thd= nullptr;
+ mem_heap_empty(heap);
+ return common.parent;
+ }
+};
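+
+/* Illustrative sketch (not part of the original header; the identifiers
+are assumed): how the skip()/is_skipped() pair above is meant to be used
+when a table lookup fails during purge. */
+#if 0
+if (node->is_skipped(table_id)) {
+	/* recently found unavailable; do not look it up again */
+} else if (/* table lookup failed */ true) {
+	node->skip(table_id, limit_trx_id); /* remember until limit_trx_id */
+}
+#endif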
+
+#endif
diff --git a/storage/innobase/include/row0quiesce.h b/storage/innobase/include/row0quiesce.h
new file mode 100644
index 00000000..b05b7666
--- /dev/null
+++ b/storage/innobase/include/row0quiesce.h
@@ -0,0 +1,67 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0quiesce.h
+
+Header file for tablespace quiesce functions.
+
+Created 2012-02-08 by Sunny Bains
+*******************************************************/
+
+#ifndef row0quiesce_h
+#define row0quiesce_h
+
+#include "dict0types.h"
+
+struct trx_t;
+
+/** The version number of the export meta-data text file. */
+#define IB_EXPORT_CFG_VERSION_V1 0x1UL
+
+/*********************************************************************//**
+Quiesce the tablespace that the table resides in. */
+void
+row_quiesce_table_start(
+/*====================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ trx_t* trx) /*!< in/out: transaction/session */
+ MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Set a table's quiesce state.
+@return DB_SUCCESS or error code. */
+dberr_t
+row_quiesce_set_state(
+/*==================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ ib_quiesce_t state, /*!< in: quiesce state to set */
+ trx_t* trx) /*!< in/out: transaction */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Cleanup after table quiesce. */
+void
+row_quiesce_table_complete(
+/*=======================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ trx_t* trx) /*!< in/out: transaction/session */
+ MY_ATTRIBUTE((nonnull));
+
+#endif /* row0quiesce_h */
diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h
new file mode 100644
index 00000000..b4dab3c2
--- /dev/null
+++ b/storage/innobase/include/row0row.h
@@ -0,0 +1,432 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.h
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0row_h
+#define row0row_h
+
+#include "que0types.h"
+#include "ibuf0ibuf.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "row0types.h"
+#include "btr0types.h"
+
+/*********************************************************************//**
+Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of
+a clustered index record.
+@return offset of DATA_TRX_ID */
+UNIV_INLINE
+ulint
+row_get_trx_id_offset(
+/*==================*/
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: record offsets */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Reads the trx id field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Reads the roll pointer field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/* Flags for row build type. */
+#define ROW_BUILD_NORMAL 0 /*!< build index row */
+#define ROW_BUILD_FOR_PURGE 1 /*!< build row for purge. */
+#define ROW_BUILD_FOR_UNDO 2 /*!< build row for undo. */
+#define ROW_BUILD_FOR_INSERT 3 /*!< build row for insert. */
+
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged
+@retval NULL if the externally stored columns in the clustered index record
+are unavailable and ext != NULL, or row is missing some needed columns. */
+dtuple_t*
+row_build_index_entry_low(
+/*======================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ const row_ext_t* ext, /*!< in: externally stored column
+ prefixes, or NULL */
+ const dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap, /*!< in,out: memory heap from which
+ the memory for the index entry
+ is allocated */
+ ulint flag) /*!< in: ROW_BUILD_NORMAL,
+ ROW_BUILD_FOR_PURGE
+ or ROW_BUILD_FOR_UNDO */
+ MY_ATTRIBUTE((warn_unused_result, nonnull(1,3,4)));
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INLINE
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ const row_ext_t* ext, /*!< in: externally stored column
+ prefixes, or NULL */
+ const dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap) /*!< in,out: memory heap from which
+ the memory for the index entry
+ is allocated */
+ MY_ATTRIBUTE((warn_unused_result, nonnull(1,3,4)));
+/*******************************************************************//**
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index.
+@return own: row built; see the NOTE below! */
+dtuple_t*
+row_build(
+/*======*/
+ ulint type, /*!< in: ROW_COPY_POINTERS or
+ ROW_COPY_DATA; the latter
+ copies also the data fields to
+ heap while the first only
+ places pointers to data fields
+ on the index page, and thus is
+ more efficient */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_t* rec, /*!< in: record in the clustered
+ index; NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the row dtuple is used! */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index)
+ or NULL, in which case this function
+ will invoke rec_get_offsets() */
+ const dict_table_t* col_table,
+ /*!< in: table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead; the user
+ columns in this table should be
+ the same columns as in index->table */
+ const dtuple_t* defaults,
+ /*!< in: default values of
+ added, changed columns, or NULL */
+ const ulint* col_map,/*!< in: mapping of old column
+ numbers to new ones, or NULL */
+ row_ext_t** ext, /*!< out, own: cache of
+ externally stored column
+ prefixes, or NULL */
+ mem_heap_t* heap); /*!< in: memory heap from which
+ the memory needed is allocated */
+
+/** An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index, taking into account virtual columns that
+are being added by an ongoing ALTER TABLE.
+@param[in] type ROW_COPY_POINTERS or ROW_COPY_DATA;
+@param[in] index clustered index
+@param[in] rec record in the clustered index
+@param[in] offsets rec_get_offsets(rec,index) or NULL
+@param[in] col_table table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead
+@param[in] defaults default values of added, changed columns, or NULL
+@param[in] add_v new virtual columns added
+ along with new indexes
+@param[in] col_map mapping of old column
+ numbers to new ones, or NULL
+@param[in] ext cache of externally stored column
+ prefixes, or NULL
+@param[in] heap memory heap from which
+ the memory needed is allocated
+@return own: row built */
+dtuple_t*
+row_build_w_add_vcol(
+ ulint type,
+ const dict_index_t* index,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ const dict_table_t* col_table,
+ const dtuple_t* defaults,
+ const dict_add_v_col_t* add_v,
+ const ulint* col_map,
+ row_ext_t** ext,
+ mem_heap_t* heap);
+
+/*******************************************************************//**
+Converts an index record to a typed data tuple.
+@return index entry built; does not set info_bits, and the data fields
+in the entry will point directly to rec */
+dtuple_t*
+row_rec_to_index_entry_low(
+/*=======================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************************//**
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap.
+@return own: index entry built */
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in/out: rec_get_offsets(rec) */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Convert a metadata record to a data tuple.
+@param[in] rec metadata record
+@param[in] index clustered index after instant ALTER TABLE
+@param[in] offsets rec_get_offsets(rec)
+@param[in,out] heap memory heap for allocations
+@param[in] info_bits the info_bits after an update
+@param[in] pad whether to pad to index->n_fields */
+dtuple_t*
+row_metadata_to_tuple(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ mem_heap_t* heap,
+ ulint info_bits,
+ bool pad)
+ MY_ATTRIBUTE((nonnull,warn_unused_result));
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record.
+@return own: row reference built; see the NOTE below! */
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+ MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /*!< in/out: row reference built;
+ see the NOTE below! */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: the data fields in ref
+ will point directly into this
+ record, therefore, the buffer
+ page of this record must be at
+ least s-latched and the latch
+ held as long as the row
+ reference is used! */
+ const dict_index_t* index, /*!< in: secondary index */
+ rec_offs* offsets)/*!< in: rec_get_offsets(rec, index)
+ or NULL */
+ MY_ATTRIBUTE((nonnull(1,2,3)));
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+ dtuple_t* ref, /*!< in/out: typed data tuple where the
+ reference is built */
+ const ulint* map, /*!< in: array of field numbers in rec
+ telling how ref should be built from
+ the fields of rec */
+ const rec_t* rec, /*!< in: secondary index record;
+ must be preserved while ref is used, as we do
+ not copy field values to heap */
+ const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Searches the clustered index record for a row, if we have the row
+reference.
+@return TRUE if found */
+ibool
+row_search_on_row_ref(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor, which must
+ be closed by the caller */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const dict_table_t* table, /*!< in: table */
+ const dtuple_t* ref, /*!< in: row reference */
+ mtr_t* mtr) /*!< in/out: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved.
+@return record or NULL, if no record found */
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: secondary index */
+ dict_index_t** clust_index,/*!< out: clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Parse the integer data from the specified data, which could be
+DATA_INT, DATA_FLOAT or DATA_DOUBLE. If the value is less than 0
+and the type is not unsigned, then we reset the value to 0.
+@param[in] data data to read
+@param[in] len length of data
+@param[in] mtype mtype of data
+@param[in] unsigned_type if the data is unsigned
+@return the integer value from the data */
+inline
+ib_uint64_t
+row_parse_int(
+ const byte* data,
+ ulint len,
+ ulint mtype,
+ bool unsigned_type);
+
+/** Result of row_search_index_entry */
+enum row_search_result {
+ ROW_FOUND = 0, /*!< the record was found */
+ ROW_NOT_FOUND, /*!< record not found */
+ ROW_BUFFERED, /*!< one of BTR_INSERT, BTR_DELETE, or
+ BTR_DELETE_MARK was specified, the
+ secondary index leaf page was not in
+ the buffer pool, and the operation was
+ enqueued in the insert/delete buffer */
+ ROW_NOT_DELETED_REF /*!< BTR_DELETE was specified, and
+ row_purge_poss_sec() failed */
+};
+
+/***************************************************************//**
+Searches an index record.
+@return whether the record was found or buffered */
+enum row_search_result
+row_search_index_entry(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: index entry */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
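+
+/* Illustrative sketch (not part of the original header; `index', `entry',
+`pcur' and `mtr' are assumed to be set up by the caller): dispatching on
+the row_search_result values defined above. */
+#if 0
+switch (row_search_index_entry(index, entry, BTR_MODIFY_LEAF,
+			       &pcur, &mtr)) {
+case ROW_FOUND:
+	/* pcur is positioned on the matching record */
+	break;
+case ROW_NOT_FOUND:
+	break;
+case ROW_BUFFERED:
+	/* the operation was enqueued in the change buffer */
+	break;
+case ROW_NOT_DELETED_REF:
+	/* BTR_DELETE was refused by row_purge_poss_sec() */
+	break;
+}
+#endif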
+
+#define ROW_COPY_DATA 1
+#define ROW_COPY_POINTERS 2
+
+/* The allowed latching order of index records is the following:
+(1) a secondary index record ->
+(2) the clustered index record ->
+(3) rollback segment data for the clustered index record. */
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) using
+"dict_field" and writes the result to "buf".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size is positive) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+ulint
+row_raw_format(
+/*===========*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ const dict_field_t* dict_field, /*!< in: index field */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Prepare to start a mini-transaction to modify an index.
+@param[in,out] mtr mini-transaction
+@param[in,out] index possibly secondary index
+@param[in] pessimistic whether this is a pessimistic operation */
+inline
+void
+row_mtr_start(mtr_t* mtr, dict_index_t* index, bool pessimistic)
+{
+ mtr->start();
+
+ switch (index->table->space_id) {
+ case IBUF_SPACE_ID:
+ if (pessimistic
+ && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) {
+ ibuf_free_excess_pages();
+ }
+ break;
+ case SRV_TMP_SPACE_ID:
+ mtr->set_log_mode(MTR_LOG_NO_REDO);
+ break;
+ default:
+ index->set_modified(*mtr);
+ break;
+ }
+
+ log_free_check();
+}
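+
+/* Illustrative sketch (not part of the original header; `index' is
+assumed): the usual bracketing around row_mtr_start(); the
+mini-transaction must be committed by the caller. */
+#if 0
+mtr_t mtr;
+row_mtr_start(&mtr, index, false); /* optimistic, leaf pages only */
+/* ... modify the index ... */
+mtr.commit();
+#endif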
+
+#include "row0row.ic"
+
+#endif
diff --git a/storage/innobase/include/row0row.ic b/storage/innobase/include/row0row.ic
new file mode 100644
index 00000000..e89adb58
--- /dev/null
+++ b/storage/innobase/include/row0row.ic
@@ -0,0 +1,221 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2018, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.ic
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "trx0undo.h"
+
+/*********************************************************************//**
+Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of
+a clustered index record.
+@return offset of DATA_TRX_ID */
+UNIV_INLINE
+ulint
+row_get_trx_id_offset(
+/*==================*/
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: record offsets */
+{
+ ulint offset;
+ ulint len;
+
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+
+ offset = rec_get_nth_field_offs(offsets, index->db_trx_id(), &len);
+
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ return(offset);
+}
+
+/*********************************************************************//**
+Reads the trx id field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint offset;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ offset = index->trx_id_offset;
+
+ if (!offset) {
+ offset = row_get_trx_id_offset(index, offsets);
+ }
+
+ return(trx_read_trx_id(rec + offset));
+}
+
+/*********************************************************************//**
+Reads the roll pointer field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint offset;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ offset = index->trx_id_offset;
+
+ if (!offset) {
+ offset = row_get_trx_id_offset(index, offsets);
+ }
+
+ return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN));
+}
+
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INLINE
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ const row_ext_t* ext, /*!< in: externally stored column
+ prefixes, or NULL */
+ const dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap) /*!< in,out: memory heap from which
+ the memory for the index entry
+ is allocated */
+{
+ dtuple_t* entry;
+
+ ut_ad(dtuple_check_typed(row));
+ entry = row_build_index_entry_low(row, ext, index, heap,
+ ROW_BUILD_NORMAL);
+ ut_ad(!entry || dtuple_check_typed(entry));
+ return(entry);
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+ dtuple_t* ref, /*!< in/out: typed data tuple where the
+ reference is built */
+ const ulint* map, /*!< in: array of field numbers in rec
+ telling how ref should be built from
+ the fields of rec */
+ const rec_t* rec, /*!< in: secondary index record;
+ must be preserved while ref is used, as we do
+ not copy field values to heap */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ dfield_t* dfield;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint field_no;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(!rec_offs_any_extern(offsets));
+ ref_len = dtuple_get_n_fields(ref);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ field_no = *(map + i);
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ field = rec_get_nth_field(rec, offsets,
+ field_no, &len);
+ dfield_set_data(dfield, field, len);
+ }
+ }
+}
+
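
A minimal standalone sketch of the map-driven copy that
row_build_row_ref_fast() performs, with plain ints standing in for
dfield_t/rec_t (all names illustrative):

#include <cassert>
#include <cstddef>

static const size_t UNDEF = static_cast<size_t>(-1); /* ~ULINT_UNDEFINED */

// Build ref[i] from rec[map[i]] by pointing into rec rather than copying,
// and leave entries alone where the map has no source field.
static void build_ref(const int** ref, size_t ref_len,
		      const size_t* map, const int* rec)
{
	for (size_t i = 0; i < ref_len; i++) {
		if (map[i] != UNDEF) {
			ref[i] = &rec[map[i]];  /* rec must outlive ref */
		}
	}
}

int main()
{
	int        rec[3] = {11, 22, 33};
	const int* ref[2] = {nullptr, nullptr};
	size_t     map[2] = {2, UNDEF};
	build_ref(ref, 2, map, rec);
	assert(*ref[0] == 33 && ref[1] == nullptr);
	return 0;
}
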
+/** Parse the integer value from the specified data, whose type may be
+DATA_INT, DATA_FLOAT or DATA_DOUBLE. If the value is less than 0
+and the type is not unsigned, then we reset the value to 0.
+@param[in] data data to read
+@param[in] len length of data
+@param[in] mtype mtype of data
+@param[in]	unsigned_type	whether the data is unsigned
+@return the integer value from the data */
+ib_uint64_t
+row_parse_int(
+ const byte* data,
+ ulint len,
+ ulint mtype,
+ bool unsigned_type)
+{
+ ib_uint64_t value = 0;
+
+ switch (mtype) {
+ case DATA_INT:
+
+ ut_a(len <= sizeof value);
+ value = mach_read_int_type(data, len, unsigned_type);
+ break;
+
+ case DATA_FLOAT:
+
+ ut_a(len == sizeof(float));
+ value = static_cast<ib_uint64_t>(mach_float_read(data));
+ break;
+
+ case DATA_DOUBLE:
+
+ ut_a(len == sizeof(double));
+ value = static_cast<ib_uint64_t>(mach_double_read(data));
+ break;
+
+ default:
+ ut_error;
+
+ }
+
+ if (!unsigned_type && static_cast<int64_t>(value) < 0) {
+ value = 0;
+ }
+
+ return(value);
+}
+
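A small standalone sketch of the clamping rule implemented by
row_parse_int(), with the DATA_* decoding omitted (clamp_to_unsigned is an
illustrative name, not an InnoDB function):

#include <cassert>
#include <cstdint>

// For signed types, a negative value is reset to 0 before being returned
// as an unsigned 64-bit integer; unsigned values pass through unchanged.
static uint64_t clamp_to_unsigned(int64_t v, bool unsigned_type)
{
	uint64_t value = static_cast<uint64_t>(v);
	if (!unsigned_type && static_cast<int64_t>(value) < 0) {
		value = 0;
	}
	return value;
}

int main()
{
	assert(clamp_to_unsigned(-5, false) == 0);  /* signed: clamped */
	assert(clamp_to_unsigned(-5, true)  != 0);  /* unsigned: kept  */
	assert(clamp_to_unsigned(7,  false) == 7);
	return 0;
}
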
diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h
new file mode 100644
index 00000000..60107712
--- /dev/null
+++ b/storage/innobase/include/row0sel.h
@@ -0,0 +1,469 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0sel.h
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0sel_h
+#define row0sel_h
+
+#include "data0data.h"
+#include "que0types.h"
+#include "trx0types.h"
+#include "read0types.h"
+#include "row0types.h"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "btr0pcur.h"
+#include "row0mysql.h"
+
+/*********************************************************************//**
+Creates a select node struct.
+@return own: select node struct */
+sel_node_t*
+sel_node_create(
+/*============*/
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/*********************************************************************//**
+Frees the memory private to a select node when a query graph is freed,
+does not free the heap where the node was originally created. */
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node); /*!< in: select node struct */
+/*********************************************************************//**
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf); /*!< in, own: prefetch buffer */
+/*********************************************************************//**
+Gets the plan node for the nth table in a join.
+@return plan node */
+UNIV_INLINE
+plan_t*
+sel_node_get_nth_plan(
+/*==================*/
+ sel_node_t* node, /*!< in: select node */
+ ulint i); /*!< in: get ith plan node */
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_sel_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an open or close cursor statement node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+open_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs a fetch for a cursor.
+@return query thread to run next or NULL */
+que_thr_t*
+fetch_step(
+/*=======*/
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************//**
+Prints a row in a select result.
+@return query thread to run next or NULL */
+que_thr_t*
+row_printf_step(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/** Copy used fields from cached row.
+Copy cache record field by field, don't touch fields that
+are not covered by current key.
+@param[out] buf Where to copy the MySQL row.
+@param[in] cached_rec What to copy (in MySQL row format).
+@param[in] prebuilt prebuilt struct. */
+void
+row_sel_copy_cached_fields_for_mysql(
+ byte* buf,
+ const byte* cached_rec,
+ row_prebuilt_t* prebuilt);
+
+/****************************************************************//**
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed length field: hence
+the parameter key_len. Currently, however, we do not allow search keys where
+the last field is only a prefix of the full key field length, and we print a
+warning if such a key appears. */
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /*!< in/out: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /*!< in: buffer to use in field
+ conversions; NOTE that dtuple->data
+ may end up pointing inside buf so
+ do not discard that buffer while
+ the tuple is being used. See
+ row_mysql_store_col_in_innobase_format()
+ in the case of DATA_INT */
+ ulint buf_len, /*!< in: buffer length */
+ dict_index_t* index, /*!< in: index of the key value */
+ const byte* key_ptr, /*!< in: MySQL key value */
+ ulint key_len); /*!< in: MySQL key value length */
+
+
+/** Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position, and fetch next or fetch prev must not be attempted on the cursor!
+
+@param[out] buf buffer for the fetched row in MySQL format
+@param[in] mode search mode PAGE_CUR_L
+@param[in,out] prebuilt prebuilt struct for the table handler;
+ this contains the info to search_tuple,
+ index; if search tuple contains 0 field then
+ we position the cursor at start or the end of
+ index, depending on 'mode'
+@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX
+@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV;
+ Note: if this is != 0, then prebuilt must have a
+ pcur with a stored position! When opening a
+ cursor, 'direction' should be 0.
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
+UNIV_INLINE
+dberr_t
+row_search_for_mysql(
+ byte* buf,
+ page_cur_mode_t mode,
+ row_prebuilt_t* prebuilt,
+ ulint match_mode,
+ ulint direction)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Searches for rows in the database using a cursor.
+The function is mainly used for tables that are shared across connections,
+so it employs a technique that helps reconstruct the rows that the
+transaction is supposed to see.
+It also has optimizations such as pre-caching rows, using the AHI, etc.
+
+@param[out] buf buffer for the fetched row in MySQL format
+@param[in] mode search mode PAGE_CUR_L
+@param[in,out] prebuilt prebuilt struct for the table handler;
+ this contains the info to search_tuple,
+ index; if search tuple contains 0 field then
+ we position the cursor at start or the end of
+ index, depending on 'mode'
+@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX
+@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV;
+ Note: if this is != 0, then prebuilt must have a
+ pcur with a stored position! When opening a
+ cursor, 'direction' should be 0.
+@return DB_SUCCESS or error code */
+dberr_t
+row_search_mvcc(
+ byte* buf,
+ page_cur_mode_t mode,
+ row_prebuilt_t* prebuilt,
+ ulint match_mode,
+ ulint direction)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Count rows in a R-Tree leaf level.
+@return DB_SUCCESS if successful */
+dberr_t
+row_count_rtree_recs(
+/*=================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
+ table handle; this contains the info
+ of search_tuple, index; if search
+ tuple contains 0 fields then we
+ position the cursor at the start or
+ the end of the index, depending on
+ 'mode' */
+ ulint* n_rows); /*!< out: number of entries
+ seen in the consistent read */
+
+/** Read the max AUTOINC value from an index.
+@param[in] index index starting with an AUTO_INCREMENT column
+@return the largest AUTO_INCREMENT value
+@retval 0 if no records were found */
+ib_uint64_t
+row_search_max_autoinc(dict_index_t* index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** A structure for caching column values for prefetched rows */
+struct sel_buf_t{
+ byte* data; /*!< data, or NULL; if not NULL, this field
+ has allocated memory which must be explicitly
+ freed; can be != NULL even when len is
+ UNIV_SQL_NULL */
+ ulint len; /*!< data length or UNIV_SQL_NULL */
+ ulint val_buf_size;
+ /*!< size of memory buffer allocated for data:
+ this can be more than len; this is defined
+ when data != NULL */
+};
+
+/** Query plan */
+struct plan_t{
+ dict_table_t* table; /*!< table struct in the dictionary
+ cache */
+ dict_index_t* index; /*!< table index used in the search */
+ btr_pcur_t pcur; /*!< persistent cursor used to search
+ the index */
+ ibool asc; /*!< TRUE if cursor traveling upwards */
+ ibool pcur_is_open; /*!< TRUE if pcur has been positioned
+ and we can try to fetch new rows */
+ ibool cursor_at_end; /*!< TRUE if the cursor is open but
+ we know that there are no more
+ qualifying rows left to retrieve from
+ the index tree; NOTE though, that
+ there may still be unprocessed rows in
+ the prefetch stack; always FALSE when
+ pcur_is_open is FALSE */
+ ibool stored_cursor_rec_processed;
+ /*!< TRUE if the pcur position has been
+ stored and the record it is positioned
+ on has already been processed */
+ que_node_t** tuple_exps; /*!< array of expressions
+ which are used to calculate
+ the field values in the search
+ tuple: there is one expression
+ for each field in the search
+ tuple */
+ dtuple_t* tuple; /*!< search tuple */
+ page_cur_mode_t mode; /*!< search mode: PAGE_CUR_G, ... */
+ ulint n_exact_match; /*!< number of first fields in
+ the search tuple which must be
+ exactly matched */
+ ibool unique_search; /*!< TRUE if we are searching an
+ index record with a unique key */
+ ulint n_rows_fetched; /*!< number of rows fetched using pcur
+ after it was opened */
+ ulint n_rows_prefetched;/*!< number of prefetched rows cached
+ for fetch: fetching several rows in
+ the same mtr saves CPU time */
+ ulint first_prefetched;/*!< index of the first cached row in
+ select buffer arrays for each column */
+ ibool no_prefetch; /*!< no prefetch for this table */
+ sym_node_list_t columns; /*!< symbol table nodes for the columns
+ to retrieve from the table */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ end_conds; /*!< conditions which determine the
+ fetch limit of the index segment we
+ have to look at: when one of these
+ fails, the result set has been
+ exhausted for the cursor in this
+ index; these conditions are normalized
+ so that in a comparison the column
+ for this table is the first argument */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ other_conds; /*!< the rest of search conditions we can
+ test at this table in a join */
+ ibool must_get_clust; /*!< TRUE if index is a non-clustered
+ index and we must also fetch the
+ clustered index record; this is the
+ case if the non-clustered record does
+ not contain all the needed columns, or
+ if this is a single-table explicit
+ cursor, or a searched update or
+ delete */
+ ulint* clust_map; /*!< map telling how clust_ref is built
+ from the fields of a non-clustered
+ record */
+ dtuple_t* clust_ref; /*!< the reference to the clustered
+ index entry is built here if index is
+ a non-clustered index */
+ btr_pcur_t clust_pcur; /*!< if index is non-clustered, we use
+ this pcur to search the clustered
+ index */
+ mem_heap_t* old_vers_heap; /*!< memory heap used in building an old
+ version of a row, or NULL */
+};
+
+/** Select node states */
+enum sel_node_state {
+ SEL_NODE_CLOSED, /*!< it is a declared cursor which is not
+ currently open */
+ SEL_NODE_OPEN, /*!< intention locks not yet set on tables */
+ SEL_NODE_FETCH, /*!< intention locks have been set */
+ SEL_NODE_NO_MORE_ROWS /*!< cursor has reached the result set end */
+};
+
+/** Select statement node */
+struct sel_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_SELECT */
+ enum sel_node_state
+ state; /*!< node state */
+ que_node_t* select_list; /*!< select list */
+ sym_node_t* into_list; /*!< variables list or NULL */
+ sym_node_t* table_list; /*!< table list */
+ ibool asc; /*!< TRUE if the rows should be fetched
+ in an ascending order */
+ ibool set_x_locks; /*!< TRUE if the cursor is for update or
+ delete, which means that a row x-lock
+ should be placed on the cursor row */
+ lock_mode row_lock_mode; /*!< LOCK_X or LOCK_S */
+ ulint n_tables; /*!< number of tables */
+ ulint fetch_table; /*!< number of the next table to access
+ in the join */
+ plan_t* plans; /*!< array of n_tables many plan nodes
+ containing the search plan and the
+ search data structures */
+ que_node_t* search_cond; /*!< search condition */
+ ReadView* read_view; /*!< if the query is a non-locking
+ consistent read, its read view is
+ placed here, otherwise NULL */
+ ibool consistent_read;/*!< TRUE if the select is a consistent,
+ non-locking read */
+ order_node_t* order_by; /*!< order by column definition, or
+ NULL */
+ ibool is_aggregate; /*!< TRUE if the select list consists of
+ aggregate functions */
+ ibool aggregate_already_fetched;
+ /*!< TRUE if the aggregate row has
+ already been fetched for the current
+ cursor */
+ ibool can_get_updated;/*!< this is TRUE if the select
+ is in a single-table explicit
+ cursor which can get updated
+ within the stored procedure,
+ or in a searched update or
+ delete; NOTE that to determine
+ whether an explicit cursor can
+ get updated, the parser checks
+ whether the stored procedure
+ contains positioned update or
+ delete statements */
+ sym_node_t* explicit_cursor;/*!< not NULL if an explicit cursor */
+ UT_LIST_BASE_NODE_T(sym_node_t)
+ copy_variables; /*!< variables whose values we have to
+ copy when an explicit cursor is opened,
+ so that they do not change between
+ fetches */
+};
+
+/** Fetch statement node */
+struct fetch_node_t{
+ que_common_t common; /*!< type: QUE_NODE_FETCH */
+ sel_node_t* cursor_def; /*!< cursor definition */
+ sym_node_t* into_list; /*!< variables to set */
+
+ pars_user_func_t*
+ func; /*!< User callback function or NULL.
+ The first argument to the function
+ is a sel_node_t*, containing the
+ results of the SELECT operation for
+ one row. If the function returns
+ NULL, it is not interested in
+ further rows and the cursor is
+ modified so (cursor % NOTFOUND) is
+ true. If it returns not-NULL,
+ continue normally. */
+};
+
+/** Open or close cursor operation type */
+enum open_node_op {
+ ROW_SEL_OPEN_CURSOR, /*!< open cursor */
+ ROW_SEL_CLOSE_CURSOR /*!< close cursor */
+};
+
+/** Open or close cursor statement node */
+struct open_node_t{
+ que_common_t common; /*!< type: QUE_NODE_OPEN */
+ enum open_node_op
+ op_type; /*!< operation type: open or
+ close cursor */
+ sel_node_t* cursor_def; /*!< cursor definition */
+};
+
+/** Row printf statement node */
+struct row_printf_node_t{
+ que_common_t common; /*!< type: QUE_NODE_ROW_PRINTF */
+ sel_node_t* sel_node; /*!< select */
+};
+
+/** Search direction for the MySQL interface */
+enum row_sel_direction {
+ ROW_SEL_NEXT = 1, /*!< ascending direction */
+ ROW_SEL_PREV = 2 /*!< descending direction */
+};
+
+/** Match mode for the MySQL interface */
+enum row_sel_match_mode {
+ ROW_SEL_EXACT = 1, /*!< search using a complete key value */
+ ROW_SEL_EXACT_PREFIX /*!< search using a key prefix which
+ must match rows: the prefix may
+ contain an incomplete field (the last
+ field in prefix may be just a prefix
+ of a fixed length column) */
+};
+
+#ifdef UNIV_DEBUG
+/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
+# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
+ row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len)
+#else /* UNIV_DEBUG */
+/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
+# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
+ row_sel_field_store_in_mysql_format_func(dest,templ,src,len)
+#endif /* UNIV_DEBUG */
+
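The macro pair above is the usual idiom for passing extra arguments only in
debug builds: call sites stay identical, while only the debug build forwards
the index and field number. A standalone sketch of the same pattern
(MY_DEBUG, store and store_func are illustrative names):

#include <cstdio>

#ifdef MY_DEBUG
static void store_func(int dest, int dbg_field_no, int value)
{
	printf("debug: field %d\n", dbg_field_no);
	(void) dest; (void) value;
}
# define store(dest, field_no, value) store_func(dest, field_no, value)
#else
static void store_func(int dest, int value) { (void) dest; (void) value; }
# define store(dest, field_no, value) store_func(dest, value)
#endif

int main()
{
	store(1, 2, 3);  /* identical call site in both builds */
	return 0;
}
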
+/**************************************************************//**
+Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
+function is row_mysql_store_col_in_innobase_format() in row0mysql.cc. */
+
+void
+row_sel_field_store_in_mysql_format_func(
+/*=====================================*/
+ byte* dest, /*!< in/out: buffer where to store; NOTE
+ that BLOBs are not in themselves
+ stored here: the caller must allocate
+ and copy the BLOB into buffer before,
+ and pass the pointer to the BLOB in
+ 'data' */
+ const mysql_row_templ_t* templ,
+ /*!< in: MySQL column template.
+ Its following fields are referenced:
+ type, is_unsigned, mysql_col_len,
+ mbminlen, mbmaxlen */
+#ifdef UNIV_DEBUG
+ const dict_index_t* index,
+ /*!< in: InnoDB index */
+ ulint field_no,
+ /*!< in: templ->rec_field_no or
+ templ->clust_rec_field_no or
+ templ->icp_rec_field_no */
+#endif /* UNIV_DEBUG */
+ const byte* data, /*!< in: data to store */
+ ulint len); /*!< in: length of the data */
+
+#include "row0sel.ic"
+
+#endif
diff --git a/storage/innobase/include/row0sel.ic b/storage/innobase/include/row0sel.ic
new file mode 100644
index 00000000..7880605c
--- /dev/null
+++ b/storage/innobase/include/row0sel.ic
@@ -0,0 +1,138 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0sel.ic
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+
+/*********************************************************************//**
+Gets the plan node for the nth table in a join.
+@return plan node */
+UNIV_INLINE
+plan_t*
+sel_node_get_nth_plan(
+/*==================*/
+ sel_node_t* node, /*!< in: select node */
+ ulint i) /*!< in: get ith plan node */
+{
+ ut_ad(i < node->n_tables);
+
+ return(node->plans + i);
+}
+
+/*********************************************************************//**
+Resets the cursor defined by sel_node to the SEL_NODE_OPEN state, which means
+that it will start fetching from the start of the result set again, regardless
+of where it was before, and it will set intention locks on the tables. */
+UNIV_INLINE
+void
+sel_node_reset_cursor(
+/*==================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ node->state = SEL_NODE_OPEN;
+}
+
+/**********************************************************************//**
+Performs an execution step of an open or close cursor statement node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+open_step(
+/*======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ sel_node_t* sel_node;
+ open_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = (open_node_t*) thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_OPEN);
+
+ sel_node = node->cursor_def;
+
+ err = DB_SUCCESS;
+
+ if (node->op_type == ROW_SEL_OPEN_CURSOR) {
+
+ /* if (sel_node->state == SEL_NODE_CLOSED) { */
+
+ sel_node_reset_cursor(sel_node);
+ /* } else {
+ err = DB_ERROR;
+ } */
+ } else {
+ if (sel_node->state != SEL_NODE_CLOSED) {
+
+ sel_node->state = SEL_NODE_CLOSED;
+ } else {
+ err = DB_ERROR;
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ /* SQL error detected */
+ fprintf(stderr, "SQL error %lu\n", (ulong) err);
+
+ ut_error;
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
+
+
+/** Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position, and fetch next or fetch prev must not be attempted on the cursor!
+
+@param[out] buf buffer for the fetched row in MySQL format
+@param[in] mode search mode PAGE_CUR_L
+@param[in,out] prebuilt prebuilt struct for the table handler;
+ this contains the info to search_tuple,
+ index; if search tuple contains 0 field then
+ we position the cursor at start or the end of
+ index, depending on 'mode'
+@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX
+@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV;
+ Note: if this is != 0, then prebuilt must have a
+ pcur with a stored position! When opening a
+ cursor, 'direction' should be 0.
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
+UNIV_INLINE
+dberr_t
+row_search_for_mysql(
+ byte* buf,
+ page_cur_mode_t mode,
+ row_prebuilt_t* prebuilt,
+ ulint match_mode,
+ ulint direction)
+{
+ return(row_search_mvcc(buf, mode, prebuilt, match_mode, direction));
+}
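
The inline wrapper above simply forwards to row_search_mvcc(), so both entry
points obey the same open-then-fetch protocol described in the comment. A
standalone sketch of that protocol (illustrative names, not the InnoDB API):

enum sel_dir { SEL_OPEN = 0, SEL_NEXT = 1, SEL_PREV = 2 };

struct cursor {
	bool positioned = false;

	int fetch(sel_dir direction)
	{
		if (direction == SEL_OPEN) {
			positioned = true;  /* open: position the cursor */
			return 0;           /* ~DB_SUCCESS */
		}
		if (!positioned) {
			return -1;  /* would violate the NOTE above */
		}
		return 0;  /* fetch next/prev from the stored position */
	}
};

int main()
{
	cursor c;
	if (c.fetch(SEL_OPEN) != 0) {  /* 'direction' must be 0 on open */
		return 1;
	}
	return c.fetch(SEL_NEXT);      /* reuses the stored position */
}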
diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h
new file mode 100644
index 00000000..5e737c1c
--- /dev/null
+++ b/storage/innobase/include/row0types.h
@@ -0,0 +1,54 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0types.h
+Row operation global types
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "buf0types.h"
+
+struct plan_t;
+
+struct upd_t;
+struct upd_field_t;
+struct upd_node_t;
+struct del_node_t;
+struct ins_node_t;
+struct sel_node_t;
+struct open_node_t;
+struct fetch_node_t;
+
+struct row_printf_node_t;
+struct sel_buf_t;
+
+struct undo_node_t;
+
+struct purge_node_t;
+
+struct row_ext_t;
+
+/** Buffer for logging modifications during online index creation */
+struct row_log_t;
+
+/* MySQL data types */
+struct TABLE;
diff --git a/storage/innobase/include/row0uins.h b/storage/innobase/include/row0uins.h
new file mode 100644
index 00000000..a9877969
--- /dev/null
+++ b/storage/innobase/include/row0uins.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0uins.h
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0uins_h
+#define row0uins_h
+
+#include "data0data.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even delete
+marked, at the time of the insert. InnoDB is eager in a rollback:
+if it figures out that an index record will be removed in the purge
+anyway, it will remove it in the rollback.
+@return DB_SUCCESS */
+dberr_t
+row_undo_ins(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#endif
diff --git a/storage/innobase/include/row0umod.h b/storage/innobase/include/row0umod.h
new file mode 100644
index 00000000..5032e103
--- /dev/null
+++ b/storage/innobase/include/row0umod.h
@@ -0,0 +1,46 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0umod.h
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0umod_h
+#define row0umod_h
+
+#include "data0data.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a modify operation on a row of a table.
+@return DB_SUCCESS or error code */
+dberr_t
+row_undo_mod(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+ MY_ATTRIBUTE((warn_unused_result));
+
+#endif
diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h
new file mode 100644
index 00000000..4357a908
--- /dev/null
+++ b/storage/innobase/include/row0undo.h
@@ -0,0 +1,128 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0undo.h
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0undo_h
+#define row0undo_h
+
+#include "trx0sys.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "que0types.h"
+#include "row0types.h"
+
+/********************************************************************//**
+Creates a row undo node to a query graph.
+@return own: undo node */
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/***********************************************************//**
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case.
+@return true if found; NOTE the node->pcur must be closed by the
+caller, regardless of the return value */
+bool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ undo_node_t* node) /*!< in/out: row undo node */
+ MY_ATTRIBUTE((warn_unused_result));
+/***********************************************************//**
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_undo_step(
+/*==========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/* A single query thread will try to perform the undo for all successive
+versions of a clustered index record, if the transaction has modified it
+several times during the execution that is being rolled back. It may happen
+that the task is transferred to another query thread, if the other thread
+is assigned to handle an undo log record in the chain of different versions
+of the record, and the other thread happens to get the x-latch to the
+clustered index record at the right time.
+ If a query thread notices that the clustered index record it is looking
+for is missing, or the roll ptr field in the record does not point to the
+undo log record the thread was assigned to handle, then it gives up the undo
+task for that undo log record, and fetches the next. This situation can occur
+only in the case where the transaction modified the same record several times
+and another thread is currently doing the undo for successive versions of
+that index record. */
+
+/** Execution state of an undo node */
+enum undo_exec {
+ UNDO_NODE_FETCH_NEXT = 1, /*!< we should fetch the next
+ undo log record */
+ /** rollback an insert into persistent table */
+ UNDO_INSERT_PERSISTENT,
+ /** rollback an update (or delete) in a persistent table */
+ UNDO_UPDATE_PERSISTENT,
+ /** rollback an insert into temporary table */
+ UNDO_INSERT_TEMPORARY,
+ /** rollback an update (or delete) in a temporary table */
+ UNDO_UPDATE_TEMPORARY,
+};
+
+/** Undo node structure */
+struct undo_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_UNDO */
+ undo_exec state; /*!< rollback execution state */
+ trx_t* trx; /*!< trx for which undo is done */
+ roll_ptr_t roll_ptr;/*!< roll pointer to undo log record */
+ trx_undo_rec_t* undo_rec;/*!< undo log record */
+ undo_no_t undo_no;/*!< undo number of the record */
+ ulint rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC,
+ ... */
+ trx_id_t new_trx_id; /*!< trx id to restore to clustered index
+ record */
+ btr_pcur_t pcur; /*!< persistent cursor used in searching the
+ clustered index record */
+ dict_table_t* table; /*!< table where undo is done */
+ ulint cmpl_info;/*!< compiler analysis of an update */
+ upd_t* update; /*!< update vector for a clustered index
+ record */
+ const dtuple_t* ref; /*!< row reference to the next row to handle */
+ dtuple_t* row; /*!< a copy (also fields copied to heap) of the
+ row to handle */
+ row_ext_t* ext; /*!< NULL, or prefixes of the externally
+ stored columns of the row */
+ dtuple_t* undo_row;/*!< NULL, or the row after undo */
+ row_ext_t* undo_ext;/*!< NULL, or prefixes of the externally
+ stored columns of undo_row */
+ dict_index_t* index; /*!< the next index whose record should be
+ handled */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage for
+ row; this must be emptied after undo is tried
+ on a row */
+};
+
+#endif
diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h
new file mode 100644
index 00000000..58c60a0a
--- /dev/null
+++ b/storage/innobase/include/row0upd.h
@@ -0,0 +1,568 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.h
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0upd_h
+#define row0upd_h
+
+#include "data0data.h"
+#include "rem0types.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "trx0types.h"
+#include "btr0pcur.h"
+#include "que0types.h"
+#include "pars0types.h"
+
+/*********************************************************************//**
+Creates an update vector object.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+ ulint n, /*!< in: number of fields */
+ mem_heap_t* heap); /*!< in: heap from which memory allocated */
+/*********************************************************************//**
+Returns the number of fields in the update vector == number of columns
+to be updated by an update vector.
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+ const upd_t* update); /*!< in: update vector */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+ const upd_t* update, /*!< in: update vector */
+ ulint n); /*!< in: field position in update vector */
+#else
+# define upd_get_nth_field(update, n) ((update)->fields + (n))
+#endif
+
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+ upd_field_t* upd_field, /*!< in: update vector field */
+ uint16_t field_no, /*!< in: field number in a clustered
+ index */
+ dict_index_t* index);
+
+/** Sets the field number of an update vector field, marking the field as updated
+@param[in,out] upd_field update vector field
+@param[in] field_no virtual column sequence num
+@param[in] index index */
+UNIV_INLINE
+void
+upd_field_set_v_field_no(
+ upd_field_t* upd_field,
+ uint16_t field_no,
+ dict_index_t* index);
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+ const upd_t* update, /*!< in: update vector */
+ uint16_t no, /*!< in: field_no */
+ bool is_virtual) /*!< in: if it is a virtual column */
+ MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return own: update node */
+upd_node_t*
+upd_node_create(
+/*============*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update);/*!< in: update vector */
+/***********************************************************//**
+Returns true if row update contains disowned external fields.
+@return true if the update contains disowned external fields. */
+bool
+row_upd_changes_disowned_external(
+/*==============================*/
+ const upd_t* update) /*!< in: update vector */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return own: update vector of differing fields */
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+ const rec_t* rec, /*!< in: secondary index record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+/** Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. NOTE: we compare the fields as binary strings!
+@param[in] index clustered index
+@param[in] entry clustered index entry to insert
+@param[in] rec clustered index record
+@param[in] offsets rec_get_offsets(rec,index), or NULL
+@param[in] no_sys skip the system columns
+ DB_TRX_ID and DB_ROLL_PTR
+@param[in] trx transaction (for diagnostics),
+ or NULL
+@param[in] heap memory heap from which allocated
+@param[in,out] mysql_table NULL, or mysql table object when
+ user thread invokes dml
+@param[out] error error number in case of failure
+@return own: update vector of differing fields, excluding roll ptr and
+trx id */
+upd_t*
+row_upd_build_difference_binary(
+ dict_index_t* index,
+ const dtuple_t* entry,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ bool no_sys,
+ trx_t* trx,
+ mem_heap_t* heap,
+ TABLE* mysql_table,
+ dberr_t* error)
+ MY_ATTRIBUTE((nonnull(1,2,3,7,9), warn_unused_result));
+/** Apply an update vector to an index entry.
+@param[in,out] entry index entry to be updated; the clustered index record
+ must be covered by a lock or a page latch to prevent
+ deletion (rollback or purge)
+@param[in] index index of the entry
+@param[in] update update vector built for the entry
+@param[in,out] heap memory heap for copying off-page columns */
+void
+row_upd_index_replace_new_col_vals_index_pos(
+ dtuple_t* entry,
+ const dict_index_t* index,
+ const upd_t* update,
+ mem_heap_t* heap)
+ MY_ATTRIBUTE((nonnull));
+/** Replace the new column values stored in the update vector,
+during trx_undo_prev_version_build().
+@param entry clustered index tuple where the values are replaced
+ (the clustered index leaf page latch must be held)
+@param index clustered index
+@param update update vector for the clustered index
+@param heap memory heap for allocating and copying values
+@return whether the previous version was built successfully */
+bool
+row_upd_index_replace_new_col_vals(dtuple_t *entry, const dict_index_t &index,
+ const upd_t *update, mem_heap_t *heap)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Replaces the new column values stored in the update vector. */
+void
+row_upd_replace(
+/*============*/
+ dtuple_t* row, /*!< in/out: row where replaced,
+ indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ row_ext_t** ext, /*!< out, own: NULL, or externally
+ stored column prefixes */
+ const dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ clustered index */
+ mem_heap_t* heap); /*!< in: memory heap */
+/** Replaces the virtual column values stored in a dtuple with those of
+an update vector.
+@param[in,out] row dtuple whose column to be updated
+@param[in] table table
+@param[in] update an update vector built for the clustered index
+@param[in] upd_new update to new or old value
+@param[in,out] undo_row undo row (if needs to be updated)
+@param[in] ptr remaining part in update undo log */
+void
+row_upd_replace_vcol(
+ dtuple_t* row,
+ const dict_table_t* table,
+ const upd_t* update,
+ bool upd_new,
+ dtuple_t* undo_row,
+ const byte* ptr);
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector changes an ordering field in the index record */
+ibool
+row_upd_changes_ord_field_binary_func(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update, /*!< in: update vector for the row; NOTE: the
+ field numbers in this MUST be clustered index
+ positions! */
+#ifdef UNIV_DEBUG
+ const que_thr_t*thr, /*!< in: query thread */
+#endif /* UNIV_DEBUG */
+ const dtuple_t* row, /*!< in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ const row_ext_t*ext, /*!< NULL, or prefixes of the externally
+ stored columns in the old row */
+ ulint flag) /*!< in: ROW_BUILD_NORMAL,
+ ROW_BUILD_FOR_PURGE or ROW_BUILD_FOR_UNDO */
+ MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
+#ifdef UNIV_DEBUG
+# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \
+ row_upd_changes_ord_field_binary_func(index,update,thr,row,ext,0)
+#else /* UNIV_DEBUG */
+# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \
+ row_upd_changes_ord_field_binary_func(index,update,row,ext,0)
+#endif /* UNIV_DEBUG */
+/***********************************************************//**
+Checks if an FTS indexed column is affected by an UPDATE.
+@return offset within fts_t::indexes if FTS indexed column updated else
+ULINT_UNDEFINED */
+ulint
+row_upd_changes_fts_column(
+/*=======================*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* upd_field); /*!< in: field to check */
+/***********************************************************//**
+Checks if an FTS Doc ID column is affected by an UPDATE.
+@return whether Doc ID column is affected */
+bool
+row_upd_changes_doc_id(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* upd_field) /*!< in: field to check */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector may change an ordering field in an index
+record */
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+ const dict_table_t* table, /*!< in: table */
+ const upd_t* update);/*!< in: update vector for the row */
+/***********************************************************//**
+Updates a row in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_upd_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/* Update vector field */
+struct upd_field_t{
+ uint16_t field_no; /*!< field number in an index, usually
+ the clustered index, but in updating
+ a secondary index record in btr0cur.cc
+ this is the position in the secondary
+ index. If this field is a virtual
+ column, then field_no represents
+ the nth virtual column in the table */
+ uint16_t orig_len; /*!< original length of the locally
+ stored part of an externally stored
+ column, or 0 */
+ que_node_t* exp; /*!< expression for calculating a new
+ value: it refers to column values and
+ constants in the symbol table of the
+ query graph */
+ dfield_t new_val; /*!< new value for the column */
+ dfield_t* old_v_val; /*!< old value for the virtual column */
+};
+
+
+/* check whether an update field is on a virtual column */
+#define upd_fld_is_virtual_col(upd_fld) \
+ (((upd_fld)->new_val.type.prtype & DATA_VIRTUAL) == DATA_VIRTUAL)
+
+/* set DATA_VIRTUAL bit on update field to show it is a virtual column */
+#define upd_fld_set_virtual_col(upd_fld) \
+ ((upd_fld)->new_val.type.prtype |= DATA_VIRTUAL)
+
+/* Update vector structure */
+struct upd_t{
+ mem_heap_t* heap; /*!< heap from which memory allocated */
+ byte info_bits; /*!< new value of info bits to record;
+ default is 0 */
+ dtuple_t* old_vrow; /*!< pointer to old row, used for
+ virtual column update now */
+ ulint n_fields; /*!< number of update fields */
+ upd_field_t* fields; /*!< array of update fields */
+ byte vers_sys_value[8]; /*!< buffer for updating system fields */
+
+ /** Append an update field to the end of array
+ @param[in] field an update field */
+ void append(const upd_field_t& field)
+ {
+ fields[n_fields++] = field;
+ }
+
+ void remove_element(ulint i)
+ {
+ ut_ad(n_fields > 0);
+ ut_ad(i < n_fields);
+ while (i < n_fields - 1)
+ {
+ fields[i]= fields[i + 1];
+ i++;
+ }
+ n_fields--;
+ }
+
+ bool remove(const ulint field_no)
+ {
+ for (ulint i= 0; i < n_fields; ++i)
+ {
+ if (field_no == fields[i].field_no)
+ {
+ remove_element(i);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /** Determine if the given field_no is modified.
+ @return true if modified, false otherwise. */
+ bool is_modified(uint16_t field_no) const
+ {
+ for (ulint i = 0; i < n_fields; ++i) {
+ if (field_no == fields[i].field_no) {
+ return(true);
+ }
+ }
+ return(false);
+ }
+
+ /** Determine if the update affects a system versioned column or row_end. */
+ bool affects_versioned() const
+ {
+ for (ulint i = 0; i < n_fields; i++) {
+ dtype_t type = fields[i].new_val.type;
+ if (type.is_versioned()) {
+ return true;
+ }
+ // versioned DELETE is UPDATE SET row_end=NOW
+ if (type.vers_sys_end()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /** @return whether this is for a hidden metadata record
+ for instant ALTER TABLE */
+ bool is_metadata() const { return dtuple_t::is_metadata(info_bits); }
+ /** @return whether this is for a hidden metadata record
+ for instant ALTER TABLE (not only ADD COLUMN) */
+ bool is_alter_metadata() const
+ { return dtuple_t::is_alter_metadata(info_bits); }
+
+#ifdef UNIV_DEBUG
+ bool validate() const
+ {
+ for (ulint i = 0; i < n_fields; ++i) {
+ dfield_t* field = &fields[i].new_val;
+ if (dfield_is_ext(field)) {
+ ut_ad(dfield_get_len(field)
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ return(true);
+ }
+#endif // UNIV_DEBUG
+};
+
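A standalone sketch of the left-shift compaction that
upd_t::remove_element() performs over its fields array, using a plain int
array for illustration:

#include <cassert>
#include <cstddef>

// Overwrite slot i by shifting the tail one position to the left, keeping
// the array dense, then decrement the count. O(n) per removal.
static void remove_element(int* fields, size_t& n, size_t i)
{
	assert(n > 0 && i < n);
	while (i < n - 1) {
		fields[i] = fields[i + 1];
		i++;
	}
	n--;
}

int main()
{
	int    f[4] = {10, 20, 30, 40};
	size_t n    = 4;
	remove_element(f, n, 1);  /* drop the second field */
	return (n == 3 && f[1] == 30 && f[2] == 40) ? 0 : 1;
}
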
+/** Kinds of update operation */
+enum delete_mode_t {
+ NO_DELETE = 0, /*!< this operation does not delete */
+ PLAIN_DELETE, /*!< ordinary delete */
+ VERSIONED_DELETE /*!< update old and insert a new row */
+};
+
+/* Update node structure which also implements the delete operation
+of a row */
+
+struct upd_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_UPDATE */
+ delete_mode_t is_delete; /*!< kind of DELETE */
+ ibool searched_update;
+ /* TRUE if searched update, FALSE if
+ positioned */
+ bool in_mysql_interface;
+ /* whether the update node was created
+ for the MySQL interface */
+ dict_foreign_t* foreign;/* NULL or pointer to a foreign key
+ constraint if this update node is used in
+ doing an ON DELETE or ON UPDATE operation */
+ upd_node_t* cascade_node;/* NULL or an update node template which
+ is used to implement ON DELETE/UPDATE CASCADE
+ or ... SET NULL for foreign keys */
+ mem_heap_t* cascade_heap;
+ /*!< NULL or a mem heap where cascade
+ node is created.*/
+ sel_node_t* select; /*!< query graph subtree implementing a base
+ table cursor: the rows returned will be
+ updated */
+ btr_pcur_t* pcur; /*!< persistent cursor placed on the clustered
+ index record which should be updated or
+ deleted; the cursor is stored in the graph
+ of 'select' field above, except in the case
+ of the MySQL interface */
+ dict_table_t* table; /*!< table where updated */
+ upd_t* update; /*!< update vector for the row */
+ ulint update_n_fields;
+ /* when this struct is used to implement
+ a cascade operation for foreign keys, we store
+ here the size of the buffer allocated for use
+ as the update vector */
+ sym_node_list_t columns;/* symbol table nodes for the columns
+ to retrieve from the table */
+ ibool has_clust_rec_x_lock;
+ /* TRUE if the select which retrieves the
+ records to update already sets an x-lock on
+ the clustered record; note that it must always
+ set at least an s-lock */
+ ulint cmpl_info;/* information extracted during query
+ compilation; speeds up execution:
+ UPD_NODE_NO_ORD_CHANGE and
+ UPD_NODE_NO_SIZE_CHANGE, ORed */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+ dict_index_t* index; /*!< NULL, or the next index whose record should
+ be updated */
+ dtuple_t* row; /*!< NULL, or a copy (also fields copied to
+ heap) of the row to update; this must be reset
+ to NULL after a successful update */
+ dtuple_t* historical_row; /*!< historical row used in
+ CASCADE UPDATE/SET NULL;
+ allocated from historical_heap */
+ mem_heap_t* historical_heap; /*!< heap for historical row insertion;
+ created when row to update is located;
+ freed right before row update */
+ row_ext_t* ext; /*!< NULL, or prefixes of the externally
+ stored columns in the old row */
+ dtuple_t* upd_row;/* NULL, or a copy of the updated row */
+ row_ext_t* upd_ext;/* NULL, or prefixes of the externally
+ stored columns in upd_row */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage;
+ this must be emptied after a successful
+ update */
+ /*----------------------*/
+ sym_node_t* table_sym;/* table node in symbol table */
+ que_node_t* col_assign_list;
+ /* column assignment list */
+ ulint magic_n;
+
+private:
+ /** Appends row_start or row_end field to update vector and sets a
+ CURRENT_TIMESTAMP/trx->id value to it.
+ Supposed to be called only by make_versioned_update() and
+ make_versioned_delete().
+ @param[in] trx transaction
+ @param[in] vers_sys_idx table->row_start or table->row_end */
+ void vers_update_fields(const trx_t *trx, ulint idx);
+
+public:
+ /** Also set row_start = CURRENT_TIMESTAMP/trx->id
+ @param[in] trx transaction */
+ void vers_make_update(const trx_t *trx)
+ {
+ vers_update_fields(trx, table->vers_start);
+ }
+
+ /** Only set row_end = CURRENT_TIMESTAMP/trx->id.
+ Do not touch other fields at all.
+ @param[in] trx transaction */
+ void vers_make_delete(const trx_t *trx)
+ {
+ update->n_fields = 0;
+ is_delete = VERSIONED_DELETE;
+ vers_update_fields(trx, table->vers_end);
+ }
+};
+
+#define UPD_NODE_MAGIC_N 1579975
+
+/* Node execution states */
+#define UPD_NODE_SET_IX_LOCK 1 /* execution came to the node from
+ a node above and if the field
+ has_clust_rec_x_lock is FALSE, we
+ should set an intention x-lock on
+ the table */
+#define UPD_NODE_UPDATE_CLUSTERED 2 /* clustered index record should be
+ updated */
+#define UPD_NODE_INSERT_CLUSTERED 3 /* clustered index record should be
+ inserted, old record is already delete
+ marked */
+#define UPD_NODE_UPDATE_ALL_SEC 5 /* an ordering field of the clustered
+ index record was changed, or this is
+ a delete operation: should update
+ all the secondary index records */
+#define UPD_NODE_UPDATE_SOME_SEC 6 /* secondary index entries should be
+ looked at and updated if an ordering
+ field changed */
+
+/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */
+#define UPD_NODE_NO_ORD_CHANGE 1 /* no secondary index record will be
+ changed in the update and no ordering
+ field of the clustered index */
+#define UPD_NODE_NO_SIZE_CHANGE 2 /* no record field size will be
+ changed in the update */
+
+
+#include "row0upd.ic"
+
+#endif
diff --git a/storage/innobase/include/row0upd.ic b/storage/innobase/include/row0upd.ic
new file mode 100644
index 00000000..13aacf3f
--- /dev/null
+++ b/storage/innobase/include/row0upd.ic
@@ -0,0 +1,153 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.ic
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+#include "row0row.h"
+#include "lock0lock.h"
+#include "page0zip.h"
+
+/*********************************************************************//**
+Creates an update vector object.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+ ulint n, /*!< in: number of fields */
+ mem_heap_t* heap) /*!< in: heap from which memory allocated */
+{
+ upd_t* update;
+
+ update = static_cast<upd_t*>(mem_heap_zalloc(
+ heap, sizeof(upd_t) + sizeof(upd_field_t) * n));
+
+ update->n_fields = n;
+ update->fields = reinterpret_cast<upd_field_t*>(&update[1]);
+ update->heap = heap;
+
+ return(update);
+}
+
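+/* Illustrative usage sketch, not part of the original file: an update
+vector is allocated from a caller-owned heap and is freed together with
+that heap. The heap size and field count are arbitrary example values.
+
+	mem_heap_t*	heap = mem_heap_create(1024);
+	upd_t*		update = upd_create(3, heap);	// room for 3 fields
+	ut_ad(upd_get_n_fields(update) == 3);
+	// ... fill in update->fields[0..2] ...
+	mem_heap_free(heap);	// also releases "update"
+*/
+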
+/*********************************************************************//**
+Returns the number of fields in the update vector == number of columns
+to be updated by an update vector.
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+ const upd_t* update) /*!< in: update vector */
+{
+ ut_ad(update);
+
+ return(update->n_fields);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+ const upd_t* update, /*!< in: update vector */
+ ulint n) /*!< in: field position in update vector */
+{
+ ut_ad(update);
+ ut_ad(n < update->n_fields);
+
+ return((upd_field_t*) update->fields + n);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+ upd_field_t* upd_field, /*!< in: update vector field */
+ uint16_t field_no, /*!< in: field number in a clustered
+ index */
+ dict_index_t* index) /*!< in: index */
+{
+ upd_field->field_no = field_no;
+ upd_field->orig_len = 0;
+ dict_col_copy_type(dict_index_get_nth_col(index, field_no),
+ dfield_get_type(&upd_field->new_val));
+}
+
+/** Set the field number of an update vector field and mark the field
+as updated.
+@param[in,out]	upd_field	update vector field
+@param[in]	field_no	virtual column sequence number
+@param[in] index index */
+UNIV_INLINE
+void
+upd_field_set_v_field_no(
+ upd_field_t* upd_field,
+ uint16_t field_no,
+ dict_index_t* index)
+{
+ ut_a(field_no < dict_table_get_n_v_cols(index->table));
+ upd_field->field_no = field_no;
+ upd_field->orig_len = 0;
+
+ dict_col_copy_type(&dict_table_get_nth_v_col(
+ index->table, field_no)->m_col,
+ dfield_get_type(&upd_field->new_val));
+}
+
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+ const upd_t* update, /*!< in: update vector */
+ uint16_t no, /*!< in: field_no */
+ bool is_virtual) /*!< in: if it is virtual column */
+{
+ ulint i;
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+
+		/* skip unless the field's virtual-column flag
+		matches is_virtual */
+ if ((!is_virtual) != (!upd_fld_is_virtual_col(uf))) {
+ continue;
+ }
+
+ if (uf->field_no == no) {
+
+ return(uf);
+ }
+ }
+
+ return(NULL);
+}
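+
+/* Illustrative lookup sketch, not part of the original file: find the
+update for clustered-index field number 4, skipping virtual columns. The
+field number and the "update" variable are hypothetical.
+
+	const upd_field_t*	uf
+		= upd_get_field_by_field_no(update, 4, false);
+	if (uf != NULL) {
+		// field 4 is updated; uf->new_val holds the new value
+	}
+*/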
diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h
new file mode 100644
index 00000000..d54384f8
--- /dev/null
+++ b/storage/innobase/include/row0vers.h
@@ -0,0 +1,141 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0vers.h
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0vers_h
+#define row0vers_h
+
+#include "data0data.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "rem0types.h"
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "row0types.h"
+
+// Forward declaration
+class ReadView;
+
+/** Determine if an active transaction has inserted or modified a secondary
+index record.
+@param[in,out] caller_trx trx of current thread
+@param[in] rec secondary index record
+@param[in] index secondary index
+@param[in] offsets rec_get_offsets(rec, index)
+@return the active transaction; state must be rechecked after
+trx_mutex_enter(), and trx->release_reference() must be invoked
+@retval NULL if the record was committed */
+trx_t*
+row_vers_impl_x_locked(
+ trx_t* caller_trx,
+ const rec_t* rec,
+ dict_index_t* index,
+ const rec_offs* offsets);
+
+/** Finds out if some version of the record, with version >= the current
+purge view, should have ientry as its secondary index entry. We check
+whether there is any not-delete-marked version of the record whose trx
+id >= purge view and whose secondary index entry == ientry; exactly in
+this case we return TRUE.
+@param[in] also_curr TRUE if also rec is included in the versions
+ to search; otherwise only versions prior
+ to it are searched
+@param[in] rec record in the clustered index; the caller
+ must have a latch on the page
+@param[in] mtr mtr holding the latch on rec; it will
+ also hold the latch on purge_view
+@param[in] index secondary index
+@param[in] ientry secondary index entry
+@param[in] roll_ptr roll_ptr for the purge record
+@param[in] trx_id transaction ID on the purging record
+@return TRUE if an earlier version should have ientry */
+bool
+row_vers_old_has_index_entry(
+ bool also_curr,
+ const rec_t* rec,
+ mtr_t* mtr,
+ dict_index_t* index,
+ const dtuple_t* ientry,
+ roll_ptr_t roll_ptr,
+ trx_id_t trx_id);
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+dberr_t
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+				of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /*!< in: the clustered index */
+ rec_offs** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ ReadView* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers,/*!< out, own: old version, or NULL
+ if the history is missing or the record
+ does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ dtuple_t** vrow); /*!< out: reports virtual column info if any */
+
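+/* Illustrative sketch, not part of the original file: the usual
+consistent-read pattern served by the declaration above. The caller has
+already determined that the current version of rec is not visible to its
+read view; the surrounding variables are assumed to exist.
+
+	rec_t*		old_vers;
+	dtuple_t*	vrow;
+	dberr_t		err = row_vers_build_for_consistent_read(
+		rec, mtr, index, &offsets, view,
+		&offset_heap, in_heap, &old_vers, &vrow);
+	if (err == DB_SUCCESS && old_vers == NULL) {
+		// the record did not exist in this view (fresh insert)
+	}
+*/
+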
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. */
+void
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+	trx_t*		caller_trx,/*!< in/out: trx of current thread */
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+				of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ rec_offs** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers,/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ dtuple_t** vrow); /*!< out: holds virtual column info if any
+ is updated in the view */
+
+#endif
diff --git a/storage/innobase/include/rw_lock.h b/storage/innobase/include/rw_lock.h
new file mode 100644
index 00000000..b50a76fa
--- /dev/null
+++ b/storage/innobase/include/rw_lock.h
@@ -0,0 +1,112 @@
+/*****************************************************************************
+
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include <atomic>
+#include "my_dbug.h"
+
+/** Simple read-write lock based on std::atomic */
+class rw_lock
+{
+ /** The lock word */
+ std::atomic<uint32_t> lock;
+
+protected:
+ /** Available lock */
+ static constexpr uint32_t UNLOCKED= 0;
+ /** Flag to indicate that write_lock() is being held */
+ static constexpr uint32_t WRITER= 1U << 31;
+ /** Flag to indicate that write_lock_wait() is pending */
+ static constexpr uint32_t WRITER_WAITING= 1U << 30;
+ /** Flag to indicate that write_lock() or write_lock_wait() is pending */
+ static constexpr uint32_t WRITER_PENDING= WRITER | WRITER_WAITING;
+
+ /** Start waiting for an exclusive lock. */
+ void write_lock_wait_start()
+ { lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed); }
+ /** Try to acquire a shared lock.
+ @param l the value of the lock word
+ @return whether the lock was acquired */
+ bool read_trylock(uint32_t &l)
+ {
+ l= UNLOCKED;
+ while (!lock.compare_exchange_strong(l, l + 1, std::memory_order_acquire,
+ std::memory_order_relaxed))
+ {
+ DBUG_ASSERT(!(WRITER & l) || !(~WRITER_PENDING & l));
+ if (l & WRITER_PENDING)
+ return false;
+ }
+ return true;
+ }
+  /** Poll for an exclusive lock after write_lock_wait_start().
+  @return whether the exclusive lock was acquired */
+ bool write_lock_poll()
+ {
+ auto l= WRITER_WAITING;
+ if (lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+ std::memory_order_relaxed))
+ return true;
+ if (!(l & WRITER_WAITING))
+ /* write_lock() must have succeeded for another thread */
+ write_lock_wait_start();
+ return false;
+ }
+
+public:
+ /** Default constructor */
+ rw_lock() : lock(UNLOCKED) {}
+
+ /** Release a shared lock */
+ void read_unlock()
+ {
+ IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(1, std::memory_order_release);
+ DBUG_ASSERT(l & ~WRITER_PENDING); /* at least one read lock */
+ DBUG_ASSERT(!(l & WRITER)); /* no write lock must have existed */
+ }
+ /** Release an exclusive lock */
+ void write_unlock()
+ {
+ IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(WRITER, std::memory_order_release);
+ DBUG_ASSERT(l & WRITER); /* the write lock must have existed */
+ }
+ /** Try to acquire a shared lock.
+ @return whether the lock was acquired */
+ bool read_trylock() { uint32_t l; return read_trylock(l); }
+ /** Try to acquire an exclusive lock.
+ @return whether the lock was acquired */
+ bool write_trylock()
+ {
+ auto l= UNLOCKED;
+ return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+ std::memory_order_relaxed);
+ }
+
+ /** @return whether an exclusive lock is being held by any thread */
+ bool is_write_locked() const
+ { return !!(lock.load(std::memory_order_relaxed) & WRITER); }
+ /** @return whether a shared lock is being held by any thread */
+ bool is_read_locked() const
+ {
+ auto l= lock.load(std::memory_order_relaxed);
+ return (l & ~WRITER_PENDING) && !(l & WRITER);
+ }
+ /** @return whether any lock is being held by any thread */
+ bool is_locked() const
+ { return (lock.load(std::memory_order_relaxed) & ~WRITER_WAITING) != 0; }
+};
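+
+/* Illustrative sketch, not part of the original file: how a derived class
+could build a blocking write_lock() from the protected primitives above.
+The class name and the bare spin loop are assumptions; a real
+implementation would pause or block between polls.
+
+	class spinning_rw_lock : public rw_lock
+	{
+	public:
+	  void write_lock()
+	  {
+	    if (write_trylock())
+	      return;
+	    write_lock_wait_start();	// advertise the waiting writer
+	    while (!write_lock_poll()) {}
+	  }
+	};
+
+Readers pair read_trylock() with read_unlock(); read_trylock() fails
+while WRITER or WRITER_WAITING is set, so a waiting writer is not
+starved by a stream of new readers.
+*/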
diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
new file mode 100644
index 00000000..cbda9d06
--- /dev/null
+++ b/storage/innobase/include/srv0mon.h
@@ -0,0 +1,892 @@
+/***********************************************************************
+
+Copyright (c) 2010, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/srv0mon.h
+Server monitor counter related defines
+
+Created 12/15/2009 Jimmy Yang
+*******************************************************/
+
+#ifndef srv0mon_h
+#define srv0mon_h
+
+#include "univ.i"
+
+#ifndef __STDC_LIMIT_MACROS
+/* Required for FreeBSD so that INT64_MAX is defined. */
+#define __STDC_LIMIT_MACROS
+#endif /* __STDC_LIMIT_MACROS */
+
+#include <stdint.h>
+#include "my_atomic.h"
+#include "my_atomic_wrapper.h"
+
+/** Possible status values for "mon_status" in "struct monitor_value_t" */
+enum monitor_running_status {
+ MONITOR_STARTED = 1, /*!< Monitor has been turned on */
+ MONITOR_STOPPED = 2 /*!< Monitor has been turned off */
+};
+
+typedef enum monitor_running_status monitor_running_t;
+
+/** Monitor counter value type */
+typedef int64_t mon_type_t;
+
+/** Two monitor structures are defined in this file. One is
+"monitor_value_t", which contains the dynamic counter values for each
+counter. The other is "monitor_info_t", which contains
+static information (counter name, description etc.) for each counter.
+In addition, an enum datatype "monitor_id_t" is defined; it
+identifies each monitor with an internally used symbol whose
+integer value indexes into the above two structures for the dynamic
+and static information.
+Developers who intend to add new counters need to
+fill in the counter information as described in "monitor_info_t" and
+create the internal counter ID in "monitor_id_t". */
+
+/** Structure containing the actual values of a monitor counter. */
+struct monitor_value_t {
+ time_t mon_start_time; /*!< Start time of monitoring */
+ time_t mon_stop_time; /*!< Stop time of monitoring */
+ time_t mon_reset_time; /*!< Time of resetting the counter */
+ mon_type_t mon_value; /*!< Current counter Value */
+ mon_type_t mon_max_value; /*!< Current Max value */
+ mon_type_t mon_min_value; /*!< Current Min value */
+ mon_type_t mon_value_reset;/*!< value at last reset */
+ mon_type_t mon_max_value_start; /*!< Max value since start */
+ mon_type_t mon_min_value_start; /*!< Min value since start */
+ mon_type_t mon_start_value;/*!< Value at the start time */
+ mon_type_t mon_last_value; /*!< Last set of values */
+	monitor_running_t mon_status;	/*!< whether the monitor is still
+					running */
+};
+
+/** The following are possible values for the "monitor_type" field in
+"struct monitor_info_t" */
+enum monitor_type_t {
+ MONITOR_NONE = 0, /*!< No monitoring */
+ MONITOR_MODULE = 1, /*!< This is a monitor module type,
+ not a counter */
+ MONITOR_EXISTING = 2, /*!< The monitor carries information from
+ an existing system status variable */
+ MONITOR_NO_AVERAGE = 4, /*!< Set this status if we don't want to
+ calculate the average value for the counter */
+ MONITOR_DISPLAY_CURRENT = 8, /*!< Display current value of the
+ counter, rather than incremental value
+ over the period. Mostly for counters
+ displaying current resource usage */
+ MONITOR_GROUP_MODULE = 16, /*!< Monitor can be turned on/off
+ only as a module, but not individually */
+ MONITOR_DEFAULT_ON = 32,/*!< Monitor will be turned on by default at
+ server start up */
+ MONITOR_SET_OWNER = 64, /*!< Owner of "monitor set", a set of
+ monitor counters */
+ MONITOR_SET_MEMBER = 128,/*!< Being part of a "monitor set" */
+ MONITOR_HIDDEN = 256 /*!< Do not display this monitor in the
+ metrics table */
+};
+
+/** The counter minimum value is initialized to the maximum value of
+ mon_type_t (int64_t) */
+#ifndef INT64_MAX
+#define INT64_MAX (9223372036854775807LL)
+#endif
+#ifndef INT64_MIN
+#define INT64_MIN (-9223372036854775807LL-1)
+#endif
+#define MIN_RESERVED INT64_MAX
+#define MAX_RESERVED INT64_MIN
+
+/** This enumeration defines the internal monitor identifiers used
+to identify each particular counter. Each value indexes into two arrays:
+the "innodb_counter_value" array, which records the actual monitor
+counter values, and the "innodb_counter_info" array, which describes
+each counter's basic information (name, description etc.). A couple of
+naming rules apply:
+1) If the monitor defines a module, its name starts with MONITOR_MODULE
+2) If the monitor uses existing counters from a "status variable", its ID
+name shall start with MONITOR_OVLD
+
+Please refer to "innodb_counter_info" in srv/srv0mon.cc for detailed
+information on each monitor counter */
+
+enum monitor_id_t {
+ /* This is to identify the default value set by the metrics
+ control global variables */
+ MONITOR_DEFAULT_START = 0,
+
+ /* Start of Metadata counter */
+ MONITOR_MODULE_METADATA,
+ MONITOR_TABLE_OPEN,
+ MONITOR_TABLE_CLOSE,
+ MONITOR_TABLE_REFERENCE,
+
+ /* Lock manager related counters */
+ MONITOR_MODULE_LOCK,
+ MONITOR_DEADLOCK,
+ MONITOR_TIMEOUT,
+ MONITOR_LOCKREC_WAIT,
+ MONITOR_TABLELOCK_WAIT,
+ MONITOR_NUM_RECLOCK_REQ,
+ MONITOR_RECLOCK_CREATED,
+ MONITOR_RECLOCK_REMOVED,
+ MONITOR_NUM_RECLOCK,
+ MONITOR_TABLELOCK_CREATED,
+ MONITOR_TABLELOCK_REMOVED,
+ MONITOR_NUM_TABLELOCK,
+ MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT,
+ MONITOR_OVLD_LOCK_WAIT_TIME,
+ MONITOR_OVLD_LOCK_MAX_WAIT_TIME,
+ MONITOR_OVLD_ROW_LOCK_WAIT,
+ MONITOR_OVLD_LOCK_AVG_WAIT_TIME,
+
+	/* Buffer and I/O related counters. */
+ MONITOR_MODULE_BUFFER,
+ MONITOR_OVLD_BUFFER_POOL_SIZE,
+ MONITOR_OVLD_BUF_POOL_READS,
+ MONITOR_OVLD_BUF_POOL_READ_REQUESTS,
+ MONITOR_OVLD_BUF_POOL_WRITE_REQUEST,
+ MONITOR_OVLD_BUF_POOL_WAIT_FREE,
+ MONITOR_OVLD_BUF_POOL_READ_AHEAD,
+ MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED,
+ MONITOR_OVLD_BUF_POOL_PAGE_TOTAL,
+ MONITOR_OVLD_BUF_POOL_PAGE_MISC,
+ MONITOR_OVLD_BUF_POOL_PAGES_DATA,
+ MONITOR_OVLD_BUF_POOL_BYTES_DATA,
+ MONITOR_OVLD_BUF_POOL_PAGES_DIRTY,
+ MONITOR_OVLD_BUF_POOL_BYTES_DIRTY,
+ MONITOR_OVLD_BUF_POOL_PAGES_FREE,
+ MONITOR_OVLD_PAGE_CREATED,
+ MONITOR_OVLD_PAGES_WRITTEN,
+ MONITOR_OVLD_INDEX_PAGES_WRITTEN,
+ MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN,
+ MONITOR_OVLD_PAGES_READ,
+ MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS,
+ MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED,
+ MONITOR_OVLD_BYTE_READ,
+ MONITOR_OVLD_BYTE_WRITTEN,
+ MONITOR_FLUSH_BATCH_SCANNED,
+ MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+ MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+ MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_COUNT,
+ MONITOR_FLUSH_BATCH_PAGES,
+ MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+ MONITOR_FLUSH_NEIGHBOR_COUNT,
+ MONITOR_FLUSH_NEIGHBOR_PAGES,
+ MONITOR_FLUSH_N_TO_FLUSH_REQUESTED,
+
+ MONITOR_FLUSH_N_TO_FLUSH_BY_AGE,
+ MONITOR_FLUSH_ADAPTIVE_AVG_TIME,
+
+ MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
+
+ MONITOR_LRU_GET_FREE_LOOPS,
+ MONITOR_LRU_GET_FREE_WAITS,
+
+ MONITOR_FLUSH_AVG_PAGE_RATE,
+ MONITOR_FLUSH_LSN_AVG_RATE,
+ MONITOR_FLUSH_PCT_FOR_DIRTY,
+ MONITOR_FLUSH_PCT_FOR_LSN,
+ MONITOR_FLUSH_SYNC_WAITS,
+ MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_COUNT,
+ MONITOR_FLUSH_ADAPTIVE_PAGES,
+ MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_COUNT,
+ MONITOR_FLUSH_SYNC_PAGES,
+ MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_COUNT,
+ MONITOR_FLUSH_BACKGROUND_PAGES,
+ MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+ MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_FLUSH_COUNT,
+ MONITOR_LRU_BATCH_FLUSH_PAGES,
+ MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_EVICT_COUNT,
+ MONITOR_LRU_BATCH_EVICT_PAGES,
+ MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT,
+ MONITOR_LRU_GET_FREE_SEARCH,
+ MONITOR_LRU_SEARCH_SCANNED,
+ MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+
+ /* Buffer Page I/O specific counters. */
+ MONITOR_MODULE_BUF_PAGE,
+ MONITOR_INDEX_LEAF_PAGE_READ,
+ MONITOR_INDEX_NON_LEAF_PAGE_READ,
+ MONITOR_INDEX_IBUF_LEAF_PAGE_READ,
+ MONITOR_INDEX_IBUF_NON_LEAF_PAGE_READ,
+ MONITOR_UNDO_LOG_PAGE_READ,
+ MONITOR_INODE_PAGE_READ,
+ MONITOR_IBUF_FREELIST_PAGE_READ,
+ MONITOR_IBUF_BITMAP_PAGE_READ,
+ MONITOR_SYSTEM_PAGE_READ,
+ MONITOR_TRX_SYSTEM_PAGE_READ,
+ MONITOR_FSP_HDR_PAGE_READ,
+ MONITOR_XDES_PAGE_READ,
+ MONITOR_BLOB_PAGE_READ,
+ MONITOR_ZBLOB_PAGE_READ,
+ MONITOR_ZBLOB2_PAGE_READ,
+ MONITOR_OTHER_PAGE_READ,
+ MONITOR_INDEX_LEAF_PAGE_WRITTEN,
+ MONITOR_INDEX_NON_LEAF_PAGE_WRITTEN,
+ MONITOR_INDEX_IBUF_LEAF_PAGE_WRITTEN,
+ MONITOR_INDEX_IBUF_NON_LEAF_PAGE_WRITTEN,
+ MONITOR_UNDO_LOG_PAGE_WRITTEN,
+ MONITOR_INODE_PAGE_WRITTEN,
+ MONITOR_IBUF_FREELIST_PAGE_WRITTEN,
+ MONITOR_IBUF_BITMAP_PAGE_WRITTEN,
+ MONITOR_SYSTEM_PAGE_WRITTEN,
+ MONITOR_TRX_SYSTEM_PAGE_WRITTEN,
+ MONITOR_FSP_HDR_PAGE_WRITTEN,
+ MONITOR_XDES_PAGE_WRITTEN,
+ MONITOR_BLOB_PAGE_WRITTEN,
+ MONITOR_ZBLOB_PAGE_WRITTEN,
+ MONITOR_ZBLOB2_PAGE_WRITTEN,
+ MONITOR_OTHER_PAGE_WRITTEN,
+
+ /* OS level counters (I/O) */
+ MONITOR_MODULE_OS,
+ MONITOR_OVLD_OS_FILE_READ,
+ MONITOR_OVLD_OS_FILE_WRITE,
+ MONITOR_OVLD_OS_FSYNC,
+ MONITOR_OS_PENDING_READS,
+ MONITOR_OS_PENDING_WRITES,
+ MONITOR_OVLD_OS_LOG_WRITTEN,
+ MONITOR_OVLD_OS_LOG_FSYNC,
+ MONITOR_OVLD_OS_LOG_PENDING_FSYNC,
+ MONITOR_OVLD_OS_LOG_PENDING_WRITES,
+
+ /* Transaction related counters */
+ MONITOR_MODULE_TRX,
+ MONITOR_TRX_RW_COMMIT,
+ MONITOR_TRX_RO_COMMIT,
+ MONITOR_TRX_NL_RO_COMMIT,
+ MONITOR_TRX_COMMIT_UNDO,
+ MONITOR_TRX_ROLLBACK,
+ MONITOR_TRX_ROLLBACK_SAVEPOINT,
+ MONITOR_TRX_ACTIVE,
+ MONITOR_RSEG_HISTORY_LEN,
+ MONITOR_NUM_UNDO_SLOT_USED,
+ MONITOR_NUM_UNDO_SLOT_CACHED,
+ MONITOR_RSEG_CUR_SIZE,
+
+ /* Purge related counters */
+ MONITOR_MODULE_PURGE,
+ MONITOR_N_DEL_ROW_PURGE,
+ MONITOR_N_UPD_EXIST_EXTERN,
+ MONITOR_PURGE_INVOKED,
+ MONITOR_PURGE_N_PAGE_HANDLED,
+ MONITOR_DML_PURGE_DELAY,
+ MONITOR_PURGE_STOP_COUNT,
+ MONITOR_PURGE_RESUME_COUNT,
+
+ /* Recovery related counters */
+ MONITOR_MODULE_RECOVERY,
+ MONITOR_NUM_CHECKPOINT,
+ MONITOR_OVLD_LSN_FLUSHDISK,
+ MONITOR_OVLD_LSN_CHECKPOINT,
+ MONITOR_OVLD_LSN_CURRENT,
+ MONITOR_LSN_CHECKPOINT_AGE,
+ MONITOR_OVLD_BUF_OLDEST_LSN,
+ MONITOR_OVLD_MAX_AGE_ASYNC,
+ MONITOR_PENDING_LOG_FLUSH,
+ MONITOR_PENDING_CHECKPOINT_WRITE,
+ MONITOR_LOG_IO,
+ MONITOR_OVLD_LOG_WAITS,
+ MONITOR_OVLD_LOG_WRITE_REQUEST,
+ MONITOR_OVLD_LOG_WRITES,
+ MONITOR_OVLD_LOG_PADDED,
+
+ /* Page Manager related counters */
+ MONITOR_MODULE_PAGE,
+ MONITOR_PAGE_COMPRESS,
+ MONITOR_PAGE_DECOMPRESS,
+ MONITOR_PAD_INCREMENTS,
+ MONITOR_PAD_DECREMENTS,
+ /* New monitor variables for page compression */
+ MONITOR_OVLD_PAGE_COMPRESS_SAVED,
+ MONITOR_OVLD_PAGES_PAGE_COMPRESSED,
+ MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP,
+ MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED,
+ MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR,
+
+ /* New monitor variables for page encryption */
+ MONITOR_OVLD_PAGES_ENCRYPTED,
+ MONITOR_OVLD_PAGES_DECRYPTED,
+
+ /* Index related counters */
+ MONITOR_MODULE_INDEX,
+ MONITOR_INDEX_SPLIT,
+ MONITOR_INDEX_MERGE_ATTEMPTS,
+ MONITOR_INDEX_MERGE_SUCCESSFUL,
+ MONITOR_INDEX_REORG_ATTEMPTS,
+ MONITOR_INDEX_REORG_SUCCESSFUL,
+ MONITOR_INDEX_DISCARD,
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* Adaptive Hash Index related counters */
+ MONITOR_MODULE_ADAPTIVE_HASH,
+ MONITOR_OVLD_ADAPTIVE_HASH_SEARCH,
+#endif /* BTR_CUR_HASH_ADAPT */
+ MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE,
+#ifdef BTR_CUR_HASH_ADAPT
+ MONITOR_ADAPTIVE_HASH_PAGE_ADDED,
+ MONITOR_ADAPTIVE_HASH_PAGE_REMOVED,
+ MONITOR_ADAPTIVE_HASH_ROW_ADDED,
+ MONITOR_ADAPTIVE_HASH_ROW_REMOVED,
+ MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND,
+ MONITOR_ADAPTIVE_HASH_ROW_UPDATED,
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /* Tablespace related counters */
+ MONITOR_MODULE_FIL_SYSTEM,
+ MONITOR_OVLD_N_FILE_OPENED,
+
+ /* InnoDB Change Buffer related counters */
+ MONITOR_MODULE_IBUF_SYSTEM,
+ MONITOR_OVLD_IBUF_MERGE_INSERT,
+ MONITOR_OVLD_IBUF_MERGE_DELETE,
+ MONITOR_OVLD_IBUF_MERGE_PURGE,
+ MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT,
+ MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE,
+ MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE,
+ MONITOR_OVLD_IBUF_MERGES,
+ MONITOR_OVLD_IBUF_SIZE,
+
+ /* Counters for server operations */
+ MONITOR_MODULE_SERVER,
+ MONITOR_MASTER_THREAD_SLEEP,
+ MONITOR_OVLD_SERVER_ACTIVITY,
+ MONITOR_MASTER_ACTIVE_LOOPS,
+ MONITOR_MASTER_IDLE_LOOPS,
+ MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND,
+ MONITOR_SRV_LOG_FLUSH_MICROSECOND,
+ MONITOR_SRV_DICT_LRU_MICROSECOND,
+ MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE,
+ MONITOR_SRV_DICT_LRU_EVICT_COUNT_IDLE,
+ MONITOR_OVLD_SRV_DBLWR_WRITES,
+ MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN,
+ MONITOR_OVLD_SRV_PAGE_SIZE,
+ MONITOR_OVLD_RWLOCK_S_SPIN_WAITS,
+ MONITOR_OVLD_RWLOCK_X_SPIN_WAITS,
+ MONITOR_OVLD_RWLOCK_SX_SPIN_WAITS,
+ MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS,
+ MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS,
+ MONITOR_OVLD_RWLOCK_SX_SPIN_ROUNDS,
+ MONITOR_OVLD_RWLOCK_S_OS_WAITS,
+ MONITOR_OVLD_RWLOCK_X_OS_WAITS,
+ MONITOR_OVLD_RWLOCK_SX_OS_WAITS,
+
+ /* Data DML related counters */
+ MONITOR_MODULE_DML_STATS,
+ MONITOR_OLVD_ROW_READ,
+ MONITOR_OLVD_ROW_INSERTED,
+ MONITOR_OLVD_ROW_DELETED,
+ MONITOR_OLVD_ROW_UPDTATED,
+ MONITOR_OLVD_SYSTEM_ROW_READ,
+ MONITOR_OLVD_SYSTEM_ROW_INSERTED,
+ MONITOR_OLVD_SYSTEM_ROW_DELETED,
+ MONITOR_OLVD_SYSTEM_ROW_UPDATED,
+
+ /* Data DDL related counters */
+ MONITOR_MODULE_DDL_STATS,
+ MONITOR_BACKGROUND_DROP_INDEX,
+ MONITOR_BACKGROUND_DROP_TABLE,
+ MONITOR_ONLINE_CREATE_INDEX,
+ MONITOR_PENDING_ALTER_TABLE,
+ MONITOR_ALTER_TABLE_SORT_FILES,
+ MONITOR_ALTER_TABLE_LOG_FILES,
+
+ MONITOR_MODULE_ICP,
+ MONITOR_ICP_ATTEMPTS,
+ MONITOR_ICP_NO_MATCH,
+ MONITOR_ICP_OUT_OF_RANGE,
+ MONITOR_ICP_MATCH,
+
+ /* Mutex/RW-Lock related counters */
+ MONITOR_MODULE_LATCHES,
+ MONITOR_LATCHES,
+
+ /* This is used only for control system to turn
+ on/off and reset all monitor counters */
+ MONITOR_ALL_COUNTER,
+
+ /* This must be the last member */
+ NUM_MONITOR
+};
+
+/** This informs the monitor control system to turn
+on/off and reset monitor counters through a wildcard match */
+#define MONITOR_WILDCARD_MATCH (NUM_MONITOR + 1)
+
+/** Cannot find monitor counter with a specified name */
+#define MONITOR_NO_MATCH (NUM_MONITOR + 2)
+
+/** struct monitor_info_t describes the basic/static information
+about each monitor counter. */
+struct monitor_info_t {
+ const char* monitor_name; /*!< Monitor name */
+ const char* monitor_module; /*!< Sub Module the monitor
+ belongs to */
+ const char* monitor_desc; /*!< Brief desc of monitor counter */
+ monitor_type_t monitor_type; /*!< Type of Monitor Info */
+	monitor_id_t	monitor_related_id;/*!< Monitor ID of a counter that
+					is related to this monitor. This is
+					set when the monitor belongs to
+					a "monitor set" */
+ monitor_id_t monitor_id; /*!< Monitor ID as defined in enum
+ monitor_id_t */
+};
+
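+/* Illustrative sketch, not part of the original file: the shape of one
+entry in the innodb_counter_info array in srv/srv0mon.cc, following the
+field order above. The literal strings are an example, not a quotation
+of the actual array.
+
+	{"metadata_table_handles_opened", "metadata",
+	 "Number of table handles opened",
+	 MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TABLE_OPEN},
+*/
+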
+/** The following are the "set_option" values allowed for the
+srv_mon_set_module_control() and srv_mon_process_existing_counter()
+functions, to turn on/off/reset the monitor counters. */
+enum mon_option_t {
+ MONITOR_TURN_ON = 1, /*!< Turn on the counter */
+ MONITOR_TURN_OFF, /*!< Turn off the counter */
+ MONITOR_RESET_VALUE, /*!< Reset current values */
+ MONITOR_RESET_ALL_VALUE, /*!< Reset all values */
+ MONITOR_GET_VALUE /*!< Option for
+ srv_mon_process_existing_counter()
+ function */
+};
+
+/** Number of bits in a ulint datatype */
+#define NUM_BITS_ULINT (sizeof(ulint) * CHAR_BIT)
+
+/** This "monitor_set_tbl" is a bitmap records whether a particular monitor
+counter has been turned on or off */
+extern Atomic_relaxed<ulint>
+ monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) / NUM_BITS_ULINT];
+
+/** Macros to turn on/off the control bit in monitor_set_tbl for a monitor
+counter option. */
+#define MONITOR_ON(monitor) \
+ (monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT].fetch_or( \
+ (ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))))
+
+#define MONITOR_OFF(monitor) \
+ (monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT].fetch_and( \
+ ~(ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))))
+
+/** Check whether the requested monitor is turned on/off */
+#define MONITOR_IS_ON(monitor) \
+ (monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT] & \
+ (ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT)))
+
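+/* Worked example, not part of the original file: with a 64-bit ulint,
+NUM_BITS_ULINT == 64, so monitor id 70 maps to monitor_set_tbl[1],
+bit 6 (70 = 1*64 + 6). MONITOR_ON() sets that bit atomically,
+MONITOR_OFF() clears it, and MONITOR_IS_ON() tests it. */
+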
+/** The actual monitor counter array that records each monitor counter
+value */
+extern monitor_value_t innodb_counter_value[NUM_MONITOR];
+
+/** The following are macro defines for basic monitor counter
+manipulation. Please note we do not provide any synchronization for
+these monitor operations, for performance reasons. Most counters can
+be placed under existing mutex protection in the respective code
+module. */
+
+/** Macros to access various fields of a monitor counter */
+#define MONITOR_FIELD(monitor, field) \
+ (innodb_counter_value[monitor].field)
+
+#define MONITOR_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_value)
+
+#define MONITOR_MAX_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_max_value)
+
+#define MONITOR_MIN_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_min_value)
+
+#define MONITOR_VALUE_RESET(monitor) \
+ MONITOR_FIELD(monitor, mon_value_reset)
+
+#define MONITOR_MAX_VALUE_START(monitor) \
+ MONITOR_FIELD(monitor, mon_max_value_start)
+
+#define MONITOR_MIN_VALUE_START(monitor) \
+ MONITOR_FIELD(monitor, mon_min_value_start)
+
+#define MONITOR_LAST_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_last_value)
+
+#define MONITOR_START_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_start_value)
+
+#define MONITOR_VALUE_SINCE_START(monitor) \
+ (MONITOR_VALUE(monitor) + MONITOR_VALUE_RESET(monitor))
+
+#define MONITOR_STATUS(monitor) \
+ MONITOR_FIELD(monitor, mon_status)
+
+#define MONITOR_SET_START(monitor) \
+ do { \
+ MONITOR_STATUS(monitor) = MONITOR_STARTED; \
+ MONITOR_FIELD((monitor), mon_start_time) = time(NULL); \
+ } while (0)
+
+#define MONITOR_SET_OFF(monitor) \
+ do { \
+ MONITOR_STATUS(monitor) = MONITOR_STOPPED; \
+ MONITOR_FIELD((monitor), mon_stop_time) = time(NULL); \
+ } while (0)
+
+#define MONITOR_INIT_ZERO_VALUE 0
+
+/** Max and min values are initialized when we first turn on the monitor
+counter, and set the MONITOR_STATUS. */
+#define MONITOR_MAX_MIN_NOT_INIT(monitor) \
+ (MONITOR_STATUS(monitor) == MONITOR_INIT_ZERO_VALUE \
+ && MONITOR_MIN_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE \
+ && MONITOR_MAX_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE)
+
+#define MONITOR_INIT(monitor) \
+ if (MONITOR_MAX_MIN_NOT_INIT(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MIN_RESERVED; \
+ MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED; \
+ MONITOR_MAX_VALUE(monitor) = MAX_RESERVED; \
+ MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED; \
+ }
+
+/** Macros to increment/decrement the counters. The normal
+monitor counter operation expects appropriate synchronization
+already exists. No additional mutex is necessary when operating
+on the counters */
+#define MONITOR_INC(monitor) \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor)++; \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor monitor to be incremented by 1
+@param enabled whether the monitor is enabled */
+#define MONITOR_ATOMIC_INC_LOW(monitor, enabled) \
+ if (enabled) { \
+ ib_uint64_t value; \
+ value = my_atomic_add64_explicit( \
+ (int64*) &MONITOR_VALUE(monitor), 1, \
+ MY_MEMORY_ORDER_RELAXED) + 1; \
+ /* Note: This is not 100% accurate because of the \
+ inherent race, we ignore it due to performance. */ \
+ if (value > (ib_uint64_t) MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = value; \
+ } \
+ }
+
+/** Atomically decrement a monitor counter.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor monitor to be decremented by 1
+@param enabled whether the monitor is enabled */
+#define MONITOR_ATOMIC_DEC_LOW(monitor, enabled) \
+ if (enabled) { \
+ ib_uint64_t value; \
+ value = my_atomic_add64_explicit( \
+ (int64*) &MONITOR_VALUE(monitor), -1, \
+ MY_MEMORY_ORDER_RELAXED) - 1; \
+ /* Note: This is not 100% accurate because of the \
+ inherent race, we ignore it due to performance. */ \
+ if (value < (ib_uint64_t) MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = value; \
+ } \
+ }
+
+/** Atomically increment a monitor counter if it is enabled.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor monitor to be incremented by 1 */
+#define MONITOR_ATOMIC_INC(monitor) \
+ MONITOR_ATOMIC_INC_LOW(monitor, MONITOR_IS_ON(monitor))
+/** Atomically decrement a monitor counter if it is enabled.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor monitor to be decremented by 1 */
+#define MONITOR_ATOMIC_DEC(monitor) \
+ MONITOR_ATOMIC_DEC_LOW(monitor, MONITOR_IS_ON(monitor))
+
+#define MONITOR_DEC(monitor) \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor)--; \
+ if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
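+/* Illustrative call-site sketch, not part of the original file: counters
+are typically bumped where an existing mutex already serializes access;
+the surrounding locking below is assumed, only the macro is real.
+
+	mutex_enter(&dict_sys.mutex);
+	// ... open a table handle ...
+	MONITOR_INC(MONITOR_TABLE_OPEN);
+	mutex_exit(&dict_sys.mutex);
+
+Without such external serialization, use MONITOR_ATOMIC_INC() instead.
+*/
+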
+#ifdef HAVE_MEM_CHECK
+# define MONITOR_CHECK_DEFINED(value) do { \
+ mon_type_t m __attribute__((unused))= value; \
+ MEM_CHECK_DEFINED(&m, sizeof m); \
+} while (0)
+#else /* HAVE_MEM_CHECK */
+# define MONITOR_CHECK_DEFINED(value) (void) 0
+#endif /* HAVE_MEM_CHECK */
+
+#define MONITOR_INC_VALUE(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor) += (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+#define MONITOR_DEC_VALUE(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+		ut_ad(MONITOR_VALUE(monitor) >= (mon_type_t) (value)); \
+ MONITOR_VALUE(monitor) -= (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+/* Increment/decrement a counter without checking the monitor on/off bit,
+which may already have been checked at the module-group level */
+#define MONITOR_INC_NOCHECK(monitor) \
+ do { \
+ MONITOR_VALUE(monitor)++; \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+	} while (0)
+
+#define MONITOR_DEC_NOCHECK(monitor) \
+ do { \
+ MONITOR_VALUE(monitor)--; \
+ if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ } while (0)
+
+/** Directly set a monitor counter's value */
+#define MONITOR_SET(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor) = (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+/** Add the time difference between now and the input "value"
+(in microseconds) to the monitor counter
+@param monitor monitor to update for the time difference
+@param value the start time value */
+#define MONITOR_INC_TIME_IN_MICRO_SECS(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ uintmax_t old_time = value; \
+ value = microsecond_interval_timer(); \
+ MONITOR_VALUE(monitor) += (mon_type_t) (value - old_time);\
+ }
+
+/** This macro updates 3 counters in one call. However, it only checks the
+main/first monitor counter 'monitor' to see whether it is on or off, in
+order to decide whether to do the update.
+@param monitor the main monitor counter to update. It accounts for
+ the accumulative value for the counter.
+@param monitor_n_calls counter that counts number of times this macro is
+ called
+@param monitor_per_call counter that records the current and max value of
+ each incremental value
+@param value incremental value to record this time */
+#define MONITOR_INC_VALUE_CUMULATIVE( \
+ monitor, monitor_n_calls, monitor_per_call, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor_n_calls)++; \
+ MONITOR_VALUE(monitor_per_call) = (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor_per_call) \
+ > MONITOR_MAX_VALUE(monitor_per_call)) { \
+ MONITOR_MAX_VALUE(monitor_per_call) = \
+ (mon_type_t) (value); \
+ } \
+ MONITOR_VALUE(monitor) += (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+/** Directly set a monitor counter's value, and if the value
+is monotonically increasing, only max value needs to be updated */
+#define MONITOR_SET_UPD_MAX_ONLY(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor) = (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+/** Some values, such as the log sequence number, are monotonically
+increasing; there is no need to record max/min values */
+#define MONITOR_SET_SIMPLE(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor) = (mon_type_t) (value); \
+ }
+
+/** Reset the monitor value and max/min value to zero. The reset
+operation would only be conducted when the counter is turned off */
+#define MONITOR_RESET_ALL(monitor) \
+ do { \
+ MONITOR_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_MAX_VALUE(monitor) = MAX_RESERVED; \
+ MONITOR_MIN_VALUE(monitor) = MIN_RESERVED; \
+ MONITOR_VALUE_RESET(monitor) = MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED; \
+ MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED; \
+ MONITOR_LAST_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_FIELD(monitor, mon_start_time) = \
+ MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_FIELD(monitor, mon_stop_time) = \
+ MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_FIELD(monitor, mon_reset_time) = \
+ MONITOR_INIT_ZERO_VALUE; \
+ } while (0)
+
+/** The following macros define the operations needed to fetch and
+consolidate information from existing system status variables. */
+
+/** Save the passed-in value to mon_start_value field of monitor
+counters */
+#define MONITOR_SAVE_START(monitor, value) do { \
+ MONITOR_CHECK_DEFINED(value); \
+ (MONITOR_START_VALUE(monitor) = \
+ (mon_type_t) (value) - MONITOR_VALUE_RESET(monitor)); \
+ } while (0)
+
+/** Save the current monitor value to the mon_last_value field, and fold
+it into mon_start_value */
+#define MONITOR_SAVE_LAST(monitor) \
+ do { \
+ MONITOR_LAST_VALUE(monitor) = MONITOR_VALUE(monitor); \
+ MONITOR_START_VALUE(monitor) += MONITOR_VALUE(monitor); \
+ } while (0)
+
+/** Set the monitor value to the difference between value and
+mon_start_value, compensated by mon_last_value when an accumulated
+value is required. */
+#define MONITOR_SET_DIFF(monitor, value) \
+ MONITOR_SET_UPD_MAX_ONLY(monitor, ((value) \
+ - MONITOR_VALUE_RESET(monitor) \
+ - MONITOR_FIELD(monitor, mon_start_value) \
+ + MONITOR_FIELD(monitor, mon_last_value)))
+
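+/* Worked example, not part of the original file: suppose an existing
+status variable reads 1000 when its monitor is turned on.
+MONITOR_SAVE_START() stores 1000 in mon_start_value. When the variable
+later reads 1600, MONITOR_SET_DIFF() reports 600. If the monitor is then
+turned off, MONITOR_SAVE_LAST() folds the 600 into mon_start_value and
+remembers it in mon_last_value, so a later restart continues the
+accumulated value rather than double-counting. */
+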
+/****************************************************************//**
+Get a monitor's monitor_info_t by its monitor id (index into the
+innodb_counter_info array).
+@return pointer to the corresponding monitor_info_t, or NULL if no such
+monitor */
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+ monitor_id_t monitor_id); /*!< id index into the
+ innodb_counter_info array */
+/****************************************************************//**
+Get a monitor's name by its monitor id (index into the
+innodb_counter_info array).
+@return corresponding monitor name, or NULL if no such
+monitor */
+const char*
+srv_mon_get_name(
+/*=============*/
+ monitor_id_t monitor_id); /*!< id index into the
+ innodb_counter_info array */
+
+/****************************************************************//**
+Turn on/off/reset monitor counters in a module. If module_id
+is NUM_MONITOR then turn on all monitor counters. */
+void
+srv_mon_set_module_control(
+/*=======================*/
+ monitor_id_t module_id, /*!< in: Module ID as in
+ monitor_counter_id. If it is
+ set to NUM_MONITOR, this means
+ we shall turn on all the counters */
+ mon_option_t set_option); /*!< in: Turn on/off reset the
+ counter */
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not have
+a mechanism to start/stop and reset the counters, so we simulate these
+controls by remembering the corresponding counter values when the
+corresponding monitors are turned on/off/reset, and doing the appropriate
+arithmetic to deduce the actual value. */
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+ monitor_id_t monitor_id, /*!< in: the monitor's ID as in
+ monitor_counter_id */
+ mon_option_t set_option); /*!< in: Turn on/off reset the
+ counter */
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of the monitor counter
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+ monitor_id_t monitor); /*!< in: monitor id */
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of the monitor counter
+@return min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+ monitor_id_t monitor); /*!< in: monitor id*/
+/*************************************************************//**
+Reset a monitor, creating a new baseline with the current monitor
+value. The baseline is recorded in MONITOR_VALUE_RESET(monitor) */
+void
+srv_mon_reset(
+/*==========*/
+ monitor_id_t monitor); /*!< in: monitor id*/
+/*************************************************************//**
+This function resets all values of a monitor counter */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+ monitor_id_t monitor); /*!< in: monitor id*/
+/*************************************************************//**
+Turn on monitor counters that are marked as default ON. */
+void
+srv_mon_default_on(void);
+/*====================*/
+
+#include "srv0mon.ic"
+
+#endif
diff --git a/storage/innobase/include/srv0mon.ic b/storage/innobase/include/srv0mon.ic
new file mode 100644
index 00000000..158345b2
--- /dev/null
+++ b/storage/innobase/include/srv0mon.ic
@@ -0,0 +1,113 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/srv0mon.ic
+Server monitoring system
+
+Created 1/20/2010 Jimmy Yang
+************************************************************************/
+
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of the monitor counter
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+ monitor_id_t monitor) /*!< in: monitor id */
+{
+ if (MONITOR_MAX_VALUE_START(monitor) == MAX_RESERVED) {
+
+ /* MONITOR_MAX_VALUE_START has not yet been
+ initialized, the max value since start is the
+ max count in MONITOR_MAX_VALUE */
+ MONITOR_MAX_VALUE_START(monitor) =
+ MONITOR_MAX_VALUE(monitor);
+
+ } else if (MONITOR_MAX_VALUE(monitor) != MAX_RESERVED
+ && (MONITOR_MAX_VALUE(monitor)
+ + MONITOR_VALUE_RESET(monitor)
+ > MONITOR_MAX_VALUE_START(monitor))) {
+
+ /* If the max value since reset (as specified
+ in MONITOR_MAX_VALUE) plus the reset value is
+ larger than MONITOR_MAX_VALUE_START, reset
+ MONITOR_MAX_VALUE_START to this new max value */
+ MONITOR_MAX_VALUE_START(monitor) =
+ MONITOR_MAX_VALUE(monitor)
+ + MONITOR_VALUE_RESET(monitor);
+ }
+
+ return(MONITOR_MAX_VALUE_START(monitor));
+}
+
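+/* Worked example, not part of the original file: if the counter was
+reset when its value was 500 (MONITOR_VALUE_RESET == 500) and the max
+seen since that reset is 300, then the max since start is 300 + 500 = 800,
+and MONITOR_MAX_VALUE_START is raised to 800 if it was smaller. */
+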
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of the monitor counter
+@return min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+ monitor_id_t monitor) /*!< in: monitor id */
+{
+ if (MONITOR_MIN_VALUE_START(monitor) == MIN_RESERVED) {
+
+ /* MONITOR_MIN_VALUE_START has not yet been
+ initialized, the min value since start is the
+ min count in MONITOR_MIN_VALUE */
+ MONITOR_MIN_VALUE_START(monitor) =
+ MONITOR_MIN_VALUE(monitor);
+
+ } else if (MONITOR_MIN_VALUE(monitor) != MIN_RESERVED
+ && (MONITOR_MIN_VALUE(monitor)
+ + MONITOR_VALUE_RESET(monitor)
+ < MONITOR_MIN_VALUE_START(monitor))) {
+
+ /* If the min value since reset (as specified
+ in MONITOR_MIN_VALUE) plus the reset value is
+ less than MONITOR_MIN_VALUE_START, reset
+ MONITOR_MIN_VALUE_START to this new min value */
+ MONITOR_MIN_VALUE_START(monitor) =
+ MONITOR_MIN_VALUE(monitor)
+ + MONITOR_VALUE_RESET(monitor);
+ }
+
+ return(MONITOR_MIN_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+This function resets all values of a monitor counter */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+ monitor_id_t monitor) /*!< in: monitor id */
+{
+ /* Do not reset all counter values if monitor is still on. */
+ if (MONITOR_IS_ON(monitor)) {
+ fprintf(stderr, "InnoDB: Cannot reset all values for"
+ " monitor counter %s while it is on. Please"
+ " turn it off and retry.\n",
+ srv_mon_get_name(monitor));
+ } else {
+ MONITOR_RESET_ALL(monitor);
+ }
+}
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
new file mode 100644
index 00000000..a5bebc34
--- /dev/null
+++ b/storage/innobase/include/srv0srv.h
@@ -0,0 +1,868 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2008, 2009, Google Inc.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0srv.h
+The server main program
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0log.h"
+#include "que0types.h"
+#include "trx0types.h"
+#include "fil0fil.h"
+
+#include "mysql/psi/mysql_stage.h"
+#include "mysql/psi/psi.h"
+#include <tpool.h>
+#include <memory>
+
+/** Global counters used inside InnoDB. */
+struct srv_stats_t
+{
+ typedef ib_counter_t<ulint, 64> ulint_ctr_64_t;
+ typedef simple_counter<lsn_t> lsn_ctr_1_t;
+ typedef simple_counter<ulint> ulint_ctr_1_t;
+ typedef simple_counter<int64_t> int64_ctr_1_t;
+
+ /** Count the amount of data written in total (in bytes) */
+ ulint_ctr_1_t data_written;
+
+	/** Number of log write requests made */
+ ulint_ctr_1_t log_write_requests;
+
+ /** Number of physical writes to the log performed */
+ ulint_ctr_1_t log_writes;
+
+ /** Amount of data padded for log write ahead */
+ ulint_ctr_1_t log_padded;
+
+ /** Amount of data written to the log files in bytes */
+ lsn_ctr_1_t os_log_written;
+
+ /** Number of writes being done to the log files */
+ ulint_ctr_1_t os_log_pending_writes;
+
+	/** We increase this counter when we don't have enough
+ space in the log buffer and have to flush it */
+ ulint_ctr_1_t log_waits;
+
+ /** Store the number of write requests issued */
+ ulint_ctr_1_t buf_pool_write_requests;
+
+ /** Number of buffer pool reads that led to the reading of
+ a disk page */
+ ulint_ctr_1_t buf_pool_reads;
+
+ /** Number of bytes saved by page compression */
+ ulint_ctr_64_t page_compression_saved;
+	/** Number of index pages written */
+	ulint_ctr_64_t		index_pages_written;
+	/** Number of non-index pages written */
+	ulint_ctr_64_t		non_index_pages_written;
+	/** Number of pages compressed with page compression */
+	ulint_ctr_64_t		pages_page_compressed;
+	/** Number of TRIM operations induced by page compression */
+	ulint_ctr_64_t		page_compressed_trim_op;
+	/** Number of pages decompressed with page compression */
+	ulint_ctr_64_t		pages_page_decompressed;
+	/** Number of page compression errors */
+	ulint_ctr_64_t		pages_page_compression_error;
+	/** Number of pages encrypted */
+	ulint_ctr_64_t		pages_encrypted;
+	/** Number of pages decrypted */
+	ulint_ctr_64_t		pages_decrypted;
+	/** Number of merge blocks encrypted */
+	ulint_ctr_64_t		n_merge_blocks_encrypted;
+	/** Number of merge blocks decrypted */
+	ulint_ctr_64_t		n_merge_blocks_decrypted;
+	/** Number of row log blocks encrypted */
+	ulint_ctr_64_t		n_rowlog_blocks_encrypted;
+	/** Number of row log blocks decrypted */
+	ulint_ctr_64_t		n_rowlog_blocks_decrypted;
+
+	/** Amount of data read in total (in bytes) */
+ ulint_ctr_1_t data_read;
+
+ /** Wait time of database locks */
+ int64_ctr_1_t n_lock_wait_time;
+
+ /** Number of database lock waits */
+ ulint_ctr_1_t n_lock_wait_count;
+
+ /** Number of threads currently waiting on database locks */
+ MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<ulint>
+ n_lock_wait_current_count;
+
+ /** Number of rows read. */
+ ulint_ctr_64_t n_rows_read;
+
+ /** Number of rows updated */
+ ulint_ctr_64_t n_rows_updated;
+
+ /** Number of rows deleted */
+ ulint_ctr_64_t n_rows_deleted;
+
+ /** Number of rows inserted */
+ ulint_ctr_64_t n_rows_inserted;
+
+ /** Number of system rows read. */
+ ulint_ctr_64_t n_system_rows_read;
+
+ /** Number of system rows updated */
+ ulint_ctr_64_t n_system_rows_updated;
+
+ /** Number of system rows deleted */
+ ulint_ctr_64_t n_system_rows_deleted;
+
+ /** Number of system rows inserted */
+ ulint_ctr_64_t n_system_rows_inserted;
+
+ /** Number of times secondary index lookup triggered cluster lookup */
+ ulint_ctr_64_t n_sec_rec_cluster_reads;
+
+ /** Number of times prefix optimization avoided triggering cluster lookup */
+ ulint_ctr_64_t n_sec_rec_cluster_reads_avoided;
+
+ /** Number of encryption_get_latest_key_version calls */
+ ulint_ctr_64_t n_key_requests;
+
+ /** Number of spaces in keyrotation list */
+ ulint_ctr_64_t key_rotation_list_length;
+
+ /** Number of temporary tablespace blocks encrypted */
+ ulint_ctr_64_t n_temp_blocks_encrypted;
+
+ /** Number of temporary tablespace blocks decrypted */
+ ulint_ctr_64_t n_temp_blocks_decrypted;
+
+ /** Number of lock deadlocks */
+ ulint_ctr_1_t lock_deadlock_count;
+};
+
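+/* Illustrative sketch, not part of the original file: these counters are
+updated without any global mutex. The global instance name "srv_stats"
+and the inc()/add() members are assumptions based on the counter types
+used above (simple_counter / ib_counter_t).
+
+	srv_stats.n_rows_inserted.inc();	// one more row inserted
+	srv_stats.data_written.add(len);	// len bytes written
+*/
+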
+/** We are prepared for a situation in which at most this many threads
+wait for a semaphore inside InnoDB. srv_start() sets the value. */
+extern ulint srv_max_n_threads;
+
+extern const char* srv_main_thread_op_info;
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+extern const char srv_mysql50_table_name_prefix[10];
+
+/** The buffer pool dump/load file name */
+#define SRV_BUF_DUMP_FILENAME_DEFAULT "ib_buffer_pool"
+extern char* srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+extern char srv_buffer_pool_dump_at_shutdown;
+extern char srv_buffer_pool_load_at_startup;
+
+/* Whether to disable the file system cache for sort files, if supported */
+extern char srv_disable_sort_file_cache;
+
+/* If the last data file is auto-extended, we add this many pages to it
+at a time */
+#define SRV_AUTO_EXTEND_INCREMENT (srv_sys_space.get_autoextend_increment())
+
+/** Mutex protecting page_zip_stat_per_index */
+extern ib_mutex_t page_zip_stat_per_index_mutex;
+/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */
+extern ib_mutex_t srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+extern FILE* srv_monitor_file;
+/* Mutex for locking srv_misc_tmpfile. Only created if !srv_read_only_mode.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+extern ib_mutex_t srv_misc_tmpfile_mutex;
+/* Temporary file for miscellaneous diagnostic output */
+extern FILE* srv_misc_tmpfile;
+
+/* Server parameters which are read from the initfile */
+
+extern char* srv_data_home;
+
+/** Set if InnoDB must operate in read-only mode. We don't do any
+recovery and open all tables in RO mode instead of RW mode. We don't
+sync the max trx id to disk either. */
+extern my_bool srv_read_only_mode;
+/** Set if InnoDB operates in read-only mode or innodb-force-recovery
+is greater than SRV_FORCE_NO_IBUF_MERGE. */
+extern my_bool high_level_read_only;
+/** Store each table created by a user in its own file; data
+dictionary tables are in the system tablespace 0 */
+extern my_bool srv_file_per_table;
+
+/** Sort buffer size in index creation */
+extern ulong srv_sort_buf_size;
+/** Maximum modification log file size for online index creation */
+extern unsigned long long srv_online_max_size;
+
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it), otherwise we will
+use simulated aio.
+Currently we support native aio on Windows and Linux */
+extern my_bool srv_use_native_aio;
+extern my_bool srv_numa_interleave;
+
+/* Use atomic writes, i.e., disable the doublewrite buffer */
+extern my_bool srv_use_atomic_writes;
+
+/* Compression algorithm */
+extern ulong innodb_compression_algorithm;
+
+/** TRUE if the server was successfully started */
+extern bool srv_was_started;
+
+/** Server undo tablespaces directory, can be absolute path. */
+extern char* srv_undo_dir;
+
+/** Number of undo tablespaces to use. */
+extern ulong srv_undo_tablespaces;
+
+/** The number of UNDO tablespaces that are active (hosting some rollback
+segment). It is quite possible that some of the tablespaces do not host
+any rollback segments, depending on the configuration used. */
+extern ulint srv_undo_tablespaces_active;
+
+/** Maximum size of undo tablespace. */
+extern unsigned long long srv_max_undo_log_size;
+
+extern uint srv_n_fil_crypt_threads;
+extern uint srv_n_fil_crypt_threads_started;
+
+/** Rate at which UNDO records should be purged. */
+extern ulong srv_purge_rseg_truncate_frequency;
+
+/** Enable or Disable Truncate of UNDO tablespace. */
+extern my_bool srv_undo_log_truncate;
+
+/* Enables or disables the optimization of prefix index queries to skip
+the clustered index lookup when possible. Disabled by default. */
+extern my_bool srv_prefix_index_cluster_optimization;
+
+/** Default size of UNDO tablespace (10MiB for innodb_page_size=16k) */
+constexpr ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) /
+ UNIV_PAGE_SIZE_DEF;
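
Worked out: 10MiB is 10 << 20 = 10485760 bytes, so with the default UNIV_PAGE_SIZE_DEF of 16384 bytes (16KiB, per the comment above) this yields 640 pages. A compile-time sanity check under that assumption:

    // Assumes UNIV_PAGE_SIZE_DEF == 16384, as the comment above
    // suggests for innodb_page_size=16k.
    static_assert((10U << 20) / 16384 == 640,
                  "10 MiB of 16 KiB pages is 640 pages");
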
+
+extern char* srv_log_group_home_dir;
+
+/** The InnoDB redo log file size, or 0 when changing the redo log format
+at startup (while disallowing writes to the redo log). */
+extern ulonglong srv_log_file_size;
+extern ulong srv_log_buffer_size;
+extern ulong srv_flush_log_at_trx_commit;
+extern uint srv_flush_log_at_timeout;
+extern ulong srv_log_write_ahead_size;
+extern my_bool srv_adaptive_flushing;
+extern my_bool srv_flush_sync;
+
+#ifdef WITH_INNODB_DISALLOW_WRITES
+/* When this event is reset we do not allow any file writes to take place. */
+extern os_event_t srv_allow_writes_event;
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+
+/* If this flag is TRUE, then we will load the indexes' (and tables') metadata
+even if they are marked as "corrupted". Mostly it is for DBAs to process
+corrupted indexes and tables */
+extern my_bool srv_load_corrupted;
+
+/** Requested size in bytes */
+extern ulint srv_buf_pool_size;
+/** Minimum pool size in bytes */
+extern const ulint srv_buf_pool_min_size;
+/** Default pool size in bytes */
+extern const ulint srv_buf_pool_def_size;
+/** Requested buffer pool chunk size */
+extern ulong srv_buf_pool_chunk_unit;
+/** Scan depth for LRU flush batch, i.e., number of blocks scanned */
+extern ulong srv_LRU_scan_depth;
+/** Whether or not to flush neighbors of a block */
+extern ulong srv_flush_neighbors;
+/** Previously requested size */
+extern ulint srv_buf_pool_old_size;
+/** Current size as scaling factor for the other components */
+extern ulint srv_buf_pool_base_size;
+/** Current size in bytes */
+extern ulint srv_buf_pool_curr_size;
+/** Dump this % of each buffer pool during BP dump */
+extern ulong srv_buf_pool_dump_pct;
+#ifdef UNIV_DEBUG
+/** Abort load after this amount of pages */
+extern ulong srv_buf_pool_load_pages_abort;
+#endif
+/** Lock table size in bytes */
+extern ulint srv_lock_table_size;
+
+extern uint srv_n_file_io_threads;
+extern my_bool srv_random_read_ahead;
+extern ulong srv_read_ahead_threshold;
+extern uint srv_n_read_io_threads;
+extern uint srv_n_write_io_threads;
+
+/* Defragmentation. Originally, the Facebook default value was 100, but it was too high. */
+#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40
+extern my_bool srv_defragment;
+extern uint srv_defragment_n_pages;
+extern uint srv_defragment_stats_accuracy;
+extern uint srv_defragment_fill_factor_n_recs;
+extern double srv_defragment_fill_factor;
+extern uint srv_defragment_frequency;
+extern ulonglong srv_defragment_interval;
+
+extern uint srv_change_buffer_max_size;
+
+/* Number of IO operations per second the server can do */
+extern ulong srv_io_capacity;
+
+/* We use this dummy default value at startup for max_io_capacity.
+The real value is set based on the value of io_capacity. */
+#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT (~0UL)
+#define SRV_MAX_IO_CAPACITY_LIMIT (~0UL)
+extern ulong srv_max_io_capacity;
+
+/* The "innodb_stats_method" setting, decides how InnoDB is going
+to treat NULL value when collecting statistics. It is not defined
+as enum type because the configure option takes unsigned integer type. */
+extern ulong srv_innodb_stats_method;
+
+extern ulint srv_max_n_open_files;
+
+extern double srv_max_buf_pool_modified_pct;
+extern double srv_max_dirty_pages_pct_lwm;
+
+extern double srv_adaptive_flushing_lwm;
+extern ulong srv_flushing_avg_loops;
+
+extern ulong srv_force_recovery;
+
+/** innodb_fast_shutdown=1 skips purge and change buffer merge.
+innodb_fast_shutdown=2 effectively crashes the server (no log checkpoint).
+innodb_fast_shutdown=3 is a clean shutdown that skips the rollback
+of active transactions (to be done on restart). */
+extern uint srv_fast_shutdown;
+
+extern ibool srv_innodb_status;
+
+extern unsigned long long srv_stats_transient_sample_pages;
+extern my_bool srv_stats_persistent;
+extern unsigned long long srv_stats_persistent_sample_pages;
+extern my_bool srv_stats_auto_recalc;
+extern my_bool srv_stats_include_delete_marked;
+extern unsigned long long srv_stats_modified_counter;
+extern my_bool srv_stats_sample_traditional;
+
+extern my_bool srv_use_doublewrite_buf;
+extern ulong srv_checksum_algorithm;
+
+extern my_bool srv_force_primary_key;
+
+extern ulong srv_max_purge_lag;
+extern ulong srv_max_purge_lag_delay;
+
+extern my_bool innodb_encrypt_temporary_tables;
+
+extern my_bool srv_immediate_scrub_data_uncompressed;
+/*-------------------------------------------*/
+
+/** Modes of operation */
+enum srv_operation_mode {
+ /** Normal mode (MariaDB Server) */
+ SRV_OPERATION_NORMAL,
+ /** Mariabackup taking a backup */
+ SRV_OPERATION_BACKUP,
+ /** Mariabackup restoring a backup for subsequent --copy-back */
+ SRV_OPERATION_RESTORE,
+ /** Mariabackup restoring the incremental part of a backup */
+ SRV_OPERATION_RESTORE_DELTA,
+ /** Mariabackup restoring a backup for subsequent --export */
+ SRV_OPERATION_RESTORE_EXPORT
+};
+
+/** Current mode of operation */
+extern enum srv_operation_mode srv_operation;
+
+extern my_bool srv_print_innodb_monitor;
+extern my_bool srv_print_innodb_lock_monitor;
+extern ibool srv_print_verbose_log;
+
+extern bool srv_monitor_active;
+
+
+extern ulong srv_n_spin_wait_rounds;
+extern uint srv_spin_wait_delay;
+
+extern ulint srv_truncated_status_writes;
+/** Number of initialized rollback segments for persistent undo log */
+extern ulong srv_available_undo_logs;
+/** Iterations of the loop bounded by 'srv_active' label. */
+extern ulint srv_main_active_loops;
+/** Iterations of the loop bounded by the 'srv_idle' label. */
+extern ulint srv_main_idle_loops;
+/** Log writes involving flush. */
+extern ulint srv_log_writes_and_flush;
+
+#ifdef UNIV_DEBUG
+extern my_bool innodb_evict_tables_on_commit_debug;
+extern my_bool srv_sync_debug;
+extern my_bool srv_purge_view_update_only_debug;
+
+/** Value of MySQL global used to disable master thread. */
+extern my_bool srv_master_thread_disabled_debug;
+/** InnoDB system tablespace size to set during recovery */
+extern uint srv_sys_space_size_debug;
+/** whether the redo log file has been created at startup */
+extern bool srv_log_file_created;
+#endif /* UNIV_DEBUG */
+
+extern ulint srv_dml_needed_delay;
+
+#define SRV_MAX_N_IO_THREADS 130
+
+/** innodb_purge_threads; the number of purge tasks to use */
+extern uint srv_n_purge_threads;
+
+/* the number of pages to purge in one batch */
+extern ulong srv_purge_batch_size;
+
+/* the number of sync wait arrays */
+extern ulong srv_sync_array_size;
+
+/* print all user-level transactions deadlocks to mysqld stderr */
+extern my_bool srv_print_all_deadlocks;
+
+extern my_bool srv_cmp_per_index_enabled;
+
+/* is encryption enabled */
+extern ulong srv_encrypt_tables;
+
+/** Status variables to be passed to MySQL */
+extern struct export_var_t export_vars;
+
+/** Global counters */
+extern srv_stats_t srv_stats;
+
+/** Fatal semaphore wait threshold: the maximum number of seconds
+a semaphore wait may last in InnoDB before it is considered fatal */
+#define DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT 600
+extern ulong srv_fatal_semaphore_wait_threshold;
+
+/** Buffer pool dump status frequency in percentages */
+extern ulong srv_buf_dump_status_frequency;
+
+# ifdef UNIV_PFS_THREAD
+extern mysql_pfs_key_t page_cleaner_thread_key;
+extern mysql_pfs_key_t trx_rollback_clean_thread_key;
+extern mysql_pfs_key_t thread_pool_thread_key;
+
+/* This macro registers the current thread and its key with the performance
+schema */
+# define pfs_register_thread(key) \
+do { \
+ struct PSI_thread* psi __attribute__((unused)) \
+ = PSI_CALL_new_thread(key, NULL, 0); \
+ PSI_CALL_set_thread_os_id(psi); \
+ PSI_CALL_set_thread(psi); \
+} while (0)
+
+/* This macro delists the current thread from the performance schema */
+# define pfs_delete_thread() \
+do { \
+ PSI_CALL_delete_current_thread(); \
+} while (0)
+# else
+# define pfs_register_thread(key)
+# define pfs_delete_thread()
+# endif /* UNIV_PFS_THREAD */
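
A sketch of the intended use, with a hypothetical thread body; only the two macros and the thread key come from this header:

    // Hypothetical thread function; pfs_register_thread() and
    // pfs_delete_thread() are the macros defined above.
    static void page_cleaner_worker(void*)
    {
        pfs_register_thread(page_cleaner_thread_key); /* announce to PFS */
        /* ... perform the thread's actual work ... */
        pfs_delete_thread();                          /* deregister on exit */
    }
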
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+/** Performance schema stage event for monitoring ALTER TABLE progress:
+everything after log_make_checkpoint(). */
+extern PSI_stage_info srv_stage_alter_table_end;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress:
+row_merge_insert_index_tuples(). */
+extern PSI_stage_info srv_stage_alter_table_insert;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress:
+row_log_apply(). */
+extern PSI_stage_info srv_stage_alter_table_log_index;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress:
+row_log_table_apply(). */
+extern PSI_stage_info srv_stage_alter_table_log_table;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress:
+row_merge_sort(). */
+extern PSI_stage_info srv_stage_alter_table_merge_sort;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress:
+row_merge_read_clustered_index(). */
+extern PSI_stage_info srv_stage_alter_table_read_pk_internal_sort;
+
+/** Performance schema stage event for monitoring buffer pool load progress. */
+extern PSI_stage_info srv_stage_buffer_pool_load;
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+/** Alternatives for srv_force_recovery. Non-zero values are intended
+to help the user get a damaged database up so that they can dump intact
+tables and rows with SELECT INTO OUTFILE. The database must not otherwise
+be used with these options! A bigger number below means that all precautions
+of lower numbers are included. */
+enum {
+ SRV_FORCE_IGNORE_CORRUPT = 1, /*!< let the server run even if it
+ detects a corrupt page */
+ SRV_FORCE_NO_BACKGROUND = 2, /*!< prevent the main thread from
+ running: if a crash would occur
+ in purge, this prevents it */
+ SRV_FORCE_NO_TRX_UNDO = 3, /*!< do not run trx rollback after
+ recovery */
+ SRV_FORCE_NO_IBUF_MERGE = 4, /*!< prevent also ibuf operations:
+ if they would cause a crash, better
+ not do them */
+ SRV_FORCE_NO_UNDO_LOG_SCAN = 5, /*!< do not look at undo logs when
+ starting the database: InnoDB will
+ treat even incomplete transactions
+ as committed */
+ SRV_FORCE_NO_LOG_REDO = 6 /*!< do not do the log roll-forward
+ in connection with recovery */
+};
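
Since each level includes all precautions of the lower levels, callers can guard a whole class of actions with one ordered comparison instead of enumerating values. A minimal sketch:

    // Sketch: relies on the "bigger number includes all lower
    // precautions" rule stated above.
    bool background_tasks_allowed()
    {
        /* The main/background threads are prevented from running
        starting at SRV_FORCE_NO_BACKGROUND. */
        return srv_force_recovery < SRV_FORCE_NO_BACKGROUND;
    }
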
+
+/* Alternatives for srv_innodb_stats_method, which can be changed by
+setting innodb_stats_method */
+enum srv_stats_method_name_enum {
+ SRV_STATS_NULLS_EQUAL, /* All NULL values are treated as
+ equal. This is the default setting
+ for innodb_stats_method */
+ SRV_STATS_NULLS_UNEQUAL, /* All NULL values are treated as
+ NOT equal. */
+ SRV_STATS_NULLS_IGNORED /* NULL values are ignored */
+};
+
+typedef enum srv_stats_method_name_enum srv_stats_method_name_t;
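
To make the three policies concrete, this hypothetical helper shows how many distinct groups a run of NULL values would contribute to a statistics estimate under each setting (only the enum values come from this header):

    #include <cstddef>

    // Hypothetical illustration of the three NULL-handling policies.
    size_t null_groups(srv_stats_method_name_t method, size_t n_nulls)
    {
        switch (method) {
        case SRV_STATS_NULLS_EQUAL:   return n_nulls ? 1 : 0; /* all equal */
        case SRV_STATS_NULLS_UNEQUAL: return n_nulls;         /* all distinct */
        case SRV_STATS_NULLS_IGNORED: return 0;               /* not counted */
        }
        return 0;
    }
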
+
+/*********************************************************************//**
+Boots Innobase server. */
+void
+srv_boot(void);
+/*==========*/
+/*********************************************************************//**
+Frees the data structures created in srv_init(). */
+void
+srv_free(void);
+
+/** Wake up the purge if there is work to do. */
+void
+srv_wake_purge_thread_if_not_active();
+
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information was printed
+due to a failure to obtain the necessary mutex */
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+ FILE* file, /*!< in: output stream */
+	ibool	nowait,		/*!< in: whether to return without
+				waiting for lock_sys_t::mutex */
+ ulint* trx_start, /*!< out: file position of the start of
+ the list of active transactions */
+ ulint* trx_end); /*!< out: file position of the end of
+ the list of active transactions */
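
A hedged usage sketch, based only on the prototype above and the srv_monitor_file stream declared earlier in this header:

    // Sketch; error handling elided.
    void print_monitor()
    {
        ulint trx_start, trx_end;
        if (!srv_printf_innodb_monitor(srv_monitor_file, FALSE,
                                       &trx_start, &trx_end)) {
            /* Not all information was printed: the necessary mutex
            could not be obtained. */
        }
    }
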
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
+void
+srv_export_innodb_status(void);
+/*==========================*/
+/*******************************************************************//**
+Get current server activity count.
+@return activity count. */
+ulint
+srv_get_activity_count(void);
+/*========================*/
+
+/******************************************************************//**
+Increment the server activity counter. */
+void
+srv_inc_activity_count(void);
+/*=========================*/
+
+/**********************************************************************//**
+Enqueues a task to the server task queue and releases a worker thread, if there
+is a suspended one. */
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/**
+Flag which is set whenever innodb_purge_threads changes.
+It is read and reset in srv_do_purge().
+
+Thus it is Atomic_counter<int>, not bool, since unprotected
+reads are used. We just need an atomic with relaxed memory
+order, to please Thread Sanitizer.
+*/
+extern Atomic_counter<int> srv_purge_thread_count_changed;
+
+#ifdef UNIV_DEBUG
+/** @return whether purge or master task is active */
+bool srv_any_background_activity();
+#endif
+
+extern "C" {
+
+
+/** Periodic task which prints the info output by various InnoDB monitors.*/
+void srv_monitor_task(void*);
+
+
+/** The periodic master task controlling the server. */
+void srv_master_callback(void*);
+
+
+/**
+Complete the shutdown tasks such as background DROP TABLE,
+and optionally change buffer merge (on innodb_fast_shutdown=0). */
+void srv_shutdown(bool ibuf_merge);
+
+} /* extern "C" */
+
+#ifdef UNIV_DEBUG
+/** @return number of tasks in queue */
+ulint srv_get_task_queue_length();
+#endif
+
+/** Shut down the purge threads. */
+void srv_purge_shutdown();
+
+/** Init purge tasks */
+void srv_init_purge_tasks();
+
+#ifdef UNIV_DEBUG
+/** Enables or disables the master thread. It is used by:
+	SET GLOBAL innodb_master_thread_disabled_debug = 1 (or 0).
+@param[in] save immediate result from check function */
+void
+srv_master_thread_disabled_debug_update(THD*, st_mysql_sys_var*, void*,
+ const void* save);
+#endif /* UNIV_DEBUG */
+
+/** Status variables to be passed to MySQL */
+struct export_var_t{
+ char innodb_buffer_pool_dump_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool dump status */
+ char innodb_buffer_pool_load_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool load status */
+ char innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */
+ my_bool innodb_buffer_pool_load_incomplete;/*!< Buf pool load incomplete */
+ ulint innodb_buffer_pool_pages_total; /*!< Buffer pool size */
+ ulint innodb_buffer_pool_pages_data; /*!< Data pages */
+ ulint innodb_buffer_pool_bytes_data; /*!< File bytes used */
+ ulint innodb_buffer_pool_pages_dirty; /*!< Dirty data pages */
+ ulint innodb_buffer_pool_bytes_dirty; /*!< File bytes modified */
+	ulint innodb_buffer_pool_pages_misc;	/*!< Miscellaneous pages */
+ ulint innodb_buffer_pool_pages_free; /*!< Free pages */
+#ifdef UNIV_DEBUG
+ ulint innodb_buffer_pool_pages_latched; /*!< Latched pages */
+#endif /* UNIV_DEBUG */
+ ulint innodb_buffer_pool_pages_made_not_young;
+ ulint innodb_buffer_pool_pages_made_young;
+ ulint innodb_buffer_pool_pages_old;
+ ulint innodb_buffer_pool_read_requests; /*!< buf_pool.stat.n_page_gets */
+ ulint innodb_buffer_pool_reads; /*!< srv_buf_pool_reads */
+ ulint innodb_buffer_pool_write_requests;/*!< srv_stats.buf_pool_write_requests */
+ ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */
+ ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */
+ ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/
+ ulint innodb_checkpoint_age;
+ ulint innodb_checkpoint_max_age;
+ ulint innodb_data_pending_reads; /*!< Pending reads */
+ ulint innodb_data_pending_writes; /*!< Pending writes */
+ ulint innodb_data_pending_fsyncs; /*!< Pending fsyncs */
+ ulint innodb_data_fsyncs; /*!< Number of fsyncs so far */
+ ulint innodb_data_read; /*!< Data bytes read */
+ ulint innodb_data_writes; /*!< I/O write requests */
+ ulint innodb_data_written; /*!< Data bytes written */
+ ulint innodb_data_reads; /*!< I/O read requests */
+ ulint innodb_dblwr_pages_written; /*!< srv_dblwr_pages_written */
+ ulint innodb_dblwr_writes; /*!< srv_dblwr_writes */
+ ulint innodb_deadlocks;
+ ulint innodb_history_list_length;
+ ulint innodb_log_waits; /*!< srv_log_waits */
+ ulint innodb_log_write_requests; /*!< srv_log_write_requests */
+ ulint innodb_log_writes; /*!< srv_log_writes */
+ lsn_t innodb_lsn_current;
+ lsn_t innodb_lsn_flushed;
+ lsn_t innodb_lsn_last_checkpoint;
+ trx_id_t innodb_max_trx_id;
+#ifdef BTR_CUR_HASH_ADAPT
+ ulint innodb_mem_adaptive_hash;
+#endif
+ ulint innodb_mem_dictionary;
+ lsn_t innodb_os_log_written; /*!< srv_os_log_written */
+ ulint innodb_os_log_fsyncs; /*!< n_log_flushes */
+ ulint innodb_os_log_pending_writes; /*!< srv_os_log_pending_writes */
+ ulint innodb_os_log_pending_fsyncs; /*!< n_pending_log_flushes */
+ ulint innodb_row_lock_waits; /*!< srv_n_lock_wait_count */
+ ulint innodb_row_lock_current_waits; /*!< srv_n_lock_wait_current_count */
+ int64_t innodb_row_lock_time; /*!< srv_n_lock_wait_time
+ / 1000 */
+ ulint innodb_row_lock_time_avg; /*!< srv_n_lock_wait_time
+ / 1000
+ / srv_n_lock_wait_count */
+ ulint innodb_row_lock_time_max; /*!< srv_n_lock_max_wait_time
+ / 1000 */
+ ulint innodb_rows_read; /*!< srv_n_rows_read */
+ ulint innodb_rows_inserted; /*!< srv_n_rows_inserted */
+ ulint innodb_rows_updated; /*!< srv_n_rows_updated */
+ ulint innodb_rows_deleted; /*!< srv_n_rows_deleted */
+ ulint innodb_system_rows_read; /*!< srv_n_system_rows_read */
+ ulint innodb_system_rows_inserted; /*!< srv_n_system_rows_inserted */
+ ulint innodb_system_rows_updated; /*!< srv_n_system_rows_updated */
+ ulint innodb_system_rows_deleted; /*!< srv_n_system_rows_deleted*/
+ ulint innodb_truncated_status_writes; /*!< srv_truncated_status_writes */
+
+ /** Number of undo tablespace truncation operations */
+ ulong innodb_undo_truncations;
+ ulint innodb_defragment_compression_failures; /*!< Number of
+ defragment re-compression
+ failures */
+
+ ulint innodb_defragment_failures; /*!< Number of defragment
+ failures*/
+ ulint innodb_defragment_count; /*!< Number of defragment
+ operations*/
+
+ /** Number of instant ALTER TABLE operations that affect columns */
+ ulong innodb_instant_alter_column;
+
+ ulint innodb_onlineddl_rowlog_rows; /*!< Online alter rows */
+ ulint innodb_onlineddl_rowlog_pct_used; /*!< Online alter percentage
+ of used row log buffer */
+ ulint innodb_onlineddl_pct_progress; /*!< Online alter progress */
+
+ int64_t innodb_page_compression_saved;/*!< Number of bytes saved
+ by page compression */
+ int64_t innodb_index_pages_written; /*!< Number of index pages
+ written */
+ int64_t innodb_non_index_pages_written; /*!< Number of non index pages
+ written */
+ int64_t innodb_pages_page_compressed;/*!< Number of pages
+ compressed by page compression */
+ int64_t innodb_page_compressed_trim_op;/*!< Number of TRIM operations
+ induced by page compression */
+ int64_t innodb_pages_page_decompressed;/*!< Number of pages
+ decompressed by page
+ compression */
+ int64_t innodb_pages_page_compression_error;/*!< Number of page
+ compression errors */
+ int64_t innodb_pages_encrypted; /*!< Number of pages
+ encrypted */
+ int64_t innodb_pages_decrypted; /*!< Number of pages
+ decrypted */
+
+	/** Number of merge blocks encrypted */
+	ib_int64_t innodb_n_merge_blocks_encrypted;
+	/** Number of merge blocks decrypted */
+	ib_int64_t innodb_n_merge_blocks_decrypted;
+	/** Number of row log blocks encrypted */
+	ib_int64_t innodb_n_rowlog_blocks_encrypted;
+	/** Number of row log blocks decrypted */
+	ib_int64_t innodb_n_rowlog_blocks_decrypted;
+
+ /* Number of temporary tablespace pages encrypted */
+ ib_int64_t innodb_n_temp_blocks_encrypted;
+
+ /* Number of temporary tablespace pages decrypted */
+ ib_int64_t innodb_n_temp_blocks_decrypted;
+
+ ulint innodb_sec_rec_cluster_reads; /*!< srv_sec_rec_cluster_reads */
+ ulint innodb_sec_rec_cluster_reads_avoided;/*!< srv_sec_rec_cluster_reads_avoided */
+
+ ulint innodb_encryption_rotation_pages_read_from_cache;
+ ulint innodb_encryption_rotation_pages_read_from_disk;
+ ulint innodb_encryption_rotation_pages_modified;
+ ulint innodb_encryption_rotation_pages_flushed;
+ ulint innodb_encryption_rotation_estimated_iops;
+ int64_t innodb_encryption_key_requests;
+ int64_t innodb_key_rotation_list_length;
+};
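
Most of these fields are plain copies of the srv_stats counters named in the member comments; the row-lock time fields are derived from them (divided by 1000, with the average guarded against division by zero). A hedged sketch of that derivation, assuming the counters are convertible to integral values:

    // Sketch in the spirit of srv_export_innodb_status(); assumes the
    // ulint_ctr_*/int64_ctr_* counters convert to integral values.
    void export_row_lock_stats(export_var_t& v, const srv_stats_t& s)
    {
        const ulint   waits = s.n_lock_wait_count;
        const int64_t time  = s.n_lock_wait_time;

        v.innodb_row_lock_waits         = waits;
        v.innodb_row_lock_current_waits = s.n_lock_wait_current_count;
        v.innodb_row_lock_time          = time / 1000;
        v.innodb_row_lock_time_avg      =
            waits ? ulint(time / 1000 / int64_t(waits)) : 0;
    }
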
+
+/** Thread slot in the thread table. */
+struct srv_slot_t{
+ ibool in_use; /*!< TRUE if this slot
+ is in use */
+ /** time(NULL) when the thread was suspended.
+ FIXME: Use my_interval_timer() or similar, to avoid bogus
+ timeouts in lock_wait_check_and_cancel() or lock_wait_suspend_thread()
+ when the system time is adjusted to the past!
+
+ FIXME: This is duplicating trx_lock_t::wait_started,
+ which is being used for diagnostic purposes only. */
+ time_t suspend_time;
+ ulong wait_timeout; /*!< wait time that if exceeded
+ the thread will be timed out.
+ Initialized by
+ lock_wait_table_reserve_slot()
+ for lock wait */
+ os_event_t event; /*!< event used in suspending
+ the thread when it has nothing
+ to do */
+ que_thr_t* thr; /*!< suspended query thread
+ (only used for user threads) */
+};
+
+extern tpool::thread_pool *srv_thread_pool;
+extern std::unique_ptr<tpool::timer> srv_master_timer;
+extern std::unique_ptr<tpool::timer> srv_monitor_timer;
+
+/** The interval at which srv_monitor_task is invoked, in milliseconds */
+constexpr unsigned SRV_MONITOR_INTERVAL= 15000; /* 4 times per minute */
+
+static inline void srv_monitor_timer_schedule_now()
+{
+ srv_monitor_timer->set_time(0, SRV_MONITOR_INTERVAL);
+}
+static inline void srv_start_periodic_timer(std::unique_ptr<tpool::timer>& t,
+ void (*func)(void*), int period)
+{
+ t.reset(srv_thread_pool->create_timer(func));
+ t->set_time(0, period);
+}
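
For example, the monitor task itself can be wired up with these helpers, using only names declared in this header:

    // Schedule srv_monitor_task() every SRV_MONITOR_INTERVAL ms.
    void start_monitor_timer()
    {
        srv_start_periodic_timer(srv_monitor_timer, srv_monitor_task,
                                 SRV_MONITOR_INTERVAL);
        /* Force an immediate run when needed: */
        srv_monitor_timer_schedule_now();
    }
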
+
+void srv_thread_pool_init();
+void srv_thread_pool_end();
diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h
new file mode 100644
index 00000000..324e3f04
--- /dev/null
+++ b/storage/innobase/include/srv0start.h
@@ -0,0 +1,129 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0start.h
+Starts the Innobase database server
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0log.h"
+#include "ut0byte.h"
+
+// Forward declaration
+struct dict_table_t;
+
+/** Open the configured number of dedicated undo tablespaces.
+@param[in] create_new_db whether the database is being initialized
+@return DB_SUCCESS or error code */
+dberr_t
+srv_undo_tablespaces_init(bool create_new_db);
+
+/** Start InnoDB.
+@param[in] create_new_db whether to create a new database
+@return DB_SUCCESS or error code */
+dberr_t srv_start(bool create_new_db);
+
+/**
+ Shut down purge to make sure that there is no possibility that we call any
+ plugin code (e.g., audit) inside virtual column computation.
+*/
+void innodb_preshutdown();
+
+/** Shut down InnoDB. */
+void innodb_shutdown();
+
+/** Shut down background threads that can generate undo log. */
+void srv_shutdown_bg_undo_sources();
+
+/*************************************************************//**
+Copy the file path component of the physical file into the destination buffer. It will
+copy up to and including the terminating path separator.
+@return number of bytes copied or ULINT_UNDEFINED if destination buffer
+ is smaller than the path to be copied. */
+ulint
+srv_path_copy(
+/*==========*/
+ char* dest, /*!< out: destination buffer */
+ ulint dest_len, /*!< in: max bytes to copy */
+ const char* basedir, /*!< in: base directory */
+ const char* table_name) /*!< in: source table name */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Get the meta-data filename from the table name for a
+single-table tablespace.
+@param[in] table table object
+@param[out] filename filename
+@param[in] max_len filename max length */
+void
+srv_get_meta_data_filename(
+ dict_table_t* table,
+ char* filename,
+ ulint max_len);
+
+/** Get the encryption-data filename from the table name for a
+single-table tablespace.
+@param[in] table table object
+@param[out] filename filename
+@param[in] max_len filename max length */
+void
+srv_get_encryption_data_filename(
+ dict_table_t* table,
+ char* filename,
+ ulint max_len);
+
+/** Log sequence number at shutdown */
+extern lsn_t srv_shutdown_lsn;
+
+/** TRUE if the server is being started */
+extern bool srv_is_being_started;
+/** TRUE if SYS_TABLESPACES is available for lookups */
+extern bool srv_sys_tablespaces_open;
+/** TRUE if the server is being started, before rolling back any
+incomplete transactions */
+extern bool srv_startup_is_before_trx_rollback_phase;
+
+/** TRUE if a raw partition is in use */
+extern ibool srv_start_raw_disk_in_use;
+
+/** Shutdown state */
+enum srv_shutdown_t {
+ SRV_SHUTDOWN_NONE = 0, /*!< Database running normally */
+ /** Shutdown initiated in srv_shutdown_bg_undo_sources() */
+ SRV_SHUTDOWN_INITIATED,
+ SRV_SHUTDOWN_CLEANUP, /*!< Cleaning up in
+ logs_empty_and_mark_files_at_shutdown() */
+ SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that
+ the buffer pool can be freed: flush
+ all file spaces and close all files */
+ SRV_SHUTDOWN_EXIT_THREADS/*!< Exit all threads */
+};
+
+/** Whether any undo log records can be generated */
+extern bool srv_undo_sources;
+
+/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
+SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
+extern enum srv_shutdown_t srv_shutdown_state;
+
+/** Files comprising the system tablespace */
+extern pfs_os_file_t files[1000];
diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h
new file mode 100644
index 00000000..f9f923f9
--- /dev/null
+++ b/storage/innobase/include/sync0arr.h
@@ -0,0 +1,129 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0arr.h
+The wait array used in synchronization primitives
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0arr_h
+#define sync0arr_h
+
+#include "univ.i"
+
+/** Synchronization wait array cell */
+struct sync_cell_t;
+
+/** Synchronization wait array */
+struct sync_array_t;
+
+/******************************************************************//**
+Get an instance of the sync wait array and reserve a wait array cell
+in the instance for waiting for an object. The event of the cell is
+reset to nonsignalled state.
+If reserving a cell in one instance fails, try another instance
+until an empty cell can be reserved.
+@return the sync array found, never NULL. */
+UNIV_INLINE
+sync_array_t*
+sync_array_get_and_reserve_cell(
+ void* object, /*!< in: pointer to the object to wait for */
+ ulint type, /*!< in: lock request type */
+ const char* file, /*!< in: file where requested */
+ unsigned line, /*!< in: line where requested */
+ sync_cell_t** cell); /*!< out: the cell reserved, never NULL */
+/******************************************************************//**
+Reserves a wait array cell for waiting for an object.
+The event of the cell is reset to nonsignalled state. */
+sync_cell_t*
+sync_array_reserve_cell(
+ sync_array_t* arr, /*!< in: wait array */
+ void* object, /*!< in: pointer to the object to wait for */
+ ulint type, /*!< in: lock request type */
+ const char* file, /*!< in: file where requested */
+ unsigned line); /*!< in: line where requested */
+
+/******************************************************************//**
+This function should be called when a thread starts to wait on
+a wait array cell. In the debug version this function checks
+if the wait for a semaphore will result in a deadlock, in which
+case it prints info and asserts. */
+void
+sync_array_wait_event(
+ sync_array_t* arr, /*!< in: wait array */
+ sync_cell_t*& cell); /*!< in: the reserved cell */
+
+/******************************************************************//**
+Frees the cell. NOTE! sync_array_wait_event frees the cell
+automatically! */
+void
+sync_array_free_cell(
+ sync_array_t* arr, /*!< in: wait array */
+ sync_cell_t*& cell); /*!< in: the reserved cell */
+
+/** count of how many times an object has been signalled */
+extern ulint sg_count;
+#define sync_array_object_signalled() ++sg_count
+
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return TRUE if fatal semaphore wait threshold was exceeded */
+ibool
+sync_array_print_long_waits(
+ os_thread_id_t* waiter, /*!< out: longest waiting thread */
+ const void** sema); /*!< out: longest-waited-for semaphore */
+
+/**********************************************************************//**
+Prints info of the wait array. */
+void
+sync_array_print(
+ FILE* file); /*!< in: file where to print */
+
+/** Create the primary system wait arrays */
+void sync_array_init();
+
+/** Destroy the sync array wait sub-system. */
+void sync_array_close();
+
+/**********************************************************************//**
+Get an instance of the sync wait array. */
+UNIV_INLINE
+sync_array_t*
+sync_array_get();
+/**********************************************************************//**
+Prints info of the wait array without using any mutexes/semaphores. */
+UNIV_INTERN
+void
+sync_array_print_innodb(void);
+
+/*****************************************************************//**
+Gets the nth cell in array.
+@return cell */
+UNIV_INTERN
+sync_cell_t*
+sync_array_get_nth_cell(
+/*====================*/
+ sync_array_t* arr, /*!< in: sync array */
+ ulint n); /*!< in: index */
+
+#include "sync0arr.ic"
+
+#endif /* sync0arr_h */
diff --git a/storage/innobase/include/sync0arr.ic b/storage/innobase/include/sync0arr.ic
new file mode 100644
index 00000000..962226b4
--- /dev/null
+++ b/storage/innobase/include/sync0arr.ic
@@ -0,0 +1,85 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0arr.ic
+The wait array for synchronization primitives
+
+Inline code
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+extern ulint sync_array_size;
+extern sync_array_t** sync_wait_array;
+
+#include "ut0counter.h"
+
+/**********************************************************************//**
+Get an instance of the sync wait array.
+@return an instance of the sync wait array. */
+
+UNIV_INLINE
+sync_array_t*
+sync_array_get()
+/*============*/
+{
+ if (sync_array_size <= 1) {
+ return(sync_wait_array[0]);
+ }
+
+ return(sync_wait_array[get_rnd_value() % sync_array_size]);
+}
+
+/******************************************************************//**
+Get an instance of the sync wait array and reserve a wait array cell
+in the instance for waiting for an object. The event of the cell is
+reset to nonsignalled state.
+If reserving a cell in one instance fails, try another instance
+until an empty cell can be reserved.
+@return the sync array reserved, never NULL. */
+UNIV_INLINE
+sync_array_t*
+sync_array_get_and_reserve_cell(
+/*============================*/
+ void* object, /*!< in: pointer to the object to wait for */
+ ulint type, /*!< in: lock request type */
+ const char* file, /*!< in: file where requested */
+ unsigned line, /*!< in: line where requested */
+ sync_cell_t** cell) /*!< out: the cell reserved, never NULL */
+{
+ sync_array_t* sync_arr = NULL;
+
+ *cell = NULL;
+ for (ulint i = 0; i < sync_array_size && *cell == NULL; ++i) {
+		/* Although the sync_array is currently chosen at random,
+		we still try at most sync_array_size times, in case any
+		of the instances we get is full */
+ sync_arr = sync_array_get();
+ *cell = sync_array_reserve_cell(sync_arr, object, type,
+ file, line);
+ }
+
+	/* The loop above tries each instance at most once, so this can
+	fail only if all of the sync arrays were full. The assertion
+	makes such a failure immediately visible. */
+ ut_a(*cell != NULL);
+
+ return(sync_arr);
+}
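
Putting the declarations of sync0arr.h together, the intended calling protocol (per the comments above, sync_array_wait_event() frees the cell automatically; sync_array_free_cell() is for the bail-out path) looks roughly like this sketch:

    // Sketch of the wait protocol; the availability re-check is a
    // hypothetical placeholder for caller-specific logic.
    void wait_for_object(void* object, ulint type)
    {
        sync_cell_t*  cell;
        sync_array_t* arr = sync_array_get_and_reserve_cell(
            object, type, __FILE__, __LINE__, &cell);

        if (object_became_available(object)) { /* hypothetical re-check */
            sync_array_free_cell(arr, cell);   /* bail out: free manually */
            return;
        }
        sync_array_wait_event(arr, cell);      /* frees cell on wake-up */
    }
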
diff --git a/storage/innobase/include/sync0debug.h b/storage/innobase/include/sync0debug.h
new file mode 100644
index 00000000..07e98546
--- /dev/null
+++ b/storage/innobase/include/sync0debug.h
@@ -0,0 +1,101 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0debug.h
+Debug checks for latches, header file
+
+Created 2012-08-21 Sunny Bains
+*******************************************************/
+
+#ifndef sync0debug_h
+#define sync0debug_h
+
+#include "univ.i"
+
+/** Initializes the synchronization data structures. */
+void
+sync_check_init();
+
+/** Free the InnoDB synchronization data structures. */
+void
+sync_check_close();
+
+#ifdef UNIV_DEBUG
+/** Check if it is OK to acquire the latch.
+@param[in] latch latch type */
+void
+sync_check_lock_validate(const latch_t* latch);
+
+/** Note that the lock has been granted
+@param[in] latch latch type */
+void
+sync_check_lock_granted(const latch_t* latch);
+
+/** Check if it is OK to acquire the latch.
+@param[in] latch latch type
+@param[in] level the level of the mutex */
+void
+sync_check_lock(const latch_t* latch, latch_level_t level);
+
+/**
+Check if it is OK to re-acquire the lock. */
+void
+sync_check_relock(const latch_t* latch);
+
+/** Removes a latch from the thread level array if it is found there.
+@param[in] latch to unlock */
+void
+sync_check_unlock(const latch_t* latch);
+
+/** Checks if the level array for the current thread contains a
+mutex or rw-latch at the specified level.
+@param[in] level to find
+@return a matching latch, or NULL if not found */
+const latch_t*
+sync_check_find(latch_level_t level);
+
+/** Iterates over the level array for the current thread, e.g. to check
+that it is empty; iteration terminates when the functor returns true.
+@param[in] functor called for each element.
+@return true if the functor returns true for any element */
+bool
+sync_check_iterate(const sync_check_functor_t& functor);
+
+/** Acquires the debug mutex. We cannot use the mutex defined in sync0sync,
+because the debug mutex is also acquired in sync0arr while holding the OS
+mutex protecting the sync array, and the ordinary mutex_enter might
+recursively call routines in sync0arr, leading to a deadlock on the OS
+mutex. */
+void
+rw_lock_debug_mutex_enter();
+
+/** Releases the debug mutex. */
+void
+rw_lock_debug_mutex_exit();
+
+#endif /* UNIV_DEBUG */
+
+#endif /* !sync0debug_h */
diff --git a/storage/innobase/include/sync0policy.h b/storage/innobase/include/sync0policy.h
new file mode 100644
index 00000000..68397827
--- /dev/null
+++ b/storage/innobase/include/sync0policy.h
@@ -0,0 +1,296 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2018, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/sync0policy.h
+Policies for mutexes.
+
+Created 2012-08-21 Sunny Bains.
+***********************************************************************/
+
+#ifndef sync0policy_h
+#define sync0policy_h
+
+#include "ut0rnd.h"
+#include "os0thread.h"
+#include "srv0mon.h"
+#include "sync0debug.h"
+
+#ifdef UNIV_DEBUG
+
+template <typename Mutex> class MutexDebug: public latch_t
+{
+ /** Mutex to check for lock order violation */
+ const Mutex *m_mutex;
+ /** Filename from where enter was called */
+ const char *m_filename;
+  /** Line number in filename */
+ unsigned m_line;
+ /** Thread ID of the thread that owns the mutex */
+ os_thread_id_t m_thread_id;
+ /** Mutex protecting the above members */
+ mutable OSMutex m_debug_mutex;
+
+
+ void set(const Mutex *mutex, const char *filename, unsigned line,
+ os_thread_id_t thread_id)
+ {
+ m_debug_mutex.enter();
+ m_mutex= mutex;
+ m_filename= filename;
+ m_line= line;
+ m_thread_id= thread_id;
+ m_debug_mutex.exit();
+ }
+
+
+ const MutexDebug get() const
+ {
+ MutexDebug ret;
+ m_debug_mutex.enter();
+ ret.m_mutex= m_mutex;
+ ret.m_filename= m_filename;
+ ret.m_line= m_line;
+ ret.m_thread_id= m_thread_id;
+ m_debug_mutex.exit();
+ return ret;
+ }
+
+
+ /**
+ Called either when mutex is locked or destroyed. Thus members are protected
+ from concurrent modification.
+ */
+ void assert_clean_context()
+ {
+ ut_ad(!m_mutex);
+ ut_ad(!m_filename);
+ ut_ad(!m_line);
+ ut_ad(m_thread_id == os_thread_id_t(ULINT_UNDEFINED));
+ }
+
+
+public:
+ /**
+ Called when the mutex is "created". Note: Not from the constructor
+ but when the mutex is initialised.
+ @param[in] id Mutex ID
+ */
+ void init(latch_id_t id)
+ {
+ ut_ad(id != LATCH_ID_NONE);
+ m_id= id;
+ m_debug_mutex.init();
+ set(0, 0, 0, os_thread_id_t(ULINT_UNDEFINED));
+ }
+
+
+ /** Mutex is being destroyed. */
+ void destroy()
+ {
+ assert_clean_context();
+ m_debug_mutex.destroy();
+ }
+
+
+ /**
+ Called when an attempt is made to lock the mutex
+ @param[in] mutex Mutex instance to be locked
+ @param[in] filename Filename from where it was called
+ @param[in] line Line number from where it was called
+ */
+ void enter(const Mutex &mutex, const char *filename, unsigned line)
+ {
+ MutexDebug context;
+ ut_ad(!is_owned());
+ context.init(m_id);
+ context.set(&mutex, filename, line, os_thread_get_curr_id());
+ /* Check for latch order violation. */
+ sync_check_lock_validate(&context);
+ context.set(0, 0, 0, os_thread_id_t(ULINT_UNDEFINED));
+ context.destroy();
+ }
+
+
+ /**
+ Called when the mutex is locked
+ @param[in] mutex Mutex instance that was locked
+ @param[in] filename Filename from where it was called
+ @param[in] line Line number from where it was called
+ */
+ void locked(const Mutex &mutex, const char *filename, unsigned line)
+ {
+ assert_clean_context();
+ set(&mutex, filename, line, os_thread_get_curr_id());
+ sync_check_lock_granted(this);
+ }
+
+
+ /**
+ Called when the mutex is released
+ @param[in] mutex Mutex that was released
+ */
+ void release(const Mutex &mutex)
+ {
+ ut_ad(is_owned());
+ set(0, 0, 0, os_thread_id_t(ULINT_UNDEFINED));
+ sync_check_unlock(this);
+ }
+
+
+ /** @return true if thread owns the mutex */
+ bool is_owned() const
+ {
+ return os_thread_eq(get_thread_id(), os_thread_get_curr_id());
+ }
+
+
+  /** @return the name of the file from which the mutex was acquired */
+  const char* get_enter_filename() const { return get().m_filename; }
+
+
+  /** @return the line number at which the mutex was acquired */
+  unsigned get_enter_line() const { return get().m_line; }
+
+
+ /** @return id of the thread that was trying to acquire the mutex */
+ os_thread_id_t get_thread_id() const { return get().m_thread_id; }
+
+
+ /**
+ Print information about the latch
+ @return the string representation
+ */
+ virtual std::string to_string() const
+ {
+ std::ostringstream msg;
+ const MutexDebug ctx= get();
+
+ msg << m_mutex->policy().to_string();
+ if (ctx.m_mutex)
+ msg << " addr: " << ctx.m_mutex << " acquired: "
+ << sync_basename(ctx.get_enter_filename()) << ":"
+ << ctx.get_enter_line();
+ else
+ msg << "Not locked";
+
+ return(msg.str());
+ }
+};
+#endif /* UNIV_DEBUG */
+
+/** Collect the metrics per mutex instance, no aggregation. */
+template <typename Mutex>
+struct GenericPolicy
+{
+public:
+ /** Called when the mutex is "created". Note: Not from the constructor
+ but when the mutex is initialised.
+ @param[in] id Mutex ID
+ @param[in] filename File where mutex was created
+ @param[in] line Line in filename */
+ void init(
+ const Mutex&,
+ latch_id_t id,
+ const char* filename,
+ uint32_t line)
+ UNIV_NOTHROW
+ {
+ m_id = id;
+
+ latch_meta_t& meta = sync_latch_get_meta(id);
+
+ ut_ad(meta.get_id() == id);
+
+ meta.get_counter()->single_register(&m_count);
+
+ m_filename = filename;
+ m_line = line;
+ }
+
+ /** Called when the mutex is destroyed. */
+ void destroy()
+ UNIV_NOTHROW
+ {
+ latch_meta_t& meta = sync_latch_get_meta(m_id);
+
+ meta.get_counter()->single_deregister(&m_count);
+ }
+
+ /** Called after a successful mutex acquire.
+ @param[in] n_spins Number of times the thread did
+ spins while trying to acquire the mutex
+ @param[in] n_waits Number of times the thread waited
+ in some type of OS queue */
+ void add(
+ uint32_t n_spins,
+ uint32_t n_waits)
+ UNIV_NOTHROW
+ {
+ /* Currently global on/off. Keeps things simple and fast */
+
+ if (!m_count.m_enabled) {
+
+ return;
+ }
+
+ m_count.m_spins += n_spins;
+ m_count.m_waits += n_waits;
+
+ ++m_count.m_calls;
+ }
+
+ /** Print the information about the latch
+ @return the string representation */
+ std::string print() const
+ UNIV_NOTHROW;
+
+ /** @return the latch ID */
+ latch_id_t get_id() const
+ UNIV_NOTHROW
+ {
+ return(m_id);
+ }
+
+
+ /** @return the string representation */
+ std::string to_string() const
+ {
+ return sync_mutex_to_string(get_id(),
+ std::string(m_filename)
+ .append(":")
+ .append(std::to_string(m_line)));
+ }
+
+#ifdef UNIV_DEBUG
+ MutexDebug<Mutex> context;
+#endif
+
+private:
+ const char *m_filename;
+ uint32_t m_line;
+
+ /** The user visible counters, registered with the meta-data. */
+ latch_meta_t::CounterType::Count m_count;
+
+ /** Latch meta data ID */
+ latch_id_t m_id;
+};
+
+#endif /* sync0policy_h */
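
A mutex implementation would invoke these policy hooks around its low-level lock word operations. The skeleton below is hypothetical (MutexImpl and the spin loop are not part of this header) and only shows where add() and, under UNIV_DEBUG, the MutexDebug context calls would fire:

    // Hypothetical skeleton; ut_d() wraps debug-only statements.
    template <typename MutexImpl>
    struct PolicyMutexSketch {
        MutexImpl                impl;
        GenericPolicy<MutexImpl> policy;

        void enter(const char* file, unsigned line)
        {
            uint32_t n_spins = 0, n_waits = 0;
            ut_d(policy.context.enter(impl, file, line));  /* order check */
            /* ... spin and/or wait on impl, counting both ... */
            policy.add(n_spins, n_waits);                  /* metrics */
            ut_d(policy.context.locked(impl, file, line)); /* record owner */
        }
    };
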
diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h
new file mode 100644
index 00000000..084acc51
--- /dev/null
+++ b/storage/innobase/include/sync0rw.h
@@ -0,0 +1,838 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0rw.h
+The read-write lock (for threads, not for database transactions)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0rw_h
+#define sync0rw_h
+
+#include "os0event.h"
+#include "ut0mutex.h"
+#include "ilist.h"
+
+/** Counters for RW locks. */
+struct rw_lock_stats_t {
+ typedef ib_counter_t<int64_t, IB_N_SLOTS> int64_counter_t;
+
+ /** number of spin waits on rw-latches,
+ resulted during shared (read) locks */
+ int64_counter_t rw_s_spin_wait_count;
+
+ /** number of spin loop rounds on rw-latches,
+ resulted during shared (read) locks */
+ int64_counter_t rw_s_spin_round_count;
+
+ /** number of OS waits on rw-latches,
+ resulted during shared (read) locks */
+ int64_counter_t rw_s_os_wait_count;
+
+ /** number of spin waits on rw-latches,
+ resulted during exclusive (write) locks */
+ int64_counter_t rw_x_spin_wait_count;
+
+ /** number of spin loop rounds on rw-latches,
+ resulted during exclusive (write) locks */
+ int64_counter_t rw_x_spin_round_count;
+
+ /** number of OS waits on rw-latches,
+ resulted during exclusive (write) locks */
+ int64_counter_t rw_x_os_wait_count;
+
+ /** number of spin waits on rw-latches,
+ resulted during sx locks */
+ int64_counter_t rw_sx_spin_wait_count;
+
+ /** number of spin loop rounds on rw-latches,
+ resulted during sx locks */
+ int64_counter_t rw_sx_spin_round_count;
+
+ /** number of OS waits on rw-latches,
+ resulted during sx locks */
+ int64_counter_t rw_sx_os_wait_count;
+};
+
+/* Latch types; these are used also in btr0btr.h and mtr0mtr.h: keep the
+numerical values smaller than 30 (smaller than BTR_MODIFY_TREE and
+MTR_MEMO_MODIFY) and keep the order of the numerical values as below! They
+should be powers of 2 so that they can also be used as ORed combinations of
+flags. */
+enum rw_lock_type_t {
+ RW_S_LATCH = 1,
+ RW_X_LATCH = 2,
+ RW_SX_LATCH = 4,
+ RW_NO_LATCH = 8
+};
+
+/* We decrement lock_word by X_LOCK_DECR for each x_lock. It is also the
+start value for the lock_word, meaning that it limits the maximum number
+of concurrent read locks before the rw_lock breaks. */
+/* We decrement lock_word by X_LOCK_HALF_DECR for sx_lock. */
+#define X_LOCK_DECR 0x20000000
+#define X_LOCK_HALF_DECR 0x10000000
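
Spelling out the arithmetic: the lock_word starts at X_LOCK_DECR when unlocked; each s-lock subtracts 1, an sx-lock subtracts X_LOCK_HALF_DECR, and an x-lock subtracts the full X_LOCK_DECR. A small sanity sketch under that reading:

    // The sx decrement is exactly half of the x decrement.
    static_assert(X_LOCK_DECR == 2 * X_LOCK_HALF_DECR,
                  "sx-lock decrement must be half of the x-lock decrement");

    /* unlocked:    lock_word == X_LOCK_DECR
       n s-locks:   lock_word == X_LOCK_DECR - n
       one sx-lock: lock_word == X_LOCK_HALF_DECR
       one x-lock:  lock_word == 0 */
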
+
+#ifdef rw_lock_t
+#undef rw_lock_t
+#endif
+struct rw_lock_t;
+
+#ifdef UNIV_DEBUG
+struct rw_lock_debug_t;
+#endif /* UNIV_DEBUG */
+
+extern ilist<rw_lock_t> rw_lock_list;
+extern ib_mutex_t rw_lock_list_mutex;
+
+/** Counters for RW locks. */
+extern rw_lock_stats_t rw_lock_stats;
+
+#ifndef UNIV_PFS_RWLOCK
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed.
+If the MySQL performance schema is enabled and "UNIV_PFS_RWLOCK" is
+defined, the rw-locks are instrumented with performance schema probes. */
+# ifdef UNIV_DEBUG
+# define rw_lock_create(K, L, level) \
+ rw_lock_create_func((L), (level), __FILE__, __LINE__)
+# else /* UNIV_DEBUG */
+# define rw_lock_create(K, L, level) \
+ rw_lock_create_func((L), __FILE__, __LINE__)
+# endif /* UNIV_DEBUG */
+
+/**************************************************************//**
+NOTE! The following macros should be used in rw locking and
+unlocking, not the corresponding function. */
+
+# define rw_lock_s_lock(M) \
+ rw_lock_s_lock_func((M), 0, __FILE__, __LINE__)
+
+# define rw_lock_s_lock_inline(M, P, F, L) \
+ rw_lock_s_lock_func((M), (P), (F), (L))
+
+# define rw_lock_s_lock_gen(M, P) \
+ rw_lock_s_lock_func((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_s_lock_nowait(M, F, L) \
+ rw_lock_s_lock_low((M), 0, (F), (L))
+
+# ifdef UNIV_DEBUG
+# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(P, L)
+# else
+# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(L)
+# endif /* UNIV_DEBUG */
+
+#define rw_lock_sx_lock(L) \
+ rw_lock_sx_lock_func((L), 0, __FILE__, __LINE__)
+
+#define rw_lock_sx_lock_inline(M, P, F, L) \
+ rw_lock_sx_lock_func((M), (P), (F), (L))
+
+#define rw_lock_sx_lock_gen(M, P) \
+ rw_lock_sx_lock_func((M), (P), __FILE__, __LINE__)
+
+#define rw_lock_sx_lock_nowait(M, P) \
+ rw_lock_sx_lock_low((M), (P), __FILE__, __LINE__)
+
+# ifdef UNIV_DEBUG
+# define rw_lock_sx_unlock(L) rw_lock_sx_unlock_func(0, L)
+# define rw_lock_sx_unlock_gen(L, P) rw_lock_sx_unlock_func(P, L)
+# else /* UNIV_DEBUG */
+# define rw_lock_sx_unlock(L) rw_lock_sx_unlock_func(L)
+# define rw_lock_sx_unlock_gen(L, P) rw_lock_sx_unlock_func(L)
+# endif /* UNIV_DEBUG */
+
+# define rw_lock_x_lock(M) \
+ rw_lock_x_lock_func((M), 0, __FILE__, __LINE__)
+
+# define rw_lock_x_lock_inline(M, P, F, L) \
+ rw_lock_x_lock_func((M), (P), (F), (L))
+
+# define rw_lock_x_lock_gen(M, P) \
+ rw_lock_x_lock_func((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_x_lock_nowait(M) \
+ rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__)
+
+# define rw_lock_x_lock_func_nowait_inline(M, F, L) \
+ rw_lock_x_lock_func_nowait((M), (F), (L))
+
+# ifdef UNIV_DEBUG
+# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(P, L)
+# else
+# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(L)
+# endif
+
+# define rw_lock_free(M) rw_lock_free_func(M)
+
+#else /* !UNIV_PFS_RWLOCK */
+
+/* Following macros point to Performance Schema instrumented functions. */
+# ifdef UNIV_DEBUG
+# define rw_lock_create(K, L, level) \
+ pfs_rw_lock_create_func((K), (L), (level), __FILE__, __LINE__)
+# else /* UNIV_DEBUG */
+# define rw_lock_create(K, L, level) \
+ pfs_rw_lock_create_func((K), (L), __FILE__, __LINE__)
+# endif /* UNIV_DEBUG */
+
+/******************************************************************
+NOTE! The following macros should be used in rw locking and
+unlocking, not the corresponding function. */
+
+# define rw_lock_s_lock(M) \
+ pfs_rw_lock_s_lock_func((M), 0, __FILE__, __LINE__)
+
+# define rw_lock_s_lock_inline(M, P, F, L) \
+ pfs_rw_lock_s_lock_func((M), (P), (F), (L))
+
+# define rw_lock_s_lock_gen(M, P) \
+ pfs_rw_lock_s_lock_func((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_s_lock_nowait(M, F, L) \
+ pfs_rw_lock_s_lock_low((M), 0, (F), (L))
+
+# ifdef UNIV_DEBUG
+# define rw_lock_s_unlock_gen(L, P) pfs_rw_lock_s_unlock_func(P, L)
+# else
+# define rw_lock_s_unlock_gen(L, P) pfs_rw_lock_s_unlock_func(L)
+# endif
+
+# define rw_lock_sx_lock(M) \
+ pfs_rw_lock_sx_lock_func((M), 0, __FILE__, __LINE__)
+
+# define rw_lock_sx_lock_inline(M, P, F, L) \
+ pfs_rw_lock_sx_lock_func((M), (P), (F), (L))
+
+# define rw_lock_sx_lock_gen(M, P) \
+ pfs_rw_lock_sx_lock_func((M), (P), __FILE__, __LINE__)
+
+#define rw_lock_sx_lock_nowait(M, P) \
+ pfs_rw_lock_sx_lock_low((M), (P), __FILE__, __LINE__)
+
+# ifdef UNIV_DEBUG
+# define rw_lock_sx_unlock(L) pfs_rw_lock_sx_unlock_func(0, L)
+# define rw_lock_sx_unlock_gen(L, P) pfs_rw_lock_sx_unlock_func(P, L)
+# else
+# define rw_lock_sx_unlock(L) pfs_rw_lock_sx_unlock_func(L)
+# define rw_lock_sx_unlock_gen(L, P) pfs_rw_lock_sx_unlock_func(L)
+# endif
+
+# define rw_lock_x_lock(M) \
+ pfs_rw_lock_x_lock_func((M), 0, __FILE__, __LINE__)
+
+# define rw_lock_x_lock_inline(M, P, F, L) \
+ pfs_rw_lock_x_lock_func((M), (P), (F), (L))
+
+# define rw_lock_x_lock_gen(M, P) \
+ pfs_rw_lock_x_lock_func((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_x_lock_nowait(M) \
+ pfs_rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__)
+
+# define rw_lock_x_lock_func_nowait_inline(M, F, L) \
+ pfs_rw_lock_x_lock_func_nowait((M), (F), (L))
+
+# ifdef UNIV_DEBUG
+# define rw_lock_x_unlock_gen(L, P) pfs_rw_lock_x_unlock_func(P, L)
+# else
+# define rw_lock_x_unlock_gen(L, P) pfs_rw_lock_x_unlock_func(L)
+# endif
+
+# define rw_lock_free(M) pfs_rw_lock_free_func(M)
+
+#endif /* !UNIV_PFS_RWLOCK */
+
+#define rw_lock_s_unlock(L) rw_lock_s_unlock_gen(L, 0)
+#define rw_lock_x_unlock(L) rw_lock_x_unlock_gen(L, 0)
+
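+/* Illustrative usage of the macros above (a minimal sketch; the latch
+name is hypothetical, while the key and level shown are real identifiers
+declared in sync0sync.h and sync0types.h):
+
+	rw_lock_t	latch;
+	rw_lock_create(index_tree_rw_lock_key, &latch, SYNC_INDEX_TREE);
+	rw_lock_s_lock(&latch);
+	...		read the protected structure
+	rw_lock_s_unlock(&latch);
+	rw_lock_free(&latch);
+*/
+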
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+void
+rw_lock_create_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+ latch_level_t level, /*!< in: level */
+#endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ unsigned cline); /*!< in: file line where created */
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the rw-lock is freed. Removes an rw-lock object from the global list. The
+rw-lock is checked to be in the non-locked state. */
+void
+rw_lock_free_func(
+/*==============*/
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the rw-lock has been initialized and that there are no
+simultaneous shared and exclusive locks.
+@return true */
+bool
+rw_lock_validate(
+/*=============*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+#endif /* UNIV_DEBUG */
+/******************************************************************//**
+Low-level function which tries to lock an rw-lock in s-mode.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_s_lock_low(
+/*===============*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass MY_ATTRIBUTE((unused)),
+ /*!< in: pass value; != 0, if the lock will be
+ passed to another thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ unsigned line); /*!< in: line where requested */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function, except if
+you supply the file name and line number. Lock an rw-lock in shared mode
+for the current thread. If the rw-lock is locked in exclusive mode, or
+there is an exclusive lock request waiting, the function spins a preset
+time (controlled by srv_n_spin_wait_rounds), waiting for the lock, before
+suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line); /*!< in: line where requested */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread if the lock can be
+obtained immediately.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_func_nowait(
+/*=======================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line); /*!< in: line where requested */
+/******************************************************************//**
+Releases a shared mode lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif /* UNIV_DEBUG */
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by srv_n_spin_wait_rounds), waiting
+for the lock, before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+void
+rw_lock_x_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line); /*!< in: line where requested */
+/******************************************************************//**
+Low-level function for acquiring an sx lock.
+@return FALSE if did not succeed, TRUE if success. */
+ibool
+rw_lock_sx_lock_low(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line); /*!< in: line where requested */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in SX mode for the current thread. If the rw-lock is locked
+in exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by srv_n_spin_wait_rounds), waiting
+for the lock, before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single sx-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+void
+rw_lock_sx_lock_func(
+/*=================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line); /*!< in: line where requested */
+/******************************************************************//**
+Releases an exclusive mode lock. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+#ifdef UNIV_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif /* UNIV_DEBUG */
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+
+/******************************************************************//**
+Releases an sx mode lock. */
+UNIV_INLINE
+void
+rw_lock_sx_unlock_func(
+/*===================*/
+#ifdef UNIV_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif /* UNIV_DEBUG */
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+
+/******************************************************************//**
+This function is used in the insert buffer to move the ownership of an
+x-latch on a buffer frame to the current thread. The x-latch was set by
+the buffer read operation and it protected the buffer frame while the
+read was done. The ownership is moved because we want that the current
+thread is able to acquire a second x-latch which is stored in an mtr.
+This, in turn, is needed to pass the debug checks of index page
+operations. */
+void
+rw_lock_x_lock_move_ownership(
+/*==========================*/
+ rw_lock_t* lock); /*!< in: lock which was x-locked in the
+ buffer read */
+/******************************************************************//**
+Returns the value of writer_count for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call.
+@return value of writer_count */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/******************************************************************//**
+Returns the number of sx-locks for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call.
+@return value of sx-lock count */
+UNIV_INLINE
+ulint
+rw_lock_get_sx_lock_count(
+/*======================*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/******************************************************************//**
+Returns the write-status of the lock - this function made more sense
+with the old rw_lock implementation.
+@return RW_LOCK_NOT_LOCKED, RW_LOCK_X, RW_LOCK_X_WAIT, RW_LOCK_SX */
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/******************************************************************//**
+Returns the number of readers (s-locks).
+@return number of readers */
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/******************************************************************//**
+Decrements lock_word the specified amount if it is greater than the
+threshold. This is used by both s_lock and x_lock operations.
+@return true if decr occurs */
+UNIV_INLINE
+bool
+rw_lock_lock_word_decr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ int32_t amount, /*!< in: amount to decrement */
+	int32_t		threshold);	/*!< in: threshold to compare against */
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks if the thread has locked the rw-lock in the specified mode, with
+the pass value == 0. */
+bool
+rw_lock_own(
+/*========*/
+ const rw_lock_t*lock, /*!< in: rw-lock */
+ ulint lock_type) /*!< in: lock type: RW_LOCK_S,
+ RW_LOCK_X */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************************//**
+Checks if the thread has locked the rw-lock in the specified mode, with
+the pass value == 0. */
+bool
+rw_lock_own_flagged(
+/*================*/
+ const rw_lock_t* lock, /*!< in: rw-lock */
+ rw_lock_flags_t flags) /*!< in: specify lock types with
+ OR of the rw_lock_flag_t values */
+ MY_ATTRIBUTE((warn_unused_result));
+#endif /* UNIV_DEBUG */
+/******************************************************************//**
+Checks if somebody has locked the rw-lock in the specified mode.
+@return true if locked */
+bool
+rw_lock_is_locked(
+/*==============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint lock_type); /*!< in: lock type: RW_LOCK_S,
+ RW_LOCK_X or RW_LOCK_SX */
+#ifdef UNIV_DEBUG
+/***************************************************************//**
+Prints debug info of currently locked rw-locks. */
+void
+rw_lock_list_print_info(
+/*====================*/
+ FILE* file); /*!< in: file where to print */
+
+/*#####################################################################*/
+
+/*********************************************************************//**
+Prints info of a debug struct. */
+void
+rw_lock_debug_print(
+/*================*/
+ FILE* f, /*!< in: output stream */
+ const rw_lock_debug_t* info); /*!< in: debug struct */
+#endif /* UNIV_DEBUG */
+
+/* NOTE! The structure appears here only for the compiler to know its size.
+Do not use its fields directly! */
+
+/** The structure used in the spin lock implementation of a read-write
+lock. Several threads may have a shared lock simultaneously in this
+lock, but only one writer may have an exclusive lock, in which case no
+shared locks are allowed. To prevent starving of a writer blocked by
+readers, a writer may queue for x-lock by decrementing lock_word: no
+new readers will be let in while the thread waits for readers to
+exit. */
+
+struct rw_lock_t :
+#ifdef UNIV_DEBUG
+ public latch_t,
+#endif /* UNIV_DEBUG */
+ public ilist_node<>
+{
+ ut_d(bool created= false;)
+
+ /** Holds the state of the lock. */
+ Atomic_relaxed<int32_t> lock_word;
+
+ /** 0=no waiters, 1=waiters for X or SX lock exist */
+ Atomic_relaxed<uint32_t> waiters;
+
+ /** number of granted SX locks. */
+ volatile ulint sx_recursive;
+
+ /** The value is typically set to thread id of a writer thread making
+ normal rw_locks recursive. In case of asynchronous IO, when a non-zero
+ value of 'pass' is passed then we keep the lock non-recursive.
+
+ writer_thread must be reset in x_unlock functions before incrementing
+ the lock_word. */
+ volatile os_thread_id_t writer_thread;
+
+ /** Used by sync0arr.cc for thread queueing */
+ os_event_t event;
+
+ /** Event for next-writer to wait on. A thread must decrement
+ lock_word before waiting. */
+ os_event_t wait_ex_event;
+
+ /** File name where lock created */
+ const char* cfile_name;
+
+ /** File name where last x-locked */
+ const char* last_x_file_name;
+
+ /** Line where created */
+ unsigned cline:13;
+
+ /** If 1 then the rw-lock is a block lock */
+ unsigned is_block_lock:1;
+
+ /** Line number where last time x-locked */
+ unsigned last_x_line:14;
+
+ /** Count of os_waits. May not be accurate */
+ uint32_t count_os_wait;
+
+#ifdef UNIV_PFS_RWLOCK
+ /** The instrumentation hook */
+ struct PSI_rwlock* pfs_psi;
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_DEBUG
+ std::string to_string() const override;
+
+ /** In the debug version: pointer to the debug info list of the lock */
+ UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list;
+
+ /** Level in the global latching order. */
+ latch_level_t level;
+#endif /* UNIV_DEBUG */
+};
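+
+/* Illustrative map from lock_word to lock state, as decoded by
+rw_lock_get_writer() and rw_lock_get_reader_count() in sync0rw.ic
+(X_LOCK_DECR and X_LOCK_HALF_DECR are defined earlier in this file):
+
+	lock_word == X_LOCK_DECR:	unlocked
+	X_LOCK_HALF_DECR < lock_word < X_LOCK_DECR:
+					s-locked only
+	0 < lock_word <= X_LOCK_HALF_DECR:
+					sx-locked, possibly with s-locks
+	lock_word == 0:			x-locked once
+	lock_word == -X_LOCK_HALF_DECR:	x-locked once, also holding sx
+	lock_word <= -X_LOCK_DECR:	x-locked recursively
+	other negative values:		x-waiter draining remaining readers
+*/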
+#ifdef UNIV_DEBUG
+/** The structure for storing debug info of an rw-lock. All access to this
+structure must be protected by rw_lock_debug_mutex_enter(). */
+struct rw_lock_debug_t {
+
+ os_thread_id_t thread_id; /*!< The thread id of the thread which
+ locked the rw-lock */
+ ulint pass; /*!< Pass value given in the lock operation */
+ ulint lock_type; /*!< Type of the lock: RW_LOCK_X,
+ RW_LOCK_S, RW_LOCK_X_WAIT */
+ const char* file_name;/*!< File name where the lock was obtained */
+ unsigned line; /*!< Line where the rw-lock was locked */
+ UT_LIST_NODE_T(rw_lock_debug_t) list;
+ /*!< Debug structs are linked in a two-way
+ list */
+};
+#endif /* UNIV_DEBUG */
+
+/* For performance schema instrumentation, a new set of rwlock
+wrap functions is created when "UNIV_PFS_RWLOCK" is defined.
+The instrumentation is not planted directly into the original
+functions, so that the underlying functions are kept as they
+are. If a user wants to "take out" some rwlock from
+instrumentation even though performance schema (UNIV_PFS_RWLOCK)
+is defined, they can do so by reinstating APIs that link directly
+to the original underlying functions.
+The instrumented function names have the prefix "pfs_rw_lock_" vs.
+the original name prefix "rw_lock_". The following functions have
+been instrumented:
+
+rw_lock_create()
+rw_lock_x_lock()
+rw_lock_x_lock_gen()
+rw_lock_x_lock_nowait()
+rw_lock_x_unlock_gen()
+rw_lock_s_lock()
+rw_lock_s_lock_gen()
+rw_lock_s_lock_nowait()
+rw_lock_s_unlock_gen()
+rw_lock_sx_lock()
+rw_lock_sx_unlock_gen()
+rw_lock_free()
+*/
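+
+/* Sketch of the wrapper pattern (a summary of the definitions in
+sync0rw.ic, abridged; not an additional API): each pfs_ function
+brackets the underlying call with performance schema events when the
+lock is instrumented, and calls it directly otherwise:
+
+	if (lock->pfs_psi != NULL) {
+		locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)(...);
+		rw_lock_x_lock_func(lock, pass, file_name, line);
+		if (locker != NULL) {
+			PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, 0);
+		}
+	} else {
+		rw_lock_x_lock_func(lock, pass, file_name, line);
+	}
+*/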
+
+#ifdef UNIV_PFS_RWLOCK
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_create_func()
+NOTE! Please use the corresponding macro rw_lock_create(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_create_func(
+/*====================*/
+	mysql_pfs_key_t	key,		/*!< in: key registered with
+ performance schema */
+ rw_lock_t* lock, /*!< in: rw lock */
+#ifdef UNIV_DEBUG
+ latch_level_t level, /*!< in: level */
+#endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ unsigned cline); /*!< in: file line where created */
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_lock_func()
+NOTE! Please use the corresponding macro rw_lock_x_lock(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for
+rw_lock_x_lock_func_nowait()
+NOTE! Please use the corresponding macro, not directly this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_x_lock_func_nowait(
+/*===========================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_func()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_low()
+NOTE! Please use the corresponding macro rw_lock_s_lock_nowait(), not
+directly this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_s_lock_low(
+/*===================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock will be passed to another
+ thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ unsigned line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_s_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_unlock_func(
+/*======================*/
+#ifdef UNIV_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif /* UNIV_DEBUG */
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_unlock_func(
+/*======================*/
+#ifdef UNIV_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif /* UNIV_DEBUG */
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_sx_lock_func()
+NOTE! Please use the corresponding macro rw_lock_sx_lock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_sx_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_sx_lock_nowait()
+NOTE! Please use the corresponding macro, not directly
+this function! */
+UNIV_INLINE
+ibool
+pfs_rw_lock_sx_lock_low(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_sx_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_sx_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_sx_unlock_func(
+/*======================*/
+#ifdef UNIV_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif /* UNIV_DEBUG */
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_free_func()
+NOTE! Please use the corresponding macro rw_lock_free(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_free_func(
+/*==================*/
+ rw_lock_t* lock); /*!< in: rw-lock */
+#endif /* UNIV_PFS_RWLOCK */
+
+#include "sync0rw.ic"
+
+#endif /* sync0rw.h */
diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic
new file mode 100644
index 00000000..169cbdd9
--- /dev/null
+++ b/storage/innobase/include/sync0rw.ic
@@ -0,0 +1,842 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0rw.ic
+The read-write lock (for threads)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0event.h"
+
+/******************************************************************//**
+Lock an rw-lock in shared mode for the current thread. If the rw-lock is
+locked in exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by srv_n_spin_wait_rounds),
+waiting for the lock before suspending the thread. */
+void
+rw_lock_s_lock_spin(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line); /*!< in: line where requested */
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Inserts the debug information for an rw-lock. */
+void
+rw_lock_add_debug_info(
+/*===================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type, /*!< in: lock type */
+ const char* file_name, /*!< in: file where requested */
+ unsigned line); /*!< in: line where requested */
+/******************************************************************//**
+Removes a debug information struct for an rw-lock. */
+void
+rw_lock_remove_debug_info(
+/*======================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type); /*!< in: lock type */
+#endif /* UNIV_DEBUG */
+
+/******************************************************************//**
+Returns the write-status of the lock - this function made more sense
+with the old rw_lock implementation.
+@return RW_LOCK_NOT_LOCKED, RW_LOCK_X, RW_LOCK_X_WAIT, RW_LOCK_SX */
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ int32_t lock_word = lock->lock_word;
+
+ ut_ad(lock_word <= X_LOCK_DECR);
+ if (lock_word > X_LOCK_HALF_DECR) {
+ /* return NOT_LOCKED in s-lock state, like the writer
+ member of the old lock implementation. */
+ return(RW_LOCK_NOT_LOCKED);
+ } else if (lock_word > 0) {
+ /* sx-locked, no x-locks */
+ return(RW_LOCK_SX);
+ } else if (lock_word == 0
+ || lock_word == -X_LOCK_HALF_DECR
+ || lock_word <= -X_LOCK_DECR) {
+ /* x-lock with sx-lock is also treated as RW_LOCK_EX */
+ return(RW_LOCK_X);
+ } else {
+ /* x-waiter with sx-lock is also treated as RW_LOCK_WAIT_EX
+ e.g. -X_LOCK_HALF_DECR < lock_word < 0 : without sx
+ -X_LOCK_DECR < lock_word < -X_LOCK_HALF_DECR : with sx */
+ return(RW_LOCK_X_WAIT);
+ }
+}
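+
+/* Worked examples for the decoding above (illustrative, in terms of the
+X_LOCK_DECR/X_LOCK_HALF_DECR constants defined in sync0rw.h):
+	lock_word == X_LOCK_DECR - 3:	3 s-locks, RW_LOCK_NOT_LOCKED
+	lock_word == X_LOCK_HALF_DECR:	sx-locked, RW_LOCK_SX
+	lock_word == -2:		2 s-locks with an x-waiter,
+					RW_LOCK_X_WAIT */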
+
+/******************************************************************//**
+Returns the number of readers (s-locks).
+@return number of readers */
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ int32_t lock_word = lock->lock_word;
+ ut_ad(lock_word <= X_LOCK_DECR);
+
+ if (lock_word > X_LOCK_HALF_DECR) {
+ /* s-locked, no x-waiter */
+ return ulint(X_LOCK_DECR - lock_word);
+ } else if (lock_word > 0) {
+ /* s-locked, with sx-locks only */
+ return ulint(X_LOCK_HALF_DECR - lock_word);
+ } else if (lock_word == 0) {
+ /* x-locked */
+ return(0);
+ } else if (lock_word > -X_LOCK_HALF_DECR) {
+ /* s-locked, with x-waiter */
+ return((ulint)(-lock_word));
+ } else if (lock_word == -X_LOCK_HALF_DECR) {
+ /* x-locked with sx-locks */
+ return(0);
+ } else if (lock_word > -X_LOCK_DECR) {
+ /* s-locked, with x-waiter and sx-lock */
+ return((ulint)(-(lock_word + X_LOCK_HALF_DECR)));
+ }
+ /* no s-locks */
+ return(0);
+}
+
+/******************************************************************//**
+Returns the value of writer_count for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call.
+@return value of writer_count */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ int32_t lock_copy = lock->lock_word;
+ ut_ad(lock_copy <= X_LOCK_DECR);
+
+ if (lock_copy == 0 || lock_copy == -X_LOCK_HALF_DECR) {
+ /* "1 x-lock" or "1 x-lock + sx-locks" */
+ return(1);
+ } else if (lock_copy > -X_LOCK_DECR) {
+ /* s-locks, one or more sx-locks if > 0, or x-waiter if < 0 */
+ return(0);
+ } else if (lock_copy > -(X_LOCK_DECR + X_LOCK_HALF_DECR)) {
+ /* no s-lock, no sx-lock, 2 or more x-locks.
+ First 2 x-locks are set with -X_LOCK_DECR,
+ all other recursive x-locks are set with -1 */
+ return ulint(2 - X_LOCK_DECR - lock_copy);
+ } else {
+ /* no s-lock, 1 or more sx-lock, 2 or more x-locks.
+ First 2 x-locks are set with -(X_LOCK_DECR + X_LOCK_HALF_DECR),
+ all other recursive x-locks are set with -1 */
+ return ulint(2 - X_LOCK_DECR - X_LOCK_HALF_DECR - lock_copy);
+ }
+}
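+
+/* Worked example (illustrative): after three recursive x-locks by the
+same thread, lock_word == X_LOCK_DECR - 2 * X_LOCK_DECR - 1
+== -X_LOCK_DECR - 1, so the count above evaluates to
+2 - X_LOCK_DECR - (-X_LOCK_DECR - 1) == 3. */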
+
+/******************************************************************//**
+Returns the number of sx-lock for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call.
+@return value of sx-lock count */
+UNIV_INLINE
+ulint
+rw_lock_get_sx_lock_count(
+/*======================*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+#ifdef UNIV_DEBUG
+ int32_t lock_copy = lock->lock_word;
+
+ ut_ad(lock_copy <= X_LOCK_DECR);
+
+ while (lock_copy < 0) {
+ lock_copy += X_LOCK_DECR;
+ }
+
+ if (lock_copy > 0 && lock_copy <= X_LOCK_HALF_DECR) {
+ return(lock->sx_recursive);
+ }
+
+ return(0);
+#else /* UNIV_DEBUG */
+ return(lock->sx_recursive);
+#endif /* UNIV_DEBUG */
+}
+
+/******************************************************************//**
+Decrements lock_word the specified amount if it is greater than the
+threshold. This is used by both s_lock and x_lock operations. Recursive
+x-locks are not handled here: they are performed by the caller and need
+not be atomic, since they are done by the current lock holder.
+@return true if the decrement occurred */
+UNIV_INLINE
+bool
+rw_lock_lock_word_decr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ int32_t amount, /*!< in: amount to decrement */
+	int32_t		threshold)	/*!< in: threshold to compare against */
+{
+ int32_t lock_copy = lock->lock_word;
+
+ while (lock_copy > threshold) {
+ if (lock->lock_word.compare_exchange_strong(
+ lock_copy,
+ lock_copy - amount,
+ std::memory_order_acquire,
+ std::memory_order_relaxed)) {
+
+ return(true);
+ }
+
+ /* Note that lock_copy was reloaded above. We will
+ keep trying if a spurious conflict occurred, typically
+ caused by concurrent executions of
+ rw_lock_s_lock(). */
+
+ /* Note: unlike this implementation, rw_lock::read_lock()
+ allows concurrent calls without a spin loop */
+ }
+
+ /* A real conflict was detected. */
+ return(false);
+}
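+
+/* Usage within this file: rw_lock_s_lock_low() below passes
+(amount=1, threshold=0), taking one reader slot unless a waiting or
+active writer has pulled lock_word down to the threshold. The x- and
+sx-lock paths in sync0rw.cc are expected to pass X_LOCK_DECR or
+X_LOCK_HALF_DECR with matching thresholds (an assumption; see that
+file for the authoritative call sites). */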
+
+/******************************************************************//**
+Low-level function which tries to lock an rw-lock in s-mode.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_s_lock_low(
+/*===============*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass MY_ATTRIBUTE((unused)),
+ /*!< in: pass value; != 0, if the lock will be
+ passed to another thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ unsigned line) /*!< in: line where requested */
+{
+ if (!rw_lock_lock_word_decr(lock, 1, 0)) {
+ /* Locking did not succeed */
+ return(FALSE);
+ }
+
+ ut_d(rw_lock_add_debug_info(lock, pass, RW_LOCK_S, file_name, line));
+
+ return(TRUE); /* locking succeeded */
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in shared mode for the current thread. If the rw-lock is locked
+in exclusive mode, or there is an exclusive lock request waiting, the
+function spins a preset time (controlled by srv_n_spin_wait_rounds), waiting for
+the lock, before suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line) /*!< in: line where requested */
+{
+ /* NOTE: As we do not know the thread ids for threads which have
+ s-locked a latch, and s-lockers will be served only after waiting
+	x-lock requests have been fulfilled, if this thread already
+ owns an s-lock here, it may end up in a deadlock with another thread
+ which requests an x-lock here. Therefore, we will forbid recursive
+ s-locking of a latch: the following assert will warn the programmer
+ of the possibility of this kind of a deadlock. If we want to implement
+ safe recursive s-locking, we should keep in a list the thread ids of
+ the threads which have s-locked a latch. This would use some CPU
+ time. */
+
+ ut_ad(!rw_lock_own_flagged(lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+
+ if (!rw_lock_s_lock_low(lock, pass, file_name, line)) {
+
+ /* Did not succeed, try spin wait */
+
+ rw_lock_s_lock_spin(lock, pass, file_name, line);
+ }
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread if the lock can be
+obtained immediately.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_func_nowait(
+/*=======================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line) /*!< in: line where requested */
+{
+ int32_t oldval = X_LOCK_DECR;
+
+ if (lock->lock_word.compare_exchange_strong(oldval, 0,
+ std::memory_order_acquire,
+ std::memory_order_relaxed)) {
+ lock->writer_thread = os_thread_get_curr_id();
+
+ } else if (os_thread_eq(lock->writer_thread, os_thread_get_curr_id())) {
+ /* Relock: even though no other thread can modify (lock, unlock
+ or reserve) lock_word while there is an exclusive writer and
+ this is the writer thread, we still want concurrent threads to
+ observe consistent values. */
+ if (oldval == 0 || oldval == -X_LOCK_HALF_DECR) {
+			/* There is 1 x-lock */
+ lock->lock_word.fetch_sub(X_LOCK_DECR,
+ std::memory_order_relaxed);
+ } else if (oldval <= -X_LOCK_DECR) {
+ /* There are 2 or more x-locks */
+ lock->lock_word.fetch_sub(1,
+ std::memory_order_relaxed);
+ /* Watch for too many recursive locks */
+ ut_ad(oldval < 1);
+ } else {
+ /* Failure */
+ return(FALSE);
+ }
+ } else {
+ /* Failure */
+ return(FALSE);
+ }
+
+ ut_d(rw_lock_add_debug_info(lock, 0, RW_LOCK_X, file_name, line));
+
+ lock->last_x_file_name = file_name;
+ lock->last_x_line = line & ((1 << 14) - 1);
+
+ ut_ad(rw_lock_validate(lock));
+
+ return(TRUE);
+}
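+
+/* Illustrative trylock pattern (hypothetical caller; `latch' is an
+example name only): the nowait variant never spins or sleeps, so the
+caller must handle failure:
+
+	if (rw_lock_x_lock_nowait(&latch)) {
+		...	modify the protected structure
+		rw_lock_x_unlock(&latch);
+	} else {
+		...	back off and retry, or take another path
+	}
+*/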
+
+/******************************************************************//**
+Releases a shared mode lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif /* UNIV_DEBUG */
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_S));
+
+ /* Increment lock_word to indicate 1 less reader */
+ int32_t lock_word = lock->lock_word.fetch_add(
+ 1, std::memory_order_release);
+
+ if (lock_word == -1 || lock_word == -X_LOCK_HALF_DECR - 1) {
+ /* wait_ex waiter exists. It may not be asleep, but we signal
+ anyway. We do not wake other waiters, because they can't
+ exist without wait_ex waiter and wait_ex waiter goes first.*/
+ os_event_set(lock->wait_ex_event);
+ sync_array_object_signalled();
+ } else {
+ ut_ad(lock_word > -X_LOCK_DECR);
+ ut_ad(lock_word < X_LOCK_DECR);
+ }
+
+ ut_ad(rw_lock_validate(lock));
+}
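+
+/* Example of the signalling condition above (illustrative): a writer in
+wait_ex has already subtracted X_LOCK_DECR, so with one reader remaining
+lock_word == -1. When that reader's fetch_add(1) returns -1, lock_word
+has just reached 0 and the writer may proceed; hence the os_event_set()
+on wait_ex_event. */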
+
+/******************************************************************//**
+Releases an exclusive mode lock. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+#ifdef UNIV_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif /* UNIV_DEBUG */
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ int32_t lock_word = lock->lock_word;
+
+ if (lock_word == 0) {
+ /* Last caller in a possible recursive chain. */
+ lock->writer_thread = 0;
+ }
+
+ ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_X));
+
+ if (lock_word == 0 || lock_word == -X_LOCK_HALF_DECR) {
+ /* Last X-lock owned by this thread, it may still hold SX-locks.
+ ACQ_REL due to...
+ RELEASE: we release rw-lock
+ ACQUIRE: we want waiters to be loaded after lock_word is stored */
+ lock->lock_word.fetch_add(X_LOCK_DECR,
+ std::memory_order_acq_rel);
+
+ /* This no longer has an X-lock but it may still have
+ an SX-lock. So it is now free for S-locks by other threads.
+ We need to signal read/write waiters.
+ We do not need to signal wait_ex waiters, since they cannot
+ exist when there is a writer. */
+ if (lock->waiters) {
+ lock->waiters = 0;
+ os_event_set(lock->event);
+ sync_array_object_signalled();
+ }
+ } else if (lock_word == -X_LOCK_DECR
+ || lock_word == -(X_LOCK_DECR + X_LOCK_HALF_DECR)) {
+ /* There are 2 x-locks */
+ lock->lock_word.fetch_add(X_LOCK_DECR);
+ } else {
+ /* There are more than 2 x-locks. */
+ ut_ad(lock_word < -X_LOCK_DECR);
+ lock->lock_word.fetch_add(1);
+ }
+
+ ut_ad(rw_lock_validate(lock));
+}
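+
+/* Worked example (illustrative): with two recursive x-locks,
+lock_word == -X_LOCK_DECR. Releasing one matches the second branch and
+adds X_LOCK_DECR, leaving 0; the final release then matches the first
+branch, clears writer_thread and restores the unlocked value
+X_LOCK_DECR. */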
+
+/******************************************************************//**
+Releases an sx mode lock. */
+UNIV_INLINE
+void
+rw_lock_sx_unlock_func(
+/*===================*/
+#ifdef UNIV_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif /* UNIV_DEBUG */
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ ut_ad(rw_lock_get_sx_lock_count(lock));
+ ut_ad(lock->sx_recursive > 0);
+
+ --lock->sx_recursive;
+
+ ut_d(rw_lock_remove_debug_info(lock, pass, RW_LOCK_SX));
+
+ if (lock->sx_recursive == 0) {
+ int32_t lock_word = lock->lock_word;
+ /* Last caller in a possible recursive chain. */
+ if (lock_word > 0) {
+ lock->writer_thread = 0;
+ ut_ad(lock_word <= INT_MAX32 - X_LOCK_HALF_DECR);
+
+ /* Last SX-lock owned by this thread, doesn't own X-lock.
+ ACQ_REL due to...
+ RELEASE: we release rw-lock
+ ACQUIRE: we want waiters to be loaded after lock_word is stored */
+ lock->lock_word.fetch_add(X_LOCK_HALF_DECR,
+ std::memory_order_acq_rel);
+
+ /* Lock is now free. May have to signal read/write
+ waiters. We do not need to signal wait_ex waiters,
+ since they cannot exist when there is an sx-lock
+ holder. */
+ if (lock->waiters) {
+ lock->waiters = 0;
+ os_event_set(lock->event);
+ sync_array_object_signalled();
+ }
+ } else {
+ /* still has x-lock */
+ ut_ad(lock_word == -X_LOCK_HALF_DECR ||
+ lock_word <= -(X_LOCK_DECR + X_LOCK_HALF_DECR));
+ lock->lock_word.fetch_add(X_LOCK_HALF_DECR);
+ }
+ }
+
+ ut_ad(rw_lock_validate(lock));
+}
+
+#ifdef UNIV_PFS_RWLOCK
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_create_func().
+NOTE! Please use the corresponding macro rw_lock_create(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_create_func(
+/*====================*/
+ mysql_pfs_key_t key, /*!< in: key registered with
+ performance schema */
+ rw_lock_t* lock, /*!< in/out: pointer to memory */
+# ifdef UNIV_DEBUG
+ latch_level_t level, /*!< in: level */
+# endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ unsigned cline) /*!< in: file line where created */
+{
+ ut_d(new(lock) rw_lock_t());
+
+ /* Initialize the rwlock for performance schema */
+ lock->pfs_psi = PSI_RWLOCK_CALL(init_rwlock)(key, lock);
+
+ /* The actual function to initialize an rwlock */
+ rw_lock_create_func(lock,
+#ifdef UNIV_DEBUG
+ level,
+#endif /* UNIV_DEBUG */
+ cfile_name,
+ cline);
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_lock_func()
+NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line) /*!< in: line where requested */
+{
+ if (lock->pfs_psi != NULL) {
+ PSI_rwlock_locker* locker;
+ PSI_rwlock_locker_state state;
+
+ /* Record the acquisition of a read-write lock in exclusive
+ mode in performance schema */
+
+ locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)(
+ &state, lock->pfs_psi, PSI_RWLOCK_EXCLUSIVELOCK,
+ file_name, static_cast<uint>(line));
+
+ rw_lock_x_lock_func(
+ lock, pass, file_name, static_cast<uint>(line));
+
+ if (locker != NULL) {
+ PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, 0);
+ }
+ } else {
+ rw_lock_x_lock_func(lock, pass, file_name, line);
+ }
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for
+rw_lock_x_lock_func_nowait()
+NOTE! Please use the corresponding macro rw_lock_x_lock_nowait(),
+not directly this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_x_lock_func_nowait(
+/*===========================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock
+ requested */
+ unsigned line) /*!< in: line where requested */
+{
+ ibool ret;
+
+ if (lock->pfs_psi != NULL) {
+ PSI_rwlock_locker* locker;
+ PSI_rwlock_locker_state state;
+
+ /* Record the acquisition of a read-write trylock in exclusive
+ mode in performance schema */
+
+ locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)(
+ &state, lock->pfs_psi, PSI_RWLOCK_TRYEXCLUSIVELOCK,
+ file_name, static_cast<uint>(line));
+
+ ret = rw_lock_x_lock_func_nowait(lock, file_name, line);
+
+ if (locker != NULL) {
+ PSI_RWLOCK_CALL(end_rwlock_wrwait)(
+ locker, static_cast<int>(ret));
+ }
+ } else {
+ ret = rw_lock_x_lock_func_nowait(lock, file_name, line);
+ }
+
+ return(ret);
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_free_func()
+NOTE! Please use the corresponding macro rw_lock_free(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_free_func(
+/*==================*/
+ rw_lock_t* lock) /*!< in: pointer to rw-lock */
+{
+ if (lock->pfs_psi != NULL) {
+ PSI_RWLOCK_CALL(destroy_rwlock)(lock->pfs_psi);
+ lock->pfs_psi = NULL;
+ }
+
+ rw_lock_free_func(lock);
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_func()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock will be passed to another
+ thread to unlock */
+ const char* file_name,/*!< in: file name where lock
+ requested */
+ unsigned line) /*!< in: line where requested */
+{
+ if (lock->pfs_psi != NULL) {
+ PSI_rwlock_locker* locker;
+ PSI_rwlock_locker_state state;
+
+		/* Instrumented to inform we are acquiring a shared rwlock */
+ locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)(
+ &state, lock->pfs_psi, PSI_RWLOCK_SHAREDLOCK,
+ file_name, static_cast<uint>(line));
+
+ rw_lock_s_lock_func(lock, pass, file_name, line);
+
+ if (locker != NULL) {
+ PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0);
+ }
+ } else {
+ rw_lock_s_lock_func(lock, pass, file_name, line);
+ }
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_sx_lock_func()
+NOTE! Please use the corresponding macro rw_lock_sx_lock(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_sx_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock will be passed to another
+ thread to unlock */
+ const char* file_name,/*!< in: file name where lock
+ requested */
+ unsigned line) /*!< in: line where requested */
+{
+ if (lock->pfs_psi != NULL) {
+ PSI_rwlock_locker* locker;
+ PSI_rwlock_locker_state state;
+
+		/* Instrumented to inform we are acquiring a shared
+		exclusive rwlock */
+ locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)(
+ &state, lock->pfs_psi, PSI_RWLOCK_SHAREDEXCLUSIVELOCK,
+ file_name, static_cast<uint>(line));
+
+ rw_lock_sx_lock_func(lock, pass, file_name, line);
+
+ if (locker != NULL) {
+ PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, 0);
+ }
+ } else {
+ rw_lock_sx_lock_func(lock, pass, file_name, line);
+ }
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_low()
+NOTE! Please use the corresponding macro rw_lock_s_lock_nowait(), not
+directly this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_s_lock_low(
+/*===================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock will be passed to another
+ thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ unsigned line) /*!< in: line where requested */
+{
+ ibool ret;
+
+ if (lock->pfs_psi != NULL) {
+ PSI_rwlock_locker* locker;
+ PSI_rwlock_locker_state state;
+
+		/* Instrumented to inform we are acquiring a shared rwlock */
+ locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)(
+ &state, lock->pfs_psi, PSI_RWLOCK_TRYSHAREDLOCK,
+ file_name, static_cast<uint>(line));
+
+ ret = rw_lock_s_lock_low(lock, pass, file_name, line);
+
+ if (locker != NULL) {
+ PSI_RWLOCK_CALL(end_rwlock_rdwait)(
+ locker, static_cast<int>(ret));
+ }
+ } else {
+ ret = rw_lock_s_lock_low(lock, pass, file_name, line);
+ }
+
+ return(ret);
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_sx_lock_nowait()
+NOTE! Please use the corresponding macro, not
+directly this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_sx_lock_low(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock will be passed to another
+ thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ unsigned line) /*!< in: line where requested */
+{
+ ibool ret;
+
+ if (lock->pfs_psi != NULL) {
+ PSI_rwlock_locker* locker;
+ PSI_rwlock_locker_state state;
+
+		/* Instrumented to inform we are acquiring a shared
+		exclusive rwlock */
+ locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)(
+ &state, lock->pfs_psi,
+ PSI_RWLOCK_TRYSHAREDEXCLUSIVELOCK,
+ file_name, static_cast<uint>(line));
+
+ ret = rw_lock_sx_lock_low(lock, pass, file_name, line);
+
+ if (locker != NULL) {
+ PSI_RWLOCK_CALL(end_rwlock_rdwait)(
+ locker, static_cast<int>(ret));
+ }
+ } else {
+ ret = rw_lock_sx_lock_low(lock, pass, file_name, line);
+ }
+
+ return(ret);
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_unlock_func(
+/*======================*/
+#ifdef UNIV_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif /* UNIV_DEBUG */
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ /* Inform performance schema we are unlocking the lock */
+ if (lock->pfs_psi != NULL) {
+ PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi);
+ }
+
+ rw_lock_x_unlock_func(
+#ifdef UNIV_DEBUG
+ pass,
+#endif /* UNIV_DEBUG */
+ lock);
+}
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_sx_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_sx_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_sx_unlock_func(
+/*======================*/
+#ifdef UNIV_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif /* UNIV_DEBUG */
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ /* Inform performance schema we are unlocking the lock */
+ if (lock->pfs_psi != NULL) {
+ PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi);
+ }
+
+ rw_lock_sx_unlock_func(
+#ifdef UNIV_DEBUG
+ pass,
+#endif /* UNIV_DEBUG */
+ lock);
+}
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_s_unlock(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_unlock_func(
+/*======================*/
+#ifdef UNIV_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif /* UNIV_DEBUG */
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ /* Inform performance schema we are unlocking the lock */
+ if (lock->pfs_psi != NULL) {
+ PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi);
+ }
+
+ rw_lock_s_unlock_func(
+#ifdef UNIV_DEBUG
+ pass,
+#endif /* UNIV_DEBUG */
+ lock);
+
+}
+#endif /* UNIV_PFS_RWLOCK */
diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h
new file mode 100644
index 00000000..b7f3cff2
--- /dev/null
+++ b/storage/innobase/include/sync0sync.h
@@ -0,0 +1,107 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2020, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0sync.h
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0sync_h
+#define sync0sync_h
+
+#include "univ.i"
+
+#ifdef UNIV_PFS_MUTEX
+/* Key defines to register InnoDB mutexes with performance schema */
+extern mysql_pfs_key_t buf_pool_mutex_key;
+extern mysql_pfs_key_t dict_foreign_err_mutex_key;
+extern mysql_pfs_key_t dict_sys_mutex_key;
+extern mysql_pfs_key_t fil_system_mutex_key;
+extern mysql_pfs_key_t flush_list_mutex_key;
+extern mysql_pfs_key_t fts_delete_mutex_key;
+extern mysql_pfs_key_t fts_doc_id_mutex_key;
+extern mysql_pfs_key_t fts_pll_tokenize_mutex_key;
+extern mysql_pfs_key_t ibuf_bitmap_mutex_key;
+extern mysql_pfs_key_t ibuf_mutex_key;
+extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
+extern mysql_pfs_key_t log_sys_mutex_key;
+extern mysql_pfs_key_t log_cmdq_mutex_key;
+extern mysql_pfs_key_t log_flush_order_mutex_key;
+extern mysql_pfs_key_t recalc_pool_mutex_key;
+extern mysql_pfs_key_t purge_sys_pq_mutex_key;
+extern mysql_pfs_key_t recv_sys_mutex_key;
+extern mysql_pfs_key_t rtr_active_mutex_key;
+extern mysql_pfs_key_t rtr_match_mutex_key;
+extern mysql_pfs_key_t rtr_path_mutex_key;
+extern mysql_pfs_key_t redo_rseg_mutex_key;
+extern mysql_pfs_key_t noredo_rseg_mutex_key;
+extern mysql_pfs_key_t page_zip_stat_per_index_mutex_key;
+# ifdef UNIV_DEBUG
+extern mysql_pfs_key_t rw_lock_debug_mutex_key;
+# endif /* UNIV_DEBUG */
+extern mysql_pfs_key_t rw_lock_list_mutex_key;
+extern mysql_pfs_key_t srv_innodb_monitor_mutex_key;
+extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key;
+extern mysql_pfs_key_t srv_monitor_file_mutex_key;
+extern mysql_pfs_key_t buf_dblwr_mutex_key;
+extern mysql_pfs_key_t trx_mutex_key;
+extern mysql_pfs_key_t trx_pool_mutex_key;
+extern mysql_pfs_key_t trx_pool_manager_mutex_key;
+extern mysql_pfs_key_t lock_mutex_key;
+extern mysql_pfs_key_t lock_wait_mutex_key;
+extern mysql_pfs_key_t trx_sys_mutex_key;
+extern mysql_pfs_key_t srv_threads_mutex_key;
+extern mysql_pfs_key_t sync_array_mutex_key;
+extern mysql_pfs_key_t thread_mutex_key;
+extern mysql_pfs_key_t row_drop_list_mutex_key;
+extern mysql_pfs_key_t rw_trx_hash_element_mutex_key;
+extern mysql_pfs_key_t read_view_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_PFS_RWLOCK
+/* Following are rwlock keys used to register with MySQL
+performance schema */
+extern mysql_pfs_key_t btr_search_latch_key;
+extern mysql_pfs_key_t dict_operation_lock_key;
+extern mysql_pfs_key_t fil_space_latch_key;
+extern mysql_pfs_key_t fts_cache_rw_lock_key;
+extern mysql_pfs_key_t fts_cache_init_rw_lock_key;
+extern mysql_pfs_key_t trx_i_s_cache_lock_key;
+extern mysql_pfs_key_t trx_purge_latch_key;
+extern mysql_pfs_key_t index_tree_rw_lock_key;
+extern mysql_pfs_key_t index_online_log_key;
+extern mysql_pfs_key_t trx_sys_rw_lock_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+/** Prints info of the sync system.
+@param[in] file where to print */
+void
+sync_print(FILE* file);
+
+#endif /* !sync0sync_h */
diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h
new file mode 100644
index 00000000..feb1e3b4
--- /dev/null
+++ b/storage/innobase/include/sync0types.h
@@ -0,0 +1,1060 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0types.h
+Global types for sync
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0types_h
+#define sync0types_h
+
+#include <vector>
+
+#include "ut0new.h"
+
+#ifdef _WIN32
+/** Native mutex */
+typedef CRITICAL_SECTION sys_mutex_t;
+#else
+/** Native mutex */
+typedef pthread_mutex_t sys_mutex_t;
+#endif /* _WIN32 */
+
+/** Mutex states. */
+enum mutex_state_t {
+ /** Mutex is free */
+ MUTEX_STATE_UNLOCKED = 0,
+
+ /** Mutex is acquired by some thread. */
+ MUTEX_STATE_LOCKED = 1,
+
+ /** Mutex is contended and there are threads waiting on the lock. */
+ MUTEX_STATE_WAITERS = 2
+};
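+
+/* Illustrative lifecycle (assumed behaviour of the event-based mutex
+implementation, not defined in this file): an uncontended acquire moves
+UNLOCKED -> LOCKED; a contending thread sets WAITERS before sleeping so
+that the holder knows to signal an event on release, after which the
+state returns to UNLOCKED. */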
+
+/*
+ LATCHING ORDER WITHIN THE DATABASE
+ ==================================
+
+The mutex or latch in the central memory object, for instance, a rollback
+segment object, must be acquired before acquiring the latch or latches to
+the corresponding file data structure. In the latching order below, these
+file page object latches are placed immediately below the corresponding
+central memory object latch or mutex.
+
+Synchronization object Notes
+---------------------- -----
+
+Dictionary mutex If we have a pointer to a dictionary
+| object, e.g., a table, it can be
+| accessed without reserving the
+| dictionary mutex. We must have a
+| reservation, a memoryfix, to the
+| appropriate table object in this case,
+| and the table must be explicitly
+| released later.
+V
+Dictionary header
+|
+V
+Secondary index tree latch The tree latch protects also all
+| the B-tree non-leaf pages. These
+V can be read with the page only
+Secondary index non-leaf bufferfixed to save CPU time,
+| no s-latch is needed on the page.
+| Modification of a page requires an
+| x-latch on the page, however. If a
+| thread owns an x-latch to the tree,
+| it is allowed to latch non-leaf pages
+| even after it has acquired the fsp
+| latch.
+V
+Secondary index leaf The latch on the secondary index leaf
+| can be kept while accessing the
+| clustered index, to save CPU time.
+V
+Clustered index tree latch To increase concurrency, the tree
+| latch is usually released when the
+| leaf page latch has been acquired.
+V
+Clustered index non-leaf
+|
+V
+Clustered index leaf
+|
+V
+Transaction system header
+|
+V
+Rollback segment mutex The rollback segment mutex must be
+| reserved, if, e.g., a new page must
+| be added to an undo log. The rollback
+| segment and the undo logs in its
+| history list can be seen as an
+| analogue of a B-tree, and the latches
+| reserved similarly, using a version of
+| lock-coupling. If an undo log must be
+| extended by a page when inserting an
+| undo log record, this corresponds to
+| a pessimistic insert in a B-tree.
+V
+Rollback segment header
+|
+V
+Purge system latch
+|
+V
+Undo log pages If a thread owns the trx undo mutex,
+| or for a log in the history list, the
+| rseg mutex, it is allowed to latch
+| undo log pages in any order, and even
+| after it has acquired the fsp latch.
+| If a thread does not have the
+| appropriate mutex, it is allowed to
+| latch only a single undo log page in
+| a mini-transaction.
+V
+File space management latch If a mini-transaction must allocate
+| several file pages, it can do that,
+| because it keeps the x-latch to the
+| file space management in its memo.
+V
+File system pages
+|
+V
+lock_sys_wait_mutex Mutex protecting lock timeout data
+|
+V
+lock_sys_mutex Mutex protecting lock_sys_t
+|
+V
+trx_sys.mutex Mutex protecting trx_sys.trx_list
+|
+V
+Threads mutex Background thread scheduling mutex
+|
+V
+query_thr_mutex Mutex protecting query threads
+|
+V
+trx_mutex Mutex protecting trx_t fields
+|
+V
+Search system mutex
+|
+V
+Buffer pool mutex
+|
+V
+Log mutex
+|
+V
+Any other latch
+|
+V
+Memory pool mutex */
+
+/** Latching order levels. If you modify these, you have to also update
+LatchDebug internals in sync0debug.cc */
+
+enum latch_level_t {
+ SYNC_UNKNOWN = 0,
+
+ SYNC_MUTEX = 1,
+
+ RW_LOCK_SX,
+ RW_LOCK_X_WAIT,
+ RW_LOCK_S,
+ RW_LOCK_X,
+ RW_LOCK_NOT_LOCKED,
+
+ SYNC_ANY_LATCH,
+
+ SYNC_POOL,
+ SYNC_POOL_MANAGER,
+
+ SYNC_SEARCH_SYS,
+
+ SYNC_WORK_QUEUE,
+
+ SYNC_FTS_TOKENIZE,
+ SYNC_FTS_OPTIMIZE,
+ SYNC_FTS_CACHE_INIT,
+ SYNC_RECV,
+ SYNC_PURGE_QUEUE,
+ SYNC_TRX_SYS_HEADER,
+ SYNC_TRX,
+ SYNC_RW_TRX_HASH_ELEMENT,
+ SYNC_READ_VIEW,
+ SYNC_TRX_SYS,
+ SYNC_LOCK_SYS,
+ SYNC_LOCK_WAIT_SYS,
+
+ SYNC_INDEX_ONLINE_LOG,
+
+ SYNC_IBUF_BITMAP,
+ SYNC_IBUF_BITMAP_MUTEX,
+ SYNC_IBUF_TREE_NODE,
+ SYNC_IBUF_TREE_NODE_NEW,
+ SYNC_IBUF_INDEX_TREE,
+
+ SYNC_IBUF_MUTEX,
+
+ SYNC_FSP_PAGE,
+ SYNC_FSP,
+ SYNC_EXTERN_STORAGE,
+ SYNC_TRX_UNDO_PAGE,
+ SYNC_RSEG_HEADER,
+ SYNC_RSEG_HEADER_NEW,
+ SYNC_NOREDO_RSEG,
+ SYNC_REDO_RSEG,
+ SYNC_PURGE_LATCH,
+ SYNC_TREE_NODE,
+ SYNC_TREE_NODE_FROM_HASH,
+ SYNC_TREE_NODE_NEW,
+ SYNC_IBUF_PESS_INSERT_MUTEX,
+ SYNC_INDEX_TREE,
+
+ SYNC_IBUF_HEADER,
+ SYNC_DICT_HEADER,
+ SYNC_STATS_AUTO_RECALC,
+ SYNC_DICT,
+ SYNC_FTS_CACHE,
+
+ SYNC_DICT_OPERATION,
+
+ SYNC_TRX_I_S_RWLOCK,
+
+ /** Level is varying. Only used with buffer pool page locks, which
+ do not have a fixed level, but instead have their level set after
+ the page is locked; see e.g. ibuf_bitmap_get_map_page(). */
+
+ SYNC_LEVEL_VARYING,
+
+ /** This can be used to suppress order checking. */
+ SYNC_NO_ORDER_CHECK,
+
+ /** Maximum level value */
+ SYNC_LEVEL_MAX = SYNC_NO_ORDER_CHECK
+};
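+
+/* Example: the enum is ordered so that latches acquired later in the
+latching order have smaller values.  A simplified sketch of the
+invariant that LatchDebug in sync0debug.cc enforces (ignoring its many
+documented exceptions), assuming <vector>:
+
+	thread_local std::vector<latch_level_t> held;
+
+	void on_latch_acquire(latch_level_t level) {
+		if (level != SYNC_NO_ORDER_CHECK
+		    && level != SYNC_LEVEL_VARYING) {
+			// A new latch may not outrank any latch that
+			// the thread already holds.
+			for (latch_level_t h : held) {
+				ut_ad(h == SYNC_NO_ORDER_CHECK
+				      || level <= h);
+			}
+		}
+		held.push_back(level);
+	}
+*/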
+
+/** Each latch has an ID. This id is used for creating the latch and to look
+up its meta-data. See sync0debug.cc. */
+enum latch_id_t {
+ LATCH_ID_NONE = 0,
+ LATCH_ID_DICT_FOREIGN_ERR,
+ LATCH_ID_DICT_SYS,
+ LATCH_ID_FIL_SYSTEM,
+ LATCH_ID_FTS_DELETE,
+ LATCH_ID_FTS_DOC_ID,
+ LATCH_ID_FTS_PLL_TOKENIZE,
+ LATCH_ID_IBUF_BITMAP,
+ LATCH_ID_IBUF,
+ LATCH_ID_IBUF_PESSIMISTIC_INSERT,
+ LATCH_ID_PURGE_SYS_PQ,
+ LATCH_ID_RECALC_POOL,
+ LATCH_ID_RECV_SYS,
+ LATCH_ID_REDO_RSEG,
+ LATCH_ID_NOREDO_RSEG,
+ LATCH_ID_RW_LOCK_DEBUG,
+ LATCH_ID_RTR_ACTIVE_MUTEX,
+ LATCH_ID_RTR_MATCH_MUTEX,
+ LATCH_ID_RTR_PATH_MUTEX,
+ LATCH_ID_RW_LOCK_LIST,
+ LATCH_ID_SRV_INNODB_MONITOR,
+ LATCH_ID_SRV_MISC_TMPFILE,
+ LATCH_ID_SRV_MONITOR_FILE,
+ LATCH_ID_TRX_POOL,
+ LATCH_ID_TRX_POOL_MANAGER,
+ LATCH_ID_TRX,
+ LATCH_ID_LOCK_SYS,
+ LATCH_ID_LOCK_SYS_WAIT,
+ LATCH_ID_TRX_SYS,
+ LATCH_ID_SRV_SYS_TASKS,
+ LATCH_ID_PAGE_ZIP_STAT_PER_INDEX,
+ LATCH_ID_SYNC_ARRAY_MUTEX,
+ LATCH_ID_ROW_DROP_LIST,
+ LATCH_ID_INDEX_ONLINE_LOG,
+ LATCH_ID_WORK_QUEUE,
+ LATCH_ID_BTR_SEARCH,
+ LATCH_ID_BUF_BLOCK_LOCK,
+ LATCH_ID_BUF_BLOCK_DEBUG,
+ LATCH_ID_DICT_OPERATION,
+ LATCH_ID_FIL_SPACE,
+ LATCH_ID_FTS_CACHE,
+ LATCH_ID_FTS_CACHE_INIT,
+ LATCH_ID_TRX_I_S_CACHE,
+ LATCH_ID_TRX_PURGE,
+ LATCH_ID_IBUF_INDEX_TREE,
+ LATCH_ID_INDEX_TREE,
+ LATCH_ID_DICT_TABLE_STATS,
+ LATCH_ID_DEFRAGMENT_MUTEX,
+ LATCH_ID_BTR_DEFRAGMENT_MUTEX,
+ LATCH_ID_FIL_CRYPT_STAT_MUTEX,
+ LATCH_ID_FIL_CRYPT_DATA_MUTEX,
+ LATCH_ID_FIL_CRYPT_THREADS_MUTEX,
+ LATCH_ID_RW_TRX_HASH_ELEMENT,
+ LATCH_ID_READ_VIEW,
+ LATCH_ID_MAX = LATCH_ID_READ_VIEW
+};
+
+#ifndef UNIV_INNOCHECKSUM
+/** OS mutex, without any policy. It is a thin wrapper around the
+system mutexes. The interface is different from the policy mutexes,
+to ensure that it is used directly and not confused with the
+policy mutexes. */
+struct OSMutex {
+
+ /** Constructor */
+ OSMutex()
+ UNIV_NOTHROW
+ {
+ ut_d(m_freed = true);
+ }
+
+ /** Create the mutex by calling the system functions. */
+ void init()
+ UNIV_NOTHROW
+ {
+ ut_ad(m_freed);
+
+#ifdef _WIN32
+ InitializeCriticalSection((LPCRITICAL_SECTION) &m_mutex);
+#else
+ {
+ int ret = pthread_mutex_init(&m_mutex, NULL);
+ ut_a(ret == 0);
+ }
+#endif /* _WIN32 */
+
+ ut_d(m_freed = false);
+ }
+
+ /** Destructor */
+ ~OSMutex() { }
+
+ /** Destroy the mutex */
+ void destroy()
+ UNIV_NOTHROW
+ {
+ ut_ad(!m_freed);
+#ifdef _WIN32
+ DeleteCriticalSection((LPCRITICAL_SECTION) &m_mutex);
+#else
+ int ret;
+
+ ret = pthread_mutex_destroy(&m_mutex);
+
+ if (ret != 0) {
+
+ ib::error()
+ << "Return value " << ret << " when calling "
+ << "pthread_mutex_destroy().";
+ }
+#endif /* _WIN32 */
+ ut_d(m_freed = true);
+ }
+
+ /** Release the mutex. */
+ void exit()
+ UNIV_NOTHROW
+ {
+ ut_ad(!m_freed);
+#ifdef _WIN32
+ LeaveCriticalSection(&m_mutex);
+#else
+ int ret = pthread_mutex_unlock(&m_mutex);
+ ut_a(ret == 0);
+#endif /* _WIN32 */
+ }
+
+ /** Acquire the mutex. */
+ void enter()
+ UNIV_NOTHROW
+ {
+ ut_ad(!m_freed);
+#ifdef _WIN32
+ EnterCriticalSection((LPCRITICAL_SECTION) &m_mutex);
+#else
+ int ret = pthread_mutex_lock(&m_mutex);
+ ut_a(ret == 0);
+#endif /* _WIN32 */
+ }
+
+ /** @return true if locking succeeded */
+ bool try_lock()
+ UNIV_NOTHROW
+ {
+ ut_ad(!m_freed);
+#ifdef _WIN32
+ return(TryEnterCriticalSection(&m_mutex) != 0);
+#else
+ return(pthread_mutex_trylock(&m_mutex) == 0);
+#endif /* _WIN32 */
+ }
+
+ /** Required for os_event_t */
+ operator sys_mutex_t*()
+ UNIV_NOTHROW
+ {
+ return(&m_mutex);
+ }
+
+private:
+#ifdef DBUG_ASSERT_EXISTS
+ /** true if the mutex has been freed/destroyed. */
+ bool m_freed;
+#endif /* DBUG_ASSERT_EXISTS */
+
+ sys_mutex_t m_mutex;
+};
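+
+/* Usage sketch: unlike std::mutex, OSMutex has no RAII semantics;
+init() and destroy() must be called explicitly:
+
+	OSMutex mutex;
+
+	void example() {
+		mutex.init();		// create the native mutex
+		mutex.enter();		// lock
+		// ... critical section ...
+		mutex.exit();		// unlock
+
+		if (mutex.try_lock()) {	// non-blocking attempt
+			mutex.exit();
+		}
+
+		mutex.destroy();	// release the native resources
+	}
+*/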
+
+#ifdef UNIV_PFS_MUTEX
+/** Latch element.
+Used for mutexes which have PFS keys defined under UNIV_PFS_MUTEX.
+@param[in] id Latch id
+@param[in] level Latch level
+@param[in] key PFS key */
+# define LATCH_ADD_MUTEX(id, level, key) latch_meta[LATCH_ID_ ## id] =\
+ UT_NEW_NOKEY(latch_meta_t(LATCH_ID_ ## id, #id, level, #level, key))
+
+#ifdef UNIV_PFS_RWLOCK
+/** Latch element.
+Used for rwlocks which have PFS keys defined under UNIV_PFS_RWLOCK.
+@param[in] id Latch id
+@param[in] level Latch level
+@param[in] key PFS key */
+# define LATCH_ADD_RWLOCK(id, level, key) latch_meta[LATCH_ID_ ## id] =\
+ UT_NEW_NOKEY(latch_meta_t(LATCH_ID_ ## id, #id, level, #level, key))
+#else
+# define LATCH_ADD_RWLOCK(id, level, key) latch_meta[LATCH_ID_ ## id] =\
+ UT_NEW_NOKEY(latch_meta_t(LATCH_ID_ ## id, #id, level, #level, \
+ PSI_NOT_INSTRUMENTED))
+#endif /* UNIV_PFS_RWLOCK */
+
+#else
+# define LATCH_ADD_MUTEX(id, level, key) latch_meta[LATCH_ID_ ## id] =\
+ UT_NEW_NOKEY(latch_meta_t(LATCH_ID_ ## id, #id, level, #level))
+# define LATCH_ADD_RWLOCK(id, level, key) latch_meta[LATCH_ID_ ## id] =\
+ UT_NEW_NOKEY(latch_meta_t(LATCH_ID_ ## id, #id, level, #level))
+#endif /* UNIV_PFS_MUTEX */
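+
+/* Sketch of how sync_latch_meta_init() in sync0debug.cc is expected
+to populate latch_meta with these macros (the level/key pairings below
+are illustrative):
+
+	latch_meta.resize(LATCH_ID_MAX + 1);
+
+	// expands to: latch_meta[LATCH_ID_RECALC_POOL] =
+	//   UT_NEW_NOKEY(latch_meta_t(LATCH_ID_RECALC_POOL,
+	//     "RECALC_POOL", SYNC_STATS_AUTO_RECALC,
+	//     "SYNC_STATS_AUTO_RECALC", recalc_pool_mutex_key));
+	LATCH_ADD_MUTEX(RECALC_POOL, SYNC_STATS_AUTO_RECALC,
+			recalc_pool_mutex_key);
+
+	LATCH_ADD_RWLOCK(BTR_SEARCH, SYNC_SEARCH_SYS,
+			 btr_search_latch_key);
+*/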
+
+/** Default latch counter */
+class LatchCounter {
+
+public:
+ /** The counts we collect for a mutex */
+ struct Count {
+
+ /** Constructor */
+ Count()
+ UNIV_NOTHROW
+ :
+ m_spins(),
+ m_waits(),
+ m_calls(),
+ m_enabled()
+ {
+ /* No op */
+ }
+
+	/** Reset the values to zero */
+ void reset()
+ UNIV_NOTHROW
+ {
+ m_spins = 0;
+ m_waits = 0;
+ m_calls = 0;
+ }
+
+ /** Number of spins trying to acquire the latch. */
+ uint32_t m_spins;
+
+ /** Number of waits trying to acquire the latch */
+ uint32_t m_waits;
+
+ /** Number of times it was called */
+ uint32_t m_calls;
+
+ /** true if enabled */
+ bool m_enabled;
+ };
+
+ /** Constructor */
+ LatchCounter()
+ UNIV_NOTHROW
+ :
+ m_active(false)
+ {
+ m_mutex.init();
+ }
+
+ /** Destructor */
+ ~LatchCounter()
+ UNIV_NOTHROW
+ {
+ m_mutex.destroy();
+
+ for (Counters::iterator it = m_counters.begin();
+ it != m_counters.end();
+ ++it) {
+
+ Count* count = *it;
+
+ UT_DELETE(count);
+ }
+ }
+
+	/** Reset all counters to zero. The mutex only guards the
+	container; the individual counts are updated without atomicity,
+	which is acceptable because the information collected is not
+	required for the correct functioning of the server. */
+ void reset()
+ UNIV_NOTHROW
+ {
+ m_mutex.enter();
+
+ Counters::iterator end = m_counters.end();
+
+ for (Counters::iterator it = m_counters.begin();
+ it != end;
+ ++it) {
+
+ (*it)->reset();
+ }
+
+ m_mutex.exit();
+ }
+
+ /** @return the aggregate counter */
+ Count* sum_register()
+ UNIV_NOTHROW
+ {
+ m_mutex.enter();
+
+ Count* count;
+
+ if (m_counters.empty()) {
+ count = UT_NEW_NOKEY(Count());
+ m_counters.push_back(count);
+ } else {
+ ut_a(m_counters.size() == 1);
+ count = m_counters[0];
+ }
+
+ m_mutex.exit();
+
+ return(count);
+ }
+
+ /** Register a single instance counter */
+ void single_register(Count* count)
+ UNIV_NOTHROW
+ {
+ m_mutex.enter();
+
+ m_counters.push_back(count);
+
+ m_mutex.exit();
+ }
+
+ /** Deregister a single instance counter
+ @param[in] count The count instance to deregister */
+ void single_deregister(Count* count)
+ UNIV_NOTHROW
+ {
+ m_mutex.enter();
+
+ m_counters.erase(
+ std::remove(
+ m_counters.begin(),
+ m_counters.end(), count),
+ m_counters.end());
+
+ m_mutex.exit();
+ }
+
+ /** Iterate over the counters */
+ template<typename C> void iterate(const C& callback) UNIV_NOTHROW
+ {
+ m_mutex.enter();
+
+ Counters::const_iterator end = m_counters.end();
+
+ for (Counters::const_iterator it = m_counters.begin();
+ it != end;
+ ++it) {
+
+ callback(*it);
+ }
+
+ m_mutex.exit();
+ }
+
+	/** Enable the monitoring */
+ void enable()
+ UNIV_NOTHROW
+ {
+ m_mutex.enter();
+
+ Counters::const_iterator end = m_counters.end();
+
+ for (Counters::const_iterator it = m_counters.begin();
+ it != end;
+ ++it) {
+
+ (*it)->m_enabled = true;
+ }
+
+ m_active = true;
+
+ m_mutex.exit();
+ }
+
+ /** Disable the monitoring */
+ void disable()
+ UNIV_NOTHROW
+ {
+ m_mutex.enter();
+
+ Counters::const_iterator end = m_counters.end();
+
+ for (Counters::const_iterator it = m_counters.begin();
+ it != end;
+ ++it) {
+
+ (*it)->m_enabled = false;
+ }
+
+ m_active = false;
+
+ m_mutex.exit();
+ }
+
+	/** @return true if monitoring is active */
+ bool is_enabled() const
+ UNIV_NOTHROW
+ {
+ return(m_active);
+ }
+
+private:
+ /* Disable copying */
+ LatchCounter(const LatchCounter&);
+ LatchCounter& operator=(const LatchCounter&);
+
+private:
+ typedef OSMutex Mutex;
+ typedef std::vector<Count*> Counters;
+
+ /** Mutex protecting m_counters */
+ Mutex m_mutex;
+
+ /** Counters for the latches */
+ Counters m_counters;
+
+ /** if true then we collect the data */
+ bool m_active;
+};
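+
+/* Sketch of how a latch implementation might feed statistics into a
+LatchCounter (the hook names here are hypothetical; assumes <cstdio>):
+
+	LatchCounter counter;
+	LatchCounter::Count* stats = counter.sum_register();
+
+	void on_acquire(uint32_t spins, bool waited) {
+		if (stats->m_enabled) {
+			stats->m_spins += spins;
+			stats->m_waits += waited;
+			++stats->m_calls;
+		}
+	}
+
+	void report() {
+		counter.iterate([](const LatchCounter::Count* c) {
+			printf("calls=%u spins=%u waits=%u\n",
+			       c->m_calls, c->m_spins, c->m_waits);
+		});
+	}
+*/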
+
+/** Latch meta data */
+template <typename Counter = LatchCounter>
+class LatchMeta {
+
+public:
+ typedef Counter CounterType;
+
+#ifdef UNIV_PFS_MUTEX
+ typedef mysql_pfs_key_t pfs_key_t;
+#endif /* UNIV_PFS_MUTEX */
+
+ /** Constructor */
+ LatchMeta()
+ :
+ m_id(LATCH_ID_NONE),
+ m_name(),
+ m_level(SYNC_UNKNOWN),
+ m_level_name()
+#ifdef UNIV_PFS_MUTEX
+ ,m_pfs_key()
+#endif /* UNIV_PFS_MUTEX */
+ {
+ }
+
+ /** Destructor */
+ ~LatchMeta() { }
+
+ /** Constructor
+ @param[in] id Latch id
+ @param[in] name Latch name
+ @param[in] level Latch level
+ @param[in] level_name Latch level text representation
+ @param[in] key PFS key */
+ LatchMeta(
+ latch_id_t id,
+ const char* name,
+ latch_level_t level,
+ const char* level_name
+#ifdef UNIV_PFS_MUTEX
+ ,pfs_key_t key
+#endif /* UNIV_PFS_MUTEX */
+ )
+ :
+ m_id(id),
+ m_name(name),
+ m_level(level),
+ m_level_name(level_name)
+#ifdef UNIV_PFS_MUTEX
+ ,m_pfs_key(key)
+#endif /* UNIV_PFS_MUTEX */
+ {
+ /* No op */
+ }
+
+	/** Less than operator.
+ @param[in] rhs Instance to compare against
+ @return true if this.get_id() < rhs.get_id() */
+ bool operator<(const LatchMeta& rhs) const
+ {
+ return(get_id() < rhs.get_id());
+ }
+
+ /** @return the latch id */
+ latch_id_t get_id() const
+ {
+ return(m_id);
+ }
+
+ /** @return the latch name */
+ const char* get_name() const
+ {
+ return(m_name);
+ }
+
+ /** @return the latch level */
+ latch_level_t get_level() const
+ {
+ return(m_level);
+ }
+
+ /** @return the latch level name */
+ const char* get_level_name() const
+ {
+ return(m_level_name);
+ }
+
+#ifdef UNIV_PFS_MUTEX
+ /** @return the PFS key for the latch */
+ pfs_key_t get_pfs_key() const
+ {
+ return(m_pfs_key);
+ }
+#endif /* UNIV_PFS_MUTEX */
+
+ /** @return the counter instance */
+ Counter* get_counter()
+ {
+ return(&m_counter);
+ }
+
+private:
+ /** Latch id */
+ latch_id_t m_id;
+
+ /** Latch name */
+ const char* m_name;
+
+ /** Latch level in the ordering */
+ latch_level_t m_level;
+
+ /** Latch level text representation */
+ const char* m_level_name;
+
+#ifdef UNIV_PFS_MUTEX
+ /** PFS key */
+ pfs_key_t m_pfs_key;
+#endif /* UNIV_PFS_MUTEX */
+
+ /** For gathering latch statistics */
+ Counter m_counter;
+};
+
+typedef LatchMeta<LatchCounter> latch_meta_t;
+typedef std::vector<latch_meta_t*, ut_allocator<latch_meta_t*> > LatchMetaData;
+
+/** Note: This is accessed without any mutex protection. It is initialised
+at startup and elements should not be added to or removed from it after
+that. See sync_latch_meta_init() */
+extern LatchMetaData latch_meta;
+
+/** Get the latch meta-data from the latch ID
+@param[in] id Latch ID
+@return the latch meta data */
+inline
+latch_meta_t&
+sync_latch_get_meta(latch_id_t id)
+{
+ ut_ad(static_cast<size_t>(id) < latch_meta.size());
+ ut_ad(id == latch_meta[id]->get_id());
+
+ return(*latch_meta[id]);
+}
+
+/** Fetch the counter for the latch
+@param[in] id Latch ID
+@return the latch counter */
+inline
+latch_meta_t::CounterType*
+sync_latch_get_counter(latch_id_t id)
+{
+ latch_meta_t& meta = sync_latch_get_meta(id);
+
+ return(meta.get_counter());
+}
+
+/** Get the latch name from the latch ID
+@param[in] id Latch ID
+@return the name, will assert if not found */
+inline
+const char*
+sync_latch_get_name(latch_id_t id)
+{
+ const latch_meta_t& meta = sync_latch_get_meta(id);
+
+ return(meta.get_name());
+}
+
+/** Get the latch ordering level
+@param[in] id Latch id to lookup
+@return the latch level */
+inline
+latch_level_t
+sync_latch_get_level(latch_id_t id)
+{
+ const latch_meta_t& meta = sync_latch_get_meta(id);
+
+ return(meta.get_level());
+}
+
+#ifdef UNIV_PFS_MUTEX
+/** Get the latch PFS key from the latch ID
+@param[in] id Latch ID
+@return the PFS key */
+inline
+mysql_pfs_key_t
+sync_latch_get_pfs_key(latch_id_t id)
+{
+ const latch_meta_t& meta = sync_latch_get_meta(id);
+
+ return(meta.get_pfs_key());
+}
+#endif
+
+/** String representation of the filename and line number where the
+latch was created
+@param[in] id Latch ID
+@param[in]	created	Filename and line number where it was created
+@return the string representation */
+std::string
+sync_mutex_to_string(
+ latch_id_t id,
+ const std::string& created);
+
+/** Get the latch name from a sync level
+@param[in] level Latch level to lookup
+@return the latch name, or NULL if not found. */
+const char*
+sync_latch_get_name(latch_level_t level);
+
+/** Strip the directory prefix from a filename.
+@return the file basename */
+const char*
+sync_basename(const char* filename);
+
+#ifdef UNIV_DEBUG
+
+/** All (ordered) latches, used in debugging, must derive from this class. */
+struct latch_t {
+
+ /** Constructor
+ @param[in] id The latch ID */
+ explicit latch_t(latch_id_t id = LATCH_ID_NONE)
+ UNIV_NOTHROW
+ :
+ m_id(id),
+ m_rw_lock() {}
+
+ /** Destructor */
+ virtual ~latch_t() UNIV_NOTHROW { }
+
+ /** @return the latch ID */
+ latch_id_t get_id() const
+ {
+ return(m_id);
+ }
+
+ /** @return true if it is a rw-lock */
+ bool is_rw_lock() const
+ UNIV_NOTHROW
+ {
+ return(m_rw_lock);
+ }
+
+ /** Print the latch context
+ @return the string representation */
+ virtual std::string to_string() const = 0;
+
+ /** @return the latch level */
+ latch_level_t get_level() const
+ UNIV_NOTHROW
+ {
+ ut_a(m_id != LATCH_ID_NONE);
+
+ return(sync_latch_get_level(m_id));
+ }
+
+ /** @return the latch name, m_id must be set */
+ const char* get_name() const
+ UNIV_NOTHROW
+ {
+ ut_a(m_id != LATCH_ID_NONE);
+
+ return(sync_latch_get_name(m_id));
+ }
+
+ /** Latch ID */
+ latch_id_t m_id;
+
+ /** true if it is a rw-lock. In debug mode, rw_lock_t derives from
+ this class and sets this variable. */
+ bool m_rw_lock;
+};
+
+/** Subclass this to iterate over a thread's acquired latch levels. */
+struct sync_check_functor_t {
+ virtual ~sync_check_functor_t() { }
+ virtual bool operator()(const latch_level_t) const = 0;
+};
+
+/** Check that no latch is being held.
+@tparam some_allowed whether some latches are allowed to be held */
+template<bool some_allowed = false>
+struct sync_checker : public sync_check_functor_t
+{
+ /** Check the latching constraints
+ @param[in] level The level held by the thread
+ @return whether a latch violation was detected */
+ bool operator()(const latch_level_t level) const override
+ {
+ if (some_allowed) {
+ switch (level) {
+ case SYNC_FSP:
+ case SYNC_DICT:
+ case SYNC_DICT_OPERATION:
+ case SYNC_FTS_CACHE:
+ case SYNC_NO_ORDER_CHECK:
+ return(false);
+ default:
+ return(true);
+ }
+ }
+
+ return(true);
+ }
+};
+
+/** The strict latch checker (no InnoDB latches may be held) */
+typedef struct sync_checker<false> sync_check;
+/** The sloppy latch checker (can hold InnoDB dictionary or SQL latches) */
+typedef struct sync_checker<true> dict_sync_check;
+
+/** Functor to check for given latching constraints. */
+struct sync_allowed_latches : public sync_check_functor_t {
+
+ /** Constructor
+ @param[in] from first element in an array of latch_level_t
+ @param[in] to last element in an array of latch_level_t */
+ sync_allowed_latches(
+ const latch_level_t* from,
+ const latch_level_t* to)
+ : begin(from), end(to) { }
+
+	/** Checks whether the given latch level violates the latch
+	constraint. This object maintains a list of allowed latch
+	levels, and if the given level is not in that list, it is a
+	violation.
+
+	@param[in]	level	The latch level to check
+	@return true if there is a latch violation */
+ bool operator()(const latch_level_t level) const override
+ {
+ return(std::find(begin, end, level) == end);
+ }
+
+private:
+ /** First element in an array of allowed latch levels */
+ const latch_level_t* const begin;
+ /** First element after the end of the array of allowed latch levels */
+ const latch_level_t* const end;
+};
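+
+/* Usage sketch for debug builds, assuming sync_check_iterate()
+(declared in sync0debug) applies the functor to every latch level the
+thread holds and returns true on the first violation:
+
+	void assert_only_lock_sys_held() {
+		static const latch_level_t allowed[] = {
+			SYNC_LOCK_SYS,
+			SYNC_NO_ORDER_CHECK
+		};
+
+		sync_allowed_latches check(
+			allowed,
+			allowed + sizeof(allowed) / sizeof(*allowed));
+
+		// fails if any latch outside the allowed set is held
+		ut_ad(!sync_check_iterate(check));
+	}
+*/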
+
+/** Get the latch id from a latch name.
+@param[in]	name	Latch name
+@return the latch id, or LATCH_ID_NONE if not found. */
+latch_id_t
+sync_latch_get_id(const char* name);
+
+typedef ulint rw_lock_flags_t;
+
+/* Flags to specify lock types for rw_lock_own_flagged() */
+enum rw_lock_flag_t {
+ RW_LOCK_FLAG_S = 1 << 0,
+ RW_LOCK_FLAG_X = 1 << 1,
+ RW_LOCK_FLAG_SX = 1 << 2
+};
+
+#endif /* UNIV_DEBUG */
+
+#endif /* UNIV_INNOCHECKSUM */
+
+/** Simple non-atomic counter aligned to the CPU cache line size
+@tparam Type the integer type of the counter */
+template <typename Type>
+struct MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) simple_counter
+{
+ /** Increment the counter */
+ Type inc() { return add(1); }
+ /** Decrement the counter */
+ Type dec() { return add(Type(~0)); }
+
+ /** Add to the counter
+ @param[in] i amount to be added
+ @return the value of the counter after adding */
+ Type add(Type i) { return m_counter += i; }
+
+ /** @return the value of the counter */
+ operator Type() const { return m_counter; }
+
+private:
+ /** The counter */
+ Type m_counter;
+};
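+
+/* Usage sketch: the counter is deliberately non-atomic, so concurrent
+updates can be lost; callers either tolerate that or serialize the
+updates themselves:
+
+	// zero-initialized due to static storage duration
+	static simple_counter<ulint> n_page_reads;
+
+	void on_page_read() {
+		n_page_reads.inc();	// same as add(1)
+	}
+
+	ulint page_reads_snapshot() {
+		return n_page_reads;	// via operator Type()
+	}
+*/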
+#endif /* sync0types_h */
diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h
new file mode 100644
index 00000000..40160ce4
--- /dev/null
+++ b/storage/innobase/include/trx0i_s.h
@@ -0,0 +1,278 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0i_s.h
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables cache structures and public
+functions.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef trx0i_s_h
+#define trx0i_s_h
+
+#include "trx0types.h"
+#include "dict0types.h"
+#include "buf0types.h"
+
+/** The maximum amount of memory that can be consumed by innodb_trx,
+innodb_locks and innodb_lock_waits information schema tables. */
+#define TRX_I_S_MEM_LIMIT 16777216 /* 16 MiB */
+
+/** The maximum length of a string that can be stored in
+i_s_locks_row_t::lock_data */
+#define TRX_I_S_LOCK_DATA_MAX_LEN 8192
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_query */
+#define TRX_I_S_TRX_QUERY_MAX_LEN 1024
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_foreign_key_error */
+#define TRX_I_S_TRX_FK_ERROR_MAX_LEN 256
+
+/** Safely copy strings into the INNODB_TRX table's
+string-based columns */
+#define TRX_I_S_STRING_COPY(data, field, constraint, tcache) \
+do { \
+ if (strlen(data) > constraint) { \
+ char buff[constraint + 1]; \
+ strncpy(buff, data, constraint); \
+ buff[constraint] = '\0'; \
+ \
+ field = static_cast<const char*>( \
+ ha_storage_put_memlim( \
+ (tcache)->storage, buff, constraint + 1,\
+ MAX_ALLOWED_FOR_STORAGE(tcache))); \
+ } else { \
+ field = static_cast<const char*>( \
+ ha_storage_put_str_memlim( \
+ (tcache)->storage, data, \
+ MAX_ALLOWED_FOR_STORAGE(tcache))); \
+ } \
+} while (0)
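+
+/* Sketch of the expected call pattern (see fill_trx_row() in
+trx0i_s.cc): truncate trx->detailed_error to the column limit before
+storing it in the cache's string storage:
+
+	TRX_I_S_STRING_COPY(trx->detailed_error,
+			    row->trx_foreign_key_error,
+			    TRX_I_S_TRX_FK_ERROR_MAX_LEN,
+			    cache);
+*/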
+
+/** A row of INFORMATION_SCHEMA.innodb_locks */
+struct i_s_locks_row_t;
+
+/** Objects of trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t;
+
+/** Objects of this type are added to the hash table
+trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t {
+ i_s_locks_row_t* value; /*!< row of
+ INFORMATION_SCHEMA.innodb_locks*/
+ i_s_hash_chain_t* next; /*!< next item in the hash chain */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_locks row */
+struct i_s_locks_row_t {
+ trx_id_t lock_trx_id; /*!< transaction identifier */
+ const char* lock_table; /*!< table name from
+ lock_get_table_name() */
+ /** index name of a record lock; NULL for table locks */
+ const char* lock_index;
+ /** page identifier of the record; (0,0) if !lock_index */
+ page_id_t lock_page;
+ /** heap number of the record; 0 if !lock_index */
+ uint16_t lock_rec;
+ /** lock mode corresponding to lock_mode_values_typelib */
+ uint8_t lock_mode;
+ /** (some) content of the record, if available in the buffer pool;
+ NULL if !lock_index */
+ const char* lock_data;
+
+ /** The following are auxiliary and not included in the table */
+ /* @{ */
+ table_id_t lock_table_id;
+ /*!< table identifier from
+ lock_get_table_id */
+ i_s_hash_chain_t hash_chain; /*!< hash table chain node for
+ trx_i_s_cache_t::locks_hash */
+ /* @} */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_trx row */
+struct i_s_trx_row_t {
+ trx_id_t trx_id; /*!< transaction identifier */
+ const char* trx_state; /*!< transaction state from
+ trx_get_que_state_str() */
+ time_t trx_started; /*!< trx_t::start_time */
+ const i_s_locks_row_t* requested_lock_row;
+ /*!< pointer to a row
+ in innodb_locks if trx
+ is waiting, or NULL */
+ time_t trx_wait_started; /*!< trx_t->lock.wait_started */
+ uintmax_t trx_weight; /*!< TRX_WEIGHT() */
+ ulint trx_mysql_thread_id; /*!< thd_get_thread_id() */
+ const char* trx_query; /*!< MySQL statement being
+ executed in the transaction */
+ CHARSET_INFO* trx_query_cs; /*!< the charset of trx_query */
+ const char* trx_operation_state; /*!< trx_t::op_info */
+ ulint trx_tables_in_use;/*!< n_mysql_tables_in_use in
+ trx_t */
+ ulint trx_tables_locked;
+ /*!< mysql_n_tables_locked in
+ trx_t */
+ ulint trx_lock_structs;/*!< list len of trx_locks in
+ trx_t */
+ ulint trx_lock_memory_bytes;
+ /*!< mem_heap_get_size(
+ trx->lock_heap) */
+ ulint trx_rows_locked;/*!< lock_number_of_rows_locked() */
+ uintmax_t trx_rows_modified;/*!< trx_t::undo_no */
+ uint trx_isolation_level;
+ /*!< trx_t::isolation_level */
+ bool trx_unique_checks;
+ /*!< check_unique_secondary in trx_t*/
+ bool trx_foreign_key_checks;
+ /*!< check_foreigns in trx_t */
+ const char* trx_foreign_key_error;
+ /*!< detailed_error in trx_t */
+ bool trx_is_read_only;
+ /*!< trx_t::read_only */
+ bool trx_is_autocommit_non_locking;
+					/*!< trx_t::is_autocommit_non_locking()
+ */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */
+struct i_s_lock_waits_row_t {
+ const i_s_locks_row_t* requested_lock_row; /*!< requested lock */
+ const i_s_locks_row_t* blocking_lock_row; /*!< blocking lock */
+};
+
+/** Cache of INFORMATION_SCHEMA table data */
+struct trx_i_s_cache_t;
+
+/** Auxiliary enum used by functions that need to select one of the
+INFORMATION_SCHEMA tables */
+enum i_s_table {
+ I_S_INNODB_TRX, /*!< INFORMATION_SCHEMA.innodb_trx */
+ I_S_INNODB_LOCKS, /*!< INFORMATION_SCHEMA.innodb_locks */
+ I_S_INNODB_LOCK_WAITS /*!< INFORMATION_SCHEMA.innodb_lock_waits */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+extern trx_i_s_cache_t* trx_i_s_cache;
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_init(
+/*===============*/
+ trx_i_s_cache_t* cache); /*!< out: cache to init */
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_free(
+/*===============*/
+ trx_i_s_cache_t* cache); /*!< in/out: cache to free */
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_end_read(
+/*===================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_start_write(
+/*======================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_end_write(
+/*====================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table); /*!< in: which table */
+
+/*******************************************************************//**
+Retrieves the nth row in the cache for a given INFORMATION SCHEMA
+table.
+@return row */
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table, /*!< in: which table */
+ ulint n); /*!< in: row number */
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+@return 0 - fetched, 1 - not */
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+ trx_i_s_cache_t* cache); /*!< in/out: cache */
+
+/*******************************************************************//**
+Returns true if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return true if truncated */
+bool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+/** The maximum length of a lock id produced by
+trx_i_s_create_lock_id(), not including the terminating NUL.
+":%lu:%lu:%lu" -> 63 chars */
+#define TRX_I_S_LOCK_ID_MAX_LEN (TRX_ID_MAX_LEN + 63)
+
+/*******************************************************************//**
+Crafts a lock id string from an i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
+@return resulting lock id */
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ char* lock_id,/*!< out: resulting lock_id */
+ ulint lock_id_size);/*!< in: size of the lock id
+ buffer */
+
+#endif /* trx0i_s_h */
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
new file mode 100644
index 00000000..ef942076
--- /dev/null
+++ b/storage/innobase/include/trx0purge.h
@@ -0,0 +1,268 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.h
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0purge_h
+#define trx0purge_h
+
+#include "trx0rseg.h"
+#include "que0types.h"
+
+#include <queue>
+
+/** A dummy undo record used as a return value when we have a whole undo log
+which needs no purge */
+extern trx_undo_rec_t trx_purge_dummy_rec;
+
+/** Prepend the history list with an undo log.
+Remove the undo log segment from the rseg slot if it is too big for reuse.
+@param[in] trx transaction
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction */
+void
+trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr);
+/**
+Run a purge batch.
+@param n_tasks number of purge tasks to submit to the queue
+@param truncate whether to truncate the history at the end of the batch
+@return number of undo log pages handled in the batch */
+ulint trx_purge(ulint n_tasks, bool truncate);
+
+/** Rollback segments of a given transaction, tagged with its
+trx_no and scheduled for purge. */
+class TrxUndoRsegs {
+private:
+ typedef std::vector<trx_rseg_t*, ut_allocator<trx_rseg_t*> >
+ trx_rsegs_t;
+public:
+ typedef trx_rsegs_t::iterator iterator;
+ typedef trx_rsegs_t::const_iterator const_iterator;
+
+ TrxUndoRsegs() {}
+
+ /** Constructor */
+ TrxUndoRsegs(trx_rseg_t& rseg)
+ : trx_no(rseg.last_trx_no()), m_rsegs(1, &rseg) {}
+ /** Constructor */
+ TrxUndoRsegs(trx_id_t trx_no, trx_rseg_t& rseg)
+ : trx_no(trx_no), m_rsegs(1, &rseg) {}
+
+ bool operator!=(const TrxUndoRsegs& other) const
+ { return trx_no != other.trx_no; }
+ bool empty() const { return m_rsegs.empty(); }
+ void erase(iterator& it) { m_rsegs.erase(it); }
+ iterator begin() { return(m_rsegs.begin()); }
+ iterator end() { return(m_rsegs.end()); }
+ const_iterator begin() const { return m_rsegs.begin(); }
+ const_iterator end() const { return m_rsegs.end(); }
+
+	/** Compare two TrxUndoRsegs based on trx_no.
+	@param lhs	first element to compare
+	@param rhs	second element to compare
+	@return true if lhs.trx_no > rhs.trx_no, else false. */
+ bool operator()(const TrxUndoRsegs& lhs, const TrxUndoRsegs& rhs)
+ {
+ return(lhs.trx_no > rhs.trx_no);
+ }
+
+ /** Copy of trx_rseg_t::last_trx_no() */
+ trx_id_t trx_no= 0;
+private:
+ /** Rollback segments of a transaction, scheduled for purge. */
+ trx_rsegs_t m_rsegs{};
+};
+
+typedef std::priority_queue<
+ TrxUndoRsegs,
+ std::vector<TrxUndoRsegs, ut_allocator<TrxUndoRsegs> >,
+ TrxUndoRsegs> purge_pq_t;
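+
+/* TrxUndoRsegs doubles as the comparator above: operator() returns
+lhs.trx_no > rhs.trx_no, so the std::priority_queue is a min-heap and
+top() yields the batch with the oldest commit.  Sketch (rseg_a and
+rseg_b stand for existing trx_rseg_t objects):
+
+	purge_pq_t pq;
+
+	pq.push(TrxUndoRsegs(42, *rseg_a));
+	pq.push(TrxUndoRsegs(17, *rseg_b));
+
+	ut_ad(pq.top().trx_no == 17);	// oldest is purged first
+	pq.pop();
+*/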
+
+/** Chooses the rollback segment with the oldest committed transaction */
+struct TrxUndoRsegsIterator {
+ /** Constructor */
+ TrxUndoRsegsIterator();
+ /** Sets the next rseg to purge in purge_sys.
+ Executed in the purge coordinator thread.
+ @return whether anything is to be purged */
+ inline bool set_next();
+
+private:
+ // Disable copying
+ TrxUndoRsegsIterator(const TrxUndoRsegsIterator&);
+ TrxUndoRsegsIterator& operator=(const TrxUndoRsegsIterator&);
+
+ /** The current element to process */
+ TrxUndoRsegs m_rsegs;
+ /** Track the current element in m_rsegs */
+ TrxUndoRsegs::const_iterator m_iter;
+};
+
+/** The control structure used in the purge operation */
+class purge_sys_t
+{
+public:
+ /** latch protecting view, m_enabled */
+ MY_ALIGNED(CACHE_LINE_SIZE)
+ mutable rw_lock_t latch;
+private:
+ /** The purge will not remove undo logs which are >= this view */
+ MY_ALIGNED(CACHE_LINE_SIZE)
+ ReadViewBase view;
+ /** whether purge is enabled; protected by latch and std::atomic */
+ std::atomic<bool> m_enabled;
+ /** number of pending stop() calls without resume() */
+ Atomic_counter<int32_t> m_paused;
+public:
+ que_t* query; /*!< The query graph which will do the
+ parallelized purge operation */
+
+ /** Iterator to the undo log records of committed transactions */
+ struct iterator
+ {
+ bool operator<=(const iterator& other) const
+ {
+ if (trx_no < other.trx_no) return true;
+ if (trx_no > other.trx_no) return false;
+ return undo_no <= other.undo_no;
+ }
+
+ /** trx_t::no of the committed transaction */
+ trx_id_t trx_no;
+ /** The record number within the committed transaction's undo
+		log, increasing, purged from 0 onwards */
+ undo_no_t undo_no;
+ };
+
+ /** The tail of the purge queue; the last parsed undo log of a
+ committed transaction. */
+ iterator tail;
+ /** The head of the purge queue; any older undo logs of committed
+ transactions may be discarded (history list truncation). */
+ iterator head;
+ /*-----------------------------*/
+ bool next_stored; /*!< whether rseg holds the next record
+ to purge */
+ trx_rseg_t* rseg; /*!< Rollback segment for the next undo
+ record to purge */
+ uint32_t page_no; /*!< Page number for the next undo
+ record to purge, page number of the
+ log header, if dummy record */
+ uint32_t hdr_page_no; /*!< Header page of the undo log where
+ the next record to purge belongs */
+ uint16_t offset; /*!< Page offset for the next undo
+ record to purge, 0 if the dummy
+ record */
+ uint16_t hdr_offset; /*!< Header byte offset on the page */
+
+
+ TrxUndoRsegsIterator
+ rseg_iter; /*!< Iterator to get the next rseg
+ to process */
+
+ purge_pq_t purge_queue; /*!< Binary min-heap, ordered on
+ TrxUndoRsegs::trx_no. It is protected
+ by the pq_mutex */
+ PQMutex pq_mutex; /*!< Mutex protecting purge_queue */
+
+ /** Undo tablespace file truncation (only accessed by the
+ srv_purge_coordinator_thread) */
+ struct {
+ /** The undo tablespace that is currently being truncated */
+ fil_space_t* current;
+ /** The undo tablespace that was last truncated */
+ fil_space_t* last;
+ } truncate;
+
+ /** Heap for reading the undo log records */
+ mem_heap_t* heap;
+ /**
+ Constructor.
+
+  Some members may require late initialisation, thus we just mark the
+  object as uninitialised. Real initialisation happens in create().
+ */
+
+ purge_sys_t(): m_enabled(false), heap(nullptr) {}
+
+ /** Create the instance */
+ void create();
+
+ /** Close the purge system on shutdown */
+ void close();
+
+ /** @return whether purge is enabled */
+ bool enabled() { return m_enabled.load(std::memory_order_relaxed); }
+ /** @return whether the purge coordinator is paused */
+ bool paused()
+ { return m_paused != 0; }
+
+ /** Enable purge at startup. Not protected by latch; the main thread
+ will wait for purge_sys.enabled() in srv_start() */
+ void coordinator_startup()
+ {
+ ut_ad(!enabled());
+ m_enabled.store(true, std::memory_order_relaxed);
+ }
+
+ /** Disable purge at shutdown */
+ void coordinator_shutdown()
+ {
+ ut_ad(enabled());
+ m_enabled.store(false, std::memory_order_relaxed);
+ }
+
+ /** @return whether the purge tasks are active */
+ bool running() const;
+ /** Stop purge during FLUSH TABLES FOR EXPORT */
+ void stop();
+ /** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */
+ void resume();
+ /** A wrapper around ReadView::changes_visible(). */
+ bool changes_visible(trx_id_t id, const table_name_t &name) const
+ {
+ ut_ad(rw_lock_own(&latch, RW_LOCK_S));
+ return view.changes_visible(id, name);
+ }
+ /** A wrapper around ReadView::low_limit_no(). */
+ trx_id_t low_limit_no() const
+ {
+#if 0 /* Unfortunately we don't hold this assertion, see MDEV-22718. */
+ ut_ad(rw_lock_own(&latch, RW_LOCK_S));
+#endif
+ return view.low_limit_no();
+ }
+ /** A wrapper around trx_sys_t::clone_oldest_view(). */
+ void clone_oldest_view()
+ {
+ rw_lock_x_lock(&latch);
+ trx_sys.clone_oldest_view(&view);
+ rw_lock_x_unlock(&latch);
+ }
+};
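+
+/* Reader-side usage sketch (modelled on callers such as row0vers.cc):
+visibility checks must be bracketed by an S-latch on purge_sys.latch
+so that the purge view cannot advance mid-check:
+
+	bool is_visible_to_purge(trx_id_t id, const table_name_t& name)
+	{
+		rw_lock_s_lock(&purge_sys.latch);
+		const bool visible = purge_sys.changes_visible(id, name);
+		rw_lock_s_unlock(&purge_sys.latch);
+		return visible;
+	}
+*/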
+
+/** The global data structure coordinating a purge */
+extern purge_sys_t purge_sys;
+
+#endif /* trx0purge_h */
diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h
new file mode 100644
index 00000000..9aeff631
--- /dev/null
+++ b/storage/innobase/include/trx0rec.h
@@ -0,0 +1,321 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.h
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0rec_h
+#define trx0rec_h
+
+#include "trx0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "page0types.h"
+#include "row0log.h"
+#include "que0types.h"
+
+/***********************************************************************//**
+Copies the undo record to the heap.
+@return own: copy of undo log record */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+ const trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ mem_heap_t* heap); /*!< in: heap where copied */
+/**********************************************************************//**
+Reads the undo log record type.
+@return record type */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_type(
+/*==================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+/**********************************************************************//**
+Reads the undo log record number.
+@return undo no */
+UNIV_INLINE
+undo_no_t
+trx_undo_rec_get_undo_no(
+/*=====================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+
+/**********************************************************************//**
+Returns the start of the undo record data area. */
+#define trx_undo_rec_get_ptr(undo_rec, undo_no) \
+ ((undo_rec) + trx_undo_rec_get_offset(undo_no))
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ ulint* type, /*!< out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ ulint* cmpl_info, /*!< out: compiler info, relevant only
+ for update type records */
+ bool* updated_extern, /*!< out: true if we updated an
+					externally stored field */
+ undo_no_t* undo_no, /*!< out: undo log record number */
+ table_id_t* table_id) /*!< out: table id */
+ MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ byte* ptr, /*!< in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t**ref, /*!< out, own: row reference */
+ mem_heap_t* heap); /*!< in: memory heap from which the memory
+ needed is allocated */
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ const byte* ptr, /*!< in: remaining part of undo
+ log record after reading
+ general parameters */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr, /*!< out: roll ptr */
+ byte* info_bits); /*!< out: info bits state */
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error detected, which
+means that the record is corrupted */
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ const byte* ptr, /*!< in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+	trx_id_t	trx_id,	/*!< in: transaction id from this undo record */
+ roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */
+ byte info_bits,/*!< in: info bits from this undo record */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd); /*!< out, own: update vector */
+/*******************************************************************//**
+Builds a partial row from an update undo log record, for purge.
+It contains the columns which occur as ordering in any index of the table.
+Any missing columns are indicated by col->mtype == DATA_MISSING.
+@return pointer to remaining part of undo record */
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+ const byte* ptr, /*!< in: remaining part in update undo log
+ record of a suitable type, at the start of
+ the stored index columns;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the partial row is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: updated columns */
+ dtuple_t** row, /*!< out, own: partial row */
+ ibool ignore_prefix, /*!< in: flag to indicate if we
+ expect blob prefixes in undo. Used
+ only in the assertion. */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Report a RENAME TABLE operation.
+@param[in,out] trx transaction
+@param[in] table table that is being renamed
+@return DB_SUCCESS or error code */
+dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */
+dberr_t
+trx_undo_report_row_operation(
+/*==========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: in the case of an insert,
+ index entry to insert into the
+ clustered index; in updates,
+ may contain a clustered index
+ record tuple that also contains
+ virtual columns of the table;
+ otherwise, NULL */
+ const upd_t* update, /*!< in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /*!< in: case of an update or delete
+ marking, the record in the clustered
+ index; NULL if insert */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+ roll_ptr_t* roll_ptr) /*!< out: DB_ROLL_PTR to the
+ undo log record */
+ MY_ATTRIBUTE((nonnull(1,2,8), warn_unused_result));
+
+/** status bits used for trx_undo_prev_version_build() */
+
+/** TRX_UNDO_PREV_IN_PURGE tells trx_undo_prev_version_build() that it
+is being called by the purge thread and that we would like to get the
+record even if it is in the purge view (in the normal case, the
+function returns without fetching such a record). */
+#define TRX_UNDO_PREV_IN_PURGE 0x1
+
+/** This tells trx_undo_prev_version_build() to fetch the old value in
+the undo log (which is the after image for an update) */
+#define TRX_UNDO_GET_OLD_V_VALUE 0x2
+
+/*******************************************************************//**
+Build a previous version of a clustered index record. The caller must
+hold a latch on the index page of the clustered index record.
+@retval true if previous version was built, or if it was an insert
+or the table has been rebuilt
+@retval false if the previous version is earlier than purge_view,
+which means that it may have been removed */
+bool
+trx_undo_prev_version_build(
+/*========================*/
+ const rec_t* index_rec,/*!< in: clustered index record in the
+ index tree */
+ mtr_t* index_mtr,/*!< in: mtr which contains the latch to
+ index_rec page and purge_view */
+ const rec_t* rec, /*!< in: version of a clustered index record */
+ dict_index_t* index, /*!< in: clustered index */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ rec_t** old_vers,/*!< out, own: previous version, or NULL if
+ rec is the first inserted version, or if
+ history data has been deleted */
+	mem_heap_t*	v_heap,	/*!< in: memory heap used to create vrow
+				dtuple if it is not yet created. This heap
+				differs from "heap" above in that it could be
+				prebuilt->old_vers_heap for selection */
+ dtuple_t** vrow, /*!< out: virtual column info, if any */
+	ulint		v_status);
+			/*!< in: status flags that tell whether the
+			function is invoked by the purge thread, and
+			whether the "after image" should be read from
+			the undo log */
+
+/** Read from an undo log record a non-virtual column value.
+@param[in,out] ptr pointer to remaining part of the undo record
+@param[in,out] field stored field
+@param[in,out] len length of the field, or UNIV_SQL_NULL
+@param[in,out] orig_len original length of the locally stored part
+of an externally stored column, or 0
+@return remaining part of undo log record after reading these values */
+byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
+ uint32_t *len, uint32_t *orig_len);
+
+/** Read virtual column value from undo log
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[in,out] row the dtuple to fill
+@param[in] in_purge whether this is called by purge */
+void
+trx_undo_read_v_cols(
+ const dict_table_t* table,
+ const byte* ptr,
+ dtuple_t* row,
+ bool in_purge);
+
+/** Read virtual column index from undo log if the undo log contains such
+info, and verify the column is still indexed, and output its position
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[in] first_v_col if this is the first virtual column, which
+ has the version marker
+@param[in,out]	is_undo_log	this function is used to parse both the
+				undo log and the online log for virtual
+				columns, so this flag tells which one it is
+@param[out] field_no the column number, or FIL_NULL if not indexed
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_read_v_idx(
+ const dict_table_t* table,
+ const byte* ptr,
+ bool first_v_col,
+ bool* is_undo_log,
+ uint32_t* field_no);
+
+/* Types of an undo log record: these have to be smaller than 16, as the
+compilation info multiplied by 16 is ORed to this value in an undo log
+record */
+
+#define TRX_UNDO_RENAME_TABLE 9 /*!< RENAME TABLE */
+#define TRX_UNDO_INSERT_METADATA 10 /*!< insert a metadata
+ pseudo-record for instant ALTER */
+#define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */
+#define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked
+ record */
+#define TRX_UNDO_UPD_DEL_REC 13 /* update of a delete marked record to
+ a not delete marked record; also the
+ fields of the record can change */
+#define TRX_UNDO_DEL_MARK_REC 14 /* delete marking of a record; fields
+ do not change */
+#define TRX_UNDO_CMPL_INFO_MULT 16U /* compilation info is multiplied by
+ this and ORed to the type above */
+#define TRX_UNDO_UPD_EXTERN 128U /* This bit can be ORed to type_cmpl
+ to denote that we updated external
+ storage fields: used by purge to
+ free the external storage */
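+
+/* The type byte of an undo log record thus packs three fields.  A
+decoding sketch mirroring what trx_undo_rec_get_pars() does:
+
+	inline void decode_type_cmpl(ulint type_cmpl, ulint* type,
+				     ulint* cmpl_info,
+				     bool* updated_extern)
+	{
+		*updated_extern = (type_cmpl & TRX_UNDO_UPD_EXTERN) != 0;
+		type_cmpl &= ~ulint{TRX_UNDO_UPD_EXTERN};
+		*type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
+		*cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT;
+	}
+*/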
+
+/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA */
+extern const dtuple_t trx_undo_metadata;
+
+/** Read the table id from an undo log record.
+@param[in] rec Undo log record
+@return table id stored as a part of undo log record */
+inline table_id_t trx_undo_rec_get_table_id(const trx_undo_rec_t *rec)
+{
+  /* skip the 2-byte next-record offset and the 1-byte type_cmpl */
+  rec+= 3;
+  /* skip the undo log record number */
+  mach_read_next_much_compressed(&rec);
+  return mach_read_next_much_compressed(&rec);
+}
+
+#include "trx0rec.ic"
+
+#endif /* trx0rec_h */
diff --git a/storage/innobase/include/trx0rec.ic b/storage/innobase/include/trx0rec.ic
new file mode 100644
index 00000000..02244d68
--- /dev/null
+++ b/storage/innobase/include/trx0rec.ic
@@ -0,0 +1,73 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.ic
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/**********************************************************************//**
+Reads from an undo log record the record type.
+@return record type */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_type(
+/*==================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ return(mach_read_from_1(undo_rec + 2) & (TRX_UNDO_CMPL_INFO_MULT - 1));
+}
+
+/**********************************************************************//**
+Reads the undo log record number.
+@return undo no */
+UNIV_INLINE
+undo_no_t
+trx_undo_rec_get_undo_no(
+/*=====================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ const byte* ptr;
+
+ ptr = undo_rec + 3;
+
+ return(mach_u64_read_much_compressed(ptr));
+}
+
+/***********************************************************************//**
+Copies the undo record to the heap.
+@return own: copy of undo log record */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+ const trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ mem_heap_t* heap) /*!< in: heap where copied */
+{
+	ulint	len;
+
+	/* The first two bytes of an undo record hold the page offset
+	of the next record; subtracting this record's own page offset
+	gives its length. */
+	len = mach_read_from_2(undo_rec)
+		- ut_align_offset(undo_rec, srv_page_size);
+	ut_ad(len < srv_page_size);
+ trx_undo_rec_t* rec = static_cast<trx_undo_rec_t*>(
+ mem_heap_dup(heap, undo_rec, len));
+ mach_write_to_2(rec, len);
+ return rec;
+}
diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h
new file mode 100644
index 00000000..6a562dcb
--- /dev/null
+++ b/storage/innobase/include/trx0roll.h
@@ -0,0 +1,187 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0roll.h
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0roll_h
+#define trx0roll_h
+
+#include "trx0trx.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+
+extern bool trx_rollback_is_active;
+extern const trx_t* trx_roll_crash_recv_trx;
+
+/*******************************************************************//**
+Returns a transaction savepoint taken at this point in time.
+@return savepoint */
+trx_savept_t
+trx_savept_take(
+/*============*/
+ trx_t* trx); /*!< in: transaction */
+
+/** Report progress when rolling back a row of a recovered transaction. */
+void trx_roll_report_progress();
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction was already
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+@param all true=roll back all recovered active transactions;
+false=roll back any incomplete dictionary transaction */
+void
+trx_rollback_recovered(bool all);
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction was already
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread.
+@return a dummy parameter */
+extern "C"
+os_thread_ret_t
+DECLARE_THREAD(trx_rollback_all_recovered)(void*);
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return own: rollback node struct */
+roll_node_t*
+roll_node_create(
+/*=============*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ que_thr_t* thr); /*!< in: query thread */
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_for_mysql(
+/*===================*/
+ trx_t* trx) /*!< in/out: transaction */
+ MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ trx_t* trx) /*!< in/out: transaction */
+ MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Rolls back a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a newly inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache
+ position corresponding to this
+ savepoint; MySQL needs this
+ information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new one. Savepoints are deleted at transaction
+commit or rollback.
+@return always DB_SUCCESS */
+dberr_t
+trx_savepoint_for_mysql(
+/*====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ int64_t binlog_cache_pos) /*!< in: MySQL binlog cache
+ position corresponding to this
+ connection at the time of the
+ savepoint */
+ MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Releases a named savepoint. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_release_savepoint_for_mysql(
+/*============================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name) /*!< in: savepoint name */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*******************************************************************//**
+Frees savepoint structs starting from savep. */
+void
+trx_roll_savepoints_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep); /*!< in: free all savepoints > this one;
+ if this is NULL, free all savepoints
+ of trx */
+/** Rollback node states */
+enum roll_node_state {
+ ROLL_NODE_NONE = 0, /*!< Unknown state */
+ ROLL_NODE_SEND, /*!< about to send a rollback signal to
+ the transaction */
+ ROLL_NODE_WAIT /*!< rollback signal sent to the
+ transaction, waiting for completion */
+};
+
+/** Rollback command node in a query graph */
+struct roll_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_ROLLBACK */
+ enum roll_node_state state; /*!< node execution state */
+ const trx_savept_t* savept; /*!< savepoint to which to
+ roll back, in the case of a
+ partial rollback */
+ que_thr_t* undo_thr;/*!< undo query graph */
+};
+
+/** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */
+struct trx_named_savept_t{
+ char* name; /*!< savepoint name */
+ trx_savept_t savept; /*!< the undo number corresponding to
+ the savepoint */
+ int64_t mysql_binlog_cache_pos;
+ /*!< the MySQL binlog cache position
+ corresponding to this savepoint, not
+ defined if the MySQL binlogging is not
+ enabled */
+ UT_LIST_NODE_T(trx_named_savept_t)
+ trx_savepoints; /*!< the list of savepoints of a
+ transaction */
+};
+
+#endif
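
The savepoint semantics documented above (SAVEPOINT with an existing name replaces the old savepoint; rolling back to a savepoint deletes every savepoint set after it; a missing name yields DB_NO_SAVEPOINT) can be modelled with standard containers. A hedged sketch, not InnoDB code:

#include <cassert>
#include <iterator>
#include <list>
#include <string>

struct savept { std::string name; long binlog_pos; };

struct savept_list {
  std::list<savept> l;                          // oldest first, like UT_LIST

  void set(const std::string& n, long pos) {    // trx_savepoint_for_mysql()
    for (auto it = l.begin(); it != l.end(); ++it)
      if (it->name == n) { l.erase(it); break; }   // erase the old one
    l.push_back({n, pos});
  }

  bool rollback_to(const std::string& n) {      // trx_rollback_to_savepoint_for_mysql()
    for (auto it = l.begin(); it != l.end(); ++it)
      if (it->name == n) {
        l.erase(std::next(it), l.end());        // later savepoints are deleted
        return true;                            // DB_SUCCESS
      }
    return false;                               // DB_NO_SAVEPOINT
  }
};

int main() {
  savept_list s;
  s.set("a", 1); s.set("b", 2); s.set("c", 3);
  assert(s.rollback_to("b"));                   // deletes "c"; "a", "b" remain
  assert(s.l.size() == 2);
  assert(!s.rollback_to("c"));                  // DB_NO_SAVEPOINT
}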
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
new file mode 100644
index 00000000..7e4511b8
--- /dev/null
+++ b/storage/innobase/include/trx0rseg.h
@@ -0,0 +1,277 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.h
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0rseg_h
+#define trx0rseg_h
+
+#include "trx0sys.h"
+#include "fut0lst.h"
+
+/** Gets a rollback segment header.
+@param[in] space space where placed
+@param[in] page_no page number of the header
+@param[in,out] mtr mini-transaction
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+buf_block_t*
+trx_rsegf_get(fil_space_t* space, uint32_t page_no, mtr_t* mtr);
+
+/** Gets a newly created rollback segment header.
+@param[in] space space where placed
+@param[in] page_no page number of the header
+@param[in,out] mtr mini-transaction
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+buf_block_t*
+trx_rsegf_get_new(
+ ulint space,
+ uint32_t page_no,
+ mtr_t* mtr);
+
+/** Create a rollback segment header.
+@param[in,out] space system, undo, or temporary tablespace
+@param[in] rseg_id rollback segment identifier
+@param[in,out] sys_header the TRX_SYS page (NULL for temporary rseg)
+@param[in,out] mtr mini-transaction
+@return the created rollback segment
+@retval NULL on failure */
+buf_block_t*
+trx_rseg_header_create(
+ fil_space_t* space,
+ ulint rseg_id,
+ buf_block_t* sys_header,
+ mtr_t* mtr);
+
+/** Initialize or recover the rollback segments at startup. */
+dberr_t trx_rseg_array_init();
+
+/** Free a rollback segment in memory. */
+void
+trx_rseg_mem_free(trx_rseg_t* rseg);
+
+/** Create a persistent rollback segment.
+@param[in] space_id system or undo tablespace id
+@return pointer to new rollback segment
+@retval NULL on failure */
+trx_rseg_t*
+trx_rseg_create(ulint space_id)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Create the temporary rollback segments. */
+void
+trx_temp_rseg_create();
+
+/* Number of undo log slots in a rollback segment file copy */
+#define TRX_RSEG_N_SLOTS (srv_page_size / 16)
+
+/* Maximum number of transactions supported by a single rollback segment */
+#define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2)
+
+/** The rollback segment memory object */
+struct trx_rseg_t {
+ /*--------------------------------------------------------*/
+ /** rollback segment id == the index of its slot in the trx
+ system file copy */
+ ulint id;
+
+ /** mutex protecting the fields in this struct except id,space,page_no
+ which are constant */
+ RsegMutex mutex;
+
+ /** space where the rollback segment header is placed */
+ fil_space_t* space;
+
+ /** page number of the rollback segment header */
+ uint32_t page_no;
+
+ /** current size in pages */
+ uint32_t curr_size;
+
+ /*--------------------------------------------------------*/
+ /* Fields for undo logs */
+ /** List of undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) undo_list;
+
+ /** List of undo log segments cached for fast reuse */
+ UT_LIST_BASE_NODE_T(trx_undo_t) undo_cached;
+
+ /*--------------------------------------------------------*/
+
+ /** Last not yet purged undo log header; FIL_NULL if all purged */
+ uint32_t last_page_no;
+
+ /** trx_t::no | last_offset << 48 */
+ uint64_t last_commit_and_offset;
+
+ /** Whether the log segment needs purge */
+ bool needs_purge;
+
+ /** Reference counter to track rseg allocated transactions. */
+ ulint trx_ref_count;
+
+ /** If true, then skip allocating this rseg, as it resides in an
+ undo tablespace that is marked for truncation. */
+ bool skip_allocation;
+
+ /** @return the commit ID of the last committed transaction */
+ trx_id_t last_trx_no() const
+ { return last_commit_and_offset & ((1ULL << 48) - 1); }
+ /** @return header offset of the last committed transaction */
+ uint16_t last_offset() const
+ { return static_cast<uint16_t>(last_commit_and_offset >> 48); }
+
+ void set_last_commit(uint16_t last_offset, trx_id_t trx_no)
+ {
+ last_commit_and_offset= static_cast<uint64_t>(last_offset) << 48 | trx_no;
+ }
+
+ /** @return whether the rollback segment is persistent */
+ bool is_persistent() const
+ {
+ ut_ad(space == fil_system.temp_space
+ || space == fil_system.sys_space
+ || (srv_undo_space_id_start > 0
+ && space->id >= srv_undo_space_id_start
+ && space->id <= srv_undo_space_id_start
+ + TRX_SYS_MAX_UNDO_SPACES));
+ ut_ad(space == fil_system.temp_space
+ || space == fil_system.sys_space
+ || (srv_undo_space_id_start > 0
+ && space->id >= srv_undo_space_id_start
+ && space->id <= srv_undo_space_id_start
+ + srv_undo_tablespaces_open)
+ || !srv_was_started);
+ return(space->id != SRV_TMP_SPACE_ID);
+ }
+};
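
The last_commit_and_offset packing can be verified in isolation; this sketch mirrors set_last_commit(), last_trx_no() and last_offset() above using plain integers:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t trx_no = 0x0000123456789ABCULL;   // fits in 48 bits
  const uint16_t offset = 0x1234;

  // set_last_commit(offset, trx_no)
  const uint64_t packed = uint64_t(offset) << 48 | trx_no;

  assert((packed & ((1ULL << 48) - 1)) == trx_no); // last_trx_no()
  assert(uint16_t(packed >> 48) == offset);        // last_offset()
  (void)packed;
}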
+
+/* Undo log segment slot in a rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of
+ an undo log segment */
+/*-------------------------------------------------------------*/
+/* Slot size */
+#define TRX_RSEG_SLOT_SIZE 4
+
+/* The offset of the rollback segment header on its page */
+#define TRX_RSEG FSEG_PAGE_DATA
+
+/* Transaction rollback segment header */
+/*-------------------------------------------------------------*/
+/** 0xfffffffe = pre-MariaDB 10.3.5 format; 0=MariaDB 10.3.5 or later */
+#define TRX_RSEG_FORMAT 0
+/** Number of pages in the TRX_RSEG_HISTORY list */
+#define TRX_RSEG_HISTORY_SIZE 4
+/** Committed transaction logs that have not been purged yet */
+#define TRX_RSEG_HISTORY 8
+#define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE)
+ /* Header for the file segment where
+ this page is placed */
+#define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE)
+ /* Undo log segment slots */
+/** Maximum transaction ID (valid only if TRX_RSEG_FORMAT is 0) */
+#define TRX_RSEG_MAX_TRX_ID (TRX_RSEG_UNDO_SLOTS + TRX_RSEG_N_SLOTS \
+ * TRX_RSEG_SLOT_SIZE)
+
+/** 8 bytes offset within the binlog file */
+#define TRX_RSEG_BINLOG_OFFSET TRX_RSEG_MAX_TRX_ID + 8
+/** MySQL log file name, 512 bytes, including terminating NUL
+(valid only if TRX_RSEG_FORMAT is 0).
+If no binlog information is present, the first byte is NUL. */
+#define TRX_RSEG_BINLOG_NAME TRX_RSEG_MAX_TRX_ID + 16
+/** Maximum length of binlog file name, including terminating NUL, in bytes */
+#define TRX_RSEG_BINLOG_NAME_LEN 512
+
+#ifdef WITH_WSREP
+/** The offset to WSREP XID headers */
+#define TRX_RSEG_WSREP_XID_INFO TRX_RSEG_MAX_TRX_ID + 16 + 512
+
+/** WSREP XID format (1 if present and valid, 0 if not present) */
+#define TRX_RSEG_WSREP_XID_FORMAT TRX_RSEG_WSREP_XID_INFO
+/** WSREP XID GTRID length */
+#define TRX_RSEG_WSREP_XID_GTRID_LEN TRX_RSEG_WSREP_XID_INFO + 4
+/** WSREP XID bqual length */
+#define TRX_RSEG_WSREP_XID_BQUAL_LEN TRX_RSEG_WSREP_XID_INFO + 8
+/** WSREP XID data (XIDDATASIZE bytes) */
+#define TRX_RSEG_WSREP_XID_DATA TRX_RSEG_WSREP_XID_INFO + 12
+#endif /* WITH_WSREP*/
+
+/*-------------------------------------------------------------*/
+
+/** Read the page number of an undo log slot.
+@param[in] rseg_header rollback segment header
+@param[in] n slot number */
+inline uint32_t trx_rsegf_get_nth_undo(const buf_block_t *rseg_header, ulint n)
+{
+ ut_ad(n < TRX_RSEG_N_SLOTS);
+ return mach_read_from_4(TRX_RSEG + TRX_RSEG_UNDO_SLOTS +
+ n * TRX_RSEG_SLOT_SIZE + rseg_header->frame);
+}
+
+#ifdef WITH_WSREP
+/** Update the WSREP XID information in rollback segment header.
+@param[in,out] rseg_header rollback segment header
+@param[in] xid WSREP XID
+@param[in,out] mtr mini-transaction */
+void
+trx_rseg_update_wsrep_checkpoint(
+ buf_block_t* rseg_header,
+ const XID* xid,
+ mtr_t* mtr);
+
+/** Update WSREP checkpoint XID in first rollback segment header
+as part of wsrep_set_SE_checkpoint() when it is guaranteed that there
+are no wsrep transactions committing.
+If the UUID part of the WSREP XID does not match to the UUIDs of XIDs already
+stored into rollback segments, the WSREP XID in all the remaining rollback
+segments will be reset.
+@param[in] xid WSREP XID */
+void trx_rseg_update_wsrep_checkpoint(const XID* xid);
+
+/** Recover the latest WSREP checkpoint XID.
+@param[out] xid WSREP XID
+@return whether the WSREP XID was found */
+bool trx_rseg_read_wsrep_checkpoint(XID& xid);
+#endif /* WITH_WSREP */
+
+/** Upgrade a rollback segment header page to MariaDB 10.3 format.
+@param[in,out] rseg_header rollback segment header page
+@param[in,out] mtr mini-transaction */
+void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr);
+
+/** Update the offset information about the end of the binlog entry
+which corresponds to the transaction just being committed.
+In a replication slave, this updates the master binlog position
+up to which replication has proceeded.
+@param[in,out] rseg_header rollback segment header
+@param[in] trx committing transaction
+@param[in,out] mtr mini-transaction */
+void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx,
+ mtr_t *mtr);
+
+#include "trx0rseg.ic"
+
+#endif
diff --git a/storage/innobase/include/trx0rseg.ic b/storage/innobase/include/trx0rseg.ic
new file mode 100644
index 00000000..b293d9f1
--- /dev/null
+++ b/storage/innobase/include/trx0rseg.ic
@@ -0,0 +1,72 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.ic
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "srv0srv.h"
+#include "mtr0log.h"
+
+/** Gets a rollback segment header.
+@param[in] space space where placed
+@param[in] page_no page number of the header
+@param[in,out] mtr mini-transaction
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+buf_block_t*
+trx_rsegf_get(fil_space_t* space, uint32_t page_no, mtr_t* mtr)
+{
+ ut_ad(space == fil_system.sys_space || space == fil_system.temp_space
+ || srv_is_undo_tablespace(space->id)
+ || !srv_was_started);
+
+ buf_block_t* block = buf_page_get(page_id_t(space->id, page_no),
+ 0, RW_X_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER);
+ return block;
+}
+
+/** Gets a newly created rollback segment header.
+@param[in] space space where placed
+@param[in] page_no page number of the header
+@param[in,out] mtr mini-transaction
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+buf_block_t*
+trx_rsegf_get_new(
+ ulint space,
+ uint32_t page_no,
+ mtr_t* mtr)
+{
+ buf_block_t* block;
+
+ ut_ad(space <= srv_undo_tablespaces_active || space == SRV_TMP_SPACE_ID
+ || !srv_was_started);
+ ut_ad(space <= TRX_SYS_MAX_UNDO_SPACES || space == SRV_TMP_SPACE_ID);
+
+ block = buf_page_get(page_id_t(space, page_no), 0, RW_X_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
+ return block;
+}
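
The slot read performed by trx_rsegf_get_nth_undo() reduces to a big-endian 32-bit load at a computed offset. A standalone model on a fake page frame; the constants 38 (FSEG_PAGE_DATA), 16 (FLST_BASE_NODE_SIZE) and 10 (FSEG_HEADER_SIZE) are assumed values, quoted for illustration:

#include <cassert>
#include <cstdint>

// Big-endian 32-bit read, modelling mach_read_from_4().
static uint32_t read_u32_be(const uint8_t* p) {
  return uint32_t(p[0]) << 24 | uint32_t(p[1]) << 16
       | uint32_t(p[2]) << 8 | uint32_t(p[3]);
}

int main() {
  // Offsets mirror trx0rseg.h for a 16KiB page: TRX_RSEG == 38 assumes
  // FSEG_PAGE_DATA == 38, and 34 == 8 + FLST_BASE_NODE_SIZE (16)
  // + FSEG_HEADER_SIZE (10).
  const unsigned TRX_RSEG = 38;
  const unsigned TRX_RSEG_UNDO_SLOTS = 34;
  const unsigned TRX_RSEG_SLOT_SIZE = 4;

  uint8_t frame[16384] = {0};
  const unsigned n = 7;
  const unsigned slot = TRX_RSEG + TRX_RSEG_UNDO_SLOTS
      + n * TRX_RSEG_SLOT_SIZE;
  frame[slot + 3] = 42;                         // undo page number 42

  // Same read as trx_rsegf_get_nth_undo(rseg_header, 7).
  assert(read_u32_be(frame + slot) == 42);
}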
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
new file mode 100644
index 00000000..424e4447
--- /dev/null
+++ b/storage/innobase/include/trx0sys.h
@@ -0,0 +1,1235 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0sys.h
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0sys_h
+#define trx0sys_h
+
+#include "buf0buf.h"
+#include "fil0fil.h"
+#include "trx0types.h"
+#include "mem0mem.h"
+#include "mtr0mtr.h"
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "read0types.h"
+#include "page0types.h"
+#include "ut0mutex.h"
+#include "trx0trx.h"
+#ifdef WITH_WSREP
+#include "trx0xa.h"
+#endif /* WITH_WSREP */
+#include "ilist.h"
+
+/** Checks if a page address is the trx sys header page.
+@param[in] page_id page id
+@return true if trx sys header page */
+inline bool trx_sys_hdr_page(const page_id_t page_id)
+{
+ return page_id == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO);
+}
+
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+void
+trx_sys_create_sys_pages(void);
+/*==========================*/
+/** Find an available rollback segment.
+@param[in] sys_header TRX_SYS header page
+@return an unallocated rollback segment slot in the TRX_SYS header
+@retval ULINT_UNDEFINED if not found */
+ulint
+trx_sys_rseg_find_free(const buf_block_t* sys_header);
+/** Request the TRX_SYS page.
+@param[in] rw whether to lock the page for writing
+@return the TRX_SYS page
+@retval NULL if the page cannot be read */
+inline buf_block_t *trx_sysf_get(mtr_t* mtr, bool rw= true)
+{
+ buf_block_t* block = buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+ 0, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
+ ut_d(if (block) buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);)
+ return block;
+}
+
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
+extern uint trx_rseg_n_slots_debug;
+#endif
+
+/** Write DB_TRX_ID.
+@param[out] db_trx_id the DB_TRX_ID field to be written to
+@param[in] id transaction ID */
+UNIV_INLINE
+void
+trx_write_trx_id(byte* db_trx_id, trx_id_t id)
+{
+ compile_time_assert(DATA_TRX_ID_LEN == 6);
+ mach_write_to_6(db_trx_id, id);
+}
+
+/** Read a transaction identifier.
+@return id */
+inline
+trx_id_t
+trx_read_trx_id(const byte* ptr)
+{
+ compile_time_assert(DATA_TRX_ID_LEN == 6);
+ return(mach_read_from_6(ptr));
+}
+
+#ifdef UNIV_DEBUG
+/** Check that the DB_TRX_ID in a record is valid.
+@param[in] db_trx_id the DB_TRX_ID column to validate
+@param[in] trx_id the id of the ALTER TABLE transaction */
+inline bool trx_id_check(const void* db_trx_id, trx_id_t trx_id)
+{
+ trx_id_t id = trx_read_trx_id(static_cast<const byte*>(db_trx_id));
+ ut_ad(id == 0 || id > trx_id);
+ return true;
+}
+#endif
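
trx_write_trx_id() and trx_read_trx_id() store DB_TRX_ID as a 6-byte big-endian integer. A round-trip sketch with write_6/read_6 as stand-ins for mach_write_to_6()/mach_read_from_6():

#include <cassert>
#include <cstdint>

// Stand-ins for mach_write_to_6()/mach_read_from_6(): big-endian, 48 bits.
static void write_6(uint8_t* b, uint64_t id) {
  for (int i = 5; i >= 0; i--) { b[i] = uint8_t(id); id >>= 8; }
}
static uint64_t read_6(const uint8_t* b) {
  uint64_t id = 0;
  for (int i = 0; i < 6; i++) id = id << 8 | b[i];
  return id;
}

int main() {
  uint8_t db_trx_id[6];                         // DATA_TRX_ID_LEN == 6
  const uint64_t id = 0xABCDEF012345ULL;        // any 48-bit value round-trips
  write_6(db_trx_id, id);
  assert(read_6(db_trx_id) == id);
}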
+
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave updates the latest master binlog position up to which
+replication has proceeded. */
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+ const char* file_name,/*!< in: MySQL log file name */
+ int64_t offset, /*!< in: position in that log file */
+ buf_block_t* sys_header, /*!< in,out: trx sys header */
+ mtr_t* mtr); /*!< in,out: mini-transaction */
+/** Display the MySQL binlog offset info if it is present in the trx
+system header. */
+void
+trx_sys_print_mysql_binlog_offset();
+
+/** Create the rollback segments.
+@return whether the creation succeeded */
+bool
+trx_sys_create_rsegs();
+
+/** The automatically created system rollback segment has this id */
+#define TRX_SYS_SYSTEM_RSEG_ID 0
+
+/** The offset of the transaction system header on the page */
+#define TRX_SYS FSEG_PAGE_DATA
+
+/** Transaction system header */
+/*------------------------------------------------------------- @{ */
+/** In old versions of InnoDB, this persisted the value of
+trx_sys.get_max_trx_id(). Starting with MariaDB 10.3.5,
+the field TRX_RSEG_MAX_TRX_ID in rollback segment header pages
+and the fields TRX_UNDO_TRX_ID, TRX_UNDO_TRX_NO in undo log pages
+are used instead. The field only exists for the purpose of upgrading
+from older MySQL or MariaDB versions. */
+#define TRX_SYS_TRX_ID_STORE 0
+#define TRX_SYS_FSEG_HEADER 8 /*!< segment header for the
+ tablespace segment the trx
+ system is created into */
+#define TRX_SYS_RSEGS (8 + FSEG_HEADER_SIZE)
+ /*!< the start of the array of
+ rollback segment specification
+ slots */
+/*------------------------------------------------------------- @} */
+
+/** The number of rollback segments; rollback segment id must fit in
+the 7 bits reserved for it in DB_ROLL_PTR. */
+#define TRX_SYS_N_RSEGS 128
+/** Maximum number of undo tablespaces (not counting the system tablespace) */
+#define TRX_SYS_MAX_UNDO_SPACES (TRX_SYS_N_RSEGS - 1)
+
+/* Rollback segment specification slot offsets */
+
+/** the tablespace ID of an undo log header; starting with
+MySQL/InnoDB 5.1.7, this is FIL_NULL if the slot is unused */
+#define TRX_SYS_RSEG_SPACE 0
+/** the page number of an undo log header, or FIL_NULL if unused */
+#define TRX_SYS_RSEG_PAGE_NO 4
+/** Size of a rollback segment specification slot */
+#define TRX_SYS_RSEG_SLOT_SIZE 8
+
+/** Read the tablespace ID of a rollback segment slot.
+@param[in] sys_header TRX_SYS page
+@param[in] rseg_id rollback segment identifier
+@return undo tablespace id */
+inline
+uint32_t
+trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id)
+{
+ ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+ return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
+ + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+ + sys_header->frame);
+}
+
+/** Read the page number of a rollback segment slot.
+@param[in] sys_header TRX_SYS page
+@param[in] rseg_id rollback segment identifier
+@return undo page number */
+inline uint32_t
+trx_sysf_rseg_get_page_no(const buf_block_t *sys_header, ulint rseg_id)
+{
+ ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+ return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE +
+ sys_header->frame);
+}
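
Both readers above locate a slot at TRX_SYS + TRX_SYS_RSEGS + rseg_id * TRX_SYS_RSEG_SLOT_SIZE (plus 4 for the page number field). A sketch that checks this against the memory map further below, assuming FSEG_PAGE_DATA == 38 and FSEG_HEADER_SIZE == 10:

#include <cassert>

int main() {
  const unsigned TRX_SYS = 38;                  // FSEG_PAGE_DATA (assumption)
  const unsigned TRX_SYS_RSEGS = 8 + 10;        // 8 + FSEG_HEADER_SIZE
  const unsigned TRX_SYS_RSEG_SLOT_SIZE = 8;

  // Matches the memory map below: slot 0 occupies bytes 56..63,
  // slot 1 occupies bytes 64..71.
  assert(TRX_SYS + TRX_SYS_RSEGS + 0 * TRX_SYS_RSEG_SLOT_SIZE == 56);
  assert(TRX_SYS + TRX_SYS_RSEGS + 1 * TRX_SYS_RSEG_SLOT_SIZE == 64);
}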
+
+/** Maximum length of MySQL binlog file name, in bytes.
+(Used before MariaDB 10.3.5.) */
+#define TRX_SYS_MYSQL_LOG_NAME_LEN 512
+/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
+#define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344
+
+#if UNIV_PAGE_SIZE_MIN < 4096
+# error "UNIV_PAGE_SIZE_MIN < 4096"
+#endif
+/** The offset of the MySQL binlog offset info in the trx system header */
+#define TRX_SYS_MYSQL_LOG_INFO (srv_page_size - 1000)
+#define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /*!< magic number which is
+ TRX_SYS_MYSQL_LOG_MAGIC_N
+ if we have valid data in the
+ MySQL binlog info */
+#define TRX_SYS_MYSQL_LOG_OFFSET 4 /*!< the 64-bit offset
+ within that file */
+#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */
+
+/** Memory map TRX_SYS_PAGE_NO = 5 when srv_page_size = 4096
+
+0...37 FIL_HEADER
+38...45 TRX_SYS_TRX_ID_STORE
+46...55 TRX_SYS_FSEG_HEADER (FSEG_HEADER_SIZE == 10)
+56 TRX_SYS_RSEGS
+ 56...59 TRX_SYS_RSEG_SPACE for slot 0
+ 60...63 TRX_SYS_RSEG_PAGE_NO for slot 0
+ 64...67 TRX_SYS_RSEG_SPACE for slot 1
+ 68...71 TRX_SYS_RSEG_PAGE_NO for slot 1
+....
+ 594..597 TRX_SYS_RSEG_SPACE for slot 72
+ 598..601 TRX_SYS_RSEG_PAGE_NO for slot 72
+...
+ ...1063 TRX_SYS_RSEG_PAGE_NO for slot 126
+
+(srv_page_size-3500 WSREP ::: FAIL would overwrite undo tablespace
+space_id, page_no pairs :::)
+596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD
+600 TRX_SYS_WSREP_XID_FORMAT
+604 TRX_SYS_WSREP_XID_GTRID_LEN
+608 TRX_SYS_WSREP_XID_BQUAL_LEN
+612 TRX_SYS_WSREP_XID_DATA (len = 128)
+739 TRX_SYS_WSREP_XID_DATA_END
+
+FIXED WSREP XID info offsets for 4k page size 10.0.32-galera
+(srv_page_size-2500)
+1596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD
+1600 TRX_SYS_WSREP_XID_FORMAT
+1604 TRX_SYS_WSREP_XID_GTRID_LEN
+1608 TRX_SYS_WSREP_XID_BQUAL_LEN
+1612 TRX_SYS_WSREP_XID_DATA (len = 128)
+1739 TRX_SYS_WSREP_XID_DATA_END
+
+(srv_page_size - 2000 MYSQL MASTER LOG)
+2096 TRX_SYS_MYSQL_MASTER_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
+2100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH
+2104 TRX_SYS_MYSQL_LOG_OFFSET_LOW
+2108 TRX_SYS_MYSQL_LOG_NAME
+
+(srv_page_size - 1000 MYSQL LOG)
+3096 TRX_SYS_MYSQL_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
+3100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH
+3104 TRX_SYS_MYSQL_LOG_OFFSET_LOW
+3108 TRX_SYS_MYSQL_LOG_NAME
+
+(srv_page_size - 200 DOUBLEWRITE)
+3896 TRX_SYS_DOUBLEWRITE TRX_SYS_DOUBLEWRITE_FSEG
+3906 TRX_SYS_DOUBLEWRITE_MAGIC
+3910 TRX_SYS_DOUBLEWRITE_BLOCK1
+3914 TRX_SYS_DOUBLEWRITE_BLOCK2
+3918 TRX_SYS_DOUBLEWRITE_REPEAT
+3930 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N
+
+(srv_page_size - 8, TAILER)
+4088..4096 FIL_TAILER
+
+*/
+#ifdef WITH_WSREP
+/** The offset to WSREP XID headers (used before MariaDB 10.3.5) */
+#define TRX_SYS_WSREP_XID_INFO std::max(srv_page_size - 3500, 1596UL)
+#define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0
+#define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265
+
+/** XID field: formatID, gtrid_len, bqual_len, xid_data */
+#define TRX_SYS_WSREP_XID_LEN (4 + 4 + 4 + XIDDATASIZE)
+#define TRX_SYS_WSREP_XID_FORMAT 4
+#define TRX_SYS_WSREP_XID_GTRID_LEN 8
+#define TRX_SYS_WSREP_XID_BQUAL_LEN 12
+#define TRX_SYS_WSREP_XID_DATA 16
+#endif /* WITH_WSREP*/
+
+/** Doublewrite buffer */
+/* @{ */
+/** The offset of the doublewrite buffer header on the trx system header page */
+#define TRX_SYS_DOUBLEWRITE (srv_page_size - 200)
+/*-------------------------------------------------------------*/
+#define TRX_SYS_DOUBLEWRITE_FSEG 0 /*!< fseg header of the fseg
+ containing the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE
+ /*!< 4-byte magic number which
+ shows if we already have
+ created the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE)
+ /*!< page number of the
+ first page in the first
+ sequence of 64
+ (= FSP_EXTENT_SIZE) consecutive
+ pages in the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE)
+ /*!< page number of the
+ first page in the second
+ sequence of 64 consecutive
+ pages in the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_REPEAT 12 /*!< we repeat
+ TRX_SYS_DOUBLEWRITE_MAGIC,
+ TRX_SYS_DOUBLEWRITE_BLOCK1,
+ TRX_SYS_DOUBLEWRITE_BLOCK2
+ so that if the trx sys
+ header is half-written
+ to disk, we still may
+ be able to recover the
+ information */
+/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+we must reset the doublewrite buffer, because starting from 4.1.x the
+space id of a data page is stored into
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)
+
+/*-------------------------------------------------------------*/
+/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */
+constexpr uint32_t TRX_SYS_DOUBLEWRITE_MAGIC_N= 536853855;
+/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
+constexpr uint32_t TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N= 1783657386;
+/* @} */
+
+trx_t* current_trx();
+
+struct rw_trx_hash_element_t
+{
+ rw_trx_hash_element_t(): trx(0)
+ {
+ mutex_create(LATCH_ID_RW_TRX_HASH_ELEMENT, &mutex);
+ }
+
+
+ ~rw_trx_hash_element_t()
+ {
+ mutex_free(&mutex);
+ }
+
+
+ trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */
+
+ /**
+ Transaction serialization number.
+
+ Assigned shortly before the transaction is moved to COMMITTED_IN_MEMORY
+ state. Initially set to TRX_ID_MAX.
+ */
+ Atomic_counter<trx_id_t> no;
+ trx_t *trx;
+ ib_mutex_t mutex;
+};
+
+
+/**
+ Wrapper around LF_HASH to store the set of in-memory read-write transactions.
+*/
+
+class rw_trx_hash_t
+{
+ LF_HASH hash;
+
+
+ template <typename T>
+ using walk_action= my_bool(rw_trx_hash_element_t *element, T *action);
+
+
+ /**
+ Constructor callback for lock-free allocator.
+
+ The object has just been allocated and is not yet accessible via rw_trx_hash
+ by concurrent threads. An object can be reused multiple times before it is
+ freed; every time an object is reused, the initializer() callback is called.
+ */
+
+ static void rw_trx_hash_constructor(uchar *arg)
+ {
+ new(arg + LF_HASH_OVERHEAD) rw_trx_hash_element_t();
+ }
+
+
+ /**
+ Destructor callback for lock-free allocator.
+
+ Object is about to be freed and is not accessible via rw_trx_hash by
+ concurrent threads.
+ */
+
+ static void rw_trx_hash_destructor(uchar *arg)
+ {
+ reinterpret_cast<rw_trx_hash_element_t*>
+ (arg + LF_HASH_OVERHEAD)->~rw_trx_hash_element_t();
+ }
+
+
+ /**
+ Destructor callback for lock-free allocator.
+
+ This destructor is used at shutdown. It frees remaining transaction
+ objects.
+
+ XA PREPARED transactions may remain if they haven't been committed or
+ rolled back. ACTIVE transactions may remain if startup was interrupted or
+ server is running in read-only mode or for certain srv_force_recovery
+ levels.
+ */
+
+ static void rw_trx_hash_shutdown_destructor(uchar *arg)
+ {
+ rw_trx_hash_element_t *element=
+ reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD);
+ if (trx_t *trx= element->trx)
+ {
+ ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
+ (trx_state_eq(trx, TRX_STATE_ACTIVE) &&
+ (!srv_was_started ||
+ srv_read_only_mode ||
+ srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO)));
+ trx_free_at_shutdown(trx);
+ }
+ element->~rw_trx_hash_element_t();
+ }
+
+
+ /**
+ Initializer callback for lock-free hash.
+
+ The object is not yet accessible via rw_trx_hash by concurrent threads, but
+ is about to become so. The object id can be changed only by this callback
+ and remains the same until all pins to this object are released.
+
+ The trx pointer can be set to 0 by erase() under object mutex protection,
+ which indicates that the object is about to be removed from the lock-free
+ hash and become inaccessible to concurrent threads.
+ */
+
+ static void rw_trx_hash_initializer(LF_HASH *,
+ rw_trx_hash_element_t *element,
+ trx_t *trx)
+ {
+ ut_ad(element->trx == 0);
+ element->trx= trx;
+ element->id= trx->id;
+ element->no= TRX_ID_MAX;
+ trx->rw_trx_hash_element= element;
+ }
+
+
+ /**
+ Gets LF_HASH pins.
+
+ Pins are used to protect an object from being destroyed or reused. They are
+ normally stored in the trx object for quick access. If the caller doesn't
+ have a trx available, we try to get one using current_trx(). If there is no
+ trx at all, temporary pins are allocated.
+ */
+
+ LF_PINS *get_pins(trx_t *trx)
+ {
+ if (!trx->rw_trx_hash_pins)
+ {
+ trx->rw_trx_hash_pins= lf_hash_get_pins(&hash);
+ ut_a(trx->rw_trx_hash_pins);
+ }
+ return trx->rw_trx_hash_pins;
+ }
+
+
+ template <typename T> struct eliminate_duplicates_arg
+ {
+ trx_ids_t ids;
+ walk_action<T> *action;
+ T *argument;
+ eliminate_duplicates_arg(size_t size, walk_action<T> *act, T *arg):
+ action(act), argument(arg) { ids.reserve(size); }
+ };
+
+
+ template <typename T>
+ static my_bool eliminate_duplicates(rw_trx_hash_element_t *element,
+ eliminate_duplicates_arg<T> *arg)
+ {
+ for (trx_ids_t::iterator it= arg->ids.begin(); it != arg->ids.end(); it++)
+ {
+ if (*it == element->id)
+ return 0;
+ }
+ arg->ids.push_back(element->id);
+ return arg->action(element, arg->argument);
+ }
+
+
+#ifdef UNIV_DEBUG
+ static void validate_element(trx_t *trx)
+ {
+ ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg);
+ ut_ad(!trx->is_autocommit_non_locking());
+ /* trx->state can be anything except TRX_STATE_NOT_STARTED */
+ mutex_enter(&trx->mutex);
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+ trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED));
+ mutex_exit(&trx->mutex);
+ }
+
+
+ template <typename T> struct debug_iterator_arg
+ {
+ walk_action<T> *action;
+ T *argument;
+ };
+
+
+ template <typename T>
+ static my_bool debug_iterator(rw_trx_hash_element_t *element,
+ debug_iterator_arg<T> *arg)
+ {
+ mutex_enter(&element->mutex);
+ if (element->trx)
+ validate_element(element->trx);
+ mutex_exit(&element->mutex);
+ return arg->action(element, arg->argument);
+ }
+#endif
+
+
+public:
+ void init()
+ {
+ lf_hash_init(&hash, sizeof(rw_trx_hash_element_t), LF_HASH_UNIQUE, 0,
+ sizeof(trx_id_t), 0, &my_charset_bin);
+ hash.alloc.constructor= rw_trx_hash_constructor;
+ hash.alloc.destructor= rw_trx_hash_destructor;
+ hash.initializer=
+ reinterpret_cast<lf_hash_initializer>(rw_trx_hash_initializer);
+ }
+
+
+ void destroy()
+ {
+ hash.alloc.destructor= rw_trx_hash_shutdown_destructor;
+ lf_hash_destroy(&hash);
+ }
+
+
+ /**
+ Releases LF_HASH pins.
+
+ Must be called by the thread that owns the trx_t object when the latter is
+ being "detached" from the thread (e.g. released to the pool by
+ trx_t::free()). Can be called earlier if the thread is not expected to use
+ rw_trx_hash.
+
+ Since pins may not be transferred to another thread, the initialisation
+ thread calls this for recovered transactions.
+ */
+
+ void put_pins(trx_t *trx)
+ {
+ if (trx->rw_trx_hash_pins)
+ {
+ lf_hash_put_pins(trx->rw_trx_hash_pins);
+ trx->rw_trx_hash_pins= 0;
+ }
+ }
+
+
+ /**
+ Finds trx object in lock-free hash with given id.
+
+ Only ACTIVE or PREPARED trx objects may participate in the hash. Nevertheless
+ the transaction may get committed before this method returns.
+
+ With do_ref_count == false the caller may dereference returned trx pointer
+ only if lock_sys.mutex was acquired before calling find().
+
+ With do_ref_count == true caller may dereference trx even if it is not
+ holding lock_sys.mutex. Caller is responsible for calling
+ trx->release_reference() when it is done playing with trx.
+
+ Ideally this method should get the caller's rw_trx_hash_pins along with the
+ trx object as a parameter, similar to insert() and erase(). However, most
+ callers lose the trx early in their call chains and it is not that easy to
+ pass it through.
+
+ So we take a more expensive approach: get trx through current_thd()->ha_data.
+ Some threads don't have trx attached to THD, and at least server
+ initialisation thread, fts_optimize_thread, srv_master_thread,
+ dict_stats_thread, srv_monitor_thread, btr_defragment_thread don't even
+ have THD at all. For such cases we allocate pins only for duration of
+ search and free them immediately.
+
+ This has a negative performance impact and should be fixed eventually (by
+ passing caller_trx as a parameter). Still, a stream of DML is more or less OK.
+
+ @return
+ @retval 0 not found
+ @retval pointer to trx
+ */
+
+ trx_t *find(trx_t *caller_trx, trx_id_t trx_id, bool do_ref_count)
+ {
+ /*
+ In MariaDB 10.3, purge will reset DB_TRX_ID to 0
+ when the history is lost. Read/write transactions will
+ always have a nonzero trx_t::id; there the value 0 is
+ reserved for transactions that did not write or lock
+ anything yet.
+
+ The caller should already have handled trx_id==0 specially.
+ */
+ ut_ad(trx_id);
+ ut_ad(!caller_trx || caller_trx->id != trx_id || !do_ref_count);
+
+ trx_t *trx= 0;
+ LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
+ ut_a(pins);
+
+ rw_trx_hash_element_t *element= reinterpret_cast<rw_trx_hash_element_t*>
+ (lf_hash_search(&hash, pins, reinterpret_cast<const void*>(&trx_id),
+ sizeof(trx_id_t)));
+ if (element)
+ {
+ mutex_enter(&element->mutex);
+ lf_hash_search_unpin(pins);
+ if ((trx= element->trx)) {
+ DBUG_ASSERT(trx_id == trx->id);
+ ut_d(validate_element(trx));
+ if (do_ref_count)
+ {
+ /*
+ We have an early state check here to avoid committer
+ starvation in a wait loop for transaction references,
+ when there's a stream of trx_sys.find() calls from other
+ threads. The trx->state may change to COMMITTED after
+ trx->mutex is released, and it will have to be rechecked
+ by the caller after reacquiring the mutex.
+ */
+ trx_mutex_enter(trx);
+ const trx_state_t state= trx->state;
+ trx_mutex_exit(trx);
+ if (state == TRX_STATE_COMMITTED_IN_MEMORY)
+ trx= NULL;
+ else
+ trx->reference();
+ }
+ }
+ mutex_exit(&element->mutex);
+ }
+ if (!caller_trx)
+ lf_hash_put_pins(pins);
+ return trx;
+ }
+
+
+ /**
+ Inserts trx to lock-free hash.
+
+ Object becomes accessible via rw_trx_hash.
+ */
+
+ void insert(trx_t *trx)
+ {
+ ut_d(validate_element(trx));
+ int res= lf_hash_insert(&hash, get_pins(trx),
+ reinterpret_cast<void*>(trx));
+ ut_a(res == 0);
+ }
+
+
+ /**
+ Removes trx from lock-free hash.
+
+ The object becomes inaccessible via rw_trx_hash, but it can still be pinned
+ by a concurrent find(), which is supposed to release it immediately after it
+ sees that the trx pointer is 0.
+ */
+
+ void erase(trx_t *trx)
+ {
+ ut_d(validate_element(trx));
+ mutex_enter(&trx->rw_trx_hash_element->mutex);
+ trx->rw_trx_hash_element->trx= 0;
+ mutex_exit(&trx->rw_trx_hash_element->mutex);
+ int res= lf_hash_delete(&hash, get_pins(trx),
+ reinterpret_cast<const void*>(&trx->id),
+ sizeof(trx_id_t));
+ ut_a(res == 0);
+ }
+
+
+ /**
+ Returns the number of elements in the hash.
+
+ The number is exact only if hash is protected against concurrent
+ modifications (e.g. single threaded startup or hash is protected
+ by some mutex). Otherwise the number may be used as a hint only,
+ because it may change even before this method returns.
+ */
+
+ uint32_t size() { return uint32_t(lf_hash_size(&hash)); }
+
+
+ /**
+ Iterates the hash.
+
+ @param caller_trx used to get/set pins
+ @param action called for every element in hash
+ @param argument opaque argument passed to action
+
+ May return the same element multiple times if the hash is under contention.
+ If the caller does not want to see the same transaction multiple times, it
+ has to call iterate_no_dups() instead.
+
+ May return an element with a committed transaction. If the caller does not
+ want to see committed transactions, it has to skip them under the element
+ mutex:
+
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ // trx is protected against commit in this branch
+ }
+ mutex_exit(&element->mutex);
+
+ May miss concurrently inserted transactions.
+
+ @return
+ @retval 0 iteration completed successfully
+ @retval 1 iteration was interrupted (action returned 1)
+ */
+
+ template <typename T>
+ int iterate(trx_t *caller_trx, walk_action<T> *action, T *argument= nullptr)
+ {
+ LF_PINS *pins= caller_trx ? get_pins(caller_trx) : lf_hash_get_pins(&hash);
+ ut_a(pins);
+#ifdef UNIV_DEBUG
+ debug_iterator_arg<T> debug_arg= { action, argument };
+ action= reinterpret_cast<decltype(action)>(debug_iterator<T>);
+ argument= reinterpret_cast<T*>(&debug_arg);
+#endif
+ int res= lf_hash_iterate(&hash, pins,
+ reinterpret_cast<my_hash_walk_action>(action),
+ const_cast<void*>(static_cast<const void*>
+ (argument)));
+ if (!caller_trx)
+ lf_hash_put_pins(pins);
+ return res;
+ }
+
+
+ template <typename T>
+ int iterate(walk_action<T> *action, T *argument= nullptr)
+ {
+ return iterate(current_trx(), action, argument);
+ }
+
+
+ /**
+ Iterates the hash and eliminates duplicate elements.
+
+ @sa iterate()
+ */
+
+ template <typename T>
+ int iterate_no_dups(trx_t *caller_trx, walk_action<T> *action,
+ T *argument= nullptr)
+ {
+ eliminate_duplicates_arg<T> arg(size() + 32, action, argument);
+ return iterate(caller_trx, eliminate_duplicates<T>, &arg);
+ }
+
+
+ template <typename T>
+ int iterate_no_dups(walk_action<T> *action, T *argument= nullptr)
+ {
+ return iterate_no_dups(current_trx(), action, argument);
+ }
+};
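
The caveat in the iterate() documentation, that element->trx must be re-checked under element->mutex before use, can be modelled with standard-library stand-ins (std::mutex for ib_mutex_t, a plain vector for the lock-free hash). A hedged sketch:

#include <cstdint>
#include <cstdio>
#include <mutex>
#include <vector>

struct trx { uint64_t id; };
struct element { uint64_t id; trx* t; std::mutex m; };

// Count transactions that are still registered: an erased element may
// linger in the container with t == nullptr until it is reused or freed.
static int count_live(std::vector<element*>& hash) {
  int live = 0;
  for (element* e : hash) {
    std::lock_guard<std::mutex> g(e->m);      // mutex_enter(&element->mutex)
    if (e->t)                                 // re-check under the mutex
      ++live;                                 // e->t is safe in this scope
  }
  return live;
}

int main() {
  trx a{10}, b{11};
  element e1{10, &a}, e2{11, &b}, e3{12, nullptr};   // e3: already erased
  std::vector<element*> hash{&e1, &e2, &e3};
  std::printf("live=%d\n", count_live(hash));        // prints live=2
}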
+
+class thread_safe_trx_ilist_t
+{
+public:
+ void create() { mutex_create(LATCH_ID_TRX_SYS, &mutex); }
+ void close() { mutex_free(&mutex); }
+
+ bool empty() const
+ {
+ mutex_enter(&mutex);
+ auto result= trx_list.empty();
+ mutex_exit(&mutex);
+ return result;
+ }
+
+ void push_front(trx_t &trx)
+ {
+ mutex_enter(&mutex);
+ trx_list.push_front(trx);
+ mutex_exit(&mutex);
+ }
+
+ void remove(trx_t &trx)
+ {
+ mutex_enter(&mutex);
+ trx_list.remove(trx);
+ mutex_exit(&mutex);
+ }
+
+ template <typename Callable> void for_each(Callable &&callback) const
+ {
+ mutex_enter(&mutex);
+ for (const auto &trx : trx_list)
+ callback(trx);
+ mutex_exit(&mutex);
+ }
+
+ template <typename Callable> void for_each(Callable &&callback)
+ {
+ mutex_enter(&mutex);
+ for (auto &trx : trx_list)
+ callback(trx);
+ mutex_exit(&mutex);
+ }
+
+ void freeze() const { mutex_enter(&mutex); }
+ void unfreeze() const { mutex_exit(&mutex); }
+
+private:
+ alignas(CACHE_LINE_SIZE) mutable TrxSysMutex mutex;
+ alignas(CACHE_LINE_SIZE) ilist<trx_t> trx_list;
+};
+
+/** The transaction system central memory data structure. */
+class trx_sys_t
+{
+ /**
+ The smallest number not yet assigned as a transaction id or transaction
+ number. Accessed and updated with atomic operations.
+ */
+ MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<trx_id_t> m_max_trx_id;
+
+
+ /**
+ Solves race conditions between register_rw() and snapshot_ids() as well as
+ race condition between assign_new_trx_no() and snapshot_ids().
+
+ @sa register_rw()
+ @sa assign_new_trx_no()
+ @sa snapshot_ids()
+ */
+ MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<trx_id_t> m_rw_trx_hash_version;
+
+
+ bool m_initialised;
+
+public:
+ /**
+ TRX_RSEG_HISTORY list length (number of committed transactions to purge)
+ */
+ MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<uint32_t> rseg_history_len;
+
+ /** List of all transactions. */
+ thread_safe_trx_ilist_t trx_list;
+
+ MY_ALIGNED(CACHE_LINE_SIZE)
+ /** Temporary rollback segments */
+ trx_rseg_t* temp_rsegs[TRX_SYS_N_RSEGS];
+
+ MY_ALIGNED(CACHE_LINE_SIZE)
+ trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS];
+ /*!< Pointer array to rollback
+ segments; NULL if slot not in use;
+ created and destroyed in
+ single-threaded mode; not protected
+ by any mutex, because it is read-only
+ during multi-threaded operation */
+
+ /**
+ Lock-free hash of in-memory read-write transactions.
+ Works faster when it is on its own cache line (tested).
+ */
+
+ MY_ALIGNED(CACHE_LINE_SIZE) rw_trx_hash_t rw_trx_hash;
+
+
+#ifdef WITH_WSREP
+ /** Latest recovered XID during startup */
+ XID recovered_wsrep_xid;
+#endif
+ /** Latest recovered binlog offset */
+ uint64_t recovered_binlog_offset;
+ /** Latest recovered binlog file name */
+ char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN];
+ /** FIL_PAGE_LSN of the page with the latest recovered binlog metadata */
+ lsn_t recovered_binlog_lsn;
+
+
+ /**
+ Constructor.
+
+ Some members may require late initialisation, thus we just mark the object
+ as uninitialised. Real initialisation happens in create().
+ */
+
+ trx_sys_t(): m_initialised(false) {}
+
+
+ /**
+ Returns the minimum trx id in rw trx list.
+
+ This is the smallest id for which the transaction can possibly be active.
+ (But you must look at trx->state to find out whether the minimum trx id
+ transaction itself is still active, or already committed.)
+
+ @return the minimum trx id, or m_max_trx_id if the trx list is empty
+ */
+
+ trx_id_t get_min_trx_id()
+ {
+ trx_id_t id= get_max_trx_id();
+ rw_trx_hash.iterate(get_min_trx_id_callback, &id);
+ return id;
+ }
+
+
+ /**
+ Determines the maximum transaction id.
+
+ @return maximum currently allocated trx id; will be stale after the
+ next call to trx_sys.get_new_trx_id()
+ */
+
+ trx_id_t get_max_trx_id()
+ {
+ return m_max_trx_id;
+ }
+
+
+ /**
+ Allocates a new transaction id.
+ @return new, allocated trx id
+ */
+
+ trx_id_t get_new_trx_id()
+ {
+ trx_id_t id= get_new_trx_id_no_refresh();
+ refresh_rw_trx_hash_version();
+ return id;
+ }
+
+
+ /**
+ Allocates and assigns new transaction serialisation number.
+
+ There's a gap between the m_max_trx_id increment and the transaction
+ serialisation number becoming visible through rw_trx_hash. While we're in
+ this gap, a concurrent thread may take an MVCC snapshot without seeing the
+ allocated but not yet assigned serialisation number. Then at some point the
+ purge thread may clone this view. As a result it won't see the newly
+ allocated serialisation number and may remove "unnecessary" history data of
+ this transaction from the rollback segments.
+
+ m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
+ to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
+ means that all transaction serialisation numbers up to m_max_trx_id are
+ available through rw_trx_hash.
+
+ We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
+ that m_rw_trx_hash_version increment happens after
+ trx->rw_trx_hash_element->no becomes visible through rw_trx_hash.
+
+ @param trx transaction
+ */
+ void assign_new_trx_no(trx_t *trx)
+ {
+ trx->rw_trx_hash_element->no= get_new_trx_id_no_refresh();
+ refresh_rw_trx_hash_version();
+ }
+
+
+ /**
+ Takes MVCC snapshot.
+
+ To reduce the probability of malloc, we reserve rw_trx_hash.size() + 32
+ elements in ids.
+
+ For details about get_rw_trx_hash_version() != get_max_trx_id() spin
+ @sa register_rw() and @sa assign_new_trx_no().
+
+ We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so
+ that loading of m_rw_trx_hash_version happens before accessing rw_trx_hash.
+
+ To optimise snapshot creation, rw_trx_hash.iterate() is used instead of
+ rw_trx_hash.iterate_no_dups(). This means that some transaction identifiers
+ may appear multiple times in ids.
+
+ @param[in,out] caller_trx used to get access to rw_trx_hash_pins
+ @param[out] ids array to store registered transaction identifiers
+ @param[out] max_trx_id variable to store m_max_trx_id value
+ @param[out] min_trx_no variable to store min(no) value
+ */
+
+ void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id,
+ trx_id_t *min_trx_no)
+ {
+ snapshot_ids_arg arg(ids);
+
+ while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id())
+ ut_delay(1);
+ arg.m_no= arg.m_id;
+
+ ids->clear();
+ ids->reserve(rw_trx_hash.size() + 32);
+ rw_trx_hash.iterate(caller_trx, copy_one_id, &arg);
+
+ *max_trx_id= arg.m_id;
+ *min_trx_no= arg.m_no;
+ }
+
+
+ /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. */
+ void init_max_trx_id(trx_id_t value)
+ {
+ m_max_trx_id= value;
+ m_rw_trx_hash_version.store(value, std::memory_order_relaxed);
+ }
+
+
+ bool is_initialised() { return m_initialised; }
+
+
+ /** Initialise the transaction subsystem. */
+ void create();
+
+ /** Close the transaction subsystem on shutdown. */
+ void close();
+
+ /** @return total number of active (non-prepared) transactions */
+ ulint any_active_transactions();
+
+
+ /**
+ Registers read-write transaction.
+
+ Transaction becomes visible to MVCC.
+
+ There's a gap between the m_max_trx_id increment and the transaction becoming
+ visible through rw_trx_hash. While we're in this gap, a concurrent thread may
+ take an MVCC snapshot. As a result, a concurrent read view would be able to
+ observe records owned by this transaction even before it is committed.
+
+ m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
+ to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
+ means that all transactions up to m_max_trx_id are available through
+ rw_trx_hash.
+
+ We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
+ that m_rw_trx_hash_version increment happens after transaction becomes
+ visible through rw_trx_hash.
+ */
+
+ void register_rw(trx_t *trx)
+ {
+ trx->id= get_new_trx_id_no_refresh();
+ rw_trx_hash.insert(trx);
+ refresh_rw_trx_hash_version();
+ }
+
+
+ /**
+ Deregisters read-write transaction.
+
+ Transaction is removed from rw_trx_hash, which releases all implicit locks.
+ MVCC snapshot won't see this transaction anymore.
+ */
+
+ void deregister_rw(trx_t *trx)
+ {
+ rw_trx_hash.erase(trx);
+ }
+
+
+ bool is_registered(trx_t *caller_trx, trx_id_t id)
+ {
+ return id && find(caller_trx, id, false);
+ }
+
+
+ trx_t *find(trx_t *caller_trx, trx_id_t id, bool do_ref_count= true)
+ {
+ return rw_trx_hash.find(caller_trx, id, do_ref_count);
+ }
+
+
+ /**
+ Registers transaction in trx_sys.
+
+ @param trx transaction
+ */
+ void register_trx(trx_t *trx)
+ {
+ trx_list.push_front(*trx);
+ }
+
+
+ /**
+ Deregisters transaction in trx_sys.
+
+ @param trx transaction
+ */
+ void deregister_trx(trx_t *trx)
+ {
+ trx_list.remove(*trx);
+ }
+
+
+ /**
+ Clones the oldest view and stores it in view.
+
+ No need to call ReadView::close(). The caller owns the view that is passed
+ in. This function is called by the purge thread to determine whether it
+ should purge a delete-marked record or not.
+ */
+ void clone_oldest_view(ReadViewBase *view) const;
+
+
+ /** @return the number of active views */
+ size_t view_count() const
+ {
+ size_t count= 0;
+
+ trx_list.for_each([&count](const trx_t &trx) {
+ if (trx.read_view.is_open())
+ ++count;
+ });
+
+ return count;
+ }
+
+private:
+ static my_bool get_min_trx_id_callback(rw_trx_hash_element_t *element,
+ trx_id_t *id)
+ {
+ if (element->id < *id)
+ {
+ mutex_enter(&element->mutex);
+ /* We don't care about read-only transactions here. */
+ if (element->trx && element->trx->rsegs.m_redo.rseg)
+ *id= element->id;
+ mutex_exit(&element->mutex);
+ }
+ return 0;
+ }
+
+
+ struct snapshot_ids_arg
+ {
+ snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {}
+ trx_ids_t *m_ids;
+ trx_id_t m_id;
+ trx_id_t m_no;
+ };
+
+
+ static my_bool copy_one_id(rw_trx_hash_element_t *element,
+ snapshot_ids_arg *arg)
+ {
+ if (element->id < arg->m_id)
+ {
+ trx_id_t no= element->no;
+ arg->m_ids->push_back(element->id);
+ if (no < arg->m_no)
+ arg->m_no= no;
+ }
+ return 0;
+ }
+
+
+ /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */
+ trx_id_t get_rw_trx_hash_version()
+ {
+ return m_rw_trx_hash_version.load(std::memory_order_acquire);
+ }
+
+
+ /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. */
+ void refresh_rw_trx_hash_version()
+ {
+ m_rw_trx_hash_version.fetch_add(1, std::memory_order_release);
+ }
+
+
+ /**
+ Allocates new transaction id without refreshing rw_trx_hash version.
+
+ This method is extracted for exclusive use by register_rw() and
+ assign_new_trx_no() where new id must be allocated atomically with
+ payload of these methods from MVCC snapshot point of view.
+
+ @sa get_new_trx_id()
+ @sa assign_new_trx_no()
+
+ @return new transaction id
+ */
+
+ trx_id_t get_new_trx_id_no_refresh()
+ {
+ return m_max_trx_id++;
+ }
+};
+
+
+/** The transaction system */
+extern trx_sys_t trx_sys;
+
+#endif
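
The register_rw()/snapshot_ids() handshake reduces to a release/acquire pairing on m_rw_trx_hash_version. A standalone sketch with plain atomics; all names are illustrative:

#include <atomic>
#include <cassert>
#include <cstdint>
#include <thread>

std::atomic<uint64_t> max_trx_id{10};        // m_max_trx_id
std::atomic<uint64_t> hash_version{10};      // m_rw_trx_hash_version
std::atomic<uint64_t> published_id{0};       // stands in for rw_trx_hash

void register_rw() {
  uint64_t id = max_trx_id.fetch_add(1);                 // allocates id 10
  published_id.store(id, std::memory_order_relaxed);     // rw_trx_hash.insert()
  hash_version.fetch_add(1, std::memory_order_release);  // publish
}

void snapshot() {
  // Wait until the id allocation is observable at all.
  while (max_trx_id.load(std::memory_order_relaxed) != 11) {}
  // The id is allocated, but the hash entry may not be visible yet:
  // spin until the version catches up, exactly like snapshot_ids().
  while (hash_version.load(std::memory_order_acquire) !=
         max_trx_id.load(std::memory_order_relaxed)) {}
  // The acquire load pairs with the release increment, so the insert
  // done before the increment is guaranteed to be visible here.
  assert(published_id.load(std::memory_order_relaxed) == 10);
}

int main() {
  std::thread t1(register_rw), t2(snapshot);
  t1.join(); t2.join();
}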
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
new file mode 100644
index 00000000..09132e7c
--- /dev/null
+++ b/storage/innobase/include/trx0trx.h
@@ -0,0 +1,1126 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.h
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0trx_h
+#define trx0trx_h
+
+#include "trx0types.h"
+#include "lock0types.h"
+#include "que0types.h"
+#include "mem0mem.h"
+#include "trx0xa.h"
+#include "ut0vec.h"
+#include "fts0fts.h"
+#include "read0types.h"
+#include "ilist.h"
+
+#include <vector>
+#include <set>
+
+// Forward declaration
+struct mtr_t;
+struct rw_trx_hash_element_t;
+
+/******************************************************************//**
+Set detailed error message for the transaction. */
+void
+trx_set_detailed_error(
+/*===================*/
+ trx_t* trx, /*!< in: transaction struct */
+ const char* msg); /*!< in: detailed error message */
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction struct */
+ FILE* file); /*!< in: file to read message from */
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+ const trx_t* trx); /*!< in: trx object */
+
+/** @return an allocated transaction */
+trx_t *trx_create();
+
+/** At shutdown, frees a transaction object. */
+void trx_free_at_shutdown(trx_t *trx);
+
+/** Disconnect a prepared transaction from MySQL.
+@param[in,out] trx transaction */
+void trx_disconnect_prepared(trx_t *trx);
+
+/** Initialize (resurrect) transactions at startup. */
+dberr_t trx_lists_init_at_db_start();
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_xa_low(
+/*============================*/
+ trx_t* trx, /*!< in/out: transaction */
+ bool read_write); /*!< in: true if read write transaction */
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_low(
+/*=========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ bool read_write); /*!< in: true if read write transaction */
+
+/*************************************************************//**
+Starts a transaction for internal processing. */
+void
+trx_start_internal_low(
+/*===================*/
+ trx_t* trx); /*!< in/out: transaction */
+
+/** Starts a read-only transaction for internal processing.
+@param[in,out] trx transaction to be started */
+void
+trx_start_internal_read_only_low(
+ trx_t* trx);
+
+#ifdef UNIV_DEBUG
+#define trx_start_if_not_started_xa(t, rw) \
+ do { \
+ (t)->start_line = __LINE__; \
+ (t)->start_file = __FILE__; \
+ trx_start_if_not_started_xa_low((t), rw); \
+ } while (false)
+
+#define trx_start_if_not_started(t, rw) \
+ do { \
+ (t)->start_line = __LINE__; \
+ (t)->start_file = __FILE__; \
+ trx_start_if_not_started_low((t), rw); \
+ } while (false)
+
+#define trx_start_internal(t) \
+ do { \
+ (t)->start_line = __LINE__; \
+ (t)->start_file = __FILE__; \
+ trx_start_internal_low((t)); \
+ } while (false)
+
+#define trx_start_internal_read_only(t) \
+ do { \
+ (t)->start_line = __LINE__; \
+ (t)->start_file = __FILE__; \
+ trx_start_internal_read_only_low(t); \
+ } while (false)
+#else
+#define trx_start_if_not_started(t, rw) \
+ trx_start_if_not_started_low((t), rw)
+
+#define trx_start_internal(t) \
+ trx_start_internal_low((t))
+
+#define trx_start_internal_read_only(t) \
+ trx_start_internal_read_only_low(t)
+
+#define trx_start_if_not_started_xa(t, rw) \
+ trx_start_if_not_started_xa_low((t), (rw))
+#endif /* UNIV_DEBUG */
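+
+/* Usage sketch (illustrative; not in the original source): callers go
+through the macros rather than the _low() functions so that debug builds
+record the call site:
+
+  trx_start_if_not_started(trx, true);  // read-write
+  ut_ad(trx_is_started(trx));
+*/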
+
+/*************************************************************//**
+Starts the transaction for a DDL operation. */
+void
+trx_start_for_ddl_low(
+/*==================*/
+ trx_t* trx, /*!< in/out: transaction */
+ trx_dict_op_t op); /*!< in: dictionary operation type */
+
+#ifdef UNIV_DEBUG
+#define trx_start_for_ddl(t, o) \
+ do { \
+ ut_ad((t)->start_file == 0); \
+ (t)->start_line = __LINE__; \
+ (t)->start_file = __FILE__; \
+ trx_start_for_ddl_low((t), (o)); \
+ } while (0)
+#else
+#define trx_start_for_ddl(t, o) \
+ trx_start_for_ddl_low((t), (o))
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return DB_SUCCESS or error number */
+dberr_t
+trx_commit_for_mysql(
+/*=================*/
+ trx_t* trx); /*!< in/out: transaction */
+/** XA PREPARE a transaction.
+@param[in,out] trx transaction to prepare */
+void trx_prepare_for_mysql(trx_t* trx);
+/**********************************************************************//**
+This function is used to find the number of prepared transactions and
+their transaction objects for recovery.
+@return number of prepared transactions */
+int
+trx_recover_for_mysql(
+/*==================*/
+ XID* xid_list, /*!< in/out: prepared transactions */
+ uint len); /*!< in: number of slots in xid_list */
+/** Look up an X/Open distributed transaction in XA PREPARE state.
+@param[in] xid X/Open XA transaction identifier
+@return transaction on match (the trx_t::xid will be invalidated);
+note that the trx may have been committed before the caller acquires
+trx_t::mutex
+@retval NULL if no match */
+trx_t* trx_get_trx_by_xid(const XID* xid);
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE. */
+void
+trx_commit_complete_for_mysql(
+/*==========================*/
+ trx_t* trx); /*!< in/out: transaction */
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx); /*!< in: trx handle */
+/****************************************************************//**
+Prepares a transaction for commit/rollback. */
+void
+trx_commit_or_rollback_prepare(
+/*===========================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Creates a commit command node struct.
+@return own: commit node struct */
+commit_node_t*
+trx_commit_node_create(
+/*===================*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_commit_step(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/**********************************************************************//**
+Prints info about a transaction. */
+void
+trx_print_low(
+/*==========*/
+ FILE* f,
+ /*!< in: output stream */
+ const trx_t* trx,
+ /*!< in: transaction */
+ ulint max_query_len,
+ /*!< in: max query length to print,
+ or 0 to use the default max length */
+ ulint n_rec_locks,
+ /*!< in: lock_number_of_rows_locked(&trx->lock) */
+ ulint n_trx_locks,
+ /*!< in: length of trx->lock.trx_locks */
+ ulint heap_size);
+ /*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+
+/**********************************************************************//**
+Prints info about a transaction.
+When possible, use trx_print() instead. */
+void
+trx_print_latched(
+/*==============*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len); /*!< in: max query length to print,
+ or 0 to use the default max length */
+
+/**********************************************************************//**
+Prints info about a transaction.
+Acquires and releases lock_sys.mutex. */
+void
+trx_print(
+/*======*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len); /*!< in: max query length to print,
+ or 0 to use the default max length */
+
+/**********************************************************************//**
+Determine if a transaction is a dictionary operation.
+@return dictionary operation mode */
+UNIV_INLINE
+enum trx_dict_op_t
+trx_get_dict_operation(
+/*===================*/
+ const trx_t* trx) /*!< in: transaction */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************************//**
+Flag a transaction as a dictionary operation. */
+UNIV_INLINE
+void
+trx_set_dict_operation(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ enum trx_dict_op_t op); /*!< in: operation, not
+ TRX_DICT_OP_NONE */
+
+/**********************************************************************//**
+Determines if a transaction is in the given state.
+The caller must hold trx->mutex, or it must be the thread
+that is serving a running transaction.
+A running RW transaction must be in trx_sys.rw_trx_hash.
+@return TRUE if trx->state == state */
+UNIV_INLINE
+bool
+trx_state_eq(
+/*=========*/
+ const trx_t* trx, /*!< in: transaction */
+ trx_state_t state, /*!< in: state;
+ if state != TRX_STATE_NOT_STARTED
+ asserts that
+ trx->state != TRX_STATE_NOT_STARTED */
+ bool relaxed = false)
+ /*!< in: whether to allow
+ trx->state == TRX_STATE_NOT_STARTED
+ after an error has been reported */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted.
+@return true if interrupted */
+bool
+trx_is_interrupted(
+/*===============*/
+ const trx_t* trx); /*!< in: transaction */
+
+/*******************************************************************//**
+Calculates the "weight" of a transaction. The weight of one transaction
+is estimated as the number of altered rows + the number of locked rows.
+@param t transaction
+@return transaction weight */
+#define TRX_WEIGHT(t) ((t)->undo_no + UT_LIST_GET_LEN((t)->lock.trx_locks))
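+
+/* Worked example (illustrative): a transaction that has generated 10 undo
+log records (undo_no == 10) and holds 3 lock structs has TRX_WEIGHT == 13;
+deadlock resolution prefers to roll back the transaction with the smaller
+weight. */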
+
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. Transactions that
+have edited non-transactional tables are considered heavier than ones
+that have not.
+@return true if weight(a) >= weight(b) */
+bool
+trx_weight_ge(
+/*==========*/
+ const trx_t* a, /*!< in: the transaction to be compared */
+ const trx_t* b); /*!< in: the transaction to be compared */
+/* Maximum length of a string that can be returned by
+trx_get_que_state_str(). */
+#define TRX_QUE_STATE_STR_MAX_LEN 12 /* "ROLLING BACK" */
+
+/*******************************************************************//**
+Retrieves transaction's que state in a human readable string. The string
+should not be free()'d or modified.
+@return string in the data segment */
+UNIV_INLINE
+const char*
+trx_get_que_state_str(
+/*==================*/
+ const trx_t* trx); /*!< in: transaction */
+
+/** Retrieves the transaction ID.
+At any given point in time it is guaranteed that the IDs of the running
+transactions are unique. The values returned by this function for read-only
+transactions may be reused, so a subsequent RO transaction may get the same ID
+as a RO transaction that existed in the past. The values returned by this
+function should be used for printing purposes only.
+@param[in] trx transaction whose id to retrieve
+@return transaction id */
+UNIV_INLINE
+trx_id_t
+trx_get_id_for_print(
+ const trx_t* trx);
+
+/** Create the trx_t pool */
+void
+trx_pool_init();
+
+/** Destroy the trx_t pool */
+void
+trx_pool_close();
+
+/**
+Set the transaction as a read-write transaction if it is not already
+tagged as such.
+@param[in,out] trx Transaction that needs to be "upgraded" to RW from RO */
+void
+trx_set_rw_mode(
+ trx_t* trx);
+
+/**
+Transactions that aren't started by the MySQL server don't set
+the trx_t::mysql_thd field. For such transactions we set the lock
+wait timeout to 0 instead of the user-configured value that comes
+from innodb_lock_wait_timeout via trx_t::mysql_thd.
+@param trx transaction
+@return lock wait timeout in seconds */
+#define trx_lock_wait_timeout_get(t) \
+ ((t)->mysql_thd != NULL \
+ ? thd_lock_wait_timeout((t)->mysql_thd) \
+ : 0)
+
+typedef std::vector<ib_lock_t*, ut_allocator<ib_lock_t*> > lock_list;
+
+/*******************************************************************//**
+Latching protocol for trx_lock_t::que_state. trx_lock_t::que_state
+captures the state of the query thread during the execution of a query.
+This is different from a transaction state. The query state of a transaction
+can be updated asynchronously by other threads. The other threads can be
+system threads, like the timeout monitor thread or user threads executing
+other queries. Another thing to be mindful of is that there is a delay
+between when a query thread is put into the LOCK_WAIT state and when it
+actually starts waiting. Between these two events the query thread may be
+granted the lock it was waiting for, which implies that the state can
+change asynchronously.
+
+All these operations take place within the context of locking. Therefore,
+state changes within the locking code must acquire both the lock mutex
+and the trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT
+or trx->lock.wait_lock to non-NULL, but when the lock wait ends it is
+sufficient to acquire only the trx->mutex.
+To query the state either of the mutexes is sufficient within the locking
+code and no mutex is required when the query thread is no longer waiting. */
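+
+/* Sketch of the protocol above (illustrative; lock_mutex_enter() and
+lock_mutex_exit() are assumed wrappers around lock_sys.mutex):
+
+  lock_mutex_enter();
+  trx_mutex_enter(trx);
+  trx->lock.que_state = TRX_QUE_LOCK_WAIT;
+  trx->lock.wait_lock = lock;
+  trx_mutex_exit(trx);
+  lock_mutex_exit();
+
+When the wait ends, holding trx->mutex alone suffices for the reset. */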
+
+/** The locks and state of an active transaction. Protected by
+lock_sys.mutex, trx->mutex or both. */
+struct trx_lock_t {
+#ifdef UNIV_DEBUG
+ /** number of active query threads; at most 1, except for the
+ dummy transaction in trx_purge() */
+ ulint n_active_thrs;
+#endif
+ trx_que_t que_state; /*!< valid when trx->state
+ == TRX_STATE_ACTIVE: TRX_QUE_RUNNING,
+ TRX_QUE_LOCK_WAIT, ... */
+
+ lock_t* wait_lock; /*!< if trx execution state is
+ TRX_QUE_LOCK_WAIT, this points to
+ the lock request, otherwise this is
+ NULL; set to non-NULL when holding
+ both trx->mutex and lock_sys.mutex;
+ set to NULL when holding
+ lock_sys.mutex; readers should
+ hold lock_sys.mutex, except when
+ they are holding trx->mutex and
+ wait_lock==NULL */
+ ib_uint64_t deadlock_mark; /*!< A mark field that is initialized
+ to and checked against lock_mark_counter
+ by lock_deadlock_recursive(). */
+ bool was_chosen_as_deadlock_victim;
+ /*!< when the transaction decides to
+ wait for a lock, it sets this to false;
+ if another transaction chooses this
+ transaction as a victim in deadlock
+ resolution, it sets this to true.
+ Protected by trx->mutex. */
+ time_t wait_started; /*!< lock wait started at this time,
+ protected only by lock_sys.mutex */
+
+ que_thr_t* wait_thr; /*!< query thread belonging to this
+ trx that is in QUE_THR_LOCK_WAIT
+ state. For threads suspended in a
+ lock wait, this is protected by
+ lock_sys.mutex. Otherwise, this may
+ only be modified by the thread that is
+ serving the running transaction. */
+#ifdef WITH_WSREP
+ bool was_chosen_as_wsrep_victim;
+ /*!< high priority wsrep thread has
+ marked this trx to abort */
+#endif /* WITH_WSREP */
+
+ /** Pre-allocated record locks */
+ struct {
+   ib_lock_t lock;
+   byte pad[256];
+ } rec_pool[8];
+
+ /** Pre-allocated table locks */
+ ib_lock_t table_pool[8];
+
+ /** Next available rec_pool[] entry */
+ unsigned rec_cached;
+
+ /** Next available table_pool[] entry */
+ unsigned table_cached;
+
+ mem_heap_t* lock_heap; /*!< memory heap for trx_locks;
+ protected by lock_sys.mutex */
+
+ trx_lock_list_t trx_locks; /*!< locks requested by the transaction;
+ insertions are protected by trx->mutex
+ and lock_sys.mutex; removals are
+ protected by lock_sys.mutex */
+
+ lock_list table_locks; /*!< All table locks requested by this
+ transaction, including AUTOINC locks */
+
+ /** List of pending trx_t::evict_table() */
+ UT_LIST_BASE_NODE_T(dict_table_t) evicted_tables;
+
+ bool cancel; /*!< true if the transaction is being
+ rolled back either via deadlock
+ detection or due to lock timeout. The
+ caller has to acquire the trx_t::mutex
+ in order to cancel the locks. In
+ lock_trx_table_locks_remove() we
+ check for this cancel of a transaction's
+ locks and avoid reacquiring the trx
+ mutex to prevent recursive deadlocks.
+ Protected by both the lock sys mutex
+ and the trx_t::mutex. */
+ ulint n_rec_locks; /*!< number of rec locks in this trx */
+};
+
+/** Logical first modification time of a table in a transaction */
+class trx_mod_table_time_t
+{
+ /** First modification of the table */
+ undo_no_t first;
+ /** First modification of a system versioned column */
+ undo_no_t first_versioned;
+
+ /** Magic value signifying that a system versioned column of a
+ table was never modified in a transaction. */
+ static const undo_no_t UNVERSIONED = IB_ID_MAX;
+
+public:
+ /** Constructor
+ @param[in] rows number of modified rows so far */
+ trx_mod_table_time_t(undo_no_t rows)
+ : first(rows), first_versioned(UNVERSIONED) {}
+
+#ifdef UNIV_DEBUG
+ /** Validation
+ @param[in] rows number of modified rows so far
+ @return whether the object is valid */
+ bool valid(undo_no_t rows = UNVERSIONED) const
+ {
+ return first <= first_versioned && first <= rows;
+ }
+#endif /* UNIV_DEBUG */
+ /** @return if versioned columns were modified */
+ bool is_versioned() const { return first_versioned != UNVERSIONED; }
+
+ /** After writing an undo log record, set is_versioned() if needed
+ @param[in] rows number of modified rows so far */
+ void set_versioned(undo_no_t rows)
+ {
+ ut_ad(!is_versioned());
+ first_versioned = rows;
+ ut_ad(valid());
+ }
+
+ /** Invoked after partial rollback
+ @param[in] limit number of surviving modified rows
+ @return whether this should be erased from trx_t::mod_tables */
+ bool rollback(undo_no_t limit)
+ {
+ ut_ad(valid());
+ if (first >= limit) {
+ return true;
+ }
+
+ if (first_versioned < limit && is_versioned()) {
+ first_versioned = UNVERSIONED;
+ }
+
+ return false;
+ }
+};
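+
+/* Lifecycle sketch (illustrative; not in the original source):
+
+  trx_mod_table_time_t t(trx->undo_no); // first modification of the table
+  t.set_versioned(trx->undo_no);        // first versioned-column change
+  if (t.rollback(limit)) {
+    // the partial rollback undid the first modification;
+    // erase this entry from trx_t::mod_tables
+  }
+*/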
+
+/** Collection of persistent tables and their first modification
+in a transaction.
+We store pointers to the table objects in memory because
+we know that a table object will not be destroyed while a transaction
+that modified it is running. */
+typedef std::map<
+ dict_table_t*, trx_mod_table_time_t,
+ std::less<dict_table_t*>,
+ ut_allocator<std::pair<dict_table_t* const, trx_mod_table_time_t> > >
+ trx_mod_tables_t;
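+
+/* Illustrative use (an assumption; the actual call site is elsewhere):
+on the first modification of a table, the transaction records its
+current undo_no:
+
+  trx->mod_tables.emplace(table, trx->undo_no);
+*/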
+
+/** The transaction handle
+
+Normally, there is a 1:1 relationship between a transaction handle
+(trx) and a session (client connection). One session is associated
+with exactly one user transaction. There are some exceptions to this:
+
+* For DDL operations, a subtransaction is allocated that modifies the
+data dictionary tables. Lock waits and deadlocks are prevented by
+acquiring the dict_sys.latch before starting the subtransaction
+and releasing it after committing the subtransaction.
+
+* The purge system uses a special transaction that is not associated
+with any session.
+
+* If the system crashed or it was quickly shut down while there were
+transactions in the ACTIVE or PREPARED state, these transactions would
+no longer be associated with a session when the server is restarted.
+
+A session may be served by at most one thread at a time. The serving
+thread of a session might change in some MySQL implementations.
+Therefore we do not have os_thread_get_curr_id() assertions in the code.
+
+Normally, only the thread that is currently associated with a running
+transaction may access (read and modify) the trx object, and it may do
+so without holding any mutex. The following are exceptions to this:
+
+* trx_rollback_recovered() may access resurrected (connectionless)
+transactions (state == TRX_STATE_ACTIVE && is_recovered)
+while the system is already processing new user transactions (!is_recovered).
+
+* trx_print_low() may access transactions not associated with the current
+thread. The caller must be holding lock_sys.mutex.
+
+* When a transaction handle is in the trx_sys.trx_list, some of its fields
+must not be modified without holding trx->mutex.
+
+* The locking code (in particular, lock_deadlock_recursive() and
+lock_rec_convert_impl_to_expl()) will access transactions associated
+to other connections. The locks of transactions are protected by
+lock_sys.mutex (insertions also by trx->mutex). */
+
+/** Represents an instance of rollback segment along with its state variables.*/
+struct trx_undo_ptr_t {
+ trx_rseg_t* rseg; /*!< rollback segment assigned to the
+ transaction, or NULL if not assigned
+ yet */
+ trx_undo_t* undo; /*!< pointer to the undo log, or
+ NULL if nothing logged yet */
+};
+
+/** An instance of temporary rollback segment. */
+struct trx_temp_undo_t {
+ /** temporary rollback segment, or NULL if not assigned yet */
+ trx_rseg_t* rseg;
+ /** pointer to the undo log, or NULL if nothing logged yet */
+ trx_undo_t* undo;
+};
+
+/** Rollback segments assigned to a transaction for undo logging. */
+struct trx_rsegs_t {
+ /** undo log ptr holding a reference to a rollback segment that resides
+ in the system/undo tablespace, used for undo logging of tables that
+ need to be recovered after a crash. */
+ trx_undo_ptr_t m_redo;
+
+ /** undo log for temporary tables; discarded immediately after
+ transaction commit/rollback */
+ trx_temp_undo_t m_noredo;
+};
+
+struct trx_t : ilist_node<> {
+private:
+ /**
+ Count of references.
+
+ We can't release the locks or commit the transaction until this count
+ is 0. We can change the state to TRX_STATE_COMMITTED_IN_MEMORY to signify
+ that it is no longer "active".
+ */
+
+ Atomic_counter<int32_t> n_ref;
+
+
+public:
+ TrxMutex mutex; /*!< Mutex protecting the fields
+ state and lock (except some fields
+ of lock, which are protected by
+ lock_sys.mutex) */
+
+ trx_id_t id; /*!< transaction id */
+
+ /** State of the trx from the point of view of concurrency control
+ and the valid state transitions.
+
+ Possible states:
+
+ TRX_STATE_NOT_STARTED
+ TRX_STATE_ACTIVE
+ TRX_STATE_PREPARED
+ TRX_STATE_PREPARED_RECOVERED (special case of TRX_STATE_PREPARED)
+ TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)
+
+ Valid state transitions are:
+
+ Regular transactions:
+ * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED
+
+ Auto-commit non-locking read-only:
+ * NOT_STARTED -> ACTIVE -> NOT_STARTED
+
+ XA (2PC):
+ * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED
+
+ Recovered XA:
+ * NOT_STARTED -> PREPARED -> COMMITTED -> (freed)
+
+ Recovered XA followed by XA ROLLBACK:
+ * NOT_STARTED -> PREPARED -> ACTIVE -> COMMITTED -> (freed)
+
+ XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT):
+ * NOT_STARTED -> PREPARED -> (freed)
+
+ Disconnected XA can become recovered:
+ * ... -> ACTIVE -> PREPARED (connected) -> PREPARED (disconnected)
+ Disconnected means disconnected from MySQL, e.g. due to a client
+ disconnection.
+
+ Latching and transaction list membership rules:
+
+ XA (2PC) transactions are always treated as non-autocommit.
+
+ Transitions to ACTIVE or NOT_STARTED occur when transaction
+ is not in rw_trx_hash.
+
+ Autocommit non-locking read-only transactions move between states
+ without holding any mutex. They are not in rw_trx_hash.
+
+ All transactions, unless they are determined to be ac-nl-ro or are
+ explicitly tagged as read-only or read-write, will first be put on the
+ read-only transaction list. Only when a !read-only transaction in the
+ read-only list tries to acquire an X or IX lock on a table do we remove
+ it from the read-only list and put it on the read-write list. During
+ this switch we assign it a rollback segment.
+
+ When a transaction is NOT_STARTED, it can be in trx_list. It cannot be
+ in rw_trx_hash.
+
+ ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash.
+ The transition ACTIVE->PREPARED is protected by trx->mutex.
+
+ ACTIVE->COMMITTED is possible when the transaction is in
+ rw_trx_hash.
+
+ Transitions to COMMITTED are protected by trx_t::mutex. */
+ trx_state_t state;
+#ifdef WITH_WSREP
+ /** whether wsrep_on(mysql_thd) held at the start of transaction */
+ bool wsrep;
+ bool is_wsrep() const { return UNIV_UNLIKELY(wsrep); }
+ /** true if a BF thread is performing a unique secondary index scan */
+ bool wsrep_UK_scan;
+ bool is_wsrep_UK_scan() const { return UNIV_UNLIKELY(wsrep_UK_scan); }
+#else /* WITH_WSREP */
+ bool is_wsrep() const { return false; }
+#endif /* WITH_WSREP */
+
+ ReadView read_view; /*!< consistent read view used in the
+ transaction, or NULL if not yet set */
+ trx_lock_t lock; /*!< Information about the transaction
+ locks and state. Protected by
+ lock_sys.mutex (insertions also
+ by trx_t::mutex). */
+
+ /* These fields are not protected by any mutex. */
+
+ /** false=normal transaction, true=recovered (must be rolled back)
+ or disconnected transaction in XA PREPARE STATE.
+
+ This field is accessed by the thread that owns the transaction,
+ without holding any mutex.
+ There is only one foreign-thread access in trx_print_low()
+ and a possible race condition with trx_disconnect_prepared(). */
+ bool is_recovered;
+ const char* op_info; /*!< English text describing the
+ current operation, or an empty
+ string */
+ uint isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ... */
+ bool check_foreigns; /*!< normally TRUE, but if the user
+ wants to suppress foreign key checks
+ (in table imports, for example), we
+ set this FALSE */
+ /*------------------------------*/
+ /* MySQL has a transaction coordinator to coordinate two phase
+ commit between multiple storage engines and the binary log. When
+ an engine participates in a transaction, it's responsible for
+ registering itself using the trans_register_ha() API. */
+ bool is_registered; /* This flag is set to true after the
+ transaction has been registered with
+ the coordinator using the XA API, and
+ is set to false after commit or
+ rollback. */
+ /** whether this is holding the prepare mutex */
+ bool active_commit_ordered;
+ /*------------------------------*/
+ bool check_unique_secondary;
+ /*!< normally TRUE, but if the user
+ wants to speed up inserts by
+ suppressing unique key checks
+ for secondary indexes (when we decide
+ whether the insert buffer can be used
+ for them), we set this FALSE */
+ bool flush_log_later;/* In 2PC, we hold the
+ prepare_commit mutex across
+ both phases. In that case, we
+ defer flush of the logs to disk
+ until after we release the
+ mutex. */
+ bool must_flush_log_later;/*!< set in commit()
+ if flush_log_later was
+ set and redo log was written;
+ in that case we will
+ flush the log in
+ trx_commit_complete_for_mysql() */
+ ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
+ trx_dict_op_t dict_operation; /**< @see enum trx_dict_op_t */
+
+ ib_uint32_t dict_operation_lock_mode;
+ /*!< 0, RW_S_LATCH, or RW_X_LATCH:
+ the latch mode trx currently holds
+ on dict_sys.latch. Protected
+ by dict_sys.latch. */
+
+ /** wall-clock time of the latest transition to TRX_STATE_ACTIVE;
+ used for diagnostic purposes only */
+ time_t start_time;
+ /** microsecond_interval_timer() of transaction start */
+ ulonglong start_time_micro;
+ lsn_t commit_lsn; /*!< lsn at the time of the commit */
+ table_id_t table_id; /*!< Table to drop iff dict_operation
+ == TRX_DICT_OP_TABLE, or 0. */
+ /*------------------------------*/
+ THD* mysql_thd; /*!< MySQL thread handle corresponding
+ to this trx, or NULL */
+
+ const char* mysql_log_file_name;
+ /*!< if MySQL binlog is used, this field
+ contains a pointer to the latest file
+ name; this is NULL if binlog is not
+ used */
+ ulonglong mysql_log_offset;
+ /*!< if MySQL binlog is used, this
+ field contains the end offset of the
+ binlog entry */
+ /*------------------------------*/
+ ib_uint32_t n_mysql_tables_in_use; /*!< number of Innobase tables
+ used in the processing of the current
+ SQL statement in MySQL */
+ ib_uint32_t mysql_n_tables_locked;
+ /*!< how many tables the current SQL
+ statement uses, except those
+ in consistent read */
+ dberr_t error_state; /*!< 0 if no error, otherwise error
+ number; NOTE That ONLY the thread
+ doing the transaction is allowed to
+ set this field: this is NOT protected
+ by any mutex */
+ const dict_index_t*error_info; /*!< if the error number indicates a
+ duplicate key error, a pointer to
+ the problematic index is stored here */
+ ulint error_key_num; /*!< if index creation fails due to a
+ duplicate key error, the MySQL key
+ number of that index is stored here */
+ que_t* graph; /*!< query currently run in the session,
+ or NULL if none; NOTE that the query
+ belongs to the session, and it can
+ survive over a transaction commit, if
+ it is a stored procedure with a COMMIT
+ WORK statement, for instance */
+ /*------------------------------*/
+ UT_LIST_BASE_NODE_T(trx_named_savept_t)
+ trx_savepoints; /*!< savepoints set with SAVEPOINT ...,
+ oldest first */
+ /*------------------------------*/
+ undo_no_t undo_no; /*!< next undo log record number to
+ assign; since the undo log is
+ private for a transaction, this
+ is a simple ascending sequence
+ with no gaps; thus it represents
+ the number of modified/inserted
+ rows in a transaction */
+ trx_savept_t last_sql_stat_start;
+ /*!< undo_no when the last sql statement
+ was started: in case of an error, trx
+ is rolled back down to this number */
+ trx_rsegs_t rsegs; /* rollback segments for undo logging */
+ undo_no_t roll_limit; /*!< least undo number to undo during
+ a partial rollback; 0 otherwise */
+ bool in_rollback; /*!< true when the transaction is
+ executing a partial or full rollback */
+ ulint pages_undone; /*!< number of undo log pages undone
+ since the last undo log truncation */
+ /*------------------------------*/
+ ulint n_autoinc_rows; /*!< no. of AUTO-INC rows required for
+ an SQL statement. This is useful for
+ multi-row INSERTs */
+ ib_vector_t* autoinc_locks; /* AUTOINC locks held by this
+ transaction. Note that these are
+ also in the lock list trx_locks. This
+ vector needs to be freed explicitly
+ when the trx instance is destroyed.
+ Protected by lock_sys.mutex. */
+ /*------------------------------*/
+ bool read_only; /*!< true if transaction is flagged
+ as a READ-ONLY transaction.
+ if auto_commit && !will_lock
+ then it will be handled as an
+ AC-NL-RO-SELECT (Auto Commit Non-Locking
+ Read Only Select). A read only
+ transaction will not be assigned an
+ UNDO log. */
+ bool auto_commit; /*!< true if it is an autocommit */
+ bool will_lock; /*!< set to inform trx_start_low() that
+ the transaction may acquire locks */
+ /*------------------------------*/
+ fts_trx_t* fts_trx; /*!< FTS information, or NULL if
+ transaction hasn't modified tables
+ with FTS indexes (yet). */
+ doc_id_t fts_next_doc_id;/* The document id used for updates */
+ /*------------------------------*/
+ ib_uint32_t flush_tables; /*!< if "covering" FLUSH TABLES,
+ count of tables being flushed. */
+
+ /*------------------------------*/
+ bool ddl; /*!< true if it is an internal
+ transaction for DDL */
+ bool internal; /*!< true if it is a system/internal
+ transaction background task. This
+ includes DDL transactions too. Such
+ transactions are always treated as
+ read-write. */
+ /*------------------------------*/
+#ifdef UNIV_DEBUG
+ unsigned start_line; /*!< Track where it was started from */
+ const char* start_file; /*!< Filename where it was started */
+#endif /* UNIV_DEBUG */
+
+ XID* xid; /*!< X/Open XA transaction
+ identification to identify a
+ transaction branch */
+ trx_mod_tables_t mod_tables; /*!< List of tables that were modified
+ by this transaction */
+ /*------------------------------*/
+ char* detailed_error; /*!< detailed error message for last
+ error, or empty. */
+ rw_trx_hash_element_t *rw_trx_hash_element;
+ LF_PINS *rw_trx_hash_pins;
+ ulint magic_n;
+
+ /** @return whether any persistent undo log has been generated */
+ bool has_logged_persistent() const
+ {
+ return(rsegs.m_redo.undo);
+ }
+
+ /** @return whether any undo log has been generated */
+ bool has_logged() const
+ {
+ return(has_logged_persistent() || rsegs.m_noredo.undo);
+ }
+
+ /** @return rollback segment for modifying temporary tables */
+ trx_rseg_t* get_temp_rseg()
+ {
+ if (trx_rseg_t* rseg = rsegs.m_noredo.rseg) {
+ ut_ad(id != 0);
+ return(rseg);
+ }
+
+ return(assign_temp_rseg());
+ }
+
+ /** Transition to committed state, to release implicit locks. */
+ inline void commit_state();
+
+ /** Release any explicit locks of a committing transaction. */
+ inline void release_locks();
+
+ /** Evict a table definition due to the rollback of ALTER TABLE.
+ @param[in] table_id table identifier */
+ void evict_table(table_id_t table_id);
+
+ /** Initiate rollback.
+ @param savept savepoint to which to roll back
+ @return error code or DB_SUCCESS */
+ dberr_t rollback(trx_savept_t *savept= nullptr);
+ /** Roll back an active transaction.
+ @param savept savepoint to which to roll back */
+ inline void rollback_low(trx_savept_t *savept= nullptr);
+ /** Finish rollback.
+ @return whether the rollback was completed normally
+ @retval false if the rollback was aborted by shutdown */
+ inline bool rollback_finish();
+private:
+ /** Mark a transaction committed in the main memory data structures. */
+ inline void commit_in_memory(const mtr_t *mtr);
+ /** Commit the transaction in a mini-transaction.
+ @param mtr mini-transaction (if there are any persistent modifications) */
+ void commit_low(mtr_t *mtr= nullptr);
+public:
+ /** Commit the transaction. */
+ void commit();
+
+
+ bool is_referenced() const { return n_ref > 0; }
+
+
+ void reference()
+ {
+#ifdef UNIV_DEBUG
+ auto old_n_ref=
+#endif
+ n_ref++;
+ ut_ad(old_n_ref >= 0);
+ }
+
+
+ void release_reference()
+ {
+#ifdef UNIV_DEBUG
+ auto old_n_ref=
+#endif
+ n_ref--;
+ ut_ad(old_n_ref > 0);
+ }
+
+ /** @return whether the table has lock on
+ mysql.innodb_table_stats and mysql.innodb_index_stats */
+ bool has_stats_table_lock() const;
+
+ /** Free the memory to trx_pools */
+ void free();
+
+
+ void assert_freed() const
+ {
+ ut_ad(state == TRX_STATE_NOT_STARTED);
+ ut_ad(!id);
+ ut_ad(!has_logged());
+ ut_ad(!is_referenced());
+ ut_ad(!is_wsrep());
+#ifdef WITH_WSREP
+ ut_ad(!lock.was_chosen_as_wsrep_victim);
+#endif
+ ut_ad(!read_view.is_open());
+ ut_ad(!lock.wait_thr);
+ ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+ ut_ad(lock.table_locks.empty());
+ ut_ad(!autoinc_locks || ib_vector_is_empty(autoinc_locks));
+ ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
+ ut_ad(dict_operation == TRX_DICT_OP_NONE);
+ }
+
+ /** @return whether this is a non-locking autocommit transaction */
+ bool is_autocommit_non_locking() const { return auto_commit && !will_lock; }
+
+private:
+ /** Assign a rollback segment for modifying temporary tables.
+ @return the assigned rollback segment */
+ trx_rseg_t *assign_temp_rseg();
+};
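+
+/* Reference counting sketch (illustrative; not in the original source):
+a thread that must keep the implicit locks of a transaction valid while
+inspecting it pins the transaction first:
+
+  trx->reference();
+  // ... trx cannot complete its commit while n_ref > 0 ...
+  trx->release_reference();
+*/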
+
+/**
+Check if transaction is started.
+@param[in] trx Transaction whose state we need to check
+@return true if the transaction is started */
+inline bool trx_is_started(const trx_t* trx)
+{
+ return trx->state != TRX_STATE_NOT_STARTED;
+}
+
+/* Transaction isolation levels (trx->isolation_level) */
+#define TRX_ISO_READ_UNCOMMITTED 0 /* dirty read: non-locking
+ SELECTs are performed so that
+ we do not look at a possible
+ earlier version of a record;
+ thus they are not 'consistent'
+ reads under this isolation
+ level; otherwise like level
+ 2 */
+
+#define TRX_ISO_READ_COMMITTED 1 /* somewhat Oracle-like
+ isolation, except that in
+ range UPDATE and DELETE we
+ must block phantom rows
+ with next-key locks;
+ SELECT ... FOR UPDATE and ...
+ LOCK IN SHARE MODE only lock
+ the index records, NOT the
+ gaps before them, and thus
+ allow free inserting;
+ each consistent read reads its
+ own snapshot */
+
+#define TRX_ISO_REPEATABLE_READ 2 /* this is the default;
+ all consistent reads in the
+ same trx read the same
+ snapshot;
+ full next-key locking used
+ in locking reads to block
+ insertions into gaps */
+
+#define TRX_ISO_SERIALIZABLE 3 /* all plain SELECTs are
+ converted to LOCK IN SHARE
+ MODE reads */
+
+/* Treatment of duplicate values (trx->duplicates; for example, in inserts).
+Multiple flags can be combined with bitwise OR. */
+#define TRX_DUP_IGNORE 1U /* duplicate rows are to be updated */
+#define TRX_DUP_REPLACE 2U /* duplicate rows are to be replaced */
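+
+/* Example (illustrative): INSERT IGNORE executes with trx->duplicates ==
+TRX_DUP_IGNORE and REPLACE with TRX_DUP_REPLACE; the flags may also be
+combined:
+
+  trx->duplicates = TRX_DUP_IGNORE | TRX_DUP_REPLACE;
+*/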
+
+
+/** Commit node states */
+enum commit_node_state {
+ COMMIT_NODE_SEND = 1, /*!< about to send a commit signal to
+ the transaction */
+ COMMIT_NODE_WAIT /*!< commit signal sent to the transaction,
+ waiting for completion */
+};
+
+/** Commit command node in a query graph */
+struct commit_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_COMMIT */
+ enum commit_node_state
+ state; /*!< node execution state */
+};
+
+
+/** Test if trx->mutex is owned. */
+#define trx_mutex_own(t) mutex_own(&t->mutex)
+
+/** Acquire the trx->mutex. */
+#define trx_mutex_enter(t) do { \
+ mutex_enter(&t->mutex); \
+} while (0)
+
+/** Release the trx->mutex. */
+#define trx_mutex_exit(t) do { \
+ mutex_exit(&t->mutex); \
+} while (0)
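+
+/* Typical use (illustrative; not in the original source): fields that are
+documented as protected by trx->mutex are updated under it, e.g.
+
+  trx_mutex_enter(trx);
+  ut_ad(trx_mutex_own(trx));
+  trx->lock.was_chosen_as_deadlock_victim = true;
+  trx_mutex_exit(trx);
+*/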
+
+#include "trx0trx.ic"
+
+#endif
diff --git a/storage/innobase/include/trx0trx.ic b/storage/innobase/include/trx0trx.ic
new file mode 100644
index 00000000..93c9591e
--- /dev/null
+++ b/storage/innobase/include/trx0trx.ic
@@ -0,0 +1,206 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.ic
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/**********************************************************************//**
+Determines if a transaction is in the given state.
+The caller must hold trx->mutex, or it must be the thread
+that is serving a running transaction.
+A running RW transaction must be in trx_sys.rw_trx_hash.
+@return TRUE if trx->state == state */
+UNIV_INLINE
+bool
+trx_state_eq(
+/*=========*/
+ const trx_t* trx, /*!< in: transaction */
+ trx_state_t state, /*!< in: state;
+ if state != TRX_STATE_NOT_STARTED
+ asserts that
+ trx->state != TRX_STATE_NOT_STARTED */
+ bool relaxed)
+ /*!< in: whether to allow
+ trx->state == TRX_STATE_NOT_STARTED
+ after an error has been reported */
+{
+#ifdef UNIV_DEBUG
+ switch (trx->state) {
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ ut_ad(!trx->is_autocommit_non_locking());
+ return(trx->state == state);
+
+ case TRX_STATE_ACTIVE:
+ if (trx->is_autocommit_non_locking()) {
+ ut_ad(!trx->is_recovered);
+ ut_ad(trx->read_only);
+ ut_ad(trx->mysql_thd);
+ }
+ return(state == trx->state);
+
+ case TRX_STATE_NOT_STARTED:
+ /* This state is not allowed for a running transaction. */
+ ut_a(state == TRX_STATE_NOT_STARTED
+ || (relaxed
+ && thd_get_error_number(trx->mysql_thd)));
+
+ return(true);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(trx->state == state);
+}
+
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+ const trx_t* trx) /*!< in: trx object */
+{
+ return(trx->error_info);
+}
+
+/*******************************************************************//**
+Retrieves transaction's que state in a human readable string. The string
+should not be free()'d or modified.
+@return string in the data segment */
+UNIV_INLINE
+const char*
+trx_get_que_state_str(
+/*==================*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ /* be sure to adjust TRX_QUE_STATE_STR_MAX_LEN if you change this */
+ switch (trx->lock.que_state) {
+ case TRX_QUE_RUNNING:
+ return("RUNNING");
+ case TRX_QUE_LOCK_WAIT:
+ return("LOCK WAIT");
+ case TRX_QUE_ROLLING_BACK:
+ return("ROLLING BACK");
+ case TRX_QUE_COMMITTING:
+ return("COMMITTING");
+ default:
+ return("UNKNOWN");
+ }
+}
+
+/** Retrieves the transaction ID.
+At any given point in time it is guaranteed that the IDs of the running
+transactions are unique. The values returned by this function for read-only
+transactions may be reused, so a subsequent RO transaction may get the same ID
+as a RO transaction that existed in the past. The values returned by this
+function should be used for printing purposes only.
+@param[in] trx transaction whose id to retrieve
+@return transaction id */
+UNIV_INLINE
+trx_id_t
+trx_get_id_for_print(
+ const trx_t* trx)
+{
+ /* Read-only transactions, and transactions whose intentions are unknown
+ (whether they will eventually do a WRITE), don't have trx_t::id assigned
+ (it is 0 for those transactions). Transaction IDs in
+ innodb_trx.trx_id,
+ innodb_locks.lock_id,
+ innodb_locks.lock_trx_id,
+ innodb_lock_waits.requesting_trx_id,
+ innodb_lock_waits.blocking_trx_id should match because those tables
+ could be used in an SQL JOIN on those columns. Also trx_t::id is
+ printed by SHOW ENGINE INNODB STATUS, and in logs, so we must have the
+ same value printed everywhere consistently. */
+
+ /* DATA_TRX_ID_LEN is the storage size in bytes. */
+ static const trx_id_t max_trx_id
+ = (1ULL << (DATA_TRX_ID_LEN * CHAR_BIT)) - 1;
+
+ ut_ad(trx->id <= max_trx_id);
+
+ return(trx->id != 0
+ ? trx->id
+ : reinterpret_cast<trx_id_t>(trx) | (max_trx_id + 1));
+}
+
+/**********************************************************************//**
+Determine if a transaction is a dictionary operation.
+@return dictionary operation mode */
+UNIV_INLINE
+enum trx_dict_op_t
+trx_get_dict_operation(
+/*===================*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ trx_dict_op_t op = static_cast<trx_dict_op_t>(trx->dict_operation);
+
+#ifdef UNIV_DEBUG
+ switch (op) {
+ case TRX_DICT_OP_NONE:
+ case TRX_DICT_OP_TABLE:
+ case TRX_DICT_OP_INDEX:
+ return(op);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(op);
+}
+/**********************************************************************//**
+Flag a transaction as a dictionary operation. */
+UNIV_INLINE
+void
+trx_set_dict_operation(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ enum trx_dict_op_t op) /*!< in: operation, not
+ TRX_DICT_OP_NONE */
+{
+#ifdef UNIV_DEBUG
+ enum trx_dict_op_t old_op = trx_get_dict_operation(trx);
+
+ switch (op) {
+ case TRX_DICT_OP_NONE:
+ ut_error;
+ break;
+ case TRX_DICT_OP_TABLE:
+ switch (old_op) {
+ case TRX_DICT_OP_NONE:
+ case TRX_DICT_OP_INDEX:
+ case TRX_DICT_OP_TABLE:
+ goto ok;
+ }
+ ut_error;
+ break;
+ case TRX_DICT_OP_INDEX:
+ ut_ad(old_op == TRX_DICT_OP_NONE);
+ break;
+ }
+ok:
+#endif /* UNIV_DEBUG */
+
+ trx->ddl = true;
+ trx->dict_operation = op;
+}
diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
new file mode 100644
index 00000000..99a9c66c
--- /dev/null
+++ b/storage/innobase/include/trx0types.h
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0types.h
+Transaction system global type definitions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0types_h
+#define trx0types_h
+
+#include "ut0byte.h"
+#include "ut0mutex.h"
+
+#include <vector>
+
+/** printf(3) format used for printing DB_TRX_ID and other system fields */
+#define TRX_ID_FMT IB_ID_FMT
+
+/** maximum length that a formatted trx_t::id could take, not including
+the terminating NUL character. */
+static const ulint TRX_ID_MAX_LEN = 17;
+
+/** Space id of the transaction system page (the system tablespace) */
+static const ulint TRX_SYS_SPACE = 0;
+
+/** Page number of the transaction system page */
+#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO
+
+/** Random value to check for corruption of trx_t */
+static const ulint TRX_MAGIC_N = 91118598;
+
+constexpr uint innodb_purge_threads_MAX= 32;
+
+/** Transaction execution states when trx->state == TRX_STATE_ACTIVE */
+enum trx_que_t {
+ TRX_QUE_RUNNING, /*!< transaction is running */
+ TRX_QUE_LOCK_WAIT, /*!< transaction is waiting for
+ a lock */
+ TRX_QUE_ROLLING_BACK, /*!< transaction is rolling back */
+ TRX_QUE_COMMITTING /*!< transaction is committing */
+};
+
+/** Transaction states (trx_t::state) */
+enum trx_state_t {
+ TRX_STATE_NOT_STARTED,
+
+ TRX_STATE_ACTIVE,
+ /** XA PREPARE has been executed; only XA COMMIT or XA ROLLBACK
+ are possible */
+ TRX_STATE_PREPARED,
+ /** XA PREPARE transaction that was returned to ha_recover() */
+ TRX_STATE_PREPARED_RECOVERED,
+ TRX_STATE_COMMITTED_IN_MEMORY
+};
+
+/** Type of data dictionary operation */
+enum trx_dict_op_t {
+ /** The transaction is not modifying the data dictionary. */
+ TRX_DICT_OP_NONE = 0,
+ /** The transaction is creating a table or an index, or
+ dropping a table. The table must be dropped in crash
+ recovery. This and TRX_DICT_OP_NONE are the only possible
+ operation modes in crash recovery. */
+ TRX_DICT_OP_TABLE = 1,
+ /** The transaction is creating or dropping an index in an
+ existing table. In crash recovery, the data dictionary
+ must be locked, but the table must not be dropped. */
+ TRX_DICT_OP_INDEX = 2
+};
+
+/** Memory objects */
+/* @{ */
+/** Transaction */
+struct trx_t;
+/** The locks and state of an active transaction */
+struct trx_lock_t;
+/** Rollback segment */
+struct trx_rseg_t;
+/** Transaction undo log */
+struct trx_undo_t;
+/** Rollback command node in a query graph */
+struct roll_node_t;
+/** Commit command node in a query graph */
+struct commit_node_t;
+/** SAVEPOINT command node in a query graph */
+struct trx_named_savept_t;
+/* @} */
+
+/** Row identifier (DB_ROW_ID, DATA_ROW_ID) */
+typedef ib_id_t row_id_t;
+/** Transaction identifier (DB_TRX_ID, DATA_TRX_ID) */
+typedef ib_id_t trx_id_t;
+/** Rollback pointer (DB_ROLL_PTR, DATA_ROLL_PTR) */
+typedef ib_id_t roll_ptr_t;
+/** Undo number */
+typedef ib_id_t undo_no_t;
+
+/** Transaction savepoint */
+struct trx_savept_t{
+ undo_no_t least_undo_no; /*!< least undo number to undo */
+};
+
+/** File objects */
+/* @{ */
+/** Undo segment header */
+typedef byte trx_usegf_t;
+/** Undo log header */
+typedef byte trx_ulogf_t;
+/** Undo log page header */
+typedef byte trx_upagef_t;
+
+/** Undo log record */
+typedef byte trx_undo_rec_t;
+
+/* @} */
+
+typedef ib_mutex_t RsegMutex;
+typedef ib_mutex_t TrxMutex;
+typedef ib_mutex_t PQMutex;
+typedef ib_mutex_t TrxSysMutex;
+
+typedef std::vector<trx_id_t, ut_allocator<trx_id_t> > trx_ids_t;
+#endif /* trx0types_h */
diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h
new file mode 100644
index 00000000..319ea4ee
--- /dev/null
+++ b/storage/innobase/include/trx0undo.h
@@ -0,0 +1,465 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.h
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0undo_h
+#define trx0undo_h
+
+#ifndef UNIV_INNOCHECKSUM
+#include "trx0sys.h"
+
+/** The LSB of the "is insert" flag in DB_ROLL_PTR */
+#define ROLL_PTR_INSERT_FLAG_POS 55
+/** The LSB of the 7-bit trx_rseg_t::id in DB_ROLL_PTR */
+#define ROLL_PTR_RSEG_ID_POS 48
+/** The LSB of the 32-bit undo log page number in DB_ROLL_PTR */
+#define ROLL_PTR_PAGE_POS 16
+/** The LSB of the 16-bit byte offset within an undo log page in DB_ROLL_PTR */
+#define ROLL_PTR_BYTE_POS 0
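+
+/* Layout sketch (derived from the bit positions above): DB_ROLL_PTR is a
+7-byte (56-bit) value packed as
+
+  roll_ptr = (is_insert << 55) | (rseg_id << 48) | (page_no << 16) | offset
+
+so trx_undo_build_roll_ptr() and trx_undo_decode_roll_ptr() below are exact
+inverses of each other. */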
+
+/***********************************************************************//**
+Builds a roll pointer.
+@return roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+ bool is_insert, /*!< in: TRUE if insert undo log */
+ ulint rseg_id, /*!< in: rollback segment id */
+ uint32_t page_no, /*!< in: page number */
+ uint16_t offset); /*!< in: offset of the undo entry within page */
+/***********************************************************************//**
+Decodes a roll pointer. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer */
+ bool* is_insert, /*!< out: TRUE if insert undo log */
+ ulint* rseg_id, /*!< out: rollback segment id */
+ uint32_t* page_no, /*!< out: page number */
+ uint16_t* offset); /*!< out: offset of the undo
+ entry within page */
+/***********************************************************************//**
+Determine if DB_ROLL_PTR is of the insert type.
+@return true if insert */
+UNIV_INLINE
+bool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+ roll_ptr_t roll_ptr); /*!< in: roll pointer */
+/***********************************************************************//**
+Returns true if the record is of the insert type.
+@return true if the record was freshly inserted (not updated). */
+UNIV_INLINE
+bool
+trx_undo_trx_id_is_insert(
+/*======================*/
+ const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */
+ MY_ATTRIBUTE((warn_unused_result));
+/** Write DB_ROLL_PTR.
+@param[out] ptr buffer
+@param[in] roll_ptr DB_ROLL_PTR value */
+inline void trx_write_roll_ptr(byte* ptr, roll_ptr_t roll_ptr)
+{
+ compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+ mach_write_to_7(ptr, roll_ptr);
+}
+/** Read DB_ROLL_PTR.
+@param[in] ptr buffer
+@return roll ptr */
+inline roll_ptr_t trx_read_roll_ptr(const byte* ptr)
+{
+ compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+ return mach_read_from_7(ptr);
+}
+
+/** Gets an undo log page and x-latches it.
+@param[in] page_id page id
+@param[in,out] mtr mini-transaction
+@return pointer to page x-latched */
+UNIV_INLINE
+buf_block_t*
+trx_undo_page_get(const page_id_t page_id, mtr_t* mtr);
+
+/** Gets an undo log page and s-latches it.
+@param[in] page_id page id
+@param[in,out] mtr mini-transaction
+@return pointer to page s-latched */
+UNIV_INLINE
+buf_block_t*
+trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr);
+
+/** Get the next record in an undo log.
+@param[in] undo_page undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@return undo log record, the page latched, NULL if none */
+inline trx_undo_rec_t*
+trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec,
+ uint32_t page_no, uint16_t offset);
+/** Get the previous record in an undo log.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+ uint16_t offset, bool shared, mtr_t *mtr);
+/** Get the next record in an undo log.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_next_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+ uint16_t offset, mtr_t *mtr);
+
+/** Get the first record in an undo log.
+@param[in] space undo log header space
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH
+@param[out] block undo log page
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
+ uint16_t offset, ulint mode, buf_block_t*& block,
+ mtr_t *mtr);
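+
+/* Iteration sketch (illustrative; mini-transaction handling simplified):
+
+  buf_block_t* block;
+  for (trx_undo_rec_t* rec= trx_undo_get_first_rec(space, page_no, offset,
+                                                   RW_S_LATCH, block, &mtr);
+       rec != NULL;
+       rec= trx_undo_get_next_rec(block, uint16_t(page_offset(rec)),
+                                  page_no, offset, &mtr)) {
+    // process one undo log record
+  }
+*/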
+
+/** Initialize an undo log page.
+NOTE: This corresponds to a redo log record and must not be changed!
+@see mtr_t::undo_create()
+@param[in,out] block undo log page */
+void trx_undo_page_init(const buf_block_t &block);
+
+/** Allocate an undo log page.
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction that does not hold any page latch
+@return X-latched block if success
+@retval NULL on failure */
+buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Free the last undo log page. The caller must hold the rseg mutex.
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction that does not hold any undo log page
+ or that has allocated the undo log page */
+void
+trx_undo_free_last_page(trx_undo_t* undo, mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/** Truncate the tail of an undo log during rollback.
+@param[in,out] undo undo log
+@param[in] limit all undo logs after this limit will be discarded
+@param[in] is_temp whether this is temporary undo log */
+void trx_undo_truncate_end(trx_undo_t& undo, undo_no_t limit, bool is_temp);
+
+/** Truncate the head of an undo log.
+NOTE that only whole pages are freed; the header page is not
+freed, but emptied, if all the records there are below the limit.
+@param[in,out] rseg rollback segment
+@param[in] hdr_page_no header page number
+@param[in] hdr_offset header offset on the page
+@param[in] limit first undo number to preserve
+(everything below the limit will be truncated) */
+void
+trx_undo_truncate_start(
+ trx_rseg_t* rseg,
+ uint32_t hdr_page_no,
+ uint16_t hdr_offset,
+ undo_no_t limit);
+/** Mark that an undo log header belongs to a data dictionary transaction.
+@param[in] trx dictionary transaction
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction */
+void trx_undo_mark_as_dict(const trx_t* trx, trx_undo_t* undo, mtr_t* mtr);
+/** Assign an undo log for a persistent transaction.
+A new undo log is created or a cached undo log reused.
+@param[in,out] trx transaction
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL on error */
+buf_block_t*
+trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull));
+/** Assign an undo log for a transaction.
+A new undo log is created or a cached undo log reused.
+@param[in,out] trx transaction
+@param[in] rseg rollback segment
+@param[out] undo the undo log
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL on error */
+buf_block_t*
+trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
+ dberr_t* err, mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction finish.
+@return undo log segment header page, x-latched */
+buf_block_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr); /*!< in: mtr */
+
+/** Set the state of the undo log segment at an XA PREPARE or XA ROLLBACK.
+@param[in,out] trx transaction
+@param[in,out] undo undo log
+@param[in] rollback false=XA PREPARE, true=XA ROLLBACK
+@param[in,out] mtr mini-transaction */
+void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback,
+ mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/** Free temporary undo log after commit or rollback.
+The information is not needed after a commit or rollback, therefore
+the data can be discarded.
+@param undo temporary undo log */
+void trx_undo_commit_cleanup(trx_undo_t *undo);
+
+/** At shutdown, frees the undo logs of a transaction. */
+void
+trx_undo_free_at_shutdown(trx_t *trx);
+
+/** Read an undo log when starting up the database.
+@param[in,out] rseg rollback segment
+@param[in] id rollback segment slot
+@param[in] page_no undo log segment page number
+@param[in,out] max_trx_id the largest observed transaction ID
+@return the undo log
+@retval nullptr on error */
+trx_undo_t *
+trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no,
+ trx_id_t &max_trx_id);
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** the only rollback segment type since MariaDB 10.3.1 */
+constexpr uint16_t TRX_UNDO_UPDATE= 2;
+/* TRX_UNDO_STATE values of an undo log segment */
+/** contains an undo log of an active transaction */
+constexpr uint16_t TRX_UNDO_ACTIVE = 1;
+/** cached for quick reuse */
+constexpr uint16_t TRX_UNDO_CACHED = 2;
+/** can be freed in purge when all undo data in it is removed */
+constexpr uint16_t TRX_UNDO_TO_PURGE = 4;
+/** contains an undo log of a prepared transaction */
+constexpr uint16_t TRX_UNDO_PREPARED = 5;
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** Transaction undo log memory object; modified by the thread associated
+with the transaction. */
+
+struct trx_undo_t {
+ /*-----------------------------*/
+ ulint id; /*!< undo log slot number within the
+ rollback segment */
+ ulint state; /*!< state of the corresponding undo log
+ segment */
+ trx_id_t trx_id; /*!< id of the trx assigned to the undo
+ log */
+ XID xid; /*!< X/Open XA transaction
+ identification */
+ ibool dict_operation; /*!< TRUE if a dict operation trx */
+ table_id_t table_id; /*!< if a dict operation, then the table
+ id */
+ trx_rseg_t* rseg; /*!< rseg where the undo log belongs */
+ /*-----------------------------*/
+ uint32_t hdr_page_no; /*!< page number of the header page in
+ the undo log */
+ uint32_t last_page_no; /*!< page number of the last page in the
+ undo log; this may differ from
+ top_page_no during a rollback */
+ uint16_t hdr_offset; /*!< header offset of the undo log on
+ the page */
+ uint32_t size; /*!< current size in pages */
+ /*-----------------------------*/
+ uint32_t top_page_no; /*!< page number where the latest undo
+ log record was catenated; during
+ rollback the page from which the latest
+ undo record was chosen */
+ uint16_t top_offset; /*!< offset of the latest undo record,
+ i.e., the topmost element in the undo
+ log if we think of it as a stack */
+ undo_no_t top_undo_no; /*!< undo number of the latest record
+ (IB_ID_MAX if the undo log is empty) */
+ buf_block_t* guess_block; /*!< guess for the buffer block where
+ the top page might reside */
+
+ /** @return whether the undo log is empty */
+ bool empty() const { return top_undo_no == IB_ID_MAX; }
+
+ /*-----------------------------*/
+ UT_LIST_NODE_T(trx_undo_t) undo_list;
+ /*!< undo log objects in the rollback
+ segment are chained into lists */
+};
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** The offset of the undo log page header on pages of the undo log */
+#define TRX_UNDO_PAGE_HDR FSEG_PAGE_DATA
+/*-------------------------------------------------------------*/
+/** Transaction undo log page header offsets */
+/* @{ */
+#define TRX_UNDO_PAGE_TYPE 0 /*!< unused; 0 (before MariaDB 10.3.1:
+ 1=TRX_UNDO_INSERT or
+ 2=TRX_UNDO_UPDATE) */
+#define TRX_UNDO_PAGE_START 2 /*!< Byte offset where the undo log
+ records for the LATEST transaction
+ start on this page (remember that
+ in an update undo log, the first page
+ can contain several undo logs) */
+#define TRX_UNDO_PAGE_FREE 4 /*!< On each page of the undo log this
+ field contains the byte offset of the
+ first free byte on the page */
+#define TRX_UNDO_PAGE_NODE 6 /*!< The file list node in the chain
+ of undo log pages */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_PAGE_HDR_SIZE (6 + FLST_NODE_SIZE)
+ /*!< Size of the transaction undo
+ log page header, in bytes */
+/* @} */
+
+/** An update undo segment with just one page can be reused if it has
+at most this many bytes used; we must leave space at least for one new undo
+log header on the page */
+
+#define TRX_UNDO_PAGE_REUSE_LIMIT (3 << (srv_page_size_shift - 2))
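+
+/* For example, with the default 16KiB page size (srv_page_size_shift == 14)
+this evaluates to 3 << 12 == 12288 bytes, i.e. three quarters of the page. */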
+
+/* An update undo log segment may contain several undo logs on its first page
+if the undo logs took so little space that the segment could be cached and
+reused. All the undo log headers are then on the first page, and the last one
+owns the undo log records on subsequent pages if the segment is bigger than
+one page. If an undo log is stored in a segment, then on the first page it is
+allowed to have zero undo records, but if the segment extends to several
+pages, then all the rest of the pages must contain at least one undo log
+record. */
+
+/** The offset of the undo log segment header on the first page of the undo
+log segment */
+
+#define TRX_UNDO_SEG_HDR (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE)
+/** Undo log segment header */
+/* @{ */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_STATE 0 /*!< TRX_UNDO_ACTIVE, ... */
+
+#ifndef UNIV_INNOCHECKSUM
+
+#define TRX_UNDO_LAST_LOG 2 /*!< Offset of the last undo log header
+ on the segment header page, 0 if
+ none */
+#define TRX_UNDO_FSEG_HEADER 4 /*!< Header for the file segment which
+ the undo log segment occupies */
+#define TRX_UNDO_PAGE_LIST (4 + FSEG_HEADER_SIZE)
+ /*!< Base node for the list of pages in
+ the undo log segment; defined only on
+ the undo log segment's first page */
+/*-------------------------------------------------------------*/
+/** Size of the undo log segment header */
+#define TRX_UNDO_SEG_HDR_SIZE (4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE)
+/* @} */
+
+/** The undo log header. There can be several undo log headers on the first
+page of an update undo log segment. */
+/* @{ */
+/*-------------------------------------------------------------*/
+/** Transaction start identifier, or 0 if the undo log segment has been
+completely purged and trx_purge_free_segment() has started freeing it */
+#define TRX_UNDO_TRX_ID 0
+/** Transaction end identifier (if the log is in a history list),
+or 0 if the transaction has not been committed */
+#define TRX_UNDO_TRX_NO 8
+/** Before MariaDB 10.3.1, when purge did not reset DB_TRX_ID of
+surviving user records, this used to be called TRX_UNDO_DEL_MARKS.
+
+The value 1 indicates that purge needs to process the undo log segment.
+The value 0 indicates that all of it has been processed, and
+trx_purge_free_segment() has been invoked, so the log is not safe to access.
+
+Before MariaDB 10.3.1, a log segment may carry the value 0 even before
+trx_purge_free_segment() was called, for those undo log records for
+which purge would not result in removing delete-marked records. */
+#define TRX_UNDO_NEEDS_PURGE 16
+#define TRX_UNDO_LOG_START 18 /*!< Offset of the first undo log record
+ of this log on the header page; purge
+ may remove undo log records from the
+ start of the log, and therefore this
+ is not necessarily the same as the
+ end offset of this log header */
+#define TRX_UNDO_XID_EXISTS 20 /*!< TRUE if undo log header includes
+ X/Open XA transaction identification
+ XID */
+#define TRX_UNDO_DICT_TRANS 21 /*!< TRUE if the transaction is a table
+ create, index create, or drop
+ transaction: in recovery
+ the transaction cannot be rolled back
+ in the usual way: a 'rollback' rather
+ means dropping the created or dropped
+ table, if it still exists */
+#define TRX_UNDO_TABLE_ID 22 /*!< Id of the table if the preceding
+ field is TRUE */
+#define TRX_UNDO_NEXT_LOG 30 /*!< Offset of the next undo log header
+ on this page, 0 if none */
+#define TRX_UNDO_PREV_LOG 32 /*!< Offset of the previous undo log
+ header on this page, 0 if none */
+#define TRX_UNDO_HISTORY_NODE 34 /*!< If the log is put to the history
+ list, the file list node is here */
+/*-------------------------------------------------------------*/
+/** Size of the undo log header without XID information */
+#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE)
+
+/** X/Open XA Transaction Identification (XID) */
+/* @{ */
+/** xid_t::formatID */
+#define TRX_UNDO_XA_FORMAT (TRX_UNDO_LOG_OLD_HDR_SIZE)
+/** xid_t::gtrid_length */
+#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4)
+/** xid_t::bqual_length */
+#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4)
+/** Distributed transaction identifier data */
+#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4)
+/*--------------------------------------------------------------*/
+#define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE)
+ /*!< Total size of the undo log header
+ with the XA XID */
+/* @} */
+
+#include "trx0undo.ic"
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/trx0undo.ic b/storage/innobase/include/trx0undo.ic
new file mode 100644
index 00000000..43af9327
--- /dev/null
+++ b/storage/innobase/include/trx0undo.ic
@@ -0,0 +1,158 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.ic
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "data0type.h"
+#include "page0page.h"
+
+/***********************************************************************//**
+Builds a roll pointer.
+@return roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+ bool is_insert, /*!< in: TRUE if insert undo log */
+ ulint rseg_id, /*!< in: rollback segment id */
+ uint32_t page_no, /*!< in: page number */
+ uint16_t offset) /*!< in: offset of the undo entry within page */
+{
+ compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+ ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+
+ return roll_ptr_t{is_insert} << ROLL_PTR_INSERT_FLAG_POS |
+ roll_ptr_t{rseg_id} << ROLL_PTR_RSEG_ID_POS |
+ roll_ptr_t{page_no} << ROLL_PTR_PAGE_POS | offset;
+}
+
+/***********************************************************************//**
+Decodes a roll pointer. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer */
+ bool* is_insert, /*!< out: TRUE if insert undo log */
+ ulint* rseg_id, /*!< out: rollback segment id */
+ uint32_t* page_no, /*!< out: page number */
+ uint16_t* offset) /*!< out: offset of the undo
+ entry within page */
+{
+ compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+ ut_ad(roll_ptr < (1ULL << 56));
+ *offset= static_cast<uint16_t>(roll_ptr);
+ *page_no= static_cast<uint32_t>(roll_ptr >> 16);
+ *rseg_id= static_cast<ulint>(roll_ptr >> 48 & 0x7F);
+ *is_insert= static_cast<bool>(roll_ptr >> 55);
+}
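+
+/* Layout sketch (illustrative) of the 7-byte (56-bit) DB_ROLL_PTR, as
+implied by the encode/decode pair above:
+
+  bit 55      insert flag (1 = insert undo log)
+  bits 48..54 rollback segment id (7 bits)
+  bits 16..47 undo page number (32 bits)
+  bits 0..15  byte offset of the undo entry within the page
+
+For example, trx_undo_build_roll_ptr(false, 3, 7, 0x110) yields
+3ULL << 48 | 7ULL << 16 | 0x110, which trx_undo_decode_roll_ptr()
+splits back into rseg_id == 3, page_no == 7, offset == 0x110. */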
+
+/***********************************************************************//**
+Determine if DB_ROLL_PTR is of the insert type.
+@return true if insert */
+UNIV_INLINE
+bool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+ roll_ptr_t roll_ptr) /*!< in: roll pointer */
+{
+ compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+ ut_ad(roll_ptr < (1ULL << (ROLL_PTR_INSERT_FLAG_POS + 1)));
+ return static_cast<bool>(roll_ptr >> ROLL_PTR_INSERT_FLAG_POS);
+}
+
+/***********************************************************************//**
+Returns true if the record is of the insert type.
+@return true if the record was freshly inserted (not updated). */
+UNIV_INLINE
+bool
+trx_undo_trx_id_is_insert(
+/*======================*/
+ const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */
+{
+ compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+ return bool(trx_id[DATA_TRX_ID_LEN] >> 7);
+}
+
+/** Gets an undo log page and x-latches it.
+@param[in] page_id page id
+@param[in,out] mtr mini-transaction
+@return pointer to page x-latched */
+UNIV_INLINE
+buf_block_t*
+trx_undo_page_get(const page_id_t page_id, mtr_t* mtr)
+{
+ buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+ return block;
+}
+
+/** Gets an undo log page and s-latches it.
+@param[in] page_id page id
+@param[in,out] mtr mini-transaction
+@return pointer to page s-latched */
+UNIV_INLINE
+buf_block_t*
+trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr)
+{
+ buf_block_t* block = buf_page_get(page_id, 0, RW_S_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ return block;
+}
+
+/** Determine the end offset of undo log records of an undo log page.
+@param[in] undo_page undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset
+@return end offset */
+inline
+uint16_t trx_undo_page_get_end(const buf_block_t *undo_page, uint32_t page_no,
+ uint16_t offset)
+{
+ if (page_no == undo_page->page.id().page_no())
+ if (uint16_t end = mach_read_from_2(TRX_UNDO_NEXT_LOG + offset +
+ undo_page->frame))
+ return end;
+
+ return mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
+ undo_page->frame);
+}
+
+/** Get the next record in an undo log.
+@param[in] undo_page undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@return undo log record, the page latched, NULL if none */
+inline trx_undo_rec_t*
+trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec,
+ uint32_t page_no, uint16_t offset)
+{
+ uint16_t end= trx_undo_page_get_end(undo_page, page_no, offset);
+ uint16_t next= mach_read_from_2(undo_page->frame + rec);
+ return next == end ? nullptr : undo_page->frame + next;
+}
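+
+/* Iteration sketch (illustrative, not part of the original file): undo
+records on a page are chained by 2-byte "next" offsets, so one page can
+be walked as follows, where first_rec is a hypothetical pointer to the
+first record:
+
+  for (trx_undo_rec_t *rec= first_rec; rec;
+       rec= trx_undo_page_get_next_rec(undo_page,
+                                       uint16_t(rec - undo_page->frame),
+                                       page_no, offset)) {
+    ...process the record at rec...
+  }
+*/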
diff --git a/storage/innobase/include/trx0xa.h b/storage/innobase/include/trx0xa.h
new file mode 100644
index 00000000..cb5d67cf
--- /dev/null
+++ b/storage/innobase/include/trx0xa.h
@@ -0,0 +1,61 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*
+ * Start of xa.h header
+ *
+ * Define a symbol to prevent multiple inclusions of this header file
+ */
+#ifndef XA_H
+#define XA_H
+
+#include "handler.h"
+
+/*
+ * Transaction branch identification: XID and NULLXID:
+ */
+#ifndef XIDDATASIZE
+
+/** Sizes of transaction identifier */
+#define XIDDATASIZE 128 /*!< maximum size of a transaction
+ identifier, in bytes */
+#define MAXGTRIDSIZE 64 /*!< maximum size in bytes of gtrid */
+#define MAXBQUALSIZE 64 /*!< maximum size in bytes of bqual */
+
+#endif
+/** X/Open XA distributed transaction status codes */
+/* @{ */
+#define XA_OK 0 /*!< normal execution */
+#define XAER_ASYNC -2 /*!< asynchronous operation already
+ outstanding */
+#define XAER_RMERR -3 /*!< a resource manager error
+ occurred in the transaction
+ branch */
+#define XAER_NOTA -4 /*!< the XID is not valid */
+#define XAER_INVAL -5 /*!< invalid arguments were given */
+#define XAER_PROTO -6 /*!< routine invoked in an improper
+ context */
+#define XAER_RMFAIL -7 /*!< resource manager unavailable */
+#define XAER_DUPID -8 /*!< the XID already exists */
+#define XAER_OUTSIDE -9 /*!< resource manager doing
+ work outside transaction */
+/* @} */
+#endif /* ifndef XA_H */
+/*
+ * End of xa.h header
+ */
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
new file mode 100644
index 00000000..6c68bf17
--- /dev/null
+++ b/storage/innobase/include/univ.i
@@ -0,0 +1,581 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***********************************************************************//**
+@file include/univ.i
+Version control for database, common definitions, and include files
+
+Created 1/20/1994 Heikki Tuuri
+****************************************************************************/
+
+#ifndef univ_i
+#define univ_i
+
+/* aux macros to convert M into "123" (string) if M is defined like
+#define M 123 */
+#define _IB_TO_STR(s) #s
+#define IB_TO_STR(s) _IB_TO_STR(s)
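+
+/* For example, given "#define M 123", IB_TO_STR(M) expands to "123"
+(the argument is macro-expanded before being stringized), while plain
+_IB_TO_STR(M) would expand to "M". */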
+
+/* The following is the InnoDB version as shown in
+SELECT plugin_version FROM information_schema.plugins;
+calculated in make_version_string() in sql/sql_show.cc like this:
+"version >> 8" . "version & 0xff"
+Because the version is shown with only one dot, we skip the last
+component, i.e. we show M.N.P as M.N. */
+#define INNODB_VERSION_SHORT \
+ (MYSQL_VERSION_MAJOR << 8 | MYSQL_VERSION_MINOR)
+
+#define INNODB_VERSION_STR \
+ IB_TO_STR(MYSQL_VERSION_MAJOR) "." \
+ IB_TO_STR(MYSQL_VERSION_MINOR) "." \
+ IB_TO_STR(MYSQL_VERSION_PATCH)
+
+/** How far ahead should we tell the service manager the timeout
+(time in seconds) */
+#define INNODB_EXTEND_TIMEOUT_INTERVAL 30
+
+#ifdef MYSQL_DYNAMIC_PLUGIN
+/* In the dynamic plugin, redefine some externally visible symbols
+in order not to conflict with the symbols of a builtin InnoDB. */
+
+/* Rename all C++ classes that contain virtual functions, because we
+have not figured out how to apply the visibility=hidden attribute to
+the virtual method table (vtable) in GCC 3. */
+# define ha_innobase ha_innodb
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+#if defined(_WIN32)
+# include <windows.h>
+#endif /* _WIN32 */
+
+/* Include a minimum number of SQL header files so that few changes
+made in SQL code cause a complete InnoDB rebuild. These headers are
+used throughout InnoDB but do not include too much themselves. They
+support cross-platform development and expose commonly used SQL names. */
+
+#include <my_global.h>
+#include "my_counter.h"
+#include <m_string.h>
+
+/* JAN: TODO: missing 5.7 header */
+#ifdef HAVE_MY_THREAD_H
+//# include <my_thread.h>
+#endif
+
+#ifndef UNIV_INNOCHECKSUM
+# include <mysqld_error.h>
+#endif /* !UNIV_INNOCHECKSUM */
+
+/* Include <sys/stat.h> to get S_I... macros defined for os0file.cc */
+#include <sys/stat.h>
+
+#ifndef _WIN32
+# include <sched.h>
+# include "my_config.h"
+#endif
+
+#include <stdint.h>
+#include <inttypes.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#include "my_pthread.h"
+
+/* Following defines are to enable performance schema
+instrumentation in each of five InnoDB modules if
+HAVE_PSI_INTERFACE is defined. */
+#ifdef HAVE_PSI_INTERFACE
+# define UNIV_PFS_MUTEX
+# define UNIV_PFS_RWLOCK
+# define UNIV_PFS_IO
+# define UNIV_PFS_THREAD
+
+# include "mysql/psi/psi.h" /* HAVE_PSI_MEMORY_INTERFACE */
+# ifdef HAVE_PSI_MEMORY_INTERFACE
+# define UNIV_PFS_MEMORY
+# endif /* HAVE_PSI_MEMORY_INTERFACE */
+
+/* There are mutexes/rwlocks that we want to exclude from
+instrumentation even if their corresponding performance schema
+define is set. And this PFS_NOT_INSTRUMENTED is used
+as the key value to identify those objects that would
+be excluded from instrumentation. */
+# define PFS_NOT_INSTRUMENTED ULINT32_UNDEFINED
+
+# define PFS_IS_INSTRUMENTED(key) ((key) != PFS_NOT_INSTRUMENTED)
+
+#ifdef HAVE_PFS_THREAD_PROVIDER_H
+/* For PSI_MUTEX_CALL() and similar. */
+#include "pfs_thread_provider.h"
+#endif
+
+#include "mysql/psi/mysql_thread.h"
+/* For PSI_FILE_CALL(). */
+#ifdef HAVE_PFS_FILE_PROVIDER_H
+#include "pfs_file_provider.h"
+#endif
+
+#include "mysql/psi/mysql_file.h"
+
+#endif /* HAVE_PSI_INTERFACE */
+
+#ifdef _WIN32
+# define YY_NO_UNISTD_H 1
+/* VC++ tries to optimise for size by default, from V8+. The size of
+the pointer to member depends on whether the type is defined before the
+compiler sees the type in the translation unit. This default behaviour
+can cause the pointer to be a different size in different translation
+units, depending on the above rule. We force the full-generality
+representation in all cases. This is used by ut0lst.h related code. */
+# pragma pointers_to_members(full_generality, multiple_inheritance)
+#endif /* _WIN32 */
+
+/* DEBUG VERSION CONTROL
+ ===================== */
+
+/* When this macro is defined then additional test functions will be
+compiled. These functions live at the end of each relevant source file
+and have "test_" prefix. These functions can be called from the end of
+innodb_init() or they can be called from gdb after srv_start() has executed
+using the call command. */
+/*
+#define UNIV_COMPILE_TEST_FUNCS
+#define UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
+#define UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
+#define UNIV_ENABLE_UNIT_TEST_DICT_STATS
+#define UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT
+*/
+
+#ifdef DBUG_OFF
+# undef UNIV_DEBUG
+#elif !defined UNIV_DEBUG
+# define UNIV_DEBUG
+#endif
+
+#if 0
+#define UNIV_DEBUG_PRINT /* Enable the compilation of
+ some debug print functions */
+#define UNIV_AHI_DEBUG /* Enable adaptive hash index
+ debugging without UNIV_DEBUG */
+#define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column
+ debugging without UNIV_DEBUG */
+#define UNIV_DEBUG_LOCK_VALIDATE /* Enable
+ ut_ad(lock_rec_validate_page())
+ assertions. */
+#define UNIV_LRU_DEBUG /* debug the buffer pool LRU */
+#define UNIV_HASH_DEBUG /* debug HASH_ macros */
+#define UNIV_IBUF_DEBUG /* debug the insert buffer */
+#define UNIV_PERF_DEBUG /* debug flag that enables
+ light weight performance
+ related stuff. */
+#define UNIV_SEARCH_PERF_STAT /* statistics for the
+ adaptive hash index */
+#define UNIV_SRV_PRINT_LATCH_WAITS /* enable diagnostic output
+ in sync0sync.cc */
+#define UNIV_BTR_PRINT /* enable functions for
+ printing B-trees */
+#define UNIV_ZIP_DEBUG /* extensive consistency checks
+ for compressed pages */
+#define UNIV_ZIP_COPY /* call page_zip_copy_recs()
+ more often */
+#define UNIV_AIO_DEBUG /* prints info about
+ submitted and reaped AIO
+ requests to the log. */
+#define UNIV_STATS_DEBUG /* prints various stats
+ related debug info from
+ dict0stats.c */
+#define FTS_INTERNAL_DIAG_PRINT /* FTS internal debugging
+ info output */
+#endif
+
+#define UNIV_BTR_DEBUG /* check B-tree links */
+#define UNIV_LIGHT_MEM_DEBUG /* light memory debugging */
+
+// #define UNIV_SQL_DEBUG
+
+/* Linkage specifier for non-static InnoDB symbols (variables and functions)
+that are only referenced from within InnoDB, not from MySQL. We disable the
+GCC visibility directive on all Sun operating systems because there is no
+easy way to get it to work. See http://bugs.mysql.com/bug.php?id=52263. */
+#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(sun) || defined(__INTEL_COMPILER)
+# define UNIV_INTERN __attribute__((visibility ("hidden")))
+#else
+# define UNIV_INTERN
+#endif
+
+#ifndef MY_ATTRIBUTE
+#if defined(__GNUC__)
+# define MY_ATTRIBUTE(A) __attribute__(A)
+#else
+# define MY_ATTRIBUTE(A)
+#endif
+#endif
+
+#define UNIV_INLINE static inline
+
+#define UNIV_WORD_SIZE SIZEOF_SIZE_T
+
+/** The following alignment is used in memory allocations in memory heap
+management to ensure correct alignment for doubles etc. */
+#define UNIV_MEM_ALIGNMENT 8U
+
+/*
+ DATABASE VERSION CONTROL
+ ========================
+*/
+
+#ifdef HAVE_LZO
+#define IF_LZO(A,B) A
+#else
+#define IF_LZO(A,B) B
+#endif
+
+#ifdef HAVE_LZ4
+#define IF_LZ4(A,B) A
+#else
+#define IF_LZ4(A,B) B
+#endif
+
+#ifdef HAVE_LZMA
+#define IF_LZMA(A,B) A
+#else
+#define IF_LZMA(A,B) B
+#endif
+
+#ifdef HAVE_BZIP2
+#define IF_BZIP2(A,B) A
+#else
+#define IF_BZIP2(A,B) B
+#endif
+
+#ifdef HAVE_SNAPPY
+#define IF_SNAPPY(A,B) A
+#else
+#define IF_SNAPPY(A,B) B
+#endif
+
+#if defined (HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) || defined(_WIN32)
+#define IF_PUNCH_HOLE(A,B) A
+#else
+#define IF_PUNCH_HOLE(A,B) B
+#endif
+
+/** log2 of smallest compressed page size (1<<10 == 1024 bytes)
+Note: This must never change! */
+#define UNIV_ZIP_SIZE_SHIFT_MIN 10U
+
+/** log2 of largest compressed page size (1<<14 == 16384 bytes).
+A compressed page directory entry reserves 14 bits for the start offset
+and 2 bits for flags. This limits the uncompressed page size to 16k.
+*/
+#define UNIV_ZIP_SIZE_SHIFT_MAX 14U
+
+/* Define the Min, Max, Default page sizes. */
+/** Minimum Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_MIN 12U
+/** log2 of largest page size (1<<16 == 65536 bytes). */
+/** Maximum Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_MAX 16U
+/** log2 of default page size (1<<14 == 16384 bytes). */
+/** Default Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_DEF 14U
+/** Original 16k InnoDB Page Size Shift, in case the default changes */
+#define UNIV_PAGE_SIZE_SHIFT_ORIG 14U
+/** Original 16k InnoDB Page Size as an ssize (log2 - 9) */
+#define UNIV_PAGE_SSIZE_ORIG (UNIV_PAGE_SIZE_SHIFT_ORIG - 9U)
+
+/** Minimum page size InnoDB currently supports. */
+#define UNIV_PAGE_SIZE_MIN (1U << UNIV_PAGE_SIZE_SHIFT_MIN)
+/** Maximum page size InnoDB currently supports. */
+#define UNIV_PAGE_SIZE_MAX (1U << UNIV_PAGE_SIZE_SHIFT_MAX)
+/** Default page size for InnoDB tablespaces. */
+#define UNIV_PAGE_SIZE_DEF (1U << UNIV_PAGE_SIZE_SHIFT_DEF)
+/** Original 16k page size for InnoDB tablespaces. */
+#define UNIV_PAGE_SIZE_ORIG (1U << UNIV_PAGE_SIZE_SHIFT_ORIG)
+
+/** Smallest compressed page size */
+#define UNIV_ZIP_SIZE_MIN (1U << UNIV_ZIP_SIZE_SHIFT_MIN)
+
+/** Largest compressed page size */
+#define UNIV_ZIP_SIZE_MAX (1U << UNIV_ZIP_SIZE_SHIFT_MAX)
+
+/** Largest possible ssize for an uncompressed page.
+(The convention 'ssize' is used for 'log2 minus 9' or the number of
+shifts starting with 512.)
+This max number varies depending on srv_page_size. */
+#define UNIV_PAGE_SSIZE_MAX \
+ ulint(srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1U)
+
+/** Smallest possible ssize for an uncompressed page. */
+#define UNIV_PAGE_SSIZE_MIN \
+ ulint(UNIV_PAGE_SIZE_SHIFT_MIN - UNIV_ZIP_SIZE_SHIFT_MIN + 1U)
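+
+/* Illustrative values: with the default 16KiB page size
+(srv_page_size_shift == 14), UNIV_PAGE_SSIZE_MAX == 14 - 10 + 1 == 5,
+while UNIV_PAGE_SSIZE_MIN == 12 - 10 + 1 == 3 regardless of the page
+size. */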
+
+/** Maximum number of parallel threads in a parallelized operation */
+#define UNIV_MAX_PARALLELISM 32
+
+/** This is the "mbmaxlen" for my_charset_filename (defined in
+strings/ctype-utf8.c), which is used to encode File and Database names. */
+#define FILENAME_CHARSET_MAXNAMLEN 5
+
+/** The maximum length of an encoded table name in bytes. The max
+table and database names are NAME_CHAR_LEN (64) characters. After the
+encoding, the max length would be NAME_CHAR_LEN (64) *
+FILENAME_CHARSET_MAXNAMLEN (5) = 320 bytes. The number does not include a
+terminating '\0'. InnoDB can handle longer names internally */
+#define MAX_TABLE_NAME_LEN 320
+
+/** The maximum length of a database name. Like MAX_TABLE_NAME_LEN, this is
+MySQL's NAME_LEN, see check_and_convert_db_name(). */
+#define MAX_DATABASE_NAME_LEN MAX_TABLE_NAME_LEN
+
+/** MAX_FULL_NAME_LEN defines the full name path including the
+database name and table name. In addition, 14 bytes is added for:
+ 2 for surrounding quotes around table name
+ 1 for the separating dot (.)
+ 9 for the #mysql50# prefix */
+#define MAX_FULL_NAME_LEN \
+ (MAX_TABLE_NAME_LEN + MAX_DATABASE_NAME_LEN + 14)
+
+/** Maximum length of the compression algorithm string. Currently we support
+only (NONE | ZLIB | LZ4). */
+#define MAX_COMPRESSION_LEN 4
+
+/** The maximum length in bytes that a database name can occupy when stored in
+UTF8, including the terminating '\0', see dict_fs2utf8(). You must include
+mysql_com.h if you are to use this macro. */
+#define MAX_DB_UTF8_LEN (NAME_LEN + 1)
+
+/** The maximum length in bytes that a table name can occupy when stored in
+UTF8, including the terminating '\0', see dict_fs2utf8(). You must include
+mysql_com.h if you are to use this macro. */
+#define MAX_TABLE_UTF8_LEN (NAME_LEN + sizeof(srv_mysql50_table_name_prefix))
+
+/*
+ UNIVERSAL TYPE DEFINITIONS
+ ==========================
+*/
+
+/** Unsigned octet of bits */
+typedef unsigned char byte;
+/** Machine-word-width unsigned integer */
+typedef size_t ulint;
+/** Machine-word-width signed integer */
+typedef ssize_t lint;
+
+/** ulint format for the printf() family of functions */
+#define ULINTPF "%zu"
+/** ulint hexadecimal format for the printf() family of functions */
+#define ULINTPFx "%zx"
+
+#ifdef _WIN32
+/* Use the integer types and formatting strings defined in Visual Studio. */
+# define UINT32PF "%u"
+# define INT64PF "%lld"
+# define UINT64scan "llu"
+# define UINT64PFx "%016llx"
+#elif defined __APPLE__
+/* Apple prefers to call the 64-bit types 'long long'
+in both 32-bit and 64-bit environments. */
+# define UINT32PF "%" PRIu32
+# define INT64PF "%lld"
+# define UINT64scan "llu"
+# define UINT64PFx "%016llx"
+#elif defined _AIX
+/* Workaround for macro expansion trouble */
+# define UINT32PF "%u"
+# define INT64PF "%lld"
+# define UINT64scan "lu"
+# define UINT64PFx "%016llx"
+#else
+/* Use the integer types and formatting strings defined in the C99 standard. */
+# define UINT32PF "%" PRIu32
+# define INT64PF "%" PRId64
+# define UINT64scan PRIu64
+# define UINT64PFx "%016" PRIx64
+#endif
+
+#ifdef UNIV_INNOCHECKSUM
+extern bool strict_verify;
+extern FILE* log_file;
+extern uint32_t cur_page_num;
+#endif /* UNIV_INNOCHECKSUM */
+
+typedef int64_t ib_int64_t;
+typedef uint64_t ib_uint64_t;
+typedef uint32_t ib_uint32_t;
+
+#define UINT64PF "%" UINT64scan
+#define IB_ID_FMT UINT64PF
+
+/** Log sequence number (also used for redo log byte arithmetics) */
+typedef ib_uint64_t lsn_t;
+
+/** The 'undefined' value for a ulint */
+#define ULINT_UNDEFINED ((ulint)(-1))
+
+/** The 'undefined' value for a ib_uint64_t */
+#define UINT64_UNDEFINED ((ib_uint64_t)(-1))
+
+/** The bitmask of 32-bit unsigned integer */
+#define ULINT32_MASK 0xFFFFFFFFU
+/** The undefined 32-bit unsigned integer */
+#define ULINT32_UNDEFINED ULINT32_MASK
+
+/** Maximum value for a ulint */
+#define ULINT_MAX ((ulint)(-2))
+
+/** Maximum value for ib_uint64_t */
+#define IB_UINT64_MAX ((ib_uint64_t) (~0ULL))
+
+/** The generic InnoDB system object identifier data type */
+typedef ib_uint64_t ib_id_t;
+#define IB_ID_MAX (~(ib_id_t) 0)
+#define IB_ID_FMT UINT64PF
+
+#ifndef UINTMAX_MAX
+#define UINTMAX_MAX IB_UINT64_MAX
+#endif
+/** This 'ibool' type is used within Innobase. Remember that different included
+headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
+#define ibool ulint
+
+#ifndef TRUE
+
+#define TRUE 1
+#define FALSE 0
+
+#endif
+
+#define UNIV_NOTHROW
+
+/** The following number, when used as the length of a logical field, means
+that the field has the SQL NULL as its value. NOTE that because we assume
+that the length of a field is a 32-bit integer when we store it, for
+example, to an undo log on disk, this number must also fit in 32 bits,
+even on 64-bit computers! */
+
+#define UNIV_SQL_NULL ULINT32_UNDEFINED
+
+/** Lengths which are not UNIV_SQL_NULL, but bigger than the following
+number indicate that a field contains a reference to an externally
+stored part of the field in the tablespace. The length field then
+contains the sum of the following flag and the locally stored len. */
+
+#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE_DEF)
+
+#if defined(__GNUC__)
+/* Tell the compiler that variable/function is unused. */
+# define UNIV_UNUSED MY_ATTRIBUTE ((unused))
+#else
+# define UNIV_UNUSED
+#endif /* CHECK FOR GCC VER_GT_2 */
+
+/* Some macros to improve branch prediction and reduce cache misses */
+#if defined(COMPILER_HINTS) && defined(__GNUC__)
+/* Tell the compiler that 'expr' probably evaluates to 'constant'. */
+# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant)
+/* Tell the compiler that a pointer is likely to be NULL */
+# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ptr) != 0, 0)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read. */
+# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read or written. */
+# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3)
+
+/* Sun Studio includes sun_prefetch.h as of version 5.9 */
+#elif (defined(__SUNPRO_C) || defined(__SUNPRO_CC))
+
+# include <sun_prefetch.h>
+
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+
+# if defined(COMPILER_HINTS)
+//# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr)
+# define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
+# else
+# define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) ((void) 0)
+# endif /* COMPILER_HINTS */
+
+# elif defined __WIN__ && defined COMPILER_HINTS
+# include <xmmintrin.h>
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+// __MM_HINT_T0 - (temporal data)
+// prefetch data into all levels of the cache hierarchy.
+# define UNIV_PREFETCH_R(addr) _mm_prefetch((char *) addr, _MM_HINT_T0)
+# define UNIV_PREFETCH_RW(addr) _mm_prefetch((char *) addr, _MM_HINT_T0)
+#else
+/* Dummy versions of the macros */
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+# define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) ((void) 0)
+#endif
+
+/* Tell the compiler that cond is likely to hold */
+#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE)
+/* Tell the compiler that cond is unlikely to hold */
+#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE)
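+
+/* Example (illustrative):
+
+  if (UNIV_UNLIKELY(block == NULL)) { ...rare error path... }
+
+hints the compiler that the error path is cold, so the common path can be
+laid out first. */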
+
+/* Compile-time constant of the given array's size. */
+#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+/* The return type from a thread's start function differs between Unix and
+Windows, so define a typedef for it and a macro to use at the end of such
+functions. */
+
+#ifdef _WIN32
+typedef DWORD os_thread_ret_t;
+# define OS_THREAD_DUMMY_RETURN return(0)
+# define OS_PATH_SEPARATOR '\\'
+# define OS_PATH_SEPARATOR_ALT '/'
+#else
+typedef void* os_thread_ret_t;
+# define OS_THREAD_DUMMY_RETURN return(NULL)
+# define OS_PATH_SEPARATOR '/'
+# define OS_PATH_SEPARATOR_ALT '\\'
+#endif
+
+#include <stdio.h>
+#include "db0err.h"
+#include "ut0dbg.h"
+#include "ut0lst.h"
+#include "ut0ut.h"
+#include "sync0types.h"
+
+extern ulong srv_page_size_shift;
+extern ulong srv_page_size;
+
+/* Dimension of the spatial objects we support so far. It has its root in
+myisam/sp_defs.h. We only support 2-dimensional data. */
+#define SPDIMS 2
+
+#endif
diff --git a/storage/innobase/include/ut0byte.h b/storage/innobase/include/ut0byte.h
new file mode 100644
index 00000000..1a428d73
--- /dev/null
+++ b/storage/innobase/include/ut0byte.h
@@ -0,0 +1,117 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0byte.h
+Utilities for byte operations
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0byte_h
+#define ut0byte_h
+
+#include "univ.i"
+
+/*******************************************************//**
+Creates a 64-bit integer out of two 32-bit integers.
+@return created integer */
+UNIV_INLINE
+ib_uint64_t
+ut_ull_create(
+/*==========*/
+ ulint high, /*!< in: high-order 32 bits */
+ ulint low) /*!< in: low-order 32 bits */
+ MY_ATTRIBUTE((const));
+
+/********************************************************//**
+Rounds a 64-bit integer downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number
+ which must be a power of 2 */
+/********************************************************//**
+Rounds ib_uint64_t upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number
+ which must be a power of 2 */
+/** Round down a pointer to the nearest aligned address.
+@param ptr pointer
+@param alignment a power of 2
+@return aligned pointer */
+static inline void *ut_align_down(void *ptr, size_t alignment)
+{
+ ut_ad(alignment > 0);
+ ut_ad(ut_is_2pow(alignment));
+ ut_ad(ptr);
+ static_assert(sizeof ptr == sizeof(size_t), "compatibility");
+
+ return reinterpret_cast<void*>(reinterpret_cast<size_t>(ptr) &
+ ~(alignment - 1));
+}
+
+static inline const void *ut_align_down(const void *ptr, size_t alignment)
+{
+ return ut_align_down(const_cast<void*>(ptr), alignment);
+}
+
+/** Compute the offset of a pointer from the nearest aligned address.
+@param ptr pointer
+@param alignment a power of 2
+@return distance from aligned pointer */
+inline size_t ut_align_offset(const void *ptr, size_t alignment)
+{
+ ut_ad(alignment > 0);
+ ut_ad(ut_is_2pow(alignment));
+ ut_ad(ptr);
+ return reinterpret_cast<size_t>(ptr) & (alignment - 1);
+}
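+
+/* Example (illustrative): for a pointer whose address is 0x1234 and
+alignment == 0x100, ut_align_down() yields 0x1200 and ut_align_offset()
+yields 0x34. */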
+
+/*****************************************************************//**
+Gets the nth bit of a ulint.
+@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n); /*!< in: nth bit requested */
+/*****************************************************************//**
+Sets the nth bit of a ulint.
+@return the ulint with the bit set as requested */
+UNIV_INLINE
+ulint
+ut_bit_set_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n, /*!< in: nth bit requested */
+ ibool val); /*!< in: value for the bit to set */
+
+#include "ut0byte.ic"
+
+#endif
diff --git a/storage/innobase/include/ut0byte.ic b/storage/innobase/include/ut0byte.ic
new file mode 100644
index 00000000..a4b5d4a7
--- /dev/null
+++ b/storage/innobase/include/ut0byte.ic
@@ -0,0 +1,109 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0byte.ic
+Utilities for byte operations
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+/*******************************************************//**
+Creates a 64-bit integer out of two 32-bit integers.
+@return created integer */
+UNIV_INLINE
+ib_uint64_t
+ut_ull_create(
+/*==========*/
+ ulint high, /*!< in: high-order 32 bits */
+ ulint low) /*!< in: low-order 32 bits */
+{
+ ut_ad(high <= ULINT32_MASK);
+ ut_ad(low <= ULINT32_MASK);
+ return(((ib_uint64_t) high) << 32 | low);
+}
+
+/********************************************************//**
+Rounds a 64-bit integer downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number
+ which must be a power of 2 */
+{
+ ut_ad(align_no > 0);
+ ut_ad(ut_is_2pow(align_no));
+
+ return(n & ~((ib_uint64_t) align_no - 1));
+}
+
+/********************************************************//**
+Rounds ib_uint64_t upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number
+ which must be a power of 2 */
+{
+ ib_uint64_t align_1 = (ib_uint64_t) align_no - 1;
+
+ ut_ad(align_no > 0);
+ ut_ad(ut_is_2pow(align_no));
+
+ return((n + align_1) & ~align_1);
+}
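+
+/* Example (illustrative): ut_uint64_align_down(1000, 256) == 768,
+ut_uint64_align_up(1000, 256) == 1024. */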
+
+/*****************************************************************//**
+Gets the nth bit of a ulint.
+@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n) /*!< in: nth bit requested */
+{
+ ut_ad(n < 8 * sizeof(ulint));
+ return(1 & (a >> n));
+}
+
+/*****************************************************************//**
+Sets the nth bit of a ulint.
+@return the ulint with the bit set as requested */
+UNIV_INLINE
+ulint
+ut_bit_set_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n, /*!< in: nth bit requested */
+ ibool val) /*!< in: value for the bit to set */
+{
+ ut_ad(n < 8 * sizeof(ulint));
+ if (val) {
+ return(((ulint) 1 << n) | a);
+ } else {
+ return(~((ulint) 1 << n) & a);
+ }
+}
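+
+/* Example (illustrative): for a == 10 (binary 1010), ut_bit_get_nth(a, 1)
+== 1, ut_bit_set_nth(a, 0, TRUE) == 11, ut_bit_set_nth(a, 1, FALSE) == 8. */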
diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h
new file mode 100644
index 00000000..646a5f36
--- /dev/null
+++ b/storage/innobase/include/ut0counter.h
@@ -0,0 +1,125 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ut0counter.h
+
+Counter utility class
+
+Created 2012/04/12 by Sunny Bains
+*******************************************************/
+
+#ifndef ut0counter_h
+#define ut0counter_h
+
+#include "os0thread.h"
+#include "my_rdtsc.h"
+
+/** CPU cache line size */
+#ifdef CPU_LEVEL1_DCACHE_LINESIZE
+# define CACHE_LINE_SIZE CPU_LEVEL1_DCACHE_LINESIZE
+#else
+# error CPU_LEVEL1_DCACHE_LINESIZE is undefined
+#endif /* CPU_LEVEL1_DCACHE_LINESIZE */
+
+/** Default number of slots to use in ib_counter_t */
+#define IB_N_SLOTS 64
+
+/** Use the result of my_timer_cycles(), which mainly uses RDTSC, as a
+random value. See the comments for my_timer_cycles().
+@return result from RDTSC or similar functions */
+static inline size_t
+get_rnd_value()
+{
+ size_t c = static_cast<size_t>(my_timer_cycles());
+
+ if (c != 0) {
+ return c;
+ }
+
+ /* We may get here if my_timer_cycles() returns 0,
+ so we need a plan B for the counter. */
+#if !defined(_WIN32)
+ return (size_t)os_thread_get_curr_id();
+#else
+ LARGE_INTEGER cnt;
+ QueryPerformanceCounter(&cnt);
+
+ return static_cast<size_t>(cnt.QuadPart);
+#endif /* !_WIN32 */
+}
+
+/** Class for using fuzzy counters. The counter is multi-instance and uses
+relaxed atomics, so the results are not guaranteed to be 100% accurate,
+but they are close enough. Creates an array of counters and separates
+each element by CACHE_LINE_SIZE bytes. */
+template <typename Type, int N = IB_N_SLOTS>
+struct ib_counter_t {
+ /** Increment the counter by 1. */
+ void inc() { add(1); }
+
+ /** Increment the counter by 1.
+ @param[in] index a reasonably thread-unique identifier */
+ void inc(size_t index) { add(index, 1); }
+
+ /** Add to the counter.
+ @param[in] n amount to be added */
+ void add(Type n) { add(get_rnd_value(), n); }
+
+ /** Add to the counter.
+ @param[in] index a reasonably thread-unique identifier
+ @param[in] n amount to be added */
+ void add(size_t index, Type n) {
+ index = index % N;
+
+ ut_ad(index < UT_ARR_SIZE(m_counter));
+
+ m_counter[index].value.fetch_add(n, std::memory_order_relaxed);
+ }
+
+ /** @return total value; not 100% accurate, since it uses relaxed atomics */
+ operator Type() const {
+ Type total = 0;
+
+ for (const auto &counter : m_counter) {
+ total += counter.value.load(std::memory_order_relaxed);
+ }
+
+ return(total);
+ }
+
+private:
+ /** Atomic which occupies whole CPU cache line.
+ Note: We rely on the default constructor of std::atomic and
+ do not explicitly initialize the contents. This works for us,
+ because ib_counter_t is only intended for usage with global
+ memory that is allocated from the .bss and thus guaranteed to
+ be zero-initialized by the run-time environment.
+ @see srv_stats
+ @see rw_lock_stats */
+ struct ib_counter_element_t {
+ MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<Type> value;
+ };
+ static_assert(sizeof(ib_counter_element_t) == CACHE_LINE_SIZE, "");
+
+ /** Array of counter elements */
+ MY_ALIGNED(CACHE_LINE_SIZE) ib_counter_element_t m_counter[N];
+};
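+
+/* Usage sketch (illustrative; "n_reads" is a hypothetical counter that
+would live in zero-initialized global memory):
+
+  ib_counter_t<ulint> n_reads;
+  n_reads.inc();              // slot picked via get_rnd_value()
+  n_reads.add(17, 10);        // or pick the slot index explicitly
+  ulint total = n_reads;      // relaxed sum over all slots
+*/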
+
+#endif /* ut0counter_h */
diff --git a/storage/innobase/include/ut0crc32.h b/storage/innobase/include/ut0crc32.h
new file mode 100644
index 00000000..0cbccb97
--- /dev/null
+++ b/storage/innobase/include/ut0crc32.h
@@ -0,0 +1,37 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ut0crc32.h
+CRC32 implementation
+
+Created Aug 10, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef ut0crc32_h
+#define ut0crc32_h
+
+#include "univ.i"
+#include <my_sys.h>
+static inline uint32_t ut_crc32(const byte *s, size_t size)
+{
+ return my_crc32c(0, s, size);
+}
+
+#endif /* ut0crc32_h */
diff --git a/storage/innobase/include/ut0dbg.h b/storage/innobase/include/ut0dbg.h
new file mode 100644
index 00000000..85856660
--- /dev/null
+++ b/storage/innobase/include/ut0dbg.h
@@ -0,0 +1,179 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*****************************************************************//**
+@file include/ut0dbg.h
+Debug utilities for Innobase
+
+Created 1/30/1994 Heikki Tuuri
+**********************************************************************/
+
+#ifndef ut0dbg_h
+#define ut0dbg_h
+
+#ifdef UNIV_INNOCHECKSUM
+#define ut_a assert
+#define ut_ad assert
+#define ut_error assert(0)
+#else /* !UNIV_INNOCHECKSUM */
+
+/* Do not include univ.i because univ.i includes this. */
+
+/*************************************************************//**
+Report a failed assertion. */
+ATTRIBUTE_NORETURN ATTRIBUTE_COLD __attribute__((nonnull(2)))
+void
+ut_dbg_assertion_failed(
+/*====================*/
+ const char* expr, /*!< in: the failed assertion */
+ const char* file, /*!< in: source file containing the assertion */
+ unsigned line); /*!< in: line number of the assertion */
+
+/** Abort execution if EXPR does not evaluate to nonzero.
+@param EXPR assertion expression that should hold */
+#define ut_a(EXPR) do { \
+ if (UNIV_UNLIKELY(!(ulint) (EXPR))) { \
+ ut_dbg_assertion_failed(#EXPR, \
+ __FILE__, __LINE__); \
+ } \
+} while (0)
+
+/** Abort execution. */
+#define ut_error \
+ ut_dbg_assertion_failed(0, __FILE__, __LINE__)
+
+/** Debug assertion */
+#define ut_ad DBUG_SLOW_ASSERT
+#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
+/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_d(EXPR) EXPR
+#else
+/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_d(EXPR)
+#endif
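+
+/* Example (illustrative; "n_checked" is a hypothetical variable):
+
+  ut_d(ulint n_checked = 0);   // exists only in debug builds
+  ut_d(n_checked++);
+  ut_ad(n_checked < 100);      // asserted only in debug builds
+*/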
+
+#if defined(HAVE_SYS_TIME_H) && defined(HAVE_SYS_RESOURCE_H)
+
+#define HAVE_UT_CHRONO_T
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/** A "chronometer" used to clock snippets of code.
+Example usage:
+ ut_chrono_t ch("this loop");
+ for (;;) { ... }
+ ch.show();
+would print the timings of the for() loop, prefixed with "this loop:" */
+class ut_chrono_t {
+public:
+ /** Constructor.
+ @param[in] name chrono's name, used when showing the values */
+ ut_chrono_t(
+ const char* name)
+ :
+ m_name(name),
+ m_show_from_destructor(true)
+ {
+ reset();
+ }
+
+ /** Resets the chrono (records the current time in it). */
+ void
+ reset()
+ {
+ gettimeofday(&m_tv, NULL);
+
+ getrusage(RUSAGE_SELF, &m_ru);
+ }
+
+ /** Shows the time elapsed and usage statistics since the last reset. */
+ void
+ show()
+ {
+ struct rusage ru_now;
+ struct timeval tv_now;
+ struct timeval tv_diff;
+
+ getrusage(RUSAGE_SELF, &ru_now);
+
+ gettimeofday(&tv_now, NULL);
+
+#ifndef timersub
+#define timersub(a, b, r) \
+ do { \
+ (r)->tv_sec = (a)->tv_sec - (b)->tv_sec; \
+ (r)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
+ if ((r)->tv_usec < 0) { \
+ (r)->tv_sec--; \
+ (r)->tv_usec += 1000000; \
+ } \
+ } while (0)
+#endif /* timersub */
+
+#define CHRONO_PRINT(type, tvp) \
+ fprintf(stderr, "%s: %s% 5ld.%06ld sec\n", \
+ m_name, type, \
+ static_cast<long>((tvp)->tv_sec), \
+ static_cast<long>((tvp)->tv_usec))
+
+ timersub(&tv_now, &m_tv, &tv_diff);
+ CHRONO_PRINT("real", &tv_diff);
+
+ timersub(&ru_now.ru_utime, &m_ru.ru_utime, &tv_diff);
+ CHRONO_PRINT("user", &tv_diff);
+
+ timersub(&ru_now.ru_stime, &m_ru.ru_stime, &tv_diff);
+ CHRONO_PRINT("sys ", &tv_diff);
+ }
+
+ /** Cause the timings not to be printed from the destructor. */
+ void end()
+ {
+ m_show_from_destructor = false;
+ }
+
+ /** Destructor. */
+ ~ut_chrono_t()
+ {
+ if (m_show_from_destructor) {
+ show();
+ }
+ }
+
+private:
+ /** Name of this chronometer. */
+ const char* m_name;
+
+ /** True if the current timings should be printed by the destructor. */
+ bool m_show_from_destructor;
+
+ /** getrusage() result as of the last reset(). */
+ struct rusage m_ru;
+
+ /** gettimeofday() result as of the last reset(). */
+ struct timeval m_tv;
+};
+
+#endif /* HAVE_SYS_TIME_H && HAVE_SYS_RESOURCE_H */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h
new file mode 100644
index 00000000..7e27e108
--- /dev/null
+++ b/storage/innobase/include/ut0list.h
@@ -0,0 +1,146 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.h
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A doubly linked list. This differs from the one in ut0lst.h in that here
+each list node contains a pointer to the data, whereas in ut0lst.h the
+list pointers are embedded in the data items themselves.
+
+Use this one when you need to store arbitrary data in the list and cannot
+embed the list pointers in the data, when a data item needs to be stored
+in multiple lists, etc.
+
+Note about the memory management: ib_list_t is a fixed-size struct whose
+allocation/deallocation is done through ib_list_create/ib_list_free, but the
+memory for the list nodes is allocated through a user-given memory heap,
+which can either be the same for all nodes or vary per node. Most users will
+probably want to create a memory heap to store the item-specific data, and
+pass in this same heap to the list node creation functions, thus
+automatically freeing the list node when the item's heap is freed.
+
+************************************************************************/
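+
+/* Example usage (a minimal sketch; "my_item_t" is a hypothetical item
+type, allocated here from the same heap as the list nodes):
+
+	mem_heap_t*	heap = mem_heap_create(100);
+	ib_list_t*	list = ib_list_create();
+	my_item_t*	item = static_cast<my_item_t*>(
+		mem_heap_alloc(heap, sizeof(my_item_t)));
+
+	ib_list_add_last(list, item, heap);
+
+	ib_list_free(list);	// frees only the ib_list_t struct
+	mem_heap_free(heap);	// frees the list nodes and the item data
+*/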
+
+#ifndef IB_LIST_H
+#define IB_LIST_H
+
+#include "mem0mem.h"
+
+struct ib_list_t;
+struct ib_list_node_t;
+
+/****************************************************************//**
+Create a new list using mem_alloc. Lists created with this function must be
+freed with ib_list_free.
+@return list */
+ib_list_t*
+ib_list_create(void);
+/*=================*/
+
+/****************************************************************//**
+Free a list. */
+void
+ib_list_free(
+/*=========*/
+ ib_list_t* list); /*!< in: list */
+
+/****************************************************************//**
+Add the data to the end of the list.
+@return new list node */
+ib_list_node_t*
+ib_list_add_last(
+/*=============*/
+ ib_list_t* list, /*!< in: list */
+ void* data, /*!< in: data */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Remove the node from the list. */
+void
+ib_list_remove(
+/*===========*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* node); /*!< in: node to remove */
+
+/****************************************************************//**
+Get the first node in the list.
+@return first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+ ib_list_t* list); /*!< in: list */
+
+/****************************************************************//**
+Get the last node in the list.
+@return last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+ ib_list_t* list); /*!< in: list */
+
+/********************************************************************//**
+Check if list is empty.
+@return TRUE if empty, FALSE otherwise */
+UNIV_INLINE
+ibool
+ib_list_is_empty(
+/*=============*/
+	const ib_list_t*	list);	/*!< in: list */
+
+/********************************************************************//**
+Get number of items on list.
+@return number of items on list */
+UNIV_INLINE
+ulint
+ib_list_len(
+/*========*/
+	const ib_list_t*	list);	/*!< in: list */
+
+/* List. */
+struct ib_list_t {
+ ib_list_node_t* first; /*!< first node */
+ ib_list_node_t* last; /*!< last node */
+};
+
+/* A list node. */
+struct ib_list_node_t {
+ ib_list_node_t* prev; /*!< previous node */
+ ib_list_node_t* next; /*!< next node */
+ void* data; /*!< user data */
+};
+
+/* Quite often, the only additional piece of data you need is the per-item
+memory heap, so we have this generic struct available to use in those
+cases. */
+struct ib_list_helper_t {
+ mem_heap_t* heap; /*!< memory heap */
+ void* data; /*!< user data */
+};
+
+#include "ut0list.ic"
+
+#endif
diff --git a/storage/innobase/include/ut0list.ic b/storage/innobase/include/ut0list.ic
new file mode 100644
index 00000000..3bdba52b
--- /dev/null
+++ b/storage/innobase/include/ut0list.ic
@@ -0,0 +1,80 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.ic
+A doubly linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Get the first node in the list.
+@return first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+ ib_list_t* list) /*!< in: list */
+{
+ return(list->first);
+}
+
+/****************************************************************//**
+Get the last node in the list.
+@return last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+ ib_list_t* list) /*!< in: list */
+{
+ return(list->last);
+}
+
+/********************************************************************//**
+Check if list is empty.
+@return TRUE if empty, FALSE otherwise */
+UNIV_INLINE
+ibool
+ib_list_is_empty(
+/*=============*/
+	const ib_list_t*	list)	/*!< in: list */
+{
+ return(!(list->first || list->last));
+}
+
+/********************************************************************//**
+Get number of items on list.
+@return number of items on list */
+UNIV_INLINE
+ulint
+ib_list_len(
+/*========*/
+	const ib_list_t*	list)	/*!< in: list */
+{
+ ulint len = 0;
+ ib_list_node_t* node = list->first;
+
+	while (node) {
+		len++;
+		node = node->next;
+	}
+
+	return(len);
+}
diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h
new file mode 100644
index 00000000..9a5f3059
--- /dev/null
+++ b/storage/innobase/include/ut0lst.h
@@ -0,0 +1,568 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0lst.h
+List utilities
+
+Created 9/10/1995 Heikki Tuuri
+Rewritten by Sunny Bains Dec 2011.
+***********************************************************************/
+
+#ifndef ut0lst_h
+#define ut0lst_h
+
+/* Do not include univ.i because univ.i includes this. */
+
+#include "ut0dbg.h"
+
+/* This module implements the two-way linear list. Note that a single
+list node may belong to two or more lists, but is only on one list
+at a time. */
+
+/*******************************************************************//**
+The two-way list node.
+@param Type the type of the list element */
+template <typename Type>
+struct ut_list_node {
+ Type* prev; /*!< pointer to the previous
+ node, NULL if start of list */
+ Type* next; /*!< pointer to next node,
+ NULL if end of list */
+
+ void reverse()
+ {
+ Type* tmp = prev;
+ prev = next;
+ next = tmp;
+ }
+};
+
+/** Macro used for legacy reasons */
+#define UT_LIST_NODE_T(t) ut_list_node<t>
+
+/*******************************************************************//**
+The two-way list base node. The base node contains pointers to both ends
+of the list and a count of nodes in the list (excluding the base node
+from the count). We also store a pointer to the member field so that it
+doesn't have to be specified when doing list operations.
+@param Type the type of the list element
+@param NodePtr field member pointer that points to the list node */
+template <typename Type, typename NodePtr>
+struct ut_list_base {
+ typedef Type elem_type;
+ typedef NodePtr node_ptr;
+ typedef ut_list_node<Type> node_type;
+
+ ulint count; /*!< count of nodes in list */
+ elem_type* start; /*!< pointer to list start,
+ NULL if empty */
+ elem_type* end; /*!< pointer to list end,
+ NULL if empty */
+ node_ptr node; /*!< Pointer to member field
+ that is used as a link node */
+#ifdef UNIV_DEBUG
+ ulint init; /*!< UT_LIST_INITIALISED if
+ the list was initialised with
+ UT_LIST_INIT() */
+#endif /* UNIV_DEBUG */
+
+ void reverse()
+ {
+ Type* tmp = start;
+ start = end;
+ end = tmp;
+ }
+};
+
+#define UT_LIST_BASE_NODE_T(t) ut_list_base<t, ut_list_node<t> t::*>
+
+#ifdef UNIV_DEBUG
+# define UT_LIST_INITIALISED 0xCAFE
+# define UT_LIST_INITIALISE(b) (b).init = UT_LIST_INITIALISED
+# define UT_LIST_IS_INITIALISED(b) ut_a(((b).init == UT_LIST_INITIALISED))
+#else
+# define UT_LIST_INITIALISE(b)
+# define UT_LIST_IS_INITIALISED(b)
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Initializes the base node of a two-way list.
+Note: This is really the list constructor. We should be able to use
+placement new here.
+@param b the list base node
+@param pmf pointer to the member field that will be used as the link node */
+#define UT_LIST_INIT(b, pmf) \
+{ \
+ (b).count = 0; \
+ (b).start = 0; \
+ (b).end = 0; \
+ (b).node = pmf; \
+ UT_LIST_INITIALISE(b); \
+}
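+
+/* Example usage (a minimal sketch; "elem_t" and its "node" member are
+illustrative, and the access macros used here are defined below):
+
+	struct elem_t {
+		int			value;
+		UT_LIST_NODE_T(elem_t)	node;	// embedded link node
+	};
+
+	UT_LIST_BASE_NODE_T(elem_t)	base;
+	UT_LIST_INIT(base, &elem_t::node);
+
+	UT_LIST_ADD_LAST(base, elem);	// elem is an elem_t*
+
+	for (elem_t* e = UT_LIST_GET_FIRST(base);
+	     e != NULL;
+	     e = UT_LIST_GET_NEXT(node, e)) {
+		// visit e
+	}
+*/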
+
+/** Functor for accessing the embedded node within a list element. This is
+required because some lists can have the node embedded inside a nested
+struct/union. See lock0priv.h (table locks) for an example. It provides a
+specialised functor to grant access to the list node. */
+template <typename Type>
+struct GenericGetNode {
+
+ typedef ut_list_node<Type> node_type;
+
+ GenericGetNode(node_type Type::* node) : m_node(node) {}
+
+ node_type& operator() (Type& elem)
+ {
+ return(elem.*m_node);
+ }
+
+ node_type Type::*m_node;
+};
+
+/*******************************************************************//**
+Adds the node as the first element in a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem the element to add */
+template <typename List>
+void
+ut_list_prepend(
+ List& list,
+ typename List::elem_type* elem)
+{
+ typename List::node_type& elem_node = elem->*list.node;
+
+ UT_LIST_IS_INITIALISED(list);
+
+ elem_node.prev = 0;
+ elem_node.next = list.start;
+
+ if (list.start != 0) {
+ typename List::node_type& base_node =
+ list.start->*list.node;
+
+ ut_ad(list.start != elem);
+
+ base_node.prev = elem;
+ }
+
+ list.start = elem;
+
+ if (list.end == 0) {
+ list.end = elem;
+ }
+
+ ++list.count;
+}
+
+/*******************************************************************//**
+Adds the node as the first element in a two-way linked list.
+@param LIST the base node (not a pointer to it)
+@param ELEM the element to add */
+#define UT_LIST_ADD_FIRST(LIST, ELEM) ut_list_prepend(LIST, ELEM)
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param list list
+@param elem the element to add
+@param get_node to get the list node for that element */
+template <typename List, typename Functor>
+void
+ut_list_append(
+ List& list,
+ typename List::elem_type* elem,
+ Functor get_node)
+{
+ typename List::node_type& node = get_node(*elem);
+
+ UT_LIST_IS_INITIALISED(list);
+
+ node.next = 0;
+ node.prev = list.end;
+
+ if (list.end != 0) {
+ typename List::node_type& base_node = get_node(*list.end);
+
+ ut_ad(list.end != elem);
+
+ base_node.next = elem;
+ }
+
+ list.end = elem;
+
+ if (list.start == 0) {
+ list.start = elem;
+ }
+
+ ++list.count;
+}
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param list list
+@param elem the element to add */
+template <typename List>
+void
+ut_list_append(
+ List& list,
+ typename List::elem_type* elem)
+{
+ ut_list_append(
+ list, elem,
+ GenericGetNode<typename List::elem_type>(list.node));
+}
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param LIST list base node (not a pointer to it)
+@param ELEM the element to add */
+#define UT_LIST_ADD_LAST(LIST, ELEM) ut_list_append(LIST, ELEM)
+
+/*******************************************************************//**
+Inserts ELEM2 after ELEM1 in a list.
+@param list the base node
+@param elem1 node after which ELEM2 is inserted
+@param elem2 node being inserted after ELEM1 */
+template <typename List>
+void
+ut_list_insert(
+ List& list,
+ typename List::elem_type* elem1,
+ typename List::elem_type* elem2)
+{
+ ut_ad(elem1 != elem2);
+ UT_LIST_IS_INITIALISED(list);
+
+ typename List::node_type& elem1_node = elem1->*list.node;
+ typename List::node_type& elem2_node = elem2->*list.node;
+
+ elem2_node.prev = elem1;
+ elem2_node.next = elem1_node.next;
+
+ if (elem1_node.next != NULL) {
+ typename List::node_type& next_node =
+ elem1_node.next->*list.node;
+
+ next_node.prev = elem2;
+ }
+
+ elem1_node.next = elem2;
+
+ if (list.end == elem1) {
+ list.end = elem2;
+ }
+
+ ++list.count;
+}
+
+/*******************************************************************//**
+Inserts ELEM2 after ELEM1 in a list.
+@param LIST list base node (not a pointer to it)
+@param ELEM1 node after which ELEM2 is inserted
+@param ELEM2 node being inserted after ELEM1 */
+#define UT_LIST_INSERT_AFTER(LIST, ELEM1, ELEM2) \
+ ut_list_insert(LIST, ELEM1, ELEM2)
+
+/*******************************************************************//**
+Inserts ELEM2 after ELEM1 in a list.
+@param list the base node
+@param elem1 node after which ELEM2 is inserted
+@param elem2 node being inserted after ELEM1
+@param get_node to get the list node for that element */
+
+template <typename List, typename Functor>
+void
+ut_list_insert(
+ List& list,
+ typename List::elem_type* elem1,
+ typename List::elem_type* elem2,
+ Functor get_node)
+{
+ ut_ad(elem1 != elem2);
+ UT_LIST_IS_INITIALISED(list);
+
+ typename List::node_type& elem1_node = get_node(*elem1);
+ typename List::node_type& elem2_node = get_node(*elem2);
+
+ elem2_node.prev = elem1;
+ elem2_node.next = elem1_node.next;
+
+ if (elem1_node.next != NULL) {
+ typename List::node_type& next_node =
+ get_node(*elem1_node.next);
+
+ next_node.prev = elem2;
+ }
+
+ elem1_node.next = elem2;
+
+ if (list.end == elem1) {
+ list.end = elem2;
+ }
+
+ ++list.count;
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param node member node within list element that is to be removed
+@param get_node functor to get the list node from elem */
+template <typename List, typename Functor>
+void
+ut_list_remove(
+ List& list,
+ typename List::node_type& node,
+ Functor get_node)
+{
+ ut_a(list.count > 0);
+ UT_LIST_IS_INITIALISED(list);
+
+ if (node.next != NULL) {
+ typename List::node_type& next_node =
+ get_node(*node.next);
+
+ next_node.prev = node.prev;
+ } else {
+ list.end = node.prev;
+ }
+
+ if (node.prev != NULL) {
+ typename List::node_type& prev_node =
+ get_node(*node.prev);
+
+ prev_node.next = node.next;
+ } else {
+ list.start = node.next;
+ }
+
+ node.next = 0;
+ node.prev = 0;
+
+ --list.count;
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem element to be removed from the list
+@param get_node functor to get the list node from elem */
+template <typename List, typename Functor>
+void
+ut_list_remove(
+ List& list,
+ typename List::elem_type* elem,
+ Functor get_node)
+{
+ ut_list_remove(list, get_node(*elem), get_node);
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem element to be removed from the list */
+template <typename List>
+void
+ut_list_remove(
+ List& list,
+ typename List::elem_type* elem)
+{
+ ut_list_remove(
+ list, elem->*list.node,
+ GenericGetNode<typename List::elem_type>(list.node));
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param LIST the base node (not a pointer to it)
+@param ELEM node to be removed from the list */
+#define UT_LIST_REMOVE(LIST, ELEM) ut_list_remove(LIST, ELEM)
+
+/********************************************************************//**
+Gets the next node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the successor of N in NAME, or NULL */
+#define UT_LIST_GET_NEXT(NAME, N) (((N)->NAME).next)
+
+/********************************************************************//**
+Gets the previous node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the predecessor of N in NAME, or NULL */
+#define UT_LIST_GET_PREV(NAME, N) (((N)->NAME).prev)
+
+/********************************************************************//**
+Alternative macro to get the number of nodes in a two-way list, i.e.,
+its length.
+@param BASE the base node (not a pointer to it).
+@return the number of nodes in the list */
+#define UT_LIST_GET_LEN(BASE) (BASE).count
+
+/********************************************************************//**
+Gets the first node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return first node, or NULL if the list is empty */
+#define UT_LIST_GET_FIRST(BASE) (BASE).start
+
+/********************************************************************//**
+Gets the last node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return last node, or NULL if the list is empty */
+#define UT_LIST_GET_LAST(BASE) (BASE).end
+
+struct NullValidate { void operator()(const void*) const {} };
+
+/** Iterate over all the elements and call the functor for each element.
+@param[in] list base node (not a pointer to it)
+@param[in,out] functor Functor that is called for each element in the list */
+template <typename List, class Functor>
+inline void ut_list_map(const List& list, Functor& functor)
+{
+ ulint count = 0;
+
+ UT_LIST_IS_INITIALISED(list);
+
+ for (typename List::elem_type* elem = list.start; elem;
+ elem = (elem->*list.node).next, ++count) {
+
+ functor(elem);
+ }
+
+ ut_a(count == list.count);
+}
+
+/** Iterate over all the elements and call the functor for each element.
+@param[in] list base node (not a pointer to it)
+@param[in] functor Functor that is called for each element in the list */
+template <typename List, class Functor>
+inline void ut_list_map(const List& list, const Functor& functor)
+{
+ ulint count = 0;
+
+ UT_LIST_IS_INITIALISED(list);
+
+ for (typename List::elem_type* elem = list.start; elem;
+ elem = (elem->*list.node).next, ++count) {
+
+ functor(elem);
+ }
+
+ ut_a(count == list.count);
+}
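+
+/* Example (sketch): visiting every element with a functor; "Printer"
+and "elem_t" are illustrative names:
+
+	struct Printer {
+		void operator()(const elem_t* e) const
+		{
+			// inspect *e here
+		}
+	};
+
+	ut_list_map(base, Printer());
+*/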
+
+/** Check the consistency of a doubly linked list.
+@param[in] list base node (not a pointer to it)
+@param[in,out] functor Functor that is called for each element in the list */
+template <typename List, class Functor>
+void ut_list_validate(const List& list, Functor& functor)
+{
+ ut_list_map(list, functor);
+
+ /* Validate the list backwards. */
+ ulint count = 0;
+
+ for (typename List::elem_type* elem = list.end;
+ elem != 0;
+ elem = (elem->*list.node).prev) {
+ ++count;
+ }
+
+ ut_a(count == list.count);
+}
+
+/** Check the consistency of a doubly linked list.
+@param[in] list base node (not a pointer to it)
+@param[in] functor Functor that is called for each element in the list */
+template <typename List, class Functor>
+inline void ut_list_validate(const List& list, const Functor& functor)
+{
+ ut_list_map(list, functor);
+
+ /* Validate the list backwards. */
+ ulint count = 0;
+
+ for (typename List::elem_type* elem = list.end;
+ elem != 0;
+ elem = (elem->*list.node).prev) {
+ ++count;
+ }
+
+ ut_a(count == list.count);
+}
+
+template <typename List>
+inline void ut_list_validate(const List& list)
+{
+ ut_list_validate(list, NullValidate());
+}
+
+#ifdef UNIV_DEBUG
+template <typename List>
+inline void ut_list_reverse(List& list)
+{
+ UT_LIST_IS_INITIALISED(list);
+
+ for (typename List::elem_type* elem = list.start;
+ elem != 0;
+ elem = (elem->*list.node).prev) {
+ (elem->*list.node).reverse();
+ }
+
+ list.reverse();
+}
+
+/** Check if the given element exists in the list.
+@param[in,out] list the list object
+@param[in] elem the element of the list which will be checked */
+template <typename List>
+inline bool ut_list_exists(const List& list, typename List::elem_type* elem)
+{
+ for (typename List::elem_type* e1 = UT_LIST_GET_FIRST(list); e1;
+ e1 = (e1->*list.node).next) {
+ if (elem == e1) {
+ return true;
+ }
+ }
+ return false;
+}
+#endif
+
+/** Move the given element to the beginning of the list.
+@param[in,out] list the list object
+@param[in] elem the element of the list which will be moved
+ to the beginning of the list. */
+template <typename List>
+void
+ut_list_move_to_front(
+ List& list,
+ typename List::elem_type* elem)
+{
+ ut_ad(ut_list_exists(list, elem));
+
+ if (UT_LIST_GET_FIRST(list) != elem) {
+ ut_list_remove(list, elem);
+ ut_list_prepend(list, elem);
+ }
+}
+
+#endif /* ut0lst.h */
diff --git a/storage/innobase/include/ut0mem.h b/storage/innobase/include/ut0mem.h
new file mode 100644
index 00000000..2fc864d4
--- /dev/null
+++ b/storage/innobase/include/ut0mem.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.h
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef ut0mem_h
+#define ut0mem_h
+
+#include "univ.i"
+
+/********************************************************************
+Concatenate 3 strings.*/
+char*
+ut_str3cat(
+/*=======*/
+ /* out, own: concatenated string, must be
+ freed with ut_free() */
+ const char* s1, /* in: string 1 */
+ const char* s2, /* in: string 2 */
+ const char* s3); /* in: string 3 */
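+
+/* Example (sketch): the result is heap-allocated and owned by the caller:
+
+	char*	s = ut_str3cat("ib", "_", "buffer");	// "ib_buffer"
+	ut_free(s);
+*/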
+
+/**********************************************************************//**
+Converts raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex"; make sure "hex_size" is at
+least (2 * raw_size + 1) to avoid truncation. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+ const void* raw, /*!< in: raw data */
+ ulint raw_size, /*!< in: "raw" length in bytes */
+ char* hex, /*!< out: hex string */
+ ulint hex_size); /*!< in: "hex" size in bytes */
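+
+/* Example (sketch): 4 raw bytes need a buffer of 2 * 4 + 1 = 9 bytes to
+avoid truncation:
+
+	const unsigned char	raw[4] = {0xDE, 0xAD, 0xBE, 0xEF};
+	char			hex[9];
+
+	ut_raw_to_hex(raw, sizeof(raw), hex, sizeof(hex));
+	// hex now contains "DEADBEEF"; the return value is 9 (8 + NUL)
+*/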
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+ const char* str, /*!< in: string */
+ ulint str_len, /*!< in: string length in bytes */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size); /*!< in: output buffer size
+ in bytes */
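+
+/* Example (sketch): quotes in the input are doubled and the result is
+wrapped in single quotes:
+
+	char	buf[32];
+
+	ut_str_sql_format("abc'def", 7, buf, sizeof(buf));
+	// buf now contains 'abc''def' (10 chars); the return value is 11
+*/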
+
+#include "ut0mem.ic"
+
+#endif
diff --git a/storage/innobase/include/ut0mem.ic b/storage/innobase/include/ut0mem.ic
new file mode 100644
index 00000000..cc95a036
--- /dev/null
+++ b/storage/innobase/include/ut0mem.ic
@@ -0,0 +1,246 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.ic
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#include "ut0byte.h"
+#include "mach0data.h"
+
+/**********************************************************************//**
+Converts raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex"; make sure "hex_size" is at
+least (2 * raw_size + 1) to avoid truncation. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+ const void* raw, /*!< in: raw data */
+ ulint raw_size, /*!< in: "raw" length in bytes */
+ char* hex, /*!< out: hex string */
+ ulint hex_size) /*!< in: "hex" size in bytes */
+{
+
+#ifdef WORDS_BIGENDIAN
+
+#define MK_UINT16(a, b) (((uint16) (a)) << 8 | (uint16) (b))
+
+#define UINT16_GET_A(u) ((char) ((u) >> 8))
+#define UINT16_GET_B(u) ((char) ((u) & 0xFF))
+
+#else /* WORDS_BIGENDIAN */
+
+#define MK_UINT16(a, b) (((uint16) (b)) << 8 | (uint16) (a))
+
+#define UINT16_GET_A(u) ((char) ((u) & 0xFF))
+#define UINT16_GET_B(u) ((char) ((u) >> 8))
+
+#endif /* WORDS_BIGENDIAN */
+
+#define MK_ALL_UINT16_WITH_A(a) \
+ MK_UINT16(a, '0'), \
+ MK_UINT16(a, '1'), \
+ MK_UINT16(a, '2'), \
+ MK_UINT16(a, '3'), \
+ MK_UINT16(a, '4'), \
+ MK_UINT16(a, '5'), \
+ MK_UINT16(a, '6'), \
+ MK_UINT16(a, '7'), \
+ MK_UINT16(a, '8'), \
+ MK_UINT16(a, '9'), \
+ MK_UINT16(a, 'A'), \
+ MK_UINT16(a, 'B'), \
+ MK_UINT16(a, 'C'), \
+ MK_UINT16(a, 'D'), \
+ MK_UINT16(a, 'E'), \
+ MK_UINT16(a, 'F')
+
+ static const uint16 hex_map[256] = {
+ MK_ALL_UINT16_WITH_A('0'),
+ MK_ALL_UINT16_WITH_A('1'),
+ MK_ALL_UINT16_WITH_A('2'),
+ MK_ALL_UINT16_WITH_A('3'),
+ MK_ALL_UINT16_WITH_A('4'),
+ MK_ALL_UINT16_WITH_A('5'),
+ MK_ALL_UINT16_WITH_A('6'),
+ MK_ALL_UINT16_WITH_A('7'),
+ MK_ALL_UINT16_WITH_A('8'),
+ MK_ALL_UINT16_WITH_A('9'),
+ MK_ALL_UINT16_WITH_A('A'),
+ MK_ALL_UINT16_WITH_A('B'),
+ MK_ALL_UINT16_WITH_A('C'),
+ MK_ALL_UINT16_WITH_A('D'),
+ MK_ALL_UINT16_WITH_A('E'),
+ MK_ALL_UINT16_WITH_A('F')
+ };
+ const unsigned char* rawc;
+ ulint read_bytes;
+ ulint write_bytes;
+ ulint i;
+
+ rawc = (const unsigned char*) raw;
+
+ if (hex_size == 0) {
+
+ return(0);
+ }
+
+ if (hex_size <= 2 * raw_size) {
+
+ read_bytes = hex_size / 2;
+ write_bytes = hex_size;
+ } else {
+
+ read_bytes = raw_size;
+ write_bytes = 2 * raw_size + 1;
+ }
+
+#define LOOP_READ_BYTES(ASSIGN) \
+ for (i = 0; i < read_bytes; i++) { \
+ ASSIGN; \
+ hex += 2; \
+ rawc++; \
+ }
+
+ if (ut_align_offset(hex, 2) == 0) {
+
+ LOOP_READ_BYTES(
+ *(uint16*) hex = hex_map[*rawc]
+ );
+ } else {
+
+ LOOP_READ_BYTES(
+ *hex = UINT16_GET_A(hex_map[*rawc]);
+ *(hex + 1) = UINT16_GET_B(hex_map[*rawc])
+ );
+ }
+
+ if (hex_size <= 2 * raw_size && hex_size % 2 == 0) {
+
+ hex--;
+ }
+
+ *hex = '\0';
+
+ return(write_bytes);
+}
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+ const char* str, /*!< in: string */
+ ulint str_len, /*!< in: string length in bytes */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+ ulint str_i;
+ ulint buf_i;
+
+ buf_i = 0;
+
+ switch (buf_size) {
+ case 3:
+
+ if (str_len == 0) {
+
+ buf[buf_i] = '\'';
+ buf_i++;
+ buf[buf_i] = '\'';
+ buf_i++;
+ }
+ /* FALLTHROUGH */
+ case 2:
+ case 1:
+
+ buf[buf_i] = '\0';
+ buf_i++;
+ /* FALLTHROUGH */
+ case 0:
+
+ return(buf_i);
+ }
+
+ /* buf_size >= 4 */
+
+ buf[0] = '\'';
+ buf_i = 1;
+
+ for (str_i = 0; str_i < str_len; str_i++) {
+
+ char ch;
+
+ if (buf_size - buf_i == 2) {
+
+ break;
+ }
+
+ ch = str[str_i];
+
+ switch (ch) {
+ case '\0':
+
+ if (buf_size - buf_i < 4) {
+
+ goto func_exit;
+ }
+ buf[buf_i] = '\\';
+ buf_i++;
+ buf[buf_i] = '0';
+ buf_i++;
+ break;
+ case '\'':
+ case '\\':
+
+ if (buf_size - buf_i < 4) {
+
+ goto func_exit;
+ }
+ buf[buf_i] = ch;
+ buf_i++;
+ /* FALLTHROUGH */
+ default:
+
+ buf[buf_i] = ch;
+ buf_i++;
+ }
+ }
+
+func_exit:
+
+ buf[buf_i] = '\'';
+ buf_i++;
+ buf[buf_i] = '\0';
+ buf_i++;
+
+ return(buf_i);
+}
diff --git a/storage/innobase/include/ut0mutex.h b/storage/innobase/include/ut0mutex.h
new file mode 100644
index 00000000..cb43583c
--- /dev/null
+++ b/storage/innobase/include/ut0mutex.h
@@ -0,0 +1,178 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0mutex.h
+Policy based mutexes.
+
+Created 2012-03-24 Sunny Bains.
+***********************************************************************/
+
+#pragma once
+#ifndef UNIV_INNOCHECKSUM
+#include "sync0policy.h"
+#include "ib0mutex.h"
+
+/** Create a typedef using the MutexType<PolicyType>
+@param[in] M Mutex type
+@param[in]	P	Policy type
+@param[in] T The resulting typedef alias */
+#define UT_MUTEX_TYPE(M, P, T) typedef PolicyMutex<M<P> > T;
+
+# ifdef __linux__
+UT_MUTEX_TYPE(TTASFutexMutex, GenericPolicy, FutexMutex);
+# endif /* __linux__ */
+
+UT_MUTEX_TYPE(TTASMutex, GenericPolicy, SpinMutex);
+UT_MUTEX_TYPE(OSTrackMutex, GenericPolicy, SysMutex);
+UT_MUTEX_TYPE(TTASEventMutex, GenericPolicy, SyncArrayMutex);
+
+#ifdef MUTEX_FUTEX
+/** The default mutex type. */
+typedef FutexMutex ib_mutex_t;
+#define MUTEX_TYPE "Uses futexes"
+#elif defined(MUTEX_SYS)
+typedef SysMutex ib_mutex_t;
+#define MUTEX_TYPE "Uses system mutexes"
+#elif defined(MUTEX_EVENT)
+typedef SyncArrayMutex ib_mutex_t;
+#define MUTEX_TYPE "Uses event mutexes"
+#else
+#error "ib_mutex_t type is unknown"
+#endif /* MUTEX_FUTEX */
+
+extern uint srv_spin_wait_delay;
+extern ulong srv_n_spin_wait_rounds;
+
+#define mutex_create(I, M) mutex_init((M), (I), \
+ __FILE__, __LINE__)
+
+#define mutex_enter_loc(M,file,line) (M)->enter( \
+ uint32_t(srv_n_spin_wait_rounds), \
+ uint32_t(srv_spin_wait_delay), \
+ file, line)
+#define mutex_enter(M) mutex_enter_loc(M, __FILE__, __LINE__)
+
+#define mutex_enter_nospin(M) (M)->enter( \
+ 0, \
+ 0, \
+ __FILE__, uint32_t(__LINE__))
+
+#define mutex_enter_nowait(M) (M)->trylock(__FILE__, \
+ uint32_t(__LINE__))
+
+#define mutex_exit(M) (M)->exit()
+
+#define mutex_free(M) mutex_destroy(M)
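+
+/* Typical lifecycle (a sketch; LATCH_ID_EXAMPLE stands in for a real
+latch_id_t value registered with the sync subsystem):
+
+	ib_mutex_t	mutex;
+
+	mutex_create(LATCH_ID_EXAMPLE, &mutex);
+
+	mutex_enter(&mutex);
+	// ... critical section ...
+	mutex_exit(&mutex);
+
+	mutex_free(&mutex);
+*/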
+
+#ifdef UNIV_DEBUG
+/**
+Checks that the mutex has been initialized. */
+#define mutex_validate(M) (M)->validate()
+
+/**
+Checks that the current thread owns the mutex. Works only
+in the debug version. */
+#define mutex_own(M) (M)->is_owned()
+#else
+#define mutex_own(M) /* No op */
+#define mutex_validate(M) /* No op */
+#endif /* UNIV_DEBUG */
+
+/** Iterate over the mutex meta data */
+class MutexMonitor {
+public:
+ /** Constructor */
+ MutexMonitor() { }
+
+ /** Destructor */
+ ~MutexMonitor() { }
+
+ /** Enable the mutex monitoring */
+ void enable();
+
+ /** Disable the mutex monitoring */
+ void disable();
+
+ /** Reset the mutex monitoring values */
+ void reset();
+
+ /** Invoke the callback for each active mutex collection
+ @param[in,out] callback Functor to call
+ @return false if callback returned false */
+ template<typename Callback>
+ bool iterate(Callback& callback) const
+ UNIV_NOTHROW
+ {
+ LatchMetaData::iterator end = latch_meta.end();
+
+ for (LatchMetaData::iterator it = latch_meta.begin();
+ it != end;
+ ++it) {
+
+ /* Some of the slots will be null in non-debug mode */
+
+			if (latch_meta_t* l = *it) {
+ if (!callback(*l)) {
+ return false;
+ }
+ }
+ }
+
+ return(true);
+ }
+};
+
+/** Defined in sync0sync.cc */
+extern MutexMonitor mutex_monitor;
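+
+/* Example (sketch): visiting the registered latch metadata; "Visit" is
+an illustrative callback matching the signature expected by iterate():
+
+	struct Visit {
+		bool operator()(latch_meta_t& meta)
+		{
+			// ... inspect meta ...
+			return true;	// keep iterating
+		}
+	};
+
+	Visit	visit;
+	mutex_monitor.iterate(visit);
+*/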
+
+/**
+Creates, or rather, initializes a mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed.
+Add the mutex instance to the global mutex list.
+@param[in,out] mutex mutex to initialise
+@param[in] id The mutex ID (Latch ID)
+@param[in]	file_name	file name from where it was called
+@param[in]	line		line number in file_name from where called */
+template <typename Mutex>
+void mutex_init(
+ Mutex* mutex,
+ latch_id_t id,
+ const char* file_name,
+ uint32_t line)
+{
+ new(mutex) Mutex();
+
+ mutex->init(id, file_name, line);
+}
+
+/**
+Removes a mutex instance from the mutex list. The mutex is checked to
+be in the reset state.
+@param[in,out] mutex mutex instance to destroy */
+template <typename Mutex>
+void mutex_destroy(
+ Mutex* mutex)
+{
+ mutex->destroy();
+}
+
+#endif /* UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h
new file mode 100644
index 00000000..e8469db9
--- /dev/null
+++ b/storage/innobase/include/ut0new.h
@@ -0,0 +1,1105 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ut0new.h
+Instrumented memory allocator.
+
+Created May 26, 2014 Vasil Dimov
+*******************************************************/
+
+/** Dynamic memory allocation within InnoDB guidelines.
+All dynamic (heap) memory allocations (malloc(3), strdup(3), etc., "new",
+various std:: containers that allocate memory internally) that are done
+within InnoDB are instrumented. This means that InnoDB uses a custom set
+of functions for allocating memory, rather than calling e.g. "new" directly.
+
+Here follows a cheat sheet on what InnoDB functions to use whenever a
+standard one would have been used.
+
+Creating new objects with "new":
+--------------------------------
+Standard:
+ new expression
+ or
+ new(std::nothrow) expression
+InnoDB, default instrumentation:
+ UT_NEW_NOKEY(expression)
+InnoDB, custom instrumentation, preferred:
+ UT_NEW(expression, key)
+
+Destroying objects, created with "new":
+---------------------------------------
+Standard:
+ delete ptr
+InnoDB:
+ UT_DELETE(ptr)
+
+Creating new arrays with "new[]":
+---------------------------------
+Standard:
+ new type[num]
+ or
+ new(std::nothrow) type[num]
+InnoDB, default instrumentation:
+ UT_NEW_ARRAY_NOKEY(type, num)
+InnoDB, custom instrumentation, preferred:
+ UT_NEW_ARRAY(type, num, key)
+
+Destroying arrays, created with "new[]":
+----------------------------------------
+Standard:
+ delete[] ptr
+InnoDB:
+ UT_DELETE_ARRAY(ptr)
+
+Declaring a type with a std:: container, e.g. std::vector:
+----------------------------------------------------------
+Standard:
+ std::vector<t>
+InnoDB:
+ std::vector<t, ut_allocator<t> >
+
+Declaring objects of some std:: type:
+-------------------------------------
+Standard:
+ std::vector<t> v
+InnoDB, default instrumentation:
+ std::vector<t, ut_allocator<t> > v
+InnoDB, custom instrumentation, preferred:
+ std::vector<t, ut_allocator<t> > v(ut_allocator<t>(key))
+
+Raw block allocation (as usual in C++, consider whether using "new" would
+not be more appropriate):
+-------------------------------------------------------------------------
+Standard:
+ malloc(num)
+InnoDB, default instrumentation:
+ ut_malloc_nokey(num)
+InnoDB, custom instrumentation, preferred:
+ ut_malloc(num, key)
+
+Raw block resize:
+-----------------
+Standard:
+ realloc(ptr, new_size)
+InnoDB:
+ ut_realloc(ptr, new_size)
+
+Raw block deallocation:
+-----------------------
+Standard:
+ free(ptr)
+InnoDB:
+ ut_free(ptr)
+
+Note: the expression passed to UT_NEW() or UT_NEW_NOKEY() must always end
+with (), thus:
+Standard:
+ new int
+InnoDB:
+ UT_NEW_NOKEY(int())
+*/
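+
+/* A few of the above side by side (a sketch; the *_NOKEY variants omit
+the PSI key argument):
+
+	int*	p = UT_NEW_NOKEY(int(42));
+	UT_DELETE(p);
+
+	int*	arr = UT_NEW_ARRAY_NOKEY(int, 10);
+	UT_DELETE_ARRAY(arr);
+
+	byte*	buf = static_cast<byte*>(ut_malloc_nokey(100));
+	buf = static_cast<byte*>(ut_realloc(buf, 200));
+	ut_free(buf);
+*/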
+
+#ifndef ut0new_h
+#define ut0new_h
+
+#include <algorithm> /* std::min() */
+#include <limits> /* std::numeric_limits */
+#include <map> /* std::map */
+
+#include <stddef.h>
+#include <stdlib.h> /* malloc() */
+#include <string.h> /* strlen(), strrchr(), strncmp() */
+
+#include <my_sys.h> /* my_large_free/malloc() */
+
+#include "my_global.h" /* needed for headers from mysql/psi/ */
+
+#include "mysql/psi/mysql_memory.h" /* PSI_MEMORY_CALL() */
+
+#include "mysql/psi/psi_memory.h" /* PSI_memory_key, PSI_memory_info */
+
+#include "os0thread.h" /* os_thread_sleep() */
+#include "ut0ut.h" /* ut_strcmp_functor, ut_basename_noext() */
+
+#define OUT_OF_MEMORY_MSG \
+ "Check if you should increase the swap file or ulimits of your" \
+ " operating system. Note that on most 32-bit computers the process" \
+ " memory space is limited to 2 GB or 4 GB."
+
+/** The total amount of memory currently allocated from the operating
+system with allocate_large() */
+extern Atomic_counter<ulint> os_total_large_mem_allocated;
+
+/** Maximum number of retries to allocate memory. */
+extern const size_t alloc_max_retries;
+
+constexpr uint32_t INVALID_AUTOEVENT_IDX = 0xFFFFFFFFU;
+
+/** Keys for registering allocations with performance schema.
+Pointers to these variables are supplied to PFS code via the pfs_info[]
+array and the PFS code initializes them via PSI_MEMORY_CALL(register_memory)().
+mem_key_other and mem_key_std are special in the following way (see also
+ut_allocator::get_mem_key()):
+* If the caller has not provided a key and the file name of the caller is
+ unknown, then mem_key_std will be used. This happens only when called from
+ within std::* containers.
+* If the caller has not provided a key and the file name of the caller is
+ known, but is not amongst the predefined names (see ut_new_boot()) then
+ mem_key_other will be used. Generally this should not happen and if it
+ happens then that means that the list of predefined names must be extended.
+Keep this list alphabetically sorted. */
+extern PSI_memory_key mem_key_ahi;
+extern PSI_memory_key mem_key_buf_buf_pool;
+extern PSI_memory_key mem_key_dict_stats_bg_recalc_pool_t;
+extern PSI_memory_key mem_key_dict_stats_index_map_t;
+extern PSI_memory_key mem_key_dict_stats_n_diff_on_level;
+extern PSI_memory_key mem_key_other;
+extern PSI_memory_key mem_key_row_log_buf;
+extern PSI_memory_key mem_key_row_merge_sort;
+extern PSI_memory_key mem_key_std;
+
+/** Setup the internal objects needed for UT_NEW() to operate.
+This must be called before the first call to UT_NEW(). */
+void
+ut_new_boot();
+
+#ifdef UNIV_PFS_MEMORY
+
+/**
+Retrieve a memory key (registered with PFS)
+given the AUTOEVENT_IDX of the caller.
+
+@param[in] autoevent_idx	AUTOEVENT_IDX value of the caller
+@return registered memory key or PSI_NOT_INSTRUMENTED */
+PSI_memory_key ut_new_get_key_by_file(uint32_t autoevent_idx);
+
+#endif /* UNIV_PFS_MEMORY */
+
+/** A structure that holds the necessary data for performance schema
+accounting. An object of this type is put in front of each allocated block
+of memory when allocation is done by ut_allocator::allocate(). This is
+because the data is needed even when freeing the memory. Users of
+ut_allocator::allocate_large() are responsible for maintaining this
+themselves. */
+struct ut_new_pfx_t {
+
+#ifdef UNIV_PFS_MEMORY
+
+ /** Performance schema key. Assigned to a name at startup via
+ PSI_MEMORY_CALL(register_memory)() and later used for accounting
+ allocations and deallocations with
+ PSI_MEMORY_CALL(memory_alloc)(key, size, owner) and
+ PSI_MEMORY_CALL(memory_free)(key, size, owner). */
+ PSI_memory_key m_key;
+
+ /**
+ Thread owner.
+ Instrumented thread that owns the allocated memory.
+ This state is used by the performance schema to maintain
+ per thread statistics,
+ when memory is given from thread A to thread B.
+ */
+ struct PSI_thread *m_owner;
+
+#endif /* UNIV_PFS_MEMORY */
+
+ /** Size of the allocated block in bytes, including this prepended
+ aux structure (for ut_allocator::allocate()). For example if InnoDB
+ code requests to allocate 100 bytes, and sizeof(ut_new_pfx_t) is 16,
+ then 116 bytes are allocated in total and m_size will be 116.
+ ut_allocator::allocate_large() does not prepend this struct to the
+ allocated block and its users are responsible for maintaining it
+ and passing it later to ut_allocator::deallocate_large(). */
+ size_t m_size;
+#if SIZEOF_VOIDP == 4
+ /** Pad the header size to a multiple of 64 bits on 32-bit systems,
+ so that the payload will be aligned to 64 bits. */
+ size_t pad;
+#endif
+};
+
+#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
+static inline void ut_dontdump(void *ptr, size_t m_size, bool dontdump)
+{
+ ut_a(ptr != NULL);
+
+ if (dontdump && madvise(ptr, m_size, MADV_DONTDUMP)) {
+ ib::warn() << "Failed to set memory to " DONTDUMP_STR ": "
+ << strerror(errno)
+ << " ptr " << ptr
+ << " size " << m_size;
+ }
+}
+
+static inline void ut_dodump(void* ptr, size_t m_size)
+{
+ if (ptr && madvise(ptr, m_size, MADV_DODUMP)) {
+ ib::warn() << "Failed to set memory to " DODUMP_STR ": "
+ << strerror(errno)
+ << " ptr " << ptr
+ << " size " << m_size;
+ }
+}
+#else
+static inline void ut_dontdump(void *, size_t, bool) {}
+static inline void ut_dodump(void*, size_t) {}
+#endif
+
+/** Allocator class for allocating memory from inside std::* containers.
+@tparam T type of allocated object
+@tparam oom_fatal whether to commit suicide when running out of memory */
+template <class T, bool oom_fatal = true>
+class ut_allocator {
+public:
+ typedef T* pointer;
+ typedef const T* const_pointer;
+ typedef T& reference;
+ typedef const T& const_reference;
+ typedef T value_type;
+ typedef size_t size_type;
+ typedef ptrdiff_t difference_type;
+
+#ifdef UNIV_PFS_MEMORY
+ /** Default constructor. */
+ explicit
+ ut_allocator(PSI_memory_key key = PSI_NOT_INSTRUMENTED)
+ : m_key(key)
+ {
+ }
+#else
+ ut_allocator() {}
+ ut_allocator(PSI_memory_key) {}
+#endif /* UNIV_PFS_MEMORY */
+
+ /** Constructor from allocator of another type. */
+ template <class U>
+ ut_allocator(const ut_allocator<U>&
+#ifdef UNIV_PFS_MEMORY
+ other
+#endif
+ )
+ {
+#ifdef UNIV_PFS_MEMORY
+ const PSI_memory_key other_key = other.get_mem_key();
+
+ m_key = (other_key != mem_key_std)
+ ? other_key
+ : PSI_NOT_INSTRUMENTED;
+#endif /* UNIV_PFS_MEMORY */
+ }
+
+ /** Return the maximum number of objects that can be allocated by
+ this allocator. */
+ size_type
+ max_size() const
+ {
+ const size_type s_max = std::numeric_limits<size_type>::max();
+
+#ifdef UNIV_PFS_MEMORY
+ return((s_max - sizeof(ut_new_pfx_t)) / sizeof(T));
+#else
+ return(s_max / sizeof(T));
+#endif /* UNIV_PFS_MEMORY */
+ }
+
+ pointer allocate(size_type n) { return allocate(n, NULL, INVALID_AUTOEVENT_IDX); }
+
+ /** Allocate a chunk of memory that can hold 'n_elements' objects of
+ type 'T' and trace the allocation.
+ If the allocation fails this method may throw an exception. This
+ is mandated by the standard and if it returns NULL instead, then
+ STL containers that use it (e.g. std::vector) may get confused.
+	After successful allocation the returned pointer must be passed
+ to ut_allocator::deallocate() when no longer needed.
+ @param[in] n_elements number of elements
+ @param[in] set_to_zero if true, then the returned memory is
+ initialized with 0x0 bytes.
+	@param[in]	throw_on_error	if true, raise an exception on failure
+ @return pointer to the allocated memory */
+ pointer
+ allocate(
+ size_type n_elements,
+ const_pointer,
+ uint32_t
+#ifdef UNIV_PFS_MEMORY
+ autoevent_idx /* AUTOEVENT_IDX of the caller */
+#endif
+ ,
+ bool set_to_zero = false,
+ bool throw_on_error = true)
+ {
+ if (n_elements == 0) {
+ return(NULL);
+ }
+
+ if (n_elements > max_size()) {
+ if (throw_on_error) {
+ throw(std::bad_alloc());
+ } else {
+ return(NULL);
+ }
+ }
+
+ void* ptr;
+ size_t total_bytes = n_elements * sizeof(T);
+
+#ifdef UNIV_PFS_MEMORY
+ /* The header size must not ruin the 64-bit alignment
+ on 32-bit systems. Some allocated structures use
+ 64-bit fields. */
+ ut_ad((sizeof(ut_new_pfx_t) & 7) == 0);
+ total_bytes += sizeof(ut_new_pfx_t);
+#endif /* UNIV_PFS_MEMORY */
+
+ for (size_t retries = 1; ; retries++) {
+
+ if (set_to_zero) {
+ ptr = calloc(1, total_bytes);
+ } else {
+ ptr = malloc(total_bytes);
+ }
+
+ if (ptr != NULL || retries >= alloc_max_retries) {
+ break;
+ }
+
+ os_thread_sleep(1000000 /* 1 second */);
+ }
+
+ if (ptr == NULL) {
+ ib::fatal_or_error(oom_fatal)
+ << "Cannot allocate " << total_bytes
+ << " bytes of memory after "
+ << alloc_max_retries << " retries over "
+ << alloc_max_retries << " seconds. OS error: "
+ << strerror(errno) << " (" << errno << "). "
+ << OUT_OF_MEMORY_MSG;
+ if (throw_on_error) {
+ throw(std::bad_alloc());
+ } else {
+ return(NULL);
+ }
+ }
+
+#ifdef UNIV_PFS_MEMORY
+ ut_new_pfx_t* pfx = static_cast<ut_new_pfx_t*>(ptr);
+
+ allocate_trace(total_bytes, autoevent_idx, pfx);
+
+ return(reinterpret_cast<pointer>(pfx + 1));
+#else
+ return(reinterpret_cast<pointer>(ptr));
+#endif /* UNIV_PFS_MEMORY */
+ }
+
+	/** Free memory allocated by allocate() and trace the deallocation.
+ @param[in,out] ptr pointer to memory to free */
+ void deallocate(pointer ptr, size_type n_elements = 0)
+ {
+#ifdef UNIV_PFS_MEMORY
+ if (ptr == NULL) {
+ return;
+ }
+
+ ut_new_pfx_t* pfx = reinterpret_cast<ut_new_pfx_t*>(ptr) - 1;
+
+ deallocate_trace(pfx);
+
+ free(pfx);
+#else
+ free(ptr);
+#endif /* UNIV_PFS_MEMORY */
+ }
+
+ /** Create an object of type 'T' using the value 'val' over the
+ memory pointed by 'p'. */
+ void
+ construct(
+ pointer p,
+ const T& val)
+ {
+ new(p) T(val);
+ }
+
+ /** Destroy an object pointed by 'p'. */
+ void
+ destroy(
+ pointer p)
+ {
+ p->~T();
+ }
+
+ /** Return the address of an object. */
+ pointer
+ address(
+ reference x) const
+ {
+ return(&x);
+ }
+
+ /** Return the address of a const object. */
+ const_pointer
+ address(
+ const_reference x) const
+ {
+ return(&x);
+ }
+
+ template <class U>
+ struct rebind {
+ typedef ut_allocator<U> other;
+ };
+
+ /* The following are custom methods, not required by the standard. */
+
+#ifdef UNIV_PFS_MEMORY
+
+ /** realloc(3)-like method.
+ The passed in ptr must have been returned by allocate() and the
+ pointer returned by this method must be passed to deallocate() when
+ no longer needed.
+ @param[in,out] ptr old pointer to reallocate
+ @param[in] n_elements new number of elements to allocate
+	@param[in]	autoevent_idx	AUTOEVENT_IDX of the caller
+ @return newly allocated memory */
+ pointer
+ reallocate(
+ void* ptr,
+ size_type n_elements,
+ uint32_t autoevent_idx)
+ {
+ if (n_elements == 0) {
+ deallocate(static_cast<pointer>(ptr));
+ return(NULL);
+ }
+
+ if (ptr == NULL) {
+ return(allocate(n_elements, NULL, autoevent_idx, false, false));
+ }
+
+ if (n_elements > max_size()) {
+ return(NULL);
+ }
+
+ ut_new_pfx_t* pfx_old;
+ ut_new_pfx_t* pfx_new;
+ size_t total_bytes;
+
+ pfx_old = reinterpret_cast<ut_new_pfx_t*>(ptr) - 1;
+
+ total_bytes = n_elements * sizeof(T) + sizeof(ut_new_pfx_t);
+
+ for (size_t retries = 1; ; retries++) {
+
+ pfx_new = static_cast<ut_new_pfx_t*>(
+ realloc(pfx_old, total_bytes));
+
+ if (pfx_new != NULL || retries >= alloc_max_retries) {
+ break;
+ }
+
+ os_thread_sleep(1000000 /* 1 second */);
+ }
+
+ if (pfx_new == NULL) {
+ ib::fatal_or_error(oom_fatal)
+ << "Cannot reallocate " << total_bytes
+ << " bytes of memory after "
+ << alloc_max_retries << " retries over "
+ << alloc_max_retries << " seconds. OS error: "
+ << strerror(errno) << " (" << errno << "). "
+ << OUT_OF_MEMORY_MSG;
+ return(NULL);
+ }
+
+ /* pfx_new still contains the description of the old block
+ that was presumably freed by realloc(). */
+ deallocate_trace(pfx_new);
+
+ /* pfx_new is set here to describe the new block. */
+ allocate_trace(total_bytes, autoevent_idx, pfx_new);
+
+ return(reinterpret_cast<pointer>(pfx_new + 1));
+ }
+
+	/** Allocate, trace the allocation and construct 'n_elements' objects
+	of type 'T'. If the allocation fails, this method returns NULL. If
+	one of the constructors throws an exception, the objects constructed
+	so far are destroyed, the memory is deallocated and the exception is
+	re-thrown. After successful completion the returned pointer must be
+	passed to delete_array() when no longer needed.
+	@param[in]	n_elements	number of elements to allocate
+	@param[in]	autoevent_idx	AUTOEVENT_IDX of the caller
+	@return pointer to the first allocated object or NULL */
+ pointer
+ new_array(
+ size_type n_elements,
+ uint32_t autoevent_idx
+ )
+ {
+ T* p = allocate(n_elements, NULL, autoevent_idx, false, false);
+
+ if (p == NULL) {
+ return(NULL);
+ }
+
+ T* first = p;
+ size_type i;
+
+ try {
+ for (i = 0; i < n_elements; i++) {
+ new(p) T;
+ ++p;
+ }
+ } catch (...) {
+ for (size_type j = 0; j < i; j++) {
+ --p;
+ p->~T();
+ }
+
+ deallocate(first);
+
+ throw;
+ }
+
+ return(first);
+ }
+
+ /** Destroy, deallocate and trace the deallocation of an array created
+ by new_array().
+ @param[in,out] ptr pointer to the first object in the array */
+ void
+ delete_array(
+ T* ptr)
+ {
+ if (ptr == NULL) {
+ return;
+ }
+
+ const size_type n_elements = n_elements_allocated(ptr);
+
+ T* p = ptr + n_elements - 1;
+
+ for (size_type i = 0; i < n_elements; i++) {
+ p->~T();
+ --p;
+ }
+
+ deallocate(ptr);
+ }
+
+#endif /* UNIV_PFS_MEMORY */
+
+ /** Allocate a large chunk of memory that can hold 'n_elements'
+ objects of type 'T' and trace the allocation.
+ @param[in] n_elements number of elements
+	@param[in]	dontdump	if true, advise the OS not to
+	include this memory in core dumps.
+ @param[out] pfx storage for the description of the
+ allocated memory. The caller must provide space for this one and keep
+ it until the memory is no longer needed and then pass it to
+ deallocate_large().
+ @return pointer to the allocated memory or NULL */
+ pointer
+ allocate_large(
+ size_type n_elements,
+ ut_new_pfx_t* pfx,
+ bool dontdump = false)
+ {
+ if (n_elements == 0 || n_elements > max_size()) {
+ return(NULL);
+ }
+
+ ulint n_bytes = n_elements * sizeof(T);
+
+ pointer ptr = reinterpret_cast<pointer>(
+ my_large_malloc(&n_bytes, MYF(0)));
+
+ if (ptr == NULL) {
+ return NULL;
+ }
+
+ ut_dontdump(ptr, n_bytes, dontdump);
+
+ if (pfx != NULL) {
+#ifdef UNIV_PFS_MEMORY
+ allocate_trace(n_bytes, 0, pfx);
+#endif /* UNIV_PFS_MEMORY */
+ pfx->m_size = n_bytes;
+ }
+
+ os_total_large_mem_allocated += n_bytes;
+
+ return(ptr);
+ }
+
+ pointer
+ allocate_large_dontdump(
+ size_type n_elements,
+ ut_new_pfx_t* pfx)
+ {
+ return allocate_large(n_elements, pfx, true);
+ }
+	/** Free memory allocated by allocate_large() and trace the
+ deallocation.
+ @param[in,out] ptr pointer to memory to free
+ @param[in] pfx descriptor of the memory, as returned by
+ allocate_large(). */
+ void
+ deallocate_large(
+ pointer ptr,
+ const ut_new_pfx_t* pfx)
+ {
+		size_t	size = pfx->m_size;
+#ifdef UNIV_PFS_MEMORY
+		/* pfx cannot be NULL here: m_size was read from it above. */
+		deallocate_trace(pfx);
+#endif /* UNIV_PFS_MEMORY */
+ os_total_large_mem_allocated -= size;
+
+ my_large_free(ptr, size);
+ }
+
+ void
+ deallocate_large_dodump(
+ pointer ptr,
+ const ut_new_pfx_t* pfx)
+ {
+ ut_dodump(ptr, pfx->m_size);
+ deallocate_large(ptr, pfx);
+ }
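+
+	/* Example (sketch): large blocks carry their ut_new_pfx_t
+	descriptor separately; the caller keeps it and passes it back
+	when freeing ("n_bytes" is illustrative):
+
+		ut_allocator<byte>	alloc;
+		ut_new_pfx_t		pfx;
+
+		byte*	p = alloc.allocate_large(n_bytes, &pfx);
+		if (p != NULL) {
+			// ... use p ...
+			alloc.deallocate_large(p, &pfx);
+		}
+	*/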
+
+#ifdef UNIV_PFS_MEMORY
+ /** Get the performance schema key to use for tracing allocations.
+	@param[in]	autoevent_idx	AUTOEVENT_IDX of the caller, or
+	INVALID_AUTOEVENT_IDX if unknown
+ @return performance schema key */
+ PSI_memory_key
+ get_mem_key(
+ uint32_t autoevent_idx = INVALID_AUTOEVENT_IDX) const
+ {
+ if (m_key != PSI_NOT_INSTRUMENTED) {
+ return(m_key);
+ }
+
+ if (autoevent_idx == INVALID_AUTOEVENT_IDX) {
+ return(mem_key_std);
+ }
+ const PSI_memory_key key = ut_new_get_key_by_file(autoevent_idx);
+
+ if (key != PSI_NOT_INSTRUMENTED) {
+ return(key);
+ }
+
+ return(mem_key_other);
+ }
+
+private:
+
+ /** Retrieve the size of a memory block allocated by new_array().
+ @param[in] ptr pointer returned by new_array().
+ @return size of memory block */
+ size_type
+ n_elements_allocated(
+ const_pointer ptr)
+ {
+ const ut_new_pfx_t* pfx
+ = reinterpret_cast<const ut_new_pfx_t*>(ptr) - 1;
+
+ const size_type user_bytes
+ = pfx->m_size - sizeof(ut_new_pfx_t);
+
+ ut_ad(user_bytes % sizeof(T) == 0);
+
+ return(user_bytes / sizeof(T));
+ }
+
+ /** Trace a memory allocation.
+ After the accounting, the data needed for tracing the deallocation
+ later is written into 'pfx'.
+ The PFS event name is picked on the following criteria:
+ 1. If key (!= PSI_NOT_INSTRUMENTED) has been specified when constructing
+ this ut_allocator object, then the name associated with that key will
+ be used (this is the recommended approach for new code)
+	2. Otherwise, if autoevent_idx is INVALID_AUTOEVENT_IDX, then the
+	name associated with mem_key_std will be used
+	3. Otherwise, if ut_new_get_key_by_file() finds an entry for
+	autoevent_idx, that will be used (see ut_new_boot())
+ 4. Otherwise, the name associated with mem_key_other will be used.
+ @param[in] size number of bytes that were allocated
+ @param[in] autoevent_idx autoevent_idx of the caller
+ @param[out] pfx placeholder to store the info which will be
+ needed when freeing the memory */
+ void
+ allocate_trace(
+ size_t size,
+ const uint32_t autoevent_idx,
+ ut_new_pfx_t* pfx)
+ {
+ const PSI_memory_key key = get_mem_key(autoevent_idx);
+
+		pfx->m_key = PSI_MEMORY_CALL(memory_alloc)(key, size, &pfx->m_owner);
+ pfx->m_size = size;
+ }
+
+ /** Trace a memory deallocation.
+ @param[in] pfx info for the deallocation */
+ void
+ deallocate_trace(
+ const ut_new_pfx_t* pfx)
+ {
+ PSI_MEMORY_CALL(memory_free)(pfx->m_key, pfx->m_size, pfx->m_owner);
+ }
+
+ /** Performance schema key. */
+ PSI_memory_key m_key;
+
+#endif /* UNIV_PFS_MEMORY */
+
+private:
+
+ /** Assignment operator, not used, thus disabled (private). */
+ template <class U>
+ void
+ operator=(
+ const ut_allocator<U>&);
+};
+
+/** Compare two allocators of the same type.
+As long as the type of A1 and A2 is the same, memory allocated by A1
+can be freed by A2, even if their PFS memory keys differ. */
+template <typename T>
+inline
+bool
+operator==(const ut_allocator<T>&, const ut_allocator<T>&) { return(true); }
+
+/** Compare two allocators of the same type. */
+template <typename T>
+inline
+bool
+operator!=(
+ const ut_allocator<T>& lhs,
+ const ut_allocator<T>& rhs)
+{
+ return(!(lhs == rhs));
+}
+
+#ifdef UNIV_PFS_MEMORY
+
+/*
+ constexpr trickery ahead.
+
+  Compute AUTOEVENT_IDX at compile time
+  (the index in the auto_event_names array that corresponds to the
+  basename of __FILE__).
+
+  The tricks are necessary to reduce the cost of looking up the
+  PSI_memory_key for an auto event.
+*/
+
+static constexpr const char* cexpr_basename_helper(const char* s, const char* last_slash)
+{
+ return
+ *s == '\0' ? last_slash :
+ *s == '/' || *s == '\\' ? cexpr_basename_helper(s + 1, s + 1) :
+ cexpr_basename_helper(s + 1, last_slash);
+}
+
+static constexpr const char* cexpr_basename(const char* filename)
+{
+ return cexpr_basename_helper(filename, filename);
+}
+
+static constexpr bool cexpr_strequal_ignore_dot(const char* a, const char* b)
+{
+ return *a == 0 || *a == '.' ? (*b == 0 || *b == '.')
+ : *a == *b ? cexpr_strequal_ignore_dot(a + 1, b + 1) : false;
+}
+
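+/* For example (illustrative, not part of the original header):
+cexpr_basename("storage/innobase/buf/buf0buf.cc") yields "buf0buf.cc",
+and since cexpr_strequal_ignore_dot() treats '.' as end-of-string, it
+compares equal to an array entry "buf0buf":
+
+  static_assert(cexpr_strequal_ignore_dot(
+	  cexpr_basename("storage/innobase/buf/buf0buf.cc"), "buf0buf"),
+	  "basename matching ignores the file extension");
+*/
+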
+constexpr const char* const auto_event_names[] =
+{
+ "btr0btr",
+ "btr0buf",
+ "btr0bulk",
+ "btr0cur",
+ "btr0pcur",
+ "btr0sea",
+ "buf0buf",
+ "buf0dblwr",
+ "buf0dump",
+ "dict0dict",
+ "dict0mem",
+ "dict0stats",
+ "eval0eval",
+ "fil0crypt",
+ "fil0fil",
+ "fsp0file",
+ "fts0ast",
+ "fts0blex",
+ "fts0config",
+ "fts0file",
+ "fts0fts",
+ "fts0opt",
+ "fts0pars",
+ "fts0que",
+ "fts0sql",
+ "fts0tlex",
+ "gis0sea",
+ "ha_innodb",
+ "handler0alter",
+ "hash0hash",
+ "i_s",
+ "lexyy",
+ "lock0lock",
+ "mem0mem",
+ "os0event",
+ "os0file",
+ "pars0lex",
+ "rem0rec",
+ "row0ftsort",
+ "row0import",
+ "row0log",
+ "row0merge",
+ "row0mysql",
+ "row0sel",
+ "srv0start",
+ "sync0arr",
+ "sync0debug",
+ "sync0rw",
+ "sync0start",
+ "sync0types",
+ "trx0i_s",
+ "trx0roll",
+ "trx0rseg",
+ "trx0seg",
+ "trx0trx",
+ "trx0undo",
+ "ut0list",
+ "ut0mem",
+ "ut0new",
+ "ut0pool",
+ "ut0rbt",
+ "ut0wqueue",
+ "xtrabackup",
+ nullptr
+};
+
+constexpr uint32_t cexpr_lookup_auto_event_name(const char* name, uint32_t idx = 0)
+{
+ return !auto_event_names[idx] ? INVALID_AUTOEVENT_IDX :
+ cexpr_strequal_ignore_dot(name, auto_event_names[idx]) ? idx :
+ cexpr_lookup_auto_event_name(name, idx + 1);
+}
+
+/*
+ The AUTOEVENT_IDX macro.
+
+  Note that the static_assert fires if the basename of __FILE__ is not
+  registered in the auto_event_names array.
+  If you run into this assert, add the basename to the array.
+
+  The immediately-invoked lambda forces the evaluation at compile time.
+*/
+#define AUTOEVENT_IDX []()\
+{\
+ constexpr auto idx = cexpr_lookup_auto_event_name(cexpr_basename(__FILE__)); \
+ static_assert(idx != INVALID_AUTOEVENT_IDX, "auto_event_names contains no entry for " __FILE__);\
+ return idx; \
+}()
+
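+/* For example (illustrative): in a file compiled from
+storage/innobase/buf/buf0buf.cc, AUTOEVENT_IDX evaluates, at compile
+time, to the position of "buf0buf" in auto_event_names; roughly
+equivalent to:
+
+  constexpr uint32_t idx = cexpr_lookup_auto_event_name(
+	  cexpr_basename("storage/innobase/buf/buf0buf.cc"));
+*/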
+
+/** Allocate, trace the allocation and construct an object.
+Use this macro instead of 'new' within InnoDB.
+For example: instead of
+ Foo* f = new Foo(args);
+use:
+ Foo* f = UT_NEW(Foo(args), mem_key_some);
+Upon failure to allocate the memory, this macro may return NULL. It
+will not throw exceptions. After successful allocation the returned
+pointer must be passed to UT_DELETE() when no longer needed.
+@param[in] expr any expression that could follow "new"
+@param[in] key performance schema memory tracing key
+@return pointer to the created object or NULL */
+#define UT_NEW(expr, key) \
+ /* Placement new will return NULL and not attempt to construct an
+ object if the passed in pointer is NULL, e.g. if allocate() has
+ failed to allocate memory and has returned NULL. */ \
+ ::new(ut_allocator<byte>(key).allocate( \
+ sizeof expr, NULL, AUTOEVENT_IDX, false, false)) expr
+
+/** Allocate, trace the allocation and construct an object.
+Use this macro instead of 'new' within InnoDB and instead of UT_NEW()
+when creating a dedicated memory key is not feasible.
+For example: instead of
+ Foo* f = new Foo(args);
+use:
+ Foo* f = UT_NEW_NOKEY(Foo(args));
+Upon failure to allocate the memory, this macro may return NULL. It
+will not throw exceptions. After successful allocation the returned
+pointer must be passed to UT_DELETE() when no longer needed.
+@param[in] expr any expression that could follow "new"
+@return pointer to the created object or NULL */
+#define UT_NEW_NOKEY(expr) UT_NEW(expr, PSI_NOT_INSTRUMENTED)
+
+/** Destroy, deallocate and trace the deallocation of an object created by
+UT_NEW() or UT_NEW_NOKEY().
+We can't instantiate ut_allocator without having the type of the object, thus
+we redirect this to a templated function. */
+#define UT_DELETE(ptr) ut_delete(ptr)
+
+
+/** Destroy and account object created by UT_NEW() or UT_NEW_NOKEY().
+@param[in,out] ptr pointer to the object */
+template <typename T>
+inline
+void
+ut_delete(
+ T* ptr)
+{
+ if (ptr == NULL) {
+ return;
+ }
+
+ ut_allocator<T> allocator;
+
+ allocator.destroy(ptr);
+ allocator.deallocate(ptr);
+}
+
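+/* Lifecycle sketch (illustrative; 'Foo', 'args' and 'mem_key_some' are
+placeholders, as in the UT_NEW() documentation above):
+
+  Foo*	f = UT_NEW(Foo(args), mem_key_some);
+  if (f != NULL) {
+	  ...
+	  UT_DELETE(f);
+  }
+*/
+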
+/** Allocate and account 'n_elements' objects of type 'type'.
+Use this macro to allocate memory within InnoDB instead of 'new[]'.
+The returned pointer must be passed to UT_DELETE_ARRAY().
+@param[in] type type of objects being created
+@param[in] n_elements number of objects to create
+@param[in] key performance schema memory tracing key
+@return pointer to the first allocated object or NULL */
+#define UT_NEW_ARRAY(type, n_elements, key) \
+ ut_allocator<type>(key).new_array(n_elements, AUTOEVENT_IDX)
+
+/** Allocate and account 'n_elements' objects of type 'type'.
+Use this macro to allocate memory within InnoDB instead of 'new[]' and
+instead of UT_NEW_ARRAY() when it is not feasible to create a dedicated key.
+@param[in] type type of objects being created
+@param[in] n_elements number of objects to create
+@return pointer to the first allocated object or NULL */
+#define UT_NEW_ARRAY_NOKEY(type, n_elements) \
+ UT_NEW_ARRAY(type, n_elements, PSI_NOT_INSTRUMENTED)
+
+/** Destroy, deallocate and trace the deallocation of an array created by
+UT_NEW_ARRAY() or UT_NEW_ARRAY_NOKEY().
+We can't instantiate ut_allocator without having the type of the object, thus
+we redirect this to a templated function. */
+#define UT_DELETE_ARRAY(ptr) ut_delete_array(ptr)
+
+/** Destroy and account objects created by UT_NEW_ARRAY() or
+UT_NEW_ARRAY_NOKEY().
+@param[in,out] ptr pointer to the first object in the array */
+template <typename T>
+inline
+void
+ut_delete_array(
+ T* ptr)
+{
+ ut_allocator<T>().delete_array(ptr);
+}
+
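+/* Usage sketch (illustrative):
+
+  ulint*	arr = UT_NEW_ARRAY_NOKEY(ulint, 16);
+  if (arr != NULL) {
+	  ...
+	  UT_DELETE_ARRAY(arr);
+  }
+*/
+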
+#define ut_malloc(n_bytes, key) static_cast<void*>( \
+ ut_allocator<byte>(key).allocate( \
+ n_bytes, NULL, AUTOEVENT_IDX, false, false))
+
+#define ut_malloc_dontdump(n_bytes, key) static_cast<void*>( \
+ ut_allocator<byte>(key).allocate_large( \
+ n_bytes, NULL, true))
+
+#define ut_zalloc(n_bytes, key) static_cast<void*>( \
+ ut_allocator<byte>(key).allocate( \
+ n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_malloc_nokey(n_bytes) static_cast<void*>( \
+ ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate( \
+ n_bytes, NULL, AUTOEVENT_IDX, false, false))
+
+#define ut_zalloc_nokey(n_bytes) static_cast<void*>( \
+ ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate( \
+ n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_zalloc_nokey_nofatal(n_bytes) static_cast<void*>( \
+ ut_allocator<byte, false>(PSI_NOT_INSTRUMENTED).allocate( \
+ n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_realloc(ptr, n_bytes) static_cast<void*>( \
+ ut_allocator<byte>(PSI_NOT_INSTRUMENTED).reallocate( \
+ ptr, n_bytes, AUTOEVENT_IDX))
+
+#define ut_free(ptr) ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate( \
+ reinterpret_cast<byte*>(ptr))
+
+#else /* UNIV_PFS_MEMORY */
+
+/* Fallbacks when memory tracing is disabled at compile time. */
+
+#define UT_NEW(expr, key) ::new(std::nothrow) expr
+#define UT_NEW_NOKEY(expr) ::new(std::nothrow) expr
+#define UT_DELETE(ptr) ::delete ptr
+
+#define UT_NEW_ARRAY(type, n_elements, key) \
+ ::new(std::nothrow) type[n_elements]
+
+#define UT_NEW_ARRAY_NOKEY(type, n_elements) \
+ ::new(std::nothrow) type[n_elements]
+
+#define UT_DELETE_ARRAY(ptr) ::delete[] ptr
+
+#define ut_malloc(n_bytes, key) ::malloc(n_bytes)
+
+#define ut_zalloc(n_bytes, key) ::calloc(1, n_bytes)
+
+#define ut_malloc_nokey(n_bytes) ::malloc(n_bytes)
+
+static inline void *ut_malloc_dontdump(size_t n_bytes, ...)
+{
+ void *ptr = my_large_malloc(&n_bytes, MYF(0));
+
+ ut_dontdump(ptr, n_bytes, true);
+
+ if (ptr) {
+ os_total_large_mem_allocated += n_bytes;
+ }
+ return ptr;
+}
+
+#define ut_zalloc_nokey(n_bytes) ::calloc(1, n_bytes)
+
+#define ut_zalloc_nokey_nofatal(n_bytes) ::calloc(1, n_bytes)
+
+#define ut_realloc(ptr, n_bytes) ::realloc(ptr, n_bytes)
+
+#define ut_free(ptr) ::free(ptr)
+
+#endif /* UNIV_PFS_MEMORY */
+
+static inline void ut_free_dodump(void *ptr, size_t size)
+{
+ ut_dodump(ptr, size);
+ os_total_large_mem_allocated -= size;
+ my_large_free(ptr, size);
+}
+
+#endif /* ut0new_h */
diff --git a/storage/innobase/include/ut0pool.h b/storage/innobase/include/ut0pool.h
new file mode 100644
index 00000000..e0a1f7c0
--- /dev/null
+++ b/storage/innobase/include/ut0pool.h
@@ -0,0 +1,363 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0pool.h
+Object pool.
+
+Created 2012-Feb-26 Sunny Bains
+***********************************************************************/
+
+#ifndef ut0pool_h
+#define ut0pool_h
+
+#include <vector>
+#include <queue>
+#include <functional>
+
+#include "ut0new.h"
+
+/** Allocate the memory for the objects in blocks. We keep the objects
+sorted by pointer address so that they are closer together in case they
+have to be iterated over in a list. */
+template <typename Type, typename Factory, typename LockStrategy>
+struct Pool {
+
+ typedef Type value_type;
+
+	// FIXME: Add an assertion to check that alignment and offset are
+	// as we expect them. Also, sizeof(void*) can be 8; can we improve
+	// on this overhead?
+ struct Element {
+ Pool* m_pool;
+ value_type m_type;
+ };
+
+ /** Constructor
+ @param size size of the memory block */
+ Pool(size_t size)
+ :
+ m_end(),
+ m_start(),
+ m_size(size),
+ m_last()
+ {
+ ut_a(size >= sizeof(Element));
+
+ m_lock_strategy.create();
+
+ ut_a(m_start == 0);
+
+ m_start = reinterpret_cast<Element*>(ut_zalloc_nokey(m_size));
+
+ m_last = m_start;
+
+ m_end = &m_start[m_size / sizeof(*m_start)];
+
+ /* Note: Initialise only a small subset, even though we have
+ allocated all the memory. This is required only because PFS
+ (MTR) results change if we instantiate too many mutexes up
+ front. */
+
+ init(ut_min(size_t(16), size_t(m_end - m_start)));
+
+ ut_ad(m_pqueue.size() <= size_t(m_last - m_start));
+ }
+
+ /** Destructor */
+ ~Pool()
+ {
+ m_lock_strategy.destroy();
+
+ for (Element* elem = m_start; elem != m_last; ++elem) {
+
+ ut_ad(elem->m_pool == this);
+ Factory::destroy(&elem->m_type);
+ }
+
+ ut_free(m_start);
+ m_end = m_last = m_start = 0;
+ m_size = 0;
+ }
+
+ /** Get an object from the pool.
+	@return a free instance or NULL if exhausted. */
+ Type* get()
+ {
+ Element* elem;
+
+ m_lock_strategy.enter();
+
+ if (!m_pqueue.empty()) {
+
+ elem = m_pqueue.top();
+ m_pqueue.pop();
+
+ } else if (m_last < m_end) {
+
+ /* Initialise the remaining elements. */
+ init(size_t(m_end - m_last));
+
+ ut_ad(!m_pqueue.empty());
+
+ elem = m_pqueue.top();
+ m_pqueue.pop();
+ } else {
+ elem = NULL;
+ }
+
+ m_lock_strategy.exit();
+ return elem ? &elem->m_type : NULL;
+ }
+
+ /** Add the object to the pool.
+ @param ptr object to free */
+ static void mem_free(value_type* ptr)
+ {
+ Element* elem;
+ byte* p = reinterpret_cast<byte*>(ptr + 1);
+
+ elem = reinterpret_cast<Element*>(p - sizeof(*elem));
+
+ elem->m_pool->m_lock_strategy.enter();
+
+ elem->m_pool->putl(elem);
+
+ elem->m_pool->m_lock_strategy.exit();
+ }
+
+protected:
+ // Disable copying
+ Pool(const Pool&);
+ Pool& operator=(const Pool&);
+
+private:
+
+ /* We only need to compare on pointer address. */
+ typedef std::priority_queue<
+ Element*,
+ std::vector<Element*, ut_allocator<Element*> >,
+ std::greater<Element*> > pqueue_t;
+
+ /** Release the object to the free pool
+ @param elem element to free */
+ void putl(Element* elem)
+ {
+ ut_ad(elem >= m_start && elem < m_last);
+ m_pqueue.push(elem);
+ }
+
+ /** Initialise the elements.
+ @param n_elems Number of elements to initialise */
+ void init(size_t n_elems)
+ {
+ ut_ad(size_t(m_end - m_last) >= n_elems);
+
+ for (size_t i = 0; i < n_elems; ++i, ++m_last) {
+
+ m_last->m_pool = this;
+ Factory::init(&m_last->m_type);
+ m_pqueue.push(m_last);
+ }
+
+ ut_ad(m_last <= m_end);
+ }
+
+private:
+ /** Pointer to the last element */
+ Element* m_end;
+
+ /** Pointer to the first element */
+ Element* m_start;
+
+ /** Size of the block in bytes */
+ size_t m_size;
+
+ /** Upper limit of used space */
+ Element* m_last;
+
+	/** Priority queue ordered on pointer address. */
+ pqueue_t m_pqueue;
+
+ /** Lock strategy to use */
+ LockStrategy m_lock_strategy;
+};
+
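+/* Usage sketch (illustrative; MyObj, MyObjFactory and NoopLockStrategy
+are placeholders, not part of this header). A Factory provides static
+init()/destroy() for the pooled type, and a LockStrategy provides
+create()/destroy()/enter()/exit(). Note that mem_free() assumes Element
+has no tail padding (cf. the FIXME near Element above), which holds e.g.
+for a pointer-sized value type:
+
+  struct MyObj { void* payload; };
+
+  struct MyObjFactory {
+	  static void init(MyObj* p) { p->payload = NULL; }
+	  static void destroy(MyObj*) {}
+  };
+
+  struct NoopLockStrategy {
+	  void create() {}
+	  void destroy() {}
+	  void enter() {}
+	  void exit() {}
+  };
+
+  typedef Pool<MyObj, MyObjFactory, NoopLockStrategy> MyPool;
+
+  MyPool	pool(4096);
+  MyObj*	obj = pool.get();
+  if (obj != NULL) {
+	  MyPool::mem_free(obj);
+  }
+*/
+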
+template <typename Pool, typename LockStrategy>
+struct PoolManager {
+
+ typedef Pool PoolType;
+ typedef typename PoolType::value_type value_type;
+
+ PoolManager(size_t size)
+ :
+ m_size(size)
+ {
+ create();
+ }
+
+ ~PoolManager()
+ {
+ destroy();
+
+ ut_a(m_pools.empty());
+ }
+
+ /** Get an element from one of the pools.
+ @return instance or NULL if pool is empty. */
+ value_type* get()
+ {
+ size_t index = 0;
+ size_t delay = 1;
+ value_type* ptr = NULL;
+
+ do {
+ m_lock_strategy.enter();
+
+ ut_ad(!m_pools.empty());
+
+ size_t n_pools = m_pools.size();
+
+ PoolType* pool = m_pools[index % n_pools];
+
+ m_lock_strategy.exit();
+
+ ptr = pool->get();
+
+ if (ptr == 0 && (index / n_pools) > 2) {
+
+ if (!add_pool(n_pools)) {
+
+ ib::error() << "Failed to allocate"
+ " memory for a pool of size "
+ << m_size << " bytes. Will"
+ " wait for " << delay
+ << " seconds for a thread to"
+ " free a resource";
+
+ /* There is nothing much we can do
+ except crash and burn, however lets
+ be a little optimistic and wait for
+ a resource to be freed. */
+ os_thread_sleep(delay * 1000000);
+
+ if (delay < 32) {
+ delay <<= 1;
+ }
+
+ } else {
+ delay = 1;
+ }
+ }
+
+ ++index;
+
+ } while (ptr == NULL);
+
+ return(ptr);
+ }
+
+ static void mem_free(value_type* ptr)
+ {
+ PoolType::mem_free(ptr);
+ }
+
+private:
+ /** Add a new pool
+	@param n_pools Number of pools that existed when add_pool()
+	was called.
+ @return true on success */
+ bool add_pool(size_t n_pools)
+ {
+ bool added = false;
+
+ m_lock_strategy.enter();
+
+ if (n_pools < m_pools.size()) {
+ /* Some other thread already added a pool. */
+ added = true;
+ } else {
+ PoolType* pool;
+
+ ut_ad(n_pools == m_pools.size());
+
+ pool = UT_NEW_NOKEY(PoolType(m_size));
+
+ if (pool != NULL) {
+
+ ut_ad(n_pools <= m_pools.size());
+
+ m_pools.push_back(pool);
+
+ ib::info() << "Number of pools: "
+ << m_pools.size();
+
+ added = true;
+ }
+ }
+
+ ut_ad(n_pools < m_pools.size() || !added);
+
+ m_lock_strategy.exit();
+
+ return(added);
+ }
+
+ /** Create the pool manager. */
+ void create()
+ {
+ ut_a(m_size > sizeof(value_type));
+ m_lock_strategy.create();
+
+ add_pool(0);
+ }
+
+ /** Release the resources. */
+ void destroy()
+ {
+ typename Pools::iterator it;
+ typename Pools::iterator end = m_pools.end();
+
+ for (it = m_pools.begin(); it != end; ++it) {
+ PoolType* pool = *it;
+
+ UT_DELETE(pool);
+ }
+
+ m_pools.clear();
+
+ m_lock_strategy.destroy();
+ }
+private:
+ // Disable copying
+ PoolManager(const PoolManager&);
+ PoolManager& operator=(const PoolManager&);
+
+ typedef std::vector<PoolType*, ut_allocator<PoolType*> > Pools;
+
+ /** Size of each block */
+ size_t m_size;
+
+	/** Pools managed by this manager */
+ Pools m_pools;
+
+ /** Lock strategy to use */
+ LockStrategy m_lock_strategy;
+};
+
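+/* Usage sketch (illustrative, reusing the placeholder types sketched
+after Pool above). PoolManager grows the set of pools on demand and
+forwards mem_free() to the pool that owns the object:
+
+  typedef PoolManager<MyPool, NoopLockStrategy> MyPoolManager;
+
+  MyPoolManager	mgr(4096);
+  MyObj*	obj = mgr.get();	// may add a pool if all are full
+  MyPoolManager::mem_free(obj);
+*/
+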
+#endif /* ut0pool_h */
diff --git a/storage/innobase/include/ut0rbt.h b/storage/innobase/include/ut0rbt.h
new file mode 100644
index 00000000..38071165
--- /dev/null
+++ b/storage/innobase/include/ut0rbt.h
@@ -0,0 +1,254 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/******************************************************************//**
+@file include/ut0rbt.h
+Red-black tree utilities
+
+Created 2007-03-20 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_UT0RBT_H
+#define INNOBASE_UT0RBT_H
+
+#if !defined(IB_RBT_TESTING)
+#include "ut0mem.h"
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define ut_malloc malloc
+#define ut_free free
+#define ulint unsigned long
+#define ut_a(c) assert(c)
+#define ut_error assert(0)
+#define ibool unsigned int
+#define TRUE 1
+#define FALSE 0
+#endif
+
+struct ib_rbt_node_t;
+typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node);
+typedef int (*ib_rbt_compare)(const void* p1, const void* p2);
+typedef int (*ib_rbt_arg_compare)(const void*, const void* p1, const void* p2);
+
+/** Red black tree color types */
+enum ib_rbt_color_t {
+ IB_RBT_RED,
+ IB_RBT_BLACK
+};
+
+/** Red black tree node */
+struct ib_rbt_node_t {
+ ib_rbt_color_t color; /* color of this node */
+
+	ib_rbt_node_t*	left;		/* points to left child */
+	ib_rbt_node_t*	right;		/* points to right child */
+	ib_rbt_node_t*	parent;		/* points to parent node */
+
+ char value[1]; /* Data value */
+};
+
+/** Red black tree instance.*/
+struct ib_rbt_t {
+ ib_rbt_node_t* nil; /* Black colored node that is
+ used as a sentinel. This is
+ pre-allocated too.*/
+
+ ib_rbt_node_t* root; /* Root of the tree, this is
+ pre-allocated and the first
+ data node is the left child.*/
+
+ ulint n_nodes; /* Total number of data nodes */
+
+ ib_rbt_compare compare; /* Fn. to use for comparison */
+ ib_rbt_arg_compare
+ compare_with_arg; /* Fn. to use for comparison
+ with argument */
+ ulint sizeof_value; /* Sizeof the item in bytes */
+ void* cmp_arg; /* Compare func argument */
+};
+
+/** The result of searching for a key in the tree; this is useful for
+a speedy lookup and insert if the key doesn't exist. */
+struct ib_rbt_bound_t {
+ const ib_rbt_node_t*
+ last; /* Last node visited */
+
+ int result; /* Result of comparing with
+ the last non-nil node that
+ was visited */
+};
+
+/* Size in elements (t is an rb tree instance) */
+#define rbt_size(t) (t->n_nodes)
+
+/* Check whether the rb tree is empty (t is an rb tree instance) */
+#define rbt_empty(t) (rbt_size(t) == 0)
+
+/* Get data value (t is the data type, n is an rb tree node instance) */
+#define rbt_value(t, n) ((t*) &n->value[0])
+
+/* Compare a key with the node value (t is tree, k is key, n is node)*/
+#define rbt_compare(t, k, n) (t->compare(k, n->value))
+
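+/* Usage sketch (illustrative; cmp_int is a placeholder comparator):
+
+  static int cmp_int(const void* a, const void* b) {
+	  int x = *(const int*) a, y = *(const int*) b;
+	  return x < y ? -1 : x > y ? 1 : 0;
+  }
+
+  ib_rbt_t*	t = rbt_create(sizeof(int), cmp_int);
+  int		v = 42;
+  rbt_insert(t, &v, &v);
+
+  ib_rbt_bound_t	pos;
+  if (rbt_search(t, &pos, &v) == 0) {
+	  int*	found = rbt_value(int, pos.last);
+  }
+  rbt_free(t);
+*/
+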
+/**********************************************************************//**
+Free an instance of a red black tree */
+void
+rbt_free(
+/*=====*/
+ ib_rbt_t* tree); /*!< in: rb tree to free */
+/**********************************************************************//**
+Create an instance of a red black tree
+@return rb tree instance */
+ib_rbt_t*
+rbt_create(
+/*=======*/
+ size_t sizeof_value, /*!< in: size in bytes */
+ ib_rbt_compare compare); /*!< in: comparator */
+/**********************************************************************//**
+Create an instance of a red black tree, whose comparison function takes
+an argument
+@return rb tree instance */
+ib_rbt_t*
+rbt_create_arg_cmp(
+/*===============*/
+ size_t sizeof_value, /*!< in: size in bytes */
+ ib_rbt_arg_compare
+ compare, /*!< in: comparator */
+ void* cmp_arg); /*!< in: compare fn arg */
+/**********************************************************************//**
+Delete a node from the red black tree, identified by key */
+ibool
+rbt_delete(
+/*=======*/
+ /* in: TRUE on success */
+ ib_rbt_t* tree, /* in: rb tree */
+ const void* key); /* in: key to delete */
+/**********************************************************************//**
+Remove a node from the red black tree. NOTE: This function will not free
+the node instance, THAT IS THE CALLER'S RESPONSIBILITY.
+@return the removed node, with the const cast away */
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t*
+ node); /*!< in: node to delete, this
+ is a fudge and declared const
+ because the caller has access
+ only to const nodes.*/
+/**********************************************************************//**
+Add data to the red black tree, identified by key (no dups yet!)
+@return inserted node */
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key, /*!< in: key for ordering */
+ const void* value); /*!< in: data that will be
+ copied to the node.*/
+/**********************************************************************//**
+Add a new node to the tree, useful for data that is pre-sorted.
+@return appended node */
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: parent */
+ const void* value); /*!< in: this value is copied
+ to the node */
+/**********************************************************************//**
+Return the left most data node in the tree
+@return left most node */
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+ const ib_rbt_t* tree); /*!< in: rb tree */
+/**********************************************************************//**
+Return the right most data node in the tree
+@return right most node */
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+ const ib_rbt_t* tree); /*!< in: rb tree */
+/**********************************************************************//**
+Return the next node from current.
+@return successor node to current that is passed in. */
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* /* in: current node */
+ current);
+/**********************************************************************//**
+Return the prev node from current.
+@return predecessor node to current that is passed in */
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* /* in: current node */
+ current);
+/**********************************************************************//**
+Search for the key; a node will be returned in parent.last, whether it
+was found or not. If not found, then parent.last will contain the
+parent node for the possibly new key, otherwise the matching node.
+@return result of last comparison */
+int
+rbt_search(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key); /*!< in: key to search */
+/**********************************************************************//**
+Search for the key; a node will be returned in parent.last, whether it
+was found or not. If not found, then parent.last will contain the
+parent node for the possibly new key, otherwise the matching node.
+@return result of last comparison */
+int
+rbt_search_cmp(
+/*===========*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key, /*!< in: key to search */
+ ib_rbt_compare compare, /*!< in: comparator */
+ ib_rbt_arg_compare
+ arg_compare); /*!< in: fn to compare items
+ with argument */
+/**********************************************************************//**
+Merge the records from src into dst. Return the number of nodes merged.
+@return no. of recs merged */
+ulint
+rbt_merge_uniq(
+/*===========*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ const ib_rbt_t* src); /*!< in: src rb tree */
+#if defined UNIV_DEBUG || defined IB_RBT_TESTING
+/**********************************************************************//**
+Verify the integrity of the RB tree. For debugging.
+@return TRUE if OK, FALSE if the tree is invalid */
+ibool
+rbt_validate(
+/*=========*/
+ const ib_rbt_t* tree); /*!< in: tree to validate */
+#endif /* UNIV_DEBUG || IB_RBT_TESTING */
+
+#endif /* INNOBASE_UT0RBT_H */
diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h
new file mode 100644
index 00000000..5b1ae5bc
--- /dev/null
+++ b/storage/innobase/include/ut0rnd.h
@@ -0,0 +1,137 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0rnd.h
+Random numbers and hashing
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0rnd_h
+#define ut0rnd_h
+
+#include "ut0byte.h"
+#include <my_sys.h>
+
+#ifndef UNIV_INNOCHECKSUM
+/** Seed value of ut_rnd_gen() */
+extern std::atomic<uint32_t> ut_rnd_current;
+
+/** @return a pseudo-random 32-bit number */
+inline uint32_t ut_rnd_gen()
+{
+ /* This is a Galois linear-feedback shift register.
+ https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Galois_LFSRs
+ The generating primitive Galois Field polynomial is the Castagnoli
+ polynomial that was made popular by CRC-32C:
+ x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+
+ x^19+x^18+x^14+x^13+x^11+x^10+x^9+x^8+x^6+1 */
+ const uint32_t crc32c= 0x1edc6f41;
+
+ uint32_t rnd= ut_rnd_current.load(std::memory_order_relaxed);
+
+ if (UNIV_UNLIKELY(rnd == 0))
+ {
+ rnd= static_cast<uint32_t>(my_interval_timer());
+ if (!rnd) rnd= 1;
+ }
+ else
+ {
+ bool lsb= rnd & 1;
+ rnd>>= 1;
+ if (lsb)
+ rnd^= crc32c;
+ }
+
+ ut_rnd_current.store(rnd, std::memory_order_relaxed);
+ return rnd;
+}
+
+/** @return a random number between 0 and n-1, inclusive */
+inline ulint ut_rnd_interval(ulint n)
+{
+ return n > 1 ? static_cast<ulint>(ut_rnd_gen() % n) : 0;
+}
+
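+/* Usage sketch (illustrative; 'n_buckets' is a placeholder): pick one
+of n buckets pseudo-randomly.
+
+  const ulint	bucket = ut_rnd_interval(n_buckets);
+
+This yields 0 <= bucket < n_buckets for any n_buckets >= 1. */
+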
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime or some
+other random-looking number for the hashing to work reliably.
+@return hash value */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+ ulint key, /*!< in: value to be hashed */
+ ulint table_size); /*!< in: hash table size */
+/*************************************************************//**
+Folds a 64-bit integer.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ull(
+/*========*/
+ ib_uint64_t d) /*!< in: 64-bit integer */
+ MY_ATTRIBUTE((const));
+/*************************************************************//**
+Folds a character string ending in the null character.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_string(
+/*===========*/
+ const char* str) /*!< in: null-terminated string */
+ MY_ATTRIBUTE((warn_unused_result));
+/***********************************************************//**
+Looks for a prime number slightly greater than the given argument.
+The prime is chosen so that it is not near any power of 2.
+@return prime */
+ulint
+ut_find_prime(
+/*==========*/
+ ulint n) /*!< in: positive number > 100 */
+ MY_ATTRIBUTE((const));
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+ ulint n1, /*!< in: ulint */
+ ulint n2) /*!< in: ulint */
+ MY_ATTRIBUTE((const));
+/*************************************************************//**
+Folds a binary string.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+ const byte* str, /*!< in: string of bytes */
+ ulint len) /*!< in: length */
+ MY_ATTRIBUTE((pure));
+
+#include "ut0rnd.ic"
+
+#endif
diff --git a/storage/innobase/include/ut0rnd.ic b/storage/innobase/include/ut0rnd.ic
new file mode 100644
index 00000000..c0105160
--- /dev/null
+++ b/storage/innobase/include/ut0rnd.ic
@@ -0,0 +1,150 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0rnd.ic
+Random numbers and hashing
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+#define UT_HASH_RANDOM_MASK 1463735687
+#define UT_HASH_RANDOM_MASK2 1653893711
+
+#ifndef UNIV_INNOCHECKSUM
+
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime
+or some random number for the hash table to work reliably.
+@return hash value */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+ ulint key, /*!< in: value to be hashed */
+ ulint table_size) /*!< in: hash table size */
+{
+ ut_ad(table_size);
+ key = key ^ UT_HASH_RANDOM_MASK2;
+
+ return(key % table_size);
+}
+
+/*************************************************************//**
+Folds a 64-bit integer.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ull(
+/*========*/
+ ib_uint64_t d) /*!< in: 64-bit integer */
+{
+ return(ut_fold_ulint_pair((ulint) d & ULINT32_MASK,
+ (ulint) (d >> 32)));
+}
+
+/*************************************************************//**
+Folds a character string ending in the null character.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_string(
+/*===========*/
+ const char* str) /*!< in: null-terminated string */
+{
+ ulint fold = 0;
+
+ ut_ad(str);
+
+ while (*str != '\0') {
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str));
+ str++;
+ }
+
+ return(fold);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+ ulint n1, /*!< in: ulint */
+ ulint n2) /*!< in: ulint */
+{
+ return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1)
+ ^ UT_HASH_RANDOM_MASK) + n2);
+}
+
+/*************************************************************//**
+Folds a binary string.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+ const byte* str, /*!< in: string of bytes */
+ ulint len) /*!< in: length */
+{
+ ulint fold = 0;
+ const byte* str_end = str + (len & 0xFFFFFFF8);
+
+ ut_ad(str || !len);
+
+ while (str < str_end) {
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ }
+
+ switch (len & 0x7) {
+ case 7:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ /* fall through */
+ case 6:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ /* fall through */
+ case 5:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ /* fall through */
+ case 4:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ /* fall through */
+ case 3:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ /* fall through */
+ case 2:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ /* fall through */
+ case 1:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ }
+
+ return(fold);
+}
diff --git a/storage/innobase/include/ut0sort.h b/storage/innobase/include/ut0sort.h
new file mode 100644
index 00000000..4f1d4c04
--- /dev/null
+++ b/storage/innobase/include/ut0sort.h
@@ -0,0 +1,104 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0sort.h
+Sort utility
+
+Created 11/9/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0sort_h
+#define ut0sort_h
+
+/* This module gives a macro definition of the body of
+a standard sort function for an array of elements of any
+type. The comparison function is given as a parameter to
+the macro. The sort algorithm is mergesort, which has an O(n log n)
+worst case.
+*/
+
+/*******************************************************************//**
+This macro expands to the body of a standard sort function.
+The sort function uses mergesort and must be defined separately
+for each type of array.
+Also the comparison function has to be defined individually
+for each array cell type. SORT_FUN is the sort function name.
+The function takes the array to be sorted (ARR),
+the array of auxiliary space (AUX_ARR) of same size,
+and the low (LOW), inclusive, and high (HIGH), noninclusive,
+limits for the sort interval as arguments.
+CMP_FUN is the comparison function name. It takes as arguments
+two elements from the array and returns 1 if the first is bigger,
+0 if they are equal, and -1 if the second is bigger. */
+
+#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\
+{\
+ ulint ut_sort_mid77;\
+ ulint ut_sort_i77;\
+ ulint ut_sort_low77;\
+ ulint ut_sort_high77;\
+\
+ ut_ad((LOW) < (HIGH));\
+ ut_ad(ARR);\
+ ut_ad(AUX_ARR);\
+\
+ if ((LOW) == (HIGH) - 1) {\
+ return;\
+ } else if ((LOW) == (HIGH) - 2) {\
+ if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\
+ (AUX_ARR)[LOW] = (ARR)[LOW];\
+ (ARR)[LOW] = (ARR)[(HIGH) - 1];\
+ (ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\
+ }\
+ return;\
+ }\
+\
+ ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\
+\
+ SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\
+ SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\
+\
+ ut_sort_low77 = (LOW);\
+ ut_sort_high77 = ut_sort_mid77;\
+\
+ for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\
+\
+ if (ut_sort_low77 >= ut_sort_mid77) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+ ut_sort_high77++;\
+ } else if (ut_sort_high77 >= (HIGH)) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+ ut_sort_low77++;\
+ } else if (CMP_FUN((ARR)[ut_sort_low77],\
+ (ARR)[ut_sort_high77]) > 0) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+ ut_sort_high77++;\
+ } else {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+ ut_sort_low77++;\
+ }\
+ }\
+\
+ memcpy((void*) ((ARR) + (LOW)), (AUX_ARR) + (LOW),\
+ ((HIGH) - (LOW)) * sizeof *(ARR));\
+}\
+
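+/* Usage sketch (illustrative; ulint_sort and ulint_cmp are
+placeholders): define a mergesort for an array of ulints with the
+macro body.
+
+  static int ulint_cmp(ulint a, ulint b) {
+	  return a > b ? 1 : a < b ? -1 : 0;
+  }
+
+  static void ulint_sort(ulint* arr, ulint* aux_arr,
+			 ulint low, ulint high)
+  {
+	  UT_SORT_FUNCTION_BODY(ulint_sort, arr, aux_arr, low, high,
+				ulint_cmp);
+  }
+
+The auxiliary array must be at least as large as the sorted interval. */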
+
+#endif
+
diff --git a/storage/innobase/include/ut0stage.h b/storage/innobase/include/ut0stage.h
new file mode 100644
index 00000000..17fbd91b
--- /dev/null
+++ b/storage/innobase/include/ut0stage.h
@@ -0,0 +1,499 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ut0stage.h
+Supplementary code to performance schema stage instrumentation.
+
+Created Nov 12, 2014 Vasil Dimov
+*******************************************************/
+
+#ifndef ut0stage_h
+#define ut0stage_h
+
+#include <algorithm>
+#include <math.h>
+
+#include "my_global.h" /* needed for headers from mysql/psi/ */
+
+#include "mysql/psi/mysql_stage.h" /* mysql_stage_inc_work_completed */
+#include "mysql/psi/psi.h" /* HAVE_PSI_STAGE_INTERFACE, PSI_stage_progress */
+
+#include "dict0mem.h" /* dict_index_t */
+#include "row0log.h" /* row_log_estimate_work() */
+#include "srv0srv.h" /* ut_stage_alter_t */
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+
+/** Class used to report ALTER TABLE progress via performance_schema.
+The only user of this class is the ALTER TABLE code and it calls the methods
+in the following order
+constructor
+begin_phase_read_pk()
+ multiple times:
+ n_pk_recs_inc() // once per record read
+ inc() // once per page read
+end_phase_read_pk()
+if any new indexes are being added, for each one:
+ begin_phase_sort()
+ multiple times:
+ inc() // once per record sorted
+ begin_phase_insert()
+ multiple times:
+ inc() // once per record inserted
+	begin_phase_log_index()
+ multiple times:
+ inc() // once per log-block applied
+begin_phase_log_table()
+ multiple times:
+ inc() // once per log-block applied
+begin_phase_end()
+destructor
+
+This class knows the specifics of each phase and tries to increment the
+progress in an even manner across the entire ALTER TABLE lifetime. */
+class ut_stage_alter_t {
+public:
+ /** Constructor.
+ @param[in] pk primary key of the old table */
+ explicit
+ ut_stage_alter_t(
+ const dict_index_t* pk)
+ :
+ m_progress(NULL),
+ m_pk(pk),
+ m_n_pk_recs(0),
+ m_n_pk_pages(0),
+ m_n_recs_processed(0),
+ m_cur_phase(NOT_STARTED)
+ {
+ }
+
+ /** Destructor. */
+ ~ut_stage_alter_t();
+
+ /** Flag an ALTER TABLE start (read primary key phase).
+ @param[in] n_sort_indexes number of indexes that will be sorted
+ during ALTER TABLE, used for estimating the total work to be done */
+ void
+ begin_phase_read_pk(
+ ulint n_sort_indexes);
+
+	/** Increment the number of records in PK (table) by 1.
+	This is used to get a more accurate estimate of the number of
+	records per page, which is needed because some phases work on a
+	per-page basis while some work on a per-record basis and we want
+	the progress to be as even as possible. */
+ void
+ n_pk_recs_inc();
+
+ /** Flag either one record or one page processed, depending on the
+ current phase.
+ @param[in] inc_val flag this many units processed at once */
+ void
+ inc(
+ ulint inc_val = 1);
+
+ /** Flag the end of reading of the primary key.
+ Here we know the exact number of pages and records and calculate
+ the number of records per page and refresh the estimate. */
+ void
+ end_phase_read_pk();
+
+ /** Flag the beginning of the sort phase.
+ @param[in] sort_multi_factor since merge sort processes
+ one page more than once we only update the estimate once per this
+ many pages processed. */
+ void
+ begin_phase_sort(
+ double sort_multi_factor);
+
+ /** Flag the beginning of the insert phase. */
+ void
+ begin_phase_insert();
+
+ /** Flag the beginning of the log index phase. */
+ void
+ begin_phase_log_index();
+
+ /** Flag the beginning of the log table phase. */
+ void
+ begin_phase_log_table();
+
+ /** Flag the beginning of the end phase. */
+ void
+ begin_phase_end();
+
+private:
+
+ /** Update the estimate of total work to be done. */
+ void
+ reestimate();
+
+ /** Change the current phase.
+ @param[in] new_stage pointer to the new stage to change to */
+ void
+ change_phase(
+ const PSI_stage_info* new_stage);
+
+ /** Performance schema accounting object. */
+ PSI_stage_progress* m_progress;
+
+ /** Old table PK. Used for calculating the estimate. */
+ const dict_index_t* m_pk;
+
+ /** Number of records in the primary key (table), including delete
+ marked records. */
+ ulint m_n_pk_recs;
+
+ /** Number of leaf pages in the primary key. */
+ ulint m_n_pk_pages;
+
+ /** Estimated number of records per page in the primary key. */
+ double m_n_recs_per_page;
+
+ /** Number of indexes that are being added. */
+ ulint m_n_sort_indexes;
+
+ /** During the sort phase, increment the counter once per this
+ many pages processed. This is because sort processes one page more
+ than once. */
+ ulint m_sort_multi_factor;
+
+	/** Number of records processed during sort & insert phases. We
+	need to increment the counter only once per page, i.e. once per
+	recs-per-page records. */
+ ulint m_n_recs_processed;
+
+ /** Current phase. */
+ enum {
+ NOT_STARTED = 0,
+ READ_PK = 1,
+ SORT = 2,
+ INSERT = 3,
+ /* JAN: TODO: MySQL 5.7 vrs. MariaDB sql/log.h
+ LOG_INDEX = 5,
+ LOG_TABLE = 6, */
+ LOG_INNODB_INDEX = 5,
+ LOG_INNODB_TABLE = 6,
+ END = 7,
+ } m_cur_phase;
+};
+
+/** Destructor. */
+inline
+ut_stage_alter_t::~ut_stage_alter_t()
+{
+ if (m_progress == NULL) {
+ return;
+ }
+
+ /* Set completed = estimated before we quit. */
+ mysql_stage_set_work_completed(
+ m_progress,
+ mysql_stage_get_work_estimated(m_progress));
+
+ mysql_end_stage();
+}
+
+/** Flag an ALTER TABLE start (read primary key phase).
+@param[in] n_sort_indexes number of indexes that will be sorted
+during ALTER TABLE, used for estimating the total work to be done */
+inline
+void
+ut_stage_alter_t::begin_phase_read_pk(
+ ulint n_sort_indexes)
+{
+ m_n_sort_indexes = n_sort_indexes;
+
+ m_cur_phase = READ_PK;
+
+ m_progress = mysql_set_stage(
+ srv_stage_alter_table_read_pk_internal_sort.m_key);
+
+ mysql_stage_set_work_completed(m_progress, 0);
+ reestimate();
+}
+
+/** Increment the number of records in PK (table) by 1.
+This is used to get a more accurate estimate of the number of
+records per page, which is needed because some phases work on a
+per-page basis while some work on a per-record basis and we want
+the progress to be as even as possible. */
+inline
+void
+ut_stage_alter_t::n_pk_recs_inc()
+{
+ m_n_pk_recs++;
+}
+
+/** Flag either one record or one page processed, depending on the
+current phase. */
+inline
+void
+ut_stage_alter_t::inc(ulint inc_val)
+{
+ if (m_progress == NULL) {
+ return;
+ }
+
+ ulint multi_factor = 1;
+ bool should_proceed = true;
+
+ switch (m_cur_phase) {
+ case NOT_STARTED:
+ ut_error;
+ case READ_PK:
+ m_n_pk_pages++;
+ ut_ad(inc_val == 1);
+		/* Overall, the read pk phase will read all the pages from
+		the PK and will do work proportional to the number of added
+		indexes; thus, when this is called once per read page, we
+		increment by 1 + m_n_sort_indexes. */
+ inc_val = 1 + m_n_sort_indexes;
+ break;
+ case SORT:
+ multi_factor = m_sort_multi_factor;
+ /* fall through */
+ case INSERT: {
+ /* Increment the progress every nth record. During
+ sort and insert phases, this method is called once per
+ record processed. We need fractional point numbers here
+ because "records per page" is such a number naturally and
+ to avoid rounding skew we want, for example: if there are
+ (double) N records per page, then the work_completed
+ should be incremented on the inc() calls round(k*N),
+ for k=1,2,3... */
+ const double every_nth = m_n_recs_per_page *
+ static_cast<double>(multi_factor);
+
+ const ulint k = static_cast<ulint>(
+ round(static_cast<double>(m_n_recs_processed) /
+ every_nth));
+
+ const ulint nth = static_cast<ulint>(
+ round(static_cast<double>(k) * every_nth));
+
+ should_proceed = m_n_recs_processed == nth;
+
+ m_n_recs_processed++;
+
+ break;
+ }
+ /* JAN: TODO: MySQL 5.7
+ case LOG_INDEX:
+ break;
+ case LOG_TABLE:
+ break; */
+ case LOG_INNODB_INDEX:
+ case LOG_INNODB_TABLE:
+ break;
+ case END:
+ break;
+ }
+
+ if (should_proceed) {
+ mysql_stage_inc_work_completed(m_progress, inc_val);
+ reestimate();
+ }
+}
+
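+/* Worked example (illustrative): with m_n_recs_per_page == 3.5 and
+multi_factor == 1, every_nth == 3.5, so work_completed is incremented
+on the calls where m_n_recs_processed equals round(k * 3.5), i.e. at
+0, 4, 7, 11, 14, ..., which avoids the rounding skew of a fixed
+integer step. */
+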
+/** Flag the end of reading of the primary key.
+Here we know the exact number of pages and records and calculate
+the number of records per page and refresh the estimate. */
+inline
+void
+ut_stage_alter_t::end_phase_read_pk()
+{
+ reestimate();
+
+ if (m_n_pk_pages == 0) {
+ /* The number of pages in the PK could be 0 if the tree is
+ empty. In this case we set m_n_recs_per_page to 1 to avoid
+ division by zero later. */
+ m_n_recs_per_page = 1.0;
+ } else {
+ m_n_recs_per_page = std::max(
+ static_cast<double>(m_n_pk_recs)
+ / static_cast<double>(m_n_pk_pages),
+ 1.0);
+ }
+}
+
+/** Flag the beginning of the sort phase.
+@param[in] sort_multi_factor since merge sort processes
+one page more than once we only update the estimate once per this
+many pages processed. */
+inline
+void
+ut_stage_alter_t::begin_phase_sort(
+ double sort_multi_factor)
+{
+ if (sort_multi_factor <= 1.0) {
+ m_sort_multi_factor = 1;
+ } else {
+ m_sort_multi_factor = static_cast<ulint>(
+ round(sort_multi_factor));
+ }
+
+ change_phase(&srv_stage_alter_table_merge_sort);
+}
+
+/** Flag the beginning of the insert phase. */
+inline
+void
+ut_stage_alter_t::begin_phase_insert()
+{
+ change_phase(&srv_stage_alter_table_insert);
+}
+
+/** Flag the beginning of the log index phase. */
+inline
+void
+ut_stage_alter_t::begin_phase_log_index()
+{
+ change_phase(&srv_stage_alter_table_log_index);
+}
+
+/** Flag the beginning of the log table phase. */
+inline
+void
+ut_stage_alter_t::begin_phase_log_table()
+{
+ change_phase(&srv_stage_alter_table_log_table);
+}
+
+/** Flag the beginning of the end phase. */
+inline
+void
+ut_stage_alter_t::begin_phase_end()
+{
+ change_phase(&srv_stage_alter_table_end);
+}
+
+/** Update the estimate of total work to be done. */
+inline
+void
+ut_stage_alter_t::reestimate()
+{
+ if (m_progress == NULL) {
+ return;
+ }
+
+ /* During the log table phase we calculate the estimate as
+ work done so far + log size remaining. */
+ if (m_cur_phase == LOG_INNODB_TABLE) {
+ mysql_stage_set_work_estimated(
+ m_progress,
+ mysql_stage_get_work_completed(m_progress)
+ + row_log_estimate_work(m_pk));
+ return;
+ }
+
+ /* During the other phases we use a formula, regardless of
+ how much work has been done so far. */
+
+ /* For number of pages in the PK - if the PK has not been
+ read yet, use stat_n_leaf_pages (approximate), otherwise
+ use the exact number we gathered. */
+ const ulint n_pk_pages
+ = m_cur_phase != READ_PK
+ ? m_n_pk_pages
+ : m_pk->stat_n_leaf_pages;
+
+ ulonglong estimate __attribute__((unused))
+ = n_pk_pages
+ * (1 /* read PK */
+ + m_n_sort_indexes /* row_merge_buf_sort() inside the
+ read PK per created index */
+ + m_n_sort_indexes * 2 /* sort & insert per created index */)
+ + row_log_estimate_work(m_pk);
+
+ /* Prevent estimate < completed */
+ estimate = std::max(estimate,
+ mysql_stage_get_work_completed(m_progress));
+
+ mysql_stage_set_work_estimated(m_progress, estimate);
+}
+
+/** Change the current phase.
+@param[in] new_stage pointer to the new stage to change to */
+inline
+void
+ut_stage_alter_t::change_phase(
+ const PSI_stage_info* new_stage)
+{
+ if (m_progress == NULL) {
+ return;
+ }
+
+ if (new_stage == &srv_stage_alter_table_read_pk_internal_sort) {
+ m_cur_phase = READ_PK;
+ } else if (new_stage == &srv_stage_alter_table_merge_sort) {
+ m_cur_phase = SORT;
+ } else if (new_stage == &srv_stage_alter_table_insert) {
+ m_cur_phase = INSERT;
+ /* JAN: TODO: MySQL 5.7 used LOG_INDEX and LOG_TABLE */
+ } else if (new_stage == &srv_stage_alter_table_log_index) {
+ m_cur_phase = LOG_INNODB_INDEX;
+ } else if (new_stage == &srv_stage_alter_table_log_table) {
+ m_cur_phase = LOG_INNODB_TABLE;
+ } else if (new_stage == &srv_stage_alter_table_end) {
+ m_cur_phase = END;
+ } else {
+ ut_error;
+ }
+
+ const ulonglong c = mysql_stage_get_work_completed(m_progress);
+ const ulonglong e = mysql_stage_get_work_estimated(m_progress);
+
+ m_progress = mysql_set_stage(new_stage->m_key);
+
+ mysql_stage_set_work_completed(m_progress, c);
+ mysql_stage_set_work_estimated(m_progress, e);
+}
+#else /* HAVE_PSI_STAGE_INTERFACE */
+
+class ut_stage_alter_t {
+public:
+ explicit ut_stage_alter_t(const dict_index_t*) {}
+
+ void begin_phase_read_pk(ulint) {}
+
+ void n_pk_recs_inc() {}
+
+ void inc() {}
+ void inc(ulint) {}
+
+ void end_phase_read_pk() {}
+
+ void begin_phase_sort(double) {}
+
+ void begin_phase_insert() {}
+
+ void begin_phase_log_index() {}
+
+ void begin_phase_log_table() {}
+
+ void begin_phase_end() {}
+};
+
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+#endif /* ut0stage_h */
diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h
new file mode 100644
index 00000000..9f11944c
--- /dev/null
+++ b/storage/innobase/include/ut0ut.h
@@ -0,0 +1,453 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0ut.h
+Various utilities
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0ut_h
+#define ut0ut_h
+
+/* Do not include univ.i because univ.i includes this. */
+
+#include <ostream>
+#include <sstream>
+#include <string.h>
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "db0err.h"
+
+#include <time.h>
+
+#ifndef MYSQL_SERVER
+#include <ctype.h>
+#endif /* MYSQL_SERVER */
+
+#include <stdarg.h>
+
+#include <string>
+
+/** Index name prefix in fast index creation, as a string constant */
+#define TEMP_INDEX_PREFIX_STR "\377"
+
+#define ut_max std::max
+#define ut_min std::min
+
+/** Calculate the minimum of two pairs.
+@param[out] min_hi MSB of the minimum pair
+@param[out] min_lo LSB of the minimum pair
+@param[in] a_hi MSB of the first pair
+@param[in] a_lo LSB of the first pair
+@param[in] b_hi MSB of the second pair
+@param[in] b_lo LSB of the second pair */
+UNIV_INLINE
+void
+ut_pair_min(
+ ulint* min_hi,
+ ulint* min_lo,
+ ulint a_hi,
+ ulint a_lo,
+ ulint b_hi,
+ ulint b_lo);
+/******************************************************//**
+Compares two ulints.
+@return 1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+ ulint a, /*!< in: ulint */
+ ulint b); /*!< in: ulint */
+/** Compare two pairs of integers.
+@param[in] a_h more significant part of first pair
+@param[in] a_l less significant part of first pair
+@param[in] b_h more significant part of second pair
+@param[in] b_l less significant part of second pair
+@return comparison result of (a_h,a_l) and (b_h,b_l)
+@retval -1 if (a_h,a_l) is less than (b_h,b_l)
+@retval 0 if (a_h,a_l) is equal to (b_h,b_l)
+@retval 1 if (a_h,a_l) is greater than (b_h,b_l) */
+UNIV_INLINE
+int
+ut_pair_cmp(
+ ulint a_h,
+ ulint a_l,
+ ulint b_h,
+ ulint b_l)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*************************************************************//**
+Calculates fast the remainder of n/m when m is a power of two.
+@param n in: numerator
+@param m in: denominator, must be a power of two
+@return the remainder of n/m */
+template <typename T> inline T ut_2pow_remainder(T n, T m){return n & (m - 1);}
+/*************************************************************//**
+Calculates the biggest multiple of m that is not bigger than n
+when m is a power of two. In other words, rounds n down to m * k.
+@param n in: number to round down
+@param m in: alignment, must be a power of two
+@return n rounded down to the biggest possible integer multiple of m */
+template <typename T> inline T ut_2pow_round(T n, T m) { return n & ~(m - 1); }
+/********************************************************//**
+Calculates the smallest multiple of m that is not smaller than n
+when m is a power of two. In other words, rounds n up to m * k.
+@param n in: number to round up
+@param m in: alignment, must be a power of two
+@return n rounded up to the smallest possible integer multiple of m */
+#define UT_CALC_ALIGN(n, m) (((n) + (m) - 1) & ~((m) - 1))
+template <typename T> inline T ut_calc_align(T n, T m)
+{ return static_cast<T>(UT_CALC_ALIGN(n, m)); }
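A quick sketch of the three power-of-two helpers above, using InnoDB's debug
assert (illustrative; m must be a power of two as documented):

    ut_ad(ut_2pow_remainder<ulint>(13, 8) == 5);  /* 13 % 8 */
    ut_ad(ut_2pow_round<ulint>(13, 8) == 8);      /* round down to a multiple of 8 */
    ut_ad(ut_calc_align<ulint>(13, 8) == 16);     /* round up to a multiple of 8 */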
+
+/*************************************************************//**
+Quickly calculates the base-2 logarithm of a number, rounded up to an
+integer.
+@return logarithm in the base 2, rounded upward */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+ ulint n); /*!< in: number */
+/*************************************************************//**
+Calculates 2 to power n.
+@return 2 to power n */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+ ulint n); /*!< in: number */
+
+/**********************************************************//**
+Returns the number of milliseconds since some epoch. The
+value may wrap around. It should only be used for heuristic
+purposes.
+@return ms since epoch */
+ulint
+ut_time_ms(void);
+/*============*/
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Determine how many bytes (groups of 8 bits) are needed to
+store the given number of bits.
+@param b in: bits
+@return number of bytes (octets) needed to represent b */
+#define UT_BITS_IN_BYTES(b) (((b) + 7) >> 3)
+
+/** Determines if a number is zero or a power of two.
+@param[in] n number
+@return nonzero if n is zero or a power of two; zero otherwise */
+#define ut_is_2pow(n) (!((n) & ((n) - 1)))
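Illustrative values for these two macros (a sketch using InnoDB's debug assert):

    ut_ad(UT_BITS_IN_BYTES(1) == 1);   /* one bit still needs a whole octet */
    ut_ad(UT_BITS_IN_BYTES(9) == 2);
    ut_ad(ut_is_2pow(8));              /* a power of two */
    ut_ad(!ut_is_2pow(12));
    ut_ad(ut_is_2pow(0));              /* zero is deliberately included */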
+
+/** Functor that compares two C strings. Can be used as a comparator for
+e.g. std::map that uses char* as keys. */
+struct ut_strcmp_functor
+{
+ bool operator()(
+ const char* a,
+ const char* b) const
+ {
+ return(strcmp(a, b) < 0);
+ }
+};
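A minimal usage sketch, assuming <map> is available and the key strings
outlive the map:

    std::map<const char*, int, ut_strcmp_functor> m;
    m.insert(std::make_pair("alpha", 1));  /* ordered by strcmp(), */
    m.insert(std::make_pair("beta", 2));   /* not by pointer value */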
+
+/**********************************************************//**
+Prints a timestamp to a file. */
+void
+ut_print_timestamp(
+/*===============*/
+ FILE* file) /*!< in: file where to print */
+ ATTRIBUTE_COLD __attribute__((nonnull));
+
+#ifndef UNIV_INNOCHECKSUM
+
+/**********************************************************//**
+Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
+void
+ut_sprintf_timestamp(
+/*=================*/
+ char* buf); /*!< in: buffer where to sprintf */
+
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ascii. */
+void
+ut_print_buf(
+/*=========*/
+ FILE* file, /*!< in: file where to print */
+ const void* buf, /*!< in: memory buffer */
+ ulint len); /*!< in: length of the buffer */
+
+/*************************************************************//**
+Prints the contents of a memory buffer in hex. */
+void
+ut_print_buf_hex(
+/*=============*/
+ std::ostream& o, /*!< in/out: output stream */
+ const void* buf, /*!< in: memory buffer */
+ ulint len) /*!< in: length of the buffer */
+ MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ascii. */
+void
+ut_print_buf(
+/*=========*/
+ std::ostream& o, /*!< in/out: output stream */
+ const void* buf, /*!< in: memory buffer */
+ ulint len) /*!< in: length of the buffer */
+ MY_ATTRIBUTE((nonnull));
+
+/* Forward declaration of transaction handle */
+struct trx_t;
+
+/** Get a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier.
+@param[in] trx transaction (NULL=no quotes)
+@param[in] name table name
+@return the name, quoted as an SQL identifier */
+std::string
+ut_get_name(
+ const trx_t* trx,
+ const char* name);
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+void
+ut_print_name(
+/*==========*/
+ FILE* ef, /*!< in: stream */
+ const trx_t* trx, /*!< in: transaction */
+ const char* name); /*!< in: table name to print */
+/** Format a table name, quoted as an SQL identifier.
+If the name contains a slash '/', the result will contain two
+identifiers separated by a period (.), as in SQL
+database_name.table_name.
+@see table_name_t
+@param[in] name table or index name
+@param[out] formatted formatted result, will be NUL-terminated
+@param[in] formatted_size size of the buffer in bytes
+@return pointer to 'formatted' */
+char*
+ut_format_name(
+ const char* name,
+ char* formatted,
+ ulint formatted_size);
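A sketch of the intended call pattern (the buffer size here is arbitrary; the
output is quoted per the SQL-identifier rules described above, e.g. `test`.`t1`):

    char buf[100];
    ut_format_name("test/t1", buf, sizeof buf);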
+
+/**********************************************************************//**
+Catenate files. */
+void
+ut_copy_file(
+/*=========*/
+ FILE* dest, /*!< in: output file */
+ FILE* src); /*!< in: input file to be appended to output */
+
+/*************************************************************//**
+Convert an error number to a human readable text message. The
+returned string is static and should not be freed or modified.
+@return string, describing the error */
+const char*
+ut_strerr(
+/*======*/
+ dberr_t num); /*!< in: error number */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#ifdef UNIV_PFS_MEMORY
+
+/** Extract the basename of a file without its extension.
+For example, extract "foo0bar" out of "/path/to/foo0bar.cc".
+@param[in] file file path, e.g. "/path/to/foo0bar.cc"
+@param[out] base result, e.g. "foo0bar"
+@param[in] base_size size of the output buffer 'base', if there
+is not enough space, then the result will be truncated, but always
+'\0'-terminated
+@return number of characters that would have been printed if the size
+were unlimited (not including the final '\0') */
+size_t
+ut_basename_noext(
+ const char* file,
+ char* base,
+ size_t base_size);
+
+#endif /* UNIV_PFS_MEMORY */
+
+namespace ib {
+
+/** This is a wrapper class, used to print any unsigned integer type
+in hexadecimal format. The main purpose of this data type is to
+overload the global operator<<, so that we can print the given
+wrapper value in hex. */
+struct hex {
+ explicit hex(uintmax_t t): m_val(t) {}
+ const uintmax_t m_val;
+};
+
+/** This is an overload of the global operator<< for the user defined type
+ib::hex. The unsigned value held in the ib::hex wrapper class will be printed
+into the given output stream in hexadecimal format.
+@param[in,out] lhs the output stream into which rhs is written.
+@param[in] rhs the object to be written into lhs.
+@return reference to the output stream. */
+inline
+std::ostream&
+operator<<(
+ std::ostream& lhs,
+ const hex& rhs)
+{
+ std::ios_base::fmtflags ff = lhs.flags();
+ lhs << std::showbase << std::hex << rhs.m_val;
+	lhs.flags(ff);	/* fully restore the original format flags */
+ return(lhs);
+}
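A small usage sketch; only the wrapped value is printed in hexadecimal, and
the stream's format flags are restored afterwards:

    std::ostringstream os;
    os << ib::hex(255) << " " << 255;   /* yields "0xff 255" */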
+
+/** The class logger is the base class of all the error log related classes.
+It contains a std::ostringstream object. The main purpose of this class is
+to forward operator<< to the underlying std::ostringstream object. Do not
+use this class directly, instead use one of the derived classes. */
+class logger
+{
+protected:
+ /* This class must not be used directly */
+ ATTRIBUTE_COLD ATTRIBUTE_NOINLINE logger() {}
+public:
+ template<typename T> ATTRIBUTE_COLD ATTRIBUTE_NOINLINE
+ logger& operator<<(const T& rhs)
+ {
+ m_oss << rhs;
+ return *this;
+ }
+
+ /** Handle a fixed character string in the same way as a pointer to
+ an unknown-length character string, to reduce object code bloat. */
+ template<size_t N> logger& operator<<(const char (&rhs)[N])
+ { return *this << static_cast<const char*>(rhs); }
+
+ /** Output an error code name */
+ ATTRIBUTE_COLD logger& operator<<(dberr_t err);
+
+ /** Append a string.
+ @param buf string buffer
+ @param size buffer size
+ @return the output stream */
+ ATTRIBUTE_COLD __attribute__((noinline))
+ std::ostream &write(const char *buf, std::streamsize size)
+ {
+ return m_oss.write(buf, size);
+ }
+
+ std::ostream &write(const byte *buf, std::streamsize size)
+ { return write(reinterpret_cast<const char*>(buf), size); }
+
+ std::ostringstream m_oss;
+};
+
+/** The class info is used to emit informational log messages. It is to be
+used similar to std::cout. But the log messages will be emitted only when
+the dtor is called. The preferred usage of this class is to make use of
+unnamed temporaries as follows:
+
+info() << "The server started successfully.";
+
+In the above usage, the temporary object will be destroyed at the end of the
+statement and hence the log message will be emitted at the end of the
+statement. If a named object is created, then the log message will be emitted
+only when it goes out of scope or destroyed. */
+class info : public logger {
+public:
+ ATTRIBUTE_COLD
+ ~info();
+};
+
+/** The class warn is used to emit warnings. Refer to the documentation of
+class info for further details. */
+class warn : public logger {
+public:
+ ATTRIBUTE_COLD
+ ~warn();
+};
+
+/** The class error is used to emit error messages. Refer to the
+documentation of class info for further details. */
+class error : public logger {
+public:
+ ATTRIBUTE_COLD
+ ~error();
+ /** Indicates that error::~error() was invoked. Can be used to
+ determine if error messages were logged during innodb code execution.
+ @return true if there were error messages, false otherwise. */
+ static bool was_logged() { return logged; }
+
+private:
+ /** true if error::~error() was invoked, false otherwise */
+ static bool logged;
+};
+
+/** The class fatal is used to emit an error message and stop the server
+by crashing it. Use this class when MySQL server needs to be stopped
+immediately. Refer to the documentation of class info for usage details. */
+class fatal : public logger {
+public:
+ ATTRIBUTE_NORETURN
+ ~fatal();
+};
+
+/** Emit an error message if the given predicate is true, otherwise emit a
+warning message */
+class error_or_warn : public logger {
+public:
+ ATTRIBUTE_COLD
+ error_or_warn(bool pred)
+ : m_error(pred)
+ {}
+
+ ATTRIBUTE_COLD
+ ~error_or_warn();
+private:
+ const bool m_error;
+};
+
+/** Emit a fatal message if the given predicate is true, otherwise emit an
+error message. */
+class fatal_or_error : public logger {
+public:
+ ATTRIBUTE_COLD
+ fatal_or_error(bool pred)
+ : m_fatal(pred)
+ {}
+
+ ATTRIBUTE_COLD
+ ~fatal_or_error();
+private:
+ const bool m_fatal;
+};
+
+} // namespace ib
+
+#include "ut0ut.ic"
+
+#endif /* ut0ut_h */
+
diff --git a/storage/innobase/include/ut0ut.ic b/storage/innobase/include/ut0ut.ic
new file mode 100644
index 00000000..73feaf82
--- /dev/null
+++ b/storage/innobase/include/ut0ut.ic
@@ -0,0 +1,143 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0ut.ic
+Various utilities
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+#include <algorithm>
+
+/** Calculate the minimum of two pairs.
+@param[out] min_hi MSB of the minimum pair
+@param[out] min_lo LSB of the minimum pair
+@param[in] a_hi MSB of the first pair
+@param[in] a_lo LSB of the first pair
+@param[in] b_hi MSB of the second pair
+@param[in] b_lo LSB of the second pair */
+UNIV_INLINE
+void
+ut_pair_min(
+ ulint* min_hi,
+ ulint* min_lo,
+ ulint a_hi,
+ ulint a_lo,
+ ulint b_hi,
+ ulint b_lo)
+{
+ if (a_hi == b_hi) {
+ *min_hi = a_hi;
+ *min_lo = std::min(a_lo, b_lo);
+ } else if (a_hi < b_hi) {
+ *min_hi = a_hi;
+ *min_lo = a_lo;
+ } else {
+ *min_hi = b_hi;
+ *min_lo = b_lo;
+ }
+}
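For example, the more significant part alone decides here:

    ulint hi, lo;
    ut_pair_min(&hi, &lo, 1, 900, 2, 3);  /* hi == 1, lo == 900 */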
+
+/******************************************************//**
+Compares two ulints.
+@return 1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+ ulint a, /*!< in: ulint */
+ ulint b) /*!< in: ulint */
+{
+ if (a < b) {
+ return(-1);
+ } else if (a == b) {
+ return(0);
+ } else {
+ return(1);
+ }
+}
+
+/** Compare two pairs of integers.
+@param[in] a_h more significant part of first pair
+@param[in] a_l less significant part of first pair
+@param[in] b_h more significant part of second pair
+@param[in] b_l less significant part of second pair
+@return comparison result of (a_h,a_l) and (b_h,b_l)
+@retval -1 if (a_h,a_l) is less than (b_h,b_l)
+@retval 0 if (a_h,a_l) is equal to (b_h,b_l)
+@retval 1 if (a_h,a_l) is greater than (b_h,b_l) */
+UNIV_INLINE
+int
+ut_pair_cmp(
+ ulint a_h,
+ ulint a_l,
+ ulint b_h,
+ ulint b_l)
+{
+ if (a_h < b_h) {
+ return(-1);
+ }
+ if (a_h > b_h) {
+ return(1);
+ }
+ return(ut_ulint_cmp(a_l, b_l));
+}
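Illustrative results (the high part decides first, then the low part):

    ut_ad(ut_pair_cmp(1, 5, 2, 0) == -1);
    ut_ad(ut_pair_cmp(2, 0, 2, 0) == 0);
    ut_ad(ut_pair_cmp(2, 1, 2, 0) == 1);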
+
+/*************************************************************//**
+Quickly calculates the base-2 logarithm of a number, rounded up to an
+integer.
+@return logarithm in the base 2, rounded upward */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+ ulint n) /*!< in: number != 0 */
+{
+ ulint res;
+
+ res = 0;
+
+ ut_ad(n > 0);
+
+ n = n - 1;
+
+ for (;;) {
+ n = n / 2;
+
+ if (n == 0) {
+ break;
+ }
+
+ res++;
+ }
+
+ return(res + 1);
+}
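Illustrative values produced by the loop above:

    ut_ad(ut_2_log(2) == 1);
    ut_ad(ut_2_log(3) == 2);
    ut_ad(ut_2_log(5) == 3);  /* ceil(log2(5)) */
    ut_ad(ut_2_log(1) == 1);  /* note: also 1, not 0, by construction */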
+
+/*************************************************************//**
+Calculates 2 to power n.
+@return 2 to power n */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+ ulint n) /*!< in: number */
+{
+ return((ulint) 1 << n);
+}
diff --git a/storage/innobase/include/ut0vec.h b/storage/innobase/include/ut0vec.h
new file mode 100644
index 00000000..cfdaee60
--- /dev/null
+++ b/storage/innobase/include/ut0vec.h
@@ -0,0 +1,285 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.h
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#ifndef IB_VECTOR_H
+#define IB_VECTOR_H
+
+#include "mem0mem.h"
+
+struct ib_alloc_t;
+struct ib_vector_t;
+
+typedef void* (*ib_mem_alloc_t)(
+ /* out: Pointer to allocated memory */
+ ib_alloc_t* allocator, /* in: Pointer to allocator instance */
+ ulint size); /* in: Number of bytes to allocate */
+
+typedef void (*ib_mem_free_t)(
+ ib_alloc_t* allocator, /* in: Pointer to allocator instance */
+ void* ptr); /* in: Memory to free */
+
+typedef void* (*ib_mem_resize_t)(
+ /* out: Pointer to resized memory */
+ ib_alloc_t* allocator, /* in: Pointer to allocator */
+ void* ptr, /* in: Memory to resize */
+ ulint old_size, /* in: Old memory size in bytes */
+ ulint new_size); /* in: New size in bytes */
+
+typedef int (*ib_compare_t)(const void*, const void*);
+
+/* An automatically resizing vector data type with the following property:
+
+ - All memory allocation is done through an allocator, which is responsible
+ for freeing it when done with the vector.
+*/
+
+/* This is useful shorthand for elements of type void* */
+#define ib_vector_getp(v, n) (*(void**) ib_vector_get(v, n))
+#define ib_vector_getp_const(v, n) (*(void**) ib_vector_get_const(v, n))
+
+#define ib_vector_allocator(v) (v->allocator)
+
+/********************************************************************
+Create a new vector with the given initial size. */
+ib_vector_t*
+ib_vector_create(
+/*=============*/
+ /* out: vector */
+ ib_alloc_t* alloc, /* in: Allocator */
+ /* in: size of the data item */
+ ulint sizeof_value,
+ ulint size); /* in: initial size */
+
+/********************************************************************
+Destroy the vector. Make sure the vector owns the allocator, e.g.,
+the heap in the heap allocator. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+ ib_vector_t* vec); /* in/out: vector */
+
+/********************************************************************
+Push a new element onto the vector, increasing its size if necessary;
+if elem is not NULL, then elem is copied into the vector. */
+UNIV_INLINE
+void*
+ib_vector_push(
+/*===========*/
+ /* out: pointer the "new" element */
+ ib_vector_t* vec, /* in/out: vector */
+ const void* elem); /* in: data element */
+
+/********************************************************************
+Pop the last element from the vector.*/
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+ /* out: pointer to the "new" element */
+ ib_vector_t* vec); /* in/out: vector */
+
+/*******************************************************************//**
+Remove an element from the vector.
+@return pointer to the "removed" element */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
+ ib_vector_t* vec, /*!< in: vector */
+ const void* elem); /*!< in: value to remove */
+
+/********************************************************************
+Get the number of elements in the vector. */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+ /* out: number of elements in vector */
+ const ib_vector_t* vec); /* in: vector */
+
+/********************************************************************
+Increase the size of the vector. */
+void
+ib_vector_resize(
+/*=============*/
+ /* out: number of elements in vector */
+ ib_vector_t* vec); /* in/out: vector */
+
+/********************************************************************
+Test whether a vector is empty or not.
+@return TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+ const ib_vector_t* vec); /*!< in: vector */
+
+/****************************************************************//**
+Get the n'th element.
+@return n'th element */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+ ib_vector_t* vec, /*!< in: vector */
+ ulint n); /*!< in: element index to get */
+
+/********************************************************************
+Const version of the get n'th element.
+@return n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+ const ib_vector_t* vec, /* in: vector */
+ ulint n); /* in: element index to get */
+/****************************************************************//**
+Get last element. The vector must not be empty.
+@return last element */
+UNIV_INLINE
+void*
+ib_vector_get_last(
+/*===============*/
+ ib_vector_t* vec); /*!< in: vector */
+/****************************************************************//**
+Set the n'th element. */
+UNIV_INLINE
+void
+ib_vector_set(
+/*==========*/
+ ib_vector_t* vec, /*!< in/out: vector */
+ ulint n, /*!< in: element index to set */
+ void* elem); /*!< in: data element */
+
+/********************************************************************
+Reset the vector size to 0 elements. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+ ib_vector_t* vec); /* in/out: vector */
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+void*
+ib_vector_last(
+/*===========*/
+ /* out: pointer to last element */
+ ib_vector_t* vec); /* in/out: vector */
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+ /* out: pointer to last element */
+ const ib_vector_t* vec); /* in: vector */
+
+/********************************************************************
+Sort the vector elements. */
+UNIV_INLINE
+void
+ib_vector_sort(
+/*===========*/
+ ib_vector_t* vec, /* in/out: vector */
+ ib_compare_t compare); /* in: the comparator to use for sort */
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+ ib_alloc_t* allocator, /* in: allocator */
+	void*		ptr);		/* in: memory to free */
+
+/********************************************************************
+The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+ /* out: pointer to allocated memory */
+ ib_alloc_t* allocator, /* in: allocator */
+ ulint size); /* in: size in bytes */
+
+/********************************************************************
+The default ib_vector_t heap resize. Since we can't resize the heap
+we have to copy the elements from the old ptr to the new ptr.
+Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_resize(
+/*===========*/
+ /* out: pointer to reallocated
+ memory */
+ ib_alloc_t* allocator, /* in: allocator */
+ void* old_ptr, /* in: pointer to memory */
+ ulint old_size, /* in: old size in bytes */
+ ulint new_size); /* in: new size in bytes */
+
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+ /* out: heap allocator instance */
+ mem_heap_t* heap); /* in: heap to use */
+
+/********************************************************************
+Free a heap allocator. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+	ib_alloc_t*	ib_ut_alloc);	/* in: alloc instance to free */
+
+/* Allocator used by ib_vector_t. */
+struct ib_alloc_t {
+ ib_mem_alloc_t mem_malloc; /* For allocating memory */
+ ib_mem_free_t mem_release; /* For freeing memory */
+ ib_mem_resize_t mem_resize; /* For resizing memory */
+ void* arg; /* Currently if not NULL then it
+ points to the heap instance */
+};
+
+/* See comment at beginning of file. */
+struct ib_vector_t {
+ ib_alloc_t* allocator; /* Allocator, because one size
+ doesn't fit all */
+ void* data; /* data elements */
+ ulint used; /* number of elements currently used */
+ ulint total; /* number of elements allocated */
+ /* Size of a data item */
+ ulint sizeof_value;
+};
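A minimal usage sketch, assuming a heap-backed allocator as implemented in
ut0vec.ic:

    mem_heap_t*  heap  = mem_heap_create(1024);
    ib_alloc_t*  alloc = ib_heap_allocator_create(heap);
    ib_vector_t* vec   = ib_vector_create(alloc, sizeof(ulint), 4);

    ulint v = 42;
    ib_vector_push(vec, &v);                    /* copies the value in */
    ulint* p = (ulint*) ib_vector_get(vec, 0);  /* *p == 42 */

    ib_vector_free(vec);                        /* frees the backing heap */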
+
+#include "ut0vec.ic"
+
+#endif /* IB_VECTOR_H */
diff --git a/storage/innobase/include/ut0vec.ic b/storage/innobase/include/ut0vec.ic
new file mode 100644
index 00000000..531f0f22
--- /dev/null
+++ b/storage/innobase/include/ut0vec.ic
@@ -0,0 +1,348 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.ic
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#define IB_VEC_OFFSET(v, i) ((v)->sizeof_value * (i))
+
+/********************************************************************
+The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+ ib_alloc_t* allocator, /* in: allocator */
+ ulint size) /* in: size in bytes */
+{
+ mem_heap_t* heap = (mem_heap_t*) allocator->arg;
+
+ return(mem_heap_alloc(heap, size));
+}
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+ ib_alloc_t* allocator UNIV_UNUSED, /* in: allocator */
+	void*		ptr UNIV_UNUSED)	/* in: memory to free */
+{
+ /* We can't free individual elements. */
+}
+
+/********************************************************************
+The default ib_vector_t heap resize. Since we can't resize the heap
+we have to copy the elements from the old ptr to the new ptr.
+We always assume new_size >= old_size, so the buffer won't overflow.
+Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_resize(
+/*===========*/
+ ib_alloc_t* allocator, /* in: allocator */
+ void* old_ptr, /* in: pointer to memory */
+ ulint old_size, /* in: old size in bytes */
+ ulint new_size) /* in: new size in bytes */
+{
+ void* new_ptr;
+ mem_heap_t* heap = (mem_heap_t*) allocator->arg;
+
+ ut_a(new_size >= old_size);
+ new_ptr = mem_heap_alloc(heap, new_size);
+ memcpy(new_ptr, old_ptr, old_size);
+
+ return(new_ptr);
+}
+
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+ mem_heap_t* heap) /* in: heap to use */
+{
+ ib_alloc_t* heap_alloc;
+
+ heap_alloc = (ib_alloc_t*) mem_heap_alloc(heap, sizeof(*heap_alloc));
+
+ heap_alloc->arg = heap;
+ heap_alloc->mem_release = ib_heap_free;
+ heap_alloc->mem_malloc = ib_heap_malloc;
+ heap_alloc->mem_resize = ib_heap_resize;
+
+ return(heap_alloc);
+}
+
+/********************************************************************
+Free a heap allocator. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+	ib_alloc_t*	ib_ut_alloc)	/* in: alloc instance to free */
+{
+ mem_heap_free((mem_heap_t*) ib_ut_alloc->arg);
+}
+
+/********************************************************************
+Get number of elements in vector. */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+ /* out: number of elements in vector*/
+ const ib_vector_t* vec) /* in: vector */
+{
+ return(vec->used);
+}
+
+/****************************************************************//**
+Get n'th element. */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+ ib_vector_t* vec, /*!< in: vector */
+ ulint n) /*!< in: element index to get */
+{
+ ut_a(n < vec->used);
+
+ return((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+}
+
+/********************************************************************
+Const version of the get n'th element.
+@return n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+ const ib_vector_t* vec, /* in: vector */
+ ulint n) /* in: element index to get */
+{
+ ut_a(n < vec->used);
+
+ return((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+}
+/****************************************************************//**
+Get last element. The vector must not be empty.
+@return last element */
+UNIV_INLINE
+void*
+ib_vector_get_last(
+/*===============*/
+ ib_vector_t* vec) /*!< in: vector */
+{
+ ut_a(vec->used > 0);
+
+ return((byte*) ib_vector_get(vec, vec->used - 1));
+}
+
+/****************************************************************//**
+Set the n'th element. */
+UNIV_INLINE
+void
+ib_vector_set(
+/*==========*/
+ ib_vector_t* vec, /*!< in/out: vector */
+ ulint n, /*!< in: element index to set */
+ void* elem) /*!< in: data element */
+{
+ void* slot;
+
+ ut_a(n < vec->used);
+
+ slot = ((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+ memcpy(slot, elem, vec->sizeof_value);
+}
+
+/********************************************************************
+Reset the vector size to 0 elements. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+ /* out: void */
+ ib_vector_t* vec) /* in: vector */
+{
+ vec->used = 0;
+}
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+void*
+ib_vector_last(
+/*===========*/
+ /* out: void */
+ ib_vector_t* vec) /* in: vector */
+{
+ ut_a(ib_vector_size(vec) > 0);
+
+ return(ib_vector_get(vec, ib_vector_size(vec) - 1));
+}
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+ /* out: void */
+ const ib_vector_t* vec) /* in: vector */
+{
+ ut_a(ib_vector_size(vec) > 0);
+
+ return(ib_vector_get_const(vec, ib_vector_size(vec) - 1));
+}
+
+/****************************************************************//**
+Remove the last element from the vector.
+@return last vector element */
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+ /* out: pointer to element */
+ ib_vector_t* vec) /* in: vector */
+{
+ void* elem;
+
+ ut_a(vec->used > 0);
+
+ elem = ib_vector_last(vec);
+ --vec->used;
+
+ return(elem);
+}
+
+/********************************************************************
+Append an element to the vector; if elem != NULL, then copy the data
+from elem. */
+UNIV_INLINE
+void*
+ib_vector_push(
+/*===========*/
+ /* out: pointer to the "new" element */
+ ib_vector_t* vec, /* in: vector */
+ const void* elem) /* in: element to add (can be NULL) */
+{
+ void* last;
+
+ if (vec->used >= vec->total) {
+ ib_vector_resize(vec);
+ }
+
+ last = (byte*) vec->data + IB_VEC_OFFSET(vec, vec->used);
+
+#ifdef UNIV_DEBUG
+ memset(last, 0, vec->sizeof_value);
+#endif
+
+ if (elem) {
+ memcpy(last, elem, vec->sizeof_value);
+ }
+
+ ++vec->used;
+
+ return(last);
+}
+
+/*******************************************************************//**
+Remove an element from the vector.
+@return pointer to the "removed" element */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
+ ib_vector_t* vec, /*!< in: vector */
+ const void* elem) /*!< in: value to remove */
+{
+ void* current = NULL;
+ void* next;
+ ulint i;
+ ulint old_used_count = vec->used;
+
+ for (i = 0; i < vec->used; i++) {
+ current = ib_vector_get(vec, i);
+
+ if (*(void**) current == elem) {
+ if (i == vec->used - 1) {
+ return(ib_vector_pop(vec));
+ }
+
+ next = ib_vector_get(vec, i + 1);
+ memmove(current, next, vec->sizeof_value
+ * (vec->used - i - 1));
+ --vec->used;
+ break;
+ }
+ }
+
+ return((old_used_count != vec->used) ? current : NULL);
+}
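Note that the comparison above is *(void**) current == elem, so removal is
only meaningful for vectors whose elements are themselves pointers. A sketch
(my_object is hypothetical):

    void* obj = my_object;
    ib_vector_push(vec, &obj);   /* store the pointer itself */
    ib_vector_remove(vec, obj);  /* remove by pointer equality */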
+
+/********************************************************************
+Sort the vector elements. */
+UNIV_INLINE
+void
+ib_vector_sort(
+/*===========*/
+ /* out: void */
+ ib_vector_t* vec, /* in: vector */
+ ib_compare_t compare)/* in: the comparator to use for sort */
+{
+ qsort(vec->data, vec->used, vec->sizeof_value, compare);
+}
+
+/********************************************************************
+Destroy the vector. Make sure the vector owns the allocator, e.g.,
+the heap in the heap allocator. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+ ib_vector_t* vec) /* in, own: vector */
+{
+ /* Currently we only support one type of allocator - heap,
+ when the heap is freed all the elements are freed too. */
+
+ /* Only the heap allocator uses the arg field. */
+ ut_ad(vec->allocator->arg != NULL);
+
+ mem_heap_free((mem_heap_t*) vec->allocator->arg);
+}
+
+/********************************************************************
+Test whether a vector is empty or not.
+@return TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+ const ib_vector_t* vec) /*!< in: vector */
+{
+ return(ib_vector_size(vec) == 0);
+}
diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h
new file mode 100644
index 00000000..34762298
--- /dev/null
+++ b/storage/innobase/include/ut0wqueue.h
@@ -0,0 +1,94 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0wqueue.h
+A work queue
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A Work queue. Threads can add work items to the queue and other threads can
+wait for work items to be available and take them off the queue for
+processing.
+************************************************************************/
+
+#ifndef IB_WORK_QUEUE_H
+#define IB_WORK_QUEUE_H
+
+#include "ut0list.h"
+#include "mem0mem.h"
+
+// Forward declaration
+struct ib_list_t;
+
+/** Work queue */
+struct ib_wqueue_t
+{
+ /** Mutex protecting everything */
+ ib_mutex_t mutex;
+ /** Work item list */
+ ib_list_t* items;
+};
+
+/****************************************************************//**
+Create a new work queue.
+@return work queue */
+ib_wqueue_t*
+ib_wqueue_create();
+/*===============*/
+
+/****************************************************************//**
+Free a work queue. */
+void
+ib_wqueue_free(
+/*===========*/
+ ib_wqueue_t* wq); /*!< in: work queue */
+
+/** Add a work item to the queue.
+@param[in,out] wq work queue
+@param[in] item work item
+@param[in,out] heap memory heap to use for allocating list node
+@param[in] wq_locked work queue mutex locked */
+void
+ib_wqueue_add(ib_wqueue_t* wq, void* item, mem_heap_t* heap,
+ bool wq_locked = false);
+
+/** Check if queue is empty.
+@param wq work queue
+@return whether the queue is empty */
+bool ib_wqueue_is_empty(ib_wqueue_t* wq);
+
+/********************************************************************
+Return the first item on the work queue, or NULL if the queue is empty.
+@return work item or NULL */
+void*
+ib_wqueue_nowait(
+/*=============*/
+	ib_wqueue_t*	wq);	/*!< in: work queue */
+/********************************************************************
+Get number of items on queue.
+@return number of items on queue */
+ulint
+ib_wqueue_len(
+/*==========*/
+	ib_wqueue_t*	wq);	/*!< in: work queue */
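A minimal producer/consumer sketch, assuming the caller owns the item and
supplies a heap that backs the internal list node:

    ib_wqueue_t* wq   = ib_wqueue_create();
    mem_heap_t*  heap = mem_heap_create(512);

    static int work = 7;
    ib_wqueue_add(wq, &work, heap);        /* producer side */

    if (!ib_wqueue_is_empty(wq)) {
        void* item = ib_wqueue_nowait(wq); /* consumer side, non-blocking */
    }

    ib_wqueue_free(wq);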
+
+#endif /* IB_WORK_QUEUE_H */
diff --git a/storage/innobase/innodb.cmake b/storage/innobase/innodb.cmake
new file mode 100644
index 00000000..05a70aae
--- /dev/null
+++ b/storage/innobase/innodb.cmake
@@ -0,0 +1,191 @@
+# Copyright (c) 2006, 2016, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2017, 2020, MariaDB Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+# This is the CMake configuration file for InnoDB
+
+INCLUDE(CheckFunctionExists)
+INCLUDE(CheckCSourceCompiles)
+INCLUDE(CheckCSourceRuns)
+INCLUDE(lz4.cmake)
+INCLUDE(lzo.cmake)
+INCLUDE(lzma.cmake)
+INCLUDE(bzip2.cmake)
+INCLUDE(snappy.cmake)
+INCLUDE(numa)
+INCLUDE(TestBigEndian)
+
+MYSQL_CHECK_LZ4()
+MYSQL_CHECK_LZO()
+MYSQL_CHECK_LZMA()
+MYSQL_CHECK_BZIP2()
+MYSQL_CHECK_SNAPPY()
+MYSQL_CHECK_NUMA()
+
+INCLUDE(${MYSQL_CMAKE_SCRIPT_DIR}/compile_flags.cmake)
+
+IF(CMAKE_CROSSCOMPILING)
+ # Use CHECK_C_SOURCE_COMPILES instead of CHECK_C_SOURCE_RUNS when
+ # cross-compiling. Not as precise, but usually good enough.
+  # This only makes sense for the atomic tests in this file; the trick
+  # doesn't work in the general case.
+ MACRO(CHECK_C_SOURCE SOURCE VAR)
+ CHECK_C_SOURCE_COMPILES("${SOURCE}" "${VAR}")
+ ENDMACRO()
+ELSE()
+ MACRO(CHECK_C_SOURCE SOURCE VAR)
+ CHECK_C_SOURCE_RUNS("${SOURCE}" "${VAR}")
+ ENDMACRO()
+ENDIF()
+
+# OS tests
+IF(UNIX)
+ IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+
+ ADD_DEFINITIONS("-DUNIV_LINUX -D_GNU_SOURCE=1")
+
+ CHECK_INCLUDE_FILES (libaio.h HAVE_LIBAIO_H)
+ CHECK_LIBRARY_EXISTS(aio io_queue_init "" HAVE_LIBAIO)
+
+ IF(HAVE_LIBAIO_H AND HAVE_LIBAIO)
+ ADD_DEFINITIONS(-DLINUX_NATIVE_AIO=1)
+ LINK_LIBRARIES(aio)
+ ENDIF()
+ IF(HAVE_LIBNUMA)
+ LINK_LIBRARIES(numa)
+ ENDIF()
+ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "HP*")
+ ADD_DEFINITIONS("-DUNIV_HPUX")
+ ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "AIX")
+ ADD_DEFINITIONS("-DUNIV_AIX")
+ ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+ ADD_DEFINITIONS("-DUNIV_SOLARIS")
+ ENDIF()
+ENDIF()
+
+OPTION(INNODB_COMPILER_HINTS "Compile InnoDB with compiler hints" ON)
+MARK_AS_ADVANCED(INNODB_COMPILER_HINTS)
+
+IF(INNODB_COMPILER_HINTS)
+ ADD_DEFINITIONS("-DCOMPILER_HINTS")
+ENDIF()
+ADD_FEATURE_INFO(INNODB_COMPILER_HINTS INNODB_COMPILER_HINTS "InnoDB compiled with compiler hints")
+
+# Enable InnoDB's UNIV_DEBUG in debug builds
+SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DUNIV_DEBUG")
+
+OPTION(WITH_INNODB_AHI "Include innodb_adaptive_hash_index" ON)
+OPTION(WITH_INNODB_ROOT_GUESS "Cache index root block descriptors" ON)
+IF(WITH_INNODB_AHI)
+ ADD_DEFINITIONS(-DBTR_CUR_HASH_ADAPT -DBTR_CUR_ADAPT)
+ IF(NOT WITH_INNODB_ROOT_GUESS)
+ MESSAGE(WARNING "WITH_INNODB_AHI implies WITH_INNODB_ROOT_GUESS")
+ SET(WITH_INNODB_ROOT_GUESS ON)
+ ENDIF()
+ELSEIF(WITH_INNODB_ROOT_GUESS)
+ ADD_DEFINITIONS(-DBTR_CUR_ADAPT)
+ENDIF()
+ADD_FEATURE_INFO(INNODB_AHI WITH_INNODB_AHI "InnoDB Adaptive Hash Index")
+ADD_FEATURE_INFO(INNODB_ROOT_GUESS WITH_INNODB_ROOT_GUESS
+ "Cache index root block descriptors in InnoDB")
+
+OPTION(WITH_INNODB_EXTRA_DEBUG "Enable extra InnoDB debug checks" OFF)
+IF(WITH_INNODB_EXTRA_DEBUG)
+ ADD_DEFINITIONS(-DUNIV_ZIP_DEBUG)
+ENDIF()
+ADD_FEATURE_INFO(INNODB_EXTRA_DEBUG WITH_INNODB_EXTRA_DEBUG "Extra InnoDB debug checks")
+
+
+CHECK_FUNCTION_EXISTS(sched_getcpu HAVE_SCHED_GETCPU)
+IF(HAVE_SCHED_GETCPU)
+ ADD_DEFINITIONS(-DHAVE_SCHED_GETCPU=1)
+ENDIF()
+
+CHECK_FUNCTION_EXISTS(nanosleep HAVE_NANOSLEEP)
+IF(HAVE_NANOSLEEP)
+ ADD_DEFINITIONS(-DHAVE_NANOSLEEP=1)
+ENDIF()
+
+IF(HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE)
+ ADD_DEFINITIONS(-DHAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE=1)
+ENDIF()
+
+IF (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR
+ CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wconversion -Wno-sign-conversion")
+ SET_SOURCE_FILES_PROPERTIES(fts/fts0pars.cc
+ PROPERTIES COMPILE_FLAGS -Wno-conversion)
+ENDIF()
+
+IF(NOT MSVC)
+  # Work around MDEV-18417, MDEV-18656
+ IF(WITH_ASAN AND CMAKE_COMPILER_IS_GNUCC AND
+ CMAKE_C_COMPILER_VERSION VERSION_LESS "6.0.0")
+ SET_SOURCE_FILES_PROPERTIES(trx/trx0rec.cc PROPERTIES COMPILE_FLAGS -O1)
+ ENDIF()
+ENDIF()
+
+CHECK_FUNCTION_EXISTS(vasprintf HAVE_VASPRINTF)
+
+CHECK_CXX_SOURCE_COMPILES("struct t1{ int a; char *b; }; struct t1 c= { .a=1, .b=0 }; int main() { }" HAVE_C99_INITIALIZERS)
+IF(HAVE_C99_INITIALIZERS)
+ ADD_DEFINITIONS(-DHAVE_C99_INITIALIZERS)
+ENDIF()
+
+SET(MUTEXTYPE "event" CACHE STRING "Mutex type: event, sys or futex")
+
+IF(MUTEXTYPE MATCHES "event")
+ ADD_DEFINITIONS(-DMUTEX_EVENT)
+ELSEIF(MUTEXTYPE MATCHES "futex" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ ADD_DEFINITIONS(-DMUTEX_FUTEX)
+ELSE()
+ ADD_DEFINITIONS(-DMUTEX_SYS)
+ENDIF()
+
+OPTION(WITH_INNODB_DISALLOW_WRITES "InnoDB freeze writes patch from Google" ${WITH_WSREP})
+IF (WITH_INNODB_DISALLOW_WRITES)
+ ADD_DEFINITIONS(-DWITH_INNODB_DISALLOW_WRITES)
+ENDIF()
+ADD_FEATURE_INFO(INNODB_DISALLOW_WRITES WITH_INNODB_DISALLOW_WRITES "Expose innodb_disallow_writes switch to stop innodb from writing to disk")
+
+
+
+# Sun Studio bug with -xO2
+IF(CMAKE_CXX_COMPILER_ID MATCHES "SunPro"
+ AND CMAKE_CXX_FLAGS_RELEASE MATCHES "O2"
+ AND NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+ # Sun Studio 12 crashes with -xO2 flag, but not with higher optimization
+ # -xO3
+ SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_SOURCE_DIR}/rem/rem0rec.cc
+ PROPERTIES COMPILE_FLAGS -xO3)
+ENDIF()
+
+
+IF(MSVC)
+ # Avoid "unreferenced label" warning in generated file
+ GET_FILENAME_COMPONENT(_SRC_DIR ${CMAKE_CURRENT_LIST_FILE} PATH)
+ SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/pars0grm.c
+ PROPERTIES COMPILE_FLAGS "/wd4102")
+ SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/lexyy.c
+ PROPERTIES COMPILE_FLAGS "/wd4003")
+ENDIF()
+
+# Include directories under innobase
+INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/innobase/include
+ ${CMAKE_SOURCE_DIR}/storage/innobase/handler
+ ${CMAKE_SOURCE_DIR}/libbinlogevents/include )
diff --git a/storage/innobase/lock/lock0iter.cc b/storage/innobase/lock/lock0iter.cc
new file mode 100644
index 00000000..7a7130ed
--- /dev/null
+++ b/storage/innobase/lock/lock0iter.cc
@@ -0,0 +1,107 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0iter.cc
+Lock queue iterator. Can iterate over table and record
+lock queues.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "dict0mem.h"
+#include "lock0iter.h"
+#include "lock0lock.h"
+#include "lock0priv.h"
+
+/*******************************************************************//**
+Initialize lock queue iterator so that it starts to iterate from
+"lock". bit_no specifies the record number within the heap where the
+record is stored. It can be undefined (ULINT_UNDEFINED) in two cases:
+1. If the lock is a table lock, thus we have a table lock queue;
+2. If the lock is a record lock and it is a wait lock. In this case
+ bit_no is calculated in this function by using
+ lock_rec_find_set_bit(). There is exactly one bit set in the bitmap
+ of a wait lock. */
+void
+lock_queue_iterator_reset(
+/*======================*/
+ lock_queue_iterator_t* iter, /*!< out: iterator */
+ const lock_t* lock, /*!< in: lock to start from */
+ ulint bit_no) /*!< in: record number in the
+ heap */
+{
+ ut_ad(lock_mutex_own());
+
+ iter->current_lock = lock;
+
+ if (bit_no != ULINT_UNDEFINED) {
+
+ iter->bit_no = bit_no;
+ } else {
+
+ switch (lock_get_type_low(lock)) {
+ case LOCK_TABLE:
+ iter->bit_no = ULINT_UNDEFINED;
+ break;
+ case LOCK_REC:
+ iter->bit_no = lock_rec_find_set_bit(lock);
+ ut_a(iter->bit_no != ULINT_UNDEFINED);
+ break;
+ default:
+ ut_error;
+ }
+ }
+}
+
+/*******************************************************************//**
+Gets the previous lock in the lock queue; returns NULL if there are no
+more locks (i.e. the current lock is the first one). The iterator is
+moved one step back (if non-NULL is returned).
+@return previous lock or NULL */
+const lock_t*
+lock_queue_iterator_get_prev(
+/*=========================*/
+ lock_queue_iterator_t* iter) /*!< in/out: iterator */
+{
+ const lock_t* prev_lock;
+
+ ut_ad(lock_mutex_own());
+
+ switch (lock_get_type_low(iter->current_lock)) {
+ case LOCK_REC:
+ prev_lock = lock_rec_get_prev(
+ iter->current_lock, iter->bit_no);
+ break;
+ case LOCK_TABLE:
+ prev_lock = UT_LIST_GET_PREV(
+ un_member.tab_lock.locks, iter->current_lock);
+ break;
+ default:
+ ut_error;
+ }
+
+ if (prev_lock != NULL) {
+
+ iter->current_lock = prev_lock;
+ }
+
+ return(prev_lock);
+}
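A typical traversal sketch; the caller must hold the lock_sys mutex, as the
assertions above require, and bit_no may be ULINT_UNDEFINED in the two cases
documented for lock_queue_iterator_reset():

    lock_queue_iterator_t iter;
    lock_queue_iterator_reset(&iter, lock, ULINT_UNDEFINED);

    while (const lock_t* prev = lock_queue_iterator_get_prev(&iter)) {
        /* examine each lock earlier in the queue than "lock" */
    }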
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
new file mode 100644
index 00000000..8dc2d7c5
--- /dev/null
+++ b/storage/innobase/lock/lock0lock.cc
@@ -0,0 +1,6818 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0lock.cc
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "univ.i"
+
+#include <mysql/service_thd_error_context.h>
+#include <sql_class.h>
+
+#include "lock0lock.h"
+#include "lock0priv.h"
+#include "dict0mem.h"
+#include "trx0purge.h"
+#include "trx0sys.h"
+#include "ut0vec.h"
+#include "btr0cur.h"
+#include "row0sel.h"
+#include "row0mysql.h"
+#include "row0vers.h"
+#include "pars0pars.h"
+
+#include <set>
+
+#ifdef WITH_WSREP
+#include <mysql/service_wsrep.h>
+#endif /* WITH_WSREP */
+
+/** Lock scheduling algorithm */
+ulong innodb_lock_schedule_algorithm;
+
+/** The value of innodb_deadlock_detect */
+my_bool innobase_deadlock_detect;
+
+/*********************************************************************//**
+Checks if a waiting record lock request still has to wait in a queue.
+@return lock that is causing the wait */
+static
+const lock_t*
+lock_rec_has_to_wait_in_queue(
+/*==========================*/
+ const lock_t* wait_lock); /*!< in: waiting record lock */
+
+/** Grant a lock to a waiting lock request and release the waiting transaction
+after lock_reset_lock_and_trx_wait() has been called. */
+static void lock_grant_after_reset(lock_t* lock);
+
+extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd);
+extern "C" int thd_need_wait_reports(const MYSQL_THD thd);
+extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
+
+/** Pretty-print a table lock.
+@param[in,out] file output stream
+@param[in] lock table lock */
+static void lock_table_print(FILE* file, const lock_t* lock);
+
+/** Pretty-print a record lock.
+@param[in,out] file output stream
+@param[in] lock record lock
+@param[in,out] mtr mini-transaction for accessing the record */
+static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr);
+
+/** Deadlock checker. */
+class DeadlockChecker {
+public:
+ /** Check if a joining lock request results in a deadlock.
+ If a deadlock is found, we will resolve the deadlock by
+ choosing a victim transaction and rolling it back.
+ We will attempt to resolve all deadlocks.
+
+ @param[in] lock the lock request
+ @param[in,out] trx transaction requesting the lock
+
+ @return trx if it was chosen as victim
+ @retval NULL if another victim was chosen,
+ or there is no deadlock (any more) */
+ static const trx_t* check_and_resolve(const lock_t* lock, trx_t* trx);
+
+private:
+ /** Do a shallow copy. Default destructor OK.
+ @param trx the start transaction (start node)
+ @param wait_lock lock that a transaction wants
+ @param mark_start visited node counter
+ @param report_waiters whether to call thd_rpl_deadlock_check() */
+ DeadlockChecker(
+ const trx_t* trx,
+ const lock_t* wait_lock,
+ ib_uint64_t mark_start,
+ bool report_waiters)
+ :
+ m_cost(),
+ m_start(trx),
+ m_too_deep(),
+ m_wait_lock(wait_lock),
+ m_mark_start(mark_start),
+ m_n_elems(),
+ m_report_waiters(report_waiters)
+ {
+ }
+
+ /** Check if the search is too deep. */
+ bool is_too_deep() const
+ {
+ return(m_n_elems > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK
+ || m_cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK);
+ }
+
+ /** Save current state.
+ @param lock lock to push on the stack.
+ @param heap_no the heap number to push on the stack.
+ @return false if stack is full. */
+ bool push(const lock_t* lock, ulint heap_no)
+ {
+ ut_ad((lock_get_type_low(lock) & LOCK_REC)
+ || (lock_get_type_low(lock) & LOCK_TABLE));
+
+ ut_ad(((lock_get_type_low(lock) & LOCK_TABLE) != 0)
+ == (heap_no == ULINT_UNDEFINED));
+
+ /* Ensure that the stack is bounded. */
+ if (m_n_elems >= UT_ARR_SIZE(s_states)) {
+ return(false);
+ }
+
+ state_t& state = s_states[m_n_elems++];
+
+ state.m_lock = lock;
+ state.m_wait_lock = m_wait_lock;
+		state.m_heap_no = heap_no;
+
+ return(true);
+ }
+
+ /** Restore state.
+ @param[out] lock current lock
+ @param[out] heap_no current heap_no */
+ void pop(const lock_t*& lock, ulint& heap_no)
+ {
+ ut_a(m_n_elems > 0);
+
+ const state_t& state = s_states[--m_n_elems];
+
+ lock = state.m_lock;
+ heap_no = state.m_heap_no;
+ m_wait_lock = state.m_wait_lock;
+ }
+
+ /** Check whether the node has been visited.
+ @param lock lock to check
+ @return true if the node has been visited */
+ bool is_visited(const lock_t* lock) const
+ {
+ return(lock->trx->lock.deadlock_mark > m_mark_start);
+ }
+
+ /** Get the next lock in the queue that is owned by a transaction
+ whose sub-tree has not already been searched.
+ Note: "next" here means PREV for table locks.
+ @param lock Lock in queue
+ @param heap_no heap_no if lock is a record lock else ULINT_UNDEFINED
+ @return next lock or NULL if at end of queue */
+ const lock_t* get_next_lock(const lock_t* lock, ulint heap_no) const;
+
+ /** Get the first lock to search. The search starts from the current
+ wait_lock. What we are really interested in is an edge from the
+ current wait_lock's owning transaction to another transaction that has
+ a lock ahead in the queue. We skip locks where the owning transaction's
+ sub-tree has already been searched.
+
+ Note: The record locks are traversed from the oldest lock to the
+ latest. For table locks we go from latest to oldest.
+
+	For record locks, we first position the iterator on the first lock on
+	the page and then reposition it on the actual heap_no. This is required
+	due to the way the record lock hash is implemented.
+
+ @param[out] heap_no if rec lock, else ULINT_UNDEFINED.
+
+ @return first lock or NULL */
+ const lock_t* get_first_lock(ulint* heap_no) const;
+
+ /** Notify that a deadlock has been detected and print the conflicting
+ transaction info.
+ @param lock lock causing deadlock */
+ void notify(const lock_t* lock) const;
+
+	/** Select the victim transaction that should be rolled back.
+ @return victim transaction */
+ const trx_t* select_victim() const;
+
+ /** Rollback transaction selected as the victim. */
+ void trx_rollback();
+
+ /** Looks iteratively for a deadlock. Note: the joining transaction
+ may have been granted its lock by the deadlock checks.
+
+	@return NULL if no deadlock, else the victim transaction. */
+ const trx_t* search();
+
+ /** Print transaction data to the deadlock file and possibly to stderr.
+ @param trx transaction
+ @param max_query_len max query length to print */
+ static void print(const trx_t* trx, ulint max_query_len);
+
+ /** rewind(3) the file used for storing the latest detected deadlock
+ and print a heading message to stderr if printing of all deadlocks to
+ stderr is enabled. */
+ static void start_print();
+
+ /** Print lock data to the deadlock file and possibly to stderr.
+ @param lock record or table type lock */
+ static void print(const lock_t* lock);
+
+ /** Print a message to the deadlock file and possibly to stderr.
+ @param msg message to print */
+ static void print(const char* msg);
+
+ /** Print info about transaction that was rolled back.
+ @param trx transaction rolled back
+ @param lock lock trx wants */
+ static void rollback_print(const trx_t* trx, const lock_t* lock);
+
+private:
+ /** DFS state information, used during deadlock checking. */
+ struct state_t {
+ const lock_t* m_lock; /*!< Current lock */
+ const lock_t* m_wait_lock; /*!< Waiting for lock */
+ ulint m_heap_no; /*!< heap number if rec lock */
+ };
+
+ /** Used in deadlock tracking. Protected by lock_sys.mutex. */
+ static ib_uint64_t s_lock_mark_counter;
+
+ /** Calculation steps thus far. It is the count of the nodes visited. */
+ ulint m_cost;
+
+ /** Joining transaction that is requesting a lock in an
+ incompatible mode */
+ const trx_t* m_start;
+
+ /** TRUE if search was too deep and was aborted */
+ bool m_too_deep;
+
+ /** Lock that trx wants */
+ const lock_t* m_wait_lock;
+
+ /** Value of lock_mark_count at the start of the deadlock check. */
+ ib_uint64_t m_mark_start;
+
+ /** Number of states pushed onto the stack */
+ size_t m_n_elems;
+
+ /** This is to avoid malloc/free calls. */
+ static state_t s_states[MAX_STACK_SIZE];
+
+ /** Set if thd_rpl_deadlock_check() should be called for waits. */
+ const bool m_report_waiters;
+};
+
+/** Counter to mark visited nodes during deadlock search. */
+ib_uint64_t DeadlockChecker::s_lock_mark_counter = 0;
+
+/** The stack used for deadlock searches. */
+DeadlockChecker::state_t DeadlockChecker::s_states[MAX_STACK_SIZE];
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Validates the lock system.
+@return TRUE if ok */
+static
+bool
+lock_validate();
+/*============*/
+
+/*********************************************************************//**
+Validates the record lock queues on a page.
+@return TRUE if ok */
+static
+ibool
+lock_rec_validate_page(
+/*===================*/
+ const buf_block_t* block) /*!< in: buffer block */
+ MY_ATTRIBUTE((warn_unused_result));
+#endif /* UNIV_DEBUG */
+
+/* The lock system */
+lock_sys_t lock_sys;
+
+/** We store info on the latest deadlock error to this buffer. InnoDB
+Monitor will then fetch it and print it. */
+static bool lock_deadlock_found = false;
+
+/** Only created if !srv_read_only_mode */
+static FILE* lock_latest_err_file;
+
+/*********************************************************************//**
+Reports that a transaction id is not sensible, i.e., lies in the future. */
+ATTRIBUTE_COLD
+void
+lock_report_trx_id_insanity(
+/*========================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ trx_id_t max_trx_id) /*!< in: trx_sys.get_max_trx_id() */
+{
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ ib::error()
+ << "Transaction id " << ib::hex(trx_id)
+ << " associated with record" << rec_offsets_print(rec, offsets)
+ << " in index " << index->name
+ << " of table " << index->table->name
+ << " is greater than the global counter " << max_trx_id
+ << "! The table is corrupted.";
+}
+
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return true if ok */
+bool
+lock_check_trx_id_sanity(
+/*=====================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets) /*!< in: rec_get_offsets(rec, index) */
+{
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ trx_id_t max_trx_id= trx_sys.get_max_trx_id();
+ ut_ad(max_trx_id || srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN);
+
+ if (UNIV_LIKELY(max_trx_id != 0) && UNIV_UNLIKELY(trx_id >= max_trx_id))
+ {
+ lock_report_trx_id_insanity(trx_id, rec, index, offsets, max_trx_id);
+ return false;
+ }
+ return true;
+}
+
+/*********************************************************************//**
+Checks that a record is seen in a consistent read.
+@return true if sees, or false if an earlier version of the record
+should be retrieved */
+bool
+lock_clust_rec_cons_read_sees(
+/*==========================*/
+ const rec_t* rec, /*!< in: user record which should be read or
+ passed over by a read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ReadView* view) /*!< in: consistent read view */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ /* Temp-tables are not shared across connections and multiple
+ transactions from different connections cannot simultaneously
+ operate on same temp-table and so read of temp-table is
+ always consistent read. */
+ if (index->table->is_temporary()) {
+ return(true);
+ }
+
+ /* NOTE that we call this function while holding the search
+ system latch. */
+
+ trx_id_t trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ return(view->changes_visible(trx_id, index->table->name));
+}
+
+/*********************************************************************//**
+Checks that a non-clustered index record is seen in a consistent read.
+
+NOTE that a non-clustered index page contains so little information on
+its modifications that, even when this function returns false, the present
+version of rec may still be the right one; this must be verified from the
+clustered index record.
+
+@return true if certainly sees, or false if an earlier version of the
+clustered index record might be needed */
+bool
+lock_sec_rec_cons_read_sees(
+/*========================*/
+ const rec_t* rec, /*!< in: user record which
+ should be read or passed over
+ by a read cursor */
+ const dict_index_t* index, /*!< in: index */
+ const ReadView* view) /*!< in: consistent read view */
+{
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(!index->is_primary());
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ /* NOTE that we might call this function while holding the search
+ system latch. */
+
+ if (index->table->is_temporary()) {
+
+ /* Temp-tables are not shared across connections and multiple
+ transactions from different connections cannot simultaneously
+ operate on same temp-table and so read of temp-table is
+ always consistent read. */
+
+ return(true);
+ }
+
+ trx_id_t max_trx_id = page_get_max_trx_id(page_align(rec));
+
+ ut_ad(max_trx_id > 0);
+
+ return(view->sees(max_trx_id));
+}
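+
+/* A minimal sketch (stand-in types, not the real ReadView API; not part
+of the build) of the shortcut used above: if the largest transaction id
+that has modified the page is visible to the read view, then every record
+version on the page is certainly visible, so no clustered-index lookup is
+needed. */
+#if 0
+#include <cassert>
+
+struct toy_view_t {
+  unsigned long long up_limit_id; /* ids below this are visible */
+
+  bool sees(unsigned long long trx_id) const
+  {
+    return trx_id < up_limit_id;
+  }
+};
+
+static void toy_sec_rec_sees_demo()
+{
+  toy_view_t view = {100};
+
+  /* Page-level max trx id below the limit: certainly visible. */
+  assert(view.sees(99));
+  /* Otherwise an older version might be needed (the real code
+  then checks the clustered index record). */
+  assert(!view.sees(100));
+}
+#endif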
+
+
+/**
+ Creates the lock system at database start.
+
+ @param[in] n_cells number of slots in lock hash table
+*/
+void lock_sys_t::create(ulint n_cells)
+{
+ ut_ad(this == &lock_sys);
+
+ m_initialised= true;
+
+ waiting_threads = static_cast<srv_slot_t*>
+ (ut_zalloc_nokey(srv_max_n_threads * sizeof *waiting_threads));
+ last_slot = waiting_threads;
+
+ mutex_create(LATCH_ID_LOCK_SYS, &mutex);
+
+ mutex_create(LATCH_ID_LOCK_SYS_WAIT, &wait_mutex);
+
+
+ rec_hash.create(n_cells);
+ prdt_hash.create(n_cells);
+ prdt_page_hash.create(n_cells);
+
+ if (!srv_read_only_mode) {
+ lock_latest_err_file = os_file_create_tmpfile();
+ ut_a(lock_latest_err_file);
+ }
+ timeout_timer_active = false;
+}
+
+/** Calculates the fold value of a lock: used in migrating the hash table.
+@param[in] lock record lock object
+@return folded value */
+static ulint lock_rec_lock_fold(const lock_t *lock)
+{
+ return lock->un_member.rec_lock.page_id.fold();
+}
+
+
+/**
+ Resize the lock hash table.
+
+ @param[in] n_cells number of slots in lock hash table
+*/
+void lock_sys_t::resize(ulint n_cells)
+{
+ ut_ad(this == &lock_sys);
+
+ mutex_enter(&mutex);
+
+ hash_table_t old_hash(rec_hash);
+ rec_hash.create(n_cells);
+ HASH_MIGRATE(&old_hash, &rec_hash, lock_t, hash,
+ lock_rec_lock_fold);
+ old_hash.free();
+
+ old_hash = prdt_hash;
+ prdt_hash.create(n_cells);
+ HASH_MIGRATE(&old_hash, &prdt_hash, lock_t, hash,
+ lock_rec_lock_fold);
+ old_hash.free();
+
+ old_hash = prdt_page_hash;
+ prdt_page_hash.create(n_cells);
+ HASH_MIGRATE(&old_hash, &prdt_page_hash, lock_t, hash,
+ lock_rec_lock_fold);
+ old_hash.free();
+ mutex_exit(&mutex);
+}
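+
+/* A self-contained sketch (stand-in types, not the real hash_table_t or
+HASH_MIGRATE macro; not compiled into the build) of the resize pattern
+above: build a new bucket array, then walk every chain of the old table
+and re-insert each element by its fold value, since the bucket index
+depends on the table size. */
+#if 0
+#include <vector>
+
+struct toy_node_t { unsigned long fold; toy_node_t* hash; };
+
+static void toy_migrate(std::vector<toy_node_t*>& old_buckets,
+                        std::vector<toy_node_t*>& new_buckets)
+{
+  for (toy_node_t* node : old_buckets) {
+    while (node) {
+      toy_node_t* next = node->hash;
+      /* Re-hash into the resized array. */
+      toy_node_t*& cell = new_buckets[node->fold % new_buckets.size()];
+      node->hash = cell;
+      cell = node;
+      node = next;
+    }
+  }
+  old_buckets.clear();
+}
+#endif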
+
+
+/** Closes the lock system at database shutdown. */
+void lock_sys_t::close()
+{
+ ut_ad(this == &lock_sys);
+
+ if (!m_initialised) return;
+
+ if (lock_latest_err_file != NULL) {
+ my_fclose(lock_latest_err_file, MYF(MY_WME));
+ lock_latest_err_file = NULL;
+ }
+
+ rec_hash.free();
+ prdt_hash.free();
+ prdt_page_hash.free();
+
+ mutex_destroy(&mutex);
+ mutex_destroy(&wait_mutex);
+
+ for (ulint i = srv_max_n_threads; i--; ) {
+ if (os_event_t& event = waiting_threads[i].event) {
+ os_event_destroy(event);
+ }
+ }
+
+ ut_free(waiting_threads);
+ m_initialised= false;
+}
+
+/*********************************************************************//**
+Gets the size of a lock struct.
+@return size in bytes */
+ulint
+lock_get_size(void)
+/*===============*/
+{
+ return((ulint) sizeof(lock_t));
+}
+
+static inline void lock_grant_have_trx_mutex(lock_t* lock)
+{
+ lock_reset_lock_and_trx_wait(lock);
+ lock_grant_after_reset(lock);
+}
+
+/*********************************************************************//**
+Gets the gap flag of a record lock.
+@return LOCK_GAP or 0 */
+UNIV_INLINE
+ulint
+lock_rec_get_gap(
+/*=============*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ return(lock->type_mode & LOCK_GAP);
+}
+
+/*********************************************************************//**
+Gets the LOCK_REC_NOT_GAP flag of a record lock.
+@return LOCK_REC_NOT_GAP or 0 */
+UNIV_INLINE
+ulint
+lock_rec_get_rec_not_gap(
+/*=====================*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ return(lock->type_mode & LOCK_REC_NOT_GAP);
+}
+
+/*********************************************************************//**
+Gets the waiting insert flag of a record lock.
+@return LOCK_INSERT_INTENTION or 0 */
+UNIV_INLINE
+ulint
+lock_rec_get_insert_intention(
+/*==========================*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ return(lock->type_mode & LOCK_INSERT_INTENTION);
+}
+
+#ifdef UNIV_DEBUG
+#ifdef WITH_WSREP
+/** Check if both the transaction holding the conflicting record lock and
+the other transaction requesting the record lock are brute force (BF). If
+they are, check whether this BF-BF wait is legitimate, and if not, report
+the BF wait and assert.
+
+@param[in] lock other waiting record lock
+@param[in] trx trx requesting conflicting record lock
+*/
+static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx)
+{
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(lock_mutex_own());
+ trx_t* lock_trx= lock->trx;
+
+ /* Note that we are holding lock_sys->mutex, thus we should
+ not acquire THD::LOCK_thd_data mutex below to avoid mutexing
+ order violation. */
+
+ if (!trx->is_wsrep() || !lock_trx->is_wsrep())
+ return;
+ if (UNIV_LIKELY(!wsrep_thd_is_BF(trx->mysql_thd, FALSE))
+ || UNIV_LIKELY(!wsrep_thd_is_BF(lock_trx->mysql_thd, FALSE)))
+ return;
+
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+
+ trx_mutex_enter(lock_trx);
+ const trx_state_t trx2_state= lock_trx->state;
+ trx_mutex_exit(lock_trx);
+
+ /* If transaction is already committed in memory or
+ prepared we should wait. When transaction is committed in
+ memory we held trx mutex, but not lock_sys->mutex. Therefore,
+ we could end here before transaction has time to do
+ lock_release() that is protected with lock_sys->mutex. */
+ switch (trx2_state) {
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ case TRX_STATE_PREPARED:
+ return;
+ case TRX_STATE_ACTIVE:
+ break;
+ default:
+ ut_ad("invalid state" == 0);
+ }
+
+ /* If BF - BF order is honored, i.e. trx already holding
+ record lock should be ordered before this new lock request
+ we can keep trx waiting for the lock. If conflicting
+ transaction is already aborting or rolling back for replaying
+ we can also let new transaction waiting. */
+ if (wsrep_thd_order_before(lock_trx->mysql_thd, trx->mysql_thd)
+ || wsrep_thd_is_aborting(lock_trx->mysql_thd)) {
+ return;
+ }
+
+ mtr_t mtr;
+
+ ib::error() << "Conflicting lock on table: "
+ << lock->index->table->name
+ << " index: "
+ << lock->index->name()
+ << " that has lock ";
+ lock_rec_print(stderr, lock, mtr);
+
+ ib::error() << "WSREP state: ";
+
+ wsrep_report_bf_lock_wait(trx->mysql_thd,
+ trx->id);
+ wsrep_report_bf_lock_wait(lock_trx->mysql_thd,
+ lock_trx->id);
+ /* BF-BF wait is a bug */
+ ut_error;
+}
+#endif /* WITH_WSREP */
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Checks if a lock request for a new lock has to wait for request lock2.
+@return TRUE if new lock has to wait for lock2 to be removed */
+UNIV_INLINE
+bool
+lock_rec_has_to_wait(
+/*=================*/
+ bool for_locking,
+ /*!< in: true if called while acquiring a lock, false while releasing */
+ const trx_t* trx, /*!< in: trx of new lock */
+ unsigned type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_GAP or LOCK_REC_NOT_GAP,
+ LOCK_INSERT_INTENTION */
+ const lock_t* lock2, /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+ bool lock_is_on_supremum)
+ /*!< in: TRUE if we are setting the
+ lock on the 'supremum' record of an
+ index page: we know then that the lock
+ request is really for a 'gap' type lock */
+{
+ ut_ad(trx && lock2);
+ ut_ad(lock_get_type_low(lock2) == LOCK_REC);
+ ut_ad(lock_mutex_own());
+
+ if (trx == lock2->trx
+ || lock_mode_compatible(
+ static_cast<lock_mode>(LOCK_MODE_MASK & type_mode),
+ lock_get_mode(lock2))) {
+ return false;
+ }
+
+ /* We have somewhat complex rules when gap type record locks
+ cause waits */
+
+ if ((lock_is_on_supremum || (type_mode & LOCK_GAP))
+ && !(type_mode & LOCK_INSERT_INTENTION)) {
+
+ /* Gap type locks without LOCK_INSERT_INTENTION flag
+ do not need to wait for anything. This is because
+ different users can have conflicting lock types
+ on gaps. */
+
+ return false;
+ }
+
+ if (!(type_mode & LOCK_INSERT_INTENTION) && lock_rec_get_gap(lock2)) {
+
+ /* A record lock (LOCK_ORDINARY or LOCK_REC_NOT_GAP)
+ does not need to wait for a gap type lock */
+
+ return false;
+ }
+
+ if ((type_mode & LOCK_GAP) && lock_rec_get_rec_not_gap(lock2)) {
+
+ /* Lock on gap does not need to wait for
+ a LOCK_REC_NOT_GAP type lock */
+
+ return false;
+ }
+
+ if (lock_rec_get_insert_intention(lock2)) {
+
+ /* No lock request needs to wait for an insert
+ intention lock to be removed. This is ok since our
+ rules allow conflicting locks on gaps. This eliminates
+ a spurious deadlock caused by a next-key lock waiting
+ for an insert intention lock; when the insert
+ intention lock was granted, the insert deadlocked on
+ the waiting next-key lock.
+
+ Also, insert intention locks do not disturb each
+ other. */
+
+ return false;
+ }
+
+ if ((type_mode & LOCK_GAP || lock_rec_get_gap(lock2))
+ && !thd_need_ordering_with(trx->mysql_thd, lock2->trx->mysql_thd)) {
+ /* If the upper server layer has already decided on the
+ commit order between the transaction requesting the
+ lock and the transaction owning the lock, we do not
+ need to wait for gap locks. Such ordering by the upper
+ server layer happens in parallel replication, where the
+ commit order is fixed to match the original order on the
+ master.
+
+ Such gap locks are mainly needed to get serialisability
+ between transactions so that they will be binlogged in
+ the correct order so that statement-based replication
+ will give the correct results. Since the right order
+ was already determined on the master, we do not need
+ to enforce it again here.
+
+ Skipping the locks is not essential for correctness,
+ since in case of deadlock we will just kill the later
+ transaction and retry it. But it can save some
+ unnecessary rollbacks and retries. */
+
+ return false;
+ }
+
+#ifdef WITH_WSREP
+ /* A new lock request from a transaction using a unique key
+ scan, where the transaction is a wsrep high priority (brute
+ force) transaction. If the conflicting transaction is also a
+ wsrep high priority transaction, we should avoid the lock
+ conflict, because the ordering of these transactions is
+ already decided and the conflicting transaction will be
+ replayed later. Note that the thread holding the conflicting
+ lock cannot be committed or rolled back while we hold
+ lock_sys->mutex. */
+ if (trx->is_wsrep_UK_scan()
+ && wsrep_thd_is_BF(lock2->trx->mysql_thd, false)) {
+ return false;
+ }
+
+ /* We may well let a BF transaction wait normally here, as the
+ other BF will be replayed in case of conflict. For debug
+ builds we do additional sanity checks to catch any
+ unsupported BF wait. */
+ ut_d(wsrep_assert_no_bf_bf_wait(lock2, trx));
+#endif /* WITH_WSREP */
+
+ return true;
+}
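+
+/* The gap-lock early-outs above, condensed into a self-contained toy (all
+names are stand-ins and the block is not compiled; the real decision also
+involves lock modes and the supremum special case). Two gap locks never
+conflict unless the requester carries insert intention, and nothing ever
+waits for an insert intention lock. */
+#if 0
+#include <cassert>
+
+static const unsigned TOY_GAP = 1U << 0;
+static const unsigned TOY_INSERT_INTENTION = 1U << 1;
+
+/* Assumes the two lock modes were already found to conflict. */
+static bool toy_gap_rules_say_wait(unsigned req_mode, unsigned held_mode)
+{
+  if ((req_mode & TOY_GAP) && !(req_mode & TOY_INSERT_INTENTION)) {
+    return false; /* plain gap lock requests never wait */
+  }
+  if (!(req_mode & TOY_INSERT_INTENTION) && (held_mode & TOY_GAP)) {
+    return false; /* record lock requests ignore gap-only locks */
+  }
+  if (held_mode & TOY_INSERT_INTENTION) {
+    return false; /* nobody waits for an insert intention lock */
+  }
+  return true;
+}
+
+static void toy_gap_rules_demo()
+{
+  /* Insert intention must wait for a held gap lock... */
+  assert(toy_gap_rules_say_wait(TOY_GAP | TOY_INSERT_INTENTION, TOY_GAP));
+  /* ...but a plain gap lock request never waits. */
+  assert(!toy_gap_rules_say_wait(TOY_GAP, TOY_GAP));
+}
+#endif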
+
+/*********************************************************************//**
+Checks if a lock request lock1 has to wait for request lock2.
+@return TRUE if lock1 has to wait for lock2 to be removed */
+bool
+lock_has_to_wait(
+/*=============*/
+ const lock_t* lock1, /*!< in: waiting lock */
+ const lock_t* lock2) /*!< in: another lock; NOTE that it is
+ assumed that this has a lock bit set
+ on the same record as in lock1 if the
+ locks are record locks */
+{
+ ut_ad(lock1 && lock2);
+
+ if (lock1->trx == lock2->trx
+ || lock_mode_compatible(lock_get_mode(lock1),
+ lock_get_mode(lock2))) {
+ return false;
+ }
+
+ if (lock_get_type_low(lock1) != LOCK_REC) {
+ return true;
+ }
+
+ ut_ad(lock_get_type_low(lock2) == LOCK_REC);
+
+ if (lock1->type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)) {
+ return lock_prdt_has_to_wait(lock1->trx, lock1->type_mode,
+ lock_get_prdt_from_lock(lock1),
+ lock2);
+ }
+
+ return lock_rec_has_to_wait(
+ false, lock1->trx, lock1->type_mode, lock2,
+ lock_rec_get_nth_bit(lock1, PAGE_HEAP_NO_SUPREMUM));
+}
+
+/*============== RECORD LOCK BASIC FUNCTIONS ============================*/
+
+/**********************************************************************//**
+Looks for a set bit in a record lock bitmap.
+@return bit index == heap number of the record, or ULINT_UNDEFINED if
+none found */
+ulint
+lock_rec_find_set_bit(
+/*==================*/
+ const lock_t* lock) /*!< in: record lock with at least one bit set */
+{
+ for (ulint i = 0; i < lock_rec_get_n_bits(lock); ++i) {
+
+ if (lock_rec_get_nth_bit(lock, i)) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/*********************************************************************//**
+Resets the record lock bitmap to zero. NOTE: does not touch the wait_lock
+pointer in the transaction! This function is used in lock object creation
+and resetting. */
+static
+void
+lock_rec_bitmap_reset(
+/*==================*/
+ lock_t* lock) /*!< in: record lock */
+{
+ ulint n_bytes;
+
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ /* Reset to zero the bitmap which resides immediately after the lock
+ struct */
+
+ n_bytes = lock_rec_get_n_bits(lock) / 8;
+
+ ut_ad((lock_rec_get_n_bits(lock) % 8) == 0);
+
+ memset(reinterpret_cast<void*>(&lock[1]), 0, n_bytes);
+}
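+
+/* A standalone sketch (toy struct, not the real lock_t; not part of the
+build) of the layout the reset above relies on: the bitmap occupies the
+bytes immediately after the lock struct in the same allocation, and n_bits
+is kept a multiple of 8 so that n_bits / 8 covers the whole bitmap. */
+#if 0
+#include <cstdlib>
+#include <cstring>
+
+struct toy_lock_t {
+  unsigned n_bits; /* number of bits in the trailing bitmap */
+};
+
+static toy_lock_t* toy_lock_create(unsigned n_bits)
+{
+  /* One allocation: the struct followed by the bitmap bytes. */
+  toy_lock_t* lock = static_cast<toy_lock_t*>(
+    malloc(sizeof(toy_lock_t) + n_bits / 8));
+  lock->n_bits = n_bits;
+  /* The equivalent of lock_rec_bitmap_reset(): &lock[1] is the
+  first byte after the struct. */
+  memset(reinterpret_cast<void*>(lock + 1), 0, n_bits / 8);
+  return lock;
+}
+#endif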
+
+/*********************************************************************//**
+Copies a record lock to heap.
+@return copy of lock */
+static
+lock_t*
+lock_rec_copy(
+/*==========*/
+ const lock_t* lock, /*!< in: record lock */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint size;
+
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ size = sizeof(lock_t) + lock_rec_get_n_bits(lock) / 8;
+
+ return(static_cast<lock_t*>(mem_heap_dup(heap, lock, size)));
+}
+
+/*********************************************************************//**
+Gets the previous record lock set on a record.
+@return previous lock on the same record, NULL if none exists */
+const lock_t*
+lock_rec_get_prev(
+/*==============*/
+ const lock_t* in_lock,/*!< in: record lock */
+ ulint heap_no)/*!< in: heap number of the record */
+{
+ lock_t* lock;
+ lock_t* found_lock = NULL;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+
+ for (lock = lock_sys.get_first(*lock_hash_get(in_lock->type_mode),
+ in_lock->un_member.rec_lock.page_id);
+ lock != in_lock;
+ lock = lock_rec_get_next_on_page(lock)) {
+ if (lock_rec_get_nth_bit(lock, heap_no)) {
+ found_lock = lock;
+ }
+ }
+
+ return found_lock;
+}
+
+/*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/
+
+/*********************************************************************//**
+Checks if a transaction has a GRANTED explicit lock on rec stronger or equal
+to precise_mode.
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_rec_has_expl(
+/*==============*/
+ ulint precise_mode,/*!< in: LOCK_S or LOCK_X
+ possibly ORed to LOCK_GAP or
+ LOCK_REC_NOT_GAP, for a
+ supremum record we regard this
+ always a gap type request */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ const trx_t* trx) /*!< in: transaction */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+ ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
+ || (precise_mode & LOCK_MODE_MASK) == LOCK_X);
+ ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
+
+ for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+
+ if (lock->trx == trx
+ && !lock_rec_get_insert_intention(lock)
+ && lock_mode_stronger_or_eq(
+ lock_get_mode(lock),
+ static_cast<lock_mode>(
+ precise_mode & LOCK_MODE_MASK))
+ && !lock_get_wait(lock)
+ && (!lock_rec_get_rec_not_gap(lock)
+ || (precise_mode & LOCK_REC_NOT_GAP)
+ || heap_no == PAGE_HEAP_NO_SUPREMUM)
+ && (!lock_rec_get_gap(lock)
+ || (precise_mode & LOCK_GAP)
+ || heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
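+
+/* A toy version of the strength test used above (stand-in enum; the real
+lock_mode_stronger_or_eq covers more modes and is table-driven; the block
+is not compiled): X satisfies a request for S or X, while S satisfies only
+S. */
+#if 0
+#include <cassert>
+
+enum toy_mode { TOY_S, TOY_X };
+
+static bool toy_stronger_or_eq(toy_mode held, toy_mode requested)
+{
+  return held == TOY_X || requested == TOY_S;
+}
+
+static void toy_strength_demo()
+{
+  assert(toy_stronger_or_eq(TOY_X, TOY_S));  /* X covers S */
+  assert(!toy_stronger_or_eq(TOY_S, TOY_X)); /* S does not cover X */
+}
+#endif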
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Checks if some other transaction has a lock request in the queue.
+@return lock or NULL */
+static
+lock_t*
+lock_rec_other_has_expl_req(
+/*========================*/
+ lock_mode mode, /*!< in: LOCK_S or LOCK_X */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ bool wait, /*!< in: whether also waiting locks
+ are taken into account */
+ ulint heap_no,/*!< in: heap number of the record */
+ const trx_t* trx) /*!< in: transaction, or NULL if
+ requests by all transactions
+ are taken into account */
+{
+
+ ut_ad(lock_mutex_own());
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+ /* Only a GAP lock can be set on the SUPREMUM, and we are not
+ looking for GAP locks */
+ if (heap_no == PAGE_HEAP_NO_SUPREMUM) {
+ return(NULL);
+ }
+
+ for (lock_t* lock = lock_rec_get_first(&lock_sys.rec_hash,
+ block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+
+ if (lock->trx != trx
+ && !lock_rec_get_gap(lock)
+ && (wait || !lock_get_wait(lock))
+ && lock_mode_stronger_or_eq(lock_get_mode(lock), mode)) {
+
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
+#endif /* UNIV_DEBUG */
+
+#ifdef WITH_WSREP
+static void wsrep_kill_victim(const trx_t * const trx, const lock_t *lock)
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(trx->is_wsrep());
+ trx_t* lock_trx = lock->trx;
+ ut_ad(trx_mutex_own(lock_trx));
+ ut_ad(lock_trx != trx);
+
+ if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE))
+ return;
+
+ if (lock_trx->state == TRX_STATE_COMMITTED_IN_MEMORY
+ || lock_trx->lock.was_chosen_as_deadlock_victim)
+ return;
+
+ if (!wsrep_thd_is_BF(lock_trx->mysql_thd, FALSE)
+ || wsrep_thd_order_before(trx->mysql_thd, lock_trx->mysql_thd)) {
+ if (lock_trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+ if (UNIV_UNLIKELY(wsrep_debug))
+ WSREP_INFO("BF victim waiting");
+ /* cannot release lock, until our lock
+ is in the queue*/
+ } else {
+ wsrep_innobase_kill_one_trx(trx->mysql_thd,
+ lock_trx, true);
+ }
+ }
+}
+#endif /* WITH_WSREP */
+
+/*********************************************************************//**
+Checks if some other transaction has a conflicting explicit lock request
+in the queue, so that we have to wait.
+@return lock or NULL */
+static
+lock_t*
+lock_rec_other_has_conflicting(
+/*===========================*/
+ unsigned mode, /*!< in: LOCK_S or LOCK_X,
+ possibly ORed to LOCK_GAP or
+ LOCK_REC_NOT_GAP,
+ LOCK_INSERT_INTENTION */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ const trx_t* trx) /*!< in: our transaction */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ bool is_supremum = (heap_no == PAGE_HEAP_NO_SUPREMUM);
+
+ for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+
+ if (lock_rec_has_to_wait(true, trx, mode, lock, is_supremum)) {
+#ifdef WITH_WSREP
+ if (trx->is_wsrep()) {
+ trx_mutex_enter(lock->trx);
+ /* The function below will roll back either trx
+ or lock->trx, depending on the priority of the
+ transaction. */
+ wsrep_kill_victim(const_cast<trx_t*>(trx), lock);
+ trx_mutex_exit(lock->trx);
+ }
+#endif /* WITH_WSREP */
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a secondary
+index.
+@return the transaction which has the x-lock, or NULL;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active(). */
+static
+trx_t*
+lock_sec_rec_some_has_impl(
+/*=======================*/
+ trx_t* caller_trx,/*!<in/out: trx of current thread */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ trx_t* trx;
+ trx_id_t max_trx_id;
+ const page_t* page = page_align(rec);
+
+ ut_ad(!lock_mutex_own());
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ max_trx_id = page_get_max_trx_id(page);
+
+ /* Some transaction may have an implicit x-lock on the record only
+ if the max trx id for the page >= min trx id for the trx list, or
+ database recovery is running. */
+
+ if (max_trx_id < trx_sys.get_min_trx_id()) {
+
+ trx = 0;
+
+ } else if (!lock_check_trx_id_sanity(max_trx_id, rec, index, offsets)) {
+
+ /* The page is corrupt: try to avoid a crash by returning 0 */
+ trx = 0;
+
+ } else {
+ /* In this case it is possible that some transaction has an
+ implicit x-lock. We have to look in the clustered index. */
+ trx = row_vers_impl_x_locked(caller_trx, rec, index, offsets);
+ }
+
+ return(trx);
+}
+
+/*********************************************************************//**
+Return the approximate number of record locks (bits set in the bitmap) for
+this transaction. Since delete-marked records may be removed, the
+record count is not precise.
+The caller must be holding lock_sys.mutex. */
+ulint
+lock_number_of_rows_locked(
+/*=======================*/
+ const trx_lock_t* trx_lock) /*!< in: transaction locks */
+{
+ ut_ad(lock_mutex_own());
+
+ return(trx_lock->n_rec_locks);
+}
+
+/*********************************************************************//**
+Return the number of table locks for a transaction.
+The caller must be holding lock_sys.mutex. */
+ulint
+lock_number_of_tables_locked(
+/*=========================*/
+ const trx_lock_t* trx_lock) /*!< in: transaction locks */
+{
+ const lock_t* lock;
+ ulint n_tables = 0;
+
+ ut_ad(lock_mutex_own());
+
+ for (lock = UT_LIST_GET_FIRST(trx_lock->trx_locks);
+ lock != NULL;
+ lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+
+ if (lock_get_type_low(lock) == LOCK_TABLE) {
+ n_tables++;
+ }
+ }
+
+ return(n_tables);
+}
+
+/*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/
+
+#ifdef WITH_WSREP
+ATTRIBUTE_COLD
+static
+void
+wsrep_print_wait_locks(
+/*===================*/
+ lock_t* c_lock) /* conflicting lock to print */
+{
+ if (c_lock->trx->lock.wait_lock != c_lock) {
+ mtr_t mtr;
+ ib::info() << "WSREP: c_lock != wait lock";
+ ib::info() << " SQL: "
+ << wsrep_thd_query(c_lock->trx->mysql_thd);
+
+ if (lock_get_type_low(c_lock) & LOCK_TABLE) {
+ lock_table_print(stderr, c_lock);
+ } else {
+ lock_rec_print(stderr, c_lock, mtr);
+ }
+
+ if (lock_get_type_low(c_lock->trx->lock.wait_lock) & LOCK_TABLE) {
+ lock_table_print(stderr, c_lock->trx->lock.wait_lock);
+ } else {
+ lock_rec_print(stderr, c_lock->trx->lock.wait_lock,
+ mtr);
+ }
+ }
+}
+#endif /* WITH_WSREP */
+
+#ifdef UNIV_DEBUG
+/** Check transaction state */
+static void check_trx_state(const trx_t *trx)
+{
+ ut_ad(!trx->auto_commit || trx->will_lock);
+ const auto state= trx->state;
+ ut_ad(state == TRX_STATE_ACTIVE ||
+ state == TRX_STATE_PREPARED_RECOVERED ||
+ state == TRX_STATE_PREPARED ||
+ state == TRX_STATE_COMMITTED_IN_MEMORY);
+}
+#endif
+
+/** Creates a new record lock and inserts it into the lock queue,
+without checking for deadlocks or conflicts.
+@param[in] type_mode lock mode and wait flag; type will be replaced
+ with LOCK_REC
+@param[in] page_id index page number
+@param[in] page R-tree index page, or NULL
+@param[in] heap_no record heap number in the index page
+@param[in] index the index tree
+@param[in,out] trx transaction
+@param[in] holds_trx_mutex whether the caller holds trx->mutex
+@return created lock */
+lock_t*
+lock_rec_create_low(
+#ifdef WITH_WSREP
+ lock_t* c_lock, /*!< conflicting lock */
+ que_thr_t* thr, /*!< thread owning trx */
+#endif
+ unsigned type_mode,
+ const page_id_t page_id,
+ const page_t* page,
+ ulint heap_no,
+ dict_index_t* index,
+ trx_t* trx,
+ bool holds_trx_mutex)
+{
+ lock_t* lock;
+ ulint n_bits;
+ ulint n_bytes;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(holds_trx_mutex == trx_mutex_own(trx));
+ ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
+
+#ifdef UNIV_DEBUG
+ /* Non-locking autocommit read-only transactions should not set
+ any locks. See comment in trx_set_rw_mode explaining why this
+ conditional check is required in debug code. */
+ if (holds_trx_mutex) {
+ check_trx_state(trx);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* If rec is the supremum record, then we reset the gap and
+ LOCK_REC_NOT_GAP bits, as all locks on the supremum are
+ automatically of the gap type */
+
+ if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+ ut_ad(!(type_mode & LOCK_REC_NOT_GAP));
+ type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP);
+ }
+
+ if (UNIV_LIKELY(!(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)))) {
+ /* Make lock bitmap bigger by a safety margin */
+ n_bits = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN;
+ n_bytes = 1 + n_bits / 8;
+ } else {
+ ut_ad(heap_no == PRDT_HEAPNO);
+
+ /* The lock is always on PAGE_HEAP_NO_INFIMUM (0), so
+ we only need 1 bit (which rounds up to 1 byte) for
+ the lock bit setting */
+ n_bytes = 1;
+
+ if (type_mode & LOCK_PREDICATE) {
+ ulint tmp = UNIV_WORD_SIZE - 1;
+
+ /* We will attach the predicate structure after the
+ lock. Make sure the memory is aligned on 8 bytes;
+ mem_heap_alloc will align it with
+ MEM_SPACE_NEEDED anyway. */
+ n_bytes = (n_bytes + sizeof(lock_prdt_t) + tmp) & ~tmp;
+ ut_ad(n_bytes == sizeof(lock_prdt_t) + UNIV_WORD_SIZE);
+ }
+ }
+
+ if (trx->lock.rec_cached >= UT_ARR_SIZE(trx->lock.rec_pool)
+ || sizeof *lock + n_bytes > sizeof *trx->lock.rec_pool) {
+ lock = static_cast<lock_t*>(
+ mem_heap_alloc(trx->lock.lock_heap,
+ sizeof *lock + n_bytes));
+ } else {
+ lock = &trx->lock.rec_pool[trx->lock.rec_cached++].lock;
+ }
+
+ lock->trx = trx;
+ lock->type_mode = (type_mode & unsigned(~LOCK_TYPE_MASK)) | LOCK_REC;
+ lock->index = index;
+ lock->un_member.rec_lock.page_id = page_id;
+
+ if (UNIV_LIKELY(!(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)))) {
+ lock->un_member.rec_lock.n_bits = uint32_t(n_bytes * 8);
+ } else {
+ /* Predicate lock always on INFIMUM (0) */
+ lock->un_member.rec_lock.n_bits = 8;
+ }
+ lock_rec_bitmap_reset(lock);
+ lock_rec_set_nth_bit(lock, heap_no);
+ index->table->n_rec_locks++;
+ ut_ad(index->table->get_ref_count() > 0 || !index->table->can_be_evicted);
+
+#ifdef WITH_WSREP
+ if (c_lock && trx->is_wsrep()
+ && wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
+ lock_t *hash = (lock_t *)c_lock->hash;
+ lock_t *prev = NULL;
+
+ while (hash && wsrep_thd_is_BF(hash->trx->mysql_thd, FALSE)
+ && wsrep_thd_order_before(hash->trx->mysql_thd,
+ trx->mysql_thd)) {
+ prev = hash;
+ hash = (lock_t *)hash->hash;
+ }
+ lock->hash = hash;
+ if (prev) {
+ prev->hash = lock;
+ } else {
+ c_lock->hash = lock;
+ }
+ /*
+ * Delayed conflict resolution: '...kill_one_trx' was not called
+ * if the victim was waiting for some other lock.
+ */
+ trx_mutex_enter(c_lock->trx);
+ if (c_lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ c_lock->trx->lock.was_chosen_as_deadlock_victim = TRUE;
+
+ if (UNIV_UNLIKELY(wsrep_debug)) {
+ wsrep_print_wait_locks(c_lock);
+ }
+
+ trx->lock.que_state = TRX_QUE_LOCK_WAIT;
+ lock_set_lock_and_trx_wait(lock, trx);
+ UT_LIST_ADD_LAST(trx->lock.trx_locks, lock);
+
+ trx->lock.wait_thr = thr;
+ thr->state = QUE_THR_LOCK_WAIT;
+
+ /* We have to release the trx mutex for the duration of
+ the victim lock release. This will eventually call
+ lock_grant, which wants to acquire the trx mutex again.
+ */
+ if (holds_trx_mutex) {
+ trx_mutex_exit(trx);
+ }
+ lock_cancel_waiting_and_release(
+ c_lock->trx->lock.wait_lock);
+
+ if (holds_trx_mutex) {
+ trx_mutex_enter(trx);
+ }
+
+ trx_mutex_exit(c_lock->trx);
+
+ /* have to bail out here to avoid lock_set_lock... */
+ return(lock);
+ }
+ trx_mutex_exit(c_lock->trx);
+ } else
+#endif /* WITH_WSREP */
+ if (!(type_mode & (LOCK_WAIT | LOCK_PREDICATE | LOCK_PRDT_PAGE))
+ && innodb_lock_schedule_algorithm
+ == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
+ && !thd_is_replication_slave_thread(trx->mysql_thd)) {
+ HASH_PREPEND(lock_t, hash, &lock_sys.rec_hash,
+ page_id.fold(), lock);
+ } else {
+ HASH_INSERT(lock_t, hash, lock_hash_get(type_mode),
+ page_id.fold(), lock);
+ }
+
+ if (!holds_trx_mutex) {
+ trx_mutex_enter(trx);
+ }
+ ut_ad(trx_mutex_own(trx));
+ if (type_mode & LOCK_WAIT) {
+ lock_set_lock_and_trx_wait(lock, trx);
+ }
+ UT_LIST_ADD_LAST(trx->lock.trx_locks, lock);
+ if (!holds_trx_mutex) {
+ trx_mutex_exit(trx);
+ }
+ MONITOR_INC(MONITOR_RECLOCK_CREATED);
+ MONITOR_INC(MONITOR_NUM_RECLOCK);
+
+ return lock;
+}
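+
+/* The alignment rounding used above for the attached predicate structure,
+shown in isolation (not compiled into the build): with tmp = align - 1 and
+align a power of two, (n + tmp) & ~tmp rounds n up to the next multiple of
+align. */
+#if 0
+#include <cassert>
+
+static unsigned long toy_round_up(unsigned long n, unsigned long align)
+{
+  const unsigned long tmp = align - 1; /* align: a power of two */
+  return (n + tmp) & ~tmp;
+}
+
+static void toy_round_up_demo()
+{
+  assert(toy_round_up(13, 8) == 16); /* rounds up */
+  assert(toy_round_up(16, 8) == 16); /* already aligned */
+}
+#endif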
+
+/*********************************************************************//**
+Check if lock1 has higher priority than lock2.
+NULL has the lowest priority.
+If neither of them is a wait lock, the first one has higher priority.
+If only one of them is a wait lock, it has lower priority.
+Otherwise, the one associated with the older transaction has higher priority.
+@returns true if lock1 has higher priority, false otherwise. */
+static bool has_higher_priority(lock_t *lock1, lock_t *lock2)
+{
+ if (lock1 == NULL) {
+ return false;
+ } else if (lock2 == NULL) {
+ return true;
+ }
+ // Granted locks have higher priority.
+ if (!lock_get_wait(lock1)) {
+ return true;
+ } else if (!lock_get_wait(lock2)) {
+ return false;
+ }
+ return lock1->trx->start_time_micro <= lock2->trx->start_time_micro;
+}
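+
+/* A self-contained restatement (toy types, not the real lock_t/trx_t; not
+part of the build) of the ordering above: granted beats waiting, and among
+two waiting locks the one whose transaction started earlier wins, which is
+what makes the VATS scheduling age-based. */
+#if 0
+#include <cassert>
+
+struct toy_lock_t {
+  bool wait;                  /* waiting, not granted */
+  long long start_time_micro; /* transaction start time */
+};
+
+static bool toy_has_higher_priority(const toy_lock_t* l1,
+                                    const toy_lock_t* l2)
+{
+  if (!l1) return false;
+  if (!l2) return true;
+  if (!l1->wait) return true; /* granted locks go first */
+  if (!l2->wait) return false;
+  return l1->start_time_micro <= l2->start_time_micro;
+}
+
+static void toy_priority_demo()
+{
+  toy_lock_t granted = {false, 300};
+  toy_lock_t old_wait = {true, 100};
+  toy_lock_t new_wait = {true, 200};
+
+  assert(toy_has_higher_priority(&granted, &old_wait));
+  assert(toy_has_higher_priority(&old_wait, &new_wait));
+}
+#endif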
+
+/*********************************************************************//**
+Insert a lock into the hash list according to the mode (whether it is a wait
+lock) and the age of the transaction it is associated with.
+If the lock is not a wait lock, insert it at the head of the hash list.
+Otherwise, insert it into the middle of the wait locks according to the age
+of the transaction. */
+static
+dberr_t
+lock_rec_insert_by_trx_age(
+ lock_t *in_lock) /*!< in: lock to be inserted */
+{
+ lock_t* node;
+ lock_t* next;
+ hash_table_t* hash;
+ hash_cell_t* cell;
+
+ ut_ad(!in_lock->trx->is_wsrep());
+ const page_id_t page_id(in_lock->un_member.rec_lock.page_id);
+ hash = lock_hash_get(in_lock->type_mode);
+ cell = &hash->array[hash->calc_hash(page_id.fold())];
+
+ node = (lock_t *) cell->node;
+ // If in_lock is not a wait lock, we insert it to the head of the list.
+ if (node == NULL || !lock_get_wait(in_lock) || has_higher_priority(in_lock, node)) {
+ cell->node = in_lock;
+ in_lock->hash = node;
+ if (lock_get_wait(in_lock)) {
+ lock_grant_have_trx_mutex(in_lock);
+ return DB_SUCCESS_LOCKED_REC;
+ }
+ return DB_SUCCESS;
+ }
+ while (node != NULL && has_higher_priority((lock_t *) node->hash,
+ in_lock)) {
+ node = (lock_t *) node->hash;
+ }
+ next = (lock_t *) node->hash;
+ node->hash = in_lock;
+ in_lock->hash = next;
+
+ if (lock_get_wait(in_lock) && !lock_rec_has_to_wait_in_queue(in_lock)) {
+ lock_grant_have_trx_mutex(in_lock);
+ if (cell->node != in_lock) {
+ // Move it to the front of the queue
+ node->hash = in_lock->hash;
+ next = (lock_t *) cell->node;
+ cell->node = in_lock;
+ in_lock->hash = next;
+ }
+ return DB_SUCCESS_LOCKED_REC;
+ }
+
+ return DB_SUCCESS;
+}
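+
+/* The insertion policy above on a plain singly linked list (stand-in
+types; not compiled into the build): skip past every node of higher
+priority, then splice the new node in, so the chain stays ordered with
+granted locks first, then waiters oldest-first. */
+#if 0
+struct toy_node_t {
+  toy_node_t* hash; /* next pointer, as in lock_t */
+  long long prio;   /* larger = higher priority */
+};
+
+static void toy_insert_by_prio(toy_node_t*& head, toy_node_t* in_node)
+{
+  if (!head || in_node->prio > head->prio) {
+    in_node->hash = head; /* becomes the new head */
+    head = in_node;
+    return;
+  }
+  toy_node_t* node = head;
+  while (node->hash && node->hash->prio >= in_node->prio) {
+    node = node->hash;
+  }
+  in_node->hash = node->hash; /* splice in after node */
+  node->hash = in_node;
+}
+#endif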
+
+#ifdef UNIV_DEBUG
+static
+bool
+lock_queue_validate(
+ const lock_t *in_lock) /*!< in: lock whose hash list is to be validated */
+{
+ hash_table_t* hash;
+ hash_cell_t* cell;
+ lock_t* next;
+ bool wait_lock __attribute__((unused))= false;
+
+ if (in_lock == NULL) {
+ return true;
+ }
+
+ const page_id_t page_id(in_lock->un_member.rec_lock.page_id);
+ hash = lock_hash_get(in_lock->type_mode);
+ cell = &hash->array[hash->calc_hash(page_id.fold())];
+ next = (lock_t *) cell->node;
+ while (next != NULL) {
+ // If this is a granted lock, check that there's no wait lock before it.
+ if (!lock_get_wait(next)) {
+ ut_ad(!wait_lock);
+ } else {
+ wait_lock = true;
+ }
+ next = next->hash;
+ }
+ return true;
+}
+#endif /* UNIV_DEBUG */
+
+static
+void
+lock_rec_insert_to_head(
+ lock_t *in_lock, /*!< in: lock to be inserted */
+ ulint rec_fold) /*!< in: rec_fold of the page */
+{
+ hash_table_t* hash;
+ hash_cell_t* cell;
+ lock_t* node;
+
+ if (in_lock == NULL) {
+ return;
+ }
+
+ hash = lock_hash_get(in_lock->type_mode);
+ cell = &hash->array[hash->calc_hash(rec_fold)];
+ node = (lock_t *) cell->node;
+ if (node != in_lock) {
+ cell->node = in_lock;
+ in_lock->hash = node;
+ }
+}
+
+/** Enqueue a waiting request for a lock which cannot be granted immediately.
+Check for deadlocks.
+@param[in] type_mode the requested lock mode (LOCK_S or LOCK_X)
+ possibly ORed with LOCK_GAP or
+ LOCK_REC_NOT_GAP, ORed with
+ LOCK_INSERT_INTENTION if this
+ waiting lock request is set
+ when performing an insert of
+ an index record
+@param[in] block leaf page in the index
+@param[in] heap_no record heap number in the block
+@param[in] index index tree
+@param[in,out] thr query thread
+@param[in] prdt minimum bounding box (spatial index)
+@retval DB_LOCK_WAIT if the waiting lock was enqueued
+@retval DB_DEADLOCK if this transaction was chosen as the victim
+@retval DB_SUCCESS_LOCKED_REC if the other transaction was chosen as a victim
+ (or it happened to commit) */
+dberr_t
+lock_rec_enqueue_waiting(
+#ifdef WITH_WSREP
+ lock_t* c_lock, /*!< conflicting lock */
+#endif
+ unsigned type_mode,
+ const buf_block_t* block,
+ ulint heap_no,
+ dict_index_t* index,
+ que_thr_t* thr,
+ lock_prdt_t* prdt)
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(!srv_read_only_mode);
+ ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
+
+ trx_t* trx = thr_get_trx(thr);
+
+ ut_ad(trx_mutex_own(trx));
+ ut_a(!que_thr_stop(thr));
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ break;
+ case TRX_DICT_OP_TABLE:
+ case TRX_DICT_OP_INDEX:
+ ib::error() << "A record lock wait happens in a dictionary"
+ " operation. index "
+ << index->name
+ << " of table "
+ << index->table->name
+ << ". " << BUG_REPORT_MSG;
+ ut_ad(0);
+ }
+
+ if (trx->mysql_thd && thd_lock_wait_timeout(trx->mysql_thd) == 0) {
+ trx->error_state = DB_LOCK_WAIT_TIMEOUT;
+ return DB_LOCK_WAIT_TIMEOUT;
+ }
+
+ /* Enqueue the lock request that will wait to be granted, note that
+ we already own the trx mutex. */
+ lock_t* lock = lock_rec_create(
+#ifdef WITH_WSREP
+ c_lock, thr,
+#endif
+ type_mode | LOCK_WAIT, block, heap_no, index, trx, TRUE);
+
+ if (prdt && type_mode & LOCK_PREDICATE) {
+ lock_prdt_set_prdt(lock, prdt);
+ }
+
+ if (ut_d(const trx_t* victim =)
+ DeadlockChecker::check_and_resolve(lock, trx)) {
+ ut_ad(victim == trx);
+ lock_reset_lock_and_trx_wait(lock);
+ lock_rec_reset_nth_bit(lock, heap_no);
+ return DB_DEADLOCK;
+ }
+
+ if (!trx->lock.wait_lock) {
+ /* If there was a deadlock but we chose another
+ transaction as a victim, it is possible that we
+ already have the lock now granted! */
+#ifdef WITH_WSREP
+ if (UNIV_UNLIKELY(wsrep_debug)) {
+ ib::info() << "WSREP: BF thread got lock granted early, ID " << ib::hex(trx->id)
+ << " query: " << wsrep_thd_query(trx->mysql_thd);
+ }
+#endif
+ return DB_SUCCESS_LOCKED_REC;
+ }
+
+ trx->lock.que_state = TRX_QUE_LOCK_WAIT;
+
+ trx->lock.was_chosen_as_deadlock_victim = false;
+ trx->lock.wait_started = time(NULL);
+
+ ut_a(que_thr_stop(thr));
+
+ DBUG_LOG("ib_lock", "trx " << ib::hex(trx->id)
+ << " waits for lock in index " << index->name
+ << " of table " << index->table->name);
+
+ MONITOR_INC(MONITOR_LOCKREC_WAIT);
+
+ if (innodb_lock_schedule_algorithm
+ == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
+ && !prdt
+ && !thd_is_replication_slave_thread(lock->trx->mysql_thd)) {
+ HASH_DELETE(lock_t, hash, &lock_sys.rec_hash,
+ lock_rec_lock_fold(lock), lock);
+ dberr_t res = lock_rec_insert_by_trx_age(lock);
+ if (res != DB_SUCCESS) {
+ return res;
+ }
+ }
+
+ return DB_LOCK_WAIT;
+}
+
+/*********************************************************************//**
+Looks for a record lock struct of a suitable type set by the same trx on the
+same page. This can be used to save space when a new record lock should be set
+on a page: no new struct is needed if a suitable old one is found.
+@return lock or NULL */
+static inline
+lock_t*
+lock_rec_find_similar_on_page(
+ ulint type_mode, /*!< in: lock type_mode field */
+ ulint heap_no, /*!< in: heap number of the record */
+ lock_t* lock, /*!< in: lock_sys.get_first() */
+ const trx_t* trx) /*!< in: transaction */
+{
+ ut_ad(lock_mutex_own());
+
+ for (/* No op */;
+ lock != NULL;
+ lock = lock_rec_get_next_on_page(lock)) {
+
+ if (lock->trx == trx
+ && lock->type_mode == type_mode
+ && lock_rec_get_n_bits(lock) > heap_no) {
+
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Adds a record lock request to the record queue. The request is normally
+added as the last in the queue, but if there are no waiting lock requests
+on the record, and the request to be added is not a waiting request, we
+can reuse a suitable record lock object already existing on the same page,
+just setting the appropriate bit in its bitmap. This is a low-level function
+which does NOT check for deadlocks or lock compatibility! */
+static
+void
+lock_rec_add_to_queue(
+/*==================*/
+ unsigned type_mode,/*!< in: lock mode, wait, gap
+ etc. flags; type is ignored
+ and replaced by LOCK_REC */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx, /*!< in/out: transaction */
+ bool caller_owns_trx_mutex)
+ /*!< in: TRUE if caller owns the
+ transaction mutex */
+{
+#ifdef UNIV_DEBUG
+ ut_ad(lock_mutex_own());
+ ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx));
+ ut_ad(dict_index_is_clust(index)
+ || dict_index_get_online_status(index) != ONLINE_INDEX_CREATION);
+ switch (type_mode & LOCK_MODE_MASK) {
+ case LOCK_X:
+ case LOCK_S:
+ break;
+ default:
+ ut_error;
+ }
+
+ if (!(type_mode & (LOCK_WAIT | LOCK_GAP))) {
+ lock_mode mode = (type_mode & LOCK_MODE_MASK) == LOCK_S
+ ? LOCK_X
+ : LOCK_S;
+ const lock_t* other_lock
+ = lock_rec_other_has_expl_req(
+ mode, block, false, heap_no, trx);
+#ifdef WITH_WSREP
+ if (UNIV_LIKELY_NULL(other_lock) && trx->is_wsrep()) {
+ /* Only a BF transaction may be granted a lock
+ ahead of another conflicting lock request. */
+ if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE)
+ && !wsrep_thd_is_BF(other_lock->trx->mysql_thd, FALSE)) {
+ /* If it is not BF, this case is a bug. */
+ wsrep_report_bf_lock_wait(trx->mysql_thd, trx->id);
+ wsrep_report_bf_lock_wait(other_lock->trx->mysql_thd, other_lock->trx->id);
+ ut_error;
+ }
+ } else
+#endif /* WITH_WSREP */
+ ut_ad(!other_lock);
+ }
+#endif /* UNIV_DEBUG */
+
+ type_mode |= LOCK_REC;
+
+ /* If rec is the supremum record, then we can reset the gap bit, as
+ all locks on the supremum are automatically of the gap type, and we
+ try to avoid unnecessary memory consumption of a new record lock
+ struct for a gap type lock */
+
+ if (heap_no == PAGE_HEAP_NO_SUPREMUM) {
+ ut_ad(!(type_mode & LOCK_REC_NOT_GAP));
+
+ /* There should never be LOCK_REC_NOT_GAP on a supremum
+ record, but let us play safe */
+
+ type_mode &= ~(LOCK_GAP | LOCK_REC_NOT_GAP);
+ }
+
+ lock_t* lock;
+ lock_t* first_lock;
+
+ /* Look for a waiting lock request on the same record or on a gap */
+
+ for (first_lock = lock = lock_sys.get_first(*lock_hash_get(type_mode),
+ block->page.id());
+ lock != NULL;
+ lock = lock_rec_get_next_on_page(lock)) {
+
+ if (lock_get_wait(lock)
+ && lock_rec_get_nth_bit(lock, heap_no)) {
+
+ break;
+ }
+ }
+
+ if (lock == NULL && !(type_mode & LOCK_WAIT)) {
+
+ /* Look for a similar record lock on the same page:
+ if one is found and there are no waiting lock requests,
+ we can just set the bit */
+
+ lock = lock_rec_find_similar_on_page(
+ type_mode, heap_no, first_lock, trx);
+
+ if (lock != NULL) {
+
+ lock_rec_set_nth_bit(lock, heap_no);
+
+ return;
+ }
+ }
+
+ lock_rec_create(
+#ifdef WITH_WSREP
+ NULL, NULL,
+#endif
+ type_mode, block, heap_no, index, trx, caller_owns_trx_mutex);
+}
+
+/*********************************************************************//**
+Tries to lock the specified record in the mode requested. If not immediately
+possible, enqueues a waiting lock request. This is a low-level function
+which does NOT look at implicit locks! Checks lock compatibility within
+explicit locks. This function sets a normal next-key lock, or in the case
+of a page supremum record, a gap type lock.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
+static
+dberr_t
+lock_rec_lock(
+/*==========*/
+ bool impl, /*!< in: if true, no lock is set
+ if no wait is necessary: we
+ assume that the caller will
+ set an implicit lock */
+ unsigned mode, /*!< in: lock mode: LOCK_X or
+ LOCK_S possibly ORed to either
+ LOCK_GAP or LOCK_REC_NOT_GAP */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of record */
+ dict_index_t* index, /*!< in: index of record */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t *trx= thr_get_trx(thr);
+ dberr_t err= DB_SUCCESS;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad((LOCK_MODE_MASK & mode) == LOCK_S ||
+ (LOCK_MODE_MASK & mode) == LOCK_X);
+ ut_ad((mode & LOCK_TYPE_MASK) == LOCK_GAP ||
+ (mode & LOCK_TYPE_MASK) == LOCK_REC_NOT_GAP ||
+ (mode & LOCK_TYPE_MASK) == 0);
+ ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
+ DBUG_EXECUTE_IF("innodb_report_deadlock", return DB_DEADLOCK;);
+
+ lock_mutex_enter();
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_S ||
+ lock_table_has(trx, index->table, LOCK_IS));
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_X ||
+ lock_table_has(trx, index->table, LOCK_IX));
+
+ if (lock_table_has(trx, index->table,
+ static_cast<lock_mode>(LOCK_MODE_MASK & mode)));
+ else if (lock_t *lock= lock_sys.get_first(block->page.id()))
+ {
+ trx_mutex_enter(trx);
+ if (lock_rec_get_next_on_page(lock) ||
+ lock->trx != trx ||
+ lock->type_mode != (ulint(mode) | LOCK_REC) ||
+ lock_rec_get_n_bits(lock) <= heap_no)
+ {
+ /* Do nothing if the trx already has a strong enough lock on rec */
+ if (!lock_rec_has_expl(mode, block, heap_no, trx))
+ {
+ if (
+#ifdef WITH_WSREP
+ lock_t *c_lock=
+#endif
+ lock_rec_other_has_conflicting(mode, block, heap_no, trx))
+ {
+ /*
+ If another transaction has a non-gap conflicting
+ request in the queue, as this transaction does not
+ have a lock strong enough already granted on the
+ record, we have to wait. */
+ err = lock_rec_enqueue_waiting(
+#ifdef WITH_WSREP
+ c_lock,
+#endif /* WITH_WSREP */
+ mode, block, heap_no, index, thr, NULL);
+ }
+ else if (!impl)
+ {
+ /* Set the requested lock on the record. */
+ lock_rec_add_to_queue(LOCK_REC | mode, block, heap_no, index, trx,
+ true);
+ err= DB_SUCCESS_LOCKED_REC;
+ }
+ }
+ }
+ else if (!impl)
+ {
+ /*
+ If the nth bit of the record lock is already set then we do not set
+ a new lock bit, otherwise we set it
+ */
+ if (!lock_rec_get_nth_bit(lock, heap_no))
+ {
+ lock_rec_set_nth_bit(lock, heap_no);
+ err= DB_SUCCESS_LOCKED_REC;
+ }
+ }
+ trx_mutex_exit(trx);
+ }
+ else
+ {
+ /*
+ Simplified and faster path for the most common cases.
+ Note that we don't own the trx mutex.
+ */
+ if (!impl)
+ lock_rec_create(
+#ifdef WITH_WSREP
+ NULL, NULL,
+#endif
+ mode, block, heap_no, index, trx, false);
+
+ err= DB_SUCCESS_LOCKED_REC;
+ }
+ lock_mutex_exit();
+ MONITOR_ATOMIC_INC(MONITOR_NUM_RECLOCK_REQ);
+ return err;
+}
+
+/*********************************************************************//**
+Checks if a waiting record lock request still has to wait in a queue.
+@return lock that is causing the wait, or NULL if it no longer has to wait */
+static
+const lock_t*
+lock_rec_has_to_wait_in_queue(
+/*==========================*/
+ const lock_t* wait_lock) /*!< in: waiting record lock */
+{
+ const lock_t* lock;
+ ulint heap_no;
+ ulint bit_mask;
+ ulint bit_offset;
+
+ ut_ad(wait_lock);
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_wait(wait_lock));
+ ut_ad(lock_get_type_low(wait_lock) == LOCK_REC);
+
+ heap_no = lock_rec_find_set_bit(wait_lock);
+
+ bit_offset = heap_no / 8;
+ bit_mask = static_cast<ulint>(1) << (heap_no % 8);
+
+ for (lock = lock_sys.get_first(*lock_hash_get(wait_lock->type_mode),
+ wait_lock->un_member.rec_lock.page_id);
+ lock != wait_lock;
+ lock = lock_rec_get_next_on_page_const(lock)) {
+ const byte* p = (const byte*) &lock[1];
+
+ if (heap_no < lock_rec_get_n_bits(lock)
+ && (p[bit_offset] & bit_mask)
+ && lock_has_to_wait(wait_lock, lock)) {
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
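+
+/* The byte/bit addressing used above, in isolation (not part of the
+build): heap number N lives in byte N / 8, bit N % 8 of the bitmap that
+follows the lock struct. */
+#if 0
+#include <cassert>
+
+static bool toy_nth_bit_is_set(const unsigned char* bitmap, unsigned heap_no)
+{
+  const unsigned byte_offset = heap_no / 8;
+  const unsigned bit_mask = 1U << (heap_no % 8);
+
+  return (bitmap[byte_offset] & bit_mask) != 0;
+}
+
+static void toy_bitmap_demo()
+{
+  unsigned char bitmap[2] = {0, 0};
+
+  bitmap[10 / 8] |= 1U << (10 % 8); /* set heap_no 10 */
+  assert(toy_nth_bit_is_set(bitmap, 10));
+  assert(!toy_nth_bit_is_set(bitmap, 9));
+}
+#endif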
+
+/** Grant a lock to a waiting lock request and release the waiting transaction
+after lock_reset_lock_and_trx_wait() has been called. */
+static void lock_grant_after_reset(lock_t* lock)
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(lock->trx));
+
+ if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+ dict_table_t* table = lock->un_member.tab_lock.table;
+
+ if (table->autoinc_trx == lock->trx) {
+ ib::error() << "Transaction already had an"
+ << " AUTO-INC lock!";
+ } else {
+ table->autoinc_trx = lock->trx;
+
+ ib_vector_push(lock->trx->autoinc_locks, &lock);
+ }
+ }
+
+ DBUG_PRINT("ib_lock", ("wait for trx " TRX_ID_FMT " ends",
+ trx_get_id_for_print(lock->trx)));
+
+ /* If we are resolving a deadlock by choosing another transaction
+ as a victim, then our original transaction may not be in the
+ TRX_QUE_LOCK_WAIT state, and there is no need to end the lock wait
+ for it */
+
+ if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+ que_thr_t* thr;
+
+ thr = que_thr_end_lock_wait(lock->trx);
+
+ if (thr != NULL) {
+ lock_wait_release_thread_if_suspended(thr);
+ }
+ }
+}
+
+/** Grant a lock to a waiting lock request and release the waiting transaction. */
+static void lock_grant(lock_t* lock)
+{
+ lock_reset_lock_and_trx_wait(lock);
+ trx_mutex_enter(lock->trx);
+ lock_grant_after_reset(lock);
+ trx_mutex_exit(lock->trx);
+}
+
+/*************************************************************//**
+Cancels a waiting record lock request and releases the waiting transaction
+that requested it. NOTE: does NOT check if waiting lock requests behind this
+one can now be granted! */
+static
+void
+lock_rec_cancel(
+/*============*/
+ lock_t* lock) /*!< in: waiting record lock request */
+{
+ que_thr_t* thr;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ /* Reset the bit (there can be only one set bit) in the lock bitmap */
+ lock_rec_reset_nth_bit(lock, lock_rec_find_set_bit(lock));
+
+ /* Reset the wait flag and the back pointer to lock in trx */
+
+ lock_reset_lock_and_trx_wait(lock);
+
+ /* The following function releases the trx from lock wait */
+
+ trx_mutex_enter(lock->trx);
+
+ thr = que_thr_end_lock_wait(lock->trx);
+
+ if (thr != NULL) {
+ lock_wait_release_thread_if_suspended(thr);
+ }
+
+ trx_mutex_exit(lock->trx);
+}
+
+static void lock_grant_and_move_on_page(ulint rec_fold, const page_id_t id)
+{
+ lock_t* lock;
+ lock_t* previous = static_cast<lock_t*>(
+ lock_sys.rec_hash.array[lock_sys.rec_hash.calc_hash(rec_fold)].
+ node);
+ if (previous == NULL) {
+ return;
+ }
+ if (previous->un_member.rec_lock.page_id == id) {
+ lock = previous;
+ }
+ else {
+ while (previous->hash &&
+ (previous->hash->un_member.rec_lock.page_id != id)) {
+ previous = previous->hash;
+ }
+ lock = previous->hash;
+ }
+
+ ut_ad(previous->hash == lock || previous == lock);
+ /* Grant locks if there are no conflicting locks ahead.
+ Move granted locks to the head of the list. */
+ while (lock) {
+ /* Grant the lock if it is a wait lock on this page that no longer needs to wait. */
+ ut_ad(!lock->trx->is_wsrep());
+ if (lock_get_wait(lock)
+ && lock->un_member.rec_lock.page_id == id
+ && !lock_rec_has_to_wait_in_queue(lock)) {
+ lock_grant(lock);
+
+ if (previous != NULL) {
+ /* Move the lock to the head of the list. */
+ HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock);
+ lock_rec_insert_to_head(lock, rec_fold);
+ } else {
+ /* Already at the head of the list. */
+ previous = lock;
+ }
+ /* Move on to the next lock. */
+ lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, previous));
+ } else {
+ previous = lock;
+ lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, lock));
+ }
+ }
+}
+
+/** Remove a record lock request, waiting or granted, from the queue and
+grant locks to other transactions in the queue if they now are entitled
+to a lock. NOTE: all record locks contained in in_lock are removed.
+@param[in,out] in_lock record lock */
+static void lock_rec_dequeue_from_page(lock_t* in_lock)
+{
+ hash_table_t* lock_hash;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+ /* We may or may not be holding in_lock->trx->mutex here. */
+
+ const page_id_t page_id(in_lock->un_member.rec_lock.page_id);
+
+ in_lock->index->table->n_rec_locks--;
+
+ lock_hash = lock_hash_get(in_lock->type_mode);
+
+ const ulint rec_fold = page_id.fold();
+
+ HASH_DELETE(lock_t, hash, lock_hash, rec_fold, in_lock);
+ UT_LIST_REMOVE(in_lock->trx->lock.trx_locks, in_lock);
+
+ MONITOR_INC(MONITOR_RECLOCK_REMOVED);
+ MONITOR_DEC(MONITOR_NUM_RECLOCK);
+
+ if (innodb_lock_schedule_algorithm
+ == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS
+ || lock_hash != &lock_sys.rec_hash
+ || thd_is_replication_slave_thread(in_lock->trx->mysql_thd)) {
+ /* Check if waiting locks in the queue can now be granted:
+ grant locks if there are no conflicting locks ahead. Stop at
+ the first X lock that is waiting or has been granted. */
+
+ for (lock_t* lock = lock_sys.get_first(*lock_hash, page_id);
+ lock != NULL;
+ lock = lock_rec_get_next_on_page(lock)) {
+
+ if (!lock_get_wait(lock)) {
+ continue;
+ }
+ const lock_t* c = lock_rec_has_to_wait_in_queue(lock);
+ if (!c) {
+ /* Grant the lock */
+ ut_ad(lock->trx != in_lock->trx);
+ lock_grant(lock);
+ }
+ }
+ } else {
+ lock_grant_and_move_on_page(rec_fold, page_id);
+ }
+}
+
+/*************************************************************//**
+Removes a record lock request, waiting or granted, from the queue. */
+void
+lock_rec_discard(
+/*=============*/
+ lock_t* in_lock) /*!< in: record lock object: all
+ record locks which are contained
+ in this lock object are removed */
+{
+ trx_lock_t* trx_lock;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+
+ trx_lock = &in_lock->trx->lock;
+
+ in_lock->index->table->n_rec_locks--;
+
+ HASH_DELETE(lock_t, hash, lock_hash_get(in_lock->type_mode),
+ in_lock->un_member.rec_lock.page_id.fold(), in_lock);
+
+ UT_LIST_REMOVE(trx_lock->trx_locks, in_lock);
+
+ MONITOR_INC(MONITOR_RECLOCK_REMOVED);
+ MONITOR_DEC(MONITOR_NUM_RECLOCK);
+}
+
+/*************************************************************//**
+Removes record lock objects set on an index page which is discarded. This
+function does not move locks or check for waiting locks; therefore, the
+lock bitmaps must already be reset when this function is called. */
+static void lock_rec_free_all_from_discard_page_low(const page_id_t id,
+ hash_table_t *lock_hash)
+{
+ lock_t *lock= lock_sys.get_first(*lock_hash, id);
+
+ while (lock)
+ {
+ ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED);
+ ut_ad(!lock_get_wait(lock));
+ lock_t *next_lock= lock_rec_get_next_on_page(lock);
+ lock_rec_discard(lock);
+ lock= next_lock;
+ }
+}
+
+/*************************************************************//**
+Removes record lock objects set on an index page which is discarded. This
+function does not move locks or check for waiting locks; therefore, the
+lock bitmaps must already be reset when this function is called. */
+void
+lock_rec_free_all_from_discard_page(
+/*================================*/
+ const buf_block_t* block) /*!< in: page to be discarded */
+{
+ const page_id_t page_id(block->page.id());
+ lock_rec_free_all_from_discard_page_low(page_id, &lock_sys.rec_hash);
+ lock_rec_free_all_from_discard_page_low(page_id, &lock_sys.prdt_hash);
+ lock_rec_free_all_from_discard_page_low(page_id, &lock_sys.prdt_page_hash);
+}
+
+/*============= RECORD LOCK MOVING AND INHERITING ===================*/
+
+/*************************************************************//**
+Resets the lock bits for a single record. Releases transactions waiting for
+lock requests here. */
+static
+void
+lock_rec_reset_and_release_wait_low(
+/*================================*/
+ hash_table_t* hash, /*!< in: hash table */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no)/*!< in: heap number of record */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ for (lock = lock_rec_get_first(hash, block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+
+ if (lock_get_wait(lock)) {
+ lock_rec_cancel(lock);
+ } else {
+ lock_rec_reset_nth_bit(lock, heap_no);
+ }
+ }
+}
+
+/*************************************************************//**
+Resets the lock bits for a single record. Releases transactions waiting for
+lock requests here. */
+static
+void
+lock_rec_reset_and_release_wait(
+/*============================*/
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no)/*!< in: heap number of record */
+{
+ lock_rec_reset_and_release_wait_low(
+ &lock_sys.rec_hash, block, heap_no);
+
+ lock_rec_reset_and_release_wait_low(
+ &lock_sys.prdt_hash, block, PAGE_HEAP_NO_INFIMUM);
+ lock_rec_reset_and_release_wait_low(
+ &lock_sys.prdt_page_hash, block, PAGE_HEAP_NO_INFIMUM);
+}
+
+/*************************************************************//**
+Makes a record inherit the locks (except LOCK_INSERT_INTENTION type)
+of another record as gap type locks, but does not reset the lock bits of
+the other record. Also waiting lock requests on rec are inherited as
+GRANTED gap locks. */
+static
+void
+lock_rec_inherit_to_gap(
+/*====================*/
+ const buf_block_t* heir_block, /*!< in: block containing the
+ record which inherits */
+ const buf_block_t* block, /*!< in: block containing the
+ record from which inherited;
+ does NOT reset the locks on
+ this record */
+ ulint heir_heap_no, /*!< in: heap_no of the
+ inheriting record */
+ ulint heap_no) /*!< in: heap_no of the
+ donating record */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ /* At READ UNCOMMITTED or READ COMMITTED isolation level,
+ we do not want locks set by an UPDATE or a DELETE to be
+ inherited as gap type locks. But we DO want S-locks and
+ X-locks (the latter taken for REPLACE) that were set by a
+ consistency constraint to be inherited also then. */
+
+ for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+
+ if (!lock_rec_get_insert_intention(lock)
+ && (lock->trx->isolation_level > TRX_ISO_READ_COMMITTED
+ || lock_get_mode(lock) !=
+ (lock->trx->duplicates ? LOCK_S : LOCK_X))) {
+ lock_rec_add_to_queue(
+ LOCK_REC | LOCK_GAP | lock_get_mode(lock),
+ heir_block, heir_heap_no, lock->index,
+ lock->trx, FALSE);
+ }
+ }
+}
+
+/*************************************************************//**
+Makes a record inherit the gap locks (except LOCK_INSERT_INTENTION type)
+of another record as gap type locks, but does not reset the lock bits of the
+other record. Also waiting lock requests are inherited as GRANTED gap locks. */
+static
+void
+lock_rec_inherit_to_gap_if_gap_lock(
+/*================================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ ulint heir_heap_no, /*!< in: heap_no of
+ record which inherits */
+ ulint heap_no) /*!< in: heap_no of record
+ from which inherited;
+ does NOT reset the locks
+ on this record */
+{
+ lock_t* lock;
+
+ lock_mutex_enter();
+
+ for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+
+ if (!lock_rec_get_insert_intention(lock)
+ && (heap_no == PAGE_HEAP_NO_SUPREMUM
+ || !lock_rec_get_rec_not_gap(lock))) {
+
+ lock_rec_add_to_queue(
+ LOCK_REC | LOCK_GAP | lock_get_mode(lock),
+ block, heir_heap_no, lock->index,
+ lock->trx, FALSE);
+ }
+ }
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Moves the locks of a record to another record and resets the lock bits of
+the donating record. */
+static
+void
+lock_rec_move_low(
+/*==============*/
+ hash_table_t* lock_hash, /*!< in: hash table to use */
+ const buf_block_t* receiver, /*!< in: buffer block containing
+ the receiving record */
+ const buf_block_t* donator, /*!< in: buffer block containing
+ the donating record */
+ ulint receiver_heap_no,/*!< in: heap_no of the record
+ which gets the locks; there
+ must be no lock requests
+ on it! */
+ ulint donator_heap_no)/*!< in: heap_no of the record
+ which gives the locks */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ /* If the lock is a predicate lock, it resides on the INFIMUM record */
+ ut_ad(lock_rec_get_first(
+ lock_hash, receiver, receiver_heap_no) == NULL
+ || lock_hash == &lock_sys.prdt_hash
+ || lock_hash == &lock_sys.prdt_page_hash);
+
+ for (lock = lock_rec_get_first(lock_hash,
+ donator, donator_heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next(donator_heap_no, lock)) {
+
+ const auto type_mode = lock->type_mode;
+
+ lock_rec_reset_nth_bit(lock, donator_heap_no);
+
+ if (type_mode & LOCK_WAIT) {
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ /* Note that we FIRST reset the bit, and then set the lock:
+ the function also works if donator == receiver */
+
+ lock_rec_add_to_queue(
+ type_mode, receiver, receiver_heap_no,
+ lock->index, lock->trx, FALSE);
+ }
+
+ ut_ad(!lock_rec_get_first(&lock_sys.rec_hash,
+ donator, donator_heap_no));
+}
+
+/** Move all the granted locks to the front of the given lock list.
+All the waiting locks will be at the end of the list.
+@param[in,out] lock_list the given lock list. */
+static
+void
+lock_move_granted_locks_to_front(
+ UT_LIST_BASE_NODE_T(lock_t)& lock_list)
+{
+ lock_t* lock;
+
+ bool seen_waiting_lock = false;
+
+ for (lock = UT_LIST_GET_FIRST(lock_list); lock;
+ lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+
+ if (!seen_waiting_lock) {
+ if (lock->is_waiting()) {
+ seen_waiting_lock = true;
+ }
+ continue;
+ }
+
+ ut_ad(seen_waiting_lock);
+
+ if (!lock->is_waiting()) {
+ lock_t* prev = UT_LIST_GET_PREV(trx_locks, lock);
+ ut_a(prev);
+ ut_list_move_to_front(lock_list, lock);
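+ /* ut_list_move_to_front() unlinked the lock from its
+ current position; resume the scan at the predecessor so
+ that the node following the moved lock is not skipped. */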
+ lock = prev;
+ }
+ }
+}
+
+/*************************************************************//**
+Moves the locks of a record to another record and resets the lock bits of
+the donating record. */
+UNIV_INLINE
+void
+lock_rec_move(
+/*==========*/
+ const buf_block_t* receiver, /*!< in: buffer block containing
+ the receiving record */
+ const buf_block_t* donator, /*!< in: buffer block containing
+ the donating record */
+ ulint receiver_heap_no,/*!< in: heap_no of the record
+ which gets the locks; there
+ must be no lock requests
+ on it! */
+ ulint donator_heap_no)/*!< in: heap_no of the record
+ which gives the locks */
+{
+ lock_rec_move_low(&lock_sys.rec_hash, receiver, donator,
+ receiver_heap_no, donator_heap_no);
+}
+
+/*************************************************************//**
+Updates the lock table when we have reorganized a page. NOTE: we copy
+also the locks set on the infimum of the page; the infimum may carry
+locks if an update of a record is occurring on the page, and its locks
+were temporarily stored on the infimum. */
+void
+lock_move_reorganize_page(
+/*======================*/
+ const buf_block_t* block, /*!< in: old index page, now
+ reorganized */
+ const buf_block_t* oblock) /*!< in: copy of the old, not
+ reorganized page */
+{
+ lock_t* lock;
+ UT_LIST_BASE_NODE_T(lock_t) old_locks;
+ mem_heap_t* heap = NULL;
+ ulint comp;
+
+ lock_mutex_enter();
+
+ /* FIXME: This needs to deal with predicate locks too */
+ lock = lock_sys.get_first(block->page.id());
+
+ if (lock == NULL) {
+ lock_mutex_exit();
+
+ return;
+ }
+
+ heap = mem_heap_create(256);
+
+ /* Copy first all the locks on the page to heap and reset the
+ bitmaps in the original locks; chain the copies of the locks
+ using the trx_locks field in them. */
+
+ UT_LIST_INIT(old_locks, &lock_t::trx_locks);
+
+ do {
+ /* Make a copy of the lock */
+ lock_t* old_lock = lock_rec_copy(lock, heap);
+
+ UT_LIST_ADD_LAST(old_locks, old_lock);
+
+ /* Reset bitmap of lock */
+ lock_rec_bitmap_reset(lock);
+
+ if (lock_get_wait(lock)) {
+
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ lock = lock_rec_get_next_on_page(lock);
+ } while (lock != NULL);
+
+ comp = page_is_comp(block->frame);
+ ut_ad(comp == page_is_comp(oblock->frame));
+
+ lock_move_granted_locks_to_front(old_locks);
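+ /* The copied locks are replayed in list order below, so
+ keeping granted locks ahead of waiting ones preserves the
+ queue order when the lock bits are set again on the
+ reorganized page. */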
+
+ DBUG_EXECUTE_IF("do_lock_reverse_page_reorganize",
+ ut_list_reverse(old_locks););
+
+ for (lock = UT_LIST_GET_FIRST(old_locks); lock;
+ lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+
+ /* NOTE: we copy also the locks set on the infimum and
+ supremum of the page; the infimum may carry locks if an
+ update of a record is occurring on the page, and its locks
+ were temporarily stored on the infimum */
+ const rec_t* rec1 = page_get_infimum_rec(
+ buf_block_get_frame(block));
+ const rec_t* rec2 = page_get_infimum_rec(
+ buf_block_get_frame(oblock));
+
+ /* Set locks according to old locks */
+ for (;;) {
+ ulint old_heap_no;
+ ulint new_heap_no;
+ ut_d(const rec_t* const orec = rec1);
+ ut_ad(page_rec_is_metadata(rec1)
+ == page_rec_is_metadata(rec2));
+
+ if (comp) {
+ old_heap_no = rec_get_heap_no_new(rec2);
+ new_heap_no = rec_get_heap_no_new(rec1);
+
+ rec1 = page_rec_get_next_low(rec1, TRUE);
+ rec2 = page_rec_get_next_low(rec2, TRUE);
+ } else {
+ old_heap_no = rec_get_heap_no_old(rec2);
+ new_heap_no = rec_get_heap_no_old(rec1);
+ ut_ad(!memcmp(rec1, rec2,
+ rec_get_data_size_old(rec2)));
+
+ rec1 = page_rec_get_next_low(rec1, FALSE);
+ rec2 = page_rec_get_next_low(rec2, FALSE);
+ }
+
+ /* Clear the bit in old_lock. */
+ if (old_heap_no < lock->un_member.rec_lock.n_bits
+ && lock_rec_reset_nth_bit(lock, old_heap_no)) {
+ ut_ad(!page_rec_is_metadata(orec));
+
+ /* NOTE that the old lock bitmap could be too
+ small for the new heap number! */
+
+ lock_rec_add_to_queue(
+ lock->type_mode, block, new_heap_no,
+ lock->index, lock->trx, FALSE);
+ }
+
+ if (new_heap_no == PAGE_HEAP_NO_SUPREMUM) {
+ ut_ad(old_heap_no == PAGE_HEAP_NO_SUPREMUM);
+ break;
+ }
+ }
+
+ ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED);
+ }
+
+ lock_mutex_exit();
+
+ mem_heap_free(heap);
+
+#ifdef UNIV_DEBUG_LOCK_VALIDATE
+ ut_ad(lock_rec_validate_page(block));
+#endif
+}
+
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list end is moved to another page. */
+void
+lock_move_rec_list_end(
+/*===================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec) /*!< in: record on page: this
+ is the first record moved */
+{
+ lock_t* lock;
+ const ulint comp = page_rec_is_comp(rec);
+
+ ut_ad(buf_block_get_frame(block) == page_align(rec));
+ ut_ad(comp == page_is_comp(buf_block_get_frame(new_block)));
+
+ lock_mutex_enter();
+
+ /* Note: when we move locks from record to record, waiting locks
+ and possible granted gap type locks behind them are enqueued in
+ the original order, because new elements are inserted to a hash
+ table to the end of the hash chain, and lock_rec_add_to_queue
+ does not reuse locks if there are waiters in the queue. */
+
+ for (lock = lock_sys.get_first(block->page.id());
+ lock;
+ lock = lock_rec_get_next_on_page(lock)) {
+ const rec_t* rec1 = rec;
+ const rec_t* rec2;
+ const auto type_mode = lock->type_mode;
+
+ if (comp) {
+ if (page_offset(rec1) == PAGE_NEW_INFIMUM) {
+ rec1 = page_rec_get_next_low(rec1, TRUE);
+ }
+
+ rec2 = page_rec_get_next_low(
+ buf_block_get_frame(new_block)
+ + PAGE_NEW_INFIMUM, TRUE);
+ } else {
+ if (page_offset(rec1) == PAGE_OLD_INFIMUM) {
+ rec1 = page_rec_get_next_low(rec1, FALSE);
+ }
+
+ rec2 = page_rec_get_next_low(
+ buf_block_get_frame(new_block)
+ + PAGE_OLD_INFIMUM, FALSE);
+ }
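+ /* rec1 and rec2 now point to corresponding records on the
+ old page and the new page; the two cursors are advanced in
+ lockstep below. */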
+
+ /* Copy lock requests on user records to new page and
+ reset the lock bits on the old */
+
+ for (;;) {
+ ut_ad(page_rec_is_metadata(rec1)
+ == page_rec_is_metadata(rec2));
+ ut_d(const rec_t* const orec = rec1);
+
+ ulint rec1_heap_no;
+ ulint rec2_heap_no;
+
+ if (comp) {
+ rec1_heap_no = rec_get_heap_no_new(rec1);
+
+ if (rec1_heap_no == PAGE_HEAP_NO_SUPREMUM) {
+ break;
+ }
+
+ rec2_heap_no = rec_get_heap_no_new(rec2);
+ rec1 = page_rec_get_next_low(rec1, TRUE);
+ rec2 = page_rec_get_next_low(rec2, TRUE);
+ } else {
+ rec1_heap_no = rec_get_heap_no_old(rec1);
+
+ if (rec1_heap_no == PAGE_HEAP_NO_SUPREMUM) {
+ break;
+ }
+
+ rec2_heap_no = rec_get_heap_no_old(rec2);
+
+ ut_ad(rec_get_data_size_old(rec1)
+ == rec_get_data_size_old(rec2));
+
+ ut_ad(!memcmp(rec1, rec2,
+ rec_get_data_size_old(rec1)));
+
+ rec1 = page_rec_get_next_low(rec1, FALSE);
+ rec2 = page_rec_get_next_low(rec2, FALSE);
+ }
+
+ if (rec1_heap_no < lock->un_member.rec_lock.n_bits
+ && lock_rec_reset_nth_bit(lock, rec1_heap_no)) {
+ ut_ad(!page_rec_is_metadata(orec));
+
+ if (type_mode & LOCK_WAIT) {
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ lock_rec_add_to_queue(
+ type_mode, new_block, rec2_heap_no,
+ lock->index, lock->trx, FALSE);
+ }
+ }
+ }
+
+ lock_mutex_exit();
+
+#ifdef UNIV_DEBUG_LOCK_VALIDATE
+ ut_ad(lock_rec_validate_page(block));
+ ut_ad(lock_rec_validate_page(new_block));
+#endif
+}
+
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+void
+lock_move_rec_list_start(
+/*=====================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec, /*!< in: record on page:
+ this is the first
+ record NOT copied */
+ const rec_t* old_end) /*!< in: old
+ previous-to-last
+ record on new_page
+ before the records
+ were copied */
+{
+ lock_t* lock;
+ const ulint comp = page_rec_is_comp(rec);
+
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(new_block->frame == page_align(old_end));
+ ut_ad(comp == page_rec_is_comp(old_end));
+ ut_ad(!page_rec_is_metadata(rec));
+
+ lock_mutex_enter();
+
+ for (lock = lock_sys.get_first(block->page.id());
+ lock;
+ lock = lock_rec_get_next_on_page(lock)) {
+ const rec_t* rec1;
+ const rec_t* rec2;
+ const auto type_mode = lock->type_mode;
+
+ if (comp) {
+ rec1 = page_rec_get_next_low(
+ buf_block_get_frame(block)
+ + PAGE_NEW_INFIMUM, TRUE);
+ rec2 = page_rec_get_next_low(old_end, TRUE);
+ } else {
+ rec1 = page_rec_get_next_low(
+ buf_block_get_frame(block)
+ + PAGE_OLD_INFIMUM, FALSE);
+ rec2 = page_rec_get_next_low(old_end, FALSE);
+ }
+
+ /* Copy lock requests on user records to new page and
+ reset the lock bits on the old */
+
+ while (rec1 != rec) {
+ ut_ad(page_rec_is_metadata(rec1)
+ == page_rec_is_metadata(rec2));
+ ut_d(const rec_t* const prev = rec1);
+
+ ulint rec1_heap_no;
+ ulint rec2_heap_no;
+
+ if (comp) {
+ rec1_heap_no = rec_get_heap_no_new(rec1);
+ rec2_heap_no = rec_get_heap_no_new(rec2);
+
+ rec1 = page_rec_get_next_low(rec1, TRUE);
+ rec2 = page_rec_get_next_low(rec2, TRUE);
+ } else {
+ rec1_heap_no = rec_get_heap_no_old(rec1);
+ rec2_heap_no = rec_get_heap_no_old(rec2);
+
+ ut_ad(!memcmp(rec1, rec2,
+ rec_get_data_size_old(rec2)));
+
+ rec1 = page_rec_get_next_low(rec1, FALSE);
+ rec2 = page_rec_get_next_low(rec2, FALSE);
+ }
+
+ if (rec1_heap_no < lock->un_member.rec_lock.n_bits
+ && lock_rec_reset_nth_bit(lock, rec1_heap_no)) {
+ ut_ad(!page_rec_is_metadata(prev));
+
+ if (type_mode & LOCK_WAIT) {
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ lock_rec_add_to_queue(
+ type_mode, new_block, rec2_heap_no,
+ lock->index, lock->trx, FALSE);
+ }
+ }
+
+#ifdef UNIV_DEBUG
+ if (page_rec_is_supremum(rec)) {
+ ulint i;
+
+ for (i = PAGE_HEAP_NO_USER_LOW;
+ i < lock_rec_get_n_bits(lock); i++) {
+ if (lock_rec_get_nth_bit(lock, i)) {
+ ib::fatal()
+ << "lock_move_rec_list_start():"
+ << i << " not moved in "
+ << (void*) lock;
+ }
+ }
+ }
+#endif /* UNIV_DEBUG */
+ }
+
+ lock_mutex_exit();
+
+#ifdef UNIV_DEBUG_LOCK_VALIDATE
+ ut_ad(lock_rec_validate_page(block));
+#endif
+}
+
+/*************************************************************//**
+Moves the explicit locks on user records to another page when R-tree
+records are moved to another page. */
+void
+lock_rtr_move_rec_list(
+/*===================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ move to */
+ const buf_block_t* block, /*!< in: index page */
+ rtr_rec_move_t* rec_move, /*!< in: recording records
+ moved */
+ ulint num_move) /*!< in: num of rec to move */
+{
+ lock_t* lock;
+ ulint comp;
+
+ if (!num_move) {
+ return;
+ }
+
+ comp = page_rec_is_comp(rec_move[0].old_rec);
+
+ ut_ad(block->frame == page_align(rec_move[0].old_rec));
+ ut_ad(new_block->frame == page_align(rec_move[0].new_rec));
+ ut_ad(comp == page_rec_is_comp(rec_move[0].new_rec));
+
+ lock_mutex_enter();
+
+ for (lock = lock_sys.get_first(block->page.id());
+ lock;
+ lock = lock_rec_get_next_on_page(lock)) {
+ ulint moved = 0;
+ const rec_t* rec1;
+ const rec_t* rec2;
+ const auto type_mode = lock->type_mode;
+
+ /* Copy lock requests on user records to new page and
+ reset the lock bits on the old */
+
+ while (moved < num_move) {
+ ulint rec1_heap_no;
+ ulint rec2_heap_no;
+
+ rec1 = rec_move[moved].old_rec;
+ rec2 = rec_move[moved].new_rec;
+ ut_ad(!page_rec_is_metadata(rec1));
+ ut_ad(!page_rec_is_metadata(rec2));
+
+ if (comp) {
+ rec1_heap_no = rec_get_heap_no_new(rec1);
+ rec2_heap_no = rec_get_heap_no_new(rec2);
+
+ } else {
+ rec1_heap_no = rec_get_heap_no_old(rec1);
+ rec2_heap_no = rec_get_heap_no_old(rec2);
+
+ ut_ad(!memcmp(rec1, rec2,
+ rec_get_data_size_old(rec2)));
+ }
+
+ if (rec1_heap_no < lock->un_member.rec_lock.n_bits
+ && lock_rec_reset_nth_bit(lock, rec1_heap_no)) {
+ if (type_mode & LOCK_WAIT) {
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ lock_rec_add_to_queue(
+ type_mode, new_block, rec2_heap_no,
+ lock->index, lock->trx, FALSE);
+
+ rec_move[moved].moved = true;
+ }
+
+ moved++;
+ }
+ }
+
+ lock_mutex_exit();
+
+#ifdef UNIV_DEBUG_LOCK_VALIDATE
+ ut_ad(lock_rec_validate_page(block));
+#endif
+}
+
+/*************************************************************//**
+Updates the lock table when a page is split to the right. */
+void
+lock_update_split_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block) /*!< in: left page */
+{
+ ulint heap_no = lock_get_min_heap_no(right_block);
+
+ lock_mutex_enter();
+
+ /* Move the locks on the supremum of the left page to the supremum
+ of the right page */
+
+ lock_rec_move(right_block, left_block,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+
+ /* Inherit the locks to the supremum of left page from the successor
+ of the infimum on right page */
+
+ lock_rec_inherit_to_gap(left_block, right_block,
+ PAGE_HEAP_NO_SUPREMUM, heap_no);
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is merged to the right. */
+void
+lock_update_merge_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page to
+ which merged */
+ const rec_t* orig_succ, /*!< in: original
+ successor of infimum
+ on the right page
+ before merge */
+ const buf_block_t* left_block) /*!< in: merged index
+ page which will be
+ discarded */
+{
+ ut_ad(!page_rec_is_metadata(orig_succ));
+
+ lock_mutex_enter();
+
+ /* Inherit the locks from the supremum of the left page to the
+ original successor of infimum on the right page, to which the left
+ page was merged */
+
+ lock_rec_inherit_to_gap(right_block, left_block,
+ page_rec_get_heap_no(orig_succ),
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Reset the locks on the supremum of the left page, releasing
+ waiting transactions */
+
+ lock_rec_reset_and_release_wait_low(
+ &lock_sys.rec_hash, left_block, PAGE_HEAP_NO_SUPREMUM);
+
+ /* No predicate page lock should exist on the left page;
+ otherwise, the merge would have been blocked. */
+ ut_ad(!lock_sys.get_first_prdt_page(left_block->page.id()));
+
+ lock_rec_free_all_from_discard_page(left_block);
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Updates the lock table when the root page is copied to another in
+btr_root_raise_and_insert. Note that we leave lock structs on the
+root page, even though they do not make sense on pages other than
+leaf pages: the reason is that in a pessimistic update the infimum record
+of the root page will act as a dummy carrier of the locks of the record
+to be updated. */
+void
+lock_update_root_raise(
+/*===================*/
+ const buf_block_t* block, /*!< in: index page to which copied */
+ const buf_block_t* root) /*!< in: root page */
+{
+ lock_mutex_enter();
+
+ /* Move the locks on the supremum of the root to the supremum
+ of block */
+
+ lock_rec_move(block, root,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is copied to another and the original page
+is removed from the chain of leaf pages, except if the page is the root! */
+void
+lock_update_copy_and_discard(
+/*=========================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ which copied */
+ const buf_block_t* block) /*!< in: index page;
+ NOT the root! */
+{
+ lock_mutex_enter();
+
+ /* Move the locks on the supremum of the old page to the supremum
+ of new_page */
+
+ lock_rec_move(new_block, block,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+ lock_rec_free_all_from_discard_page(block);
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is split to the left. */
+void
+lock_update_split_left(
+/*===================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block) /*!< in: left page */
+{
+ ulint heap_no = lock_get_min_heap_no(right_block);
+
+ lock_mutex_enter();
+
+ /* Inherit the locks to the supremum of the left page from the
+ successor of the infimum on the right page */
+
+ lock_rec_inherit_to_gap(left_block, right_block,
+ PAGE_HEAP_NO_SUPREMUM, heap_no);
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is merged to the left. */
+void
+lock_update_merge_left(
+/*===================*/
+ const buf_block_t* left_block, /*!< in: left page to
+ which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor
+ of supremum on the left page
+ before merge */
+ const buf_block_t* right_block) /*!< in: merged index page
+ which will be discarded */
+{
+ const rec_t* left_next_rec;
+
+ ut_ad(left_block->frame == page_align(orig_pred));
+
+ lock_mutex_enter();
+
+ left_next_rec = page_rec_get_next_const(orig_pred);
+
+ if (!page_rec_is_supremum(left_next_rec)) {
+
+ /* Inherit the locks on the supremum of the left page to the
+ first record which was moved from the right page */
+
+ lock_rec_inherit_to_gap(left_block, left_block,
+ page_rec_get_heap_no(left_next_rec),
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Reset the locks on the supremum of the left page,
+ releasing waiting transactions */
+
+ lock_rec_reset_and_release_wait_low(
+ &lock_sys.rec_hash, left_block, PAGE_HEAP_NO_SUPREMUM);
+ }
+
+ /* Move the locks from the supremum of right page to the supremum
+ of the left page */
+
+ lock_rec_move(left_block, right_block,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+
+ /* No predicate page lock should exist on the right page;
+ otherwise, the merge would have been blocked. */
+ ut_ad(!lock_sys.get_first_prdt_page(right_block->page.id()));
+
+ lock_rec_free_all_from_discard_page(right_block);
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. */
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+ const buf_block_t* heir_block, /*!< in: block containing the
+ record which inherits */
+ const buf_block_t* block, /*!< in: block containing the
+ record from which inherited;
+ does NOT reset the locks on
+ this record */
+ ulint heir_heap_no, /*!< in: heap_no of the
+ inheriting record */
+ ulint heap_no) /*!< in: heap_no of the
+ donating record */
+{
+ lock_mutex_enter();
+
+ lock_rec_reset_and_release_wait(heir_block, heir_heap_no);
+
+ lock_rec_inherit_to_gap(heir_block, block, heir_heap_no, heap_no);
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is discarded. */
+void
+lock_update_discard(
+/*================*/
+ const buf_block_t* heir_block, /*!< in: index page
+ which will inherit the locks */
+ ulint heir_heap_no, /*!< in: heap_no of the record
+ which will inherit the locks */
+ const buf_block_t* block) /*!< in: index page
+ which will be discarded */
+{
+ const page_t* page = block->frame;
+ const rec_t* rec;
+ ulint heap_no;
+ const page_id_t page_id(block->page.id());
+
+ lock_mutex_enter();
+
+ if (lock_sys.get_first(page_id)) {
+ ut_ad(!lock_sys.get_first_prdt(page_id));
+ ut_ad(!lock_sys.get_first_prdt_page(page_id));
+ /* Inherit all the locks on the page to the record and
+ reset all the locks on the page */
+
+ if (page_is_comp(page)) {
+ rec = page + PAGE_NEW_INFIMUM;
+
+ do {
+ heap_no = rec_get_heap_no_new(rec);
+
+ lock_rec_inherit_to_gap(heir_block, block,
+ heir_heap_no, heap_no);
+
+ lock_rec_reset_and_release_wait(
+ block, heap_no);
+
+ rec = page + rec_get_next_offs(rec, TRUE);
+ } while (heap_no != PAGE_HEAP_NO_SUPREMUM);
+ } else {
+ rec = page + PAGE_OLD_INFIMUM;
+
+ do {
+ heap_no = rec_get_heap_no_old(rec);
+
+ lock_rec_inherit_to_gap(heir_block, block,
+ heir_heap_no, heap_no);
+
+ lock_rec_reset_and_release_wait(
+ block, heap_no);
+
+ rec = page + rec_get_next_offs(rec, FALSE);
+ } while (heap_no != PAGE_HEAP_NO_SUPREMUM);
+ }
+
+ lock_rec_free_all_from_discard_page_low(page_id,
+ &lock_sys.rec_hash);
+ } else {
+ lock_rec_free_all_from_discard_page_low(page_id,
+ &lock_sys.prdt_hash);
+ lock_rec_free_all_from_discard_page_low(
+ page_id, &lock_sys.prdt_page_hash);
+ }
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Updates the lock table when a new user record is inserted. */
+void
+lock_update_insert(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec) /*!< in: the inserted record */
+{
+ ulint receiver_heap_no;
+ ulint donator_heap_no;
+
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(!page_rec_is_metadata(rec));
+
+ /* Inherit the gap-locking locks for rec, in gap mode, from the next
+ record */
+
+ if (page_rec_is_comp(rec)) {
+ receiver_heap_no = rec_get_heap_no_new(rec);
+ donator_heap_no = rec_get_heap_no_new(
+ page_rec_get_next_low(rec, TRUE));
+ } else {
+ receiver_heap_no = rec_get_heap_no_old(rec);
+ donator_heap_no = rec_get_heap_no_old(
+ page_rec_get_next_low(rec, FALSE));
+ }
+
+ lock_rec_inherit_to_gap_if_gap_lock(
+ block, receiver_heap_no, donator_heap_no);
+}
+
+/*************************************************************//**
+Updates the lock table when a record is removed. */
+void
+lock_update_delete(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec) /*!< in: the record to be removed */
+{
+ const page_t* page = block->frame;
+ ulint heap_no;
+ ulint next_heap_no;
+
+ ut_ad(page == page_align(rec));
+ ut_ad(!page_rec_is_metadata(rec));
+
+ if (page_is_comp(page)) {
+ heap_no = rec_get_heap_no_new(rec);
+ next_heap_no = rec_get_heap_no_new(page
+ + rec_get_next_offs(rec,
+ TRUE));
+ } else {
+ heap_no = rec_get_heap_no_old(rec);
+ next_heap_no = rec_get_heap_no_old(page
+ + rec_get_next_offs(rec,
+ FALSE));
+ }
+
+ lock_mutex_enter();
+
+ /* Let the next record inherit the locks from rec, in gap mode */
+
+ lock_rec_inherit_to_gap(block, block, next_heap_no, heap_no);
+
+ /* Reset the lock bits on rec and release waiting transactions */
+
+ lock_rec_reset_and_release_wait(block, heap_no);
+
+ lock_mutex_exit();
+}
+
+/*********************************************************************//**
+Stores on the page infimum record the explicit locks of another record.
+This function is used to store the lock state of a record when it is
+updated and the size of the record changes in the update. The record
+is moved in such an update, perhaps to another page. The infimum record
+acts as a dummy carrier record, taking care of lock releases while the
+actual record is being moved. */
+void
+lock_rec_store_on_page_infimum(
+/*===========================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec) /*!< in: record whose lock state
+ is stored on the infimum
+ record of the same page; lock
+ bits are reset on the
+ record */
+{
+ ulint heap_no = page_rec_get_heap_no(rec);
+
+ ut_ad(block->frame == page_align(rec));
+
+ lock_mutex_enter();
+
+ lock_rec_move(block, block, PAGE_HEAP_NO_INFIMUM, heap_no);
+
+ lock_mutex_exit();
+}
+
+/*********************************************************************//**
+Restores the state of explicit lock requests on a single record, where the
+state was stored on the infimum of the page. */
+void
+lock_rec_restore_from_page_infimum(
+/*===============================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record whose lock state
+ is restored */
+ const buf_block_t* donator)/*!< in: page (rec is not
+ necessarily on this page)
+ whose infimum stored the lock
+ state; lock bits are reset on
+ the infimum */
+{
+ ulint heap_no = page_rec_get_heap_no(rec);
+
+ lock_mutex_enter();
+
+ lock_rec_move(block, donator, heap_no, PAGE_HEAP_NO_INFIMUM);
+
+ lock_mutex_exit();
+}
+
+/*========================= TABLE LOCKS ==============================*/
+
+/** Functor for accessing the embedded node within a table lock. */
+struct TableLockGetNode {
+ ut_list_node<lock_t>& operator() (lock_t& elem)
+ {
+ return(elem.un_member.tab_lock.locks);
+ }
+};
+
+/*********************************************************************//**
+Creates a table lock object and adds it as the last in the lock queue
+of the table. Does NOT check for deadlocks or lock compatibility.
+@return own: new lock object */
+UNIV_INLINE
+lock_t*
+lock_table_create(
+/*==============*/
+ dict_table_t* table, /*!< in/out: database table
+ in dictionary cache */
+ unsigned type_mode,/*!< in: lock mode possibly ORed with
+ LOCK_WAIT */
+ trx_t* trx /*!< in: trx */
+#ifdef WITH_WSREP
+ , lock_t* c_lock = NULL /*!< in: conflicting lock */
+#endif
+ )
+{
+ lock_t* lock;
+
+ ut_ad(table && trx);
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(trx));
+ ut_ad(trx->is_recovered || trx->state == TRX_STATE_ACTIVE);
+ ut_ad(!trx->auto_commit || trx->will_lock);
+
+ if ((type_mode & LOCK_MODE_MASK) == LOCK_AUTO_INC) {
+ ++table->n_waiting_or_granted_auto_inc_locks;
+ }
+
+ /* For AUTOINC locking we reuse the lock instance only if
+ there is no wait involved; otherwise we allocate the waiting
+ lock from the transaction lock heap. */
+ if (type_mode == LOCK_AUTO_INC) {
+
+ lock = table->autoinc_lock;
+
+ table->autoinc_trx = trx;
+
+ ib_vector_push(trx->autoinc_locks, &lock);
+
+ } else if (trx->lock.table_cached
+ < UT_ARR_SIZE(trx->lock.table_pool)) {
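+ /* Reuse a slot from the small per-transaction cache of
+ table lock structs, avoiding a heap allocation for the
+ common case of few table locks. */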
+ lock = &trx->lock.table_pool[trx->lock.table_cached++];
+ } else {
+
+ lock = static_cast<lock_t*>(
+ mem_heap_alloc(trx->lock.lock_heap, sizeof(*lock)));
+
+ }
+
+ lock->type_mode = ib_uint32_t(type_mode | LOCK_TABLE);
+ lock->trx = trx;
+
+ lock->un_member.tab_lock.table = table;
+
+ ut_ad(table->get_ref_count() > 0 || !table->can_be_evicted);
+
+ UT_LIST_ADD_LAST(trx->lock.trx_locks, lock);
+
+#ifdef WITH_WSREP
+ if (c_lock && trx->is_wsrep()) {
+ if (wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
+ ut_list_insert(table->locks, c_lock, lock,
+ TableLockGetNode());
+ if (UNIV_UNLIKELY(wsrep_debug)) {
+ wsrep_report_bf_lock_wait(trx->mysql_thd, trx->id);
+ wsrep_report_bf_lock_wait(c_lock->trx->mysql_thd, c_lock->trx->id);
+ }
+ } else {
+ ut_list_append(table->locks, lock, TableLockGetNode());
+ }
+
+ trx_mutex_enter(c_lock->trx);
+
+ if (c_lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+ c_lock->trx->lock.was_chosen_as_deadlock_victim = TRUE;
+
+ if (UNIV_UNLIKELY(wsrep_debug)) {
+ wsrep_report_bf_lock_wait(trx->mysql_thd, trx->id);
+ wsrep_report_bf_lock_wait(c_lock->trx->mysql_thd, c_lock->trx->id);
+ wsrep_print_wait_locks(c_lock);
+ }
+
+ /* The lock release will call lock_grant(),
+ which would acquire trx->mutex again. */
+ trx_mutex_exit(trx);
+ lock_cancel_waiting_and_release(
+ c_lock->trx->lock.wait_lock);
+ trx_mutex_enter(trx);
+ }
+
+ trx_mutex_exit(c_lock->trx);
+ } else
+#endif /* WITH_WSREP */
+ ut_list_append(table->locks, lock, TableLockGetNode());
+
+ if (type_mode & LOCK_WAIT) {
+
+ lock_set_lock_and_trx_wait(lock, trx);
+ }
+
+ lock->trx->lock.table_locks.push_back(lock);
+
+ MONITOR_INC(MONITOR_TABLELOCK_CREATED);
+ MONITOR_INC(MONITOR_NUM_TABLELOCK);
+
+ return(lock);
+}
+
+/*************************************************************//**
+Pops autoinc lock requests from the transaction's autoinc_locks. We
+handle the case where there are gaps in the array and they need to
+be popped off the stack. */
+UNIV_INLINE
+void
+lock_table_pop_autoinc_locks(
+/*=========================*/
+ trx_t* trx) /*!< in/out: transaction that owns the AUTOINC locks */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(!ib_vector_is_empty(trx->autoinc_locks));
+
+ /* Skip any gaps; gaps are NULL lock entries in the
+ trx->autoinc_locks vector. */
+
+ do {
+ ib_vector_pop(trx->autoinc_locks);
+
+ if (ib_vector_is_empty(trx->autoinc_locks)) {
+ return;
+ }
+
+ } while (*(lock_t**) ib_vector_get_last(trx->autoinc_locks) == NULL);
+}
+
+/*************************************************************//**
+Removes an autoinc lock request from the transaction's autoinc_locks. */
+UNIV_INLINE
+void
+lock_table_remove_autoinc_lock(
+/*===========================*/
+ lock_t* lock, /*!< in: table lock */
+ trx_t* trx) /*!< in/out: transaction that owns the lock */
+{
+ lock_t* autoinc_lock;
+ lint i = ib_vector_size(trx->autoinc_locks) - 1;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_mode(lock) == LOCK_AUTO_INC);
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+ ut_ad(!ib_vector_is_empty(trx->autoinc_locks));
+
+ /* With stored functions and procedures the user may drop
+ a table within the same "statement". This special case has
+ to be handled by deleting only those AUTOINC locks that were
+ held by the table being dropped. */
+
+ autoinc_lock = *static_cast<lock_t**>(
+ ib_vector_get(trx->autoinc_locks, i));
+
+ /* This is the default fast case. */
+
+ if (autoinc_lock == lock) {
+ lock_table_pop_autoinc_locks(trx);
+ } else {
+ /* The last element should never be NULL */
+ ut_a(autoinc_lock != NULL);
+
+ /* Handle freeing the locks from within the stack. */
+
+ while (--i >= 0) {
+ autoinc_lock = *static_cast<lock_t**>(
+ ib_vector_get(trx->autoinc_locks, i));
+
+ if (autoinc_lock == lock) {
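+ /* Leave a NULL gap in the vector;
+ lock_table_pop_autoinc_locks() will skip over it later. */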
+ void* null_var = NULL;
+ ib_vector_set(trx->autoinc_locks, i, &null_var);
+ return;
+ }
+ }
+
+ /* Must find the autoinc lock. */
+ ut_error;
+ }
+}
+
+/*************************************************************//**
+Removes a table lock request from the queue and the trx list of locks;
+this is a low-level function which does NOT check if waiting requests
+can now be granted. */
+UNIV_INLINE
+void
+lock_table_remove_low(
+/*==================*/
+ lock_t* lock) /*!< in/out: table lock */
+{
+ trx_t* trx;
+ dict_table_t* table;
+
+ ut_ad(lock_mutex_own());
+
+ trx = lock->trx;
+ table = lock->un_member.tab_lock.table;
+
+ /* Remove the table from the transaction's AUTOINC vector, if
+ the lock that is being released is an AUTOINC lock. */
+ if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+
+ /* The table's AUTOINC lock can get transferred to
+ another transaction before we get here. */
+ if (table->autoinc_trx == trx) {
+ table->autoinc_trx = NULL;
+ }
+
+ /* The locks must be freed in the reverse order from
+ the one in which they were acquired. This is to avoid
+ traversing the AUTOINC lock vector unnecessarily.
+
+ We only store locks that were granted in the
+ trx->autoinc_locks vector (see lock_table_create()
+ and lock_grant()). Therefore it can be empty and we
+ need to check for that. */
+
+ if (!lock_get_wait(lock)
+ && !ib_vector_is_empty(trx->autoinc_locks)) {
+
+ lock_table_remove_autoinc_lock(lock, trx);
+ }
+
+ ut_a(table->n_waiting_or_granted_auto_inc_locks > 0);
+ table->n_waiting_or_granted_auto_inc_locks--;
+ }
+
+ UT_LIST_REMOVE(trx->lock.trx_locks, lock);
+ ut_list_remove(table->locks, lock, TableLockGetNode());
+
+ MONITOR_INC(MONITOR_TABLELOCK_REMOVED);
+ MONITOR_DEC(MONITOR_NUM_TABLELOCK);
+}
+
+/*********************************************************************//**
+Enqueues a waiting request for a table lock which cannot be granted
+immediately. Checks for deadlocks.
+@retval DB_LOCK_WAIT if the waiting lock was enqueued
+@retval DB_DEADLOCK if this transaction was chosen as the victim
+@retval DB_SUCCESS if the other transaction committed or aborted */
+static
+dberr_t
+lock_table_enqueue_waiting(
+/*=======================*/
+ unsigned mode, /*!< in: lock mode this transaction is
+ requesting */
+ dict_table_t* table, /*!< in/out: table */
+ que_thr_t* thr /*!< in: query thread */
+#ifdef WITH_WSREP
+ , lock_t* c_lock /*!< in: conflicting lock or NULL */
+#endif
+)
+{
+ trx_t* trx;
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(!srv_read_only_mode);
+
+ trx = thr_get_trx(thr);
+ ut_ad(trx_mutex_own(trx));
+ ut_a(!que_thr_stop(thr));
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ break;
+ case TRX_DICT_OP_TABLE:
+ case TRX_DICT_OP_INDEX:
+ ib::error() << "A table lock wait happens in a dictionary"
+ " operation. Table " << table->name
+ << ". " << BUG_REPORT_MSG;
+ ut_ad(0);
+ }
+
+#ifdef WITH_WSREP
+ if (trx->is_wsrep() && trx->lock.was_chosen_as_deadlock_victim) {
+ return(DB_DEADLOCK);
+ }
+#endif /* WITH_WSREP */
+
+ /* Enqueue the lock request that will wait to be granted */
+ lock = lock_table_create(table, mode | LOCK_WAIT, trx
+#ifdef WITH_WSREP
+ , c_lock
+#endif
+ );
+
+ const trx_t* victim_trx =
+ DeadlockChecker::check_and_resolve(lock, trx);
+
+ if (victim_trx != 0) {
+ ut_ad(victim_trx == trx);
+
+ /* The order here is important, we don't want to
+ lose the state of the lock before calling remove. */
+ lock_table_remove_low(lock);
+ lock_reset_lock_and_trx_wait(lock);
+
+ return(DB_DEADLOCK);
+
+ } else if (trx->lock.wait_lock == NULL) {
+ /* Deadlock resolution chose another transaction as a victim,
+ and we accidentally got our lock granted! */
+
+ return(DB_SUCCESS);
+ }
+
+ trx->lock.que_state = TRX_QUE_LOCK_WAIT;
+
+ trx->lock.wait_started = time(NULL);
+ trx->lock.was_chosen_as_deadlock_victim = false;
+
+ ut_a(que_thr_stop(thr));
+
+ MONITOR_INC(MONITOR_TABLELOCK_WAIT);
+
+ return(DB_LOCK_WAIT);
+}
+
+/*********************************************************************//**
+Checks if other transactions have an incompatible mode lock request in
+the lock queue.
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_table_other_has_incompatible(
+/*==============================*/
+ const trx_t* trx, /*!< in: transaction, or NULL if all
+ transactions should be included */
+ ulint wait, /*!< in: LOCK_WAIT if also
+ waiting locks are taken into
+ account, or 0 if not */
+ const dict_table_t* table, /*!< in: table */
+ lock_mode mode) /*!< in: lock mode */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ for (lock = UT_LIST_GET_LAST(table->locks);
+ lock != NULL;
+ lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock)) {
+
+ if (lock->trx != trx
+ && !lock_mode_compatible(lock_get_mode(lock), mode)
+ && (wait || !lock_get_wait(lock))) {
+
+#ifdef WITH_WSREP
+ if (lock->trx->is_wsrep()) {
+ if (UNIV_UNLIKELY(wsrep_debug)) {
+ ib::info() << "WSREP: table lock abort for table:"
+ << table->name;
+ ib::info() << " SQL: "
+ << wsrep_thd_query(lock->trx->mysql_thd);
+ }
+ trx_mutex_enter(lock->trx);
+ wsrep_kill_victim((trx_t *)trx, (lock_t *)lock);
+ trx_mutex_exit(lock->trx);
+ }
+#endif /* WITH_WSREP */
+
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Locks the specified database table in the mode given. If the lock cannot
+be granted immediately, the query thread is put to wait.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_table(
+/*=======*/
+ unsigned flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set,
+ does nothing */
+ dict_table_t* table, /*!< in/out: database table
+ in dictionary cache */
+ lock_mode mode, /*!< in: lock mode */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ dberr_t err;
+ lock_t* wait_for;
+
+ ut_ad(table && thr);
+
+ /* Given the limited visibility of temporary tables, we can
+ avoid the locking overhead. */
+ if ((flags & BTR_NO_LOCKING_FLAG)
+ || srv_read_only_mode
+ || table->is_temporary()) {
+
+ return(DB_SUCCESS);
+ }
+
+ ut_a(flags == 0);
+
+ trx = thr_get_trx(thr);
+
+ /* Look for equal or stronger locks the same trx already
+ has on the table. No need to acquire the lock mutex here
+ because only this transaction can add/access table locks
+ to/from trx_t::table_locks. */
+
+ if (lock_table_has(trx, table, mode)) {
+
+ return(DB_SUCCESS);
+ }
+
+ /* Read-only transactions can write to temporary tables; we
+ don't want to promote them to RW transactions. Their updates
+ cannot be visible to other transactions. Therefore we can
+ keep them out of the read views. */
+
+ if ((mode == LOCK_IX || mode == LOCK_X)
+ && !trx->read_only
+ && trx->rsegs.m_redo.rseg == 0) {
+
+ trx_set_rw_mode(trx);
+ }
+
+ lock_mutex_enter();
+
+ DBUG_EXECUTE_IF("fatal-semaphore-timeout",
+ { os_thread_sleep(3600000000LL); });
+
+ /* We have to check if the new lock is compatible with any locks
+ other transactions have in the table lock queue. */
+
+ wait_for = lock_table_other_has_incompatible(
+ trx, LOCK_WAIT, table, mode);
+
+ trx_mutex_enter(trx);
+
+ /* Another trx has a request on the table in an incompatible
+ mode: this trx may have to wait */
+
+ if (wait_for != NULL) {
+ err = lock_table_enqueue_waiting(flags | mode, table,
+ thr
+#ifdef WITH_WSREP
+ , wait_for
+#endif
+ );
+ } else {
+ lock_table_create(table, flags | mode, trx);
+
+ ut_a(!flags || mode == LOCK_S || mode == LOCK_X);
+
+ err = DB_SUCCESS;
+ }
+
+ lock_mutex_exit();
+
+ trx_mutex_exit(trx);
+
+ return(err);
+}
+
+/*********************************************************************//**
+Creates a table IX lock object for a resurrected transaction. */
+void
+lock_table_ix_resurrect(
+/*====================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(trx->is_recovered);
+
+ if (lock_table_has(trx, table, LOCK_IX)) {
+ return;
+ }
+
+ lock_mutex_enter();
+
+ /* We have to check if the new lock is compatible with any locks
+ other transactions have in the table lock queue. */
+
+ ut_ad(!lock_table_other_has_incompatible(
+ trx, LOCK_WAIT, table, LOCK_IX));
+
+ trx_mutex_enter(trx);
+ lock_table_create(table, LOCK_IX, trx);
+ lock_mutex_exit();
+ trx_mutex_exit(trx);
+}
+
+/*********************************************************************//**
+Checks if a waiting table lock request still has to wait in a queue.
+@return TRUE if still has to wait */
+static
+bool
+lock_table_has_to_wait_in_queue(
+/*============================*/
+ const lock_t* wait_lock) /*!< in: waiting table lock */
+{
+ const dict_table_t* table;
+ const lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_wait(wait_lock));
+
+ table = wait_lock->un_member.tab_lock.table;
+
+ for (lock = UT_LIST_GET_FIRST(table->locks);
+ lock != wait_lock;
+ lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+
+ if (lock_has_to_wait(wait_lock, lock)) {
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/*************************************************************//**
+Removes a table lock request, waiting or granted, from the queue and grants
+locks to other transactions in the queue, if they now are entitled to a
+lock. */
+static
+void
+lock_table_dequeue(
+/*===============*/
+ lock_t* in_lock)/*!< in/out: table lock object; transactions waiting
+ behind will get their lock requests granted, if
+ they are now entitled to them */
+{
+ ut_ad(lock_mutex_own());
+ ut_a(lock_get_type_low(in_lock) == LOCK_TABLE);
+
+ lock_t* lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, in_lock);
+
+ lock_table_remove_low(in_lock);
+
+ /* Check if waiting locks in the queue can now be granted: grant
+ locks if there are no conflicting locks ahead. */
+
+ for (/* No op */;
+ lock != NULL;
+ lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+
+ if (lock_get_wait(lock)
+ && !lock_table_has_to_wait_in_queue(lock)) {
+
+ /* Grant the lock */
+ ut_ad(in_lock->trx != lock->trx);
+ lock_grant(lock);
+ }
+ }
+}
+
+/** Sets a lock on a table based on the given mode.
+@param[in] table table to lock
+@param[in,out] trx transaction
+@param[in] mode LOCK_X or LOCK_S
+@return error code or DB_SUCCESS. */
+dberr_t
+lock_table_for_trx(
+ dict_table_t* table,
+ trx_t* trx,
+ enum lock_mode mode)
+{
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ dberr_t err;
+ sel_node_t* node;
+ heap = mem_heap_create(512);
+
+ node = sel_node_create(heap);
+ thr = pars_complete_graph_for_exec(node, trx, heap, NULL);
+ thr->graph->state = QUE_FORK_ACTIVE;
+
+ /* We use the select query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = static_cast<que_thr_t*>(
+ que_fork_get_first_thr(
+ static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
+ thr->start_running();
+
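+ /* If the lock request cannot be granted immediately,
+ row_mysql_handle_errors() below suspends the thread until
+ the wait ends, and then we retry from run_again. */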
+run_again:
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ err = lock_table(0, table, mode, thr);
+
+ trx->error_state = err;
+
+ if (UNIV_LIKELY(err == DB_SUCCESS)) {
+ thr->stop_no_error();
+ } else {
+ que_thr_stop_for_mysql(thr);
+
+ if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
+ goto run_again;
+ }
+ }
+
+ que_graph_free(thr->graph);
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*=========================== LOCK RELEASE ==============================*/
+static
+void
+lock_grant_and_move_on_rec(
+ lock_t* first_lock,
+ ulint heap_no)
+{
+ lock_t* lock;
+ const page_id_t page_id(first_lock->un_member.rec_lock.page_id);
+ const ulint rec_fold= page_id.fold();
+ lock_t* previous = static_cast<lock_t*>(
+ lock_sys.rec_hash.array[lock_sys.hash(page_id)]
+ .node);
+ if (previous == NULL) {
+ return;
+ }
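+ /* Locate the node that precedes first_lock in the hash
+ chain, so that granted locks found below can be unlinked
+ and reinserted at the head of the chain. */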
+ if (previous == first_lock) {
+ lock = previous;
+ } else {
+ while (previous->hash &&
+ previous->hash != first_lock) {
+ previous = previous->hash;
+ }
+ lock = previous->hash;
+ }
+ /* Grant locks if there are no conflicting locks ahead.
+ Move granted locks to the head of the list. */
+ while (lock) {
+ ut_ad(!lock->trx->is_wsrep());
+ /* Check whether this is a waiting lock on this page that
+ no longer needs to wait. */
+ if (lock->un_member.rec_lock.page_id == page_id
+ && lock_rec_get_nth_bit(lock, heap_no)
+ && lock_get_wait(lock)
+ && !lock_rec_has_to_wait_in_queue(lock)) {
+
+ lock_grant(lock);
+
+ if (previous != NULL) {
+ /* Move the lock to the head of the list. */
+ HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock);
+ lock_rec_insert_to_head(lock, rec_fold);
+ } else {
+ /* Already at the head of the list. */
+ previous = lock;
+ }
+ /* Move on to the next lock. */
+ lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, previous));
+ } else {
+ previous = lock;
+ lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, lock));
+ }
+ }
+}
+
+/*************************************************************//**
+Removes a granted record lock of a transaction from the queue and grants
+locks to other transactions waiting in the queue if they now are entitled
+to a lock. */
+void
+lock_rec_unlock(
+/*============*/
+ trx_t* trx, /*!< in/out: transaction that has
+ set a record lock */
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record */
+ lock_mode lock_mode)/*!< in: LOCK_S or LOCK_X */
+{
+ lock_t* first_lock;
+ lock_t* lock;
+ ulint heap_no;
+
+ ut_ad(trx);
+ ut_ad(rec);
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(!trx->lock.wait_lock);
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(!page_rec_is_metadata(rec));
+
+ heap_no = page_rec_get_heap_no(rec);
+
+ lock_mutex_enter();
+ trx_mutex_enter(trx);
+
+ first_lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
+
+ /* Find a lock held by the transaction with the same
+ lock_mode on the record. */
+
+ for (lock = first_lock; lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+ if (lock->trx == trx && lock_get_mode(lock) == lock_mode) {
+ goto released;
+ }
+ }
+
+ lock_mutex_exit();
+ trx_mutex_exit(trx);
+
+ {
+ ib::error err;
+ err << "Unlock row could not find a " << lock_mode
+ << " mode lock on the record. Current statement: ";
+ size_t stmt_len;
+ if (const char* stmt = innobase_get_stmt_unsafe(
+ trx->mysql_thd, &stmt_len)) {
+ err.write(stmt, stmt_len);
+ }
+ }
+
+ return;
+
+released:
+ ut_a(!lock_get_wait(lock));
+ lock_rec_reset_nth_bit(lock, heap_no);
+
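+ /* With the FCFS scheduling algorithm (and for replication
+ slave threads) waiting locks are granted strictly in queue
+ order; otherwise lock_grant_and_move_on_rec() additionally
+ moves newly granted locks to the head of the hash chain. */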
+ if (innodb_lock_schedule_algorithm
+ == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS ||
+ thd_is_replication_slave_thread(lock->trx->mysql_thd)) {
+
+ /* Check if we can now grant waiting lock requests */
+
+ for (lock = first_lock; lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+ if (!lock_get_wait(lock)) {
+ continue;
+ }
+ const lock_t* c = lock_rec_has_to_wait_in_queue(lock);
+ if (!c) {
+ /* Grant the lock */
+ ut_ad(trx != lock->trx);
+ lock_grant(lock);
+ }
+ }
+ } else {
+ lock_grant_and_move_on_rec(first_lock, heap_no);
+ }
+
+ lock_mutex_exit();
+ trx_mutex_exit(trx);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Check if a transaction that has X or IX locks has set the dict_op
+code correctly. */
+static
+void
+lock_check_dict_lock(
+/*==================*/
+ const lock_t* lock) /*!< in: lock to check */
+{
+ if (lock_get_type_low(lock) == LOCK_REC) {
+ ut_ad(!lock->index->table->is_temporary());
+
+ /* Check if the transaction locked a record
+ in a system table in X mode. It should have set
+ the dict_op code correctly if it did. */
+ if (lock->index->table->id < DICT_HDR_FIRST_ID
+ && lock_get_mode(lock) == LOCK_X) {
+
+ ut_ad(lock_get_mode(lock) != LOCK_IX);
+ ut_ad(lock->trx->dict_operation != TRX_DICT_OP_NONE);
+ }
+ } else {
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+ const dict_table_t* table = lock->un_member.tab_lock.table;
+ ut_ad(!table->is_temporary());
+
+ /* Check if the transaction locked a system table
+ in IX mode. It should have set the dict_op code
+ correctly if it did. */
+ if (table->id < DICT_HDR_FIRST_ID
+ && (lock_get_mode(lock) == LOCK_X
+ || lock_get_mode(lock) == LOCK_IX)) {
+
+ ut_ad(lock->trx->dict_operation != TRX_DICT_OP_NONE);
+ }
+ }
+}
+#endif /* UNIV_DEBUG */
+
+/** Release the explicit locks of a committing transaction,
+and release possible other transactions waiting because of these locks. */
+void lock_release(trx_t* trx)
+{
+#ifdef UNIV_DEBUG
+ std::set<table_id_t> to_evict;
+ if (innodb_evict_tables_on_commit_debug && !trx->is_recovered)
+# if 1 /* if dict_stats_exec_sql() were not playing dirty tricks */
+ if (!mutex_own(&dict_sys.mutex))
+# else /* this would be more proper way to do it */
+ if (!trx->dict_operation_lock_mode && !trx->dict_operation)
+# endif
+ for (const auto& p : trx->mod_tables)
+ if (!p.first->is_temporary())
+ to_evict.emplace(p.first->id);
+#endif
+ ulint count = 0;
+ trx_id_t max_trx_id = trx_sys.get_max_trx_id();
+
+ lock_mutex_enter();
+ ut_ad(!trx_mutex_own(trx));
+
+ for (lock_t* lock = UT_LIST_GET_LAST(trx->lock.trx_locks);
+ lock != NULL;
+ lock = UT_LIST_GET_LAST(trx->lock.trx_locks)) {
+
+ ut_d(lock_check_dict_lock(lock));
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+
+ lock_rec_dequeue_from_page(lock);
+ } else {
+ dict_table_t* table;
+
+ table = lock->un_member.tab_lock.table;
+
+ if (lock_get_mode(lock) != LOCK_IS
+ && trx->undo_no != 0) {
+
+ /* The trx may have modified the table. We
+ block the use of the MySQL query cache for
+ all currently active transactions. */
+
+ table->query_cache_inv_trx_id = max_trx_id;
+ }
+
+ lock_table_dequeue(lock);
+ }
+
+ if (count == LOCK_RELEASE_INTERVAL) {
+ /* Release the mutex for a while, so that we
+ do not monopolize it */
+
+ lock_mutex_exit();
+
+ lock_mutex_enter();
+
+ count = 0;
+ }
+
+ ++count;
+ }
+
+ lock_mutex_exit();
+
+#ifdef UNIV_DEBUG
+ if (to_evict.empty()) {
+ return;
+ }
+ mutex_enter(&dict_sys.mutex);
+ lock_mutex_enter();
+ for (table_id_t id : to_evict) {
+ if (dict_table_t *table = dict_table_open_on_id(
+ id, TRUE, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)) {
+ if (!table->get_ref_count()
+ && !UT_LIST_GET_LEN(table->locks)) {
+ dict_sys.remove(table, true);
+ }
+ }
+ }
+ lock_mutex_exit();
+ mutex_exit(&dict_sys.mutex);
+#endif
+}
+
+/* True if a lock mode is S or X */
+#define IS_LOCK_S_OR_X(lock) \
+ (lock_get_mode(lock) == LOCK_S \
+ || lock_get_mode(lock) == LOCK_X)
+
+/*********************************************************************//**
+Removes table locks of the transaction on a table to be dropped. */
+static
+void
+lock_trx_table_locks_remove(
+/*========================*/
+ const lock_t* lock_to_remove) /*!< in: lock to remove */
+{
+ trx_t* trx = lock_to_remove->trx;
+
+ ut_ad(lock_mutex_own());
+
+ /* It is safe to read this because we are holding the lock mutex */
+ if (!trx->lock.cancel) {
+ trx_mutex_enter(trx);
+ } else {
+ ut_ad(trx_mutex_own(trx));
+ }
+
+ for (lock_list::iterator it = trx->lock.table_locks.begin(),
+ end = trx->lock.table_locks.end(); it != end; ++it) {
+ const lock_t* lock = *it;
+
+ ut_ad(!lock || trx == lock->trx);
+ ut_ad(!lock || lock_get_type_low(lock) & LOCK_TABLE);
+ ut_ad(!lock || lock->un_member.tab_lock.table);
+
+ if (lock == lock_to_remove) {
+ *it = NULL;
+
+ if (!trx->lock.cancel) {
+ trx_mutex_exit(trx);
+ }
+
+ return;
+ }
+ }
+
+ if (!trx->lock.cancel) {
+ trx_mutex_exit(trx);
+ }
+
+ /* Lock must exist in the vector. */
+ ut_error;
+}
+
+/*===================== VALIDATION AND DEBUGGING ====================*/
+
+/** Print info of a table lock.
+@param[in,out] file output stream
+@param[in] lock table lock */
+static
+void
+lock_table_print(FILE* file, const lock_t* lock)
+{
+ ut_ad(lock_mutex_own());
+ ut_a(lock_get_type_low(lock) == LOCK_TABLE);
+
+ fputs("TABLE LOCK table ", file);
+ ut_print_name(file, lock->trx,
+ lock->un_member.tab_lock.table->name.m_name);
+ fprintf(file, " trx id " TRX_ID_FMT, trx_get_id_for_print(lock->trx));
+
+ if (lock_get_mode(lock) == LOCK_S) {
+ fputs(" lock mode S", file);
+ } else if (lock_get_mode(lock) == LOCK_X) {
+ ut_ad(lock->trx->id != 0);
+ fputs(" lock mode X", file);
+ } else if (lock_get_mode(lock) == LOCK_IS) {
+ fputs(" lock mode IS", file);
+ } else if (lock_get_mode(lock) == LOCK_IX) {
+ ut_ad(lock->trx->id != 0);
+ fputs(" lock mode IX", file);
+ } else if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+ fputs(" lock mode AUTO-INC", file);
+ } else {
+ fprintf(file, " unknown lock mode %lu",
+ (ulong) lock_get_mode(lock));
+ }
+
+ if (lock_get_wait(lock)) {
+ fputs(" waiting", file);
+ }
+
+ putc('\n', file);
+}
+
+/** Pretty-print a record lock.
+@param[in,out] file output stream
+@param[in] lock record lock
+@param[in,out] mtr mini-transaction for accessing the record */
+static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr)
+{
+ ut_ad(lock_mutex_own());
+ ut_a(lock_get_type_low(lock) == LOCK_REC);
+
+ const page_id_t page_id(lock->un_member.rec_lock.page_id);
+
+ fprintf(file, "RECORD LOCKS space id %u page no %u n bits " ULINTPF
+ " index %s of table ",
+ page_id.space(), page_id.page_no(),
+ lock_rec_get_n_bits(lock),
+ lock->index->name());
+ ut_print_name(file, lock->trx, lock->index->table->name.m_name);
+ fprintf(file, " trx id " TRX_ID_FMT, trx_get_id_for_print(lock->trx));
+
+ if (lock_get_mode(lock) == LOCK_S) {
+ fputs(" lock mode S", file);
+ } else if (lock_get_mode(lock) == LOCK_X) {
+ fputs(" lock mode X", file);
+ } else {
+ ut_error;
+ }
+
+ if (lock_rec_get_gap(lock)) {
+ fputs(" locks gap before rec", file);
+ }
+
+ if (lock_rec_get_rec_not_gap(lock)) {
+ fputs(" locks rec but not gap", file);
+ }
+
+ if (lock_rec_get_insert_intention(lock)) {
+ fputs(" insert intention", file);
+ }
+
+ if (lock_get_wait(lock)) {
+ fputs(" waiting", file);
+ }
+
+ putc('\n', file);
+
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ mtr.start();
+ const buf_block_t* block = buf_page_try_get(page_id, &mtr);
+
+ for (ulint i = 0; i < lock_rec_get_n_bits(lock); ++i) {
+
+ if (!lock_rec_get_nth_bit(lock, i)) {
+ continue;
+ }
+
+ fprintf(file, "Record lock, heap no %lu", (ulong) i);
+
+ if (block) {
+ ut_ad(page_is_leaf(block->frame));
+ const rec_t* rec;
+
+ rec = page_find_rec_with_heap_no(
+ buf_block_get_frame(block), i);
+ ut_ad(!page_rec_is_metadata(rec));
+
+ offsets = rec_get_offsets(
+ rec, lock->index, offsets,
+ lock->index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ putc(' ', file);
+ rec_print_new(file, rec, offsets);
+ }
+
+ putc('\n', file);
+ }
+
+ mtr.commit();
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+#ifdef UNIV_DEBUG
+/* Print the number of lock structs from lock_print_info_summary() only
+in non-production builds for performance reasons, see
+http://bugs.mysql.com/36942 */
+#define PRINT_NUM_OF_LOCK_STRUCTS
+#endif /* UNIV_DEBUG */
+
+#ifdef PRINT_NUM_OF_LOCK_STRUCTS
+/*********************************************************************//**
+Calculates the number of record lock structs in the record lock hash table.
+@return number of record locks */
+static ulint lock_get_n_rec_locks()
+{
+ ulint n_locks = 0;
+ ulint i;
+
+ ut_ad(lock_mutex_own());
+
+ for (i = 0; i < lock_sys.rec_hash.n_cells; i++) {
+ const lock_t* lock;
+
+ for (lock = static_cast<const lock_t*>(
+ HASH_GET_FIRST(&lock_sys.rec_hash, i));
+ lock != 0;
+ lock = static_cast<const lock_t*>(
+ HASH_GET_NEXT(hash, lock))) {
+
+ n_locks++;
+ }
+ }
+
+ return(n_locks);
+}
+#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+
+/*********************************************************************//**
+Prints info of locks for all transactions.
+@return FALSE if the lock mutex could not be obtained,
+in which case no info is printed */
+ibool
+lock_print_info_summary(
+/*====================*/
+ FILE* file, /*!< in: file where to print */
+ ibool nowait) /*!< in: if TRUE, do not wait for the lock mutex */
+{
+ /* If nowait is FALSE, wait for the lock mutex; otherwise
+ return immediately if we fail to obtain the mutex. */
+ if (!nowait) {
+ lock_mutex_enter();
+ } else if (lock_mutex_enter_nowait()) {
+ fputs("FAIL TO OBTAIN LOCK MUTEX,"
+ " SKIP LOCK INFO PRINTING\n", file);
+ return(FALSE);
+ }
+
+ if (lock_deadlock_found) {
+ fputs("------------------------\n"
+ "LATEST DETECTED DEADLOCK\n"
+ "------------------------\n", file);
+
+ if (!srv_read_only_mode) {
+ ut_copy_file(file, lock_latest_err_file);
+ }
+ }
+
+ fputs("------------\n"
+ "TRANSACTIONS\n"
+ "------------\n", file);
+
+ fprintf(file, "Trx id counter " TRX_ID_FMT "\n",
+ trx_sys.get_max_trx_id());
+
+ fprintf(file,
+ "Purge done for trx's n:o < " TRX_ID_FMT
+ " undo n:o < " TRX_ID_FMT " state: %s\n"
+ "History list length %u\n",
+ purge_sys.tail.trx_no,
+ purge_sys.tail.undo_no,
+ purge_sys.enabled()
+ ? (purge_sys.running() ? "running"
+ : purge_sys.paused() ? "stopped" : "running but idle")
+ : "disabled",
+ uint32_t{trx_sys.rseg_history_len});
+
+#ifdef PRINT_NUM_OF_LOCK_STRUCTS
+ fprintf(file,
+ "Total number of lock structs in row lock hash table %lu\n",
+ (ulong) lock_get_n_rec_locks());
+#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+ return(TRUE);
+}
+
+/** Prints transaction lock wait and MVCC state.
+@param[in,out] file file where to print
+@param[in] trx transaction
+@param[in] now current time */
+void
+lock_trx_print_wait_and_mvcc_state(FILE* file, const trx_t* trx, time_t now)
+{
+ fprintf(file, "---");
+
+ trx_print_latched(file, trx, 600);
+ trx->read_view.print_limits(file);
+
+ if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ fprintf(file,
+ "------- TRX HAS BEEN WAITING %lu SEC"
+ " FOR THIS LOCK TO BE GRANTED:\n",
+ (ulong) difftime(now, trx->lock.wait_started));
+
+ if (lock_get_type_low(trx->lock.wait_lock) == LOCK_REC) {
+ mtr_t mtr;
+ lock_rec_print(file, trx->lock.wait_lock, mtr);
+ } else {
+ lock_table_print(file, trx->lock.wait_lock);
+ }
+
+ fprintf(file, "------------------\n");
+ }
+}
+
+/*********************************************************************//**
+Prints info of locks for a transaction. */
+static
+void
+lock_trx_print_locks(
+/*=================*/
+ FILE* file, /*!< in/out: File to write */
+ const trx_t* trx) /*!< in: current transaction */
+{
+ mtr_t mtr;
+ uint32_t i= 0;
+ /* Iterate over the transaction's locks. */
+ for (lock_t *lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+ lock != NULL;
+ lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+ if (lock_get_type_low(lock) == LOCK_REC) {
+
+ lock_rec_print(file, lock, mtr);
+ } else {
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+ lock_table_print(file, lock);
+ }
+
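+ /* Bound the monitor output: print at most 10 locks per
+ transaction. */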
+ if (++i == 10) {
+
+ fprintf(file,
+ "10 LOCKS PRINTED FOR THIS TRX:"
+ " SUPPRESSING FURTHER PRINTS\n");
+
+ break;
+ }
+ }
+}
+
+/** Functor to display all transactions */
+struct lock_print_info
+{
+ lock_print_info(FILE* file, time_t now) :
+ file(file), now(now),
+ purge_trx(purge_sys.query ? purge_sys.query->trx : NULL)
+ {}
+
+ void operator()(const trx_t &trx) const
+ {
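+ /* Do not print the transaction that belongs to the purge
+ subsystem. */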
+ if (UNIV_UNLIKELY(&trx == purge_trx))
+ return;
+ lock_trx_print_wait_and_mvcc_state(file, &trx, now);
+
+ if (trx.will_lock && srv_print_innodb_lock_monitor)
+ lock_trx_print_locks(file, &trx);
+ }
+
+ FILE* const file;
+ const time_t now;
+ const trx_t* const purge_trx;
+};
+
+/*********************************************************************//**
+Prints info of locks for each transaction. This function assumes that the
+caller holds the lock mutex and more importantly it will release the lock
+mutex on behalf of the caller. (This should be fixed in the future). */
+void
+lock_print_info_all_transactions(
+/*=============================*/
+ FILE* file) /*!< in/out: file where to print */
+{
+ ut_ad(lock_mutex_own());
+
+ fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
+
+ trx_sys.trx_list.for_each(lock_print_info(file, time(nullptr)));
+ lock_mutex_exit();
+
+ ut_ad(lock_validate());
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Find the lock in the trx_t::trx_lock_t::table_locks vector.
+@return true if found */
+static
+bool
+lock_trx_table_locks_find(
+/*======================*/
+ trx_t* trx, /*!< in: trx to validate */
+ const lock_t* find_lock) /*!< in: lock to find */
+{
+ bool found = false;
+
+ ut_ad(trx_mutex_own(trx));
+
+ for (lock_list::const_iterator it = trx->lock.table_locks.begin(),
+ end = trx->lock.table_locks.end(); it != end; ++it) {
+
+ const lock_t* lock = *it;
+
+ if (lock == NULL) {
+
+ continue;
+
+ } else if (lock == find_lock) {
+
+ /* Can't be duplicates. */
+ ut_a(!found);
+ found = true;
+ }
+
+ ut_a(trx == lock->trx);
+ ut_a(lock_get_type_low(lock) & LOCK_TABLE);
+ ut_a(lock->un_member.tab_lock.table != NULL);
+ }
+
+ return(found);
+}
+
+/*********************************************************************//**
+Validates the lock queue on a table.
+@return TRUE if ok */
+static
+ibool
+lock_table_queue_validate(
+/*======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ const lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ for (lock = UT_LIST_GET_FIRST(table->locks);
+ lock != NULL;
+ lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+
+ /* lock->trx->state cannot change from or to NOT_STARTED
+ while we are holding the lock_sys.mutex. It may change
+ from ACTIVE or PREPARED to PREPARED or COMMITTED. */
+ trx_mutex_enter(lock->trx);
+ check_trx_state(lock->trx);
+
+ if (lock->trx->state == TRX_STATE_COMMITTED_IN_MEMORY) {
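+ /* The transaction has already committed in memory; its
+ locks are about to be released, so there is nothing to
+ validate here. */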
+ } else if (!lock_get_wait(lock)) {
+ ut_a(!lock_table_other_has_incompatible(
+ lock->trx, 0, table,
+ lock_get_mode(lock)));
+ } else {
+ ut_a(lock_table_has_to_wait_in_queue(lock));
+ }
+
+ ut_a(lock_trx_table_locks_find(lock->trx, lock));
+ trx_mutex_exit(lock->trx);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the lock queue on a single record.
+@return TRUE if ok */
+static
+bool
+lock_rec_queue_validate(
+/*====================*/
+ bool locked_lock_trx_sys,
+ /*!< in: whether the caller
+ already holds the lock
+ mutex */
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record to look at */
+ const dict_index_t* index, /*!< in: index, or NULL if not known */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ const lock_t* lock;
+ ulint heap_no;
+
+ ut_a(rec);
+ ut_a(block->frame == page_align(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(lock_mutex_own() == locked_lock_trx_sys);
+ ut_ad(!index || dict_index_is_clust(index)
+ || !dict_index_is_online_ddl(index));
+
+ heap_no = page_rec_get_heap_no(rec);
+
+ if (!locked_lock_trx_sys) {
+ lock_mutex_enter();
+ }
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ for (lock = lock_rec_get_first(&lock_sys.rec_hash,
+ block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next_const(heap_no, lock)) {
+
+ ut_ad(!index || lock->index == index);
+
+ trx_mutex_enter(lock->trx);
+ ut_ad(!lock->trx->read_only
+ || !lock->trx->is_autocommit_non_locking());
+ ut_ad(trx_state_eq(lock->trx,
+ TRX_STATE_COMMITTED_IN_MEMORY)
+ || !lock_get_wait(lock)
+ || lock_rec_has_to_wait_in_queue(lock));
+ trx_mutex_exit(lock->trx);
+ }
+
+func_exit:
+ if (!locked_lock_trx_sys) {
+ lock_mutex_exit();
+ }
+
+ return true;
+ }
+
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(lock_mutex_own());
+
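+ /* Only a clustered (primary) index record can identify an
+ implicit lock holder here, via the trx id stored in the record. */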
+ const trx_id_t impl_trx_id = index && index->is_primary()
+ ? lock_clust_rec_some_has_impl(rec, index, offsets)
+ : 0;
+
+ if (trx_t *impl_trx = impl_trx_id
+ ? trx_sys.find(current_trx(), impl_trx_id, false)
+ : 0) {
+ /* impl_trx could have been committed before we
+ acquire its mutex, but not thereafter. */
+
+ mutex_enter(&impl_trx->mutex);
+ ut_ad(impl_trx->state != TRX_STATE_NOT_STARTED);
+ if (impl_trx->state == TRX_STATE_COMMITTED_IN_MEMORY) {
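+ /* The transaction committed before we acquired its mutex;
+ any implicit lock it held is no longer relevant. */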
+ } else if (const lock_t* other_lock
+ = lock_rec_other_has_expl_req(
+ LOCK_S, block, true, heap_no,
+ impl_trx)) {
+ /* The impl_trx is holding an implicit lock on the
+ given record 'rec'. So there cannot be another
+ explicit granted lock. Also, there can be another
+ explicit waiting lock only if the impl_trx has an
+ explicit granted lock. */
+
+#ifdef WITH_WSREP
+ /** Galera record locking rules:
+ * If there is no other record lock on the same record, we may grant
+ the lock request.
+ * If there is another record lock but the requested record lock is
+ compatible with it, we may grant the lock request.
+ * If there is another record lock and it is not compatible with the
+ requested lock, all normal transactions must wait.
+ * BF (brute force) additional exceptions:
+ ** If BF already holds a record lock on the requested record, we may
+ grant the new record lock even if there are conflicting record
+ lock(s) waiting in the queue.
+ ** If a conflicting transaction holds the requested record lock,
+ we will cancel that record lock and select the conflicting
+ transaction as the BF abort or kill victim.
+ ** If a conflicting transaction is waiting for the requested record
+ lock, we will cancel that wait and select the conflicting
+ transaction as the BF abort or kill victim.
+ ** There should not be two BF transactions waiting for the same
+ record lock.
+ */
+ if (other_lock->trx->is_wsrep() && !lock_get_wait(other_lock)) {
+ wsrep_report_bf_lock_wait(impl_trx->mysql_thd, impl_trx->id);
+ wsrep_report_bf_lock_wait(other_lock->trx->mysql_thd, other_lock->trx->id);
+
+ if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no,
+ impl_trx)) {
+ ib::info() << "WSREP impl BF lock conflict";
+ }
+ } else
+#endif /* WITH_WSREP */
+ {
+ ut_ad(lock_get_wait(other_lock));
+ ut_ad(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, impl_trx));
+ }
+ }
+
+ mutex_exit(&impl_trx->mutex);
+ }
+
+ for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next_const(heap_no, lock)) {
+ ut_ad(!lock->trx->read_only
+ || !lock->trx->is_autocommit_non_locking());
+ ut_ad(!page_rec_is_metadata(rec));
+
+ if (index) {
+ ut_a(lock->index == index);
+ }
+
+ if (!lock_rec_get_gap(lock) && !lock_get_wait(lock)) {
+
+ lock_mode mode;
+
+ if (lock_get_mode(lock) == LOCK_S) {
+ mode = LOCK_X;
+ } else {
+ mode = LOCK_S;
+ }
+
+ const lock_t* other_lock
+ = lock_rec_other_has_expl_req(
+ mode, block, false, heap_no,
+ lock->trx);
+#ifdef WITH_WSREP
+ if (UNIV_UNLIKELY(other_lock && lock->trx->is_wsrep())) {
+ /* Only BF transaction may be granted
+ lock before other conflicting lock
+ request. */
+ if (!wsrep_thd_is_BF(lock->trx->mysql_thd, FALSE)
+ && !wsrep_thd_is_BF(other_lock->trx->mysql_thd, FALSE)) {
+ /* If no BF, this case is a bug. */
+ wsrep_report_bf_lock_wait(lock->trx->mysql_thd, lock->trx->id);
+ wsrep_report_bf_lock_wait(other_lock->trx->mysql_thd, other_lock->trx->id);
+ ut_error;
+ }
+ } else
+#endif /* WITH_WSREP */
+ ut_ad(!other_lock);
+ } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) {
+
+ ut_a(lock_rec_has_to_wait_in_queue(lock));
+ }
+ }
+
+ ut_ad(innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS ||
+ lock_queue_validate(lock));
+
+ goto func_exit;
+}
+
+/*********************************************************************//**
+Validates the record lock queues on a page.
+@return TRUE if ok */
+static
+ibool
+lock_rec_validate_page(
+/*===================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ const lock_t* lock;
+ const rec_t* rec;
+ ulint nth_lock = 0;
+ ulint nth_bit = 0;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ lock_mutex_enter();
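+ /* After validating a record we restart the scan from the first
+ lock on the page; nth_lock and nth_bit remember the progress
+ made so far across the restarts. */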
+loop:
+ lock = lock_sys.get_first(block->page.id());
+
+ if (!lock) {
+ goto function_exit;
+ }
+
+ DBUG_ASSERT(block->page.status != buf_page_t::FREED);
+
+ for (i = 0; i < nth_lock; i++) {
+
+ lock = lock_rec_get_next_on_page_const(lock);
+
+ if (!lock) {
+ goto function_exit;
+ }
+ }
+
+ ut_ad(!lock->trx->read_only
+ || !lock->trx->is_autocommit_non_locking());
+
+ /* Only validate the record queues when this thread is not
+ holding a space->latch. */
+ if (!sync_check_find(SYNC_FSP))
+ for (i = nth_bit; i < lock_rec_get_n_bits(lock); i++) {
+
+ if (i == PAGE_HEAP_NO_SUPREMUM
+ || lock_rec_get_nth_bit(lock, i)) {
+
+ rec = page_find_rec_with_heap_no(block->frame, i);
+ ut_a(rec);
+ ut_ad(!lock_rec_get_nth_bit(lock, i)
+ || page_rec_is_leaf(rec));
+ offsets = rec_get_offsets(rec, lock->index, offsets,
+ lock->index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* If this thread is holding the file space
+ latch (fil_space_t::latch), the following
+ check WILL break the latching order and may
+ cause a deadlock of threads. */
+
+ lock_rec_queue_validate(
+ TRUE, block, rec, lock->index, offsets);
+
+ nth_bit = i + 1;
+
+ goto loop;
+ }
+ }
+
+ nth_bit = 0;
+ nth_lock++;
+
+ goto loop;
+
+function_exit:
+ lock_mutex_exit();
+
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Validate record locks up to a limit.
+@return lock at limit or NULL if no more locks in the hash bucket */
+static MY_ATTRIBUTE((warn_unused_result))
+const lock_t*
+lock_rec_validate(
+/*==============*/
+ ulint start, /*!< in: lock_sys.rec_hash
+ bucket */
+ page_id_t* limit) /*!< in/out: upper limit of
+ (space, page_no) */
+{
+ ut_ad(lock_mutex_own());
+
+ for (const lock_t* lock = static_cast<const lock_t*>(
+ HASH_GET_FIRST(&lock_sys.rec_hash, start));
+ lock != NULL;
+ lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))) {
+
+ ut_ad(!lock->trx->read_only
+ || !lock->trx->is_autocommit_non_locking());
+ ut_ad(lock_get_type(lock) == LOCK_REC);
+
+ page_id_t current(lock->un_member.rec_lock.page_id);
+
+ if (current > *limit) {
+ *limit = current + 1;
+ return(lock);
+ }
+ }
+
+ return(0);
+}
+
+/*********************************************************************//**
+Validate a record lock's block */
+static void lock_rec_block_validate(const page_id_t page_id)
+{
+ /* The lock and the block that it is referring to may be freed at
+ this point. We pass BUF_GET_POSSIBLY_FREED to skip a debug check.
+ If the lock exists in lock_rec_validate_page() we assert
+ block->page.status != FREED. */
+
+ buf_block_t* block;
+ mtr_t mtr;
+
+ /* Transactional locks should never refer to dropped
+ tablespaces, because all DDL operations that would drop or
+ discard or rebuild a tablespace do hold an exclusive table
+ lock, which would conflict with any locks referring to the
+ tablespace from other transactions. */
+ if (fil_space_t* space = fil_space_t::get(page_id.space())) {
+ dberr_t err = DB_SUCCESS;
+ mtr_start(&mtr);
+
+ block = buf_page_get_gen(
+ page_id,
+ space->zip_size(),
+ RW_X_LATCH, NULL,
+ BUF_GET_POSSIBLY_FREED,
+ __FILE__, __LINE__, &mtr, &err);
+
+ if (err != DB_SUCCESS) {
+ ib::error() << "Lock rec block validate failed for tablespace "
+ << space->name
+ << page_id << " err " << err;
+ }
+
+ if (block && block->page.status != buf_page_t::FREED) {
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ ut_ad(lock_rec_validate_page(block));
+ }
+
+ mtr_commit(&mtr);
+
+ space->release();
+ }
+}
+
+
+static my_bool lock_validate_table_locks(rw_trx_hash_element_t *element, void*)
+{
+ ut_ad(lock_mutex_own());
+ mutex_enter(&element->mutex);
+ if (element->trx)
+ {
+ check_trx_state(element->trx);
+ for (const lock_t *lock= UT_LIST_GET_FIRST(element->trx->lock.trx_locks);
+ lock != NULL;
+ lock= UT_LIST_GET_NEXT(trx_locks, lock))
+ {
+ if (lock_get_type_low(lock) & LOCK_TABLE)
+ lock_table_queue_validate(lock->un_member.tab_lock.table);
+ }
+ }
+ mutex_exit(&element->mutex);
+ return 0;
+}
+
+
+/*********************************************************************//**
+Validates the lock system.
+@return TRUE if ok */
+static
+bool
+lock_validate()
+/*===========*/
+{
+ std::set<page_id_t> pages;
+
+ lock_mutex_enter();
+
+ /* Validate table locks */
+ trx_sys.rw_trx_hash.iterate(lock_validate_table_locks);
+
+ /* Iterate over all the record locks and validate them. We do not
+ want to hog the lock_sys_t::mutex: collect the page ids while
+ holding it, and validate the blocks only after releasing it. */
+
+ for (ulint i = 0; i < lock_sys.rec_hash.n_cells; i++) {
+ page_id_t limit(0, 0);
+
+ while (const lock_t* lock = lock_rec_validate(i, &limit)) {
+ if (lock_rec_find_set_bit(lock) == ULINT_UNDEFINED) {
+ /* The lock bitmap is empty; ignore it. */
+ continue;
+ }
+ pages.insert(lock->un_member.rec_lock.page_id);
+ }
+ }
+
+ lock_mutex_exit();
+
+ for (page_id_t page_id : pages) {
+ lock_rec_block_validate(page_id);
+ }
+
+ return(true);
+}
+#endif /* UNIV_DEBUG */
+/*============ RECORD LOCK CHECKS FOR ROW OPERATIONS ====================*/
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a record. If they do, first tests if the query thread should anyway
+be suspended for some reason; if not, then puts the transaction and
+the query thread to the lock wait state and inserts a waiting request
+for a gap x-lock to the lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_rec_insert_check_and_lock(
+/*===========================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
+ set, does nothing */
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ bool* inherit)/*!< out: set to true if the new
+ inserted record maybe should inherit
+ LOCK_GAP type locks from the successor
+ record */
+{
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(!dict_index_is_online_ddl(index)
+ || index->is_primary()
+ || (flags & BTR_CREATE_FLAG));
+ ut_ad(mtr->is_named_space(index->table->space));
+ ut_ad(page_rec_is_leaf(rec));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(!index->table->is_temporary());
+ ut_ad(page_is_leaf(block->frame));
+
+ dberr_t err;
+ lock_t* lock;
+ bool inherit_in = *inherit;
+ trx_t* trx = thr_get_trx(thr);
+ const rec_t* next_rec = page_rec_get_next_const(rec);
+ ulint heap_no = page_rec_get_heap_no(next_rec);
+ ut_ad(!rec_is_metadata(next_rec, *index));
+
+ lock_mutex_enter();
+ /* Because this code is invoked for a running transaction by
+ the thread that is serving the transaction, it is not necessary
+ to hold trx->mutex here. */
+
+ /* When inserting a record into an index, the table must be at
+ least IX-locked. When we are building an index, we would pass
+ BTR_NO_LOCKING_FLAG and skip the locking altogether. */
+ ut_ad(lock_table_has(trx, index->table, LOCK_IX));
+
+ lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
+
+ if (lock == NULL) {
+ /* We optimize CPU time usage in the simplest case */
+
+ lock_mutex_exit();
+
+ if (inherit_in && !dict_index_is_clust(index)) {
+ /* Update the page max trx id field */
+ page_update_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ trx->id, mtr);
+ }
+
+ *inherit = false;
+
+ return(DB_SUCCESS);
+ }
+
+ /* Spatial index does not use GAP lock protection. It uses
+ "predicate lock" to protect the "range" */
+ if (dict_index_is_spatial(index)) {
+ lock_mutex_exit();
+ return(DB_SUCCESS);
+ }
+
+ *inherit = true;
+
+ /* If another transaction has an explicit lock request which locks
+ the gap, waiting or granted, on the successor, the insert has to wait.
+
+ An exception is the case where the lock by another transaction
+ is a gap type lock which it placed to wait for its turn to insert. We
+ do not consider that kind of lock conflicting with our insert. This
+ eliminates an unnecessary deadlock which resulted when two transactions
+ had to wait for their inserts: both had waiting gap type lock requests
+ on the successor. */
+
+ const unsigned type_mode = LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION;
+
+ if (
+#ifdef WITH_WSREP
+ lock_t* c_lock =
+#endif /* WITH_WSREP */
+ lock_rec_other_has_conflicting(type_mode, block, heap_no, trx)) {
+ /* Note that we may get DB_SUCCESS also here! */
+ trx_mutex_enter(trx);
+
+ err = lock_rec_enqueue_waiting(
+#ifdef WITH_WSREP
+ c_lock,
+#endif /* WITH_WSREP */
+ type_mode, block, heap_no, index, thr, NULL);
+
+ trx_mutex_exit(trx);
+ } else {
+ err = DB_SUCCESS;
+ }
+
+ lock_mutex_exit();
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ if (!inherit_in || dict_index_is_clust(index)) {
+ break;
+ }
+
+ /* Update the page max trx id field */
+ page_update_max_trx_id(
+ block, buf_block_get_page_zip(block), trx->id, mtr);
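+ /* fall through */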
+ default:
+ /* We only care about the two return values. */
+ break;
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ const rec_offs* offsets;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(next_rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ ut_ad(lock_rec_queue_validate(
+ FALSE, block, next_rec, index, offsets));
+
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+ return(err);
+}
+
+/*********************************************************************//**
+Creates an explicit record lock for a running transaction that currently only
+has an implicit lock on the record. The transaction instance must have a
+reference count > 0 so that it can't be committed and freed before this
+function has completed. */
+static
+void
+lock_rec_convert_impl_to_expl_for_trx(
+/*==================================*/
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record on page */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx, /*!< in/out: active transaction */
+ ulint heap_no)/*!< in: rec heap number to lock */
+{
+ ut_ad(trx->is_referenced());
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ DEBUG_SYNC_C("before_lock_rec_convert_impl_to_expl_for_trx");
+ lock_mutex_enter();
+ trx_mutex_enter(trx);
+ ut_ad(!trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+
+ if (!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)
+ && !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, trx)) {
+ lock_rec_add_to_queue(LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, index, trx, true);
+ }
+
+ lock_mutex_exit();
+ trx_mutex_exit(trx);
+ trx->release_reference();
+
+ DEBUG_SYNC_C("after_lock_rec_convert_impl_to_expl_for_trx");
+}
+
+
+#ifdef UNIV_DEBUG
+struct lock_rec_other_trx_holds_expl_arg
+{
+ const ulint heap_no;
+ const buf_block_t * const block;
+ const trx_t *impl_trx;
+};
+
+
+static my_bool lock_rec_other_trx_holds_expl_callback(
+ rw_trx_hash_element_t *element,
+ lock_rec_other_trx_holds_expl_arg *arg)
+{
+ mutex_enter(&element->mutex);
+ if (element->trx)
+ {
+ trx_mutex_enter(element->trx);
+ ut_ad(element->trx->state != TRX_STATE_NOT_STARTED);
+ lock_t *expl_lock= element->trx->state == TRX_STATE_COMMITTED_IN_MEMORY
+ ? NULL : lock_rec_has_expl(LOCK_S | LOCK_REC_NOT_GAP, arg->block,
+ arg->heap_no, element->trx);
+ /*
+ Assert that an explicit lock on the record can only be held by
+ the trx that also holds the implicit lock.
+ */
+ ut_ad(!expl_lock || expl_lock->trx == arg->impl_trx);
+ trx_mutex_exit(element->trx);
+ }
+ mutex_exit(&element->mutex);
+ return 0;
+}
+
+
+/**
+ Checks if some transaction, other than given trx_id, has an explicit
+ lock on the given rec.
+
+ FIXME: if the current transaction holds implicit lock from INSERT, a
+ subsequent locking read should not convert it to explicit. See also
+ MDEV-11215.
+
+ @param[in,out] caller_trx trx of current thread
+ @param[in] trx trx holding implicit lock on rec
+ @param[in] rec user record
+ @param[in] block buffer block containing the record
+*/
+
+static void lock_rec_other_trx_holds_expl(trx_t *caller_trx, trx_t *trx,
+ const rec_t *rec,
+ const buf_block_t *block)
+{
+ if (trx)
+ {
+ ut_ad(!page_rec_is_metadata(rec));
+ lock_mutex_enter();
+ ut_ad(trx->is_referenced());
+ trx_mutex_enter(trx);
+ const trx_state_t state = trx->state;
+ trx_mutex_exit(trx);
+ ut_ad(state != TRX_STATE_NOT_STARTED);
+ if (state == TRX_STATE_COMMITTED_IN_MEMORY)
+ {
+ /* The transaction was committed before our lock_mutex_enter(). */
+ lock_mutex_exit();
+ return;
+ }
+ lock_rec_other_trx_holds_expl_arg arg= { page_rec_get_heap_no(rec), block,
+ trx };
+ trx_sys.rw_trx_hash.iterate(caller_trx,
+ lock_rec_other_trx_holds_expl_callback, &arg);
+ lock_mutex_exit();
+ }
+}
+#endif /* UNIV_DEBUG */
+
+
+/** If an implicit x-lock exists on a record, convert it to an explicit one.
+
+Often, this is called by a transaction that is about to enter a lock wait
+due to the lock conflict. Two explicit locks would be created: first the
+exclusive lock on behalf of the lock-holder transaction in this function,
+and then a wait request on behalf of caller_trx, in the calling function.
+
+This may also be called by the same transaction that is already holding
+an implicit exclusive lock on the record. In this case, no explicit lock
+should be created.
+
+@param[in,out] caller_trx current transaction
+@param[in] block index tree leaf page
+@param[in] rec record on the leaf page
+@param[in] index the index of the record
+@param[in] offsets rec_get_offsets(rec,index)
+@return whether caller_trx already holds an exclusive lock on rec */
+static
+bool
+lock_rec_convert_impl_to_expl(
+ trx_t* caller_trx,
+ const buf_block_t* block,
+ const rec_t* rec,
+ dict_index_t* index,
+ const rec_offs* offsets)
+{
+ trx_t* trx;
+
+ ut_ad(!lock_mutex_own());
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ if (dict_index_is_clust(index)) {
+ trx_id_t trx_id;
+
+ trx_id = lock_clust_rec_some_has_impl(rec, index, offsets);
+
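+ /* lock_clust_rec_some_has_impl() returns 0 when no active
+ transaction can be holding an implicit lock on the record. */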
+ if (trx_id == 0) {
+ return false;
+ }
+ if (UNIV_UNLIKELY(trx_id == caller_trx->id)) {
+ return true;
+ }
+
+ trx = trx_sys.find(caller_trx, trx_id);
+ } else {
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ trx = lock_sec_rec_some_has_impl(caller_trx, rec, index,
+ offsets);
+ if (trx == caller_trx) {
+ trx->release_reference();
+ return true;
+ }
+
+ ut_d(lock_rec_other_trx_holds_expl(caller_trx, trx, rec,
+ block));
+ }
+
+ if (trx != 0) {
+ ulint heap_no = page_rec_get_heap_no(rec);
+
+ ut_ad(trx->is_referenced());
+
+ /* If the transaction is still active and has no
+ explicit x-lock set on the record, set one for it.
+ trx cannot be committed until the ref count is zero. */
+
+ lock_rec_convert_impl_to_expl_for_trx(
+ block, rec, index, trx, heap_no);
+ }
+
+ return false;
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified */
+ dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ ulint heap_no;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(block->frame == page_align(rec));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+ ut_ad(!rec_is_metadata(rec, *index));
+ ut_ad(!index->table->is_temporary());
+
+ heap_no = rec_offs_comp(offsets)
+ ? rec_get_heap_no_new(rec)
+ : rec_get_heap_no_old(rec);
+
+ /* If a transaction has no explicit x-lock set on the record, set one
+ for it */
+
+ if (lock_rec_convert_impl_to_expl(thr_get_trx(thr), block, rec, index,
+ offsets)) {
+ /* We already hold an implicit exclusive lock. */
+ return DB_SUCCESS;
+ }
+
+ err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, index, thr);
+
+ ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
+
+ if (err == DB_SUCCESS_LOCKED_REC) {
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (delete
+mark or delete unmark) of a secondary index record.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified; NOTE: as this is a secondary
+ index, we always have to modify the
+ clustered index record first: see the
+ comment below */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr, /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dberr_t err;
+ ulint heap_no;
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(!dict_index_is_online_ddl(index) || (flags & BTR_CREATE_FLAG));
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(mtr->is_named_space(index->table->space));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+ ut_ad(!index->table->is_temporary());
+
+ heap_no = page_rec_get_heap_no(rec);
+
+#ifdef WITH_WSREP
+ trx_t *trx= thr_get_trx(thr);
+ /* If the transaction scanning a unique secondary key is a wsrep
+ high priority (brute force) thread, the scan may involve
+ GAP-locking in the index. Because this locking also happens when
+ replication events are applied in high priority applier threads,
+ there is a probability of lock conflicts between two wsrep high
+ priority threads. To avoid such GAP-locking, we mark here that
+ this transaction is using a unique key scan. */
+ if (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))
+ trx->wsrep_UK_scan= true;
+#endif /* WITH_WSREP */
+
+ /* Another transaction cannot have an implicit lock on the record,
+ because when we come here, we already have modified the clustered
+ index record, and this would not have been possible if another active
+ transaction had modified this secondary index record. */
+
+ err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, index, thr);
+
+#ifdef WITH_WSREP
+ trx->wsrep_UK_scan= false;
+#endif /* WITH_WSREP */
+
+#ifdef UNIV_DEBUG
+ {
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ const rec_offs* offsets;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ ut_ad(lock_rec_queue_validate(
+ FALSE, block, rec, index, offsets));
+
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+ if (err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC) {
+ /* Update the page max trx id field */
+ /* It might not be necessary to do this if
+ err == DB_SUCCESS (no new lock created),
+ but it should not cost too much performance. */
+ page_update_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ thr_get_trx(thr)->id, mtr);
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Like lock_clust_rec_read_check_and_lock(), but reads a
+secondary index record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ ulint heap_no;
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+ if ((flags & BTR_NO_LOCKING_FLAG)
+ || srv_read_only_mode
+ || index->table->is_temporary()) {
+
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(!rec_is_metadata(rec, *index));
+ heap_no = page_rec_get_heap_no(rec);
+
+ /* Some transaction may have an implicit x-lock on the record only
+ if the max trx id for the page >= min trx id for the trx list or a
+ database recovery is running. */
+
+ if (!page_rec_is_supremum(rec)
+ && page_get_max_trx_id(block->frame) >= trx_sys.get_min_trx_id()
+ && lock_rec_convert_impl_to_expl(thr_get_trx(thr), block, rec,
+ index, offsets)) {
+ /* We already hold an implicit exclusive lock. */
+ return DB_SUCCESS;
+ }
+
+#ifdef WITH_WSREP
+ trx_t *trx= thr_get_trx(thr);
+ /* If the transaction scanning a unique secondary key is a wsrep
+ high priority (brute force) thread, the scan may involve
+ GAP-locking in the index. Because this locking also happens when
+ replication events are applied in high priority applier threads,
+ there is a probability of lock conflicts between two wsrep high
+ priority threads. To avoid such GAP-locking, we mark here that
+ this transaction is using a unique key scan. */
+ if (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))
+ trx->wsrep_UK_scan= true;
+#endif /* WITH_WSREP */
+
+ err = lock_rec_lock(FALSE, gap_mode | mode,
+ block, heap_no, index, thr);
+
+#ifdef WITH_WSREP
+ trx->wsrep_UK_scan= false;
+#endif /* WITH_WSREP */
+
+ ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ ulint heap_no;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
+ ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP
+ || gap_mode == LOCK_REC_NOT_GAP);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ if ((flags & BTR_NO_LOCKING_FLAG)
+ || srv_read_only_mode
+ || index->table->is_temporary()) {
+
+ return(DB_SUCCESS);
+ }
+
+ heap_no = page_rec_get_heap_no(rec);
+
+ if (heap_no != PAGE_HEAP_NO_SUPREMUM
+ && lock_rec_convert_impl_to_expl(thr_get_trx(thr), block, rec,
+ index, offsets)) {
+ /* We already hold an implicit exclusive lock. */
+ return DB_SUCCESS;
+ }
+
+ err = lock_rec_lock(FALSE, gap_mode | mode,
+ block, heap_no, index, thr);
+
+ ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
+
+ DEBUG_SYNC_C("after_lock_clust_rec_read_check_and_lock");
+
+ return(err);
+}
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. This is an alternative version of
+lock_clust_rec_read_check_and_lock() that does not require the parameter
+"offsets".
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_read_check_and_lock_alt(
+/*===================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* tmp_heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ dberr_t err;
+ rec_offs_init(offsets_);
+
+ ut_ad(page_rec_is_leaf(rec));
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &tmp_heap);
+ err = lock_clust_rec_read_check_and_lock(flags, block, rec, index,
+ offsets, mode, gap_mode, thr);
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ if (err == DB_SUCCESS_LOCKED_REC) {
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*******************************************************************//**
+Release the last lock from the transaction's autoinc locks. */
+UNIV_INLINE
+void
+lock_release_autoinc_last_lock(
+/*===========================*/
+ ib_vector_t* autoinc_locks) /*!< in/out: vector of AUTOINC locks */
+{
+ ulint last;
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+ ut_a(!ib_vector_is_empty(autoinc_locks));
+
+ /* The lock to be released must be the last lock acquired. */
+ last = ib_vector_size(autoinc_locks) - 1;
+ lock = *static_cast<lock_t**>(ib_vector_get(autoinc_locks, last));
+
+ /* Should have only AUTOINC locks in the vector. */
+ ut_a(lock_get_mode(lock) == LOCK_AUTO_INC);
+ ut_a(lock_get_type(lock) == LOCK_TABLE);
+
+ ut_a(lock->un_member.tab_lock.table != NULL);
+
+ /* This will remove the lock from the trx autoinc_locks too. */
+ lock_table_dequeue(lock);
+
+ /* Remove from the table vector too. */
+ lock_trx_table_locks_remove(lock);
+}
+
+/*******************************************************************//**
+Check if a transaction holds any autoinc locks.
+@return TRUE if the transaction holds any AUTOINC locks. */
+static
+ibool
+lock_trx_holds_autoinc_locks(
+/*=========================*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ ut_a(trx->autoinc_locks != NULL);
+
+ return(!ib_vector_is_empty(trx->autoinc_locks));
+}
+
+/*******************************************************************//**
+Release all the transaction's autoinc locks. */
+static
+void
+lock_release_autoinc_locks(
+/*=======================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(lock_mutex_own());
+ /* If this is invoked for a running transaction by the thread
+ that is serving the transaction, then it is not necessary to
+ hold trx->mutex here. */
+
+ ut_a(trx->autoinc_locks != NULL);
+
+ /* We release the locks in the reverse order. This is to
+ avoid searching the vector for the element to delete at
+ the lower level. See (lock_table_remove_low()) for details. */
+ while (!ib_vector_is_empty(trx->autoinc_locks)) {
+
+ /* lock_table_remove_low() will also remove the lock from
+ the transaction's autoinc_locks vector. */
+ lock_release_autoinc_last_lock(trx->autoinc_locks);
+ }
+
+ /* Should release all locks. */
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+}
+
+/*******************************************************************//**
+Gets the type of a lock. Non-inline version for using outside of the
+lock module.
+@return LOCK_TABLE or LOCK_REC */
+ulint
+lock_get_type(
+/*==========*/
+ const lock_t* lock) /*!< in: lock */
+{
+ return(lock_get_type_low(lock));
+}
+
+/*******************************************************************//**
+Gets the id of the transaction owning a lock.
+@return transaction id */
+trx_id_t
+lock_get_trx_id(
+/*============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ return(trx_get_id_for_print(lock->trx));
+}
+
+/*******************************************************************//**
+Gets the table on which the lock is.
+@return table */
+UNIV_INLINE
+dict_table_t*
+lock_get_table(
+/*===========*/
+ const lock_t* lock) /*!< in: lock */
+{
+ switch (lock_get_type_low(lock)) {
+ case LOCK_REC:
+ ut_ad(dict_index_is_clust(lock->index)
+ || !dict_index_is_online_ddl(lock->index));
+ return(lock->index->table);
+ case LOCK_TABLE:
+ return(lock->un_member.tab_lock.table);
+ default:
+ ut_error;
+ return(NULL);
+ }
+}
+
+/*******************************************************************//**
+Gets the id of the table on which the lock is.
+@return id of the table */
+table_id_t
+lock_get_table_id(
+/*==============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ dict_table_t* table = lock_get_table(lock);
+ ut_ad(!table->is_temporary());
+ return(table->id);
+}
+
+/** Determine which table a lock is associated with.
+@param[in] lock the lock
+@return name of the table */
+const table_name_t&
+lock_get_table_name(
+ const lock_t* lock)
+{
+ return(lock_get_table(lock)->name);
+}
+
+/*******************************************************************//**
+For a record lock, gets the index on which the lock is.
+@return index */
+const dict_index_t*
+lock_rec_get_index(
+/*===============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_a(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(dict_index_is_clust(lock->index)
+ || !dict_index_is_online_ddl(lock->index));
+
+ return(lock->index);
+}
+
+/*******************************************************************//**
+For a record lock, gets the name of the index on which the lock is.
+The string should not be free()'d or modified.
+@return name of the index */
+const char*
+lock_rec_get_index_name(
+/*====================*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_a(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(dict_index_is_clust(lock->index)
+ || !dict_index_is_online_ddl(lock->index));
+
+ return(lock->index->name);
+}
+
+/*********************************************************************//**
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+ lock_t* lock) /*!< in/out: waiting lock request */
+{
+ que_thr_t* thr;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(lock->trx));
+ ut_ad(lock->trx->state == TRX_STATE_ACTIVE);
+
+ lock->trx->lock.cancel = true;
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+
+ lock_rec_dequeue_from_page(lock);
+ } else {
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+ if (lock->trx->autoinc_locks != NULL) {
+ /* Release the transaction's AUTOINC locks. */
+ lock_release_autoinc_locks(lock->trx);
+ }
+
+ lock_table_dequeue(lock);
+ /* Remove the lock from table lock vector too. */
+ lock_trx_table_locks_remove(lock);
+ }
+
+ /* Reset the wait flag and the back pointer to lock in trx. */
+
+ lock_reset_lock_and_trx_wait(lock);
+
+ /* The following function releases the trx from lock wait. */
+
+ thr = que_thr_end_lock_wait(lock->trx);
+
+ if (thr != NULL) {
+ lock_wait_release_thread_if_suspended(thr);
+ }
+
+ lock->trx->lock.cancel = false;
+}
+
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+void
+lock_unlock_table_autoinc(
+/*======================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(!lock_mutex_own());
+ ut_ad(!trx_mutex_own(trx));
+ ut_ad(!trx->lock.wait_lock);
+
+ /* This can be invoked on NOT_STARTED, ACTIVE, PREPARED,
+ but not COMMITTED transactions. */
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED)
+ || !trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
+
+ /* This function is invoked for a running transaction by the
+ thread that is serving the transaction. Therefore it is not
+ necessary to hold trx->mutex here. */
+
+ if (lock_trx_holds_autoinc_locks(trx)) {
+ lock_mutex_enter();
+
+ lock_release_autoinc_locks(trx);
+
+ lock_mutex_exit();
+ }
+}
+
+static inline dberr_t lock_trx_handle_wait_low(trx_t* trx)
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(trx));
+
+ if (trx->lock.was_chosen_as_deadlock_victim) {
+ return DB_DEADLOCK;
+ }
+ if (!trx->lock.wait_lock) {
+ /* The lock was probably granted before we got here. */
+ return DB_SUCCESS;
+ }
+
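+ /* The transaction has to wait: cancel the wait lock and
+ report DB_LOCK_WAIT to the caller. */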
+ lock_cancel_waiting_and_release(trx->lock.wait_lock);
+ return DB_LOCK_WAIT;
+}
+
+/*********************************************************************//**
+Check whether the transaction has already been rolled back because it
+was selected as a deadlock victim, or if it has to wait then cancel
+the wait lock.
+@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */
+dberr_t
+lock_trx_handle_wait(
+/*=================*/
+ trx_t* trx) /*!< in/out: trx lock state */
+{
+#ifdef WITH_WSREP
+ /* We already own the mutexes. */
+ if (trx->lock.was_chosen_as_wsrep_victim) {
+ return lock_trx_handle_wait_low(trx);
+ }
+#endif /* WITH_WSREP */
+ lock_mutex_enter();
+ trx_mutex_enter(trx);
+ dberr_t err = lock_trx_handle_wait_low(trx);
+ lock_mutex_exit();
+ trx_mutex_exit(trx);
+ return err;
+}
+
+/*********************************************************************//**
+Get the number of locks on a table.
+@return number of locks */
+ulint
+lock_table_get_n_locks(
+/*===================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ulint n_table_locks;
+
+ lock_mutex_enter();
+
+ n_table_locks = UT_LIST_GET_LEN(table->locks);
+
+ lock_mutex_exit();
+
+ return(n_table_locks);
+}
+
+#ifdef UNIV_DEBUG
+/**
+ Do an exhaustive check for any locks (table or rec) against the table.
+
+ @param[in] table check if there are any locks held on records in this table
+ or on the table itself
+*/
+
+static my_bool lock_table_locks_lookup(rw_trx_hash_element_t *element,
+ const dict_table_t *table)
+{
+ ut_ad(lock_mutex_own());
+ mutex_enter(&element->mutex);
+ if (element->trx)
+ {
+ trx_mutex_enter(element->trx);
+ check_trx_state(element->trx);
+ if (element->trx->state != TRX_STATE_COMMITTED_IN_MEMORY)
+ {
+ for (const lock_t *lock= UT_LIST_GET_FIRST(element->trx->lock.trx_locks);
+ lock != NULL;
+ lock= UT_LIST_GET_NEXT(trx_locks, lock))
+ {
+ ut_ad(lock->trx == element->trx);
+ if (lock_get_type_low(lock) == LOCK_REC)
+ {
+ ut_ad(lock->index->online_status != ONLINE_INDEX_CREATION ||
+ lock->index->is_primary());
+ ut_ad(lock->index->table != table);
+ }
+ else
+ ut_ad(lock->un_member.tab_lock.table != table);
+ }
+ }
+ trx_mutex_exit(element->trx);
+ }
+ mutex_exit(&element->mutex);
+ return 0;
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Check if there are any locks (table or rec) against the table.
+@return true if table has either table or record locks. */
+bool
+lock_table_has_locks(
+/*=================*/
+ const dict_table_t* table) /*!< in: check if there are any locks
+ held on records in this table or on the
+ table itself */
+{
+ bool has_locks;
+
+ ut_ad(table != NULL);
+ lock_mutex_enter();
+
+ has_locks = UT_LIST_GET_LEN(table->locks) > 0 || table->n_rec_locks > 0;
+
+#ifdef UNIV_DEBUG
+ if (!has_locks) {
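+ /* Debug cross-check: no transaction may hold any lock on this
+ table when the counters report none. */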
+ trx_sys.rw_trx_hash.iterate(lock_table_locks_lookup, table);
+ }
+#endif /* UNIV_DEBUG */
+
+ lock_mutex_exit();
+
+ return(has_locks);
+}
+
+/*******************************************************************//**
+Initialise the table lock list. */
+void
+lock_table_lock_list_init(
+/*======================*/
+ table_lock_list_t* lock_list) /*!< List to initialise */
+{
+ UT_LIST_INIT(*lock_list, &lock_table_t::locks);
+}
+
+/*******************************************************************//**
+Initialise the trx lock list. */
+void
+lock_trx_lock_list_init(
+/*====================*/
+ trx_lock_list_t* lock_list) /*!< List to initialise */
+{
+ UT_LIST_INIT(*lock_list, &lock_t::trx_locks);
+}
+
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Check if the transaction holds any locks on the sys tables
+or its records.
+@return the strongest lock found on any sys table or 0 for none */
+const lock_t*
+lock_trx_has_sys_table_locks(
+/*=========================*/
+ const trx_t* trx) /*!< in: transaction to check */
+{
+ const lock_t* strongest_lock = 0;
+ lock_mode strongest = LOCK_NONE;
+
+ lock_mutex_enter();
+
+ const lock_list::const_iterator end = trx->lock.table_locks.end();
+ lock_list::const_iterator it = trx->lock.table_locks.begin();
+
+ /* Find a valid mode from the first lock on a system table.
+ Note: the table_locks list can be empty. */
+
+ for (/* No op */; it != end; ++it) {
+ const lock_t* lock = *it;
+
+ if (lock != NULL
+ && dict_is_sys_table(lock->un_member.tab_lock.table->id)) {
+
+ strongest = lock_get_mode(lock);
+ ut_ad(strongest != LOCK_NONE);
+ strongest_lock = lock;
+ break;
+ }
+ }
+
+ if (strongest == LOCK_NONE) {
+ lock_mutex_exit();
+ return(NULL);
+ }
+
+ for (/* No op */; it != end; ++it) {
+ const lock_t* lock = *it;
+
+ if (lock == NULL) {
+ continue;
+ }
+
+ ut_ad(trx == lock->trx);
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+ ut_ad(lock->un_member.tab_lock.table != NULL);
+
+ lock_mode mode = lock_get_mode(lock);
+
+ if (dict_is_sys_table(lock->un_member.tab_lock.table->id)
+ && lock_mode_stronger_or_eq(mode, strongest)) {
+
+ strongest = mode;
+ strongest_lock = lock;
+ }
+ }
+
+ lock_mutex_exit();
+
+ return(strongest_lock);
+}
+
+/** Check if the transaction holds an explicit exclusive lock on a record.
+@param[in] trx transaction
+@param[in] table table
+@param[in] block leaf page
+@param[in] heap_no heap number identifying the record
+@return whether an explicit X-lock is held */
+bool
+lock_trx_has_expl_x_lock(
+ const trx_t* trx, /*!< in: transaction to check */
+ const dict_table_t* table, /*!< in: table to check */
+ const buf_block_t* block, /*!< in: buffer block of the record */
+ ulint heap_no)/*!< in: record heap number */
+{
+ ut_ad(heap_no > PAGE_HEAP_NO_SUPREMUM);
+
+ lock_mutex_enter();
+ ut_ad(lock_table_has(trx, table, LOCK_IX));
+ ut_ad(lock_table_has(trx, table, LOCK_X)
+ || lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block, heap_no,
+ trx));
+ lock_mutex_exit();
+ return(true);
+}
+#endif /* UNIV_DEBUG */
+
+/** rewind(3) the file used for storing the latest detected deadlock and
+print a heading message to stderr if printing of all deadlocks to stderr
+is enabled. */
+void
+DeadlockChecker::start_print()
+{
+ ut_ad(lock_mutex_own());
+
+ rewind(lock_latest_err_file);
+ ut_print_timestamp(lock_latest_err_file);
+
+ if (srv_print_all_deadlocks) {
+ ib::info() << "Transactions deadlock detected, dumping"
+ " detailed information.";
+ }
+}
+
+/** Print a message to the deadlock file and possibly to stderr.
+@param msg message to print */
+void
+DeadlockChecker::print(const char* msg)
+{
+ fputs(msg, lock_latest_err_file);
+
+ if (srv_print_all_deadlocks) {
+ ib::info() << msg;
+ }
+}
+
+/** Print transaction data to the deadlock file and possibly to stderr.
+@param trx transaction
+@param max_query_len max query length to print */
+void
+DeadlockChecker::print(const trx_t* trx, ulint max_query_len)
+{
+ ut_ad(lock_mutex_own());
+
+ ulint n_rec_locks = lock_number_of_rows_locked(&trx->lock);
+ ulint n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
+ ulint heap_size = mem_heap_get_size(trx->lock.lock_heap);
+
+ trx_print_low(lock_latest_err_file, trx, max_query_len,
+ n_rec_locks, n_trx_locks, heap_size);
+
+ if (srv_print_all_deadlocks) {
+ trx_print_low(stderr, trx, max_query_len,
+ n_rec_locks, n_trx_locks, heap_size);
+ }
+}
+
+/** Print lock data to the deadlock file and possibly to stderr.
+@param lock record or table type lock */
+void
+DeadlockChecker::print(const lock_t* lock)
+{
+ ut_ad(lock_mutex_own());
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+ mtr_t mtr;
+ lock_rec_print(lock_latest_err_file, lock, mtr);
+
+ if (srv_print_all_deadlocks) {
+ lock_rec_print(stderr, lock, mtr);
+ }
+ } else {
+ lock_table_print(lock_latest_err_file, lock);
+
+ if (srv_print_all_deadlocks) {
+ lock_table_print(stderr, lock);
+ }
+ }
+}
+
+/** Get the next lock in the queue that is owned by a transaction whose
+sub-tree has not already been searched.
+Note: "next" here means PREV for table locks.
+
+@param lock Lock in queue
+@param heap_no heap_no if lock is a record lock else ULINT_UNDEFINED
+
+@return next lock or NULL if at end of queue */
+const lock_t*
+DeadlockChecker::get_next_lock(const lock_t* lock, ulint heap_no) const
+{
+ ut_ad(lock_mutex_own());
+
+ do {
+ if (lock_get_type_low(lock) == LOCK_REC) {
+ ut_ad(heap_no != ULINT_UNDEFINED);
+ lock = lock_rec_get_next_const(heap_no, lock);
+ } else {
+ ut_ad(heap_no == ULINT_UNDEFINED);
+ ut_ad(lock_get_type_low(lock) == LOCK_TABLE);
+
+ lock = UT_LIST_GET_NEXT(
+ un_member.tab_lock.locks, lock);
+ }
+
+ } while (lock != NULL && is_visited(lock));
+
+ ut_ad(lock == NULL
+ || lock_get_type_low(lock) == lock_get_type_low(m_wait_lock));
+
+ return(lock);
+}
+
+/** Get the first lock to search. The search starts from the current
+wait_lock. What we are really interested in is an edge from the
+current wait_lock's owning transaction to another transaction that has
+a lock ahead in the queue. We skip locks where the owning transaction's
+sub-tree has already been searched.
+
+Note: The record locks are traversed from the oldest lock to the
+latest. For table locks we go from latest to oldest.
+
+For record locks, we first position the "iterator" on the first lock on
+the page and then reposition on the actual heap_no. This is required
+due to the way the record lock hash is implemented.
+
+@param[out] heap_no if rec lock, else ULINT_UNDEFINED.
+@return first lock or NULL */
+const lock_t*
+DeadlockChecker::get_first_lock(ulint* heap_no) const
+{
+ ut_ad(lock_mutex_own());
+
+ const lock_t* lock = m_wait_lock;
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+ /* We are only interested in records that match the heap_no. */
+ *heap_no = lock_rec_find_set_bit(lock);
+
+ ut_ad(*heap_no <= 0xffff);
+ ut_ad(*heap_no != ULINT_UNDEFINED);
+
+ /* Find the locks on the page. */
+ lock = lock_sys.get_first(
+ lock->type_mode & LOCK_PREDICATE
+ ? lock_sys.prdt_hash
+ : lock_sys.rec_hash,
+ lock->un_member.rec_lock.page_id);
+
+ /* Position on the first lock on the physical record.*/
+ if (!lock_rec_get_nth_bit(lock, *heap_no)) {
+ lock = lock_rec_get_next_const(*heap_no, lock);
+ }
+
+ ut_a(!lock_get_wait(lock));
+ } else {
+ /* Table locks don't care about the heap_no. */
+ *heap_no = ULINT_UNDEFINED;
+ ut_ad(lock_get_type_low(lock) == LOCK_TABLE);
+ dict_table_t* table = lock->un_member.tab_lock.table;
+ lock = UT_LIST_GET_FIRST(table->locks);
+ }
+
+ /* We must find at least two locks, otherwise there cannot be a
+ waiting lock; moreover, the first lock cannot be the wait_lock. */
+ ut_a(lock != NULL);
+ ut_a(lock != m_wait_lock ||
+ (innodb_lock_schedule_algorithm
+ == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
+ && !thd_is_replication_slave_thread(lock->trx->mysql_thd)));
+
+ /* Check that the lock type doesn't change. */
+ ut_ad(lock_get_type_low(lock) == lock_get_type_low(m_wait_lock));
+
+ return(lock);
+}
+
+/** Notify that a deadlock has been detected and print the conflicting
+transaction info.
+@param lock lock causing deadlock */
+void
+DeadlockChecker::notify(const lock_t* lock) const
+{
+ ut_ad(lock_mutex_own());
+
+ start_print();
+
+ print("\n*** (1) TRANSACTION:\n");
+
+ print(m_wait_lock->trx, 3000);
+
+ print("*** (1) WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+ print(m_wait_lock);
+
+ print("*** (2) TRANSACTION:\n");
+
+ print(lock->trx, 3000);
+
+ print("*** (2) HOLDS THE LOCK(S):\n");
+
+ print(lock);
+
+ /* It is possible that the joining transaction was granted its
+ lock when we rolled back some other waiting transaction. */
+
+ if (m_start->lock.wait_lock != 0) {
+ print("*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+ print(m_start->lock.wait_lock);
+ }
+
+ DBUG_PRINT("ib_lock", ("deadlock detected"));
+}
+
+/** Select the victim transaction that should be rolled back.
+@return victim transaction */
+const trx_t*
+DeadlockChecker::select_victim() const
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(m_start->lock.wait_lock != 0);
+ ut_ad(m_wait_lock->trx != m_start);
+
+ if (trx_weight_ge(m_wait_lock->trx, m_start)) {
+ /* The joining transaction is 'smaller',
+ choose it as the victim and roll it back. */
+#ifdef WITH_WSREP
+ if (wsrep_thd_is_BF(m_start->mysql_thd, FALSE)) {
+ return(m_wait_lock->trx);
+ }
+#endif /* WITH_WSREP */
+ return(m_start);
+ }
+
+#ifdef WITH_WSREP
+ if (wsrep_thd_is_BF(m_wait_lock->trx->mysql_thd, FALSE)) {
+ return(m_start);
+ }
+#endif /* WITH_WSREP */
+
+ return(m_wait_lock->trx);
+}
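+
+/* Illustration (not from the original source, based on the logic above):
+trx_weight_ge() compares transactions roughly by the number of row locks
+held plus the size of the undo log, so select_victim() prefers to roll
+back the "lighter" transaction, i.e. the one that is cheaper to undo.
+Under WITH_WSREP, a brute-force (BF) applier transaction is never chosen
+as the victim. */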
+
+/** Looks iteratively for a deadlock. Note: the joining transaction may
+have been granted its lock by the deadlock checks.
+@return 0 if no deadlock else the victim transaction instance.*/
+const trx_t*
+DeadlockChecker::search()
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(!trx_mutex_own(m_start));
+
+ ut_ad(m_start != NULL);
+ ut_ad(m_wait_lock != NULL);
+ ut_ad(!m_wait_lock->trx->auto_commit || m_wait_lock->trx->will_lock);
+ ut_d(check_trx_state(m_wait_lock->trx));
+ ut_ad(m_mark_start <= s_lock_mark_counter);
+
+ /* Look at the locks ahead of wait_lock in the lock queue. */
+ ulint heap_no;
+ const lock_t* lock = get_first_lock(&heap_no);
+
+ for (;;) {
+ /* We should never visit the same sub-tree more than once. */
+ ut_ad(lock == NULL || !is_visited(lock));
+
+ while (m_n_elems > 0 && lock == NULL) {
+
+ /* Restore previous search state. */
+
+ pop(lock, heap_no);
+
+ lock = get_next_lock(lock, heap_no);
+ }
+
+ if (lock == NULL) {
+ break;
+ }
+
+ if (lock == m_wait_lock) {
+
+ /* We can mark this subtree as searched */
+ ut_ad(lock->trx->lock.deadlock_mark <= m_mark_start);
+
+ lock->trx->lock.deadlock_mark = ++s_lock_mark_counter;
+
+ /* We are not prepared for an overflow. This 64-bit
+ counter should never wrap around: at 10^9 increments
+ per second, it would take almost 600 years of uptime. */
+
+ ut_ad(s_lock_mark_counter > 0);
+
+ /* Backtrack */
+ lock = NULL;
+ continue;
+ }
+
+ if (!lock_has_to_wait(m_wait_lock, lock)) {
+ /* No conflict, next lock */
+ lock = get_next_lock(lock, heap_no);
+ continue;
+ }
+
+ if (lock->trx == m_start) {
+ /* Found a cycle. */
+ notify(lock);
+ return select_victim();
+ }
+
+ if (is_too_deep()) {
+ /* Search too deep to continue. */
+ m_too_deep = true;
+ return m_start;
+ }
+
+ /* We do not need to report autoinc locks to the upper
+ layer. These locks are released before commit, so they
+ cannot cause deadlocks with binlog-fixed commit
+ order. */
+ if (m_report_waiters
+ && (lock_get_type_low(lock) != LOCK_TABLE
+ || lock_get_mode(lock) != LOCK_AUTO_INC)) {
+ thd_rpl_deadlock_check(m_start->mysql_thd,
+ lock->trx->mysql_thd);
+ }
+
+ if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+ /* Another trx ahead has requested a lock in an
+ incompatible mode, and is itself waiting for a lock. */
+
+ ++m_cost;
+
+ if (!push(lock, heap_no)) {
+ m_too_deep = true;
+ return m_start;
+ }
+
+ m_wait_lock = lock->trx->lock.wait_lock;
+
+ lock = get_first_lock(&heap_no);
+
+ if (is_visited(lock)) {
+ lock = get_next_lock(lock, heap_no);
+ }
+ } else {
+ lock = get_next_lock(lock, heap_no);
+ }
+ }
+
+ ut_a(lock == NULL && m_n_elems == 0);
+
+ /* No deadlock found. */
+ return(0);
+}
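+
+/* Illustration (not from the original source): a minimal wait-for cycle,
+assuming transactions T1 and T2. T1 holds a lock on row A and waits for
+row B; T2 holds row B and then requests row A. When T2 joins, search()
+starts at T2's wait_lock on A, finds T1's conflicting granted lock
+(lock_has_to_wait()), follows T1's own wait_lock to row B via push() and
+m_wait_lock, and there finds T2's granted lock: lock->trx == m_start, so
+the cycle is reported via notify() and select_victim() picks T1 or T2
+by weight. */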
+
+/** Print info about transaction that was rolled back.
+@param trx transaction rolled back
+@param lock lock trx wants */
+void
+DeadlockChecker::rollback_print(const trx_t* trx, const lock_t* lock)
+{
+ ut_ad(lock_mutex_own());
+
+ /* If the lock search exceeds the max step
+ or the max depth, the current trx will be
+ the victim. Print its information. */
+ start_print();
+
+ print("TOO DEEP OR LONG SEARCH IN THE LOCK TABLE"
+ " WAITS-FOR GRAPH, WE WILL ROLL BACK"
+ " FOLLOWING TRANSACTION \n\n"
+ "*** TRANSACTION:\n");
+
+ print(trx, 3000);
+
+ print("*** WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+ print(lock);
+}
+
+/** Rollback transaction selected as the victim. */
+void
+DeadlockChecker::trx_rollback()
+{
+ ut_ad(lock_mutex_own());
+
+ trx_t* trx = m_wait_lock->trx;
+
+ print("*** WE ROLL BACK TRANSACTION (1)\n");
+#ifdef WITH_WSREP
+ if (trx->is_wsrep() && wsrep_thd_is_SR(trx->mysql_thd)) {
+ wsrep_handle_SR_rollback(m_start->mysql_thd, trx->mysql_thd);
+ }
+#endif
+
+ trx_mutex_enter(trx);
+
+ trx->lock.was_chosen_as_deadlock_victim = true;
+
+ lock_cancel_waiting_and_release(trx->lock.wait_lock);
+
+ trx_mutex_exit(trx);
+}
+
+/** Check if a joining lock request results in a deadlock.
+If a deadlock is found, we will resolve the deadlock by
+choosing a victim transaction and rolling it back.
+We will attempt to resolve all deadlocks.
+
+@param[in] lock the lock request
+@param[in,out] trx transaction requesting the lock
+
+@return trx if it was chosen as victim
+@retval NULL if another victim was chosen,
+or there is no deadlock (any more) */
+const trx_t*
+DeadlockChecker::check_and_resolve(const lock_t* lock, trx_t* trx)
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(trx));
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ ut_ad(!trx->auto_commit || trx->will_lock);
+ ut_ad(!srv_read_only_mode);
+
+ if (!innobase_deadlock_detect) {
+ return(NULL);
+ }
+
+ /* Release the mutex to obey the latching order.
+ This is safe, because DeadlockChecker::check_and_resolve()
+ is invoked when a lock wait is enqueued for the currently
+ running transaction. Because m_trx is a running transaction
+ (it is not currently suspended because of a lock wait),
+ its state can only be changed by this thread, which is
+ currently associated with the transaction. */
+
+ trx_mutex_exit(trx);
+
+ const trx_t* victim_trx;
+ const bool report_waiters = trx->mysql_thd
+ && thd_need_wait_reports(trx->mysql_thd);
+
+ /* Try and resolve as many deadlocks as possible. */
+ do {
+ DeadlockChecker checker(trx, lock, s_lock_mark_counter,
+ report_waiters);
+
+ victim_trx = checker.search();
+
+ /* The search was too deep. We roll back the joining
+ transaction if it is possible to roll it back;
+ otherwise we roll back the transaction that is holding
+ the lock which the joining transaction wants. */
+ if (checker.is_too_deep()) {
+
+ ut_ad(trx == checker.m_start);
+ ut_ad(trx == victim_trx);
+
+ rollback_print(victim_trx, lock);
+
+ MONITOR_INC(MONITOR_DEADLOCK);
+ srv_stats.lock_deadlock_count.inc();
+
+ break;
+
+ } else if (victim_trx != NULL && victim_trx != trx) {
+
+ ut_ad(victim_trx == checker.m_wait_lock->trx);
+
+ checker.trx_rollback();
+
+ lock_deadlock_found = true;
+
+ MONITOR_INC(MONITOR_DEADLOCK);
+ srv_stats.lock_deadlock_count.inc();
+ }
+
+ } while (victim_trx != NULL && victim_trx != trx);
+
+ /* If the joining transaction was selected as the victim. */
+ if (victim_trx != NULL) {
+
+ print("*** WE ROLL BACK TRANSACTION (2)\n");
+#ifdef WITH_WSREP
+ if (trx->is_wsrep() && wsrep_thd_is_SR(trx->mysql_thd)) {
+ wsrep_handle_SR_rollback(trx->mysql_thd,
+ victim_trx->mysql_thd);
+ }
+#endif
+
+ lock_deadlock_found = true;
+ }
+
+ trx_mutex_enter(trx);
+
+ return(victim_trx);
+}
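+
+/* Illustration (not from the original source): a hedged sketch of how a
+caller on the lock-wait path might use check_and_resolve(); the names
+wait_lock and trx are shortened for exposition.
+
+	// lock_sys mutex and trx->mutex are held, as asserted above
+	if (const trx_t* victim =
+		DeadlockChecker::check_and_resolve(wait_lock, trx)) {
+		ut_ad(victim == trx);	// only the joiner is returned
+		// the caller maps this to DB_DEADLOCK for the upper layer
+	}
+*/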
+
+/*************************************************************//**
+Updates the lock table when a page is split and merged to
+two pages. */
+UNIV_INTERN
+void
+lock_update_split_and_merge(
+ const buf_block_t* left_block, /*!< in: left page into which records were merged */
+ const rec_t* orig_pred, /*!< in: original predecessor of
+ supremum on the left page before merge*/
+ const buf_block_t* right_block) /*!< in: right page from which records were merged */
+{
+ const rec_t* left_next_rec;
+
+ ut_ad(page_is_leaf(left_block->frame));
+ ut_ad(page_is_leaf(right_block->frame));
+ ut_ad(page_align(orig_pred) == left_block->frame);
+
+ lock_mutex_enter();
+
+ left_next_rec = page_rec_get_next_const(orig_pred);
+ ut_ad(!page_rec_is_metadata(left_next_rec));
+
+ /* Inherit the locks on the supremum of the left page to the
+ first record which was moved from the right page */
+ lock_rec_inherit_to_gap(
+ left_block, left_block,
+ page_rec_get_heap_no(left_next_rec),
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Reset the locks on the supremum of the left page,
+ releasing waiting transactions */
+ lock_rec_reset_and_release_wait(left_block,
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Inherit the locks to the supremum of the left page from the
+ successor of the infimum on the right page */
+ lock_rec_inherit_to_gap(left_block, right_block,
+ PAGE_HEAP_NO_SUPREMUM,
+ lock_get_min_heap_no(right_block));
+
+ lock_mutex_exit();
+}
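+
+/* Illustration (not from the original source): the three steps above on
+a concrete pair of leaf pages. If the right page's records are moved
+back onto the left page, (1) gap locks that waited on the left page's
+supremum now apply before the first moved record, (2) the supremum's own
+locks are cleared and any waiters are released, and (3) gap locks that
+covered the right page's first user record are inherited by the left
+page's supremum. */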
diff --git a/storage/innobase/lock/lock0prdt.cc b/storage/innobase/lock/lock0prdt.cc
new file mode 100644
index 00000000..1eb96a0d
--- /dev/null
+++ b/storage/innobase/lock/lock0prdt.cc
@@ -0,0 +1,1028 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0prdt.cc
+The transaction lock system
+
+Created 9/7/2013 Jimmy Yang
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "lock0lock.h"
+#include "lock0priv.h"
+#include "lock0prdt.h"
+#include "dict0mem.h"
+#include "que0que.h"
+
+/*********************************************************************//**
+Get a minimum bounding box from a Predicate
+@return the minimum bounding box */
+UNIV_INLINE
+rtr_mbr_t*
+prdt_get_mbr_from_prdt(
+/*===================*/
+ const lock_prdt_t* prdt) /*!< in: the lock predicate */
+{
+ rtr_mbr_t* mbr_loc = reinterpret_cast<rtr_mbr_t*>(prdt->data);
+
+ return(mbr_loc);
+}
+
+/*********************************************************************//**
+Get a predicate from a lock
+@return the predicate */
+lock_prdt_t*
+lock_get_prdt_from_lock(
+/*====================*/
+ const lock_t* lock) /*!< in: the lock */
+{
+ lock_prdt_t* prdt = reinterpret_cast<lock_prdt_t*>(
+ &((reinterpret_cast<byte*>(
+ const_cast<lock_t*>(&lock[1])))[
+ UNIV_WORD_SIZE]));
+
+ return(prdt);
+}
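+
+/* Illustration (not from the original source): the assumed memory layout
+of a predicate lock, as implied by the &lock[1] + UNIV_WORD_SIZE address
+arithmetic above:
+
+	[ lock_t ][ record lock bitmap (one machine word) ][ lock_prdt_t ]
+
+i.e. the predicate apparently lives UNIV_WORD_SIZE bytes past the end of
+the lock_t, in space reserved when the lock was created. */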
+
+/*********************************************************************//**
+Get a minimum bounding box directly from a lock
+@return the minimum bounding box*/
+UNIV_INLINE
+rtr_mbr_t*
+lock_prdt_get_mbr_from_lock(
+/*========================*/
+ const lock_t* lock) /*!< in: the lock */
+{
+ ut_ad(lock->type_mode & LOCK_PREDICATE);
+
+ lock_prdt_t* prdt = lock_get_prdt_from_lock(lock);
+
+ rtr_mbr_t* mbr_loc = prdt_get_mbr_from_prdt(prdt);
+
+ return(mbr_loc);
+}
+
+/*********************************************************************//**
+Append a predicate to the lock */
+void
+lock_prdt_set_prdt(
+/*===============*/
+ lock_t* lock, /*!< in: lock */
+ const lock_prdt_t* prdt) /*!< in: Predicate */
+{
+ ut_ad(lock->type_mode & LOCK_PREDICATE);
+
+ memcpy(&(((byte*) &lock[1])[UNIV_WORD_SIZE]), prdt, sizeof *prdt);
+}
+
+
+/** Check whether two predicate locks are compatible with each other
+@param[in] prdt1 first predicate lock
+@param[in] prdt2 second predicate lock
+@param[in] op predicate comparison operator
+@return true if consistent */
+static
+bool
+lock_prdt_consistent(
+ lock_prdt_t* prdt1,
+ lock_prdt_t* prdt2,
+ ulint op)
+{
+ bool ret = false;
+ rtr_mbr_t* mbr1 = prdt_get_mbr_from_prdt(prdt1);
+ rtr_mbr_t* mbr2 = prdt_get_mbr_from_prdt(prdt2);
+ ulint action;
+
+ if (op) {
+ action = op;
+ } else {
+ if (prdt2->op != 0 && (prdt1->op != prdt2->op)) {
+ return(false);
+ }
+
+ action = prdt1->op;
+ }
+
+ switch (action) {
+ case PAGE_CUR_CONTAIN:
+ ret = MBR_CONTAIN_CMP(mbr1, mbr2);
+ break;
+ case PAGE_CUR_DISJOINT:
+ ret = MBR_DISJOINT_CMP(mbr1, mbr2);
+ break;
+ case PAGE_CUR_MBR_EQUAL:
+ ret = MBR_EQUAL_CMP(mbr1, mbr2);
+ break;
+ case PAGE_CUR_INTERSECT:
+ ret = MBR_INTERSECT_CMP(mbr1, mbr2);
+ break;
+ case PAGE_CUR_WITHIN:
+ ret = MBR_WITHIN_CMP(mbr1, mbr2);
+ break;
+ default:
+ ib::error() << "invalid operator " << action;
+ ut_error;
+ }
+
+ return(ret);
+}
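+
+/* Illustration (not from the original source): lock_prdt_consistent()
+maps the predicate operator to the corresponding MBR comparison macro,
+e.g. PAGE_CUR_INTERSECT tests whether the two rectangles overlap and
+PAGE_CUR_DISJOINT whether they do not; PAGE_CUR_MBR_EQUAL only holds for
+identical rectangles. When op == 0, the two predicates' own operators
+must agree before the stored operator is applied. */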
+
+/*********************************************************************//**
+Checks if a predicate lock request for a new lock has to wait for
+another lock.
+@return true if new lock has to wait for lock2 to be released */
+bool
+lock_prdt_has_to_wait(
+/*==================*/
+ const trx_t* trx, /*!< in: trx of new lock */
+ unsigned type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
+ LOCK_INSERT_INTENTION */
+ lock_prdt_t* prdt, /*!< in: lock predicate to check */
+ const lock_t* lock2) /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+{
+ lock_prdt_t* cur_prdt = lock_get_prdt_from_lock(lock2);
+
+ ut_ad(trx && lock2);
+ ut_ad((lock2->type_mode & LOCK_PREDICATE && type_mode & LOCK_PREDICATE)
+ || (lock2->type_mode & LOCK_PRDT_PAGE
+ && type_mode & LOCK_PRDT_PAGE));
+
+ ut_ad(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
+
+ if (trx != lock2->trx
+ && !lock_mode_compatible(static_cast<lock_mode>(
+ LOCK_MODE_MASK & type_mode),
+ lock_get_mode(lock2))) {
+
+ /* If it is a page lock, then return true (conflict) */
+ if (type_mode & LOCK_PRDT_PAGE) {
+ ut_ad(lock2->type_mode & LOCK_PRDT_PAGE);
+
+ return(true);
+ }
+
+ /* A predicate lock does not conflict with a
+ non-predicate lock */
+ if (!(lock2->type_mode & LOCK_PREDICATE)) {
+ return(false);
+ }
+
+ ut_ad(lock2->type_mode & LOCK_PREDICATE);
+
+ if (!(type_mode & LOCK_INSERT_INTENTION)) {
+ /* PREDICATE locks without LOCK_INSERT_INTENTION flag
+ do not need to wait for anything. This is because
+ different users can have conflicting lock types
+ on predicates. */
+
+ return(false);
+ }
+
+ if (lock2->type_mode & LOCK_INSERT_INTENTION) {
+
+ /* No lock request needs to wait for an insert
+ intention lock to be removed. This makes it similar
+ to GAP lock, that allows conflicting insert intention
+ locks */
+ return(false);
+ }
+
+ if (!lock_prdt_consistent(cur_prdt, prdt, 0)) {
+ return(false);
+ }
+
+ return(true);
+ }
+
+ return(false);
+}
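+
+/* Illustration (not from the original source): the decision order above,
+summarized. A conflict is only possible between different transactions
+with incompatible lock modes; then
+  - the new lock is LOCK_PRDT_PAGE          -> wait
+  - the other lock is not LOCK_PREDICATE    -> no wait
+  - the new lock lacks LOCK_INSERT_INTENTION-> no wait (predicate reads
+    never block each other)
+  - the other lock has LOCK_INSERT_INTENTION-> no wait (like gap locks)
+  - otherwise wait iff the two predicates are "consistent" (overlap). */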
+
+/*********************************************************************//**
+Checks if a transaction has a GRANTED stronger or equal predicate lock
+on the page
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_prdt_has_lock(
+/*===============*/
+ ulint precise_mode, /*!< in: LOCK_S or LOCK_X */
+ unsigned type_mode, /*!< in: LOCK_PREDICATE etc. */
+ const buf_block_t* block, /*!< in: buffer block
+ containing the record */
+ lock_prdt_t* prdt, /*!< in: The predicate to be
+ attached to the new lock */
+ const trx_t* trx) /*!< in: transaction */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+ ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
+ || (precise_mode & LOCK_MODE_MASK) == LOCK_X);
+ ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
+
+ for (lock = lock_rec_get_first(
+ lock_hash_get(type_mode), block, PRDT_HEAPNO);
+ lock != NULL;
+ lock = lock_rec_get_next(PRDT_HEAPNO, lock)) {
+ ut_ad(lock->type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
+
+ if (lock->trx == trx
+ && !(lock->type_mode & LOCK_INSERT_INTENTION)
+ && !lock_get_wait(lock)
+ && lock_mode_stronger_or_eq(
+ lock_get_mode(lock),
+ static_cast<lock_mode>(
+ precise_mode & LOCK_MODE_MASK))) {
+ if (lock->type_mode & LOCK_PRDT_PAGE) {
+ return(lock);
+ }
+
+ ut_ad(lock->type_mode & LOCK_PREDICATE);
+ lock_prdt_t* cur_prdt = lock_get_prdt_from_lock(
+ lock);
+
+ /* If the lock's predicate operator is the same
+ as the one we are looking for and the predicate
+ test succeeds, then we have found a lock */
+ if (cur_prdt->op == prdt->op
+ && lock_prdt_consistent(cur_prdt, prdt, 0)) {
+
+ return(lock);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Checks if some other transaction has a conflicting predicate
+lock request in the queue, so that we have to wait.
+@return lock or NULL */
+static
+lock_t*
+lock_prdt_other_has_conflicting(
+/*============================*/
+ unsigned mode, /*!< in: LOCK_S or LOCK_X,
+ possibly ORed to LOCK_PREDICATE or
+ LOCK_PRDT_PAGE, LOCK_INSERT_INTENTION */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ lock_prdt_t* prdt, /*!< in: predicate (currently
+ the Minimum Bounding Rectangle)
+ the new lock will be on */
+ const trx_t* trx) /*!< in: our transaction */
+{
+ ut_ad(lock_mutex_own());
+
+ for (lock_t* lock = lock_rec_get_first(
+ lock_hash_get(mode), block, PRDT_HEAPNO);
+ lock != NULL;
+ lock = lock_rec_get_next(PRDT_HEAPNO, lock)) {
+
+ if (lock->trx == trx) {
+ continue;
+ }
+
+ if (lock_prdt_has_to_wait(trx, mode, prdt, lock)) {
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Enlarge a lock's Minimum Bounding Rectangle to cover the given MBR */
+static
+void
+lock_prdt_enlarge_mbr(
+/*==================*/
+ const lock_t* lock, /*!< in/out: lock to modify */
+ rtr_mbr_t* mbr) /*!< in: Minimum Bounding Rectangle */
+{
+ rtr_mbr_t* cur_mbr = lock_prdt_get_mbr_from_lock(lock);
+
+ if (cur_mbr->xmin > mbr->xmin) {
+ cur_mbr->xmin = mbr->xmin;
+ }
+
+ if (cur_mbr->ymin > mbr->ymin) {
+ cur_mbr->ymin = mbr->ymin;
+ }
+
+ if (cur_mbr->xmax < mbr->xmax) {
+ cur_mbr->xmax = mbr->xmax;
+ }
+
+ if (cur_mbr->ymax < mbr->ymax) {
+ cur_mbr->ymax = mbr->ymax;
+ }
+}
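+
+/* Illustration (not from the original source): enlarging a lock whose
+current MBR is (xmin,ymin,xmax,ymax) = (1,1,3,3) by a new MBR (0,2,4,5)
+yields (0,1,4,5); each bound only ever moves outward, so the stored
+rectangle always covers every rectangle it has absorbed. */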
+
+/*********************************************************************//**
+Enlarge a lock's predicate to a "covering" (larger) predicate */
+static
+void
+lock_prdt_enlarge_prdt(
+/*===================*/
+ lock_t* lock, /*!< in/out: lock to modify */
+ lock_prdt_t* prdt) /*!< in: predicate */
+{
+ rtr_mbr_t* mbr = prdt_get_mbr_from_prdt(prdt);
+
+ lock_prdt_enlarge_mbr(lock, mbr);
+}
+
+/*********************************************************************//**
+Check two predicates' MBRs are the same
+@return true if they are the same */
+static
+bool
+lock_prdt_is_same(
+/*==============*/
+ lock_prdt_t* prdt1, /*!< in: first predicate */
+ lock_prdt_t* prdt2) /*!< in: second predicate */
+{
+ rtr_mbr_t* mbr1 = prdt_get_mbr_from_prdt(prdt1);
+ rtr_mbr_t* mbr2 = prdt_get_mbr_from_prdt(prdt2);
+
+ if (prdt1->op == prdt2->op && MBR_EQUAL_CMP(mbr1, mbr2)) {
+ return(true);
+ }
+
+ return(false);
+}
+
+/*********************************************************************//**
+Looks for a similar predicate lock struct by the same trx on the same page.
+This can be used to save space when a new record lock should be set on a page:
+no new struct is needed, if a suitable old one is found.
+@return lock or NULL */
+static
+lock_t*
+lock_prdt_find_on_page(
+/*===================*/
+ unsigned type_mode, /*!< in: lock type_mode field */
+ const buf_block_t* block, /*!< in: buffer block */
+ lock_prdt_t* prdt, /*!< in: MBR with the lock */
+ const trx_t* trx) /*!< in: transaction */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ for (lock = lock_sys.get_first(*lock_hash_get(type_mode),
+ block->page.id());
+ lock != NULL;
+ lock = lock_rec_get_next_on_page(lock)) {
+
+ if (lock->trx == trx
+ && lock->type_mode == type_mode) {
+ if (lock->type_mode & LOCK_PRDT_PAGE) {
+ return(lock);
+ }
+
+ ut_ad(lock->type_mode & LOCK_PREDICATE);
+
+ if (lock_prdt_is_same(lock_get_prdt_from_lock(lock),
+ prdt)) {
+ return(lock);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Adds a predicate lock request in the predicate lock queue.
+@return lock where the bit was set */
+static
+lock_t*
+lock_prdt_add_to_queue(
+/*===================*/
+ unsigned type_mode,/*!< in: lock mode, wait, predicate
+ etc. flags; type is ignored
+ and replaced by LOCK_REC */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx, /*!< in/out: transaction */
+ lock_prdt_t* prdt, /*!< in: Minimum Bounding Rectangle
+ the new lock will be on */
+ bool caller_owns_trx_mutex)
+ /*!< in: TRUE if caller owns the
+ transaction mutex */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx));
+ ut_ad(!dict_index_is_clust(index) && !dict_index_is_online_ddl(index));
+ ut_ad(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
+
+#ifdef UNIV_DEBUG
+ switch (type_mode & LOCK_MODE_MASK) {
+ case LOCK_X:
+ case LOCK_S:
+ break;
+ default:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ type_mode |= LOCK_REC;
+
+ /* Look for a waiting lock request on the same record or on a gap */
+
+ lock_t* lock;
+
+ for (lock = lock_sys.get_first(*lock_hash_get(type_mode),
+ block->page.id());
+ lock != NULL;
+ lock = lock_rec_get_next_on_page(lock)) {
+
+ if (lock_get_wait(lock)
+ && lock_rec_get_nth_bit(lock, PRDT_HEAPNO)
+ && lock->type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)) {
+
+ break;
+ }
+ }
+
+ if (lock == NULL && !(type_mode & LOCK_WAIT)) {
+
+ /* Look for a similar record lock on the same page:
+ if one is found and there are no waiting lock requests,
+ we can just set the bit */
+
+ lock = lock_prdt_find_on_page(type_mode, block, prdt, trx);
+
+ if (lock != NULL) {
+
+ if (lock->type_mode & LOCK_PREDICATE) {
+ lock_prdt_enlarge_prdt(lock, prdt);
+ }
+
+ return(lock);
+ }
+ }
+
+ lock = lock_rec_create(
+#ifdef WITH_WSREP
+ NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */
+#endif
+ type_mode, block, PRDT_HEAPNO, index, trx,
+ caller_owns_trx_mutex);
+
+ if (lock->type_mode & LOCK_PREDICATE) {
+ lock_prdt_set_prdt(lock, prdt);
+ }
+
+ return lock;
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a predicate record.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_insert_check_and_lock(
+/*============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
+ set, does nothing */
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ lock_prdt_t* prdt) /*!< in: Predicates with Minimum Bound
+ Rectangle */
+{
+ ut_ad(block->frame == page_align(rec));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(!index->table->is_temporary());
+ ut_ad(!dict_index_is_clust(index));
+
+ trx_t* trx = thr_get_trx(thr);
+
+ lock_mutex_enter();
+
+ /* Because this code is invoked for a running transaction by
+ the thread that is serving the transaction, it is not necessary
+ to hold trx->mutex here. */
+
+ ut_ad(lock_table_has(trx, index->table, LOCK_IX));
+
+ lock_t* lock;
+
+ /* Only need to check locks on prdt_hash */
+ lock = lock_rec_get_first(&lock_sys.prdt_hash, block, PRDT_HEAPNO);
+
+ if (lock == NULL) {
+ lock_mutex_exit();
+
+ /* Update the page max trx id field */
+ page_update_max_trx_id(block, buf_block_get_page_zip(block),
+ trx->id, mtr);
+
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(lock->type_mode & LOCK_PREDICATE);
+
+ dberr_t err;
+
+ /* If another transaction has an explicit lock request which locks
+ the predicate, waiting or granted, on the successor, the insert
+ has to wait.
+
+ Similar to GAP locks, we do not consider locks taken by
+ inserts to conflict with each other */
+
+ const ulint mode = LOCK_X | LOCK_PREDICATE | LOCK_INSERT_INTENTION;
+
+ const lock_t* wait_for = lock_prdt_other_has_conflicting(
+ mode, block, prdt, trx);
+
+ if (wait_for != NULL) {
+ rtr_mbr_t* mbr = prdt_get_mbr_from_prdt(prdt);
+
+ /* Allocate MBR on the lock heap */
+ lock_init_prdt_from_mbr(prdt, mbr, 0, trx->lock.lock_heap);
+
+ /* Note that we may also get DB_SUCCESS here! */
+ trx_mutex_enter(trx);
+
+ err = lock_rec_enqueue_waiting(
+#ifdef WITH_WSREP
+ NULL, /* FIXME: replicate SPATIAL INDEX locks */
+#endif
+ LOCK_X | LOCK_PREDICATE | LOCK_INSERT_INTENTION,
+ block, PRDT_HEAPNO, index, thr, prdt);
+
+ trx_mutex_exit(trx);
+ } else {
+ err = DB_SUCCESS;
+ }
+
+ lock_mutex_exit();
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ /* Update the page max trx id field */
+ page_update_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ trx->id, mtr);
+ default:
+ /* We only care about the two return values. */
+ break;
+ }
+
+ return(err);
+}
+
+/**************************************************************//**
+Check whether any predicate lock in parent needs to propagate to
+child page after split. */
+void
+lock_prdt_update_parent(
+/*====================*/
+ buf_block_t* left_block, /*!< in/out: page to be split */
+ buf_block_t* right_block, /*!< in/out: the new half page */
+ lock_prdt_t* left_prdt, /*!< in: MBR on the old page */
+ lock_prdt_t* right_prdt, /*!< in: MBR on the new page */
+ const page_id_t page_id) /*!< in: parent page */
+{
+ lock_mutex_enter();
+
+ /* Get all locks in parent */
+ for (lock_t *lock = lock_sys.get_first_prdt(page_id);
+ lock;
+ lock = lock_rec_get_next_on_page(lock)) {
+ lock_prdt_t* lock_prdt;
+ ulint op = PAGE_CUR_DISJOINT;
+
+ ut_ad(lock);
+
+ if (!(lock->type_mode & LOCK_PREDICATE)
+ || (lock->type_mode & LOCK_MODE_MASK) == LOCK_X) {
+ continue;
+ }
+
+ lock_prdt = lock_get_prdt_from_lock(lock);
+
+ /* Check each lock in parent to see if it intersects with
+ left or right child */
+ if (!lock_prdt_consistent(lock_prdt, left_prdt, op)
+ && !lock_prdt_find_on_page(lock->type_mode, left_block,
+ lock_prdt, lock->trx)) {
+ lock_prdt_add_to_queue(lock->type_mode,
+ left_block, lock->index,
+ lock->trx, lock_prdt,
+ FALSE);
+ }
+
+ if (!lock_prdt_consistent(lock_prdt, right_prdt, op)
+ && !lock_prdt_find_on_page(lock->type_mode, right_block,
+ lock_prdt, lock->trx)) {
+ lock_prdt_add_to_queue(lock->type_mode, right_block,
+ lock->index, lock->trx,
+ lock_prdt, FALSE);
+ }
+ }
+
+ lock_mutex_exit();
+}
+
+/**************************************************************//**
+Update predicate lock when page splits */
+static
+void
+lock_prdt_update_split_low(
+/*=======================*/
+ buf_block_t* new_block, /*!< in/out: the new half page */
+ lock_prdt_t* prdt, /*!< in: MBR on the old page */
+ lock_prdt_t* new_prdt, /*!< in: MBR on the new page */
+ const page_id_t page_id, /*!< in: page identifier */
+ unsigned type_mode) /*!< in: LOCK_PREDICATE or
+ LOCK_PRDT_PAGE */
+{
+ lock_t* lock;
+
+ for (lock = lock_sys.get_first(*lock_hash_get(type_mode), page_id);
+ lock;
+ lock = lock_rec_get_next_on_page(lock)) {
+ /* First dealing with Page Lock */
+ if (lock->type_mode & LOCK_PRDT_PAGE) {
+ /* Duplicate the lock to new page */
+ trx_mutex_enter(lock->trx);
+ lock_prdt_add_to_queue(lock->type_mode,
+ new_block,
+ lock->index,
+ lock->trx, NULL, TRUE);
+
+ trx_mutex_exit(lock->trx);
+ continue;
+ }
+
+ /* Now dealing with Predicate Lock */
+ lock_prdt_t* lock_prdt;
+ ulint op = PAGE_CUR_DISJOINT;
+
+ ut_ad(lock->type_mode & LOCK_PREDICATE);
+
+ /* No need to duplicate waiting X locks */
+ if ((lock->type_mode & LOCK_MODE_MASK) == LOCK_X) {
+ continue;
+ }
+
+ lock_prdt = lock_get_prdt_from_lock(lock);
+
+ if (lock_prdt_consistent(lock_prdt, prdt, op)) {
+
+ if (!lock_prdt_consistent(lock_prdt, new_prdt, op)) {
+ /* Move the lock to new page */
+ trx_mutex_enter(lock->trx);
+ lock_prdt_add_to_queue(lock->type_mode,
+ new_block,
+ lock->index,
+ lock->trx, lock_prdt,
+ TRUE);
+ trx_mutex_exit(lock->trx);
+ }
+ } else if (!lock_prdt_consistent(lock_prdt, new_prdt, op)) {
+ /* Duplicate the lock to new page */
+ trx_mutex_enter(lock->trx);
+ lock_prdt_add_to_queue(lock->type_mode,
+ new_block,
+ lock->index,
+ lock->trx, lock_prdt, TRUE);
+
+ trx_mutex_exit(lock->trx);
+ }
+ }
+}
+
+/**************************************************************//**
+Update predicate lock when page splits */
+void
+lock_prdt_update_split(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: the new half page */
+ lock_prdt_t* prdt, /*!< in: MBR on the old page */
+ lock_prdt_t* new_prdt, /*!< in: MBR on the new page */
+ const page_id_t page_id) /*!< in: page identifier */
+{
+ lock_mutex_enter();
+
+ lock_prdt_update_split_low(new_block, prdt, new_prdt,
+ page_id, LOCK_PREDICATE);
+
+ lock_prdt_update_split_low(new_block, NULL, NULL,
+ page_id, LOCK_PRDT_PAGE);
+
+ lock_mutex_exit();
+}
+
+/*********************************************************************//**
+Initialize a predicate lock from an MBR */
+void
+lock_init_prdt_from_mbr(
+/*====================*/
+ lock_prdt_t* prdt, /*!< in/out: predicate to be initialized */
+ rtr_mbr_t* mbr, /*!< in: Minimum Bounding Rectangle */
+ ulint mode, /*!< in: Search mode */
+ mem_heap_t* heap) /*!< in: heap for allocating memory */
+{
+ memset(prdt, 0, sizeof(*prdt));
+
+ if (heap != NULL) {
+ prdt->data = mem_heap_alloc(heap, sizeof(*mbr));
+ memcpy(prdt->data, mbr, sizeof(*mbr));
+ } else {
+ prdt->data = static_cast<void*>(mbr);
+ }
+
+ prdt->op = static_cast<uint16>(mode);
+}
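+
+/* Illustration (not from the original source): a hedged usage sketch,
+with mbr an rtr_mbr_t supplied by the caller. When a heap is passed, the
+MBR is copied so the predicate survives the caller's stack frame; with
+heap == NULL the predicate merely aliases the caller's rtr_mbr_t:
+
+	lock_prdt_t prdt;
+	lock_init_prdt_from_mbr(&prdt, &mbr, PAGE_CUR_INTERSECT,
+				trx->lock.lock_heap);	// deep copy
+*/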
+
+/*********************************************************************//**
+Acquire a predicate lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_lock(
+/*===========*/
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ lock_prdt_t* prdt, /*!< in: Predicate for the lock */
+ dict_index_t* index, /*!< in: secondary index */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned type_mode,
+ /*!< in: LOCK_PREDICATE or LOCK_PRDT_PAGE */
+ que_thr_t* thr) /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+{
+ trx_t* trx = thr_get_trx(thr);
+ dberr_t err = DB_SUCCESS;
+ lock_rec_req_status status = LOCK_REC_SUCCESS;
+
+ if (trx->read_only || index->table->is_temporary()) {
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
+
+ const hash_table_t& hash = type_mode == LOCK_PREDICATE
+ ? lock_sys.prdt_hash
+ : lock_sys.prdt_page_hash;
+
+ /* Another transaction cannot have an implicit lock on the record,
+ because when we come here, we already have modified the clustered
+ index record, and this would not have been possible if another active
+ transaction had modified this secondary index record. */
+
+ lock_mutex_enter();
+
+ const unsigned prdt_mode = type_mode | mode;
+ lock_t* lock = lock_sys.get_first(hash, block->page.id());
+
+ if (lock == NULL) {
+ lock = lock_rec_create(
+#ifdef WITH_WSREP
+ NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */
+#endif
+ prdt_mode, block, PRDT_HEAPNO,
+ index, trx, FALSE);
+
+ status = LOCK_REC_SUCCESS_CREATED;
+ } else {
+ trx_mutex_enter(trx);
+
+ if (lock_rec_get_next_on_page(lock)
+ || lock->trx != trx
+ || lock->type_mode != (LOCK_REC | prdt_mode)
+ || lock_rec_get_n_bits(lock) == 0
+ || ((type_mode & LOCK_PREDICATE)
+ && (!lock_prdt_consistent(
+ lock_get_prdt_from_lock(lock), prdt, 0)))) {
+
+ lock = lock_prdt_has_lock(
+ mode, type_mode, block, prdt, trx);
+
+ if (lock == NULL) {
+
+ lock_t* wait_for;
+
+ wait_for = lock_prdt_other_has_conflicting(
+ prdt_mode, block, prdt, trx);
+
+ if (wait_for != NULL) {
+
+ err = lock_rec_enqueue_waiting(
+#ifdef WITH_WSREP
+ NULL, /* FIXME: replicate
+ SPATIAL INDEX locks */
+#endif
+ prdt_mode,
+ block, PRDT_HEAPNO,
+ index, thr, prdt);
+ } else {
+
+ lock_prdt_add_to_queue(
+ prdt_mode, block, index, trx,
+ prdt, true);
+
+ status = LOCK_REC_SUCCESS;
+ }
+ }
+
+ trx_mutex_exit(trx);
+
+ } else {
+ trx_mutex_exit(trx);
+
+ if (!lock_rec_get_nth_bit(lock, PRDT_HEAPNO)) {
+ lock_rec_set_nth_bit(lock, PRDT_HEAPNO);
+ status = LOCK_REC_SUCCESS_CREATED;
+ }
+ }
+ }
+
+ lock_mutex_exit();
+
+ if (status == LOCK_REC_SUCCESS_CREATED && type_mode == LOCK_PREDICATE) {
+ /* Append the predicate in the lock record */
+ lock_prdt_set_prdt(lock, prdt);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Acquire a "Page" lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_place_prdt_page_lock(
+ const page_id_t page_id, /*!< in: page identifier */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(thr != NULL);
+ ut_ad(!srv_read_only_mode);
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ /* Another transaction cannot have an implicit lock on the record,
+ because when we come here, we already have modified the clustered
+ index record, and this would not have been possible if another active
+ transaction had modified this secondary index record. */
+
+ lock_mutex_enter();
+
+ const lock_t* lock = lock_sys.get_first_prdt_page(page_id);
+ const ulint mode = LOCK_S | LOCK_PRDT_PAGE;
+ trx_t* trx = thr_get_trx(thr);
+
+ if (lock != NULL) {
+
+ trx_mutex_enter(trx);
+
+ /* Find a matching record lock owned by this transaction. */
+
+ while (lock != NULL && lock->trx != trx) {
+
+ lock = lock_rec_get_next_on_page_const(lock);
+ }
+
+ ut_ad(lock == NULL || lock->type_mode == (mode | LOCK_REC));
+ ut_ad(lock == NULL || lock_rec_get_n_bits(lock) != 0);
+
+ trx_mutex_exit(trx);
+ }
+
+ if (lock == NULL) {
+ lock = lock_rec_create_low(
+#ifdef WITH_WSREP
+ NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */
+#endif
+ mode, page_id, NULL, PRDT_HEAPNO,
+ index, trx, FALSE);
+
+#ifdef PRDT_DIAG
+ printf("GIS_DIAGNOSTIC: page lock %d\n", (int) page_no);
+#endif /* PRDT_DIAG */
+ }
+
+ lock_mutex_exit();
+
+ return(DB_SUCCESS);
+}
+
+/** Check whether there is an R-tree page lock on a page
+@param[in] trx trx to test the lock
+@param[in] page_id page identifier
+@return true if there is no lock, or the lock belongs to trx */
+bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id)
+{
+ lock_t* lock;
+
+ lock_mutex_enter();
+
+ lock = lock_sys.get_first_prdt_page(page_id);
+
+ lock_mutex_exit();
+
+ return(!lock || trx == lock->trx);
+}
+
+/*************************************************************//**
+Moves the locks of a page to another page and resets the lock bits of
+the donating records. */
+void
+lock_prdt_rec_move(
+/*===============*/
+ const buf_block_t* receiver, /*!< in: buffer block containing
+ the receiving record */
+ const buf_block_t* donator) /*!< in: buffer block containing
+ the donating record */
+{
+ lock_mutex_enter();
+
+ for (lock_t *lock = lock_rec_get_first(&lock_sys.prdt_hash,
+ donator, PRDT_HEAPNO);
+ lock != NULL;
+ lock = lock_rec_get_next(PRDT_HEAPNO, lock)) {
+
+ const auto type_mode = lock->type_mode;
+ lock_prdt_t* lock_prdt = lock_get_prdt_from_lock(lock);
+
+ lock_rec_reset_nth_bit(lock, PRDT_HEAPNO);
+ lock_reset_lock_and_trx_wait(lock);
+
+ lock_prdt_add_to_queue(
+ type_mode, receiver, lock->index, lock->trx,
+ lock_prdt, FALSE);
+ }
+
+ lock_mutex_exit();
+}
+
+/** Removes predicate lock objects set on an index page which is discarded.
+@param[in] block page to be discarded
+@param[in] lock_hash lock hash */
+void
+lock_prdt_page_free_from_discard(
+ const buf_block_t* block,
+ hash_table_t* lock_hash)
+{
+ lock_t* lock;
+ lock_t* next_lock;
+
+ ut_ad(lock_mutex_own());
+
+ lock = lock_sys.get_first(*lock_hash, block->page.id());
+
+ while (lock != NULL) {
+ next_lock = lock_rec_get_next_on_page(lock);
+
+ lock_rec_discard(lock);
+
+ lock = next_lock;
+ }
+}
diff --git a/storage/innobase/lock/lock0wait.cc b/storage/innobase/lock/lock0wait.cc
new file mode 100644
index 00000000..e5f71e0b
--- /dev/null
+++ b/storage/innobase/lock/lock0wait.cc
@@ -0,0 +1,515 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0wait.cc
+The transaction lock system
+
+Created 25/5/2010 Sunny Bains
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "univ.i"
+#include <mysql/service_thd_wait.h>
+#include <mysql/service_wsrep.h>
+
+#include "srv0mon.h"
+#include "que0que.h"
+#include "lock0lock.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "lock0priv.h"
+#include "srv0srv.h"
+
+/*********************************************************************//**
+Print the contents of the lock_sys_t::waiting_threads array. */
+static
+void
+lock_wait_table_print(void)
+/*=======================*/
+{
+ ut_ad(lock_wait_mutex_own());
+
+ const srv_slot_t* slot = lock_sys.waiting_threads;
+
+ for (ulint i = 0; i < srv_max_n_threads; i++, ++slot) {
+
+ fprintf(stderr,
+ "Slot %lu:"
+ " in use %lu, timeout %lu, time %lu\n",
+ (ulong) i,
+ (ulong) slot->in_use,
+ slot->wait_timeout,
+ (ulong) difftime(time(NULL), slot->suspend_time));
+ }
+}
+
+/*********************************************************************//**
+Release a slot in the lock_sys_t::waiting_threads. Adjust the array last pointer
+if there are empty slots towards the end of the table. */
+static
+void
+lock_wait_table_release_slot(
+/*=========================*/
+ srv_slot_t* slot) /*!< in: slot to release */
+{
+#ifdef UNIV_DEBUG
+ srv_slot_t* upper = lock_sys.waiting_threads + srv_max_n_threads;
+#endif /* UNIV_DEBUG */
+
+ lock_wait_mutex_enter();
+
+ ut_ad(slot->in_use);
+ ut_ad(slot->thr != NULL);
+ ut_ad(slot->thr->slot != NULL);
+ ut_ad(slot->thr->slot == slot);
+
+ /* Must be within the array boundaries. */
+ ut_ad(slot >= lock_sys.waiting_threads);
+ ut_ad(slot < upper);
+
+ /* Note: When we reserve the slot we use the trx_t::mutex to update
+ the slot values to change the state to reserved. Here we are using the
+ lock mutex to change the state of the slot to free. This is by design,
+ because when we query the slot state we always hold both the lock and
+ trx_t::mutex. To reduce contention on the lock mutex when reserving the
+ slot we avoid acquiring the lock mutex. */
+
+ lock_mutex_enter();
+
+ slot->thr->slot = NULL;
+ slot->thr = NULL;
+ slot->in_use = FALSE;
+
+ lock_mutex_exit();
+
+ /* Scan backwards and adjust the last free slot pointer. */
+ for (slot = lock_sys.last_slot;
+ slot > lock_sys.waiting_threads && !slot->in_use;
+ --slot) {
+ /* No op */
+ }
+
+ /* Either the array is empty or the last scanned slot is in use. */
+ ut_ad(slot->in_use || slot == lock_sys.waiting_threads);
+
+ lock_sys.last_slot = slot + 1;
+
+ /* The last slot is either outside of the array boundary or it
+ points to an empty slot. */
+ ut_ad(lock_sys.last_slot == upper || !lock_sys.last_slot->in_use);
+
+ ut_ad(lock_sys.last_slot >= lock_sys.waiting_threads);
+ ut_ad(lock_sys.last_slot <= upper);
+
+ lock_wait_mutex_exit();
+}
+
+/*********************************************************************//**
+Reserves a slot in the thread table for the current user OS thread.
+@return reserved slot */
+static
+srv_slot_t*
+lock_wait_table_reserve_slot(
+/*=========================*/
+ que_thr_t* thr, /*!< in: query thread associated
+ with the user OS thread */
+ ulong wait_timeout) /*!< in: lock wait timeout value */
+{
+ ulint i;
+ srv_slot_t* slot;
+
+ ut_ad(lock_wait_mutex_own());
+ ut_ad(trx_mutex_own(thr_get_trx(thr)));
+
+ slot = lock_sys.waiting_threads;
+
+ for (i = srv_max_n_threads; i--; ++slot) {
+ if (!slot->in_use) {
+ slot->in_use = TRUE;
+ slot->thr = thr;
+ slot->thr->slot = slot;
+
+ if (slot->event == NULL) {
+ slot->event = os_event_create(0);
+ ut_a(slot->event);
+ }
+
+ os_event_reset(slot->event);
+ slot->suspend_time = time(NULL);
+ slot->wait_timeout = wait_timeout;
+
+ if (slot == lock_sys.last_slot) {
+ ++lock_sys.last_slot;
+ }
+
+ ut_ad(lock_sys.last_slot
+ <= lock_sys.waiting_threads + srv_max_n_threads);
+ if (!lock_sys.timeout_timer_active) {
+ lock_sys.timeout_timer_active = true;
+ lock_sys.timeout_timer->set_time(1000, 0);
+ }
+ return(slot);
+ }
+ }
+
+ ib::error() << "There appear to be " << srv_max_n_threads << " user"
+ " threads currently waiting inside InnoDB, which is the upper"
+ " limit. Cannot continue operation. Before aborting, we print"
+ " a list of waiting threads.";
+ lock_wait_table_print();
+
+ ut_error;
+ return(NULL);
+}
+
+#ifdef WITH_WSREP
+/*********************************************************************//**
+Check if a lock timeout happened for a priority (BF) thread;
+as a side effect, trigger the lock monitor.
+@param[in] trx transaction owning the lock
+@param[in] locked true if trx and lock_sys.mutex are owned
+@return false for regular lock timeout */
+static
+bool
+wsrep_is_BF_lock_timeout(
+ const trx_t* trx,
+ bool locked = true)
+{
+ bool long_wait= (trx->error_state != DB_DEADLOCK &&
+ srv_monitor_timer && trx->is_wsrep() &&
+ wsrep_thd_is_BF(trx->mysql_thd, false));
+ bool was_wait= true;
+
+ DBUG_EXECUTE_IF("wsrep_instrument_BF_lock_wait",
+ was_wait=false; long_wait=true;);
+
+ if (long_wait) {
+ ib::info() << "WSREP: BF lock wait long for trx:" << trx->id
+ << " query: " << wsrep_thd_query(trx->mysql_thd);
+
+ if (!locked)
+ lock_mutex_enter();
+
+ ut_ad(lock_mutex_own());
+
+ trx_print_latched(stderr, trx, 3000);
+ /* Note this will release lock_sys mutex */
+ lock_print_info_all_transactions(stderr);
+
+ if (locked)
+ lock_mutex_enter();
+
+ return was_wait;
+ } else
+ return false;
+}
+#endif /* WITH_WSREP */
+
+/***************************************************************//**
+Puts a user OS thread to wait for a lock to be released. If an error
+occurs during the wait trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+void
+lock_wait_suspend_thread(
+/*=====================*/
+ que_thr_t* thr) /*!< in: query thread associated with the
+ user OS thread */
+{
+ srv_slot_t* slot;
+ trx_t* trx;
+ ulong lock_wait_timeout;
+
+ ut_a(lock_sys.timeout_timer.get());
+ trx = thr_get_trx(thr);
+
+ if (trx->mysql_thd != 0) {
+ DEBUG_SYNC_C("lock_wait_suspend_thread_enter");
+ }
+
+ /* InnoDB system transactions (such as the purge, and
+ incomplete transactions that are being rolled back after crash
+ recovery) will use the global value of
+ innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
+ lock_wait_timeout = trx_lock_wait_timeout_get(trx);
+
+ lock_wait_mutex_enter();
+
+ trx_mutex_enter(trx);
+
+ trx->error_state = DB_SUCCESS;
+
+ if (thr->state == QUE_THR_RUNNING) {
+
+ ut_ad(thr->is_active);
+
+ /* The lock has already been released or this transaction
+ was chosen as a deadlock victim: no need to suspend */
+
+ if (trx->lock.was_chosen_as_deadlock_victim) {
+
+ trx->error_state = DB_DEADLOCK;
+ trx->lock.was_chosen_as_deadlock_victim = false;
+ }
+
+ lock_wait_mutex_exit();
+ trx_mutex_exit(trx);
+ return;
+ }
+
+ ut_ad(!thr->is_active);
+
+ slot = lock_wait_table_reserve_slot(thr, lock_wait_timeout);
+
+ lock_wait_mutex_exit();
+ trx_mutex_exit(trx);
+
+ ulonglong start_time = 0;
+
+ if (thr->lock_state == QUE_THR_LOCK_ROW) {
+ srv_stats.n_lock_wait_count.inc();
+ srv_stats.n_lock_wait_current_count++;
+ start_time = my_interval_timer();
+ }
+
+ ulint lock_type = ULINT_UNDEFINED;
+
+ /* The wait_lock can be cleared by another thread when the
+ lock is released. But the wait can only be initiated by the
+ current thread which owns the transaction. Only acquire the
+ mutex if the wait_lock is still active. */
+ if (const lock_t* wait_lock = trx->lock.wait_lock) {
+ lock_mutex_enter();
+ wait_lock = trx->lock.wait_lock;
+ if (wait_lock) {
+ lock_type = lock_get_type_low(wait_lock);
+ }
+ lock_mutex_exit();
+ }
+
+ ulint had_dict_lock = trx->dict_operation_lock_mode;
+
+ switch (had_dict_lock) {
+ case 0:
+ break;
+ case RW_S_LATCH:
+ /* Release foreign key check latch */
+ row_mysql_unfreeze_data_dictionary(trx);
+
+ DEBUG_SYNC_C("lock_wait_release_s_latch_before_sleep");
+ break;
+ default:
+ /* There should never be a lock wait when the
+ dictionary latch is reserved in X mode. Dictionary
+ transactions should only acquire locks on dictionary
+ tables, not other tables. All access to dictionary
+ tables should be covered by dictionary
+ transactions. */
+ ut_error;
+ }
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ /* Suspend this thread and wait for the event. */
+
+ /* Unknown is also treated like a record lock */
+ if (lock_type == ULINT_UNDEFINED || lock_type == LOCK_REC) {
+ thd_wait_begin(trx->mysql_thd, THD_WAIT_ROW_LOCK);
+ } else {
+ ut_ad(lock_type == LOCK_TABLE);
+ thd_wait_begin(trx->mysql_thd, THD_WAIT_TABLE_LOCK);
+ }
+
+ os_event_wait(slot->event);
+
+ thd_wait_end(trx->mysql_thd);
+
+ /* After resuming, reacquire the data dictionary latch if
+ necessary. */
+
+ if (had_dict_lock) {
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ double wait_time = difftime(time(NULL), slot->suspend_time);
+
+ /* Release the slot for others to use */
+
+ lock_wait_table_release_slot(slot);
+
+ if (thr->lock_state == QUE_THR_LOCK_ROW) {
+ const ulonglong finish_time = my_interval_timer();
+
+ if (finish_time >= start_time) {
+ const ulint diff_time = static_cast<ulint>
+ ((finish_time - start_time) / 1000);
+ srv_stats.n_lock_wait_time.add(diff_time);
+ /* Only update the variable if we successfully
+ retrieved the start and finish times. See Bug#36819. */
+ if (diff_time > lock_sys.n_lock_max_wait_time) {
+ lock_sys.n_lock_max_wait_time = diff_time;
+ }
+ /* Record the lock wait time for this thread */
+ thd_storage_lock_wait(trx->mysql_thd, diff_time);
+ }
+
+ srv_stats.n_lock_wait_current_count--;
+
+ DBUG_EXECUTE_IF("lock_instrument_slow_query_log",
+ os_thread_sleep(1000););
+ }
+
+ /* The transaction was chosen as a deadlock victim while it slept. */
+ if (trx->error_state == DB_DEADLOCK) {
+ return;
+ }
+
+ if (lock_wait_timeout < 100000000
+ && wait_time > (double) lock_wait_timeout
+#ifdef WITH_WSREP
+ && (!trx->is_wsrep()
+ || (!wsrep_is_BF_lock_timeout(trx, false)
+ && trx->error_state != DB_DEADLOCK))
+#endif /* WITH_WSREP */
+ ) {
+
+ trx->error_state = DB_LOCK_WAIT_TIMEOUT;
+
+ MONITOR_INC(MONITOR_TIMEOUT);
+ }
+
+ if (trx_is_interrupted(trx)) {
+
+ trx->error_state = DB_INTERRUPTED;
+ }
+}
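+
+/* Illustration (not from the original source): the suspend protocol in
+brief. The waiting thread reserves a slot, releases any dictionary
+S-latch, blocks on slot->event, and on wakeup re-latches, releases the
+slot, accumulates wait statistics, and converts the outcome into
+trx->error_state (DB_DEADLOCK, DB_LOCK_WAIT_TIMEOUT or DB_INTERRUPTED).
+The waking side is lock_wait_release_thread_if_suspended() below or the
+lock_wait_timeout_task() timer. */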
+
+/********************************************************************//**
+Releases a user OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+void
+lock_wait_release_thread_if_suspended(
+/*==================================*/
+ que_thr_t* thr) /*!< in: query thread associated with the
+ user OS thread */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(thr_get_trx(thr)));
+
+ /* We own both the lock mutex and the trx_t::mutex but not the
+ lock wait mutex. This is OK because other threads will see the state
+ of this slot as being in use and no other thread can change the state
+ of the slot to free unless that thread also owns the lock mutex. */
+
+ if (thr->slot != NULL && thr->slot->in_use && thr->slot->thr == thr) {
+ trx_t* trx = thr_get_trx(thr);
+
+ if (trx->lock.was_chosen_as_deadlock_victim) {
+
+ trx->error_state = DB_DEADLOCK;
+ trx->lock.was_chosen_as_deadlock_victim = false;
+ }
+
+ os_event_set(thr->slot->event);
+ }
+}
+
+/*********************************************************************//**
+Check if the thread lock wait has timed out. Release its locks if the
+wait has actually timed out. */
+static
+void
+lock_wait_check_and_cancel(
+/*=======================*/
+ const srv_slot_t* slot) /*!< in: slot reserved by a user
+ thread when the wait started */
+{
+ ut_ad(lock_wait_mutex_own());
+ ut_ad(slot->in_use);
+
+ double wait_time = difftime(time(NULL), slot->suspend_time);
+ trx_t* trx = thr_get_trx(slot->thr);
+
+ if (trx_is_interrupted(trx)
+ || (slot->wait_timeout < 100000000
+ && (wait_time > (double) slot->wait_timeout
+ || wait_time < 0))) {
+
+ /* Timeout exceeded or a wrap-around in system
+ time counter: cancel the lock request queued
+ by the transaction and release possible
+ other transactions waiting behind; it is
+ possible that the lock has already been
+ granted: in that case do nothing */
+
+ lock_mutex_enter();
+
+ trx_mutex_enter(trx);
+
+ if (trx->lock.wait_lock != NULL) {
+
+ ut_a(trx->lock.que_state == TRX_QUE_LOCK_WAIT);
+
+#ifdef WITH_WSREP
+ if (!wsrep_is_BF_lock_timeout(trx)) {
+#endif /* WITH_WSREP */
+ lock_cancel_waiting_and_release(trx->lock.wait_lock);
+#ifdef WITH_WSREP
+ }
+#endif /* WITH_WSREP */
+ }
+
+ lock_mutex_exit();
+
+ trx_mutex_exit(trx);
+ }
+}
+
+/** A task which wakes up threads whose lock wait may have lasted too long */
+void lock_wait_timeout_task(void*)
+{
+ lock_wait_mutex_enter();
+
+ /* Check all slots for user threads that are waiting
+ on locks, and if they have exceeded the time limit. */
+ bool any_slot_in_use= false;
+ for (srv_slot_t *slot= lock_sys.waiting_threads;
+ slot < lock_sys.last_slot; ++slot)
+ {
+ /* We are doing a read without the lock mutex and/or the trx
+ mutex. This is OK because a slot can't be freed or reserved
+ without the lock wait mutex. */
+ if (slot->in_use)
+ {
+ any_slot_in_use= true;
+ lock_wait_check_and_cancel(slot);
+ }
+ }
+
+ if (any_slot_in_use)
+ lock_sys.timeout_timer->set_time(1000, 0);
+ else
+ lock_sys.timeout_timer_active= false;
+
+ lock_wait_mutex_exit();
+}
diff --git a/storage/innobase/log/log0crypt.cc b/storage/innobase/log/log0crypt.cc
new file mode 100644
index 00000000..dbf41c7d
--- /dev/null
+++ b/storage/innobase/log/log0crypt.cc
@@ -0,0 +1,429 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (C) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file log0crypt.cc
+InnoDB redo log encryption/decryption
+
+Created 11/25/2013 Minli Zhu Google
+Modified Jan Lindström jan.lindstrom@mariadb.com
+MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation.
+*******************************************************/
+#include <my_global.h>
+#include "log0crypt.h"
+#include <mysql/service_my_crypt.h>
+#include "assume_aligned.h"
+
+#include "log0crypt.h"
+#include "log0recv.h" // for recv_sys
+
+/** innodb_encrypt_log: whether to encrypt the redo log */
+my_bool srv_encrypt_log;
+
+/** Redo log encryption key ID */
+#define LOG_DEFAULT_ENCRYPTION_KEY 1
+
+struct crypt_info_t {
+ ulint checkpoint_no; /*!< checkpoint no; 32 bits */
+ uint key_version; /*!< mysqld key version */
+ /** random string for encrypting the key */
+ alignas(8) byte crypt_msg[MY_AES_BLOCK_SIZE];
+ /** the secret key */
+ alignas(8) byte crypt_key[MY_AES_BLOCK_SIZE];
+ /** a random string for the per-block initialization vector */
+ alignas(4) byte crypt_nonce[4];
+};
+
+/** The crypt info */
+static crypt_info_t info;
+
+/** Initialization vector used for temporary files/tablespace */
+static byte tmp_iv[MY_AES_BLOCK_SIZE];
+
+/** Crypt info when upgrading from 10.1 */
+static crypt_info_t infos[5 * 2];
+/** First unused slot in infos[] */
+static size_t infos_used;
+
+/*********************************************************************//**
+Get a log block's start lsn.
+@return a log block's start lsn */
+static inline
+lsn_t
+log_block_get_start_lsn(
+/*====================*/
+ lsn_t lsn, /*!< in: checkpoint lsn */
+ ulint log_block_no) /*!< in: log block number */
+{
+ lsn_t start_lsn =
+ (lsn & (lsn_t)0xffffffff00000000ULL) |
+ (((log_block_no - 1) & (lsn_t)0x3fffffff) << 9);
+ return start_lsn;
+}
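+
+/* Illustration (not from the original source): each log block spans 512
+(OS_FILE_LOG_BLOCK_SIZE) bytes, and as the arithmetic above implies,
+bits 9..38 of the start LSN hold (log_block_no - 1). For example, with
+the checkpoint LSN's upper half 0x100000000 and log_block_no = 4, the
+start LSN is 0x100000000 | ((4 - 1) << 9) = 0x100000600. */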
+
+/** Generate crypt key from crypt msg.
+@param[in,out] info encryption key
+@param[in] upgrade whether to use the key in MariaDB 10.1 format
+@return whether the operation was successful */
+static bool init_crypt_key(crypt_info_t* info, bool upgrade = false)
+{
+ byte mysqld_key[MY_AES_MAX_KEY_LENGTH];
+ uint keylen = sizeof mysqld_key;
+
+ compile_time_assert(16 == sizeof info->crypt_key);
+ compile_time_assert(16 == MY_AES_BLOCK_SIZE);
+
+ if (uint rc = encryption_key_get(LOG_DEFAULT_ENCRYPTION_KEY,
+ info->key_version, mysqld_key,
+ &keylen)) {
+ ib::error()
+ << "Obtaining redo log encryption key version "
+ << info->key_version << " failed (" << rc
+ << "). Maybe the key or the required encryption "
+ "key management plugin was not found.";
+ info->key_version = ENCRYPTION_KEY_VERSION_INVALID;
+ return false;
+ }
+
+ if (upgrade) {
+ while (keylen < sizeof mysqld_key) {
+ mysqld_key[keylen++] = 0;
+ }
+ }
+
+ uint dst_len;
+ int err= my_aes_crypt(MY_AES_ECB,
+ ENCRYPTION_FLAG_NOPAD | ENCRYPTION_FLAG_ENCRYPT,
+ info->crypt_msg, MY_AES_BLOCK_SIZE,
+ info->crypt_key, &dst_len,
+ mysqld_key, keylen, NULL, 0);
+
+ if (err != MY_AES_OK || dst_len != MY_AES_BLOCK_SIZE) {
+ ib::error() << "Getting redo log crypto key failed: err = "
+ << err << ", len = " << dst_len;
+ info->key_version = ENCRYPTION_KEY_VERSION_INVALID;
+ return false;
+ }
+
+ return true;
+}
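+
+/* Illustration (not from the original source): the derived key is
+crypt_key = AES_ECB_encrypt(crypt_msg, mysqld_key) with no padding, so
+only the random crypt_msg (persisted in the checkpoint pages) and the
+key version ever reach the redo log; the server's encryption key itself
+is never written out. */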
+
+/** Encrypt or decrypt log blocks.
+@param[in,out] buf log blocks to encrypt or decrypt
+@param[in] lsn log sequence number of the start of the buffer
+@param[in] size size of the buffer, in bytes
+@param[in] op whether to decrypt, encrypt, or rotate key and encrypt
+@return whether the operation succeeded (encrypt always does) */
+bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op)
+{
+ ut_ad(size % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(ulint(buf) % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_a(info.key_version);
+
+ alignas(8) byte aes_ctr_iv[MY_AES_BLOCK_SIZE];
+
+#define LOG_CRYPT_HDR_SIZE 4
+ lsn &= ~lsn_t(OS_FILE_LOG_BLOCK_SIZE - 1);
+
+ for (const byte* const end = buf + size; buf != end;
+ buf += OS_FILE_LOG_BLOCK_SIZE, lsn += OS_FILE_LOG_BLOCK_SIZE) {
+ alignas(4) byte dst[OS_FILE_LOG_BLOCK_SIZE - LOG_CRYPT_HDR_SIZE
+ - LOG_BLOCK_CHECKSUM];
+
+ /* The log block number is not encrypted. */
+ memcpy_aligned<4>(dst, buf + LOG_BLOCK_HDR_NO, 4);
+ memcpy_aligned<4>(aes_ctr_iv, buf + LOG_BLOCK_HDR_NO, 4);
+ *aes_ctr_iv &= byte(~(LOG_BLOCK_FLUSH_BIT_MASK >> 24));
+ static_assert(LOG_BLOCK_HDR_NO + 4 == LOG_CRYPT_HDR_SIZE,
+ "compatibility");
+ memcpy_aligned<4>(aes_ctr_iv + 4, info.crypt_nonce, 4);
+ mach_write_to_8(my_assume_aligned<8>(aes_ctr_iv + 8), lsn);
+ ut_ad(log_block_get_start_lsn(lsn,
+ log_block_get_hdr_no(buf))
+ == lsn);
+ byte* key_ver = &buf[OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_KEY
+ - LOG_BLOCK_CHECKSUM];
+ const size_t dst_size
+ = log_sys.has_encryption_key_rotation()
+ ? sizeof dst - LOG_BLOCK_KEY
+ : sizeof dst;
+ if (log_sys.has_encryption_key_rotation()) {
+ const uint key_version = info.key_version;
+ switch (op) {
+ case LOG_ENCRYPT_ROTATE_KEY:
+ info.key_version
+ = encryption_key_get_latest_version(
+ LOG_DEFAULT_ENCRYPTION_KEY);
+ if (key_version != info.key_version
+ && !init_crypt_key(&info)) {
+ info.key_version = key_version;
+ }
+ /* fall through */
+ case LOG_ENCRYPT:
+ mach_write_to_4(key_ver, info.key_version);
+ break;
+ case LOG_DECRYPT:
+ info.key_version = mach_read_from_4(key_ver);
+ if (key_version != info.key_version
+ && !init_crypt_key(&info)) {
+ return false;
+ }
+ }
+#ifndef DBUG_OFF
+ if (key_version != info.key_version) {
+ DBUG_PRINT("ib_log", ("key_version: %x -> %x",
+ key_version,
+ info.key_version));
+ }
+#endif /* !DBUG_OFF */
+ }
+
+ ut_ad(LOG_CRYPT_HDR_SIZE + dst_size
+ == log_sys.trailer_offset());
+
+ uint dst_len;
+ int rc = encryption_crypt(
+ buf + LOG_CRYPT_HDR_SIZE, static_cast<uint>(dst_size),
+ reinterpret_cast<byte*>(dst), &dst_len,
+ const_cast<byte*>(info.crypt_key),
+ MY_AES_BLOCK_SIZE,
+ aes_ctr_iv, sizeof aes_ctr_iv,
+ op == LOG_DECRYPT
+ ? ENCRYPTION_FLAG_DECRYPT | ENCRYPTION_FLAG_NOPAD
+ : ENCRYPTION_FLAG_ENCRYPT | ENCRYPTION_FLAG_NOPAD,
+ LOG_DEFAULT_ENCRYPTION_KEY,
+ info.key_version);
+ ut_a(rc == MY_AES_OK);
+ ut_a(dst_len == dst_size);
+ memcpy(buf + LOG_CRYPT_HDR_SIZE, dst, dst_size);
+ }
+
+ return true;
+}
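+
+/* Layout of the 16-byte AES-CTR IV assembled in the loop above
+(byte offsets, as implied by the memcpy/mach_write calls):
+0..3 log block number with the flush bit cleared, 4..7 crypt_nonce
+from the checkpoint information, 8..15 start lsn of the block; the
+keystream is therefore unique per block for a given key and nonce. */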
+
+/** Initialize the redo log encryption key and random parameters
+when creating a new redo log.
+The random parameters will be persisted in the log checkpoint pages.
+@see log_crypt_write_checkpoint_buf()
+@see log_crypt_read_checkpoint_buf()
+@return whether the operation succeeded */
+bool log_crypt_init()
+{
+ info.key_version=
+ encryption_key_get_latest_version(LOG_DEFAULT_ENCRYPTION_KEY);
+
+ if (info.key_version == ENCRYPTION_KEY_VERSION_INVALID)
+ ib::error() << "log_crypt_init(): cannot get key version";
+ else if (my_random_bytes(tmp_iv, MY_AES_BLOCK_SIZE) != MY_AES_OK ||
+ my_random_bytes(info.crypt_msg, sizeof info.crypt_msg) !=
+ MY_AES_OK ||
+ my_random_bytes(info.crypt_nonce, sizeof info.crypt_nonce) !=
+ MY_AES_OK)
+ ib::error() << "log_crypt_init(): my_random_bytes() failed";
+ else if (init_crypt_key(&info))
+ goto func_exit;
+
+ info.key_version= 0;
+func_exit:
+ return info.key_version != 0;
+}
+
+/** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info.
+@param[in] buf checkpoint buffer
+@return whether the operation was successful */
+ATTRIBUTE_COLD bool log_crypt_101_read_checkpoint(const byte* buf)
+{
+ buf += 20 + 32 * 9;
+
+ const size_t n = *buf++ == 2 ? std::min(unsigned(*buf++), 5U) : 0;
+
+ for (size_t i = 0; i < n; i++) {
+ struct crypt_info_t& info = infos[infos_used];
+ unsigned checkpoint_no = mach_read_from_4(buf);
+ for (size_t j = 0; j < infos_used; j++) {
+ if (infos[j].checkpoint_no == checkpoint_no) {
+ /* Do not overwrite an existing slot. */
+ goto next_slot;
+ }
+ }
+ if (infos_used >= UT_ARR_SIZE(infos)) {
+ ut_ad("too many checkpoint pages" == 0);
+ goto next_slot;
+ }
+ infos_used++;
+ info.checkpoint_no = checkpoint_no;
+ info.key_version = mach_read_from_4(buf + 4);
+ memcpy(info.crypt_msg, buf + 8, MY_AES_BLOCK_SIZE);
+ memcpy(info.crypt_nonce, buf + 24, sizeof info.crypt_nonce);
+
+ if (!init_crypt_key(&info, true)) {
+ return false;
+ }
+next_slot:
+ buf += 4 + 4 + 2 * MY_AES_BLOCK_SIZE;
+ }
+
+ return true;
+}
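+
+/* Each slot parsed above occupies 4 + 4 + 2 * MY_AES_BLOCK_SIZE = 40
+bytes: the checkpoint number, the key version, the 16-byte crypt_msg,
+and 16 bytes reserved for the nonce, of which only the first
+sizeof crypt_nonce = 4 bytes are read. */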
+
+/** Decrypt a MariaDB 10.1 redo log block.
+@param[in,out] buf log block
+@param[in] start_lsn server start LSN
+@return whether the decryption was successful */
+ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn)
+{
+ const uint32_t checkpoint_no
+ = uint32_t(log_block_get_checkpoint_no(buf));
+ const crypt_info_t* info = infos;
+ for (const crypt_info_t* const end = info + infos_used; info < end;
+ info++) {
+ if (info->key_version
+ && info->key_version != ENCRYPTION_KEY_VERSION_INVALID
+ && info->checkpoint_no == checkpoint_no) {
+ goto found;
+ }
+ }
+
+ if (infos_used == 0) {
+ return false;
+ }
+ /* MariaDB Server 10.1 would use the first key if it fails to
+ find a key for the current checkpoint. */
+ info = infos;
+ if (info->key_version == ENCRYPTION_KEY_VERSION_INVALID) {
+ return false;
+ }
+found:
+ byte dst[OS_FILE_LOG_BLOCK_SIZE];
+ uint dst_len;
+ byte aes_ctr_iv[MY_AES_BLOCK_SIZE];
+
+ const uint src_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE;
+
+ ulint log_block_no = log_block_get_hdr_no(buf);
+
+ /* The log block header is not encrypted. */
+ memcpy(dst, buf, LOG_BLOCK_HDR_SIZE);
+
+ memcpy(aes_ctr_iv, info->crypt_nonce, 3);
+ mach_write_to_8(aes_ctr_iv + 3,
+ log_block_get_start_lsn(start_lsn, log_block_no));
+ memcpy(aes_ctr_iv + 11, buf, 4);
+ aes_ctr_iv[11] &= byte(~(LOG_BLOCK_FLUSH_BIT_MASK >> 24));
+ aes_ctr_iv[15] = 0;
+
+ int rc = encryption_crypt(buf + LOG_BLOCK_HDR_SIZE, src_len,
+ dst + LOG_BLOCK_HDR_SIZE, &dst_len,
+ const_cast<byte*>(info->crypt_key),
+ MY_AES_BLOCK_SIZE,
+ aes_ctr_iv, MY_AES_BLOCK_SIZE,
+ ENCRYPTION_FLAG_DECRYPT
+ | ENCRYPTION_FLAG_NOPAD,
+ LOG_DEFAULT_ENCRYPTION_KEY,
+ info->key_version);
+
+ if (rc != MY_AES_OK || dst_len != src_len) {
+ return false;
+ }
+
+ memcpy(buf, dst, sizeof dst);
+ return true;
+}
+
+/** Add the encryption information to a redo log checkpoint buffer.
+@param[in,out] buf checkpoint buffer */
+UNIV_INTERN
+void
+log_crypt_write_checkpoint_buf(byte* buf)
+{
+ ut_ad(info.key_version);
+ compile_time_assert(16 == sizeof info.crypt_msg);
+ compile_time_assert(16 == MY_AES_BLOCK_SIZE);
+ compile_time_assert(LOG_CHECKPOINT_CRYPT_MESSAGE
+ - LOG_CHECKPOINT_CRYPT_NONCE
+ == sizeof info.crypt_nonce);
+
+ memcpy(buf + LOG_CHECKPOINT_CRYPT_MESSAGE, info.crypt_msg,
+ MY_AES_BLOCK_SIZE);
+ memcpy(buf + LOG_CHECKPOINT_CRYPT_NONCE, info.crypt_nonce,
+ sizeof info.crypt_nonce);
+ mach_write_to_4(buf + LOG_CHECKPOINT_CRYPT_KEY, info.key_version);
+}
+
+/** Read the checkpoint crypto (version, msg and iv) info.
+@param[in] buf checkpoint buffer
+@return whether the operation was successful */
+bool log_crypt_read_checkpoint_buf(const byte* buf)
+{
+ info.checkpoint_no = mach_read_from_4(buf + (LOG_CHECKPOINT_NO + 4));
+ info.key_version = mach_read_from_4(buf + LOG_CHECKPOINT_CRYPT_KEY);
+
+#if MY_AES_BLOCK_SIZE != 16
+# error "MY_AES_BLOCK_SIZE != 16; redo log checkpoint format affected"
+#endif
+ compile_time_assert(16 == sizeof info.crypt_msg);
+ compile_time_assert(16 == MY_AES_BLOCK_SIZE);
+ compile_time_assert(LOG_CHECKPOINT_CRYPT_MESSAGE
+ - LOG_CHECKPOINT_CRYPT_NONCE
+ == sizeof info.crypt_nonce);
+
+ memcpy(info.crypt_msg, buf + LOG_CHECKPOINT_CRYPT_MESSAGE,
+ MY_AES_BLOCK_SIZE);
+ memcpy(info.crypt_nonce, buf + LOG_CHECKPOINT_CRYPT_NONCE,
+ sizeof info.crypt_nonce);
+
+ return init_crypt_key(&info);
+}
+
+/** Encrypt or decrypt a temporary file block.
+@param[in] src block to encrypt or decrypt
+@param[in] size size of the block
+@param[out] dst destination block
+@param[in] offs offset to block
+@param[in] encrypt true=encrypt; false=decrypt
+@return whether the operation succeeded */
+UNIV_INTERN
+bool
+log_tmp_block_encrypt(
+ const byte* src,
+ ulint size,
+ byte* dst,
+ uint64_t offs,
+ bool encrypt)
+{
+ uint dst_len;
+ uint64_t iv[MY_AES_BLOCK_SIZE / sizeof(uint64_t)];
+ iv[0] = offs;
+ memcpy(iv + 1, tmp_iv, sizeof iv - sizeof *iv);
+
+ int rc = encryption_crypt(
+ src, uint(size), dst, &dst_len,
+ const_cast<byte*>(info.crypt_key), MY_AES_BLOCK_SIZE,
+ reinterpret_cast<byte*>(iv), uint(sizeof iv),
+ encrypt
+ ? ENCRYPTION_FLAG_ENCRYPT|ENCRYPTION_FLAG_NOPAD
+ : ENCRYPTION_FLAG_DECRYPT|ENCRYPTION_FLAG_NOPAD,
+ LOG_DEFAULT_ENCRYPTION_KEY, info.key_version);
+
+ if (rc != MY_AES_OK) {
+ ib::error() << (encrypt ? "Encryption" : "Decryption")
+ << " failed for temporary file: " << rc;
+ }
+
+ return rc == MY_AES_OK;
+}
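+
+/* Note on the IV above: its first 8 bytes are the file offset of the
+block and the remaining 8 bytes come from the random tmp_iv, so equal
+plaintext blocks at different offsets of a temporary file encrypt to
+different ciphertext. */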
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
new file mode 100644
index 00000000..a6fa50dd
--- /dev/null
+++ b/storage/innobase/log/log0log.cc
@@ -0,0 +1,1340 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2009, Google Inc.
+Copyright (c) 2014, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0log.cc
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <debug_sync.h>
+#include <my_service_manager.h>
+
+#include "log0log.h"
+#include "log0crypt.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "lock0lock.h"
+#include "log0recv.h"
+#include "fil0fil.h"
+#include "dict0stats_bg.h"
+#include "btr0defragment.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "srv0mon.h"
+#include "sync0sync.h"
+#include "buf0dump.h"
+#include "log0sync.h"
+
+/*
+General philosophy of the InnoDB redo log:
+
+Every change to the contents of a data page must be made
+through mtr_t, and mtr_t::commit() will write log records
+to the InnoDB redo log. */
+
+/** Redo log system */
+log_t log_sys;
+
+/* A margin for free space in the log buffer before a log entry is catenated */
+#define LOG_BUF_WRITE_MARGIN (4 * OS_FILE_LOG_BLOCK_SIZE)
+
+/* Margins for free space in the log buffer after a log entry is catenated */
+#define LOG_BUF_FLUSH_RATIO 2
+#define LOG_BUF_FLUSH_MARGIN (LOG_BUF_WRITE_MARGIN \
+ + (4U << srv_page_size_shift))
+
+/** Extends the log buffer.
+@param[in] len requested minimum size in bytes */
+void log_buffer_extend(ulong len)
+{
+ const size_t new_buf_size = ut_calc_align(len, srv_page_size);
+ byte* new_buf = static_cast<byte*>
+ (ut_malloc_dontdump(new_buf_size, PSI_INSTRUMENT_ME));
+ TRASH_ALLOC(new_buf, new_buf_size);
+ byte* new_flush_buf = static_cast<byte*>
+ (ut_malloc_dontdump(new_buf_size, PSI_INSTRUMENT_ME));
+ TRASH_ALLOC(new_flush_buf, new_buf_size);
+
+ mysql_mutex_lock(&log_sys.mutex);
+
+ if (len <= srv_log_buffer_size) {
+ /* Already extended enough by the others */
+ mysql_mutex_unlock(&log_sys.mutex);
+ ut_free_dodump(new_buf, new_buf_size);
+ ut_free_dodump(new_flush_buf, new_buf_size);
+ return;
+ }
+
+ ib::warn() << "The redo log transaction size " << len <<
+ " exceeds innodb_log_buffer_size="
+ << srv_log_buffer_size << " / 2). Trying to extend it.";
+
+ byte* old_buf = log_sys.buf;
+ byte* old_flush_buf = log_sys.flush_buf;
+ const ulong old_buf_size = srv_log_buffer_size;
+ srv_log_buffer_size = static_cast<ulong>(new_buf_size);
+ log_sys.buf = new_buf;
+ log_sys.flush_buf = new_flush_buf;
+ memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(new_buf, old_buf,
+ log_sys.buf_free);
+
+ log_sys.max_buf_free = new_buf_size / LOG_BUF_FLUSH_RATIO
+ - LOG_BUF_FLUSH_MARGIN;
+
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ ut_free_dodump(old_buf, old_buf_size);
+ ut_free_dodump(old_flush_buf, old_buf_size);
+
+ ib::info() << "innodb_log_buffer_size was extended to "
+ << new_buf_size << ".";
+}
+
+/** Calculate the recommended highest values for lsn - last_checkpoint_lsn
+and lsn - buf_pool.get_oldest_modification().
+@param[in] file_size requested innodb_log_file_size
+@retval true on success
+@retval false if the smallest log group is too small to
+accommodate the number of OS threads in the database server */
+bool
+log_set_capacity(ulonglong file_size)
+{
+ /* Margin for the free space in the smallest log, before a new query
+ step which modifies the database, is started */
+ const size_t LOG_CHECKPOINT_FREE_PER_THREAD = 4U
+ << srv_page_size_shift;
+ const size_t LOG_CHECKPOINT_EXTRA_FREE = 8U << srv_page_size_shift;
+
+ lsn_t margin;
+ ulint free;
+
+ lsn_t smallest_capacity = file_size - LOG_FILE_HDR_SIZE;
+ /* Add extra safety */
+ smallest_capacity -= smallest_capacity / 10;
+
+ /* For each OS thread we must reserve so much free space in the
+ smallest log group that it can accommodate the log entries produced
+ by single query steps: running out of free log space is a serious
+ system error which requires rebooting the database. */
+
+ free = LOG_CHECKPOINT_FREE_PER_THREAD * 10
+ + LOG_CHECKPOINT_EXTRA_FREE;
+ if (free >= smallest_capacity / 2) {
+ ib::error() << "Cannot continue operation because log file is "
+ "too small. Increase innodb_log_file_size "
+ "or decrease innodb_thread_concurrency. "
+ << INNODB_PARAMETERS_MSG;
+ return false;
+ }
+
+ margin = smallest_capacity - free;
+ margin = margin - margin / 10; /* Add still some extra safety */
+
+ mysql_mutex_lock(&log_sys.mutex);
+
+ log_sys.log_capacity = smallest_capacity;
+
+ log_sys.max_modified_age_async = margin - margin / 8;
+ log_sys.max_checkpoint_age = margin;
+
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ return(true);
+}
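+
+/* A hedged numeric sketch, assuming srv_page_size = 16 KiB and
+innodb_log_file_size = 48 MiB (and ignoring the small LOG_FILE_HDR_SIZE):
+free = 10 * 64 KiB + 128 KiB = 768 KiB, smallest_capacity ~ 0.9 * 48 MiB
+~ 43 MiB, margin ~ 0.9 * (43 MiB - 768 KiB) ~ 38 MiB; the function then
+sets max_checkpoint_age = margin and max_modified_age_async
+= margin - margin / 8 ~ 33 MiB, and it would fail only if free
+exceeded half of smallest_capacity. */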
+
+/** Initialize the redo log subsystem. */
+void log_t::create()
+{
+ ut_ad(this == &log_sys);
+ ut_ad(!is_initialised());
+ m_initialised= true;
+
+ mysql_mutex_init(log_sys_mutex_key, &mutex, nullptr);
+ mysql_mutex_init(log_flush_order_mutex_key, &flush_order_mutex, nullptr);
+
+ /* Start the lsn from one log block from zero: this way every
+ log record has a non-zero start lsn, a fact which we will use */
+
+ set_lsn(LOG_START_LSN + LOG_BLOCK_HDR_SIZE);
+ set_flushed_lsn(LOG_START_LSN + LOG_BLOCK_HDR_SIZE);
+
+ ut_ad(srv_log_buffer_size >= 16 * OS_FILE_LOG_BLOCK_SIZE);
+ ut_ad(srv_log_buffer_size >= 4U << srv_page_size_shift);
+
+ buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size,
+ PSI_INSTRUMENT_ME));
+ TRASH_ALLOC(buf, srv_log_buffer_size);
+ flush_buf= static_cast<byte*>(ut_malloc_dontdump(srv_log_buffer_size,
+ PSI_INSTRUMENT_ME));
+ TRASH_ALLOC(flush_buf, srv_log_buffer_size);
+
+ max_buf_free= srv_log_buffer_size / LOG_BUF_FLUSH_RATIO -
+ LOG_BUF_FLUSH_MARGIN;
+ set_check_flush_or_checkpoint();
+
+ n_log_ios_old= n_log_ios;
+ last_printout_time= time(NULL);
+
+ buf_next_to_write= 0;
+ last_checkpoint_lsn= write_lsn= LOG_START_LSN;
+ n_log_ios= 0;
+ n_log_ios_old= 0;
+ log_capacity= 0;
+ max_modified_age_async= 0;
+ max_checkpoint_age= 0;
+ next_checkpoint_no= 0;
+ next_checkpoint_lsn= 0;
+ n_pending_checkpoint_writes= 0;
+
+ log_block_init(buf, LOG_START_LSN);
+ log_block_set_first_rec_group(buf, LOG_BLOCK_HDR_SIZE);
+
+ buf_free= LOG_BLOCK_HDR_SIZE;
+ checkpoint_buf= static_cast<byte*>
+ (aligned_malloc(OS_FILE_LOG_BLOCK_SIZE, OS_FILE_LOG_BLOCK_SIZE));
+}
+
+mapped_file_t::~mapped_file_t() noexcept
+{
+ if (!m_area.empty())
+ unmap();
+}
+
+dberr_t mapped_file_t::map(const char *path, bool read_only,
+ bool nvme) noexcept
+{
+ auto fd= mysql_file_open(innodb_log_file_key, path,
+ read_only ? O_RDONLY : O_RDWR, MYF(MY_WME));
+ if (fd == -1)
+ return DB_ERROR;
+
+ const auto file_size= os_file_get_size(path).m_total_size;
+
+ const int nvme_flag= nvme ? MAP_SYNC : 0;
+ void *ptr= my_mmap(0, static_cast<size_t>(file_size),
+ read_only ? PROT_READ : PROT_READ | PROT_WRITE,
+ MAP_SHARED_VALIDATE | nvme_flag, fd, 0);
+ mysql_file_close(fd, MYF(MY_WME));
+
+ if (ptr == MAP_FAILED)
+ return DB_ERROR;
+
+ m_area= {static_cast<byte *>(ptr),
+ static_cast<span<byte>::size_type>(file_size)};
+ return DB_SUCCESS;
+}
+
+dberr_t mapped_file_t::unmap() noexcept
+{
+ ut_ad(!m_area.empty());
+
+ if (my_munmap(m_area.data(), m_area.size()))
+ return DB_ERROR;
+
+ m_area= {};
+ return DB_SUCCESS;
+}
+
+file_os_io::file_os_io(file_os_io &&rhs) : m_fd(rhs.m_fd)
+{
+ rhs.m_fd= OS_FILE_CLOSED;
+}
+
+file_os_io &file_os_io::operator=(file_os_io &&rhs)
+{
+ std::swap(m_fd, rhs.m_fd);
+ return *this;
+}
+
+file_os_io::~file_os_io() noexcept
+{
+ if (is_opened())
+ close();
+}
+
+dberr_t file_os_io::open(const char *path, bool read_only) noexcept
+{
+ ut_ad(!is_opened());
+
+ bool success;
+ auto tmp_fd= os_file_create(
+ innodb_log_file_key, path, OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
+ OS_FILE_NORMAL, OS_LOG_FILE, read_only, &success);
+ if (!success)
+ return DB_ERROR;
+
+ m_durable_writes= srv_file_flush_method == SRV_O_DSYNC;
+ m_fd= tmp_fd;
+  return DB_SUCCESS;
+}
+
+dberr_t file_os_io::rename(const char *old_path, const char *new_path) noexcept
+{
+ return os_file_rename(innodb_log_file_key, old_path, new_path) ? DB_SUCCESS
+ : DB_ERROR;
+}
+
+dberr_t file_os_io::close() noexcept
+{
+ if (!os_file_close(m_fd))
+ return DB_ERROR;
+
+ m_fd= OS_FILE_CLOSED;
+ return DB_SUCCESS;
+}
+
+dberr_t file_os_io::read(os_offset_t offset, span<byte> buf) noexcept
+{
+ return os_file_read(IORequestRead, m_fd, buf.data(), offset, buf.size());
+}
+
+dberr_t file_os_io::write(const char *path, os_offset_t offset,
+ span<const byte> buf) noexcept
+{
+ return os_file_write(IORequestWrite, path, m_fd, buf.data(), offset,
+ buf.size());
+}
+
+dberr_t file_os_io::flush() noexcept
+{
+ return os_file_flush(m_fd) ? DB_SUCCESS : DB_ERROR;
+}
+
+#ifdef HAVE_PMEM
+
+#include <libpmem.h>
+
+static bool is_pmem(const char *path) noexcept
+{
+ mapped_file_t mf;
+  return mf.map(path, true, true) == DB_SUCCESS;
+}
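+
+/* This probe relies on Linux MAP_SYNC semantics: mapping with
+MAP_SHARED_VALIDATE | MAP_SYNC is expected to succeed only when the
+file lives on a DAX-capable (persistent memory) filesystem, in which
+case the log can be written with pmem_memcpy_persist() instead of
+write() followed by fsync(). */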
+
+class file_pmem_io final : public file_io
+{
+public:
+ file_pmem_io() noexcept : file_io(true) {}
+
+ dberr_t open(const char *path, bool read_only) noexcept final
+ {
+ return m_file.map(path, read_only, true);
+ }
+ dberr_t rename(const char *old_path, const char *new_path) noexcept final
+ {
+ return os_file_rename(innodb_log_file_key, old_path, new_path) ? DB_SUCCESS
+ : DB_ERROR;
+ }
+ dberr_t close() noexcept final { return m_file.unmap(); }
+ dberr_t read(os_offset_t offset, span<byte> buf) noexcept final
+ {
+ memcpy(buf.data(), m_file.data() + offset, buf.size());
+ return DB_SUCCESS;
+ }
+ dberr_t write(const char *, os_offset_t offset,
+ span<const byte> buf) noexcept final
+ {
+ pmem_memcpy_persist(m_file.data() + offset, buf.data(), buf.size());
+ return DB_SUCCESS;
+ }
+ dberr_t flush() noexcept final
+ {
+ ut_ad(0);
+ return DB_SUCCESS;
+ }
+
+private:
+ mapped_file_t m_file;
+};
+#endif
+
+dberr_t log_file_t::open(bool read_only) noexcept
+{
+ ut_a(!is_opened());
+
+#ifdef HAVE_PMEM
+ auto ptr= is_pmem(m_path.c_str())
+ ? std::unique_ptr<file_io>(new file_pmem_io)
+ : std::unique_ptr<file_io>(new file_os_io);
+#else
+ auto ptr= std::unique_ptr<file_io>(new file_os_io);
+#endif
+
+ if (dberr_t err= ptr->open(m_path.c_str(), read_only))
+ return err;
+
+ m_file= std::move(ptr);
+ return DB_SUCCESS;
+}
+
+bool log_file_t::is_opened() const noexcept
+{
+ return static_cast<bool>(m_file);
+}
+
+dberr_t log_file_t::rename(std::string new_path) noexcept
+{
+ if (dberr_t err= m_file->rename(m_path.c_str(), new_path.c_str()))
+ return err;
+
+ m_path = std::move(new_path);
+ return DB_SUCCESS;
+}
+
+dberr_t log_file_t::close() noexcept
+{
+ ut_a(is_opened());
+
+ if (dberr_t err= m_file->close())
+ return err;
+
+ m_file.reset();
+ return DB_SUCCESS;
+}
+
+dberr_t log_file_t::read(os_offset_t offset, span<byte> buf) noexcept
+{
+ ut_ad(is_opened());
+ return m_file->read(offset, buf);
+}
+
+bool log_file_t::writes_are_durable() const noexcept
+{
+ return m_file->writes_are_durable();
+}
+
+dberr_t log_file_t::write(os_offset_t offset, span<const byte> buf) noexcept
+{
+ ut_ad(is_opened());
+ return m_file->write(m_path.c_str(), offset, buf);
+}
+
+dberr_t log_file_t::flush() noexcept
+{
+ ut_ad(is_opened());
+ return m_file->flush();
+}
+
+void log_t::file::open_file(std::string path)
+{
+ fd= log_file_t(std::move(path));
+ if (const dberr_t err= fd.open(srv_read_only_mode))
+ ib::fatal() << "open(" << fd.get_path() << ") returned " << err;
+}
+
+/** Update the log block checksum. */
+static void log_block_store_checksum(byte* block)
+{
+ log_block_set_checksum(block, log_block_calc_checksum_crc32(block));
+}
+
+void log_t::file::write_header_durable(lsn_t lsn)
+{
+ ut_ad(lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(!recv_no_log_write);
+ ut_ad(log_sys.log.format == log_t::FORMAT_10_5 ||
+ log_sys.log.format == log_t::FORMAT_ENC_10_5);
+
+ byte *buf= log_sys.checkpoint_buf;
+ memset_aligned<OS_FILE_LOG_BLOCK_SIZE>(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
+
+ mach_write_to_4(buf + LOG_HEADER_FORMAT, log_sys.log.format);
+ mach_write_to_4(buf + LOG_HEADER_SUBFORMAT, log_sys.log.subformat);
+ mach_write_to_8(buf + LOG_HEADER_START_LSN, lsn);
+ strcpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
+ LOG_HEADER_CREATOR_CURRENT);
+ ut_ad(LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR >=
+ sizeof LOG_HEADER_CREATOR_CURRENT);
+ log_block_store_checksum(buf);
+
+ DBUG_PRINT("ib_log", ("write " LSN_PF, lsn));
+
+ log_sys.log.write(0, {buf, OS_FILE_LOG_BLOCK_SIZE});
+ if (!log_sys.log.writes_are_durable())
+ log_sys.log.flush();
+}
+
+void log_t::file::read(os_offset_t offset, span<byte> buf)
+{
+ if (const dberr_t err= fd.read(offset, buf))
+ ib::fatal() << "read(" << fd.get_path() << ") returned "<< err;
+}
+
+bool log_t::file::writes_are_durable() const noexcept
+{
+ return fd.writes_are_durable();
+}
+
+void log_t::file::write(os_offset_t offset, span<byte> buf)
+{
+ srv_stats.os_log_pending_writes.inc();
+ if (const dberr_t err= fd.write(offset, buf))
+ ib::fatal() << "write(" << fd.get_path() << ") returned " << err;
+ srv_stats.os_log_pending_writes.dec();
+ srv_stats.os_log_written.add(buf.size());
+ srv_stats.log_writes.inc();
+ log_sys.n_log_ios++;
+}
+
+void log_t::file::flush()
+{
+ log_sys.pending_flushes.fetch_add(1, std::memory_order_acquire);
+ if (const dberr_t err= fd.flush())
+ ib::fatal() << "flush(" << fd.get_path() << ") returned " << err;
+ log_sys.pending_flushes.fetch_sub(1, std::memory_order_release);
+ log_sys.flushes.fetch_add(1, std::memory_order_release);
+}
+
+void log_t::file::close_file()
+{
+ if (fd.is_opened())
+ {
+ if (const dberr_t err= fd.close())
+ ib::fatal() << "close(" << fd.get_path() << ") returned " << err;
+ }
+ fd.free(); // Free path
+}
+
+/** Initialize the redo log. */
+void log_t::file::create()
+{
+ ut_ad(this == &log_sys.log);
+ ut_ad(log_sys.is_initialised());
+
+ format= srv_encrypt_log ? log_t::FORMAT_ENC_10_5 : log_t::FORMAT_10_5;
+ subformat= 2;
+ file_size= srv_log_file_size;
+ lsn= LOG_START_LSN;
+ lsn_offset= LOG_FILE_HDR_SIZE;
+}
+
+/******************************************************//**
+Writes a buffer to a log file. */
+static
+void
+log_write_buf(
+ byte* buf, /*!< in: buffer */
+ ulint len, /*!< in: buffer len; must be divisible
+ by OS_FILE_LOG_BLOCK_SIZE */
+#ifdef UNIV_DEBUG
+ ulint pad_len, /*!< in: pad len in the buffer len */
+#endif /* UNIV_DEBUG */
+ lsn_t start_lsn, /*!< in: start lsn of the buffer; must
+ be divisible by
+ OS_FILE_LOG_BLOCK_SIZE */
+ ulint new_data_offset)/*!< in: start offset of new data in
+ buf: this parameter is used to decide
+ if we have to write a new log file
+ header */
+{
+ ulint write_len;
+ lsn_t next_offset;
+ ulint i;
+
+ ut_ad(log_write_lock_own());
+ ut_ad(!recv_no_log_write);
+ ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+loop:
+ if (len == 0) {
+
+ return;
+ }
+
+ next_offset = log_sys.log.calc_lsn_offset(start_lsn);
+
+ if ((next_offset % log_sys.log.file_size) + len
+ > log_sys.log.file_size) {
+ /* if the above condition holds, then the below expression
+ is < len which is ulint, so the typecast is ok */
+ write_len = ulint(log_sys.log.file_size
+ - (next_offset % log_sys.log.file_size));
+ } else {
+ write_len = len;
+ }
+
+ DBUG_PRINT("ib_log",
+ ("write " LSN_PF " to " LSN_PF
+ ": len " ULINTPF
+ " blocks " ULINTPF ".." ULINTPF,
+ start_lsn, next_offset,
+ write_len,
+ log_block_get_hdr_no(buf),
+ log_block_get_hdr_no(
+ buf + write_len
+ - OS_FILE_LOG_BLOCK_SIZE)));
+
+ ut_ad(pad_len >= len
+ || log_block_get_hdr_no(buf)
+ == log_block_convert_lsn_to_no(start_lsn));
+
+ /* Calculate the checksums for each log block and write them to
+ the trailer fields of the log blocks */
+
+ for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
+#ifdef UNIV_DEBUG
+ ulint hdr_no_2 = log_block_get_hdr_no(buf) + i;
+ DBUG_EXECUTE_IF("innodb_small_log_block_no_limit",
+ hdr_no_2 = ((hdr_no_2 - 1) & 0xFUL) + 1;);
+#endif
+ ut_ad(pad_len >= len
+ || i * OS_FILE_LOG_BLOCK_SIZE >= len - pad_len
+ || log_block_get_hdr_no(buf + i * OS_FILE_LOG_BLOCK_SIZE) == hdr_no_2);
+ log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
+ }
+
+ ut_a((next_offset >> srv_page_size_shift) <= ULINT_MAX);
+
+ log_sys.log.write(static_cast<size_t>(next_offset), {buf, write_len});
+
+ if (write_len < len) {
+ start_lsn += write_len;
+ len -= write_len;
+ buf += write_len;
+ goto loop;
+ }
+}
+
+/** Flush the recently written changes to the log file
+and update the flushed lsn.
+@param[in] lsn last written lsn, which becomes the new flushed lsn */
+static void log_write_flush_to_disk_low(lsn_t lsn)
+{
+ if (!log_sys.log.writes_are_durable())
+ log_sys.log.flush();
+ ut_a(lsn >= log_sys.get_flushed_lsn());
+ log_sys.set_flushed_lsn(lsn);
+}
+
+/** Swap log buffers, and copy the content of last block
+from old buf to the head of the new buf. Thus, buf_free and
+buf_next_to_write would be changed accordingly */
+static inline
+void
+log_buffer_switch()
+{
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ ut_ad(log_write_lock_own());
+
+ size_t area_end = ut_calc_align<size_t>(
+ log_sys.buf_free, OS_FILE_LOG_BLOCK_SIZE);
+
+ /* Copy the last block to new buf */
+ memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(
+ log_sys.flush_buf,
+ log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
+ OS_FILE_LOG_BLOCK_SIZE);
+
+ std::swap(log_sys.buf, log_sys.flush_buf);
+
+ log_sys.buf_free %= OS_FILE_LOG_BLOCK_SIZE;
+ log_sys.buf_next_to_write = log_sys.buf_free;
+}
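+
+/* Example of the wrap-around above: if buf_free = 1636 (100 bytes used
+in the fourth 512-byte block), then area_end = 2048; that last,
+partially filled block is copied to the head of the flush buffer, the
+buffers are swapped, and buf_free becomes 1636 % 512 = 100, so new log
+records continue in the same (copied) block. */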
+
+/** Invoke commit_checkpoint_notify_ha() to notify that outstanding
+log writes have been completed. */
+void log_flush_notify(lsn_t flush_lsn);
+
+/**
+Write the log buffer to the log file. This is the "write" part of
+log_write_up_to(); it does not flush anything to disk.
+
+Note: the caller must hold log_sys.mutex, and this function
+releases it before returning.
+@param[in] rotate_key whether to rotate the encryption key */
+static void log_write(bool rotate_key)
+{
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ ut_ad(!recv_no_log_write);
+ lsn_t write_lsn;
+ if (log_sys.buf_free == log_sys.buf_next_to_write) {
+ /* Nothing to write */
+ mysql_mutex_unlock(&log_sys.mutex);
+ return;
+ }
+
+ ulint start_offset;
+ ulint end_offset;
+ ulint area_start;
+ ulint area_end;
+ ulong write_ahead_size = srv_log_write_ahead_size;
+ ulint pad_size;
+
+ DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
+ log_sys.write_lsn,
+ log_sys.get_lsn()));
+
+
+ start_offset = log_sys.buf_next_to_write;
+ end_offset = log_sys.buf_free;
+
+ area_start = ut_2pow_round(start_offset,
+ ulint(OS_FILE_LOG_BLOCK_SIZE));
+ area_end = ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
+
+ ut_ad(area_end - area_start > 0);
+
+ log_block_set_flush_bit(log_sys.buf + area_start, TRUE);
+ log_block_set_checkpoint_no(
+ log_sys.buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
+ log_sys.next_checkpoint_no);
+
+ write_lsn = log_sys.get_lsn();
+ byte *write_buf = log_sys.buf;
+
+ log_buffer_switch();
+
+ log_sys.log.set_fields(log_sys.write_lsn);
+
+ mysql_mutex_unlock(&log_sys.mutex);
+ /* Erase the end of the last log block. */
+ memset(write_buf + end_offset, 0,
+ ~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1));
+
+ /* Calculate pad_size if needed. */
+ pad_size = 0;
+ if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE) {
+ ulint end_offset_in_unit;
+ lsn_t end_offset = log_sys.log.calc_lsn_offset(
+ ut_uint64_align_up(write_lsn, OS_FILE_LOG_BLOCK_SIZE));
+ end_offset_in_unit = (ulint) (end_offset % write_ahead_size);
+
+ if (end_offset_in_unit > 0
+ && (area_end - area_start) > end_offset_in_unit) {
+ /* The first block in the unit was initialized
+ after the last writing.
+ Needs to be written padded data once. */
+ pad_size = std::min<ulint>(
+ ulint(write_ahead_size) - end_offset_in_unit,
+ srv_log_buffer_size - area_end);
+ ::memset(write_buf + area_end, 0, pad_size);
+ }
+ }
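+
+	/* A sketch of the effect: with write_ahead_size = 4096 and the
+	write ending 1024 bytes into a 4096-byte unit, the remaining
+	3072 bytes of that unit are zero-filled (capped by the log buffer
+	size), so the next write does not force the device to read back
+	a partially written write-ahead unit. */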
+
+ if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) {
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "InnoDB log write: "
+ LSN_PF, log_sys.write_lsn);
+ }
+
+ if (log_sys.is_encrypted()) {
+ log_crypt(write_buf + area_start, log_sys.write_lsn,
+ area_end - area_start,
+ rotate_key ? LOG_ENCRYPT_ROTATE_KEY : LOG_ENCRYPT);
+ }
+
+ /* Do the write to the log file */
+ log_write_buf(
+ write_buf + area_start, area_end - area_start + pad_size,
+#ifdef UNIV_DEBUG
+ pad_size,
+#endif /* UNIV_DEBUG */
+ ut_uint64_align_down(log_sys.write_lsn,
+ OS_FILE_LOG_BLOCK_SIZE),
+ start_offset - area_start);
+ srv_stats.log_padded.add(pad_size);
+ log_sys.write_lsn = write_lsn;
+ if (log_sys.log.writes_are_durable()) {
+ log_sys.set_flushed_lsn(write_lsn);
+ log_flush_notify(write_lsn);
+ }
+ return;
+}
+
+static group_commit_lock write_lock;
+static group_commit_lock flush_lock;
+
+#ifdef UNIV_DEBUG
+bool log_write_lock_own()
+{
+ return write_lock.is_owner();
+}
+#endif
+
+/** Ensure that the log has been written to the log file up to a given
+log entry (such as that of a transaction commit). Start a new write, or
+wait and check if an already running write is covering the request.
+@param[in] lsn log sequence number that should be
+included in the redo log file write
+@param[in] flush_to_disk whether the written log should also
+be flushed to the file system
+@param[in] rotate_key whether to rotate the encryption key */
+void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key)
+{
+ ut_ad(!srv_read_only_mode);
+ ut_ad(!rotate_key || flush_to_disk);
+ ut_ad(lsn != LSN_MAX);
+
+ if (recv_no_ibuf_operations)
+ {
+ /* Recovery is running and no operations on the log files are
+ allowed yet (the variable name .._no_ibuf_.. is misleading) */
+ return;
+ }
+
+ if (flush_to_disk &&
+ flush_lock.acquire(lsn) != group_commit_lock::ACQUIRED)
+ {
+ return;
+ }
+
+ if (write_lock.acquire(lsn) == group_commit_lock::ACQUIRED)
+ {
+ mysql_mutex_lock(&log_sys.mutex);
+ lsn_t write_lsn= log_sys.get_lsn();
+ write_lock.set_pending(write_lsn);
+
+ log_write(rotate_key);
+
+ ut_a(log_sys.write_lsn == write_lsn);
+ write_lock.release(write_lsn);
+ }
+
+ if (!flush_to_disk)
+ {
+ return;
+ }
+
+ /* Flush the highest written lsn.*/
+ auto flush_lsn = write_lock.value();
+ flush_lock.set_pending(flush_lsn);
+ log_write_flush_to_disk_low(flush_lsn);
+ flush_lock.release(flush_lsn);
+
+ log_flush_notify(flush_lsn);
+}
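+
+/* Typical usage (a hedged sketch; commit_lsn stands for the lsn of a
+transaction's commit record):
+	log_write_up_to(commit_lsn, true);
+Group commit falls out of write_lock/flush_lock: one thread performs
+the write or flush, and concurrent callers whose lsn is already covered
+by an in-progress operation return without doing any I/O themselves. */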
+
+/** Write to the log file up to the last log entry.
+@param[in] sync whether the written log should also
+be flushed to disk */
+void log_buffer_flush_to_disk(bool sync)
+{
+ ut_ad(!srv_read_only_mode);
+ log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), sync);
+}
+
+/********************************************************************
+Tries to establish a big enough margin of free space in the log buffer, such
+that a new log entry can be catenated without an immediate need for a flush. */
+ATTRIBUTE_COLD static void log_flush_margin()
+{
+ lsn_t lsn = 0;
+
+ mysql_mutex_lock(&log_sys.mutex);
+
+ if (log_sys.buf_free > log_sys.max_buf_free) {
+ /* We can write during flush */
+ lsn = log_sys.get_lsn();
+ }
+
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ if (lsn) {
+ log_write_up_to(lsn, false);
+ }
+}
+
+/** Write checkpoint info to the log header and release log_sys.mutex.
+@param[in] end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */
+ATTRIBUTE_COLD void log_write_checkpoint_info(lsn_t end_lsn)
+{
+ ut_ad(!srv_read_only_mode);
+ ut_ad(end_lsn == 0 || end_lsn >= log_sys.next_checkpoint_lsn);
+ ut_ad(end_lsn <= log_sys.get_lsn());
+ ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= log_sys.get_lsn()
+ || srv_shutdown_state > SRV_SHUTDOWN_INITIATED);
+
+ DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF
+ " written",
+ log_sys.next_checkpoint_no,
+ log_sys.next_checkpoint_lsn));
+
+ byte* buf = log_sys.checkpoint_buf;
+ memset_aligned<OS_FILE_LOG_BLOCK_SIZE>(buf, 0, OS_FILE_LOG_BLOCK_SIZE);
+
+ mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys.next_checkpoint_no);
+ mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys.next_checkpoint_lsn);
+
+ if (log_sys.is_encrypted()) {
+ log_crypt_write_checkpoint_buf(buf);
+ }
+
+ lsn_t lsn_offset
+ = log_sys.log.calc_lsn_offset(log_sys.next_checkpoint_lsn);
+ mach_write_to_8(buf + LOG_CHECKPOINT_OFFSET, lsn_offset);
+ mach_write_to_8(buf + LOG_CHECKPOINT_LOG_BUF_SIZE,
+ srv_log_buffer_size);
+ mach_write_to_8(buf + LOG_CHECKPOINT_END_LSN, end_lsn);
+
+ log_block_store_checksum(buf);
+
+ ut_ad(LOG_CHECKPOINT_1 < srv_page_size);
+ ut_ad(LOG_CHECKPOINT_2 < srv_page_size);
+
+ ++log_sys.n_pending_checkpoint_writes;
+
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ /* Note: We alternate the physical place of the checkpoint info.
+ See the (next_checkpoint_no & 1) below. */
+
+ log_sys.log.write((log_sys.next_checkpoint_no & 1) ? LOG_CHECKPOINT_2
+ : LOG_CHECKPOINT_1,
+ {buf, OS_FILE_LOG_BLOCK_SIZE});
+
+ log_sys.log.flush();
+
+ mysql_mutex_lock(&log_sys.mutex);
+
+ --log_sys.n_pending_checkpoint_writes;
+ ut_ad(log_sys.n_pending_checkpoint_writes == 0);
+
+ log_sys.next_checkpoint_no++;
+
+ log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn;
+
+ DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF
+ ", flushed to " LSN_PF,
+ lsn_t{log_sys.last_checkpoint_lsn},
+ log_sys.get_flushed_lsn()));
+
+ MONITOR_INC(MONITOR_NUM_CHECKPOINT);
+
+ DBUG_EXECUTE_IF("crash_after_checkpoint", DBUG_SUICIDE(););
+
+ mysql_mutex_unlock(&log_sys.mutex);
+}
+
+/****************************************************************//**
+Tries to establish a big enough margin of free space in the log, such
+that a new log entry can be catenated without an immediate need for a
+checkpoint. NOTE: this function may only be called if the calling thread
+owns no synchronization objects! */
+ATTRIBUTE_COLD static void log_checkpoint_margin()
+{
+ while (log_sys.check_flush_or_checkpoint())
+ {
+ mysql_mutex_lock(&log_sys.mutex);
+ ut_ad(!recv_no_log_write);
+
+ if (!log_sys.check_flush_or_checkpoint())
+ {
+func_exit:
+ mysql_mutex_unlock(&log_sys.mutex);
+ return;
+ }
+
+ const lsn_t lsn= log_sys.get_lsn();
+ const lsn_t checkpoint= log_sys.last_checkpoint_lsn;
+ const lsn_t sync_lsn= checkpoint + log_sys.max_checkpoint_age;
+ if (lsn <= sync_lsn)
+ {
+ log_sys.set_check_flush_or_checkpoint(false);
+ goto func_exit;
+ }
+
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ /* We must wait to prevent the tail of the log overwriting the head. */
+ buf_flush_wait_flushed(std::min(sync_lsn, checkpoint + (1U << 20)));
+ os_thread_sleep(10000); /* Sleep 10ms to avoid a thundering herd */
+ }
+}
+
+/**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+ATTRIBUTE_COLD void log_check_margins()
+{
+ do
+ {
+ log_flush_margin();
+ log_checkpoint_margin();
+ ut_ad(!recv_no_log_write);
+ }
+ while (log_sys.check_flush_or_checkpoint());
+}
+
+extern void buf_resize_shutdown();
+
+/** Make a checkpoint at the latest lsn on shutdown. */
+ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown()
+{
+ lsn_t lsn;
+ ulint count = 0;
+
+ ib::info() << "Starting shutdown...";
+
+ /* Wait until the master thread and all other operations are idle: our
+ algorithm only works if the server is idle at shutdown */
+ bool do_srv_shutdown = false;
+ if (srv_master_timer) {
+ do_srv_shutdown = srv_fast_shutdown < 2;
+ srv_master_timer.reset();
+ }
+
+ /* Wait for the end of the buffer resize task.*/
+ buf_resize_shutdown();
+ dict_stats_shutdown();
+ btr_defragment_shutdown();
+
+ srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
+
+ if (srv_buffer_pool_dump_at_shutdown &&
+ !srv_read_only_mode && srv_fast_shutdown < 2) {
+ buf_dump_start();
+ }
+ srv_monitor_timer.reset();
+ lock_sys.timeout_timer.reset();
+ if (do_srv_shutdown) {
+ srv_shutdown(srv_fast_shutdown == 0);
+ }
+
+
+loop:
+ ut_ad(lock_sys.is_initialised() || !srv_was_started);
+ ut_ad(log_sys.is_initialised() || !srv_was_started);
+ ut_ad(fil_system.is_initialised() || !srv_was_started);
+
+#define COUNT_INTERVAL 600U
+#define CHECK_INTERVAL 100000U
+ os_thread_sleep(CHECK_INTERVAL);
+
+ count++;
+
+ /* Check that there are no longer transactions, except for
+ PREPARED ones. We need this wait even for the 'very fast'
+ shutdown, because the InnoDB layer may have committed or
+ prepared transactions and we don't want to lose them. */
+
+ if (ulint total_trx = srv_was_started && !srv_read_only_mode
+ && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
+ ? trx_sys.any_active_transactions() : 0) {
+
+ if (srv_print_verbose_log && count > COUNT_INTERVAL) {
+ service_manager_extend_timeout(
+ COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
+ "Waiting for %lu active transactions to finish",
+ (ulong) total_trx);
+ ib::info() << "Waiting for " << total_trx << " active"
+ << " transactions to finish";
+
+ count = 0;
+ }
+
+ goto loop;
+ }
+
+ /* We need these threads to stop early in shutdown. */
+ const char* thread_name;
+
+ if (srv_fast_shutdown != 2 && trx_rollback_is_active) {
+ thread_name = "rollback of recovered transactions";
+ } else {
+ thread_name = NULL;
+ }
+
+ if (thread_name) {
+ ut_ad(!srv_read_only_mode);
+wait_suspend_loop:
+ service_manager_extend_timeout(
+ COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
+ "Waiting for %s to exit", thread_name);
+ if (srv_print_verbose_log && count > COUNT_INTERVAL) {
+ ib::info() << "Waiting for " << thread_name
+ << " to exit";
+ count = 0;
+ }
+ goto loop;
+ }
+
+ /* Check that the background threads are suspended */
+
+ ut_ad(!srv_any_background_activity());
+ if (srv_n_fil_crypt_threads_started) {
+ os_event_set(fil_crypt_threads_event);
+ thread_name = "fil_crypt_thread";
+ goto wait_suspend_loop;
+ }
+
+ if (buf_page_cleaner_is_active) {
+ thread_name = "page cleaner thread";
+ pthread_cond_signal(&buf_pool.do_flush_list);
+ goto wait_suspend_loop;
+ }
+
+ buf_load_dump_end();
+
+ if (!buf_pool.is_initialised()) {
+ ut_ad(!srv_was_started);
+ } else if (ulint pending_io = buf_pool.io_pending()) {
+ if (srv_print_verbose_log && count > 600) {
+ ib::info() << "Waiting for " << pending_io << " buffer"
+ " page I/Os to complete";
+ count = 0;
+ }
+
+ goto loop;
+ } else {
+ buf_flush_buffer_pool();
+ }
+
+ if (log_sys.is_initialised()) {
+ mysql_mutex_lock(&log_sys.mutex);
+ const ulint n_write = log_sys.n_pending_checkpoint_writes;
+ const ulint n_flush = log_sys.pending_flushes;
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ if (n_write || n_flush) {
+ if (srv_print_verbose_log && count > 600) {
+ ib::info() << "Pending checkpoint_writes: "
+ << n_write
+ << ". Pending log flush writes: "
+ << n_flush;
+ count = 0;
+ }
+ goto loop;
+ }
+ }
+
+ if (srv_fast_shutdown == 2 || !srv_was_started) {
+ if (!srv_read_only_mode && srv_was_started) {
+ ib::info() << "MySQL has requested a very fast"
+ " shutdown without flushing the InnoDB buffer"
+ " pool to data files. At the next mysqld"
+ " startup InnoDB will do a crash recovery!";
+
+ /* In this fastest shutdown we do not flush the
+ buffer pool:
+
+ it is essentially a 'crash' of the InnoDB server.
+ Make sure that the log is all flushed to disk, so
+ that we can recover all committed transactions in
+ a crash recovery. We must not write the lsn stamps
+ to the data files, since at a startup InnoDB deduces
+ from the stamps if the previous shutdown was clean. */
+
+ log_buffer_flush_to_disk();
+ }
+
+ srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
+ return;
+ }
+
+ if (!srv_read_only_mode) {
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "ensuring dirty buffer pool are written to log");
+ log_make_checkpoint();
+
+ mysql_mutex_lock(&log_sys.mutex);
+
+ lsn = log_sys.get_lsn();
+
+ const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn
+ && lsn != log_sys.last_checkpoint_lsn
+ + SIZE_OF_FILE_CHECKPOINT;
+ ut_ad(lsn >= log_sys.last_checkpoint_lsn);
+
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ if (lsn_changed) {
+ goto loop;
+ }
+
+ log_sys.log.flush();
+ } else {
+ lsn = recv_sys.recovered_lsn;
+ }
+
+ srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
+
+ /* Make some checks that the server really is quiet */
+ ut_ad(!srv_any_background_activity());
+
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "Free innodb buffer pool");
+ ut_d(buf_pool.assert_all_freed());
+
+ ut_a(lsn == log_sys.get_lsn()
+ || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
+
+ if (UNIV_UNLIKELY(lsn < recv_sys.recovered_lsn)) {
+ ib::error() << "Shutdown LSN=" << lsn
+ << " is less than start LSN="
+ << recv_sys.recovered_lsn;
+ }
+
+ srv_shutdown_lsn = lsn;
+
+ if (!srv_read_only_mode) {
+ dberr_t err = fil_write_flushed_lsn(lsn);
+
+ if (err != DB_SUCCESS) {
+ ib::error() << "Writing flushed lsn " << lsn
+ << " failed; error=" << err;
+ }
+ }
+
+ /* Make some checks that the server really is quiet */
+ ut_ad(!srv_any_background_activity());
+
+ ut_a(lsn == log_sys.get_lsn()
+ || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
+}
+
+/******************************************************//**
+Prints info of the log. */
+void
+log_print(
+/*======*/
+ FILE* file) /*!< in: file where to print */
+{
+ double time_elapsed;
+ time_t current_time;
+
+ mysql_mutex_lock(&log_sys.mutex);
+
+ const lsn_t lsn= log_sys.get_lsn();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ const lsn_t pages_flushed = buf_pool.get_oldest_modification(lsn);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ fprintf(file,
+ "Log sequence number " LSN_PF "\n"
+ "Log flushed up to " LSN_PF "\n"
+ "Pages flushed up to " LSN_PF "\n"
+ "Last checkpoint at " LSN_PF "\n",
+ lsn,
+ log_sys.get_flushed_lsn(),
+ pages_flushed,
+ lsn_t{log_sys.last_checkpoint_lsn});
+
+ current_time = time(NULL);
+
+ time_elapsed = difftime(current_time,
+ log_sys.last_printout_time);
+
+ if (time_elapsed <= 0) {
+ time_elapsed = 1;
+ }
+
+ fprintf(file,
+ ULINTPF " pending log flushes, "
+ ULINTPF " pending chkp writes\n"
+ ULINTPF " log i/o's done, %.2f log i/o's/second\n",
+ log_sys.pending_flushes.load(),
+ log_sys.n_pending_checkpoint_writes,
+ log_sys.n_log_ios,
+ static_cast<double>(
+ log_sys.n_log_ios - log_sys.n_log_ios_old)
+ / time_elapsed);
+
+ log_sys.n_log_ios_old = log_sys.n_log_ios;
+ log_sys.last_printout_time = current_time;
+
+ mysql_mutex_unlock(&log_sys.mutex);
+}
+
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+void
+log_refresh_stats(void)
+/*===================*/
+{
+ log_sys.n_log_ios_old = log_sys.n_log_ios;
+ log_sys.last_printout_time = time(NULL);
+}
+
+/** Shut down the redo log subsystem. */
+void log_t::close()
+{
+ ut_ad(this == &log_sys);
+ if (!is_initialised()) return;
+ m_initialised= false;
+ log.close();
+
+ ut_free_dodump(buf, srv_log_buffer_size);
+ buf= nullptr;
+ ut_free_dodump(flush_buf, srv_log_buffer_size);
+ flush_buf= nullptr;
+
+ mysql_mutex_destroy(&mutex);
+ mysql_mutex_destroy(&flush_order_mutex);
+
+ recv_sys.close();
+
+ aligned_free(checkpoint_buf);
+ checkpoint_buf= nullptr;
+}
+
+std::string get_log_file_path(const char *filename)
+{
+ const size_t size= strlen(srv_log_group_home_dir) + /* path separator */ 1 +
+ strlen(filename) + /* longest suffix */ 3;
+ std::string path;
+ path.reserve(size);
+ path.assign(srv_log_group_home_dir);
+
+ std::replace(path.begin(), path.end(), OS_PATH_SEPARATOR_ALT,
+ OS_PATH_SEPARATOR);
+
+ if (path.back() != OS_PATH_SEPARATOR)
+ path.push_back(OS_PATH_SEPARATOR);
+ path.append(filename);
+
+ return path;
+}
+
+std::vector<std::string> get_existing_log_files_paths() {
+ std::vector<std::string> result;
+
+ for (int i= 0; i < 101; i++) {
+ auto path= get_log_file_path(LOG_FILE_NAME_PREFIX)
+ .append(std::to_string(i));
+ os_file_stat_t stat;
+ dberr_t err= os_file_get_status(path.c_str(), &stat, false, true);
+ if (err)
+ break;
+
+ if (stat.type != OS_FILE_TYPE_FILE)
+ break;
+
+ result.push_back(std::move(path));
+ }
+
+ return result;
+}
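+
+/* A hedged example: with srv_log_group_home_dir = "./" this returns
+paths such as {"./ib_logfile0", "./ib_logfile1", ...}, stopping at the
+first missing or non-regular file; at most 101 candidates are probed. */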
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
new file mode 100644
index 00000000..eb34fd8e
--- /dev/null
+++ b/storage/innobase/log/log0recv.cc
@@ -0,0 +1,3783 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0recv.cc
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+
+#include <map>
+#include <string>
+#include <my_service_manager.h>
+
+#include "log0recv.h"
+
+#ifdef HAVE_MY_AES_H
+#include <my_aes.h>
+#endif
+
+#include "log0crypt.h"
+#include "mem0mem.h"
+#include "buf0buf.h"
+#include "buf0dblwr.h"
+#include "buf0flu.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "trx0undo.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "trx0rec.h"
+#include "fil0fil.h"
+#include "buf0rea.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "fil0pagecompress.h"
+
+/** Read-ahead area in applying log records to file pages */
+#define RECV_READ_AHEAD_AREA 32U
+
+/** The recovery system */
+recv_sys_t recv_sys;
+/** TRUE when recv_init_crash_recovery() has been called. */
+bool recv_needed_recovery;
+#ifdef UNIV_DEBUG
+/** TRUE if writing to the redo log (mtr_commit) is forbidden.
+Protected by log_sys.mutex. */
+bool recv_no_log_write = false;
+#endif /* UNIV_DEBUG */
+
+/** TRUE if buf_page_is_corrupted() should check if the log sequence
+number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by
+recv_recovery_from_checkpoint_start(). */
+bool recv_lsn_checks_on;
+
+/** If the following is TRUE, the buffer pool file pages must be invalidated
+after recovery and no ibuf operations are allowed; this becomes TRUE if
+the log record hash table becomes too full and log records must be merged
+to file pages before recovery is finished: in that case no ibuf
+operations are allowed, as they could modify pages that were read into the
+buffer pool before those pages were recovered to an up-to-date state.
+
+true means that recovery is running and no operations on the log files
+are allowed yet: the variable name is misleading. */
+bool recv_no_ibuf_operations;
+
+/** The maximum lsn we see for a page during the recovery process. If this
+is bigger than the lsn we are able to scan up to, that is an indication that
+the recovery failed and the database may be corrupt. */
+static lsn_t recv_max_page_lsn;
+
+/** Stored physical log record with logical LSN (@see log_t::FORMAT_10_5) */
+struct log_phys_t : public log_rec_t
+{
+ /** start LSN of the mini-transaction (not necessarily of this record) */
+ const lsn_t start_lsn;
+private:
+ /** @return the start of length and data */
+ const byte *start() const
+ {
+ return my_assume_aligned<sizeof(size_t)>
+ (reinterpret_cast<const byte*>(&start_lsn + 1));
+ }
+ /** @return the start of length and data */
+ byte *start()
+ { return const_cast<byte*>(const_cast<const log_phys_t*>(this)->start()); }
+ /** @return the length of the following record */
+ uint16_t len() const { uint16_t i; memcpy(&i, start(), 2); return i; }
+
+ /** @return start of the log records */
+ byte *begin() { return start() + 2; }
+ /** @return end of the log records */
+ byte *end() { byte *e= begin() + len(); ut_ad(!*e); return e; }
+public:
+ /** @return start of the log records */
+ const byte *begin() const { return const_cast<log_phys_t*>(this)->begin(); }
+ /** @return end of the log records */
+ const byte *end() const { return const_cast<log_phys_t*>(this)->end(); }
+
+ /** Determine the allocated size of the object.
+ @param len length of recs, excluding terminating NUL byte
+ @return the total allocation size */
+ static inline size_t alloc_size(size_t len);
+
+ /** Constructor.
+ @param start_lsn start LSN of the mini-transaction
+ @param lsn mtr_t::commit_lsn() of the mini-transaction
+ @param recs the first log record for the page in the mini-transaction
+ @param size length of recs, in bytes, excluding terminating NUL byte */
+ log_phys_t(lsn_t start_lsn, lsn_t lsn, const byte *recs, size_t size) :
+ log_rec_t(lsn), start_lsn(start_lsn)
+ {
+ ut_ad(start_lsn);
+ ut_ad(start_lsn < lsn);
+ const uint16_t len= static_cast<uint16_t>(size);
+ ut_ad(len == size);
+ memcpy(start(), &len, 2);
+ reinterpret_cast<byte*>(memcpy(begin(), recs, size))[size]= 0;
+ }
+
+ /** Append a record to the log.
+ @param recs log to append
+ @param size size of the log, in bytes */
+ void append(const byte *recs, size_t size)
+ {
+ ut_ad(start_lsn < lsn);
+ uint16_t l= len();
+ reinterpret_cast<byte*>(memcpy(end(), recs, size))[size]= 0;
+ l= static_cast<uint16_t>(l + size);
+ memcpy(start(), &l, 2);
+ }
+
+ /** Apply an UNDO_APPEND record.
+ @see mtr_t::undo_append()
+ @param block undo log page
+ @param data undo log record
+ @param len length of the undo log record
+  @return whether the operation failed (an inconsistency was noticed) */
+ static bool undo_append(const buf_block_t &block, const byte *data,
+ size_t len)
+ {
+ ut_ad(len > 2);
+ byte *free_p= my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.frame);
+ const uint16_t free= mach_read_from_2(free_p);
+ if (UNIV_UNLIKELY(free < TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE ||
+ free + len + 6 >= srv_page_size - FIL_PAGE_DATA_END))
+ {
+ ib::error() << "Not applying UNDO_APPEND due to corruption on "
+ << block.page.id();
+ return true;
+ }
+
+ byte *p= block.frame + free;
+ mach_write_to_2(free_p, free + 4 + len);
+ memcpy(p, free_p, 2);
+ p+= 2;
+ memcpy(p, data, len);
+ p+= len;
+ mach_write_to_2(p, free);
+ return false;
+ }
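+
+  /* The writes above recreate the undo record layout: at the old free
+  offset, 2 bytes pointing to the next record (free + 4 + len), then
+  the len bytes of payload, then 2 bytes pointing back to the record
+  start; TRX_UNDO_PAGE_FREE in the page header is advanced past it. */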
+
+ /** The status of apply() */
+ enum apply_status {
+ /** The page was not affected */
+ APPLIED_NO= 0,
+ /** The page was modified */
+ APPLIED_YES,
+ /** The page was modified, affecting the encryption parameters */
+ APPLIED_TO_ENCRYPTION,
+ /** The page was modified, affecting the tablespace header */
+ APPLIED_TO_FSP_HEADER
+ };
+
+ /** Apply log to a page frame.
+ @param[in,out] block buffer block
+ @param[in,out] last_offset last byte offset, for same_page records
+ @return whether any log was applied to the page */
+ apply_status apply(const buf_block_t &block, uint16_t &last_offset) const
+ {
+ const byte * const recs= begin();
+ byte *const frame= block.page.zip.ssize
+ ? block.page.zip.data : block.frame;
+ const size_t size= block.physical_size();
+ apply_status applied= APPLIED_NO;
+
+ for (const byte *l= recs;;)
+ {
+ const byte b= *l++;
+ if (!b)
+ return applied;
+ ut_ad((b & 0x70) != RESERVED);
+ size_t rlen= b & 0xf;
+ if (!rlen)
+ {
+ const size_t lenlen= mlog_decode_varint_length(*l);
+ const uint32_t addlen= mlog_decode_varint(l);
+ ut_ad(addlen != MLOG_DECODE_ERROR);
+ rlen= addlen + 15 - lenlen;
+ l+= lenlen;
+ }
+ if (!(b & 0x80))
+ {
+ /* Skip the page identifier. It has already been validated. */
+ size_t idlen= mlog_decode_varint_length(*l);
+ ut_ad(idlen <= 5);
+ ut_ad(idlen < rlen);
+ ut_ad(mlog_decode_varint(l) == block.page.id().space());
+ l+= idlen;
+ rlen-= idlen;
+ idlen= mlog_decode_varint_length(*l);
+ ut_ad(idlen <= 5);
+ ut_ad(idlen <= rlen);
+ ut_ad(mlog_decode_varint(l) == block.page.id().page_no());
+ l+= idlen;
+ rlen-= idlen;
+ last_offset= 0;
+ }
+
+ switch (b & 0x70) {
+ case FREE_PAGE:
+ ut_ad(last_offset == 0);
+ goto next_not_same_page;
+ case INIT_PAGE:
+ if (UNIV_LIKELY(rlen == 0))
+ {
+ memset_aligned<UNIV_ZIP_SIZE_MIN>(frame, 0, size);
+ mach_write_to_4(frame + FIL_PAGE_OFFSET, block.page.id().page_no());
+ memset_aligned<8>(FIL_PAGE_PREV + frame, 0xff, 8);
+ mach_write_to_4(frame + FIL_PAGE_SPACE_ID, block.page.id().space());
+ last_offset= FIL_PAGE_TYPE;
+ next_after_applying:
+ if (applied == APPLIED_NO)
+ applied= APPLIED_YES;
+ }
+ else
+ {
+ record_corrupted:
+ if (!srv_force_recovery)
+ {
+ recv_sys.found_corrupt_log= true;
+ return applied;
+ }
+ next_not_same_page:
+ last_offset= 1; /* the next record must not be same_page */
+ }
+ next:
+ l+= rlen;
+ continue;
+ }
+
+ ut_ad(mach_read_from_4(frame + FIL_PAGE_OFFSET) ==
+ block.page.id().page_no());
+ ut_ad(mach_read_from_4(frame + FIL_PAGE_SPACE_ID) ==
+ block.page.id().space());
+ ut_ad(last_offset <= 1 || last_offset > 8);
+ ut_ad(last_offset <= size);
+
+ switch (b & 0x70) {
+ case OPTION:
+ goto next;
+ case EXTENDED:
+ if (UNIV_UNLIKELY(block.page.id().page_no() < 3 ||
+ block.page.zip.ssize))
+ goto record_corrupted;
+      static_assert(INIT_ROW_FORMAT_REDUNDANT == 0, "compatibility");
+ static_assert(INIT_ROW_FORMAT_DYNAMIC == 1, "compatibility");
+ if (UNIV_UNLIKELY(!rlen))
+ goto record_corrupted;
+ switch (const byte subtype= *l) {
+ uint8_t ll;
+ size_t prev_rec, hdr_size;
+ default:
+ goto record_corrupted;
+ case INIT_ROW_FORMAT_REDUNDANT:
+ case INIT_ROW_FORMAT_DYNAMIC:
+ if (UNIV_UNLIKELY(rlen != 1))
+ goto record_corrupted;
+ page_create_low(&block, *l != INIT_ROW_FORMAT_REDUNDANT);
+ break;
+ case UNDO_INIT:
+ if (UNIV_UNLIKELY(rlen != 1))
+ goto record_corrupted;
+ trx_undo_page_init(block);
+ break;
+ case UNDO_APPEND:
+ if (UNIV_UNLIKELY(rlen <= 3))
+ goto record_corrupted;
+ if (undo_append(block, ++l, --rlen) && !srv_force_recovery)
+ {
+page_corrupted:
+ ib::error() << "Set innodb_force_recovery=1 to ignore corruption.";
+ recv_sys.found_corrupt_log= true;
+ return applied;
+ }
+ break;
+ case INSERT_HEAP_REDUNDANT:
+ case INSERT_REUSE_REDUNDANT:
+ case INSERT_HEAP_DYNAMIC:
+ case INSERT_REUSE_DYNAMIC:
+ if (UNIV_UNLIKELY(rlen < 2))
+ goto record_corrupted;
+ rlen--;
+ ll= mlog_decode_varint_length(*++l);
+ if (UNIV_UNLIKELY(ll > 3 || ll >= rlen))
+ goto record_corrupted;
+ prev_rec= mlog_decode_varint(l);
+ ut_ad(prev_rec != MLOG_DECODE_ERROR);
+ rlen-= ll;
+ l+= ll;
+ ll= mlog_decode_varint_length(*l);
+ static_assert(INSERT_HEAP_REDUNDANT == 4, "compatibility");
+ static_assert(INSERT_REUSE_REDUNDANT == 5, "compatibility");
+ static_assert(INSERT_HEAP_DYNAMIC == 6, "compatibility");
+ static_assert(INSERT_REUSE_DYNAMIC == 7, "compatibility");
+ if (subtype & 2)
+ {
+ size_t shift= 0;
+ if (subtype & 1)
+ {
+ if (UNIV_UNLIKELY(ll > 3 || ll >= rlen))
+ goto record_corrupted;
+ shift= mlog_decode_varint(l);
+ ut_ad(shift != MLOG_DECODE_ERROR);
+ rlen-= ll;
+ l+= ll;
+ ll= mlog_decode_varint_length(*l);
+ }
+ if (UNIV_UNLIKELY(ll > 3 || ll >= rlen))
+ goto record_corrupted;
+ size_t enc_hdr_l= mlog_decode_varint(l);
+ ut_ad(enc_hdr_l != MLOG_DECODE_ERROR);
+ rlen-= ll;
+ l+= ll;
+ ll= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(ll > 2 || ll >= rlen))
+ goto record_corrupted;
+ size_t hdr_c= mlog_decode_varint(l);
+ ut_ad(hdr_c != MLOG_DECODE_ERROR);
+ rlen-= ll;
+ l+= ll;
+ ll= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(ll > 3 || ll >= rlen))
+ goto record_corrupted;
+ size_t data_c= mlog_decode_varint(l);
+ ut_ad(data_c != MLOG_DECODE_ERROR);
+ rlen-= ll;
+ l+= ll;
+ if (page_apply_insert_dynamic(block, subtype & 1, prev_rec,
+ shift, enc_hdr_l, hdr_c, data_c,
+ l, rlen) && !srv_force_recovery)
+ goto page_corrupted;
+ }
+ else
+ {
+ if (UNIV_UNLIKELY(ll > 2 || ll >= rlen))
+ goto record_corrupted;
+ size_t header= mlog_decode_varint(l);
+ ut_ad(header != MLOG_DECODE_ERROR);
+ rlen-= ll;
+ l+= ll;
+ ll= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(ll > 2 || ll >= rlen))
+ goto record_corrupted;
+ size_t hdr_c= mlog_decode_varint(l);
+ ut_ad(hdr_c != MLOG_DECODE_ERROR);
+ rlen-= ll;
+ l+= ll;
+ ll= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(ll > 2 || ll >= rlen))
+ goto record_corrupted;
+ size_t data_c= mlog_decode_varint(l);
+ ut_ad(data_c != MLOG_DECODE_ERROR);
+ rlen-= ll;
+ l+= ll;
+ if (page_apply_insert_redundant(block, subtype & 1, prev_rec,
+ header, hdr_c, data_c,
+ l, rlen) && !srv_force_recovery)
+ goto page_corrupted;
+ }
+ break;
+ case DELETE_ROW_FORMAT_REDUNDANT:
+ if (UNIV_UNLIKELY(rlen < 2 || rlen > 4))
+ goto record_corrupted;
+ rlen--;
+ ll= mlog_decode_varint_length(*++l);
+ if (UNIV_UNLIKELY(ll != rlen))
+ goto record_corrupted;
+ if (page_apply_delete_redundant(block, mlog_decode_varint(l)) &&
+ !srv_force_recovery)
+ goto page_corrupted;
+ break;
+ case DELETE_ROW_FORMAT_DYNAMIC:
+ if (UNIV_UNLIKELY(rlen < 2))
+ goto record_corrupted;
+ rlen--;
+ ll= mlog_decode_varint_length(*++l);
+ if (UNIV_UNLIKELY(ll > 3 || ll >= rlen))
+ goto record_corrupted;
+ prev_rec= mlog_decode_varint(l);
+ ut_ad(prev_rec != MLOG_DECODE_ERROR);
+ rlen-= ll;
+ l+= ll;
+ ll= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(ll > 2 || ll >= rlen))
+ goto record_corrupted;
+ hdr_size= mlog_decode_varint(l);
+ ut_ad(hdr_size != MLOG_DECODE_ERROR);
+ rlen-= ll;
+ l+= ll;
+ ll= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(ll > 3 || ll != rlen))
+ goto record_corrupted;
+ if (page_apply_delete_dynamic(block, prev_rec, hdr_size,
+ mlog_decode_varint(l)) &&
+ !srv_force_recovery)
+ goto page_corrupted;
+ break;
+ }
+ last_offset= FIL_PAGE_TYPE;
+ goto next_after_applying;
+ case WRITE:
+ case MEMSET:
+ case MEMMOVE:
+ if (UNIV_UNLIKELY(last_offset == 1))
+ goto record_corrupted;
+ const size_t olen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3))
+ goto record_corrupted;
+ const uint32_t offset= mlog_decode_varint(l);
+ ut_ad(offset != MLOG_DECODE_ERROR);
+ static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
+ if (UNIV_UNLIKELY(offset >= size))
+ goto record_corrupted;
+ if (UNIV_UNLIKELY(offset + last_offset < 8 ||
+ offset + last_offset >= size))
+ goto record_corrupted;
+ last_offset= static_cast<uint16_t>(last_offset + offset);
+ l+= olen;
+ rlen-= olen;
+ size_t llen= rlen;
+ if ((b & 0x70) == WRITE)
+ {
+ if (UNIV_UNLIKELY(rlen + last_offset > size))
+ goto record_corrupted;
+ memcpy(frame + last_offset, l, llen);
+ if (UNIV_LIKELY(block.page.id().page_no()));
+ else if (llen == 11 + MY_AES_BLOCK_SIZE &&
+ last_offset == FSP_HEADER_OFFSET + MAGIC_SZ +
+ fsp_header_get_encryption_offset(block.zip_size()))
+ applied= APPLIED_TO_ENCRYPTION;
+ else if (last_offset < FSP_HEADER_OFFSET + FSP_FREE + FLST_LEN + 4 &&
+ last_offset + llen >= FSP_HEADER_OFFSET + FSP_SIZE)
+ applied= APPLIED_TO_FSP_HEADER;
+ next_after_applying_write:
+ ut_ad(llen + last_offset <= size);
+ last_offset= static_cast<uint16_t>(last_offset + llen);
+ goto next_after_applying;
+ }
+ llen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(llen > rlen || llen > 3))
+ goto record_corrupted;
+ const uint32_t len= mlog_decode_varint(l);
+ ut_ad(len != MLOG_DECODE_ERROR);
+ if (UNIV_UNLIKELY(len + last_offset > size))
+ goto record_corrupted;
+ l+= llen;
+ rlen-= llen;
+ llen= len;
+ if ((b & 0x70) == MEMSET)
+ {
+ ut_ad(rlen <= llen);
+ if (UNIV_UNLIKELY(rlen != 1))
+ {
+ size_t s;
+ for (s= 0; s < llen; s+= rlen)
+ memcpy(frame + last_offset + s, l, rlen);
+ memcpy(frame + last_offset + s, l, llen - s);
+ }
+ else
+ memset(frame + last_offset, *l, llen);
+ goto next_after_applying_write;
+ }
+ const size_t slen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(slen != rlen || slen > 3))
+ goto record_corrupted;
+ uint32_t s= mlog_decode_varint(l);
+ ut_ad(s != MLOG_DECODE_ERROR);
+ if (s & 1)
+ s= last_offset - (s >> 1) - 1;
+ else
+ s= last_offset + (s >> 1) + 1;
+ if (UNIV_LIKELY(s >= 8 && s + llen <= size))
+ {
+ memmove(frame + last_offset, frame + s, llen);
+ goto next_after_applying_write;
+ }
+ }
+ goto record_corrupted;
+ }
+ }
+};
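+
+/* A sketch of the record encoding consumed by log_phys_t::apply() above,
+inferred from its parsing logic (see also log_t::FORMAT_10_5):
+
+first byte b: 0x80 = same_page flag (the page identifier is omitted and
+              offsets continue from last_offset);
+              0x70 = record type (FREE_PAGE, INIT_PAGE, EXTENDED, WRITE,
+              MEMSET, MEMMOVE, RESERVED, OPTION);
+              0x0f = payload length in bytes; 0 means that a
+              variable-length integer follows, encoding (length - 15).
+payload:      unless same_page is set, two variable-length integers
+              (tablespace id, page number), then type-specific data.
+terminator:   a single 0x00 byte ends the mini-transaction. */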
+
+
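+/** Compute the size of the allocation needed for buffering a log record.
+The extra 1 + 2 bytes presumably leave room for the terminating byte and
+for length bookkeeping when records are appended (see recv_sys_t::add()).
+@param len length of the log record payload, in bytes
+@return number of bytes to request from recv_sys_t::alloc() */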
+inline size_t log_phys_t::alloc_size(size_t len)
+{
+ return len + (1 + 2 + sizeof(log_phys_t));
+}
+
+
+/** Tablespace item during recovery */
+struct file_name_t {
+ /** Tablespace file name (FILE_MODIFY) */
+ std::string name;
+ /** Tablespace object (NULL if not valid or not found) */
+ fil_space_t* space = nullptr;
+
+ /** Tablespace status. */
+ enum fil_status {
+ /** Normal tablespace */
+ NORMAL,
+ /** Deleted tablespace */
+ DELETED,
+ /** Missing tablespace */
+ MISSING
+ };
+
+ /** Status of the tablespace */
+ fil_status status;
+
+ /** FSP_SIZE of tablespace */
+ uint32_t size = 0;
+
+ /** Freed pages of tablespace */
+ range_set freed_ranges;
+
+ /** Dummy flags before they have been read from the .ibd file */
+ static constexpr uint32_t initial_flags = FSP_FLAGS_FCRC32_MASK_MARKER;
+ /** FSP_SPACE_FLAGS of tablespace */
+ uint32_t flags = initial_flags;
+
+ /** Constructor */
+ file_name_t(std::string name_, bool deleted)
+ : name(std::move(name_)), status(deleted ? DELETED: NORMAL) {}
+
+ /** Add the freed pages */
+ void add_freed_page(uint32_t page_no) { freed_ranges.add_value(page_no); }
+
+ /** Remove the freed pages */
+ void remove_freed_page(uint32_t page_no)
+ {
+ if (freed_ranges.empty()) return;
+ freed_ranges.remove_value(page_no);
+ }
+};
+
+/** Map of dirty tablespaces during recovery */
+typedef std::map<
+ ulint,
+ file_name_t,
+ std::less<ulint>,
+ ut_allocator<std::pair<const ulint, file_name_t> > > recv_spaces_t;
+
+static recv_spaces_t recv_spaces;
+
+/** The last parsed FILE_RENAME records */
+static std::map<uint32_t,std::string> renamed_spaces;
+
+/** Report an operation to create, delete, or rename a file during backup.
+@param[in] space_id tablespace identifier
+@param[in] create whether the file is being created
+@param[in] name file name (not NUL-terminated)
+@param[in] len length of name, in bytes
+@param[in] new_name new file name (NULL if not rename)
+@param[in] new_len length of new_name, in bytes (0 if NULL) */
+void (*log_file_op)(ulint space_id, bool create,
+ const byte* name, ulint len,
+ const byte* new_name, ulint new_len);
+
+/** Information about initializing page contents during redo log processing.
+FIXME: Rely on recv_sys.pages! */
+class mlog_init_t
+{
+public:
+ /** A page initialization operation that was parsed from
+ the redo log */
+ struct init {
+ /** log sequence number of the page initialization */
+ lsn_t lsn;
+ /** Whether btr_page_create() avoided a read of the page.
+
+ At the end of the last recovery batch, mark_ibuf_exist()
+ will mark pages for which this flag is set. */
+ bool created;
+ };
+
+private:
+ typedef std::map<const page_id_t, init,
+ std::less<const page_id_t>,
+ ut_allocator<std::pair<const page_id_t, init> > >
+ map;
+ /** Map of page initialization operations.
+ FIXME: Merge this to recv_sys.pages! */
+ map inits;
+public:
+ /** Record that a page will be initialized by the redo log.
+ @param[in] page_id page identifier
+ @param[in] lsn log sequence number
+ @return whether the state was changed */
+ bool add(const page_id_t page_id, lsn_t lsn)
+ {
+ ut_ad(mutex_own(&recv_sys.mutex));
+ const init init = { lsn, false };
+ std::pair<map::iterator, bool> p = inits.insert(
+ map::value_type(page_id, init));
+ ut_ad(!p.first->second.created);
+ if (p.second) return true;
+ if (p.first->second.lsn >= init.lsn) return false;
+ p.first->second = init;
+ return true;
+ }
+
+ /** Get the last stored LSN of a page id and its respective
+ init/load operation.
+ @param[in] page_id page identifier
+ @return the latest page initialization;
+ not valid after releasing recv_sys.mutex. */
+ init& last(page_id_t page_id)
+ {
+ ut_ad(mutex_own(&recv_sys.mutex));
+ return inits.find(page_id)->second;
+ }
+
+ /** Determine whether a page will be initialized or freed after the
+ given log sequence number.
+ @param page_id page identifier
+ @param lsn log sequence number
+ @return whether page_id will be freed or initialized after lsn */
+ bool will_avoid_read(page_id_t page_id, lsn_t lsn) const
+ {
+ ut_ad(mutex_own(&recv_sys.mutex));
+ auto i= inits.find(page_id);
+ return i != inits.end() && i->second.lsn > lsn;
+ }
+
+ /** At the end of each recovery batch, reset the 'created' flags. */
+ void reset()
+ {
+ ut_ad(mutex_own(&recv_sys.mutex));
+ ut_ad(recv_no_ibuf_operations);
+ for (map::value_type& i : inits) {
+ i.second.created = false;
+ }
+ }
+
+ /** On the last recovery batch, mark whether there exist
+ buffered changes for the pages that were initialized
+ by buf_page_create() and still reside in the buffer pool.
+ @param[in,out] mtr dummy mini-transaction */
+ void mark_ibuf_exist(mtr_t& mtr)
+ {
+ ut_ad(mutex_own(&recv_sys.mutex));
+ mtr.start();
+
+ for (const map::value_type& i : inits) {
+ if (!i.second.created) {
+ continue;
+ }
+ if (buf_block_t* block = buf_page_get_low(
+ i.first, 0, RW_X_LATCH, nullptr,
+ BUF_GET_IF_IN_POOL, __FILE__, __LINE__,
+ &mtr, nullptr, false)) {
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ switch (fil_page_get_type(
+ block->page.zip.data)) {
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ if (page_zip_decompress(
+ &block->page.zip,
+ block->frame,
+ true)) {
+ break;
+ }
+ ib::error() << "corrupted "
+ << block->page.id();
+ }
+ }
+ if (recv_no_ibuf_operations) {
+ mtr.commit();
+ mtr.start();
+ continue;
+ }
+ mutex_exit(&recv_sys.mutex);
+ block->page.ibuf_exist = ibuf_page_exists(
+ block->page.id(), block->zip_size());
+ mtr.commit();
+ mtr.start();
+ mutex_enter(&recv_sys.mutex);
+ }
+ }
+
+ mtr.commit();
+ }
+
+ /** Clear the data structure */
+ void clear() { inits.clear(); }
+};
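+
+/* Lifecycle sketch: recv_sys_t::add() and recv_sys_t::parse() invoke
+mlog_init.add() when they encounter INIT_PAGE or FREE_PAGE records;
+reset() clears the 'created' flags between recovery batches; after the
+final batch, mark_ibuf_exist() revisits the created pages that still
+reside in the buffer pool. */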
+
+static mlog_init_t mlog_init;
+
+/** Process a record that indicates that a tablespace is
+being shrunk in size.
+@param page_id first page identifier that is not in the file
+@param lsn log sequence number of the shrink operation */
+inline void recv_sys_t::trim(const page_id_t page_id, lsn_t lsn)
+{
+ DBUG_ENTER("recv_sys_t::trim");
+ DBUG_LOG("ib_log",
+ "discarding log beyond end of tablespace "
+ << page_id << " before LSN " << lsn);
+ ut_ad(mutex_own(&mutex));
+ for (recv_sys_t::map::iterator p = pages.lower_bound(page_id);
+ p != pages.end() && p->first.space() == page_id.space();) {
+ recv_sys_t::map::iterator r = p++;
+ if (r->second.trim(lsn)) {
+ pages.erase(r);
+ }
+ }
+ if (fil_space_t* space = fil_space_get(page_id.space())) {
+ ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
+ fil_node_t* file = UT_LIST_GET_FIRST(space->chain);
+ ut_ad(file->is_open());
+ os_file_truncate(file->name, file->handle,
+ os_offset_t{page_id.page_no()}
+ << srv_page_size_shift, true);
+ }
+ DBUG_VOID_RETURN;
+}
+
+void recv_sys_t::open_log_files_if_needed()
+{
+ if (!recv_sys.files.empty())
+ return;
+
+ for (auto &&path : get_existing_log_files_paths())
+ {
+ recv_sys.files.emplace_back(std::move(path));
+ ut_a(recv_sys.files.back().open(true) == DB_SUCCESS);
+ }
+}
+
+void recv_sys_t::read(os_offset_t total_offset, span<byte> buf)
+{
+ open_log_files_if_needed();
+
+ size_t file_idx= static_cast<size_t>(total_offset / log_sys.log.file_size);
+ os_offset_t offset= total_offset % log_sys.log.file_size;
+ dberr_t err= recv_sys.files[file_idx].read(offset, buf);
+ ut_a(err == DB_SUCCESS);
+}
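+
+/* Example (illustrative figures): with two 48 MiB log files,
+log_sys.log.file_size == 48 << 20, so total_offset == 50 MiB resolves to
+file_idx == 1 and offset == 2 MiB within the second file. */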
+
+inline size_t recv_sys_t::files_size()
+{
+ open_log_files_if_needed();
+ return files.size();
+}
+
+/** Process a file name from a FILE_* record.
+@param[in,out] name file name
+@param[in] len length of the file name
+@param[in] space_id the tablespace ID
+@param[in] deleted whether this is a FILE_DELETE record */
+static
+void
+fil_name_process(char* name, ulint len, ulint space_id, bool deleted)
+{
+ if (srv_operation == SRV_OPERATION_BACKUP) {
+ return;
+ }
+
+ ut_ad(srv_operation == SRV_OPERATION_NORMAL
+ || srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+
+ /* We will also insert space=NULL into the map, so that
+ further checks can ensure that a FILE_MODIFY record was
+ scanned before applying any page records for the space_id. */
+
+ os_normalize_path(name);
+ const file_name_t fname(std::string(name, len), deleted);
+ std::pair<recv_spaces_t::iterator,bool> p = recv_spaces.emplace(
+ space_id, fname);
+ ut_ad(p.first->first == space_id);
+
+ file_name_t& f = p.first->second;
+
+ if (deleted) {
+ /* Got FILE_DELETE */
+
+ if (!p.second && f.status != file_name_t::DELETED) {
+ f.status = file_name_t::DELETED;
+ if (f.space != NULL) {
+ fil_space_free(space_id, false);
+ f.space = NULL;
+ }
+ }
+
+ ut_ad(f.space == NULL);
+ } else if (p.second // the first FILE_MODIFY or FILE_RENAME
+ || f.name != fname.name) {
+ fil_space_t* space;
+
+ /* Check if the tablespace file exists and contains
+ the space_id. If not, ignore the file after displaying
+ a note. Abort if there are multiple files with the
+ same space_id. */
+ switch (fil_ibd_load(space_id, name, space)) {
+ case FIL_LOAD_OK:
+ ut_ad(space != NULL);
+
+ if (!f.space) {
+ if (f.size
+ || f.flags != f.initial_flags) {
+ fil_space_set_recv_size_and_flags(
+ space->id, f.size, f.flags);
+ }
+
+ f.space = space;
+ goto same_space;
+ } else if (f.space == space) {
+same_space:
+ f.name = fname.name;
+ f.status = file_name_t::NORMAL;
+ } else {
+ ib::error() << "Tablespace " << space_id
+ << " has been found in two places: '"
+ << f.name << "' and '" << name << "'."
+ " You must delete one of them.";
+ recv_sys.found_corrupt_fs = true;
+ }
+ break;
+
+ case FIL_LOAD_ID_CHANGED:
+ ut_ad(space == NULL);
+ break;
+
+ case FIL_LOAD_NOT_FOUND:
+ /* No matching tablespace was found; maybe it
+ was renamed, and we will find a subsequent
+ FILE_* record. */
+ ut_ad(space == NULL);
+
+ if (srv_force_recovery) {
+ /* Without innodb_force_recovery,
+ missing tablespaces will only be
+ reported in
+ recv_init_crash_recovery_spaces().
+ Enable some more diagnostics when
+ forcing recovery. */
+
+ ib::info()
+ << "At LSN: " << recv_sys.recovered_lsn
+ << ": unable to open file " << name
+ << " for tablespace " << space_id;
+ }
+ break;
+
+ case FIL_LOAD_INVALID:
+ ut_ad(space == NULL);
+ if (srv_force_recovery == 0) {
+ ib::warn() << "We do not continue the crash"
+ " recovery, because the table may"
+ " become corrupt if we cannot apply"
+ " the log records in the InnoDB log to"
+ " it. To fix the problem and start"
+ " mysqld:";
+ ib::info() << "1) If there is a permission"
+ " problem in the file and mysqld"
+ " cannot open the file, you should"
+ " modify the permissions.";
+ ib::info() << "2) If the tablespace is not"
+ " needed, or you can restore an older"
+ " version from a backup, then you can"
+ " remove the .ibd file, and use"
+ " --innodb_force_recovery=1 to force"
+ " startup without this file.";
+ ib::info() << "3) If the file system or the"
+ " disk is broken, and you cannot"
+ " remove the .ibd file, you can set"
+ " --innodb_force_recovery.";
+ recv_sys.found_corrupt_fs = true;
+ break;
+ }
+
+ ib::info() << "innodb_force_recovery was set to "
+ << srv_force_recovery << ". Continuing crash"
+ " recovery even though we cannot access the"
+ " files for tablespace " << space_id << ".";
+ break;
+ }
+ }
+}
+
+/** Clean up after recv_sys_t::create() */
+void recv_sys_t::close()
+{
+ ut_ad(this == &recv_sys);
+
+ if (is_initialised())
+ {
+ dblwr.pages.clear();
+ ut_d(mutex_enter(&mutex));
+ clear();
+ ut_d(mutex_exit(&mutex));
+
+ if (buf)
+ {
+ ut_free_dodump(buf, RECV_PARSING_BUF_SIZE);
+ buf= nullptr;
+ }
+
+ last_stored_lsn= 0;
+ mutex_free(&mutex);
+ }
+
+ recv_spaces.clear();
+ renamed_spaces.clear();
+ mlog_init.clear();
+
+ close_files();
+}
+
+/** Initialize the redo log recovery subsystem. */
+void recv_sys_t::create()
+{
+ ut_ad(this == &recv_sys);
+ ut_ad(!is_initialised());
+ mutex_create(LATCH_ID_RECV_SYS, &mutex);
+
+ apply_log_recs = false;
+ apply_batch_on = false;
+
+ buf = static_cast<byte*>(ut_malloc_dontdump(RECV_PARSING_BUF_SIZE,
+ PSI_INSTRUMENT_ME));
+ len = 0;
+ parse_start_lsn = 0;
+ scanned_lsn = 0;
+ scanned_checkpoint_no = 0;
+ recovered_offset = 0;
+ recovered_lsn = 0;
+ found_corrupt_log = false;
+ found_corrupt_fs = false;
+ mlog_checkpoint_lsn = 0;
+
+ progress_time = time(NULL);
+ recv_max_page_lsn = 0;
+
+ memset(truncated_undo_spaces, 0, sizeof truncated_undo_spaces);
+ last_stored_lsn = 1;
+ UT_LIST_INIT(blocks, &buf_block_t::unzip_LRU);
+}
+
+/** Clear a fully processed set of stored redo log records. */
+inline void recv_sys_t::clear()
+{
+ ut_ad(mutex_own(&mutex));
+ apply_log_recs= false;
+ apply_batch_on= false;
+ ut_ad(!after_apply || !UT_LIST_GET_LAST(blocks));
+ pages.clear();
+
+ for (buf_block_t *block= UT_LIST_GET_LAST(blocks); block; )
+ {
+ buf_block_t *prev_block= UT_LIST_GET_PREV(unzip_LRU, block);
+ ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
+ UT_LIST_REMOVE(blocks, block);
+ MEM_MAKE_ADDRESSABLE(block->frame, srv_page_size);
+ buf_block_free(block);
+ block= prev_block;
+ }
+}
+
+/** Free most recovery data structures. */
+void recv_sys_t::debug_free()
+{
+ ut_ad(this == &recv_sys);
+ ut_ad(is_initialised());
+ mutex_enter(&mutex);
+
+ recovery_on= false;
+ pages.clear();
+ ut_free_dodump(buf, RECV_PARSING_BUF_SIZE);
+
+ buf= nullptr;
+
+ mutex_exit(&mutex);
+}
+
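+/** Allocate memory for a redo log snippet from a buf_block_t frame.
+The blocks list acts as a bump allocator: page.access_time is repurposed
+to pack an allocation count into the high 16 bits and the aligned end
+offset of the latest allocation into the low 16 bits, so that free()
+can release a block once all of its snippets have been freed.
+@param len size of the allocation (at most srv_page_size)
+@return pointer to an ALIGNMENT-aligned buffer of len bytes */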
+inline void *recv_sys_t::alloc(size_t len)
+{
+ ut_ad(mutex_own(&mutex));
+ ut_ad(len);
+ ut_ad(len <= srv_page_size);
+
+ buf_block_t *block= UT_LIST_GET_FIRST(blocks);
+ if (UNIV_UNLIKELY(!block))
+ {
+create_block:
+ block= buf_block_alloc();
+ block->page.access_time= 1U << 16 |
+ ut_calc_align<uint16_t>(static_cast<uint16_t>(len), ALIGNMENT);
+ static_assert(ut_is_2pow(ALIGNMENT), "ALIGNMENT must be a power of 2");
+ UT_LIST_ADD_FIRST(blocks, block);
+ MEM_MAKE_ADDRESSABLE(block->frame, len);
+ MEM_NOACCESS(block->frame + len, srv_page_size - len);
+ return my_assume_aligned<ALIGNMENT>(block->frame);
+ }
+
+ size_t free_offset= static_cast<uint16_t>(block->page.access_time);
+ ut_ad(!ut_2pow_remainder(free_offset, ALIGNMENT));
+ if (UNIV_UNLIKELY(!free_offset))
+ {
+ ut_ad(srv_page_size == 65536);
+ goto create_block;
+ }
+ ut_ad(free_offset <= srv_page_size);
+ free_offset+= len;
+
+ if (free_offset > srv_page_size)
+ goto create_block;
+
+ block->page.access_time= ((block->page.access_time >> 16) + 1) << 16 |
+ ut_calc_align<uint16_t>(static_cast<uint16_t>(free_offset), ALIGNMENT);
+ MEM_MAKE_ADDRESSABLE(block->frame + free_offset - len, len);
+ return my_assume_aligned<ALIGNMENT>(block->frame + free_offset - len);
+}
+
+
+/** Free a redo log snippet.
+@param data buffer returned by alloc() */
+inline void recv_sys_t::free(const void *data)
+{
+ ut_ad(!ut_align_offset(data, ALIGNMENT));
+ data= page_align(data);
+ ut_ad(mutex_own(&mutex));
+
+ /* MDEV-14481 FIXME: To prevent race condition with buf_pool.resize(),
+ we must acquire and hold the buffer pool mutex here. */
+ ut_ad(!buf_pool.resize_in_progress());
+
+ auto *chunk= buf_pool.chunks;
+ for (auto i= buf_pool.n_chunks; i--; chunk++)
+ {
+ if (data < chunk->blocks->frame)
+ continue;
+ const size_t offs= (reinterpret_cast<const byte*>(data) -
+ chunk->blocks->frame) >> srv_page_size_shift;
+ if (offs >= chunk->size)
+ continue;
+ buf_block_t *block= &chunk->blocks[offs];
+ ut_ad(block->frame == data);
+ ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
+ ut_ad(static_cast<uint16_t>(block->page.access_time - 1) <
+ srv_page_size);
+ ut_ad(block->page.access_time >= 1U << 16);
+ if (!((block->page.access_time -= 1U << 16) >> 16))
+ {
+ UT_LIST_REMOVE(blocks, block);
+ MEM_MAKE_ADDRESSABLE(block->frame, srv_page_size);
+ buf_block_free(block);
+ }
+ return;
+ }
+ ut_ad(0);
+}
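+
+/* free() pairs with alloc() above: it locates the owning buf_block_t by
+scanning the buffer pool chunks for the frame that contains the pointer,
+and releases the whole block once the allocation count packed into
+page.access_time drops to zero. */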
+
+
+/** Read a log segment to log_sys.buf.
+@param[in,out] start_lsn in: start of the read area;
+out: the last valid LSN that was read
+@param[in] end_lsn end of the read area
+@return whether no invalid blocks (e.g. checksum mismatch) were found */
+bool log_t::file::read_log_seg(lsn_t* start_lsn, lsn_t end_lsn)
+{
+ ulint len;
+ bool success = true;
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ ut_ad(!(*start_lsn % OS_FILE_LOG_BLOCK_SIZE));
+ ut_ad(!(end_lsn % OS_FILE_LOG_BLOCK_SIZE));
+ byte* buf = log_sys.buf;
+loop:
+ lsn_t source_offset = calc_lsn_offset_old(*start_lsn);
+
+ ut_a(end_lsn - *start_lsn <= ULINT_MAX);
+ len = (ulint) (end_lsn - *start_lsn);
+
+ ut_ad(len != 0);
+
+ const bool at_eof = (source_offset % file_size) + len > file_size;
+ if (at_eof) {
+ /* If the above condition is true then len (which is ulint)
+ is > the expression below, so the typecast is ok */
+ len = ulint(file_size - (source_offset % file_size));
+ }
+
+ log_sys.n_log_ios++;
+
+ ut_a((source_offset >> srv_page_size_shift) <= ULINT_MAX);
+
+ recv_sys.read(source_offset, {buf, len});
+
+ for (ulint l = 0; l < len; l += OS_FILE_LOG_BLOCK_SIZE,
+ buf += OS_FILE_LOG_BLOCK_SIZE,
+ (*start_lsn) += OS_FILE_LOG_BLOCK_SIZE) {
+ const ulint block_number = log_block_get_hdr_no(buf);
+
+ if (block_number != log_block_convert_lsn_to_no(*start_lsn)) {
+ /* Garbage or an incompletely written log block.
+ We will not report any error, because this can
+ happen when InnoDB was killed while it was
+ writing redo log. We simply treat this as an
+ abrupt end of the redo log. */
+fail:
+ end_lsn = *start_lsn;
+ success = false;
+ break;
+ }
+
+ ulint crc = log_block_calc_checksum_crc32(buf);
+ ulint cksum = log_block_get_checksum(buf);
+
+ DBUG_EXECUTE_IF("log_intermittent_checksum_mismatch", {
+ static int block_counter;
+ if (block_counter++ == 0) {
+ cksum = crc + 1;
+ }
+ });
+
+ DBUG_EXECUTE_IF("log_checksum_mismatch", { cksum = crc + 1; });
+
+ if (UNIV_UNLIKELY(crc != cksum)) {
+ ib::error_or_warn(srv_operation!=SRV_OPERATION_BACKUP)
+ << "Invalid log block checksum. block: "
+ << block_number
+ << " checkpoint no: "
+ << log_block_get_checkpoint_no(buf)
+ << " expected: " << crc
+ << " found: " << cksum;
+ goto fail;
+ }
+
+ if (is_encrypted()
+ && !log_crypt(buf, *start_lsn,
+ OS_FILE_LOG_BLOCK_SIZE,
+ LOG_DECRYPT)) {
+ goto fail;
+ }
+
+ ulint dl = log_block_get_data_len(buf);
+ if (dl < LOG_BLOCK_HDR_SIZE
+ || (dl != OS_FILE_LOG_BLOCK_SIZE
+ && dl > log_sys.trailer_offset())) {
+ recv_sys.found_corrupt_log = true;
+ goto fail;
+ }
+ }
+
+ if (recv_sys.report(time(NULL))) {
+ ib::info() << "Read redo log up to LSN=" << *start_lsn;
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "Read redo log up to LSN=" LSN_PF,
+ *start_lsn);
+ }
+
+ if (*start_lsn != end_lsn) {
+ goto loop;
+ }
+
+ return(success);
+}
+
+
+
+/********************************************************//**
+Copies a log segment from the most up-to-date log group to the other log
+groups, so that they all contain the latest log data. Also writes the info
+about the latest checkpoint to the groups, and initializes the fields in
+the group memory structs to up-to-date values. */
+static
+void
+recv_synchronize_groups()
+{
+ const lsn_t recovered_lsn = recv_sys.recovered_lsn;
+
+ /* Read the last recovered log block to the recovery system buffer:
+ the block is always incomplete */
+
+ lsn_t start_lsn = ut_uint64_align_down(recovered_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+ log_sys.log.read_log_seg(&start_lsn,
+ start_lsn + OS_FILE_LOG_BLOCK_SIZE);
+ log_sys.log.set_fields(recovered_lsn);
+
+ /* Copy the checkpoint info to the log. Because checkpoint_no has
+ been incremented by one, the info will not overwrite the latest
+ checkpoint slot, so the latest checkpoint info is certain to be
+ preserved on disk. */
+
+ if (!srv_read_only_mode) {
+ log_write_checkpoint_info(0);
+ mysql_mutex_lock(&log_sys.mutex);
+ }
+}
+
+/** Check the consistency of a log header block.
+@param[in] buf log header block
+@return true if ok */
+static
+bool
+recv_check_log_header_checksum(
+ const byte* buf)
+{
+ return(log_block_get_checksum(buf)
+ == log_block_calc_checksum_crc32(buf));
+}
+
+static bool redo_file_sizes_are_correct()
+{
+ auto paths= get_existing_log_files_paths();
+ auto get_size= [](const std::string &path) {
+ return os_file_get_size(path.c_str()).m_total_size;
+ };
+ os_offset_t size= get_size(paths[0]);
+
+ auto it=
+ std::find_if(paths.begin(), paths.end(), [&](const std::string &path) {
+ return get_size(path) != size;
+ });
+
+ if (it == paths.end())
+ return true;
+
+ ib::error() << "Log file " << *it << " is of different size "
+ << get_size(*it) << " bytes than other log files " << size
+ << " bytes!";
+ return false;
+}
+
+/** Calculate the checksum for a log block using the pre-10.2.2 algorithm. */
+inline uint32_t log_block_calc_checksum_format_0(const byte *b)
+{
+ uint32_t sum= 1;
+ const byte *const end= &b[512 - 4];
+
+ for (uint32_t sh= 0; b < end; )
+ {
+ sum&= 0x7FFFFFFFUL;
+ sum+= uint32_t{*b} << sh++;
+ sum+= *b++;
+ if (sh > 24)
+ sh= 0;
+ }
+
+ return sum;
+}
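+
+/* A worked illustration of the legacy algorithm: for the first two data
+bytes b0, b1 the sum becomes 1 + (b0 << 0) + b0 + (b1 << 1) + b1, with
+the shift cycling through 0..24 and the running sum being masked to 31
+bits before each step; only the 508 bytes preceding the 4-byte trailer
+are covered. */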
+
+/** Determine if a redo log from before MariaDB 10.2.2 is clean.
+@return error code
+@retval DB_SUCCESS if the redo log is clean
+@retval DB_CORRUPTION if the redo log is corrupted
+@retval DB_ERROR if the redo log is not empty */
+ATTRIBUTE_COLD static dberr_t recv_log_recover_pre_10_2()
+{
+ uint64_t max_no= 0;
+ byte *buf= log_sys.buf;
+
+ ut_ad(log_sys.log.format == 0);
+
+ if (!redo_file_sizes_are_correct())
+ return DB_CORRUPTION;
+
+ /** Offset of the first checkpoint checksum */
+ constexpr uint CHECKSUM_1= 288;
+ /** Offset of the second checkpoint checksum */
+ constexpr uint CHECKSUM_2= CHECKSUM_1 + 4;
+ /** the checkpoint LSN field */
+ constexpr uint CHECKPOINT_LSN= 8;
+ /** Most significant bits of the checkpoint offset */
+ constexpr uint OFFS_HI= CHECKSUM_2 + 12;
+ /** Least significant bits of the checkpoint offset */
+ constexpr uint OFFS_LO= 16;
+
+ lsn_t lsn= 0;
+
+ for (ulint field= LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2;
+ field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1)
+ {
+ log_sys.log.read(field, {buf, OS_FILE_LOG_BLOCK_SIZE});
+
+ if (static_cast<uint32_t>(ut_fold_binary(buf, CHECKSUM_1)) !=
+ mach_read_from_4(buf + CHECKSUM_1) ||
+ static_cast<uint32_t>(ut_fold_binary(buf + CHECKPOINT_LSN,
+ CHECKSUM_2 - CHECKPOINT_LSN)) !=
+ mach_read_from_4(buf + CHECKSUM_2))
+ {
+ DBUG_LOG("ib_log", "invalid pre-10.2.2 checkpoint " << field);
+ continue;
+ }
+
+ if (!log_crypt_101_read_checkpoint(buf))
+ {
+ ib::error() << "Decrypting checkpoint failed";
+ continue;
+ }
+
+ const uint64_t checkpoint_no= mach_read_from_8(buf);
+
+ DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF " found",
+ checkpoint_no,
+ mach_read_from_8(buf + CHECKPOINT_LSN)));
+
+ if (checkpoint_no >= max_no)
+ {
+ max_no= checkpoint_no;
+ lsn= mach_read_from_8(buf + CHECKPOINT_LSN);
+ log_sys.log.set_lsn(lsn);
+ log_sys.log.set_lsn_offset(lsn_t{mach_read_from_4(buf + OFFS_HI)} << 32 |
+ mach_read_from_4(buf + OFFS_LO));
+ }
+ }
+
+ if (!lsn)
+ {
+ ib::error() << "Upgrade after a crash is not supported."
+ " This redo log was created before MariaDB 10.2.2,"
+ " and we did not find a valid checkpoint."
+ " Please follow the instructions at"
+ " https://mariadb.com/kb/en/library/upgrading/";
+ return DB_ERROR;
+ }
+
+ log_sys.set_lsn(lsn);
+ log_sys.set_flushed_lsn(lsn);
+ const lsn_t source_offset= log_sys.log.calc_lsn_offset_old(lsn);
+
+ static constexpr char NO_UPGRADE_RECOVERY_MSG[]=
+ "Upgrade after a crash is not supported."
+ " This redo log was created before MariaDB 10.2.2";
+
+ recv_sys.read(source_offset & ~511, {buf, 512});
+
+ if (log_block_calc_checksum_format_0(buf) != log_block_get_checksum(buf) &&
+ !log_crypt_101_read_block(buf, lsn))
+ {
+ ib::error() << NO_UPGRADE_RECOVERY_MSG << ", and it appears corrupted.";
+ return DB_CORRUPTION;
+ }
+
+ if (mach_read_from_2(buf + 4) == (source_offset & 511))
+ {
+ /* Mark the redo log for upgrading. */
+ srv_log_file_size= 0;
+ recv_sys.parse_start_lsn= recv_sys.recovered_lsn= recv_sys.scanned_lsn=
+ recv_sys.mlog_checkpoint_lsn = lsn;
+ log_sys.last_checkpoint_lsn= log_sys.next_checkpoint_lsn=
+ log_sys.write_lsn= log_sys.current_flush_lsn= lsn;
+ log_sys.next_checkpoint_no= 0;
+ return DB_SUCCESS;
+ }
+
+ if (buf[20 + 32 * 9] == 2)
+ ib::error() << "Cannot decrypt log for upgrading."
+ " The encrypted log was created before MariaDB 10.2.2.";
+ else
+ ib::error() << NO_UPGRADE_RECOVERY_MSG << ".";
+
+ return DB_ERROR;
+}
+
+/** Calculate the offset of a log sequence number
+in an old redo log file (during upgrade check).
+@param[in] lsn log sequence number
+@return byte offset within the log */
+inline lsn_t log_t::file::calc_lsn_offset_old(lsn_t lsn) const
+{
+ const lsn_t size= capacity() * recv_sys.files_size();
+ lsn_t l= lsn - this->lsn;
+ if (longlong(l) < 0)
+ {
+ l= lsn_t(-longlong(l)) % size;
+ l= size - l;
+ }
+
+ l+= lsn_offset - LOG_FILE_HDR_SIZE * (1 + lsn_offset / file_size);
+ l%= size;
+ return l + LOG_FILE_HDR_SIZE * (1 + l / (file_size - LOG_FILE_HDR_SIZE));
+}
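+
+/* Illustration (assuming capacity() == file_size - LOG_FILE_HDR_SIZE):
+the LSN is first mapped onto a circle of capacity() * files_size() bytes
+that excludes the per-file headers; the final line then converts the
+circular offset back to a physical offset by re-inserting one
+LOG_FILE_HDR_SIZE for each file boundary that was skipped. */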
+
+/** Determine if a redo log from MariaDB 10.2.2+, 10.3, or 10.4 is clean.
+@return error code
+@retval DB_SUCCESS if the redo log is clean
+@retval DB_CORRUPTION if the redo log is corrupted
+@retval DB_ERROR if the redo log is not empty */
+static dberr_t recv_log_recover_10_4()
+{
+ const lsn_t lsn = log_sys.log.get_lsn();
+ const lsn_t source_offset = log_sys.log.calc_lsn_offset_old(lsn);
+ byte* buf = log_sys.buf;
+
+ if (!redo_file_sizes_are_correct()) {
+ return DB_CORRUPTION;
+ }
+
+ recv_sys.read(source_offset & ~(OS_FILE_LOG_BLOCK_SIZE - 1),
+ {buf, OS_FILE_LOG_BLOCK_SIZE});
+
+ ulint crc = log_block_calc_checksum_crc32(buf);
+ ulint cksum = log_block_get_checksum(buf);
+
+ if (UNIV_UNLIKELY(crc != cksum)) {
+ ib::error() << "Invalid log block checksum."
+ << " block: "
+ << log_block_get_hdr_no(buf)
+ << " checkpoint no: "
+ << log_block_get_checkpoint_no(buf)
+ << " expected: " << crc
+ << " found: " << cksum;
+ return DB_CORRUPTION;
+ }
+
+ if (log_sys.log.is_encrypted()
+ && !log_crypt(buf, lsn & ~511, 512, LOG_DECRYPT)) {
+ return DB_ERROR;
+ }
+
+ /* On a clean shutdown, the redo log will be logically empty
+ after the checkpoint lsn. */
+
+ if (log_block_get_data_len(buf)
+ != (source_offset & (OS_FILE_LOG_BLOCK_SIZE - 1))) {
+ return DB_ERROR;
+ }
+
+ /* Mark the redo log for upgrading. */
+ srv_log_file_size = 0;
+ recv_sys.parse_start_lsn = recv_sys.recovered_lsn
+ = recv_sys.scanned_lsn
+ = recv_sys.mlog_checkpoint_lsn = lsn;
+ log_sys.set_lsn(lsn);
+ log_sys.set_flushed_lsn(lsn);
+ log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn
+ = log_sys.write_lsn = log_sys.current_flush_lsn = lsn;
+ log_sys.next_checkpoint_no = 0;
+ return DB_SUCCESS;
+}
+
+/** Find the latest checkpoint in the log header.
+@param[out] max_field LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2
+@return error code or DB_SUCCESS */
+dberr_t
+recv_find_max_checkpoint(ulint* max_field)
+{
+ ib_uint64_t max_no;
+ ib_uint64_t checkpoint_no;
+ ulint field;
+ byte* buf;
+
+ max_no = 0;
+ *max_field = 0;
+
+ buf = log_sys.checkpoint_buf;
+
+ log_sys.log.read(0, {buf, OS_FILE_LOG_BLOCK_SIZE});
+ /* Check the header page checksum. There was no
+ checksum in the first redo log format (version 0). */
+ log_sys.log.format = mach_read_from_4(buf + LOG_HEADER_FORMAT);
+ log_sys.log.subformat = log_sys.log.format != log_t::FORMAT_3_23
+ ? mach_read_from_4(buf + LOG_HEADER_SUBFORMAT)
+ : 0;
+ if (log_sys.log.format != log_t::FORMAT_3_23
+ && !recv_check_log_header_checksum(buf)) {
+ ib::error() << "Invalid redo log header checksum.";
+ return(DB_CORRUPTION);
+ }
+
+ char creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR + 1];
+
+ memcpy(creator, buf + LOG_HEADER_CREATOR, sizeof creator);
+ /* Ensure that the string is NUL-terminated. */
+ creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR] = 0;
+
+ switch (log_sys.log.format) {
+ case log_t::FORMAT_3_23:
+ return recv_log_recover_pre_10_2();
+ case log_t::FORMAT_10_2:
+ case log_t::FORMAT_10_2 | log_t::FORMAT_ENCRYPTED:
+ case log_t::FORMAT_10_3:
+ case log_t::FORMAT_10_3 | log_t::FORMAT_ENCRYPTED:
+ case log_t::FORMAT_10_4:
+ case log_t::FORMAT_10_4 | log_t::FORMAT_ENCRYPTED:
+ case log_t::FORMAT_10_5:
+ case log_t::FORMAT_10_5 | log_t::FORMAT_ENCRYPTED:
+ break;
+ default:
+ ib::error() << "Unsupported redo log format."
+ " The redo log was created with " << creator << ".";
+ return(DB_ERROR);
+ }
+
+ for (field = LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2;
+ field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) {
+ log_sys.log.read(field, {buf, OS_FILE_LOG_BLOCK_SIZE});
+
+ const ulint crc32 = log_block_calc_checksum_crc32(buf);
+ const ulint cksum = log_block_get_checksum(buf);
+
+ if (crc32 != cksum) {
+ DBUG_PRINT("ib_log",
+ ("invalid checkpoint,"
+ " at " ULINTPF
+ ", checksum " ULINTPFx
+ " expected " ULINTPFx,
+ field, cksum, crc32));
+ continue;
+ }
+
+ if (log_sys.is_encrypted()
+ && !log_crypt_read_checkpoint_buf(buf)) {
+ ib::error() << "Reading checkpoint"
+ " encryption info failed.";
+ continue;
+ }
+
+ checkpoint_no = mach_read_from_8(
+ buf + LOG_CHECKPOINT_NO);
+
+ DBUG_PRINT("ib_log",
+ ("checkpoint " UINT64PF " at " LSN_PF " found",
+ checkpoint_no, mach_read_from_8(
+ buf + LOG_CHECKPOINT_LSN)));
+
+ if (checkpoint_no >= max_no) {
+ *max_field = field;
+ max_no = checkpoint_no;
+ log_sys.log.set_lsn(mach_read_from_8(
+ buf + LOG_CHECKPOINT_LSN));
+ log_sys.log.set_lsn_offset(mach_read_from_8(
+ buf + LOG_CHECKPOINT_OFFSET));
+ log_sys.next_checkpoint_no = checkpoint_no;
+ }
+ }
+
+ if (*max_field == 0) {
+ /* Before 10.2.2, we could get here during database
+ initialization if we created an LOG_FILE_NAME file that
+ was filled with zeroes, and were killed. After
+ 10.2.2, we would reject such a file already earlier,
+ when checking the file header. */
+ ib::error() << "No valid checkpoint found"
+ " (corrupted redo log)."
+ " You can try --innodb-force-recovery=6"
+ " as a last resort.";
+ return(DB_ERROR);
+ }
+
+ switch (log_sys.log.format) {
+ case log_t::FORMAT_10_5:
+ case log_t::FORMAT_10_5 | log_t::FORMAT_ENCRYPTED:
+ break;
+ default:
+ if (dberr_t err = recv_log_recover_10_4()) {
+ ib::error()
+ << "Upgrade after a crash is not supported."
+ " The redo log was created with " << creator
+ << (err == DB_ERROR
+ ? "." : ", and it appears corrupted.");
+ return err;
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************//**
+Calculates the new value for lsn when more data is added to the log. */
+static
+lsn_t
+recv_calc_lsn_on_data_add(
+/*======================*/
+ lsn_t lsn, /*!< in: old lsn */
+ ib_uint64_t len) /*!< in: this many bytes of data is
+ added, log block headers not included */
+{
+ unsigned frag_len = static_cast<unsigned>(lsn % OS_FILE_LOG_BLOCK_SIZE)
+ - LOG_BLOCK_HDR_SIZE;
+ unsigned payload_size = log_sys.payload_size();
+ ut_ad(frag_len < payload_size);
+ lsn_t lsn_len = len;
+ lsn_len += (lsn_len + frag_len) / payload_size
+ * (OS_FILE_LOG_BLOCK_SIZE - payload_size);
+
+ return(lsn + lsn_len);
+}
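+
+/* For example, assuming 512-byte log blocks with a 12-byte header and a
+4-byte checksum trailer (payload_size == 496): if lsn points 100 payload
+bytes into a block and len == 500, the data spans one block boundary, so
+lsn advances by 500 + (512 - 496) = 516. */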
+
+/** Trim old log records for a page.
+@param start_lsn oldest log sequence number to preserve
+@return whether all the log for the page was trimmed */
+inline bool page_recv_t::trim(lsn_t start_lsn)
+{
+ while (log.head)
+ {
+ if (log.head->lsn >= start_lsn) return false;
+ last_offset= 1; /* the next record must not be same_page */
+ log_rec_t *next= log.head->next;
+ recv_sys.free(log.head);
+ log.head= next;
+ }
+ log.tail= nullptr;
+ return true;
+}
+
+
+inline void page_recv_t::recs_t::clear()
+{
+ ut_ad(mutex_own(&recv_sys.mutex));
+ for (const log_rec_t *l= head; l; )
+ {
+ const log_rec_t *next= l->next;
+ recv_sys.free(l);
+ l= next;
+ }
+ head= tail= nullptr;
+}
+
+
+/** Ignore any earlier redo log records for this page. */
+inline void page_recv_t::will_not_read()
+{
+ ut_ad(state == RECV_NOT_PROCESSED || state == RECV_WILL_NOT_READ);
+ state= RECV_WILL_NOT_READ;
+ log.clear();
+}
+
+
+/** Register a redo log snippet for a page.
+@param page_id page identifier
+@param start_lsn start LSN of the mini-transaction
+@param lsn @see mtr_t::commit_lsn()
+@param l redo log snippet @see log_t::FORMAT_10_5
+@param len length of l, in bytes */
+inline void recv_sys_t::add(const page_id_t page_id,
+ lsn_t start_lsn, lsn_t lsn, const byte *l,
+ size_t len)
+{
+ ut_ad(mutex_own(&mutex));
+ std::pair<map::iterator, bool> p= pages.emplace(map::value_type
+ (page_id, page_recv_t()));
+ page_recv_t& recs= p.first->second;
+ ut_ad(p.second == recs.log.empty());
+
+ switch (*l & 0x70) {
+ case FREE_PAGE: case INIT_PAGE:
+ recs.will_not_read();
+ mlog_init.add(page_id, start_lsn); /* FIXME: remove this! */
+ /* fall through */
+ default:
+ log_phys_t *tail= static_cast<log_phys_t*>(recs.log.last());
+ if (!tail)
+ break;
+ if (tail->start_lsn != start_lsn)
+ break;
+ ut_ad(tail->lsn == lsn);
+ buf_block_t *block= UT_LIST_GET_LAST(blocks);
+ ut_ad(block);
+ const size_t used= static_cast<uint16_t>(block->page.access_time - 1) + 1;
+ ut_ad(used >= ALIGNMENT);
+ const byte *end= const_cast<const log_phys_t*>(tail)->end();
+ if (!((reinterpret_cast<size_t>(end + len) ^
+ reinterpret_cast<size_t>(end)) & ~(ALIGNMENT - 1)))
+ {
+ /* Use already allocated 'padding' bytes */
+append:
+ MEM_MAKE_ADDRESSABLE(end + 1, len);
+ /* Append to the preceding record for the page */
+ tail->append(l, len);
+ return;
+ }
+ if (end <= &block->frame[used - ALIGNMENT] || &block->frame[used] >= end)
+ break; /* Not the last allocated record in the page */
+ const size_t new_used= static_cast<size_t>(end - block->frame + len + 1);
+ ut_ad(new_used > used);
+ if (new_used > srv_page_size)
+ break;
+ block->page.access_time= (block->page.access_time & ~0U << 16) |
+ ut_calc_align<uint16_t>(static_cast<uint16_t>(new_used), ALIGNMENT);
+ goto append;
+ }
+ recs.log.append(new (alloc(log_phys_t::alloc_size(len)))
+ log_phys_t(start_lsn, lsn, l, len));
+}
+
+/** Store/remove the freed pages in fil_name_t of recv_spaces.
+@param[in] page_id identifier of the freed or initialized page
+@param[in] freed true if the page is freed, false if initialized */
+static void store_freed_or_init_rec(page_id_t page_id, bool freed)
+{
+ uint32_t space_id= page_id.space();
+ uint32_t page_no= page_id.page_no();
+ if (is_predefined_tablespace(space_id))
+ {
+ if (!srv_immediate_scrub_data_uncompressed)
+ return;
+ fil_space_t *space;
+ if (space_id == TRX_SYS_SPACE)
+ space= fil_system.sys_space;
+ else
+ space= fil_space_get(space_id);
+
+ space->free_page(page_no, freed);
+ return;
+ }
+
+ recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id);
+ if (i != recv_spaces.end() && i->first == space_id)
+ {
+ if (freed)
+ i->second.add_freed_page(page_no);
+ else
+ i->second.remove_freed_page(page_no);
+ }
+}
+
+/** Parse and register one mini-transaction in log_t::FORMAT_10_5.
+@param checkpoint_lsn the log sequence number of the latest checkpoint
+@param store whether to store the records
+@param apply whether to apply file-level log records
+@return whether FILE_CHECKPOINT record was seen the first time,
+or corruption was noticed */
+bool recv_sys_t::parse(lsn_t checkpoint_lsn, store_t *store, bool apply)
+{
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ ut_ad(mutex_own(&mutex));
+ ut_ad(parse_start_lsn);
+ ut_ad(log_sys.is_physical());
+
+ bool last_phase= (*store == STORE_IF_EXISTS);
+ const byte *const end= buf + len;
+loop:
+ const byte *const log= buf + recovered_offset;
+ const lsn_t start_lsn= recovered_lsn;
+
+ /* Check that the entire mini-transaction is included within the buffer */
+ const byte *l;
+ uint32_t rlen;
+ for (l= log; l < end; l+= rlen)
+ {
+ if (!*l)
+ goto eom_found;
+ if (UNIV_LIKELY((*l & 0x70) != RESERVED));
+ else if (srv_force_recovery)
+ ib::warn() << "Ignoring unknown log record at LSN " << recovered_lsn;
+ else
+ {
+malformed:
+ ib::error() << "Malformed log record;"
+ " set innodb_force_recovery=1 to ignore.";
+corrupted:
+ const size_t trailing_bytes= std::min<size_t>(100, size_t(end - l));
+ ib::info() << "Dump from the start of the mini-transaction (LSN="
+ << start_lsn << ") to "
+ << trailing_bytes << " bytes after the record:";
+ ut_print_buf(stderr, log, l - log + trailing_bytes);
+ putc('\n', stderr);
+ found_corrupt_log= true;
+ return true;
+ }
+ rlen= *l++ & 0xf;
+ if (l + (rlen ? rlen : 16) >= end)
+ break;
+ if (!rlen)
+ {
+ rlen= mlog_decode_varint_length(*l);
+ if (l + rlen >= end)
+ break;
+ const uint32_t addlen= mlog_decode_varint(l);
+ if (UNIV_UNLIKELY(addlen == MLOG_DECODE_ERROR))
+ {
+ ib::error() << "Corrupted record length";
+ goto corrupted;
+ }
+ rlen= addlen + 15;
+ }
+ }
+
+ /* Not the entire mini-transaction was present. */
+ return false;
+
+eom_found:
+ ut_ad(!*l);
+ ut_d(const byte *const el= l + 1);
+
+ const lsn_t end_lsn= recv_calc_lsn_on_data_add(start_lsn, l + 1 - log);
+ if (UNIV_UNLIKELY(end_lsn > scanned_lsn))
+ /* The log record filled a log block, and we require that the
+ next log block has also been scanned in */
+ return false;
+
+ ut_d(std::set<page_id_t> freed);
+#if 0 && defined UNIV_DEBUG /* MDEV-21727 FIXME: enable this */
+ /* Pages that have been modified in this mini-transaction.
+ If a mini-transaction writes INIT_PAGE for a page, it should not have
+ written any log records for the page. Unfortunately, this does not
+ hold for ROW_FORMAT=COMPRESSED pages, because page_zip_compress()
+ can be invoked in a pessimistic operation, even after log has
+ been written for other pages. */
+ ut_d(std::set<page_id_t> modified);
+#endif
+
+ uint32_t space_id= 0, page_no= 0, last_offset= 0;
+#if 1 /* MDEV-14425 FIXME: remove this */
+ bool got_page_op= false;
+#endif
+ for (l= log; l < end; l+= rlen)
+ {
+ const byte *const recs= l;
+ const byte b= *l++;
+
+ if (!b)
+ break;
+ ut_ad(UNIV_LIKELY(b & 0x70) != RESERVED || srv_force_recovery);
+ rlen= b & 0xf;
+ ut_ad(l + rlen < end);
+ ut_ad(rlen || l + 16 < end);
+ if (!rlen)
+ {
+ const uint32_t lenlen= mlog_decode_varint_length(*l);
+ ut_ad(l + lenlen < end);
+ const uint32_t addlen= mlog_decode_varint(l);
+ ut_ad(addlen != MLOG_DECODE_ERROR);
+ rlen= addlen + 15 - lenlen;
+ l+= lenlen;
+ }
+ ut_ad(l + rlen < end);
+ uint32_t idlen;
+ if ((b & 0x80) && got_page_op)
+ {
+ /* This record is for the same page as the previous one. */
+ if (UNIV_UNLIKELY((b & 0x70) <= INIT_PAGE))
+ {
+record_corrupted:
+ /* FREE_PAGE,INIT_PAGE cannot be with same_page flag */
+ if (!srv_force_recovery)
+ goto malformed;
+ ib::warn() << "Ignoring malformed log record at LSN " << recovered_lsn;
+ last_offset= 1; /* the next record must not be same_page */
+ continue;
+ }
+ goto same_page;
+ }
+ last_offset= 0;
+ idlen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(idlen > 5 || idlen >= rlen))
+ {
+page_id_corrupted:
+ if (!srv_force_recovery)
+ {
+ ib::error() << "Corrupted page identifier at " << recovered_lsn
+ << "; set innodb_force_recovery=1 to ignore the record.";
+ goto corrupted;
+ }
+ ib::warn() << "Ignoring corrupted page identifier at LSN "
+ << recovered_lsn;
+ continue;
+ }
+ space_id= mlog_decode_varint(l);
+ if (UNIV_UNLIKELY(space_id == MLOG_DECODE_ERROR))
+ goto page_id_corrupted;
+ l+= idlen;
+ rlen-= idlen;
+ idlen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(idlen > 5 || idlen > rlen))
+ goto page_id_corrupted;
+ page_no= mlog_decode_varint(l);
+ if (UNIV_UNLIKELY(page_no == MLOG_DECODE_ERROR))
+ goto page_id_corrupted;
+ l+= idlen;
+ rlen-= idlen;
+ got_page_op = !(b & 0x80);
+ if (got_page_op && apply && !is_predefined_tablespace(space_id))
+ {
+ recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id);
+ if (i != recv_spaces.end() && i->first == space_id);
+ else if (recovered_lsn < mlog_checkpoint_lsn)
+ /* We have not seen all records between the checkpoint and
+ FILE_CHECKPOINT. There should be a FILE_DELETE for this
+ tablespace later. */
+ recv_spaces.emplace_hint(i, space_id, file_name_t("", false));
+ else
+ {
+ const page_id_t id(space_id, page_no);
+ if (!srv_force_recovery)
+ {
+ ib::error() << "Missing FILE_DELETE or FILE_MODIFY for " << id
+ << " at " << recovered_lsn
+ << "; set innodb_force_recovery=1 to ignore the record.";
+ goto corrupted;
+ }
+ ib::warn() << "Ignoring record for " << id << " at " << recovered_lsn;
+ continue;
+ }
+ }
+same_page:
+ DBUG_PRINT("ib_log",
+ ("scan " LSN_PF ": rec %x len %zu page %u:%u",
+ recovered_lsn, b, static_cast<size_t>(l + rlen - recs),
+ space_id, page_no));
+
+ if (got_page_op)
+ {
+ const page_id_t id(space_id, page_no);
+ ut_d(if ((b & 0x70) == INIT_PAGE) freed.erase(id));
+ ut_ad(freed.find(id) == freed.end());
+ switch (b & 0x70) {
+ case FREE_PAGE:
+ ut_ad(freed.emplace(id).second);
+ last_offset= 1; /* the next record must not be same_page */
+ goto free_or_init_page;
+ case INIT_PAGE:
+ last_offset= FIL_PAGE_TYPE;
+ free_or_init_page:
+ store_freed_or_init_rec(id, (b & 0x70) == FREE_PAGE);
+ if (UNIV_UNLIKELY(rlen != 0))
+ goto record_corrupted;
+ break;
+ case EXTENDED:
+ if (UNIV_UNLIKELY(!rlen))
+ goto record_corrupted;
+ if (rlen == 1 && *l == TRIM_PAGES)
+ {
+#if 0 /* For now, we can only truncate an undo log tablespace */
+ if (UNIV_UNLIKELY(!space_id || !page_no))
+ goto record_corrupted;
+#else
+ if (!srv_is_undo_tablespace(space_id) ||
+ page_no != SRV_UNDO_TABLESPACE_SIZE_IN_PAGES)
+ goto record_corrupted;
+ static_assert(UT_ARR_SIZE(truncated_undo_spaces) ==
+ TRX_SYS_MAX_UNDO_SPACES, "compatibility");
+ truncated_undo_spaces[space_id - srv_undo_space_id_start]=
+ { recovered_lsn, page_no };
+#endif
+ last_offset= 1; /* the next record must not be same_page */
+ continue;
+ }
+ last_offset= FIL_PAGE_TYPE;
+ break;
+ case RESERVED:
+ case OPTION:
+ continue;
+ case WRITE:
+ case MEMMOVE:
+ case MEMSET:
+ if (UNIV_UNLIKELY(rlen == 0 || last_offset == 1))
+ goto record_corrupted;
+ const uint32_t olen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3))
+ goto record_corrupted;
+ const uint32_t offset= mlog_decode_varint(l);
+ ut_ad(offset != MLOG_DECODE_ERROR);
+ static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
+ if (UNIV_UNLIKELY(offset >= srv_page_size))
+ goto record_corrupted;
+ last_offset+= offset;
+ if (UNIV_UNLIKELY(last_offset < 8 || last_offset >= srv_page_size))
+ goto record_corrupted;
+ l+= olen;
+ rlen-= olen;
+ if ((b & 0x70) == WRITE)
+ {
+ if (UNIV_UNLIKELY(rlen + last_offset > srv_page_size))
+ goto record_corrupted;
+ if (UNIV_UNLIKELY(!page_no) && apply)
+ {
+ const bool has_size= last_offset <= FSP_HEADER_OFFSET + FSP_SIZE &&
+ last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SIZE + 4;
+ const bool has_flags= last_offset <=
+ FSP_HEADER_OFFSET + FSP_SPACE_FLAGS &&
+ last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + 4;
+ if (has_size || has_flags)
+ {
+ recv_spaces_t::iterator it= recv_spaces.find(space_id);
+ const uint32_t size= has_size
+ ? mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + l -
+ last_offset)
+ : 0;
+ const uint32_t flags= has_flags
+ ? mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + l -
+ last_offset)
+ : file_name_t::initial_flags;
+ if (it == recv_spaces.end())
+ ut_ad(!mlog_checkpoint_lsn || space_id == TRX_SYS_SPACE ||
+ srv_is_undo_tablespace(space_id));
+ else if (!it->second.space)
+ {
+ if (has_size)
+ it->second.size= size;
+ if (has_flags)
+ it->second.flags= flags;
+ }
+ fil_space_set_recv_size_and_flags(space_id, size, flags);
+ }
+ }
+ last_offset+= rlen;
+ break;
+ }
+ uint32_t llen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(llen > rlen || llen > 3))
+ goto record_corrupted;
+ const uint32_t len= mlog_decode_varint(l);
+ ut_ad(len != MLOG_DECODE_ERROR);
+ if (UNIV_UNLIKELY(last_offset + len > srv_page_size))
+ goto record_corrupted;
+ l+= llen;
+ rlen-= llen;
+ llen= len;
+ if ((b & 0x70) == MEMSET)
+ {
+ if (UNIV_UNLIKELY(rlen > llen))
+ goto record_corrupted;
+ last_offset+= llen;
+ break;
+ }
+ const uint32_t slen= mlog_decode_varint_length(*l);
+ if (UNIV_UNLIKELY(slen != rlen || slen > 3))
+ goto record_corrupted;
+ uint32_t s= mlog_decode_varint(l);
+ ut_ad(s != MLOG_DECODE_ERROR);
+ if (s & 1)
+ s= last_offset - (s >> 1) - 1;
+ else
+ s= last_offset + (s >> 1) + 1;
+ if (UNIV_UNLIKELY(s < 8 || s + llen > srv_page_size))
+ goto record_corrupted;
+ last_offset+= llen;
+ break;
+ }
+#if 0 && defined UNIV_DEBUG
+ switch (b & 0x70) {
+ case RESERVED:
+ case OPTION:
+ ut_ad(0); /* we did "continue" earlier */
+ break;
+ case FREE_PAGE:
+ break;
+ default:
+ ut_ad(modified.emplace(id).second || (b & 0x70) != INIT_PAGE);
+ }
+#endif
+ const bool is_init= (b & 0x70) <= INIT_PAGE;
+ switch (*store) {
+ case STORE_IF_EXISTS:
+ if (fil_space_t *space= fil_space_t::get(space_id))
+ {
+ const auto size= space->get_size();
+ space->release();
+ if (!size)
+ continue;
+ }
+ else
+ continue;
+ /* fall through */
+ case STORE_YES:
+ if (!mlog_init.will_avoid_read(id, start_lsn))
+ add(id, start_lsn, end_lsn, recs,
+ static_cast<size_t>(l + rlen - recs));
+ continue;
+ case STORE_NO:
+ if (!is_init)
+ continue;
+ mlog_init.add(id, start_lsn);
+ map::iterator i= pages.find(id);
+ if (i == pages.end())
+ continue;
+ i->second.log.clear();
+ pages.erase(i);
+ }
+ }
+#if 1 /* MDEV-14425 FIXME: this must be in the checkpoint file only! */
+ else if (rlen)
+ {
+ switch (b & 0xf0) {
+# if 1 /* MDEV-14425 FIXME: Remove this! */
+ case FILE_CHECKPOINT:
+ if (space_id == 0 && page_no == 0 && rlen == 8)
+ {
+ const lsn_t lsn= mach_read_from_8(l);
+
+ if (UNIV_UNLIKELY(srv_print_verbose_log == 2))
+ fprintf(stderr, "FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF "\n",
+ lsn, lsn != checkpoint_lsn
+ ? "ignored"
+ : mlog_checkpoint_lsn ? "reread" : "read",
+ recovered_lsn);
+
+ DBUG_PRINT("ib_log", ("FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF,
+ lsn, lsn != checkpoint_lsn
+ ? "ignored"
+ : mlog_checkpoint_lsn ? "reread" : "read",
+ recovered_lsn));
+
+ if (lsn == checkpoint_lsn)
+ {
+ /* There can be multiple FILE_CHECKPOINT for the same LSN. */
+ if (mlog_checkpoint_lsn)
+ continue;
+ mlog_checkpoint_lsn= recovered_lsn;
+ l+= 8;
+ recovered_offset= l - buf;
+ return true;
+ }
+ continue;
+ }
+# endif
+ /* fall through */
+ default:
+ if (!srv_force_recovery)
+ goto malformed;
+ ib::warn() << "Ignoring malformed log record at LSN " << recovered_lsn;
+ continue;
+ case FILE_DELETE:
+ case FILE_MODIFY:
+ case FILE_RENAME:
+ if (UNIV_UNLIKELY(page_no != 0))
+ {
+ file_rec_error:
+ if (!srv_force_recovery)
+ {
+ ib::error() << "Corrupted file-level record;"
+ " set innodb_force_recovery=1 to ignore.";
+ goto corrupted;
+ }
+
+ ib::warn() << "Ignoring corrupted file-level record at LSN "
+ << recovered_lsn;
+ continue;
+ }
+ /* fall through */
+ case FILE_CREATE:
+ if (UNIV_UNLIKELY(!space_id || page_no))
+ goto file_rec_error;
+ /* There is no terminating NUL character. Names must end in .ibd.
+ For FILE_RENAME, there is a NUL between the two file names. */
+ const char * const fn= reinterpret_cast<const char*>(l);
+ const char *fn2= static_cast<const char*>(memchr(fn, 0, rlen));
+
+ if (UNIV_UNLIKELY((fn2 == nullptr) == ((b & 0xf0) == FILE_RENAME)))
+ goto file_rec_error;
+
+ const char * const fnend= fn2 ? fn2 : fn + rlen;
+ const char * const fn2end= fn2 ? fn + rlen : nullptr;
+
+ if (fn2)
+ {
+ fn2++;
+ if (memchr(fn2, 0, fn2end - fn2))
+ goto file_rec_error;
+ if (fn2end - fn2 < 4 || memcmp(fn2end - 4, DOT_IBD, 4))
+ goto file_rec_error;
+ }
+
+ if (is_predefined_tablespace(space_id))
+ goto file_rec_error;
+ if (fnend - fn < 4 || memcmp(fnend - 4, DOT_IBD, 4))
+ goto file_rec_error;
+
+ const char saved_end= fn[rlen];
+ const_cast<char&>(fn[rlen])= '\0';
+ fil_name_process(const_cast<char*>(fn), fnend - fn, space_id,
+ (b & 0xf0) == FILE_DELETE);
+ if (fn2)
+ fil_name_process(const_cast<char*>(fn2), fn2end - fn2, space_id,
+ false);
+ if ((b & 0xf0) < FILE_MODIFY && log_file_op)
+ log_file_op(space_id, (b & 0xf0) == FILE_CREATE,
+ l, static_cast<ulint>(fnend - fn),
+ reinterpret_cast<const byte*>(fn2),
+ fn2 ? static_cast<ulint>(fn2end - fn2) : 0);
+ const_cast<char&>(fn[rlen])= saved_end;
+
+ if (fn2 && apply)
+ {
+ const size_t len= fn2end - fn2;
+ auto r= renamed_spaces.emplace(space_id, std::string{fn2, len});
+ if (!r.second)
+ r.first->second= std::string{fn2, len};
+ }
+ if (UNIV_UNLIKELY(found_corrupt_fs))
+ return true;
+ }
+ }
+#endif
+ else
+ goto malformed;
+ }
+
+ ut_ad(l == el);
+ recovered_offset= l - buf;
+ recovered_lsn= end_lsn;
+ if (is_memory_exhausted(store) && last_phase)
+ return false;
+ goto loop;
+}
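+
+/* parse() above makes two passes over each mini-transaction: the first
+loop only verifies that the whole mini-transaction (up to the 0x00
+terminator) is present in the buffer, and the second loop dispatches on
+each record, storing page records via add() and applying file-level
+records immediately. */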
+
+/** Apply the buffered log records to the page, if the page LSN is less
+than the LSN of a log record.
+@param[in,out] block buffer pool page
+@param[in,out] mtr mini-transaction
+@param[in,out] p iterator to the buffered log records of the page
+@param[in,out] space tablespace, or NULL if not looked up yet
+@param[in,out] init page initialization operation, or NULL */
+static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
+ const recv_sys_t::map::iterator& p,
+ fil_space_t* space = NULL,
+ mlog_init_t::init* init = NULL)
+{
+ ut_ad(mutex_own(&recv_sys.mutex));
+ ut_ad(recv_sys.apply_log_recs);
+ ut_ad(recv_needed_recovery);
+ ut_ad(!init || init->created);
+ ut_ad(!init || init->lsn);
+ ut_ad(block->page.id() == p->first);
+ ut_ad(!p->second.is_being_processed());
+ ut_ad(!space || space->id == block->page.id().space());
+ ut_ad(log_sys.is_physical());
+
+ if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
+ ib::info() << "Applying log to page " << block->page.id();
+ }
+
+ DBUG_PRINT("ib_log", ("Applying log to page %u:%u",
+ block->page.id().space(),
+ block->page.id().page_no()));
+
+ p->second.state = page_recv_t::RECV_BEING_PROCESSED;
+
+ mutex_exit(&recv_sys.mutex);
+
+ byte *frame = UNIV_LIKELY_NULL(block->page.zip.data)
+ ? block->page.zip.data
+ : block->frame;
+ const lsn_t page_lsn = init
+ ? 0
+ : mach_read_from_8(frame + FIL_PAGE_LSN);
+ bool free_page = false;
+ lsn_t start_lsn = 0, end_lsn = 0;
+ ut_d(lsn_t recv_start_lsn = 0);
+ const lsn_t init_lsn = init ? init->lsn : 0;
+
+ bool skipped_after_init = false;
+
+ for (const log_rec_t* recv : p->second.log) {
+ const log_phys_t* l = static_cast<const log_phys_t*>(recv);
+ ut_ad(l->lsn);
+ ut_ad(end_lsn <= l->lsn);
+ ut_ad(l->lsn <= log_sys.log.scanned_lsn);
+
+ ut_ad(l->start_lsn);
+ ut_ad(recv_start_lsn <= l->start_lsn);
+ ut_d(recv_start_lsn = l->start_lsn);
+
+ if (l->start_lsn < page_lsn) {
+ /* This record has already been applied. */
+ DBUG_PRINT("ib_log", ("apply skip %u:%u LSN " LSN_PF
+ " < " LSN_PF,
+ block->page.id().space(),
+ block->page.id().page_no(),
+ l->start_lsn, page_lsn));
+ skipped_after_init = true;
+ end_lsn = l->lsn;
+ continue;
+ }
+
+ if (l->start_lsn < init_lsn) {
+ DBUG_PRINT("ib_log", ("init skip %u:%u LSN " LSN_PF
+ " < " LSN_PF,
+ block->page.id().space(),
+ block->page.id().page_no(),
+ l->start_lsn, init_lsn));
+ skipped_after_init = false;
+ end_lsn = l->lsn;
+ continue;
+ }
+
+ /* There is no need to check LSN for just initialized pages. */
+ if (skipped_after_init) {
+ skipped_after_init = false;
+ ut_ad(end_lsn == page_lsn);
+ if (end_lsn != page_lsn)
+ ib::warn()
+ << "The last skipped log record LSN "
+ << end_lsn
+ << " is not equal to page LSN "
+ << page_lsn;
+ }
+
+ end_lsn = l->lsn;
+
+ if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
+ ib::info() << "apply " << l->start_lsn
+ << ": " << block->page.id();
+ }
+
+ DBUG_PRINT("ib_log", ("apply " LSN_PF ": %u:%u",
+ l->start_lsn,
+ block->page.id().space(),
+ block->page.id().page_no()));
+
+ log_phys_t::apply_status a= l->apply(*block,
+ p->second.last_offset);
+
+ switch (a) {
+ case log_phys_t::APPLIED_NO:
+ ut_ad(!mtr.has_modifications());
+ free_page = true;
+ start_lsn = 0;
+ continue;
+ case log_phys_t::APPLIED_YES:
+ goto set_start_lsn;
+ case log_phys_t::APPLIED_TO_FSP_HEADER:
+ case log_phys_t::APPLIED_TO_ENCRYPTION:
+ break;
+ }
+
+ if (fil_space_t* s = space
+ ? space
+ : fil_space_t::get(block->page.id().space())) {
+ switch (a) {
+ case log_phys_t::APPLIED_TO_FSP_HEADER:
+ s->flags = mach_read_from_4(
+ FSP_HEADER_OFFSET
+ + FSP_SPACE_FLAGS + frame);
+ s->size_in_header = mach_read_from_4(
+ FSP_HEADER_OFFSET + FSP_SIZE
+ + frame);
+ s->free_limit = mach_read_from_4(
+ FSP_HEADER_OFFSET
+ + FSP_FREE_LIMIT + frame);
+ s->free_len = mach_read_from_4(
+ FSP_HEADER_OFFSET + FSP_FREE
+ + FLST_LEN + frame);
+ break;
+ default:
+ byte* b= frame
+ + fsp_header_get_encryption_offset(
+ block->zip_size())
+ + FSP_HEADER_OFFSET;
+ if (memcmp(b, CRYPT_MAGIC, MAGIC_SZ)) {
+ break;
+ }
+ b += MAGIC_SZ;
+ if (*b != CRYPT_SCHEME_UNENCRYPTED
+ && *b != CRYPT_SCHEME_1) {
+ break;
+ }
+ if (b[1] != MY_AES_BLOCK_SIZE) {
+ break;
+ }
+ if (b[2 + MY_AES_BLOCK_SIZE + 4 + 4]
+ > FIL_ENCRYPTION_OFF) {
+ break;
+ }
+ fil_crypt_parse(s, b);
+ }
+
+ if (!space) {
+ s->release();
+ }
+ }
+
+set_start_lsn:
+ if (recv_sys.found_corrupt_log && !srv_force_recovery) {
+ break;
+ }
+
+ if (!start_lsn) {
+ start_lsn = l->start_lsn;
+ }
+ }
+
+ if (start_lsn) {
+ ut_ad(end_lsn >= start_lsn);
+ mach_write_to_8(FIL_PAGE_LSN + frame, end_lsn);
+ if (UNIV_LIKELY(frame == block->frame)) {
+ mach_write_to_8(srv_page_size
+ - FIL_PAGE_END_LSN_OLD_CHKSUM
+ + frame, end_lsn);
+ } else {
+ buf_zip_decompress(block, false);
+ }
+
+ buf_block_modify_clock_inc(block);
+ mysql_mutex_lock(&log_sys.flush_order_mutex);
+ buf_flush_note_modification(block, start_lsn, end_lsn);
+ mysql_mutex_unlock(&log_sys.flush_order_mutex);
+ } else if (free_page && init) {
+ /* There have been no operations that modify the page.
+ Any buffered changes must not be merged. A subsequent
+ buf_page_create() from a user thread should discard
+ any buffered changes. */
+ init->created = false;
+ ut_ad(!mtr.has_modifications());
+ block->page.status = buf_page_t::FREED;
+ }
+
+ /* Make sure that committing the mtr does not change the
+ modification lsn values of the page */
+
+ mtr.discard_modifications();
+ mtr.commit();
+
+ time_t now = time(NULL);
+
+ mutex_enter(&recv_sys.mutex);
+
+ if (recv_max_page_lsn < page_lsn) {
+ recv_max_page_lsn = page_lsn;
+ }
+
+ ut_ad(p->second.is_being_processed());
+ ut_ad(!recv_sys.pages.empty());
+
+ if (recv_sys.report(now)) {
+ const ulint n = recv_sys.pages.size();
+ ib::info() << "To recover: " << n << " pages from log";
+ service_manager_extend_timeout(
+ INNODB_EXTEND_TIMEOUT_INTERVAL, "To recover: " ULINTPF " pages from log", n);
+ }
+}
+
+/** Remove records for a corrupted page.
+This function should only be called when innodb_force_recovery is set.
+@param page_id corrupted page identifier */
+ATTRIBUTE_COLD void recv_sys_t::free_corrupted_page(page_id_t page_id)
+{
+ mutex_enter(&mutex);
+ map::iterator p= pages.find(page_id);
+ if (p != pages.end())
+ {
+ p->second.log.clear();
+ pages.erase(p);
+ }
+ mutex_exit(&mutex);
+}
+
+/** Apply any buffered redo log to a page that was just read from a data file.
+@param[in,out] space tablespace
+@param[in,out] bpage buffer pool page */
+void recv_recover_page(fil_space_t* space, buf_page_t* bpage)
+{
+ mtr_t mtr;
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+ buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
+
+ /* Move the ownership of the x-latch on the page to
+ this OS thread, so that we can acquire a second
+ x-latch on it. This is needed for the operations on
+ the page to pass the debug checks. */
+ rw_lock_x_lock_move_ownership(&block->lock);
+ buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+ rw_lock_x_lock(&block->lock);
+ mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
+
+ mutex_enter(&recv_sys.mutex);
+ if (recv_sys.apply_log_recs) {
+ recv_sys_t::map::iterator p = recv_sys.pages.find(bpage->id());
+ if (p != recv_sys.pages.end()
+ && !p->second.is_being_processed()) {
+ recv_recover_page(block, mtr, p, space);
+ p->second.log.clear();
+ recv_sys.pages.erase(p);
+ goto func_exit;
+ }
+ }
+
+ mtr.commit();
+func_exit:
+ mutex_exit(&recv_sys.mutex);
+ ut_ad(mtr.has_committed());
+}
+
+/** Reads in pages which have hashed log records, from an area around a given
+page number.
+@param[in] page_id page id */
+static void recv_read_in_area(page_id_t page_id)
+{
+ uint32_t page_nos[RECV_READ_AHEAD_AREA];
+ compile_time_assert(ut_is_2pow(RECV_READ_AHEAD_AREA));
+ page_id.set_page_no(ut_2pow_round(page_id.page_no(),
+ RECV_READ_AHEAD_AREA));
+ const ulint up_limit = page_id.page_no() + RECV_READ_AHEAD_AREA;
+ uint32_t* p = page_nos;
+
+ for (recv_sys_t::map::iterator i= recv_sys.pages.lower_bound(page_id);
+ i != recv_sys.pages.end()
+ && i->first.space() == page_id.space()
+ && i->first.page_no() < up_limit; i++) {
+ if (i->second.state == page_recv_t::RECV_NOT_PROCESSED
+ && !buf_pool.page_hash_contains(i->first)) {
+ i->second.state = page_recv_t::RECV_BEING_READ;
+ *p++ = i->first.page_no();
+ }
+ }
+
+ if (p != page_nos) {
+ mutex_exit(&recv_sys.mutex);
+ buf_read_recv_pages(page_id.space(), page_nos,
+ ulint(p - page_nos));
+ mutex_enter(&recv_sys.mutex);
+ }
+}
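+
+/* Example (illustrative): assuming RECV_READ_AHEAD_AREA == 32, a
+request for page 100 rounds down to page 96, and any pages 96..127 of
+the same tablespace that have buffered records and are not yet in the
+buffer pool are submitted as one read-ahead batch. */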
+
+/** Attempt to initialize a page based on redo log records.
+@param page_id page identifier
+@param p iterator pointing to page_id
+@param mtr mini-transaction
+@param b pre-allocated buffer pool block
+@return whether the page was successfully initialized */
+inline buf_block_t *recv_sys_t::recover_low(const page_id_t page_id,
+ map::iterator &p, mtr_t &mtr,
+ buf_block_t *b)
+{
+ ut_ad(mutex_own(&mutex));
+ ut_ad(p->first == page_id);
+ page_recv_t &recs= p->second;
+ ut_ad(recs.state == page_recv_t::RECV_WILL_NOT_READ);
+ buf_block_t* block= nullptr;
+ mlog_init_t::init &i= mlog_init.last(page_id);
+ const lsn_t end_lsn = recs.log.last()->lsn;
+ if (end_lsn < i.lsn)
+ DBUG_LOG("ib_log", "skip log for page " << page_id
+ << " LSN " << end_lsn << " < " << i.lsn);
+ else if (fil_space_t *space= fil_space_t::get(page_id.space()))
+ {
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ block= buf_page_create(space, page_id.page_no(), space->zip_size(), &mtr,
+ b);
+ if (UNIV_UNLIKELY(block != b))
+ {
+ /* The page happened to exist in the buffer pool, or it was just
+ being read in. Before buf_page_get_with_no_latch() returned to
+ buf_page_create(), all changes must have been applied to the
+ page already. */
+ ut_ad(recv_sys.pages.find(page_id) == recv_sys.pages.end());
+ mtr.commit();
+ block= nullptr;
+ }
+ else
+ {
+ ut_ad(&recs == &recv_sys.pages.find(page_id)->second);
+ i.created= true;
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+ recv_recover_page(block, mtr, p, space, &i);
+ ut_ad(mtr.has_committed());
+ recs.log.clear();
+ map::iterator r= p++;
+ recv_sys.pages.erase(r);
+ }
+ space->release();
+ }
+
+ return block;
+}
+
+/** Attempt to initialize a page based on redo log records.
+@param page_id page identifier
+@return whether the page was successfully initialized */
+buf_block_t *recv_sys_t::recover_low(const page_id_t page_id)
+{
+ buf_block_t *free_block= buf_LRU_get_free_block(false);
+ buf_block_t *block= nullptr;
+
+ mutex_enter(&mutex);
+ map::iterator p= pages.find(page_id);
+
+ if (p != pages.end() && p->second.state == page_recv_t::RECV_WILL_NOT_READ)
+ {
+ mtr_t mtr;
+ block= recover_low(page_id, p, mtr, free_block);
+ ut_ad(!block || block == free_block);
+ }
+
+ mutex_exit(&mutex);
+ if (UNIV_UNLIKELY(!block))
+ buf_pool.free_block(free_block);
+ return block;
+}
+
+/** Apply buffered log to persistent data pages.
+@param last_batch whether it is possible to write more redo log */
+void recv_sys_t::apply(bool last_batch)
+{
+ ut_ad(srv_operation == SRV_OPERATION_NORMAL ||
+ srv_operation == SRV_OPERATION_RESTORE ||
+ srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+
+ mutex_enter(&mutex);
+
+ while (apply_batch_on)
+ {
+ bool abort= found_corrupt_log;
+ mutex_exit(&mutex);
+
+ if (abort)
+ return;
+
+ os_thread_sleep(500000);
+ mutex_enter(&mutex);
+ }
+
+#ifdef SAFE_MUTEX
+ DBUG_ASSERT(!last_batch == mysql_mutex_is_owner(&log_sys.mutex));
+#endif /* SAFE_MUTEX */
+
+ recv_no_ibuf_operations = !last_batch ||
+ srv_operation == SRV_OPERATION_RESTORE ||
+ srv_operation == SRV_OPERATION_RESTORE_EXPORT;
+
+ mtr_t mtr;
+
+ if (!pages.empty())
+ {
+ const char *msg= last_batch
+ ? "Starting final batch to recover "
+ : "Starting a batch to recover ";
+ const ulint n= pages.size();
+ ib::info() << msg << n << " pages from redo log.";
+ sd_notifyf(0, "STATUS=%s" ULINTPF " pages from redo log", msg, n);
+
+ apply_log_recs= true;
+ apply_batch_on= true;
+
+ for (auto id= srv_undo_tablespaces_open; id--;)
+ {
+ const trunc& t= truncated_undo_spaces[id];
+ if (t.lsn)
+ trim(page_id_t(id + srv_undo_space_id_start, t.pages), t.lsn);
+ }
+
+ fil_system.extend_to_recv_size();
+
+ buf_block_t *free_block= buf_LRU_get_free_block(false);
+
+ for (map::iterator p= pages.begin(); p != pages.end(); )
+ {
+ const page_id_t page_id= p->first;
+ page_recv_t &recs= p->second;
+ ut_ad(!recs.log.empty());
+
+ switch (recs.state) {
+ case page_recv_t::RECV_BEING_READ:
+ case page_recv_t::RECV_BEING_PROCESSED:
+ p++;
+ continue;
+ case page_recv_t::RECV_WILL_NOT_READ:
+ if (UNIV_LIKELY(!!recover_low(page_id, p, mtr, free_block)))
+ {
+ mutex_exit(&mutex);
+ free_block= buf_LRU_get_free_block(false);
+ mutex_enter(&mutex);
+next_page:
+ p= pages.lower_bound(page_id);
+ }
+ continue;
+ case page_recv_t::RECV_NOT_PROCESSED:
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ if (buf_block_t *block= buf_page_get_low(page_id, 0, RW_X_LATCH,
+ nullptr, BUF_GET_IF_IN_POOL,
+ __FILE__, __LINE__,
+ &mtr, nullptr, false))
+ {
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+ recv_recover_page(block, mtr, p);
+ ut_ad(mtr.has_committed());
+ }
+ else
+ {
+ mtr.commit();
+ recv_read_in_area(page_id);
+ break;
+ }
+ map::iterator r= p++;
+ r->second.log.clear();
+ pages.erase(r);
+ continue;
+ }
+
+ goto next_page;
+ }
+
+ buf_pool.free_block(free_block);
+
+ /* Wait until all the pages have been processed */
+ while (!pages.empty() || buf_pool.n_pend_reads)
+ {
+ const bool abort= found_corrupt_log || found_corrupt_fs;
+
+ if (found_corrupt_fs && !srv_force_recovery)
+ ib::info() << "Set innodb_force_recovery=1 to ignore corrupted pages.";
+
+ mutex_exit(&mutex);
+
+ if (abort)
+ return;
+ os_thread_sleep(500000);
+ mutex_enter(&mutex);
+ }
+ }
+
+ if (last_batch)
+ /* We skipped this in buf_page_create(). */
+ mlog_init.mark_ibuf_exist(mtr);
+ else
+ {
+ mlog_init.reset();
+ mysql_mutex_unlock(&log_sys.mutex);
+ }
+
+ mysql_mutex_assert_not_owner(&log_sys.mutex);
+ mutex_exit(&mutex);
+
+ /* Instead of flushing, last_batch could sort the buf_pool.flush_list
+ in ascending order of buf_page_t::oldest_modification. */
+ buf_flush_sync();
+
+ if (!last_batch)
+ {
+ buf_pool_invalidate();
+ mysql_mutex_lock(&log_sys.mutex);
+ }
+#if 1 /* Mariabackup FIXME: Remove or adjust rename_table_in_prepare() */
+ else if (srv_operation != SRV_OPERATION_NORMAL);
+#endif
+ else
+ {
+ /* In the last batch, we will apply any rename operations. */
+ for (auto r : renamed_spaces)
+ {
+ const uint32_t id= r.first;
+ fil_space_t *space= fil_space_t::get(id);
+ if (!space)
+ continue;
+ ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
+ const char *old= space->chain.start->name;
+ if (r.second != old)
+ {
+ bool exists;
+ os_file_type_t ftype;
+ const char *new_name= r.second.c_str();
+ if (!os_file_status(new_name, &exists, &ftype) || exists)
+ {
+ ib::error() << "Cannot replay rename of tablespace " << id
+ << " from '" << old << "' to '" << r.second <<
+ (exists ? "' because the target file exists" : "'");
+ found_corrupt_fs= true;
+ }
+ else
+ {
+ size_t base= r.second.rfind(OS_PATH_SEPARATOR);
+ ut_ad(base != std::string::npos);
+ size_t start= r.second.rfind(OS_PATH_SEPARATOR, base - 1);
+ if (start == std::string::npos)
+ start= 0;
+ else
+ ++start;
+ /* Keep only databasename/tablename without .ibd suffix */
+ std::string space_name(r.second, start, r.second.size() - start - 4);
+ ut_ad(space_name[base - start] == OS_PATH_SEPARATOR);
+#if OS_PATH_SEPARATOR != '/'
+ space_name[base - start]= '/';
+#endif
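+ /* Example (illustrative): for r.second == "./db1/t1.ibd",
+ base points at the separator before "t1.ibd", start points just
+ past the separator before "db1", and space_name becomes "db1/t1". */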
+ mysql_mutex_lock(&log_sys.mutex);
+ if (dberr_t err= space->rename(space_name.c_str(), r.second.c_str(),
+ false))
+ {
+ ib::error() << "Cannot replay rename of tablespace " << id
+ << " to '" << r.second << "': " << err;
+ found_corrupt_fs= true;
+ }
+ mysql_mutex_unlock(&log_sys.mutex);
+ }
+ }
+ space->release();
+ }
+ renamed_spaces.clear();
+ }
+
+ mutex_enter(&mutex);
+
+ ut_d(after_apply= true);
+ clear();
+ mutex_exit(&mutex);
+}
+
+/** Check whether the number of read redo log blocks exceeds the maximum.
+Store last_stored_lsn if the recovery is not in the last phase.
+@param[in,out] store whether to store page operations
+@return whether the memory is exhausted */
+inline bool recv_sys_t::is_memory_exhausted(store_t *store)
+{
+ if (*store == STORE_NO ||
+ UT_LIST_GET_LEN(blocks) * 3 < buf_pool.get_n_pages())
+ return false;
+ if (*store == STORE_YES)
+ last_stored_lsn= recovered_lsn;
+ *store= STORE_NO;
+ DBUG_PRINT("ib_log",("Ran out of memory and last stored lsn " LSN_PF
+ " last stored offset " ULINTPF "\n",
+ recovered_lsn, recovered_offset));
+ return true;
+}
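+
+/* Example (illustrative): with a 1000-page buffer pool, the check
+above reports exhaustion once the list of stored log blocks reaches
+334 entries (334 * 3 >= 1000), records last_stored_lsn if records were
+being stored, and switches *store to STORE_NO so that a later batch
+re-reads the remaining log. */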
+
+/** Adds data from a new log block to the parsing buffer of recv_sys if
+recv_sys.parse_start_lsn is non-zero.
+@param[in] log_block log block to add
+@param[in] scanned_lsn lsn up to which data was found
+ in this log block
+@return true if more data added */
+bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn)
+{
+ ulint more_len;
+ ulint data_len;
+ ulint start_offset;
+ ulint end_offset;
+
+ ut_ad(scanned_lsn >= recv_sys.scanned_lsn);
+
+ if (!recv_sys.parse_start_lsn) {
+ /* Cannot start parsing yet, because no start point
+ has been found for it */
+ return(false);
+ }
+
+ data_len = log_block_get_data_len(log_block);
+
+ if (recv_sys.parse_start_lsn >= scanned_lsn) {
+
+ return(false);
+
+ } else if (recv_sys.scanned_lsn >= scanned_lsn) {
+
+ return(false);
+
+ } else if (recv_sys.parse_start_lsn > recv_sys.scanned_lsn) {
+ more_len = (ulint) (scanned_lsn - recv_sys.parse_start_lsn);
+ } else {
+ more_len = (ulint) (scanned_lsn - recv_sys.scanned_lsn);
+ }
+
+ if (more_len == 0) {
+ return(false);
+ }
+
+ ut_ad(data_len >= more_len);
+
+ start_offset = data_len - more_len;
+
+ if (start_offset < LOG_BLOCK_HDR_SIZE) {
+ start_offset = LOG_BLOCK_HDR_SIZE;
+ }
+
+ end_offset = std::min<ulint>(data_len, log_sys.trailer_offset());
+
+ ut_ad(start_offset <= end_offset);
+
+ if (start_offset < end_offset) {
+ memcpy(recv_sys.buf + recv_sys.len,
+ log_block + start_offset, end_offset - start_offset);
+
+ recv_sys.len += end_offset - start_offset;
+
+ ut_a(recv_sys.len <= RECV_PARSING_BUF_SIZE);
+ }
+
+ return(true);
+}
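+
+/* Worked example (illustrative, assuming parse_start_lsn is already
+set and <= recv_sys.scanned_lsn): if recv_sys.scanned_lsn == 1000,
+scanned_lsn == 1512 and data_len == 512, then more_len == 512 and
+start_offset == 0, which is rounded up to LOG_BLOCK_HDR_SIZE so that
+the log block header is never copied into the parsing buffer. */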
+
+/** Moves the parsing buffer data left to the buffer start. */
+void recv_sys_justify_left_parsing_buf()
+{
+ memmove(recv_sys.buf, recv_sys.buf + recv_sys.recovered_offset,
+ recv_sys.len - recv_sys.recovered_offset);
+
+ recv_sys.len -= recv_sys.recovered_offset;
+
+ recv_sys.recovered_offset = 0;
+}
+
+/** Scans redo log from a buffer and stores new log data to the parsing buffer.
+Parses and hashes the log records if new data is found.
+Applies log records automatically when the hash table becomes full.
+@param[in,out] store whether the records should be
+ stored into recv_sys.pages; this is
+ reset if just debug checking is
+ needed, or when the num_max_blocks in
+ recv_sys runs out
+@param[in] log_block log segment
+@param[in] checkpoint_lsn latest checkpoint LSN
+@param[in] start_lsn buffer start LSN
+@param[in] end_lsn buffer end LSN
+@param[in,out] contiguous_lsn it is known that all groups contain
+ contiguous log data up to this lsn
+@param[out] group_scanned_lsn scanning succeeded up to this lsn
+@return true if not able to scan any more in this log group */
+static bool recv_scan_log_recs(
+ store_t* store,
+ const byte* log_block,
+ lsn_t checkpoint_lsn,
+ lsn_t start_lsn,
+ lsn_t end_lsn,
+ lsn_t* contiguous_lsn,
+ lsn_t* group_scanned_lsn)
+{
+ lsn_t scanned_lsn = start_lsn;
+ bool finished = false;
+ ulint data_len;
+ bool more_data = false;
+ bool apply = recv_sys.mlog_checkpoint_lsn != 0;
+ ulint recv_parsing_buf_size = RECV_PARSING_BUF_SIZE;
+ const bool last_phase = (*store == STORE_IF_EXISTS);
+ ut_ad(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(end_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(end_lsn >= start_lsn + OS_FILE_LOG_BLOCK_SIZE);
+ ut_ad(log_sys.is_physical());
+
+ const byte* const log_end = log_block
+ + ulint(end_lsn - start_lsn);
+ constexpr ulint sizeof_checkpoint= SIZE_OF_FILE_CHECKPOINT;
+
+ do {
+ ut_ad(!finished);
+
+ if (log_block_get_flush_bit(log_block)) {
+ /* This block was a start of a log flush operation:
+ we know that the previous flush operation must have
+ been completed for all log groups before this block
+ can have been flushed to any of the groups. Therefore,
+ we know that log data is contiguous up to scanned_lsn
+ in all non-corrupt log groups. */
+
+ if (scanned_lsn > *contiguous_lsn) {
+ *contiguous_lsn = scanned_lsn;
+ }
+ }
+
+ data_len = log_block_get_data_len(log_block);
+
+ if (scanned_lsn + data_len > recv_sys.scanned_lsn
+ && log_block_get_checkpoint_no(log_block)
+ < recv_sys.scanned_checkpoint_no
+ && (recv_sys.scanned_checkpoint_no
+ - log_block_get_checkpoint_no(log_block)
+ > 0x80000000UL)) {
+
+ /* Garbage from a log buffer flush which was made
+ before the most recent database recovery */
+ finished = true;
+ break;
+ }
+
+ if (!recv_sys.parse_start_lsn
+ && (log_block_get_first_rec_group(log_block) > 0)) {
+
+ /* We found a point from which to start the parsing
+ of log records */
+
+ recv_sys.parse_start_lsn = scanned_lsn
+ + log_block_get_first_rec_group(log_block);
+ recv_sys.scanned_lsn = recv_sys.parse_start_lsn;
+ recv_sys.recovered_lsn = recv_sys.parse_start_lsn;
+ }
+
+ scanned_lsn += data_len;
+
+ if (data_len == LOG_BLOCK_HDR_SIZE + sizeof_checkpoint
+ && scanned_lsn == checkpoint_lsn + sizeof_checkpoint
+ && log_block[LOG_BLOCK_HDR_SIZE]
+ == (FILE_CHECKPOINT | (SIZE_OF_FILE_CHECKPOINT - 2))
+ && checkpoint_lsn == mach_read_from_8(
+ (LOG_BLOCK_HDR_SIZE + 1 + 2)
+ + log_block)) {
+ /* The redo log is logically empty. */
+ ut_ad(recv_sys.mlog_checkpoint_lsn == 0
+ || recv_sys.mlog_checkpoint_lsn
+ == checkpoint_lsn);
+ recv_sys.mlog_checkpoint_lsn = checkpoint_lsn;
+ DBUG_PRINT("ib_log", ("found empty log; LSN=" LSN_PF,
+ scanned_lsn));
+ finished = true;
+ break;
+ }
+
+ if (scanned_lsn > recv_sys.scanned_lsn) {
+ ut_ad(!srv_log_file_created);
+ if (!recv_needed_recovery) {
+ recv_needed_recovery = true;
+
+ if (srv_read_only_mode) {
+ ib::warn() << "innodb_read_only"
+ " prevents crash recovery";
+ return(true);
+ }
+
+ ib::info() << "Starting crash recovery from"
+ " checkpoint LSN=" << checkpoint_lsn
+ << "," << recv_sys.scanned_lsn;
+ }
+
+ /* We were able to find more log data: add it to the
+ parsing buffer if parse_start_lsn is already
+ non-zero */
+
+ DBUG_EXECUTE_IF(
+ "reduce_recv_parsing_buf",
+ recv_parsing_buf_size = RECV_SCAN_SIZE * 2;
+ );
+
+ if (recv_sys.len + 4 * OS_FILE_LOG_BLOCK_SIZE
+ >= recv_parsing_buf_size) {
+ ib::error() << "Log parsing buffer overflow."
+ " Recovery may have failed!";
+
+ recv_sys.found_corrupt_log = true;
+
+ if (!srv_force_recovery) {
+ ib::error()
+ << "Set innodb_force_recovery"
+ " to ignore this error.";
+ return(true);
+ }
+ } else if (!recv_sys.found_corrupt_log) {
+ more_data = recv_sys_add_to_parsing_buf(
+ log_block, scanned_lsn);
+ }
+
+ recv_sys.scanned_lsn = scanned_lsn;
+ recv_sys.scanned_checkpoint_no
+ = log_block_get_checkpoint_no(log_block);
+ }
+
+ /* During the last phase of scanning, there can be redo log
+ records left in recv_sys.buf that still need to be parsed
+ and stored in recv_sys.heap */
+ if (last_phase
+ && recv_sys.recovered_lsn < recv_sys.scanned_lsn) {
+ more_data = true;
+ }
+
+ if (data_len < OS_FILE_LOG_BLOCK_SIZE) {
+ /* Log data for this group ends here */
+ finished = true;
+ break;
+ } else {
+ log_block += OS_FILE_LOG_BLOCK_SIZE;
+ }
+ } while (log_block < log_end);
+
+ *group_scanned_lsn = scanned_lsn;
+
+ mutex_enter(&recv_sys.mutex);
+
+ if (more_data && !recv_sys.found_corrupt_log) {
+ /* Try to parse more log records */
+ if (recv_sys.parse(checkpoint_lsn, store, apply)) {
+ ut_ad(recv_sys.found_corrupt_log
+ || recv_sys.found_corrupt_fs
+ || recv_sys.mlog_checkpoint_lsn
+ == recv_sys.recovered_lsn);
+ finished = true;
+ goto func_exit;
+ }
+
+ recv_sys.is_memory_exhausted(store);
+
+ if (recv_sys.recovered_offset > recv_parsing_buf_size / 4
+ || (recv_sys.recovered_offset
+ && recv_sys.len
+ >= recv_parsing_buf_size - RECV_SCAN_SIZE)) {
+ /* Move parsing buffer data to the buffer start */
+ recv_sys_justify_left_parsing_buf();
+ }
+
+ /* Need to re-parse the redo log records that are
+ stored in recv_sys.buf */
+ if (last_phase && *store == STORE_NO) {
+ finished = false;
+ }
+ }
+
+func_exit:
+ mutex_exit(&recv_sys.mutex);
+ return(finished);
+}
+
+/** Scans log from a buffer and stores new log data to the parsing buffer.
+Parses and hashes the log records if new data is found.
+@param[in] checkpoint_lsn latest checkpoint log sequence number
+@param[in,out] contiguous_lsn log sequence number
+until which all redo log has been scanned
+@param[in] last_phase whether changes
+can be applied to the tablespaces
+@return whether rescan is needed (not everything was stored) */
+static
+bool
+recv_group_scan_log_recs(
+ lsn_t checkpoint_lsn,
+ lsn_t* contiguous_lsn,
+ bool last_phase)
+{
+ DBUG_ENTER("recv_group_scan_log_recs");
+ DBUG_ASSERT(!last_phase || recv_sys.mlog_checkpoint_lsn > 0);
+
+ mutex_enter(&recv_sys.mutex);
+ recv_sys.len = 0;
+ recv_sys.recovered_offset = 0;
+ recv_sys.clear();
+ recv_sys.parse_start_lsn = *contiguous_lsn;
+ recv_sys.scanned_lsn = *contiguous_lsn;
+ recv_sys.recovered_lsn = *contiguous_lsn;
+ recv_sys.scanned_checkpoint_no = 0;
+ ut_ad(recv_max_page_lsn == 0);
+ mutex_exit(&recv_sys.mutex);
+
+ lsn_t start_lsn;
+ lsn_t end_lsn;
+ store_t store = recv_sys.mlog_checkpoint_lsn == 0
+ ? STORE_NO : (last_phase ? STORE_IF_EXISTS : STORE_YES);
+
+ log_sys.log.scanned_lsn = end_lsn = *contiguous_lsn =
+ ut_uint64_align_down(*contiguous_lsn, OS_FILE_LOG_BLOCK_SIZE);
+ ut_d(recv_sys.after_apply = last_phase);
+
+ do {
+ if (last_phase && store == STORE_NO) {
+ store = STORE_IF_EXISTS;
+ recv_sys.apply(false);
+ /* Rescan the redo logs from last stored lsn */
+ end_lsn = recv_sys.recovered_lsn;
+ }
+
+ start_lsn = ut_uint64_align_down(end_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+ end_lsn = start_lsn;
+ log_sys.log.read_log_seg(&end_lsn, start_lsn + RECV_SCAN_SIZE);
+ } while (end_lsn != start_lsn
+ && !recv_scan_log_recs(&store, log_sys.buf, checkpoint_lsn,
+ start_lsn, end_lsn, contiguous_lsn,
+ &log_sys.log.scanned_lsn));
+
+ if (recv_sys.found_corrupt_log || recv_sys.found_corrupt_fs) {
+ DBUG_RETURN(false);
+ }
+
+ DBUG_PRINT("ib_log", ("%s " LSN_PF " completed",
+ last_phase ? "rescan" : "scan",
+ log_sys.log.scanned_lsn));
+
+ DBUG_RETURN(store == STORE_NO);
+}
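+
+/* Illustrative note (a sketch of the flow above, not normative):
+the first scan runs with store == STORE_NO until FILE_CHECKPOINT has
+been found, the second scan stores records with STORE_YES, and the
+optional last-phase rescan uses STORE_IF_EXISTS, so that the stored
+records are applied in batches whenever memory runs out. */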
+
+/** Report a missing tablespace for which page-redo log exists.
+@param[in] err previous error code
+@param[in] i tablespace descriptor
+@return new error code */
+static
+dberr_t
+recv_init_missing_space(dberr_t err, const recv_spaces_t::const_iterator& i)
+{
+ if (srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_EXPORT) {
+ if (i->second.name.find(TEMP_TABLE_PATH_PREFIX)
+ != std::string::npos) {
+ ib::warn() << "Tablespace " << i->first << " was not"
+ " found at " << i->second.name << " when"
+ " restoring a (partial?) backup. All redo log"
+ " for this file will be ignored!";
+ }
+ return(err);
+ }
+
+ if (srv_force_recovery == 0) {
+ ib::error() << "Tablespace " << i->first << " was not"
+ " found at " << i->second.name << ".";
+
+ if (err == DB_SUCCESS) {
+ ib::error() << "Set innodb_force_recovery=1 to"
+ " ignore this and to permanently lose"
+ " all changes to the tablespace.";
+ err = DB_TABLESPACE_NOT_FOUND;
+ }
+ } else {
+ ib::warn() << "Tablespace " << i->first << " was not"
+ " found at " << i->second.name << ", and"
+ " innodb_force_recovery was set. All redo log"
+ " for this tablespace will be ignored!";
+ }
+
+ return(err);
+}
+
+/** Report the missing tablespace and discard the redo logs for the deleted
+tablespace.
+@param[in] rescan whether a rescan of the redo log is needed,
+ because the hash table ran out of memory
+@param[out] missing_tablespace set to true if a missing tablespace exists
+@return error code or DB_SUCCESS. */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+recv_validate_tablespace(bool rescan, bool& missing_tablespace)
+{
+ dberr_t err = DB_SUCCESS;
+
+ mutex_enter(&recv_sys.mutex);
+
+ for (recv_sys_t::map::iterator p = recv_sys.pages.begin();
+ p != recv_sys.pages.end();) {
+ ut_ad(!p->second.log.empty());
+ const ulint space = p->first.space();
+ if (is_predefined_tablespace(space)) {
+next:
+ p++;
+ continue;
+ }
+
+ recv_spaces_t::iterator i = recv_spaces.find(space);
+ ut_ad(i != recv_spaces.end());
+
+ switch (i->second.status) {
+ case file_name_t::NORMAL:
+ goto next;
+ case file_name_t::MISSING:
+ err = recv_init_missing_space(err, i);
+ i->second.status = file_name_t::DELETED;
+ /* fall through */
+ case file_name_t::DELETED:
+ recv_sys_t::map::iterator r = p++;
+ r->second.log.clear();
+ recv_sys.pages.erase(r);
+ continue;
+ }
+ ut_ad(0);
+ }
+
+ if (err != DB_SUCCESS) {
+func_exit:
+ mutex_exit(&recv_sys.mutex);
+ return(err);
+ }
+
+ /* When rescan is not needed, recv_sys.pages will contain the
+ entire redo log. If rescan is needed or innodb_force_recovery
+ is set, we can ignore missing tablespaces. */
+ for (const recv_spaces_t::value_type& rs : recv_spaces) {
+ if (UNIV_LIKELY(rs.second.status != file_name_t::MISSING)) {
+ continue;
+ }
+
+ missing_tablespace = true;
+
+ if (srv_force_recovery > 0) {
+ ib::warn() << "Tablespace " << rs.first
+ <<" was not found at " << rs.second.name
+ <<", and innodb_force_recovery was set."
+ <<" All redo log for this tablespace"
+ <<" will be ignored!";
+ continue;
+ }
+
+ if (!rescan) {
+ ib::info() << "Tablespace " << rs.first
+ << " was not found at '"
+ << rs.second.name << "', but there"
+ <<" were no modifications either.";
+ }
+ }
+
+ if (!rescan || srv_force_recovery > 0) {
+ missing_tablespace = false;
+ }
+
+ err = DB_SUCCESS;
+ goto func_exit;
+}
+
+/** Check if all tablespaces were found for crash recovery.
+@param[in] rescan whether a rescan of the redo log is needed
+@param[out] missing_tablespace set to true if a missing tablespace exists
+@return error code or DB_SUCCESS */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace)
+{
+ bool flag_deleted = false;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(recv_needed_recovery);
+
+ for (recv_spaces_t::value_type& rs : recv_spaces) {
+ ut_ad(!is_predefined_tablespace(rs.first));
+ ut_ad(rs.second.status != file_name_t::DELETED
+ || !rs.second.space);
+
+ if (rs.second.status == file_name_t::DELETED) {
+ /* The tablespace was deleted,
+ so we can ignore any redo log for it. */
+ flag_deleted = true;
+ } else if (rs.second.space != NULL) {
+ /* The tablespace was found, and there
+ are some redo log records for it. */
+ fil_names_dirty(rs.second.space);
+
+ /* Add the freed page ranges in the respective
+ tablespace */
+ if (!rs.second.freed_ranges.empty()
+ && (srv_immediate_scrub_data_uncompressed
+ || rs.second.space->is_compressed())) {
+
+ rs.second.space->add_free_ranges(
+ std::move(rs.second.freed_ranges));
+ }
+ } else if (rs.second.name == "") {
+ ib::error() << "Missing FILE_CREATE, FILE_DELETE"
+ " or FILE_MODIFY before FILE_CHECKPOINT"
+ " for tablespace " << rs.first;
+ recv_sys.found_corrupt_log = true;
+ return(DB_CORRUPTION);
+ } else {
+ rs.second.status = file_name_t::MISSING;
+ flag_deleted = true;
+ }
+
+ ut_ad(rs.second.status == file_name_t::DELETED
+ || rs.second.name != "");
+ }
+
+ if (flag_deleted) {
+ return recv_validate_tablespace(rescan, missing_tablespace);
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Start recovering from a redo log checkpoint.
+@param[in] flush_lsn FIL_PAGE_FILE_FLUSH_LSN
+of first system tablespace page
+@return error code or DB_SUCCESS */
+dberr_t
+recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
+{
+ ulint max_cp_field;
+ lsn_t checkpoint_lsn;
+ bool rescan = false;
+ ib_uint64_t checkpoint_no;
+ lsn_t contiguous_lsn;
+ byte* buf;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(srv_operation == SRV_OPERATION_NORMAL
+ || srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+ ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex));
+ ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0);
+ ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0);
+ ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex));
+
+ if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) {
+
+ ib::info() << "innodb_force_recovery=6 skips redo log apply";
+
+ return(DB_SUCCESS);
+ }
+
+ recv_sys.recovery_on = true;
+
+ mysql_mutex_lock(&log_sys.mutex);
+
+ err = recv_find_max_checkpoint(&max_cp_field);
+
+ if (err != DB_SUCCESS) {
+
+ recv_sys.recovered_lsn = log_sys.get_lsn();
+ mysql_mutex_unlock(&log_sys.mutex);
+ return(err);
+ }
+
+ buf = log_sys.checkpoint_buf;
+ log_sys.log.read(max_cp_field, {buf, OS_FILE_LOG_BLOCK_SIZE});
+
+ checkpoint_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_LSN);
+ checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO);
+
+ /* Start reading the log from the checkpoint lsn. The variable
+ contiguous_lsn contains an lsn up to which the log is known to
+ be contiguously written. */
+ recv_sys.mlog_checkpoint_lsn = 0;
+
+ ut_ad(RECV_SCAN_SIZE <= srv_log_buffer_size);
+
+ const lsn_t end_lsn = mach_read_from_8(
+ buf + LOG_CHECKPOINT_END_LSN);
+
+ ut_ad(recv_sys.pages.empty());
+ contiguous_lsn = checkpoint_lsn;
+ switch (log_sys.log.format) {
+ case 0:
+ mysql_mutex_unlock(&log_sys.mutex);
+ return DB_SUCCESS;
+ default:
+ if (end_lsn == 0) {
+ break;
+ }
+ if (end_lsn >= checkpoint_lsn) {
+ contiguous_lsn = end_lsn;
+ break;
+ }
+ recv_sys.found_corrupt_log = true;
+ mysql_mutex_unlock(&log_sys.mutex);
+ return(DB_ERROR);
+ }
+
+ size_t sizeof_checkpoint;
+
+ if (!log_sys.is_physical()) {
+ sizeof_checkpoint = 9/* size of MLOG_CHECKPOINT */;
+ goto completed;
+ }
+
+ /* Look for FILE_CHECKPOINT. */
+ recv_group_scan_log_recs(checkpoint_lsn, &contiguous_lsn, false);
+ /* The first scan should not have stored or applied any records. */
+ ut_ad(recv_sys.pages.empty());
+ ut_ad(!recv_sys.found_corrupt_fs);
+
+ if (srv_read_only_mode && recv_needed_recovery) {
+ mysql_mutex_unlock(&log_sys.mutex);
+ return(DB_READ_ONLY);
+ }
+
+ if (recv_sys.found_corrupt_log && !srv_force_recovery) {
+ mysql_mutex_unlock(&log_sys.mutex);
+ ib::warn() << "Log scan aborted at LSN " << contiguous_lsn;
+ return(DB_ERROR);
+ }
+
+ if (recv_sys.mlog_checkpoint_lsn == 0) {
+ lsn_t scan_lsn = log_sys.log.scanned_lsn;
+ if (!srv_read_only_mode && scan_lsn != checkpoint_lsn) {
+ mysql_mutex_unlock(&log_sys.mutex);
+ ib::error err;
+ err << "Missing FILE_CHECKPOINT";
+ if (end_lsn) {
+ err << " at " << end_lsn;
+ }
+ err << " between the checkpoint " << checkpoint_lsn
+ << " and the end " << scan_lsn << ".";
+ return(DB_ERROR);
+ }
+
+ log_sys.log.scanned_lsn = checkpoint_lsn;
+ } else {
+ contiguous_lsn = checkpoint_lsn;
+ rescan = recv_group_scan_log_recs(
+ checkpoint_lsn, &contiguous_lsn, false);
+
+ if ((recv_sys.found_corrupt_log && !srv_force_recovery)
+ || recv_sys.found_corrupt_fs) {
+ mysql_mutex_unlock(&log_sys.mutex);
+ return(DB_ERROR);
+ }
+ }
+
+ /* NOTE: we always do a 'recovery' at startup, but only if
+ there is something wrong will we print a message to the
+ user about recovery: */
+ sizeof_checkpoint= SIZE_OF_FILE_CHECKPOINT;
+
+completed:
+ if (flush_lsn == checkpoint_lsn + sizeof_checkpoint
+ && recv_sys.mlog_checkpoint_lsn == checkpoint_lsn) {
+ /* The redo log is logically empty. */
+ } else if (checkpoint_lsn != flush_lsn) {
+ ut_ad(!srv_log_file_created);
+
+ if (checkpoint_lsn + sizeof_checkpoint < flush_lsn) {
+ ib::warn()
+ << "Are you sure you are using the right "
+ << LOG_FILE_NAME
+ << " to start up the database? Log sequence "
+ "number in the "
+ << LOG_FILE_NAME << " is " << checkpoint_lsn
+ << ", less than the log sequence number in "
+ "the first system tablespace file header, "
+ << flush_lsn << ".";
+ }
+
+ if (!recv_needed_recovery) {
+
+ ib::info()
+ << "The log sequence number " << flush_lsn
+ << " in the system tablespace does not match"
+ " the log sequence number "
+ << checkpoint_lsn << " in the "
+ << LOG_FILE_NAME << "!";
+
+ if (srv_read_only_mode) {
+ ib::error() << "innodb_read_only"
+ " prevents crash recovery";
+ mysql_mutex_unlock(&log_sys.mutex);
+ return(DB_READ_ONLY);
+ }
+
+ recv_needed_recovery = true;
+ }
+ }
+
+ log_sys.set_lsn(recv_sys.recovered_lsn);
+ if (UNIV_LIKELY(log_sys.get_flushed_lsn() < recv_sys.recovered_lsn)) {
+ /* This may already have been set by create_log_file()
+ if no logs existed when the server started up. */
+ log_sys.set_flushed_lsn(recv_sys.recovered_lsn);
+ }
+
+ if (recv_needed_recovery) {
+ bool missing_tablespace = false;
+
+ err = recv_init_crash_recovery_spaces(
+ rescan, missing_tablespace);
+
+ if (err != DB_SUCCESS) {
+ mysql_mutex_unlock(&log_sys.mutex);
+ return(err);
+ }
+
+ /* If any tablespace is missing and a rescan is needed,
+ it is possible that the hash table does not contain the
+ redo log records for all space ids. Rescan the remaining
+ unstored redo log to validate the missing tablespaces. */
+ ut_ad(rescan || !missing_tablespace);
+
+ while (missing_tablespace) {
+ DBUG_PRINT("ib_log", ("Rescan of redo log to validate "
+ "the missing tablespace. Scan "
+ "from last stored LSN " LSN_PF,
+ recv_sys.last_stored_lsn));
+
+ lsn_t recent_stored_lsn = recv_sys.last_stored_lsn;
+ rescan = recv_group_scan_log_recs(
+ checkpoint_lsn, &recent_stored_lsn, false);
+
+ ut_ad(!recv_sys.found_corrupt_fs);
+
+ missing_tablespace = false;
+
+ err = recv_sys.found_corrupt_log
+ ? DB_ERROR
+ : recv_validate_tablespace(
+ rescan, missing_tablespace);
+
+ if (err != DB_SUCCESS) {
+ mysql_mutex_unlock(&log_sys.mutex);
+ return err;
+ }
+
+ rescan = true;
+ }
+
+ recv_sys.parse_start_lsn = checkpoint_lsn;
+
+ if (srv_operation == SRV_OPERATION_NORMAL) {
+ buf_dblwr.recover();
+ }
+
+ ut_ad(srv_force_recovery <= SRV_FORCE_NO_UNDO_LOG_SCAN);
+
+ if (rescan) {
+ contiguous_lsn = checkpoint_lsn;
+
+ recv_group_scan_log_recs(
+ checkpoint_lsn, &contiguous_lsn, true);
+
+ if ((recv_sys.found_corrupt_log
+ && !srv_force_recovery)
+ || recv_sys.found_corrupt_fs) {
+ mysql_mutex_unlock(&log_sys.mutex);
+ return(DB_ERROR);
+ }
+ }
+ } else {
+ ut_ad(!rescan || recv_sys.pages.empty());
+ }
+
+ if (log_sys.is_physical()
+ && (log_sys.log.scanned_lsn < checkpoint_lsn
+ || log_sys.log.scanned_lsn < recv_max_page_lsn)) {
+
+ ib::error() << "We scanned the log up to "
+ << log_sys.log.scanned_lsn
+ << ". A checkpoint was at " << checkpoint_lsn << " and"
+ " the maximum LSN on a database page was "
+ << recv_max_page_lsn << ". It is possible that the"
+ " database is now corrupt!";
+ }
+
+ if (recv_sys.recovered_lsn < checkpoint_lsn) {
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ ib::error() << "Recovered only to lsn:"
+ << recv_sys.recovered_lsn
+ << " checkpoint_lsn: " << checkpoint_lsn;
+
+ return(DB_ERROR);
+ }
+
+ log_sys.next_checkpoint_lsn = checkpoint_lsn;
+ log_sys.next_checkpoint_no = checkpoint_no + 1;
+
+ recv_synchronize_groups();
+
+ ut_ad(recv_needed_recovery
+ || checkpoint_lsn == recv_sys.recovered_lsn);
+
+ log_sys.write_lsn = log_sys.get_lsn();
+ log_sys.buf_free = log_sys.write_lsn % OS_FILE_LOG_BLOCK_SIZE;
+ log_sys.buf_next_to_write = log_sys.buf_free;
+
+ log_sys.last_checkpoint_lsn = checkpoint_lsn;
+
+ if (!srv_read_only_mode && srv_operation == SRV_OPERATION_NORMAL) {
+ /* Write a FILE_CHECKPOINT marker as the first thing,
+ before generating any other redo log. This ensures
+ that subsequent crash recovery will be possible even
+ if the server were killed soon after this. */
+ fil_names_clear(log_sys.last_checkpoint_lsn, true);
+ }
+
+ log_sys.next_checkpoint_no = ++checkpoint_no;
+
+ mutex_enter(&recv_sys.mutex);
+
+ recv_sys.apply_log_recs = true;
+ recv_no_ibuf_operations = false;
+ ut_d(recv_no_log_write = srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+
+ mutex_exit(&recv_sys.mutex);
+
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ recv_lsn_checks_on = true;
+
+ /* The database is now ready to start almost normal processing of user
+ transactions: transaction rollbacks and the application of the log
+ records in the hash table can be run in the background. */
+
+ return(DB_SUCCESS);
+}
+
+bool recv_dblwr_t::validate_page(const page_id_t page_id,
+ const byte *page,
+ const fil_space_t *space,
+ byte *tmp_buf)
+{
+ if (page_id.page_no() == 0)
+ {
+ ulint flags= fsp_header_get_flags(page);
+ if (!fil_space_t::is_valid_flags(flags, page_id.space()))
+ {
+ ulint cflags= fsp_flags_convert_from_101(flags);
+ if (cflags == ULINT_UNDEFINED)
+ {
+ ib::warn() << "Ignoring a doublewrite copy of page " << page_id
+ << "due to invalid flags " << ib::hex(flags);
+ return false;
+ }
+
+ flags= cflags;
+ }
+
+ /* Page 0 is never page_compressed or encrypted. */
+ return !buf_page_is_corrupted(true, page, flags);
+ }
+
+ ut_ad(tmp_buf);
+ byte *tmp_frame= tmp_buf;
+ byte *tmp_page= tmp_buf + srv_page_size;
+ const uint16_t page_type= mach_read_from_2(page + FIL_PAGE_TYPE);
+ const bool expect_encrypted= space->crypt_data &&
+ space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;
+
+ if (space->full_crc32())
+ return !buf_page_is_corrupted(true, page, space->flags);
+
+ if (expect_encrypted &&
+ mach_read_from_4(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION))
+ {
+ if (!fil_space_verify_crypt_checksum(page, space->zip_size()))
+ return false;
+ if (page_type != FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED)
+ return true;
+ if (space->zip_size())
+ return false;
+ memcpy(tmp_page, page, space->physical_size());
+ if (!fil_space_decrypt(space, tmp_frame, tmp_page))
+ return false;
+ }
+
+ switch (page_type) {
+ case FIL_PAGE_PAGE_COMPRESSED:
+ memcpy(tmp_page, page, space->physical_size());
+ /* fall through */
+ case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED:
+ if (space->zip_size())
+ return false; /* ROW_FORMAT=COMPRESSED cannot be page_compressed */
+ ulint decomp= fil_page_decompress(tmp_frame, tmp_page, space->flags);
+ if (!decomp)
+ return false; /* decompression failed */
+ if (decomp == srv_page_size)
+ return false; /* the page was not compressed (invalid page type) */
+ return !buf_page_is_corrupted(true, tmp_page, space->flags);
+ }
+
+ return !buf_page_is_corrupted(true, page, space->flags);
+}
+
+byte *recv_dblwr_t::find_page(const page_id_t page_id,
+ const fil_space_t *space, byte *tmp_buf)
+{
+ byte *result= NULL;
+ lsn_t max_lsn= 0;
+
+ for (byte *page : pages)
+ {
+ if (page_get_page_no(page) != page_id.page_no() ||
+ page_get_space_id(page) != page_id.space())
+ continue;
+ const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN);
+ if (lsn <= max_lsn ||
+ !validate_page(page_id, page, space, tmp_buf))
+ {
+ /* Mark processed for subsequent iterations in buf_dblwr_t::recover() */
+ memset(page + FIL_PAGE_LSN, 0, 8);
+ continue;
+ }
+ max_lsn= lsn;
+ result= page;
+ }
+
+ return result;
+}
diff --git a/storage/innobase/log/log0sync.cc b/storage/innobase/log/log0sync.cc
new file mode 100644
index 00000000..2a6e1b8b
--- /dev/null
+++ b/storage/innobase/log/log0sync.cc
@@ -0,0 +1,309 @@
+/*****************************************************************************
+Copyright (c) 2020 MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*
+The group commit synchronization used in log_write_up_to()
+works as follows
+
+For simplicity, let's consider only the write operation; synchronization
+of the flush operation works the same way.
+
+Rules of the game
+
+A thread enters log_write_up_to() with the lsn of the current transaction
+1. If the last written lsn is greater than the wait lsn (another thread
+ already wrote the log buffer), then there is no need to do anything.
+2. If no other thread is currently writing, write the log buffer,
+ and update the last written lsn.
+3. Otherwise, wait, and go to step 1.
+
+Synchronization can be done in different ways, e.g.
+
+a) A simple mutex locking the entire check and write operation.
+The disadvantage is that threads that could continue after the last
+written lsn has been updated still have to wait.
+
+b) Spinlock, with periodic checks for the last written lsn.
+Fixes a) but burns CPU unnecessarily.
+
+c) Mutex / condition variable combo.
+
+The condition variable notifies (broadcasts) all waiters whenever
+the last written lsn is changed.
+
+This has the disadvantage of many spurious wakeups, stress on the
+OS scheduler, and mutex contention.
+
+d) Something else.
+Make use of the waiter's lsn parameter, and only wake up the "right"
+waiting threads.
+
+We chose d). Even if the implementation is more complicated than the
+alternatives, due to the need to maintain a list of waiters, it provides
+the best performance.
+
+See the group_commit_lock implementation for details.
+
+Note that if the write operation is very fast, a) or b) can be fine
+alternatives.
+*/
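+
+/*
+A minimal sketch of approach d) (illustrative pseudo-code only; the
+helper names used here are hypothetical and not part of this file):
+
+  for (;;)
+  {
+    if (last_written_lsn >= my_lsn)
+      break;                          // rule 1: already written
+    if (try_become_writer())          // rule 2: become the leader
+    {
+      lsn_t written= write_buffer_up_to(my_lsn);
+      publish_lsn_and_wake_waiters(written);
+      break;
+    }
+    wait_until_woken_for(my_lsn);     // rule 3: sleep, then re-check
+  }
+*/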
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+#ifdef __linux__
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#endif
+
+#include <atomic>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <my_cpu.h>
+
+#include <log0types.h>
+#include "log0sync.h"
+#include <mysql/service_thd_wait.h>
+/**
+ Helper class, used in the group commit lock.
+
+ A binary semaphore or, equivalently, an auto-reset event.
+ It has a state (signalled or not) and provides two operations:
+ wait() and wake().
+
+ The implementation uses efficient locking primitives on Linux and
+ Windows, or a mutex/condition variable combination elsewhere.
+*/
+
+class binary_semaphore
+{
+public:
+ /** Wait until the semaphore becomes signalled, and atomically reset
+ the state to non-signalled. */
+ void wait();
+ /** signals the semaphore */
+ void wake();
+
+private:
+#if defined(__linux__) || defined (_WIN32)
+ std::atomic<int> m_signalled;
+ static constexpr std::memory_order mem_order= std::memory_order_acq_rel;
+public:
+ binary_semaphore() :m_signalled(0) {}
+#else
+ std::mutex m_mtx{};
+ std::condition_variable m_cv{};
+ bool m_signalled = false;
+#endif
+};
+
+#if defined (__linux__) || defined (_WIN32)
+void binary_semaphore::wait()
+{
+ for (;;)
+ {
+ if (m_signalled.exchange(0, mem_order) == 1)
+ {
+ break;
+ }
+#ifdef _WIN32
+ int zero = 0;
+ WaitOnAddress(&m_signalled, &zero, sizeof(m_signalled), INFINITE);
+#else
+ syscall(SYS_futex, &m_signalled, FUTEX_WAIT_PRIVATE, 0, NULL, NULL, 0);
+#endif
+ }
+}
+
+void binary_semaphore::wake()
+{
+ if (m_signalled.exchange(1, mem_order) == 0)
+ {
+#ifdef _WIN32
+ WakeByAddressSingle(&m_signalled);
+#else
+ syscall(SYS_futex, &m_signalled, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
+#endif
+ }
+}
+#else
+void binary_semaphore::wait()
+{
+ std::unique_lock<std::mutex> lk(m_mtx);
+ while (!m_signalled)
+ m_cv.wait(lk);
+ m_signalled = false;
+}
+void binary_semaphore::wake()
+{
+ std::unique_lock<std::mutex> lk(m_mtx);
+ m_signalled = true;
+ m_cv.notify_one();
+}
+#endif
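+
+/* Usage sketch for binary_semaphore (illustrative only): one pending
+wake is remembered, so a wake() that happens before the corresponding
+wait() is not lost, but multiple wake() calls coalesce into one.
+
+  binary_semaphore sema;
+  // thread A:          // thread B:
+  sema.wait();          sema.wake();
+*/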
+
+/* A thread helper structure, used in the group commit lock below. */
+struct group_commit_waiter_t
+{
+ lsn_t m_value;
+ binary_semaphore m_sema;
+ group_commit_waiter_t* m_next;
+ group_commit_waiter_t() :m_value(), m_sema(), m_next() {}
+};
+
+group_commit_lock::group_commit_lock() :
+ m_mtx(), m_value(0), m_pending_value(0), m_lock(false), m_waiters_list()
+{
+}
+
+group_commit_lock::value_type group_commit_lock::value() const
+{
+ return m_value.load(std::memory_order::memory_order_relaxed);
+}
+
+group_commit_lock::value_type group_commit_lock::pending() const
+{
+ return m_pending_value.load(std::memory_order::memory_order_relaxed);
+}
+
+void group_commit_lock::set_pending(group_commit_lock::value_type num)
+{
+ ut_a(num >= value());
+ m_pending_value.store(num, std::memory_order::memory_order_relaxed);
+}
+
+const unsigned int MAX_SPINS = 1; /* max spins in acquire */
+thread_local group_commit_waiter_t thread_local_waiter;
+
+group_commit_lock::lock_return_code group_commit_lock::acquire(value_type num)
+{
+ unsigned int spins = MAX_SPINS;
+
+ for(;;)
+ {
+ if (num <= value())
+ {
+ /* No need to wait.*/
+ return lock_return_code::EXPIRED;
+ }
+
+ if(spins-- == 0)
+ break;
+ if (num > pending())
+ {
+ /* Longer wait expected (longer than currently running operation),
+ don't spin.*/
+ break;
+ }
+ ut_delay(1);
+ }
+
+ thread_local_waiter.m_value = num;
+ std::unique_lock<std::mutex> lk(m_mtx, std::defer_lock);
+ while (num > value())
+ {
+ lk.lock();
+
+ /* Re-read current value after acquiring the lock*/
+ if (num <= value())
+ {
+ return lock_return_code::EXPIRED;
+ }
+
+ if (!m_lock)
+ {
+ /* Take the lock, become group commit leader.*/
+ m_lock = true;
+#ifndef DBUG_OFF
+ m_owner_id = std::this_thread::get_id();
+#endif
+ return lock_return_code::ACQUIRED;
+ }
+
+ /* Add yourself to waiters list.*/
+ thread_local_waiter.m_next = m_waiters_list;
+ m_waiters_list = &thread_local_waiter;
+ lk.unlock();
+
+ /* Sleep until woken in release().*/
+ thd_wait_begin(0,THD_WAIT_GROUP_COMMIT);
+ thread_local_waiter.m_sema.wait();
+ thd_wait_end(0);
+
+ }
+ return lock_return_code::EXPIRED;
+}
+
+void group_commit_lock::release(value_type num)
+{
+ std::unique_lock<std::mutex> lk(m_mtx);
+ m_lock = false;
+
+ /* Update current value. */
+ ut_a(num >= value());
+ m_value.store(num, std::memory_order_relaxed);
+
+ /*
+ Wake waiters with value <= the current value.
+ Wake one more waiter, who will become the next group commit leader.
+ */
+ group_commit_waiter_t* cur, * prev, * next;
+ group_commit_waiter_t* wakeup_list = nullptr;
+ int extra_wake = 0;
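+
+ /* Example (illustrative): if num == 150 and the list holds waiters
+ for 300, 200 and 100 (head first), the waiter for 100 qualifies
+ (100 <= 150), and the first other waiter encountered (300 here) is
+ woken as the extra one, to become the next group commit leader. */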
+
+ for (prev= nullptr, cur= m_waiters_list; cur; cur= next)
+ {
+ next= cur->m_next;
+ if (cur->m_value <= num || extra_wake++ == 0)
+ {
+ /* Move current waiter to wakeup_list*/
+
+ if (!prev)
+ {
+ /* Remove from the start of the list.*/
+ m_waiters_list = next;
+ }
+ else
+ {
+ /* Remove from the middle of the list.*/
+ prev->m_next= cur->m_next;
+ }
+
+ /* Append entry to the wakeup list.*/
+ cur->m_next = wakeup_list;
+ wakeup_list = cur;
+ }
+ else
+ {
+ prev= cur;
+ }
+ }
+ lk.unlock();
+
+ for (cur= wakeup_list; cur; cur= next)
+ {
+ next= cur->m_next;
+ cur->m_sema.wake();
+ }
+}
+
+#ifndef DBUG_OFF
+bool group_commit_lock::is_owner()
+{
+ return m_lock && std::this_thread::get_id() == m_owner_id;
+}
+#endif
+
diff --git a/storage/innobase/log/log0sync.h b/storage/innobase/log/log0sync.h
new file mode 100644
index 00000000..40afbf74
--- /dev/null
+++ b/storage/innobase/log/log0sync.h
@@ -0,0 +1,81 @@
+/*****************************************************************************
+Copyright (c) 2020 MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#include <atomic>
+#include <thread>
+#include <log0types.h>
+
+struct group_commit_waiter_t;
+
+/**
+Special synchronization primitive, which is helpful for
+performing group commit.
+
+It has a state consisting of
+ - locked (bool)
+ - current value (number). This value is always increasing.
+ - pending value (number). The current value may soon become this number.
+ This is only used for optimization, and does not have to be exact.
+
+Operations supported on this semaphore
+
+1. acquire(num):
+- waits until the current value reaches num, or until the lock is granted.
+- returns EXPIRED if current_value >= num,
+ or ACQUIRED if current_value < num and the lock is granted.
+
+2. release(num):
+- releases the lock
+- sets the new current value to max(num, current_value)
+- releases some threads waiting in acquire()
+
+3. value():
+- reads the current value
+
+4. pending():
+- reads the pending value
+
+5. set_pending(num):
+- sets the pending value; num must not be less than the current value
+*/
+class group_commit_lock
+{
+ using value_type = lsn_t;
+#ifndef DBUG_OFF
+ std::thread::id m_owner_id{};
+#endif
+ std::mutex m_mtx;
+ std::atomic<value_type> m_value;
+ std::atomic<value_type> m_pending_value;
+ bool m_lock;
+ group_commit_waiter_t* m_waiters_list;
+public:
+ group_commit_lock();
+ enum lock_return_code
+ {
+ ACQUIRED,
+ EXPIRED
+ };
+ lock_return_code acquire(value_type num);
+ void release(value_type num);
+ value_type value() const;
+ value_type pending() const;
+ void set_pending(value_type num);
+#ifndef DBUG_OFF
+ bool is_owner();
+#endif
+};
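+
+/*
+Usage sketch (illustrative only; do_write() is a hypothetical helper,
+not part of this header):
+
+  group_commit_lock lock;
+  // a writer thread that wants the log written up to `lsn`:
+  if (lock.acquire(lsn) == group_commit_lock::ACQUIRED)
+  {
+    lsn_t written= do_write(lsn);  // perform the write as leader
+    lock.release(written);         // publish and wake qualifying waiters
+  }
+  // on EXPIRED, another thread already wrote up to at least `lsn`
+*/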
diff --git a/storage/innobase/lz4.cmake b/storage/innobase/lz4.cmake
new file mode 100644
index 00000000..a908dd3b
--- /dev/null
+++ b/storage/innobase/lz4.cmake
@@ -0,0 +1,38 @@
+# Copyright (C) 2014, SkySQL Ab. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+SET(WITH_INNODB_LZ4 AUTO CACHE STRING
+ "Build with lz4. Possible values are 'ON', 'OFF', 'AUTO' and default is 'AUTO'")
+
+MACRO (MYSQL_CHECK_LZ4)
+ IF (WITH_INNODB_LZ4 STREQUAL "ON" OR WITH_INNODB_LZ4 STREQUAL "AUTO")
+ CHECK_INCLUDE_FILES(lz4.h HAVE_LZ4_H)
+ CHECK_LIBRARY_EXISTS(lz4 LZ4_compress_limitedOutput "" HAVE_LZ4_SHARED_LIB)
+ CHECK_LIBRARY_EXISTS(lz4 LZ4_compress_default "" HAVE_LZ4_COMPRESS_DEFAULT)
+
+ IF (HAVE_LZ4_SHARED_LIB AND HAVE_LZ4_H)
+ SET(HAVE_INNODB_LZ4 TRUE)
+ ADD_DEFINITIONS(-DHAVE_LZ4=1)
+ IF (HAVE_LZ4_COMPRESS_DEFAULT)
+ ADD_DEFINITIONS(-DHAVE_LZ4_COMPRESS_DEFAULT=1)
+ ENDIF()
+ LINK_LIBRARIES(lz4)
+ ELSE()
+ IF (WITH_INNODB_LZ4 STREQUAL "ON")
+ MESSAGE(FATAL_ERROR "Required lz4 library is not found")
+ ENDIF()
+ ENDIF()
+ ENDIF()
+ ADD_FEATURE_INFO(INNODB_LZ4 HAVE_INNODB_LZ4 "LZ4 compression in the InnoDB storage engine")
+ENDMACRO()
diff --git a/storage/innobase/lzma.cmake b/storage/innobase/lzma.cmake
new file mode 100644
index 00000000..3060139c
--- /dev/null
+++ b/storage/innobase/lzma.cmake
@@ -0,0 +1,35 @@
+# Copyright (C) 2014, SkySQL Ab. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+SET(WITH_INNODB_LZMA AUTO CACHE STRING
+ "Build with lzma. Possible values are 'ON', 'OFF', 'AUTO' and default is 'AUTO'")
+
+MACRO (MYSQL_CHECK_LZMA)
+ IF (WITH_INNODB_LZMA STREQUAL "ON" OR WITH_INNODB_LZMA STREQUAL "AUTO")
+ CHECK_INCLUDE_FILES(lzma.h HAVE_LZMA_H)
+ CHECK_LIBRARY_EXISTS(lzma lzma_stream_buffer_decode "" HAVE_LZMA_DECODE)
+ CHECK_LIBRARY_EXISTS(lzma lzma_easy_buffer_encode "" HAVE_LZMA_ENCODE)
+
+ IF (HAVE_LZMA_DECODE AND HAVE_LZMA_ENCODE AND HAVE_LZMA_H)
+ SET(HAVE_INNODB_LZMA TRUE)
+ ADD_DEFINITIONS(-DHAVE_LZMA=1)
+ LINK_LIBRARIES(lzma)
+ ELSE()
+ IF (WITH_INNODB_LZMA STREQUAL "ON")
+ MESSAGE(FATAL_ERROR "Required lzma library is not found")
+ ENDIF()
+ ENDIF()
+ ENDIF()
+ ADD_FEATURE_INFO(INNODB_LZMA HAVE_INNODB_LZMA "LZMA compression in the InnoDB storage engine")
+ENDMACRO()
diff --git a/storage/innobase/lzo.cmake b/storage/innobase/lzo.cmake
new file mode 100644
index 00000000..ca2de6ab
--- /dev/null
+++ b/storage/innobase/lzo.cmake
@@ -0,0 +1,34 @@
+# Copyright (C) 2014, SkySQL Ab. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+SET(WITH_INNODB_LZO AUTO CACHE STRING
+ "Build with lzo. Possible values are 'ON', 'OFF', 'AUTO' and default is 'AUTO'")
+
+MACRO (MYSQL_CHECK_LZO)
+ IF (WITH_INNODB_LZO STREQUAL "ON" OR WITH_INNODB_LZO STREQUAL "AUTO")
+ CHECK_INCLUDE_FILES(lzo/lzo1x.h HAVE_LZO_H)
+ CHECK_LIBRARY_EXISTS(lzo2 lzo1x_1_compress "" HAVE_LZO_SHARED_LIB)
+
+ IF(HAVE_LZO_SHARED_LIB AND HAVE_LZO_H)
+ SET(HAVE_INNODB_LZO TRUE)
+ ADD_DEFINITIONS(-DHAVE_LZO=1)
+ LINK_LIBRARIES(lzo2)
+ ELSE()
+ IF (WITH_INNODB_LZO STREQUAL "ON")
+ MESSAGE(FATAL_ERROR "Required lzo library is not found")
+ ENDIF()
+ ENDIF()
+ ENDIF()
+ ADD_FEATURE_INFO(INNODB_LZO HAVE_INNODB_LZO "LZO compression in the InnoDB storage engine")
+ENDMACRO()
diff --git a/storage/innobase/mem/mem0mem.cc b/storage/innobase/mem/mem0mem.cc
new file mode 100644
index 00000000..6d4593e0
--- /dev/null
+++ b/storage/innobase/mem/mem0mem.cc
@@ -0,0 +1,436 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file mem/mem0mem.cc
+The memory management
+
+Created 6/9/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0mem.h"
+#include "buf0buf.h"
+#include "srv0srv.h"
+#include <stdarg.h>
+
+/**********************************************************************//**
+Concatenate two strings and return the result, using a memory heap.
+@return own: the result */
+char*
+mem_heap_strcat(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* s1, /*!< in: string 1 */
+ const char* s2) /*!< in: string 2 */
+{
+ char* s;
+ ulint s1_len = strlen(s1);
+ ulint s2_len = strlen(s2);
+
+ s = static_cast<char*>(mem_heap_alloc(heap, s1_len + s2_len + 1));
+
+ memcpy(s, s1, s1_len);
+ memcpy(s + s1_len, s2, s2_len);
+
+ s[s1_len + s2_len] = '\0';
+
+ return(s);
+}
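+
+/* Illustrative use (a sketch; heap, dir and name are hypothetical):
+
+	char*	path = mem_heap_strcat(heap, dir, name);
+
+The result is owned by the heap and is released together with it by
+mem_heap_free(heap); individual heap allocations are never freed. */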
+
+
+/****************************************************************//**
+Helper function for mem_heap_printf.
+@return length of formatted string, including terminating NUL */
+static
+ulint
+mem_heap_printf_low(
+/*================*/
+ char* buf, /*!< in/out: buffer to store formatted string
+ in, or NULL to just calculate length */
+ const char* format, /*!< in: format string */
+ va_list ap) /*!< in: arguments */
+{
+ ulint len = 0;
+
+ while (*format) {
+
+		/* Does this format specifier have the 'l' length modifier? */
+ ibool is_long = FALSE;
+
+ /* Length of one parameter. */
+ size_t plen;
+
+ if (*format++ != '%') {
+ /* Non-format character. */
+
+ len++;
+
+ if (buf) {
+ *buf++ = *(format - 1);
+ }
+
+ continue;
+ }
+
+ if (*format == 'l') {
+ is_long = TRUE;
+ format++;
+ }
+
+ switch (*format++) {
+ case 's':
+ /* string */
+ {
+ char* s = va_arg(ap, char*);
+
+			/* "%ls" is a nonsensical format specifier. */
+ ut_a(!is_long);
+
+ plen = strlen(s);
+ len += plen;
+
+ if (buf) {
+ memcpy(buf, s, plen);
+ buf += plen;
+ }
+ }
+
+ break;
+
+ case 'u':
+ /* unsigned int */
+ {
+ char tmp[32];
+ unsigned long val;
+
+ /* We only support 'long' values for now. */
+ ut_a(is_long);
+
+ val = va_arg(ap, unsigned long);
+
+ plen = size_t(sprintf(tmp, "%lu", val));
+ len += plen;
+
+ if (buf) {
+ memcpy(buf, tmp, plen);
+ buf += plen;
+ }
+ }
+
+ break;
+
+ case '%':
+
+			/* "%l%" is a nonsensical format specifier. */
+ ut_a(!is_long);
+
+ len++;
+
+ if (buf) {
+ *buf++ = '%';
+ }
+
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+
+ /* For the NUL character. */
+ len++;
+
+ if (buf) {
+ *buf = '\0';
+ }
+
+ return(len);
+}
+
+/****************************************************************//**
+A simple sprintf replacement that dynamically allocates the space for the
+formatted string from the given heap. This supports a very limited set of
+the printf syntax: types 's' and 'u' and length modifier 'l' (which is
+required for the 'u' type).
+@return heap-allocated formatted string */
+char*
+mem_heap_printf(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* format, /*!< in: format string */
+ ...)
+{
+ va_list ap;
+ char* str;
+ ulint len;
+
+ /* Calculate length of string */
+ len = 0;
+ va_start(ap, format);
+ len = mem_heap_printf_low(NULL, format, ap);
+ va_end(ap);
+
+ /* Now create it for real. */
+ str = static_cast<char*>(mem_heap_alloc(heap, len));
+ va_start(ap, format);
+ mem_heap_printf_low(str, format, ap);
+ va_end(ap);
+
+ return(str);
+}
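+
+/* Illustrative use (a sketch; heap, name and n_rows are hypothetical):
+
+	char*	msg = mem_heap_printf(heap, "table %s has %lu rows",
+				      name, (unsigned long) n_rows);
+
+Only 's', 'u' (with the mandatory 'l' modifier) and '%' are accepted;
+anything else, such as "%d", would trip the assertions above. */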
+
+#ifdef UNIV_DEBUG
+/** Validates the contents of a memory heap.
+Checks a memory heap for consistency; a fatal debug assertion is
+raised if an inconsistency is detected.
+@param[in] heap Memory heap to validate. */
+void
+mem_heap_validate(
+ const mem_heap_t* heap)
+{
+ ulint size = 0;
+
+ for (const mem_block_t* block = heap;
+ block != NULL;
+ block = UT_LIST_GET_NEXT(list, block)) {
+
+ switch (block->type) {
+ case MEM_HEAP_DYNAMIC:
+ break;
+ case MEM_HEAP_BUFFER:
+ case MEM_HEAP_BUFFER | MEM_HEAP_BTR_SEARCH:
+ ut_ad(block->len <= srv_page_size);
+ break;
+ default:
+ ut_error;
+ }
+
+ size += block->len;
+ }
+
+ ut_ad(size == heap->total_size);
+}
+
+/** Copy the tail of a string.
+@param[in,out] dst destination buffer
+@param[in] src string whose tail to copy
+@param[in]	size	size of dst buffer, in bytes, including NUL terminator */
+static void ut_strlcpy_rev(char* dst, const char* src, ulint size)
+{
+ size_t src_size = strlen(src), n = std::min(src_size, size - 1);
+ memcpy(dst, src + src_size - n, n + 1);
+}
+#endif /* UNIV_DEBUG */
+
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+@return own: memory heap block, or NULL if allocation failed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_create_block_func(
+/*=======================*/
+ mem_heap_t* heap, /*!< in: memory heap or NULL if first block
+ should be created */
+ ulint n, /*!< in: number of bytes needed for user data */
+#ifdef UNIV_DEBUG
+ const char* file_name,/*!< in: file name where created */
+ unsigned line, /*!< in: line where created */
+#endif /* UNIV_DEBUG */
+ ulint type) /*!< in: type of heap: MEM_HEAP_DYNAMIC or
+ MEM_HEAP_BUFFER */
+{
+ buf_block_t* buf_block = NULL;
+ mem_block_t* block;
+ ulint len;
+
+ ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+ || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+
+ if (heap != NULL) {
+ ut_d(mem_heap_validate(heap));
+ }
+
+ /* In dynamic allocation, calculate the size: block header + data. */
+ len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n);
+
+ if (type == MEM_HEAP_DYNAMIC || len < srv_page_size / 2) {
+
+ ut_ad(type == MEM_HEAP_DYNAMIC || n <= MEM_MAX_ALLOC_IN_BUF);
+
+ block = static_cast<mem_block_t*>(ut_malloc_nokey(len));
+ } else {
+ len = srv_page_size;
+
+ if ((type & MEM_HEAP_BTR_SEARCH) && heap) {
+ /* We cannot allocate the block from the
+ buffer pool, but must get the free block from
+ the heap header free block field */
+
+ buf_block = static_cast<buf_block_t*>(heap->free_block);
+ heap->free_block = NULL;
+
+ if (UNIV_UNLIKELY(!buf_block)) {
+
+ return(NULL);
+ }
+ } else {
+ buf_block = buf_block_alloc();
+ }
+
+ block = (mem_block_t*) buf_block->frame;
+ }
+
+ if (block == NULL) {
+ ib::fatal() << "Unable to allocate memory of size "
+ << len << ".";
+ }
+
+ block->buf_block = buf_block;
+ block->free_block = NULL;
+
+ ut_d(ut_strlcpy_rev(block->file_name, file_name,
+ sizeof(block->file_name)));
+ ut_d(block->line = line);
+
+ mem_block_set_len(block, len);
+ mem_block_set_type(block, type);
+ mem_block_set_free(block, MEM_BLOCK_HEADER_SIZE);
+ mem_block_set_start(block, MEM_BLOCK_HEADER_SIZE);
+
+ if (UNIV_UNLIKELY(heap == NULL)) {
+ /* This is the first block of the heap. The field
+ total_size should be initialized here */
+ block->total_size = len;
+ } else {
+ /* Not the first allocation for the heap. This block's
+		total_size field should be set to undefined. */
+ ut_d(block->total_size = ULINT_UNDEFINED);
+ MEM_UNDEFINED(&block->total_size, sizeof block->total_size);
+
+ heap->total_size += len;
+ }
+
+ /* Poison all available memory. Individual chunks will be unpoisoned on
+ every mem_heap_alloc() call. */
+ compile_time_assert(MEM_BLOCK_HEADER_SIZE >= sizeof *block);
+ MEM_NOACCESS(block + 1, len - sizeof *block);
+
+ ut_ad((ulint)MEM_BLOCK_HEADER_SIZE < len);
+
+ return(block);
+}
+
+/***************************************************************//**
+Adds a new block to a memory heap.
+@return created block, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_add_block(
+/*===============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: number of bytes user needs */
+{
+ mem_block_t* block;
+ mem_block_t* new_block;
+ ulint new_size;
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ /* We have to allocate a new block. The size is always at least
+ doubled until the standard size is reached. After that the size
+ stays the same, except in cases where the caller needs more space. */
+
+ new_size = 2 * mem_block_get_len(block);
+
+ if (heap->type != MEM_HEAP_DYNAMIC) {
+ /* From the buffer pool we allocate buffer frames */
+ ut_a(n <= MEM_MAX_ALLOC_IN_BUF);
+
+ if (new_size > MEM_MAX_ALLOC_IN_BUF) {
+ new_size = MEM_MAX_ALLOC_IN_BUF;
+ }
+ } else if (new_size > MEM_BLOCK_STANDARD_SIZE) {
+
+ new_size = MEM_BLOCK_STANDARD_SIZE;
+ }
+
+ if (new_size < n) {
+ new_size = n;
+ }
+
+ new_block = mem_heap_create_block(heap, new_size, heap->type,
+ heap->file_name, heap->line);
+ if (new_block == NULL) {
+
+ return(NULL);
+ }
+
+ /* Add the new block as the last block */
+
+ UT_LIST_INSERT_AFTER(heap->base, block, new_block);
+
+ return(new_block);
+}
+
+/******************************************************************//**
+Frees a block from a memory heap. */
+void
+mem_heap_block_free(
+/*================*/
+ mem_heap_t* heap, /*!< in: heap */
+ mem_block_t* block) /*!< in: block to free */
+{
+ ulint type;
+ ulint len;
+ buf_block_t* buf_block;
+
+ buf_block = static_cast<buf_block_t*>(block->buf_block);
+
+ UT_LIST_REMOVE(heap->base, block);
+
+ ut_ad(heap->total_size >= block->len);
+ heap->total_size -= block->len;
+
+ type = heap->type;
+ len = block->len;
+
+ if (type == MEM_HEAP_DYNAMIC || len < srv_page_size / 2) {
+ ut_ad(!buf_block);
+ ut_free(block);
+ } else {
+ ut_ad(type & MEM_HEAP_BUFFER);
+ buf_block_free(buf_block);
+ }
+}
+
+/******************************************************************//**
+Frees the free_block field from a memory heap. */
+void
+mem_heap_free_block_free(
+/*=====================*/
+ mem_heap_t* heap) /*!< in: heap */
+{
+ if (UNIV_LIKELY_NULL(heap->free_block)) {
+
+ buf_block_free(static_cast<buf_block_t*>(heap->free_block));
+
+ heap->free_block = NULL;
+ }
+}
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
new file mode 100644
index 00000000..691b3935
--- /dev/null
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -0,0 +1,1121 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file mtr/mtr0mtr.cc
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0mtr.h"
+
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "fsp0sysspace.h"
+#include "page0types.h"
+#include "mtr0log.h"
+#include "log0recv.h"
+
+/** Iterate over a memo block in reverse. */
+template <typename Functor>
+struct CIterate {
+ CIterate() : functor() {}
+
+ CIterate(const Functor& functor) : functor(functor) {}
+
+ /** @return false if the functor returns false. */
+ bool operator()(mtr_buf_t::block_t* block) const
+ {
+ const mtr_memo_slot_t* start =
+ reinterpret_cast<const mtr_memo_slot_t*>(
+ block->begin());
+
+ mtr_memo_slot_t* slot =
+ reinterpret_cast<mtr_memo_slot_t*>(
+ block->end());
+
+ ut_ad(!(block->used() % sizeof(*slot)));
+
+ while (slot-- != start) {
+
+ if (!functor(slot)) {
+ return(false);
+ }
+ }
+
+ return(true);
+ }
+
+ Functor functor;
+};
+
+template <typename Functor>
+struct Iterate {
+ Iterate() : functor() {}
+
+ Iterate(const Functor& functor) : functor(functor) {}
+
+ /** @return false if the functor returns false. */
+ bool operator()(mtr_buf_t::block_t* block)
+ {
+ const mtr_memo_slot_t* start =
+ reinterpret_cast<const mtr_memo_slot_t*>(
+ block->begin());
+
+ mtr_memo_slot_t* slot =
+ reinterpret_cast<mtr_memo_slot_t*>(
+ block->end());
+
+ ut_ad(!(block->used() % sizeof(*slot)));
+
+ while (slot-- != start) {
+
+ if (!functor(slot)) {
+ return(false);
+ }
+ }
+
+ return(true);
+ }
+
+ Functor functor;
+};
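+
+/* These adapters run a per-slot functor over every mtr_memo_slot_t in a
+memo block, newest slot first; a false return from the functor stops the
+scan. For example, mtr_t::memo_release() below uses
+	Iterate<Find> iteration(Find(object, type));
+	m_memo.for_each_block_in_reverse(iteration);
+CIterate is the variant whose operator() is const. */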
+
+/** Find a specific object */
+struct Find {
+
+ /** Constructor */
+ Find(const void* object, ulint type)
+ :
+ m_slot(),
+ m_type(type),
+ m_object(object)
+ {
+ ut_a(object != NULL);
+ }
+
+ /** @return false if the object was found. */
+ bool operator()(mtr_memo_slot_t* slot)
+ {
+ if (m_object == slot->object && m_type == slot->type) {
+ m_slot = slot;
+ return(false);
+ }
+
+ return(true);
+ }
+
+ /** Slot if found */
+	mtr_memo_slot_t* m_slot;
+
+ /** Type of the object to look for */
+ const ulint m_type;
+
+ /** The object instance to look for */
+ const void* m_object;
+};
+
+/** Find a page frame */
+struct FindPage
+{
+ /** Constructor
+ @param[in] ptr pointer to within a page frame
+ @param[in] flags MTR_MEMO flags to look for */
+ FindPage(const void* ptr, ulint flags)
+ : m_ptr(ptr), m_flags(flags), m_slot(NULL)
+ {
+ /* There must be some flags to look for. */
+ ut_ad(flags);
+ /* We can only look for page-related flags. */
+ ut_ad(!(flags & ulint(~(MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_SX_FIX
+ | MTR_MEMO_BUF_FIX
+ | MTR_MEMO_MODIFY))));
+ }
+
+ /** Visit a memo entry.
+ @param[in] slot memo entry to visit
+ @retval false if a page was found
+ @retval true if the iteration should continue */
+ bool operator()(mtr_memo_slot_t* slot)
+ {
+ ut_ad(m_slot == NULL);
+
+ if (!(m_flags & slot->type) || slot->object == NULL) {
+ return(true);
+ }
+
+ buf_block_t* block = reinterpret_cast<buf_block_t*>(
+ slot->object);
+
+ if (m_ptr < block->frame
+ || m_ptr >= block->frame + srv_page_size) {
+ return(true);
+ }
+
+ ut_ad(!(m_flags & (MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_SX_FIX
+ | MTR_MEMO_PAGE_X_FIX))
+ || rw_lock_own_flagged(&block->lock, m_flags));
+
+ m_slot = slot;
+ return(false);
+ }
+
+ /** @return the slot that was found */
+ mtr_memo_slot_t* get_slot() const
+ {
+ ut_ad(m_slot != NULL);
+ return(m_slot);
+ }
+ /** @return the block that was found */
+ buf_block_t* get_block() const
+ {
+ return(reinterpret_cast<buf_block_t*>(get_slot()->object));
+ }
+private:
+ /** Pointer inside a page frame to look for */
+ const void*const m_ptr;
+ /** MTR_MEMO flags to look for */
+ const ulint m_flags;
+ /** The slot corresponding to m_ptr */
+ mtr_memo_slot_t* m_slot;
+};
+
+/** Release latches and decrement the buffer fix count.
+@param slot memo slot */
+static void memo_slot_release(mtr_memo_slot_t *slot)
+{
+ switch (slot->type) {
+ case MTR_MEMO_S_LOCK:
+ rw_lock_s_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
+ break;
+ case MTR_MEMO_SX_LOCK:
+ rw_lock_sx_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
+ break;
+ case MTR_MEMO_SPACE_X_LOCK:
+ {
+ fil_space_t *space= static_cast<fil_space_t*>(slot->object);
+ space->set_committed_size();
+ rw_lock_x_unlock(&space->latch);
+ }
+ break;
+ case MTR_MEMO_X_LOCK:
+ rw_lock_x_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
+ break;
+ default:
+#ifdef UNIV_DEBUG
+ switch (slot->type & ~MTR_MEMO_MODIFY) {
+ case MTR_MEMO_BUF_FIX:
+ case MTR_MEMO_PAGE_S_FIX:
+ case MTR_MEMO_PAGE_SX_FIX:
+ case MTR_MEMO_PAGE_X_FIX:
+ break;
+ default:
+ ut_ad("invalid type" == 0);
+ break;
+ }
+#endif /* UNIV_DEBUG */
+ buf_block_t *block= reinterpret_cast<buf_block_t*>(slot->object);
+ buf_page_release_latch(block, slot->type & ~MTR_MEMO_MODIFY);
+ block->unfix();
+ break;
+ }
+ slot->object= nullptr;
+}
+
+/** Release the latches acquired by the mini-transaction. */
+struct ReleaseLatches {
+ /** @return true always. */
+ bool operator()(mtr_memo_slot_t *slot) const
+ {
+ if (!slot->object)
+ return true;
+ switch (slot->type) {
+ case MTR_MEMO_S_LOCK:
+ rw_lock_s_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
+ break;
+ case MTR_MEMO_SPACE_X_LOCK:
+ {
+ fil_space_t *space= static_cast<fil_space_t*>(slot->object);
+ space->set_committed_size();
+ rw_lock_x_unlock(&space->latch);
+ }
+ break;
+ case MTR_MEMO_X_LOCK:
+ rw_lock_x_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
+ break;
+ case MTR_MEMO_SX_LOCK:
+ rw_lock_sx_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
+ break;
+ default:
+#ifdef UNIV_DEBUG
+ switch (slot->type & ~MTR_MEMO_MODIFY) {
+ case MTR_MEMO_BUF_FIX:
+ case MTR_MEMO_PAGE_S_FIX:
+ case MTR_MEMO_PAGE_SX_FIX:
+ case MTR_MEMO_PAGE_X_FIX:
+ break;
+ default:
+ ut_ad("invalid type" == 0);
+ break;
+ }
+#endif /* UNIV_DEBUG */
+ buf_block_t *block= reinterpret_cast<buf_block_t*>(slot->object);
+ buf_page_release_latch(block, slot->type & ~MTR_MEMO_MODIFY);
+ block->unfix();
+ break;
+ }
+ slot->object= NULL;
+ return true;
+ }
+};
+
+/** Release the latches and blocks acquired by the mini-transaction. */
+struct ReleaseAll {
+ /** @return true always. */
+ bool operator()(mtr_memo_slot_t *slot) const
+ {
+ if (slot->object)
+ memo_slot_release(slot);
+ return true;
+ }
+};
+
+#ifdef UNIV_DEBUG
+/** Check that all slots have been handled. */
+struct DebugCheck {
+ /** @return true always. */
+ bool operator()(const mtr_memo_slot_t* slot) const
+ {
+ ut_ad(!slot->object);
+ return(true);
+ }
+};
+#endif
+
+/** Release page latches held by the mini-transaction. */
+struct ReleaseBlocks
+{
+ const lsn_t start, end;
+#ifdef UNIV_DEBUG
+ const mtr_buf_t &memo;
+
+ ReleaseBlocks(lsn_t start, lsn_t end, const mtr_buf_t &memo) :
+ start(start), end(end), memo(memo)
+#else /* UNIV_DEBUG */
+ ReleaseBlocks(lsn_t start, lsn_t end, const mtr_buf_t&) :
+ start(start), end(end)
+#endif /* UNIV_DEBUG */
+ {
+ ut_ad(start);
+ ut_ad(end);
+ }
+
+ /** @return true always */
+ bool operator()(mtr_memo_slot_t* slot) const
+ {
+ if (!slot->object)
+ return true;
+ switch (slot->type) {
+ case MTR_MEMO_PAGE_X_MODIFY:
+ case MTR_MEMO_PAGE_SX_MODIFY:
+ break;
+ default:
+ ut_ad(!(slot->type & MTR_MEMO_MODIFY));
+ return true;
+ }
+
+ buf_flush_note_modification(static_cast<buf_block_t*>(slot->object),
+ start, end);
+ return true;
+ }
+};
+
+/** Start a mini-transaction. */
+void mtr_t::start()
+{
+ ut_ad(!m_freed_pages);
+ ut_ad(!m_freed_space);
+ MEM_UNDEFINED(this, sizeof *this);
+ MEM_MAKE_DEFINED(&m_freed_space, sizeof m_freed_space);
+ MEM_MAKE_DEFINED(&m_freed_pages, sizeof m_freed_pages);
+
+ ut_d(m_start= true);
+ ut_d(m_commit= false);
+
+ m_last= nullptr;
+ m_last_offset= 0;
+
+ new(&m_memo) mtr_buf_t();
+ new(&m_log) mtr_buf_t();
+
+ m_made_dirty= false;
+ m_inside_ibuf= false;
+ m_modifications= false;
+ m_log_mode= MTR_LOG_ALL;
+ ut_d(m_user_space_id= TRX_SYS_SPACE);
+ m_user_space= nullptr;
+ m_commit_lsn= 0;
+ m_trim_pages= false;
+}
+
+/** Release the resources */
+inline void mtr_t::release_resources()
+{
+ ut_ad(is_active());
+ ut_d(m_memo.for_each_block_in_reverse(CIterate<DebugCheck>()));
+ m_log.erase();
+ m_memo.erase();
+ ut_d(m_commit= true);
+}
+
+/** Commit a mini-transaction. */
+void mtr_t::commit()
+{
+ ut_ad(is_active());
+ ut_ad(!is_inside_ibuf());
+
+ /* This is a dirty read, for debugging. */
+ ut_ad(!m_modifications || !recv_no_log_write);
+ ut_ad(!m_modifications || m_log_mode != MTR_LOG_NONE);
+
+ if (m_modifications && (m_log_mode == MTR_LOG_NO_REDO || !m_log.empty()))
+ {
+ ut_ad(!srv_read_only_mode || m_log_mode == MTR_LOG_NO_REDO);
+
+ std::pair<lsn_t,page_flush_ahead> lsns;
+
+ if (const ulint len= prepare_write())
+ lsns= finish_write(len);
+ else
+ lsns= { m_commit_lsn, PAGE_FLUSH_NO };
+
+ if (m_made_dirty)
+ mysql_mutex_lock(&log_sys.flush_order_mutex);
+
+ /* It is now safe to release the log mutex because the
+ flush_order mutex will ensure that we are the first one
+ to insert into the flush list. */
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ if (m_freed_pages)
+ {
+ ut_ad(!m_freed_pages->empty());
+ ut_ad(m_freed_space);
+ ut_ad(memo_contains(*m_freed_space));
+ ut_ad(is_named_space(m_freed_space));
+ /* Update the last freed lsn */
+ m_freed_space->update_last_freed_lsn(m_commit_lsn);
+
+ if (!is_trim_pages())
+ for (const auto &range : *m_freed_pages)
+ m_freed_space->add_free_range(range);
+ else
+ m_freed_space->clear_freed_ranges();
+ delete m_freed_pages;
+ m_freed_pages= nullptr;
+ m_freed_space= nullptr;
+ /* mtr_t::start() will reset m_trim_pages */
+ }
+ else
+ ut_ad(!m_freed_space);
+
+ m_memo.for_each_block_in_reverse(CIterate<const ReleaseBlocks>
+ (ReleaseBlocks(lsns.first, m_commit_lsn,
+ m_memo)));
+ if (m_made_dirty)
+ mysql_mutex_unlock(&log_sys.flush_order_mutex);
+
+ m_memo.for_each_block_in_reverse(CIterate<ReleaseLatches>());
+
+ if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO))
+ buf_flush_ahead(m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC);
+
+ if (m_made_dirty)
+ srv_stats.log_write_requests.inc();
+ }
+ else
+ m_memo.for_each_block_in_reverse(CIterate<ReleaseAll>());
+
+ release_resources();
+}
+
+/** Commit a mini-transaction that did not modify any pages,
+but generated some redo log on a higher level, such as
+FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
+The caller must hold log_sys.mutex.
+This is to be used at log_checkpoint().
+@param[in] checkpoint_lsn log checkpoint LSN, or 0 */
+void mtr_t::commit_files(lsn_t checkpoint_lsn)
+{
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ ut_ad(is_active());
+ ut_ad(!is_inside_ibuf());
+ ut_ad(m_log_mode == MTR_LOG_ALL);
+ ut_ad(!m_made_dirty);
+ ut_ad(m_memo.size() == 0);
+ ut_ad(!srv_read_only_mode);
+ ut_ad(!m_freed_space);
+ ut_ad(!m_freed_pages);
+
+ if (checkpoint_lsn) {
+ byte* ptr = m_log.push<byte*>(SIZE_OF_FILE_CHECKPOINT);
+ compile_time_assert(SIZE_OF_FILE_CHECKPOINT == 3 + 8 + 1);
+ *ptr = FILE_CHECKPOINT | (SIZE_OF_FILE_CHECKPOINT - 2);
+ ::memset(ptr + 1, 0, 2);
+ mach_write_to_8(ptr + 3, checkpoint_lsn);
+ ptr[3 + 8] = 0;
+ } else {
+ *m_log.push<byte*>(1) = 0;
+ }
+
+ finish_write(m_log.size());
+ srv_stats.log_write_requests.inc();
+ release_resources();
+
+ if (checkpoint_lsn) {
+ DBUG_PRINT("ib_log",
+ ("FILE_CHECKPOINT(" LSN_PF ") written at " LSN_PF,
+ checkpoint_lsn, log_sys.get_lsn()));
+ }
+}
+
+#ifdef UNIV_DEBUG
+/** Check if a tablespace is associated with the mini-transaction
+(needed for generating a FILE_MODIFY record)
+@param[in] space tablespace
+@return whether the mini-transaction is associated with the space */
+bool
+mtr_t::is_named_space(ulint space) const
+{
+ ut_ad(!m_user_space || m_user_space->id != TRX_SYS_SPACE);
+
+ switch (m_log_mode) {
+ case MTR_LOG_NONE:
+ case MTR_LOG_NO_REDO:
+ return(true);
+ case MTR_LOG_ALL:
+ return(m_user_space_id == space
+ || is_predefined_tablespace(space));
+ }
+
+ ut_error;
+ return(false);
+}
+/** Check if a tablespace is associated with the mini-transaction
+(needed for generating a FILE_MODIFY record)
+@param[in] space tablespace
+@return whether the mini-transaction is associated with the space */
+bool mtr_t::is_named_space(const fil_space_t* space) const
+{
+ ut_ad(!m_user_space || m_user_space->id != TRX_SYS_SPACE);
+
+ switch (m_log_mode) {
+ case MTR_LOG_NONE:
+ case MTR_LOG_NO_REDO:
+ return true;
+ case MTR_LOG_ALL:
+ return m_user_space == space || is_predefined_tablespace(space->id);
+ }
+
+ ut_error;
+ return false;
+}
+#endif /* UNIV_DEBUG */
+
+/** Acquire a tablespace X-latch.
+NOTE: use mtr_x_lock_space().
+@param[in] space_id tablespace ID
+@param[in] file file name from where called
+@param[in] line line number in file
+@return the tablespace object (never NULL) */
+fil_space_t*
+mtr_t::x_lock_space(ulint space_id, const char* file, unsigned line)
+{
+ fil_space_t* space;
+
+ ut_ad(is_active());
+
+ if (space_id == TRX_SYS_SPACE) {
+ space = fil_system.sys_space;
+ } else if ((space = m_user_space) && space_id == space->id) {
+ } else {
+ space = fil_space_get(space_id);
+ ut_ad(m_log_mode != MTR_LOG_NO_REDO
+ || space->purpose == FIL_TYPE_TEMPORARY
+ || space->purpose == FIL_TYPE_IMPORT);
+ }
+
+ ut_ad(space);
+ ut_ad(space->id == space_id);
+ x_lock_space(space, file, line);
+ return(space);
+}
+
+/** Release an object in the memo stack.
+@return true if released */
+bool
+mtr_t::memo_release(const void* object, ulint type)
+{
+ ut_ad(is_active());
+
+ /* We cannot release a page that has been written to in the
+ middle of a mini-transaction. */
+ ut_ad(!m_modifications || type != MTR_MEMO_PAGE_X_FIX);
+
+ Iterate<Find> iteration(Find(object, type));
+
+ if (!m_memo.for_each_block_in_reverse(iteration)) {
+ memo_slot_release(iteration.functor.m_slot);
+ return(true);
+ }
+
+ return(false);
+}
+
+/** Release a page latch.
+@param[in] ptr pointer to within a page frame
+@param[in] type object type: MTR_MEMO_PAGE_X_FIX, ... */
+void
+mtr_t::release_page(const void* ptr, mtr_memo_type_t type)
+{
+ ut_ad(is_active());
+
+ /* We cannot release a page that has been written to in the
+ middle of a mini-transaction. */
+ ut_ad(!m_modifications || type != MTR_MEMO_PAGE_X_FIX);
+
+ Iterate<FindPage> iteration(FindPage(ptr, type));
+
+ if (!m_memo.for_each_block_in_reverse(iteration)) {
+ memo_slot_release(iteration.functor.get_slot());
+ return;
+ }
+
+ /* The page was not found! */
+ ut_ad(0);
+}
+
+static bool log_margin_warned;
+static time_t log_margin_warn_time;
+static bool log_close_warned;
+static time_t log_close_warn_time;
+
+/** Check that the log write will not overwrite redo log that is still
+needed since the last checkpoint. If the estimated write would exceed
+log_capacity, emit a warning and request a flush or checkpoint.
+@param len length of the data to be written */
+static void log_margin_checkpoint_age(ulint len)
+{
+ const ulint framing_size= log_sys.framing_size();
+ /* actual length stored per block */
+ const ulint len_per_blk= OS_FILE_LOG_BLOCK_SIZE - framing_size;
+
+ /* actual data length in last block already written */
+ ulint extra_len= log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE;
+
+ ut_ad(extra_len >= LOG_BLOCK_HDR_SIZE);
+ extra_len-= LOG_BLOCK_HDR_SIZE;
+
+ /* total extra length for block header and trailer */
+ extra_len= ((len + extra_len) / len_per_blk) * framing_size;
+
+ const ulint margin= len + extra_len;
+
+ mysql_mutex_assert_owner(&log_sys.mutex);
+
+ const lsn_t lsn= log_sys.get_lsn();
+
+ if (UNIV_UNLIKELY(margin > log_sys.log_capacity))
+ {
+ time_t t= time(nullptr);
+
+ /* return with warning output to avoid deadlock */
+ if (!log_margin_warned || difftime(t, log_margin_warn_time) > 15)
+ {
+ log_margin_warned= true;
+ log_margin_warn_time= t;
+
+ ib::error() << "innodb_log_file_size is too small "
+ "for mini-transaction size " << len;
+ }
+ }
+ else if (UNIV_LIKELY(lsn + margin <= log_sys.last_checkpoint_lsn +
+ log_sys.log_capacity))
+ return;
+
+ log_sys.set_check_flush_or_checkpoint();
+}
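+
+/* Worked example (assuming the traditional 512-byte log block with a
+12-byte header and 4-byte trailer, i.e. framing_size() == 16 and
+len_per_blk == 496): for len == 2000 with 112 bytes already in the last
+block, extra_len starts as 112 - 12 = 100, then
+	extra_len = ((2000 + 100) / 496) * 16 = 4 * 16 = 64
+so margin == 2064 bytes is compared against log_capacity. */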
+
+
+/** Open the log for log_write_low(). The log must be closed with log_close().
+@param len length of the data to be written
+@return start lsn of the log record */
+static lsn_t log_reserve_and_open(size_t len)
+{
+ for (ut_d(ulint count= 0);;)
+ {
+ mysql_mutex_assert_owner(&log_sys.mutex);
+
+ /* Calculate an upper limit for the space the string may take in
+ the log buffer */
+
+ size_t len_upper_limit= (4 * OS_FILE_LOG_BLOCK_SIZE) +
+ srv_log_write_ahead_size + (5 * len) / 4;
+
+ if (log_sys.buf_free + len_upper_limit <= srv_log_buffer_size)
+ break;
+
+ mysql_mutex_unlock(&log_sys.mutex);
+ DEBUG_SYNC_C("log_buf_size_exceeded");
+
+ /* Not enough free space, do a write of the log buffer */
+ log_write_up_to(log_sys.get_lsn(), false);
+
+ srv_stats.log_waits.inc();
+
+ ut_ad(++count < 50);
+
+ mysql_mutex_lock(&log_sys.mutex);
+ }
+
+ return log_sys.get_lsn();
+}
+
+/** Append data to the log buffer. */
+static void log_write_low(const void *str, size_t size)
+{
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ const ulint trailer_offset= log_sys.trailer_offset();
+
+ do
+ {
+ /* Calculate a part length */
+ size_t len= size;
+ size_t data_len= (log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE) + size;
+
+ if (data_len > trailer_offset)
+ {
+ data_len= trailer_offset;
+ len= trailer_offset - log_sys.buf_free % OS_FILE_LOG_BLOCK_SIZE;
+ }
+
+ memcpy(log_sys.buf + log_sys.buf_free, str, len);
+
+ size-= len;
+ str= static_cast<const char*>(str) + len;
+
+ byte *log_block= static_cast<byte*>(ut_align_down(log_sys.buf +
+ log_sys.buf_free,
+ OS_FILE_LOG_BLOCK_SIZE));
+
+ log_block_set_data_len(log_block, data_len);
+ lsn_t lsn= log_sys.get_lsn();
+
+ if (data_len == trailer_offset)
+ {
+ /* This block became full */
+ log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
+ log_block_set_checkpoint_no(log_block, log_sys.next_checkpoint_no);
+ len+= log_sys.framing_size();
+ lsn+= len;
+ /* Initialize the next block header */
+ log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, lsn);
+ }
+ else
+ lsn+= len;
+
+ log_sys.set_lsn(lsn);
+ log_sys.buf_free+= len;
+
+ ut_ad(log_sys.buf_free <= size_t{srv_log_buffer_size});
+ }
+ while (size);
+}
+
+/** Close the log at mini-transaction commit.
+@return whether buffer pool flushing is needed */
+static mtr_t::page_flush_ahead log_close(lsn_t lsn)
+{
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ ut_ad(lsn == log_sys.get_lsn());
+
+ byte *log_block= static_cast<byte*>(ut_align_down(log_sys.buf +
+ log_sys.buf_free,
+ OS_FILE_LOG_BLOCK_SIZE));
+
+ if (!log_block_get_first_rec_group(log_block))
+ {
+ /* We initialized a new log block which was not written
+ full by the current mtr: the next mtr log record group
+ will start within this block at the offset data_len */
+ log_block_set_first_rec_group(log_block,
+ log_block_get_data_len(log_block));
+ }
+
+ if (log_sys.buf_free > log_sys.max_buf_free)
+ log_sys.set_check_flush_or_checkpoint();
+
+ const lsn_t checkpoint_age= lsn - log_sys.last_checkpoint_lsn;
+
+ if (UNIV_UNLIKELY(checkpoint_age >= log_sys.log_capacity) &&
+ /* silence message on create_log_file() after the log had been deleted */
+ checkpoint_age != lsn)
+ {
+ time_t t= time(nullptr);
+ if (!log_close_warned || difftime(t, log_close_warn_time) > 15)
+ {
+ log_close_warned= true;
+ log_close_warn_time= t;
+
+ ib::error() << "The age of the last checkpoint is " << checkpoint_age
+ << ", which exceeds the log capacity "
+ << log_sys.log_capacity << ".";
+ }
+ }
+ else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_modified_age_async))
+ return mtr_t::PAGE_FLUSH_NO;
+ else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_checkpoint_age))
+ return mtr_t::PAGE_FLUSH_ASYNC;
+
+ log_sys.set_check_flush_or_checkpoint();
+ return mtr_t::PAGE_FLUSH_SYNC;
+}
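+
+/* Summary of the branches above, with age = lsn - last_checkpoint_lsn:
+	age <= max_modified_age_async	-> PAGE_FLUSH_NO
+	age <= max_checkpoint_age	-> PAGE_FLUSH_ASYNC
+	otherwise			-> PAGE_FLUSH_SYNC, and the
+check-flush-or-checkpoint flag is set. */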
+
+/** Write the block contents to the REDO log */
+struct mtr_write_log
+{
+ /** Append a block to the redo log buffer.
+ @return whether the appending should continue */
+ bool operator()(const mtr_buf_t::block_t *block) const
+ {
+ log_write_low(block->begin(), block->used());
+ return true;
+ }
+};
+
+/** Prepare to write the mini-transaction log to the redo log buffer.
+@return number of bytes to write in finish_write() */
+inline ulint mtr_t::prepare_write()
+{
+ ut_ad(!recv_no_log_write);
+
+ if (UNIV_UNLIKELY(m_log_mode != MTR_LOG_ALL)) {
+ ut_ad(m_log_mode == MTR_LOG_NO_REDO);
+ ut_ad(m_log.size() == 0);
+ mysql_mutex_lock(&log_sys.mutex);
+ m_commit_lsn = log_sys.get_lsn();
+ return 0;
+ }
+
+ ulint len = m_log.size();
+ ut_ad(len > 0);
+
+ if (len > srv_log_buffer_size / 2) {
+ log_buffer_extend(ulong((len + 1) * 2));
+ }
+
+ fil_space_t* space = m_user_space;
+
+ if (space != NULL && is_predefined_tablespace(space->id)) {
+ /* Omit FILE_MODIFY for predefined tablespaces. */
+ space = NULL;
+ }
+
+ mysql_mutex_lock(&log_sys.mutex);
+
+ if (fil_names_write_if_was_clean(space)) {
+ len = m_log.size();
+ } else {
+ /* This was not the first time of dirtying a
+ tablespace since the latest checkpoint. */
+ ut_ad(len == m_log.size());
+ }
+
+ *m_log.push<byte*>(1) = 0;
+ len++;
+
+ /* check and attempt a checkpoint if exceeding capacity */
+ log_margin_checkpoint_age(len);
+
+ return(len);
+}
+
+/** Append the redo log records to the redo log buffer.
+@param len number of bytes to write
+@return {start_lsn,flush_ahead} */
+inline std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::finish_write(ulint len)
+{
+ ut_ad(m_log_mode == MTR_LOG_ALL);
+ mysql_mutex_assert_owner(&log_sys.mutex);
+ ut_ad(m_log.size() == len);
+ ut_ad(len > 0);
+
+ lsn_t start_lsn;
+
+ if (m_log.is_small()) {
+ const mtr_buf_t::block_t* front = m_log.front();
+ ut_ad(len <= front->used());
+
+ m_commit_lsn = log_reserve_and_write_fast(front->begin(), len,
+ &start_lsn);
+
+ if (!m_commit_lsn) {
+ goto piecewise;
+ }
+ } else {
+piecewise:
+ /* Open the database log for log_write_low */
+ start_lsn = log_reserve_and_open(len);
+ mtr_write_log write_log;
+ m_log.for_each_block(write_log);
+ m_commit_lsn = log_sys.get_lsn();
+ }
+ page_flush_ahead flush= log_close(m_commit_lsn);
+ DBUG_EXECUTE_IF("ib_log_flush_ahead", flush = PAGE_FLUSH_SYNC;);
+
+ return std::make_pair(start_lsn, flush);
+}
+
+/** Find out whether the mini-transaction holds an X-latch on a given block */
+struct FindBlockX
+{
+ const buf_block_t &block;
+
+ FindBlockX(const buf_block_t &block): block(block) {}
+
+ /** @return whether the block was not found x-latched */
+ bool operator()(const mtr_memo_slot_t *slot) const
+ {
+ return slot->object != &block || slot->type != MTR_MEMO_PAGE_X_FIX;
+ }
+};
+
+#ifdef UNIV_DEBUG
+/** Assert that the block is not present in the mini-transaction */
+struct FindNoBlock
+{
+ const buf_block_t &block;
+
+ FindNoBlock(const buf_block_t &block): block(block) {}
+
+ /** @return whether the block was not found */
+ bool operator()(const mtr_memo_slot_t *slot) const
+ {
+ return slot->object != &block;
+ }
+};
+#endif /* UNIV_DEBUG */
+
+bool mtr_t::have_x_latch(const buf_block_t &block) const
+{
+ if (m_memo.for_each_block(CIterate<FindBlockX>(FindBlockX(block))))
+ {
+ ut_ad(m_memo.for_each_block(CIterate<FindNoBlock>(FindNoBlock(block))));
+ ut_ad(!memo_contains_flagged(&block,
+ MTR_MEMO_PAGE_S_FIX | MTR_MEMO_PAGE_SX_FIX |
+ MTR_MEMO_BUF_FIX | MTR_MEMO_MODIFY));
+ return false;
+ }
+ ut_ad(rw_lock_own(&block.lock, RW_LOCK_X));
+ return true;
+}
+
+#ifdef UNIV_DEBUG
+/** Check if we are holding an rw-latch in this mini-transaction
+@param lock latch to search for
+@param type held latch type
+@return whether (lock,type) is contained */
+bool mtr_t::memo_contains(const rw_lock_t &lock, mtr_memo_type_t type)
+{
+ Iterate<Find> iteration(Find(&lock, type));
+ if (m_memo.for_each_block_in_reverse(iteration))
+ return false;
+
+ switch (type) {
+ case MTR_MEMO_X_LOCK:
+ ut_ad(rw_lock_own(&lock, RW_LOCK_X));
+ break;
+ case MTR_MEMO_SX_LOCK:
+ ut_ad(rw_lock_own(&lock, RW_LOCK_SX));
+ break;
+ case MTR_MEMO_S_LOCK:
+ ut_ad(rw_lock_own(&lock, RW_LOCK_S));
+ break;
+ default:
+ break;
+ }
+
+ return true;
+}
+
+/** Check if we are holding an exclusive tablespace latch
+@param space tablespace to search for
+@return whether space.latch is being held */
+bool mtr_t::memo_contains(const fil_space_t& space)
+{
+ Iterate<Find> iteration(Find(&space, MTR_MEMO_SPACE_X_LOCK));
+ if (m_memo.for_each_block_in_reverse(iteration))
+ return false;
+ ut_ad(rw_lock_own(const_cast<rw_lock_t*>(&space.latch), RW_LOCK_X));
+ return true;
+}
+
+/** Debug check for flags */
+struct FlaggedCheck {
+ FlaggedCheck(const void* ptr, ulint flags)
+ :
+ m_ptr(ptr),
+ m_flags(flags)
+ {
+ /* There must be some flags to look for. */
+ ut_ad(flags);
+ /* Look for rw-lock-related and page-related flags. */
+ ut_ad(!(flags & ulint(~(MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_SX_FIX
+ | MTR_MEMO_BUF_FIX
+ | MTR_MEMO_MODIFY
+ | MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK
+ | MTR_MEMO_S_LOCK))));
+ /* Either some rw-lock-related or page-related flags
+ must be specified, but not both at the same time. */
+ ut_ad(!(flags & (MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_SX_FIX
+ | MTR_MEMO_BUF_FIX
+ | MTR_MEMO_MODIFY))
+ == !!(flags & (MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK
+ | MTR_MEMO_S_LOCK)));
+ }
+
+ /** Visit a memo entry.
+ @param[in] slot memo entry to visit
+ @retval false if m_ptr was found
+ @retval true if the iteration should continue */
+ bool operator()(const mtr_memo_slot_t* slot) const
+ {
+ if (m_ptr != slot->object || !(m_flags & slot->type)) {
+ return(true);
+ }
+
+ if (ulint flags = m_flags & (MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_SX_FIX
+ | MTR_MEMO_PAGE_X_FIX)) {
+ rw_lock_t* lock = &static_cast<buf_block_t*>(
+ const_cast<void*>(m_ptr))->lock;
+ ut_ad(rw_lock_own_flagged(lock, flags));
+ } else {
+ rw_lock_t* lock = static_cast<rw_lock_t*>(
+ const_cast<void*>(m_ptr));
+ ut_ad(rw_lock_own_flagged(lock, m_flags >> 5));
+ }
+
+ return(false);
+ }
+
+ const void*const m_ptr;
+ const ulint m_flags;
+};
+
+/** Check if memo contains the given item.
+@param object object to search
+@param flags specify types of object (can be ORred) of
+ MTR_MEMO_PAGE_S_FIX ... values
+@return true if contains */
+bool
+mtr_t::memo_contains_flagged(const void* ptr, ulint flags) const
+{
+ ut_ad(is_active());
+
+ return !m_memo.for_each_block_in_reverse(
+ CIterate<FlaggedCheck>(FlaggedCheck(ptr, flags)));
+}
+
+/** Check if memo contains the given page.
+@param[in] ptr pointer to within buffer frame
+@param[in] flags specify types of object with OR of
+ MTR_MEMO_PAGE_S_FIX... values
+@return the block
+@retval NULL if not found */
+buf_block_t*
+mtr_t::memo_contains_page_flagged(
+ const byte* ptr,
+ ulint flags) const
+{
+ Iterate<FindPage> iteration(FindPage(ptr, flags));
+ return m_memo.for_each_block_in_reverse(iteration)
+ ? NULL : iteration.functor.get_block();
+}
+
+/** Print info of an mtr handle. */
+void
+mtr_t::print() const
+{
+ ib::info() << "Mini-transaction handle: memo size "
+ << m_memo.size() << " bytes log size "
+ << get_log()->size() << " bytes";
+}
+
+#endif /* UNIV_DEBUG */
+
+
+/** Find a block, preferably in MTR_MEMO_MODIFY state */
+struct FindModified
+{
+ mtr_memo_slot_t *found= nullptr;
+ const buf_block_t& block;
+
+ FindModified(const buf_block_t &block) : block(block) {}
+ bool operator()(mtr_memo_slot_t *slot)
+ {
+ if (slot->object != &block)
+ return true;
+ found= slot;
+ return !(slot->type & (MTR_MEMO_MODIFY |
+ MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));
+ }
+};
+
+/** Mark the given latched page as modified.
+@param block page that will be modified */
+void mtr_t::modify(const buf_block_t &block)
+{
+ if (UNIV_UNLIKELY(m_memo.empty()))
+ {
+ /* This must be PageConverter::update_page() in IMPORT TABLESPACE. */
+ ut_ad(!block.page.in_LRU_list);
+ ut_ad(!buf_pool.is_uncompressed(&block));
+ return;
+ }
+
+ Iterate<FindModified> iteration((FindModified(block)));
+ if (UNIV_UNLIKELY(m_memo.for_each_block(iteration)))
+ {
+ ut_ad("modifying an unlatched page" == 0);
+ return;
+ }
+ iteration.functor.found->type= static_cast<mtr_memo_type_t>
+ (iteration.functor.found->type | MTR_MEMO_MODIFY);
+}
diff --git a/storage/innobase/mysql-test/storage_engine/alter_tablespace.opt b/storage/innobase/mysql-test/storage_engine/alter_tablespace.opt
new file mode 100644
index 00000000..cf4b117e
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/alter_tablespace.opt
@@ -0,0 +1,2 @@
+--innodb-file-per-table=1
+
diff --git a/storage/innobase/mysql-test/storage_engine/autoinc_secondary.rdiff b/storage/innobase/mysql-test/storage_engine/autoinc_secondary.rdiff
new file mode 100644
index 00000000..00cda7c4
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/autoinc_secondary.rdiff
@@ -0,0 +1,30 @@
+--- suite/storage_engine/autoinc_secondary.result 2012-07-12 04:34:18.153885986 +0400
++++ suite/storage_engine/autoinc_secondary.reject 2012-07-15 17:47:03.937703666 +0400
+@@ -13,18 +13,15 @@
+ 5 a
+ DROP TABLE t1;
+ CREATE TABLE t1 (a <CHAR_COLUMN>, b <INT_COLUMN> AUTO_INCREMENT, PRIMARY KEY (a,b)) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+-INSERT INTO t1 (a) VALUES ('a'),('b'),('b'),('c'),('a');
+-SELECT LAST_INSERT_ID();
+-LAST_INSERT_ID()
+-1
+-SELECT a,b FROM t1;
+-a b
+-a 1
+-a 2
+-b 1
+-b 2
+-c 1
+-DROP TABLE t1;
++ERROR 42000: Incorrect table definition; there can be only one auto column and it must be defined as a key
++# ERROR: Statement ended with errno 1075, errname ER_WRONG_AUTO_KEY (expected to succeed)
++# ------------ UNEXPECTED RESULT ------------
++# The statement|command finished with ER_WRONG_AUTO_KEY.
++# Multi-part keys or PK or AUTO_INCREMENT (on a secondary column) or the mix could be unsupported|malfunctioning, or the problem was caused by previous errors.
++# You can change the engine code, or create an rdiff, or disable the test by adding it to disabled.def.
++# Further in this test, the message might sometimes be suppressed; a part of the test might be skipped.
++# Also, this problem may cause a chain effect (more errors of different kinds in the test).
++# -------------------------------------------
+ CREATE TABLE t1 (a <CHAR_COLUMN>, b <INT_COLUMN> AUTO_INCREMENT, PRIMARY KEY (a,b), <CUSTOM_INDEX>(b)) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ INSERT INTO t1 (a) VALUES ('a'),('b'),('b'),('c'),('a');
+ SELECT LAST_INSERT_ID();
diff --git a/storage/innobase/mysql-test/storage_engine/cache_index.rdiff b/storage/innobase/mysql-test/storage_engine/cache_index.rdiff
new file mode 100644
index 00000000..e04df87a
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/cache_index.rdiff
@@ -0,0 +1,71 @@
+--- suite/storage_engine/cache_index.result 2012-07-15 00:22:19.822493731 +0400
++++ suite/storage_engine/cache_index.reject 2012-07-15 17:47:18.321522834 +0400
+@@ -12,31 +12,31 @@
+ SET GLOBAL <CACHE_NAME>.key_buffer_size=128*1024;
+ CACHE INDEX t1 INDEX (a), t2 IN <CACHE_NAME>;
+ Table Op Msg_type Msg_text
+-test.t1 assign_to_keycache status OK
+-test.t2 assign_to_keycache status OK
++test.t1 assign_to_keycache note The storage engine for the table doesn't support assign_to_keycache
++test.t2 assign_to_keycache note The storage engine for the table doesn't support assign_to_keycache
+ LOAD INDEX INTO CACHE t1, t2;
+ Table Op Msg_type Msg_text
+-test.t1 preload_keys status OK
+-test.t2 preload_keys status OK
++test.t1 preload_keys note The storage engine for the table doesn't support preload_keys
++test.t2 preload_keys note The storage engine for the table doesn't support preload_keys
+ INSERT INTO t1 (a,b) VALUES (3,'c'),(4,'d');
+ SET GLOBAL <CACHE_NAME>.key_buffer_size=8*1024;
+ LOAD INDEX INTO CACHE t1, t2 IGNORE LEAVES;
+ Table Op Msg_type Msg_text
+-test.t1 preload_keys status OK
+-test.t2 preload_keys status OK
++test.t1 preload_keys note The storage engine for the table doesn't support preload_keys
++test.t2 preload_keys note The storage engine for the table doesn't support preload_keys
+ SET GLOBAL <CACHE_NAME>.key_cache_age_threshold = 100, <CACHE_NAME>.key_cache_block_size = 512, <CACHE_NAME>.key_cache_division_limit = 1, <CACHE_NAME>.key_cache_segments=2;
+ INSERT INTO t1 (a,b) VALUES (5,'e'),(6,'f');
+ LOAD INDEX INTO CACHE t1;
+ Table Op Msg_type Msg_text
+-test.t1 preload_keys status OK
++test.t1 preload_keys note The storage engine for the table doesn't support preload_keys
+ SET GLOBAL new_<CACHE_NAME>.key_buffer_size=128*1024;
+ CACHE INDEX t1 IN new_<CACHE_NAME>;
+ Table Op Msg_type Msg_text
+-test.t1 assign_to_keycache status OK
++test.t1 assign_to_keycache note The storage engine for the table doesn't support assign_to_keycache
+ INSERT INTO t1 (a,b) VALUES (7,'g'),(8,'h');
+ LOAD INDEX INTO CACHE t1 IGNORE LEAVES;
+ Table Op Msg_type Msg_text
+-test.t1 preload_keys status OK
++test.t1 preload_keys note The storage engine for the table doesn't support preload_keys
+ INSERT INTO t1 (a,b) VALUES (9,'i');
+ DROP TABLE t2;
+ DROP TABLE t1;
+@@ -47,11 +47,11 @@
+ ) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ CACHE INDEX t1 IN <CACHE_NAME>;
+ Table Op Msg_type Msg_text
+-test.t1 assign_to_keycache status OK
++test.t1 assign_to_keycache note The storage engine for the table doesn't support assign_to_keycache
+ INSERT INTO t1 (a,b) VALUES (1,'a'),(2,'b');
+ LOAD INDEX INTO CACHE t1;
+ Table Op Msg_type Msg_text
+-test.t1 preload_keys status OK
++test.t1 preload_keys note The storage engine for the table doesn't support preload_keys
+ DROP TABLE t1;
+ CREATE TABLE t1 (a <INT_COLUMN>,
+ b <CHAR_COLUMN>,
+@@ -59,11 +59,11 @@
+ ) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ CACHE INDEX t1 IN <CACHE_NAME>;
+ Table Op Msg_type Msg_text
+-test.t1 assign_to_keycache status OK
++test.t1 assign_to_keycache note The storage engine for the table doesn't support assign_to_keycache
+ INSERT INTO t1 (a,b) VALUES (1,'a'),(2,'b');
+ LOAD INDEX INTO CACHE t1;
+ Table Op Msg_type Msg_text
+-test.t1 preload_keys status OK
++test.t1 preload_keys note The storage engine for the table doesn't support preload_keys
+ DROP TABLE t1;
+ SET GLOBAL <CACHE_NAME>.key_buffer_size=0;
+ SET GLOBAL new_<CACHE_NAME>.key_buffer_size=0;
diff --git a/storage/innobase/mysql-test/storage_engine/checksum_table_live.rdiff b/storage/innobase/mysql-test/storage_engine/checksum_table_live.rdiff
new file mode 100644
index 00000000..71c78284
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/checksum_table_live.rdiff
@@ -0,0 +1,13 @@
+--- suite/storage_engine/checksum_table_live.result 2012-07-12 21:05:44.497062968 +0400
++++ suite/storage_engine/checksum_table_live.reject 2012-07-15 17:47:28.105399836 +0400
+@@ -11,8 +11,8 @@
+ test.t1 4272806499
+ CHECKSUM TABLE t1, t2 QUICK;
+ Table Checksum
+-test.t1 4272806499
+-test.t2 0
++test.t1 NULL
++test.t2 NULL
+ CHECKSUM TABLE t1, t2 EXTENDED;
+ Table Checksum
+ test.t1 4272806499
diff --git a/storage/innobase/mysql-test/storage_engine/col_opt_not_null.opt b/storage/innobase/mysql-test/storage_engine/col_opt_not_null.opt
new file mode 100644
index 00000000..a007f405
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/col_opt_not_null.opt
@@ -0,0 +1 @@
+--innodb_log_file_size=200M
diff --git a/storage/innobase/mysql-test/storage_engine/col_opt_null.opt b/storage/innobase/mysql-test/storage_engine/col_opt_null.opt
new file mode 100644
index 00000000..a007f405
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/col_opt_null.opt
@@ -0,0 +1 @@
+--innodb_log_file_size=200M
diff --git a/storage/innobase/mysql-test/storage_engine/define_engine.inc b/storage/innobase/mysql-test/storage_engine/define_engine.inc
new file mode 100644
index 00000000..7d7b0c74
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/define_engine.inc
@@ -0,0 +1,45 @@
+###########################################
+#
+# This is a template of the include file define_engine.inc which
+# should be placed in storage/<engine>/mysql-test/storage_engine folder.
+#
+################################
+#
+# The name of the engine under test must be defined in $ENGINE variable.
+# You can set it either here (uncomment and edit) or in your environment.
+#
+let $ENGINE = InnoDB;
+#
+################################
+#
+# The following three variables define specific options for columns and tables.
+# Normally none are needed, but some engines may require them.
+# If the engine requires specific column option for all or indexed columns,
+# set them inside the comment, e.g. /*!NOT NULL*/.
+# Do the same for table options if needed, e.g. /*!INSERT_METHOD=LAST*/
+
+let $default_col_opts = /*!*/;
+let $default_col_indexed_opts = /*!*/;
+let $default_tbl_opts = /*!*/;
+
+# INDEX, UNIQUE INDEX, PRIMARY KEY, special index type - choose the first that the engine allows,
+# or set it to /*!*/ if none is supported
+
+let $default_index = /*!INDEX*/;
+
+# If the engine does not support the following types, replace them with the closest possible
+
+let $default_int_type = INT(11);
+let $default_char_type = CHAR(8);
+
+################################
+
+--disable_query_log
+--disable_result_log
+
+# Here you can place your custom MTR code which needs to be executed before each test,
+# e.g. creation of an additional schema or table, etc.
+# The cleanup part should be defined in cleanup_engine.inc
+
+--enable_query_log
+--enable_result_log
diff --git a/storage/innobase/mysql-test/storage_engine/disabled.def b/storage/innobase/mysql-test/storage_engine/disabled.def
new file mode 100644
index 00000000..1d67f931
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/disabled.def
@@ -0,0 +1,9 @@
+tbl_opt_ai : MySQL:65901 (AUTO_INCREMENT option on InnoDB table is ignored if added before autoinc column)
+delete_low_prio : InnoDB does not use table-level locking
+insert_high_prio : InnoDB does not use table-level locking
+insert_low_prio : InnoDB does not use table-level locking
+select_high_prio : InnoDB does not use table-level locking
+update_low_prio : InnoDB does not use table-level locking
+insert_delayed : MDEV-12880 - INSERT DELAYED is not detected as inapplicable to a table under lock
+lock_concurrent : MDEV-12882 - Assertion failure
+tbl_opt_index_dir : INDEX DIRECTORY option is not supported anymore
diff --git a/storage/innobase/mysql-test/storage_engine/fulltext_search.rdiff b/storage/innobase/mysql-test/storage_engine/fulltext_search.rdiff
new file mode 100644
index 00000000..a68fe830
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/fulltext_search.rdiff
@@ -0,0 +1,49 @@
+--- suite/storage_engine/fulltext_search.result 2013-11-27 18:50:16.000000000 +0400
++++ suite/storage_engine/fulltext_search.reject 2014-02-05 15:33:26.000000000 +0400
+@@ -52,15 +52,14 @@
+ INSERT INTO t1 (v0,v1,v2) VALUES ('text4','Contributing more...','...is a good idea'),('text5','test','test');
+ SELECT v0, MATCH(v1) AGAINST('contributing') AS rating FROM t1 WHERE MATCH(v1) AGAINST ('contributing');
+ v0 rating
+-text4 1.3705332279205322
++text4 0.4885590672492981
+ SELECT v0 FROM t1 WHERE MATCH(v1,v2) AGAINST ('-test1 +critical +Cook*' IN BOOLEAN MODE);
+-v0
+-text1
++ERROR HY000: Can't find FULLTEXT index matching the column list
+ SELECT v0 FROM t1 WHERE MATCH(v1,v2) AGAINST ('-patch +critical +Cook*' IN BOOLEAN MODE);
+-v0
++ERROR HY000: Can't find FULLTEXT index matching the column list
+ SELECT v0, MATCH(v1) AGAINST('database' WITH QUERY EXPANSION) AS rating FROM t1 WHERE MATCH(v1) AGAINST ('database' WITH QUERY EXPANSION);
+ v0 rating
+-text1 178.11756896972656
++text1 151.4530487060547
+ DROP TABLE t1;
+ CREATE TABLE t1 (v0 VARCHAR(64) <CUSTOM_COL_OPTIONS>,
+ v1 VARCHAR(16384) <CUSTOM_COL_OPTIONS>,
+@@ -112,14 +111,15 @@
+ ), ('text2','test1','test2');
+ SELECT v0 FROM t1 WHERE MATCH(v1,v2) AGAINST ('contributing' IN NATURAL LANGUAGE MODE);
+ v0
++text1
+ INSERT INTO t1 (v0,v1,v2) VALUES ('text3','test','test');
+ SELECT v0, MATCH(v1,v2) AGAINST('contributing' IN NATURAL LANGUAGE MODE) AS rating FROM t1 WHERE MATCH(v1,v2) AGAINST ('contributing' IN NATURAL LANGUAGE MODE);
+ v0 rating
+-text1 0.2809644043445587
++text1 0.45528939366340637
+ INSERT INTO t1 (v0,v1,v2) VALUES ('text4','Contributing more...','...is a good idea'),('text5','test','test');
+ SELECT v0, MATCH(v1) AGAINST('contributing') AS rating FROM t1 WHERE MATCH(v1) AGAINST ('contributing');
+ v0 rating
+-text4 1.3705332279205322
++text4 0.4885590672492981
+ SELECT v0 FROM t1 WHERE MATCH(v1,v2) AGAINST ('-test1 +critical +Cook*' IN BOOLEAN MODE);
+ v0
+ text1
+@@ -127,6 +127,6 @@
+ v0
+ SELECT v0, MATCH(v1,v2) AGAINST('database' WITH QUERY EXPANSION) AS rating FROM t1 WHERE MATCH(v1,v2) AGAINST ('database' WITH QUERY EXPANSION);
+ v0 rating
+-text1 190.56150817871094
+-text4 1.1758291721343994
++text1 229.60874938964844
++text4 0.31671249866485596
+ DROP TABLE t1;
diff --git a/storage/innobase/mysql-test/storage_engine/index_enable_disable.rdiff b/storage/innobase/mysql-test/storage_engine/index_enable_disable.rdiff
new file mode 100644
index 00000000..f8e812e7
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/index_enable_disable.rdiff
@@ -0,0 +1,33 @@
+--- suite/storage_engine/index_enable_disable.result 2012-07-15 00:30:05.296641931 +0400
++++ suite/storage_engine/index_enable_disable.reject 2012-07-15 17:49:12.988081281 +0400
+@@ -11,15 +11,19 @@
+ Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment
+ t1 1 a 1 a # # NULL NULL YES BTREE
+ ALTER TABLE t1 DISABLE KEYS;
++Warnings:
++Note 1031 Storage engine <STORAGE_ENGINE> of the table `test`.`t1` doesn't have this option
+ SHOW INDEX IN t1;
+ Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment
+-t1 1 a 1 a # # NULL NULL YES BTREE disabled
++t1 1 a 1 a # # NULL NULL YES BTREE
+ EXPLAIN SELECT a FROM t1 ORDER BY a;
+ id select_type table type possible_keys key key_len ref rows Extra
+-1 SIMPLE t1 ALL NULL NULL NULL NULL 19 Using filesort
++1 SIMPLE t1 index NULL a 5 NULL 19 Using index
+ INSERT INTO t1 (a) VALUES
+ (11),(12),(13),(14),(15),(16),(17),(18),(19),(20);
+ ALTER TABLE t1 ENABLE KEYS;
++Warnings:
++Note 1031 Storage engine <STORAGE_ENGINE> of the table `test`.`t1` doesn't have this option
+ SHOW INDEX IN t1;
+ Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment
+ t1 1 a 1 a # # NULL NULL YES BTREE
+@@ -32,6 +36,8 @@
+ (1),(2),(3),(4),(5),(6),(7),(8),(9),
+ (21),(22),(23),(24),(25),(26),(27),(28),(29);
+ ALTER TABLE t1 DISABLE KEYS;
++Warnings:
++Note 1031 Storage engine <STORAGE_ENGINE> of the table `test`.`t1` doesn't have this option
+ INSERT INTO t1 (a) VALUES (29);
+ ERROR 23000: Duplicate entry '29' for key 'a'
+ # Statement ended with one of expected results (ER_DUP_ENTRY,ER_DUP_KEY).
diff --git a/storage/innobase/mysql-test/storage_engine/index_type_hash.rdiff b/storage/innobase/mysql-test/storage_engine/index_type_hash.rdiff
new file mode 100644
index 00000000..02f9d935
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/index_type_hash.rdiff
@@ -0,0 +1,60 @@
+--- suite/storage_engine/index_type_hash.result 2012-07-15 01:10:17.919128889 +0400
++++ suite/storage_engine/index_type_hash.reject 2012-07-15 17:49:26.135915989 +0400
+@@ -4,7 +4,7 @@
+ ) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ SHOW KEYS IN t1;
+ Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment
+-t1 1 a 1 a # # NULL NULL # HASH
++t1 1 a 1 a # # NULL NULL # BTREE
+ DROP TABLE t1;
+ CREATE TABLE t1 (a <INT_COLUMN>,
+ b <CHAR_COLUMN>,
+@@ -12,8 +12,8 @@
+ ) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ SHOW KEYS IN t1;
+ Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment
+-t1 1 a_b 1 a # # NULL NULL # HASH a_b index
+-t1 1 a_b 2 b # # NULL NULL # HASH a_b index
++t1 1 a_b 1 a # # NULL NULL # BTREE a_b index
++t1 1 a_b 2 b # # NULL NULL # BTREE a_b index
+ DROP TABLE t1;
+ CREATE TABLE t1 (a <INT_COLUMN>,
+ b <CHAR_COLUMN>,
+@@ -22,8 +22,8 @@
+ ) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ SHOW KEYS IN t1;
+ Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment
+-t1 1 a 1 a # # NULL NULL # HASH
+-t1 1 b 1 b # # NULL NULL # HASH
++t1 1 a 1 a # # NULL NULL # BTREE
++t1 1 b 1 b # # NULL NULL # BTREE
+ DROP TABLE t1;
+ CREATE TABLE t1 (a <INT_COLUMN>,
+ b <CHAR_COLUMN>,
+@@ -31,7 +31,7 @@
+ ) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ SHOW KEYS IN t1;
+ Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment
+-t1 0 a 1 a # # NULL NULL # HASH
++t1 0 a 1 a # # NULL NULL # BTREE
+ INSERT INTO t1 (a,b) VALUES (1,'a'),(2,'b');
+ INSERT INTO t1 (a,b) VALUES (1,'c');
+ ERROR 23000: Duplicate entry '1' for key 'a'
+@@ -43,7 +43,7 @@
+ ALTER TABLE t1 ADD <CUSTOM_INDEX> (a) USING HASH COMMENT 'simple index on a';
+ SHOW INDEX FROM t1;
+ Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment
+-t1 1 a 1 a # # NULL NULL # HASH simple index on a
++t1 1 a 1 a # # NULL NULL # BTREE simple index on a
+ ALTER TABLE t1 DROP KEY a;
+ DROP TABLE t1;
+ CREATE TABLE t1 (a <INT_COLUMN>,
+@@ -52,7 +52,7 @@
+ ) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ SHOW KEYS IN t1;
+ Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment
+-t1 0 a 1 a # # NULL NULL # HASH
++t1 0 a 1 a # # NULL NULL # BTREE
+ INSERT INTO t1 (a,b) VALUES (1,'a'),(2,'b');
+ INSERT INTO t1 (a,b) VALUES (1,'c');
+ ERROR 23000: Duplicate entry '1' for key 'a'
diff --git a/storage/innobase/mysql-test/storage_engine/insert_delayed.rdiff b/storage/innobase/mysql-test/storage_engine/insert_delayed.rdiff
new file mode 100644
index 00000000..9e6cddf0
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/insert_delayed.rdiff
@@ -0,0 +1,26 @@
+--- suite/storage_engine/insert_delayed.result 2013-01-23 01:23:49.461254916 +0400
++++ suite/storage_engine/insert_delayed.reject 2013-01-23 01:47:05.975698364 +0400
+@@ -5,7 +5,16 @@
+ connect con0,localhost,root,,;
+ SET lock_wait_timeout = 1;
+ INSERT DELAYED INTO t1 (a,b) VALUES (3,'c');
++ERROR HY000: DELAYED option not supported for table 't1'
++# ------------ UNEXPECTED RESULT ------------
++# The statement|command finished with ER_DELAYED_NOT_SUPPORTED.
++# INSERT DELAYED or the mix could be unsupported|malfunctioning, or the problem was caused by previous errors.
++# You can change the engine code, or create an rdiff, or disable the test by adding it to disabled.def.
++# Further in this test, the message might sometimes be suppressed; a part of the test might be skipped.
++# Also, this problem may cause a chain effect (more errors of different kinds in the test).
++# -------------------------------------------
+ INSERT DELAYED INTO t1 SET a=4, b='d';
++ERROR HY000: DELAYED option not supported for table 't1'
+ INSERT DELAYED INTO t1 (a,b) SELECT 5, 'e';
+ ERROR HY000: Lock wait timeout exceeded; try restarting transaction
+ disconnect con0;
+@@ -20,6 +29,4 @@
+ a b
+ 1 f
+ 2 b
+-3 c
+-4 d
+ DROP TABLE t1;
diff --git a/storage/innobase/mysql-test/storage_engine/lock_concurrent.rdiff b/storage/innobase/mysql-test/storage_engine/lock_concurrent.rdiff
new file mode 100644
index 00000000..c76a5fe7
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/lock_concurrent.rdiff
@@ -0,0 +1,25 @@
+--- suite/storage_engine/lock_concurrent.result 2012-06-24 23:55:19.539380000 +0400
++++ suite/storage_engine/lock_concurrent.reject 2012-07-15 17:50:21.279222746 +0400
+@@ -4,6 +4,14 @@
+ connect con1,localhost,root,,;
+ SET lock_wait_timeout = 1;
+ LOCK TABLES t1 READ LOCAL;
++ERROR HY000: Lock wait timeout exceeded; try restarting transaction
++# ------------ UNEXPECTED RESULT ------------
++# The statement|command finished with ER_LOCK_WAIT_TIMEOUT.
++# LOCK .. WRITE CONCURRENT or the mix could be unsupported|malfunctioning, or the problem was caused by previous errors.
++# You can change the engine code, or create an rdiff, or disable the test by adding it to disabled.def.
++# Further in this test, the message might sometimes be suppressed; a part of the test might be skipped.
++# Also, this problem may cause a chain effect (more errors of different kinds in the test).
++# -------------------------------------------
+ UNLOCK TABLES;
+ connection default;
+ UNLOCK TABLES;
+@@ -11,6 +19,7 @@
+ LOCK TABLES t1 READ LOCAL;
+ connection default;
+ LOCK TABLES t1 WRITE CONCURRENT, t1 AS t2 READ;
++ERROR HY000: Lock wait timeout exceeded; try restarting transaction
+ UNLOCK TABLES;
+ UNLOCK TABLES;
+ DROP TABLE t1;
diff --git a/storage/innobase/mysql-test/storage_engine/optimize_table.rdiff b/storage/innobase/mysql-test/storage_engine/optimize_table.rdiff
new file mode 100644
index 00000000..54d1f600
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/optimize_table.rdiff
@@ -0,0 +1,37 @@
+--- suite/storage_engine/optimize_table.result 2012-07-12 19:13:53.741428591 +0400
++++ suite/storage_engine/optimize_table.reject 2012-07-15 17:50:30.843102510 +0400
+@@ -5,25 +5,32 @@
+ INSERT INTO t1 (a,b) VALUES (3,'c'),(4,'d');
+ OPTIMIZE TABLE t1;
+ Table Op Msg_type Msg_text
++test.t1 optimize note Table does not support optimize, doing recreate + analyze instead
+ test.t1 optimize status OK
+ INSERT INTO t2 (a,b) VALUES (4,'d');
+ OPTIMIZE NO_WRITE_TO_BINLOG TABLE t2;
+ Table Op Msg_type Msg_text
++test.t2 optimize note Table does not support optimize, doing recreate + analyze instead
+ test.t2 optimize status OK
+ INSERT INTO t2 (a,b) VALUES (5,'e');
+ INSERT INTO t1 (a,b) VALUES (6,'f');
+ OPTIMIZE LOCAL TABLE t1, t2;
+ Table Op Msg_type Msg_text
++test.t1 optimize note Table does not support optimize, doing recreate + analyze instead
+ test.t1 optimize status OK
++test.t2 optimize note Table does not support optimize, doing recreate + analyze instead
+ test.t2 optimize status OK
+ OPTIMIZE TABLE t1, t2;
+ Table Op Msg_type Msg_text
+-test.t1 optimize status Table is already up to date
+-test.t2 optimize status Table is already up to date
++test.t1 optimize note Table does not support optimize, doing recreate + analyze instead
++test.t1 optimize status OK
++test.t2 optimize note Table does not support optimize, doing recreate + analyze instead
++test.t2 optimize status OK
+ DROP TABLE t1, t2;
+ CREATE TABLE t1 (a <INT_COLUMN>, b <CHAR_COLUMN>, <CUSTOM_INDEX> (a)) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ INSERT INTO t1 (a,b) VALUES (1,'a'),(100,'b'),(2,'c'),(3,'d');
+ OPTIMIZE TABLE t1;
+ Table Op Msg_type Msg_text
++test.t1 optimize note Table does not support optimize, doing recreate + analyze instead
+ test.t1 optimize status OK
+ DROP TABLE t1;
diff --git a/storage/innobase/mysql-test/storage_engine/parts/checksum_table.rdiff b/storage/innobase/mysql-test/storage_engine/parts/checksum_table.rdiff
new file mode 100644
index 00000000..c8aabb78
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/parts/checksum_table.rdiff
@@ -0,0 +1,13 @@
+--- suite/storage_engine/parts/checksum_table.result 2013-11-08 22:30:34.000000000 +0400
++++ suite/storage_engine/parts/checksum_table.reject 2013-11-08 22:32:30.000000000 +0400
+@@ -31,8 +31,8 @@
+ test.t1 4272806499
+ CHECKSUM TABLE t1, t2 QUICK;
+ Table Checksum
+-test.t1 4272806499
+-test.t2 0
++test.t1 NULL
++test.t2 NULL
+ CHECKSUM TABLE t1, t2 EXTENDED;
+ Table Checksum
+ test.t1 4272806499
diff --git a/storage/innobase/mysql-test/storage_engine/parts/create_table.rdiff b/storage/innobase/mysql-test/storage_engine/parts/create_table.rdiff
new file mode 100644
index 00000000..0df91c6f
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/parts/create_table.rdiff
@@ -0,0 +1,20 @@
+--- suite/storage_engine/parts/create_table.result 2012-07-12 21:56:38.618667460 +0400
++++ suite/storage_engine/parts/create_table.reject 2012-07-15 20:06:43.496358345 +0400
+@@ -65,7 +65,7 @@
+ 1 SIMPLE t1 abc,def # # # # # # #
+ EXPLAIN PARTITIONS SELECT a FROM t1 WHERE a = 100;
+ id select_type table partitions type possible_keys key key_len ref rows Extra
+-1 SIMPLE NULL NULL # # # # # # #
++1 SIMPLE t1 def # # # # # # #
+ INSERT INTO t1 (a) VALUES (50);
+ ERROR HY000: Table has no partition for value 50
+ DROP TABLE t1;
+@@ -81,7 +81,7 @@
+ 1 SIMPLE t1 abc_abcsp0,def_defsp0 # # # # # # #
+ EXPLAIN PARTITIONS SELECT a FROM t1 WHERE a = 100;
+ id select_type table partitions type possible_keys key key_len ref rows Extra
+-1 SIMPLE NULL NULL # # # # # # #
++1 SIMPLE t1 def_defsp0 # # # # # # #
+ SELECT TABLE_SCHEMA, TABLE_NAME, PARTITION_NAME, SUBPARTITION_NAME, PARTITION_METHOD, SUBPARTITION_METHOD
+ FROM INFORMATION_SCHEMA.PARTITIONS WHERE TABLE_NAME = 't1';
+ TABLE_SCHEMA TABLE_NAME PARTITION_NAME SUBPARTITION_NAME PARTITION_METHOD SUBPARTITION_METHOD
diff --git a/storage/innobase/mysql-test/storage_engine/parts/disabled.def b/storage/innobase/mysql-test/storage_engine/parts/disabled.def
new file mode 100644
index 00000000..796bdfc7
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/parts/disabled.def
@@ -0,0 +1 @@
+repair_table : InnoDB in 5.6.10 does not support repair on partitioned tables (fixed in 5.6.14)
diff --git a/storage/innobase/mysql-test/storage_engine/parts/optimize_table.rdiff b/storage/innobase/mysql-test/storage_engine/parts/optimize_table.rdiff
new file mode 100644
index 00000000..a35ba516
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/parts/optimize_table.rdiff
@@ -0,0 +1,58 @@
+--- suite/storage_engine/parts/optimize_table.result 2013-07-18 22:55:38.000000000 +0400
++++ suite/storage_engine/parts/optimize_table.reject 2013-08-05 19:45:19.000000000 +0400
+@@ -9,18 +9,22 @@
+ INSERT INTO t1 (a,b) VALUES (3,'c'),(4,'d');
+ ALTER TABLE t1 OPTIMIZE PARTITION p1;
+ Table Op Msg_type Msg_text
++test.t1 optimize note Table does not support optimize on partitions. All partitions will be rebuilt and analyzed.
+ test.t1 optimize status OK
+ INSERT INTO t2 (a,b) VALUES (4,'d');
+ ALTER TABLE t2 OPTIMIZE PARTITION p0 NO_WRITE_TO_BINLOG;
+ Table Op Msg_type Msg_text
++test.t2 optimize note Table does not support optimize on partitions. All partitions will be rebuilt and analyzed.
+ test.t2 optimize status OK
+ INSERT INTO t1 (a,b) VALUES (6,'f');
+ ALTER TABLE t1 OPTIMIZE PARTITION ALL LOCAL;
+ Table Op Msg_type Msg_text
++test.t1 optimize note Table does not support optimize on partitions. All partitions will be rebuilt and analyzed.
+ test.t1 optimize status OK
+ INSERT INTO t2 (a,b) VALUES (5,'e');
+ ALTER TABLE t2 OPTIMIZE PARTITION p1,p0;
+ Table Op Msg_type Msg_text
++test.t2 optimize note Table does not support optimize on partitions. All partitions will be rebuilt and analyzed.
+ test.t2 optimize status OK
+ DROP TABLE t1, t2;
+ DROP TABLE IF EXISTS t1,t2;
+@@ -30,25 +34,32 @@
+ INSERT INTO t1 (a,b) VALUES (3,'c'),(4,'d');
+ OPTIMIZE TABLE t1;
+ Table Op Msg_type Msg_text
++test.t1 optimize note Table does not support optimize, doing recreate + analyze instead
+ test.t1 optimize status OK
+ INSERT INTO t2 (a,b) VALUES (4,'d');
+ OPTIMIZE NO_WRITE_TO_BINLOG TABLE t2;
+ Table Op Msg_type Msg_text
++test.t2 optimize note Table does not support optimize, doing recreate + analyze instead
+ test.t2 optimize status OK
+ INSERT INTO t2 (a,b) VALUES (5,'e');
+ INSERT INTO t1 (a,b) VALUES (6,'f');
+ OPTIMIZE LOCAL TABLE t1, t2;
+ Table Op Msg_type Msg_text
++test.t1 optimize note Table does not support optimize, doing recreate + analyze instead
+ test.t1 optimize status OK
++test.t2 optimize note Table does not support optimize, doing recreate + analyze instead
+ test.t2 optimize status OK
+ OPTIMIZE TABLE t1, t2;
+ Table Op Msg_type Msg_text
++test.t1 optimize note Table does not support optimize, doing recreate + analyze instead
+ test.t1 optimize status OK
++test.t2 optimize note Table does not support optimize, doing recreate + analyze instead
+ test.t2 optimize status OK
+ DROP TABLE t1, t2;
+ CREATE TABLE t1 (a <INT_COLUMN>, b <CHAR_COLUMN>, <CUSTOM_INDEX> (a)) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS> PARTITION BY HASH(a) PARTITIONS 2;
+ INSERT INTO t1 (a,b) VALUES (1,'a'),(100,'b'),(2,'c'),(3,'d');
+ OPTIMIZE TABLE t1;
+ Table Op Msg_type Msg_text
++test.t1 optimize note Table does not support optimize, doing recreate + analyze instead
+ test.t1 optimize status OK
+ DROP TABLE t1;
diff --git a/storage/innobase/mysql-test/storage_engine/parts/repair_table.rdiff b/storage/innobase/mysql-test/storage_engine/parts/repair_table.rdiff
new file mode 100644
index 00000000..35b150e8
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/parts/repair_table.rdiff
@@ -0,0 +1,158 @@
+--- suite/storage_engine/parts/repair_table.result 2013-07-18 22:55:38.000000000 +0400
++++ suite/storage_engine/parts/repair_table.reject 2013-08-05 19:54:09.000000000 +0400
+@@ -25,7 +25,7 @@
+ INSERT INTO t1 (a,b) VALUES (10,'j');
+ ALTER TABLE t1 REPAIR PARTITION p1 QUICK USE_FRM;
+ Table Op Msg_type Msg_text
+-test.t1 repair status OK
++test.t1 repair note The storage engine for the table doesn't support repair
+ INSERT INTO t2 (a,b) VALUES (12,'l');
+ ALTER TABLE t2 REPAIR PARTITION NO_WRITE_TO_BINLOG ALL QUICK EXTENDED USE_FRM;
+ Table Op Msg_type Msg_text
+@@ -58,8 +58,8 @@
+ INSERT INTO t2 (a,b) VALUES (11,'k');
+ REPAIR TABLE t1, t2 QUICK USE_FRM;
+ Table Op Msg_type Msg_text
+-test.t1 repair status OK
+-test.t2 repair status OK
++test.t1 repair note The storage engine for the table doesn't support repair
++test.t2 repair note The storage engine for the table doesn't support repair
+ INSERT INTO t1 (a,b) VALUES (12,'l');
+ INSERT INTO t2 (a,b) VALUES (13,'m');
+ REPAIR NO_WRITE_TO_BINLOG TABLE t1, t2 QUICK EXTENDED USE_FRM;
+@@ -101,119 +101,13 @@
+ INSERT INTO t1 (a,b) VALUES (10,'j');
+ REPAIR TABLE t1 USE_FRM;
+ Table Op Msg_type Msg_text
+-test.t1 repair status OK
+-t1#P#p0.MYD
+-t1#P#p0.MYI
+-t1#P#p1.MYD
+-t1#P#p1.MYI
++test.t1 repair note The storage engine for the table doesn't support repair
+ t1.frm
+ t1.par
+ INSERT INTO t1 (a,b) VALUES (14,'n'),(15,'o');
+ # Statement ended with one of expected results (0,144).
+ # If you got a difference in error message, just add it to rdiff file
+ FLUSH TABLE t1;
+-Restoring <DATADIR>/test/t1#P#p0.MYD
+-CHECK TABLE t1;
+-Table Op Msg_type Msg_text
+-test.t1 check error Size of datafile is: 26 Should be: 39
+-test.t1 check error Partition p0 returned error
+-test.t1 check error Corrupt
+-SELECT a,b FROM t1;
+-a b
+-8 h
+-10 j
+-7 g
+-15 o
+-Warnings:
+-Error 145 Table './test/t1#P#p0' is marked as crashed and should be repaired
+-Error 1194 Table 't1' is marked as crashed and should be repaired
+-Error 1034 Number of rows changed from 3 to 2
+-# Statement ended with one of expected results (0,ER_NOT_KEYFILE,144).
+-# If you got a difference in error message, just add it to rdiff file
+-INSERT INTO t1 (a,b) VALUES (14,'n'),(15,'o');
+-# Statement ended with one of expected results (0,144).
+-# If you got a difference in error message, just add it to rdiff file
+-FLUSH TABLE t1;
+-Restoring <DATADIR>/test/t1#P#p0.MYI
+-CHECK TABLE t1;
+-Table Op Msg_type Msg_text
+-test.t1 check warning Size of datafile is: 39 Should be: 26
+-test.t1 check error Record-count is not ok; is 3 Should be: 2
+-test.t1 check warning Found 3 key parts. Should be: 2
+-test.t1 check error Partition p0 returned error
+-test.t1 check error Corrupt
+-SELECT a,b FROM t1;
+-a b
+-8 h
+-10 j
+-14 n
+-7 g
+-15 o
+-15 o
+-Warnings:
+-Error 145 Table './test/t1#P#p0' is marked as crashed and should be repaired
+-Error 1194 Table 't1' is marked as crashed and should be repaired
+-Error 1034 Number of rows changed from 2 to 3
+-# Statement ended with one of expected results (0,ER_NOT_KEYFILE,144).
+-# If you got a difference in error message, just add it to rdiff file
+-INSERT INTO t1 (a,b) VALUES (14,'n'),(15,'o');
+-# Statement ended with one of expected results (0,144).
+-# If you got a difference in error message, just add it to rdiff file
+-FLUSH TABLE t1;
+-Restoring <DATADIR>/test/t1#P#p1.MYD
+-CHECK TABLE t1;
+-Table Op Msg_type Msg_text
+-test.t1 check error Size of datafile is: 39 Should be: 52
+-test.t1 check error Partition p1 returned error
+-test.t1 check error Corrupt
+-SELECT a,b FROM t1;
+-a b
+-8 h
+-10 j
+-14 n
+-14 n
+-7 g
+-15 o
+-15 o
+-Warnings:
+-Error 145 Table './test/t1#P#p1' is marked as crashed and should be repaired
+-Error 1194 Table 't1' is marked as crashed and should be repaired
+-Error 1034 Number of rows changed from 4 to 3
+-# Statement ended with one of expected results (0,ER_NOT_KEYFILE,144).
+-# If you got a difference in error message, just add it to rdiff file
+-INSERT INTO t1 (a,b) VALUES (14,'n'),(15,'o');
+-# Statement ended with one of expected results (0,144).
+-# If you got a difference in error message, just add it to rdiff file
+-FLUSH TABLE t1;
+-Restoring <DATADIR>/test/t1#P#p1.MYI
+-CHECK TABLE t1;
+-Table Op Msg_type Msg_text
+-test.t1 check warning Size of datafile is: 52 Should be: 39
+-test.t1 check error Record-count is not ok; is 4 Should be: 3
+-test.t1 check warning Found 4 key parts. Should be: 3
+-test.t1 check error Partition p1 returned error
+-test.t1 check error Corrupt
+-SELECT a,b FROM t1;
+-a b
+-8 h
+-10 j
+-14 n
+-14 n
+-14 n
+-7 g
+-15 o
+-15 o
+-15 o
+-Warnings:
+-Error 145 Table './test/t1#P#p1' is marked as crashed and should be repaired
+-Error 1194 Table 't1' is marked as crashed and should be repaired
+-Error 1034 Number of rows changed from 3 to 4
+-# Statement ended with one of expected results (0,ER_NOT_KEYFILE,144).
+-# If you got a difference in error message, just add it to rdiff file
+-INSERT INTO t1 (a,b) VALUES (14,'n'),(15,'o');
+-# Statement ended with one of expected results (0,144).
+-# If you got a difference in error message, just add it to rdiff file
+-FLUSH TABLE t1;
+ Restoring <DATADIR>/test/t1.par
+ CHECK TABLE t1;
+ Table Op Msg_type Msg_text
+@@ -223,14 +117,8 @@
+ 8 h
+ 10 j
+ 14 n
+-14 n
+-14 n
+-14 n
+ 7 g
+ 15 o
+-15 o
+-15 o
+-15 o
+ # Statement ended with one of expected results (0,ER_NOT_KEYFILE,144).
+ # If you got a difference in error message, just add it to rdiff file
+ DROP TABLE t1;
diff --git a/storage/innobase/mysql-test/storage_engine/parts/suite.opt b/storage/innobase/mysql-test/storage_engine/parts/suite.opt
new file mode 100644
index 00000000..66f581b5
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/parts/suite.opt
@@ -0,0 +1,2 @@
+--innodb
+
diff --git a/storage/innobase/mysql-test/storage_engine/repair_table.rdiff b/storage/innobase/mysql-test/storage_engine/repair_table.rdiff
new file mode 100644
index 00000000..e9c46b3a
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/repair_table.rdiff
@@ -0,0 +1,139 @@
+--- suite/storage_engine/repair_table.result 2013-10-03 20:35:06.000000000 +0400
++++ suite/storage_engine/repair_table.reject 2013-11-08 22:04:22.000000000 +0400
+@@ -4,56 +4,57 @@
+ CREATE TABLE t2 (a <INT_COLUMN>, b <CHAR_COLUMN>) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ REPAIR TABLE t1;
+ Table Op Msg_type Msg_text
+-test.t1 repair status OK
++test.t1 repair note The storage engine for the table doesn't support repair
+ INSERT INTO t1 (a,b) VALUES (3,'c');
+ INSERT INTO t2 (a,b) VALUES (4,'d');
+ REPAIR NO_WRITE_TO_BINLOG TABLE t1, t2;
+ Table Op Msg_type Msg_text
+-test.t1 repair status OK
+-test.t2 repair status OK
++test.t1 repair note The storage engine for the table doesn't support repair
++test.t2 repair note The storage engine for the table doesn't support repair
+ INSERT INTO t2 (a,b) VALUES (5,'e'),(6,'f');
+ REPAIR LOCAL TABLE t2;
+ Table Op Msg_type Msg_text
+-test.t2 repair status OK
++test.t2 repair note The storage engine for the table doesn't support repair
+ INSERT INTO t1 (a,b) VALUES (7,'g'),(8,'h');
+ INSERT INTO t2 (a,b) VALUES (9,'i');
+ REPAIR LOCAL TABLE t2, t1 EXTENDED;
+ Table Op Msg_type Msg_text
+-test.t2 repair status OK
+-test.t1 repair status OK
++test.t2 repair note The storage engine for the table doesn't support repair
++test.t1 repair note The storage engine for the table doesn't support repair
+ INSERT INTO t1 (a,b) VALUES (10,'j');
+ INSERT INTO t2 (a,b) VALUES (11,'k');
+ REPAIR TABLE t1, t2 QUICK USE_FRM;
+ Table Op Msg_type Msg_text
+-test.t1 repair warning Number of rows changed from 0 to 6
+-test.t1 repair status OK
+-test.t2 repair warning Number of rows changed from 0 to 5
+-test.t2 repair status OK
++test.t1 repair note The storage engine for the table doesn't support repair
++test.t2 repair note The storage engine for the table doesn't support repair
+ INSERT INTO t1 (a,b) VALUES (12,'l');
+ INSERT INTO t2 (a,b) VALUES (13,'m');
+ REPAIR NO_WRITE_TO_BINLOG TABLE t1, t2 QUICK EXTENDED USE_FRM;
+ Table Op Msg_type Msg_text
+-test.t1 repair warning Number of rows changed from 0 to 7
+-test.t1 repair status OK
+-test.t2 repair warning Number of rows changed from 0 to 6
+-test.t2 repair status OK
++test.t1 repair note The storage engine for the table doesn't support repair
++test.t2 repair note The storage engine for the table doesn't support repair
+ FLUSH TABLE t1;
+ INSERT INTO t1 (a,b) VALUES (14,'n');
+-ERROR HY000: Incorrect file format 't1'
+ # Statement ended with one of expected results (0,130,ER_FAILED_READ_FROM_PAR_FILE,ER_OPEN_AS_READONLY).
+ # If you got a difference in error message, just add it to rdiff file
+ CHECK TABLE t1;
+ Table Op Msg_type Msg_text
+-test.t1 check Error Incorrect file format 't1'
+-test.t1 check error Corrupt
++test.t1 check status OK
+ SELECT a,b FROM t1;
+-ERROR HY000: Incorrect file format 't1'
++a b
++1 a
++2 b
++3 c
++7 g
++8 h
++10 j
++12 l
++14 n
+ # Statement ended with one of expected results (0,130,ER_FAILED_READ_FROM_PAR_FILE,ER_OPEN_AS_READONLY).
+ # If you got a difference in error message, just add it to rdiff file
+ REPAIR TABLE t1;
+ Table Op Msg_type Msg_text
+-test.t1 repair Error Incorrect file format 't1'
+-test.t1 repair error Corrupt
++test.t1 repair note The storage engine for the table doesn't support repair
+ DROP TABLE t1, t2;
+ call mtr.add_suppression("Got an error from thread_id=.*");
+ call mtr.add_suppression("MySQL thread id .*, query id .* localhost.*root Checking table");
+@@ -63,46 +64,33 @@
+ CREATE TABLE t1 (a <INT_COLUMN>, b <CHAR_COLUMN>, <CUSTOM_INDEX> (a)) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ REPAIR TABLE t1;
+ Table Op Msg_type Msg_text
+-test.t1 repair status OK
++test.t1 repair note The storage engine for the table doesn't support repair
+ INSERT INTO t1 (a,b) VALUES (7,'g'),(8,'h');
+ REPAIR TABLE t1 EXTENDED;
+ Table Op Msg_type Msg_text
+-test.t1 repair status OK
++test.t1 repair note The storage engine for the table doesn't support repair
+ INSERT INTO t1 (a,b) VALUES (10,'j');
+ REPAIR TABLE t1 USE_FRM;
+ Table Op Msg_type Msg_text
+-test.t1 repair warning Number of rows changed from 0 to 3
+-test.t1 repair status OK
++test.t1 repair note The storage engine for the table doesn't support repair
+ db.opt
+-t1.MYD
+-t1.MYI
+ t1.frm
++t1.ibd
+ INSERT INTO t1 (a,b) VALUES (14,'n'),(15,'o');
+ # Statement ended with one of expected results (0,144).
+ # If you got a difference in error message, just add it to rdiff file
+ FLUSH TABLE t1;
+-Restoring <DATADIR>/test/t1.MYD
++Restoring <DATADIR>/test/t1.ibd
+ CHECK TABLE t1;
+ Table Op Msg_type Msg_text
+-test.t1 check error Size of datafile is: 39 Should be: 65
+-test.t1 check error Corrupt
++test.t1 check status OK
+ SELECT a,b FROM t1;
+-ERROR HY000: Index for table 't1' is corrupt; try to repair it
+-# Statement ended with one of expected results (0,ER_NOT_KEYFILE,144).
+-# If you got a difference in error message, just add it to rdiff file
+-INSERT INTO t1 (a,b) VALUES (14,'n'),(15,'o');
+-ERROR HY000: Table './test/t1' is marked as crashed and last (automatic?) repair failed
+-# Statement ended with one of expected results (0,144).
+-# If you got a difference in error message, just add it to rdiff file
+-FLUSH TABLE t1;
+-Restoring <DATADIR>/test/t1.MYI
+-CHECK TABLE t1;
+-Table Op Msg_type Msg_text
+-test.t1 check warning Table is marked as crashed and last repair failed
+-test.t1 check error Size of datafile is: 39 Should be: 65
+-test.t1 check error Corrupt
+-SELECT a,b FROM t1;
+-ERROR HY000: Table './test/t1' is marked as crashed and last (automatic?) repair failed
++a b
++7 g
++8 h
++10 j
++14 n
++15 o
+ # Statement ended with one of expected results (0,ER_NOT_KEYFILE,144).
+ # If you got a difference in error message, just add it to rdiff file
+ DROP TABLE t1;
diff --git a/storage/innobase/mysql-test/storage_engine/suite.opt b/storage/innobase/mysql-test/storage_engine/suite.opt
new file mode 100644
index 00000000..627becdb
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/suite.opt
@@ -0,0 +1 @@
+--innodb
diff --git a/storage/innobase/mysql-test/storage_engine/tbl_opt_index_dir.rdiff b/storage/innobase/mysql-test/storage_engine/tbl_opt_index_dir.rdiff
new file mode 100644
index 00000000..e09e50b1
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/tbl_opt_index_dir.rdiff
@@ -0,0 +1,23 @@
+--- suite/storage_engine/tbl_opt_data_index_dir.result 2013-10-03 20:35:06.000000000 +0400
++++ suite/storage_engine/tbl_opt_data_index_dir.reject 2013-11-08 22:06:54.000000000 +0400
+@@ -1,10 +1,12 @@
+ DROP TABLE IF EXISTS t1;
++Warnings:
++Warning 1618 <INDEX DIRECTORY> option ignored
+ SHOW CREATE TABLE t1;
+ Table Create Table
+ t1 CREATE TABLE `t1` (
+ `a` int(11) DEFAULT NULL,
+ `b` char(8) DEFAULT NULL
+-) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1 DATA DIRECTORY='<DATA_DIR>' INDEX DIRECTORY='<INDEX_DIR>'
++) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1 DATA DIRECTORY='<DATA_DIR>'
+ Warnings:
+ Warning 1618 <INDEX DIRECTORY> option ignored
+ SHOW CREATE TABLE t1;
+@@ -12,5 +14,5 @@
+ t1 CREATE TABLE `t1` (
+ `a` int(11) DEFAULT NULL,
+ `b` char(8) DEFAULT NULL
+-) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1 DATA DIRECTORY='<DATA_DIR>' INDEX DIRECTORY='<INDEX_DIR>'
++) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1 DATA DIRECTORY='<DATA_DIR>'
+ DROP TABLE t1;
diff --git a/storage/innobase/mysql-test/storage_engine/tbl_opt_insert_method.rdiff b/storage/innobase/mysql-test/storage_engine/tbl_opt_insert_method.rdiff
new file mode 100644
index 00000000..468b8292
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/tbl_opt_insert_method.rdiff
@@ -0,0 +1,11 @@
+--- suite/storage_engine/tbl_opt_insert_method.result 2012-06-24 23:55:19.539380000 +0400
++++ suite/storage_engine/tbl_opt_insert_method.reject 2012-07-15 17:51:09.978610512 +0400
+@@ -5,7 +5,7 @@
+ t1 CREATE TABLE `t1` (
+ `a` int(11) DEFAULT NULL,
+ `b` char(8) DEFAULT NULL
+-) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1 INSERT_METHOD=FIRST
++) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1
+ ALTER TABLE t1 INSERT_METHOD=NO;
+ SHOW CREATE TABLE t1;
+ Table Create Table
diff --git a/storage/innobase/mysql-test/storage_engine/tbl_opt_row_format.rdiff b/storage/innobase/mysql-test/storage_engine/tbl_opt_row_format.rdiff
new file mode 100644
index 00000000..daa5fc67
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/tbl_opt_row_format.rdiff
@@ -0,0 +1,44 @@
+--- ../storage/innobase/mysql-test/storage_engine/tbl_opt_row_format.result~ 2017-05-24 00:40:12.854181048 +0300
++++ ../storage/innobase/mysql-test/storage_engine/tbl_opt_row_format.reject 2017-05-24 00:49:06.578191030 +0300
+@@ -7,19 +7,39 @@
+ `b` char(8) DEFAULT NULL
+ ) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1 ROW_FORMAT=DYNAMIC
+ ALTER TABLE t1 ROW_FORMAT=FIXED;
++ERROR HY000: Table storage engine '<STORAGE_ENGINE>' does not support the create option 'ROW_TYPE'
++# ERROR: Statement ended with errno 1478, errname ER_ILLEGAL_HA_CREATE_OPTION (expected to succeed)
++# ------------ UNEXPECTED RESULT ------------
++# [ ALTER TABLE t1 ROW_FORMAT=FIXED ]
++# The statement|command finished with ER_ILLEGAL_HA_CREATE_OPTION.
++# ALTER TABLE or the mix could be unsupported|malfunctioning, or the problem was caused by previous errors.
++# You can change the engine code, or create an rdiff, or disable the test by adding it to disabled.def.
++# Further in this test, the message might sometimes be suppressed; a part of the test might be skipped.
++# Also, this problem may cause a chain effect (more errors of different kinds in the test).
++# -------------------------------------------
+ SHOW CREATE TABLE t1;
+ Table Create Table
+ t1 CREATE TABLE `t1` (
+ `a` int(11) DEFAULT NULL,
+ `b` char(8) DEFAULT NULL
+-) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1 ROW_FORMAT=FIXED
++) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1 ROW_FORMAT=DYNAMIC
+ ALTER TABLE t1 ROW_FORMAT=PAGE;
++ERROR HY000: Table storage engine '<STORAGE_ENGINE>' does not support the create option 'ROW_TYPE'
++# ERROR: Statement ended with errno 1478, errname ER_ILLEGAL_HA_CREATE_OPTION (expected to succeed)
++# ------------ UNEXPECTED RESULT ------------
++# [ ALTER TABLE t1 ROW_FORMAT=PAGE ]
++# The statement|command finished with ER_ILLEGAL_HA_CREATE_OPTION.
++# ALTER TABLE or the mix could be unsupported|malfunctioning, or the problem was caused by previous errors.
++# You can change the engine code, or create an rdiff, or disable the test by adding it to disabled.def.
++# Further in this test, the message might sometimes be suppressed; a part of the test might be skipped.
++# Also, this problem may cause a chain effect (more errors of different kinds in the test).
++# -------------------------------------------
+ SHOW CREATE TABLE t1;
+ Table Create Table
+ t1 CREATE TABLE `t1` (
+ `a` int(11) DEFAULT NULL,
+ `b` char(8) DEFAULT NULL
+-) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1 ROW_FORMAT=PAGE
++) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1 ROW_FORMAT=DYNAMIC
+ ALTER TABLE t1 ROW_FORMAT=COMPACT;
+ SHOW CREATE TABLE t1;
+ Table Create Table
diff --git a/storage/innobase/mysql-test/storage_engine/tbl_opt_union.rdiff b/storage/innobase/mysql-test/storage_engine/tbl_opt_union.rdiff
new file mode 100644
index 00000000..cbdf5818
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/tbl_opt_union.rdiff
@@ -0,0 +1,16 @@
+--- suite/storage_engine/tbl_opt_union.result 2012-06-24 23:55:19.539380000 +0400
++++ suite/storage_engine/tbl_opt_union.reject 2012-07-15 17:51:31.014346053 +0400
+@@ -4,11 +4,11 @@
+ Table Create Table
+ t1 CREATE TABLE `t1` (
+ `a` int(11) DEFAULT NULL
+-) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1 UNION=(`child1`)
++) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1
+ ALTER TABLE t1 UNION = (child1,child2);
+ SHOW CREATE TABLE t1;
+ Table Create Table
+ t1 CREATE TABLE `t1` (
+ `a` int(11) DEFAULT NULL
+-) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1 UNION=(`child1`,`child2`)
++) ENGINE=<STORAGE_ENGINE> DEFAULT CHARSET=latin1
+ DROP TABLE t1, child1, child2;
diff --git a/storage/innobase/mysql-test/storage_engine/trx/cons_snapshot_serializable.rdiff b/storage/innobase/mysql-test/storage_engine/trx/cons_snapshot_serializable.rdiff
new file mode 100644
index 00000000..e6149be5
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/trx/cons_snapshot_serializable.rdiff
@@ -0,0 +1,18 @@
+--- suite/storage_engine/trx/cons_snapshot_serializable.result 2013-11-27 18:46:36.000000000 +0400
++++ suite/storage_engine/trx/cons_snapshot_serializable.reject 2013-11-28 19:17:02.000000000 +0400
+@@ -5,12 +5,15 @@
+ CREATE TABLE t1 (a <INT_COLUMN>) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ SET SESSION TRANSACTION ISOLATION LEVEL SERIALIZABLE;
+ START TRANSACTION WITH CONSISTENT SNAPSHOT;
++Warnings:
++Warning 138 InnoDB: WITH CONSISTENT SNAPSHOT was ignored because this phrase can only be used with REPEATABLE READ isolation level.
+ connection con2;
+ INSERT INTO t1 (a) VALUES (1);
+ connection con1;
+ # If consistent read works on this isolation level (SERIALIZABLE), the following SELECT should not return the value we inserted (1)
+ SELECT a FROM t1;
+ a
++1
+ COMMIT;
+ connection default;
+ disconnect con1;
diff --git a/storage/innobase/mysql-test/storage_engine/trx/level_read_committed.rdiff b/storage/innobase/mysql-test/storage_engine/trx/level_read_committed.rdiff
new file mode 100644
index 00000000..cb64d321
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/trx/level_read_committed.rdiff
@@ -0,0 +1,11 @@
+--- suite/storage_engine/trx/level_read_committed.result 2013-11-28 19:18:48.000000000 +0400
++++ suite/storage_engine/trx/level_read_committed.reject 2013-11-28 19:18:59.000000000 +0400
+@@ -77,6 +77,8 @@
+ CREATE TABLE t1 (a <INT_COLUMN>) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+ START TRANSACTION WITH CONSISTENT SNAPSHOT;
++Warnings:
++Warning 138 InnoDB: WITH CONSISTENT SNAPSHOT was ignored because this phrase can only be used with REPEATABLE READ isolation level.
+ connection con2;
+ INSERT INTO t1 (a) VALUES (1);
+ connection con1;
diff --git a/storage/innobase/mysql-test/storage_engine/trx/level_read_uncommitted.rdiff b/storage/innobase/mysql-test/storage_engine/trx/level_read_uncommitted.rdiff
new file mode 100644
index 00000000..6a79abe3
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/trx/level_read_uncommitted.rdiff
@@ -0,0 +1,11 @@
+--- suite/storage_engine/trx/level_read_uncommitted.result 2013-11-28 19:18:48.000000000 +0400
++++ suite/storage_engine/trx/level_read_uncommitted.reject 2013-11-28 19:19:50.000000000 +0400
+@@ -102,6 +102,8 @@
+ CREATE TABLE t1 (a <INT_COLUMN>) ENGINE=<STORAGE_ENGINE> <CUSTOM_TABLE_OPTIONS>;
+ SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
+ START TRANSACTION WITH CONSISTENT SNAPSHOT;
++Warnings:
++Warning 138 InnoDB: WITH CONSISTENT SNAPSHOT was ignored because this phrase can only be used with REPEATABLE READ isolation level.
+ connection con2;
+ INSERT INTO t1 (a) VALUES (1);
+ connection con1;
diff --git a/storage/innobase/mysql-test/storage_engine/trx/suite.opt b/storage/innobase/mysql-test/storage_engine/trx/suite.opt
new file mode 100644
index 00000000..64bbe8b5
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/trx/suite.opt
@@ -0,0 +1,3 @@
+--innodb
+--innodb-lock-wait-timeout=1
+
diff --git a/storage/innobase/mysql-test/storage_engine/type_blob.opt b/storage/innobase/mysql-test/storage_engine/type_blob.opt
new file mode 100644
index 00000000..a007f405
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/type_blob.opt
@@ -0,0 +1 @@
+--innodb_log_file_size=200M
diff --git a/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff b/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff
new file mode 100644
index 00000000..98e17f3c
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff
@@ -0,0 +1,11 @@
+--- suite/storage_engine/type_char_indexes.result 2014-10-12 14:22:11.000000000 +0400
++++ suite/storage_engine/type_char_indexes.reject 2014-10-12 14:23:28.000000000 +0400
+@@ -137,7 +137,7 @@
+ r3a
+ EXPLAIN SELECT c,c20,v16,v128 FROM t1 WHERE v16 = 'varchar1a' OR v16 = 'varchar3a' ORDER BY v16;
+ id select_type table type possible_keys key key_len ref rows Extra
+-# # # range # v16 # # # #
++# # # ALL # NULL # # # #
+ SELECT c,c20,v16,v128 FROM t1 WHERE v16 = 'varchar1a' OR v16 = 'varchar3a' ORDER BY v16;
+ c c20 v16 v128
+ a char1 varchar1a varchar1b
diff --git a/storage/innobase/mysql-test/storage_engine/type_float_indexes.rdiff b/storage/innobase/mysql-test/storage_engine/type_float_indexes.rdiff
new file mode 100644
index 00000000..6ebfd61d
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/type_float_indexes.rdiff
@@ -0,0 +1,11 @@
+--- suite/storage_engine/type_float_indexes.result 2012-07-12 19:37:27.031661128 +0400
++++ suite/storage_engine/type_float_indexes.reject 2012-07-15 17:52:12.189828410 +0400
+@@ -60,7 +60,7 @@
+ ALTER TABLE t1 ADD UNIQUE KEY(d);
+ EXPLAIN SELECT d FROM t1 WHERE r > 0 and d > 0 ORDER BY d;
+ id select_type table type possible_keys key key_len ref rows Extra
+-# # # # # d # # # #
++# # # # # NULL # # # #
+ SELECT d FROM t1 WHERE r > 0 and d > 0 ORDER BY d;
+ d
+ 1.2345
diff --git a/storage/innobase/mysql-test/storage_engine/type_text.opt b/storage/innobase/mysql-test/storage_engine/type_text.opt
new file mode 100644
index 00000000..a007f405
--- /dev/null
+++ b/storage/innobase/mysql-test/storage_engine/type_text.opt
@@ -0,0 +1 @@
+--innodb_log_file_size=200M
diff --git a/storage/innobase/os/os0event.cc b/storage/innobase/os/os0event.cc
new file mode 100644
index 00000000..f18633cc
--- /dev/null
+++ b/storage/innobase/os/os0event.cc
@@ -0,0 +1,515 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0event.cc
+The interface to the operating system condition variables.
+
+Created 2012-09-23 Sunny Bains
+*******************************************************/
+
+#include "os0event.h"
+#include "ut0mutex.h"
+#include <my_sys.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#include <synchapi.h>
+/** Native condition variable. */
+typedef CONDITION_VARIABLE os_cond_t;
+#else
+/** Native condition variable */
+typedef pthread_cond_t os_cond_t;
+#endif /* _WIN32 */
+
+/** InnoDB condition variable. */
+struct os_event {
+ os_event() UNIV_NOTHROW;
+
+ ~os_event() UNIV_NOTHROW;
+
+ /**
+ Destroys a condition variable */
+ void destroy() UNIV_NOTHROW
+ {
+#ifndef _WIN32
+ int ret = pthread_cond_destroy(&cond_var);
+ ut_a(ret == 0);
+#endif /* !_WIN32 */
+
+ mutex.destroy();
+ }
+
+ /** Set the event */
+ void set() UNIV_NOTHROW
+ {
+ mutex.enter();
+
+ if (!m_set) {
+ broadcast();
+ }
+
+ mutex.exit();
+ }
+
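+	/** Reset the event to the nonsignaled state and return the
+	current signal count, which the caller can later pass to
+	wait_low() so that an intervening set() is not missed. */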
+ int64_t reset() UNIV_NOTHROW
+ {
+ mutex.enter();
+
+ if (m_set) {
+ m_set = false;
+ }
+
+ int64_t ret = signal_count;
+
+ mutex.exit();
+
+ return(ret);
+ }
+
+ /**
+ Waits for an event object until it is in the signaled state.
+
+ Typically, if the event has been signalled after the os_event_reset()
+ we'll return immediately because event->m_set == true.
+ There are, however, situations (e.g.: sync_array code) where we may
+ lose this information. For example:
+
+ thread A calls os_event_reset()
+ thread B calls os_event_set() [event->m_set == true]
+ thread C calls os_event_reset() [event->m_set == false]
+ thread A calls os_event_wait() [infinite wait!]
+ thread C calls os_event_wait() [infinite wait!]
+
+ Where such a scenario is possible, to avoid infinite wait, the
+ value returned by reset() should be passed in as
+ reset_sig_count. */
+ void wait_low(int64_t reset_sig_count) UNIV_NOTHROW;
+
+ /**
+ Waits for an event object until it is in the signaled state or
+ a timeout is exceeded.
+ @param time_in_usec - timeout in microseconds,
+ or OS_SYNC_INFINITE_TIME
+ @param reset_sig_count- zero or the value returned by
+ previous call of os_event_reset().
+ @return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+ ulint wait_time_low(
+ ulint time_in_usec,
+ int64_t reset_sig_count) UNIV_NOTHROW;
+
+ /** @return true if the event is in the signalled state. */
+ bool is_set() const UNIV_NOTHROW
+ {
+ mutex.enter();
+ bool is_set = m_set;
+ mutex.exit();
+ return is_set;
+ }
+
+private:
+ /**
+ Initialize a condition variable */
+ void init() UNIV_NOTHROW
+ {
+
+ mutex.init();
+
+#ifdef _WIN32
+ InitializeConditionVariable(&cond_var);
+#else
+ {
+ int ret;
+
+ ret = pthread_cond_init(&cond_var, NULL);
+ ut_a(ret == 0);
+ }
+#endif /* _WIN32 */
+ }
+
+ /**
+ Wait on condition variable */
+ void wait() UNIV_NOTHROW
+ {
+#ifdef _WIN32
+ if (!SleepConditionVariableCS(&cond_var, mutex, INFINITE)) {
+ ut_error;
+ }
+#else
+ {
+ int ret;
+
+ ret = pthread_cond_wait(&cond_var, mutex);
+ ut_a(ret == 0);
+ }
+#endif /* _WIN32 */
+ }
+
+ /**
+ Wakes all threads waiting for condition variable */
+ void broadcast() UNIV_NOTHROW
+ {
+ m_set = true;
+ ++signal_count;
+
+#ifdef _WIN32
+ WakeAllConditionVariable(&cond_var);
+#else
+ {
+ int ret;
+
+ ret = pthread_cond_broadcast(&cond_var);
+ ut_a(ret == 0);
+ }
+#endif /* _WIN32 */
+ }
+
+ /**
+ Wakes one thread waiting for condition variable */
+ void signal() UNIV_NOTHROW
+ {
+#ifdef _WIN32
+ WakeConditionVariable(&cond_var);
+#else
+ {
+ int ret;
+
+ ret = pthread_cond_signal(&cond_var);
+ ut_a(ret == 0);
+ }
+#endif /* _WIN32 */
+ }
+
+ /**
+ Do a timed wait on condition variable.
+ @param abstime - timeout
+ @param time_in_ms - timeout in milliseconds.
+ @return true if timed out, false otherwise */
+ bool timed_wait(
+#ifndef _WIN32
+ const timespec* abstime
+#else
+ DWORD time_in_ms
+#endif /* !_WIN32 */
+ );
+
+private:
+ bool m_set; /*!< this is true when the
+ event is in the signaled
+ state, i.e., a thread does
+ not stop if it tries to wait
+ for this event */
+ int64_t signal_count; /*!< this is incremented
+ each time the event becomes
+ signaled */
+	mutable OSMutex mutex; /*!< this mutex protects m_set and
+	signal_count, and pairs with the
+	condition variable below */
+
+
+ os_cond_t cond_var; /*!< condition variable is
+ used in waiting for the event */
+
+protected:
+ // Disable copying
+ os_event(const os_event&);
+ os_event& operator=(const os_event&);
+};
+
+/**
+Do a timed wait on condition variable.
+@param abstime - absolute time to wait
+@param time_in_ms - timeout in milliseconds
+@return true if timed out */
+bool
+os_event::timed_wait(
+#ifndef _WIN32
+ const timespec* abstime
+#else
+ DWORD time_in_ms
+#endif /* !_WIN32 */
+)
+{
+#ifdef _WIN32
+ BOOL ret;
+
+ ret = SleepConditionVariableCS(&cond_var, mutex, time_in_ms);
+
+ if (!ret) {
+ DWORD err = GetLastError();
+
+		/* @see http://msdn.microsoft.com/en-us/library/ms686301%28VS.85%29.aspx:
+
+ "Condition variables are subject to spurious wakeups
+ (those not associated with an explicit wake) and stolen wakeups
+ (another thread manages to run before the woken thread)."
+ Check for both types of timeouts.
+ Conditions are checked by the caller.*/
+ if (err == WAIT_TIMEOUT || err == ERROR_TIMEOUT) {
+ return(true);
+ }
+ }
+
+ ut_a(ret);
+
+ return(false);
+#else
+ int ret;
+
+ ret = pthread_cond_timedwait(&cond_var, mutex, abstime);
+
+ switch (ret) {
+ case 0:
+ case ETIMEDOUT:
+ /* We play it safe by checking for EINTR even though
+ according to the POSIX documentation it can't return EINTR. */
+ case EINTR:
+ break;
+
+ default:
+ ib::error() << "pthread_cond_timedwait() returned: " << ret
+ << ": abstime={" << abstime->tv_sec << ","
+ << abstime->tv_nsec << "}";
+ ut_error;
+ }
+
+ return(ret == ETIMEDOUT);
+#endif /* _WIN32 */
+}
+
+/**
+Waits for an event object until it is in the signaled state.
+
+Typically, if the event has been signalled after the os_event_reset()
+we'll return immediately because event->m_set == true.
+There are, however, situations (e.g.: sync_array code) where we may
+lose this information. For example:
+
+thread A calls os_event_reset()
+thread B calls os_event_set() [event->m_set == true]
+thread C calls os_event_reset() [event->m_set == false]
+thread A calls os_event_wait() [infinite wait!]
+thread C calls os_event_wait() [infinite wait!]
+
+Where such a scenario is possible, to avoid infinite wait, the
+value returned by reset() should be passed in as
+reset_sig_count. */
+void
+os_event::wait_low(
+ int64_t reset_sig_count) UNIV_NOTHROW
+{
+ mutex.enter();
+
+ if (!reset_sig_count) {
+ reset_sig_count = signal_count;
+ }
+
+ while (!m_set && signal_count == reset_sig_count) {
+
+ wait();
+
+ /* Spurious wakeups may occur: we have to check if the
+ event really has been signaled after we came here to wait. */
+ }
+
+ mutex.exit();
+}
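+
+/* A minimal usage sketch of the reset/wait protocol described above
+(illustrative only; job_done is a hypothetical caller-side condition):
+the waiting thread snapshots the signal count at reset time, so an
+os_event_set() that slips in between reset and wait cannot be lost:
+
+	int64_t sig_count = os_event_reset(event);
+	if (!job_done) {
+		os_event_wait_low(event, sig_count);
+	}
+*/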
+
+/**
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded.
+@param time_in_usec - timeout in microseconds, or OS_SYNC_INFINITE_TIME
+@param reset_sig_count - zero or the value returned by previous call
+ of os_event_reset().
+@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+ulint
+os_event::wait_time_low(
+ ulint time_in_usec,
+ int64_t reset_sig_count) UNIV_NOTHROW
+{
+ bool timed_out = false;
+
+#ifdef _WIN32
+ DWORD time_in_ms;
+
+ if (time_in_usec != OS_SYNC_INFINITE_TIME) {
+ time_in_ms = DWORD(time_in_usec / 1000);
+ } else {
+ time_in_ms = INFINITE;
+ }
+#else
+ struct timespec abstime;
+
+ if (time_in_usec != OS_SYNC_INFINITE_TIME) {
+ ulonglong usec = ulonglong(time_in_usec) + my_hrtime().val;
+ abstime.tv_sec = static_cast<time_t>(usec / 1000000);
+ abstime.tv_nsec = static_cast<uint>((usec % 1000000) * 1000);
+ } else {
+ abstime.tv_nsec = 999999999;
+ abstime.tv_sec = (time_t) ULINT_MAX;
+ }
+
+ ut_a(abstime.tv_nsec <= 999999999);
+
+#endif /* _WIN32 */
+
+ mutex.enter();
+
+ if (!reset_sig_count) {
+ reset_sig_count = signal_count;
+ }
+
+ do {
+ if (m_set || signal_count != reset_sig_count) {
+
+ break;
+ }
+
+#ifndef _WIN32
+ timed_out = timed_wait(&abstime);
+#else
+ timed_out = timed_wait(time_in_ms);
+#endif /* !_WIN32 */
+
+ } while (!timed_out);
+
+ mutex.exit();
+
+ return(timed_out ? OS_SYNC_TIME_EXCEEDED : 0);
+}
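+
+/* Illustrative timed-wait pattern (a sketch; OS_SYNC_TIME_EXCEEDED is
+declared in os0event.h): wait up to one second, then fall back:
+
+	int64_t sig_count = os_event_reset(event);
+	if (os_event_wait_time_low(event, 1000000, sig_count)
+	    == OS_SYNC_TIME_EXCEEDED) {
+		// timed out after one second; the caller decides what to do
+	}
+*/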
+
+/** Constructor */
+os_event::os_event() UNIV_NOTHROW
+{
+ init();
+
+ m_set = false;
+
+ /* We return this value in os_event_reset(),
+	which can then be passed to
+ os_event_wait_low(). The value of zero is
+ reserved in os_event_wait_low() for the case
+ when the caller does not want to pass any
+ signal_count value. To distinguish between
+ the two cases we initialize signal_count
+ to 1 here. */
+
+ signal_count = 1;
+}
+
+/** Destructor */
+os_event::~os_event() UNIV_NOTHROW
+{
+ destroy();
+}
+
+/**
+Creates an event semaphore, i.e., a semaphore which has just two
+states: signaled and nonsignaled. The created event is manual reset: it
+must be reset explicitly by calling os_event_reset().
+@return the event handle */
+os_event_t os_event_create(const char*)
+{
+ return(UT_NEW_NOKEY(os_event()));
+}
+
+/**
+Check if the event is set.
+@return true if set */
+bool
+os_event_is_set(
+/*============*/
+ const os_event_t event) /*!< in: event to test */
+{
+ return(event->is_set());
+}
+
+/**
+Sets an event semaphore to the signaled state: lets waiting threads
+proceed. */
+void
+os_event_set(
+/*=========*/
+ os_event_t event) /*!< in/out: event to set */
+{
+ event->set();
+}
+
+/**
+Resets an event semaphore to the nonsignaled state: threads that wait
+for the event will block until it is set again.
+The return value should be passed to os_event_wait_low() if it is desired
+that this thread should not wait in case of an intervening call to
+os_event_set() between this os_event_reset() and the
+os_event_wait_low() call. See comments for os_event_wait_low().
+@return current signal_count. */
+int64_t
+os_event_reset(
+/*===========*/
+ os_event_t event) /*!< in/out: event to reset */
+{
+ return(event->reset());
+}
+
+/**
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded.
+@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+ulint
+os_event_wait_time_low(
+/*===================*/
+ os_event_t event, /*!< in/out: event to wait */
+ ulint time_in_usec, /*!< in: timeout in
+ microseconds, or
+ OS_SYNC_INFINITE_TIME */
+ int64_t reset_sig_count) /*!< in: zero or the value
+ returned by previous call of
+ os_event_reset(). */
+{
+ return(event->wait_time_low(time_in_usec, reset_sig_count));
+}
+
+/**
+Waits for an event object until it is in the signaled state.
+
+Where such a scenario is possible, to avoid infinite wait, the
+value returned by os_event_reset() should be passed in as
+reset_sig_count. */
+void
+os_event_wait_low(
+/*==============*/
+ os_event_t event, /*!< in: event to wait */
+ int64_t reset_sig_count) /*!< in: zero or the value
+ returned by previous call of
+ os_event_reset(). */
+{
+ event->wait_low(reset_sig_count);
+}
+
+/**
+Frees an event object. */
+void
+os_event_destroy(
+/*=============*/
+ os_event_t& event) /*!< in/own: event to free */
+
+{
+ UT_DELETE(event);
+ event = NULL;
+}
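+
+/* End-to-end lifecycle sketch for the wrappers above (illustrative
+only; error handling omitted):
+
+	os_event_t ev = os_event_create(0);	// starts nonsignaled
+	int64_t sig_count = os_event_reset(ev);
+	// ... some other thread eventually calls os_event_set(ev) ...
+	os_event_wait_low(ev, sig_count);	// returns once ev is set
+	ut_a(os_event_is_set(ev));
+	os_event_destroy(ev);			// also resets ev to NULL
+*/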
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
new file mode 100644
index 00000000..7a6829e7
--- /dev/null
+++ b/storage/innobase/os/os0file.cc
@@ -0,0 +1,4349 @@
+/***********************************************************************
+
+Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file os/os0file.cc
+The interface to the operating system file i/o primitives
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_INNOCHECKSUM
+#include "os0file.h"
+#include "sql_const.h"
+
+#ifdef UNIV_LINUX
+# include <sys/types.h>
+# include <sys/stat.h>
+#endif
+
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "fil0fil.h"
+#include "fsp0fsp.h"
+#ifdef HAVE_LINUX_UNISTD_H
+#include "unistd.h"
+#endif
+#include "os0event.h"
+#include "os0thread.h"
+
+#include <vector>
+#include <tpool_structs.h>
+
+#ifdef LINUX_NATIVE_AIO
+#include <libaio.h>
+#endif /* LINUX_NATIVE_AIO */
+
+#ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
+# include <fcntl.h>
+# include <linux/falloc.h>
+#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
+
+#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H)
+# include <sys/ioctl.h>
+# ifndef DFS_IOCTL_ATOMIC_WRITE_SET
+# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint)
+# endif
+#endif
+
+#ifdef _WIN32
+#include <winioctl.h>
+#else
+// my_test_if_atomic_write()
+#include <my_sys.h>
+#endif
+
+#include "buf0dblwr.h"
+
+#include <thread>
+#include <chrono>
+
+/* Per-IO-operation environment */
+class io_slots
+{
+private:
+ tpool::cache<tpool::aiocb> m_cache;
+ tpool::task_group m_group;
+ int m_max_aio;
+public:
+ io_slots(int max_submitted_io, int max_callback_concurrency) :
+ m_cache(max_submitted_io),
+ m_group(max_callback_concurrency),
+ m_max_aio(max_submitted_io)
+ {
+ }
+ /* Get cached AIO control block */
+ tpool::aiocb* acquire()
+ {
+ return m_cache.get();
+ }
+ /* Release AIO control block back to cache */
+ void release(tpool::aiocb* aiocb)
+ {
+ m_cache.put(aiocb);
+ }
+
+ bool contains(tpool::aiocb* aiocb)
+ {
+ return m_cache.contains(aiocb);
+ }
+
+ /* Wait for completions of all AIO operations */
+ void wait()
+ {
+ m_cache.wait();
+ }
+
+ size_t pending_io_count()
+ {
+ return (size_t)m_max_aio - m_cache.size();
+ }
+
+ tpool::task_group* get_task_group()
+ {
+ return &m_group;
+ }
+
+ ~io_slots()
+ {
+ wait();
+ }
+};
+
+static io_slots *read_slots;
+static io_slots *write_slots;
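+
+/* Assumed usage of the slot caches above (a sketch; the real call
+sites are in the AIO code later in this file): acquire a cached
+control block, submit the I/O it describes, release it on completion:
+
+	tpool::aiocb* cb = read_slots->acquire();
+	// ... fill in *cb and submit the asynchronous read ...
+	read_slots->release(cb);	// once the I/O has completed
+*/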
+
+/** Number of retries for partial I/O's */
+constexpr ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
+
+/* This specifies the file permissions InnoDB uses when it creates files in
+Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
+my_umask */
+
+#ifndef _WIN32
+/** Umask for creating files */
+static ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
+#else
+/** Umask for creating files */
+static ulint os_innodb_umask = 0;
+#endif /* _WIN32 */
+
+
+#ifdef WITH_INNODB_DISALLOW_WRITES
+#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
+#else
+#define WAIT_ALLOW_WRITES() do { } while (0)
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+
+
+Atomic_counter<ulint> os_n_file_reads;
+static ulint os_bytes_read_since_printout;
+ulint os_n_file_writes;
+ulint os_n_fsyncs;
+static ulint os_n_file_reads_old;
+static ulint os_n_file_writes_old;
+static ulint os_n_fsyncs_old;
+
+static time_t os_last_printout;
+bool os_has_said_disk_full;
+
+/** Default Zip compression level */
+extern uint page_zip_level;
+
+#ifdef UNIV_PFS_IO
+/* Keys to register InnoDB I/O with performance schema */
+mysql_pfs_key_t innodb_data_file_key;
+mysql_pfs_key_t innodb_log_file_key;
+mysql_pfs_key_t innodb_temp_file_key;
+#endif
+
+/** Handle errors for file operations.
+@param[in] name name of a file or NULL
+@param[in] operation operation
+@param[in] should_abort whether to abort on an unknown error
+@param[in] on_error_silent whether to suppress reports of non-fatal errors
+@return true if we should retry the operation */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+os_file_handle_error_cond_exit(
+ const char* name,
+ const char* operation,
+ bool should_abort,
+ bool on_error_silent);
+
+/** Does error handling when a file operation fails.
+@param[in] name name of a file or NULL
+@param[in] operation operation name that failed
+@return true if we should retry the operation */
+static
+bool
+os_file_handle_error(
+ const char* name,
+ const char* operation)
+{
+ /* Exit in case of unknown error */
+ return(os_file_handle_error_cond_exit(name, operation, true, false));
+}
+
+/** Does error handling when a file operation fails.
+@param[in] name name of a file or NULL
+@param[in] operation operation name that failed
+@param[in] on_error_silent if true then don't print any message to the log.
+@return true if we should retry the operation */
+static
+bool
+os_file_handle_error_no_exit(
+ const char* name,
+ const char* operation,
+ bool on_error_silent)
+{
+ /* Don't exit in case of unknown error */
+ return(os_file_handle_error_cond_exit(
+ name, operation, false, on_error_silent));
+}
+
+/** Handle RENAME error.
+@param name old name of the file
+@param new_name new name of the file */
+static void os_file_handle_rename_error(const char* name, const char* new_name)
+{
+ if (os_file_get_last_error(true) != OS_FILE_DISK_FULL) {
+ ib::error() << "Cannot rename file '" << name << "' to '"
+ << new_name << "'";
+ } else if (!os_has_said_disk_full) {
+ os_has_said_disk_full = true;
+ /* Disk full error is reported irrespective of the
+ on_error_silent setting. */
+ ib::error() << "Full disk prevents renaming file '"
+ << name << "' to '" << new_name << "'";
+ }
+}
+
+
+#ifdef _WIN32
+
+/**
+ Wrapper around Windows DeviceIoControl() function.
+
+ Works synchronously, even when the handle was opened
+ for async access (i.e. with FILE_FLAG_OVERLAPPED).
+
+ Accepts the same parameters as DeviceIoControl(), except the
+ last parameter (OVERLAPPED).
+*/
+static
+BOOL
+os_win32_device_io_control(
+ HANDLE handle,
+ DWORD code,
+ LPVOID inbuf,
+ DWORD inbuf_size,
+ LPVOID outbuf,
+ DWORD outbuf_size,
+ LPDWORD bytes_returned
+)
+{
+ OVERLAPPED overlapped = { 0 };
+ overlapped.hEvent = tpool::win_get_syncio_event();
+ BOOL result = DeviceIoControl(handle, code, inbuf, inbuf_size, outbuf,
+ outbuf_size, NULL, &overlapped);
+
+ if (result || (GetLastError() == ERROR_IO_PENDING)) {
+ /* Wait for async io to complete */
+ result = GetOverlappedResult(handle, &overlapped, bytes_returned, TRUE);
+ }
+
+ return result;
+}
+
+#endif
+
+
+
+/** Helper class for doing synchronous file IO. Currently, the objective
+is to hide the OS specific code, so that the higher level functions aren't
+peppered with #ifdef directives, which make the code flow difficult to follow. */
+class SyncFileIO
+{
+public:
+ /** Constructor
+ @param[in] fh File handle
+ @param[in,out] buf Buffer to read/write
+ @param[in] n Number of bytes to read/write
+ @param[in] offset Offset where to read or write */
+ SyncFileIO(os_file_t fh, void *buf, ulint n, os_offset_t offset) :
+ m_fh(fh), m_buf(buf), m_n(static_cast<ssize_t>(n)), m_offset(offset)
+ { ut_ad(m_n > 0); }
+
+ /** Do the read/write
+ @param[in] request The IO context and type
+ @return the number of bytes read/written or negative value on error */
+ ssize_t execute(const IORequest &request);
+
+ /** Move the read/write offset up to where the partial IO succeeded.
+ @param[in] n_bytes The number of bytes to advance */
+ void advance(ssize_t n_bytes)
+ {
+ m_offset+= n_bytes;
+ ut_ad(m_n >= n_bytes);
+ m_n-= n_bytes;
+ m_buf= reinterpret_cast<uchar*>(m_buf) + n_bytes;
+ }
+
+private:
+ /** Open file handle */
+ const os_file_t m_fh;
+ /** Buffer to read/write */
+ void *m_buf;
+ /** Number of bytes to read/write */
+ ssize_t m_n;
+ /** Offset from where to read/write */
+ os_offset_t m_offset;
+};
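+
+/* Illustrative partial-I/O retry loop (a sketch of the intended usage;
+the real caller appears later in this file). advance() shifts the
+buffer, the offset and the remaining byte count past what has already
+been transferred:
+
+	SyncFileIO io(fh, buf, n, offset);
+	ssize_t remaining = ssize_t(n);
+	for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
+		ssize_t n_bytes = io.execute(request);
+		if (n_bytes < 0) break;		// hard error
+		remaining -= n_bytes;
+		if (remaining == 0) break;	// fully transferred
+		io.advance(n_bytes);		// retry only the remainder
+	}
+*/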
+
+#undef USE_FILE_LOCK
+#ifndef _WIN32
+/* On Windows, mandatory locking is used */
+# define USE_FILE_LOCK
+#endif
+#ifdef USE_FILE_LOCK
+/** Obtain an exclusive lock on a file.
+@param[in] fd file descriptor
+@param[in] name file name
+@return 0 on success */
+static
+int
+os_file_lock(
+ int fd,
+ const char* name)
+{
+ if (my_disable_locking) {
+ return 0;
+ }
+
+ struct flock lk;
+
+ lk.l_type = F_WRLCK;
+ lk.l_whence = SEEK_SET;
+ lk.l_start = lk.l_len = 0;
+
+ if (fcntl(fd, F_SETLK, &lk) == -1) {
+
+ ib::error()
+ << "Unable to lock " << name
+ << " error: " << errno;
+
+ if (errno == EAGAIN || errno == EACCES) {
+
+ ib::info()
+ << "Check that you do not already have"
+ " another mysqld process using the"
+ " same InnoDB data or log files.";
+ }
+
+ return(-1);
+ }
+
+ return(0);
+}
+#endif /* USE_FILE_LOCK */
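+
+/* Illustrative call site (an assumption; the real callers open the
+data files elsewhere in this file): lock a freshly opened data file so
+that a second server process cannot reuse it:
+
+	if (os_file_lock(fd, name) != 0) {
+		// another mysqld probably holds the lock; refuse to start
+	}
+*/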
+
+
+/** Create a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the directory specified by the MySQL
+server configuration parameter (--tmpdir).
+@return temporary file handle, or NULL on error */
+FILE*
+os_file_create_tmpfile()
+{
+ FILE* file = NULL;
+ WAIT_ALLOW_WRITES();
+ File fd = mysql_tmpfile("ib");
+
+ if (fd >= 0) {
+ file = my_fdopen(fd, 0, O_RDWR|O_TRUNC|O_CREAT|FILE_BINARY,
+ MYF(MY_WME));
+ if (!file) {
+ my_close(fd, MYF(MY_WME));
+ }
+ }
+
+ if (file == NULL) {
+
+ ib::error()
+ << "Unable to create temporary file; errno: "
+ << errno;
+ }
+
+ return(file);
+}
+
+/** Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files.
+@param[in,out] file File to read from
+@param[in,out] str Buffer where to read
+@param[in] size Size of buffer */
+void
+os_file_read_string(
+ FILE* file,
+ char* str,
+ ulint size)
+{
+ if (size != 0) {
+ rewind(file);
+
+ size_t flen = fread(str, 1, size - 1, file);
+
+ str[flen] = '\0';
+ }
+}
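+
+/* Illustrative example (assuming a read-write FILE*, such as one
+obtained from os_file_create_tmpfile() above):
+
+  char buf[16];
+  fputs("hello", file);                        // write some data
+  os_file_read_string(file, buf, sizeof buf);  // rewinds; buf == "hello"
+*/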
+
+/** This function returns a new path name after replacing the basename
+in an old path with a new basename. The old_path is a full path
+name including the extension. The tablename is in the normal
+form "databasename/tablename". The new base name is found after
+the forward slash. Both input strings are null terminated.
+
+This function allocates memory to be returned. It is the caller's
+responsibility to free the return value after it is no longer needed.
+
+@param[in] old_path Pathname
+@param[in] tablename Contains new base name
+@return own: new full pathname */
+char*
+os_file_make_new_pathname(
+ const char* old_path,
+ const char* tablename)
+{
+ ulint dir_len;
+ char* last_slash;
+ char* base_name;
+ char* new_path;
+ ulint new_path_len;
+
+ /* Split the tablename into its database and table name components.
+ They are separated by a '/'. */
+ last_slash = strrchr((char*) tablename, '/');
+ base_name = last_slash ? last_slash + 1 : (char*) tablename;
+
+ /* Find the offset of the last slash. We will strip off the
+ old basename.ibd which starts after that slash. */
+ last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
+ dir_len = last_slash ? ulint(last_slash - old_path) : strlen(old_path);
+
+ /* allocate a new path and move the old directory path to it. */
+ new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
+ new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
+ memcpy(new_path, old_path, dir_len);
+
+ snprintf(new_path + dir_len, new_path_len - dir_len,
+ "%c%s.ibd", OS_PATH_SEPARATOR, base_name);
+
+ return(new_path);
+}
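+
+/* For example (illustrative values only): given
+old_path = "/mysql/data/dir/t1.ibd" and tablename = "db2/t2",
+os_file_make_new_pathname() returns "/mysql/data/dir/t2.ibd". */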
+
+/** This function reduces a null-terminated full remote path name into
+the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
+the 'databasename/tablename.ibd' found at the end of the path with just
+'tablename'.
+
+Since the result is always smaller than the path sent in, no new memory
+is allocated. The caller should allocate memory for the path sent in.
+This function manipulates that path in place.
+
+If the path format is not as expected, just return. The result is used
+to inform a SHOW CREATE TABLE command.
+@param[in,out] data_dir_path Full path/data_dir_path */
+void
+os_file_make_data_dir_path(
+ char* data_dir_path)
+{
+ /* Replace the period before the extension with a null byte. */
+ char* ptr = strrchr((char*) data_dir_path, '.');
+
+ if (ptr == NULL) {
+ return;
+ }
+
+ ptr[0] = '\0';
+
+ /* The tablename starts after the last slash. */
+ ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
+
+ if (ptr == NULL) {
+ return;
+ }
+
+ ptr[0] = '\0';
+
+ char* tablename = ptr + 1;
+
+ /* The databasename starts after the next to last slash. */
+ ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
+
+ if (ptr == NULL) {
+ return;
+ }
+
+ ulint tablename_len = strlen(tablename);
+
+ memmove(++ptr, tablename, tablename_len);
+
+ ptr[tablename_len] = '\0';
+}
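+
+/* For example (illustrative values only): given
+data_dir_path = "/remote/data/db/t1.ibd", the buffer is rewritten in
+place to "/remote/data/t1". */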
+
+/** Check if the path refers to the root of a drive using a pointer
+to the last directory separator that the caller has fixed.
+@param[in] path path name
+@param[in] last_slash last directory separator in the path
+@return true if this path is a drive root, false if not */
+UNIV_INLINE
+bool
+os_file_is_root(
+ const char* path,
+ const char* last_slash)
+{
+ return(
+#ifdef _WIN32
+ (last_slash == path + 2 && path[1] == ':') ||
+#endif /* _WIN32 */
+ last_slash == path);
+}
+
+/** Return the parent directory component of a null-terminated path.
+Return a new buffer containing the string up to, but not including,
+the final component of the path.
+The path returned will not contain a trailing separator.
+Do not return a root path, return NULL instead.
+The final component trimmed off may be a filename or a directory name.
+If the final component is the only component of the path, return NULL.
+It is the caller's responsibility to free the returned string after it
+is no longer needed.
+@param[in] path Path name
+@return own: parent directory of the path */
+static
+char*
+os_file_get_parent_dir(
+ const char* path)
+{
+ bool has_trailing_slash = false;
+
+ /* Find the offset of the last slash */
+ const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
+
+ if (!last_slash) {
+ /* No slash in the path, return NULL */
+ return(NULL);
+ }
+
+ /* Ok, there is a slash. Is there anything after it? */
+ if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
+ has_trailing_slash = true;
+ }
+
+ /* Reduce repetitive slashes. */
+ while (last_slash > path
+ && last_slash[-1] == OS_PATH_SEPARATOR) {
+ last_slash--;
+ }
+
+ /* Check for the root of a drive. */
+ if (os_file_is_root(path, last_slash)) {
+ return(NULL);
+ }
+
+ /* If a trailing slash prevented the first strrchr() from trimming
+ the last component of the path, trim that component now. */
+ if (has_trailing_slash) {
+ /* Back up to the previous slash. */
+ last_slash--;
+ while (last_slash > path
+ && last_slash[0] != OS_PATH_SEPARATOR) {
+ last_slash--;
+ }
+
+ /* Reduce repetitive slashes. */
+ while (last_slash > path
+ && last_slash[-1] == OS_PATH_SEPARATOR) {
+ last_slash--;
+ }
+ }
+
+ /* Check for the root of a drive. */
+ if (os_file_is_root(path, last_slash)) {
+ return(NULL);
+ }
+
+ if (last_slash - path < 0) {
+ /* Sanity check: it prevents gcc from trying to handle this
+ case, which results in warnings for some optimized builds */
+ return (NULL);
+ }
+
+ /* Non-trivial directory component */
+
+ return(mem_strdupl(path, ulint(last_slash - path)));
+}
+#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
+
+/* Test the function os_file_get_parent_dir. */
+void
+test_os_file_get_parent_dir(
+ const char* child_dir,
+ const char* expected_dir)
+{
+ char* child = mem_strdup(child_dir);
+ char* expected = expected_dir == NULL ? NULL
+ : mem_strdup(expected_dir);
+
+ /* os_file_get_parent_dir() assumes that separators are
+ converted to OS_PATH_SEPARATOR. */
+ os_normalize_path(child);
+ os_normalize_path(expected);
+
+ char* parent = os_file_get_parent_dir(child);
+
+ bool unexpected = (expected == NULL
+ ? (parent != NULL)
+ : (parent == NULL || 0 != strcmp(parent, expected)));
+ if (unexpected) {
+ ib::fatal() << "os_file_get_parent_dir('" << child
+ << "') returned '" << parent
+ << "', instead of '" << expected << "'.";
+ }
+ ut_free(parent);
+ ut_free(child);
+ ut_free(expected);
+}
+
+/* Test the function os_file_get_parent_dir. */
+void
+unit_test_os_file_get_parent_dir()
+{
+ test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
+ test_os_file_get_parent_dir("/usr/", NULL);
+ test_os_file_get_parent_dir("//usr//", NULL);
+ test_os_file_get_parent_dir("usr", NULL);
+ test_os_file_get_parent_dir("usr//", NULL);
+ test_os_file_get_parent_dir("/", NULL);
+ test_os_file_get_parent_dir("//", NULL);
+ test_os_file_get_parent_dir(".", NULL);
+ test_os_file_get_parent_dir("..", NULL);
+# ifdef _WIN32
+ test_os_file_get_parent_dir("D:", NULL);
+ test_os_file_get_parent_dir("D:/", NULL);
+ test_os_file_get_parent_dir("D:\\", NULL);
+ test_os_file_get_parent_dir("D:/data", NULL);
+ test_os_file_get_parent_dir("D:/data/", NULL);
+ test_os_file_get_parent_dir("D:\\data\\", NULL);
+ test_os_file_get_parent_dir("D:///data/////", NULL);
+ test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
+ test_os_file_get_parent_dir("D:/data//a", "D:/data");
+ test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
+ test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
+ test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
+#endif /* _WIN32 */
+}
+#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
+
+
+/** Creates all missing subdirectories along the given path.
+@param[in] path Path name
+@return DB_SUCCESS if OK, otherwise error code. */
+dberr_t
+os_file_create_subdirs_if_needed(
+ const char* path)
+{
+ if (srv_read_only_mode) {
+
+ ib::error()
+ << "read only mode set. Can't create "
+ << "subdirectories '" << path << "'";
+
+ return(DB_READ_ONLY);
+
+ }
+
+ char* subdir = os_file_get_parent_dir(path);
+
+ if (subdir == NULL) {
+ /* subdir is root or cwd, nothing to do */
+ return(DB_SUCCESS);
+ }
+
+ /* Test if subdir exists */
+ os_file_type_t type;
+ bool subdir_exists;
+ bool success = os_file_status(subdir, &subdir_exists, &type);
+
+ if (success && !subdir_exists) {
+
+ /* Subdir does not exist, create it */
+ dberr_t err = os_file_create_subdirs_if_needed(subdir);
+
+ if (err != DB_SUCCESS) {
+
+ ut_free(subdir);
+
+ return(err);
+ }
+
+ success = os_file_create_directory(subdir, false);
+ }
+
+ ut_free(subdir);
+
+ return(success ? DB_SUCCESS : DB_ERROR);
+}
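+
+/* The recursion above works bottom-up. For example (illustrative path):
+given path = "/data/a/b/t1.ibd" with only "/data" existing, the calls
+descend to "/data/a", create it, then create "/data/a/b"; the final
+component "t1.ibd" itself is never created. */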
+
+
+
+/** Do the read/write
+@param[in] request The IO context and type
+@return the number of bytes read/written or negative value on error */
+ssize_t
+SyncFileIO::execute(const IORequest& request)
+{
+ ssize_t n_bytes;
+
+ if (request.is_read()) {
+#ifdef _WIN32
+ n_bytes = tpool::pread(m_fh, m_buf, m_n, m_offset);
+#else
+ n_bytes = pread(m_fh, m_buf, m_n, m_offset);
+#endif
+ } else {
+ ut_ad(request.is_write());
+#ifdef _WIN32
+ n_bytes = tpool::pwrite(m_fh, m_buf, m_n, m_offset);
+#else
+ n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
+#endif
+ }
+
+ return(n_bytes);
+}
+
+#ifndef _WIN32
+/** Free storage space associated with a section of the file.
+@param[in] fh Open file handle
+@param[in] off Starting offset (SEEK_SET)
+@param[in] len Size of the hole
+@return DB_SUCCESS or error code */
+static
+dberr_t
+os_file_punch_hole_posix(
+ os_file_t fh,
+ os_offset_t off,
+ os_offset_t len)
+{
+
+#ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
+ const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+
+ int ret = fallocate(fh, mode, off, len);
+
+ if (ret == 0) {
+ return(DB_SUCCESS);
+ }
+
+ if (errno == ENOTSUP) {
+ return(DB_IO_NO_PUNCH_HOLE);
+ }
+
+ ib::warn()
+ << "fallocate(" << fh
+ << ", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
+ << off << ", " << len << ") returned errno: "
+ << errno;
+
+ return(DB_IO_ERROR);
+
+#elif defined(UNIV_SOLARIS)
+
+ // Solaris could punch holes via the F_FREESP fcntl() command,
+ // but that is not implemented here, so fall through and report
+ // that punching holes is unsupported.
+
+#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
+
+ return(DB_IO_NO_PUNCH_HOLE);
+}
+
+
+
+/** Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + OS_FILE_ERROR_MAX is returned.
+@param[in] report_all_errors true if we want an error message
+ printed of all errors
+@param[in] on_error_silent if true, don't print any diagnostic
+ to the log
+@return error number, or OS error number + OS_FILE_ERROR_MAX */
+static
+ulint
+os_file_get_last_error_low(
+ bool report_all_errors,
+ bool on_error_silent)
+{
+ int err = errno;
+
+ if (err == 0) {
+ return(0);
+ }
+
+ if (report_all_errors
+ || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
+
+ ib::error()
+ << "Operating system error number "
+ << err
+ << " in a file operation.";
+
+ if (err == ENOENT) {
+
+ ib::error()
+ << "The error means the system"
+ " cannot find the path specified.";
+
+ if (srv_is_being_started) {
+
+ ib::error()
+ << "If you are installing InnoDB,"
+ " remember that you must create"
+ " directories yourself, InnoDB"
+ " does not create them.";
+ }
+ } else if (err == EACCES) {
+
+ ib::error()
+ << "The error means mysqld does not have"
+ " the access rights to the directory.";
+
+ } else {
+ if (strerror(err) != NULL) {
+
+ ib::error()
+ << "Error number " << err << " means '"
+ << strerror(err) << "'";
+ }
+
+ ib::info() << OPERATING_SYSTEM_ERROR_MSG;
+ }
+ }
+
+ switch (err) {
+ case ENOSPC:
+ return(OS_FILE_DISK_FULL);
+ case ENOENT:
+ return(OS_FILE_NOT_FOUND);
+ case EEXIST:
+ return(OS_FILE_ALREADY_EXISTS);
+ case EXDEV:
+ case ENOTDIR:
+ case EISDIR:
+ return(OS_FILE_PATH_ERROR);
+ case EAGAIN:
+ if (srv_use_native_aio) {
+ return(OS_FILE_AIO_RESOURCES_RESERVED);
+ }
+ break;
+ case EINTR:
+ if (srv_use_native_aio) {
+ return(OS_FILE_AIO_INTERRUPTED);
+ }
+ break;
+ case EACCES:
+ return(OS_FILE_ACCESS_VIOLATION);
+ }
+ return(OS_FILE_ERROR_MAX + err);
+}
+
+/** Wrapper to fsync() or fdatasync() that retries the call on some errors.
+Returns the value 0 if successful; otherwise the value -1 is returned and
+the global variable errno is set to indicate the error.
+@param[in] file open file handle
+@return 0 if success, -1 otherwise */
+static int os_file_sync_posix(os_file_t file)
+{
+#if !defined(HAVE_FDATASYNC) || HAVE_DECL_FDATASYNC == 0
+ auto func= fsync;
+ auto func_name= "fsync()";
+#else
+ auto func= fdatasync;
+ auto func_name= "fdatasync()";
+#endif
+
+ ulint failures= 0;
+
+ for (;;)
+ {
+ ++os_n_fsyncs;
+
+ int ret= func(file);
+
+ if (ret == 0)
+ return ret;
+
+ switch (errno)
+ {
+ case ENOLCK:
+ ++failures;
+ ut_a(failures < 1000);
+
+ if (!(failures % 100))
+ ib::warn() << func_name << ": No locks available; retrying";
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(200));
+ break;
+
+ case EINTR:
+ ++failures;
+ ut_a(failures < 2000);
+ break;
+
+ default:
+ ib::fatal() << func_name << " returned " << errno;
+ }
+ }
+}
+
+/** Check the existence and type of the given file.
+@param[in] path path name of file
+@param[out] exists true if the file exists
+@param[out] type Type of the file, if it exists
+@return true if call succeeded */
+static
+bool
+os_file_status_posix(
+ const char* path,
+ bool* exists,
+ os_file_type_t* type)
+{
+ struct stat statinfo;
+
+ int ret = stat(path, &statinfo);
+
+ *exists = !ret;
+
+ if (!ret) {
+ /* file exists, everything OK */
+
+ } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
+ /* file does not exist */
+ return(true);
+
+ } else {
+ /* file exists, but stat call failed */
+ os_file_handle_error_no_exit(path, "stat", false);
+ return(false);
+ }
+
+ if (S_ISDIR(statinfo.st_mode)) {
+ *type = OS_FILE_TYPE_DIR;
+
+ } else if (S_ISLNK(statinfo.st_mode)) {
+ *type = OS_FILE_TYPE_LINK;
+
+ } else if (S_ISREG(statinfo.st_mode)) {
+ *type = OS_FILE_TYPE_FILE;
+ } else {
+ *type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ return(true);
+}
+
+/** NOTE! Use the corresponding macro os_file_flush(), not directly this
+function!
+Flushes the write buffers of a given file to the disk.
+@param[in] file handle to a file
+@return true if success */
+bool
+os_file_flush_func(
+ os_file_t file)
+{
+ int ret;
+
+ WAIT_ALLOW_WRITES();
+ ret = os_file_sync_posix(file);
+
+ if (ret == 0) {
+ return(true);
+ }
+
+ /* Since Linux returns EINVAL if the 'file' is actually a raw device,
+ we choose to ignore that error if we are using raw disks */
+
+ if (srv_start_raw_disk_in_use && errno == EINVAL) {
+
+ return(true);
+ }
+
+ ib::error() << "The OS said file flush did not succeed";
+
+ os_file_handle_error(NULL, "flush");
+
+ /* It is a fatal error if a file flush does not succeed, because then
+ the database can get corrupt on disk */
+ ut_error;
+
+ return(false);
+}
+
+/** NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
+A simple function to open or create a file.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in] read_only if true, read only checks are enforced
+@param[out] success true if succeeded, false if error
+@return handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_func(
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success)
+{
+ pfs_os_file_t file;
+
+ *success = false;
+
+ int create_flag;
+ const char* mode_str = NULL;
+
+ if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
+ WAIT_ALLOW_WRITES();
+ }
+
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ if (create_mode == OS_FILE_OPEN) {
+ mode_str = "OPEN";
+
+ if (access_type == OS_FILE_READ_ONLY) {
+
+ create_flag = O_RDONLY;
+
+ } else if (read_only) {
+
+ create_flag = O_RDONLY;
+
+ } else {
+ create_flag = O_RDWR;
+ }
+
+ } else if (read_only) {
+
+ mode_str = "OPEN";
+ create_flag = O_RDONLY;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ mode_str = "CREATE";
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+ } else if (create_mode == OS_FILE_CREATE_PATH) {
+
+ mode_str = "CREATE PATH";
+ /* Create subdirs along the path if needed. */
+
+ *success = (os_file_create_subdirs_if_needed(name)
+ == DB_SUCCESS);
+
+ if (!*success) {
+
+ ib::error()
+ << "Unable to create subdirectories '"
+ << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+ create_mode = OS_FILE_CREATE;
+ } else {
+
+ ib::error()
+ << "Unknown file create mode ("
+ << create_mode
+ << " for file '" << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ bool retry;
+
+ do {
+ file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+
+ if (file == -1) {
+ *success = false;
+ retry = os_file_handle_error(
+ name,
+ create_mode == OS_FILE_OPEN
+ ? "open" : "create");
+ } else {
+ *success = true;
+ retry = false;
+ }
+
+ } while (retry);
+
+ /* This function is always called for data files, so we should
+ disable OS caching (O_DIRECT) here as we do in os_file_create_func(),
+ so that the same file is always opened in the same mode; see the
+ open(2) man page. */
+ if (!srv_read_only_mode
+ && *success
+ && (srv_file_flush_method == SRV_O_DIRECT
+ || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
+
+ os_file_set_nocache(file, name, mode_str);
+ }
+
+#ifdef USE_FILE_LOCK
+ if (!read_only
+ && *success
+ && (access_type == OS_FILE_READ_WRITE)
+ && os_file_lock(file, name)) {
+
+ *success = false;
+ close(file);
+ file = -1;
+ }
+#endif /* USE_FILE_LOCK */
+
+ return(file);
+}
+
+/** This function attempts to create a directory named pathname. The new
+directory gets default permissions. On Unix the permissions are
+(0770 & ~umask). If the directory exists already, nothing is done and
+the call succeeds, unless the fail_if_exists argument is true.
+If another error occurs, such as a permission error, this does not crash,
+but reports the error and returns false.
+@param[in] pathname directory name as null-terminated string
+@param[in] fail_if_exists if true, pre-existing directory is treated as
+ an error.
+@return true if call succeeds, false on error */
+bool
+os_file_create_directory(
+ const char* pathname,
+ bool fail_if_exists)
+{
+ int rcode;
+
+ WAIT_ALLOW_WRITES();
+ rcode = mkdir(pathname, 0770);
+
+ if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
+ /* failure */
+ os_file_handle_error_no_exit(pathname, "mkdir", false);
+
+ return(false);
+ }
+
+ return(true);
+}
+
+/** NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
+Opens an existing file or creates a new one.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
+ is desired, OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use async
+ I/O or unbuffered I/O: look in the function
+ source code for the exact rules
+@param[in] type OS_DATA_FILE or OS_LOG_FILE
+@param[in] read_only true, if read only checks should be enforced
+@param[out] success true if succeeded
+@return handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_func(
+ const char* name,
+ ulint create_mode,
+ ulint purpose,
+ ulint type,
+ bool read_only,
+ bool* success)
+{
+ bool on_error_no_exit;
+ bool on_error_silent;
+
+ *success = false;
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_disk_full",
+ *success = false;
+ errno = ENOSPC;
+ return(OS_FILE_CLOSED);
+ );
+
+ int create_flag;
+ const char* mode_str = NULL;
+
+ on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
+ ? true : false;
+ on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+ ? true : false;
+
+ create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT
+ | OS_FILE_ON_ERROR_SILENT));
+
+ if (create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_RAW
+ || create_mode == OS_FILE_OPEN_RETRY) {
+
+ mode_str = "OPEN";
+
+ create_flag = read_only ? O_RDONLY : O_RDWR;
+
+ } else if (read_only) {
+
+ mode_str = "OPEN";
+
+ create_flag = O_RDONLY;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ mode_str = "CREATE";
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+ } else if (create_mode == OS_FILE_OVERWRITE) {
+
+ mode_str = "OVERWRITE";
+ create_flag = O_RDWR | O_CREAT | O_TRUNC;
+
+ } else {
+ ib::error()
+ << "Unknown file create mode (" << create_mode << ")"
+ << " for file '" << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ ut_a(type == OS_LOG_FILE
+ || type == OS_DATA_FILE
+ || type == OS_DATA_FILE_NO_O_DIRECT);
+
+ ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
+
+ /* We let O_DSYNC only affect log files */
+
+ if (!read_only
+ && type == OS_LOG_FILE
+ && srv_file_flush_method == SRV_O_DSYNC) {
+#ifdef O_DSYNC
+ create_flag |= O_DSYNC;
+#else
+ create_flag |= O_SYNC;
+#endif
+ }
+
+ os_file_t file;
+ bool retry;
+
+ do {
+ file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+
+ if (file == -1) {
+ const char* operation;
+
+ operation = (create_mode == OS_FILE_CREATE
+ && !read_only) ? "create" : "open";
+
+ *success = false;
+
+ if (on_error_no_exit) {
+ retry = os_file_handle_error_no_exit(
+ name, operation, on_error_silent);
+ } else {
+ retry = os_file_handle_error(name, operation);
+ }
+ } else {
+ *success = true;
+ retry = false;
+ }
+
+ } while (retry);
+
+ /* We disable OS caching (O_DIRECT) only on data files */
+ if (!read_only
+ && *success
+ && type != OS_LOG_FILE
+ && type != OS_DATA_FILE_NO_O_DIRECT
+ && (srv_file_flush_method == SRV_O_DIRECT
+ || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
+
+ os_file_set_nocache(file, name, mode_str);
+ }
+
+#ifdef USE_FILE_LOCK
+ if (!read_only
+ && *success
+ && create_mode != OS_FILE_OPEN_RAW
+ && os_file_lock(file, name)) {
+
+ if (create_mode == OS_FILE_OPEN_RETRY) {
+
+ ib::info()
+ << "Retrying to lock the first data file";
+
+ for (int i = 0; i < 100; i++) {
+ os_thread_sleep(1000000);
+
+ if (!os_file_lock(file, name)) {
+ *success = true;
+ return(file);
+ }
+ }
+
+ ib::info()
+ << "Unable to open the first data file";
+ }
+
+ *success = false;
+ close(file);
+ file = -1;
+ }
+#endif /* USE_FILE_LOCK */
+
+ return(file);
+}
+
+/** NOTE! Use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A simple function to open or create a file.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option
+ is used by a backup program reading the file
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_no_error_handling_func(
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success)
+{
+ os_file_t file;
+ int create_flag;
+
+ if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
+ WAIT_ALLOW_WRITES();
+ }
+
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ *success = false;
+
+ if (create_mode == OS_FILE_OPEN) {
+
+ if (access_type == OS_FILE_READ_ONLY) {
+
+ create_flag = O_RDONLY;
+
+ } else if (read_only) {
+
+ create_flag = O_RDONLY;
+
+ } else {
+
+ ut_a(access_type == OS_FILE_READ_WRITE
+ || access_type == OS_FILE_READ_ALLOW_DELETE);
+
+ create_flag = O_RDWR;
+ }
+
+ } else if (read_only) {
+
+ create_flag = O_RDONLY;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+ } else {
+
+ ib::error()
+ << "Unknown file create mode "
+ << create_mode << " for file '" << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+
+ *success = (file != -1);
+
+#ifdef USE_FILE_LOCK
+ if (!read_only
+ && *success
+ && access_type == OS_FILE_READ_WRITE
+ && os_file_lock(file, name)) {
+
+ *success = false;
+ close(file);
+ file = -1;
+
+ }
+#endif /* USE_FILE_LOCK */
+
+ return(file);
+}
+
+/** Deletes a file if it exists. The file has to be closed before calling this.
+@param[in] name file path as a null-terminated string
+@param[out] exist indicates whether the file pre-existed
+@return true if success */
+bool
+os_file_delete_if_exists_func(
+ const char* name,
+ bool* exist)
+{
+ if (exist != NULL) {
+ *exist = true;
+ }
+
+ int ret;
+ WAIT_ALLOW_WRITES();
+
+ ret = unlink(name);
+
+ if (ret != 0 && errno == ENOENT) {
+ if (exist != NULL) {
+ *exist = false;
+ }
+ } else if (ret != 0 && errno != ENOENT) {
+ os_file_handle_error_no_exit(name, "delete", false);
+
+ return(false);
+ }
+
+ return(true);
+}
+
+/** Deletes a file. The file has to be closed before calling this.
+@param[in] name file path as a null-terminated string
+@return true if success */
+bool
+os_file_delete_func(
+ const char* name)
+{
+ int ret;
+ WAIT_ALLOW_WRITES();
+
+ ret = unlink(name);
+
+ if (ret != 0) {
+ os_file_handle_error_no_exit(name, "delete", FALSE);
+
+ return(false);
+ }
+
+ return(true);
+}
+
+/** NOTE! Use the corresponding macro os_file_rename(), not directly this
+function!
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function.
+@param[in] oldpath old file path as a null-terminated string
+@param[in] newpath new file path
+@return true if success */
+bool
+os_file_rename_func(
+ const char* oldpath,
+ const char* newpath)
+{
+#ifdef UNIV_DEBUG
+ os_file_type_t type;
+ bool exists;
+
+ /* New path must not exist. */
+ ut_ad(os_file_status(newpath, &exists, &type));
+ ut_ad(!exists);
+
+ /* Old path must exist. */
+ ut_ad(os_file_status(oldpath, &exists, &type));
+ ut_ad(exists);
+#endif /* UNIV_DEBUG */
+
+ int ret;
+ WAIT_ALLOW_WRITES();
+
+ ret = rename(oldpath, newpath);
+
+ if (ret != 0) {
+ os_file_handle_rename_error(oldpath, newpath);
+
+ return(false);
+ }
+
+ return(true);
+}
+
+/** NOTE! Use the corresponding macro os_file_close(), not directly this
+function!
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@param[in] file Handle to close
+@return true if success */
+bool os_file_close_func(os_file_t file)
+{
+ int ret= close(file);
+
+ if (!ret)
+ return true;
+
+ os_file_handle_error(NULL, "close");
+ return false;
+}
+
+/** Gets a file size.
+@param[in] file handle to an open file
+@return file size, or (os_offset_t) -1 on failure */
+os_offset_t
+os_file_get_size(os_file_t file)
+{
+ struct stat statbuf;
+ return fstat(file, &statbuf) ? os_offset_t(-1) : statbuf.st_size;
+}
+
+/** Gets a file size.
+@param[in] filename Full path to the filename to check
+@return file size if OK, else set m_total_size to ~0 and m_alloc_size to
+ errno */
+os_file_size_t
+os_file_get_size(
+ const char* filename)
+{
+ struct stat s;
+ os_file_size_t file_size;
+
+ int ret = stat(filename, &s);
+
+ if (ret == 0) {
+ file_size.m_total_size = s.st_size;
+ /* st_blocks is in 512 byte sized blocks */
+ file_size.m_alloc_size = s.st_blocks * 512;
+ } else {
+ file_size.m_total_size = ~0U;
+ file_size.m_alloc_size = (os_offset_t) errno;
+ }
+
+ return(file_size);
+}
+
+/** This function returns information about the specified file
+@param[in] path pathname of the file
+@param[out] stat_info information of a file in a directory
+@param[in,out] statinfo buffer in which the stat(2) result is returned
+@param[in] check_rw_perm for testing whether the file can be opened
+ in RW mode
+@param[in] read_only if true read only mode checks are enforced
+@return DB_SUCCESS if all OK */
+static
+dberr_t
+os_file_get_status_posix(
+ const char* path,
+ os_file_stat_t* stat_info,
+ struct stat* statinfo,
+ bool check_rw_perm,
+ bool read_only)
+{
+ int ret = stat(path, statinfo);
+
+ if (ret && (errno == ENOENT || errno == ENOTDIR
+ || errno == ENAMETOOLONG)) {
+ /* file does not exist */
+
+ return(DB_NOT_FOUND);
+
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat", false);
+
+ return(DB_FAIL);
+ }
+
+ switch (statinfo->st_mode & S_IFMT) {
+ case S_IFDIR:
+ stat_info->type = OS_FILE_TYPE_DIR;
+ break;
+ case S_IFLNK:
+ stat_info->type = OS_FILE_TYPE_LINK;
+ break;
+ case S_IFBLK:
+ /* Handle block device as regular file. */
+ case S_IFCHR:
+ /* Handle character device as regular file. */
+ case S_IFREG:
+ stat_info->type = OS_FILE_TYPE_FILE;
+ break;
+ default:
+ stat_info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ stat_info->size = statinfo->st_size;
+ stat_info->block_size = statinfo->st_blksize;
+ stat_info->alloc_size = statinfo->st_blocks * 512;
+
+ if (check_rw_perm
+ && (stat_info->type == OS_FILE_TYPE_FILE
+ || stat_info->type == OS_FILE_TYPE_BLOCK)) {
+
+ stat_info->rw_perm = !access(path, read_only
+ ? R_OK : R_OK | W_OK);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Truncates a file to a specified size in bytes.
+Do nothing if the size to preserve is greater than or equal to the
+current size of the file.
+@param[in] pathname file path
+@param[in] file file to be truncated
+@param[in] size size to preserve in bytes
+@return true if success */
+static
+bool
+os_file_truncate_posix(
+ const char* pathname,
+ os_file_t file,
+ os_offset_t size)
+{
+ int res = ftruncate(file, size);
+
+ if (res == -1) {
+
+ bool retry;
+
+ retry = os_file_handle_error_no_exit(
+ pathname, "truncate", false);
+
+ if (retry) {
+ ib::warn()
+ << "Truncate failed for '"
+ << pathname << "'";
+ }
+ }
+
+ return(res == 0);
+}
+
+/** Truncates a file at its current position.
+@return true if success */
+bool
+os_file_set_eof(
+ FILE* file) /*!< in: file to be truncated */
+{
+ WAIT_ALLOW_WRITES();
+ return(!ftruncate(fileno(file), ftell(file)));
+}
+
+#else /* !_WIN32 */
+
+#include <WinIoCtl.h>
+
+
+
+/** Free storage space associated with a section of the file.
+@param[in] fh Open file handle
+@param[in] off Starting offset (SEEK_SET)
+@param[in] len Size of the hole
+@return DB_SUCCESS on success, DB_IO_NO_PUNCH_HOLE on failure */
+static
+dberr_t
+os_file_punch_hole_win32(
+ os_file_t fh,
+ os_offset_t off,
+ os_offset_t len)
+{
+ FILE_ZERO_DATA_INFORMATION punch;
+
+ punch.FileOffset.QuadPart = off;
+ punch.BeyondFinalZero.QuadPart = off + len;
+
+ /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
+ therefore we pass a dummy parameter. */
+ DWORD temp;
+ BOOL success = os_win32_device_io_control(
+ fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
+ NULL, 0, &temp);
+
+ return(success ? DB_SUCCESS: DB_IO_NO_PUNCH_HOLE);
+}
+
+/** Check the existence and type of the given file.
+@param[in] path path name of file
+@param[out] exists true if the file exists
+@param[out] type Type of the file, if it exists
+@return true if call succeeded */
+static
+bool
+os_file_status_win32(
+ const char* path,
+ bool* exists,
+ os_file_type_t* type)
+{
+ int ret;
+ struct _stat64 statinfo;
+
+ ret = _stat64(path, &statinfo);
+
+ *exists = !ret;
+
+ if (!ret) {
+ /* file exists, everything OK */
+
+ } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
+ /* file does not exist */
+ return(true);
+
+ } else {
+ /* file exists, but stat call failed */
+ os_file_handle_error_no_exit(path, "stat", false);
+ return(false);
+ }
+
+ if (_S_IFDIR & statinfo.st_mode) {
+ *type = OS_FILE_TYPE_DIR;
+
+ } else if (_S_IFREG & statinfo.st_mode) {
+ *type = OS_FILE_TYPE_FILE;
+
+ } else {
+ *type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ return(true);
+}
+
+/* Dynamically load NtFlushBuffersFileEx, used in os_file_flush_func */
+#include <winternl.h>
+typedef NTSTATUS(WINAPI* pNtFlushBuffersFileEx)(
+ HANDLE FileHandle, ULONG Flags, PVOID Parameters, ULONG ParametersSize,
+ PIO_STATUS_BLOCK IoStatusBlock);
+
+static pNtFlushBuffersFileEx my_NtFlushBuffersFileEx
+ = (pNtFlushBuffersFileEx)GetProcAddress(GetModuleHandle("ntdll"),
+ "NtFlushBuffersFileEx");
+
+/** NOTE! Use the corresponding macro os_file_flush(), not directly this
+function!
+Flushes the write buffers of a given file to the disk.
+@param[in] file handle to a file
+@return true if success */
+bool os_file_flush_func(os_file_t file)
+{
+ ++os_n_fsyncs;
+ static bool disable_datasync;
+
+ if (my_NtFlushBuffersFileEx && !disable_datasync)
+ {
+ IO_STATUS_BLOCK iosb{};
+ NTSTATUS status= my_NtFlushBuffersFileEx(
+ file, FLUSH_FLAGS_FILE_DATA_SYNC_ONLY, nullptr, 0, &iosb);
+ if (!status)
+ return true;
+ /*
+ NtFlushBuffersFileEx(FLUSH_FLAGS_FILE_DATA_SYNC_ONLY) might fail
+ unless running on Windows 10 or later, and possibly on non-NTFS
+ filesystems. Switch to using FlushFileBuffers().
+ */
+ disable_datasync= true;
+ }
+
+ if (FlushFileBuffers(file))
+ return true;
+
+ /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
+ actually a raw device, we choose to ignore that error if we are using
+ raw disks */
+ if (srv_start_raw_disk_in_use && GetLastError() == ERROR_INVALID_FUNCTION)
+ return true;
+
+ os_file_handle_error(nullptr, "flush");
+
+ /* It is a fatal error if a file flush does not succeed, because then
+ the database can get corrupt on disk */
+ ut_error;
+
+ return false;
+}
+
+/** Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+then OS error number + OS_FILE_ERROR_MAX is returned.
+@param[in] report_all_errors true if we want an error message printed
+ of all errors
+@param[in] on_error_silent if true, don't print any diagnostic
+ to the log
+@return error number, or OS error number + OS_FILE_ERROR_MAX */
+static
+ulint
+os_file_get_last_error_low(
+ bool report_all_errors,
+ bool on_error_silent)
+{
+ ulint err = (ulint) GetLastError();
+
+ if (err == ERROR_SUCCESS) {
+ return(0);
+ }
+
+ if (report_all_errors
+ || (!on_error_silent
+ && err != ERROR_DISK_FULL
+ && err != ERROR_FILE_EXISTS)) {
+
+ ib::error()
+ << "Operating system error number " << err
+ << " in a file operation.";
+
+ if (err == ERROR_PATH_NOT_FOUND) {
+ ib::error()
+ << "The error means the system"
+ " cannot find the path specified.";
+
+ if (srv_is_being_started) {
+ ib::error()
+ << "If you are installing InnoDB,"
+ " remember that you must create"
+ " directories yourself, InnoDB"
+ " does not create them.";
+ }
+
+ } else if (err == ERROR_ACCESS_DENIED) {
+
+ ib::error()
+ << "The error means mysqld does not have"
+ " the access rights to"
+ " the directory. It may also be"
+ " you have created a subdirectory"
+ " of the same name as a data file.";
+
+ } else if (err == ERROR_SHARING_VIOLATION
+ || err == ERROR_LOCK_VIOLATION) {
+
+ ib::error()
+ << "The error means that another program"
+ " is using InnoDB's files."
+ " This might be a backup or antivirus"
+ " software or another instance"
+ " of MySQL."
+ " Please close it to get rid of this error.";
+
+ } else if (err == ERROR_WORKING_SET_QUOTA
+ || err == ERROR_NO_SYSTEM_RESOURCES) {
+
+ ib::error()
+ << "The error means that there are no"
+ " sufficient system resources or quota to"
+ " complete the operation.";
+
+ } else if (err == ERROR_OPERATION_ABORTED) {
+
+ ib::error()
+ << "The error means that the I/O"
+ " operation has been aborted"
+ " because of either a thread exit"
+ " or an application request."
+ " Retry attempt is made.";
+ } else {
+
+ ib::info() << OPERATING_SYSTEM_ERROR_MSG;
+ }
+ }
+
+ if (err == ERROR_FILE_NOT_FOUND) {
+ return(OS_FILE_NOT_FOUND);
+ } else if (err == ERROR_DISK_FULL) {
+ return(OS_FILE_DISK_FULL);
+ } else if (err == ERROR_FILE_EXISTS) {
+ return(OS_FILE_ALREADY_EXISTS);
+ } else if (err == ERROR_SHARING_VIOLATION
+ || err == ERROR_LOCK_VIOLATION) {
+ return(OS_FILE_SHARING_VIOLATION);
+ } else if (err == ERROR_WORKING_SET_QUOTA
+ || err == ERROR_NO_SYSTEM_RESOURCES) {
+ return(OS_FILE_INSUFFICIENT_RESOURCE);
+ } else if (err == ERROR_OPERATION_ABORTED) {
+ return(OS_FILE_OPERATION_ABORTED);
+ } else if (err == ERROR_ACCESS_DENIED) {
+ return(OS_FILE_ACCESS_VIOLATION);
+ }
+
+ return(OS_FILE_ERROR_MAX + err);
+}
+
+
+/** NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
+A simple function to open or create a file.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded, false if error
+@return handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_func(
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success)
+{
+ os_file_t file;
+
+ *success = false;
+
+ DWORD access;
+ DWORD create_flag;
+ DWORD attributes = 0;
+
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+ ut_ad(srv_operation == SRV_OPERATION_NORMAL);
+
+ if (create_mode == OS_FILE_OPEN) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (read_only) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = CREATE_NEW;
+
+ } else if (create_mode == OS_FILE_CREATE_PATH) {
+
+ /* Create subdirs along the path if needed. */
+ *success = (os_file_create_subdirs_if_needed(name)
+ == DB_SUCCESS);
+
+ if (!*success) {
+
+ ib::error()
+ << "Unable to create subdirectories '"
+ << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ create_flag = CREATE_NEW;
+ create_mode = OS_FILE_CREATE;
+
+ } else {
+
+ ib::error()
+ << "Unknown file create mode ("
+ << create_mode << ") for file '"
+ << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ if (access_type == OS_FILE_READ_ONLY) {
+
+ access = GENERIC_READ;
+
+ } else if (read_only) {
+
+ ib::info()
+ << "Read only mode set. Unable to"
+ " open file '" << name << "' in RW mode, "
+ << "trying RO mode";
+
+ access = GENERIC_READ;
+
+ } else if (access_type == OS_FILE_READ_WRITE) {
+
+ access = GENERIC_READ | GENERIC_WRITE;
+
+ } else {
+
+ ib::error()
+ << "Unknown file access type (" << access_type << ") "
+ "for file '" << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ bool retry;
+
+ do {
+ /* Use default security attributes and no template file. */
+
+ file = CreateFile(
+ (LPCTSTR) name, access,
+ FILE_SHARE_READ | FILE_SHARE_DELETE,
+ NULL, create_flag, attributes, NULL);
+
+ if (file == INVALID_HANDLE_VALUE) {
+
+ *success = false;
+
+ retry = os_file_handle_error(
+ name, create_mode == OS_FILE_OPEN ?
+ "open" : "create");
+
+ } else {
+
+ retry = false;
+
+ *success = true;
+ }
+
+ } while (retry);
+
+ return(file);
+}
+
+/** This function attempts to create a directory named pathname. The new
+directory gets default permissions. On Unix the permissions are
+(0770 & ~umask). If the directory exists already, nothing is done and
+the call succeeds, unless the fail_if_exists argument is true.
+If another error occurs, such as a permission error, this does not crash,
+but reports the error and returns false.
+@param[in] pathname directory name as null-terminated string
+@param[in] fail_if_exists if true, pre-existing directory is treated
+ as an error.
+@return true if call succeeds, false on error */
+bool
+os_file_create_directory(
+ const char* pathname,
+ bool fail_if_exists)
+{
+ BOOL rcode;
+
+ rcode = CreateDirectory((LPCTSTR) pathname, NULL);
+ if (!(rcode != 0
+ || (GetLastError() == ERROR_ALREADY_EXISTS
+ && !fail_if_exists))) {
+
+ os_file_handle_error_no_exit(
+ pathname, "CreateDirectory", false);
+
+ return(false);
+ }
+
+ return(true);
+}
+
+/** Check that IO of specific size is possible for the file
+opened with FILE_FLAG_NO_BUFFERING.
+
+The requirement is that IO is multiple of the disk sector size.
+
+@param[in] file file handle
+@param[in] io_size expected io size
+@return true - unbuffered io of requested size is possible, false otherwise.
+
+@note: this function only works correctly with Windows 8 or later
+(GetFileInformationByHandleEx with FileStorageInfo is only supported there).
+It will return true on earlier Windows versions.
+ */
+static bool unbuffered_io_possible(HANDLE file, size_t io_size)
+{
+ FILE_STORAGE_INFO info;
+ if (GetFileInformationByHandleEx(
+ file, FileStorageInfo, &info, sizeof(info))) {
+ ULONG sector_size = info.LogicalBytesPerSector;
+ if (sector_size)
+ return io_size % sector_size == 0;
+ }
+ return true;
+}
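+
+/* For example (illustrative numbers): on a volume reporting 4096-byte
+logical sectors, unbuffered_io_possible(file, 512) returns false while
+unbuffered_io_possible(file, 8192) returns true, because unbuffered IO
+must be a multiple of the sector size. */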
+
+
+/** NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
+Opens an existing file or creates a new one.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
+ is desired, OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use async
+ I/O or unbuffered I/O: look in the function
+ source code for the exact rules
+@param[in] type OS_DATA_FILE or OS_LOG_FILE
+@param[in] read_only if true, read only mode checks are enforced
+@param[out] success true if succeeded
+@return handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_func(
+ const char* name,
+ ulint create_mode,
+ ulint purpose,
+ ulint type,
+ bool read_only,
+ bool* success)
+{
+ os_file_t file;
+ bool retry;
+ bool on_error_no_exit;
+ bool on_error_silent;
+
+ *success = false;
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_disk_full",
+ *success = false;
+ SetLastError(ERROR_DISK_FULL);
+ return(OS_FILE_CLOSED);
+ );
+
+ DWORD create_flag;
+ DWORD share_mode = read_only
+ ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
+ : FILE_SHARE_READ | FILE_SHARE_DELETE;
+
+ if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
+ WAIT_ALLOW_WRITES();
+ }
+
+ on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
+ ? true : false;
+
+ on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+ ? true : false;
+
+ create_mode &= ~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT);
+
+ if (create_mode == OS_FILE_OPEN_RAW) {
+
+ ut_a(!read_only);
+
+ /* On Windows Physical devices require admin privileges and
+ have to have the write-share mode set. See the remarks
+ section for the CreateFile() function documentation in MSDN. */
+
+ share_mode |= FILE_SHARE_WRITE;
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_RETRY) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (read_only) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = CREATE_NEW;
+
+ } else if (create_mode == OS_FILE_OVERWRITE) {
+
+ create_flag = CREATE_ALWAYS;
+
+ } else {
+ ib::error()
+ << "Unknown file create mode (" << create_mode << ") "
+ << " for file '" << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ DWORD attributes = 0;
+
+ if (purpose == OS_FILE_AIO) {
+
+#ifdef WIN_ASYNC_IO
+ /* If specified, use asynchronous (overlapped) io and no
+ buffering of writes in the OS */
+
+ if (srv_use_native_aio) {
+ attributes |= FILE_FLAG_OVERLAPPED;
+ }
+#endif /* WIN_ASYNC_IO */
+
+ } else if (purpose == OS_FILE_NORMAL) {
+
+ /* Use default setting. */
+
+ } else {
+
+ ib::error()
+ << "Unknown purpose flag (" << purpose << ") "
+ << "while opening file '" << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ if (type == OS_LOG_FILE) {
+ /* There is no reason to use buffered writes for logs. */
+ attributes |= FILE_FLAG_NO_BUFFERING;
+ }
+
+ switch (srv_file_flush_method)
+ {
+ case SRV_O_DSYNC:
+ if (type == OS_LOG_FILE) {
+ /* Map O_DSYNC to FILE_WRITE_THROUGH */
+ attributes |= FILE_FLAG_WRITE_THROUGH;
+ }
+ break;
+
+ case SRV_O_DIRECT_NO_FSYNC:
+ case SRV_O_DIRECT:
+ if (type != OS_DATA_FILE) {
+ break;
+ }
+ /* fall through */
+ case SRV_ALL_O_DIRECT_FSYNC:
+ /* Traditional Windows behavior: no buffering for any files. */
+ if (type != OS_DATA_FILE_NO_O_DIRECT) {
+ attributes |= FILE_FLAG_NO_BUFFERING;
+ }
+ break;
+
+ case SRV_FSYNC:
+ case SRV_LITTLESYNC:
+ break;
+
+ case SRV_NOSYNC:
+ /* Let Windows cache manager handle all writes.*/
+ attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
+ break;
+
+ default:
+ ut_a(false); /* unknown flush mode.*/
+ }
+
+
+ // TODO: File a bug; this looks wrong. The flush-log
+ // parameter is dynamic.
+ if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
+ /* Do not use unbuffered i/o for the log files because
+ value 2 denotes that we do not flush the log at every
+ commit, but only once per second */
+ attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
+ }
+
+
+ DWORD access = GENERIC_READ;
+
+ if (!read_only) {
+ access |= GENERIC_WRITE;
+ }
+
+ for (;;) {
+ const char *operation;
+
+ /* Use default security attributes and no template file. */
+ file = CreateFile(
+ name, access, share_mode, NULL,
+ create_flag, attributes, NULL);
+
+ /* If FILE_FLAG_NO_BUFFERING was set, check whether it can work at
+ all for the expected IO sizes; reopen without the unbuffered flag
+ if it won't. */
+ if ((file != INVALID_HANDLE_VALUE)
+ && (attributes & FILE_FLAG_NO_BUFFERING)
+ && (type == OS_LOG_FILE)
+ && !unbuffered_io_possible(file, OS_FILE_LOG_BLOCK_SIZE)) {
+ ut_a(CloseHandle(file));
+ attributes &= ~FILE_FLAG_NO_BUFFERING;
+ create_flag = OPEN_ALWAYS;
+ continue;
+ }
+
+ *success = (file != INVALID_HANDLE_VALUE);
+ if (*success) {
+ break;
+ }
+
+ operation = (create_mode == OS_FILE_CREATE && !read_only) ?
+ "create" : "open";
+
+ if (on_error_no_exit) {
+ retry = os_file_handle_error_no_exit(
+ name, operation, on_error_silent);
+ }
+ else {
+ retry = os_file_handle_error(name, operation);
+ }
+
+ if (!retry) {
+ break;
+ }
+ }
+
+ if (*success && (attributes & FILE_FLAG_OVERLAPPED) && srv_thread_pool) {
+ srv_thread_pool->bind(file);
+ }
+ return(file);
+}
+
+/** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
+not directly this function!
+A simple function to open or create a file.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file
+@param[in] read_only if true, read only mode checks are enforced
+@param[out] success true if succeeded
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_no_error_handling_func(
+ const char* name,
+ ulint create_mode,
+ ulint access_type,
+ bool read_only,
+ bool* success)
+{
+ os_file_t file;
+
+ *success = false;
+
+ DWORD access;
+ DWORD create_flag;
+ DWORD attributes = 0;
+ DWORD share_mode = read_only
+ ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
+ : FILE_SHARE_READ | FILE_SHARE_DELETE;
+
+ ut_a(name);
+
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ if (create_mode == OS_FILE_OPEN) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (read_only) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = CREATE_NEW;
+
+ } else {
+
+ ib::error()
+ << "Unknown file create mode (" << create_mode << ") "
+ << " for file '" << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ if (access_type == OS_FILE_READ_ONLY) {
+
+ access = GENERIC_READ;
+
+ } else if (read_only) {
+
+ access = GENERIC_READ;
+
+ } else if (access_type == OS_FILE_READ_WRITE) {
+
+ access = GENERIC_READ | GENERIC_WRITE;
+
+ } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
+
+ ut_a(!read_only);
+
+ access = GENERIC_READ;
+
+ /*!< A backup program has to give mysqld the maximum
+ freedom to do what it likes with the file */
+
+ share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE
+ | FILE_SHARE_READ;
+
+ } else {
+
+ ib::error()
+ << "Unknown file access type (" << access_type << ") "
+ << "for file '" << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ file = CreateFile((LPCTSTR) name,
+ access,
+ share_mode,
+ NULL, // Security attributes
+ create_flag,
+ attributes,
+ NULL); // No template file
+
+ *success = (file != INVALID_HANDLE_VALUE);
+
+ return(file);
+}
+
+/** Deletes a file if it exists. The file has to be closed before calling this.
+@param[in] name file path as a null-terminated string
+@param[out] exist indicates whether the file pre-existed
+@return true if success */
+bool
+os_file_delete_if_exists_func(
+ const char* name,
+ bool* exist)
+{
+ ulint count = 0;
+
+ if (exist != NULL) {
+ *exist = true;
+ }
+
+ for (;;) {
+ /* In Windows, deleting an .ibd file may fail if
+ the file is being accessed by an external program,
+ such as a backup tool. */
+
+ bool ret = DeleteFile((LPCTSTR) name);
+
+ if (ret) {
+ return(true);
+ }
+
+ DWORD lasterr = GetLastError();
+
+ if (lasterr == ERROR_FILE_NOT_FOUND
+ || lasterr == ERROR_PATH_NOT_FOUND) {
+
+ /* the file does not exist; this is not an error */
+ if (exist != NULL) {
+ *exist = false;
+ }
+
+ return(true);
+ }
+
+ ++count;
+
+ if (count > 100 && 0 == (count % 10)) {
+
+ /* Print error information */
+ os_file_get_last_error(true);
+
+ ib::warn() << "Delete of file '" << name << "' failed.";
+ }
+
+ /* Sleep for a second */
+ os_thread_sleep(1000000);
+
+ if (count > 2000) {
+
+ return(false);
+ }
+ }
+}
+
+/** Deletes a file. The file has to be closed before calling this.
+@param[in] name File path as NUL terminated string
+@return true if success */
+bool
+os_file_delete_func(
+ const char* name)
+{
+ ulint count = 0;
+
+ for (;;) {
+ /* In Windows, deleting an .ibd file may fail if
+ the file is being accessed by an external program,
+ such as a backup tool. */
+
+ BOOL ret = DeleteFile((LPCTSTR) name);
+
+ if (ret) {
+ return(true);
+ }
+
+ if (GetLastError() == ERROR_FILE_NOT_FOUND) {
+ /* If the file does not exist, we classify this as
+ a 'mild' error and return */
+
+ return(false);
+ }
+
+ ++count;
+
+ if (count > 100 && 0 == (count % 10)) {
+
+ /* print error information */
+ os_file_get_last_error(true);
+
+ ib::warn()
+ << "Cannot delete file '" << name << "'. Is "
+ << "another program accessing it?";
+ }
+
+ /* sleep for a second */
+ os_thread_sleep(1000000);
+
+ if (count > 2000) {
+
+ return(false);
+ }
+ }
+
+ ut_error;
+ return(false);
+}
+
+/** NOTE! Use the corresponding macro os_file_rename(), not directly this
+function!
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function.
+@param[in] oldpath old file path as a null-terminated string
+@param[in] newpath new file path
+@return true if success */
+bool
+os_file_rename_func(
+ const char* oldpath,
+ const char* newpath)
+{
+#ifdef UNIV_DEBUG
+ os_file_type_t type;
+ bool exists;
+
+ /* New path must not exist. */
+ ut_ad(os_file_status(newpath, &exists, &type));
+ ut_ad(!exists);
+
+ /* Old path must exist. */
+ ut_ad(os_file_status(oldpath, &exists, &type));
+ ut_ad(exists);
+#endif /* UNIV_DEBUG */
+
+ if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
+ return(true);
+ }
+
+ os_file_handle_rename_error(oldpath, newpath);
+ return(false);
+}
+
+/** NOTE! Use the corresponding macro os_file_close(), not directly
+this function!
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@param[in,own] file Handle to a file
+@return true if success */
+bool os_file_close_func(os_file_t file)
+{
+ ut_ad(file);
+ if (!CloseHandle(file))
+ {
+ os_file_handle_error(NULL, "close");
+ return false;
+ }
+
+ if (srv_thread_pool)
+ srv_thread_pool->unbind(file);
+ return true;
+}
+
+/** Gets a file size.
+@param[in] file Handle to a file
+@return file size, or (os_offset_t) -1 on failure */
+os_offset_t
+os_file_get_size(
+ os_file_t file)
+{
+ DWORD high;
+ DWORD low = GetFileSize(file, &high);
+
+ if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+ return((os_offset_t) -1);
+ }
+
+ return(os_offset_t(low | (os_offset_t(high) << 32)));
+}
+
+/** Gets a file size.
+@param[in] filename Full path to the filename to check
+@return file size if OK, else set m_total_size to ~0 and m_alloc_size to
+ errno */
+os_file_size_t
+os_file_get_size(
+ const char* filename)
+{
+ struct __stat64 s;
+ os_file_size_t file_size;
+
+ int ret = _stat64(filename, &s);
+
+ if (ret == 0) {
+
+ file_size.m_total_size = s.st_size;
+
+ DWORD low_size;
+ DWORD high_size;
+
+ low_size = GetCompressedFileSize(filename, &high_size);
+
+ if (low_size != INVALID_FILE_SIZE) {
+
+ file_size.m_alloc_size = high_size;
+ file_size.m_alloc_size <<= 32;
+ file_size.m_alloc_size |= low_size;
+
+ } else {
+ ib::error()
+ << "GetCompressedFileSize("
+ << filename << ", ..) failed.";
+
+ file_size.m_alloc_size = (os_offset_t) -1;
+ }
+ } else {
+ file_size.m_total_size = ~0;
+ file_size.m_alloc_size = (os_offset_t) ret;
+ }
+
+ return(file_size);
+}
+
+/** This function returns information about the specified file
+@param[in] path pathname of the file
+@param[out] stat_info information of a file in a directory
+@param[in,out] statinfo buffer in which the _stat64() result is returned
+@param[in] check_rw_perm for testing whether the file can be opened
+ in RW mode
+@param[in] read_only true if the file is opened in read-only mode
+@return DB_SUCCESS if all OK */
+static
+dberr_t
+os_file_get_status_win32(
+ const char* path,
+ os_file_stat_t* stat_info,
+ struct _stat64* statinfo,
+ bool check_rw_perm,
+ bool read_only)
+{
+ int ret = _stat64(path, statinfo);
+
+ if (ret && (errno == ENOENT || errno == ENOTDIR
+ || errno == ENAMETOOLONG)) {
+ /* file does not exist */
+
+ return(DB_NOT_FOUND);
+
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "STAT", false);
+
+ return(DB_FAIL);
+
+ } else if (_S_IFDIR & statinfo->st_mode) {
+
+ stat_info->type = OS_FILE_TYPE_DIR;
+
+ } else if (_S_IFREG & statinfo->st_mode) {
+
+ DWORD access = GENERIC_READ;
+
+ if (!read_only) {
+ access |= GENERIC_WRITE;
+ }
+
+ stat_info->type = OS_FILE_TYPE_FILE;
+
+ /* Check if we can open the file with the required access. */
+
+ if (check_rw_perm) {
+ HANDLE fh;
+
+ fh = CreateFile(
+ (LPCTSTR) path, // File to open
+ access,
+ FILE_SHARE_READ | FILE_SHARE_WRITE
+ | FILE_SHARE_DELETE, // Full sharing
+ NULL, // Default security
+ OPEN_EXISTING, // Existing file only
+ FILE_ATTRIBUTE_NORMAL, // Normal file
+ NULL); // No attr. template
+
+ if (fh == INVALID_HANDLE_VALUE) {
+ stat_info->rw_perm = false;
+ } else {
+ stat_info->rw_perm = true;
+ CloseHandle(fh);
+ }
+ }
+ stat_info->block_size = 0;
+
+ /* What follows is the calculation of the FS block size, which is
+ not important (it is only shown in the I_S InnoDB tables). Any error
+ in calculating it is ignored. */
+ char volname[MAX_PATH];
+ BOOL result = GetVolumePathName(path, volname, MAX_PATH);
+ static bool warned_once = false;
+ if (!result) {
+ if (!warned_once) {
+ ib::warn()
+ << "os_file_get_status_win32: "
+ << "Failed to get the volume path name for: "
+ << path
+ << "- OS error number " << GetLastError();
+ warned_once = true;
+ }
+ return(DB_SUCCESS);
+ }
+
+ DWORD sectorsPerCluster;
+ DWORD bytesPerSector;
+ DWORD numberOfFreeClusters;
+ DWORD totalNumberOfClusters;
+
+ result = GetDiskFreeSpace(
+ (LPCSTR) volname,
+ &sectorsPerCluster,
+ &bytesPerSector,
+ &numberOfFreeClusters,
+ &totalNumberOfClusters);
+
+ if (!result) {
+ if (!warned_once) {
+ ib::warn()
+ << "GetDiskFreeSpace(" << volname << ",...) "
+ << "failed "
+ << "- OS error number " << GetLastError();
+ warned_once = true;
+ }
+ return(DB_SUCCESS);
+ }
+ stat_info->block_size = bytesPerSector * sectorsPerCluster;
+ } else {
+ stat_info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ return(DB_SUCCESS);
+}
+
+#include <versionhelpers.h>
+
+/**
+Sets or clears the sparse flag on a Windows file.
+@param[in] file file handle
+@param[in] is_sparse true to set the flag, false to clear it
+@return true on success, false on error
+*/
+bool os_file_set_sparse_win32(os_file_t file, bool is_sparse)
+{
+ if (!is_sparse && !IsWindows8OrGreater()) {
+ /* Cannot unset the sparse flag on older Windows. Before
+ Windows 8, doing so is documented to produce unpredictable
+ results if there are unallocated ranges in the file. */
+ return false;
+ }
+ DWORD temp;
+ FILE_SET_SPARSE_BUFFER sparse_buffer;
+ sparse_buffer.SetSparse = is_sparse;
+ return os_win32_device_io_control(file,
+ FSCTL_SET_SPARSE, &sparse_buffer, sizeof(sparse_buffer), 0, 0,&temp);
+}
+
+
+/**
+Change file size on Windows.
+
+If the file is extended, the bytes between the old and new EOF
+read as zeros.
+
+If the file is sparse, a "virtual" block is added at the end of
+the allocated area.
+
+If the file is normal, the file system allocates storage.
+
+@param[in] pathname file path
+@param[in] file file handle
+@param[in] size new file size in bytes
+@return true if success */
+bool
+os_file_change_size_win32(
+ const char* pathname,
+ os_file_t file,
+ os_offset_t size)
+{
+ LARGE_INTEGER length;
+
+ length.QuadPart = size;
+
+ BOOL success = SetFilePointerEx(file, length, NULL, FILE_BEGIN);
+
+ if (!success) {
+ os_file_handle_error_no_exit(
+ pathname, "SetFilePointerEx", false);
+ } else {
+ success = SetEndOfFile(file);
+ if (!success) {
+ os_file_handle_error_no_exit(
+ pathname, "SetEndOfFile", false);
+ }
+ }
+ return(success);
+}
+
+/** Truncates a file at its current position.
+@param[in] file Handle to be truncated
+@return true if success */
+bool
+os_file_set_eof(
+ FILE* file)
+{
+ HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
+
+ return(SetEndOfFile(h));
+}
+
+#endif /* !_WIN32 */
+
+/** Does a synchronous read or write, depending upon the type specified.
+In case of partial reads/writes, the function tries
+NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
+@param[in] in_type IO flags
+@param[in] file handle to an open file
+@param[in,out] buf buffer to read into or write from
+@param[in] n number of bytes to read/write, starting from offset
+@param[in] offset file offset from the start where to read/write
+@param[out] err DB_SUCCESS or error code
+@return number of bytes read/written, -1 if error */
+static MY_ATTRIBUTE((warn_unused_result))
+ssize_t
+os_file_io(
+ const IORequest&in_type,
+ os_file_t file,
+ void* buf,
+ ulint n,
+ os_offset_t offset,
+ dberr_t* err)
+{
+ ssize_t original_n = ssize_t(n);
+ IORequest type = in_type;
+ ssize_t bytes_returned = 0;
+
+ SyncFileIO sync_file_io(file, buf, n, offset);
+
+ for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
+
+ ssize_t n_bytes = sync_file_io.execute(type);
+
+ /* Check for a hard error. Not much we can do now. */
+ if (n_bytes < 0) {
+
+ break;
+
+ } else if (n_bytes + bytes_returned == ssize_t(n)) {
+
+ bytes_returned += n_bytes;
+
+ *err = type.maybe_punch_hole(offset, n);
+
+ return(original_n);
+ }
+
+ /* Handle partial read/write. */
+
+ ut_ad(ulint(n_bytes + bytes_returned) < n);
+
+ bytes_returned += n_bytes;
+
+ if (type.type != IORequest::READ_MAYBE_PARTIAL) {
+ const char* op = type.is_read()
+ ? "read" : "written";
+
+ ib::warn()
+ << n
+ << " bytes should have been " << op << ". Only "
+ << bytes_returned
+ << " bytes " << op << ". Retrying"
+ << " for the remaining bytes.";
+ }
+
+ /* Advance the offset and buffer by n_bytes */
+ sync_file_io.advance(n_bytes);
+ }
+
+ *err = DB_IO_ERROR;
+
+ if (type.type != IORequest::READ_MAYBE_PARTIAL) {
+ ib::warn()
+ << "Retry attempts for "
+ << (type.is_read() ? "reading" : "writing")
+ << " partial data failed.";
+ }
+
+ return(bytes_returned);
+}
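+
+/* Worked example of the retry loop above (editorial note; the numbers
+are hypothetical): for a request of n = 1048576 bytes, if the first
+sync_file_io.execute() transfers only 524288 bytes, the buffer and
+offset are advanced by 524288 and the remaining bytes are requested on
+the next of up to NUM_RETRIES_ON_PARTIAL_IO attempts. Only when the
+running total reaches n is the full original_n returned. */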
+
+/** Does a synchronous write operation.
+@param[in] type IO context
+@param[in] file handle to an open file
+@param[in] buf buffer from which to write
+@param[in] n number of bytes to write, starting from offset
+@param[in] offset file offset from the start where to write
+@param[out] err DB_SUCCESS or error code
+@return number of bytes written, -1 if error */
+static MY_ATTRIBUTE((warn_unused_result))
+ssize_t
+os_file_pwrite(
+ const IORequest& type,
+ os_file_t file,
+ const byte* buf,
+ ulint n,
+ os_offset_t offset,
+ dberr_t* err)
+{
+ ut_ad(type.is_write());
+
+ ++os_n_file_writes;
+
+ const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
+ MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+ ssize_t n_bytes = os_file_io(type, file, const_cast<byte*>(buf),
+ n, offset, err);
+ MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+ return(n_bytes);
+}
+
+/** NOTE! Use the corresponding macro os_file_write(), not this
+function directly.
+Requests a synchronous write operation.
+@param[in] type IO flags
+@param[in] name name of the file or path as a null-terminated string
+@param[in] file handle to an open file
+@param[in] buf buffer from which to write
+@param[in] offset file offset from the start where to write
+@param[in] n number of bytes to write, starting from offset
+@return error code
+@retval DB_SUCCESS if the operation succeeded */
+dberr_t
+os_file_write_func(
+ const IORequest& type,
+ const char* name,
+ os_file_t file,
+ const void* buf,
+ os_offset_t offset,
+ ulint n)
+{
+ dberr_t err;
+
+ ut_ad(n > 0);
+
+ WAIT_ALLOW_WRITES();
+
+ ssize_t n_bytes = os_file_pwrite(type, file, (byte*)buf, n, offset, &err);
+
+ if ((ulint) n_bytes != n && !os_has_said_disk_full) {
+
+ ib::error()
+ << "Write to file " << name << " failed at offset "
+ << offset << ", " << n
+ << " bytes should have been written,"
+ " only " << n_bytes << " were written."
+ " Operating system error number " << IF_WIN(GetLastError(),errno) << "."
+ " Check that your OS and file system"
+ " support files of this size."
+ " Check also that the disk is not full"
+ " or a disk quota exceeded.";
+#ifndef _WIN32
+ if (strerror(errno) != NULL) {
+
+ ib::error()
+ << "Error number " << errno
+ << " means '" << strerror(errno) << "'";
+ }
+
+ ib::info() << OPERATING_SYSTEM_ERROR_MSG;
+#endif
+ os_has_said_disk_full = true;
+ }
+
+ return(err);
+}
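+
+/* Illustrative usage sketch (editorial addition; fh and the file name
+are hypothetical). Callers use the os_file_write() macro, which expands
+to os_file_write_func():
+@code
+	byte* page = static_cast<byte*>(
+		aligned_malloc(srv_page_size, srv_page_size));
+	memset(page, 0, srv_page_size);
+	dberr_t err = os_file_write(IORequestWrite, "test.ibd", fh,
+				    page, 0, srv_page_size);
+	ut_a(err == DB_SUCCESS);
+	aligned_free(page);
+@endcode */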
+
+/** Does a synchronous read operation.
+@param[in] type IO flags
+@param[in] file handle to an open file
+@param[out] buf buffer where to read
+@param[in] n number of bytes to read, starting from offset
+@param[in] offset file offset from the start where to read
+@param[out] err DB_SUCCESS or error code
+@return number of bytes read, -1 if error */
+static MY_ATTRIBUTE((warn_unused_result))
+ssize_t
+os_file_pread(
+ const IORequest& type,
+ os_file_t file,
+ void* buf,
+ ulint n,
+ os_offset_t offset,
+ dberr_t* err)
+{
+ ut_ad(type.is_read());
+
+ ++os_n_file_reads;
+
+ const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
+ MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
+ ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err);
+ MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+ return(n_bytes);
+}
+
+/** Requests a synchronous positioned read operation.
+@param[in] type IO flags
+@param[in] file handle to an open file
+@param[out] buf buffer where to read
+@param[in] offset file offset from the start where to read
+@param[in] n number of bytes to read, starting from offset
+@param[out] o number of bytes actually read
+@param[in] exit_on_err if true then exit on error
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+os_file_read_page(
+ const IORequest& type,
+ os_file_t file,
+ void* buf,
+ os_offset_t offset,
+ ulint n,
+ ulint* o,
+ bool exit_on_err)
+{
+ dberr_t err;
+
+ os_bytes_read_since_printout += n;
+
+ ut_ad(n > 0);
+
+ ssize_t n_bytes = os_file_pread(type, file, buf, n, offset, &err);
+
+ if (o) {
+ *o = n_bytes;
+ }
+
+ if (ulint(n_bytes) == n || (err != DB_SUCCESS && !exit_on_err)) {
+ return err;
+ }
+ int os_err = IF_WIN((int)GetLastError(), errno);
+
+ if (!os_file_handle_error_cond_exit(
+ NULL, "read", exit_on_err, false)) {
+ ib::fatal()
+ << "Tried to read " << n << " bytes at offset "
+ << offset << ", but was only able to read " << n_bytes
+ << ".Cannot read from file. OS error number "
+ << os_err << ".";
+ } else {
+ ib::error() << "Tried to read " << n << " bytes at offset "
+ << offset << ", but was only able to read " << n_bytes;
+ }
+ if (err == DB_SUCCESS) {
+ err = DB_IO_ERROR;
+ }
+
+ return err;
+}
+
+/** Retrieves the last error number if an error occurs in a file I/O
+function. The number should be retrieved before any other OS calls
+(because they may overwrite the error number). If the number is not
+known to this program, the OS error number + 100 is returned.
+@param[in] report_all_errors true if we want an error printed
+ for all errors
+@return error number, or OS error number + 100 */
+ulint
+os_file_get_last_error(
+ bool report_all_errors)
+{
+ return(os_file_get_last_error_low(report_all_errors, false));
+}
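+
+/* Illustrative usage sketch (editorial addition; fh is hypothetical):
+fetch the mapped error immediately after a failed file operation,
+before any other OS call can overwrite errno / GetLastError():
+@code
+	if (!os_file_flush(fh)) {
+		ulint err = os_file_get_last_error(true);
+		if (err == OS_FILE_DISK_FULL) {
+			// handle the out-of-space condition
+		}
+	}
+@endcode */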
+
+/** Handle errors for file operations.
+@param[in] name name of a file or NULL
+@param[in] operation operation
+@param[in] should_abort whether to abort on an unknown error
+@param[in] on_error_silent whether to suppress reports of non-fatal errors
+@return true if we should retry the operation */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+os_file_handle_error_cond_exit(
+ const char* name,
+ const char* operation,
+ bool should_abort,
+ bool on_error_silent)
+{
+ ulint err;
+
+ err = os_file_get_last_error_low(false, on_error_silent);
+
+ switch (err) {
+ case OS_FILE_DISK_FULL:
+ /* We only print a warning about disk full once */
+
+ if (os_has_said_disk_full) {
+
+ return(false);
+ }
+
+ /* Disk full error is reported irrespective of the
+ on_error_silent setting. */
+
+ if (name) {
+
+ ib::error()
+ << "Encountered a problem with file '"
+ << name << "'";
+ }
+
+ ib::error()
+ << "Disk is full. Try to clean the disk to free space.";
+
+ os_has_said_disk_full = true;
+
+ return(false);
+
+ case OS_FILE_AIO_RESOURCES_RESERVED:
+ case OS_FILE_AIO_INTERRUPTED:
+
+ return(true);
+
+ case OS_FILE_PATH_ERROR:
+ case OS_FILE_ALREADY_EXISTS:
+ case OS_FILE_ACCESS_VIOLATION:
+
+ return(false);
+
+ case OS_FILE_SHARING_VIOLATION:
+
+ os_thread_sleep(10000000); /* 10 sec */
+ return(true);
+
+ case OS_FILE_OPERATION_ABORTED:
+ case OS_FILE_INSUFFICIENT_RESOURCE:
+
+ os_thread_sleep(100000); /* 100 ms */
+ return(true);
+
+ default:
+
+ /* If it is an operation that can crash on error then it
+ is better to ignore on_error_silent and print an error message
+ to the log. */
+
+ if (should_abort || !on_error_silent) {
+ ib::error() << "File "
+ << (name != NULL ? name : "(unknown)")
+ << ": '" << operation << "'"
+ " returned OS error " << err << "."
+ << (should_abort
+ ? " Cannot continue operation" : "");
+ }
+
+ if (should_abort) {
+ abort();
+ }
+ }
+
+ return(false);
+}
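+
+/* Illustrative sketch of the retry contract above (editorial addition;
+do_unlink() is a hypothetical helper): a true return value means the
+caller should retry the failed operation:
+@code
+	bool retry;
+	do {
+		retry = !do_unlink(name)
+			&& os_file_handle_error_cond_exit(
+				name, "delete", false, false);
+	} while (retry);
+@endcode */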
+
+#ifndef _WIN32
+/** Tries to disable OS caching on an opened file descriptor.
+@param[in] fd file descriptor to alter
+@param[in] file_name file name, used in the diagnostic message
+@param[in] name "open" or "create"; used in the diagnostic
+ message */
+void
+os_file_set_nocache(
+ int fd MY_ATTRIBUTE((unused)),
+ const char* file_name MY_ATTRIBUTE((unused)),
+ const char* operation_name MY_ATTRIBUTE((unused)))
+{
+ /* some versions of Solaris may not have DIRECTIO_ON */
+#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
+ if (directio(fd, DIRECTIO_ON) == -1) {
+ int errno_save = errno;
+
+ ib::error()
+ << "Failed to set DIRECTIO_ON on file "
+ << file_name << "; " << operation_name << ": "
+ << strerror(errno_save) << ","
+ " continuing anyway.";
+ }
+#elif defined(O_DIRECT)
+ if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
+ int errno_save = errno;
+ static bool warning_message_printed = false;
+ if (errno_save == EINVAL) {
+ if (!warning_message_printed) {
+ warning_message_printed = true;
+# ifdef UNIV_LINUX
+ ib::warn()
+ << "Failed to set O_DIRECT on file"
+ << file_name << "; " << operation_name
+ << ": " << strerror(errno_save) << ", "
+ "continuing anyway. O_DIRECT is "
+ "known to result in 'Invalid argument' "
+ "on Linux on tmpfs, "
+ "see MySQL Bug#26662.";
+# else /* UNIV_LINUX */
+ goto short_warning;
+# endif /* UNIV_LINUX */
+ }
+ } else {
+# ifndef UNIV_LINUX
+short_warning:
+# endif
+ ib::warn()
+ << "Failed to set O_DIRECT on file "
+ << file_name << "; " << operation_name
+ << " : " << strerror(errno_save)
+ << ", continuing anyway.";
+ }
+ }
+#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
+}
+
+#endif /* !_WIN32 */
+
+/** Check if the file system supports sparse files.
+@param fh file handle
+@return true if the file system supports sparse files */
+IF_WIN(static,) bool os_is_sparse_file_supported(os_file_t fh)
+{
+#ifdef _WIN32
+ FILE_ATTRIBUTE_TAG_INFO info;
+ if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo,
+ &info, (DWORD)sizeof(info))) {
+ if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) {
+ return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0;
+ }
+ }
+ return false;
+#else
+ /* We don't know the FS block size, use the sector size. The FS
+ will do the magic. */
+ return DB_SUCCESS == os_file_punch_hole_posix(fh, 0, srv_page_size);
+#endif /* _WIN32 */
+}
+
+/** Extend a file.
+
+On Windows, extending a file allocates blocks for the file,
+unless the file is sparse.
+
+On Unix, we will extend the file with ftruncate() if the
+file needs to be sparse. Otherwise posix_fallocate() is used
+when available, and if not, binary zeroes are added to the end
+of the file.
+
+@param[in] name file name
+@param[in] file file handle
+@param[in] size desired file size
+@param[in] is_sparse whether to create a sparse file (no preallocating)
+@return whether the operation succeeded */
+bool
+os_file_set_size(
+ const char* name,
+ os_file_t file,
+ os_offset_t size,
+ bool is_sparse)
+{
+#ifdef _WIN32
+ /* On Windows, changing the file size works well and as expected
+ for both sparse and normal files.
+
+ However, 10.2 up until 10.2.9 made every file sparse in InnoDB,
+ causing NTFS fragmentation issues (MDEV-13941). We try to undo
+ the damage, and unsparse the file. */
+
+ if (!is_sparse && os_is_sparse_file_supported(file)) {
+ if (!os_file_set_sparse_win32(file, false))
+ /* Unsparsing file failed. Fallback to writing binary
+ zeros, to avoid even higher fragmentation.*/
+ goto fallback;
+ }
+
+ return os_file_change_size_win32(name, file, size);
+
+fallback:
+#else
+ struct stat statbuf;
+
+ if (is_sparse) {
+ bool success = !ftruncate(file, size);
+ if (!success) {
+ ib::error() << "ftruncate of file " << name << " to "
+ << size << " bytes failed with error "
+ << errno;
+ }
+ return(success);
+ }
+
+# ifdef HAVE_POSIX_FALLOCATE
+ int err;
+ do {
+ if (fstat(file, &statbuf)) {
+ err = errno;
+ } else {
+ os_offset_t current_size = statbuf.st_size;
+ if (current_size >= size) {
+ return true;
+ }
+ current_size &= ~os_offset_t(statbuf.st_blksize - 1);
+ err = posix_fallocate(file, current_size,
+ size - current_size);
+ }
+ } while (err == EINTR
+ && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);
+
+ switch (err) {
+ case 0:
+ return true;
+ default:
+ ib::error() << "preallocating "
+ << size << " bytes for file " << name
+ << " failed with error " << err;
+ /* fall through */
+ case EINTR:
+ errno = err;
+ return false;
+ case EINVAL:
+ case EOPNOTSUPP:
+ /* fall back to the code below */
+ break;
+ }
+# endif /* HAVE_POSIX_FALLOCATE */
+#endif /* _WIN32 */
+
+#ifdef _WIN32
+ os_offset_t current_size = os_file_get_size(file);
+ FILE_STORAGE_INFO info;
+ if (GetFileInformationByHandleEx(file, FileStorageInfo, &info,
+ sizeof info)) {
+ if (info.LogicalBytesPerSector) {
+ current_size &= ~os_offset_t(info.LogicalBytesPerSector
+ - 1);
+ }
+ }
+#else
+ if (fstat(file, &statbuf)) {
+ return false;
+ }
+ os_offset_t current_size = statbuf.st_size
+ & ~os_offset_t(statbuf.st_blksize - 1);
+#endif
+ if (current_size >= size) {
+ return true;
+ }
+
+ /* Write up to 64 pages at a time (1 MiB with the default
+ 16 KiB page size). */
+ ulint buf_size = ut_min(ulint(64),
+ ulint(size >> srv_page_size_shift))
+ << srv_page_size_shift;
+
+ /* Align the buffer for possible raw i/o */
+ byte* buf = static_cast<byte*>(aligned_malloc(buf_size,
+ srv_page_size));
+ /* Write buffer full of zeros */
+ memset(buf, 0, buf_size);
+
+ while (current_size < size
+ && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
+ ulint n_bytes;
+
+ if (size - current_size < (os_offset_t) buf_size) {
+ n_bytes = (ulint) (size - current_size);
+ } else {
+ n_bytes = buf_size;
+ }
+
+ if (os_file_write(IORequestWrite, name,
+ file, buf, current_size, n_bytes) !=
+ DB_SUCCESS) {
+ break;
+ }
+
+ current_size += n_bytes;
+ }
+
+ aligned_free(buf);
+
+ return(current_size >= size && os_file_flush(file));
+}
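+
+/* Illustrative usage sketch (editorial addition; fh and the 10 MiB
+target size are hypothetical):
+@code
+	os_offset_t desired = os_offset_t(10) << 20;
+	if (!os_file_set_size("test.ibd", fh, desired, false)) {
+		ib::error() << "could not preallocate test.ibd";
+	}
+@endcode */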
+
+/** Truncate a file to a specified size in bytes.
+@param[in] pathname file path
+@param[in] file file to be truncated
+@param[in] size size preserved in bytes
+@param[in] allow_shrink whether to allow the file to become smaller
+@return true if success */
+bool
+os_file_truncate(
+ const char* pathname,
+ os_file_t file,
+ os_offset_t size,
+ bool allow_shrink)
+{
+ if (!allow_shrink) {
+ /* Do nothing if the size preserved is larger than or
+ equal to the current size of file */
+ os_offset_t size_bytes = os_file_get_size(file);
+
+ if (size >= size_bytes) {
+ return(true);
+ }
+ }
+
+#ifdef _WIN32
+ return(os_file_change_size_win32(pathname, file, size));
+#else /* _WIN32 */
+ return(os_file_truncate_posix(pathname, file, size));
+#endif /* _WIN32 */
+}
+
+/** NOTE! Use the corresponding macro os_file_read(), not this
+function directly.
+Requests a synchronous positioned read operation.
+@param[in] type IO flags
+@param[in] file handle to an open file
+@param[out] buf buffer where to read
+@param[in] offset file offset from the start where to read
+@param[in] n number of bytes to read, starting from offset
+@return error code
+@retval DB_SUCCESS if the operation succeeded */
+dberr_t
+os_file_read_func(
+ const IORequest& type,
+ os_file_t file,
+ void* buf,
+ os_offset_t offset,
+ ulint n)
+{
+ return(os_file_read_page(type, file, buf, offset, n, NULL, true));
+}
+
+/** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
+not this function directly.
+Requests a synchronous positioned read operation.
+@param[in] type IO flags
+@param[in] file handle to an open file
+@param[out] buf buffer where to read
+@param[in] offset file offset from the start where to read
+@param[in] n number of bytes to read, starting from offset
+@param[out] o number of bytes actually read
+@return DB_SUCCESS or error code */
+dberr_t
+os_file_read_no_error_handling_func(
+ const IORequest& type,
+ os_file_t file,
+ void* buf,
+ os_offset_t offset,
+ ulint n,
+ ulint* o)
+{
+ return(os_file_read_page(type, file, buf, offset, n, o, false));
+}
+
+/** Check the existence and type of the given file.
+@param[in] path path name of file
+@param[out] exists true if the file exists
+@param[out] type Type of the file, if it exists
+@return true if call succeeded */
+bool
+os_file_status(
+ const char* path,
+ bool* exists,
+ os_file_type_t* type)
+{
+#ifdef _WIN32
+ return(os_file_status_win32(path, exists, type));
+#else
+ return(os_file_status_posix(path, exists, type));
+#endif /* _WIN32 */
+}
+
+/** Free storage space associated with a section of the file.
+@param[in] fh Open file handle
+@param[in] off Starting offset (SEEK_SET)
+@param[in] len Size of the hole
+@return DB_SUCCESS or error code */
+dberr_t
+os_file_punch_hole(
+ os_file_t fh,
+ os_offset_t off,
+ os_offset_t len)
+{
+#ifdef _WIN32
+ return os_file_punch_hole_win32(fh, off, len);
+#else
+ return os_file_punch_hole_posix(fh, off, len);
+#endif /* _WIN32 */
+}
+
+/** Free storage space associated with a section of the file.
+@param off byte offset from the start (SEEK_SET)
+@param len size of the hole in bytes
+@return DB_SUCCESS or error code */
+dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const
+{
+ ulint trim_len = bpage ? bpage->physical_size() - len : 0;
+
+ if (trim_len == 0) {
+ return(DB_SUCCESS);
+ }
+
+ off += len;
+
+ /* Check whether the file system supports punching holes
+ for this tablespace. */
+ if (!node->space->punch_hole) {
+ return DB_IO_NO_PUNCH_HOLE;
+ }
+
+ dberr_t err = os_file_punch_hole(node->handle, off, trim_len);
+
+ if (err == DB_SUCCESS) {
+ srv_stats.page_compressed_trim_op.inc();
+ } else {
+ /* If punch hole is not supported,
+ set space so that it is not used. */
+ if (err == DB_IO_NO_PUNCH_HOLE) {
+ node->space->punch_hole = false;
+ err = DB_SUCCESS;
+ }
+ }
+
+ return (err);
+}
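+
+/* Worked example for IORequest::punch_hole() (editorial note; the
+numbers are hypothetical): for a page with physical_size() = 16384
+whose compressed payload is len = 4096 bytes, trim_len = 12288 and the
+hole is punched at off + 4096 for 12288 bytes, releasing the unused
+tail of the page on disk. */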
+
+/** This function returns information about the specified file.
+@param[in] path pathname of the file
+@param[out] stat_info file information (type, size, timestamps)
+@param[in] check_rw_perm for testing whether the file can be opened
+ in RW mode
+@param[in] read_only true if the file is opened in read-only mode
+@return DB_SUCCESS if all OK */
+dberr_t
+os_file_get_status(
+ const char* path,
+ os_file_stat_t* stat_info,
+ bool check_rw_perm,
+ bool read_only)
+{
+ dberr_t ret;
+
+#ifdef _WIN32
+ struct _stat64 info;
+
+ ret = os_file_get_status_win32(
+ path, stat_info, &info, check_rw_perm, read_only);
+
+#else
+ struct stat info;
+
+ ret = os_file_get_status_posix(
+ path, stat_info, &info, check_rw_perm, read_only);
+
+#endif /* _WIN32 */
+
+ if (ret == DB_SUCCESS) {
+ stat_info->ctime = info.st_ctime;
+ stat_info->atime = info.st_atime;
+ stat_info->mtime = info.st_mtime;
+ stat_info->size = info.st_size;
+ }
+
+ return(ret);
+}
+
+
+extern void fil_aio_callback(const IORequest &request);
+
+static void io_callback(tpool::aiocb* cb)
+{
+ ut_a(cb->m_err == DB_SUCCESS);
+ const IORequest request(*static_cast<const IORequest*>
+ (static_cast<const void*>(cb->m_userdata)));
+ /* Return cb back to cache*/
+ if (cb->m_opcode == tpool::aio_opcode::AIO_PREAD)
+ {
+ ut_ad(read_slots->contains(cb));
+ read_slots->release(cb);
+ }
+ else
+ {
+ ut_ad(write_slots->contains(cb));
+ write_slots->release(cb);
+ }
+
+ fil_aio_callback(request);
+}
+
+#ifdef LINUX_NATIVE_AIO
+/** Checks if the system supports native Linux AIO. On some kernel
+versions where native AIO is supported, it won't work on tmpfs. In
+such cases we can't use native AIO.
+
+@return true if supported, false otherwise */
+static bool is_linux_native_aio_supported()
+{
+ File fd;
+ io_context_t io_ctx;
+ std::string log_file_path = get_log_file_path();
+
+ memset(&io_ctx, 0, sizeof(io_ctx));
+ if (io_setup(1, &io_ctx)) {
+
+ /* The platform does not support native aio. */
+
+ return(false);
+
+ }
+ else if (!srv_read_only_mode) {
+
+ /* Now check if tmpdir supports native aio ops. */
+ fd = mysql_tmpfile("ib");
+
+ if (fd < 0) {
+ ib::warn()
+ << "Unable to create temp file to check"
+ " native AIO support.";
+
+ int ret = io_destroy(io_ctx);
+ ut_a(ret != -EINVAL);
+ ut_ad(ret != -EFAULT);
+
+ return(false);
+ }
+ }
+ else {
+ fd = my_open(log_file_path.c_str(), O_RDONLY | O_CLOEXEC,
+ MYF(0));
+
+ if (fd == -1) {
+
+ ib::warn() << "Unable to open \"" << log_file_path
+ << "\" to check native"
+ << " AIO read support.";
+
+ int ret = io_destroy(io_ctx);
+ ut_a(ret != -EINVAL);
+ ut_ad(ret != -EFAULT);
+
+ return(false);
+ }
+ }
+
+ struct io_event io_event;
+
+ memset(&io_event, 0x0, sizeof(io_event));
+
+ byte* ptr = static_cast<byte*>(aligned_malloc(srv_page_size,
+ srv_page_size));
+
+ struct iocb iocb;
+
+ /* Suppress valgrind warning. */
+ memset(ptr, 0, srv_page_size);
+ memset(&iocb, 0x0, sizeof(iocb));
+
+ struct iocb* p_iocb = &iocb;
+
+ if (!srv_read_only_mode) {
+
+ io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0);
+
+ }
+ else {
+ ut_a(srv_page_size >= 512);
+ io_prep_pread(p_iocb, fd, ptr, 512, 0);
+ }
+
+ int err = io_submit(io_ctx, 1, &p_iocb);
+
+ if (err >= 1) {
+ /* Now collect the submitted IO request. */
+ err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
+ }
+
+ aligned_free(ptr);
+ my_close(fd, MYF(MY_WME));
+
+ switch (err) {
+ case 1:
+ {
+ int ret = io_destroy(io_ctx);
+ ut_a(ret != -EINVAL);
+ ut_ad(ret != -EFAULT);
+
+ return(true);
+ }
+
+ case -EINVAL:
+ case -ENOSYS:
+ ib::warn()
+ << "Linux Native AIO not supported. You can either"
+ " move "
+ << (srv_read_only_mode ? log_file_path : "tmpdir")
+ << " to a file system that supports native"
+ " AIO or you can set innodb_use_native_aio to"
+ " FALSE to avoid this message.";
+
+ /* fall through. */
+ default:
+ ib::warn()
+ << "Linux Native AIO check on "
+ << (srv_read_only_mode ? log_file_path : "tmpdir")
+ << "returned error[" << -err << "]";
+ }
+
+ int ret = io_destroy(io_ctx);
+ ut_a(ret != -EINVAL);
+ ut_ad(ret != -EFAULT);
+
+ return(false);
+}
+#endif
+
+int os_aio_init()
+{
+ int max_write_events= int(srv_n_write_io_threads *
+ OS_AIO_N_PENDING_IOS_PER_THREAD);
+ int max_read_events= int(srv_n_read_io_threads *
+ OS_AIO_N_PENDING_IOS_PER_THREAD);
+ int max_events= max_read_events + max_write_events;
+ int ret;
+#ifdef LINUX_NATIVE_AIO
+ if (srv_use_native_aio && !is_linux_native_aio_supported())
+ goto disable;
+#endif
+
+ ret= srv_thread_pool->configure_aio(srv_use_native_aio, max_events);
+
+#ifdef LINUX_NATIVE_AIO
+ if (ret)
+ {
+ ut_ad(srv_use_native_aio);
+disable:
+ ib::warn() << "Linux Native AIO disabled.";
+ srv_use_native_aio= false;
+ ret= srv_thread_pool->configure_aio(false, max_events);
+ }
+#endif
+
+ if (!ret)
+ {
+ read_slots= new io_slots(max_read_events, srv_n_read_io_threads);
+ write_slots= new io_slots(max_write_events, srv_n_write_io_threads);
+ }
+ return ret;
+}
+
+
+void os_aio_free()
+{
+ srv_thread_pool->disable_aio();
+ delete read_slots;
+ delete write_slots;
+ read_slots= nullptr;
+ write_slots= nullptr;
+}
+
+/** Wait until there are no pending asynchronous writes. */
+static void os_aio_wait_until_no_pending_writes_low()
+{
+ bool notify_wait = write_slots->pending_io_count() > 0;
+
+ if (notify_wait)
+ tpool::tpool_wait_begin();
+
+ write_slots->wait();
+
+ if (notify_wait)
+ tpool::tpool_wait_end();
+}
+
+/** Wait until there are no pending asynchronous writes.
+Only used on FLUSH TABLES...FOR EXPORT. */
+void os_aio_wait_until_no_pending_writes()
+{
+ os_aio_wait_until_no_pending_writes_low();
+ buf_dblwr.wait_flush_buffered_writes();
+}
+
+/** Request a read or write.
+@param type I/O request
+@param buf buffer
+@param offset file offset
+@param n number of bytes
+@retval DB_SUCCESS if request was queued successfully
+@retval DB_IO_ERROR on I/O error */
+dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n)
+{
+ ut_ad(n > 0);
+ ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
+ ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
+ ut_ad(type.is_read() || type.is_write());
+ ut_ad(type.node);
+ ut_ad(type.node->is_open());
+
+#ifdef WIN_ASYNC_IO
+ ut_ad((n & 0xFFFFFFFFUL) == n);
+#endif /* WIN_ASYNC_IO */
+
+#ifdef UNIV_PFS_IO
+ PSI_file_locker_state state;
+ PSI_file_locker* locker= nullptr;
+ register_pfs_file_io_begin(&state, locker, type.node->handle, n,
+ type.is_write()
+ ? PSI_FILE_WRITE : PSI_FILE_READ,
+ __FILE__, __LINE__);
+#endif /* UNIV_PFS_IO */
+ dberr_t err = DB_SUCCESS;
+
+ if (!type.is_async()) {
+ err = type.is_read()
+ ? os_file_read_func(type, type.node->handle,
+ buf, offset, n)
+ : os_file_write_func(type, type.node->name,
+ type.node->handle,
+ buf, offset, n);
+func_exit:
+#ifdef UNIV_PFS_IO
+ register_pfs_file_io_end(locker, n);
+#endif /* UNIV_PFS_IO */
+ return err;
+ }
+
+ if (type.is_read()) {
+ ++os_n_file_reads;
+ } else {
+ ++os_n_file_writes;
+ }
+
+ compile_time_assert(sizeof(IORequest) <= tpool::MAX_AIO_USERDATA_LEN);
+ io_slots* slots= type.is_read() ? read_slots : write_slots;
+ tpool::aiocb* cb = slots->acquire();
+
+ cb->m_buffer = buf;
+ cb->m_callback = (tpool::callback_func)io_callback;
+ cb->m_group = slots->get_task_group();
+ cb->m_fh = type.node->handle.m_file;
+ cb->m_len = (int)n;
+ cb->m_offset = offset;
+ cb->m_opcode = type.is_read() ? tpool::aio_opcode::AIO_PREAD : tpool::aio_opcode::AIO_PWRITE;
+ new (cb->m_userdata) IORequest{type};
+
+ ut_a(reinterpret_cast<size_t>(cb->m_buffer) % OS_FILE_LOG_BLOCK_SIZE
+ == 0);
+ ut_a(cb->m_len % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_a(cb->m_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+ if (srv_thread_pool->submit_io(cb)) {
+ slots->release(cb);
+ os_file_handle_error(type.node->name, type.is_read()
+ ? "aio read" : "aio write");
+ err = DB_IO_ERROR;
+ }
+
+ goto func_exit;
+}
+
+/** Prints info of the aio arrays.
+@param[in,out] file file where to print */
+void
+os_aio_print(FILE* file)
+{
+ time_t current_time;
+ double time_elapsed;
+
+ current_time = time(NULL);
+ time_elapsed = 0.001 + difftime(current_time, os_last_printout);
+
+ fprintf(file,
+ "Pending flushes (fsync) log: " ULINTPF
+ "; buffer pool: " ULINTPF "\n"
+ ULINTPF " OS file reads, "
+ ULINTPF " OS file writes, "
+ ULINTPF " OS fsyncs\n",
+ log_sys.get_pending_flushes(),
+ ulint{fil_n_pending_tablespace_flushes},
+ ulint{os_n_file_reads},
+ os_n_file_writes,
+ os_n_fsyncs);
+
+ const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
+ const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
+
+ if (n_reads != 0 || n_writes != 0) {
+ fprintf(file,
+ ULINTPF " pending reads, " ULINTPF " pending writes\n",
+ n_reads, n_writes);
+ }
+
+ ulint avg_bytes_read = (os_n_file_reads == os_n_file_reads_old)
+ ? 0
+ : os_bytes_read_since_printout
+ / (os_n_file_reads - os_n_file_reads_old);
+
+ fprintf(file,
+ "%.2f reads/s, " ULINTPF " avg bytes/read,"
+ " %.2f writes/s, %.2f fsyncs/s\n",
+ static_cast<double>(os_n_file_reads - os_n_file_reads_old)
+ / time_elapsed,
+ avg_bytes_read,
+ static_cast<double>(os_n_file_writes - os_n_file_writes_old)
+ / time_elapsed,
+ static_cast<double>(os_n_fsyncs - os_n_fsyncs_old)
+ / time_elapsed);
+
+ os_n_file_reads_old = os_n_file_reads;
+ os_n_file_writes_old = os_n_file_writes;
+ os_n_fsyncs_old = os_n_fsyncs;
+ os_bytes_read_since_printout = 0;
+
+ os_last_printout = current_time;
+}
+
+/** Refreshes the statistics used to print per-second averages. */
+void
+os_aio_refresh_stats()
+{
+ os_n_fsyncs_old = os_n_fsyncs;
+
+ os_bytes_read_since_printout = 0;
+
+ os_n_file_reads_old = os_n_file_reads;
+
+ os_n_file_writes_old = os_n_file_writes;
+
+ os_last_printout = time(NULL);
+}
+
+
+/**
+Set the file create umask
+@param[in] umask The umask to use for file creation. */
+void
+os_file_set_umask(ulint umask)
+{
+ os_innodb_umask = umask;
+}
+
+#ifdef _WIN32
+
+/* Checks whether a physical drive is an SSD. */
+static bool is_drive_on_ssd(DWORD nr)
+{
+ char physical_drive_path[32];
+ snprintf(physical_drive_path, sizeof(physical_drive_path),
+ "\\\\.\\PhysicalDrive%lu", nr);
+
+ HANDLE h= CreateFile(physical_drive_path, 0,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+ nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr);
+ if (h == INVALID_HANDLE_VALUE)
+ return false;
+
+ DEVICE_SEEK_PENALTY_DESCRIPTOR seek_penalty;
+ STORAGE_PROPERTY_QUERY storage_query{};
+ storage_query.PropertyId= StorageDeviceSeekPenaltyProperty;
+ storage_query.QueryType= PropertyStandardQuery;
+
+ bool on_ssd= false;
+ DWORD bytes_written;
+ if (DeviceIoControl(h, IOCTL_STORAGE_QUERY_PROPERTY, &storage_query,
+ sizeof storage_query, &seek_penalty, sizeof seek_penalty,
+ &bytes_written, nullptr))
+ {
+ /* SSDs are the drives that report no seek penalty. */
+ on_ssd= !seek_penalty.IncursSeekPenalty;
+ }
+ else
+ {
+ on_ssd= false;
+ }
+ CloseHandle(h);
+ return on_ssd;
+}
+
+/*
+  Checks whether a volume is on SSD, by checking all physical drives
+  in that volume.
+*/
+static bool is_volume_on_ssd(const char *volume_mount_point)
+{
+ char volume_name[MAX_PATH];
+
+ if (!GetVolumeNameForVolumeMountPoint(volume_mount_point, volume_name,
+ array_elements(volume_name)))
+ {
+ /* This can fail, e.g. if the file is on a network share. */
+ return false;
+ }
+
+ /* Chomp the last backslash; this is needed to open the volume. */
+ size_t length= strlen(volume_name);
+ if (length && volume_name[length - 1] == '\\')
+ volume_name[length - 1]= 0;
+
+ /* Open volume handle */
+ HANDLE volume_handle= CreateFile(
+ volume_name, 0, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+ nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr);
+
+ if (volume_handle == INVALID_HANDLE_VALUE)
+ return false;
+
+ /*
+ Enumerate all volume extents and check whether all of them are on SSD
+ */
+
+ /* Anticipate common case where there is only one extent.*/
+ VOLUME_DISK_EXTENTS single_extent;
+
+ /* But also have a place to manage allocated data.*/
+ std::unique_ptr<BYTE[]> lifetime;
+
+ DWORD bytes_written;
+ VOLUME_DISK_EXTENTS *extents= nullptr;
+ if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS,
+ nullptr, 0, &single_extent, sizeof(single_extent),
+ &bytes_written, nullptr))
+ {
+ /* Worked on the first try. Use the preallocated buffer.*/
+ extents= &single_extent;
+ }
+ else
+ {
+ VOLUME_DISK_EXTENTS *last_query= &single_extent;
+ while (GetLastError() == ERROR_MORE_DATA)
+ {
+ DWORD extentCount= last_query->NumberOfDiskExtents;
+ DWORD allocatedSize=
+ FIELD_OFFSET(VOLUME_DISK_EXTENTS, Extents[extentCount]);
+ lifetime.reset(new BYTE[allocatedSize]);
+ last_query= (VOLUME_DISK_EXTENTS *) lifetime.get();
+ if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS,
+ nullptr, 0, last_query, allocatedSize,
+ &bytes_written, nullptr))
+ {
+ extents= last_query;
+ break;
+ }
+ }
+ }
+ CloseHandle(volume_handle);
+ if (!extents)
+ return false;
+
+ for (DWORD i= 0; i < extents->NumberOfDiskExtents; i++)
+ if (!is_drive_on_ssd(extents->Extents[i].DiskNumber))
+ return false;
+
+ return true;
+}
+
+#include <unordered_map>
+static bool is_file_on_ssd(char *file_path)
+{
+ /* Cache of volume_path => volume_info, protected by rwlock.*/
+ static std::unordered_map<std::string, bool> cache;
+ static SRWLOCK lock= SRWLOCK_INIT;
+
+ /* Resolve the volume of the file; this can fail, e.g. on a network
+ drive. */
+ char volume_path[MAX_PATH];
+ if (!GetVolumePathName(file_path, volume_path, array_elements(volume_path)))
+ return false;
+
+ /* Try cached volume info first.*/
+ std::string volume_path_str(volume_path);
+ bool found;
+ bool result;
+ AcquireSRWLockShared(&lock);
+ auto e= cache.find(volume_path_str);
+ if ((found= e != cache.end()))
+ result= e->second;
+ ReleaseSRWLockShared(&lock);
+
+ if (found)
+ return result;
+
+ result= is_volume_on_ssd(volume_path);
+
+ /* Update cache */
+ AcquireSRWLockExclusive(&lock);
+ cache[volume_path_str]= result;
+ ReleaseSRWLockExclusive(&lock);
+ return result;
+}
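+
+/* Design note (editorial addition): the cache above is read-mostly, so
+a shared SRWLOCK acquisition suffices on the hot path, and the
+exclusive lock is only taken on the first access per volume. If two
+threads race on a cache miss, both compute the same value and the
+second insert is a harmless overwrite. */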
+
+#endif
+
+/** Determine some file metadata when creating or reading the file.
+@param file the file that is being created, or OS_FILE_CLOSED */
+void fil_node_t::find_metadata(os_file_t file
+#ifndef _WIN32
+ , struct stat* statbuf
+#endif
+ )
+{
+ if (file == OS_FILE_CLOSED) {
+ file = handle;
+ ut_ad(is_open());
+ }
+
+#ifdef _WIN32 /* FIXME: make this unconditional */
+ if (space->punch_hole) {
+ space->punch_hole = os_is_sparse_file_supported(file);
+ }
+#endif
+
+ /*
+ For the temporary tablespace and during the
+ non-redo-logged adjustments in
+ IMPORT TABLESPACE, we do not care about
+ the atomicity of writes.
+
+ Atomic writes are supported if the file can be used
+ with atomic_writes (not a log file), O_DIRECT is
+ used (tested in ha_innodb.cc) and the file is on a
+ device and file system that supports atomic writes
+ for the given block size.
+ */
+ space->atomic_write_supported = space->purpose == FIL_TYPE_TEMPORARY
+ || space->purpose == FIL_TYPE_IMPORT;
+#ifdef _WIN32
+ on_ssd = is_file_on_ssd(name);
+ FILE_STORAGE_INFO info;
+ if (GetFileInformationByHandleEx(
+ file, FileStorageInfo, &info, sizeof(info))) {
+ block_size = info.PhysicalBytesPerSectorForAtomicity;
+ } else {
+ block_size = 512;
+ }
+#else
+ struct stat sbuf;
+ if (!statbuf && !fstat(file, &sbuf)) {
+ statbuf = &sbuf;
+ }
+ if (statbuf) {
+ block_size = statbuf->st_blksize;
+ }
+ on_ssd = space->atomic_write_supported
+# ifdef UNIV_LINUX
+ || (statbuf && fil_system.is_ssd(statbuf->st_dev))
+# endif
+ ;
+#endif
+ if (!space->atomic_write_supported) {
+ space->atomic_write_supported = atomic_write
+ && srv_use_atomic_writes
+#ifndef _WIN32
+ && my_test_if_atomic_write(file,
+ space->physical_size())
+#else
+ /* On Windows, all single sector writes are atomic,
+ as per WriteFile() documentation on MSDN.
+ We also require SSD for atomic writes, even though
+ technically it is not necessary: the reason is that
+ on hard disks, we still want the benefit from
+ (non-atomic) neighbor page flushing in the buffer
+ pool code. */
+ && srv_page_size == block_size
+ && on_ssd
+#endif
+ ;
+ }
+}
+
+/** Read the first page of a data file.
+@return whether the page was found valid */
+bool fil_node_t::read_page0()
+{
+ ut_ad(mutex_own(&fil_system.mutex));
+ const unsigned psize = space->physical_size();
+#ifndef _WIN32
+ struct stat statbuf;
+ if (fstat(handle, &statbuf)) {
+ return false;
+ }
+ os_offset_t size_bytes = statbuf.st_size;
+#else
+ os_offset_t size_bytes = os_file_get_size(handle);
+ ut_a(size_bytes != (os_offset_t) -1);
+#endif
+ const uint32_t min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
+
+ if (size_bytes < min_size) {
+ ib::error() << "The size of the file " << name
+ << " is only " << size_bytes
+ << " bytes, should be at least " << min_size;
+ return false;
+ }
+
+ page_t *page= static_cast<byte*>(aligned_malloc(psize, psize));
+ if (os_file_read(IORequestRead, handle, page, 0, psize)
+ != DB_SUCCESS) {
+ ib::error() << "Unable to read first page of file " << name;
+corrupted:
+ aligned_free(page);
+ return false;
+ }
+
+ const ulint space_id = memcmp_aligned<2>(
+ FIL_PAGE_SPACE_ID + page,
+ FSP_HEADER_OFFSET + FSP_SPACE_ID + page, 4)
+ ? ULINT_UNDEFINED
+ : mach_read_from_4(FIL_PAGE_SPACE_ID + page);
+ ulint flags = fsp_header_get_flags(page);
+ const uint32_t size = fsp_header_get_field(page, FSP_SIZE);
+ const uint32_t free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT);
+ const uint32_t free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE
+ + page);
+ if (!fil_space_t::is_valid_flags(flags, space->id)) {
+ ulint cflags = fsp_flags_convert_from_101(flags);
+ if (cflags == ULINT_UNDEFINED) {
+invalid:
+ ib::error()
+ << "Expected tablespace flags "
+ << ib::hex(space->flags)
+ << " but found " << ib::hex(flags)
+ << " in the file " << name;
+ goto corrupted;
+ }
+
+ ulint cf = cflags & ~FSP_FLAGS_MEM_MASK;
+ ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK;
+
+ if (!fil_space_t::is_flags_equal(cf, sf)
+ && !fil_space_t::is_flags_equal(sf, cf)) {
+ goto invalid;
+ }
+
+ flags = cflags;
+ }
+
+ ut_ad(!(flags & FSP_FLAGS_MEM_MASK));
+
+ /* Try to read crypt_data from page 0 if it is not yet read. */
+ if (!space->crypt_data) {
+ space->crypt_data = fil_space_read_crypt_data(
+ fil_space_t::zip_size(flags), page);
+ }
+ aligned_free(page);
+
+ if (UNIV_UNLIKELY(space_id != space->id)) {
+ ib::error() << "Expected tablespace id " << space->id
+ << " but found " << space_id
+ << " in the file " << name;
+ return false;
+ }
+
+#ifdef UNIV_LINUX
+ find_metadata(handle, &statbuf);
+#else
+ find_metadata();
+#endif
+ /* Truncate the size to a multiple of extent size. */
+ ulint mask = psize * FSP_EXTENT_SIZE - 1;
+
+ if (size_bytes <= mask) {
+ /* .ibd files start smaller than an
+ extent size. Do not truncate valid data. */
+ } else {
+ size_bytes &= ~os_offset_t(mask);
+ }
+
+ space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags;
+
+ space->punch_hole = space->is_compressed();
+ this->size = uint32_t(size_bytes / psize);
+ space->set_sizes(this->size);
+ ut_ad(space->free_limit == 0 || space->free_limit == free_limit);
+ ut_ad(space->free_len == 0 || space->free_len == free_len);
+ space->size_in_header = size;
+ space->free_limit = free_limit;
+ space->free_len = free_len;
+ return true;
+}
+
+#else
+#include "univ.i"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Normalizes a directory path for the current OS:
+On Windows, we convert '/' to '\', else we convert '\' to '/'.
+@param[in,out] str A null-terminated directory and file path */
+void
+os_normalize_path(
+ char* str)
+{
+ if (str != NULL) {
+ for (; *str; str++) {
+ if (*str == OS_PATH_SEPARATOR_ALT) {
+ *str = OS_PATH_SEPARATOR;
+ }
+ }
+ }
+}
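+
+/* Illustrative example (editorial addition): on Windows, "dir1/file"
+becomes "dir1\file"; elsewhere, "dir1\file" becomes "dir1/file".
+@code
+	char path[] = "dir1/dir2\\file.ibd";
+	os_normalize_path(path);	// separators now match the OS
+@endcode */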
diff --git a/storage/innobase/os/os0thread.cc b/storage/innobase/os/os0thread.cc
new file mode 100644
index 00000000..f3533acf
--- /dev/null
+++ b/storage/innobase/os/os0thread.cc
@@ -0,0 +1,131 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0thread.cc
+The interface to the operating system thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include "srv0srv.h"
+
+#ifdef _WIN32
+bool os_thread_eq(os_thread_id_t a, os_thread_id_t b) { return a == b; }
+void os_thread_yield() { SwitchToThread(); }
+os_thread_id_t os_thread_get_curr_id() { return GetCurrentThreadId(); }
+#endif
+
+/****************************************************************//**
+Creates a new thread of execution. The execution starts from
+the function given.
+NOTE: We count the number of threads in os_thread_exit(). A created
+thread should always use that to exit so that the thread count will be
+decremented.
+We do not return an error code because if there is one, we crash here. */
+os_thread_t os_thread_create(os_thread_func_t func, void *arg)
+{
+ os_thread_id_t new_thread_id;
+
+#ifdef _WIN32
+ HANDLE handle;
+
+ handle = CreateThread(NULL, /* no security attributes */
+ 0, /* default stack size */
+ func,
+ arg,
+ 0, /* thread runs immediately */
+ &new_thread_id);
+
+ if (!handle) {
+ /* If we cannot start a new thread, life has no meaning. */
+ ib::fatal() << "CreateThread returned " << GetLastError();
+ }
+
+ CloseHandle(handle);
+
+ return((os_thread_t)new_thread_id);
+#else /* _WIN32 else */
+
+ pthread_attr_t attr;
+
+ int ret = pthread_attr_init(&attr);
+ if (UNIV_UNLIKELY(ret)) {
+ fprintf(stderr,
+ "InnoDB: Error: pthread_attr_init() returned %d\n",
+ ret);
+ abort();
+ }
+
+ ret = pthread_create(&new_thread_id, &attr, func, arg);
+
+ ut_a(ret == 0);
+
+ pthread_attr_destroy(&attr);
+
+#endif /* not _WIN32 */
+
+ return((os_thread_t)new_thread_id);
+}
+
+/** Detach and terminate the current thread. */
+ATTRIBUTE_NORETURN void os_thread_exit()
+{
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ ib::info() << "Thread exits, id " << os_thread_get_curr_id();
+#endif
+
+#ifdef UNIV_PFS_THREAD
+ pfs_delete_thread();
+#endif
+
+#ifdef _WIN32
+ ExitThread(0);
+#else
+ pthread_detach(pthread_self());
+ pthread_exit(NULL);
+#endif
+}
+
+/*****************************************************************//**
+The thread sleeps at least the time given in microseconds. */
+void
+os_thread_sleep(
+/*============*/
+ ulint tm) /*!< in: time in microseconds */
+{
+#ifdef _WIN32
+ Sleep((DWORD) tm / 1000);
+#elif defined(HAVE_NANOSLEEP)
+ struct timespec t;
+
+ t.tv_sec = tm / 1000000;
+ t.tv_nsec = (tm % 1000000) * 1000;
+
+ ::nanosleep(&t, NULL);
+#else
+ struct timeval t;
+
+ t.tv_sec = tm / 1000000;
+ t.tv_usec = tm % 1000000;
+
+ select(0, NULL, NULL, NULL, &t);
+#endif /* _WIN32 */
+}
diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc
new file mode 100644
index 00000000..cc6b1797
--- /dev/null
+++ b/storage/innobase/page/page0cur.cc
@@ -0,0 +1,2983 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file page/page0cur.cc
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "page0cur.h"
+#include "page0zip.h"
+#include "btr0btr.h"
+#include "mtr0log.h"
+#include "log0recv.h"
+#include "rem0cmp.h"
+#include "gis0rtree.h"
+
+#include <algorithm>
+
+#ifdef BTR_CUR_HASH_ADAPT
+# ifdef UNIV_SEARCH_PERF_STAT
+static ulint page_cur_short_succ;
+# endif /* UNIV_SEARCH_PERF_STAT */
+
+/** Try a search shortcut based on the last insert.
+@param[in] block index page
+@param[in] index index tree
+@param[in] tuple search key
+@param[in,out] iup_matched_fields already matched fields in the
+upper limit record
+@param[in,out] ilow_matched_fields already matched fields in the
+lower limit record
+@param[out] cursor page cursor
+@return true on success */
+UNIV_INLINE
+bool
+page_cur_try_search_shortcut(
+ const buf_block_t* block,
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ ulint* iup_matched_fields,
+ ulint* ilow_matched_fields,
+ page_cur_t* cursor)
+{
+ const rec_t* rec;
+ const rec_t* next_rec;
+ ulint low_match;
+ ulint up_match;
+ ibool success = FALSE;
+ const page_t* page = buf_block_get_frame(block);
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(page_is_leaf(page));
+
+ rec = page_header_get_ptr(page, PAGE_LAST_INSERT);
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ dtuple_get_n_fields(tuple), &heap);
+
+ ut_ad(rec);
+ ut_ad(page_rec_is_user_rec(rec));
+
+ low_match = up_match = std::min(*ilow_matched_fields,
+ *iup_matched_fields);
+
+ if (cmp_dtuple_rec_with_match(tuple, rec, offsets, &low_match) < 0) {
+ goto exit_func;
+ }
+
+ next_rec = page_rec_get_next_const(rec);
+ if (!page_rec_is_supremum(next_rec)) {
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ index->n_core_fields,
+ dtuple_get_n_fields(tuple), &heap);
+
+ if (cmp_dtuple_rec_with_match(tuple, next_rec, offsets,
+ &up_match) >= 0) {
+ goto exit_func;
+ }
+
+ *iup_matched_fields = up_match;
+ }
+
+ page_cur_position(rec, block, cursor);
+
+ *ilow_matched_fields = low_match;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ page_cur_short_succ++;
+#endif
+ success = TRUE;
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(success);
+}
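+
+/* Editorial note: the shortcut above positions the cursor on the
+PAGE_LAST_INSERT record only when tuple >= that record and tuple < its
+successor, i.e. exactly the PAGE_CUR_LE position; in every other case
+it reports failure and page_cur_search_with_match() falls back to the
+full binary search. */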
+
+/** Try a search shortcut based on the last insert.
+@param[in] block index page
+@param[in] index index tree
+@param[in] tuple search key
+@param[in,out] iup_matched_fields already matched fields in the
+upper limit record
+@param[in,out] iup_matched_bytes already matched bytes in the
+first partially matched field in the upper limit record
+@param[in,out] ilow_matched_fields already matched fields in the
+lower limit record
+@param[in,out] ilow_matched_bytes already matched bytes in the
+first partially matched field in the lower limit record
+@param[out] cursor page cursor
+@return true on success */
+UNIV_INLINE
+bool
+page_cur_try_search_shortcut_bytes(
+ const buf_block_t* block,
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ ulint* iup_matched_fields,
+ ulint* iup_matched_bytes,
+ ulint* ilow_matched_fields,
+ ulint* ilow_matched_bytes,
+ page_cur_t* cursor)
+{
+ const rec_t* rec;
+ const rec_t* next_rec;
+ ulint low_match;
+ ulint low_bytes;
+ ulint up_match;
+ ulint up_bytes;
+ ibool success = FALSE;
+ const page_t* page = buf_block_get_frame(block);
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(page_is_leaf(page));
+
+ rec = page_header_get_ptr(page, PAGE_LAST_INSERT);
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ dtuple_get_n_fields(tuple), &heap);
+
+ ut_ad(rec);
+ ut_ad(page_rec_is_user_rec(rec));
+ if (ut_pair_cmp(*ilow_matched_fields, *ilow_matched_bytes,
+ *iup_matched_fields, *iup_matched_bytes) < 0) {
+ up_match = low_match = *ilow_matched_fields;
+ up_bytes = low_bytes = *ilow_matched_bytes;
+ } else {
+ up_match = low_match = *iup_matched_fields;
+ up_bytes = low_bytes = *iup_matched_bytes;
+ }
+
+ if (cmp_dtuple_rec_with_match_bytes(
+ tuple, rec, index, offsets, &low_match, &low_bytes) < 0) {
+ goto exit_func;
+ }
+
+ next_rec = page_rec_get_next_const(rec);
+ if (!page_rec_is_supremum(next_rec)) {
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ index->n_core_fields,
+ dtuple_get_n_fields(tuple), &heap);
+
+ if (cmp_dtuple_rec_with_match_bytes(
+ tuple, next_rec, index, offsets,
+ &up_match, &up_bytes)
+ >= 0) {
+ goto exit_func;
+ }
+
+ *iup_matched_fields = up_match;
+ *iup_matched_bytes = up_bytes;
+ }
+
+ page_cur_position(rec, block, cursor);
+
+ *ilow_matched_fields = low_match;
+ *ilow_matched_bytes = low_bytes;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ page_cur_short_succ++;
+#endif
+ success = TRUE;
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(success);
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+/****************************************************************//**
+Checks if the nth field in a record is a character type field which extends
+the nth field in tuple, i.e., the field is longer or equal in length and has
+common first characters.
+@return TRUE if rec field extends tuple field */
+static
+ibool
+page_cur_rec_field_extends(
+/*=======================*/
+ const dtuple_t* tuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: record */
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: compare nth field */
+{
+ const dtype_t* type;
+ const dfield_t* dfield;
+ const byte* rec_f;
+ ulint rec_f_len;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ dfield = dtuple_get_nth_field(tuple, n);
+
+ type = dfield_get_type(dfield);
+
+ rec_f = rec_get_nth_field(rec, offsets, n, &rec_f_len);
+
+ if (type->mtype == DATA_VARCHAR
+ || type->mtype == DATA_CHAR
+ || type->mtype == DATA_FIXBINARY
+ || type->mtype == DATA_BINARY
+ || type->mtype == DATA_BLOB
+ || DATA_GEOMETRY_MTYPE(type->mtype)
+ || type->mtype == DATA_VARMYSQL
+ || type->mtype == DATA_MYSQL) {
+
+ if (dfield_get_len(dfield) != UNIV_SQL_NULL
+ && rec_f_len != UNIV_SQL_NULL
+ && rec_f_len >= dfield_get_len(dfield)
+ && !cmp_data_data(type->mtype, type->prtype,
+ dfield_get_data(dfield),
+ dfield_get_len(dfield),
+ rec_f, dfield_get_len(dfield))) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+
+/****************************************************************//**
+Searches the right position for a page cursor. */
+void
+page_cur_search_with_match(
+/*=======================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in/out: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ ulint* iup_matched_fields,
+ /*!< in/out: already matched
+ fields in upper limit record */
+ ulint* ilow_matched_fields,
+ /*!< in/out: already matched
+ fields in lower limit record */
+ page_cur_t* cursor, /*!< out: page cursor */
+ rtr_info_t* rtr_info)/*!< in/out: rtree search stack */
+{
+ ulint up;
+ ulint low;
+ ulint mid;
+ const page_t* page;
+ const page_dir_slot_t* slot;
+ const rec_t* up_rec;
+ const rec_t* low_rec;
+ const rec_t* mid_rec;
+ ulint up_matched_fields;
+ ulint low_matched_fields;
+ ulint cur_matched_fields;
+ int cmp;
+#ifdef UNIV_ZIP_DEBUG
+ const page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+#endif /* UNIV_ZIP_DEBUG */
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(dtuple_validate(tuple));
+#ifdef UNIV_DEBUG
+# ifdef PAGE_CUR_DBG
+ if (mode != PAGE_CUR_DBG)
+# endif /* PAGE_CUR_DBG */
+# ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode != PAGE_CUR_LE_OR_EXTENDS)
+# endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+ || mode == PAGE_CUR_G || mode == PAGE_CUR_GE
+ || dict_index_is_spatial(index));
+#endif /* UNIV_DEBUG */
+ page = buf_block_get_frame(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ ut_d(page_check_dir(page));
+ const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (n_core
+ && page_get_direction(page) == PAGE_RIGHT
+ && page_header_get_offs(page, PAGE_LAST_INSERT)
+ && mode == PAGE_CUR_LE
+ && !index->is_spatial()
+ && page_header_get_field(page, PAGE_N_DIRECTION) > 3
+ && page_cur_try_search_shortcut(
+ block, index, tuple,
+ iup_matched_fields, ilow_matched_fields, cursor)) {
+ return;
+ }
+# ifdef PAGE_CUR_DBG
+ if (mode == PAGE_CUR_DBG) {
+ mode = PAGE_CUR_LE;
+ }
+# endif
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /* If the mode is for R-tree indexes, use the special MBR
+ related compare functions */
+ if (index->is_spatial() && mode > PAGE_CUR_LE) {
+ /* For leaf level insert, we still use the traditional
+ compare function for now */
+ if (mode == PAGE_CUR_RTREE_INSERT && n_core) {
+ mode = PAGE_CUR_LE;
+ } else {
+ rtr_cur_search_with_match(
+ block, (dict_index_t*)index, tuple, mode,
+ cursor, rtr_info);
+ return;
+ }
+ }
+
+ /* The following flag does not work for non-latin1 char sets because
+ cmp_full_field does not tell how many bytes matched */
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ ut_a(mode != PAGE_CUR_LE_OR_EXTENDS);
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+
+ /* If mode PAGE_CUR_G is specified, we are trying to position the
+ cursor to answer a query of the form "tuple < X", where tuple is
+ the input parameter, and X denotes an arbitrary physical record on
+ the page. We want to position the cursor on the first X which
+ satisfies the condition. */
+
+ up_matched_fields = *iup_matched_fields;
+ low_matched_fields = *ilow_matched_fields;
+
+ /* Perform binary search. First the search is done through the page
+ directory, after that as a linear search in the list of records
+ owned by the upper limit directory slot. */
+
+ low = 0;
+ up = ulint(page_dir_get_n_slots(page)) - 1;
+
+ /* Perform binary search until the lower and upper limit directory
+ slots come within distance 1 of each other */
+
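+ /* Editorial note (illustrative; the numbers are hypothetical): with
+ 100 directory slots this loop needs about 7 halvings to bring up and
+ low within distance 1; each slot owns only a few records, so the
+ final linear scan below stays short. */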
+ while (up - low > 1) {
+ mid = (low + up) / 2;
+ slot = page_dir_get_nth_slot(page, mid);
+ mid_rec = page_dir_slot_get_rec(slot);
+
+ cur_matched_fields = std::min(low_matched_fields,
+ up_matched_fields);
+
+ offsets = offsets_;
+ offsets = rec_get_offsets(
+ mid_rec, index, offsets, n_core,
+ dtuple_get_n_fields_cmp(tuple), &heap);
+
+ cmp = cmp_dtuple_rec_with_match(
+ tuple, mid_rec, offsets, &cur_matched_fields);
+
+ if (cmp > 0) {
+low_slot_match:
+ low = mid;
+ low_matched_fields = cur_matched_fields;
+
+ } else if (cmp) {
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && page_cur_rec_field_extends(
+ tuple, mid_rec, offsets,
+ cur_matched_fields)) {
+
+ goto low_slot_match;
+ }
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_slot_match:
+ up = mid;
+ up_matched_fields = cur_matched_fields;
+
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ) {
+ goto low_slot_match;
+ } else {
+
+ goto up_slot_match;
+ }
+ }
+
+ slot = page_dir_get_nth_slot(page, low);
+ low_rec = page_dir_slot_get_rec(slot);
+ slot = page_dir_get_nth_slot(page, up);
+ up_rec = page_dir_slot_get_rec(slot);
+
+ /* Perform linear search until the upper and lower records come
+ within distance 1 of each other. */
+
+ while (page_rec_get_next_const(low_rec) != up_rec) {
+
+ mid_rec = page_rec_get_next_const(low_rec);
+
+ cur_matched_fields = std::min(low_matched_fields,
+ up_matched_fields);
+
+ offsets = offsets_;
+ offsets = rec_get_offsets(
+ mid_rec, index, offsets, n_core,
+ dtuple_get_n_fields_cmp(tuple), &heap);
+
+ cmp = cmp_dtuple_rec_with_match(
+ tuple, mid_rec, offsets, &cur_matched_fields);
+
+ if (cmp > 0) {
+low_rec_match:
+ low_rec = mid_rec;
+ low_matched_fields = cur_matched_fields;
+
+ } else if (cmp) {
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && page_cur_rec_field_extends(
+ tuple, mid_rec, offsets,
+ cur_matched_fields)) {
+
+ goto low_rec_match;
+ }
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_rec_match:
+ up_rec = mid_rec;
+ up_matched_fields = cur_matched_fields;
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ) {
+ if (!cmp && !cur_matched_fields) {
+#ifdef UNIV_DEBUG
+			/* We got a match, but cur_matched_fields is 0;
+			the record must carry REC_INFO_MIN_REC_FLAG. */
+			ulint rec_info = rec_get_info_bits(mid_rec,
+				rec_offs_comp(offsets));
+			ut_ad(rec_info & REC_INFO_MIN_REC_FLAG);
+			ut_ad(!page_has_prev(page));
+#endif
+
+ cur_matched_fields = dtuple_get_n_fields_cmp(tuple);
+ }
+
+ goto low_rec_match;
+ } else {
+
+ goto up_rec_match;
+ }
+ }
+
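+	/* PAGE_CUR_G and PAGE_CUR_GE position the cursor on up_rec, the
+	first record satisfying "tuple < X" or "tuple <= X" respectively;
+	PAGE_CUR_L and PAGE_CUR_LE position it on low_rec, the last record
+	with "X < tuple" or "X <= tuple". */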
+ if (mode <= PAGE_CUR_GE) {
+ page_cur_position(up_rec, block, cursor);
+ } else {
+ page_cur_position(low_rec, block, cursor);
+ }
+
+ *iup_matched_fields = up_matched_fields;
+ *ilow_matched_fields = low_matched_fields;
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Search the right position for a page cursor.
+@param[in] block buffer block
+@param[in] index index tree
+@param[in] tuple key to be searched for
+@param[in] mode search mode
+@param[in,out] iup_matched_fields already matched fields in the
+upper limit record
+@param[in,out] iup_matched_bytes already matched bytes in the
+first partially matched field in the upper limit record
+@param[in,out] ilow_matched_fields already matched fields in the
+lower limit record
+@param[in,out] ilow_matched_bytes already matched bytes in the
+first partially matched field in the lower limit record
+@param[out] cursor page cursor */
+void
+page_cur_search_with_match_bytes(
+ const buf_block_t* block,
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ page_cur_mode_t mode,
+ ulint* iup_matched_fields,
+ ulint* iup_matched_bytes,
+ ulint* ilow_matched_fields,
+ ulint* ilow_matched_bytes,
+ page_cur_t* cursor)
+{
+ ulint up;
+ ulint low;
+ ulint mid;
+ const page_t* page;
+ const page_dir_slot_t* slot;
+ const rec_t* up_rec;
+ const rec_t* low_rec;
+ const rec_t* mid_rec;
+ ulint up_matched_fields;
+ ulint up_matched_bytes;
+ ulint low_matched_fields;
+ ulint low_matched_bytes;
+ ulint cur_matched_fields;
+ ulint cur_matched_bytes;
+ int cmp;
+#ifdef UNIV_ZIP_DEBUG
+ const page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+#endif /* UNIV_ZIP_DEBUG */
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(dtuple_validate(tuple));
+ ut_ad(!(tuple->info_bits & REC_INFO_MIN_REC_FLAG));
+#ifdef UNIV_DEBUG
+# ifdef PAGE_CUR_DBG
+ if (mode != PAGE_CUR_DBG)
+# endif /* PAGE_CUR_DBG */
+# ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode != PAGE_CUR_LE_OR_EXTENDS)
+# endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+ || mode == PAGE_CUR_G || mode == PAGE_CUR_GE);
+#endif /* UNIV_DEBUG */
+ page = buf_block_get_frame(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ ut_d(page_check_dir(page));
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (page_is_leaf(page)
+ && page_get_direction(page) == PAGE_RIGHT
+ && page_header_get_offs(page, PAGE_LAST_INSERT)
+ && mode == PAGE_CUR_LE
+ && page_header_get_field(page, PAGE_N_DIRECTION) > 3
+ && page_cur_try_search_shortcut_bytes(
+ block, index, tuple,
+ iup_matched_fields, iup_matched_bytes,
+ ilow_matched_fields, ilow_matched_bytes,
+ cursor)) {
+ return;
+ }
+# ifdef PAGE_CUR_DBG
+ if (mode == PAGE_CUR_DBG) {
+ mode = PAGE_CUR_LE;
+ }
+# endif
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /* The following flag does not work for non-latin1 char sets because
+ cmp_full_field does not tell how many bytes matched */
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ ut_a(mode != PAGE_CUR_LE_OR_EXTENDS);
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+
+ /* If mode PAGE_CUR_G is specified, we are trying to position the
+ cursor to answer a query of the form "tuple < X", where tuple is
+ the input parameter, and X denotes an arbitrary physical record on
+ the page. We want to position the cursor on the first X which
+ satisfies the condition. */
+
+ up_matched_fields = *iup_matched_fields;
+ up_matched_bytes = *iup_matched_bytes;
+ low_matched_fields = *ilow_matched_fields;
+ low_matched_bytes = *ilow_matched_bytes;
+
+	/* Perform binary search. First, the search is done through the page
+	directory; after that, a linear search is performed within the list
+	of records owned by the upper limit directory slot. */
+
+ low = 0;
+ up = ulint(page_dir_get_n_slots(page)) - 1;
+
+	/* Perform binary search until the lower and upper limit directory
+	slots are within a distance of 1 of each other */
+ const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
+
+ while (up - low > 1) {
+ mid = (low + up) / 2;
+ slot = page_dir_get_nth_slot(page, mid);
+ mid_rec = page_dir_slot_get_rec(slot);
+
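+		/* ut_pair_min() takes the smaller of the two (fields,
+		bytes) prefixes, compared lexicographically: the byte count
+		only matters within the first field that did not match in
+		full. */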
+ ut_pair_min(&cur_matched_fields, &cur_matched_bytes,
+ low_matched_fields, low_matched_bytes,
+ up_matched_fields, up_matched_bytes);
+
+ offsets = rec_get_offsets(
+ mid_rec, index, offsets_, n_core,
+ dtuple_get_n_fields_cmp(tuple), &heap);
+
+ cmp = cmp_dtuple_rec_with_match_bytes(
+ tuple, mid_rec, index, offsets,
+ &cur_matched_fields, &cur_matched_bytes);
+
+ if (cmp > 0) {
+low_slot_match:
+ low = mid;
+ low_matched_fields = cur_matched_fields;
+ low_matched_bytes = cur_matched_bytes;
+
+ } else if (cmp) {
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && page_cur_rec_field_extends(
+ tuple, mid_rec, offsets,
+ cur_matched_fields)) {
+
+ goto low_slot_match;
+ }
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_slot_match:
+ up = mid;
+ up_matched_fields = cur_matched_fields;
+ up_matched_bytes = cur_matched_bytes;
+
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ) {
+ goto low_slot_match;
+ } else {
+
+ goto up_slot_match;
+ }
+ }
+
+ slot = page_dir_get_nth_slot(page, low);
+ low_rec = page_dir_slot_get_rec(slot);
+ slot = page_dir_get_nth_slot(page, up);
+ up_rec = page_dir_slot_get_rec(slot);
+
+	/* Perform a linear search until the lower record immediately
+	precedes the upper record. */
+
+ while (page_rec_get_next_const(low_rec) != up_rec) {
+
+ mid_rec = page_rec_get_next_const(low_rec);
+
+ ut_pair_min(&cur_matched_fields, &cur_matched_bytes,
+ low_matched_fields, low_matched_bytes,
+ up_matched_fields, up_matched_bytes);
+
+ if (UNIV_UNLIKELY(rec_get_info_bits(
+ mid_rec,
+ dict_table_is_comp(index->table))
+ & REC_INFO_MIN_REC_FLAG)) {
+ ut_ad(!page_has_prev(page_align(mid_rec)));
+ ut_ad(!page_rec_is_leaf(mid_rec)
+ || rec_is_metadata(mid_rec, *index));
+ cmp = 1;
+ goto low_rec_match;
+ }
+
+ offsets = rec_get_offsets(
+ mid_rec, index, offsets_, n_core,
+ dtuple_get_n_fields_cmp(tuple), &heap);
+
+ cmp = cmp_dtuple_rec_with_match_bytes(
+ tuple, mid_rec, index, offsets,
+ &cur_matched_fields, &cur_matched_bytes);
+
+ if (cmp > 0) {
+low_rec_match:
+ low_rec = mid_rec;
+ low_matched_fields = cur_matched_fields;
+ low_matched_bytes = cur_matched_bytes;
+
+ } else if (cmp) {
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && page_cur_rec_field_extends(
+ tuple, mid_rec, offsets,
+ cur_matched_fields)) {
+
+ goto low_rec_match;
+ }
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_rec_match:
+ up_rec = mid_rec;
+ up_matched_fields = cur_matched_fields;
+ up_matched_bytes = cur_matched_bytes;
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ) {
+ goto low_rec_match;
+ } else {
+
+ goto up_rec_match;
+ }
+ }
+
+ if (mode <= PAGE_CUR_GE) {
+ page_cur_position(up_rec, block, cursor);
+ } else {
+ page_cur_position(low_rec, block, cursor);
+ }
+
+ *iup_matched_fields = up_matched_fields;
+ *iup_matched_bytes = up_matched_bytes;
+ *ilow_matched_fields = low_matched_fields;
+ *ilow_matched_bytes = low_matched_bytes;
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record. */
+void
+page_cur_open_on_rnd_user_rec(
+/*==========================*/
+ buf_block_t* block, /*!< in: page */
+ page_cur_t* cursor) /*!< out: page cursor */
+{
+ const ulint n_recs = page_get_n_recs(block->frame);
+
+ page_cur_set_before_first(block, cursor);
+
+ if (UNIV_UNLIKELY(n_recs == 0)) {
+
+ return;
+ }
+
+ cursor->rec = page_rec_get_nth(block->frame,
+ ut_rnd_interval(n_recs) + 1);
+}
+
+/**
+Set the number of owned records.
+@param[in,out] rec record in block.frame
+@param[in] n_owned number of records skipped in the sparse page directory
+@param[in] comp whether ROW_FORMAT is COMPACT or DYNAMIC */
+static void page_rec_set_n_owned(rec_t *rec, ulint n_owned, bool comp)
+{
+ rec-= comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED;
+ *rec= static_cast<byte>((*rec & ~REC_N_OWNED_MASK) |
+ (n_owned << REC_N_OWNED_SHIFT));
+}
+
+/**
+Split a directory slot which owns too many records.
+@param[in,out] block index page
+@param[in,out] slot the slot that needs to be split */
+static void page_dir_split_slot(const buf_block_t &block,
+ page_dir_slot_t *slot)
+{
+ ut_ad(slot <= &block.frame[srv_page_size - PAGE_EMPTY_DIR_START]);
+ slot= my_assume_aligned<2>(slot);
+
+ const ulint n_owned= PAGE_DIR_SLOT_MAX_N_OWNED + 1;
+
+ ut_ad(page_dir_slot_get_n_owned(slot) == n_owned);
+ static_assert((PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 >=
+ PAGE_DIR_SLOT_MIN_N_OWNED, "compatibility");
+
+ /* Find a record approximately in the middle. */
+ const rec_t *rec= page_dir_slot_get_rec(slot + PAGE_DIR_SLOT_SIZE);
+
+ for (ulint i= n_owned / 2; i--; )
+ rec= page_rec_get_next_const(rec);
+
+ /* Add a directory slot immediately below this one. */
+ constexpr uint16_t n_slots_f= PAGE_N_DIR_SLOTS + PAGE_HEADER;
+ byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block.frame);
+ const uint16_t n_slots= mach_read_from_2(n_slots_p);
+
+ page_dir_slot_t *last_slot= static_cast<page_dir_slot_t*>
+ (block.frame + srv_page_size - (PAGE_DIR + PAGE_DIR_SLOT_SIZE) -
+ n_slots * PAGE_DIR_SLOT_SIZE);
+ ut_ad(slot >= last_slot);
+ memmove_aligned<2>(last_slot, last_slot + PAGE_DIR_SLOT_SIZE,
+ slot - last_slot);
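+  /* The sparse directory occupies the end of the page and grows
+  towards lower addresses; the memmove() above shifted this slot and
+  all later slots one position down, freeing "slot" for the new
+  entry. */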
+
+ const ulint half_owned= n_owned / 2;
+
+ mach_write_to_2(n_slots_p, n_slots + 1);
+
+ mach_write_to_2(slot, rec - block.frame);
+ const bool comp= page_is_comp(block.frame) != 0;
+ page_rec_set_n_owned(page_dir_slot_get_rec(slot), half_owned, comp);
+ page_rec_set_n_owned(page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE),
+ n_owned - half_owned, comp);
+}
+
+/**
+Split a directory slot which owns too many records.
+@param[in,out] block index page (ROW_FORMAT=COMPRESSED)
+@param[in] s the slot that needs to be split
+@param[in,out] mtr mini-transaction */
+static void page_zip_dir_split_slot(buf_block_t *block, ulint s, mtr_t* mtr)
+{
+ ut_ad(block->page.zip.data);
+ ut_ad(page_is_comp(block->frame));
+ ut_ad(s);
+
+ page_dir_slot_t *slot= page_dir_get_nth_slot(block->frame, s);
+ const ulint n_owned= PAGE_DIR_SLOT_MAX_N_OWNED + 1;
+
+ ut_ad(page_dir_slot_get_n_owned(slot) == n_owned);
+ static_assert((PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 >=
+ PAGE_DIR_SLOT_MIN_N_OWNED, "compatibility");
+
+ /* 1. We loop to find a record approximately in the middle of the
+ records owned by the slot. */
+
+ const rec_t *rec= page_dir_slot_get_rec(slot + PAGE_DIR_SLOT_SIZE);
+
+ for (ulint i= n_owned / 2; i--; )
+ rec= page_rec_get_next_const(rec);
+
+ /* Add a directory slot immediately below this one. */
+ constexpr uint16_t n_slots_f= PAGE_N_DIR_SLOTS + PAGE_HEADER;
+ byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block->frame);
+ const uint16_t n_slots= mach_read_from_2(n_slots_p);
+
+ page_dir_slot_t *last_slot= static_cast<page_dir_slot_t*>
+ (block->frame + srv_page_size - (PAGE_DIR + PAGE_DIR_SLOT_SIZE) -
+ n_slots * PAGE_DIR_SLOT_SIZE);
+ memmove_aligned<2>(last_slot, last_slot + PAGE_DIR_SLOT_SIZE,
+ slot - last_slot);
+
+ const ulint half_owned= n_owned / 2;
+
+ mtr->write<2>(*block, n_slots_p, 1U + n_slots);
+
+ /* Log changes to the compressed page header and the dense page directory. */
+ memcpy_aligned<2>(&block->page.zip.data[n_slots_f], n_slots_p, 2);
+ mach_write_to_2(slot, page_offset(rec));
+ page_rec_set_n_owned<true>(block, page_dir_slot_get_rec(slot), half_owned,
+ true, mtr);
+ page_rec_set_n_owned<true>(block,
+ page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE),
+ n_owned - half_owned, true, mtr);
+}
+
+/**
+Try to balance an underfilled directory slot with an adjacent one,
+so that there are at least the minimum number of records owned by the slot;
+this may result in merging the two slots.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in] s the slot to be balanced
+@param[in,out] mtr mini-transaction */
+static void page_zip_dir_balance_slot(buf_block_t *block, ulint s, mtr_t *mtr)
+{
+ ut_ad(block->page.zip.data);
+ ut_ad(page_is_comp(block->frame));
+ ut_ad(s > 0);
+
+ const ulint n_slots = page_dir_get_n_slots(block->frame);
+
+ if (UNIV_UNLIKELY(s + 1 == n_slots)) {
+ /* The last directory slot cannot be balanced. */
+ return;
+ }
+
+ ut_ad(s < n_slots);
+
+ page_dir_slot_t* slot = page_dir_get_nth_slot(block->frame, s);
+ rec_t* const up_rec = const_cast<rec_t*>
+ (page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE));
+ rec_t* const slot_rec = const_cast<rec_t*>
+ (page_dir_slot_get_rec(slot));
+ const ulint up_n_owned = rec_get_n_owned_new(up_rec);
+
+ ut_ad(rec_get_n_owned_new(page_dir_slot_get_rec(slot))
+ == PAGE_DIR_SLOT_MIN_N_OWNED - 1);
+
+ if (up_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
+ compile_time_assert(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1
+ <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ /* Merge the slots. */
+ page_rec_set_n_owned<true>(block, slot_rec, 0, true, mtr);
+ page_rec_set_n_owned<true>(block, up_rec, up_n_owned
+ + (PAGE_DIR_SLOT_MIN_N_OWNED - 1),
+ true, mtr);
+ /* Shift the slots */
+ page_dir_slot_t* last_slot = page_dir_get_nth_slot(
+ block->frame, n_slots - 1);
+ memmove_aligned<2>(last_slot + PAGE_DIR_SLOT_SIZE, last_slot,
+ slot - last_slot);
+ constexpr uint16_t n_slots_f = PAGE_N_DIR_SLOTS + PAGE_HEADER;
+ byte *n_slots_p= my_assume_aligned<2>
+ (n_slots_f + block->frame);
+ mtr->write<2>(*block, n_slots_p, n_slots - 1);
+ memcpy_aligned<2>(n_slots_f + block->page.zip.data,
+ n_slots_p, 2);
+ memset_aligned<2>(last_slot, 0, 2);
+ return;
+ }
+
+ /* Transfer one record to the underfilled slot */
+ page_rec_set_n_owned<true>(block, slot_rec, 0, true, mtr);
+ rec_t* new_rec = rec_get_next_ptr(slot_rec, TRUE);
+ page_rec_set_n_owned<true>(block, new_rec,
+ PAGE_DIR_SLOT_MIN_N_OWNED,
+ true, mtr);
+ mach_write_to_2(slot, page_offset(new_rec));
+ page_rec_set_n_owned(up_rec, up_n_owned - 1, true);
+}
+
+/**
+Try to balance an underfilled directory slot with an adjacent one,
+so that there are at least the minimum number of records owned by the slot;
+this may result in merging the two slots.
+@param[in,out] block index page
+@param[in] s the slot to be balanced */
+static void page_dir_balance_slot(const buf_block_t &block, ulint s)
+{
+ const bool comp= page_is_comp(block.frame);
+ ut_ad(!block.page.zip.data);
+ ut_ad(s > 0);
+
+ const ulint n_slots = page_dir_get_n_slots(block.frame);
+
+ if (UNIV_UNLIKELY(s + 1 == n_slots)) {
+ /* The last directory slot cannot be balanced. */
+ return;
+ }
+
+ ut_ad(s < n_slots);
+
+ page_dir_slot_t* slot = page_dir_get_nth_slot(block.frame, s);
+ rec_t* const up_rec = const_cast<rec_t*>
+ (page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE));
+ rec_t* const slot_rec = const_cast<rec_t*>
+ (page_dir_slot_get_rec(slot));
+ const ulint up_n_owned = comp
+ ? rec_get_n_owned_new(up_rec)
+ : rec_get_n_owned_old(up_rec);
+
+ ut_ad(page_dir_slot_get_n_owned(slot)
+ == PAGE_DIR_SLOT_MIN_N_OWNED - 1);
+
+ if (up_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
+ compile_time_assert(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1
+ <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ /* Merge the slots. */
+ page_rec_set_n_owned(slot_rec, 0, comp);
+ page_rec_set_n_owned(up_rec, up_n_owned
+ + (PAGE_DIR_SLOT_MIN_N_OWNED - 1), comp);
+ /* Shift the slots */
+ page_dir_slot_t* last_slot = page_dir_get_nth_slot(
+ block.frame, n_slots - 1);
+ memmove_aligned<2>(last_slot + PAGE_DIR_SLOT_SIZE, last_slot,
+ slot - last_slot);
+ memset_aligned<2>(last_slot, 0, 2);
+ constexpr uint16_t n_slots_f = PAGE_N_DIR_SLOTS + PAGE_HEADER;
+ byte *n_slots_p= my_assume_aligned<2>
+ (n_slots_f + block.frame);
+ mach_write_to_2(n_slots_p, n_slots - 1);
+ return;
+ }
+
+ /* Transfer one record to the underfilled slot */
+ rec_t* new_rec;
+
+ if (comp) {
+ page_rec_set_n_owned(slot_rec, 0, true);
+ new_rec = rec_get_next_ptr(slot_rec, TRUE);
+ page_rec_set_n_owned(new_rec, PAGE_DIR_SLOT_MIN_N_OWNED, true);
+ page_rec_set_n_owned(up_rec, up_n_owned - 1, true);
+ } else {
+ page_rec_set_n_owned(slot_rec, 0, false);
+ new_rec = rec_get_next_ptr(slot_rec, FALSE);
+ page_rec_set_n_owned(new_rec, PAGE_DIR_SLOT_MIN_N_OWNED,
+ false);
+ page_rec_set_n_owned(up_rec, up_n_owned - 1, false);
+ }
+
+ mach_write_to_2(slot, page_offset(new_rec));
+}
+
+/** Allocate space for inserting an index record.
+@tparam compressed whether to also update the ROW_FORMAT=COMPRESSED page
+@param[in,out] block index page
+@param[in] need number of bytes needed
+@param[out] heap_no record heap number
+@return pointer to the start of the allocated buffer
+@retval NULL if allocation fails */
+template<bool compressed=false>
+static byte* page_mem_alloc_heap(buf_block_t *block, ulint need,
+ ulint *heap_no)
+{
+ ut_ad(!compressed || block->page.zip.data);
+
+ byte *heap_top= my_assume_aligned<2>(PAGE_HEAP_TOP + PAGE_HEADER +
+ block->frame);
+
+ const uint16_t top= mach_read_from_2(heap_top);
+
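+  /* The record heap grows from PAGE_HEAP_TOP towards the sparse page
+  directory at the end of the page; page_get_max_insert_size() reports
+  how much room is left between the two, allowing for directory
+  growth. */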
+ if (need > page_get_max_insert_size(block->frame, 1))
+ return NULL;
+
+ byte *n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER + block->frame);
+
+ const uint16_t h= mach_read_from_2(n_heap);
+ if (UNIV_UNLIKELY((h + 1) & 0x6000))
+ {
+ /* At the minimum record size of 5+2 bytes, we can only reach this
+ condition when using innodb_page_size=64k. */
+ ut_ad((h & 0x7fff) == 8191);
+ ut_ad(srv_page_size == 65536);
+ return NULL;
+ }
+
+ *heap_no= h & 0x7fff;
+ ut_ad(*heap_no < srv_page_size / REC_N_NEW_EXTRA_BYTES);
+ compile_time_assert(UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES < 0x3fff);
+
+ mach_write_to_2(heap_top, top + need);
+ mach_write_to_2(n_heap, h + 1);
+
+ if (compressed)
+ {
+ ut_ad(h & 0x8000);
+ memcpy_aligned<4>(&block->page.zip.data[PAGE_HEAP_TOP + PAGE_HEADER],
+ heap_top, 4);
+ }
+
+ return &block->frame[top];
+}
+
+/** Write log for inserting a B-tree or R-tree record in
+ROW_FORMAT=REDUNDANT.
+@param block B-tree or R-tree page
+@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev_rec byte offset of the predecessor of the record to insert,
+ starting from PAGE_OLD_INFIMUM
+@param info_bits info_bits of the record
+@param n_fields_s number of fields << 1 | rec_get_1byte_offs_flag()
+@param hdr_c number of common record header bytes with prev_rec
+@param data_c number of common data bytes with prev_rec
+@param hdr record header bytes to copy to the log
+@param hdr_l number of copied record header bytes
+@param data record payload bytes to copy to the log
+@param data_l number of copied record data bytes */
+inline void mtr_t::page_insert(const buf_block_t &block, bool reuse,
+ ulint prev_rec, byte info_bits,
+ ulint n_fields_s, size_t hdr_c, size_t data_c,
+ const byte *hdr, size_t hdr_l,
+ const byte *data, size_t data_l)
+{
+ ut_ad(!block.page.zip.data);
+ ut_ad(m_log_mode == MTR_LOG_ALL);
+ ut_d(ulint n_slots= page_dir_get_n_slots(block.frame));
+ ut_ad(n_slots >= 2);
+ ut_d(const byte *page_end= page_dir_get_nth_slot(block.frame, n_slots - 1));
+ ut_ad(&block.frame[prev_rec + PAGE_OLD_INFIMUM] <= page_end);
+ ut_ad(block.frame + page_header_get_offs(block.frame, PAGE_HEAP_TOP) <=
+ page_end);
+ ut_ad(fil_page_index_page_check(block.frame));
+ ut_ad(!(~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG) & info_bits));
+ ut_ad(n_fields_s >= 2);
+ ut_ad((n_fields_s >> 1) <= REC_MAX_N_FIELDS);
+ ut_ad(data_l + data_c <= REDUNDANT_REC_MAX_DATA_SIZE);
+
+ set_modified(block);
+
+ static_assert(REC_INFO_MIN_REC_FLAG == 0x10, "compatibility");
+ static_assert(REC_INFO_DELETED_FLAG == 0x20, "compatibility");
+ n_fields_s= (n_fields_s - 2) << 2 | info_bits >> 4;
+
+ size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
+ static_assert((REC_MAX_N_FIELDS << 1 | 1) <= MIN_3BYTE, "compatibility");
+ len+= n_fields_s < MIN_2BYTE ? 1 : 2;
+ len+= hdr_c < MIN_2BYTE ? 1 : 2;
+ static_assert(REDUNDANT_REC_MAX_DATA_SIZE <= MIN_3BYTE, "compatibility");
+ len+= data_c < MIN_2BYTE ? 1 : 2;
+ len+= hdr_l + data_l;
+
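+  /* Two encodings are used: if the whole log record fits in the
+  current mtr_buf_t block, it is written in one piece; otherwise the
+  fixed part is written first and the record header and payload are
+  appended as separate chunks. */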
+ const bool small= len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
+ byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, small);
+
+ if (UNIV_LIKELY(small))
+ {
+ ut_d(const byte * const end = l + len);
+ *l++= reuse ? INSERT_REUSE_REDUNDANT : INSERT_HEAP_REDUNDANT;
+ l= mlog_encode_varint(l, prev_rec);
+ l= mlog_encode_varint(l, n_fields_s);
+ l= mlog_encode_varint(l, hdr_c);
+ l= mlog_encode_varint(l, data_c);
+ ::memcpy(l, hdr, hdr_l);
+ l+= hdr_l;
+ ::memcpy(l, data, data_l);
+ l+= data_l;
+ ut_ad(end == l);
+ m_log.close(l);
+ }
+ else
+ {
+ m_log.close(l);
+ l= m_log.open(len - hdr_l - data_l);
+ ut_d(const byte * const end = l + len - hdr_l - data_l);
+ *l++= reuse ? INSERT_REUSE_REDUNDANT : INSERT_HEAP_REDUNDANT;
+ l= mlog_encode_varint(l, prev_rec);
+ l= mlog_encode_varint(l, n_fields_s);
+ l= mlog_encode_varint(l, hdr_c);
+ l= mlog_encode_varint(l, data_c);
+ ut_ad(end == l);
+ m_log.close(l);
+ m_log.push(hdr, static_cast<uint32_t>(hdr_l));
+ m_log.push(data, static_cast<uint32_t>(data_l));
+ }
+
+ m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for inserting a B-tree or R-tree record in
+ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC.
+@param block B-tree or R-tree page
+@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev_rec byte offset of the predecessor of the record to insert,
+ starting from PAGE_NEW_INFIMUM
+@param info_status rec_get_info_and_status_bits()
+@param shift only when reuse: number of bytes by which PAGE_FREE is moving
+@param hdr_c number of common record header bytes with prev_rec
+@param data_c number of common data bytes with prev_rec
+@param hdr record header bytes to copy to the log
+@param hdr_l number of copied record header bytes
+@param data record payload bytes to copy to the log
+@param data_l number of copied record data bytes */
+inline void mtr_t::page_insert(const buf_block_t &block, bool reuse,
+ ulint prev_rec, byte info_status,
+ ssize_t shift, size_t hdr_c, size_t data_c,
+ const byte *hdr, size_t hdr_l,
+ const byte *data, size_t data_l)
+{
+ ut_ad(!block.page.zip.data);
+ ut_ad(m_log_mode == MTR_LOG_ALL);
+ ut_d(ulint n_slots= page_dir_get_n_slots(block.frame));
+ ut_ad(n_slots >= 2);
+ ut_d(const byte *page_end= page_dir_get_nth_slot(block.frame, n_slots - 1));
+ ut_ad(&block.frame[prev_rec + PAGE_NEW_INFIMUM] <= page_end);
+ ut_ad(block.frame + page_header_get_offs(block.frame, PAGE_HEAP_TOP) <=
+ page_end);
+ ut_ad(fil_page_index_page_check(block.frame));
+ ut_ad(hdr_l + hdr_c + data_l + data_c <=
+ static_cast<size_t>(page_end - &block.frame[PAGE_NEW_SUPREMUM_END]));
+ ut_ad(reuse || shift == 0);
+#ifdef UNIV_DEBUG
+ switch (~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG) & info_status) {
+ default:
+ ut_ad(0);
+ break;
+ case REC_STATUS_NODE_PTR:
+ ut_ad(!page_is_leaf(block.frame));
+ break;
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_ORDINARY:
+ ut_ad(page_is_leaf(block.frame));
+ }
+#endif
+
+ set_modified(block);
+
+ static_assert(REC_INFO_MIN_REC_FLAG == 0x10, "compatibility");
+ static_assert(REC_INFO_DELETED_FLAG == 0x20, "compatibility");
+ static_assert(REC_STATUS_INSTANT == 4, "compatibility");
+
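+  /* enc_hdr_l packs three things into one varint: hdr_l shifted left
+  by 3, the REC_STATUS_INSTANT bit, and the REC_INFO_MIN_REC_FLAG and
+  REC_INFO_DELETED_FLAG bits moved into the lowest positions. */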
+ const size_t enc_hdr_l= hdr_l << 3 |
+ (info_status & REC_STATUS_INSTANT) | info_status >> 4;
+ size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
+ static_assert(REC_MAX_N_FIELDS * 2 < MIN_3BYTE, "compatibility");
+ if (reuse)
+ {
+ if (shift < 0)
+ shift= -shift << 1 | 1;
+ else
+ shift<<= 1;
+ len+= static_cast<size_t>(shift) < MIN_2BYTE
+ ? 1 : static_cast<size_t>(shift) < MIN_3BYTE ? 2 : 3;
+ }
+ ut_ad(hdr_c + hdr_l <= REC_MAX_N_FIELDS * 2);
+ len+= hdr_c < MIN_2BYTE ? 1 : 2;
+ len+= enc_hdr_l < MIN_2BYTE ? 1 : enc_hdr_l < MIN_3BYTE ? 2 : 3;
+ len+= data_c < MIN_2BYTE ? 1 : data_c < MIN_3BYTE ? 2 : 3;
+ len+= hdr_l + data_l;
+
+ const bool small= len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
+ byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, small);
+
+ if (UNIV_LIKELY(small))
+ {
+ ut_d(const byte * const end = l + len);
+ *l++= reuse ? INSERT_REUSE_DYNAMIC : INSERT_HEAP_DYNAMIC;
+ l= mlog_encode_varint(l, prev_rec);
+ if (reuse)
+ l= mlog_encode_varint(l, shift);
+ l= mlog_encode_varint(l, enc_hdr_l);
+ l= mlog_encode_varint(l, hdr_c);
+ l= mlog_encode_varint(l, data_c);
+ ::memcpy(l, hdr, hdr_l);
+ l+= hdr_l;
+ ::memcpy(l, data, data_l);
+ l+= data_l;
+ ut_ad(end == l);
+ m_log.close(l);
+ }
+ else
+ {
+ m_log.close(l);
+ l= m_log.open(len - hdr_l - data_l);
+ ut_d(const byte * const end = l + len - hdr_l - data_l);
+ *l++= reuse ? INSERT_REUSE_DYNAMIC : INSERT_HEAP_DYNAMIC;
+ l= mlog_encode_varint(l, prev_rec);
+ if (reuse)
+ l= mlog_encode_varint(l, shift);
+ l= mlog_encode_varint(l, enc_hdr_l);
+ l= mlog_encode_varint(l, hdr_c);
+ l= mlog_encode_varint(l, data_c);
+ ut_ad(end == l);
+ m_log.close(l);
+ m_log.push(hdr, static_cast<uint32_t>(hdr_l));
+ m_log.push(data, static_cast<uint32_t>(data_l));
+ }
+
+ m_last_offset= FIL_PAGE_TYPE;
+}
+
+/***********************************************************//**
+Inserts a record next to page cursor on an uncompressed page.
+Returns a pointer to the inserted record on success, i.e., if there is
+enough space available; NULL otherwise. The cursor stays at the same
+position.
+@return pointer to the inserted record on success, NULL otherwise */
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+ const page_cur_t*cur, /*!< in: page cursor */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: record to insert after cur */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ buf_block_t* block= cur->block;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_n_fields(offsets) > 0);
+ ut_ad(index->table->not_redundant() == !!page_is_comp(block->frame));
+ ut_ad(!!page_is_comp(block->frame) == !!rec_offs_comp(offsets));
+ ut_ad(fil_page_index_page_check(block->frame));
+ ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->frame) ==
+ index->id ||
+ mtr->is_inside_ibuf());
+ ut_ad(page_dir_get_n_slots(block->frame) >= 2);
+
+ ut_ad(!page_rec_is_supremum(cur->rec));
+
+ /* We should not write log for ROW_FORMAT=COMPRESSED pages here. */
+ ut_ad(mtr->get_log_mode() != MTR_LOG_ALL ||
+ !(index->table->flags & DICT_TF_MASK_ZIP_SSIZE));
+
+ /* 1. Get the size of the physical record in the page */
+ const ulint rec_size= rec_offs_size(offsets);
+
+#ifdef HAVE_MEM_CHECK
+ {
+ const void *rec_start __attribute__((unused))=
+ rec - rec_offs_extra_size(offsets);
+ ulint extra_size __attribute__((unused))=
+ rec_offs_extra_size(offsets) -
+ (page_is_comp(block->frame)
+ ? REC_N_NEW_EXTRA_BYTES
+ : REC_N_OLD_EXTRA_BYTES);
+ /* All data bytes of the record must be valid. */
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ /* The variable-length header must be valid. */
+ MEM_CHECK_DEFINED(rec_start, extra_size);
+ }
+#endif /* HAVE_MEM_CHECK */
+
+ /* 2. Try to find suitable space from page memory management */
+ bool reuse= false;
+ ssize_t free_offset= 0;
+ ulint heap_no;
+ byte *insert_buf;
+
+ const bool comp= page_is_comp(block->frame);
+ const ulint extra_size= rec_offs_extra_size(offsets);
+
+ if (rec_t* free_rec= page_header_get_ptr(block->frame, PAGE_FREE))
+ {
+ /* Try to reuse the head of PAGE_FREE. */
+ rec_offs foffsets_[REC_OFFS_NORMAL_SIZE];
+ mem_heap_t *heap= nullptr;
+
+ rec_offs_init(foffsets_);
+
+ rec_offs *foffsets= rec_get_offsets(free_rec, index, foffsets_,
+ page_is_leaf(block->frame)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+ const ulint fextra_size= rec_offs_extra_size(foffsets);
+ insert_buf= free_rec - fextra_size;
+ const bool too_small= (fextra_size + rec_offs_data_size(foffsets)) <
+ rec_size;
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+
+ if (too_small)
+ goto use_heap;
+
+ byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+ block->frame);
+ if (comp)
+ {
+ heap_no= rec_get_heap_no_new(free_rec);
+ uint16_t next= mach_read_from_2(free_rec - REC_NEXT);
+ mach_write_to_2(page_free, next
+ ? static_cast<uint16_t>(free_rec + next - block->frame)
+ : 0);
+ }
+ else
+ {
+ heap_no= rec_get_heap_no_old(free_rec);
+ memcpy(page_free, free_rec - REC_NEXT, 2);
+ }
+
+ static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility");
+
+ byte *page_garbage= my_assume_aligned<2>(page_free + 2);
+ ut_ad(mach_read_from_2(page_garbage) >= rec_size);
+ mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) - rec_size);
+ reuse= true;
+ free_offset= extra_size - fextra_size;
+ }
+ else
+ {
+use_heap:
+ insert_buf= page_mem_alloc_heap(block, rec_size, &heap_no);
+
+ if (UNIV_UNLIKELY(!insert_buf))
+ return nullptr;
+ }
+
+ ut_ad(cur->rec != insert_buf + extra_size);
+
+ rec_t *next_rec= block->frame + rec_get_next_offs(cur->rec, comp);
+ ut_ad(next_rec != block->frame);
+
+ /* Update page header fields */
+ byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER +
+ block->frame);
+ const uint16_t last_insert= mach_read_from_2(page_last_insert);
+ ut_ad(!last_insert || !comp ||
+ rec_get_node_ptr_flag(block->frame + last_insert) ==
+ rec_get_node_ptr_flag(rec));
+
+ /* Write PAGE_LAST_INSERT */
+ mach_write_to_2(page_last_insert, page_offset(insert_buf + extra_size));
+
+ /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */
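+  /* PAGE_DIRECTION_B and PAGE_N_DIRECTION track runs of consecutive
+  inserts in one direction; the B-tree page split code uses them to
+  decide whether to split at the insertion point rather than in the
+  middle of the page. */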
+ if (block->frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
+ {
+ byte *dir= &block->frame[PAGE_DIRECTION_B + PAGE_HEADER];
+ byte *n= my_assume_aligned<2>
+ (&block->frame[PAGE_N_DIRECTION + PAGE_HEADER]);
+ if (UNIV_UNLIKELY(!last_insert))
+ {
+no_direction:
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION);
+ memset(n, 0, 2);
+ }
+ else if (block->frame + last_insert == cur->rec &&
+ (*dir & ((1U << 3) - 1)) != PAGE_LEFT)
+ {
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT);
+inc_dir:
+ mach_write_to_2(n, mach_read_from_2(n) + 1);
+ }
+ else if (next_rec == block->frame + last_insert &&
+ (*dir & ((1U << 3) - 1)) != PAGE_RIGHT)
+ {
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT);
+ goto inc_dir;
+ }
+ else
+ goto no_direction;
+ }
+
+ /* Update PAGE_N_RECS. */
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ block->frame);
+
+ mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1);
+
+ /* Update the preceding record header, the 'owner' record and
+ prepare the record to insert. */
+ rec_t *insert_rec= insert_buf + extra_size;
+ const ulint data_size= rec_offs_data_size(offsets);
+ memcpy(insert_buf, rec - extra_size, extra_size + data_size);
+ size_t hdr_common= 0;
+ ulint n_owned;
+ const byte info_status= static_cast<byte>
+ (rec_get_info_and_status_bits(rec, comp));
+ ut_ad(!(rec_get_info_bits(rec, comp) &
+ ~(REC_INFO_DELETED_FLAG | REC_INFO_MIN_REC_FLAG)));
+
+ if (comp)
+ {
+#ifdef UNIV_DEBUG
+ switch (rec_get_status(cur->rec)) {
+ case REC_STATUS_ORDINARY:
+ case REC_STATUS_NODE_PTR:
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_INFIMUM:
+ break;
+ case REC_STATUS_SUPREMUM:
+ ut_ad("wrong status on cur->rec" == 0);
+ }
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_NODE_PTR:
+ ut_ad(!page_is_leaf(block->frame));
+ break;
+ case REC_STATUS_INSTANT:
+ ut_ad(index->is_instant());
+ ut_ad(page_is_leaf(block->frame));
+ if (!rec_is_metadata(rec, true))
+ break;
+ ut_ad(cur->rec == &block->frame[PAGE_NEW_INFIMUM]);
+ break;
+ case REC_STATUS_ORDINARY:
+ ut_ad(page_is_leaf(block->frame));
+ ut_ad(!(rec_get_info_bits(rec, true) & ~REC_INFO_DELETED_FLAG));
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ ut_ad("wrong status on rec" == 0);
+ }
+ ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
+#endif
+
+ rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ insert_rec[-REC_NEW_STATUS]= rec[-REC_NEW_STATUS];
+ rec_set_bit_field_2(insert_rec, heap_no,
+ REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ mach_write_to_2(insert_rec - REC_NEXT,
+ static_cast<uint16_t>(next_rec - insert_rec));
+ mach_write_to_2(cur->rec - REC_NEXT,
+ static_cast<uint16_t>(insert_rec - cur->rec));
+ while (!(n_owned= rec_get_n_owned_new(next_rec)))
+ {
+ next_rec= block->frame + rec_get_next_offs(next_rec, true);
+ ut_ad(next_rec != block->frame);
+ }
+ rec_set_bit_field_1(next_rec, n_owned + 1, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ if (mtr->get_log_mode() != MTR_LOG_ALL)
+ {
+ mtr->set_modified(*block);
+ goto copied;
+ }
+
+ const byte * const c_start= cur->rec - extra_size;
+ if (extra_size > REC_N_NEW_EXTRA_BYTES &&
+ c_start >=
+ &block->frame[PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES])
+ {
+ /* Find common header bytes with the preceding record. */
+ const byte *r= rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ for (const byte *c= cur->rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ *r == *c && c-- != c_start; r--);
+ hdr_common= static_cast<size_t>((rec - (REC_N_NEW_EXTRA_BYTES + 1)) - r);
+ ut_ad(hdr_common <= extra_size - REC_N_NEW_EXTRA_BYTES);
+ }
+ }
+ else
+ {
+#ifdef UNIV_DEBUG
+ if (!page_is_leaf(block->frame));
+ else if (rec_is_metadata(rec, false))
+ {
+ ut_ad(index->is_instant());
+ ut_ad(cur->rec == &block->frame[PAGE_OLD_INFIMUM]);
+ }
+#endif
+ rec_set_bit_field_1(insert_rec, 0, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ rec_set_bit_field_2(insert_rec, heap_no,
+ REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ memcpy(insert_rec - REC_NEXT, cur->rec - REC_NEXT, 2);
+ mach_write_to_2(cur->rec - REC_NEXT, page_offset(insert_rec));
+ while (!(n_owned= rec_get_n_owned_old(next_rec)))
+ {
+ next_rec= block->frame + rec_get_next_offs(next_rec, false);
+ ut_ad(next_rec != block->frame);
+ }
+ rec_set_bit_field_1(next_rec, n_owned + 1, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ if (mtr->get_log_mode() != MTR_LOG_ALL)
+ {
+ mtr->set_modified(*block);
+ goto copied;
+ }
+
+ ut_ad(extra_size > REC_N_OLD_EXTRA_BYTES);
+ const byte * const c_start= cur->rec - extra_size;
+ if (c_start >=
+ &block->frame[PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES])
+ {
+ /* Find common header bytes with the preceding record. */
+ const byte *r= rec - (REC_N_OLD_EXTRA_BYTES + 1);
+ for (const byte *c= cur->rec - (REC_N_OLD_EXTRA_BYTES + 1);
+ *r == *c && c-- != c_start; r--);
+ hdr_common= static_cast<size_t>((rec - (REC_N_OLD_EXTRA_BYTES + 1)) - r);
+ ut_ad(hdr_common <= extra_size - REC_N_OLD_EXTRA_BYTES);
+ }
+ }
+
+  /* Write the redo log record, omitting any header and data bytes
+  that are shared with the preceding record. */
+ ut_ad(mtr->get_log_mode() == MTR_LOG_ALL);
+
+ {
+ const byte *r= rec;
+ const byte *c= cur->rec;
+ const byte *c_end= cur->rec + data_size;
+ if (c <= insert_buf && c_end > insert_buf)
+ c_end= insert_buf;
+ else
+ c_end= std::min<const byte*>(c_end, block->frame + srv_page_size -
+ PAGE_DIR - PAGE_DIR_SLOT_SIZE *
+ page_dir_get_n_slots(block->frame));
+ size_t data_common;
+ /* Copy common data bytes of the preceding record. */
+ for (; c != c_end && *r == *c; c++, r++);
+ data_common= static_cast<size_t>(r - rec);
+
+ if (comp)
+ mtr->page_insert(*block, reuse,
+ cur->rec - block->frame - PAGE_NEW_INFIMUM,
+ info_status, free_offset, hdr_common, data_common,
+ insert_buf,
+ extra_size - hdr_common - REC_N_NEW_EXTRA_BYTES,
+ r, data_size - data_common);
+ else
+ mtr->page_insert(*block, reuse,
+ cur->rec - block->frame - PAGE_OLD_INFIMUM,
+ info_status, rec_get_n_fields_old(insert_rec) << 1 |
+ rec_get_1byte_offs_flag(insert_rec),
+ hdr_common, data_common,
+ insert_buf,
+ extra_size - hdr_common - REC_N_OLD_EXTRA_BYTES,
+ r, data_size - data_common);
+ }
+
+copied:
+ ut_ad(!memcmp(insert_buf, rec - extra_size, extra_size -
+ (comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES)));
+ ut_ad(!memcmp(insert_rec, rec, data_size));
+ /* We have incremented the n_owned field of the owner record.
+ If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, we have to split the
+ corresponding directory slot in two. */
+
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+ {
+ const auto owner= page_dir_find_owner_slot(next_rec);
+ page_dir_split_slot(*block, page_dir_get_nth_slot(block->frame, owner));
+ }
+
+ rec_offs_make_valid(insert_buf + extra_size, index,
+ page_is_leaf(block->frame), offsets);
+ return insert_buf + extra_size;
+}
+
+/** Add a slot to the dense page directory.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in] index the index that the page belongs to
+@param[in,out] mtr mini-transaction */
+static inline void page_zip_dir_add_slot(buf_block_t *block,
+ const dict_index_t *index, mtr_t *mtr)
+{
+ page_zip_des_t *page_zip= &block->page.zip;
+
+ ut_ad(page_is_comp(page_zip->data));
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ /* Read the old n_dense (n_heap has already been incremented). */
+ ulint n_dense= page_dir_get_n_heap(page_zip->data) - (PAGE_HEAP_NO_USER_LOW +
+ 1U);
+
+ byte *dir= page_zip->data + page_zip_get_size(page_zip) -
+ PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
+ byte *stored= dir;
+
+ if (!page_is_leaf(page_zip->data))
+ {
+ ut_ad(!page_zip->n_blobs);
+ stored-= n_dense * REC_NODE_PTR_SIZE;
+ }
+ else if (index->is_clust())
+ {
+ /* Move the BLOB pointer array backwards to make space for the
+ columns DB_TRX_ID,DB_ROLL_PTR and the dense directory slot. */
+
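+    /* From the end of the compressed page towards lower addresses, a
+    clustered index leaf page stores: the dense page directory, the
+    uncompressed DB_TRX_ID,DB_ROLL_PTR columns, and then the BLOB
+    pointer array. */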
+ stored-= n_dense * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ byte *externs= stored - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+ byte *dst= externs - PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
+ ut_ad(!memcmp(dst, field_ref_zero, PAGE_ZIP_CLUST_LEAF_SLOT_SIZE));
+ if (const ulint len = ulint(stored - externs))
+ {
+ memmove(dst, externs, len);
+ mtr->memmove(*block, dst - page_zip->data, externs - page_zip->data,
+ len);
+ }
+ }
+ else
+ {
+ stored-= page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+ ut_ad(!memcmp(stored - PAGE_ZIP_DIR_SLOT_SIZE, field_ref_zero,
+ PAGE_ZIP_DIR_SLOT_SIZE));
+ }
+
+ /* Move the uncompressed area backwards to make space
+ for one directory slot. */
+ if (const ulint len = ulint(dir - stored))
+ {
+ byte* dst = stored - PAGE_ZIP_DIR_SLOT_SIZE;
+ memmove(dst, stored, len);
+ mtr->memmove(*block, dst - page_zip->data, stored - page_zip->data, len);
+ }
+}
+
+/***********************************************************//**
+Inserts a record next to the page cursor on a ROW_FORMAT=COMPRESSED
+page, keeping the compressed and uncompressed copies in sync.
+Returns a pointer to the inserted record on success, i.e., if there is
+enough space available; NULL otherwise.
+The cursor stays at the same position.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the inserted record on success, NULL otherwise */
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_zip_des_t * const page_zip= page_cur_get_page_zip(cursor);
+ ut_ad(page_zip);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ ut_ad(index->table->not_redundant());
+ ut_ad(page_is_comp(cursor->block->frame));
+ ut_ad(rec_offs_comp(offsets));
+ ut_ad(fil_page_get_type(cursor->block->frame) == FIL_PAGE_INDEX ||
+ fil_page_get_type(cursor->block->frame) == FIL_PAGE_RTREE);
+ ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + cursor->block->frame) ==
+ index->id || mtr->is_inside_ibuf());
+ ut_ad(!page_get_instant(cursor->block->frame));
+ ut_ad(!page_cur_is_after_last(cursor));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, cursor->block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* 1. Get the size of the physical record in the page */
+ const ulint rec_size= rec_offs_size(offsets);
+
+#ifdef HAVE_MEM_CHECK
+ {
+ const void *rec_start __attribute__((unused))=
+ rec - rec_offs_extra_size(offsets);
+ ulint extra_size __attribute__((unused))=
+ rec_offs_extra_size(offsets) - REC_N_NEW_EXTRA_BYTES;
+ /* All data bytes of the record must be valid. */
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ /* The variable-length header must be valid. */
+ MEM_CHECK_DEFINED(rec_start, extra_size);
+ }
+#endif /* HAVE_MEM_CHECK */
+ const bool reorg_before_insert= page_has_garbage(cursor->block->frame) &&
+ rec_size > page_get_max_insert_size(cursor->block->frame, 1) &&
+ rec_size <= page_get_max_insert_size_after_reorganize(cursor->block->frame,
+ 1);
+ constexpr uint16_t page_free_f= PAGE_FREE + PAGE_HEADER;
+ byte* const page_free = my_assume_aligned<4>(page_free_f +
+ cursor->block->frame);
+ uint16_t free_rec= 0;
+
+ /* 2. Try to find suitable space from page memory management */
+ ulint heap_no;
+ byte *insert_buf;
+
+ if (reorg_before_insert ||
+ !page_zip_available(page_zip, index->is_clust(), rec_size, 1))
+ {
+ /* SET GLOBAL might be executed concurrently. Sample the value once. */
+ ulint level= page_zip_level;
+#ifdef UNIV_DEBUG
+ const rec_t * const cursor_rec= page_cur_get_rec(cursor);
+#endif /* UNIV_DEBUG */
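+    /* There is not enough space in the compressed page. Escalate in
+    steps: recreate an empty page or reorganize the page and retry;
+    as a last resort, insert without logging and then reorganize,
+    logging the whole page, and decompress back to the pre-insert
+    state on failure. */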
+
+ if (page_is_empty(cursor->block->frame))
+ {
+ ut_ad(page_cur_is_before_first(cursor));
+
+ /* This is an empty page. Recreate to remove the modification log. */
+ page_create_zip(cursor->block, index,
+ page_header_get_field(cursor->block->frame, PAGE_LEVEL),
+ 0, mtr);
+ ut_ad(!page_header_get_ptr(cursor->block->frame, PAGE_FREE));
+
+ if (page_zip_available(page_zip, index->is_clust(), rec_size, 1))
+ goto use_heap;
+
+ /* The cursor should remain on the page infimum. */
+ return nullptr;
+ }
+
+ if (page_zip->m_nonempty || page_has_garbage(cursor->block->frame))
+ {
+ ulint pos= page_rec_get_n_recs_before(cursor->rec);
+
+ if (!page_zip_reorganize(cursor->block, index, level, mtr, true))
+ {
+ ut_ad(cursor->rec == cursor_rec);
+ return nullptr;
+ }
+
+ if (pos)
+ cursor->rec= page_rec_get_nth(cursor->block->frame, pos);
+ else
+ ut_ad(cursor->rec == page_get_infimum_rec(cursor->block->frame));
+
+ ut_ad(!page_header_get_ptr(cursor->block->frame, PAGE_FREE));
+
+ if (page_zip_available(page_zip, index->is_clust(), rec_size, 1))
+ goto use_heap;
+ }
+
+ /* Try compressing the whole page afterwards. */
+ const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NONE);
+ rec_t *insert_rec= page_cur_insert_rec_low(cursor, index, rec, offsets,
+ mtr);
+ mtr->set_log_mode(log_mode);
+
+ if (insert_rec)
+ {
+ ulint pos= page_rec_get_n_recs_before(insert_rec);
+ ut_ad(pos > 0);
+
+ /* We are writing entire page images to the log. Reduce the redo
+ log volume by reorganizing the page at the same time. */
+ if (page_zip_reorganize(cursor->block, index, level, mtr))
+ {
+ /* The page was reorganized: Seek to pos. */
+ cursor->rec= pos > 1
+ ? page_rec_get_nth(cursor->block->frame, pos - 1)
+ : cursor->block->frame + PAGE_NEW_INFIMUM;
+ insert_rec= cursor->block->frame + rec_get_next_offs(cursor->rec, 1);
+ rec_offs_make_valid(insert_rec, index,
+ page_is_leaf(cursor->block->frame), offsets);
+ return insert_rec;
+ }
+
+    /* Theoretically, we could try one last resort of
+    page_zip_reorganize() followed by page_zip_available(), but that
+    would be very unlikely to succeed. (If compressing the fully
+    reorganized page already failed, it would hardly succeed once the
+    insert of this record had been logged on top of it.) */
+
+ /* Out of space: restore the page */
+ if (!page_zip_decompress(page_zip, cursor->block->frame, false))
+ ut_error; /* Memory corrupted? */
+ ut_ad(page_validate(cursor->block->frame, index));
+ insert_rec= nullptr;
+ }
+ return insert_rec;
+ }
+
+ free_rec= mach_read_from_2(page_free);
+ if (free_rec)
+ {
+ /* Try to allocate from the head of the free list. */
+ rec_offs foffsets_[REC_OFFS_NORMAL_SIZE];
+ mem_heap_t *heap= nullptr;
+
+ rec_offs_init(foffsets_);
+
+ rec_offs *foffsets= rec_get_offsets(cursor->block->frame + free_rec, index,
+ foffsets_,
+ page_is_leaf(cursor->block->frame)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+ insert_buf= cursor->block->frame + free_rec -
+ rec_offs_extra_size(foffsets);
+
+ if (rec_offs_size(foffsets) < rec_size)
+ {
+too_small:
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ free_rec= 0;
+ goto use_heap;
+ }
+
+ /* On compressed pages, do not relocate records from
+ the free list. If extra_size would grow, use the heap. */
+ const ssize_t extra_size_diff= lint(rec_offs_extra_size(offsets) -
+ rec_offs_extra_size(foffsets));
+
+ if (UNIV_UNLIKELY(extra_size_diff < 0))
+ {
+      /* The free record has a larger header; shift insert_buf forward
+      so that the record origin coincides with that of free_rec. */
+ if (rec_offs_size(foffsets) < rec_size - ssize_t(extra_size_diff))
+ goto too_small;
+
+ insert_buf-= extra_size_diff;
+ }
+ else if (UNIV_UNLIKELY(extra_size_diff))
+ /* Do not allow extra_size to grow */
+ goto too_small;
+
+ byte *const free_rec_ptr= cursor->block->frame + free_rec;
+ heap_no= rec_get_heap_no_new(free_rec_ptr);
+ int16_t next_rec= mach_read_from_2(free_rec_ptr - REC_NEXT);
+ /* With innodb_page_size=64k, int16_t would be unsafe to use here,
+ but that cannot be used with ROW_FORMAT=COMPRESSED. */
+ static_assert(UNIV_ZIP_SIZE_SHIFT_MAX == 14, "compatibility");
+ if (next_rec)
+ {
+ next_rec= static_cast<int16_t>(next_rec + free_rec);
+ ut_ad(int{PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES} <= next_rec);
+ ut_ad(static_cast<uint16_t>(next_rec) < srv_page_size);
+ }
+
+ byte *hdr= my_assume_aligned<4>(&page_zip->data[page_free_f]);
+ mach_write_to_2(hdr, static_cast<uint16_t>(next_rec));
+ const byte *const garbage= my_assume_aligned<2>(page_free + 2);
+ ut_ad(mach_read_from_2(garbage) >= rec_size);
+ mach_write_to_2(my_assume_aligned<2>(hdr + 2),
+ mach_read_from_2(garbage) - rec_size);
+ static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility");
+ mtr->memcpy(*cursor->block, page_free, hdr, 4);
+
+ if (!page_is_leaf(cursor->block->frame))
+ {
+ /* Zero out the node pointer of free_rec, in case it will not be
+ overwritten by insert_rec. */
+ ut_ad(rec_size > REC_NODE_PTR_SIZE);
+
+ if (rec_offs_size(foffsets) > rec_size)
+ memset(rec_get_end(free_rec_ptr, foffsets) -
+ REC_NODE_PTR_SIZE, 0, REC_NODE_PTR_SIZE);
+ }
+ else if (index->is_clust())
+ {
+ /* Zero out DB_TRX_ID,DB_ROLL_PTR in free_rec, in case they will
+ not be overwritten by insert_rec. */
+
+ ulint len;
+ ulint trx_id_offs= rec_get_nth_field_offs(foffsets, index->db_trx_id(),
+ &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs +
+ rec_offs_extra_size(foffsets) > rec_size)
+ memset(free_rec_ptr + trx_id_offs, 0,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ ut_ad(free_rec_ptr + trx_id_offs + DATA_TRX_ID_LEN ==
+ rec_get_nth_field(free_rec_ptr, foffsets, index->db_roll_ptr(),
+ &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ }
+
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ }
+ else
+ {
+use_heap:
+ ut_ad(!free_rec);
+ insert_buf= page_mem_alloc_heap<true>(cursor->block, rec_size, &heap_no);
+
+ if (UNIV_UNLIKELY(!insert_buf))
+ return insert_buf;
+
+ static_assert(PAGE_N_HEAP == PAGE_HEAP_TOP + 2, "compatibility");
+ mtr->memcpy(*cursor->block, PAGE_HEAP_TOP + PAGE_HEADER, 4);
+ page_zip_dir_add_slot(cursor->block, index, mtr);
+ }
+
+ /* 3. Create the record */
+ byte *insert_rec= rec_copy(insert_buf, rec, offsets);
+ rec_offs_make_valid(insert_rec, index, page_is_leaf(cursor->block->frame),
+ offsets);
+
+ /* 4. Insert the record in the linked list of records */
+ ut_ad(cursor->rec != insert_rec);
+
+  /* the successor of the cursor record before the insertion */
+ const rec_t* next_rec = page_rec_get_next_low(cursor->rec, TRUE);
+ ut_ad(rec_get_status(cursor->rec) <= REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
+
+ mach_write_to_2(insert_rec - REC_NEXT, static_cast<uint16_t>
+ (next_rec - insert_rec));
+ mach_write_to_2(cursor->rec - REC_NEXT, static_cast<uint16_t>
+ (insert_rec - cursor->rec));
+ byte *n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ cursor->block->frame);
+ mtr->write<2>(*cursor->block, n_recs, 1U + mach_read_from_2(n_recs));
+ memcpy_aligned<2>(&page_zip->data[PAGE_N_RECS + PAGE_HEADER], n_recs, 2);
+
+ /* 5. Set the n_owned field in the inserted record to zero,
+ and set the heap_no field */
+ rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ rec_set_bit_field_2(insert_rec, heap_no, REC_NEW_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+
+ MEM_CHECK_DEFINED(rec_get_start(insert_rec, offsets),
+ rec_offs_size(offsets));
+
+ /* 6. Update the last insertion info in page header */
+ byte *last_insert= my_assume_aligned<4>(PAGE_LAST_INSERT + PAGE_HEADER +
+ page_zip->data);
+ const uint16_t last_insert_rec= mach_read_from_2(last_insert);
+ ut_ad(!last_insert_rec ||
+ rec_get_node_ptr_flag(cursor->block->frame + last_insert_rec) ==
+ rec_get_node_ptr_flag(insert_rec));
+ mach_write_to_2(last_insert, page_offset(insert_rec));
+
+ if (!index->is_spatial())
+ {
+ byte *dir= &page_zip->data[PAGE_HEADER + PAGE_DIRECTION_B];
+ ut_ad(!(*dir & ~((1U << 3) - 1)));
+ byte *n= my_assume_aligned<2>
+ (&page_zip->data[PAGE_HEADER + PAGE_N_DIRECTION]);
+ if (UNIV_UNLIKELY(!last_insert_rec))
+ {
+no_direction:
+ *dir= PAGE_NO_DIRECTION;
+ memset(n, 0, 2);
+ }
+ else if (*dir != PAGE_LEFT &&
+ cursor->block->frame + last_insert_rec == cursor->rec)
+ {
+ *dir= PAGE_RIGHT;
+inc_dir:
+ mach_write_to_2(n, mach_read_from_2(n) + 1);
+ }
+ else if (*dir != PAGE_RIGHT && page_rec_get_next(insert_rec) ==
+ cursor->block->frame + last_insert_rec)
+ {
+ *dir= PAGE_LEFT;
+ goto inc_dir;
+ }
+ else
+ goto no_direction;
+ }
+
+ /* Write the header fields in one record. */
+ mtr->memcpy(*cursor->block,
+ my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER +
+ cursor->block->frame),
+ my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER +
+ page_zip->data),
+ PAGE_N_RECS - PAGE_LAST_INSERT + 2);
+
+ /* 7. It remains to update the owner record. */
+ ulint n_owned;
+
+ while (!(n_owned = rec_get_n_owned_new(next_rec)))
+ next_rec= page_rec_get_next_low(next_rec, true);
+
+ rec_set_bit_field_1(const_cast<rec_t*>(next_rec), n_owned + 1,
+ REC_NEW_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+ page_zip_dir_insert(cursor, free_rec, insert_rec, mtr);
+
+ /* 8. Now we have incremented the n_owned field of the owner
+ record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
+ we have to split the corresponding directory slot in two. */
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+ page_zip_dir_split_slot(cursor->block,
+ page_dir_find_owner_slot(next_rec), mtr);
+
+ page_zip_write_rec(cursor->block, insert_rec, index, offsets, 1, mtr);
+ return insert_rec;
+}
+
+/** Prepend a record to the PAGE_FREE list, or shrink PAGE_HEAP_TOP.
+@param[in,out] block index page
+@param[in,out] rec record being deleted
+@param[in] data_size record payload size, in bytes
+@param[in] extra_size record header size, in bytes */
+static void page_mem_free(const buf_block_t &block, rec_t *rec,
+ size_t data_size, size_t extra_size)
+{
+ ut_ad(page_align(rec) == block.frame);
+ ut_ad(!block.page.zip.data);
+ const rec_t *free= page_header_get_ptr(block.frame, PAGE_FREE);
+
+ const uint16_t n_heap= uint16_t(page_header_get_field(block.frame,
+ PAGE_N_HEAP) - 1);
+ ut_ad(page_get_n_recs(block.frame) < (n_heap & 0x7fff));
+ const bool deleting_top= n_heap == ((n_heap & 0x8000)
+ ? (rec_get_heap_no_new(rec) | 0x8000)
+ : rec_get_heap_no_old(rec));
+
+ if (deleting_top)
+ {
+ byte *page_heap_top= my_assume_aligned<2>(PAGE_HEAP_TOP + PAGE_HEADER +
+ block.frame);
+ const uint16_t heap_top= mach_read_from_2(page_heap_top);
+ const size_t extra_savings= heap_top - page_offset(rec + data_size);
+ ut_ad(extra_savings < heap_top);
+
+ /* When deleting the last record, do not add it to the PAGE_FREE list.
+ Instead, decrement PAGE_HEAP_TOP and PAGE_N_HEAP. */
+ mach_write_to_2(page_heap_top, page_offset(rec - extra_size));
+ mach_write_to_2(my_assume_aligned<2>(page_heap_top + 2), n_heap);
+ static_assert(PAGE_N_HEAP == PAGE_HEAP_TOP + 2, "compatibility");
+ if (extra_savings)
+ {
+ byte *page_garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
+ block.frame);
+ uint16_t garbage= mach_read_from_2(page_garbage);
+ ut_ad(garbage >= extra_savings);
+ mach_write_to_2(page_garbage, garbage - extra_savings);
+ }
+ }
+ else
+ {
+ byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+ block.frame);
+ byte *page_garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
+ block.frame);
+ mach_write_to_2(page_free, page_offset(rec));
+ mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) +
+ extra_size + data_size);
+ }
+
+ memset_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER + block.frame, 0, 2);
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ block.frame);
+ mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) - 1);
+
+ const byte* const end= rec + data_size;
+
+ if (!deleting_top)
+ {
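+    /* The next-record pointer is format dependent: ROW_FORMAT=COMPACT
+    and DYNAMIC (n_heap & 0x8000) store it relative to the current
+    record, while ROW_FORMAT=REDUNDANT stores an absolute offset
+    within the page. */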
+ uint16_t next= free
+ ? ((n_heap & 0x8000)
+ ? static_cast<uint16_t>(free - rec)
+ : static_cast<uint16_t>(free - block.frame))
+ : uint16_t{0};
+ mach_write_to_2(rec - REC_NEXT, next);
+ }
+ else
+ rec-= extra_size;
+
+ memset(rec, 0, end - rec);
+}
+
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the next
+record after the deleted one. */
+void
+page_cur_delete_rec(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(
+ cursor->rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_dir_slot_t* cur_dir_slot;
+ rec_t* current_rec;
+ rec_t* prev_rec = NULL;
+ rec_t* next_rec;
+ ulint cur_slot_no;
+ ulint cur_n_owned;
+ rec_t* rec;
+
+ /* page_zip_validate() will fail here when
+ btr_cur_pessimistic_delete() invokes btr_set_min_rec_mark().
+ Then, both "page_zip" and "block->frame" would have the min-rec-mark
+ set on the smallest user record, but "block->frame" would additionally
+ have it set on the smallest-but-one record. Because sloppy
+ page_zip_validate_low() only ignores min-rec-flag differences
+ in the smallest user record, it cannot be used here either. */
+
+ current_rec = cursor->rec;
+ buf_block_t* const block = cursor->block;
+ ut_ad(rec_offs_validate(current_rec, index, offsets));
+ ut_ad(!!page_is_comp(block->frame) == index->table->not_redundant());
+ ut_ad(fil_page_index_page_check(block->frame));
+ ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->frame)
+ == index->id
+ || mtr->is_inside_ibuf());
+ ut_ad(mtr->is_named_space(index->table->space));
+
+ /* The record must not be the supremum or infimum record. */
+ ut_ad(page_rec_is_user_rec(current_rec));
+
+ if (page_get_n_recs(block->frame) == 1
+ && !rec_is_alter_metadata(current_rec, *index)) {
+ /* Empty the page. */
+ ut_ad(page_is_leaf(block->frame));
+ /* Usually, this should be the root page,
+ and the whole index tree should become empty.
+ However, this could also be a call in
+ btr_cur_pessimistic_update() to delete the only
+ record in the page and to insert another one. */
+ page_cur_move_to_next(cursor);
+ ut_ad(page_cur_is_after_last(cursor));
+ page_create_empty(page_cur_get_block(cursor),
+ const_cast<dict_index_t*>(index), mtr);
+ return;
+ }
+
+ /* Save to local variables some data associated with current_rec */
+ cur_slot_no = page_dir_find_owner_slot(current_rec);
+ ut_ad(cur_slot_no > 0);
+ cur_dir_slot = page_dir_get_nth_slot(block->frame, cur_slot_no);
+ cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot);
+
+ /* The page gets invalid for btr_pcur_restore_pos().
+ We avoid invoking buf_block_modify_clock_inc(block) because its
+ consistency checks would fail for the dummy block that is being
+ used during IMPORT TABLESPACE. */
+ block->modify_clock++;
+
+ /* Find the next and the previous record. Note that the cursor is
+ left at the next record. */
+
+ rec = const_cast<rec_t*>
+ (page_dir_slot_get_rec(cur_dir_slot + PAGE_DIR_SLOT_SIZE));
+
+ /* rec now points to the record of the previous directory slot. Look
+ for the immediate predecessor of current_rec in a loop. */
+
+ while (current_rec != rec) {
+ prev_rec = rec;
+ rec = page_rec_get_next(rec);
+ }
+
+ page_cur_move_to_next(cursor);
+ next_rec = cursor->rec;
+
+ /* Remove the record from the linked list of records */
+ /* If the deleted record is pointed to by a dir slot, update the
+ record pointer in slot. In the following if-clause we assume that
+ prev_rec is owned by the same slot, i.e., PAGE_DIR_SLOT_MIN_N_OWNED
+ >= 2. */
+ /* Update the number of owned records of the slot */
+
+ compile_time_assert(PAGE_DIR_SLOT_MIN_N_OWNED >= 2);
+ ut_ad(cur_n_owned > 1);
+
+ rec_t* slot_rec = const_cast<rec_t*>
+ (page_dir_slot_get_rec(cur_dir_slot));
+
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ ut_ad(page_is_comp(block->frame));
+ if (current_rec == slot_rec) {
+ page_zip_rec_set_owned(block, prev_rec, 1, mtr);
+ page_zip_rec_set_owned(block, slot_rec, 0, mtr);
+ slot_rec = prev_rec;
+ mach_write_to_2(cur_dir_slot, page_offset(slot_rec));
+ } else if (cur_n_owned == 1
+ && !page_rec_is_supremum(slot_rec)) {
+ page_zip_rec_set_owned(block, slot_rec, 0, mtr);
+ }
+
+ mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>
+ (next_rec - prev_rec));
+ slot_rec[-REC_NEW_N_OWNED] = static_cast<byte>(
+ (slot_rec[-REC_NEW_N_OWNED] & ~REC_N_OWNED_MASK)
+ | (cur_n_owned - 1) << REC_N_OWNED_SHIFT);
+
+ page_header_reset_last_insert(block, mtr);
+ page_zip_dir_delete(block, rec, index, offsets,
+ page_header_get_ptr(block->frame,
+ PAGE_FREE),
+ mtr);
+ if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
+ page_zip_dir_balance_slot(block, cur_slot_no, mtr);
+ }
+ return;
+ }
+
+ if (current_rec == slot_rec) {
+ slot_rec = prev_rec;
+ mach_write_to_2(cur_dir_slot, page_offset(slot_rec));
+ }
+
+ const size_t data_size = rec_offs_data_size(offsets);
+ const size_t extra_size = rec_offs_extra_size(offsets);
+
+ if (page_is_comp(block->frame)) {
+ mtr->page_delete(*block, page_offset(prev_rec)
+ - PAGE_NEW_INFIMUM,
+ extra_size - REC_N_NEW_EXTRA_BYTES,
+ data_size);
+ mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>
+ (next_rec - prev_rec));
+ slot_rec[-REC_NEW_N_OWNED] = static_cast<byte>(
+ (slot_rec[-REC_NEW_N_OWNED] & ~REC_N_OWNED_MASK)
+ | (cur_n_owned - 1) << REC_N_OWNED_SHIFT);
+ } else {
+ mtr->page_delete(*block, page_offset(prev_rec)
+ - PAGE_OLD_INFIMUM);
+ memcpy(prev_rec - REC_NEXT, current_rec - REC_NEXT, 2);
+ slot_rec[-REC_OLD_N_OWNED] = static_cast<byte>(
+ (slot_rec[-REC_OLD_N_OWNED] & ~REC_N_OWNED_MASK)
+ | (cur_n_owned - 1) << REC_N_OWNED_SHIFT);
+ }
+
+ page_mem_free(*block, current_rec, data_size, extra_size);
+
+ /* Now we have decremented the number of owned records of the slot.
+ If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the
+ slots. */
+
+ if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
+ page_dir_balance_slot(*block, cur_slot_no);
+ }
+
+ ut_ad(page_is_comp(block->frame)
+ ? page_simple_validate_new(block->frame)
+ : page_simple_validate_old(block->frame));
+}
+
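The predecessor search in page_cur_delete_rec() relies on the records forming a singly-linked list: the code starts from the record owned by the previous directory slot and follows next-pointers until it reaches the victim. A minimal sketch with toy structures (illustrative names, not InnoDB types):

#include <cassert>
#include <cstddef>

struct ToyRec { ToyRec *next; int key; };

// Walk from slot_rec (the record owned by the previous directory slot)
// toward victim; the record visited just before it is the predecessor.
ToyRec *find_predecessor(ToyRec *slot_rec, const ToyRec *victim)
{
  ToyRec *prev = nullptr;
  for (ToyRec *r = slot_rec; r != victim; r = r->next)
    prev = r;
  return prev; // nullptr when victim itself is the slot record
}

int main()
{
  ToyRec c{nullptr, 3}, b{&c, 2}, a{&b, 1};
  assert(find_predecessor(&a, &c) == &b);
  assert(find_predecessor(&a, &a) == nullptr);
}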
+/** Apply an INSERT_HEAP_REDUNDANT or INSERT_REUSE_REDUNDANT record that
+was written by page_cur_insert_rec_low() for a ROW_FORMAT=REDUNDANT page.
+@param block B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@param enc_hdr encoded fixed-size header bits
+@param hdr_c number of common record header bytes with prev
+@param data_c number of common data bytes with prev
+@param data literal header and data bytes
+@param data_len length of the literal data, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_insert_redundant(const buf_block_t &block, bool reuse,
+ ulint prev, ulint enc_hdr,
+ size_t hdr_c, size_t data_c,
+ const void *data, size_t data_len)
+{
+ const uint16_t n_slots= page_dir_get_n_slots(block.frame);
+ byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER +
+ block.frame);
+ const uint16_t h= mach_read_from_2(page_n_heap);
+ const page_id_t id(block.page.id());
+ if (UNIV_UNLIKELY(n_slots < 2 || h < n_slots || h < PAGE_HEAP_NO_USER_LOW ||
+ h >= srv_page_size / REC_N_OLD_EXTRA_BYTES ||
+ !fil_page_index_page_check(block.frame) ||
+ page_get_page_no(block.frame) != id.page_no() ||
+ mach_read_from_2(my_assume_aligned<2>
+ (PAGE_OLD_SUPREMUM - REC_NEXT +
+ block.frame))))
+ {
+corrupted:
+ ib::error() << (reuse
+ ? "Not applying INSERT_REUSE_REDUNDANT"
+ " due to corruption on "
+ : "Not applying INSERT_HEAP_REDUNDANT"
+ " due to corruption on ")
+ << id;
+ return true;
+ }
+
+ byte * const last_slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
+ byte * const page_heap_top= my_assume_aligned<2>
+ (PAGE_HEAP_TOP + PAGE_HEADER + block.frame);
+ const byte *const heap_bot= &block.frame[PAGE_OLD_SUPREMUM_END];
+ byte *heap_top= block.frame + mach_read_from_2(page_heap_top);
+ if (UNIV_UNLIKELY(heap_bot > heap_top || heap_top > last_slot))
+ goto corrupted;
+ if (UNIV_UNLIKELY(mach_read_from_2(last_slot) != PAGE_OLD_SUPREMUM))
+ goto corrupted;
+ if (UNIV_UNLIKELY(mach_read_from_2(page_dir_get_nth_slot(block.frame, 0)) !=
+ PAGE_OLD_INFIMUM))
+ goto corrupted;
+ rec_t * const prev_rec= block.frame + PAGE_OLD_INFIMUM + prev;
+ if (!prev);
+ else if (UNIV_UNLIKELY(heap_bot + (REC_N_OLD_EXTRA_BYTES + 1) > prev_rec ||
+ prev_rec > heap_top))
+ goto corrupted;
+ const ulint pn_fields= rec_get_bit_field_2(prev_rec, REC_OLD_N_FIELDS,
+ REC_OLD_N_FIELDS_MASK,
+ REC_OLD_N_FIELDS_SHIFT);
+ if (UNIV_UNLIKELY(pn_fields == 0 || pn_fields > REC_MAX_N_FIELDS))
+ goto corrupted;
+ const ulint pextra_size= REC_N_OLD_EXTRA_BYTES +
+ (rec_get_1byte_offs_flag(prev_rec) ? pn_fields : pn_fields * 2);
+ if (prev_rec == &block.frame[PAGE_OLD_INFIMUM]);
+ else if (UNIV_UNLIKELY(prev_rec - pextra_size < heap_bot))
+ goto corrupted;
+ if (UNIV_UNLIKELY(hdr_c && prev_rec - hdr_c < heap_bot))
+ goto corrupted;
+ const ulint pdata_size= rec_get_data_size_old(prev_rec);
+ if (UNIV_UNLIKELY(prev_rec + pdata_size > heap_top))
+ goto corrupted;
+ rec_t * const next_rec= block.frame + mach_read_from_2(prev_rec - REC_NEXT);
+ if (next_rec == block.frame + PAGE_OLD_SUPREMUM);
+ else if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > next_rec ||
+ next_rec > heap_top))
+ goto corrupted;
+ const bool is_short= (enc_hdr >> 2) & 1;
+ const ulint n_fields= (enc_hdr >> 3) + 1;
+ if (UNIV_UNLIKELY(n_fields > REC_MAX_N_FIELDS))
+ goto corrupted;
+ const ulint extra_size= REC_N_OLD_EXTRA_BYTES +
+ (is_short ? n_fields : n_fields * 2);
+ hdr_c+= REC_N_OLD_EXTRA_BYTES;
+ if (UNIV_UNLIKELY(hdr_c > extra_size))
+ goto corrupted;
+ if (UNIV_UNLIKELY(extra_size - hdr_c > data_len))
+ goto corrupted;
+  /* We buffer all changes to the record header locally, so that
+  we avoid modifying the page before all consistency checks
+  have passed. */
+ alignas(2) byte insert_buf[REC_N_OLD_EXTRA_BYTES + REC_MAX_N_FIELDS * 2];
+
+ ulint n_owned;
+ rec_t *owner_rec= next_rec;
+ for (ulint ns= PAGE_DIR_SLOT_MAX_N_OWNED;
+ !(n_owned= rec_get_n_owned_old(owner_rec)); )
+ {
+ owner_rec= block.frame + mach_read_from_2(owner_rec - REC_NEXT);
+ if (owner_rec == &block.frame[PAGE_OLD_SUPREMUM]);
+ else if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > owner_rec ||
+ owner_rec > heap_top))
+ goto corrupted;
+ if (!ns--)
+ goto corrupted; /* Corrupted (cyclic?) next-record list */
+ }
+
+ page_dir_slot_t *owner_slot= last_slot;
+
+ if (n_owned > PAGE_DIR_SLOT_MAX_N_OWNED)
+ goto corrupted;
+ else
+ {
+ mach_write_to_2(insert_buf, owner_rec - block.frame);
+ static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+ const page_dir_slot_t * const first_slot=
+ page_dir_get_nth_slot(block.frame, 0);
+
+ while (memcmp_aligned<2>(owner_slot, insert_buf, 2))
+ if ((owner_slot+= 2) == first_slot)
+ goto corrupted;
+ }
+
+ memcpy(insert_buf, data, extra_size - hdr_c);
+ byte *insert_rec= &insert_buf[extra_size];
+ memcpy(insert_rec - hdr_c, prev_rec - hdr_c, hdr_c);
+ rec_set_bit_field_1(insert_rec, (enc_hdr & 3) << 4,
+ REC_OLD_INFO_BITS, REC_INFO_BITS_MASK,
+ REC_INFO_BITS_SHIFT);
+ rec_set_1byte_offs_flag(insert_rec, is_short);
+ rec_set_n_fields_old(insert_rec, n_fields);
+ rec_set_bit_field_1(insert_rec, 0, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+ const ulint data_size= rec_get_data_size_old(insert_rec);
+ if (UNIV_UNLIKELY(data_c > data_size))
+ goto corrupted;
+ if (UNIV_UNLIKELY(extra_size - hdr_c + data_size - data_c != data_len))
+ goto corrupted;
+
+ /* Perform final consistency checks and then apply the change to the page. */
+ byte *buf;
+ if (reuse)
+ {
+ byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+ block.frame);
+ rec_t *free_rec= block.frame + mach_read_from_2(page_free);
+ if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > free_rec ||
+ free_rec > heap_top))
+ goto corrupted;
+ const ulint fn_fields= rec_get_n_fields_old(free_rec);
+ const ulint fextra_size= REC_N_OLD_EXTRA_BYTES +
+ (rec_get_1byte_offs_flag(free_rec) ? fn_fields : fn_fields * 2);
+ if (UNIV_UNLIKELY(free_rec - fextra_size < heap_bot))
+ goto corrupted;
+ const ulint fdata_size= rec_get_data_size_old(free_rec);
+ if (UNIV_UNLIKELY(free_rec + fdata_size > heap_top))
+ goto corrupted;
+ if (UNIV_UNLIKELY(extra_size + data_size > fextra_size + fdata_size))
+ goto corrupted;
+ byte *page_garbage= my_assume_aligned<2>(page_free + 2);
+ if (UNIV_UNLIKELY(mach_read_from_2(page_garbage) <
+ fextra_size + fdata_size))
+ goto corrupted;
+ buf= free_rec - fextra_size;
+ const rec_t *const next_free= block.frame +
+ mach_read_from_2(free_rec - REC_NEXT);
+ if (next_free == block.frame);
+ else if (UNIV_UNLIKELY(next_free < &heap_bot[REC_N_OLD_EXTRA_BYTES + 1] ||
+ heap_top < next_free))
+ goto corrupted;
+ mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) -
+ extra_size - data_size);
+ rec_set_bit_field_2(insert_rec, rec_get_heap_no_old(free_rec),
+ REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ memcpy(page_free, free_rec - REC_NEXT, 2);
+ }
+ else
+ {
+ if (UNIV_UNLIKELY(heap_top + extra_size + data_size > last_slot))
+ goto corrupted;
+ rec_set_bit_field_2(insert_rec, h,
+ REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ mach_write_to_2(page_n_heap, h + 1);
+ mach_write_to_2(page_heap_top,
+ mach_read_from_2(page_heap_top) + extra_size + data_size);
+ buf= heap_top;
+ }
+
+ ut_ad(data_size - data_c == data_len - (extra_size - hdr_c));
+ byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER +
+ block.frame);
+ const uint16_t last_insert= mach_read_from_2(page_last_insert);
+ memcpy(buf, insert_buf, extra_size);
+ buf+= extra_size;
+ mach_write_to_2(page_last_insert, buf - block.frame);
+ memcpy(prev_rec - REC_NEXT, page_last_insert, 2);
+ memcpy(buf, prev_rec, data_c);
+ memcpy(buf + data_c, static_cast<const byte*>(data) + (extra_size - hdr_c),
+ data_len - (extra_size - hdr_c));
+ rec_set_bit_field_1(owner_rec, n_owned + 1, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+ /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */
+ if (block.frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
+ {
+ byte *dir= &block.frame[PAGE_DIRECTION_B + PAGE_HEADER];
+ byte *n_dir= my_assume_aligned<2>
+ (&block.frame[PAGE_N_DIRECTION + PAGE_HEADER]);
+ if (UNIV_UNLIKELY(!last_insert))
+ {
+no_direction:
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION);
+ memset(n_dir, 0, 2);
+ }
+ else if (block.frame + last_insert == prev_rec &&
+ (*dir & ((1U << 3) - 1)) != PAGE_LEFT)
+ {
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT);
+inc_dir:
+ mach_write_to_2(n_dir, mach_read_from_2(n_dir) + 1);
+ }
+ else if (next_rec == block.frame + last_insert &&
+ (*dir & ((1U << 3) - 1)) != PAGE_RIGHT)
+ {
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT);
+ goto inc_dir;
+ }
+ else
+ goto no_direction;
+ }
+
+ /* Update PAGE_N_RECS. */
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ block.frame);
+
+ mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1);
+
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+ page_dir_split_slot(block, owner_slot);
+ ut_ad(page_simple_validate_old(block.frame));
+ return false;
+}
+
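page_apply_insert_redundant() reconstructs a record that was logged with prefix compression: only the header and data bytes that differ from the predecessor are carried in the log, while hdr_c and data_c count the shared bytes. A minimal sketch of the stitching, using std::string in place of page memory (illustrative, not the redo record format):

#include <cassert>
#include <cstddef>
#include <string>

// Rebuild a full record from its predecessor plus the logged literal
// bytes. hdr_c bytes at the end of the header and data_c bytes at the
// start of the data are shared with the predecessor and were not logged.
std::string apply_insert(const std::string &prev_header,
                         const std::string &prev_data,
                         size_t hdr_c, size_t data_c,
                         const std::string &literal_header,
                         const std::string &literal_data)
{
  std::string header =
      literal_header + prev_header.substr(prev_header.size() - hdr_c);
  std::string data = prev_data.substr(0, data_c) + literal_data;
  return header + data;
}

int main()
{
  // predecessor: header "HHhh", data "ABCD"; the new record shares its
  // last two header bytes and first two data bytes with it
  assert(apply_insert("HHhh", "ABCD", 2, 2, "XY", "ZW") == "XYhhABZW");
}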
+/** Apply an INSERT_HEAP_DYNAMIC or INSERT_REUSE_DYNAMIC record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param shift if reuse: number of bytes the PAGE_FREE record is moving
+(sign in the least significant bit)
+@param enc_hdr_l number of copied record header bytes, plus record type bits
+@param hdr_c number of common record header bytes with prev
+@param data_c number of common data bytes with prev
+@param data literal header and data bytes
+@param data_len length of the literal data, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_insert_dynamic(const buf_block_t &block, bool reuse,
+ ulint prev, ulint shift, ulint enc_hdr_l,
+ size_t hdr_c, size_t data_c,
+ const void *data, size_t data_len)
+{
+ const uint16_t n_slots= page_dir_get_n_slots(block.frame);
+ byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER +
+ block.frame);
+ ulint h= mach_read_from_2(page_n_heap);
+ const page_id_t id(block.page.id());
+ if (UNIV_UNLIKELY(n_slots < 2 || h < (PAGE_HEAP_NO_USER_LOW | 0x8000) ||
+ (h & 0x7fff) >= srv_page_size / REC_N_NEW_EXTRA_BYTES ||
+ (h & 0x7fff) < n_slots ||
+ !fil_page_index_page_check(block.frame) ||
+ page_get_page_no(block.frame) != id.page_no() ||
+ mach_read_from_2(my_assume_aligned<2>
+ (PAGE_NEW_SUPREMUM - REC_NEXT +
+ block.frame)) ||
+ ((enc_hdr_l & REC_STATUS_INSTANT) &&
+ !page_is_leaf(block.frame)) ||
+ (enc_hdr_l >> 3) > data_len))
+ {
+corrupted:
+ ib::error() << (reuse
+ ? "Not applying INSERT_REUSE_DYNAMIC"
+ " due to corruption on "
+ : "Not applying INSERT_HEAP_DYNAMIC"
+ " due to corruption on ")
+ << id;
+ return true;
+ }
+
+ byte * const last_slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
+ byte * const page_heap_top= my_assume_aligned<2>
+ (PAGE_HEAP_TOP + PAGE_HEADER + block.frame);
+ const byte *const heap_bot= &block.frame[PAGE_NEW_SUPREMUM_END];
+ byte *heap_top= block.frame + mach_read_from_2(page_heap_top);
+ if (UNIV_UNLIKELY(heap_bot > heap_top || heap_top > last_slot))
+ goto corrupted;
+ if (UNIV_UNLIKELY(mach_read_from_2(last_slot) != PAGE_NEW_SUPREMUM))
+ goto corrupted;
+ if (UNIV_UNLIKELY(mach_read_from_2(page_dir_get_nth_slot(block.frame, 0)) !=
+ PAGE_NEW_INFIMUM))
+ goto corrupted;
+
+ uint16_t n= static_cast<uint16_t>(PAGE_NEW_INFIMUM + prev);
+ rec_t *prev_rec= block.frame + n;
+ n= static_cast<uint16_t>(n + mach_read_from_2(prev_rec - REC_NEXT));
+ if (!prev);
+ else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > prev_rec ||
+ prev_rec > heap_top))
+ goto corrupted;
+
+ rec_t * const next_rec= block.frame + n;
+ if (next_rec == block.frame + PAGE_NEW_SUPREMUM);
+ else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > next_rec ||
+ next_rec > heap_top))
+ goto corrupted;
+
+ ulint n_owned;
+ rec_t *owner_rec= next_rec;
+ n= static_cast<uint16_t>(next_rec - block.frame);
+
+ for (ulint ns= PAGE_DIR_SLOT_MAX_N_OWNED;
+ !(n_owned= rec_get_n_owned_new(owner_rec)); )
+ {
+ n= static_cast<uint16_t>(n + mach_read_from_2(owner_rec - REC_NEXT));
+ owner_rec= block.frame + n;
+ if (n == PAGE_NEW_SUPREMUM);
+ else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > owner_rec ||
+ owner_rec > heap_top))
+ goto corrupted;
+ if (!ns--)
+ goto corrupted; /* Corrupted (cyclic?) next-record list */
+ }
+
+ page_dir_slot_t* owner_slot= last_slot;
+
+ if (n_owned > PAGE_DIR_SLOT_MAX_N_OWNED)
+ goto corrupted;
+ else
+ {
+ static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+ alignas(2) byte slot_buf[2];
+ mach_write_to_2(slot_buf, owner_rec - block.frame);
+ const page_dir_slot_t * const first_slot=
+ page_dir_get_nth_slot(block.frame, 0);
+
+ while (memcmp_aligned<2>(owner_slot, slot_buf, 2))
+ if ((owner_slot+= 2) == first_slot)
+ goto corrupted;
+ }
+
+ const ulint extra_size= REC_N_NEW_EXTRA_BYTES + hdr_c + (enc_hdr_l >> 3);
+ const ulint data_size= data_c + data_len - (enc_hdr_l >> 3);
+
+ /* Perform final consistency checks and then apply the change to the page. */
+ byte *buf;
+ if (reuse)
+ {
+ byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+ block.frame);
+ rec_t *free_rec= block.frame + mach_read_from_2(page_free);
+ if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > free_rec ||
+ free_rec > heap_top))
+ goto corrupted;
+ buf= free_rec - extra_size;
+ if (shift & 1)
+ buf-= shift >> 1;
+ else
+ buf+= shift >> 1;
+
+ if (UNIV_UNLIKELY(heap_bot > buf ||
+ &buf[extra_size + data_size] > heap_top))
+ goto corrupted;
+ byte *page_garbage= my_assume_aligned<2>(page_free + 2);
+ if (UNIV_UNLIKELY(mach_read_from_2(page_garbage) < extra_size + data_size))
+ goto corrupted;
+ if ((n= mach_read_from_2(free_rec - REC_NEXT)) != 0)
+ {
+ n= static_cast<uint16_t>(n + free_rec - block.frame);
+ if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
+ heap_top < block.frame + n))
+ goto corrupted;
+ }
+ mach_write_to_2(page_free, n);
+ mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) -
+ (extra_size + data_size));
+ h= rec_get_heap_no_new(free_rec);
+ }
+ else
+ {
+ if (UNIV_UNLIKELY(heap_top + extra_size + data_size > last_slot))
+ goto corrupted;
+ mach_write_to_2(page_n_heap, h + 1);
+ h&= 0x7fff;
+ mach_write_to_2(page_heap_top,
+ mach_read_from_2(page_heap_top) + extra_size + data_size);
+ buf= heap_top;
+ }
+
+ memcpy(buf, data, (enc_hdr_l >> 3));
+ buf+= enc_hdr_l >> 3;
+ data_len-= enc_hdr_l >> 3;
+ data= &static_cast<const byte*>(data)[enc_hdr_l >> 3];
+
+ memcpy(buf, prev_rec - REC_N_NEW_EXTRA_BYTES - hdr_c, hdr_c);
+ buf+= hdr_c;
+ *buf++= static_cast<byte>((enc_hdr_l & 3) << 4); /* info_bits; n_owned=0 */
+ *buf++= static_cast<byte>(h >> 5); /* MSB of heap number */
+ h= (h & ((1U << 5) - 1)) << 3;
+ static_assert(REC_STATUS_ORDINARY == 0, "compatibility");
+ static_assert(REC_STATUS_INSTANT == 4, "compatibility");
+ if (page_is_leaf(block.frame))
+ h|= enc_hdr_l & REC_STATUS_INSTANT;
+ else
+ {
+ ut_ad(!(enc_hdr_l & REC_STATUS_INSTANT)); /* Checked at the start */
+ h|= REC_STATUS_NODE_PTR;
+ }
+ *buf++= static_cast<byte>(h); /* LSB of heap number, and status */
+ static_assert(REC_NEXT == 2, "compatibility");
+ buf+= REC_NEXT;
+ mach_write_to_2(buf - REC_NEXT, static_cast<uint16_t>(next_rec - buf));
+ byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER +
+ block.frame);
+ const uint16_t last_insert= mach_read_from_2(page_last_insert);
+ mach_write_to_2(page_last_insert, buf - block.frame);
+ mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>(buf - prev_rec));
+ memcpy(buf, prev_rec, data_c);
+ buf+= data_c;
+ memcpy(buf, data, data_len);
+
+ rec_set_bit_field_1(owner_rec, n_owned + 1, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+ /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */
+ if (block.frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
+ {
+ byte *dir= &block.frame[PAGE_DIRECTION_B + PAGE_HEADER];
+ byte *n_dir= my_assume_aligned<2>
+ (&block.frame[PAGE_N_DIRECTION + PAGE_HEADER]);
+ if (UNIV_UNLIKELY(!last_insert))
+ {
+no_direction:
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION);
+ memset(n_dir, 0, 2);
+ }
+ else if (block.frame + last_insert == prev_rec &&
+ (*dir & ((1U << 3) - 1)) != PAGE_LEFT)
+ {
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT);
+inc_dir:
+ mach_write_to_2(n_dir, mach_read_from_2(n_dir) + 1);
+ }
+ else if (next_rec == block.frame + last_insert &&
+ (*dir & ((1U << 3) - 1)) != PAGE_RIGHT)
+ {
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT);
+ goto inc_dir;
+ }
+ else
+ goto no_direction;
+ }
+
+ /* Update PAGE_N_RECS. */
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ block.frame);
+
+ mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1);
+
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+ page_dir_split_slot(block, owner_slot);
+ ut_ad(page_simple_validate_new(block.frame));
+ return false;
+}
+
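The shift argument of page_apply_insert_dynamic() packs a signed byte distance into an unsigned value: judging by the "if (shift & 1)" branch above, bit 0 carries the sign and the remaining bits the magnitude. A sketch of that round-trip (assumed encoding, illustrative names):

#include <cassert>

unsigned encode_shift(long d) // signed distance -> logged value
{
  return d < 0 ? (unsigned(-d) << 1) | 1 : unsigned(d) << 1;
}

long decode_shift(unsigned shift) // logged value -> signed distance
{
  return (shift & 1) ? -long(shift >> 1) : long(shift >> 1);
}

int main()
{
  for (long d : {-40L, 0L, 7L})
    assert(decode_shift(encode_shift(d)) == d);
}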
+/** Apply a DELETE_ROW_FORMAT_REDUNDANT record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=REDUNDANT page.
+@param block B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param prev byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_redundant(const buf_block_t &block, ulint prev)
+{
+ const uint16_t n_slots= page_dir_get_n_slots(block.frame);
+ ulint n_recs= page_get_n_recs(block.frame);
+ const page_id_t id(block.page.id());
+
+ if (UNIV_UNLIKELY(!n_recs || n_slots < 2 ||
+ !fil_page_index_page_check(block.frame) ||
+ page_get_page_no(block.frame) != id.page_no() ||
+ mach_read_from_2(my_assume_aligned<2>
+ (PAGE_OLD_SUPREMUM - REC_NEXT +
+ block.frame)) ||
+ page_is_comp(block.frame)))
+ {
+corrupted:
+ ib::error() << "Not applying DELETE_ROW_FORMAT_REDUNDANT"
+ " due to corruption on " << id;
+ return true;
+ }
+
+ byte *slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
+ rec_t *prev_rec= block.frame + PAGE_OLD_INFIMUM + prev;
+ if (UNIV_UNLIKELY(prev_rec > slot))
+ goto corrupted;
+ uint16_t n= mach_read_from_2(prev_rec - REC_NEXT);
+ rec_t *rec= block.frame + n;
+ if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES ||
+ slot < rec))
+ goto corrupted;
+ const ulint extra_size= REC_N_OLD_EXTRA_BYTES + rec_get_n_fields_old(rec) *
+ (rec_get_1byte_offs_flag(rec) ? 1 : 2);
+ const ulint data_size= rec_get_data_size_old(rec);
+ if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + extra_size ||
+ slot < rec + data_size))
+ goto corrupted;
+
+ n= mach_read_from_2(rec - REC_NEXT);
+ rec_t *next= block.frame + n;
+ if (n == PAGE_OLD_SUPREMUM);
+ else if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES ||
+ slot < next))
+ goto corrupted;
+
+ rec_t *s= rec;
+ ulint slot_owned;
+ for (ulint i= n_recs; !(slot_owned= rec_get_n_owned_old(s)); )
+ {
+ n= mach_read_from_2(s - REC_NEXT);
+ s= block.frame + n;
+ if (n == PAGE_OLD_SUPREMUM);
+ else if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES ||
+ slot < s))
+ goto corrupted;
+ if (UNIV_UNLIKELY(!i--)) /* Corrupted (cyclic?) next-record list */
+ goto corrupted;
+ }
+ slot_owned--;
+
+ /* The first slot is always pointing to the infimum record.
+ Find the directory slot pointing to s. */
+ const byte * const first_slot= block.frame + srv_page_size - (PAGE_DIR + 2);
+ alignas(2) byte slot_offs[2];
+ mach_write_to_2(slot_offs, s - block.frame);
+ static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+
+ while (memcmp_aligned<2>(slot, slot_offs, 2))
+ if ((slot+= 2) == first_slot)
+ goto corrupted;
+
+ if (rec == s)
+ {
+ s= prev_rec;
+ mach_write_to_2(slot, s - block.frame);
+ }
+
+ memcpy(prev_rec - REC_NEXT, rec - REC_NEXT, 2);
+ s-= REC_OLD_N_OWNED;
+ *s= static_cast<byte>((*s & ~REC_N_OWNED_MASK) |
+ slot_owned << REC_N_OWNED_SHIFT);
+ page_mem_free(block, rec, data_size, extra_size);
+
+ if (slot_owned < PAGE_DIR_SLOT_MIN_N_OWNED)
+ page_dir_balance_slot(block, (first_slot - slot) / 2);
+
+ ut_ad(page_simple_validate_old(block.frame));
+ return false;
+}
+
+/** Apply a DELETE_ROW_FORMAT_DYNAMIC record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param prev byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size data payload size, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev,
+ size_t hdr_size, size_t data_size)
+{
+ const uint16_t n_slots= page_dir_get_n_slots(block.frame);
+ ulint n_recs= page_get_n_recs(block.frame);
+ const page_id_t id(block.page.id());
+
+ if (UNIV_UNLIKELY(!n_recs || n_slots < 2 ||
+ !fil_page_index_page_check(block.frame) ||
+ page_get_page_no(block.frame) != id.page_no() ||
+ mach_read_from_2(my_assume_aligned<2>
+ (PAGE_NEW_SUPREMUM - REC_NEXT +
+ block.frame)) ||
+ !page_is_comp(block.frame)))
+ {
+corrupted:
+ ib::error() << "Not applying DELETE_ROW_FORMAT_DYNAMIC"
+ " due to corruption on " << id;
+ return true;
+ }
+
+ byte *slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
+ uint16_t n= static_cast<uint16_t>(PAGE_NEW_INFIMUM + prev);
+ rec_t *prev_rec= block.frame + n;
+ if (UNIV_UNLIKELY(prev_rec > slot))
+ goto corrupted;
+ n= static_cast<uint16_t>(n + mach_read_from_2(prev_rec - REC_NEXT));
+ rec_t *rec= block.frame + n;
+ if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
+ slot < rec))
+ goto corrupted;
+ const ulint extra_size= REC_N_NEW_EXTRA_BYTES + hdr_size;
+ if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + extra_size ||
+ slot < rec + data_size))
+ goto corrupted;
+ n= static_cast<uint16_t>(n + mach_read_from_2(rec - REC_NEXT));
+ rec_t *next= block.frame + n;
+ if (n == PAGE_NEW_SUPREMUM);
+ else if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
+ slot < next))
+ goto corrupted;
+
+ rec_t *s= rec;
+ n= static_cast<uint16_t>(rec - block.frame);
+ ulint slot_owned;
+ for (ulint i= n_recs; !(slot_owned= rec_get_n_owned_new(s)); )
+ {
+ const uint16_t next= mach_read_from_2(s - REC_NEXT);
+ if (UNIV_UNLIKELY(next < REC_N_NEW_EXTRA_BYTES ||
+ next > static_cast<uint16_t>(-REC_N_NEW_EXTRA_BYTES)))
+ goto corrupted;
+ n= static_cast<uint16_t>(n + next);
+ s= block.frame + n;
+ if (n == PAGE_NEW_SUPREMUM);
+ else if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
+ slot < s))
+ goto corrupted;
+ if (UNIV_UNLIKELY(!i--)) /* Corrupted (cyclic?) next-record list */
+ goto corrupted;
+ }
+ slot_owned--;
+
+ /* The first slot is always pointing to the infimum record.
+ Find the directory slot pointing to s. */
+ const byte * const first_slot= block.frame + srv_page_size - (PAGE_DIR + 2);
+ alignas(2) byte slot_offs[2];
+ mach_write_to_2(slot_offs, s - block.frame);
+ static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+
+ while (memcmp_aligned<2>(slot, slot_offs, 2))
+ if ((slot+= 2) == first_slot)
+ goto corrupted;
+
+ if (rec == s)
+ {
+ s= prev_rec;
+ mach_write_to_2(slot, s - block.frame);
+ }
+
+ mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>(next - prev_rec));
+ s-= REC_NEW_N_OWNED;
+ *s= static_cast<byte>((*s & ~REC_N_OWNED_MASK) |
+ slot_owned << REC_N_OWNED_SHIFT);
+ page_mem_free(block, rec, data_size, extra_size);
+
+ if (slot_owned < PAGE_DIR_SLOT_MIN_N_OWNED)
+ page_dir_balance_slot(block, (first_slot - slot) / 2);
+
+ ut_ad(page_simple_validate_new(block.frame));
+ return false;
+}
+
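Both delete-apply functions guard the walk to the owning record against corruption: n_owned is zero for most records, so they follow next-pointers until a nonzero n_owned is found, but bound the number of steps so a cyclic next-record list is reported rather than looped on forever. A toy sketch of that guard:

#include <cassert>
#include <cstddef>

struct ToyRec { const ToyRec *next; unsigned n_owned; };

// Follow next-pointers until a record with nonzero n_owned is found,
// but give up after `limit` steps so a cyclic list cannot hang us.
const ToyRec *find_owner(const ToyRec *r, size_t limit)
{
  while (!r->n_owned)
  {
    if (!limit--)
      return nullptr; // corrupted (cyclic?) next-record list
    r = r->next;
  }
  return r;
}

int main()
{
  ToyRec owner{nullptr, 3};
  ToyRec mid{&owner, 0}, first{&mid, 0};
  assert(find_owner(&first, 8) == &owner);
  ToyRec loop{&loop, 0}; // self-cycle must be reported, not followed
  assert(find_owner(&loop, 8) == nullptr);
}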
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/*******************************************************************//**
+Print the first n numbers generated by ut_rnd_gen(), to make sure
+(visually) that it works properly. */
+void
+test_ut_rnd_gen(
+ int n) /*!< in: print first n numbers */
+{
+ int i;
+ unsigned long long rnd;
+
+ for (i = 0; i < n; i++) {
+ rnd = ut_rnd_gen();
+ printf("%llu\t%%2=%llu %%3=%llu %%5=%llu %%7=%llu %%11=%llu\n",
+ rnd,
+ rnd % 2,
+ rnd % 3,
+ rnd % 5,
+ rnd % 7,
+ rnd % 11);
+ }
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc
new file mode 100644
index 00000000..9b83470e
--- /dev/null
+++ b/storage/innobase/page/page0page.cc
@@ -0,0 +1,2499 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file page/page0page.cc
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "page0page.h"
+#include "page0cur.h"
+#include "page0zip.h"
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "btr0btr.h"
+#include "srv0srv.h"
+#include "lock0lock.h"
+#include "fut0lst.h"
+#include "btr0sea.h"
+#include "trx0sys.h"
+#include <algorithm>
+
+/* THE INDEX PAGE
+ ==============
+
+The index page consists of a page header which contains the page's
+id and other information. On top of it are the index records
+in a heap, linked into a one-way linear list in alphabetical order.
+
+Just below the page end is an array of pointers, which we call the page
+directory, to about every sixth record in the list. The pointers are
+placed in the directory in the alphabetical order of the records pointed
+to, enabling us to perform a binary search using the array. Each slot I
+in the directory points to a record, where a 4-bit field contains a count
+of those records which are in the linear list between pointer I and
+pointer I - 1 in the directory, including the record
+pointed to by pointer I and not including the record pointed to by I - 1.
+We say that the record pointed to by slot I, or that slot I, owns
+these records. The count is always kept in the range 4 to 8, with
+the exception that it is 1 for the first slot, and 1 to 8 for the second slot.
+
+An essentially binary search can be performed in the list of index
+records, as if we had a pointer to every record in the
+page directory. The data structure is, however, more efficient for
+inserts, because most inserts are just pushed on a heap.
+Only every 8th insert requires a block move in the directory pointer
+table, which itself is quite small. A record is deleted from the page
+by just taking it off the linear list, updating the number-of-owned-
+records field of the record which owns it, and updating the page
+directory, if necessary. A special case is the one where the record
+owns itself. Because the overhead of inserts is so small, we may also
+increase the page size from the projected default of 8 kB to 64 kB
+without too much loss of insert efficiency. A bigger page becomes
+practical when the disk transfer rate rises relative to seek and
+latency time. On the present system, the page size is set so that the
+page transfer time (3 ms) is 20 % of the disk random access time (15 ms).
+
+When the page is split, merged, or becomes full but contains deleted
+records, we have to reorganize the page.
+
+Assuming a page size of 8 kB, a typical index page of a secondary
+index contains 300 index entries, and the size of the page directory
+is 50 x 4 bytes = 200 bytes. */
+
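A minimal, self-contained sketch of the search strategy the comment describes: binary-search the sparse directory (one pointer per few records), then scan linearly within the owned group. Integer keys stand in for records and the slot vector stands in for the page directory (illustrative only):

#include <cassert>
#include <cstddef>
#include <vector>

// Find the position of the first key >= target.
size_t page_search(const std::vector<int> &keys,
                   const std::vector<size_t> &slots, int target)
{
  // binary search over the sparse directory slots
  size_t lo = 0, hi = slots.size();
  while (hi - lo > 1)
  {
    size_t mid = (lo + hi) / 2;
    (keys[slots[mid]] < target ? lo : hi) = mid;
  }
  // linear scan within the group owned by the next slot
  size_t i = slots[lo];
  while (i < keys.size() && keys[i] < target)
    i++;
  return i;
}

int main()
{
  std::vector<int> keys = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29};
  std::vector<size_t> slots = {0, 4, 9}; // about every fifth record
  assert(page_search(keys, slots, 13) == 5);
  assert(page_search(keys, slots, 1) == 0);
  assert(page_search(keys, slots, 30) == keys.size());
}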
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return the directory slot number */
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+ const rec_t* rec) /*!< in: the physical record */
+{
+ ut_ad(page_rec_check(rec));
+
+ const page_t* page = page_align(rec);
+ const page_dir_slot_t* first_slot = page_dir_get_nth_slot(page, 0);
+ const page_dir_slot_t* slot = page_dir_get_nth_slot(
+ page, ulint(page_dir_get_n_slots(page)) - 1);
+ const rec_t* r = rec;
+
+ if (page_is_comp(page)) {
+ while (rec_get_n_owned_new(r) == 0) {
+ r = rec_get_next_ptr_const(r, TRUE);
+ ut_ad(r >= page + PAGE_NEW_SUPREMUM);
+ ut_ad(r < page + (srv_page_size - PAGE_DIR));
+ }
+ } else {
+ while (rec_get_n_owned_old(r) == 0) {
+ r = rec_get_next_ptr_const(r, FALSE);
+ ut_ad(r >= page + PAGE_OLD_SUPREMUM);
+ ut_ad(r < page + (srv_page_size - PAGE_DIR));
+ }
+ }
+
+ uint16 rec_offs_bytes = mach_encode_2(ulint(r - page));
+
+ while (UNIV_LIKELY(*(uint16*) slot != rec_offs_bytes)) {
+
+ if (UNIV_UNLIKELY(slot == first_slot)) {
+ ib::error() << "Probable data corruption on page "
+ << page_get_page_no(page)
+ << ". Original record on that page;";
+
+ if (page_is_comp(page)) {
+ fputs("(compact record)", stderr);
+ } else {
+ rec_print_old(stderr, rec);
+ }
+
+ ib::error() << "Cannot find the dir slot for this"
+ " record on that page;";
+
+ if (page_is_comp(page)) {
+ fputs("(compact record)", stderr);
+ } else {
+ rec_print_old(stderr, page
+ + mach_decode_2(rec_offs_bytes));
+ }
+
+ ut_error;
+ }
+
+ slot += PAGE_DIR_SLOT_SIZE;
+ }
+
+ return(((ulint) (first_slot - slot)) / PAGE_DIR_SLOT_SIZE);
+}
+
+/**************************************************************//**
+Used to check the consistency of a directory slot.
+@return TRUE if succeed */
+static
+ibool
+page_dir_slot_check(
+/*================*/
+ const page_dir_slot_t* slot) /*!< in: slot */
+{
+ const page_t* page;
+ ulint n_slots;
+ ulint n_owned;
+
+ ut_a(slot);
+
+ page = page_align(slot);
+
+ n_slots = page_dir_get_n_slots(page);
+
+ ut_a(slot <= page_dir_get_nth_slot(page, 0));
+ ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1));
+
+ ut_a(page_rec_check(page_dir_slot_get_rec(slot)));
+
+ if (page_is_comp(page)) {
+ n_owned = rec_get_n_owned_new(page_dir_slot_get_rec(slot));
+ } else {
+ n_owned = rec_get_n_owned_old(page_dir_slot_get_rec(slot));
+ }
+
+ if (slot == page_dir_get_nth_slot(page, 0)) {
+ ut_a(n_owned == 1);
+ } else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) {
+ ut_a(n_owned >= 1);
+ ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ } else {
+ ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED);
+ ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************//**
+Sets the max trx id field value. */
+void
+page_set_max_trx_id(
+/*================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+ ut_ad(!mtr || mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!page_zip || page_zip == &block->page.zip);
+ static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment");
+ byte *max_trx_id= my_assume_aligned<8>(PAGE_MAX_TRX_ID +
+ PAGE_HEADER + block->frame);
+
+ mtr->write<8>(*block, max_trx_id, trx_id);
+ if (UNIV_LIKELY_NULL(page_zip))
+ memcpy_aligned<8>(&page_zip->data[PAGE_MAX_TRX_ID + PAGE_HEADER],
+ max_trx_id, 8);
+}
+
+/** Persist the AUTO_INCREMENT value on a clustered index root page.
+@param[in,out] block clustered index root page
+@param[in] autoinc next available AUTO_INCREMENT value
+@param[in,out] mtr mini-transaction
+@param[in] reset whether to reset the AUTO_INCREMENT
+ to a possibly smaller value than currently
+ exists in the page */
+void
+page_set_autoinc(
+ buf_block_t* block,
+ ib_uint64_t autoinc,
+ mtr_t* mtr,
+ bool reset)
+{
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+
+ byte *field= my_assume_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC +
+ block->frame);
+ ib_uint64_t old= mach_read_from_8(field);
+ if (old == autoinc || (old > autoinc && !reset))
+ return; /* nothing to update */
+
+ mtr->write<8>(*block, field, autoinc);
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC + block->page.zip.data,
+ field, 8);
+}
+
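The update rule of page_set_autoinc() in isolation: the persisted counter only moves forward, unless the caller explicitly requests a reset, in which case it may also move backward. A toy sketch without redo logging or page I/O:

#include <cassert>
#include <cstdint>

void set_autoinc(uint64_t &persisted, uint64_t autoinc, bool reset)
{
  if (persisted == autoinc || (persisted > autoinc && !reset))
    return;            // nothing to update
  persisted = autoinc; // would be redo-logged via mtr->write<8>()
}

int main()
{
  uint64_t v = 100;
  set_autoinc(v, 50, false);  assert(v == 100); // no backward move
  set_autoinc(v, 150, false); assert(v == 150); // forward move
  set_autoinc(v, 50, true);   assert(v == 50);  // explicit reset
}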
+/** The page infimum and supremum of an empty page in ROW_FORMAT=REDUNDANT */
+static const byte infimum_supremum_redundant[] = {
+ /* the infimum record */
+ 0x08/*end offset*/,
+ 0x01/*n_owned*/,
+ 0x00, 0x00/*heap_no=0*/,
+ 0x03/*n_fields=1, 1-byte offsets*/,
+ 0x00, 0x74/* pointer to supremum */,
+ 'i', 'n', 'f', 'i', 'm', 'u', 'm', 0,
+ /* the supremum record */
+ 0x09/*end offset*/,
+ 0x01/*n_owned*/,
+ 0x00, 0x08/*heap_no=1*/,
+ 0x03/*n_fields=1, 1-byte offsets*/,
+ 0x00, 0x00/* end of record list */,
+ 's', 'u', 'p', 'r', 'e', 'm', 'u', 'm', 0
+};
+
+/** The page infimum and supremum of an empty page in ROW_FORMAT=COMPACT */
+static const byte infimum_supremum_compact[] = {
+ /* the infimum record */
+ 0x01/*n_owned=1*/,
+ 0x00, 0x02/* heap_no=0, REC_STATUS_INFIMUM */,
+ 0x00, 0x0d/* pointer to supremum */,
+ 'i', 'n', 'f', 'i', 'm', 'u', 'm', 0,
+ /* the supremum record */
+ 0x01/*n_owned=1*/,
+ 0x00, 0x0b/* heap_no=1, REC_STATUS_SUPREMUM */,
+ 0x00, 0x00/* end of record list */,
+ 's', 'u', 'p', 'r', 'e', 'm', 'u', 'm'
+};
+
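The two byte arrays above also illustrate the two next-pointer conventions: ROW_FORMAT=REDUNDANT stores an absolute page offset (0x0074 = 116, the supremum origin), while COMPACT stores an offset relative to the current record (0x000d, and infimum origin 99 + 13 = 112, the supremum origin). A self-contained check, with the standard offsets restated as constants:

#include <cassert>
#include <cstdint>

const uint16_t PAGE_OLD_INFIMUM = 101, PAGE_OLD_SUPREMUM = 116;
const uint16_t PAGE_NEW_INFIMUM = 99, PAGE_NEW_SUPREMUM = 112;

uint16_t next_old(uint16_t /*rec*/, uint16_t stored) { return stored; }
uint16_t next_new(uint16_t rec, uint16_t stored)
{
  return uint16_t(rec + stored); // wraps modulo 64 KiB, like on the page
}

int main()
{
  assert(next_old(PAGE_OLD_INFIMUM, 0x0074) == PAGE_OLD_SUPREMUM);
  assert(next_new(PAGE_NEW_INFIMUM, 0x000d) == PAGE_NEW_SUPREMUM);
}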
+/** Create an index page.
+@param[in,out] block buffer block
+@param[in] comp nonzero=compact page format */
+void page_create_low(const buf_block_t* block, bool comp)
+{
+ page_t* page;
+
+ compile_time_assert(PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE
+ <= PAGE_DATA);
+ compile_time_assert(PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE
+ <= PAGE_DATA);
+
+ page = buf_block_get_frame(block);
+
+ fil_page_set_type(page, FIL_PAGE_INDEX);
+
+ memset(page + PAGE_HEADER, 0, PAGE_HEADER_PRIV_END);
+ page[PAGE_HEADER + PAGE_N_DIR_SLOTS + 1] = 2;
+ page[PAGE_HEADER + PAGE_INSTANT] = 0;
+ page[PAGE_HEADER + PAGE_DIRECTION_B] = PAGE_NO_DIRECTION;
+
+ if (comp) {
+ page[PAGE_HEADER + PAGE_N_HEAP] = 0x80;/*page_is_comp()*/
+ page[PAGE_HEADER + PAGE_N_HEAP + 1] = PAGE_HEAP_NO_USER_LOW;
+ page[PAGE_HEADER + PAGE_HEAP_TOP + 1] = PAGE_NEW_SUPREMUM_END;
+ memcpy(page + PAGE_DATA, infimum_supremum_compact,
+ sizeof infimum_supremum_compact);
+ memset(page
+ + PAGE_NEW_SUPREMUM_END, 0,
+ srv_page_size - PAGE_DIR - PAGE_NEW_SUPREMUM_END);
+ page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE * 2 + 1]
+ = PAGE_NEW_SUPREMUM;
+ page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE + 1]
+ = PAGE_NEW_INFIMUM;
+ } else {
+ page[PAGE_HEADER + PAGE_N_HEAP + 1] = PAGE_HEAP_NO_USER_LOW;
+ page[PAGE_HEADER + PAGE_HEAP_TOP + 1] = PAGE_OLD_SUPREMUM_END;
+ memcpy(page + PAGE_DATA, infimum_supremum_redundant,
+ sizeof infimum_supremum_redundant);
+ memset(page
+ + PAGE_OLD_SUPREMUM_END, 0,
+ srv_page_size - PAGE_DIR - PAGE_OLD_SUPREMUM_END);
+ page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE * 2 + 1]
+ = PAGE_OLD_SUPREMUM;
+ page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE + 1]
+ = PAGE_OLD_INFIMUM;
+ }
+}
+
+/** Create an uncompressed index page.
+@param[in,out] block buffer block
+@param[in,out] mtr mini-transaction
+@param[in] comp set unless ROW_FORMAT=REDUNDANT */
+void page_create(buf_block_t *block, mtr_t *mtr, bool comp)
+{
+ mtr->page_create(*block, comp);
+ buf_block_modify_clock_inc(block);
+ page_create_low(block, comp);
+}
+
+/**********************************************************//**
+Create a compressed B-tree index page. */
+void
+page_create_zip(
+/*============*/
+ buf_block_t* block, /*!< in/out: a buffer frame
+ where the page is created */
+ dict_index_t* index, /*!< in: the index of the
+ page */
+ ulint level, /*!< in: the B-tree level
+ of the page */
+ trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */
+ mtr_t* mtr) /*!< in/out: mini-transaction
+ handle */
+{
+ ut_ad(block);
+ ut_ad(buf_block_get_page_zip(block));
+ ut_ad(dict_table_is_comp(index->table));
+
+ /* PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC are always 0 for
+ temporary tables. */
+ ut_ad(max_trx_id == 0 || !index->table->is_temporary());
+ /* In secondary indexes and the change buffer, PAGE_MAX_TRX_ID
+ must be zero on non-leaf pages. max_trx_id can be 0 when the
+ index consists of an empty root (leaf) page. */
+ ut_ad(max_trx_id == 0
+ || level == 0
+ || !dict_index_is_sec_or_ibuf(index)
+ || index->table->is_temporary());
+ /* In the clustered index, PAGE_ROOT_AUTOINC or
+ PAGE_MAX_TRX_ID must be 0 on other pages than the root. */
+ ut_ad(level == 0 || max_trx_id == 0
+ || !dict_index_is_sec_or_ibuf(index)
+ || index->table->is_temporary());
+
+ buf_block_modify_clock_inc(block);
+ page_create_low(block, true);
+
+ if (index->is_spatial()) {
+ mach_write_to_2(FIL_PAGE_TYPE + block->frame, FIL_PAGE_RTREE);
+ memset(block->frame + FIL_RTREE_SPLIT_SEQ_NUM, 0, 8);
+ memset(block->page.zip.data + FIL_RTREE_SPLIT_SEQ_NUM, 0, 8);
+ }
+
+ mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + block->frame, level);
+ mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + block->frame,
+ max_trx_id);
+
+ if (!page_zip_compress(block, index, page_zip_level, mtr)) {
+ /* The compression of a newly created
+ page should always succeed. */
+ ut_error;
+ }
+}
+
+/**********************************************************//**
+Empty a previously created B-tree index page. */
+void
+page_create_empty(
+/*==============*/
+ buf_block_t* block, /*!< in/out: B-tree block */
+ dict_index_t* index, /*!< in: the index of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ trx_id_t max_trx_id;
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+
+ ut_ad(fil_page_index_page_check(block->frame));
+ ut_ad(!index->is_dummy);
+ ut_ad(block->page.id().space() == index->table->space->id);
+
+	/* Multiple transactions cannot simultaneously operate on the
+	same temp-table.
+	max_trx_id is ignored for temp tables because it is not required
+	for MVCC. */
+ if (dict_index_is_sec_or_ibuf(index)
+ && !index->table->is_temporary()
+ && page_is_leaf(block->frame)) {
+ max_trx_id = page_get_max_trx_id(block->frame);
+ ut_ad(max_trx_id);
+ } else if (block->page.id().page_no() == index->page) {
+ /* Preserve PAGE_ROOT_AUTO_INC. */
+ max_trx_id = page_get_max_trx_id(block->frame);
+ } else {
+ max_trx_id = 0;
+ }
+
+ if (page_zip) {
+ ut_ad(!index->table->is_temporary());
+ page_create_zip(block, index,
+ page_header_get_field(block->frame,
+ PAGE_LEVEL),
+ max_trx_id, mtr);
+ } else {
+ page_create(block, mtr, index->table->not_redundant());
+ if (index->is_spatial()) {
+ static_assert(((FIL_PAGE_INDEX & 0xff00)
+ | byte(FIL_PAGE_RTREE))
+ == FIL_PAGE_RTREE, "compatibility");
+ mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+ byte(FIL_PAGE_RTREE));
+ if (mach_read_from_8(block->frame
+ + FIL_RTREE_SPLIT_SEQ_NUM)) {
+ mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
+ 8, 0);
+ }
+ }
+
+ if (max_trx_id) {
+ mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID
+ + block->frame, max_trx_id);
+ }
+ }
+}
+
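The branching in page_create_empty() decides which 8-byte header value survives emptying the page. A toy sketch of that decision, under the assumption that the three cases map as commented (illustrative flags, not the dict_index_t API):

#include <cassert>
#include <cstdint>

uint64_t preserved_trx_id(bool sec_or_ibuf, bool temporary, bool leaf,
                          bool is_root, uint64_t current)
{
  if (sec_or_ibuf && !temporary && leaf)
    return current; // PAGE_MAX_TRX_ID must survive for MVCC
  if (is_root)
    return current; // PAGE_ROOT_AUTO_INC must survive
  return 0;         // otherwise the field is reset
}

int main()
{
  assert(preserved_trx_id(true, false, true, false, 42) == 42);
  assert(preserved_trx_id(false, false, true, true, 7) == 7);
  assert(preserved_trx_id(true, true, true, false, 9) == 0);
}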
+/*************************************************************//**
+Differs from page_copy_rec_list_end, because this function does not
+touch the lock table and max trx id on page or compress the page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
+void
+page_copy_rec_list_end_no_locks(
+/*============================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ page_cur_t cur1;
+ page_cur_t cur2;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ page_cur_position(rec, block, &cur1);
+
+ if (page_cur_is_before_first(&cur1)) {
+
+ page_cur_move_to_next(&cur1);
+ }
+
+ btr_assert_not_corrupted(new_block, index);
+ ut_a(page_is_comp(new_page) == page_rec_is_comp(rec));
+ ut_a(mach_read_from_2(new_page + srv_page_size - 10) == (ulint)
+ (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM));
+ const ulint n_core = page_is_leaf(block->frame)
+ ? index->n_core_fields : 0;
+
+ page_cur_set_before_first(new_block, &cur2);
+
+ /* Copy records from the original page to the new page */
+
+ while (!page_cur_is_after_last(&cur1)) {
+ rec_t* ins_rec;
+ offsets = rec_get_offsets(cur1.rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ ins_rec = page_cur_insert_rec_low(&cur2, index,
+ cur1.rec, offsets, mtr);
+ if (UNIV_UNLIKELY(!ins_rec)) {
+ ib::fatal() << "Rec offset " << page_offset(rec)
+ << ", cur1 offset " << page_offset(cur1.rec)
+ << ", cur2 offset " << page_offset(cur2.rec);
+ }
+
+ page_cur_move_to_next(&cur1);
+ ut_ad(!(rec_get_info_bits(cur1.rec, page_is_comp(new_page))
+ & REC_INFO_MIN_REC_FLAG));
+ cur2.rec = ins_rec;
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/*************************************************************//**
+Copies records from page to new_page, from a given record onward,
+including that record. Infimum and supremum records are not copied.
+The records are copied to the start of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the original successor of the infimum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+rec_t*
+page_copy_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block);
+ page_t* page = block->frame;
+ rec_t* ret = page_rec_get_next(
+ page_get_infimum_rec(new_page));
+ ulint num_moved = 0;
+ rtr_rec_move_t* rec_move = NULL;
+ mem_heap_t* heap = NULL;
+ ut_ad(page_align(rec) == page);
+
+#ifdef UNIV_ZIP_DEBUG
+ if (new_page_zip) {
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+ ut_a(page_zip);
+
+ /* Strict page_zip_validate() may fail here.
+ Furthermore, btr_compress() may set FIL_PAGE_PREV to
+ FIL_NULL on new_page while leaving it intact on
+ new_page_zip. So, we cannot validate new_page_zip. */
+ ut_a(page_zip_validate_low(page_zip, page, index, TRUE));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+ ut_ad(buf_block_get_frame(block) == page);
+ ut_ad(page_is_leaf(page) == page_is_leaf(new_page));
+ ut_ad(page_is_comp(page) == page_is_comp(new_page));
+ /* Here, "ret" may be pointing to a user record or the
+ predefined supremum record. */
+
+ const mtr_log_t log_mode = new_page_zip
+ ? mtr->set_log_mode(MTR_LOG_NONE) : MTR_LOG_NONE;
+ const bool was_empty = page_dir_get_n_heap(new_page)
+ == PAGE_HEAP_NO_USER_LOW;
+ alignas(2) byte h[PAGE_N_DIRECTION + 2 - PAGE_LAST_INSERT];
+ memcpy_aligned<2>(h, PAGE_HEADER + PAGE_LAST_INSERT + new_page,
+ sizeof h);
+
+ if (index->is_spatial()) {
+ ulint max_to_move = page_get_n_recs(
+ buf_block_get_frame(block));
+ heap = mem_heap_create(256);
+
+ rec_move = static_cast<rtr_rec_move_t*>(
+ mem_heap_alloc(heap, max_to_move * sizeof *rec_move));
+
+ /* For spatial index, we need to insert recs one by one
+ to keep recs ordered. */
+ rtr_page_copy_rec_list_end_no_locks(new_block,
+ block, rec, index,
+ heap, rec_move,
+ max_to_move,
+ &num_moved,
+ mtr);
+ } else {
+ page_copy_rec_list_end_no_locks(new_block, block, rec,
+ index, mtr);
+ if (was_empty) {
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*new_block, PAGE_HEADER
+ + PAGE_LAST_INSERT
+ + new_page, h, sizeof h);
+ }
+ }
+
+ /* Update PAGE_MAX_TRX_ID on the uncompressed page.
+ Modifications will be redo logged and copied to the compressed
+ page in page_zip_compress() or page_zip_reorganize() below.
+	Multiple transactions cannot simultaneously operate on the
+	same temp-table.
+	max_trx_id is ignored for temp tables because it is not required
+	for MVCC. */
+ if (dict_index_is_sec_or_ibuf(index)
+ && page_is_leaf(page)
+ && !index->table->is_temporary()) {
+ ut_ad(!was_empty || page_dir_get_n_heap(new_page)
+ == PAGE_HEAP_NO_USER_LOW
+ + page_header_get_field(new_page, PAGE_N_RECS));
+ page_update_max_trx_id(new_block, NULL,
+ page_get_max_trx_id(page), mtr);
+ }
+
+ if (new_page_zip) {
+ mtr_set_log_mode(mtr, log_mode);
+
+ if (!page_zip_compress(new_block, index,
+ page_zip_level, mtr)) {
+ /* Before trying to reorganize the page,
+ store the number of preceding records on the page. */
+ ulint ret_pos
+ = page_rec_get_n_recs_before(ret);
+ /* Before copying, "ret" was the successor of
+ the predefined infimum record. It must still
+ have at least one predecessor (the predefined
+ infimum record, or a freshly copied record
+ that is smaller than "ret"). */
+ ut_a(ret_pos > 0);
+
+ if (!page_zip_reorganize(new_block, index,
+ page_zip_level, mtr)) {
+
+ if (!page_zip_decompress(new_page_zip,
+ new_page, FALSE)) {
+ ut_error;
+ }
+ ut_ad(page_validate(new_page, index));
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ return(NULL);
+ } else {
+ /* The page was reorganized:
+ Seek to ret_pos. */
+ ret = page_rec_get_nth(new_page, ret_pos);
+ }
+ }
+ }
+
+ /* Update the lock table and possible hash index */
+
+ if (dict_table_is_locking_disabled(index->table)) {
+ } else if (rec_move && dict_index_is_spatial(index)) {
+ lock_rtr_move_rec_list(new_block, block, rec_move, num_moved);
+ } else {
+ lock_move_rec_list_end(new_block, block, rec);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ btr_search_move_or_delete_hash_entries(new_block, block);
+
+ return(ret);
+}
+
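The ret_pos handling above is a general pattern: before a reorganization invalidates all record pointers, remember how many records precede the cursor, then re-seek to the n-th record afterwards. A sketch with std::list standing in for the record list:

#include <cassert>
#include <cstddef>
#include <iterator>
#include <list>

int main()
{
  std::list<int> recs = {10, 20, 30, 40};
  auto ret = std::next(recs.begin(), 2); // cursor at 30
  auto ret_pos = std::distance(recs.begin(), ret);

  // "reorganize": rebuild the list; old iterators become invalid
  std::list<int> rebuilt(recs.begin(), recs.end());
  recs.swap(rebuilt);

  ret = std::next(recs.begin(), ret_pos); // seek back to ret_pos
  assert(*ret == 30);
}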
+/*************************************************************//**
+Copies records from page to new_page, up to the given record,
+NOT including that record. Infimum and supremum records are not copied.
+The records are copied to the end of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the original predecessor of the supremum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+rec_t*
+page_copy_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(page_align(rec) == block->frame);
+
+ page_t* new_page = buf_block_get_frame(new_block);
+ page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block);
+ page_cur_t cur1;
+ page_cur_t cur2;
+ mem_heap_t* heap = NULL;
+ ulint num_moved = 0;
+ rtr_rec_move_t* rec_move = NULL;
+ rec_t* ret
+ = page_rec_get_prev(page_get_supremum_rec(new_page));
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ /* Here, "ret" may be pointing to a user record or the
+ predefined infimum record. */
+
+ if (page_rec_is_infimum(rec)) {
+ return(ret);
+ }
+
+ mtr_log_t log_mode = MTR_LOG_NONE;
+
+ if (new_page_zip) {
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+ }
+
+ page_cur_set_before_first(block, &cur1);
+ page_cur_move_to_next(&cur1);
+
+ page_cur_position(ret, new_block, &cur2);
+
+ const ulint n_core = page_rec_is_leaf(rec) ? index->n_core_fields : 0;
+
+ /* Copy records from the original page to the new page */
+ if (index->is_spatial()) {
+ ut_ad(!index->is_instant());
+ ulint max_to_move = page_get_n_recs(
+ buf_block_get_frame(block));
+ heap = mem_heap_create(256);
+
+ rec_move = static_cast<rtr_rec_move_t*>(mem_heap_alloc(
+ heap,
+ sizeof (*rec_move) * max_to_move));
+
+ /* For spatial index, we need to insert recs one by one
+ to keep recs ordered. */
+ rtr_page_copy_rec_list_start_no_locks(new_block,
+ block, rec, index, heap,
+ rec_move, max_to_move,
+ &num_moved, mtr);
+ } else {
+ while (page_cur_get_rec(&cur1) != rec) {
+ offsets = rec_get_offsets(cur1.rec, index, offsets,
+ n_core,
+ ULINT_UNDEFINED, &heap);
+ cur2.rec = page_cur_insert_rec_low(&cur2, index,
+ cur1.rec, offsets,
+ mtr);
+ ut_a(cur2.rec);
+
+ page_cur_move_to_next(&cur1);
+ ut_ad(!(rec_get_info_bits(cur1.rec,
+ page_is_comp(new_page))
+ & REC_INFO_MIN_REC_FLAG));
+ }
+ }
+
+ /* Update PAGE_MAX_TRX_ID on the uncompressed page.
+ Modifications will be redo logged and copied to the compressed
+ page in page_zip_compress() or page_zip_reorganize() below.
+	Multiple transactions cannot simultaneously operate on the
+	same temp-table.
+	max_trx_id is ignored for temp tables because it is not required
+	for MVCC. */
+ if (n_core && dict_index_is_sec_or_ibuf(index)
+ && !index->table->is_temporary()) {
+ page_update_max_trx_id(new_block,
+ new_page_zip,
+ page_get_max_trx_id(block->frame),
+ mtr);
+ }
+
+ if (new_page_zip) {
+ mtr_set_log_mode(mtr, log_mode);
+
+ DBUG_EXECUTE_IF("page_copy_rec_list_start_compress_fail",
+ goto zip_reorganize;);
+
+ if (!page_zip_compress(new_block, index,
+ page_zip_level, mtr)) {
+ ulint ret_pos;
+#ifndef DBUG_OFF
+zip_reorganize:
+#endif /* DBUG_OFF */
+ /* Before trying to reorganize the page,
+ store the number of preceding records on the page. */
+ ret_pos = page_rec_get_n_recs_before(ret);
+ /* Before copying, "ret" was the predecessor
+ of the predefined supremum record. If it was
+ the predefined infimum record, then it would
+ still be the infimum, and we would have
+ ret_pos == 0. */
+
+ if (UNIV_UNLIKELY
+ (!page_zip_reorganize(new_block, index,
+ page_zip_level, mtr))) {
+
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress(new_page_zip,
+ new_page, FALSE))) {
+ ut_error;
+ }
+ ut_ad(page_validate(new_page, index));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(NULL);
+ }
+
+ /* The page was reorganized: Seek to ret_pos. */
+ ret = page_rec_get_nth(new_page, ret_pos);
+ }
+ }
+
+ /* Update the lock table and possible hash index */
+
+ if (dict_table_is_locking_disabled(index->table)) {
+ } else if (dict_index_is_spatial(index)) {
+ lock_rtr_move_rec_list(new_block, block, rec_move, num_moved);
+ } else {
+ lock_move_rec_list_start(new_block, block, rec, ret);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ btr_search_move_or_delete_hash_entries(new_block, block);
+
+ return(ret);
+}
+
+/*************************************************************//**
+Deletes records from a page from a given record onward, including that record.
+The infimum and supremum records are not deleted. */
+void
+page_delete_rec_list_end(
+/*=====================*/
+ rec_t* rec, /*!< in: pointer to record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n_recs, /*!< in: number of records to delete,
+ or ULINT_UNDEFINED if not known */
+ ulint size, /*!< in: the sum of the sizes of the
+ records in the end of the chain to
+ delete, or ULINT_UNDEFINED if not known */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(size == ULINT_UNDEFINED || size < srv_page_size);
+ ut_ad(page_align(rec) == block->frame);
+ ut_ad(index->table->not_redundant() == !!page_is_comp(block->frame));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!block->page.zip.data ||
+ page_zip_validate(&block->page.zip, block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_rec_is_supremum(rec))
+ {
+ ut_ad(n_recs == 0 || n_recs == ULINT_UNDEFINED);
+ /* Nothing to do, there are no records bigger than the page supremum. */
+ return;
+ }
+
+ if (page_rec_is_infimum(rec) || n_recs == page_get_n_recs(block->frame) ||
+ rec == (page_is_comp(block->frame)
+ ? page_rec_get_next_low(block->frame + PAGE_NEW_INFIMUM, 1)
+ : page_rec_get_next_low(block->frame + PAGE_OLD_INFIMUM, 0)))
+ {
+ /* We are deleting all records. */
+ page_create_empty(block, index, mtr);
+ return;
+ }
+
+#if 0 // FIXME: consider deleting the last record as a special case
+ if (page_rec_is_last(rec))
+ {
+ page_cur_t cursor= { index, rec, offsets, block };
+ page_cur_delete_rec(&cursor, index, offsets, mtr);
+ return;
+ }
+#endif
+
+ /* The page becomes invalid for optimistic searches */
+ buf_block_modify_clock_inc(block);
+
+ const ulint n_core= page_is_leaf(block->frame) ? index->n_core_fields : 0;
+ mem_heap_t *heap= nullptr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ rec_offs_init(offsets_);
+
+#if 1 // FIXME: remove this, and write a minimal amount of log!
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ {
+ ut_ad(page_is_comp(block->frame));
+ do
+ {
+ page_cur_t cur;
+ page_cur_position(rec, block, &cur);
+ offsets= rec_get_offsets(rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ rec= rec_get_next_ptr(rec, TRUE);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(&block->page.zip, block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cur_delete_rec(&cur, index, offsets, mtr);
+ }
+ while (page_offset(rec) != PAGE_NEW_SUPREMUM);
+
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ return;
+ }
+#endif
+
+ byte *prev_rec= page_rec_get_prev(rec);
+ byte *last_rec= page_rec_get_prev(page_get_supremum_rec(block->frame));
+
+ // FIXME: consider a special case of shrinking PAGE_HEAP_TOP
+
+ const bool scrub= srv_immediate_scrub_data_uncompressed;
+ if (scrub || size == ULINT_UNDEFINED || n_recs == ULINT_UNDEFINED)
+ {
+ rec_t *rec2= rec;
+ /* Calculate the sum of sizes and the number of records */
+ size= 0;
+ n_recs= 0;
+
+ do
+ {
+ offsets = rec_get_offsets(rec2, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ ulint s= rec_offs_size(offsets);
+ ut_ad(ulint(rec2 - block->frame) + s - rec_offs_extra_size(offsets) <
+ srv_page_size);
+ ut_ad(size + s < srv_page_size);
+ size+= s;
+ n_recs++;
+
+ if (scrub)
+ mtr->memset(block, page_offset(rec2), rec_offs_data_size(offsets), 0);
+
+ rec2 = page_rec_get_next(rec2);
+ }
+ while (!page_rec_is_supremum(rec2));
+
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ }
+
+ ut_ad(size < srv_page_size);
+
+ ulint slot_index, n_owned;
+ {
+ const rec_t *owner_rec= rec;
+ ulint count= 0;
+
+ if (page_is_comp(block->frame))
+ while (!(n_owned= rec_get_n_owned_new(owner_rec)))
+ {
+ count++;
+ owner_rec= rec_get_next_ptr_const(owner_rec, TRUE);
+ }
+ else
+ while (!(n_owned= rec_get_n_owned_old(owner_rec)))
+ {
+ count++;
+ owner_rec= rec_get_next_ptr_const(owner_rec, FALSE);
+ }
+
+ ut_ad(n_owned > count);
+ n_owned-= count;
+ slot_index= page_dir_find_owner_slot(owner_rec);
+ ut_ad(slot_index > 0);
+ }
+
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block, my_assume_aligned<2>
+ (PAGE_N_DIR_SLOTS + PAGE_HEADER +
+ block->frame), slot_index + 1);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block, my_assume_aligned<2>
+ (PAGE_LAST_INSERT + PAGE_HEADER +
+ block->frame), 0U);
+ /* Catenate the deleted chain segment to the page free list */
+ alignas(4) byte page_header[4];
+ byte *page_free= my_assume_aligned<4>(PAGE_HEADER + PAGE_FREE +
+ block->frame);
+ const uint16_t free= page_header_get_field(block->frame, PAGE_FREE);
+ static_assert(PAGE_FREE + 2 == PAGE_GARBAGE, "compatibility");
+
+ mach_write_to_2(page_header, page_offset(rec));
+ mach_write_to_2(my_assume_aligned<2>(page_header + 2),
+ mach_read_from_2(my_assume_aligned<2>(page_free + 2)) +
+ size);
+ mtr->memcpy(*block, page_free, page_header, 4);
+
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ block->frame);
+ mtr->write<2>(*block, page_n_recs,
+ ulint{mach_read_from_2(page_n_recs)} - n_recs);
+
+ /* Update the page directory; there is no need to balance the number
+ of the records owned by the supremum record, as it is allowed to be
+ less than PAGE_DIR_SLOT_MIN_N_OWNED */
+ page_dir_slot_t *slot= page_dir_get_nth_slot(block->frame, slot_index);
+
+ if (page_is_comp(block->frame))
+ {
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block, slot, PAGE_NEW_SUPREMUM);
+ byte *owned= PAGE_NEW_SUPREMUM - REC_NEW_N_OWNED + block->frame;
+ byte new_owned= static_cast<byte>((*owned & ~REC_N_OWNED_MASK) |
+ n_owned << REC_N_OWNED_SHIFT);
+#if 0 // FIXME: implement minimal logging for ROW_FORMAT=COMPRESSED
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ {
+ *owned= new_owned;
+ memcpy_aligned<2>(PAGE_N_DIR_SLOTS + PAGE_HEADER + block->page.zip.data,
+ PAGE_N_DIR_SLOTS + PAGE_HEADER + block->frame,
+ PAGE_N_RECS + 2 - PAGE_N_DIR_SLOTS);
+ // TODO: the equivalent of page_zip_dir_delete() for all records
+ mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>
+ (PAGE_NEW_SUPREMUM - page_offset(prev_rec)));
+ mach_write_to_2(last_rec - REC_NEXT, free
+ ? static_cast<uint16_t>(free - page_offset(last_rec))
+ : 0U);
+ return;
+ }
+#endif
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, owned, new_owned);
+ mtr->write<2>(*block, prev_rec - REC_NEXT, static_cast<uint16_t>
+ (PAGE_NEW_SUPREMUM - page_offset(prev_rec)));
+ mtr->write<2>(*block, last_rec - REC_NEXT, free
+ ? static_cast<uint16_t>(free - page_offset(last_rec))
+ : 0U);
+ }
+ else
+ {
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block, slot, PAGE_OLD_SUPREMUM);
+ byte *owned= PAGE_OLD_SUPREMUM - REC_OLD_N_OWNED + block->frame;
+ byte new_owned= static_cast<byte>((*owned & ~REC_N_OWNED_MASK) |
+ n_owned << REC_N_OWNED_SHIFT);
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, owned, new_owned);
+ mtr->write<2>(*block, prev_rec - REC_NEXT, PAGE_OLD_SUPREMUM);
+ mtr->write<2>(*block, last_rec - REC_NEXT, free);
+ }
+}
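+
+/* Summary of the fast path above for uncompressed pages: rather than
+deleting records one at a time, the whole chain is detached in one step:
+the page directory is truncated at the slot owning the first deleted
+record, the chain is prepended to the PAGE_FREE list while its size is
+added to PAGE_GARBAGE, PAGE_N_RECS is decremented, and the predecessor
+of the first deleted record is linked directly to the supremum. */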
+
+/*************************************************************//**
+Deletes records from page, up to the given record, NOT including
+that record. Infimum and supremum records are not deleted. */
+void
+page_delete_rec_list_start(
+/*=======================*/
+ rec_t* rec, /*!< in: record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t cur1;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ mem_heap_t* heap = NULL;
+
+ rec_offs_init(offsets_);
+
+ ut_ad(page_align(rec) == block->frame);
+ ut_ad((ibool) !!page_rec_is_comp(rec)
+ == dict_table_is_comp(index->table));
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+ page_t* page = buf_block_get_frame(block);
+
+ /* page_zip_validate() would detect a min_rec_mark mismatch
+ in btr_page_split_and_insert()
+ between btr_attach_half_pages() and insert_page = ...
+ when btr_page_get_split_rec_to_left() holds
+ (direction == FSP_DOWN). */
+ ut_a(!page_zip
+ || page_zip_validate_low(page_zip, page, index, TRUE));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_rec_is_infimum(rec)) {
+ return;
+ }
+
+ if (page_rec_is_supremum(rec)) {
+ /* We are deleting all records. */
+ page_create_empty(block, index, mtr);
+ return;
+ }
+
+ page_cur_set_before_first(block, &cur1);
+ page_cur_move_to_next(&cur1);
+
+ const ulint n_core = page_rec_is_leaf(rec)
+ ? index->n_core_fields : 0;
+
+ while (page_cur_get_rec(&cur1) != rec) {
+ offsets = rec_get_offsets(page_cur_get_rec(&cur1), index,
+ offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ page_cur_delete_rec(&cur1, index, offsets, mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/*************************************************************//**
+Moves record list end to another page. Moved records include
+split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return TRUE on success; FALSE on compression failure (new_block will
+be decompressed) */
+ibool
+page_move_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in: index page from where to move */
+ rec_t* split_rec, /*!< in: first record to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ ulint old_data_size;
+ ulint new_data_size;
+ ulint old_n_recs;
+ ulint new_n_recs;
+
+ ut_ad(!dict_index_is_spatial(index));
+
+ old_data_size = page_get_data_size(new_page);
+ old_n_recs = page_get_n_recs(new_page);
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_zip_des_t* new_page_zip
+ = buf_block_get_page_zip(new_block);
+ page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(!new_page_zip == !page_zip);
+ ut_a(!new_page_zip
+ || page_zip_validate(new_page_zip, new_page, index));
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, page_align(split_rec),
+ index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (UNIV_UNLIKELY(!page_copy_rec_list_end(new_block, block,
+ split_rec, index, mtr))) {
+ return(FALSE);
+ }
+
+ new_data_size = page_get_data_size(new_page);
+ new_n_recs = page_get_n_recs(new_page);
+
+ ut_ad(new_data_size >= old_data_size);
+
+ page_delete_rec_list_end(split_rec, block, index,
+ new_n_recs - old_n_recs,
+ new_data_size - old_data_size, mtr);
+
+ return(TRUE);
+}
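+
+#if 0 /* Illustrative sketch, not part of the source: one way a caller
+could honour the IBUF_BITMAP_FREE contract documented above; the
+function name is hypothetical. */
+static ibool page_move_rec_list_end_and_reset_ibuf(
+	buf_block_t*	new_block,
+	buf_block_t*	block,
+	rec_t*		split_rec,
+	dict_index_t*	index,
+	mtr_t*		mtr)
+{
+	if (!page_move_rec_list_end(new_block, block, split_rec,
+				    index, mtr)) {
+		return(FALSE);
+	}
+
+	if (new_block->page.zip.data
+	    && page_is_leaf(new_block->frame)
+	    && !dict_index_is_clust(index)) {
+		/* Make the insert buffer free-space estimate safe
+		before the mini-transaction commits. */
+		ibuf_reset_free_bits(new_block);
+	}
+
+	return(TRUE);
+}
+#endif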
+
+/*************************************************************//**
+Moves record list start to another page. Moved records do not include
+split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return TRUE on success; FALSE on compression failure */
+ibool
+page_move_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in/out: page containing split_rec */
+ rec_t* split_rec, /*!< in: first record not to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (UNIV_UNLIKELY(!page_copy_rec_list_start(new_block, block,
+ split_rec, index, mtr))) {
+ return(FALSE);
+ }
+
+ page_delete_rec_list_start(split_rec, block, index, mtr);
+
+ return(TRUE);
+}
+
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record */
+const rec_t*
+page_rec_get_nth_const(
+/*===================*/
+ const page_t* page, /*!< in: page */
+ ulint nth) /*!< in: nth record */
+{
+ const page_dir_slot_t* slot;
+ ulint i;
+ ulint n_owned;
+ const rec_t* rec;
+
+ if (nth == 0) {
+ return(page_get_infimum_rec(page));
+ }
+
+ ut_ad(nth < srv_page_size / (REC_N_NEW_EXTRA_BYTES + 1));
+
+ for (i = 0;; i++) {
+
+ slot = page_dir_get_nth_slot(page, i);
+ n_owned = page_dir_slot_get_n_owned(slot);
+
+ if (n_owned > nth) {
+ break;
+ } else {
+ nth -= n_owned;
+ }
+ }
+
+ ut_ad(i > 0);
+ slot = page_dir_get_nth_slot(page, i - 1);
+ rec = page_dir_slot_get_rec(slot);
+
+ if (page_is_comp(page)) {
+ do {
+ rec = page_rec_get_next_low(rec, TRUE);
+ ut_ad(rec);
+ } while (nth--);
+ } else {
+ do {
+ rec = page_rec_get_next_low(rec, FALSE);
+ ut_ad(rec);
+ } while (nth--);
+ }
+
+ return(rec);
+}
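+
+#if 0 /* Illustrative sketch, not part of the source: the inverse
+relationship with page_rec_get_n_recs_before() stated above, written
+as a debug check. */
+static void page_rec_get_nth_check(const page_t* page, const rec_t* rec)
+{
+	ut_ad(page_rec_get_nth_const(page, page_rec_get_n_recs_before(rec))
+	      == rec);
+}
+#endif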
+
+/***************************************************************//**
+Returns the number of records before the given record in chain.
+The number includes infimum and supremum records.
+@return number of records */
+ulint
+page_rec_get_n_recs_before(
+/*=======================*/
+ const rec_t* rec) /*!< in: the physical record */
+{
+ const page_dir_slot_t* slot;
+ const rec_t* slot_rec;
+ const page_t* page;
+ ulint i;
+ lint n = 0;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
+ if (page_is_comp(page)) {
+ while (rec_get_n_owned_new(rec) == 0) {
+
+ rec = rec_get_next_ptr_const(rec, TRUE);
+ n--;
+ }
+
+ for (i = 0; ; i++) {
+ slot = page_dir_get_nth_slot(page, i);
+ slot_rec = page_dir_slot_get_rec(slot);
+
+ n += lint(rec_get_n_owned_new(slot_rec));
+
+ if (rec == slot_rec) {
+
+ break;
+ }
+ }
+ } else {
+ while (rec_get_n_owned_old(rec) == 0) {
+
+ rec = rec_get_next_ptr_const(rec, FALSE);
+ n--;
+ }
+
+ for (i = 0; ; i++) {
+ slot = page_dir_get_nth_slot(page, i);
+ slot_rec = page_dir_slot_get_rec(slot);
+
+ n += lint(rec_get_n_owned_old(slot_rec));
+
+ if (rec == slot_rec) {
+
+ break;
+ }
+ }
+ }
+
+ n--;
+
+ ut_ad(n >= 0);
+ ut_ad((ulong) n < srv_page_size / (REC_N_NEW_EXTRA_BYTES + 1));
+
+ return((ulint) n);
+}
+
+/************************************************************//**
+Prints record contents including the data relevant only in
+the index page context. */
+void
+page_rec_print(
+/*===========*/
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: record descriptor */
+{
+ ut_a(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+ rec_print_new(stderr, rec, offsets);
+ if (page_rec_is_comp(rec)) {
+ ib::info() << "n_owned: " << rec_get_n_owned_new(rec)
+ << "; heap_no: " << rec_get_heap_no_new(rec)
+ << "; next rec: " << rec_get_next_offs(rec, TRUE);
+ } else {
+ ib::info() << "n_owned: " << rec_get_n_owned_old(rec)
+ << "; heap_no: " << rec_get_heap_no_old(rec)
+ << "; next rec: " << rec_get_next_offs(rec, FALSE);
+ }
+
+ page_rec_check(rec);
+ rec_validate(rec, offsets);
+}
+
+#ifdef UNIV_BTR_PRINT
+/***************************************************************//**
+This is used to print the contents of the directory for
+debugging purposes. */
+void
+page_dir_print(
+/*===========*/
+ page_t* page, /*!< in: index page */
+ ulint pr_n) /*!< in: print n first and n last entries */
+{
+ ulint n;
+ ulint i;
+ page_dir_slot_t* slot;
+
+ n = page_dir_get_n_slots(page);
+
+ fprintf(stderr, "--------------------------------\n"
+ "PAGE DIRECTORY\n"
+ "Page address %p\n"
+ "Directory stack top at offs: %lu; number of slots: %lu\n",
+ page, (ulong) page_offset(page_dir_get_nth_slot(page, n - 1)),
+ (ulong) n);
+ for (i = 0; i < n; i++) {
+ slot = page_dir_get_nth_slot(page, i);
+ if ((i == pr_n) && (i < n - pr_n)) {
+ fputs(" ... \n", stderr);
+ }
+ if ((i < pr_n) || (i >= n - pr_n)) {
+ fprintf(stderr,
+ "Contents of slot: %lu: n_owned: %lu,"
+ " rec offs: %lu\n",
+ (ulong) i,
+ (ulong) page_dir_slot_get_n_owned(slot),
+ (ulong)
+ page_offset(page_dir_slot_get_rec(slot)));
+ }
+ }
+ fprintf(stderr, "Total of %lu records\n"
+ "--------------------------------\n",
+ (ulong) (PAGE_HEAP_NO_USER_LOW + page_get_n_recs(page)));
+}
+
+/***************************************************************//**
+This is used to print the contents of the page record list for
+debugging purposes. */
+void
+page_print_list(
+/*============*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint pr_n) /*!< in: print n first and n last entries */
+{
+ page_t* page = block->frame;
+ page_cur_t cur;
+ ulint count;
+ ulint n_recs;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+	fprintf(stderr,
+ "--------------------------------\n"
+ "PAGE RECORD LIST\n"
+ "Page address %p\n", page);
+
+ n_recs = page_get_n_recs(page);
+
+ page_cur_set_before_first(block, &cur);
+ count = 0;
+ for (;;) {
+		offsets = rec_get_offsets(cur.rec, index, offsets,
+					  page_rec_is_leaf(cur.rec)
+					  ? index->n_core_fields : 0,
+					  ULINT_UNDEFINED, &heap);
+ page_rec_print(cur.rec, offsets);
+
+ if (count == pr_n) {
+ break;
+ }
+ if (page_cur_is_after_last(&cur)) {
+ break;
+ }
+ page_cur_move_to_next(&cur);
+ count++;
+ }
+
+ if (n_recs > 2 * pr_n) {
+ fputs(" ... \n", stderr);
+ }
+
+ while (!page_cur_is_after_last(&cur)) {
+ page_cur_move_to_next(&cur);
+
+ if (count + pr_n >= n_recs) {
+			offsets = rec_get_offsets(cur.rec, index, offsets,
+						  page_rec_is_leaf(cur.rec)
+						  ? index->n_core_fields : 0,
+						  ULINT_UNDEFINED, &heap);
+ page_rec_print(cur.rec, offsets);
+ }
+ count++;
+ }
+
+ fprintf(stderr,
+ "Total of %lu records \n"
+ "--------------------------------\n",
+ (ulong) (count + 1));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/***************************************************************//**
+Prints the info in a page header. */
+void
+page_header_print(
+/*==============*/
+ const page_t* page)
+{
+ fprintf(stderr,
+ "--------------------------------\n"
+ "PAGE HEADER INFO\n"
+ "Page address %p, n records %u (%s)\n"
+ "n dir slots %u, heap top %u\n"
+ "Page n heap %u, free %u, garbage %u\n"
+ "Page last insert %u, direction %u, n direction %u\n",
+ page, page_header_get_field(page, PAGE_N_RECS),
+ page_is_comp(page) ? "compact format" : "original format",
+ page_header_get_field(page, PAGE_N_DIR_SLOTS),
+ page_header_get_field(page, PAGE_HEAP_TOP),
+ page_dir_get_n_heap(page),
+ page_header_get_field(page, PAGE_FREE),
+ page_header_get_field(page, PAGE_GARBAGE),
+ page_header_get_field(page, PAGE_LAST_INSERT),
+ page_get_direction(page),
+ page_header_get_field(page, PAGE_N_DIRECTION));
+}
+
+/***************************************************************//**
+This is used to print the contents of the page for
+debugging purposes. */
+void
+page_print(
+/*=======*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint dn, /*!< in: print dn first and last entries
+ in directory */
+ ulint rn) /*!< in: print rn first and last records
+ in directory */
+{
+ page_t* page = block->frame;
+
+ page_header_print(page);
+ page_dir_print(page, dn);
+ page_print_list(block, index, rn);
+}
+#endif /* UNIV_BTR_PRINT */
+
+/***************************************************************//**
+The following is used to validate a record on a page. This function
+differs from rec_validate as it can also check the n_owned field and
+the heap_no field.
+@return TRUE if ok */
+ibool
+page_rec_validate(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n_owned;
+ ulint heap_no;
+ const page_t* page;
+
+ page = page_align(rec);
+ ut_a(!page_is_comp(page) == !rec_offs_comp(offsets));
+
+ page_rec_check(rec);
+ rec_validate(rec, offsets);
+
+ if (page_rec_is_comp(rec)) {
+ n_owned = rec_get_n_owned_new(rec);
+ heap_no = rec_get_heap_no_new(rec);
+ } else {
+ n_owned = rec_get_n_owned_old(rec);
+ heap_no = rec_get_heap_no_old(rec);
+ }
+
+ if (UNIV_UNLIKELY(!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED))) {
+ ib::warn() << "Dir slot of rec " << page_offset(rec)
+ << ", n owned too big " << n_owned;
+ return(FALSE);
+ }
+
+ if (UNIV_UNLIKELY(!(heap_no < page_dir_get_n_heap(page)))) {
+ ib::warn() << "Heap no of rec " << page_offset(rec)
+ << " too big " << heap_no << " "
+ << page_dir_get_n_heap(page);
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+#ifdef UNIV_DEBUG
+/***************************************************************//**
+Checks that the first directory slot points to the infimum record and
+the last to the supremum. This function is intended to track if the
+bug fixed in 4.0.14 has caused corruption to users' databases. */
+void
+page_check_dir(
+/*===========*/
+ const page_t* page) /*!< in: index page */
+{
+ ulint n_slots;
+ ulint infimum_offs;
+ ulint supremum_offs;
+
+ n_slots = page_dir_get_n_slots(page);
+ infimum_offs = mach_read_from_2(page_dir_get_nth_slot(page, 0));
+ supremum_offs = mach_read_from_2(page_dir_get_nth_slot(page,
+ n_slots - 1));
+
+ if (UNIV_UNLIKELY(!page_rec_is_infimum_low(infimum_offs))) {
+
+ ib::fatal() << "Page directory corruption: infimum not"
+ " pointed to";
+ }
+
+ if (UNIV_UNLIKELY(!page_rec_is_supremum_low(supremum_offs))) {
+
+ ib::fatal() << "Page directory corruption: supremum not"
+ " pointed to";
+ }
+}
+#endif /* UNIV_DEBUG */
+
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+ibool
+page_simple_validate_old(
+/*=====================*/
+ const page_t* page) /*!< in: index page in ROW_FORMAT=REDUNDANT */
+{
+ const page_dir_slot_t* slot;
+ ulint slot_no;
+ ulint n_slots;
+ const rec_t* rec;
+ const byte* rec_heap_top;
+ ulint count;
+ ulint own_count;
+ ibool ret = FALSE;
+
+ ut_a(!page_is_comp(page));
+
+ /* Check first that the record heap and the directory do not
+ overlap. */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (UNIV_UNLIKELY(n_slots < 2 || n_slots > srv_page_size / 4)) {
+ ib::error() << "Nonsensical number of page dir slots: "
+ << n_slots;
+ goto func_exit;
+ }
+
+ rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP);
+
+ if (UNIV_UNLIKELY(rec_heap_top
+ > page_dir_get_nth_slot(page, n_slots - 1))) {
+ ib::error()
+ << "Record heap and dir overlap on a page, heap top "
+ << page_header_get_field(page, PAGE_HEAP_TOP)
+ << ", dir "
+ << page_offset(page_dir_get_nth_slot(page,
+ n_slots - 1));
+
+ goto func_exit;
+ }
+
+ /* Validate the record list in a loop checking also that it is
+ consistent with the page record directory. */
+
+ count = 0;
+ own_count = 1;
+ slot_no = 0;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ rec = page_get_infimum_rec(page);
+
+ for (;;) {
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ ib::error() << "Record " << (rec - page)
+ << " is above rec heap top "
+ << (rec_heap_top - page);
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) != 0)) {
+ /* This is a record pointed to by a dir slot */
+ if (UNIV_UNLIKELY(rec_get_n_owned_old(rec)
+ != own_count)) {
+
+ ib::error() << "Wrong owned count "
+ << rec_get_n_owned_old(rec)
+ << ", " << own_count << ", rec "
+ << (rec - page);
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY
+ (page_dir_slot_get_rec(slot) != rec)) {
+ ib::error() << "Dir slot does not point"
+ " to right rec " << (rec - page);
+
+ goto func_exit;
+ }
+
+ own_count = 0;
+
+ if (!page_rec_is_supremum(rec)) {
+ slot_no++;
+ slot = page_dir_get_nth_slot(page, slot_no);
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ break;
+ }
+
+ if (UNIV_UNLIKELY
+ (rec_get_next_offs(rec, FALSE) < FIL_PAGE_DATA
+ || rec_get_next_offs(rec, FALSE) >= srv_page_size)) {
+
+ ib::error() << "Next record offset nonsensical "
+ << rec_get_next_offs(rec, FALSE) << " for rec "
+ << (rec - page);
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > srv_page_size)) {
+ ib::error() << "Page record list appears"
+ " to be circular " << count;
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next_const(rec);
+ own_count++;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) {
+ ib::error() << "n owned is zero in a supremum rec";
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+ ib::error() << "n slots wrong "
+ << slot_no << ", " << (n_slots - 1);
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(ulint(page_header_get_field(page, PAGE_N_RECS))
+ + PAGE_HEAP_NO_USER_LOW
+ != count + 1)) {
+ ib::error() << "n recs wrong "
+ << page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW << " " << (count + 1);
+
+ goto func_exit;
+ }
+
+ /* Check then the free list */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+
+ while (rec != NULL) {
+ if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA
+ || rec >= page + srv_page_size)) {
+ ib::error() << "Free list record has"
+ " a nonsensical offset " << (rec - page);
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ ib::error() << "Free list record " << (rec - page)
+ << " is above rec heap top "
+ << (rec_heap_top - page);
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > srv_page_size)) {
+ ib::error() << "Page free list appears"
+ " to be circular " << count;
+ goto func_exit;
+ }
+
+ ulint offs = rec_get_next_offs(rec, FALSE);
+ if (!offs) {
+ break;
+ }
+ if (UNIV_UNLIKELY(offs < PAGE_OLD_INFIMUM
+ || offs >= srv_page_size)) {
+ ib::error() << "Page free list is corrupted " << count;
+ goto func_exit;
+ }
+
+ rec = page + offs;
+ }
+
+ if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+
+ ib::error() << "N heap is wrong "
+ << page_dir_get_n_heap(page) << ", " << (count + 1);
+
+ goto func_exit;
+ }
+
+ ret = TRUE;
+
+func_exit:
+ return(ret);
+}
+
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+ibool
+page_simple_validate_new(
+/*=====================*/
+ const page_t* page) /*!< in: index page in ROW_FORMAT!=REDUNDANT */
+{
+ const page_dir_slot_t* slot;
+ ulint slot_no;
+ ulint n_slots;
+ const rec_t* rec;
+ const byte* rec_heap_top;
+ ulint count;
+ ulint own_count;
+ ibool ret = FALSE;
+
+ ut_a(page_is_comp(page));
+
+ /* Check first that the record heap and the directory do not
+ overlap. */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (UNIV_UNLIKELY(n_slots < 2 || n_slots > srv_page_size / 4)) {
+ ib::error() << "Nonsensical number of page dir slots: "
+ << n_slots;
+ goto func_exit;
+ }
+
+ rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP);
+
+ if (UNIV_UNLIKELY(rec_heap_top
+ > page_dir_get_nth_slot(page, n_slots - 1))) {
+
+ ib::error() << "Record heap and dir overlap on a page,"
+ " heap top "
+ << page_header_get_field(page, PAGE_HEAP_TOP)
+ << ", dir " << page_offset(
+ page_dir_get_nth_slot(page, n_slots - 1));
+
+ goto func_exit;
+ }
+
+ /* Validate the record list in a loop checking also that it is
+ consistent with the page record directory. */
+
+ count = 0;
+ own_count = 1;
+ slot_no = 0;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ rec = page_get_infimum_rec(page);
+
+ for (;;) {
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+
+ ib::error() << "Record " << page_offset(rec)
+ << " is above rec heap top "
+ << page_offset(rec_heap_top);
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) != 0)) {
+ /* This is a record pointed to by a dir slot */
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec)
+ != own_count)) {
+
+ ib::error() << "Wrong owned count "
+ << rec_get_n_owned_new(rec) << ", "
+ << own_count << ", rec "
+ << page_offset(rec);
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY
+ (page_dir_slot_get_rec(slot) != rec)) {
+ ib::error() << "Dir slot does not point"
+ " to right rec " << page_offset(rec);
+
+ goto func_exit;
+ }
+
+ own_count = 0;
+
+ if (!page_rec_is_supremum(rec)) {
+ slot_no++;
+ slot = page_dir_get_nth_slot(page, slot_no);
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ break;
+ }
+
+ if (UNIV_UNLIKELY
+ (rec_get_next_offs(rec, TRUE) < FIL_PAGE_DATA
+ || rec_get_next_offs(rec, TRUE) >= srv_page_size)) {
+
+ ib::error() << "Next record offset nonsensical "
+ << rec_get_next_offs(rec, TRUE)
+ << " for rec " << page_offset(rec);
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > srv_page_size)) {
+ ib::error() << "Page record list appears to be"
+ " circular " << count;
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next_const(rec);
+ own_count++;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) {
+ ib::error() << "n owned is zero in a supremum rec";
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+ ib::error() << "n slots wrong " << slot_no << ", "
+ << (n_slots - 1);
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(ulint(page_header_get_field(page, PAGE_N_RECS))
+ + PAGE_HEAP_NO_USER_LOW
+ != count + 1)) {
+ ib::error() << "n recs wrong "
+ << page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW << " " << (count + 1);
+
+ goto func_exit;
+ }
+
+ /* Check then the free list */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+
+ while (rec != NULL) {
+ if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA
+ || rec >= page + srv_page_size)) {
+
+ ib::error() << "Free list record has"
+ " a nonsensical offset " << page_offset(rec);
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ ib::error() << "Free list record " << page_offset(rec)
+ << " is above rec heap top "
+ << page_offset(rec_heap_top);
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > srv_page_size)) {
+ ib::error() << "Page free list appears to be"
+ " circular " << count;
+ goto func_exit;
+ }
+
+ const ulint offs = rec_get_next_offs(rec, TRUE);
+ if (!offs) {
+ break;
+ }
+ if (UNIV_UNLIKELY(offs < PAGE_OLD_INFIMUM
+ || offs >= srv_page_size)) {
+ ib::error() << "Page free list is corrupted " << count;
+ goto func_exit;
+ }
+
+ rec = page + offs;
+ }
+
+ if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+
+ ib::error() << "N heap is wrong "
+ << page_dir_get_n_heap(page) << ", " << (count + 1);
+
+ goto func_exit;
+ }
+
+ ret = TRUE;
+
+func_exit:
+ return(ret);
+}
+
+/** Check the consistency of an index page.
+@param[in] page index page
+@param[in] index B-tree or R-tree index
+@return whether the page is valid */
+bool page_validate(const page_t* page, const dict_index_t* index)
+{
+ const page_dir_slot_t* slot;
+ const rec_t* rec;
+ const rec_t* old_rec = NULL;
+ const rec_t* first_rec = NULL;
+ ulint offs = 0;
+ ulint n_slots;
+ ibool ret = TRUE;
+ ulint i;
+ rec_offs offsets_1[REC_OFFS_NORMAL_SIZE];
+ rec_offs offsets_2[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_1;
+ rec_offs* old_offsets = offsets_2;
+
+ rec_offs_init(offsets_1);
+ rec_offs_init(offsets_2);
+
+#ifdef UNIV_GIS_DEBUG
+ if (dict_index_is_spatial(index)) {
+ fprintf(stderr, "Page no: %lu\n", page_get_page_no(page));
+ }
+#endif /* UNIV_GIS_DEBUG */
+
+ if (UNIV_UNLIKELY((ibool) !!page_is_comp(page)
+ != dict_table_is_comp(index->table))) {
+ ib::error() << "'compact format' flag mismatch";
+func_exit2:
+ ib::error() << "Apparent corruption in space "
+ << page_get_space_id(page) << " page "
+ << page_get_page_no(page)
+ << " of index " << index->name
+ << " of table " << index->table->name;
+ return FALSE;
+ }
+
+ if (page_is_comp(page)) {
+ if (UNIV_UNLIKELY(!page_simple_validate_new(page))) {
+ goto func_exit2;
+ }
+ } else {
+ if (UNIV_UNLIKELY(!page_simple_validate_old(page))) {
+ goto func_exit2;
+ }
+ }
+
+	/* Multiple transactions cannot operate on the same temp-table
+	in parallel.
+	max_trx_id is ignored for temp tables because it is not required
+ for MVCC. */
+ if (!page_is_leaf(page) || page_is_empty(page)
+ || !dict_index_is_sec_or_ibuf(index)
+ || index->table->is_temporary()) {
+ } else if (trx_id_t sys_max_trx_id = trx_sys.get_max_trx_id()) {
+ trx_id_t max_trx_id = page_get_max_trx_id(page);
+
+ if (max_trx_id == 0 || max_trx_id > sys_max_trx_id) {
+ ib::error() << "PAGE_MAX_TRX_ID out of bounds: "
+ << max_trx_id << ", " << sys_max_trx_id;
+ ret = FALSE;
+ }
+ } else {
+ ut_ad(srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN);
+ }
+
+ /* Check first that the record heap and the directory do not
+ overlap. */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP)
+ <= page_dir_get_nth_slot(page, n_slots - 1)))) {
+
+ ib::warn() << "Record heap and directory overlap";
+ goto func_exit2;
+ }
+
+ switch (uint16_t type = fil_page_get_type(page)) {
+ case FIL_PAGE_RTREE:
+ if (!index->is_spatial()) {
+wrong_page_type:
+ ib::warn() << "Wrong page type " << type;
+ ret = FALSE;
+ }
+ break;
+ case FIL_PAGE_TYPE_INSTANT:
+ if (index->is_instant()
+ && page_get_page_no(page) == index->page) {
+ break;
+ }
+ goto wrong_page_type;
+ case FIL_PAGE_INDEX:
+ if (index->is_spatial()) {
+ goto wrong_page_type;
+ }
+ if (index->is_instant()
+ && page_get_page_no(page) == index->page) {
+ goto wrong_page_type;
+ }
+ break;
+ default:
+ goto wrong_page_type;
+ }
+
+ /* The following buffer is used to check that the
+ records in the page record heap do not overlap */
+	mem_heap_t* heap = mem_heap_create(srv_page_size + 200);
+ byte* buf = static_cast<byte*>(mem_heap_zalloc(heap, srv_page_size));
+
+ /* Validate the record list in a loop checking also that
+ it is consistent with the directory. */
+ ulint count = 0, data_size = 0, own_count = 1, slot_no = 0;
+ ulint info_bits;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ rec = page_get_infimum_rec(page);
+
+ const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
+
+ for (;;) {
+ offsets = rec_get_offsets(rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+
+ if (page_is_comp(page) && page_rec_is_user_rec(rec)
+ && UNIV_UNLIKELY(rec_get_node_ptr_flag(rec)
+ == page_is_leaf(page))) {
+ ib::error() << "'node_ptr' flag mismatch";
+ ret = FALSE;
+ goto next_rec;
+ }
+
+ if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) {
+ ret = FALSE;
+ goto next_rec;
+ }
+
+ info_bits = rec_get_info_bits(rec, page_is_comp(page));
+ if (info_bits
+ & ~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)) {
+ ib::error() << "info_bits has an incorrect value "
+ << info_bits;
+ ret = false;
+ }
+
+ if (rec == first_rec) {
+ if (info_bits & REC_INFO_MIN_REC_FLAG) {
+ if (page_has_prev(page)) {
+ ib::error() << "REC_INFO_MIN_REC_FLAG "
+ "is set on non-left page";
+ ret = false;
+ } else if (!page_is_leaf(page)) {
+ /* leftmost node pointer page */
+ } else if (!index->is_instant()) {
+ ib::error() << "REC_INFO_MIN_REC_FLAG "
+ "is set in a leaf-page record";
+ ret = false;
+ } else if (!(info_bits & REC_INFO_DELETED_FLAG)
+ != !index->table->instant) {
+ ib::error() << (index->table->instant
+ ? "Metadata record "
+ "is not delete-marked"
+ : "Metadata record "
+ "is delete-marked");
+ ret = false;
+ }
+ } else if (!page_has_prev(page)
+ && index->is_instant()) {
+ ib::error() << "Metadata record is missing";
+ ret = false;
+ }
+ } else if (info_bits & REC_INFO_MIN_REC_FLAG) {
+ ib::error() << "REC_INFO_MIN_REC_FLAG record is not "
+ "first in page";
+ ret = false;
+ }
+
+ if (page_is_comp(page)) {
+ const rec_comp_status_t status = rec_get_status(rec);
+ if (status != REC_STATUS_ORDINARY
+ && status != REC_STATUS_NODE_PTR
+ && status != REC_STATUS_INFIMUM
+ && status != REC_STATUS_SUPREMUM
+ && status != REC_STATUS_INSTANT) {
+ ib::error() << "impossible record status "
+ << status;
+ ret = false;
+ } else if (page_rec_is_infimum(rec)) {
+ if (status != REC_STATUS_INFIMUM) {
+ ib::error()
+ << "infimum record has status "
+ << status;
+ ret = false;
+ }
+ } else if (page_rec_is_supremum(rec)) {
+ if (status != REC_STATUS_SUPREMUM) {
+ ib::error() << "supremum record has "
+ "status "
+ << status;
+ ret = false;
+ }
+ } else if (!page_is_leaf(page)) {
+ if (status != REC_STATUS_NODE_PTR) {
+ ib::error() << "node ptr record has "
+ "status "
+ << status;
+ ret = false;
+ }
+ } else if (!index->is_instant()
+ && status == REC_STATUS_INSTANT) {
+ ib::error() << "instantly added record in a "
+ "non-instant index";
+ ret = false;
+ }
+ }
+
+ /* Check that the records are in the ascending order */
+ if (count >= PAGE_HEAP_NO_USER_LOW
+ && !page_rec_is_supremum(rec)) {
+
+			int	cmp = cmp_rec_rec(
+				rec, old_rec, offsets, old_offsets, index);
+
+			/* For spatial indexes, on non-leaf levels we
+			allow records to be equal. */
+			if (cmp <= 0 && !(cmp == 0 && index->is_spatial()
+					  && !page_is_leaf(page))) {
+
+ ib::error() << "Records in wrong order";
+
+ fputs("\nInnoDB: previous record ", stderr);
+ /* For spatial index, print the mbr info.*/
+ if (index->type & DICT_SPATIAL) {
+ putc('\n', stderr);
+ rec_print_mbr_rec(stderr,
+ old_rec, old_offsets);
+ fputs("\nInnoDB: record ", stderr);
+ putc('\n', stderr);
+ rec_print_mbr_rec(stderr, rec, offsets);
+ putc('\n', stderr);
+ putc('\n', stderr);
+
+ } else {
+ rec_print_new(stderr, old_rec, old_offsets);
+ fputs("\nInnoDB: record ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ }
+
+ ret = FALSE;
+ }
+ }
+
+ if (page_rec_is_user_rec(rec)) {
+
+ data_size += rec_offs_size(offsets);
+
+#if defined(UNIV_GIS_DEBUG)
+ /* For spatial index, print the mbr info.*/
+ if (index->type & DICT_SPATIAL) {
+ rec_print_mbr_rec(stderr, rec, offsets);
+ putc('\n', stderr);
+ }
+#endif /* UNIV_GIS_DEBUG */
+ }
+
+ offs = page_offset(rec_get_start(rec, offsets));
+ i = rec_offs_size(offsets);
+ if (UNIV_UNLIKELY(offs + i >= srv_page_size)) {
+ ib::error() << "Record offset out of bounds: "
+ << offs << '+' << i;
+ ret = FALSE;
+ goto next_rec;
+ }
+ while (i--) {
+ if (UNIV_UNLIKELY(buf[offs + i])) {
+ ib::error() << "Record overlaps another: "
+ << offs << '+' << i;
+ ret = FALSE;
+ break;
+ }
+ buf[offs + i] = 1;
+ }
+
+ if (ulint rec_own_count = page_is_comp(page)
+ ? rec_get_n_owned_new(rec)
+ : rec_get_n_owned_old(rec)) {
+ /* This is a record pointed to by a dir slot */
+ if (UNIV_UNLIKELY(rec_own_count != own_count)) {
+ ib::error() << "Wrong owned count at " << offs
+ << ": " << rec_own_count
+ << ", " << own_count;
+ ret = FALSE;
+ }
+
+ if (page_dir_slot_get_rec(slot) != rec) {
+ ib::error() << "Dir slot does not"
+ " point to right rec at " << offs;
+ ret = FALSE;
+ }
+
+ if (ret) {
+ page_dir_slot_check(slot);
+ }
+
+ own_count = 0;
+ if (!page_rec_is_supremum(rec)) {
+ slot_no++;
+ slot = page_dir_get_nth_slot(page, slot_no);
+ }
+ }
+
+next_rec:
+ if (page_rec_is_supremum(rec)) {
+ break;
+ }
+
+ count++;
+ own_count++;
+ old_rec = rec;
+ rec = page_rec_get_next_const(rec);
+
+ if (page_rec_is_infimum(old_rec)
+ && page_rec_is_user_rec(rec)) {
+ first_rec = rec;
+ }
+
+ /* set old_offsets to offsets; recycle offsets */
+ std::swap(old_offsets, offsets);
+ }
+
+ if (page_is_comp(page)) {
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) {
+
+ goto n_owned_zero;
+ }
+ } else if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) {
+n_owned_zero:
+ ib::error() << "n owned is zero at " << offs;
+ ret = FALSE;
+ }
+
+ if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+ ib::error() << "n slots wrong " << slot_no << " "
+ << (n_slots - 1);
+ ret = FALSE;
+ }
+
+ if (UNIV_UNLIKELY(ulint(page_header_get_field(page, PAGE_N_RECS))
+ + PAGE_HEAP_NO_USER_LOW
+ != count + 1)) {
+ ib::error() << "n recs wrong "
+ << page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW << " " << (count + 1);
+ ret = FALSE;
+ }
+
+ if (UNIV_UNLIKELY(data_size != page_get_data_size(page))) {
+ ib::error() << "Summed data size " << data_size
+ << ", returned by func " << page_get_data_size(page);
+ ret = FALSE;
+ }
+
+ /* Check then the free list */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+
+ while (rec != NULL) {
+ offsets = rec_get_offsets(rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) {
+ ret = FALSE;
+next_free:
+ const ulint offs = rec_get_next_offs(
+ rec, page_is_comp(page));
+ if (!offs) {
+ break;
+ }
+ if (UNIV_UNLIKELY(offs < PAGE_OLD_INFIMUM
+ || offs >= srv_page_size)) {
+ ib::error() << "Page free list is corrupted";
+ ret = FALSE;
+ break;
+ }
+
+ rec = page + offs;
+ continue;
+ }
+
+ count++;
+ offs = page_offset(rec_get_start(rec, offsets));
+ i = rec_offs_size(offsets);
+ if (UNIV_UNLIKELY(offs + i >= srv_page_size)) {
+ ib::error() << "Free record offset out of bounds: "
+ << offs << '+' << i;
+ ret = FALSE;
+ goto next_free;
+ }
+ while (i--) {
+ if (UNIV_UNLIKELY(buf[offs + i])) {
+ ib::error() << "Free record overlaps another: "
+ << offs << '+' << i;
+ ret = FALSE;
+ break;
+ }
+ buf[offs + i] = 1;
+ }
+
+ goto next_free;
+ }
+
+ if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+ ib::error() << "N heap is wrong "
+ << page_dir_get_n_heap(page) << " " << count + 1;
+ ret = FALSE;
+ }
+
+ mem_heap_free(heap);
+
+ if (UNIV_UNLIKELY(!ret)) {
+ goto func_exit2;
+ }
+
+ return(ret);
+}
+
+/***************************************************************//**
+Looks in the page record list for a record with the given heap number.
+@return record, NULL if not found */
+const rec_t*
+page_find_rec_with_heap_no(
+/*=======================*/
+ const page_t* page, /*!< in: index page */
+ ulint heap_no)/*!< in: heap number */
+{
+ const rec_t* rec;
+
+ if (page_is_comp(page)) {
+ rec = page + PAGE_NEW_INFIMUM;
+
+ for (;;) {
+ ulint rec_heap_no = rec_get_heap_no_new(rec);
+
+ if (rec_heap_no == heap_no) {
+
+ return(rec);
+ } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) {
+
+ return(NULL);
+ }
+
+ rec = page + rec_get_next_offs(rec, TRUE);
+ }
+ } else {
+ rec = page + PAGE_OLD_INFIMUM;
+
+ for (;;) {
+ ulint rec_heap_no = rec_get_heap_no_old(rec);
+
+ if (rec_heap_no == heap_no) {
+
+ return(rec);
+ } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) {
+
+ return(NULL);
+ }
+
+ rec = page + rec_get_next_offs(rec, FALSE);
+ }
+ }
+}
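+
+/* Note: the lookup above is a linear scan of the singly-linked record
+list starting from the page infimum, so its cost grows with the number
+of records on the page; the heap number cannot be used as a direct
+index because records are not linked in heap order. */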
+
+/** Get the last non-delete-marked record on a page.
+@param[in] page index tree leaf page
+@return the last record, not delete-marked
+@retval infimum record if all records are delete-marked */
+const rec_t*
+page_find_rec_max_not_deleted(
+ const page_t* page)
+{
+ const rec_t* rec = page_get_infimum_rec(page);
+ const rec_t* prev_rec = NULL; // remove warning
+
+ /* Because the page infimum is never delete-marked
+	and never the metadata pseudo-record (MIN_REC_FLAG),
+ prev_rec will always be assigned to it first. */
+ ut_ad(!rec_get_info_bits(rec, page_rec_is_comp(rec)));
+ ut_ad(page_is_leaf(page));
+
+ if (page_is_comp(page)) {
+ do {
+ if (!(rec[-REC_NEW_INFO_BITS]
+ & (REC_INFO_DELETED_FLAG
+ | REC_INFO_MIN_REC_FLAG))) {
+ prev_rec = rec;
+ }
+ rec = page_rec_get_next_low(rec, true);
+ } while (rec != page + PAGE_NEW_SUPREMUM);
+ } else {
+ do {
+ if (!(rec[-REC_OLD_INFO_BITS]
+ & (REC_INFO_DELETED_FLAG
+ | REC_INFO_MIN_REC_FLAG))) {
+ prev_rec = rec;
+ }
+ rec = page_rec_get_next_low(rec, false);
+ } while (rec != page + PAGE_OLD_SUPREMUM);
+ }
+ return(prev_rec);
+}
diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc
new file mode 100644
index 00000000..331ecbfb
--- /dev/null
+++ b/storage/innobase/page/page0zip.cc
@@ -0,0 +1,4713 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file page/page0zip.cc
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#include "page0zip.h"
+#include "fsp0types.h"
+#include "page0page.h"
+#include "buf0checksum.h"
+#include "ut0crc32.h"
+#include "zlib.h"
+#include "span.h"
+
+using st_::span;
+
+#ifndef UNIV_INNOCHECKSUM
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "btr0cur.h"
+#include "log0recv.h"
+#include "row0row.h"
+#include "btr0sea.h"
+#include "dict0boot.h"
+#include "lock0lock.h"
+#include "srv0srv.h"
+#include "buf0lru.h"
+#include "srv0mon.h"
+
+#include <map>
+#include <algorithm>
+
+/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
+page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX];
+/** Statistics on compression, indexed by index->id */
+page_zip_stat_per_index_t page_zip_stat_per_index;
+
+/** Compression level to be used by zlib. Settable by user. */
+uint page_zip_level;
+
+/* Please refer to ../include/page0zip.ic for a description of the
+compressed page format. */
+
+/* The infimum and supremum records are omitted from the compressed page.
+On compression we verify that they are present, and on decompression we
+restore them. */
+/** Extra bytes of an infimum record */
+static const byte infimum_extra[] = {
+ 0x01, /* info_bits=0, n_owned=1 */
+ 0x00, 0x02 /* heap_no=0, status=2 */
+ /* ?, ? */ /* next=(first user rec, or supremum) */
+};
+/** Data bytes of an infimum record */
+static const byte infimum_data[] = {
+ 0x69, 0x6e, 0x66, 0x69,
+ 0x6d, 0x75, 0x6d, 0x00 /* "infimum\0" */
+};
+/** Extra bytes and data bytes of a supremum record */
+static const byte supremum_extra_data alignas(4) [] = {
+ /* 0x0?, */ /* info_bits=0, n_owned=1..8 */
+ 0x00, 0x0b, /* heap_no=1, status=3 */
+ 0x00, 0x00, /* next=0 */
+ 0x73, 0x75, 0x70, 0x72,
+ 0x65, 0x6d, 0x75, 0x6d /* "supremum" */
+};
+
+/** Assert that a block of memory is filled with zero bytes.
+@param b in: memory block
+@param s in: size of the memory block, in bytes */
+#define ASSERT_ZERO(b, s) ut_ad(!memcmp(b, field_ref_zero, s))
+/** Assert that a BLOB pointer is filled with zero bytes.
+@param b in: BLOB pointer */
+#define ASSERT_ZERO_BLOB(b) ASSERT_ZERO(b, FIELD_REF_SIZE)
+
+/* Enable some extra debugging output. This code can be enabled
+independently of any UNIV_ debugging conditions. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+# include <stdarg.h>
+MY_ATTRIBUTE((format (printf, 1, 2)))
+/**********************************************************************//**
+Report a failure to decompress or compress.
+@return number of characters printed */
+static
+int
+page_zip_fail_func(
+/*===============*/
+ const char* fmt, /*!< in: printf(3) format string */
+ ...) /*!< in: arguments corresponding to fmt */
+{
+ int res;
+ va_list ap;
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: ", stderr);
+ va_start(ap, fmt);
+ res = vfprintf(stderr, fmt, ap);
+ va_end(ap);
+
+ return(res);
+}
+/** Wrapper for page_zip_fail_func()
+@param fmt_args in: printf(3) format string and arguments */
+# define page_zip_fail(fmt_args) page_zip_fail_func fmt_args
+#else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+/** Dummy wrapper for page_zip_fail_func()
+@param fmt_args ignored: printf(3) format string and arguments */
+# define page_zip_fail(fmt_args) /* empty */
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return minimum payload size on the page */
+ulint
+page_zip_empty_size(
+/*================*/
+ ulint n_fields, /*!< in: number of columns in the index */
+ ulint zip_size) /*!< in: compressed page size in bytes */
+{
+ ulint size = zip_size
+ /* subtract the page header and the longest
+ uncompressed data needed for one record */
+ - (PAGE_DATA
+ + PAGE_ZIP_CLUST_LEAF_SLOT_SIZE
+ + 1/* encoded heap_no==2 in page_zip_write_rec() */
+ + 1/* end of modification log */
+ - REC_N_NEW_EXTRA_BYTES/* omitted bytes */)
+ /* subtract the space for page_zip_fields_encode() */
+ - compressBound(static_cast<uLong>(2 * (n_fields + 1)));
+ return(lint(size) > 0 ? size : 0);
+}
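+
+#if 0 /* Illustrative sketch, not part of the source: a caller could
+use this bound to reject a record that would not fit even on an empty
+compressed page; the function name is hypothetical. */
+static bool page_zip_rec_fits_empty_page(
+	const dict_index_t*	index,
+	ulint			rec_size)
+{
+	return(rec_size <= page_zip_empty_size(
+		       index->n_fields,
+		       index->table->space->zip_size()));
+}
+#endif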
+
+/** Check whether a tuple is too big for compressed table
+@param[in] index dict index object
+@param[in] entry entry for the index
+@return true if it's too big, otherwise false */
+bool
+page_zip_is_too_big(
+ const dict_index_t* index,
+ const dtuple_t* entry)
+{
+ const ulint zip_size = index->table->space->zip_size();
+
+ /* Estimate the free space of an empty compressed page.
+ Subtract one byte for the encoded heap_no in the
+ modification log. */
+ ulint free_space_zip = page_zip_empty_size(
+ index->n_fields, zip_size);
+ ulint n_uniq = dict_index_get_n_unique_in_tree(index);
+
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(zip_size);
+
+ if (free_space_zip == 0) {
+ return(true);
+ }
+
+ /* Subtract one byte for the encoded heap_no in the
+ modification log. */
+ free_space_zip--;
+
+ /* There should be enough room for two node pointer
+ records on an empty non-leaf page. This prevents
+ infinite page splits. */
+
+ if (entry->n_fields >= n_uniq
+ && (REC_NODE_PTR_SIZE
+ + rec_get_converted_size_comp_prefix(
+ index, entry->fields, n_uniq, NULL)
+ /* On a compressed page, there is
+ a two-byte entry in the dense
+ page directory for every record.
+ But there is no record header. */
+ - (REC_N_NEW_EXTRA_BYTES - 2)
+ > free_space_zip / 2)) {
+ return(true);
+ }
+
+ return(false);
+}
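+
+#if 0 /* Illustrative sketch, not part of the source: typical use
+before inserting an entry into a ROW_FORMAT=COMPRESSED index; the
+function name is hypothetical. */
+static dberr_t page_zip_entry_check(
+	const dict_index_t*	index,
+	const dtuple_t*		entry)
+{
+	return(page_zip_is_too_big(index, entry)
+	       ? DB_TOO_BIG_RECORD : DB_SUCCESS);
+}
+#endif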
+
+/*************************************************************//**
+Gets the number of elements in the dense page directory,
+including deleted records (the free list).
+@return number of elements in the dense page directory */
+UNIV_INLINE
+ulint
+page_zip_dir_elems(
+/*===============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ /* Exclude the page infimum and supremum from the record count. */
+ return ulint(page_dir_get_n_heap(page_zip->data))
+ - PAGE_HEAP_NO_USER_LOW;
+}
+
+/*************************************************************//**
+Gets the size of the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@return length of dense page directory, in bytes */
+UNIV_INLINE
+ulint
+page_zip_dir_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ return(PAGE_ZIP_DIR_SLOT_SIZE * page_zip_dir_elems(page_zip));
+}
+
+/*************************************************************//**
+Gets an offset to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@return offset of the dense page directory */
+UNIV_INLINE
+ulint
+page_zip_dir_start_offs(
+/*====================*/
+ const page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint n_dense) /*!< in: directory size */
+{
+ ut_ad(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip_get_size(page_zip));
+
+ return(page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE);
+}
+
+/*************************************************************//**
+Gets a pointer to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@param[in] page_zip compressed page
+@param[in] n_dense number of entries in the directory
+@return pointer to the dense page directory */
+#define page_zip_dir_start_low(page_zip, n_dense) \
+ ((page_zip)->data + page_zip_dir_start_offs(page_zip, n_dense))
+/*************************************************************//**
+Gets a pointer to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@param[in] page_zip compressed page
+@return pointer to the dense page directory */
+#define page_zip_dir_start(page_zip) \
+ page_zip_dir_start_low(page_zip, page_zip_dir_elems(page_zip))
+
+/*************************************************************//**
+Gets the size of the compressed page trailer (the dense page directory),
+only including user records (excluding the free list).
+@return length of dense page directory comprising existing records, in bytes */
+UNIV_INLINE
+ulint
+page_zip_dir_user_size(
+/*===================*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ ulint size = PAGE_ZIP_DIR_SLOT_SIZE
+ * ulint(page_get_n_recs(page_zip->data));
+ ut_ad(size <= page_zip_dir_size(page_zip));
+ return(size);
+}
+
+/*************************************************************//**
+Find the slot of the given record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find_low(
+/*==================*/
+ byte* slot, /*!< in: start of records */
+ byte* end, /*!< in: end of records */
+ ulint offset) /*!< in: offset of user record */
+{
+ ut_ad(slot <= end);
+
+ for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) {
+ if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK)
+ == offset) {
+ return(slot);
+ }
+ }
+
+ return(NULL);
+}
+
+/*************************************************************//**
+Find the slot of the given non-free record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint offset) /*!< in: offset of user record */
+{
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ return(page_zip_dir_find_low(end - page_zip_dir_user_size(page_zip),
+ end,
+ offset));
+}
+
+/*************************************************************//**
+Find the slot of the given free record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find_free(
+/*===================*/
+ page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint offset) /*!< in: offset of user record */
+{
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ return(page_zip_dir_find_low(end - page_zip_dir_size(page_zip),
+ end - page_zip_dir_user_size(page_zip),
+ offset));
+}
+
+/*************************************************************//**
+Read a given slot in the dense page directory.
+@return record offset on the uncompressed page, possibly ORed with
+PAGE_ZIP_DIR_SLOT_DEL or PAGE_ZIP_DIR_SLOT_OWNED */
+UNIV_INLINE
+ulint
+page_zip_dir_get(
+/*=============*/
+ const page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint slot) /*!< in: slot
+ (0=first user record) */
+{
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE);
+ return(mach_read_from_2(page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1)));
+}
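+
+#if 0 /* Illustrative sketch, not part of the source: decoding a dense
+directory entry; the two flag bits must be masked off to obtain the
+record offset on the uncompressed page. */
+static void page_zip_dir_scan(const page_zip_des_t* page_zip)
+{
+	for (ulint i = 0; i < page_zip_dir_elems(page_zip); i++) {
+		const ulint	slot = page_zip_dir_get(page_zip, i);
+		const ulint	offs = slot & PAGE_ZIP_DIR_SLOT_MASK;
+		const bool	del = (slot & PAGE_ZIP_DIR_SLOT_DEL) != 0;
+		const bool	owned = (slot & PAGE_ZIP_DIR_SLOT_OWNED) != 0;
+		/* offs, del and owned describe one user record */
+	}
+}
+#endif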
+
+/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+@param[in] b ROW_FORMAT=COMPRESSED index page
+@param[in] offset byte offset from b.zip.data
+@param[in] len length of the data to write */
+inline void mtr_t::zmemcpy(const buf_block_t &b, ulint offset, ulint len)
+{
+ ut_ad(fil_page_get_type(b.page.zip.data) == FIL_PAGE_INDEX ||
+ fil_page_get_type(b.page.zip.data) == FIL_PAGE_RTREE);
+ ut_ad(page_zip_simple_validate(&b.page.zip));
+ ut_ad(offset + len <= page_zip_get_size(&b.page.zip));
+
+ memcpy_low(b, static_cast<uint16_t>(offset), &b.page.zip.data[offset], len);
+ m_last_offset= static_cast<uint16_t>(offset + len);
+}
+
+/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+@param[in] b ROW_FORMAT=COMPRESSED index page
+@param[in] dest destination within b.zip.data
+@param[in] str the data to write
+@param[in] len length of the data to write
+@tparam w write request type */
+template<mtr_t::write_type w>
+inline void mtr_t::zmemcpy(const buf_block_t &b, void *dest, const void *str,
+ ulint len)
+{
+ byte *d= static_cast<byte*>(dest);
+ const byte *s= static_cast<const byte*>(str);
+ ut_ad(d >= b.page.zip.data + FIL_PAGE_OFFSET);
+ if (w != FORCED)
+ {
+ ut_ad(len);
+ const byte *const end= d + len;
+ while (*d++ == *s++)
+ {
+ if (d == end)
+ {
+ ut_ad(w == MAYBE_NOP);
+ return;
+ }
+ }
+ s--;
+ d--;
+ len= static_cast<ulint>(end - d);
+ }
+ ::memcpy(d, s, len);
+ zmemcpy(b, d - b.page.zip.data, len);
+}
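+
+/* Design note on the template above: with w == MAYBE_NOP, the leading
+bytes that already match the destination are skipped, so both the
+memcpy() and the redo log record start at the first differing byte;
+if the whole string matches, nothing is copied or logged at all. */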
+
+/** Write redo log for compressing a ROW_FORMAT=COMPRESSED index page.
+@param[in,out] block ROW_FORMAT=COMPRESSED index page
+@param[in] index the index that the block belongs to
+@param[in,out] mtr mini-transaction */
+static void page_zip_compress_write_log(buf_block_t *block,
+ dict_index_t *index, mtr_t *mtr)
+{
+ ut_ad(!index->is_ibuf());
+
+ if (mtr->get_log_mode() != MTR_LOG_ALL)
+ {
+ ut_ad(mtr->get_log_mode() == MTR_LOG_NONE ||
+ mtr->get_log_mode() == MTR_LOG_NO_REDO);
+ return;
+ }
+
+ const page_t *page= block->frame;
+ const page_zip_des_t *page_zip= &block->page.zip;
+ /* Read the number of user records. */
+ ulint trailer_size= ulint(page_dir_get_n_heap(page_zip->data)) -
+ PAGE_HEAP_NO_USER_LOW;
+	/* Multiply by the uncompressed size stored per record */
+ if (!page_is_leaf(page))
+ trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+ else if (index->is_clust())
+ trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + DATA_TRX_ID_LEN +
+ DATA_ROLL_PTR_LEN;
+ else
+ trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE;
+ /* Add the space occupied by BLOB pointers. */
+ trailer_size+= page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+ ut_a(page_zip->m_end > PAGE_DATA);
+ compile_time_assert(FIL_PAGE_DATA <= PAGE_DATA);
+ ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip));
+
+ mtr->init(block);
+ mtr->zmemcpy(*block, FIL_PAGE_PREV, page_zip->m_end - FIL_PAGE_PREV);
+
+ if (trailer_size)
+ mtr->zmemcpy(*block, page_zip_get_size(page_zip) - trailer_size,
+ trailer_size);
+ block->page.status = buf_page_t::INIT_ON_FLUSH; /* because of mtr_t::init() */
+}
+
+/******************************************************//**
+Determine how many externally stored columns are contained
+in existing records with smaller heap_no than rec. */
+static
+ulint
+page_zip_get_n_prev_extern(
+/*=======================*/
+ const page_zip_des_t* page_zip,/*!< in: dense page directory on
+ compressed page */
+ const rec_t* rec, /*!< in: compact physical record
+ on a B-tree leaf page */
+ const dict_index_t* index) /*!< in: record descriptor */
+{
+ const page_t* page = page_align(rec);
+ ulint n_ext = 0;
+ ulint i;
+ ulint left;
+ ulint heap_no;
+ ulint n_recs = page_get_n_recs(page_zip->data);
+
+ ut_ad(page_is_leaf(page));
+ ut_ad(page_is_comp(page));
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!dict_index_is_ibuf(index));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ left = heap_no - PAGE_HEAP_NO_USER_LOW;
+ if (UNIV_UNLIKELY(!left)) {
+ return(0);
+ }
+
+ for (i = 0; i < n_recs; i++) {
+ const rec_t* r = page + (page_zip_dir_get(page_zip, i)
+ & PAGE_ZIP_DIR_SLOT_MASK);
+
+ if (rec_get_heap_no_new(r) < heap_no) {
+ n_ext += rec_get_n_extern_new(r, index,
+ ULINT_UNDEFINED);
+ if (!--left) {
+ break;
+ }
+ }
+ }
+
+ return(n_ext);
+}
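+
+/* For instance, for a record with heap_no == 4 the loop above sums
+rec_get_n_extern_new() over the records with heap_no 2 and 3 (those
+allocated from the heap before rec), stopping early once both have
+been seen. */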
+
+/**********************************************************************//**
+Encode the length of a fixed-length column.
+@return buf + length of encoded val */
+static
+byte*
+page_zip_fixed_field_encode(
+/*========================*/
+ byte* buf, /*!< in: pointer to buffer where to write */
+ ulint val) /*!< in: value to write */
+{
+ ut_ad(val >= 2);
+
+ if (UNIV_LIKELY(val < 126)) {
+ /*
+ 0 = nullable variable field of at most 255 bytes length;
+ 1 = not null variable field of at most 255 bytes length;
+ 126 = nullable variable field with maximum length >255;
+ 127 = not null variable field with maximum length >255
+ */
+ *buf++ = (byte) val;
+ } else {
+ *buf++ = (byte) (0x80 | val >> 8);
+ *buf++ = (byte) val;
+ }
+
+ return(buf);
+}
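+
+/* Worked example of the encoding above: a NOT NULL fixed-length run of
+10 bytes arrives as val = 10 << 1 | 1 = 0x15 < 126 and is stored as the
+single byte 0x15; a run of 70 bytes gives val = 141, stored as the two
+bytes 0x80 (0x80 | 141 >> 8) and 0x8d (141 & 0xff). */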
+
+/**********************************************************************//**
+Write the index information for the compressed page.
+@return used size of buf */
+ulint
+page_zip_fields_encode(
+/*===================*/
+ ulint n, /*!< in: number of fields
+ to compress */
+ const dict_index_t* index, /*!< in: index comprising
+ at least n fields */
+ ulint trx_id_pos,
+ /*!< in: position of the trx_id column
+ in the index, or ULINT_UNDEFINED if
+ this is a non-leaf page */
+ byte* buf) /*!< out: buffer of (n + 1) * 2 bytes */
+{
+ const byte* buf_start = buf;
+ ulint i;
+ ulint col;
+ ulint trx_id_col = 0;
+ /* sum of lengths of preceding non-nullable fixed fields, or 0 */
+ ulint fixed_sum = 0;
+
+ ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n);
+
+ for (i = col = 0; i < n; i++) {
+ dict_field_t* field = dict_index_get_nth_field(index, i);
+ ulint val;
+
+ if (dict_field_get_col(field)->prtype & DATA_NOT_NULL) {
+ val = 1; /* set the "not nullable" flag */
+ } else {
+ val = 0; /* nullable field */
+ }
+
+ if (!field->fixed_len) {
+ /* variable-length field */
+ const dict_col_t* column
+ = dict_field_get_col(field);
+
+ if (DATA_BIG_COL(column)) {
+ val |= 0x7e; /* max > 255 bytes */
+ }
+
+ if (fixed_sum) {
+ /* write out the length of any
+ preceding non-nullable fields */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ *buf++ = (byte) val;
+ col++;
+ } else if (val) {
+ /* fixed-length non-nullable field */
+
+ if (fixed_sum && UNIV_UNLIKELY
+ (fixed_sum + field->fixed_len
+ > DICT_MAX_FIXED_COL_LEN)) {
+ /* Write out the length of the
+ preceding non-nullable fields,
+ to avoid exceeding the maximum
+ length of a fixed-length column. */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ if (i && UNIV_UNLIKELY(i == trx_id_pos)) {
+ if (fixed_sum) {
+ /* Write out the length of any
+ preceding non-nullable fields,
+ and start a new trx_id column. */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ col++;
+ }
+
+ trx_id_col = col;
+ fixed_sum = field->fixed_len;
+ } else {
+ /* add to the sum */
+ fixed_sum += field->fixed_len;
+ }
+ } else {
+ /* fixed-length nullable field */
+
+ if (fixed_sum) {
+ /* write out the length of any
+ preceding non-nullable fields */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ buf = page_zip_fixed_field_encode(
+ buf, ulint(field->fixed_len) << 1);
+ col++;
+ }
+ }
+
+ if (fixed_sum) {
+ /* Write out the lengths of last fixed-length columns. */
+ buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1);
+ }
+
+ if (trx_id_pos != ULINT_UNDEFINED) {
+ /* Write out the position of the trx_id column */
+ i = trx_id_col;
+ } else {
+ /* Write out the number of nullable fields */
+ i = index->n_nullable;
+ }
+
+ if (i < 128) {
+ *buf++ = (byte) i;
+ } else {
+ *buf++ = (byte) (0x80 | i >> 8);
+ *buf++ = (byte) i;
+ }
+
+ ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2);
+ return((ulint) (buf - buf_start));
+}
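+
+/* A worked example, assuming a hypothetical clustered index
+(id INT NOT NULL, DB_TRX_ID, DB_ROLL_PTR, val VARCHAR(100) NULL) with
+trx_id_pos == 1 and the usual 6-byte DB_TRX_ID and 7-byte DB_ROLL_PTR:
+the 4-byte NOT NULL id is flushed as 4 << 1 | 1 = 0x09 when DB_TRX_ID
+is reached, DB_TRX_ID and DB_ROLL_PTR accumulate fixed_sum = 6 + 7 = 13
+and are flushed as 13 << 1 | 1 = 0x1b when the variable-length val
+emits 0x00, and the trailing trx_id column number (1) is written as
+0x01, yielding the 4-byte buffer 0x09 0x1b 0x00 0x01. */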
+
+/**********************************************************************//**
+Populate the dense page directory from the sparse directory. */
+static
+void
+page_zip_dir_encode(
+/*================*/
+ const page_t* page, /*!< in: compact page */
+ byte* buf, /*!< in: pointer to dense page directory[-1];
+ out: dense directory on compressed page */
+	const rec_t**	recs)	/*!< in: pointer to a zero-filled array, or NULL;
+ out: dense page directory sorted by ascending
+ address (and heap_no) */
+{
+ const byte* rec;
+ ulint status;
+ ulint min_mark;
+ ulint heap_no;
+ ulint i;
+ ulint n_heap;
+ ulint offs;
+
+ min_mark = 0;
+
+ if (page_is_leaf(page)) {
+ status = REC_STATUS_ORDINARY;
+ } else {
+ status = REC_STATUS_NODE_PTR;
+ if (UNIV_UNLIKELY(!page_has_prev(page))) {
+ min_mark = REC_INFO_MIN_REC_FLAG;
+ }
+ }
+
+ n_heap = page_dir_get_n_heap(page);
+
+ /* Traverse the list of stored records in the collation order,
+ starting from the first user record. */
+
+ rec = page + PAGE_NEW_INFIMUM;
+
+ i = 0;
+
+ for (;;) {
+ ulint info_bits;
+ offs = rec_get_next_offs(rec, TRUE);
+ if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) {
+ break;
+ }
+ rec = page + offs;
+ heap_no = rec_get_heap_no_new(rec);
+ ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ ut_a(heap_no < n_heap);
+ ut_a(offs < srv_page_size - PAGE_DIR);
+ ut_a(offs >= PAGE_ZIP_START);
+ compile_time_assert(!(PAGE_ZIP_DIR_SLOT_MASK
+ & (PAGE_ZIP_DIR_SLOT_MASK + 1)));
+ compile_time_assert(PAGE_ZIP_DIR_SLOT_MASK
+ >= UNIV_ZIP_SIZE_MAX - 1);
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) != 0)) {
+ offs |= PAGE_ZIP_DIR_SLOT_OWNED;
+ }
+
+ info_bits = rec_get_info_bits(rec, TRUE);
+ if (info_bits & REC_INFO_DELETED_FLAG) {
+ info_bits &= ~REC_INFO_DELETED_FLAG;
+ offs |= PAGE_ZIP_DIR_SLOT_DEL;
+ }
+ ut_a(info_bits == min_mark);
+ /* Only the smallest user record can have
+ REC_INFO_MIN_REC_FLAG set. */
+ min_mark = 0;
+
+ mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
+
+ if (UNIV_LIKELY_NULL(recs)) {
+ /* Ensure that each heap_no occurs at most once. */
+ ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
+ /* exclude infimum and supremum */
+ recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
+ }
+
+ ut_a(ulint(rec_get_status(rec)) == status);
+ }
+
+ offs = page_header_get_field(page, PAGE_FREE);
+
+ /* Traverse the free list (of deleted records). */
+ while (offs) {
+ ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK));
+ rec = page + offs;
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ ut_a(heap_no < n_heap);
+
+ ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */
+ ut_a(ulint(rec_get_status(rec)) == status);
+
+ mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
+
+ if (UNIV_LIKELY_NULL(recs)) {
+ /* Ensure that each heap_no occurs at most once. */
+ ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
+ /* exclude infimum and supremum */
+ recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
+ }
+
+ offs = rec_get_next_offs(rec, TRUE);
+ }
+
+ /* Ensure that each heap no occurs at least once. */
+ ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap);
+}
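+
+/* Each dense directory slot written above is a 2-byte value combining
+the record offset with flag bits. For example, assuming
+PAGE_ZIP_DIR_SLOT_OWNED == 0x4000 and PAGE_ZIP_DIR_SLOT_DEL == 0x8000,
+a slot-owning live record at page offset 0x0123 is encoded as 0x4123,
+and a delete-marked record at offset 0x0456 as 0x8456. */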
+
+extern "C" {
+
+/**********************************************************************//**
+Allocate memory for zlib. */
+static
+void*
+page_zip_zalloc(
+/*============*/
+ void* opaque, /*!< in/out: memory heap */
+ uInt items, /*!< in: number of items to allocate */
+ uInt size) /*!< in: size of an item in bytes */
+{
+ return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size));
+}
+
+/**********************************************************************//**
+Deallocate memory for zlib. */
+static
+void
+page_zip_free(
+/*==========*/
+ void* opaque MY_ATTRIBUTE((unused)), /*!< in: memory heap */
+ void* address MY_ATTRIBUTE((unused)))/*!< in: object to free */
+{
+}
+
+} /* extern "C" */
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+void
+page_zip_set_alloc(
+/*===============*/
+ void* stream, /*!< in/out: zlib stream */
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ z_stream* strm = static_cast<z_stream*>(stream);
+
+ strm->zalloc = page_zip_zalloc;
+ strm->zfree = page_zip_free;
+ strm->opaque = heap;
+}
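+
+/* Typical usage, as in page_zip_compress() below (a minimal sketch):
+
+	z_stream c_stream;
+	page_zip_set_alloc(&c_stream, heap);
+	err = deflateInit2(&c_stream, static_cast<int>(level), Z_DEFLATED,
+			   static_cast<int>(srv_page_size_shift),
+			   MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+
+All zlib allocations are then served from the mem_heap and released by
+a single mem_heap_free() call; that is why page_zip_free() above is
+intentionally a no-op. */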
+
+#if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/** Symbol for enabling compression and decompression diagnostics */
+# define PAGE_ZIP_COMPRESS_DBG
+#endif
+
+#ifdef PAGE_ZIP_COMPRESS_DBG
+/** Set this variable in a debugger to enable
+excessive logging in page_zip_compress(). */
+static bool page_zip_compress_dbg;
+/** Set this variable in a debugger to enable
+binary logging of the data passed to deflate().
+When this variable is nonzero, it will act
+as a log file name generator. */
+static unsigned page_zip_compress_log;
+
+/**********************************************************************//**
+Wrapper for deflate(). Log the operation if page_zip_compress_dbg is set.
+@return deflate() status: Z_OK, Z_BUF_ERROR, ... */
+static
+int
+page_zip_compress_deflate(
+/*======================*/
+ FILE* logfile,/*!< in: log file, or NULL */
+ z_streamp strm, /*!< in/out: compressed stream for deflate() */
+ int flush) /*!< in: deflate() flushing method */
+{
+ int status;
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ ut_print_buf(stderr, strm->next_in, strm->avail_in);
+ }
+ if (UNIV_LIKELY_NULL(logfile)) {
+ if (fwrite(strm->next_in, 1, strm->avail_in, logfile)
+ != strm->avail_in) {
+ perror("fwrite");
+ }
+ }
+ status = deflate(strm, flush);
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ fprintf(stderr, " -> %d\n", status);
+ }
+ return(status);
+}
+
+/* Redefine deflate(). */
+# undef deflate
+/** Debug wrapper for the zlib compression routine deflate().
+Log the operation if page_zip_compress_dbg is set.
+@param strm in/out: compressed stream
+@param flush in: flushing method
+@return deflate() status: Z_OK, Z_BUF_ERROR, ... */
+# define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush)
+/** Declaration of the logfile parameter */
+# define FILE_LOGFILE FILE* logfile,
+/** The logfile parameter */
+# define LOGFILE logfile,
+#else /* PAGE_ZIP_COMPRESS_DBG */
+/** Empty declaration of the logfile parameter */
+# define FILE_LOGFILE
+/** Missing logfile parameter */
+# define LOGFILE
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+
+/**********************************************************************//**
+Compress the records of a node pointer page.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_node_ptrs(
+/*========================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ byte* storage, /*!< in: end of dense page directory */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err = Z_OK;
+ rec_offs* offsets = NULL;
+
+ do {
+ const rec_t* rec = *recs++;
+
+ offsets = rec_get_offsets(rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ /* Only leaf nodes may contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress the extra bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+
+ /* Compress the data bytes, except node_ptr. */
+ c_stream->next_in = (byte*) rec;
+ c_stream->avail_in = static_cast<uInt>(
+ rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+
+ memcpy(storage - REC_NODE_PTR_SIZE
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in, REC_NODE_PTR_SIZE);
+ c_stream->next_in += REC_NODE_PTR_SIZE;
+ } while (--n_dense);
+
+ return(err);
+}
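+
+/* The node pointers excluded from the deflate stream above are kept
+uncompressed: the trailing REC_NODE_PTR_SIZE bytes of the record with
+heap number h are copied to storage - REC_NODE_PTR_SIZE * (h - 1), so
+this area grows downwards from the end of the dense directory in
+heap_no order. */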
+
+/**********************************************************************//**
+Compress the records of a leaf node of a secondary index.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_sec(
+/*==================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense) /*!< in: size of recs[] */
+{
+ int err = Z_OK;
+
+ ut_ad(n_dense > 0);
+
+ do {
+ const rec_t* rec = *recs++;
+
+ /* Compress everything up to this record. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES
+ - c_stream->next_in);
+
+ if (UNIV_LIKELY(c_stream->avail_in != 0)) {
+ MEM_CHECK_DEFINED(c_stream->next_in,
+ c_stream->avail_in);
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
+
+ /* Skip the REC_N_NEW_EXTRA_BYTES. */
+
+ c_stream->next_in = (byte*) rec;
+ } while (--n_dense);
+
+ return(err);
+}
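+
+/* On secondary index leaf pages, everything except the
+REC_N_NEW_EXTRA_BYTES header preceding each record is deflated; those
+fixed headers are reconstructed from the dense directory during
+decompression, in page_zip_set_extra_bytes(). */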
+
+/**********************************************************************//**
+Compress a record of a leaf node of a clustered index that contains
+externally stored columns.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_clust_ext(
+/*========================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t* rec, /*!< in: record */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
+ byte* deleted, /*!< in: dense directory entry pointing
+ to the head of the free list */
+ byte* storage, /*!< in: end of dense page directory */
+ byte** externs, /*!< in/out: pointer to the next
+ available BLOB pointer */
+ ulint* n_blobs) /*!< in/out: number of
+ externally stored columns */
+{
+ int err;
+ ulint i;
+
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ulint len;
+ const byte* src;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ /* Store trx_id and roll_ptr
+ in uncompressed form. */
+ src = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets,
+ i + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Compress any preceding bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ src - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ memcpy(storage
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ c_stream->next_in
+ += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ /* Skip also roll_ptr */
+ i++;
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ src = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ src += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ c_stream->avail_in = static_cast<uInt>(
+ src - c_stream->next_in);
+ if (UNIV_LIKELY(c_stream->avail_in != 0)) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ /* Reserve space for the data at
+ the end of the space reserved for
+ the compressed data and the page
+ modification log. */
+
+ if (UNIV_UNLIKELY
+ (c_stream->avail_out
+ <= BTR_EXTERN_FIELD_REF_SIZE)) {
+ /* out of space */
+ return(Z_BUF_ERROR);
+ }
+
+ ut_ad(*externs == c_stream->next_out
+ + c_stream->avail_out
+ + 1/* end of modif. log */);
+
+ c_stream->next_in
+ += BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Skip deleted records. */
+ if (UNIV_LIKELY_NULL
+ (page_zip_dir_find_low(
+ storage, deleted,
+ page_offset(rec)))) {
+ continue;
+ }
+
+ (*n_blobs)++;
+ c_stream->avail_out
+ -= BTR_EXTERN_FIELD_REF_SIZE;
+ *externs -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Copy the BLOB pointer */
+ memcpy(*externs, c_stream->next_in
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+
+ return(Z_OK);
+}
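+
+/* For example, a record with two externally stored columns that is not
+on the free list causes the loop above to copy the trailing
+BTR_EXTERN_FIELD_REF_SIZE bytes of each such column to *externs, which
+moves downwards by that amount per BLOB pointer, and to increment
+*n_blobs by 2; the field references of records on the free list are
+skipped and not counted. */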
+
+/**********************************************************************//**
+Compress the records of a leaf node of a clustered index.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_clust(
+/*====================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint* n_blobs, /*!< in: 0; out: number of
+ externally stored columns */
+ ulint trx_id_col, /*!< index of the trx_id column */
+ byte* deleted, /*!< in: dense directory entry pointing
+ to the head of the free list */
+ byte* storage, /*!< in: end of dense page directory */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err = Z_OK;
+ rec_offs* offsets = NULL;
+ /* BTR_EXTERN_FIELD_REF storage */
+ byte* externs = storage - n_dense
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ ut_ad(*n_blobs == 0);
+
+ do {
+ const rec_t* rec = *recs++;
+
+ offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+ ut_ad(rec_offs_n_fields(offsets)
+ == dict_index_get_n_fields(index));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress the extra bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES
+ - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
+
+ /* Compress the data bytes. */
+
+ c_stream->next_in = (byte*) rec;
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, store the
+ BTR_EXTERN_FIELD_REF separately. */
+ if (rec_offs_any_extern(offsets)) {
+ ut_ad(dict_index_is_clust(index));
+
+ err = page_zip_compress_clust_ext(
+ LOGFILE
+ c_stream, rec, offsets, trx_id_col,
+ deleted, storage, &externs, n_blobs);
+
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ } else {
+ ulint len;
+ const byte* src;
+
+ /* Store trx_id and roll_ptr in uncompressed form. */
+ src = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets,
+ trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress any preceding bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ src - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ memcpy(storage
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ c_stream->next_in
+ += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ /* Skip also roll_ptr */
+ ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets));
+ }
+
+ /* Compress the last bytes of the record. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec + rec_offs_data_size(offsets) - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+ } while (--n_dense);
+
+func_exit:
+	return(err);
+}
+
+/** Attempt to compress a ROW_FORMAT=COMPRESSED page.
+@retval true on success
+@retval false on failure; block->page.zip will be left intact. */
+bool
+page_zip_compress(
+ buf_block_t* block, /*!< in/out: buffer block */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+	ulint		level,	/*!< in: compression level */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ z_stream c_stream;
+ int err;
+ byte* fields; /*!< index field information */
+ byte* buf; /*!< compressed payload of the
+ page */
+ byte* buf_end; /* end of buf */
+ ulint n_dense;
+	ulint		slot_size;	/* number of uncompressed bytes
+					per record */
+ const rec_t** recs; /*!< dense page directory,
+ sorted by address */
+ mem_heap_t* heap;
+ ulint trx_id_col = ULINT_UNDEFINED;
+ ulint n_blobs = 0;
+ byte* storage; /* storage of uncompressed
+ columns */
+ const ulonglong ns = my_interval_timer();
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ FILE* logfile = NULL;
+#endif
+ /* A local copy of srv_cmp_per_index_enabled to avoid reading that
+ variable multiple times in this function since it can be changed at
+	any time. */
+	const my_bool	cmp_per_index_enabled = srv_cmp_per_index_enabled;
+
+ page_t* page = block->frame;
+ page_zip_des_t* page_zip = &block->page.zip;
+
+ ut_a(page_is_comp(page));
+ ut_a(fil_page_index_page_check(page));
+ ut_ad(page_simple_validate_new((page_t*) page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(!dict_index_is_ibuf(index));
+
+ MEM_CHECK_DEFINED(page, srv_page_size);
+
+ /* Check the data that will be omitted. */
+ ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
+ infimum_extra, sizeof infimum_extra));
+ ut_a(!memcmp(page + PAGE_NEW_INFIMUM,
+ infimum_data, sizeof infimum_data));
+ ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES]
+ /* info_bits == 0, n_owned <= max */
+ <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
+ supremum_extra_data, sizeof supremum_extra_data));
+
+ if (page_is_empty(page)) {
+ ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE)
+ == PAGE_NEW_SUPREMUM);
+ }
+
+ const ulint n_fields = page_is_leaf(page)
+ ? dict_index_get_n_fields(index)
+ : dict_index_get_n_unique_in_tree_nonleaf(index);
+ index_id_t ind_id = index->id;
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n_dense = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW;
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ ib::info() << "compress "
+ << static_cast<void*>(page_zip) << " "
+ << static_cast<const void*>(page) << " "
+ << page_is_leaf(page) << " "
+ << n_fields << " " << n_dense;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_compress_log)) {
+ /* Create a log file for every compression attempt. */
+ char logfilename[9];
+ snprintf(logfilename, sizeof logfilename,
+ "%08x", page_zip_compress_log++);
+ logfile = fopen(logfilename, "wb");
+
+ if (logfile) {
+ /* Write the uncompressed page to the log. */
+ if (fwrite(page, 1, srv_page_size, logfile)
+ != srv_page_size) {
+ perror("fwrite");
+ }
+ /* Record the compressed size as zero.
+ This will be overwritten at successful exit. */
+ putc(0, logfile);
+ putc(0, logfile);
+ putc(0, logfile);
+ putc(0, logfile);
+ }
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+ page_zip_stat[page_zip->ssize - 1].compressed++;
+ if (cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[ind_id].compressed++;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+
+ if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
+ >= page_zip_get_size(page_zip))) {
+
+ goto err_exit;
+ }
+
+ MONITOR_INC(MONITOR_PAGE_COMPRESS);
+
+ heap = mem_heap_create(page_zip_get_size(page_zip)
+ + n_fields * (2 + sizeof(ulint))
+ + REC_OFFS_HEADER_SIZE
+ + n_dense * ((sizeof *recs)
+ - PAGE_ZIP_DIR_SLOT_SIZE)
+ + srv_page_size * 4
+ + (512 << MAX_MEM_LEVEL));
+
+ recs = static_cast<const rec_t**>(
+ mem_heap_zalloc(heap, n_dense * sizeof *recs));
+
+ fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2));
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA));
+
+ buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA;
+
+ /* Compress the data payload. */
+ page_zip_set_alloc(&c_stream, heap);
+
+ err = deflateInit2(&c_stream, static_cast<int>(level),
+ Z_DEFLATED, static_cast<int>(srv_page_size_shift),
+ MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ ut_a(err == Z_OK);
+
+ c_stream.next_out = buf;
+
+ /* Subtract the space reserved for uncompressed data. */
+ /* Page header and the end marker of the modification log */
+ c_stream.avail_out = static_cast<uInt>(buf_end - buf - 1);
+
+ /* Dense page directory and uncompressed columns, if any */
+ if (page_is_leaf(page)) {
+ if (dict_index_is_clust(index)) {
+ trx_id_col = index->db_trx_id();
+
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ } else {
+ /* Signal the absence of trx_id
+ in page_zip_fields_encode() */
+ trx_id_col = 0;
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+ } else {
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+ trx_id_col = ULINT_UNDEFINED;
+ }
+
+ if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size
+ + 6/* sizeof(zlib header and footer) */)) {
+ goto zlib_error;
+ }
+
+ c_stream.avail_out -= uInt(n_dense * slot_size);
+ c_stream.avail_in = uInt(page_zip_fields_encode(n_fields, index,
+ trx_id_col, fields));
+ c_stream.next_in = fields;
+
+ if (UNIV_LIKELY(!trx_id_col)) {
+ trx_id_col = ULINT_UNDEFINED;
+ }
+
+ MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in);
+ err = deflate(&c_stream, Z_FULL_FLUSH);
+ if (err != Z_OK) {
+ goto zlib_error;
+ }
+
+ ut_ad(!c_stream.avail_in);
+
+ page_zip_dir_encode(page, buf_end, recs);
+
+ c_stream.next_in = (byte*) page + PAGE_ZIP_START;
+
+ storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+
+ /* Compress the records in heap_no order. */
+ if (UNIV_UNLIKELY(!n_dense)) {
+ } else if (!page_is_leaf(page)) {
+ /* This is a node pointer page. */
+ err = page_zip_compress_node_ptrs(LOGFILE
+ &c_stream, recs, n_dense,
+ index, storage, heap);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ /* This is a leaf page in a secondary index. */
+ err = page_zip_compress_sec(LOGFILE
+ &c_stream, recs, n_dense);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ } else {
+ /* This is a leaf page in a clustered index. */
+ err = page_zip_compress_clust(LOGFILE
+ &c_stream, recs, n_dense,
+ index, &n_blobs, trx_id_col,
+ buf_end - PAGE_ZIP_DIR_SLOT_SIZE
+ * page_get_n_recs(page),
+ storage, heap);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ }
+
+ /* Finish the compression. */
+ ut_ad(!c_stream.avail_in);
+ /* Compress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list,
+ or the data of the last record from page_zip_compress_sec(). */
+ c_stream.avail_in = static_cast<uInt>(
+ page_header_get_field(page, PAGE_HEAP_TOP)
+ - (c_stream.next_in - page));
+ ut_a(c_stream.avail_in <= srv_page_size - PAGE_ZIP_START - PAGE_DIR);
+
+ MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in);
+ err = deflate(&c_stream, Z_FINISH);
+
+ if (UNIV_UNLIKELY(err != Z_STREAM_END)) {
+zlib_error:
+ deflateEnd(&c_stream);
+ mem_heap_free(heap);
+err_exit:
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (logfile) {
+ fclose(logfile);
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+ if (page_is_leaf(page)) {
+ dict_index_zip_failure(index);
+ }
+
+ const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
+ page_zip_stat[page_zip->ssize - 1].compressed_usec
+ += time_diff;
+ if (cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[ind_id].compressed_usec
+ += time_diff;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+ return false;
+ }
+
+ err = deflateEnd(&c_stream);
+ ut_a(err == Z_OK);
+
+ ut_ad(buf + c_stream.total_out == c_stream.next_out);
+ ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out);
+
+#if defined HAVE_valgrind && !__has_feature(memory_sanitizer)
+ /* Valgrind believes that zlib does not initialize some bits
+ in the last 7 or 8 bytes of the stream. Make Valgrind happy. */
+ MEM_MAKE_DEFINED(buf, c_stream.total_out);
+#endif /* HAVE_valgrind && !memory_sanitizer */
+
+ /* Zero out the area reserved for the modification log.
+ Space for the end marker of the modification log is not
+ included in avail_out. */
+ memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */);
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start =
+#endif /* UNIV_DEBUG */
+ page_zip->m_end = uint16_t(PAGE_DATA + c_stream.total_out);
+ page_zip->m_nonempty = FALSE;
+ page_zip->n_blobs = unsigned(n_blobs) & ((1U << 12) - 1);
+ /* Copy those header fields that will not be written
+ in buf_flush_init_for_writing() */
+ memcpy_aligned<8>(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV);
+ memcpy_aligned<2>(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
+ 2);
+ memcpy_aligned<2>(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_DATA - FIL_PAGE_DATA);
+ /* Copy the rest of the compressed page */
+ memcpy_aligned<2>(page_zip->data + PAGE_DATA, buf,
+ page_zip_get_size(page_zip) - PAGE_DATA);
+ mem_heap_free(heap);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ page_zip_compress_write_log(block, index, mtr);
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (logfile) {
+ /* Record the compressed size of the block. */
+ byte sz[4];
+ mach_write_to_4(sz, c_stream.total_out);
+ fseek(logfile, srv_page_size, SEEK_SET);
+ if (fwrite(sz, 1, sizeof sz, logfile) != sizeof sz) {
+ perror("fwrite");
+ }
+ fclose(logfile);
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+ const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
+ page_zip_stat[page_zip->ssize - 1].compressed_ok++;
+ page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff;
+ if (cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[ind_id].compressed_ok++;
+ page_zip_stat_per_index[ind_id].compressed_usec += time_diff;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+
+ if (page_is_leaf(page)) {
+ dict_index_zip_success(index);
+ }
+
+ return true;
+}
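+
+/* On success, page_zip->data is laid out as follows: the PAGE_DATA
+header bytes copied from the uncompressed page, the deflated stream of
+c_stream.total_out bytes ending at m_end, a zero-filled modification
+log, and a trailer growing from the end of the block that holds the
+dense directory plus the uncompressed columns (node pointers, or
+DB_TRX_ID and DB_ROLL_PTR, or BLOB pointers). */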
+
+/**********************************************************************//**
+Deallocate the index information initialized by page_zip_fields_decode(). */
+static
+void
+page_zip_fields_free(
+/*=================*/
+ dict_index_t* index) /*!< in: dummy index to be freed */
+{
+ if (index) {
+ dict_table_t* table = index->table;
+ index->zip_pad.mutex.~mutex();
+ mem_heap_free(index->heap);
+
+ dict_mem_table_free(table);
+ }
+}
+
+/**********************************************************************//**
+Read the index information for the compressed page.
+@return own: dummy index describing the page, or NULL on error */
+static
+dict_index_t*
+page_zip_fields_decode(
+/*===================*/
+ const byte* buf, /*!< in: index information */
+ const byte* end, /*!< in: end of buf */
+ ulint* trx_id_col,/*!< in: NULL for non-leaf pages;
+ for leaf pages, pointer to where to store
+ the position of the trx_id column */
+	bool		is_spatial)/*!< in: whether the index is spatial */
+{
+ const byte* b;
+ ulint n;
+ ulint i;
+ ulint val;
+ dict_table_t* table;
+ dict_index_t* index;
+
+ /* Determine the number of fields. */
+ for (b = buf, n = 0; b < end; n++) {
+ if (*b++ & 0x80) {
+ b++; /* skip the second byte */
+ }
+ }
+
+ n--; /* n_nullable or trx_id */
+
+ if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) {
+
+ page_zip_fail(("page_zip_fields_decode: n = %lu\n",
+ (ulong) n));
+ return(NULL);
+ }
+
+ if (UNIV_UNLIKELY(b > end)) {
+
+ page_zip_fail(("page_zip_fields_decode: %p > %p\n",
+ (const void*) b, (const void*) end));
+ return(NULL);
+ }
+
+ table = dict_mem_table_create("ZIP_DUMMY", NULL, n, 0,
+ DICT_TF_COMPACT, 0);
+ index = dict_mem_index_create(table, "ZIP_DUMMY", 0, n);
+ index->n_uniq = static_cast<unsigned>(n) & dict_index_t::MAX_N_FIELDS;
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ index->cached = TRUE;
+
+ /* Initialize the fields. */
+ for (b = buf, i = 0; i < n; i++) {
+ ulint mtype;
+ ulint len;
+
+ val = *b++;
+
+ if (UNIV_UNLIKELY(val & 0x80)) {
+ /* fixed length > 62 bytes */
+ val = (val & 0x7f) << 8 | *b++;
+ len = val >> 1;
+ mtype = DATA_FIXBINARY;
+ } else if (UNIV_UNLIKELY(val >= 126)) {
+ /* variable length with max > 255 bytes */
+ len = 0x7fff;
+ mtype = DATA_BINARY;
+ } else if (val <= 1) {
+ /* variable length with max <= 255 bytes */
+ len = 0;
+ mtype = DATA_BINARY;
+ } else {
+			/* fixed length of at most 62 bytes */
+ len = val >> 1;
+ mtype = DATA_FIXBINARY;
+ }
+
+ dict_mem_table_add_col(table, NULL, NULL, mtype,
+ val & 1 ? DATA_NOT_NULL : 0, len);
+ dict_index_add_col(index, table,
+ dict_table_get_nth_col(table, i), 0);
+ }
+
+ val = *b++;
+ if (UNIV_UNLIKELY(val & 0x80)) {
+ val = (val & 0x7f) << 8 | *b++;
+ }
+
+ /* Decode the position of the trx_id column. */
+ if (trx_id_col) {
+ if (!val) {
+ val = ULINT_UNDEFINED;
+ } else if (UNIV_UNLIKELY(val >= n)) {
+fail:
+ page_zip_fields_free(index);
+ return NULL;
+ } else {
+ index->type = DICT_CLUSTERED;
+ }
+
+ *trx_id_col = val;
+ } else {
+ /* Decode the number of nullable fields. */
+ if (UNIV_UNLIKELY(index->n_nullable > val)) {
+ goto fail;
+ } else {
+ index->n_nullable = static_cast<unsigned>(val)
+ & dict_index_t::MAX_N_FIELDS;
+ }
+ }
+
+ /* ROW_FORMAT=COMPRESSED does not support instant ADD COLUMN */
+ index->n_core_fields = index->n_fields;
+ index->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
+
+ ut_ad(b == end);
+
+ if (is_spatial) {
+ index->type |= DICT_SPATIAL;
+ }
+
+ return(index);
+}
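+
+/* Decoding the example buffer 0x09 0x1b 0x00 0x01 from the
+page_zip_fields_encode() note above: the first pass counts
+n = 4 - 1 = 3 columns; 0x09 becomes a 4-byte NOT NULL DATA_FIXBINARY
+column, 0x1b a 13-byte NOT NULL DATA_FIXBINARY column (the merged
+DB_TRX_ID and DB_ROLL_PTR), and 0x00 a nullable DATA_BINARY column
+with a maximum length of at most 255 bytes; the trailing 0x01 sets
+*trx_id_col = 1 and marks the dummy index DICT_CLUSTERED. */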
+
+/**********************************************************************//**
+Populate the sparse page directory from the dense directory.
+@return TRUE on success, FALSE on failure */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+ibool
+page_zip_dir_decode(
+/*================*/
+ const page_zip_des_t* page_zip,/*!< in: dense page directory on
+ compressed page */
+ page_t* page, /*!< in: compact page with valid header;
+ out: trailer and sparse page directory
+ filled in */
+ rec_t** recs, /*!< out: dense page directory sorted by
+ ascending address (and heap_no) */
+ ulint n_dense)/*!< in: number of user records, and
+ size of recs[] */
+{
+ ulint i;
+ ulint n_recs;
+ byte* slot;
+
+ n_recs = page_get_n_recs(page);
+
+ if (UNIV_UNLIKELY(n_recs > n_dense)) {
+ page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n",
+ (ulong) n_recs, (ulong) n_dense));
+ return(FALSE);
+ }
+
+ /* Traverse the list of stored records in the sorting order,
+ starting from the first user record. */
+
+ slot = page + (srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE);
+ UNIV_PREFETCH_RW(slot);
+
+ /* Zero out the page trailer. */
+ memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR);
+
+ mach_write_to_2(slot, PAGE_NEW_INFIMUM);
+ slot -= PAGE_DIR_SLOT_SIZE;
+ UNIV_PREFETCH_RW(slot);
+
+ /* Initialize the sparse directory and copy the dense directory. */
+ for (i = 0; i < n_recs; i++) {
+ ulint offs = page_zip_dir_get(page_zip, i);
+
+ if (offs & PAGE_ZIP_DIR_SLOT_OWNED) {
+ mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK);
+ slot -= PAGE_DIR_SLOT_SIZE;
+ UNIV_PREFETCH_RW(slot);
+ }
+
+ if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK)
+ < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) {
+ page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n",
+ (unsigned) i, (unsigned) n_recs,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK);
+ }
+
+ mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
+ {
+ const page_dir_slot_t* last_slot = page_dir_get_nth_slot(
+ page, page_dir_get_n_slots(page) - 1U);
+
+ if (UNIV_UNLIKELY(slot != last_slot)) {
+ page_zip_fail(("page_zip_dir_decode 3: %p != %p\n",
+ (const void*) slot,
+ (const void*) last_slot));
+ return(FALSE);
+ }
+ }
+
+ /* Copy the rest of the dense directory. */
+ for (; i < n_dense; i++) {
+ ulint offs = page_zip_dir_get(page_zip, i);
+
+ if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+ page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n",
+ (unsigned) i, (unsigned) n_dense,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ recs[i] = page + offs;
+ }
+
+ std::sort(recs, recs + n_dense);
+ return(TRUE);
+}
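+
+/* Continuing the page_zip_dir_encode() example: a dense entry of
+0x4123 (owned, live) both sets recs[i] = page + 0x0123 and emits a
+sparse directory slot containing 0x0123, whereas 0x8456 (delete-marked,
+not owning) only sets recs[i]; the sparse directory is thus rebuilt
+with one slot per owning record, plus the fixed infimum and supremum
+slots. */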
+
+/**********************************************************************//**
+Initialize the REC_N_NEW_EXTRA_BYTES of each record.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_set_extra_bytes(
+/*=====================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ page_t* page, /*!< in/out: uncompressed page */
+ ulint info_bits)/*!< in: REC_INFO_MIN_REC_FLAG or 0 */
+{
+ ulint n;
+ ulint i;
+ ulint n_owned = 1;
+ ulint offs;
+ rec_t* rec;
+
+ n = page_get_n_recs(page);
+ rec = page + PAGE_NEW_INFIMUM;
+
+ for (i = 0; i < n; i++) {
+ offs = page_zip_dir_get(page_zip, i);
+
+ if (offs & PAGE_ZIP_DIR_SLOT_DEL) {
+ info_bits |= REC_INFO_DELETED_FLAG;
+ }
+ if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) {
+ info_bits |= n_owned;
+ n_owned = 1;
+ } else {
+ n_owned++;
+ }
+ offs &= PAGE_ZIP_DIR_SLOT_MASK;
+ if (UNIV_UNLIKELY(offs < PAGE_ZIP_START
+ + REC_N_NEW_EXTRA_BYTES)) {
+ page_zip_fail(("page_zip_set_extra_bytes 1:"
+ " %u %u %lx\n",
+ (unsigned) i, (unsigned) n,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ rec_set_next_offs_new(rec, offs);
+ rec = page + offs;
+ rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits;
+ info_bits = 0;
+ }
+
+ /* Set the next pointer of the last user record. */
+ rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM);
+
+ /* Set n_owned of the supremum record. */
+ page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned;
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW;
+
+ if (i >= n) {
+ if (UNIV_LIKELY(i == n)) {
+ return(TRUE);
+ }
+
+ page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n",
+ (unsigned) i, (unsigned) n));
+ return(FALSE);
+ }
+
+ offs = page_zip_dir_get(page_zip, i);
+
+ /* Set the extra bytes of deleted records on the free list. */
+ for (;;) {
+ if (UNIV_UNLIKELY(!offs)
+ || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+
+ page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n",
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ rec = page + offs;
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+
+ if (++i == n) {
+ break;
+ }
+
+ offs = page_zip_dir_get(page_zip, i);
+ rec_set_next_offs_new(rec, offs);
+ }
+
+ /* Terminate the free list. */
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+ rec_set_next_offs_new(rec, 0);
+
+ return(TRUE);
+}
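+
+/* Example of the n_owned reconstruction above: if only the fourth of
+four user records carries PAGE_ZIP_DIR_SLOT_OWNED, records 1-3 receive
+info_bits 0 while record 4 receives n_owned = 4 (itself plus the three
+preceding non-owning records), matching the sparse directory slot that
+was emitted for it in page_zip_dir_decode(). */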
+
+/**********************************************************************//**
+Apply the modification log to a record containing externally stored
+columns. Do not copy the fields that are stored separately.
+@return pointer to modification log, or NULL on failure */
+static
+const byte*
+page_zip_apply_log_ext(
+/*===================*/
+ rec_t* rec, /*!< in/out: record */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
+ const byte* data, /*!< in: modification log */
+ const byte* end) /*!< in: end of modification log */
+{
+ ulint i;
+ ulint len;
+ byte* next_out = rec;
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, skip the
+ BTR_EXTERN_FIELD_REF. */
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ byte* dst;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ /* Skip trx_id and roll_ptr */
+ dst = rec_get_nth_field(rec, offsets,
+ i, &len);
+ if (UNIV_UNLIKELY(dst - next_out >= end - data)
+ || UNIV_UNLIKELY
+ (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN))
+ || rec_offs_nth_extern(offsets, i)) {
+ page_zip_fail(("page_zip_apply_log_ext:"
+ " trx_id len %lu,"
+ " %p - %p >= %p - %p\n",
+ (ulong) len,
+ (const void*) dst,
+ (const void*) next_out,
+ (const void*) end,
+ (const void*) data));
+ return(NULL);
+ }
+
+ memcpy(next_out, data, ulint(dst - next_out));
+ data += ulint(dst - next_out);
+ next_out = dst + (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ dst = rec_get_nth_field(rec, offsets,
+ i, &len);
+ ut_ad(len
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ len += ulint(dst - next_out)
+ - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log_ext:"
+ " ext %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+
+ memcpy(next_out, data, len);
+ data += len;
+ next_out += len
+ + BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ }
+
+ /* Copy the last bytes of the record. */
+ len = ulint(rec_get_end(rec, offsets) - next_out);
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log_ext:"
+ " last %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(next_out, data, len);
+ data += len;
+
+ return(data);
+}
+
+/**********************************************************************//**
+Apply the modification log to an uncompressed page.
+Do not copy the fields that are stored separately.
+@return pointer to end of modification log, or NULL on failure */
+static
+const byte*
+page_zip_apply_log(
+/*===============*/
+ const byte* data, /*!< in: modification log */
+ ulint size, /*!< in: maximum length of the log, in bytes */
+ rec_t** recs, /*!< in: dense page directory,
+ sorted by address (indexed by
+ heap_no - PAGE_HEAP_NO_USER_LOW) */
+ ulint n_dense,/*!< in: size of recs[] */
+ ulint n_core, /*!< in: index->n_fields, or 0 for non-leaf */
+ ulint trx_id_col,/*!< in: column number of trx_id in the index,
+ or ULINT_UNDEFINED if none */
+ ulint heap_status,
+ /*!< in: heap_no and status bits for
+ the next record to uncompress */
+ dict_index_t* index, /*!< in: index of the page */
+ rec_offs* offsets)/*!< in/out: work area for
+ rec_get_offsets_reverse() */
+{
+ const byte* const end = data + size;
+
+ for (;;) {
+ ulint val;
+ rec_t* rec;
+ ulint len;
+ ulint hs;
+
+ val = *data++;
+ if (UNIV_UNLIKELY(!val)) {
+ return(data - 1);
+ }
+ if (val & 0x80) {
+ val = (val & 0x7f) << 8 | *data++;
+ if (UNIV_UNLIKELY(!val)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " invalid val %x%x\n",
+ data[-2], data[-1]));
+ return(NULL);
+ }
+ }
+ if (UNIV_UNLIKELY(data >= end)) {
+ page_zip_fail(("page_zip_apply_log: %p >= %p\n",
+ (const void*) data,
+ (const void*) end));
+ return(NULL);
+ }
+ if (UNIV_UNLIKELY((val >> 1) > n_dense)) {
+ page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n",
+ (ulong) val, (ulong) n_dense));
+ return(NULL);
+ }
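+
+		/* Here val encodes the target record:
+		val >> 1 == heap_no - 1 selects the dense directory
+		entry, and val & 1 means "clear the record". For
+		example, an in-place update of the record with
+		heap_no == 3 would begin with the single byte
+		4 = (3 - 1) << 1. */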
+
+ /* Determine the heap number and status bits of the record. */
+ rec = recs[(val >> 1) - 1];
+
+ hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT;
+ hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1);
+
+ /* This may either be an old record that is being
+ overwritten (updated in place, or allocated from
+ the free list), or a new record, with the next
+	available heap_no. */
+ if (UNIV_UNLIKELY(hs > heap_status)) {
+ page_zip_fail(("page_zip_apply_log: %lu > %lu\n",
+ (ulong) hs, (ulong) heap_status));
+ return(NULL);
+ } else if (hs == heap_status) {
+ /* A new record was allocated from the heap. */
+ if (UNIV_UNLIKELY(val & 1)) {
+ /* Only existing records may be cleared. */
+ page_zip_fail(("page_zip_apply_log:"
+ " attempting to create"
+ " deleted rec %lu\n",
+ (ulong) hs));
+ return(NULL);
+ }
+ heap_status += 1 << REC_HEAP_NO_SHIFT;
+ }
+
+ mach_write_to_2(rec - REC_NEW_HEAP_NO, hs);
+
+ if (val & 1) {
+ /* Clear the data bytes of the record. */
+ mem_heap_t* heap = NULL;
+ rec_offs* offs;
+ offs = rec_get_offsets(rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ memset(rec, 0, rec_offs_data_size(offs));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ continue;
+ }
+
+ compile_time_assert(REC_STATUS_NODE_PTR == TRUE);
+ rec_get_offsets_reverse(data, index,
+ hs & REC_STATUS_NODE_PTR,
+ offsets);
+ /* Silence a debug assertion in rec_offs_make_valid().
+ This will be overwritten in page_zip_set_extra_bytes(),
+ called by page_zip_decompress_low(). */
+ ut_d(rec[-REC_NEW_INFO_BITS] = 0);
+ rec_offs_make_valid(rec, index, n_core != 0, offsets);
+
+ /* Copy the extra bytes (backwards). */
+ {
+ byte* start = rec_get_start(rec, offsets);
+ byte* b = rec - REC_N_NEW_EXTRA_BYTES;
+ while (b != start) {
+ *--b = *data++;
+ }
+ }
+
+ /* Copy the data bytes. */
+ if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+ /* Non-leaf nodes should not contain any
+ externally stored columns. */
+ if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " %lu&REC_STATUS_NODE_PTR\n",
+ (ulong) hs));
+ return(NULL);
+ }
+
+ data = page_zip_apply_log_ext(
+ rec, offsets, trx_id_col, data, end);
+
+ if (UNIV_UNLIKELY(!data)) {
+ return(NULL);
+ }
+ } else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
+ len = rec_offs_data_size(offsets)
+ - REC_NODE_PTR_SIZE;
+ /* Copy the data bytes, except node_ptr. */
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " node_ptr %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(rec, data, len);
+ data += len;
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ len = rec_offs_data_size(offsets);
+
+ /* Copy all data bytes of
+ a record in a secondary index. */
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " sec %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+
+ memcpy(rec, data, len);
+ data += len;
+ } else {
+ /* Skip DB_TRX_ID and DB_ROLL_PTR. */
+ ulint l = rec_get_nth_field_offs(offsets,
+ trx_id_col, &len);
+ byte* b;
+
+ if (UNIV_UNLIKELY(data + l >= end)
+ || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN))) {
+ page_zip_fail(("page_zip_apply_log:"
+ " trx_id %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) l,
+ (const void*) end));
+ return(NULL);
+ }
+
+ /* Copy any preceding data bytes. */
+ memcpy(rec, data, l);
+ data += l;
+
+ /* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. */
+ b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ len = ulint(rec_get_end(rec, offsets) - b);
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " clust %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(b, data, len);
+ data += len;
+ }
+ }
+}
+
+/**********************************************************************//**
+Set the heap_no in a record, and skip the fixed-size record header
+that is not included in the d_stream.
+@return TRUE on success, FALSE if d_stream does not end at rec */
+static
+ibool
+page_zip_decompress_heap_no(
+/*========================*/
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t* rec, /*!< in/out: record */
+ ulint& heap_status) /*!< in/out: heap_no and status bits */
+{
+ if (d_stream->next_out != rec - REC_N_NEW_EXTRA_BYTES) {
+ /* n_dense has grown since the page was last compressed. */
+ return(FALSE);
+ }
+
+ /* Skip the REC_N_NEW_EXTRA_BYTES. */
+ d_stream->next_out = rec;
+
+ /* Set heap_no and the status bits. */
+ mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
+ heap_status += 1 << REC_HEAP_NO_SHIFT;
+ return(TRUE);
+}
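+
+/* heap_status packs the record status bits into the low
+REC_HEAP_NO_SHIFT bits and the next expected heap_no above them; the
+increment above therefore advances to the next heap number while
+preserving the status bits. */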
+
+/**********************************************************************//**
+Decompress the records of a node pointer page.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_node_ptrs(
+/*==========================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ rec_offs* offsets, /*!< in/out: temporary offsets */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ ulint heap_status = REC_STATUS_NODE_PTR
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ ulint slot;
+ const byte* storage;
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= static_cast<uInt>(
+ n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE));
+
+ /* Decompress the records in heap_no order. */
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ d_stream->avail_out = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
+
+ ut_ad(d_stream->avail_out < srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR);
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
+
+ /* Read the offsets. The status bits are needed here. */
+ offsets = rec_get_offsets(rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Decompress the data bytes, except node_ptr. */
+		d_stream->avail_out = static_cast<uInt>(
+ rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ /* Clear the node pointer in case the record
+ will be deleted and the space will be reallocated
+ to a smaller record. */
+ memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE);
+ d_stream->next_out += REC_NODE_PTR_SIZE;
+
+ ut_ad(d_stream->next_out == rec_get_end(rec, offsets));
+ }
+
+ /* Decompress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list. */
+ d_stream->avail_out = static_cast<uInt>(
+ page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out));
+ if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ ulint(page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page)
+ - 1U)
+ - d_stream->next_out));
+ }
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in);
+#endif /* UNIV_DEBUG */
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense, 0,
+ ULINT_UNDEFINED, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY
+ (page_zip_get_trailer_len(page_zip,
+ dict_index_is_clust(index))
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " %lu + %lu >= %lu, %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, dict_index_is_clust(index)),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip),
+ (ulong) dict_index_is_clust(index)));
+ return(FALSE);
+ }
+
+ /* Restore the uncompressed columns in heap_no order. */
+ storage = page_zip_dir_start_low(page_zip, n_dense);
+
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ offsets = rec_get_offsets(rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+ storage -= REC_NODE_PTR_SIZE;
+
+ memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE,
+ storage, REC_NODE_PTR_SIZE);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a leaf node of a secondary index.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_sec(
+/*====================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ rec_offs* offsets) /*!< in/out: temporary offsets */
+{
+ ulint heap_status = REC_STATUS_ORDINARY
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ ulint slot;
+
+ ut_a(!dict_index_is_clust(index));
+
+ /* Subtract the space reserved for uncompressed data. */
+	d_stream->avail_in -= static_cast<uInt>(
+ n_dense * PAGE_ZIP_DIR_SLOT_SIZE);
+
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ /* Decompress everything up to this record. */
+		d_stream->avail_out = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
+
+ if (UNIV_LIKELY(d_stream->avail_out)) {
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_sec:"
+ " inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+ }
+
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
+ }
+
+ /* Decompress the data of the last record and any trailing garbage,
+ in case the last record was allocated from an originally longer space
+ on the free list. */
+ d_stream->avail_out = static_cast<uInt>(
+ page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out));
+ if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_sec:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_sec:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ ulint(page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page)
+ - 1U)
+ - d_stream->next_out));
+ }
+
+ ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in));
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense,
+ index->n_fields,
+ ULINT_UNDEFINED, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE)
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+
+ page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, FALSE),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ /* There are no uncompressed columns on leaf pages of
+ secondary indexes. */
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a record of a leaf node of a clustered index that contains
+externally stored columns.
+@return TRUE on success */
+static
+ibool
+page_zip_decompress_clust_ext(
+/*==========================*/
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t* rec, /*!< in/out: record */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+	ulint	trx_id_col)	/*!< in: position of DB_TRX_ID */
+{
+ ulint i;
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ulint len;
+ byte* dst;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ /* Skip trx_id and roll_ptr */
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+ if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN)) {
+
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " len[%lu] = %lu\n",
+ (ulong) i, (ulong) len));
+ return(FALSE);
+ }
+
+ if (rec_offs_nth_extern(offsets, i)) {
+
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " DB_TRX_ID at %lu is ext\n",
+ (ulong) i));
+ return(FALSE);
+ }
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ return(FALSE);
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear DB_TRX_ID and DB_ROLL_PTR in order to
+ avoid uninitialized bytes in case the record
+ is affected by page_zip_apply_log(). */
+ memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ d_stream->next_out += DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ dst += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ return(FALSE);
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear the BLOB pointer in case
+ the record will be deleted and the
+ space will not be reused. Note that
+ the final initialization of the BLOB
+ pointers (copying from "externs"
+ or clearing) will have to take place
+ only after the page modification log
+ has been applied. Otherwise, we
+ could end up with an uninitialized
+ BLOB pointer when a record is deleted,
+ reallocated and deleted. */
+ memset(d_stream->next_out, 0,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ d_stream->next_out
+ += BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a leaf node of a clustered index.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_clust(
+/*======================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint trx_id_col, /*!< index of the trx_id column */
+ rec_offs* offsets, /*!< in/out: temporary offsets */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err;
+ ulint slot;
+ ulint heap_status = REC_STATUS_ORDINARY
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ const byte* storage;
+ const byte* externs;
+
+ ut_a(dict_index_is_clust(index));
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= static_cast<uInt>(n_dense)
+ * (PAGE_ZIP_CLUST_LEAF_SLOT_SIZE);
+
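+	/* Presumably PAGE_ZIP_CLUST_LEAF_SLOT_SIZE covers the
+	PAGE_ZIP_DIR_SLOT_SIZE bytes of the dense directory entry
+	plus the DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN bytes that are
+	stored uncompressed for each clustered index leaf record. */
+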
+ /* Decompress the records in heap_no order. */
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+		d_stream->avail_out = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
+
+ ut_ad(d_stream->avail_out < srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR);
+ err = inflate(d_stream, Z_SYNC_FLUSH);
+ switch (err) {
+ case Z_STREAM_END:
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (UNIV_LIKELY(!d_stream->avail_out)) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
+
+ /* Read the offsets. The status bits are needed here. */
+ offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* This is a leaf page in a clustered index. */
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, restore the
+ BTR_EXTERN_FIELD_REF separately. */
+
+ if (rec_offs_any_extern(offsets)) {
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress_clust_ext(
+ d_stream, rec, offsets, trx_id_col))) {
+
+ goto zlib_error;
+ }
+ } else {
+ /* Skip trx_id and roll_ptr */
+ ulint len;
+ byte* dst = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN)) {
+
+ page_zip_fail(("page_zip_decompress_clust:"
+ " len = %lu\n", (ulong) len));
+ goto zlib_error;
+ }
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear DB_TRX_ID and DB_ROLL_PTR in order to
+ avoid uninitialized bytes in case the record
+ is affected by page_zip_apply_log(). */
+ memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ d_stream->next_out += DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ }
+
+ /* Decompress the last bytes of the record. */
+ d_stream->avail_out = static_cast<uInt>(
+ rec_get_end(rec, offsets) - d_stream->next_out);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 3 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+ }
+
+ /* Decompress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list. */
+ d_stream->avail_out = static_cast<uInt>(
+ page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out));
+ if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_clust:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_clust:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ ulint(page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page)
+ - 1U)
+ - d_stream->next_out));
+ }
+
+ ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in));
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense,
+ index->n_fields,
+ trx_id_col, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE)
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+
+ page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, TRUE),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ storage = page_zip_dir_start_low(page_zip, n_dense);
+
+ externs = storage - n_dense
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
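+	/* Sketch of the compressed page trailer, growing downwards
+	from the end of the page: first the dense page directory,
+	then one DB_TRX_ID,DB_ROLL_PTR pair per record, and below
+	that the BTR_EXTERN_FIELD_REF BLOB references.  Accordingly,
+	"storage" moves down by DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN
+	per record and "externs" by BTR_EXTERN_FIELD_REF_SIZE per
+	BLOB pointer in the loop below. */
+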
+ /* Restore the uncompressed columns in heap_no order. */
+
+ for (slot = 0; slot < n_dense; slot++) {
+ ulint i;
+ ulint len;
+ byte* dst;
+ rec_t* rec = recs[slot];
+ bool exists = !page_zip_dir_find_free(
+ page_zip, page_offset(rec));
+ offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ dst = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ memcpy(dst, storage,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ /* Check if there are any externally stored
+ columns in this record. For each externally
+ stored column, restore or clear the
+ BTR_EXTERN_FIELD_REF. */
+ if (!rec_offs_any_extern(offsets)) {
+ continue;
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (!rec_offs_nth_extern(offsets, i)) {
+ continue;
+ }
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) {
+ page_zip_fail(("page_zip_decompress_clust:"
+ " %lu < 20\n",
+ (ulong) len));
+ return(FALSE);
+ }
+
+ dst += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_LIKELY(exists)) {
+ /* Existing record:
+ restore the BLOB pointer */
+ externs -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY
+ (externs < page_zip->data
+ + page_zip->m_end)) {
+ page_zip_fail(("page_zip_"
+ "decompress_clust:"
+ " %p < %p + %lu\n",
+ (const void*) externs,
+ (const void*)
+ page_zip->data,
+ (ulong)
+ page_zip->m_end));
+ return(FALSE);
+ }
+
+ memcpy(dst, externs,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+ page_zip->n_blobs++;
+ } else {
+ /* Deleted record:
+ clear the BLOB pointer */
+ memset(dst, 0,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_low(
+/*====================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page, /*!< out: uncompressed page, may be trashed */
+ ibool all) /*!< in: TRUE=decompress the whole page;
+ FALSE=verify but do not copy some
+ page header fields that should not change
+ after page creation */
+{
+ z_stream d_stream;
+ dict_index_t* index = NULL;
+ rec_t** recs; /*!< dense page directory, sorted by address */
+ ulint n_dense;/* number of user records on the page */
+ ulint trx_id_col = ULINT_UNDEFINED;
+ mem_heap_t* heap;
+ rec_offs* offsets;
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ MEM_CHECK_ADDRESSABLE(page, srv_page_size);
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW;
+ if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
+ >= page_zip_get_size(page_zip))) {
+ page_zip_fail(("page_zip_decompress 1: %lu %lu\n",
+ (ulong) n_dense,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ heap = mem_heap_create(n_dense * (3 * sizeof *recs) + srv_page_size);
+
+ recs = static_cast<rec_t**>(
+ mem_heap_alloc(heap, n_dense * sizeof *recs));
+
+ if (all) {
+ /* Copy the page header. */
+ memcpy_aligned<2>(page, page_zip->data, PAGE_DATA);
+ } else {
+ /* Check that the bytes that we skip are identical. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(FIL_PAGE_TYPE + page,
+ FIL_PAGE_TYPE + page_zip->data,
+ PAGE_HEADER - FIL_PAGE_TYPE));
+ ut_a(!memcmp(PAGE_HEADER + PAGE_LEVEL + page,
+ PAGE_HEADER + PAGE_LEVEL + page_zip->data,
+ PAGE_DATA - (PAGE_HEADER + PAGE_LEVEL)));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ /* Copy the mutable parts of the page header. */
+ memcpy_aligned<8>(page, page_zip->data, FIL_PAGE_TYPE);
+ memcpy_aligned<2>(PAGE_HEADER + page,
+ PAGE_HEADER + page_zip->data,
+ PAGE_LEVEL - PAGE_N_DIR_SLOTS);
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ /* Check that the page headers match after copying. */
+ ut_a(!memcmp(page, page_zip->data, PAGE_DATA));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ /* Clear the uncompressed page, except the header. */
+ memset(PAGE_DATA + page, 0x55, srv_page_size - PAGE_DATA);
+#endif /* UNIV_ZIP_DEBUG */
+ MEM_UNDEFINED(PAGE_DATA + page, srv_page_size - PAGE_DATA);
+
+ /* Copy the page directory. */
+ if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs,
+ n_dense))) {
+zlib_error:
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+
+ /* Copy the infimum and supremum records. */
+ memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
+ infimum_extra, sizeof infimum_extra);
+ if (page_is_empty(page)) {
+ rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
+ PAGE_NEW_SUPREMUM);
+ } else {
+ rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
+ page_zip_dir_get(page_zip, 0)
+ & PAGE_ZIP_DIR_SLOT_MASK);
+ }
+ memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data);
+ memcpy_aligned<4>(PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1
+ + page, supremum_extra_data,
+ sizeof supremum_extra_data);
+
+ page_zip_set_alloc(&d_stream, heap);
+
+ d_stream.next_in = page_zip->data + PAGE_DATA;
+ /* Subtract the space reserved for
+ the page header and the end marker of the modification log. */
+ d_stream.avail_in = static_cast<uInt>(
+ page_zip_get_size(page_zip) - (PAGE_DATA + 1));
+ d_stream.next_out = page + PAGE_ZIP_START;
+ d_stream.avail_out = uInt(srv_page_size - PAGE_ZIP_START);
+
+ if (UNIV_UNLIKELY(inflateInit2(&d_stream, int(srv_page_size_shift))
+ != Z_OK)) {
+ ut_error;
+ }
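+
+	/* A zlib detail worth noting: passing srv_page_size_shift as
+	the windowBits argument sets the inflate window to
+	1 << srv_page_size_shift bytes, i.e. exactly one page, so a
+	well-formed stream can never reference data outside the page
+	being rebuilt. */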
+
+ /* Decode the zlib header and the index information. */
+ if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
+
+ page_zip_fail(("page_zip_decompress:"
+ " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
+
+ page_zip_fail(("page_zip_decompress:"
+ " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg));
+ goto zlib_error;
+ }
+
+ index = page_zip_fields_decode(
+ page + PAGE_ZIP_START, d_stream.next_out,
+ page_is_leaf(page) ? &trx_id_col : NULL,
+ fil_page_get_type(page) == FIL_PAGE_RTREE);
+
+ if (UNIV_UNLIKELY(!index)) {
+
+ goto zlib_error;
+ }
+
+ /* Decompress the user records. */
+ page_zip->n_blobs = 0;
+ d_stream.next_out = page + PAGE_ZIP_START;
+
+ {
+ /* Pre-allocate the offsets for rec_get_offsets_reverse(). */
+ ulint n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+
+ offsets = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, n * sizeof(ulint)));
+
+ rec_offs_set_n_alloc(offsets, n);
+ }
+
+ /* Decompress the records in heap_no order. */
+ if (!page_is_leaf(page)) {
+ /* This is a node pointer page. */
+ ulint info_bits;
+
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress_node_ptrs(page_zip, &d_stream,
+ recs, n_dense, index,
+ offsets, heap))) {
+ goto err_exit;
+ }
+
+ info_bits = page_has_prev(page) ? 0 : REC_INFO_MIN_REC_FLAG;
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page,
+ info_bits))) {
+ goto err_exit;
+ }
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ /* This is a leaf page in a secondary index. */
+ if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream,
+ recs, n_dense,
+ index, offsets))) {
+ goto err_exit;
+ }
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
+ page, 0))) {
+err_exit:
+ page_zip_fields_free(index);
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+ } else {
+ /* This is a leaf page in a clustered index. */
+ if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip,
+ &d_stream, recs,
+ n_dense, index,
+ trx_id_col,
+ offsets, heap))) {
+ goto err_exit;
+ }
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
+ page, 0))) {
+ goto err_exit;
+ }
+ }
+
+ ut_a(page_is_comp(page));
+ MEM_CHECK_DEFINED(page, srv_page_size);
+
+ page_zip_fields_free(index);
+ mem_heap_free(heap);
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+ibool
+page_zip_decompress(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page, /*!< out: uncompressed page, may be trashed */
+ ibool all) /*!< in: TRUE=decompress the whole page;
+ FALSE=verify but do not copy some
+ page header fields that should not change
+ after page creation */
+{
+ const ulonglong ns = my_interval_timer();
+
+ if (!page_zip_decompress_low(page_zip, page, all)) {
+ return(FALSE);
+ }
+
+ const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
+ page_zip_stat[page_zip->ssize - 1].decompressed++;
+ page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff;
+
+ index_id_t index_id = btr_page_get_index_id(page);
+
+ if (srv_cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[index_id].decompressed++;
+ page_zip_stat_per_index[index_id].decompressed_usec += time_diff;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+
+ /* Update the stat counter for LRU policy. */
+ buf_LRU_stat_inc_unzip();
+
+ MONITOR_INC(MONITOR_PAGE_DECOMPRESS);
+
+ return(TRUE);
+}
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Dump a block of memory on the standard error stream. */
+static
+void
+page_zip_hexdump_func(
+/*==================*/
+ const char* name, /*!< in: name of the data structure */
+ const void* buf, /*!< in: data */
+ ulint size) /*!< in: length of the data, in bytes */
+{
+ const byte* s = static_cast<const byte*>(buf);
+ ulint addr;
+ const ulint width = 32; /* bytes per line */
+
+ fprintf(stderr, "%s:\n", name);
+
+ for (addr = 0; addr < size; addr += width) {
+ ulint i;
+
+ fprintf(stderr, "%04lx ", (ulong) addr);
+
+ i = ut_min(width, size - addr);
+
+ while (i--) {
+ fprintf(stderr, "%02x", *s++);
+ }
+
+ putc('\n', stderr);
+ }
+}
+
+/** Dump a block of memory on the standard error stream.
+@param buf in: data
+@param size in: length of the data, in bytes */
+#define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size)
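+/* Note: the #buf stringification above makes each dump self-labeling;
+for example, page_zip_hexdump(page_zip->data, size) prints its output
+under the heading "page_zip->data". */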
+
+/** Flag: make page_zip_validate() compare page headers only */
+bool page_zip_validate_header_only;
+
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+ibool
+page_zip_validate_low(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index, /*!< in: index of the page, if known */
+ ibool sloppy) /*!< in: FALSE=strict,
+ TRUE=ignore the MIN_REC_FLAG */
+{
+ page_zip_des_t temp_page_zip;
+ ibool valid;
+
+ if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV)
+ || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2)
+ || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_ROOT_AUTO_INC)
+ /* The PAGE_ROOT_AUTO_INC can be updated while holding an SX-latch
+ on the clustered index root page (page number 3 in .ibd files).
+ That allows concurrent readers (holding buf_block_t::lock S-latch).
+ Because we do not know what type of a latch our caller is holding,
+ we will ignore the field on clustered index root pages in order
+ to avoid false positives. */
+ || (page_get_page_no(page) != 3/* clustered index root page */
+ && memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC],
+ &page[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC], 8))
+ || memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END],
+ &page[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END],
+ PAGE_DATA - FIL_PAGE_DATA - PAGE_HEADER_PRIV_END)) {
+ page_zip_fail(("page_zip_validate: page header\n"));
+ page_zip_hexdump(page_zip, sizeof *page_zip);
+ page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
+ page_zip_hexdump(page, srv_page_size);
+ return(FALSE);
+ }
+
+ ut_a(page_is_comp(page));
+
+ if (page_zip_validate_header_only) {
+ return(TRUE);
+ }
+
+ /* page_zip_decompress() expects the uncompressed page to be
+ srv_page_size aligned. */
+ page_t* temp_page = static_cast<byte*>(aligned_malloc(srv_page_size,
+ srv_page_size));
+
+ MEM_CHECK_DEFINED(page, srv_page_size);
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ temp_page_zip = *page_zip;
+ valid = page_zip_decompress_low(&temp_page_zip, temp_page, TRUE);
+ if (!valid) {
+ fputs("page_zip_validate(): failed to decompress\n", stderr);
+ goto func_exit;
+ }
+ if (page_zip->n_blobs != temp_page_zip.n_blobs) {
+ page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n",
+ page_zip->n_blobs, temp_page_zip.n_blobs));
+ valid = FALSE;
+ }
+#ifdef UNIV_DEBUG
+ if (page_zip->m_start != temp_page_zip.m_start) {
+ page_zip_fail(("page_zip_validate: m_start: %u!=%u\n",
+ page_zip->m_start, temp_page_zip.m_start));
+ valid = FALSE;
+ }
+#endif /* UNIV_DEBUG */
+ if (page_zip->m_end != temp_page_zip.m_end) {
+ page_zip_fail(("page_zip_validate: m_end: %u!=%u\n",
+ page_zip->m_end, temp_page_zip.m_end));
+ valid = FALSE;
+ }
+ if (page_zip->m_nonempty != temp_page_zip.m_nonempty) {
+ page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n",
+ page_zip->m_nonempty,
+ temp_page_zip.m_nonempty));
+ valid = FALSE;
+ }
+ if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER,
+ srv_page_size - PAGE_HEADER - FIL_PAGE_DATA_END)) {
+
+ /* In crash recovery, the "minimum record" flag may be
+ set incorrectly until the mini-transaction is
+ committed. Let us tolerate that difference when we
+ are performing a sloppy validation. */
+
+ rec_offs* offsets;
+ mem_heap_t* heap;
+ const rec_t* rec;
+ const rec_t* trec;
+ byte info_bits_diff;
+ ulint offset
+ = rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE);
+ ut_a(offset >= PAGE_NEW_SUPREMUM);
+ offset -= 5/*REC_NEW_INFO_BITS*/;
+
+ info_bits_diff = page[offset] ^ temp_page[offset];
+
+ if (info_bits_diff == REC_INFO_MIN_REC_FLAG) {
+ temp_page[offset] = page[offset];
+
+ if (!memcmp(page + PAGE_HEADER,
+ temp_page + PAGE_HEADER,
+ srv_page_size - PAGE_HEADER
+ - FIL_PAGE_DATA_END)) {
+
+ /* Only the minimum record flag
+ differed. Let us ignore it. */
+ page_zip_fail(("page_zip_validate:"
+ " min_rec_flag"
+ " (%s" ULINTPF "," ULINTPF
+ ",0x%02x)\n",
+ sloppy ? "ignored, " : "",
+ page_get_space_id(page),
+ page_get_page_no(page),
+ page[offset]));
+			/* We don't check for spatial indexes, since
+			the "minimum record" could be deleted when
+			doing rtr_update_mbr_field().
+			GIS_FIXME: need to validate why
+			rtr_update_mbr_field() could affect this */
+ if (index && dict_index_is_spatial(index)) {
+ valid = true;
+ } else {
+ valid = sloppy;
+ }
+ goto func_exit;
+ }
+ }
+
+ /* Compare the pointers in the PAGE_FREE list. */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+ trec = page_header_get_ptr(temp_page, PAGE_FREE);
+
+ while (rec || trec) {
+ if (page_offset(rec) != page_offset(trec)) {
+ page_zip_fail(("page_zip_validate:"
+ " PAGE_FREE list: %u!=%u\n",
+ (unsigned) page_offset(rec),
+ (unsigned) page_offset(trec)));
+ valid = FALSE;
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next_low(rec, TRUE);
+ trec = page_rec_get_next_low(trec, TRUE);
+ }
+
+ /* Compare the records. */
+ heap = NULL;
+ offsets = NULL;
+ rec = page_rec_get_next_low(
+ page + PAGE_NEW_INFIMUM, TRUE);
+ trec = page_rec_get_next_low(
+ temp_page + PAGE_NEW_INFIMUM, TRUE);
+		const ulint n_core = (index && page_is_leaf(page))
+			? index->n_fields : 0;
+
+ do {
+ if (page_offset(rec) != page_offset(trec)) {
+ page_zip_fail(("page_zip_validate:"
+ " record list: 0x%02x!=0x%02x\n",
+ (unsigned) page_offset(rec),
+ (unsigned) page_offset(trec)));
+ valid = FALSE;
+ break;
+ }
+
+ if (index) {
+ /* Compare the data. */
+ offsets = rec_get_offsets(
+ rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+
+ if (memcmp(rec - rec_offs_extra_size(offsets),
+ trec - rec_offs_extra_size(offsets),
+ rec_offs_size(offsets))) {
+ page_zip_fail(
+ ("page_zip_validate:"
+ " record content: 0x%02x",
+ (unsigned) page_offset(rec)));
+ valid = FALSE;
+ break;
+ }
+ }
+
+ rec = page_rec_get_next_low(rec, TRUE);
+ trec = page_rec_get_next_low(trec, TRUE);
+ } while (rec || trec);
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ }
+
+func_exit:
+ if (!valid) {
+ page_zip_hexdump(page_zip, sizeof *page_zip);
+ page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
+ page_zip_hexdump(page, srv_page_size);
+ page_zip_hexdump(temp_page, srv_page_size);
+ }
+ aligned_free(temp_page);
+ return(valid);
+}
+
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+ibool
+page_zip_validate(
+/*==============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index) /*!< in: index of the page, if known */
+{
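+	/* Be sloppy during crash recovery, when the "minimum record"
+	flag may legitimately differ until the mini-transaction has
+	been fully applied; see the comment in page_zip_validate_low(). */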
+ return(page_zip_validate_low(page_zip, page, index,
+ recv_recovery_is_on()));
+}
+#endif /* UNIV_ZIP_DEBUG */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Assert that the compressed and decompressed page headers match.
+@return TRUE */
+static
+ibool
+page_zip_header_cmp(
+/*================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const byte* page) /*!< in: uncompressed page */
+{
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV));
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
+ 2));
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_DATA - FIL_PAGE_DATA));
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Write a record on the compressed page that contains externally stored
+columns. The data must already have been written to the uncompressed page.
+@return end of modification log */
+static
+byte*
+page_zip_write_rec_ext(
+/*===================*/
+ buf_block_t* block, /*!< in/out: compressed page */
+ const byte* rec, /*!< in: record being written */
+	const dict_index_t* index,	/*!< in: record descriptor */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ ulint create, /*!< in: nonzero=insert, zero=update */
+ ulint trx_id_col, /*!< in: position of DB_TRX_ID */
+ ulint heap_no, /*!< in: heap number of rec */
+ byte* storage, /*!< in: end of dense page directory */
+ byte* data, /*!< in: end of modification log */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ const byte* start = rec;
+ ulint i;
+ ulint len;
+ byte* externs = storage;
+ ulint n_ext = rec_offs_n_extern(offsets);
+ const page_t* const page = block->frame;
+ page_zip_des_t* const page_zip = &block->page.zip;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW);
+
+ /* Note that this will not take into account
+ the BLOB columns of rec if create==TRUE. */
+ ut_ad(data + rec_offs_data_size(offsets)
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ - n_ext * FIELD_REF_SIZE
+ < externs - FIELD_REF_SIZE * page_zip->n_blobs);
+
+ if (n_ext) {
+ ulint blob_no = page_zip_get_n_prev_extern(
+ page_zip, rec, index);
+ byte* ext_end = externs - page_zip->n_blobs * FIELD_REF_SIZE;
+ ut_ad(blob_no <= page_zip->n_blobs);
+ externs -= blob_no * FIELD_REF_SIZE;
+
+ if (create) {
+ page_zip->n_blobs = (page_zip->n_blobs + n_ext)
+ & ((1U << 12) - 1);
+ ASSERT_ZERO_BLOB(ext_end - n_ext * FIELD_REF_SIZE);
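+			/* The 12-bit mask above presumably matches
+			the width of the n_blobs bitfield in
+			page_zip_des_t. */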
+ if (ulint len = ulint(externs - ext_end)) {
+ byte* ext_start = ext_end
+ - n_ext * FIELD_REF_SIZE;
+ memmove(ext_start, ext_end, len);
+ mtr->memmove(*block,
+ ext_start - page_zip->data,
+ ext_end - page_zip->data, len);
+ }
+ }
+
+ ut_a(blob_no + n_ext <= page_zip->n_blobs);
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ const byte* src;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ ut_ad(!rec_offs_nth_extern(offsets,
+ i));
+ ut_ad(!rec_offs_nth_extern(offsets,
+ i + 1));
+ /* Locate trx_id and roll_ptr. */
+ src = rec_get_nth_field(rec, offsets,
+ i, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(
+ rec, offsets,
+ i + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Log the preceding fields. */
+ ASSERT_ZERO(data, src - start);
+ memcpy(data, start, ulint(src - start));
+ data += src - start;
+ start = src + (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+
+ /* Store trx_id and roll_ptr. */
+ constexpr ulint sys_len = DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ byte* sys = storage - sys_len * (heap_no - 1);
+ memcpy(sys, src, sys_len);
+ i++; /* skip also roll_ptr */
+ mtr->zmemcpy(*block, sys - page_zip->data, sys_len);
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ src = rec_get_nth_field(rec, offsets,
+ i, &len);
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(len >= FIELD_REF_SIZE);
+ src += len - FIELD_REF_SIZE;
+
+ ASSERT_ZERO(data, src - start);
+ memcpy(data, start, ulint(src - start));
+ data += src - start;
+ start = src + FIELD_REF_SIZE;
+
+ /* Store the BLOB pointer. */
+ externs -= FIELD_REF_SIZE;
+ ut_ad(data < externs);
+ memcpy(externs, src, FIELD_REF_SIZE);
+ mtr->zmemcpy(*block, externs - page_zip->data,
+ FIELD_REF_SIZE);
+ }
+ }
+
+ /* Log the last bytes of the record. */
+ len = rec_offs_data_size(offsets) - ulint(start - rec);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, start, len);
+ data += len;
+
+ return(data);
+}
+
+/** Write an entire record to the ROW_FORMAT=COMPRESSED page.
+The data must already have been written to the uncompressed page.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in] rec record in the uncompressed page
+@param[in] index the index that the page belongs to
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] create nonzero=insert, zero=update
+@param[in,out] mtr mini-transaction */
+void page_zip_write_rec(buf_block_t *block, const byte *rec,
+ const dict_index_t *index, const rec_offs *offsets,
+ ulint create, mtr_t *mtr)
+{
+ const page_t* const page = block->frame;
+ page_zip_des_t* const page_zip = &block->page.zip;
+ byte* data;
+ byte* storage;
+ ulint heap_no;
+ byte* slot;
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_comp(offsets));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+
+ ut_ad(page_zip_header_cmp(page_zip, page));
+ ut_ad(page_simple_validate_new((page_t*) page));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ slot = page_zip_dir_find(page_zip, page_offset(rec));
+ ut_a(slot);
+ /* Copy the delete mark. */
+ if (rec_get_deleted_flag(rec, TRUE)) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record.
+ On non-leaf pages, the delete-mark flag is garbage. */
+ ut_ad(!index->is_primary() || !page_is_leaf(page)
+ || row_get_rec_trx_id(rec, index, offsets));
+ *slot |= PAGE_ZIP_DIR_SLOT_DEL >> 8;
+ } else {
+ *slot &= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8));
+ }
+
+ ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START);
+ ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + srv_page_size
+ - PAGE_DIR - PAGE_DIR_SLOT_SIZE
+ * page_dir_get_n_slots(page));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */
+ ut_ad(heap_no < page_dir_get_n_heap(page));
+
+ /* Append to the modification log. */
+ data = page_zip->data + page_zip->m_end;
+ ut_ad(!*data);
+
+ /* Identify the record by writing its heap number - 1.
+ 0 is reserved to indicate the end of the modification log. */
+
+ if (UNIV_UNLIKELY(heap_no - 1 >= 64)) {
+ *data++ = (byte) (0x80 | (heap_no - 1) >> 7);
+ ut_ad(!*data);
+ }
+ *data++ = (byte) ((heap_no - 1) << 1);
+ ut_ad(!*data);
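+
+	/* Worked example of this 1..2 byte encoding: the one-byte
+	form must not have bit 7 set (0x80 marks the two-byte form)
+	and bit 0 is left for page_zip_apply_log() to interpret, so
+	it only covers heap_no - 1 < 64.  For heap_no - 1 = 5 we emit
+	the single byte 0x0a; for heap_no - 1 = 99 we emit 0x80
+	(0x80 | 99 >> 7) followed by 0xc6 (99 << 1, truncated to
+	8 bits). */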
+
+ {
+ const byte* start = rec - rec_offs_extra_size(offsets);
+ const byte* b = rec - REC_N_NEW_EXTRA_BYTES;
+
+ /* Write the extra bytes backwards, so that
+ rec_offs_extra_size() can be easily computed in
+ page_zip_apply_log() by invoking
+ rec_get_offsets_reverse(). */
+
+ while (b != start) {
+ *data++ = *--b;
+ ut_ad(!*data);
+ }
+ }
+
+ /* Write the data bytes. Store the uncompressed bytes separately. */
+ storage = page_zip_dir_start(page_zip);
+
+ if (page_is_leaf(page)) {
+ if (dict_index_is_clust(index)) {
+ /* Store separately trx_id, roll_ptr and
+ the BTR_EXTERN_FIELD_REF of each BLOB column. */
+ if (rec_offs_any_extern(offsets)) {
+ data = page_zip_write_rec_ext(
+ block,
+ rec, index, offsets, create,
+ index->db_trx_id(), heap_no,
+ storage, data, mtr);
+ } else {
+ /* Locate trx_id and roll_ptr. */
+ ulint len;
+ const byte* src
+ = rec_get_nth_field(rec, offsets,
+ index->db_trx_id(),
+ &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(
+ rec, offsets,
+ index->db_roll_ptr(), &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Log the preceding fields. */
+ ASSERT_ZERO(data, src - rec);
+ memcpy(data, rec, ulint(src - rec));
+ data += src - rec;
+
+ /* Store trx_id and roll_ptr. */
+ constexpr ulint sys_len
+ = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ byte* sys = storage - sys_len * (heap_no - 1);
+ memcpy(sys, src, sys_len);
+
+ src += sys_len;
+ mtr->zmemcpy(*block, sys - page_zip->data,
+ sys_len);
+ /* Log the last bytes of the record. */
+ len = rec_offs_data_size(offsets)
+ - ulint(src - rec);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, src, len);
+ data += len;
+ }
+ } else {
+ /* Leaf page of a secondary index:
+ no externally stored columns */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Log the entire record. */
+ ulint len = rec_offs_data_size(offsets);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, rec, len);
+ data += len;
+ }
+ } else {
+ /* This is a node pointer page. */
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Copy the data bytes, except node_ptr. */
+ ulint len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE;
+ ut_ad(data + len < storage - REC_NODE_PTR_SIZE
+ * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW));
+ ASSERT_ZERO(data, len);
+ memcpy(data, rec, len);
+ data += len;
+
+ /* Copy the node pointer to the uncompressed area. */
+ byte* node_ptr = storage - REC_NODE_PTR_SIZE * (heap_no - 1);
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, node_ptr,
+ rec + len, REC_NODE_PTR_SIZE);
+ }
+
+ ut_a(!*data);
+ ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip));
+ mtr->zmemcpy(*block, page_zip->m_end,
+ data - page_zip->data - page_zip->m_end);
+ page_zip->m_end = uint16_t(data - page_zip->data);
+ page_zip->m_nonempty = TRUE;
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page_align(rec), index));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+void
+page_zip_write_blob_ptr(
+/*====================*/
+ buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */
+ const byte* rec, /*!< in/out: record whose data is being
+ written */
+ dict_index_t* index, /*!< in: index of the page */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint n, /*!< in: column index */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ const byte* field;
+ byte* externs;
+ const page_t* const page = block->frame;
+ page_zip_des_t* const page_zip = &block->page.zip;
+ ulint blob_no;
+ ulint len;
+
+ ut_ad(page_align(rec) == page);
+ ut_ad(index != NULL);
+ ut_ad(offsets != NULL);
+ ut_ad(page_simple_validate_new((page_t*) page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_comp(offsets));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_offs_any_extern(offsets));
+ ut_ad(rec_offs_nth_extern(offsets, n));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(page_is_leaf(page));
+ ut_ad(dict_index_is_clust(index));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ blob_no = page_zip_get_n_prev_extern(page_zip, rec, index)
+ + rec_get_n_extern_new(rec, index, n);
+ ut_a(blob_no < page_zip->n_blobs);
+
+ externs = page_zip->data + page_zip_get_size(page_zip)
+ - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+ * PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
+
+ field = rec_get_nth_field(rec, offsets, n, &len);
+
+ externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE;
+ field += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, externs, field,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+void
+page_zip_write_node_ptr(
+/*====================*/
+ buf_block_t* block, /*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ ulint size, /*!< in: data size of rec */
+ ulint ptr, /*!< in: node pointer */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ byte* field;
+ byte* storage;
+ page_zip_des_t* const page_zip = &block->page.zip;
+
+ ut_d(const page_t* const page = block->frame);
+ ut_ad(page_simple_validate_new(page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(page_rec_is_comp(rec));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(!page_is_leaf(page));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, size);
+
+ storage = page_zip_dir_start(page_zip)
+ - (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE;
+ field = rec + size - REC_NODE_PTR_SIZE;
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+ compile_time_assert(REC_NODE_PTR_SIZE == 4);
+ mach_write_to_4(field, ptr);
+ mtr->zmemcpy(*block, storage, field, REC_NODE_PTR_SIZE);
+}
+
+/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in,out] rec record
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in]	trx_id_col	field number of DB_TRX_ID (number of PK fields)
+@param[in] trx_id DB_TRX_ID value (transaction identifier)
+@param[in] roll_ptr DB_ROLL_PTR value (undo log pointer)
+@param[in,out] mtr mini-transaction */
+void
+page_zip_write_trx_id_and_roll_ptr(
+ buf_block_t* block,
+ byte* rec,
+ const rec_offs* offsets,
+ ulint trx_id_col,
+ trx_id_t trx_id,
+ roll_ptr_t roll_ptr,
+ mtr_t* mtr)
+{
+ page_zip_des_t* const page_zip = &block->page.zip;
+
+ ut_d(const page_t* const page = block->frame);
+ ut_ad(page_align(rec) == page);
+ ut_ad(page_simple_validate_new(page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_offs_comp(offsets));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(page_is_leaf(page));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ constexpr ulint sys_len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ const ulint heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ byte* storage = page_zip_dir_start(page_zip) - (heap_no - 1) * sys_len;
+
+ compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+ ulint len;
+ byte* field = rec_get_nth_field(rec, offsets, trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(field + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(storage, field, sys_len));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+ compile_time_assert(DATA_TRX_ID_LEN == 6);
+ mach_write_to_6(field, trx_id);
+ compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+ mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr);
+ len = 0;
+ if (heap_no > PAGE_HEAP_NO_USER_LOW) {
+ byte* prev = storage + sys_len;
+ for (; len < sys_len && prev[len] == field[len]; len++);
+ if (len > 4) {
+ /* We save space by replacing a single record
+
+ WRITE,offset(storage),byte[13]
+
+ with up to two records:
+
+ MEMMOVE,offset(storage),len(1 byte),+13(1 byte),
+ WRITE|0x80,0,byte[13-len]
+
+ The single WRITE record would be x+13 bytes long (x>2).
+ The MEMMOVE record would be x+1+1 = x+2 bytes, and
+ the second WRITE would be 1+1+13-len = 15-len bytes.
+
+ The total size is: x+13 versus x+2+15-len = x+17-len.
+ To save space, we must have len>4. */
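+			/* For instance, with len = 6 matching bytes
+			the MEMMOVE record costs x+2 bytes and the
+			remaining WRITE 15-6 = 9 bytes, x+11 in total,
+			two bytes less than the single x+13 byte WRITE
+			record. */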
+ memcpy(storage, prev, len);
+ mtr->memmove(*block, ulint(storage - page_zip->data),
+ ulint(storage - page_zip->data) + sys_len,
+ len);
+ storage += len;
+ field += len;
+ if (UNIV_LIKELY(len < sys_len)) {
+ goto write;
+ }
+ } else {
+ len = 0;
+ goto write;
+ }
+ } else {
+write:
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, storage, field,
+ sys_len - len);
+ }
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(storage - len, field - len, sys_len));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+}
+
+/**********************************************************************//**
+Clear an area on the uncompressed and compressed page.
+Do not clear the data payload, as that would grow the modification log. */
+static
+void
+page_zip_clear_rec(
+/*===============*/
+ buf_block_t* block, /*!< in/out: compressed page */
+ byte* rec, /*!< in: record to clear */
+ const dict_index_t* index, /*!< in: index of rec */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint heap_no;
+ byte* storage;
+ byte* field;
+ ulint len;
+
+ ut_ad(page_align(rec) == block->frame);
+ page_zip_des_t* const page_zip = &block->page.zip;
+
+ /* page_zip_validate() would fail here if a record
+ containing externally stored columns is being deleted. */
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!page_zip_dir_find(page_zip, page_offset(rec)));
+ ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec)));
+ ut_ad(page_zip_header_cmp(page_zip, block->frame));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ if (!page_is_leaf(block->frame)) {
+ /* Clear node_ptr. On the compressed page,
+ there is an array of node_ptr immediately before the
+ dense page directory, at the very end of the page. */
+ storage = page_zip_dir_start(page_zip);
+ ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index) ==
+ rec_offs_n_fields(offsets) - 1);
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1,
+ &len);
+ ut_ad(len == REC_NODE_PTR_SIZE);
+ ut_ad(!rec_offs_any_extern(offsets));
+ memset(field, 0, REC_NODE_PTR_SIZE);
+ storage -= (heap_no - 1) * REC_NODE_PTR_SIZE;
+ len = REC_NODE_PTR_SIZE;
+clear_page_zip:
+ memset(storage, 0, len);
+ mtr->memset(*block, storage - page_zip->data, len, 0);
+ } else if (index->is_clust()) {
+ /* Clear trx_id and roll_ptr. On the compressed page,
+ there is an array of these fields immediately before the
+ dense page directory, at the very end of the page. */
+ const ulint trx_id_pos
+ = dict_col_get_clust_pos(
+ dict_table_get_sys_col(
+ index->table, DATA_TRX_ID), index);
+ field = rec_get_nth_field(rec, offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ if (rec_offs_any_extern(offsets)) {
+ ulint i;
+
+ for (i = rec_offs_n_fields(offsets); i--; ) {
+ /* Clear all BLOB pointers in order to make
+ page_zip_validate() pass. */
+ if (rec_offs_nth_extern(offsets, i)) {
+ field = rec_get_nth_field(
+ rec, offsets, i, &len);
+ ut_ad(len
+ == BTR_EXTERN_FIELD_REF_SIZE);
+ memset(field + len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ 0, BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ storage = page_zip_dir_start(page_zip)
+ - (heap_no - 1)
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ goto clear_page_zip;
+ } else {
+ ut_ad(!rec_offs_any_extern(offsets));
+ }
+}
+
+/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
+@param[in,out] block buffer block
+@param[in,out] rec record on a physical index page
+@param[in] flag the value of the delete-mark flag
+@param[in,out] mtr mini-transaction */
+void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
+ mtr_t *mtr)
+{
+ ut_ad(page_align(rec) == block->frame);
+ byte *slot= page_zip_dir_find(&block->page.zip, page_offset(rec));
+ byte b= *slot;
+ if (flag)
+ b|= (PAGE_ZIP_DIR_SLOT_DEL >> 8);
+ else
+ b&= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8));
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1);
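+  /* The mtr_t::MAYBE_NOP template argument (an assumption drawn from
+  its usage pattern here) should let the mini-transaction skip the
+  redo record when the slot byte is unchanged. */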
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(&block->page.zip, block->frame, nullptr));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+void
+page_zip_rec_set_owned(
+/*===================*/
+ buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag, /*!< in: the owned flag (nonzero=TRUE) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(page_align(rec) == block->frame);
+ page_zip_des_t *const page_zip= &block->page.zip;
+ byte *slot= page_zip_dir_find(page_zip, page_offset(rec));
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ byte b= *slot;
+ if (flag)
+ b|= (PAGE_ZIP_DIR_SLOT_OWNED >> 8);
+ else
+ b&= byte(~(PAGE_ZIP_DIR_SLOT_OWNED >> 8));
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1);
+}
+
+/**********************************************************************//**
+Insert a record to the dense page directory. */
+void
+page_zip_dir_insert(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ uint16_t free_rec,/*!< in: record from which rec was
+ allocated, or 0 */
+ byte* rec, /*!< in: record to insert */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(page_align(cursor->rec) == cursor->block->frame);
+ ut_ad(page_align(rec) == cursor->block->frame);
+ page_zip_des_t *const page_zip= &cursor->block->page.zip;
+
+ ulint n_dense;
+ byte* slot_rec;
+ byte* slot_free;
+
+ ut_ad(cursor->rec != rec);
+ ut_ad(page_rec_get_next_const(cursor->rec) == rec);
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ if (page_rec_is_infimum(cursor->rec)) {
+ /* Use the first slot. */
+ slot_rec = page_zip->data + page_zip_get_size(page_zip);
+ } else {
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+ byte* start = end - page_zip_dir_user_size(page_zip);
+
+ if (UNIV_LIKELY(!free_rec)) {
+ /* PAGE_N_RECS was already incremented
+ in page_cur_insert_rec_zip(), but the
+ dense directory slot at that position
+ contains garbage. Skip it. */
+ start += PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+
+ slot_rec = page_zip_dir_find_low(start, end,
+ page_offset(cursor->rec));
+ ut_a(slot_rec);
+ }
+
+ /* Read the old n_dense (n_heap may have been incremented). */
+ n_dense = page_dir_get_n_heap(page_zip->data)
+ - (PAGE_HEAP_NO_USER_LOW + 1U);
+
+ if (UNIV_UNLIKELY(free_rec)) {
+ /* The record was allocated from the free list.
+ Shift the dense directory only up to that slot.
+ Note that in this case, n_dense is actually
+ off by one, because page_cur_insert_rec_zip()
+ did not increment n_heap. */
+ ut_ad(rec_get_heap_no_new(rec) < n_dense + 1
+ + PAGE_HEAP_NO_USER_LOW);
+ ut_ad(page_offset(rec) >= free_rec);
+ slot_free = page_zip_dir_find(page_zip, free_rec);
+ ut_ad(slot_free);
+ slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
+ } else {
+ /* The record was allocated from the heap.
+ Shift the entire dense directory. */
+ ut_ad(rec_get_heap_no_new(rec) == n_dense
+ + PAGE_HEAP_NO_USER_LOW);
+
+ /* Shift to the end of the dense page directory. */
+ slot_free = page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
+ }
+
+ if (const ulint slot_len = ulint(slot_rec - slot_free)) {
+ /* Shift the dense directory to allocate place for rec. */
+ memmove_aligned<2>(slot_free - PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free, slot_len);
+ mtr->memmove(*cursor->block, (slot_free - page_zip->data)
+ - PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free - page_zip->data, slot_len);
+ }
+
+ /* Write the entry for the inserted record.
+ The "owned" and "deleted" flags must be zero. */
+ mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, page_offset(rec));
+ mtr->zmemcpy(*cursor->block, slot_rec - page_zip->data
+ - PAGE_ZIP_DIR_SLOT_SIZE, PAGE_ZIP_DIR_SLOT_SIZE);
+}
+
+/** Shift the dense page directory and the array of BLOB pointers
+when a record is deleted.
+@param[in,out] block index page
+@param[in,out] rec record being deleted
+@param[in] index the index that the page belongs to
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] free previous start of the free list
+@param[in,out] mtr mini-transaction */
+void page_zip_dir_delete(buf_block_t *block, byte *rec,
+ const dict_index_t *index, const rec_offs *offsets,
+ const byte *free, mtr_t *mtr)
+{
+ ut_ad(page_align(rec) == block->frame);
+ page_zip_des_t *const page_zip= &block->page.zip;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_comp(offsets));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ mach_write_to_2(rec - REC_NEXT,
+ free ? static_cast<uint16_t>(free - rec) : 0);
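+  /* In the COMPACT record format the next-record field holds a 16-bit
+  offset relative to the record itself, hence free - rec above;
+  0 terminates the PAGE_FREE list. */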
+ byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+ block->frame);
+ mtr->write<2>(*block, page_free, page_offset(rec));
+ byte *garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
+ block->frame);
+ mtr->write<2>(*block, garbage, rec_offs_size(offsets) +
+ mach_read_from_2(garbage));
+ compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2);
+ memcpy_aligned<4>(PAGE_FREE + PAGE_HEADER + page_zip->data, page_free, 4);
+ byte *slot_rec= page_zip_dir_find(page_zip, page_offset(rec));
+ ut_a(slot_rec);
+ uint16_t n_recs= page_get_n_recs(block->frame);
+ ut_ad(n_recs);
+ ut_ad(n_recs > 1 || page_get_page_no(block->frame) == index->page);
+ /* This could not be done before page_zip_dir_find(). */
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ block->frame);
+ mtr->write<2>(*block, page_n_recs, n_recs - 1U);
+ memcpy_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page_zip->data, page_n_recs,
+ 2);
+
+ byte *slot_free;
+
+ if (UNIV_UNLIKELY(!free))
+ /* Make the last slot the start of the free list. */
+ slot_free= page_zip->data + page_zip_get_size(page_zip) -
+ PAGE_ZIP_DIR_SLOT_SIZE * (page_dir_get_n_heap(page_zip->data) -
+ PAGE_HEAP_NO_USER_LOW);
+ else
+ {
+ slot_free= page_zip_dir_find_free(page_zip, page_offset(free));
+ ut_a(slot_free < slot_rec);
+ /* Grow the free list by one slot by moving the start. */
+ slot_free+= PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+
+ const ulint slot_len= slot_rec > slot_free ? ulint(slot_rec - slot_free) : 0;
+ if (slot_len)
+ {
+ memmove_aligned<2>(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, slot_free,
+ slot_len);
+ mtr->memmove(*block, (slot_free - page_zip->data) + PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free - page_zip->data, slot_len);
+ }
+
+ /* Write the entry for the deleted record.
+ The "owned" and "deleted" flags will be cleared. */
+ mach_write_to_2(slot_free, page_offset(rec));
+ mtr->zmemcpy(*block, slot_free - page_zip->data, 2);
+
+ if (const ulint n_ext= rec_offs_n_extern(offsets))
+ {
+ ut_ad(index->is_primary());
+ ut_ad(page_is_leaf(block->frame));
+
+ /* Shift and zero fill the array of BLOB pointers. */
+ ulint blob_no = page_zip_get_n_prev_extern(page_zip, rec, index);
+ ut_a(blob_no + n_ext <= page_zip->n_blobs);
+
+ byte *externs= page_zip->data + page_zip_get_size(page_zip) -
+ (page_dir_get_n_heap(block->frame) - PAGE_HEAP_NO_USER_LOW) *
+ PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
+ byte *ext_end= externs - page_zip->n_blobs * FIELD_REF_SIZE;
+
+ /* Shift and zero fill the array. */
+ if (const ulint ext_len= ulint(page_zip->n_blobs - n_ext - blob_no) *
+ BTR_EXTERN_FIELD_REF_SIZE)
+ {
+ memmove(ext_end + n_ext * FIELD_REF_SIZE, ext_end, ext_len);
+ mtr->memmove(*block, (ext_end - page_zip->data) + n_ext * FIELD_REF_SIZE,
+ ext_end - page_zip->data, ext_len);
+ }
+ memset(ext_end, 0, n_ext * FIELD_REF_SIZE);
+ mtr->memset(*block, ext_end - page_zip->data, n_ext * FIELD_REF_SIZE, 0);
+ page_zip->n_blobs = (page_zip->n_blobs - n_ext) & ((1U << 12) - 1);
+ }
+
+ /* The compression algorithm expects info_bits and n_owned
+ to be 0 for deleted records. */
+ rec[-REC_N_NEW_EXTRA_BYTES]= 0; /* info_bits and n_owned */
+
+ page_zip_clear_rec(block, rec, index, offsets, mtr);
+}
+
+/**********************************************************************//**
+Reorganize and compress a page. This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, redo log will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@retval true on success
+@retval false on failure; the block will be left intact */
+bool
+page_zip_reorganize(
+ buf_block_t* block, /*!< in/out: page with compressed page;
+ on the compressed page, in: size;
+ out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ ulint z_level,/*!< in: compression level */
+ mtr_t* mtr, /*!< in: mini-transaction */
+ bool restore)/*!< whether to restore on failure */
+{
+ page_t* page = buf_block_get_frame(block);
+ buf_block_t* temp_block;
+ page_t* temp_page;
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(block->page.zip.data);
+ ut_ad(page_is_comp(page));
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(!index->table->is_temporary());
+ /* Note that page_zip_validate(page_zip, page, index) may fail here. */
+ MEM_CHECK_DEFINED(page, srv_page_size);
+ MEM_CHECK_DEFINED(buf_block_get_page_zip(block)->data,
+ page_zip_get_size(buf_block_get_page_zip(block)));
+
+ /* Disable logging */
+ mtr_log_t log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+ temp_block = buf_block_alloc();
+ btr_search_drop_page_hash_index(block);
+ temp_page = temp_block->frame;
+
+ /* Copy the old page to temporary space */
+ memcpy_aligned<UNIV_PAGE_SIZE_MIN>(temp_block->frame, block->frame,
+ srv_page_size);
+
+	/* Recreate the page: note that global data on the page (possible
+	segment headers, next page-field, etc.) is preserved intact */
+
+ page_create(block, mtr, true);
+ if (index->is_spatial()) {
+ mach_write_to_2(FIL_PAGE_TYPE + page, FIL_PAGE_RTREE);
+ memcpy_aligned<2>(block->page.zip.data + FIL_PAGE_TYPE,
+ page + FIL_PAGE_TYPE, 2);
+ memset(FIL_RTREE_SPLIT_SEQ_NUM + page, 0, 8);
+ memset(FIL_RTREE_SPLIT_SEQ_NUM + block->page.zip.data, 0, 8);
+ }
+
+ /* Copy the records from the temporary space to the recreated page;
+ do not copy the lock bits yet */
+
+ page_copy_rec_list_end_no_locks(block, temp_block,
+ page_get_infimum_rec(temp_page),
+ index, mtr);
+
+ /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
+ memcpy_aligned<8>(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
+ temp_page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 8);
+ /* PAGE_MAX_TRX_ID must be set on secondary index leaf pages. */
+ ut_ad(dict_index_is_clust(index) || !page_is_leaf(temp_page)
+ || page_get_max_trx_id(page) != 0);
+ /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
+ clustered index root pages. */
+ ut_ad(page_get_max_trx_id(page) == 0
+ || (dict_index_is_clust(index)
+ ? !page_has_siblings(temp_page)
+ : page_is_leaf(temp_page)));
+
+ /* Restore logging. */
+ mtr_set_log_mode(mtr, log_mode);
+
+ if (!page_zip_compress(block, index, z_level, mtr)) {
+ if (restore) {
+ /* Restore the old page and exit. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ /* Check that the bytes that we skip are identical. */
+ ut_a(!memcmp(page, temp_page, PAGE_HEADER));
+ ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
+ PAGE_HEADER + PAGE_N_RECS + temp_page,
+ PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
+ ut_a(!memcmp(srv_page_size - FIL_PAGE_DATA_END + page,
+ srv_page_size - FIL_PAGE_DATA_END
+ + temp_page,
+ FIL_PAGE_DATA_END));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
+ PAGE_N_RECS - PAGE_N_DIR_SLOTS);
+ memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
+ srv_page_size - PAGE_DATA - FIL_PAGE_DATA_END);
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(page, temp_page, srv_page_size));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+ }
+
+ buf_block_free(temp_block);
+ return false;
+ }
+
+ lock_move_reorganize_page(block, temp_block);
+
+ buf_block_free(temp_block);
+ return true;
+}
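+
+/* Editor's aside (illustration, not part of the source; names are
+invented): the function above follows a copy-aside / rebuild /
+restore-on-failure pattern. Reduced to a toy buffer it looks like the
+sketch below, with rebuild() standing in for page_create() +
+page_copy_rec_list_end_no_locks() + page_zip_compress(); the real
+function makes the restore step optional via its "restore" flag. */
+#if 0 /* illustration only */
+# include <cstdlib>
+# include <cstring>
+static bool toy_reorganize(unsigned char *frame, size_t size,
+			   bool (*rebuild)(unsigned char *, size_t))
+{
+	unsigned char *temp = static_cast<unsigned char*>(malloc(size));
+	if (!temp) return false;
+	memcpy(temp, frame, size);	/* copy the old page aside */
+	if (!rebuild(frame, size)) {	/* recreate and recompress */
+		memcpy(frame, temp, size);	/* restore on failure */
+		free(temp);
+		return false;
+	}
+	free(temp);
+	return true;
+}
+#endif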
+
+/**********************************************************************//**
+Copy the records of a page byte for byte. Do not copy the page header
+or trailer, except those B-tree header fields that are directly
+related to the storage of records. Also copy PAGE_MAX_TRX_ID.
+NOTE: The caller must update the lock table and the adaptive hash index. */
+void
+page_zip_copy_recs(
+ buf_block_t* block, /*!< in/out: buffer block */
+ const page_zip_des_t* src_zip, /*!< in: compressed page */
+ const page_t* src, /*!< in: page */
+ dict_index_t* index, /*!< in: index of the B-tree */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ page_t* page = block->frame;
+ page_zip_des_t* page_zip = &block->page.zip;
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr->memo_contains_page_flagged(src, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(!index->table->is_temporary());
+#ifdef UNIV_ZIP_DEBUG
+ /* The B-tree operations that call this function may set
+ FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag
+ mismatch. A strict page_zip_validate() will be executed later
+ during the B-tree operations. */
+ ut_a(page_zip_validate_low(src_zip, src, index, TRUE));
+#endif /* UNIV_ZIP_DEBUG */
+ ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip));
+ if (UNIV_UNLIKELY(src_zip->n_blobs)) {
+ ut_a(page_is_leaf(src));
+ ut_a(dict_index_is_clust(index));
+ }
+
+ MEM_CHECK_ADDRESSABLE(page, srv_page_size);
+ MEM_CHECK_ADDRESSABLE(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(src, srv_page_size);
+ MEM_CHECK_DEFINED(src_zip->data, page_zip_get_size(page_zip));
+
+ /* Copy those B-tree page header fields that are related to
+ the records stored in the page. Also copy the field
+ PAGE_MAX_TRX_ID. Skip the rest of the page header and
+ trailer. On the compressed page, there is no trailer. */
+ compile_time_assert(PAGE_MAX_TRX_ID + 8 == PAGE_HEADER_PRIV_END);
+ memcpy_aligned<2>(PAGE_HEADER + page, PAGE_HEADER + src,
+ PAGE_HEADER_PRIV_END);
+ memcpy_aligned<2>(PAGE_DATA + page, PAGE_DATA + src,
+ srv_page_size - (PAGE_DATA + FIL_PAGE_DATA_END));
+ memcpy_aligned<2>(PAGE_HEADER + page_zip->data,
+ PAGE_HEADER + src_zip->data,
+ PAGE_HEADER_PRIV_END);
+ memcpy_aligned<2>(PAGE_DATA + page_zip->data,
+ PAGE_DATA + src_zip->data,
+ page_zip_get_size(page_zip) - PAGE_DATA);
+
+ if (dict_index_is_clust(index)) {
+ /* Reset the PAGE_ROOT_AUTO_INC field when copying
+ from a root page. */
+ memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC
+ + page, 0, 8);
+ memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC
+ + page_zip->data, 0, 8);
+ } else {
+ /* The PAGE_MAX_TRX_ID must be nonzero on leaf pages
+ of secondary indexes, and 0 on others. */
+ ut_ad(!page_is_leaf(src) == !page_get_max_trx_id(src));
+ }
+
+ /* Copy all fields of src_zip to page_zip, except the pointer
+ to the compressed data page. */
+ {
+ page_zip_t* data = page_zip->data;
+ memcpy(page_zip, src_zip, sizeof *page_zip);
+ page_zip->data = data;
+ }
+ ut_ad(page_zip_get_trailer_len(page_zip, dict_index_is_clust(index))
+ + page_zip->m_end < page_zip_get_size(page_zip));
+
+ if (!page_is_leaf(src)
+ && UNIV_UNLIKELY(!page_has_prev(src))
+ && UNIV_LIKELY(page_has_prev(page))) {
+ /* Clear the REC_INFO_MIN_REC_FLAG of the first user record. */
+ ulint offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+ TRUE);
+ if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) {
+ rec_t* rec = page + offs;
+ ut_a(rec[-REC_N_NEW_EXTRA_BYTES]
+ & REC_INFO_MIN_REC_FLAG);
+ rec[-REC_N_NEW_EXTRA_BYTES]
+ &= byte(~REC_INFO_MIN_REC_FLAG);
+ }
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ page_zip_compress_write_log(block, index, mtr);
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Calculate the compressed page checksum.
+@param[in] data compressed page
+@param[in] size size of compressed page
+@param[in] algo algorithm to use
+@return page checksum */
+uint32_t
+page_zip_calc_checksum(
+ const void* data,
+ ulint size,
+ srv_checksum_algorithm_t algo)
+{
+ uLong adler;
+ const Bytef* s = static_cast<const byte*>(data);
+
+ /* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN,
+ and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */
+
+ switch (algo) {
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ return ut_crc32(s + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET)
+ ^ ut_crc32(s + FIL_PAGE_TYPE, 2)
+ ^ ut_crc32(s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ adler = adler32(0L, s + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET);
+ adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
+ adler = adler32(
+ adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ static_cast<uInt>(size)
+ - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ return(uint32_t(adler));
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ return(BUF_NO_CHECKSUM_MAGIC);
+	/* no default so the compiler will emit a warning if a new enum
+	value is added and is not handled here */
+ }
+
+ ut_error;
+ return(0);
+}
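+
+/* Editor's aside (illustration, not part of the source): the INNODB
+branch above, written out with zlib's adler32() and the classic
+FIL_PAGE_* byte offsets hard-coded (4 = FIL_PAGE_OFFSET, 16 =
+FIL_PAGE_LSN, 24 = FIL_PAGE_TYPE, 34 = FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID;
+offsets quoted from memory, treat them as assumptions): */
+#if 0 /* illustration only */
+# include <zlib.h>
+# include <stdint.h>
+static uint32_t toy_page_zip_adler(const unsigned char *page, uInt size)
+{
+	/* Hash bytes 4..15, 24..25 and 34..(size-1); this skips the
+	stored checksum (0..3), FIL_PAGE_LSN (16..23) and
+	FIL_PAGE_FILE_FLUSH_LSN (26..33), exactly as above. */
+	uLong a = adler32(0L, page + 4, 16 - 4);
+	a = adler32(a, page + 24, 2);
+	a = adler32(a, page + 34, size - 34);
+	return uint32_t(a);
+}
+#endif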
+
+/** Validate the checksum on a ROW_FORMAT=COMPRESSED page.
+@param data ROW_FORMAT=COMPRESSED page
+@param size size of the page, in bytes
+@return whether the stored checksum matches innodb_checksum_algorithm */
+bool page_zip_verify_checksum(const byte *data, size_t size)
+{
+ const srv_checksum_algorithm_t curr_algo =
+ static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
+
+ if (curr_algo == SRV_CHECKSUM_ALGORITHM_NONE) {
+ return true;
+ }
+
+ if (buf_is_zeroes(span<const byte>(data, size))) {
+ return true;
+ }
+
+ const uint32_t stored = mach_read_from_4(
+ data + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ uint32_t calc = page_zip_calc_checksum(data, size, curr_algo);
+
+#ifdef UNIV_INNOCHECKSUM
+ if (log_file) {
+ fprintf(log_file, "page::" UINT32PF ";"
+ " %s checksum: calculated = " UINT32PF ";"
+ " recorded = " UINT32PF "\n", cur_page_num,
+			buf_checksum_algorithm_name(curr_algo),
+ calc, stored);
+ }
+
+ if (!strict_verify) {
+ const uint32_t crc32 = page_zip_calc_checksum(
+ data, size, SRV_CHECKSUM_ALGORITHM_CRC32);
+
+ if (log_file) {
+ fprintf(log_file, "page::" UINT32PF ": crc32 checksum:"
+ " calculated = " UINT32PF "; recorded = " UINT32PF "\n",
+ cur_page_num, crc32, stored);
+ fprintf(log_file, "page::" UINT32PF ": none checksum:"
+ " calculated = %lu; recorded = " UINT32PF "\n",
+ cur_page_num, BUF_NO_CHECKSUM_MAGIC, stored);
+ }
+ }
+#endif /* UNIV_INNOCHECKSUM */
+
+	if (stored == calc) {
+		return true;
+	}
+
+ switch (curr_algo) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		return false;
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+		if (stored == BUF_NO_CHECKSUM_MAGIC) {
+			return true;
+		}
+
+ return stored == page_zip_calc_checksum(
+ data, size, SRV_CHECKSUM_ALGORITHM_INNODB);
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+		if (stored == BUF_NO_CHECKSUM_MAGIC) {
+			return true;
+		}
+
+ return stored == page_zip_calc_checksum(
+ data, size, SRV_CHECKSUM_ALGORITHM_CRC32);
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+		return true;
+ }
+
+	return false;
+}
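+
+/* Editor's aside (illustration, not part of the source; names are
+invented): the switch above implements a crossover acceptance matrix.
+Strict settings accept only their own algorithm; the non-strict CRC32
+and INNODB settings also accept each other's checksums and
+BUF_NO_CHECKSUM_MAGIC, so pages written under the other setting still
+verify. Condensed into a toy predicate: */
+#if 0 /* illustration only */
+# include <stdint.h>
+static bool toy_zip_checksum_ok(uint32_t stored, uint32_t own,
+				uint32_t other, uint32_t magic,
+				bool strict)
+{
+	if (stored == own) return true;	/* exact match always passes */
+	if (strict) return false;	/* strict_* accept nothing else */
+	return stored == magic || stored == other; /* legacy crossover */
+}
+#endif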
diff --git a/storage/innobase/pars/lexyy.cc b/storage/innobase/pars/lexyy.cc
new file mode 100644
index 00000000..e57a28ce
--- /dev/null
+++ b/storage/innobase/pars/lexyy.cc
@@ -0,0 +1,2841 @@
+#include "univ.i"
+#line 2 "lexyy.cc"
+
+#line 4 "lexyy.cc"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 6
+#define YY_FLEX_SUBMINOR_VERSION 4
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#ifndef SIZE_MAX
+#define SIZE_MAX (~(size_t)0)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+/* begin standard C++ headers. */
+
+/* TODO: this is always defined, so inline it */
+#define yyconst const
+
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define yynoreturn __attribute__((__noreturn__))
+#else
+#define yynoreturn
+#endif
+
+/* Returned upon end-of-file. */
+#define YY_NULL 0
+
+/* Promotes a possibly negative, possibly signed char to an
+ * integer in range [0..255] for use as an array index.
+ */
+#define YY_SC_TO_UI(c) ((YY_CHAR) (c))
+
+/* Enter a start condition. This macro really ought to take a parameter,
+ * but we do it the disgusting crufty way forced on us by the ()-less
+ * definition of BEGIN.
+ */
+#define BEGIN (yy_start) = 1 + 2 *
+/* Translate the current start state into a value that can be later handed
+ * to BEGIN to return to the state. The YYSTATE alias is for lex
+ * compatibility.
+ */
+#define YY_START (((yy_start) - 1) / 2)
+#define YYSTATE YY_START
+/* Action number for EOF rule of a given start state. */
+#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
+/* Special action meaning "start processing a new file". */
+#define YY_NEW_FILE yyrestart( yyin )
+#define YY_END_OF_BUFFER_CHAR 0
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case,
+ * and the same doubling applies in the __ia64__ case.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+/* The state buf must be large enough to hold one state per character in the main buffer.
+ */
+#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type))
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+
+
+
+
+#define EOB_ACT_CONTINUE_SCAN 0
+#define EOB_ACT_END_OF_FILE 1
+#define EOB_ACT_LAST_MATCH 2
+
+ #define YY_LESS_LINENO(n)
+ #define YY_LINENO_REWIND_TO(ptr)
+
+/* Return all but the first "n" matched characters back to the input stream. */
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ *yy_cp = (yy_hold_char); \
+ YY_RESTORE_YY_MORE_OFFSET \
+ (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \
+ YY_DO_BEFORE_ACTION; /* set up yytext again */ \
+ } \
+ while ( 0 )
+#define unput(c) yyunput( c, (yytext_ptr) )
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+ {
+ FILE *yy_input_file;
+
+ char *yy_ch_buf; /* input buffer */
+ char *yy_buf_pos; /* current position in input buffer */
+
+ /* Size of input buffer in bytes, not including room for EOB
+ * characters.
+ */
+ int yy_buf_size;
+
+ /* Number of characters read into yy_ch_buf, not including EOB
+ * characters.
+ */
+ int yy_n_chars;
+
+ /* Whether we "own" the buffer - i.e., we know we created it,
+ * and can realloc() it to grow it, and should free() it to
+ * delete it.
+ */
+ int yy_is_our_buffer;
+
+ /* Whether this is an "interactive" input source; if so, and
+ * if we're using stdio for input, then we want to use getc()
+ * instead of fread(), to make sure we stop fetching input after
+ * each newline.
+ */
+ int yy_is_interactive;
+
+ /* Whether we're considered to be at the beginning of a line.
+ * If so, '^' rules will be active on the next match, otherwise
+ * not.
+ */
+ int yy_at_bol;
+
+ int yy_bs_lineno; /**< The line count. */
+ int yy_bs_column; /**< The column count. */
+
+ /* Whether to try to fill the input buffer when we reach the
+ * end of it.
+ */
+ int yy_fill_buffer;
+
+ int yy_buffer_status;
+
+#define YY_BUFFER_NEW 0
+#define YY_BUFFER_NORMAL 1
+ /* When an EOF's been seen but there's still some text to process
+ * then we mark the buffer as YY_EOF_PENDING, to indicate that we
+ * shouldn't try reading from the input source any more. We might
+ * still have a bunch of tokens to match, though, because of
+ * possible backing-up.
+ *
+ * When we actually see the EOF, we change the status to "new"
+ * (via yyrestart()), so that the user can continue scanning by
+ * just pointing yyin at a new input file.
+ */
+#define YY_BUFFER_EOF_PENDING 2
+
+ };
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+/* Stack of input buffers. */
+static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */
+static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */
+static YY_BUFFER_STATE * yy_buffer_stack = NULL; /**< Stack as an array. */
+
+/* We provide macros for accessing buffer states in case in the
+ * future we want to put the buffer states in a more general
+ * "scanner state".
+ *
+ * Returns the top of the stack, or NULL.
+ */
+#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \
+ ? (yy_buffer_stack)[(yy_buffer_stack_top)] \
+ : 0)
+/* Same as previous macro, but useful when we know that the buffer stack is not
+ * NULL or when we need an lvalue. For internal use only.
+ */
+#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)]
+
+/* yy_hold_char holds the character lost when yytext is formed. */
+static char yy_hold_char;
+static int yy_n_chars; /* number of characters read into yy_ch_buf */
+static int yyleng;
+
+/* Points to current character in buffer. */
+static char *yy_c_buf_p = NULL;
+static int yy_init = 0; /* whether we need to initialize */
+static int yy_start = 0; /* start state number */
+
+/* Flag which is used to allow yywrap()'s to do buffer switches
+ * instead of setting up a fresh yyin. A bit of a hack ...
+ */
+static int yy_did_buffer_switch_on_eof;
+
+static void yyrestart ( FILE *input_file );
+MY_ATTRIBUTE((unused)) static void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer );
+static YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size );
+static void yy_delete_buffer ( YY_BUFFER_STATE b );
+static void yy_flush_buffer ( YY_BUFFER_STATE b );
+MY_ATTRIBUTE((unused)) static void yypush_buffer_state ( YY_BUFFER_STATE new_buffer );
+MY_ATTRIBUTE((unused)) static void yypop_buffer_state ( void );
+
+static void yyensure_buffer_stack ( void );
+static void yy_load_buffer_state ( void );
+static void yy_init_buffer ( YY_BUFFER_STATE b, FILE *file );
+#define YY_FLUSH_BUFFER yy_flush_buffer( YY_CURRENT_BUFFER )
+
+YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size );
+YY_BUFFER_STATE yy_scan_string ( const char *yy_str );
+YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len );
+
+static void *yyalloc ( yy_size_t );
+static void *yyrealloc ( void *, yy_size_t );
+static void yyfree ( void * );
+
+#define yy_new_buffer yy_create_buffer
+#define yy_set_interactive(is_interactive) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){ \
+ yyensure_buffer_stack (); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ yy_create_buffer( yyin, YY_BUF_SIZE ); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \
+ }
+#define yy_set_bol(at_bol) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){\
+ yyensure_buffer_stack (); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ yy_create_buffer( yyin, YY_BUF_SIZE ); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \
+ }
+#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
+
+/* Begin user sect3 */
+
+#define yywrap() (/*CONSTCOND*/1)
+#define YY_SKIP_YYWRAP
+typedef flex_uint8_t YY_CHAR;
+
+static FILE *yyin = NULL, *yyout = NULL;
+
+typedef int yy_state_type;
+
+
+static int yylineno = 1;
+
+
+#ifdef yytext_ptr
+#undef yytext_ptr
+#endif
+#define yytext_ptr yytext
+
+static yy_state_type yy_get_previous_state ( void );
+static yy_state_type yy_try_NUL_trans ( yy_state_type current_state );
+static int yy_get_next_buffer ( void );
+static void yynoreturn yy_fatal_error ( const char* msg );
+
+/* Done after the current pattern has been matched and before the
+ * corresponding action - sets up yytext.
+ */
+#define YY_DO_BEFORE_ACTION \
+ (yytext_ptr) = yy_bp; \
+ yyleng = (int) (yy_cp - yy_bp); \
+ (yy_hold_char) = *yy_cp; \
+ *yy_cp = '\0'; \
+ (yy_c_buf_p) = yy_cp;
+#define YY_NUM_RULES 102
+#define YY_END_OF_BUFFER 103
+/* This struct is not used in this scanner,
+ but its presence is necessary. */
+struct yy_trans_info
+ {
+ flex_int32_t yy_verify;
+ flex_int32_t yy_nxt;
+ };
+static const flex_int16_t yy_accept[307] =
+ { 0,
+ 0, 0, 97, 97, 0, 0, 0, 0, 103, 101,
+ 100, 100, 8, 101, 92, 5, 81, 87, 90, 88,
+ 85, 89, 101, 91, 1, 101, 86, 84, 82, 83,
+ 95, 74, 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
+ 93, 94, 97, 98, 6, 7, 9, 10, 100, 4,
+ 76, 96, 2, 1, 3, 77, 78, 80, 79, 0,
+ 74, 0, 74, 74, 74, 74, 36, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
+ 23, 17, 20, 74, 74, 74, 74, 74, 74, 46,
+
+ 52, 74, 14, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 74, 74, 74, 97, 98,
+ 98, 99, 6, 7, 9, 10, 2, 0, 75, 13,
+ 37, 74, 74, 74, 74, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 22, 74, 74, 34, 74,
+ 74, 74, 74, 18, 74, 74, 74, 74, 74, 15,
+ 74, 74, 74, 74, 74, 74, 74, 43, 74, 12,
+ 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
+ 0, 75, 74, 74, 19, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 38, 25, 74, 67, 74,
+
+ 32, 74, 74, 74, 74, 40, 74, 72, 69, 27,
+ 71, 74, 11, 55, 74, 74, 74, 74, 74, 74,
+ 74, 74, 24, 74, 74, 74, 74, 74, 74, 66,
+ 0, 21, 74, 57, 74, 74, 74, 31, 74, 74,
+ 74, 74, 74, 26, 56, 74, 49, 74, 62, 74,
+ 74, 35, 74, 74, 74, 74, 70, 74, 48, 74,
+ 74, 74, 74, 33, 28, 0, 73, 74, 64, 61,
+ 47, 74, 54, 74, 44, 74, 39, 63, 74, 74,
+ 29, 74, 30, 60, 74, 50, 42, 41, 74, 45,
+ 53, 74, 74, 74, 74, 74, 74, 68, 58, 74,
+
+ 65, 74, 51, 16, 59, 0
+ } ;
+
+static const YY_CHAR yy_ec[256] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 2, 1, 4, 5, 6, 7, 1, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 34,
+ 1, 1, 1, 1, 50, 1, 34, 34, 34, 34,
+
+ 34, 34, 34, 34, 34, 34, 34, 51, 34, 34,
+ 34, 34, 52, 34, 53, 34, 34, 34, 34, 34,
+ 34, 34, 54, 1, 55, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1
+ } ;
+
+static const YY_CHAR yy_meta[56] =
+ { 0,
+ 1, 1, 1, 2, 3, 1, 1, 4, 1, 1,
+ 5, 1, 1, 1, 1, 6, 7, 1, 1, 1,
+ 8, 1, 1, 6, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 1, 1
+ } ;
+
+static const flex_int16_t yy_base[320] =
+ { 0,
+ 0, 0, 262, 259, 249, 244, 239, 234, 236, 960,
+ 54, 56, 960, 0, 960, 960, 960, 960, 960, 960,
+ 960, 960, 217, 220, 45, 186, 960, 42, 960, 184,
+ 960, 45, 49, 55, 51, 65, 80, 50, 69, 94,
+ 90, 92, 104, 60, 114, 116, 131, 134, 135, 149,
+ 960, 960, 0, 61, 0, 194, 0, 197, 133, 0,
+ 960, 960, 163, 53, 143, 960, 960, 960, 960, 147,
+ 125, 123, 138, 151, 152, 153, 155, 166, 169, 173,
+ 170, 171, 176, 180, 193, 182, 200, 204, 206, 209,
+ 210, 211, 213, 224, 225, 226, 235, 240, 242, 245,
+
+ 251, 252, 255, 256, 258, 261, 270, 274, 272, 277,
+ 289, 288, 276, 294, 295, 300, 304, 305, 0, 79,
+ 110, 960, 0, 116, 0, 113, 98, 58, 0, 306,
+ 315, 316, 318, 319, 322, 328, 329, 332, 334, 338,
+ 344, 353, 351, 354, 366, 360, 367, 369, 376, 378,
+ 381, 385, 388, 382, 394, 400, 403, 404, 406, 407,
+ 410, 417, 423, 424, 426, 429, 433, 440, 442, 443,
+ 444, 445, 454, 456, 459, 461, 472, 473, 474, 477,
+ 53, 0, 475, 478, 479, 490, 502, 504, 505, 507,
+ 508, 509, 511, 518, 520, 523, 524, 525, 529, 538,
+
+ 541, 542, 543, 545, 547, 544, 556, 557, 558, 559,
+ 560, 569, 572, 574, 578, 581, 579, 583, 588, 590,
+ 600, 601, 602, 607, 611, 613, 612, 618, 622, 629,
+ 41, 634, 636, 638, 639, 643, 645, 648, 649, 650,
+ 655, 659, 661, 660, 670, 675, 676, 679, 680, 682,
+ 686, 689, 691, 696, 693, 700, 705, 706, 709, 711,
+ 712, 716, 722, 723, 726, 72, 727, 736, 737, 738,
+ 739, 740, 742, 743, 752, 753, 755, 757, 758, 759,
+ 764, 770, 769, 771, 774, 784, 785, 786, 787, 789,
+ 790, 791, 796, 801, 802, 803, 806, 807, 812, 817,
+
+ 816, 823, 826, 828, 832, 960, 872, 881, 890, 893,
+ 896, 900, 909, 918, 927, 936, 943, 947, 950
+ } ;
+
+static const flex_int16_t yy_def[320] =
+ { 0,
+ 306, 1, 307, 307, 308, 308, 309, 309, 306, 306,
+ 306, 306, 306, 310, 306, 306, 306, 306, 306, 306,
+ 306, 306, 306, 306, 306, 311, 306, 306, 306, 306,
+ 306, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 306, 306, 313, 314, 315, 306, 316, 306, 306, 310,
+ 306, 306, 306, 306, 311, 306, 306, 306, 306, 317,
+ 312, 318, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 313, 314,
+ 314, 306, 315, 306, 316, 306, 306, 306, 319, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 306, 319, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 306, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 306, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+ 312, 312, 312, 312, 312, 312, 312, 312, 312, 312,
+
+ 312, 312, 312, 312, 312, 0, 306, 306, 306, 306,
+ 306, 306, 306, 306, 306, 306, 306, 306, 306
+ } ;
+
+static const flex_int16_t yy_nxt[1016] =
+ { 0,
+ 10, 11, 12, 13, 10, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31, 10, 32, 33, 34, 35, 36, 37,
+ 38, 38, 39, 38, 38, 40, 41, 42, 43, 44,
+ 38, 45, 46, 47, 48, 49, 50, 38, 38, 38,
+ 38, 38, 38, 51, 52, 59, 59, 59, 59, 63,
+ 70, 64, 67, 68, 70, 70, 70, 63, 72, 64,
+ 70, 121, 72, 72, 72, 70, 122, 75, 72, 83,
+ 70, 76, 73, 72, 70, 129, 78, 74, 72, 306,
+ 79, 266, 72, 80, 306, 70, 81, 77, 91, 82,
+
+ 84, 104, 85, 72, 231, 70, 92, 70, 87, 70,
+ 181, 93, 86, 72, 127, 72, 126, 72, 88, 70,
+ 121, 89, 94, 124, 90, 122, 95, 72, 97, 70,
+ 98, 70, 96, 100, 59, 59, 99, 72, 70, 72,
+ 70, 101, 105, 102, 107, 103, 70, 108, 72, 70,
+ 70, 128, 106, 70, 72, 111, 109, 72, 72, 116,
+ 110, 72, 112, 306, 70, 130, 70, 70, 70, 113,
+ 70, 114, 72, 115, 72, 72, 72, 131, 72, 127,
+ 117, 70, 132, 133, 70, 70, 70, 118, 70, 72,
+ 134, 70, 72, 72, 72, 70, 72, 70, 140, 72,
+
+ 126, 124, 142, 72, 69, 72, 66, 135, 70, 137,
+ 138, 143, 141, 136, 147, 70, 72, 139, 144, 70,
+ 146, 70, 145, 72, 70, 70, 70, 72, 70, 72,
+ 62, 61, 72, 72, 72, 306, 72, 58, 152, 70,
+ 70, 70, 58, 148, 150, 149, 151, 72, 72, 72,
+ 70, 56, 157, 153, 154, 70, 56, 70, 72, 156,
+ 70, 155, 159, 72, 158, 72, 70, 70, 72, 54,
+ 70, 70, 54, 70, 72, 72, 70, 161, 72, 72,
+ 162, 72, 163, 160, 72, 70, 306, 70, 306, 70,
+ 306, 70, 70, 72, 164, 72, 166, 72, 169, 72,
+
+ 72, 165, 171, 70, 70, 167, 306, 170, 306, 70,
+ 70, 72, 72, 168, 172, 70, 173, 72, 72, 70,
+ 70, 70, 176, 72, 306, 174, 175, 72, 72, 72,
+ 70, 70, 178, 70, 70, 177, 179, 70, 72, 72,
+ 306, 72, 72, 70, 70, 72, 180, 70, 183, 70,
+ 184, 72, 72, 70, 306, 72, 306, 72, 189, 70,
+ 185, 72, 191, 306, 186, 188, 70, 72, 70, 70,
+ 187, 190, 306, 306, 72, 70, 72, 72, 306, 195,
+ 196, 70, 70, 72, 70, 192, 193, 306, 194, 72,
+ 72, 70, 72, 70, 197, 200, 70, 70, 198, 72,
+
+ 70, 72, 306, 70, 72, 72, 306, 202, 72, 70,
+ 199, 72, 306, 203, 201, 70, 204, 72, 70, 70,
+ 206, 70, 70, 72, 207, 70, 72, 72, 208, 72,
+ 72, 205, 70, 72, 211, 306, 212, 209, 70, 70,
+ 72, 70, 306, 210, 70, 213, 72, 72, 70, 72,
+ 216, 215, 72, 306, 214, 70, 72, 70, 70, 70,
+ 70, 219, 306, 72, 218, 72, 72, 72, 72, 70,
+ 217, 70, 306, 306, 70, 306, 70, 72, 306, 72,
+ 222, 224, 72, 220, 72, 226, 221, 70, 70, 70,
+ 70, 223, 70, 70, 70, 72, 72, 72, 72, 225,
+
+ 72, 72, 72, 306, 306, 70, 306, 306, 306, 229,
+ 306, 230, 232, 72, 228, 233, 227, 70, 234, 70,
+ 70, 306, 70, 70, 70, 72, 70, 72, 72, 237,
+ 72, 72, 72, 70, 72, 70, 236, 240, 70, 70,
+ 70, 72, 242, 72, 70, 235, 72, 72, 72, 241,
+ 238, 239, 72, 70, 244, 306, 70, 70, 70, 70,
+ 70, 72, 70, 243, 72, 72, 72, 72, 72, 245,
+ 72, 70, 70, 70, 70, 70, 306, 306, 306, 72,
+ 72, 72, 72, 72, 70, 246, 248, 70, 249, 70,
+ 247, 306, 72, 70, 70, 72, 70, 72, 70, 250,
+
+ 306, 72, 72, 70, 72, 70, 72, 251, 255, 253,
+ 306, 72, 306, 72, 256, 70, 70, 70, 257, 252,
+ 254, 306, 70, 72, 72, 72, 70, 70, 70, 259,
+ 72, 306, 306, 70, 72, 72, 72, 70, 306, 260,
+ 263, 72, 306, 258, 70, 72, 264, 306, 306, 70,
+ 265, 70, 72, 70, 70, 261, 262, 72, 70, 72,
+ 70, 72, 72, 70, 70, 70, 72, 268, 72, 306,
+ 70, 72, 72, 72, 70, 70, 70, 271, 72, 267,
+ 306, 306, 72, 72, 72, 70, 269, 272, 270, 275,
+ 70, 70, 306, 72, 70, 70, 273, 70, 72, 72,
+
+ 274, 70, 72, 72, 70, 72, 70, 276, 70, 72,
+ 306, 70, 72, 278, 72, 70, 72, 282, 280, 72,
+ 70, 70, 277, 72, 70, 306, 70, 70, 72, 72,
+ 279, 70, 72, 281, 72, 72, 306, 70, 70, 72,
+ 286, 70, 70, 283, 287, 72, 72, 284, 285, 72,
+ 72, 70, 70, 70, 70, 70, 306, 70, 70, 72,
+ 72, 72, 72, 72, 288, 72, 72, 70, 70, 306,
+ 70, 291, 70, 70, 70, 72, 72, 289, 72, 70,
+ 72, 72, 72, 290, 70, 70, 70, 72, 306, 70,
+ 306, 292, 72, 72, 72, 293, 295, 72, 296, 70,
+
+ 70, 70, 70, 294, 70, 70, 70, 72, 72, 72,
+ 72, 70, 72, 72, 72, 297, 70, 70, 70, 72,
+ 306, 70, 70, 299, 72, 72, 72, 70, 298, 72,
+ 72, 70, 70, 303, 306, 72, 301, 306, 70, 72,
+ 72, 70, 300, 70, 302, 304, 72, 70, 306, 72,
+ 306, 72, 306, 306, 306, 72, 306, 306, 306, 306,
+ 306, 306, 306, 306, 306, 306, 306, 306, 306, 306,
+ 306, 305, 53, 53, 53, 53, 53, 53, 53, 53,
+ 53, 55, 55, 55, 55, 55, 55, 55, 55, 55,
+ 57, 57, 57, 57, 57, 57, 57, 57, 57, 60,
+
+ 306, 60, 65, 65, 65, 71, 71, 306, 71, 119,
+ 119, 119, 119, 306, 119, 119, 119, 119, 120, 120,
+ 120, 120, 120, 120, 120, 120, 120, 123, 123, 123,
+ 306, 123, 123, 123, 123, 123, 125, 306, 125, 125,
+ 125, 125, 125, 125, 125, 129, 306, 306, 306, 306,
+ 306, 129, 72, 72, 306, 72, 182, 306, 182, 9,
+ 306, 306, 306, 306, 306, 306, 306, 306, 306, 306,
+ 306, 306, 306, 306, 306, 306, 306, 306, 306, 306,
+ 306, 306, 306, 306, 306, 306, 306, 306, 306, 306,
+ 306, 306, 306, 306, 306, 306, 306, 306, 306, 306,
+
+ 306, 306, 306, 306, 306, 306, 306, 306, 306, 306,
+ 306, 306, 306, 306, 306
+ } ;
+
+static const flex_int16_t yy_chk[1016] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 11, 11, 12, 12, 25,
+ 32, 25, 28, 28, 33, 38, 35, 64, 32, 64,
+ 34, 54, 33, 38, 35, 44, 54, 33, 34, 35,
+ 36, 33, 32, 44, 39, 266, 34, 32, 36, 120,
+ 34, 231, 39, 34, 120, 37, 34, 33, 39, 34,
+
+ 36, 44, 36, 37, 181, 41, 39, 42, 37, 40,
+ 128, 39, 36, 41, 127, 42, 126, 40, 37, 43,
+ 121, 37, 40, 124, 37, 121, 40, 43, 41, 45,
+ 42, 46, 40, 43, 59, 59, 42, 45, 72, 46,
+ 71, 43, 45, 43, 46, 43, 47, 46, 71, 48,
+ 49, 70, 45, 73, 47, 47, 46, 48, 49, 49,
+ 46, 73, 47, 65, 50, 73, 74, 75, 76, 47,
+ 77, 48, 50, 48, 74, 75, 76, 74, 77, 63,
+ 50, 78, 75, 76, 79, 81, 82, 50, 80, 78,
+ 78, 83, 79, 81, 82, 84, 80, 86, 81, 83,
+
+ 58, 56, 83, 84, 30, 86, 26, 79, 85, 80,
+ 80, 83, 82, 79, 86, 87, 85, 80, 83, 88,
+ 85, 89, 84, 87, 90, 91, 92, 88, 93, 89,
+ 24, 23, 90, 91, 92, 9, 93, 8, 92, 94,
+ 95, 96, 7, 87, 89, 88, 90, 94, 95, 96,
+ 97, 6, 96, 92, 92, 98, 5, 99, 97, 95,
+ 100, 94, 97, 98, 96, 99, 101, 102, 100, 4,
+ 103, 104, 3, 105, 101, 102, 106, 99, 103, 104,
+ 102, 105, 103, 98, 106, 107, 0, 109, 0, 108,
+ 0, 113, 110, 107, 104, 109, 106, 108, 108, 113,
+
+ 110, 105, 110, 112, 111, 107, 0, 109, 0, 114,
+ 115, 112, 111, 107, 111, 116, 112, 114, 115, 117,
+ 118, 130, 115, 116, 0, 113, 114, 117, 118, 130,
+ 131, 132, 117, 133, 134, 116, 117, 135, 131, 132,
+ 0, 133, 134, 136, 137, 135, 118, 138, 132, 139,
+ 133, 136, 137, 140, 0, 138, 0, 139, 138, 141,
+ 134, 140, 140, 0, 135, 137, 143, 141, 142, 144,
+ 136, 139, 0, 0, 143, 146, 142, 144, 0, 143,
+ 144, 145, 147, 146, 148, 141, 141, 0, 142, 145,
+ 147, 149, 148, 150, 145, 148, 151, 154, 145, 149,
+
+ 152, 150, 0, 153, 151, 154, 0, 151, 152, 155,
+ 147, 153, 0, 152, 150, 156, 153, 155, 157, 158,
+ 154, 159, 160, 156, 155, 161, 157, 158, 156, 159,
+ 160, 153, 162, 161, 159, 0, 160, 157, 163, 164,
+ 162, 165, 0, 158, 166, 161, 163, 164, 167, 165,
+ 164, 163, 166, 0, 162, 168, 167, 169, 170, 171,
+ 172, 167, 0, 168, 166, 169, 170, 171, 172, 173,
+ 165, 174, 0, 0, 175, 0, 176, 173, 0, 174,
+ 172, 174, 175, 169, 176, 176, 171, 177, 178, 179,
+ 183, 173, 180, 184, 185, 177, 178, 179, 183, 175,
+
+ 180, 184, 185, 0, 0, 186, 0, 0, 0, 179,
+ 0, 180, 183, 186, 178, 184, 177, 187, 186, 188,
+ 189, 0, 190, 191, 192, 187, 193, 188, 189, 189,
+ 190, 191, 192, 194, 193, 195, 188, 192, 196, 197,
+ 198, 194, 194, 195, 199, 187, 196, 197, 198, 193,
+ 190, 191, 199, 200, 198, 0, 201, 202, 203, 206,
+ 204, 200, 205, 195, 201, 202, 203, 206, 204, 200,
+ 205, 207, 208, 209, 210, 211, 0, 0, 0, 207,
+ 208, 209, 210, 211, 212, 202, 204, 213, 205, 214,
+ 203, 0, 212, 215, 217, 213, 216, 214, 218, 207,
+
+ 0, 215, 217, 219, 216, 220, 218, 212, 218, 216,
+ 0, 219, 0, 220, 219, 221, 222, 223, 220, 215,
+ 217, 0, 224, 221, 222, 223, 225, 227, 226, 222,
+ 224, 0, 0, 228, 225, 227, 226, 229, 0, 224,
+ 227, 228, 0, 221, 230, 229, 228, 0, 0, 232,
+ 229, 233, 230, 234, 235, 225, 226, 232, 236, 233,
+ 237, 234, 235, 238, 239, 240, 236, 235, 237, 0,
+ 241, 238, 239, 240, 242, 244, 243, 239, 241, 233,
+ 0, 0, 242, 244, 243, 245, 236, 240, 237, 243,
+ 246, 247, 0, 245, 248, 249, 241, 250, 246, 247,
+
+ 242, 251, 248, 249, 252, 250, 253, 246, 255, 251,
+ 0, 254, 252, 250, 253, 256, 255, 255, 253, 254,
+ 257, 258, 248, 256, 259, 0, 260, 261, 257, 258,
+ 251, 262, 259, 254, 260, 261, 0, 263, 264, 262,
+ 261, 265, 267, 256, 262, 263, 264, 258, 260, 265,
+ 267, 268, 269, 270, 271, 272, 0, 273, 274, 268,
+ 269, 270, 271, 272, 263, 273, 274, 275, 276, 0,
+ 277, 274, 278, 279, 280, 275, 276, 268, 277, 281,
+ 278, 279, 280, 272, 283, 282, 284, 281, 0, 285,
+ 0, 276, 283, 282, 284, 279, 282, 285, 285, 286,
+
+ 287, 288, 289, 280, 290, 291, 292, 286, 287, 288,
+ 289, 293, 290, 291, 292, 289, 294, 295, 296, 293,
+ 0, 297, 298, 293, 294, 295, 296, 299, 292, 297,
+ 298, 301, 300, 297, 0, 299, 295, 0, 302, 301,
+ 300, 303, 294, 304, 296, 300, 302, 305, 0, 303,
+ 0, 304, 0, 0, 0, 305, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 302, 307, 307, 307, 307, 307, 307, 307, 307,
+ 307, 308, 308, 308, 308, 308, 308, 308, 308, 308,
+ 309, 309, 309, 309, 309, 309, 309, 309, 309, 310,
+
+ 0, 310, 311, 311, 311, 312, 312, 0, 312, 313,
+ 313, 313, 313, 0, 313, 313, 313, 313, 314, 314,
+ 314, 314, 314, 314, 314, 314, 314, 315, 315, 315,
+ 0, 315, 315, 315, 315, 315, 316, 0, 316, 316,
+ 316, 316, 316, 316, 316, 317, 0, 0, 0, 0,
+ 0, 317, 318, 318, 0, 318, 319, 0, 319, 306,
+ 306, 306, 306, 306, 306, 306, 306, 306, 306, 306,
+ 306, 306, 306, 306, 306, 306, 306, 306, 306, 306,
+ 306, 306, 306, 306, 306, 306, 306, 306, 306, 306,
+ 306, 306, 306, 306, 306, 306, 306, 306, 306, 306,
+
+ 306, 306, 306, 306, 306, 306, 306, 306, 306, 306,
+ 306, 306, 306, 306, 306
+ } ;
+
+static yy_state_type yy_last_accepting_state;
+static char *yy_last_accepting_cpos;
+
+
+static int yy_flex_debug = 0;
+
+/* The intent behind this definition is that it'll catch
+ * any uses of REJECT which flex missed.
+ */
+#define REJECT reject_used_but_not_detected
+#define yymore() yymore_used_but_not_detected
+#define YY_MORE_ADJ 0
+#define YY_RESTORE_YY_MORE_OFFSET
+static char *yytext;
+#line 1 "pars0lex.l"
+/*****************************************************************************
+
+Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/******************************************************
+SQL parser lexical analyzer: input file for the GNU Flex lexer generator
+
+The InnoDB parser is frozen because MySQL takes care of SQL parsing.
+Therefore we normally keep the InnoDB parser C files as they are, and do
+not automatically generate them from pars0grm.y and pars0lex.l.
+
+How to make the InnoDB parser and lexer C files:
+
+1. Run ./make_flex.sh to generate lexer files.
+
+2. Run ./make_bison.sh to generate parser files.
+
+These instructions seem to work at least with bison-1.875d and flex-2.5.31 on
+Linux.
+
+Created 12/14/1997 Heikki Tuuri
+*******************************************************/
+#define YY_NO_INPUT 1
+#define YY_NO_UNISTD_H 1
+#line 54 "pars0lex.l"
+#define YYSTYPE que_node_t*
+
+#include "univ.i"
+#include "pars0pars.h"
+#include "pars0grm.h"
+#include "pars0sym.h"
+#include "mem0mem.h"
+
+#define malloc(A) ut_malloc_nokey(A)
+#define free(A) ut_free(A)
+#define realloc(P, A) ut_realloc(P, A)
+#define exit(A) ut_error
+
+#define YY_INPUT(buf, result, max_size) \
+ result = pars_get_lex_chars(buf, max_size)
+
+/* String buffer for removing quotes */
+static ulint stringbuf_len_alloc = 0; /* Allocated length */
+static ulint stringbuf_len = 0; /* Current length */
+static char* stringbuf; /* Start of buffer */
+/** Appends a string to the buffer. */
+static
+void
+string_append(
+/*==========*/
+ const char* str, /*!< in: string to be appended */
+ ulint len) /*!< in: length of the string */
+{
+ if (stringbuf == NULL) {
+ stringbuf = static_cast<char*>(malloc(1));
+ stringbuf_len_alloc = 1;
+ }
+
+ if (stringbuf_len + len > stringbuf_len_alloc) {
+ while (stringbuf_len + len > stringbuf_len_alloc) {
+ stringbuf_len_alloc <<= 1;
+ }
+
+ stringbuf = static_cast<char*>(
+ realloc(stringbuf, stringbuf_len_alloc));
+ }
+
+ memcpy(stringbuf + stringbuf_len, str, len);
+ stringbuf_len += len;
+}
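+
+/* Editor's aside (illustration, not part of the source; names are
+invented): the growth policy above doubles stringbuf_len_alloc until
+the request fits, so appending n bytes in total costs O(n) amortized
+copying. The same policy, standalone: */
+#if 0 /* illustration only */
+static void toy_append(char** buf, ulint* len, ulint* cap,
+		       const char* str, ulint n)
+{
+	if (*cap == 0) {
+		*buf = static_cast<char*>(malloc(1));
+		*cap = 1;
+	}
+	if (*len + n > *cap) {
+		while (*len + n > *cap) {
+			*cap <<= 1;	/* geometric growth */
+		}
+		*buf = static_cast<char*>(realloc(*buf, *cap));
+	}
+	memcpy(*buf + *len, str, n);
+	*len += n;
+}
+#endif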
+
+#line 859 "lexyy.cc"
+
+#line 861 "lexyy.cc"
+
+#define INITIAL 0
+#define comment 1
+#define quoted 2
+#define id 3
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+static int yy_init_globals ( void );
+
+/* Accessor methods to globals.
+ These are made visible to non-reentrant scanners for convenience. */
+
+MY_ATTRIBUTE((unused)) static int yylex_destroy ( void );
+
+MY_ATTRIBUTE((unused)) static int yyget_debug ( void );
+
+MY_ATTRIBUTE((unused)) static void yyset_debug ( int debug_flag );
+
+YY_EXTRA_TYPE yyget_extra ( void );
+
+
+
+MY_ATTRIBUTE((unused)) static FILE *yyget_in ( void );
+
+MY_ATTRIBUTE((unused)) static void yyset_in ( FILE * _in_str );
+
+MY_ATTRIBUTE((unused)) static FILE *yyget_out ( void );
+
+MY_ATTRIBUTE((unused)) static void yyset_out ( FILE * _out_str );
+
+ MY_ATTRIBUTE((unused)) static int yyget_leng ( void );
+
+MY_ATTRIBUTE((unused)) static char *yyget_text ( void );
+
+MY_ATTRIBUTE((unused)) static int yyget_lineno ( void );
+
+MY_ATTRIBUTE((unused)) static void yyset_lineno ( int _line_number );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap ( void );
+#else
+extern int yywrap ( void );
+#endif
+#endif
+
+#ifndef YY_NO_UNPUT
+
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy ( char *, const char *, int );
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen ( const char * );
+#endif
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+static int yyinput ( void );
+#else
+static int input ( void );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ */
+#define ECHO do { if (fwrite( yytext, (size_t) yyleng, 1, yyout )) {} } while (0)
+#endif
+
+/* Gets input and stuffs it into "buf". The number of characters read,
+ * or YY_NULL, is returned in "result".
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+ { \
+ int c = '*'; \
+ int n; \
+ for ( n = 0; n < max_size && \
+ (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+ buf[n] = (char) c; \
+ if ( c == '\n' ) \
+ buf[n++] = (char) c; \
+ if ( c == EOF && ferror( yyin ) ) \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ result = n; \
+ } \
+ else \
+ { \
+ errno=0; \
+ while ( (result = (int) fread(buf, 1, (yy_size_t) max_size, yyin)) == 0 && ferror(yyin)) \
+ { \
+ if( errno != EINTR) \
+ { \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ break; \
+ } \
+ errno=0; \
+ clearerr(yyin); \
+ } \
+ }\
+\
+
+#endif
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg )
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int yylex (void);
+
+#define YY_DECL int yylex (void)
+#endif /* !YY_DECL */
+
+/* Code executed at the beginning of each rule, after yytext and yyleng
+ * have been set up.
+ */
+#ifndef YY_USER_ACTION
+#define YY_USER_ACTION
+#endif
+
+/* Code executed at the end of each rule. */
+#ifndef YY_BREAK
+#define YY_BREAK /*LINTED*/break;
+#endif
+
+#define YY_RULE_SETUP \
+ YY_USER_ACTION
+
+/** The main scanner function which does all the work.
+ */
+YY_DECL
+{
+ yy_state_type yy_current_state;
+ char *yy_cp, *yy_bp;
+ int yy_act;
+
+ if ( !(yy_init) )
+ {
+ (yy_init) = 1;
+
+#ifdef YY_USER_INIT
+ YY_USER_INIT;
+#endif
+
+ if ( ! (yy_start) )
+ (yy_start) = 1; /* first start state */
+
+ if ( ! yyin )
+ yyin = stdin;
+
+ if ( ! yyout )
+ yyout = stdout;
+
+ if ( ! YY_CURRENT_BUFFER ) {
+ yyensure_buffer_stack ();
+ YY_CURRENT_BUFFER_LVALUE =
+ yy_create_buffer( yyin, YY_BUF_SIZE );
+ }
+
+ yy_load_buffer_state( );
+ }
+
+ {
+#line 112 "pars0lex.l"
+
+
+#line 1082 "lexyy.cc"
+
+ while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */
+ {
+ yy_cp = (yy_c_buf_p);
+
+ /* Support of yytext. */
+ *yy_cp = (yy_hold_char);
+
+ /* yy_bp points to the position in yy_ch_buf of the start of
+ * the current run.
+ */
+ yy_bp = yy_cp;
+
+ yy_current_state = (yy_start);
+yy_match:
+ do
+ {
+ YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ;
+ if ( yy_accept[yy_current_state] )
+ {
+ (yy_last_accepting_state) = yy_current_state;
+ (yy_last_accepting_cpos) = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 307 )
+ yy_c = yy_meta[yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c];
+ ++yy_cp;
+ }
+ while ( yy_current_state != 306 );
+ yy_cp = (yy_last_accepting_cpos);
+ yy_current_state = (yy_last_accepting_state);
+
+yy_find_action:
+ yy_act = yy_accept[yy_current_state];
+
+ YY_DO_BEFORE_ACTION;
+
+do_action: /* This label is used only to access EOF actions. */
+
+ switch ( yy_act )
+ { /* beginning of action switch */
+ case 0: /* must back up */
+ /* undo the effects of YY_DO_BEFORE_ACTION */
+ *yy_cp = (yy_hold_char);
+ yy_cp = (yy_last_accepting_cpos);
+ yy_current_state = (yy_last_accepting_state);
+ goto yy_find_action;
+
+case 1:
+YY_RULE_SETUP
+#line 114 "pars0lex.l"
+{
+ yylval = sym_tab_add_int_lit(pars_sym_tab_global,
+ atoi(yytext));
+ return(PARS_INT_LIT);
+}
+ YY_BREAK
+case 2:
+YY_RULE_SETUP
+#line 120 "pars0lex.l"
+{
+ ut_error; /* not implemented */
+
+ return(PARS_FLOAT_LIT);
+}
+ YY_BREAK
+case 3:
+YY_RULE_SETUP
+#line 126 "pars0lex.l"
+{
+ ulint type;
+
+ yylval = sym_tab_add_bound_lit(pars_sym_tab_global,
+ yytext + 1, &type);
+
+ return((int) type);
+}
+ YY_BREAK
+case 4:
+YY_RULE_SETUP
+#line 135 "pars0lex.l"
+{
+ yylval = sym_tab_add_bound_id(pars_sym_tab_global,
+ yytext + 1);
+
+ return(PARS_ID_TOKEN);
+}
+ YY_BREAK
+case 5:
+YY_RULE_SETUP
+#line 142 "pars0lex.l"
+{
+/* Quoted character string literals are handled in an explicit
+start state 'quoted'. This state is entered and the buffer for
+the scanned string is emptied upon encountering a starting quote.
+
+In the state 'quoted', only two actions are possible (defined below). */
+ BEGIN(quoted);
+ stringbuf_len = 0;
+}
+ YY_BREAK
+case 6:
+/* rule 6 can match eol */
+YY_RULE_SETUP
+#line 151 "pars0lex.l"
+{
+ /* Got a sequence of characters other than "'":
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+ YY_BREAK
+case 7:
+YY_RULE_SETUP
+#line 156 "pars0lex.l"
+{
+ /* Got a sequence of "'" characters:
+ append half of them to string buffer,
+ as "''" represents a single "'".
+ We apply truncating division,
+ so that "'''" will result in "'". */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ string literal. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_str_lit(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+ return(PARS_STR_LIT);
+ }
+}
+ YY_BREAK
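+/* Editor's worked example for the two <quoted> rules above
+(illustration only): scanning 'It''s' proceeds as
+	'	rule 5: BEGIN(quoted), buffer emptied
+	It	rule 6: append "It"
+	''	rule 7: yyleng 2, append 1 quote -> "It'", even, stay
+	s	rule 6: append "s" -> "It's"
+	'	rule 7: yyleng 1, append 0, odd -> BEGIN(INITIAL),
+		return PARS_STR_LIT with "It's". */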
+case 8:
+YY_RULE_SETUP
+#line 180 "pars0lex.l"
+{
+/* Quoted identifiers are handled in an explicit start state 'id'.
+This state is entered and the buffer for the scanned string is emptied
+upon encountering a starting quote.
+
+In the state 'id', only two actions are possible (defined below). */
+ BEGIN(id);
+ stringbuf_len = 0;
+}
+ YY_BREAK
+case 9:
+/* rule 9 can match eol */
+YY_RULE_SETUP
+#line 189 "pars0lex.l"
+{
+ /* Got a sequence of characters other than '"':
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+ YY_BREAK
+case 10:
+YY_RULE_SETUP
+#line 194 "pars0lex.l"
+{
+ /* Got a sequence of '"' characters:
+ append half of them to string buffer,
+ as '""' represents a single '"'.
+ We apply truncating division,
+ so that '"""' will result in '"'. */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ identifier. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_id(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+
+ return(PARS_ID_TOKEN);
+ }
+}
+ YY_BREAK
+case 11:
+YY_RULE_SETUP
+#line 219 "pars0lex.l"
+{
+ yylval = sym_tab_add_null_lit(pars_sym_tab_global);
+
+ return(PARS_NULL_LIT);
+}
+ YY_BREAK
+case 12:
+YY_RULE_SETUP
+#line 225 "pars0lex.l"
+{
+ /* Implicit cursor name */
+ yylval = sym_tab_add_str_lit(pars_sym_tab_global,
+ (byte*) yytext, yyleng);
+ return(PARS_SQL_TOKEN);
+}
+ YY_BREAK
+case 13:
+YY_RULE_SETUP
+#line 232 "pars0lex.l"
+{
+ return(PARS_AND_TOKEN);
+}
+ YY_BREAK
+case 14:
+YY_RULE_SETUP
+#line 236 "pars0lex.l"
+{
+ return(PARS_OR_TOKEN);
+}
+ YY_BREAK
+case 15:
+YY_RULE_SETUP
+#line 240 "pars0lex.l"
+{
+ return(PARS_NOT_TOKEN);
+}
+ YY_BREAK
+case 16:
+YY_RULE_SETUP
+#line 244 "pars0lex.l"
+{
+ return(PARS_PROCEDURE_TOKEN);
+}
+ YY_BREAK
+case 17:
+YY_RULE_SETUP
+#line 248 "pars0lex.l"
+{
+ return(PARS_IN_TOKEN);
+}
+ YY_BREAK
+case 18:
+YY_RULE_SETUP
+#line 252 "pars0lex.l"
+{
+ return(PARS_INT_TOKEN);
+}
+ YY_BREAK
+case 19:
+YY_RULE_SETUP
+#line 256 "pars0lex.l"
+{
+ return(PARS_CHAR_TOKEN);
+}
+ YY_BREAK
+case 20:
+YY_RULE_SETUP
+#line 260 "pars0lex.l"
+{
+ return(PARS_IS_TOKEN);
+}
+ YY_BREAK
+case 21:
+YY_RULE_SETUP
+#line 264 "pars0lex.l"
+{
+ return(PARS_BEGIN_TOKEN);
+}
+ YY_BREAK
+case 22:
+YY_RULE_SETUP
+#line 268 "pars0lex.l"
+{
+ return(PARS_END_TOKEN);
+}
+ YY_BREAK
+case 23:
+YY_RULE_SETUP
+#line 272 "pars0lex.l"
+{
+ return(PARS_IF_TOKEN);
+}
+ YY_BREAK
+case 24:
+YY_RULE_SETUP
+#line 276 "pars0lex.l"
+{
+ return(PARS_THEN_TOKEN);
+}
+ YY_BREAK
+case 25:
+YY_RULE_SETUP
+#line 280 "pars0lex.l"
+{
+ return(PARS_ELSE_TOKEN);
+}
+ YY_BREAK
+case 26:
+YY_RULE_SETUP
+#line 284 "pars0lex.l"
+{
+ return(PARS_ELSIF_TOKEN);
+}
+ YY_BREAK
+case 27:
+YY_RULE_SETUP
+#line 288 "pars0lex.l"
+{
+ return(PARS_LOOP_TOKEN);
+}
+ YY_BREAK
+case 28:
+YY_RULE_SETUP
+#line 292 "pars0lex.l"
+{
+ return(PARS_WHILE_TOKEN);
+}
+ YY_BREAK
+case 29:
+YY_RULE_SETUP
+#line 296 "pars0lex.l"
+{
+ return(PARS_RETURN_TOKEN);
+}
+ YY_BREAK
+case 30:
+YY_RULE_SETUP
+#line 300 "pars0lex.l"
+{
+ return(PARS_SELECT_TOKEN);
+}
+ YY_BREAK
+case 31:
+YY_RULE_SETUP
+#line 304 "pars0lex.l"
+{
+ return(PARS_COUNT_TOKEN);
+}
+ YY_BREAK
+case 32:
+YY_RULE_SETUP
+#line 308 "pars0lex.l"
+{
+ return(PARS_FROM_TOKEN);
+}
+ YY_BREAK
+case 33:
+YY_RULE_SETUP
+#line 312 "pars0lex.l"
+{
+ return(PARS_WHERE_TOKEN);
+}
+ YY_BREAK
+case 34:
+YY_RULE_SETUP
+#line 316 "pars0lex.l"
+{
+ return(PARS_FOR_TOKEN);
+}
+ YY_BREAK
+case 35:
+YY_RULE_SETUP
+#line 320 "pars0lex.l"
+{
+ return(PARS_ORDER_TOKEN);
+}
+ YY_BREAK
+case 36:
+YY_RULE_SETUP
+#line 324 "pars0lex.l"
+{
+ return(PARS_BY_TOKEN);
+}
+ YY_BREAK
+case 37:
+YY_RULE_SETUP
+#line 328 "pars0lex.l"
+{
+ return(PARS_ASC_TOKEN);
+}
+ YY_BREAK
+case 38:
+YY_RULE_SETUP
+#line 332 "pars0lex.l"
+{
+ return(PARS_DESC_TOKEN);
+}
+ YY_BREAK
+case 39:
+YY_RULE_SETUP
+#line 336 "pars0lex.l"
+{
+ return(PARS_INSERT_TOKEN);
+}
+ YY_BREAK
+case 40:
+YY_RULE_SETUP
+#line 340 "pars0lex.l"
+{
+ return(PARS_INTO_TOKEN);
+}
+ YY_BREAK
+case 41:
+YY_RULE_SETUP
+#line 344 "pars0lex.l"
+{
+ return(PARS_VALUES_TOKEN);
+}
+ YY_BREAK
+case 42:
+YY_RULE_SETUP
+#line 348 "pars0lex.l"
+{
+ return(PARS_UPDATE_TOKEN);
+}
+ YY_BREAK
+case 43:
+YY_RULE_SETUP
+#line 352 "pars0lex.l"
+{
+ return(PARS_SET_TOKEN);
+}
+ YY_BREAK
+case 44:
+YY_RULE_SETUP
+#line 356 "pars0lex.l"
+{
+ return(PARS_DELETE_TOKEN);
+}
+ YY_BREAK
+case 45:
+YY_RULE_SETUP
+#line 360 "pars0lex.l"
+{
+ return(PARS_CURRENT_TOKEN);
+}
+ YY_BREAK
+case 46:
+YY_RULE_SETUP
+#line 364 "pars0lex.l"
+{
+ return(PARS_OF_TOKEN);
+}
+ YY_BREAK
+case 47:
+YY_RULE_SETUP
+#line 368 "pars0lex.l"
+{
+ return(PARS_CREATE_TOKEN);
+}
+ YY_BREAK
+case 48:
+YY_RULE_SETUP
+#line 372 "pars0lex.l"
+{
+ return(PARS_TABLE_TOKEN);
+}
+ YY_BREAK
+case 49:
+YY_RULE_SETUP
+#line 376 "pars0lex.l"
+{
+ return(PARS_INDEX_TOKEN);
+}
+ YY_BREAK
+case 50:
+YY_RULE_SETUP
+#line 380 "pars0lex.l"
+{
+ return(PARS_UNIQUE_TOKEN);
+}
+ YY_BREAK
+case 51:
+YY_RULE_SETUP
+#line 384 "pars0lex.l"
+{
+ return(PARS_CLUSTERED_TOKEN);
+}
+ YY_BREAK
+case 52:
+YY_RULE_SETUP
+#line 388 "pars0lex.l"
+{
+ return(PARS_ON_TOKEN);
+}
+ YY_BREAK
+case 53:
+YY_RULE_SETUP
+#line 392 "pars0lex.l"
+{
+ return(PARS_DECLARE_TOKEN);
+}
+ YY_BREAK
+case 54:
+YY_RULE_SETUP
+#line 396 "pars0lex.l"
+{
+ return(PARS_CURSOR_TOKEN);
+}
+ YY_BREAK
+case 55:
+YY_RULE_SETUP
+#line 400 "pars0lex.l"
+{
+ return(PARS_OPEN_TOKEN);
+}
+ YY_BREAK
+case 56:
+YY_RULE_SETUP
+#line 404 "pars0lex.l"
+{
+ return(PARS_FETCH_TOKEN);
+}
+ YY_BREAK
+case 57:
+YY_RULE_SETUP
+#line 408 "pars0lex.l"
+{
+ return(PARS_CLOSE_TOKEN);
+}
+ YY_BREAK
+case 58:
+YY_RULE_SETUP
+#line 412 "pars0lex.l"
+{
+ return(PARS_NOTFOUND_TOKEN);
+}
+ YY_BREAK
+case 59:
+YY_RULE_SETUP
+#line 416 "pars0lex.l"
+{
+ return(PARS_TO_BINARY_TOKEN);
+}
+ YY_BREAK
+case 60:
+YY_RULE_SETUP
+#line 420 "pars0lex.l"
+{
+ return(PARS_SUBSTR_TOKEN);
+}
+ YY_BREAK
+case 61:
+YY_RULE_SETUP
+#line 424 "pars0lex.l"
+{
+ return(PARS_CONCAT_TOKEN);
+}
+ YY_BREAK
+case 62:
+YY_RULE_SETUP
+#line 428 "pars0lex.l"
+{
+ return(PARS_INSTR_TOKEN);
+}
+ YY_BREAK
+case 63:
+YY_RULE_SETUP
+#line 432 "pars0lex.l"
+{
+ return(PARS_LENGTH_TOKEN);
+}
+ YY_BREAK
+case 64:
+YY_RULE_SETUP
+#line 436 "pars0lex.l"
+{
+ return(PARS_COMMIT_TOKEN);
+}
+ YY_BREAK
+case 65:
+YY_RULE_SETUP
+#line 440 "pars0lex.l"
+{
+ return(PARS_ROLLBACK_TOKEN);
+}
+ YY_BREAK
+case 66:
+YY_RULE_SETUP
+#line 444 "pars0lex.l"
+{
+ return(PARS_WORK_TOKEN);
+}
+ YY_BREAK
+case 67:
+YY_RULE_SETUP
+#line 448 "pars0lex.l"
+{
+ return(PARS_EXIT_TOKEN);
+}
+ YY_BREAK
+case 68:
+YY_RULE_SETUP
+#line 452 "pars0lex.l"
+{
+ return(PARS_FUNCTION_TOKEN);
+}
+ YY_BREAK
+case 69:
+YY_RULE_SETUP
+#line 456 "pars0lex.l"
+{
+ return(PARS_LOCK_TOKEN);
+}
+ YY_BREAK
+case 70:
+YY_RULE_SETUP
+#line 460 "pars0lex.l"
+{
+ return(PARS_SHARE_TOKEN);
+}
+ YY_BREAK
+case 71:
+YY_RULE_SETUP
+#line 464 "pars0lex.l"
+{
+ return(PARS_MODE_TOKEN);
+}
+ YY_BREAK
+case 72:
+YY_RULE_SETUP
+#line 468 "pars0lex.l"
+{
+ return(PARS_LIKE_TOKEN);
+}
+ YY_BREAK
+case 73:
+YY_RULE_SETUP
+#line 472 "pars0lex.l"
+{
+ return(PARS_BIGINT_TOKEN);
+}
+ YY_BREAK
+case 74:
+YY_RULE_SETUP
+#line 476 "pars0lex.l"
+{
+ yylval = sym_tab_add_id(pars_sym_tab_global,
+ (byte*) yytext,
+ strlen(yytext));
+ return(PARS_ID_TOKEN);
+}
+ YY_BREAK
+case 75:
+YY_RULE_SETUP
+#line 483 "pars0lex.l"
+{
+ yylval = sym_tab_add_id(pars_sym_tab_global,
+ (byte*) yytext,
+ strlen(yytext));
+ return(PARS_TABLE_NAME_TOKEN);
+}
+ YY_BREAK
+case 76:
+YY_RULE_SETUP
+#line 490 "pars0lex.l"
+{
+ return(PARS_DDOT_TOKEN);
+}
+ YY_BREAK
+case 77:
+YY_RULE_SETUP
+#line 494 "pars0lex.l"
+{
+ return(PARS_ASSIGN_TOKEN);
+}
+ YY_BREAK
+case 78:
+YY_RULE_SETUP
+#line 498 "pars0lex.l"
+{
+ return(PARS_LE_TOKEN);
+}
+ YY_BREAK
+case 79:
+YY_RULE_SETUP
+#line 502 "pars0lex.l"
+{
+ return(PARS_GE_TOKEN);
+}
+ YY_BREAK
+case 80:
+YY_RULE_SETUP
+#line 506 "pars0lex.l"
+{
+ return(PARS_NE_TOKEN);
+}
+ YY_BREAK
+case 81:
+YY_RULE_SETUP
+#line 510 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 82:
+YY_RULE_SETUP
+#line 515 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 83:
+YY_RULE_SETUP
+#line 520 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 84:
+YY_RULE_SETUP
+#line 525 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 85:
+YY_RULE_SETUP
+#line 530 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 86:
+YY_RULE_SETUP
+#line 535 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 87:
+YY_RULE_SETUP
+#line 540 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 88:
+YY_RULE_SETUP
+#line 545 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 89:
+YY_RULE_SETUP
+#line 550 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 90:
+YY_RULE_SETUP
+#line 555 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 91:
+YY_RULE_SETUP
+#line 560 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 92:
+YY_RULE_SETUP
+#line 565 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 93:
+YY_RULE_SETUP
+#line 570 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 94:
+YY_RULE_SETUP
+#line 575 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 95:
+YY_RULE_SETUP
+#line 580 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 96:
+YY_RULE_SETUP
+#line 585 "pars0lex.l"
+BEGIN(comment); /* eat up comment */
+ YY_BREAK
+case 97:
+/* rule 97 can match eol */
+YY_RULE_SETUP
+#line 587 "pars0lex.l"
+
+ YY_BREAK
+case 98:
+/* rule 98 can match eol */
+YY_RULE_SETUP
+#line 588 "pars0lex.l"
+
+ YY_BREAK
+case 99:
+YY_RULE_SETUP
+#line 589 "pars0lex.l"
+BEGIN(INITIAL);
+ YY_BREAK
+case 100:
+/* rule 100 can match eol */
+YY_RULE_SETUP
+#line 591 "pars0lex.l"
+/* eat up whitespace */
+ YY_BREAK
+case 101:
+YY_RULE_SETUP
+#line 594 "pars0lex.l"
+{
+ fprintf(stderr,"Unrecognized character: %02x\n",
+ *yytext);
+
+ ut_error;
+
+ return(0);
+}
+ YY_BREAK
+case 102:
+YY_RULE_SETUP
+#line 603 "pars0lex.l"
+YY_FATAL_ERROR( "flex scanner jammed" );
+ YY_BREAK
+#line 1942 "lexyy.cc"
+case YY_STATE_EOF(INITIAL):
+case YY_STATE_EOF(comment):
+case YY_STATE_EOF(quoted):
+case YY_STATE_EOF(id):
+ yyterminate();
+
+ case YY_END_OF_BUFFER:
+ {
+ /* Amount of text matched not including the EOB char. */
+ int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1;
+
+ /* Undo the effects of YY_DO_BEFORE_ACTION. */
+ *yy_cp = (yy_hold_char);
+ YY_RESTORE_YY_MORE_OFFSET
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
+ {
+ /* We're scanning a new file or input source. It's
+ * possible that this happened because the user
+ * just pointed yyin at a new source and called
+ * yylex(). If so, then we have to assure
+ * consistency between YY_CURRENT_BUFFER and our
+ * globals. Here is the right place to do so, because
+ * this is the first action (other than possibly a
+ * back-up) that will match for the new input source.
+ */
+ (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
+ }
+
+ /* Note that here we test for yy_c_buf_p "<=" to the position
+ * of the first EOB in the buffer, since yy_c_buf_p will
+ * already have been incremented past the NUL character
+ * (since all states make transitions on EOB to the
+ * end-of-buffer state). Contrast this with the test
+ * in input().
+ */
+ if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] )
+ { /* This was really a NUL. */
+ yy_state_type yy_next_state;
+
+ (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( );
+
+ /* Okay, we're now positioned to make the NUL
+ * transition. We couldn't have
+ * yy_get_previous_state() go ahead and do it
+ * for us because it doesn't know how to deal
+ * with the possibility of jamming (and we don't
+ * want to build jamming into it because then it
+ * will run more slowly).
+ */
+
+ yy_next_state = yy_try_NUL_trans( yy_current_state );
+
+ yy_bp = (yytext_ptr) + YY_MORE_ADJ;
+
+ if ( yy_next_state )
+ {
+ /* Consume the NUL. */
+ yy_cp = ++(yy_c_buf_p);
+ yy_current_state = yy_next_state;
+ goto yy_match;
+ }
+
+ else
+ {
+ yy_cp = (yy_last_accepting_cpos);
+ yy_current_state = (yy_last_accepting_state);
+ goto yy_find_action;
+ }
+ }
+
+ else switch ( yy_get_next_buffer( ) )
+ {
+ case EOB_ACT_END_OF_FILE:
+ {
+ (yy_did_buffer_switch_on_eof) = 0;
+
+ if ( yywrap( ) )
+ {
+ /* Note: because we've taken care in
+ * yy_get_next_buffer() to have set up
+ * yytext, we can now set up
+ * yy_c_buf_p so that if some total
+ * hoser (like flex itself) wants to
+ * call the scanner after we return the
+ * YY_NULL, it'll still work - another
+ * YY_NULL will get returned.
+ */
+ (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ;
+
+ yy_act = YY_STATE_EOF(YY_START);
+ goto do_action;
+ }
+
+ else
+ {
+ if ( ! (yy_did_buffer_switch_on_eof) )
+ YY_NEW_FILE;
+ }
+ break;
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ (yy_c_buf_p) =
+ (yytext_ptr) + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( );
+
+ yy_cp = (yy_c_buf_p);
+ yy_bp = (yytext_ptr) + YY_MORE_ADJ;
+ goto yy_match;
+
+ case EOB_ACT_LAST_MATCH:
+ (yy_c_buf_p) =
+ &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)];
+
+ yy_current_state = yy_get_previous_state( );
+
+ yy_cp = (yy_c_buf_p);
+ yy_bp = (yytext_ptr) + YY_MORE_ADJ;
+ goto yy_find_action;
+ }
+ break;
+ }
+
+ default:
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--no action found" );
+ } /* end of action switch */
+ } /* end of scanning one token */
+ } /* end of user's declarations */
+} /* end of yylex */
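+
+/* Editor's note, not flex output: the YY_END_OF_BUFFER arm above first
+ * distinguishes a genuine NUL byte in the input (yy_c_buf_p still within
+ * the loaded region) from a true end of buffer, then dispatches on
+ * yy_get_next_buffer(): EOB_ACT_CONTINUE_SCAN resumes matching where it
+ * left off, EOB_ACT_LAST_MATCH finishes the token accumulated so far,
+ * and EOB_ACT_END_OF_FILE either terminates via yywrap() or restarts on
+ * the next input source.
+ */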
+
+/* yy_get_next_buffer - try to read in a new buffer
+ *
+ * Returns a code representing an action:
+ * EOB_ACT_LAST_MATCH - text was matched before the EOB; process it first
+ * EOB_ACT_CONTINUE_SCAN - continue scanning from current position
+ * EOB_ACT_END_OF_FILE - end of file
+ */
+static int yy_get_next_buffer (void)
+{
+ char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf;
+ char *source = (yytext_ptr);
+ int number_to_move, i;
+ int ret_val;
+
+ if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] )
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--end of buffer missed" );
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 )
+ { /* Don't try to fill the buffer, so this is an EOF. */
+ if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 )
+ {
+ /* We matched a single character, the EOB, so
+ * treat this as a final EOF.
+ */
+ return EOB_ACT_END_OF_FILE;
+ }
+
+ else
+ {
+ /* We matched some text prior to the EOB, first
+ * process it.
+ */
+ return EOB_ACT_LAST_MATCH;
+ }
+ }
+
+ /* Try to read more data. */
+
+ /* First move last chars to start of buffer. */
+ number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr) - 1);
+
+ for ( i = 0; i < number_to_move; ++i )
+ *(dest++) = *(source++);
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING )
+ /* don't do the read, it's not guaranteed to return an EOF,
+ * just force an EOF
+ */
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0;
+
+ else
+ {
+ int num_to_read =
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1;
+
+ while ( num_to_read <= 0 )
+ { /* Not enough room in the buffer - grow it. */
+
+ /* just a shorter name for the current buffer */
+ YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE;
+
+ int yy_c_buf_p_offset =
+ (int) ((yy_c_buf_p) - b->yy_ch_buf);
+
+ if ( b->yy_is_our_buffer )
+ {
+ int new_size = b->yy_buf_size * 2;
+
+ if ( new_size <= 0 )
+ b->yy_buf_size += b->yy_buf_size / 8;
+ else
+ b->yy_buf_size *= 2;
+
+ b->yy_ch_buf = (char *)
+ /* Include room for 2 EOB chars. */
+ yyrealloc( (void *) b->yy_ch_buf,
+ (yy_size_t) (b->yy_buf_size + 2) );
+ }
+ else
+ /* Can't grow it, we don't own it. */
+ b->yy_ch_buf = NULL;
+
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR(
+ "fatal error - scanner input buffer overflow" );
+
+ (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset];
+
+ num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size -
+ number_to_move - 1;
+
+ }
+
+ if ( num_to_read > YY_READ_BUF_SIZE )
+ num_to_read = YY_READ_BUF_SIZE;
+
+ /* Read in more data. */
+ YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
+ (yy_n_chars), num_to_read );
+
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars);
+ }
+
+ if ( (yy_n_chars) == 0 )
+ {
+ if ( number_to_move == YY_MORE_ADJ )
+ {
+ ret_val = EOB_ACT_END_OF_FILE;
+ yyrestart( yyin );
+ }
+
+ else
+ {
+ ret_val = EOB_ACT_LAST_MATCH;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status =
+ YY_BUFFER_EOF_PENDING;
+ }
+ }
+
+ else
+ ret_val = EOB_ACT_CONTINUE_SCAN;
+
+ if (((yy_n_chars) + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) {
+ /* Extend the array by 50%, plus the number we really need. */
+ int new_size = (yy_n_chars) + number_to_move + ((yy_n_chars) >> 1);
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc(
+ (void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf, (yy_size_t) new_size );
+ if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" );
+ /* "- 2" to take care of EOB's */
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_size = (int) (new_size - 2);
+ }
+
+ (yy_n_chars) += number_to_move;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR;
+
+ (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0];
+
+ return ret_val;
+}
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+ static yy_state_type yy_get_previous_state (void)
+{
+ yy_state_type yy_current_state;
+ char *yy_cp;
+
+ yy_current_state = (yy_start);
+
+ for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp )
+ {
+ YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1);
+ if ( yy_accept[yy_current_state] )
+ {
+ (yy_last_accepting_state) = yy_current_state;
+ (yy_last_accepting_cpos) = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 307 )
+ yy_c = yy_meta[yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c];
+ }
+
+ return yy_current_state;
+}
+
+/* yy_try_NUL_trans - try to make a transition on the NUL character
+ *
+ * synopsis
+ * next_state = yy_try_NUL_trans( current_state );
+ */
+ static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state )
+{
+ int yy_is_jam;
+ char *yy_cp = (yy_c_buf_p);
+
+ YY_CHAR yy_c = 1;
+ if ( yy_accept[yy_current_state] )
+ {
+ (yy_last_accepting_state) = yy_current_state;
+ (yy_last_accepting_cpos) = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 307 )
+ yy_c = yy_meta[yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c];
+ yy_is_jam = (yy_current_state == 306);
+
+ return yy_is_jam ? 0 : yy_current_state;
+}
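+
+/* Editor's note, not flex output: both helpers above walk flex's
+ * compressed transition tables by hand.  For state s and character
+ * class c, the candidate slot is yy_base[s] + c; it belongs to s only
+ * when yy_chk[yy_base[s] + c] == s, in which case the next state is
+ * yy_nxt[yy_base[s] + c].  Otherwise the lookup falls back to the
+ * default state yy_def[s] and retries, remapping c through yy_meta[]
+ * once the default chain reaches the template states (numbered >= 307
+ * in this scanner).  State 306 is the jam state, which
+ * yy_try_NUL_trans() reports by returning 0.
+ */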
+
+#ifndef YY_NO_UNPUT
+
+#endif
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+ static int yyinput (void)
+#else
+ static int input (void)
+#endif
+
+{
+ int c;
+
+ *(yy_c_buf_p) = (yy_hold_char);
+
+ if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR )
+ {
+ /* yy_c_buf_p now points to the character we want to return.
+ * If this occurs *before* the EOB characters, then it's a
+ * valid NUL; if not, then we've hit the end of the buffer.
+ */
+ if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] )
+ /* This was really a NUL. */
+ *(yy_c_buf_p) = '\0';
+
+ else
+ { /* need more input */
+ int offset = (int) ((yy_c_buf_p) - (yytext_ptr));
+ ++(yy_c_buf_p);
+
+ switch ( yy_get_next_buffer( ) )
+ {
+ case EOB_ACT_LAST_MATCH:
+ /* This happens because yy_get_next_buffer()
+ * sees that we've accumulated a
+ * token and flags that we need to
+ * try matching the token before
+ * proceeding. But for input(),
+ * there's no matching to consider.
+ * So convert the EOB_ACT_LAST_MATCH
+ * to EOB_ACT_END_OF_FILE.
+ */
+
+ /* Reset buffer status. */
+ yyrestart( yyin );
+
+ /*FALLTHROUGH*/
+
+ case EOB_ACT_END_OF_FILE:
+ {
+ if ( yywrap( ) )
+ return 0;
+
+ if ( ! (yy_did_buffer_switch_on_eof) )
+ YY_NEW_FILE;
+#ifdef __cplusplus
+ return yyinput();
+#else
+ return input();
+#endif
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ (yy_c_buf_p) = (yytext_ptr) + offset;
+ break;
+ }
+ }
+ }
+
+ c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */
+ *(yy_c_buf_p) = '\0'; /* preserve yytext */
+ (yy_hold_char) = *++(yy_c_buf_p);
+
+ return c;
+}
+#endif /* ifndef YY_NO_INPUT */
+
+/** Immediately switch to a different input stream.
+ * @param input_file A readable stream.
+ *
+ * @note This function does not reset the start condition to @c INITIAL.
+ */
+ static void yyrestart (FILE * input_file )
+{
+
+ if ( ! YY_CURRENT_BUFFER ){
+ yyensure_buffer_stack ();
+ YY_CURRENT_BUFFER_LVALUE =
+ yy_create_buffer( yyin, YY_BUF_SIZE );
+ }
+
+ yy_init_buffer( YY_CURRENT_BUFFER, input_file );
+ yy_load_buffer_state( );
+}
+
+/** Switch to a different input buffer.
+ * @param new_buffer The new input buffer.
+ *
+ */
+ MY_ATTRIBUTE((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer )
+{
+
+ /* TODO. We should be able to replace this entire function body
+ * with
+ * yypop_buffer_state();
+ * yypush_buffer_state(new_buffer);
+ */
+ yyensure_buffer_stack ();
+ if ( YY_CURRENT_BUFFER == new_buffer )
+ return;
+
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *(yy_c_buf_p) = (yy_hold_char);
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p);
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars);
+ }
+
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+ yy_load_buffer_state( );
+
+ /* We don't actually know whether we did this switch during
+ * EOF (yywrap()) processing, but the only time this flag
+ * is looked at is after yywrap() is called, so it's safe
+ * to go ahead and always set it.
+ */
+ (yy_did_buffer_switch_on_eof) = 1;
+}
+
+static void yy_load_buffer_state (void)
+{
+ (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos;
+ yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file;
+ (yy_hold_char) = *(yy_c_buf_p);
+}
+
+/** Allocate and initialize an input buffer state.
+ * @param file A readable stream.
+ * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE.
+ *
+ * @return the allocated buffer state.
+ */
+ static YY_BUFFER_STATE yy_create_buffer (FILE * file, int size )
+{
+ YY_BUFFER_STATE b;
+
+ b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) );
+ if ( ! b )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
+
+ b->yy_buf_size = size;
+
+ /* yy_ch_buf has to be 2 characters longer than the size given because
+ * we need to put in 2 end-of-buffer characters.
+ */
+ b->yy_ch_buf = (char *) yyalloc( (yy_size_t) (b->yy_buf_size + 2) );
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
+
+ b->yy_is_our_buffer = 1;
+
+ yy_init_buffer( b, file );
+
+ return b;
+}
+
+/** Destroy the buffer.
+ * @param b a buffer created with yy_create_buffer()
+ *
+ */
+ static void yy_delete_buffer (YY_BUFFER_STATE b )
+{
+
+ if ( ! b )
+ return;
+
+ if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */
+ YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
+
+ if ( b->yy_is_our_buffer )
+ yyfree( (void *) b->yy_ch_buf );
+
+ yyfree( (void *) b );
+}
+
+/* Initializes or reinitializes a buffer.
+ * This function is sometimes called more than once on the same buffer,
+ * such as during a yyrestart() or at EOF.
+ */
+ static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file )
+
+{
+ int oerrno = errno;
+
+ yy_flush_buffer( b );
+
+ b->yy_input_file = file;
+ b->yy_fill_buffer = 1;
+
+ /* If b is the current buffer, then yy_init_buffer was _probably_
+ * called from yyrestart() or through yy_get_next_buffer.
+ * In that case, we don't want to reset the lineno or column.
+ */
+ if (b != YY_CURRENT_BUFFER){
+ b->yy_bs_lineno = 1;
+ b->yy_bs_column = 0;
+ }
+
+ b->yy_is_interactive = 0;
+
+ errno = oerrno;
+}
+
+/** Discard all buffered characters. On the next scan, YY_INPUT will be called.
+ * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
+ *
+ */
+ static void yy_flush_buffer (YY_BUFFER_STATE b )
+{
+ if ( ! b )
+ return;
+
+ b->yy_n_chars = 0;
+
+ /* We always need two end-of-buffer characters. The first causes
+ * a transition to the end-of-buffer state. The second causes
+ * a jam in that state.
+ */
+ b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR;
+ b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR;
+
+ b->yy_buf_pos = &b->yy_ch_buf[0];
+
+ b->yy_at_bol = 1;
+ b->yy_buffer_status = YY_BUFFER_NEW;
+
+ if ( b == YY_CURRENT_BUFFER )
+ yy_load_buffer_state( );
+}
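+
+/* Editor's note, not flex output: the same invariant holds after every
+ * refill -- yy_get_next_buffer() re-appends both sentinels, so a live
+ * buffer always has the shape
+ *
+ *	[ chars 0 .. yy_n_chars-1 ][ EOB ][ EOB ]
+ *
+ * letting the scanner detect exhausted input purely through DFA
+ * transitions, with no bounds check in the matching loop.
+ */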
+
+/** Pushes the new state onto the stack. The new state becomes
+ * the current state. This function will allocate the stack
+ * if necessary.
+ * @param new_buffer The new state.
+ *
+ */
+MY_ATTRIBUTE((unused)) static void yypush_buffer_state (YY_BUFFER_STATE new_buffer )
+{
+ if (new_buffer == NULL)
+ return;
+
+ yyensure_buffer_stack();
+
+ /* This block is copied from yy_switch_to_buffer. */
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *(yy_c_buf_p) = (yy_hold_char);
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p);
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars);
+ }
+
+ /* Only push if top exists. Otherwise, replace top. */
+ if (YY_CURRENT_BUFFER)
+ (yy_buffer_stack_top)++;
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+
+ /* copied from yy_switch_to_buffer. */
+ yy_load_buffer_state( );
+ (yy_did_buffer_switch_on_eof) = 1;
+}
+
+/** Removes and deletes the top of the stack, if present.
+ * The next element becomes the new top.
+ *
+ */
+MY_ATTRIBUTE((unused)) static void yypop_buffer_state (void)
+{
+ if (!YY_CURRENT_BUFFER)
+ return;
+
+ yy_delete_buffer(YY_CURRENT_BUFFER );
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ if ((yy_buffer_stack_top) > 0)
+ --(yy_buffer_stack_top);
+
+ if (YY_CURRENT_BUFFER) {
+ yy_load_buffer_state( );
+ (yy_did_buffer_switch_on_eof) = 1;
+ }
+}
+
+/* Allocates the stack if it does not exist.
+ * Guarantees space for at least one push.
+ */
+static void yyensure_buffer_stack (void)
+{
+ yy_size_t num_to_alloc;
+
+ if (!(yy_buffer_stack)) {
+
+ /* First allocation is just for 2 elements, since we don't know if this
+ * scanner will even need a stack. We use 2 instead of 1 to avoid an
+ * immediate realloc on the next call.
+ */
+ num_to_alloc = 1; /* After all that talk, this was set to 1 anyway. */
+ (yy_buffer_stack) = (struct yy_buffer_state**)yyalloc
+ (num_to_alloc * sizeof(struct yy_buffer_state*)
+ );
+ if ( ! (yy_buffer_stack) )
+ YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+
+ memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*));
+
+ (yy_buffer_stack_max) = num_to_alloc;
+ (yy_buffer_stack_top) = 0;
+ return;
+ }
+
+ if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){
+
+ /* Increase the buffer to prepare for a possible push. */
+ yy_size_t grow_size = 8 /* arbitrary grow size */;
+
+ num_to_alloc = (yy_buffer_stack_max) + grow_size;
+ (yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc
+ ((yy_buffer_stack),
+ num_to_alloc * sizeof(struct yy_buffer_state*)
+ );
+ if ( ! (yy_buffer_stack) )
+ YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+
+ /* zero only the new slots.*/
+ memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*));
+ (yy_buffer_stack_max) = num_to_alloc;
+ }
+}
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+static void yynoreturn yy_fatal_error (const char* msg )
+{
+ fprintf( stderr, "%s\n", msg );
+ exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code. */
+
+#undef yyless
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ yytext[yyleng] = (yy_hold_char); \
+ (yy_c_buf_p) = yytext + yyless_macro_arg; \
+ (yy_hold_char) = *(yy_c_buf_p); \
+ *(yy_c_buf_p) = '\0'; \
+ yyleng = yyless_macro_arg; \
+ } \
+ while ( 0 )
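+
+/* Editor's note: yyless(n) keeps the first n characters of the current
+ * match and pushes the remainder back onto the input, so yyless(0)
+ * rescans the whole token.  The macro restores the byte stashed in
+ * yy_hold_char before rewinding yy_c_buf_p, undoing the NUL terminator
+ * that YY_DO_BEFORE_ACTION wrote into the buffer.
+ */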
+
+/* Accessor methods (get/set functions) to struct members. */
+
+/** Get the current line number.
+ *
+ */
+MY_ATTRIBUTE((unused)) static int yyget_lineno (void)
+{
+
+ return yylineno;
+}
+
+/** Get the input stream.
+ *
+ */
+MY_ATTRIBUTE((unused)) static FILE *yyget_in (void)
+{
+ return yyin;
+}
+
+/** Get the output stream.
+ *
+ */
+MY_ATTRIBUTE((unused)) static FILE *yyget_out (void)
+{
+ return yyout;
+}
+
+/** Get the length of the current token.
+ *
+ */
+MY_ATTRIBUTE((unused)) static int yyget_leng (void)
+{
+ return yyleng;
+}
+
+/** Get the current token.
+ *
+ */
+MY_ATTRIBUTE((unused)) static char *yyget_text (void)
+{
+ return yytext;
+}
+
+/** Set the current line number.
+ * @param _line_number line number
+ *
+ */
+MY_ATTRIBUTE((unused)) static void yyset_lineno (int _line_number )
+{
+
+ yylineno = _line_number;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * @param _in_str A readable stream.
+ *
+ * @see yy_switch_to_buffer
+ */
+MY_ATTRIBUTE((unused)) static void yyset_in (FILE * _in_str )
+{
+ yyin = _in_str ;
+}
+
+MY_ATTRIBUTE((unused)) static void yyset_out (FILE * _out_str )
+{
+ yyout = _out_str ;
+}
+
+MY_ATTRIBUTE((unused)) static int yyget_debug (void)
+{
+ return yy_flex_debug;
+}
+
+MY_ATTRIBUTE((unused)) static void yyset_debug (int _bdebug )
+{
+ yy_flex_debug = _bdebug ;
+}
+
+static int yy_init_globals (void)
+{
+ /* Initialization is the same as for the non-reentrant scanner.
+ * This function is called from yylex_destroy(), so don't allocate here.
+ */
+
+ (yy_buffer_stack) = NULL;
+ (yy_buffer_stack_top) = 0;
+ (yy_buffer_stack_max) = 0;
+ (yy_c_buf_p) = NULL;
+ (yy_init) = 0;
+ (yy_start) = 0;
+
+/* Defined in main.c */
+#ifdef YY_STDINIT
+ yyin = stdin;
+ yyout = stdout;
+#else
+ yyin = NULL;
+ yyout = NULL;
+#endif
+
+ /* For future reference: Set errno on error, since we are called by
+ * yylex_init()
+ */
+ return 0;
+}
+
+/* yylex_destroy is for both reentrant and non-reentrant scanners. */
+MY_ATTRIBUTE((unused)) static int yylex_destroy (void)
+{
+
+ /* Pop the buffer stack, destroying each element. */
+ while(YY_CURRENT_BUFFER){
+ yy_delete_buffer( YY_CURRENT_BUFFER );
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ yypop_buffer_state();
+ }
+
+ /* Destroy the stack itself. */
+ yyfree((yy_buffer_stack) );
+ (yy_buffer_stack) = NULL;
+
+ /* Reset the globals. This is important in a non-reentrant scanner so the next time
+ * yylex() is called, initialization will occur. */
+ yy_init_globals( );
+
+ return 0;
+}
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char* s1, const char * s2, int n )
+{
+
+ int i;
+ for ( i = 0; i < n; ++i )
+ s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (const char * s )
+{
+ int n;
+ for ( n = 0; s[n]; ++n )
+ ;
+
+ return n;
+}
+#endif
+
+static void *yyalloc (yy_size_t size )
+{
+ return malloc(size);
+}
+
+static void *yyrealloc (void * ptr, yy_size_t size )
+{
+
+ /* The cast to (char *) in the following accommodates both
+ * implementations that use char* generic pointers, and those
+ * that use void* generic pointers. It works with the latter
+ * because both ANSI C and C++ allow castless assignment from
+ * any pointer type to void*, and deal with argument conversions
+ * as though doing an assignment.
+ */
+ return realloc(ptr, size);
+}
+
+static void yyfree (void * ptr )
+{
+ free( (char *) ptr ); /* see yyrealloc() for (char *) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#line 603 "pars0lex.l"
+
+
+/**********************************************************************
+Release any resources used by the lexer. */
+void
+pars_lexer_close(void)
+/*==================*/
+{
+ yylex_destroy();
+ free(stringbuf);
+ stringbuf = NULL;
+ stringbuf_len_alloc = stringbuf_len = 0;
+}
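+
+/* Editor's note: yylex_destroy() releases the buffer stack, and
+ * stringbuf is the scanner's only other heap state, so this is safe to
+ * call after the last yylex() invocation; freeing and re-zeroing
+ * stringbuf also makes a repeated close harmless.
+ */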
+
diff --git a/storage/innobase/pars/make_bison.sh b/storage/innobase/pars/make_bison.sh
new file mode 100755
index 00000000..6b3cb693
--- /dev/null
+++ b/storage/innobase/pars/make_bison.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+#
+# Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+#
+# Generate parser files from bison input files.
+
+set -eu
+TMPFILE=pars0grm.tab.c
+OUTFILE=pars0grm.cc
+
+bison -d pars0grm.y
+mv pars0grm.tab.h ../include/pars0grm.h
+
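+# The sed below refers to the output by its new name (fixing the #line
+# directives bison emitted for pars0grm.tab.c) and gives the yychar and
+# yynerrs parser globals internal linkage.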
+sed -e '
+s/'"$TMPFILE"'/'"$OUTFILE"'/;
+s/^\(\(YYSTYPE\|int\) yy\(char\|nerrs\)\)/static \1/;
+' < "$TMPFILE" > "$OUTFILE"
+
+rm "$TMPFILE"
diff --git a/storage/innobase/pars/make_flex.sh b/storage/innobase/pars/make_flex.sh
new file mode 100755
index 00000000..2baae9c9
--- /dev/null
+++ b/storage/innobase/pars/make_flex.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+# Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+# Copyright (c) 2017, 2019, MariaDB Corporation.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+#
+# Generate lexer files from flex input files.
+
+set -eu
+
+TMPFILE=_flex_tmp.cc
+OUTFILE=lexyy.cc
+
+flex -o $TMPFILE pars0lex.l
+
+# The Microsoft compiler needs its includes done in a certain order.
+echo '#include "univ.i"' > $OUTFILE
+
+# flex assigns a pointer to an int in one place without a cast, resulting in
+# a warning on Win64. Add the cast. Also define some symbols as static.
+sed -e '
+s/'"$TMPFILE"'/'"$OUTFILE"'/;
+s/^void *yyset_extra *( *YY_EXTRA_TYPE *user_defined *) *;//
+s/\(int offset = \)\((yy_c_buf_p) - (yytext_ptr)\);/\1(int)(\2);/;
+s/\(void yy\(restart\|_\(delete\|flush\)_buffer\)\)/static \1/;
+s/\(void yy_switch_to_buffer\)/MY_ATTRIBUTE((unused)) static \1/;
+s/\(void yy\(push\|pop\)_buffer_state\)/MY_ATTRIBUTE((unused)) static \1/;
+s/\(YY_BUFFER_STATE yy_create_buffer\)/static \1/;
+s/\(\(int\|void\) yy[gs]et_\)/MY_ATTRIBUTE((unused)) static \1/;
+s/\(void \*\?yy\(\(re\)\?alloc\|free\)\)/static \1/;
+s/extern int yy\(leng\|_flex_debug\|lineno\);//;
+s/\(int yy\(leng\|lineno\|_flex_debug\)\)/static \1/;
+s/\(int yylex_destroy\)/MY_ATTRIBUTE((unused)) static \1/;
+s/^\(\(FILE\|char\) *\* *yyget\)/MY_ATTRIBUTE((unused)) static \1/;
+s/^extern \(\(FILE\|char\) *\* *yy\).*//;
+s/^\(FILE\|char\) *\* *yy/static &/;
+' < $TMPFILE >> $OUTFILE
+
+rm $TMPFILE
diff --git a/storage/innobase/pars/pars0grm.cc b/storage/innobase/pars/pars0grm.cc
new file mode 100644
index 00000000..7e10a783
--- /dev/null
+++ b/storage/innobase/pars/pars0grm.cc
@@ -0,0 +1,2616 @@
+/* A Bison parser, made by GNU Bison 3.4.2. */
+
+/* Bison implementation for Yacc-like parsers in C
+
+ Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2019 Free Software Foundation,
+ Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* As a special exception, you may create a larger work that contains
+ part or all of the Bison parser skeleton and distribute that work
+ under terms of your choice, so long as that work isn't itself a
+ parser generator using the skeleton or a modified version thereof
+ as a parser skeleton. Alternatively, if you modify or redistribute
+ the parser skeleton itself, you may (at your option) remove this
+ special exception, which will cause the skeleton and the resulting
+ Bison output files to be licensed under the GNU General Public
+ License without this special exception.
+
+ This special exception was added by the Free Software Foundation in
+ version 2.2 of Bison. */
+
+/* C LALR(1) parser skeleton written by Richard Stallman, by
+ simplifying the original so-called "semantic" parser. */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+ infringing on user name space. This should be done even for local
+ variables, as they might otherwise be expanded by user macros.
+ There are some unavoidable exceptions within include files to
+ define necessary library symbols; they are noted "INFRINGES ON
+ USER NAME SPACE" below. */
+
+/* Undocumented macros, especially those whose name start with YY_,
+ are private implementation details. Do not rely on them. */
+
+/* Identify Bison output. */
+#define YYBISON 1
+
+/* Bison version. */
+#define YYBISON_VERSION "3.4.2"
+
+/* Skeleton name. */
+#define YYSKELETON_NAME "yacc.c"
+
+/* Pure parsers. */
+#define YYPURE 0
+
+/* Push parsers. */
+#define YYPUSH 0
+
+/* Pull parsers. */
+#define YYPULL 1
+
+
+
+
+/* First part of user prologue. */
+#line 29 "pars0grm.y"
+
+/* The value of the semantic attribute is a pointer to a query tree
+node (que_node_t). */
+
+#include "univ.i"
+#include <math.h>
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "que0types.h"
+#include "que0que.h"
+#include "row0sel.h"
+
+#define YYSTYPE que_node_t*
+
+/* #define __STDC__ */
+int
+yylex(void);
+
+#line 89 "pars0grm.cc"
+
+# ifndef YY_NULLPTR
+# if defined __cplusplus
+# if 201103L <= __cplusplus
+# define YY_NULLPTR nullptr
+# else
+# define YY_NULLPTR 0
+# endif
+# else
+# define YY_NULLPTR ((void*)0)
+# endif
+# endif
+
+/* Enabling verbose error messages. */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+/* Use api.header.include to #include this header
+ instead of duplicating it here. */
+#ifndef YY_YY_PARS0GRM_TAB_H_INCLUDED
+# define YY_YY_PARS0GRM_TAB_H_INCLUDED
+/* Debug traces. */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+#if YYDEBUG
+extern int yydebug;
+#endif
+
+/* Token type. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+ enum yytokentype
+ {
+ PARS_INT_LIT = 258,
+ PARS_FLOAT_LIT = 259,
+ PARS_STR_LIT = 260,
+ PARS_NULL_LIT = 261,
+ PARS_ID_TOKEN = 262,
+ PARS_AND_TOKEN = 263,
+ PARS_OR_TOKEN = 264,
+ PARS_NOT_TOKEN = 265,
+ PARS_GE_TOKEN = 266,
+ PARS_LE_TOKEN = 267,
+ PARS_NE_TOKEN = 268,
+ PARS_PROCEDURE_TOKEN = 269,
+ PARS_IN_TOKEN = 270,
+ PARS_INT_TOKEN = 271,
+ PARS_CHAR_TOKEN = 272,
+ PARS_IS_TOKEN = 273,
+ PARS_BEGIN_TOKEN = 274,
+ PARS_END_TOKEN = 275,
+ PARS_IF_TOKEN = 276,
+ PARS_THEN_TOKEN = 277,
+ PARS_ELSE_TOKEN = 278,
+ PARS_ELSIF_TOKEN = 279,
+ PARS_LOOP_TOKEN = 280,
+ PARS_WHILE_TOKEN = 281,
+ PARS_RETURN_TOKEN = 282,
+ PARS_SELECT_TOKEN = 283,
+ PARS_COUNT_TOKEN = 284,
+ PARS_FROM_TOKEN = 285,
+ PARS_WHERE_TOKEN = 286,
+ PARS_FOR_TOKEN = 287,
+ PARS_DDOT_TOKEN = 288,
+ PARS_ORDER_TOKEN = 289,
+ PARS_BY_TOKEN = 290,
+ PARS_ASC_TOKEN = 291,
+ PARS_DESC_TOKEN = 292,
+ PARS_INSERT_TOKEN = 293,
+ PARS_INTO_TOKEN = 294,
+ PARS_VALUES_TOKEN = 295,
+ PARS_UPDATE_TOKEN = 296,
+ PARS_SET_TOKEN = 297,
+ PARS_DELETE_TOKEN = 298,
+ PARS_CURRENT_TOKEN = 299,
+ PARS_OF_TOKEN = 300,
+ PARS_CREATE_TOKEN = 301,
+ PARS_TABLE_TOKEN = 302,
+ PARS_INDEX_TOKEN = 303,
+ PARS_UNIQUE_TOKEN = 304,
+ PARS_CLUSTERED_TOKEN = 305,
+ PARS_ON_TOKEN = 306,
+ PARS_ASSIGN_TOKEN = 307,
+ PARS_DECLARE_TOKEN = 308,
+ PARS_CURSOR_TOKEN = 309,
+ PARS_SQL_TOKEN = 310,
+ PARS_OPEN_TOKEN = 311,
+ PARS_FETCH_TOKEN = 312,
+ PARS_CLOSE_TOKEN = 313,
+ PARS_NOTFOUND_TOKEN = 314,
+ PARS_TO_BINARY_TOKEN = 315,
+ PARS_SUBSTR_TOKEN = 316,
+ PARS_CONCAT_TOKEN = 317,
+ PARS_INSTR_TOKEN = 318,
+ PARS_LENGTH_TOKEN = 319,
+ PARS_COMMIT_TOKEN = 320,
+ PARS_ROLLBACK_TOKEN = 321,
+ PARS_WORK_TOKEN = 322,
+ PARS_EXIT_TOKEN = 323,
+ PARS_FUNCTION_TOKEN = 324,
+ PARS_LOCK_TOKEN = 325,
+ PARS_SHARE_TOKEN = 326,
+ PARS_MODE_TOKEN = 327,
+ PARS_LIKE_TOKEN = 328,
+ PARS_LIKE_TOKEN_EXACT = 329,
+ PARS_LIKE_TOKEN_PREFIX = 330,
+ PARS_LIKE_TOKEN_SUFFIX = 331,
+ PARS_LIKE_TOKEN_SUBSTR = 332,
+ PARS_TABLE_NAME_TOKEN = 333,
+ PARS_BIGINT_TOKEN = 334,
+ NEG = 335
+ };
+#endif
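+
+/* Editor's note: these token numbers are what the generated lexer in
+ * lexyy.cc returns (its rules shown earlier return PARS_UNIQUE_TOKEN
+ * through PARS_BIGINT_TOKEN, i.e. 304..334), which is why make_flex.sh
+ * and make_bison.sh must regenerate both files from the same grammar
+ * header.
+ */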
+
+/* Value type. */
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef int YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+extern YYSTYPE yylval;
+
+int yyparse (void);
+
+#endif /* !YY_YY_PARS0GRM_TAB_H_INCLUDED */
+
+
+
+#ifdef short
+# undef short
+#endif
+
+#ifdef YYTYPE_UINT8
+typedef YYTYPE_UINT8 yytype_uint8;
+#else
+typedef unsigned char yytype_uint8;
+#endif
+
+#ifdef YYTYPE_INT8
+typedef YYTYPE_INT8 yytype_int8;
+#else
+typedef signed char yytype_int8;
+#endif
+
+#ifdef YYTYPE_UINT16
+typedef YYTYPE_UINT16 yytype_uint16;
+#else
+typedef unsigned short yytype_uint16;
+#endif
+
+#ifdef YYTYPE_INT16
+typedef YYTYPE_INT16 yytype_int16;
+#else
+typedef short yytype_int16;
+#endif
+
+#ifndef YYSIZE_T
+# ifdef __SIZE_TYPE__
+# define YYSIZE_T __SIZE_TYPE__
+# elif defined size_t
+# define YYSIZE_T size_t
+# elif ! defined YYSIZE_T
+# include <stddef.h> /* INFRINGES ON USER NAME SPACE */
+# define YYSIZE_T size_t
+# else
+# define YYSIZE_T unsigned
+# endif
+#endif
+
+#define YYSIZE_MAXIMUM ((YYSIZE_T) -1)
+
+#ifndef YY_
+# if defined YYENABLE_NLS && YYENABLE_NLS
+# if ENABLE_NLS
+# include <libintl.h> /* INFRINGES ON USER NAME SPACE */
+# define YY_(Msgid) dgettext ("bison-runtime", Msgid)
+# endif
+# endif
+# ifndef YY_
+# define YY_(Msgid) Msgid
+# endif
+#endif
+
+#ifndef YY_ATTRIBUTE
+# if (defined __GNUC__ \
+ && (2 < __GNUC__ || (__GNUC__ == 2 && 96 <= __GNUC_MINOR__))) \
+ || defined __SUNPRO_C && 0x5110 <= __SUNPRO_C
+# define YY_ATTRIBUTE(Spec) __attribute__(Spec)
+# else
+# define YY_ATTRIBUTE(Spec) /* empty */
+# endif
+#endif
+
+#ifndef YY_ATTRIBUTE_PURE
+# define YY_ATTRIBUTE_PURE YY_ATTRIBUTE ((__pure__))
+#endif
+
+#ifndef YY_ATTRIBUTE_UNUSED
+# define YY_ATTRIBUTE_UNUSED YY_ATTRIBUTE ((__unused__))
+#endif
+
+/* Suppress unused-variable warnings by "using" E. */
+#if ! defined lint || defined __GNUC__
+# define YYUSE(E) ((void) (E))
+#else
+# define YYUSE(E) /* empty */
+#endif
+
+#if defined __GNUC__ && ! defined __ICC && 407 <= __GNUC__ * 100 + __GNUC_MINOR__
+/* Suppress an incorrect diagnostic about yylval being uninitialized. */
+# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN \
+ _Pragma ("GCC diagnostic push") \
+ _Pragma ("GCC diagnostic ignored \"-Wuninitialized\"")\
+ _Pragma ("GCC diagnostic ignored \"-Wmaybe-uninitialized\"")
+# define YY_IGNORE_MAYBE_UNINITIALIZED_END \
+ _Pragma ("GCC diagnostic pop")
+#else
+# define YY_INITIAL_VALUE(Value) Value
+#endif
+#ifndef YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
+# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
+# define YY_IGNORE_MAYBE_UNINITIALIZED_END
+#endif
+#ifndef YY_INITIAL_VALUE
+# define YY_INITIAL_VALUE(Value) /* Nothing. */
+#endif
+
+
+#define YY_ASSERT(E) ((void) (0 && (E)))
+
+#if ! defined yyoverflow || YYERROR_VERBOSE
+
+/* The parser invokes alloca or malloc; define the necessary symbols. */
+
+# ifdef YYSTACK_USE_ALLOCA
+# if YYSTACK_USE_ALLOCA
+# ifdef __GNUC__
+# define YYSTACK_ALLOC __builtin_alloca
+# elif defined __BUILTIN_VA_ARG_INCR
+# include <alloca.h> /* INFRINGES ON USER NAME SPACE */
+# elif defined _AIX
+# define YYSTACK_ALLOC __alloca
+# elif defined _MSC_VER
+# include <malloc.h> /* INFRINGES ON USER NAME SPACE */
+# define alloca _alloca
+# else
+# define YYSTACK_ALLOC alloca
+# if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS
+# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+ /* Use EXIT_SUCCESS as a witness for stdlib.h. */
+# ifndef EXIT_SUCCESS
+# define EXIT_SUCCESS 0
+# endif
+# endif
+# endif
+# endif
+# endif
+
+# ifdef YYSTACK_ALLOC
+ /* Pacify GCC's 'empty if-body' warning. */
+# define YYSTACK_FREE(Ptr) do { /* empty */; } while (0)
+# ifndef YYSTACK_ALLOC_MAXIMUM
+ /* The OS might guarantee only one guard page at the bottom of the stack,
+ and a page size can be as small as 4096 bytes. So we cannot safely
+ invoke alloca (N) if N exceeds 4096. Use a slightly smaller number
+ to allow for a few compiler-allocated temporary stack slots. */
+# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */
+# endif
+# else
+# define YYSTACK_ALLOC YYMALLOC
+# define YYSTACK_FREE YYFREE
+# ifndef YYSTACK_ALLOC_MAXIMUM
+# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM
+# endif
+# if (defined __cplusplus && ! defined EXIT_SUCCESS \
+ && ! ((defined YYMALLOC || defined malloc) \
+ && (defined YYFREE || defined free)))
+# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+# ifndef EXIT_SUCCESS
+# define EXIT_SUCCESS 0
+# endif
+# endif
+# ifndef YYMALLOC
+# define YYMALLOC malloc
+# if ! defined malloc && ! defined EXIT_SUCCESS
+void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */
+# endif
+# endif
+# ifndef YYFREE
+# define YYFREE free
+# if ! defined free && ! defined EXIT_SUCCESS
+void free (void *); /* INFRINGES ON USER NAME SPACE */
+# endif
+# endif
+# endif
+#endif /* ! defined yyoverflow || YYERROR_VERBOSE */
+
+
+#if (! defined yyoverflow \
+ && (! defined __cplusplus \
+ || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL)))
+
+/* A type that is properly aligned for any stack member. */
+union yyalloc
+{
+ yytype_int16 yyss_alloc;
+ YYSTYPE yyvs_alloc;
+};
+
+/* The size of the maximum gap between one aligned stack and the next. */
+# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
+
+/* The size of an array large enough to hold all stacks, each with
+ N elements. */
+# define YYSTACK_BYTES(N) \
+ ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \
+ + YYSTACK_GAP_MAXIMUM)
+
+# define YYCOPY_NEEDED 1
+
+/* Relocate STACK from its old location to the new one. The
+ local variables YYSIZE and YYSTACKSIZE give the old and new number of
+ elements in the stack, and YYPTR gives the new location of the
+ stack. Advance YYPTR to a properly aligned location for the next
+ stack. */
+# define YYSTACK_RELOCATE(Stack_alloc, Stack) \
+ do \
+ { \
+ YYSIZE_T yynewbytes; \
+ YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \
+ Stack = &yyptr->Stack_alloc; \
+ yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
+ yyptr += yynewbytes / sizeof (*yyptr); \
+ } \
+ while (0)
+
+#endif
+
+#if defined YYCOPY_NEEDED && YYCOPY_NEEDED
+/* Copy COUNT objects from SRC to DST. The source and destination do
+ not overlap. */
+# ifndef YYCOPY
+# if defined __GNUC__ && 1 < __GNUC__
+# define YYCOPY(Dst, Src, Count) \
+ __builtin_memcpy (Dst, Src, (Count) * sizeof (*(Src)))
+# else
+# define YYCOPY(Dst, Src, Count) \
+ do \
+ { \
+ YYSIZE_T yyi; \
+ for (yyi = 0; yyi < (Count); yyi++) \
+ (Dst)[yyi] = (Src)[yyi]; \
+ } \
+ while (0)
+# endif
+# endif
+#endif /* !YYCOPY_NEEDED */
+
+/* YYFINAL -- State number of the termination state. */
+#define YYFINAL 5
+/* YYLAST -- Last index in YYTABLE. */
+#define YYLAST 603
+
+/* YYNTOKENS -- Number of terminals. */
+#define YYNTOKENS 96
+/* YYNNTS -- Number of nonterminals. */
+#define YYNNTS 64
+/* YYNRULES -- Number of rules. */
+#define YYNRULES 150
+/* YYNSTATES -- Number of states. */
+#define YYNSTATES 300
+
+#define YYUNDEFTOK 2
+#define YYMAXUTOK 335
+
+/* YYTRANSLATE(TOKEN-NUM) -- Symbol number corresponding to TOKEN-NUM
+ as returned by yylex, with out-of-bounds checking. */
+#define YYTRANSLATE(YYX) \
+ ((unsigned) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+
+/* YYTRANSLATE[TOKEN-NUM] -- Symbol number corresponding to TOKEN-NUM
+ as returned by yylex. */
+static const yytype_uint8 yytranslate[] =
+{
+ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 88, 2, 2,
+ 90, 91, 85, 84, 93, 83, 2, 86, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 89,
+ 81, 80, 82, 92, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 94, 2, 95, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 1, 2, 3, 4,
+ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 87
+};
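+
+/* Editor's note: yytranslate[] maps the raw token values returned by
+ * yylex() to bison's internal symbol numbers; for example the
+ * single-character token '=' (ASCII 61) translates to symbol 80, the
+ * index of "'='" in yytname[] below.
+ */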
+
+#if YYDEBUG
+ /* YYRLINE[YYN] -- Source line where rule number YYN was defined. */
+static const yytype_uint16 yyrline[] =
+{
+ 0, 140, 140, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+ 160, 161, 162, 166, 167, 172, 173, 175, 176, 177,
+ 178, 179, 180, 181, 182, 183, 184, 185, 186, 187,
+ 189, 190, 191, 192, 193, 194, 195, 196, 197, 199,
+ 204, 205, 206, 207, 208, 211, 213, 214, 218, 224,
+ 228, 229, 234, 235, 236, 241, 242, 243, 247, 248,
+ 256, 257, 258, 263, 265, 268, 272, 273, 277, 278,
+ 283, 284, 289, 290, 291, 295, 296, 303, 318, 323,
+ 326, 334, 340, 341, 346, 352, 361, 369, 377, 384,
+ 392, 400, 407, 413, 414, 419, 420, 422, 426, 433,
+ 439, 449, 453, 457, 464, 471, 475, 483, 492, 493,
+ 498, 499, 504, 505, 511, 519, 520, 525, 526, 530,
+ 531, 535, 549, 550, 554, 559, 564, 565, 566, 570,
+ 576, 578, 579, 583, 591, 597, 598, 601, 603, 604,
+ 608
+};
+#endif
+
+#if YYDEBUG || YYERROR_VERBOSE || 0
+/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
+ First, the terminals, then, starting at YYNTOKENS, nonterminals. */
+static const char *const yytname[] =
+{
+ "$end", "error", "$undefined", "PARS_INT_LIT", "PARS_FLOAT_LIT",
+ "PARS_STR_LIT", "PARS_NULL_LIT", "PARS_ID_TOKEN", "PARS_AND_TOKEN",
+ "PARS_OR_TOKEN", "PARS_NOT_TOKEN", "PARS_GE_TOKEN", "PARS_LE_TOKEN",
+ "PARS_NE_TOKEN", "PARS_PROCEDURE_TOKEN", "PARS_IN_TOKEN",
+ "PARS_INT_TOKEN", "PARS_CHAR_TOKEN", "PARS_IS_TOKEN", "PARS_BEGIN_TOKEN",
+ "PARS_END_TOKEN", "PARS_IF_TOKEN", "PARS_THEN_TOKEN", "PARS_ELSE_TOKEN",
+ "PARS_ELSIF_TOKEN", "PARS_LOOP_TOKEN", "PARS_WHILE_TOKEN",
+ "PARS_RETURN_TOKEN", "PARS_SELECT_TOKEN", "PARS_COUNT_TOKEN",
+ "PARS_FROM_TOKEN", "PARS_WHERE_TOKEN", "PARS_FOR_TOKEN",
+ "PARS_DDOT_TOKEN", "PARS_ORDER_TOKEN", "PARS_BY_TOKEN", "PARS_ASC_TOKEN",
+ "PARS_DESC_TOKEN", "PARS_INSERT_TOKEN", "PARS_INTO_TOKEN",
+ "PARS_VALUES_TOKEN", "PARS_UPDATE_TOKEN", "PARS_SET_TOKEN",
+ "PARS_DELETE_TOKEN", "PARS_CURRENT_TOKEN", "PARS_OF_TOKEN",
+ "PARS_CREATE_TOKEN", "PARS_TABLE_TOKEN", "PARS_INDEX_TOKEN",
+ "PARS_UNIQUE_TOKEN", "PARS_CLUSTERED_TOKEN", "PARS_ON_TOKEN",
+ "PARS_ASSIGN_TOKEN", "PARS_DECLARE_TOKEN", "PARS_CURSOR_TOKEN",
+ "PARS_SQL_TOKEN", "PARS_OPEN_TOKEN", "PARS_FETCH_TOKEN",
+ "PARS_CLOSE_TOKEN", "PARS_NOTFOUND_TOKEN", "PARS_TO_BINARY_TOKEN",
+ "PARS_SUBSTR_TOKEN", "PARS_CONCAT_TOKEN", "PARS_INSTR_TOKEN",
+ "PARS_LENGTH_TOKEN", "PARS_COMMIT_TOKEN", "PARS_ROLLBACK_TOKEN",
+ "PARS_WORK_TOKEN", "PARS_EXIT_TOKEN", "PARS_FUNCTION_TOKEN",
+ "PARS_LOCK_TOKEN", "PARS_SHARE_TOKEN", "PARS_MODE_TOKEN",
+ "PARS_LIKE_TOKEN", "PARS_LIKE_TOKEN_EXACT", "PARS_LIKE_TOKEN_PREFIX",
+ "PARS_LIKE_TOKEN_SUFFIX", "PARS_LIKE_TOKEN_SUBSTR",
+ "PARS_TABLE_NAME_TOKEN", "PARS_BIGINT_TOKEN", "'='", "'<'", "'>'", "'-'",
+ "'+'", "'*'", "'/'", "NEG", "'%'", "';'", "'('", "')'", "'?'", "','",
+ "'{'", "'}'", "$accept", "top_statement", "statement", "statement_list",
+ "exp", "function_name", "question_mark_list", "stored_procedure_call",
+ "user_function_call", "table_list", "variable_list", "exp_list",
+ "select_item", "select_item_list", "select_list", "search_condition",
+ "for_update_clause", "lock_shared_clause", "order_direction",
+ "order_by_clause", "select_statement", "insert_statement_start",
+ "insert_statement", "column_assignment", "column_assignment_list",
+ "cursor_positioned", "update_statement_start",
+ "update_statement_searched", "update_statement_positioned",
+ "delete_statement_start", "delete_statement_searched",
+ "delete_statement_positioned", "assignment_statement", "elsif_element",
+ "elsif_list", "else_part", "if_statement", "while_statement",
+ "for_statement", "exit_statement", "return_statement",
+ "open_cursor_statement", "close_cursor_statement", "fetch_statement",
+ "column_def", "column_def_list", "opt_column_len", "opt_not_null",
+ "create_table", "column_list", "unique_def", "clustered_def",
+ "create_index", "table_name", "commit_statement", "rollback_statement",
+ "type_name", "variable_declaration", "variable_declaration_list",
+ "cursor_declaration", "function_declaration", "declaration",
+ "declaration_list", "procedure_definition", YY_NULLPTR
+};
+#endif
+
+# ifdef YYPRINT
+/* YYTOKNUM[NUM] -- (External) token number corresponding to the
+ (internal) symbol number NUM (which must be that of a token). */
+static const yytype_uint16 yytoknum[] =
+{
+ 0, 256, 257, 258, 259, 260, 261, 262, 263, 264,
+ 265, 266, 267, 268, 269, 270, 271, 272, 273, 274,
+ 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294,
+ 295, 296, 297, 298, 299, 300, 301, 302, 303, 304,
+ 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
+ 325, 326, 327, 328, 329, 330, 331, 332, 333, 334,
+ 61, 60, 62, 45, 43, 42, 47, 335, 37, 59,
+ 40, 41, 63, 44, 123, 125
+};
+# endif
+
+#define YYPACT_NINF -129
+
+#define yypact_value_is_default(Yystate) \
+ (!!((Yystate) == (-129)))
+
+#define YYTABLE_NINF -1
+
+#define yytable_value_is_error(Yytable_value) \
+ 0
+
+ /* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
+ STATE-NUM. */
+static const yytype_int16 yypact[] =
+{
+ 5, 34, 46, -28, -41, -129, -129, -12, 45, 57,
+ 23, -129, 9, -129, -129, -129, 20, -9, -129, -129,
+ -129, -129, 2, -129, 83, 87, 278, -129, 93, 28,
+ 71, 427, 427, -129, 335, 105, 85, -1, 104, -27,
+ 129, 132, 133, 76, 77, -129, 141, -129, 149, -129,
+ 61, 19, 62, 118, 65, 66, 118, 68, 69, 70,
+ 72, 73, 74, 75, 78, 79, 82, 84, 89, 90,
+ 91, 94, 138, -129, 427, -129, -129, -129, -129, 86,
+ 427, 96, -129, -129, -129, -129, -129, 427, 427, 438,
+ 92, 454, 95, -129, 1, -129, -24, 130, 157, -1,
+ -129, -129, 144, -1, -1, -129, 139, -129, 154, -129,
+ -129, -129, 98, -129, -129, -129, 108, -129, -129, 345,
+ -129, -129, -129, -129, -129, -129, -129, -129, -129, -129,
+ -129, -129, -129, -129, -129, -129, -129, -129, -129, -129,
+ -129, 112, 1, 135, 285, 143, -8, 15, 427, 427,
+ 427, 427, 427, 278, 203, 427, 427, 427, 427, 427,
+ 427, 427, 427, 278, 124, 204, 381, -1, 427, -129,
+ 209, -129, 120, -129, 173, 215, 131, 427, 180, 1,
+ -129, -129, -129, -129, 285, 285, 30, 30, 1, 10,
+ -129, 30, 30, 30, 60, 60, -8, -8, 1, -39,
+ 192, 137, -129, 136, -129, -13, -129, 472, 146, -129,
+ 147, 225, 227, 151, -129, 136, -129, -21, 0, 229,
+ 278, 427, -129, 213, 219, -129, 427, 220, -129, 237,
+ 427, -1, 214, 427, 427, 209, 23, -129, 14, 196,
+ 160, 158, 162, -129, -129, 278, 486, -129, 231, 1,
+ -129, -129, -129, 218, 194, 517, 1, -129, 175, -129,
+ 225, -1, -129, -129, -129, 278, -129, -129, 251, 234,
+ 278, 266, 260, -129, 181, 278, 201, 239, -129, 235,
+ 184, 271, -129, 272, 208, 275, 258, -129, -129, -129,
+ 17, -129, -7, -129, -129, 277, -129, -129, -129, -129
+};
+
+ /* YYDEFACT[STATE-NUM] -- Default reduction number in state STATE-NUM.
+ Performed when YYTABLE does not specify something else to do. Zero
+ means the default is an error. */
+static const yytype_uint8 yydefact[] =
+{
+ 0, 0, 0, 0, 0, 1, 2, 0, 0, 140,
+ 0, 141, 147, 136, 138, 137, 0, 0, 142, 145,
+ 146, 148, 0, 139, 0, 0, 0, 149, 0, 0,
+ 0, 0, 0, 112, 70, 0, 0, 0, 0, 127,
+ 0, 0, 0, 0, 0, 111, 0, 23, 0, 3,
+ 0, 0, 0, 76, 0, 0, 76, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 144, 0, 27, 28, 29, 30, 25,
+ 0, 31, 50, 51, 52, 53, 54, 0, 0, 0,
+ 0, 0, 0, 73, 68, 71, 75, 0, 0, 0,
+ 132, 133, 0, 0, 0, 128, 129, 113, 0, 114,
+ 134, 135, 0, 150, 24, 10, 0, 90, 11, 0,
+ 96, 97, 14, 15, 99, 100, 12, 13, 9, 7,
+ 4, 5, 6, 8, 16, 18, 17, 21, 22, 19,
+ 20, 0, 101, 0, 47, 0, 36, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 65, 0, 0, 62, 0, 0, 0, 88,
+ 0, 98, 0, 130, 0, 62, 55, 65, 0, 77,
+ 143, 48, 49, 37, 45, 46, 42, 43, 44, 105,
+ 39, 38, 40, 41, 33, 32, 34, 35, 66, 0,
+ 0, 0, 63, 74, 72, 76, 60, 0, 0, 92,
+ 95, 0, 0, 63, 116, 115, 56, 0, 0, 0,
+ 0, 0, 103, 107, 0, 26, 0, 0, 69, 0,
+ 0, 0, 78, 0, 0, 0, 0, 118, 0, 0,
+ 0, 0, 0, 89, 94, 106, 0, 104, 0, 67,
+ 109, 64, 61, 0, 80, 0, 91, 93, 120, 124,
+ 0, 0, 59, 58, 57, 0, 108, 79, 0, 85,
+ 0, 0, 122, 119, 0, 102, 0, 0, 87, 0,
+ 0, 0, 117, 0, 0, 0, 0, 121, 123, 125,
+ 0, 81, 82, 110, 131, 0, 83, 84, 86, 126
+};
+
+ /* YYPGOTO[NTERM-NUM]. */
+static const yytype_int16 yypgoto[] =
+{
+ -129, -129, -48, -128, -30, -129, -129, -129, -129, -129,
+ 113, 110, 123, -129, -129, -52, -129, -129, -129, -129,
+ -40, -129, -129, 55, -129, 238, -129, -129, -129, -129,
+ -129, -129, -129, 88, -129, -129, -129, -129, -129, -129,
+ -129, -129, -129, -129, 35, -129, -129, -129, -129, -129,
+ -129, -129, -129, -96, -129, -129, 81, 290, -129, -129,
+ -129, 286, -129, -129
+};
+
+ /* YYDEFGOTO[NTERM-NUM]. */
+static const yytype_int16 yydefgoto[] =
+{
+ -1, 2, 47, 48, 94, 90, 217, 49, 214, 205,
+ 203, 199, 95, 96, 97, 120, 254, 269, 298, 278,
+ 50, 51, 52, 209, 210, 121, 53, 54, 55, 56,
+ 57, 58, 59, 222, 223, 224, 60, 61, 62, 63,
+ 64, 65, 66, 67, 237, 238, 272, 282, 68, 290,
+ 106, 174, 69, 102, 70, 71, 16, 11, 12, 19,
+ 20, 21, 22, 3
+};
+
+ /* YYTABLE[YYPACT[STATE-NUM]] -- What to do in state STATE-NUM. If
+ positive, shift that token. If negative, reduce the rule whose
+ number is the opposite. If YYTABLE_NINF, syntax error. */
+static const yytype_uint16 yytable[] =
+{
+ 114, 89, 91, 169, 124, 152, 100, 171, 172, 148,
+ 149, 117, 150, 151, 152, 165, 10, 30, 230, 1,
+ 104, 26, 105, 148, 149, 189, 150, 151, 152, 296,
+ 297, 31, 141, 220, 221, 200, 32, 33, 34, 13,
+ 14, 4, 35, 152, 142, 24, 5, 34, 36, 7,
+ 144, 37, 225, 38, 226, 17, 39, 146, 147, 116,
+ 25, 6, 17, 9, 10, 154, 40, 41, 42, 166,
+ 241, 206, 242, 152, 154, 43, 44, 101, 45, 8,
+ 231, 155, 156, 157, 158, 159, 160, 161, 154, 179,
+ 28, 243, 245, 226, 29, 155, 156, 157, 158, 159,
+ 160, 161, 15, 154, 46, 259, 183, 260, 294, 23,
+ 295, 72, 98, 158, 159, 160, 161, 73, 184, 185,
+ 186, 187, 188, 74, 99, 191, 192, 193, 194, 195,
+ 196, 197, 198, 154, 103, 252, 107, 275, 207, 108,
+ 109, 114, 279, 110, 111, 160, 161, 198, 112, 119,
+ 115, 118, 114, 232, 122, 123, 30, 126, 127, 128,
+ 167, 129, 130, 131, 132, 274, 34, 133, 134, 113,
+ 31, 135, 168, 136, 143, 32, 33, 34, 137, 138,
+ 139, 35, 162, 140, 145, 164, 170, 36, 176, 173,
+ 37, 246, 38, 175, 181, 39, 249, 114, 177, 30,
+ 179, 180, 182, 255, 256, 40, 41, 42, 190, 201,
+ 211, 202, 227, 31, 43, 44, 208, 45, 32, 33,
+ 34, 212, 213, 216, 35, 219, 234, 114, 228, 229,
+ 36, 114, 236, 37, 239, 38, 244, 221, 39, 248,
+ 235, 240, 30, 46, 251, 250, 253, 261, 40, 41,
+ 42, 262, 266, 263, 264, 286, 31, 43, 44, 267,
+ 45, 32, 33, 34, 268, 271, 276, 35, 277, 280,
+ 281, 283, 284, 36, 285, 287, 37, 288, 38, 289,
+ 291, 39, 292, 293, 299, 30, 46, 218, 215, 204,
+ 257, 40, 41, 42, 125, 273, 150, 151, 152, 31,
+ 43, 44, 18, 45, 32, 33, 34, 0, 27, 0,
+ 35, 247, 0, 0, 0, 0, 36, 258, 0, 37,
+ 0, 38, 0, 0, 39, 0, 0, 0, 0, 46,
+ 0, 0, 0, 0, 40, 41, 42, 0, 75, 76,
+ 77, 78, 79, 43, 44, 80, 45, 0, 75, 76,
+ 77, 78, 79, 0, 0, 80, 0, 0, 154, 0,
+ 0, 0, 0, 0, 92, 155, 156, 157, 158, 159,
+ 160, 161, 46, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 75, 76, 77, 78, 79, 178,
+ 81, 80, 0, 0, 0, 82, 83, 84, 85, 86,
+ 81, 0, 0, 0, 0, 82, 83, 84, 85, 86,
+ 92, 0, 0, 0, 0, 0, 0, 0, 87, 0,
+ 93, 0, 0, 0, 0, 88, 0, 0, 87, 0,
+ 75, 76, 77, 78, 79, 88, 81, 80, 0, 0,
+ 0, 82, 83, 84, 85, 86, 148, 149, 0, 150,
+ 151, 152, 0, 0, 0, 0, 0, 0, 0, 0,
+ 153, 0, 148, 149, 87, 150, 151, 152, 0, 0,
+ 0, 88, 0, 0, 0, 0, 0, 0, 0, 163,
+ 148, 149, 81, 150, 151, 152, 0, 82, 83, 84,
+ 85, 86, 0, 0, 148, 149, 0, 150, 151, 152,
+ 0, 0, 0, 0, 0, 233, 0, 0, 265, 0,
+ 87, 154, 0, 0, 0, 0, 0, 88, 155, 156,
+ 157, 158, 159, 160, 161, 148, 149, 154, 150, 151,
+ 152, 0, 0, 0, 155, 156, 157, 158, 159, 160,
+ 161, 0, 270, 0, 0, 154, 0, 0, 0, 0,
+ 0, 0, 155, 156, 157, 158, 159, 160, 161, 154,
+ 0, 0, 0, 0, 0, 0, 155, 156, 157, 158,
+ 159, 160, 161, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 154, 0, 0, 0, 0, 0, 0, 155, 156, 157,
+ 158, 159, 160, 161
+};
+
+static const yytype_int16 yycheck[] =
+{
+ 48, 31, 32, 99, 56, 13, 7, 103, 104, 8,
+ 9, 51, 11, 12, 13, 39, 7, 7, 31, 14,
+ 47, 19, 49, 8, 9, 153, 11, 12, 13, 36,
+ 37, 21, 72, 23, 24, 163, 26, 27, 28, 16,
+ 17, 7, 32, 13, 74, 54, 0, 28, 38, 90,
+ 80, 41, 91, 43, 93, 53, 46, 87, 88, 40,
+ 69, 89, 53, 18, 7, 73, 56, 57, 58, 93,
+ 91, 167, 93, 13, 73, 65, 66, 78, 68, 91,
+ 93, 80, 81, 82, 83, 84, 85, 86, 73, 119,
+ 7, 91, 220, 93, 7, 80, 81, 82, 83, 84,
+ 85, 86, 79, 73, 94, 91, 91, 93, 91, 89,
+ 93, 18, 7, 83, 84, 85, 86, 89, 148, 149,
+ 150, 151, 152, 52, 39, 155, 156, 157, 158, 159,
+ 160, 161, 162, 73, 30, 231, 7, 265, 168, 7,
+ 7, 189, 270, 67, 67, 85, 86, 177, 7, 31,
+ 89, 89, 200, 205, 89, 89, 7, 89, 89, 89,
+ 30, 89, 89, 89, 89, 261, 28, 89, 89, 20,
+ 21, 89, 15, 89, 88, 26, 27, 28, 89, 89,
+ 89, 32, 90, 89, 88, 90, 42, 38, 90, 50,
+ 41, 221, 43, 39, 59, 46, 226, 245, 90, 7,
+ 230, 89, 59, 233, 234, 56, 57, 58, 5, 85,
+ 90, 7, 20, 21, 65, 66, 7, 68, 26, 27,
+ 28, 48, 7, 92, 32, 45, 80, 275, 91, 93,
+ 38, 279, 7, 41, 7, 43, 7, 24, 46, 20,
+ 93, 90, 7, 94, 7, 25, 32, 51, 56, 57,
+ 58, 91, 21, 95, 92, 20, 21, 65, 66, 41,
+ 68, 26, 27, 28, 70, 90, 15, 32, 34, 3,
+ 10, 90, 71, 38, 35, 91, 41, 6, 43, 7,
+ 72, 46, 7, 25, 7, 7, 94, 177, 175, 166,
+ 235, 56, 57, 58, 56, 260, 11, 12, 13, 21,
+ 65, 66, 12, 68, 26, 27, 28, -1, 22, -1,
+ 32, 223, -1, -1, -1, -1, 38, 236, -1, 41,
+ -1, 43, -1, -1, 46, -1, -1, -1, -1, 94,
+ -1, -1, -1, -1, 56, 57, 58, -1, 3, 4,
+ 5, 6, 7, 65, 66, 10, 68, -1, 3, 4,
+ 5, 6, 7, -1, -1, 10, -1, -1, 73, -1,
+ -1, -1, -1, -1, 29, 80, 81, 82, 83, 84,
+ 85, 86, 94, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, 3, 4, 5, 6, 7, 44,
+ 55, 10, -1, -1, -1, 60, 61, 62, 63, 64,
+ 55, -1, -1, -1, -1, 60, 61, 62, 63, 64,
+ 29, -1, -1, -1, -1, -1, -1, -1, 83, -1,
+ 85, -1, -1, -1, -1, 90, -1, -1, 83, -1,
+ 3, 4, 5, 6, 7, 90, 55, 10, -1, -1,
+ -1, 60, 61, 62, 63, 64, 8, 9, -1, 11,
+ 12, 13, -1, -1, -1, -1, -1, -1, -1, -1,
+ 22, -1, 8, 9, 83, 11, 12, 13, -1, -1,
+ -1, 90, -1, -1, -1, -1, -1, -1, -1, 25,
+ 8, 9, 55, 11, 12, 13, -1, 60, 61, 62,
+ 63, 64, -1, -1, 8, 9, -1, 11, 12, 13,
+ -1, -1, -1, -1, -1, 33, -1, -1, 22, -1,
+ 83, 73, -1, -1, -1, -1, -1, 90, 80, 81,
+ 82, 83, 84, 85, 86, 8, 9, 73, 11, 12,
+ 13, -1, -1, -1, 80, 81, 82, 83, 84, 85,
+ 86, -1, 25, -1, -1, 73, -1, -1, -1, -1,
+ -1, -1, 80, 81, 82, 83, 84, 85, 86, 73,
+ -1, -1, -1, -1, -1, -1, 80, 81, 82, 83,
+ 84, 85, 86, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 73, -1, -1, -1, -1, -1, -1, 80, 81, 82,
+ 83, 84, 85, 86
+};
+
+ /* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
+ symbol of state STATE-NUM. */
+static const yytype_uint8 yystos[] =
+{
+ 0, 14, 97, 159, 7, 0, 89, 90, 91, 18,
+ 7, 153, 154, 16, 17, 79, 152, 53, 153, 155,
+ 156, 157, 158, 89, 54, 69, 19, 157, 7, 7,
+ 7, 21, 26, 27, 28, 32, 38, 41, 43, 46,
+ 56, 57, 58, 65, 66, 68, 94, 98, 99, 103,
+ 116, 117, 118, 122, 123, 124, 125, 126, 127, 128,
+ 132, 133, 134, 135, 136, 137, 138, 139, 144, 148,
+ 150, 151, 18, 89, 52, 3, 4, 5, 6, 7,
+ 10, 55, 60, 61, 62, 63, 64, 83, 90, 100,
+ 101, 100, 29, 85, 100, 108, 109, 110, 7, 39,
+ 7, 78, 149, 30, 47, 49, 146, 7, 7, 7,
+ 67, 67, 7, 20, 98, 89, 40, 116, 89, 31,
+ 111, 121, 89, 89, 111, 121, 89, 89, 89, 89,
+ 89, 89, 89, 89, 89, 89, 89, 89, 89, 89,
+ 89, 116, 100, 88, 100, 88, 100, 100, 8, 9,
+ 11, 12, 13, 22, 73, 80, 81, 82, 83, 84,
+ 85, 86, 90, 25, 90, 39, 93, 30, 15, 149,
+ 42, 149, 149, 50, 147, 39, 90, 90, 44, 100,
+ 89, 59, 59, 91, 100, 100, 100, 100, 100, 99,
+ 5, 100, 100, 100, 100, 100, 100, 100, 100, 107,
+ 99, 85, 7, 106, 108, 105, 149, 100, 7, 119,
+ 120, 90, 48, 7, 104, 106, 92, 102, 107, 45,
+ 23, 24, 129, 130, 131, 91, 93, 20, 91, 93,
+ 31, 93, 111, 33, 80, 93, 7, 140, 141, 7,
+ 90, 91, 93, 91, 7, 99, 100, 129, 20, 100,
+ 25, 7, 149, 32, 112, 100, 100, 119, 152, 91,
+ 93, 51, 91, 95, 92, 22, 21, 41, 70, 113,
+ 25, 90, 142, 140, 149, 99, 15, 34, 115, 99,
+ 3, 10, 143, 90, 71, 35, 20, 91, 6, 7,
+ 145, 72, 7, 25, 91, 93, 36, 37, 114, 7
+};
+
+ /* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */
+static const yytype_uint8 yyr1[] =
+{
+ 0, 96, 97, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 99, 99, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+ 101, 101, 101, 101, 101, 102, 102, 102, 103, 104,
+ 105, 105, 106, 106, 106, 107, 107, 107, 108, 108,
+ 109, 109, 109, 110, 110, 110, 111, 111, 112, 112,
+ 113, 113, 114, 114, 114, 115, 115, 116, 117, 118,
+ 118, 119, 120, 120, 121, 122, 123, 124, 125, 126,
+ 127, 128, 129, 130, 130, 131, 131, 131, 132, 133,
+ 134, 135, 136, 137, 138, 139, 139, 140, 141, 141,
+ 142, 142, 143, 143, 144, 145, 145, 146, 146, 147,
+ 147, 148, 149, 149, 150, 151, 152, 152, 152, 153,
+ 154, 154, 154, 155, 156, 157, 157, 158, 158, 158,
+ 159
+};
+
+ /* YYR2[YYN] -- Number of symbols on the right hand side of rule YYN. */
+static const yytype_uint8 yyr2[] =
+{
+ 0, 2, 2, 1, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 1, 2, 1, 4, 1, 1, 1,
+ 1, 1, 3, 3, 3, 3, 2, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 2, 3, 3,
+ 1, 1, 1, 1, 1, 0, 1, 3, 6, 3,
+ 1, 3, 0, 1, 3, 0, 1, 3, 1, 4,
+ 0, 1, 3, 1, 3, 1, 0, 2, 0, 2,
+ 0, 4, 0, 1, 1, 0, 4, 8, 3, 5,
+ 2, 3, 1, 3, 4, 4, 2, 2, 3, 2,
+ 2, 3, 4, 1, 2, 0, 2, 1, 7, 6,
+ 10, 1, 1, 2, 2, 4, 4, 4, 1, 3,
+ 0, 3, 0, 2, 6, 1, 3, 0, 1, 0,
+ 1, 10, 1, 1, 2, 2, 1, 1, 1, 3,
+ 0, 1, 2, 6, 4, 1, 1, 0, 1, 2,
+ 10
+};
+
+
+#define yyerrok (yyerrstatus = 0)
+#define yyclearin (yychar = YYEMPTY)
+#define YYEMPTY (-2)
+#define YYEOF 0
+
+#define YYACCEPT goto yyacceptlab
+#define YYABORT goto yyabortlab
+#define YYERROR goto yyerrorlab
+
+
+#define YYRECOVERING() (!!yyerrstatus)
+
+#define YYBACKUP(Token, Value) \
+ do \
+ if (yychar == YYEMPTY) \
+ { \
+ yychar = (Token); \
+ yylval = (Value); \
+ YYPOPSTACK (yylen); \
+ yystate = *yyssp; \
+ goto yybackup; \
+ } \
+ else \
+ { \
+ yyerror (YY_("syntax error: cannot back up")); \
+ YYERROR; \
+ } \
+ while (0)
+
+/* Error token number */
+#define YYTERROR 1
+#define YYERRCODE 256
+
+
+
+/* Enable debugging if requested. */
+#if YYDEBUG
+
+# ifndef YYFPRINTF
+# include <stdio.h> /* INFRINGES ON USER NAME SPACE */
+# define YYFPRINTF fprintf
+# endif
+
+# define YYDPRINTF(Args) \
+do { \
+ if (yydebug) \
+ YYFPRINTF Args; \
+} while (0)
+
+/* This macro is provided for backward compatibility. */
+#ifndef YY_LOCATION_PRINT
+# define YY_LOCATION_PRINT(File, Loc) ((void) 0)
+#endif
+
+
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \
+do { \
+ if (yydebug) \
+ { \
+ YYFPRINTF (stderr, "%s ", Title); \
+ yy_symbol_print (stderr, \
+ Type, Value); \
+ YYFPRINTF (stderr, "\n"); \
+ } \
+} while (0)
+
+
+/*-----------------------------------.
+| Print this symbol's value on YYO. |
+`-----------------------------------*/
+
+static void
+yy_symbol_value_print (FILE *yyo, int yytype, YYSTYPE const * const yyvaluep)
+{
+ FILE *yyoutput = yyo;
+ YYUSE (yyoutput);
+ if (!yyvaluep)
+ return;
+# ifdef YYPRINT
+ if (yytype < YYNTOKENS)
+ YYPRINT (yyo, yytoknum[yytype], *yyvaluep);
+# endif
+ YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
+ YYUSE (yytype);
+ YY_IGNORE_MAYBE_UNINITIALIZED_END
+}
+
+
+/*---------------------------.
+| Print this symbol on YYO. |
+`---------------------------*/
+
+static void
+yy_symbol_print (FILE *yyo, int yytype, YYSTYPE const * const yyvaluep)
+{
+ YYFPRINTF (yyo, "%s %s (",
+ yytype < YYNTOKENS ? "token" : "nterm", yytname[yytype]);
+
+ yy_symbol_value_print (yyo, yytype, yyvaluep);
+ YYFPRINTF (yyo, ")");
+}
+
+/*------------------------------------------------------------------.
+| yy_stack_print -- Print the state stack from its BOTTOM up to its |
+| TOP (included). |
+`------------------------------------------------------------------*/
+
+static void
+yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop)
+{
+ YYFPRINTF (stderr, "Stack now");
+ for (; yybottom <= yytop; yybottom++)
+ {
+ int yybot = *yybottom;
+ YYFPRINTF (stderr, " %d", yybot);
+ }
+ YYFPRINTF (stderr, "\n");
+}
+
+# define YY_STACK_PRINT(Bottom, Top) \
+do { \
+ if (yydebug) \
+ yy_stack_print ((Bottom), (Top)); \
+} while (0)
+
+
+/*------------------------------------------------.
+| Report that the YYRULE is going to be reduced. |
+`------------------------------------------------*/
+
+static void
+yy_reduce_print (yytype_int16 *yyssp, YYSTYPE *yyvsp, int yyrule)
+{
+ unsigned long yylno = yyrline[yyrule];
+ int yynrhs = yyr2[yyrule];
+ int yyi;
+ YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n",
+ yyrule - 1, yylno);
+ /* The symbols being reduced. */
+ for (yyi = 0; yyi < yynrhs; yyi++)
+ {
+ YYFPRINTF (stderr, " $%d = ", yyi + 1);
+ yy_symbol_print (stderr,
+ yystos[yyssp[yyi + 1 - yynrhs]],
+ &yyvsp[(yyi + 1) - (yynrhs)]
+ );
+ YYFPRINTF (stderr, "\n");
+ }
+}
+
+# define YY_REDUCE_PRINT(Rule) \
+do { \
+ if (yydebug) \
+ yy_reduce_print (yyssp, yyvsp, Rule); \
+} while (0)
+
+/* Nonzero means print parse trace. It is left uninitialized so that
+ multiple parsers can coexist. */
+int yydebug;
+#else /* !YYDEBUG */
+# define YYDPRINTF(Args)
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
+# define YY_STACK_PRINT(Bottom, Top)
+# define YY_REDUCE_PRINT(Rule)
+#endif /* !YYDEBUG */
+
+
+/* YYINITDEPTH -- initial size of the parser's stacks. */
+#ifndef YYINITDEPTH
+# define YYINITDEPTH 200
+#endif
+
+/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
+ if the built-in stack extension method is used).
+
+ Do not make this value too large; the results are undefined if
+ YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH)
+ evaluated with infinite-precision integer arithmetic. */
+
+#ifndef YYMAXDEPTH
+# define YYMAXDEPTH 10000
+#endif
+
+
+#if YYERROR_VERBOSE
+
+# ifndef yystrlen
+# if defined __GLIBC__ && defined _STRING_H
+# define yystrlen strlen
+# else
+/* Return the length of YYSTR. */
+static YYSIZE_T
+yystrlen (const char *yystr)
+{
+ YYSIZE_T yylen;
+ for (yylen = 0; yystr[yylen]; yylen++)
+ continue;
+ return yylen;
+}
+# endif
+# endif
+
+# ifndef yystpcpy
+# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE
+# define yystpcpy stpcpy
+# else
+/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
+ YYDEST. */
+static char *
+yystpcpy (char *yydest, const char *yysrc)
+{
+ char *yyd = yydest;
+ const char *yys = yysrc;
+
+ while ((*yyd++ = *yys++) != '\0')
+ continue;
+
+ return yyd - 1;
+}
+# endif
+# endif
+
+# ifndef yytnamerr
+/* Copy to YYRES the contents of YYSTR after stripping away unnecessary
+ quotes and backslashes, so that it's suitable for yyerror. The
+ heuristic is that double-quoting is unnecessary unless the string
+ contains an apostrophe, a comma, or backslash (other than
+ backslash-backslash). YYSTR is taken from yytname. If YYRES is
+ null, do not copy; instead, return the length of what the result
+ would have been. */
+static YYSIZE_T
+yytnamerr (char *yyres, const char *yystr)
+{
+ if (*yystr == '"')
+ {
+ YYSIZE_T yyn = 0;
+ char const *yyp = yystr;
+
+ for (;;)
+ switch (*++yyp)
+ {
+ case '\'':
+ case ',':
+ goto do_not_strip_quotes;
+
+ case '\\':
+ if (*++yyp != '\\')
+ goto do_not_strip_quotes;
+ else
+ goto append;
+
+ append:
+ default:
+ if (yyres)
+ yyres[yyn] = *yyp;
+ yyn++;
+ break;
+
+ case '"':
+ if (yyres)
+ yyres[yyn] = '\0';
+ return yyn;
+ }
+ do_not_strip_quotes: ;
+ }
+
+ if (! yyres)
+ return yystrlen (yystr);
+
+ return (YYSIZE_T) (yystpcpy (yyres, yystr) - yyres);
+}
+# endif
+
+/* Copy into *YYMSG, which is of size *YYMSG_ALLOC, an error message
+ about the unexpected token YYTOKEN for the state stack whose top is
+ YYSSP.
+
+ Return 0 if *YYMSG was successfully written. Return 1 if *YYMSG is
+ not large enough to hold the message. In that case, also set
+ *YYMSG_ALLOC to the required number of bytes. Return 2 if the
+ required number of bytes is too large to store. */
+static int
+yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg,
+ yytype_int16 *yyssp, int yytoken)
+{
+ YYSIZE_T yysize0 = yytnamerr (YY_NULLPTR, yytname[yytoken]);
+ YYSIZE_T yysize = yysize0;
+ enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 };
+ /* Internationalized format string. */
+ const char *yyformat = YY_NULLPTR;
+ /* Arguments of yyformat. */
+ char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM];
+ /* Number of reported tokens (one for the "unexpected", one per
+ "expected"). */
+ int yycount = 0;
+
+ /* There are many possibilities here to consider:
+ - If this state is a consistent state with a default action, then
+ the only way this function was invoked is if the default action
+ is an error action. In that case, don't check for expected
+ tokens because there are none.
+ - The only way there can be no lookahead present (in yychar) is if
+ this state is a consistent state with a default action. Thus,
+ detecting the absence of a lookahead is sufficient to determine
+ that there is no unexpected or expected token to report. In that
+ case, just report a simple "syntax error".
+ - Don't assume there isn't a lookahead just because this state is a
+ consistent state with a default action. There might have been a
+ previous inconsistent state, consistent state with a non-default
+ action, or user semantic action that manipulated yychar.
+ - Of course, the expected token list depends on states to have
+ correct lookahead information, and it depends on the parser not
+ to perform extra reductions after fetching a lookahead from the
+ scanner and before detecting a syntax error. Thus, state merging
+ (from LALR or IELR) and default reductions corrupt the expected
+ token list. However, the list is correct for canonical LR with
+ one exception: it will still contain any token that will not be
+ accepted due to an error action in a later state.
+ */
+ if (yytoken != YYEMPTY)
+ {
+ int yyn = yypact[*yyssp];
+ yyarg[yycount++] = yytname[yytoken];
+ if (!yypact_value_is_default (yyn))
+ {
+ /* Start YYX at -YYN if negative to avoid negative indexes in
+ YYCHECK. In other words, skip the first -YYN actions for
+ this state because they are default actions. */
+ int yyxbegin = yyn < 0 ? -yyn : 0;
+ /* Stay within bounds of both yycheck and yytname. */
+ int yychecklim = YYLAST - yyn + 1;
+ int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
+ int yyx;
+
+ for (yyx = yyxbegin; yyx < yyxend; ++yyx)
+ if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR
+ && !yytable_value_is_error (yytable[yyx + yyn]))
+ {
+ if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM)
+ {
+ yycount = 1;
+ yysize = yysize0;
+ break;
+ }
+ yyarg[yycount++] = yytname[yyx];
+ {
+ YYSIZE_T yysize1 = yysize + yytnamerr (YY_NULLPTR, yytname[yyx]);
+ if (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM)
+ yysize = yysize1;
+ else
+ return 2;
+ }
+ }
+ }
+ }
+
+ switch (yycount)
+ {
+# define YYCASE_(N, S) \
+ case N: \
+ yyformat = S; \
+ break
+ default: /* Avoid compiler warnings. */
+ YYCASE_(0, YY_("syntax error"));
+ YYCASE_(1, YY_("syntax error, unexpected %s"));
+ YYCASE_(2, YY_("syntax error, unexpected %s, expecting %s"));
+ YYCASE_(3, YY_("syntax error, unexpected %s, expecting %s or %s"));
+ YYCASE_(4, YY_("syntax error, unexpected %s, expecting %s or %s or %s"));
+ YYCASE_(5, YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s"));
+# undef YYCASE_
+ }
+
+ {
+ YYSIZE_T yysize1 = yysize + yystrlen (yyformat);
+ if (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM)
+ yysize = yysize1;
+ else
+ return 2;
+ }
+
+ if (*yymsg_alloc < yysize)
+ {
+ *yymsg_alloc = 2 * yysize;
+ if (! (yysize <= *yymsg_alloc
+ && *yymsg_alloc <= YYSTACK_ALLOC_MAXIMUM))
+ *yymsg_alloc = YYSTACK_ALLOC_MAXIMUM;
+ return 1;
+ }
+
+ /* Avoid sprintf, as that infringes on the user's name space.
+ Don't have undefined behavior even if the translation
+ produced a string with the wrong number of "%s"s. */
+ {
+ char *yyp = *yymsg;
+ int yyi = 0;
+ while ((*yyp = *yyformat) != '\0')
+ if (*yyp == '%' && yyformat[1] == 's' && yyi < yycount)
+ {
+ yyp += yytnamerr (yyp, yyarg[yyi++]);
+ yyformat += 2;
+ }
+ else
+ {
+ yyp++;
+ yyformat++;
+ }
+ }
+ return 0;
+}
+#endif /* YYERROR_VERBOSE */
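+
+/* For illustration (a sketch, not generated output): with
+   YYERROR_VERBOSE enabled, the messages assembled above take the form
+
+	syntax error, unexpected PARS_ID_TOKEN, expecting ';'
+
+   listing at most four "expecting" candidates; when more tokens would
+   qualify, the code falls back to naming only the unexpected token. */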
+
+/*-----------------------------------------------.
+| Release the memory associated to this symbol. |
+`-----------------------------------------------*/
+
+static void
+yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep)
+{
+ YYUSE (yyvaluep);
+ if (!yymsg)
+ yymsg = "Deleting";
+ YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
+
+ YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
+ YYUSE (yytype);
+ YY_IGNORE_MAYBE_UNINITIALIZED_END
+}
+
+
+
+
+/* The lookahead symbol. */
+static int yychar;
+
+/* The semantic value of the lookahead symbol. */
+YYSTYPE yylval;
+/* Number of syntax errors so far. */
+static int yynerrs;
+
+
+/*----------.
+| yyparse. |
+`----------*/
+
+int
+yyparse (void)
+{
+ int yystate;
+ /* Number of tokens to shift before error messages enabled. */
+ int yyerrstatus;
+
+ /* The stacks and their tools:
+ 'yyss': related to states.
+ 'yyvs': related to semantic values.
+
+ Refer to the stacks through separate pointers, to allow yyoverflow
+ to reallocate them elsewhere. */
+
+ /* The state stack. */
+ yytype_int16 yyssa[YYINITDEPTH];
+ yytype_int16 *yyss;
+ yytype_int16 *yyssp;
+
+ /* The semantic value stack. */
+ YYSTYPE yyvsa[YYINITDEPTH];
+ YYSTYPE *yyvs;
+ YYSTYPE *yyvsp;
+
+ YYSIZE_T yystacksize;
+
+ int yyn;
+ int yyresult;
+ /* Lookahead token as an internal (translated) token number. */
+ int yytoken = 0;
+ /* The variables used to return semantic value and location from the
+ action routines. */
+ YYSTYPE yyval;
+
+#if YYERROR_VERBOSE
+ /* Buffer for error messages, and its allocated size. */
+ char yymsgbuf[128];
+ char *yymsg = yymsgbuf;
+ YYSIZE_T yymsg_alloc = sizeof yymsgbuf;
+#endif
+
+#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N))
+
+ /* The number of symbols on the RHS of the reduced rule.
+ Keep to zero when no symbol should be popped. */
+ int yylen = 0;
+
+ yyssp = yyss = yyssa;
+ yyvsp = yyvs = yyvsa;
+ yystacksize = YYINITDEPTH;
+
+ YYDPRINTF ((stderr, "Starting parse\n"));
+
+ yystate = 0;
+ yyerrstatus = 0;
+ yynerrs = 0;
+ yychar = YYEMPTY; /* Cause a token to be read. */
+ goto yysetstate;
+
+
+/*------------------------------------------------------------.
+| yynewstate -- push a new state, which is found in yystate. |
+`------------------------------------------------------------*/
+yynewstate:
+ /* In all cases, when you get here, the value and location stacks
+ have just been pushed. So pushing a state here evens the stacks. */
+ yyssp++;
+
+
+/*--------------------------------------------------------------------.
+| yysetstate -- set current state (the top of the stack) to yystate. |
+`--------------------------------------------------------------------*/
+yysetstate:
+ YYDPRINTF ((stderr, "Entering state %d\n", yystate));
+ YY_ASSERT (0 <= yystate && yystate < YYNSTATES);
+ *yyssp = (yytype_int16) yystate;
+
+ if (yyss + yystacksize - 1 <= yyssp)
+#if !defined yyoverflow && !defined YYSTACK_RELOCATE
+ goto yyexhaustedlab;
+#else
+ {
+ /* Get the current used size of the three stacks, in elements. */
+ YYSIZE_T yysize = (YYSIZE_T) (yyssp - yyss + 1);
+
+# if defined yyoverflow
+ {
+ /* Give user a chance to reallocate the stack. Use copies of
+ these so that the &'s don't force the real ones into
+ memory. */
+ YYSTYPE *yyvs1 = yyvs;
+ yytype_int16 *yyss1 = yyss;
+
+ /* Each stack pointer address is followed by the size of the
+ data in use in that stack, in bytes. This used to be a
+ conditional around just the two extra args, but that might
+ be undefined if yyoverflow is a macro. */
+ yyoverflow (YY_("memory exhausted"),
+ &yyss1, yysize * sizeof (*yyssp),
+ &yyvs1, yysize * sizeof (*yyvsp),
+ &yystacksize);
+ yyss = yyss1;
+ yyvs = yyvs1;
+ }
+# else /* defined YYSTACK_RELOCATE */
+ /* Extend the stack our own way. */
+ if (YYMAXDEPTH <= yystacksize)
+ goto yyexhaustedlab;
+ yystacksize *= 2;
+ if (YYMAXDEPTH < yystacksize)
+ yystacksize = YYMAXDEPTH;
+
+ {
+ yytype_int16 *yyss1 = yyss;
+ union yyalloc *yyptr =
+ (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
+ if (! yyptr)
+ goto yyexhaustedlab;
+ YYSTACK_RELOCATE (yyss_alloc, yyss);
+ YYSTACK_RELOCATE (yyvs_alloc, yyvs);
+# undef YYSTACK_RELOCATE
+ if (yyss1 != yyssa)
+ YYSTACK_FREE (yyss1);
+ }
+# endif
+
+ yyssp = yyss + yysize - 1;
+ yyvsp = yyvs + yysize - 1;
+
+ YYDPRINTF ((stderr, "Stack size increased to %lu\n",
+ (unsigned long) yystacksize));
+
+ if (yyss + yystacksize - 1 <= yyssp)
+ YYABORT;
+ }
+#endif /* !defined yyoverflow && !defined YYSTACK_RELOCATE */
+
+ if (yystate == YYFINAL)
+ YYACCEPT;
+
+ goto yybackup;
+
+
+/*-----------.
+| yybackup. |
+`-----------*/
+yybackup:
+ /* Do appropriate processing given the current state. Read a
+ lookahead token if we need one and don't already have one. */
+
+ /* First try to decide what to do without reference to lookahead token. */
+ yyn = yypact[yystate];
+ if (yypact_value_is_default (yyn))
+ goto yydefault;
+
+ /* Not known => get a lookahead token if don't already have one. */
+
+ /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol. */
+ if (yychar == YYEMPTY)
+ {
+ YYDPRINTF ((stderr, "Reading a token: "));
+ yychar = yylex ();
+ }
+
+ if (yychar <= YYEOF)
+ {
+ yychar = yytoken = YYEOF;
+ YYDPRINTF ((stderr, "Now at end of input.\n"));
+ }
+ else
+ {
+ yytoken = YYTRANSLATE (yychar);
+ YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
+ }
+
+ /* If the proper action on seeing token YYTOKEN is to reduce or to
+ detect an error, take that action. */
+ yyn += yytoken;
+ if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
+ goto yydefault;
+ yyn = yytable[yyn];
+ if (yyn <= 0)
+ {
+ if (yytable_value_is_error (yyn))
+ goto yyerrlab;
+ yyn = -yyn;
+ goto yyreduce;
+ }
+
+ /* Count tokens shifted since error; after three, turn off error
+ status. */
+ if (yyerrstatus)
+ yyerrstatus--;
+
+ /* Shift the lookahead token. */
+ YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
+
+ /* Discard the shifted token. */
+ yychar = YYEMPTY;
+
+ yystate = yyn;
+ YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
+ *++yyvsp = yylval;
+ YY_IGNORE_MAYBE_UNINITIALIZED_END
+ goto yynewstate;
+
+
+/*-----------------------------------------------------------.
+| yydefault -- do the default action for the current state. |
+`-----------------------------------------------------------*/
+yydefault:
+ yyn = yydefact[yystate];
+ if (yyn == 0)
+ goto yyerrlab;
+ goto yyreduce;
+
+
+/*-----------------------------.
+| yyreduce -- do a reduction. |
+`-----------------------------*/
+yyreduce:
+ /* yyn is the number of a rule to reduce with. */
+ yylen = yyr2[yyn];
+
+ /* If YYLEN is nonzero, implement the default value of the action:
+ '$$ = $1'.
+
+ Otherwise, the following line sets YYVAL to garbage.
+ This behavior is undocumented and Bison
+ users should not rely upon it. Assigning to YYVAL
+ unconditionally makes the parser a bit smaller, and it avoids a
+ GCC warning that YYVAL may be used uninitialized. */
+ yyval = yyvsp[1-yylen];
+
+
+ YY_REDUCE_PRINT (yyn);
+ switch (yyn)
+ {
+ case 23:
+#line 166 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 1616 "pars0grm.cc"
+ break;
+
+ case 24:
+#line 168 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-1], yyvsp[0]); }
+#line 1622 "pars0grm.cc"
+ break;
+
+ case 25:
+#line 172 "pars0grm.y"
+ { yyval = yyvsp[0];}
+#line 1628 "pars0grm.cc"
+ break;
+
+ case 26:
+#line 174 "pars0grm.y"
+ { yyval = pars_func(yyvsp[-3], yyvsp[-1]); }
+#line 1634 "pars0grm.cc"
+ break;
+
+ case 27:
+#line 175 "pars0grm.y"
+ { yyval = yyvsp[0];}
+#line 1640 "pars0grm.cc"
+ break;
+
+ case 28:
+#line 176 "pars0grm.y"
+ { yyval = yyvsp[0];}
+#line 1646 "pars0grm.cc"
+ break;
+
+ case 29:
+#line 177 "pars0grm.y"
+ { yyval = yyvsp[0];}
+#line 1652 "pars0grm.cc"
+ break;
+
+ case 30:
+#line 178 "pars0grm.y"
+ { yyval = yyvsp[0];}
+#line 1658 "pars0grm.cc"
+ break;
+
+ case 31:
+#line 179 "pars0grm.y"
+ { yyval = yyvsp[0];}
+#line 1664 "pars0grm.cc"
+ break;
+
+ case 32:
+#line 180 "pars0grm.y"
+ { yyval = pars_op('+', yyvsp[-2], yyvsp[0]); }
+#line 1670 "pars0grm.cc"
+ break;
+
+ case 33:
+#line 181 "pars0grm.y"
+ { yyval = pars_op('-', yyvsp[-2], yyvsp[0]); }
+#line 1676 "pars0grm.cc"
+ break;
+
+ case 34:
+#line 182 "pars0grm.y"
+ { yyval = pars_op('*', yyvsp[-2], yyvsp[0]); }
+#line 1682 "pars0grm.cc"
+ break;
+
+ case 35:
+#line 183 "pars0grm.y"
+ { yyval = pars_op('/', yyvsp[-2], yyvsp[0]); }
+#line 1688 "pars0grm.cc"
+ break;
+
+ case 36:
+#line 184 "pars0grm.y"
+ { yyval = pars_op('-', yyvsp[0], NULL); }
+#line 1694 "pars0grm.cc"
+ break;
+
+ case 37:
+#line 185 "pars0grm.y"
+ { yyval = yyvsp[-1]; }
+#line 1700 "pars0grm.cc"
+ break;
+
+ case 38:
+#line 186 "pars0grm.y"
+ { yyval = pars_op('=', yyvsp[-2], yyvsp[0]); }
+#line 1706 "pars0grm.cc"
+ break;
+
+ case 39:
+#line 188 "pars0grm.y"
+ { yyval = pars_op(PARS_LIKE_TOKEN, yyvsp[-2], yyvsp[0]); }
+#line 1712 "pars0grm.cc"
+ break;
+
+ case 40:
+#line 189 "pars0grm.y"
+ { yyval = pars_op('<', yyvsp[-2], yyvsp[0]); }
+#line 1718 "pars0grm.cc"
+ break;
+
+ case 41:
+#line 190 "pars0grm.y"
+ { yyval = pars_op('>', yyvsp[-2], yyvsp[0]); }
+#line 1724 "pars0grm.cc"
+ break;
+
+ case 42:
+#line 191 "pars0grm.y"
+ { yyval = pars_op(PARS_GE_TOKEN, yyvsp[-2], yyvsp[0]); }
+#line 1730 "pars0grm.cc"
+ break;
+
+ case 43:
+#line 192 "pars0grm.y"
+ { yyval = pars_op(PARS_LE_TOKEN, yyvsp[-2], yyvsp[0]); }
+#line 1736 "pars0grm.cc"
+ break;
+
+ case 44:
+#line 193 "pars0grm.y"
+ { yyval = pars_op(PARS_NE_TOKEN, yyvsp[-2], yyvsp[0]); }
+#line 1742 "pars0grm.cc"
+ break;
+
+ case 45:
+#line 194 "pars0grm.y"
+ { yyval = pars_op(PARS_AND_TOKEN, yyvsp[-2], yyvsp[0]); }
+#line 1748 "pars0grm.cc"
+ break;
+
+ case 46:
+#line 195 "pars0grm.y"
+ { yyval = pars_op(PARS_OR_TOKEN, yyvsp[-2], yyvsp[0]); }
+#line 1754 "pars0grm.cc"
+ break;
+
+ case 47:
+#line 196 "pars0grm.y"
+ { yyval = pars_op(PARS_NOT_TOKEN, yyvsp[0], NULL); }
+#line 1760 "pars0grm.cc"
+ break;
+
+ case 48:
+#line 198 "pars0grm.y"
+ { yyval = pars_op(PARS_NOTFOUND_TOKEN, yyvsp[-2], NULL); }
+#line 1766 "pars0grm.cc"
+ break;
+
+ case 49:
+#line 200 "pars0grm.y"
+ { yyval = pars_op(PARS_NOTFOUND_TOKEN, yyvsp[-2], NULL); }
+#line 1772 "pars0grm.cc"
+ break;
+
+ case 50:
+#line 204 "pars0grm.y"
+ { yyval = &pars_to_binary_token; }
+#line 1778 "pars0grm.cc"
+ break;
+
+ case 51:
+#line 205 "pars0grm.y"
+ { yyval = &pars_substr_token; }
+#line 1784 "pars0grm.cc"
+ break;
+
+ case 52:
+#line 206 "pars0grm.y"
+ { yyval = &pars_concat_token; }
+#line 1790 "pars0grm.cc"
+ break;
+
+ case 53:
+#line 207 "pars0grm.y"
+ { yyval = &pars_instr_token; }
+#line 1796 "pars0grm.cc"
+ break;
+
+ case 54:
+#line 208 "pars0grm.y"
+ { yyval = &pars_length_token; }
+#line 1802 "pars0grm.cc"
+ break;
+
+ case 58:
+#line 219 "pars0grm.y"
+ { yyval = pars_stored_procedure_call(
+ static_cast<sym_node_t*>(yyvsp[-4])); }
+#line 1809 "pars0grm.cc"
+ break;
+
+ case 59:
+#line 224 "pars0grm.y"
+ { yyval = yyvsp[-2]; }
+#line 1815 "pars0grm.cc"
+ break;
+
+ case 60:
+#line 228 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 1821 "pars0grm.cc"
+ break;
+
+ case 61:
+#line 230 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 1827 "pars0grm.cc"
+ break;
+
+ case 62:
+#line 234 "pars0grm.y"
+ { yyval = NULL; }
+#line 1833 "pars0grm.cc"
+ break;
+
+ case 63:
+#line 235 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 1839 "pars0grm.cc"
+ break;
+
+ case 64:
+#line 237 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 1845 "pars0grm.cc"
+ break;
+
+ case 65:
+#line 241 "pars0grm.y"
+ { yyval = NULL; }
+#line 1851 "pars0grm.cc"
+ break;
+
+ case 66:
+#line 242 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]);}
+#line 1857 "pars0grm.cc"
+ break;
+
+ case 67:
+#line 243 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 1863 "pars0grm.cc"
+ break;
+
+ case 68:
+#line 247 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 1869 "pars0grm.cc"
+ break;
+
+ case 69:
+#line 249 "pars0grm.y"
+ { yyval = pars_func(&pars_count_token,
+ que_node_list_add_last(NULL,
+ sym_tab_add_int_lit(
+ pars_sym_tab_global, 1))); }
+#line 1878 "pars0grm.cc"
+ break;
+
+ case 70:
+#line 256 "pars0grm.y"
+ { yyval = NULL; }
+#line 1884 "pars0grm.cc"
+ break;
+
+ case 71:
+#line 257 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 1890 "pars0grm.cc"
+ break;
+
+ case 72:
+#line 259 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 1896 "pars0grm.cc"
+ break;
+
+ case 73:
+#line 263 "pars0grm.y"
+ { yyval = pars_select_list(&pars_star_denoter,
+ NULL); }
+#line 1903 "pars0grm.cc"
+ break;
+
+ case 74:
+#line 266 "pars0grm.y"
+ { yyval = pars_select_list(
+ yyvsp[-2], static_cast<sym_node_t*>(yyvsp[0])); }
+#line 1910 "pars0grm.cc"
+ break;
+
+ case 75:
+#line 268 "pars0grm.y"
+ { yyval = pars_select_list(yyvsp[0], NULL); }
+#line 1916 "pars0grm.cc"
+ break;
+
+ case 76:
+#line 272 "pars0grm.y"
+ { yyval = NULL; }
+#line 1922 "pars0grm.cc"
+ break;
+
+ case 77:
+#line 273 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 1928 "pars0grm.cc"
+ break;
+
+ case 78:
+#line 277 "pars0grm.y"
+ { yyval = NULL; }
+#line 1934 "pars0grm.cc"
+ break;
+
+ case 79:
+#line 279 "pars0grm.y"
+ { yyval = &pars_update_token; }
+#line 1940 "pars0grm.cc"
+ break;
+
+ case 80:
+#line 283 "pars0grm.y"
+ { yyval = NULL; }
+#line 1946 "pars0grm.cc"
+ break;
+
+ case 81:
+#line 285 "pars0grm.y"
+ { yyval = &pars_share_token; }
+#line 1952 "pars0grm.cc"
+ break;
+
+ case 82:
+#line 289 "pars0grm.y"
+ { yyval = &pars_asc_token; }
+#line 1958 "pars0grm.cc"
+ break;
+
+ case 83:
+#line 290 "pars0grm.y"
+ { yyval = &pars_asc_token; }
+#line 1964 "pars0grm.cc"
+ break;
+
+ case 84:
+#line 291 "pars0grm.y"
+ { yyval = &pars_desc_token; }
+#line 1970 "pars0grm.cc"
+ break;
+
+ case 85:
+#line 295 "pars0grm.y"
+ { yyval = NULL; }
+#line 1976 "pars0grm.cc"
+ break;
+
+ case 86:
+#line 297 "pars0grm.y"
+ { yyval = pars_order_by(
+ static_cast<sym_node_t*>(yyvsp[-1]),
+ static_cast<pars_res_word_t*>(yyvsp[0])); }
+#line 1984 "pars0grm.cc"
+ break;
+
+ case 87:
+#line 308 "pars0grm.y"
+ { yyval = pars_select_statement(
+ static_cast<sel_node_t*>(yyvsp[-6]),
+ static_cast<sym_node_t*>(yyvsp[-4]),
+ static_cast<que_node_t*>(yyvsp[-3]),
+ static_cast<pars_res_word_t*>(yyvsp[-2]),
+ static_cast<pars_res_word_t*>(yyvsp[-1]),
+ static_cast<order_node_t*>(yyvsp[0])); }
+#line 1996 "pars0grm.cc"
+ break;
+
+ case 88:
+#line 319 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 2002 "pars0grm.cc"
+ break;
+
+ case 89:
+#line 324 "pars0grm.y"
+ { yyval = pars_insert_statement(
+ static_cast<sym_node_t*>(yyvsp[-4]), yyvsp[-1], NULL); }
+#line 2009 "pars0grm.cc"
+ break;
+
+ case 90:
+#line 327 "pars0grm.y"
+ { yyval = pars_insert_statement(
+ static_cast<sym_node_t*>(yyvsp[-1]),
+ NULL,
+ static_cast<sel_node_t*>(yyvsp[0])); }
+#line 2018 "pars0grm.cc"
+ break;
+
+ case 91:
+#line 334 "pars0grm.y"
+ { yyval = pars_column_assignment(
+ static_cast<sym_node_t*>(yyvsp[-2]),
+ static_cast<que_node_t*>(yyvsp[0])); }
+#line 2026 "pars0grm.cc"
+ break;
+
+ case 92:
+#line 340 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 2032 "pars0grm.cc"
+ break;
+
+ case 93:
+#line 342 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 2038 "pars0grm.cc"
+ break;
+
+ case 94:
+#line 348 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 2044 "pars0grm.cc"
+ break;
+
+ case 95:
+#line 354 "pars0grm.y"
+ { yyval = pars_update_statement_start(
+ FALSE,
+ static_cast<sym_node_t*>(yyvsp[-2]),
+ static_cast<col_assign_node_t*>(yyvsp[0])); }
+#line 2053 "pars0grm.cc"
+ break;
+
+ case 96:
+#line 362 "pars0grm.y"
+ { yyval = pars_update_statement(
+ static_cast<upd_node_t*>(yyvsp[-1]),
+ NULL,
+ static_cast<que_node_t*>(yyvsp[0])); }
+#line 2062 "pars0grm.cc"
+ break;
+
+ case 97:
+#line 370 "pars0grm.y"
+ { yyval = pars_update_statement(
+ static_cast<upd_node_t*>(yyvsp[-1]),
+ static_cast<sym_node_t*>(yyvsp[0]),
+ NULL); }
+#line 2071 "pars0grm.cc"
+ break;
+
+ case 98:
+#line 378 "pars0grm.y"
+ { yyval = pars_update_statement_start(
+ TRUE,
+ static_cast<sym_node_t*>(yyvsp[0]), NULL); }
+#line 2079 "pars0grm.cc"
+ break;
+
+ case 99:
+#line 385 "pars0grm.y"
+ { yyval = pars_update_statement(
+ static_cast<upd_node_t*>(yyvsp[-1]),
+ NULL,
+ static_cast<que_node_t*>(yyvsp[0])); }
+#line 2088 "pars0grm.cc"
+ break;
+
+ case 100:
+#line 393 "pars0grm.y"
+ { yyval = pars_update_statement(
+ static_cast<upd_node_t*>(yyvsp[-1]),
+ static_cast<sym_node_t*>(yyvsp[0]),
+ NULL); }
+#line 2097 "pars0grm.cc"
+ break;
+
+ case 101:
+#line 401 "pars0grm.y"
+ { yyval = pars_assignment_statement(
+ static_cast<sym_node_t*>(yyvsp[-2]),
+ static_cast<que_node_t*>(yyvsp[0])); }
+#line 2105 "pars0grm.cc"
+ break;
+
+ case 102:
+#line 409 "pars0grm.y"
+ { yyval = pars_elsif_element(yyvsp[-2], yyvsp[0]); }
+#line 2111 "pars0grm.cc"
+ break;
+
+ case 103:
+#line 413 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 2117 "pars0grm.cc"
+ break;
+
+ case 104:
+#line 415 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-1], yyvsp[0]); }
+#line 2123 "pars0grm.cc"
+ break;
+
+ case 105:
+#line 419 "pars0grm.y"
+ { yyval = NULL; }
+#line 2129 "pars0grm.cc"
+ break;
+
+ case 106:
+#line 421 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 2135 "pars0grm.cc"
+ break;
+
+ case 107:
+#line 422 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 2141 "pars0grm.cc"
+ break;
+
+ case 108:
+#line 429 "pars0grm.y"
+ { yyval = pars_if_statement(yyvsp[-5], yyvsp[-3], yyvsp[-2]); }
+#line 2147 "pars0grm.cc"
+ break;
+
+ case 109:
+#line 435 "pars0grm.y"
+ { yyval = pars_while_statement(yyvsp[-4], yyvsp[-2]); }
+#line 2153 "pars0grm.cc"
+ break;
+
+ case 110:
+#line 443 "pars0grm.y"
+ { yyval = pars_for_statement(
+ static_cast<sym_node_t*>(yyvsp[-8]),
+ yyvsp[-6], yyvsp[-4], yyvsp[-2]); }
+#line 2161 "pars0grm.cc"
+ break;
+
+ case 111:
+#line 449 "pars0grm.y"
+ { yyval = pars_exit_statement(); }
+#line 2167 "pars0grm.cc"
+ break;
+
+ case 112:
+#line 453 "pars0grm.y"
+ { yyval = pars_return_statement(); }
+#line 2173 "pars0grm.cc"
+ break;
+
+ case 113:
+#line 458 "pars0grm.y"
+ { yyval = pars_open_statement(
+ ROW_SEL_OPEN_CURSOR,
+ static_cast<sym_node_t*>(yyvsp[0])); }
+#line 2181 "pars0grm.cc"
+ break;
+
+ case 114:
+#line 465 "pars0grm.y"
+ { yyval = pars_open_statement(
+ ROW_SEL_CLOSE_CURSOR,
+ static_cast<sym_node_t*>(yyvsp[0])); }
+#line 2189 "pars0grm.cc"
+ break;
+
+ case 115:
+#line 472 "pars0grm.y"
+ { yyval = pars_fetch_statement(
+ static_cast<sym_node_t*>(yyvsp[-2]),
+ static_cast<sym_node_t*>(yyvsp[0]), NULL); }
+#line 2197 "pars0grm.cc"
+ break;
+
+ case 116:
+#line 476 "pars0grm.y"
+ { yyval = pars_fetch_statement(
+ static_cast<sym_node_t*>(yyvsp[-2]),
+ NULL,
+ static_cast<sym_node_t*>(yyvsp[0])); }
+#line 2206 "pars0grm.cc"
+ break;
+
+ case 117:
+#line 484 "pars0grm.y"
+ { yyval = pars_column_def(
+ static_cast<sym_node_t*>(yyvsp[-3]),
+ static_cast<pars_res_word_t*>(yyvsp[-2]),
+ static_cast<sym_node_t*>(yyvsp[-1]),
+ yyvsp[0]); }
+#line 2216 "pars0grm.cc"
+ break;
+
+ case 118:
+#line 492 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 2222 "pars0grm.cc"
+ break;
+
+ case 119:
+#line 494 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 2228 "pars0grm.cc"
+ break;
+
+ case 120:
+#line 498 "pars0grm.y"
+ { yyval = NULL; }
+#line 2234 "pars0grm.cc"
+ break;
+
+ case 121:
+#line 500 "pars0grm.y"
+ { yyval = yyvsp[-1]; }
+#line 2240 "pars0grm.cc"
+ break;
+
+ case 122:
+#line 504 "pars0grm.y"
+ { yyval = NULL; }
+#line 2246 "pars0grm.cc"
+ break;
+
+ case 123:
+#line 506 "pars0grm.y"
+ { yyval = &pars_int_token;
+ /* pass any non-NULL pointer */ }
+#line 2253 "pars0grm.cc"
+ break;
+
+ case 124:
+#line 513 "pars0grm.y"
+ { yyval = pars_create_table(
+ static_cast<sym_node_t*>(yyvsp[-3]),
+ static_cast<sym_node_t*>(yyvsp[-1])); }
+#line 2261 "pars0grm.cc"
+ break;
+
+ case 125:
+#line 519 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 2267 "pars0grm.cc"
+ break;
+
+ case 126:
+#line 521 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 2273 "pars0grm.cc"
+ break;
+
+ case 127:
+#line 525 "pars0grm.y"
+ { yyval = NULL; }
+#line 2279 "pars0grm.cc"
+ break;
+
+ case 128:
+#line 526 "pars0grm.y"
+ { yyval = &pars_unique_token; }
+#line 2285 "pars0grm.cc"
+ break;
+
+ case 129:
+#line 530 "pars0grm.y"
+ { yyval = NULL; }
+#line 2291 "pars0grm.cc"
+ break;
+
+ case 130:
+#line 531 "pars0grm.y"
+ { yyval = &pars_clustered_token; }
+#line 2297 "pars0grm.cc"
+ break;
+
+ case 131:
+#line 540 "pars0grm.y"
+ { yyval = pars_create_index(
+ static_cast<pars_res_word_t*>(yyvsp[-8]),
+ static_cast<pars_res_word_t*>(yyvsp[-7]),
+ static_cast<sym_node_t*>(yyvsp[-5]),
+ static_cast<sym_node_t*>(yyvsp[-3]),
+ static_cast<sym_node_t*>(yyvsp[-1])); }
+#line 2308 "pars0grm.cc"
+ break;
+
+ case 132:
+#line 549 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 2314 "pars0grm.cc"
+ break;
+
+ case 133:
+#line 550 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 2320 "pars0grm.cc"
+ break;
+
+ case 134:
+#line 555 "pars0grm.y"
+ { yyval = pars_commit_statement(); }
+#line 2326 "pars0grm.cc"
+ break;
+
+ case 135:
+#line 560 "pars0grm.y"
+ { yyval = pars_rollback_statement(); }
+#line 2332 "pars0grm.cc"
+ break;
+
+ case 136:
+#line 564 "pars0grm.y"
+ { yyval = &pars_int_token; }
+#line 2338 "pars0grm.cc"
+ break;
+
+ case 137:
+#line 565 "pars0grm.y"
+ { yyval = &pars_bigint_token; }
+#line 2344 "pars0grm.cc"
+ break;
+
+ case 138:
+#line 566 "pars0grm.y"
+ { yyval = &pars_char_token; }
+#line 2350 "pars0grm.cc"
+ break;
+
+ case 139:
+#line 571 "pars0grm.y"
+ { yyval = pars_variable_declaration(
+ static_cast<sym_node_t*>(yyvsp[-2]),
+ static_cast<pars_res_word_t*>(yyvsp[-1])); }
+#line 2358 "pars0grm.cc"
+ break;
+
+ case 143:
+#line 585 "pars0grm.y"
+ { yyval = pars_cursor_declaration(
+ static_cast<sym_node_t*>(yyvsp[-3]),
+ static_cast<sel_node_t*>(yyvsp[-1])); }
+#line 2366 "pars0grm.cc"
+ break;
+
+ case 144:
+#line 592 "pars0grm.y"
+ { yyval = pars_function_declaration(
+ static_cast<sym_node_t*>(yyvsp[-1])); }
+#line 2373 "pars0grm.cc"
+ break;
+
+ case 150:
+#line 614 "pars0grm.y"
+ { yyval = pars_procedure_definition(
+ static_cast<sym_node_t*>(yyvsp[-8]), yyvsp[-1]); }
+#line 2380 "pars0grm.cc"
+ break;
+
+
+#line 2384 "pars0grm.cc"
+
+ default: break;
+ }
+ /* User semantic actions sometimes alter yychar, and that requires
+ that yytoken be updated with the new translation. We take the
+ approach of translating immediately before every use of yytoken.
+ One alternative is translating here after every semantic action,
+ but that translation would be missed if the semantic action invokes
+ YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or
+ if it invokes YYBACKUP. In the case of YYABORT or YYACCEPT, an
+ incorrect destructor might then be invoked immediately. In the
+ case of YYERROR or YYBACKUP, subsequent parser actions might lead
+ to an incorrect destructor call or verbose syntax error message
+ before the lookahead is translated. */
+ YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc);
+
+ YYPOPSTACK (yylen);
+ yylen = 0;
+ YY_STACK_PRINT (yyss, yyssp);
+
+ *++yyvsp = yyval;
+
+ /* Now 'shift' the result of the reduction. Determine what state
+ that goes to, based on the state we popped back to and the rule
+ number reduced by. */
+ {
+ const int yylhs = yyr1[yyn] - YYNTOKENS;
+ const int yyi = yypgoto[yylhs] + *yyssp;
+ yystate = (0 <= yyi && yyi <= YYLAST && yycheck[yyi] == *yyssp
+ ? yytable[yyi]
+ : yydefgoto[yylhs]);
+ }
+
+ goto yynewstate;
+
+
+/*--------------------------------------.
+| yyerrlab -- here on detecting error. |
+`--------------------------------------*/
+yyerrlab:
+ /* Make sure we have latest lookahead translation. See comments at
+ user semantic actions for why this is necessary. */
+ yytoken = yychar == YYEMPTY ? YYEMPTY : YYTRANSLATE (yychar);
+
+ /* If not already recovering from an error, report this error. */
+ if (!yyerrstatus)
+ {
+ ++yynerrs;
+#if ! YYERROR_VERBOSE
+ yyerror (YY_("syntax error"));
+#else
+# define YYSYNTAX_ERROR yysyntax_error (&yymsg_alloc, &yymsg, \
+ yyssp, yytoken)
+ {
+ char const *yymsgp = YY_("syntax error");
+ int yysyntax_error_status;
+ yysyntax_error_status = YYSYNTAX_ERROR;
+ if (yysyntax_error_status == 0)
+ yymsgp = yymsg;
+ else if (yysyntax_error_status == 1)
+ {
+ if (yymsg != yymsgbuf)
+ YYSTACK_FREE (yymsg);
+ yymsg = (char *) YYSTACK_ALLOC (yymsg_alloc);
+ if (!yymsg)
+ {
+ yymsg = yymsgbuf;
+ yymsg_alloc = sizeof yymsgbuf;
+ yysyntax_error_status = 2;
+ }
+ else
+ {
+ yysyntax_error_status = YYSYNTAX_ERROR;
+ yymsgp = yymsg;
+ }
+ }
+ yyerror (yymsgp);
+ if (yysyntax_error_status == 2)
+ goto yyexhaustedlab;
+ }
+# undef YYSYNTAX_ERROR
+#endif
+ }
+
+
+
+ if (yyerrstatus == 3)
+ {
+ /* If just tried and failed to reuse lookahead token after an
+ error, discard it. */
+
+ if (yychar <= YYEOF)
+ {
+ /* Return failure if at end of input. */
+ if (yychar == YYEOF)
+ YYABORT;
+ }
+ else
+ {
+ yydestruct ("Error: discarding",
+ yytoken, &yylval);
+ yychar = YYEMPTY;
+ }
+ }
+
+ /* Else will try to reuse lookahead token after shifting the error
+ token. */
+ goto yyerrlab1;
+
+
+/*---------------------------------------------------.
+| yyerrorlab -- error raised explicitly by YYERROR. |
+`---------------------------------------------------*/
+yyerrorlab:
+ /* Pacify compilers when the user code never invokes YYERROR and the
+ label yyerrorlab therefore never appears in user code. */
+ if (0)
+ YYERROR;
+
+ /* Do not reclaim the symbols of the rule whose action triggered
+ this YYERROR. */
+ YYPOPSTACK (yylen);
+ yylen = 0;
+ YY_STACK_PRINT (yyss, yyssp);
+ yystate = *yyssp;
+ goto yyerrlab1;
+
+
+/*-------------------------------------------------------------.
+| yyerrlab1 -- common code for both syntax error and YYERROR. |
+`-------------------------------------------------------------*/
+yyerrlab1:
+ yyerrstatus = 3; /* Each real token shifted decrements this. */
+
+ for (;;)
+ {
+ yyn = yypact[yystate];
+ if (!yypact_value_is_default (yyn))
+ {
+ yyn += YYTERROR;
+ if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
+ {
+ yyn = yytable[yyn];
+ if (0 < yyn)
+ break;
+ }
+ }
+
+ /* Pop the current state because it cannot handle the error token. */
+ if (yyssp == yyss)
+ YYABORT;
+
+
+ yydestruct ("Error: popping",
+ yystos[yystate], yyvsp);
+ YYPOPSTACK (1);
+ yystate = *yyssp;
+ YY_STACK_PRINT (yyss, yyssp);
+ }
+
+ YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
+ *++yyvsp = yylval;
+ YY_IGNORE_MAYBE_UNINITIALIZED_END
+
+
+ /* Shift the error token. */
+ YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
+
+ yystate = yyn;
+ goto yynewstate;
+
+
+/*-------------------------------------.
+| yyacceptlab -- YYACCEPT comes here. |
+`-------------------------------------*/
+yyacceptlab:
+ yyresult = 0;
+ goto yyreturn;
+
+
+/*-----------------------------------.
+| yyabortlab -- YYABORT comes here. |
+`-----------------------------------*/
+yyabortlab:
+ yyresult = 1;
+ goto yyreturn;
+
+
+#if !defined yyoverflow || YYERROR_VERBOSE
+/*-------------------------------------------------.
+| yyexhaustedlab -- memory exhaustion comes here. |
+`-------------------------------------------------*/
+yyexhaustedlab:
+ yyerror (YY_("memory exhausted"));
+ yyresult = 2;
+ /* Fall through. */
+#endif
+
+
+/*-----------------------------------------------------.
+| yyreturn -- parsing is finished, return the result. |
+`-----------------------------------------------------*/
+yyreturn:
+ if (yychar != YYEMPTY)
+ {
+ /* Make sure we have latest lookahead translation. See comments at
+ user semantic actions for why this is necessary. */
+ yytoken = YYTRANSLATE (yychar);
+ yydestruct ("Cleanup: discarding lookahead",
+ yytoken, &yylval);
+ }
+ /* Do not reclaim the symbols of the rule whose action triggered
+ this YYABORT or YYACCEPT. */
+ YYPOPSTACK (yylen);
+ YY_STACK_PRINT (yyss, yyssp);
+ while (yyssp != yyss)
+ {
+ yydestruct ("Cleanup: popping",
+ yystos[*yyssp], yyvsp);
+ YYPOPSTACK (1);
+ }
+#ifndef yyoverflow
+ if (yyss != yyssa)
+ YYSTACK_FREE (yyss);
+#endif
+#if YYERROR_VERBOSE
+ if (yymsg != yymsgbuf)
+ YYSTACK_FREE (yymsg);
+#endif
+ return yyresult;
+}
+#line 618 "pars0grm.y"
+
diff --git a/storage/innobase/pars/pars0grm.y b/storage/innobase/pars/pars0grm.y
new file mode 100644
index 00000000..625ed41b
--- /dev/null
+++ b/storage/innobase/pars/pars0grm.y
@@ -0,0 +1,618 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************
+SQL parser: input file for the GNU Bison parser generator
+
+See pars0lex.l for instructions on how to generate the C files for
+the InnoDB parser.
+
+Created 12/14/1997 Heikki Tuuri
+*******************************************************/
+
+%{
+/* The value of each semantic attribute is a pointer to a query tree
+node (que_node_t). */
+
+#include "univ.i"
+#include <math.h>
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "que0types.h"
+#include "que0que.h"
+#include "row0sel.h"
+
+#define YYSTYPE que_node_t*
+
+/* #define __STDC__ */
+int
+yylex(void);
+%}
+
+%token PARS_INT_LIT
+%token PARS_FLOAT_LIT
+%token PARS_STR_LIT
+%token PARS_NULL_LIT
+%token PARS_ID_TOKEN
+%token PARS_AND_TOKEN
+%token PARS_OR_TOKEN
+%token PARS_NOT_TOKEN
+%token PARS_GE_TOKEN
+%token PARS_LE_TOKEN
+%token PARS_NE_TOKEN
+%token PARS_PROCEDURE_TOKEN
+%token PARS_IN_TOKEN
+%token PARS_INT_TOKEN
+%token PARS_CHAR_TOKEN
+%token PARS_IS_TOKEN
+%token PARS_BEGIN_TOKEN
+%token PARS_END_TOKEN
+%token PARS_IF_TOKEN
+%token PARS_THEN_TOKEN
+%token PARS_ELSE_TOKEN
+%token PARS_ELSIF_TOKEN
+%token PARS_LOOP_TOKEN
+%token PARS_WHILE_TOKEN
+%token PARS_RETURN_TOKEN
+%token PARS_SELECT_TOKEN
+%token PARS_COUNT_TOKEN
+%token PARS_FROM_TOKEN
+%token PARS_WHERE_TOKEN
+%token PARS_FOR_TOKEN
+%token PARS_DDOT_TOKEN
+%token PARS_ORDER_TOKEN
+%token PARS_BY_TOKEN
+%token PARS_ASC_TOKEN
+%token PARS_DESC_TOKEN
+%token PARS_INSERT_TOKEN
+%token PARS_INTO_TOKEN
+%token PARS_VALUES_TOKEN
+%token PARS_UPDATE_TOKEN
+%token PARS_SET_TOKEN
+%token PARS_DELETE_TOKEN
+%token PARS_CURRENT_TOKEN
+%token PARS_OF_TOKEN
+%token PARS_CREATE_TOKEN
+%token PARS_TABLE_TOKEN
+%token PARS_INDEX_TOKEN
+%token PARS_UNIQUE_TOKEN
+%token PARS_CLUSTERED_TOKEN
+%token PARS_ON_TOKEN
+%token PARS_ASSIGN_TOKEN
+%token PARS_DECLARE_TOKEN
+%token PARS_CURSOR_TOKEN
+%token PARS_SQL_TOKEN
+%token PARS_OPEN_TOKEN
+%token PARS_FETCH_TOKEN
+%token PARS_CLOSE_TOKEN
+%token PARS_NOTFOUND_TOKEN
+%token PARS_TO_BINARY_TOKEN
+%token PARS_SUBSTR_TOKEN
+%token PARS_CONCAT_TOKEN
+%token PARS_INSTR_TOKEN
+%token PARS_LENGTH_TOKEN
+%token PARS_COMMIT_TOKEN
+%token PARS_ROLLBACK_TOKEN
+%token PARS_WORK_TOKEN
+%token PARS_EXIT_TOKEN
+%token PARS_FUNCTION_TOKEN
+%token PARS_LOCK_TOKEN
+%token PARS_SHARE_TOKEN
+%token PARS_MODE_TOKEN
+%token PARS_LIKE_TOKEN
+%token PARS_LIKE_TOKEN_EXACT
+%token PARS_LIKE_TOKEN_PREFIX
+%token PARS_LIKE_TOKEN_SUFFIX
+%token PARS_LIKE_TOKEN_SUBSTR
+%token PARS_TABLE_NAME_TOKEN
+%token PARS_BIGINT_TOKEN
+
+%left PARS_AND_TOKEN PARS_OR_TOKEN
+%left PARS_NOT_TOKEN
+%left '=' '<' '>' PARS_GE_TOKEN PARS_LE_TOKEN
+%left '-' '+'
+%left '*' '/'
+%left NEG /* negation--unary minus */
+%left '%'
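+
+/* For illustration: the %left declarations above give later-declared
+operators higher precedence, so
+
+	a + b * c	groups as	a + (b * c)
+	- a * b		groups as	(- a) * b
+
+since '*' binds tighter than '+' and unary minus (NEG) tighter still;
+'%' binds tightest of all and appears only in the cursor '%' NOTFOUND
+test below. */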
+
+%expect 41
+
+/* Grammar follows */
+%%
+
+top_statement:
+ procedure_definition ';'
+
+statement:
+ stored_procedure_call
+ | while_statement ';'
+ | for_statement ';'
+ | exit_statement ';'
+ | if_statement ';'
+ | return_statement ';'
+ | assignment_statement ';'
+ | select_statement ';'
+ | insert_statement ';'
+ | delete_statement_searched ';'
+ | delete_statement_positioned ';'
+ | update_statement_searched ';'
+ | update_statement_positioned ';'
+ | open_cursor_statement ';'
+ | fetch_statement ';'
+ | close_cursor_statement ';'
+ | commit_statement ';'
+ | rollback_statement ';'
+ | create_table ';'
+ | create_index ';'
+;
+
+statement_list:
+ statement { $$ = que_node_list_add_last(NULL, $1); }
+ | statement_list statement
+ { $$ = que_node_list_add_last($1, $2); }
+;
+
+exp:
+ PARS_ID_TOKEN { $$ = $1;}
+ | function_name '(' exp_list ')'
+ { $$ = pars_func($1, $3); }
+ | PARS_INT_LIT { $$ = $1;}
+ | PARS_FLOAT_LIT { $$ = $1;}
+ | PARS_STR_LIT { $$ = $1;}
+ | PARS_NULL_LIT { $$ = $1;}
+ | PARS_SQL_TOKEN { $$ = $1;}
+ | exp '+' exp { $$ = pars_op('+', $1, $3); }
+ | exp '-' exp { $$ = pars_op('-', $1, $3); }
+ | exp '*' exp { $$ = pars_op('*', $1, $3); }
+ | exp '/' exp { $$ = pars_op('/', $1, $3); }
+ | '-' exp %prec NEG { $$ = pars_op('-', $2, NULL); }
+ | '(' exp ')' { $$ = $2; }
+ | exp '=' exp { $$ = pars_op('=', $1, $3); }
+ | exp PARS_LIKE_TOKEN PARS_STR_LIT
+ { $$ = pars_op(PARS_LIKE_TOKEN, $1, $3); }
+ | exp '<' exp { $$ = pars_op('<', $1, $3); }
+ | exp '>' exp { $$ = pars_op('>', $1, $3); }
+ | exp PARS_GE_TOKEN exp { $$ = pars_op(PARS_GE_TOKEN, $1, $3); }
+ | exp PARS_LE_TOKEN exp { $$ = pars_op(PARS_LE_TOKEN, $1, $3); }
+ | exp PARS_NE_TOKEN exp { $$ = pars_op(PARS_NE_TOKEN, $1, $3); }
+ | exp PARS_AND_TOKEN exp{ $$ = pars_op(PARS_AND_TOKEN, $1, $3); }
+ | exp PARS_OR_TOKEN exp { $$ = pars_op(PARS_OR_TOKEN, $1, $3); }
+ | PARS_NOT_TOKEN exp { $$ = pars_op(PARS_NOT_TOKEN, $2, NULL); }
+ | PARS_ID_TOKEN '%' PARS_NOTFOUND_TOKEN
+ { $$ = pars_op(PARS_NOTFOUND_TOKEN, $1, NULL); }
+ | PARS_SQL_TOKEN '%' PARS_NOTFOUND_TOKEN
+ { $$ = pars_op(PARS_NOTFOUND_TOKEN, $1, NULL); }
+;
+
+function_name:
+ PARS_TO_BINARY_TOKEN { $$ = &pars_to_binary_token; }
+ | PARS_SUBSTR_TOKEN { $$ = &pars_substr_token; }
+ | PARS_CONCAT_TOKEN { $$ = &pars_concat_token; }
+ | PARS_INSTR_TOKEN { $$ = &pars_instr_token; }
+ | PARS_LENGTH_TOKEN { $$ = &pars_length_token; }
+;
+
+question_mark_list:
+ /* Nothing */
+ | '?'
+ | question_mark_list ',' '?'
+;
+
+stored_procedure_call:
+ '{' PARS_ID_TOKEN '(' question_mark_list ')' '}'
+ { $$ = pars_stored_procedure_call(
+ static_cast<sym_node_t*>($2)); }
+;
+
+user_function_call:
+ PARS_ID_TOKEN '(' ')' { $$ = $1; }
+;
+
+table_list:
+ table_name { $$ = que_node_list_add_last(NULL, $1); }
+ | table_list ',' table_name
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+variable_list:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_ID_TOKEN { $$ = que_node_list_add_last(NULL, $1); }
+ | variable_list ',' PARS_ID_TOKEN
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+exp_list:
+ /* Nothing */ { $$ = NULL; }
+ | exp { $$ = que_node_list_add_last(NULL, $1);}
+ | exp_list ',' exp { $$ = que_node_list_add_last($1, $3); }
+;
+
+select_item:
+ exp { $$ = $1; }
+ | PARS_COUNT_TOKEN '(' '*' ')'
+ { $$ = pars_func(&pars_count_token,
+ que_node_list_add_last(NULL,
+ sym_tab_add_int_lit(
+ pars_sym_tab_global, 1))); }
+;
+
+select_item_list:
+ /* Nothing */ { $$ = NULL; }
+ | select_item { $$ = que_node_list_add_last(NULL, $1); }
+ | select_item_list ',' select_item
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+select_list:
+ '*' { $$ = pars_select_list(&pars_star_denoter,
+ NULL); }
+ | select_item_list PARS_INTO_TOKEN variable_list
+ { $$ = pars_select_list(
+ $1, static_cast<sym_node_t*>($3)); }
+ | select_item_list { $$ = pars_select_list($1, NULL); }
+;
+
+search_condition:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_WHERE_TOKEN exp { $$ = $2; }
+;
+
+for_update_clause:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_FOR_TOKEN PARS_UPDATE_TOKEN
+ { $$ = &pars_update_token; }
+;
+
+lock_shared_clause:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_LOCK_TOKEN PARS_IN_TOKEN PARS_SHARE_TOKEN PARS_MODE_TOKEN
+ { $$ = &pars_share_token; }
+;
+
+order_direction:
+ /* Nothing */ { $$ = &pars_asc_token; }
+ | PARS_ASC_TOKEN { $$ = &pars_asc_token; }
+ | PARS_DESC_TOKEN { $$ = &pars_desc_token; }
+;
+
+order_by_clause:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_ORDER_TOKEN PARS_BY_TOKEN PARS_ID_TOKEN order_direction
+ { $$ = pars_order_by(
+ static_cast<sym_node_t*>($3),
+ static_cast<pars_res_word_t*>($4)); }
+;
+
+select_statement:
+ PARS_SELECT_TOKEN select_list
+ PARS_FROM_TOKEN table_list
+ search_condition
+ for_update_clause
+ lock_shared_clause
+ order_by_clause { $$ = pars_select_statement(
+ static_cast<sel_node_t*>($2),
+ static_cast<sym_node_t*>($4),
+ static_cast<que_node_t*>($5),
+ static_cast<pars_res_word_t*>($6),
+ static_cast<pars_res_word_t*>($7),
+ static_cast<order_node_t*>($8)); }
+;
+
+insert_statement_start:
+ PARS_INSERT_TOKEN PARS_INTO_TOKEN
+ table_name { $$ = $3; }
+;
+
+insert_statement:
+ insert_statement_start PARS_VALUES_TOKEN '(' exp_list ')'
+ { $$ = pars_insert_statement(
+ static_cast<sym_node_t*>($1), $4, NULL); }
+ | insert_statement_start select_statement
+ { $$ = pars_insert_statement(
+ static_cast<sym_node_t*>($1),
+ NULL,
+ static_cast<sel_node_t*>($2)); }
+;
+
+column_assignment:
+ PARS_ID_TOKEN '=' exp { $$ = pars_column_assignment(
+ static_cast<sym_node_t*>($1),
+ static_cast<que_node_t*>($3)); }
+;
+
+column_assignment_list:
+ column_assignment { $$ = que_node_list_add_last(NULL, $1); }
+ | column_assignment_list ',' column_assignment
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+cursor_positioned:
+ PARS_WHERE_TOKEN
+ PARS_CURRENT_TOKEN PARS_OF_TOKEN
+ PARS_ID_TOKEN { $$ = $4; }
+;
+
+update_statement_start:
+ PARS_UPDATE_TOKEN table_name
+ PARS_SET_TOKEN
+ column_assignment_list { $$ = pars_update_statement_start(
+ FALSE,
+ static_cast<sym_node_t*>($2),
+ static_cast<col_assign_node_t*>($4)); }
+;
+
+update_statement_searched:
+ update_statement_start
+ search_condition { $$ = pars_update_statement(
+ static_cast<upd_node_t*>($1),
+ NULL,
+ static_cast<que_node_t*>($2)); }
+;
+
+update_statement_positioned:
+ update_statement_start
+ cursor_positioned { $$ = pars_update_statement(
+ static_cast<upd_node_t*>($1),
+ static_cast<sym_node_t*>($2),
+ NULL); }
+;
+
+delete_statement_start:
+ PARS_DELETE_TOKEN PARS_FROM_TOKEN
+ table_name { $$ = pars_update_statement_start(
+ TRUE,
+ static_cast<sym_node_t*>($3), NULL); }
+;
+
+delete_statement_searched:
+ delete_statement_start
+ search_condition { $$ = pars_update_statement(
+ static_cast<upd_node_t*>($1),
+ NULL,
+ static_cast<que_node_t*>($2)); }
+;
+
+delete_statement_positioned:
+ delete_statement_start
+ cursor_positioned { $$ = pars_update_statement(
+ static_cast<upd_node_t*>($1),
+ static_cast<sym_node_t*>($2),
+ NULL); }
+;
+
+assignment_statement:
+ PARS_ID_TOKEN PARS_ASSIGN_TOKEN exp
+ { $$ = pars_assignment_statement(
+ static_cast<sym_node_t*>($1),
+ static_cast<que_node_t*>($3)); }
+;
+
+elsif_element:
+ PARS_ELSIF_TOKEN
+ exp PARS_THEN_TOKEN statement_list
+ { $$ = pars_elsif_element($2, $4); }
+;
+
+elsif_list:
+ elsif_element { $$ = que_node_list_add_last(NULL, $1); }
+ | elsif_list elsif_element
+ { $$ = que_node_list_add_last($1, $2); }
+;
+
+else_part:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_ELSE_TOKEN statement_list
+ { $$ = $2; }
+ | elsif_list { $$ = $1; }
+;
+
+if_statement:
+ PARS_IF_TOKEN exp PARS_THEN_TOKEN statement_list
+ else_part
+ PARS_END_TOKEN PARS_IF_TOKEN
+ { $$ = pars_if_statement($2, $4, $5); }
+;
+
+while_statement:
+ PARS_WHILE_TOKEN exp PARS_LOOP_TOKEN statement_list
+ PARS_END_TOKEN PARS_LOOP_TOKEN
+ { $$ = pars_while_statement($2, $4); }
+;
+
+for_statement:
+ PARS_FOR_TOKEN PARS_ID_TOKEN PARS_IN_TOKEN
+ exp PARS_DDOT_TOKEN exp
+ PARS_LOOP_TOKEN statement_list
+ PARS_END_TOKEN PARS_LOOP_TOKEN
+ { $$ = pars_for_statement(
+ static_cast<sym_node_t*>($2),
+ $4, $6, $8); }
+;
+
+exit_statement:
+ PARS_EXIT_TOKEN { $$ = pars_exit_statement(); }
+;
+
+return_statement:
+ PARS_RETURN_TOKEN { $$ = pars_return_statement(); }
+;
+
+open_cursor_statement:
+ PARS_OPEN_TOKEN PARS_ID_TOKEN
+ { $$ = pars_open_statement(
+ ROW_SEL_OPEN_CURSOR,
+ static_cast<sym_node_t*>($2)); }
+;
+
+close_cursor_statement:
+ PARS_CLOSE_TOKEN PARS_ID_TOKEN
+ { $$ = pars_open_statement(
+ ROW_SEL_CLOSE_CURSOR,
+ static_cast<sym_node_t*>($2)); }
+;
+
+fetch_statement:
+ PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN variable_list
+ { $$ = pars_fetch_statement(
+ static_cast<sym_node_t*>($2),
+ static_cast<sym_node_t*>($4), NULL); }
+ | PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN user_function_call
+ { $$ = pars_fetch_statement(
+ static_cast<sym_node_t*>($2),
+ NULL,
+ static_cast<sym_node_t*>($4)); }
+;
+
+column_def:
+ PARS_ID_TOKEN type_name opt_column_len opt_not_null
+ { $$ = pars_column_def(
+ static_cast<sym_node_t*>($1),
+ static_cast<pars_res_word_t*>($2),
+ static_cast<sym_node_t*>($3),
+ $4); }
+;
+
+column_def_list:
+ column_def { $$ = que_node_list_add_last(NULL, $1); }
+ | column_def_list ',' column_def
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+opt_column_len:
+ /* Nothing */ { $$ = NULL; }
+ | '(' PARS_INT_LIT ')'
+ { $$ = $2; }
+;
+
+opt_not_null:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_NOT_TOKEN PARS_NULL_LIT
+ { $$ = &pars_int_token;
+ /* pass any non-NULL pointer */ }
+;
+
+create_table:
+ PARS_CREATE_TOKEN PARS_TABLE_TOKEN
+ table_name '(' column_def_list ')'
+ { $$ = pars_create_table(
+ static_cast<sym_node_t*>($3),
+ static_cast<sym_node_t*>($5)); }
+;
+
+column_list:
+ PARS_ID_TOKEN { $$ = que_node_list_add_last(NULL, $1); }
+ | column_list ',' PARS_ID_TOKEN
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+unique_def:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_UNIQUE_TOKEN { $$ = &pars_unique_token; }
+;
+
+clustered_def:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_CLUSTERED_TOKEN { $$ = &pars_clustered_token; }
+;
+
+create_index:
+ PARS_CREATE_TOKEN unique_def
+ clustered_def
+ PARS_INDEX_TOKEN
+ PARS_ID_TOKEN PARS_ON_TOKEN
+ table_name
+ '(' column_list ')' { $$ = pars_create_index(
+ static_cast<pars_res_word_t*>($2),
+ static_cast<pars_res_word_t*>($3),
+ static_cast<sym_node_t*>($5),
+ static_cast<sym_node_t*>($7),
+ static_cast<sym_node_t*>($9)); }
+;
+
+table_name:
+ PARS_ID_TOKEN { $$ = $1; }
+ | PARS_TABLE_NAME_TOKEN { $$ = $1; }
+;
+
+commit_statement:
+ PARS_COMMIT_TOKEN PARS_WORK_TOKEN
+ { $$ = pars_commit_statement(); }
+;
+
+rollback_statement:
+ PARS_ROLLBACK_TOKEN PARS_WORK_TOKEN
+ { $$ = pars_rollback_statement(); }
+;
+
+type_name:
+ PARS_INT_TOKEN { $$ = &pars_int_token; }
+ | PARS_BIGINT_TOKEN { $$ = &pars_bigint_token; }
+ | PARS_CHAR_TOKEN { $$ = &pars_char_token; }
+;
+
+variable_declaration:
+ PARS_ID_TOKEN type_name ';'
+ { $$ = pars_variable_declaration(
+ static_cast<sym_node_t*>($1),
+ static_cast<pars_res_word_t*>($2)); }
+;
+
+variable_declaration_list:
+ /* Nothing */
+ | variable_declaration
+ | variable_declaration_list variable_declaration
+;
+
+cursor_declaration:
+ PARS_DECLARE_TOKEN PARS_CURSOR_TOKEN PARS_ID_TOKEN
+ PARS_IS_TOKEN select_statement ';'
+ { $$ = pars_cursor_declaration(
+ static_cast<sym_node_t*>($3),
+ static_cast<sel_node_t*>($5)); }
+;
+
+function_declaration:
+ PARS_DECLARE_TOKEN PARS_FUNCTION_TOKEN PARS_ID_TOKEN ';'
+ { $$ = pars_function_declaration(
+ static_cast<sym_node_t*>($3)); }
+;
+
+declaration:
+ cursor_declaration
+ | function_declaration
+;
+
+declaration_list:
+ /* Nothing */
+ | declaration
+ | declaration_list declaration
+;
+
+procedure_definition:
+ PARS_PROCEDURE_TOKEN PARS_ID_TOKEN '(' ')'
+ PARS_IS_TOKEN
+ variable_declaration_list
+ declaration_list
+ PARS_BEGIN_TOKEN
+ statement_list
+ PARS_END_TOKEN { $$ = pars_procedure_definition(
+ static_cast<sym_node_t*>($2), $9); }
+;
+
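+/* Illustrative sketch (not part of the grammar): a minimal procedure
+that these rules accept, in the style InnoDB uses internally:
+
+	PROCEDURE P () IS
+	DECLARE CURSOR cur IS SELECT NAME FROM SYS_TABLES;
+	BEGIN
+	OPEN cur;
+	CLOSE cur;
+	COMMIT WORK;
+	END;
+*/
+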
+%%
diff --git a/storage/innobase/pars/pars0lex.l b/storage/innobase/pars/pars0lex.l
new file mode 100644
index 00000000..1ddc5132
--- /dev/null
+++ b/storage/innobase/pars/pars0lex.l
@@ -0,0 +1,614 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************
+SQL parser lexical analyzer: input file for the GNU Flex lexer generator
+
+The InnoDB parser is frozen because MySQL takes care of SQL parsing.
+Therefore we normally keep the InnoDB parser C files as they are, and do
+not automatically generate them from pars0grm.y and pars0lex.l.
+
+How to make the InnoDB parser and lexer C files:
+
+1. Run ./make_flex.sh to generate lexer files.
+
+2. Run ./make_bison.sh to generate parser files.
+
+These instructions seem to work at least with bison-1.875d and flex-2.5.31 on
+Linux.
+
+Created 12/14/1997 Heikki Tuuri
+*******************************************************/
+
+%option nostdinit
+%option 8bit
+%option warn
+%option pointer
+%option never-interactive
+%option nodefault
+%option noinput
+%option nounput
+%option noyywrap
+%option noyy_scan_buffer
+%option noyy_scan_bytes
+%option noyy_scan_string
+%option nounistd
+
+%{
+#define YYSTYPE que_node_t*
+
+#include "univ.i"
+#include "pars0pars.h"
+#include "pars0grm.h"
+#include "pars0sym.h"
+#include "mem0mem.h"
+
+#define malloc(A) ut_malloc_nokey(A)
+#define free(A) ut_free(A)
+#define realloc(P, A) ut_realloc(P, A)
+#define exit(A) ut_error
+
+#define YY_INPUT(buf, result, max_size) \
+ result = pars_get_lex_chars(buf, max_size)
+
+/* String buffer for removing quotes */
+static ulint stringbuf_len_alloc = 0; /* Allocated length */
+static ulint stringbuf_len = 0; /* Current length */
+static char* stringbuf; /* Start of buffer */
+/** Appends a string to the buffer. */
+static
+void
+string_append(
+/*==========*/
+ const char* str, /*!< in: string to be appended */
+ ulint len) /*!< in: length of the string */
+{
+ if (stringbuf == NULL) {
+ stringbuf = static_cast<char*>(malloc(1));
+ stringbuf_len_alloc = 1;
+ }
+
+ if (stringbuf_len + len > stringbuf_len_alloc) {
+ while (stringbuf_len + len > stringbuf_len_alloc) {
+ stringbuf_len_alloc <<= 1;
+ }
+
+ stringbuf = static_cast<char*>(
+ realloc(stringbuf, stringbuf_len_alloc));
+ }
+
+ memcpy(stringbuf + stringbuf_len, str, len);
+ stringbuf_len += len;
+}
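+
+/* Illustrative example: appending "ab" and then "cde" to an empty
+buffer grows stringbuf_len_alloc from 1 to 2 and then to 8 (repeated
+doubling until the data fits), leaving stringbuf holding "abcde" with
+stringbuf_len == 5. Note that the buffer is not NUL-terminated;
+callers rely on stringbuf_len. */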
+
+%}
+
+DIGIT [0-9]
+ID [a-z_A-Z][a-z_A-Z0-9]*
+TABLE_NAME [a-z_A-Z][@a-z_A-Z0-9]*\/(#sql-|[a-z_A-Z])[a-z_A-Z0-9]*
+BOUND_LIT \:[a-z_A-Z0-9]+
+BOUND_ID \$[a-z_A-Z0-9]+
+
+%x comment
+%x quoted
+%x id
+%%
+
+{DIGIT}+ {
+ yylval = sym_tab_add_int_lit(pars_sym_tab_global,
+ atoi(yytext));
+ return(PARS_INT_LIT);
+}
+
+{DIGIT}+"."{DIGIT}* {
+ ut_error; /* not implemented */
+
+ return(PARS_FLOAT_LIT);
+}
+
+{BOUND_LIT} {
+ ulint type;
+
+ yylval = sym_tab_add_bound_lit(pars_sym_tab_global,
+ yytext + 1, &type);
+
+ return((int) type);
+}
+
+{BOUND_ID} {
+ yylval = sym_tab_add_bound_id(pars_sym_tab_global,
+ yytext + 1);
+
+ return(PARS_ID_TOKEN);
+}
+
+"'" {
+/* Quoted character string literals are handled in an explicit
+start state 'quoted'. This state is entered and the buffer for
+the scanned string is emptied upon encountering a starting quote.
+
+In the state 'quoted', only two actions are possible (defined below). */
+ BEGIN(quoted);
+ stringbuf_len = 0;
+}
+<quoted>[^\']+ {
+ /* Got a sequence of characters other than "'":
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+<quoted>"'"+ {
+ /* Got a sequence of "'" characters:
+ append half of them to string buffer,
+ as "''" represents a single "'".
+ We apply truncating division,
+ so that "'''" will result in "'". */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ string literal. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_str_lit(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+ return(PARS_STR_LIT);
+ }
+}
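+
+/* Illustrative example: the input 'it''s' enters state 'quoted',
+appends "it", then sees two quotes and appends a single "'", appends
+"s", and the final lone quote (odd count) terminates the literal,
+yielding the string it's. */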
+
+\" {
+/* Quoted identifiers are handled in an explicit start state 'id'.
+This state is entered and the buffer for the scanned string is emptied
+upon encountering a starting quote.
+
+In the state 'id', only two actions are possible (defined below). */
+ BEGIN(id);
+ stringbuf_len = 0;
+}
+<id>[^\"]+ {
+ /* Got a sequence of characters other than '"':
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+<id>\"+ {
+ /* Got a sequence of '"' characters:
+ append half of them to string buffer,
+ as '""' represents a single '"'.
+ We apply truncating division,
+ so that '"""' will result in '"'. */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ identifier. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_id(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+
+ return(PARS_ID_TOKEN);
+ }
+}
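+
+/* Illustrative example: the quoted identifier "my""id" is scanned the
+same way, yielding the identifier my"id. */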
+
+"NULL" {
+ yylval = sym_tab_add_null_lit(pars_sym_tab_global);
+
+ return(PARS_NULL_LIT);
+}
+
+"SQL" {
+ /* Implicit cursor name */
+ yylval = sym_tab_add_str_lit(pars_sym_tab_global,
+ (byte*) yytext, yyleng);
+ return(PARS_SQL_TOKEN);
+}
+
+"AND" {
+ return(PARS_AND_TOKEN);
+}
+
+"OR" {
+ return(PARS_OR_TOKEN);
+}
+
+"NOT" {
+ return(PARS_NOT_TOKEN);
+}
+
+"PROCEDURE" {
+ return(PARS_PROCEDURE_TOKEN);
+}
+
+"IN" {
+ return(PARS_IN_TOKEN);
+}
+
+"INT" {
+ return(PARS_INT_TOKEN);
+}
+
+"CHAR" {
+ return(PARS_CHAR_TOKEN);
+}
+
+"IS" {
+ return(PARS_IS_TOKEN);
+}
+
+"BEGIN" {
+ return(PARS_BEGIN_TOKEN);
+}
+
+"END" {
+ return(PARS_END_TOKEN);
+}
+
+"IF" {
+ return(PARS_IF_TOKEN);
+}
+
+"THEN" {
+ return(PARS_THEN_TOKEN);
+}
+
+"ELSE" {
+ return(PARS_ELSE_TOKEN);
+}
+
+"ELSIF" {
+ return(PARS_ELSIF_TOKEN);
+}
+
+"LOOP" {
+ return(PARS_LOOP_TOKEN);
+}
+
+"WHILE" {
+ return(PARS_WHILE_TOKEN);
+}
+
+"RETURN" {
+ return(PARS_RETURN_TOKEN);
+}
+
+"SELECT" {
+ return(PARS_SELECT_TOKEN);
+}
+
+"COUNT" {
+ return(PARS_COUNT_TOKEN);
+}
+
+"FROM" {
+ return(PARS_FROM_TOKEN);
+}
+
+"WHERE" {
+ return(PARS_WHERE_TOKEN);
+}
+
+"FOR" {
+ return(PARS_FOR_TOKEN);
+}
+
+"ORDER" {
+ return(PARS_ORDER_TOKEN);
+}
+
+"BY" {
+ return(PARS_BY_TOKEN);
+}
+
+"ASC" {
+ return(PARS_ASC_TOKEN);
+}
+
+"DESC" {
+ return(PARS_DESC_TOKEN);
+}
+
+"INSERT" {
+ return(PARS_INSERT_TOKEN);
+}
+
+"INTO" {
+ return(PARS_INTO_TOKEN);
+}
+
+"VALUES" {
+ return(PARS_VALUES_TOKEN);
+}
+
+"UPDATE" {
+ return(PARS_UPDATE_TOKEN);
+}
+
+"SET" {
+ return(PARS_SET_TOKEN);
+}
+
+"DELETE" {
+ return(PARS_DELETE_TOKEN);
+}
+
+"CURRENT" {
+ return(PARS_CURRENT_TOKEN);
+}
+
+"OF" {
+ return(PARS_OF_TOKEN);
+}
+
+"CREATE" {
+ return(PARS_CREATE_TOKEN);
+}
+
+"TABLE" {
+ return(PARS_TABLE_TOKEN);
+}
+
+"INDEX" {
+ return(PARS_INDEX_TOKEN);
+}
+
+"UNIQUE" {
+ return(PARS_UNIQUE_TOKEN);
+}
+
+"CLUSTERED" {
+ return(PARS_CLUSTERED_TOKEN);
+}
+
+"ON" {
+ return(PARS_ON_TOKEN);
+}
+
+"DECLARE" {
+ return(PARS_DECLARE_TOKEN);
+}
+
+"CURSOR" {
+ return(PARS_CURSOR_TOKEN);
+}
+
+"OPEN" {
+ return(PARS_OPEN_TOKEN);
+}
+
+"FETCH" {
+ return(PARS_FETCH_TOKEN);
+}
+
+"CLOSE" {
+ return(PARS_CLOSE_TOKEN);
+}
+
+"NOTFOUND" {
+ return(PARS_NOTFOUND_TOKEN);
+}
+
+"TO_BINARY" {
+ return(PARS_TO_BINARY_TOKEN);
+}
+
+"SUBSTR" {
+ return(PARS_SUBSTR_TOKEN);
+}
+
+"CONCAT" {
+ return(PARS_CONCAT_TOKEN);
+}
+
+"INSTR" {
+ return(PARS_INSTR_TOKEN);
+}
+
+"LENGTH" {
+ return(PARS_LENGTH_TOKEN);
+}
+
+"COMMIT" {
+ return(PARS_COMMIT_TOKEN);
+}
+
+"ROLLBACK" {
+ return(PARS_ROLLBACK_TOKEN);
+}
+
+"WORK" {
+ return(PARS_WORK_TOKEN);
+}
+
+"EXIT" {
+ return(PARS_EXIT_TOKEN);
+}
+
+"FUNCTION" {
+ return(PARS_FUNCTION_TOKEN);
+}
+
+"LOCK" {
+ return(PARS_LOCK_TOKEN);
+}
+
+"SHARE" {
+ return(PARS_SHARE_TOKEN);
+}
+
+"MODE" {
+ return(PARS_MODE_TOKEN);
+}
+
+"LIKE" {
+ return(PARS_LIKE_TOKEN);
+}
+
+"BIGINT" {
+ return(PARS_BIGINT_TOKEN);
+}
+
+{ID} {
+ yylval = sym_tab_add_id(pars_sym_tab_global,
+ (byte*) yytext,
+ strlen(yytext));
+ return(PARS_ID_TOKEN);
+}
+
+{TABLE_NAME} {
+ yylval = sym_tab_add_id(pars_sym_tab_global,
+ (byte*) yytext,
+ strlen(yytext));
+ return(PARS_TABLE_NAME_TOKEN);
+}
+
+".." {
+ return(PARS_DDOT_TOKEN);
+}
+
+":=" {
+ return(PARS_ASSIGN_TOKEN);
+}
+
+"<=" {
+ return(PARS_LE_TOKEN);
+}
+
+">=" {
+ return(PARS_GE_TOKEN);
+}
+
+"<>" {
+ return(PARS_NE_TOKEN);
+}
+
+"(" {
+
+ return((int)(*yytext));
+}
+
+"=" {
+
+ return((int)(*yytext));
+}
+
+">" {
+
+ return((int)(*yytext));
+}
+
+"<" {
+
+ return((int)(*yytext));
+}
+
+"," {
+
+ return((int)(*yytext));
+}
+
+";" {
+
+ return((int)(*yytext));
+}
+
+")" {
+
+ return((int)(*yytext));
+}
+
+"+" {
+
+ return((int)(*yytext));
+}
+
+"-" {
+
+ return((int)(*yytext));
+}
+
+"*" {
+
+ return((int)(*yytext));
+}
+
+"/" {
+
+ return((int)(*yytext));
+}
+
+"%" {
+
+ return((int)(*yytext));
+}
+
+"{" {
+
+ return((int)(*yytext));
+}
+
+"}" {
+
+ return((int)(*yytext));
+}
+
+"?" {
+
+ return((int)(*yytext));
+}
+
+"/*" BEGIN(comment); /* eat up comment */
+
+<comment>[^*]*
+<comment>"*"+[^*/]*
+<comment>"*"+"/" BEGIN(INITIAL);
+
+[ \t\n]+ /* eat up whitespace */
+
+
+. {
+ fprintf(stderr,"Unrecognized character: %02x\n",
+ *yytext);
+
+ ut_error;
+
+ return(0);
+}
+
+%%
+
+/**********************************************************************
+Release any resources used by the lexer. */
+void
+pars_lexer_close(void)
+/*==================*/
+{
+ yylex_destroy();
+ free(stringbuf);
+ stringbuf = NULL;
+ stringbuf_len_alloc = stringbuf_len = 0;
+}
diff --git a/storage/innobase/pars/pars0opt.cc b/storage/innobase/pars/pars0opt.cc
new file mode 100644
index 00000000..e1a913b0
--- /dev/null
+++ b/storage/innobase/pars/pars0opt.cc
@@ -0,0 +1,1267 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file pars/pars0opt.cc
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#include "pars0opt.h"
+#include "row0sel.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "dict0boot.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "que0que.h"
+#include "pars0grm.h"
+#include "pars0pars.h"
+
+#define OPT_EQUAL 1 /* comparison by = */
+#define OPT_COMPARISON 2 /* comparison by <, >, <=, or >= */
+
+#define OPT_NOT_COND 1
+#define OPT_END_COND 2
+#define OPT_TEST_COND 3
+#define OPT_SCROLL_COND 4
+
+
+/*******************************************************************//**
+Inverts a comparison operator.
+@return the equivalent operator when the order of the arguments is switched */
+static
+int
+opt_invert_cmp_op(
+/*==============*/
+ int op) /*!< in: operator */
+{
+ if (op == '<') {
+ return('>');
+ } else if (op == '>') {
+ return('<');
+ } else if (op == '=') {
+ return('=');
+ } else if (op == PARS_LE_TOKEN) {
+ return(PARS_GE_TOKEN);
+ } else if (op == PARS_GE_TOKEN) {
+ return(PARS_LE_TOKEN);
+ } else {
+ /* TODO: LIKE operator */
+ ut_error;
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Checks if the value of an expression can be calculated BEFORE the nth table
+in a join is accessed. If this is the case, it can possibly be used in an
+index search for the nth table.
+@return TRUE if already determined */
+static
+ibool
+opt_check_exp_determined_before(
+/*============================*/
+ que_node_t* exp, /*!< in: expression */
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint nth_table) /*!< in: nth table will be accessed */
+{
+ func_node_t* func_node;
+ sym_node_t* sym_node;
+ dict_table_t* table;
+ que_node_t* arg;
+ ulint i;
+
+ ut_ad(exp && sel_node);
+
+ if (que_node_get_type(exp) == QUE_NODE_FUNC) {
+ func_node = static_cast<func_node_t*>(exp);
+
+ arg = func_node->args;
+
+ while (arg) {
+ if (!opt_check_exp_determined_before(arg, sel_node,
+ nth_table)) {
+ return(FALSE);
+ }
+
+ arg = que_node_get_next(arg);
+ }
+
+ return(TRUE);
+ }
+
+ ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
+
+ sym_node = static_cast<sym_node_t*>(exp);
+
+ if (sym_node->token_type != SYM_COLUMN) {
+
+ return(TRUE);
+ }
+
+ for (i = 0; i < nth_table; i++) {
+
+ table = sel_node_get_nth_plan(sel_node, i)->table;
+
+ if (sym_node->table == table) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
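+
+/* Illustrative example: in a join of t1, t2 with nth_table == 1 (t2),
+the expression t1.a + 5 is already determined before t2 is accessed
+(t1 precedes t2 in the plan, and 5 is a literal), whereas any
+expression involving a column of t2 is not. */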
+
+/*******************************************************************//**
+Looks in a comparison condition to see if a column value is already
+restricted by it BEFORE the nth table is accessed.
+@return expression restricting the value of the column, or NULL if not known */
+static
+que_node_t*
+opt_look_for_col_in_comparison_before(
+/*==================================*/
+ ulint cmp_type, /*!< in: OPT_EQUAL, OPT_COMPARISON */
+ ulint col_no, /*!< in: column number */
+ func_node_t* search_cond, /*!< in: comparison condition */
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint nth_table, /*!< in: nth table in a join (a query
+ from a single table is considered a
+ join of 1 table) */
+ ulint* op) /*!< out: comparison operator ('=',
+ PARS_GE_TOKEN, ... ); this is inverted
+ if the column appears on the right
+ side */
+{
+ sym_node_t* sym_node;
+ dict_table_t* table;
+ que_node_t* exp;
+ que_node_t* arg;
+
+ ut_ad(search_cond);
+
+ ut_a((search_cond->func == '<')
+ || (search_cond->func == '>')
+ || (search_cond->func == '=')
+ || (search_cond->func == PARS_GE_TOKEN)
+ || (search_cond->func == PARS_LE_TOKEN)
+ || (search_cond->func == PARS_LIKE_TOKEN_EXACT)
+ || (search_cond->func == PARS_LIKE_TOKEN_PREFIX)
+ || (search_cond->func == PARS_LIKE_TOKEN_SUFFIX)
+ || (search_cond->func == PARS_LIKE_TOKEN_SUBSTR));
+
+ table = sel_node_get_nth_plan(sel_node, nth_table)->table;
+
+ if ((cmp_type == OPT_EQUAL)
+ && (search_cond->func != '=')
+ && (search_cond->func != PARS_LIKE_TOKEN_EXACT)
+ && (search_cond->func != PARS_LIKE_TOKEN_PREFIX)) {
+
+ return(NULL);
+
+ } else if ((cmp_type == OPT_COMPARISON)
+ && (search_cond->func != '<')
+ && (search_cond->func != '>')
+ && (search_cond->func != PARS_GE_TOKEN)
+ && (search_cond->func != PARS_LE_TOKEN)
+ && (search_cond->func != PARS_LIKE_TOKEN_PREFIX)
+ && (search_cond->func != PARS_LIKE_TOKEN_SUFFIX)) {
+
+ return(NULL);
+ }
+
+ arg = search_cond->args;
+
+ if (que_node_get_type(arg) == QUE_NODE_SYMBOL) {
+ sym_node = static_cast<sym_node_t*>(arg);
+
+ if ((sym_node->token_type == SYM_COLUMN)
+ && (sym_node->table == table)
+ && (sym_node->col_no == col_no)) {
+
+ /* sym_node contains the desired column id */
+
+ /* Check if the expression on the right side of the
+ operator is already determined */
+
+ exp = que_node_get_next(arg);
+
+ if (opt_check_exp_determined_before(exp, sel_node,
+ nth_table)) {
+ *op = ulint(search_cond->func);
+
+ return(exp);
+ }
+ }
+ }
+
+ exp = search_cond->args;
+ arg = que_node_get_next(arg);
+
+ if (que_node_get_type(arg) == QUE_NODE_SYMBOL) {
+ sym_node = static_cast<sym_node_t*>(arg);
+
+ if ((sym_node->token_type == SYM_COLUMN)
+ && (sym_node->table == table)
+ && (sym_node->col_no == col_no)) {
+
+ if (opt_check_exp_determined_before(exp, sel_node,
+ nth_table)) {
+ *op = ulint(opt_invert_cmp_op(
+ search_cond->func));
+
+ return(exp);
+ }
+ }
+ }
+
+ return(NULL);
+}
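+
+/* Illustrative example: when t is the table of the nth plan and the
+search condition contains t.a < :lit, a call with cmp_type ==
+OPT_COMPARISON and the column number of a returns the bound-literal
+expression and sets *op = '<'; for the mirrored condition :lit > t.a
+the operator is inverted by opt_invert_cmp_op(), giving the same
+result. */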
+
+/*******************************************************************//**
+Looks in a search condition to see if a column value is already restricted
+by the search condition BEFORE the nth table is accessed. Takes into
+account that if we will fetch in ascending order, we cannot utilize an
+upper limit for a column value; in descending order, correspondingly, a
+lower limit.
+@return expression restricting the value of the column, or NULL if not known */
+static
+que_node_t*
+opt_look_for_col_in_cond_before(
+/*============================*/
+ ulint cmp_type, /*!< in: OPT_EQUAL, OPT_COMPARISON */
+ ulint col_no, /*!< in: column number */
+ func_node_t* search_cond, /*!< in: search condition or NULL */
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint nth_table, /*!< in: nth table in a join (a query
+ from a single table is considered a
+ join of 1 table) */
+ ulint* op) /*!< out: comparison operator ('=',
+ PARS_GE_TOKEN, ... ) */
+{
+ func_node_t* new_cond;
+ que_node_t* exp;
+
+ if (search_cond == NULL) {
+
+ return(NULL);
+ }
+
+ ut_a(que_node_get_type(search_cond) == QUE_NODE_FUNC);
+ ut_a(search_cond->func != PARS_OR_TOKEN);
+ ut_a(search_cond->func != PARS_NOT_TOKEN);
+
+ if (search_cond->func == PARS_AND_TOKEN) {
+ new_cond = static_cast<func_node_t*>(search_cond->args);
+
+ exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
+ new_cond, sel_node,
+ nth_table, op);
+ if (exp) {
+
+ return(exp);
+ }
+
+ new_cond = static_cast<func_node_t*>(
+ que_node_get_next(new_cond));
+
+ exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
+ new_cond, sel_node,
+ nth_table, op);
+ return(exp);
+ }
+
+ exp = opt_look_for_col_in_comparison_before(cmp_type, col_no,
+ search_cond, sel_node,
+ nth_table, op);
+ if (exp == NULL) {
+
+ return(NULL);
+ }
+
+	/* If we will fetch in ascending order, we cannot utilize an upper
+	limit for a column value; in descending order, correspondingly, a
+	lower limit */
+
+ if (sel_node->asc && ((*op == '<') || (*op == PARS_LE_TOKEN))) {
+
+ return(NULL);
+
+ } else if (!sel_node->asc
+ && ((*op == '>') || (*op == PARS_GE_TOKEN))) {
+
+ return(NULL);
+ }
+
+ return(exp);
+}
+
+/*******************************************************************//**
+Calculates the goodness for an index according to a select node. The
+goodness is 4 times the number of first fields in the index whose values
+we already know exactly in the query. If we have a comparison condition
+for an additional field, 2 points are added. If the index is unique, and
+we know all the unique fields for the index, we add 1024 points (and a
+further 1024 if the index is also clustered). For any used clustered
+index we add 1 more point.
+@return goodness */
+static
+ulint
+opt_calc_index_goodness(
+/*====================*/
+ dict_index_t* index, /*!< in: index */
+ sel_node_t* sel_node, /*!< in: parsed select node */
+ ulint nth_table, /*!< in: nth table in a join */
+ que_node_t** index_plan, /*!< in/out: comparison expressions for
+ this index */
+ ulint* last_op) /*!< out: last comparison operator, if
+ goodness > 1 */
+{
+ que_node_t* exp;
+ ulint goodness;
+ ulint n_fields;
+ ulint col_no;
+ ulint op;
+ ulint j;
+
+ /* At least for now we don't support using FTS indexes for queries
+ done through InnoDB's own SQL parser. */
+ if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) {
+ return(0);
+ }
+
+ goodness = 0;
+
+ /* Note that as higher level node pointers in the B-tree contain
+ page addresses as the last field, we must not put more fields in
+ the search tuple than dict_index_get_n_unique_in_tree(index); see
+ the note in btr_cur_search_to_nth_level. */
+
+ n_fields = dict_index_get_n_unique_in_tree(index);
+
+ for (j = 0; j < n_fields; j++) {
+
+ col_no = dict_index_get_nth_col_no(index, j);
+
+ exp = opt_look_for_col_in_cond_before(
+ OPT_EQUAL, col_no,
+ static_cast<func_node_t*>(sel_node->search_cond),
+ sel_node, nth_table, &op);
+ if (exp) {
+ /* The value for this column is exactly known already
+ at this stage of the join */
+
+ index_plan[j] = exp;
+ *last_op = op;
+ goodness += 4;
+ } else {
+ /* Look for non-equality comparisons */
+
+ exp = opt_look_for_col_in_cond_before(
+ OPT_COMPARISON, col_no,
+ static_cast<func_node_t*>(
+ sel_node->search_cond),
+ sel_node, nth_table, &op);
+ if (exp) {
+ index_plan[j] = exp;
+ *last_op = op;
+ goodness += 2;
+ }
+
+ break;
+ }
+ }
+
+ if (goodness / 4 >= dict_index_get_n_unique(index)) {
+ goodness += 1024;
+
+ if (dict_index_is_clust(index)) {
+
+ goodness += 1024;
+ }
+ }
+
+ /* We have to test for goodness here, as last_op may not be set */
+ if (goodness && dict_index_is_clust(index)) {
+
+ goodness++;
+ }
+
+ return(goodness);
+}
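+
+/* Worked example (illustrative): for an index on (a, b) and the search
+condition a = 5 AND b > 7, the first field is matched exactly (+4) and
+the second by a comparison (+2), giving goodness 6; if not all unique
+fields are matched, no 1024 bonus applies, and a clustered index adds
+one final point, for a total of 7. */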
+
+/*******************************************************************//**
+Calculates the number of matched fields based on an index goodness.
+@return number of exactly or partially matched fields */
+UNIV_INLINE
+ulint
+opt_calc_n_fields_from_goodness(
+/*============================*/
+ ulint goodness) /*!< in: goodness */
+{
+ return(((goodness % 1024) + 2) / 4);
+}
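+
+/* Continuing the example above (illustrative): goodness 6 yields
+((6 % 1024) + 2) / 4 == 2 fields (one exact match plus one comparison),
+whereas goodness 4 would yield (4 + 2) / 4 == 1. */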
+
+/*******************************************************************//**
+Converts a comparison operator to the corresponding search mode PAGE_CUR_GE,
+...
+@return search mode */
+UNIV_INLINE
+page_cur_mode_t
+opt_op_to_search_mode(
+/*==================*/
+ ibool asc, /*!< in: TRUE if the rows should be fetched in an
+ ascending order */
+ ulint op) /*!< in: operator '=', PARS_GE_TOKEN, ... */
+{
+ if (op == '='
+ || op == PARS_LIKE_TOKEN_EXACT
+ || op == PARS_LIKE_TOKEN_PREFIX
+ || op == PARS_LIKE_TOKEN_SUFFIX
+ || op == PARS_LIKE_TOKEN_SUBSTR) {
+
+ if (asc) {
+ return(PAGE_CUR_GE);
+ } else {
+ return(PAGE_CUR_LE);
+ }
+ } else if (op == '<') {
+ ut_a(!asc);
+ return(PAGE_CUR_L);
+ } else if (op == '>') {
+ ut_a(asc);
+ return(PAGE_CUR_G);
+ } else if (op == PARS_GE_TOKEN) {
+ ut_a(asc);
+ return(PAGE_CUR_GE);
+ } else if (op == PARS_LE_TOKEN) {
+ ut_a(!asc);
+ return(PAGE_CUR_LE);
+ } else {
+ ut_error;
+ }
+
+ return(PAGE_CUR_UNSUPP);
+}
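+
+/* Illustrative example: an ascending fetch whose last operator is '>'
+maps to PAGE_CUR_G (position just after the search tuple), while a
+descending fetch with PARS_LE_TOKEN maps to PAGE_CUR_LE. */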
+
+/*******************************************************************//**
+Determines if a node is an argument node of a function node.
+@return TRUE if is an argument */
+static
+ibool
+opt_is_arg(
+/*=======*/
+ que_node_t* arg_node, /*!< in: possible argument node */
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg;
+
+ arg = func_node->args;
+
+ while (arg) {
+ if (arg == arg_node) {
+
+ return(TRUE);
+ }
+
+ arg = que_node_get_next(arg);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Decides if the fetching of rows should be made in a descending order, and
+also checks that the chosen query plan produces a result which satisfies
+the order-by. */
+static
+void
+opt_check_order_by(
+/*===============*/
+ sel_node_t* sel_node) /*!< in: select node; asserts an error
+ if the plan does not agree with the
+ order-by */
+{
+ order_node_t* order_node;
+ dict_table_t* order_table;
+ ulint order_col_no;
+ plan_t* plan;
+ ulint i;
+
+ if (!sel_node->order_by) {
+
+ return;
+ }
+
+ order_node = sel_node->order_by;
+ order_col_no = order_node->column->col_no;
+ order_table = order_node->column->table;
+
+	/* If there is an order-by clause, the first non-exactly matched field
+	in the index used for the last table in the table list should be the
+	column defined in the order-by clause, and for all the other tables
+	we should get at most a single row; otherwise we cannot presently
+	calculate the order-by, as we have no sort utility */
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ if (i < sel_node->n_tables - 1) {
+ ut_a(dict_index_get_n_unique(plan->index)
+ <= plan->n_exact_match);
+ } else {
+ ut_a(plan->table == order_table);
+
+ ut_a((dict_index_get_n_unique(plan->index)
+ <= plan->n_exact_match)
+ || (dict_index_get_nth_col_no(plan->index,
+ plan->n_exact_match)
+ == order_col_no));
+ }
+ }
+}
+
+/*******************************************************************//**
+Chooses an index for accessing one table of a select. The tables are
+accessed in the order in which they were written to the FROM part of the
+select statement. */
+static
+void
+opt_search_plan_for_table(
+/*======================*/
+ sel_node_t* sel_node, /*!< in: parsed select node */
+ ulint i, /*!< in: this is the ith table */
+ dict_table_t* table) /*!< in: table */
+{
+ plan_t* plan;
+ dict_index_t* index;
+ dict_index_t* best_index;
+ ulint n_fields;
+ ulint goodness;
+ ulint last_op = 75946965; /* Eliminate a Purify
+ warning */
+ ulint best_goodness;
+ ulint best_last_op = 0; /* remove warning */
+ que_node_t* index_plan[256];
+ que_node_t* best_index_plan[256];
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ plan->table = table;
+ plan->asc = sel_node->asc;
+ plan->pcur_is_open = FALSE;
+ plan->cursor_at_end = FALSE;
+
+ /* Calculate goodness for each index of the table */
+
+ index = dict_table_get_first_index(table);
+ best_index = index; /* Eliminate compiler warning */
+ best_goodness = 0;
+
+	/* TODO: should this be a do ... while loop? (comment by Jani) */
+ while (index) {
+ goodness = opt_calc_index_goodness(index, sel_node, i,
+ index_plan, &last_op);
+ if (goodness > best_goodness) {
+
+ best_index = index;
+ best_goodness = goodness;
+ n_fields = opt_calc_n_fields_from_goodness(goodness);
+
+ memcpy(best_index_plan, index_plan,
+ n_fields * sizeof *index_plan);
+ best_last_op = last_op;
+ }
+
+ dict_table_next_uncorrupted_index(index);
+ }
+
+ plan->index = best_index;
+
+ n_fields = opt_calc_n_fields_from_goodness(best_goodness);
+
+ if (n_fields == 0) {
+ plan->tuple = NULL;
+ plan->n_exact_match = 0;
+ } else {
+ plan->tuple = dtuple_create(pars_sym_tab_global->heap,
+ n_fields);
+ dict_index_copy_types(plan->tuple, plan->index, n_fields);
+
+ plan->tuple_exps = static_cast<que_node_t**>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap,
+ n_fields * sizeof(void*)));
+
+ memcpy(plan->tuple_exps, best_index_plan,
+ n_fields * sizeof *best_index_plan);
+ if (best_last_op == '='
+ || best_last_op == PARS_LIKE_TOKEN_EXACT
+ || best_last_op == PARS_LIKE_TOKEN_PREFIX
+ || best_last_op == PARS_LIKE_TOKEN_SUFFIX
+ || best_last_op == PARS_LIKE_TOKEN_SUBSTR) {
+ plan->n_exact_match = n_fields;
+ } else {
+ plan->n_exact_match = n_fields - 1;
+ }
+
+ plan->mode = opt_op_to_search_mode(sel_node->asc,
+ best_last_op);
+ }
+
+ if (dict_index_is_clust(best_index)
+ && (plan->n_exact_match >= dict_index_get_n_unique(best_index))) {
+
+ plan->unique_search = TRUE;
+ } else {
+ plan->unique_search = FALSE;
+ }
+
+ plan->old_vers_heap = NULL;
+
+ btr_pcur_init(&(plan->pcur));
+ btr_pcur_init(&(plan->clust_pcur));
+}
+
+/*******************************************************************//**
+Looks at a comparison condition and decides whether it can, and needs to,
+be tested for a table AFTER the table has been accessed.
+@return OPT_NOT_COND if not for this table, else OPT_END_COND,
+OPT_TEST_COND, or OPT_SCROLL_COND, where the last means that the
+condition need not be tested, except when scroll cursors are used */
+static
+ulint
+opt_classify_comparison(
+/*====================*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i, /*!< in: ith table in the join */
+ func_node_t* cond) /*!< in: comparison condition */
+{
+ plan_t* plan;
+ ulint n_fields;
+ ulint op;
+ ulint j;
+
+ ut_ad(cond && sel_node);
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+	/* Check if the condition is determined after the ith table has been
+	accessed, but not already after the (i - 1)th table */
+
+ if (!opt_check_exp_determined_before(cond, sel_node, i + 1)) {
+
+ return(OPT_NOT_COND);
+ }
+
+ if ((i > 0) && opt_check_exp_determined_before(cond, sel_node, i)) {
+
+ return(OPT_NOT_COND);
+ }
+
+ /* If the condition is an exact match condition used in constructing
+ the search tuple, it is classified as OPT_END_COND */
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+ } else {
+ n_fields = 0;
+ }
+
+ for (j = 0; j < plan->n_exact_match; j++) {
+
+ if (opt_is_arg(plan->tuple_exps[j], cond)) {
+
+ return(OPT_END_COND);
+ }
+ }
+
+	/* If the condition is a non-exact match condition used in
+	constructing the search tuple, it is classified as OPT_SCROLL_COND.
+	Once the cursor is positioned, there is no need to test this
+	condition if a non-scroll cursor is used; with a scroll cursor the
+	test is necessary when the cursor is reversed. */
+
+ if ((n_fields > plan->n_exact_match)
+ && opt_is_arg(plan->tuple_exps[n_fields - 1], cond)) {
+
+ return(OPT_SCROLL_COND);
+ }
+
+ /* If the condition is a non-exact match condition on the first field
+ in index for which there is no exact match, and it limits the search
+ range from the opposite side of the search tuple already BEFORE we
+ access the table, it is classified as OPT_END_COND */
+
+ if ((dict_index_get_n_fields(plan->index) > plan->n_exact_match)
+ && opt_look_for_col_in_comparison_before(
+ OPT_COMPARISON,
+ dict_index_get_nth_col_no(plan->index,
+ plan->n_exact_match),
+ cond, sel_node, i, &op)) {
+
+ if (sel_node->asc && ((op == '<') || (op == PARS_LE_TOKEN))) {
+
+ return(OPT_END_COND);
+ }
+
+ if (!sel_node->asc && ((op == '>') || (op == PARS_GE_TOKEN))) {
+
+ return(OPT_END_COND);
+ }
+ }
+
+ /* Otherwise, cond is classified as OPT_TEST_COND */
+
+ return(OPT_TEST_COND);
+}
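+
+/* Illustrative example: with an ascending scan on an index (a, b) and
+a search tuple built from a = 5 only, the conjunct a = 5 is
+OPT_END_COND (already enforced by the tuple), b < 10 is also
+OPT_END_COND (it limits the range from the far end of the scan), and a
+condition on some other column, say c > 0, is OPT_TEST_COND. */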
+
+/*******************************************************************//**
+Recursively looks for test conditions for a table in a join. */
+static
+void
+opt_find_test_conds(
+/*================*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i, /*!< in: ith table in the join */
+ func_node_t* cond) /*!< in: conjunction of search
+ conditions or NULL */
+{
+ func_node_t* new_cond;
+ ulint fclass;
+ plan_t* plan;
+
+ if (cond == NULL) {
+
+ return;
+ }
+
+ if (cond->func == PARS_AND_TOKEN) {
+ new_cond = static_cast<func_node_t*>(cond->args);
+
+ opt_find_test_conds(sel_node, i, new_cond);
+
+ new_cond = static_cast<func_node_t*>(
+ que_node_get_next(new_cond));
+
+ opt_find_test_conds(sel_node, i, new_cond);
+
+ return;
+ }
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ fclass = opt_classify_comparison(sel_node, i, cond);
+
+ if (fclass == OPT_END_COND) {
+ UT_LIST_ADD_LAST(plan->end_conds, cond);
+
+ } else if (fclass == OPT_TEST_COND) {
+ UT_LIST_ADD_LAST(plan->other_conds, cond);
+
+ }
+}
+
+/*******************************************************************//**
+Normalizes a list of comparison conditions so that a column of the table
+appears on the left side of the comparison if possible. This is accomplished
+by switching the arguments of the operator. */
+static
+void
+opt_normalize_cmp_conds(
+/*====================*/
+ func_node_t* cond, /*!< in: first in a list of comparison
+ conditions, or NULL */
+ dict_table_t* table) /*!< in: table */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ sym_node_t* sym_node;
+
+ while (cond) {
+ arg1 = cond->args;
+ arg2 = que_node_get_next(arg1);
+
+ if (que_node_get_type(arg2) == QUE_NODE_SYMBOL) {
+
+ sym_node = static_cast<sym_node_t*>(arg2);
+
+ if ((sym_node->token_type == SYM_COLUMN)
+ && (sym_node->table == table)) {
+
+ /* Switch the order of the arguments */
+
+ cond->args = arg2;
+ que_node_list_add_last(NULL, arg2);
+ que_node_list_add_last(arg2, arg1);
+
+ /* Invert the operator */
+ cond->func = opt_invert_cmp_op(cond->func);
+ }
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+}
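+
+/* Illustrative example: the condition 5 < t.a is rewritten as
+t.a > 5: the arguments are switched and the operator inverted, so that
+later code can assume the column is on the left-hand side. */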
+
+/*******************************************************************//**
+Finds out which search condition conjuncts we can, and need to, test as
+the ith table in a join is accessed. The search tuple can eliminate the
+need to test some conjuncts. */
+static
+void
+opt_determine_and_normalize_test_conds(
+/*===================================*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i) /*!< in: ith table in the join */
+{
+ plan_t* plan;
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ UT_LIST_INIT(plan->end_conds, &func_node_t::cond_list);
+ UT_LIST_INIT(plan->other_conds, &func_node_t::cond_list);
+
+ /* Recursively go through the conjuncts and classify them */
+
+ opt_find_test_conds(
+ sel_node,
+ i,
+ static_cast<func_node_t*>(sel_node->search_cond));
+
+ opt_normalize_cmp_conds(UT_LIST_GET_FIRST(plan->end_conds),
+ plan->table);
+
+ ut_a(UT_LIST_GET_LEN(plan->end_conds) >= plan->n_exact_match);
+}
+
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph
+and adds them to the column list if they are not already there. If a
+column is already in the list, the new occurrence is given a value
+indirection pointing to the listed occurrence; but if the occurrence we
+are looking at is itself the one in the list, nothing is done. */
+void
+opt_find_all_cols(
+/*==============*/
+ ibool copy_val, /*!< in: if TRUE, new found columns are
+ added as columns to copy */
+ dict_index_t* index, /*!< in: index of the table to use */
+ sym_node_list_t* col_list, /*!< in: base node of a list where
+ to add new found columns */
+ plan_t* plan, /*!< in: plan or NULL */
+ que_node_t* exp) /*!< in: expression or condition or
+ NULL */
+{
+ func_node_t* func_node;
+ que_node_t* arg;
+ sym_node_t* sym_node;
+ sym_node_t* col_node;
+ ulint col_pos;
+
+ if (exp == NULL) {
+
+ return;
+ }
+
+ if (que_node_get_type(exp) == QUE_NODE_FUNC) {
+ func_node = static_cast<func_node_t*>(exp);
+
+ for (arg = func_node->args;
+ arg != 0;
+ arg = que_node_get_next(arg)) {
+
+ opt_find_all_cols(
+ copy_val, index, col_list, plan, arg);
+ }
+
+ return;
+ }
+
+ ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
+
+ sym_node = static_cast<sym_node_t*>(exp);
+
+ if (sym_node->token_type != SYM_COLUMN) {
+
+ return;
+ }
+
+ if (sym_node->table != index->table) {
+
+ return;
+ }
+
+ /* Look for an occurrence of the same column in the plan column
+ list */
+
+ col_node = UT_LIST_GET_FIRST(*col_list);
+
+ while (col_node) {
+ if (col_node->col_no == sym_node->col_no) {
+
+ if (col_node == sym_node) {
+ /* sym_node was already in a list: do
+ nothing */
+
+ return;
+ }
+
+ /* Put an indirection */
+ sym_node->indirection = col_node;
+ sym_node->alias = col_node;
+
+ return;
+ }
+
+ col_node = UT_LIST_GET_NEXT(col_var_list, col_node);
+ }
+
+ /* The same column did not occur in the list: add it */
+
+ UT_LIST_ADD_LAST(*col_list, sym_node);
+
+ sym_node->copy_val = copy_val;
+
+ /* Fill in the field_no fields in sym_node */
+
+ sym_node->field_nos[SYM_CLUST_FIELD_NO] = dict_index_get_nth_col_pos(
+ dict_table_get_first_index(index->table), sym_node->col_no,
+ NULL);
+ if (!dict_index_is_clust(index)) {
+
+ ut_a(plan);
+
+ col_pos = dict_index_get_nth_col_pos(index, sym_node->col_no,
+ NULL);
+
+ if (col_pos == ULINT_UNDEFINED) {
+
+ plan->must_get_clust = TRUE;
+ }
+
+ sym_node->field_nos[SYM_SEC_FIELD_NO] = col_pos;
+ }
+}
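+
+/* Illustrative example: if t.a occurs both in the select list and in a
+condition that is classified for later testing, only the first
+occurrence is added to the column list; the second occurrence gets its
+indirection (and alias) pointer set to the first, so the column value
+is fetched and stored only once. */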
+
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in conditions which are
+not yet determined AFTER the join operation has fetched a row in the ith
+table. The values of these columns must be copied to dynamic memory for
+later use. */
+static
+void
+opt_find_copy_cols(
+/*===============*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i, /*!< in: ith table in the join */
+ func_node_t* search_cond) /*!< in: search condition or NULL */
+{
+ func_node_t* new_cond;
+ plan_t* plan;
+
+ if (search_cond == NULL) {
+
+ return;
+ }
+
+ ut_ad(que_node_get_type(search_cond) == QUE_NODE_FUNC);
+
+ if (search_cond->func == PARS_AND_TOKEN) {
+ new_cond = static_cast<func_node_t*>(search_cond->args);
+
+ opt_find_copy_cols(sel_node, i, new_cond);
+
+ new_cond = static_cast<func_node_t*>(
+ que_node_get_next(new_cond));
+
+ opt_find_copy_cols(sel_node, i, new_cond);
+
+ return;
+ }
+
+ if (!opt_check_exp_determined_before(search_cond, sel_node, i + 1)) {
+
+		/* Any columns of the ith table occurring in search_cond
+		must be copied, as this condition cannot yet be tested on
+		the fetch from the ith table */
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ opt_find_all_cols(TRUE, plan->index, &(plan->columns), plan,
+ search_cond);
+ }
+}
+
+/*******************************************************************//**
+Classifies the table columns according to whether we use the column only
+while holding the latch on the page, or whether we have to copy the column
+value to dynamic memory. Puts the first occurrence of a column into the
+appropriate list in the plan node, and makes later occurrences of the
+column indirections to the first. */
+static
+void
+opt_classify_cols(
+/*==============*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i) /*!< in: ith table in the join */
+{
+ plan_t* plan;
+ que_node_t* exp;
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ /* The final value of the following field will depend on the
+ environment of the select statement: */
+
+ plan->must_get_clust = FALSE;
+
+ UT_LIST_INIT(plan->columns, &sym_node_t::col_var_list);
+
+ /* All select list columns should be copied: therefore TRUE as the
+ first argument */
+
+ for (exp = sel_node->select_list;
+ exp != 0;
+ exp = que_node_get_next(exp)) {
+
+ opt_find_all_cols(
+ TRUE, plan->index, &(plan->columns), plan, exp);
+ }
+
+ opt_find_copy_cols(
+ sel_node, i, static_cast<func_node_t*>(sel_node->search_cond));
+
+ /* All remaining columns in the search condition are temporary
+ columns: therefore FALSE */
+
+ opt_find_all_cols(
+ FALSE, plan->index, &plan->columns, plan,
+ static_cast<func_node_t*>(sel_node->search_cond));
+}
+
+/*******************************************************************//**
+Fills in the info in plan which is used in accessing a clustered index
+record. The columns must already be classified for the plan node. */
+static
+void
+opt_clust_access(
+/*=============*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint n) /*!< in: nth table in select */
+{
+ plan_t* plan;
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dict_index_t* index;
+ mem_heap_t* heap;
+ ulint n_fields;
+ ulint pos;
+ ulint i;
+
+ plan = sel_node_get_nth_plan(sel_node, n);
+
+ index = plan->index;
+
+ /* The final value of the following field depends on the environment
+ of the select statement: */
+
+ plan->no_prefetch = FALSE;
+
+ if (dict_index_is_clust(index)) {
+ plan->clust_map = NULL;
+ plan->clust_ref = NULL;
+
+ return;
+ }
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ n_fields = dict_index_get_n_unique(clust_index);
+
+ heap = pars_sym_tab_global->heap;
+
+ plan->clust_ref = dtuple_create(heap, n_fields);
+
+ dict_index_copy_types(plan->clust_ref, clust_index, n_fields);
+
+ plan->clust_map = static_cast<ulint*>(
+ mem_heap_alloc(heap, n_fields * sizeof(ulint)));
+
+ for (i = 0; i < n_fields; i++) {
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ /* We optimize here only queries to InnoDB's internal system
+ tables, and they should not contain column prefix indexes. */
+
+ if (dict_is_sys_table(index->table->id)
+ && (dict_index_get_nth_field(index, pos)->prefix_len != 0
+ || dict_index_get_nth_field(clust_index, i)
+ ->prefix_len != 0)) {
+ ib::error() << "Error in pars0opt.cc: table "
+ << index->table->name
+ << " has prefix_len != 0";
+ }
+
+ *(plan->clust_map + i) = pos;
+
+ ut_ad(pos != ULINT_UNDEFINED);
+ }
+}
+
+#ifdef UNIV_SQL_DEBUG
+/** Print info of a query plan.
+@param[in,out] sel_node select node */
+static
+void
+opt_print_query_plan(
+ sel_node_t* sel_node);
+#endif
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes to use for the tables. The
+tables are accessed in the order in which they were written to the FROM
+part of the select statement. */
+void
+opt_search_plan(
+/*============*/
+ sel_node_t* sel_node) /*!< in: parsed select node */
+{
+ sym_node_t* table_node;
+ dict_table_t* table;
+ order_node_t* order_by;
+ ulint i;
+
+ sel_node->plans = static_cast<plan_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap,
+ sel_node->n_tables * sizeof(plan_t)));
+
+ /* Analyze the search condition to find out what we know at each
+ join stage about the conditions that the columns of a table should
+ satisfy */
+
+ table_node = sel_node->table_list;
+
+ if (sel_node->order_by == NULL) {
+ sel_node->asc = TRUE;
+ } else {
+ order_by = sel_node->order_by;
+
+ sel_node->asc = order_by->asc;
+ }
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+
+ table = table_node->table;
+
+ /* Choose index through which to access the table */
+
+ opt_search_plan_for_table(sel_node, i, table);
+
+ /* Determine the search condition conjuncts we can test at
+ this table; normalize the end conditions */
+
+ opt_determine_and_normalize_test_conds(sel_node, i);
+
+ table_node = static_cast<sym_node_t*>(
+ que_node_get_next(table_node));
+ }
+
+ table_node = sel_node->table_list;
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+
+ /* Classify the table columns into those we only need to access
+ but not copy, and to those we must copy to dynamic memory */
+
+ opt_classify_cols(sel_node, i);
+
+ /* Calculate possible info for accessing the clustered index
+ record */
+
+ opt_clust_access(sel_node, i);
+
+ table_node = static_cast<sym_node_t*>(
+ que_node_get_next(table_node));
+ }
+
+ /* Check that the plan obeys a possible order-by clause: if not,
+ an assertion error occurs */
+
+ opt_check_order_by(sel_node);
+
+#ifdef UNIV_SQL_DEBUG
+ opt_print_query_plan(sel_node);
+#endif
+}
+
+#ifdef UNIV_SQL_DEBUG
+/** Print info of a query plan.
+@param[in,out] sel_node select node */
+static
+void
+opt_print_query_plan(
+ sel_node_t* sel_node)
+{
+ plan_t* plan;
+ ulint n_fields;
+ ulint i;
+
+ fputs("QUERY PLAN FOR A SELECT NODE\n", stderr);
+
+ fputs(sel_node->asc ? "Asc. search; " : "Desc. search; ", stderr);
+
+ if (sel_node->set_x_locks) {
+ fputs("sets row x-locks; ", stderr);
+ ut_a(sel_node->row_lock_mode == LOCK_X);
+ ut_a(!sel_node->consistent_read);
+ } else if (sel_node->consistent_read) {
+ fputs("consistent read; ", stderr);
+ } else {
+ ut_a(sel_node->row_lock_mode == LOCK_S);
+ fputs("sets row s-locks; ", stderr);
+ }
+
+ putc('\n', stderr);
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+ } else {
+ n_fields = 0;
+ }
+
+ fprintf(stderr,
+ "Index %s of table %s"
+ "; exact m. %lu, match %lu, end conds %lu\n",
+ plan->index->name(), plan->index->table->name.m_name,
+ (unsigned long) plan->n_exact_match,
+ (unsigned long) n_fields,
+ (unsigned long) UT_LIST_GET_LEN(plan->end_conds));
+ }
+}
+#endif /* UNIV_SQL_DEBUG */
diff --git a/storage/innobase/pars/pars0pars.cc b/storage/innobase/pars/pars0pars.cc
new file mode 100644
index 00000000..b955b94b
--- /dev/null
+++ b/storage/innobase/pars/pars0pars.cc
@@ -0,0 +1,2413 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St,
+Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file pars/pars0pars.cc
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+/* Historical note: Innobase executed its first SQL string (CREATE TABLE)
+on 1/27/1998 */
+
+#include "pars0pars.h"
+#include "row0sel.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "dict0crea.h"
+#include "que0que.h"
+#include "pars0grm.h"
+#include "pars0opt.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "eval0eval.h"
+
+/* Global variable used while parsing a single procedure or query: the
+code is NOT re-entrant */
+sym_tab_t* pars_sym_tab_global;
+
+/* Global variables used to denote certain reserved words, used in
+constructing the parsing tree */
+
+pars_res_word_t pars_to_binary_token = {PARS_TO_BINARY_TOKEN};
+pars_res_word_t pars_substr_token = {PARS_SUBSTR_TOKEN};
+pars_res_word_t pars_concat_token = {PARS_CONCAT_TOKEN};
+pars_res_word_t pars_instr_token = {PARS_INSTR_TOKEN};
+pars_res_word_t pars_length_token = {PARS_LENGTH_TOKEN};
+pars_res_word_t pars_count_token = {PARS_COUNT_TOKEN};
+pars_res_word_t pars_int_token = {PARS_INT_TOKEN};
+pars_res_word_t pars_bigint_token = {PARS_BIGINT_TOKEN};
+pars_res_word_t pars_char_token = {PARS_CHAR_TOKEN};
+pars_res_word_t pars_update_token = {PARS_UPDATE_TOKEN};
+pars_res_word_t pars_asc_token = {PARS_ASC_TOKEN};
+pars_res_word_t pars_desc_token = {PARS_DESC_TOKEN};
+pars_res_word_t pars_open_token = {PARS_OPEN_TOKEN};
+pars_res_word_t pars_close_token = {PARS_CLOSE_TOKEN};
+pars_res_word_t pars_share_token = {PARS_SHARE_TOKEN};
+pars_res_word_t pars_unique_token = {PARS_UNIQUE_TOKEN};
+pars_res_word_t pars_clustered_token = {PARS_CLUSTERED_TOKEN};
+
+/** Global variable used to denote the '*' in SELECT * FROM.. */
+ulint pars_star_denoter = 12345678;
+
+/********************************************************************
+Get user function with the given name.*/
+UNIV_INLINE
+pars_user_func_t*
+pars_info_lookup_user_func(
+/*=======================*/
+ /* out: user func, or NULL if not
+ found */
+ pars_info_t* info, /* in: info struct */
+	const char*	name)	/* in: function name to find */
+{
+ if (info && info->funcs) {
+ ulint i;
+ ib_vector_t* vec = info->funcs;
+
+ for (i = 0; i < ib_vector_size(vec); i++) {
+ pars_user_func_t* puf;
+
+ puf = static_cast<pars_user_func_t*>(
+ ib_vector_get(vec, i));
+
+ if (strcmp(puf->name, name) == 0) {
+ return(puf);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/********************************************************************
+Get bound identifier with the given name.*/
+UNIV_INLINE
+pars_bound_id_t*
+pars_info_lookup_bound_id(
+/*======================*/
+ /* out: bound literal, or NULL if
+ not found */
+ pars_info_t* info, /* in: info struct */
+ const char* name) /* in: bound literal name to find */
+{
+ if (info && info->bound_ids) {
+ ulint i;
+ ib_vector_t* vec = info->bound_ids;
+
+ for (i = 0; i < ib_vector_size(vec); i++) {
+ pars_bound_id_t* bid;
+
+ bid = static_cast<pars_bound_id_t*>(
+ ib_vector_get(vec, i));
+
+ if (strcmp(bid->name, name) == 0) {
+ return(bid);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/********************************************************************
+Get bound literal with the given name.*/
+UNIV_INLINE
+pars_bound_lit_t*
+pars_info_lookup_bound_lit(
+/*=======================*/
+ /* out: bound literal, or NULL if
+ not found */
+ pars_info_t* info, /* in: info struct */
+ const char* name) /* in: bound literal name to find */
+{
+ if (info && info->bound_lits) {
+ ulint i;
+ ib_vector_t* vec = info->bound_lits;
+
+ for (i = 0; i < ib_vector_size(vec); i++) {
+ pars_bound_lit_t* pbl;
+
+ pbl = static_cast<pars_bound_lit_t*>(
+ ib_vector_get(vec, i));
+
+ if (strcmp(pbl->name, name) == 0) {
+ return(pbl);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Determines the class of a function code.
+@return function class: PARS_FUNC_ARITH, ... */
+static
+ulint
+pars_func_get_class(
+/*================*/
+ int func) /*!< in: function code: '=', PARS_GE_TOKEN, ... */
+{
+ switch (func) {
+ case '+': case '-': case '*': case '/':
+ return(PARS_FUNC_ARITH);
+
+ case '=': case '<': case '>':
+ case PARS_GE_TOKEN: case PARS_LE_TOKEN: case PARS_NE_TOKEN:
+ return(PARS_FUNC_CMP);
+
+ case PARS_AND_TOKEN: case PARS_OR_TOKEN: case PARS_NOT_TOKEN:
+ return(PARS_FUNC_LOGICAL);
+
+ case PARS_COUNT_TOKEN:
+ return(PARS_FUNC_AGGREGATE);
+
+ case PARS_TO_BINARY_TOKEN:
+ case PARS_SUBSTR_TOKEN:
+ case PARS_CONCAT_TOKEN:
+ case PARS_LENGTH_TOKEN:
+ case PARS_INSTR_TOKEN:
+ case PARS_NOTFOUND_TOKEN:
+ return(PARS_FUNC_PREDEFINED);
+
+ default:
+ return(PARS_FUNC_OTHER);
+ }
+}
+
+/*********************************************************************//**
+Parses an operator or predefined function expression.
+@return own: function node in a query tree */
+static
+func_node_t*
+pars_func_low(
+/*==========*/
+ int func, /*!< in: function token code */
+ que_node_t* arg) /*!< in: first argument in the argument list */
+{
+ func_node_t* node;
+
+ node = static_cast<func_node_t*>(
+ mem_heap_alloc(pars_sym_tab_global->heap, sizeof(func_node_t)));
+
+ node->common.type = QUE_NODE_FUNC;
+ dfield_set_data(&(node->common.val), NULL, 0);
+ node->common.val_buf_size = 0;
+
+ node->func = func;
+
+ node->fclass = pars_func_get_class(func);
+
+ node->args = arg;
+
+ UT_LIST_ADD_LAST(pars_sym_tab_global->func_node_list, node);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a function expression.
+@return own: function node in a query tree */
+func_node_t*
+pars_func(
+/*======*/
+ que_node_t* res_word,/*!< in: function name reserved word */
+ que_node_t* arg) /*!< in: first argument in the argument list */
+{
+ return(pars_func_low(((pars_res_word_t*) res_word)->code, arg));
+}
+
+/*************************************************************************
+Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded
+within the search string. */
+int
+pars_like_rebind(
+/*=============*/
+			/* out: LIKE token code, e.g.
+			PARS_LIKE_TOKEN_PREFIX */
+ sym_node_t* node, /* in: The search string node.*/
+ const byte* ptr, /* in: literal to (re) bind */
+ ulint ptr_len)/* in: length of literal to (re) bind*/
+{
+ dtype_t* dtype;
+ dfield_t* dfield;
+ ib_like_t op_check;
+ sym_node_t* like_node;
+ sym_node_t* str_node = NULL;
+ ib_like_t op = IB_LIKE_EXACT;
+ int func = PARS_LIKE_TOKEN_EXACT;
+
+ /* Is this a STRING% ? */
+ if (ptr[ptr_len - 1] == '%') {
+ op = IB_LIKE_PREFIX;
+ }
+
+	/* A leading '%' (as in '%STRING' or '%STRING%') is not
+	supported here */
+	ut_ad(*ptr != '%');
+
+ if (node->like_node == NULL) {
+ /* Add the LIKE operator info node to the node list.
+ This will be used during the comparison phase to determine
+ how to match.*/
+ like_node = sym_tab_add_int_lit(node->sym_table, op);
+ que_node_list_add_last(NULL, like_node);
+ node->like_node = like_node;
+ str_node = sym_tab_add_str_lit(node->sym_table, ptr, ptr_len);
+ que_node_list_add_last(like_node, str_node);
+ } else {
+ like_node = node->like_node;
+
+ /* Change the value of the string in the existing
+ string node of like node */
+ str_node = static_cast<sym_node_t*>(
+ que_node_list_get_last(like_node));
+
+ /* Must find the string node */
+ ut_a(str_node);
+ ut_a(str_node != like_node);
+ ut_a(str_node->token_type == SYM_LIT);
+
+ dfield = que_node_get_val(str_node);
+ dfield_set_data(dfield, ptr, ptr_len);
+ }
+
+ dfield = que_node_get_val(like_node);
+ dtype = dfield_get_type(dfield);
+
+ ut_a(dtype_get_mtype(dtype) == DATA_INT);
+ op_check = static_cast<ib_like_t>(
+ mach_read_from_4(static_cast<byte*>(dfield_get_data(dfield))));
+
+ switch (op_check) {
+ case IB_LIKE_PREFIX:
+ case IB_LIKE_EXACT:
+ break;
+
+ default:
+ ut_error;
+ }
+
+ mach_write_to_4(static_cast<byte*>(dfield_get_data(dfield)), op);
+
+ dfield = que_node_get_val(node);
+
+	/* Adjust the length of the search value so the '%' is not
+	visible, then create and add a search string node to the
+	search value node. For PREFIX% we simply remove the trailing
+	'%'. (Searching for %SUFFIX or %SUBSTR% would require a full
+	table scan; such patterns are rejected above.) */
+
+ switch (op) {
+ case IB_LIKE_EXACT:
+ dfield = que_node_get_val(str_node);
+ dtype = dfield_get_type(dfield);
+
+ ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+ dfield_set_data(dfield, ptr, ptr_len);
+ break;
+
+ case IB_LIKE_PREFIX:
+ func = PARS_LIKE_TOKEN_PREFIX;
+
+ /* Modify the original node */
+ dfield_set_len(dfield, ptr_len - 1);
+
+ dfield = que_node_get_val(str_node);
+ dtype = dfield_get_type(dfield);
+
+ ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+ dfield_set_data(dfield, ptr, ptr_len - 1);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ return(func);
+}
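+
+/* Illustrative example: rebinding the pattern abc% sets the LIKE info
+node to IB_LIKE_PREFIX, trims the bound search value to abc, and
+returns PARS_LIKE_TOKEN_PREFIX; the plain pattern abc stays
+IB_LIKE_EXACT. A leading '%' is rejected by the assertion above. */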
+
+/*************************************************************************
+Parses a LIKE operator expression. */
+static
+int
+pars_like_op(
+/*=========*/
+			/* out: LIKE token code, e.g.
+			PARS_LIKE_TOKEN_EXACT */
+ que_node_t* arg) /* in: LIKE comparison string.*/
+{
+ char* ptr;
+ ulint ptr_len;
+ int func = PARS_LIKE_TOKEN_EXACT;
+ dfield_t* dfield = que_node_get_val(arg);
+ dtype_t* dtype = dfield_get_type(dfield);
+
+ ut_a(dtype_get_mtype(dtype) == DATA_CHAR
+ || dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+ ptr = static_cast<char*>(dfield_get_data(dfield));
+ ptr_len = strlen(ptr);
+
+ if (ptr_len) {
+
+ func = pars_like_rebind(
+ static_cast<sym_node_t*>(arg), (byte*) ptr, ptr_len);
+ }
+
+ return(func);
+}
+/*********************************************************************//**
+Parses an operator expression.
+@return own: function node in a query tree */
+func_node_t*
+pars_op(
+/*====*/
+ int func, /*!< in: operator token code */
+ que_node_t* arg1, /*!< in: first argument */
+	que_node_t*	arg2)	/*!< in: second argument or NULL for a unary
+				operator */
+{
+ que_node_list_add_last(NULL, arg1);
+
+ if (arg2) {
+ que_node_list_add_last(arg1, arg2);
+ }
+
+ /* We need to parse the string and determine whether it's a
+ PREFIX, SUFFIX or SUBSTRING comparison */
+ if (func == PARS_LIKE_TOKEN) {
+
+ ut_a(que_node_get_type(arg2) == QUE_NODE_SYMBOL);
+
+ func = pars_like_op(arg2);
+
+ ut_a(func == PARS_LIKE_TOKEN_EXACT
+ || func == PARS_LIKE_TOKEN_PREFIX
+ || func == PARS_LIKE_TOKEN_SUFFIX
+ || func == PARS_LIKE_TOKEN_SUBSTR);
+ }
+
+ return(pars_func_low(func, arg1));
+}
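+
+/* Illustrative example: pars_op('=', col, lit) links lit after col in
+the argument list and returns a function node of class PARS_FUNC_CMP;
+a unary operator is built as pars_op(PARS_NOT_TOKEN, cond, NULL). */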
+
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+@return own: order-by node in a query tree */
+order_node_t*
+pars_order_by(
+/*==========*/
+ sym_node_t* column, /*!< in: column name */
+ pars_res_word_t* asc) /*!< in: &pars_asc_token or pars_desc_token */
+{
+ order_node_t* node;
+
+ node = static_cast<order_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(order_node_t)));
+
+ node->common.type = QUE_NODE_ORDER;
+
+ node->column = column;
+
+ if (asc == &pars_asc_token) {
+ node->asc = TRUE;
+ } else {
+ ut_a(asc == &pars_desc_token);
+ node->asc = FALSE;
+ }
+
+ return(node);
+}
+
+/*********************************************************************//**
+Determine if a data type is a built-in string data type of the InnoDB
+SQL parser.
+@return TRUE if string data type */
+static
+ibool
+pars_is_string_type(
+/*================*/
+ ulint mtype) /*!< in: main data type */
+{
+ switch (mtype) {
+ case DATA_VARCHAR: case DATA_CHAR:
+ case DATA_FIXBINARY: case DATA_BINARY:
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Resolves the data type of a function in an expression. The argument data
+types must already be resolved. */
+static
+void
+pars_resolve_func_data_type(
+/*========================*/
+ func_node_t* node) /*!< in: function node */
+{
+ que_node_t* arg;
+
+ ut_a(que_node_get_type(node) == QUE_NODE_FUNC);
+
+ arg = node->args;
+
+ switch (node->func) {
+ case '+': case '-': case '*': case '/':
+ /* Inherit the data type from the first argument (which must
+ not be the SQL null literal whose type is DATA_ERROR) */
+
+ dtype_copy(que_node_get_data_type(node),
+ que_node_get_data_type(arg));
+
+ ut_a(dtype_get_mtype(que_node_get_data_type(node))
+ == DATA_INT);
+ break;
+
+ case PARS_COUNT_TOKEN:
+ ut_a(arg);
+ dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
+ break;
+
+ case PARS_TO_BINARY_TOKEN:
+ if (dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT) {
+ dtype_set(que_node_get_data_type(node), DATA_VARCHAR,
+ DATA_ENGLISH, 0);
+ } else {
+ dtype_set(que_node_get_data_type(node), DATA_BINARY,
+ 0, 0);
+ }
+ break;
+
+ case PARS_LENGTH_TOKEN:
+ case PARS_INSTR_TOKEN:
+ ut_a(pars_is_string_type(que_node_get_data_type(arg)->mtype));
+ dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
+ break;
+
+ case PARS_SUBSTR_TOKEN:
+ case PARS_CONCAT_TOKEN:
+ ut_a(pars_is_string_type(que_node_get_data_type(arg)->mtype));
+ dtype_set(que_node_get_data_type(node), DATA_VARCHAR,
+ DATA_ENGLISH, 0);
+ break;
+
+ case '>': case '<': case '=':
+ case PARS_GE_TOKEN:
+ case PARS_LE_TOKEN:
+ case PARS_NE_TOKEN:
+ case PARS_AND_TOKEN:
+ case PARS_OR_TOKEN:
+ case PARS_NOT_TOKEN:
+ case PARS_NOTFOUND_TOKEN:
+
+ /* We have no boolean type: use the integer type */
+ dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
+ break;
+
+ case PARS_LIKE_TOKEN_EXACT:
+ case PARS_LIKE_TOKEN_PREFIX:
+ case PARS_LIKE_TOKEN_SUFFIX:
+ case PARS_LIKE_TOKEN_SUBSTR:
+ dtype_set(que_node_get_data_type(node), DATA_VARCHAR,
+ DATA_ENGLISH, 0);
+ break;
+
+ default:
+ ut_error;
+ }
+}
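+
+/* NOTE (illustrative): e.g. in "a + b" the '+' node inherits DATA_INT
+from "a"; comparison and logical operators such as '>' or AND also get
+DATA_INT, since there is no boolean type; TO_BINARY of an integer
+argument yields DATA_VARCHAR, otherwise DATA_BINARY. */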
+
+/*********************************************************************//**
+Resolves the meaning of variables in an expression and the data types of
+functions. It is an error if some identifier cannot be resolved here. */
+static
+void
+pars_resolve_exp_variables_and_types(
+/*=================================*/
+ sel_node_t* select_node, /*!< in: select node or NULL; if
+ this is not NULL then the variable
+ sym nodes are added to the
+ copy_variables list of select_node */
+ que_node_t* exp_node) /*!< in: expression */
+{
+ func_node_t* func_node;
+ que_node_t* arg;
+ sym_node_t* sym_node;
+ sym_node_t* node;
+
+ ut_a(exp_node);
+
+ if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+ func_node = static_cast<func_node_t*>(exp_node);
+
+ arg = func_node->args;
+
+ while (arg) {
+ pars_resolve_exp_variables_and_types(select_node, arg);
+
+ arg = que_node_get_next(arg);
+ }
+
+ pars_resolve_func_data_type(func_node);
+
+ return;
+ }
+
+ ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL);
+
+ sym_node = static_cast<sym_node_t*>(exp_node);
+
+ if (sym_node->resolved) {
+
+ return;
+ }
+
+ /* Not resolved yet: look in the symbol table for a variable
+ or a cursor or a function with the same name */
+
+ node = UT_LIST_GET_FIRST(pars_sym_tab_global->sym_list);
+
+ while (node) {
+ if (node->resolved
+ && ((node->token_type == SYM_VAR)
+ || (node->token_type == SYM_CURSOR)
+ || (node->token_type == SYM_FUNCTION))
+ && node->name
+ && sym_node->name_len == node->name_len
+ && !memcmp(sym_node->name, node->name, node->name_len)) {
+
+ /* Found a variable or a cursor declared with
+ the same name */
+
+ break;
+ }
+
+ node = UT_LIST_GET_NEXT(sym_list, node);
+ }
+
+ if (!node) {
+ fprintf(stderr, "PARSER ERROR: Unresolved identifier %s\n",
+ sym_node->name);
+ }
+
+ ut_a(node);
+
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_IMPLICIT_VAR;
+ sym_node->alias = node;
+ sym_node->indirection = node;
+
+ if (select_node) {
+ UT_LIST_ADD_LAST(select_node->copy_variables, sym_node);
+ }
+
+ dfield_set_type(que_node_get_val(sym_node),
+ que_node_get_data_type(node));
+}
+
+/*********************************************************************//**
+Resolves the meaning of variables in an expression list. It is an error if
+some identifier cannot be resolved here. Resolves also the data types of
+functions. */
+static
+void
+pars_resolve_exp_list_variables_and_types(
+/*======================================*/
+ sel_node_t* select_node, /*!< in: select node or NULL */
+ que_node_t* exp_node) /*!< in: expression list first node, or
+ NULL */
+{
+ while (exp_node) {
+ pars_resolve_exp_variables_and_types(select_node, exp_node);
+
+ exp_node = que_node_get_next(exp_node);
+ }
+}
+
+/*********************************************************************//**
+Resolves the columns in an expression. */
+static
+void
+pars_resolve_exp_columns(
+/*=====================*/
+ sym_node_t* table_node, /*!< in: first node in a table list */
+ que_node_t* exp_node) /*!< in: expression */
+{
+ func_node_t* func_node;
+ que_node_t* arg;
+ sym_node_t* sym_node;
+ dict_table_t* table;
+ sym_node_t* t_node;
+ ulint n_cols;
+ ulint i;
+
+ ut_a(exp_node);
+
+ if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+ func_node = static_cast<func_node_t*>(exp_node);
+
+ arg = func_node->args;
+
+ while (arg) {
+ pars_resolve_exp_columns(table_node, arg);
+
+ arg = que_node_get_next(arg);
+ }
+
+ return;
+ }
+
+ ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL);
+
+ sym_node = static_cast<sym_node_t*>(exp_node);
+
+ if (sym_node->resolved) {
+
+ return;
+ }
+
+ /* Not resolved yet: look in the table list for a column with the
+ same name */
+
+ t_node = table_node;
+
+ while (t_node) {
+ table = t_node->table;
+
+ n_cols = dict_table_get_n_cols(table);
+
+ for (i = 0; i < n_cols; i++) {
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, i);
+ const char* col_name
+ = dict_table_get_col_name(table, i);
+
+ if (sym_node->name_len == strlen(col_name)
+ && !memcmp(sym_node->name, col_name,
+ sym_node->name_len)) {
+ /* Found */
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_COLUMN;
+ sym_node->table = table;
+ sym_node->col_no = i;
+ sym_node->prefetch_buf = NULL;
+
+ dict_col_copy_type(
+ col,
+ dfield_get_type(&sym_node
+ ->common.val));
+
+ return;
+ }
+ }
+
+ t_node = static_cast<sym_node_t*>(que_node_get_next(t_node));
+ }
+}
+
+/*********************************************************************//**
+Resolves the meaning of columns in an expression list. */
+static
+void
+pars_resolve_exp_list_columns(
+/*==========================*/
+ sym_node_t* table_node, /*!< in: first node in a table list */
+ que_node_t* exp_node) /*!< in: expression list first node, or
+ NULL */
+{
+ while (exp_node) {
+ pars_resolve_exp_columns(table_node, exp_node);
+
+ exp_node = que_node_get_next(exp_node);
+ }
+}
+
+/*********************************************************************//**
+Retrieves the table definition for a table name id. */
+static
+void
+pars_retrieve_table_def(
+/*====================*/
+ sym_node_t* sym_node) /*!< in: table node */
+{
+ ut_a(sym_node);
+ ut_a(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
+
+ /* Open the table only if it is not already opened. */
+ if (sym_node->token_type != SYM_TABLE_REF_COUNTED) {
+
+ ut_a(sym_node->table == NULL);
+
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_TABLE_REF_COUNTED;
+
+ sym_node->table = dict_table_open_on_name(
+ sym_node->name, TRUE, FALSE, DICT_ERR_IGNORE_NONE);
+
+ ut_a(sym_node->table != NULL);
+ }
+}
+
+/*********************************************************************//**
+Retrieves the table definitions for a list of table name ids.
+@return number of tables */
+static
+ulint
+pars_retrieve_table_list_defs(
+/*==========================*/
+ sym_node_t* sym_node) /*!< in: first table node in list */
+{
+ ulint count = 0;
+
+ if (sym_node == NULL) {
+
+ return(count);
+ }
+
+ while (sym_node) {
+ pars_retrieve_table_def(sym_node);
+
+ count++;
+
+ sym_node = static_cast<sym_node_t*>(
+ que_node_get_next(sym_node));
+ }
+
+ return(count);
+}
+
+/*********************************************************************//**
+Adds all columns to the select list if the query is SELECT * FROM ... */
+static
+void
+pars_select_all_columns(
+/*====================*/
+ sel_node_t* select_node) /*!< in: select node already containing
+ the table list */
+{
+ sym_node_t* col_node;
+ sym_node_t* table_node;
+ dict_table_t* table;
+ ulint i;
+
+ select_node->select_list = NULL;
+
+ table_node = select_node->table_list;
+
+ while (table_node) {
+ table = table_node->table;
+
+ for (i = 0; i < dict_table_get_n_user_cols(table); i++) {
+ const char* col_name = dict_table_get_col_name(
+ table, i);
+
+ col_node = sym_tab_add_id(pars_sym_tab_global,
+ (byte*) col_name,
+ strlen(col_name));
+
+ select_node->select_list = que_node_list_add_last(
+ select_node->select_list, col_node);
+ }
+
+ table_node = static_cast<sym_node_t*>(
+ que_node_get_next(table_node));
+ }
+}
+
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return own: select node in a query tree */
+sel_node_t*
+pars_select_list(
+/*=============*/
+ que_node_t* select_list, /*!< in: select list */
+ sym_node_t* into_list) /*!< in: variables list or NULL */
+{
+ sel_node_t* node;
+
+ node = sel_node_create(pars_sym_tab_global->heap);
+
+ node->select_list = select_list;
+ node->into_list = into_list;
+
+ pars_resolve_exp_list_variables_and_types(NULL, into_list);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Checks if the query is an aggregate query, in which case the select list must
+contain only aggregate function items. */
+static
+void
+pars_check_aggregate(
+/*=================*/
+ sel_node_t* select_node) /*!< in: select node already containing
+ the select list */
+{
+ que_node_t* exp_node;
+ func_node_t* func_node;
+ ulint n_nodes = 0;
+ ulint n_aggregate_nodes = 0;
+
+ exp_node = select_node->select_list;
+
+ while (exp_node) {
+
+ n_nodes++;
+
+ if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+
+ func_node = static_cast<func_node_t*>(exp_node);
+
+ if (func_node->fclass == PARS_FUNC_AGGREGATE) {
+
+ n_aggregate_nodes++;
+ }
+ }
+
+ exp_node = que_node_get_next(exp_node);
+ }
+
+ if (n_aggregate_nodes > 0) {
+ ut_a(n_nodes == n_aggregate_nodes);
+
+ select_node->is_aggregate = TRUE;
+ } else {
+ select_node->is_aggregate = FALSE;
+ }
+}
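+
+/* NOTE (illustrative sketch, assuming the usual COUNT syntax of the
+InnoDB grammar): under the rule above,
+
+ SELECT COUNT(*) FROM t;    -- aggregate query
+ SELECT c FROM t;           -- ordinary query
+ SELECT COUNT(*), c FROM t; -- rejected by the ut_a() assertion
+
+i.e. aggregate and non-aggregate select list items may not be mixed. */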
+
+/*********************************************************************//**
+Parses a select statement.
+@return own: select node in a query tree */
+sel_node_t*
+pars_select_statement(
+/*==================*/
+ sel_node_t* select_node, /*!< in: select node already containing
+ the select list */
+ sym_node_t* table_list, /*!< in: table list */
+ que_node_t* search_cond, /*!< in: search condition or NULL */
+ pars_res_word_t* for_update, /*!< in: NULL or &pars_update_token */
+ pars_res_word_t* lock_shared, /*!< in: NULL or &pars_share_token */
+ order_node_t* order_by) /*!< in: NULL or an order-by node */
+{
+ select_node->state = SEL_NODE_OPEN;
+
+ select_node->table_list = table_list;
+ select_node->n_tables = pars_retrieve_table_list_defs(table_list);
+
+ if (select_node->select_list == &pars_star_denoter) {
+
+ /* SELECT * FROM ... */
+ pars_select_all_columns(select_node);
+ }
+
+ if (select_node->into_list) {
+ ut_a(que_node_list_get_len(select_node->into_list)
+ == que_node_list_get_len(select_node->select_list));
+ }
+
+ UT_LIST_INIT(select_node->copy_variables, &sym_node_t::col_var_list);
+
+ pars_resolve_exp_list_columns(table_list, select_node->select_list);
+ pars_resolve_exp_list_variables_and_types(select_node,
+ select_node->select_list);
+ pars_check_aggregate(select_node);
+
+ select_node->search_cond = search_cond;
+
+ if (search_cond) {
+ pars_resolve_exp_columns(table_list, search_cond);
+ pars_resolve_exp_variables_and_types(select_node, search_cond);
+ }
+
+ if (for_update) {
+ ut_a(!lock_shared);
+
+ select_node->set_x_locks = TRUE;
+ select_node->row_lock_mode = LOCK_X;
+
+ select_node->consistent_read = FALSE;
+ select_node->read_view = NULL;
+ } else if (lock_shared) {
+ select_node->set_x_locks = FALSE;
+ select_node->row_lock_mode = LOCK_S;
+
+ select_node->consistent_read = FALSE;
+ select_node->read_view = NULL;
+ } else {
+ select_node->set_x_locks = FALSE;
+ select_node->row_lock_mode = LOCK_S;
+
+ select_node->consistent_read = TRUE;
+ }
+
+ select_node->order_by = order_by;
+
+ if (order_by) {
+ pars_resolve_exp_columns(table_list, order_by->column);
+ }
+
+ /* The final values of the following fields depend on the
+ environment where the select statement appears: */
+
+ select_node->can_get_updated = FALSE;
+ select_node->explicit_cursor = NULL;
+
+ opt_search_plan(select_node);
+
+ return(select_node);
+}
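+
+/* NOTE (summary sketch of the locking decisions made above):
+
+ clause       set_x_locks  row_lock_mode  consistent_read
+ FOR UPDATE   TRUE         LOCK_X         FALSE
+ shared lock  FALSE        LOCK_S         FALSE
+ none         FALSE        LOCK_S         TRUE
+
+The clause keywords themselves come from the grammar (pars0grm.y) via
+&pars_update_token and &pars_share_token. */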
+
+/*********************************************************************//**
+Parses a cursor declaration.
+@return sym_node */
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+ sym_node_t* sym_node, /*!< in: cursor id node in the symbol
+ table */
+ sel_node_t* select_node) /*!< in: select node */
+{
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_CURSOR;
+ sym_node->cursor_def = select_node;
+
+ select_node->state = SEL_NODE_CLOSED;
+ select_node->explicit_cursor = sym_node;
+
+ return(sym_node);
+}
+
+/*********************************************************************//**
+Parses a function declaration.
+@return sym_node */
+que_node_t*
+pars_function_declaration(
+/*======================*/
+ sym_node_t* sym_node) /*!< in: function id node in the symbol
+ table */
+{
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_FUNCTION;
+
+ /* Check that the function exists. */
+ ut_a(pars_info_lookup_user_func(
+ pars_sym_tab_global->info, sym_node->name));
+
+ return(sym_node);
+}
+
+/*********************************************************************//**
+Parses a delete or update statement start.
+@return own: update node in a query tree */
+upd_node_t*
+pars_update_statement_start(
+/*========================*/
+ ibool is_delete, /*!< in: TRUE if delete */
+ sym_node_t* table_sym, /*!< in: table name node */
+ col_assign_node_t* col_assign_list)/*!< in: column assignment list, NULL
+ if delete */
+{
+ upd_node_t* node;
+
+ node = upd_node_create(pars_sym_tab_global->heap);
+
+ node->is_delete = is_delete ? PLAIN_DELETE : NO_DELETE;
+
+ node->table_sym = table_sym;
+ node->col_assign_list = col_assign_list;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a column assignment in an update.
+@return column assignment node */
+col_assign_node_t*
+pars_column_assignment(
+/*===================*/
+ sym_node_t* column, /*!< in: column to assign */
+ que_node_t* exp) /*!< in: value to assign */
+{
+ col_assign_node_t* node;
+
+ node = static_cast<col_assign_node_t*>(
+ mem_heap_alloc(pars_sym_tab_global->heap,
+ sizeof(col_assign_node_t)));
+ node->common.type = QUE_NODE_COL_ASSIGNMENT;
+
+ node->col = column;
+ node->val = exp;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Processes an update node assignment list. */
+static
+void
+pars_process_assign_list(
+/*=====================*/
+ upd_node_t* node) /*!< in: update node */
+{
+ col_assign_node_t* col_assign_list;
+ sym_node_t* table_sym;
+ col_assign_node_t* assign_node;
+ upd_field_t* upd_field;
+ dict_index_t* clust_index;
+ sym_node_t* col_sym;
+ ulint changes_ord_field;
+ ulint changes_field_size;
+ ulint n_assigns;
+ ulint i;
+
+ table_sym = node->table_sym;
+ col_assign_list = static_cast<col_assign_node_t*>(
+ node->col_assign_list);
+ clust_index = dict_table_get_first_index(node->table);
+
+ assign_node = col_assign_list;
+ n_assigns = 0;
+
+ while (assign_node) {
+ pars_resolve_exp_columns(table_sym, assign_node->col);
+ pars_resolve_exp_columns(table_sym, assign_node->val);
+ pars_resolve_exp_variables_and_types(NULL, assign_node->val);
+#if 0
+ ut_a(dtype_get_mtype(
+ dfield_get_type(que_node_get_val(
+ assign_node->col)))
+ == dtype_get_mtype(
+ dfield_get_type(que_node_get_val(
+ assign_node->val))));
+#endif
+
+ /* Add to the update node all the columns found in assignment
+ values as columns to copy: therefore, TRUE */
+
+ opt_find_all_cols(TRUE, clust_index, &(node->columns), NULL,
+ assign_node->val);
+ n_assigns++;
+
+ assign_node = static_cast<col_assign_node_t*>(
+ que_node_get_next(assign_node));
+ }
+
+ node->update = upd_create(n_assigns, pars_sym_tab_global->heap);
+
+ assign_node = col_assign_list;
+
+ changes_field_size = UPD_NODE_NO_SIZE_CHANGE;
+
+ for (i = 0; i < n_assigns; i++) {
+ upd_field = upd_get_nth_field(node->update, i);
+
+ col_sym = assign_node->col;
+
+ ulint field_no = dict_index_get_nth_col_pos(
+ clust_index, col_sym->col_no, NULL);
+ ut_ad(field_no < clust_index->n_fields);
+ upd_field_set_field_no(upd_field,
+ static_cast<uint16_t>(field_no),
+ clust_index);
+ upd_field->exp = assign_node->val;
+
+ if (!dict_col_get_fixed_size(
+ dict_index_get_nth_col(clust_index,
+ upd_field->field_no),
+ dict_table_is_comp(node->table))) {
+ changes_field_size = 0;
+ }
+
+ assign_node = static_cast<col_assign_node_t*>(
+ que_node_get_next(assign_node));
+ }
+
+ /* Find out if the update can modify an ordering field in any index */
+
+ changes_ord_field = UPD_NODE_NO_ORD_CHANGE;
+
+ if (row_upd_changes_some_index_ord_field_binary(node->table,
+ node->update)) {
+ changes_ord_field = 0;
+ }
+
+ node->cmpl_info = changes_ord_field | changes_field_size;
+}
+
+/*********************************************************************//**
+Parses an update or delete statement.
+@return own: update node in a query tree */
+upd_node_t*
+pars_update_statement(
+/*==================*/
+ upd_node_t* node, /*!< in: update node */
+ sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in
+ the symbol table or NULL */
+ que_node_t* search_cond) /*!< in: search condition or NULL */
+{
+ sym_node_t* table_sym;
+ sel_node_t* sel_node;
+ plan_t* plan;
+
+ table_sym = node->table_sym;
+
+ pars_retrieve_table_def(table_sym);
+ node->table = table_sym->table;
+
+ UT_LIST_INIT(node->columns, &sym_node_t::col_var_list);
+
+ /* Make the single table node into a list of table nodes of length 1 */
+
+ que_node_list_add_last(NULL, table_sym);
+
+ if (cursor_sym) {
+ pars_resolve_exp_variables_and_types(NULL, cursor_sym);
+
+ sel_node = cursor_sym->alias->cursor_def;
+
+ node->searched_update = FALSE;
+ } else {
+ sel_node = pars_select_list(NULL, NULL);
+
+ pars_select_statement(sel_node, table_sym, search_cond, NULL,
+ &pars_share_token, NULL);
+ node->searched_update = TRUE;
+ sel_node->common.parent = node;
+ }
+
+ node->select = sel_node;
+
+ ut_a(!node->is_delete || (node->col_assign_list == NULL));
+ ut_a(node->is_delete == PLAIN_DELETE || node->col_assign_list != NULL);
+
+ if (node->is_delete == PLAIN_DELETE) {
+ node->cmpl_info = 0;
+ } else {
+ pars_process_assign_list(node);
+ }
+
+ if (node->searched_update) {
+ node->has_clust_rec_x_lock = TRUE;
+ sel_node->set_x_locks = TRUE;
+ sel_node->row_lock_mode = LOCK_X;
+ } else {
+ node->has_clust_rec_x_lock = sel_node->set_x_locks;
+ ut_ad(node->has_clust_rec_x_lock);
+ }
+
+ ut_a(sel_node->n_tables == 1);
+ ut_a(sel_node->consistent_read == FALSE);
+ ut_a(sel_node->order_by == NULL);
+ ut_a(sel_node->is_aggregate == FALSE);
+
+ sel_node->can_get_updated = TRUE;
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ plan = sel_node_get_nth_plan(sel_node, 0);
+
+ plan->no_prefetch = TRUE;
+
+ if (!dict_index_is_clust(plan->index)) {
+
+ plan->must_get_clust = TRUE;
+
+ node->pcur = &(plan->clust_pcur);
+ } else {
+ node->pcur = &(plan->pcur);
+ }
+
+ return(node);
+}
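+
+/* NOTE (sketch): node->searched_update distinguishes the two forms
+handled above: a searched update/delete, for which an implicit SELECT
+with the given search condition is built and X-locks are taken, and a
+positioned update through a previously declared cursor, where the
+select node comes from the cursor definition. The concrete SQL syntax
+of both forms lives in pars0grm.y and is not shown here. */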
+
+/*********************************************************************//**
+Parses an insert statement.
+@return own: insert node in a query tree */
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+ sym_node_t* table_sym, /*!< in: table name node */
+ que_node_t* values_list, /*!< in: value expression list or NULL */
+ sel_node_t* select) /*!< in: select condition or NULL */
+{
+ ins_node_t* node;
+ dtuple_t* row;
+ ulint ins_type;
+
+ ut_a(values_list || select);
+ ut_a(!values_list || !select);
+
+ if (values_list) {
+ ins_type = INS_VALUES;
+ } else {
+ ins_type = INS_SEARCHED;
+ }
+
+ pars_retrieve_table_def(table_sym);
+
+ node = ins_node_create(ins_type, table_sym->table,
+ pars_sym_tab_global->heap);
+
+ row = dtuple_create(pars_sym_tab_global->heap,
+ dict_table_get_n_cols(node->table));
+
+ dict_table_copy_types(row, table_sym->table);
+
+ ins_node_set_new_row(node, row);
+
+ node->select = select;
+
+ if (select) {
+ select->common.parent = node;
+
+ ut_a(que_node_list_get_len(select->select_list)
+ == dict_table_get_n_user_cols(table_sym->table));
+ }
+
+ node->values_list = values_list;
+
+ if (node->values_list) {
+ pars_resolve_exp_list_variables_and_types(NULL, values_list);
+
+ ut_a(que_node_list_get_len(values_list)
+ == dict_table_get_n_user_cols(table_sym->table));
+ }
+
+ return(node);
+}
+
+/*********************************************************************//**
+Set the type of a dfield. */
+static
+void
+pars_set_dfield_type(
+/*=================*/
+ dfield_t* dfield, /*!< in: dfield */
+ pars_res_word_t* type, /*!< in: pointer to a type
+ token */
+ ulint len, /*!< in: length, or 0 */
+ bool is_not_null) /*!< in: whether the column is
+ NOT NULL. */
+{
+ ulint flags = 0;
+
+ if (is_not_null) {
+ flags |= DATA_NOT_NULL;
+ }
+
+ if (type == &pars_bigint_token) {
+ ut_a(len == 0);
+
+ dtype_set(dfield_get_type(dfield), DATA_INT, flags, 8);
+ } else if (type == &pars_int_token) {
+ ut_a(len == 0);
+
+ dtype_set(dfield_get_type(dfield), DATA_INT, flags, 4);
+
+ } else if (type == &pars_char_token) {
+ /* A CHAR column may carry an explicit length,
+ so len is not asserted to be 0 here */
+
+ dtype_set(dfield_get_type(dfield), DATA_VARCHAR,
+ DATA_ENGLISH | flags, len);
+ } else {
+ ut_error;
+ }
+}
+
+/*********************************************************************//**
+Parses a variable declaration.
+@return own: symbol table node of type SYM_VAR */
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+ sym_node_t* node, /*!< in: symbol table node allocated for the
+ id of the variable */
+ pars_res_word_t* type) /*!< in: pointer to a type token */
+{
+ node->resolved = TRUE;
+ node->token_type = SYM_VAR;
+
+ node->param_type = PARS_NOT_PARAM;
+
+ pars_set_dfield_type(que_node_get_val(node), type, 0, false);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Sets the parent field in a query node list. */
+static
+void
+pars_set_parent_in_list(
+/*====================*/
+ que_node_t* node_list, /*!< in: first node in a list */
+ que_node_t* parent) /*!< in: parent value to set in all
+ nodes of the list */
+{
+ que_common_t* common;
+
+ common = static_cast<que_common_t*>(node_list);
+
+ while (common) {
+ common->parent = parent;
+
+ common = static_cast<que_common_t*>(que_node_get_next(common));
+ }
+}
+
+/*********************************************************************//**
+Parses an elsif element.
+@return elsif node */
+elsif_node_t*
+pars_elsif_element(
+/*===============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list) /*!< in: statement list */
+{
+ elsif_node_t* node;
+
+ node = static_cast<elsif_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(elsif_node_t)));
+
+ node->common.type = QUE_NODE_ELSIF;
+
+ node->cond = cond;
+
+ pars_resolve_exp_variables_and_types(NULL, cond);
+
+ node->stat_list = stat_list;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses an if-statement.
+@return if-statement node */
+if_node_t*
+pars_if_statement(
+/*==============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list, /*!< in: statement list */
+ que_node_t* else_part) /*!< in: else-part statement list
+ or elsif element list */
+{
+ if_node_t* node;
+ elsif_node_t* elsif_node;
+
+ node = static_cast<if_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(if_node_t)));
+
+ node->common.type = QUE_NODE_IF;
+
+ node->cond = cond;
+
+ pars_resolve_exp_variables_and_types(NULL, cond);
+
+ node->stat_list = stat_list;
+
+ if (else_part && (que_node_get_type(else_part) == QUE_NODE_ELSIF)) {
+
+ /* There is a list of elsif conditions */
+
+ node->else_part = NULL;
+ node->elsif_list = static_cast<elsif_node_t*>(else_part);
+
+ elsif_node = static_cast<elsif_node_t*>(else_part);
+
+ while (elsif_node) {
+ pars_set_parent_in_list(elsif_node->stat_list, node);
+
+ elsif_node = static_cast<elsif_node_t*>(
+ que_node_get_next(elsif_node));
+ }
+ } else {
+ node->else_part = else_part;
+ node->elsif_list = NULL;
+
+ pars_set_parent_in_list(else_part, node);
+ }
+
+ pars_set_parent_in_list(stat_list, node);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a while-statement.
+@return while-statement node */
+while_node_t*
+pars_while_statement(
+/*=================*/
+ que_node_t* cond, /*!< in: while-condition */
+ que_node_t* stat_list) /*!< in: statement list */
+{
+ while_node_t* node;
+
+ node = static_cast<while_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(while_node_t)));
+
+ node->common.type = QUE_NODE_WHILE;
+
+ node->cond = cond;
+
+ pars_resolve_exp_variables_and_types(NULL, cond);
+
+ node->stat_list = stat_list;
+
+ pars_set_parent_in_list(stat_list, node);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a for-loop-statement.
+@return for-statement node */
+for_node_t*
+pars_for_statement(
+/*===============*/
+ sym_node_t* loop_var, /*!< in: loop variable */
+ que_node_t* loop_start_limit,/*!< in: loop start expression */
+ que_node_t* loop_end_limit, /*!< in: loop end expression */
+ que_node_t* stat_list) /*!< in: statement list */
+{
+ for_node_t* node;
+
+ node = static_cast<for_node_t*>(
+ mem_heap_alloc(pars_sym_tab_global->heap, sizeof(for_node_t)));
+
+ node->common.type = QUE_NODE_FOR;
+
+ pars_resolve_exp_variables_and_types(NULL, loop_var);
+ pars_resolve_exp_variables_and_types(NULL, loop_start_limit);
+ pars_resolve_exp_variables_and_types(NULL, loop_end_limit);
+
+ node->loop_var = loop_var->indirection;
+
+ ut_a(loop_var->indirection);
+
+ node->loop_start_limit = loop_start_limit;
+ node->loop_end_limit = loop_end_limit;
+
+ node->stat_list = stat_list;
+
+ pars_set_parent_in_list(stat_list, node);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses an exit statement.
+@return exit statement node */
+exit_node_t*
+pars_exit_statement(void)
+/*=====================*/
+{
+ exit_node_t* node;
+
+ node = static_cast<exit_node_t*>(
+ mem_heap_alloc(pars_sym_tab_global->heap, sizeof(exit_node_t)));
+ node->common.type = QUE_NODE_EXIT;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a return-statement.
+@return return-statement node */
+return_node_t*
+pars_return_statement(void)
+/*=======================*/
+{
+ return_node_t* node;
+
+ node = static_cast<return_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(return_node_t)));
+ node->common.type = QUE_NODE_RETURN;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses an assignment statement.
+@return assignment statement node */
+assign_node_t*
+pars_assignment_statement(
+/*======================*/
+ sym_node_t* var, /*!< in: variable to assign */
+ que_node_t* val) /*!< in: value to assign */
+{
+ assign_node_t* node;
+
+ node = static_cast<assign_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(assign_node_t)));
+ node->common.type = QUE_NODE_ASSIGNMENT;
+
+ node->var = var;
+ node->val = val;
+
+ pars_resolve_exp_variables_and_types(NULL, var);
+ pars_resolve_exp_variables_and_types(NULL, val);
+
+ ut_a(dtype_get_mtype(dfield_get_type(que_node_get_val(var)))
+ == dtype_get_mtype(dfield_get_type(que_node_get_val(val))));
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a procedure call.
+@return function node */
+func_node_t*
+pars_procedure_call(
+/*================*/
+ que_node_t* res_word,/*!< in: procedure name reserved word */
+ que_node_t* args) /*!< in: argument list */
+{
+ func_node_t* node;
+
+ node = pars_func(res_word, args);
+
+ pars_resolve_exp_list_variables_and_types(NULL, args);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a fetch statement. into_list or user_func (but not both) must be
+non-NULL.
+@return fetch statement node */
+fetch_node_t*
+pars_fetch_statement(
+/*=================*/
+ sym_node_t* cursor, /*!< in: cursor node */
+ sym_node_t* into_list, /*!< in: variables to set, or NULL */
+ sym_node_t* user_func) /*!< in: user function name, or NULL */
+{
+ sym_node_t* cursor_decl;
+ fetch_node_t* node;
+
+ /* Logical XOR. */
+ ut_a(!into_list != !user_func);
+
+ node = static_cast<fetch_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(fetch_node_t)));
+
+ node->common.type = QUE_NODE_FETCH;
+
+ pars_resolve_exp_variables_and_types(NULL, cursor);
+
+ if (into_list) {
+ pars_resolve_exp_list_variables_and_types(NULL, into_list);
+ node->into_list = into_list;
+ node->func = NULL;
+ } else {
+ pars_resolve_exp_variables_and_types(NULL, user_func);
+
+ node->func = pars_info_lookup_user_func(
+ pars_sym_tab_global->info, user_func->name);
+
+ ut_a(node->func);
+
+ node->into_list = NULL;
+ }
+
+ cursor_decl = cursor->alias;
+
+ ut_a(cursor_decl->token_type == SYM_CURSOR);
+
+ node->cursor_def = cursor_decl->cursor_def;
+
+ if (into_list) {
+ ut_a(que_node_list_get_len(into_list)
+ == que_node_list_get_len(node->cursor_def->select_list));
+ }
+
+ return(node);
+}
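+
+/* NOTE (illustrative sketch, syntax assumed from typical InnoDB
+internal SQL): the two FETCH forms accepted above are
+
+ FETCH c INTO var1, var2;  -- into_list != NULL
+ FETCH c INTO my_func();   -- user_func != NULL (see
+                              pars_info_bind_function() below)
+
+exactly one of which must be used. */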
+
+/*********************************************************************//**
+Parses an open or close cursor statement.
+@return open or close cursor statement node */
+open_node_t*
+pars_open_statement(
+/*================*/
+ ulint type, /*!< in: ROW_SEL_OPEN_CURSOR
+ or ROW_SEL_CLOSE_CURSOR */
+ sym_node_t* cursor) /*!< in: cursor node */
+{
+ sym_node_t* cursor_decl;
+ open_node_t* node;
+
+ node = static_cast<open_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(open_node_t)));
+
+ node->common.type = QUE_NODE_OPEN;
+
+ pars_resolve_exp_variables_and_types(NULL, cursor);
+
+ cursor_decl = cursor->alias;
+
+ ut_a(cursor_decl->token_type == SYM_CURSOR);
+
+ node->op_type = static_cast<open_node_op>(type);
+ node->cursor_def = cursor_decl->cursor_def;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a row_printf-statement.
+@return row_printf-statement node */
+row_printf_node_t*
+pars_row_printf_statement(
+/*======================*/
+ sel_node_t* sel_node) /*!< in: select node */
+{
+ row_printf_node_t* node;
+
+ node = static_cast<row_printf_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(row_printf_node_t)));
+ node->common.type = QUE_NODE_ROW_PRINTF;
+
+ node->sel_node = sel_node;
+
+ sel_node->common.parent = node;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a commit statement.
+@return own: commit node struct */
+commit_node_t*
+pars_commit_statement(void)
+/*=======================*/
+{
+ return(trx_commit_node_create(pars_sym_tab_global->heap));
+}
+
+/*********************************************************************//**
+Parses a rollback statement.
+@return own: rollback node struct */
+roll_node_t*
+pars_rollback_statement(void)
+/*=========================*/
+{
+ return(roll_node_create(pars_sym_tab_global->heap));
+}
+
+/*********************************************************************//**
+Parses a column definition at a table creation.
+@return column sym table node */
+sym_node_t*
+pars_column_def(
+/*============*/
+ sym_node_t* sym_node, /*!< in: column node in the
+ symbol table */
+ pars_res_word_t* type, /*!< in: data type */
+ sym_node_t* len, /*!< in: length of column, or
+ NULL */
+ void* is_not_null) /*!< in: if not NULL, column
+ is of type NOT NULL. */
+{
+ ulint len2;
+
+ if (len) {
+ len2 = ulint(eval_node_get_int_val(len));
+ } else {
+ len2 = 0;
+ }
+
+ pars_set_dfield_type(que_node_get_val(sym_node), type, len2,
+ is_not_null != NULL);
+
+ return(sym_node);
+}
+
+/*********************************************************************//**
+Parses a table creation operation.
+@return table create subgraph */
+tab_node_t*
+pars_create_table(
+/*==============*/
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_defs) /*!< in: list of column names */
+{
+ dict_table_t* table;
+ sym_node_t* column;
+ tab_node_t* node;
+ const dtype_t* dtype;
+ ulint n_cols;
+ ulint flags = 0;
+ ulint flags2 = DICT_TF2_FTS_AUX_HEX_NAME;
+
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;);
+
+ n_cols = que_node_list_get_len(column_defs);
+
+ table = dict_mem_table_create(
+ table_sym->name, NULL, n_cols, 0, flags, flags2);
+
+ mem_heap_t* heap = pars_sym_tab_global->heap;
+ column = column_defs;
+
+ while (column) {
+ dtype = dfield_get_type(que_node_get_val(column));
+
+ dict_mem_table_add_col(table, heap,
+ column->name, dtype->mtype,
+ dtype->prtype, dtype->len);
+ column->resolved = TRUE;
+ column->token_type = SYM_COLUMN;
+
+ column = static_cast<sym_node_t*>(que_node_get_next(column));
+ }
+
+ dict_table_add_system_columns(table, heap);
+ node = tab_create_graph_create(table, heap,
+ FIL_ENCRYPTION_DEFAULT,
+ FIL_DEFAULT_ENCRYPTION_KEY);
+
+ table_sym->resolved = TRUE;
+ table_sym->token_type = SYM_TABLE;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses an index creation operation.
+@return index create subgraph */
+ind_node_t*
+pars_create_index(
+/*==============*/
+ pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */
+ pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */
+ sym_node_t* index_sym, /*!< in: index name node in the symbol
+ table */
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_list) /*!< in: list of column names */
+{
+ dict_index_t* index;
+ sym_node_t* column;
+ ind_node_t* node;
+ ulint n_fields;
+ ulint ind_type;
+
+ n_fields = que_node_list_get_len(column_list);
+
+ ind_type = 0;
+
+ if (unique_def) {
+ ind_type = ind_type | DICT_UNIQUE;
+ }
+
+ if (clustered_def) {
+ ind_type = ind_type | DICT_CLUSTERED;
+ }
+
+ index = dict_mem_index_create(NULL, index_sym->name,
+ ind_type, n_fields);
+ column = column_list;
+
+ while (column) {
+ dict_mem_index_add_field(index, column->name, 0);
+
+ column->resolved = TRUE;
+ column->token_type = SYM_COLUMN;
+
+ column = static_cast<sym_node_t*>(que_node_get_next(column));
+ }
+
+ node = ind_create_graph_create(index, table_sym->name,
+ pars_sym_tab_global->heap);
+
+ table_sym->resolved = TRUE;
+ table_sym->token_type = SYM_TABLE;
+
+ index_sym->resolved = TRUE;
+ index_sym->token_type = SYM_TABLE;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a procedure definition.
+@return query fork node */
+que_fork_t*
+pars_procedure_definition(
+/*======================*/
+ sym_node_t* sym_node, /*!< in: procedure id node in the symbol
+ table */
+ que_node_t* stat_list) /*!< in: statement list */
+{
+ proc_node_t* node;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ mem_heap_t* heap;
+
+ heap = pars_sym_tab_global->heap;
+
+ fork = que_fork_create(NULL, NULL, QUE_FORK_PROCEDURE, heap);
+ fork->trx = NULL;
+
+ thr = que_thr_create(fork, heap, NULL);
+
+ node = static_cast<proc_node_t*>(
+ mem_heap_alloc(heap, sizeof(proc_node_t)));
+
+ node->common.type = QUE_NODE_PROC;
+ node->common.parent = thr;
+
+ sym_node->token_type = SYM_PROCEDURE_NAME;
+ sym_node->resolved = TRUE;
+
+ node->proc_id = sym_node;
+ node->stat_list = stat_list;
+
+ pars_set_parent_in_list(stat_list, node);
+
+ node->sym_tab = pars_sym_tab_global;
+
+ thr->child = node;
+
+ pars_sym_tab_global->query_graph = fork;
+
+ return(fork);
+}
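+
+/* NOTE (illustrative sketch; the shape follows typical internal-SQL
+callers, while the table, variable and function names are made up):
+a procedure definition parsed by this module usually looks like
+
+ PROCEDURE P () IS
+ DECLARE FUNCTION my_func;
+ DECLARE CURSOR c IS SELECT NAME FROM SYS_FOO WHERE ID = :id;
+ BEGIN
+ OPEN c;
+ WHILE 1 = 1 LOOP
+   FETCH c INTO my_func();
+   IF c % NOTFOUND THEN
+     EXIT;
+   END IF;
+ END LOOP;
+ CLOSE c;
+ END;
+
+which exercises pars_cursor_declaration(), pars_open_statement(),
+pars_while_statement(), pars_fetch_statement(), pars_if_statement()
+and pars_exit_statement() above. */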
+
+/*************************************************************//**
+Parses a stored procedure call when it is not within another stored
+procedure, that is, when the client issues the procedure call directly.
+In MySQL/InnoDB, stored InnoDB procedures are invoked via the
+parsed procedure tree, not via InnoDB SQL, so this function is not used.
+@return query graph */
+que_fork_t*
+pars_stored_procedure_call(
+/*=======================*/
+ sym_node_t* sym_node MY_ATTRIBUTE((unused)))
+ /*!< in: stored procedure name */
+{
+ ut_error;
+ return(NULL);
+}
+
+/*************************************************************//**
+Retrieves characters for the lexical analyzer. */
+int
+pars_get_lex_chars(
+/*===============*/
+ char* buf, /*!< in/out: buffer where to copy */
+ size_t max_size) /*!< in: maximum number of characters which fit
+ in the buffer */
+{
+ size_t len = pars_sym_tab_global->string_len
+ - pars_sym_tab_global->next_char_pos;
+ if (len == 0) {
+ return(0);
+ }
+
+ if (len > max_size) {
+ len = max_size;
+ }
+
+ memcpy(buf, pars_sym_tab_global->sql_string
+ + pars_sym_tab_global->next_char_pos, len);
+
+ pars_sym_tab_global->next_char_pos += len;
+
+ return static_cast<int>(len);
+}
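+
+/* NOTE (assumption, not visible in this diff): this function is
+presumably hooked into the generated scanner through a YY_INPUT-style
+macro in pars0lex.l, so that yylex() reads its input from
+pars_sym_tab_global->sql_string. */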
+
+/*************************************************************//**
+Called by yyparse on error. */
+void
+yyerror(
+/*====*/
+ const char* s MY_ATTRIBUTE((unused)))
+ /*!< in: error message string */
+{
+ ut_ad(s);
+
+ ib::fatal() << "PARSER: Syntax error in SQL string";
+}
+
+/*************************************************************//**
+Parses an SQL string returning the query graph.
+@return own: the query graph */
+que_t*
+pars_sql(
+/*=====*/
+ pars_info_t* info, /*!< in: extra information, or NULL */
+ const char* str) /*!< in: SQL string */
+{
+ sym_node_t* sym_node;
+ mem_heap_t* heap;
+ que_t* graph;
+
+ ut_ad(str);
+
+ heap = mem_heap_create(16000);
+
+ /* Currently, the parser is not reentrant: */
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ pars_sym_tab_global = sym_tab_create(heap);
+
+ pars_sym_tab_global->string_len = strlen(str);
+ pars_sym_tab_global->sql_string = static_cast<char*>(
+ mem_heap_dup(heap, str, pars_sym_tab_global->string_len + 1));
+ pars_sym_tab_global->next_char_pos = 0;
+ pars_sym_tab_global->info = info;
+
+ yyparse();
+
+ sym_node = UT_LIST_GET_FIRST(pars_sym_tab_global->sym_list);
+
+ while (sym_node) {
+ ut_a(sym_node->resolved);
+
+ sym_node = UT_LIST_GET_NEXT(sym_list, sym_node);
+ }
+
+ graph = pars_sym_tab_global->query_graph;
+
+ graph->sym_tab = pars_sym_tab_global;
+ graph->info = info;
+
+ pars_sym_tab_global = NULL;
+
+ /* fprintf(stderr, "SQL graph size %lu\n", mem_heap_get_size(heap)); */
+
+ return(graph);
+}
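+
+/* NOTE (usage sketch; the functions are the ones defined in this file,
+while the table name is made up): a typical caller does something like
+
+ pars_info_t* info = pars_info_create();
+ pars_info_add_str_literal(info, "name", "test/t1");
+
+ que_t* graph = pars_sql(info,
+     "PROCEDURE P () IS\n"
+     "BEGIN\n"
+     "DELETE FROM SYS_FOO WHERE NAME = :name;\n"
+     "END;\n");
+
+with dict_sys.mutex held (the parser is not reentrant, see above). The
+graph then owns info (info->graph_owns_us defaults to TRUE) and is run
+through a query thread, cf. pars_complete_graph_for_exec(). */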
+
+/** Completes a query graph by adding query thread and fork nodes
+above it and prepares the graph for running. The fork created is of
+type QUE_FORK_MYSQL_INTERFACE.
+@param[in] node root node for an incomplete query
+ graph, or NULL for dummy graph
+@param[in] trx transaction handle
+@param[in] heap memory heap from which allocated
+@param[in] prebuilt row prebuilt structure
+@return query thread node to run */
+que_thr_t*
+pars_complete_graph_for_exec(
+ que_node_t* node,
+ trx_t* trx,
+ mem_heap_t* heap,
+ row_prebuilt_t* prebuilt)
+{
+ que_fork_t* fork;
+ que_thr_t* thr;
+
+ fork = que_fork_create(NULL, NULL, QUE_FORK_MYSQL_INTERFACE, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap, prebuilt);
+
+ thr->child = node;
+
+ if (node) {
+ que_node_set_parent(node, thr);
+ }
+
+ trx->graph = NULL;
+
+ return(thr);
+}
+
+/****************************************************************//**
+Create parser info struct.
+@return own: info struct */
+pars_info_t*
+pars_info_create(void)
+/*==================*/
+{
+ pars_info_t* info;
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(512);
+
+ info = static_cast<pars_info_t*>(mem_heap_alloc(heap, sizeof(*info)));
+
+ info->heap = heap;
+ info->funcs = NULL;
+ info->bound_lits = NULL;
+ info->bound_ids = NULL;
+ info->graph_owns_us = TRUE;
+
+ return(info);
+}
+
+/****************************************************************//**
+Free info struct and everything it contains. */
+void
+pars_info_free(
+/*===========*/
+ pars_info_t* info) /*!< in, own: info struct */
+{
+ mem_heap_free(info->heap);
+}
+
+/****************************************************************//**
+Add bound literal. */
+void
+pars_info_add_literal(
+/*==================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const void* address, /*!< in: address */
+ ulint length, /*!< in: length of data */
+ ulint type, /*!< in: type, e.g. DATA_FIXBINARY */
+ ulint prtype) /*!< in: precise type, e.g.
+ DATA_UNSIGNED */
+{
+ pars_bound_lit_t* pbl;
+
+ ut_ad(!pars_info_get_bound_lit(info, name));
+
+ pbl = static_cast<pars_bound_lit_t*>(
+ mem_heap_alloc(info->heap, sizeof(*pbl)));
+
+ pbl->name = name;
+
+ pbl->address = address;
+ pbl->length = length;
+ pbl->type = type;
+ pbl->prtype = prtype;
+
+ if (!info->bound_lits) {
+ ib_alloc_t* heap_alloc;
+
+ heap_alloc = ib_heap_allocator_create(info->heap);
+
+ info->bound_lits = ib_vector_create(heap_alloc, sizeof(*pbl), 8);
+ }
+
+ ib_vector_push(info->bound_lits, pbl);
+}
+
+/****************************************************************//**
+Equivalent to pars_info_add_literal(info, name, str, strlen(str),
+DATA_VARCHAR, DATA_ENGLISH). */
+void
+pars_info_add_str_literal(
+/*======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* str) /*!< in: string */
+{
+ pars_info_add_literal(info, name, str, strlen(str),
+ DATA_VARCHAR, DATA_ENGLISH);
+}
+
+/********************************************************************
+If the literal value already exists then it is rebound; otherwise a
+new entry is created. */
+void
+pars_info_bind_literal(
+/*===================*/
+ pars_info_t* info, /* in: info struct */
+ const char* name, /* in: name */
+ const void* address, /* in: address */
+ ulint length, /* in: length of data */
+ ulint type, /* in: type, e.g. DATA_FIXBINARY */
+ ulint prtype) /* in: precise type, e.g.
+ DATA_UNSIGNED */
+{
+ pars_bound_lit_t* pbl;
+
+ pbl = pars_info_lookup_bound_lit(info, name);
+
+ if (!pbl) {
+ pars_info_add_literal(
+ info, name, address, length, type, prtype);
+ } else {
+ pbl->address = address;
+ pbl->length = length;
+
+ sym_tab_rebind_lit(pbl->node, address, length);
+ }
+}
+
+/********************************************************************
+If the literal value already exists then it is rebound; otherwise a
+new entry is created. */
+void
+pars_info_bind_varchar_literal(
+/*===========================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const byte* str, /*!< in: string */
+ ulint str_len) /*!< in: string length */
+{
+ pars_bound_lit_t* pbl;
+
+ pbl = pars_info_lookup_bound_lit(info, name);
+
+ if (!pbl) {
+ pars_info_add_literal(
+ info, name, str, str_len, DATA_VARCHAR, DATA_ENGLISH);
+ } else {
+
+ pbl->address = str;
+ pbl->length = str_len;
+
+ sym_tab_rebind_lit(pbl->node, str, str_len);
+ }
+}
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+void
+pars_info_add_int4_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ ulint val) /*!< in: value */
+{
+ byte* buf = static_cast<byte*>(mem_heap_alloc(info->heap, 4));
+
+ mach_write_to_4(buf, val);
+ pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+}
+
+/********************************************************************
+If the literal value already exists then it is rebound; otherwise a
+new entry is created. */
+void
+pars_info_bind_int4_literal(
+/*========================*/
+ pars_info_t* info, /* in: info struct */
+ const char* name, /* in: name */
+ const ib_uint32_t* val) /* in: value */
+{
+ pars_bound_lit_t* pbl;
+
+ pbl = pars_info_lookup_bound_lit(info, name);
+
+ if (!pbl) {
+ pars_info_add_literal(info, name, val, 4, DATA_INT, 0);
+ } else {
+
+ pbl->address = val;
+ pbl->length = sizeof(*val);
+
+ sym_tab_rebind_lit(pbl->node, val, sizeof(*val));
+ }
+}
+
+/********************************************************************
+If the literal value already exists then it is rebound; otherwise a
+new entry is created. */
+void
+pars_info_bind_int8_literal(
+/*========================*/
+ pars_info_t* info, /* in: info struct */
+ const char* name, /* in: name */
+ const ib_uint64_t* val) /* in: value */
+{
+ pars_bound_lit_t* pbl;
+
+ pbl = pars_info_lookup_bound_lit(info, name);
+
+ if (!pbl) {
+ pars_info_add_literal(
+ info, name, val, sizeof(*val), DATA_INT, 0);
+ } else {
+
+ pbl->address = val;
+ pbl->length = sizeof(*val);
+
+ sym_tab_rebind_lit(pbl->node, val, sizeof(*val));
+ }
+}
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[8];
+mach_write_to_8(buf, val);
+pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+void
+pars_info_add_ull_literal(
+/*======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ ib_uint64_t val) /*!< in: value */
+{
+ byte* buf = static_cast<byte*>(mem_heap_alloc(info->heap, 8));
+
+ mach_write_to_8(buf, val);
+
+ pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0);
+}
+
+/****************************************************************//**
+If the literal value already exists then it is rebound; otherwise a
+new entry is created. */
+void
+pars_info_bind_ull_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const ib_uint64_t* val) /*!< in: value */
+{
+ pars_bound_lit_t* pbl;
+
+ pbl = pars_info_lookup_bound_lit(info, name);
+
+ if (!pbl) {
+ pars_info_add_literal(
+ info, name, val, sizeof(*val), DATA_FIXBINARY, 0);
+ } else {
+
+ pbl->address = val;
+ pbl->length = sizeof(*val);
+
+ sym_tab_rebind_lit(pbl->node, val, sizeof(*val));
+ }
+}
+
+/****************************************************************//**
+Add user function. */
+void
+pars_info_bind_function(
+/*====================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: function name */
+ pars_user_func_cb_t func, /*!< in: function address */
+ void* arg) /*!< in: user-supplied argument */
+{
+ pars_user_func_t* puf;
+
+ puf = pars_info_lookup_user_func(info, name);
+
+ if (!puf) {
+ if (!info->funcs) {
+ ib_alloc_t* heap_alloc;
+
+ heap_alloc = ib_heap_allocator_create(info->heap);
+
+ info->funcs = ib_vector_create(
+ heap_alloc, sizeof(*puf), 8);
+ }
+
+ /* Create a "new" element */
+ puf = static_cast<pars_user_func_t*>(
+ ib_vector_push(info->funcs, NULL));
+ puf->name = name;
+ }
+
+ puf->arg = arg;
+ puf->func = func;
+}
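+
+/* NOTE (sketch; the callback signature is assumed to follow
+pars_user_func_cb_t in pars0pars.h): a bound function is a C callback
+invoked for each row fetched via "FETCH c INTO my_func();", e.g.
+
+ static ibool my_func(void* row, void* user_arg)
+ {
+     ... read the columns of "row" here ...
+     return(TRUE);  -- the convention used by existing callers
+ }
+
+ pars_info_bind_function(info, "my_func", my_func, &my_state);
+
+where "my_state" is arbitrary caller context passed back as user_arg. */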
+
+/********************************************************************
+Add bound id. */
+void
+pars_info_bind_id(
+/*==============*/
+ pars_info_t* info, /*!< in: info struct */
+ ibool copy_name, /*!< in: copy name if TRUE */
+ const char* name, /*!< in: name */
+ const char* id) /*!< in: id */
+{
+ pars_bound_id_t* bid;
+
+ bid = pars_info_lookup_bound_id(info, name);
+
+ if (!bid) {
+
+ if (!info->bound_ids) {
+ ib_alloc_t* heap_alloc;
+
+ heap_alloc = ib_heap_allocator_create(info->heap);
+
+ info->bound_ids = ib_vector_create(
+ heap_alloc, sizeof(*bid), 8);
+ }
+
+ /* Create a "new" element */
+ bid = static_cast<pars_bound_id_t*>(
+ ib_vector_push(info->bound_ids, NULL));
+
+ bid->name = (copy_name)
+ ? mem_heap_strdup(info->heap, name) : name;
+ }
+
+ bid->id = id;
+}
+
+/********************************************************************
+Get bound identifier with the given name.*/
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+ /* out: bound id, or NULL if not
+ found */
+ pars_info_t* info, /* in: info struct */
+ const char* name) /* in: bound id name to find */
+{
+ return(pars_info_lookup_bound_id(info, name));
+}
+
+/****************************************************************//**
+Get bound literal with the given name.
+@return bound literal, or NULL if not found */
+pars_bound_lit_t*
+pars_info_get_bound_lit(
+/*====================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name) /*!< in: bound literal name to find */
+{
+ return(pars_info_lookup_bound_lit(info, name));
+}
diff --git a/storage/innobase/pars/pars0sym.cc b/storage/innobase/pars/pars0sym.cc
new file mode 100644
index 00000000..5e4c0e0f
--- /dev/null
+++ b/storage/innobase/pars/pars0sym.cc
@@ -0,0 +1,416 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file pars/pars0sym.cc
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
+
+#include "pars0sym.h"
+#include "mem0mem.h"
+#include "data0type.h"
+#include "data0data.h"
+#include "pars0grm.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "eval0eval.h"
+#include "row0sel.h"
+
+/******************************************************************//**
+Creates a symbol table for a single stored procedure or query.
+@return own: symbol table */
+sym_tab_t*
+sym_tab_create(
+/*===========*/
+ mem_heap_t* heap) /*!< in: memory heap where to create */
+{
+ sym_tab_t* sym_tab;
+
+ sym_tab = static_cast<sym_tab_t*>(
+ mem_heap_alloc(heap, sizeof(sym_tab_t)));
+
+ UT_LIST_INIT(sym_tab->sym_list, &sym_node_t::sym_list);
+ UT_LIST_INIT(sym_tab->func_node_list, &func_node_t::func_node_list);
+
+ sym_tab->heap = heap;
+
+ return(sym_tab);
+}
+
+
+/******************************************************************//**
+Frees the memory allocated dynamically AFTER the parsing phase for
+variables etc. in the symbol table. Does not free the memory heap where
+the table was originally created. Also frees explicit SQL cursor
+definitions. */
+void
+sym_tab_free_private(
+/*=================*/
+ sym_tab_t* sym_tab) /*!< in, own: symbol table */
+{
+ sym_node_t* sym;
+ func_node_t* func;
+
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ for (sym = UT_LIST_GET_FIRST(sym_tab->sym_list);
+ sym != NULL;
+ sym = UT_LIST_GET_NEXT(sym_list, sym)) {
+
+ /* Close the tables opened in pars_retrieve_table_def(). */
+
+ if (sym->token_type == SYM_TABLE_REF_COUNTED) {
+
+ dict_table_close(sym->table, TRUE, FALSE);
+
+ sym->table = NULL;
+ sym->resolved = FALSE;
+ sym->token_type = SYM_UNSET;
+ }
+
+ eval_node_free_val_buf(sym);
+
+ if (sym->prefetch_buf) {
+ sel_col_prefetch_buf_free(sym->prefetch_buf);
+ }
+
+ if (sym->cursor_def) {
+ que_graph_free_recursive(sym->cursor_def);
+ }
+ }
+
+ for (func = UT_LIST_GET_FIRST(sym_tab->func_node_list);
+ func != NULL;
+ func = UT_LIST_GET_NEXT(func_node_list, func)) {
+
+ eval_node_free_val_buf(func);
+ }
+}
+
+/******************************************************************//**
+Adds an integer literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_int_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ ulint val) /*!< in: integer value */
+{
+ sym_node_t* node;
+ byte* data;
+
+ node = static_cast<sym_node_t*>(
+ mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->table = NULL;
+ node->resolved = TRUE;
+ node->token_type = SYM_LIT;
+
+ node->indirection = NULL;
+
+ dtype_set(dfield_get_type(&node->common.val), DATA_INT, 0, 4);
+
+ data = static_cast<byte*>(mem_heap_alloc(sym_tab->heap, 4));
+ mach_write_to_4(data, val);
+
+ dfield_set_data(&(node->common.val), data, 4);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ UT_LIST_ADD_LAST(sym_tab->sym_list, node);
+
+ node->like_node = NULL;
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/******************************************************************//**
+Adds a string literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_str_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const byte* str, /*!< in: string with no quotes around
+ it */
+ ulint len) /*!< in: string length */
+{
+ sym_node_t* node;
+ byte* data;
+
+ node = static_cast<sym_node_t*>(
+ mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->table = NULL;
+ node->resolved = TRUE;
+ node->token_type = SYM_LIT;
+
+ node->indirection = NULL;
+
+ dtype_set(dfield_get_type(&node->common.val),
+ DATA_VARCHAR, DATA_ENGLISH, 0);
+
+ data = (len) ? static_cast<byte*>(mem_heap_dup(sym_tab->heap, str, len))
+ : NULL;
+
+ dfield_set_data(&(node->common.val), data, len);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ UT_LIST_ADD_LAST(sym_tab->sym_list, node);
+
+ node->like_node = NULL;
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/******************************************************************//**
+Add a bound literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_bound_lit(
+/*==================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name, /*!< in: name of bound literal */
+ ulint* lit_type) /*!< out: type of literal (PARS_*_LIT) */
+{
+ sym_node_t* node;
+ pars_bound_lit_t* blit;
+ ulint len = 0;
+
+ blit = pars_info_get_bound_lit(sym_tab->info, name);
+ ut_a(blit);
+
+ node = static_cast<sym_node_t*>(
+ mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
+
+ node->common.type = QUE_NODE_SYMBOL;
+ node->common.brother = node->common.parent = NULL;
+
+ node->table = NULL;
+ node->resolved = TRUE;
+ node->token_type = SYM_LIT;
+
+ node->indirection = NULL;
+
+ switch (blit->type) {
+ case DATA_FIXBINARY:
+ case DATA_CHAR:
+ ut_ad(blit->length > 0);
+ len = blit->length;
+ /* fall through */
+ case DATA_BLOB:
+ case DATA_VARCHAR:
+ *lit_type = PARS_STR_LIT;
+ break;
+
+ case DATA_INT:
+ ut_a(blit->length > 0);
+ ut_a(blit->length <= 8);
+
+ len = blit->length;
+ *lit_type = PARS_INT_LIT;
+ break;
+
+ default:
+ ut_error;
+ }
+
+ dtype_set(dfield_get_type(&node->common.val),
+ blit->type, blit->prtype, len);
+
+ dfield_set_data(&(node->common.val), blit->address, blit->length);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ UT_LIST_ADD_LAST(sym_tab->sym_list, node);
+
+ blit->node = node;
+ node->like_node = NULL;
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/**********************************************************************
+Rebind literal to a node in the symbol table. */
+sym_node_t*
+sym_tab_rebind_lit(
+/*===============*/
+ /* out: symbol table node */
+ sym_node_t* node, /* in: node that is bound to a literal */
+ const void* address, /* in: pointer to data */
+ ulint length) /* in: length of data */
+{
+ dfield_t* dfield = que_node_get_val(node);
+ dtype_t* dtype = dfield_get_type(dfield);
+
+ ut_a(node->token_type == SYM_LIT);
+
+ dfield_set_data(&node->common.val, address, length);
+
+ if (node->like_node) {
+
+ ut_a(dtype_get_mtype(dtype) == DATA_CHAR
+ || dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+ /* Rebind the LIKE pattern without forcing
+ re-creation of the sub-nodes */
+ pars_like_rebind(
+ node, static_cast<const byte*>(address), length);
+ }
+
+ /* FIXME: What's this? */
+ node->common.val_buf_size = 0;
+
+ if (node->prefetch_buf) {
+ sel_col_prefetch_buf_free(node->prefetch_buf);
+ node->prefetch_buf = NULL;
+ }
+
+ if (node->cursor_def) {
+ que_graph_free_recursive(node->cursor_def);
+ node->cursor_def = NULL;
+ }
+
+ return(node);
+}
+
+/******************************************************************//**
+Adds an SQL null literal to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_null_lit(
+/*=================*/
+ sym_tab_t* sym_tab) /*!< in: symbol table */
+{
+ sym_node_t* node;
+
+ node = static_cast<sym_node_t*>(
+ mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->table = NULL;
+ node->resolved = TRUE;
+ node->token_type = SYM_LIT;
+
+ node->indirection = NULL;
+
+ dfield_get_type(&node->common.val)->mtype = DATA_ERROR;
+
+ dfield_set_null(&node->common.val);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ UT_LIST_ADD_LAST(sym_tab->sym_list, node);
+
+ node->like_node = NULL;
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/******************************************************************//**
+Adds an identifier to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_id(
+/*===========*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ byte* name, /*!< in: identifier name */
+ ulint len) /*!< in: identifier length */
+{
+ sym_node_t* node;
+
+ node = static_cast<sym_node_t*>(
+ mem_heap_zalloc(sym_tab->heap, sizeof(*node)));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->name = mem_heap_strdupl(sym_tab->heap, (char*) name, len);
+ node->name_len = len;
+
+ UT_LIST_ADD_LAST(sym_tab->sym_list, node);
+
+ dfield_set_null(&node->common.val);
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/******************************************************************//**
+Adds a bound identifier to a symbol table.
+@return symbol table node */
+sym_node_t*
+sym_tab_add_bound_id(
+/*=================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name) /*!< in: name of bound id */
+{
+ sym_node_t* node;
+ pars_bound_id_t* bid;
+
+ bid = pars_info_get_bound_id(sym_tab->info, name);
+ ut_a(bid);
+
+ node = static_cast<sym_node_t*>(
+ mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->table = NULL;
+ node->resolved = FALSE;
+ node->token_type = SYM_UNSET;
+ node->indirection = NULL;
+
+ node->name = mem_heap_strdup(sym_tab->heap, bid->id);
+ node->name_len = strlen(node->name);
+
+ UT_LIST_ADD_LAST(sym_tab->sym_list, node);
+
+ dfield_set_null(&node->common.val);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ node->like_node = NULL;
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
diff --git a/storage/innobase/plugin_exports b/storage/innobase/plugin_exports
new file mode 100644
index 00000000..235ae3d5
--- /dev/null
+++ b/storage/innobase/plugin_exports
@@ -0,0 +1,14 @@
+{
+ global:
+ _maria_plugin_interface_version_;
+ _maria_sizeof_struct_st_plugin_;
+ _maria_plugin_declarations_;
+ my_snprintf_service;
+ thd_alloc_service;
+ thd_autoinc_service;
+ thd_error_context_service;
+ thd_kill_statement_service;
+ thd_wait_service;
+ local:
+ *;
+};
diff --git a/storage/innobase/que/que0que.cc b/storage/innobase/que/que0que.cc
new file mode 100644
index 00000000..121eda36
--- /dev/null
+++ b/storage/innobase/que/que0que.cc
@@ -0,0 +1,1138 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file que/que0que.cc
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "row0undo.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0purge.h"
+#include "dict0crea.h"
+#include "log0log.h"
+#include "eval0proc.h"
+
+#define QUE_MAX_LOOPS_WITHOUT_CHECK 16
+
+/* Short introduction to query graphs
+ ==================================
+
+A query graph consists of nodes linked to each other in various ways. The
+execution starts at que_run_threads() which takes a que_thr_t parameter.
+que_thr_t contains two fields that control query graph execution: run_node
+and prev_node. run_node is the next node to execute and prev_node is the
+last node executed.
+
+Each node has a pointer to a 'next' statement, i.e., its brother, and a
+pointer to its parent node. The next pointer is NULL in the last statement
+of a block.
+
+Loop nodes contain a link to the first statement of the enclosed statement
+list. While the loop runs, que_thr_step() checks if execution to the loop
+node came from its parent or from one of the statement nodes in the loop. If
+it came from the parent of the loop node it starts executing the first
+statement node in the loop. If it came from one of the statement nodes in
+the loop, then it checks if the statement node has another statement node
+following it, and runs it if so.
+
+To signify loop ending, the loop statements (see e.g. while_step()) set
+que_thr_t->run_node to the loop node's parent node. This is noticed on the
+next call of que_thr_step() and execution proceeds to the node pointed to by
+the loop node's 'next' pointer.
+
+For example, the code:
+
+X := 1;
+WHILE X < 5 LOOP
+ X := X + 1;
+ X := X + 1;
+END LOOP;
+X := 5
+
+will result in the following node hierarchy, with the X-axis indicating
+'next' links and the Y-axis indicating parent/child links:
+
+A - W - A
+ |
+ |
+ A - A
+
+A = assign_node_t, W = while_node_t. */
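+
+/* A minimal sketch (not part of the original source) of how the
+primitives defined below fit together for a caller that already owns a
+trx_t. The statement subtree "stmt" is assumed to have been built
+elsewhere, e.g. by the SQL parser:
+
+	mem_heap_t* heap = mem_heap_create(512);
+	que_fork_t* fork = que_fork_create(NULL, NULL,
+					   QUE_FORK_MYSQL_INTERFACE, heap);
+	fork->trx = trx;
+	que_thr_t* thr = que_thr_create(fork, heap, NULL);
+	thr->child = stmt;	// hypothetical statement subtree
+	thr = que_fork_start_command(fork);
+	que_run_threads(thr);
+	que_graph_free(fork);	// also frees the heap
+*/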
+
+/* How is a stored procedure containing COMMIT or ROLLBACK commands
+executed?
+
+The commit or rollback can be seen as a subprocedure call.
+
+When the transaction starts to handle a rollback or commit,
+it builds a query graph which, when executed, will roll back
+or commit the incomplete transaction. The transaction
+is moved to the TRX_QUE_ROLLING_BACK or TRX_QUE_COMMITTING state.
+If specified, the SQL cursors opened by the transaction are closed.
+When the execution of the graph completes, it is like returning
+from a subprocedure: the query thread which requested the operation
+starts running again. */
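+
+/* In outline (a sketch, not the actual trx0roll.cc code; signatures are
+approximate), such a rollback subprocedure graph is built like this:
+
+	heap = mem_heap_create(512);
+	fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap);
+	fork->trx = trx;
+	thr = que_thr_create(fork, heap, NULL);
+	thr->child = row_undo_node_create(trx, thr, heap);
+
+Executing this graph with que_run_threads() then undoes the incomplete
+transaction row by row via row_undo_step(). */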
+
+/***********************************************************************//**
+Creates a query graph fork node.
+@return own: fork node */
+que_fork_t*
+que_fork_create(
+/*============*/
+ que_t* graph, /*!< in: graph, if NULL then this
+ fork node is assumed to be the
+ graph root */
+ que_node_t* parent, /*!< in: parent node */
+ ulint fork_type, /*!< in: fork type */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ que_fork_t* fork;
+
+ ut_ad(heap);
+
+ fork = static_cast<que_fork_t*>(mem_heap_zalloc(heap, sizeof(*fork)));
+
+ fork->heap = heap;
+
+ fork->fork_type = fork_type;
+
+ fork->common.parent = parent;
+
+ fork->common.type = QUE_NODE_FORK;
+
+ fork->state = QUE_FORK_COMMAND_WAIT;
+
+ fork->graph = (graph != NULL) ? graph : fork;
+
+ UT_LIST_INIT(fork->thrs, &que_thr_t::thrs);
+
+ return(fork);
+}
+
+
+/** Creates a query graph thread node.
+@param[in] parent parent node, i.e., a fork node
+@param[in] heap memory heap where created
+@param[in] prebuilt row prebuilt structure
+@return own: query thread node */
+que_thr_t*
+que_thr_create(
+ que_fork_t* parent,
+ mem_heap_t* heap,
+ row_prebuilt_t* prebuilt)
+{
+ que_thr_t* thr;
+
+ ut_ad(parent != NULL);
+ ut_ad(heap != NULL);
+
+ thr = static_cast<que_thr_t*>(mem_heap_zalloc(heap, sizeof(*thr)));
+
+ thr->graph = parent->graph;
+
+ thr->common.parent = parent;
+
+ thr->common.type = QUE_NODE_THR;
+
+ thr->state = QUE_THR_COMMAND_WAIT;
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+ thr->prebuilt = prebuilt;
+
+ UT_LIST_ADD_LAST(parent->thrs, thr);
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Moves a suspended query thread to the QUE_THR_RUNNING state and may release
+a worker thread to execute it. This function should be used to end
+the wait state of a query thread waiting for a lock or a stored procedure
+completion.
+@return the query thread that needs to be released. */
+que_thr_t*
+que_thr_end_lock_wait(
+/*==================*/
+ trx_t* trx) /*!< in: transaction with que_state in
+ QUE_THR_LOCK_WAIT */
+{
+ que_thr_t* thr;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(trx));
+
+ thr = trx->lock.wait_thr;
+
+ ut_ad(thr != NULL);
+
+ ut_ad(trx->lock.que_state == TRX_QUE_LOCK_WAIT);
+ /* In MySQL this is the only possible state here */
+ ut_a(thr->state == QUE_THR_LOCK_WAIT);
+
+ bool was_active = thr->is_active;
+
+ thr->start_running();
+
+ trx->lock.que_state = TRX_QUE_RUNNING;
+
+ trx->lock.wait_thr = NULL;
+
+ /* In MySQL we let the OS thread (not just the query thread) wait
+ for the lock to be released: */
+
+ return((!was_active && thr != NULL) ? thr : NULL);
+}
+
+/**********************************************************************//**
+Inits a query thread for a command. */
+UNIV_INLINE
+void
+que_thr_init_command(
+/*=================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+ thr->start_running();
+}
+
+/**********************************************************************//**
+Round robin scheduler.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+que_thr_t*
+que_fork_scheduler_round_robin(
+/*===========================*/
+ que_fork_t* fork, /*!< in: a query fork */
+ que_thr_t* thr) /*!< in: current pos */
+{
+ trx_mutex_enter(fork->trx);
+
+ /* If no current, start first available. */
+ if (thr == NULL) {
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+ } else {
+ thr = UT_LIST_GET_NEXT(thrs, thr);
+ }
+
+ if (thr) {
+
+ fork->state = QUE_FORK_ACTIVE;
+
+ fork->last_sel_node = NULL;
+
+ switch (thr->state) {
+ case QUE_THR_COMMAND_WAIT:
+ case QUE_THR_COMPLETED:
+ ut_a(!thr->is_active);
+ que_thr_init_command(thr);
+ break;
+
+ case QUE_THR_SUSPENDED:
+ case QUE_THR_LOCK_WAIT:
+ default:
+ ut_error;
+
+ }
+ }
+
+ trx_mutex_exit(fork->trx);
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Starts execution of a command in a query fork. Picks a query thread which
+is not in the QUE_THR_RUNNING state and moves it to that state. If none
+can be chosen, a situation which may arise in parallelized fetches, NULL
+is returned.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+que_thr_t*
+que_fork_start_command(
+/*===================*/
+ que_fork_t* fork) /*!< in: a query fork */
+{
+ que_thr_t* thr;
+ que_thr_t* suspended_thr = NULL;
+ que_thr_t* completed_thr = NULL;
+
+ fork->state = QUE_FORK_ACTIVE;
+
+ fork->last_sel_node = NULL;
+
+ /* Choose the query thread to run: usually there is just one thread,
+ but in a parallelized select, which necessarily is non-scrollable,
+ there may be several to choose from */
+
+ /* First we try to find a query thread in the QUE_THR_COMMAND_WAIT
+ state. Then we try to find a query thread in the QUE_THR_SUSPENDED
+ state, finally we try to find a query thread in the QUE_THR_COMPLETED
+ state */
+
+ /* We make a single pass over the thr list within which we note which
+ threads are ready to run. */
+ for (thr = UT_LIST_GET_FIRST(fork->thrs);
+ thr != NULL;
+ thr = UT_LIST_GET_NEXT(thrs, thr)) {
+
+ switch (thr->state) {
+ case QUE_THR_COMMAND_WAIT:
+
+ /* We have to send the initial message to query thread
+ to start it */
+
+ que_thr_init_command(thr);
+
+ return(thr);
+
+ case QUE_THR_SUSPENDED:
+ /* In this case the execution of the thread was
+ suspended: no initial message is needed because
+ execution can continue from where it was left */
+ if (!suspended_thr) {
+ suspended_thr = thr;
+ }
+
+ break;
+
+ case QUE_THR_COMPLETED:
+ if (!completed_thr) {
+ completed_thr = thr;
+ }
+
+ break;
+
+ case QUE_THR_RUNNING:
+ case QUE_THR_LOCK_WAIT:
+ ut_error;
+ }
+ }
+
+ if (suspended_thr) {
+ thr = suspended_thr;
+ thr->start_running();
+ } else if (completed_thr) {
+ thr = completed_thr;
+ que_thr_init_command(thr);
+ } else {
+ ut_error;
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Calls que_graph_free_recursive for statements in a statement list. */
+static
+void
+que_graph_free_stat_list(
+/*=====================*/
+ que_node_t* node) /*!< in: first query graph node in the list */
+{
+ while (node) {
+ que_graph_free_recursive(node);
+
+ node = que_node_get_next(node);
+ }
+}
+
+/**********************************************************************//**
+Frees a query graph, but not the heap where it was created. Does not free
+explicit cursor declarations, they are freed in que_graph_free. */
+void
+que_graph_free_recursive(
+/*=====================*/
+ que_node_t* node) /*!< in: query graph node */
+{
+ que_fork_t* fork;
+ que_thr_t* thr;
+ undo_node_t* undo;
+ sel_node_t* sel;
+ ins_node_t* ins;
+ upd_node_t* upd;
+ tab_node_t* cre_tab;
+ ind_node_t* cre_ind;
+ purge_node_t* purge;
+
+ DBUG_ENTER("que_graph_free_recursive");
+
+ if (node == NULL) {
+
+ DBUG_VOID_RETURN;
+ }
+
+ DBUG_PRINT("que_graph_free_recursive",
+ ("node: %p, type: " ULINTPF, node,
+ que_node_get_type(node)));
+
+ switch (que_node_get_type(node)) {
+
+ case QUE_NODE_FORK:
+ fork = static_cast<que_fork_t*>(node);
+
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+
+ while (thr) {
+ que_graph_free_recursive(thr);
+
+ thr = UT_LIST_GET_NEXT(thrs, thr);
+ }
+
+ break;
+ case QUE_NODE_THR:
+ thr = static_cast<que_thr_t*>(node);
+ que_graph_free_recursive(thr->child);
+ break;
+ case QUE_NODE_UNDO:
+
+ undo = static_cast<undo_node_t*>(node);
+
+ mem_heap_free(undo->heap);
+
+ break;
+ case QUE_NODE_SELECT:
+
+ sel = static_cast<sel_node_t*>(node);
+
+ sel_node_free_private(sel);
+
+ break;
+ case QUE_NODE_INSERT:
+
+ ins = static_cast<ins_node_t*>(node);
+
+ que_graph_free_recursive(ins->select);
+ ins->select = NULL;
+
+ ins->~ins_node_t();
+
+ if (ins->entry_sys_heap != NULL) {
+ mem_heap_free(ins->entry_sys_heap);
+ ins->entry_sys_heap = NULL;
+ }
+
+ break;
+ case QUE_NODE_PURGE:
+ purge = static_cast<purge_node_t*>(node);
+
+ mem_heap_free(purge->heap);
+
+ purge->~purge_node_t();
+ break;
+
+ case QUE_NODE_UPDATE:
+ upd = static_cast<upd_node_t*>(node);
+
+ if (upd->in_mysql_interface) {
+
+ btr_pcur_free_for_mysql(upd->pcur);
+ upd->in_mysql_interface = false;
+ }
+
+ que_graph_free_recursive(upd->cascade_node);
+
+ if (upd->cascade_heap) {
+ mem_heap_free(upd->cascade_heap);
+ upd->cascade_heap = NULL;
+ }
+
+ que_graph_free_recursive(upd->select);
+ upd->select = NULL;
+
+ if (upd->heap != NULL) {
+ mem_heap_free(upd->heap);
+ upd->heap = NULL;
+ }
+
+ break;
+ case QUE_NODE_CREATE_TABLE:
+ cre_tab = static_cast<tab_node_t*>(node);
+
+ que_graph_free_recursive(cre_tab->tab_def);
+ que_graph_free_recursive(cre_tab->col_def);
+ que_graph_free_recursive(cre_tab->v_col_def);
+
+ mem_heap_free(cre_tab->heap);
+
+ break;
+ case QUE_NODE_CREATE_INDEX:
+ cre_ind = static_cast<ind_node_t*>(node);
+
+ que_graph_free_recursive(cre_ind->ind_def);
+ que_graph_free_recursive(cre_ind->field_def);
+
+ mem_heap_free(cre_ind->heap);
+
+ break;
+ case QUE_NODE_PROC:
+ que_graph_free_stat_list(((proc_node_t*) node)->stat_list);
+
+ break;
+ case QUE_NODE_IF:
+ que_graph_free_stat_list(((if_node_t*) node)->stat_list);
+ que_graph_free_stat_list(((if_node_t*) node)->else_part);
+ que_graph_free_stat_list(((if_node_t*) node)->elsif_list);
+
+ break;
+ case QUE_NODE_ELSIF:
+ que_graph_free_stat_list(((elsif_node_t*) node)->stat_list);
+
+ break;
+ case QUE_NODE_WHILE:
+ que_graph_free_stat_list(((while_node_t*) node)->stat_list);
+
+ break;
+ case QUE_NODE_FOR:
+ que_graph_free_stat_list(((for_node_t*) node)->stat_list);
+
+ break;
+
+ case QUE_NODE_ASSIGNMENT:
+ case QUE_NODE_EXIT:
+ case QUE_NODE_RETURN:
+ case QUE_NODE_COMMIT:
+ case QUE_NODE_ROLLBACK:
+ case QUE_NODE_LOCK:
+ case QUE_NODE_FUNC:
+ case QUE_NODE_ORDER:
+ case QUE_NODE_ROW_PRINTF:
+ case QUE_NODE_OPEN:
+ case QUE_NODE_FETCH:
+ /* No need to do anything */
+
+ break;
+ default:
+ ut_error;
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/**********************************************************************//**
+Frees a query graph. */
+void
+que_graph_free(
+/*===========*/
+ que_t* graph) /*!< in: query graph; we assume that the memory
+ heap where this graph was created is private
+ to this graph: if not, then use
+ que_graph_free_recursive and free the heap
+ afterwards! */
+{
+ ut_ad(graph);
+
+ if (graph->sym_tab) {
+ /* The following call frees dynamic memory allocated
+ for variables etc. during execution. Frees also explicit
+ cursor definitions. */
+
+ sym_tab_free_private(graph->sym_tab);
+ }
+
+ if (graph->info && graph->info->graph_owns_us) {
+ pars_info_free(graph->info);
+ }
+
+ que_graph_free_recursive(graph);
+
+ mem_heap_free(graph->heap);
+}
+
+/****************************************************************//**
+Performs an execution step on a thr node.
+@return query thread to run next, or NULL if none */
+static
+que_thr_t*
+que_thr_node_step(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread where run_node must
+ be the thread node itself */
+{
+ ut_ad(thr->run_node == thr);
+
+ if (thr->prev_node == thr->common.parent) {
+ /* If control to the node came from above, it is just passed
+ on */
+
+ thr->run_node = thr->child;
+
+ return(thr);
+ }
+
+ trx_mutex_enter(thr_get_trx(thr));
+
+ if (que_thr_peek_stop(thr)) {
+
+ trx_mutex_exit(thr_get_trx(thr));
+
+ return(thr);
+ }
+
+ /* Thread execution completed */
+
+ thr->state = QUE_THR_COMPLETED;
+
+ trx_mutex_exit(thr_get_trx(thr));
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Stops a query thread if graph or trx is in a state requiring it. The
+conditions are tested in the order (1) graph, (2) trx.
+@return TRUE if stopped */
+ibool
+que_thr_stop(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ que_t* graph;
+ trx_t* trx = thr_get_trx(thr);
+
+ graph = thr->graph;
+
+ ut_ad(trx_mutex_own(trx));
+
+ if (graph->state == QUE_FORK_COMMAND_WAIT) {
+
+ thr->state = QUE_THR_SUSPENDED;
+
+ } else if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ trx->lock.wait_thr = thr;
+ thr->state = QUE_THR_LOCK_WAIT;
+
+ } else if (trx->error_state != DB_SUCCESS
+ && trx->error_state != DB_LOCK_WAIT) {
+
+ /* Error handling built for the MySQL interface */
+ thr->state = QUE_THR_COMPLETED;
+
+ } else if (graph->fork_type == QUE_FORK_ROLLBACK) {
+
+ thr->state = QUE_THR_SUSPENDED;
+ } else {
+ ut_ad(graph->state == QUE_FORK_ACTIVE);
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decrements the query thread reference counts in the query graph and the
+transaction.
+*** NOTE ***:
+This and que_thr_stop_for_mysql are the only functions where the reference
+count can be decremented and this function may only be called from inside
+que_run_threads! These restrictions exist to make the rollback code easier
+to maintain. */
+static
+void
+que_thr_dec_refer_count(
+/*====================*/
+ que_thr_t* thr, /*!< in: query thread */
+ que_thr_t** next_thr) /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+{
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ ut_a(thr->is_active);
+ ut_ad(trx_mutex_own(trx));
+
+ if (thr->state == QUE_THR_RUNNING) {
+
+ if (!que_thr_stop(thr)) {
+
+ ut_a(next_thr != NULL && *next_thr == NULL);
+
+ /* The reason for the thr suspension or wait was
+ already canceled before we came here: continue
+ running the thread.
+
+ This is also possible because in trx_commit_step() we
+ assume a single query thread. We set the query thread
+ state to QUE_THR_RUNNING. */
+
+ /* fprintf(stderr,
+ "Wait already ended: trx: %p\n", trx); */
+
+ /* Normally srv_suspend_mysql_thread resets
+ the state to DB_SUCCESS before waiting, but
+ in this case we have to do it here,
+ otherwise nobody does it. */
+
+ trx->error_state = DB_SUCCESS;
+
+ *next_thr = thr;
+
+ return;
+ }
+ }
+
+ ut_d(static_cast<que_fork_t*>(thr->common.parent)->set_active(false));
+ thr->is_active = false;
+}
+
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread. The query thread
+is stopped and made inactive, except in the case where it was put into
+the lock wait state in lock0lock.cc but the lock has already been granted
+or the transaction was chosen as a victim in deadlock resolution. */
+void
+que_thr_stop_for_mysql(
+/*===================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ trx_mutex_enter(trx);
+
+ if (thr->state == QUE_THR_RUNNING) {
+
+ if (trx->error_state != DB_SUCCESS
+ && trx->error_state != DB_LOCK_WAIT) {
+
+ /* Error handling built for the MySQL interface */
+ thr->state = QUE_THR_COMPLETED;
+ } else {
+ /* It must have been a lock wait but the lock was
+ already released, or this transaction was chosen
+ as a victim in selective deadlock resolution */
+
+ trx_mutex_exit(trx);
+
+ return;
+ }
+ }
+
+ ut_ad(thr->is_active);
+ ut_d(thr->set_active(false));
+ thr->is_active= false;
+
+ trx_mutex_exit(trx);
+}
+
+#ifdef UNIV_DEBUG
+/** Change the 'active' status */
+void que_fork_t::set_active(bool active)
+{
+ if (active)
+ {
+ n_active_thrs++;
+ trx->lock.n_active_thrs++;
+ }
+ else
+ {
+ ut_ad(n_active_thrs);
+ ut_ad(trx->lock.n_active_thrs);
+ n_active_thrs--;
+ trx->lock.n_active_thrs--;
+ }
+}
+#endif
+
+/****************************************************************//**
+Get the first containing loop node (e.g. while_node_t or for_node_t) for the
+given node, or NULL if the node is not within a loop.
+@return containing loop node, or NULL. */
+que_node_t*
+que_node_get_containing_loop_node(
+/*==============================*/
+ que_node_t* node) /*!< in: node */
+{
+ ut_ad(node);
+
+ for (;;) {
+ ulint type;
+
+ node = que_node_get_parent(node);
+
+ if (!node) {
+ break;
+ }
+
+ type = que_node_get_type(node);
+
+ if ((type == QUE_NODE_FOR) || (type == QUE_NODE_WHILE)) {
+ break;
+ }
+ }
+
+ return(node);
+}
+
+#ifndef DBUG_OFF
+/** Gets information of an SQL query graph node.
+@return type description */
+static MY_ATTRIBUTE((warn_unused_result, nonnull))
+const char*
+que_node_type_string(
+/*=================*/
+ const que_node_t* node) /*!< in: query graph node */
+{
+ switch (que_node_get_type(node)) {
+ case QUE_NODE_SELECT:
+ return("SELECT");
+ case QUE_NODE_INSERT:
+ return("INSERT");
+ case QUE_NODE_UPDATE:
+ return("UPDATE");
+ case QUE_NODE_WHILE:
+ return("WHILE");
+ case QUE_NODE_ASSIGNMENT:
+ return("ASSIGNMENT");
+ case QUE_NODE_IF:
+ return("IF");
+ case QUE_NODE_FETCH:
+ return("FETCH");
+ case QUE_NODE_OPEN:
+ return("OPEN");
+ case QUE_NODE_PROC:
+ return("STORED PROCEDURE");
+ case QUE_NODE_FUNC:
+ return("FUNCTION");
+ case QUE_NODE_LOCK:
+ return("LOCK");
+ case QUE_NODE_THR:
+ return("QUERY THREAD");
+ case QUE_NODE_COMMIT:
+ return("COMMIT");
+ case QUE_NODE_UNDO:
+ return("UNDO ROW");
+ case QUE_NODE_PURGE:
+ return("PURGE ROW");
+ case QUE_NODE_ROLLBACK:
+ return("ROLLBACK");
+ case QUE_NODE_CREATE_TABLE:
+ return("CREATE TABLE");
+ case QUE_NODE_CREATE_INDEX:
+ return("CREATE INDEX");
+ case QUE_NODE_FOR:
+ return("FOR LOOP");
+ case QUE_NODE_RETURN:
+ return("RETURN");
+ case QUE_NODE_EXIT:
+ return("EXIT");
+ default:
+ ut_ad(0);
+ return("UNKNOWN NODE TYPE");
+ }
+}
+#endif /* !DBUG_OFF */
+
+/**********************************************************************//**
+Performs an execution step on a query thread.
+@return query thread to run next: it may differ from the input
+parameter if, e.g., a subprocedure call is made */
+UNIV_INLINE
+que_thr_t*
+que_thr_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ que_node_t* node;
+ que_thr_t* old_thr;
+ trx_t* trx;
+ ulint type;
+
+ trx = thr_get_trx(thr);
+
+ ut_ad(thr->state == QUE_THR_RUNNING);
+ ut_a(trx->error_state == DB_SUCCESS);
+
+ thr->resource++;
+
+ node = thr->run_node;
+ type = que_node_get_type(node);
+
+ old_thr = thr;
+
+ DBUG_PRINT("ib_que", ("Execute %u (%s) at %p",
+ unsigned(type), que_node_type_string(node),
+ (const void*) node));
+
+ if (type & QUE_NODE_CONTROL_STAT) {
+ if ((thr->prev_node != que_node_get_parent(node))
+ && que_node_get_next(thr->prev_node)) {
+
+ /* The control statements, like WHILE, always pass the
+ control to the next child statement if there is any
+ child left */
+
+ thr->run_node = que_node_get_next(thr->prev_node);
+
+ } else if (type == QUE_NODE_IF) {
+ if_step(thr);
+ } else if (type == QUE_NODE_FOR) {
+ for_step(thr);
+ } else if (type == QUE_NODE_PROC) {
+ if (thr->prev_node == que_node_get_parent(node)) {
+ trx->last_sql_stat_start.least_undo_no
+ = trx->undo_no;
+ }
+
+ proc_step(thr);
+ } else if (type == QUE_NODE_WHILE) {
+ while_step(thr);
+ } else {
+ ut_error;
+ }
+ } else if (type == QUE_NODE_ASSIGNMENT) {
+ assign_step(thr);
+ } else if (type == QUE_NODE_SELECT) {
+ thr = row_sel_step(thr);
+ } else if (type == QUE_NODE_INSERT) {
+ trx_start_if_not_started_xa(thr_get_trx(thr), true);
+ thr = row_ins_step(thr);
+ } else if (type == QUE_NODE_UPDATE) {
+ trx_start_if_not_started_xa(thr_get_trx(thr), true);
+ thr = row_upd_step(thr);
+ } else if (type == QUE_NODE_FETCH) {
+ thr = fetch_step(thr);
+ } else if (type == QUE_NODE_OPEN) {
+ thr = open_step(thr);
+ } else if (type == QUE_NODE_FUNC) {
+ proc_eval_step(thr);
+
+ } else if (type == QUE_NODE_LOCK) {
+
+ ut_error;
+ } else if (type == QUE_NODE_THR) {
+ thr = que_thr_node_step(thr);
+ } else if (type == QUE_NODE_COMMIT) {
+ thr = trx_commit_step(thr);
+ } else if (type == QUE_NODE_UNDO) {
+ thr = row_undo_step(thr);
+ } else if (type == QUE_NODE_PURGE) {
+ thr = row_purge_step(thr);
+ } else if (type == QUE_NODE_RETURN) {
+ thr = return_step(thr);
+ } else if (type == QUE_NODE_EXIT) {
+ thr = exit_step(thr);
+ } else if (type == QUE_NODE_ROLLBACK) {
+ thr = trx_rollback_step(thr);
+ } else if (type == QUE_NODE_CREATE_TABLE) {
+ thr = dict_create_table_step(thr);
+ } else if (type == QUE_NODE_CREATE_INDEX) {
+ thr = dict_create_index_step(thr);
+ } else if (type == QUE_NODE_ROW_PRINTF) {
+ thr = row_printf_step(thr);
+ } else {
+ ut_error;
+ }
+
+ if (type == QUE_NODE_EXIT) {
+ old_thr->prev_node = que_node_get_containing_loop_node(node);
+ } else {
+ old_thr->prev_node = node;
+ }
+
+ if (thr) {
+ ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Run a query thread until it finishes or encounters e.g. a lock wait. */
+static
+void
+que_run_threads_low(
+/*================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ que_thr_t* next_thr;
+
+ ut_ad(thr->state == QUE_THR_RUNNING);
+ ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
+ ut_ad(!trx_mutex_own(thr_get_trx(thr)));
+
+ /* cumul_resource counts how many resources the OS thread (NOT the
+ query thread) has spent in this function */
+
+ trx = thr_get_trx(thr);
+
+ do {
+ /* Check that there is enough space in the log to accommodate
+ possible log entries by this query step; if the operation can
+ touch more than about 4 pages, checks must be made also within
+ the query step! */
+
+ log_free_check();
+
+ /* Perform the actual query step: note that the query thread
+ may change if, e.g., a subprocedure call is made */
+
+ /*-------------------------*/
+ next_thr = que_thr_step(thr);
+ /*-------------------------*/
+
+ trx_mutex_enter(trx);
+
+ ut_a(next_thr == NULL || trx->error_state == DB_SUCCESS);
+
+ if (next_thr != thr) {
+ ut_a(next_thr == NULL);
+
+ /* This can change next_thr to a non-NULL value
+ if there was a lock wait that already completed. */
+
+ que_thr_dec_refer_count(thr, &next_thr);
+
+ if (next_thr != NULL) {
+
+ thr = next_thr;
+ }
+ }
+
+ ut_ad(trx == thr_get_trx(thr));
+
+ trx_mutex_exit(trx);
+
+ } while (next_thr != NULL);
+}
+
+/**********************************************************************//**
+Run a query thread. Handles lock waits. */
+void
+que_run_threads(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(!trx_mutex_own(thr_get_trx(thr)));
+
+loop:
+ ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
+
+ que_run_threads_low(thr);
+
+ switch (thr->state) {
+
+ case QUE_THR_RUNNING:
+ /* There probably was a lock wait, but it already ended
+ before we came here: continue running thr */
+
+ goto loop;
+
+ case QUE_THR_LOCK_WAIT:
+ lock_wait_suspend_thread(thr);
+
+ trx_mutex_enter(thr_get_trx(thr));
+
+ ut_a(thr_get_trx(thr)->id != 0);
+
+ if (thr_get_trx(thr)->error_state != DB_SUCCESS) {
+ /* thr was chosen as a deadlock victim or there was
+ a lock wait timeout */
+
+ que_thr_dec_refer_count(thr, NULL);
+ trx_mutex_exit(thr_get_trx(thr));
+ break;
+ }
+
+ trx_mutex_exit(thr_get_trx(thr));
+ goto loop;
+
+ case QUE_THR_COMPLETED:
+ case QUE_THR_COMMAND_WAIT:
+ /* Do nothing */
+ break;
+
+ default:
+ ut_error;
+ }
+}
+
+/*********************************************************************//**
+Evaluate the given SQL.
+@return error code or DB_SUCCESS */
+dberr_t
+que_eval_sql(
+/*=========*/
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql, /*!< in: SQL string */
+ bool reserve_dict_mutex,
+ /*!< in: whether to acquire/release
+ dict_sys.mutex around call to pars_sql. */
+ trx_t* trx) /*!< in: trx */
+{
+ que_thr_t* thr;
+ que_t* graph;
+
+ DBUG_ENTER("que_eval_sql");
+ DBUG_PRINT("que_eval_sql", ("query: %s", sql));
+
+ ut_a(trx->error_state == DB_SUCCESS);
+
+ if (reserve_dict_mutex) {
+ mutex_enter(&dict_sys.mutex);
+ }
+
+ graph = pars_sql(info, sql);
+
+ if (reserve_dict_mutex) {
+ mutex_exit(&dict_sys.mutex);
+ }
+
+ graph->trx = trx;
+ trx->graph = NULL;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ ut_a(thr = que_fork_start_command(graph));
+
+ que_run_threads(thr);
+
+ if (reserve_dict_mutex) {
+ mutex_enter(&dict_sys.mutex);
+ }
+
+ que_graph_free(graph);
+
+ if (reserve_dict_mutex) {
+ mutex_exit(&dict_sys.mutex);
+ }
+
+ DBUG_RETURN(trx->error_state);
+}
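+
+/* A usage sketch (illustration only, not part of the original source):
+callers typically bind literals through a pars_info_t and then run a
+procedure. The table name SYS_DUMMY and the binding name "id" are made
+up for this example:
+
+	pars_info_t* info = pars_info_create();
+	pars_info_add_ull_literal(info, "id", 42);
+	dberr_t err = que_eval_sql(info,
+		"PROCEDURE P () IS\n"
+		"BEGIN\n"
+		"DELETE FROM SYS_DUMMY WHERE ID = :id;\n"
+		"END;\n",
+		true,	// acquire dict_sys.mutex around pars_sql()
+		trx);
+
+The graph (and normally the pars_info_t with it) is freed before
+que_eval_sql() returns. */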
diff --git a/storage/innobase/read/read0read.cc b/storage/innobase/read/read0read.cc
new file mode 100644
index 00000000..9047618d
--- /dev/null
+++ b/storage/innobase/read/read0read.cc
@@ -0,0 +1,252 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file read/read0read.cc
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#include "read0types.h"
+
+#include "srv0srv.h"
+#include "trx0sys.h"
+#include "trx0purge.h"
+
+/*
+-------------------------------------------------------------------------------
+FACT A: A cursor read view on a secondary index sees only committed versions
+-------
+of the records in the secondary index, or those versions of rows created
+by the transaction which created the cursor before the cursor was created,
+even if the transaction which created the cursor has since changed that
+clustered index page.
+
+PROOF: We must show that the read always goes to the clustered index record
+to check that the record is visible in the cursor read view. Consider e.g.
+the following table and SQL statements:
+
+create table t1(a int not null, b int, primary key(a), index(b));
+insert into t1 values (1,1),(2,2);
+commit;
+
+Now consider that we have a cursor for a query
+
+select b from t1 where b >= 1;
+
+This query will use the secondary key on table t1. If, after the first fetch
+on this cursor, we do an update:
+
+update t1 set b = 5 where b = 2;
+
+then the second fetch of the cursor should not see record (2,5); instead it
+should see record (2,2).
+
+We should also show that after executing delete from t1 where b = 5; we can
+still see record (2,2).
+
+When we access a secondary key record, the maximum transaction id is
+fetched from its page and compared to up_limit_id in the view. If it is
+greater than or equal to up_limit_id in the view, the clustered record
+is accessed. Because the trx_id of the creating transaction was stored,
+when this view was created, in the list of trx_ids not seen by this read
+view, a previous version of the record is requested to be built. This is
+built using the clustered record. If the secondary key record is
+delete-marked, its corresponding clustered record can already have been
+purged only if the record's trx_id < low_limit_no. Purge can't remove
+any record deleted by a transaction which was active when the cursor was
+created. We may still have a delete-marked secondary key record without
+a clustered record; this is not a problem, because this case is handled
+in the row_sel_get_clust_rec() function, which is called whenever we
+note that this read view does not see the trx_id in the record. Thus,
+we see the correct version. Q.E.D.
+
+-------------------------------------------------------------------------------
+FACT B: A cursor read view on a clustered index sees only committed versions
+-------
+of the records in the clustered index, or those versions of rows created
+by the transaction which created the cursor before the cursor was created,
+even if the transaction which created the cursor has since changed that
+clustered index page.
+
+PROOF: Consider e.g. the following table and SQL statements:
+
+create table t1(a int not null, primary key(a));
+insert into t1 values (1),(2);
+commit;
+
+Now consider that we have a cursor for a query
+
+select a from t1 where a >= 1;
+
+This query will use the clustered key on table t1. If, after the first fetch
+on this cursor, we do an update:
+
+update t1 set a = 5 where a = 2;
+
+then the second fetch of the cursor should not see record (5); instead it
+should see record (2).
+
+We should also show that after executing delete from t1 where a = 5; once
+the cursor was opened, we can still see record (2).
+
+When accessing a clustered record we always check if this read view sees
+the trx_id stored in the clustered record. By default we don't see any
+changes if the record trx_id >= low_limit_id, i.e. the change was made
+by a transaction which started after the transaction which created the
+cursor. If the row was changed by such a future transaction, a previous
+version of the clustered record is created. Thus we see only a committed
+version in this case. We see all changes made by committed transactions,
+i.e. record trx_id < up_limit_id; in this case we don't need to do
+anything, we already see the correct version of the record. We don't see
+any changes made by active transactions except the creating transaction.
+We have stored the trx_id of the creating transaction in the list of
+trx_ids when this view was created; thus we can easily see if this
+record was changed by the creating transaction. Because we already have
+the clustered record, we can access its roll_ptr and use it to fetch the
+undo record. We can then check that the undo_no of the undo record is
+less than the undo_no of the transaction which created the view when the
+cursor was created. We see this clustered record only when the record
+undo_no is less than the undo_no in the view. If this is not true, we
+build a previous version of the record based on the undo_rec. This
+record is found because purge can't remove records accessed by an active
+transaction. Thus we see the correct version. Q.E.D.
+-------------------------------------------------------------------------------
+FACT C: Purge does not remove any delete-marked row that is visible
+-------
+in any cursor read view.
+
+PROOF: We know that:
+ 1: Currently active read views in trx_sys_t::view_list are ordered by
+ ReadView::low_limit_no in descending order, that is,
+ newest read view first.
+
+ 2: Purge clones the oldest read view and uses that to determine whether there
+ are any active transactions that can see the to-be-purged records.
+
+Therefore any joining or active transaction will not have a view older
+than the purge view, according to 1.
+
+When purge needs to remove a delete-marked row from a secondary index,
+it will first check that the DB_TRX_ID value of the corresponding
+record in the clustered index is older than the purge view. It will
+also check if there is a newer version of the row (clustered index
+record) that is not delete-marked in the secondary index. If such a
+row exists and is collation-equal to the delete-marked secondary index
+record then purge will not remove the secondary index record.
+
+Delete-marked clustered index records will be removed by
+row_purge_remove_clust_if_poss(), unless the clustered index record
+(and its DB_ROLL_PTR) has been updated. Every new version of the
+clustered index record will update DB_ROLL_PTR, pointing to a new UNDO
+log entry that allows the old version to be reconstructed. The
+DB_ROLL_PTR in the oldest remaining version in the old-version chain
+may be pointing to garbage (an undo log record discarded by purge),
+but it will never be dereferenced, because the purge view is older
+than any active transaction.
+
+For details see: row_vers_old_has_index_entry() and row_purge_poss_sec()
+*/
+
+
+/**
+ Creates a snapshot where exactly the transactions serialized before this
+ point in time are seen in the view.
+
+ @param[in,out] trx transaction
+*/
+inline void ReadViewBase::snapshot(trx_t *trx)
+{
+ trx_sys.snapshot_ids(trx, &m_ids, &m_low_limit_id, &m_low_limit_no);
+ std::sort(m_ids.begin(), m_ids.end());
+ m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front();
+ ut_ad(m_up_limit_id <= m_low_limit_id);
+}
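+
+/* For reference (a sketch, not part of the original source): the limits
+computed above drive the usual MVCC visibility test, which is essentially
+
+	bool sees(trx_id_t id) const
+	{
+		if (id < m_up_limit_id)		// committed before snapshot
+			return true;
+		if (id >= m_low_limit_id)	// started after snapshot
+			return false;
+		// otherwise visible iff not in the sorted m_ids array,
+		// i.e. the transaction was not active at snapshot time
+		return !std::binary_search(m_ids.begin(), m_ids.end(), id);
+	}
+*/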
+
+
+/**
+ Opens a read view where exactly the transactions serialized before this
+ point in time are seen in the view.
+
+ View becomes visible to purge thread.
+
+ @param[in,out] trx transaction
+
+ Reuses closed view if there were no read-write transactions since (and at)
+ its creation time.
+
+ Original comment states: there is an inherent race here between purge
+ and this thread.
+
+ To avoid this race we should've checked trx_sys.get_max_trx_id() and
+ set m_open atomically under ReadView::m_mutex protection. But we're cutting
+ edges to achieve greater performance.
+
+ There are at least two types of concurrent threads interested in this
+ value: purge coordinator thread (see trx_sys_t::clone_oldest_view()) and
+ InnoDB monitor thread (see lock_trx_print_wait_and_mvcc_state()).
+
+ What bad things can happen because we allow this race?
+
+ Speculative execution may reorder the state change before get_max_trx_id().
+ In this case the purge thread has a short gap in which it can clone an
+ outdated view, which is probably not that bad: it just won't be able to
+ purge things that it was actually allowed to purge for a short while.
+
+ This thread may as well get suspended after trx_sys.get_max_trx_id() and
+ before m_open is set. A new read-write transaction may get started,
+ committed and purged meanwhile. That is acceptable as well, since this
+ view doesn't see it.
+*/
+void ReadView::open(trx_t *trx)
+{
+ ut_ad(this == &trx->read_view);
+ if (is_open())
+ ut_ad(!srv_read_only_mode);
+ else if (likely(!srv_read_only_mode))
+ {
+ m_creator_trx_id= trx->id;
+ if (trx->is_autocommit_non_locking() && empty() &&
+ low_limit_id() == trx_sys.get_max_trx_id())
+ m_open.store(true, std::memory_order_relaxed);
+ else
+ {
+ mutex_enter(&m_mutex);
+ snapshot(trx);
+ m_open.store(true, std::memory_order_relaxed);
+ mutex_exit(&m_mutex);
+ }
+ }
+}
+
+
+/**
+ Clones the oldest view and stores it in view.
+
+ No need to call ReadView::close(). The caller owns the view that is passed
+ in. This function is called by purge thread to determine whether it should
+ purge the delete marked record or not.
+*/
+void trx_sys_t::clone_oldest_view(ReadViewBase *view) const
+{
+ view->snapshot(nullptr);
+ /* Find oldest view. */
+ trx_list.for_each([view](const trx_t &trx) {
+ trx.read_view.append_to(view);
+ });
+}
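+
+/* A usage sketch (illustration only; the local variable is made up,
+purge_sys.view is the real consumer in MariaDB):
+
+	ReadViewBase oldest;
+	trx_sys.clone_oldest_view(&oldest);
+	// undo logs of transactions whose trx_no is below
+	// oldest.low_limit_no() are invisible to every open view
+	// and may be purged
+*/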
diff --git a/storage/innobase/rem/rem0cmp.cc b/storage/innobase/rem/rem0cmp.cc
new file mode 100644
index 00000000..70c0255d
--- /dev/null
+++ b/storage/innobase/rem/rem0cmp.cc
@@ -0,0 +1,1005 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file rem/rem0cmp.cc
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#include "rem0cmp.h"
+#include "rem0rec.h"
+#include "page0page.h"
+#include "dict0mem.h"
+#include "handler0alter.h"
+
+/* ALPHABETICAL ORDER
+ ==================
+
+The records are put into alphabetical order in the following
+way: let F be the first field where two records disagree.
+If there is a character in some position n where the
+records disagree, the order is determined by comparison of
+the characters at position n, possibly after
+a collating transformation. If there is no such character,
+but the corresponding fields have different lengths, then
+if the data type of the fields is paddable, the
+shorter field is padded with a padding character. If the
+data type is not paddable, the longer field is considered
+greater. Finally, SQL null is defined to be smaller than
+any other value (see cmp_data() below).
+
+At present, the comparison functions return 0 when two
+records disagree only in that one has more fields than
+the other. */
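+
+/* For example (a sketch, not part of the original source): under these
+rules the paddable latin1 values "ab" and "ab " compare equal, while as
+binary values the longer one is greater:
+
+	static const byte a[] = { 'a', 'b' };
+	static const byte b[] = { 'a', 'b', ' ' };
+	// paddable: the trailing space is insignificant, result is 0
+	int eq = cmp_data_data(DATA_CHAR, DATA_ENGLISH, a, 2, b, 3);
+	// not paddable: the longer field is greater, result is negative
+	int lt = cmp_data_data(DATA_BINARY,
+			       DATA_MYSQL_BINARY_CHARSET_COLL << 16,
+			       a, 2, b, 3);
+*/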
+
+/** Compare two data fields.
+@param[in] prtype precise type
+@param[in] a data field
+@param[in] a_length length of a, in bytes (not UNIV_SQL_NULL)
+@param[in] b data field
+@param[in] b_length length of b, in bytes (not UNIV_SQL_NULL)
+@return positive, 0, negative, if a is greater, equal, less than b,
+respectively */
+UNIV_INLINE
+int
+innobase_mysql_cmp(
+ ulint prtype,
+ const byte* a,
+ ulint a_length,
+ const byte* b,
+ ulint b_length)
+{
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case MYSQL_TYPE_BIT:
+ case MYSQL_TYPE_STRING:
+ case MYSQL_TYPE_VAR_STRING:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ case MYSQL_TYPE_VARCHAR:
+ break;
+ default:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ uint cs_num = (uint) dtype_get_charset_coll(prtype);
+
+ if (CHARSET_INFO* cs = get_charset(cs_num, MYF(MY_WME))) {
+ return(cs->strnncollsp(a, a_length, b, b_length));
+ }
+
+ ib::fatal() << "Unable to find charset-collation " << cs_num;
+ return(0);
+}
+
+/*************************************************************//**
+Returns TRUE if two columns are equal for comparison purposes.
+@return TRUE if the columns are considered equal in comparisons */
+ibool
+cmp_cols_are_equal(
+/*===============*/
+ const dict_col_t* col1, /*!< in: column 1 */
+ const dict_col_t* col2, /*!< in: column 2 */
+ ibool check_charsets)
+ /*!< in: whether to check charsets */
+{
+ if (dtype_is_non_binary_string_type(col1->mtype, col1->prtype)
+ && dtype_is_non_binary_string_type(col2->mtype, col2->prtype)) {
+
+ /* Both are non-binary string types: they can be compared if
+ and only if the charset-collation is the same */
+
+ if (check_charsets) {
+ return(dtype_get_charset_coll(col1->prtype)
+ == dtype_get_charset_coll(col2->prtype));
+ } else {
+ return(TRUE);
+ }
+ }
+
+ if (dtype_is_binary_string_type(col1->mtype, col1->prtype)
+ && dtype_is_binary_string_type(col2->mtype, col2->prtype)) {
+
+ /* Both are binary string types: they can be compared */
+
+ return(TRUE);
+ }
+
+ if (col1->mtype != col2->mtype) {
+
+ return(FALSE);
+ }
+
+ if (col1->mtype == DATA_INT
+ && (col1->prtype & DATA_UNSIGNED)
+ != (col2->prtype & DATA_UNSIGNED)) {
+
+ /* The storage format of an unsigned integer is different
+ from a signed integer: in a signed integer we OR
+ 0x8000... to the value of positive integers. */
+
+ return(FALSE);
+ }
+
+ return(col1->mtype != DATA_INT || col1->len == col2->len);
+}
+
+/** Compare two DATA_DECIMAL (MYSQL_TYPE_DECIMAL) fields.
+TODO: Remove this function. Everything should use MYSQL_TYPE_NEWDECIMAL.
+@param[in] a data field
+@param[in] a_length length of a, in bytes (not UNIV_SQL_NULL)
+@param[in] b data field
+@param[in] b_length length of b, in bytes (not UNIV_SQL_NULL)
+@return positive, 0, negative, if a is greater, equal, less than b,
+respectively */
+static ATTRIBUTE_COLD
+int
+cmp_decimal(const byte* a, ulint a_length, const byte* b, ulint b_length)
+{
+ int swap_flag;
+
+ /* Remove preceding spaces */
+ for (; a_length && *a == ' '; a++, a_length--) { }
+ for (; b_length && *b == ' '; b++, b_length--) { }
+
+ if (*a == '-') {
+ swap_flag = -1;
+
+ if (*b != '-') {
+ return(swap_flag);
+ }
+
+ a++; b++;
+ a_length--;
+ b_length--;
+ } else {
+ swap_flag = 1;
+
+ if (*b == '-') {
+ return(swap_flag);
+ }
+ }
+
+ while (a_length > 0 && (*a == '+' || *a == '0')) {
+ a++; a_length--;
+ }
+
+ while (b_length > 0 && (*b == '+' || *b == '0')) {
+ b++; b_length--;
+ }
+
+ if (a_length != b_length) {
+ if (a_length < b_length) {
+ return(-swap_flag);
+ }
+
+ return(swap_flag);
+ }
+
+ while (a_length > 0 && *a == *b) {
+
+ a++; b++; a_length--;
+ }
+
+ if (a_length == 0) {
+ return(0);
+ }
+
+ if (*a <= *b) {
+ swap_flag = -swap_flag;
+ }
+
+ return(swap_flag);
+}
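+
+/* Usage note (illustrative, not part of the original source): signs,
+leading spaces, '+' and leading zeros are stripped before the digit
+strings are compared, so
+
+	cmp_decimal((const byte*) "12", 2, (const byte*) " +012", 5)
+
+returns 0, while "-2" compares less than "1". */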
+
+/** Compare two data fields.
+@param[in] mtype main type
+@param[in] prtype precise type
+@param[in] data1 data field
+@param[in] len1 length of data1 in bytes, or UNIV_SQL_NULL
+@param[in] data2 data field
+@param[in] len2 length of data2 in bytes, or UNIV_SQL_NULL
+@return the comparison result of data1 and data2
+@retval 0 if data1 is equal to data2
+@retval negative if data1 is less than data2
+@retval positive if data1 is greater than data2 */
+inline
+int
+cmp_data(
+ ulint mtype,
+ ulint prtype,
+ const byte* data1,
+ ulint len1,
+ const byte* data2,
+ ulint len2)
+{
+ ut_ad(len1 != UNIV_SQL_DEFAULT);
+ ut_ad(len2 != UNIV_SQL_DEFAULT);
+
+ if (len1 == UNIV_SQL_NULL || len2 == UNIV_SQL_NULL) {
+ if (len1 == len2) {
+ return(0);
+ }
+
+ /* We define the SQL null to be the smallest possible
+ value of a field. */
+ return(len1 == UNIV_SQL_NULL ? -1 : 1);
+ }
+
+ ulint pad;
+
+ switch (mtype) {
+ default:
+ ib::fatal() << "Unknown data type number " << mtype;
+ case DATA_FIXBINARY:
+ case DATA_BINARY:
+ if (dtype_get_charset_coll(prtype)
+ != DATA_MYSQL_BINARY_CHARSET_COLL) {
+ pad = 0x20;
+ break;
+ }
+ /* fall through */
+ case DATA_INT:
+ case DATA_SYS_CHILD:
+ case DATA_SYS:
+ pad = ULINT_UNDEFINED;
+ break;
+ case DATA_GEOMETRY:
+ ut_ad(prtype & DATA_BINARY_TYPE);
+ if (prtype & DATA_GIS_MBR) {
+ ut_ad(len1 == DATA_MBR_LEN);
+ ut_ad(len2 == DATA_MBR_LEN);
+ return cmp_geometry_field(data1, data2);
+ }
+ pad = ULINT_UNDEFINED;
+ break;
+ case DATA_BLOB:
+ if (prtype & DATA_BINARY_TYPE) {
+ pad = ULINT_UNDEFINED;
+ break;
+ }
+ /* fall through */
+ case DATA_VARMYSQL:
+ case DATA_MYSQL:
+ return innobase_mysql_cmp(prtype, data1, len1, data2, len2);
+ case DATA_VARCHAR:
+ case DATA_CHAR:
+ return my_charset_latin1.strnncollsp(data1, len1, data2, len2);
+ case DATA_DECIMAL:
+ return cmp_decimal(data1, len1, data2, len2);
+ case DATA_DOUBLE:
+ {
+ double d_1 = mach_double_read(data1);
+ double d_2 = mach_double_read(data2);
+
+ if (d_1 > d_2) {
+ return 1;
+ } else if (d_2 > d_1) {
+ return -1;
+ }
+ }
+ return 0;
+
+ case DATA_FLOAT:
+ {
+ float f_1 = mach_float_read(data1);
+ float f_2 = mach_float_read(data2);
+
+ if (f_1 > f_2) {
+ return 1;
+ } else if (f_2 > f_1) {
+ return -1;
+ }
+
+ return 0;
+ }
+ }
+
+ ulint len = std::min(len1, len2);
+ int cmp = len ? memcmp(data1, data2, len) : 0;
+
+ if (cmp) {
+ return (cmp);
+ }
+
+ data1 += len;
+ data2 += len;
+ len1 -= len;
+ len2 -= len;
+
+ cmp = (int) (len1 - len2);
+
+ if (!cmp || pad == ULINT_UNDEFINED) {
+ return(cmp);
+ }
+
+ len = 0;
+
+ if (len1) {
+ do {
+ cmp = static_cast<int>(
+ mach_read_from_1(&data1[len++]) - pad);
+ } while (cmp == 0 && len < len1);
+ } else {
+ ut_ad(len2 > 0);
+
+ do {
+ cmp = static_cast<int>(
+ pad - mach_read_from_1(&data2[len++]));
+ } while (cmp == 0 && len < len2);
+ }
+
+ return(cmp);
+}
+
+/** Compare two data fields.
+@param[in] mtype main type
+@param[in] prtype precise type
+@param[in] data1 data field
+@param[in] len1 length of data1 in bytes, or UNIV_SQL_NULL
+@param[in] data2 data field
+@param[in] len2 length of data2 in bytes, or UNIV_SQL_NULL
+@return the comparison result of data1 and data2
+@retval 0 if data1 is equal to data2
+@retval negative if data1 is less than data2
+@retval positive if data1 is greater than data2 */
+int
+cmp_data_data(
+ ulint mtype,
+ ulint prtype,
+ const byte* data1,
+ ulint len1,
+ const byte* data2,
+ ulint len2)
+{
+ return(cmp_data(mtype, prtype, data1, len1, data2, len2));
+}
+
+/** Compare a data tuple to a physical record.
+@param[in] dtuple data tuple
+@param[in] rec B-tree record
+@param[in] offsets rec_get_offsets(rec)
+@param[in] n_cmp number of fields to compare
+@param[in,out] matched_fields number of completely matched fields
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int
+cmp_dtuple_rec_with_match_low(
+ const dtuple_t* dtuple,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ ulint n_cmp,
+ ulint* matched_fields)
+{
+ ulint cur_field; /* current field number */
+ int ret; /* return value */
+
+ ut_ad(dtuple_check_typed(dtuple));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ cur_field = *matched_fields;
+
+ ut_ad(n_cmp > 0);
+ ut_ad(n_cmp <= dtuple_get_n_fields(dtuple));
+ ut_ad(cur_field <= n_cmp);
+ ut_ad(cur_field <= rec_offs_n_fields(offsets));
+
+ if (cur_field == 0) {
+ ulint rec_info = rec_get_info_bits(rec,
+ rec_offs_comp(offsets));
+ ulint tup_info = dtuple_get_info_bits(dtuple);
+
+ if (UNIV_UNLIKELY(rec_info & REC_INFO_MIN_REC_FLAG)) {
+ ret = !(tup_info & REC_INFO_MIN_REC_FLAG);
+ goto order_resolved;
+ } else if (UNIV_UNLIKELY(tup_info & REC_INFO_MIN_REC_FLAG)) {
+ ret = -1;
+ goto order_resolved;
+ }
+ }
+
+ /* Match fields in a loop */
+
+ for (; cur_field < n_cmp; cur_field++) {
+ const byte* rec_b_ptr;
+ const dfield_t* dtuple_field
+ = dtuple_get_nth_field(dtuple, cur_field);
+ const byte* dtuple_b_ptr
+ = static_cast<const byte*>(
+ dfield_get_data(dtuple_field));
+ const dtype_t* type
+ = dfield_get_type(dtuple_field);
+ ulint dtuple_f_len
+ = dfield_get_len(dtuple_field);
+ ulint rec_f_len;
+
+ /* We should never compare against an externally
+ stored field. Only clustered index records can
+ contain externally stored fields, and the first fields
+ (primary key fields) should already differ. */
+ ut_ad(!rec_offs_nth_extern(offsets, cur_field));
+ /* We should never compare against instantly added columns.
+ Columns can only be instantly added to clustered index
+ leaf page records, and the first fields (primary key fields)
+ should already differ. */
+ ut_ad(!rec_offs_nth_default(offsets, cur_field));
+
+ rec_b_ptr = rec_get_nth_field(rec, offsets, cur_field,
+ &rec_f_len);
+
+ ut_ad(!dfield_is_ext(dtuple_field));
+
+ ret = cmp_data(type->mtype, type->prtype,
+ dtuple_b_ptr, dtuple_f_len,
+ rec_b_ptr, rec_f_len);
+ if (ret) {
+ goto order_resolved;
+ }
+ }
+
+ ret = 0; /* If we ran out of fields, dtuple was equal to rec
+ up to the common fields */
+order_resolved:
+ *matched_fields = cur_field;
+ return(ret);
+}
+
+/** Get the pad character code point for a type.
+@param[in] type data type
+@return pad character code point
+@retval ULINT_UNDEFINED if no padding is specified */
+UNIV_INLINE
+ulint
+cmp_get_pad_char(
+ const dtype_t* type)
+{
+ switch (type->mtype) {
+ case DATA_FIXBINARY:
+ case DATA_BINARY:
+ if (dtype_get_charset_coll(type->prtype)
+ == DATA_MYSQL_BINARY_CHARSET_COLL) {
+ /* Starting from 5.0.18, do not pad
+ VARBINARY or BINARY columns. */
+ return(ULINT_UNDEFINED);
+ }
+ /* Fall through */
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ case DATA_MYSQL:
+ case DATA_VARMYSQL:
+ /* Space is the padding character for all char and binary
+ strings, and starting from 5.0.3, also for TEXT strings. */
+ return(0x20);
+ case DATA_GEOMETRY:
+ /* DATA_GEOMETRY is binary data, not ASCII-based. */
+ return(ULINT_UNDEFINED);
+ case DATA_BLOB:
+ if (!(type->prtype & DATA_BINARY_TYPE)) {
+ return(0x20);
+ }
+ /* Fall through */
+ default:
+ /* No padding specified */
+ return(ULINT_UNDEFINED);
+ }
+}
+
+/** Compare a data tuple to a physical record.
+@param[in] dtuple data tuple
+@param[in] rec B-tree or R-tree index record
+@param[in] index index tree
+@param[in] offsets rec_get_offsets(rec)
+@param[in,out] matched_fields number of completely matched fields
+@param[in,out] matched_bytes number of matched bytes in the first
+field that is not matched
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int
+cmp_dtuple_rec_with_match_bytes(
+ const dtuple_t* dtuple,
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ ulint* matched_fields,
+ ulint* matched_bytes)
+{
+ ut_ad(dtuple_check_typed(dtuple));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!(REC_INFO_MIN_REC_FLAG
+ & dtuple_get_info_bits(dtuple)));
+
+ if (UNIV_UNLIKELY(REC_INFO_MIN_REC_FLAG
+ & rec_get_info_bits(rec, rec_offs_comp(offsets)))) {
+ ut_ad(page_rec_is_first(rec, page_align(rec)));
+ ut_ad(!page_has_prev(page_align(rec)));
+ ut_ad(rec_is_metadata(rec, *index));
+ return 1;
+ }
+
+ ulint cur_field = *matched_fields;
+ ulint cur_bytes = *matched_bytes;
+ ulint n_cmp = dtuple_get_n_fields_cmp(dtuple);
+ int ret;
+
+ ut_ad(n_cmp <= dtuple_get_n_fields(dtuple));
+ ut_ad(cur_field <= n_cmp);
+ ut_ad(cur_field + (cur_bytes > 0) <= rec_offs_n_fields(offsets));
+
+ /* Match fields in a loop; stop if we run out of fields in dtuple
+ or find an externally stored field */
+
+ while (cur_field < n_cmp) {
+ const dfield_t* dfield = dtuple_get_nth_field(
+ dtuple, cur_field);
+ const dtype_t* type = dfield_get_type(dfield);
+ ulint dtuple_f_len = dfield_get_len(dfield);
+ const byte* dtuple_b_ptr;
+ const byte* rec_b_ptr;
+ ulint rec_f_len;
+
+ dtuple_b_ptr = static_cast<const byte*>(
+ dfield_get_data(dfield));
+
+ ut_ad(!rec_offs_nth_default(offsets, cur_field));
+ rec_b_ptr = rec_get_nth_field(rec, offsets,
+ cur_field, &rec_f_len);
+ ut_ad(!rec_offs_nth_extern(offsets, cur_field));
+
+		/* If we have matched 0 bytes so far, it may be that one or
+		both of the fields are SQL NULL, or the record or dtuple
+		may be the predefined minimum record. */
+ if (cur_bytes == 0) {
+ if (dtuple_f_len == UNIV_SQL_NULL) {
+ if (rec_f_len == UNIV_SQL_NULL) {
+
+ goto next_field;
+ }
+
+ ret = -1;
+ goto order_resolved;
+ } else if (rec_f_len == UNIV_SQL_NULL) {
+ /* We define the SQL null to be the
+ smallest possible value of a field
+ in the alphabetical order */
+
+ ret = 1;
+ goto order_resolved;
+ }
+ }
+
+ switch (type->mtype) {
+ case DATA_FIXBINARY:
+ case DATA_BINARY:
+ case DATA_INT:
+ case DATA_SYS_CHILD:
+ case DATA_SYS:
+ break;
+ case DATA_BLOB:
+ if (type->prtype & DATA_BINARY_TYPE) {
+ break;
+ }
+ /* fall through */
+ default:
+ ret = cmp_data(type->mtype, type->prtype,
+ dtuple_b_ptr, dtuple_f_len,
+ rec_b_ptr, rec_f_len);
+
+ if (!ret) {
+ goto next_field;
+ }
+
+ cur_bytes = 0;
+ goto order_resolved;
+ }
+
+ /* Set the pointers at the current byte */
+
+ rec_b_ptr += cur_bytes;
+ dtuple_b_ptr += cur_bytes;
+		/* Then compare the fields byte by byte */
+
+ for (const ulint pad = cmp_get_pad_char(type);;
+ cur_bytes++) {
+ ulint rec_byte = pad;
+ ulint dtuple_byte = pad;
+
+ if (rec_f_len <= cur_bytes) {
+ if (dtuple_f_len <= cur_bytes) {
+
+ goto next_field;
+ }
+
+ if (rec_byte == ULINT_UNDEFINED) {
+ ret = 1;
+
+ goto order_resolved;
+ }
+ } else {
+ rec_byte = *rec_b_ptr++;
+ }
+
+ if (dtuple_f_len <= cur_bytes) {
+ if (dtuple_byte == ULINT_UNDEFINED) {
+ ret = -1;
+
+ goto order_resolved;
+ }
+ } else {
+ dtuple_byte = *dtuple_b_ptr++;
+ }
+
+ if (dtuple_byte < rec_byte) {
+ ret = -1;
+ goto order_resolved;
+ } else if (dtuple_byte > rec_byte) {
+ ret = 1;
+ goto order_resolved;
+ }
+ }
+
+next_field:
+ cur_field++;
+ cur_bytes = 0;
+ }
+
+ ut_ad(cur_bytes == 0);
+
+ ret = 0; /* If we ran out of fields, dtuple was equal to rec
+ up to the common fields */
+order_resolved:
+ *matched_fields = cur_field;
+ *matched_bytes = cur_bytes;
+
+ return(ret);
+}
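+
+/* Note: the (matched_fields, matched_bytes) output pair allows a
+caller that probes successive records, such as a binary search within
+an index page, to resume the comparison from the first byte that is
+not yet known to match, instead of re-comparing the whole prefix. */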
+
+/** Compare a data tuple to a physical record.
+@see cmp_dtuple_rec_with_match
+@param[in] dtuple data tuple
+@param[in] rec B-tree record
+@param[in]	offsets		rec_get_offsets(rec)
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int
+cmp_dtuple_rec(
+ const dtuple_t* dtuple,
+ const rec_t* rec,
+ const rec_offs* offsets)
+{
+ ulint matched_fields = 0;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ return(cmp_dtuple_rec_with_match(dtuple, rec, offsets,
+ &matched_fields));
+}
+
+/**************************************************************//**
+Checks if a dtuple is a prefix of a record. The last field in dtuple
+is allowed to be a prefix of the corresponding field in the record.
+@return TRUE if prefix */
+ibool
+cmp_dtuple_is_prefix_of_rec(
+/*========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n_fields;
+ ulint matched_fields = 0;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ n_fields = dtuple_get_n_fields(dtuple);
+
+ if (n_fields > rec_offs_n_fields(offsets)) {
+ ut_ad(0);
+ return(FALSE);
+ }
+
+ cmp_dtuple_rec_with_match(dtuple, rec, offsets, &matched_fields);
+ return(matched_fields == n_fields);
+}
+
+/*************************************************************//**
+Compare two physical record fields.
+@retval positive if the rec1 field is greater than the rec2 field
+@retval negative if the rec1 field is less than the rec2 field
+@retval 0 if the rec1 field is equal to the rec2 field */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+int
+cmp_rec_rec_simple_field(
+/*=====================*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const rec_offs* offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+ const rec_offs* offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+ const dict_index_t* index, /*!< in: data dictionary index */
+ ulint n) /*!< in: field to compare */
+{
+ const byte* rec1_b_ptr;
+ const byte* rec2_b_ptr;
+ ulint rec1_f_len;
+ ulint rec2_f_len;
+ const dict_col_t* col = dict_index_get_nth_col(index, n);
+
+ ut_ad(!rec_offs_nth_extern(offsets1, n));
+ ut_ad(!rec_offs_nth_extern(offsets2, n));
+
+ rec1_b_ptr = rec_get_nth_field(rec1, offsets1, n, &rec1_f_len);
+ rec2_b_ptr = rec_get_nth_field(rec2, offsets2, n, &rec2_f_len);
+
+ return(cmp_data(col->mtype, col->prtype,
+ rec1_b_ptr, rec1_f_len, rec2_b_ptr, rec2_f_len));
+}
+
+/** Compare two physical records that contain the same number of columns,
+none of which are stored externally.
+@retval positive if rec1 (including non-ordering columns) is greater than rec2
+@retval negative if rec1 (including non-ordering columns) is less than rec2
+@retval 0 if rec1 is a duplicate of rec2 */
+int
+cmp_rec_rec_simple(
+/*===============*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const rec_offs* offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+ const rec_offs* offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+ const dict_index_t* index, /*!< in: data dictionary index */
+ struct TABLE* table) /*!< in: MySQL table, for reporting
+ duplicate key value if applicable,
+ or NULL */
+{
+ ulint n;
+ ulint n_uniq = dict_index_get_n_unique(index);
+ bool null_eq = false;
+
+ ut_ad(rec_offs_n_fields(offsets1) >= n_uniq);
+	ut_ad(rec_offs_n_fields(offsets2) >= n_uniq);
+
+ ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2));
+
+ for (n = 0; n < n_uniq; n++) {
+ int cmp = cmp_rec_rec_simple_field(
+ rec1, rec2, offsets1, offsets2, index, n);
+
+ if (cmp) {
+ return(cmp);
+ }
+
+ /* If the fields are internally equal, they must both
+ be NULL or non-NULL. */
+ ut_ad(rec_offs_nth_sql_null(offsets1, n)
+ == rec_offs_nth_sql_null(offsets2, n));
+
+ if (rec_offs_nth_sql_null(offsets1, n)) {
+ ut_ad(!(dict_index_get_nth_col(index, n)->prtype
+ & DATA_NOT_NULL));
+ null_eq = true;
+ }
+ }
+
+ /* If we ran out of fields, the ordering columns of rec1 were
+ equal to rec2. Issue a duplicate key error if needed. */
+
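+	/* In SQL, UNIQUE indexes treat NULL values as distinct, so a
+	pair of records that agree on the unique columns only because
+	some of those columns are NULL must not be reported as a
+	duplicate; this is what the null_eq flag set above tracks. */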
+ if (!null_eq && table && dict_index_is_unique(index)) {
+ /* Report erroneous row using new version of table. */
+ innobase_rec_to_mysql(table, rec1, index, offsets1);
+ return(0);
+ }
+
+ /* Else, keep comparing so that we have the full internal
+ order. */
+ for (; n < dict_index_get_n_fields(index); n++) {
+ int cmp = cmp_rec_rec_simple_field(
+ rec1, rec2, offsets1, offsets2, index, n);
+
+ if (cmp) {
+ return(cmp);
+ }
+
+ /* If the fields are internally equal, they must both
+ be NULL or non-NULL. */
+ ut_ad(rec_offs_nth_sql_null(offsets1, n)
+ == rec_offs_nth_sql_null(offsets2, n));
+ }
+
+ /* This should never be reached. Internally, an index must
+ never contain duplicate entries. */
+ ut_ad(0);
+ return(0);
+}
+
+/** Compare two B-tree or R-tree records.
+Only the common first fields are compared, and externally stored fields
+are treated as equal.
+@param[in] rec1 record (possibly not on an index page)
+@param[in] rec2 B-tree or R-tree record in an index page
+@param[in] offsets1 rec_get_offsets(rec1, index)
+@param[in] offsets2 rec_get_offsets(rec2, index)
+@param[in] nulls_unequal true if this is for index cardinality
+ statistics estimation with
+ innodb_stats_method=nulls_unequal
+ or innodb_stats_method=nulls_ignored
+@param[out]	matched_fields	number of completely matched fields
+@retval 0 if rec1 is equal to rec2
+@retval negative if rec1 is less than rec2
+@retval positive if rec1 is greater than rec2 */
+int
+cmp_rec_rec(
+ const rec_t* rec1,
+ const rec_t* rec2,
+ const rec_offs* offsets1,
+ const rec_offs* offsets2,
+ const dict_index_t* index,
+ bool nulls_unequal,
+ ulint* matched_fields)
+{
+ ulint rec1_f_len; /* length of current field in rec */
+ const byte* rec1_b_ptr; /* pointer to the current byte
+ in rec field */
+ ulint rec2_f_len; /* length of current field in rec */
+ const byte* rec2_b_ptr; /* pointer to the current byte
+ in rec field */
+ ulint cur_field = 0; /* current field number */
+ int ret = 0; /* return value */
+
+ ut_ad(rec1 != NULL);
+ ut_ad(rec2 != NULL);
+ ut_ad(index != NULL);
+ ut_ad(rec_offs_validate(rec1, index, offsets1));
+ ut_ad(rec_offs_validate(rec2, index, offsets2));
+ ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2));
+ ut_ad(fil_page_index_page_check(page_align(rec2)));
+ ut_ad(!!dict_index_is_spatial(index)
+ == (fil_page_get_type(page_align(rec2)) == FIL_PAGE_RTREE));
+
+ ulint comp = rec_offs_comp(offsets1);
+ ulint n_fields;
+
+	/* Test if either record is the predefined minimum record */
+ if (UNIV_UNLIKELY(rec_get_info_bits(rec1, comp)
+ & REC_INFO_MIN_REC_FLAG)) {
+ ret = UNIV_UNLIKELY(rec_get_info_bits(rec2, comp)
+ & REC_INFO_MIN_REC_FLAG)
+ ? 0 : -1;
+ goto order_resolved;
+ } else if (UNIV_UNLIKELY
+ (rec_get_info_bits(rec2, comp)
+ & REC_INFO_MIN_REC_FLAG)) {
+ ret = 1;
+ goto order_resolved;
+ }
+
+	/* For non-leaf spatial index records,
+	dict_index_get_n_unique_in_tree() includes the child page
+	number, because spatial index node pointers contain only
+	the MBR (minimum bounding rectangle) and the child page number.
+
+ For B-tree node pointers, the key alone (secondary index
+ columns and PRIMARY KEY columns) must be unique, and there is
+ no need to compare the child page number. */
+ n_fields = std::min(rec_offs_n_fields(offsets1),
+ rec_offs_n_fields(offsets2));
+ n_fields = std::min<ulint>(n_fields,
+ dict_index_get_n_unique_in_tree(index));
+
+ for (; cur_field < n_fields; cur_field++) {
+ ulint mtype;
+ ulint prtype;
+
+ if (UNIV_UNLIKELY(dict_index_is_ibuf(index))) {
+ /* This is for the insert buffer B-tree. */
+ mtype = DATA_BINARY;
+ prtype = 0;
+ } else {
+ const dict_col_t* col = dict_index_get_nth_col(
+ index, cur_field);
+ mtype = col->mtype;
+ prtype = col->prtype;
+
+ if (UNIV_LIKELY(!dict_index_is_spatial(index))) {
+ } else if (cur_field == 0) {
+ ut_ad(DATA_GEOMETRY_MTYPE(mtype));
+ prtype |= DATA_GIS_MBR;
+ } else if (!page_rec_is_leaf(rec2)) {
+ /* Compare the child page number. */
+ ut_ad(cur_field == 1);
+ mtype = DATA_SYS_CHILD;
+ prtype = 0;
+ }
+ }
+
+		/* We should never encounter an externally stored field.
+		Externally stored fields only exist in clustered index
+		leaf page records, and such records should already
+		differ in the primary key columns, before DB_TRX_ID,
+		DB_ROLL_PTR, and any externally stored columns. */
+ ut_ad(!rec_offs_nth_extern(offsets1, cur_field));
+ ut_ad(!rec_offs_nth_extern(offsets2, cur_field));
+ ut_ad(!rec_offs_nth_default(offsets1, cur_field));
+ ut_ad(!rec_offs_nth_default(offsets2, cur_field));
+
+ rec1_b_ptr = rec_get_nth_field(rec1, offsets1,
+ cur_field, &rec1_f_len);
+ rec2_b_ptr = rec_get_nth_field(rec2, offsets2,
+ cur_field, &rec2_f_len);
+
+ if (nulls_unequal
+ && rec1_f_len == UNIV_SQL_NULL
+ && rec2_f_len == UNIV_SQL_NULL) {
+ ret = -1;
+ goto order_resolved;
+ }
+
+ ret = cmp_data(mtype, prtype,
+ rec1_b_ptr, rec1_f_len,
+ rec2_b_ptr, rec2_f_len);
+ if (ret) {
+ goto order_resolved;
+ }
+ }
+
+ /* If we ran out of fields, rec1 was equal to rec2 up
+ to the common fields */
+ ut_ad(ret == 0);
+order_resolved:
+ if (matched_fields) {
+ *matched_fields = cur_field;
+ }
+ return ret;
+}
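+
+/* Note: the matched_fields output is used, for example, when
+estimating index cardinality statistics, where the number of leading
+fields shared by adjacent records determines the distinct-value
+counts per key prefix. */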
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+#ifdef HAVE_UT_CHRONO_T
+
+void
+test_cmp_data_data(ulint len)
+{
+ int i;
+ static byte zeros[64];
+
+ if (len > sizeof zeros) {
+ len = sizeof zeros;
+ }
+
+ ut_chrono_t ch(__func__);
+
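+	/* Add the comparison result (always 0 here, since both operands
+	are the same zero-filled buffer) to the loop counter, so that
+	the compiler cannot optimize the benchmarked call away. */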
+ for (i = 1000000; i > 0; i--) {
+ i += cmp_data(DATA_INT, 0, zeros, len, zeros, len);
+ }
+}
+
+#endif /* HAVE_UT_CHRONO_T */
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc
new file mode 100644
index 00000000..902f3f2d
--- /dev/null
+++ b/storage/innobase/rem/rem0rec.cc
@@ -0,0 +1,2844 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file rem/rem0rec.cc
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "rem0rec.h"
+#include "page0page.h"
+#include "mtr0log.h"
+#include "fts0fts.h"
+#include "trx0sys.h"
+#include "row0log.h"
+
+/* PHYSICAL RECORD (OLD STYLE)
+ ===========================
+
+The physical record, which is the data type of all the records
+found in index pages of the database, has the following format
+(lower addresses and more significant bits inside a byte are below
+represented on a higher text line):
+
+| offset of the end of the last field of data, the most significant
+ bit is set to 1 if and only if the field is SQL-null,
+ if the offset is 2-byte, then the second most significant
+ bit is set to 1 if the field is stored on another page:
+ mostly this will occur in the case of big BLOB fields |
+...
+| offset of the end of the first field of data + the SQL-null bit |
+| 4 bits used to delete mark a record, and mark a predefined
+ minimum record in alphabetical order |
+| 4 bits giving the number of records owned by this record
+ (this term is explained in page0page.h) |
+| 13 bits giving the order number of this record in the
+ heap of the index page |
+| 10 bits giving the number of fields in this record |
+| 1 bit which is set to 1 if the offsets above are given in
+ one byte format, 0 if in two byte format |
+| two bytes giving an absolute pointer to the next record in the page |
+ORIGIN of the record
+| first field of data |
+...
+| last field of data |
+
+The origin of the record is the start address of the first field
+of data. The offsets are given relative to the origin.
+The offsets of the data fields are stored in an inverted
+order because then the offsets of the first fields are near the
+origin, possibly giving a better processor cache hit rate in searches.
+
+The offsets of the data fields are given as one-byte
+(if there are at most 127 bytes of data in the record)
+or two-byte unsigned integers. The most significant bit
+is not part of the offset; instead, it indicates SQL NULL
+when the bit is set to 1. */
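+
+/* Example (illustrative): an old-style record with two non-NULL
+fields of 2 and 3 bytes stores the end offsets 2 and 5; the offset of
+the first field (2) is stored nearest to the record origin. If the
+second field were SQL NULL, its end-offset byte would have the most
+significant bit set. */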
+
+/* PHYSICAL RECORD (NEW STYLE)
+ ===========================
+
+The physical record, which is the data type of all the records
+found in index pages of the database, has the following format
+(lower addresses and more significant bits inside a byte are below
+represented on a higher text line):
+
+| length of the last non-null variable-length field of data:
+  if the maximum length of the column is at most 255 bytes, one byte;
+  otherwise, 0xxxxxxx (one byte, length=0..127), or 1exxxxxxxxxxxxxx
+  (two bytes, length=128..16383, e = extern storage flag) |
+...
+| length of first variable-length field of data |
+| SQL-null flags (1 bit per nullable field), padded to full bytes |
+| 4 bits used to delete mark a record, and mark a predefined
+ minimum record in alphabetical order |
+| 4 bits giving the number of records owned by this record
+ (this term is explained in page0page.h) |
+| 13 bits giving the order number of this record in the
+ heap of the index page |
+| 3 bits record type: 000=conventional, 001=node pointer (inside B-tree),
+ 010=infimum, 011=supremum, 1xx=reserved |
+| two bytes giving a relative pointer to the next record in the page |
+ORIGIN of the record
+| first field of data |
+...
+| last field of data |
+
+The origin of the record is the start address of the first field
+of data. The lengths of the variable-length fields are stored in an
+inverted order before the SQL-null flags, so the lengths of the
+first fields are near the origin, possibly giving a better processor
+cache hit rate in searches.
+
+The lengths are encoded in one or two bytes as described above;
+no length bytes are stored for SQL-null fields or for
+fixed-length fields. */
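+
+/* Example (illustrative): in the new format, a 200-byte value in a
+column whose maximum length exceeds 255 bytes is length-encoded in
+two bytes as 0x80 0xC8 (0x80 = two-byte flag with the extern flag
+clear, value 0x0C8 = 200), whereas a 100-byte value would be encoded
+in the single byte 0x64. */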
+
+/* CANONICAL COORDINATES. A record can be seen as a single
+string of 'characters' in the following way: catenate the bytes
+in each field, in the order of fields. An SQL-null field
+is taken to be an empty sequence of bytes. Then after
+the position of each field insert in the string
+the 'character' <FIELD-END>, except that after an SQL-null field
+insert <NULL-FIELD-END>. Now the ordinal position of each
+byte in this canonical string is its canonical coordinate.
+So, for the record ("AA", SQL-NULL, "BB", ""), the canonical
+string is "AA<FIELD-END><NULL-FIELD-END>BB<FIELD-END><FIELD-END>".
+We identify prefixes (= initial segments) of a record
+with prefixes of the canonical string. The canonical
+length of the prefix is the length of the corresponding
+prefix of the canonical string. The canonical length of
+a record is the length of its canonical string.
+
+For example, the maximal common prefix of records
+("AA", SQL-NULL, "BB", "C") and ("AA", SQL-NULL, "B", "C")
+is "AA<FIELD-END><NULL-FIELD-END>B", and its canonical
+length is 5.
+
+A complete-field prefix of a record is a prefix which ends at the
+end of some field (containing also <FIELD-END>).
+A record is a complete-field prefix of another record, if
+the corresponding canonical strings have the same property. */
+
+/***************************************************************//**
+Validates the consistency of an old-style physical record.
+@return TRUE if ok */
+static
+ibool
+rec_validate_old(
+/*=============*/
+ const rec_t* rec); /*!< in: physical record */
+
+/******************************************************//**
+Determine how many of the first n columns in a compact
+physical record are stored externally.
+@return number of externally stored columns */
+ulint
+rec_get_n_extern_new(
+/*=================*/
+ const rec_t* rec, /*!< in: compact physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n) /*!< in: number of columns to scan */
+{
+ const byte* nulls;
+ const byte* lens;
+ ulint null_mask;
+ ulint n_extern;
+ ulint i;
+
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(!index->table->supports_instant());
+ ut_ad(!index->is_instant());
+ ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY
+ || rec_get_status(rec) == REC_STATUS_INSTANT);
+ ut_ad(n == ULINT_UNDEFINED || n <= dict_index_get_n_fields(index));
+
+ if (n == ULINT_UNDEFINED) {
+ n = dict_index_get_n_fields(index);
+ }
+
+ nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+ null_mask = 1;
+ n_extern = 0;
+ i = 0;
+
+ /* read the lengths of fields 0..n */
+ do {
+ const dict_field_t* field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ ulint len;
+
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ /* No length is stored for NULL fields. */
+ continue;
+ }
+ null_mask <<= 1;
+ }
+
+ if (UNIV_UNLIKELY(!field->fixed_len)) {
+ /* Variable-length field: read the length */
+ len = *lens--;
+ /* If the maximum length of the field is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the field is stored externally. */
+ if (DATA_BIG_COL(col)) {
+ if (len & 0x80) {
+ /* 1exxxxxxx xxxxxxxx */
+ if (len & 0x40) {
+ n_extern++;
+ }
+ lens--;
+ }
+ }
+ }
+ } while (++i < n);
+
+ return(n_extern);
+}
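+
+/* Illustrative note: in the two-byte length encoding read above, the
+first length byte carries the flags; e.g. 0xC0 0x14 has both the
+two-byte flag (0x80) and the extern flag (0x40) set, so the column is
+counted as externally stored, and its locally stored part is
+0x014 = 20 bytes long. */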
+
+/** Format of a leaf-page ROW_FORMAT!=REDUNDANT record */
+enum rec_leaf_format {
+ /** Temporary file record */
+ REC_LEAF_TEMP,
+ /** Temporary file record, with added columns (REC_STATUS_INSTANT) */
+ REC_LEAF_TEMP_INSTANT,
+ /** Normal (REC_STATUS_ORDINARY) */
+ REC_LEAF_ORDINARY,
+ /** With add or drop columns (REC_STATUS_INSTANT) */
+ REC_LEAF_INSTANT
+};
+
+/** Determine the offset to each field in a leaf-page record
+in ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED.
+This is a special case of rec_init_offsets() and rec_get_offsets_func().
+@tparam mblob whether the record includes a metadata BLOB
+@tparam redundant_temp whether the record belongs to a temporary file
+ of a ROW_FORMAT=REDUNDANT table
+@param[in] rec leaf-page record
+@param[in] index the index that the record belongs in
+@param[in] n_core number of core fields (index->n_core_fields)
+@param[in] def_val default values for non-core fields, or
+ NULL to refer to index->fields[].col->def_val
+@param[in,out] offsets offsets, with valid rec_offs_n_fields(offsets)
+@param[in] format record format */
+template<bool mblob = false, bool redundant_temp = false>
+static inline
+void
+rec_init_offsets_comp_ordinary(
+ const rec_t* rec,
+ const dict_index_t* index,
+ rec_offs* offsets,
+ ulint n_core,
+ const dict_col_t::def_t*def_val,
+ rec_leaf_format format)
+{
+ rec_offs offs = 0;
+ rec_offs any = 0;
+ const byte* nulls = rec;
+ const byte* lens = NULL;
+ ulint n_fields = n_core;
+ ulint null_mask = 1;
+
+ ut_ad(n_core > 0);
+ ut_ad(index->n_core_fields >= n_core);
+ ut_ad(index->n_fields >= index->n_core_fields);
+ ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable));
+ ut_ad(format == REC_LEAF_TEMP || format == REC_LEAF_TEMP_INSTANT
+ || dict_table_is_comp(index->table));
+ ut_ad(format != REC_LEAF_TEMP_INSTANT
+ || index->n_fields == rec_offs_n_fields(offsets));
+	ut_d(ulint n_null = 0);
+
+ const unsigned n_core_null_bytes = UNIV_UNLIKELY(index->n_core_fields
+ != n_core)
+ ? UT_BITS_IN_BYTES(unsigned(index->get_n_nullable(n_core)))
+ : (redundant_temp
+ ? UT_BITS_IN_BYTES(index->n_nullable)
+ : index->n_core_null_bytes);
+
+ if (mblob) {
+ ut_ad(index->table->instant);
+ ut_ad(index->is_instant());
+ ut_ad(rec_offs_n_fields(offsets)
+ <= ulint(index->n_fields) + 1);
+ ut_ad(!def_val);
+ ut_ad(format == REC_LEAF_INSTANT);
+ nulls -= REC_N_NEW_EXTRA_BYTES;
+ n_fields = n_core + 1 + rec_get_n_add_field(nulls);
+ ut_ad(n_fields <= ulint(index->n_fields) + 1);
+ const ulint n_nullable = index->get_n_nullable(n_fields - 1);
+ const ulint n_null_bytes = UT_BITS_IN_BYTES(n_nullable);
+ ut_d(n_null = n_nullable);
+ ut_ad(n_null <= index->n_nullable);
+ ut_ad(n_null_bytes >= n_core_null_bytes
+ || n_core < index->n_core_fields);
+ lens = --nulls - n_null_bytes;
+ goto start;
+ }
+
+ switch (format) {
+ case REC_LEAF_TEMP:
+ if (dict_table_is_comp(index->table)) {
+			/* No need to adjust fixed_len=0. We only need to
+			adjust it for ROW_FORMAT=REDUNDANT. */
+ format = REC_LEAF_ORDINARY;
+ }
+ goto ordinary;
+ case REC_LEAF_ORDINARY:
+ nulls -= REC_N_NEW_EXTRA_BYTES;
+ordinary:
+ lens = --nulls - n_core_null_bytes;
+
+ ut_d(n_null = std::min<uint>(n_core_null_bytes * 8U,
+ index->n_nullable));
+ break;
+ case REC_LEAF_INSTANT:
+ nulls -= REC_N_NEW_EXTRA_BYTES;
+ ut_ad(index->is_instant());
+ /* fall through */
+ case REC_LEAF_TEMP_INSTANT:
+ n_fields = n_core + rec_get_n_add_field(nulls) + 1;
+ ut_ad(n_fields <= index->n_fields);
+ const ulint n_nullable = index->get_n_nullable(n_fields);
+ const ulint n_null_bytes = UT_BITS_IN_BYTES(n_nullable);
+ ut_d(n_null = n_nullable);
+ ut_ad(n_null <= index->n_nullable);
+ ut_ad(n_null_bytes >= n_core_null_bytes
+ || n_core < index->n_core_fields);
+ lens = --nulls - n_null_bytes;
+ }
+
+start:
+#ifdef UNIV_DEBUG
+ /* We cannot invoke rec_offs_make_valid() if format==REC_LEAF_TEMP.
+ Similarly, rec_offs_validate() will fail in that case, because
+ it invokes rec_get_status(). */
+ memcpy(&offsets[RECORD_OFFSET], &rec, sizeof(rec));
+ memcpy(&offsets[INDEX_OFFSET], &index, sizeof(index));
+#endif /* UNIV_DEBUG */
+
+ /* read the lengths of fields 0..n_fields */
+ rec_offs len;
+ ulint i = 0;
+ const dict_field_t* field = index->fields;
+
+ do {
+ if (mblob) {
+ if (i == index->first_user_field()) {
+ offs = static_cast<rec_offs>(offs
+ + FIELD_REF_SIZE);
+ len = combine(offs, STORED_OFFPAGE);
+ any |= REC_OFFS_EXTERNAL;
+ field--;
+ continue;
+ } else if (i >= n_fields) {
+ len = combine(offs, DEFAULT);
+ any |= REC_OFFS_DEFAULT;
+ continue;
+ }
+ } else if (i < n_fields) {
+ /* The field is present, and will be covered below. */
+		} else if (def_val) {
+ const dict_col_t::def_t& d = def_val[i - n_core];
+ if (!d.data) {
+ len = combine(offs, SQL_NULL);
+ ut_ad(d.len == UNIV_SQL_NULL);
+ } else {
+ len = combine(offs, DEFAULT);
+ any |= REC_OFFS_DEFAULT;
+ }
+
+ continue;
+ } else {
+ ulint dlen;
+ if (!index->instant_field_value(i, &dlen)) {
+ len = combine(offs, SQL_NULL);
+ ut_ad(dlen == UNIV_SQL_NULL);
+ } else {
+ len = combine(offs, DEFAULT);
+ any |= REC_OFFS_DEFAULT;
+ }
+
+ continue;
+ }
+
+ const dict_col_t* col = field->col;
+
+ if (col->is_nullable()) {
+ /* nullable field => read the null flag */
+ ut_ad(n_null--);
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ /* No length is stored for NULL fields.
+ We do not advance offs, and we set
+ the length to zero and enable the
+ SQL NULL flag in offsets[]. */
+ len = combine(offs, SQL_NULL);
+ continue;
+ }
+ null_mask <<= 1;
+ }
+
+ if (!field->fixed_len
+ || (format == REC_LEAF_TEMP
+ && !dict_col_get_fixed_size(col, true))) {
+ /* Variable-length field: read the length */
+ len = *lens--;
+ /* If the maximum length of the field is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the field is stored externally. */
+ if ((len & 0x80) && DATA_BIG_COL(col)) {
+ /* 1exxxxxxx xxxxxxxx */
+ len = static_cast<rec_offs>(len << 8
+ | *lens--);
+ offs = static_cast<rec_offs>(offs
+ + get_value(len));
+ if (UNIV_UNLIKELY(len & 0x4000)) {
+ ut_ad(index->is_primary());
+ any |= REC_OFFS_EXTERNAL;
+ len = combine(offs, STORED_OFFPAGE);
+ } else {
+ len = offs;
+ }
+
+ continue;
+ }
+
+ len = offs = static_cast<rec_offs>(offs + len);
+ } else {
+ len = offs = static_cast<rec_offs>(offs
+ + field->fixed_len);
+ }
+ } while (field++, rec_offs_base(offsets)[++i] = len,
+ i < rec_offs_n_fields(offsets));
+
+ *rec_offs_base(offsets) = static_cast<rec_offs>((rec - (lens + 1))
+ | REC_OFFS_COMPACT
+ | any);
+}
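+
+/* Worked example (illustrative): for a COMPACT leaf record with the
+fields (INT NOT NULL = 42, VARCHAR NULL = NULL, VARCHAR = "abc"), the
+loop above would produce rec_offs_base(offsets)[1..3] =
+{ 4, combine(4, SQL_NULL), 7 }: the fixed-length INT ends at offset 4,
+the NULL field consumes no data bytes, and "abc" ends at offset 7. */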
+
+#ifdef UNIV_DEBUG
+/** Update debug data in offsets, in order to tame rec_offs_validate().
+@param[in] rec record
+@param[in] index the index that the record belongs in
+@param[in] leaf whether the record resides in a leaf page
+@param[in,out] offsets offsets from rec_get_offsets() to adjust */
+void
+rec_offs_make_valid(
+ const rec_t* rec,
+ const dict_index_t* index,
+ bool leaf,
+ rec_offs* offsets)
+{
+ const bool is_alter_metadata = leaf
+ && rec_is_alter_metadata(rec, *index);
+ ut_ad(is_alter_metadata
+ || index->is_dummy || index->is_ibuf()
+ || (leaf
+ ? rec_offs_n_fields(offsets)
+ <= dict_index_get_n_fields(index)
+ : rec_offs_n_fields(offsets) - 1
+ <= dict_index_get_n_unique_in_tree_nonleaf(index)));
+ const bool is_user_rec = (dict_table_is_comp(index->table)
+ ? rec_get_heap_no_new(rec)
+ : rec_get_heap_no_old(rec))
+ >= PAGE_HEAP_NO_USER_LOW;
+ ulint n = rec_get_n_fields(rec, index);
+ /* The infimum and supremum records carry 1 field. */
+ ut_ad(is_user_rec || n == 1);
+ ut_ad(is_user_rec || rec_offs_n_fields(offsets) == 1);
+ ut_ad(!is_user_rec
+ || (n + (index->id == DICT_INDEXES_ID)) >= index->n_core_fields
+ || n >= rec_offs_n_fields(offsets));
+ for (; n < rec_offs_n_fields(offsets); n++) {
+ ut_ad(leaf);
+ ut_ad(is_alter_metadata
+ || get_type(rec_offs_base(offsets)[1 + n]) == DEFAULT);
+ }
+ memcpy(&offsets[RECORD_OFFSET], &rec, sizeof(rec));
+ memcpy(&offsets[INDEX_OFFSET], &index, sizeof(index));
+}
+
+/** Validate offsets returned by rec_get_offsets().
+@param[in] rec record, or NULL
+@param[in] index the index that the record belongs in, or NULL
+@param[in,out] offsets the offsets of the record
+@return true */
+bool
+rec_offs_validate(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets)
+{
+ ulint i = rec_offs_n_fields(offsets);
+ ulint last = ULINT_MAX;
+ ulint comp = *rec_offs_base(offsets) & REC_OFFS_COMPACT;
+
+ if (rec) {
+ ut_ad(!memcmp(&rec, &offsets[RECORD_OFFSET], sizeof(rec)));
+ if (!comp) {
+ const bool is_user_rec = rec_get_heap_no_old(rec)
+ >= PAGE_HEAP_NO_USER_LOW;
+ ulint n = rec_get_n_fields_old(rec);
+ /* The infimum and supremum records carry 1 field. */
+ ut_ad(is_user_rec || n == 1);
+ ut_ad(is_user_rec || i == 1);
+ ut_ad(!is_user_rec || n >= i || !index
+ || (n + (index->id == DICT_INDEXES_ID))
+ >= index->n_core_fields);
+ for (; n < i; n++) {
+ ut_ad(get_type(rec_offs_base(offsets)[1 + n])
+ == DEFAULT);
+ }
+ }
+ }
+ if (index) {
+ ut_ad(!memcmp(&index, &offsets[INDEX_OFFSET], sizeof(index)));
+ ulint max_n_fields = std::max<ulint>(
+ dict_index_get_n_fields(index),
+ dict_index_get_n_unique_in_tree(index) + 1);
+ if (comp && rec) {
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_INSTANT:
+ ut_ad(index->is_instant() || index->is_dummy);
+ ut_ad(max_n_fields == index->n_fields);
+ max_n_fields += index->table->instant
+ || index->is_dummy;
+ break;
+ case REC_STATUS_ORDINARY:
+ break;
+ case REC_STATUS_NODE_PTR:
+ max_n_fields = dict_index_get_n_unique_in_tree(
+ index) + 1;
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ max_n_fields = 1;
+ break;
+ default:
+ ut_error;
+ }
+ } else if (max_n_fields == index->n_fields
+ && (index->is_dummy
+ || (index->is_instant()
+ && index->table->instant))) {
+ max_n_fields++;
+ }
+ /* index->n_def == 0 for dummy indexes if !comp */
+ ut_ad(!comp || index->n_def);
+ ut_ad(!index->n_def || i <= max_n_fields);
+ }
+ while (i--) {
+ ulint curr = get_value(rec_offs_base(offsets)[1 + i]);
+ ut_ad(curr <= last);
+ last = curr;
+ }
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/** Determine the offsets to each field in the record.
+The offsets are written to a previously allocated array of
+ulint, where rec_offs_n_fields(offsets) has been initialized to the
+number of fields in the record. The rest of the array will be
+initialized by this function. rec_offs_base(offsets)[0] will be set
+to the extra size (if REC_OFFS_COMPACT is set, the record is in the
+new format; if REC_OFFS_EXTERNAL is set, the record contains externally
+stored columns), and rec_offs_base(offsets)[1..n_fields] will be set to
+offsets past the end of fields 0..n_fields, or to the beginning of
+fields 1..n_fields+1. When the type of the offset at [i+1]
+is (SQL_NULL), the field i is NULL. When the type of the offset at [i+1]
+is (STORED_OFFPAGE), the field i is stored externally.
+@param[in] rec record
+@param[in] index the index that the record belongs in
+@param[in] n_core 0, or index->n_core_fields for leaf page
+@param[in,out] offsets array of offsets, with valid rec_offs_n_fields() */
+static
+void
+rec_init_offsets(
+ const rec_t* rec,
+ const dict_index_t* index,
+ ulint n_core,
+ rec_offs* offsets)
+{
+ ulint i = 0;
+ rec_offs offs;
+
+ /* This assertion was relaxed for the btr_cur_open_at_index_side()
+ call in btr_cur_instant_init_low(). We cannot invoke
+ index->is_instant(), because the same assertion would fail there
+ until btr_cur_instant_init_low() has invoked
+ dict_table_t::deserialise_columns(). */
+ ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)
+ || index->in_instant_init);
+ ut_d(memcpy(&offsets[RECORD_OFFSET], &rec, sizeof(rec)));
+ ut_d(memcpy(&offsets[INDEX_OFFSET], &index, sizeof(index)));
+ ut_ad(index->n_fields >= n_core);
+ ut_ad(index->n_core_fields >= n_core);
+
+ if (dict_table_is_comp(index->table)) {
+ const byte* nulls;
+ const byte* lens;
+ dict_field_t* field;
+ ulint null_mask;
+ rec_comp_status_t status = rec_get_status(rec);
+ ulint n_node_ptr_field = ULINT_UNDEFINED;
+
+ switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ /* the field is 8 bytes long */
+ rec_offs_base(offsets)[0]
+ = REC_N_NEW_EXTRA_BYTES | REC_OFFS_COMPACT;
+ rec_offs_base(offsets)[1] = 8;
+ return;
+ case REC_STATUS_NODE_PTR:
+ ut_ad(!n_core);
+ n_node_ptr_field
+ = dict_index_get_n_unique_in_tree_nonleaf(
+ index);
+ break;
+ case REC_STATUS_INSTANT:
+ ut_ad(index->is_instant());
+ rec_init_offsets_comp_ordinary(rec, index, offsets,
+ n_core,
+ NULL,
+ REC_LEAF_INSTANT);
+ return;
+ case REC_STATUS_ORDINARY:
+ rec_init_offsets_comp_ordinary(rec, index, offsets,
+ n_core,
+ NULL,
+ REC_LEAF_ORDINARY);
+ return;
+ }
+
+ /* The n_nullable flags in the clustered index node pointer
+ records in ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC must
+ reflect the number of 'core columns'. These flags are
+ useless garbage, and they are only reserved because of
+ file format compatibility.
+ (Clustered index node pointer records only contain the
+ PRIMARY KEY columns, which are always NOT NULL,
+ so we should have used n_nullable=0.) */
+ ut_ad(index->n_core_fields > 0);
+
+ nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ lens = nulls - index->n_core_null_bytes;
+ offs = 0;
+ null_mask = 1;
+
+ /* read the lengths of fields 0..n */
+ do {
+ rec_offs len;
+ if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
+ len = offs = static_cast<rec_offs>(
+ offs + REC_NODE_PTR_SIZE);
+ goto resolved;
+ }
+
+ field = dict_index_get_nth_field(index, i);
+ if (!(dict_field_get_col(field)->prtype
+ & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ /* No length is stored for NULL fields.
+ We do not advance offs, and we set
+ the length to zero and enable the
+ SQL NULL flag in offsets[]. */
+ len = combine(offs, SQL_NULL);
+ goto resolved;
+ }
+ null_mask <<= 1;
+ }
+
+ if (UNIV_UNLIKELY(!field->fixed_len)) {
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ /* Variable-length field: read the length */
+ len = *lens--;
+ /* If the maximum length of the field
+ is up to 255 bytes, the actual length
+ is always stored in one byte. If the
+ maximum length is more than 255 bytes,
+ the actual length is stored in one
+ byte for 0..127. The length will be
+ encoded in two bytes when it is 128 or
+ more, or when the field is stored
+ externally. */
+ if (DATA_BIG_COL(col)) {
+ if (len & 0x80) {
+ /* 1exxxxxxx xxxxxxxx */
+ len = static_cast<rec_offs>(
+ len << 8 | *lens--);
+
+ /* B-tree node pointers
+ must not contain externally
+ stored columns. Thus
+ the "e" flag must be 0. */
+ ut_a(!(len & 0x4000));
+ offs = static_cast<rec_offs>(
+ offs + get_value(len));
+ len = offs;
+
+ goto resolved;
+ }
+ }
+
+ len = offs = static_cast<rec_offs>(offs + len);
+ } else {
+ len = offs = static_cast<rec_offs>(
+ offs + field->fixed_len);
+ }
+resolved:
+ rec_offs_base(offsets)[i + 1] = len;
+ } while (++i < rec_offs_n_fields(offsets));
+
+ *rec_offs_base(offsets)
+ = static_cast<rec_offs>((rec - (lens + 1))
+ | REC_OFFS_COMPACT);
+ } else {
+ /* Old-style record: determine extra size and end offsets */
+ offs = REC_N_OLD_EXTRA_BYTES;
+ const ulint n_fields = rec_get_n_fields_old(rec);
+ const ulint n = std::min(n_fields, rec_offs_n_fields(offsets));
+ rec_offs any;
+
+ if (rec_get_1byte_offs_flag(rec)) {
+ offs = static_cast<rec_offs>(offs + n_fields);
+ any = offs;
+ /* Determine offsets to fields */
+ do {
+ offs = rec_1_get_field_end_info(rec, i);
+ if (offs & REC_1BYTE_SQL_NULL_MASK) {
+ offs &= static_cast<rec_offs>(
+ ~REC_1BYTE_SQL_NULL_MASK);
+ set_type(offs, SQL_NULL);
+ }
+ rec_offs_base(offsets)[1 + i] = offs;
+ } while (++i < n);
+ } else {
+ offs = static_cast<rec_offs>(offs + 2 * n_fields);
+ any = offs;
+ /* Determine offsets to fields */
+ do {
+ offs = rec_2_get_field_end_info(rec, i);
+ if (offs & REC_2BYTE_SQL_NULL_MASK) {
+ offs &= static_cast<rec_offs>(
+ ~REC_2BYTE_SQL_NULL_MASK);
+ set_type(offs, SQL_NULL);
+ }
+ if (offs & REC_2BYTE_EXTERN_MASK) {
+ offs &= static_cast<rec_offs>(
+ ~REC_2BYTE_EXTERN_MASK);
+ set_type(offs, STORED_OFFPAGE);
+ any |= REC_OFFS_EXTERNAL;
+ }
+ rec_offs_base(offsets)[1 + i] = offs;
+ } while (++i < n);
+ }
+
+ if (i < rec_offs_n_fields(offsets)) {
+ ut_ad(index->is_instant()
+ || i + (index->id == DICT_INDEXES_ID)
+ == rec_offs_n_fields(offsets));
+
+ ut_ad(i != 0);
+ offs = combine(rec_offs_base(offsets)[i], DEFAULT);
+
+ do {
+ rec_offs_base(offsets)[1 + i] = offs;
+ } while (++i < rec_offs_n_fields(offsets));
+
+ any |= REC_OFFS_DEFAULT;
+ }
+
+ *rec_offs_base(offsets) = any;
+ }
+}
+
+/** Determine the offsets to each field in an index record.
+@param[in] rec physical record
+@param[in] index the index that the record belongs to
+@param[in,out] offsets array comprising offsets[0] allocated elements,
+ or an array from rec_get_offsets(), or NULL
+@param[in] n_core 0, or index->n_core_fields for leaf page
+@param[in] n_fields maximum number of offsets to compute
+ (ULINT_UNDEFINED to compute all offsets)
+@param[in,out] heap memory heap
+@return the new offsets */
+rec_offs*
+rec_get_offsets_func(
+ const rec_t* rec,
+ const dict_index_t* index,
+ rec_offs* offsets,
+ ulint n_core,
+ ulint n_fields,
+#ifdef UNIV_DEBUG
+ const char* file, /*!< in: file name where called */
+ unsigned line, /*!< in: line number where called */
+#endif /* UNIV_DEBUG */
+ mem_heap_t** heap) /*!< in/out: memory heap */
+{
+ ulint n;
+ ulint size;
+ bool alter_metadata = false;
+
+ ut_ad(index->n_core_fields >= n_core);
+ /* This assertion was relaxed for the btr_cur_open_at_index_side()
+ call in btr_cur_instant_init_low(). We cannot invoke
+ index->is_instant(), because the same assertion would fail there
+ until btr_cur_instant_init_low() has invoked
+ dict_table_t::deserialise_columns(). */
+ ut_ad(index->n_fields >= index->n_core_fields
+ || index->in_instant_init);
+
+ if (dict_table_is_comp(index->table)) {
+ switch (UNIV_EXPECT(rec_get_status(rec),
+ REC_STATUS_ORDINARY)) {
+ case REC_STATUS_INSTANT:
+ alter_metadata = rec_is_alter_metadata(rec, true);
+ /* fall through */
+ case REC_STATUS_ORDINARY:
+ ut_ad(n_core);
+ n = dict_index_get_n_fields(index) + alter_metadata;
+ break;
+ case REC_STATUS_NODE_PTR:
+ /* Node pointer records consist of the
+ uniquely identifying fields of the record
+ followed by a child page number field. */
+ ut_ad(!n_core);
+ n = dict_index_get_n_unique_in_tree_nonleaf(index) + 1;
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ /* infimum or supremum record */
+ ut_ad(rec_get_heap_no_new(rec)
+ == ulint(rec_get_status(rec)
+ == REC_STATUS_INFIMUM
+ ? PAGE_HEAP_NO_INFIMUM
+ : PAGE_HEAP_NO_SUPREMUM));
+ n = 1;
+ break;
+ default:
+ ut_error;
+ return(NULL);
+ }
+ } else {
+ n = rec_get_n_fields_old(rec);
+ /* Here, rec can be allocated from the heap (copied
+ from an index page record), or it can be located in an
+ index page. If rec is not in an index page, then
+ page_rec_is_user_rec(rec) and similar predicates
+ cannot be evaluated. We can still distinguish the
+ infimum and supremum record based on the heap number. */
+ const bool is_user_rec = rec_get_heap_no_old(rec)
+ >= PAGE_HEAP_NO_USER_LOW;
+ /* The infimum and supremum records carry 1 field. */
+ ut_ad(is_user_rec || n == 1);
+ ut_ad(!is_user_rec || n_core || index->is_dummy
+ || dict_index_is_ibuf(index)
+ || n == n_fields /* dict_stats_analyze_index_level() */
+ || n - 1
+ == dict_index_get_n_unique_in_tree_nonleaf(index));
+ ut_ad(!is_user_rec || !n_core || index->is_dummy
+ || dict_index_is_ibuf(index)
+ || n == n_fields /* btr_pcur_restore_position() */
+ || (n + (index->id == DICT_INDEXES_ID)
+ >= n_core && n <= index->n_fields
+ + unsigned(rec_is_alter_metadata(rec, false))));
+
+ if (is_user_rec && n_core && n < index->n_fields) {
+ ut_ad(!index->is_dummy);
+ ut_ad(!dict_index_is_ibuf(index));
+ n = index->n_fields;
+ }
+ }
+
+ if (UNIV_UNLIKELY(n_fields < n)) {
+ n = n_fields;
+ }
+
+	/* The offsets header consists of the allocation size at
+	offsets[0] and REC_OFFS_HEADER_SIZE further elements. */
+ size = n + (1 + REC_OFFS_HEADER_SIZE);
+
+ if (UNIV_UNLIKELY(!offsets)
+ || UNIV_UNLIKELY(rec_offs_get_n_alloc(offsets) < size)) {
+ if (UNIV_UNLIKELY(!*heap)) {
+ *heap = mem_heap_create_at(size * sizeof(*offsets),
+ file, line);
+ }
+ offsets = static_cast<rec_offs*>(
+ mem_heap_alloc(*heap, size * sizeof(*offsets)));
+
+ rec_offs_set_n_alloc(offsets, size);
+ }
+
+ rec_offs_set_n_fields(offsets, n);
+
+ if (UNIV_UNLIKELY(alter_metadata) && index->table->not_redundant()) {
+#ifdef UNIV_DEBUG
+ memcpy(&offsets[RECORD_OFFSET], &rec, sizeof rec);
+ memcpy(&offsets[INDEX_OFFSET], &index, sizeof index);
+#endif /* UNIV_DEBUG */
+ ut_ad(n_core);
+ ut_ad(index->table->instant);
+ ut_ad(index->is_instant());
+ ut_ad(rec_offs_n_fields(offsets)
+ <= ulint(index->n_fields) + 1);
+ rec_init_offsets_comp_ordinary<true>(rec, index, offsets,
+ index->n_core_fields,
+ nullptr,
+ REC_LEAF_INSTANT);
+ } else {
+ rec_init_offsets(rec, index, n_core, offsets);
+ }
+ return offsets;
+}
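+
+/* Typical usage (illustrative), via the rec_get_offsets() wrapper:
+
+	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
+	rec_offs*	offsets	= offsets_;
+	mem_heap_t*	heap	= NULL;
+	rec_offs_init(offsets_);
+
+	offsets = rec_get_offsets(rec, index, offsets,
+				  index->n_core_fields,
+				  ULINT_UNDEFINED, &heap);
+	(use the offsets here)
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+*/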
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record. It can reuse a previously allocated array. */
+void
+rec_get_offsets_reverse(
+/*====================*/
+ const byte* extra, /*!< in: the extra bytes of a
+ compact record in reverse order,
+ excluding the fixed-size
+ REC_N_NEW_EXTRA_BYTES */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint node_ptr,/*!< in: nonzero=node pointer,
+ 0=leaf node */
+ rec_offs* offsets)/*!< in/out: array consisting of
+ offsets[0] allocated elements */
+{
+ ulint n;
+ ulint i;
+ rec_offs offs;
+ rec_offs any_ext = 0;
+ const byte* nulls;
+ const byte* lens;
+ dict_field_t* field;
+ ulint null_mask;
+ ulint n_node_ptr_field;
+
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(!index->is_instant());
+
+ if (UNIV_UNLIKELY(node_ptr != 0)) {
+ n_node_ptr_field =
+ dict_index_get_n_unique_in_tree_nonleaf(index);
+ n = n_node_ptr_field + 1;
+ } else {
+ n_node_ptr_field = ULINT_UNDEFINED;
+ n = dict_index_get_n_fields(index);
+ }
+
+ ut_a(rec_offs_get_n_alloc(offsets) >= n + (1 + REC_OFFS_HEADER_SIZE));
+ rec_offs_set_n_fields(offsets, n);
+
+ nulls = extra;
+ lens = nulls + UT_BITS_IN_BYTES(index->n_nullable);
+ i = offs = 0;
+ null_mask = 1;
+
+ /* read the lengths of fields 0..n */
+ do {
+ rec_offs len;
+ if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
+ len = offs = static_cast<rec_offs>(
+ offs + REC_NODE_PTR_SIZE);
+ goto resolved;
+ }
+
+ field = dict_index_get_nth_field(index, i);
+ if (!(dict_field_get_col(field)->prtype & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls++;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ /* No length is stored for NULL fields.
+ We do not advance offs, and we set
+ the length to zero and enable the
+ SQL NULL flag in offsets[]. */
+ len = combine(offs, SQL_NULL);
+ goto resolved;
+ }
+ null_mask <<= 1;
+ }
+
+ if (UNIV_UNLIKELY(!field->fixed_len)) {
+ /* Variable-length field: read the length */
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ len = *lens++;
+ /* If the maximum length of the field is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the field is stored externally. */
+ if (DATA_BIG_COL(col)) {
+ if (len & 0x80) {
+ /* 1exxxxxxx xxxxxxxx */
+ len = static_cast<rec_offs>(
+ len << 8 | *lens++);
+
+ offs = static_cast<rec_offs>(
+ offs + get_value(len));
+ if (UNIV_UNLIKELY(len & 0x4000)) {
+ any_ext = REC_OFFS_EXTERNAL;
+ len = combine(offs,
+ STORED_OFFPAGE);
+ } else {
+ len = offs;
+ }
+
+ goto resolved;
+ }
+ }
+
+ len = offs = static_cast<rec_offs>(offs + len);
+ } else {
+ len = offs = static_cast<rec_offs>(offs
+ + field->fixed_len);
+ }
+resolved:
+ rec_offs_base(offsets)[i + 1] = len;
+ } while (++i < rec_offs_n_fields(offsets));
+
+ ut_ad(lens >= extra);
+ *rec_offs_base(offsets)
+ = static_cast<rec_offs>(lens - extra + REC_N_NEW_EXTRA_BYTES)
+ | REC_OFFS_COMPACT | any_ext;
+}
+
+/************************************************************//**
+The following function is used to get the offset to the nth
+data field in an old-style record.
+@return offset to the field */
+ulint
+rec_get_nth_field_offs_old(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field;
+ UNIV_SQL_NULL if SQL null */
+{
+ ulint os;
+ ulint next_os;
+
+ ut_a(n < rec_get_n_fields_old(rec));
+
+ if (rec_get_1byte_offs_flag(rec)) {
+ os = rec_1_get_field_start_offs(rec, n);
+
+ next_os = rec_1_get_field_end_info(rec, n);
+
+ if (next_os & REC_1BYTE_SQL_NULL_MASK) {
+ *len = UNIV_SQL_NULL;
+
+ return(os);
+ }
+
+ next_os = next_os & ~REC_1BYTE_SQL_NULL_MASK;
+ } else {
+ os = rec_2_get_field_start_offs(rec, n);
+
+ next_os = rec_2_get_field_end_info(rec, n);
+
+ if (next_os & REC_2BYTE_SQL_NULL_MASK) {
+ *len = UNIV_SQL_NULL;
+
+ return(os);
+ }
+
+ next_os = next_os & ~(REC_2BYTE_SQL_NULL_MASK
+ | REC_2BYTE_EXTERN_MASK);
+ }
+
+ *len = next_os - os;
+
+ ut_ad(*len < srv_page_size);
+
+ return(os);
+}
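+
+/* Note: callers normally use this through the rec_get_nth_field_old()
+wrapper, which adds the returned start offset to the record origin to
+obtain a pointer to the field data, while *len reports the field
+length, or UNIV_SQL_NULL for SQL NULL fields. */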
+
+/** Determine the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@tparam mblob whether the record includes a metadata BLOB
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format
+@param[in] index record descriptor; dict_table_is_comp()
+ is assumed to hold, even if it doesn't
+@param[in] dfield array of data fields
+@param[in] n_fields number of data fields
+@param[out] extra extra size
+@param[in] status status flags
+@param[in] temp whether this is a temporary file record
+@return total size */
+template<bool mblob = false, bool redundant_temp = false>
+static inline
+ulint
+rec_get_converted_size_comp_prefix_low(
+ const dict_index_t* index,
+ const dfield_t* dfield,
+ ulint n_fields,
+ ulint* extra,
+ rec_comp_status_t status,
+ bool temp)
+{
+ ulint extra_size = temp ? 0 : REC_N_NEW_EXTRA_BYTES;
+ ut_ad(n_fields > 0);
+ ut_ad(n_fields - mblob <= dict_index_get_n_fields(index));
+ ut_d(ulint n_null = index->n_nullable);
+ ut_ad(status == REC_STATUS_ORDINARY || status == REC_STATUS_NODE_PTR
+ || status == REC_STATUS_INSTANT);
+ unsigned n_core_fields = redundant_temp
+ ? row_log_get_n_core_fields(index)
+ : index->n_core_fields;
+
+ if (mblob) {
+ ut_ad(index->table->instant);
+ ut_ad(!redundant_temp && index->is_instant());
+ ut_ad(status == REC_STATUS_INSTANT);
+ ut_ad(n_fields == ulint(index->n_fields) + 1);
+ extra_size += UT_BITS_IN_BYTES(index->n_nullable)
+ + rec_get_n_add_field_len(n_fields - 1
+ - n_core_fields);
+ } else if (status == REC_STATUS_INSTANT
+ && (!temp || n_fields > n_core_fields)) {
+ if (!redundant_temp) { ut_ad(index->is_instant()); }
+ ut_ad(UT_BITS_IN_BYTES(n_null) >= index->n_core_null_bytes);
+ extra_size += UT_BITS_IN_BYTES(index->get_n_nullable(n_fields))
+ + rec_get_n_add_field_len(n_fields - 1
+ - n_core_fields);
+ } else {
+ ut_ad(n_fields <= n_core_fields);
+ extra_size += redundant_temp
+ ? UT_BITS_IN_BYTES(index->n_nullable)
+ : index->n_core_null_bytes;
+ }
+
+ ulint data_size = 0;
+
+ if (temp && dict_table_is_comp(index->table)) {
+		/* No need to adjust fixed_len=0. We only need to
+		adjust it for ROW_FORMAT=REDUNDANT. */
+ temp = false;
+ }
+
+ const dfield_t* const end = dfield + n_fields;
+ /* read the lengths of fields 0..n */
+ for (ulint i = 0; dfield < end; i++, dfield++) {
+ if (mblob && i == index->first_user_field()) {
+ data_size += FIELD_REF_SIZE;
+ if (++dfield == end) {
+ ut_ad(i == index->n_fields);
+ break;
+ }
+ }
+
+ ulint len = dfield_get_len(dfield);
+
+ const dict_field_t* field = dict_index_get_nth_field(index, i);
+#ifdef UNIV_DEBUG
+ if (dict_index_is_spatial(index)) {
+ if (DATA_GEOMETRY_MTYPE(field->col->mtype) && i == 0) {
+ ut_ad(dfield->type.prtype & DATA_GIS_MBR);
+ } else {
+ ut_ad(dfield->type.mtype == DATA_SYS_CHILD
+ || dict_col_type_assert_equal(
+ field->col, &dfield->type));
+ }
+ } else {
+ ut_ad(field->col->is_dropped()
+ || dict_col_type_assert_equal(field->col,
+ &dfield->type));
+ }
+#endif
+
+ /* All NULLable fields must be included in the n_null count. */
+ ut_ad(!field->col->is_nullable() || n_null--);
+
+ if (dfield_is_null(dfield)) {
+ /* No length is stored for NULL fields. */
+ ut_ad(field->col->is_nullable());
+ continue;
+ }
+
+ ut_ad(len <= field->col->len
+ || DATA_LARGE_MTYPE(field->col->mtype)
+ || (field->col->len == 0
+ && field->col->mtype == DATA_VARCHAR));
+
+ ulint fixed_len = field->fixed_len;
+ if (temp && fixed_len
+ && !dict_col_get_fixed_size(field->col, temp)) {
+ fixed_len = 0;
+ }
+ /* If the maximum length of a variable-length field
+ is up to 255 bytes, the actual length is always stored
+ in one byte. If the maximum length is more than 255
+ bytes, the actual length is stored in one byte for
+ 0..127. The length will be encoded in two bytes when
+ it is 128 or more, or when the field is stored externally. */
+
+ if (fixed_len) {
+#ifdef UNIV_DEBUG
+ ut_ad(len <= fixed_len);
+
+ if (dict_index_is_spatial(index)) {
+ ut_ad(dfield->type.mtype == DATA_SYS_CHILD
+ || !field->col->mbmaxlen
+ || len >= field->col->mbminlen
+ * fixed_len / field->col->mbmaxlen);
+ } else {
+ ut_ad(dfield->type.mtype != DATA_SYS_CHILD);
+
+ ut_ad(field->col->is_dropped()
+ || !field->col->mbmaxlen
+ || len >= field->col->mbminlen
+ * fixed_len / field->col->mbmaxlen);
+ }
+
+ /* dict_index_add_col() should guarantee this */
+ ut_ad(!field->prefix_len
+ || fixed_len == field->prefix_len);
+#endif /* UNIV_DEBUG */
+ } else if (dfield_is_ext(dfield)) {
+ ut_ad(DATA_BIG_COL(field->col));
+ extra_size += 2;
+ } else if (len < 128 || !DATA_BIG_COL(field->col)) {
+ extra_size++;
+ } else {
+ /* For variable-length columns, we look up the
+ maximum length from the column itself. If this
+ is a prefix index column shorter than 256 bytes,
+ this will waste one byte. */
+ extra_size += 2;
+ }
+ data_size += len;
+ }
+
+ if (extra) {
+ *extra = extra_size;
+ }
+
+ return(extra_size + data_size);
+}
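+
+/* Worked example (illustrative): for a COMPACT leaf record with the
+fields (INT NOT NULL, nullable VARCHAR = "abc"), the size would be
+REC_N_NEW_EXTRA_BYTES + 1 null-flag byte + 1 length byte = an extra
+size of 7 bytes, plus 4 + 3 = 7 bytes of data, 14 bytes in total. */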
+
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return total size */
+ulint
+rec_get_converted_size_comp_prefix(
+/*===============================*/
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra) /*!< out: extra size */
+{
+ ut_ad(dict_table_is_comp(index->table));
+ return(rec_get_converted_size_comp_prefix_low(
+ index, fields, n_fields, extra,
+ REC_STATUS_ORDINARY, false));
+}
+
+/** Determine the size of a record in ROW_FORMAT=COMPACT.
+@param[in] index record descriptor. dict_table_is_comp()
+ is assumed to hold, even if it doesn't
+@param[in] tuple logical record
+@param[out] extra extra size
+@return total size */
+ulint
+rec_get_converted_size_comp(
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ ulint* extra)
+{
+ ut_ad(tuple->n_fields > 0);
+
+ rec_comp_status_t status = rec_comp_status_t(tuple->info_bits
+ & REC_NEW_STATUS_MASK);
+
+ switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
+ case REC_STATUS_ORDINARY:
+ ut_ad(!tuple->is_metadata());
+ if (tuple->n_fields > index->n_core_fields) {
+ ut_ad(index->is_instant());
+ status = REC_STATUS_INSTANT;
+ }
+ /* fall through */
+ case REC_STATUS_INSTANT:
+ ut_ad(tuple->n_fields >= index->n_core_fields);
+ if (tuple->is_alter_metadata()) {
+ return rec_get_converted_size_comp_prefix_low<true>(
+ index, tuple->fields, tuple->n_fields,
+ extra, status, false);
+ }
+ ut_ad(tuple->n_fields <= index->n_fields);
+ return rec_get_converted_size_comp_prefix_low(
+ index, tuple->fields, tuple->n_fields,
+ extra, status, false);
+ case REC_STATUS_NODE_PTR:
+ ut_ad(tuple->n_fields - 1
+ == dict_index_get_n_unique_in_tree_nonleaf(index));
+ ut_ad(dfield_get_len(&tuple->fields[tuple->n_fields - 1])
+ == REC_NODE_PTR_SIZE);
+ return REC_NODE_PTR_SIZE /* child page number */
+ + rec_get_converted_size_comp_prefix_low(
+ index, tuple->fields, tuple->n_fields - 1,
+ extra, status, false);
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ /* not supported */
+ break;
+ }
+
+ ut_error;
+ return(ULINT_UNDEFINED);
+}
+
+/*********************************************************//**
+Builds an old-style physical record out of a data tuple and
+stores it beginning from the start of the given buffer.
+@return pointer to the origin of physical record */
+static
+rec_t*
+rec_convert_dtuple_to_rec_old(
+/*==========================*/
+ byte* buf, /*!< in: start address of the physical record */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ const dfield_t* field;
+ ulint n_fields;
+ ulint data_size;
+ rec_t* rec;
+ ulint end_offset;
+ ulint ored_offset;
+ ulint len;
+ ulint i;
+
+ ut_ad(buf && dtuple);
+ ut_ad(dtuple_validate(dtuple));
+ ut_ad(dtuple_check_typed(dtuple));
+
+ n_fields = dtuple_get_n_fields(dtuple);
+ data_size = dtuple_get_data_size(dtuple, 0);
+
+ ut_ad(n_fields > 0);
+
+ /* Calculate the offset of the origin in the physical record */
+
+ rec = buf + rec_get_converted_extra_size(data_size, n_fields, n_ext);
+ /* Store the number of fields */
+ rec_set_n_fields_old(rec, n_fields);
+
+ /* Set the info bits of the record */
+ rec_set_bit_field_1(rec,
+ dtuple_get_info_bits(dtuple) & REC_INFO_BITS_MASK,
+ REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+ rec_set_bit_field_2(rec, PAGE_HEAP_NO_USER_LOW, REC_OLD_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+
+ /* Store the data and the offsets */
+
+ end_offset = 0;
+
+ if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) {
+
+ rec_set_1byte_offs_flag(rec, TRUE);
+
+ for (i = 0; i < n_fields; i++) {
+
+ field = dtuple_get_nth_field(dtuple, i);
+
+ if (dfield_is_null(field)) {
+ len = dtype_get_sql_null_size(
+ dfield_get_type(field), 0);
+ data_write_sql_null(rec + end_offset, len);
+
+ end_offset += len;
+ ored_offset = end_offset
+ | REC_1BYTE_SQL_NULL_MASK;
+ } else {
+ /* If the data is not SQL null, store it */
+ len = dfield_get_len(field);
+
+				if (len) {
+					memcpy(rec + end_offset,
+					       dfield_get_data(field), len);
+				}
+
+ end_offset += len;
+ ored_offset = end_offset;
+ }
+
+ rec_1_set_field_end_info(rec, i, ored_offset);
+ }
+ } else {
+ rec_set_1byte_offs_flag(rec, FALSE);
+
+ for (i = 0; i < n_fields; i++) {
+
+ field = dtuple_get_nth_field(dtuple, i);
+
+ if (dfield_is_null(field)) {
+ len = dtype_get_sql_null_size(
+ dfield_get_type(field), 0);
+ data_write_sql_null(rec + end_offset, len);
+
+ end_offset += len;
+ ored_offset = end_offset
+ | REC_2BYTE_SQL_NULL_MASK;
+ } else {
+ /* If the data is not SQL null, store it */
+ len = dfield_get_len(field);
+
+				if (len) {
+					memcpy(rec + end_offset,
+					       dfield_get_data(field), len);
+				}
+
+ end_offset += len;
+ ored_offset = end_offset;
+
+ if (dfield_is_ext(field)) {
+ ored_offset |= REC_2BYTE_EXTERN_MASK;
+ }
+ }
+
+ rec_2_set_field_end_info(rec, i, ored_offset);
+ }
+ }
+
+ return(rec);
+}
+
+/** Convert a data tuple into a ROW_FORMAT=COMPACT record.
+@tparam mblob whether the record includes a metadata BLOB
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format
+@param[out] rec converted record
+@param[in] index index
+@param[in] field data fields to convert
+@param[in] n_fields number of data fields
+@param[in] status rec_get_status(rec)
+@param[in] temp whether to use the format for temporary files
+ in index creation */
+template<bool mblob = false, bool redundant_temp = false>
+static inline
+void
+rec_convert_dtuple_to_rec_comp(
+ rec_t* rec,
+ const dict_index_t* index,
+ const dfield_t* field,
+ ulint n_fields,
+ rec_comp_status_t status,
+ bool temp)
+{
+ byte* end;
+ byte* nulls = temp
+ ? rec - 1 : rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ byte* UNINIT_VAR(lens);
+ ulint UNINIT_VAR(n_node_ptr_field);
+ ulint null_mask = 1;
+ const ulint n_core_fields = redundant_temp
+ ? row_log_get_n_core_fields(index)
+ : index->n_core_fields;
+ ut_ad(n_fields > 0);
+ ut_ad(temp || dict_table_is_comp(index->table));
+ ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable));
+
+ ut_d(ulint n_null = index->n_nullable);
+
+ if (mblob) {
+ ut_ad(!temp);
+ ut_ad(index->table->instant);
+ ut_ad(!redundant_temp && index->is_instant());
+ ut_ad(status == REC_STATUS_INSTANT);
+ ut_ad(n_fields == ulint(index->n_fields) + 1);
+ rec_set_n_add_field(nulls, n_fields - 1 - n_core_fields);
+ rec_set_bit_field_2(rec, PAGE_HEAP_NO_USER_LOW,
+ REC_NEW_HEAP_NO, REC_HEAP_NO_MASK,
+ REC_HEAP_NO_SHIFT);
+ rec_set_status(rec, REC_STATUS_INSTANT);
+ n_node_ptr_field = ULINT_UNDEFINED;
+ lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+ goto start;
+ }
+ switch (status) {
+ case REC_STATUS_INSTANT:
+ if (!redundant_temp) { ut_ad(index->is_instant()); }
+ ut_ad(n_fields > n_core_fields);
+ rec_set_n_add_field(nulls, n_fields - 1 - n_core_fields);
+ /* fall through */
+ case REC_STATUS_ORDINARY:
+ ut_ad(n_fields <= dict_index_get_n_fields(index));
+ if (!temp) {
+ rec_set_bit_field_2(rec, PAGE_HEAP_NO_USER_LOW,
+ REC_NEW_HEAP_NO, REC_HEAP_NO_MASK,
+ REC_HEAP_NO_SHIFT);
+ rec_set_status(rec, n_fields == n_core_fields
+ ? REC_STATUS_ORDINARY
+ : REC_STATUS_INSTANT);
+ }
+
+ if (dict_table_is_comp(index->table)) {
+			/* No need to adjust fixed_len=0 here. We only
+			need to adjust it for ROW_FORMAT=REDUNDANT. */
+ temp = false;
+ }
+
+ n_node_ptr_field = ULINT_UNDEFINED;
+
+ lens = nulls - (index->is_instant()
+ ? UT_BITS_IN_BYTES(index->get_n_nullable(
+ n_fields))
+ : UT_BITS_IN_BYTES(
+ unsigned(index->n_nullable)));
+ break;
+ case REC_STATUS_NODE_PTR:
+ ut_ad(!temp);
+ rec_set_bit_field_2(rec, PAGE_HEAP_NO_USER_LOW,
+ REC_NEW_HEAP_NO, REC_HEAP_NO_MASK,
+ REC_HEAP_NO_SHIFT);
+ rec_set_status(rec, status);
+ ut_ad(n_fields - 1
+ == dict_index_get_n_unique_in_tree_nonleaf(index));
+ ut_d(n_null = std::min<uint>(index->n_core_null_bytes * 8U,
+ index->n_nullable));
+ n_node_ptr_field = n_fields - 1;
+ lens = nulls - index->n_core_null_bytes;
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ ut_error;
+ return;
+ }
+
+start:
+ end = rec;
+ /* clear the SQL-null flags */
+ memset(lens + 1, 0, ulint(nulls - lens));
+
+ const dfield_t* const fend = field + n_fields;
+ /* Store the data and the offsets */
+ for (ulint i = 0; field < fend; i++, field++) {
+ ulint len = dfield_get_len(field);
+
+ if (mblob) {
+ if (i == index->first_user_field()) {
+ ut_ad(len == FIELD_REF_SIZE);
+ ut_ad(dfield_is_ext(field));
+ memcpy(end, dfield_get_data(field), len);
+ end += len;
+ if (++field == fend) {
+ ut_ad(i == index->n_fields);
+ break;
+ }
+ len = dfield_get_len(field);
+ }
+ } else if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
+ ut_ad(field->type.prtype & DATA_NOT_NULL);
+ ut_ad(len == REC_NODE_PTR_SIZE);
+ memcpy(end, dfield_get_data(field), len);
+ end += REC_NODE_PTR_SIZE;
+ break;
+ }
+
+ if (!(field->type.prtype & DATA_NOT_NULL)) {
+ /* nullable field */
+ ut_ad(n_null--);
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ ut_ad(*nulls < null_mask);
+
+ /* set the null flag if necessary */
+ if (dfield_is_null(field)) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+ *nulls |= static_cast<byte>(null_mask);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ null_mask <<= 1;
+ continue;
+ }
+
+ null_mask <<= 1;
+ }
+ /* only nullable fields can be null */
+ ut_ad(!dfield_is_null(field));
+
+ const dict_field_t* ifield
+ = dict_index_get_nth_field(index, i);
+ ulint fixed_len = ifield->fixed_len;
+
+ if (temp && fixed_len
+ && !dict_col_get_fixed_size(ifield->col, temp)) {
+ fixed_len = 0;
+ }
+
+ /* If the maximum length of a variable-length field
+ is up to 255 bytes, the actual length is always stored
+ in one byte. If the maximum length is more than 255
+ bytes, the actual length is stored in one byte for
+ 0..127. The length will be encoded in two bytes when
+ it is 128 or more, or when the field is stored externally. */
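+		/* For example: len=70 fits in the single length byte
+		0x46, while len=300 would be stored as the two bytes
+		0x81,0x2C (0x80 | (300 >> 8) written first, then
+		300 & 0xFF). */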
+ if (fixed_len) {
+ ut_ad(len <= fixed_len);
+ ut_ad(!ifield->col->mbmaxlen
+ || len >= ifield->col->mbminlen
+ * fixed_len / ifield->col->mbmaxlen);
+ ut_ad(!dfield_is_ext(field));
+ } else if (dfield_is_ext(field)) {
+ ut_ad(DATA_BIG_COL(ifield->col));
+ ut_ad(len <= REC_ANTELOPE_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ *lens-- = static_cast<byte>(len >> 8 | 0xc0);
+ *lens-- = static_cast<byte>(len);
+ } else {
+ ut_ad(len <= field->type.len
+ || DATA_LARGE_MTYPE(field->type.mtype)
+ || !strcmp(index->name,
+ FTS_INDEX_TABLE_IND_NAME));
+ if (len < 128 || !DATA_BIG_LEN_MTYPE(
+ field->type.len, field->type.mtype)) {
+ *lens-- = static_cast<byte>(len);
+ } else {
+ ut_ad(len < 16384);
+ *lens-- = static_cast<byte>(len >> 8 | 0x80);
+ *lens-- = static_cast<byte>(len);
+ }
+ }
+
+ if (len) {
+ memcpy(end, dfield_get_data(field), len);
+ end += len;
+ }
+ }
+}
+
+/*********************************************************//**
+Builds a new-style physical record out of a data tuple and
+stores it beginning from the start of the given buffer.
+@return pointer to the origin of physical record */
+static
+rec_t*
+rec_convert_dtuple_to_rec_new(
+/*==========================*/
+ byte* buf, /*!< in: start address of
+ the physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple) /*!< in: data tuple */
+{
+ ut_ad(!(dtuple->info_bits
+ & ~(REC_NEW_STATUS_MASK | REC_INFO_DELETED_FLAG
+ | REC_INFO_MIN_REC_FLAG)));
+
+ ulint extra_size;
+
+ if (UNIV_UNLIKELY(dtuple->is_alter_metadata())) {
+ ut_ad((dtuple->info_bits & REC_NEW_STATUS_MASK)
+ == REC_STATUS_INSTANT);
+ rec_get_converted_size_comp_prefix_low<true>(
+ index, dtuple->fields, dtuple->n_fields,
+ &extra_size, REC_STATUS_INSTANT, false);
+ buf += extra_size;
+ rec_convert_dtuple_to_rec_comp<true>(
+ buf, index, dtuple->fields, dtuple->n_fields,
+ REC_STATUS_INSTANT, false);
+ } else {
+ rec_get_converted_size_comp(index, dtuple, &extra_size);
+ buf += extra_size;
+ rec_comp_status_t status = rec_comp_status_t(
+ dtuple->info_bits & REC_NEW_STATUS_MASK);
+ if (status == REC_STATUS_ORDINARY
+ && dtuple->n_fields > index->n_core_fields) {
+ ut_ad(index->is_instant());
+ status = REC_STATUS_INSTANT;
+ }
+
+ rec_convert_dtuple_to_rec_comp(
+ buf, index, dtuple->fields, dtuple->n_fields,
+ status, false);
+ }
+
+ rec_set_bit_field_1(buf, dtuple->info_bits & ~REC_NEW_STATUS_MASK,
+ REC_NEW_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+ return buf;
+}
+
+/*********************************************************//**
+Builds a physical record out of a data tuple and
+stores it beginning from the start of the given buffer.
+@return pointer to the origin of physical record */
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+ byte* buf, /*!< in: start address of the
+ physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of
+ externally stored columns */
+{
+ rec_t* rec;
+
+ ut_ad(buf != NULL);
+ ut_ad(index != NULL);
+ ut_ad(dtuple != NULL);
+ ut_ad(dtuple_validate(dtuple));
+ ut_ad(dtuple_check_typed(dtuple));
+
+ if (dict_table_is_comp(index->table)) {
+ rec = rec_convert_dtuple_to_rec_new(buf, index, dtuple);
+ } else {
+ rec = rec_convert_dtuple_to_rec_old(buf, dtuple, n_ext);
+ }
+
+ return(rec);
+}
+
+/** Determine the size of a data tuple prefix in a temporary file.
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format
+@param[in] index clustered or secondary index
+@param[in] fields data fields
+@param[in] n_fields number of data fields
+@param[out] extra record header size
+@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT
+@return total size, in bytes */
+template<bool redundant_temp>
+ulint
+rec_get_converted_size_temp(
+ const dict_index_t* index,
+ const dfield_t* fields,
+ ulint n_fields,
+ ulint* extra,
+ rec_comp_status_t status)
+{
+ return rec_get_converted_size_comp_prefix_low<false,redundant_temp>(
+ index, fields, n_fields, extra, status, true);
+}
+
+template ulint rec_get_converted_size_temp<false>(
+ const dict_index_t*, const dfield_t*, ulint, ulint*,
+ rec_comp_status_t);
+
+template ulint rec_get_converted_size_temp<true>(
+ const dict_index_t*, const dfield_t*, ulint, ulint*,
+ rec_comp_status_t);
+
+/** Determine the offset to each field in temporary file.
+@param[in] rec temporary file record
+@param[in]	index	index that the record belongs to
+@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets)
+@param[in] n_core number of core fields (index->n_core_fields)
+@param[in] def_val default values for non-core fields
+@param[in] status REC_STATUS_ORDINARY or REC_STATUS_INSTANT */
+void
+rec_init_offsets_temp(
+ const rec_t* rec,
+ const dict_index_t* index,
+ rec_offs* offsets,
+ ulint n_core,
+	const dict_col_t::def_t* def_val,
+ rec_comp_status_t status)
+{
+ ut_ad(status == REC_STATUS_ORDINARY
+ || status == REC_STATUS_INSTANT);
+ /* The table may have been converted to plain format
+ if it was emptied during an ALTER TABLE operation. */
+ ut_ad(index->n_core_fields == n_core || !index->is_instant());
+ ut_ad(index->n_core_fields >= n_core);
+ if (index->table->not_redundant()) {
+ rec_init_offsets_comp_ordinary(
+ rec, index, offsets, n_core, def_val,
+ status == REC_STATUS_INSTANT
+ ? REC_LEAF_TEMP_INSTANT
+ : REC_LEAF_TEMP);
+ } else {
+ rec_init_offsets_comp_ordinary<false, true>(
+ rec, index, offsets, n_core, def_val,
+ status == REC_STATUS_INSTANT
+ ? REC_LEAF_TEMP_INSTANT
+ : REC_LEAF_TEMP);
+ }
+}
+
+/** Determine the offset to each field in temporary file.
+@param[in] rec temporary file record
+@param[in]	index	index that the record belongs to
+@param[in,out]	offsets	offsets to the fields; in: rec_offs_n_fields(offsets) */
+void
+rec_init_offsets_temp(
+ const rec_t* rec,
+ const dict_index_t* index,
+ rec_offs* offsets)
+{
+ ut_ad(!index->is_instant());
+ if (index->table->not_redundant()) {
+ rec_init_offsets_comp_ordinary(
+ rec, index, offsets,
+ index->n_core_fields, NULL, REC_LEAF_TEMP);
+ } else {
+ rec_init_offsets_comp_ordinary<false, true>(
+ rec, index, offsets,
+ index->n_core_fields, NULL, REC_LEAF_TEMP);
+ }
+}
+
+/** Convert a data tuple prefix to the temporary file format.
+@tparam redundant_temp	whether to use the ROW_FORMAT=REDUNDANT format
+@param[out]	rec		record in temporary file format
+@param[in]	index		clustered or secondary index
+@param[in]	fields		data fields
+@param[in]	n_fields	number of data fields
+@param[in]	status		REC_STATUS_ORDINARY or REC_STATUS_INSTANT */
+template<bool redundant_temp>
+void
+rec_convert_dtuple_to_temp(
+ rec_t* rec,
+ const dict_index_t* index,
+ const dfield_t* fields,
+ ulint n_fields,
+ rec_comp_status_t status)
+{
+ rec_convert_dtuple_to_rec_comp<false,redundant_temp>(
+ rec, index, fields, n_fields, status, true);
+}
+
+template void rec_convert_dtuple_to_temp<false>(
+ rec_t*, const dict_index_t*, const dfield_t*,
+ ulint, rec_comp_status_t);
+
+template void rec_convert_dtuple_to_temp<true>(
+ rec_t*, const dict_index_t*, const dfield_t*,
+ ulint, rec_comp_status_t);
+
+/** Copy the first n fields of a (copy of a) physical record to a data tuple.
+The fields are copied into the memory heap.
+@param[out] tuple data tuple
+@param[in] rec index record, or a copy thereof
+@param[in] index index of rec
+@param[in] n_core index->n_core_fields at the time rec was
+ copied, or 0 if non-leaf page record
+@param[in] n_fields number of fields to copy
+@param[in,out] heap memory heap */
+void
+rec_copy_prefix_to_dtuple(
+ dtuple_t* tuple,
+ const rec_t* rec,
+ const dict_index_t* index,
+ ulint n_core,
+ ulint n_fields,
+ mem_heap_t* heap)
+{
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(n_core <= index->n_core_fields);
+ ut_ad(n_core || n_fields - 1
+ <= dict_index_get_n_unique_in_tree_nonleaf(index));
+
+ offsets = rec_get_offsets(rec, index, offsets, n_core,
+ n_fields, &heap);
+
+ ut_ad(rec_validate(rec, offsets));
+ ut_ad(!rec_offs_any_default(offsets));
+ ut_ad(dtuple_check_typed(tuple));
+
+ tuple->info_bits = rec_get_info_bits(rec, rec_offs_comp(offsets));
+
+ for (ulint i = 0; i < n_fields; i++) {
+ dfield_t* field;
+ const byte* data;
+ ulint len;
+
+ field = dtuple_get_nth_field(tuple, i);
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ dfield_set_data(field,
+ mem_heap_dup(heap, data, len), len);
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ } else {
+ dfield_set_null(field);
+ }
+ }
+}
+
+/**************************************************************//**
+Copies the first n fields of an old-style physical record
+to a new physical record in a buffer.
+@return own: copied record */
+static
+rec_t*
+rec_copy_prefix_to_buf_old(
+/*=======================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint n_fields, /*!< in: number of fields to copy */
+ ulint area_end, /*!< in: end of the prefix data */
+ byte** buf, /*!< in/out: memory buffer for
+ the copied prefix, or NULL */
+ ulint* buf_size) /*!< in/out: buffer size */
+{
+ rec_t* copy_rec;
+ ulint area_start;
+ ulint prefix_len;
+
+ if (rec_get_1byte_offs_flag(rec)) {
+ area_start = REC_N_OLD_EXTRA_BYTES + n_fields;
+ } else {
+ area_start = REC_N_OLD_EXTRA_BYTES + 2 * n_fields;
+ }
+
+ prefix_len = area_start + area_end;
+
+ if ((*buf == NULL) || (*buf_size < prefix_len)) {
+ ut_free(*buf);
+ *buf_size = prefix_len;
+ *buf = static_cast<byte*>(ut_malloc_nokey(prefix_len));
+ }
+
+ memcpy(*buf, rec - area_start, prefix_len);
+
+ copy_rec = *buf + area_start;
+
+ rec_set_n_fields_old(copy_rec, n_fields);
+
+ return(copy_rec);
+}
+
+/**************************************************************//**
+Copies the first n fields of a physical record to a new physical record in
+a buffer.
+@return own: copied record */
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n_fields, /*!< in: number of fields
+ to copy */
+ byte** buf, /*!< in/out: memory buffer
+ for the copied prefix,
+ or NULL */
+ ulint* buf_size) /*!< in/out: buffer size */
+{
+ ut_ad(n_fields <= index->n_fields || dict_index_is_ibuf(index));
+ ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable));
+ UNIV_PREFETCH_RW(*buf);
+
+ if (!dict_table_is_comp(index->table)) {
+ ut_ad(rec_validate_old(rec));
+ return(rec_copy_prefix_to_buf_old(
+ rec, n_fields,
+ rec_get_field_start_offs(rec, n_fields),
+ buf, buf_size));
+ }
+
+ ulint prefix_len = 0;
+ ulint instant_omit = 0;
+ const byte* nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ const byte* nullf = nulls;
+ const byte* lens = nulls - index->n_core_null_bytes;
+
+ switch (rec_get_status(rec)) {
+ default:
+ /* infimum or supremum record: no sense to copy anything */
+ ut_error;
+ return(NULL);
+ case REC_STATUS_ORDINARY:
+ ut_ad(n_fields <= index->n_core_fields);
+ break;
+ case REC_STATUS_NODE_PTR:
+ /* For R-tree, we need to copy the child page number field. */
+ compile_time_assert(DICT_INDEX_SPATIAL_NODEPTR_SIZE == 1);
+ if (dict_index_is_spatial(index)) {
+ ut_ad(index->n_core_null_bytes == 0);
+ ut_ad(n_fields == DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1);
+ ut_ad(index->fields[0].col->prtype & DATA_NOT_NULL);
+ ut_ad(DATA_BIG_COL(index->fields[0].col));
+ /* This is a deficiency of the format introduced
+ in MySQL 5.7. The length in the R-tree index should
+ always be DATA_MBR_LEN. */
+ ut_ad(!index->fields[0].fixed_len);
+ ut_ad(*lens == DATA_MBR_LEN);
+ lens--;
+ prefix_len = DATA_MBR_LEN + REC_NODE_PTR_SIZE;
+ n_fields = 0; /* skip the "for" loop below */
+ break;
+ }
+ /* it doesn't make sense to copy the child page number field */
+ ut_ad(n_fields
+ <= dict_index_get_n_unique_in_tree_nonleaf(index));
+ break;
+ case REC_STATUS_INSTANT:
+ /* We would have !index->is_instant() when rolling back
+ an instant ADD COLUMN operation. */
+ ut_ad(index->is_instant() || page_rec_is_metadata(rec));
+ ut_ad(n_fields <= index->first_user_field());
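+		/* Skip the 1- or 2-byte n_add_field header that
+		follows the fixed header in REC_STATUS_INSTANT
+		records, and omit the null-flag bytes of any
+		instantly added columns when computing the size of
+		the converted prefix. */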
+ nulls++;
+ const ulint n_rec = ulint(index->n_core_fields) + 1
+ + rec_get_n_add_field(nulls)
+ - rec_is_alter_metadata(rec, true);
+ instant_omit = ulint(&rec[-REC_N_NEW_EXTRA_BYTES] - nulls);
+ ut_ad(instant_omit == 1 || instant_omit == 2);
+ nullf = nulls;
+ const uint nb = UT_BITS_IN_BYTES(index->get_n_nullable(n_rec));
+ instant_omit += nb - index->n_core_null_bytes;
+ lens = --nulls - nb;
+ }
+
+ const byte* const lenf = lens;
+ UNIV_PREFETCH_R(lens);
+
+ /* read the lengths of fields 0..n */
+ for (ulint i = 0, null_mask = 1; i < n_fields; i++) {
+ const dict_field_t* field;
+ const dict_col_t* col;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ continue;
+ }
+
+ null_mask <<= 1;
+ }
+
+ if (field->fixed_len) {
+ prefix_len += field->fixed_len;
+ } else {
+ ulint len = *lens--;
+ /* If the maximum length of the column is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the column is stored externally. */
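+			/* For example, the byte 0x46 decodes to
+			len=70, while the pair 0x81,0x2C decodes to
+			((0x81 & 0x3f) << 8) | 0x2C = 300. */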
+ if (DATA_BIG_COL(col)) {
+ if (len & 0x80) {
+ /* 1exxxxxx */
+ len &= 0x3f;
+ len <<= 8;
+ len |= *lens--;
+ UNIV_PREFETCH_R(lens);
+ }
+ }
+ prefix_len += len;
+ }
+ }
+
+ UNIV_PREFETCH_R(rec + prefix_len);
+
+ ulint size = prefix_len + ulint(rec - (lens + 1)) - instant_omit;
+
+ if (*buf == NULL || *buf_size < size) {
+ ut_free(*buf);
+ *buf_size = size;
+ *buf = static_cast<byte*>(ut_malloc_nokey(size));
+ }
+
+ if (instant_omit) {
+ /* Copy and convert the record header to a format where
+ instant ADD COLUMN has not been used:
+ + lengths of variable-length fields in the prefix
+ - omit any null flag bytes for any instantly added columns
+ + index->n_core_null_bytes of null flags
+ - omit the n_add_fields header (1 or 2 bytes)
+ + REC_N_NEW_EXTRA_BYTES of fixed header */
+ byte* b = *buf;
+ /* copy the lengths of the variable-length fields */
+ memcpy(b, lens + 1, ulint(lenf - lens));
+ b += ulint(lenf - lens);
+ /* copy the null flags */
+ memcpy(b, nullf - index->n_core_null_bytes,
+ index->n_core_null_bytes);
+ b += index->n_core_null_bytes + REC_N_NEW_EXTRA_BYTES;
+ ut_ad(ulint(b - *buf) + prefix_len == size);
+ /* copy the fixed-size header and the record prefix */
+ memcpy(b - REC_N_NEW_EXTRA_BYTES, rec - REC_N_NEW_EXTRA_BYTES,
+ prefix_len + REC_N_NEW_EXTRA_BYTES);
+ ut_ad(rec_get_status(b) == REC_STATUS_INSTANT);
+ rec_set_status(b, REC_STATUS_ORDINARY);
+ return b;
+ } else {
+ memcpy(*buf, lens + 1, size);
+ return *buf + (rec - (lens + 1));
+ }
+}
+
+/***************************************************************//**
+Validates the consistency of an old-style physical record.
+@return TRUE if ok */
+static
+ibool
+rec_validate_old(
+/*=============*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ulint len;
+ ulint n_fields;
+ ulint len_sum = 0;
+ ulint i;
+
+ ut_a(rec);
+ n_fields = rec_get_n_fields_old(rec);
+
+ if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) {
+ ib::error() << "Record has " << n_fields << " fields";
+ return(FALSE);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ rec_get_nth_field_offs_old(rec, i, &len);
+
+ if (!((len < srv_page_size) || (len == UNIV_SQL_NULL))) {
+ ib::error() << "Record field " << i << " len " << len;
+ return(FALSE);
+ }
+
+ if (len != UNIV_SQL_NULL) {
+ len_sum += len;
+ } else {
+ len_sum += rec_get_nth_field_size(rec, i);
+ }
+ }
+
+ if (len_sum != rec_get_data_size_old(rec)) {
+ ib::error() << "Record len should be " << len_sum << ", len "
+ << rec_get_data_size_old(rec);
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/***************************************************************//**
+Validates the consistency of a physical record.
+@return TRUE if ok */
+ibool
+rec_validate(
+/*=========*/
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint len;
+ ulint n_fields;
+ ulint len_sum = 0;
+ ulint i;
+
+ n_fields = rec_offs_n_fields(offsets);
+
+ if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) {
+ ib::error() << "Record has " << n_fields << " fields";
+ return(FALSE);
+ }
+
+ ut_a(rec_offs_any_flag(offsets, REC_OFFS_COMPACT | REC_OFFS_DEFAULT)
+ || n_fields <= rec_get_n_fields_old(rec));
+
+ for (i = 0; i < n_fields; i++) {
+ rec_get_nth_field_offs(offsets, i, &len);
+
+ switch (len) {
+ default:
+ if (len >= srv_page_size) {
+ ib::error() << "Record field " << i
+ << " len " << len;
+ return(FALSE);
+ }
+ len_sum += len;
+ break;
+ case UNIV_SQL_DEFAULT:
+ break;
+ case UNIV_SQL_NULL:
+ if (!rec_offs_comp(offsets)) {
+ len_sum += rec_get_nth_field_size(rec, i);
+ }
+ }
+ }
+
+ if (len_sum != rec_offs_data_size(offsets)) {
+ ib::error() << "Record len should be " << len_sum << ", len "
+ << rec_offs_data_size(offsets);
+ return(FALSE);
+ }
+
+ if (!rec_offs_comp(offsets)) {
+ ut_a(rec_validate_old(rec));
+ }
+
+ return(TRUE);
+}
+
+/***************************************************************//**
+Prints an old-style physical record. */
+void
+rec_print_old(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec) /*!< in: physical record */
+{
+ const byte* data;
+ ulint len;
+ ulint n;
+ ulint i;
+
+ n = rec_get_n_fields_old(rec);
+
+ fprintf(file, "PHYSICAL RECORD: n_fields " ULINTPF ";"
+ " %u-byte offsets; info bits %u\n",
+ n,
+ rec_get_1byte_offs_flag(rec) ? 1 : 2,
+ rec_get_info_bits(rec, FALSE));
+
+ for (i = 0; i < n; i++) {
+
+ data = rec_get_nth_field_old(rec, i, &len);
+
+ fprintf(file, " " ULINTPF ":", i);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len <= 30) {
+
+ ut_print_buf(file, data, len);
+ } else {
+ ut_print_buf(file, data, 30);
+
+ fprintf(file, " (total " ULINTPF " bytes)",
+ len);
+ }
+ } else {
+ fprintf(file, " SQL NULL, size " ULINTPF " ",
+ rec_get_nth_field_size(rec, i));
+ }
+
+ putc(';', file);
+ putc('\n', file);
+ }
+
+ rec_validate_old(rec);
+}
+
+/***************************************************************//**
+Prints a physical record in ROW_FORMAT=COMPACT. Ignores the
+record header. */
+static
+void
+rec_print_comp(
+/*===========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint i;
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ const byte* UNINIT_VAR(data);
+ ulint len;
+
+ if (rec_offs_nth_default(offsets, i)) {
+ len = UNIV_SQL_DEFAULT;
+ } else {
+ data = rec_get_nth_field(rec, offsets, i, &len);
+ }
+
+ fprintf(file, " " ULINTPF ":", i);
+
+ if (len == UNIV_SQL_NULL) {
+ fputs(" SQL NULL", file);
+ } else if (len == UNIV_SQL_DEFAULT) {
+ fputs(" SQL DEFAULT", file);
+ } else {
+ if (len <= 30) {
+
+ ut_print_buf(file, data, len);
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ ut_print_buf(file, data, 30);
+ fprintf(file,
+ " (total " ULINTPF " bytes, external)",
+ len);
+ ut_print_buf(file, data + len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ } else {
+ ut_print_buf(file, data, 30);
+
+ fprintf(file, " (total " ULINTPF " bytes)",
+ len);
+ }
+ }
+ putc(';', file);
+ putc('\n', file);
+ }
+}
+
+/***************************************************************//**
+Prints an old-style spatial index record. */
+static
+void
+rec_print_mbr_old(
+/*==============*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec) /*!< in: physical record */
+{
+ const byte* data;
+ ulint len;
+ ulint n;
+ ulint i;
+
+ ut_ad(rec);
+
+ n = rec_get_n_fields_old(rec);
+
+ fprintf(file, "PHYSICAL RECORD: n_fields %lu;"
+ " %u-byte offsets; info bits %lu\n",
+ (ulong) n,
+ rec_get_1byte_offs_flag(rec) ? 1 : 2,
+ (ulong) rec_get_info_bits(rec, FALSE));
+
+ for (i = 0; i < n; i++) {
+
+ data = rec_get_nth_field_old(rec, i, &len);
+
+ fprintf(file, " %lu:", (ulong) i);
+
+ if (len != UNIV_SQL_NULL) {
+ if (i == 0) {
+ fprintf(file, " MBR:");
+ for (; len > 0; len -= sizeof(double)) {
+ double d = mach_double_read(data);
+
+ if (len != sizeof(double)) {
+ fprintf(file, "%.2lf,", d);
+ } else {
+ fprintf(file, "%.2lf", d);
+ }
+
+ data += sizeof(double);
+ }
+ } else {
+ if (len <= 30) {
+
+ ut_print_buf(file, data, len);
+ } else {
+ ut_print_buf(file, data, 30);
+
+ fprintf(file, " (total %lu bytes)",
+ (ulong) len);
+ }
+ }
+ } else {
+ fprintf(file, " SQL NULL, size " ULINTPF " ",
+ rec_get_nth_field_size(rec, i));
+ }
+
+ putc(';', file);
+ putc('\n', file);
+ }
+
+ if (rec_get_deleted_flag(rec, false)) {
+ fprintf(file, " Deleted");
+ }
+
+ if (rec_get_info_bits(rec, true) & REC_INFO_MIN_REC_FLAG) {
+ fprintf(file, " First rec");
+ }
+
+ rec_validate_old(rec);
+}
+
+/***************************************************************//**
+Prints a spatial index record. */
+void
+rec_print_mbr_rec(
+/*==============*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(!rec_offs_any_default(offsets));
+
+ if (!rec_offs_comp(offsets)) {
+ rec_print_mbr_old(file, rec);
+ return;
+ }
+
+ for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) {
+ const byte* data;
+ ulint len;
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (i == 0) {
+ fprintf(file, " MBR:");
+ for (; len > 0; len -= sizeof(double)) {
+ double d = mach_double_read(data);
+
+ if (len != sizeof(double)) {
+ fprintf(file, "%.2lf,", d);
+ } else {
+ fprintf(file, "%.2lf", d);
+ }
+
+ data += sizeof(double);
+ }
+ } else {
+ fprintf(file, " %lu:", (ulong) i);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len <= 30) {
+
+ ut_print_buf(file, data, len);
+ } else {
+ ut_print_buf(file, data, 30);
+
+ fprintf(file, " (total %lu bytes)",
+ (ulong) len);
+ }
+ } else {
+ fputs(" SQL NULL", file);
+ }
+ }
+ putc(';', file);
+ }
+
+ if (rec_get_info_bits(rec, true) & REC_INFO_DELETED_FLAG) {
+ fprintf(file, " Deleted");
+ }
+
+ if (rec_get_info_bits(rec, true) & REC_INFO_MIN_REC_FLAG) {
+ fprintf(file, " First rec");
+ }
+
+ rec_validate(rec, offsets);
+}
+
+/***************************************************************//**
+Prints a physical record. */
+void
+rec_print_new(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+#ifdef UNIV_DEBUG
+ if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+ DBUG_PRINT("info", ("deleted "));
+ } else {
+ DBUG_PRINT("info", ("not-deleted "));
+ }
+#endif /* UNIV_DEBUG */
+
+ if (!rec_offs_comp(offsets)) {
+ rec_print_old(file, rec);
+ return;
+ }
+
+ fprintf(file, "PHYSICAL RECORD: n_fields " ULINTPF ";"
+ " compact format; info bits %u\n",
+ rec_offs_n_fields(offsets),
+ rec_get_info_bits(rec, TRUE));
+
+ rec_print_comp(file, rec, offsets);
+ rec_validate(rec, offsets);
+}
+
+/***************************************************************//**
+Prints a physical record. */
+void
+rec_print(
+/*======*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index) /*!< in: record descriptor */
+{
+ if (!dict_table_is_comp(index->table)) {
+ rec_print_old(file, rec);
+ return;
+ } else {
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ rec_print_new(file, rec,
+ rec_get_offsets(rec, index, offsets_,
+ page_rec_is_leaf(rec)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap));
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+}
+
+/** Pretty-print a record.
+@param[in,out] o output stream
+@param[in] rec physical record
+@param[in] info rec_get_info_bits(rec)
+@param[in] offsets rec_get_offsets(rec) */
+void
+rec_print(
+ std::ostream& o,
+ const rec_t* rec,
+ ulint info,
+ const rec_offs* offsets)
+{
+ const ulint comp = rec_offs_comp(offsets);
+ const ulint n = rec_offs_n_fields(offsets);
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ o << (comp ? "COMPACT RECORD" : "RECORD")
+ << "(info_bits=" << info << ", " << n << " fields): {";
+
+ for (ulint i = 0; i < n; i++) {
+ const byte* data;
+ ulint len;
+
+ if (i) {
+ o << ',';
+ }
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len == UNIV_SQL_DEFAULT) {
+ o << "DEFAULT";
+ continue;
+ }
+
+ if (len == UNIV_SQL_NULL) {
+ o << "NULL";
+ continue;
+ }
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint local_len = len - BTR_EXTERN_FIELD_REF_SIZE;
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ o << '['
+ << local_len
+ << '+' << BTR_EXTERN_FIELD_REF_SIZE << ']';
+ ut_print_buf(o, data, local_len);
+ ut_print_buf_hex(o, data + local_len,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ } else {
+ o << '[' << len << ']';
+ ut_print_buf(o, data, len);
+ }
+ }
+
+ o << "}";
+}
+
+/** Display a record.
+@param[in,out] o output stream
+@param[in] r record to display
+@return the output stream */
+std::ostream&
+operator<<(std::ostream& o, const rec_index_print& r)
+{
+ mem_heap_t* heap = NULL;
+ rec_offs* offsets = rec_get_offsets(
+ r.m_rec, r.m_index, NULL, page_rec_is_leaf(r.m_rec)
+ ? r.m_index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+ rec_print(o, r.m_rec,
+ rec_get_info_bits(r.m_rec, rec_offs_comp(offsets)),
+ offsets);
+ mem_heap_free(heap);
+ return(o);
+}
+
+/** Display a record.
+@param[in,out] o output stream
+@param[in] r record to display
+@return the output stream */
+std::ostream&
+operator<<(std::ostream& o, const rec_offsets_print& r)
+{
+ rec_print(o, r.m_rec,
+ rec_get_info_bits(r.m_rec, rec_offs_comp(r.m_offsets)),
+ r.m_offsets);
+ return(o);
+}
+
+#ifdef UNIV_DEBUG
+/** Read the DB_TRX_ID of a clustered index record.
+@param[in] rec clustered index record
+@param[in] index clustered index
+@return the value of DB_TRX_ID */
+trx_id_t
+rec_get_trx_id(
+ const rec_t* rec,
+ const dict_index_t* index)
+{
+ const byte* trx_id;
+ ulint len;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2];
+ rec_offs_init(offsets_);
+ rec_offs* offsets = offsets_;
+
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ index->db_trx_id() + 1, &heap);
+
+ trx_id = rec_get_nth_field(rec, offsets, index->db_trx_id(), &len);
+
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(trx_read_trx_id(trx_id));
+}
+#endif /* UNIV_DEBUG */
+
+/** Mark the nth field as externally stored.
+@param[in] offsets array returned by rec_get_offsets()
+@param[in] n nth field */
+void
+rec_offs_make_nth_extern(
+ rec_offs* offsets,
+ const ulint n)
+{
+ ut_ad(!rec_offs_nth_sql_null(offsets, n));
+ set_type(rec_offs_base(offsets)[1 + n], STORED_OFFPAGE);
+}
+#ifdef WITH_WSREP
+# include "ha_prototypes.h"
+
+int
+wsrep_rec_get_foreign_key(
+ byte *buf, /* out: extracted key */
+ ulint *buf_len, /* in/out: length of buf */
+ const rec_t* rec, /* in: physical record */
+ dict_index_t* index_for, /* in: index in foreign table */
+ dict_index_t* index_ref, /* in: index in referenced table */
+ ibool new_protocol) /* in: protocol > 1 */
+{
+ const byte* data;
+ ulint len;
+ ulint key_len = 0;
+ ulint i;
+ uint key_parts;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ const rec_offs* offsets;
+
+ ut_ad(index_for);
+ ut_ad(index_ref);
+
+ rec_offs_init(offsets_);
+ offsets = rec_get_offsets(rec, index_for, offsets_,
+ index_for->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ ut_ad(rec);
+
+ key_parts = dict_index_get_n_unique_in_tree(index_for);
+ for (i = 0;
+ i < key_parts &&
+ (index_for->type & DICT_CLUSTERED || i < key_parts - 1);
+ i++) {
+ dict_field_t* field_f =
+ dict_index_get_nth_field(index_for, i);
+ const dict_col_t* col_f = dict_field_get_col(field_f);
+ dict_field_t* field_r =
+ dict_index_get_nth_field(index_ref, i);
+ const dict_col_t* col_r = dict_field_get_col(field_r);
+
+ ut_ad(!rec_offs_nth_default(offsets, i));
+ data = rec_get_nth_field(rec, offsets, i, &len);
+ if (key_len + ((len != UNIV_SQL_NULL) ? len + 1 : 1) >
+ *buf_len) {
+ fprintf(stderr,
+ "WSREP: FK key len exceeded "
+ ULINTPF " " ULINTPF " " ULINTPF "\n",
+ key_len, len, *buf_len);
+ goto err_out;
+ }
+
+ if (len == UNIV_SQL_NULL) {
+ ut_a(!(col_f->prtype & DATA_NOT_NULL));
+ *buf++ = 1;
+ key_len++;
+ } else if (!new_protocol) {
+ if (!(col_r->prtype & DATA_NOT_NULL)) {
+ *buf++ = 0;
+ key_len++;
+ }
+ memcpy(buf, data, len);
+ *buf_len = wsrep_innobase_mysql_sort(
+ (int)(col_f->prtype & DATA_MYSQL_TYPE_MASK),
+ dtype_get_charset_coll(col_f->prtype),
+ buf, static_cast<uint>(len),
+ static_cast<uint>(*buf_len));
+ } else { /* new protocol */
+ if (!(col_r->prtype & DATA_NOT_NULL)) {
+ *buf++ = 0;
+ key_len++;
+ }
+ switch (col_f->mtype) {
+ case DATA_INT: {
+ byte* ptr = buf+len;
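+				/* Copy the InnoDB big-endian
+				integer into the key buffer in
+				reversed (little-endian) byte
+				order. */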
+ for (;;) {
+ ptr--;
+ *ptr = *data;
+ if (ptr == buf) {
+ break;
+ }
+ data++;
+ }
+
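+				/* InnoDB stores signed integers
+				with the sign bit flipped; flip it
+				back in the most significant byte. */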
+ if (!(col_f->prtype & DATA_UNSIGNED)) {
+ buf[len-1] = (byte) (buf[len-1] ^ 128);
+ }
+
+ break;
+ }
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ case DATA_CHAR:
+ case DATA_MYSQL:
+ /* Copy the actual data */
+ memcpy(buf, data, len);
+ len = wsrep_innobase_mysql_sort(
+ (int)
+ (col_f->prtype & DATA_MYSQL_TYPE_MASK),
+ dtype_get_charset_coll(col_f->prtype),
+ buf, len, *buf_len);
+ break;
+ case DATA_BLOB:
+ case DATA_BINARY:
+ case DATA_FIXBINARY:
+ case DATA_GEOMETRY:
+ memcpy(buf, data, len);
+ break;
+
+ case DATA_FLOAT:
+ {
+ float f = mach_float_read(data);
+ memcpy(buf, &f, sizeof(float));
+ }
+ break;
+ case DATA_DOUBLE:
+ {
+ double d = mach_double_read(data);
+ memcpy(buf, &d, sizeof(double));
+ }
+ break;
+ default:
+ break;
+ }
+
+ key_len += len;
+ buf += len;
+ }
+ }
+
+ rec_validate(rec, offsets);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ *buf_len = key_len;
+ return DB_SUCCESS;
+
+ err_out:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return DB_ERROR;
+}
+#endif // WITH_WSREP
diff --git a/storage/innobase/row/row0ext.cc b/storage/innobase/row/row0ext.cc
new file mode 100644
index 00000000..b7a62760
--- /dev/null
+++ b/storage/innobase/row/row0ext.cc
@@ -0,0 +1,132 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ext.cc
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "row0ext.h"
+#include "btr0cur.h"
+
+/** Fills the column prefix cache of an externally stored column.
+@param[in,out] ext column prefix cache
+@param[in] i index of ext->ext[]
+@param[in] space tablespace
+@param[in] dfield data field */
+static
+void
+row_ext_cache_fill(
+ row_ext_t* ext,
+ ulint i,
+ fil_space_t* space,
+ const dfield_t* dfield)
+{
+ const byte* field = static_cast<const byte*>(
+ dfield_get_data(dfield));
+ ulint f_len = dfield_get_len(dfield);
+ byte* buf = ext->buf + i * ext->max_len;
+
+ ut_ad(ext->max_len > 0);
+ ut_ad(i < ext->n_ext);
+ ut_ad(dfield_is_ext(dfield));
+ ut_a(f_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ if (UNIV_UNLIKELY(!memcmp(field_ref_zero,
+ field + f_len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* The BLOB pointer is not set: we cannot fetch it */
+ ext->len[i] = 0;
+ } else {
+ if (ext->max_len == REC_VERSION_56_MAX_INDEX_COL_LEN
+ && f_len > BTR_EXTERN_FIELD_REF_SIZE) {
+			/* In this case the field is in the Barracuda
+			format or beyond (refer to the definition of
+			row_ext_t.max_len), and it already carries the
+			column prefix locally; otherwise f_len would be
+			BTR_EXTERN_FIELD_REF_SIZE. So there is no need
+			to re-read the prefix from the external page;
+			just copy the local prefix to buf. Note that if
+			ext->len[i] is zero, it signals the error case
+			above. */
+ memcpy(buf, field, f_len - BTR_EXTERN_FIELD_REF_SIZE);
+ ext->len[i] = f_len - BTR_EXTERN_FIELD_REF_SIZE;
+ } else {
+ /* Fetch at most ext->max_len of the column.
+ The column should be non-empty. However,
+ trx_rollback_all_recovered() may try to
+ access a half-deleted BLOB if the server previously
+ crashed during the execution of
+ btr_free_externally_stored_field(). */
+ ext->len[i] = btr_copy_externally_stored_field_prefix(
+ buf, ext->max_len, ext->zip_size,
+ field, f_len);
+ }
+ }
+}
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return own: column prefix cache */
+row_ext_t*
+row_ext_create(
+/*===========*/
+ ulint n_ext, /*!< in: number of externally stored columns */
+ const ulint* ext, /*!< in: col_no's of externally stored columns
+ in the InnoDB table object, as reported by
+ dict_col_get_no(); NOT relative to the records
+ in the clustered index */
+ const dict_table_t& table, /*!< in: table */
+ const dtuple_t* tuple, /*!< in: data tuple containing the field
+ references of the externally stored
+ columns; must be indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch
+ to prevent deletion (rollback or purge). */
+ mem_heap_t* heap) /*!< in: heap where created */
+{
+ if (!table.space) {
+ return NULL;
+ }
+
+ ut_ad(n_ext > 0);
+
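+	/* Allocate the row_ext_t together with space for n_ext
+	entries of len[]; the struct declares a single entry, so
+	n_ext - 1 additional slots are appended to the allocation. */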
+ row_ext_t* ret = static_cast<row_ext_t*>(
+ mem_heap_alloc(heap,
+ (sizeof *ret) + (n_ext - 1) * sizeof ret->len));
+
+ ret->n_ext = n_ext;
+ ret->ext = ext;
+ ret->max_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(table.flags);
+ ret->zip_size = dict_tf_get_zip_size(table.flags);
+
+ ret->buf = static_cast<byte*>(
+ mem_heap_alloc(heap, n_ext * ret->max_len));
+
+ /* Fetch the BLOB prefixes */
+ for (ulint i = 0; i < n_ext; i++) {
+ const dfield_t* dfield;
+
+ dfield = dtuple_get_nth_field(tuple, ext[i]);
+ row_ext_cache_fill(ret, i, table.space, dfield);
+ }
+
+ return(ret);
+}
diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc
new file mode 100644
index 00000000..5ec73d96
--- /dev/null
+++ b/storage/innobase/row/row0ftsort.cc
@@ -0,0 +1,1781 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ftsort.cc
+Create Full Text Index with (parallel) merge sort
+
+Created 10/13/2010 Jimmy Yang
+*******************************************************/
+
+#include "row0ftsort.h"
+#include "dict0dict.h"
+#include "row0merge.h"
+#include "row0row.h"
+#include "btr0cur.h"
+#include "fts0plugin.h"
+#include "log0crypt.h"
+
+/** Read the next record to buffer N.
+@param N index into the array of merge info structures */
+#define ROW_MERGE_READ_GET_NEXT(N) \
+ do { \
+ b[N] = row_merge_read_rec( \
+ block[N], buf[N], b[N], index, \
+ fd[N], &foffs[N], &mrec[N], offsets[N], \
+ crypt_block[N], space); \
+ if (UNIV_UNLIKELY(!b[N])) { \
+ if (mrec[N]) { \
+ goto exit; \
+ } \
+ } \
+ } while (0)
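+
+/* At the end of the input, row_merge_read_rec() returns NULL with
+mrec[N] also NULL; a NULL return with a non-NULL mrec[N] indicates a
+read error, in which case the macro jumps to the caller's "exit"
+label. */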
+
+/** Parallel sort degree */
+ulong fts_sort_pll_degree = 2;
+
+/*********************************************************************//**
+Create a temporary "fts sort index" used to merge sort the
+tokenized doc string. The index has three "fields":
+
+1) Tokenized word,
+2) Doc ID (depending on the number of records to sort, it can be a 4-byte or
+8-byte integer value)
+3) Word's position in the original doc.
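+
+For example, the token "database" at position 5 in the document with
+Doc ID 10 would be sorted as the tuple ("database", 10, 5).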
+
+@see fts_create_one_index_table()
+
+@return dict_index_t structure for the fts sort index */
+dict_index_t*
+row_merge_create_fts_sort_index(
+/*============================*/
+ dict_index_t* index, /*!< in: Original FTS index
+ based on which this sort index
+ is created */
+ dict_table_t* table, /*!< in,out: table that FTS index
+ is being created on */
+ ibool* opt_doc_id_size)
+					/*!< out: whether to use a 4-byte
+					instead of an 8-byte integer to
+					store the Doc ID during sort */
+{
+ dict_index_t* new_index;
+ dict_field_t* field;
+ dict_field_t* idx_field;
+ CHARSET_INFO* charset;
+
+ // FIXME: This name shouldn't be hard coded here.
+ new_index = dict_mem_index_create(table, "tmp_fts_idx", DICT_FTS, 3);
+
+ new_index->id = index->id;
+ new_index->n_uniq = FTS_NUM_FIELDS_SORT;
+ new_index->n_def = FTS_NUM_FIELDS_SORT;
+ new_index->cached = TRUE;
+ new_index->parser = index->parser;
+
+ idx_field = dict_index_get_nth_field(index, 0);
+ charset = fts_index_get_charset(index);
+
+ /* The first field is on the Tokenized Word */
+ field = dict_index_get_nth_field(new_index, 0);
+ field->name = NULL;
+ field->prefix_len = 0;
+ field->col = static_cast<dict_col_t*>(
+ mem_heap_zalloc(new_index->heap, sizeof(dict_col_t)));
+ field->col->prtype = idx_field->col->prtype | DATA_NOT_NULL;
+ field->col->mtype = charset == &my_charset_latin1
+ ? DATA_VARCHAR : DATA_VARMYSQL;
+ field->col->mbminlen = idx_field->col->mbminlen;
+ field->col->mbmaxlen = idx_field->col->mbmaxlen;
+ field->col->len = static_cast<uint16_t>(
+ HA_FT_MAXCHARLEN * field->col->mbmaxlen);
+
+ field->fixed_len = 0;
+
+ /* Doc ID */
+ field = dict_index_get_nth_field(new_index, 1);
+ field->name = NULL;
+ field->prefix_len = 0;
+ field->col = static_cast<dict_col_t*>(
+ mem_heap_zalloc(new_index->heap, sizeof(dict_col_t)));
+ field->col->mtype = DATA_INT;
+ *opt_doc_id_size = FALSE;
+
+	/* Check whether we can use a 4-byte instead of an 8-byte integer
+	field to hold the Doc ID, thus reducing the overall sort size */
+ if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+ /* If Doc ID column is being added by this create
+ index, then just check the number of rows in the table */
+ if (dict_table_get_n_rows(table) < MAX_DOC_ID_OPT_VAL) {
+ *opt_doc_id_size = TRUE;
+ }
+ } else {
+ doc_id_t max_doc_id;
+
+		/* If the Doc ID column is supplied by the user, then
+		check the maximum Doc ID in the table */
+ max_doc_id = fts_get_max_doc_id((dict_table_t*) table);
+
+ if (max_doc_id && max_doc_id < MAX_DOC_ID_OPT_VAL) {
+ *opt_doc_id_size = TRUE;
+ }
+ }
+
+ if (*opt_doc_id_size) {
+ field->col->len = sizeof(ib_uint32_t);
+ field->fixed_len = sizeof(ib_uint32_t);
+ } else {
+ field->col->len = FTS_DOC_ID_LEN;
+ field->fixed_len = FTS_DOC_ID_LEN;
+ }
+
+ field->col->prtype = DATA_NOT_NULL | DATA_BINARY_TYPE;
+
+ /* The third field is on the word's position in the original doc */
+ field = dict_index_get_nth_field(new_index, 2);
+ field->name = NULL;
+ field->prefix_len = 0;
+ field->col = static_cast<dict_col_t*>(
+ mem_heap_zalloc(new_index->heap, sizeof(dict_col_t)));
+ field->col->mtype = DATA_INT;
+	field->col->len = 4;
+ field->fixed_len = 4;
+ field->col->prtype = DATA_NOT_NULL;
+
+ return(new_index);
+}
+
+/** Initialize FTS parallel sort structures.
+@param[in] trx transaction
+@param[in,out] dup descriptor of FTS index being created
+@param[in,out] new_table table where indexes are created
+@param[in]	opt_doc_id_size	whether to use a 4-byte instead of an 8-byte
+				integer to store the Doc ID during sort
+@param[in] old_zip_size page size of the old table during alter
+@param[out] psort parallel sort info to be instantiated
+@param[out] merge parallel merge info to be instantiated
+@return true if all successful */
+bool
+row_fts_psort_info_init(
+ trx_t* trx,
+ row_merge_dup_t*dup,
+ dict_table_t* new_table,
+ bool opt_doc_id_size,
+ ulint old_zip_size,
+ fts_psort_t** psort,
+ fts_psort_t** merge)
+{
+ ulint i;
+ ulint j;
+ fts_psort_common_t* common_info = NULL;
+ fts_psort_t* psort_info = NULL;
+ fts_psort_t* merge_info = NULL;
+ ulint block_size;
+ ibool ret = TRUE;
+ bool encrypted = false;
+ ut_ad(ut_is_2pow(old_zip_size));
+
+ block_size = 3 * srv_sort_buf_size;
+
+ *psort = psort_info = static_cast<fts_psort_t*>(ut_zalloc_nokey(
+ fts_sort_pll_degree * sizeof *psort_info));
+
+ if (!psort_info) {
+ ut_free(dup);
+ return(FALSE);
+ }
+
+ /* Common Info for all sort threads */
+ common_info = static_cast<fts_psort_common_t*>(
+ ut_malloc_nokey(sizeof *common_info));
+
+ if (!common_info) {
+ ut_free(dup);
+ ut_free(psort_info);
+ return(FALSE);
+ }
+
+ common_info->dup = dup;
+ common_info->new_table = new_table;
+ common_info->old_zip_size = old_zip_size;
+ common_info->trx = trx;
+ common_info->all_info = psort_info;
+ common_info->sort_event = os_event_create(0);
+ common_info->opt_doc_id_size = opt_doc_id_size;
+
+ if (log_tmp_is_encrypted()) {
+ encrypted = true;
+ }
+
+ ut_ad(trx->mysql_thd != NULL);
+ const char* path = thd_innodb_tmpdir(trx->mysql_thd);
+	/* There will be FTS_NUM_AUX_INDEX "sort buckets" for each
+	parallel sort thread. Each "sort bucket" holds records for
+	a particular "FTS index partition". */
+ for (j = 0; j < fts_sort_pll_degree; j++) {
+
+ UT_LIST_INIT(
+ psort_info[j].fts_doc_list, &fts_doc_item_t::doc_list);
+
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+
+ psort_info[j].merge_file[i] =
+ static_cast<merge_file_t*>(
+ ut_zalloc_nokey(sizeof(merge_file_t)));
+
+ if (!psort_info[j].merge_file[i]) {
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ psort_info[j].merge_buf[i] = row_merge_buf_create(
+ dup->index);
+
+			if (row_merge_file_create(psort_info[j].merge_file[i],
+						  path) == OS_FILE_CLOSED) {
+				ret = FALSE;
+				goto func_exit;
+			}
+
+ /* Need to align memory for O_DIRECT write */
+ psort_info[j].merge_block[i] =
+ static_cast<row_merge_block_t*>(
+ aligned_malloc(block_size, 1024));
+
+ if (!psort_info[j].merge_block[i]) {
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ /* If tablespace is encrypted, allocate additional buffer for
+ encryption/decryption. */
+ if (encrypted) {
+ /* Need to align memory for O_DIRECT write */
+ psort_info[j].crypt_block[i] =
+ static_cast<row_merge_block_t*>(
+ aligned_malloc(block_size,
+ 1024));
+
+ if (!psort_info[j].crypt_block[i]) {
+ ret = FALSE;
+ goto func_exit;
+ }
+ } else {
+ psort_info[j].crypt_block[i] = NULL;
+ }
+ }
+
+ psort_info[j].child_status = 0;
+ psort_info[j].state = 0;
+ psort_info[j].psort_common = common_info;
+ psort_info[j].error = DB_SUCCESS;
+ psort_info[j].memory_used = 0;
+ mutex_create(LATCH_ID_FTS_PLL_TOKENIZE, &psort_info[j].mutex);
+ }
+
+	/* Initialize merge_info structures for the parallel merge and
+	insert into the auxiliary FTS tables (FTS_INDEX_TABLE) */
+ *merge = merge_info = static_cast<fts_psort_t*>(
+ ut_malloc_nokey(FTS_NUM_AUX_INDEX * sizeof *merge_info));
+
+ for (j = 0; j < FTS_NUM_AUX_INDEX; j++) {
+
+ merge_info[j].child_status = 0;
+ merge_info[j].state = 0;
+ merge_info[j].psort_common = common_info;
+ }
+
+func_exit:
+ if (!ret) {
+ row_fts_psort_info_destroy(psort_info, merge_info);
+ }
+
+ return(ret);
+}
+/*********************************************************************//**
+Clean up and deallocate FTS parallel sort structures, and close the
+merge sort files */
+void
+row_fts_psort_info_destroy(
+/*=======================*/
+ fts_psort_t* psort_info, /*!< parallel sort info */
+ fts_psort_t* merge_info) /*!< parallel merge info */
+{
+ ulint i;
+ ulint j;
+
+ if (psort_info) {
+ for (j = 0; j < fts_sort_pll_degree; j++) {
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ if (psort_info[j].merge_file[i]) {
+ row_merge_file_destroy(
+ psort_info[j].merge_file[i]);
+ }
+
+ aligned_free(psort_info[j].merge_block[i]);
+ ut_free(psort_info[j].merge_file[i]);
+ aligned_free(psort_info[j].crypt_block[i]);
+ }
+
+ mutex_free(&psort_info[j].mutex);
+ }
+
+ os_event_destroy(merge_info[0].psort_common->sort_event);
+ ut_free(merge_info[0].psort_common->dup);
+ ut_free(merge_info[0].psort_common);
+ ut_free(psort_info);
+ }
+
+ ut_free(merge_info);
+}
+/*********************************************************************//**
+Free up merge buffers when merge sort is done */
+void
+row_fts_free_pll_merge_buf(
+/*=======================*/
+ fts_psort_t* psort_info) /*!< in: parallel sort info */
+{
+ ulint j;
+ ulint i;
+
+ if (!psort_info) {
+ return;
+ }
+
+ for (j = 0; j < fts_sort_pll_degree; j++) {
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ row_merge_buf_free(psort_info[j].merge_buf[i]);
+ }
+ }
+
+ return;
+}
+
+/*********************************************************************//**
+FTS plugin parser 'mysql_add_word' callback function for row merge.
+Refer to 'st_mysql_ftparser_param' for more detail.
+@return always returns 0 */
+static
+int
+row_merge_fts_doc_add_word_for_parser(
+/*==================================*/
+	MYSQL_FTPARSER_PARAM *param,	/* in: parser parameter */
+ const char *word, /* in: token word */
+ int word_len, /* in: word len */
+ MYSQL_FTPARSER_BOOLEAN_INFO* boolean_info) /* in: boolean info */
+{
+ fts_string_t str;
+ fts_tokenize_ctx_t* t_ctx;
+ row_fts_token_t* fts_token;
+ byte* ptr;
+
+ ut_ad(param);
+ ut_ad(param->mysql_ftparam);
+ ut_ad(word);
+ ut_ad(boolean_info);
+
+ t_ctx = static_cast<fts_tokenize_ctx_t*>(param->mysql_ftparam);
+ ut_ad(t_ctx);
+
+ str.f_str = (byte*)(word);
+ str.f_len = ulint(word_len);
+ str.f_n_char = fts_get_token_size(
+ (CHARSET_INFO*)param->cs, word, ulint(word_len));
+
+ /* JAN: TODO: MySQL 5.7 FTS
+ ut_ad(boolean_info->position >= 0);
+ */
+
+ ptr = static_cast<byte*>(ut_malloc_nokey(sizeof(row_fts_token_t)
+ + sizeof(fts_string_t) + str.f_len));
+ fts_token = reinterpret_cast<row_fts_token_t*>(ptr);
+ fts_token->text = reinterpret_cast<fts_string_t*>(
+ ptr + sizeof(row_fts_token_t));
+ fts_token->text->f_str = static_cast<byte*>(
+ ptr + sizeof(row_fts_token_t) + sizeof(fts_string_t));
+
+ fts_token->text->f_len = str.f_len;
+ fts_token->text->f_n_char = str.f_n_char;
+ memcpy(fts_token->text->f_str, str.f_str, str.f_len);
+
+ /* JAN: TODO: MySQL 5.7 FTS
+ fts_token->position = boolean_info->position;
+ */
+
+ /* Add token to list */
+ UT_LIST_ADD_LAST(t_ctx->fts_token_list, fts_token);
+
+ return(0);
+}
+
+/*********************************************************************//**
+Tokenize by fts plugin parser */
+static
+void
+row_merge_fts_doc_tokenize_by_parser(
+/*=================================*/
+ fts_doc_t* doc, /* in: doc to tokenize */
+ st_mysql_ftparser* parser, /* in: plugin parser instance */
+ fts_tokenize_ctx_t* t_ctx) /* in/out: tokenize ctx instance */
+{
+ MYSQL_FTPARSER_PARAM param;
+
+ ut_a(parser);
+
+	/* Set the parameters for the plugin parser */
+ param.mysql_parse = fts_tokenize_document_internal;
+ param.mysql_add_word = row_merge_fts_doc_add_word_for_parser;
+ param.mysql_ftparam = t_ctx;
+ param.cs = doc->charset;
+ param.doc = reinterpret_cast<char*>(doc->text.f_str);
+ param.length = static_cast<int>(doc->text.f_len);
+	param.mode = MYSQL_FTPARSER_SIMPLE_MODE;
+
+ PARSER_INIT(parser, &param);
+ /* We assume parse returns successfully here. */
+ parser->parse(&param);
+ PARSER_DEINIT(parser, &param);
+}
+
+/*********************************************************************//**
+Tokenize incoming text data and add to the sort buffer.
+@see row_merge_buf_encode()
+@return TRUE if the record passed, FALSE if out of space */
+static
+ibool
+row_merge_fts_doc_tokenize(
+/*=======================*/
+ row_merge_buf_t** sort_buf, /*!< in/out: sort buffer */
+ doc_id_t doc_id, /*!< in: Doc ID */
+ fts_doc_t* doc, /*!< in: Doc to be tokenized */
+ merge_file_t** merge_file, /*!< in/out: merge file */
+	ibool		opt_doc_id_size,/*!< in: whether to use a 4-byte
+					instead of an 8-byte integer to
+					store the Doc ID during sort */
+ fts_tokenize_ctx_t* t_ctx) /*!< in/out: tokenize context */
+{
+ ulint inc = 0;
+ fts_string_t str;
+ ulint len;
+ row_merge_buf_t* buf;
+ dfield_t* field;
+ fts_string_t t_str;
+ ibool buf_full = FALSE;
+ byte str_buf[FTS_MAX_WORD_LEN + 1];
+ ulint data_size[FTS_NUM_AUX_INDEX];
+ ulint n_tuple[FTS_NUM_AUX_INDEX];
+ st_mysql_ftparser* parser;
+
+ t_str.f_n_char = 0;
+ t_ctx->buf_used = 0;
+
+ memset(n_tuple, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+ memset(data_size, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+
+ parser = sort_buf[0]->index->parser;
+
+	/* Tokenize the data and add each word string, with its
+	corresponding doc id and position, to the sort buffer */
+ while (t_ctx->processed_len < doc->text.f_len) {
+ ulint idx = 0;
+ ulint cur_len;
+ doc_id_t write_doc_id;
+ row_fts_token_t* fts_token = NULL;
+
+ if (parser != NULL) {
+ if (t_ctx->processed_len == 0) {
+ UT_LIST_INIT(t_ctx->fts_token_list, &row_fts_token_t::token_list);
+
+ /* Parse the whole doc and cache tokens */
+ row_merge_fts_doc_tokenize_by_parser(doc,
+ parser, t_ctx);
+
+				/* Just indicate that we have parsed all the words */
+ t_ctx->processed_len += 1;
+ }
+
+ /* Then get a token */
+ fts_token = UT_LIST_GET_FIRST(t_ctx->fts_token_list);
+ if (fts_token) {
+ str.f_len = fts_token->text->f_len;
+ str.f_n_char = fts_token->text->f_n_char;
+ str.f_str = fts_token->text->f_str;
+ } else {
+ ut_ad(UT_LIST_GET_LEN(t_ctx->fts_token_list) == 0);
+				/* Reached the end of the list */
+ t_ctx->processed_len = doc->text.f_len;
+ break;
+ }
+ } else {
+ inc = innobase_mysql_fts_get_token(
+ doc->charset,
+ doc->text.f_str + t_ctx->processed_len,
+ doc->text.f_str + doc->text.f_len, &str);
+
+ ut_a(inc > 0);
+ }
+
+		/* Ignore strings whose character count is less than
+		"fts_min_token_size" or more than "fts_max_token_size" */
+ if (!fts_check_token(&str, NULL, NULL)) {
+ if (parser != NULL) {
+ UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token);
+ ut_free(fts_token);
+ } else {
+ t_ctx->processed_len += inc;
+ }
+
+ continue;
+ }
+
+ t_str.f_len = innobase_fts_casedn_str(
+ doc->charset, (char*) str.f_str, str.f_len,
+ (char*) &str_buf, FTS_MAX_WORD_LEN + 1);
+
+ t_str.f_str = (byte*) &str_buf;
+
+ /* if "cached_stopword" is defined, ignore words in the
+ stopword list */
+ if (!fts_check_token(&str, t_ctx->cached_stopword,
+ doc->charset)) {
+ if (parser != NULL) {
+ UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token);
+ ut_free(fts_token);
+ } else {
+ t_ctx->processed_len += inc;
+ }
+
+ continue;
+ }
+
+		/* There are FTS_NUM_AUX_INDEX auxiliary tables; find
+		out which sort buffer to put this word record in */
+ t_ctx->buf_used = fts_select_index(
+ doc->charset, t_str.f_str, t_str.f_len);
+
+ buf = sort_buf[t_ctx->buf_used];
+
+ ut_a(t_ctx->buf_used < FTS_NUM_AUX_INDEX);
+ idx = t_ctx->buf_used;
+
+ mtuple_t* mtuple = &buf->tuples[buf->n_tuples + n_tuple[idx]];
+
+ field = mtuple->fields = static_cast<dfield_t*>(
+ mem_heap_alloc(buf->heap,
+ FTS_NUM_FIELDS_SORT * sizeof *field));
+
+ /* The first field is the tokenized word */
+ dfield_set_data(field, t_str.f_str, t_str.f_len);
+ len = dfield_get_len(field);
+
+ dict_col_copy_type(dict_index_get_nth_col(buf->index, 0), &field->type);
+ field->type.prtype |= DATA_NOT_NULL;
+ ut_ad(len <= field->type.len);
+
+ /* For the temporary file, row_merge_buf_encode() uses
+ 1 byte for representing the number of extra_size bytes.
+ This number will always be 1, because for this 3-field index
+ consisting of one variable-size column, extra_size will always
+ be 1 or 2, which can be encoded in one byte.
+
+ The extra_size is 1 byte if the length of the
+ variable-length column is less than 128 bytes or the
+ maximum length is less than 256 bytes. */
+
+		/* There is one variable-length column: the word, whose
+		length is bounded by fts_max_token_size. Account for the
+		extra_size byte(s) plus the one byte that records how many
+		extra_size bytes there are.
+
+		Since the maximum FTS token length can now exceed 255 bytes,
+		the length must signal its own width: lengths of 1 to 128
+		bytes are encoded in one length byte; anything longer needs
+		two bytes. */
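+
+		/* Worked example (illustrative): a 10-byte token has
+		extra_size = 1, so cur_len = 2 + 10 = 12; a 300-byte token
+		would have extra_size = 2 and cur_len = 3 + 300 = 303. */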
+ if (len < 128 || field->type.len < 256) {
+ /* Extra size is one byte. */
+ cur_len = 2 + len;
+ } else {
+ /* Extra size is two bytes. */
+ cur_len = 3 + len;
+ }
+
+ dfield_dup(field, buf->heap);
+ field++;
+
+ /* The second field is the Doc ID */
+
+ ib_uint32_t doc_id_32_bit;
+
+ if (!opt_doc_id_size) {
+ fts_write_doc_id((byte*) &write_doc_id, doc_id);
+
+ dfield_set_data(
+ field, &write_doc_id, sizeof(write_doc_id));
+ } else {
+ mach_write_to_4(
+ (byte*) &doc_id_32_bit, (ib_uint32_t) doc_id);
+
+ dfield_set_data(
+ field, &doc_id_32_bit, sizeof(doc_id_32_bit));
+ }
+
+ len = field->len;
+ ut_ad(len == FTS_DOC_ID_LEN || len == sizeof(ib_uint32_t));
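+		/* FTS_DOC_ID_LEN is the full 8-byte Doc ID. With
+		opt_doc_id_size set, the sort stores Doc IDs in 4 bytes
+		to save space, which is assumed to be safe only while
+		the largest Doc ID fits in 32 bits. */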
+
+ field->type.mtype = DATA_INT;
+ field->type.prtype = DATA_NOT_NULL | DATA_BINARY_TYPE;
+ field->type.len = static_cast<uint16_t>(field->len);
+ field->type.mbminlen = 0;
+ field->type.mbmaxlen = 0;
+
+ cur_len += len;
+ dfield_dup(field, buf->heap);
+
+ ++field;
+
+ /* The third field is the position.
+ MySQL 5.7 changed the fulltext parser plugin interface
+ by adding MYSQL_FTPARSER_BOOLEAN_INFO::position.
+ Below we assume that the field is always 0. */
+ ulint pos = t_ctx->init_pos;
+ byte position[4];
+ if (parser == NULL) {
+ pos += t_ctx->processed_len + inc - str.f_len;
+ }
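+		/* For the built-in tokenizer, processed_len + inc points
+		just past the current token, so subtracting str.f_len
+		yields the token's byte offset in the document; e.g. for
+		the second token of "hello world" this gives init_pos + 6. */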
+ len = 4;
+ mach_write_to_4(position, pos);
+ dfield_set_data(field, &position, len);
+
+ field->type.mtype = DATA_INT;
+ field->type.prtype = DATA_NOT_NULL;
+ field->type.len = 4;
+ field->type.mbminlen = 0;
+ field->type.mbmaxlen = 0;
+ cur_len += len;
+ dfield_dup(field, buf->heap);
+
+ /* Reserve one byte for the end marker of row_merge_block_t */
+ if (buf->total_size + data_size[idx] + cur_len
+ >= srv_sort_buf_size - 1) {
+
+ buf_full = TRUE;
+ break;
+ }
+
+ /* Increment the number of tuples */
+ n_tuple[idx]++;
+ if (parser != NULL) {
+ UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token);
+ ut_free(fts_token);
+ } else {
+ t_ctx->processed_len += inc;
+ }
+ data_size[idx] += cur_len;
+ }
+
+ /* Update the data length and the number of new word tuples
+ added in this round of tokenization */
+ for (ulint i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ /* The computation of total_size below assumes that no
+ delete-mark flags will be stored and that all fields
+ are NOT NULL and fixed-length. */
+
+ sort_buf[i]->total_size += data_size[i];
+
+ sort_buf[i]->n_tuples += n_tuple[i];
+
+ merge_file[i]->n_rec += n_tuple[i];
+ t_ctx->rows_added[i] += n_tuple[i];
+ }
+
+ if (!buf_full) {
+		/* We pad one byte between the texts of two consecutive
+		fields */
+ t_ctx->init_pos += doc->text.f_len + 1;
+ }
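+	/* E.g. after a 3-byte field value "foo", tokens of the next
+	field value start at position 4, so positions from adjacent
+	values never run together. */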
+
+ return(!buf_full);
+}
+
+/*********************************************************************//**
+Get next doc item from fts_doc_list */
+UNIV_INLINE
+void
+row_merge_fts_get_next_doc_item(
+/*============================*/
+ fts_psort_t* psort_info, /*!< in: psort_info */
+ fts_doc_item_t** doc_item) /*!< in/out: doc item */
+{
+ if (*doc_item != NULL) {
+ ut_free(*doc_item);
+ }
+
+ mutex_enter(&psort_info->mutex);
+
+ *doc_item = UT_LIST_GET_FIRST(psort_info->fts_doc_list);
+ if (*doc_item != NULL) {
+ UT_LIST_REMOVE(psort_info->fts_doc_list, *doc_item);
+
+ ut_ad(psort_info->memory_used >= sizeof(fts_doc_item_t)
+ + (*doc_item)->field->len);
+ psort_info->memory_used -= sizeof(fts_doc_item_t)
+ + (*doc_item)->field->len;
+ }
+
+ mutex_exit(&psort_info->mutex);
+}
+
+/*********************************************************************//**
+Function performs parallel tokenization of the incoming doc strings.
+It also performs the initial in-memory sort of the parsed records.
+*/
+static
+void fts_parallel_tokenization(
+/*======================*/
+ void* arg) /*!< in: psort_info for the thread */
+{
+ fts_psort_t* psort_info = (fts_psort_t*) arg;
+ ulint i;
+ fts_doc_item_t* doc_item = NULL;
+ row_merge_buf_t** buf;
+ ibool processed = FALSE;
+ merge_file_t** merge_file;
+ row_merge_block_t** block;
+ row_merge_block_t** crypt_block;
+ pfs_os_file_t tmpfd[FTS_NUM_AUX_INDEX];
+ ulint mycount[FTS_NUM_AUX_INDEX];
+ ulint num_doc_processed = 0;
+ doc_id_t last_doc_id = 0;
+ mem_heap_t* blob_heap = NULL;
+ fts_doc_t doc;
+ dict_table_t* table = psort_info->psort_common->new_table;
+ fts_tokenize_ctx_t t_ctx;
+ ulint retried = 0;
+ dberr_t error = DB_SUCCESS;
+
+ ut_ad(psort_info->psort_common->trx->mysql_thd != NULL);
+
+ const char* path = thd_innodb_tmpdir(
+ psort_info->psort_common->trx->mysql_thd);
+
+ ut_ad(psort_info);
+
+ buf = psort_info->merge_buf;
+ merge_file = psort_info->merge_file;
+ blob_heap = mem_heap_create(512);
+ memset(&doc, 0, sizeof(doc));
+	memset(mycount, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+
+ doc.charset = fts_index_get_charset(
+ psort_info->psort_common->dup->index);
+
+ block = psort_info->merge_block;
+ crypt_block = psort_info->crypt_block;
+
+ const ulint zip_size = psort_info->psort_common->old_zip_size;
+
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+
+ t_ctx.cached_stopword = table->fts->cache->stopword_info.cached_stopword;
+ processed = TRUE;
+loop:
+ while (doc_item) {
+ dfield_t* dfield = doc_item->field;
+
+ last_doc_id = doc_item->doc_id;
+
+ ut_ad (dfield->data != NULL
+ && dfield_get_len(dfield) != UNIV_SQL_NULL);
+
+		/* If we have finished processing the previous item,
+		fill "doc" with the strings in this doc_item; otherwise
+		keep processing the item on hand */
+ if (processed) {
+ byte* data;
+ ulint data_len;
+
+ dfield = doc_item->field;
+ data = static_cast<byte*>(dfield_get_data(dfield));
+ data_len = dfield_get_len(dfield);
+
+ if (dfield_is_ext(dfield)) {
+ doc.text.f_str =
+ btr_copy_externally_stored_field(
+ &doc.text.f_len, data,
+ zip_size, data_len, blob_heap);
+ } else {
+ doc.text.f_str = data;
+ doc.text.f_len = data_len;
+ }
+
+ doc.tokens = 0;
+ t_ctx.processed_len = 0;
+ } else {
+			/* Not yet finished processing the "doc" on hand;
+			continue processing it */
+ ut_ad(doc.text.f_str);
+ ut_ad(t_ctx.processed_len < doc.text.f_len);
+ }
+
+ processed = row_merge_fts_doc_tokenize(
+ buf, doc_item->doc_id, &doc,
+ merge_file, psort_info->psort_common->opt_doc_id_size,
+ &t_ctx);
+
+ /* Current sort buffer full, need to recycle */
+ if (!processed) {
+ ut_ad(t_ctx.processed_len < doc.text.f_len);
+ ut_ad(t_ctx.rows_added[t_ctx.buf_used]);
+ break;
+ }
+
+ num_doc_processed++;
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)
+ && num_doc_processed % 10000 == 1) {
+ ib::info() << "Number of documents processed: "
+ << num_doc_processed;
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ ib::info() << "ID " << psort_info->psort_id
+ << ", partition " << i << ", word "
+ << mycount[i];
+ }
+#endif
+ }
+
+ mem_heap_empty(blob_heap);
+
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+
+ if (doc_item && last_doc_id != doc_item->doc_id) {
+ t_ctx.init_pos = 0;
+ }
+ }
+
+	/* If the current sort buffer is full, sort it and flush it
+	to disk */
+ if (t_ctx.rows_added[t_ctx.buf_used] && !processed) {
+ row_merge_buf_sort(buf[t_ctx.buf_used], NULL);
+ row_merge_buf_write(buf[t_ctx.buf_used],
+ merge_file[t_ctx.buf_used],
+ block[t_ctx.buf_used]);
+
+ if (!row_merge_write(merge_file[t_ctx.buf_used]->fd,
+ merge_file[t_ctx.buf_used]->offset++,
+ block[t_ctx.buf_used],
+ crypt_block[t_ctx.buf_used],
+ table->space_id)) {
+ error = DB_TEMP_FILE_WRITE_FAIL;
+ goto func_exit;
+ }
+
+ MEM_UNDEFINED(block[t_ctx.buf_used], srv_sort_buf_size);
+ buf[t_ctx.buf_used] = row_merge_buf_empty(buf[t_ctx.buf_used]);
+ mycount[t_ctx.buf_used] += t_ctx.rows_added[t_ctx.buf_used];
+ t_ctx.rows_added[t_ctx.buf_used] = 0;
+
+ ut_a(doc_item);
+ goto loop;
+ }
+
+	/* The parent is done scanning; if all docs have been processed,
+	exit */
+ if (psort_info->state == FTS_PARENT_COMPLETE) {
+ if (UT_LIST_GET_LEN(psort_info->fts_doc_list) == 0) {
+ goto exit;
+ } else if (retried > 10000) {
+ ut_ad(!doc_item);
+			/* Retried too many times without getting a new
+			record */
+			ib::error() << "FTS parallel sort processed "
+				<< num_doc_processed
+				<< " records, the sort queue has "
+				<< UT_LIST_GET_LEN(psort_info->fts_doc_list)
+				<< " records, but the sort cannot fetch"
+				" the next record during ALTER TABLE "
+				<< table->name;
+ goto exit;
+ }
+ } else if (psort_info->state == FTS_PARENT_EXITING) {
+ /* Parent abort */
+ goto func_exit;
+ }
+
+ if (doc_item == NULL) {
+ os_thread_yield();
+ }
+
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+
+ if (doc_item != NULL) {
+ if (last_doc_id != doc_item->doc_id) {
+ t_ctx.init_pos = 0;
+ }
+
+ retried = 0;
+ } else if (psort_info->state == FTS_PARENT_COMPLETE) {
+ retried++;
+ }
+
+ goto loop;
+
+exit:
+	/* Do a final sort of the last batch of records in block
+	memory, and flush them to the temp file if they cannot all
+	be held in one block of memory */
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ if (t_ctx.rows_added[i]) {
+ row_merge_buf_sort(buf[i], NULL);
+ row_merge_buf_write(
+ buf[i], merge_file[i], block[i]);
+
+			/* Write to the temp file only if records have
+			been flushed to it before (offset > 0).
+			The pseudo code for the sort is as follows:
+
+ while (there are rows) {
+ tokenize rows, put result in block[]
+ if (block[] runs out) {
+ sort rows;
+ write to temp file with
+ row_merge_write();
+ offset++;
+ }
+ }
+
+ # write out the last batch
+ if (offset > 0) {
+ row_merge_write();
+ offset++;
+ } else {
+ # no need to write anything
+ offset stay as 0
+ }
+
+			so if merge_file[i]->offset is 0 when we get
+			here with the last batch, the rows have never
+			been flushed to the temp file and can all be
+			held in memory */
+ if (merge_file[i]->offset != 0) {
+ if (!row_merge_write(merge_file[i]->fd,
+ merge_file[i]->offset++,
+ block[i],
+ crypt_block[i],
+ table->space_id)) {
+ error = DB_TEMP_FILE_WRITE_FAIL;
+ goto func_exit;
+ }
+
+#ifdef HAVE_valgrind
+ MEM_UNDEFINED(block[i], srv_sort_buf_size);
+
+ if (crypt_block[i]) {
+ MEM_UNDEFINED(crypt_block[i],
+ srv_sort_buf_size);
+ }
+#endif /* HAVE_valgrind */
+ }
+
+ buf[i] = row_merge_buf_empty(buf[i]);
+ t_ctx.rows_added[i] = 0;
+ }
+ }
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: start merge sort\n");
+ }
+
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ if (!merge_file[i]->offset) {
+ continue;
+ }
+
+ tmpfd[i] = row_merge_file_create_low(path);
+ if (tmpfd[i] == OS_FILE_CLOSED) {
+ error = DB_OUT_OF_MEMORY;
+ goto func_exit;
+ }
+
+ error = row_merge_sort(psort_info->psort_common->trx,
+ psort_info->psort_common->dup,
+ merge_file[i], block[i], &tmpfd[i],
+ false, 0.0/* pct_progress */, 0.0/* pct_cost */,
+ crypt_block[i], table->space_id);
+
+ if (error != DB_SUCCESS) {
+ row_merge_file_destroy_low(tmpfd[i]);
+ goto func_exit;
+ }
+
+ row_merge_file_destroy_low(tmpfd[i]);
+ }
+
+func_exit:
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: complete merge sort\n");
+ }
+
+ mem_heap_free(blob_heap);
+
+ mutex_enter(&psort_info->mutex);
+ psort_info->error = error;
+ mutex_exit(&psort_info->mutex);
+
+ if (UT_LIST_GET_LEN(psort_info->fts_doc_list) > 0) {
+		/* The child can exit either on error or when told to
+		by the parent. */
+ ut_ad(error != DB_SUCCESS
+ || psort_info->state == FTS_PARENT_EXITING);
+ }
+
+ /* Free fts doc list in case of error. */
+ do {
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+ } while (doc_item != NULL);
+
+ psort_info->child_status = FTS_CHILD_COMPLETE;
+ os_event_set(psort_info->psort_common->sort_event);
+ psort_info->child_status = FTS_CHILD_EXITING;
+}
+
+/*********************************************************************//**
+Start the parallel tokenization and parallel merge sort */
+void
+row_fts_start_psort(
+/*================*/
+ fts_psort_t* psort_info) /*!< parallel sort structure */
+{
+ ulint i = 0;
+
+ for (i = 0; i < fts_sort_pll_degree; i++) {
+ psort_info[i].psort_id = i;
+ psort_info[i].task =
+ new tpool::waitable_task(fts_parallel_tokenization,&psort_info[i]);
+ srv_thread_pool->submit_task(psort_info[i].task);
+ }
+}
+
+/*********************************************************************//**
+Function performs the merge and insertion of the sorted records. */
+static
+void
+fts_parallel_merge(
+/*===============*/
+ void* arg) /*!< in: parallel merge info */
+{
+ fts_psort_t* psort_info = (fts_psort_t*) arg;
+ ulint id;
+
+ ut_ad(psort_info);
+
+ id = psort_info->psort_id;
+
+ row_fts_merge_insert(psort_info->psort_common->dup->index,
+ psort_info->psort_common->new_table,
+ psort_info->psort_common->all_info, id);
+}
+
+/*********************************************************************//**
+Kick off the parallel merge and insert tasks */
+void
+row_fts_start_parallel_merge(
+/*=========================*/
+ fts_psort_t* merge_info) /*!< in: parallel sort info */
+{
+ ulint i = 0;
+
+ /* Kick off merge/insert tasks */
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ merge_info[i].psort_id = i;
+ merge_info[i].child_status = 0;
+
+ merge_info[i].task = new tpool::waitable_task(
+ fts_parallel_merge,
+ (void*) &merge_info[i]);
+ srv_thread_pool->submit_task(merge_info[i].task);
+ }
+}
+
+/**
+Write out a single word's data as new entry/entries in the INDEX table.
+@param[in] ins_ctx insert context
+@param[in] word word string
+@param[in]	node	node columns
+@return DB_SUCCESS if insertion runs fine, otherwise error code */
+static
+dberr_t
+row_merge_write_fts_node(
+ const fts_psort_insert_t* ins_ctx,
+ const fts_string_t* word,
+ const fts_node_t* node)
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ dberr_t ret = DB_SUCCESS;
+ doc_id_t write_first_doc_id[8];
+ doc_id_t write_last_doc_id[8];
+ ib_uint32_t write_doc_count;
+
+ tuple = ins_ctx->tuple;
+
+ /* The first field is the tokenized word */
+ field = dtuple_get_nth_field(tuple, 0);
+ dfield_set_data(field, word->f_str, word->f_len);
+
+ /* The second field is first_doc_id */
+ field = dtuple_get_nth_field(tuple, 1);
+ fts_write_doc_id((byte*)&write_first_doc_id, node->first_doc_id);
+ dfield_set_data(field, &write_first_doc_id, sizeof(doc_id_t));
+
+	/* The third and fourth fields (TRX_ID, ROLL_PTR) are already
+	filled in. */
+ /* The fifth field is last_doc_id */
+ field = dtuple_get_nth_field(tuple, 4);
+ fts_write_doc_id((byte*)&write_last_doc_id, node->last_doc_id);
+ dfield_set_data(field, &write_last_doc_id, sizeof(doc_id_t));
+
+ /* The sixth field is doc_count */
+ field = dtuple_get_nth_field(tuple, 5);
+ mach_write_to_4((byte*)&write_doc_count, (ib_uint32_t)node->doc_count);
+ dfield_set_data(field, &write_doc_count, sizeof(ib_uint32_t));
+
+ /* The seventh field is ilist */
+ field = dtuple_get_nth_field(tuple, 6);
+ dfield_set_data(field, node->ilist, node->ilist_size);
+
+ ret = ins_ctx->btr_bulk->insert(tuple);
+
+ return(ret);
+}
+
+/********************************************************************//**
+Insert processed FTS data to auxiliary index tables.
+@return DB_SUCCESS if insertion runs fine */
+static MY_ATTRIBUTE((nonnull))
+dberr_t
+row_merge_write_fts_word(
+/*=====================*/
+ fts_psort_insert_t* ins_ctx, /*!< in: insert context */
+ fts_tokenizer_word_t* word) /*!< in: sorted and tokenized
+ word */
+{
+ dberr_t ret = DB_SUCCESS;
+
+ ut_ad(ins_ctx->aux_index_id == fts_select_index(
+ ins_ctx->charset, word->text.f_str, word->text.f_len));
+
+	/* Pop each fts_node in word->nodes and write it to the
+	auxiliary table */
+ for (ulint i = 0; i < ib_vector_size(word->nodes); i++) {
+ dberr_t error;
+ fts_node_t* fts_node;
+
+ fts_node = static_cast<fts_node_t*>(ib_vector_get(word->nodes, i));
+
+ error = row_merge_write_fts_node(ins_ctx, &word->text, fts_node);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ib::error() << "Failed to write word to FTS auxiliary"
+ " index table "
+ << ins_ctx->btr_bulk->table_name()
+ << ", error " << error;
+ ret = error;
+ }
+
+ ut_free(fts_node->ilist);
+ fts_node->ilist = NULL;
+ }
+
+ ib_vector_reset(word->nodes);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Read sorted FTS data files and insert data tuples to the auxiliary
+tables. */
+static
+void
+row_fts_insert_tuple(
+/*=================*/
+ fts_psort_insert_t*
+ ins_ctx, /*!< in: insert context */
+ fts_tokenizer_word_t* word, /*!< in: last processed
+ tokenized word */
+ ib_vector_t* positions, /*!< in: word position */
+ doc_id_t* in_doc_id, /*!< in: last item doc id */
+ dtuple_t* dtuple) /*!< in: entry to insert */
+{
+ fts_node_t* fts_node = NULL;
+ dfield_t* dfield;
+ doc_id_t doc_id;
+ ulint position;
+ fts_string_t token_word;
+ ulint i;
+
+	/* Get fts_node for the FTS auxiliary INDEX table */
+ if (ib_vector_size(word->nodes) > 0) {
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_last(word->nodes));
+ }
+
+ if (fts_node == NULL
+ || fts_node->ilist_size > FTS_ILIST_MAX_SIZE) {
+
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_push(word->nodes, NULL));
+
+ memset(fts_node, 0x0, sizeof(*fts_node));
+ }
+
+ /* If dtuple == NULL, this is the last word to be processed */
+ if (!dtuple) {
+ if (fts_node && ib_vector_size(positions) > 0) {
+ fts_cache_node_add_positions(
+ NULL, fts_node, *in_doc_id,
+ positions);
+
+ /* Write out the current word */
+ row_merge_write_fts_word(ins_ctx, word);
+ }
+
+ return;
+ }
+
+ /* Get the first field for the tokenized word */
+ dfield = dtuple_get_nth_field(dtuple, 0);
+
+ token_word.f_n_char = 0;
+ token_word.f_len = dfield->len;
+ token_word.f_str = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (!word->text.f_str) {
+ fts_string_dup(&word->text, &token_word, ins_ctx->heap);
+ }
+
+	/* Compare to the last word to see whether they are the same
+	word */
+ if (innobase_fts_text_cmp(ins_ctx->charset,
+ &word->text, &token_word) != 0) {
+ ulint num_item;
+
+		/* Getting a new word; flush the last position info
+		for the current word into fts_node */
+ if (ib_vector_size(positions) > 0) {
+ fts_cache_node_add_positions(
+ NULL, fts_node, *in_doc_id, positions);
+ }
+
+ /* Write out the current word */
+ row_merge_write_fts_word(ins_ctx, word);
+
+ /* Copy the new word */
+ fts_string_dup(&word->text, &token_word, ins_ctx->heap);
+
+ num_item = ib_vector_size(positions);
+
+ /* Clean up position queue */
+ for (i = 0; i < num_item; i++) {
+ ib_vector_pop(positions);
+ }
+
+ /* Reset Doc ID */
+ *in_doc_id = 0;
+ memset(fts_node, 0x0, sizeof(*fts_node));
+ }
+
+ /* Get the word's Doc ID */
+ dfield = dtuple_get_nth_field(dtuple, 1);
+
+ if (!ins_ctx->opt_doc_id_size) {
+ doc_id = fts_read_doc_id(
+ static_cast<byte*>(dfield_get_data(dfield)));
+ } else {
+ doc_id = (doc_id_t) mach_read_from_4(
+ static_cast<byte*>(dfield_get_data(dfield)));
+ }
+
+ /* Get the word's position info */
+ dfield = dtuple_get_nth_field(dtuple, 2);
+ position = mach_read_from_4(static_cast<byte*>(dfield_get_data(dfield)));
+
+ /* If this is the same word as the last word, and they
+ have the same Doc ID, we just need to add its position
+ info. Otherwise, we will flush position info to the
+ fts_node and initiate a new position vector */
+ if (!(*in_doc_id) || *in_doc_id == doc_id) {
+ ib_vector_push(positions, &position);
+ } else {
+ ulint num_pos = ib_vector_size(positions);
+
+ fts_cache_node_add_positions(NULL, fts_node,
+ *in_doc_id, positions);
+ for (i = 0; i < num_pos; i++) {
+ ib_vector_pop(positions);
+ }
+ ib_vector_push(positions, &position);
+ }
+
+ /* record the current Doc ID */
+ *in_doc_id = doc_id;
+}
+
+/*********************************************************************//**
+Propagate a newly added record up one level in the selection tree
+@return parent node to which this value was propagated */
+static
+ulint
+row_fts_sel_tree_propagate(
+/*=======================*/
+	ulint		propagated,	/*!< in: tree node propagated */
+	int*		sel_tree,	/*!< in: selection tree */
+	const mrec_t**	mrec,		/*!< in: sort record */
+	rec_offs**	offsets,	/*!< in: record offsets */
+	dict_index_t*	index)		/*!< in/out: FTS index */
+{
+ ulint parent;
+ int child_left;
+ int child_right;
+ int selected;
+
+ /* Find which parent this value will be propagated to */
+	parent = (propagated - 1) / 2;
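+
+	/* In this implicit binary-heap layout, node p has children
+	2 * p + 1 and 2 * p + 2; e.g. nodes 3 and 4 both map back to
+	parent (3 - 1) / 2 == (4 - 1) / 2 == 1. */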
+
+ /* Find out which value is smaller, and to propagate */
+ child_left = sel_tree[parent * 2 + 1];
+ child_right = sel_tree[parent * 2 + 2];
+
+ if (child_left == -1 || mrec[child_left] == NULL) {
+ if (child_right == -1
+ || mrec[child_right] == NULL) {
+ selected = -1;
+ } else {
+			selected = child_right;
+ }
+ } else if (child_right == -1
+ || mrec[child_right] == NULL) {
+ selected = child_left;
+ } else if (cmp_rec_rec_simple(mrec[child_left], mrec[child_right],
+ offsets[child_left],
+ offsets[child_right],
+ index, NULL) < 0) {
+ selected = child_left;
+ } else {
+ selected = child_right;
+ }
+
+ sel_tree[parent] = selected;
+
+ return parent;
+}
+
+/*********************************************************************//**
+Readjust the selection tree after popping the root and reading a new value
+@return the new root */
+static
+int
+row_fts_sel_tree_update(
+/*====================*/
+	int*		sel_tree,	/*!< in/out: selection tree */
+	ulint		propagated,	/*!< in: node to propagate up */
+	ulint		height,		/*!< in: tree height */
+	const mrec_t**	mrec,		/*!< in: sort record */
+	rec_offs**	offsets,	/*!< in: record offsets */
+	dict_index_t*	index)		/*!< in: index dictionary */
+{
+ ulint i;
+
+ for (i = 1; i <= height; i++) {
+ propagated = row_fts_sel_tree_propagate(
+ propagated, sel_tree, mrec, offsets, index);
+ }
+
+ return(sel_tree[0]);
+}
+
+/*********************************************************************//**
+Build selection tree at a specified level */
+static
+void
+row_fts_build_sel_tree_level(
+/*=========================*/
+	int*		sel_tree,	/*!< in/out: selection tree */
+	ulint		level,		/*!< in: selection tree level */
+	const mrec_t**	mrec,		/*!< in: sort record */
+	rec_offs**	offsets,	/*!< in: record offsets */
+	dict_index_t*	index)		/*!< in: index dictionary */
+{
+ ulint start;
+ int child_left;
+ int child_right;
+ ulint i;
+ ulint num_item = ulint(1) << level;
+
+ start = num_item - 1;
+
+ for (i = 0; i < num_item; i++) {
+ child_left = sel_tree[(start + i) * 2 + 1];
+ child_right = sel_tree[(start + i) * 2 + 2];
+
+ if (child_left == -1) {
+ if (child_right == -1) {
+ sel_tree[start + i] = -1;
+ } else {
+ sel_tree[start + i] = child_right;
+ }
+ continue;
+ } else if (child_right == -1) {
+ sel_tree[start + i] = child_left;
+ continue;
+ }
+
+ /* Deal with NULL child conditions */
+ if (!mrec[child_left]) {
+ if (!mrec[child_right]) {
+ sel_tree[start + i] = -1;
+ } else {
+ sel_tree[start + i] = child_right;
+ }
+ continue;
+ } else if (!mrec[child_right]) {
+ sel_tree[start + i] = child_left;
+ continue;
+ }
+
+ /* Select the smaller one to set parent pointer */
+ int cmp = cmp_rec_rec_simple(
+ mrec[child_left], mrec[child_right],
+ offsets[child_left], offsets[child_right],
+ index, NULL);
+
+ sel_tree[start + i] = cmp < 0 ? child_left : child_right;
+ }
+}
+
+/*********************************************************************//**
+Build a selection tree for the merge. The selection tree is a binary
+tree stored in an array, with the root at level 0 and the leaves at
+level ceil(log2(fts_sort_pll_degree)).
+@return number of tree levels */
+static
+ulint
+row_fts_build_sel_tree(
+/*===================*/
+	int*		sel_tree,	/*!< in/out: selection tree */
+	const mrec_t**	mrec,		/*!< in: sort record */
+	rec_offs**	offsets,	/*!< in: record offsets */
+	dict_index_t*	index)		/*!< in: index dictionary */
+{
+ ulint treelevel = 1;
+ ulint num = 2;
+ ulint i = 0;
+ ulint start;
+
+ /* No need to build selection tree if we only have two merge threads */
+ if (fts_sort_pll_degree <= 2) {
+ return(0);
+ }
+
+ while (num < fts_sort_pll_degree) {
+ num = num << 1;
+ treelevel++;
+ }
+
+ start = (ulint(1) << treelevel) - 1;
+
+ for (i = 0; i < fts_sort_pll_degree; i++) {
+ sel_tree[i + start] = int(i);
+ }
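+
+	/* Illustration: with fts_sort_pll_degree == 4, treelevel is 2
+	and start == 3, so the leaves occupy sel_tree[3..6] and the
+	internal nodes sel_tree[0..2]. */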
+
+ i = treelevel;
+ do {
+ row_fts_build_sel_tree_level(
+ sel_tree, --i, mrec, offsets, index);
+ } while (i > 0);
+
+ return(treelevel);
+}
+
+/*********************************************************************//**
+Read sorted file containing index data tuples and insert these data
+tuples to the index
+@return DB_SUCCESS or error number */
+dberr_t
+row_fts_merge_insert(
+/*=================*/
+ dict_index_t* index, /*!< in: index */
+ dict_table_t* table, /*!< in: new table */
+ fts_psort_t* psort_info, /*!< parallel sort info */
+	ulint		id)		/*!< in: which auxiliary table's
+					data to insert */
+{
+ const byte** b;
+ mem_heap_t* tuple_heap;
+ mem_heap_t* heap;
+ dberr_t error = DB_SUCCESS;
+ ulint* foffs;
+ rec_offs** offsets;
+ fts_tokenizer_word_t new_word;
+ ib_vector_t* positions;
+ doc_id_t last_doc_id;
+ ib_alloc_t* heap_alloc;
+ ulint i;
+ mrec_buf_t** buf;
+ pfs_os_file_t* fd;
+ byte** block;
+ byte** crypt_block;
+ const mrec_t** mrec;
+ ulint count = 0;
+ int* sel_tree;
+ ulint height;
+ ulint start;
+ fts_psort_insert_t ins_ctx;
+ uint64_t count_diag = 0;
+ fts_table_t fts_table;
+ char aux_table_name[MAX_FULL_NAME_LEN];
+ dict_table_t* aux_table;
+ dict_index_t* aux_index;
+ trx_t* trx;
+
+ /* We use the insert query graph as the dummy graph
+ needed in the row module call */
+
+ trx = trx_create();
+ trx_start_if_not_started(trx, true);
+
+ trx->op_info = "inserting index entries";
+
+ ins_ctx.opt_doc_id_size = psort_info[0].psort_common->opt_doc_id_size;
+
+ heap = mem_heap_create(500 + sizeof(mrec_buf_t));
+
+ b = (const byte**) mem_heap_alloc(
+ heap, sizeof (*b) * fts_sort_pll_degree);
+ foffs = (ulint*) mem_heap_alloc(
+ heap, sizeof(*foffs) * fts_sort_pll_degree);
+ offsets = (rec_offs**) mem_heap_alloc(
+ heap, sizeof(*offsets) * fts_sort_pll_degree);
+ buf = (mrec_buf_t**) mem_heap_alloc(
+ heap, sizeof(*buf) * fts_sort_pll_degree);
+ fd = (pfs_os_file_t*) mem_heap_alloc(heap, sizeof(*fd) * fts_sort_pll_degree);
+ block = (byte**) mem_heap_alloc(
+ heap, sizeof(*block) * fts_sort_pll_degree);
+ crypt_block = (byte**) mem_heap_alloc(
+ heap, sizeof(*block) * fts_sort_pll_degree);
+ mrec = (const mrec_t**) mem_heap_alloc(
+ heap, sizeof(*mrec) * fts_sort_pll_degree);
+ sel_tree = (int*) mem_heap_alloc(
+ heap, sizeof(*sel_tree) * (fts_sort_pll_degree * 2));
+
+ tuple_heap = mem_heap_create(1000);
+
+ ins_ctx.charset = fts_index_get_charset(index);
+ ins_ctx.heap = heap;
+
+ for (i = 0; i < fts_sort_pll_degree; i++) {
+ ulint num;
+
+ num = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ offsets[i] = static_cast<rec_offs*>(mem_heap_zalloc(
+ heap, num * sizeof *offsets[i]));
+ rec_offs_set_n_alloc(offsets[i], num);
+ rec_offs_set_n_fields(offsets[i], dict_index_get_n_fields(index));
+ block[i] = psort_info[i].merge_block[id];
+ crypt_block[i] = psort_info[i].crypt_block[id];
+ b[i] = psort_info[i].merge_block[id];
+ fd[i] = psort_info[i].merge_file[id]->fd;
+ foffs[i] = 0;
+
+ buf[i] = static_cast<mrec_buf_t*>(
+ mem_heap_alloc(heap, sizeof *buf[i]));
+
+ count_diag += psort_info[i].merge_file[id]->n_rec;
+ }
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ ib::info() << "InnoDB_FTS: to insert " << count_diag
+ << " records";
+ }
+
+ /* Initialize related variables if creating FTS indexes */
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ memset(&new_word, 0, sizeof(new_word));
+
+ new_word.nodes = ib_vector_create(heap_alloc, sizeof(fts_node_t), 4);
+ positions = ib_vector_create(heap_alloc, sizeof(ulint), 32);
+ last_doc_id = 0;
+
+ /* We should set the flags2 with aux_table_name here,
+ in order to get the correct aux table names. */
+ index->table->flags2 |= DICT_TF2_FTS_AUX_HEX_NAME;
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ index->table->flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME
+ & ((1U << DICT_TF2_BITS) - 1););
+ fts_table.type = FTS_INDEX_TABLE;
+ fts_table.index_id = index->id;
+ fts_table.table_id = table->id;
+ fts_table.table = index->table;
+ fts_table.suffix = fts_get_suffix(id);
+
+ /* Get aux index */
+ fts_get_table_name(&fts_table, aux_table_name);
+ aux_table = dict_table_open_on_name(aux_table_name, FALSE, FALSE,
+ DICT_ERR_IGNORE_NONE);
+ ut_ad(aux_table != NULL);
+ dict_table_close(aux_table, FALSE, FALSE);
+ aux_index = dict_table_get_first_index(aux_table);
+
+ ut_ad(!aux_index->is_instant());
+ /* row_merge_write_fts_node() depends on the correct value */
+ ut_ad(aux_index->n_core_null_bytes
+ == UT_BITS_IN_BYTES(aux_index->n_nullable));
+
+ /* Create bulk load instance */
+ ins_ctx.btr_bulk = UT_NEW_NOKEY(BtrBulk(aux_index, trx));
+
+ /* Create tuple for insert */
+ ins_ctx.tuple = dtuple_create(heap, dict_index_get_n_fields(aux_index));
+ dict_index_copy_types(ins_ctx.tuple, aux_index,
+ dict_index_get_n_fields(aux_index));
+
+ /* Set TRX_ID and ROLL_PTR */
+ dfield_set_data(dtuple_get_nth_field(ins_ctx.tuple, 2),
+ &reset_trx_id, DATA_TRX_ID_LEN);
+ dfield_set_data(dtuple_get_nth_field(ins_ctx.tuple, 3),
+ &reset_trx_id[DATA_TRX_ID_LEN], DATA_ROLL_PTR_LEN);
+
+ ut_d(ins_ctx.aux_index_id = id);
+
+ const ulint space = table->space_id;
+
+ for (i = 0; i < fts_sort_pll_degree; i++) {
+ if (psort_info[i].merge_file[id]->n_rec == 0) {
+ /* No Rows to read */
+ mrec[i] = b[i] = NULL;
+ } else {
+ /* Read from temp file only if it has been
+ written to. Otherwise, block memory holds
+ all the sorted records */
+ if (psort_info[i].merge_file[id]->offset > 0
+ && (!row_merge_read(
+ fd[i], foffs[i],
+ (row_merge_block_t*) block[i],
+ (row_merge_block_t*) crypt_block[i],
+ space))) {
+ error = DB_CORRUPTION;
+ goto exit;
+ }
+
+ ROW_MERGE_READ_GET_NEXT(i);
+ }
+ }
+
+ height = row_fts_build_sel_tree(sel_tree, (const mrec_t **) mrec,
+ offsets, index);
+
+ start = (1U << height) - 1;
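+
+	/* When a selection tree is used (fts_sort_pll_degree > 2),
+	its leaves occupy sel_tree[start] onwards, one slot per sort
+	partition. */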
+
+ /* Fetch sorted records from sort buffer and insert them into
+ corresponding FTS index auxiliary tables */
+ for (;;) {
+ dtuple_t* dtuple;
+ int min_rec = 0;
+
+ if (fts_sort_pll_degree <= 2) {
+ while (!mrec[min_rec]) {
+ min_rec++;
+
+ if (min_rec >= (int) fts_sort_pll_degree) {
+ row_fts_insert_tuple(
+ &ins_ctx, &new_word,
+ positions, &last_doc_id,
+ NULL);
+
+ goto exit;
+ }
+ }
+
+ for (i = min_rec + 1; i < fts_sort_pll_degree; i++) {
+ if (!mrec[i]) {
+ continue;
+ }
+
+ if (cmp_rec_rec_simple(
+ mrec[i], mrec[min_rec],
+ offsets[i], offsets[min_rec],
+ index, NULL) < 0) {
+ min_rec = static_cast<int>(i);
+ }
+ }
+ } else {
+ min_rec = sel_tree[0];
+
+ if (min_rec == -1) {
+ row_fts_insert_tuple(
+ &ins_ctx, &new_word,
+ positions, &last_doc_id,
+ NULL);
+
+ goto exit;
+ }
+ }
+
+ dtuple = row_rec_to_index_entry_low(
+ mrec[min_rec], index, offsets[min_rec],
+ tuple_heap);
+
+ row_fts_insert_tuple(
+ &ins_ctx, &new_word, positions,
+ &last_doc_id, dtuple);
+
+ ROW_MERGE_READ_GET_NEXT(min_rec);
+
+ if (fts_sort_pll_degree > 2) {
+ if (!mrec[min_rec]) {
+ sel_tree[start + min_rec] = -1;
+ }
+
+ row_fts_sel_tree_update(sel_tree, start + min_rec,
+ height, mrec,
+ offsets, index);
+ }
+
+ count++;
+
+ mem_heap_empty(tuple_heap);
+ }
+
+exit:
+ fts_sql_commit(trx);
+
+ trx->op_info = "";
+
+ mem_heap_free(tuple_heap);
+
+ error = ins_ctx.btr_bulk->finish(error);
+ UT_DELETE(ins_ctx.btr_bulk);
+
+ trx->free();
+
+ mem_heap_free(heap);
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ ib::info() << "InnoDB_FTS: inserted " << count << " records";
+ }
+
+ return(error);
+}
diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc
new file mode 100644
index 00000000..219e0e93
--- /dev/null
+++ b/storage/innobase/row/row0import.cc
@@ -0,0 +1,4290 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0import.cc
+Import a tablespace to a running instance.
+
+Created 2012-02-08 by Sunny Bains.
+*******************************************************/
+
+#include "row0import.h"
+#include "btr0pcur.h"
+#ifdef BTR_CUR_HASH_ADAPT
+# include "btr0sea.h"
+#endif
+#include "que0que.h"
+#include "dict0boot.h"
+#include "dict0load.h"
+#include "ibuf0ibuf.h"
+#include "pars0pars.h"
+#include "row0sel.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "row0quiesce.h"
+#include "fil0pagecompress.h"
+#include "trx0undo.h"
+#ifdef HAVE_LZO
+#include "lzo/lzo1x.h"
+#endif
+#ifdef HAVE_SNAPPY
+#include "snappy-c.h"
+#endif
+
+#include <vector>
+
+#ifdef HAVE_MY_AES_H
+#include <my_aes.h>
+#endif
+
+using st_::span;
+
+/** The size of the buffer to use for IO.
+@param n physical page size
+@return number of pages */
+#define IO_BUFFER_SIZE(n) ((1024 * 1024) / (n))
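+/* For example, with a 16KiB physical page size this evaluates to
+64 pages, i.e. 1MiB of I/O per batch. */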
+
+/** For gathering stats on records during phase I */
+struct row_stats_t {
+ ulint m_n_deleted; /*!< Number of deleted records
+ found in the index */
+
+	ulint		m_n_purged;		/*!< Number of records purged
+						optimistically */
+
+ ulint m_n_rows; /*!< Number of rows */
+
+ ulint m_n_purge_failed; /*!< Number of deleted rows
+ that could not be purged */
+};
+
+/** Index information required by IMPORT. */
+struct row_index_t {
+ index_id_t m_id; /*!< Index id of the table
+ in the exporting server */
+ byte* m_name; /*!< Index name */
+
+ ulint m_space; /*!< Space where it is placed */
+
+ ulint m_page_no; /*!< Root page number */
+
+ ulint m_type; /*!< Index type */
+
+ ulint m_trx_id_offset; /*!< Relevant only for clustered
+ indexes, offset of transaction
+ id system column */
+
+ ulint m_n_user_defined_cols; /*!< User defined columns */
+
+ ulint m_n_uniq; /*!< Number of columns that can
+ uniquely identify the row */
+
+ ulint m_n_nullable; /*!< Number of nullable
+ columns */
+
+ ulint m_n_fields; /*!< Total number of fields */
+
+ dict_field_t* m_fields; /*!< Index fields */
+
+ const dict_index_t*
+ m_srv_index; /*!< Index instance in the
+ importing server */
+
+ row_stats_t m_stats; /*!< Statistics gathered during
+ the import phase */
+
+};
+
+/** Meta data required by IMPORT. */
+struct row_import {
+ row_import() UNIV_NOTHROW
+ :
+ m_table(NULL),
+ m_version(0),
+ m_hostname(NULL),
+ m_table_name(NULL),
+ m_autoinc(0),
+ m_zip_size(0),
+ m_flags(0),
+ m_n_cols(0),
+ m_cols(NULL),
+ m_col_names(NULL),
+ m_n_indexes(0),
+ m_indexes(NULL),
+ m_missing(true) { }
+
+ ~row_import() UNIV_NOTHROW;
+
+	/** Find the index entry in the indexes array.
+ @param name index name
+ @return instance if found else 0. */
+ row_index_t* get_index(const char* name) const UNIV_NOTHROW;
+
+ /** Get the number of rows in the index.
+ @param name index name
+ @return number of rows (doesn't include delete marked rows). */
+ ulint get_n_rows(const char* name) const UNIV_NOTHROW;
+
+ /** Find the ordinal value of the column name in the cfg table columns.
+ @param name of column to look for.
+	@param name name of the column to look for.
+ ulint find_col(const char* name) const UNIV_NOTHROW;
+
+ /** Get the number of rows for which purge failed during the
+ convert phase.
+ @param name index name
+ @return number of rows for which purge failed. */
+ ulint get_n_purge_failed(const char* name) const UNIV_NOTHROW;
+
+	/** Check if the index is clean, i.e. has no delete-marked records
+ @param name index name
+ @return true if index needs to be purged. */
+ bool requires_purge(const char* name) const UNIV_NOTHROW
+ {
+ return(get_n_purge_failed(name) > 0);
+ }
+
+ /** Set the index root <space, pageno> using the index name */
+ void set_root_by_name() UNIV_NOTHROW;
+
+ /** Set the index root <space, pageno> using a heuristic
+ @return DB_SUCCESS or error code */
+ dberr_t set_root_by_heuristic() UNIV_NOTHROW;
+
+ /** Check if the index schema that was read from the .cfg file
+	matches the in-memory index definition.
+ Note: It will update row_import_t::m_srv_index to map the meta-data
+ read from the .cfg file to the server index instance.
+ @return DB_SUCCESS or error code. */
+ dberr_t match_index_columns(
+ THD* thd,
+ const dict_index_t* index) UNIV_NOTHROW;
+
+ /** Check if the table schema that was read from the .cfg file
+	matches the in-memory table definition.
+ @param thd MySQL session variable
+ @return DB_SUCCESS or error code. */
+ dberr_t match_table_columns(
+ THD* thd) UNIV_NOTHROW;
+
+ /** Check if the table (and index) schema that was read from the
+	.cfg file matches the in-memory table definition.
+ @param thd MySQL session variable
+ @return DB_SUCCESS or error code. */
+ dberr_t match_schema(
+ THD* thd) UNIV_NOTHROW;
+
+ dict_table_t* m_table; /*!< Table instance */
+
+ ulint m_version; /*!< Version of config file */
+
+ byte* m_hostname; /*!< Hostname where the
+ tablespace was exported */
+ byte* m_table_name; /*!< Exporting instance table
+ name */
+
+ ib_uint64_t m_autoinc; /*!< Next autoinc value */
+
+ ulint m_zip_size; /*!< ROW_FORMAT=COMPRESSED
+ page size, or 0 */
+
+ ulint m_flags; /*!< Table flags */
+
+ ulint m_n_cols; /*!< Number of columns in the
+ meta-data file */
+
+ dict_col_t* m_cols; /*!< Column data */
+
+ byte** m_col_names; /*!< Column names, we store the
+					column names separately because
+ there is no field to store the
+ value in dict_col_t */
+
+ ulint m_n_indexes; /*!< Number of indexes,
+ including clustered index */
+
+ row_index_t* m_indexes; /*!< Index meta data */
+
+	bool		m_missing;	/*!< true if the .cfg file was
+					missing or unreadable */
+};
+
+/** Use the page cursor to iterate over records in a block. */
+class RecIterator {
+public:
+ /** Default constructor */
+ RecIterator() UNIV_NOTHROW
+ {
+ memset(&m_cur, 0x0, sizeof(m_cur));
+ /* Make page_cur_delete_rec() happy. */
+ m_mtr.start();
+ m_mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ /** Position the cursor on the first user record. */
+ void open(buf_block_t* block) UNIV_NOTHROW
+ {
+ page_cur_set_before_first(block, &m_cur);
+
+ if (!end()) {
+ next();
+ }
+ }
+
+ /** Move to the next record. */
+ void next() UNIV_NOTHROW
+ {
+ page_cur_move_to_next(&m_cur);
+ }
+
+ /**
+ @return the current record */
+ rec_t* current() UNIV_NOTHROW
+ {
+ ut_ad(!end());
+ return(page_cur_get_rec(&m_cur));
+ }
+
+ buf_block_t* current_block() const { return m_cur.block; }
+
+ /**
+ @return true if cursor is at the end */
+ bool end() UNIV_NOTHROW
+ {
+ return(page_cur_is_after_last(&m_cur) == TRUE);
+ }
+
+ /** Remove the current record
+ @return true on success */
+ bool remove(
+ const dict_index_t* index,
+ rec_offs* offsets) UNIV_NOTHROW
+ {
+ ut_ad(page_is_leaf(m_cur.block->frame));
+ /* We can't end up with an empty page unless it is root. */
+ if (page_get_n_recs(m_cur.block->frame) <= 1) {
+ return(false);
+ }
+
+ if (!rec_offs_any_extern(offsets)
+ && m_cur.block->page.id().page_no() != index->page
+ && ((page_get_data_size(m_cur.block->frame)
+ - rec_offs_size(offsets)
+ < BTR_CUR_PAGE_COMPRESS_LIMIT(index))
+ || !page_has_siblings(m_cur.block->frame)
+ || (page_get_n_recs(m_cur.block->frame) < 2))) {
+ return false;
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ page_zip_des_t* page_zip = buf_block_get_page_zip(m_cur.block);
+ ut_a(!page_zip || page_zip_validate(
+ page_zip, m_cur.block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ page_cur_delete_rec(&m_cur, index, offsets, &m_mtr);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(
+ page_zip, m_cur.block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ return true;
+ }
+
+private:
+ page_cur_t m_cur;
+public:
+ mtr_t m_mtr;
+};
+
+/** Class that purges delete-marked records from indexes, both secondary
+and clustered. It does a pessimistic delete. This should only be done if we
+couldn't purge the delete-marked records during Phase I. */
+class IndexPurge {
+public:
+	/** Constructor
+	@param trx the user transaction covering the import tablespace
+	@param index index to be purged */
+ IndexPurge(
+ trx_t* trx,
+ dict_index_t* index) UNIV_NOTHROW
+ :
+ m_trx(trx),
+ m_index(index),
+ m_n_rows(0)
+ {
+ ib::info() << "Phase II - Purge records from index "
+ << index->name;
+ }
+
+	/** Destructor */
+ ~IndexPurge() UNIV_NOTHROW { }
+
+ /** Purge delete marked records.
+ @return DB_SUCCESS or error code. */
+ dberr_t garbage_collect() UNIV_NOTHROW;
+
+ /** The number of records that are not delete marked.
+ @return total records in the index after purge */
+ ulint get_n_rows() const UNIV_NOTHROW
+ {
+ return(m_n_rows);
+ }
+
+private:
+ /** Begin import, position the cursor on the first record. */
+ void open() UNIV_NOTHROW;
+
+	/** Close the persistent cursor and commit the mini-transaction. */
+ void close() UNIV_NOTHROW;
+
+ /** Position the cursor on the next record.
+ @return DB_SUCCESS or error code */
+ dberr_t next() UNIV_NOTHROW;
+
+ /** Store the persistent cursor position and reopen the
+ B-tree cursor in BTR_MODIFY_TREE mode, because the
+ tree structure may be changed during a pessimistic delete. */
+ void purge_pessimistic_delete() UNIV_NOTHROW;
+
+	/** Purge delete-marked records. */
+ void purge() UNIV_NOTHROW;
+
+protected:
+ // Disable copying
+ IndexPurge();
+ IndexPurge(const IndexPurge&);
+ IndexPurge &operator=(const IndexPurge&);
+
+private:
+ trx_t* m_trx; /*!< User transaction */
+ mtr_t m_mtr; /*!< Mini-transaction */
+ btr_pcur_t m_pcur; /*!< Persistent cursor */
+ dict_index_t* m_index; /*!< Index to be processed */
+ ulint m_n_rows; /*!< Records in index */
+};
+
+/** Functor that is called for each physical page that is read from the
+tablespace file. */
+class AbstractCallback
+{
+public:
+ /** Constructor
+	@param trx covering transaction
+	@param space_id tablespace identifier, or ULINT_UNDEFINED to
+	read it from the first page */
+ AbstractCallback(trx_t* trx, ulint space_id)
+ :
+ m_zip_size(0),
+ m_trx(trx),
+ m_space(space_id),
+ m_xdes(),
+ m_xdes_page_no(ULINT_UNDEFINED),
+ m_space_flags(ULINT_UNDEFINED) UNIV_NOTHROW { }
+
+ /** Free any extent descriptor instance */
+ virtual ~AbstractCallback()
+ {
+ UT_DELETE_ARRAY(m_xdes);
+ }
+
+ /** Determine the page size to use for traversing the tablespace
+ @param file_size size of the tablespace file in bytes
+ @param block contents of the first page in the tablespace file.
+ @retval DB_SUCCESS or error code. */
+ virtual dberr_t init(
+ os_offset_t file_size,
+ const buf_block_t* block) UNIV_NOTHROW;
+
+ /** @return true if compressed table. */
+ bool is_compressed_table() const UNIV_NOTHROW
+ {
+ return get_zip_size();
+ }
+
+ /** @return the tablespace flags */
+ ulint get_space_flags() const
+ {
+ return(m_space_flags);
+ }
+
+ /**
+ Set the name of the physical file and the file handle that is used
+ to open it for the file that is being iterated over.
+ @param filename the physical name of the tablespace file
+ @param file OS file handle */
+ void set_file(const char* filename, pfs_os_file_t file) UNIV_NOTHROW
+ {
+ m_file = file;
+ m_filepath = filename;
+ }
+
+ ulint get_zip_size() const { return m_zip_size; }
+ ulint physical_size() const
+ {
+ return m_zip_size ? m_zip_size : srv_page_size;
+ }
+
+ const char* filename() const { return m_filepath; }
+
+ /**
+ Called for every page in the tablespace. If the page was not
+ updated then its state must be set to BUF_PAGE_NOT_USED. For
+ compressed tables the page descriptor memory will be at offset:
+ block->frame + srv_page_size;
+ @param block block read from file, note it is not from the buffer pool
+ @retval DB_SUCCESS or error code. */
+ virtual dberr_t operator()(buf_block_t* block) UNIV_NOTHROW = 0;
+
+ /** @return the tablespace identifier */
+ ulint get_space_id() const { return m_space; }
+
+ bool is_interrupted() const { return trx_is_interrupted(m_trx); }
+
+ /**
+ Get the data page depending on the table type, compressed or not.
+ @param block - block read from disk
+ @retval the buffer frame */
+ static byte* get_frame(const buf_block_t* block)
+ {
+ return block->page.zip.data
+ ? block->page.zip.data : block->frame;
+ }
+
+protected:
+ /** Get the physical offset of the extent descriptor within the page.
+ @param page_no page number of the extent descriptor
+ @param page contents of the page containing the extent descriptor.
+	@return pointer to the extent descriptor of page_no */
+ const xdes_t* xdes(
+ ulint page_no,
+ const page_t* page) const UNIV_NOTHROW
+ {
+ ulint offset;
+
+ offset = xdes_calc_descriptor_index(get_zip_size(), page_no);
+
+ return(page + XDES_ARR_OFFSET + XDES_SIZE * offset);
+ }
+
+	/** Cache the current extent descriptor page (xdes). If the
+	extent descriptor is marked as free then discard the cached
+	copy and set it to NULL. This implies that all pages covered
+	by this extent descriptor are free as well.
+
+ @param page_no offset of page within the file
+ @param page page contents
+ @return DB_SUCCESS or error code. */
+ dberr_t set_current_xdes(
+ ulint page_no,
+ const page_t* page) UNIV_NOTHROW
+ {
+ m_xdes_page_no = page_no;
+
+ UT_DELETE_ARRAY(m_xdes);
+ m_xdes = NULL;
+
+ if (mach_read_from_4(XDES_ARR_OFFSET + XDES_STATE + page)
+ != XDES_FREE) {
+ const ulint physical_size = m_zip_size
+ ? m_zip_size : srv_page_size;
+
+ m_xdes = UT_NEW_ARRAY_NOKEY(xdes_t, physical_size);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_13",
+ UT_DELETE_ARRAY(m_xdes);
+ m_xdes = NULL;
+ );
+
+ if (m_xdes == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ memcpy(m_xdes, page, physical_size);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ /** Check if the page is marked as free in the extent descriptor.
+ @param page_no page number to check in the extent descriptor.
+ @return true if the page is marked as free */
+ bool is_free(uint32_t page_no) const UNIV_NOTHROW
+ {
+ ut_a(xdes_calc_descriptor_page(get_zip_size(), page_no)
+ == m_xdes_page_no);
+
+ if (m_xdes != 0) {
+ const xdes_t* xdesc = xdes(page_no, m_xdes);
+ ulint pos = page_no % FSP_EXTENT_SIZE;
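+			/* E.g. with the default 16KiB page size an
+			extent is 64 pages, so page 70 maps to bit 6
+			of its extent descriptor's page-state bitmap. */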
+
+ return xdes_is_free(xdesc, pos);
+ }
+
+ /* If the current xdes was free, the page must be free. */
+ return(true);
+ }
+
+protected:
+ /** The ROW_FORMAT=COMPRESSED page size, or 0. */
+ ulint m_zip_size;
+
+ /** File handle to the tablespace */
+ pfs_os_file_t m_file;
+
+ /** Physical file path. */
+ const char* m_filepath;
+
+ /** Covering transaction. */
+ trx_t* m_trx;
+
+ /** Space id of the file being iterated over. */
+ ulint m_space;
+
+ /** Current size of the space in pages */
+ ulint m_size;
+
+ /** Current extent descriptor page */
+ xdes_t* m_xdes;
+
+ /** Physical page offset in the file of the extent descriptor */
+ ulint m_xdes_page_no;
+
+ /** Flags value read from the header page */
+ ulint m_space_flags;
+};
+
+/** Determine the page size to use for traversing the tablespace
+@param file_size size of the tablespace file in bytes
+@param block contents of the first page in the tablespace file.
+@retval DB_SUCCESS or error code. */
+dberr_t
+AbstractCallback::init(
+ os_offset_t file_size,
+ const buf_block_t* block) UNIV_NOTHROW
+{
+ const page_t* page = block->frame;
+
+ m_space_flags = fsp_header_get_flags(page);
+ if (!fil_space_t::is_valid_flags(m_space_flags, true)) {
+ ulint cflags = fsp_flags_convert_from_101(m_space_flags);
+ if (cflags == ULINT_UNDEFINED) {
+ ib::error() << "Invalid FSP_SPACE_FLAGS="
+ << ib::hex(m_space_flags);
+ return(DB_CORRUPTION);
+ }
+ m_space_flags = cflags;
+ }
+
+ /* Clear the DATA_DIR flag, which is basically garbage. */
+ m_space_flags &= ~(1U << FSP_FLAGS_POS_RESERVED);
+ m_zip_size = fil_space_t::zip_size(m_space_flags);
+ const ulint logical_size = fil_space_t::logical_size(m_space_flags);
+ const ulint physical_size = fil_space_t::physical_size(m_space_flags);
+
+ if (logical_size != srv_page_size) {
+
+ ib::error() << "Page size " << logical_size
+ << " of ibd file is not the same as the server page"
+ " size " << srv_page_size;
+
+ return(DB_CORRUPTION);
+
+ } else if (file_size & (physical_size - 1)) {
+
+ ib::error() << "File size " << file_size << " is not a"
+ " multiple of the page size "
+ << physical_size;
+
+ return(DB_CORRUPTION);
+ }
+
+ m_size = mach_read_from_4(page + FSP_SIZE);
+ if (m_space == ULINT_UNDEFINED) {
+ m_space = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID
+ + page);
+ }
+
+ return set_current_xdes(0, page);
+}
+
+/**
+Try and determine the index root pages by checking if the next/prev
+pointers are both FIL_NULL. We also need to ensure that we skip deleted
+pages. */
+struct FetchIndexRootPages : public AbstractCallback {
+
+ /** Index information gathered from the .ibd file. */
+ struct Index {
+
+ Index(index_id_t id, ulint page_no)
+ :
+ m_id(id),
+ m_page_no(page_no) { }
+
+ index_id_t m_id; /*!< Index id */
+ ulint m_page_no; /*!< Root page number */
+ };
+
+ typedef std::vector<Index, ut_allocator<Index> > Indexes;
+
+ /** Constructor
+ @param trx covering (user) transaction
+ @param table table definition in server .*/
+ FetchIndexRootPages(const dict_table_t* table, trx_t* trx)
+ :
+ AbstractCallback(trx, ULINT_UNDEFINED),
+ m_table(table) UNIV_NOTHROW { }
+
+ /** Destructor */
+ ~FetchIndexRootPages() UNIV_NOTHROW override { }
+
+ /** Called for each block as it is read from the file.
+ @param block block to convert, it is not from the buffer pool.
+ @retval DB_SUCCESS or error code. */
+ dberr_t operator()(buf_block_t* block) UNIV_NOTHROW override;
+
+ /** Update the import configuration that will be used to import
+ the tablespace. */
+ dberr_t build_row_import(row_import* cfg) const UNIV_NOTHROW;
+
+ /** Table definition in server. */
+ const dict_table_t* m_table;
+
+ /** Index information */
+ Indexes m_indexes;
+};
+
+/** Called for each block as it is read from the file. Check index pages to
+determine the exact row format. We can't get that from the tablespace
+header flags alone.
+
+@param block block to convert, it is not from the buffer pool.
+@retval DB_SUCCESS or error code. */
+dberr_t FetchIndexRootPages::operator()(buf_block_t* block) UNIV_NOTHROW
+{
+ if (is_interrupted()) return DB_INTERRUPTED;
+
+ const page_t* page = get_frame(block);
+
+ ulint page_type = fil_page_get_type(page);
+
+ if (page_type == FIL_PAGE_TYPE_XDES) {
+ return set_current_xdes(block->page.id().page_no(), page);
+ } else if (fil_page_index_page_check(page)
+ && !is_free(block->page.id().page_no())
+ && !page_has_siblings(page)) {
+
+ index_id_t id = btr_page_get_index_id(page);
+
+ m_indexes.push_back(Index(id, block->page.id().page_no()));
+
+ if (m_indexes.size() == 1) {
+ /* Check that the tablespace flags match the table flags. */
+ ulint expected = dict_tf_to_fsp_flags(m_table->flags);
+ if (!fsp_flags_match(expected, m_space_flags)) {
+ ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Expected FSP_SPACE_FLAGS=0x%x, .ibd "
+ "file contains 0x%x.",
+ unsigned(expected),
+ unsigned(m_space_flags));
+ return(DB_CORRUPTION);
+ }
+ }
+
+ if (!page_is_comp(block->frame) !=
+ !dict_table_is_comp(m_table)) {
+ ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "ROW_FORMAT mismatch");
+ return DB_CORRUPTION;
+ }
+ }
+
+ return DB_SUCCESS;
+}
+
+/**
+Update the import configuration that will be used to import the tablespace.
+@return error code or DB_SUCCESS */
+dberr_t
+FetchIndexRootPages::build_row_import(row_import* cfg) const UNIV_NOTHROW
+{
+ Indexes::const_iterator end = m_indexes.end();
+
+ ut_a(cfg->m_table == m_table);
+ cfg->m_zip_size = m_zip_size;
+ cfg->m_n_indexes = m_indexes.size();
+
+ if (cfg->m_n_indexes == 0) {
+
+ ib::error() << "No B+Tree found in tablespace";
+
+ return(DB_CORRUPTION);
+ }
+
+ cfg->m_indexes = UT_NEW_ARRAY_NOKEY(row_index_t, cfg->m_n_indexes);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_11",
+ UT_DELETE_ARRAY(cfg->m_indexes);
+ cfg->m_indexes = NULL;
+ );
+
+ if (cfg->m_indexes == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes);
+
+ row_index_t* cfg_index = cfg->m_indexes;
+
+ for (Indexes::const_iterator it = m_indexes.begin();
+ it != end;
+ ++it, ++cfg_index) {
+
+ char name[BUFSIZ];
+
+ snprintf(name, sizeof(name), "index" IB_ID_FMT, it->m_id);
+
+ ulint len = strlen(name) + 1;
+
+ cfg_index->m_name = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_12",
+ UT_DELETE_ARRAY(cfg_index->m_name);
+ cfg_index->m_name = NULL;
+ );
+
+ if (cfg_index->m_name == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ memcpy(cfg_index->m_name, name, len);
+
+ cfg_index->m_id = it->m_id;
+
+ cfg_index->m_space = m_space;
+
+ cfg_index->m_page_no = it->m_page_no;
+ }
+
+ return(DB_SUCCESS);
+}
+
+/* Functor that is called for each physical page that is read from the
+tablespace file.
+
+ 1. Check each page for corruption.
+
+ 2. Update the space id and LSN on every page
+ * For the header page
+ - Validate the flags
+ - Update the LSN
+
+  3. On B-tree pages
+     * Set the index id
+     * Update the max trx id
+     * In a clustered index, update the system columns
+     * In a clustered index, update the BLOB ptr, set the space id
+     * Purge delete-marked records, but only if they can be easily
+       removed from the page
+     * Keep a counter of the number of rows, i.e. non-delete-marked rows
+     * Keep a counter of the number of delete-marked rows
+     * Keep a counter of the number of purge failures
+     * If a page is stamped with an index id that isn't in the .cfg file
+       we assume it is deleted and the page can be ignored.
+
+ 4. Set the page state to dirty so that it will be written to disk.
+*/
+class PageConverter : public AbstractCallback {
+public:
+ /** Constructor
+ @param cfg config of table being imported.
+ @param space_id tablespace identifier
+ @param trx transaction covering the import */
+ PageConverter(row_import* cfg, ulint space_id, trx_t* trx)
+ :
+ AbstractCallback(trx, space_id),
+ m_cfg(cfg),
+ m_index(cfg->m_indexes),
+ m_rec_iter(),
+ m_offsets_(), m_offsets(m_offsets_),
+ m_heap(0),
+ m_cluster_index(dict_table_get_first_index(cfg->m_table))
+ {
+ rec_offs_init(m_offsets_);
+ }
+
+ ~PageConverter() UNIV_NOTHROW override
+ {
+ if (m_heap != 0) {
+ mem_heap_free(m_heap);
+ }
+ }
+
+ /** Called for each block as it is read from the file.
+ @param block block to convert, it is not from the buffer pool.
+ @retval DB_SUCCESS or error code. */
+ dberr_t operator()(buf_block_t* block) UNIV_NOTHROW override;
+
+private:
+ /** Update the page, set the space id, max trx id and index id.
+ @param block block read from file
+ @param page_type type of the page
+ @retval DB_SUCCESS or error code */
+ dberr_t update_page(buf_block_t* block, uint16_t& page_type)
+ UNIV_NOTHROW;
+
+ /** Update the space, index id, trx id.
+ @param block block to convert
+ @return DB_SUCCESS or error code */
+ dberr_t update_index_page(buf_block_t* block) UNIV_NOTHROW;
+
+	/** Update the BLOB references and write UNDO log entries for
+ rows that can't be purged optimistically.
+ @param block block to update
+ @retval DB_SUCCESS or error code */
+ dberr_t update_records(buf_block_t* block) UNIV_NOTHROW;
+
+ /** Validate the space flags and update tablespace header page.
+ @param block block read from file, not from the buffer pool.
+ @retval DB_SUCCESS or error code */
+ dberr_t update_header(buf_block_t* block) UNIV_NOTHROW;
+
+ /** Adjust the BLOB reference for a single column that is externally stored
+ @param rec record to update
+ @param offsets column offsets for the record
+ @param i column ordinal value
+ @return DB_SUCCESS or error code */
+ dberr_t adjust_cluster_index_blob_column(
+ rec_t* rec,
+ const rec_offs* offsets,
+ ulint i) UNIV_NOTHROW;
+
+ /** Adjusts the BLOB reference in the clustered index row for all
+ externally stored columns.
+ @param rec record to update
+ @param offsets column offsets for the record
+ @return DB_SUCCESS or error code */
+ dberr_t adjust_cluster_index_blob_columns(
+ rec_t* rec,
+ const rec_offs* offsets) UNIV_NOTHROW;
+
+	/** In the clustered index, adjust the BLOB pointers as needed.
+ Also update the BLOB reference, write the new space id.
+ @param rec record to update
+ @param offsets column offsets for the record
+ @return DB_SUCCESS or error code */
+ dberr_t adjust_cluster_index_blob_ref(
+ rec_t* rec,
+ const rec_offs* offsets) UNIV_NOTHROW;
+
+ /** Purge delete-marked records, only if it is possible to do
+ so without re-organising the B+tree.
+ @retval true if purged */
+ bool purge() UNIV_NOTHROW;
+
+ /** Adjust the BLOB references and sys fields for the current record.
+ @param rec record to update
+ @param offsets column offsets for the record
+ @return DB_SUCCESS or error code. */
+ dberr_t adjust_cluster_record(
+ rec_t* rec,
+ const rec_offs* offsets) UNIV_NOTHROW;
+
+ /** Find an index with the matching id.
+ @return row_index_t* instance or 0 */
+ row_index_t* find_index(index_id_t id) UNIV_NOTHROW
+ {
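+		/* A linear scan is sufficient here: the number of
+		indexes per table is small; the .cfg reader below caps
+		it at 1024. */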
+ row_index_t* index = &m_cfg->m_indexes[0];
+
+ for (ulint i = 0; i < m_cfg->m_n_indexes; ++i, ++index) {
+ if (id == index->m_id) {
+ return(index);
+ }
+ }
+
+		return(0);
+	}
+private:
+ /** Config for table that is being imported. */
+ row_import* m_cfg;
+
+ /** Current index whose pages are being imported */
+ row_index_t* m_index;
+
+ /** Iterator over records in a block */
+ RecIterator m_rec_iter;
+
+ /** Record offset */
+ rec_offs m_offsets_[REC_OFFS_NORMAL_SIZE];
+
+ /** Pointer to m_offsets_ */
+ rec_offs* m_offsets;
+
+ /** Memory heap for the record offsets */
+ mem_heap_t* m_heap;
+
+ /** Cluster index instance */
+ dict_index_t* m_cluster_index;
+};
+
+/**
+row_import destructor. */
+row_import::~row_import() UNIV_NOTHROW
+{
+ for (ulint i = 0; m_indexes != 0 && i < m_n_indexes; ++i) {
+ UT_DELETE_ARRAY(m_indexes[i].m_name);
+
+ if (m_indexes[i].m_fields == NULL) {
+ continue;
+ }
+
+ dict_field_t* fields = m_indexes[i].m_fields;
+ ulint n_fields = m_indexes[i].m_n_fields;
+
+ for (ulint j = 0; j < n_fields; ++j) {
+ UT_DELETE_ARRAY(const_cast<char*>(fields[j].name()));
+ }
+
+ UT_DELETE_ARRAY(fields);
+ }
+
+ for (ulint i = 0; m_col_names != 0 && i < m_n_cols; ++i) {
+ UT_DELETE_ARRAY(m_col_names[i]);
+ }
+
+ UT_DELETE_ARRAY(m_cols);
+ UT_DELETE_ARRAY(m_indexes);
+ UT_DELETE_ARRAY(m_col_names);
+ UT_DELETE_ARRAY(m_table_name);
+ UT_DELETE_ARRAY(m_hostname);
+}
+
+/** Find the index entry in the indexes array.
+@param name index name
+@return instance if found else 0. */
+row_index_t*
+row_import::get_index(
+ const char* name) const UNIV_NOTHROW
+{
+ for (ulint i = 0; i < m_n_indexes; ++i) {
+ const char* index_name;
+ row_index_t* index = &m_indexes[i];
+
+ index_name = reinterpret_cast<const char*>(index->m_name);
+
+ if (strcmp(index_name, name) == 0) {
+
+ return(index);
+ }
+ }
+
+ return(0);
+}
+
+/** Get the number of rows in the index.
+@param name index name
+@return number of rows (doesn't include delete marked rows). */
+ulint
+row_import::get_n_rows(
+ const char* name) const UNIV_NOTHROW
+{
+ const row_index_t* index = get_index(name);
+
+	ut_a(index != 0);
+
+ return(index->m_stats.m_n_rows);
+}
+
+/** Get the number of rows for which purge failed during the convert phase.
+@param name index name
+@return number of rows for which purge failed. */
+ulint
+row_import::get_n_purge_failed(
+ const char* name) const UNIV_NOTHROW
+{
+ const row_index_t* index = get_index(name);
+
+	ut_a(index != 0);
+
+ return(index->m_stats.m_n_purge_failed);
+}
+
+/** Find the ordinal value of the column name in the cfg table columns.
+@param name column name to look for
+@return column ordinal value, or ULINT_UNDEFINED if not found. */
+ulint
+row_import::find_col(
+ const char* name) const UNIV_NOTHROW
+{
+ for (ulint i = 0; i < m_n_cols; ++i) {
+ const char* col_name;
+
+ col_name = reinterpret_cast<const char*>(m_col_names[i]);
+
+ if (strcmp(col_name, name) == 0) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**
+Check if the index schema that was read from the .cfg file matches the
+in memory index definition.
+@return DB_SUCCESS or error code. */
+dberr_t
+row_import::match_index_columns(
+ THD* thd,
+ const dict_index_t* index) UNIV_NOTHROW
+{
+ row_index_t* cfg_index;
+ dberr_t err = DB_SUCCESS;
+
+ cfg_index = get_index(index->name);
+
+ if (cfg_index == 0) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index %s not found in tablespace meta-data file.",
+ index->name());
+
+ return(DB_ERROR);
+ }
+
+ if (cfg_index->m_n_fields != index->n_fields) {
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index field count %u doesn't match"
+ " tablespace metadata file value " ULINTPF,
+ index->n_fields, cfg_index->m_n_fields);
+
+ return(DB_ERROR);
+ }
+
+ cfg_index->m_srv_index = index;
+
+ const dict_field_t* field = index->fields;
+ const dict_field_t* cfg_field = cfg_index->m_fields;
+
+ for (ulint i = 0; i < index->n_fields; ++i, ++field, ++cfg_field) {
+
+ if (strcmp(field->name(), cfg_field->name()) != 0) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index field name %s doesn't match"
+ " tablespace metadata field name %s"
+ " for field position " ULINTPF,
+ field->name(), cfg_field->name(), i);
+
+ err = DB_ERROR;
+ }
+
+ if (cfg_field->prefix_len != field->prefix_len) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index %s field %s prefix len %u"
+ " doesn't match metadata file value %u",
+ index->name(), field->name(),
+ field->prefix_len, cfg_field->prefix_len);
+
+ err = DB_ERROR;
+ }
+
+ if (cfg_field->fixed_len != field->fixed_len) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index %s field %s fixed len %u"
+ " doesn't match metadata file value %u",
+ index->name(), field->name(),
+ field->fixed_len,
+ cfg_field->fixed_len);
+
+ err = DB_ERROR;
+ }
+ }
+
+ return(err);
+}
+
+/** Check if the table schema that was read from the .cfg file matches the
+in memory table definition.
+@param thd MySQL session variable
+@return DB_SUCCESS or error code. */
+dberr_t
+row_import::match_table_columns(
+ THD* thd) UNIV_NOTHROW
+{
+ dberr_t err = DB_SUCCESS;
+ const dict_col_t* col = m_table->cols;
+
+ for (ulint i = 0; i < m_table->n_cols; ++i, ++col) {
+
+ const char* col_name;
+ ulint cfg_col_index;
+
+ col_name = dict_table_get_col_name(
+ m_table, dict_col_get_no(col));
+
+ cfg_col_index = find_col(col_name);
+
+ if (cfg_col_index == ULINT_UNDEFINED) {
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s not found in tablespace.",
+ col_name);
+
+ err = DB_ERROR;
+ } else if (cfg_col_index != col->ind) {
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s ordinal value mismatch, it's at %u"
+ " in the table and " ULINTPF
+ " in the tablespace meta-data file",
+ col_name, col->ind, cfg_col_index);
+
+ err = DB_ERROR;
+ } else {
+ const dict_col_t* cfg_col;
+
+ cfg_col = &m_cols[cfg_col_index];
+ ut_a(cfg_col->ind == cfg_col_index);
+
+ if (cfg_col->prtype != col->prtype) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s precise type mismatch,"
+ " it's 0X%X in the table and 0X%X"
+ " in the tablespace meta file",
+ col_name, col->prtype, cfg_col->prtype);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->mtype != col->mtype) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s main type mismatch,"
+ " it's 0X%X in the table and 0X%X"
+ " in the tablespace meta file",
+ col_name, col->mtype, cfg_col->mtype);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->len != col->len) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s length mismatch,"
+ " it's %u in the table and %u"
+ " in the tablespace meta file",
+ col_name, col->len, cfg_col->len);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->mbminlen != col->mbminlen
+ || cfg_col->mbmaxlen != col->mbmaxlen) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s multi-byte len mismatch,"
+ " it's %u-%u in the table and %u-%u"
+ " in the tablespace meta file",
+ col_name, col->mbminlen, col->mbmaxlen,
+ cfg_col->mbminlen, cfg_col->mbmaxlen);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->ind != col->ind) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s position mismatch,"
+ " it's %u in the table and %u"
+ " in the tablespace meta file",
+ col_name, col->ind, cfg_col->ind);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->ord_part != col->ord_part) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s ordering mismatch,"
+ " it's %u in the table and %u"
+ " in the tablespace meta file",
+ col_name, col->ord_part,
+ cfg_col->ord_part);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->max_prefix != col->max_prefix) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s max prefix mismatch"
+ " it's %u in the table and %u"
+ " in the tablespace meta file",
+ col_name, col->max_prefix,
+ cfg_col->max_prefix);
+ err = DB_ERROR;
+ }
+ }
+ }
+
+ return(err);
+}
+
+/** Check if the table (and index) schema that was read from the .cfg file
+matches the in memory table definition.
+@param thd MySQL session variable
+@return DB_SUCCESS or error code. */
+dberr_t
+row_import::match_schema(
+ THD* thd) UNIV_NOTHROW
+{
+ /* Do some simple checks. */
+
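+	/* Ignore DICT_TF_MASK_DATA_DIR in the comparison: the DATA
+	DIRECTORY flag only affects where the tablespace file is stored,
+	not the on-disk page format. */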
+ if (ulint mismatch = (m_table->flags ^ m_flags)
+ & ~DICT_TF_MASK_DATA_DIR) {
+ const char* msg;
+ if (mismatch & DICT_TF_MASK_ZIP_SSIZE) {
+ if ((m_table->flags & DICT_TF_MASK_ZIP_SSIZE)
+ && (m_flags & DICT_TF_MASK_ZIP_SSIZE)) {
+ switch (m_flags & DICT_TF_MASK_ZIP_SSIZE) {
+ case 0U << DICT_TF_POS_ZIP_SSIZE:
+ goto uncompressed;
+ case 1U << DICT_TF_POS_ZIP_SSIZE:
+ msg = "ROW_FORMAT=COMPRESSED"
+ " KEY_BLOCK_SIZE=1";
+ break;
+ case 2U << DICT_TF_POS_ZIP_SSIZE:
+ msg = "ROW_FORMAT=COMPRESSED"
+ " KEY_BLOCK_SIZE=2";
+ break;
+ case 3U << DICT_TF_POS_ZIP_SSIZE:
+ msg = "ROW_FORMAT=COMPRESSED"
+ " KEY_BLOCK_SIZE=4";
+ break;
+ case 4U << DICT_TF_POS_ZIP_SSIZE:
+ msg = "ROW_FORMAT=COMPRESSED"
+ " KEY_BLOCK_SIZE=8";
+ break;
+ case 5U << DICT_TF_POS_ZIP_SSIZE:
+ msg = "ROW_FORMAT=COMPRESSED"
+ " KEY_BLOCK_SIZE=16";
+ break;
+ default:
+ msg = "strange KEY_BLOCK_SIZE";
+ }
+ } else if (m_flags & DICT_TF_MASK_ZIP_SSIZE) {
+ msg = "ROW_FORMAT=COMPRESSED";
+ } else {
+ goto uncompressed;
+ }
+ } else {
+uncompressed:
+ msg = (m_flags & DICT_TF_MASK_ATOMIC_BLOBS)
+ ? "ROW_FORMAT=DYNAMIC"
+ : (m_flags & DICT_TF_MASK_COMPACT)
+ ? "ROW_FORMAT=COMPACT"
+ : "ROW_FORMAT=REDUNDANT";
+ }
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+ "Table flags don't match, server table has 0x%x"
+ " and the meta-data file has 0x" ULINTPFx ";"
+ " .cfg file uses %s",
+ m_table->flags, m_flags, msg);
+
+ return(DB_ERROR);
+ } else if (m_table->n_cols != m_n_cols) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+ "Number of columns don't match, table has %u "
+ "columns but the tablespace meta-data file has "
+ ULINTPF " columns",
+ m_table->n_cols, m_n_cols);
+
+ return(DB_ERROR);
+ } else if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) {
+
+		/* If the number of indexes doesn't match then it is better
+ to abort the IMPORT. It is easy for the user to create a
+ table matching the IMPORT definition. */
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+ "Number of indexes don't match, table has " ULINTPF
+ " indexes but the tablespace meta-data file has "
+ ULINTPF " indexes",
+ UT_LIST_GET_LEN(m_table->indexes), m_n_indexes);
+
+ return(DB_ERROR);
+ }
+
+ dberr_t err = match_table_columns(thd);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Check if the index definitions match. */
+
+ const dict_index_t* index;
+
+ for (index = UT_LIST_GET_FIRST(m_table->indexes);
+ index != 0;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ dberr_t index_err;
+
+ index_err = match_index_columns(thd, index);
+
+ if (index_err != DB_SUCCESS) {
+ err = index_err;
+ }
+ }
+
+ return(err);
+}
+
+/**
+Set the index root <space, pageno>, using index name. */
+void
+row_import::set_root_by_name() UNIV_NOTHROW
+{
+ row_index_t* cfg_index = m_indexes;
+
+ for (ulint i = 0; i < m_n_indexes; ++i, ++cfg_index) {
+ dict_index_t* index;
+
+ const char* index_name;
+
+ index_name = reinterpret_cast<const char*>(cfg_index->m_name);
+
+ index = dict_table_get_index_on_name(m_table, index_name);
+
+ /* We've already checked that it exists. */
+ ut_a(index != 0);
+
+ index->page = static_cast<uint32_t>(cfg_index->m_page_no);
+ }
+}
+
+/**
+Set the index root <space, pageno>, using a heuristic.
+@return DB_SUCCESS or error code */
+dberr_t
+row_import::set_root_by_heuristic() UNIV_NOTHROW
+{
+ row_index_t* cfg_index = m_indexes;
+
+ ut_a(m_n_indexes > 0);
+
+ // TODO: For now use brute force, based on ordinality
+
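+	/* Pair the i-th index of the table, in dictionary order, with
+	the i-th root page recorded in the tablespace. FTS indexes are
+	skipped and marked corrupt; they have to be rebuilt after the
+	import. */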
+ if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) {
+
+ ib::warn() << "Table " << m_table->name << " should have "
+ << UT_LIST_GET_LEN(m_table->indexes) << " indexes but"
+ " the tablespace has " << m_n_indexes << " indexes";
+ }
+
+ dict_mutex_enter_for_mysql();
+
+ ulint i = 0;
+ dberr_t err = DB_SUCCESS;
+
+ for (dict_index_t* index = UT_LIST_GET_FIRST(m_table->indexes);
+ index != 0;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ if (index->type & DICT_FTS) {
+ index->type |= DICT_CORRUPT;
+ ib::warn() << "Skipping FTS index: " << index->name;
+ } else if (i < m_n_indexes) {
+
+ UT_DELETE_ARRAY(cfg_index[i].m_name);
+
+ ulint len = strlen(index->name) + 1;
+
+ cfg_index[i].m_name = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_14",
+ UT_DELETE_ARRAY(cfg_index[i].m_name);
+ cfg_index[i].m_name = NULL;
+ );
+
+ if (cfg_index[i].m_name == NULL) {
+ err = DB_OUT_OF_MEMORY;
+ break;
+ }
+
+ memcpy(cfg_index[i].m_name, index->name, len);
+
+ cfg_index[i].m_srv_index = index;
+
+ index->page = static_cast<uint32_t>(
+ cfg_index[i++].m_page_no);
+ }
+ }
+
+ dict_mutex_exit_for_mysql();
+
+ return(err);
+}
+
+/**
+Purge delete-marked records.
+@return DB_SUCCESS or error code. */
+dberr_t
+IndexPurge::garbage_collect() UNIV_NOTHROW
+{
+ dberr_t err;
+ ibool comp = dict_table_is_comp(m_index->table);
+
+ /* Open the persistent cursor and start the mini-transaction. */
+
+ open();
+
+ while ((err = next()) == DB_SUCCESS) {
+
+ rec_t* rec = btr_pcur_get_rec(&m_pcur);
+ ibool deleted = rec_get_deleted_flag(rec, comp);
+
+ if (!deleted) {
+ ++m_n_rows;
+ } else {
+ purge();
+ }
+ }
+
+ /* Close the persistent cursor and commit the mini-transaction. */
+
+ close();
+
+ return(err == DB_END_OF_INDEX ? DB_SUCCESS : err);
+}
+
+/**
+Begin import, position the cursor on the first record. */
+void
+IndexPurge::open() UNIV_NOTHROW
+{
+ mtr_start(&m_mtr);
+
+ mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO);
+
+ btr_pcur_open_at_index_side(
+ true, m_index, BTR_MODIFY_LEAF, &m_pcur, true, 0, &m_mtr);
+ btr_pcur_move_to_next_user_rec(&m_pcur, &m_mtr);
+ if (rec_is_metadata(btr_pcur_get_rec(&m_pcur), *m_index)) {
+ ut_ad(btr_pcur_is_on_user_rec(&m_pcur));
+ /* Skip the metadata pseudo-record. */
+ } else {
+ btr_pcur_move_to_prev_on_page(&m_pcur);
+ }
+}
+
+/**
+Close the persistent cursor and commit the mini-transaction. */
+void
+IndexPurge::close() UNIV_NOTHROW
+{
+ btr_pcur_close(&m_pcur);
+ mtr_commit(&m_mtr);
+}
+
+/**
+Position the cursor on the next record.
+@return DB_SUCCESS or error code */
+dberr_t
+IndexPurge::next() UNIV_NOTHROW
+{
+ btr_pcur_move_to_next_on_page(&m_pcur);
+
+ /* When switching pages, commit the mini-transaction
+ in order to release the latch on the old page. */
+
+ if (!btr_pcur_is_after_last_on_page(&m_pcur)) {
+ return(DB_SUCCESS);
+ } else if (trx_is_interrupted(m_trx)) {
+ /* Check after every page because the check
+ is expensive. */
+ return(DB_INTERRUPTED);
+ }
+
+ btr_pcur_store_position(&m_pcur, &m_mtr);
+
+ mtr_commit(&m_mtr);
+
+ mtr_start(&m_mtr);
+
+ mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO);
+
+ btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr);
+ /* The following is based on btr_pcur_move_to_next_user_rec(). */
+ m_pcur.old_stored = false;
+ ut_ad(m_pcur.latch_mode == BTR_MODIFY_LEAF);
+ do {
+ if (btr_pcur_is_after_last_on_page(&m_pcur)) {
+ if (btr_pcur_is_after_last_in_tree(&m_pcur)) {
+ return DB_END_OF_INDEX;
+ }
+
+ buf_block_t* block = btr_pcur_get_block(&m_pcur);
+ uint32_t next_page = btr_page_get_next(block->frame);
+
+ /* MDEV-13542 FIXME: Make these checks part of
+ btr_pcur_move_to_next_page(), and introduce a
+ return status that will be checked in all callers! */
+ switch (next_page) {
+ default:
+ if (next_page != block->page.id().page_no()) {
+ break;
+ }
+ /* MDEV-20931 FIXME: Check that
+ next_page is within the tablespace
+ bounds! Also check that it is not a
+ change buffer bitmap page. */
+ /* fall through */
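+			/* Page 0 is the tablespace header and page 1
+			the change buffer bitmap; neither can be a valid
+			B-tree page. FIL_NULL means that there is no
+			successor page. */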
+ case 0:
+ case 1:
+ case FIL_NULL:
+ return DB_CORRUPTION;
+ }
+
+ dict_index_t* index = m_pcur.btr_cur.index;
+ buf_block_t* next_block = btr_block_get(
+ *index, next_page, BTR_MODIFY_LEAF, false,
+ &m_mtr);
+
+ if (UNIV_UNLIKELY(!next_block
+ || !fil_page_index_page_check(
+ next_block->frame)
+ || !!dict_index_is_spatial(index)
+ != (fil_page_get_type(
+ next_block->frame)
+ == FIL_PAGE_RTREE)
+ || page_is_comp(next_block->frame)
+ != page_is_comp(block->frame)
+ || btr_page_get_prev(
+ next_block->frame)
+ != block->page.id().page_no())) {
+ return DB_CORRUPTION;
+ }
+
+ btr_leaf_page_release(block, BTR_MODIFY_LEAF, &m_mtr);
+
+ page_cur_set_before_first(next_block,
+ &m_pcur.btr_cur.page_cur);
+
+ ut_d(page_check_dir(next_block->frame));
+ } else {
+ btr_pcur_move_to_next_on_page(&m_pcur);
+ }
+ } while (!btr_pcur_is_on_user_rec(&m_pcur));
+
+ return DB_SUCCESS;
+}
+
+/**
+Store the persistent cursor position and reopen the
+B-tree cursor in BTR_MODIFY_TREE mode, because the
+tree structure may be changed during a pessimistic delete. */
+void
+IndexPurge::purge_pessimistic_delete() UNIV_NOTHROW
+{
+ dberr_t err;
+
+ btr_pcur_restore_position(BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+ &m_pcur, &m_mtr);
+
+ ut_ad(rec_get_deleted_flag(
+ btr_pcur_get_rec(&m_pcur),
+ dict_table_is_comp(m_index->table)));
+
+ btr_cur_pessimistic_delete(
+ &err, FALSE, btr_pcur_get_btr_cur(&m_pcur), 0, false, &m_mtr);
+
+ ut_a(err == DB_SUCCESS);
+
+ /* Reopen the B-tree cursor in BTR_MODIFY_LEAF mode */
+ mtr_commit(&m_mtr);
+}
+
+/**
+Purge delete-marked records. */
+void
+IndexPurge::purge() UNIV_NOTHROW
+{
+ btr_pcur_store_position(&m_pcur, &m_mtr);
+
+ purge_pessimistic_delete();
+
+ mtr_start(&m_mtr);
+
+ mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO);
+
+ btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr);
+}
+
+/** Adjust the BLOB reference for a single column that is externally stored
+@param rec record to update
+@param offsets column offsets for the record
+@param i column ordinal value
+@return DB_SUCCESS or error code */
+inline
+dberr_t
+PageConverter::adjust_cluster_index_blob_column(
+ rec_t* rec,
+ const rec_offs* offsets,
+ ulint i) UNIV_NOTHROW
+{
+ ulint len;
+ byte* field;
+
+ field = rec_get_nth_field(rec, offsets, i, &len);
+
+ DBUG_EXECUTE_IF("ib_import_trigger_corruption_2",
+ len = BTR_EXTERN_FIELD_REF_SIZE - 1;);
+
+ if (len < BTR_EXTERN_FIELD_REF_SIZE) {
+
+ ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_INNODB_INDEX_CORRUPT,
+ "Externally stored column(" ULINTPF
+ ") has a reference length of " ULINTPF
+ " in the cluster index %s",
+ i, len, m_cluster_index->name());
+
+ return(DB_CORRUPTION);
+ }
+
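+	/* The externally stored column ends in a reference of
+	BTR_EXTERN_FIELD_REF_SIZE bytes; the space id is the 4-byte
+	field at offset BTR_EXTERN_SPACE_ID within that reference. */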
+ field += len - (BTR_EXTERN_FIELD_REF_SIZE - BTR_EXTERN_SPACE_ID);
+
+ mach_write_to_4(field, get_space_id());
+
+ if (UNIV_LIKELY_NULL(m_rec_iter.current_block()->page.zip.data)) {
+ page_zip_write_blob_ptr(
+ m_rec_iter.current_block(), rec, m_cluster_index,
+ offsets, i, &m_rec_iter.m_mtr);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Adjusts the BLOB reference in the clustered index row for all externally
+stored columns.
+@param rec record to update
+@param offsets column offsets for the record
+@return DB_SUCCESS or error code */
+inline
+dberr_t
+PageConverter::adjust_cluster_index_blob_columns(
+ rec_t* rec,
+ const rec_offs* offsets) UNIV_NOTHROW
+{
+ ut_ad(rec_offs_any_extern(offsets));
+
+ /* Adjust the space_id in the BLOB pointers. */
+
+ for (ulint i = 0; i < rec_offs_n_fields(offsets); ++i) {
+
+ /* Only if the column is stored "externally". */
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dberr_t err;
+
+ err = adjust_cluster_index_blob_column(rec, offsets, i);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** In the clustered index, adjust BLOB pointers as needed. Also update the
+BLOB reference, write the new space id.
+@param rec record to update
+@param offsets column offsets for the record
+@return DB_SUCCESS or error code */
+inline
+dberr_t
+PageConverter::adjust_cluster_index_blob_ref(
+ rec_t* rec,
+ const rec_offs* offsets) UNIV_NOTHROW
+{
+ if (rec_offs_any_extern(offsets)) {
+ dberr_t err;
+
+ err = adjust_cluster_index_blob_columns(rec, offsets);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Purge delete-marked records, only if it is possible to do so without
+re-organising the B+tree.
+@return true if purge succeeded */
+inline bool PageConverter::purge() UNIV_NOTHROW
+{
+ const dict_index_t* index = m_index->m_srv_index;
+
+ /* We can't have a page that is empty and not root. */
+ if (m_rec_iter.remove(index, m_offsets)) {
+
+ ++m_index->m_stats.m_n_purged;
+
+ return(true);
+ } else {
+ ++m_index->m_stats.m_n_purge_failed;
+ }
+
+ return(false);
+}
+
+/** Adjust the BLOB references and sys fields for the current record.
+@param rec record to update
+@param offsets column offsets for the record
+@return DB_SUCCESS or error code. */
+inline
+dberr_t
+PageConverter::adjust_cluster_record(
+ rec_t* rec,
+ const rec_offs* offsets) UNIV_NOTHROW
+{
+ dberr_t err;
+
+ if ((err = adjust_cluster_index_blob_ref(rec, offsets)) == DB_SUCCESS) {
+
+ /* Reset DB_TRX_ID and DB_ROLL_PTR. Normally, these fields
+ are only written in conjunction with other changes to the
+ record. */
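+		/* DB_TRX_ID = 0 together with a DB_ROLL_PTR that has
+		only the insert flag set marks the record as carrying
+		no undo log history that would need to be looked up. */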
+ ulint trx_id_pos = m_cluster_index->n_uniq
+ ? m_cluster_index->n_uniq : 1;
+ if (UNIV_LIKELY_NULL(m_rec_iter.current_block()
+ ->page.zip.data)) {
+ page_zip_write_trx_id_and_roll_ptr(
+ m_rec_iter.current_block(),
+ rec, m_offsets, trx_id_pos,
+ 0, roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS,
+ &m_rec_iter.m_mtr);
+ } else {
+ ulint len;
+ byte* ptr = rec_get_nth_field(
+ rec, m_offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ memcpy(ptr, reset_trx_id, sizeof reset_trx_id);
+ }
+ }
+
+ return(err);
+}
+
+/** Update the BLOB references and write UNDO log entries for
+rows that can't be purged optimistically.
+@param block block to update
+@retval DB_SUCCESS or error code */
+inline
+dberr_t
+PageConverter::update_records(
+ buf_block_t* block) UNIV_NOTHROW
+{
+ ibool comp = dict_table_is_comp(m_cfg->m_table);
+ bool clust_index = m_index->m_srv_index == m_cluster_index;
+
+ /* This will also position the cursor on the first user record. */
+
+ m_rec_iter.open(block);
+
+ while (!m_rec_iter.end()) {
+ rec_t* rec = m_rec_iter.current();
+ ibool deleted = rec_get_deleted_flag(rec, comp);
+
+ /* For the clustered index we have to adjust the BLOB
+ reference and the system fields irrespective of the
+ delete marked flag. The adjustment of delete marked
+ cluster records is required for purge to work later. */
+
+ if (deleted || clust_index) {
+ m_offsets = rec_get_offsets(
+ rec, m_index->m_srv_index, m_offsets,
+ m_index->m_srv_index->n_core_fields,
+ ULINT_UNDEFINED, &m_heap);
+ }
+
+ if (clust_index) {
+
+ dberr_t err = adjust_cluster_record(rec, m_offsets);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ /* If it is a delete marked record then try an
+ optimistic delete. */
+
+ if (deleted) {
+ /* A successful purge will move the cursor to the
+ next record. */
+
+ if (!purge()) {
+ m_rec_iter.next();
+ }
+
+ ++m_index->m_stats.m_n_deleted;
+ } else {
+ ++m_index->m_stats.m_n_rows;
+ m_rec_iter.next();
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Update the space, index id, trx id.
+@return DB_SUCCESS or error code */
+inline
+dberr_t
+PageConverter::update_index_page(
+ buf_block_t* block) UNIV_NOTHROW
+{
+ const page_id_t page_id(block->page.id());
+
+ if (is_free(page_id.page_no())) {
+ return(DB_SUCCESS);
+ }
+
+ buf_frame_t* page = block->frame;
+ const index_id_t id = btr_page_get_index_id(page);
+
+ if (id != m_index->m_id) {
+ row_index_t* index = find_index(id);
+
+ if (UNIV_UNLIKELY(!index)) {
+ ib::warn() << "Unknown index id " << id
+ << " on page " << page_id.page_no();
+ return DB_SUCCESS;
+ }
+
+ m_index = index;
+ }
+
+ /* If the .cfg file is missing and there is an index mismatch
+ then ignore the error. */
+ if (m_cfg->m_missing && !m_index->m_srv_index) {
+ return(DB_SUCCESS);
+ }
+
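+	/* On the index root page, the file segment headers
+	PAGE_BTR_SEG_LEAF and PAGE_BTR_SEG_TOP embed the tablespace id
+	(FSEG_HDR_SPACE); stamp the new space id into both of them. */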
+ if (m_index && page_id.page_no() == m_index->m_page_no) {
+ byte *b = FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + FSEG_HDR_SPACE
+ + page;
+ mach_write_to_4(b, page_id.space());
+
+ memcpy(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + FSEG_HDR_SPACE
+ + page, b, 4);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ memcpy(&block->page.zip.data[FIL_PAGE_DATA
+ + PAGE_BTR_SEG_TOP
+ + FSEG_HDR_SPACE], b, 4);
+ memcpy(&block->page.zip.data[FIL_PAGE_DATA
+ + PAGE_BTR_SEG_LEAF
+ + FSEG_HDR_SPACE], b, 4);
+ }
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!block->page.zip.data || page_zip_validate(&block->page.zip, page,
+ m_index->m_srv_index));
+#endif /* UNIV_ZIP_DEBUG */
+
+	/* This has to be written to the uncompressed index header. Set
+	it to the current index id. */
+ mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID),
+ m_index->m_srv_index->id);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ memcpy(&block->page.zip.data[PAGE_HEADER + PAGE_INDEX_ID],
+ &block->frame[PAGE_HEADER + PAGE_INDEX_ID], 8);
+ }
+
+ if (m_index->m_srv_index->is_clust()) {
+ if (page_id.page_no() == m_index->m_srv_index->page) {
+ dict_index_t* index = const_cast<dict_index_t*>(
+ m_index->m_srv_index);
+ /* Preserve the PAGE_ROOT_AUTO_INC. */
+ if (index->table->supports_instant()) {
+ if (btr_cur_instant_root_init(index, page)) {
+ return(DB_CORRUPTION);
+ }
+
+ if (index->n_core_fields > index->n_fields) {
+ /* Some columns have been dropped.
+ Refuse to IMPORT TABLESPACE for now.
+
+ NOTE: This is not an accurate check.
+ Columns could have been both
+ added and dropped instantly.
+ For an accurate check, we must read
+ the metadata BLOB page pointed to
+ by the leftmost leaf page.
+
+ But we would have to read
+ those pages in a special way,
+ bypassing the buffer pool! */
+ return DB_UNSUPPORTED;
+ }
+
+ /* Provisionally set all instantly
+ added columns to be DEFAULT NULL. */
+ for (unsigned i = index->n_core_fields;
+ i < index->n_fields; i++) {
+ dict_col_t* col = index->fields[i].col;
+ col->def_val.len = UNIV_SQL_NULL;
+ col->def_val.data = NULL;
+ }
+ }
+ } else {
+ goto clear_page_max_trx_id;
+ }
+ } else if (page_is_leaf(page)) {
+ /* Set PAGE_MAX_TRX_ID on secondary index leaf pages. */
+ mach_write_to_8(&block->frame[PAGE_HEADER + PAGE_MAX_TRX_ID],
+ m_trx->id);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ memcpy_aligned<8>(&block->page.zip.data
+ [PAGE_HEADER + PAGE_MAX_TRX_ID],
+ &block->frame
+ [PAGE_HEADER + PAGE_MAX_TRX_ID], 8);
+ }
+ } else {
+clear_page_max_trx_id:
+ /* Clear PAGE_MAX_TRX_ID so that it can be
+ used for other purposes in the future. IMPORT
+ in MySQL 5.6, 5.7 and MariaDB 10.0 and 10.1
+ would set the field to the transaction ID even
+ on clustered index pages. */
+ memset_aligned<8>(&block->frame[PAGE_HEADER + PAGE_MAX_TRX_ID],
+ 0, 8);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ memset_aligned<8>(&block->page.zip.data
+ [PAGE_HEADER + PAGE_MAX_TRX_ID],
+ 0, 8);
+ }
+ }
+
+ if (page_is_empty(page)) {
+
+ /* Only a root page can be empty. */
+ if (page_has_siblings(page)) {
+ // TODO: We should relax this and skip secondary
+ // indexes. Mark them as corrupt because they can
+ // always be rebuilt.
+ return(DB_CORRUPTION);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ return page_is_leaf(block->frame) ? update_records(block) : DB_SUCCESS;
+}
+
+/** Validate the space flags and update tablespace header page.
+@param block block read from file, not from the buffer pool.
+@retval DB_SUCCESS or error code */
+inline dberr_t PageConverter::update_header(buf_block_t* block) UNIV_NOTHROW
+{
+ byte *frame= get_frame(block);
+ if (memcmp_aligned<2>(FIL_PAGE_SPACE_ID + frame,
+ FSP_HEADER_OFFSET + FSP_SPACE_ID + frame, 4))
+ ib::warn() << "Space id check in the header failed: ignored";
+ else if (!mach_read_from_4(FIL_PAGE_SPACE_ID + frame))
+ return DB_CORRUPTION;
+
+ memset(frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+
+ /* Write space_id to the tablespace header, page 0. */
+ mach_write_to_4(FIL_PAGE_SPACE_ID + frame, get_space_id());
+ memcpy_aligned<2>(FSP_HEADER_OFFSET + FSP_SPACE_ID + frame,
+ FIL_PAGE_SPACE_ID + frame, 4);
+ /* Write back the adjusted flags. */
+ mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + frame, m_space_flags);
+
+ return DB_SUCCESS;
+}
+
+/** Update the page, set the space id, max trx id and index id.
+@param block block read from file
+@retval DB_SUCCESS or error code */
+inline
+dberr_t
+PageConverter::update_page(buf_block_t* block, uint16_t& page_type)
+ UNIV_NOTHROW
+{
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(!block->page.zip.data == !is_compressed_table());
+
+ switch (page_type = fil_page_get_type(get_frame(block))) {
+ case FIL_PAGE_TYPE_FSP_HDR:
+ ut_a(block->page.id().page_no() == 0);
+ /* Work directly on the uncompressed page headers. */
+ return(update_header(block));
+
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ /* We need to decompress the contents into block->frame
+ before we can do any thing with Btree pages. */
+
+ if (is_compressed_table() && !buf_zip_decompress(block, TRUE)) {
+ return(DB_CORRUPTION);
+ }
+
+ /* fall through */
+ case FIL_PAGE_TYPE_INSTANT:
+ /* This is on every page in the tablespace. */
+ mach_write_to_4(
+ get_frame(block)
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id());
+
+ /* Only update the Btree nodes. */
+ return(update_index_page(block));
+
+ case FIL_PAGE_TYPE_SYS:
+ /* This is page 0 in the system tablespace. */
+ return(DB_CORRUPTION);
+
+ case FIL_PAGE_TYPE_XDES:
+ err = set_current_xdes(
+ block->page.id().page_no(), get_frame(block));
+ /* fall through */
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_TYPE_TRX_SYS:
+ case FIL_PAGE_IBUF_FREE_LIST:
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_BLOB:
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+
+ /* Work directly on the uncompressed page headers. */
+ /* This is on every page in the tablespace. */
+ mach_write_to_4(
+ get_frame(block)
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id());
+
+ return(err);
+ }
+
+ ib::warn() << "Unknown page type (" << page_type << ")";
+
+ return(DB_CORRUPTION);
+}
+
+/** Called for every page in the tablespace. If the page was not
+updated then its state must be set to BUF_PAGE_NOT_USED.
+@param block block read from file, note it is not from the buffer pool
+@retval DB_SUCCESS or error code. */
+dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW
+{
+ /* If we already had an old page with matching number
+ in the buffer pool, evict it now, because
+ we no longer evict the pages on DISCARD TABLESPACE. */
+ buf_page_get_gen(block->page.id(), get_zip_size(),
+ RW_NO_LATCH, NULL, BUF_EVICT_IF_IN_POOL,
+ __FILE__, __LINE__, NULL, NULL);
+
+ uint16_t page_type;
+
+ if (dberr_t err = update_page(block, page_type)) {
+ return err;
+ }
+
+ const bool full_crc32 = fil_space_t::full_crc32(get_space_flags());
+ byte* frame = get_frame(block);
+ memset_aligned<8>(frame + FIL_PAGE_LSN, 0, 8);
+
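+	/* The FIL_PAGE_LSN field was zeroed above; now recompute the
+	checksum in the format that is appropriate for the page. */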
+ if (!block->page.zip.data) {
+ buf_flush_init_for_writing(
+ NULL, block->frame, NULL, full_crc32);
+ } else if (fil_page_type_is_index(page_type)) {
+ buf_flush_init_for_writing(
+ NULL, block->page.zip.data, &block->page.zip,
+ full_crc32);
+ } else {
+ /* Calculate and update the checksum of non-index
+ pages for ROW_FORMAT=COMPRESSED tables. */
+ buf_flush_update_zip_checksum(
+ block->page.zip.data, block->zip_size());
+ }
+
+ return DB_SUCCESS;
+}
+
+/*****************************************************************//**
+Clean up after import tablespace. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_cleanup(
+/*===============*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */
+ trx_t* trx, /*!< in/out: transaction for import */
+ dberr_t err) /*!< in: error code */
+{
+ ut_a(prebuilt->trx != trx);
+
+ if (err != DB_SUCCESS) {
+ dict_table_t* table = prebuilt->table;
+ table->file_unreadable = true;
+ if (table->space) {
+ fil_close_tablespace(table->space_id);
+ table->space = NULL;
+ }
+
+ prebuilt->trx->error_info = NULL;
+
+ ib::info() << "Discarding tablespace of table "
+ << table->name << ": " << err;
+
+ if (!trx->dict_operation_lock_mode) {
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+ index->page = FIL_NULL;
+ }
+ }
+
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ DBUG_EXECUTE_IF("ib_import_before_commit_crash", DBUG_SUICIDE(););
+
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->free();
+
+ prebuilt->trx->op_info = "";
+
+ DBUG_EXECUTE_IF("ib_import_before_checkpoint_crash", DBUG_SUICIDE(););
+
+ log_make_checkpoint();
+
+ return(err);
+}
+
+/*****************************************************************//**
+Report error during tablespace import. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_error(
+/*=============*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */
+ trx_t* trx, /*!< in/out: transaction for import */
+ dberr_t err) /*!< in: error code */
+{
+ if (!trx_is_interrupted(trx)) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name),
+ prebuilt->table->name.m_name);
+
+ ib_senderrf(
+ trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_INNODB_IMPORT_ERROR,
+ table_name, (ulong) err, ut_strerr(err));
+ }
+
+ return(row_import_cleanup(prebuilt, trx, err));
+}
+
+/*****************************************************************//**
+Adjust the root page index node and leaf node segment headers, update
+with the new space id. For all the table's secondary indexes.
+@return error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_adjust_root_pages_of_secondary_indexes(
+/*==============================================*/
+ trx_t* trx, /*!< in: transaction used for
+ the import */
+ dict_table_t* table, /*!< in: table the indexes
+ belong to */
+ const row_import& cfg) /*!< Import context */
+{
+ dict_index_t* index;
+ ulint n_rows_in_table;
+ dberr_t err = DB_SUCCESS;
+
+ /* Skip the clustered index. */
+ index = dict_table_get_first_index(table);
+
+ n_rows_in_table = cfg.get_n_rows(index->name);
+
+ DBUG_EXECUTE_IF("ib_import_sec_rec_count_mismatch_failure",
+ n_rows_in_table++;);
+
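+	/* Every secondary index must contain exactly one entry for each
+	clustered index row; the row count read above is used as the
+	reference value below. */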
+ /* Adjust the root pages of the secondary indexes only. */
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ ut_a(!dict_index_is_clust(index));
+
+ if (!(index->type & DICT_CORRUPT)
+ && index->page != FIL_NULL) {
+
+ /* Update the Btree segment headers for index node and
+ leaf nodes in the root page. Set the new space id. */
+
+ err = btr_root_adjust_on_import(index);
+ } else {
+ ib::warn() << "Skip adjustment of root pages for"
+ " index " << index->name << ".";
+
+ err = DB_CORRUPTION;
+ }
+
+ if (err != DB_SUCCESS) {
+
+ if (index->type & DICT_CLUSTERED) {
+ break;
+ }
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN,
+ ER_INNODB_INDEX_CORRUPT,
+ "Index %s not found or corrupt,"
+ " you should recreate this index.",
+ index->name());
+
+ /* Do not bail out, so that the data
+ can be recovered. */
+
+ err = DB_SUCCESS;
+ index->type |= DICT_CORRUPT;
+ continue;
+ }
+
+ /* If we failed to purge any records in the index then
+ do it the hard way.
+
+ TODO: We can do this in the first pass by generating UNDO log
+ records for the failed rows. */
+
+ if (!cfg.requires_purge(index->name)) {
+ continue;
+ }
+
+ IndexPurge purge(trx, index);
+
+ trx->op_info = "secondary: purge delete marked records";
+
+ err = purge.garbage_collect();
+
+ trx->op_info = "";
+
+ if (err != DB_SUCCESS) {
+ break;
+ } else if (purge.get_n_rows() != n_rows_in_table) {
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN,
+ ER_INNODB_INDEX_CORRUPT,
+ "Index '%s' contains " ULINTPF " entries, "
+ "should be " ULINTPF ", you should recreate "
+ "this index.", index->name(),
+ purge.get_n_rows(), n_rows_in_table);
+
+ index->type |= DICT_CORRUPT;
+
+ /* Do not bail out, so that the data
+ can be recovered. */
+
+ err = DB_SUCCESS;
+ }
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Ensure that dict_sys.row_id exceeds SELECT MAX(DB_ROW_ID). */
+MY_ATTRIBUTE((nonnull)) static
+void
+row_import_set_sys_max_row_id(
+/*==========================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from
+ handler */
+ const dict_table_t* table) /*!< in: table to import */
+{
+ const rec_t* rec;
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ row_id_t row_id = 0;
+ dict_index_t* index;
+
+ index = dict_table_get_first_index(table);
+ ut_ad(index->is_primary());
+ ut_ad(dict_index_is_auto_gen_clust(index));
+
+ mtr_start(&mtr);
+
+ mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+
+ btr_pcur_open_at_index_side(
+ false, // High end
+ index,
+ BTR_SEARCH_LEAF,
+ &pcur,
+ true, // Init cursor
+ 0, // Leaf level
+ &mtr);
+
+ btr_pcur_move_to_prev_on_page(&pcur);
+ rec = btr_pcur_get_rec(&pcur);
+
+ /* Check for empty table. */
+ if (page_rec_is_infimum(rec)) {
+ /* The table is empty. */
+ } else if (rec_is_metadata(rec, *index)) {
+ /* The clustered index contains the metadata record only,
+ that is, the table is empty. */
+ } else {
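+		/* The 6-byte DB_ROW_ID is the first field of a record
+		in a generated clustered index. */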
+ row_id = mach_read_from_6(rec);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ if (row_id) {
+ /* Update the system row id if the imported index row id is
+ greater than the max system row id. */
+
+ mutex_enter(&dict_sys.mutex);
+
+ if (row_id >= dict_sys.row_id) {
+ dict_sys.row_id = row_id + 1;
+ dict_hdr_flush_row_id();
+ }
+
+ mutex_exit(&dict_sys.mutex);
+ }
+}
+
+/*****************************************************************//**
+Read a string from the meta-data file.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_import_cfg_read_string(
+/*=======================*/
+ FILE* file, /*!< in/out: File to read from */
+ byte* ptr, /*!< out: string to read */
+ ulint max_len) /*!< in: maximum length of the output
+ buffer in bytes */
+{
+ DBUG_EXECUTE_IF("ib_import_string_read_error",
+ errno = EINVAL; return(DB_IO_ERROR););
+
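+	/* Strings are stored NUL-terminated, and the length field that
+	precedes them includes the NUL byte; the read succeeds only if
+	the NUL is found exactly at offset max_len - 1. */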
+ ulint len = 0;
+
+ while (!feof(file)) {
+ int ch = fgetc(file);
+
+ if (ch == EOF) {
+ break;
+ } else if (ch != 0) {
+ if (len < max_len) {
+ ptr[len++] = static_cast<byte>(ch);
+ } else {
+ break;
+ }
+ /* max_len includes the NUL byte */
+ } else if (len != max_len - 1) {
+ break;
+ } else {
+ ptr[len] = 0;
+ return(DB_SUCCESS);
+ }
+ }
+
+ errno = EINVAL;
+
+ return(DB_IO_ERROR);
+}
+
+/*********************************************************************//**
+Read the meta data (index user fields) from the config file.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_cfg_read_index_fields(
+/*=============================*/
+	FILE*		file,		/*!< in: file to read from */
+ THD* thd, /*!< in/out: session */
+ row_index_t* index) /*!< Index being read in */
+{
+ byte row[sizeof(ib_uint32_t) * 3];
+ ulint n_fields = index->m_n_fields;
+
+ index->m_fields = UT_NEW_ARRAY_NOKEY(dict_field_t, n_fields);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_4",
+ UT_DELETE_ARRAY(index->m_fields);
+ index->m_fields = NULL;
+ );
+
+ if (index->m_fields == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ dict_field_t* field = index->m_fields;
+
+ for (ulint i = 0; i < n_fields; ++i, ++field) {
+ byte* ptr = row;
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_1",
+ (void) fseek(file, 0L, SEEK_END););
+
+ if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading index fields.");
+
+ return(DB_IO_ERROR);
+ }
+
+ new (field) dict_field_t();
+
+ field->prefix_len = mach_read_from_4(ptr) & ((1U << 12) - 1);
+ ptr += sizeof(ib_uint32_t);
+
+ field->fixed_len = mach_read_from_4(ptr) & ((1U << 10) - 1);
+ ptr += sizeof(ib_uint32_t);
+
+ /* Include the NUL byte in the length. */
+ ulint len = mach_read_from_4(ptr);
+
+ byte* name = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_5",
+ UT_DELETE_ARRAY(name);
+ name = NULL;
+ );
+
+ if (name == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ field->name = reinterpret_cast<const char*>(name);
+
+ dberr_t err = row_import_cfg_read_string(file, name, len);
+
+ if (err != DB_SUCCESS) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while parsing table name.");
+
+ return(err);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Read the index names and root page numbers of the indexes and set the values.
+Row format: [index id, space id, root page no, type, trx id offset,
+n_user_defined_cols, n_uniq, n_nullable, n_fields, name length, name]
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_read_index_data(
+/*=======================*/
+ FILE* file, /*!< in: File to read from */
+ THD* thd, /*!< in: session */
+ row_import* cfg) /*!< in/out: meta-data read */
+{
+ byte* ptr;
+ row_index_t* cfg_index;
+ byte row[sizeof(index_id_t) + sizeof(ib_uint32_t) * 9];
+
+ /* FIXME: What is the max value? */
+ ut_a(cfg->m_n_indexes > 0);
+ ut_a(cfg->m_n_indexes < 1024);
+
+ cfg->m_indexes = UT_NEW_ARRAY_NOKEY(row_index_t, cfg->m_n_indexes);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_6",
+ UT_DELETE_ARRAY(cfg->m_indexes);
+ cfg->m_indexes = NULL;
+ );
+
+ if (cfg->m_indexes == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes);
+
+ cfg_index = cfg->m_indexes;
+
+ for (ulint i = 0; i < cfg->m_n_indexes; ++i, ++cfg_index) {
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_2",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the index data. */
+ size_t n_bytes = fread(row, 1, sizeof(row), file);
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error",
+ (void) fseek(file, 0L, SEEK_END););
+
+ if (n_bytes != sizeof(row)) {
+ char msg[BUFSIZ];
+
+ snprintf(msg, sizeof(msg),
+ "while reading index meta-data, expected "
+ "to read " ULINTPF
+ " bytes but read only " ULINTPF " bytes",
+ sizeof(row), n_bytes);
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno), msg);
+
+ ib::error() << "IO Error: " << msg;
+
+ return(DB_IO_ERROR);
+ }
+
+ ptr = row;
+
+ cfg_index->m_id = mach_read_from_8(ptr);
+ ptr += sizeof(index_id_t);
+
+ cfg_index->m_space = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_page_no = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_type = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_trx_id_offset = mach_read_from_4(ptr);
+ if (cfg_index->m_trx_id_offset != mach_read_from_4(ptr)) {
+ ut_ad(0);
+ /* Overflow. Pretend that the clustered index
+ has a variable-length PRIMARY KEY. */
+ cfg_index->m_trx_id_offset = 0;
+ }
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_n_user_defined_cols = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_n_uniq = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_n_nullable = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_n_fields = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ /* The NUL byte is included in the name length. */
+ ulint len = mach_read_from_4(ptr);
+
+ if (len > OS_FILE_MAX_PATH) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_INNODB_INDEX_CORRUPT,
+ "Index name length (" ULINTPF ") is too long, "
+ "the meta-data is corrupt", len);
+
+ return(DB_CORRUPTION);
+ }
+
+ cfg_index->m_name = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_7",
+ UT_DELETE_ARRAY(cfg_index->m_name);
+ cfg_index->m_name = NULL;
+ );
+
+ if (cfg_index->m_name == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ dberr_t err;
+
+ err = row_import_cfg_read_string(file, cfg_index->m_name, len);
+
+ if (err != DB_SUCCESS) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while parsing index name.");
+
+ return(err);
+ }
+
+ err = row_import_cfg_read_index_fields(file, thd, cfg_index);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Read the number of indexes and their meta-data (v1 format).
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_import_read_indexes(
+/*====================*/
+ FILE* file, /*!< in: File to read from */
+ THD* thd, /*!< in: session */
+ row_import* cfg) /*!< in/out: meta-data read */
+{
+ byte row[sizeof(ib_uint32_t)];
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_3",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the number of indexes. */
+ if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading number of indexes.");
+
+ return(DB_IO_ERROR);
+ }
+
+ cfg->m_n_indexes = mach_read_from_4(row);
+
+ if (cfg->m_n_indexes == 0) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ "Number of indexes in meta-data file is 0");
+
+ return(DB_CORRUPTION);
+
+ } else if (cfg->m_n_indexes > 1024) {
+		/* FIXME: What is the upper limit? */
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ "Number of indexes in meta-data file is too high: "
+ ULINTPF, cfg->m_n_indexes);
+ cfg->m_n_indexes = 0;
+
+ return(DB_CORRUPTION);
+ }
+
+ return(row_import_read_index_data(file, thd, cfg));
+}
+
+/*********************************************************************//**
+Read the meta data (table columns) config file. Deserialise the contents of
+dict_col_t structure, along with the column name. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_read_columns(
+/*====================*/
+	FILE*		file,		/*!< in: file to read from */
+ THD* thd, /*!< in/out: session */
+ row_import* cfg) /*!< in/out: meta-data read */
+{
+ dict_col_t* col;
+ byte row[sizeof(ib_uint32_t) * 8];
+
+ /* FIXME: What should the upper limit be? */
+ ut_a(cfg->m_n_cols > 0);
+ ut_a(cfg->m_n_cols < 1024);
+
+ cfg->m_cols = UT_NEW_ARRAY_NOKEY(dict_col_t, cfg->m_n_cols);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_8",
+ UT_DELETE_ARRAY(cfg->m_cols);
+ cfg->m_cols = NULL;
+ );
+
+ if (cfg->m_cols == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ cfg->m_col_names = UT_NEW_ARRAY_NOKEY(byte*, cfg->m_n_cols);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_9",
+ UT_DELETE_ARRAY(cfg->m_col_names);
+ cfg->m_col_names = NULL;
+ );
+
+ if (cfg->m_col_names == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+	memset(cfg->m_cols, 0x0, sizeof(*cfg->m_cols) * cfg->m_n_cols);
+	memset(cfg->m_col_names, 0x0,
+	       sizeof(*cfg->m_col_names) * cfg->m_n_cols);
+
+ col = cfg->m_cols;
+
+ for (ulint i = 0; i < cfg->m_n_cols; ++i, ++col) {
+ byte* ptr = row;
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_4",
+ (void) fseek(file, 0L, SEEK_END););
+
+ if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading table column meta-data.");
+
+ return(DB_IO_ERROR);
+ }
+
+ col->prtype = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ col->mtype = static_cast<byte>(mach_read_from_4(ptr));
+ ptr += sizeof(ib_uint32_t);
+
+ col->len = static_cast<uint16_t>(mach_read_from_4(ptr));
+ ptr += sizeof(ib_uint32_t);
+
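+		/* The character set width limits are packed into a
+		single word as mbmaxlen * 5 + mbminlen. */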
+ uint32_t mbminmaxlen = mach_read_from_4(ptr);
+ col->mbmaxlen = (mbminmaxlen / 5) & 7;
+ col->mbminlen = (mbminmaxlen % 5) & 7;
+ ptr += sizeof(ib_uint32_t);
+
+ col->ind = mach_read_from_4(ptr) & dict_index_t::MAX_N_FIELDS;
+ ptr += sizeof(ib_uint32_t);
+
+ col->ord_part = mach_read_from_4(ptr) & 1;
+ ptr += sizeof(ib_uint32_t);
+
+ col->max_prefix = mach_read_from_4(ptr) & ((1U << 12) - 1);
+ ptr += sizeof(ib_uint32_t);
+
+ /* Read in the column name as [len, byte array]. The len
+ includes the NUL byte. */
+
+ ulint len = mach_read_from_4(ptr);
+
+ /* FIXME: What is the maximum column name length? */
+ if (len == 0 || len > 128) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_IO_READ_ERROR,
+ "Column name length " ULINTPF ", is invalid",
+ len);
+
+ return(DB_CORRUPTION);
+ }
+
+ cfg->m_col_names[i] = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_10",
+ UT_DELETE_ARRAY(cfg->m_col_names[i]);
+ cfg->m_col_names[i] = NULL;
+ );
+
+ if (cfg->m_col_names[i] == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ dberr_t err;
+
+ err = row_import_cfg_read_string(
+ file, cfg->m_col_names[i], len);
+
+ if (err != DB_SUCCESS) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while parsing table column name.");
+
+ return(err);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Read the contents of the <tablespace>.cfg file.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_read_v1(
+/*===============*/
+ FILE* file, /*!< in: File to read from */
+ THD* thd, /*!< in: session */
+ row_import* cfg) /*!< out: meta data */
+{
+ byte value[sizeof(ib_uint32_t)];
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_5",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the hostname where the tablespace was exported. */
+ if (fread(value, 1, sizeof(value), file) != sizeof(value)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading meta-data export hostname length.");
+
+ return(DB_IO_ERROR);
+ }
+
+ ulint len = mach_read_from_4(value);
+
+ /* NUL byte is part of name length. */
+ cfg->m_hostname = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_1",
+ UT_DELETE_ARRAY(cfg->m_hostname);
+ cfg->m_hostname = NULL;
+ );
+
+ if (cfg->m_hostname == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ dberr_t err = row_import_cfg_read_string(file, cfg->m_hostname, len);
+
+ if (err != DB_SUCCESS) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while parsing export hostname.");
+
+ return(err);
+ }
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_6",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the table name of tablespace that was exported. */
+ if (fread(value, 1, sizeof(value), file) != sizeof(value)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading meta-data table name length.");
+
+ return(DB_IO_ERROR);
+ }
+
+ len = mach_read_from_4(value);
+
+ /* NUL byte is part of name length. */
+ cfg->m_table_name = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_2",
+ UT_DELETE_ARRAY(cfg->m_table_name);
+ cfg->m_table_name = NULL;
+ );
+
+ if (cfg->m_table_name == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ err = row_import_cfg_read_string(file, cfg->m_table_name, len);
+
+ if (err != DB_SUCCESS) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while parsing table name.");
+
+ return(err);
+ }
+
+ ib::info() << "Importing tablespace for table '" << cfg->m_table_name
+ << "' that was exported from host '" << cfg->m_hostname << "'";
+
+ byte row[sizeof(ib_uint32_t) * 3];
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_7",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the autoinc value. */
+ if (fread(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading autoinc value.");
+
+ return(DB_IO_ERROR);
+ }
+
+ cfg->m_autoinc = mach_read_from_8(row);
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_8",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the tablespace page size. */
+ if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading meta-data header.");
+
+ return(DB_IO_ERROR);
+ }
+
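+	/* The header row layout is [logical page size, table flags,
+	number of columns], each a 32-bit big-endian integer. */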
+ byte* ptr = row;
+
+ const ulint logical_page_size = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ if (logical_page_size != srv_page_size) {
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+ "Tablespace to be imported has a different"
+ " page size than this server. Server page size"
+ " is %lu, whereas tablespace page size"
+ " is " ULINTPF,
+ srv_page_size,
+ logical_page_size);
+
+ return(DB_ERROR);
+ }
+
+ cfg->m_flags = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg->m_zip_size = dict_tf_get_zip_size(cfg->m_flags);
+ cfg->m_n_cols = mach_read_from_4(ptr);
+
+ if (!dict_tf_is_valid(cfg->m_flags)) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Invalid table flags: " ULINTPF, cfg->m_flags);
+
+ return(DB_CORRUPTION);
+ }
+
+ err = row_import_read_columns(file, thd, cfg);
+
+ if (err == DB_SUCCESS) {
+ err = row_import_read_indexes(file, thd, cfg);
+ }
+
+ return(err);
+}
+
+/**
+Read the contents of the <tablespace>.cfg file.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_read_meta_data(
+/*======================*/
+ FILE* file, /*!< in: File to read from */
+ THD* thd, /*!< in: session */
+ row_import& cfg) /*!< out: contents of the .cfg file */
+{
+ byte row[sizeof(ib_uint32_t)];
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_9",
+ (void) fseek(file, 0L, SEEK_END););
+
+ if (fread(&row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading meta-data version.");
+
+ return(DB_IO_ERROR);
+ }
+
+ cfg.m_version = mach_read_from_4(row);
+
+ /* Check the version number. */
+ switch (cfg.m_version) {
+ case IB_EXPORT_CFG_VERSION_V1:
+
+ return(row_import_read_v1(file, thd, &cfg));
+ default:
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ "Unsupported meta-data version number (" ULINTPF "), "
+ "file ignored", cfg.m_version);
+ }
+
+ return(DB_ERROR);
+}
+
+/**
+Read the contents of the <tablename>.cfg file.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_read_cfg(
+/*================*/
+ dict_table_t* table, /*!< in: table */
+ THD* thd, /*!< in: session */
+ row_import& cfg) /*!< out: contents of the .cfg file */
+{
+ dberr_t err;
+ char name[OS_FILE_MAX_PATH];
+
+ cfg.m_table = table;
+
+ srv_get_meta_data_filename(table, name, sizeof(name));
+
+ FILE* file = fopen(name, "rb");
+
+ if (file == NULL) {
+ char msg[BUFSIZ];
+
+ snprintf(msg, sizeof(msg),
+ "Error opening '%s', will attempt to import"
+ " without schema verification", name);
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno), msg);
+
+ cfg.m_missing = true;
+
+ err = DB_FAIL;
+ } else {
+
+ cfg.m_missing = false;
+
+ err = row_import_read_meta_data(file, thd, cfg);
+ fclose(file);
+ }
+
+ return(err);
+}
+
+/** Update the root page numbers and tablespace ID of a table.
+@param[in,out] trx dictionary transaction
+@param[in,out] table persistent table
+@param[in] reset whether to reset the fields to FIL_NULL
+@return DB_SUCCESS or error code */
+dberr_t
+row_import_update_index_root(trx_t* trx, dict_table_t* table, bool reset)
+{
+ const dict_index_t* index;
+ que_t* graph = 0;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(reset || table->space->id == table->space_id);
+
+ static const char sql[] = {
+ "PROCEDURE UPDATE_INDEX_ROOT() IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_INDEXES\n"
+ "SET SPACE = :space,\n"
+ " PAGE_NO = :page,\n"
+ " TYPE = :type\n"
+ "WHERE TABLE_ID = :table_id AND ID = :index_id;\n"
+ "END;\n"};
+
+ table->def_trx_id = trx->id;
+
+ for (index = dict_table_get_first_index(table);
+ index != 0;
+ index = dict_table_get_next_index(index)) {
+
+ pars_info_t* info;
+ ib_uint32_t page;
+ ib_uint32_t space;
+ ib_uint32_t type;
+ index_id_t index_id;
+ table_id_t table_id;
+
+ info = (graph != 0) ? graph->info : pars_info_create();
+
+ mach_write_to_4(
+ reinterpret_cast<byte*>(&type),
+ index->type);
+
+ mach_write_to_4(
+ reinterpret_cast<byte*>(&page),
+ reset ? FIL_NULL : index->page);
+
+ mach_write_to_4(
+ reinterpret_cast<byte*>(&space),
+ reset ? FIL_NULL : index->table->space_id);
+
+ mach_write_to_8(
+ reinterpret_cast<byte*>(&index_id),
+ index->id);
+
+ mach_write_to_8(
+ reinterpret_cast<byte*>(&table_id),
+ table->id);
+
+ /* If we set the corrupt bit during the IMPORT phase then
+ we need to update the system tables. */
+ pars_info_bind_int4_literal(info, "type", &type);
+ pars_info_bind_int4_literal(info, "space", &space);
+ pars_info_bind_int4_literal(info, "page", &page);
+ pars_info_bind_ull_literal(info, "index_id", &index_id);
+ pars_info_bind_ull_literal(info, "table_id", &table_id);
+
+ if (graph == 0) {
+ graph = pars_sql(info, sql);
+ ut_a(graph);
+ graph->trx = trx;
+ }
+
+ que_thr_t* thr;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ ut_a(thr = que_fork_start_command(graph));
+
+ que_run_threads(thr);
+
+ DBUG_EXECUTE_IF("ib_import_internal_error",
+ trx->error_state = DB_ERROR;);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_INTERNAL_ERROR,
+ "While updating the <space, root page"
+ " number> of index %s - %s",
+ index->name(), ut_strerr(err));
+
+ break;
+ }
+ }
+
+ que_graph_free(graph);
+
+ return(err);
+}
+
+/** Callback arg for row_import_set_discarded. */
+struct discard_t {
+ ib_uint32_t flags2; /*!< Value read from column */
+ bool state; /*!< New state of the flag */
+ ulint n_recs; /*!< Number of recs processed */
+};
+
+/******************************************************************//**
+Fetch callback that sets or unsets the DISCARDED tablespace flag in
+SYS_TABLES. The flag is stored in the MIX_LEN column.
+@return FALSE if all OK */
+static
+ibool
+row_import_set_discarded(
+/*=====================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: bool set/unset flag */
+{
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ discard_t* discard = static_cast<discard_t*>(user_arg);
+ dfield_t* dfield = que_node_get_val(node->select_list);
+ dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(len == sizeof(ib_uint32_t));
+
+ ulint flags2 = mach_read_from_4(
+ static_cast<byte*>(dfield_get_data(dfield)));
+
+ if (discard->state) {
+ flags2 |= DICT_TF2_DISCARDED;
+ } else {
+ flags2 &= ~DICT_TF2_DISCARDED;
+ }
+
+ mach_write_to_4(reinterpret_cast<byte*>(&discard->flags2), flags2);
+
+ ++discard->n_recs;
+
+ /* There should be at most one matching record. */
+ ut_a(discard->n_recs == 1);
+
+ return(FALSE);
+}
+
+/** Update the DICT_TF2_DISCARDED flag in SYS_TABLES.MIX_LEN.
+@param[in,out] trx dictionary transaction
+@param[in] table_id table identifier
+@param[in] discarded whether to set or clear the flag
+@return DB_SUCCESS or error code */
+dberr_t row_import_update_discarded_flag(trx_t* trx, table_id_t table_id,
+ bool discarded)
+{
+ pars_info_t* info;
+ discard_t discard;
+
+ static const char sql[] =
+ "PROCEDURE UPDATE_DISCARDED_FLAG() IS\n"
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS\n"
+ " SELECT MIX_LEN"
+ " FROM SYS_TABLES"
+ " WHERE ID = :table_id FOR UPDATE;"
+ "\n"
+ "BEGIN\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "UPDATE SYS_TABLES"
+ " SET MIX_LEN = :flags2"
+ " WHERE ID = :table_id;\n"
+ "CLOSE c;\n"
+ "END;\n";
+
+ discard.n_recs = 0;
+ discard.state = discarded;
+ discard.flags2 = ULINT32_UNDEFINED;
+
+ info = pars_info_create();
+
+ pars_info_add_ull_literal(info, "table_id", table_id);
+ pars_info_bind_int4_literal(info, "flags2", &discard.flags2);
+
+ pars_info_bind_function(
+ info, "my_func", row_import_set_discarded, &discard);
+
+ dberr_t err = que_eval_sql(info, sql, false, trx);
+
+ ut_a(discard.n_recs == 1);
+ ut_a(discard.flags2 != ULINT32_UNDEFINED);
+
+ return(err);
+}
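+
+/* Note on the procedure above: the FETCH loop runs before the UPDATE
+statement, so row_import_set_discarded() computes the new MIX_LEN value
+into discard.flags2 before the UPDATE executes; the UPDATE then writes
+back :flags2, which is bound by pointer to &discard.flags2. In effect
+the procedure is a read-modify-write of a single flag bit (sketch):
+
+	SELECT MIX_LEN  -> callback: flags2 |= DICT_TF2_DISCARDED
+	                   (or flags2 &= ~DICT_TF2_DISCARDED)
+	UPDATE SYS_TABLES SET MIX_LEN = :flags2 WHERE ID = :table_id
+*/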
+
+struct fil_iterator_t {
+ pfs_os_file_t file; /*!< File handle */
+ const char* filepath; /*!< File path name */
+ os_offset_t start; /*!< From where to start */
+ os_offset_t end; /*!< Where to stop */
+ os_offset_t file_size; /*!< File size in bytes */
+ ulint n_io_buffers; /*!< Number of pages to use
+ for IO */
+ byte* io_buffer; /*!< Buffer to use for IO */
+ fil_space_crypt_t *crypt_data; /*!< Crypt data (if encrypted) */
+ byte* crypt_io_buffer; /*!< IO buffer when encrypted */
+};
+
+
+/** InnoDB writes the tablespace page by page when a page_compressed
+tablespace is involved. This helps to save disk space when punch
+hole support is enabled.
+@param iter Tablespace iterator
+@param full_crc32 whether the file is in the full_crc32 format
+@param offset offset of the file to be written
+@param writeptr buffer to be written
+@param n_bytes number of bytes to be written
+@param try_punch_only Try the range punch only because the
+ current range is full of empty pages
+@return DB_SUCCESS */
+static
+dberr_t fil_import_compress_fwrite(const fil_iterator_t &iter,
+ bool full_crc32,
+ os_offset_t offset,
+ const byte *writeptr,
+ ulint n_bytes,
+ bool try_punch_only= false)
+{
+ if (dberr_t err= os_file_punch_hole(iter.file, offset, n_bytes))
+ return err;
+
+ if (try_punch_only)
+ return DB_SUCCESS;
+
+ for (ulint j= 0; j < n_bytes; j+= srv_page_size)
+ {
+ /* Read the original data length from the block; it is
+ safer to read FIL_PAGE_COMPRESSED_SIZE because that field
+ is not encrypted. */
+ ulint n_write_bytes= srv_page_size;
+ if (j || offset)
+ {
+ n_write_bytes= mach_read_from_2(writeptr + j + FIL_PAGE_DATA);
+ const unsigned ptype= mach_read_from_2(writeptr + j + FIL_PAGE_TYPE);
+ /* Ignore the empty page */
+ if (ptype == 0 && n_write_bytes == 0)
+ continue;
+ if (full_crc32)
+ n_write_bytes= buf_page_full_crc32_size(writeptr + j,
+ nullptr, nullptr);
+ else
+ {
+ n_write_bytes+= ptype == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED
+ ? FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_METADATA_LEN
+ : FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN;
+ }
+ }
+
+ if (dberr_t err= os_file_write(IORequestWrite, iter.filepath, iter.file,
+ writeptr + j, offset + j, n_write_bytes))
+ return err;
+ }
+
+ return DB_SUCCESS;
+}
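+
+/* Worked example (numbers illustrative): with srv_page_size = 16384 and
+a page whose compressed stream plus metadata comes to 5000 bytes, the
+function above first punches a hole over the whole range and then writes
+back only ~5000 bytes at that page's offset, so roughly 11KiB of the
+16KiB slot stays unallocated on a filesystem that supports punch hole.
+The first page of the file (j == 0 && offset == 0) is always written in
+full, because page 0 of a tablespace is never page_compressed. */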
+
+/********************************************************************//**
+TODO: This can be made parallel trivially by chunking up the file and creating
+a callback per thread. The main benefit would be to use multiple CPUs for
+checksums and compressed tables. We have to process compressed tables block by
+block right now. Secondly, we need to decompress/compress and copy too much
+data. These operations are CPU intensive.
+
+Iterate over all the pages in the tablespace.
+@param iter - Tablespace iterator
+@param block - block to use for IO
+@param callback - Callback to inspect and update page contents
+@retval DB_SUCCESS or error code */
+static
+dberr_t
+fil_iterate(
+/*========*/
+ const fil_iterator_t& iter,
+ buf_block_t* block,
+ AbstractCallback& callback)
+{
+ os_offset_t offset;
+ const ulint size = callback.physical_size();
+ ulint n_bytes = iter.n_io_buffers * size;
+
+ const ulint buf_size = srv_page_size
+#ifdef HAVE_LZO
+ + LZO1X_1_15_MEM_COMPRESS
+#elif defined HAVE_SNAPPY
+ + snappy_max_compressed_length(srv_page_size)
+#endif
+ ;
+ byte* page_compress_buf = static_cast<byte*>(malloc(buf_size));
+ ut_ad(!srv_read_only_mode);
+
+ if (!page_compress_buf) {
+ return DB_OUT_OF_MEMORY;
+ }
+
+ ulint actual_space_id = 0;
+ const bool full_crc32 = fil_space_t::full_crc32(
+ callback.get_space_flags());
+
+ /* TODO: For ROW_FORMAT=COMPRESSED tables we do a lot of useless
+ copying for non-index pages. Unfortunately, it is
+ required by buf_zip_decompress() */
+ dberr_t err = DB_SUCCESS;
+ bool page_compressed = false;
+ bool punch_hole = true;
+
+ for (offset = iter.start; offset < iter.end; offset += n_bytes) {
+ if (callback.is_interrupted()) {
+ err = DB_INTERRUPTED;
+ goto func_exit;
+ }
+
+ byte* io_buffer = iter.io_buffer;
+ block->frame = io_buffer;
+
+ if (block->page.zip.data) {
+ /* Zip IO is done in the compressed page buffer. */
+ io_buffer = block->page.zip.data;
+ }
+
+ /* We have to read the exact number of bytes. Otherwise the
+ InnoDB IO functions croak on failed reads. */
+
+ n_bytes = ulint(ut_min(os_offset_t(n_bytes),
+ iter.end - offset));
+
+ ut_ad(n_bytes > 0);
+ ut_ad(!(n_bytes % size));
+
+ const bool encrypted = iter.crypt_data != NULL
+ && iter.crypt_data->should_encrypt();
+ /* Use additional crypt io buffer if tablespace is encrypted */
+ byte* const readptr = encrypted
+ ? iter.crypt_io_buffer : io_buffer;
+ byte* const writeptr = readptr;
+
+ err = os_file_read_no_error_handling(
+ IORequestReadPartial,
+ iter.file, readptr, offset, n_bytes, 0);
+ if (err != DB_SUCCESS) {
+ ib::error() << iter.filepath
+ << ": os_file_read() failed";
+ goto func_exit;
+ }
+
+ bool updated = false;
+ os_offset_t page_off = offset;
+ ulint n_pages_read = n_bytes / size;
+ /* This block is not attached to buf_pool */
+ block->page.id_.set_page_no(uint32_t(page_off / size));
+
+ for (ulint i = 0; i < n_pages_read;
+ ++block->page.id_,
+ ++i, page_off += size, block->frame += size) {
+ byte* src = readptr + i * size;
+ const ulint page_no = page_get_page_no(src);
+ if (!page_no && block->page.id().page_no()) {
+ if (!buf_is_zeroes(span<const byte>(src,
+ size))) {
+ goto page_corrupted;
+ }
+ /* Proceed to the next page,
+ because this one is all zero. */
+ continue;
+ }
+
+ if (page_no != block->page.id().page_no()) {
+page_corrupted:
+ ib::warn() << callback.filename()
+ << ": Page " << (offset / size)
+ << " at offset " << offset
+ << " looks corrupted.";
+ err = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ if (block->page.id().page_no() == 0) {
+ actual_space_id = mach_read_from_4(
+ src + FIL_PAGE_SPACE_ID);
+ }
+
+ const uint16_t type = fil_page_get_type(src);
+ page_compressed =
+ (full_crc32
+ && fil_space_t::is_compressed(
+ callback.get_space_flags())
+ && buf_page_is_compressed(
+ src, callback.get_space_flags()))
+ || type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED
+ || type == FIL_PAGE_PAGE_COMPRESSED;
+
+ if (page_compressed && block->page.zip.data) {
+ goto page_corrupted;
+ }
+
+ bool decrypted = false;
+ byte* dst = io_buffer + i * size;
+ bool frame_changed = false;
+ uint key_version = buf_page_get_key_version(
+ src, callback.get_space_flags());
+
+ if (!encrypted) {
+ } else if (!key_version) {
+not_encrypted:
+ if (block->page.id().page_no() == 0
+ && block->page.zip.data) {
+ block->page.zip.data = src;
+ frame_changed = true;
+ } else if (!page_compressed
+ && !block->page.zip.data) {
+ block->frame = src;
+ frame_changed = true;
+ } else {
+ ut_ad(dst != src);
+ memcpy(dst, src, size);
+ }
+ } else {
+ if (!buf_page_verify_crypt_checksum(
+ src, callback.get_space_flags())) {
+ goto page_corrupted;
+ }
+
+ decrypted = fil_space_decrypt(
+ actual_space_id,
+ iter.crypt_data, dst,
+ callback.physical_size(),
+ callback.get_space_flags(),
+ src, &err);
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (!decrypted) {
+ goto not_encrypted;
+ }
+
+ updated = true;
+ }
+
+ /* For full_crc32 format, skip checksum check
+ after decryption. */
+ bool skip_checksum_check = full_crc32 && encrypted;
+
+ /* If the original page is page_compressed, we need
+ to decompress it before adjusting further. */
+ if (page_compressed) {
+ ulint compress_length = fil_page_decompress(
+ page_compress_buf, dst,
+ callback.get_space_flags());
+ ut_ad(compress_length != srv_page_size);
+ if (compress_length == 0) {
+ goto page_corrupted;
+ }
+ updated = true;
+ } else if (!skip_checksum_check
+ && buf_page_is_corrupted(
+ false,
+ encrypted && !frame_changed
+ ? dst : src,
+ callback.get_space_flags())) {
+ goto page_corrupted;
+ }
+
+ if ((err = callback(block)) != DB_SUCCESS) {
+ goto func_exit;
+ } else if (!updated) {
+ updated = block->page.state()
+ == BUF_BLOCK_FILE_PAGE;
+ }
+
+ /* If tablespace is encrypted we use additional
+ temporary scratch area where pages are read
+ for decrypting readptr == crypt_io_buffer != io_buffer.
+
+ Destination for decryption is a buffer pool block
+ block->frame == dst == io_buffer that is updated.
+ Pages that did not require decryption, even when the
+ tablespace is marked as encrypted, are not copied;
+ instead, block->frame is set to src == readptr.
+
+ For encryption we again use temporary scratch area
+ writeptr != io_buffer == dst
+ that is then written to the tablespace
+
+ (1) For normal tables io_buffer == dst == writeptr
+ (2) For only page compressed tables
+ io_buffer == dst == writeptr
+ (3) For encrypted (and page compressed)
+ readptr != io_buffer == dst != writeptr
+ */
+
+ ut_ad(!encrypted && !page_compressed ?
+ src == dst && dst == writeptr + (i * size):1);
+ ut_ad(page_compressed && !encrypted ?
+ src == dst && dst == writeptr + (i * size):1);
+ ut_ad(encrypted ?
+ src != dst && dst != writeptr + (i * size):1);
+
+ /* When tablespace is encrypted or compressed its
+ first page (i.e. page 0) is not encrypted or
+ compressed and there is no need to copy frame. */
+ if (encrypted && block->page.id().page_no() != 0) {
+ byte *local_frame = callback.get_frame(block);
+ ut_ad((writeptr + (i * size)) != local_frame);
+ memcpy((writeptr + (i * size)), local_frame, size);
+ }
+
+ if (frame_changed) {
+ if (block->page.zip.data) {
+ block->page.zip.data = dst;
+ } else {
+ block->frame = dst;
+ }
+ }
+
+ src = io_buffer + (i * size);
+
+ if (page_compressed) {
+ updated = true;
+ if (ulint len = fil_page_compress(
+ src,
+ page_compress_buf,
+ callback.get_space_flags(),
+ 512,/* FIXME: proper block size */
+ encrypted)) {
+ /* FIXME: remove memcpy() */
+ memcpy(src, page_compress_buf, len);
+ memset(src + len, 0,
+ srv_page_size - len);
+ }
+ }
+
+ /* Encrypt the page if encryption was used. */
+ if (encrypted && decrypted) {
+ byte *dest = writeptr + i * size;
+
+ byte* tmp = fil_encrypt_buf(
+ iter.crypt_data,
+ block->page.id().space(),
+ block->page.id().page_no(),
+ src, block->zip_size(), dest,
+ full_crc32);
+
+ if (tmp == src) {
+ /* TODO: remove unnecessary memcpy's */
+ ut_ad(dest != src);
+ memcpy(dest, src, size);
+ }
+
+ updated = true;
+ }
+
+ /* Write checksum for the compressed full crc32 page.*/
+ if (full_crc32 && page_compressed) {
+ ut_ad(updated);
+ byte* dest = writeptr + i * size;
+ ut_d(bool comp = false);
+ ut_d(bool corrupt = false);
+ ulint size = buf_page_full_crc32_size(
+ dest,
+#ifdef UNIV_DEBUG
+ &comp, &corrupt
+#else
+ NULL, NULL
+#endif
+ );
+ ut_ad(!comp == (size == srv_page_size));
+ ut_ad(!corrupt);
+ mach_write_to_4(dest + (size - 4),
+ ut_crc32(dest, size - 4));
+ }
+ }
+
+ if (page_compressed && punch_hole) {
+ err = fil_import_compress_fwrite(
+ iter, full_crc32, offset, writeptr, n_bytes,
+ !updated);
+
+ if (err != DB_SUCCESS) {
+ punch_hole = false;
+ if (updated) {
+ goto normal_write;
+ }
+ }
+ } else if (updated) {
+normal_write:
+ /* A page was updated in the set, write it back. */
+ err = os_file_write(IORequestWrite,
+ iter.filepath, iter.file,
+ writeptr, offset, n_bytes);
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+ }
+
+func_exit:
+ free(page_compress_buf);
+ return err;
+}
+
+/********************************************************************//**
+Iterate over all the pages in the tablespace.
+@param table - the table definition in the server
+@param n_io_buffers - number of blocks to read and write together
+@param callback - functor that will do the page updates
+@return DB_SUCCESS or error code */
+static
+dberr_t
+fil_tablespace_iterate(
+/*===================*/
+ dict_table_t* table,
+ ulint n_io_buffers,
+ AbstractCallback& callback)
+{
+ dberr_t err;
+ pfs_os_file_t file;
+ char* filepath;
+
+ ut_a(n_io_buffers > 0);
+ ut_ad(!srv_read_only_mode);
+
+ DBUG_EXECUTE_IF("ib_import_trigger_corruption_1",
+ return(DB_CORRUPTION););
+
+ /* Make sure the data_dir_path is set. */
+ dict_get_and_save_data_dir_path(table, false);
+
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ ut_a(table->data_dir_path);
+
+ filepath = fil_make_filepath(
+ table->data_dir_path, table->name.m_name, IBD, true);
+ } else {
+ filepath = fil_make_filepath(
+ NULL, table->name.m_name, IBD, false);
+ }
+
+ if (!filepath) {
+ return(DB_OUT_OF_MEMORY);
+ } else {
+ bool success;
+
+ file = os_file_create_simple_no_error_handling(
+ innodb_data_file_key, filepath,
+ OS_FILE_OPEN, OS_FILE_READ_WRITE, false, &success);
+
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+ ib::error() << "Trying to import a tablespace,"
+ " but could not open the tablespace file "
+ << filepath;
+ ut_free(filepath);
+ return DB_TABLESPACE_NOT_FOUND;
+ } else {
+ err = DB_SUCCESS;
+ }
+ }
+
+ callback.set_file(filepath, file);
+
+ os_offset_t file_size = os_file_get_size(file);
+ ut_a(file_size != (os_offset_t) -1);
+
+ /* Allocate a page to read in the tablespace header, so that we
+ can determine the page size and zip_size (if it is compressed).
+ We allocate an extra page in case it is a compressed table. */
+
+ byte* page = static_cast<byte*>(aligned_malloc(2 * srv_page_size,
+ srv_page_size));
+
+ buf_block_t* block = reinterpret_cast<buf_block_t*>
+ (ut_zalloc_nokey(sizeof *block));
+ block->frame = page;
+ block->page.init(BUF_BLOCK_FILE_PAGE, page_id_t(~0ULL), 1);
+
+ /* Read the first page and determine the page and zip size. */
+
+ err = os_file_read_no_error_handling(IORequestReadPartial,
+ file, page, 0, srv_page_size, 0);
+
+ if (err == DB_SUCCESS) {
+ err = callback.init(file_size, block);
+ }
+
+ if (err == DB_SUCCESS) {
+ block->page.id_ = page_id_t(callback.get_space_id(), 0);
+ if (ulint zip_size = callback.get_zip_size()) {
+ page_zip_set_size(&block->page.zip, zip_size);
+ /* ROW_FORMAT=COMPRESSED is not optimised for block IO
+ for now. We do the IMPORT page by page. */
+ n_io_buffers = 1;
+ }
+
+ fil_iterator_t iter;
+
+ /* read (optional) crypt data */
+ iter.crypt_data = fil_space_read_crypt_data(
+ callback.get_zip_size(), page);
+
+ /* If tablespace is encrypted, it needs extra buffers */
+ if (iter.crypt_data && n_io_buffers > 1) {
+ /* decrease io buffers so that memory
+ consumption will not double */
+ n_io_buffers /= 2;
+ }
+
+ iter.file = file;
+ iter.start = 0;
+ iter.end = file_size;
+ iter.filepath = filepath;
+ iter.file_size = file_size;
+ iter.n_io_buffers = n_io_buffers;
+
+ /* Add an extra page for compressed page scratch area. */
+ iter.io_buffer = static_cast<byte*>(
+ aligned_malloc((1 + iter.n_io_buffers)
+ << srv_page_size_shift, srv_page_size));
+
+ iter.crypt_io_buffer = iter.crypt_data
+ ? static_cast<byte*>(
+ aligned_malloc((1 + iter.n_io_buffers)
+ << srv_page_size_shift,
+ srv_page_size))
+ : NULL;
+
+ if (block->page.zip.ssize) {
+ ut_ad(iter.n_io_buffers == 1);
+ block->frame = iter.io_buffer;
+ block->page.zip.data = block->frame + srv_page_size;
+ }
+
+ err = fil_iterate(iter, block, callback);
+
+ if (iter.crypt_data) {
+ fil_space_destroy_crypt_data(&iter.crypt_data);
+ }
+
+ aligned_free(iter.crypt_io_buffer);
+ aligned_free(iter.io_buffer);
+ }
+
+ if (err == DB_SUCCESS) {
+ ib::info() << "Sync to disk";
+
+ if (!os_file_flush(file)) {
+ ib::info() << "os_file_flush() failed!";
+ err = DB_IO_ERROR;
+ } else {
+ ib::info() << "Sync to disk - done!";
+ }
+ }
+
+ os_file_close(file);
+
+ aligned_free(page);
+ ut_free(filepath);
+ ut_free(block);
+
+ return(err);
+}
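+
+/* Overview of the IMPORT flow implemented below (a summary of this
+file, not an external contract):
+
+	(1) read and validate the .cfg meta-data, or fall back to
+	    FetchIndexRootPages when the .cfg file is missing;
+	(2) Phase I: iterate over every page with PageConverter,
+	    resetting the space id, LSNs and checksums;
+	(3) open the tablespace, check the change buffer bitmap, adjust
+	    the B-tree root pages and purge leftover delete-marked
+	    records;
+	(4) Phase III/IV: flush all pages dirtied by the purge to disk;
+	(5) update SYS_INDEXES root pages and clear the
+	    DICT_TF2_DISCARDED flag in SYS_TABLES.
+*/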
+
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+dberr_t
+row_import_for_mysql(
+/*=================*/
+ dict_table_t* table, /*!< in/out: table */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */
+{
+ dberr_t err;
+ trx_t* trx;
+ ib_uint64_t autoinc = 0;
+ char* filepath = NULL;
+
+ /* The caller assured that this is not read_only_mode and that no
+ temporary tablespace is being imported. */
+ ut_ad(!srv_read_only_mode);
+ ut_ad(!table->is_temporary());
+
+ ut_ad(table->space_id);
+ ut_ad(table->space_id < SRV_SPACE_ID_UPPER_BOUND);
+ ut_ad(prebuilt->trx);
+ ut_ad(!table->is_readable());
+
+ ibuf_delete_for_discarded_space(table->space_id);
+
+ trx_start_if_not_started(prebuilt->trx, true);
+
+ trx = trx_create();
+
+ /* So that the table is not DROPped during recovery. */
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+ trx_start_if_not_started(trx, true);
+
+ /* So that we can send error messages to the user. */
+ trx->mysql_thd = prebuilt->trx->mysql_thd;
+
+ /* Ensure that the table will be dropped by trx_rollback_active()
+ in case of a crash. */
+
+ trx->table_id = table->id;
+
+ /* Assign an undo segment for the transaction, so that the
+ transaction will be recovered after a crash. */
+
+ /* TODO: Do not write any undo log for the IMPORT cleanup. */
+ {
+ mtr_t mtr;
+ mtr.start();
+ trx_undo_assign(trx, &err, &mtr);
+ mtr.commit();
+ }
+
+ DBUG_EXECUTE_IF("ib_import_undo_assign_failure",
+ err = DB_TOO_MANY_CONCURRENT_TRXS;);
+
+ if (err != DB_SUCCESS) {
+
+ return(row_import_cleanup(prebuilt, trx, err));
+
+ } else if (trx->rsegs.m_redo.undo == 0) {
+
+ err = DB_TOO_MANY_CONCURRENT_TRXS;
+ return(row_import_cleanup(prebuilt, trx, err));
+ }
+
+ prebuilt->trx->op_info = "read meta-data file";
+
+ /* Prevent DDL operations while we are checking. */
+
+ rw_lock_s_lock(&dict_sys.latch);
+
+ row_import cfg;
+
+ err = row_import_read_cfg(table, trx->mysql_thd, cfg);
+
+ /* Check if the table column definitions match the contents
+ of the config file. */
+
+ if (err == DB_SUCCESS) {
+
+ /* We have a schema file, try and match it with our
+ data dictionary. */
+
+ err = cfg.match_schema(trx->mysql_thd);
+
+ /* Update index->page and SYS_INDEXES.PAGE_NO to match the
+ B-tree root page numbers in the tablespace. Use the index
+ name from the .cfg file to find a match. */
+
+ if (err == DB_SUCCESS) {
+ cfg.set_root_by_name();
+ autoinc = cfg.m_autoinc;
+ }
+
+ rw_lock_s_unlock(&dict_sys.latch);
+
+ DBUG_EXECUTE_IF("ib_import_set_index_root_failure",
+ err = DB_TOO_MANY_CONCURRENT_TRXS;);
+
+ } else if (cfg.m_missing) {
+
+ rw_lock_s_unlock(&dict_sys.latch);
+
+ /* We don't have a schema file, we will have to discover
+ the index root pages from the .ibd file and skip the schema
+ matching step. */
+
+ ut_a(err == DB_FAIL);
+
+ cfg.m_zip_size = 0;
+
+ FetchIndexRootPages fetchIndexRootPages(table, trx);
+
+ err = fil_tablespace_iterate(
+ table, IO_BUFFER_SIZE(srv_page_size),
+ fetchIndexRootPages);
+
+ if (err == DB_SUCCESS) {
+
+ err = fetchIndexRootPages.build_row_import(&cfg);
+
+ /* Update index->page and SYS_INDEXES.PAGE_NO
+ to match the B-tree root page numbers in the
+ tablespace. */
+
+ if (err == DB_SUCCESS) {
+ err = cfg.set_root_by_heuristic();
+ }
+ }
+ } else {
+ rw_lock_s_unlock(&dict_sys.latch);
+ }
+
+ if (err != DB_SUCCESS) {
+ return(row_import_error(prebuilt, trx, err));
+ }
+
+ prebuilt->trx->op_info = "importing tablespace";
+
+ ib::info() << "Phase I - Update all pages";
+
+ /* Iterate over all the pages and do the sanity checking and
+ the conversion required to import the tablespace. */
+
+ PageConverter converter(&cfg, table->space_id, trx);
+
+ /* Set the IO buffer size in pages. */
+
+ err = fil_tablespace_iterate(
+ table, IO_BUFFER_SIZE(cfg.m_zip_size ? cfg.m_zip_size
+ : srv_page_size), converter);
+
+ DBUG_EXECUTE_IF("ib_import_reset_space_and_lsn_failure",
+ err = DB_TOO_MANY_CONCURRENT_TRXS;);
+#ifdef BTR_CUR_HASH_ADAPT
+ /* On DISCARD TABLESPACE, we did not drop any adaptive hash
+ index entries. If we replaced the discarded tablespace with a
+ smaller one here, there could still be some adaptive hash
+ index entries that point to cached garbage pages in the buffer
+ pool, because PageConverter::operator() only evicted those
+ pages that were replaced by the imported pages. We must
+ detach any remaining adaptive hash index entries, because the
+ adaptive hash index must be a subset of the table contents;
+ false positives are not tolerated. */
+ for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); index;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+ index = index->clone_if_needed();
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (err != DB_SUCCESS) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name),
+ table->name.m_name);
+
+ if (err != DB_DECRYPTION_FAILED) {
+
+ ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_INTERNAL_ERROR,
+ "Cannot reset LSNs in table %s : %s",
+ table_name, ut_strerr(err));
+ }
+
+ return(row_import_cleanup(prebuilt, trx, err));
+ }
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* If the table is stored in a remote tablespace, we need to
+ determine that filepath from the link file and system tables.
+ Find the space ID in SYS_TABLES since this is an ALTER TABLE. */
+ dict_get_and_save_data_dir_path(table, true);
+
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ ut_a(table->data_dir_path);
+
+ filepath = fil_make_filepath(
+ table->data_dir_path, table->name.m_name, IBD, true);
+ } else {
+ filepath = fil_make_filepath(
+ NULL, table->name.m_name, IBD, false);
+ }
+
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_15",
+ ut_free(filepath);
+ filepath = NULL;
+ );
+
+ if (filepath == NULL) {
+ row_mysql_unlock_data_dictionary(trx);
+ return(row_import_cleanup(prebuilt, trx, DB_OUT_OF_MEMORY));
+ }
+
+ /* Open the tablespace so that we can access via the buffer pool.
+ We set the 2nd param (fix_dict = true) here because we already
+ have an x-lock on dict_sys.latch and dict_sys.mutex.
+ The tablespace is initially opened as a temporary one, because
+ we will not be writing any redo log for it before we have invoked
+ fil_space_t::set_imported() to declare it a persistent tablespace. */
+
+ ulint fsp_flags = dict_tf_to_fsp_flags(table->flags);
+
+ table->space = fil_ibd_open(
+ true, true, FIL_TYPE_IMPORT, table->space_id,
+ fsp_flags, table->name, filepath, &err);
+
+ ut_ad((table->space == NULL) == (err != DB_SUCCESS));
+ DBUG_EXECUTE_IF("ib_import_open_tablespace_failure",
+ err = DB_TABLESPACE_NOT_FOUND; table->space = NULL;);
+
+ if (!table->space) {
+ row_mysql_unlock_data_dictionary(trx);
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_GET_ERRMSG,
+ err, ut_strerr(err), filepath);
+
+ ut_free(filepath);
+
+ return(row_import_cleanup(prebuilt, trx, err));
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ ut_free(filepath);
+
+ err = ibuf_check_bitmap_on_import(trx, table->space);
+
+ DBUG_EXECUTE_IF("ib_import_check_bitmap_failure", err = DB_CORRUPTION;);
+
+ if (err != DB_SUCCESS) {
+ return(row_import_cleanup(prebuilt, trx, err));
+ }
+
+ /* The first index must always be the clustered index. */
+
+ dict_index_t* index = dict_table_get_first_index(table);
+
+ if (!dict_index_is_clust(index)) {
+ return(row_import_error(prebuilt, trx, DB_CORRUPTION));
+ }
+
+ /* Update the Btree segment headers for index node and
+ leaf nodes in the root page. Set the new space id. */
+
+ err = btr_root_adjust_on_import(index);
+
+ DBUG_EXECUTE_IF("ib_import_cluster_root_adjust_failure",
+ err = DB_CORRUPTION;);
+
+ if (err != DB_SUCCESS) {
+ return(row_import_error(prebuilt, trx, err));
+ } else if (cfg.requires_purge(index->name)) {
+
+ /* Purge any delete-marked records that couldn't be
+ purged during the page conversion phase from the
+ cluster index. */
+
+ IndexPurge purge(trx, index);
+
+ trx->op_info = "cluster: purging delete marked records";
+
+ err = purge.garbage_collect();
+
+ trx->op_info = "";
+ }
+
+ DBUG_EXECUTE_IF("ib_import_cluster_failure", err = DB_CORRUPTION;);
+
+ if (err != DB_SUCCESS) {
+ return(row_import_error(prebuilt, trx, err));
+ }
+
+ /* For secondary indexes, purge any records that couldn't be purged
+ during the page conversion phase. */
+
+ err = row_import_adjust_root_pages_of_secondary_indexes(
+ trx, table, cfg);
+
+ DBUG_EXECUTE_IF("ib_import_sec_root_adjust_failure",
+ err = DB_CORRUPTION;);
+
+ if (err != DB_SUCCESS) {
+ return(row_import_error(prebuilt, trx, err));
+ }
+
+ /* Ensure that the next available DB_ROW_ID is not smaller than
+ any DB_ROW_ID stored in the table. */
+
+ if (prebuilt->clust_index_was_generated) {
+ row_import_set_sys_max_row_id(prebuilt, table);
+ }
+
+ ib::info() << "Phase III - Flush changes to disk";
+
+ /* Ensure that all pages dirtied during the IMPORT make it to disk.
+ The only dirty pages generated should be from the pessimistic purge
+ of delete marked records that couldn't be purged in Phase I. */
+ while (buf_flush_list_space(prebuilt->table->space));
+
+ for (ulint count = 0; prebuilt->table->space->referenced(); count++) {
+ /* Issue a warning every 10.24 seconds, starting after
+ 2.56 seconds */
+ if ((count & 511) == 128) {
+ ib::warn() << "Waiting for flush to complete on "
+ << prebuilt->table->name;
+ }
+ os_thread_sleep(20000);
+ }
+
+ ib::info() << "Phase IV - Flush complete";
+ prebuilt->table->space->set_imported();
+
+ /* The dictionary latches will be released in row_import_cleanup()
+ after the transaction commit, for both success and error. */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Update the root pages of the table's indexes. */
+ err = row_import_update_index_root(trx, table, false);
+
+ if (err != DB_SUCCESS) {
+ return(row_import_error(prebuilt, trx, err));
+ }
+
+ err = row_import_update_discarded_flag(trx, table->id, false);
+
+ if (err != DB_SUCCESS) {
+ return(row_import_error(prebuilt, trx, err));
+ }
+
+ table->file_unreadable = false;
+ table->flags2 &= ~DICT_TF2_DISCARDED & ((1U << DICT_TF2_BITS) - 1);
+
+ /* Set autoinc value read from .cfg file, if one was specified.
+ Otherwise, keep the PAGE_ROOT_AUTO_INC as is. */
+ if (autoinc) {
+ ib::info() << table->name << " autoinc value set to "
+ << autoinc;
+
+ table->autoinc = autoinc--;
+ btr_write_autoinc(dict_table_get_first_index(table), autoinc);
+ }
+
+ return(row_import_cleanup(prebuilt, trx, err));
+}
diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc
new file mode 100644
index 00000000..6c2edd19
--- /dev/null
+++ b/storage/innobase/row/row0ins.cc
@@ -0,0 +1,3838 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ins.cc
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0ins.h"
+#include "dict0dict.h"
+#include "trx0rec.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "mach0data.h"
+#include "ibuf0ibuf.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0log.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "eval0eval.h"
+#include "data0data.h"
+#include "buf0lru.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#ifdef WITH_WSREP
+#include "wsrep_mysqld.h"
+#endif /* WITH_WSREP */
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before performing that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module, make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/** Create a row template for each index of a table. */
+static void ins_node_create_entry_list(ins_node_t *node)
+{
+ node->entry_list.reserve(UT_LIST_GET_LEN(node->table->indexes));
+
+ for (dict_index_t *index= dict_table_get_first_index(node->table); index;
+ index= dict_table_get_next_index(index))
+ {
+ /* Corrupted or incomplete secondary indexes will be filtered out in
+ row_ins(). */
+ dtuple_t *entry= index->online_status >= ONLINE_INDEX_ABORTED
+ ? dtuple_create(node->entry_sys_heap, 0)
+ : row_build_index_entry_low(node->row, NULL, index, node->entry_sys_heap,
+ ROW_BUILD_FOR_INSERT);
+ node->entry_list.push_back(entry);
+ }
+}
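+
+/* Note: one entry is pushed per index, including an empty placeholder
+tuple for indexes whose online creation was aborted, so that entry_list
+stays positionally aligned with the table's index list. */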
+
+/*****************************************************************//**
+Adds system field buffers to a row. */
+static
+void
+row_ins_alloc_sys_fields(
+/*=====================*/
+ ins_node_t* node) /*!< in: insert node */
+{
+ dtuple_t* row;
+ dict_table_t* table;
+ const dict_col_t* col;
+ dfield_t* dfield;
+
+ row = node->row;
+ table = node->table;
+
+ ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table));
+
+ /* allocate buffer to hold the needed system created hidden columns. */
+ compile_time_assert(DATA_ROW_ID_LEN
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN
+ == sizeof node->sys_buf);
+ memset(node->sys_buf, 0, sizeof node->sys_buf);
+ /* Assign DB_ROLL_PTR to 1 << ROLL_PTR_INSERT_FLAG_POS */
+ node->sys_buf[DATA_ROW_ID_LEN + DATA_TRX_ID_LEN] = 0x80;
+ ut_ad(!memcmp(node->sys_buf + DATA_ROW_ID_LEN, reset_trx_id,
+ sizeof reset_trx_id));
+
+ /* 1. Populate row-id */
+ col = dict_table_get_sys_col(table, DATA_ROW_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_set_data(dfield, node->sys_buf, DATA_ROW_ID_LEN);
+
+ /* 2. Populate trx id */
+ col = dict_table_get_sys_col(table, DATA_TRX_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_set_data(dfield, &node->sys_buf[DATA_ROW_ID_LEN],
+ DATA_TRX_ID_LEN);
+
+ col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_set_data(dfield, &node->sys_buf[DATA_ROW_ID_LEN
+ + DATA_TRX_ID_LEN],
+ DATA_ROLL_PTR_LEN);
+}
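+
+/* Layout sketch of node->sys_buf as initialized above (byte offsets;
+DATA_ROW_ID_LEN = 6, DATA_TRX_ID_LEN = 6, DATA_ROLL_PTR_LEN = 7):
+
+	[0..5]   DB_ROW_ID   (zero here; assigned when the row is inserted)
+	[6..11]  DB_TRX_ID   (reset_trx_id, i.e. all zero)
+	[12..18] DB_ROLL_PTR (0x80 00 00 00 00 00 00: only the insert
+	                      flag bit, 1 << ROLL_PTR_INSERT_FLAG_POS,
+	                      is set)
+*/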
+
+/*********************************************************************//**
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; this
+function is quite slow. */
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /*!< in: insert node */
+ dtuple_t* row) /*!< in: new row (or first row) for the node */
+{
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->index = NULL;
+ node->entry_list.clear();
+ node->entry = node->entry_list.end();
+
+ node->row = row;
+
+ mem_heap_empty(node->entry_sys_heap);
+
+ /* Create templates for index entries */
+
+ ins_node_create_entry_list(node);
+
+ /* Allocate from entry_sys_heap buffers for sys fields */
+
+ row_ins_alloc_sys_fields(node);
+
+ /* As we allocated a new trx id buf, the trx id should be written
+ there again: */
+
+ node->trx_id = 0;
+}
+
+/*******************************************************************//**
+Does an insert operation by updating a delete-marked existing record
+in the index. This situation can occur if the delete-marked record is
+kept in the index for consistent reads.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_sec_index_entry_by_modify(
+/*==============================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ btr_cur_t* cursor, /*!< in: B-tree cursor */
+ rec_offs** offsets,/*!< in/out: offsets on cursor->page_cur.rec */
+ mem_heap_t* offsets_heap,
+ /*!< in/out: memory heap that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; must be committed before
+ latching any further pages */
+{
+ big_rec_t* dummy_big_rec;
+ upd_t* update;
+ rec_t* rec;
+ dberr_t err;
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(!dict_index_is_clust(cursor->index));
+ ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
+ ut_ad(!entry->info_bits);
+
+ /* We know that in the alphabetical ordering, entry and rec are
+ identical. But in their binary form there may be differences if
+ there are char fields in them. Therefore we have to calculate the
+ difference. */
+
+ update = row_upd_build_sec_rec_difference_binary(
+ rec, cursor->index, *offsets, entry, heap);
+
+ if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
+ /* We should never insert in place of a record that
+ has not been delete-marked. The only exception is when
+ online CREATE INDEX copied the changes that we already
+ made to the clustered index, and completed the
+ secondary index creation before we got here. In this
+ case, the change would already be there. The CREATE
+ INDEX should be waiting for a MySQL meta-data lock
+ upgrade at least until this INSERT or UPDATE
+ returns. After that point, set_committed(true)
+ would be invoked in commit_inplace_alter_table(). */
+ ut_a(update->n_fields == 0);
+ ut_a(!cursor->index->is_committed());
+ ut_ad(!dict_index_is_online_ddl(cursor->index));
+ return(DB_SUCCESS);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ /* Try an optimistic updating of the record, keeping changes
+ within the page */
+
+ /* TODO: pass only *offsets */
+ err = btr_cur_optimistic_update(
+ flags | BTR_KEEP_SYS_FLAG, cursor,
+ offsets, &offsets_heap, update, 0, thr,
+ thr_get_trx(thr)->id, mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ default:
+ break;
+ }
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ if (buf_pool.running_out()) {
+
+ return(DB_LOCK_TABLE_FULL);
+ }
+
+ err = btr_cur_pessimistic_update(
+ flags | BTR_KEEP_SYS_FLAG, cursor,
+ offsets, &offsets_heap,
+ heap, &dummy_big_rec, update, 0,
+ thr, thr_get_trx(thr)->id, mtr);
+ ut_ad(!dummy_big_rec);
+ }
+
+ return(err);
+}
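+
+/* Caller pattern sketch: DB_FAIL from the BTR_MODIFY_LEAF path above is
+the conventional "retry pessimistically" signal. A caller typically does
+something like the following (simplified; cursor repositioning and mtr
+handling elided):
+
+	err = row_ins_sec_index_entry_by_modify(
+		flags, BTR_MODIFY_LEAF, &cursor, &offsets,
+		offsets_heap, heap, entry, thr, &mtr);
+	if (err == DB_FAIL) {
+		// restart the mtr and reposition the cursor with a
+		// tree latch, then retry:
+		err = row_ins_sec_index_entry_by_modify(
+			flags, BTR_MODIFY_TREE, &cursor, &offsets,
+			offsets_heap, heap, entry, thr, &mtr);
+	}
+*/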
+
+/*******************************************************************//**
+Does an insert operation by delete-unmarking and updating a delete-marked
+existing record in the index. This situation can occur if the delete-marked
+record is kept in the index for consistent reads.
+@return DB_SUCCESS, DB_FAIL, or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_clust_index_entry_by_modify(
+/*================================*/
+ btr_pcur_t* pcur, /*!< in/out: a persistent cursor pointing
+ to the clust_rec that is being modified. */
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: pointer to memory heap that can
+ be emptied, or NULL */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; must be committed before
+ latching any further pages */
+{
+ const rec_t* rec;
+ upd_t* update;
+ dberr_t err = DB_SUCCESS;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ TABLE* mysql_table = NULL;
+ ut_ad(dict_index_is_clust(cursor->index));
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(rec_get_deleted_flag(rec,
+ dict_table_is_comp(cursor->index->table)));
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(rec_get_trx_id(rec, cursor->index));
+
+ /* Build an update vector containing all the fields to be modified;
+ NOTE that this vector may NOT contain system columns trx_id or
+ roll_ptr */
+ if (thr->prebuilt != NULL) {
+ mysql_table = thr->prebuilt->m_mysql_table;
+ ut_ad(thr->prebuilt->trx == thr_get_trx(thr));
+ }
+
+ update = row_upd_build_difference_binary(
+ cursor->index, entry, rec, NULL, true,
+ thr_get_trx(thr), heap, mysql_table, &err);
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ if (mode != BTR_MODIFY_TREE) {
+ ut_ad((mode & ulint(~BTR_ALREADY_S_LATCHED))
+ == BTR_MODIFY_LEAF);
+
+ /* Try optimistic updating of the record, keeping changes
+ within the page */
+
+ err = btr_cur_optimistic_update(
+ flags, cursor, offsets, offsets_heap, update, 0, thr,
+ thr_get_trx(thr)->id, mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ default:
+ break;
+ }
+ } else {
+ if (buf_pool.running_out()) {
+ return DB_LOCK_TABLE_FULL;
+ }
+
+ big_rec_t* big_rec = NULL;
+
+ err = btr_cur_pessimistic_update(
+ flags | BTR_KEEP_POS_FLAG,
+ cursor, offsets, offsets_heap, heap,
+ &big_rec, update, 0, thr, thr_get_trx(thr)->id, mtr);
+
+ if (big_rec) {
+ ut_a(err == DB_SUCCESS);
+
+ DEBUG_SYNC_C("before_row_ins_upd_extern");
+ err = btr_store_big_rec_extern_fields(
+ pcur, *offsets, big_rec, mtr,
+ BTR_STORE_INSERT_UPDATE);
+ DEBUG_SYNC_C("after_row_ins_upd_extern");
+ dtuple_big_rec_free(big_rec);
+ }
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Returns TRUE if, in a cascaded update/delete, an ancestor node of the given
+node updates (not DELETE, but UPDATE) the table.
+@return TRUE if an ancestor updates the table */
+static
+ibool
+row_ins_cascade_ancestor_updates_table(
+/*===================================*/
+ que_node_t* node, /*!< in: node in a query graph */
+ dict_table_t* table) /*!< in: table */
+{
+ que_node_t* parent;
+
+ for (parent = que_node_get_parent(node);
+ que_node_get_type(parent) == QUE_NODE_UPDATE;
+ parent = que_node_get_parent(parent)) {
+
+ upd_node_t* upd_node;
+
+ upd_node = static_cast<upd_node_t*>(parent);
+
+ if (upd_node->table == table && !upd_node->is_delete) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Returns the number of ancestor UPDATE or DELETE nodes of a
+cascaded update/delete node.
+@return number of ancestors */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+ulint
+row_ins_cascade_n_ancestors(
+/*========================*/
+ que_node_t* node) /*!< in: node in a query graph */
+{
+ que_node_t* parent;
+ ulint n_ancestors = 0;
+
+ for (parent = que_node_get_parent(node);
+ que_node_get_type(parent) == QUE_NODE_UPDATE;
+ parent = que_node_get_parent(parent)) {
+
+ n_ancestors++;
+ }
+
+ return(n_ancestors);
+}
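+
+/* The count computed above is what enforces the cascade depth limit:
+row_ins_foreign_check_on_constraint() aborts with
+DB_FOREIGN_EXCEED_MAX_CASCADE once the number of ancestors reaches
+FK_MAX_CASCADE_DEL (15 in current InnoDB) instead of recursing further. */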
+
+/******************************************************************//**
+Calculates the update vector node->cascade->update for a child table in
+a cascaded update.
+@return whether any FULLTEXT INDEX is affected */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+row_ins_cascade_calc_update_vec(
+/*============================*/
+ upd_node_t* node, /*!< in: update node of the parent
+ table */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint whose
+ type is != 0 */
+ mem_heap_t* heap, /*!< in: memory heap to use as
+ temporary storage */
+ trx_t* trx) /*!< in: update transaction */
+{
+ upd_node_t* cascade = node->cascade_node;
+ dict_table_t* table = foreign->foreign_table;
+ dict_index_t* index = foreign->foreign_index;
+ upd_t* update;
+ dict_table_t* parent_table;
+ dict_index_t* parent_index;
+ upd_t* parent_update;
+ ulint n_fields_updated;
+ ulint parent_field_no;
+ ulint i;
+ ulint j;
+ bool doc_id_updated = false;
+ unsigned doc_id_pos = 0;
+ doc_id_t new_doc_id = FTS_NULL_DOC_ID;
+ ulint prefix_col;
+
+ ut_a(cascade);
+ ut_a(table);
+ ut_a(index);
+
+ /* Calculate the appropriate update vector which will set the fields
+ in the child index record to the same value (possibly padded with
+ spaces if the column is a fixed length CHAR or FIXBINARY column) as
+ the referenced index record will get in the update. */
+
+ parent_table = node->table;
+ ut_a(parent_table == foreign->referenced_table);
+ parent_index = foreign->referenced_index;
+ parent_update = node->update;
+
+ update = cascade->update;
+
+ update->info_bits = 0;
+
+ n_fields_updated = 0;
+
+ bool affects_fulltext = foreign->affects_fulltext();
+
+ if (table->fts) {
+ doc_id_pos = dict_table_get_nth_col_pos(
+ table, table->fts->doc_col, &prefix_col);
+ }
+
+ for (i = 0; i < foreign->n_fields; i++) {
+
+ parent_field_no = dict_table_get_nth_col_pos(
+ parent_table,
+ dict_index_get_nth_col_no(parent_index, i),
+ &prefix_col);
+
+ for (j = 0; j < parent_update->n_fields; j++) {
+ const upd_field_t* parent_ufield
+ = &parent_update->fields[j];
+
+ if (parent_ufield->field_no == parent_field_no) {
+
+ ulint min_size;
+ const dict_col_t* col;
+ ulint ufield_len;
+ upd_field_t* ufield;
+
+ col = dict_index_get_nth_col(index, i);
+
+ /* A field in the parent index record is
+ updated. Let us make the update vector
+ field for the child table. */
+
+ ufield = update->fields + n_fields_updated;
+
+ ufield->field_no = static_cast<uint16_t>(
+ dict_table_get_nth_col_pos(
+ table, dict_col_get_no(col),
+ &prefix_col));
+
+ ufield->orig_len = 0;
+ ufield->exp = NULL;
+
+ ufield->new_val = parent_ufield->new_val;
+ dfield_get_type(&ufield->new_val)->prtype |=
+ col->prtype & DATA_VERSIONED;
+ ufield_len = dfield_get_len(&ufield->new_val);
+
+ /* Clear the "external storage" flag */
+ dfield_set_len(&ufield->new_val, ufield_len);
+
+ /* Do not allow a NOT NULL column to be
+ updated as NULL */
+
+ if (dfield_is_null(&ufield->new_val)
+ && (col->prtype & DATA_NOT_NULL)) {
+ goto err_exit;
+ }
+
+ /* If the new value would not fit in the
+ column, do not allow the update */
+
+ if (!dfield_is_null(&ufield->new_val)
+ && dtype_get_at_most_n_mbchars(
+ col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ col->len,
+ ufield_len,
+ static_cast<char*>(
+ dfield_get_data(
+ &ufield->new_val)))
+ < ufield_len) {
+ goto err_exit;
+ }
+
+ /* If the parent column type has a different
+ length than the child column type, we may
+ need to pad with spaces the new value of the
+ child column */
+
+ min_size = dict_col_get_min_size(col);
+
+ /* Because UNIV_SQL_NULL (the marker
+ of SQL NULL values) exceeds all possible
+ values of min_size, the test below will
+ not hold for SQL NULL columns. */
+
+ if (min_size > ufield_len) {
+
+ byte* pad;
+ ulint pad_len;
+ byte* padded_data;
+ ulint mbminlen;
+
+ padded_data = static_cast<byte*>(
+ mem_heap_alloc(
+ heap, min_size));
+
+ pad = padded_data + ufield_len;
+ pad_len = min_size - ufield_len;
+
+ memcpy(padded_data,
+ dfield_get_data(&ufield
+ ->new_val),
+ ufield_len);
+
+ mbminlen = dict_col_get_mbminlen(col);
+
+ ut_ad(!(ufield_len % mbminlen));
+ ut_ad(!(min_size % mbminlen));
+
+ if (mbminlen == 1
+ && dtype_get_charset_coll(
+ col->prtype)
+ == DATA_MYSQL_BINARY_CHARSET_COLL) {
+ /* Do not pad BINARY columns */
+ goto err_exit;
+ }
+
+ row_mysql_pad_col(mbminlen,
+ pad, pad_len);
+ dfield_set_data(&ufield->new_val,
+ padded_data, min_size);
+ }
+
+ /* If Doc ID is updated, check whether the
+ Doc ID is valid */
+ if (table->fts
+ && ufield->field_no == doc_id_pos) {
+ doc_id_t n_doc_id;
+
+ n_doc_id =
+ table->fts->cache->next_doc_id;
+
+ new_doc_id = fts_read_doc_id(
+ static_cast<const byte*>(
+ dfield_get_data(
+ &ufield->new_val)));
+
+ affects_fulltext = true;
+ doc_id_updated = true;
+
+ if (new_doc_id <= 0) {
+ ib::error() << "FTS Doc ID"
+ " must be larger than"
+ " 0";
+ goto err_exit;
+ }
+
+ if (new_doc_id < n_doc_id) {
+ ib::error() << "FTS Doc ID"
+ " must be larger than "
+ << n_doc_id - 1
+ << " for table "
+ << table->name;
+ goto err_exit;
+ }
+ }
+
+ n_fields_updated++;
+ }
+ }
+ }
+
+ if (affects_fulltext) {
+ ut_ad(table->fts);
+
+ if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ doc_id_t doc_id;
+ doc_id_t* next_doc_id;
+ upd_field_t* ufield;
+
+ next_doc_id = static_cast<doc_id_t*>(mem_heap_alloc(
+ heap, sizeof(doc_id_t)));
+
+ ut_ad(!doc_id_updated);
+ ufield = update->fields + n_fields_updated;
+ fts_get_next_doc_id(table, next_doc_id);
+ doc_id = fts_update_doc_id(table, ufield, next_doc_id);
+ n_fields_updated++;
+ fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL);
+ } else {
+ if (doc_id_updated) {
+ ut_ad(new_doc_id);
+ fts_trx_add_op(trx, table, new_doc_id,
+ FTS_INSERT, NULL);
+ } else {
+ ib::error() << "FTS Doc ID must be updated"
+ " along with FTS indexed column for"
+ " table " << table->name;
+err_exit:
+ n_fields_updated = ULINT_UNDEFINED;
+ }
+ }
+ }
+
+ update->n_fields = n_fields_updated;
+
+ return affects_fulltext;
+}
+
+/*********************************************************************//**
+Set detailed error message associated with foreign key errors for
+the given transaction. */
+static
+void
+row_ins_set_detailed(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign) /*!< in: foreign key constraint */
+{
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&srv_misc_tmpfile_mutex);
+ rewind(srv_misc_tmpfile);
+
+ if (os_file_set_eof(srv_misc_tmpfile)) {
+ ut_print_name(srv_misc_tmpfile, trx,
+ foreign->foreign_table_name);
+ std::string fk_str = dict_print_info_on_foreign_key_in_create_format(
+ trx, foreign, FALSE);
+ fputs(fk_str.c_str(), srv_misc_tmpfile);
+ trx_set_detailed_error_from_file(trx, srv_misc_tmpfile);
+ } else {
+ trx_set_detailed_error(trx, "temp file operation failed");
+ }
+
+ mutex_exit(&srv_misc_tmpfile_mutex);
+}
+
+/*********************************************************************//**
+Acquires dict_foreign_err_mutex, rewinds dict_foreign_err_file
+and displays information about the given transaction.
+The caller must release dict_foreign_err_mutex. */
+static
+void
+row_ins_foreign_trx_print(
+/*======================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ulint n_rec_locks;
+ ulint n_trx_locks;
+ ulint heap_size;
+
+ ut_ad(!srv_read_only_mode);
+
+ lock_mutex_enter();
+ n_rec_locks = lock_number_of_rows_locked(&trx->lock);
+ n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
+ heap_size = mem_heap_get_size(trx->lock.lock_heap);
+ lock_mutex_exit();
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(dict_foreign_err_file);
+ ut_print_timestamp(dict_foreign_err_file);
+ fputs(" Transaction:\n", dict_foreign_err_file);
+
+ trx_print_low(dict_foreign_err_file, trx, 600,
+ n_rec_locks, n_trx_locks, heap_size);
+
+ ut_ad(mutex_own(&dict_foreign_err_mutex));
+}
+
+/*********************************************************************//**
+Reports a foreign key error associated with an update or a delete of a
+parent table index entry. */
+static
+void
+row_ins_foreign_report_err(
+/*=======================*/
+ const char* errstr, /*!< in: error string from the viewpoint
+ of the parent table */
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ const rec_t* rec, /*!< in: a matching index record in the
+ child table */
+ const dtuple_t* entry) /*!< in: index entry in the parent
+ table */
+{
+ std::string fk_str;
+
+ if (srv_read_only_mode) {
+ return;
+ }
+
+ FILE* ef = dict_foreign_err_file;
+ trx_t* trx = thr_get_trx(thr);
+
+ row_ins_set_detailed(trx, foreign);
+
+ row_ins_foreign_trx_print(trx);
+
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ fputs(":\n", ef);
+ fk_str = dict_print_info_on_foreign_key_in_create_format(trx, foreign,
+ TRUE);
+ fputs(fk_str.c_str(), ef);
+ putc('\n', ef);
+ fputs(errstr, ef);
+ fprintf(ef, " in parent table, in index %s",
+ foreign->referenced_index->name());
+ if (entry) {
+ fputs(" tuple:\n", ef);
+ dtuple_print(ef, entry);
+ }
+ fputs("\nBut in child table ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ fprintf(ef, ", in index %s", foreign->foreign_index->name());
+ if (rec) {
+ fputs(", there is a record:\n", ef);
+ rec_print(ef, rec, foreign->foreign_index);
+ } else {
+ fputs(", the record is not available\n", ef);
+ }
+ putc('\n', ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Reports a foreign key error to dict_foreign_err_file when we are trying
+to add an index entry to a child table. Note that the adding may be the result
+of an update, too. */
+static
+void
+row_ins_foreign_report_add_err(
+/*===========================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ const rec_t* rec, /*!< in: a record in the parent table:
+ it does not match entry because we
+ have an error! */
+ const dtuple_t* entry) /*!< in: index entry to insert in the
+ child table */
+{
+ std::string fk_str;
+
+ if (srv_read_only_mode) {
+ return;
+ }
+
+ FILE* ef = dict_foreign_err_file;
+
+ row_ins_set_detailed(trx, foreign);
+
+ row_ins_foreign_trx_print(trx);
+
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ fputs(":\n", ef);
+ fk_str = dict_print_info_on_foreign_key_in_create_format(trx, foreign,
+ TRUE);
+ fputs(fk_str.c_str(), ef);
+ if (foreign->foreign_index) {
+ fprintf(ef, " in parent table, in index %s",
+ foreign->foreign_index->name());
+ } else {
+ fputs(" in parent table", ef);
+ }
+ if (entry) {
+ fputs(" tuple:\n", ef);
+ /* TODO: DB_TRX_ID and DB_ROLL_PTR may be uninitialized.
+ It would be better to only display the user columns. */
+ dtuple_print(ef, entry);
+ }
+ fputs("\nBut in parent table ", ef);
+ ut_print_name(ef, trx, foreign->referenced_table_name);
+ fprintf(ef, ", in index %s,\n"
+ "the closest match we can find is record:\n",
+ foreign->referenced_index->name());
+ if (rec && page_rec_is_supremum(rec)) {
+ /* If the cursor ended on a supremum record, it is better
+ to report the previous record in the error message, so that
+ the user gets a more descriptive error message. */
+ rec = page_rec_get_prev_const(rec);
+ }
+
+ if (rec) {
+ rec_print(ef, rec, foreign->referenced_index);
+ }
+ putc('\n', ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Invalidate the query cache for the given table. */
+static
+void
+row_ins_invalidate_query_cache(
+/*===========================*/
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ const char* name) /*!< in: table name prefixed with
+ database name and a '/' character */
+{
+ innobase_invalidate_query_cache(thr_get_trx(thr), name);
+}
+
+
+/** Fill virtual column information in cascade node for the child table.
+@param[out] cascade child update node
+@param[in] rec clustered rec of child table
+@param[in] index clustered index of child table
+@param[in] node parent update node
+@param[in] foreign foreign key information
+@return error code. */
+static
+dberr_t
+row_ins_foreign_fill_virtual(
+ upd_node_t* cascade,
+ const rec_t* rec,
+ dict_index_t* index,
+ upd_node_t* node,
+ dict_foreign_t* foreign)
+{
+ THD* thd = current_thd;
+ row_ext_t* ext;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+ const rec_offs* offsets =
+ rec_get_offsets(rec, index, offsets_, index->n_core_fields,
+ ULINT_UNDEFINED, &cascade->heap);
+ TABLE* mysql_table= NULL;
+ upd_t* update = cascade->update;
+ ulint n_v_fld = index->table->n_v_def;
+ ulint n_diff;
+ upd_field_t* upd_field;
+ dict_vcol_set* v_cols = foreign->v_cols;
+ update->old_vrow = row_build(
+ ROW_COPY_DATA, index, rec,
+ offsets, index->table, NULL, NULL,
+ &ext, update->heap);
+ n_diff = update->n_fields;
+
+ if (index->table->vc_templ == NULL) {
+ /** This can occur when there is a cascading
+ delete or update after restart. */
+ innobase_init_vc_templ(index->table);
+ }
+
+ ib_vcol_row vc(NULL);
+ uchar *record = vc.record(thd, index, &mysql_table);
+ if (!record) {
+ return DB_OUT_OF_MEMORY;
+ }
+
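+	/* For each virtual column referenced by the foreign key, save
+	the old computed value and compute the new value for the
+	cascade update vector. */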
+ for (uint16_t i = 0; i < n_v_fld; i++) {
+
+ dict_v_col_t* col = dict_table_get_nth_v_col(
+ index->table, i);
+
+ dict_vcol_set::iterator it = v_cols->find(col);
+
+ if (it == v_cols->end()) {
+ continue;
+ }
+
+ dfield_t* vfield = innobase_get_computed_value(
+ update->old_vrow, col, index,
+ &vc.heap, update->heap, NULL, thd, mysql_table,
+ record, NULL, NULL, NULL);
+
+ if (vfield == NULL) {
+ return DB_COMPUTE_VALUE_FAILED;
+ }
+
+ upd_field = update->fields + n_diff;
+
+ upd_field->old_v_val = static_cast<dfield_t*>(
+ mem_heap_alloc(update->heap,
+ sizeof *upd_field->old_v_val));
+
+ dfield_copy(upd_field->old_v_val, vfield);
+
+ upd_field_set_v_field_no(upd_field, i, index);
+
+ bool set_null =
+ node->is_delete
+ ? (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)
+ : (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL);
+
+ dfield_t* new_vfield = innobase_get_computed_value(
+ update->old_vrow, col, index,
+ &vc.heap, update->heap, NULL, thd,
+ mysql_table, record, NULL,
+ set_null ? update : node->update, foreign);
+
+ if (new_vfield == NULL) {
+ return DB_COMPUTE_VALUE_FAILED;
+ }
+
+ dfield_copy(&upd_field->new_val, new_vfield);
+
+ if (!dfield_datas_are_binary_equal(
+ upd_field->old_v_val,
+ &upd_field->new_val, 0))
+ n_diff++;
+ }
+
+ update->n_fields = n_diff;
+ return DB_SUCCESS;
+}
+
+#ifdef WITH_WSREP
+dberr_t wsrep_append_foreign_key(trx_t *trx,
+ dict_foreign_t* foreign,
+ const rec_t* clust_rec,
+ dict_index_t* clust_index,
+ ibool referenced,
+ Wsrep_service_key_type key_type);
+#endif /* WITH_WSREP */
+
+/*********************************************************************//**
+Perform referential actions or checks when a parent row is deleted or updated
+and the constraint had an ON DELETE or ON UPDATE condition which was not
+RESTRICT.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_foreign_check_on_constraint(
+/*================================*/
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint whose
+ type is != 0 */
+ btr_pcur_t* pcur, /*!< in: cursor placed on a matching
+ index record in the child table */
+ dtuple_t* entry, /*!< in: index entry in the parent
+ table */
+ mtr_t* mtr) /*!< in: mtr holding the latch of pcur
+ page */
+{
+ upd_node_t* node;
+ upd_node_t* cascade;
+ dict_table_t* table = foreign->foreign_table;
+ dict_index_t* index;
+ dict_index_t* clust_index;
+ dtuple_t* ref;
+ const rec_t* rec;
+ const rec_t* clust_rec;
+ const buf_block_t* clust_block;
+ upd_t* update;
+ dberr_t err;
+ trx_t* trx;
+ mem_heap_t* tmp_heap = NULL;
+ doc_id_t doc_id = FTS_NULL_DOC_ID;
+
+ DBUG_ENTER("row_ins_foreign_check_on_constraint");
+
+ trx = thr_get_trx(thr);
+
+ /* Since we are going to delete or update a row, we have to invalidate
+	the MySQL query cache for the table. A deadlock of threads is not possible
+ here because the caller of this function does not hold any latches with
+ the mutex rank above the lock_sys_t::mutex. The query cache mutex
+ has a rank just above the lock_sys_t::mutex. */
+
+ row_ins_invalidate_query_cache(thr, table->name.m_name);
+
+ node = static_cast<upd_node_t*>(thr->run_node);
+
+ if (node->is_delete && 0 == (foreign->type
+ & (DICT_FOREIGN_ON_DELETE_CASCADE
+ | DICT_FOREIGN_ON_DELETE_SET_NULL))) {
+
+ row_ins_foreign_report_err("Trying to delete",
+ thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ DBUG_RETURN(DB_ROW_IS_REFERENCED);
+ }
+
+ if (!node->is_delete && 0 == (foreign->type
+ & (DICT_FOREIGN_ON_UPDATE_CASCADE
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+
+ /* This is an UPDATE */
+
+ row_ins_foreign_report_err("Trying to update",
+ thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ DBUG_RETURN(DB_ROW_IS_REFERENCED);
+ }
+
+ if (node->cascade_node == NULL) {
+ node->cascade_heap = mem_heap_create(128);
+ node->cascade_node = row_create_update_node_for_mysql(
+ table, node->cascade_heap);
+ que_node_set_parent(node->cascade_node, node);
+
+ }
+ cascade = node->cascade_node;
+ cascade->table = table;
+ cascade->foreign = foreign;
+
+ if (node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) {
+ cascade->is_delete = PLAIN_DELETE;
+ } else {
+ cascade->is_delete = NO_DELETE;
+
+ if (foreign->n_fields > cascade->update_n_fields) {
+ /* We have to make the update vector longer */
+
+ cascade->update = upd_create(foreign->n_fields,
+ node->cascade_heap);
+ cascade->update_n_fields = foreign->n_fields;
+ }
+
+ /* We do not allow cyclic cascaded updating (DELETE is
+ allowed, but not UPDATE) of the same table, as this
+ can lead to an infinite cycle. Check that we are not
+ updating the same table which is already being
+ modified in this cascade chain. We have to check this
+ also because the modification of the indexes of a
+ 'parent' table may still be incomplete, and we must
+ avoid seeing the indexes of the parent table in an
+ inconsistent state! */
+
+ if (row_ins_cascade_ancestor_updates_table(cascade, table)) {
+
+ /* We do not know if this would break foreign key
+ constraints, but play safe and return an error */
+
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+ "Trying an update, possibly causing a cyclic"
+ " cascaded update\n"
+ "in the child table,", thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+ }
+
+ if (row_ins_cascade_n_ancestors(cascade) >= FK_MAX_CASCADE_DEL) {
+ err = DB_FOREIGN_EXCEED_MAX_CASCADE;
+
+ row_ins_foreign_report_err(
+ "Trying a too deep cascaded delete or update\n",
+ thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ index = btr_pcur_get_btr_cur(pcur)->index;
+
+ ut_a(index == foreign->foreign_index);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ tmp_heap = mem_heap_create(256);
+
+ if (dict_index_is_clust(index)) {
+ /* pcur is already positioned in the clustered index of
+ the child table */
+
+ clust_index = index;
+ clust_rec = rec;
+ clust_block = btr_pcur_get_block(pcur);
+ } else {
+ /* We have to look for the record in the clustered index
+ in the child table */
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec,
+ tmp_heap);
+ btr_pcur_open_with_no_init(clust_index, ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ cascade->pcur, 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(cascade->pcur);
+ clust_block = btr_pcur_get_block(cascade->pcur);
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(cascade->pcur)
+ < dict_index_get_n_unique(clust_index)) {
+
+ ib::error() << "In cascade of a foreign key op index "
+ << index->name
+ << " of table " << index->table->name;
+
+ fputs("InnoDB: record ", stderr);
+ rec_print(stderr, rec, index);
+ fputs("\n"
+ "InnoDB: clustered record ", stderr);
+ rec_print(stderr, clust_rec, clust_index);
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report to"
+ " https://jira.mariadb.org/\n", stderr);
+ ut_ad(0);
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+ }
+
+ /* Set an X-lock on the row to delete or update in the child table */
+
+ err = lock_table(0, table, LOCK_IX, thr);
+
+ if (err == DB_SUCCESS) {
+ /* Here it suffices to use a LOCK_REC_NOT_GAP type lock;
+ we already have a normal shared lock on the appropriate
+ gap if the search criterion was not unique */
+
+ err = lock_clust_rec_read_check_and_lock_alt(
+ 0, clust_block, clust_rec, clust_index,
+ LOCK_X, LOCK_REC_NOT_GAP, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ goto nonstandard_exit_func;
+ }
+
+ if (rec_get_deleted_flag(clust_rec, dict_table_is_comp(table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(rec_get_trx_id(clust_rec, clust_index));
+ /* This can happen if there is a circular reference of
+ rows such that cascading delete comes to delete a row
+ already in the process of being delete marked */
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+
+ if (table->fts) {
+ doc_id = fts_get_doc_id_from_rec(
+ clust_rec, clust_index,
+ rec_get_offsets(clust_rec, clust_index, NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &tmp_heap));
+ }
+
+ if (node->is_delete
+ ? (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)
+ : (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL)) {
+		/* Build the appropriate update vector which sets the first
+		foreign->n_fields fields in rec to SQL NULL */
+
+ update = cascade->update;
+
+ update->info_bits = 0;
+ update->n_fields = foreign->n_fields;
+ MEM_UNDEFINED(update->fields,
+ update->n_fields * sizeof *update->fields);
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ upd_field_t* ufield = &update->fields[i];
+ ulint col_no = dict_index_get_nth_col_no(
+ index, i);
+ ulint prefix_col;
+
+ ufield->field_no = static_cast<uint16_t>(
+ dict_table_get_nth_col_pos(
+ table, col_no, &prefix_col));
+ dict_col_t* col = dict_table_get_nth_col(
+ table, col_no);
+ dict_col_copy_type(col, dfield_get_type(&ufield->new_val));
+
+ ufield->orig_len = 0;
+ ufield->exp = NULL;
+ dfield_set_null(&ufield->new_val);
+ }
+
+ if (foreign->affects_fulltext()) {
+ fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+ }
+
+ if (foreign->v_cols != NULL
+ && foreign->v_cols->size() > 0) {
+ err = row_ins_foreign_fill_virtual(
+ cascade, clust_rec, clust_index,
+ node, foreign);
+
+ if (err != DB_SUCCESS) {
+ goto nonstandard_exit_func;
+ }
+ }
+ } else if (table->fts && cascade->is_delete == PLAIN_DELETE
+ && foreign->affects_fulltext()) {
+ /* DICT_FOREIGN_ON_DELETE_CASCADE case */
+ fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+ }
+
+ if (!node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) {
+
+		/* Build the appropriate update vector, which changes
+		the first foreign->n_fields fields in rec to the new values */
+
+ bool affects_fulltext = row_ins_cascade_calc_update_vec(
+ node, foreign, tmp_heap, trx);
+
+ if (foreign->v_cols && !foreign->v_cols->empty()) {
+ err = row_ins_foreign_fill_virtual(
+ cascade, clust_rec, clust_index,
+ node, foreign);
+
+ if (err != DB_SUCCESS) {
+ goto nonstandard_exit_func;
+ }
+ }
+
+ switch (cascade->update->n_fields) {
+ case ULINT_UNDEFINED:
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+ "Trying a cascaded update where the"
+ " updated value in the child\n"
+ "table would not fit in the length"
+ " of the column, or the value would\n"
+ "be NULL and the column is"
+ " declared as not NULL in the child table,",
+ thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ case 0:
+ /* The update does not change any columns referred
+ to in this foreign key constraint: no need to do
+ anything */
+
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+
+ /* Mark the old Doc ID as deleted */
+ if (affects_fulltext) {
+ ut_ad(table->fts);
+ fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+ }
+ }
+
+ if (table->versioned() && cascade->is_delete != PLAIN_DELETE
+ && cascade->update->affects_versioned()) {
+ ut_ad(!cascade->historical_heap);
+ cascade->historical_heap = mem_heap_create(srv_page_size);
+ cascade->historical_row = row_build(
+ ROW_COPY_DATA, clust_index, clust_rec, NULL, table,
+ NULL, NULL, NULL, cascade->historical_heap);
+ }
+
+ /* Store pcur position and initialize or store the cascade node
+ pcur stored position */
+
+ btr_pcur_store_position(pcur, mtr);
+
+ if (index == clust_index) {
+ btr_pcur_copy_stored_position(cascade->pcur, pcur);
+ } else {
+ btr_pcur_store_position(cascade->pcur, mtr);
+ }
+
+#ifdef WITH_WSREP
+ err = wsrep_append_foreign_key(trx, foreign, clust_rec, clust_index,
+ FALSE, WSREP_SERVICE_KEY_EXCLUSIVE);
+ if (err != DB_SUCCESS) {
+ ib::info() << "WSREP: foreign key append failed: " << err;
+ goto nonstandard_exit_func;
+ }
+#endif /* WITH_WSREP */
+ mtr_commit(mtr);
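+	/* All page latches are released here; the cursor positions
+	were stored above so that pcur and cascade->pcur can be
+	restored after the cascade operation. */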
+
+ ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON);
+
+ cascade->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ err = row_update_cascade_for_mysql(thr, cascade,
+ foreign->foreign_table);
+
+ /* Release the data dictionary latch for a while, so that we do not
+ starve other threads from doing CREATE TABLE etc. if we have a huge
+ cascaded operation running. */
+
+ row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
+
+ DEBUG_SYNC_C("innodb_dml_cascade_dict_unfreeze");
+
+ row_mysql_freeze_data_dictionary(thr_get_trx(thr));
+
+ mtr_start(mtr);
+
+ /* Restore pcur position */
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ DBUG_RETURN(err);
+
+nonstandard_exit_func:
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ btr_pcur_store_position(pcur, mtr);
+
+ mtr_commit(mtr);
+ mtr_start(mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+ DBUG_RETURN(err);
+}
+
+/*********************************************************************//**
+Sets a shared lock on a record. Used in locking possible duplicate key
+records and also in checking foreign key constraints.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+static
+dberr_t
+row_ins_set_shared_rec_lock(
+/*========================*/
+ unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP type lock */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_S, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_S, type, thr);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Sets an exclusive lock on a record. Used in locking possible duplicate key
+records.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+static
+dberr_t
+row_ins_set_exclusive_rec_lock(
+/*===========================*/
+ unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP type lock */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_X, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_X, type, thr);
+ }
+
+ return(err);
+}
+
+/***************************************************************//**
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_sys.latch.
+@return DB_SUCCESS, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */
+dberr_t
+row_ins_check_foreign_constraint(
+/*=============================*/
+ ibool check_ref,/*!< in: TRUE if we want to check that
+ the referenced table is ok, FALSE if we
+ want to check the foreign key table */
+ dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the
+ tables mentioned in it must be in the
+ dictionary cache if they exist at all */
+ dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign
+ table, else the referenced table */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ upd_node_t* upd_node;
+ dict_table_t* check_table;
+ dict_index_t* check_index;
+ ulint n_fields_cmp;
+ btr_pcur_t pcur;
+ int cmp;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
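+	/* At the READ COMMITTED isolation level or below, the foreign
+	key check takes no gap locks; only matching records are locked. */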
+ bool skip_gap_lock;
+
+ skip_gap_lock = (trx->isolation_level <= TRX_ISO_READ_COMMITTED);
+
+ DBUG_ENTER("row_ins_check_foreign_constraint");
+
+ rec_offs_init(offsets_);
+
+#ifdef WITH_WSREP
+ upd_node= NULL;
+#endif /* WITH_WSREP */
+
+ ut_ad(rw_lock_own(&dict_sys.latch, RW_LOCK_S));
+
+ err = DB_SUCCESS;
+
+ if (trx->check_foreigns == FALSE) {
+ /* The user has suppressed foreign key checks currently for
+ this session */
+ goto exit_func;
+ }
+
+ /* If any of the foreign key fields in entry is SQL NULL, we
+ suppress the foreign key check: this is compatible with Oracle,
+ for example */
+ for (ulint i = 0; i < entry->n_fields; i++) {
+ dfield_t* field = dtuple_get_nth_field(entry, i);
+ if (i < foreign->n_fields && dfield_is_null(field)) {
+ goto exit_func;
+ }
+ /* System Versioning: if row_end != Inf, we
+ suppress the foreign key check */
+ if (field->type.vers_sys_end() && field->vers_history_row()) {
+ goto exit_func;
+ }
+ }
+
+ if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) {
+ upd_node = static_cast<upd_node_t*>(thr->run_node);
+
+ if (upd_node->is_delete != PLAIN_DELETE
+ && upd_node->foreign == foreign) {
+ /* If a cascaded update is done as defined by a
+ foreign key constraint, do not check that
+ constraint for the child row. In ON UPDATE CASCADE
+ the update of the parent row is only half done when
+ we come here: if we would check the constraint here
+ for the child row it would fail.
+
+			A QUESTION remains: if the child table has several
+			constraints which refer to the same parent
+			table, should we merge all updates to the child
+			into one update? The updates could even be
+			contradictory! Currently we just perform the update
+			associated with each foreign key constraint, one
+			after another, and the user cannot easily predict
+			the order in which they are performed. */
+
+ goto exit_func;
+ }
+ }
+
+ if (que_node_get_type(thr->run_node) == QUE_NODE_INSERT) {
+ ins_node_t* insert_node =
+ static_cast<ins_node_t*>(thr->run_node);
+ dict_table_t* table = insert_node->index->table;
+ if (table->versioned()) {
+ dfield_t* row_end = dtuple_get_nth_field(
+ insert_node->row, table->vers_end);
+ if (row_end->vers_history_row()) {
+ goto exit_func;
+ }
+ }
+ }
+
+ if (check_ref) {
+ check_table = foreign->referenced_table;
+ check_index = foreign->referenced_index;
+ } else {
+ check_table = foreign->foreign_table;
+ check_index = foreign->foreign_index;
+ }
+
+ if (check_table == NULL
+ || !check_table->is_readable()
+ || check_index == NULL) {
+
+ FILE* ef = dict_foreign_err_file;
+ std::string fk_str;
+
+ row_ins_set_detailed(trx, foreign);
+ row_ins_foreign_trx_print(trx);
+
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, check_ref
+ ? foreign->foreign_table_name
+ : foreign->referenced_table_name);
+ fputs(":\n", ef);
+ fk_str = dict_print_info_on_foreign_key_in_create_format(
+ trx, foreign, TRUE);
+ fputs(fk_str.c_str(), ef);
+ if (check_ref) {
+ if (foreign->foreign_index) {
+ fprintf(ef, "\nTrying to add to index %s"
+ " tuple:\n",
+ foreign->foreign_index->name());
+ } else {
+ fputs("\nTrying to add tuple:\n", ef);
+ }
+ dtuple_print(ef, entry);
+ fputs("\nBut the parent table ", ef);
+ ut_print_name(ef, trx, foreign->referenced_table_name);
+ fputs("\nor its .ibd file or the required index does"
+ " not currently exist!\n", ef);
+ err = DB_NO_REFERENCED_ROW;
+ } else {
+ if (foreign->referenced_index) {
+ fprintf(ef, "\nTrying to modify index %s"
+ " tuple:\n",
+ foreign->referenced_index->name());
+ } else {
+ fputs("\nTrying to modify tuple:\n", ef);
+ }
+ dtuple_print(ef, entry);
+ fputs("\nBut the referencing table ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ fputs("\nor its .ibd file or the required index does"
+ " not currently exist!\n", ef);
+ err = DB_ROW_IS_REFERENCED;
+ }
+
+ mutex_exit(&dict_foreign_err_mutex);
+ goto exit_func;
+ }
+
+ if (check_table != table) {
+ /* We already have a LOCK_IX on table, but not necessarily
+ on check_table */
+
+ err = lock_table(0, check_table, LOCK_IS, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto do_possible_lock_wait;
+ }
+ }
+
+ mtr_start(&mtr);
+
+	/* Store the old value of n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, foreign->n_fields);
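+	/* Only the foreign key columns of entry will participate in
+	the comparisons during the index scan below. */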
+
+ btr_pcur_open(check_index, entry, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ /* Scan index records and check if there is a matching record */
+
+ do {
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ const buf_block_t* block = btr_pcur_get_block(&pcur);
+
+ if (page_rec_is_infimum(rec)) {
+
+ continue;
+ }
+
+ offsets = rec_get_offsets(rec, check_index, offsets,
+ check_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (page_rec_is_supremum(rec)) {
+
+ if (skip_gap_lock) {
+
+ continue;
+ }
+
+ err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, block,
+ rec, check_index,
+ offsets, thr);
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ continue;
+ default:
+ goto end_scan;
+ }
+ }
+
+ cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+ if (cmp == 0) {
+ if (check_table->versioned()) {
+ bool history_row = false;
+
+ if (check_index->is_primary()) {
+ history_row = check_index->
+ vers_history_row(rec, offsets);
+ } else if (check_index->
+ vers_history_row(rec, history_row))
+ {
+ break;
+ }
+
+ if (history_row) {
+ continue;
+ }
+ }
+
+ if (rec_get_deleted_flag(rec,
+ rec_offs_comp(offsets))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(!dict_index_is_clust(check_index)
+ || row_get_rec_trx_id(rec, check_index,
+ offsets));
+
+ err = row_ins_set_shared_rec_lock(
+ skip_gap_lock
+ ? LOCK_REC_NOT_GAP
+ : LOCK_ORDINARY, block,
+ rec, check_index, offsets, thr);
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto end_scan;
+ }
+ } else {
+ /* Found a matching record. Lock only
+ a record because we can allow inserts
+ into gaps */
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP, block,
+ rec, check_index, offsets, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto end_scan;
+ }
+
+ if (check_ref) {
+ err = DB_SUCCESS;
+#ifdef WITH_WSREP
+ err = wsrep_append_foreign_key(
+ thr_get_trx(thr),
+ foreign,
+ rec,
+ check_index,
+ check_ref,
+ (upd_node != NULL
+ && wsrep_protocol_version < 4)
+ ? WSREP_SERVICE_KEY_SHARED
+ : WSREP_SERVICE_KEY_REFERENCE);
+ if (err != DB_SUCCESS) {
+ fprintf(stderr,
+ "WSREP: foreign key append failed: %d\n", err);
+ }
+#endif /* WITH_WSREP */
+ goto end_scan;
+ } else if (foreign->type != 0) {
+ /* There is an ON UPDATE or ON DELETE
+ condition: check them in a separate
+ function */
+
+ err = row_ins_foreign_check_on_constraint(
+ thr, foreign, &pcur, entry,
+ &mtr);
+ if (err != DB_SUCCESS) {
+ /* Since reporting a plain
+ "duplicate key" error
+ message to the user in
+ cases where a long CASCADE
+ operation would lead to a
+ duplicate key in some
+ other table is very
+ confusing, map duplicate
+ key errors resulting from
+ FK constraints to a
+ separate error code. */
+
+ if (err == DB_DUPLICATE_KEY) {
+ err = DB_FOREIGN_DUPLICATE_KEY;
+ }
+
+ goto end_scan;
+ }
+
+ /* row_ins_foreign_check_on_constraint
+ may have repositioned pcur on a
+ different block */
+ block = btr_pcur_get_block(&pcur);
+ } else {
+ row_ins_foreign_report_err(
+ "Trying to delete or update",
+ thr, foreign, rec, entry);
+
+ err = DB_ROW_IS_REFERENCED;
+ goto end_scan;
+ }
+ }
+ } else {
+ ut_a(cmp < 0);
+
+ err = skip_gap_lock
+ ? DB_SUCCESS
+ : row_ins_set_shared_rec_lock(
+ LOCK_GAP, block,
+ rec, check_index, offsets, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ if (check_ref) {
+ err = DB_NO_REFERENCED_ROW;
+ row_ins_foreign_report_add_err(
+ trx, foreign, rec, entry);
+ }
+ default:
+ break;
+ }
+
+ goto end_scan;
+ }
+ } while (btr_pcur_move_to_next(&pcur, &mtr));
+
+ if (check_ref) {
+ row_ins_foreign_report_add_err(
+ trx, foreign, btr_pcur_get_rec(&pcur), entry);
+ err = DB_NO_REFERENCED_ROW;
+ } else {
+ err = DB_SUCCESS;
+ }
+
+end_scan:
+ btr_pcur_close(&pcur);
+
+ mtr_commit(&mtr);
+
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+do_possible_lock_wait:
+ if (err == DB_LOCK_WAIT) {
+ trx->error_state = err;
+
+ que_thr_stop_for_mysql(thr);
+
+ thr->lock_state = QUE_THR_LOCK_ROW;
+
+ check_table->inc_fk_checks();
+
+ lock_wait_suspend_thread(thr);
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+ err = trx->error_state;
+ if (err != DB_SUCCESS) {
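+			/* Keep the error state set during the lock wait. */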
+ } else if (check_table->to_be_dropped) {
+ err = DB_LOCK_WAIT_TIMEOUT;
+ } else {
+ err = DB_LOCK_WAIT;
+ }
+
+ check_table->dec_fk_checks();
+ }
+
+exit_func:
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+
+ DBUG_RETURN(err);
+}
+
+/** Sets the values of the dtuple fields in ref_entry from the values of
+foreign columns in entry.
+@param[in] foreign foreign key constraint
+@param[in] index clustered index
+@param[in] entry tuple of clustered index
+@param[out] ref_entry tuple of foreign columns
+@return true if all foreign key fields are present in the clustered index */
+static
+bool row_ins_foreign_index_entry(dict_foreign_t *foreign,
+ const dict_index_t *index,
+ const dtuple_t *entry,
+ dtuple_t *ref_entry)
+{
+ for (ulint i= 0; i < foreign->n_fields; i++)
+ {
+ for (ulint j= 0; j < index->n_fields; j++)
+ {
+ const dict_col_t *col= dict_index_get_nth_col(index, j);
+
+ /* A clustered index may contain instantly dropped columns,
+ which must be skipped. */
+ if (col->is_dropped())
+ continue;
+
+ const char *col_name= dict_table_get_col_name(index->table, col->ind);
+ if (0 == innobase_strcasecmp(col_name, foreign->foreign_col_names[i]))
+ {
+ dfield_copy(&ref_entry->fields[i], &entry->fields[j]);
+ goto got_match;
+ }
+ }
+ return false;
+got_match:
+ continue;
+ }
+
+ return true;
+}
+
+/***************************************************************//**
+Checks if foreign key constraints fail for an index entry. If the index
+is not mentioned in any constraint, this function does nothing.
+Otherwise, it searches the indexes of the referenced tables and
+sets shared locks which lock either the success or the failure of
+a constraint.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_check_foreign_constraints(
+/*==============================*/
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index, /*!< in: index */
+ bool pk, /*!< in: index->is_primary() */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_foreign_t* foreign;
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx;
+ ibool got_s_lock = FALSE;
+ mem_heap_t* heap = NULL;
+
+ DBUG_ASSERT(index->is_primary() == pk);
+
+ trx = thr_get_trx(thr);
+
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "foreign_constraint_check_for_ins");
+
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ err == DB_SUCCESS && it != table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (foreign->foreign_index == index
+ || (pk && !foreign->foreign_index)) {
+
+ dtuple_t* ref_tuple = entry;
+ if (UNIV_UNLIKELY(!foreign->foreign_index)) {
+ /* Change primary key entry to
+ foreign key index entry */
+ if (!heap) {
+ heap = mem_heap_create(1000);
+ } else {
+ mem_heap_empty(heap);
+ }
+
+ ref_tuple = dtuple_create(
+ heap, foreign->n_fields);
+ dtuple_set_n_fields_cmp(
+ ref_tuple, foreign->n_fields);
+ if (!row_ins_foreign_index_entry(
+ foreign, index, entry, ref_tuple)) {
+ err = DB_NO_REFERENCED_ROW;
+ break;
+ }
+
+ }
+
+ dict_table_t* ref_table = NULL;
+ dict_table_t* referenced_table
+ = foreign->referenced_table;
+
+ if (referenced_table == NULL) {
+
+ ref_table = dict_table_open_on_name(
+ foreign->referenced_table_name_lookup,
+ FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+ }
+
+ if (0 == trx->dict_operation_lock_mode) {
+ got_s_lock = TRUE;
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ if (referenced_table) {
+ foreign->foreign_table->inc_fk_checks();
+ }
+
+ /* NOTE that if the thread ends up waiting for a lock
+ we will release dict_sys.latch temporarily!
+ But the counter on the table protects the referenced
+ table from being dropped while the check is running. */
+
+ err = row_ins_check_foreign_constraint(
+ TRUE, foreign, table, ref_tuple, thr);
+
+ if (referenced_table) {
+ foreign->foreign_table->dec_fk_checks();
+ }
+
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ if (ref_table != NULL) {
+ dict_table_close(ref_table, FALSE, FALSE);
+ }
+ }
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return err;
+}
+
+/***************************************************************//**
+Checks if inserting the index entry would cause a unique key violation
+with respect to rec.
+@return TRUE if error */
+static
+ibool
+row_ins_dupl_error_with_rec(
+/*========================*/
+ const rec_t* rec, /*!< in: user record; NOTE that we assume
+ that the caller already has a record lock on
+ the record! */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint matched_fields;
+ ulint n_unique;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ n_unique = dict_index_get_n_unique(index);
+
+ matched_fields = 0;
+
+ cmp_dtuple_rec_with_match(entry, rec, offsets, &matched_fields);
+
+ if (matched_fields < n_unique) {
+
+ return(FALSE);
+ }
+
+ /* In a unique secondary index we allow equal key values if they
+ contain SQL NULLs */
+
+ if (!dict_index_is_clust(index) && !index->nulls_equal) {
+
+ for (i = 0; i < n_unique; i++) {
+ if (dfield_is_null(dtuple_get_nth_field(entry, i))) {
+
+ return(FALSE);
+ }
+ }
+ }
+
+ return(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+}
+
+/***************************************************************//**
+Scans a unique non-clustered index at a given index entry to determine
+whether a uniqueness violation has occurred for the key value of the entry.
+Sets shared locks on possible duplicate records.
+@return DB_SUCCESS, DB_DUPLICATE_KEY, or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_scan_sec_index_for_duplicate(
+/*=================================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ dict_index_t* index, /*!< in: non-clustered unique index */
+ dtuple_t* entry, /*!< in: index entry */
+ que_thr_t* thr, /*!< in: query thread */
+ bool s_latch,/*!< in: whether index->lock is being held */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ mem_heap_t* offsets_heap)
+ /*!< in/out: memory heap that can be emptied */
+{
+ ulint n_unique;
+ int cmp;
+ ulint n_fields_cmp;
+ btr_pcur_t pcur;
+ dberr_t err = DB_SUCCESS;
+ ulint allow_duplicates;
+ rec_offs offsets_[REC_OFFS_SEC_INDEX_SIZE];
+ rec_offs* offsets = offsets_;
+ DBUG_ENTER("row_ins_scan_sec_index_for_duplicate");
+
+ rec_offs_init(offsets_);
+
+ ut_ad(s_latch == rw_lock_own_flagged(
+ &index->lock, RW_LOCK_FLAG_S | RW_LOCK_FLAG_SX));
+
+ n_unique = dict_index_get_n_unique(index);
+
+	/* If the secondary index is unique, but one of the first
+	n_unique fields is NULL, a unique key violation cannot occur,
+	since we define NULL != NULL in this case */
+
+ if (!index->nulls_equal) {
+ for (ulint i = 0; i < n_unique; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ DBUG_RETURN(DB_SUCCESS);
+ }
+ }
+ }
+
+	/* Store the old value of n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, n_unique);
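+	/* Only the n_unique key columns will participate in the
+	comparisons during the duplicate scan below. */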
+
+ btr_pcur_open(index, entry, PAGE_CUR_GE,
+ s_latch
+ ? BTR_SEARCH_LEAF_ALREADY_S_LATCHED
+ : BTR_SEARCH_LEAF,
+ &pcur, mtr);
+
+ allow_duplicates = thr_get_trx(thr)->duplicates;
+
+ /* Scan index records and check if there is a duplicate */
+
+ do {
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ const buf_block_t* block = btr_pcur_get_block(&pcur);
+ const ulint lock_type = LOCK_ORDINARY;
+
+ if (page_rec_is_infimum(rec)) {
+
+ continue;
+ }
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &offsets_heap);
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+ /* Set no locks when applying log
+ in online table rebuild. */
+ } else if (allow_duplicates) {
+
+			/* If the SQL query will update or replace a
+			duplicate key, we will take an X-lock on the
+			duplicates (REPLACE, LOAD DATA INFILE REPLACE,
+			INSERT ... ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ lock_type, block, rec, index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ lock_type, block, rec, index, offsets, thr);
+ }
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
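+			/* fall through */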
+ case DB_SUCCESS:
+ break;
+ default:
+ goto end_scan;
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ continue;
+ }
+
+ cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+ if (cmp == 0) {
+ if (row_ins_dupl_error_with_rec(rec, entry,
+ index, offsets)) {
+ err = DB_DUPLICATE_KEY;
+
+ thr_get_trx(thr)->error_info = index;
+
+ /* If the duplicate is on hidden FTS_DOC_ID,
+ state so in the error log */
+ if (index == index->table->fts_doc_id_index
+ && DICT_TF2_FLAG_IS_SET(
+ index->table,
+ DICT_TF2_FTS_HAS_DOC_ID)) {
+
+ ib::error() << "Duplicate FTS_DOC_ID"
+ " value on table "
+ << index->table->name;
+ }
+
+ goto end_scan;
+ }
+ } else {
+ ut_a(cmp < 0);
+ goto end_scan;
+ }
+ } while (btr_pcur_move_to_next(&pcur, mtr));
+
+end_scan:
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+ DBUG_RETURN(err);
+}
+
+/** Checks for a duplicate when the table is being rebuilt online.
+@retval DB_SUCCESS when no duplicate is detected
+@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or
+a newer version of entry (the entry should not be inserted)
+@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_duplicate_online(
+/*=====================*/
+ ulint n_uniq, /*!< in: offset of DB_TRX_ID */
+ const dtuple_t* entry, /*!< in: entry that is being inserted */
+ const rec_t* rec, /*!< in: clustered index record */
+ rec_offs* offsets)/*!< in/out: rec_get_offsets(rec) */
+{
+ ulint fields = 0;
+
+ /* During rebuild, there should not be any delete-marked rows
+ in the new table. */
+ ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+ ut_ad(dtuple_get_n_fields_cmp(entry) == n_uniq);
+
+ /* Compare the PRIMARY KEY fields and the
+ DB_TRX_ID, DB_ROLL_PTR. */
+ cmp_dtuple_rec_with_match_low(
+ entry, rec, offsets, n_uniq + 2, &fields);
+
+ if (fields < n_uniq) {
+ /* Not a duplicate. */
+ return(DB_SUCCESS);
+ }
+
+ ulint trx_id_len;
+
+ if (fields == n_uniq + 2
+ && memcmp(rec_get_nth_field(rec, offsets, n_uniq, &trx_id_len),
+ reset_trx_id, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) {
+ ut_ad(trx_id_len == DATA_TRX_ID_LEN);
+ /* rec is an exact match of entry, and DB_TRX_ID belongs
+ to a transaction that started after our ALTER TABLE. */
+ return(DB_SUCCESS_LOCKED_REC);
+ }
+
+ return(DB_DUPLICATE_KEY);
+}
+
+/** Checks around the insert position for a duplicate when the table
+is being rebuilt online.
+@retval DB_SUCCESS when no duplicate is detected
+@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or
+a newer version of entry (the entry should not be inserted)
+@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_duplicate_error_in_clust_online(
+/*====================================*/
+ ulint n_uniq, /*!< in: offset of DB_TRX_ID */
+ const dtuple_t* entry, /*!< in: entry that is being inserted */
+ const btr_cur_t*cursor, /*!< in: cursor on insert position */
+ rec_offs** offsets,/*!< in/out: rec_get_offsets(rec) */
+ mem_heap_t** heap) /*!< in/out: heap for offsets */
+{
+ dberr_t err = DB_SUCCESS;
+ const rec_t* rec = btr_cur_get_rec(cursor);
+
+ ut_ad(!cursor->index->is_instant());
+
+ if (cursor->low_match >= n_uniq && !page_rec_is_infimum(rec)) {
+ *offsets = rec_get_offsets(rec, cursor->index, *offsets,
+ cursor->index->n_fields,
+ ULINT_UNDEFINED, heap);
+ err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets);
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ rec = page_rec_get_next_const(btr_cur_get_rec(cursor));
+
+ if (cursor->up_match >= n_uniq && !page_rec_is_supremum(rec)) {
+ *offsets = rec_get_offsets(rec, cursor->index, *offsets,
+ cursor->index->n_fields,
+ ULINT_UNDEFINED, heap);
+ err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets);
+ }
+
+ return(err);
+}
+
+/***************************************************************//**
+Checks if a unique key violation error would occur at an index entry
+insert. Sets shared locks on possible duplicate records. Works only
+for a clustered index!
+@retval DB_SUCCESS if no error
+@retval DB_DUPLICATE_KEY if a duplicate key violation is detected
+@retval DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate
+record */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_duplicate_error_in_clust(
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: B-tree cursor */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ rec_t* rec;
+ ulint n_unique;
+ trx_t* trx = thr_get_trx(thr);
+ mem_heap_t*heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(dict_index_is_clust(cursor->index));
+
+	/* NOTE: For unique non-clustered indexes there may be any number
+	of delete marked records with the same value for the non-clustered
+	index key (remember multiversioning), and which differ only in
+	the row reference part of the index record, containing the
+	clustered index key fields. For such a secondary index record,
+	to avoid a race condition, we must FIRST do the insertion and after
+	that check that the uniqueness condition is not breached! */
+
+	/* NOTE: A problem is that node pointers on an upper level of the
+	B-tree may match the entry more closely than the actual existing
+	user records on the leaf level do. So, even if low_match suggests
+	that a duplicate key violation may occur, this may not be the case. */
+
+ n_unique = dict_index_get_n_unique(cursor->index);
+
+ if (cursor->low_match >= n_unique) {
+
+ rec = btr_cur_get_rec(cursor);
+
+ if (!page_rec_is_infimum(rec)) {
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ cursor->index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* We set a lock on the possible duplicate: this
+ is needed in logical logging of MySQL to make
+ sure that in roll-forward we get the same duplicate
+ errors as in original execution */
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+ /* Do nothing if no-locking is set */
+ err = DB_SUCCESS;
+ } else if (trx->duplicates) {
+
+				/* If the SQL query will update or replace a
+				duplicate key, we will take an X-lock on the
+				duplicates (REPLACE, LOAD DATA INFILE REPLACE,
+				INSERT ... ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor), rec,
+ cursor->index, offsets, thr);
+ }
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto func_exit;
+ }
+
+ if (row_ins_dupl_error_with_rec(
+ rec, entry, cursor->index, offsets)) {
+duplicate:
+ trx->error_info = cursor->index;
+ err = DB_DUPLICATE_KEY;
+ if (cursor->index->table->versioned()
+ && entry->vers_history_row())
+ {
+ ulint trx_id_len;
+ byte *trx_id = rec_get_nth_field(
+ rec, offsets, n_unique,
+ &trx_id_len);
+ ut_ad(trx_id_len == DATA_TRX_ID_LEN);
+ if (trx->id == trx_read_trx_id(trx_id)) {
+ err = DB_FOREIGN_DUPLICATE_KEY;
+ }
+ }
+ goto func_exit;
+ }
+ }
+ }
+
+ if (cursor->up_match >= n_unique) {
+
+ rec = page_rec_get_next(btr_cur_get_rec(cursor));
+
+ if (!page_rec_is_supremum(rec)) {
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ cursor->index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (trx->duplicates) {
+
+				/* If the SQL query will update or replace a
+				duplicate key, we will take an X-lock on the
+				duplicates (REPLACE, LOAD DATA INFILE REPLACE,
+				INSERT ... ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, offsets, thr);
+ }
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto func_exit;
+ }
+
+ if (row_ins_dupl_error_with_rec(
+ rec, entry, cursor->index, offsets)) {
+ goto duplicate;
+ }
+ }
+
+ /* This should never happen */
+ ut_error;
+ }
+
+ err = DB_SUCCESS;
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/***************************************************************//**
+Checks if an index entry has a long enough common prefix with an
+existing record so that the intended insert of the entry must be
+changed to a modify of the existing record. In the case of a clustered
+index, the prefix must be n_unique fields long. In the case of a
+secondary index, all fields must be equal. InnoDB never updates
+secondary index records in place, other than clearing or setting the
+delete-mark flag. We could update the non-unique fields
+of a unique secondary index record by checking cursor->up_match,
+but we do not do so, because of possible locking implications.
+@return TRUE if the existing record should be updated; FALSE if not */
+UNIV_INLINE
+ibool
+row_ins_must_modify_rec(
+/*====================*/
+ const btr_cur_t* cursor) /*!< in: B-tree cursor */
+{
+ /* NOTE: (compare to the note in row_ins_duplicate_error_in_clust)
+	Because node pointers on upper levels of the B-tree may match the
+	entry more closely than the actual user records on the leaf level, we
+	have to check if the candidate record is actually a user record.
+ A clustered index node pointer contains index->n_unique first fields,
+ and a secondary index node pointer contains all index fields. */
+
+ return(cursor->low_match
+ >= dict_index_get_n_unique_in_tree(cursor->index)
+ && !page_rec_is_infimum(btr_cur_get_rec(cursor)));
+}
+
+/** Insert the externally stored fields (off-page columns)
+of a clustered index entry.
+@param[in] entry index entry to insert
+@param[in] big_rec externally stored fields
+@param[in,out] offsets rec_get_offsets()
+@param[in,out] heap memory heap
+@param[in] thd client connection, or NULL
+@param[in] index clustered index
+@return error code
+@retval DB_SUCCESS
+@retval DB_OUT_OF_FILE_SPACE */
+static
+dberr_t
+row_ins_index_entry_big_rec(
+ const dtuple_t* entry,
+ const big_rec_t* big_rec,
+ rec_offs* offsets,
+ mem_heap_t** heap,
+ dict_index_t* index,
+ const void* thd __attribute__((unused)))
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ rec_t* rec;
+ dberr_t error;
+
+ ut_ad(dict_index_is_clust(index));
+
+ DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern_latch");
+
+ mtr.start();
+ if (index->table->is_temporary()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(mtr);
+ }
+
+ btr_pcur_open(index, entry, PAGE_CUR_LE, BTR_MODIFY_TREE,
+ &pcur, &mtr);
+ rec = btr_pcur_get_rec(&pcur);
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, heap);
+
+ DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern");
+ error = btr_store_big_rec_extern_fields(
+ &pcur, offsets, big_rec, &mtr, BTR_STORE_INSERT);
+ DEBUG_SYNC_C_IF_THD(thd, "after_row_ins_extern");
+
+ if (error == DB_SUCCESS
+ && dict_index_is_online_ddl(index)) {
+ row_log_table_insert(btr_pcur_get_rec(&pcur), index, offsets);
+ }
+
+ mtr.commit();
+
+ btr_pcur_close(&pcur);
+
+ return(error);
+}
+
+/***************************************************************//**
+Tries to insert an entry into a clustered index, ignoring foreign key
+constraints. If a record with the same unique key is found, the other
+record is necessarily marked deleted by a committed transaction, or a
+unique key violation error occurs. The delete-marked record is then
+updated in place to become the inserted entry, and we must write an undo
+log record for the delete-marked record.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+dberr_t
+row_ins_clust_index_entry_low(
+/*==========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint n_uniq, /*!< in: 0 or index->n_uniq */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* cursor;
+ dberr_t err = DB_SUCCESS;
+ big_rec_t* big_rec = NULL;
+ mtr_t mtr;
+ ib_uint64_t auto_inc = 0;
+ mem_heap_t* offsets_heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ DBUG_ENTER("row_ins_clust_index_entry_low");
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!dict_index_is_unique(index)
+ || n_uniq == dict_index_get_n_unique(index));
+ ut_ad(!n_uniq || n_uniq == dict_index_get_n_unique(index));
+ ut_ad(!thr_get_trx(thr)->in_rollback);
+
+ mtr_start(&mtr);
+
+ if (index->table->is_temporary()) {
+ /* Disable REDO logging as the lifetime of temp-tables is
+ limited to server or connection lifetime and so REDO
+ information is not needed on restart for recovery.
+ Disable locking as temp-tables are local to a connection. */
+
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(!index->table->persistent_autoinc);
+ ut_ad(!index->is_instant());
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(mtr);
+
+ if (UNIV_UNLIKELY(entry->is_metadata())) {
+ ut_ad(index->is_instant());
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(mode == BTR_MODIFY_TREE);
+ } else {
+ if (mode == BTR_MODIFY_LEAF
+ && dict_index_is_online_ddl(index)) {
+ mode = BTR_MODIFY_LEAF_ALREADY_S_LATCHED;
+ mtr_s_lock_index(index, &mtr);
+ }
+
+ if (unsigned ai = index->table->persistent_autoinc) {
+ /* Prepare to persist the AUTO_INCREMENT value
+ from the index entry to PAGE_ROOT_AUTO_INC. */
+ const dfield_t* dfield = dtuple_get_nth_field(
+ entry, ai - 1);
+ if (!dfield_is_null(dfield)) {
+ auto_inc = row_parse_int(
+ static_cast<const byte*>(
+ dfield->data),
+ dfield->len,
+ dfield->type.mtype,
+ dfield->type.prtype
+ & DATA_UNSIGNED);
+ }
+ }
+ }
+ }
+
+	/* Note that we use PAGE_CUR_LE as the search mode, because then
+	the function will return sensible values in both low_match and
+	up_match of the cursor */
+ err = btr_pcur_open_low(index, 0, entry, PAGE_CUR_LE, mode, &pcur,
+ __FILE__, __LINE__, auto_inc, &mtr);
+ if (err != DB_SUCCESS) {
+ index->table->file_unreadable = true;
+ mtr.commit();
+ goto func_exit;
+ }
+
+ cursor = btr_pcur_get_btr_cur(&pcur);
+ cursor->thr = thr;
+
+#ifdef UNIV_DEBUG
+ {
+ page_t* page = btr_cur_get_page(cursor);
+ rec_t* first_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+
+ ut_ad(page_rec_is_supremum(first_rec)
+ || rec_n_fields_is_sane(index, first_rec, entry));
+ }
+#endif /* UNIV_DEBUG */
+
+ if (UNIV_UNLIKELY(entry->info_bits != 0)) {
+ ut_ad(entry->is_metadata());
+ ut_ad(flags == BTR_NO_LOCKING_FLAG);
+ ut_ad(index->is_instant());
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ const rec_t* rec = btr_cur_get_rec(cursor);
+
+ if (rec_get_info_bits(rec, page_rec_is_comp(rec))
+ & REC_INFO_MIN_REC_FLAG) {
+ thr_get_trx(thr)->error_info = index;
+ err = DB_DUPLICATE_KEY;
+ goto err_exit;
+ }
+
+ ut_ad(!row_ins_must_modify_rec(cursor));
+ goto do_insert;
+ }
+
+ if (rec_is_metadata(btr_cur_get_rec(cursor), *index)) {
+ goto do_insert;
+ }
+
+ if (n_uniq
+ && (cursor->up_match >= n_uniq || cursor->low_match >= n_uniq)) {
+
+ if (flags
+ == (BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG)) {
+ /* Set no locks when applying log
+ in online table rebuild. Only check for duplicates. */
+ err = row_ins_duplicate_error_in_clust_online(
+ n_uniq, entry, cursor,
+ &offsets, &offsets_heap);
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ default:
+ ut_ad(0);
+ /* fall through */
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_DUPLICATE_KEY:
+ thr_get_trx(thr)->error_info = cursor->index;
+ }
+ } else {
+ /* Note that the following may return also
+ DB_LOCK_WAIT */
+
+ err = row_ins_duplicate_error_in_clust(
+ flags, cursor, entry, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+err_exit:
+ mtr_commit(&mtr);
+ goto func_exit;
+ }
+ }
+
+	/* Note: Allowing duplicates would qualify for modification of
+	an existing record, as the new entry is exactly the same as the
+	old entry. */
+ if (row_ins_must_modify_rec(cursor)) {
+ /* There is already an index entry with a long enough common
+ prefix, we must convert the insert into a modify of an
+ existing record */
+ mem_heap_t* entry_heap = mem_heap_create(1024);
+
+ err = row_ins_clust_index_entry_by_modify(
+ &pcur, flags, mode, &offsets, &offsets_heap,
+ entry_heap, entry, thr, &mtr);
+
+ if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) {
+ row_log_table_insert(btr_cur_get_rec(cursor),
+ index, offsets);
+ }
+
+ mtr_commit(&mtr);
+ mem_heap_free(entry_heap);
+ } else {
+ if (index->is_instant()) entry->trim(*index);
+do_insert:
+ rec_t* insert_rec;
+
+ if (mode != BTR_MODIFY_TREE) {
+ ut_ad((mode & ulint(~BTR_ALREADY_S_LATCHED))
+ == BTR_MODIFY_LEAF);
+ err = btr_cur_optimistic_insert(
+ flags, cursor, &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ n_ext, thr, &mtr);
+ } else {
+ if (buf_pool.running_out()) {
+ err = DB_LOCK_TABLE_FULL;
+ goto err_exit;
+ }
+
+ DEBUG_SYNC_C("before_insert_pessimitic_row_ins_clust");
+
+ err = btr_cur_optimistic_insert(
+ flags, cursor,
+ &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ n_ext, thr, &mtr);
+
+ if (err == DB_FAIL) {
+ err = btr_cur_pessimistic_insert(
+ flags, cursor,
+ &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ n_ext, thr, &mtr);
+ }
+ }
+
+ if (big_rec != NULL) {
+ mtr_commit(&mtr);
+
+ /* Online table rebuild could read (and
+ ignore) the incomplete record at this point.
+			If online rebuild is in progress,
+			row_ins_index_entry_big_rec() will write the log. */
+
+ DBUG_EXECUTE_IF(
+ "row_ins_extern_checkpoint",
+ log_write_up_to(mtr.commit_lsn(), true););
+ err = row_ins_index_entry_big_rec(
+ entry, big_rec, offsets, &offsets_heap, index,
+ thr_get_trx(thr)->mysql_thd);
+ dtuple_convert_back_big_rec(index, entry, big_rec);
+ } else {
+ if (err == DB_SUCCESS
+ && dict_index_is_online_ddl(index)) {
+ row_log_table_insert(
+ insert_rec, index, offsets);
+ }
+
+ mtr_commit(&mtr);
+ }
+ }
+
+func_exit:
+ if (offsets_heap != NULL) {
+ mem_heap_free(offsets_heap);
+ }
+
+ btr_pcur_close(&pcur);
+
+ DBUG_RETURN(err);
+}
+
+/** Start a mini-transaction and check if the index will be dropped.
+@param[in,out] mtr mini-transaction
+@param[in,out] index secondary index
+@param[in]	check		whether to check the online status of index
+@param[in]	search_mode	BTR_* search mode flags
+@return true if the index is to be dropped */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+row_ins_sec_mtr_start_and_check_if_aborted(
+ mtr_t* mtr,
+ dict_index_t* index,
+ bool check,
+ ulint search_mode)
+{
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(mtr->is_named_space(index->table->space));
+
+ const mtr_log_t log_mode = mtr->get_log_mode();
+
+ mtr->start();
+ index->set_modified(*mtr);
+ mtr->set_log_mode(log_mode);
+
+ if (!check) {
+ return(false);
+ }
+
+ if (search_mode & BTR_ALREADY_S_LATCHED) {
+ mtr_s_lock_index(index, mtr);
+ } else {
+ mtr_sx_lock_index(index, mtr);
+ }
+
+ switch (index->online_status) {
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ ut_ad(!index->is_committed());
+ return(true);
+ case ONLINE_INDEX_COMPLETE:
+ return(false);
+ case ONLINE_INDEX_CREATION:
+ break;
+ }
+
+ ut_error;
+ return(true);
+}
+
+/***************************************************************//**
+Tries to insert an entry into a secondary index. If a record with exactly the
+same fields is found, the other record is necessarily marked deleted.
+It is then unmarked. Otherwise, the entry is just inserted into the index.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+dberr_t
+row_ins_sec_index_entry_low(
+/*========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: secondary index */
+ mem_heap_t* offsets_heap,
+ /*!< in/out: memory heap that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during
+ row_log_table_apply(), or 0 */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ DBUG_ENTER("row_ins_sec_index_entry_low");
+
+ btr_cur_t cursor;
+ ulint search_mode = mode;
+ dberr_t err = DB_SUCCESS;
+ ulint n_unique;
+ mtr_t mtr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+ rtr_info_t rtr_info;
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_MODIFY_TREE);
+
+ cursor.thr = thr;
+ cursor.rtr_info = NULL;
+ ut_ad(thr_get_trx(thr)->id != 0);
+
+ mtr.start();
+
+ if (index->table->is_temporary()) {
+ /* Disable locking, because temporary tables are never
+ shared between transactions or connections. */
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(mtr);
+ if (!dict_index_is_spatial(index)) {
+ search_mode |= BTR_INSERT;
+ }
+ }
+
+	/* Ensure that we acquire index->lock when inserting into an
+	index whose index->online_status == ONLINE_INDEX_COMPLETE, but
+	which could still be subject to rollback_inplace_alter_table().
+ This prevents a concurrent change of index->online_status.
+ The memory object cannot be freed as long as we have an open
+ reference to the table, or index->table->n_ref_count > 0. */
+ const bool check = !index->is_committed();
+ if (check) {
+ DEBUG_SYNC_C("row_ins_sec_index_enter");
+ if (mode == BTR_MODIFY_LEAF) {
+ search_mode |= BTR_ALREADY_S_LATCHED;
+ mtr_s_lock_index(index, &mtr);
+ } else {
+ mtr_sx_lock_index(index, &mtr);
+ }
+
+ if (row_log_online_op_try(
+ index, entry, thr_get_trx(thr)->id)) {
+ goto func_exit;
+ }
+ }
+
+	/* Note that we use PAGE_CUR_LE as the search mode, because then
+	the function will return sensible values in both low_match and
+	up_match of the cursor */
+
+ if (!thr_get_trx(thr)->check_unique_secondary) {
+ search_mode |= BTR_IGNORE_SEC_UNIQUE;
+ }
+
+ if (dict_index_is_spatial(index)) {
+ cursor.index = index;
+ rtr_init_rtr_info(&rtr_info, false, &cursor, index, false);
+ rtr_info_update_btr(&cursor, &rtr_info);
+
+ err = btr_cur_search_to_nth_level(
+ index, 0, entry, PAGE_CUR_RTREE_INSERT,
+ search_mode,
+ &cursor, 0, __FILE__, __LINE__, &mtr);
+
+ if (mode == BTR_MODIFY_LEAF && rtr_info.mbr_adj) {
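+			/* The insert would require adjusting the parent
+			MBR, which a leaf-level latch does not allow;
+			restart the search with BTR_MODIFY_TREE. */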
+ mtr_commit(&mtr);
+ rtr_clean_rtr_info(&rtr_info, true);
+ rtr_init_rtr_info(&rtr_info, false, &cursor,
+ index, false);
+ rtr_info_update_btr(&cursor, &rtr_info);
+ mtr_start(&mtr);
+ index->set_modified(mtr);
+ search_mode &= ulint(~BTR_MODIFY_LEAF);
+ search_mode |= BTR_MODIFY_TREE;
+ err = btr_cur_search_to_nth_level(
+ index, 0, entry, PAGE_CUR_RTREE_INSERT,
+ search_mode,
+ &cursor, 0, __FILE__, __LINE__, &mtr);
+ mode = BTR_MODIFY_TREE;
+ }
+
+ DBUG_EXECUTE_IF(
+ "rtree_test_check_count", {
+ goto func_exit;});
+
+ } else {
+ err = btr_cur_search_to_nth_level(
+ index, 0, entry, PAGE_CUR_LE,
+ search_mode,
+ &cursor, 0, __FILE__, __LINE__, &mtr);
+ }
+
+ if (err != DB_SUCCESS) {
+ if (err == DB_DECRYPTION_FAILED) {
+ ib_push_warning(thr_get_trx(thr)->mysql_thd,
+ DB_DECRYPTION_FAILED,
+ "Table %s is encrypted but encryption service"
+ " or used key_id is not available."
+ " Can't continue reading table.",
+ index->table->name.m_name);
+ index->table->file_unreadable = true;
+ }
+ goto func_exit;
+ }
+
+ if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
+ ut_ad(!dict_index_is_spatial(index));
+ /* The insert was buffered during the search: we are done */
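+ /* The buffered entry will be merged into the leaf page the
+ next time that page is read into the buffer pool. */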
+ goto func_exit;
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ page_t* page = btr_cur_get_page(&cursor);
+ rec_t* first_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+
+ ut_ad(page_rec_is_supremum(first_rec)
+ || rec_n_fields_is_sane(index, first_rec, entry));
+ }
+#endif /* UNIV_DEBUG */
+
+ n_unique = dict_index_get_n_unique(index);
+
+ if (dict_index_is_unique(index)
+ && (cursor.low_match >= n_unique || cursor.up_match >= n_unique)) {
+ mtr_commit(&mtr);
+
+ DEBUG_SYNC_C("row_ins_sec_index_unique");
+
+ if (row_ins_sec_mtr_start_and_check_if_aborted(
+ &mtr, index, check, search_mode)) {
+ goto func_exit;
+ }
+
+ err = row_ins_scan_sec_index_for_duplicate(
+ flags, index, entry, thr, check, &mtr, offsets_heap);
+
+ mtr_commit(&mtr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ case DB_DUPLICATE_KEY:
+ if (!index->is_committed()) {
+ ut_ad(!thr_get_trx(thr)
+ ->dict_operation_lock_mode);
+ mutex_enter(&dict_sys.mutex);
+ dict_set_corrupted_index_cache_only(index);
+ mutex_exit(&dict_sys.mutex);
+ /* Do not return any error to the
+ caller. The duplicate will be reported
+ by ALTER TABLE or CREATE UNIQUE INDEX.
+ Unfortunately we cannot report the
+ duplicate key value to the DDL thread,
+ because the altered_table object is
+ private to its call stack. */
+ err = DB_SUCCESS;
+ }
+ /* fall through */
+ default:
+ if (dict_index_is_spatial(index)) {
+ rtr_clean_rtr_info(&rtr_info, true);
+ }
+ DBUG_RETURN(err);
+ }
+
+ if (row_ins_sec_mtr_start_and_check_if_aborted(
+ &mtr, index, check, search_mode)) {
+ goto func_exit;
+ }
+
+ DEBUG_SYNC_C("row_ins_sec_index_entry_dup_locks_created");
+
+ /* We did not find a duplicate and we have now
+ locked with s-locks the necessary records to
+ prevent any insertion of a duplicate by another
+ transaction. Let us now reposition the cursor and
+ continue the insertion. */
+ btr_cur_search_to_nth_level(
+ index, 0, entry, PAGE_CUR_LE,
+ (search_mode
+ & ~(BTR_INSERT | BTR_IGNORE_SEC_UNIQUE)),
+ &cursor, 0, __FILE__, __LINE__, &mtr);
+ }
+
+ if (row_ins_must_modify_rec(&cursor)) {
+ /* There is already an index entry with a long enough common
+ prefix, we must convert the insert into a modify of an
+ existing record */
+ offsets = rec_get_offsets(
+ btr_cur_get_rec(&cursor), index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &offsets_heap);
+
+ err = row_ins_sec_index_entry_by_modify(
+ flags, mode, &cursor, &offsets,
+ offsets_heap, heap, entry, thr, &mtr);
+
+ if (err == DB_SUCCESS && dict_index_is_spatial(index)
+ && rtr_info.mbr_adj) {
+ err = rtr_ins_enlarge_mbr(&cursor, &mtr);
+ }
+ } else {
+ rec_t* insert_rec;
+ big_rec_t* big_rec;
+
+ if (mode == BTR_MODIFY_LEAF) {
+ err = btr_cur_optimistic_insert(
+ flags, &cursor, &offsets, &offsets_heap,
+ entry, &insert_rec,
+ &big_rec, 0, thr, &mtr);
+ if (err == DB_SUCCESS
+ && dict_index_is_spatial(index)
+ && rtr_info.mbr_adj) {
+ err = rtr_ins_enlarge_mbr(&cursor, &mtr);
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ if (buf_pool.running_out()) {
+ err = DB_LOCK_TABLE_FULL;
+ goto func_exit;
+ }
+
+ err = btr_cur_optimistic_insert(
+ flags, &cursor,
+ &offsets, &offsets_heap,
+ entry, &insert_rec,
+ &big_rec, 0, thr, &mtr);
+ if (err == DB_FAIL) {
+ err = btr_cur_pessimistic_insert(
+ flags, &cursor,
+ &offsets, &offsets_heap,
+ entry, &insert_rec,
+ &big_rec, 0, thr, &mtr);
+ }
+ if (err == DB_SUCCESS
+ && dict_index_is_spatial(index)
+ && rtr_info.mbr_adj) {
+ err = rtr_ins_enlarge_mbr(&cursor, &mtr);
+ }
+ }
+
+ if (err == DB_SUCCESS && trx_id) {
+ page_update_max_trx_id(
+ btr_cur_get_block(&cursor),
+ btr_cur_get_page_zip(&cursor),
+ trx_id, &mtr);
+ }
+
+ ut_ad(!big_rec);
+ }
+
+func_exit:
+ if (dict_index_is_spatial(index)) {
+ rtr_clean_rtr_info(&rtr_info, true);
+ }
+
+ mtr_commit(&mtr);
+ DBUG_RETURN(err);
+}
+
+/***************************************************************//**
+Inserts an entry into a clustered index. Tries first an optimistic,
+then a pessimistic descent down the tree. If the entry matches a
+delete-marked record closely enough, performs the insert by updating
+or delete-unmarking the delete-marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+dberr_t
+row_ins_clust_index_entry(
+/*======================*/
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ dberr_t err;
+ ulint n_uniq;
+
+ DBUG_ENTER("row_ins_clust_index_entry");
+
+ if (!index->table->foreign_set.empty()) {
+ err = row_ins_check_foreign_constraints(
+ index->table, index, true, entry, thr);
+ if (err != DB_SUCCESS) {
+
+ DBUG_RETURN(err);
+ }
+ }
+
+ n_uniq = dict_index_is_unique(index) ? index->n_uniq : 0;
+
+#ifdef WITH_WSREP
+ const bool skip_locking
+ = wsrep_thd_skip_locking(thr_get_trx(thr)->mysql_thd);
+ ulint flags = index->table->no_rollback() ? BTR_NO_ROLLBACK
+ : (index->table->is_temporary() || skip_locking)
+ ? BTR_NO_LOCKING_FLAG : 0;
+#ifdef UNIV_DEBUG
+ if (skip_locking && strcmp(wsrep_get_sr_table_name(),
+ index->table->name.m_name)) {
+ WSREP_ERROR("Record locking is disabled in this thread, "
+ "but the table being modified is not "
+ "`%s`: `%s`.", wsrep_get_sr_table_name(),
+ index->table->name.m_name);
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+#else
+ ulint flags = index->table->no_rollback() ? BTR_NO_ROLLBACK
+ : index->table->is_temporary()
+ ? BTR_NO_LOCKING_FLAG : 0;
+#endif /* WITH_WSREP */
+ const ulint orig_n_fields = entry->n_fields;
+
+ /* For an intermediate table during a copying ALTER TABLE,
+ skip undo logging and record lock checking for the
+ insert operation. */
+ if (index->table->skip_alter_undo) {
+ flags |= BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG;
+ }
+
+ /* Try first optimistic descent to the B-tree */
+ log_free_check();
+
+ err = row_ins_clust_index_entry_low(
+ flags, BTR_MODIFY_LEAF, index, n_uniq, entry,
+ n_ext, thr);
+
+ entry->n_fields = orig_n_fields;
+
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "after_row_ins_clust_index_entry_leaf");
+
+ if (err != DB_FAIL) {
+ DEBUG_SYNC_C("row_ins_clust_index_entry_leaf_after");
+ DBUG_RETURN(err);
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+ log_free_check();
+
+ err = row_ins_clust_index_entry_low(
+ flags, BTR_MODIFY_TREE, index, n_uniq, entry,
+ n_ext, thr);
+
+ entry->n_fields = orig_n_fields;
+
+ DBUG_RETURN(err);
+}
+
+/***************************************************************//**
+Inserts an entry into a secondary index. Tries first an optimistic,
+then a pessimistic descent down the tree. If the entry matches a
+delete-marked record closely enough, performs the insert by updating
+or delete-unmarking the delete-marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+dberr_t
+row_ins_sec_index_entry(
+/*====================*/
+ dict_index_t* index, /*!< in: secondary index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ bool check_foreign) /*!< in: true if foreign key
+ constraints need to be checked,
+ false otherwise */
+{
+ dberr_t err;
+ mem_heap_t* offsets_heap;
+ mem_heap_t* heap;
+ trx_id_t trx_id = 0;
+
+ DBUG_EXECUTE_IF("row_ins_sec_index_entry_timeout", {
+ DBUG_SET("-d,row_ins_sec_index_entry_timeout");
+ return(DB_LOCK_WAIT);});
+
+ if (check_foreign && !index->table->foreign_set.empty()) {
+ err = row_ins_check_foreign_constraints(index->table, index,
+ false, entry, thr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(thr_get_trx(thr)->id != 0);
+
+ offsets_heap = mem_heap_create(1024);
+ heap = mem_heap_create(1024);
+
+ /* Try first optimistic descent to the B-tree */
+
+ log_free_check();
+ ulint flags = index->table->is_temporary()
+ ? BTR_NO_LOCKING_FLAG
+ : 0;
+
+ /* For an intermediate table during a copying ALTER TABLE,
+ skip undo logging and record lock checking for the
+ insert operation. */
+ if (index->table->skip_alter_undo) {
+ trx_id = thr_get_trx(thr)->id;
+ flags |= BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG;
+ }
+
+ err = row_ins_sec_index_entry_low(
+ flags, BTR_MODIFY_LEAF, index, offsets_heap, heap, entry,
+ trx_id, thr);
+ if (err == DB_FAIL) {
+ mem_heap_empty(heap);
+
+ if (index->table->space == fil_system.sys_space
+ && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) {
+ ibuf_free_excess_pages();
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+ log_free_check();
+
+ err = row_ins_sec_index_entry_low(
+ flags, BTR_MODIFY_TREE, index,
+ offsets_heap, heap, entry, 0, thr);
+ }
+
+ mem_heap_free(heap);
+ mem_heap_free(offsets_heap);
+ return(err);
+}
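+
+/* A note on the retry pattern above: BTR_MODIFY_LEAF latches only the
+leaf page, and btr_cur_optimistic_insert() returns DB_FAIL if the page
+would have to be split; the insert is then retried with BTR_MODIFY_TREE,
+which latches enough of the tree to split or merge pages. */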
+
+/***************************************************************//**
+Inserts an index entry into an index. Tries first an optimistic, then a
+pessimistic descent down the tree. If the entry matches a delete-marked
+record closely enough, performs the insert by updating or delete-unmarking
+the delete-marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+static
+dberr_t
+row_ins_index_entry(
+/*================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(thr_get_trx(thr)->id || index->table->no_rollback()
+ || index->table->is_temporary());
+
+ DBUG_EXECUTE_IF("row_ins_index_entry_timeout", {
+ DBUG_SET("-d,row_ins_index_entry_timeout");
+ return(DB_LOCK_WAIT);});
+
+ if (index->is_primary()) {
+ return row_ins_clust_index_entry(index, entry, thr, 0);
+ } else {
+ return row_ins_sec_index_entry(index, entry, thr);
+ }
+}
+
+
+/*****************************************************************//**
+This function generates the MBR (Minimum Bounding Box) for a spatial
+object and sets it in the spatial index field. */
+static
+void
+row_ins_spatial_index_entry_set_mbr_field(
+/*======================================*/
+ dfield_t* field, /*!< in/out: mbr field */
+ const dfield_t* row_field) /*!< in: row field */
+{
+ ulint dlen = 0;
+ double mbr[SPDIMS * 2];
+
+ /* This must be a GEOMETRY datatype */
+ ut_ad(DATA_GEOMETRY_MTYPE(field->type.mtype));
+
+ const byte* dptr = static_cast<const byte*>(
+ dfield_get_data(row_field));
+ dlen = dfield_get_len(row_field);
+
+ /* obtain the MBR */
+ rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE,
+ static_cast<uint>(dlen - GEO_DATA_HEADER_SIZE),
+ SPDIMS, mbr);
+
+ /* Set mbr as index entry data */
+ dfield_write_mbr(field, mbr);
+}
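+
+/* For reference: dfield_write_mbr() stores the MBR as SPDIMS * 2
+consecutive doubles, ordered min,max per dimension, so in the
+two-dimensional case the stored field is xmin, xmax, ymin, ymax. */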
+
+/** Sets the values of the dtuple fields in entry from the values of appropriate
+columns in row.
+@param[in] index index handler
+@param[out] entry index entry to make
+@param[in] row row
+@return DB_SUCCESS if the set is successful */
+static
+dberr_t
+row_ins_index_entry_set_vals(
+ const dict_index_t* index,
+ dtuple_t* entry,
+ const dtuple_t* row)
+{
+ ulint n_fields;
+ ulint i;
+ ulint num_v = dtuple_get_n_v_fields(entry);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ for (i = 0; i < n_fields + num_v; i++) {
+ dict_field_t* ind_field = NULL;
+ dfield_t* field;
+ const dfield_t* row_field;
+ ulint len;
+ dict_col_t* col;
+
+ if (i >= n_fields) {
+ /* This is a virtual field */
+ field = dtuple_get_nth_v_field(entry, i - n_fields);
+ col = &dict_table_get_nth_v_col(
+ index->table, i - n_fields)->m_col;
+ } else {
+ field = dtuple_get_nth_field(entry, i);
+ ind_field = dict_index_get_nth_field(index, i);
+ col = ind_field->col;
+ }
+
+ if (col->is_virtual()) {
+ const dict_v_col_t* v_col
+ = reinterpret_cast<const dict_v_col_t*>(col);
+ ut_ad(dtuple_get_n_fields(row)
+ == dict_table_get_n_cols(index->table));
+ row_field = dtuple_get_nth_v_field(row, v_col->v_pos);
+ } else if (col->is_dropped()) {
+ ut_ad(index->is_primary());
+
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ field->data = NULL;
+ field->len = UNIV_SQL_NULL;
+ field->type.prtype = DATA_BINARY_TYPE;
+ } else {
+ ut_ad(ind_field->fixed_len <= col->len);
+ dfield_set_data(field, field_ref_zero,
+ ind_field->fixed_len);
+ field->type.prtype = DATA_NOT_NULL;
+ }
+
+ field->type.mtype = col->len
+ ? DATA_FIXBINARY : DATA_BINARY;
+ continue;
+ } else {
+ row_field = dtuple_get_nth_field(
+ row, ind_field->col->ind);
+ }
+
+ len = dfield_get_len(row_field);
+
+ /* Check column prefix indexes */
+ if (ind_field != NULL && ind_field->prefix_len > 0
+ && len != UNIV_SQL_NULL) {
+
+ const dict_col_t* col
+ = dict_field_get_col(ind_field);
+
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminlen, col->mbmaxlen,
+ ind_field->prefix_len,
+ len,
+ static_cast<const char*>(
+ dfield_get_data(row_field)));
+
+ ut_ad(!dfield_is_ext(row_field));
+ }
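+
+ /* Example (assuming utf8mb3, mbmaxlen = 3): a 10-character
+ prefix index has prefix_len = 30 bytes, and the call above
+ truncates the value to at most 10 characters without ever
+ splitting a multi-byte sequence. */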
+
+ /* Handle spatial index. For the first field, replace
+ the data with its MBR (Minimum Bounding Box). */
+ if ((i == 0) && dict_index_is_spatial(index)) {
+ if (!row_field->data
+ || row_field->len < GEO_DATA_HEADER_SIZE) {
+ return(DB_CANT_CREATE_GEOMETRY_OBJECT);
+ }
+ row_ins_spatial_index_entry_set_mbr_field(
+ field, row_field);
+ continue;
+ }
+
+ dfield_set_data(field, dfield_get_data(row_field), len);
+ if (dfield_is_ext(row_field)) {
+ ut_ad(dict_index_is_clust(index));
+ dfield_set_ext(field);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Inserts a single index entry to the table.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_index_entry_step(
+/*=====================*/
+ ins_node_t* node, /*!< in: row insert node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+
+ DBUG_ENTER("row_ins_index_entry_step");
+
+ ut_ad(dtuple_check_typed(node->row));
+
+ err = row_ins_index_entry_set_vals(node->index, *node->entry,
+ node->row);
+
+ if (err != DB_SUCCESS) {
+ DBUG_RETURN(err);
+ }
+
+ ut_ad(dtuple_check_typed(*node->entry));
+
+ err = row_ins_index_entry(node->index, *node->entry, thr);
+
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "after_row_ins_index_entry_step");
+
+ DBUG_RETURN(err);
+}
+
+/***********************************************************//**
+Allocates a row id for the row if the clustered index is not unique. */
+UNIV_INLINE
+void
+row_ins_alloc_row_id_step(
+/*======================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ row_id_t row_id;
+
+ ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
+
+ if (dict_index_is_unique(dict_table_get_first_index(node->table))) {
+
+ /* No row id is stored if the clustered index is unique */
+
+ return;
+ }
+
+ /* Fill in row id value to row */
+
+ row_id = dict_sys_get_new_row_id();
+
+ dict_sys_write_row_id(node->sys_buf, row_id);
+}
+
+/***********************************************************//**
+Gets a row to insert from the values list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_values(
+/*========================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+ /* The field values are copied into the buffers of the value
+ expression nodes and remain valid until the expressions are
+ evaluated again: therefore we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->values_list;
+
+ while (list_node) {
+ eval_exp(list_node);
+
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***********************************************************//**
+Gets a row to insert from the select list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_select(
+/*========================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+ /* The field values are copied into the buffers of the select node
+ and remain valid until we fetch from the select again: therefore
+ we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->select->select_list;
+
+ while (list_node) {
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+inline
+bool ins_node_t::vers_history_row() const
+{
+ if (!table->versioned())
+ return false;
+ dfield_t* row_end = dtuple_get_nth_field(row, table->vers_end);
+ return row_end->vers_history_row();
+}
+
+
+/***********************************************************//**
+Inserts a row to a table.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins(
+/*====*/
+ ins_node_t* node, /*!< in: row insert node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ DBUG_ENTER("row_ins");
+
+ DBUG_PRINT("row_ins", ("table: %s", node->table->name.m_name));
+
+ if (node->state == INS_NODE_ALLOC_ROW_ID) {
+
+ row_ins_alloc_row_id_step(node);
+
+ node->index = dict_table_get_first_index(node->table);
+ ut_ad(node->entry_list.empty() == false);
+ node->entry = node->entry_list.begin();
+
+ if (node->ins_type == INS_SEARCHED) {
+
+ row_ins_get_row_from_select(node);
+
+ } else if (node->ins_type == INS_VALUES) {
+
+ row_ins_get_row_from_values(node);
+ }
+
+ node->state = INS_NODE_INSERT_ENTRIES;
+ }
+
+ ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
+
+ while (node->index != NULL) {
+ dict_index_t *index = node->index;
+ /*
+ We do not insert history rows into FTS_DOC_ID_INDEX because
+ it is unique by FTS_DOC_ID only and we do not want to add
+ row_end to the unique key. Fulltext works in such a way that
+ a new FTS_DOC_ID is created on every fulltext UPDATE, so
+ holding only FTS_DOC_ID for history rows is enough.
+ */
+ const unsigned type = index->type;
+ if (index->type & DICT_FTS) {
+ } else if (!(type & DICT_UNIQUE) || index->n_uniq > 1
+ || !node->vers_history_row()) {
+
+ dberr_t err = row_ins_index_entry_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+ DBUG_RETURN(err);
+ }
+ } else {
+ /* Unique indexes with system versioning must contain
+ the version end column. The only exception is a hidden
+ FTS_DOC_ID_INDEX that InnoDB may create on a hidden or
+ user-created FTS_DOC_ID column. */
+ ut_ad(!strcmp(index->name, FTS_DOC_ID_INDEX_NAME));
+ ut_ad(!strcmp(index->fields[0].name, FTS_DOC_ID_COL_NAME));
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ ++node->entry;
+
+ /* Skip corrupted secondary index and its entry */
+ while (node->index && node->index->is_corrupted()) {
+ node->index = dict_table_get_next_index(node->index);
+ ++node->entry;
+ }
+ }
+
+ ut_ad(node->entry == node->entry_list.end());
+
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ DBUG_RETURN(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_ins_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ins_node_t* node;
+ que_node_t* parent;
+ sel_node_t* sel_node;
+ trx_t* trx;
+ dberr_t err;
+
+ ut_ad(thr);
+
+ DEBUG_SYNC_C("innodb_row_ins_step_enter");
+
+ trx = thr_get_trx(thr);
+
+ node = static_cast<ins_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_INSERT);
+
+ parent = que_node_get_parent(node);
+ sel_node = node->select;
+
+ if (thr->prev_node == parent) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ }
+
+ /* If this is the first time this node is executed (or when
+ execution resumes after wait for the table IX lock), set an
+ IX lock on the table and reset the possible select node. MySQL's
+ partitioned table code may also call an insert within the same
+ SQL statement AFTER it has used this table handle to do a search.
+ This happens, for example, when a row update moves it to another
+ partition. In that case, we have already set the IX lock on the
+ table during the search operation, and there is no need to set
+ it again here. But we must write trx->id to node->sys_buf. */
+
+ if (node->table->no_rollback()) {
+ /* No-rollback tables should only be written to by a
+ single thread at a time, but there can be multiple
+ concurrent readers. We must hold an open table handle. */
+ DBUG_ASSERT(node->table->get_ref_count() > 0);
+ DBUG_ASSERT(node->ins_type == INS_DIRECT);
+ /* No-rollback tables can consist only of a single index. */
+ DBUG_ASSERT(node->entry_list.size() == 1);
+ DBUG_ASSERT(UT_LIST_GET_LEN(node->table->indexes) == 1);
+ /* There should be no possibility for interruption and
+ restarting here. In theory, we could allow resumption
+ from the INS_NODE_INSERT_ENTRIES state here. */
+ DBUG_ASSERT(node->state == INS_NODE_SET_IX_LOCK);
+ node->index = dict_table_get_first_index(node->table);
+ node->entry = node->entry_list.begin();
+ node->state = INS_NODE_INSERT_ENTRIES;
+ goto do_insert;
+ }
+
+ if (UNIV_LIKELY(!node->table->skip_alter_undo)) {
+ trx_write_trx_id(&node->sys_buf[DATA_TRX_ID_LEN], trx->id);
+ }
+
+ if (node->state == INS_NODE_SET_IX_LOCK) {
+
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ if (node->table->is_temporary()) {
+ node->trx_id = trx->id;
+ }
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ if (trx->id == node->trx_id) {
+ /* No need to do IX-locking */
+
+ goto same_trx;
+ }
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ DBUG_EXECUTE_IF("ib_row_ins_ix_lock_wait",
+ err = DB_LOCK_WAIT;);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+
+ node->trx_id = trx->id;
+same_trx:
+ if (node->ins_type == INS_SEARCHED) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ if ((node->ins_type == INS_SEARCHED)
+ && (sel_node->state != SEL_NODE_FETCH)) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to insert */
+ thr->run_node = parent;
+
+ return(thr);
+ }
+do_insert:
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_ins(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* err == DB_LOCK_WAIT or SQL error detected */
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->ins_type == INS_SEARCHED) {
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+ } else {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc
new file mode 100644
index 00000000..336b5a27
--- /dev/null
+++ b/storage/innobase/row/row0log.cc
@@ -0,0 +1,4053 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0log.cc
+Modification log for online index creation and online table rebuild
+
+Created 2011-05-26 Marko Makela
+*******************************************************/
+
+#include "row0log.h"
+#include "row0row.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "row0merge.h"
+#include "row0ext.h"
+#include "log0crypt.h"
+#include "data0data.h"
+#include "que0que.h"
+#include "srv0mon.h"
+#include "handler0alter.h"
+#include "ut0stage.h"
+#include "trx0rec.h"
+
+#include <sql_class.h>
+#include <algorithm>
+#include <map>
+
+Atomic_counter<ulint> onlineddl_rowlog_rows;
+ulint onlineddl_rowlog_pct_used;
+ulint onlineddl_pct_progress;
+
+/** Table row modification operations during online table rebuild.
+Delete-marked records are not copied to the rebuilt table. */
+enum row_tab_op {
+ /** Insert a record */
+ ROW_T_INSERT = 0x41,
+ /** Update a record in place */
+ ROW_T_UPDATE,
+ /** Delete (purge) a record */
+ ROW_T_DELETE
+};
+
+/** Index record modification operations during online index creation */
+enum row_op {
+ /** Insert a record */
+ ROW_OP_INSERT = 0x61,
+ /** Delete a record */
+ ROW_OP_DELETE
+};
+
+/** Size of the modification log entry header, in bytes */
+#define ROW_LOG_HEADER_SIZE 2/*op, extra_size*/
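+
+/* A sketch of the secondary-index log record format produced by
+row_log_online_op() below:
+
+ 1 byte op (ROW_OP_INSERT or ROW_OP_DELETE)
+ 6 bytes DB_TRX_ID (DATA_TRX_ID_LEN; ROW_OP_INSERT only)
+ 1-2 bytes extra_size (two bytes with the high bit set if >= 0x80)
+ extra_size + data bytes the record in temporary format
+*/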
+
+/** Log block for modifications during online ALTER TABLE */
+struct row_log_buf_t {
+ byte* block; /*!< file block buffer */
+ size_t size; /*!< length of block in bytes */
+ ut_new_pfx_t block_pfx; /*!< opaque descriptor of "block". Set
+ by ut_allocator::allocate_large() and fed to
+ ut_allocator::deallocate_large(). */
+ mrec_buf_t buf; /*!< buffer for accessing a record
+ that spans two blocks */
+ ulint blocks; /*!< current position in blocks */
+ ulint bytes; /*!< current position within block */
+ ulonglong total; /*!< logical position, in bytes from
+ the start of the row_log_table log;
+ 0 for row_log_online_op() and
+ row_log_apply(). */
+};
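+
+/* Note: assuming every record is accounted in "total" (as
+row_log_table_close_func() does), the writer position satisfies
+total == blocks * srv_sort_buf_size + bytes. */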
+
+/** Tracks BLOB allocation during online ALTER TABLE */
+class row_log_table_blob_t {
+public:
+ /** Constructor (declaring a BLOB freed)
+ @param offset_arg row_log_t::tail::total */
+#ifdef UNIV_DEBUG
+ row_log_table_blob_t(ulonglong offset_arg) :
+ old_offset (0), free_offset (offset_arg),
+ offset (BLOB_FREED) {}
+#else /* UNIV_DEBUG */
+ row_log_table_blob_t() :
+ offset (BLOB_FREED) {}
+#endif /* UNIV_DEBUG */
+
+ /** Declare a BLOB freed again.
+ @param offset_arg row_log_t::tail::total */
+#ifdef UNIV_DEBUG
+ void blob_free(ulonglong offset_arg)
+#else /* UNIV_DEBUG */
+ void blob_free()
+#endif /* UNIV_DEBUG */
+ {
+ ut_ad(offset < offset_arg);
+ ut_ad(offset != BLOB_FREED);
+ ut_d(old_offset = offset);
+ ut_d(free_offset = offset_arg);
+ offset = BLOB_FREED;
+ }
+ /** Declare a freed BLOB reused.
+ @param offset_arg row_log_t::tail::total */
+ void blob_alloc(ulonglong offset_arg) {
+ ut_ad(free_offset <= offset_arg);
+ ut_d(old_offset = offset);
+ offset = offset_arg;
+ }
+ /** Determine if a BLOB was freed at a given log position
+ @param offset_arg row_log_t::head::total after the log record
+ @return true if freed */
+ bool is_freed(ulonglong offset_arg) const {
+ /* This is supposed to be the offset at the end of the
+ current log record. */
+ ut_ad(offset_arg > 0);
+ /* We should never get anywhere close the magic value. */
+ ut_ad(offset_arg < BLOB_FREED);
+ return(offset_arg < offset);
+ }
+private:
+ /** Magic value for a freed BLOB */
+ static const ulonglong BLOB_FREED = ~0ULL;
+#ifdef UNIV_DEBUG
+ /** Old offset, in case a page was freed, reused, freed, ... */
+ ulonglong old_offset;
+ /** Offset of last blob_free() */
+ ulonglong free_offset;
+#endif /* UNIV_DEBUG */
+ /** Byte offset to the log file */
+ ulonglong offset;
+};
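+
+/* Lifecycle sketch: a page number starts out untracked (safe to
+access). blob_free() sets offset = BLOB_FREED, so is_freed() holds
+for any log position; blob_alloc(pos) records the reuse position,
+after which is_freed(x) holds only for x < pos, that is, only for
+log records written before the page was reallocated. */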
+
+/** @brief Map of off-page column page numbers to their BLOB tracking state.
+
+If there is no mapping for a page number, the page is safe to access.
+If a page number is marked freed (row_log_table_blob_t::is_freed()),
+it is an off-page column page that has been freed.
+Otherwise, the mapping records a byte offset into index->online_log,
+indicating that the page is safe to access when applying log records
+starting from that offset. */
+typedef std::map<
+ ulint,
+ row_log_table_blob_t,
+ std::less<ulint>,
+ ut_allocator<std::pair<const ulint, row_log_table_blob_t> > >
+ page_no_map;
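+
+/* Consultation sketch (hypothetical reader-side code, with "blobs"
+a page_no_map* and "pos" the current log offset):
+
+ page_no_map::const_iterator p = blobs->find(page_no);
+ bool unsafe = p != blobs->end() && p->second.is_freed(pos);
+
+A page with no mapping is always safe to access. */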
+
+/** @brief Buffer for logging modifications during online index creation
+
+All modifications to an index that is being created will be logged by
+row_log_online_op() to this buffer.
+
+All modifications to a table that is being rebuilt will be logged by
+row_log_table_delete(), row_log_table_update(), row_log_table_insert()
+to this buffer.
+
+When head.blocks == tail.blocks, the reader will access tail.block
+directly. When also head.bytes == tail.bytes, both counts will be
+reset to 0 and the file will be truncated. */
+struct row_log_t {
+ pfs_os_file_t fd; /*!< file descriptor */
+ ib_mutex_t mutex; /*!< mutex protecting error,
+ max_trx and tail */
+ page_no_map* blobs; /*!< map of page numbers of off-page columns
+ that have been freed during table-rebuilding
+ ALTER TABLE (row_log_table_*); protected by
+ index->lock X-latch only */
+ dict_table_t* table; /*!< table that is being rebuilt,
+ or NULL when this is a secondary
+ index that is being created online */
+ bool same_pk;/*!< whether the definition of the PRIMARY KEY
+ has remained the same */
+ const dtuple_t* defaults;
+ /*!< default values of added, changed columns,
+ or NULL */
+ const ulint* col_map;/*!< mapping of old column numbers to
+ new ones, or NULL if !table */
+ dberr_t error; /*!< error that occurred during online
+ table rebuild */
+ /** The transaction ID of the ALTER TABLE transaction. Any
+ concurrent DML would necessarily be logged with a larger
+ transaction ID, because ha_innobase::prepare_inplace_alter_table()
+ acts as a barrier that ensures that any concurrent transaction
+ that operates on the table would have been started after
+ ha_innobase::prepare_inplace_alter_table() returns and before
+ ha_innobase::commit_inplace_alter_table(commit=true) is invoked.
+
+ Due to the nondeterministic nature of purge and due to the
+ possibility of upgrading from an earlier version of MariaDB
+ or MySQL, it is possible that row_log_table_low() would be
+ fed a DB_TRX_ID that precedes min_trx. We must normalize
+ such references to reset_trx_id[]. */
+ trx_id_t min_trx;
+ trx_id_t max_trx;/*!< biggest observed trx_id in
+ row_log_online_op();
+ protected by mutex and index->lock S-latch,
+ or by index->lock X-latch only */
+ row_log_buf_t tail; /*!< writer context;
+ protected by mutex and index->lock S-latch,
+ or by index->lock X-latch only */
+ size_t crypt_tail_size; /*!< size of crypt_tail */
+ byte* crypt_tail; /*!< writer context;
+ temporary buffer used for encryption
+ and decryption, or NULL */
+ row_log_buf_t head; /*!< reader context; protected by MDL only;
+ modifiable by row_log_apply_ops() */
+ size_t crypt_head_size; /*!< size of crypt_head */
+ byte* crypt_head; /*!< reader context;
+ temporary buffer used for encryption
+ and decryption, or NULL */
+ const char* path; /*!< where to create temporary file during
+ log operation */
+ /** the number of core fields in the clustered index of the
+ source table; before row_log_table_apply() completes, the
+ table could be emptied, so that table->is_instant() no longer holds,
+ but all log records must be in the "instant" format. */
+ unsigned n_core_fields;
+ /** the default values of non-core fields when the operation started */
+ dict_col_t::def_t* non_core_fields;
+ bool allow_not_null; /*!< Whether the alter ignore is being
+ used or if the sql mode is non-strict mode;
+ if not, NULL values will not be converted to
+ defaults */
+ const TABLE* old_table; /*!< old table, used in case of error */
+
+ uint64_t n_rows; /*!< number of rows read from the table */
+ /** Determine whether the log should be in the 'instant ADD' format
+ @param[in] index the clustered index of the source table
+ @return whether to use the 'instant ADD COLUMN' format */
+ bool is_instant(const dict_index_t* index) const
+ {
+ ut_ad(table);
+ ut_ad(n_core_fields <= index->n_fields);
+ return n_core_fields != index->n_fields;
+ }
+
+ const byte* instant_field_value(ulint n, ulint* len) const
+ {
+ ut_ad(n >= n_core_fields);
+ const dict_col_t::def_t& d= non_core_fields[n - n_core_fields];
+ *len = d.len;
+ return static_cast<const byte*>(d.data);
+ }
+};
+
+/** Create the online log file if it does not exist.
+@param[in,out] log online rebuild log
+@return the log file descriptor, or OS_FILE_CLOSED on failure */
+static MY_ATTRIBUTE((warn_unused_result))
+pfs_os_file_t
+row_log_tmpfile(
+ row_log_t* log)
+{
+ DBUG_ENTER("row_log_tmpfile");
+ if (log->fd == OS_FILE_CLOSED) {
+ log->fd = row_merge_file_create_low(log->path);
+ DBUG_EXECUTE_IF("row_log_tmpfile_fail",
+ if (log->fd != OS_FILE_CLOSED)
+ row_merge_file_destroy_low(log->fd);
+ log->fd = OS_FILE_CLOSED;);
+ if (log->fd != OS_FILE_CLOSED) {
+ MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_LOG_FILES);
+ }
+ }
+
+ DBUG_RETURN(log->fd);
+}
+
+/** Allocate the memory for the log buffer.
+@param[in,out] log_buf Buffer used for log operation
+@return true on success, false on failure */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+row_log_block_allocate(
+ row_log_buf_t& log_buf)
+{
+ DBUG_ENTER("row_log_block_allocate");
+ if (log_buf.block == NULL) {
+ DBUG_EXECUTE_IF(
+ "simulate_row_log_allocation_failure",
+ DBUG_RETURN(false);
+ );
+
+ log_buf.block = ut_allocator<byte>(mem_key_row_log_buf)
+ .allocate_large(srv_sort_buf_size,
+ &log_buf.block_pfx);
+
+ if (log_buf.block == NULL) {
+ DBUG_RETURN(false);
+ }
+ log_buf.size = srv_sort_buf_size;
+ }
+ DBUG_RETURN(true);
+}
+
+/** Free the log buffer.
+@param[in,out] log_buf Buffer used for log operation */
+static
+void
+row_log_block_free(
+ row_log_buf_t& log_buf)
+{
+ DBUG_ENTER("row_log_block_free");
+ if (log_buf.block != NULL) {
+ ut_allocator<byte>(mem_key_row_log_buf).deallocate_large(
+ log_buf.block, &log_buf.block_pfx);
+ log_buf.block = NULL;
+ }
+ DBUG_VOID_RETURN;
+}
+
+/******************************************************//**
+Logs an operation to a secondary index that is (or was) being created. */
+void
+row_log_online_op(
+/*==============*/
+ dict_index_t* index, /*!< in/out: index, S or X latched */
+ const dtuple_t* tuple, /*!< in: index tuple */
+ trx_id_t trx_id) /*!< in: transaction ID for insert,
+ or 0 for delete */
+{
+ byte* b;
+ ulint extra_size;
+ ulint size;
+ ulint mrec_size;
+ ulint avail_size;
+ row_log_t* log;
+
+ ut_ad(dtuple_validate(tuple));
+ ut_ad(dtuple_get_n_fields(tuple) == dict_index_get_n_fields(index));
+ ut_ad(rw_lock_own_flagged(&index->lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+
+ if (index->is_corrupted()) {
+ return;
+ }
+
+ ut_ad(dict_index_is_online_ddl(index));
+
+ /* Compute the size of the record. This differs from
+ row_merge_buf_encode(), because here we do not encode
+ extra_size+1 (and reserve 0 as the end-of-chunk marker). */
+
+ size = rec_get_converted_size_temp<false>(
+ index, tuple->fields, tuple->n_fields, &extra_size);
+ ut_ad(size >= extra_size);
+ ut_ad(size <= sizeof log->tail.buf);
+
+ mrec_size = ROW_LOG_HEADER_SIZE
+ + (extra_size >= 0x80) + size
+ + (trx_id ? DATA_TRX_ID_LEN : 0);
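+
+ /* ROW_LOG_HEADER_SIZE covers the op byte and one extra_size
+ byte; one more byte is needed when extra_size >= 0x80, and
+ DB_TRX_ID is logged only for inserts (trx_id != 0). */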
+
+ log = index->online_log;
+ mutex_enter(&log->mutex);
+
+ if (trx_id > log->max_trx) {
+ log->max_trx = trx_id;
+ }
+
+ if (!row_log_block_allocate(log->tail)) {
+ log->error = DB_OUT_OF_MEMORY;
+ goto err_exit;
+ }
+
+ MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf);
+
+ ut_ad(log->tail.bytes < srv_sort_buf_size);
+ avail_size = srv_sort_buf_size - log->tail.bytes;
+
+ if (mrec_size > avail_size) {
+ b = log->tail.buf;
+ } else {
+ b = log->tail.block + log->tail.bytes;
+ }
+
+ if (trx_id != 0) {
+ *b++ = ROW_OP_INSERT;
+ trx_write_trx_id(b, trx_id);
+ b += DATA_TRX_ID_LEN;
+ } else {
+ *b++ = ROW_OP_DELETE;
+ }
+
+ if (extra_size < 0x80) {
+ *b++ = (byte) extra_size;
+ } else {
+ ut_ad(extra_size < 0x8000);
+ *b++ = (byte) (0x80 | (extra_size >> 8));
+ *b++ = (byte) extra_size;
+ }
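+
+ /* Worked example of the encoding above: extra_size 0x35 is
+ stored as the single byte 0x35; extra_size 0x1234 is stored
+ as 0x92 0x34, i.e. 0x80 | (0x1234 >> 8) followed by the low
+ byte 0x34. */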
+
+ rec_convert_dtuple_to_temp<false>(
+ b + extra_size, index, tuple->fields, tuple->n_fields);
+ b += size;
+
+ if (mrec_size >= avail_size) {
+ const os_offset_t byte_offset
+ = (os_offset_t) log->tail.blocks
+ * srv_sort_buf_size;
+ byte* buf = log->tail.block;
+
+ if (byte_offset + srv_sort_buf_size >= srv_online_max_size) {
+ goto write_failed;
+ }
+
+ if (mrec_size == avail_size) {
+ ut_ad(b == &buf[srv_sort_buf_size]);
+ } else {
+ ut_ad(b == log->tail.buf + mrec_size);
+ memcpy(buf + log->tail.bytes,
+ log->tail.buf, avail_size);
+ }
+
+ MEM_CHECK_DEFINED(buf, srv_sort_buf_size);
+
+ if (row_log_tmpfile(log) == OS_FILE_CLOSED) {
+ log->error = DB_OUT_OF_MEMORY;
+ goto err_exit;
+ }
+
+ /* If encryption is enabled, encrypt the buffer before
+ writing it to the file system. */
+ if (log_tmp_is_encrypted()) {
+ if (!log_tmp_block_encrypt(
+ buf, srv_sort_buf_size,
+ log->crypt_tail, byte_offset)) {
+ log->error = DB_DECRYPTION_FAILED;
+ goto write_failed;
+ }
+
+ srv_stats.n_rowlog_blocks_encrypted.inc();
+ buf = log->crypt_tail;
+ }
+
+ log->tail.blocks++;
+ if (os_file_write(
+ IORequestWrite,
+ "(modification log)",
+ log->fd,
+ buf, byte_offset, srv_sort_buf_size)
+ != DB_SUCCESS) {
+write_failed:
+ /* We set the flag directly instead of invoking
+ dict_set_corrupted_index_cache_only(index) here,
+ because the index is not "public" yet. */
+ index->type |= DICT_CORRUPT;
+ }
+
+ MEM_UNDEFINED(log->tail.block, srv_sort_buf_size);
+ MEM_UNDEFINED(buf, srv_sort_buf_size);
+
+ memcpy(log->tail.block, log->tail.buf + avail_size,
+ mrec_size - avail_size);
+ log->tail.bytes = mrec_size - avail_size;
+ } else {
+ log->tail.bytes += mrec_size;
+ ut_ad(b == log->tail.block + log->tail.bytes);
+ }
+
+ MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf);
+err_exit:
+ mutex_exit(&log->mutex);
+}
+
+/******************************************************//**
+Gets the error status of the online index rebuild log.
+@return DB_SUCCESS or error code */
+dberr_t
+row_log_table_get_error(
+/*====================*/
+ const dict_index_t* index) /*!< in: clustered index of a table
+ that is being rebuilt online */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_online_ddl(index));
+ return(index->online_log->error);
+}
+
+/******************************************************//**
+Starts logging an operation to a table that is being rebuilt.
+@return pointer to log, or NULL if no logging is necessary */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+byte*
+row_log_table_open(
+/*===============*/
+ row_log_t* log, /*!< in/out: online rebuild log */
+ ulint size, /*!< in: size of log record */
+ ulint* avail) /*!< out: available size for log record */
+{
+ mutex_enter(&log->mutex);
+
+ MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf);
+
+ if (log->error != DB_SUCCESS) {
+err_exit:
+ mutex_exit(&log->mutex);
+ return(NULL);
+ }
+
+ if (!row_log_block_allocate(log->tail)) {
+ log->error = DB_OUT_OF_MEMORY;
+ goto err_exit;
+ }
+
+ ut_ad(log->tail.bytes < srv_sort_buf_size);
+ *avail = srv_sort_buf_size - log->tail.bytes;
+
+ if (size > *avail) {
+ /* Make sure log->tail.buf is large enough */
+ ut_ad(size <= sizeof log->tail.buf);
+ return(log->tail.buf);
+ } else {
+ return(log->tail.block + log->tail.bytes);
+ }
+}
+
+/******************************************************//**
+Stops logging an operation to a table that is being rebuilt. */
+static MY_ATTRIBUTE((nonnull))
+void
+row_log_table_close_func(
+/*=====================*/
+ dict_index_t* index, /*!< in/out: online rebuilt index */
+#ifdef UNIV_DEBUG
+ const byte* b, /*!< in: end of log record */
+#endif /* UNIV_DEBUG */
+ ulint size, /*!< in: size of log record */
+ ulint avail) /*!< in: available size for log record */
+{
+ row_log_t* log = index->online_log;
+
+ ut_ad(mutex_own(&log->mutex));
+
+ if (size >= avail) {
+ const os_offset_t byte_offset
+ = (os_offset_t) log->tail.blocks
+ * srv_sort_buf_size;
+ byte* buf = log->tail.block;
+
+ if (byte_offset + srv_sort_buf_size >= srv_online_max_size) {
+ goto write_failed;
+ }
+
+ if (size == avail) {
+ ut_ad(b == &buf[srv_sort_buf_size]);
+ } else {
+ ut_ad(b == log->tail.buf + size);
+ memcpy(buf + log->tail.bytes, log->tail.buf, avail);
+ }
+
+ MEM_CHECK_DEFINED(buf, srv_sort_buf_size);
+
+ if (row_log_tmpfile(log) == OS_FILE_CLOSED) {
+ log->error = DB_OUT_OF_MEMORY;
+ goto err_exit;
+ }
+
+ /* If encryption is enabled, encrypt the buffer before
+ writing it to the file system. */
+ if (log_tmp_is_encrypted()) {
+ if (!log_tmp_block_encrypt(
+ log->tail.block, srv_sort_buf_size,
+ log->crypt_tail, byte_offset,
+ index->table->space_id)) {
+ log->error = DB_DECRYPTION_FAILED;
+ goto err_exit;
+ }
+
+ srv_stats.n_rowlog_blocks_encrypted.inc();
+ buf = log->crypt_tail;
+ }
+
+ log->tail.blocks++;
+ if (os_file_write(
+ IORequestWrite,
+ "(modification log)",
+ log->fd,
+ buf, byte_offset, srv_sort_buf_size)
+ != DB_SUCCESS) {
+write_failed:
+ log->error = DB_ONLINE_LOG_TOO_BIG;
+ }
+
+ MEM_UNDEFINED(log->tail.block, srv_sort_buf_size);
+ MEM_UNDEFINED(buf, srv_sort_buf_size);
+ memcpy(log->tail.block, log->tail.buf + avail, size - avail);
+ log->tail.bytes = size - avail;
+ } else {
+ log->tail.bytes += size;
+ ut_ad(b == log->tail.block + log->tail.bytes);
+ }
+
+ log->tail.total += size;
+ MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf);
+err_exit:
+ mutex_exit(&log->mutex);
+
+ onlineddl_rowlog_rows++;
+ /* 10000 means 100.00%, 4525 means 45.25% */
+ onlineddl_rowlog_pct_used = static_cast<ulint>((log->tail.total * 10000) / srv_online_max_size);
+}
+
+#ifdef UNIV_DEBUG
+# define row_log_table_close(index, b, size, avail) \
+ row_log_table_close_func(index, b, size, avail)
+#else /* UNIV_DEBUG */
+# define row_log_table_close(index, b, size, avail) \
+ row_log_table_close_func(index, size, avail)
+#endif /* UNIV_DEBUG */
+
+/** Check whether a virtual column is indexed in the new table being
+created during alter table
+@param[in] index cluster index
+@param[in] v_no virtual column number
+@return true if it is indexed, else false */
+bool
+row_log_col_is_indexed(
+ const dict_index_t* index,
+ ulint v_no)
+{
+ return(dict_table_get_nth_v_col(
+ index->online_log->table, v_no)->m_col.ord_part);
+}
+
+/******************************************************//**
+Logs a delete operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_delete(). */
+void
+row_log_table_delete(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */
+ const byte* sys) /*!< in: DB_TRX_ID,DB_ROLL_PTR that should
+ be logged, or NULL to use those in rec */
+{
+ ulint old_pk_extra_size;
+ ulint old_pk_size;
+ ulint mrec_size;
+ ulint avail_size;
+ mem_heap_t* heap = NULL;
+ const dtuple_t* old_pk;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index));
+ ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf);
+ ut_ad(rw_lock_own_flagged(
+ &index->lock,
+ RW_LOCK_FLAG_S | RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+
+ if (index->online_status != ONLINE_INDEX_CREATION
+ || (index->type & DICT_CORRUPT) || index->table->corrupted
+ || index->online_log->error != DB_SUCCESS) {
+ return;
+ }
+
+ dict_table_t* new_table = index->online_log->table;
+ dict_index_t* new_index = dict_table_get_first_index(new_table);
+
+ ut_ad(dict_index_is_clust(new_index));
+ ut_ad(!dict_index_is_online_ddl(new_index));
+ ut_ad(index->online_log->min_trx);
+
+ /* Create the tuple PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in new_table. */
+ if (index->online_log->same_pk) {
+ dtuple_t* tuple;
+ ut_ad(new_index->n_uniq == index->n_uniq);
+
+ /* The PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR are in the first
+ fields of the record. */
+ heap = mem_heap_create(
+ DATA_TRX_ID_LEN
+ + DTUPLE_EST_ALLOC(new_index->first_user_field()));
+ old_pk = tuple = dtuple_create(heap,
+ new_index->first_user_field());
+ dict_index_copy_types(tuple, new_index, tuple->n_fields);
+ dtuple_set_n_fields_cmp(tuple, new_index->n_uniq);
+
+ for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) {
+ ulint len;
+ const void* field = rec_get_nth_field(
+ rec, offsets, i, &len);
+ dfield_t* dfield = dtuple_get_nth_field(
+ tuple, i);
+ ut_ad(len != UNIV_SQL_NULL);
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ dfield_set_data(dfield, field, len);
+ }
+
+ dfield_t* db_trx_id = dtuple_get_nth_field(
+ tuple, new_index->n_uniq);
+
+ const bool replace_sys_fields
+ = sys
+ || trx_read_trx_id(static_cast<byte*>(db_trx_id->data))
+ < index->online_log->min_trx;
+
+ if (replace_sys_fields) {
+ if (!sys || trx_read_trx_id(sys)
+ < index->online_log->min_trx) {
+ sys = reset_trx_id;
+ }
+
+ dfield_set_data(db_trx_id, sys, DATA_TRX_ID_LEN);
+ dfield_set_data(db_trx_id + 1, sys + DATA_TRX_ID_LEN,
+ DATA_ROLL_PTR_LEN);
+ }
+
+ ut_d(trx_id_check(db_trx_id->data,
+ index->online_log->min_trx));
+ } else {
+ /* The PRIMARY KEY has changed. Translate the tuple. */
+ old_pk = row_log_table_get_pk(
+ rec, index, offsets, NULL, &heap);
+
+ if (!old_pk) {
+ ut_ad(index->online_log->error != DB_SUCCESS);
+ if (heap) {
+ goto func_exit;
+ }
+ return;
+ }
+ }
+
+ ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 2)->len);
+ ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 1)->len);
+ old_pk_size = rec_get_converted_size_temp<false>(
+ new_index, old_pk->fields, old_pk->n_fields,
+ &old_pk_extra_size);
+ ut_ad(old_pk_extra_size < 0x100);
+
+ /* 2 = 1 (extra_size) + at least 1 byte payload */
+ mrec_size = 2 + old_pk_size;
+
+ if (byte* b = row_log_table_open(index->online_log,
+ mrec_size, &avail_size)) {
+ *b++ = ROW_T_DELETE;
+ *b++ = static_cast<byte>(old_pk_extra_size);
+
+ rec_convert_dtuple_to_temp<false>(
+ b + old_pk_extra_size, new_index,
+ old_pk->fields, old_pk->n_fields);
+
+ b += old_pk_size;
+
+ row_log_table_close(index, b, mrec_size, avail_size);
+ }
+
+func_exit:
+ mem_heap_free(heap);
+}
+
+/******************************************************//**
+Logs an insert or update to a table that is being rebuilt. */
+static
+void
+row_log_table_low_redundant(
+/*========================*/
+ const rec_t* rec, /*!< in: clustered index leaf
+ page record in ROW_FORMAT=REDUNDANT,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ bool insert, /*!< in: true if insert,
+ false if update */
+ const dtuple_t* old_pk, /*!< in: old PRIMARY KEY value
+ (if !insert and a PRIMARY KEY
+ is being created) */
+ const dict_index_t* new_index)
+ /*!< in: clustered index of the
+ new table, not latched */
+{
+ ulint old_pk_size;
+ ulint old_pk_extra_size;
+ ulint size;
+ ulint extra_size;
+ ulint mrec_size;
+ ulint avail_size;
+ mem_heap_t* heap = NULL;
+ dtuple_t* tuple;
+ const ulint n_fields = rec_get_n_fields_old(rec);
+
+ ut_ad(!page_is_comp(page_align(rec)));
+ ut_ad(index->n_fields >= n_fields);
+ ut_ad(index->n_fields == n_fields || index->is_instant());
+ ut_ad(dict_tf2_is_valid(index->table->flags, index->table->flags2));
+ ut_ad(!dict_table_is_comp(index->table)); /* redundant row format */
+ ut_ad(dict_index_is_clust(new_index));
+
+ heap = mem_heap_create(DTUPLE_EST_ALLOC(n_fields));
+ tuple = dtuple_create(heap, n_fields);
+ dict_index_copy_types(tuple, index, n_fields);
+
+ dtuple_set_n_fields_cmp(tuple, dict_index_get_n_unique(index));
+
+ if (rec_get_1byte_offs_flag(rec)) {
+ for (ulint i = 0; i < n_fields; i++) {
+ dfield_t* dfield;
+ ulint len;
+ const void* field;
+
+ dfield = dtuple_get_nth_field(tuple, i);
+ field = rec_get_nth_field_old(rec, i, &len);
+
+ dfield_set_data(dfield, field, len);
+ }
+ } else {
+ for (ulint i = 0; i < n_fields; i++) {
+ dfield_t* dfield;
+ ulint len;
+ const void* field;
+
+ dfield = dtuple_get_nth_field(tuple, i);
+ field = rec_get_nth_field_old(rec, i, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ if (rec_2_is_field_extern(rec, i)) {
+ dfield_set_ext(dfield);
+ }
+ }
+ }
+
+ dfield_t* db_trx_id = dtuple_get_nth_field(tuple, index->n_uniq);
+ ut_ad(dfield_get_len(db_trx_id) == DATA_TRX_ID_LEN);
+ ut_ad(dfield_get_len(db_trx_id + 1) == DATA_ROLL_PTR_LEN);
+
+ if (trx_read_trx_id(static_cast<const byte*>
+ (dfield_get_data(db_trx_id)))
+ < index->online_log->min_trx) {
+ dfield_set_data(db_trx_id, reset_trx_id, DATA_TRX_ID_LEN);
+ dfield_set_data(db_trx_id + 1, reset_trx_id + DATA_TRX_ID_LEN,
+ DATA_ROLL_PTR_LEN);
+ }
+
+ const bool is_instant = index->online_log->is_instant(index);
+ rec_comp_status_t status = is_instant
+ ? REC_STATUS_INSTANT : REC_STATUS_ORDINARY;
+
+ size = rec_get_converted_size_temp<true>(
+ index, tuple->fields, tuple->n_fields, &extra_size, status);
+ if (is_instant) {
+ size++;
+ extra_size++;
+ }
+
+ mrec_size = ROW_LOG_HEADER_SIZE + size + (extra_size >= 0x80);
+
+ if (insert || index->online_log->same_pk) {
+ ut_ad(!old_pk);
+ old_pk_extra_size = old_pk_size = 0;
+ } else {
+ ut_ad(old_pk);
+ ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp);
+ ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 2)->len);
+ ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 1)->len);
+
+ old_pk_size = rec_get_converted_size_temp<false>(
+ new_index, old_pk->fields, old_pk->n_fields,
+ &old_pk_extra_size);
+ ut_ad(old_pk_extra_size < 0x100);
+ mrec_size += 1/*old_pk_extra_size*/ + old_pk_size;
+ }
+
+ if (byte* b = row_log_table_open(index->online_log,
+ mrec_size, &avail_size)) {
+ if (insert) {
+ *b++ = ROW_T_INSERT;
+ } else {
+ *b++ = ROW_T_UPDATE;
+
+ if (old_pk_size) {
+ *b++ = static_cast<byte>(old_pk_extra_size);
+
+ rec_convert_dtuple_to_temp<false>(
+ b + old_pk_extra_size, new_index,
+ old_pk->fields, old_pk->n_fields);
+ b += old_pk_size;
+ }
+ }
+
+ if (extra_size < 0x80) {
+ *b++ = static_cast<byte>(extra_size);
+ } else {
+ ut_ad(extra_size < 0x8000);
+ *b++ = static_cast<byte>(0x80 | (extra_size >> 8));
+ *b++ = static_cast<byte>(extra_size);
+ }
+
+ if (status == REC_STATUS_INSTANT) {
+ ut_ad(is_instant);
+ if (n_fields <= index->online_log->n_core_fields) {
+ status = REC_STATUS_ORDINARY;
+ }
+ *b = status;
+ }
+
+ rec_convert_dtuple_to_temp<true>(
+ b + extra_size, index, tuple->fields, tuple->n_fields,
+ status);
+ b += size;
+
+ row_log_table_close(index, b, mrec_size, avail_size);
+ }
+
+ mem_heap_free(heap);
+}
+
+/******************************************************//**
+Logs an insert or update to a table that is being rebuilt. */
+static
+void
+row_log_table_low(
+/*==============*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */
+ bool insert, /*!< in: true if insert, false if update */
+ const dtuple_t* old_pk) /*!< in: old PRIMARY KEY value (if !insert
+ and a PRIMARY KEY is being created) */
+{
+ ulint old_pk_size;
+ ulint old_pk_extra_size;
+ ulint extra_size;
+ ulint mrec_size;
+ ulint avail_size;
+ const dict_index_t* new_index;
+ row_log_t* log = index->online_log;
+
+ new_index = dict_table_get_first_index(log->table);
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_clust(new_index));
+ ut_ad(!dict_index_is_online_ddl(new_index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index));
+ ut_ad(rec_offs_size(offsets) <= sizeof log->tail.buf);
+ ut_ad(rw_lock_own_flagged(
+ &index->lock,
+ RW_LOCK_FLAG_S | RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+#ifdef UNIV_DEBUG
+ switch (fil_page_get_type(page_align(rec))) {
+ case FIL_PAGE_INDEX:
+ break;
+ case FIL_PAGE_TYPE_INSTANT:
+ ut_ad(index->is_instant());
+ ut_ad(!page_has_siblings(page_align(rec)));
+ ut_ad(page_get_page_no(page_align(rec)) == index->page);
+ break;
+ default:
+ ut_ad("wrong page type" == 0);
+ }
+#endif /* UNIV_DEBUG */
+ ut_ad(!rec_is_metadata(rec, *index));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!page_is_comp(page_align(rec)) == !rec_offs_comp(offsets));
+ /* old_pk=row_log_table_get_pk() [not needed in INSERT] is a prefix
+ of the clustered index record (PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR),
+ with no information on virtual columns */
+ ut_ad(!old_pk || !insert);
+ ut_ad(!old_pk || old_pk->n_v_fields == 0);
+
+ if (index->online_status != ONLINE_INDEX_CREATION
+ || (index->type & DICT_CORRUPT) || index->table->corrupted
+ || log->error != DB_SUCCESS) {
+ return;
+ }
+
+ if (!rec_offs_comp(offsets)) {
+ row_log_table_low_redundant(
+ rec, index, insert, old_pk, new_index);
+ return;
+ }
+
+ ut_ad(page_is_comp(page_align(rec)));
+ ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY
+ || rec_get_status(rec) == REC_STATUS_INSTANT);
+
+ const ulint omit_size = REC_N_NEW_EXTRA_BYTES;
+
+ const ulint rec_extra_size = rec_offs_extra_size(offsets) - omit_size;
+ const bool is_instant = log->is_instant(index);
+ extra_size = rec_extra_size + is_instant;
+
+ unsigned fake_extra_size = 0;
+ byte fake_extra_buf[3];
+ if (is_instant && UNIV_UNLIKELY(!index->is_instant())) {
+ /* The source table was emptied after ALTER TABLE
+ started, and it was converted to non-instant format.
+ Because row_log_table_apply_op() expects to find
+ all records to be logged in the same way, we will
+ be unable to copy the rec_extra_size bytes from the
+ record header, but must convert them here. */
+ unsigned n_add = index->n_fields - 1 - log->n_core_fields;
+ fake_extra_size = rec_get_n_add_field_len(n_add);
+ ut_ad(fake_extra_size == 1 || fake_extra_size == 2);
+ extra_size += fake_extra_size;
+ byte* fake_extra = fake_extra_buf + fake_extra_size;
+ rec_set_n_add_field(fake_extra, n_add);
+ ut_ad(fake_extra == fake_extra_buf);
+ }
+
+ mrec_size = ROW_LOG_HEADER_SIZE
+ + (extra_size >= 0x80) + rec_offs_size(offsets) - omit_size
+ + is_instant + fake_extra_size;
+
+ if (insert || log->same_pk) {
+ ut_ad(!old_pk);
+ old_pk_extra_size = old_pk_size = 0;
+ } else {
+ ut_ad(old_pk);
+ ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp);
+ ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 2)->len);
+ ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 1)->len);
+
+ old_pk_size = rec_get_converted_size_temp<false>(
+ new_index, old_pk->fields, old_pk->n_fields,
+ &old_pk_extra_size);
+ ut_ad(old_pk_extra_size < 0x100);
+ mrec_size += 1/*old_pk_extra_size*/ + old_pk_size;
+ }
+
+ if (byte* b = row_log_table_open(log, mrec_size, &avail_size)) {
+ if (insert) {
+ *b++ = ROW_T_INSERT;
+ } else {
+ *b++ = ROW_T_UPDATE;
+
+ if (old_pk_size) {
+ *b++ = static_cast<byte>(old_pk_extra_size);
+
+ rec_convert_dtuple_to_temp<false>(
+ b + old_pk_extra_size, new_index,
+ old_pk->fields, old_pk->n_fields);
+ b += old_pk_size;
+ }
+ }
+
+ if (extra_size < 0x80) {
+ *b++ = static_cast<byte>(extra_size);
+ } else {
+ ut_ad(extra_size < 0x8000);
+ *b++ = static_cast<byte>(0x80 | (extra_size >> 8));
+ *b++ = static_cast<byte>(extra_size);
+ }
+
+ if (is_instant) {
+ *b++ = fake_extra_size
+ ? REC_STATUS_INSTANT
+ : rec_get_status(rec);
+ } else {
+ ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY);
+ }
+
+ memcpy(b, rec - rec_extra_size - omit_size, rec_extra_size);
+ b += rec_extra_size;
+ memcpy(b, fake_extra_buf + 1, fake_extra_size);
+ b += fake_extra_size;
+ ulint len;
+ ulint trx_id_offs = rec_get_nth_field_offs(
+ offsets, index->n_uniq, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ memcpy(b, rec, rec_offs_data_size(offsets));
+ if (trx_read_trx_id(b + trx_id_offs) < log->min_trx) {
+ memcpy(b + trx_id_offs,
+ reset_trx_id, sizeof reset_trx_id);
+ }
+ b += rec_offs_data_size(offsets);
+
+ row_log_table_close(index, b, mrec_size, avail_size);
+ }
+}
+
+/******************************************************//**
+Logs an update to a table that is being rebuilt.
+This will be merged in row_log_table_apply_update(). */
+void
+row_log_table_update(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */
+ const dtuple_t* old_pk) /*!< in: row_log_table_get_pk()
+ before the update */
+{
+ row_log_table_low(rec, index, offsets, false, old_pk);
+}
+
+/** Gets the old table column of a PRIMARY KEY column.
+@param table old table (before ALTER TABLE)
+@param col_map mapping of old column numbers to new ones
+@param col_no column position in the new table
+@return old table column, or NULL if this is an added column */
+static
+const dict_col_t*
+row_log_table_get_pk_old_col(
+/*=========================*/
+ const dict_table_t* table,
+ const ulint* col_map,
+ ulint col_no)
+{
+ for (ulint i = 0; i < table->n_cols; i++) {
+ if (col_no == col_map[i]) {
+ return(dict_table_get_nth_col(table, i));
+ }
+ }
+
+ return(NULL);
+}
+
+/** Retrieves the value of a PRIMARY KEY column from an old table record.
+@param[in] ifield clustered index field in the new table (after
+ALTER TABLE)
+@param[in] index the clustered index of ifield
+@param[in,out] dfield clustered index tuple field in the new table
+@param[in,out] heap memory heap for allocating dfield contents
+@param[in] rec clustered index leaf page record in the old
+table
+@param[in] offsets rec_get_offsets(rec)
+@param[in] i rec field corresponding to col
+@param[in] zip_size ROW_FORMAT=COMPRESSED size of the old table
+@param[in] max_len maximum length of dfield
+@param[in] log row log for the table
+@retval DB_SUCCESS if the value was retrieved
+@retval DB_INVALID_NULL if a NULL value is encountered
+@retval DB_TOO_BIG_INDEX_COL if the maximum prefix length is exceeded */
+static
+dberr_t
+row_log_table_get_pk_col(
+ const dict_field_t* ifield,
+ const dict_index_t* index,
+ dfield_t* dfield,
+ mem_heap_t* heap,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ ulint i,
+ ulint zip_size,
+ ulint max_len,
+ const row_log_t* log)
+{
+ const byte* field;
+ ulint len;
+
+ field = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len == UNIV_SQL_DEFAULT) {
+ field = log->instant_field_value(i, &len);
+ }
+
+ if (len == UNIV_SQL_NULL) {
+ if (!log->allow_not_null) {
+ return(DB_INVALID_NULL);
+ }
+
+ unsigned col_no= ifield->col->ind;
+ ut_ad(col_no < log->defaults->n_fields);
+
+ field = static_cast<const byte*>(
+ log->defaults->fields[col_no].data);
+ if (!field) {
+ return(DB_INVALID_NULL);
+ }
+ len = log->defaults->fields[col_no].len;
+ }
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint field_len = ifield->prefix_len;
+ byte* blob_field;
+
+ if (!field_len) {
+ field_len = ifield->fixed_len;
+ if (!field_len) {
+ field_len = max_len + 1;
+ }
+ }
+
+ blob_field = static_cast<byte*>(
+ mem_heap_alloc(heap, field_len));
+
+ len = btr_copy_externally_stored_field_prefix(
+ blob_field, field_len, zip_size, field, len);
+ if (len >= max_len + 1) {
+ return(DB_TOO_BIG_INDEX_COL);
+ }
+
+ dfield_set_data(dfield, blob_field, len);
+ } else {
+ dfield_set_data(dfield, mem_heap_dup(heap, field, len), len);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/******************************************************//**
+Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR
+of a table that is being rebuilt.
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table,
+or NULL if the PRIMARY KEY definition does not change */
+const dtuple_t*
+row_log_table_get_pk(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */
+ byte* sys, /*!< out: DB_TRX_ID,DB_ROLL_PTR for
+ row_log_table_delete(), or NULL */
+ mem_heap_t** heap) /*!< in/out: memory heap where allocated */
+{
+ dtuple_t* tuple = NULL;
+ row_log_t* log = index->online_log;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(!offsets || rec_offs_validate(rec, index, offsets));
+ ut_ad(rw_lock_own_flagged(
+ &index->lock,
+ RW_LOCK_FLAG_S | RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+
+ ut_ad(log);
+ ut_ad(log->table);
+ ut_ad(log->min_trx);
+
+ if (log->same_pk) {
+ /* The PRIMARY KEY columns are unchanged. */
+ if (sys) {
+ /* Store the DB_TRX_ID,DB_ROLL_PTR. */
+ ulint trx_id_offs = index->trx_id_offset;
+
+ if (!trx_id_offs) {
+ ulint len;
+
+ if (!offsets) {
+ offsets = rec_get_offsets(
+ rec, index, nullptr,
+ index->n_core_fields,
+ index->db_trx_id() + 1, heap);
+ }
+
+ trx_id_offs = rec_get_nth_field_offs(
+ offsets, index->db_trx_id(), &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ }
+
+ const byte* ptr = trx_read_trx_id(rec + trx_id_offs)
+ < log->min_trx
+ ? reset_trx_id
+ : rec + trx_id_offs;
+
+ memcpy(sys, ptr, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ ut_d(trx_id_check(sys, log->min_trx));
+ }
+
+ return(NULL);
+ }
+
+ mutex_enter(&log->mutex);
+
+ /* log->error is protected by log->mutex. */
+ if (log->error == DB_SUCCESS) {
+ dict_table_t* new_table = log->table;
+ dict_index_t* new_index
+ = dict_table_get_first_index(new_table);
+ const ulint new_n_uniq
+ = dict_index_get_n_unique(new_index);
+
+ if (!*heap) {
+ ulint size = 0;
+
+ if (!offsets) {
+ size += (1 + REC_OFFS_HEADER_SIZE
+ + unsigned(index->n_fields))
+ * sizeof *offsets;
+ }
+
+ for (ulint i = 0; i < new_n_uniq; i++) {
+ size += dict_col_get_min_size(
+ dict_index_get_nth_col(new_index, i));
+ }
+
+ *heap = mem_heap_create(
+ DTUPLE_EST_ALLOC(new_n_uniq + 2) + size);
+ }
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, nullptr,
+ index->n_core_fields,
+ ULINT_UNDEFINED, heap);
+ }
+
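+		/* The tuple consists of the new PRIMARY KEY columns
+		followed by DB_TRX_ID and DB_ROLL_PTR; hence
+		new_n_uniq + 2 fields. */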
+ tuple = dtuple_create(*heap, new_n_uniq + 2);
+ dict_index_copy_types(tuple, new_index, tuple->n_fields);
+ dtuple_set_n_fields_cmp(tuple, new_n_uniq);
+
+ const ulint max_len = DICT_MAX_FIELD_LEN_BY_FORMAT(new_table);
+
+ const ulint zip_size = index->table->space->zip_size();
+
+ for (ulint new_i = 0; new_i < new_n_uniq; new_i++) {
+ dict_field_t* ifield;
+ dfield_t* dfield;
+ ulint prtype;
+ ulint mbminlen, mbmaxlen;
+
+ ifield = dict_index_get_nth_field(new_index, new_i);
+ dfield = dtuple_get_nth_field(tuple, new_i);
+
+ const ulint col_no
+ = dict_field_get_col(ifield)->ind;
+
+ if (const dict_col_t* col
+ = row_log_table_get_pk_old_col(
+ index->table, log->col_map, col_no)) {
+ ulint i = dict_col_get_clust_pos(col, index);
+
+ if (i == ULINT_UNDEFINED) {
+ ut_ad(0);
+ log->error = DB_CORRUPTION;
+ goto err_exit;
+ }
+
+ log->error = row_log_table_get_pk_col(
+ ifield, new_index, dfield, *heap,
+ rec, offsets, i, zip_size, max_len,
+ log);
+
+ if (log->error != DB_SUCCESS) {
+err_exit:
+ tuple = NULL;
+ goto func_exit;
+ }
+
+ mbminlen = col->mbminlen;
+ mbmaxlen = col->mbmaxlen;
+ prtype = col->prtype;
+ } else {
+ /* No matching column was found in the old
+ table, so this must be an added column.
+ Copy the default value. */
+ ut_ad(log->defaults);
+
+ dfield_copy(dfield, dtuple_get_nth_field(
+ log->defaults, col_no));
+ mbminlen = dfield->type.mbminlen;
+ mbmaxlen = dfield->type.mbmaxlen;
+ prtype = dfield->type.prtype;
+ }
+
+ ut_ad(!dfield_is_ext(dfield));
+ ut_ad(!dfield_is_null(dfield));
+
+ if (ifield->prefix_len) {
+ ulint len = dtype_get_at_most_n_mbchars(
+ prtype, mbminlen, mbmaxlen,
+ ifield->prefix_len,
+ dfield_get_len(dfield),
+ static_cast<const char*>(
+ dfield_get_data(dfield)));
+
+ ut_ad(len <= dfield_get_len(dfield));
+ dfield_set_len(dfield, len);
+ }
+ }
+
+ const byte* trx_roll = rec
+ + row_get_trx_id_offset(index, offsets);
+
+ /* Copy the fields, because the fields will be updated
+ or the record may be moved somewhere else in the B-tree
+ as part of the upcoming operation. */
+ if (trx_read_trx_id(trx_roll) < log->min_trx) {
+ trx_roll = reset_trx_id;
+ if (sys) {
+ memcpy(sys, trx_roll,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ }
+ } else if (sys) {
+ memcpy(sys, trx_roll,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ trx_roll = sys;
+ } else {
+ trx_roll = static_cast<const byte*>(
+ mem_heap_dup(
+ *heap, trx_roll,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN));
+ }
+
+ ut_d(trx_id_check(trx_roll, log->min_trx));
+
+ dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq),
+ trx_roll, DATA_TRX_ID_LEN);
+ dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq + 1),
+ trx_roll + DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN);
+ }
+
+func_exit:
+ mutex_exit(&log->mutex);
+ return(tuple);
+}
+
+/******************************************************//**
+Logs an insert to a table that is being rebuilt.
+This will be merged in row_log_table_apply_insert(). */
+void
+row_log_table_insert(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec,index) */
+{
+ row_log_table_low(rec, index, offsets, true, NULL);
+}
+
+/******************************************************//**
+Notes that a BLOB is being freed during online ALTER TABLE. */
+void
+row_log_table_blob_free(
+/*====================*/
+ dict_index_t* index, /*!< in/out: clustered index, X-latched */
+ ulint page_no)/*!< in: starting page number of the BLOB */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(rw_lock_own_flagged(
+ &index->lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+ ut_ad(page_no != FIL_NULL);
+
+ if (index->online_log->error != DB_SUCCESS) {
+ return;
+ }
+
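+	/* Record the BLOB page in the map, so that
+	row_log_table_apply_convert_mrec() can detect via is_freed()
+	any BLOBs that were freed before the log was applied and
+	must not be dereferenced. */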
+ page_no_map* blobs = index->online_log->blobs;
+
+ if (blobs == NULL) {
+ index->online_log->blobs = blobs = UT_NEW_NOKEY(page_no_map());
+ }
+
+#ifdef UNIV_DEBUG
+ const ulonglong log_pos = index->online_log->tail.total;
+#else
+# define log_pos /* empty */
+#endif /* UNIV_DEBUG */
+
+ const page_no_map::value_type v(page_no,
+ row_log_table_blob_t(log_pos));
+
+ std::pair<page_no_map::iterator,bool> p = blobs->insert(v);
+
+ if (!p.second) {
+ /* Update the existing mapping. */
+ ut_ad(p.first->first == page_no);
+ p.first->second.blob_free(log_pos);
+ }
+#undef log_pos
+}
+
+/******************************************************//**
+Notes that a BLOB is being allocated during online ALTER TABLE. */
+void
+row_log_table_blob_alloc(
+/*=====================*/
+ dict_index_t* index, /*!< in/out: clustered index, X-latched */
+ ulint page_no)/*!< in: starting page number of the BLOB */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_online_ddl(index));
+
+ ut_ad(rw_lock_own_flagged(
+ &index->lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+
+ ut_ad(page_no != FIL_NULL);
+
+ if (index->online_log->error != DB_SUCCESS) {
+ return;
+ }
+
+ /* Only track allocations if the same page has been freed
+ earlier. Double allocation without a free is not allowed. */
+ if (page_no_map* blobs = index->online_log->blobs) {
+ page_no_map::iterator p = blobs->find(page_no);
+
+ if (p != blobs->end()) {
+ ut_ad(p->first == page_no);
+ p->second.blob_alloc(index->online_log->tail.total);
+ }
+ }
+}
+
+/******************************************************//**
+Converts a log record to a table row.
+@return converted row, or NULL if the conversion fails */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+const dtuple_t*
+row_log_table_apply_convert_mrec(
+/*=============================*/
+ const mrec_t* mrec, /*!< in: merge record */
+ dict_index_t* index, /*!< in: index of mrec */
+ const rec_offs* offsets, /*!< in: offsets of mrec */
+ row_log_t* log, /*!< in: rebuild context */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ dberr_t* error) /*!< out: DB_SUCCESS or
+ DB_MISSING_HISTORY or
+ reason of failure */
+{
+ dtuple_t* row;
+
+ log->n_rows++;
+ *error = DB_SUCCESS;
+
+ /* This is based on row_build(). */
+ if (log->defaults) {
+ row = dtuple_copy(log->defaults, heap);
+ /* dict_table_copy_types() would set the fields to NULL */
+ for (ulint i = 0; i < dict_table_get_n_cols(log->table); i++) {
+ dict_col_copy_type(
+ dict_table_get_nth_col(log->table, i),
+ dfield_get_type(dtuple_get_nth_field(row, i)));
+ }
+ } else {
+ row = dtuple_create(heap, dict_table_get_n_cols(log->table));
+ dict_table_copy_types(row, log->table);
+ }
+
+ for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) {
+ const dict_field_t* ind_field
+ = dict_index_get_nth_field(index, i);
+
+ if (ind_field->prefix_len) {
+ /* Column prefixes can only occur in key
+ fields, which cannot be stored externally. For
+ a column prefix, there should also be the full
+ field in the clustered index tuple. The row
+ tuple comprises full fields, not prefixes. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ continue;
+ }
+
+ const dict_col_t* col
+ = dict_field_get_col(ind_field);
+
+ if (col->is_dropped()) {
+ /* the column was instantly dropped earlier */
+ ut_ad(index->table->instant);
+ continue;
+ }
+
+ ulint col_no
+ = log->col_map[dict_col_get_no(col)];
+
+ if (col_no == ULINT_UNDEFINED) {
+ /* the column is being dropped now */
+ continue;
+ }
+
+ dfield_t* dfield
+ = dtuple_get_nth_field(row, col_no);
+
+ ulint len;
+ const byte* data;
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ ut_ad(rec_offs_any_extern(offsets));
+ rw_lock_x_lock(dict_index_get_lock(index));
+
+ if (const page_no_map* blobs = log->blobs) {
+ data = rec_get_nth_field(
+ mrec, offsets, i, &len);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
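+				/* The externally stored field ends in
+				a BLOB reference; read the starting
+				page number of the BLOB from it. */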
+ ulint page_no = mach_read_from_4(
+ data + len - (BTR_EXTERN_FIELD_REF_SIZE
+ - BTR_EXTERN_PAGE_NO));
+ page_no_map::const_iterator p = blobs->find(
+ page_no);
+ if (p != blobs->end()
+ && p->second.is_freed(log->head.total)) {
+ /* This BLOB has been freed.
+ We must not access the row. */
+ *error = DB_MISSING_HISTORY;
+ dfield_set_data(dfield, data, len);
+ dfield_set_ext(dfield);
+ goto blob_done;
+ }
+ }
+
+ data = btr_rec_copy_externally_stored_field(
+ mrec, offsets,
+ index->table->space->zip_size(),
+ i, &len, heap);
+ ut_a(data);
+ dfield_set_data(dfield, data, len);
+blob_done:
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ } else {
+ data = rec_get_nth_field(mrec, offsets, i, &len);
+ if (len == UNIV_SQL_DEFAULT) {
+ data = log->instant_field_value(i, &len);
+ }
+ dfield_set_data(dfield, data, len);
+ }
+
+ if (len != UNIV_SQL_NULL && col->mtype == DATA_MYSQL
+ && col->len != len && !dict_table_is_comp(log->table)) {
+
+ ut_ad(col->len >= len);
+ if (dict_table_is_comp(index->table)) {
+ byte* buf = (byte*) mem_heap_alloc(heap,
+ col->len);
+ memcpy(buf, dfield->data, len);
+ memset(buf + len, 0x20, col->len - len);
+
+ dfield_set_data(dfield, buf, col->len);
+ } else {
+				/* A field length mismatch should not
+				happen when both the old and the rebuilt
+				table use ROW_FORMAT=REDUNDANT. */
+ ut_ad(0);
+ *error = DB_CORRUPTION;
+ return(NULL);
+ }
+ }
+
+ /* See if any columns were changed to NULL or NOT NULL. */
+ const dict_col_t* new_col
+ = dict_table_get_nth_col(log->table, col_no);
+ ut_ad(new_col->same_format(*col));
+
+ /* Assert that prtype matches except for nullability. */
+ ut_ad(!((new_col->prtype ^ dfield_get_type(dfield)->prtype)
+ & ~(DATA_NOT_NULL | DATA_VERSIONED
+ | CHAR_COLL_MASK << 16 | DATA_LONG_TRUE_VARCHAR)));
+
+ if (new_col->prtype == col->prtype) {
+ continue;
+ }
+
+ if ((new_col->prtype & DATA_NOT_NULL)
+ && dfield_is_null(dfield)) {
+
+ const dfield_t& default_field
+ = log->defaults->fields[col_no];
+
+ Field* field = log->old_table->field[col->ind];
+
+ field->set_warning(Sql_condition::WARN_LEVEL_WARN,
+ WARN_DATA_TRUNCATED, 1,
+ ulong(log->n_rows));
+
+ if (!log->allow_not_null) {
+ /* We got a NULL value for a NOT NULL column. */
+ *error = DB_INVALID_NULL;
+ return NULL;
+ }
+
+ *dfield = default_field;
+ }
+
+ /* Adjust the DATA_NOT_NULL flag in the parsed row. */
+ dfield_get_type(dfield)->prtype = new_col->prtype;
+
+ ut_ad(dict_col_type_assert_equal(new_col,
+ dfield_get_type(dfield)));
+ }
+
+ return(row);
+}
+
+/******************************************************//**
+Replays an insert operation on a table that was rebuilt.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_insert_low(
+/*===========================*/
+ que_thr_t* thr, /*!< in: query graph */
+ const dtuple_t* row, /*!< in: table row
+ in the old table definition */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ row_merge_dup_t* dup) /*!< in/out: for reporting
+ duplicate key errors */
+{
+ dberr_t error;
+ dtuple_t* entry;
+	const row_log_t* log = dup->index->online_log;
+ dict_index_t* index = dict_table_get_first_index(log->table);
+ ulint n_index = 0;
+
+ ut_ad(dtuple_validate(row));
+
+ DBUG_LOG("ib_alter_table",
+ "insert table " << index->table->id << " (index "
+ << index->id << "): " << rec_printer(row).str());
+
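+	/* The rebuilt table is not visible to other transactions until
+	the ALTER TABLE commits, so the inserts can skip locking and
+	undo logging, and BTR_KEEP_SYS_FLAG preserves the logged
+	DB_TRX_ID,DB_ROLL_PTR values. */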
+ static const ulint flags
+ = (BTR_CREATE_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG);
+
+ entry = row_build_index_entry(row, NULL, index, heap);
+
+ error = row_ins_clust_index_entry_low(
+ flags, BTR_MODIFY_TREE, index, index->n_uniq,
+ entry, 0, thr);
+
+ switch (error) {
+ case DB_SUCCESS:
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ /* The row had already been copied to the table. */
+ return(DB_SUCCESS);
+ default:
+ return(error);
+ }
+
+ ut_ad(dict_index_is_clust(index));
+
+ for (n_index += index->type != DICT_CLUSTERED;
+ (index = dict_table_get_next_index(index)); n_index++) {
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ entry = row_build_index_entry(row, NULL, index, heap);
+ error = row_ins_sec_index_entry_low(
+ flags, BTR_MODIFY_TREE,
+ index, offsets_heap, heap, entry,
+ thr_get_trx(thr)->id, thr);
+
+ if (error != DB_SUCCESS) {
+ if (error == DB_DUPLICATE_KEY) {
+ thr_get_trx(thr)->error_key_num = n_index;
+ }
+ break;
+ }
+ }
+
+ return(error);
+}
+
+/******************************************************//**
+Replays an insert operation on a table that was rebuilt.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_insert(
+/*=======================*/
+ que_thr_t* thr, /*!< in: query graph */
+ const mrec_t* mrec, /*!< in: record to insert */
+ const rec_offs* offsets, /*!< in: offsets of mrec */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ row_merge_dup_t* dup) /*!< in/out: for reporting
+ duplicate key errors */
+{
+	row_log_t* log = dup->index->online_log;
+ dberr_t error;
+ const dtuple_t* row = row_log_table_apply_convert_mrec(
+ mrec, dup->index, offsets, log, heap, &error);
+
+ switch (error) {
+ case DB_MISSING_HISTORY:
+ ut_ad(log->blobs);
+ /* Because some BLOBs are missing, we know that the
+ transaction was rolled back later (a rollback of
+ an insert can free BLOBs).
+ We can simply skip the insert: the subsequent
+ ROW_T_DELETE will be ignored, or a ROW_T_UPDATE will
+ be interpreted as ROW_T_INSERT. */
+ return(DB_SUCCESS);
+ case DB_SUCCESS:
+ ut_ad(row != NULL);
+ break;
+ default:
+ ut_ad(0);
+ /* fall through */
+ case DB_INVALID_NULL:
+ ut_ad(row == NULL);
+ return(error);
+ }
+
+ error = row_log_table_apply_insert_low(
+ thr, row, offsets_heap, heap, dup);
+ if (error != DB_SUCCESS) {
+ /* Report the erroneous row using the new
+ version of the table. */
+ innobase_row_to_mysql(dup->table, log->table, row);
+ }
+ return(error);
+}
+
+/******************************************************//**
+Deletes a record from a table that is being rebuilt.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_delete_low(
+/*===========================*/
+ btr_pcur_t* pcur, /*!< in/out: B-tree cursor,
+ will be trashed */
+ const rec_offs* offsets, /*!< in: offsets on pcur */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ mtr_t* mtr) /*!< in/out: mini-transaction,
+ will be committed */
+{
+ dberr_t error;
+ row_ext_t* ext;
+ dtuple_t* row;
+ dict_index_t* index = btr_pcur_get_btr_cur(pcur)->index;
+
+ ut_ad(dict_index_is_clust(index));
+
+ DBUG_LOG("ib_alter_table",
+ "delete table " << index->table->id << " (index "
+ << index->id << "): "
+ << rec_printer(btr_pcur_get_rec(pcur), offsets).str());
+
+ if (dict_table_get_next_index(index)) {
+ /* Build a row template for purging secondary index entries. */
+ row = row_build(
+ ROW_COPY_DATA, index, btr_pcur_get_rec(pcur),
+ offsets, NULL, NULL, NULL, &ext, heap);
+ } else {
+ row = NULL;
+ }
+
+ btr_cur_pessimistic_delete(&error, FALSE, btr_pcur_get_btr_cur(pcur),
+ BTR_CREATE_FLAG, false, mtr);
+ mtr_commit(mtr);
+
+ if (error != DB_SUCCESS) {
+ return(error);
+ }
+
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ const dtuple_t* entry = row_build_index_entry(
+ row, ext, index, heap);
+ mtr->start();
+ index->set_modified(*mtr);
+ btr_pcur_open(index, entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+ pcur, mtr);
+#ifdef UNIV_DEBUG
+ switch (btr_pcur_get_btr_cur(pcur)->flag) {
+ case BTR_CUR_DELETE_REF:
+ case BTR_CUR_DEL_MARK_IBUF:
+ case BTR_CUR_DELETE_IBUF:
+ case BTR_CUR_INSERT_TO_IBUF:
+ /* We did not request buffering. */
+ break;
+ case BTR_CUR_HASH:
+ case BTR_CUR_HASH_FAIL:
+ case BTR_CUR_BINARY:
+ goto flag_ok;
+ }
+ ut_ad(0);
+flag_ok:
+#endif /* UNIV_DEBUG */
+
+ if (page_rec_is_infimum(btr_pcur_get_rec(pcur))
+ || btr_pcur_get_low_match(pcur) < index->n_uniq) {
+ /* All secondary index entries should be
+ found, because new_table is being modified by
+ this thread only, and all indexes should be
+ updated in sync. */
+ mtr->commit();
+ return(DB_INDEX_CORRUPT);
+ }
+
+ btr_cur_pessimistic_delete(&error, FALSE,
+ btr_pcur_get_btr_cur(pcur),
+ BTR_CREATE_FLAG, false, mtr);
+ mtr->commit();
+ }
+
+ return(error);
+}
+
+/******************************************************//**
+Replays a delete operation on a table that was rebuilt.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_delete(
+/*=======================*/
+ ulint trx_id_col, /*!< in: position of
+ DB_TRX_ID in the new
+ clustered index */
+ const mrec_t* mrec, /*!< in: merge record */
+ const rec_offs* moffsets, /*!< in: offsets of mrec */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ const row_log_t* log) /*!< in: online log */
+{
+ dict_table_t* new_table = log->table;
+ dict_index_t* index = dict_table_get_first_index(new_table);
+ dtuple_t* old_pk;
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ rec_offs* offsets;
+
+ ut_ad(rec_offs_n_fields(moffsets) == index->first_user_field());
+ ut_ad(!rec_offs_any_extern(moffsets));
+
+ /* Convert the row to a search tuple. */
+ old_pk = dtuple_create(heap, index->n_uniq);
+ dict_index_copy_types(old_pk, index, index->n_uniq);
+
+ for (ulint i = 0; i < index->n_uniq; i++) {
+ ulint len;
+ const void* field;
+ field = rec_get_nth_field(mrec, moffsets, i, &len);
+ ut_ad(len != UNIV_SQL_NULL);
+ dfield_set_data(dtuple_get_nth_field(old_pk, i),
+ field, len);
+ }
+
+ mtr_start(&mtr);
+ index->set_modified(mtr);
+ btr_pcur_open(index, old_pk, PAGE_CUR_LE,
+ BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+ &pcur, &mtr);
+#ifdef UNIV_DEBUG
+ switch (btr_pcur_get_btr_cur(&pcur)->flag) {
+ case BTR_CUR_DELETE_REF:
+ case BTR_CUR_DEL_MARK_IBUF:
+ case BTR_CUR_DELETE_IBUF:
+ case BTR_CUR_INSERT_TO_IBUF:
+ /* We did not request buffering. */
+ break;
+ case BTR_CUR_HASH:
+ case BTR_CUR_HASH_FAIL:
+ case BTR_CUR_BINARY:
+ goto flag_ok;
+ }
+ ut_ad(0);
+flag_ok:
+#endif /* UNIV_DEBUG */
+
+ if (page_rec_is_infimum(btr_pcur_get_rec(&pcur))
+ || btr_pcur_get_low_match(&pcur) < index->n_uniq) {
+all_done:
+ mtr_commit(&mtr);
+ /* The record was not found. All done. */
+ /* This should only happen when an earlier
+ ROW_T_INSERT was skipped or
+ ROW_T_UPDATE was interpreted as ROW_T_DELETE
+ due to BLOBs having been freed by rollback. */
+ return(DB_SUCCESS);
+ }
+
+ offsets = rec_get_offsets(btr_pcur_get_rec(&pcur), index, nullptr,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &offsets_heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(btr_pcur_get_rec(&pcur), offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ /* Only remove the record if DB_TRX_ID,DB_ROLL_PTR match. */
+
+ {
+ ulint len;
+ const byte* mrec_trx_id
+ = rec_get_nth_field(mrec, moffsets, trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ const byte* rec_trx_id
+ = rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets,
+ trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_d(trx_id_check(rec_trx_id, log->min_trx));
+ ut_d(trx_id_check(mrec_trx_id, log->min_trx));
+
+ ut_ad(rec_get_nth_field(mrec, moffsets, trx_id_col + 1, &len)
+ == mrec_trx_id + DATA_TRX_ID_LEN);
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ ut_ad(rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets,
+ trx_id_col + 1, &len)
+ == rec_trx_id + DATA_TRX_ID_LEN);
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ if (memcmp(mrec_trx_id, rec_trx_id,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) {
+ /* The ROW_T_DELETE was logged for a different
+ PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR.
+ This is possible if a ROW_T_INSERT was skipped
+ or a ROW_T_UPDATE was interpreted as ROW_T_DELETE
+ because some BLOBs were missing due to
+ (1) rolling back the initial insert, or
+ (2) purging the BLOB for a later ROW_T_DELETE
+ (3) purging 'old values' for a later ROW_T_UPDATE
+ or ROW_T_DELETE. */
+ ut_ad(!log->same_pk);
+ goto all_done;
+ }
+ }
+
+ return row_log_table_apply_delete_low(&pcur, offsets, heap, &mtr);
+}
+
+/******************************************************//**
+Replays an update operation on a table that was rebuilt.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_update(
+/*=======================*/
+ que_thr_t* thr, /*!< in: query graph */
+ ulint new_trx_id_col, /*!< in: position of
+ DB_TRX_ID in the new
+ clustered index */
+ const mrec_t* mrec, /*!< in: new value */
+ const rec_offs* offsets, /*!< in: offsets of mrec */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ row_merge_dup_t* dup, /*!< in/out: for reporting
+ duplicate key errors */
+ const dtuple_t* old_pk) /*!< in: PRIMARY KEY and
+ DB_TRX_ID,DB_ROLL_PTR
+ of the old value,
+ or PRIMARY KEY if same_pk */
+{
+ row_log_t* log = dup->index->online_log;
+ const dtuple_t* row;
+ dict_index_t* index = dict_table_get_first_index(log->table);
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ dberr_t error;
+ ulint n_index = 0;
+
+ ut_ad(dtuple_get_n_fields_cmp(old_pk)
+ == dict_index_get_n_unique(index));
+ ut_ad(dtuple_get_n_fields(old_pk) - (log->same_pk ? 0 : 2)
+ == dict_index_get_n_unique(index));
+
+ row = row_log_table_apply_convert_mrec(
+ mrec, dup->index, offsets, log, heap, &error);
+
+ switch (error) {
+ case DB_MISSING_HISTORY:
+ /* The record contained BLOBs that are now missing. */
+ ut_ad(log->blobs);
+ /* Whether or not we are updating the PRIMARY KEY, we
+ know that there should be a subsequent
+ ROW_T_DELETE for rolling back a preceding ROW_T_INSERT,
+ overriding this ROW_T_UPDATE record. (*1)
+
+ This allows us to interpret this ROW_T_UPDATE
+ as ROW_T_DELETE.
+
+ When applying the subsequent ROW_T_DELETE, no matching
+ record will be found. */
+ /* fall through */
+ case DB_SUCCESS:
+ ut_ad(row != NULL);
+ break;
+ default:
+ ut_ad(0);
+ /* fall through */
+ case DB_INVALID_NULL:
+ ut_ad(row == NULL);
+ return(error);
+ }
+
+ mtr_start(&mtr);
+ index->set_modified(mtr);
+ btr_pcur_open(index, old_pk, PAGE_CUR_LE,
+ BTR_MODIFY_TREE, &pcur, &mtr);
+#ifdef UNIV_DEBUG
+ switch (btr_pcur_get_btr_cur(&pcur)->flag) {
+ case BTR_CUR_DELETE_REF:
+ case BTR_CUR_DEL_MARK_IBUF:
+ case BTR_CUR_DELETE_IBUF:
+ case BTR_CUR_INSERT_TO_IBUF:
+ ut_ad(0);/* We did not request buffering. */
+ case BTR_CUR_HASH:
+ case BTR_CUR_HASH_FAIL:
+ case BTR_CUR_BINARY:
+ break;
+ }
+#endif /* UNIV_DEBUG */
+
+ if (page_rec_is_infimum(btr_pcur_get_rec(&pcur))
+ || btr_pcur_get_low_match(&pcur) < index->n_uniq) {
+ /* The record was not found. This should only happen
+ when an earlier ROW_T_INSERT or ROW_T_UPDATE was
+ diverted because BLOBs were freed when the insert was
+ later rolled back. */
+
+ ut_ad(log->blobs);
+
+ if (error == DB_SUCCESS) {
+ /* An earlier ROW_T_INSERT could have been
+ skipped because of a missing BLOB, like this:
+
+ BEGIN;
+ INSERT INTO t SET blob_col='blob value';
+ UPDATE t SET blob_col='';
+ ROLLBACK;
+
+ This would generate the following records:
+ ROW_T_INSERT (referring to 'blob value')
+ ROW_T_UPDATE
+ ROW_T_UPDATE (referring to 'blob value')
+ ROW_T_DELETE
+ [ROLLBACK removes the 'blob value']
+
+ The ROW_T_INSERT would have been skipped
+ because of a missing BLOB. Now we are
+ executing the first ROW_T_UPDATE.
+ The second ROW_T_UPDATE (for the ROLLBACK)
+ would be interpreted as ROW_T_DELETE, because
+ the BLOB would be missing.
+
+ We could probably assume that the transaction
+ has been rolled back and simply skip the
+ 'insert' part of this ROW_T_UPDATE record.
+ However, there might be some complex scenario
+ that could interfere with such a shortcut.
+ So, we will insert the row (and risk
+ introducing a bogus duplicate key error
+ for the ALTER TABLE), and a subsequent
+ ROW_T_UPDATE or ROW_T_DELETE will delete it. */
+ mtr_commit(&mtr);
+ error = row_log_table_apply_insert_low(
+ thr, row, offsets_heap, heap, dup);
+ } else {
+ /* Some BLOBs are missing, so we are interpreting
+ this ROW_T_UPDATE as ROW_T_DELETE (see *1).
+ Because the record was not found, we do nothing. */
+ ut_ad(error == DB_MISSING_HISTORY);
+ error = DB_SUCCESS;
+func_exit:
+ mtr_commit(&mtr);
+ }
+func_exit_committed:
+ ut_ad(mtr.has_committed());
+
+ if (error != DB_SUCCESS) {
+ /* Report the erroneous row using the new
+ version of the table. */
+ innobase_row_to_mysql(dup->table, log->table, row);
+ }
+
+ return(error);
+ }
+
+ /* Prepare to update (or delete) the record. */
+ rec_offs* cur_offsets = rec_get_offsets(
+ btr_pcur_get_rec(&pcur), index, nullptr, index->n_core_fields,
+ ULINT_UNDEFINED, &offsets_heap);
+
+ if (!log->same_pk) {
+ /* Only update the record if DB_TRX_ID,DB_ROLL_PTR match what
+ was buffered. */
+ ulint len;
+ const byte* rec_trx_id
+ = rec_get_nth_field(btr_pcur_get_rec(&pcur),
+ cur_offsets, index->n_uniq, &len);
+ const dfield_t* old_pk_trx_id
+ = dtuple_get_nth_field(old_pk, index->n_uniq);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_d(trx_id_check(rec_trx_id, log->min_trx));
+ ut_ad(old_pk_trx_id->len == DATA_TRX_ID_LEN);
+ ut_ad(old_pk_trx_id[1].len == DATA_ROLL_PTR_LEN);
+ ut_ad(DATA_TRX_ID_LEN
+ + static_cast<const char*>(old_pk_trx_id->data)
+ == old_pk_trx_id[1].data);
+ ut_d(trx_id_check(old_pk_trx_id->data, log->min_trx));
+
+ if (memcmp(rec_trx_id, old_pk_trx_id->data,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) {
+ /* The ROW_T_UPDATE was logged for a different
+ DB_TRX_ID,DB_ROLL_PTR. This is possible if an
+ earlier ROW_T_INSERT or ROW_T_UPDATE was diverted
+ because some BLOBs were missing due to rolling
+ back the initial insert or due to purging
+ the old BLOB values of an update. */
+ ut_ad(log->blobs);
+ if (error != DB_SUCCESS) {
+ ut_ad(error == DB_MISSING_HISTORY);
+ /* Some BLOBs are missing, so we are
+ interpreting this ROW_T_UPDATE as
+ ROW_T_DELETE (see *1).
+ Because this is a different row,
+ we will do nothing. */
+ error = DB_SUCCESS;
+ } else {
+ /* Because the user record is missing due to
+ BLOBs that were missing when processing
+ an earlier log record, we should
+ interpret the ROW_T_UPDATE as ROW_T_INSERT.
+ However, there is a different user record
+ with the same PRIMARY KEY value already. */
+ error = DB_DUPLICATE_KEY;
+ }
+
+ goto func_exit;
+ }
+ }
+
+ if (error != DB_SUCCESS) {
+ ut_ad(error == DB_MISSING_HISTORY);
+ ut_ad(log->blobs);
+ /* Some BLOBs are missing, so we are interpreting
+ this ROW_T_UPDATE as ROW_T_DELETE (see *1). */
+ error = row_log_table_apply_delete_low(
+ &pcur, cur_offsets, heap, &mtr);
+ goto func_exit_committed;
+ }
+
+ dtuple_t* entry = row_build_index_entry_low(
+ row, NULL, index, heap, ROW_BUILD_NORMAL);
+ upd_t* update = row_upd_build_difference_binary(
+ index, entry, btr_pcur_get_rec(&pcur), cur_offsets,
+ false, NULL, heap, dup->table, &error);
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (!update->n_fields) {
+ /* Nothing to do. */
+ goto func_exit;
+ }
+
+ const bool pk_updated
+ = upd_get_nth_field(update, 0)->field_no < new_trx_id_col;
+
+ if (pk_updated || rec_offs_any_extern(cur_offsets)) {
+ /* If the record contains any externally stored
+ columns, perform the update by delete and insert,
+ because we will not write any undo log that would
+ allow purge to free any orphaned externally stored
+ columns. */
+
+ if (pk_updated && log->same_pk) {
+ /* The ROW_T_UPDATE log record should only be
+ written when the PRIMARY KEY fields of the
+ record did not change in the old table. We
+ can only get a change of PRIMARY KEY columns
+ in the rebuilt table if the PRIMARY KEY was
+ redefined (!same_pk). */
+ ut_ad(0);
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ error = row_log_table_apply_delete_low(
+ &pcur, cur_offsets, heap, &mtr);
+ ut_ad(mtr.has_committed());
+
+ if (error == DB_SUCCESS) {
+ error = row_log_table_apply_insert_low(
+ thr, row, offsets_heap, heap, dup);
+ }
+
+ goto func_exit_committed;
+ }
+
+ dtuple_t* old_row;
+ row_ext_t* old_ext;
+
+ if (dict_table_get_next_index(index)) {
+ /* Construct the row corresponding to the old value of
+ the record. */
+ old_row = row_build(
+ ROW_COPY_DATA, index, btr_pcur_get_rec(&pcur),
+ cur_offsets, NULL, NULL, NULL, &old_ext, heap);
+ ut_ad(old_row);
+
+ DBUG_LOG("ib_alter_table",
+ "update table " << index->table->id
+ << " (index " << index->id
+ << ": " << rec_printer(old_row).str()
+ << " to " << rec_printer(row).str());
+ } else {
+ old_row = NULL;
+ old_ext = NULL;
+ }
+
+ big_rec_t* big_rec;
+
+ error = btr_cur_pessimistic_update(
+ BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG
+ | BTR_KEEP_POS_FLAG,
+ btr_pcur_get_btr_cur(&pcur),
+ &cur_offsets, &offsets_heap, heap, &big_rec,
+ update, 0, thr, 0, &mtr);
+
+ if (big_rec) {
+ if (error == DB_SUCCESS) {
+ error = btr_store_big_rec_extern_fields(
+ &pcur, cur_offsets, big_rec, &mtr,
+ BTR_STORE_UPDATE);
+ }
+
+ dtuple_big_rec_free(big_rec);
+ }
+
+ for (n_index += index->type != DICT_CLUSTERED;
+ (index = dict_table_get_next_index(index)); n_index++) {
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ if (!row_upd_changes_ord_field_binary(
+ index, update, thr, old_row, NULL)) {
+ continue;
+ }
+
+ if (dict_index_has_virtual(index)) {
+ dtuple_copy_v_fields(old_row, old_pk);
+ }
+
+ mtr_commit(&mtr);
+
+ entry = row_build_index_entry(old_row, old_ext, index, heap);
+ if (!entry) {
+ ut_ad(0);
+ return(DB_CORRUPTION);
+ }
+
+ mtr_start(&mtr);
+ index->set_modified(mtr);
+
+ if (ROW_FOUND != row_search_index_entry(
+ index, entry, BTR_MODIFY_TREE, &pcur, &mtr)) {
+ ut_ad(0);
+ error = DB_CORRUPTION;
+ break;
+ }
+
+ btr_cur_pessimistic_delete(
+ &error, FALSE, btr_pcur_get_btr_cur(&pcur),
+ BTR_CREATE_FLAG, false, &mtr);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ mtr_commit(&mtr);
+
+ entry = row_build_index_entry(row, NULL, index, heap);
+ error = row_ins_sec_index_entry_low(
+ BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG,
+ BTR_MODIFY_TREE, index, offsets_heap, heap,
+ entry, thr_get_trx(thr)->id, thr);
+
+ /* Report correct index name for duplicate key error. */
+ if (error == DB_DUPLICATE_KEY) {
+ thr_get_trx(thr)->error_key_num = n_index;
+ }
+
+ mtr_start(&mtr);
+ index->set_modified(mtr);
+ }
+
+ goto func_exit;
+}
+
+/******************************************************//**
+Applies an operation to a table that was rebuilt.
+@return NULL on failure (mrec corruption) or when out of data;
+pointer to next record on success */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+const mrec_t*
+row_log_table_apply_op(
+/*===================*/
+ que_thr_t* thr, /*!< in: query graph */
+ ulint new_trx_id_col, /*!< in: position of
+ DB_TRX_ID in new index */
+ row_merge_dup_t* dup, /*!< in/out: for reporting
+ duplicate key errors */
+ dberr_t* error, /*!< out: DB_SUCCESS
+ or error code */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ const mrec_t* mrec, /*!< in: merge record */
+ const mrec_t* mrec_end, /*!< in: end of buffer */
+ rec_offs* offsets) /*!< in/out: work area
+ for parsing mrec */
+{
+ row_log_t* log = dup->index->online_log;
+ dict_index_t* new_index = dict_table_get_first_index(log->table);
+ ulint extra_size;
+ const mrec_t* next_mrec;
+ dtuple_t* old_pk;
+
+ ut_ad(dict_index_is_clust(dup->index));
+ ut_ad(dup->index->table != log->table);
+ ut_ad(log->head.total <= log->tail.total);
+
+ *error = DB_SUCCESS;
+
+ /* 3 = 1 (op type) + 1 (extra_size) + at least 1 byte payload */
+ if (mrec + 3 >= mrec_end) {
+ return(NULL);
+ }
+
+ const bool is_instant = log->is_instant(dup->index);
+ const mrec_t* const mrec_start = mrec;
+
+ switch (*mrec++) {
+ default:
+ ut_ad(0);
+ *error = DB_CORRUPTION;
+ return(NULL);
+ case ROW_T_INSERT:
+ extra_size = *mrec++;
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *mrec++;
+ }
+
+ mrec += extra_size;
+
+ ut_ad(extra_size || !is_instant);
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
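+		/* row_log_table_low() counted the record status byte
+		(and any fake instant header) in extra_size, so
+		*(mrec - extra_size) is the status byte that was
+		logged for an instant-format record. */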
+ rec_offs_set_n_fields(offsets, dup->index->n_fields);
+ rec_init_offsets_temp(mrec, dup->index, offsets,
+ log->n_core_fields, log->non_core_fields,
+ is_instant
+ ? static_cast<rec_comp_status_t>(
+ *(mrec - extra_size))
+ : REC_STATUS_ORDINARY);
+
+ next_mrec = mrec + rec_offs_data_size(offsets);
+
+ if (next_mrec > mrec_end) {
+ return(NULL);
+ } else {
+ log->head.total += ulint(next_mrec - mrec_start);
+ *error = row_log_table_apply_insert(
+ thr, mrec, offsets, offsets_heap,
+ heap, dup);
+ }
+ break;
+
+ case ROW_T_DELETE:
+ /* 1 (extra_size) + at least 1 (payload) */
+ if (mrec + 2 >= mrec_end) {
+ return(NULL);
+ }
+
+ extra_size = *mrec++;
+ ut_ad(mrec < mrec_end);
+
+		/* We assume extra_size < 0x100 for the PRIMARY KEY prefix.
+		For fixed-length PRIMARY KEY columns, it is 0. */
+ mrec += extra_size;
+
+ /* The ROW_T_DELETE record was converted by
+ rec_convert_dtuple_to_temp() using new_index. */
+ ut_ad(!new_index->is_instant());
+ rec_offs_set_n_fields(offsets, new_index->first_user_field());
+ rec_init_offsets_temp(mrec, new_index, offsets);
+ next_mrec = mrec + rec_offs_data_size(offsets);
+ if (next_mrec > mrec_end) {
+ return(NULL);
+ }
+
+ log->head.total += ulint(next_mrec - mrec_start);
+
+ *error = row_log_table_apply_delete(
+ new_trx_id_col,
+ mrec, offsets, offsets_heap, heap, log);
+ break;
+
+ case ROW_T_UPDATE:
+		/* Logically, the log entry consists of the
+		(PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR) of the old value
+		(converted to the new primary key definition) followed
+		by the new value in the old table definition. If the
+		definition of the columns belonging to the PRIMARY KEY
+		is not changed, the log will only contain
+		DB_TRX_ID,new_row. */
+
+ if (log->same_pk) {
+ ut_ad(new_index->n_uniq == dup->index->n_uniq);
+
+ extra_size = *mrec++;
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *mrec++;
+ }
+
+ mrec += extra_size;
+
+ ut_ad(extra_size || !is_instant);
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ rec_offs_set_n_fields(offsets, dup->index->n_fields);
+ rec_init_offsets_temp(mrec, dup->index, offsets,
+ log->n_core_fields,
+ log->non_core_fields,
+ is_instant
+ ? static_cast<rec_comp_status_t>(
+ *(mrec - extra_size))
+ : REC_STATUS_ORDINARY);
+
+ next_mrec = mrec + rec_offs_data_size(offsets);
+
+ if (next_mrec > mrec_end) {
+ return(NULL);
+ }
+
+ old_pk = dtuple_create(heap, new_index->n_uniq);
+ dict_index_copy_types(
+ old_pk, new_index, old_pk->n_fields);
+
+ /* Copy the PRIMARY KEY fields from mrec to old_pk. */
+ for (ulint i = 0; i < new_index->n_uniq; i++) {
+ const void* field;
+ ulint len;
+ dfield_t* dfield;
+
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+
+ field = rec_get_nth_field(
+ mrec, offsets, i, &len);
+ ut_ad(len != UNIV_SQL_NULL);
+
+ dfield = dtuple_get_nth_field(old_pk, i);
+ dfield_set_data(dfield, field, len);
+ }
+ } else {
+ /* We assume extra_size < 0x100
+ for the PRIMARY KEY prefix. */
+ mrec += *mrec + 1;
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ /* Get offsets for PRIMARY KEY,
+ DB_TRX_ID, DB_ROLL_PTR. */
+ /* The old_pk prefix was converted by
+ rec_convert_dtuple_to_temp() using new_index. */
+ ut_ad(!new_index->is_instant());
+ rec_offs_set_n_fields(offsets,
+ new_index->first_user_field());
+ rec_init_offsets_temp(mrec, new_index, offsets);
+
+ next_mrec = mrec + rec_offs_data_size(offsets);
+ if (next_mrec + 2 > mrec_end) {
+ return(NULL);
+ }
+
+ /* Copy the PRIMARY KEY fields and
+ DB_TRX_ID, DB_ROLL_PTR from mrec to old_pk. */
+ old_pk = dtuple_create(heap,
+ new_index->first_user_field());
+ dict_index_copy_types(old_pk, new_index,
+ old_pk->n_fields);
+
+ for (ulint i = 0; i < new_index->first_user_field();
+ i++) {
+ const void* field;
+ ulint len;
+ dfield_t* dfield;
+
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+
+ field = rec_get_nth_field(
+ mrec, offsets, i, &len);
+ ut_ad(len != UNIV_SQL_NULL);
+
+ dfield = dtuple_get_nth_field(old_pk, i);
+ dfield_set_data(dfield, field, len);
+ }
+
+ mrec = next_mrec;
+
+ /* Fetch the new value of the row as it was
+ in the old table definition. */
+ extra_size = *mrec++;
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *mrec++;
+ }
+
+ mrec += extra_size;
+
+ ut_ad(extra_size || !is_instant);
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ rec_offs_set_n_fields(offsets, dup->index->n_fields);
+ rec_init_offsets_temp(mrec, dup->index, offsets,
+ log->n_core_fields,
+ log->non_core_fields,
+ is_instant
+ ? static_cast<rec_comp_status_t>(
+ *(mrec - extra_size))
+ : REC_STATUS_ORDINARY);
+
+ next_mrec = mrec + rec_offs_data_size(offsets);
+
+ if (next_mrec > mrec_end) {
+ return(NULL);
+ }
+ }
+
+ ut_ad(next_mrec <= mrec_end);
+ log->head.total += ulint(next_mrec - mrec_start);
+ dtuple_set_n_fields_cmp(old_pk, new_index->n_uniq);
+
+ *error = row_log_table_apply_update(
+ thr, new_trx_id_col,
+ mrec, offsets, offsets_heap, heap, dup, old_pk);
+ break;
+ }
+
+ ut_ad(log->head.total <= log->tail.total);
+ mem_heap_empty(offsets_heap);
+ mem_heap_empty(heap);
+ return(next_mrec);
+}
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+/** Estimate by how much the ALTER TABLE progress should be incremented
+per block of log applied.
+For the other phases of ALTER TABLE we increment the progress by 1 per
+page processed.
+@return number of abstract units to add to work_completed when one block
+of log is applied
+*/
+inline
+ulint
+row_log_progress_inc_per_block()
+{
+	/* We must increment the progress once per page processed
+	(one page is srv_page_size bytes; innodb_page_size defaults
+	to 16KiB). One block here is srv_sort_buf_size
+	(usually 1MiB). */
+ const ulint pages_per_block = std::max<ulint>(
+ ulint(srv_sort_buf_size >> srv_page_size_shift), 1);
+
+	/* Multiply by an artificial factor of 6 to even the pace with
+	the rest of the ALTER TABLE phases, which process a page-sized
+	amount of data faster. */
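+	/* For example (plain arithmetic, not a value from the source):
+	with the default 16KiB page size and srv_sort_buf_size = 1MiB,
+	pages_per_block = 64 and this function returns 64 * 6 = 384. */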
+ return(pages_per_block * 6);
+}
+
+/** Estimate how much work is to be done by the log apply phase
+of an ALTER TABLE for this index.
+@param[in] index index whose log to assess
+@return work to be done by log-apply in abstract units
+*/
+ulint
+row_log_estimate_work(
+ const dict_index_t* index)
+{
+ if (index == NULL || index->online_log == NULL) {
+ return(0);
+ }
+
+ const row_log_t* l = index->online_log;
+ const ulint bytes_left =
+ static_cast<ulint>(l->tail.total - l->head.total);
+ const ulint blocks_left = bytes_left / srv_sort_buf_size;
+
+ return(blocks_left * row_log_progress_inc_per_block());
+}
+#else /* HAVE_PSI_STAGE_INTERFACE */
+inline
+ulint
+row_log_progress_inc_per_block()
+{
+ return(0);
+}
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+/** Applies operations to a table that was rebuilt.
+@param[in] thr query graph
+@param[in,out] dup for reporting duplicate key errors
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL, then stage->inc() will be called for each block
+of log that is applied.
+@return DB_SUCCESS, or error code on failure */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_log_table_apply_ops(
+ que_thr_t* thr,
+ row_merge_dup_t* dup,
+ ut_stage_alter_t* stage)
+{
+ dberr_t error;
+ const mrec_t* mrec = NULL;
+ const mrec_t* next_mrec;
+ const mrec_t* mrec_end = NULL; /* silence bogus warning */
+ const mrec_t* next_mrec_end;
+ mem_heap_t* heap;
+ mem_heap_t* offsets_heap;
+ rec_offs* offsets;
+ bool has_index_lock;
+ dict_index_t* index = const_cast<dict_index_t*>(
+ dup->index);
+ dict_table_t* new_table = index->online_log->table;
+ dict_index_t* new_index = dict_table_get_first_index(
+ new_table);
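+	/* Size the offsets work area for whichever record format has
+	more fields: a logged row in the old table format, or a
+	PRIMARY KEY prefix with DB_TRX_ID,DB_ROLL_PTR in the new
+	table format. */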
+ const ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + std::max<ulint>(index->n_fields,
+ new_index->first_user_field());
+ const ulint new_trx_id_col = dict_col_get_clust_pos(
+ dict_table_get_sys_col(new_table, DATA_TRX_ID), new_index);
+ trx_t* trx = thr_get_trx(thr);
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(trx->mysql_thd);
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+ ut_ad(!dict_index_is_online_ddl(new_index));
+ ut_ad(dict_col_get_clust_pos(
+ dict_table_get_sys_col(index->table, DATA_TRX_ID), index)
+ != ULINT_UNDEFINED);
+ ut_ad(new_trx_id_col > 0);
+ ut_ad(new_trx_id_col != ULINT_UNDEFINED);
+
+ MEM_UNDEFINED(&mrec_end, sizeof mrec_end);
+
+ offsets = static_cast<rec_offs*>(ut_malloc_nokey(i * sizeof *offsets));
+ rec_offs_set_n_alloc(offsets, i);
+ rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index));
+
+ heap = mem_heap_create(srv_page_size);
+ offsets_heap = mem_heap_create(srv_page_size);
+ has_index_lock = true;
+
+next_block:
+ ut_ad(has_index_lock);
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+ ut_ad(index->online_log->head.bytes == 0);
+
+ stage->inc(row_log_progress_inc_per_block());
+
+ if (trx_is_interrupted(trx)) {
+ goto interrupted;
+ }
+
+ if (index->is_corrupted()) {
+ error = DB_INDEX_CORRUPT;
+ goto func_exit;
+ }
+
+ ut_ad(dict_index_is_online_ddl(index));
+
+ error = index->online_log->error;
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(index->online_log->head.blocks
+ > index->online_log->tail.blocks)) {
+unexpected_eof:
+ ib::error() << "Unexpected end of temporary file for table "
+ << index->table->name;
+corruption:
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ if (index->online_log->head.blocks
+ == index->online_log->tail.blocks) {
+ if (index->online_log->head.blocks) {
+#ifdef HAVE_FTRUNCATE
+ /* Truncate the file in order to save space. */
+ if (index->online_log->fd > 0
+ && ftruncate(index->online_log->fd, 0) == -1) {
+ ib::error()
+ << "\'" << index->name + 1
+ << "\' failed with error "
+ << errno << ":" << strerror(errno);
+
+ goto corruption;
+ }
+#endif /* HAVE_FTRUNCATE */
+ index->online_log->head.blocks
+ = index->online_log->tail.blocks = 0;
+ }
+
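+		/* The applied blocks have caught up with the written
+		ones; parse the remaining bytes directly from the
+		in-memory tail block while still holding index->lock. */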
+ next_mrec = index->online_log->tail.block;
+ next_mrec_end = next_mrec + index->online_log->tail.bytes;
+
+ if (next_mrec_end == next_mrec) {
+ /* End of log reached. */
+all_done:
+ ut_ad(has_index_lock);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->tail.blocks == 0);
+ index->online_log->head.bytes = 0;
+ index->online_log->tail.bytes = 0;
+ error = DB_SUCCESS;
+ goto func_exit;
+ }
+ } else {
+ os_offset_t ofs;
+
+ ofs = (os_offset_t) index->online_log->head.blocks
+ * srv_sort_buf_size;
+
+ ut_ad(has_index_lock);
+ has_index_lock = false;
+ rw_lock_x_unlock(dict_index_get_lock(index));
+
+ log_free_check();
+
+ ut_ad(dict_index_is_online_ddl(index));
+
+ if (!row_log_block_allocate(index->online_log->head)) {
+ error = DB_OUT_OF_MEMORY;
+ goto func_exit;
+ }
+
+ byte* buf = index->online_log->head.block;
+
+ if (os_file_read_no_error_handling(
+ IORequestRead, index->online_log->fd,
+ buf, ofs, srv_sort_buf_size, 0) != DB_SUCCESS) {
+ ib::error()
+ << "Unable to read temporary file"
+ " for table " << index->table->name;
+ goto corruption;
+ }
+
+ if (log_tmp_is_encrypted()) {
+ if (!log_tmp_block_decrypt(
+ buf, srv_sort_buf_size,
+ index->online_log->crypt_head, ofs)) {
+ error = DB_DECRYPTION_FAILED;
+ goto func_exit;
+ }
+
+ srv_stats.n_rowlog_blocks_decrypted.inc();
+ memcpy(buf, index->online_log->crypt_head,
+ srv_sort_buf_size);
+ }
+
+#ifdef POSIX_FADV_DONTNEED
+ /* Each block is read exactly once. Free up the file cache. */
+ posix_fadvise(index->online_log->fd,
+ ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+ next_mrec = index->online_log->head.block;
+ next_mrec_end = next_mrec + srv_sort_buf_size;
+ }
+
+ /* This read is not protected by index->online_log->mutex for
+ performance reasons. We will eventually notice any error that
+ was flagged by a DML thread. */
+ error = index->online_log->error;
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (mrec) {
+ /* A partial record was read from the previous block.
+ Copy the temporary buffer full, as we do not know the
+ length of the record. Parse subsequent records from
+ the bigger buffer index->online_log->head.block
+ or index->online_log->tail.block. */
+
+ ut_ad(mrec == index->online_log->head.buf);
+ ut_ad(mrec_end > mrec);
+ ut_ad(mrec_end < (&index->online_log->head.buf)[1]);
+
+ memcpy((mrec_t*) mrec_end, next_mrec,
+ ulint((&index->online_log->head.buf)[1] - mrec_end));
+ mrec = row_log_table_apply_op(
+ thr, new_trx_id_col,
+ dup, &error, offsets_heap, heap,
+ index->online_log->head.buf,
+ (&index->online_log->head.buf)[1], offsets);
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ } else if (UNIV_UNLIKELY(mrec == NULL)) {
+ /* The record was not reassembled properly. */
+ goto corruption;
+ }
+		/* The record was previously found to be truncated.
+		Now that the parse buffer has been extended, parsing
+		should proceed beyond the old end of the buffer. */
+ ut_a(mrec > mrec_end);
+
+ index->online_log->head.bytes = ulint(mrec - mrec_end);
+ next_mrec += index->online_log->head.bytes;
+ }
+
+ ut_ad(next_mrec <= next_mrec_end);
+ /* The following loop must not be parsing the temporary
+ buffer, but head.block or tail.block. */
+
+	/* mrec != NULL means that the next record starts in the
+	middle of the block */
+ ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0));
+
+#ifdef UNIV_DEBUG
+ if (next_mrec_end == index->online_log->head.block
+ + srv_sort_buf_size) {
+ /* If tail.bytes == 0, next_mrec_end can also be at
+ the end of tail.block. */
+ if (index->online_log->tail.bytes == 0) {
+ ut_ad(next_mrec == next_mrec_end);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->head.bytes == 0);
+ } else {
+ ut_ad(next_mrec == index->online_log->head.block
+ + index->online_log->head.bytes);
+ ut_ad(index->online_log->tail.blocks
+ > index->online_log->head.blocks);
+ }
+ } else if (next_mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes) {
+ ut_ad(next_mrec == index->online_log->tail.block
+ + index->online_log->head.bytes);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->head.bytes
+ <= index->online_log->tail.bytes);
+ } else {
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ mrec_end = next_mrec_end;
+
+ while (!trx_is_interrupted(trx)) {
+ mrec = next_mrec;
+ ut_ad(mrec <= mrec_end);
+
+ if (mrec == mrec_end) {
+ /* We are at the end of the log.
+ Mark the replay all_done. */
+ if (has_index_lock) {
+ goto all_done;
+ }
+ }
+
+ if (!has_index_lock) {
+ /* We are applying operations from a different
+ block than the one that is being written to.
+ We do not hold index->lock in order to
+ allow other threads to concurrently buffer
+ modifications. */
+ ut_ad(mrec >= index->online_log->head.block);
+ ut_ad(mrec_end == index->online_log->head.block
+ + srv_sort_buf_size);
+ ut_ad(index->online_log->head.bytes
+ < srv_sort_buf_size);
+
+ /* Take the opportunity to do a redo log
+ checkpoint if needed. */
+ log_free_check();
+ } else {
+ /* We are applying operations from the last block.
+ Do not allow other threads to buffer anything,
+ so that we can finally catch up and synchronize. */
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes);
+ ut_ad(mrec >= index->online_log->tail.block);
+ }
+
+ /* This read is not protected by index->online_log->mutex
+ for performance reasons. We will eventually notice any
+ error that was flagged by a DML thread. */
+ error = index->online_log->error;
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ next_mrec = row_log_table_apply_op(
+ thr, new_trx_id_col,
+ dup, &error, offsets_heap, heap,
+ mrec, mrec_end, offsets);
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ } else if (next_mrec == next_mrec_end) {
+ /* The record happened to end on a block boundary.
+ Do we have more blocks left? */
+ if (has_index_lock) {
+ /* The index will be locked while
+ applying the last block. */
+ goto all_done;
+ }
+
+ mrec = NULL;
+process_next_block:
+ rw_lock_x_lock(dict_index_get_lock(index));
+ has_index_lock = true;
+
+ index->online_log->head.bytes = 0;
+ index->online_log->head.blocks++;
+ goto next_block;
+ } else if (next_mrec != NULL) {
+ ut_ad(next_mrec < next_mrec_end);
+ index->online_log->head.bytes
+ += ulint(next_mrec - mrec);
+ } else if (has_index_lock) {
+ /* When mrec is within tail.block, it should
+ be a complete record, because we are holding
+ index->lock and thus excluding the writer. */
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes);
+ ut_ad(0);
+ goto unexpected_eof;
+ } else {
+ memcpy(index->online_log->head.buf, mrec,
+ ulint(mrec_end - mrec));
+ mrec_end += ulint(index->online_log->head.buf - mrec);
+ mrec = index->online_log->head.buf;
+ goto process_next_block;
+ }
+ }
+
+interrupted:
+ error = DB_INTERRUPTED;
+func_exit:
+ if (!has_index_lock) {
+ rw_lock_x_lock(dict_index_get_lock(index));
+ }
+
+ mem_heap_free(offsets_heap);
+ mem_heap_free(heap);
+ row_log_block_free(index->online_log->head);
+ ut_free(offsets);
+ return(error);
+}
+
+/** Apply the row_log_table log to a table upon completing rebuild.
+@param[in] thr query graph
+@param[in] old_table old table
+@param[in,out] table MySQL table (for reporting duplicates)
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_log_table() will be called initially and then
+stage->inc() will be called for each block of log that is applied.
+@param[in] new_table Altered table
+@return DB_SUCCESS, or error code on failure */
+dberr_t
+row_log_table_apply(
+ que_thr_t* thr,
+ dict_table_t* old_table,
+ struct TABLE* table,
+ ut_stage_alter_t* stage,
+ dict_table_t* new_table)
+{
+ dberr_t error;
+ dict_index_t* clust_index;
+
+ thr_get_trx(thr)->error_key_num = 0;
+ DBUG_EXECUTE_IF("innodb_trx_duplicates",
+ thr_get_trx(thr)->duplicates = TRX_DUP_REPLACE;);
+
+ stage->begin_phase_log_table();
+
+ ut_ad(!rw_lock_own(&dict_sys.latch, RW_LOCK_S));
+ clust_index = dict_table_get_first_index(old_table);
+
+ if (clust_index->online_log->n_rows == 0) {
+ clust_index->online_log->n_rows = new_table->stat_n_rows;
+ }
+
+ rw_lock_x_lock(dict_index_get_lock(clust_index));
+
+ if (!clust_index->online_log) {
+ ut_ad(dict_index_get_online_status(clust_index)
+ == ONLINE_INDEX_COMPLETE);
+ /* This function should not be called unless
+ rebuilding a table online. Build in some fault
+ tolerance. */
+ ut_ad(0);
+ error = DB_ERROR;
+ } else {
+ row_merge_dup_t dup = {
+ clust_index, table,
+ clust_index->online_log->col_map, 0
+ };
+
+ error = row_log_table_apply_ops(thr, &dup, stage);
+
+ ut_ad(error != DB_SUCCESS
+ || clust_index->online_log->head.total
+ == clust_index->online_log->tail.total);
+ }
+
+ rw_lock_x_unlock(dict_index_get_lock(clust_index));
+ DBUG_EXECUTE_IF("innodb_trx_duplicates",
+ thr_get_trx(thr)->duplicates = 0;);
+
+ return(error);
+}
+
+/******************************************************//**
+Allocate the row log for an index and flag the index
+for online creation.
+@return true on success, false on failure */
+bool
+row_log_allocate(
+/*=============*/
+ const trx_t* trx, /*!< in: the ALTER TABLE transaction */
+ dict_index_t* index, /*!< in/out: index */
+ dict_table_t* table, /*!< in/out: new table being rebuilt,
+ or NULL when creating a secondary index */
+ bool same_pk,/*!< in: whether the definition of the
+ PRIMARY KEY has remained the same */
+ const dtuple_t* defaults,
+ /*!< in: default values of
+ added, changed columns, or NULL */
+ const ulint* col_map,/*!< in: mapping of old column
+ numbers to new ones, or NULL if !table */
+ const char* path, /*!< in: where to create temporary file */
+ const TABLE* old_table, /*!< in: table definition before alter */
+ const bool allow_not_null) /*!< in: allow null to not-null
+ conversion */
+{
+ row_log_t* log;
+ DBUG_ENTER("row_log_allocate");
+
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(dict_index_is_clust(index) == !!table);
+ ut_ad(!table || index->table != table);
+ ut_ad(same_pk || table);
+ ut_ad(!table || col_map);
+ ut_ad(!defaults || col_map);
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(trx->id);
+
+ log = static_cast<row_log_t*>(ut_malloc_nokey(sizeof *log));
+
+ if (log == NULL) {
+ DBUG_RETURN(false);
+ }
+
+ log->fd = OS_FILE_CLOSED;
+ mutex_create(LATCH_ID_INDEX_ONLINE_LOG, &log->mutex);
+
+ log->blobs = NULL;
+ log->table = table;
+ log->same_pk = same_pk;
+ log->defaults = defaults;
+ log->col_map = col_map;
+ log->error = DB_SUCCESS;
+ log->min_trx = trx->id;
+ log->max_trx = 0;
+ log->tail.blocks = log->tail.bytes = 0;
+ log->tail.total = 0;
+ log->tail.block = log->head.block = NULL;
+ log->crypt_tail = log->crypt_head = NULL;
+ log->head.blocks = log->head.bytes = 0;
+ log->head.total = 0;
+ log->path = path;
+ log->n_core_fields = index->n_core_fields;
+ ut_ad(!table || log->is_instant(index)
+ == (index->n_core_fields < index->n_fields));
+ log->allow_not_null = allow_not_null;
+ log->old_table = old_table;
+ log->n_rows = 0;
+
+ if (table && index->is_instant()) {
+ const unsigned n = log->n_core_fields;
+ log->non_core_fields = UT_NEW_ARRAY_NOKEY(
+ dict_col_t::def_t, index->n_fields - n);
+ for (unsigned i = n; i < index->n_fields; i++) {
+ log->non_core_fields[i - n]
+ = index->fields[i].col->def_val;
+ }
+ } else {
+ log->non_core_fields = NULL;
+ }
+
+ dict_index_set_online_status(index, ONLINE_INDEX_CREATION);
+
+ if (log_tmp_is_encrypted()) {
+ log->crypt_head_size = log->crypt_tail_size = srv_sort_buf_size;
+ log->crypt_head = static_cast<byte *>(
+ my_large_malloc(&log->crypt_head_size, MYF(MY_WME)));
+ log->crypt_tail = static_cast<byte *>(
+ my_large_malloc(&log->crypt_tail_size, MYF(MY_WME)));
+
+ if (!log->crypt_head || !log->crypt_tail) {
+ row_log_free(log);
+ DBUG_RETURN(false);
+ }
+ }
+
+ index->online_log = log;
+ /* While we might be holding an exclusive data dictionary lock
+ here, in row_log_abort_sec() we will not always be holding it. Use
+ atomic operations in both cases. */
+ MONITOR_ATOMIC_INC(MONITOR_ONLINE_CREATE_INDEX);
+
+ DBUG_RETURN(true);
+}
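+
+/* A sketch of the expected lifecycle of a row log for a secondary
+index, as implied by the functions in this file (the exact call sites
+live outside this file and are assumptions here):
+
+	rw_lock_x_lock(dict_index_get_lock(index));
+	row_log_allocate(trx, index, NULL, true, NULL, NULL,
+			 path, old_table, false);
+	rw_lock_x_unlock(dict_index_get_lock(index));
+	// concurrent DML buffers changes via row_log_online_op()
+	err = row_log_apply(trx, index, table, stage);
+
+row_log_apply() detaches index->online_log and calls row_log_free()
+itself, so the caller must not free the log again. */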
+
+/******************************************************//**
+Free the row log for an index that was being created online. */
+void
+row_log_free(
+/*=========*/
+ row_log_t* log) /*!< in,own: row log */
+{
+ MONITOR_ATOMIC_DEC(MONITOR_ONLINE_CREATE_INDEX);
+
+ UT_DELETE(log->blobs);
+ UT_DELETE_ARRAY(log->non_core_fields);
+ row_log_block_free(log->tail);
+ row_log_block_free(log->head);
+ row_merge_file_destroy_low(log->fd);
+
+ if (log->crypt_head) {
+ my_large_free(log->crypt_head, log->crypt_head_size);
+ }
+
+ if (log->crypt_tail) {
+ my_large_free(log->crypt_tail, log->crypt_tail_size);
+ }
+
+ mutex_free(&log->mutex);
+ ut_free(log);
+}
+
+/******************************************************//**
+Get the latest transaction ID that has invoked row_log_online_op()
+during online creation.
+@return latest transaction ID, or 0 if nothing was logged */
+trx_id_t
+row_log_get_max_trx(
+/*================*/
+ dict_index_t* index) /*!< in: index, must be locked */
+{
+ ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_CREATION);
+
+ ut_ad((rw_lock_own(dict_index_get_lock(index), RW_LOCK_S)
+ && mutex_own(&index->online_log->mutex))
+ || rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+
+ return(index->online_log->max_trx);
+}
+
+/******************************************************//**
+Applies an operation to a secondary index that was being created. */
+static MY_ATTRIBUTE((nonnull))
+void
+row_log_apply_op_low(
+/*=================*/
+ dict_index_t* index, /*!< in/out: index */
+ row_merge_dup_t*dup, /*!< in/out: for reporting
+ duplicate key errors */
+ dberr_t* error, /*!< out: DB_SUCCESS or error code */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap for
+ allocating offsets; can be emptied */
+ bool has_index_lock, /*!< in: true if holding index->lock
+ in exclusive mode */
+ enum row_op op, /*!< in: operation being applied */
+ trx_id_t trx_id, /*!< in: transaction identifier */
+ const dtuple_t* entry) /*!< in: row */
+{
+ mtr_t mtr;
+ btr_cur_t cursor;
+ rec_offs* offsets = NULL;
+
+ ut_ad(!dict_index_is_clust(index));
+
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X)
+ == has_index_lock);
+
+ ut_ad(!index->is_corrupted());
+ ut_ad(trx_id != 0 || op == ROW_OP_DELETE);
+
+ DBUG_LOG("ib_create_index",
+ (op == ROW_OP_INSERT ? "insert " : "delete ")
+ << (has_index_lock ? "locked index " : "unlocked index ")
+ << index->id << ',' << ib::hex(trx_id) << ": "
+ << rec_printer(entry).str());
+
+ mtr_start(&mtr);
+ index->set_modified(mtr);
+
+ /* We perform the pessimistic variant of the operations if we
+ already hold index->lock exclusively. First, search the
+ record. The operation may already have been performed,
+ depending on when the row in the clustered index was
+ scanned. */
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+ has_index_lock
+ ? BTR_MODIFY_TREE
+ : BTR_MODIFY_LEAF,
+ &cursor, 0, __FILE__, __LINE__,
+ &mtr);
+
+ ut_ad(dict_index_get_n_unique(index) > 0);
+ /* This test is somewhat similar to row_ins_must_modify_rec(),
+ but not identical for unique secondary indexes. */
+ if (cursor.low_match >= dict_index_get_n_unique(index)
+ && !page_rec_is_infimum(btr_cur_get_rec(&cursor))) {
+ /* We have a matching record. */
+ bool exists = (cursor.low_match
+ == dict_index_get_n_fields(index));
+#ifdef UNIV_DEBUG
+ rec_t* rec = btr_cur_get_rec(&cursor);
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec)));
+#endif /* UNIV_DEBUG */
+
+ ut_ad(exists || dict_index_is_unique(index));
+
+ switch (op) {
+ case ROW_OP_DELETE:
+ if (!exists) {
+ /* The existing record matches the
+ unique secondary index key, but the
+ PRIMARY KEY columns differ. So, this
+ exact record does not exist. For
+ example, we could detect a duplicate
+ key error in some old index before
+			logging a ROW_OP_INSERT for our
+ index. This ROW_OP_DELETE could have
+ been logged for rolling back
+ TRX_UNDO_INSERT_REC. */
+ goto func_exit;
+ }
+
+ if (btr_cur_optimistic_delete(
+ &cursor, BTR_CREATE_FLAG, &mtr)) {
+ *error = DB_SUCCESS;
+ break;
+ }
+
+ if (!has_index_lock) {
+ /* This needs a pessimistic operation.
+ Lock the index tree exclusively. */
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ index->set_modified(mtr);
+ btr_cur_search_to_nth_level(
+ index, 0, entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE, &cursor, 0,
+ __FILE__, __LINE__, &mtr);
+
+ /* No other thread than the current one
+ is allowed to modify the index tree.
+ Thus, the record should still exist. */
+ ut_ad(cursor.low_match
+ >= dict_index_get_n_fields(index));
+ ut_ad(page_rec_is_user_rec(
+ btr_cur_get_rec(&cursor)));
+ }
+
+ /* As there are no externally stored fields in
+ a secondary index record, the parameter
+ rollback=false will be ignored. */
+
+ btr_cur_pessimistic_delete(
+ error, FALSE, &cursor,
+ BTR_CREATE_FLAG, false, &mtr);
+ break;
+ case ROW_OP_INSERT:
+ if (exists) {
+ /* The record already exists. There
+ is nothing to be inserted.
+ This could happen when processing
+ TRX_UNDO_DEL_MARK_REC in statement
+ rollback:
+
+ UPDATE of PRIMARY KEY can lead to
+ statement rollback if the updated
+ value of the PRIMARY KEY already
+ exists. In this case, the UPDATE would
+ be mapped to DELETE;INSERT, and we
+ only wrote undo log for the DELETE
+ part. The duplicate key error would be
+ triggered before logging the INSERT
+ part.
+
+ Theoretically, we could also get a
+ similar situation when a DELETE operation
+ is blocked by a FOREIGN KEY constraint. */
+ goto func_exit;
+ }
+
+ if (dtuple_contains_null(entry)) {
+ /* The UNIQUE KEY columns match, but
+ there is a NULL value in the key, and
+ NULL!=NULL. */
+ goto insert_the_rec;
+ }
+
+ goto duplicate;
+ }
+ } else {
+ switch (op) {
+ rec_t* rec;
+ big_rec_t* big_rec;
+ case ROW_OP_DELETE:
+ /* The record does not exist. For example, we
+ could detect a duplicate key error in some old
+			index before logging a ROW_OP_INSERT for our
+ index. This ROW_OP_DELETE could be logged for
+ rolling back TRX_UNDO_INSERT_REC. */
+ goto func_exit;
+ case ROW_OP_INSERT:
+ if (dict_index_is_unique(index)
+ && (cursor.up_match
+ >= dict_index_get_n_unique(index)
+ || cursor.low_match
+ >= dict_index_get_n_unique(index))
+ && (!index->n_nullable
+ || !dtuple_contains_null(entry))) {
+duplicate:
+ /* Duplicate key */
+ ut_ad(dict_index_is_unique(index));
+ row_merge_dup_report(dup, entry->fields);
+ *error = DB_DUPLICATE_KEY;
+ goto func_exit;
+ }
+insert_the_rec:
+ /* Insert the record. As we are inserting into
+ a secondary index, there cannot be externally
+ stored columns (!big_rec). */
+ *error = btr_cur_optimistic_insert(
+ BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG,
+ &cursor, &offsets, &offsets_heap,
+ const_cast<dtuple_t*>(entry),
+ &rec, &big_rec, 0, NULL, &mtr);
+ ut_ad(!big_rec);
+ if (*error != DB_FAIL) {
+ break;
+ }
+
+ if (!has_index_lock) {
+ /* This needs a pessimistic operation.
+ Lock the index tree exclusively. */
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ index->set_modified(mtr);
+ btr_cur_search_to_nth_level(
+ index, 0, entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE, &cursor, 0,
+ __FILE__, __LINE__, &mtr);
+ }
+
+ /* We already determined that the
+ record did not exist. No other thread
+ than the current one is allowed to
+ modify the index tree. Thus, the
+ record should still not exist. */
+
+ *error = btr_cur_pessimistic_insert(
+ BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG,
+ &cursor, &offsets, &offsets_heap,
+ const_cast<dtuple_t*>(entry),
+ &rec, &big_rec,
+ 0, NULL, &mtr);
+ ut_ad(!big_rec);
+ break;
+ }
+ mem_heap_empty(offsets_heap);
+ }
+
+ if (*error == DB_SUCCESS && trx_id) {
+ page_update_max_trx_id(btr_cur_get_block(&cursor),
+ btr_cur_get_page_zip(&cursor),
+ trx_id, &mtr);
+ }
+
+func_exit:
+ mtr_commit(&mtr);
+}
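+
+/* Note: row_log_apply_op_low() above follows the usual InnoDB
+optimistic-then-pessimistic retry. A condensed sketch of the delete
+path, reusing the names from the function above:
+
+	if (!btr_cur_optimistic_delete(&cursor, BTR_CREATE_FLAG, &mtr)) {
+		// restart the mini-transaction, re-position the cursor
+		// with BTR_MODIFY_TREE, then:
+		btr_cur_pessimistic_delete(&error, FALSE, &cursor,
+					   BTR_CREATE_FLAG, false, &mtr);
+	}
+
+The insert path handles DB_FAIL from btr_cur_optimistic_insert() by
+retrying with btr_cur_pessimistic_insert() in the same manner. */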
+
+/******************************************************//**
+Applies an operation to a secondary index that was being created.
+@return NULL on failure (mrec corruption) or when out of data;
+pointer to next record on success */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+const mrec_t*
+row_log_apply_op(
+/*=============*/
+ dict_index_t* index, /*!< in/out: index */
+ row_merge_dup_t*dup, /*!< in/out: for reporting
+ duplicate key errors */
+ dberr_t* error, /*!< out: DB_SUCCESS or error code */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap for
+ allocating offsets; can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap for
+ allocating data tuples */
+ bool has_index_lock, /*!< in: true if holding index->lock
+ in exclusive mode */
+ const mrec_t* mrec, /*!< in: merge record */
+ const mrec_t* mrec_end, /*!< in: end of buffer */
+ rec_offs* offsets) /*!< in/out: work area for
+ rec_init_offsets_temp() */
+
+{
+ enum row_op op;
+ ulint extra_size;
+ ulint data_size;
+ dtuple_t* entry;
+ trx_id_t trx_id;
+
+ /* Online index creation is only used for secondary indexes. */
+ ut_ad(!dict_index_is_clust(index));
+
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X)
+ == has_index_lock);
+
+ if (index->is_corrupted()) {
+ *error = DB_INDEX_CORRUPT;
+ return(NULL);
+ }
+
+ *error = DB_SUCCESS;
+
+ if (mrec + ROW_LOG_HEADER_SIZE >= mrec_end) {
+ return(NULL);
+ }
+
+ switch (*mrec) {
+ case ROW_OP_INSERT:
+ if (ROW_LOG_HEADER_SIZE + DATA_TRX_ID_LEN + mrec >= mrec_end) {
+ return(NULL);
+ }
+
+ op = static_cast<enum row_op>(*mrec++);
+ trx_id = trx_read_trx_id(mrec);
+ mrec += DATA_TRX_ID_LEN;
+ break;
+ case ROW_OP_DELETE:
+ op = static_cast<enum row_op>(*mrec++);
+ trx_id = 0;
+ break;
+ default:
+corrupted:
+ ut_ad(0);
+ *error = DB_CORRUPTION;
+ return(NULL);
+ }
+
+ extra_size = *mrec++;
+
+ ut_ad(mrec < mrec_end);
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *mrec++;
+ }
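+
+	/* Example: extra_size uses a 1- or 2-byte encoding. A first
+	byte below 0x80 is the value itself; otherwise the value is
+	((first & 0x7f) << 8) | second. So the byte 0x45 decodes to
+	0x45, while the bytes 0x92 0x34 decode to 0x1234. */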
+
+ mrec += extra_size;
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ rec_init_offsets_temp(mrec, index, offsets);
+
+ if (rec_offs_any_extern(offsets)) {
+ /* There should never be any externally stored fields
+ in a secondary index, which is what online index
+ creation is used for. Therefore, the log file must be
+ corrupted. */
+ goto corrupted;
+ }
+
+ data_size = rec_offs_data_size(offsets);
+
+ mrec += data_size;
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ entry = row_rec_to_index_entry_low(
+ mrec - data_size, index, offsets, heap);
+ /* Online index creation is only implemented for secondary
+ indexes, which never contain off-page columns. */
+ ut_ad(dtuple_get_n_ext(entry) == 0);
+
+ row_log_apply_op_low(index, dup, error, offsets_heap,
+ has_index_lock, op, trx_id, entry);
+ return(mrec);
+}
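+
+/* Sketch of the merge record layout parsed by row_log_apply_op()
+above (the writer, row_log_online_op(), appears earlier in this file):
+
+	1 byte		ROW_OP_INSERT or ROW_OP_DELETE
+	6 bytes		transaction id (ROW_OP_INSERT only)
+	1..2 bytes	extra_size, in the encoding illustrated above
+	extra_size	record header for rec_init_offsets_temp()
+	data_size	record payload
+*/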
+
+/** Applies operations to a secondary index that was being created.
+@param[in] trx transaction (for checking if the operation was
+interrupted)
+@param[in,out] index index
+@param[in,out] dup for reporting duplicate key errors
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL, then stage->inc() will be called for each block
+of log that is applied.
+@return DB_SUCCESS, or error code on failure */
+static
+dberr_t
+row_log_apply_ops(
+ const trx_t* trx,
+ dict_index_t* index,
+ row_merge_dup_t* dup,
+ ut_stage_alter_t* stage)
+{
+ dberr_t error;
+ const mrec_t* mrec = NULL;
+ const mrec_t* next_mrec;
+ const mrec_t* mrec_end= NULL; /* silence bogus warning */
+ const mrec_t* next_mrec_end;
+ mem_heap_t* offsets_heap;
+ mem_heap_t* heap;
+ rec_offs* offsets;
+ bool has_index_lock;
+ const ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+
+ ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(!index->is_committed());
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+ ut_ad(index->online_log);
+
+ MEM_UNDEFINED(&mrec_end, sizeof mrec_end);
+
+ offsets = static_cast<rec_offs*>(ut_malloc_nokey(i * sizeof *offsets));
+ rec_offs_set_n_alloc(offsets, i);
+ rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index));
+
+ offsets_heap = mem_heap_create(srv_page_size);
+ heap = mem_heap_create(srv_page_size);
+ has_index_lock = true;
+
+next_block:
+ ut_ad(has_index_lock);
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+ ut_ad(index->online_log->head.bytes == 0);
+
+ stage->inc(row_log_progress_inc_per_block());
+
+ if (trx_is_interrupted(trx)) {
+ goto interrupted;
+ }
+
+ error = index->online_log->error;
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (index->is_corrupted()) {
+ error = DB_INDEX_CORRUPT;
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(index->online_log->head.blocks
+ > index->online_log->tail.blocks)) {
+unexpected_eof:
+ ib::error() << "Unexpected end of temporary file for index "
+ << index->name;
+corruption:
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ if (index->online_log->head.blocks
+ == index->online_log->tail.blocks) {
+ if (index->online_log->head.blocks) {
+#ifdef HAVE_FTRUNCATE
+ /* Truncate the file in order to save space. */
+ if (index->online_log->fd > 0
+ && ftruncate(index->online_log->fd, 0) == -1) {
+ ib::error()
+ << "\'" << index->name + 1
+ << "\' failed with error "
+ << errno << ":" << strerror(errno);
+
+ goto corruption;
+ }
+#endif /* HAVE_FTRUNCATE */
+ index->online_log->head.blocks
+ = index->online_log->tail.blocks = 0;
+ }
+
+ next_mrec = index->online_log->tail.block;
+ next_mrec_end = next_mrec + index->online_log->tail.bytes;
+
+ if (next_mrec_end == next_mrec) {
+ /* End of log reached. */
+all_done:
+ ut_ad(has_index_lock);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->tail.blocks == 0);
+ error = DB_SUCCESS;
+ goto func_exit;
+ }
+ } else {
+ os_offset_t ofs = static_cast<os_offset_t>(
+ index->online_log->head.blocks)
+ * srv_sort_buf_size;
+ ut_ad(has_index_lock);
+ has_index_lock = false;
+ rw_lock_x_unlock(dict_index_get_lock(index));
+
+ log_free_check();
+
+ if (!row_log_block_allocate(index->online_log->head)) {
+ error = DB_OUT_OF_MEMORY;
+ goto func_exit;
+ }
+
+ byte* buf = index->online_log->head.block;
+
+ if (os_file_read_no_error_handling(
+ IORequestRead, index->online_log->fd,
+ buf, ofs, srv_sort_buf_size, 0) != DB_SUCCESS) {
+ ib::error()
+ << "Unable to read temporary file"
+ " for index " << index->name;
+ goto corruption;
+ }
+
+ if (log_tmp_is_encrypted()) {
+ if (!log_tmp_block_decrypt(
+ buf, srv_sort_buf_size,
+ index->online_log->crypt_head, ofs)) {
+ error = DB_DECRYPTION_FAILED;
+ goto func_exit;
+ }
+
+ srv_stats.n_rowlog_blocks_decrypted.inc();
+ memcpy(buf, index->online_log->crypt_head, srv_sort_buf_size);
+ }
+
+#ifdef POSIX_FADV_DONTNEED
+ /* Each block is read exactly once. Free up the file cache. */
+ posix_fadvise(index->online_log->fd,
+ ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+ next_mrec = index->online_log->head.block;
+ next_mrec_end = next_mrec + srv_sort_buf_size;
+ }
+
+ if (mrec) {
+		/* A partial record was read from the previous block.
+		Fill the temporary buffer completely, as we do not know
+		the length of the record. Parse subsequent records from
+		the bigger buffer index->online_log->head.block
+		or index->online_log->tail.block. */
+
+ ut_ad(mrec == index->online_log->head.buf);
+ ut_ad(mrec_end > mrec);
+ ut_ad(mrec_end < (&index->online_log->head.buf)[1]);
+
+ memcpy((mrec_t*) mrec_end, next_mrec,
+ ulint((&index->online_log->head.buf)[1] - mrec_end));
+ mrec = row_log_apply_op(
+ index, dup, &error, offsets_heap, heap,
+ has_index_lock, index->online_log->head.buf,
+ (&index->online_log->head.buf)[1], offsets);
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ } else if (UNIV_UNLIKELY(mrec == NULL)) {
+ /* The record was not reassembled properly. */
+ goto corruption;
+ }
+		/* The record was previously found to be truncated.
+		Now that the parse buffer was extended,
+		it should proceed beyond the old end of the buffer. */
+ ut_a(mrec > mrec_end);
+
+ index->online_log->head.bytes = ulint(mrec - mrec_end);
+ next_mrec += index->online_log->head.bytes;
+ }
+
+ ut_ad(next_mrec <= next_mrec_end);
+ /* The following loop must not be parsing the temporary
+ buffer, but head.block or tail.block. */
+
+ /* mrec!=NULL means that the next record starts from the
+ middle of the block */
+ ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0));
+
+#ifdef UNIV_DEBUG
+ if (next_mrec_end == index->online_log->head.block
+ + srv_sort_buf_size) {
+ /* If tail.bytes == 0, next_mrec_end can also be at
+ the end of tail.block. */
+ if (index->online_log->tail.bytes == 0) {
+ ut_ad(next_mrec == next_mrec_end);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->head.bytes == 0);
+ } else {
+ ut_ad(next_mrec == index->online_log->head.block
+ + index->online_log->head.bytes);
+ ut_ad(index->online_log->tail.blocks
+ > index->online_log->head.blocks);
+ }
+ } else if (next_mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes) {
+ ut_ad(next_mrec == index->online_log->tail.block
+ + index->online_log->head.bytes);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->head.bytes
+ <= index->online_log->tail.bytes);
+ } else {
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ mrec_end = next_mrec_end;
+
+ while (!trx_is_interrupted(trx)) {
+ mrec = next_mrec;
+ ut_ad(mrec < mrec_end);
+
+ if (!has_index_lock) {
+ /* We are applying operations from a different
+ block than the one that is being written to.
+ We do not hold index->lock in order to
+ allow other threads to concurrently buffer
+ modifications. */
+ ut_ad(mrec >= index->online_log->head.block);
+ ut_ad(mrec_end == index->online_log->head.block
+ + srv_sort_buf_size);
+ ut_ad(index->online_log->head.bytes
+ < srv_sort_buf_size);
+
+ /* Take the opportunity to do a redo log
+ checkpoint if needed. */
+ log_free_check();
+ } else {
+ /* We are applying operations from the last block.
+ Do not allow other threads to buffer anything,
+ so that we can finally catch up and synchronize. */
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes);
+ ut_ad(mrec >= index->online_log->tail.block);
+ }
+
+ next_mrec = row_log_apply_op(
+ index, dup, &error, offsets_heap, heap,
+ has_index_lock, mrec, mrec_end, offsets);
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ } else if (next_mrec == next_mrec_end) {
+ /* The record happened to end on a block boundary.
+ Do we have more blocks left? */
+ if (has_index_lock) {
+ /* The index will be locked while
+ applying the last block. */
+ goto all_done;
+ }
+
+ mrec = NULL;
+process_next_block:
+ rw_lock_x_lock(dict_index_get_lock(index));
+ has_index_lock = true;
+
+ index->online_log->head.bytes = 0;
+ index->online_log->head.blocks++;
+ goto next_block;
+ } else if (next_mrec != NULL) {
+ ut_ad(next_mrec < next_mrec_end);
+ index->online_log->head.bytes
+ += ulint(next_mrec - mrec);
+ } else if (has_index_lock) {
+ /* When mrec is within tail.block, it should
+ be a complete record, because we are holding
+ index->lock and thus excluding the writer. */
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes);
+ ut_ad(0);
+ goto unexpected_eof;
+ } else {
+ memcpy(index->online_log->head.buf, mrec,
+ ulint(mrec_end - mrec));
+ mrec_end += ulint(index->online_log->head.buf - mrec);
+ mrec = index->online_log->head.buf;
+ goto process_next_block;
+ }
+ }
+
+interrupted:
+ error = DB_INTERRUPTED;
+func_exit:
+ if (!has_index_lock) {
+ rw_lock_x_lock(dict_index_get_lock(index));
+ }
+
+ switch (error) {
+ case DB_SUCCESS:
+ break;
+ case DB_INDEX_CORRUPT:
+ if (((os_offset_t) index->online_log->tail.blocks + 1)
+ * srv_sort_buf_size >= srv_online_max_size) {
+ /* The log file grew too big. */
+ error = DB_ONLINE_LOG_TOO_BIG;
+ }
+ /* fall through */
+ default:
+ /* We set the flag directly instead of invoking
+ dict_set_corrupted_index_cache_only(index) here,
+ because the index is not "public" yet. */
+ index->type |= DICT_CORRUPT;
+ }
+
+ mem_heap_free(heap);
+ mem_heap_free(offsets_heap);
+ row_log_block_free(index->online_log->head);
+ ut_free(offsets);
+ return(error);
+}
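+
+/* Sketch of the log geometry assumed by row_log_apply_ops() above:
+the log is a sequence of srv_sort_buf_size blocks. Writers append at
+the tail (tail.blocks full blocks in the file, plus tail.bytes in the
+in-memory tail.block); this reader consumes from the head. Full blocks
+are applied without holding index->lock, so that concurrent DML can
+keep buffering; the final, partial tail block is applied under
+exclusive index->lock so the reader can catch up and synchronize with
+the writers. */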
+
+/** Apply the row log to the index upon completing index creation.
+@param[in] trx transaction (for checking if the operation was
+interrupted)
+@param[in,out] index secondary index
+@param[in,out] table MySQL table (for reporting duplicates)
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_log_index() will be called initially and then
+stage->inc() will be called for each block of log that is applied.
+@return DB_SUCCESS, or error code on failure */
+dberr_t
+row_log_apply(
+ const trx_t* trx,
+ dict_index_t* index,
+ struct TABLE* table,
+ ut_stage_alter_t* stage)
+{
+ dberr_t error;
+ row_log_t* log;
+ row_merge_dup_t dup = { index, table, NULL, 0 };
+ DBUG_ENTER("row_log_apply");
+
+ ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(!dict_index_is_clust(index));
+
+ stage->begin_phase_log_index();
+
+ log_free_check();
+
+ rw_lock_x_lock(dict_index_get_lock(index));
+
+ if (!dict_table_is_corrupted(index->table)) {
+ error = row_log_apply_ops(trx, index, &dup, stage);
+ } else {
+ error = DB_SUCCESS;
+ }
+
+ if (error != DB_SUCCESS) {
+ ut_ad(index->table->space);
+ /* We set the flag directly instead of invoking
+ dict_set_corrupted_index_cache_only(index) here,
+ because the index is not "public" yet. */
+ index->type |= DICT_CORRUPT;
+ index->table->drop_aborted = TRUE;
+
+ dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+ } else {
+ ut_ad(dup.n_dup == 0);
+ dict_index_set_online_status(index, ONLINE_INDEX_COMPLETE);
+ }
+
+ log = index->online_log;
+ index->online_log = NULL;
+ rw_lock_x_unlock(dict_index_get_lock(index));
+
+ row_log_free(log);
+
+ DBUG_RETURN(error);
+}
+
+unsigned row_log_get_n_core_fields(const dict_index_t *index)
+{
+ ut_ad(index->online_log);
+ return index->online_log->n_core_fields;
+}
diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc
new file mode 100644
index 00000000..417bf6a4
--- /dev/null
+++ b/storage/innobase/row/row0merge.cc
@@ -0,0 +1,4799 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0merge.cc
+New index creation routines using a merge sort
+
+Created 12/4/2005 Jan Lindstrom
+Completed by Sunny Bains and Marko Makela
+*******************************************************/
+#include <my_global.h>
+#include <log.h>
+#include <sql_class.h>
+#include <math.h>
+
+#include "row0merge.h"
+#include "row0ext.h"
+#include "row0log.h"
+#include "row0ins.h"
+#include "row0row.h"
+#include "row0sel.h"
+#include "log0crypt.h"
+#include "dict0crea.h"
+#include "trx0purge.h"
+#include "lock0lock.h"
+#include "pars0pars.h"
+#include "ut0sort.h"
+#include "row0ftsort.h"
+#include "row0import.h"
+#include "row0vers.h"
+#include "handler0alter.h"
+#include "btr0bulk.h"
+#ifdef BTR_CUR_ADAPT
+# include "btr0sea.h"
+#endif /* BTR_CUR_ADAPT */
+#include "ut0stage.h"
+#include "fil0crypt.h"
+
+/* Ignore posix_fadvise() on those platforms where it does not exist */
+#if defined _WIN32
+# define posix_fadvise(fd, offset, len, advice) /* nothing */
+#endif /* _WIN32 */
+
+/* Whether to disable file system cache */
+char srv_disable_sort_file_cache;
+
+/** Class that caches index row tuples made from a single clustered
+index page scan, and then inserts them into the corresponding index tree */
+class index_tuple_info_t {
+public:
+ /** constructor
+ @param[in] heap memory heap
+ @param[in] index index to be created */
+ index_tuple_info_t(mem_heap_t* heap, dict_index_t* index) :
+ m_dtuple_vec(UT_NEW_NOKEY(idx_tuple_vec())),
+ m_index(index), m_heap(heap)
+ { ut_ad(index->is_spatial()); }
+
+ /** destructor */
+ ~index_tuple_info_t()
+ {
+ UT_DELETE(m_dtuple_vec);
+ }
+
+ /** Get the index object
+ @return the index object */
+ dict_index_t* get_index() UNIV_NOTHROW
+ {
+ return(m_index);
+ }
+
+ /** Caches an index row into index tuple vector
+ @param[in] row table row
+ @param[in] ext externally stored column
+ prefixes, or NULL */
+ void add(
+ const dtuple_t* row,
+ const row_ext_t* ext) UNIV_NOTHROW
+ {
+ dtuple_t* dtuple;
+
+ dtuple = row_build_index_entry(row, ext, m_index, m_heap);
+
+ ut_ad(dtuple);
+
+ m_dtuple_vec->push_back(dtuple);
+ }
+
+ /** Insert spatial index rows cached in vector into spatial index
+ @param[in] trx_id transaction id
+ @param[in,out] row_heap memory heap
+	@param[in]	pcur	clustered index scanning cursor
+ @param[in,out] mtr_started whether scan_mtr is active
+ @param[in,out] scan_mtr mini-transaction for pcur
+ @return DB_SUCCESS if successful, else error number */
+ dberr_t insert(trx_id_t trx_id, mem_heap_t* row_heap, btr_pcur_t* pcur,
+ bool& mtr_started, mtr_t* scan_mtr) const
+ {
+ big_rec_t* big_rec;
+ rec_t* rec;
+ btr_cur_t ins_cur;
+ mtr_t mtr;
+ rtr_info_t rtr_info;
+ rec_offs* ins_offsets = NULL;
+ dberr_t error = DB_SUCCESS;
+ dtuple_t* dtuple;
+ ulint count = 0;
+ const ulint flag = BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG;
+
+ ut_ad(mtr_started == scan_mtr->is_active());
+
+ DBUG_EXECUTE_IF("row_merge_instrument_log_check_flush",
+ log_sys.set_check_flush_or_checkpoint(););
+
+ for (idx_tuple_vec::iterator it = m_dtuple_vec->begin();
+ it != m_dtuple_vec->end();
+ ++it) {
+ dtuple = *it;
+ ut_ad(dtuple);
+
+ if (log_sys.check_flush_or_checkpoint()) {
+ if (mtr_started) {
+ btr_pcur_move_to_prev_on_page(pcur);
+ btr_pcur_store_position(pcur, scan_mtr);
+ scan_mtr->commit();
+ mtr_started = false;
+ }
+
+ log_free_check();
+ }
+
+ mtr.start();
+ m_index->set_modified(mtr);
+
+ ins_cur.index = m_index;
+ rtr_init_rtr_info(&rtr_info, false, &ins_cur, m_index,
+ false);
+ rtr_info_update_btr(&ins_cur, &rtr_info);
+
+ btr_cur_search_to_nth_level(m_index, 0, dtuple,
+ PAGE_CUR_RTREE_INSERT,
+ BTR_MODIFY_LEAF, &ins_cur,
+ 0, __FILE__, __LINE__,
+ &mtr);
+
+			/* The MBR in the parent entry needs updating,
+			so change the search mode to BTR_MODIFY_TREE */
+ if (rtr_info.mbr_adj) {
+ mtr_commit(&mtr);
+ rtr_clean_rtr_info(&rtr_info, true);
+ rtr_init_rtr_info(&rtr_info, false, &ins_cur,
+ m_index, false);
+ rtr_info_update_btr(&ins_cur, &rtr_info);
+ mtr_start(&mtr);
+ m_index->set_modified(mtr);
+ btr_cur_search_to_nth_level(
+ m_index, 0, dtuple,
+ PAGE_CUR_RTREE_INSERT,
+ BTR_MODIFY_TREE, &ins_cur, 0,
+ __FILE__, __LINE__, &mtr);
+ }
+
+ error = btr_cur_optimistic_insert(
+ flag, &ins_cur, &ins_offsets, &row_heap,
+ dtuple, &rec, &big_rec, 0, NULL, &mtr);
+
+ if (error == DB_FAIL) {
+ ut_ad(!big_rec);
+ mtr.commit();
+ mtr.start();
+ m_index->set_modified(mtr);
+
+ rtr_clean_rtr_info(&rtr_info, true);
+ rtr_init_rtr_info(&rtr_info, false,
+ &ins_cur, m_index, false);
+
+ rtr_info_update_btr(&ins_cur, &rtr_info);
+ btr_cur_search_to_nth_level(
+ m_index, 0, dtuple,
+ PAGE_CUR_RTREE_INSERT,
+ BTR_MODIFY_TREE,
+ &ins_cur, 0,
+ __FILE__, __LINE__, &mtr);
+
+ error = btr_cur_pessimistic_insert(
+ flag, &ins_cur, &ins_offsets,
+ &row_heap, dtuple, &rec,
+ &big_rec, 0, NULL, &mtr);
+ }
+
+ DBUG_EXECUTE_IF(
+ "row_merge_ins_spatial_fail",
+ error = DB_FAIL;
+ );
+
+ if (error == DB_SUCCESS) {
+ if (rtr_info.mbr_adj) {
+ error = rtr_ins_enlarge_mbr(
+ &ins_cur, &mtr);
+ }
+
+ if (error == DB_SUCCESS) {
+ page_update_max_trx_id(
+ btr_cur_get_block(&ins_cur),
+ btr_cur_get_page_zip(&ins_cur),
+ trx_id, &mtr);
+ }
+ }
+
+ mtr_commit(&mtr);
+
+ rtr_clean_rtr_info(&rtr_info, true);
+ count++;
+ }
+
+ m_dtuple_vec->clear();
+
+ return(error);
+ }
+
+private:
+	/** Cache index rows made from a clustered index scan. Usually
+	for rows on a single clustered index page */
+ typedef std::vector<dtuple_t*, ut_allocator<dtuple_t*> >
+ idx_tuple_vec;
+
+ /** vector used to cache index rows made from cluster index scan */
+ idx_tuple_vec* const m_dtuple_vec;
+
+ /** the index being built */
+ dict_index_t* const m_index;
+
+ /** memory heap for creating index tuples */
+ mem_heap_t* const m_heap;
+};
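+
+/* Usage sketch (the actual call sites are elsewhere in this file): a
+caller building a spatial index holds one index_tuple_info_t per
+spatial index, add()s the entries built from each clustered index
+page, and insert()s them when leaving the page:
+
+	index_tuple_info_t sp(heap, spatial_index);
+	sp.add(row, ext);	// for each scanned row
+	err = sp.insert(trx_id, row_heap, pcur, mtr_started, &scan_mtr);
+*/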
+
+/* Maximum pending doc memory limit in bytes for a fts tokenization thread */
+#define FTS_PENDING_DOC_MEMORY_LIMIT 1000000
+
+/** Insert sorted data tuples to the index.
+@param[in] index index to be inserted
+@param[in] old_table old table
+@param[in] fd file descriptor
+@param[in,out] block file buffer
+@param[in]	row_buf		the sorted data tuples,
+or NULL if fd and block are to be used instead
+@param[in,out] btr_bulk btr bulk instance
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL stage->begin_phase_insert() will be called initially
+and then stage->inc() will be called for each record that is processed.
+@return DB_SUCCESS or error number */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_merge_insert_index_tuples(
+ dict_index_t* index,
+ const dict_table_t* old_table,
+ const pfs_os_file_t& fd,
+ row_merge_block_t* block,
+ const row_merge_buf_t* row_buf,
+ BtrBulk* btr_bulk,
+ const ib_uint64_t table_total_rows, /*!< in: total rows of old table */
+ const double pct_progress, /*!< in: total progress
+ percent until now */
+ const double pct_cost, /*!< in: current progress percent
+ */
+ row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */
+ ulint space, /*!< in: space id */
+ ut_stage_alter_t* stage = NULL);
+
+/******************************************************//**
+Encode an index record. */
+static MY_ATTRIBUTE((nonnull))
+void
+row_merge_buf_encode(
+/*=================*/
+ byte** b, /*!< in/out: pointer to
+ current end of output buffer */
+ const dict_index_t* index, /*!< in: index */
+ const mtuple_t* entry, /*!< in: index fields
+ of the record to encode */
+ ulint n_fields) /*!< in: number of fields
+ in the entry */
+{
+ ulint size;
+ ulint extra_size;
+
+ size = rec_get_converted_size_temp<false>(
+ index, entry->fields, n_fields, &extra_size);
+ ut_ad(size >= extra_size);
+
+ /* Encode extra_size + 1 */
+ if (extra_size + 1 < 0x80) {
+ *(*b)++ = (byte) (extra_size + 1);
+ } else {
+ ut_ad((extra_size + 1) < 0x8000);
+ *(*b)++ = (byte) (0x80 | ((extra_size + 1) >> 8));
+ *(*b)++ = (byte) (extra_size + 1);
+ }
+
+ rec_convert_dtuple_to_temp<false>(*b + extra_size, index,
+ entry->fields, n_fields);
+
+ *b += size;
+}
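+
+/* Example: row_merge_buf_encode() stores extra_size + 1 so that a 0
+byte can serve as the end-of-chunk marker (see row_merge_buf_write()
+below). Thus extra_size = 5 is written as the single byte 0x06, and
+extra_size = 0x200 as the two bytes 0x82 0x01, because
+0x80 | ((0x200 + 1) >> 8) == 0x82 and (byte) (0x200 + 1) == 0x01. */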
+
+/******************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+static MY_ATTRIBUTE((malloc, nonnull))
+row_merge_buf_t*
+row_merge_buf_create_low(
+/*=====================*/
+ mem_heap_t* heap, /*!< in: heap where allocated */
+ dict_index_t* index, /*!< in: secondary index */
+ ulint max_tuples, /*!< in: maximum number of
+ data tuples */
+ ulint buf_size) /*!< in: size of the buffer,
+ in bytes */
+{
+ row_merge_buf_t* buf;
+
+ ut_ad(max_tuples > 0);
+
+ ut_ad(max_tuples <= srv_sort_buf_size);
+
+ buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size));
+ buf->heap = heap;
+ buf->index = index;
+ buf->max_tuples = max_tuples;
+ buf->tuples = static_cast<mtuple_t*>(
+ ut_malloc_nokey(2 * max_tuples * sizeof *buf->tuples));
+ buf->tmp_tuples = buf->tuples + max_tuples;
+
+ return(buf);
+}
+
+/******************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+ dict_index_t* index) /*!< in: secondary index */
+{
+ row_merge_buf_t* buf;
+ ulint max_tuples;
+ ulint buf_size;
+ mem_heap_t* heap;
+
+ max_tuples = srv_sort_buf_size
+ / std::max<ulint>(1, dict_index_get_min_size(index));
+
+ buf_size = (sizeof *buf);
+
+ heap = mem_heap_create(buf_size);
+
+ buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
+
+ return(buf);
+}
+
+/******************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer */
+{
+ ulint buf_size = sizeof *buf;
+ ulint max_tuples = buf->max_tuples;
+ mem_heap_t* heap = buf->heap;
+ dict_index_t* index = buf->index;
+ mtuple_t* tuples = buf->tuples;
+
+ mem_heap_empty(heap);
+
+ buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size));
+ buf->heap = heap;
+ buf->index = index;
+ buf->max_tuples = max_tuples;
+ buf->tuples = tuples;
+ buf->tmp_tuples = buf->tuples + max_tuples;
+
+ return(buf);
+}
+
+/******************************************************//**
+Deallocate a sort buffer. */
+void
+row_merge_buf_free(
+/*===============*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */
+{
+ ut_free(buf->tuples);
+ mem_heap_free(buf->heap);
+}
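+
+/* A sketch of the sort-buffer lifecycle, assuming the usual caller
+(the clustered index scan that feeds the index build, elsewhere in
+this file):
+
+	row_merge_buf_t* buf = row_merge_buf_create(index);
+	while (row_merge_buf_add(buf, ...));	// until 0 = out of space
+	row_merge_buf_sort(buf, &dup);		// dup = NULL if non-unique
+	row_merge_buf_write(buf, of, block);	// then flush block to file
+	buf = row_merge_buf_empty(buf);		// reuse heap and tuples
+	...
+	row_merge_buf_free(buf);
+*/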
+
+/** Convert the field data from compact to redundant format.
+@param[in] row_field field to copy from
+@param[out] field field to copy to
+@param[in] len length of the field data
+@param[in] zip_size compressed BLOB page size,
+ zero for uncompressed BLOBs
+@param[in,out] heap memory heap where to allocate data when
+ converting to ROW_FORMAT=REDUNDANT, or NULL
+ when not to invoke
+ row_merge_buf_redundant_convert(). */
+static
+void
+row_merge_buf_redundant_convert(
+ const dfield_t* row_field,
+ dfield_t* field,
+ ulint len,
+ ulint zip_size,
+ mem_heap_t* heap)
+{
+ ut_ad(field->type.mbminlen == 1);
+ ut_ad(field->type.mbmaxlen > 1);
+
+ byte* buf = (byte*) mem_heap_alloc(heap, len);
+ ulint field_len = row_field->len;
+ ut_ad(field_len <= len);
+
+ if (row_field->ext) {
+ const byte* field_data = static_cast<const byte*>(
+ dfield_get_data(row_field));
+ ulint ext_len;
+
+ ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_a(memcmp(field_data + field_len - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
+
+ byte* data = btr_copy_externally_stored_field(
+ &ext_len, field_data, zip_size, field_len, heap);
+
+ ut_ad(ext_len < len);
+
+ memcpy(buf, data, ext_len);
+ field_len = ext_len;
+ } else {
+ memcpy(buf, row_field->data, field_len);
+ }
+
+ memset(buf + field_len, 0x20, len - field_len);
+
+ dfield_set_data(field, buf, len);
+}
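+
+/* Example: for a CHAR(4) column in a 3-byte character set, col->len
+is 12 bytes. If the row holds the 2-byte value "ab", the function
+above copies those 2 bytes (fetching them first if the value is
+externally stored) and pads the remaining 10 bytes with 0x20. */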
+
+/** Insert a data tuple into a sort buffer.
+@param[in,out] buf sort buffer
+@param[in] fts_index fts index to be created
+@param[in] old_table original table
+@param[in] new_table new table
+@param[in,out] psort_info parallel sort info
+@param[in,out] row table row
+@param[in] ext cache of externally stored
+ column prefixes, or NULL
+@param[in,out] doc_id Doc ID if we are creating
+ FTS index
+@param[in,out] conv_heap memory heap where to allocate data when
+ converting to ROW_FORMAT=REDUNDANT, or NULL
+ when not to invoke
+ row_merge_buf_redundant_convert()
+@param[in,out] err set if error occurs
+@param[in,out] v_heap heap memory to process data for virtual column
+@param[in,out] my_table mysql table object
+@param[in] trx transaction object
+@return number of rows added, 0 if out of space */
+static
+ulint
+row_merge_buf_add(
+ row_merge_buf_t* buf,
+ dict_index_t* fts_index,
+ const dict_table_t* old_table,
+ const dict_table_t* new_table,
+ fts_psort_t* psort_info,
+ dtuple_t* row,
+ const row_ext_t* ext,
+ doc_id_t* doc_id,
+ mem_heap_t* conv_heap,
+ dberr_t* err,
+ mem_heap_t** v_heap,
+ TABLE* my_table,
+ trx_t* trx)
+{
+ ulint i;
+ const dict_index_t* index;
+ mtuple_t* entry;
+ dfield_t* field;
+ const dict_field_t* ifield;
+ ulint n_fields;
+ ulint data_size;
+ ulint extra_size;
+ ulint bucket = 0;
+ doc_id_t write_doc_id;
+ ulint n_row_added = 0;
+	/* Value-initialize so that vcol_storage.innobase_record is
+	NULL before the first check below. */
+	VCOL_STORAGE	vcol_storage = {};
+ DBUG_ENTER("row_merge_buf_add");
+
+ if (buf->n_tuples >= buf->max_tuples) {
+error:
+ n_row_added = 0;
+ goto end;
+ }
+
+ DBUG_EXECUTE_IF(
+ "ib_row_merge_buf_add_two",
+ if (buf->n_tuples >= 2) DBUG_RETURN(0););
+
+ UNIV_PREFETCH_R(row->fields);
+
+	/* If we are building an FTS index, buf->index points to
+	the 'fts_sort_idx', and the real FTS index is stored in
+	fts_index */
+ index = (buf->index->type & DICT_FTS) ? fts_index : buf->index;
+
+	/* Spatial index creation should never reach this point */
+ ut_ad(!dict_index_is_spatial(index));
+
+ n_fields = dict_index_get_n_fields(index);
+
+ entry = &buf->tuples[buf->n_tuples];
+ field = entry->fields = static_cast<dfield_t*>(
+ mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields));
+
+ data_size = 0;
+ extra_size = UT_BITS_IN_BYTES(unsigned(index->n_nullable));
+
+ ifield = dict_index_get_nth_field(index, 0);
+
+ for (i = 0; i < n_fields; i++, field++, ifield++) {
+ ulint len;
+ ulint fixed_len;
+ const dfield_t* row_field;
+ const dict_col_t* const col = ifield->col;
+ const dict_v_col_t* const v_col = col->is_virtual()
+ ? reinterpret_cast<const dict_v_col_t*>(col)
+ : NULL;
+
+ /* Process the Doc ID column */
+ if (!v_col && *doc_id
+ && col->ind == index->table->fts->doc_col) {
+ fts_write_doc_id((byte*) &write_doc_id, *doc_id);
+
+			/* Note: field->data now points to a value on the
+			stack: &write_doc_id after dfield_set_data(). Because
+			there is only one doc_id per row, it shouldn't matter.
+			We allocate a new buffer before we leave the
+			function. */
+
+ dfield_set_data(
+ field, &write_doc_id, sizeof(write_doc_id));
+
+ field->type.mtype = ifield->col->mtype;
+ field->type.prtype = ifield->col->prtype;
+ field->type.mbminlen = 0;
+ field->type.mbmaxlen = 0;
+ field->type.len = ifield->col->len;
+ } else {
+ /* Use callback to get the virtual column value */
+ if (v_col) {
+ dict_index_t* clust_index
+ = dict_table_get_first_index(new_table);
+
+ if (!vcol_storage.innobase_record &&
+ !innobase_allocate_row_for_vcol(
+ trx->mysql_thd, clust_index,
+ v_heap, &my_table,
+ &vcol_storage)) {
+ *err = DB_OUT_OF_MEMORY;
+ goto error;
+ }
+
+ row_field = innobase_get_computed_value(
+ row, v_col, clust_index,
+ v_heap, NULL, ifield, trx->mysql_thd,
+ my_table, vcol_storage.innobase_record,
+ old_table, NULL, NULL);
+
+ if (row_field == NULL) {
+ *err = DB_COMPUTE_VALUE_FAILED;
+ goto error;
+ }
+ dfield_copy(field, row_field);
+ } else {
+ row_field = dtuple_get_nth_field(row,
+ col->ind);
+ dfield_copy(field, row_field);
+ }
+
+
+ /* Tokenize and process data for FTS */
+ if (index->type & DICT_FTS) {
+ fts_doc_item_t* doc_item;
+ byte* value;
+ void* ptr;
+ const ulint max_trial_count = 10000;
+ ulint trial_count = 0;
+
+				/* Fetch the Doc ID if it already exists
+				in the row and was not supplied by the
+				caller. Even if the value column is
+				NULL, we still need to get the Doc
+				ID in order to maintain the correct max
+				Doc ID */
+ if (*doc_id == 0) {
+ const dfield_t* doc_field;
+ doc_field = dtuple_get_nth_field(
+ row,
+ index->table->fts->doc_col);
+ *doc_id = (doc_id_t) mach_read_from_8(
+ static_cast<const byte*>(
+ dfield_get_data(doc_field)));
+
+ if (*doc_id == 0) {
+ ib::warn() << "FTS Doc ID is"
+ " zero. Record"
+ " skipped";
+ goto error;
+ }
+ }
+
+ if (dfield_is_null(field)) {
+ n_row_added = 1;
+ continue;
+ }
+
+ ptr = ut_malloc_nokey(sizeof(*doc_item)
+ + field->len);
+
+ doc_item = static_cast<fts_doc_item_t*>(ptr);
+ value = static_cast<byte*>(ptr)
+ + sizeof(*doc_item);
+ memcpy(value, field->data, field->len);
+ field->data = value;
+
+ doc_item->field = field;
+ doc_item->doc_id = *doc_id;
+
+ bucket = static_cast<ulint>(
+ *doc_id % fts_sort_pll_degree);
+
+ /* Add doc item to fts_doc_list */
+ mutex_enter(&psort_info[bucket].mutex);
+
+ if (psort_info[bucket].error == DB_SUCCESS) {
+ UT_LIST_ADD_LAST(
+ psort_info[bucket].fts_doc_list,
+ doc_item);
+ psort_info[bucket].memory_used +=
+ sizeof(*doc_item) + field->len;
+ } else {
+ ut_free(doc_item);
+ }
+
+ mutex_exit(&psort_info[bucket].mutex);
+
+				/* Sleep when the memory used exceeds
+				the limit */
+ while (psort_info[bucket].memory_used
+ > FTS_PENDING_DOC_MEMORY_LIMIT
+ && trial_count++ < max_trial_count) {
+ os_thread_sleep(1000);
+ }
+
+ n_row_added = 1;
+ continue;
+ }
+
+ /* innobase_get_computed_value() sets the
+ length of the virtual column field. */
+ if (v_col == NULL
+ && field->len != UNIV_SQL_NULL
+ && col->mtype == DATA_MYSQL
+ && col->len != field->len) {
+ if (conv_heap != NULL) {
+ row_merge_buf_redundant_convert(
+ row_field, field, col->len,
+ old_table->space->zip_size(),
+ conv_heap);
+ } else {
+					/* A field length mismatch should
+					not happen when rebuilding a
+					ROW_FORMAT=REDUNDANT table. */
+ ut_ad(index->table->not_redundant());
+ }
+ }
+ }
+
+ len = dfield_get_len(field);
+
+ if (dfield_is_null(field)) {
+ ut_ad(!(col->prtype & DATA_NOT_NULL));
+ continue;
+ } else if (!ext) {
+ } else if (dict_index_is_clust(index)) {
+ /* Flag externally stored fields. */
+ const byte* buf = row_ext_lookup(ext, col->ind,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ ut_a(buf != field_ref_zero);
+ if (i < dict_index_get_n_unique(index)) {
+ dfield_set_data(field, buf, len);
+ } else {
+ dfield_set_ext(field);
+ len = dfield_get_len(field);
+ }
+ }
+ } else if (!v_col) {
+ /* Only non-virtual column are stored externally */
+ const byte* buf = row_ext_lookup(ext, col->ind,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ ut_a(buf != field_ref_zero);
+ dfield_set_data(field, buf, len);
+ }
+ }
+
+		/* If this is a column prefix index, take only the prefix */
+
+ if (ifield->prefix_len) {
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ ifield->prefix_len,
+ len,
+ static_cast<char*>(dfield_get_data(field)));
+ dfield_set_len(field, len);
+ }
+
+ ut_ad(len <= col->len
+ || DATA_LARGE_MTYPE(col->mtype));
+
+ fixed_len = ifield->fixed_len;
+ if (fixed_len && !dict_table_is_comp(index->table)
+ && col->mbminlen != col->mbmaxlen) {
+ /* CHAR in ROW_FORMAT=REDUNDANT is always
+ fixed-length, but in the temporary file it is
+ variable-length for variable-length character
+ sets. */
+ fixed_len = 0;
+ }
+
+ if (fixed_len) {
+#ifdef UNIV_DEBUG
+			/* len should be between the sizes calculated
+			based on mbmaxlen and mbminlen */
+ ut_ad(len <= fixed_len);
+ ut_ad(!col->mbmaxlen || len >= col->mbminlen
+ * (fixed_len / col->mbmaxlen));
+
+ ut_ad(!dfield_is_ext(field));
+#endif /* UNIV_DEBUG */
+ } else if (dfield_is_ext(field)) {
+ extra_size += 2;
+ } else if (len < 128
+ || (!DATA_BIG_COL(col))) {
+ extra_size++;
+ } else {
+ /* For variable-length columns, we look up the
+ maximum length from the column itself. If this
+ is a prefix index column shorter than 256 bytes,
+ this will waste one byte. */
+ extra_size += 2;
+ }
+ data_size += len;
+ }
+
+	/* If this is an FTS index, we already populated the sort buffer;
+	return here */
+ if (index->type & DICT_FTS) {
+ goto end;
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ ulint size;
+ ulint extra;
+
+ size = rec_get_converted_size_temp<false>(
+ index, entry->fields, n_fields, &extra);
+
+ ut_ad(data_size + extra_size == size);
+ ut_ad(extra_size == extra);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Add to the total size of the record in row_merge_block_t
+ the encoded length of extra_size and the extra bytes (extra_size).
+ See row_merge_buf_write() for the variable-length encoding
+ of extra_size. */
+ data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
+
+	/* The record size can exceed the page size while converting to
+	the redundant row format. But there is an assertion
+	ut_ad(size < srv_page_size) in rec_offs_data_size().
+	It may hit the assertion before attempting to insert the row. */
+ if (conv_heap != NULL && data_size > srv_page_size) {
+ *err = DB_TOO_BIG_RECORD;
+ }
+
+ ut_ad(data_size < srv_sort_buf_size);
+
+ /* Reserve bytes for the end marker of row_merge_block_t. */
+ if (buf->total_size + data_size >= srv_sort_buf_size) {
+ goto error;
+ }
+
+ buf->total_size += data_size;
+ buf->n_tuples++;
+ n_row_added++;
+
+ field = entry->fields;
+
+ /* Copy the data fields. */
+
+ do {
+ dfield_dup(field++, buf->heap);
+ } while (--n_fields);
+
+ if (conv_heap != NULL) {
+ mem_heap_empty(conv_heap);
+ }
+
+end:
+ if (vcol_storage.innobase_record)
+ innobase_free_row_for_vcol(&vcol_storage);
+ DBUG_RETURN(n_row_added);
+}
+
+/*************************************************************//**
+Report a duplicate key. */
+void
+row_merge_dup_report(
+/*=================*/
+ row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
+ const dfield_t* entry) /*!< in: duplicate index entry */
+{
+ if (!dup->n_dup++) {
+ /* Only report the first duplicate record,
+ but count all duplicate records. */
+ innobase_fields_to_mysql(dup->table, dup->index, entry);
+ }
+}
+
+/*************************************************************//**
+Compare two tuples.
+@return positive, 0, negative if a is greater, equal, less, than b,
+respectively */
+static MY_ATTRIBUTE((warn_unused_result))
+int
+row_merge_tuple_cmp(
+/*================*/
+ ulint n_uniq, /*!< in: number of unique fields */
+ ulint n_field,/*!< in: number of fields */
+ const mtuple_t& a, /*!< in: first tuple to be compared */
+ const mtuple_t& b, /*!< in: second tuple to be compared */
+ row_merge_dup_t* dup) /*!< in/out: for reporting duplicates,
+ NULL if non-unique index */
+{
+ int cmp;
+ const dfield_t* af = a.fields;
+ const dfield_t* bf = b.fields;
+ ulint n = n_uniq;
+
+ ut_ad(n_uniq > 0);
+ ut_ad(n_uniq <= n_field);
+
+ /* Compare the fields of the tuples until a difference is
+ found or we run out of fields to compare. If !cmp at the
+ end, the tuples are equal. */
+ do {
+ cmp = cmp_dfield_dfield(af++, bf++);
+ } while (!cmp && --n);
+
+ if (cmp) {
+ return(cmp);
+ }
+
+ if (dup) {
+ /* Report a duplicate value error if the tuples are
+		logically equal. NULL columns are logically unequal,
+ although they are equal in the sorting order. Find
+ out if any of the fields are NULL. */
+ for (const dfield_t* df = a.fields; df != af; df++) {
+ if (dfield_is_null(df)) {
+ goto no_report;
+ }
+ }
+
+ row_merge_dup_report(dup, a.fields);
+ }
+
+no_report:
+ /* The n_uniq fields were equal, but we compare all fields so
+ that we will get the same (internal) order as in the B-tree. */
+ for (n = n_field - n_uniq + 1; --n; ) {
+ cmp = cmp_dfield_dfield(af++, bf++);
+ if (cmp) {
+ return(cmp);
+ }
+ }
+
+	/* This should never be reached, except when creating a
+	secondary index alongside a new PRIMARY KEY, and there is a
+	duplicate in the PRIMARY KEY that has not been detected
+	yet. Internally, an index must never contain duplicates. */
+ return(cmp);
+}
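+
+/* Example of the NULL semantics above: when building a UNIQUE index
+on (a, b), the tuples (NULL, 1) and (NULL, 1) compare equal in the
+sort order, yet no duplicate is reported, because NULL is not equal to
+NULL in SQL. The tuples (2, 1) and (2, 1), by contrast, would be
+reported through row_merge_dup_report(). */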
+
+/** Wrapper for row_merge_tuple_sort() to inject some more context to
+UT_SORT_FUNCTION_BODY().
+@param tuples	array of tuples being sorted
+@param aux	work area, same size as tuples[]
+@param low	lower bound of the sorting area, inclusive
+@param high	upper bound of the sorting area, exclusive */
+#define row_merge_tuple_sort_ctx(tuples, aux, low, high) \
+ row_merge_tuple_sort(n_uniq, n_field, dup, tuples, aux, low, high)
+/** Wrapper for row_merge_tuple_cmp() to inject some more context to
+UT_SORT_FUNCTION_BODY().
+@param a first tuple to be compared
+@param b second tuple to be compared
+@return positive, 0, negative, if a is greater, equal, less, than b,
+respectively */
+#define row_merge_tuple_cmp_ctx(a,b) \
+ row_merge_tuple_cmp(n_uniq, n_field, a, b, dup)
+
+/**********************************************************************//**
+Merge sort the tuple buffer in main memory. */
+static
+void
+row_merge_tuple_sort(
+/*=================*/
+ ulint n_uniq, /*!< in: number of unique fields */
+ ulint n_field,/*!< in: number of fields */
+ row_merge_dup_t* dup, /*!< in/out: reporter of duplicates
+ (NULL if non-unique index) */
+ mtuple_t* tuples, /*!< in/out: tuples */
+ mtuple_t* aux, /*!< in/out: work area */
+ ulint low, /*!< in: lower bound of the
+ sorting area, inclusive */
+ ulint high) /*!< in: upper bound of the
+ sorting area, exclusive */
+{
+ ut_ad(n_field > 0);
+ ut_ad(n_uniq <= n_field);
+
+ UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
+ tuples, aux, low, high, row_merge_tuple_cmp_ctx);
+}
+
+/******************************************************//**
+Sort a buffer. */
+void
+row_merge_buf_sort(
+/*===============*/
+ row_merge_buf_t* buf, /*!< in/out: sort buffer */
+ row_merge_dup_t* dup) /*!< in/out: reporter of duplicates
+ (NULL if non-unique index) */
+{
+ ut_ad(!dict_index_is_spatial(buf->index));
+
+ row_merge_tuple_sort(dict_index_get_n_unique(buf->index),
+ dict_index_get_n_fields(buf->index),
+ dup,
+ buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
+}
+
+/******************************************************//**
+Write a buffer to a block. */
+void
+row_merge_buf_write(
+/*================*/
+ const row_merge_buf_t* buf, /*!< in: sorted buffer */
+ const merge_file_t* of UNIV_UNUSED,
+ /*!< in: output file */
+ row_merge_block_t* block) /*!< out: buffer for writing to file */
+{
+ const dict_index_t* index = buf->index;
+ ulint n_fields= dict_index_get_n_fields(index);
+ byte* b = &block[0];
+
+ DBUG_ENTER("row_merge_buf_write");
+
+ for (ulint i = 0; i < buf->n_tuples; i++) {
+ const mtuple_t* entry = &buf->tuples[i];
+
+ row_merge_buf_encode(&b, index, entry, n_fields);
+ ut_ad(b < &block[srv_sort_buf_size]);
+
+ DBUG_LOG("ib_merge_sort",
+ reinterpret_cast<const void*>(b) << ','
+ << of->fd << ',' << of->offset << ' ' <<
+ i << ": " <<
+ rec_printer(entry->fields, n_fields).str());
+ }
+
+ /* Write an "end-of-chunk" marker. */
+ ut_a(b < &block[srv_sort_buf_size]);
+ ut_a(b == &block[0] + buf->total_size);
+ *b++ = 0;
+#ifdef HAVE_valgrind
+ /* The rest of the block is uninitialized. Initialize it
+ to avoid bogus warnings. */
+ memset(b, 0xff, &block[srv_sort_buf_size] - b);
+#endif /* HAVE_valgrind */
+ DBUG_LOG("ib_merge_sort",
+ "write " << reinterpret_cast<const void*>(b) << ','
+ << of->fd << ',' << of->offset << " EOF");
+ DBUG_VOID_RETURN;
+}
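+
+/* Sketch of the block layout produced above and consumed by
+row_merge_read_rec() below:
+
+	[extra_size + 1, 1..2 bytes][extra bytes][data bytes]	record 1
+	...
+	[extra_size + 1, 1..2 bytes][extra bytes][data bytes]	record n
+	[0x00]				end-of-chunk marker
+
+Within a freshly written buffer no record spans a block boundary; on
+later merge passes a record may, and the reader reassembles it in a
+mrec_buf_t. */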
+
+/******************************************************//**
+Create a memory heap and allocate space for row_merge_rec_offsets()
+and mrec_buf_t[3].
+@return memory heap */
+static
+mem_heap_t*
+row_merge_heap_create(
+/*==================*/
+ const dict_index_t* index, /*!< in: record descriptor */
+ mrec_buf_t** buf, /*!< out: 3 buffers */
+ rec_offs** offsets1, /*!< out: offsets */
+ rec_offs** offsets2) /*!< out: offsets */
+{
+ ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ mem_heap_t* heap = mem_heap_create(2 * i * sizeof **offsets1
+ + 3 * sizeof **buf);
+
+ *buf = static_cast<mrec_buf_t*>(
+ mem_heap_alloc(heap, 3 * sizeof **buf));
+ *offsets1 = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, i * sizeof **offsets1));
+ *offsets2 = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, i * sizeof **offsets2));
+
+ rec_offs_set_n_alloc(*offsets1, i);
+ rec_offs_set_n_alloc(*offsets2, i);
+ rec_offs_set_n_fields(*offsets1, dict_index_get_n_fields(index));
+ rec_offs_set_n_fields(*offsets2, dict_index_get_n_fields(index));
+
+ return(heap);
+}
+
+/** Read a merge block from the file system.
+@return whether the request was completed successfully */
+bool
+row_merge_read(
+/*===========*/
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to read
+ in number of row_merge_block_t
+ elements */
+ row_merge_block_t* buf, /*!< out: data */
+ row_merge_block_t* crypt_buf, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+{
+ os_offset_t ofs = ((os_offset_t) offset) * srv_sort_buf_size;
+
+ DBUG_ENTER("row_merge_read");
+ DBUG_LOG("ib_merge_sort", "fd=" << fd << " ofs=" << ofs);
+ DBUG_EXECUTE_IF("row_merge_read_failure", DBUG_RETURN(FALSE););
+
+ const bool success = DB_SUCCESS == os_file_read_no_error_handling(
+ IORequestRead, fd, buf, ofs, srv_sort_buf_size, 0);
+
+ /* If encryption is enabled decrypt buffer */
+ if (success && log_tmp_is_encrypted()) {
+ if (!log_tmp_block_decrypt(buf, srv_sort_buf_size,
+ crypt_buf, ofs)) {
+			/* Keep the DBUG stack balanced: this function
+			was entered with DBUG_ENTER. */
+			DBUG_RETURN(FALSE);
+ }
+
+ srv_stats.n_merge_blocks_decrypted.inc();
+ memcpy(buf, crypt_buf, srv_sort_buf_size);
+ }
+
+#ifdef POSIX_FADV_DONTNEED
+ /* Each block is read exactly once. Free up the file cache. */
+ posix_fadvise(fd, ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+ if (!success) {
+ ib::error() << "Failed to read merge block at " << ofs;
+ }
+
+ DBUG_RETURN(success);
+}
+
+/********************************************************************//**
+Write a merge block to the file system.
+@return whether the request was completed successfully
+@retval false on error
+@retval true on success */
+UNIV_INTERN
+bool
+row_merge_write(
+/*============*/
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to write,
+ in number of row_merge_block_t elements */
+ const void* buf, /*!< in: data */
+ void* crypt_buf, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+{
+ size_t buf_len = srv_sort_buf_size;
+ os_offset_t ofs = buf_len * (os_offset_t) offset;
+	void*		out_buf = const_cast<void*>(buf);
+
+ DBUG_ENTER("row_merge_write");
+ DBUG_LOG("ib_merge_sort", "fd=" << fd << " ofs=" << ofs);
+ DBUG_EXECUTE_IF("row_merge_write_failure", DBUG_RETURN(FALSE););
+
+ /* For encrypted tables, encrypt data before writing */
+ if (log_tmp_is_encrypted()) {
+ if (!log_tmp_block_encrypt(static_cast<const byte*>(buf),
+ buf_len,
+ static_cast<byte*>(crypt_buf),
+ ofs)) {
+			/* Keep the DBUG stack balanced: this function
+			was entered with DBUG_ENTER. */
+			DBUG_RETURN(false);
+ }
+
+ srv_stats.n_merge_blocks_encrypted.inc();
+ out_buf = crypt_buf;
+ }
+
+ const bool success = DB_SUCCESS == os_file_write(
+ IORequestWrite, "(merge)", fd, out_buf, ofs, buf_len);
+
+#ifdef POSIX_FADV_DONTNEED
+ /* The block will be needed on the next merge pass,
+ but it can be evicted from the file cache meanwhile. */
+ posix_fadvise(fd, ofs, buf_len, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+ DBUG_RETURN(success);
+}
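+
+/* Illustrative round trip (a sketch, assuming log_tmp_is_encrypted()
+holds): row_merge_write() encrypts into the crypt buffer before the
+write, and row_merge_read() decrypts after the read, so callers always
+see plaintext in buf:
+
+	if (row_merge_write(fd, foffs, block, crypt_block, space)
+	    && row_merge_read(fd, foffs, block, crypt_block, space)) {
+		... block[] again holds the plaintext that was written ...
+	}
+*/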
+
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+const byte*
+row_merge_read_rec(
+/*===============*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ mrec_buf_t* buf, /*!< in/out: secondary buffer */
+ const byte* b, /*!< in: pointer to record */
+ const dict_index_t* index, /*!< in: index of the record */
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ const mrec_t** mrec, /*!< out: pointer to merge record,
+ or NULL on end of list
+ (non-NULL on I/O error) */
+ rec_offs* offsets,/*!< out: offsets of mrec */
+ row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+{
+ ulint extra_size;
+ ulint data_size;
+ ulint avail_size;
+
+ ut_ad(b >= &block[0]);
+ ut_ad(b < &block[srv_sort_buf_size]);
+
+ ut_ad(rec_offs_get_n_alloc(offsets) == 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index));
+
+ DBUG_ENTER("row_merge_read_rec");
+
+ extra_size = *b++;
+
+ if (UNIV_UNLIKELY(!extra_size)) {
+ /* End of list */
+ *mrec = NULL;
+ DBUG_LOG("ib_merge_sort",
+ "read " << reinterpret_cast<const void*>(b) << ',' <<
+ reinterpret_cast<const void*>(block) << ',' <<
+ fd << ',' << *foffs << " EOF");
+ DBUG_RETURN(NULL);
+ }
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ if (UNIV_UNLIKELY(b >= &block[srv_sort_buf_size])) {
+ if (!row_merge_read(fd, ++(*foffs), block,
+ crypt_block,
+ space)) {
+err_exit:
+ /* Signal I/O error. */
+ *mrec = b;
+ DBUG_RETURN(NULL);
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = &block[0];
+ }
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *b++;
+ }
+
+ /* Normalize extra_size. Above, value 0 signals "end of list". */
+ extra_size--;
+
+ /* Read the extra bytes. */
+
+ if (UNIV_UNLIKELY(b + extra_size >= &block[srv_sort_buf_size])) {
+ /* The record spans two blocks. Copy the entire record
+ to the auxiliary buffer and handle this as a special
+ case. */
+
+ avail_size = ulint(&block[srv_sort_buf_size] - b);
+ ut_ad(avail_size < sizeof *buf);
+ memcpy(*buf, b, avail_size);
+
+ if (!row_merge_read(fd, ++(*foffs), block,
+ crypt_block,
+ space)) {
+
+ goto err_exit;
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = &block[0];
+
+ /* Copy the record. */
+ memcpy(*buf + avail_size, b, extra_size - avail_size);
+ b += extra_size - avail_size;
+
+ *mrec = *buf + extra_size;
+
+ rec_init_offsets_temp(*mrec, index, offsets);
+
+ data_size = rec_offs_data_size(offsets);
+
+ /* These overflows should be impossible given that
+ records are much smaller than either buffer, and
+ the record starts near the beginning of each buffer. */
+ ut_a(extra_size + data_size < sizeof *buf);
+ ut_a(b + data_size < &block[srv_sort_buf_size]);
+
+ /* Copy the data bytes. */
+ memcpy(*buf + extra_size, b, data_size);
+ b += data_size;
+
+ goto func_exit;
+ }
+
+ *mrec = b + extra_size;
+
+ rec_init_offsets_temp(*mrec, index, offsets);
+
+ data_size = rec_offs_data_size(offsets);
+ ut_ad(extra_size + data_size < sizeof *buf);
+
+ b += extra_size + data_size;
+
+ if (UNIV_LIKELY(b < &block[srv_sort_buf_size])) {
+ /* The record fits entirely in the block.
+ This is the normal case. */
+ goto func_exit;
+ }
+
+ /* The record spans two blocks. Copy it to buf. */
+
+ b -= extra_size + data_size;
+ avail_size = ulint(&block[srv_sort_buf_size] - b);
+ memcpy(*buf, b, avail_size);
+ *mrec = *buf + extra_size;
+
+ rec_init_offsets_temp(*mrec, index, offsets);
+
+ if (!row_merge_read(fd, ++(*foffs), block,
+ crypt_block,
+ space)) {
+
+ goto err_exit;
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = &block[0];
+
+ /* Copy the rest of the record. */
+ memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
+ b += extra_size + data_size - avail_size;
+
+func_exit:
+ DBUG_LOG("ib_merge_sort",
+ reinterpret_cast<const void*>(b) << ',' <<
+ reinterpret_cast<const void*>(block)
+ << ",fd=" << fd << ',' << *foffs << ": "
+ << rec_printer(*mrec, 0, offsets).str());
+ DBUG_RETURN(b);
+}
+
+/********************************************************************//**
+Write a merge record. */
+static
+void
+row_merge_write_rec_low(
+/*====================*/
+ byte* b, /*!< out: buffer */
+ ulint e, /*!< in: encoded extra_size */
+#ifndef DBUG_OFF
+ ulint size, /*!< in: total size to write */
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint foffs, /*!< in: file offset */
+#endif /* !DBUG_OFF */
+ const mrec_t* mrec, /*!< in: record to write */
+ const rec_offs* offsets)/*!< in: offsets of mrec */
+#ifdef DBUG_OFF
+# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \
+ row_merge_write_rec_low(b, e, mrec, offsets)
+#endif /* DBUG_OFF */
+{
+ DBUG_ENTER("row_merge_write_rec_low");
+
+#ifndef DBUG_OFF
+ const byte* const end = b + size;
+#endif /* !DBUG_OFF */
+ DBUG_ASSERT(e == rec_offs_extra_size(offsets) + 1);
+
+ DBUG_LOG("ib_merge_sort",
+ reinterpret_cast<const void*>(b) << ",fd=" << fd << ','
+ << foffs << ": " << rec_printer(mrec, 0, offsets).str());
+
+ if (e < 0x80) {
+ *b++ = (byte) e;
+ } else {
+ *b++ = (byte) (0x80 | (e >> 8));
+ *b++ = (byte) e;
+ }
+
+ memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
+ DBUG_SLOW_ASSERT(b + rec_offs_size(offsets) == end);
+ DBUG_VOID_RETURN;
+}
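+
+/* Worked example of the extra_size encoding above (illustrative):
+the stored value is e = rec_offs_extra_size(offsets) + 1, reserving 0
+as the end-of-list marker. Values below 0x80 occupy one byte; larger
+values occupy two bytes with the high bit of the first byte set:
+
+	e = 0x05:	05
+	e = 0x1234:	92 34	(0x80 | (0x1234 >> 8), then 0x34)
+
+row_merge_read_rec() reverses this: if the first byte b0 has 0x80 set,
+it reads a second byte b1, computes ((b0 & 0x7f) << 8) | b1, and
+finally subtracts 1 to undo the normalization. */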
+
+/********************************************************************//**
+Write a merge record.
+@return pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_rec(
+/*================*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ mrec_buf_t* buf, /*!< in/out: secondary buffer */
+ byte* b, /*!< in: pointer to end of block */
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ const mrec_t* mrec, /*!< in: record to write */
+ const rec_offs* offsets,/*!< in: offsets of mrec */
+ row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+{
+ ulint extra_size;
+ ulint size;
+ ulint avail_size;
+
+ ut_ad(block);
+ ut_ad(buf);
+ ut_ad(b >= &block[0]);
+ ut_ad(b < &block[srv_sort_buf_size]);
+ ut_ad(mrec);
+ ut_ad(foffs);
+ ut_ad(mrec < &block[0] || mrec > &block[srv_sort_buf_size]);
+ ut_ad(mrec < buf[0] || mrec > buf[1]);
+
+ /* Normalize extra_size. Value 0 signals "end of list". */
+ extra_size = rec_offs_extra_size(offsets) + 1;
+
+ size = extra_size + (extra_size >= 0x80)
+ + rec_offs_data_size(offsets);
+
+ if (UNIV_UNLIKELY(b + size >= &block[srv_sort_buf_size])) {
+ /* The record spans two blocks.
+ Copy it to the temporary buffer first. */
+ avail_size = ulint(&block[srv_sort_buf_size] - b);
+
+ row_merge_write_rec_low(buf[0],
+ extra_size, size, fd, *foffs,
+ mrec, offsets);
+
+ /* Copy the head of the temporary buffer, write
+ the completed block, and copy the tail of the
+ record to the head of the new block. */
+ memcpy(b, buf[0], avail_size);
+
+ if (!row_merge_write(fd, (*foffs)++, block,
+ crypt_block,
+ space)) {
+ return(NULL);
+ }
+
+ MEM_UNDEFINED(&block[0], srv_sort_buf_size);
+
+ /* Copy the rest. */
+ b = &block[0];
+ memcpy(b, buf[0] + avail_size, size - avail_size);
+ b += size - avail_size;
+ } else {
+ row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
+ mrec, offsets);
+ b += size;
+ }
+
+ return(b);
+}
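+
+/* Illustrative layout for the block-spanning branch above: when only
+avail_size bytes remain in the current block, the whole record is first
+serialized into buf[0]; its head completes the block, the block is
+flushed, and its tail begins the next block:
+
+	current block:	[ ...already written... | head (avail_size) ]
+	buf[0]:		[ e | extra bytes | data bytes ]
+	next block:	[ tail (size - avail_size) | ... ]
+
+row_merge_read_rec() performs the inverse reassembly into buf[]. */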
+
+/********************************************************************//**
+Write an end-of-list marker.
+@return pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_eof(
+/*================*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ byte* b, /*!< in: pointer to end of block */
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+{
+ ut_ad(block);
+ ut_ad(b >= &block[0]);
+ ut_ad(b < &block[srv_sort_buf_size]);
+ ut_ad(foffs);
+
+ DBUG_ENTER("row_merge_write_eof");
+ DBUG_LOG("ib_merge_sort",
+ reinterpret_cast<const void*>(b) << ',' <<
+ reinterpret_cast<const void*>(block) <<
+ ",fd=" << fd << ',' << *foffs);
+
+ *b++ = 0;
+ MEM_CHECK_DEFINED(&block[0], b - &block[0]);
+ MEM_CHECK_ADDRESSABLE(&block[0], srv_sort_buf_size);
+
+ /* The rest of the block is uninitialized. Silence warnings. */
+ MEM_MAKE_DEFINED(b, &block[srv_sort_buf_size] - b);
+
+ if (!row_merge_write(fd, (*foffs)++, block, crypt_block, space)) {
+ DBUG_RETURN(NULL);
+ }
+
+ MEM_UNDEFINED(&block[0], srv_sort_buf_size);
+ DBUG_RETURN(&block[0]);
+}
+
+/** Create a temporary file if it has not been created already.
+@param[in,out] tmpfd temporary file handle
+@param[in] path location for creating temporary file
+@return true on success, false on error */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+row_merge_tmpfile_if_needed(
+ pfs_os_file_t* tmpfd,
+ const char* path)
+{
+ if (*tmpfd == OS_FILE_CLOSED) {
+ *tmpfd = row_merge_file_create_low(path);
+ if (*tmpfd != OS_FILE_CLOSED) {
+ MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_SORT_FILES);
+ }
+ }
+
+ return(*tmpfd != OS_FILE_CLOSED);
+}
+
+/** Create a temporary file for merge sort if it was not created already.
+@param[in,out]	file	merge file structure
+@param[in,out]	tmpfd	temporary file handle
+@param[in]	nrec	number of records in the file
+@param[in]	path	location for creating temporary file
+@return true on success, false on error */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+row_merge_file_create_if_needed(
+ merge_file_t* file,
+ pfs_os_file_t* tmpfd,
+ ulint nrec,
+ const char* path)
+{
+ ut_ad(file->fd == OS_FILE_CLOSED || *tmpfd != OS_FILE_CLOSED);
+	if (file->fd == OS_FILE_CLOSED
+	    && row_merge_file_create(file, path) != OS_FILE_CLOSED) {
+		MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_SORT_FILES);
+		if (!row_merge_tmpfile_if_needed(tmpfd, path)) {
+ return(false);
+ }
+
+ file->n_rec = nrec;
+ }
+
+ ut_ad(file->fd == OS_FILE_CLOSED || *tmpfd != OS_FILE_CLOSED);
+ return(file->fd != OS_FILE_CLOSED);
+}
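+
+/* Illustrative call pattern (as used in row_merge_read_clustered_index()
+below): both files are created lazily, only once the first full sort
+buffer has to be spilled to disk:
+
+	if (!row_merge_file_create_if_needed(file, tmpfd,
+					     buf->n_tuples, path)) {
+		err = DB_OUT_OF_MEMORY;
+	}
+*/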
+
+/** Copy the merge data tuple from another merge data tuple.
+@param[in] mtuple source merge data tuple
+@param[in,out] prev_mtuple destination merge data tuple
+@param[in]	n_unique	number of unique fields in the mtuple
+@param[in,out]	heap		memory heap where prev_mtuple is allocated */
+static
+void
+row_mtuple_create(
+ const mtuple_t* mtuple,
+ mtuple_t* prev_mtuple,
+ ulint n_unique,
+ mem_heap_t* heap)
+{
+ memcpy(prev_mtuple->fields, mtuple->fields,
+ n_unique * sizeof *mtuple->fields);
+
+ dfield_t* field = prev_mtuple->fields;
+
+ for (ulint i = 0; i < n_unique; i++) {
+ dfield_dup(field++, heap);
+ }
+}
+
+/** Compare two merge data tuples.
+@param[in] prev_mtuple merge data tuple
+@param[in] current_mtuple merge data tuple
+@param[in,out] dup reporter of duplicates
+@return positive, 0, or negative if current_mtuple is greater than,
+equal to, or less than prev_mtuple */
+static
+int
+row_mtuple_cmp(
+ const mtuple_t* prev_mtuple,
+ const mtuple_t* current_mtuple,
+ row_merge_dup_t* dup)
+{
+ ut_ad(dict_index_is_clust(dup->index));
+ const ulint n_unique = dict_index_get_n_unique(dup->index);
+
+ return(row_merge_tuple_cmp(
+ n_unique, n_unique, *current_mtuple, *prev_mtuple, dup));
+}
+
+/** Insert cached spatial index rows.
+@param[in] trx_id transaction id
+@param[in] sp_tuples cached spatial rows
+@param[in] num_spatial number of spatial indexes
+@param[in,out] heap heap for insert
+@param[in,out] sp_heap heap for tuples
+@param[in,out]	pcur		clustered index cursor
+@param[in,out] started whether mtr is active
+@param[in,out] mtr mini-transaction
+@return DB_SUCCESS or error number */
+static
+dberr_t
+row_merge_spatial_rows(
+ trx_id_t trx_id,
+ index_tuple_info_t** sp_tuples,
+ ulint num_spatial,
+ mem_heap_t* heap,
+ mem_heap_t* sp_heap,
+ btr_pcur_t* pcur,
+ bool& started,
+ mtr_t* mtr)
+{
+ if (!sp_tuples)
+ return DB_SUCCESS;
+
+ for (ulint j= 0; j < num_spatial; j++)
+ if (dberr_t err= sp_tuples[j]->insert(trx_id, heap, pcur, started, mtr))
+ return err;
+
+ mem_heap_empty(sp_heap);
+ return DB_SUCCESS;
+}
+
+/** Check if the geometry field is valid.
+@param[in] row the row
+@param[in] index spatial index
+@return true if it's valid, false if it's invalid. */
+static
+bool
+row_geo_field_is_valid(
+ const dtuple_t* row,
+ dict_index_t* index)
+{
+ const dict_field_t* ind_field
+ = dict_index_get_nth_field(index, 0);
+ const dict_col_t* col
+ = ind_field->col;
+ ulint col_no
+ = dict_col_get_no(col);
+ const dfield_t* dfield
+ = dtuple_get_nth_field(row, col_no);
+
+ if (dfield_is_null(dfield)
+ || dfield_get_len(dfield) < GEO_DATA_HEADER_SIZE) {
+ return(false);
+ }
+
+ return(true);
+}
+
+/** Read the clustered index of the table and create temporary files
+containing the index entries for the indexes to be built.
+@param[in] trx transaction
+@param[in,out] table MySQL table object, for reporting erroneous
+ records
+@param[in] old_table table where rows are read from
+@param[in] new_table table where indexes are created; identical to
+ old_table unless creating a PRIMARY KEY
+@param[in] online true if creating indexes online
+@param[in] index indexes to be created
+@param[in] fts_sort_idx full-text index to be created, or NULL
+@param[in] psort_info parallel sort info for fts_sort_idx creation,
+ or NULL
+@param[in] files temporary files
+@param[in] key_numbers MySQL key numbers to create
+@param[in] n_index number of indexes to create
+@param[in] defaults default values of added, changed columns, or NULL
+@param[in] add_v newly added virtual columns along with indexes
+@param[in] col_map mapping of old column numbers to new ones, or
+NULL if old_table == new_table
+@param[in] add_autoinc number of added AUTO_INCREMENT columns, or
+ULINT_UNDEFINED if none is added
+@param[in,out] sequence autoinc sequence
+@param[in,out] block file buffer
+@param[in] skip_pk_sort whether the new PRIMARY KEY will follow
+existing order
+@param[in,out] tmpfd temporary file handle
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->n_pk_recs_inc() will be called for each record read and
+stage->inc() will be called for each page read.
+@param[in] pct_cost percent of task weight out of total alter job
+@param[in,out] crypt_block crypted file buffer
+@param[in] eval_table mysql table used to evaluate virtual column
+ value, see innobase_get_computed_value().
+@param[in] allow_not_null allow null to not-null conversion
+@return DB_SUCCESS or error */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_merge_read_clustered_index(
+ trx_t* trx,
+ struct TABLE* table,
+ const dict_table_t* old_table,
+ dict_table_t* new_table,
+ bool online,
+ dict_index_t** index,
+ dict_index_t* fts_sort_idx,
+ fts_psort_t* psort_info,
+ merge_file_t* files,
+ const ulint* key_numbers,
+ ulint n_index,
+ const dtuple_t* defaults,
+ const dict_add_v_col_t* add_v,
+ const ulint* col_map,
+ ulint add_autoinc,
+ ib_sequence_t& sequence,
+ row_merge_block_t* block,
+ bool skip_pk_sort,
+ pfs_os_file_t* tmpfd,
+ ut_stage_alter_t* stage,
+ double pct_cost,
+ row_merge_block_t* crypt_block,
+ struct TABLE* eval_table,
+ bool allow_not_null)
+{
+ dict_index_t* clust_index; /* Clustered index */
+ mem_heap_t* row_heap = NULL;/* Heap memory to create
+ clustered index tuples */
+ row_merge_buf_t** merge_buf; /* Temporary list for records*/
+ mem_heap_t* v_heap = NULL; /* Heap memory to process large
+ data for virtual column */
+ btr_pcur_t pcur; /* Cursor on the clustered
+ index */
+	mtr_t			mtr;		/* Mini-transaction */
+ bool mtr_started = false;
+ dberr_t err = DB_SUCCESS;/* Return code */
+ ulint n_nonnull = 0; /* number of columns
+ changed to NOT NULL */
+ ulint* nonnull = NULL; /* NOT NULL columns */
+ dict_index_t* fts_index = NULL;/* FTS index */
+ doc_id_t doc_id = 0;
+ doc_id_t max_doc_id = 0;
+ ibool add_doc_id = FALSE;
+ os_event_t fts_parallel_sort_event = NULL;
+ ibool fts_pll_sort = FALSE;
+ int64_t sig_count = 0;
+ index_tuple_info_t** sp_tuples = NULL;
+ mem_heap_t* sp_heap = NULL;
+ ulint num_spatial = 0;
+ BtrBulk* clust_btr_bulk = NULL;
+ bool clust_temp_file = false;
+ mem_heap_t* mtuple_heap = NULL;
+ mtuple_t prev_mtuple;
+ mem_heap_t* conv_heap = NULL;
+ double curr_progress = 0.0;
+ ib_uint64_t read_rows = 0;
+ ib_uint64_t table_total_rows = 0;
+ char new_sys_trx_start[8];
+ char new_sys_trx_end[8];
+ byte any_autoinc_data[8] = {0};
+ bool vers_update_trt = false;
+
+ DBUG_ENTER("row_merge_read_clustered_index");
+
+ ut_ad((old_table == new_table) == !col_map);
+ ut_ad(!defaults || col_map);
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(trx->id);
+
+ table_total_rows = dict_table_get_n_rows(old_table);
+	if (table_total_rows == 0) {
+ /* We don't know total row count */
+ table_total_rows = 1;
+ }
+
+ trx->op_info = "reading clustered index";
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ DEBUG_FTS_SORT_PRINT("FTS_SORT: Start Create Index\n");
+#endif
+
+ /* Create and initialize memory for record buffers */
+
+ merge_buf = static_cast<row_merge_buf_t**>(
+ ut_malloc_nokey(n_index * sizeof *merge_buf));
+
+ row_merge_dup_t clust_dup = {index[0], table, col_map, 0};
+ dfield_t* prev_fields;
+ const ulint n_uniq = dict_index_get_n_unique(index[0]);
+
+ ut_ad(trx->mysql_thd != NULL);
+
+ const char* path = thd_innodb_tmpdir(trx->mysql_thd);
+
+ ut_ad(!skip_pk_sort || dict_index_is_clust(index[0]));
+ /* There is no previous tuple yet. */
+ prev_mtuple.fields = NULL;
+
+ for (ulint i = 0; i < n_index; i++) {
+ if (index[i]->type & DICT_FTS) {
+
+ /* We are building a FT index, make sure
+ we have the temporary 'fts_sort_idx' */
+ ut_a(fts_sort_idx);
+
+ fts_index = index[i];
+
+ merge_buf[i] = row_merge_buf_create(fts_sort_idx);
+
+ add_doc_id = DICT_TF2_FLAG_IS_SET(
+ new_table, DICT_TF2_FTS_ADD_DOC_ID);
+
+ /* If Doc ID does not exist in the table itself,
+ fetch the first FTS Doc ID */
+ if (add_doc_id) {
+ fts_get_next_doc_id(
+ (dict_table_t*) new_table,
+ &doc_id);
+ ut_ad(doc_id > 0);
+ }
+
+ fts_pll_sort = TRUE;
+ row_fts_start_psort(psort_info);
+ fts_parallel_sort_event =
+ psort_info[0].psort_common->sort_event;
+ } else {
+ if (dict_index_is_spatial(index[i])) {
+ num_spatial++;
+ }
+
+ merge_buf[i] = row_merge_buf_create(index[i]);
+ }
+ }
+
+ if (num_spatial > 0) {
+ ulint count = 0;
+
+ sp_heap = mem_heap_create(512);
+
+ sp_tuples = static_cast<index_tuple_info_t**>(
+ ut_malloc_nokey(num_spatial
+ * sizeof(*sp_tuples)));
+
+ for (ulint i = 0; i < n_index; i++) {
+ if (dict_index_is_spatial(index[i])) {
+ sp_tuples[count]
+ = UT_NEW_NOKEY(
+ index_tuple_info_t(
+ sp_heap,
+ index[i]));
+ count++;
+ }
+ }
+
+ ut_ad(count == num_spatial);
+ }
+
+ mtr.start();
+ mtr_started = true;
+
+ /* Find the clustered index and create a persistent cursor
+ based on that. */
+
+ clust_index = dict_table_get_first_index(old_table);
+ const ulint old_trx_id_col = ulint(old_table->n_cols)
+ - (DATA_N_SYS_COLS - DATA_TRX_ID);
+ ut_ad(old_table->cols[old_trx_id_col].mtype == DATA_SYS);
+ ut_ad(old_table->cols[old_trx_id_col].prtype
+ == (DATA_TRX_ID | DATA_NOT_NULL));
+ ut_ad(old_table->cols[old_trx_id_col + 1].mtype == DATA_SYS);
+ ut_ad(old_table->cols[old_trx_id_col + 1].prtype
+ == (DATA_ROLL_PTR | DATA_NOT_NULL));
+ const ulint new_trx_id_col = col_map
+ ? col_map[old_trx_id_col] : old_trx_id_col;
+
+ btr_pcur_open_at_index_side(
+ true, clust_index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+ mtr_started = true;
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ if (rec_is_metadata(btr_pcur_get_rec(&pcur), *clust_index)) {
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+ /* Skip the metadata pseudo-record. */
+ } else {
+ ut_ad(!clust_index->is_instant());
+ btr_pcur_move_to_prev_on_page(&pcur);
+ }
+
+ if (old_table != new_table) {
+ /* The table is being rebuilt. Identify the columns
+ that were flagged NOT NULL in the new table, so that
+ we can quickly check that the records in the old table
+ do not violate the added NOT NULL constraints. */
+
+ nonnull = static_cast<ulint*>(
+ ut_malloc_nokey(dict_table_get_n_cols(new_table)
+ * sizeof *nonnull));
+
+ for (ulint i = 0; i < dict_table_get_n_cols(old_table); i++) {
+ if (dict_table_get_nth_col(old_table, i)->prtype
+ & DATA_NOT_NULL) {
+ continue;
+ }
+
+ const ulint j = col_map[i];
+
+ if (j == ULINT_UNDEFINED) {
+ /* The column was dropped. */
+ continue;
+ }
+
+ if (dict_table_get_nth_col(new_table, j)->prtype
+ & DATA_NOT_NULL) {
+ nonnull[n_nonnull++] = j;
+ }
+ }
+
+ if (!n_nonnull) {
+ ut_free(nonnull);
+ nonnull = NULL;
+ }
+ }
+
+ row_heap = mem_heap_create(sizeof(mrec_buf_t));
+
+ if (dict_table_is_comp(old_table)
+ && !dict_table_is_comp(new_table)) {
+ conv_heap = mem_heap_create(sizeof(mrec_buf_t));
+ }
+
+ if (skip_pk_sort) {
+ prev_fields = static_cast<dfield_t*>(
+ ut_malloc_nokey(n_uniq * sizeof *prev_fields));
+ mtuple_heap = mem_heap_create(sizeof(mrec_buf_t));
+ } else {
+ prev_fields = NULL;
+ }
+
+ mach_write_to_8(new_sys_trx_start, trx->id);
+ mach_write_to_8(new_sys_trx_end, TRX_ID_MAX);
+ uint64_t n_rows = 0;
+
+ /* Scan the clustered index. */
+ for (;;) {
+ /* Do not continue if table pages are still encrypted */
+ if (!old_table->is_readable() || !new_table->is_readable()) {
+ err = DB_DECRYPTION_FAILED;
+ trx->error_key_num = 0;
+ goto func_exit;
+ }
+
+ const rec_t* rec;
+ trx_id_t rec_trx_id;
+ rec_offs* offsets;
+ dtuple_t* row;
+ row_ext_t* ext;
+ page_cur_t* cur = btr_pcur_get_page_cur(&pcur);
+
+ mem_heap_empty(row_heap);
+
+ page_cur_move_to_next(cur);
+
+ stage->n_pk_recs_inc();
+
+ if (page_cur_is_after_last(cur)) {
+
+ stage->inc();
+
+ if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+ err = DB_INTERRUPTED;
+ trx->error_key_num = 0;
+ goto func_exit;
+ }
+
+ if (online && old_table != new_table) {
+ err = row_log_table_get_error(clust_index);
+ if (err != DB_SUCCESS) {
+ trx->error_key_num = 0;
+ goto func_exit;
+ }
+ }
+
+ /* Insert the cached spatial index rows. */
+ err = row_merge_spatial_rows(
+ trx->id, sp_tuples, num_spatial,
+ row_heap, sp_heap, &pcur, mtr_started, &mtr);
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (!mtr_started) {
+ goto scan_next;
+ }
+
+ if (clust_index->lock.waiters) {
+ /* There are waiters on the clustered
+ index tree lock, likely the purge
+ thread. Store and restore the cursor
+ position, and yield so that scanning a
+ large table will not starve other
+ threads. */
+
+ /* Store the cursor position on the last user
+ record on the page. */
+ btr_pcur_move_to_prev_on_page(&pcur);
+ /* Leaf pages must never be empty, unless
+ this is the only page in the index tree. */
+ ut_ad(btr_pcur_is_on_user_rec(&pcur)
+ || btr_pcur_get_block(
+ &pcur)->page.id().page_no()
+ == clust_index->page);
+
+ btr_pcur_store_position(&pcur, &mtr);
+ mtr.commit();
+ mtr_started = false;
+
+ /* Give the waiters a chance to proceed. */
+ os_thread_yield();
+scan_next:
+ ut_ad(!mtr_started);
+ ut_ad(!mtr.is_active());
+ mtr.start();
+ mtr_started = true;
+ /* Restore position on the record, or its
+ predecessor if the record was purged
+ meanwhile. */
+ btr_pcur_restore_position(
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ /* Move to the successor of the
+ original record. */
+ if (!btr_pcur_move_to_next_user_rec(
+ &pcur, &mtr)) {
+end_of_index:
+ row = NULL;
+ mtr.commit();
+ mtr_started = false;
+ mem_heap_free(row_heap);
+ row_heap = NULL;
+ ut_free(nonnull);
+ nonnull = NULL;
+ goto write_buffers;
+ }
+ } else {
+ uint32_t next_page_no = btr_page_get_next(
+ page_cur_get_page(cur));
+
+ if (next_page_no == FIL_NULL) {
+ goto end_of_index;
+ }
+
+ buf_block_t* block = btr_block_get(
+ *clust_index, next_page_no,
+ RW_S_LATCH, false, &mtr);
+
+ btr_leaf_page_release(page_cur_get_block(cur),
+ BTR_SEARCH_LEAF, &mtr);
+ page_cur_set_before_first(block, cur);
+ page_cur_move_to_next(cur);
+
+ ut_ad(!page_cur_is_after_last(cur));
+ }
+ }
+
+ rec = page_cur_get_rec(cur);
+
+ if (online) {
+ offsets = rec_get_offsets(rec, clust_index, NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &row_heap);
+ rec_trx_id = row_get_rec_trx_id(rec, clust_index,
+ offsets);
+
+ /* Perform a REPEATABLE READ.
+
+ When rebuilding the table online,
+ row_log_table_apply() must not see a newer
+ state of the table when applying the log.
+ This is mainly to prevent false duplicate key
+ errors, because the log will identify records
+ by the PRIMARY KEY, and also to prevent unsafe
+ BLOB access.
+
+ When creating a secondary index online, this
+ table scan must not see records that have only
+ been inserted to the clustered index, but have
+ not been written to the online_log of
+ index[]. If we performed READ UNCOMMITTED, it
+ could happen that the ADD INDEX reaches
+ ONLINE_INDEX_COMPLETE state between the time
+ the DML thread has updated the clustered index
+ but has not yet accessed secondary index. */
+ ut_ad(trx->read_view.is_open());
+ ut_ad(rec_trx_id != trx->id);
+
+ if (!trx->read_view.changes_visible(
+ rec_trx_id, old_table->name)) {
+ rec_t* old_vers;
+
+ row_vers_build_for_consistent_read(
+ rec, &mtr, clust_index, &offsets,
+ &trx->read_view, &row_heap,
+ row_heap, &old_vers, NULL);
+
+ if (!old_vers) {
+ continue;
+ }
+
+ /* The old version must necessarily be
+ in the "prehistory", because the
+ exclusive lock in
+ ha_innobase::prepare_inplace_alter_table()
+ forced the completion of any transactions
+ that accessed this table. */
+ ut_ad(row_get_rec_trx_id(old_vers, clust_index,
+ offsets) < trx->id);
+
+ rec = old_vers;
+ rec_trx_id = 0;
+ }
+
+ if (rec_get_deleted_flag(
+ rec,
+ dict_table_is_comp(old_table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record.
+ Above, we did reset rec_trx_id = 0
+ for rec = old_vers.*/
+ ut_ad(rec == page_cur_get_rec(cur)
+ ? rec_trx_id
+ : !rec_trx_id);
+ /* This record was deleted in the latest
+ committed version, or it was deleted and
+ then reinserted-by-update before purge
+ kicked in. Skip it. */
+ continue;
+ }
+
+ ut_ad(!rec_offs_any_null_extern(rec, offsets));
+ } else if (rec_get_deleted_flag(
+ rec, dict_table_is_comp(old_table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_d(rec_trx_id = rec_get_trx_id(rec, clust_index));
+ ut_ad(rec_trx_id);
+ /* This must be a purgeable delete-marked record,
+ and the transaction that delete-marked the record
+ must have been committed before this
+ !online ALTER TABLE transaction. */
+ ut_ad(rec_trx_id < trx->id);
+ /* Skip delete-marked records.
+
+ Skipping delete-marked records will make the
+			created indexes unusable for transactions
+ whose read views were created before the index
+ creation completed, but an attempt to preserve
+ the history would make it tricky to detect
+ duplicate keys. */
+ continue;
+ } else {
+ offsets = rec_get_offsets(rec, clust_index, NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &row_heap);
+ /* This is a locking ALTER TABLE.
+
+ If we are not rebuilding the table, the
+ DB_TRX_ID does not matter, as it is not being
+ written to any secondary indexes; see
+ if (old_table == new_table) below.
+
+ If we are rebuilding the table, the
+ DB_TRX_ID,DB_ROLL_PTR should be reset, because
+ there will be no history available. */
+ ut_ad(rec_get_trx_id(rec, clust_index) < trx->id);
+ rec_trx_id = 0;
+ }
+
+ /* When !online, we are holding a lock on old_table, preventing
+ any inserts that could have written a record 'stub' before
+ writing out off-page columns. */
+ ut_ad(!rec_offs_any_null_extern(rec, offsets));
+
+ /* Build a row based on the clustered index. */
+
+ row = row_build_w_add_vcol(ROW_COPY_POINTERS, clust_index,
+ rec, offsets, new_table,
+ defaults, add_v, col_map, &ext,
+ row_heap);
+ ut_ad(row);
+
+ for (ulint i = 0; i < n_nonnull; i++) {
+ dfield_t* field = &row->fields[nonnull[i]];
+
+ ut_ad(dfield_get_type(field)->prtype & DATA_NOT_NULL);
+
+ if (dfield_is_null(field)) {
+
+ Field* null_field =
+ table->field[nonnull[i]];
+
+ null_field->set_warning(
+ Sql_condition::WARN_LEVEL_WARN,
+ WARN_DATA_TRUNCATED, 1,
+ ulong(n_rows + 1));
+
+ if (!allow_not_null) {
+ err = DB_INVALID_NULL;
+ trx->error_key_num = 0;
+ goto func_exit;
+ }
+
+ const dfield_t& default_field
+ = defaults->fields[nonnull[i]];
+
+ *field = default_field;
+ }
+ }
+
+ /* Get the next Doc ID */
+ if (add_doc_id) {
+ doc_id++;
+ } else {
+ doc_id = 0;
+ }
+
+ ut_ad(row->fields[new_trx_id_col].type.mtype == DATA_SYS);
+ ut_ad(row->fields[new_trx_id_col].type.prtype
+ == (DATA_TRX_ID | DATA_NOT_NULL));
+ ut_ad(row->fields[new_trx_id_col].len == DATA_TRX_ID_LEN);
+ ut_ad(row->fields[new_trx_id_col + 1].type.mtype == DATA_SYS);
+ ut_ad(row->fields[new_trx_id_col + 1].type.prtype
+ == (DATA_ROLL_PTR | DATA_NOT_NULL));
+ ut_ad(row->fields[new_trx_id_col + 1].len == DATA_ROLL_PTR_LEN);
+
+ if (old_table == new_table) {
+ /* Do not bother touching DB_TRX_ID,DB_ROLL_PTR
+ because they are not going to be written into
+ secondary indexes. */
+ } else if (rec_trx_id < trx->id) {
+ /* Reset the DB_TRX_ID,DB_ROLL_PTR of old rows
+ for which history is not going to be
+ available after the rebuild operation.
+ This essentially mimics row_purge_reset_trx_id(). */
+ row->fields[new_trx_id_col].data
+ = const_cast<byte*>(reset_trx_id);
+ row->fields[new_trx_id_col + 1].data
+ = const_cast<byte*>(reset_trx_id
+ + DATA_TRX_ID_LEN);
+ }
+
+ if (add_autoinc != ULINT_UNDEFINED) {
+
+ ut_ad(add_autoinc
+ < dict_table_get_n_user_cols(new_table));
+
+ bool history_row = false;
+ if (new_table->versioned()) {
+ const dfield_t* dfield = dtuple_get_nth_field(
+ row, new_table->vers_end);
+ history_row = dfield->vers_history_row();
+ }
+
+ dfield_t* dfield = dtuple_get_nth_field(row,
+ add_autoinc);
+
+ if (new_table->versioned()) {
+ if (history_row) {
+ if (dfield_get_type(dfield)->prtype & DATA_NOT_NULL) {
+ err = DB_UNSUPPORTED;
+ my_error(ER_UNSUPPORTED_EXTENSION, MYF(0),
+ old_table->name.m_name);
+ goto func_exit;
+ }
+ dfield_set_null(dfield);
+ } else {
+ // set not null
+ ulint len = dfield_get_type(dfield)->len;
+ dfield_set_data(dfield, any_autoinc_data, len);
+ }
+ }
+
+ if (dfield_is_null(dfield)) {
+ goto write_buffers;
+ }
+
+ const dtype_t* dtype = dfield_get_type(dfield);
+ byte* b = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (sequence.eof()) {
+ err = DB_ERROR;
+ trx->error_key_num = 0;
+
+ ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_AUTOINC_READ_FAILED, "[NULL]");
+
+ goto func_exit;
+ }
+
+ ulonglong value = sequence++;
+
+ switch (dtype_get_mtype(dtype)) {
+ case DATA_INT: {
+ ibool usign;
+ ulint len = dfield_get_len(dfield);
+
+ usign = dtype_get_prtype(dtype) & DATA_UNSIGNED;
+ mach_write_ulonglong(b, value, len, usign);
+
+ break;
+ }
+
+ case DATA_FLOAT:
+ mach_float_write(
+ b, static_cast<float>(value));
+ break;
+
+ case DATA_DOUBLE:
+ mach_double_write(
+ b, static_cast<double>(value));
+ break;
+
+ default:
+ ut_ad(0);
+ }
+ }
+
+ if (old_table->versioned()) {
+ if (!new_table->versioned()
+ && clust_index->vers_history_row(rec, offsets)) {
+ continue;
+ }
+ } else if (new_table->versioned()) {
+ dfield_t* start =
+ dtuple_get_nth_field(row, new_table->vers_start);
+ dfield_t* end =
+ dtuple_get_nth_field(row, new_table->vers_end);
+ dfield_set_data(start, new_sys_trx_start, 8);
+ dfield_set_data(end, new_sys_trx_end, 8);
+ vers_update_trt = true;
+ }
+
+write_buffers:
+ /* Build all entries for all the indexes to be created
+ in a single scan of the clustered index. */
+
+ n_rows++;
+ ulint s_idx_cnt = 0;
+ bool skip_sort = skip_pk_sort
+ && dict_index_is_clust(merge_buf[0]->index);
+
+ for (ulint k = 0, i = 0; i < n_index; i++, skip_sort = false) {
+ row_merge_buf_t* buf = merge_buf[i];
+ ulint rows_added = 0;
+
+ if (dict_index_is_spatial(buf->index)) {
+ if (!row) {
+ continue;
+ }
+
+ ut_ad(sp_tuples[s_idx_cnt]->get_index()
+ == buf->index);
+
+ /* If the geometry field is invalid, report
+ error. */
+ if (!row_geo_field_is_valid(row, buf->index)) {
+ err = DB_CANT_CREATE_GEOMETRY_OBJECT;
+ break;
+ }
+
+ sp_tuples[s_idx_cnt]->add(row, ext);
+ s_idx_cnt++;
+
+ continue;
+ }
+
+ ut_ad(!row
+ || !dict_index_is_clust(buf->index)
+ || trx_id_check(row->fields[new_trx_id_col].data,
+ trx->id));
+
+ merge_file_t* file = &files[k++];
+
+ if (UNIV_LIKELY
+ (row && (rows_added = row_merge_buf_add(
+ buf, fts_index, old_table, new_table,
+ psort_info, row, ext, &doc_id,
+ conv_heap, &err,
+ &v_heap, eval_table, trx)))) {
+
+ /* If we are creating FTS index,
+ a single row can generate more
+ records for tokenized word */
+ file->n_rec += rows_added;
+
+ if (err != DB_SUCCESS) {
+ ut_ad(err == DB_TOO_BIG_RECORD);
+ break;
+ }
+
+ if (doc_id > max_doc_id) {
+ max_doc_id = doc_id;
+ }
+
+ if (buf->index->type & DICT_FTS) {
+ /* Check if error occurs in child thread */
+ for (ulint j = 0;
+ j < fts_sort_pll_degree; j++) {
+ if (psort_info[j].error
+ != DB_SUCCESS) {
+ err = psort_info[j].error;
+ trx->error_key_num = i;
+ break;
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ }
+
+ if (skip_sort) {
+ ut_ad(buf->n_tuples > 0);
+ const mtuple_t* curr =
+ &buf->tuples[buf->n_tuples - 1];
+
+ ut_ad(i == 0);
+ ut_ad(dict_index_is_clust(merge_buf[0]->index));
+ /* Detect duplicates by comparing the
+ current record with previous record.
+ When temp file is not used, records
+ should be in sorted order. */
+ if (prev_mtuple.fields != NULL
+ && (row_mtuple_cmp(
+ &prev_mtuple, curr,
+ &clust_dup) == 0)) {
+
+ err = DB_DUPLICATE_KEY;
+ trx->error_key_num
+ = key_numbers[0];
+ goto func_exit;
+ }
+
+ prev_mtuple.fields = curr->fields;
+ }
+
+ continue;
+ }
+
+ if (err == DB_COMPUTE_VALUE_FAILED) {
+ trx->error_key_num = i;
+ goto func_exit;
+ }
+
+ if (buf->index->type & DICT_FTS) {
+ if (!row || !doc_id) {
+ continue;
+ }
+ }
+
+ /* The buffer must be sufficiently large
+ to hold at least one record. It may only
+ be empty when we reach the end of the
+ clustered index. row_merge_buf_add()
+ must not have been called in this loop. */
+ ut_ad(buf->n_tuples || row == NULL);
+
+ /* We have enough data tuples to form a block.
+ Sort them and write to disk if temp file is used
+ or insert into index if temp file is not used. */
+ ut_ad(old_table == new_table
+ ? !dict_index_is_clust(buf->index)
+ : (i == 0) == dict_index_is_clust(buf->index));
+
+ /* We have enough data tuples to form a block.
+ Sort them (if !skip_sort) and write to disk. */
+
+ if (buf->n_tuples) {
+ if (skip_sort) {
+					/* The temporary file is not used,
+					so insert the sorted block into
+					the index. */
+ if (row != NULL) {
+						/* We have to insert the
+						cached spatial index rows
+						first, since after the
+						mtr.commit() the clustered
+						index page could be updated,
+						and the data in the cached
+						rows would become invalid. */
+ err = row_merge_spatial_rows(
+ trx->id, sp_tuples,
+ num_spatial,
+ row_heap, sp_heap,
+ &pcur, mtr_started,
+ &mtr);
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ /* We are not at the end of
+ the scan yet. We must
+ mtr.commit() in order to be
+ able to call log_free_check()
+ in row_merge_insert_index_tuples().
+ Due to mtr.commit(), the
+ current row will be invalid, and
+ we must reread it on the next
+ loop iteration. */
+ if (mtr_started) {
+ btr_pcur_move_to_prev_on_page(
+ &pcur);
+ btr_pcur_store_position(
+ &pcur, &mtr);
+
+ mtr.commit();
+ mtr_started = false;
+ }
+ }
+
+ mem_heap_empty(mtuple_heap);
+ prev_mtuple.fields = prev_fields;
+
+ row_mtuple_create(
+ &buf->tuples[buf->n_tuples - 1],
+ &prev_mtuple, n_uniq,
+ mtuple_heap);
+
+ if (clust_btr_bulk == NULL) {
+ clust_btr_bulk = UT_NEW_NOKEY(
+ BtrBulk(index[i],
+ trx));
+ } else {
+ clust_btr_bulk->latch();
+ }
+
+ err = row_merge_insert_index_tuples(
+ index[i], old_table,
+ OS_FILE_CLOSED, NULL, buf,
+ clust_btr_bulk,
+ table_total_rows,
+ curr_progress,
+ pct_cost,
+ crypt_block,
+ new_table->space_id);
+
+ if (row == NULL) {
+ err = clust_btr_bulk->finish(
+ err);
+ UT_DELETE(clust_btr_bulk);
+ clust_btr_bulk = NULL;
+ } else {
+					/* Release latches for a possible
+					log_free_check() in the spatial
+					index build. */
+ clust_btr_bulk->release();
+ }
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ if (row != NULL) {
+ /* Restore the cursor on the
+ previous clustered index record,
+ and empty the buffer. The next
+ iteration of the outer loop will
+ advance the cursor and read the
+ next record (the one which we
+ had to ignore due to the buffer
+ overflow). */
+ mtr.start();
+ mtr_started = true;
+ btr_pcur_restore_position(
+ BTR_SEARCH_LEAF, &pcur,
+ &mtr);
+ buf = row_merge_buf_empty(buf);
+ merge_buf[i] = buf;
+ /* Restart the outer loop on the
+ record. We did not insert it
+ into any index yet. */
+ ut_ad(i == 0);
+ break;
+ }
+ } else if (dict_index_is_unique(buf->index)) {
+ row_merge_dup_t dup = {
+ buf->index, table, col_map, 0};
+
+ row_merge_buf_sort(buf, &dup);
+
+ if (dup.n_dup) {
+ err = DB_DUPLICATE_KEY;
+ trx->error_key_num
+ = key_numbers[i];
+ break;
+ }
+ } else {
+ row_merge_buf_sort(buf, NULL);
+ }
+ } else if (online && new_table == old_table) {
+ /* Note the newest transaction that
+ modified this index when the scan was
+ completed. We prevent older readers
+ from accessing this index, to ensure
+ read consistency. */
+
+ trx_id_t max_trx_id;
+
+ ut_a(row == NULL);
+ rw_lock_x_lock(
+ dict_index_get_lock(buf->index));
+ ut_a(dict_index_get_online_status(buf->index)
+ == ONLINE_INDEX_CREATION);
+
+ max_trx_id = row_log_get_max_trx(buf->index);
+
+ if (max_trx_id > buf->index->trx_id) {
+ buf->index->trx_id = max_trx_id;
+ }
+
+ rw_lock_x_unlock(
+ dict_index_get_lock(buf->index));
+ }
+
+ /* Secondary index and clustered index which is
+ not in sorted order can use the temporary file.
+ Fulltext index should not use the temporary file. */
+ if (!skip_sort && !(buf->index->type & DICT_FTS)) {
+				/* If all rows fit in the sort buffer,
+				we can insert directly into the index
+				without a temporary file, provided that
+				the clustered index does not use a
+				temporary file either. */
+ if (row == NULL && file->fd == OS_FILE_CLOSED
+ && !clust_temp_file) {
+ DBUG_EXECUTE_IF(
+ "row_merge_write_failure",
+ err = DB_TEMP_FILE_WRITE_FAIL;
+ trx->error_key_num = i;
+ goto all_done;);
+
+ DBUG_EXECUTE_IF(
+ "row_merge_tmpfile_fail",
+ err = DB_OUT_OF_MEMORY;
+ trx->error_key_num = i;
+ goto all_done;);
+
+ BtrBulk btr_bulk(index[i], trx);
+
+ err = row_merge_insert_index_tuples(
+ index[i], old_table,
+ OS_FILE_CLOSED, NULL, buf,
+ &btr_bulk,
+ table_total_rows,
+ curr_progress,
+ pct_cost,
+ crypt_block,
+ new_table->space_id);
+
+ err = btr_bulk.finish(err);
+
+ DBUG_EXECUTE_IF(
+ "row_merge_insert_big_row",
+ err = DB_TOO_BIG_RECORD;);
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ } else {
+ if (!row_merge_file_create_if_needed(
+ file, tmpfd,
+ buf->n_tuples, path)) {
+ err = DB_OUT_OF_MEMORY;
+ trx->error_key_num = i;
+ break;
+ }
+
+ /* Ensure that duplicates in the
+ clustered index will be detected before
+ inserting secondary index records. */
+ if (dict_index_is_clust(buf->index)) {
+ clust_temp_file = true;
+ }
+
+ ut_ad(file->n_rec > 0);
+
+ row_merge_buf_write(buf, file, block);
+
+ if (!row_merge_write(
+ file->fd, file->offset++,
+ block, crypt_block,
+ new_table->space_id)) {
+ err = DB_TEMP_FILE_WRITE_FAIL;
+ trx->error_key_num = i;
+ break;
+ }
+
+ MEM_UNDEFINED(
+ &block[0], srv_sort_buf_size);
+ }
+ }
+ merge_buf[i] = row_merge_buf_empty(buf);
+ buf = merge_buf[i];
+
+ if (UNIV_LIKELY(row != NULL)) {
+ /* Try writing the record again, now
+ that the buffer has been written out
+ and emptied. */
+
+ if (UNIV_UNLIKELY
+ (!(rows_added = row_merge_buf_add(
+ buf, fts_index, old_table,
+ new_table, psort_info, row, ext,
+ &doc_id, conv_heap,
+ &err, &v_heap, eval_table, trx)))) {
+ /* An empty buffer should have enough
+ room for at least one record. */
+ ut_ad(err == DB_COMPUTE_VALUE_FAILED
+ || err == DB_OUT_OF_MEMORY
+ || err == DB_TOO_BIG_RECORD);
+ } else if (err == DB_SUCCESS) {
+ file->n_rec += rows_added;
+ continue;
+ }
+
+ trx->error_key_num = i;
+ break;
+ }
+ }
+
+ if (row == NULL) {
+ if (old_table != new_table) {
+ new_table->stat_n_rows = n_rows;
+ }
+
+ goto all_done;
+ }
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (v_heap) {
+ mem_heap_empty(v_heap);
+ }
+
+ /* Increment innodb_onlineddl_pct_progress status variable */
+ read_rows++;
+		if (read_rows % 1000 == 0) {
+ /* Update progress for each 1000 rows */
+ curr_progress = (read_rows >= table_total_rows) ?
+ pct_cost :
+ pct_cost * static_cast<double>(read_rows)
+ / static_cast<double>(table_total_rows);
+ /* presenting 10.12% as 1012 integer */
+ onlineddl_pct_progress = (ulint) (curr_progress * 100);
+ }
+ }
+
+func_exit:
+ ut_ad(mtr_started == mtr.is_active());
+ if (mtr_started) {
+ mtr.commit();
+ }
+ if (row_heap) {
+ mem_heap_free(row_heap);
+ }
+ ut_free(nonnull);
+
+all_done:
+ if (clust_btr_bulk != NULL) {
+ ut_ad(err != DB_SUCCESS);
+ clust_btr_bulk->latch();
+ err = clust_btr_bulk->finish(
+ err);
+ UT_DELETE(clust_btr_bulk);
+ }
+
+ if (prev_fields != NULL) {
+ ut_free(prev_fields);
+ mem_heap_free(mtuple_heap);
+ }
+
+ if (v_heap) {
+ mem_heap_free(v_heap);
+ }
+
+ if (conv_heap != NULL) {
+ mem_heap_free(conv_heap);
+ }
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Scan Table\n");
+#endif
+ if (fts_pll_sort) {
+wait_again:
+ /* Check if error occurs in child thread */
+ for (ulint j = 0; j < fts_sort_pll_degree; j++) {
+ if (psort_info[j].error != DB_SUCCESS) {
+ err = psort_info[j].error;
+ trx->error_key_num = j;
+ break;
+ }
+ }
+
+ /* Tell all children that parent has done scanning */
+ for (ulint i = 0; i < fts_sort_pll_degree; i++) {
+ if (err == DB_SUCCESS) {
+ psort_info[i].state = FTS_PARENT_COMPLETE;
+ } else {
+ psort_info[i].state = FTS_PARENT_EXITING;
+ }
+ }
+
+ /* Now wait all children to report back to be completed */
+ os_event_wait_time_low(fts_parallel_sort_event,
+ 1000000, sig_count);
+
+ for (ulint i = 0; i < fts_sort_pll_degree; i++) {
+ if (psort_info[i].child_status != FTS_CHILD_COMPLETE
+ && psort_info[i].child_status != FTS_CHILD_EXITING) {
+ sig_count = os_event_reset(
+ fts_parallel_sort_event);
+ goto wait_again;
+ }
+ }
+
+ for (ulint j = 0; j < fts_sort_pll_degree; j++) {
+ psort_info[j].task->wait();
+ delete psort_info[j].task;
+ }
+ }
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Tokenization\n");
+#endif
+ for (ulint i = 0; i < n_index; i++) {
+ row_merge_buf_free(merge_buf[i]);
+ }
+
+ row_fts_free_pll_merge_buf(psort_info);
+
+ ut_free(merge_buf);
+
+ btr_pcur_close(&pcur);
+
+ if (sp_tuples != NULL) {
+ for (ulint i = 0; i < num_spatial; i++) {
+ UT_DELETE(sp_tuples[i]);
+ }
+ ut_free(sp_tuples);
+
+ if (sp_heap) {
+ mem_heap_free(sp_heap);
+ }
+ }
+
+ /* Update the next Doc ID we used. Table should be locked, so
+ no concurrent DML */
+ if (max_doc_id && err == DB_SUCCESS) {
+ /* Sync fts cache for other fts indexes to keep all
+ fts indexes consistent in sync_doc_id. */
+ err = fts_sync_table(const_cast<dict_table_t*>(new_table));
+
+ if (err == DB_SUCCESS) {
+ fts_update_next_doc_id(NULL, new_table, max_doc_id);
+ }
+ }
+
+ if (vers_update_trt) {
+ trx_mod_table_time_t& time =
+ trx->mod_tables
+ .insert(trx_mod_tables_t::value_type(
+ const_cast<dict_table_t*>(new_table), 0))
+ .first->second;
+ time.set_versioned(0);
+ }
+
+ trx->op_info = "";
+
+ DBUG_RETURN(err);
+}
+
+/** Write a record via buffer 2 and read the next record to buffer N.
+@param N number of the buffer (0 or 1)
+@param INDEX record descriptor
+@param AT_END statement to execute at end of input */
+#define ROW_MERGE_WRITE_GET_NEXT_LOW(N, INDEX, AT_END) \
+ do { \
+ b2 = row_merge_write_rec(&block[2 * srv_sort_buf_size], \
+ &buf[2], b2, \
+ of->fd, &of->offset, \
+ mrec##N, offsets##N, \
+			 crypt_block ? &crypt_block[2 * srv_sort_buf_size] : NULL, \
+ space); \
+ if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) { \
+ goto corrupt; \
+ } \
+ b##N = row_merge_read_rec(&block[N * srv_sort_buf_size],\
+ &buf[N], b##N, INDEX, \
+ file->fd, foffs##N, \
+ &mrec##N, offsets##N, \
+ crypt_block ? &crypt_block[N * srv_sort_buf_size] : NULL, \
+ space); \
+ \
+ if (UNIV_UNLIKELY(!b##N)) { \
+ if (mrec##N) { \
+ goto corrupt; \
+ } \
+ AT_END; \
+ } \
+ } while (0)
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END) \
+ do { \
+ if (stage != NULL) { \
+ stage->inc(); \
+ } \
+ ROW_MERGE_WRITE_GET_NEXT_LOW(N, INDEX, AT_END); \
+ } while (0)
+#else /* HAVE_PSI_STAGE_INTERFACE */
+#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END) \
+ ROW_MERGE_WRITE_GET_NEXT_LOW(N, INDEX, AT_END)
+#endif /* HAVE_PSI_STAGE_INTERFACE */
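+
+/* Buffer layout assumed by the macros above (illustrative): block[]
+holds three buffers of srv_sort_buf_size bytes each; the two input runs
+are read into the first two, and the merged output is assembled in the
+third:
+
+	&block[0]			input run 0 (b0, mrec0)
+	&block[srv_sort_buf_size]	input run 1 (b1, mrec1)
+	&block[2 * srv_sort_buf_size]	output (b2)
+
+When crypt_block is non-NULL, it mirrors the same three-buffer layout
+for encryption. */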
+
+/** Merge two blocks of records on disk and write a bigger block.
+@param[in] dup descriptor of index being created
+@param[in] file file containing index entries
+@param[in,out] block 3 buffers
+@param[in,out] foffs0 offset of first source list in the file
+@param[in,out] foffs1 offset of second source list in the file
+@param[in,out] of output file
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL stage->inc() will be called for each record
+processed.
+@param[in,out] crypt_block encryption buffer
+@param[in] space tablespace ID for encryption
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_merge_blocks(
+ const row_merge_dup_t* dup,
+ const merge_file_t* file,
+ row_merge_block_t* block,
+ ulint* foffs0,
+ ulint* foffs1,
+ merge_file_t* of,
+ ut_stage_alter_t* stage MY_ATTRIBUTE((unused)),
+ row_merge_block_t* crypt_block,
+ ulint space)
+{
+ mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
+
+ mrec_buf_t* buf; /*!< buffer for handling
+ split mrec in block[] */
+ const byte* b0; /*!< pointer to block[0] */
+ const byte* b1; /*!< pointer to block[srv_sort_buf_size] */
+ byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */
+ const mrec_t* mrec0; /*!< merge rec, points to block[0] or buf[0] */
+ const mrec_t* mrec1; /*!< merge rec, points to
+ block[srv_sort_buf_size] or buf[1] */
+ rec_offs* offsets0;/* offsets of mrec0 */
+ rec_offs* offsets1;/* offsets of mrec1 */
+
+ DBUG_ENTER("row_merge_blocks");
+ DBUG_LOG("ib_merge_sort",
+ "fd=" << file->fd << ',' << *foffs0 << '+' << *foffs1
+ << " to fd=" << of->fd << ',' << of->offset);
+
+ heap = row_merge_heap_create(dup->index, &buf, &offsets0, &offsets1);
+
+ /* Write a record and read the next record. Split the output
+ file in two halves, which can be merged on the following pass. */
+
+ if (!row_merge_read(file->fd, *foffs0, &block[0],
+ crypt_block ? &crypt_block[0] : NULL,
+ space) ||
+ !row_merge_read(file->fd, *foffs1, &block[srv_sort_buf_size],
+ crypt_block ? &crypt_block[srv_sort_buf_size] : NULL,
+ space)) {
+corrupt:
+ mem_heap_free(heap);
+ DBUG_RETURN(DB_CORRUPTION);
+ }
+
+ b0 = &block[0];
+ b1 = &block[srv_sort_buf_size];
+ b2 = &block[2 * srv_sort_buf_size];
+
+ b0 = row_merge_read_rec(
+ &block[0], &buf[0], b0, dup->index,
+ file->fd, foffs0, &mrec0, offsets0,
+ crypt_block ? &crypt_block[0] : NULL,
+ space);
+
+ b1 = row_merge_read_rec(
+ &block[srv_sort_buf_size],
+ &buf[srv_sort_buf_size], b1, dup->index,
+ file->fd, foffs1, &mrec1, offsets1,
+ crypt_block ? &crypt_block[srv_sort_buf_size] : NULL,
+ space);
+
+ if (UNIV_UNLIKELY(!b0 && mrec0)
+ || UNIV_UNLIKELY(!b1 && mrec1)) {
+
+ goto corrupt;
+ }
+
+ while (mrec0 && mrec1) {
+ int cmp = cmp_rec_rec_simple(
+ mrec0, mrec1, offsets0, offsets1,
+ dup->index, dup->table);
+ if (cmp < 0) {
+ ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto merged);
+ } else if (cmp) {
+ ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto merged);
+ } else {
+ mem_heap_free(heap);
+ DBUG_RETURN(DB_DUPLICATE_KEY);
+ }
+ }
+
+merged:
+ if (mrec0) {
+ /* append all mrec0 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto done0);
+ }
+ }
+done0:
+ if (mrec1) {
+ /* append all mrec1 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto done1);
+ }
+ }
+done1:
+
+ mem_heap_free(heap);
+
+ b2 = row_merge_write_eof(
+ &block[2 * srv_sort_buf_size],
+ b2, of->fd, &of->offset,
+ crypt_block ? &crypt_block[2 * srv_sort_buf_size] : NULL,
+ space);
+ DBUG_RETURN(b2 ? DB_SUCCESS : DB_CORRUPTION);
+}
+
+/** Copy a block of index entries.
+@param[in] index index being created
+@param[in] file input file
+@param[in,out] block 3 buffers
+@param[in,out] foffs0 input file offset
+@param[in,out] of output file
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL stage->inc() will be called for each record
+processed.
+@param[in,out] crypt_block encryption buffer
+@param[in] space tablespace ID for encryption
+@return TRUE on success, FALSE on failure */
+static MY_ATTRIBUTE((warn_unused_result))
+ibool
+row_merge_blocks_copy(
+ const dict_index_t* index,
+ const merge_file_t* file,
+ row_merge_block_t* block,
+ ulint* foffs0,
+ merge_file_t* of,
+ ut_stage_alter_t* stage MY_ATTRIBUTE((unused)),
+ row_merge_block_t* crypt_block,
+ ulint space)
+{
+ mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
+
+ mrec_buf_t* buf; /*!< buffer for handling
+ split mrec in block[] */
+ const byte* b0; /*!< pointer to block[0] */
+ byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */
+ const mrec_t* mrec0; /*!< merge rec, points to block[0] */
+ rec_offs* offsets0;/* offsets of mrec0 */
+ rec_offs* offsets1;/* dummy offsets */
+
+ DBUG_ENTER("row_merge_blocks_copy");
+ DBUG_LOG("ib_merge_sort",
+ "fd=" << file->fd << ',' << foffs0
+ << " to fd=" << of->fd << ',' << of->offset);
+
+ heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
+
+ /* Write a record and read the next record. Split the output
+ file in two halves, which can be merged on the following pass. */
+
+ if (!row_merge_read(file->fd, *foffs0, &block[0],
+ crypt_block ? &crypt_block[0] : NULL,
+ space)) {
+corrupt:
+ mem_heap_free(heap);
+ DBUG_RETURN(FALSE);
+ }
+
+ b0 = &block[0];
+
+ b2 = &block[2 * srv_sort_buf_size];
+
+ b0 = row_merge_read_rec(&block[0], &buf[0], b0, index,
+ file->fd, foffs0, &mrec0, offsets0,
+ crypt_block ? &crypt_block[0] : NULL,
+ space);
+
+ if (UNIV_UNLIKELY(!b0 && mrec0)) {
+
+ goto corrupt;
+ }
+
+ if (mrec0) {
+ /* append all mrec0 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(0, index, goto done0);
+ }
+ }
+done0:
+
+ /* The file offset points to the beginning of the last page
+ that has been read. Update it to point to the next block. */
+ (*foffs0)++;
+
+ mem_heap_free(heap);
+
+ DBUG_RETURN(row_merge_write_eof(
+ &block[2 * srv_sort_buf_size],
+ b2, of->fd, &of->offset,
+ crypt_block
+ ? &crypt_block[2 * srv_sort_buf_size]
+ : NULL, space)
+ != NULL);
+}
+
+/** Merge disk files.
+@param[in] trx transaction
+@param[in] dup descriptor of index being created
+@param[in,out] file file containing index entries
+@param[in,out] block 3 buffers
+@param[in,out] tmpfd temporary file handle
+@param[in,out] num_run Number of runs that remain to be merged
+@param[in,out] run_offset Array that contains the first offset number
+for each merge run
+@param[in,out]	stage		performance schema accounting object, used by
+ALTER TABLE. If not NULL stage->inc() will be called for each record
+processed.
+@param[in,out]	crypt_block	encryption buffer
+@param[in]	space		tablespace ID for encryption
+@return DB_SUCCESS or error code */
+static
+dberr_t
+row_merge(
+ trx_t* trx,
+ const row_merge_dup_t* dup,
+ merge_file_t* file,
+ row_merge_block_t* block,
+ pfs_os_file_t* tmpfd,
+ ulint* num_run,
+ ulint* run_offset,
+ ut_stage_alter_t* stage,
+ row_merge_block_t* crypt_block,
+ ulint space)
+{
+ ulint foffs0; /*!< first input offset */
+ ulint foffs1; /*!< second input offset */
+ dberr_t error; /*!< error code */
+ merge_file_t of; /*!< output file */
+ const ulint ihalf = run_offset[*num_run / 2];
+ /*!< half the input file */
+ ulint n_run = 0;
+ /*!< num of runs generated from this merge */
+
+ MEM_CHECK_ADDRESSABLE(&block[0], 3 * srv_sort_buf_size);
+
+ if (crypt_block) {
+ MEM_CHECK_ADDRESSABLE(&crypt_block[0], 3 * srv_sort_buf_size);
+ }
+
+ ut_ad(ihalf < file->offset);
+
+ of.fd = *tmpfd;
+ of.offset = 0;
+ of.n_rec = 0;
+
+#ifdef POSIX_FADV_SEQUENTIAL
+	/* The input file will be read sequentially, starting from the
+	beginning and the middle. On Linux, POSIX_FADV_SEQUENTIAL
+	affects the entire file. Each block will be read exactly once. */
+ posix_fadvise(file->fd, 0, 0,
+ POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE);
+#endif /* POSIX_FADV_SEQUENTIAL */
+
+ /* Merge blocks to the output file. */
+ foffs0 = 0;
+ foffs1 = ihalf;
+
+ MEM_UNDEFINED(run_offset, *num_run * sizeof *run_offset);
+
+ for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) {
+
+ if (trx_is_interrupted(trx)) {
+ return(DB_INTERRUPTED);
+ }
+
+ /* Remember the offset number for this run */
+ run_offset[n_run++] = of.offset;
+
+ error = row_merge_blocks(dup, file, block,
+ &foffs0, &foffs1, &of, stage,
+ crypt_block, space);
+
+ if (error != DB_SUCCESS) {
+ return(error);
+ }
+
+ }
+
+ /* Copy the last blocks, if there are any. */
+
+ while (foffs0 < ihalf) {
+
+ if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+ return(DB_INTERRUPTED);
+ }
+
+ /* Remember the offset number for this run */
+ run_offset[n_run++] = of.offset;
+
+ if (!row_merge_blocks_copy(dup->index, file, block,
+ &foffs0, &of, stage,
+ crypt_block, space)) {
+ return(DB_CORRUPTION);
+ }
+ }
+
+ ut_ad(foffs0 == ihalf);
+
+ while (foffs1 < file->offset) {
+
+ if (trx_is_interrupted(trx)) {
+ return(DB_INTERRUPTED);
+ }
+
+ /* Remember the offset number for this run */
+ run_offset[n_run++] = of.offset;
+
+ if (!row_merge_blocks_copy(dup->index, file, block,
+ &foffs1, &of, stage,
+ crypt_block, space)) {
+ return(DB_CORRUPTION);
+ }
+ }
+
+ ut_ad(foffs1 == file->offset);
+
+ if (UNIV_UNLIKELY(of.n_rec != file->n_rec)) {
+ return(DB_CORRUPTION);
+ }
+
+ ut_ad(n_run <= *num_run);
+
+ *num_run = n_run;
+
+	/* Each run can contain one or more offsets. As the merge goes
+	on, the number of runs still to be merged decreases until only
+	a single run remains. Thus the number of runs is never larger
+	than the number of offsets in the file. */
+ ut_ad((*num_run) <= file->offset);
+
+	/* The number of offsets in the output file is never larger
+	than in the input file. */
+ ut_ad(of.offset <= file->offset);
+
+ /* Swap file descriptors for the next pass. */
+ *tmpfd = file->fd;
+ *file = of;
+
+ MEM_UNDEFINED(&block[0], 3 * srv_sort_buf_size);
+
+ return(DB_SUCCESS);
+}
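+
+/* Worked example (illustrative): with file->offset = 4 blocks, the
+first pass starts with num_runs = 4 and ihalf = run_offset[2], so
+row_merge_blocks() merges runs (0,2) and (1,3) into two longer runs;
+the second pass merges those two into a single run. In general,
+sorting n initial runs takes about ceil(log2(n)) passes, which is
+exactly the estimate row_merge_sort() below uses for its progress
+reporting. */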
+
+/** Merge disk files.
+@param[in] trx transaction
+@param[in] dup descriptor of index being created
+@param[in,out] file file containing index entries
+@param[in,out] block 3 buffers
+@param[in,out] tmpfd temporary file handle
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL, stage->begin_phase_sort() will be called initially
+and then stage->inc() will be called for each record processed.
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_sort(
+ trx_t* trx,
+ const row_merge_dup_t* dup,
+ merge_file_t* file,
+ row_merge_block_t* block,
+ pfs_os_file_t* tmpfd,
+ const bool update_progress,
+ /*!< in: update progress
+ status variable or not */
+ const double pct_progress,
+ /*!< in: total progress percent
+ until now */
+ const double pct_cost, /*!< in: current progress percent */
+ row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */
+ ulint space, /*!< in: space id */
+ ut_stage_alter_t* stage)
+{
+ const ulint half = file->offset / 2;
+ ulint num_runs;
+ ulint* run_offset;
+ dberr_t error = DB_SUCCESS;
+ ulint merge_count = 0;
+ ulint total_merge_sort_count;
+ double curr_progress = 0;
+
+ DBUG_ENTER("row_merge_sort");
+
+ /* Record the number of merge runs we need to perform */
+ num_runs = file->offset;
+
+ if (stage != NULL) {
+ stage->begin_phase_sort(log2(double(num_runs)));
+ }
+
+	/* If there is at most one run, there is nothing to merge. */
+ if (num_runs <= 1) {
+ DBUG_RETURN(error);
+ }
+
+ total_merge_sort_count = ulint(ceil(log2(double(num_runs))));
+
+ /* "run_offset" records each run's first offset number */
+ run_offset = (ulint*) ut_malloc_nokey(file->offset * sizeof(ulint));
+
+ /* This tells row_merge() where to start for the first round
+ of merge. */
+ run_offset[half] = half;
+
+ /* The file should always contain at least one byte (the end
+ of file marker). Thus, it must be at least one block. */
+ ut_ad(file->offset > 0);
+
+ /* These thd_progress* calls will crash on sol10-64 when innodb_plugin
+ is used. MDEV-9356: innodb.innodb_bug53290 fails (crashes) on
+ sol10-64 in buildbot.
+ */
+#ifndef UNIV_SOLARIS
+ /* Progress report only for "normal" indexes. */
+ if (!(dup->index->type & DICT_FTS)) {
+ thd_progress_init(trx->mysql_thd, 1);
+ }
+#endif /* UNIV_SOLARIS */
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information("InnoDB: Online DDL : merge-sorting"
+ " has estimated " ULINTPF " runs",
+ num_runs);
+ }
+
+ /* Merge the runs until we have one big run */
+ do {
+ /* Report progress of merge sort to MySQL for
+ show processlist progress field */
+ /* Progress report only for "normal" indexes. */
+#ifndef UNIV_SOLARIS
+ if (!(dup->index->type & DICT_FTS)) {
+ thd_progress_report(trx->mysql_thd, file->offset - num_runs, file->offset);
+ }
+#endif /* UNIV_SOLARIS */
+
+ error = row_merge(trx, dup, file, block, tmpfd,
+ &num_runs, run_offset, stage,
+ crypt_block, space);
+
+ if (update_progress) {
+ merge_count++;
+ curr_progress = (merge_count >= total_merge_sort_count) ?
+ pct_cost :
+ pct_cost * static_cast<double>(merge_count)
+ / static_cast<double>(total_merge_sort_count);
+ /* Report 10.12% as the integer 1012 */
+ onlineddl_pct_progress = (ulint) ((pct_progress + curr_progress) * 100);
+ }
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ MEM_CHECK_DEFINED(run_offset, num_runs * sizeof *run_offset);
+ } while (num_runs > 1);
+
+ ut_free(run_offset);
+
+ /* Progress report only for "normal" indexes. */
+#ifndef UNIV_SOLARIS
+ if (!(dup->index->type & DICT_FTS)) {
+ thd_progress_end(trx->mysql_thd);
+ }
+#endif /* UNIV_SOLARIS */
+
+ DBUG_RETURN(error);
+}
+
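+/* Worked example, illustrative only: row_merge_sort() above reports
+progress by scaling a percentage by 100 into an integer status variable.
+Assuming pct_progress = 5.0 (work already completed) and
+curr_progress = 5.12 (this phase), the reported value would be:
+
+ onlineddl_pct_progress = (ulint) ((5.0 + 5.12) * 100); // 1012 == 10.12%
+*/
+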
+/** Copy externally stored columns to the data tuple.
+@param[in] mrec record containing BLOB pointers,
+or NULL to use tuple instead
+@param[in] offsets offsets of mrec
+@param[in] zip_size compressed page size in bytes, or 0
+@param[in,out] tuple data tuple
+@param[in,out] heap memory heap */
+static
+void
+row_merge_copy_blobs(
+ const mrec_t* mrec,
+ const rec_offs* offsets,
+ ulint zip_size,
+ dtuple_t* tuple,
+ mem_heap_t* heap)
+{
+ ut_ad(mrec == NULL || rec_offs_any_extern(offsets));
+
+ for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) {
+ ulint len;
+ const void* data;
+ dfield_t* field = dtuple_get_nth_field(tuple, i);
+ ulint field_len;
+ const byte* field_data;
+
+ if (!dfield_is_ext(field)) {
+ continue;
+ }
+
+ ut_ad(!dfield_is_null(field));
+
+ /* During the creation of a PRIMARY KEY, the table is
+ X-locked, and we skip copying records that have been
+ marked for deletion. Therefore, externally stored
+ columns cannot possibly be freed between the time the
+ BLOB pointers are read (row_merge_read_clustered_index())
+ and dereferenced (below). */
+ if (mrec == NULL) {
+ field_data
+ = static_cast<byte*>(dfield_get_data(field));
+ field_len = dfield_get_len(field);
+
+ ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ ut_a(memcmp(field_data + field_len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+
+ data = btr_copy_externally_stored_field(
+ &len, field_data, zip_size, field_len, heap);
+ } else {
+ data = btr_rec_copy_externally_stored_field(
+ mrec, offsets, zip_size, i, &len, heap);
+ }
+
+ /* Because we have locked the table, any records
+ written by incomplete transactions must have been
+ rolled back already. There must not be any incomplete
+ BLOB columns. */
+ ut_a(data);
+
+ dfield_set_data(field, data, len);
+ }
+}
+
+/** Convert a merge record to a typed data tuple. Note that externally
+stored fields are not copied to heap.
+@param[in] index index on the table
+@param[out] dtuple data tuple to build
+@param[in] mtuple merge record */
+static
+void
+row_merge_mtuple_to_dtuple(
+ dict_index_t* index,
+ dtuple_t* dtuple,
+ const mtuple_t* mtuple)
+{
+ ut_ad(!dict_index_is_ibuf(index));
+
+ memcpy(dtuple->fields, mtuple->fields,
+ dtuple->n_fields * sizeof *mtuple->fields);
+}
+
+/** Insert sorted data tuples to the index.
+@param[in] index index into which the tuples are inserted
+@param[in] old_table old table
+@param[in] fd file descriptor
+@param[in,out] block file buffer
+@param[in] row_buf the sorted data tuples,
+or NULL if fd and block are to be used instead
+@param[in,out] btr_bulk btr bulk instance
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL, stage->begin_phase_insert() will be called initially
+and then stage->inc() will be called for each record that is processed.
+@return DB_SUCCESS or error number */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_merge_insert_index_tuples(
+ dict_index_t* index,
+ const dict_table_t* old_table,
+ const pfs_os_file_t& fd,
+ row_merge_block_t* block,
+ const row_merge_buf_t* row_buf,
+ BtrBulk* btr_bulk,
+ const ib_uint64_t table_total_rows, /*!< in: total rows of old table */
+ const double pct_progress, /*!< in: total progress
+ percent until now */
+ const double pct_cost, /*!< in: current progress percent
+ */
+ row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */
+ ulint space, /*!< in: space id */
+ ut_stage_alter_t* stage)
+{
+ const byte* b;
+ mem_heap_t* heap;
+ mem_heap_t* tuple_heap;
+ dberr_t error = DB_SUCCESS;
+ ulint foffs = 0;
+ rec_offs* offsets;
+ mrec_buf_t* buf;
+ ulint n_rows = 0;
+ dtuple_t* dtuple;
+ ib_uint64_t inserted_rows = 0;
+ double curr_progress = 0;
+ dict_index_t* old_index = NULL;
+ const mrec_t* mrec = NULL;
+ mtr_t mtr;
+
+
+ DBUG_ENTER("row_merge_insert_index_tuples");
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(!dict_index_is_spatial(index));
+
+ if (stage != NULL) {
+ stage->begin_phase_insert();
+ }
+
+ tuple_heap = mem_heap_create(1000);
+
+ {
+ ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ heap = mem_heap_create(sizeof *buf + i * sizeof *offsets);
+ offsets = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, i * sizeof *offsets));
+ rec_offs_set_n_alloc(offsets, i);
+ rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index));
+ }
+
+ if (row_buf != NULL) {
+ ut_ad(fd == OS_FILE_CLOSED);
+ ut_ad(block == NULL);
+ DBUG_EXECUTE_IF("row_merge_read_failure",
+ error = DB_CORRUPTION;
+ goto err_exit;);
+ buf = NULL;
+ b = NULL;
+ dtuple = dtuple_create(
+ heap, dict_index_get_n_fields(index));
+ dtuple_set_n_fields_cmp(
+ dtuple, dict_index_get_n_unique_in_tree(index));
+ } else {
+ b = block;
+ dtuple = NULL;
+
+ if (!row_merge_read(fd, foffs, block, crypt_block, space)) {
+ error = DB_CORRUPTION;
+ goto err_exit;
+ } else {
+ buf = static_cast<mrec_buf_t*>(
+ mem_heap_alloc(heap, sizeof *buf));
+ }
+ }
+
+ for (;;) {
+
+ if (stage != NULL) {
+ stage->inc();
+ }
+
+ if (row_buf != NULL) {
+ if (n_rows >= row_buf->n_tuples) {
+ break;
+ }
+
+ /* Convert merge tuple record from
+ row buffer to data tuple record */
+ row_merge_mtuple_to_dtuple(
+ index, dtuple, &row_buf->tuples[n_rows]);
+ n_rows++;
+ /* BLOB pointers must be copied from dtuple */
+ mrec = NULL;
+ } else {
+ b = row_merge_read_rec(block, buf, b, index,
+ fd, &foffs, &mrec, offsets,
+ crypt_block,
+ space);
+
+ if (UNIV_UNLIKELY(!b)) {
+ /* End of list, or I/O error */
+ if (mrec) {
+ error = DB_CORRUPTION;
+ }
+ break;
+ }
+
+ dtuple = row_rec_to_index_entry_low(
+ mrec, index, offsets, tuple_heap);
+ }
+
+ old_index = dict_table_get_first_index(old_table);
+
+ if (dict_index_is_clust(index)
+ && dict_index_is_online_ddl(old_index)) {
+ error = row_log_table_get_error(old_index);
+ if (error != DB_SUCCESS) {
+ break;
+ }
+ }
+
+ if (dict_index_is_clust(index) && dtuple_get_n_ext(dtuple)) {
+ /* Off-page columns can be fetched safely
+ when concurrent modifications to the table
+ are disabled. (Purge can process delete-marked
+ records, but row_merge_read_clustered_index()
+ would have skipped them.)
+
+ When concurrent modifications are enabled,
+ row_merge_read_clustered_index() will
+ only see rows from transactions that were
+ committed before the ALTER TABLE started
+ (REPEATABLE READ).
+
+ Any modifications after the
+ row_merge_read_clustered_index() scan
+ will go through row_log_table_apply().
+ Any modifications to off-page columns
+ will be tracked by
+ row_log_table_blob_alloc() and
+ row_log_table_blob_free(). */
+ row_merge_copy_blobs(
+ mrec, offsets, old_table->space->zip_size(),
+ dtuple, tuple_heap);
+ }
+
+#ifdef UNIV_DEBUG
+ static const latch_level_t latches[] = {
+ SYNC_INDEX_TREE, /* index->lock */
+ SYNC_LEVEL_VARYING /* btr_bulk->m_page_bulks */
+ };
+#endif /* UNIV_DEBUG */
+
+ ut_ad(dtuple_validate(dtuple));
+ ut_ad(!sync_check_iterate(sync_allowed_latches(latches,
+ latches + 2)));
+ error = btr_bulk->insert(dtuple);
+
+ if (error != DB_SUCCESS) {
+ goto err_exit;
+ }
+
+ mem_heap_empty(tuple_heap);
+
+ /* Increment innodb_onlineddl_pct_progress status variable */
+ inserted_rows++;
+ if (inserted_rows % 1000 == 0) {
+ /* Update progress for every 1000 rows */
+ curr_progress = (inserted_rows >= table_total_rows ||
+ table_total_rows <= 0) ?
+ pct_cost :
+ pct_cost * static_cast<double>(inserted_rows)
+ / static_cast<double>(table_total_rows);
+
+ /* Report 10.12% as the integer 1012 */
+ onlineddl_pct_progress = (ulint) ((pct_progress + curr_progress) * 100);
+ }
+ }
+
+err_exit:
+ mem_heap_free(tuple_heap);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(error);
+}
+
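+/* Usage sketch, illustrative and simplified: the function above has two
+input modes. For the in-memory path, pass the sorted buffer and leave the
+file closed; for the on-disk path, pass the merge file and a NULL buffer.
+The variable names below are assumed, not taken from this file:
+
+ // in-memory: tuples come from sorted_buf
+ err = row_merge_insert_index_tuples(index, old_table, OS_FILE_CLOSED,
+ NULL, sorted_buf, &btr_bulk, n_rows,
+ pct_progress, pct_cost,
+ NULL, space_id, stage);
+
+ // on-disk: tuples are read from file.fd through block
+ err = row_merge_insert_index_tuples(index, old_table, file.fd,
+ block, NULL, &btr_bulk, file.n_rec,
+ pct_progress, pct_cost,
+ crypt_block, space_id, stage);
+*/
+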
+/*********************************************************************//**
+Sets an exclusive lock on a table, for the duration of creating indexes.
+@return error code or DB_SUCCESS */
+dberr_t
+row_merge_lock_table(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */
+{
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+ trx->op_info = "setting table lock for creating or dropping index";
+ trx->ddl = true;
+
+ return(lock_table_for_trx(table, trx, mode));
+}
+
+/*********************************************************************//**
+Drop an index that was created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+static
+void
+row_merge_drop_index_dict(
+/*======================*/
+ trx_t* trx, /*!< in/out: dictionary transaction */
+ index_id_t index_id)/*!< in: index identifier */
+{
+ static const char sql[] =
+ "PROCEDURE DROP_INDEX_PROC () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n"
+ "DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n"
+ "END;\n";
+ dberr_t error;
+ pars_info_t* info;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+ ut_d(dict_sys.assert_locked());
+
+ info = pars_info_create();
+ pars_info_add_ull_literal(info, "indexid", index_id);
+ trx->op_info = "dropping index from dictionary";
+ error = que_eval_sql(info, sql, FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ trx->error_state = DB_SUCCESS;
+
+ ib::error() << "row_merge_drop_index_dict failed with error "
+ << error;
+ }
+
+ trx->op_info = "";
+}
+
+/*********************************************************************//**
+Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+void
+row_merge_drop_indexes_dict(
+/*========================*/
+ trx_t* trx, /*!< in/out: dictionary transaction */
+ table_id_t table_id)/*!< in: table identifier */
+{
+ static const char sql[] =
+ "PROCEDURE DROP_INDEXES_PROC () IS\n"
+ "ixid CHAR;\n"
+ "found INT;\n"
+
+ "DECLARE CURSOR index_cur IS\n"
+ " SELECT ID FROM SYS_INDEXES\n"
+ " WHERE TABLE_ID=:tableid AND\n"
+ " SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n"
+ "FOR UPDATE;\n"
+
+ "BEGIN\n"
+ "found := 1;\n"
+ "OPEN index_cur;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH index_cur INTO ixid;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n"
+ " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE index_cur;\n"
+
+ "END;\n";
+ dberr_t error;
+ pars_info_t* info;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+ ut_d(dict_sys.assert_locked());
+
+ /* It is possible that table->n_ref_count > 1 when
+ locked=TRUE. In this case, all code that has an open
+ handle to the table should be waiting for the next statement
+ to execute, or waiting for a meta-data lock.
+
+ A concurrent purge will be prevented by dict_sys.latch. */
+
+ info = pars_info_create();
+ pars_info_add_ull_literal(info, "tableid", table_id);
+ trx->op_info = "dropping indexes";
+ error = que_eval_sql(info, sql, FALSE, trx);
+
+ switch (error) {
+ case DB_SUCCESS:
+ break;
+ default:
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ ib::error() << "row_merge_drop_indexes_dict failed with error "
+ << error;
+ /* fall through */
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ trx->error_state = DB_SUCCESS;
+ }
+
+ trx->op_info = "";
+}
+
+/** Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@param trx dictionary transaction
+@param table table containing the indexes
+@param locked true if the table is locked;
+ false if the drop may need to be deferred (lazy drop)
+@param alter_trx Alter table transaction */
+void
+row_merge_drop_indexes(
+ trx_t* trx,
+ dict_table_t* table,
+ bool locked,
+ const trx_t* alter_trx)
+{
+ dict_index_t* index;
+ dict_index_t* next_index;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+ ut_d(dict_sys.assert_locked());
+
+ index = dict_table_get_first_index(table);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_COMPLETE);
+
+ /* the caller should have an open handle to the table */
+ ut_ad(table->get_ref_count() >= 1);
+
+ /* It is possible that table->n_ref_count > 1 when
+ locked=TRUE. In this case, all code that has an open
+ handle to the table should be waiting for the next statement
+ to execute, or waiting for a meta-data lock.
+
+ A concurrent purge will be prevented by dict_sys.latch. */
+
+ if (!locked && (table->get_ref_count() > 1
+ || table->has_lock_other_than(alter_trx))) {
+ /* We will have to drop the indexes later, when the
+ table is guaranteed to be no longer in use. Mark the
+ indexes as incomplete and corrupted, so that other
+ threads will stop using them. Let dict_table_close()
+ or crash recovery or the next invocation of
+ prepare_inplace_alter_table() take care of dropping
+ the indexes. */
+
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ ut_ad(!dict_index_is_clust(index));
+
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ continue;
+ case ONLINE_INDEX_COMPLETE:
+ if (index->is_committed()) {
+ /* Do nothing to already
+ published indexes. */
+ } else if (index->type & DICT_FTS) {
+ /* Drop a completed FULLTEXT
+ index, due to a timeout during
+ MDL upgrade for
+ commit_inplace_alter_table().
+ Because only concurrent reads
+ are allowed (and they are not
+ seeing this index yet) we
+ are safe to drop the index. */
+ dict_index_t* prev = UT_LIST_GET_PREV(
+ indexes, index);
+ /* At least there should be
+ the clustered index before
+ this one. */
+ ut_ad(prev);
+ ut_a(table->fts);
+ fts_drop_index(table, index, trx);
+ row_merge_drop_index_dict(
+ trx, index->id);
+ /* We can remove a DICT_FTS
+ index from the cache, because
+ we do not allow ADD FULLTEXT INDEX
+ with LOCK=NONE. If we allowed that,
+ we should exclude FTS entries from
+ prebuilt->ins_node->entry_list
+ in ins_node_create_entry_list(). */
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!index->search_info->ref_count);
+#endif /* BTR_CUR_HASH_ADAPT */
+ dict_index_remove_from_cache(
+ table, index);
+ index = prev;
+ } else {
+ rw_lock_x_lock(
+ dict_index_get_lock(index));
+ dict_index_set_online_status(
+ index, ONLINE_INDEX_ABORTED);
+ index->type |= DICT_CORRUPT;
+ table->drop_aborted = TRUE;
+ goto drop_aborted;
+ }
+ continue;
+ case ONLINE_INDEX_CREATION:
+ rw_lock_x_lock(dict_index_get_lock(index));
+ ut_ad(!index->is_committed());
+ row_log_abort_sec(index);
+ drop_aborted:
+ rw_lock_x_unlock(dict_index_get_lock(index));
+
+ DEBUG_SYNC_C("merge_drop_index_after_abort");
+ /* covered by dict_sys.mutex */
+ MONITOR_INC(MONITOR_BACKGROUND_DROP_INDEX);
+ /* fall through */
+ case ONLINE_INDEX_ABORTED:
+ /* Drop the index tree from the
+ data dictionary and free it from
+ the tablespace, but keep the object
+ in the data dictionary cache. */
+ row_merge_drop_index_dict(trx, index->id);
+ rw_lock_x_lock(dict_index_get_lock(index));
+ dict_index_set_online_status(
+ index, ONLINE_INDEX_ABORTED_DROPPED);
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ table->drop_aborted = TRUE;
+ continue;
+ }
+ ut_error;
+ }
+
+ fts_clear_all(table, trx);
+ return;
+ }
+
+ row_merge_drop_indexes_dict(trx, table->id);
+
+ /* Invalidate all row_prebuilt_t::ins_graph that are referring
+ to this table. That is, force row_get_prebuilt_insert_row() to
+ rebuild prebuilt->ins_node->entry_list). */
+ ut_ad(table->def_trx_id <= trx->id);
+ table->def_trx_id = trx->id;
+
+ next_index = dict_table_get_next_index(index);
+
+ while ((index = next_index) != NULL) {
+ /* read the next pointer before freeing the index */
+ next_index = dict_table_get_next_index(index);
+
+ ut_ad(!dict_index_is_clust(index));
+
+ if (!index->is_committed()) {
+ /* If it is FTS index, drop from table->fts
+ and also drop its auxiliary tables */
+ if (index->type & DICT_FTS) {
+ ut_a(table->fts);
+ fts_drop_index(table, index, trx);
+ }
+
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_CREATION:
+ /* This state should only be possible
+ when prepare_inplace_alter_table() fails
+ after invoking row_merge_create_index().
+ In inplace_alter_table(),
+ row_merge_build_indexes()
+ should never leave the index in this state.
+ It would invoke row_log_abort_sec() on
+ failure. */
+ case ONLINE_INDEX_COMPLETE:
+ /* In these cases, we are able to drop
+ the index straight. The DROP INDEX was
+ never deferred. */
+ break;
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ /* covered by dict_sys.mutex */
+ MONITOR_DEC(MONITOR_BACKGROUND_DROP_INDEX);
+ }
+
+ dict_index_remove_from_cache(table, index);
+ }
+ }
+
+ fts_clear_all(table, trx);
+ table->drop_aborted = FALSE;
+ ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE));
+}
+
+/*********************************************************************//**
+Drop all partially created indexes during crash recovery. */
+void
+row_merge_drop_temp_indexes(void)
+/*=============================*/
+{
+ static const char sql[] =
+ "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
+ "ixid CHAR;\n"
+ "found INT;\n"
+
+ "DECLARE CURSOR index_cur IS\n"
+ " SELECT ID FROM SYS_INDEXES\n"
+ " WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n"
+ "FOR UPDATE;\n"
+
+ "BEGIN\n"
+ "found := 1;\n"
+ "OPEN index_cur;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH index_cur INTO ixid;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n"
+ " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE index_cur;\n"
+ "END;\n";
+ trx_t* trx;
+ dberr_t error;
+
+ /* Load the table definitions that contain partially defined
+ indexes, so that the data dictionary information can be checked
+ when accessing the tablename.ibd files. */
+ trx = trx_create();
+ trx->op_info = "dropping partially created indexes";
+ row_mysql_lock_data_dictionary(trx);
+ /* Ensure that this transaction will be rolled back and locks
+ will be released, if the server gets killed before the commit
+ gets written to the redo log. */
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+ trx->op_info = "dropping indexes";
+ error = que_eval_sql(NULL, sql, FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ trx->error_state = DB_SUCCESS;
+
+ ib::error() << "row_merge_drop_temp_indexes failed with error"
+ << error;
+ }
+
+ trx_commit_for_mysql(trx);
+ row_mysql_unlock_data_dictionary(trx);
+ trx->free();
+}
+
+
+/** Create a temporary merge file in the given parameter path and, if
+UNIV_PFS_IO is defined, register the file descriptor with Performance Schema.
+@param[in] path location for creating the temporary merge file, or NULL
+@return file descriptor */
+pfs_os_file_t
+row_merge_file_create_low(
+ const char* path)
+{
+#ifdef WITH_INNODB_DISALLOW_WRITES
+ os_event_wait(srv_allow_writes_event);
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+ if (!path) {
+ path = mysql_tmpdir;
+ }
+#ifdef UNIV_PFS_IO
+ /* This temp file open does not go through the normal
+ file APIs; add instrumentation to register it with
+ performance schema */
+ struct PSI_file_locker* locker;
+ PSI_file_locker_state state;
+ static const char label[] = "/Innodb Merge Temp File";
+ char* name = static_cast<char*>(
+ ut_malloc_nokey(strlen(path) + sizeof label));
+ strcpy(name, path);
+ strcat(name, label);
+
+ register_pfs_file_open_begin(
+ &state, locker, innodb_temp_file_key,
+ PSI_FILE_CREATE, path ? name : label, __FILE__, __LINE__);
+
+#endif
+ DBUG_ASSERT(strlen(path) + 2 <= FN_REFLEN);
+ char filename[FN_REFLEN];
+ File f = create_temp_file(filename, path, "ib",
+ O_BINARY | O_SEQUENTIAL,
+ MYF(MY_WME | MY_TEMPORARY));
+ pfs_os_file_t fd = IF_WIN((os_file_t)my_get_osfhandle(f), f);
+
+#ifdef UNIV_PFS_IO
+ register_pfs_file_open_end(locker, fd,
+ (fd == OS_FILE_CLOSED)?NULL:&fd);
+ ut_free(name);
+#endif
+
+ if (fd == OS_FILE_CLOSED) {
+ ib::error() << "Cannot create temporary merge file";
+ }
+ return(fd);
+}
+
+
+/** Create a merge file in the given location.
+@param[out] merge_file merge file structure
+@param[in] path location for creating temporary file, or NULL
+@return file descriptor, or OS_FILE_CLOSED on error */
+pfs_os_file_t
+row_merge_file_create(
+ merge_file_t* merge_file,
+ const char* path)
+{
+ merge_file->fd = row_merge_file_create_low(path);
+ merge_file->offset = 0;
+ merge_file->n_rec = 0;
+
+ if (merge_file->fd != OS_FILE_CLOSED) {
+ if (srv_disable_sort_file_cache) {
+ os_file_set_nocache(merge_file->fd,
+ "row0merge.cc", "sort");
+ }
+ }
+ return(merge_file->fd);
+}
+
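+/* Lifecycle sketch, illustrative only: a merge_file_t should always be
+destroyed, even on the error path, so that the temporary file and its
+Performance Schema registration are released:
+
+ merge_file_t file;
+ if (row_merge_file_create(&file, NULL) != OS_FILE_CLOSED) {
+ // ... write and read sorted blocks through file.fd ...
+ row_merge_file_destroy(&file); // resets fd to OS_FILE_CLOSED
+ }
+*/
+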
+/*********************************************************************//**
+Destroy a merge file, de-registering it from Performance Schema
+if UNIV_PFS_IO is defined. */
+void
+row_merge_file_destroy_low(
+/*=======================*/
+ const pfs_os_file_t& fd) /*!< in: merge file descriptor */
+{
+ if (fd != OS_FILE_CLOSED) {
+ int res = mysql_file_close(IF_WIN(my_win_handle2File((os_file_t)fd), fd),
+ MYF(MY_WME));
+ ut_a(res != -1);
+ }
+}
+/*********************************************************************//**
+Destroy a merge file. */
+void
+row_merge_file_destroy(
+/*===================*/
+ merge_file_t* merge_file) /*!< in/out: merge file structure */
+{
+ ut_ad(!srv_read_only_mode);
+
+ if (merge_file->fd != OS_FILE_CLOSED) {
+ row_merge_file_destroy_low(merge_file->fd);
+ merge_file->fd = OS_FILE_CLOSED;
+ }
+}
+
+/*********************************************************************//**
+Rename an index in the dictionary that was created. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+dberr_t
+row_merge_rename_index_to_add(
+/*==========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ table_id_t table_id, /*!< in: table identifier */
+ index_id_t index_id) /*!< in: index identifier */
+{
+ dberr_t err = DB_SUCCESS;
+ pars_info_t* info = pars_info_create();
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in renaming indexes. */
+
+ static const char rename_index[] =
+ "PROCEDURE RENAME_INDEX_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
+ "WHERE TABLE_ID = :tableid AND ID = :indexid;\n"
+ "END;\n";
+
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+
+ trx->op_info = "renaming index to add";
+
+ pars_info_add_ull_literal(info, "tableid", table_id);
+ pars_info_add_ull_literal(info, "indexid", index_id);
+
+ err = que_eval_sql(info, rename_index, FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ trx->error_state = DB_SUCCESS;
+
+ ib::error() << "row_merge_rename_index_to_add failed with"
+ " error " << err;
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
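+/* Example, illustrative only: in InnoDB inner SQL, SUBSTR() is 0-based (see
+SUBSTR(NAME,0,1) in row_merge_drop_indexes_dict() above), so
+SUBSTR(NAME,1,LENGTH(NAME)-1) strips the first character. An index created
+under the temporary name TEMP_INDEX_PREFIX_STR "idx" is thus published as
+"idx"; row_merge_rename_index_to_drop() below performs the inverse by
+CONCAT()ing the prefix back on. */
+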
+/*********************************************************************//**
+Rename an index in the dictionary that is to be dropped. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+dberr_t
+row_merge_rename_index_to_drop(
+/*===========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ table_id_t table_id, /*!< in: table identifier */
+ index_id_t index_id) /*!< in: index identifier */
+{
+ dberr_t err;
+ pars_info_t* info = pars_info_create();
+
+ ut_ad(!srv_read_only_mode);
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in renaming indexes. */
+
+ static const char rename_index[] =
+ "PROCEDURE RENAME_INDEX_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_INDEXES SET NAME=CONCAT('"
+ TEMP_INDEX_PREFIX_STR "',NAME)\n"
+ "WHERE TABLE_ID = :tableid AND ID = :indexid;\n"
+ "END;\n";
+
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+
+ trx->op_info = "renaming index to drop";
+
+ pars_info_add_ull_literal(info, "tableid", table_id);
+ pars_info_add_ull_literal(info, "indexid", index_id);
+
+ err = que_eval_sql(info, rename_index, FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ trx->error_state = DB_SUCCESS;
+
+ ib::error() << "row_merge_rename_index_to_drop failed with"
+ " error " << err;
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/** Create the index and load it into the dictionary.
+@param[in,out] table the index is on this table
+@param[in] index_def the index definition
+@param[in] add_v new virtual columns added along with add
+ index call
+@return index, or NULL on error */
+dict_index_t*
+row_merge_create_index(
+ dict_table_t* table,
+ const index_def_t* index_def,
+ const dict_add_v_col_t* add_v)
+{
+ dict_index_t* index;
+ ulint n_fields = index_def->n_fields;
+ ulint i;
+ ulint n_add_vcol = 0;
+
+ DBUG_ENTER("row_merge_create_index");
+
+ ut_ad(!srv_read_only_mode);
+
+ /* Create the index prototype, using the passed-in definition;
+ this is not a persistent operation. We pass 0 as the space id, and
+ determine at a lower level the space id where the table is stored. */
+
+ index = dict_mem_index_create(table, index_def->name,
+ index_def->ind_type, n_fields);
+ index->set_committed(index_def->rebuild);
+
+ for (i = 0; i < n_fields; i++) {
+ const char* name;
+ index_field_t* ifield = &index_def->fields[i];
+
+ if (ifield->is_v_col) {
+ if (ifield->col_no >= table->n_v_def) {
+ ut_ad(ifield->col_no < table->n_v_def
+ + add_v->n_v_col);
+ ut_ad(ifield->col_no >= table->n_v_def);
+ name = add_v->v_col_name[
+ ifield->col_no - table->n_v_def];
+ n_add_vcol++;
+ } else {
+ name = dict_table_get_v_col_name(
+ table, ifield->col_no);
+ }
+ } else {
+ name = dict_table_get_col_name(table, ifield->col_no);
+ }
+
+ dict_mem_index_add_field(index, name, ifield->prefix_len);
+ }
+
+ if (n_add_vcol) {
+ index->assign_new_v_col(n_add_vcol);
+ }
+
+ DBUG_RETURN(index);
+}
+
+/*********************************************************************//**
+Check if a transaction can use an index. */
+bool
+row_merge_is_index_usable(
+/*======================*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index) /*!< in: index to check */
+{
+ if (!index->is_primary()
+ && dict_index_is_online_ddl(index)) {
+ /* Indexes that are being created are not useable. */
+ return(false);
+ }
+
+ return(!index->is_corrupted()
+ && (index->table->is_temporary() || index->table->no_rollback()
+ || index->trx_id == 0
+ || !trx->read_view.is_open()
+ || trx->read_view.changes_visible(
+ index->trx_id,
+ index->table->name)));
+}
+
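+/* Example, illustrative only: suppose a secondary index finished building
+under a transaction whose id is 100. A REPEATABLE READ transaction whose
+read view was opened before that transaction committed must not use the
+index: the build skipped delete-marked records, so row versions that the
+older view can still see may be absent from the new index. The
+changes_visible() check above captures this rule. */
+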
+/*********************************************************************//**
+Drop a table. The caller must have ensured that the background stats
+thread is not processing the table. This can be done by calling
+dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and
+before calling this function.
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_drop_table(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table) /*!< in: table to drop */
+{
+ ut_ad(!srv_read_only_mode);
+
+ /* There must be no open transactions on the table. */
+ ut_a(table->get_ref_count() == 0);
+
+ return(row_drop_table_for_mysql(table->name.m_name,
+ trx, SQLCOM_DROP_TABLE, false, false));
+}
+
+/** Build indexes on a table by reading a clustered index, creating a temporary
+file containing index entries, merge sorting these index entries and inserting
+sorted index entries to indexes.
+@param[in] trx transaction
+@param[in] old_table table where rows are read from
+@param[in] new_table table where indexes are created; identical to
+old_table unless creating a PRIMARY KEY
+@param[in] online true if creating indexes online
+@param[in] indexes indexes to be created
+@param[in] key_numbers MySQL key numbers
+@param[in] n_indexes size of indexes[]
+@param[in,out] table MySQL table, for reporting erroneous key value
+if applicable
+@param[in] defaults default values of added, changed columns, or NULL
+@param[in] col_map mapping of old column numbers to new ones, or
+NULL if old_table == new_table
+@param[in] add_autoinc number of added AUTO_INCREMENT columns, or
+ULINT_UNDEFINED if none is added
+@param[in,out] sequence autoinc sequence
+@param[in] skip_pk_sort whether the new PRIMARY KEY will follow
+existing order
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_read_pk() will be called at the beginning of
+this function and it will be passed to other functions for further accounting.
+@param[in] add_v new virtual columns added along with indexes
+@param[in] eval_table mysql table used to evaluate virtual column
+ value, see innobase_get_computed_value().
+@param[in] allow_not_null allow the conversion from null to not-null
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_build_indexes(
+ trx_t* trx,
+ dict_table_t* old_table,
+ dict_table_t* new_table,
+ bool online,
+ dict_index_t** indexes,
+ const ulint* key_numbers,
+ ulint n_indexes,
+ struct TABLE* table,
+ const dtuple_t* defaults,
+ const ulint* col_map,
+ ulint add_autoinc,
+ ib_sequence_t& sequence,
+ bool skip_pk_sort,
+ ut_stage_alter_t* stage,
+ const dict_add_v_col_t* add_v,
+ struct TABLE* eval_table,
+ bool allow_not_null)
+{
+ merge_file_t* merge_files;
+ row_merge_block_t* block;
+ ut_new_pfx_t block_pfx;
+ size_t block_size;
+ ut_new_pfx_t crypt_pfx;
+ row_merge_block_t* crypt_block = NULL;
+ ulint i;
+ ulint j;
+ dberr_t error;
+ pfs_os_file_t tmpfd = OS_FILE_CLOSED;
+ dict_index_t* fts_sort_idx = NULL;
+ fts_psort_t* psort_info = NULL;
+ fts_psort_t* merge_info = NULL;
+ bool fts_psort_initiated = false;
+
+ double total_static_cost = 0;
+ double total_dynamic_cost = 0;
+ ulint total_index_blocks = 0;
+ double pct_cost = 0;
+ double pct_progress = 0;
+
+ DBUG_ENTER("row_merge_build_indexes");
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad((old_table == new_table) == !col_map);
+ ut_ad(!defaults || col_map);
+
+ stage->begin_phase_read_pk(skip_pk_sort && new_table != old_table
+ ? n_indexes - 1
+ : n_indexes);
+
+ /* Allocate memory for merge file data structure and initialize
+ fields */
+
+ ut_allocator<row_merge_block_t> alloc(mem_key_row_merge_sort);
+
+ /* This will allocate "3 * srv_sort_buf_size" elements of type
+ row_merge_block_t. The latter is defined as byte. */
+ block_size = 3 * srv_sort_buf_size;
+ block = alloc.allocate_large(block_size, &block_pfx);
+
+ if (block == NULL) {
+ DBUG_RETURN(DB_OUT_OF_MEMORY);
+ }
+
+ crypt_pfx.m_size = 0; /* silence bogus -Wmaybe-uninitialized */
+ TRASH_ALLOC(&crypt_pfx, sizeof crypt_pfx);
+
+ if (log_tmp_is_encrypted()) {
+ crypt_block = static_cast<row_merge_block_t*>(
+ alloc.allocate_large(block_size,
+ &crypt_pfx));
+
+ if (crypt_block == NULL) {
+ DBUG_RETURN(DB_OUT_OF_MEMORY);
+ }
+ }
+
+ trx_start_if_not_started_xa(trx, true);
+ ulint n_merge_files = 0;
+
+ for (ulint i = 0; i < n_indexes; i++)
+ {
+ if (!dict_index_is_spatial(indexes[i])) {
+ n_merge_files++;
+ }
+ }
+
+ merge_files = static_cast<merge_file_t*>(
+ ut_malloc_nokey(n_merge_files * sizeof *merge_files));
+
+ /* Initialize all the merge file descriptors, so that we
+ don't call row_merge_file_destroy() on uninitialized
+ merge file descriptor */
+
+ for (i = 0; i < n_merge_files; i++) {
+ merge_files[i].fd = OS_FILE_CLOSED;
+ merge_files[i].offset = 0;
+ merge_files[i].n_rec = 0;
+ }
+
+ total_static_cost = COST_BUILD_INDEX_STATIC
+ * static_cast<double>(n_indexes) + COST_READ_CLUSTERED_INDEX;
+ total_dynamic_cost = COST_BUILD_INDEX_DYNAMIC
+ * static_cast<double>(n_indexes);
+ for (i = 0; i < n_indexes; i++) {
+ if (indexes[i]->type & DICT_FTS) {
+ ibool opt_doc_id_size = FALSE;
+
+ /* To build FTS index, we would need to extract
+ doc's word, Doc ID, and word's position, so
+ we need to build a "fts sort index" indexing
+ on above three 'fields' */
+ fts_sort_idx = row_merge_create_fts_sort_index(
+ indexes[i], old_table, &opt_doc_id_size);
+
+ row_merge_dup_t* dup
+ = static_cast<row_merge_dup_t*>(
+ ut_malloc_nokey(sizeof *dup));
+ dup->index = fts_sort_idx;
+ dup->table = table;
+ dup->col_map = col_map;
+ dup->n_dup = 0;
+
+ /* This can fail e.g. if temporary files cannot be
+ created */
+ if (!row_fts_psort_info_init(
+ trx, dup, new_table, opt_doc_id_size,
+ old_table->space->zip_size(),
+ &psort_info, &merge_info)) {
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ /* We need to ensure that we free the resources
+ allocated */
+ fts_psort_initiated = true;
+ }
+ }
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information("InnoDB: Online DDL : Start reading"
+ " clustered index of the table"
+ " and create temporary files");
+ }
+
+ pct_cost = COST_READ_CLUSTERED_INDEX * 100 / (total_static_cost + total_dynamic_cost);
+
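+ /* Worked example, illustrative only, with hypothetical constants:
+ if COST_READ_CLUSTERED_INDEX = 10, COST_BUILD_INDEX_STATIC = 10,
+ COST_BUILD_INDEX_DYNAMIC = 90 and n_indexes = 2, then
+ total_static_cost = 10 * 2 + 10 = 30, total_dynamic_cost = 180,
+ and the clustered index scan above is budgeted
+ 10 * 100 / (30 + 180) = ~4.76 percent of the total work. */
+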
+ /* Do not continue if we can't encrypt table pages */
+ if (!old_table->is_readable() ||
+ !new_table->is_readable()) {
+ error = DB_DECRYPTION_FAILED;
+ ib_push_warning(trx->mysql_thd, DB_DECRYPTION_FAILED,
+ "Table %s is encrypted but encryption service or"
+ " used key_id is not available. "
+ " Can't continue reading table.",
+ !old_table->is_readable() ? old_table->name.m_name :
+ new_table->name.m_name);
+ goto func_exit;
+ }
+
+ /* Read clustered index of the table and create files for
+ secondary index entries for merge sort */
+ error = row_merge_read_clustered_index(
+ trx, table, old_table, new_table, online, indexes,
+ fts_sort_idx, psort_info, merge_files, key_numbers,
+ n_indexes, defaults, add_v, col_map, add_autoinc,
+ sequence, block, skip_pk_sort, &tmpfd, stage,
+ pct_cost, crypt_block, eval_table, allow_not_null);
+
+ stage->end_phase_read_pk();
+
+ pct_progress += pct_cost;
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information("InnoDB: Online DDL : End of reading "
+ "clustered index of the table"
+ " and create temporary files");
+ }
+
+ for (i = 0; i < n_merge_files; i++) {
+ total_index_blocks += merge_files[i].offset;
+ }
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ DEBUG_SYNC_C("row_merge_after_scan");
+
+ /* Now we have files containing index entries ready for
+ sorting and inserting. */
+
+ for (ulint k = 0, i = 0; i < n_indexes; i++) {
+ dict_index_t* sort_idx = indexes[i];
+
+ if (dict_index_is_spatial(sort_idx)) {
+ continue;
+ }
+
+ if (indexes[i]->type & DICT_FTS) {
+
+ sort_idx = fts_sort_idx;
+
+ if (FTS_PLL_MERGE) {
+ row_fts_start_parallel_merge(merge_info);
+ for (j = 0; j < FTS_NUM_AUX_INDEX; j++) {
+ merge_info[j].task->wait();
+ delete merge_info[j].task;
+ }
+ } else {
+ /* This cannot report duplicates; an
+ assertion would fail in that case. */
+ error = row_fts_merge_insert(
+ sort_idx, new_table,
+ psort_info, 0);
+ }
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n");
+#endif
+ } else if (merge_files[k].fd != OS_FILE_CLOSED) {
+ char buf[NAME_LEN + 1];
+ row_merge_dup_t dup = {
+ sort_idx, table, col_map, 0};
+
+ pct_cost = (COST_BUILD_INDEX_STATIC +
+ (total_dynamic_cost
+ * static_cast<double>(merge_files[k].offset)
+ / static_cast<double>(total_index_blocks)))
+ / (total_static_cost + total_dynamic_cost)
+ * PCT_COST_MERGESORT_INDEX * 100;
+ char* bufend = innobase_convert_name(
+ buf, sizeof buf,
+ indexes[i]->name,
+ strlen(indexes[i]->name),
+ trx->mysql_thd);
+ buf[bufend - buf] = '\0';
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information("InnoDB: Online DDL :"
+ " Start merge-sorting"
+ " index %s"
+ " (" ULINTPF
+ " / " ULINTPF "),"
+ " estimated cost :"
+ " %2.4f",
+ buf, i + 1, n_indexes,
+ pct_cost);
+ }
+
+ error = row_merge_sort(
+ trx, &dup, &merge_files[k],
+ block, &tmpfd, true,
+ pct_progress, pct_cost,
+ crypt_block, new_table->space_id,
+ stage);
+
+ pct_progress += pct_cost;
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information("InnoDB: Online DDL :"
+ " End of "
+ " merge-sorting index %s"
+ " (" ULINTPF
+ " / " ULINTPF ")",
+ buf, i + 1, n_indexes);
+ }
+
+ if (error == DB_SUCCESS) {
+ BtrBulk btr_bulk(sort_idx, trx);
+
+ pct_cost = (COST_BUILD_INDEX_STATIC +
+ (total_dynamic_cost
+ * static_cast<double>(
+ merge_files[k].offset)
+ / static_cast<double>(
+ total_index_blocks)))
+ / (total_static_cost
+ + total_dynamic_cost)
+ * PCT_COST_INSERT_INDEX * 100;
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information(
+ "InnoDB: Online DDL : Start "
+ "building index %s"
+ " (" ULINTPF
+ " / " ULINTPF "), estimated "
+ "cost : %2.4f", buf, i + 1,
+ n_indexes, pct_cost);
+ }
+
+ error = row_merge_insert_index_tuples(
+ sort_idx, old_table,
+ merge_files[k].fd, block, NULL,
+ &btr_bulk,
+ merge_files[k].n_rec, pct_progress, pct_cost,
+ crypt_block, new_table->space_id,
+ stage);
+
+ error = btr_bulk.finish(error);
+
+ pct_progress += pct_cost;
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information(
+ "InnoDB: Online DDL : "
+ "End of building index %s"
+ " (" ULINTPF " / " ULINTPF ")",
+ buf, i + 1, n_indexes);
+ }
+ }
+ }
+
+ /* Close the temporary file to free up space. */
+ row_merge_file_destroy(&merge_files[k++]);
+
+ if (indexes[i]->type & DICT_FTS) {
+ row_fts_psort_info_destroy(psort_info, merge_info);
+ fts_psort_initiated = false;
+ } else if (old_table != new_table) {
+ ut_ad(!sort_idx->online_log);
+ ut_ad(sort_idx->online_status
+ == ONLINE_INDEX_COMPLETE);
+ }
+
+ if (old_table != new_table
+ || (indexes[i]->type & (DICT_FTS | DICT_SPATIAL))
+ || error != DB_SUCCESS || !online) {
+ /* Do not apply any online log. */
+ } else {
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information(
+ "InnoDB: Online DDL : Applying"
+ " log to index");
+ }
+
+ DEBUG_SYNC_C("row_log_apply_before");
+ error = row_log_apply(trx, sort_idx, table, stage);
+ DEBUG_SYNC_C("row_log_apply_after");
+ }
+
+ if (error != DB_SUCCESS) {
+ trx->error_key_num = key_numbers[i];
+ goto func_exit;
+ }
+
+ if (indexes[i]->type & DICT_FTS
+ && UNIV_UNLIKELY(fts_enable_diag_print)) {
+ ib::info() << "Finished building full-text index "
+ << indexes[i]->name;
+ }
+ }
+
+func_exit:
+
+ DBUG_EXECUTE_IF(
+ "ib_build_indexes_too_many_concurrent_trxs",
+ error = DB_TOO_MANY_CONCURRENT_TRXS;
+ trx->error_state = error;);
+
+ if (fts_psort_initiated) {
+ /* Clean up FTS psort related resource */
+ row_fts_psort_info_destroy(psort_info, merge_info);
+ fts_psort_initiated = false;
+ }
+
+ row_merge_file_destroy_low(tmpfd);
+
+ for (i = 0; i < n_merge_files; i++) {
+ row_merge_file_destroy(&merge_files[i]);
+ }
+
+ if (fts_sort_idx) {
+ dict_mem_index_free(fts_sort_idx);
+ }
+
+ ut_free(merge_files);
+
+ alloc.deallocate_large(block, &block_pfx);
+
+ if (crypt_block) {
+ alloc.deallocate_large(crypt_block, &crypt_pfx);
+ }
+
+ DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID);
+
+ if (online && old_table == new_table && error != DB_SUCCESS) {
+ /* On error, flag all online secondary index creation
+ as aborted. */
+ for (i = 0; i < n_indexes; i++) {
+ ut_ad(!(indexes[i]->type & DICT_FTS));
+ ut_ad(!indexes[i]->is_committed());
+ ut_ad(!dict_index_is_clust(indexes[i]));
+
+ /* Completed indexes should be dropped as
+ well, and indexes whose creation was aborted
+ should be dropped from the persistent
+ storage. However, at this point we can only
+ set some flags in the not-yet-published
+ indexes. These indexes will be dropped later
+ in row_merge_drop_indexes(), called by
+ rollback_inplace_alter_table(). */
+
+ switch (dict_index_get_online_status(indexes[i])) {
+ case ONLINE_INDEX_COMPLETE:
+ break;
+ case ONLINE_INDEX_CREATION:
+ rw_lock_x_lock(
+ dict_index_get_lock(indexes[i]));
+ row_log_abort_sec(indexes[i]);
+ indexes[i]->type |= DICT_CORRUPT;
+ rw_lock_x_unlock(
+ dict_index_get_lock(indexes[i]));
+ new_table->drop_aborted = TRUE;
+ /* fall through */
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ case ONLINE_INDEX_ABORTED:
+ MONITOR_ATOMIC_INC(
+ MONITOR_BACKGROUND_DROP_INDEX);
+ }
+ }
+ }
+
+ DBUG_EXECUTE_IF("ib_index_crash_after_bulk_load", DBUG_SUICIDE(););
+ DBUG_RETURN(error);
+}
diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc
new file mode 100644
index 00000000..6998a573
--- /dev/null
+++ b/storage/innobase/row/row0mysql.cc
@@ -0,0 +1,4902 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0mysql.cc
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <debug_sync.h>
+#include <gstream.h>
+#include <spatial.h>
+
+#include "row0mysql.h"
+#include "btr0sea.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "dict0dict.h"
+#include "dict0load.h"
+#include "dict0priv.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "dict0defrag_bg.h"
+#include "btr0defragment.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "fsp0file.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "row0import.h"
+#include "row0ins.h"
+#include "row0row.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "srv0start.h"
+#include "row0ext.h"
+#include "srv0start.h"
+
+#include <algorithm>
+#include <deque>
+#include <vector>
+
+#ifdef WITH_WSREP
+#include "mysql/service_wsrep.h"
+#include "wsrep.h"
+#include "log.h"
+#include "wsrep_mysqld.h"
+#endif
+
+/** Provide optional 4.x backwards compatibility for 5.0 and above */
+ibool row_rollback_on_timeout = FALSE;
+
+/** Chain node of the list of tables to drop in the background. */
+struct row_mysql_drop_t{
+ table_id_t table_id; /*!< table id */
+ UT_LIST_NODE_T(row_mysql_drop_t)row_mysql_drop_list;
+ /*!< list chain node */
+};
+
+/** @brief List of tables we should drop in background.
+
+ALTER TABLE in MySQL requires that the table handler can drop the
+table in the background when there are no more queries to it.
+Protected by row_drop_list_mutex. */
+static UT_LIST_BASE_NODE_T(row_mysql_drop_t) row_mysql_drop_list;
+
+/** Mutex protecting the background table drop list. */
+static ib_mutex_t row_drop_list_mutex;
+
+/** Flag: has row_mysql_drop_list been initialized? */
+static bool row_mysql_drop_list_inited;
+
+/*******************************************************************//**
+Determine if the given name is a name reserved for MySQL system tables.
+@return TRUE if name is a MySQL system table name */
+static
+ibool
+row_mysql_is_system_table(
+/*======================*/
+ const char* name)
+{
+ if (strncmp(name, "mysql/", 6) != 0) {
+
+ return(FALSE);
+ }
+
+ return(0 == strcmp(name + 6, "host")
+ || 0 == strcmp(name + 6, "user")
+ || 0 == strcmp(name + 6, "db"));
+}
+
+#ifdef UNIV_DEBUG
+/** Wait for the background drop list to become empty. */
+void
+row_wait_for_background_drop_list_empty()
+{
+ bool empty = false;
+ while (!empty) {
+ mutex_enter(&row_drop_list_mutex);
+ empty = (UT_LIST_GET_LEN(row_mysql_drop_list) == 0);
+ mutex_exit(&row_drop_list_mutex);
+ os_thread_sleep(100000);
+ }
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Delays an INSERT, DELETE or UPDATE operation if the purge is lagging. */
+static
+void
+row_mysql_delay_if_needed(void)
+/*===========================*/
+{
+ if (srv_dml_needed_delay) {
+ os_thread_sleep(srv_dml_needed_delay);
+ }
+}
+
+/*******************************************************************//**
+Frees the blob heap in prebuilt when no longer needed. */
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct of a
+ ha_innobase:: table handle */
+{
+ DBUG_ENTER("row_mysql_prebuilt_free_blob_heap");
+
+ DBUG_PRINT("row_mysql_prebuilt_free_blob_heap",
+ ("blob_heap freeing: %p", prebuilt->blob_heap));
+
+ mem_heap_free(prebuilt->blob_heap);
+ prebuilt->blob_heap = NULL;
+ DBUG_VOID_RETURN;
+}
+
+/*******************************************************************//**
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+ byte* dest, /*!< in: where to store */
+ ulint len, /*!< in: length, must fit in two bytes */
+ ulint lenlen) /*!< in: storage length of len: either 1 or 2 bytes */
+{
+ if (lenlen == 2) {
+ ut_a(len < 256 * 256);
+
+ mach_write_to_2_little_endian(dest, len);
+
+ return(dest + 2);
+ }
+
+ ut_a(lenlen == 1);
+ ut_a(len < 256);
+
+ mach_write_to_1(dest, len);
+
+ return(dest + 1);
+}
+
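+/* Worked example, illustrative only: storing len = 300 with lenlen = 2
+writes the little-endian bytes 0x2C 0x01 into dest and returns dest + 2;
+the caller copies the actual payload starting at the returned pointer. */
+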
+/*******************************************************************//**
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+ ulint* len, /*!< out: variable-length field length */
+ const byte* field, /*!< in: field in the MySQL format */
+ ulint lenlen) /*!< in: storage length of len: either 1
+ or 2 bytes */
+{
+ if (lenlen == 2) {
+ *len = mach_read_from_2_little_endian(field);
+
+ return(field + 2);
+ }
+
+ ut_a(lenlen == 1);
+
+ *len = mach_read_from_1(field);
+
+ return(field + 1);
+}
+
+/*******************************************************************//**
+Stores a reference to a BLOB in the MySQL format. */
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /*!< in: where to store */
+ ulint col_len,/*!< in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ the space for the length may vary from 1
+ to 4 bytes */
+ const void* data, /*!< in: BLOB data; if the value to store
+ is SQL NULL this should be NULL pointer */
+ ulint len) /*!< in: BLOB length; if the value to store
+ is SQL NULL this should be 0; remember
+ also to set the NULL bit in the MySQL record
+ header! */
+{
+ /* MySQL might assume that all bytes of the field are zero,
+ except for the length and the pointer fields */
+
+ memset(dest, '\0', col_len);
+
+ /* In dest there are 1 - 4 bytes reserved for the BLOB length,
+ and after that 8 bytes reserved for the pointer to the data.
+ In 32-bit architectures we only use the first 4 bytes of the pointer
+ slot. */
+
+ ut_a(col_len - 8 > 1 || len < 256);
+ ut_a(col_len - 8 > 2 || len < 256 * 256);
+ ut_a(col_len - 8 > 3 || len < 256 * 256 * 256);
+
+ mach_write_to_n_little_endian(dest, col_len - 8, len);
+
+ memcpy(dest + col_len - 8, &data, sizeof data);
+}
+
+/*******************************************************************//**
+Reads a reference to a BLOB in the MySQL format.
+@return pointer to BLOB data */
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ ulint* len, /*!< out: BLOB length */
+ const byte* ref, /*!< in: BLOB reference in the
+ MySQL format */
+ ulint col_len) /*!< in: BLOB reference length
+ (not BLOB length) */
+{
+ byte* data;
+
+ *len = mach_read_from_n_little_endian(ref, col_len - 8);
+
+ memcpy(&data, ref + col_len - 8, sizeof data);
+
+ return(data);
+}
+
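+/* Layout sketch, illustrative only: a MySQL BLOB reference of col_len
+bytes is
+
+ [ col_len - 8 bytes: little-endian data length | 8 bytes: data pointer ]
+
+so a col_len of 12 stores the length in 4 bytes, and on 32-bit builds only
+the first 4 bytes of the pointer slot are used. */
+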
+/*******************************************************************//**
+Converting InnoDB geometry data format to MySQL data format. */
+void
+row_mysql_store_geometry(
+/*=====================*/
+ byte* dest, /*!< in/out: where to store */
+ ulint dest_len, /*!< in: dest buffer size: determines
+ into how many bytes the GEOMETRY length
+ is stored, the space for the length
+ may vary from 1 to 4 bytes */
+ const byte* src, /*!< in: GEOMETRY data; if the value to
+ store is SQL NULL this should be NULL
+ pointer */
+ ulint src_len) /*!< in: GEOMETRY length; if the value
+ to store is SQL NULL this should be 0;
+ remember also to set the NULL bit in
+ the MySQL record header! */
+{
+ /* MySQL might assume that all bytes of the field are zero,
+ except for the length and the pointer fields */
+ MEM_CHECK_DEFINED(src, src_len);
+
+ memset(dest, '\0', dest_len);
+
+ /* In dest there are 1 - 4 bytes reserved for the BLOB length,
+ and after that 8 bytes reserved for the pointer to the data.
+ In 32-bit architectures we only use the first 4 bytes of the pointer
+ slot. */
+
+ ut_ad(dest_len - 8 > 1 || src_len < 1<<8);
+ ut_ad(dest_len - 8 > 2 || src_len < 1<<16);
+ ut_ad(dest_len - 8 > 3 || src_len < 1<<24);
+
+ mach_write_to_n_little_endian(dest, dest_len - 8, src_len);
+
+ memcpy(dest + dest_len - 8, &src, sizeof src);
+}
+
+/*******************************************************************//**
+Read geometry data in the MySQL format.
+@return pointer to geometry data */
+static
+const byte*
+row_mysql_read_geometry(
+/*====================*/
+ ulint* len, /*!< out: data length */
+ const byte* ref, /*!< in: geometry data in the
+ MySQL format */
+ ulint col_len) /*!< in: MySQL format length */
+{
+ byte* data;
+ ut_ad(col_len > 8);
+
+ *len = mach_read_from_n_little_endian(ref, col_len - 8);
+
+ memcpy(&data, ref + col_len - 8, sizeof data);
+
+ return(data);
+}
+
+/**************************************************************//**
+Pad a column with spaces. */
+void
+row_mysql_pad_col(
+/*==============*/
+ ulint mbminlen, /*!< in: minimum size of a character,
+ in bytes */
+ byte* pad, /*!< out: padded buffer */
+ ulint len) /*!< in: number of bytes to pad */
+{
+ const byte* pad_end;
+
+ switch (UNIV_EXPECT(mbminlen, 1)) {
+ default:
+ ut_error;
+ case 1:
+ /* space=0x20 */
+ memset(pad, 0x20, len);
+ break;
+ case 2:
+ /* space=0x0020 */
+ pad_end = pad + len;
+ ut_a(!(len % 2));
+ while (pad < pad_end) {
+ *pad++ = 0x00;
+ *pad++ = 0x20;
+ };
+ break;
+ case 4:
+ /* space=0x00000020 */
+ pad_end = pad + len;
+ ut_a(!(len % 4));
+ while (pad < pad_end) {
+ *pad++ = 0x00;
+ *pad++ = 0x00;
+ *pad++ = 0x00;
+ *pad++ = 0x20;
+ }
+ break;
+ }
+}
+
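+/* Example, illustrative only: with mbminlen = 2 (e.g. UCS-2) and len = 6,
+the buffer is padded with three 0x0020 space characters, i.e. the bytes
+00 20 00 20 00 20. */
+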
+/**************************************************************//**
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.cc.
+@return up to which byte we used buf in the conversion */
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+ dfield_t* dfield, /*!< in/out: dfield where dtype
+ information must be already set when
+ this function is called! */
+ byte* buf, /*!< in/out: buffer for a converted
+ integer value; this must be at least
+ col_len long then! NOTE that dfield
+ may also get a pointer to 'buf',
+ therefore do not discard this as long
+ as dfield is used! */
+ ibool row_format_col, /*!< TRUE if the mysql_data is from
+ a MySQL row, FALSE if from a MySQL
+ key value;
+ in MySQL, a true VARCHAR storage
+ format differs in a row and in a
+ key value: in a key value the length
+ is always stored in 2 bytes! */
+ const byte* mysql_data, /*!< in: MySQL column value, not
+ SQL NULL; NOTE that dfield may also
+ get a pointer to mysql_data,
+ therefore do not discard this as long
+ as dfield is used! */
+ ulint col_len, /*!< in: MySQL column length; NOTE that
+ this is the storage length of the
+ column in the MySQL format row, not
+ necessarily the length of the actual
+ payload data; if the column is a true
+ VARCHAR then this is irrelevant */
+ ulint comp) /*!< in: nonzero=compact format */
+{
+ const byte* ptr = mysql_data;
+ const dtype_t* dtype;
+ ulint type;
+ ulint lenlen;
+
+ dtype = dfield_get_type(dfield);
+
+ type = dtype->mtype;
+
+ if (type == DATA_INT) {
+ /* Store integer data in Innobase in a big-endian format,
+ sign bit negated if the data is a signed integer. In MySQL,
+ integers are stored in a little-endian format. */
+
+ byte* p = buf + col_len;
+
+ for (;;) {
+ p--;
+ *p = *mysql_data;
+ if (p == buf) {
+ break;
+ }
+ mysql_data++;
+ }
+
+ if (!(dtype->prtype & DATA_UNSIGNED)) {
+
+ *buf ^= 128;
+ }
+
+ ptr = buf;
+ buf += col_len;
+ } else if ((type == DATA_VARCHAR
+ || type == DATA_VARMYSQL
+ || type == DATA_BINARY)) {
+
+ if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) {
+ /* The length of the actual data is stored in 1 or 2
+ bytes at the start of the field */
+
+ if (row_format_col) {
+ if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) {
+ lenlen = 2;
+ } else {
+ lenlen = 1;
+ }
+ } else {
+ /* In a MySQL key value, lenlen is always 2 */
+ lenlen = 2;
+ }
+
+ ptr = row_mysql_read_true_varchar(&col_len, mysql_data,
+ lenlen);
+ } else {
+ /* Remove trailing spaces from old style VARCHAR
+ columns. */
+
+ /* Handle Unicode strings differently. */
+ ulint mbminlen = dtype_get_mbminlen(dtype);
+
+ ptr = mysql_data;
+
+ switch (mbminlen) {
+ default:
+ ut_error;
+ case 4:
+ /* space=0x00000020 */
+ /* Trim "half-chars", just in case. */
+ col_len &= ~3U;
+
+ while (col_len >= 4
+ && ptr[col_len - 4] == 0x00
+ && ptr[col_len - 3] == 0x00
+ && ptr[col_len - 2] == 0x00
+ && ptr[col_len - 1] == 0x20) {
+ col_len -= 4;
+ }
+ break;
+ case 2:
+ /* space=0x0020 */
+ /* Trim "half-chars", just in case. */
+ col_len &= ~1U;
+
+ while (col_len >= 2 && ptr[col_len - 2] == 0x00
+ && ptr[col_len - 1] == 0x20) {
+ col_len -= 2;
+ }
+ break;
+ case 1:
+ /* space=0x20 */
+ while (col_len > 0
+ && ptr[col_len - 1] == 0x20) {
+ col_len--;
+ }
+ }
+ }
+ } else if (comp && type == DATA_MYSQL
+ && dtype_get_mbminlen(dtype) == 1
+ && dtype_get_mbmaxlen(dtype) > 1) {
+ /* In some cases we strip trailing spaces from UTF-8 and other
+ multibyte charsets, from FIXED-length CHAR columns, to save
+ space. UTF-8 would otherwise normally use 3 * the string length
+ bytes to store an ASCII string! */
+
+ /* We assume that this CHAR field is encoded in a
+ variable-length character set where spaces have
+ 1:1 correspondence to 0x20 bytes, such as UTF-8.
+
+ Consider a CHAR(n) field, a field of n characters.
+ It will contain between n * mbminlen and n * mbmaxlen bytes.
+ We will try to truncate it to n bytes by stripping
+ space padding. If the field contains single-byte
+ characters only, it will be truncated to n characters.
+ Consider a CHAR(5) field containing the string
+ ".a " where "." denotes a 3-byte character represented
+ by the bytes "$%&". After our stripping, the string will
+ be stored as "$%&a " (5 bytes). The string
+ ".abc " will be stored as "$%&abc" (6 bytes).
+
+ The space padding will be restored in row0sel.cc, function
+ row_sel_field_store_in_mysql_format(). */
+
+ ulint n_chars;
+
+ ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype)));
+
+ n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype);
+
+ /* Strip space padding. */
+ while (col_len > n_chars && ptr[col_len - 1] == 0x20) {
+ col_len--;
+ }
+ } else if (!row_format_col) {
+		/* If the mysql data is from a MySQL key value,
+		the length is always stored in 2 bytes, so
+		nothing needs to be done here. */
+ } else if (type == DATA_BLOB) {
+
+ ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len);
+ } else if (DATA_GEOMETRY_MTYPE(type)) {
+ ptr = row_mysql_read_geometry(&col_len, mysql_data, col_len);
+ }
+
+ dfield_set_data(dfield, ptr, col_len);
+
+ return(buf);
+}
+
+/**************************************************************//**
+Convert a row in the MySQL format to a row in the Innobase format. Note that
+the function to convert a MySQL format key value to an InnoDB dtuple is
+row_sel_convert_mysql_key_to_innobase() in row0sel.cc. */
+static
+void
+row_mysql_convert_row_to_innobase(
+/*==============================*/
+ dtuple_t* row, /*!< in/out: Innobase row where the
+ field type information is already
+ copied there! */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct where template
+ must be of type ROW_MYSQL_WHOLE_ROW */
+ const byte* mysql_rec, /*!< in: row in the MySQL format;
+ NOTE: do not discard as long as
+ row is used, as row may contain
+ pointers to this record! */
+ mem_heap_t** blob_heap) /*!< in: FIX_ME, remove this after
+ server fixes its issue */
+{
+ const mysql_row_templ_t*templ;
+ dfield_t* dfield;
+ ulint i;
+ ulint n_col = 0;
+ ulint n_v_col = 0;
+
+ ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+ ut_ad(prebuilt->mysql_template);
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+
+ templ = prebuilt->mysql_template + i;
+
+ if (templ->is_virtual) {
+ ut_ad(n_v_col < dtuple_get_n_v_fields(row));
+ dfield = dtuple_get_nth_v_field(row, n_v_col);
+ n_v_col++;
+ } else {
+ dfield = dtuple_get_nth_field(row, n_col);
+ n_col++;
+ }
+
+ if (templ->mysql_null_bit_mask != 0) {
+ /* Column may be SQL NULL */
+
+ if (mysql_rec[templ->mysql_null_byte_offset]
+ & (byte) (templ->mysql_null_bit_mask)) {
+
+ /* It is SQL NULL */
+
+ dfield_set_null(dfield);
+
+ goto next_column;
+ }
+ }
+
+ row_mysql_store_col_in_innobase_format(
+ dfield,
+ prebuilt->ins_upd_rec_buff + templ->mysql_col_offset,
+ TRUE, /* MySQL row format data */
+ mysql_rec + templ->mysql_col_offset,
+ templ->mysql_col_len,
+ dict_table_is_comp(prebuilt->table));
+
+		/* The server has an issue regarding the handling of BLOB
+		virtual fields, and we need to duplicate the data in our
+		own memory here */
+ if (templ->is_virtual
+ && DATA_LARGE_MTYPE(dfield_get_type(dfield)->mtype)) {
+ if (*blob_heap == NULL) {
+ *blob_heap = mem_heap_create(dfield->len);
+ }
+ dfield_dup(dfield, *blob_heap);
+ }
+next_column:
+ ;
+ }
+
+	/* If there is an FTS doc id column and it is not user supplied
+	(i.e. generated by the server), then assign it a new doc id. */
+ if (!prebuilt->table->fts) {
+ return;
+ }
+
+ ut_a(prebuilt->table->fts->doc_col != ULINT_UNDEFINED);
+
+ doc_id_t doc_id;
+
+ if (!DICT_TF2_FLAG_IS_SET(prebuilt->table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ if (prebuilt->table->fts->cache->first_doc_id
+ == FTS_NULL_DOC_ID) {
+ fts_get_next_doc_id(prebuilt->table, &doc_id);
+ }
+ return;
+ }
+
+ dfield_t* fts_doc_id = dtuple_get_nth_field(
+ row, prebuilt->table->fts->doc_col);
+
+ if (fts_get_next_doc_id(prebuilt->table, &doc_id) == DB_SUCCESS) {
+ ut_a(doc_id != FTS_NULL_DOC_ID);
+ ut_ad(sizeof(doc_id) == fts_doc_id->type.len);
+ dfield_set_data(fts_doc_id, prebuilt->ins_upd_rec_buff
+ + prebuilt->mysql_row_len, 8);
+ fts_write_doc_id(fts_doc_id->data, doc_id);
+ } else {
+ dfield_set_null(fts_doc_id);
+ }
+}
+
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return true if it was a lock wait and we should continue running the
+query thread; in that case, the thr is ALREADY in the running state. */
+bool
+row_mysql_handle_errors(
+/*====================*/
+ dberr_t* new_err,/*!< out: possible new error encountered in
+ lock wait, or if no new error, the value
+ of trx->error_state at the entry of this
+ function */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* thr, /*!< in: query thread, or NULL */
+ trx_savept_t* savept) /*!< in: savepoint, or NULL */
+{
+ dberr_t err;
+
+ DBUG_ENTER("row_mysql_handle_errors");
+ DEBUG_SYNC_C("row_mysql_handle_errors");
+
+handle_new_error:
+ err = trx->error_state;
+
+ ut_a(err != DB_SUCCESS);
+
+ trx->error_state = DB_SUCCESS;
+
+ DBUG_LOG("trx", "handle error: " << ut_strerr(err)
+ << ";id=" << ib::hex(trx->id) << ", " << trx);
+
+ switch (err) {
+ case DB_LOCK_WAIT_TIMEOUT:
+ if (row_rollback_on_timeout) {
+ goto rollback;
+ }
+ /* fall through */
+ case DB_DUPLICATE_KEY:
+ case DB_FOREIGN_DUPLICATE_KEY:
+ case DB_TOO_BIG_RECORD:
+ case DB_UNDO_RECORD_TOO_BIG:
+ case DB_ROW_IS_REFERENCED:
+ case DB_NO_REFERENCED_ROW:
+ case DB_CANNOT_ADD_CONSTRAINT:
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ case DB_OUT_OF_FILE_SPACE:
+ case DB_READ_ONLY:
+ case DB_FTS_INVALID_DOCID:
+ case DB_INTERRUPTED:
+ case DB_CANT_CREATE_GEOMETRY_OBJECT:
+ case DB_TABLE_NOT_FOUND:
+ case DB_DECRYPTION_FAILED:
+ case DB_COMPUTE_VALUE_FAILED:
+ rollback_to_savept:
+ DBUG_EXECUTE_IF("row_mysql_crash_if_error", {
+ log_buffer_flush_to_disk();
+ DBUG_SUICIDE(); });
+ if (savept) {
+ /* Roll back the latest, possibly incomplete insertion
+ or update */
+
+ trx->rollback(savept);
+ }
+ /* MySQL will roll back the latest SQL statement */
+ break;
+ case DB_LOCK_WAIT:
+ lock_wait_suspend_thread(thr);
+
+ if (trx->error_state != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ goto handle_new_error;
+ }
+
+ *new_err = err;
+
+ DBUG_RETURN(true);
+
+ case DB_DEADLOCK:
+ case DB_LOCK_TABLE_FULL:
+ rollback:
+ /* Roll back the whole transaction; this resolution was added
+ to version 3.23.43 */
+
+ trx->rollback();
+ break;
+
+ case DB_MUST_GET_MORE_FILE_SPACE:
+ ib::fatal() << "The database cannot continue operation because"
+ " of lack of space. You must add a new data file"
+ " to my.cnf and restart the database.";
+ break;
+
+ case DB_CORRUPTION:
+ case DB_PAGE_CORRUPTED:
+ ib::error() << "We detected index corruption in an InnoDB type"
+ " table. You have to dump + drop + reimport the"
+ " table or, in a case of widespread corruption,"
+ " dump all InnoDB tables and recreate the whole"
+ " tablespace. If the mysqld server crashes after"
+ " the startup or when you dump the tables. "
+ << FORCE_RECOVERY_MSG;
+ goto rollback_to_savept;
+ case DB_FOREIGN_EXCEED_MAX_CASCADE:
+ ib::error() << "Cannot delete/update rows with cascading"
+ " foreign key constraints that exceed max depth of "
+ << FK_MAX_CASCADE_DEL << ". Please drop excessive"
+ " foreign constraints and try again";
+ goto rollback_to_savept;
+ case DB_UNSUPPORTED:
+		ib::error() << "Cannot delete/update rows with cascading"
+			" foreign key constraints in a timestamp-based temporal"
+			" table. Please drop the offending"
+			" foreign key constraints and try again";
+ goto rollback_to_savept;
+ default:
+ ib::fatal() << "Unknown error " << err;
+ }
+
+ if (trx->error_state != DB_SUCCESS) {
+ *new_err = trx->error_state;
+ } else {
+ *new_err = err;
+ }
+
+ trx->error_state = DB_SUCCESS;
+
+ DBUG_RETURN(false);
+}
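+
+/* A minimal sketch of the typical caller pattern for
+row_mysql_handle_errors(); the real call sites appear later in this
+file (e.g. row_lock_table(), row_insert_for_mysql()):
+
+run_again:
+	row_upd_step(thr);
+	err = trx->error_state;
+	if (err != DB_SUCCESS) {
+		que_thr_stop_for_mysql(thr);
+		if (row_mysql_handle_errors(&err, trx, thr, &savept)) {
+			goto run_again;	// lock wait resolved; thr is running again
+		}
+		return(err);	// genuine error; statement or trx rolled back
+	}
+*/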
+
+/********************************************************************//**
+Create a prebuilt struct for a MySQL table handle.
+@return own: a prebuilt struct */
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ dict_table_t* table, /*!< in: Innobase table handle */
+ ulint mysql_row_len) /*!< in: length in bytes of a row in
+ the MySQL format */
+{
+ DBUG_ENTER("row_create_prebuilt");
+
+ row_prebuilt_t* prebuilt;
+ mem_heap_t* heap;
+ dict_index_t* clust_index;
+ dict_index_t* temp_index;
+ dtuple_t* ref;
+ ulint ref_len;
+ uint srch_key_len = 0;
+ ulint search_tuple_n_fields;
+
+ search_tuple_n_fields = 2 * (dict_table_get_n_cols(table)
+ + dict_table_get_n_v_cols(table));
+
+ clust_index = dict_table_get_first_index(table);
+
+ /* Make sure that search_tuple is long enough for clustered index */
+ ut_a(2 * unsigned(table->n_cols) >= unsigned(clust_index->n_fields)
+ - clust_index->table->n_dropped());
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+
+	/* Maximum size of the buffer needed for conversion of INTs from
+	little endian format to big endian format in an index. An index
+	can have at most 16 columns (MAX_REF_PARTS) in it. Therefore the
+	maximum size for a PK is 16 * 8 bytes (BIGINT's size) = 128 bytes,
+	and for a secondary index it is 16 * 8 bytes + PK = 256 bytes. */
+#define MAX_SRCH_KEY_VAL_BUFFER	(2 * (8 * MAX_REF_PARTS))
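+
+/* Sanity check of the arithmetic above: MAX_SRCH_KEY_VAL_BUFFER expands
+to 2 * (8 * 16) = 256 bytes, the secondary-index maximum; srch_key_len,
+computed below, is asserted not to exceed this value. */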
+
+#define PREBUILT_HEAP_INITIAL_SIZE \
+ ( \
+ sizeof(*prebuilt) \
+ /* allocd in this function */ \
+ + DTUPLE_EST_ALLOC(search_tuple_n_fields) \
+ + DTUPLE_EST_ALLOC(ref_len) \
+ /* allocd in row_prebuild_sel_graph() */ \
+ + sizeof(sel_node_t) \
+ + sizeof(que_fork_t) \
+ + sizeof(que_thr_t) \
+ /* allocd in row_get_prebuilt_update_vector() */ \
+ + sizeof(upd_node_t) \
+ + sizeof(upd_t) \
+ + sizeof(upd_field_t) \
+ * dict_table_get_n_cols(table) \
+ + sizeof(que_fork_t) \
+ + sizeof(que_thr_t) \
+ /* allocd in row_get_prebuilt_insert_row() */ \
+ + sizeof(ins_node_t) \
+ /* mysql_row_len could be huge and we are not \
+ sure if this prebuilt instance is going to be \
+ used in inserts */ \
+ + (mysql_row_len < 256 ? mysql_row_len : 0) \
+ + DTUPLE_EST_ALLOC(dict_table_get_n_cols(table) \
+ + dict_table_get_n_v_cols(table)) \
+ + sizeof(que_fork_t) \
+ + sizeof(que_thr_t) \
+ + sizeof(*prebuilt->pcur) \
+ + sizeof(*prebuilt->clust_pcur) \
+ )
+
+ /* Calculate size of key buffer used to store search key in
+ InnoDB format. MySQL stores INTs in little endian format and
+ InnoDB stores INTs in big endian format with the sign bit
+ flipped. All other field types are stored/compared the same
+ in MySQL and InnoDB, so we must create a buffer containing
+	the INT key parts in InnoDB format. We need two such buffers
+ since both start and end keys are used in records_in_range(). */
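+
+	/* For example (a sketch): if the only index is
+	PRIMARY KEY(a BIGINT, b INT), the loop below computes
+	srch_key_len = 8 + 4 = 12 bytes; twice that amount is
+	later allocated so that srch_key_val1 and srch_key_val2
+	can hold both a start key and an end key. */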
+
+ for (temp_index = dict_table_get_first_index(table); temp_index;
+ temp_index = dict_table_get_next_index(temp_index)) {
+ DBUG_EXECUTE_IF("innodb_srch_key_buffer_max_value",
+ ut_a(temp_index->n_user_defined_cols
+ == MAX_REF_PARTS););
+ uint temp_len = 0;
+ for (uint i = 0; i < temp_index->n_uniq; i++) {
+ ulint type = temp_index->fields[i].col->mtype;
+ if (type == DATA_INT) {
+ temp_len +=
+ temp_index->fields[i].fixed_len;
+ }
+ }
+		srch_key_len = std::max(srch_key_len, temp_len);
+ }
+
+ ut_a(srch_key_len <= MAX_SRCH_KEY_VAL_BUFFER);
+
+ DBUG_EXECUTE_IF("innodb_srch_key_buffer_max_value",
+ ut_a(srch_key_len == MAX_SRCH_KEY_VAL_BUFFER););
+
+ /* We allocate enough space for the objects that are likely to
+ be created later in order to minimize the number of malloc()
+ calls */
+ heap = mem_heap_create(PREBUILT_HEAP_INITIAL_SIZE + 2 * srch_key_len);
+
+ prebuilt = static_cast<row_prebuilt_t*>(
+ mem_heap_zalloc(heap, sizeof(*prebuilt)));
+
+ prebuilt->magic_n = ROW_PREBUILT_ALLOCATED;
+ prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED;
+
+ prebuilt->table = table;
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->heap = heap;
+
+ prebuilt->srch_key_val_len = srch_key_len;
+ if (prebuilt->srch_key_val_len) {
+ prebuilt->srch_key_val1 = static_cast<byte*>(
+ mem_heap_alloc(prebuilt->heap,
+ 2 * prebuilt->srch_key_val_len));
+ prebuilt->srch_key_val2 = prebuilt->srch_key_val1 +
+ prebuilt->srch_key_val_len;
+ } else {
+ prebuilt->srch_key_val1 = NULL;
+ prebuilt->srch_key_val2 = NULL;
+ }
+
+ prebuilt->pcur = static_cast<btr_pcur_t*>(
+ mem_heap_zalloc(prebuilt->heap,
+ sizeof(btr_pcur_t)));
+ prebuilt->clust_pcur = static_cast<btr_pcur_t*>(
+ mem_heap_zalloc(prebuilt->heap,
+ sizeof(btr_pcur_t)));
+ btr_pcur_reset(prebuilt->pcur);
+ btr_pcur_reset(prebuilt->clust_pcur);
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE_UNSET;
+
+ prebuilt->search_tuple = dtuple_create(heap, search_tuple_n_fields);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ prebuilt->clust_ref = ref;
+
+ prebuilt->autoinc_error = DB_SUCCESS;
+ prebuilt->autoinc_offset = 0;
+
+ /* Default to 1, we will set the actual value later in
+ ha_innobase::get_auto_increment(). */
+ prebuilt->autoinc_increment = 1;
+
+ prebuilt->autoinc_last_value = 0;
+
+ /* During UPDATE and DELETE we need the doc id. */
+ prebuilt->fts_doc_id = 0;
+
+ prebuilt->mysql_row_len = mysql_row_len;
+
+ prebuilt->fts_doc_id_in_read_set = 0;
+ prebuilt->blob_heap = NULL;
+
+ DBUG_RETURN(prebuilt);
+}
+
+/********************************************************************//**
+Free a prebuilt struct for a MySQL table handle. */
+void
+row_prebuilt_free(
+/*==============*/
+ row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */
+ ibool dict_locked) /*!< in: TRUE=data dictionary locked */
+{
+ DBUG_ENTER("row_prebuilt_free");
+
+ ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+ ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
+
+ prebuilt->magic_n = ROW_PREBUILT_FREED;
+ prebuilt->magic_n2 = ROW_PREBUILT_FREED;
+
+ btr_pcur_reset(prebuilt->pcur);
+ btr_pcur_reset(prebuilt->clust_pcur);
+
+ ut_free(prebuilt->mysql_template);
+
+ if (prebuilt->ins_graph) {
+ que_graph_free_recursive(prebuilt->ins_graph);
+ }
+
+ if (prebuilt->sel_graph) {
+ que_graph_free_recursive(prebuilt->sel_graph);
+ }
+
+ if (prebuilt->upd_graph) {
+ que_graph_free_recursive(prebuilt->upd_graph);
+ }
+
+ if (prebuilt->blob_heap) {
+ row_mysql_prebuilt_free_blob_heap(prebuilt);
+ }
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_free(prebuilt->old_vers_heap);
+ }
+
+ if (prebuilt->fetch_cache[0] != NULL) {
+ byte* base = prebuilt->fetch_cache[0] - 4;
+ byte* ptr = base;
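+
+		/* The fetch cache rows live in a single allocation with
+		guard words around each row; per row i the layout is
+		[4-byte ROW_PREBUILT_FETCH_MAGIC_N][mysql_row_len bytes]
+		[4-byte ROW_PREBUILT_FETCH_MAGIC_N]. The loop below
+		verifies every magic word before freeing the block, to
+		catch buffer overruns. */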
+
+ for (ulint i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+ ulint magic1 = mach_read_from_4(ptr);
+ ut_a(magic1 == ROW_PREBUILT_FETCH_MAGIC_N);
+ ptr += 4;
+
+ byte* row = ptr;
+ ut_a(row == prebuilt->fetch_cache[i]);
+ ptr += prebuilt->mysql_row_len;
+
+ ulint magic2 = mach_read_from_4(ptr);
+ ut_a(magic2 == ROW_PREBUILT_FETCH_MAGIC_N);
+ ptr += 4;
+ }
+
+ ut_free(base);
+ }
+
+ if (prebuilt->rtr_info) {
+ rtr_clean_rtr_info(prebuilt->rtr_info, true);
+ }
+ if (prebuilt->table) {
+ dict_table_close(prebuilt->table, dict_locked, FALSE);
+ }
+
+ mem_heap_free(prebuilt->heap);
+
+ DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+void
+row_update_prebuilt_trx(
+/*====================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
+ in MySQL handle */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+ ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+ ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
+
+ prebuilt->trx = trx;
+
+ if (prebuilt->ins_graph) {
+ prebuilt->ins_graph->trx = trx;
+ }
+
+ if (prebuilt->upd_graph) {
+ prebuilt->upd_graph->trx = trx;
+ }
+
+ if (prebuilt->sel_graph) {
+ prebuilt->sel_graph->trx = trx;
+ }
+}
+
+/*********************************************************************//**
+Gets pointer to a prebuilt dtuple used in insertions. If the insert graph
+has not yet been built in the prebuilt struct, then this function first
+builds it.
+@return prebuilt dtuple; the column type information is also set in it */
+static
+dtuple_t*
+row_get_prebuilt_insert_row(
+/*========================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ dict_table_t* table = prebuilt->table;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->ins_node != 0) {
+
+ /* Check if indexes have been dropped or added and we
+ may need to rebuild the row insert template. */
+
+ if (prebuilt->trx_id == table->def_trx_id
+ && prebuilt->ins_node->entry_list.size()
+ == UT_LIST_GET_LEN(table->indexes)) {
+
+ return(prebuilt->ins_node->row);
+ }
+
+ ut_ad(prebuilt->trx_id < table->def_trx_id);
+
+ que_graph_free_recursive(prebuilt->ins_graph);
+
+ prebuilt->ins_graph = 0;
+ }
+
+ /* Create an insert node and query graph to the prebuilt struct */
+
+ ins_node_t* node;
+
+ node = ins_node_create(INS_DIRECT, table, prebuilt->heap);
+
+ prebuilt->ins_node = node;
+
+ if (prebuilt->ins_upd_rec_buff == 0) {
+ prebuilt->ins_upd_rec_buff = static_cast<byte*>(
+ mem_heap_alloc(
+ prebuilt->heap,
+ DICT_TF2_FLAG_IS_SET(prebuilt->table,
+ DICT_TF2_FTS_HAS_DOC_ID)
+ ? prebuilt->mysql_row_len + 8/* FTS_DOC_ID */
+ : prebuilt->mysql_row_len));
+ }
+
+ dtuple_t* row;
+
+ row = dtuple_create_with_vcol(
+ prebuilt->heap, dict_table_get_n_cols(table),
+ dict_table_get_n_v_cols(table));
+
+ dict_table_copy_types(row, table);
+
+ ins_node_set_new_row(node, row);
+
+ prebuilt->ins_graph = static_cast<que_fork_t*>(
+ que_node_get_parent(
+ pars_complete_graph_for_exec(
+ node,
+ prebuilt->trx, prebuilt->heap, prebuilt)));
+
+ prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
+
+ prebuilt->trx_id = table->def_trx_id;
+
+ return(prebuilt->ins_node->row);
+}
+
+/*********************************************************************//**
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table.
+@return error code or DB_SUCCESS */
+dberr_t
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL
+ table handle */
+{
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+ const dict_table_t* table = prebuilt->table;
+ que_thr_t* thr;
+ dberr_t err;
+ ibool was_lock_wait;
+
+ /* If we already hold an AUTOINC lock on the table then do nothing.
+ Note: We peek at the value of the current owner without acquiring
+ the lock mutex. */
+ if (trx == table->autoinc_trx) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx->op_info = "setting auto-inc lock";
+
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+
+ /* We use the insert query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+ thr->start_running();
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started_xa(trx, true);
+
+ err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr);
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return(err);
+ }
+
+ thr->stop_no_error();
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/** Lock a table.
+@param[in,out] prebuilt table handle
+@return error code or DB_SUCCESS */
+dberr_t
+row_lock_table(row_prebuilt_t* prebuilt)
+{
+ trx_t* trx = prebuilt->trx;
+ que_thr_t* thr;
+ dberr_t err;
+ ibool was_lock_wait;
+
+ trx->op_info = "setting table lock";
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+
+ /* We use the select query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ thr->start_running();
+
+run_again:
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started_xa(trx, false);
+
+ err = lock_table(0, prebuilt->table,
+ static_cast<enum lock_mode>(
+ prebuilt->select_lock_type),
+ thr);
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return(err);
+ }
+
+ thr->stop_no_error();
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/** Determine whether the tablespace is encrypted but decryption failed,
+the table is corrupted, or the tablespace .ibd file is missing.
+@param[in] table Table
+@param[in] trx Transaction
+@param[in] push_warning true if we should push warning to user
+@retval DB_DECRYPTION_FAILED table is encrypted but decryption failed
+@retval DB_CORRUPTION table is corrupted
+@retval DB_TABLESPACE_NOT_FOUND tablespace .ibd file not found */
+static
+dberr_t
+row_mysql_get_table_status(
+ const dict_table_t* table,
+ trx_t* trx,
+ bool push_warning = true)
+{
+ dberr_t err;
+ if (const fil_space_t* space = table->space) {
+ if (space->crypt_data && space->crypt_data->is_encrypted()) {
+ // maybe we cannot access the table due to failing
+ // to decrypt
+ if (push_warning) {
+ ib_push_warning(trx, DB_DECRYPTION_FAILED,
+ "Table %s in tablespace %lu encrypted."
+ "However key management plugin or used key_id is not found or"
+ " used encryption algorithm or method does not match.",
+ table->name.m_name, table->space);
+ }
+
+ err = DB_DECRYPTION_FAILED;
+ } else {
+ if (push_warning) {
+ ib_push_warning(trx, DB_CORRUPTION,
+ "Table %s in tablespace %lu corrupted.",
+ table->name.m_name, table->space);
+ }
+
+ err = DB_CORRUPTION;
+ }
+ } else {
+ ib::error() << ".ibd file is missing for table "
+ << table->name;
+ err = DB_TABLESPACE_NOT_FOUND;
+ }
+
+ return(err);
+}
+
+/** Does an insert for MySQL.
+@param[in] mysql_rec row in the MySQL format
+@param[in,out] prebuilt prebuilt struct in MySQL handle
+@return error code or DB_SUCCESS */
+dberr_t
+row_insert_for_mysql(
+ const byte* mysql_rec,
+ row_prebuilt_t* prebuilt,
+ ins_mode_t ins_mode)
+{
+ trx_savept_t savept;
+ que_thr_t* thr;
+ dberr_t err;
+ ibool was_lock_wait;
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+ dict_table_t* table = prebuilt->table;
+
+	/* FIX_ME: This blob heap is used to compensate for an issue in the
+	server regarding virtual column BLOB handling */
+ mem_heap_t* blob_heap = NULL;
+
+ ut_ad(trx);
+ ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+ ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
+
+ if (!prebuilt->table->space) {
+
+ ib::error() << "The table " << prebuilt->table->name
+ << " doesn't have a corresponding tablespace, it was"
+ " discarded.";
+
+ return(DB_TABLESPACE_DELETED);
+
+ } else if (!prebuilt->table->is_readable()) {
+ return(row_mysql_get_table_status(prebuilt->table, trx, true));
+ } else if (high_level_read_only) {
+ return(DB_READ_ONLY);
+ }
+
+ DBUG_EXECUTE_IF("mark_table_corrupted", {
+ /* Mark the table corrupted for the clustered index */
+ dict_index_t* index = dict_table_get_first_index(table);
+ ut_ad(dict_index_is_clust(index));
+ dict_set_corrupted(index, trx, "INSERT TABLE"); });
+
+ if (dict_table_is_corrupted(table)) {
+
+ ib::error() << "Table " << table->name << " is corrupt.";
+ return(DB_TABLE_CORRUPT);
+ }
+
+ trx->op_info = "inserting";
+
+ row_mysql_delay_if_needed();
+
+ if (!table->no_rollback()) {
+ trx_start_if_not_started_xa(trx, true);
+ }
+
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+
+ row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec,
+ &blob_heap);
+
+ if (ins_mode != ROW_INS_NORMAL) {
+ node->vers_update_end(prebuilt, ins_mode == ROW_INS_HISTORICAL);
+ }
+
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+ if (prebuilt->sql_stat_start) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+ node->state = INS_NODE_ALLOC_ROW_ID;
+ }
+
+ thr->start_running();
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_ins_step(thr);
+
+ DEBUG_SYNC_C("ib_after_row_insert_step");
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+error_exit:
+ que_thr_stop_for_mysql(thr);
+
+ /* FIXME: What's this ? */
+ thr->lock_state = QUE_THR_LOCK_ROW;
+
+ was_lock_wait = row_mysql_handle_errors(
+ &err, trx, thr, &savept);
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+ if (was_lock_wait) {
+ ut_ad(node->state == INS_NODE_INSERT_ENTRIES
+ || node->state == INS_NODE_ALLOC_ROW_ID);
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ if (blob_heap != NULL) {
+ mem_heap_free(blob_heap);
+ }
+
+ return(err);
+ }
+
+ if (dict_table_has_fts_index(table)) {
+ doc_id_t doc_id;
+
+ /* Extract the doc id from the hidden FTS column */
+ doc_id = fts_get_doc_id_from_row(table, node->row);
+
+ if (doc_id <= 0) {
+ ib::error() << "FTS_DOC_ID must be larger than 0 for table "
+ << table->name;
+ err = DB_FTS_INVALID_DOCID;
+ trx->error_state = DB_FTS_INVALID_DOCID;
+ goto error_exit;
+ }
+
+ if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ doc_id_t next_doc_id
+ = table->fts->cache->next_doc_id;
+
+ if (doc_id < next_doc_id) {
+ ib::error() << "FTS_DOC_ID must be larger than "
+ << next_doc_id - 1 << " for table "
+ << table->name;
+
+ err = DB_FTS_INVALID_DOCID;
+ trx->error_state = DB_FTS_INVALID_DOCID;
+ goto error_exit;
+ }
+
+			/* The difference between Doc IDs is restricted to a
+			4-byte integer; see fts_get_encoded_len(). The
+			difference between consecutive doc_ids must not
+			exceed FTS_DOC_ID_MAX_STEP. */
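+
+			/* For example (a sketch, assuming the usual value
+			FTS_DOC_ID_MAX_STEP = 65535): if next_doc_id is 100,
+			a user-supplied FTS_DOC_ID of 65635 or more is
+			rejected by the check below. */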
+
+ if (doc_id - next_doc_id >= FTS_DOC_ID_MAX_STEP) {
+ ib::error() << "Doc ID " << doc_id
+ << " is too big. Its difference with"
+ " largest used Doc ID "
+ << next_doc_id - 1 << " cannot"
+ " exceed or equal to "
+ << FTS_DOC_ID_MAX_STEP;
+ err = DB_FTS_INVALID_DOCID;
+ trx->error_state = DB_FTS_INVALID_DOCID;
+ goto error_exit;
+ }
+ }
+
+ if (table->skip_alter_undo) {
+ if (trx->fts_trx == NULL) {
+ trx->fts_trx = fts_trx_create(trx);
+ }
+
+ fts_trx_table_t ftt;
+ ftt.table = table;
+ ftt.fts_trx = trx->fts_trx;
+
+ fts_add_doc_from_tuple(&ftt, doc_id, node->row);
+ } else {
+ /* Pass NULL for the columns affected, since an INSERT affects
+ all FTS indexes. */
+ fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL);
+ }
+ }
+
+ thr->stop_no_error();
+
+ if (table->is_system_db) {
+ srv_stats.n_system_rows_inserted.inc(size_t(trx->id));
+ } else {
+ srv_stats.n_rows_inserted.inc(size_t(trx->id));
+ }
+
+	/* Not protected by dict_sys.mutex for performance
+	reasons; we would rather get garbage in stat_n_rows
+	(which is just an estimate anyway) than protect the
+	following code with a latch. */
+ dict_table_n_rows_inc(table);
+
+ if (prebuilt->clust_index_was_generated) {
+ /* set row id to prebuilt */
+ memcpy(prebuilt->row_id, node->sys_buf, DATA_ROW_ID_LEN);
+ }
+
+ dict_stats_update_if_needed(table, *trx);
+ trx->op_info = "";
+
+ if (blob_heap != NULL) {
+ mem_heap_free(blob_heap);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Builds a dummy query graph used in selects. */
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ sel_node_t* node;
+
+ ut_ad(prebuilt && prebuilt->trx);
+
+ if (prebuilt->sel_graph == NULL) {
+
+ node = sel_node_create(prebuilt->heap);
+
+ prebuilt->sel_graph = static_cast<que_fork_t*>(
+ que_node_get_parent(
+ pars_complete_graph_for_exec(
+ static_cast<sel_node_t*>(node),
+ prebuilt->trx, prebuilt->heap,
+ prebuilt)));
+
+ prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
+ }
+}
+
+/*********************************************************************//**
+Creates a query graph node of 'update' type to be used in the MySQL
+interface.
+@return own: update node */
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+ dict_table_t* table, /*!< in: table to update */
+ mem_heap_t* heap) /*!< in: mem heap from which allocated */
+{
+ upd_node_t* node;
+
+ DBUG_ENTER("row_create_update_node_for_mysql");
+
+ node = upd_node_create(heap);
+
+ node->in_mysql_interface = true;
+ node->is_delete = NO_DELETE;
+ node->searched_update = FALSE;
+ node->select = NULL;
+ node->pcur = btr_pcur_create_for_mysql();
+
+ DBUG_PRINT("info", ("node: %p, pcur: %p", node, node->pcur));
+
+ node->table = table;
+
+ node->update = upd_create(dict_table_get_n_cols(table)
+ + dict_table_get_n_v_cols(table), heap);
+
+ node->update_n_fields = dict_table_get_n_cols(table);
+
+ UT_LIST_INIT(node->columns, &sym_node_t::col_var_list);
+
+ node->has_clust_rec_x_lock = TRUE;
+ node->cmpl_info = 0;
+
+ node->table_sym = NULL;
+ node->col_assign_list = NULL;
+
+ DBUG_RETURN(node);
+}
+
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it.
+@return prebuilt update vector */
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ if (prebuilt->upd_node == NULL) {
+
+ /* Not called before for this handle: create an update node
+ and query graph to the prebuilt struct */
+
+ prebuilt->upd_node = row_create_update_node_for_mysql(
+ prebuilt->table, prebuilt->heap);
+
+ prebuilt->upd_graph = static_cast<que_fork_t*>(
+ que_node_get_parent(
+ pars_complete_graph_for_exec(
+ prebuilt->upd_node,
+ prebuilt->trx, prebuilt->heap,
+ prebuilt)));
+
+ prebuilt->upd_graph->state = QUE_FORK_ACTIVE;
+ }
+
+ return(prebuilt->upd_node->update);
+}
+
+/********************************************************************
+Handle an update of a column that has an FTS index. */
+static
+void
+row_fts_do_update(
+/*==============*/
+ trx_t* trx, /* in: transaction */
+ dict_table_t* table, /* in: Table with FTS index */
+ doc_id_t old_doc_id, /* in: old document id */
+ doc_id_t new_doc_id) /* in: new document id */
+{
+ if(trx->fts_next_doc_id) {
+ fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL);
+ if(new_doc_id != FTS_NULL_DOC_ID)
+ fts_trx_add_op(trx, table, new_doc_id, FTS_INSERT, NULL);
+ }
+}
+
+/************************************************************************
+Handles FTS matters for an update or a delete.
+NOTE: should not be called if the table does not have an FTS index. */
+static
+dberr_t
+row_fts_update_or_delete(
+/*=====================*/
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ trx_t* trx = prebuilt->trx;
+ dict_table_t* table = prebuilt->table;
+ upd_node_t* node = prebuilt->upd_node;
+ doc_id_t old_doc_id = prebuilt->fts_doc_id;
+
+ DBUG_ENTER("row_fts_update_or_delete");
+
+ ut_a(dict_table_has_fts_index(prebuilt->table));
+
+ /* Deletes are simple; get them out of the way first. */
+ if (node->is_delete == PLAIN_DELETE) {
+ /* A delete affects all FTS indexes, so we pass NULL */
+ fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL);
+ } else {
+ doc_id_t new_doc_id;
+ new_doc_id = fts_read_doc_id((byte*) &trx->fts_next_doc_id);
+
+ if (new_doc_id == 0) {
+ ib::error() << "InnoDB FTS: Doc ID cannot be 0";
+			DBUG_RETURN(DB_FTS_INVALID_DOCID);
+ }
+ row_fts_do_update(trx, table, old_doc_id, new_doc_id);
+ }
+
+ DBUG_RETURN(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Initialize the Doc ID system for FK table with FTS index */
+static
+void
+init_fts_doc_id_for_ref(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+	ulint*		depth)	/*!< in: recursive call depth */
+{
+ dict_foreign_t* foreign;
+
+ table->fk_max_recusive_level = 0;
+
+ (*depth)++;
+
+ /* Limit on tables involved in cascading delete/update */
+ if (*depth > FK_MAX_CASCADE_DEL) {
+ return;
+ }
+
+ /* Loop through this table's referenced list and also
+ recursively traverse each table's foreign table list */
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ ut_ad(foreign->foreign_table != NULL);
+
+ if (foreign->foreign_table->fts != NULL) {
+ fts_init_doc_id(foreign->foreign_table);
+ }
+
+ if (!foreign->foreign_table->referenced_set.empty()
+ && foreign->foreign_table != table) {
+ init_fts_doc_id_for_ref(
+ foreign->foreign_table, depth);
+ }
+ }
+}
+
+/** Does an update or delete of a row for MySQL.
+@param[in,out] prebuilt prebuilt struct in MySQL handle
+@return error code or DB_SUCCESS */
+dberr_t
+row_update_for_mysql(row_prebuilt_t* prebuilt)
+{
+ trx_savept_t savept;
+ dberr_t err;
+ que_thr_t* thr;
+ dict_index_t* clust_index;
+ upd_node_t* node;
+ dict_table_t* table = prebuilt->table;
+ trx_t* trx = prebuilt->trx;
+ ulint fk_depth = 0;
+ bool got_s_lock = false;
+
+ DBUG_ENTER("row_update_for_mysql");
+
+ ut_ad(trx);
+ ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+ ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
+ ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+ ut_ad(table->stat_initialized);
+
+ if (!table->is_readable()) {
+ return(row_mysql_get_table_status(table, trx, true));
+ }
+
+ if (high_level_read_only) {
+ return(DB_READ_ONLY);
+ }
+
+ DEBUG_SYNC_C("innodb_row_update_for_mysql_begin");
+
+ trx->op_info = "updating or deleting";
+
+ row_mysql_delay_if_needed();
+
+ init_fts_doc_id_for_ref(table, &fk_depth);
+
+ if (!table->no_rollback()) {
+ trx_start_if_not_started_xa(trx, true);
+ }
+
+ if (dict_table_is_referenced_by_foreign_key(table)) {
+		/* Share lock the data dictionary to prevent any
+		table dictionary (for foreign constraint) change.
+		This is similar to the row_ins_check_foreign_constraint
+		check, which is protected by the dictionary lock as well.
+		In the future, this can be removed once the foreign
+		key MDL is implemented. */
+ row_mysql_freeze_data_dictionary(trx);
+ init_fts_doc_id_for_ref(table, &fk_depth);
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ node = prebuilt->upd_node;
+ const bool is_delete = node->is_delete == PLAIN_DELETE;
+ ut_ad(node->table == table);
+
+ clust_index = dict_table_get_first_index(table);
+
+ btr_pcur_copy_stored_position(node->pcur,
+ prebuilt->pcur->btr_cur.index
+ == clust_index
+ ? prebuilt->pcur
+ : prebuilt->clust_pcur);
+
+ ut_a(node->pcur->rel_pos == BTR_PCUR_ON);
+
+ /* MySQL seems to call rnd_pos before updating each row it
+ has cached: we can get the correct cursor position from
+ prebuilt->pcur; NOTE that we cannot build the row reference
+ from mysql_rec if the clustered index was automatically
+ generated for the table: MySQL does not know anything about
+ the row id used as the clustered index key */
+
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->upd_graph);
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ ut_ad(!prebuilt->sql_stat_start);
+
+ thr->start_running();
+
+ ut_ad(!prebuilt->versioned_write || node->table->versioned());
+
+ if (prebuilt->versioned_write) {
+ if (node->is_delete == VERSIONED_DELETE) {
+ node->vers_make_delete(trx);
+ } else if (node->update->affects_versioned()) {
+ node->vers_make_update(trx);
+ }
+ }
+
+ for (;;) {
+ thr->run_node = node;
+ thr->prev_node = node;
+ thr->fk_cascade_depth = 0;
+
+ row_upd_step(thr);
+
+ err = trx->error_state;
+
+ if (err == DB_SUCCESS) {
+ break;
+ }
+
+ que_thr_stop_for_mysql(thr);
+
+ if (err == DB_RECORD_NOT_FOUND) {
+ trx->error_state = DB_SUCCESS;
+ goto error;
+ }
+
+ thr->lock_state= QUE_THR_LOCK_ROW;
+
+ DEBUG_SYNC(trx->mysql_thd, "row_update_for_mysql_error");
+
+ bool was_lock_wait = row_mysql_handle_errors(
+ &err, trx, thr, &savept);
+ thr->lock_state= QUE_THR_LOCK_NOLOCK;
+
+ if (!was_lock_wait) {
+ goto error;
+ }
+ }
+
+ thr->stop_no_error();
+
+ if (dict_table_has_fts_index(table)
+ && trx->fts_next_doc_id != UINT64_UNDEFINED) {
+ err = row_fts_update_or_delete(prebuilt);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ ut_ad("unexpected error" == 0);
+ goto error;
+ }
+ }
+
+ /* Completed cascading operations (if any) */
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ bool update_statistics;
+ ut_ad(is_delete == (node->is_delete == PLAIN_DELETE));
+
+ if (is_delete) {
+		/* Not protected by dict_sys.mutex for performance
+		reasons; we would rather get garbage in stat_n_rows (which is
+		just an estimate anyway) than protect the following code
+		with a latch. */
+ dict_table_n_rows_dec(prebuilt->table);
+
+ if (table->is_system_db) {
+ srv_stats.n_system_rows_deleted.inc(size_t(trx->id));
+ } else {
+ srv_stats.n_rows_deleted.inc(size_t(trx->id));
+ }
+
+ update_statistics = !srv_stats_include_delete_marked;
+ } else {
+ if (table->is_system_db) {
+ srv_stats.n_system_rows_updated.inc(size_t(trx->id));
+ } else {
+ srv_stats.n_rows_updated.inc(size_t(trx->id));
+ }
+
+ update_statistics
+ = !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE);
+ }
+
+ if (update_statistics) {
+ dict_stats_update_if_needed(prebuilt->table, *trx);
+ } else {
+ /* Always update the table modification counter. */
+ prebuilt->table->stat_modified_counter++;
+ }
+
+ trx->op_info = "";
+
+ DBUG_RETURN(err);
+
+error:
+ trx->op_info = "";
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ DBUG_RETURN(err);
+}
+
+/** This can only be used when the current transaction is at
+READ COMMITTED or READ UNCOMMITTED isolation level.
+Before calling this function row_search_for_mysql() must have
+initialized prebuilt->new_rec_locks to store the information which new
+record locks really were set. This function removes a newly set
+clustered index record lock under prebuilt->pcur or
+prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that
+releases the latest clustered index record lock we set.
+@param[in,out] prebuilt prebuilt struct in MySQL handle
+@param[in] has_latches_on_recs TRUE if called so that we have the
+ latches on the records under pcur
+ and clust_pcur, and we do not need
+ to reposition the cursors. */
+void
+row_unlock_for_mysql(
+ row_prebuilt_t* prebuilt,
+ ibool has_latches_on_recs)
+{
+ btr_pcur_t* pcur = prebuilt->pcur;
+ btr_pcur_t* clust_pcur = prebuilt->clust_pcur;
+ trx_t* trx = prebuilt->trx;
+
+ ut_ad(prebuilt != NULL);
+ ut_ad(trx != NULL);
+ ut_ad(trx->isolation_level <= TRX_ISO_READ_COMMITTED);
+
+ if (dict_index_is_spatial(prebuilt->index)) {
+ return;
+ }
+
+ trx->op_info = "unlock_row";
+
+ if (prebuilt->new_rec_locks >= 1) {
+
+ const rec_t* rec;
+ dict_index_t* index;
+ trx_id_t rec_trx_id;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ /* Restore the cursor position and find the record */
+
+ if (!has_latches_on_recs) {
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr);
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ index = btr_pcur_get_btr_cur(pcur)->index;
+
+ if (prebuilt->new_rec_locks >= 2) {
+ /* Restore the cursor position and find the record
+ in the clustered index. */
+
+ if (!has_latches_on_recs) {
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ clust_pcur, &mtr);
+ }
+
+ rec = btr_pcur_get_rec(clust_pcur);
+ index = btr_pcur_get_btr_cur(clust_pcur)->index;
+ }
+
+ if (!dict_index_is_clust(index)) {
+ /* This is not a clustered index record. We
+ do not know how to unlock the record. */
+ goto no_unlock;
+ }
+
+ /* If the record has been modified by this
+ transaction, do not unlock it. */
+
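+		/* DB_TRX_ID is at a fixed byte offset in the record
+		when all fields preceding it have a fixed length
+		(index->trx_id_offset != 0); otherwise the record
+		offsets must first be computed to locate it. */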
+ if (index->trx_id_offset) {
+ rec_trx_id = trx_read_trx_id(rec
+ + index->trx_id_offset);
+ } else {
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+ offsets = rec_get_offsets(rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ if (rec_trx_id != trx->id) {
+ /* We did not update the record: unlock it */
+
+ rec = btr_pcur_get_rec(pcur);
+
+ lock_rec_unlock(
+ trx,
+ btr_pcur_get_block(pcur),
+ rec,
+ static_cast<enum lock_mode>(
+ prebuilt->select_lock_type));
+
+ if (prebuilt->new_rec_locks >= 2) {
+ rec = btr_pcur_get_rec(clust_pcur);
+
+ lock_rec_unlock(
+ trx,
+ btr_pcur_get_block(clust_pcur),
+ rec,
+ static_cast<enum lock_mode>(
+ prebuilt->select_lock_type));
+ }
+ }
+no_unlock:
+ mtr_commit(&mtr);
+ }
+
+ trx->op_info = "";
+}
+
+/*********************************************************************//**
+Locks the data dictionary in shared mode to prevent modifications, for
+performing a foreign key check, rollback, or another operation invisible
+to MySQL. */
+void
+row_mysql_freeze_data_dictionary_func(
+/*==================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ unsigned line) /*!< in: line number */
+{
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ rw_lock_s_lock_inline(&dict_sys.latch, 0, file, line);
+
+ trx->dict_operation_lock_mode = RW_S_LATCH;
+}
+
+/*********************************************************************//**
+Unlocks the data dictionary shared lock. */
+void
+row_mysql_unfreeze_data_dictionary(
+/*===============================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(lock_trx_has_sys_table_locks(trx) == NULL);
+
+ ut_a(trx->dict_operation_lock_mode == RW_S_LATCH);
+
+ rw_lock_s_unlock(&dict_sys.latch);
+
+ trx->dict_operation_lock_mode = 0;
+}
+
+/** Write query start time as SQL field data to a buffer. Needed by InnoDB.
+@param thd Thread object
+@param buf Buffer to hold start time data */
+void thd_get_query_start_data(THD *thd, char *buf);
+
+/** Insert history row when evaluating foreign key referential action.
+
+1. Create a new dtuple_t 'row' from node->historical_row;
+2. Update its row_end to the current timestamp;
+3. Insert it into the table;
+4. Update the table statistics.
+
+This is used in UPDATE CASCADE/SET NULL of a system versioned referenced table.
+
+node->historical_row: dtuple_t containing pointers of row changed by refertial
+action.
+
+@param[in] thr current query thread
+@param[in] node a node which just updated a row in a foreign table
+@return DB_SUCCESS or some error */
+static dberr_t row_update_vers_insert(que_thr_t* thr, upd_node_t* node)
+{
+ trx_t* trx = thr_get_trx(thr);
+ dfield_t* row_end;
+ char row_end_data[8];
+ dict_table_t* table = node->table;
+ const unsigned zip_size = table->space->zip_size();
+ ut_ad(table->versioned());
+
+ dtuple_t* row;
+ const ulint n_cols = dict_table_get_n_cols(table);
+ const ulint n_v_cols = dict_table_get_n_v_cols(table);
+
+ ut_ad(n_cols == dtuple_get_n_fields(node->historical_row));
+ ut_ad(n_v_cols == dtuple_get_n_v_fields(node->historical_row));
+
+ row = dtuple_create_with_vcol(node->historical_heap, n_cols, n_v_cols);
+
+ dict_table_copy_types(row, table);
+
+ ins_node_t* insert_node =
+ ins_node_create(INS_DIRECT, table, node->historical_heap);
+
+ if (!insert_node) {
+ trx->error_state = DB_OUT_OF_MEMORY;
+ goto exit;
+ }
+
+ insert_node->common.parent = thr;
+ ins_node_set_new_row(insert_node, row);
+
+ ut_ad(n_cols > DATA_N_SYS_COLS);
+ // Exclude DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR
+ for (ulint i = 0; i < n_cols - DATA_N_SYS_COLS; i++) {
+ dfield_t *src= dtuple_get_nth_field(node->historical_row, i);
+ dfield_t *dst= dtuple_get_nth_field(row, i);
+ dfield_copy(dst, src);
+ if (dfield_is_ext(src)) {
+ byte *field_data
+ = static_cast<byte*>(dfield_get_data(src));
+ ulint ext_len;
+ ulint field_len = dfield_get_len(src);
+
+ ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ ut_a(memcmp(field_data + field_len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+
+ byte *data = btr_copy_externally_stored_field(
+ &ext_len, field_data, zip_size, field_len,
+ node->historical_heap);
+ dfield_set_data(dst, data, ext_len);
+ }
+ }
+
+ for (ulint i = 0; i < n_v_cols; i++) {
+ dfield_t *dst= dtuple_get_nth_v_field(row, i);
+ dfield_t *src= dtuple_get_nth_v_field(node->historical_row, i);
+ dfield_copy(dst, src);
+ }
+
+ node->historical_row = NULL;
+
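+	/* Close the history row by setting row_end to "now":
+	an 8-byte transaction id for native (TRX_ID-based)
+	versioning, or the 7-byte TIMESTAMP of the query start
+	otherwise, as written below. */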
+ row_end = dtuple_get_nth_field(row, table->vers_end);
+ if (dict_table_get_nth_col(table, table->vers_end)->vers_native()) {
+ mach_write_to_8(row_end_data, trx->id);
+ dfield_set_data(row_end, row_end_data, 8);
+ } else {
+ thd_get_query_start_data(trx->mysql_thd, row_end_data);
+ dfield_set_data(row_end, row_end_data, 7);
+ }
+
+ for (;;) {
+ thr->run_node = insert_node;
+ thr->prev_node = insert_node;
+
+ row_ins_step(thr);
+
+ switch (trx->error_state) {
+ case DB_LOCK_WAIT:
+ que_thr_stop_for_mysql(thr);
+ lock_wait_suspend_thread(thr);
+
+ if (trx->error_state == DB_SUCCESS) {
+ continue;
+ }
+
+ /* fall through */
+ default:
+ /* Other errors are handled for the parent node. */
+ thr->fk_cascade_depth = 0;
+ goto exit;
+
+ case DB_SUCCESS:
+ srv_stats.n_rows_inserted.inc(
+ static_cast<size_t>(trx->id));
+ dict_stats_update_if_needed(table, *trx);
+ goto exit;
+ }
+ }
+exit:
+ que_graph_free_recursive(insert_node);
+ mem_heap_free(node->historical_heap);
+ node->historical_heap = NULL;
+ return trx->error_state;
+}
+
+/**********************************************************************//**
+Does a cascaded delete or set null in a foreign key operation.
+@return error code or DB_SUCCESS */
+dberr_t
+row_update_cascade_for_mysql(
+/*=========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ upd_node_t* node, /*!< in: update node used in the cascade
+ or set null operation */
+ dict_table_t* table) /*!< in: table where we do the operation */
+{
+ /* Increment fk_cascade_depth to record the recursive call depth on
+ a single update/delete that affects multiple tables chained
+ together with foreign key relations. */
+
+ if (++thr->fk_cascade_depth > FK_MAX_CASCADE_DEL) {
+ return(DB_FOREIGN_EXCEED_MAX_CASCADE);
+ }
+
+ const trx_t* trx = thr_get_trx(thr);
+
+ if (table->versioned()) {
+ if (node->is_delete == PLAIN_DELETE) {
+ node->vers_make_delete(trx);
+ } else if (node->update->affects_versioned()) {
+ dberr_t err = row_update_vers_insert(thr, node);
+ if (err != DB_SUCCESS) {
+ return err;
+ }
+ node->vers_make_update(trx);
+ }
+ }
+
+ for (;;) {
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ DEBUG_SYNC_C("foreign_constraint_update_cascade");
+ {
+ TABLE *mysql_table = thr->prebuilt->m_mysql_table;
+ thr->prebuilt->m_mysql_table = NULL;
+ row_upd_step(thr);
+ thr->prebuilt->m_mysql_table = mysql_table;
+ }
+
+ switch (trx->error_state) {
+ case DB_LOCK_WAIT:
+ que_thr_stop_for_mysql(thr);
+ lock_wait_suspend_thread(thr);
+
+ if (trx->error_state == DB_SUCCESS) {
+ continue;
+ }
+
+ /* fall through */
+ default:
+ /* Other errors are handled for the parent node. */
+ thr->fk_cascade_depth = 0;
+ return trx->error_state;
+
+ case DB_SUCCESS:
+ thr->fk_cascade_depth = 0;
+ bool stats;
+
+ if (node->is_delete == PLAIN_DELETE) {
+			/* Not protected by dict_sys.mutex for
+			performance reasons; we would rather
+			get garbage in stat_n_rows (which is
+			just an estimate anyway) than protect
+			the following code with a latch. */
+ dict_table_n_rows_dec(node->table);
+
+ stats = !srv_stats_include_delete_marked;
+ srv_stats.n_rows_deleted.inc(size_t(trx->id));
+ } else {
+ stats = !(node->cmpl_info
+ & UPD_NODE_NO_ORD_CHANGE);
+ srv_stats.n_rows_updated.inc(size_t(trx->id));
+ }
+
+ if (stats) {
+ dict_stats_update_if_needed(node->table, *trx);
+ } else {
+ /* Always update the table
+ modification counter. */
+ node->table->stat_modified_counter++;
+ }
+
+ return(DB_SUCCESS);
+ }
+ }
+}
+
+/*********************************************************************//**
+Locks the data dictionary exclusively for performing a table create or other
+data dictionary modification operation. */
+void
+row_mysql_lock_data_dictionary_func(
+/*================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ unsigned line) /*!< in: line number */
+{
+ ut_a(trx->dict_operation_lock_mode == 0
+ || trx->dict_operation_lock_mode == RW_X_LATCH);
+ dict_sys.lock(file, line);
+ trx->dict_operation_lock_mode = RW_X_LATCH;
+}
+
+/*********************************************************************//**
+Unlocks the data dictionary exclusive lock. */
+void
+row_mysql_unlock_data_dictionary(
+/*=============================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(lock_trx_has_sys_table_locks(trx) == NULL);
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+ trx->dict_operation_lock_mode = 0;
+ dict_sys.unlock();
+}
+
+/*********************************************************************//**
+Creates a table for MySQL. On failure the transaction will be rolled back
+and the 'table' object will be freed.
+@return error code or DB_SUCCESS */
+dberr_t
+row_create_table_for_mysql(
+/*=======================*/
+ dict_table_t* table, /*!< in, own: table definition
+ (will be freed, or on DB_SUCCESS
+ added to the data dictionary cache) */
+ trx_t* trx, /*!< in/out: transaction */
+ fil_encryption_t mode, /*!< in: encryption mode */
+ uint32_t key_id) /*!< in: encryption key_id */
+{
+ tab_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ dberr_t err;
+
+ ut_d(dict_sys.assert_locked());
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_at_start_of_row_create_table_for_mysql",
+ goto err_exit;
+ );
+
+ trx->op_info = "creating table";
+
+ if (row_mysql_is_system_table(table->name.m_name)) {
+
+ ib::error() << "Trying to create a MySQL system table "
+ << table->name << " of type InnoDB. MySQL system"
+ " tables must be of the MyISAM type!";
+#ifndef DBUG_OFF
+err_exit:
+#endif /* !DBUG_OFF */
+ dict_mem_table_free(table);
+
+ trx->op_info = "";
+
+ return(DB_ERROR);
+ }
+
+ trx_start_if_not_started_xa(trx, true);
+
+ heap = mem_heap_create(512);
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
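+		/* fall through */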
+ case TRX_DICT_OP_TABLE:
+ break;
+ case TRX_DICT_OP_INDEX:
+ /* If the transaction was previously flagged as
+ TRX_DICT_OP_INDEX, we should be creating auxiliary
+ tables for full-text indexes. */
+ ut_ad(strstr(table->name.m_name, "/FTS_") != NULL);
+ }
+
+ node = tab_create_graph_create(table, heap, mode, key_id);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap, NULL);
+
+ ut_a(thr == que_fork_start_command(
+ static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ /* Update SYS_TABLESPACES and SYS_DATAFILES if a new file-per-table
+ tablespace was created. */
+ if (err == DB_SUCCESS && dict_table_is_file_per_table(table)) {
+ err = dict_replace_tablespace_in_dictionary(
+ table->space_id, table->name.m_name,
+ table->space->flags,
+ table->space->chain.start->name, trx);
+
+ if (err != DB_SUCCESS) {
+
+ /* We must delete the link file. */
+ RemoteDatafile::delete_link_file(table->name.m_name);
+ }
+ }
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ trx->error_state = DB_SUCCESS;
+ trx->rollback();
+
+ ib::warn() << "Cannot create table "
+ << table->name
+ << " because tablespace full";
+
+ if (dict_table_open_on_name(table->name.m_name, TRUE, FALSE,
+ DICT_ERR_IGNORE_NONE)) {
+
+ dict_table_close_and_drop(trx, table);
+ } else {
+ dict_mem_table_free(table);
+ }
+
+ break;
+
+ case DB_UNSUPPORTED:
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+		/* We already have the .ibd file here. It should be deleted. */
+
+ if (dict_table_is_file_per_table(table)
+ && fil_delete_tablespace(table->space_id) != DB_SUCCESS) {
+ ib::error() << "Cannot delete the file of table "
+ << table->name;
+ }
+ /* fall through */
+
+ case DB_DUPLICATE_KEY:
+ case DB_TABLESPACE_EXISTS:
+ default:
+ trx->error_state = DB_SUCCESS;
+ trx->rollback();
+ dict_mem_table_free(table);
+ break;
+ }
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Create an index when creating a table.
+On failure, the caller must drop the table!
+@return error number or DB_SUCCESS */
+dberr_t
+row_create_index_for_mysql(
+/*=======================*/
+ dict_index_t* index, /*!< in, own: index definition
+ (will be freed) */
+ trx_t* trx, /*!< in: transaction handle */
+ const ulint* field_lengths) /*!< in: if not NULL, must contain
+ dict_index_get_n_fields(index)
+ actual field lengths for the
+ index columns, which are
+ then checked for not being too
+ large. */
+{
+ ind_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ dberr_t err;
+ ulint i;
+ ulint len;
+ dict_table_t* table = index->table;
+
+ ut_d(dict_sys.assert_locked());
+
+ for (i = 0; i < index->n_def; i++) {
+		/* Check that prefix_len and the actual length do not
+		exceed DICT_MAX_FIELD_LEN_BY_FORMAT(table) */
+
+ len = dict_index_get_nth_field(index, i)->prefix_len;
+
+ if (field_lengths && field_lengths[i]) {
+ len = ut_max(len, field_lengths[i]);
+ }
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_at_create_index",
+ len = DICT_MAX_FIELD_LEN_BY_FORMAT(table) + 1;
+ );
+
+ /* Column or prefix length exceeds maximum column length */
+ if (len > (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table)) {
+ dict_mem_index_free(index);
+ return DB_TOO_BIG_INDEX_COL;
+ }
+ }
+
+ trx->op_info = "creating index";
+
+	/* For a temporary table we avoid insertion into the SYSTEM
+	TABLES, to maintain performance, and so we have a separate
+	path that directly updates the dictionary cache. */
+ if (!table->is_temporary()) {
+ trx_start_if_not_started_xa(trx, true);
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ /* Note that the space id where we store the index is
+ inherited from the table in dict_build_index_def_step()
+ in dict0crea.cc. */
+
+ heap = mem_heap_create(512);
+ node = ind_create_graph_create(index, table->name.m_name,
+ heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap, NULL);
+
+ ut_a(thr == que_fork_start_command(
+ static_cast<que_fork_t*>(
+ que_node_get_parent(thr))));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ index = node->index;
+
+ ut_ad(!index == (err != DB_SUCCESS));
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ if (index && (index->type & DICT_FTS)) {
+ err = fts_create_index_tables(trx, index, table->id);
+ }
+ } else {
+ dict_build_index_def(table, index, trx);
+
+ err = dict_index_add_to_cache(index, FIL_NULL);
+ ut_ad((index == NULL) == (err != DB_SUCCESS));
+ if (UNIV_LIKELY(err == DB_SUCCESS)) {
+ ut_ad(!index->is_instant());
+ index->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
+
+ err = dict_create_index_tree_in_mem(index, trx);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!index->search_info->ref_count);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (err != DB_SUCCESS) {
+ dict_index_remove_from_cache(table, index);
+ }
+ }
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Drops a table for MySQL as a background operation. In ALTER TABLE, MySQL
+relies on the fact that on Unix the table handler does not remove the
+table before all handles to it have been removed. Furthermore, MySQL's
+call to drop a table must be non-blocking. Therefore we do the drop table
+as a background operation, which is taken care of by the master thread
+in srv0srv.cc.
+@return error code or DB_SUCCESS */
+static
+dberr_t
+row_drop_table_for_mysql_in_background(
+/*===================================*/
+ const char* name) /*!< in: table name */
+{
+ dberr_t error;
+ trx_t* trx;
+
+ trx = trx_create();
+
+ /* If the original transaction was dropping a table referenced by
+ foreign keys, we must set the following to be able to drop the
+ table: */
+
+ trx->check_foreigns = false;
+
+ /* Try to drop the table in InnoDB */
+
+ error = row_drop_table_for_mysql(name, trx, SQLCOM_TRUNCATE);
+
+ trx_commit_for_mysql(trx);
+
+ trx->free();
+
+ return(error);
+}
+
+/*********************************************************************//**
+The master thread in srv0srv.cc calls this regularly to drop tables which
+we must drop in the background after queries to them have ended. Such lazy
+dropping of tables is needed in ALTER TABLE on Unix.
+@return how many tables dropped + remaining tables in list */
+ulint
+row_drop_tables_for_mysql_in_background(void)
+/*=========================================*/
+{
+ row_mysql_drop_t* drop;
+ dict_table_t* table;
+ ulint n_tables;
+ ulint n_tables_dropped = 0;
+loop:
+ mutex_enter(&row_drop_list_mutex);
+
+ ut_a(row_mysql_drop_list_inited);
+next:
+ drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+
+ n_tables = UT_LIST_GET_LEN(row_mysql_drop_list);
+
+ mutex_exit(&row_drop_list_mutex);
+
+ if (drop == NULL) {
+ /* All tables dropped */
+
+ return(n_tables + n_tables_dropped);
+ }
+
+ /* On fast shutdown, just empty the list without dropping tables. */
+ table = srv_shutdown_state == SRV_SHUTDOWN_NONE || !srv_fast_shutdown
+ ? dict_table_open_on_id(drop->table_id, FALSE,
+ DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)
+ : NULL;
+
+ if (!table) {
+ n_tables_dropped++;
+ mutex_enter(&row_drop_list_mutex);
+ UT_LIST_REMOVE(row_mysql_drop_list, drop);
+ MONITOR_DEC(MONITOR_BACKGROUND_DROP_TABLE);
+ ut_free(drop);
+ goto next;
+ }
+
+ ut_a(!table->can_be_evicted);
+
+ bool skip = false;
+
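+ /* Below, if the boolean 'skip' is false the drop entry is
+ re-queued at the end of the list; if it is true (reached
+ via the 'skip:' label from the XA PREPARE lock check
+ further down) the entry is removed and freed. */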
+ if (!table->to_be_dropped) {
+skip:
+ dict_table_close(table, FALSE, FALSE);
+
+ mutex_enter(&row_drop_list_mutex);
+ UT_LIST_REMOVE(row_mysql_drop_list, drop);
+ if (!skip) {
+ UT_LIST_ADD_LAST(row_mysql_drop_list, drop);
+ } else {
+ ut_free(drop);
+ }
+ goto next;
+ }
+
+ if (!srv_fast_shutdown && !trx_sys.any_active_transactions()) {
+ lock_mutex_enter();
+ skip = UT_LIST_GET_LEN(table->locks) != 0;
+ lock_mutex_exit();
+ if (skip) {
+ /* We cannot drop tables that are locked by XA
+ PREPARE transactions. */
+ goto skip;
+ }
+ }
+
+ char* name = mem_strdup(table->name.m_name);
+
+ dict_table_close(table, FALSE, FALSE);
+
+ dberr_t err = row_drop_table_for_mysql_in_background(name);
+
+ ut_free(name);
+
+ if (err != DB_SUCCESS) {
+ /* If the DROP fails for some table, we return, and let the
+ main thread retry later */
+ return(n_tables + n_tables_dropped);
+ }
+
+ goto loop;
+}
+
+/*********************************************************************//**
+Get the background drop list length. NOTE: the caller must own the
+drop list mutex!
+@return how many tables in list */
+ulint
+row_get_background_drop_list_len_low(void)
+/*======================================*/
+{
+ ulint len;
+
+ mutex_enter(&row_drop_list_mutex);
+
+ ut_a(row_mysql_drop_list_inited);
+
+ len = UT_LIST_GET_LEN(row_mysql_drop_list);
+
+ mutex_exit(&row_drop_list_mutex);
+
+ return(len);
+}
+
+/** Drop garbage tables during recovery. */
+void
+row_mysql_drop_garbage_tables()
+{
+ mem_heap_t* heap = mem_heap_create(FN_REFLEN);
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ trx_t* trx = trx_create();
+ trx->op_info = "dropping garbage tables";
+ row_mysql_lock_data_dictionary(trx);
+
+ mtr.start();
+ btr_pcur_open_at_index_side(
+ true, dict_table_get_first_index(dict_sys.sys_tables),
+ BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+
+ for (;;) {
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ const char* table_name;
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+ if (rec_get_deleted_flag(rec, 0)) {
+ continue;
+ }
+
+ field = rec_get_nth_field_old(rec, 0/*NAME*/, &len);
+ if (len == UNIV_SQL_NULL || len == 0) {
+ /* Corrupted SYS_TABLES.NAME */
+ continue;
+ }
+
+ table_name = mem_heap_strdupl(
+ heap,
+ reinterpret_cast<const char*>(field), len);
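+ /* Only drop left-over #sql- tables, e.g. from an
+ interrupted ALTER TABLE; names containing the -backup-
+ or -exchange- infix are intentionally preserved. */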
+ if (strstr(table_name, "/" TEMP_FILE_PREFIX "-") &&
+ !strstr(table_name, "/" TEMP_FILE_PREFIX "-backup-") &&
+ !strstr(table_name, "/" TEMP_FILE_PREFIX "-exchange-"))
+ {
+ btr_pcur_store_position(&pcur, &mtr);
+ btr_pcur_commit_specify_mtr(&pcur, &mtr);
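+ /* The cursor position was saved and the
+ mini-transaction committed above, so that
+ row_drop_table_for_mysql() can run its own
+ mini-transactions; the position is restored below. */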
+
+ if (dict_load_table(table_name,
+ DICT_ERR_IGNORE_DROP)) {
+ row_drop_table_for_mysql(table_name, trx,
+ SQLCOM_DROP_TABLE);
+ trx_commit_for_mysql(trx);
+ }
+
+ mtr.start();
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ }
+
+ mem_heap_empty(heap);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr.commit();
+ row_mysql_unlock_data_dictionary(trx);
+ trx->free();
+ mem_heap_free(heap);
+}
+
+/*********************************************************************//**
+If a table is not yet in the drop list, adds the table to the list of tables
+which the master thread drops in the background. We need this on Unix because
+in ALTER TABLE MySQL may call DROP TABLE even if the table has running queries
+on it. Also, if there are running foreign key checks on the table, we drop the
+table lazily.
+@return whether background DROP TABLE was scheduled for the first time */
+static
+bool
+row_add_table_to_background_drop_list(table_id_t table_id)
+{
+ row_mysql_drop_t* drop;
+ bool added = true;
+
+ mutex_enter(&row_drop_list_mutex);
+
+ ut_a(row_mysql_drop_list_inited);
+
+ /* Check whether the table is already in the drop list */
+ for (drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+ drop != NULL;
+ drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop)) {
+
+ if (drop->table_id == table_id) {
+ added = false;
+ goto func_exit;
+ }
+ }
+
+ drop = static_cast<row_mysql_drop_t*>(ut_malloc_nokey(sizeof *drop));
+ drop->table_id = table_id;
+
+ UT_LIST_ADD_LAST(row_mysql_drop_list, drop);
+
+ MONITOR_INC(MONITOR_BACKGROUND_DROP_TABLE);
+func_exit:
+ mutex_exit(&row_drop_list_mutex);
+ return added;
+}
+
+/** Reassigns the table identifier of a table.
+@param[in,out] table table
+@param[in,out] trx transaction
+@param[out] new_id new table id
+@return error code or DB_SUCCESS */
+static
+dberr_t
+row_mysql_table_id_reassign(
+ dict_table_t* table,
+ trx_t* trx,
+ table_id_t* new_id)
+{
+ dberr_t err;
+ pars_info_t* info = pars_info_create();
+
+ dict_hdr_get_new_id(new_id, NULL, NULL);
+
+ pars_info_add_ull_literal(info, "old_id", table->id);
+ pars_info_add_ull_literal(info, "new_id", *new_id);
+
+ /* Note: This cannot be rolled back. Rollback would see the
+ UPDATE SYS_INDEXES as two operations: DELETE and INSERT.
+ It would invoke btr_free_if_exists() when rolling back the
+ INSERT, effectively dropping all indexes of the table. */
+ err = que_eval_sql(
+ info,
+ "PROCEDURE RENUMBER_TABLE_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES SET ID = :new_id\n"
+ " WHERE ID = :old_id;\n"
+ "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "UPDATE SYS_VIRTUAL SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "END;\n", FALSE, trx);
+
+ return(err);
+}
+
+/*********************************************************************//**
+Setup the pre-requisites for DISCARD TABLESPACE. It will start the transaction,
+acquire the data dictionary lock in X mode and open the table.
+@return table instance or 0 if not found. */
+static
+dict_table_t*
+row_discard_tablespace_begin(
+/*=========================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ trx->op_info = "discarding tablespace";
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ trx_start_if_not_started_xa(trx, true);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ this is to avoid deadlocks during data dictionary operations */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ dict_table_t* table;
+
+ table = dict_table_open_on_name(
+ name, TRUE, FALSE, DICT_ERR_IGNORE_FK_NOKEY);
+
+ if (table) {
+ dict_stats_wait_bg_to_stop_using_table(table, trx);
+ ut_a(!is_system_tablespace(table->space_id));
+ ut_ad(!table->n_foreign_key_checks_running);
+ }
+
+ return(table);
+}
+
+/*********************************************************************//**
+Do the foreign key constraint checks.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_discard_tablespace_foreign_key_checks(
+/*======================================*/
+ const trx_t* trx, /*!< in: transaction handle */
+ const dict_table_t* table) /*!< in: table to be discarded */
+{
+
+ if (srv_read_only_mode || !trx->check_foreigns) {
+ return(DB_SUCCESS);
+ }
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+ dict_foreign_set::const_iterator it
+ = std::find_if(table->referenced_set.begin(),
+ table->referenced_set.end(),
+ dict_foreign_different_tables());
+
+ if (it == table->referenced_set.end()) {
+ return(DB_SUCCESS);
+ }
+
+ const dict_foreign_t* foreign = *it;
+ FILE* ef = dict_foreign_err_file;
+
+ ut_ad(foreign->foreign_table != table);
+ ut_ad(foreign->referenced_table == table);
+
+ /* We only allow discarding a referenced table if
+ FOREIGN_KEY_CHECKS is set to 0 */
+
+ mutex_enter(&dict_foreign_err_mutex);
+
+ rewind(ef);
+
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot DISCARD table ", ef);
+ ut_print_name(ef, trx, table->name.m_name);
+ fputs("\n"
+ "because it is referenced by ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ putc('\n', ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_DROP_CONSTRAINT);
+}
+
+/*********************************************************************//**
+Cleanup after the DISCARD TABLESPACE operation.
+@return error code. */
+static
+dberr_t
+row_discard_tablespace_end(
+/*=======================*/
+ trx_t* trx, /*!< in/out: transaction handle */
+ dict_table_t* table, /*!< in/out: table to be discarded */
+ dberr_t err) /*!< in: error code */
+{
+ if (table != 0) {
+ dict_table_close(table, TRUE, FALSE);
+ }
+
+ DBUG_EXECUTE_IF("ib_discard_before_commit_crash",
+ log_buffer_flush_to_disk();
+ DBUG_SUICIDE(););
+
+ trx_commit_for_mysql(trx);
+
+ DBUG_EXECUTE_IF("ib_discard_after_commit_crash",
+ log_buffer_flush_to_disk();
+ DBUG_SUICIDE(););
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Do the DISCARD TABLESPACE operation.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_discard_tablespace(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction handle */
+ dict_table_t* table) /*!< in/out: table to be discarded */
+{
+ dberr_t err;
+
+ /* How do we prevent crashes caused by ongoing operations on
+ the table? Old operations could try to access non-existent
+ pages. MySQL will block all DML on the table using MDL and a
+ DISCARD will not start unless all existing operations on the
+ table to be discarded are completed.
+
+ 1) Acquire the data dictionary latch in X mode, to prevent any
+ internal operations that MySQL is not aware of, and also for
+ the internal SQL parser.
+
+ 2) Purge and rollback: we assign a new table id for the
+ table. Since purge and rollback look for the table based on
+ the table id, they see the table as 'dropped' and discard
+ their operations.
+
+ 3) Insert buffer: we remove all entries for the tablespace in
+ the insert buffer tree. */
+
+ ibuf_delete_for_discarded_space(table->space_id);
+
+ table_id_t new_id;
+
+ /* Set the TABLESPACE DISCARD flag in the table definition
+ on disk. */
+ err = row_import_update_discarded_flag(trx, table->id, true);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Update the index root pages in the system tables, on disk */
+ err = row_import_update_index_root(trx, table, true);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Drop all the FTS auxiliary tables. */
+ if (dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+
+ fts_drop_tables(trx, table);
+ }
+
+ /* Assign a new table ID to the table definition so that purge
+ can ignore the changes. Update the system tables on disk. */
+
+ err = row_mysql_table_id_reassign(table, trx, &new_id);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Discard the physical file that is used for the tablespace. */
+ err = fil_delete_tablespace(table->space_id);
+ switch (err) {
+ case DB_IO_ERROR:
+ ib::warn() << "ALTER TABLE " << table->name
+ << " DISCARD TABLESPACE failed to delete file";
+ break;
+ case DB_TABLESPACE_NOT_FOUND:
+ ib::warn() << "ALTER TABLE " << table->name
+ << " DISCARD TABLESPACE failed to find tablespace";
+ break;
+ case DB_SUCCESS:
+ break;
+ default:
+ ut_error;
+ }
+
+ /* All persistent operations successful, update the
+ data dictionary memory cache. */
+
+ table->file_unreadable = true;
+ table->space = NULL;
+ table->flags2 |= DICT_TF2_DISCARDED;
+ dict_table_change_id_in_cache(table, new_id);
+
+ dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ if (index) index->clear_instant_alter();
+
+ /* Reset the root page numbers. */
+ for (; index; index = UT_LIST_GET_NEXT(indexes, index)) {
+ index->page = FIL_NULL;
+ }
+
+ /* If the tablespace did not already exist or we couldn't
+ write to it, we treat that as a successful DISCARD. It is
+ unusable anyway. */
+ return DB_SUCCESS;
+}
+
+/*********************************************************************//**
+Discards the tablespace of a table which is stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the file_unreadable flag is set.
+@return error code or DB_SUCCESS */
+dberr_t
+row_discard_tablespace_for_mysql(
+/*=============================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dberr_t err;
+ dict_table_t* table;
+
+ /* Open the table and start the transaction if not started. */
+
+ table = row_discard_tablespace_begin(name, trx);
+
+ if (table == 0) {
+ err = DB_TABLE_NOT_FOUND;
+ } else if (table->is_temporary()) {
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_CANNOT_DISCARD_TEMPORARY_TABLE);
+
+ err = DB_ERROR;
+
+ } else if (table->space_id == TRX_SYS_SPACE) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name),
+ table->name.m_name);
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_IN_SYSTEM_TABLESPACE, table_name);
+
+ err = DB_ERROR;
+
+ } else {
+ ut_ad(!table->n_foreign_key_checks_running);
+
+ bool fts_exist = (dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(
+ table, DICT_TF2_FTS_HAS_DOC_ID));
+
+ if (fts_exist) {
+ row_mysql_unlock_data_dictionary(trx);
+ fts_optimize_remove_table(table);
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ /* Do foreign key constraint checks. */
+
+ err = row_discard_tablespace_foreign_key_checks(trx, table);
+
+ if (err == DB_SUCCESS) {
+ /* Note: This cannot be rolled back.
+ Rollback would see the UPDATE SYS_INDEXES
+ as two operations: DELETE and INSERT.
+ It would invoke btr_free_if_exists()
+ when rolling back the INSERT, effectively
+ dropping all indexes of the table. */
+ err = row_discard_tablespace(trx, table);
+ }
+
+ if (fts_exist && err != DB_SUCCESS) {
+ fts_optimize_add_table(table);
+ }
+ }
+
+ return(row_discard_tablespace_end(trx, table, err));
+}
+
+/*********************************************************************//**
+Sets an exclusive lock on a table.
+@return error code or DB_SUCCESS */
+dberr_t
+row_mysql_lock_table(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */
+ const char* op_info) /*!< in: string for trx->op_info */
+{
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ dberr_t err;
+ sel_node_t* node;
+
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+ heap = mem_heap_create(512);
+
+ trx->op_info = op_info;
+
+ node = sel_node_create(heap);
+ thr = pars_complete_graph_for_exec(node, trx, heap, NULL);
+ thr->graph->state = QUE_FORK_ACTIVE;
+
+ /* We use the select query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(
+ static_cast<que_fork_t*>(que_node_get_parent(thr)));
+
+ thr->start_running();
+
+run_again:
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ err = lock_table(0, table, mode, thr);
+
+ trx->error_state = err;
+
+ if (err == DB_SUCCESS) {
+ thr->stop_no_error();
+ } else {
+ que_thr_stop_for_mysql(thr);
+
+ if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
+ goto run_again;
+ }
+ }
+
+ que_graph_free(thr->graph);
+ trx->op_info = "";
+
+ return(err);
+}
+
+/** Drop ancillary FTS tables as part of dropping a table.
+@param[in,out] table Table cache entry
+@param[in,out] trx Transaction handle
+@return error code or DB_SUCCESS */
+UNIV_INLINE
+dberr_t
+row_drop_ancillary_fts_tables(
+ dict_table_t* table,
+ trx_t* trx)
+{
+ /* Drop ancillary FTS tables */
+ if (dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+
+ ut_ad(table->get_ref_count() == 0);
+ ut_ad(trx_is_started(trx));
+
+ dberr_t err = fts_drop_tables(trx, table);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ ib::error() << " Unable to remove ancillary FTS"
+ " tables for table "
+ << table->name << " : " << err;
+
+ return(err);
+ }
+ }
+
+ /* The table->fts flag can be set on a table whose
+ clustered index is being rebuilt. Such a table might not
+ have the DICT_TF2_FTS flag set, so keep this check out of
+ the dict_table_has_fts_index condition above. */
+ if (table->fts != NULL) {
+ /* fts_que_graph_free_check_lock would try to acquire
+ dict mutex lock */
+ table->fts->dict_locked = true;
+
+ fts_free(table);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Drop a table from the memory cache as part of dropping a table.
+@param[in] tablename A copy of table->name. Used when table == null
+@param[in,out] table Table cache entry
+@param[in,out] trx Transaction handle
+@return error code or DB_SUCCESS */
+UNIV_INLINE
+dberr_t
+row_drop_table_from_cache(
+ const char* tablename,
+ dict_table_t* table,
+ trx_t* trx)
+{
+ dberr_t err = DB_SUCCESS;
+ ut_ad(!table->is_temporary());
+
+ /* Remove the pointer to this table object from the
+ transaction's list of modified tables, because the object
+ is going to be destroyed below. */
+ trx->mod_tables.erase(table);
+
+ dict_sys.remove(table);
+
+ if (dict_load_table(tablename, DICT_ERR_IGNORE_FK_NOKEY)) {
+ ib::error() << "Not able to remove table "
+ << ut_get_name(trx, tablename)
+ << " from the dictionary cache!";
+ err = DB_ERROR;
+ }
+
+ return(err);
+}
+
+/** Drop a table for MySQL.
+If the data dictionary was not already locked by the transaction,
+the transaction will be committed. Otherwise, the data dictionary
+will remain locked.
+@param[in] name Table name
+@param[in,out] trx Transaction handle
+@param[in] sqlcom type of SQL operation
+@param[in] create_failed true=create table failed
+ because e.g. foreign key column
+@param[in] nonatomic Whether it is permitted to release
+ and reacquire dict_sys.latch
+@return error code or DB_SUCCESS */
+dberr_t
+row_drop_table_for_mysql(
+ const char* name,
+ trx_t* trx,
+ enum_sql_command sqlcom,
+ bool create_failed,
+ bool nonatomic)
+{
+ dberr_t err;
+ dict_foreign_t* foreign;
+ dict_table_t* table;
+ char* tablename = NULL;
+ bool locked_dictionary = false;
+ pars_info_t* info = NULL;
+ mem_heap_t* heap = NULL;
+
+
+ DBUG_ENTER("row_drop_table_for_mysql");
+ DBUG_PRINT("row_drop_table_for_mysql", ("table: '%s'", name));
+
+ ut_a(name != NULL);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ trx->op_info = "dropping table";
+
+ if (trx->dict_operation_lock_mode != RW_X_LATCH) {
+ /* Prevent foreign key checks etc. while we are
+ dropping the table */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ locked_dictionary = true;
+ nonatomic = true;
+ }
+
+ ut_d(dict_sys.assert_locked());
+
+ table = dict_table_open_on_name(
+ name, TRUE, FALSE,
+ static_cast<dict_err_ignore_t>(
+ DICT_ERR_IGNORE_INDEX_ROOT
+ | DICT_ERR_IGNORE_CORRUPT));
+
+ if (!table) {
+ if (locked_dictionary) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+ trx->op_info = "";
+ DBUG_RETURN(DB_TABLE_NOT_FOUND);
+ }
+
+ std::vector<pfs_os_file_t> detached_handles;
+
+ const bool is_temp_name = strstr(table->name.m_name,
+ "/" TEMP_FILE_PREFIX);
+
+ if (table->is_temporary()) {
+ ut_ad(table->space == fil_system.temp_space);
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ btr_free(page_id_t(SRV_TMP_SPACE_ID, index->page));
+ }
+ /* Remove the pointer to this table object from the
+ transaction's list of modified tables, because the object
+ is going to be destroyed below. */
+ trx->mod_tables.erase(table);
+ table->release();
+ dict_sys.remove(table);
+ err = DB_SUCCESS;
+ goto funct_exit_all_freed;
+ }
+
+ /* This function is called recursively via fts_drop_tables(). */
+ if (!trx_is_started(trx)) {
+ trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
+ }
+
+ /* Turn on this drop flag before we release the dictionary
+ latch */
+ table->to_be_dropped = true;
+
+ if (nonatomic) {
+ /* This trx did not acquire any locks on dictionary
+ table records yet. Thus it is safe to release and
+ reacquire the data dictionary latches. */
+ if (table->fts) {
+ row_mysql_unlock_data_dictionary(trx);
+ fts_optimize_remove_table(table);
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ dict_stats_wait_bg_to_stop_using_table(table, trx);
+ }
+
+ /* make sure background stats thread is not running on the table */
+ ut_ad(!(table->stats_bg_flag & BG_STAT_IN_PROGRESS));
+ if (!table->no_rollback()) {
+ if (table->space != fil_system.sys_space) {
+ /* Delete the link file if used. */
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ RemoteDatafile::delete_link_file(name);
+ }
+ }
+
+ dict_stats_recalc_pool_del(table);
+ dict_stats_defrag_pool_del(table, NULL);
+ if (btr_defragment_active) {
+ /* During fts_drop_orphaned_tables() the
+ btr_defragment_mutex has not yet been
+ initialized by btr_defragment_init(). */
+ btr_defragment_remove_table(table);
+ }
+
+ if (UNIV_LIKELY(!strstr(name, "/" TEMP_FILE_PREFIX_INNODB))) {
+ /* Remove any persistent statistics for this table,
+ in a separate transaction. */
+ char errstr[1024];
+ err = dict_stats_drop_table(name, errstr,
+ sizeof errstr);
+ if (err != DB_SUCCESS) {
+ ib::warn() << errstr;
+ }
+ }
+ }
+
+ dict_table_prevent_eviction(table);
+ dict_table_close(table, TRUE, FALSE);
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+
+ if (!srv_read_only_mode && trx->check_foreigns) {
+
+ for (dict_foreign_set::iterator it
+ = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ const bool ref_ok = sqlcom == SQLCOM_DROP_DB
+ && dict_tables_have_same_db(
+ name,
+ foreign->foreign_table_name_lookup);
+
+ /* We should allow dropping a referenced table if creating
+ that referenced table has failed for some reason. For example,
+ if the referenced table was created but the column types that
+ are referenced do not match. */
+ if (foreign->foreign_table != table &&
+ !create_failed && !ref_ok) {
+
+ FILE* ef = dict_foreign_err_file;
+
+ /* We only allow dropping a referenced table
+ if FOREIGN_KEY_CHECKS is set to 0 */
+
+ err = DB_CANNOT_DROP_CONSTRAINT;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot drop table ", ef);
+ ut_print_name(ef, trx, name);
+ fputs("\n"
+ "because it is referenced by ", ef);
+ ut_print_name(ef, trx,
+ foreign->foreign_table_name);
+ putc('\n', ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ goto funct_exit;
+ }
+ }
+ }
+
+ DBUG_EXECUTE_IF("row_drop_table_add_to_background", goto defer;);
+
+ /* TODO: could we replace the counter n_foreign_key_checks_running
+ with lock checks on the table? Acquire here an exclusive lock on the
+ table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that
+ they can cope with the table having been dropped here? Foreign key
+ checks take an IS or IX lock on the table. */
+
+ if (table->n_foreign_key_checks_running > 0) {
+defer:
+ /* Rename #sql-backup to #sql-ib if the table has an open
+ reference count while dropping the table. This scenario
+ can happen when the purge thread is waiting for
+ dict_sys.mutex in order to close the table, while DROP
+ TABLE itself is holding dict_sys.mutex.
+ In the future this should use 'tmp_file_prefix'!
+ */
+ if (!is_temp_name
+ || strstr(table->name.m_name, "/#sql-backup-")) {
+ heap = mem_heap_create(FN_REFLEN);
+ const char* tmp_name
+ = dict_mem_create_temporary_tablename(
+ heap, table->name.m_name, table->id);
+ ib::info() << "Deferring DROP TABLE " << table->name
+ << "; renaming to " << tmp_name;
+ err = row_rename_table_for_mysql(
+ table->name.m_name, tmp_name, trx,
+ false, false);
+ } else {
+ err = DB_SUCCESS;
+ }
+ if (err == DB_SUCCESS) {
+ row_add_table_to_background_drop_list(table->id);
+ }
+ goto funct_exit;
+ }
+
+ /* Remove all locks that are on the table or its records, if there
+ are no references to the table but it has record locks, we release
+ the record locks unconditionally. One use case is:
+
+ CREATE TABLE t2 (PRIMARY KEY (a)) SELECT * FROM t1;
+
+ If, after the user transaction has done the SELECT, there is a
+ problem in completing the CREATE TABLE operation, MySQL will drop
+ the table. InnoDB will create a new background transaction to do the
+ actual drop, the trx instance that is passed to this function. To
+ preserve existing behaviour we remove the locks but ideally we
+ shouldn't have to. There should never be record locks on a table
+ that is going to be dropped. */
+
+ if (table->get_ref_count() > 0 || table->n_rec_locks > 0
+ || lock_table_has_locks(table)) {
+ goto defer;
+ }
+
+ /* The "to_be_dropped" marks table that is to be dropped, but
+ has not been dropped, instead, was put in the background drop
+ list due to being used by concurrent DML operations. Clear it
+ here since there are no longer any concurrent activities on it,
+ and it is free to be dropped */
+ table->to_be_dropped = false;
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ trx->table_id = table->id;
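+ /* fall through */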
+ case TRX_DICT_OP_TABLE:
+ break;
+ case TRX_DICT_OP_INDEX:
+ /* If the transaction was previously flagged as
+ TRX_DICT_OP_INDEX, we should be dropping auxiliary
+ tables for full-text indexes. */
+ ut_ad(strstr(table->name.m_name, "/FTS_"));
+ }
+
+ /* Mark all indexes unavailable in the data dictionary cache
+ before starting to drop the table. */
+
+ unsigned* page_no;
+ unsigned* page_nos;
+ heap = mem_heap_create(
+ 200 + UT_LIST_GET_LEN(table->indexes) * sizeof *page_nos);
+ tablename = mem_heap_strdup(heap, name);
+
+ page_no = page_nos = static_cast<unsigned*>(
+ mem_heap_alloc(
+ heap,
+ UT_LIST_GET_LEN(table->indexes) * sizeof *page_no));
+
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ rw_lock_x_lock(dict_index_get_lock(index));
+ /* Save the page numbers so that we can restore them
+ if the operation fails. */
+ *page_no++ = index->page;
+ /* Mark the index unusable. */
+ index->page = FIL_NULL;
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ }
+
+ /* Deleting a row from SYS_INDEXES table will invoke
+ dict_drop_index_tree(). */
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "name", name);
+
+ if (sqlcom != SQLCOM_TRUNCATE
+ && strchr(name, '/')
+ && dict_table_get_low("SYS_FOREIGN")
+ && dict_table_get_low("SYS_FOREIGN_COLS")) {
+ err = que_eval_sql(
+ info,
+ "PROCEDURE DROP_FOREIGN_PROC () IS\n"
+ "fid CHAR;\n"
+
+ "DECLARE CURSOR fk IS\n"
+ "SELECT ID FROM SYS_FOREIGN\n"
+ "WHERE FOR_NAME = :name\n"
+ "AND TO_BINARY(FOR_NAME) = TO_BINARY(:name)\n"
+ "FOR UPDATE;\n"
+
+ "BEGIN\n"
+ "OPEN fk;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH fk INTO fid;\n"
+ " IF (SQL % NOTFOUND) THEN RETURN; END IF;\n"
+ " DELETE FROM SYS_FOREIGN_COLS WHERE ID=fid;\n"
+ " DELETE FROM SYS_FOREIGN WHERE ID=fid;\n"
+ "END LOOP;\n"
+ "CLOSE fk;\n"
+ "END;\n", FALSE, trx);
+ if (err == DB_SUCCESS) {
+ info = pars_info_create();
+ pars_info_add_str_literal(info, "name", name);
+ goto do_drop;
+ }
+ } else {
+do_drop:
+ if (dict_table_get_low("SYS_VIRTUAL")) {
+ err = que_eval_sql(
+ info,
+ "PROCEDURE DROP_VIRTUAL_PROC () IS\n"
+ "tid CHAR;\n"
+
+ "BEGIN\n"
+ "SELECT ID INTO tid FROM SYS_TABLES\n"
+ "WHERE NAME = :name FOR UPDATE;\n"
+ "IF (SQL % NOTFOUND) THEN RETURN;"
+ " END IF;\n"
+ "DELETE FROM SYS_VIRTUAL"
+ " WHERE TABLE_ID = tid;\n"
+ "END;\n", FALSE, trx);
+ if (err == DB_SUCCESS) {
+ info = pars_info_create();
+ pars_info_add_str_literal(
+ info, "name", name);
+ }
+ } else {
+ err = DB_SUCCESS;
+ }
+
+ err = err == DB_SUCCESS ? que_eval_sql(
+ info,
+ "PROCEDURE DROP_TABLE_PROC () IS\n"
+ "tid CHAR;\n"
+ "iid CHAR;\n"
+
+ "DECLARE CURSOR cur_idx IS\n"
+ "SELECT ID FROM SYS_INDEXES\n"
+ "WHERE TABLE_ID = tid FOR UPDATE;\n"
+
+ "BEGIN\n"
+ "SELECT ID INTO tid FROM SYS_TABLES\n"
+ "WHERE NAME = :name FOR UPDATE;\n"
+ "IF (SQL % NOTFOUND) THEN RETURN; END IF;\n"
+
+ "OPEN cur_idx;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH cur_idx INTO iid;\n"
+ " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n"
+ " DELETE FROM SYS_FIELDS\n"
+ " WHERE INDEX_ID = iid;\n"
+ " DELETE FROM SYS_INDEXES\n"
+ " WHERE ID = iid AND TABLE_ID = tid;\n"
+ "END LOOP;\n"
+ "CLOSE cur_idx;\n"
+
+ "DELETE FROM SYS_COLUMNS WHERE TABLE_ID=tid;\n"
+ "DELETE FROM SYS_TABLES WHERE NAME=:name;\n"
+
+ "END;\n", FALSE, trx) : err;
+
+ if (err == DB_SUCCESS && table->space
+ && dict_table_get_low("SYS_TABLESPACES")
+ && dict_table_get_low("SYS_DATAFILES")) {
+ info = pars_info_create();
+ pars_info_add_int4_literal(info, "id",
+ lint(table->space_id));
+ err = que_eval_sql(
+ info,
+ "PROCEDURE DROP_SPACE_PROC () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_TABLESPACES\n"
+ "WHERE SPACE = :id;\n"
+ "DELETE FROM SYS_DATAFILES\n"
+ "WHERE SPACE = :id;\n"
+ "END;\n", FALSE, trx);
+ }
+ }
+
+ switch (err) {
+ fil_space_t* space;
+ char* filepath;
+ case DB_SUCCESS:
+ if (!table->no_rollback()) {
+ err = row_drop_ancillary_fts_tables(table, trx);
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ }
+
+ space = table->space;
+ ut_ad(!space || space->id == table->space_id);
+ /* Determine the tablespace filename before we drop
+ dict_table_t. */
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ dict_get_and_save_data_dir_path(table, true);
+ ut_ad(table->data_dir_path || !space);
+ filepath = space ? NULL : fil_make_filepath(
+ table->data_dir_path,
+ table->name.m_name, IBD,
+ table->data_dir_path != NULL);
+ } else {
+ filepath = space ? NULL : fil_make_filepath(
+ NULL, table->name.m_name, IBD, false);
+ }
+
+ /* Free the dict_table_t object. */
+ err = row_drop_table_from_cache(tablename, table, trx);
+ if (err != DB_SUCCESS) {
+ ut_free(filepath);
+ break;
+ }
+
+ /* Do not attempt to drop known-to-be-missing tablespaces,
+ nor the system tablespace. */
+ if (!space) {
+ fil_delete_file(filepath);
+ ut_free(filepath);
+ break;
+ }
+
+ ut_ad(!filepath);
+
+ if (space->id != TRX_SYS_SPACE) {
+ err = fil_delete_tablespace(space->id, false,
+ &detached_handles);
+ }
+ break;
+
+ case DB_OUT_OF_FILE_SPACE:
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+ trx->error_state = err;
+ row_mysql_handle_errors(&err, trx, NULL, NULL);
+
+ /* raise error */
+ ut_error;
+ break;
+
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ /* Cannot even find a free slot for the
+ undo log. We can directly exit here
+ and return the DB_TOO_MANY_CONCURRENT_TRXS
+ error. */
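+ /* fall through */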
+
+ default:
+ /* This is some error we do not expect. Print
+ the error number and rollback the transaction */
+ ib::error() << "Unknown error code " << err << " while"
+ " dropping table: "
+ << ut_get_name(trx, tablename) << ".";
+
+ trx->error_state = DB_SUCCESS;
+ trx->rollback();
+ trx->error_state = DB_SUCCESS;
+
+ /* Mark all indexes available in the data dictionary
+ cache again. */
+
+ page_no = page_nos;
+
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ rw_lock_x_lock(dict_index_get_lock(index));
+ ut_a(index->page == FIL_NULL);
+ index->page = *page_no++;
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ }
+ }
+
+ if (err != DB_SUCCESS && table != NULL) {
+ /* The drop table has failed with an error, but since DROP
+ TABLE is not transaction safe we should mark the table as
+ corrupted, to avoid unwarranted follow-up action on this
+ table that could result in more serious issues. */
+
+ table->corrupted = true;
+ for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index != NULL;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+ dict_set_corrupted(index, trx, "DROP TABLE");
+ }
+ }
+
+funct_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+funct_exit_all_freed:
+ if (locked_dictionary) {
+
+ if (trx_is_started(trx)) {
+
+ trx_commit_for_mysql(trx);
+ }
+
+ /* Add the table back to the FTS optimize queue if the drop failed */
+ if (err != DB_SUCCESS && table->fts) {
+ fts_optimize_add_table(table);
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ for (const auto& handle : detached_handles) {
+ ut_ad(handle != OS_FILE_CLOSED);
+ os_file_close(handle);
+ }
+
+ trx->op_info = "";
+
+ DBUG_RETURN(err);
+}
+
+/** Drop a table after failed CREATE TABLE. */
+dberr_t row_drop_table_after_create_fail(const char* name, trx_t* trx)
+{
+ ib::warn() << "Dropping incompletely created " << name << " table.";
+ return row_drop_table_for_mysql(name, trx, SQLCOM_DROP_DB, true);
+}
+
+/*******************************************************************//**
+Drop all foreign keys in a database, see Bug#18942.
+Called at the end of row_drop_database_for_mysql().
+@return error code or DB_SUCCESS */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+drop_all_foreign_keys_in_db(
+/*========================*/
+ const char* name, /*!< in: database name which ends in '/' */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ pars_info_t* pinfo;
+ dberr_t err;
+
+ ut_a(name[strlen(name) - 1] == '/');
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "dbname", name);
+
+/** true if for_name is not prefixed with dbname */
+#define TABLE_NOT_IN_THIS_DB \
+"SUBSTR(for_name, 0, LENGTH(:dbname)) <> :dbname"
+
+ err = que_eval_sql(pinfo,
+ "PROCEDURE DROP_ALL_FOREIGN_KEYS_PROC () IS\n"
+ "foreign_id CHAR;\n"
+ "for_name CHAR;\n"
+ "found INT;\n"
+ "DECLARE CURSOR cur IS\n"
+ "SELECT ID, FOR_NAME FROM SYS_FOREIGN\n"
+ "WHERE FOR_NAME >= :dbname\n"
+ "LOCK IN SHARE MODE\n"
+ "ORDER BY FOR_NAME;\n"
+ "BEGIN\n"
+ "found := 1;\n"
+ "OPEN cur;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH cur INTO foreign_id, for_name;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSIF (" TABLE_NOT_IN_THIS_DB ") THEN\n"
+ " found := 0;\n"
+ " ELSIF (1=1) THEN\n"
+ " DELETE FROM SYS_FOREIGN_COLS\n"
+ " WHERE ID = foreign_id;\n"
+ " DELETE FROM SYS_FOREIGN\n"
+ " WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE cur;\n"
+ "COMMIT WORK;\n"
+ "END;\n",
+ FALSE, /* do not reserve dict mutex,
+ we are already holding it */
+ trx);
+
+ return(err);
+}
+
+/** Drop a database for MySQL.
+@param[in] name database name ending in '/' (or '#' when dropping partitions)
+@param[in] trx transaction handle
+@param[out] found number of dropped tables/partitions
+@return error code or DB_SUCCESS */
+dberr_t
+row_drop_database_for_mysql(
+ const char* name,
+ trx_t* trx,
+ ulint* found)
+{
+ dict_table_t* table;
+ char* table_name;
+ dberr_t err = DB_SUCCESS;
+ ulint namelen = strlen(name);
+ bool is_partition = false;
+
+ ut_ad(found != NULL);
+
+ DBUG_ENTER("row_drop_database_for_mysql");
+
+ DBUG_PRINT("row_drop_database_for_mysql", ("db: '%s'", name));
+
+ ut_a(name != NULL);
+ /* Assert DB name or partition name. */
+ if (name[namelen - 1] == '#') {
+ ut_ad(name[namelen - 2] != '/');
+ is_partition = true;
+ trx->op_info = "dropping partitions";
+ } else {
+ ut_a(name[namelen - 1] == '/');
+ trx->op_info = "dropping database";
+ }
+
+ *found = 0;
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ trx_start_if_not_started_xa(trx, true);
+
+loop:
+ row_mysql_lock_data_dictionary(trx);
+
+ while ((table_name = dict_get_first_table_name_in_db(name))) {
+ /* Drop the parent table if this is an FTS auxiliary
+ table, to avoid accessing dropped FTS auxiliary tables
+ in the INFORMATION_SCHEMA while the parent table still
+ exists. Note: dropping the parent table will drop the
+ FTS auxiliary tables. */
+ char* parent_table_name = NULL;
+ table_id_t table_id;
+ index_id_t index_id;
+
+ if (fts_check_aux_table(
+ table_name, &table_id, &index_id)) {
+ dict_table_t* parent_table = dict_table_open_on_id(
+ table_id, TRUE, DICT_TABLE_OP_NORMAL);
+ if (parent_table != NULL) {
+ parent_table_name = mem_strdupl(
+ parent_table->name.m_name,
+ strlen(parent_table->name.m_name));
+ dict_table_close(parent_table, TRUE, FALSE);
+ }
+ }
+
+ if (parent_table_name != NULL) {
+ ut_free(table_name);
+ table_name = parent_table_name;
+ }
+
+ ut_a(memcmp(table_name, name, namelen) == 0);
+
+ table = dict_table_open_on_name(
+ table_name, TRUE, FALSE, static_cast<dict_err_ignore_t>(
+ DICT_ERR_IGNORE_INDEX_ROOT
+ | DICT_ERR_IGNORE_CORRUPT));
+
+ if (!table) {
+ ib::error() << "Cannot load table " << table_name
+ << " from InnoDB internal data dictionary"
+ " during drop database";
+ ut_free(table_name);
+ err = DB_TABLE_NOT_FOUND;
+ break;
+
+ }
+
+ if (!table->name.is_temporary()) {
+ /* There could be orphan temp tables left from
+ interrupted alter table. Leave them, and handle
+ the rest.*/
+ if (table->can_be_evicted
+ && (name[namelen - 1] != '#')) {
+ ib::warn() << "Orphan table encountered during"
+ " DROP DATABASE. This is possible if '"
+ << table->name << ".frm' was lost.";
+ }
+
+ if (!table->is_readable() && !table->space) {
+ ib::warn() << "Missing .ibd file for table "
+ << table->name << ".";
+ }
+ }
+
+ dict_table_close(table, TRUE, FALSE);
+
+ /* The dict_table_t object must not be accessed before
+ dict_table_open() or after dict_table_close(). But this is OK
+ if we are holding the dict_sys.mutex. */
+ ut_ad(mutex_own(&dict_sys.mutex));
+
+ /* Disable statistics on the found table. */
+ if (!dict_stats_stop_bg(table)) {
+ row_mysql_unlock_data_dictionary(trx);
+
+ os_thread_sleep(250000);
+
+ ut_free(table_name);
+
+ goto loop;
+ }
+
+ /* Wait until MySQL does not have any queries running on
+ the table */
+
+ if (table->get_ref_count() > 0) {
+ row_mysql_unlock_data_dictionary(trx);
+
+ ib::warn() << "MySQL is trying to drop database "
+ << ut_get_name(trx, name) << " though"
+ " there are still open handles to table "
+ << table->name << ".";
+
+ os_thread_sleep(1000000);
+
+ ut_free(table_name);
+
+ goto loop;
+ }
+
+ err = row_drop_table_for_mysql(
+ table_name, trx, SQLCOM_DROP_DB);
+ trx_commit_for_mysql(trx);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ ib::error() << "DROP DATABASE "
+ << ut_get_name(trx, name) << " failed"
+ " with error (" << err << ") for"
+ " table " << ut_get_name(trx, table_name);
+ ut_free(table_name);
+ break;
+ }
+
+ ut_free(table_name);
+ (*found)++;
+ }
+
+ /* Partitioning does not yet support foreign keys. */
+ if (err == DB_SUCCESS && !is_partition) {
+ /* after dropping all tables try to drop all leftover
+ foreign keys in case orphaned ones exist */
+ err = drop_all_foreign_keys_in_db(name, trx);
+
+ if (err != DB_SUCCESS) {
+ const std::string& db = ut_get_name(trx, name);
+ ib::error() << "DROP DATABASE " << db << " failed with"
+ " error " << err << " while dropping all"
+ " foreign keys";
+ }
+ }
+
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->op_info = "";
+
+ DBUG_RETURN(err);
+}
+
+/****************************************************************//**
+Delete a single constraint.
+@return error code or DB_SUCCESS */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_delete_constraint_low(
+/*======================*/
+ const char* id, /*!< in: constraint id */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", id);
+
+ return(que_eval_sql(info,
+ "PROCEDURE DELETE_CONSTRAINT () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n"
+ "DELETE FROM SYS_FOREIGN WHERE ID = :id;\n"
+ "END;\n"
+ , FALSE, trx));
+}
+
+/****************************************************************//**
+Delete a single constraint.
+@return error code or DB_SUCCESS */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_delete_constraint(
+/*==================*/
+ const char* id, /*!< in: constraint id */
+ const char* database_name, /*!< in: database name, with the
+ trailing '/' */
+ mem_heap_t* heap, /*!< in: memory heap */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dberr_t err;
+
+ /* New format constraints have ids <databasename>/<constraintname>. */
+ err = row_delete_constraint_low(
+ mem_heap_strcat(heap, database_name, id), trx);
+
+ if ((err == DB_SUCCESS) && !strchr(id, '/')) {
+ /* Old format < 4.0.18 constraints have constraint ids
+ NUMBER_NUMBER. We only try deleting them if the
+ constraint name does not contain a '/' character, otherwise
+ deleting a new format constraint named 'foo/bar' from
+ database 'baz' would remove constraint 'bar' from database
+ 'foo', if it existed. */
+
+ err = row_delete_constraint_low(id, trx);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Renames a table for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+row_rename_table_for_mysql(
+/*=======================*/
+ const char* old_name, /*!< in: old table name */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx, /*!< in/out: transaction */
+ bool commit, /*!< in: whether to commit trx */
+ bool use_fk) /*!< in: whether to parse and enforce
+ FOREIGN KEY constraints */
+{
+ dict_table_t* table = NULL;
+ dberr_t err = DB_ERROR;
+ mem_heap_t* heap = NULL;
+ const char** constraints_to_drop = NULL;
+ ulint n_constraints_to_drop = 0;
+ ibool old_is_tmp, new_is_tmp;
+ pars_info_t* info = NULL;
+ int retry;
+ bool aux_fts_rename = false;
+ char* is_part = NULL;
+
+ ut_a(old_name != NULL);
+ ut_a(new_name != NULL);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ const bool dict_locked = trx->dict_operation_lock_mode == RW_X_LATCH;
+ ut_ad(!commit || dict_locked);
+
+ if (high_level_read_only) {
+ return(DB_READ_ONLY);
+
+ } else if (row_mysql_is_system_table(new_name)) {
+
+ ib::error() << "Trying to create a MySQL system table "
+ << new_name << " of type InnoDB. MySQL system tables"
+ " must be of the MyISAM type!";
+
+ goto funct_exit;
+ }
+
+ trx->op_info = "renaming table";
+
+ old_is_tmp = dict_table_t::is_temporary_name(old_name);
+ new_is_tmp = dict_table_t::is_temporary_name(new_name);
+
+ table = dict_table_open_on_name(old_name, dict_locked, FALSE,
+ DICT_ERR_IGNORE_FK_NOKEY);
+
+ /* We look for the pattern #P# to see if the table is a
+ partitioned MySQL table. */
+#ifdef __WIN__
+ is_part = strstr((char *)old_name, (char *)"#p#");
+#else
+ is_part = strstr((char *)old_name, (char *)"#P#");
+#endif /* __WIN__ */
+
+ /* MySQL partition engine hard codes the file name
+ separator as "#P#". The text case is fixed even if
+ lower_case_table_names is set to 1 or 2. This is true
+ for sub-partition names as well. InnoDB always
+ normalises file names to lower case on Windows; this
+ can potentially cause problems when copying/moving
+ tables between platforms.
+
+ 1) If we boot against an installation from a Windows
+ platform, its partition table names could all be in
+ lower case in the system tables. So we will need to
+ check the lower-case name when loading the table.
+
+ 2) If on Windows we boot an installation from another,
+ case-sensitive platform, we might need to check for the
+ existence of the table name without lower-casing it in
+ the system tables. */
+ if (!table &&
+ is_part &&
+ innobase_get_lower_case_table_names() == 1) {
+ char par_case_name[MAX_FULL_NAME_LEN + 1];
+#ifndef __WIN__
+ /* Check for the table using lower
+ case name, including the partition
+ separator "P" */
+ memcpy(par_case_name, old_name,
+ strlen(old_name));
+ par_case_name[strlen(old_name)] = 0;
+ innobase_casedn_str(par_case_name);
+#else
+ /* On the Windows platform, check
+ whether a table exists in the
+ system tables whose name has not
+ been normalized to lower case */
+ normalize_table_name_c_low(
+ par_case_name, old_name, FALSE);
+#endif
+ table = dict_table_open_on_name(par_case_name, dict_locked, FALSE,
+ DICT_ERR_IGNORE_FK_NOKEY);
+ }
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+ goto funct_exit;
+
+ } else if (!table->is_readable() && !table->space
+ && !(table->flags2 & DICT_TF2_DISCARDED)) {
+
+ err = DB_TABLE_NOT_FOUND;
+
+ ib::error() << "Table " << old_name << " does not have an .ibd"
+ " file in the database directory. "
+ << TROUBLESHOOTING_MSG;
+
+ goto funct_exit;
+
+ } else if (use_fk && !old_is_tmp && new_is_tmp) {
+ /* MySQL is doing an ALTER TABLE command and it renames the
+ original table to a temporary table name. We want to preserve
+ the original foreign key constraint definitions despite the
+ name change. An exception is those constraints for which
+ the ALTER TABLE contained DROP FOREIGN KEY <foreign key id>.*/
+
+ heap = mem_heap_create(100);
+
+ err = dict_foreign_parse_drop_constraints(
+ heap, trx, table, &n_constraints_to_drop,
+ &constraints_to_drop);
+
+ if (err != DB_SUCCESS) {
+ goto funct_exit;
+ }
+ }
+
+ /* Is a foreign key check running on this table? */
+ for (retry = 0; retry < 100
+ && table->n_foreign_key_checks_running > 0; ++retry) {
+ row_mysql_unlock_data_dictionary(trx);
+ os_thread_yield();
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ if (table->n_foreign_key_checks_running > 0) {
+ ib::error() << "In ALTER TABLE "
+ << ut_get_name(trx, old_name)
+ << " a FOREIGN KEY check is running. Cannot rename"
+ " table.";
+ err = DB_TABLE_IN_FK_CHECK;
+ goto funct_exit;
+ }
+
+ if (!table->is_temporary()) {
+ if (commit) {
+ dict_stats_wait_bg_to_stop_using_table(table, trx);
+ }
+
+ err = trx_undo_report_rename(trx, table);
+
+ if (err != DB_SUCCESS) {
+ goto funct_exit;
+ }
+ }
+
+ /* We use the private SQL parser of Innobase to generate the query
+ graphs needed in updating the dictionary data from system tables. */
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_table_name", new_name);
+ pars_info_add_str_literal(info, "old_table_name", old_name);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENAME_TABLE () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES"
+ " SET NAME = :new_table_name\n"
+ " WHERE NAME = :old_table_name;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ /* Assume the caller guarantees destination name doesn't exist. */
+ ut_ad(err != DB_DUPLICATE_KEY);
+
+ /* SYS_TABLESPACES and SYS_DATAFILES need to be updated if
+ the table is in a single-table tablespace. */
+ if (err != DB_SUCCESS || !dict_table_is_file_per_table(table)) {
+ } else if (table->space) {
+ /* If the old and new names are in the same database, only
+ the table name has changed, so the new path can be derived
+ from the existing one; otherwise the database holding the
+ table has changed and the complete filepath must be built
+ anew. */
+ char* new_path = dict_tables_have_same_db(old_name, new_name)
+ ? os_file_make_new_pathname(
+ table->space->chain.start->name, new_name)
+ : fil_make_filepath(NULL, new_name, IBD, false);
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_table_name", new_name);
+ pars_info_add_str_literal(info, "new_path_name", new_path);
+ pars_info_add_int4_literal(info, "space_id", table->space_id);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENAME_SPACE () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLESPACES"
+ " SET NAME = :new_table_name\n"
+ " WHERE SPACE = :space_id;\n"
+ "UPDATE SYS_DATAFILES"
+ " SET PATH = :new_path_name\n"
+ " WHERE SPACE = :space_id;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ ut_free(new_path);
+ }
+ if (err != DB_SUCCESS) {
+ goto err_exit;
+ }
+
+ if (!new_is_tmp) {
+ /* Rename all constraints. */
+ char new_table_name[MAX_TABLE_NAME_LEN + 1];
+ char old_table_utf8[MAX_TABLE_NAME_LEN + 1];
+ uint errors = 0;
+
+ strncpy(old_table_utf8, old_name, MAX_TABLE_NAME_LEN);
+ old_table_utf8[MAX_TABLE_NAME_LEN] = '\0';
+ innobase_convert_to_system_charset(
+ strchr(old_table_utf8, '/') + 1,
+ strchr(old_name, '/') +1,
+ MAX_TABLE_NAME_LEN, &errors);
+
+ if (errors) {
+ /* Table name could not be converted from charset
+ my_charset_filename to UTF-8. This means that the
+ table name is already in UTF-8 (#mysql50#). */
+ strncpy(old_table_utf8, old_name, MAX_TABLE_NAME_LEN);
+ old_table_utf8[MAX_TABLE_NAME_LEN] = '\0';
+ }
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_table_name", new_name);
+ pars_info_add_str_literal(info, "old_table_name", old_name);
+ pars_info_add_str_literal(info, "old_table_name_utf8",
+ old_table_utf8);
+
+ strncpy(new_table_name, new_name, MAX_TABLE_NAME_LEN);
+ new_table_name[MAX_TABLE_NAME_LEN] = '\0';
+ innobase_convert_to_system_charset(
+ strchr(new_table_name, '/') + 1,
+ strchr(new_name, '/') +1,
+ MAX_TABLE_NAME_LEN, &errors);
+
+ if (errors) {
+ /* Table name could not be converted from charset
+ my_charset_filename to UTF-8. This means that the
+ table name is already in UTF-8 (#mysql50#). */
+ strncpy(new_table_name, new_name, MAX_TABLE_NAME_LEN);
+ new_table_name[MAX_TABLE_NAME_LEN] = '\0';
+ }
+
+ pars_info_add_str_literal(info, "new_table_utf8", new_table_name);
+
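+ /* Illustrative example: when renaming test/t1 to test/t2,
+ a generated constraint ID such as 'test/t1_ibfk_1' is
+ rewritten to 'test/t2_ibfk_1' by the gen_constr_prefix
+ branch below, while an explicitly named constraint only
+ has its database prefix replaced. */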
+ err = que_eval_sql(
+ info,
+ "PROCEDURE RENAME_CONSTRAINT_IDS () IS\n"
+ "gen_constr_prefix CHAR;\n"
+ "new_db_name CHAR;\n"
+ "foreign_id CHAR;\n"
+ "new_foreign_id CHAR;\n"
+ "old_db_name_len INT;\n"
+ "old_t_name_len INT;\n"
+ "new_db_name_len INT;\n"
+ "id_len INT;\n"
+ "offset INT;\n"
+ "found INT;\n"
+ "BEGIN\n"
+ "found := 1;\n"
+ "old_db_name_len := INSTR(:old_table_name, '/')-1;\n"
+ "new_db_name_len := INSTR(:new_table_name, '/')-1;\n"
+ "new_db_name := SUBSTR(:new_table_name, 0,\n"
+ " new_db_name_len);\n"
+ "old_t_name_len := LENGTH(:old_table_name);\n"
+ "gen_constr_prefix := CONCAT(:old_table_name_utf8,\n"
+ " '_ibfk_');\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO foreign_id\n"
+ " FROM SYS_FOREIGN\n"
+ " WHERE FOR_NAME = :old_table_name\n"
+ " AND TO_BINARY(FOR_NAME)\n"
+ " = TO_BINARY(:old_table_name)\n"
+ " LOCK IN SHARE MODE;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " UPDATE SYS_FOREIGN\n"
+ " SET FOR_NAME = :new_table_name\n"
+ " WHERE ID = foreign_id;\n"
+ " id_len := LENGTH(foreign_id);\n"
+ " IF (INSTR(foreign_id, '/') > 0) THEN\n"
+ " IF (INSTR(foreign_id,\n"
+ " gen_constr_prefix) > 0)\n"
+ " THEN\n"
+ " offset := INSTR(foreign_id, '_ibfk_') - 1;\n"
+ " new_foreign_id :=\n"
+ " CONCAT(:new_table_utf8,\n"
+ " SUBSTR(foreign_id, offset,\n"
+ " id_len - offset));\n"
+ " ELSE\n"
+ " new_foreign_id :=\n"
+ " CONCAT(new_db_name,\n"
+ " SUBSTR(foreign_id,\n"
+ " old_db_name_len,\n"
+ " id_len - old_db_name_len));\n"
+ " END IF;\n"
+ " UPDATE SYS_FOREIGN\n"
+ " SET ID = new_foreign_id\n"
+ " WHERE ID = foreign_id;\n"
+ " UPDATE SYS_FOREIGN_COLS\n"
+ " SET ID = new_foreign_id\n"
+ " WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "UPDATE SYS_FOREIGN SET REF_NAME = :new_table_name\n"
+ "WHERE REF_NAME = :old_table_name\n"
+ " AND TO_BINARY(REF_NAME)\n"
+ " = TO_BINARY(:old_table_name);\n"
+ "END;\n"
+ , FALSE, trx);
+
+ } else if (n_constraints_to_drop > 0) {
+ /* Drop some constraints of tmp tables. */
+
+ ulint db_name_len = dict_get_db_name_len(old_name) + 1;
+ char* db_name = mem_heap_strdupl(heap, old_name,
+ db_name_len);
+ ulint i;
+
+ for (i = 0; i < n_constraints_to_drop; i++) {
+ err = row_delete_constraint(constraints_to_drop[i],
+ db_name, heap, trx);
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ }
+ }
+
+ if (err == DB_SUCCESS
+ && (dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID))
+ && !dict_tables_have_same_db(old_name, new_name)) {
+ err = fts_rename_aux_tables(table, new_name, trx);
+ if (err != DB_TABLE_NOT_FOUND) {
+ aux_fts_rename = true;
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+err_exit:
+ if (err == DB_DUPLICATE_KEY) {
+ ib::error() << "Possible reasons:";
+ ib::error() << "(1) Table rename would cause two"
+ " FOREIGN KEY constraints to have the same"
+ " internal name in case-insensitive"
+ " comparison.";
+ ib::error() << "(2) Table "
+ << ut_get_name(trx, new_name)
+ << " exists in the InnoDB internal data"
+ " dictionary though MySQL is trying to rename"
+ " table " << ut_get_name(trx, old_name)
+ << " to it. Have you deleted the .frm file and"
+ " not used DROP TABLE?";
+ ib::info() << TROUBLESHOOTING_MSG;
+ ib::error() << "If table "
+ << ut_get_name(trx, new_name)
+ << " is a temporary table #sql..., then"
+ " it can be that there are still queries"
+ " running on the table, and it will be dropped"
+ " automatically when the queries end. You can"
+ " drop the orphaned table inside InnoDB by"
+ " creating an InnoDB table with the same name"
+ " in another database and copying the .frm file"
+ " to the current database. Then MySQL thinks"
+ " the table exists, and DROP TABLE will"
+ " succeed.";
+ }
+ trx->error_state = DB_SUCCESS;
+ trx->rollback();
+ trx->error_state = DB_SUCCESS;
+ } else {
+ /* The following call will also rename the .ibd data file if
+ the table is stored in a single-table tablespace */
+
+ err = dict_table_rename_in_cache(
+ table, new_name, !new_is_tmp);
+ if (err != DB_SUCCESS) {
+ trx->error_state = DB_SUCCESS;
+ trx->rollback();
+ trx->error_state = DB_SUCCESS;
+ goto funct_exit;
+ }
+
+ /* In case of copy alter, the template db_name and
+ table_name should be renamed only for the newly
+ created table. */
+ if (table->vc_templ != NULL && !new_is_tmp) {
+ innobase_rename_vc_templ(table);
+ }
+
+ /* We only want to switch off some of the type checking in
+ an ALTER TABLE, not in a RENAME. */
+ dict_names_t fk_tables;
+
+ err = dict_load_foreigns(
+ new_name, NULL, false,
+ !old_is_tmp || trx->check_foreigns,
+ use_fk
+ ? DICT_ERR_IGNORE_NONE
+ : DICT_ERR_IGNORE_FK_NOKEY,
+ fk_tables);
+
+ if (err != DB_SUCCESS) {
+
+ if (old_is_tmp) {
+ /* In case of copy alter, ignore the
+ loading of foreign key constraints
+ when foreign_key_checks is disabled */
+ ib::error_or_warn(trx->check_foreigns)
+ << "In ALTER TABLE "
+ << ut_get_name(trx, new_name)
+ << " has or is referenced in foreign"
+ " key constraints which are not"
+ " compatible with the new table"
+ " definition.";
+ if (!trx->check_foreigns) {
+ err = DB_SUCCESS;
+ goto funct_exit;
+ }
+ } else {
+ ib::error() << "In RENAME TABLE table "
+ << ut_get_name(trx, new_name)
+ << " is referenced in foreign key"
+ " constraints which are not compatible"
+ " with the new table definition.";
+ }
+
+ trx->error_state = DB_SUCCESS;
+ trx->rollback();
+ trx->error_state = DB_SUCCESS;
+ }
+
+ /* Check whether virtual column or stored column affects
+ the foreign key constraint of the table. */
+ if (dict_foreigns_has_s_base_col(
+ table->foreign_set, table)) {
+ err = DB_NO_FK_ON_S_BASE_COL;
+ ut_a(DB_SUCCESS == dict_table_rename_in_cache(
+ table, old_name, FALSE));
+ trx->error_state = DB_SUCCESS;
+ trx->rollback();
+ trx->error_state = DB_SUCCESS;
+ goto funct_exit;
+ }
+
+ /* Fill the virtual column set in the foreign key
+ constraints when the table undergoes a copy alter operation. */
+ dict_mem_table_free_foreign_vcol_set(table);
+ dict_mem_table_fill_foreign_vcol_set(table);
+
+ while (!fk_tables.empty()) {
+ dict_load_table(fk_tables.front(),
+ DICT_ERR_IGNORE_NONE);
+ fk_tables.pop_front();
+ }
+
+ table->data_dir_path= NULL;
+ }
+
+funct_exit:
+ if (aux_fts_rename && err != DB_SUCCESS
+ && table != NULL && (table->space != 0)) {
+
+ char* orig_name = table->name.m_name;
+ trx_t* trx_bg = trx_create();
+
+ /* If the first fts_rename fails, the trx will have
+ been rolled back and committed; we cannot use it any
+ more, so we have to start a new background trx here. */
+ ut_a(trx_state_eq(trx_bg, TRX_STATE_NOT_STARTED));
+ trx_bg->op_info = "Revert the failing rename "
+ "for fts aux tables";
+ trx_bg->dict_operation_lock_mode = RW_X_LATCH;
+ trx_start_for_ddl(trx_bg, TRX_DICT_OP_TABLE);
+
+ /* If the rename fails and the table has its own
+ tablespace, we need to call fts_rename_aux_tables again
+ to revert the .ibd file rename, which is not under the
+ control of trx. Also note that the parent table name in
+ the cache has not been changed yet. If the reverting
+ fails, the .ibd data may be left in the new database,
+ which can be fixed only manually. */
+ table->name.m_name = const_cast<char*>(new_name);
+ fts_rename_aux_tables(table, old_name, trx_bg);
+ table->name.m_name = orig_name;
+
+ trx_bg->dict_operation_lock_mode = 0;
+ trx_commit_for_mysql(trx_bg);
+ trx_bg->free();
+ }
+
+ if (table != NULL) {
+ if (commit && !table->is_temporary()) {
+ table->stats_bg_flag &= byte(~BG_STAT_SHOULD_QUIT);
+ }
+ dict_table_close(table, dict_locked, FALSE);
+ }
+
+ if (commit) {
+ DEBUG_SYNC(trx->mysql_thd, "before_rename_table_commit");
+ trx_commit_for_mysql(trx);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Scans an index for either COUNT(*) or CHECK TABLE.
+For CHECK TABLE, checks that the index contains entries in ascending order,
+that the unique constraint is not broken, and calculates the number of index
+entries in the read view of the current transaction.
+@return DB_SUCCESS or other error */
+dberr_t
+row_scan_index_for_mysql(
+/*=====================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct
+ in MySQL handle */
+ const dict_index_t* index, /*!< in: index */
+ ulint* n_rows) /*!< out: number of entries
+ seen in the consistent read */
+{
+ dtuple_t* prev_entry = NULL;
+ ulint matched_fields;
+ byte* buf;
+ dberr_t ret;
+ rec_t* rec;
+ int cmp;
+ ibool contains_null;
+ ulint i;
+ ulint cnt;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets;
+ rec_offs_init(offsets_);
+
+ *n_rows = 0;
+
+ /* Don't support RTree Leaf level scan */
+ ut_ad(!dict_index_is_spatial(index));
+
+ if (dict_index_is_clust(index)) {
+ /* The clustered index of a table is always available.
+ During online ALTER TABLE that rebuilds the table, the
+ clustered index in the old table will have
+ index->online_log pointing to the new table. All
+ indexes of the old table will remain valid and the new
+		table will be inaccessible to MySQL until the
+ completion of the ALTER TABLE. */
+ } else if (dict_index_is_online_ddl(index)
+ || (index->type & DICT_FTS)) {
+		/* Full-text indexes are implemented by auxiliary tables,
+		not by the B-tree. We also skip secondary indexes that
+		are being created online. */
+ return(DB_SUCCESS);
+ }
+
+ ulint bufsize = std::max<ulint>(srv_page_size,
+ prebuilt->mysql_row_len);
+ buf = static_cast<byte*>(ut_malloc_nokey(bufsize));
+ heap = mem_heap_create(100);
+
+ cnt = 1000;
+
+ ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0);
+loop:
+ /* Check thd->killed every 1,000 scanned rows */
+ if (--cnt == 0) {
+ if (trx_is_interrupted(prebuilt->trx)) {
+ ret = DB_INTERRUPTED;
+ goto func_exit;
+ }
+ cnt = 1000;
+ }
+
+ switch (ret) {
+ case DB_SUCCESS:
+ break;
+ case DB_DEADLOCK:
+ case DB_LOCK_TABLE_FULL:
+ case DB_LOCK_WAIT_TIMEOUT:
+ case DB_INTERRUPTED:
+ goto func_exit;
+ default:
+ ib::warn() << "CHECK TABLE on index " << index->name << " of"
+ " table " << index->table->name << " returned " << ret;
+ /* (this error is ignored by CHECK TABLE) */
+ /* fall through */
+ case DB_END_OF_INDEX:
+ ret = DB_SUCCESS;
+func_exit:
+ ut_free(buf);
+ mem_heap_free(heap);
+
+ return(ret);
+ }
+
+ *n_rows = *n_rows + 1;
+
+ /* else this code is doing handler::check() for CHECK TABLE */
+
+	/* row_search... returns the index record in buf; the offset of the
+	record origin within buf is stored in the first 4 bytes, because we
+	have built a dummy template */
+
+ rec = buf + mach_read_from_4(buf);
+
+ offsets = rec_get_offsets(rec, index, offsets_, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (prev_entry != NULL) {
+ matched_fields = 0;
+
+ cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets,
+ &matched_fields);
+ contains_null = FALSE;
+
+ /* In a unique secondary index we allow equal key values if
+ they contain SQL NULLs */
+
+ for (i = 0;
+ i < dict_index_get_n_ordering_defined_by_user(index);
+ i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(prev_entry, i))) {
+
+ contains_null = TRUE;
+ break;
+ }
+ }
+
+ const char* msg;
+
+ if (cmp > 0) {
+ ret = DB_INDEX_CORRUPT;
+			msg = "index records in the wrong order in ";
+not_ok:
+ ib::error()
+ << msg << index->name
+ << " of table " << index->table->name
+ << ": " << *prev_entry << ", "
+ << rec_offsets_print(rec, offsets);
+ /* Continue reading */
+ } else if (dict_index_is_unique(index)
+ && !contains_null
+ && matched_fields
+ >= dict_index_get_n_ordering_defined_by_user(
+ index)) {
+ ret = DB_DUPLICATE_KEY;
+ msg = "duplicate key in ";
+ goto not_ok;
+ }
+ }
+
+ {
+ mem_heap_t* tmp_heap = NULL;
+
+ /* Empty the heap on each round. But preserve offsets[]
+ for the row_rec_to_index_entry() call, by copying them
+ into a separate memory heap when needed. */
+ if (UNIV_UNLIKELY(offsets != offsets_)) {
+ ulint size = rec_offs_get_n_alloc(offsets)
+ * sizeof *offsets;
+
+ tmp_heap = mem_heap_create(size);
+
+ offsets = static_cast<rec_offs*>(
+ mem_heap_dup(tmp_heap, offsets, size));
+ }
+
+ mem_heap_empty(heap);
+
+ prev_entry = row_rec_to_index_entry(
+ rec, index, offsets, heap);
+
+ if (UNIV_LIKELY_NULL(tmp_heap)) {
+ mem_heap_free(tmp_heap);
+ }
+ }
+
+ ret = row_search_for_mysql(
+ buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT);
+
+ goto loop;
+}
+
+/*********************************************************************//**
+Initialize this module */
+void
+row_mysql_init(void)
+/*================*/
+{
+ mutex_create(LATCH_ID_ROW_DROP_LIST, &row_drop_list_mutex);
+
+ UT_LIST_INIT(
+ row_mysql_drop_list,
+ &row_mysql_drop_t::row_mysql_drop_list);
+
+ row_mysql_drop_list_inited = true;
+}
+
+void row_mysql_close()
+{
+ ut_ad(!UT_LIST_GET_LEN(row_mysql_drop_list) ||
+ srv_force_recovery >= SRV_FORCE_NO_BACKGROUND);
+ if (row_mysql_drop_list_inited)
+ {
+ row_mysql_drop_list_inited= false;
+ mutex_free(&row_drop_list_mutex);
+
+ while (row_mysql_drop_t *drop= UT_LIST_GET_FIRST(row_mysql_drop_list))
+ {
+ UT_LIST_REMOVE(row_mysql_drop_list, drop);
+ ut_free(drop);
+ }
+ }
+}
diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc
new file mode 100644
index 00000000..a0600801
--- /dev/null
+++ b/storage/innobase/row/row0purge.cc
@@ -0,0 +1,1221 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0purge.cc
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0purge.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "dict0stats.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0vers.h"
+#include "row0mysql.h"
+#include "row0log.h"
+#include "log0log.h"
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "handler.h"
+#include "ha_innodb.h"
+#include "fil0fil.h"
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before performing that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module, make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
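+
+/* A sketch of the calling pattern described above (informational only; it
+mirrors functions such as row_purge_remove_clust_if_poss_low() below):
+
+	log_free_check();	// reserve redo log space while holding
+				// no synchronization objects
+	mtr.start();		// only then start the mini-transaction
+	...			// latch pages and generate redo
+	mtr.commit();
+*/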
+
+/***********************************************************//**
+Repositions the pcur in the purge node on the clustered index record,
+if found. If the record is not found, close pcur.
+@return TRUE if the record was found */
+static
+ibool
+row_purge_reposition_pcur(
+/*======================*/
+ ulint mode, /*!< in: latching mode */
+ purge_node_t* node, /*!< in: row purge node */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (node->found_clust) {
+ ut_ad(node->validate_pcur());
+
+ node->found_clust = btr_pcur_restore_position(mode, &node->pcur, mtr);
+
+ } else {
+ node->found_clust = row_search_on_row_ref(
+ &node->pcur, mode, node->table, node->ref, mtr);
+
+ if (node->found_clust) {
+ btr_pcur_store_position(&node->pcur, mtr);
+ }
+ }
+
+ /* Close the current cursor if we fail to position it correctly. */
+ if (!node->found_clust) {
+ btr_pcur_close(&node->pcur);
+ }
+
+ return(node->found_clust);
+}
+
+/***********************************************************//**
+Removes a delete marked clustered index record if possible.
+@retval true if the row was not found, or it was successfully removed
+@retval false if the row was modified after the delete marking */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+row_purge_remove_clust_if_poss_low(
+/*===============================*/
+ purge_node_t* node, /*!< in/out: row purge node */
+ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ dict_index_t* index = dict_table_get_first_index(node->table);
+
+ log_free_check();
+
+ mtr_t mtr;
+ mtr.start();
+ index->set_modified(mtr);
+
+ if (!row_purge_reposition_pcur(mode, node, &mtr)) {
+ /* The record was already removed. */
+ mtr.commit();
+ return true;
+ }
+
+ rec_t* rec = btr_pcur_get_rec(&node->pcur);
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+ mem_heap_t* heap = NULL;
+ rec_offs* offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ bool success = true;
+
+ if (node->roll_ptr != row_get_rec_roll_ptr(rec, index, offsets)) {
+ /* Someone else has modified the record later: do not remove */
+ goto func_exit;
+ }
+
+ ut_ad(rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(row_get_rec_trx_id(rec, index, offsets));
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(
+ btr_pcur_get_btr_cur(&node->pcur), 0, &mtr);
+ } else {
+ dberr_t err;
+ ut_ad(mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE));
+ btr_cur_pessimistic_delete(
+ &err, FALSE, btr_pcur_get_btr_cur(&node->pcur), 0,
+ false, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ success = false;
+ break;
+ default:
+ ut_error;
+ }
+ }
+
+func_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ /* Persistent cursor is closed if reposition fails. */
+ if (node->found_clust) {
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+ } else {
+ mtr_commit(&mtr);
+ }
+
+ return(success);
+}
+
+/***********************************************************//**
+Removes a clustered index record if it has not been modified after the delete
+marking.
+@retval true if the row was not found, or it was successfully removed
+@retval false the purge needs to be suspended because of running out
+of file space. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+row_purge_remove_clust_if_poss(
+/*===========================*/
+ purge_node_t* node) /*!< in/out: row purge node */
+{
+ if (row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF)) {
+ return(true);
+ }
+
+ for (ulint n_tries = 0;
+ n_tries < BTR_CUR_RETRY_DELETE_N_TIMES;
+ n_tries++) {
+ if (row_purge_remove_clust_if_poss_low(
+ node, BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE)) {
+ return(true);
+ }
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+ }
+
+ return(false);
+}
+
+/** Determines if it is possible to remove a secondary index entry.
+Removal is possible if the secondary index entry does not refer to any
+not delete marked version of a clustered index record where DB_TRX_ID
+is newer than the purge view.
+
+NOTE: This function should only be called by the purge thread, only
+while holding a latch on the leaf page of the secondary index entry
+(or keeping the buffer pool watch on the page). It is possible that
+this function first returns true and then false, if a user transaction
+inserts a record that the secondary index entry would refer to.
+However, in that case, the user transaction would also re-insert the
+secondary index entry after purge has removed it and released the leaf
+page latch.
+@param[in,out] node row purge node
+@param[in] index secondary index
+@param[in] entry secondary index entry
+@param[in,out] sec_pcur secondary index cursor or NULL
+ if it is called for purge buffering
+ operation.
+@param[in,out] sec_mtr mini-transaction which holds
+ secondary index entry or NULL if it is
+ called for purge buffering operation.
+@param[in] is_tree true=pessimistic purge,
+ false=optimistic (leaf-page only)
+@return true if the secondary index record can be purged */
+bool
+row_purge_poss_sec(
+ purge_node_t* node,
+ dict_index_t* index,
+ const dtuple_t* entry,
+ btr_pcur_t* sec_pcur,
+ mtr_t* sec_mtr,
+ bool is_tree)
+{
+ bool can_delete;
+ mtr_t mtr;
+
+ ut_ad(!dict_index_is_clust(index));
+
+ mtr_start(&mtr);
+
+ can_delete = !row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr)
+ || !row_vers_old_has_index_entry(true,
+ btr_pcur_get_rec(&node->pcur),
+ &mtr, index, entry,
+ node->roll_ptr, node->trx_id);
+
+ /* Persistent cursor is closed if reposition fails. */
+ if (node->found_clust) {
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+ } else {
+ mtr.commit();
+ }
+
+ ut_ad(mtr.has_committed());
+
+ return can_delete;
+}
+
+/***************************************************************
+Removes a secondary index entry if possible, by modifying the
+index tree. Does not try to buffer the delete.
+@return TRUE if success or if not found */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+ibool
+row_purge_remove_sec_if_poss_tree(
+/*==============================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry) /*!< in: index entry */
+{
+ btr_pcur_t pcur;
+ ibool success = TRUE;
+ dberr_t err;
+ mtr_t mtr;
+ enum row_search_result search_result;
+
+ log_free_check();
+ mtr.start();
+ index->set_modified(mtr);
+
+ if (!index->is_committed()) {
+ /* The index->online_status may change if the index is
+ or was being created online, but not committed yet. It
+ is protected by index->lock. */
+ mtr_sx_lock_index(index, &mtr);
+
+ if (dict_index_is_online_ddl(index)) {
+ /* Online secondary index creation will not
+ copy any delete-marked records. Therefore
+ there is nothing to be purged. We must also
+ skip the purge when a completed index is
+ dropped by rollback_inplace_alter_table(). */
+ goto func_exit_no_pcur;
+ }
+ } else {
+ /* For secondary indexes,
+ index->online_status==ONLINE_INDEX_COMPLETE if
+ index->is_committed(). */
+ ut_ad(!dict_index_is_online_ddl(index));
+ }
+
+ search_result = row_search_index_entry(
+ index, entry,
+ BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+ &pcur, &mtr);
+
+ switch (search_result) {
+ case ROW_NOT_FOUND:
+ /* Not found. This is a legitimate condition. In a
+ rollback, InnoDB will remove secondary recs that would
+ be purged anyway. Then the actual purge will not find
+ the secondary index record. Also, the purge itself is
+ eager: if it comes to consider a secondary index
+ record, and notices it does not need to exist in the
+ index, it will remove it. Then if/when the purge
+ comes to consider the secondary index record a second
+ time, it will not exist any more in the index. */
+
+ /* fputs("PURGE:........sec entry not found\n", stderr); */
+ /* dtuple_print(stderr, entry); */
+ goto func_exit;
+ case ROW_FOUND:
+ break;
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
+ }
+
+ /* We should remove the index record if no later version of the row,
+	which cannot be purged yet, requires its existence. If some version
+	requires it, we should do nothing. */
+
+ if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, true)) {
+
+ /* Remove the index record, which should have been
+ marked for deletion. */
+ if (!rec_get_deleted_flag(btr_cur_get_rec(
+ btr_pcur_get_btr_cur(&pcur)),
+ dict_table_is_comp(index->table))) {
+ ib::error()
+ << "tried to purge non-delete-marked record"
+ " in index " << index->name
+ << " of table " << index->table->name
+ << ": tuple: " << *entry
+ << ", record: " << rec_index_print(
+ btr_cur_get_rec(
+ btr_pcur_get_btr_cur(&pcur)),
+ index);
+
+ ut_ad(0);
+
+ goto func_exit;
+ }
+
+ btr_cur_pessimistic_delete(&err, FALSE,
+ btr_pcur_get_btr_cur(&pcur),
+ 0, false, &mtr);
+ switch (UNIV_EXPECT(err, DB_SUCCESS)) {
+ case DB_SUCCESS:
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ success = FALSE;
+ break;
+ default:
+ ut_error;
+ }
+ }
+
+func_exit:
+ btr_pcur_close(&pcur); // FIXME: need this?
+func_exit_no_pcur:
+ mtr.commit();
+
+ return(success);
+}
+
+/***************************************************************
+Removes a secondary index entry without modifying the index tree,
+if possible.
+@retval true if success or if not found
+@retval false if row_purge_remove_sec_if_poss_tree() should be invoked */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+row_purge_remove_sec_if_poss_leaf(
+/*==============================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry) /*!< in: index entry */
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ enum btr_latch_mode mode;
+ enum row_search_result search_result;
+ bool success = true;
+
+ log_free_check();
+ ut_ad(index->table == node->table);
+ ut_ad(!index->table->is_temporary());
+ mtr.start();
+ index->set_modified(mtr);
+
+ if (!index->is_committed()) {
+ /* For uncommitted spatial index, we also skip the purge. */
+ if (dict_index_is_spatial(index)) {
+ goto func_exit_no_pcur;
+ }
+
+		/* The index->online_status may change if the
+ index is or was being created online, but not
+ committed yet. It is protected by index->lock. */
+ mtr_s_lock_index(index, &mtr);
+
+ if (dict_index_is_online_ddl(index)) {
+ /* Online secondary index creation will not
+ copy any delete-marked records. Therefore
+ there is nothing to be purged. We must also
+ skip the purge when a completed index is
+ dropped by rollback_inplace_alter_table(). */
+ goto func_exit_no_pcur;
+ }
+
+ mode = BTR_PURGE_LEAF_ALREADY_S_LATCHED;
+ } else {
+ /* For secondary indexes,
+ index->online_status==ONLINE_INDEX_COMPLETE if
+ index->is_committed(). */
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ /* Change buffering is disabled for spatial index and
+ virtual index. */
+ mode = (dict_index_is_spatial(index)
+ || dict_index_has_virtual(index))
+ ? BTR_MODIFY_LEAF
+ : BTR_PURGE_LEAF;
+ }
+
+ /* Set the purge node for the call to row_purge_poss_sec(). */
+ pcur.btr_cur.purge_node = node;
+ if (dict_index_is_spatial(index)) {
+ rw_lock_sx_lock(dict_index_get_lock(index));
+ pcur.btr_cur.thr = NULL;
+ } else {
+ /* Set the query thread, so that ibuf_insert_low() will be
+ able to invoke thd_get_trx(). */
+ pcur.btr_cur.thr = static_cast<que_thr_t*>(
+ que_node_get_parent(node));
+ }
+
+ search_result = row_search_index_entry(
+ index, entry, mode, &pcur, &mtr);
+
+ if (dict_index_is_spatial(index)) {
+ rw_lock_sx_unlock(dict_index_get_lock(index));
+ }
+
+ switch (search_result) {
+ case ROW_FOUND:
+ /* Before attempting to purge a record, check
+ if it is safe to do so. */
+ if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, false)) {
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ /* Only delete-marked records should be purged. */
+ if (!rec_get_deleted_flag(
+ btr_cur_get_rec(btr_cur),
+ dict_table_is_comp(index->table))) {
+
+ ib::error()
+ << "tried to purge non-delete-marked"
+					" record in index " << index->name
+ << " of table " << index->table->name
+ << ": tuple: " << *entry
+ << ", record: "
+ << rec_index_print(
+ btr_cur_get_rec(btr_cur),
+ index);
+ ut_ad(0);
+
+ btr_pcur_close(&pcur);
+
+ goto func_exit_no_pcur;
+ }
+
+ if (index->is_spatial()) {
+ const buf_block_t* block = btr_cur_get_block(
+ btr_cur);
+
+ if (block->page.id().page_no()
+ != index->page
+ && page_get_n_recs(block->frame) < 2
+ && !lock_test_prdt_page_lock(
+ btr_cur->rtr_info
+ && btr_cur->rtr_info->thr
+ ? thr_get_trx(
+ btr_cur->rtr_info->thr)
+ : nullptr,
+ block->page.id())) {
+					/* this is the last record on the
+					page, and it has a "page" lock on it,
+					which means a search is still
+					depending on it, so do not delete */
+ DBUG_LOG("purge",
+ "skip purging last"
+ " record on page "
+ << block->page.id());
+
+ btr_pcur_close(&pcur);
+ mtr.commit();
+ return(success);
+ }
+ }
+
+ if (!btr_cur_optimistic_delete(btr_cur, 0, &mtr)) {
+
+ /* The index entry could not be deleted. */
+ success = false;
+ }
+ }
+
+ /* (The index entry is still needed,
+ or the deletion succeeded) */
+ /* fall through */
+ case ROW_NOT_DELETED_REF:
+ /* The index entry is still needed. */
+ case ROW_BUFFERED:
+ /* The deletion was buffered. */
+ case ROW_NOT_FOUND:
+ /* The index entry does not exist, nothing to do. */
+ btr_pcur_close(&pcur); // FIXME: do we need these? when is btr_cur->rtr_info set?
+func_exit_no_pcur:
+ mtr.commit();
+ return(success);
+ }
+
+ ut_error;
+ return(false);
+}
+
+/***********************************************************//**
+Removes a secondary index entry if possible. */
+UNIV_INLINE MY_ATTRIBUTE((nonnull(1,2)))
+void
+row_purge_remove_sec_if_poss(
+/*=========================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry) /*!< in: index entry */
+{
+ ibool success;
+ ulint n_tries = 0;
+
+ /* fputs("Purge: Removing secondary record\n", stderr); */
+
+ if (!entry) {
+ /* The node->row must have lacked some fields of this
+ index. This is possible when the undo log record was
+ written before this index was created. */
+ return;
+ }
+
+ if (row_purge_remove_sec_if_poss_leaf(node, index, entry)) {
+
+ return;
+ }
+retry:
+ success = row_purge_remove_sec_if_poss_tree(node, index, entry);
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ ut_a(success);
+}
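+
+/* Informational note: row_purge_remove_clust_if_poss() and
+row_purge_remove_sec_if_poss() share the same pattern: first try an
+optimistic delete that modifies only a leaf page, and only if that fails
+fall back to a pessimistic delete that may modify the tree structure,
+retrying up to BTR_CUR_RETRY_DELETE_N_TIMES with
+os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME) between attempts. */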
+
+/** Skip uncommitted virtual indexes on newly added virtual columns.
+@param[in,out] index dict index object */
+static
+inline
+void
+row_purge_skip_uncommitted_virtual_index(
+ dict_index_t*& index)
+{
+	/* We need to skip virtual indexes which are not
+	committed yet. This is safe because these indexes are
+	newly created by ALTER TABLE, and because we do
+	not support LOCK=NONE when adding an index on a newly
+	added virtual column. */
+ while (index != NULL && dict_index_has_virtual(index)
+ && !index->is_committed() && index->has_new_v_col()) {
+ index = dict_table_get_next_index(index);
+ }
+}
+
+/***********************************************************//**
+Purges a delete marking of a record.
+@retval true if the row was not found, or it was successfully removed
+@retval false the purge needs to be suspended because of
+running out of file space */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+row_purge_del_mark(
+/*===============*/
+ purge_node_t* node) /*!< in/out: row purge node */
+{
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ /* skip corrupted secondary index */
+ dict_table_skip_corrupt_index(node->index);
+
+ row_purge_skip_uncommitted_virtual_index(node->index);
+
+ if (!node->index) {
+ break;
+ }
+
+ if (node->index->type != DICT_FTS) {
+ dtuple_t* entry = row_build_index_entry_low(
+ node->row, NULL, node->index,
+ heap, ROW_BUILD_FOR_PURGE);
+ row_purge_remove_sec_if_poss(node, node->index, entry);
+ mem_heap_empty(heap);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(row_purge_remove_clust_if_poss(node));
+}
+
+/** Reset DB_TRX_ID, DB_ROLL_PTR of a clustered index record
+whose old history can no longer be observed.
+@param[in,out] node purge node
+@param[in,out] mtr mini-transaction (will be started and committed) */
+static void row_purge_reset_trx_id(purge_node_t* node, mtr_t* mtr)
+{
+ /* Reset DB_TRX_ID, DB_ROLL_PTR for old records. */
+ mtr->start();
+
+ if (row_purge_reposition_pcur(BTR_MODIFY_LEAF, node, mtr)) {
+ dict_index_t* index = dict_table_get_first_index(
+ node->table);
+ ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1;
+ rec_t* rec = btr_pcur_get_rec(&node->pcur);
+ mem_heap_t* heap = NULL;
+ /* Reserve enough offsets for the PRIMARY KEY and 2 columns
+ so that we can access DB_TRX_ID, DB_ROLL_PTR. */
+ rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2];
+ rec_offs_init(offsets_);
+ rec_offs* offsets = rec_get_offsets(
+ rec, index, offsets_, index->n_core_fields,
+ trx_id_pos + 2, &heap);
+ ut_ad(heap == NULL);
+
+ ut_ad(dict_index_get_nth_field(index, trx_id_pos)
+ ->col->mtype == DATA_SYS);
+ ut_ad(dict_index_get_nth_field(index, trx_id_pos)
+ ->col->prtype == (DATA_TRX_ID | DATA_NOT_NULL));
+ ut_ad(dict_index_get_nth_field(index, trx_id_pos + 1)
+ ->col->mtype == DATA_SYS);
+ ut_ad(dict_index_get_nth_field(index, trx_id_pos + 1)
+ ->col->prtype == (DATA_ROLL_PTR | DATA_NOT_NULL));
+
+ /* Only update the record if DB_ROLL_PTR matches (the
+ record has not been modified after this transaction
+ became purgeable) */
+ if (node->roll_ptr
+ == row_get_rec_roll_ptr(rec, index, offsets)) {
+ ut_ad(!rec_get_deleted_flag(
+ rec, rec_offs_comp(offsets))
+ || rec_is_alter_metadata(rec, *index));
+ DBUG_LOG("purge", "reset DB_TRX_ID="
+ << ib::hex(row_get_rec_trx_id(
+ rec, index, offsets)));
+
+ index->set_modified(*mtr);
+ buf_block_t* block = btr_pcur_get_block(&node->pcur);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ page_zip_write_trx_id_and_roll_ptr(
+ block, rec, offsets, trx_id_pos,
+ 0, 1ULL << ROLL_PTR_INSERT_FLAG_POS,
+ mtr);
+ } else {
+ ulint len;
+ byte* ptr = rec_get_nth_field(
+ rec, offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ size_t offs = page_offset(ptr);
+ mtr->memset(block, offs, DATA_TRX_ID_LEN, 0);
+ offs += DATA_TRX_ID_LEN;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block,
+ block->frame
+ + offs, 0x80U);
+ mtr->memset(block, offs + 1,
+ DATA_ROLL_PTR_LEN - 1, 0);
+ }
+ }
+ }
+
+ mtr->commit();
+}
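+
+/* Informational note: after a successful reset above, the record carries
+DB_TRX_ID = 0 and DB_ROLL_PTR = 1ULL << ROLL_PTR_INSERT_FLAG_POS (the 7-byte
+roll pointer 80 00 00 00 00 00 00, with only the insert flag set), which
+marks the record as having no accessible undo log history. */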
+
+/***********************************************************//**
+Purges an update of an existing record. Also purges an update of a delete
+marked record if that record contained an externally stored field. */
+static
+void
+row_purge_upd_exist_or_extern_func(
+/*===============================*/
+#ifdef UNIV_DEBUG
+ const que_thr_t*thr, /*!< in: query thread */
+#endif /* UNIV_DEBUG */
+ purge_node_t* node, /*!< in: row purge node */
+ trx_undo_rec_t* undo_rec) /*!< in: record to purge */
+{
+ mem_heap_t* heap;
+
+ ut_ad(!node->table->skip_alter_undo);
+
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC
+ || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+
+ goto skip_secondaries;
+ }
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ dict_table_skip_corrupt_index(node->index);
+
+ row_purge_skip_uncommitted_virtual_index(node->index);
+
+ if (!node->index) {
+ break;
+ }
+
+ if (row_upd_changes_ord_field_binary(node->index, node->update,
+ thr, NULL, NULL)) {
+ /* Build the older version of the index entry */
+ dtuple_t* entry = row_build_index_entry_low(
+ node->row, NULL, node->index,
+ heap, ROW_BUILD_FOR_PURGE);
+ row_purge_remove_sec_if_poss(node, node->index, entry);
+
+ ut_ad(node->table);
+
+ mem_heap_empty(heap);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+skip_secondaries:
+ mtr_t mtr;
+ dict_index_t* index = dict_table_get_first_index(node->table);
+ /* Free possible externally stored fields */
+ for (ulint i = 0; i < upd_get_n_fields(node->update); i++) {
+
+ const upd_field_t* ufield
+ = upd_get_nth_field(node->update, i);
+
+ if (dfield_is_ext(&ufield->new_val)) {
+ trx_rseg_t* rseg;
+ buf_block_t* block;
+ byte* data_field;
+ bool is_insert;
+ ulint rseg_id;
+ uint32_t page_no;
+ uint16_t offset;
+
+			/* We use the fact that new_val points into
+			undo_rec and thus get the offset of the
+			dfield data inside the undo record. Then we
+			can calculate the file address of the new_val
+			data from node->roll_ptr. */
+
+ const uint16_t internal_offset = uint16_t(
+ static_cast<const byte*>
+ (dfield_get_data(&ufield->new_val))
+ - undo_rec);
+
+ ut_a(internal_offset < srv_page_size);
+
+ trx_undo_decode_roll_ptr(node->roll_ptr,
+ &is_insert, &rseg_id,
+ &page_no, &offset);
+
+ rseg = trx_sys.rseg_array[rseg_id];
+
+ ut_a(rseg != NULL);
+ ut_ad(rseg->id == rseg_id);
+ ut_ad(rseg->is_persistent());
+
+ mtr.start();
+
+ /* We have to acquire an SX-latch to the clustered
+ index tree (exclude other tree changes) */
+
+ mtr_sx_lock_index(index, &mtr);
+
+ index->set_modified(mtr);
+
+			/* NOTE: we must also acquire an X-latch on the
+			root page of the tree. We will need it when we
+			free pages from the tree. If the tree is of height 1,
+			the tree X-latch does NOT protect the root page,
+			because it is also a leaf page. Since we will have a
+			latch on an undo log page, we would break the
+			latching order if we only latched the root page of
+			such a tree later! */
+
+ btr_root_get(index, &mtr);
+
+ block = buf_page_get(
+ page_id_t(rseg->space->id, page_no),
+ 0, RW_X_LATCH, &mtr);
+
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ data_field = buf_block_get_frame(block)
+ + offset + internal_offset;
+
+ ut_a(dfield_get_len(&ufield->new_val)
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ btr_free_externally_stored_field(
+ index,
+ data_field + dfield_get_len(&ufield->new_val)
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ NULL, NULL, block, 0, false, &mtr);
+ mtr.commit();
+ }
+ }
+
+ row_purge_reset_trx_id(node, &mtr);
+}
+
+#ifdef UNIV_DEBUG
+# define row_purge_upd_exist_or_extern(thr,node,undo_rec) \
+ row_purge_upd_exist_or_extern_func(thr,node,undo_rec)
+#else /* UNIV_DEBUG */
+# define row_purge_upd_exist_or_extern(thr,node,undo_rec) \
+ row_purge_upd_exist_or_extern_func(node,undo_rec)
+#endif /* UNIV_DEBUG */
+
+/** Parses the row reference and other info in a modify undo log record.
+@param[in] node row undo node
+@param[in] undo_rec record to purge
+@param[in] thr query thread
+@param[out] updated_extern true if an externally stored field was
+ updated
+@return true if purge operation required */
+static
+bool
+row_purge_parse_undo_rec(
+ purge_node_t* node,
+ trx_undo_rec_t* undo_rec,
+ que_thr_t* thr,
+ bool* updated_extern)
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ roll_ptr_t roll_ptr;
+ byte info_bits;
+ ulint type;
+
+ ut_ad(node != NULL);
+ ut_ad(thr != NULL);
+
+ ptr = trx_undo_rec_get_pars(
+ undo_rec, &type, &node->cmpl_info,
+ updated_extern, &undo_no, &table_id);
+
+ node->rec_type = type;
+
+ switch (type) {
+ case TRX_UNDO_RENAME_TABLE:
+ return false;
+ case TRX_UNDO_INSERT_METADATA:
+ case TRX_UNDO_INSERT_REC:
+ /* These records do not store any transaction identifier.
+
+ FIXME: Update SYS_TABLES.ID on both DISCARD TABLESPACE
+ and IMPORT TABLESPACE to get rid of the repeated lookups! */
+ node->trx_id = TRX_ID_MAX;
+ break;
+ default:
+#ifdef UNIV_DEBUG
+ ut_ad("unknown undo log record type" == 0);
+ return false;
+ case TRX_UNDO_UPD_DEL_REC:
+ case TRX_UNDO_UPD_EXIST_REC:
+ case TRX_UNDO_DEL_MARK_REC:
+#endif /* UNIV_DEBUG */
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &node->trx_id,
+ &roll_ptr, &info_bits);
+ break;
+ }
+
+ if (node->is_skipped(table_id)) {
+ return false;
+ }
+
+ trx_id_t trx_id = TRX_ID_MAX;
+
+ if (node->retain_mdl(table_id)) {
+ ut_ad(node->table != NULL);
+ goto already_locked;
+ }
+
+try_again:
+ node->table = dict_table_open_on_id(
+ table_id, false, DICT_TABLE_OP_NORMAL, node->purge_thd,
+ &node->mdl_ticket);
+
+ if (node->table == NULL || node->table->name.is_temporary()) {
+		/* The table has been dropped: there is no need to purge,
+		and the MDL was released as part of the open process itself. */
+ goto err_exit;
+ }
+
+already_locked:
+ ut_ad(!node->table->is_temporary());
+
+ switch (type) {
+ case TRX_UNDO_INSERT_METADATA:
+ case TRX_UNDO_INSERT_REC:
+ break;
+ default:
+ if (!node->table->n_v_cols || node->table->vc_templ
+ || !dict_table_has_indexed_v_cols(node->table)) {
+ break;
+ }
+		/* Need the server fully up for virtual column computation */
+ if (!mysqld_server_started) {
+
+ node->close_table();
+ if (srv_shutdown_state > SRV_SHUTDOWN_NONE) {
+ return(false);
+ }
+ os_thread_sleep(1000000);
+ goto try_again;
+ }
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ if (!clust_index || clust_index->is_corrupted()) {
+ /* The table was corrupt in the data dictionary.
+ dict_set_corrupted() works on an index, and
+ we do not have an index to call it with. */
+ DBUG_ASSERT(table_id == node->table->id);
+ trx_id = node->table->def_trx_id;
+ if (!trx_id) {
+ trx_id = TRX_ID_MAX;
+ }
+
+err_exit:
+ node->close_table();
+ node->skip(table_id, trx_id);
+ return(false);
+ }
+
+ node->last_table_id = table_id;
+
+ if (type == TRX_UNDO_INSERT_METADATA) {
+ node->ref = &trx_undo_metadata;
+ return(true);
+ }
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ if (type == TRX_UNDO_INSERT_REC) {
+ return(true);
+ }
+
+ ptr = trx_undo_update_rec_get_update(ptr, clust_index, type,
+ node->trx_id,
+ roll_ptr, info_bits,
+ node->heap, &(node->update));
+
+ /* Read to the partial row the fields that occur in indexes */
+
+ if (!(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ ut_ad(!(node->update->info_bits & REC_INFO_MIN_REC_FLAG));
+ ptr = trx_undo_rec_get_partial_row(
+ ptr, clust_index, node->update, &node->row,
+ type == TRX_UNDO_UPD_DEL_REC,
+ node->heap);
+ } else if (node->update->info_bits & REC_INFO_MIN_REC_FLAG) {
+ node->ref = &trx_undo_metadata;
+ }
+
+ return(true);
+}
+
+/** Purges the parsed record.
+@param[in] node row purge node
+@param[in] undo_rec record to purge
+@param[in] thr query thread
+@param[in] updated_extern whether external columns were updated
+@return true if purged, false if skipped */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+row_purge_record_func(
+ purge_node_t* node,
+ trx_undo_rec_t* undo_rec,
+#if defined UNIV_DEBUG || defined WITH_WSREP
+ const que_thr_t*thr,
+#endif /* UNIV_DEBUG || WITH_WSREP */
+ bool updated_extern)
+{
+ dict_index_t* clust_index;
+ bool purged = true;
+
+ ut_ad(!node->found_clust);
+ ut_ad(!node->table->skip_alter_undo);
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ node->index = dict_table_get_next_index(clust_index);
+ ut_ad(!trx_undo_roll_ptr_is_insert(node->roll_ptr));
+
+ switch (node->rec_type) {
+ case TRX_UNDO_DEL_MARK_REC:
+ purged = row_purge_del_mark(node);
+ if (purged) {
+ if (node->table->stat_initialized
+ && srv_stats_include_delete_marked) {
+ dict_stats_update_if_needed(
+ node->table, *thr->graph->trx);
+ }
+ MONITOR_INC(MONITOR_N_DEL_ROW_PURGE);
+ }
+ break;
+ case TRX_UNDO_INSERT_METADATA:
+ case TRX_UNDO_INSERT_REC:
+ node->roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS;
+ /* fall through */
+ default:
+ if (!updated_extern) {
+ mtr_t mtr;
+ row_purge_reset_trx_id(node, &mtr);
+ break;
+ }
+ /* fall through */
+ case TRX_UNDO_UPD_EXIST_REC:
+ row_purge_upd_exist_or_extern(thr, node, undo_rec);
+ MONITOR_INC(MONITOR_N_UPD_EXIST_EXTERN);
+ break;
+ }
+
+ if (node->found_clust) {
+ btr_pcur_close(&node->pcur);
+ node->found_clust = FALSE;
+ }
+
+ return(purged);
+}
+
+#if defined UNIV_DEBUG || defined WITH_WSREP
+# define row_purge_record(node,undo_rec,thr,updated_extern) \
+ row_purge_record_func(node,undo_rec,thr,updated_extern)
+#else /* UNIV_DEBUG || WITH_WSREP */
+# define row_purge_record(node,undo_rec,thr,updated_extern) \
+ row_purge_record_func(node,undo_rec,updated_extern)
+#endif /* UNIV_DEBUG || WITH_WSREP */
+
+/***********************************************************//**
+Fetches an undo log record and does the purge for the recorded operation.
+If none left, or the current purge completed, returns the control to the
+parent node, which is always a query thread node. */
+static MY_ATTRIBUTE((nonnull))
+void
+row_purge(
+/*======*/
+ purge_node_t* node, /*!< in: row purge node */
+ trx_undo_rec_t* undo_rec, /*!< in: record to purge */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ if (undo_rec != &trx_purge_dummy_rec) {
+ bool updated_extern;
+
+ while (row_purge_parse_undo_rec(
+ node, undo_rec, thr, &updated_extern)) {
+
+ bool purged = row_purge_record(
+ node, undo_rec, thr, updated_extern);
+
+ if (purged
+ || srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
+ return;
+ }
+
+ /* Retry the purge in a second. */
+ os_thread_sleep(1000000);
+ }
+ }
+}
+
+/***********************************************************//**
+Reset the purge query thread. */
+UNIV_INLINE
+void
+row_purge_end(
+/*==========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(thr);
+
+ thr->run_node = static_cast<purge_node_t*>(thr->run_node)->end();
+
+ ut_a(thr->run_node != NULL);
+}
+
+/***********************************************************//**
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph.
+@return query thread to run next or NULL */
+que_thr_t*
+row_purge_step(
+/*===========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ purge_node_t* node;
+
+ node = static_cast<purge_node_t*>(thr->run_node);
+
+ node->start();
+
+ if (!node->undo_recs.empty()) {
+ trx_purge_rec_t purge_rec = node->undo_recs.front();
+ node->undo_recs.pop();
+ node->roll_ptr = purge_rec.roll_ptr;
+
+ row_purge(node, purge_rec.undo_rec, thr);
+
+ if (node->undo_recs.empty()) {
+ row_purge_end(thr);
+ } else {
+ thr->run_node = node;
+ }
+ } else {
+ row_purge_end(thr);
+ }
+
+ return(thr);
+}
+
+#ifdef UNIV_DEBUG
+/***********************************************************//**
+Validate the persistent cursor. The purge node has two references
+to the clustered index record - one via the ref member, and the
+other via the persistent cursor. These two references must match
+each other if the found_clust flag is set.
+@return true if the stored copy of the persistent cursor is consistent
+with the ref member. */
+bool
+purge_node_t::validate_pcur()
+{
+ if (!found_clust) {
+ return(true);
+ }
+
+ if (index == NULL) {
+ return(true);
+ }
+
+ if (index->type == DICT_FTS) {
+ return(true);
+ }
+
+ if (!pcur.old_stored) {
+ return(true);
+ }
+
+ dict_index_t* clust_index = pcur.btr_cur.index;
+
+ rec_offs* offsets = rec_get_offsets(
+ pcur.old_rec, clust_index, NULL, pcur.old_n_core_fields,
+ pcur.old_n_fields, &heap);
+
+	/* Here we are comparing the purge ref record and the initial part
+	stored in the persistent cursor. In both cases we store the n_uniq
+	fields of the clustered index, so it is fine to do the comparison.
+	We note this dependency here as pcur and ref belong to different
+	modules. */
+ int st = cmp_dtuple_rec(ref, pcur.old_rec, offsets);
+
+ if (st != 0) {
+ ib::error() << "Purge node pcur validation failed";
+ ib::error() << rec_printer(ref).str();
+ ib::error() << rec_printer(pcur.old_rec, offsets).str();
+ return(false);
+ }
+
+ return(true);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc
new file mode 100644
index 00000000..f106cc8a
--- /dev/null
+++ b/storage/innobase/row/row0quiesce.cc
@@ -0,0 +1,710 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0quiesce.cc
+Quiesce a tablespace.
+
+Created 2012-02-08 by Sunny Bains.
+*******************************************************/
+
+#include "row0quiesce.h"
+#include "row0mysql.h"
+#include "ibuf0ibuf.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+
+#ifdef HAVE_MY_AES_H
+#include <my_aes.h>
+#endif
+
+/*********************************************************************//**
+Write the meta data (index user fields) config file.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_index_fields(
+/*===========================*/
+ const dict_index_t* index, /*!< in: write the meta data for
+ this index */
+ FILE* file, /*!< in: file to write to */
+ THD* thd) /*!< in/out: session */
+{
+ byte row[sizeof(ib_uint32_t) * 2];
+
+ for (ulint i = 0; i < index->n_fields; ++i) {
+ byte* ptr = row;
+ const dict_field_t* field = &index->fields[i];
+
+ mach_write_to_4(ptr, field->prefix_len);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, field->fixed_len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_9",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing index fields.");
+
+ return(DB_IO_ERROR);
+ }
+
+ const char* field_name = field->name ? field->name : "";
+ /* Include the NUL byte in the length. */
+ ib_uint32_t len = static_cast<ib_uint32_t>(strlen(field_name) + 1);
+ mach_write_to_4(row, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_10",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(len), file) != sizeof(len)
+ || fwrite(field_name, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing index column.");
+
+ return(DB_IO_ERROR);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the meta data config file index information.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_indexes(
+/*======================*/
+ const dict_table_t* table, /*!< in: write the meta data for
+ this table */
+ FILE* file, /*!< in: file to write to */
+ THD* thd) /*!< in/out: session */
+{
+ {
+ byte row[sizeof(ib_uint32_t)];
+
+ /* Write the number of indexes in the table. */
+ mach_write_to_4(row, UT_LIST_GET_LEN(table->indexes));
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_11",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing index count.");
+
+ return(DB_IO_ERROR);
+ }
+ }
+
+ dberr_t err = DB_SUCCESS;
+
+ /* Write the index meta data. */
+ for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index != 0 && err == DB_SUCCESS;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ byte* ptr;
+ byte row[sizeof(index_id_t)
+ + sizeof(ib_uint32_t) * 8];
+
+ ptr = row;
+
+ ut_ad(sizeof(index_id_t) == 8);
+ mach_write_to_8(ptr, index->id);
+ ptr += sizeof(index_id_t);
+
+ mach_write_to_4(ptr, table->space_id);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->page);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->type);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->trx_id_offset);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->n_user_defined_cols);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->n_uniq);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->n_nullable);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->n_fields);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_12",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing index meta-data.");
+
+ return(DB_IO_ERROR);
+ }
+
+ /* Write the length of the index name.
+		The NUL byte is included in the length. */
+ ib_uint32_t len = static_cast<ib_uint32_t>(strlen(index->name) + 1);
+ ut_a(len > 1);
+
+ mach_write_to_4(row, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_1",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(len), file) != sizeof(len)
+ || fwrite(index->name, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing index name.");
+
+ return(DB_IO_ERROR);
+ }
+
+ err = row_quiesce_write_index_fields(index, file, thd);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Write the meta data (table columns) config file. Serialize the contents of
+the dict_col_t structure, along with the column name. All fields are
+serialized as ib_uint32_t.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_table(
+/*====================*/
+ const dict_table_t* table, /*!< in: write the meta data for
+ this table */
+ FILE* file, /*!< in: file to write to */
+ THD* thd) /*!< in/out: session */
+{
+ dict_col_t* col;
+ byte row[sizeof(ib_uint32_t) * 7];
+
+ col = table->cols;
+
+ for (ulint i = 0; i < table->n_cols; ++i, ++col) {
+ byte* ptr = row;
+
+ mach_write_to_4(ptr, col->prtype);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->mtype);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->len);
+ ptr += sizeof(ib_uint32_t);
+
+ /* FIXME: This will not work if mbminlen>4.
+ This field is also redundant, because the lengths
+ are a property of the character set encoding, which
+		in turn is encoded in prtype above. */
+ mach_write_to_4(ptr, ulint(col->mbmaxlen * 5 + col->mbminlen));
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->ind);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->ord_part);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->max_prefix);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_2",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing table column data.");
+
+ return(DB_IO_ERROR);
+ }
+
+ /* Write out the column name as [len, byte array]. The len
+ includes the NUL byte. */
+ ib_uint32_t len;
+ const char* col_name;
+
+ col_name = dict_table_get_col_name(table, dict_col_get_no(col));
+
+ /* Include the NUL byte in the length. */
+ len = static_cast<ib_uint32_t>(strlen(col_name) + 1);
+ ut_a(len > 1);
+
+ mach_write_to_4(row, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_3",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(len), file) != sizeof(len)
+ || fwrite(col_name, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing column name.");
+
+ return(DB_IO_ERROR);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the meta data config file header.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_header(
+/*=====================*/
+ const dict_table_t* table, /*!< in: write the meta data for
+ this table */
+ FILE* file, /*!< in: file to write to */
+ THD* thd) /*!< in/out: session */
+{
+ byte value[sizeof(ib_uint32_t)];
+
+ /* Write the meta-data version number. */
+ mach_write_to_4(value, IB_EXPORT_CFG_VERSION_V1);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_4", close(fileno(file)););
+
+ if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing meta-data version number.");
+
+ return(DB_IO_ERROR);
+ }
+
+ /* Write the server hostname. */
+ ib_uint32_t len;
+ const char* hostname = server_get_hostname();
+
+ /* Play it safe and check for NULL. */
+ if (hostname == 0) {
+ static const char NullHostname[] = "Hostname unknown";
+
+ ib::warn() << "Unable to determine server hostname.";
+
+ hostname = NullHostname;
+ }
+
+	/* The length of the server hostname includes the NUL byte. */
+ len = static_cast<ib_uint32_t>(strlen(hostname) + 1);
+ mach_write_to_4(value, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_5", close(fileno(file)););
+
+ if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)
+ || fwrite(hostname, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing hostname.");
+
+ return(DB_IO_ERROR);
+ }
+
+	/* The length of the table name includes the NUL byte. */
+ ut_a(table->name.m_name != NULL);
+ len = static_cast<ib_uint32_t>(strlen(table->name.m_name) + 1);
+
+ /* Write the table name. */
+ mach_write_to_4(value, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_6", close(fileno(file)););
+
+ if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)
+ || fwrite(table->name.m_name, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing table name.");
+
+ return(DB_IO_ERROR);
+ }
+
+ byte row[sizeof(ib_uint32_t) * 3];
+
+ /* Write the next autoinc value. */
+ mach_write_to_8(row, table->autoinc);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_7", close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing table autoinc value.");
+
+ return(DB_IO_ERROR);
+ }
+
+ byte* ptr = row;
+
+ /* Write the system page size. */
+ mach_write_to_4(ptr, srv_page_size);
+ ptr += sizeof(ib_uint32_t);
+
+ /* Write the table->flags. */
+ mach_write_to_4(ptr, table->flags);
+ ptr += sizeof(ib_uint32_t);
+
+ /* Write the number of columns in the table. */
+ mach_write_to_4(ptr, table->n_cols);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_8", close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing table meta-data.");
+
+ return(DB_IO_ERROR);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the table meta data after quiesce.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_cfg(
+/*==================*/
+ dict_table_t* table, /*!< in: write the meta data for
+ this table */
+ THD* thd) /*!< in/out: session */
+{
+ dberr_t err;
+ char name[OS_FILE_MAX_PATH];
+
+ srv_get_meta_data_filename(table, name, sizeof(name));
+
+ ib::info() << "Writing table metadata to '" << name << "'";
+
+ FILE* file = fopen(name, "w+b");
+
+ if (file == NULL) {
+ ib_errf(thd, IB_LOG_LEVEL_WARN, ER_CANT_CREATE_FILE,
+ name, errno, strerror(errno));
+
+ err = DB_IO_ERROR;
+ } else {
+ err = row_quiesce_write_header(table, file, thd);
+
+ if (err == DB_SUCCESS) {
+ err = row_quiesce_write_table(table, file, thd);
+ }
+
+ if (err == DB_SUCCESS) {
+ err = row_quiesce_write_indexes(table, file, thd);
+ }
+
+ if (fflush(file) != 0) {
+
+ char msg[BUFSIZ];
+
+			snprintf(msg, sizeof(msg), "%s fflush() failed", name);
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno), msg);
+ }
+
+ if (fclose(file) != 0) {
+ char msg[BUFSIZ];
+
+			snprintf(msg, sizeof(msg), "%s fclose() failed", name);
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno), msg);
+ }
+ }
+
+ return(err);
+}
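+
+/* Informational sketch of the V1 .cfg layout written above (derived from
+row_quiesce_write_header(), row_quiesce_write_table() and
+row_quiesce_write_indexes(); the mach_write_to_*() routines store integers
+in big-endian format, and every name is a 4-byte length followed by a
+NUL-terminated string):
+
+	header:     version, hostname, table name, autoinc (8 bytes),
+	            page size, table flags, number of columns
+	per column: prtype, mtype, len, mb lengths, ind, ord_part,
+	            max_prefix (4 bytes each), then the column name
+	indexes:    index count, then per index: id (8 bytes), space id,
+	            root page, type, trx_id_offset, n_user_defined_cols,
+	            n_uniq, n_nullable, n_fields (4 bytes each), the index
+	            name, and per field: prefix_len, fixed_len (4 bytes
+	            each) and the field name
+*/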
+
+/*********************************************************************//**
+Check whether a table has an FTS index defined on it.
+@return true if an FTS index exists on the table */
+static
+bool
+row_quiesce_table_has_fts_index(
+/*============================*/
+ const dict_table_t* table) /*!< in: quiesce this table */
+{
+ bool exists = false;
+
+ dict_mutex_enter_for_mysql();
+
+ for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index != 0;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ if (index->type & DICT_FTS) {
+ exists = true;
+ break;
+ }
+ }
+
+ dict_mutex_exit_for_mysql();
+
+ return(exists);
+}
+
+/*********************************************************************//**
+Quiesce the tablespace that the table resides in: stop purge, merge the
+change buffer entries for the tablespace, flush all dirty pages to disk,
+and write the meta data (.cfg) file. */
+void
+row_quiesce_table_start(
+/*====================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ trx_t* trx) /*!< in/out: transaction/session */
+{
+ ut_a(trx->mysql_thd != 0);
+ ut_a(srv_n_purge_threads > 0);
+ ut_ad(!srv_read_only_mode);
+
+ ut_ad(table->space != NULL);
+ ib::info() << "Sync to disk of " << table->name << " started.";
+
+ if (srv_undo_sources) {
+ purge_sys.stop();
+ }
+
+ for (ulint count = 0;
+ ibuf_merge_space(table->space_id);
+ ++count) {
+ if (trx_is_interrupted(trx)) {
+ goto aborted;
+ }
+ if (!(count % 20)) {
+ ib::info() << "Merging change buffer entries for "
+ << table->name;
+ }
+ }
+
+ while (buf_flush_list_space(table->space)) {
+ if (trx_is_interrupted(trx)) {
+ goto aborted;
+ }
+ }
+
+ if (!trx_is_interrupted(trx)) {
+ /* Ensure that all asynchronous IO is completed. */
+ os_aio_wait_until_no_pending_writes();
+ table->space->flush<false>();
+
+ if (row_quiesce_write_cfg(table, trx->mysql_thd)
+ != DB_SUCCESS) {
+ ib::warn() << "There was an error writing to the"
+ " meta data file";
+ } else {
+ ib::info() << "Table " << table->name
+ << " flushed to disk";
+ }
+ } else {
+aborted:
+ ib::warn() << "Quiesce aborted!";
+ }
+
+ dberr_t err = row_quiesce_set_state(table, QUIESCE_COMPLETE, trx);
+ ut_a(err == DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Cleanup after table quiesce. */
+void
+row_quiesce_table_complete(
+/*=======================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ trx_t* trx) /*!< in/out: transaction/session */
+{
+ ulint count = 0;
+
+ ut_a(trx->mysql_thd != 0);
+
+	/* We need to wait for the operation to complete even if the
+	transaction has been killed. */
+
+ while (table->quiesce != QUIESCE_COMPLETE) {
+
+ /* Print a warning after every minute. */
+ if (!(count % 60)) {
+ ib::warn() << "Waiting for quiesce of " << table->name
+ << " to complete";
+ }
+
+ /* Sleep for a second. */
+ os_thread_sleep(1000000);
+
+ ++count;
+ }
+
+ if (!opt_bootstrap) {
+ /* Remove the .cfg file now that the user has resumed
+ normal operations. Otherwise it will cause problems when
+ the user tries to drop the database (remove directory). */
+ char cfg_name[OS_FILE_MAX_PATH];
+
+ srv_get_meta_data_filename(table, cfg_name, sizeof(cfg_name));
+
+ os_file_delete_if_exists(innodb_data_file_key, cfg_name, NULL);
+
+ ib::info() << "Deleting the meta-data file '" << cfg_name << "'";
+ }
+
+ if (srv_undo_sources) {
+ purge_sys.resume();
+ }
+
+ dberr_t err = row_quiesce_set_state(table, QUIESCE_NONE, trx);
+ ut_a(err == DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Set a table's quiesce state.
+@return DB_SUCCESS or error code. */
+dberr_t
+row_quiesce_set_state(
+/*==================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ ib_quiesce_t state, /*!< in: quiesce state to set */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_a(srv_n_purge_threads > 0);
+
+ if (srv_read_only_mode) {
+
+ ib_senderrf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+
+ return(DB_UNSUPPORTED);
+
+ } else if (table->is_temporary()) {
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_CANNOT_DISCARD_TEMPORARY_TABLE);
+
+ return(DB_UNSUPPORTED);
+ } else if (table->space_id == TRX_SYS_SPACE) {
+
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name),
+ table->name.m_name);
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_TABLE_IN_SYSTEM_TABLESPACE, table_name);
+
+ return(DB_UNSUPPORTED);
+ } else if (row_quiesce_table_has_fts_index(table)) {
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_NOT_SUPPORTED_YET,
+ "FLUSH TABLES on tables that have an FTS index."
+ " FTS auxiliary tables will not be flushed.");
+
+ } else if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ /* If this flag is set then the table may not have any active
+ FTS indexes but it will still have the auxiliary tables. */
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_NOT_SUPPORTED_YET,
+ "FLUSH TABLES on a table that had an FTS index,"
+ " created on a hidden column, the"
+ " auxiliary tables haven't been dropped as yet."
+ " FTS auxiliary tables will not be flushed.");
+ }
+
+ dict_index_t* clust_index = dict_table_get_first_index(table);
+
+ row_mysql_lock_data_dictionary(trx);
+
+ for (dict_index_t* index = dict_table_get_next_index(clust_index);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ rw_lock_x_lock(&index->lock);
+ }
+
+ rw_lock_x_lock(&clust_index->lock);
+
+ switch (state) {
+ case QUIESCE_START:
+ break;
+
+ case QUIESCE_COMPLETE:
+ ut_a(table->quiesce == QUIESCE_START);
+ break;
+
+ case QUIESCE_NONE:
+ ut_a(table->quiesce == QUIESCE_COMPLETE);
+ break;
+ }
+
+ table->quiesce = state;
+
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ rw_lock_x_unlock(&index->lock);
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ return(DB_SUCCESS);
+}
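
Before flipping table->quiesce, the function above takes the dictionary lock and an exclusive latch on every index of the table, so no index operation can observe a half-changed state. A standalone sketch of the acquire-all, mutate, release-all shape, with std::shared_mutex standing in for InnoDB rw-locks (all names hypothetical):

#include <cstddef>
#include <shared_mutex>
#include <vector>

enum quiesce_state { QUIESCE_NONE, QUIESCE_START, QUIESCE_COMPLETE };

struct toy_table {
    std::vector<std::shared_mutex> index_latches;  // one per index
    quiesce_state quiesce = QUIESCE_NONE;
    explicit toy_table(std::size_t n_indexes) : index_latches(n_indexes) {}
};

// Sketch: x-latch every index in a fixed order, flip the state, then
// release. The fixed acquisition order is what keeps this deadlock-free.
static void set_quiesce_state(toy_table& t, quiesce_state s)
{
    for (auto& latch : t.index_latches) latch.lock();
    t.quiesce = s;
    for (auto& latch : t.index_latches) latch.unlock();
}

int main()
{
    toy_table t(3);
    set_quiesce_state(t, QUIESCE_START);
    set_quiesce_state(t, QUIESCE_COMPLETE);
    set_quiesce_state(t, QUIESCE_NONE);
}
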
+
diff --git a/storage/innobase/row/row0row.cc b/storage/innobase/row/row0row.cc
new file mode 100644
index 00000000..7e70341a
--- /dev/null
+++ b/storage/innobase/row/row0row.cc
@@ -0,0 +1,1741 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0row.cc
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "ut0mem.h"
+#include "gis0geo.h"
+#include "row0mysql.h"
+
+/** Build a spatial index key.
+@param[in] index spatial index
+@param[in] ext externally stored column prefixes, or NULL
+@param[in,out] dfield field of the tuple to be copied
+@param[in] dfield2 field of the tuple to copy
+@param[in] flag ROW_BUILD_NORMAL, ROW_BUILD_FOR_PURGE or
+ ROW_BUILD_FOR_UNDO
+@param[in,out] heap memory heap from which the memory
+ of the field entry is allocated.
+@retval false if the undo log was written before the spatial index was
+created. */
+static bool row_build_spatial_index_key(
+ const dict_index_t* index,
+ const row_ext_t* ext,
+ dfield_t* dfield,
+ const dfield_t* dfield2,
+ ulint flag,
+ mem_heap_t* heap)
+{
+ if (dfield2->type.mtype == DATA_MISSING) {
+ return false;
+ }
+
+ double* mbr;
+
+ dfield_copy(dfield, dfield2);
+ dfield->type.prtype |= DATA_GIS_MBR;
+
+ /* Allocate memory for mbr field */
+ mbr = static_cast<double*>(mem_heap_alloc(heap, DATA_MBR_LEN));
+
+ /* Set mbr field data. */
+ dfield_set_data(dfield, mbr, DATA_MBR_LEN);
+
+ const fil_space_t* space = index->table->space;
+
+ if (UNIV_UNLIKELY(!dfield2->data || !space)) {
+ /* FIXME: dfield contains uninitialized data,
+ but row_build_index_entry_low() will not return NULL.
+ This bug is inherited from MySQL 5.7.5
+ commit b66ad511b61fffe75c58d0a607cdb837c6e6c821. */
+ return true;
+ }
+
+ const byte* dptr = NULL;
+ ulint dlen = 0;
+ ulint flen = 0;
+ double tmp_mbr[SPDIMS * 2];
+ mem_heap_t* temp_heap = NULL;
+
+ if (!dfield_is_ext(dfield2)) {
+ dptr = static_cast<const byte*>(dfield_get_data(dfield2));
+ dlen = dfield_get_len(dfield2);
+ ut_ad(dptr != &data_error);
+ goto write_mbr;
+ }
+
+ if (flag == ROW_BUILD_FOR_PURGE) {
+ const byte* ptr = static_cast<const byte*>(
+ dfield_get_data(dfield2));
+
+ switch (dfield_get_spatial_status(dfield2)) {
+ case SPATIAL_ONLY:
+ ut_ad(dfield_get_len(dfield2) == DATA_MBR_LEN);
+ break;
+
+ case SPATIAL_MIXED:
+ ptr += dfield_get_len(dfield2);
+ break;
+
+ case SPATIAL_UNKNOWN:
+ ut_ad(0);
+ /* fall through */
+ case SPATIAL_NONE:
+ /* The undo record was logged before
+ the spatial index was created. */
+ return false;
+ }
+
+ memcpy(mbr, ptr, DATA_MBR_LEN);
+ return true;
+ }
+
+ if (flag == ROW_BUILD_FOR_UNDO
+ && dict_table_has_atomic_blobs(index->table)) {
+ /* For ROW_FORMAT=DYNAMIC or COMPRESSED, a prefix of
+ off-page records is stored in the undo log record (for
+ any column prefix indexes). For SPATIAL INDEX, we
+ must ignore this prefix. The full column value is
+ stored in the BLOB. For non-spatial index, we would
+ have already fetched a necessary prefix of the BLOB,
+ available in the "ext" parameter.
+
+ Here, for SPATIAL INDEX, we are fetching the full
+ column, which is potentially wasting a lot of I/O,
+ memory, and possibly involving a concurrency problem,
+ similar to ones that existed before the introduction
+ of row_ext_t.
+
+ MDEV-11657 FIXME: write the MBR directly to the undo
+ log record, and avoid recomputing it here! */
+ flen = BTR_EXTERN_FIELD_REF_SIZE;
+ ut_ad(dfield_get_len(dfield2) >= BTR_EXTERN_FIELD_REF_SIZE);
+ dptr = static_cast<const byte*>(dfield_get_data(dfield2))
+ + dfield_get_len(dfield2)
+ - BTR_EXTERN_FIELD_REF_SIZE;
+ } else {
+ flen = dfield_get_len(dfield2);
+ dptr = static_cast<const byte*>(dfield_get_data(dfield2));
+ }
+
+ temp_heap = mem_heap_create(1000);
+
+ dptr = btr_copy_externally_stored_field(
+ &dlen, dptr, ext ? ext->zip_size : space->zip_size(),
+ flen, temp_heap);
+
+write_mbr:
+ if (dlen <= GEO_DATA_HEADER_SIZE) {
+ for (uint i = 0; i < SPDIMS; i += 2) {
+ tmp_mbr[i] = DBL_MAX;
+ tmp_mbr[i + 1] = -DBL_MAX;
+ }
+ } else {
+ rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE,
+ uint(dlen - GEO_DATA_HEADER_SIZE),
+ SPDIMS, tmp_mbr);
+ }
+
+ dfield_write_mbr(dfield, tmp_mbr);
+ if (temp_heap) {
+ mem_heap_free(temp_heap);
+ }
+
+ return true;
+}
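
Whatever the storage path, the function above ends by reducing the geometry to its minimum bounding rectangle, seeding an "inverted" rectangle (DBL_MAX lows, -DBL_MAX highs) when the value is too short to hold real coordinates. A standalone sketch of the MBR computation over raw 2-D points, standing in for rtree_mbr_from_wkb() (WKB parsing omitted):

#include <algorithm>
#include <cfloat>
#include <cstdio>

constexpr int SPDIMS_DEMO = 2;  // dimensions, as in the spatial index code

// mbr layout: {xmin, xmax, ymin, ymax}, matching SPDIMS * 2 doubles.
static void mbr_from_points(const double* pts, unsigned n_points,
                            double mbr[SPDIMS_DEMO * 2])
{
    // Start from an "inverted" rectangle so that any real point shrinks
    // it into shape; with no points it stays inverted, as in the code
    // above for too-short geometries.
    for (int d = 0; d < SPDIMS_DEMO; d++) {
        mbr[2 * d]     = DBL_MAX;
        mbr[2 * d + 1] = -DBL_MAX;
    }
    for (unsigned i = 0; i < n_points; i++) {
        for (int d = 0; d < SPDIMS_DEMO; d++) {
            double c = pts[i * SPDIMS_DEMO + d];
            mbr[2 * d]     = std::min(mbr[2 * d], c);
            mbr[2 * d + 1] = std::max(mbr[2 * d + 1], c);
        }
    }
}

int main()
{
    const double pts[] = {1.0, 4.0,  3.0, -2.0,  0.5, 7.0};
    double mbr[SPDIMS_DEMO * 2];
    mbr_from_points(pts, 3, mbr);
    std::printf("x:[%g,%g] y:[%g,%g]\n", mbr[0], mbr[1], mbr[2], mbr[3]);
}
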
+
+/*****************************************************************//**
+When an insert into or purge from a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged
+@retval NULL if the externally stored columns in the clustered index record
+are unavailable and ext != NULL, or row is missing some needed columns. */
+dtuple_t*
+row_build_index_entry_low(
+/*======================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ const row_ext_t* ext, /*!< in: externally stored column
+ prefixes, or NULL */
+ const dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap, /*!< in,out: memory heap from which
+ the memory for the index entry
+ is allocated */
+ ulint flag) /*!< in: ROW_BUILD_NORMAL,
+ ROW_BUILD_FOR_PURGE
+ or ROW_BUILD_FOR_UNDO */
+{
+ dtuple_t* entry;
+ ulint entry_len;
+ ulint i = 0;
+ ulint num_v = 0;
+
+ entry_len = dict_index_get_n_fields(index);
+
+ if (flag == ROW_BUILD_FOR_INSERT && dict_index_is_clust(index)) {
+ num_v = dict_table_get_n_v_cols(index->table);
+ entry = dtuple_create_with_vcol(heap, entry_len, num_v);
+ } else {
+ entry = dtuple_create(heap, entry_len);
+ }
+
+ if (dict_index_is_ibuf(index)) {
+ dtuple_set_n_fields_cmp(entry, entry_len);
+ /* There may only be externally stored columns
+ in a clustered index B-tree of a user table. */
+ ut_a(!ext);
+ } else {
+ dtuple_set_n_fields_cmp(
+ entry, dict_index_get_n_unique_in_tree(index));
+ if (dict_index_is_spatial(index)) {
+ /* Set the MBR field */
+ if (!row_build_spatial_index_key(
+ index, ext,
+ dtuple_get_nth_field(entry, 0),
+ dtuple_get_nth_field(
+ row,
+ dict_index_get_nth_field(index, i)
+ ->col->ind), flag, heap)) {
+ return NULL;
+ }
+
+ i = 1;
+ }
+ }
+
+ for (; i < entry_len; i++) {
+ const dict_field_t& f = index->fields[i];
+ dfield_t* dfield = dtuple_get_nth_field(entry, i);
+
+ if (f.col->is_dropped()) {
+ ut_ad(index->is_primary());
+ ut_ad(index->is_instant());
+ ut_ad(!f.col->is_virtual());
+ dict_col_copy_type(f.col, &dfield->type);
+ if (f.col->is_nullable()) {
+ dfield_set_null(dfield);
+ } else {
+ dfield_set_data(dfield, field_ref_zero,
+ f.fixed_len);
+ }
+ continue;
+ }
+
+ const dfield_t* dfield2;
+
+ if (f.col->is_virtual()) {
+ const dict_v_col_t* v_col
+ = reinterpret_cast<const dict_v_col_t*>(f.col);
+
+ ut_ad(v_col->v_pos < dtuple_get_n_v_fields(row));
+ dfield2 = dtuple_get_nth_v_field(row, v_col->v_pos);
+
+ ut_ad(dfield_is_null(dfield2) ||
+ dfield_get_len(dfield2) == 0 || dfield2->data);
+ ut_ad(!dfield_is_ext(dfield2));
+ if (UNIV_UNLIKELY(dfield2->type.mtype
+ == DATA_MISSING)) {
+ ut_ad(flag == ROW_BUILD_FOR_PURGE);
+ return(NULL);
+ }
+ } else {
+ dfield2 = dtuple_get_nth_field(row, f.col->ind);
+ if (UNIV_UNLIKELY(dfield2->type.mtype
+ == DATA_MISSING)) {
+ /* The field has not been initialized in
+ the row. This should be from
+ trx_undo_rec_get_partial_row(). */
+ return(NULL);
+ }
+
+ ut_ad(!(dfield2->type.prtype & DATA_VIRTUAL));
+ }
+
+ compile_time_assert(DATA_MISSING == 0);
+
+ *dfield = *dfield2;
+
+ if (dfield_is_null(dfield)) {
+ continue;
+ }
+
+ ut_ad(!(index->type & DICT_FTS));
+
+ ulint len = dfield_get_len(dfield);
+
+ if (f.prefix_len == 0
+ && (!dfield_is_ext(dfield)
+ || dict_index_is_clust(index))) {
+ /* The *dfield = *dfield2 above suffices for
+ columns that are stored in-page, or for
+ clustered index record columns that are not
+ part of a column prefix in the PRIMARY KEY. */
+ continue;
+ }
+
+ /* If the column is stored externally (off-page) in
+ the clustered index, it must be an ordering field in
+ the secondary index. If !atomic_blobs, the only way
+ we may have a secondary index pointing to a clustered
+ index record with an off-page column is when it is a
+ column prefix index. If atomic_blobs, also fully
+ indexed long columns may be stored off-page. */
+ ut_ad(f.col->ord_part);
+
+ if (ext && !f.col->is_virtual()) {
+ /* See if the column is stored externally. */
+ const byte* buf = row_ext_lookup(ext, f.col->ind,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ if (UNIV_UNLIKELY(buf == field_ref_zero)) {
+ return(NULL);
+ }
+ dfield_set_data(dfield, buf, len);
+ }
+
+ if (f.prefix_len == 0) {
+ /* If ROW_FORMAT=DYNAMIC or
+ ROW_FORMAT=COMPRESSED, we can have a
+ secondary index on an entire column
+ that is stored off-page in the
+ clustered index. As this is not a
+ prefix index (prefix_len == 0),
+ include the entire off-page column in
+ the secondary index record. */
+ continue;
+ }
+ } else if (dfield_is_ext(dfield)) {
+ /* This table is either in
+ (ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT)
+ or a purge record where the ordered part of
+ the field is not external.
+ In ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT,
+ the maximum column prefix
+ index length is 767 bytes, and the clustered
+ index record contains a 768-byte prefix of
+ each off-page column. */
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ len -= BTR_EXTERN_FIELD_REF_SIZE;
+ dfield_set_len(dfield, len);
+ }
+
+ /* If a column prefix index, take only the prefix. */
+ if (f.prefix_len) {
+ len = dtype_get_at_most_n_mbchars(
+ f.col->prtype,
+ f.col->mbminlen, f.col->mbmaxlen,
+ f.prefix_len, len,
+ static_cast<char*>(dfield_get_data(dfield)));
+ dfield_set_len(dfield, len);
+ }
+ }
+
+ for (i = num_v; i--; ) {
+ ut_ad(index->is_primary());
+ ut_ad(flag == ROW_BUILD_FOR_INSERT);
+ dfield_t* dfield = dtuple_get_nth_v_field(entry, i);
+ const dict_v_col_t* v_col = dict_table_get_nth_v_col(
+ index->table, i);
+ ut_ad(!v_col->m_col.is_dropped());
+ ut_ad(v_col->v_pos < dtuple_get_n_v_fields(row));
+ const dfield_t* dfield2 = dtuple_get_nth_v_field(
+ row, v_col->v_pos);
+ ut_ad(dfield_is_null(dfield2) ||
+ dfield_get_len(dfield2) == 0 || dfield2->data);
+ ut_ad(dfield2->type.mtype != DATA_MISSING);
+ *dfield = *dfield2;
+ }
+
+ return entry;
+}
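
For prefix columns, dtype_get_at_most_n_mbchars() cuts the value to at most the prefix byte length and at most the prefix's character count, never splitting a multibyte character. A standalone UTF-8 sketch of that contract (a simplification; the real routine is driven by the column's charset and collation):

#include <cstddef>
#include <cstdio>
#include <cstring>

// Sketch: byte length of at most `max_chars` UTF-8 characters from
// buf[0..len), never splitting a multibyte sequence. Assumes well-formed
// UTF-8 input.
static std::size_t utf8_prefix_bytes(const char* buf, std::size_t len,
                                     std::size_t max_chars)
{
    std::size_t bytes = 0, chars = 0;
    while (bytes < len && chars < max_chars) {
        unsigned char c = static_cast<unsigned char>(buf[bytes]);
        std::size_t seq = c < 0x80 ? 1 : c < 0xE0 ? 2 : c < 0xF0 ? 3 : 4;
        if (bytes + seq > len) break;   // truncated trailing sequence
        bytes += seq;
        ++chars;
    }
    return bytes;
}

int main()
{
    const char s[] = "a\xC3\xA9" "b";   // 'a', e-acute (2 bytes), 'b'
    // Two characters of the prefix occupy three bytes.
    std::printf("%zu\n", utf8_prefix_bytes(s, std::strlen(s), 2));
}
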
+
+/** An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index, allowing for new virtual columns that an
+ongoing operation may still be adding.
+@param[in] type ROW_COPY_POINTERS or ROW_COPY_DATA;
+@param[in] index clustered index
+@param[in] rec record in the clustered index
+@param[in] offsets rec_get_offsets(rec,index) or NULL
+@param[in] col_table table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead
+@param[in] defaults default values of added/changed columns, or NULL
+@param[in] add_v new virtual columns added
+ along with new indexes
+@param[in] col_map mapping of old column
+ numbers to new ones, or NULL
+@param[in] ext cache of externally stored column
+ prefixes, or NULL
+@param[in] heap memory heap from which
+ the memory needed is allocated
+@return own: row built; */
+static inline
+dtuple_t*
+row_build_low(
+ ulint type,
+ const dict_index_t* index,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ const dict_table_t* col_table,
+ const dtuple_t* defaults,
+ const dict_add_v_col_t* add_v,
+ const ulint* col_map,
+ row_ext_t** ext,
+ mem_heap_t* heap)
+{
+ const byte* copy;
+ dtuple_t* row;
+ ulint n_ext_cols;
+ ulint* ext_cols = NULL; /* remove warning */
+ ulint len;
+ byte* buf;
+ ulint j;
+ mem_heap_t* tmp_heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ ut_ad(index != NULL);
+ ut_ad(rec != NULL);
+ ut_ad(heap != NULL);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!col_map || col_table);
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &tmp_heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* Some blob refs can be NULL during crash recovery before
+ trx_rollback_active() has completed execution, or when a concurrently
+ executing insert or update has committed the B-tree mini-transaction
+ but has not yet managed to restore the cursor position for writing
+ the big_rec. Note that the mini-transaction can be committed multiple
+ times, and the cursor restore can happen multiple times for single
+ insert or update statement. */
+ ut_a(!rec_offs_any_null_extern(rec, offsets)
+ || trx_sys.is_registered(current_trx(),
+ row_get_rec_trx_id(rec, index,
+ offsets)));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ if (type != ROW_COPY_POINTERS) {
+ /* Take a copy of rec to heap */
+ buf = static_cast<byte*>(
+ mem_heap_alloc(heap, rec_offs_size(offsets)));
+
+ copy = rec_copy(buf, rec, offsets);
+ } else {
+ copy = rec;
+ }
+
+ n_ext_cols = rec_offs_n_extern(offsets);
+ if (n_ext_cols) {
+ ext_cols = static_cast<ulint*>(
+ mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols));
+ }
+
+ /* Avoid a debug assertion in rec_offs_validate(). */
+ rec_offs_make_valid(copy, index, true, const_cast<rec_offs*>(offsets));
+
+ if (!col_table) {
+ ut_ad(!col_map);
+ ut_ad(!defaults);
+ col_table = index->table;
+ }
+
+ if (defaults) {
+ ut_ad(col_map);
+ row = dtuple_copy(defaults, heap);
+ /* dict_table_copy_types() would set the fields to NULL */
+ for (ulint i = 0; i < dict_table_get_n_cols(col_table); i++) {
+ dict_col_copy_type(
+ dict_table_get_nth_col(col_table, i),
+ dfield_get_type(dtuple_get_nth_field(row, i)));
+ }
+ } else if (add_v != NULL) {
+ row = dtuple_create_with_vcol(
+ heap, dict_table_get_n_cols(col_table),
+ dict_table_get_n_v_cols(col_table) + add_v->n_v_col);
+ dict_table_copy_types(row, col_table);
+
+ for (ulint i = 0; i < add_v->n_v_col; i++) {
+ dict_col_copy_type(
+ &add_v->v_col[i].m_col,
+ dfield_get_type(dtuple_get_nth_v_field(
+ row, i + col_table->n_v_def)));
+ }
+ } else {
+ row = dtuple_create_with_vcol(
+ heap, dict_table_get_n_cols(col_table),
+ dict_table_get_n_v_cols(col_table));
+ dict_table_copy_types(row, col_table);
+ }
+
+ dtuple_set_info_bits(row, rec_get_info_bits(
+ copy, rec_offs_comp(offsets)));
+
+ j = 0;
+
+ const dict_field_t* ind_field = index->fields;
+
+ for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (i == index->first_user_field()
+ && rec_is_alter_metadata(rec, *index)) {
+ ut_ad(rec_offs_nth_extern(offsets, i));
+ ut_d(ulint len);
+ ut_d(rec_get_nth_field_offs(offsets, i, &len));
+ ut_ad(len == FIELD_REF_SIZE);
+ continue;
+ }
+
+ ut_ad(ind_field < &index->fields[index->n_fields]);
+
+ const dict_col_t* col = dict_field_get_col(ind_field);
+
+ if ((ind_field++)->prefix_len) {
+ /* Column prefixes can only occur in key
+ fields, which cannot be stored externally. For
+ a column prefix, there should also be the full
+ field in the clustered index tuple. The row
+ tuple comprises full fields, not prefixes. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ continue;
+ }
+
+ if (col->is_dropped()) {
+ continue;
+ }
+
+ ulint col_no = dict_col_get_no(col);
+
+ if (col_map) {
+ col_no = col_map[col_no];
+
+ if (col_no == ULINT_UNDEFINED) {
+ /* dropped column */
+ continue;
+ }
+ }
+
+ dfield_t* dfield = dtuple_get_nth_field(row, col_no);
+
+ const void* field = rec_get_nth_field(
+ copy, offsets, i, &len);
+ if (len == UNIV_SQL_DEFAULT) {
+ field = index->instant_field_value(i, &len);
+ if (field && type != ROW_COPY_POINTERS) {
+ field = mem_heap_dup(heap, field, len);
+ }
+ }
+ dfield_set_data(dfield, field, len);
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dfield_set_ext(dfield);
+
+ col = dict_table_get_nth_col(col_table, col_no);
+
+ if (col->ord_part) {
+ /* We will have to fetch prefixes of
+ externally stored columns that are
+ referenced by column prefixes. */
+ ext_cols[j++] = col_no;
+ }
+ }
+ }
+
+ rec_offs_make_valid(rec, index, true, const_cast<rec_offs*>(offsets));
+
+ ut_ad(dtuple_check_typed(row));
+
+ if (!ext) {
+ /* REDUNDANT and COMPACT formats store a local
+ 768-byte prefix of each externally stored
+ column. No cache is needed.
+
+ During online table rebuild,
+ row_log_table_apply_delete_low()
+ may use a cache that was set up by
+ row_log_table_delete(). */
+
+ } else if (j) {
+ *ext = row_ext_create(j, ext_cols, *index->table, row,
+ heap);
+ } else {
+ *ext = NULL;
+ }
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ return(row);
+}
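
When the caller passes col_map (as during an ALTER TABLE rebuild), old column numbers are translated to the new table's numbering, and dropped columns are flagged with ULINT_UNDEFINED and skipped. A standalone sketch of that sentinel convention (the mapping values are hypothetical):

#include <cstdio>
#include <limits>
#include <vector>

// Sentinel playing the role of ULINT_UNDEFINED in the code above.
constexpr unsigned long UNDEFINED = std::numeric_limits<unsigned long>::max();

int main()
{
    // Old table had columns 0..3; column 1 was dropped and column 3
    // moved to position 2 in the rebuilt table.
    std::vector<unsigned long> col_map = {0, UNDEFINED, 1, 2};

    for (unsigned long old_no = 0; old_no < col_map.size(); old_no++) {
        unsigned long new_no = col_map[old_no];
        if (new_no == UNDEFINED) {
            std::printf("old col %lu: dropped, skipped\n", old_no);
            continue;  // mirrors the `continue` in row_build_low()
        }
        std::printf("old col %lu -> new col %lu\n", old_no, new_no);
    }
}
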
+
+/*******************************************************************//**
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index.
+@return own: row built; see the NOTE below! */
+dtuple_t*
+row_build(
+/*======*/
+ ulint type, /*!< in: ROW_COPY_POINTERS or
+ ROW_COPY_DATA; the latter
+ copies also the data fields to
+ heap while the first only
+ places pointers to data fields
+ on the index page, and thus is
+ more efficient */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_t* rec, /*!< in: record in the clustered
+ index; NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the row dtuple is used! */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index)
+ or NULL, in which case this function
+ will invoke rec_get_offsets() */
+ const dict_table_t* col_table,
+ /*!< in: table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead */
+ const dtuple_t* defaults,
+ /*!< in: default values of
+ added and changed columns, or NULL */
+ const ulint* col_map,/*!< in: mapping of old column
+ numbers to new ones, or NULL */
+ row_ext_t** ext, /*!< out, own: cache of
+ externally stored column
+ prefixes, or NULL */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+{
+ return(row_build_low(type, index, rec, offsets, col_table,
+ defaults, NULL, col_map, ext, heap));
+}
+
+/** An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index, allowing for new virtual columns that an
+ongoing operation may still be adding.
+@param[in] type ROW_COPY_POINTERS or ROW_COPY_DATA;
+@param[in] index clustered index
+@param[in] rec record in the clustered index
+@param[in] offsets rec_get_offsets(rec,index) or NULL
+@param[in] col_table table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead
+@param[in] defaults default values of added, changed columns, or NULL
+@param[in] add_v new virtual columns added
+ along with new indexes
+@param[in] col_map mapping of old column
+ numbers to new ones, or NULL
+@param[in] ext cache of externally stored column
+ prefixes, or NULL
+@param[in] heap memory heap from which
+ the memory needed is allocated
+@return own: row built; */
+dtuple_t*
+row_build_w_add_vcol(
+ ulint type,
+ const dict_index_t* index,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ const dict_table_t* col_table,
+ const dtuple_t* defaults,
+ const dict_add_v_col_t* add_v,
+ const ulint* col_map,
+ row_ext_t** ext,
+ mem_heap_t* heap)
+{
+ return(row_build_low(type, index, rec, offsets, col_table,
+ defaults, add_v, col_map, ext, heap));
+}
+
+/** Convert an index record to a data tuple.
+@tparam metadata whether the index->instant_field_value() needs to be accessed
+@tparam mblob 1 if rec_is_alter_metadata();
+2 if we want converted metadata corresponding to info_bits
+@param[in] rec index record
+@param[in] index index
+@param[in] offsets rec_get_offsets(rec, index)
+@param[out] n_ext number of externally stored columns
+@param[in,out] heap memory heap for allocations
+@param[in] info_bits (only used if mblob=2)
+@param[in] pad (only used if mblob=2)
+@return index entry built; does not set info_bits, and the data fields
+in the entry will point directly to rec */
+template<bool metadata, int mblob = 0>
+static inline
+dtuple_t*
+row_rec_to_index_entry_impl(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ mem_heap_t* heap,
+ ulint info_bits = 0,
+ bool pad = false)
+{
+ ut_ad(rec != NULL);
+ ut_ad(heap != NULL);
+ ut_ad(index != NULL);
+ ut_ad(!mblob || index->is_primary());
+ ut_ad(!mblob || !index->table->is_temporary());
+ ut_ad(!mblob || !dict_index_is_spatial(index));
+ compile_time_assert(!mblob || metadata);
+ compile_time_assert(mblob <= 2);
+ /* Because this function may be invoked by row0merge.cc
+ on a record whose header is in a different format, the check
+ rec_offs_validate(rec, index, offsets) must be avoided here. */
+
+ const bool got = mblob == 2 && rec_is_alter_metadata(rec, *index);
+ ulint rec_len = rec_offs_n_fields(offsets);
+ if (mblob == 2) {
+ ut_ad(info_bits == REC_INFO_METADATA_ALTER
+ || info_bits == REC_INFO_METADATA_ADD);
+ ut_ad(rec_len <= ulint(index->n_fields + got));
+ if (pad) {
+ rec_len = ulint(index->n_fields)
+ + (info_bits == REC_INFO_METADATA_ALTER);
+ } else if (!got && info_bits == REC_INFO_METADATA_ALTER) {
+ rec_len++;
+ }
+ } else {
+ ut_ad(info_bits == 0);
+ ut_ad(!pad);
+ }
+ dtuple_t* entry = dtuple_create(heap, rec_len);
+ dfield_t* dfield = entry->fields;
+
+ dtuple_set_n_fields_cmp(entry,
+ dict_index_get_n_unique_in_tree(index));
+ ut_ad(mblob == 2
+ || rec_len == dict_index_get_n_fields(index) + uint(mblob == 1)
+ /* a record for an older SYS_INDEXES table
+ (missing merge_threshold column) is acceptable. */
+ || (!index->table->is_temporary()
+ && index->table->id == DICT_INDEXES_ID
+ && rec_len + 1 == dict_index_get_n_fields(index)));
+
+ ulint i;
+ for (i = 0; i < (mblob ? index->first_user_field() : rec_len);
+ i++, dfield++) {
+ dict_col_copy_type(dict_index_get_nth_col(index, i),
+ &dfield->type);
+ if (!mblob
+ && dict_index_is_spatial(index)
+ && DATA_GEOMETRY_MTYPE(dfield->type.mtype)) {
+ dfield->type.prtype |= DATA_GIS_MBR;
+ }
+
+ ulint len;
+ const byte* field = metadata
+ ? rec_get_nth_cfield(rec, index, offsets, i, &len)
+ : rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dfield_set_ext(dfield);
+ }
+ }
+
+ if (mblob) {
+ ulint len;
+ const byte* field;
+ ulint j = i;
+
+ if (mblob == 2) {
+ const bool want = info_bits == REC_INFO_METADATA_ALTER;
+ if (got == want) {
+ if (got) {
+ goto copy_metadata;
+ }
+ } else {
+ if (want) {
+ /* Allocate a placeholder for
+ adding metadata in an update. */
+ len = FIELD_REF_SIZE;
+ field = static_cast<byte*>(
+ mem_heap_zalloc(heap, len));
+ /* In reality there is one fewer
+ field present in the record. */
+ rec_len--;
+ goto init_metadata;
+ }
+
+ /* Skip the undesired metadata blob
+ (for example, when rolling back an
+ instant ALTER TABLE). */
+ i++;
+ }
+ goto copy_user_fields;
+ }
+copy_metadata:
+ ut_ad(rec_offs_nth_extern(offsets, i));
+ field = rec_get_nth_field(rec, offsets, i++, &len);
+init_metadata:
+ dfield->type.metadata_blob_init();
+ ut_ad(len == FIELD_REF_SIZE);
+ dfield_set_data(dfield, field, len);
+ dfield_set_ext(dfield++);
+copy_user_fields:
+ for (; i < rec_len; i++, dfield++) {
+ dict_col_copy_type(dict_index_get_nth_col(index, j++),
+ &dfield->type);
+ if (mblob == 2 && pad
+ && i >= rec_offs_n_fields(offsets)) {
+ field = index->instant_field_value(j - 1,
+ &len);
+ dfield_set_data(dfield, field, len);
+ continue;
+ }
+
+ field = rec_get_nth_field(rec, offsets, i, &len);
+ dfield_set_data(dfield, field, len);
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dfield_set_ext(dfield);
+ }
+ }
+ }
+
+ if (mblob == 2) {
+ ulint n_fields = ulint(dfield - entry->fields);
+ ut_ad(entry->n_fields >= n_fields);
+ entry->n_fields = n_fields;
+ }
+ ut_ad(dfield == entry->fields + entry->n_fields);
+ ut_ad(dtuple_check_typed(entry));
+ return entry;
+}
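
The converter above compiles into a separate specialization per (metadata, mblob) pair, so the per-record hot path carries no runtime flag checks, and compile_time_assert rejects illegal combinations. A minimal sketch of the same technique using C++17 if constexpr (the field arithmetic is a toy, not the real conversion):

#include <cstdio>

// Sketch: one implementation, several zero-cost variants selected at
// compile time, with the illegal combination rejected like
// compile_time_assert(!mblob || metadata) above.
template<bool metadata, int mblob = 0>
static int convert(int field)
{
    static_assert(!mblob || metadata, "mblob requires metadata");
    static_assert(mblob <= 2, "mblob is 0, 1 or 2");
    if constexpr (mblob == 2) {
        return field + 200;   // variant with extra metadata handling
    } else if constexpr (metadata) {
        return field + 100;   // metadata-aware variant
    } else {
        return field;         // plain variant
    }
}

int main()
{
    std::printf("%d %d %d\n",
                convert<false>(1), convert<true>(1), convert<true, 2>(1));
    // convert<false, 1>(1) would fail to compile, as intended.
}
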
+
+/** Convert an index record to a data tuple.
+@param[in] rec index record
+@param[in] index index
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in,out] heap memory heap for allocations */
+dtuple_t*
+row_rec_to_index_entry_low(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ mem_heap_t* heap)
+{
+ return row_rec_to_index_entry_impl<false>(rec, index, offsets, heap);
+}
+
+/*******************************************************************//**
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap.
+@return own: index entry built */
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec) */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+{
+ ut_ad(rec != NULL);
+ ut_ad(heap != NULL);
+ ut_ad(index != NULL);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ /* Take a copy of rec to heap */
+ const rec_t* copy_rec = rec_copy(
+ static_cast<byte*>(mem_heap_alloc(heap,
+ rec_offs_size(offsets))),
+ rec, offsets);
+
+ rec_offs_make_valid(copy_rec, index, true,
+ const_cast<rec_offs*>(offsets));
+
+ dtuple_t* entry = rec_is_alter_metadata(copy_rec, *index)
+ ? row_rec_to_index_entry_impl<true,1>(
+ copy_rec, index, offsets, heap)
+ : row_rec_to_index_entry_impl<true>(
+ copy_rec, index, offsets, heap);
+
+ rec_offs_make_valid(rec, index, true,
+ const_cast<rec_offs*>(offsets));
+
+ dtuple_set_info_bits(entry,
+ rec_get_info_bits(rec, rec_offs_comp(offsets)));
+
+ return(entry);
+}
+
+/** Convert a metadata record to a data tuple.
+@param[in] rec metadata record
+@param[in] index clustered index after instant ALTER TABLE
+@param[in] offsets rec_get_offsets(rec)
+@param[in,out] heap memory heap for allocations
+@param[in] info_bits the info_bits after an update
+@param[in] pad whether to pad to index->n_fields */
+dtuple_t*
+row_metadata_to_tuple(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ mem_heap_t* heap,
+ ulint info_bits,
+ bool pad)
+{
+ ut_ad(info_bits == REC_INFO_METADATA_ALTER
+ || info_bits == REC_INFO_METADATA_ADD);
+ ut_ad(rec_is_metadata(rec, *index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ const rec_t* copy_rec = rec_copy(
+ static_cast<byte*>(mem_heap_alloc(heap,
+ rec_offs_size(offsets))),
+ rec, offsets);
+
+ rec_offs_make_valid(copy_rec, index, true,
+ const_cast<rec_offs*>(offsets));
+
+ dtuple_t* entry = info_bits == REC_INFO_METADATA_ALTER
+ || rec_is_alter_metadata(copy_rec, *index)
+ ? row_rec_to_index_entry_impl<true,2>(
+ copy_rec, index, offsets, heap, info_bits, pad)
+ : row_rec_to_index_entry_impl<true>(
+ copy_rec, index, offsets, heap);
+
+ rec_offs_make_valid(rec, index, true,
+ const_cast<rec_offs*>(offsets));
+
+ dtuple_set_info_bits(entry, info_bits);
+ return entry;
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record.
+@return own: row reference built; see the NOTE below! */
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dfield_t* dfield;
+ dtuple_t* ref;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ byte* buf;
+ ulint clust_col_prefix_len;
+ ulint i;
+ mem_heap_t* tmp_heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(index != NULL);
+ ut_ad(rec != NULL);
+ ut_ad(heap != NULL);
+ ut_ad(!dict_index_is_clust(index));
+
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &tmp_heap);
+ /* Secondary indexes must not contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(heap, rec_offs_size(offsets)));
+
+ rec = rec_copy(buf, rec, offsets);
+ rec_offs_make_valid(rec, index, true, offsets);
+ }
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ ut_ad(!rec_offs_nth_default(offsets, pos));
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ /* If the primary key contains a column prefix, then the
+ secondary index may contain a longer prefix of the same
+ column, or the full column, and we must adjust the length
+ accordingly. */
+
+ clust_col_prefix_len = dict_index_get_nth_field(
+ clust_index, i)->prefix_len;
+
+ if (clust_col_prefix_len > 0) {
+ if (len != UNIV_SQL_NULL) {
+
+ const dtype_t* dtype
+ = dfield_get_type(dfield);
+
+ dfield_set_len(dfield,
+ dtype_get_at_most_n_mbchars(
+ dtype->prtype,
+ dtype->mbminlen,
+ dtype->mbmaxlen,
+ clust_col_prefix_len,
+ len, (char*) field));
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ return(ref);
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /*!< in/out: row reference built;
+ see the NOTE below! */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: the data fields in ref
+ will point directly into this
+ record, therefore, the buffer
+ page of this record must be at
+ least s-latched and the latch
+ held as long as the row
+ reference is used! */
+ const dict_index_t* index, /*!< in: secondary index */
+ rec_offs* offsets)/*!< in: rec_get_offsets(rec, index)
+ or NULL */
+{
+ const dict_index_t* clust_index;
+ dfield_t* dfield;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ ulint clust_col_prefix_len;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_a(index->table);
+
+ clust_index = dict_table_get_first_index(index->table);
+ ut_ad(clust_index);
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+ /* Secondary indexes must not contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ut_ad(ref_len == dtuple_get_n_fields(ref));
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ ut_ad(!rec_offs_nth_default(offsets, pos));
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ /* If the primary key contains a column prefix, then the
+ secondary index may contain a longer prefix of the same
+ column, or the full column, and we must adjust the length
+ accordingly. */
+
+ clust_col_prefix_len = dict_index_get_nth_field(
+ clust_index, i)->prefix_len;
+
+ if (clust_col_prefix_len > 0) {
+ if (len != UNIV_SQL_NULL) {
+
+ const dtype_t* dtype
+ = dfield_get_type(dfield);
+
+ dfield_set_len(dfield,
+ dtype_get_at_most_n_mbchars(
+ dtype->prtype,
+ dtype->mbminlen,
+ dtype->mbmaxlen,
+ clust_col_prefix_len,
+ len, (char*) field));
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/***************************************************************//**
+Searches the clustered index record for a row, if we have the row reference.
+@return TRUE if found */
+ibool
+row_search_on_row_ref(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor, which must
+ be closed by the caller */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const dict_table_t* table, /*!< in: table */
+ const dtuple_t* ref, /*!< in: row reference */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ ulint low_match;
+ rec_t* rec;
+ dict_index_t* index;
+
+ ut_ad(dtuple_check_typed(ref));
+
+ index = dict_table_get_first_index(table);
+
+ if (UNIV_UNLIKELY(ref->info_bits != 0)) {
+ ut_ad(ref->is_metadata());
+ ut_ad(ref->n_fields <= index->n_uniq);
+ if (btr_pcur_open_at_index_side(
+ true, index, mode, pcur, true, 0, mtr)
+ != DB_SUCCESS
+ || !btr_pcur_move_to_next_user_rec(pcur, mtr)) {
+ return FALSE;
+ }
+ /* We do not necessarily have index->is_instant() here,
+ because we could be executing a rollback of an
+ instant ADD COLUMN operation. The function
+ rec_is_metadata() asserts index->is_instant();
+ we do not want to call it here. */
+ return rec_get_info_bits(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(index->table))
+ & REC_INFO_MIN_REC_FLAG;
+ } else {
+ ut_a(ref->n_fields == index->n_uniq);
+ if (btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr)
+ != DB_SUCCESS) {
+ return FALSE;
+ }
+ }
+
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (page_rec_is_infimum(rec)) {
+
+ return(FALSE);
+ }
+
+ if (low_match != dtuple_get_n_fields(ref)) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved.
+@return record or NULL, if no record found */
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: secondary index */
+ dict_index_t** clust_index,/*!< out: clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mem_heap_t* heap;
+ dtuple_t* ref;
+ dict_table_t* table;
+ btr_pcur_t pcur;
+ ibool found;
+ rec_t* clust_rec;
+
+ ut_ad(!dict_index_is_clust(index));
+
+ table = index->table;
+
+ heap = mem_heap_create(256);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap);
+
+ found = row_search_on_row_ref(&pcur, mode, table, ref, mtr);
+
+ clust_rec = found ? btr_pcur_get_rec(&pcur) : NULL;
+
+ mem_heap_free(heap);
+
+ btr_pcur_close(&pcur);
+
+ *clust_index = dict_table_get_first_index(table);
+
+ return(clust_rec);
+}
+
+/***************************************************************//**
+Searches an index record.
+@return whether the record was found or buffered */
+enum row_search_result
+row_search_index_entry(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: index entry */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint n_fields;
+ ulint low_match;
+ rec_t* rec;
+
+ ut_ad(dtuple_check_typed(entry));
+
+ if (dict_index_is_spatial(index)) {
+ ut_ad(mode & BTR_MODIFY_LEAF || mode & BTR_MODIFY_TREE);
+ rtr_pcur_open(index, entry, PAGE_CUR_RTREE_LOCATE,
+ mode, pcur, mtr);
+ } else {
+ btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr);
+ }
+
+ switch (btr_pcur_get_btr_cur(pcur)->flag) {
+ case BTR_CUR_DELETE_REF:
+ ut_a(mode & BTR_DELETE && !dict_index_is_spatial(index));
+ return(ROW_NOT_DELETED_REF);
+
+ case BTR_CUR_DEL_MARK_IBUF:
+ case BTR_CUR_DELETE_IBUF:
+ case BTR_CUR_INSERT_TO_IBUF:
+ return(ROW_BUFFERED);
+
+ case BTR_CUR_HASH:
+ case BTR_CUR_HASH_FAIL:
+ case BTR_CUR_BINARY:
+ break;
+ }
+
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ if (page_rec_is_infimum(rec)) {
+
+ return(ROW_NOT_FOUND);
+ } else if (low_match != n_fields) {
+
+ return(ROW_NOT_FOUND);
+ }
+
+ return(ROW_FOUND);
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_INT using "prtype" and writes the result to "buf".
+If the data is in an unknown format, then nothing is written to "buf",
+0 is returned and "format_in_hex" is set to TRUE, otherwise
+"format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf".
+The result is always '\0'-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating '\0').
+@return number of bytes that were written */
+static
+ulint
+row_raw_format_int(
+/*===============*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint prtype, /*!< in: precise type */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size, /*!< in: output buffer size
+ in bytes */
+ ibool* format_in_hex) /*!< out: should the data be
+ formatted in hex */
+{
+ ulint ret;
+
+ if (data_len <= sizeof(ib_uint64_t)) {
+
+ ib_uint64_t value;
+ ibool unsigned_type = prtype & DATA_UNSIGNED;
+
+ value = mach_read_int_type(
+ (const byte*) data, data_len, unsigned_type);
+
+ ret = (ulint) snprintf(
+ buf, buf_size,
+ unsigned_type ? "%llu" : "%lld", (longlong) value)+1;
+ } else {
+
+ *format_in_hex = TRUE;
+ ret = 0;
+ }
+
+ return(ut_min(ret, buf_size));
+}
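
The interesting work is in mach_read_int_type(): InnoDB stores integers big-endian with the sign bit of signed values inverted, so that unsigned byte comparison matches numeric order. A standalone decoder sketch, checked against the unit-test vectors at the end of this file (for example, the signed byte 0x52 decodes to -46):

#include <cstdio>

// Sketch: decode the memcmp-ordered integer storage. Signed values are
// stored big-endian with the sign bit inverted; unsigned values are
// plain big-endian.
static long long read_stored_int(const unsigned char* data, unsigned len,
                                 bool is_unsigned)
{
    unsigned long long v = 0;
    for (unsigned i = 0; i < len; i++) {
        v = (v << 8) | data[i];
    }
    if (is_unsigned) {
        return static_cast<long long>(v);
    }
    v ^= 1ULL << (8 * len - 1);          // undo the flipped sign bit
    if (len < 8 && (v & (1ULL << (8 * len - 1)))) {
        v |= ~0ULL << (8 * len);         // sign-extend negative values
    }
    return static_cast<long long>(v);
}

int main()
{
    const unsigned char a[] = {0x52};
    const unsigned char b[] = {0x7F, 0xFF, 0x90};
    const unsigned char c[] = {0x29, 0xD6};
    std::printf("%lld %lld %lld\n",
                read_stored_int(a, 1, false),   // -46
                read_stored_int(b, 3, false),   // -112
                read_stored_int(c, 2, true));   // 10710
}
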
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "prtype" and writes the
+result to "buf".
+If the data is in binary format, then nothing is written to "buf",
+0 is returned and "format_in_hex" is set to TRUE, otherwise
+"format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf".
+The result is always '\0'-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating '\0').
+@return number of bytes that were written */
+static
+ulint
+row_raw_format_str(
+/*===============*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint prtype, /*!< in: precise type */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size, /*!< in: output buffer size
+ in bytes */
+ ibool* format_in_hex) /*!< out: should the data be
+ formatted in hex */
+{
+ ulint charset_coll;
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ /* we assume system_charset_info is UTF-8 */
+
+ charset_coll = dtype_get_charset_coll(prtype);
+
+ if (UNIV_LIKELY(dtype_is_utf8(prtype))) {
+
+ return(ut_str_sql_format(data, data_len, buf, buf_size));
+ }
+ /* else */
+
+ if (charset_coll == DATA_MYSQL_BINARY_CHARSET_COLL) {
+
+ *format_in_hex = TRUE;
+ return(0);
+ }
+ /* else */
+
+ return(innobase_raw_format(data, data_len, charset_coll,
+ buf, buf_size));
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) using
+"dict_field" and writes the result to "buf".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size is positive) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+ulint
+row_raw_format(
+/*===========*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ const dict_field_t* dict_field, /*!< in: index field */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+ ulint mtype;
+ ulint prtype;
+ ulint ret;
+ ibool format_in_hex;
+
+ ut_ad(data_len != UNIV_SQL_DEFAULT);
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ if (data_len == UNIV_SQL_NULL) {
+
+ ret = snprintf((char*) buf, buf_size, "NULL") + 1;
+
+ return(ut_min(ret, buf_size));
+ }
+
+ mtype = dict_field->col->mtype;
+ prtype = dict_field->col->prtype;
+
+ format_in_hex = FALSE;
+
+ switch (mtype) {
+ case DATA_INT:
+
+ ret = row_raw_format_int(data, data_len, prtype,
+ buf, buf_size, &format_in_hex);
+ if (format_in_hex) {
+
+ goto format_in_hex;
+ }
+ break;
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ case DATA_MYSQL:
+ case DATA_VARMYSQL:
+
+ ret = row_raw_format_str(data, data_len, prtype,
+ buf, buf_size, &format_in_hex);
+ if (format_in_hex) {
+
+ goto format_in_hex;
+ }
+
+ break;
+ /* XXX support more data types */
+ default:
+ format_in_hex:
+
+ if (UNIV_LIKELY(buf_size > 2)) {
+
+ memcpy(buf, "0x", 2);
+ buf += 2;
+ buf_size -= 2;
+ ret = 2 + ut_raw_to_hex(data, data_len,
+ buf, buf_size);
+ } else {
+
+ buf[0] = '\0';
+ ret = 1;
+ }
+ }
+
+ return(ret);
+}
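
For unsupported types and binary strings, the function falls back to a "0x"-prefixed hex dump while preserving the contract: never more than buf_size bytes, always NUL-terminated, and the return value includes the NUL. A standalone sketch of that fallback (a stand-in for the "0x" branch above and ut_raw_to_hex()):

#include <cstddef>
#include <cstdio>

// Sketch: write "0x" followed by a hex dump of data into buf, never
// exceeding buf_size and always NUL-terminating. Returns bytes written
// including the terminating NUL, as row_raw_format() does.
static std::size_t raw_format_hex(const unsigned char* data,
                                  std::size_t data_len,
                                  char* buf, std::size_t buf_size)
{
    if (buf_size == 0) return 0;
    if (buf_size <= 2) {           // no room for "0x" plus a NUL
        buf[0] = '\0';
        return 1;
    }
    std::size_t n = 0;
    buf[n++] = '0';
    buf[n++] = 'x';
    for (std::size_t i = 0; i < data_len && n + 2 < buf_size; i++) {
        n += static_cast<std::size_t>(
            std::snprintf(buf + n, buf_size - n, "%02X", data[i]));
    }
    buf[n++] = '\0';
    return n;
}

int main()
{
    char buf[16];
    const unsigned char data[] = {0xDE, 0xAD, 0xBE, 0xEF};
    std::size_t n = raw_format_hex(data, sizeof data, buf, sizeof buf);
    std::printf("%s (%zu bytes)\n", buf, n);  // 0xDEADBEEF (11 bytes)
}
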
+
+#ifdef UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT
+
+#ifdef HAVE_UT_CHRONO_T
+
+void
+test_row_raw_format_int()
+{
+ ulint ret;
+ char buf[128];
+ ibool format_in_hex;
+ ulint i;
+
+#define CALL_AND_TEST(data, data_len, prtype, buf, buf_size,\
+ ret_expected, buf_expected, format_in_hex_expected)\
+ do {\
+ ibool ok = TRUE;\
+ ulint i;\
+ memset(buf, 'x', 10);\
+ buf[10] = '\0';\
+ format_in_hex = FALSE;\
+ fprintf(stderr, "TESTING \"\\x");\
+ for (i = 0; i < data_len; i++) {\
+ fprintf(stderr, "%02hhX", data[i]);\
+ }\
+ fprintf(stderr, "\", %lu, %lu, %lu\n",\
+ (ulint) data_len, (ulint) prtype,\
+ (ulint) buf_size);\
+ ret = row_raw_format_int(data, data_len, prtype,\
+ buf, buf_size, &format_in_hex);\
+ if (ret != ret_expected) {\
+ fprintf(stderr, "expected ret %lu, got %lu\n",\
+ (ulint) ret_expected, ret);\
+ ok = FALSE;\
+ }\
+ if (strcmp((char*) buf, buf_expected) != 0) {\
+ fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\
+ buf_expected, buf);\
+ ok = FALSE;\
+ }\
+ if (format_in_hex != format_in_hex_expected) {\
+ fprintf(stderr, "expected format_in_hex %d, got %d\n",\
+ (int) format_in_hex_expected,\
+ (int) format_in_hex);\
+ ok = FALSE;\
+ }\
+ if (ok) {\
+ fprintf(stderr, "OK: %lu, \"%s\" %d\n\n",\
+ (ulint) ret, buf, (int) format_in_hex);\
+ } else {\
+ return;\
+ }\
+ } while (0)
+
+#if 1
+ /* min values for signed 1-8 byte integers */
+
+ CALL_AND_TEST("\x00", 1, 0,
+ buf, sizeof(buf), 5, "-128", 0);
+
+ CALL_AND_TEST("\x00\x00", 2, 0,
+ buf, sizeof(buf), 7, "-32768", 0);
+
+ CALL_AND_TEST("\x00\x00\x00", 3, 0,
+ buf, sizeof(buf), 9, "-8388608", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00", 4, 0,
+ buf, sizeof(buf), 12, "-2147483648", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, 0,
+ buf, sizeof(buf), 14, "-549755813888", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, 0,
+ buf, sizeof(buf), 17, "-140737488355328", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, 0,
+ buf, sizeof(buf), 19, "-36028797018963968", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, 0,
+ buf, sizeof(buf), 21, "-9223372036854775808", 0);
+
+ /* min values for unsigned 1-8 byte integers */
+
+ CALL_AND_TEST("\x00", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ /* max values for signed 1-8 byte integers */
+
+ CALL_AND_TEST("\xFF", 1, 0,
+ buf, sizeof(buf), 4, "127", 0);
+
+ CALL_AND_TEST("\xFF\xFF", 2, 0,
+ buf, sizeof(buf), 6, "32767", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF", 3, 0,
+ buf, sizeof(buf), 8, "8388607", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, 0,
+ buf, sizeof(buf), 11, "2147483647", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, 0,
+ buf, sizeof(buf), 13, "549755813887", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, 0,
+ buf, sizeof(buf), 16, "140737488355327", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, 0,
+ buf, sizeof(buf), 18, "36028797018963967", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, 0,
+ buf, sizeof(buf), 20, "9223372036854775807", 0);
+
+ /* max values for unsigned 1-8 byte integers */
+
+ CALL_AND_TEST("\xFF", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 4, "255", 0);
+
+ CALL_AND_TEST("\xFF\xFF", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "65535", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 9, "16777215", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 11, "4294967295", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, DATA_UNSIGNED,
+ buf, sizeof(buf), 14, "1099511627775", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, DATA_UNSIGNED,
+ buf, sizeof(buf), 16, "281474976710655", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, DATA_UNSIGNED,
+ buf, sizeof(buf), 18, "72057594037927935", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 21, "18446744073709551615", 0);
+
+ /* some random values */
+
+ CALL_AND_TEST("\x52", 1, 0,
+ buf, sizeof(buf), 4, "-46", 0);
+
+ CALL_AND_TEST("\x0E", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 3, "14", 0);
+
+ CALL_AND_TEST("\x62\xCE", 2, 0,
+ buf, sizeof(buf), 6, "-7474", 0);
+
+ CALL_AND_TEST("\x29\xD6", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "10710", 0);
+
+ CALL_AND_TEST("\x7F\xFF\x90", 3, 0,
+ buf, sizeof(buf), 5, "-112", 0);
+
+ CALL_AND_TEST("\x00\xA1\x16", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "41238", 0);
+
+ CALL_AND_TEST("\x7F\xFF\xFF\xF7", 4, 0,
+ buf, sizeof(buf), 3, "-9", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x5C", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 3, "92", 0);
+
+ CALL_AND_TEST("\x7F\xFF\xFF\xFF\xFF\xFF\xDC\x63", 8, 0,
+ buf, sizeof(buf), 6, "-9117", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x01\x64\x62", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "91234", 0);
+#endif
+
+ /* speed test */
+
+ ut_chrono_t ch(__func__);
+
+ for (i = 0; i < 1000000; i++) {
+ row_raw_format_int("\x23", 1,
+ 0, buf, sizeof(buf),
+ &format_in_hex);
+ row_raw_format_int("\x23", 1,
+ DATA_UNSIGNED, buf, sizeof(buf),
+ &format_in_hex);
+
+ row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8,
+ 0, buf, sizeof(buf),
+ &format_in_hex);
+ row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8,
+ DATA_UNSIGNED, buf, sizeof(buf),
+ &format_in_hex);
+ }
+}
+
+#endif /* HAVE_UT_CHRONO_T */
+
+#endif /* UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT */
diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc
new file mode 100644
index 00000000..cc82bec4
--- /dev/null
+++ b/storage/innobase/row/row0sel.cc
@@ -0,0 +1,6082 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***************************************************//**
+@file row/row0sel.cc
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0sel.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0trx.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "gis0rtree.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "row0vers.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+#include "row0mysql.h"
+#include "buf0lru.h"
+#include "srv0srv.h"
+#include "srv0mon.h"
+#ifdef WITH_WSREP
+#include "mysql/service_wsrep.h" /* For wsrep_thd_skip_locking */
+#endif
+
+/* Maximum number of rows to prefetch; MySQL interface has another parameter */
+#define SEL_MAX_N_PREFETCH 16
+
+/* Number of rows fetched, after which to start prefetching; MySQL interface
+has another parameter */
+#define SEL_PREFETCH_LIMIT 1
+
+/* When a select has accessed about this many pages, it returns control back
+to que_run_threads: this is to allow canceling runaway queries */
+
+#define SEL_COST_LIMIT 100
+
+/* Flags for search shortcut */
+#define SEL_FOUND 0
+#define SEL_EXHAUSTED 1
+#define SEL_RETRY 2
+
+/********************************************************************//**
+Returns TRUE if the user-defined column in a secondary index record
+is alphabetically the same as the corresponding BLOB column in the clustered
+index record.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation!
+@return whether the columns are equal */
+static
+bool
+row_sel_sec_rec_is_for_blob(
+/*========================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint mbminlen, /*!< in: minimum length of
+ a character, in bytes */
+ ulint mbmaxlen, /*!< in: maximum length of
+ a character, in bytes */
+ const byte* clust_field, /*!< in: the locally stored part of
+ the clustered index column, including
+ the BLOB pointer; the clustered
+ index record must be covered by
+ a lock or a page latch to protect it
+ against deletion (rollback or purge) */
+ ulint clust_len, /*!< in: length of clust_field */
+ const byte* sec_field, /*!< in: column in secondary index */
+ ulint sec_len, /*!< in: length of sec_field */
+ ulint prefix_len, /*!< in: index column prefix length
+ in bytes, or 0 for full column */
+ dict_table_t* table) /*!< in: table */
+{
+ ulint len;
+ byte buf[REC_VERSION_56_MAX_INDEX_COL_LEN + 1];
+
+ /* This function should never be invoked on tables in
+ ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT, because they
+ should always contain enough prefix in the clustered index record. */
+ ut_ad(dict_table_has_atomic_blobs(table));
+ ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_ad(!prefix_len || prefix_len >= sec_len);
+ ut_a(prefix_len <= sizeof buf);
+
+ if (!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)) {
+ /* The externally stored field was not written yet.
+ This record should only be seen by
+ trx_rollback_recovered() or any
+ TRX_ISO_READ_UNCOMMITTED transactions. */
+ return false;
+ }
+
+ len = btr_copy_externally_stored_field_prefix(
+ buf, prefix_len ? prefix_len : sizeof buf,
+ table->space->zip_size(),
+ clust_field, clust_len);
+
+ if (len == 0) {
+ /* The BLOB was being deleted as the server crashed.
+ There should not be any secondary index records
+ referring to this clustered index record, because
+ btr_free_externally_stored_field() is called after all
+ secondary index entries of the row have been purged. */
+ return false;
+ }
+
+ if (prefix_len) {
+ len = dtype_get_at_most_n_mbchars(prtype, mbminlen, mbmaxlen,
+ prefix_len, len,
+ reinterpret_cast<const char*>
+ (buf));
+ } else if (len >= sizeof buf) {
+ ut_ad("too long column" == 0);
+ return false;
+ }
+
+ return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
+}
+
+/** Read the secondary spatial index record, calculate the minimum
+bounding rectangles for the clustered index record and the secondary
+index record, and compare them.
+@param sec_rec secondary index record
+@param sec_index spatial secondary index
+@param clust_rec clustered index record
+@param clust_index clustered index
+@retval DB_SUCCESS_LOCKED_REC if the secondary record is equal to the
+ corresponding fields in the clustered record, when compared with
+ collation;
+@retval DB_SUCCESS if not equal */
+static
+dberr_t
+row_sel_spatial_sec_rec_is_for_clust_rec(
+ const rec_t *sec_rec, const dict_index_t *sec_index,
+ const rec_t *clust_rec, dict_index_t *clust_index)
+{
+ mem_heap_t *heap= mem_heap_create(256);
+ rec_offs clust_offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *clust_offs= clust_offsets_;
+ ulint clust_len;
+
+ rec_offs_init(clust_offsets_);
+ ulint clust_pos= dict_col_get_clust_pos(
+ dict_index_get_nth_col(sec_index, 0), clust_index);
+ clust_offs= rec_get_offsets(clust_rec, clust_index, clust_offs,
+ clust_index->n_core_fields, clust_pos + 1,
+ &heap);
+ ut_ad(sec_index->n_user_defined_cols == 1);
+ const byte *clust_field= rec_get_nth_field(clust_rec, clust_offs,
+ clust_pos, &clust_len);
+ if (clust_len == UNIV_SQL_NULL || clust_len < GEO_DATA_HEADER_SIZE)
+ {
+ ut_ad("corrupted geometry column" == 0);
+err_exit:
+ mem_heap_free(heap);
+ return DB_SUCCESS;
+ }
+
+ /* For externally stored field, we need to get full
+ geo data to generate the MBR for comparing. */
+ if (rec_offs_nth_extern(clust_offs, clust_pos))
+ {
+ clust_field= btr_copy_externally_stored_field(
+ &clust_len, clust_field, sec_index->table->space->zip_size(),
+ clust_len, heap);
+ if (clust_field == NULL)
+ {
+ ut_ad("corrupted geometry blob" == 0);
+ goto err_exit;
+ }
+ }
+
+ ut_ad(clust_len >= GEO_DATA_HEADER_SIZE);
+ rtr_mbr_t tmp_mbr;
+ rtr_mbr_t sec_mbr;
+
+ rtree_mbr_from_wkb(
+ clust_field + GEO_DATA_HEADER_SIZE,
+ static_cast<uint>(clust_len - GEO_DATA_HEADER_SIZE),
+ SPDIMS, reinterpret_cast<double*>(&tmp_mbr));
+
+ rtr_read_mbr(sec_rec, &sec_mbr);
+
+ mem_heap_free(heap);
+ return MBR_EQUAL_CMP(&sec_mbr, &tmp_mbr)
+ ? DB_SUCCESS_LOCKED_REC
+ : DB_SUCCESS;
+}
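+
+/* Illustrative sketch (comment only, not compiled): for a clustered record
+storing WKB POINT(1 2), the MBR computed above degenerates to a point, and
+the secondary R-tree record stores that same MBR. Assuming the usual
+{ xmin, xmax, ymin, ymax } layout of rtr_mbr_t:
+
+	rtr_mbr_t mbr;
+	mbr.xmin = mbr.xmax = 1.0;
+	mbr.ymin = mbr.ymax = 2.0;
+	// MBR_EQUAL_CMP(&sec_mbr, &mbr) would then hold, and the
+	// function would return DB_SUCCESS_LOCKED_REC.
+*/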
+
+/** Checks whether the user-defined column values in a secondary index
+record are alphabetically the same as the corresponding columns in the
+clustered index record.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation!
+@param[in] sec_rec secondary index record
+@param[in] sec_index secondary index
+@param[in] clust_rec clustered index record;
+ must be protected by a page s-latch
+@param[in] clust_index clustered index
+@param[in] thr query thread
+@retval DB_COMPUTE_VALUE_FAILED in case of virtual column value computation
+ failure.
+@retval DB_SUCCESS_LOCKED_REC if the secondary record is equal to the
+ corresponding fields in the clustered record, when compared with
+ collation;
+@retval DB_SUCCESS if not equal or if the clustered record has been marked
+ for deletion */
+static
+dberr_t
+row_sel_sec_rec_is_for_clust_rec(
+ const rec_t* sec_rec,
+ dict_index_t* sec_index,
+ const rec_t* clust_rec,
+ dict_index_t* clust_index,
+ que_thr_t* thr)
+{
+ if (rec_get_deleted_flag(clust_rec,
+ dict_table_is_comp(clust_index->table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(rec_get_trx_id(clust_rec, clust_index));
+
+ /* The clustered index record is delete-marked;
+ it is not visible in the read view. Besides,
+ if there are any externally stored columns,
+ some of them may have already been purged. */
+ return DB_SUCCESS;
+ }
+
+ if (dict_index_is_spatial(sec_index)) {
+ return row_sel_spatial_sec_rec_is_for_clust_rec(
+ sec_rec, sec_index, clust_rec,
+ clust_index);
+ }
+
+ const byte* sec_field;
+ ulint sec_len;
+ const byte* clust_field;
+ ulint n;
+ ulint i;
+ mem_heap_t* heap = mem_heap_create(256);
+ rec_offs clust_offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs sec_offsets_[REC_OFFS_SMALL_SIZE];
+ rec_offs* clust_offs = clust_offsets_;
+ rec_offs* sec_offs = sec_offsets_;
+
+ rec_offs_init(clust_offsets_);
+ rec_offs_init(sec_offsets_);
+
+ ib_vcol_row vc(heap);
+
+ clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
+ sec_index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ n = dict_index_get_n_ordering_defined_by_user(sec_index);
+
+ for (i = 0; i < n; i++) {
+ const dict_field_t* ifield;
+ const dict_col_t* col;
+ ulint clust_pos = 0;
+ ulint clust_len = 0;
+ ulint len;
+
+ ifield = dict_index_get_nth_field(sec_index, i);
+ col = dict_field_get_col(ifield);
+
+ sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
+
+ const bool is_virtual = col->is_virtual();
+
+ /* For virtual column, its value will need to be
+ reconstructed from base column in cluster index */
+ if (is_virtual) {
+ const dict_v_col_t* v_col;
+ dfield_t* vfield;
+ row_ext_t* ext;
+
+ byte *record = vc.record(thr_get_trx(thr)->mysql_thd,
+ clust_index,
+ &thr->prebuilt->m_mysql_table);
+
+ v_col = reinterpret_cast<const dict_v_col_t*>(col);
+
+ dtuple_t* row = row_build(
+ ROW_COPY_POINTERS,
+ clust_index, clust_rec,
+ clust_offs,
+ NULL, NULL, NULL, &ext, heap);
+
+ vfield = innobase_get_computed_value(
+ row, v_col, clust_index,
+ &heap, NULL, NULL,
+ thr_get_trx(thr)->mysql_thd,
+ thr->prebuilt->m_mysql_table,
+ record, NULL, NULL, NULL);
+
+ if (vfield == NULL) {
+ innobase_report_computed_value_failed(row);
+ return DB_COMPUTE_VALUE_FAILED;
+ }
+ len = clust_len = vfield->len;
+ clust_field = static_cast<byte*>(vfield->data);
+ } else {
+ clust_pos = dict_col_get_clust_pos(col, clust_index);
+
+ clust_field = rec_get_nth_cfield(
+ clust_rec, clust_index, clust_offs,
+ clust_pos, &clust_len);
+ if (clust_len == UNIV_SQL_NULL) {
+ if (sec_len == UNIV_SQL_NULL) {
+ continue;
+ }
+ return DB_SUCCESS;
+ }
+ if (sec_len == UNIV_SQL_NULL) {
+ return DB_SUCCESS;
+ }
+
+ len = clust_len;
+ if (rec_offs_nth_extern(clust_offs, clust_pos)) {
+ len -= BTR_EXTERN_FIELD_REF_SIZE;
+ }
+
+ if (ulint prefix_len = ifield->prefix_len) {
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminlen,
+ col->mbmaxlen, prefix_len, len,
+ reinterpret_cast<const char*>(
+ clust_field));
+ if (len < sec_len) {
+ goto check_for_blob;
+ }
+ } else {
+check_for_blob:
+ if (rec_offs_nth_extern(clust_offs,
+ clust_pos)) {
+ if (!row_sel_sec_rec_is_for_blob(
+ col->mtype, col->prtype,
+ col->mbminlen,
+ col->mbmaxlen,
+ clust_field, clust_len,
+ sec_field, sec_len,
+ prefix_len,
+ clust_index->table)) {
+ return DB_SUCCESS;
+ }
+
+ continue;
+ }
+ }
+ }
+
+ if (0 != cmp_data_data(col->mtype, col->prtype,
+ clust_field, len,
+ sec_field, sec_len)) {
+ return DB_SUCCESS;
+ }
+ }
+
+ return DB_SUCCESS_LOCKED_REC;
+}
+
+/*********************************************************************//**
+Creates a select node struct.
+@return own: select node struct */
+sel_node_t*
+sel_node_create(
+/*============*/
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ sel_node_t* node;
+
+ node = static_cast<sel_node_t*>(
+ mem_heap_alloc(heap, sizeof(sel_node_t)));
+
+ node->common.type = QUE_NODE_SELECT;
+ node->state = SEL_NODE_OPEN;
+
+ node->plans = NULL;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Frees the memory private to a select node when a query graph is freed,
+does not free the heap where the node was originally created. */
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node) /*!< in: select node struct */
+{
+ ulint i;
+ plan_t* plan;
+
+ if (node->plans != NULL) {
+ for (i = 0; i < node->n_tables; i++) {
+ plan = sel_node_get_nth_plan(node, i);
+
+ btr_pcur_close(&(plan->pcur));
+ btr_pcur_close(&(plan->clust_pcur));
+
+ if (plan->old_vers_heap) {
+ mem_heap_free(plan->old_vers_heap);
+ }
+ }
+ }
+}
+
+/*********************************************************************//**
+Evaluates the values in a select list. If there are aggregate functions,
+their argument value is added to the aggregate total. */
+UNIV_INLINE
+void
+sel_eval_select_list(
+/*=================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ que_node_t* exp;
+
+ exp = node->select_list;
+
+ while (exp) {
+ eval_exp(exp);
+
+ exp = que_node_get_next(exp);
+ }
+}
+
+/*********************************************************************//**
+Assigns the values in the select list to the possible into-variables in
+SELECT ... INTO ... */
+UNIV_INLINE
+void
+sel_assign_into_var_values(
+/*=======================*/
+ sym_node_t* var, /*!< in: first variable in a list of
+ variables */
+ sel_node_t* node) /*!< in: select node */
+{
+ que_node_t* exp;
+
+ if (var == NULL) {
+
+ return;
+ }
+
+ for (exp = node->select_list;
+ var != 0;
+ var = static_cast<sym_node_t*>(que_node_get_next(var))) {
+
+ ut_ad(exp);
+
+ eval_node_copy_val(var->alias, exp);
+
+ exp = que_node_get_next(exp);
+ }
+}
+
+/*********************************************************************//**
+Resets the aggregate value totals in the select list of an aggregate type
+query. */
+UNIV_INLINE
+void
+sel_reset_aggregate_vals(
+/*=====================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ func_node_t* func_node;
+
+ ut_ad(node->is_aggregate);
+
+ for (func_node = static_cast<func_node_t*>(node->select_list);
+ func_node != 0;
+ func_node = static_cast<func_node_t*>(
+ que_node_get_next(func_node))) {
+
+ eval_node_set_int_val(func_node, 0);
+ }
+
+ node->aggregate_already_fetched = FALSE;
+}
+
+/*********************************************************************//**
+Copies the input variable values when an explicit cursor is opened. */
+UNIV_INLINE
+void
+row_sel_copy_input_variable_vals(
+/*=============================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ sym_node_t* var;
+
+ var = UT_LIST_GET_FIRST(node->copy_variables);
+
+ while (var) {
+ eval_node_copy_val(var, var->alias);
+
+ var->indirection = NULL;
+
+ var = UT_LIST_GET_NEXT(col_var_list, var);
+ }
+}
+
+/*********************************************************************//**
+Fetches the column values from a record. */
+static
+void
+row_sel_fetch_columns(
+/*==================*/
+ dict_index_t* index, /*!< in: record index */
+ const rec_t* rec, /*!< in: record in a clustered or non-clustered
+ index; must be protected by a page latch */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ sym_node_t* column) /*!< in: first column in a column list, or
+ NULL */
+{
+ dfield_t* val;
+ ulint index_type;
+ ulint field_no;
+ const byte* data;
+ ulint len;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ index_type = SYM_CLUST_FIELD_NO;
+ } else {
+ index_type = SYM_SEC_FIELD_NO;
+ }
+
+ while (column) {
+ mem_heap_t* heap = NULL;
+ ibool needs_copy;
+
+ field_no = column->field_nos[index_type];
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ if (UNIV_UNLIKELY(rec_offs_nth_extern(
+ offsets, field_no) != 0)) {
+
+ /* Copy an externally stored field to the
+ temporary heap, if possible. */
+
+ heap = mem_heap_create(1);
+
+ data = btr_rec_copy_externally_stored_field(
+ rec, offsets,
+ index->table->space->zip_size(),
+ field_no, &len, heap);
+
+ /* data == NULL means that the
+ externally stored field was not
+ written yet. This record
+ should only be seen by
+ trx_rollback_recovered() or any
+ TRX_ISO_READ_UNCOMMITTED
+ transactions. The InnoDB SQL parser
+ (the sole caller of this function)
+ does not implement READ UNCOMMITTED,
+ and it is not involved during rollback. */
+ ut_a(data);
+ ut_a(len != UNIV_SQL_NULL);
+
+ needs_copy = TRUE;
+ } else {
+ data = rec_get_nth_cfield(rec, index, offsets,
+ field_no, &len);
+ needs_copy = column->copy_val;
+ }
+
+ if (needs_copy) {
+ eval_node_copy_and_alloc_val(column, data,
+ len);
+ } else {
+ val = que_node_get_val(column);
+ dfield_set_data(val, data, len);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*********************************************************************//**
+Allocates a prefetch buffer for a column when prefetching is done for
+the first time. */
+static
+void
+sel_col_prefetch_buf_alloc(
+/*=======================*/
+ sym_node_t* column) /*!< in: symbol table node for a column */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
+
+ column->prefetch_buf = static_cast<sel_buf_t*>(
+ ut_malloc_nokey(SEL_MAX_N_PREFETCH * sizeof(sel_buf_t)));
+
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = column->prefetch_buf + i;
+
+ sel_buf->data = NULL;
+ sel_buf->len = 0;
+ sel_buf->val_buf_size = 0;
+ }
+}
+
+/*********************************************************************//**
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = prefetch_buf + i;
+
+ if (sel_buf->val_buf_size > 0) {
+
+ ut_free(sel_buf->data);
+ }
+ }
+
+ ut_free(prefetch_buf);
+}
+
+/*********************************************************************//**
+Pops the column values for a prefetched, cached row from the column prefetch
+buffers and places them to the val fields in the column nodes. */
+static
+void
+sel_dequeue_prefetched_row(
+/*=======================*/
+ plan_t* plan) /*!< in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint val_buf_size;
+
+ ut_ad(plan->n_rows_prefetched > 0);
+
+ column = UT_LIST_GET_FIRST(plan->columns);
+
+ while (column) {
+ val = que_node_get_val(column);
+
+ if (!column->copy_val) {
+ /* We did not really push any value for the
+ column */
+
+ ut_ad(!column->prefetch_buf);
+ ut_ad(que_node_get_val_buf_size(column) == 0);
+ ut_d(dfield_set_null(val));
+
+ goto next_col;
+ }
+
+ ut_ad(column->prefetch_buf);
+ ut_ad(!dfield_is_ext(val));
+
+ sel_buf = column->prefetch_buf + plan->first_prefetched;
+
+ data = sel_buf->data;
+ len = sel_buf->len;
+ val_buf_size = sel_buf->val_buf_size;
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ sel_buf->data = static_cast<byte*>(dfield_get_data(val));
+ sel_buf->len = dfield_get_len(val);
+ sel_buf->val_buf_size = que_node_get_val_buf_size(column);
+
+ dfield_set_data(val, data, len);
+ que_node_set_val_buf_size(column, val_buf_size);
+next_col:
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+
+ plan->n_rows_prefetched--;
+
+ plan->first_prefetched++;
+}
+
+/*********************************************************************//**
+Pushes the column values for a prefetched, cached row to the column prefetch
+buffers from the val fields in the column nodes. */
+UNIV_INLINE
+void
+sel_enqueue_prefetched_row(
+/*=======================*/
+ plan_t* plan) /*!< in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint pos;
+ ulint val_buf_size;
+
+ if (plan->n_rows_prefetched == 0) {
+ pos = 0;
+ plan->first_prefetched = 0;
+ } else {
+ pos = plan->n_rows_prefetched;
+
+ /* We have the convention that pushing new rows starts only
+ after the prefetch stack has been emptied: */
+
+ ut_ad(plan->first_prefetched == 0);
+ }
+
+ plan->n_rows_prefetched++;
+
+ ut_ad(pos < SEL_MAX_N_PREFETCH);
+
+ for (column = UT_LIST_GET_FIRST(plan->columns);
+ column != 0;
+ column = UT_LIST_GET_NEXT(col_var_list, column)) {
+
+ if (!column->copy_val) {
+			/* It makes no sense to push pointers to database
+			page fields when we do not keep a latch on the
+			page! */
+ continue;
+ }
+
+ if (!column->prefetch_buf) {
+ /* Allocate a new prefetch buffer */
+
+ sel_col_prefetch_buf_alloc(column);
+ }
+
+ sel_buf = column->prefetch_buf + pos;
+
+ val = que_node_get_val(column);
+
+ data = static_cast<byte*>(dfield_get_data(val));
+ len = dfield_get_len(val);
+ val_buf_size = que_node_get_val_buf_size(column);
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ dfield_set_data(val, sel_buf->data, sel_buf->len);
+ que_node_set_val_buf_size(column, sel_buf->val_buf_size);
+
+ sel_buf->data = data;
+ sel_buf->len = len;
+ sel_buf->val_buf_size = val_buf_size;
+ }
+}
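+
+/* Illustrative sketch (comment only, not compiled): the enqueue above and
+the dequeue in sel_dequeue_prefetched_row() transfer buffer ownership by
+swapping the data pointer, the length and the allocated size between the
+column value and the prefetch slot instead of copying the data, so every
+heap-allocated buffer keeps exactly one owner and can be freed later:
+
+	std::swap(slot_data, val_data);
+	std::swap(slot_len, val_len);
+	std::swap(slot_buf_size, val_buf_size);
+*/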
+
+/*********************************************************************//**
+Builds a previous version of a clustered index record for a consistent read
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_sel_build_prev_vers(
+/*====================*/
+ ReadView* read_view, /*!< in: read view */
+	dict_index_t*	index,		/*!< in: clustered index */
+ rec_t* rec, /*!< in: record in a clustered index */
+ rec_offs** offsets, /*!< in/out: offsets returned by
+					rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t** old_vers_heap, /*!< out: old version heap to use */
+ rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dberr_t err;
+
+ if (*old_vers_heap) {
+ mem_heap_empty(*old_vers_heap);
+ } else {
+ *old_vers_heap = mem_heap_create(512);
+ }
+
+ err = row_vers_build_for_consistent_read(
+ rec, mtr, index, offsets, read_view, offset_heap,
+ *old_vers_heap, old_vers, NULL);
+ return(err);
+}
+
+/*********************************************************************//**
+Builds the last committed version of a clustered index record for a
+semi-consistent read. */
+static
+void
+row_sel_build_committed_vers_for_mysql(
+/*===================================*/
+ dict_index_t* clust_index, /*!< in: clustered index */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: record in a clustered index */
+ rec_offs** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, clust_index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ const rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ dtuple_t** vrow, /*!< out: to be filled with old virtual
+ column version if any */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (prebuilt->old_vers_heap) {
+ mem_heap_empty(prebuilt->old_vers_heap);
+ } else {
+ prebuilt->old_vers_heap = mem_heap_create(
+ rec_offs_size(*offsets));
+ }
+
+ row_vers_build_for_semi_consistent_read(prebuilt->trx,
+ rec, mtr, clust_index, offsets, offset_heap,
+ prebuilt->old_vers_heap, old_vers, vrow);
+}
+
+/*********************************************************************//**
+Tests the conditions which determine when the index segment we are searching
+through has been exhausted.
+@return TRUE if row passed the tests */
+UNIV_INLINE
+ibool
+row_sel_test_end_conds(
+/*===================*/
+ plan_t* plan) /*!< in: plan for the table; the column values must
+ already have been retrieved and the right sides of
+ comparisons evaluated */
+{
+ func_node_t* cond;
+
+ /* All conditions in end_conds are comparisons of a column to an
+ expression */
+
+ for (cond = UT_LIST_GET_FIRST(plan->end_conds);
+ cond != 0;
+ cond = UT_LIST_GET_NEXT(cond_list, cond)) {
+
+ /* Evaluate the left side of the comparison, i.e., get the
+ column value if there is an indirection */
+
+ eval_sym(static_cast<sym_node_t*>(cond->args));
+
+ /* Do the comparison */
+
+ if (!eval_cmp(cond)) {
+
+ return(FALSE);
+ }
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Tests the other conditions.
+@return TRUE if row passed the tests */
+UNIV_INLINE
+ibool
+row_sel_test_other_conds(
+/*=====================*/
+ plan_t* plan) /*!< in: plan for the table; the column values must
+ already have been retrieved */
+{
+ func_node_t* cond;
+
+ cond = UT_LIST_GET_FIRST(plan->other_conds);
+
+ while (cond) {
+ eval_exp(cond);
+
+ if (!eval_node_get_ibool_val(cond)) {
+
+ return(FALSE);
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ return(TRUE);
+}
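+
+/* Example (an assumed illustration): for an ascending scan of an index on
+column a that should satisfy a <= 20 AND b < 3, the comparison a <= 20 is
+an end condition: as soon as it fails, the index segment is exhausted and
+the scan of this table can stop. The comparison b < 3 belongs to the other
+conditions: a row failing it is merely skipped and the scan continues. */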
+
+/*********************************************************************//**
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_sel_get_clust_rec(
+/*==================*/
+ sel_node_t* node, /*!< in: select_node */
+ plan_t* plan, /*!< in: plan node for table */
+ rec_t* rec, /*!< in: record in a non-clustered index */
+ que_thr_t* thr, /*!< in: query thread */
+ rec_t** out_rec,/*!< out: clustered record or an old version of
+ it, NULL if the old version did not exist
+ in the read view, i.e., it was a fresh
+ inserted version */
+ mtr_t* mtr) /*!< in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* index;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ dberr_t err = DB_SUCCESS;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ *out_rec = NULL;
+
+ offsets = rec_get_offsets(rec,
+ btr_pcur_get_btr_cur(&plan->pcur)->index,
+ offsets,
+ btr_pcur_get_btr_cur(&plan->pcur)->index
+ ->n_core_fields, ULINT_UNDEFINED, &heap);
+
+ row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
+
+ index = dict_table_get_first_index(plan->table);
+
+ btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
+ BTR_SEARCH_LEAF, &plan->clust_pcur,
+ 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(&(plan->clust_pcur))
+ < dict_index_get_n_unique(index)) {
+
+ ut_a(rec_get_deleted_flag(rec,
+ dict_table_is_comp(plan->table)));
+ ut_a(node->read_view);
+
+ /* In a rare case it is possible that no clust rec is found
+		for a delete-marked secondary index record: if
+		row_undo_mod_remove_clust_low() in row0umod.cc has already removed
+ the clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case we know that the
+ clustered index record did not exist in the read view of
+ trx. */
+
+ goto err_exit;
+ }
+
+ offsets = rec_get_offsets(clust_rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (!node->read_view) {
+ /* Try to place a lock on the index record */
+ trx_t* trx = thr_get_trx(thr);
+
+ /* At READ UNCOMMITTED or READ COMMITTED isolation level
+ we lock only the record, i.e., next-key locking is
+ not used. */
+ err = lock_clust_rec_read_check_and_lock(
+ 0, btr_pcur_get_block(&plan->clust_pcur),
+ clust_rec, index, offsets,
+ node->row_lock_mode,
+ trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ ? LOCK_REC_NOT_GAP : LOCK_ORDINARY,
+ thr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_SUCCESS_LOCKED_REC:
+ /* Declare the variable uninitialized.
+			It should be set to DB_SUCCESS before err_exit. */
+ MEM_UNDEFINED(&err, sizeof err);
+ break;
+ default:
+ goto err_exit;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ old_vers = NULL;
+
+ if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
+ node->read_view)) {
+
+ err = row_sel_build_prev_vers(
+ node->read_view, index, clust_rec,
+ &offsets, &heap, &plan->old_vers_heap,
+ &old_vers, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+
+ clust_rec = old_vers;
+
+ if (clust_rec == NULL) {
+ goto err_exit;
+ }
+ }
+
+ /* If we had to go to an earlier version of row or the
+ secondary index record is delete marked, then it may be that
+ the secondary index record corresponding to clust_rec
+ (or old_vers) is not rec; in that case we must ignore
+ such row because in our snapshot rec would not have existed.
+ Remember that from rec we cannot see directly which transaction
+ id corresponds to it: we have to go to the clustered index
+ record. A query where we want to fetch all rows where
+ the secondary index value is in some interval would return
+ a wrong result if we would not drop rows which we come to
+ visit through secondary index records that would not really
+ exist in our snapshot. */
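+		/* Example: a transaction updates an indexed column from
+		5 to 7, delete-marking the old secondary index entry for
+		5 and inserting a new entry for 7. A reader whose
+		snapshot precedes that update builds old_vers containing
+		5; when the same reader visits the new entry for 7, the
+		check below does not return DB_SUCCESS_LOCKED_REC and the
+		entry is ignored, so the row is returned exactly once,
+		through the delete-marked entry for 5. */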
+
+ if (old_vers || rec_get_deleted_flag(rec, dict_table_is_comp(
+ plan->table))) {
+ err = row_sel_sec_rec_is_for_clust_rec(rec,
+ plan->index, clust_rec,
+ index, thr);
+ if (err != DB_SUCCESS_LOCKED_REC) {
+ goto err_exit;
+ }
+ }
+ }
+
+ /* Fetch the columns needed in test conditions. The clustered
+ index record is protected by a page latch that was acquired
+ when plan->clust_pcur was positioned. The latch will not be
+ released until mtr->commit(). */
+
+ ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));
+ row_sel_fetch_columns(index, clust_rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+ *out_rec = clust_rec;
+ err = DB_SUCCESS;
+err_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/*********************************************************************//**
+Sets a lock on a page of R-Tree records. This is an all-or-none action,
+mostly because we cannot reposition a record in an R-Tree (due to the
+nature of page splitting).
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+UNIV_INLINE
+dberr_t
+sel_set_rtr_rec_lock(
+/*=================*/
+ btr_pcur_t* pcur, /*!< in: cursor */
+ const rec_t* first_rec,/*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ unsigned mode, /*!< in: lock mode */
+ unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+				LOCK_REC_NOT_GAP */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ matched_rec_t* match = pcur->btr_cur.rtr_info->matches;
+ mem_heap_t* heap = NULL;
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx = thr_get_trx(thr);
+ buf_block_t* cur_block = btr_pcur_get_block(pcur);
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* my_offsets = const_cast<rec_offs*>(offsets);
+ rec_t* rec = const_cast<rec_t*>(first_rec);
+ rtr_rec_vector* match_rec;
+ rtr_rec_vector::iterator end;
+
+ rec_offs_init(offsets_);
+
+ if (match->locked || page_rec_is_supremum(first_rec)) {
+ return(DB_SUCCESS_LOCKED_REC);
+ }
+
+ ut_ad(page_align(first_rec) == cur_block->frame);
+ ut_ad(match->valid);
+
+ rw_lock_x_lock(&(match->block.lock));
+retry:
+ cur_block = btr_pcur_get_block(pcur);
+ ut_ad(rw_lock_own_flagged(&match->block.lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+ ut_ad(page_is_leaf(buf_block_get_frame(cur_block)));
+
+ err = lock_sec_rec_read_check_and_lock(
+ 0, cur_block, rec, index, my_offsets,
+ static_cast<lock_mode>(mode), type, thr);
+
+ if (err == DB_LOCK_WAIT) {
+re_scan:
+ mtr->commit();
+ trx->error_state = err;
+ que_thr_stop_for_mysql(thr);
+ thr->lock_state = QUE_THR_LOCK_ROW;
+ if (row_mysql_handle_errors(
+ &err, trx, thr, NULL)) {
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+ mtr->start();
+
+ mutex_enter(&match->rtr_match_mutex);
+ if (!match->valid && match->matched_recs->empty()) {
+ mutex_exit(&match->rtr_match_mutex);
+ err = DB_RECORD_NOT_FOUND;
+ goto func_end;
+ }
+ mutex_exit(&match->rtr_match_mutex);
+
+ /* MDEV-14059 FIXME: why re-latch the block?
+ pcur is already positioned on it! */
+ uint32_t page_no = page_get_page_no(
+ btr_pcur_get_page(pcur));
+
+ cur_block = buf_page_get_gen(
+ page_id_t(index->table->space_id, page_no),
+ index->table->space->zip_size(),
+ RW_X_LATCH, NULL, BUF_GET,
+ __FILE__, __LINE__, mtr, &err);
+ } else {
+ mtr->start();
+ goto func_end;
+ }
+
+ DEBUG_SYNC_C("rtr_set_lock_wait");
+
+ if (!match->valid) {
+ /* Page got deleted */
+ mtr->commit();
+ mtr->start();
+ err = DB_RECORD_NOT_FOUND;
+ goto func_end;
+ }
+
+ match->matched_recs->clear();
+
+ rtr_cur_search_with_match(
+ cur_block, index,
+ pcur->btr_cur.rtr_info->search_tuple,
+ pcur->btr_cur.rtr_info->search_mode,
+ &pcur->btr_cur.page_cur,
+ pcur->btr_cur.rtr_info);
+
+ if (!page_is_leaf(buf_block_get_frame(cur_block))) {
+			/* The page was split and is no longer a leaf
+			(possible only for the root page). Release the
+			page and ask for a re-search. */
+ mtr->commit();
+ mtr->start();
+ err = DB_RECORD_NOT_FOUND;
+ goto func_end;
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ my_offsets = offsets_;
+ my_offsets = rec_get_offsets(rec, index, my_offsets,
+ index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* No match record */
+ if (page_rec_is_supremum(rec) || !match->valid) {
+ mtr->commit();
+ mtr->start();
+ err = DB_RECORD_NOT_FOUND;
+ goto func_end;
+ }
+
+ goto retry;
+ }
+
+ my_offsets = offsets_;
+ match_rec = match->matched_recs;
+ end = match_rec->end();
+
+ for (rtr_rec_vector::iterator it = match_rec->begin();
+ it != end; ++it) {
+ rtr_rec_t* rtr_rec = &(*it);
+
+ my_offsets = rec_get_offsets(
+ rtr_rec->r_rec, index, my_offsets, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ err = lock_sec_rec_read_check_and_lock(
+ 0, &match->block, rtr_rec->r_rec, index,
+ my_offsets, static_cast<lock_mode>(mode),
+ type, thr);
+
+ if (err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC) {
+ rtr_rec->locked = true;
+ } else if (err == DB_LOCK_WAIT) {
+ goto re_scan;
+ } else {
+ goto func_end;
+ }
+ }
+
+ match->locked = true;
+
+func_end:
+ rw_lock_x_unlock(&(match->block.lock));
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+
+ ut_ad(err != DB_LOCK_WAIT);
+
+ return(err);
+}
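+
+/* Note on the all-or-none behaviour above: the R-tree search collects all
+matching records of the page in match->matched_recs, and the loop locks
+every one of them before setting match->locked. If any lock request has to
+wait, the page may be reorganized in the meantime, so the whole page is
+re-searched (re_scan) rather than repositioning on a single record, which
+cannot be done reliably in an R-tree because of page splits. */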
+
+/*********************************************************************//**
+Sets a lock on a record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+UNIV_INLINE
+dberr_t
+sel_set_rec_lock(
+/*=============*/
+ btr_pcur_t* pcur, /*!< in: cursor */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ unsigned mode, /*!< in: lock mode */
+ unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+				LOCK_REC_NOT_GAP */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_t* trx;
+ dberr_t err = DB_SUCCESS;
+ const buf_block_t* block;
+
+ block = btr_pcur_get_block(pcur);
+
+ trx = thr_get_trx(thr);
+
+ if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000
+ && buf_pool.running_out()) {
+ return DB_LOCK_TABLE_FULL;
+ }
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets,
+ static_cast<lock_mode>(mode), type, thr);
+ } else {
+
+ if (dict_index_is_spatial(index)) {
+ if (type == LOCK_GAP || type == LOCK_ORDINARY) {
+ ut_ad(0);
+ ib::error() << "Incorrectly request GAP lock "
+ "on RTree";
+ return(DB_SUCCESS);
+ }
+ err = sel_set_rtr_rec_lock(pcur, rec, index, offsets,
+ mode, type, thr, mtr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets,
+ static_cast<lock_mode>(mode), type, thr);
+ }
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Opens a pcur to a table index. */
+static
+void
+row_sel_open_pcur(
+/*==============*/
+ plan_t* plan, /*!< in: table plan */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dict_index_t* index;
+ func_node_t* cond;
+ que_node_t* exp;
+ ulint n_fields;
+ ulint i;
+
+ index = plan->index;
+
+ /* Calculate the value of the search tuple: the exact match columns
+ get their expressions evaluated when we evaluate the right sides of
+ end_conds */
+
+ cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+ while (cond) {
+ eval_exp(que_node_get_next(cond->args));
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+
+ if (plan->n_exact_match < n_fields) {
+ /* There is a non-exact match field which must be
+ evaluated separately */
+
+ eval_exp(plan->tuple_exps[n_fields - 1]);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ exp = plan->tuple_exps[i];
+
+ dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
+ que_node_get_val(exp));
+ }
+
+ /* Open pcur to the index */
+
+ btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
+ BTR_SEARCH_LEAF, &plan->pcur,
+ NULL, mtr);
+ } else {
+ /* Open the cursor to the start or the end of the index
+ (FALSE: no init) */
+
+ btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
+ &(plan->pcur), false, 0, mtr);
+ }
+
+ ut_ad(plan->n_rows_prefetched == 0);
+ ut_ad(plan->n_rows_fetched == 0);
+ ut_ad(plan->cursor_at_end == FALSE);
+
+ plan->pcur_is_open = TRUE;
+}
+
+/*********************************************************************//**
+Restores a stored pcur position to a table index.
+@return TRUE if the cursor should be moved to the next record after we
+return from this function (moved to the previous, in the case of a
+descending cursor) without processing again the current cursor
+record */
+static
+ibool
+row_sel_restore_pcur_pos(
+/*=====================*/
+ plan_t* plan, /*!< in: table plan */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ibool equal_position;
+ ulint relative_position;
+
+ ut_ad(!plan->cursor_at_end);
+
+ relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
+
+ equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &(plan->pcur), mtr);
+
+ /* If the cursor is traveling upwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
+ yet on the successor of the page infimum;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ not yet processed the cursor record: no need to move the cursor to the
+ next record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we must move to the next record;
+ (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the next
+ record, else there is no need to move the cursor. */
+
+ if (plan->asc) {
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(TRUE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+ return(FALSE);
+ }
+
+ /* If the cursor is traveling downwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
+ the last record LESS than the successor of a page infimum; we have not
+ processed the cursor record: no need to move the cursor;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ processed the cursor record: we should move the cursor to the previous
+ record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we need not move to the previous
+ record; (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the previous
+ record, else there is no need to move the cursor. */
+
+ if (relative_position == BTR_PCUR_BEFORE
+ || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
+
+ return(FALSE);
+ }
+
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(FALSE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+ return(TRUE);
+}
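+
+/* Summary of the cases above ("move" means that the caller should advance
+past the restored position without processing the current record again):
+
+	relative_position	ascending cursor	descending cursor
+	BTR_PCUR_BEFORE		(not allowed)		no move
+	BTR_PCUR_AFTER		no move			move
+	BTR_PCUR_ON, !equal	move			no move
+	BTR_PCUR_ON, equal	move iff stored_cursor_rec_processed
+*/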
+
+/*********************************************************************//**
+Resets a plan cursor to a closed state. */
+UNIV_INLINE
+void
+plan_reset_cursor(
+/*==============*/
+ plan_t* plan) /*!< in: plan */
+{
+ plan->pcur_is_open = FALSE;
+ plan->cursor_at_end = FALSE;
+ plan->n_rows_fetched = 0;
+ plan->n_rows_prefetched = 0;
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/*********************************************************************//**
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always).
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+static
+ulint
+row_sel_try_search_shortcut(
+/*========================*/
+ sel_node_t* node, /*!< in: select node for a consistent read */
+ plan_t* plan, /*!< in: plan for a unique search in clustered
+ index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index = plan->index;
+
+ ut_ad(node->read_view);
+ ut_ad(plan->unique_search);
+ ut_ad(!plan->must_get_clust);
+
+ row_sel_open_pcur(plan, mtr);
+
+ const rec_t* rec = btr_pcur_get_rec(&(plan->pcur));
+
+ if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, *index)) {
+retry:
+ return(SEL_RETRY);
+ }
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched to the search tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
+exhausted:
+ return(SEL_EXHAUSTED);
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (dict_index_is_clust(index)) {
+ if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
+ node->read_view)) {
+ goto retry;
+ }
+ } else if (!srv_read_only_mode
+ && !lock_sec_rec_cons_read_sees(
+ rec, index, node->read_view)) {
+ goto retry;
+ }
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
+ goto exhausted;
+ }
+
+ /* Fetch the columns needed in test conditions. The index
+ record is protected by a page latch that was acquired when
+ plan->pcur was positioned. The latch will not be released
+ until mtr->commit(). */
+
+ row_sel_fetch_columns(index, rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ /* Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+ goto exhausted;
+ }
+
+ ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
+
+ plan->n_rows_fetched++;
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(SEL_FOUND);
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/*********************************************************************//**
+Performs a select step.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_sel(
+/*====*/
+ sel_node_t* node, /*!< in: select node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* index;
+ plan_t* plan;
+ mtr_t mtr;
+ ibool moved;
+ rec_t* rec;
+ rec_t* old_vers;
+ rec_t* clust_rec;
+ ibool consistent_read;
+
+ /* The following flag becomes TRUE when we are doing a
+ consistent read from a non-clustered index and we must look
+ at the clustered index to find out the previous delete mark
+ state of the non-clustered record: */
+
+ ibool cons_read_requires_clust_rec = FALSE;
+ ulint cost_counter = 0;
+ ibool cursor_just_opened;
+ ibool must_go_to_next;
+ ibool mtr_has_extra_clust_latch = FALSE;
+ /* TRUE if the search was made using
+ a non-clustered index, and we had to
+ access the clustered record: now &mtr
+ contains a clustered index latch, and
+ &mtr must be committed before we move
+ to the next non-clustered record */
+ dberr_t err;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(thr->run_node == node);
+
+ if (node->read_view) {
+		/* In consistent reads, we try to make do with the adaptive
+		hash index and to avoid fetching buffer pool pages. This is
+		to reduce memory bus load resulting from semaphore
+		operations. The search latch will be s-locked when we access
+		an index with a unique search condition, but not locked when
+		we access an index with a less selective search condition. */
+
+ consistent_read = TRUE;
+ } else {
+ consistent_read = FALSE;
+ }
+
+table_loop:
+ /* TABLE LOOP
+ ----------
+ This is the outer major loop in calculating a join. We come here when
+ node->fetch_table changes, and after adding a row to aggregate totals
+ and, of course, when this function is called. */
+
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ plan = sel_node_get_nth_plan(node, node->fetch_table);
+ index = plan->index;
+
+ if (plan->n_rows_prefetched > 0) {
+ sel_dequeue_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+ if (plan->cursor_at_end) {
+ /* The cursor has already reached the result set end: no more
+ rows to process for this table cursor, as also the prefetch
+ stack was empty */
+
+ ut_ad(plan->pcur_is_open);
+
+ goto table_exhausted_no_mtr;
+ }
+
+ /* Open a cursor to index, or restore an open cursor position */
+
+ mtr.start();
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (consistent_read && plan->unique_search && !plan->pcur_is_open
+ && !plan->must_get_clust) {
+ switch (row_sel_try_search_shortcut(node, plan, &mtr)) {
+ case SEL_FOUND:
+ goto next_table;
+ case SEL_EXHAUSTED:
+ goto table_exhausted;
+ default:
+ ut_ad(0);
+ /* fall through */
+ case SEL_RETRY:
+ break;
+ }
+
+ plan_reset_cursor(plan);
+
+ mtr.commit();
+ mtr.start();
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (!plan->pcur_is_open) {
+ /* Evaluate the expressions to build the search tuple and
+ open the cursor */
+ row_sel_open_pcur(plan, &mtr);
+
+ cursor_just_opened = TRUE;
+
+ /* A new search was made: increment the cost counter */
+ cost_counter++;
+ } else {
+ /* Restore pcur position to the index */
+
+ must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);
+
+ cursor_just_opened = FALSE;
+
+ if (must_go_to_next) {
+ /* We have already processed the cursor record: move
+ to the next */
+
+ goto next_rec;
+ }
+ }
+
+rec_loop:
+ /* RECORD LOOP
+ -----------
+ In this loop we use pcur and try to fetch a qualifying row, and
+ also fill the prefetch buffer for this table if n_rows_fetched has
+ exceeded a threshold. While we are inside this loop, the following
+ holds:
+ (1) &mtr is started,
+ (2) pcur is positioned and open.
+
+ NOTE that if cursor_just_opened is TRUE here, it means that we came
+ to this point right after row_sel_open_pcur. */
+
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ /* PHASE 1: Set a lock if specified */
+
+ if (!node->asc && cursor_just_opened
+ && !page_rec_is_supremum(rec)) {
+
+		/* Descending search is not supported for spatial indexes */
+ ut_ad(!dict_index_is_spatial(index));
+
+ /* When we open a cursor for a descending search, we must set
+ a next-key lock on the successor record: otherwise it would
+ be possible to insert new records next to the cursor position,
+ and it might be that these new records should appear in the
+ search result set, resulting in the phantom problem. */
+
+ if (!consistent_read) {
+ rec_t* next_rec = page_rec_get_next(rec);
+ unsigned lock_type;
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* At READ UNCOMMITTED or READ COMMITTED
+ isolation level, we lock only the record,
+ i.e., next-key locking is not used. */
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+ if (page_rec_is_supremum(next_rec)) {
+ goto skip_lock;
+ }
+
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = sel_set_rec_lock(&plan->pcur,
+ next_rec, index, offsets,
+ node->row_lock_mode,
+ lock_type, thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ break;
+ default:
+ /* Note that in this case we will store in pcur
+ the PREDECESSOR of the record we are waiting
+ the lock for */
+ goto lock_wait_or_error;
+ }
+ }
+ }
+
+skip_lock:
+ if (page_rec_is_infimum(rec)) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. We also increment the cost counter as we may have
+ processed yet another page of index. */
+
+ cost_counter++;
+
+ goto next_rec;
+ }
+
+ if (rec_is_metadata(rec, *index)) {
+ /* Skip the metadata pseudo-record. */
+ cost_counter++;
+ goto next_rec;
+ }
+
+ if (!consistent_read) {
+ /* Try to place a lock on the index record */
+ unsigned lock_type;
+ trx_t* trx;
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ trx = thr_get_trx(thr);
+
+ /* At READ UNCOMMITTED or READ COMMITTED isolation level,
+ we lock only the record, i.e., next-key locking is
+ not used. */
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ || dict_index_is_spatial(index)) {
+
+ if (page_rec_is_supremum(rec)) {
+
+ goto next_rec;
+ }
+
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = sel_set_rec_lock(&plan->pcur,
+ rec, index, offsets,
+ node->row_lock_mode, lock_type,
+ thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ /* A page supremum record cannot be in the result set: skip
+ it now when we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ if (cost_counter > SEL_COST_LIMIT) {
+
+ /* Now that we have placed the necessary locks, we can stop
+ for a while and store the cursor position; NOTE that if we
+ would store the cursor position BEFORE placing a record lock,
+ it might happen that the cursor would jump over some records
+ that another transaction could meanwhile insert adjacent to
+ the cursor: this would result in the phantom problem. */
+
+ goto stop_for_a_while;
+ }
+
+ /* PHASE 2: Check a mixed index mix id if needed */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search
+ with the mode PAGE_CUR_GE, the up_match field in the cursor
+ tells how many fields in the user record matched to the search
+ tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur))
+ < plan->n_exact_match) {
+ goto table_exhausted;
+ }
+
+ /* Ok, no need to test end_conds or mix id */
+
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ /* PHASE 3: Get previous version in a consistent read */
+
+ cons_read_requires_clust_rec = FALSE;
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (consistent_read) {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (dict_index_is_clust(index)) {
+
+ if (!lock_clust_rec_cons_read_sees(
+ rec, index, offsets, node->read_view)) {
+
+ err = row_sel_build_prev_vers(
+ node->read_view, index, rec,
+ &offsets, &heap, &plan->old_vers_heap,
+ &old_vers, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The record does not exist
+ in our read view. Skip it, but
+ first attempt to determine
+ whether the index segment we
+ are searching through has been
+ exhausted. */
+
+ offsets = rec_get_offsets(
+ rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* Fetch the columns needed in
+ test conditions. The clustered
+ index record is protected by a
+ page latch that was acquired
+ by row_sel_open_pcur() or
+ row_sel_restore_pcur_pos().
+ The latch will not be released
+ until mtr.commit(). */
+
+ row_sel_fetch_columns(
+ index, rec, offsets,
+ UT_LIST_GET_FIRST(
+ plan->columns));
+
+ if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else if (!srv_read_only_mode
+ && !lock_sec_rec_cons_read_sees(
+ rec, index, node->read_view)) {
+
+ cons_read_requires_clust_rec = TRUE;
+ }
+ }
+
+ /* PHASE 4: Test search end conditions and deleted flag */
+
+ /* Fetch the columns needed in test conditions. The record is
+ protected by a page latch that was acquired by
+ row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch
+ will not be released until mtr.commit(). */
+
+ row_sel_fetch_columns(index, rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ /* Test the selection end conditions: these can only contain columns
+ which already are found in the index, even though the index might be
+ non-clustered */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ /* No test necessary: the test was already made above */
+
+ } else if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
+ && !cons_read_requires_clust_rec) {
+
+ /* The record is delete marked: we can skip it if this is
+ not a consistent read which might see an earlier version
+ of a non-clustered index record */
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 5: Get the clustered index record, if needed and if we did
+ not do the search using the clustered index */
+
+ if (plan->must_get_clust || cons_read_requires_clust_rec) {
+
+ /* It was a non-clustered index and we must fetch also the
+ clustered index record */
+
+ err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
+ &mtr);
+ mtr_has_extra_clust_latch = TRUE;
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ /* Retrieving the clustered record required a search:
+ increment the cost counter */
+
+ cost_counter++;
+
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(consistent_read);
+
+ goto next_rec;
+ }
+
+ if (rec_get_deleted_flag(clust_rec,
+ dict_table_is_comp(plan->table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing update_undo log record. */
+ ut_ad(rec_get_trx_id(clust_rec,
+ dict_table_get_first_index(
+ plan->table)));
+
+ /* The record is delete marked: we can skip it */
+
+ goto next_rec;
+ }
+
+ if (node->can_get_updated) {
+
+ btr_pcur_store_position(&(plan->clust_pcur), &mtr);
+ }
+ }
+
+ /* PHASE 6: Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 7: We found a new qualifying row for the current table; push
+ the row if prefetch is on, or move to the next table in the join */
+
+ plan->n_rows_fetched++;
+
+ ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
+
+ if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
+ || plan->unique_search || plan->no_prefetch) {
+
+ /* No prefetch in operation: go to the next table */
+
+ goto next_table;
+ }
+
+ sel_enqueue_prefetched_row(plan);
+
+ if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
+
+ /* The prefetch buffer is now full */
+
+ sel_dequeue_prefetched_row(plan);
+
+ goto next_table;
+ }
+
+next_rec:
+ if (mtr_has_extra_clust_latch) {
+
+ /* We must commit &mtr if we are moving to the next
+ non-clustered index record, because we could break the
+ latching order if we would access a different clustered
+ index page right away without releasing the previous. */
+
+ goto commit_mtr_for_a_while;
+ }
+
+ if (node->asc) {
+ moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
+ } else {
+ moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
+ }
+
+ if (!moved) {
+
+ goto table_exhausted;
+ }
+
+ cursor_just_opened = FALSE;
+
+ /* END OF RECORD LOOP
+ ------------------ */
+ goto rec_loop;
+
+next_table:
+ /* We found a record which satisfies the conditions: we can move to
+ the next table or return a row in the result set */
+
+ ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
+
+ if (plan->unique_search && !node->can_get_updated) {
+
+ plan->cursor_at_end = TRUE;
+ } else {
+ plan->stored_cursor_rec_processed = TRUE;
+
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+ }
+
+ mtr.commit();
+
+ mtr_has_extra_clust_latch = FALSE;
+
+next_table_no_mtr:
+ /* If we use 'goto' to this label, it means that the row was popped
+ from the prefetched rows stack, and &mtr is already committed */
+
+ if (node->fetch_table + 1 == node->n_tables) {
+
+ sel_eval_select_list(node);
+
+ if (node->is_aggregate) {
+
+ goto table_loop;
+ }
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ node->fetch_table++;
+
+ /* When we move to the next table, we first reset the plan cursor:
+ we do not care about resetting it when we backtrack from a table */
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
+
+ goto table_loop;
+
+table_exhausted:
+ /* The table cursor pcur reached the result set end: backtrack to the
+ previous table in the join if we do not have cached prefetched rows */
+
+ plan->cursor_at_end = TRUE;
+
+ mtr.commit();
+
+ mtr_has_extra_clust_latch = FALSE;
+
+ if (plan->n_rows_prefetched > 0) {
+ /* The table became exhausted during a prefetch */
+
+ sel_dequeue_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+table_exhausted_no_mtr:
+ if (node->fetch_table == 0) {
+ err = DB_SUCCESS;
+
+ if (node->is_aggregate && !node->aggregate_already_fetched) {
+
+ node->aggregate_already_fetched = TRUE;
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+ } else {
+ node->state = SEL_NODE_NO_MORE_ROWS;
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ goto func_exit;
+ }
+
+ node->fetch_table--;
+
+ goto table_loop;
+
+stop_for_a_while:
+ /* Return control for a while to que_run_threads, so that runaway
+ queries can be canceled. NOTE that when we come here, we must, in a
+ locking read, have placed the necessary (possibly waiting request)
+ record lock on the cursor record or its successor: when we reposition
+ the cursor, this record lock guarantees that nobody can meanwhile have
+ inserted new records which should have appeared in the result set,
+ which would result in the phantom problem. */
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr.commit();
+ ut_ad(!sync_check_iterate(sync_check()));
+
+ err = DB_SUCCESS;
+ goto func_exit;
+
+commit_mtr_for_a_while:
+ /* Stores the cursor position and commits &mtr; this is used if
+ &mtr may contain latches which would break the latching order if
+ &mtr would not be committed and the latches released. */
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr.commit();
+
+ mtr_has_extra_clust_latch = FALSE;
+ ut_ad(!sync_check_iterate(dict_sync_check()));
+
+ goto table_loop;
+
+lock_wait_or_error:
+ /* See the note at stop_for_a_while: the same holds for this case */
+
+ ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr.commit();
+
+func_exit:
+ ut_ad(!sync_check_iterate(dict_sync_check()));
+
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
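+
+/* Control-flow sketch of row_sel() above (informal summary):
+
+	table_loop:	pick the plan for node->fetch_table; pop a
+			prefetched row, or open/restore the plan cursor
+	rec_loop:	place record locks if needed; for consistent reads,
+			build a previous version; test end_conds and
+			other_conds; fetch the clustered record if required;
+			push the row to the prefetch buffer or go to
+			next_table
+	next_table:	on the last table, evaluate the select list and
+			return; otherwise fetch_table++ and loop again
+	table_exhausted: backtrack to the previous table of the join
+			(fetch_table--) or finish
+*/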
+
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_sel_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ sel_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<sel_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
+
+	/* If this is the first time this node is executed (or when
+	execution resumes after a wait for a table intention lock), set
+	intention locks on the tables, or assign a read view */
+
+ if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
+
+ node->state = SEL_NODE_OPEN;
+ }
+
+ if (node->state == SEL_NODE_OPEN) {
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started_xa(thr_get_trx(thr), false);
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, 0));
+
+ if (node->consistent_read) {
+ trx_t *trx = thr_get_trx(thr);
+ /* Assign a read view for the query */
+ trx->read_view.open(trx);
+ node->read_view = trx->read_view.is_open() ?
+ &trx->read_view : NULL;
+ } else {
+ sym_node_t* table_node;
+ lock_mode i_lock_mode;
+
+ if (node->set_x_locks) {
+ i_lock_mode = LOCK_IX;
+ } else {
+ i_lock_mode = LOCK_IS;
+ }
+
+ for (table_node = node->table_list;
+ table_node != 0;
+ table_node = static_cast<sym_node_t*>(
+ que_node_get_next(table_node))) {
+
+ dberr_t err = lock_table(
+ 0, table_node->table, i_lock_mode,
+ thr);
+
+ if (err != DB_SUCCESS) {
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+ trx->error_state = err;
+
+ return(NULL);
+ }
+ }
+ }
+
+ /* If this is an explicit cursor, copy stored procedure
+ variable values, so that the values cannot change between
+ fetches (currently, we copy them also for non-explicit
+ cursors) */
+
+ if (node->explicit_cursor
+ && UT_LIST_GET_FIRST(node->copy_variables)) {
+
+ row_sel_copy_input_variable_vals(node);
+ }
+
+ node->state = SEL_NODE_FETCH;
+ node->fetch_table = 0;
+
+ if (node->is_aggregate) {
+ /* Reset the aggregate total values */
+ sel_reset_aggregate_vals(node);
+ }
+ }
+
+ dberr_t err = row_sel(node, thr);
+
+ /* NOTE! if queries are parallelized, the following assignment may
+ have problems; the assignment should be made only if thr is the
+ only top-level thr in the graph: */
+
+ thr->graph->last_sel_node = node;
+
+ if (err != DB_SUCCESS) {
+ thr_get_trx(thr)->error_state = err;
+
+ return(NULL);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs a fetch for a cursor.
+@return query thread to run next or NULL */
+que_thr_t*
+fetch_step(
+/*=======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ sel_node_t* sel_node;
+ fetch_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<fetch_node_t*>(thr->run_node);
+ sel_node = node->cursor_def;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
+
+ if (thr->prev_node != que_node_get_parent(node)) {
+
+ if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
+
+ if (node->into_list) {
+ sel_assign_into_var_values(node->into_list,
+ sel_node);
+ } else {
+ ibool ret = (*node->func->func)(
+ sel_node, node->func->arg);
+
+ if (!ret) {
+ sel_node->state
+ = SEL_NODE_NO_MORE_ROWS;
+ }
+ }
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ /* Make the fetch node the parent of the cursor definition for
+ the time of the fetch, so that execution knows to return to this
+ fetch node after a row has been selected or we know that there is
+ no row left */
+
+ sel_node->common.parent = node;
+
+ if (sel_node->state == SEL_NODE_CLOSED) {
+ ib::error() << "fetch called on a closed cursor";
+
+ thr_get_trx(thr)->error_state = DB_ERROR;
+
+ return(NULL);
+ }
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/***********************************************************//**
+Prints a row in a select result.
+@return query thread to run next or NULL */
+que_thr_t*
+row_printf_step(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ row_printf_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* arg;
+
+ ut_ad(thr);
+
+ node = static_cast<row_printf_node_t*>(thr->run_node);
+
+ sel_node = node->sel_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+
+ if (sel_node->state != SEL_NODE_FETCH) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to print */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ arg = sel_node->select_list;
+
+ while (arg) {
+ dfield_print_also_hex(que_node_get_val(arg));
+
+ fputs(" ::: ", stderr);
+
+ arg = que_node_get_next(arg);
+ }
+
+ putc('\n', stderr);
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/****************************************************************//**
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed length field: hence
+the parameter key_len. But currently we do not allow search keys where the
+last field is only a prefix of the full key field length, and we print a
+warning if such a key appears. A counterpart of this function is
+ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /*!< in/out: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /*!< in: buffer to use in field
+ conversions; NOTE that dtuple->data
+ may end up pointing inside buf so
+ do not discard that buffer while
+ the tuple is being used. See
+ row_mysql_store_col_in_innobase_format()
+ in the case of DATA_INT */
+ ulint buf_len, /*!< in: buffer length */
+ dict_index_t* index, /*!< in: index of the key value */
+ const byte* key_ptr, /*!< in: MySQL key value */
+ ulint key_len) /*!< in: MySQL key value length */
+{
+ byte* original_buf = buf;
+ const byte* original_key_ptr = key_ptr;
+ dict_field_t* field;
+ dfield_t* dfield;
+ ulint data_offset;
+ ulint data_len;
+ ulint data_field_len;
+ ibool is_null;
+ const byte* key_end;
+ ulint n_fields = 0;
+
+ /* For documentation of the key value storage format in MySQL, see
+ ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
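+ /* Illustrative sketch (derived from the handling below) of one
+ key part in the MySQL key value buffer, for a nullable VARCHAR
+ column prefix:
+
+ byte 0:     SQL NULL indicator (nonzero means SQL NULL)
+ bytes 1..2: actual data length, little-endian
+ bytes 3.. : data, padded up to the column (prefix) length
+
+ A NOT NULL fixed-length column consists of the data bytes only. */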
+
+ key_end = key_ptr + key_len;
+
+ /* Permit us to access any field in the tuple (ULINT_MAX): */
+
+ dtuple_set_n_fields(tuple, ULINT_MAX);
+
+ dfield = dtuple_get_nth_field(tuple, 0);
+ field = dict_index_get_nth_field(index, 0);
+
+ if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
+ /* A special case: we are looking for a position in the
+ generated clustered index which InnoDB automatically added
+ to a table with no primary key: the first and the only
+ ordering column is ROW_ID which InnoDB stored to the key_ptr
+ buffer. */
+
+ ut_a(key_len == DATA_ROW_ID_LEN);
+
+ dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
+
+ dtuple_set_n_fields(tuple, 1);
+
+ return;
+ }
+
+ while (key_ptr < key_end) {
+
+ ulint type = dfield_get_type(dfield)->mtype;
+ ut_a(field->col->mtype == type);
+
+ data_offset = 0;
+ is_null = FALSE;
+
+ if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
+ /* The first byte in the field tells if this is
+ an SQL NULL value */
+
+ data_offset = 1;
+
+ if (*key_ptr != 0) {
+ dfield_set_null(dfield);
+
+ is_null = TRUE;
+ }
+ }
+
+ /* Calculate data length and data field total length */
+ if (DATA_LARGE_MTYPE(type) || DATA_GEOMETRY_MTYPE(type)) {
+
+ /* For an R-tree index, the data length should be
+ the total size of the WKB data. */
+ if (dict_index_is_spatial(index)) {
+ ut_ad(DATA_GEOMETRY_MTYPE(type));
+ data_len = key_len;
+ data_field_len = data_offset + data_len;
+ } else {
+ /* The key field is a column prefix of a BLOB
+ or TEXT. */
+
+ ut_a(field->prefix_len > 0);
+
+ /* MySQL stores the actual data length to the
+ first 2 bytes after the optional SQL NULL
+ marker byte. The storage format is
+ little-endian, that is, the most significant
+ byte at a higher address. In UTF-8, MySQL
+ seems to reserve field->prefix_len bytes for
+ storing this field in the key value buffer,
+ even though the actual value only takes data
+ len bytes from the start. */
+
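+ /* For example, length bytes 0x05 0x00 decode
+ below to data_len = 5. */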
+ data_len = ulint(key_ptr[data_offset])
+ | ulint(key_ptr[data_offset + 1]) << 8;
+ data_field_len = data_offset + 2
+ + field->prefix_len;
+
+ data_offset += 2;
+
+ /* Now that we know the length, we store the
+ column value as if it were a fixed-length char
+ field */
+ }
+
+
+ } else if (field->prefix_len > 0) {
+ /* Looks like MySQL pads unused end bytes in the
+ prefix with space. Therefore, also in UTF-8, it is ok
+ to compare with a prefix containing full prefix_len
+ bytes, and no need to take at most prefix_len / 3
+ UTF-8 characters from the start.
+ If the prefix is used as the upper end of a LIKE
+ 'abc%' query, then MySQL pads the end with chars
+ 0xff. TODO: in that case, does it do any harm to
+ compare with the full prefix_len bytes? How do
+ characters 0xff in UTF-8 behave? */
+
+ data_len = field->prefix_len;
+ data_field_len = data_offset + data_len;
+ } else {
+ data_len = dfield_get_type(dfield)->len;
+ data_field_len = data_offset + data_len;
+ }
+
+ if ((dtype_get_mysql_type(dfield_get_type(dfield))
+ == DATA_MYSQL_TRUE_VARCHAR)
+ && (type != DATA_INT)) {
+ /* In a MySQL key value format, a true VARCHAR is
+ always preceded by 2 bytes of a length field.
+ dfield_get_type(dfield)->len returns the maximum
+ 'payload' len in bytes. That does not include the
+ 2 bytes that tell the actual data length.
+
+ We added the check != DATA_INT to make sure we do
+ not treat MySQL ENUM or SET as a true VARCHAR! */
+
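+ /* For example, a NOT NULL single-byte-charset
+ VARCHAR(32) key part thus occupies 2 + 32 bytes
+ in the key buffer. */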
+ data_len += 2;
+ data_field_len += 2;
+ }
+
+ /* Storing may use at most data_len bytes of buf */
+
+ if (UNIV_LIKELY(!is_null)) {
+ buf = row_mysql_store_col_in_innobase_format(
+ dfield, buf,
+ FALSE, /* MySQL key value format col */
+ key_ptr + data_offset, data_len,
+ dict_table_is_comp(index->table));
+ ut_a(buf <= original_buf + buf_len);
+ }
+
+ key_ptr += data_field_len;
+
+ if (UNIV_UNLIKELY(key_ptr > key_end)) {
+ /* The last field in key was not a complete key field
+ but a prefix of it.
+
+ Print a warning about this! HA_READ_PREFIX_LAST does
+ not currently work in InnoDB with partial-field key
+ value prefixes. Since MySQL currently uses a padding
+ trick to calculate LIKE 'abc%' type queries there
+ should never be partial-field prefixes in searches. */
+
+ ib::warn() << "Using a partial-field key prefix in"
+ " search, index " << index->name
+ << " of table " << index->table->name
+ << ". Last data field length "
+ << data_field_len << " bytes, key ptr now"
+ " exceeds key end by " << (key_ptr - key_end)
+ << " bytes. Key value in the MySQL format:";
+
+ ut_print_buf(stderr, original_key_ptr, key_len);
+ putc('\n', stderr);
+
+ if (!is_null) {
+ ulint len = dfield_get_len(dfield);
+ dfield_set_len(dfield, len
+ - (ulint) (key_ptr - key_end));
+ }
+ ut_ad(0);
+ }
+
+ n_fields++;
+ field++;
+ dfield++;
+ }
+
+ ut_a(buf <= original_buf + buf_len);
+
+ /* We set the length of tuple to n_fields: we assume that the memory
+ area allocated for it is big enough (usually bigger than n_fields). */
+
+ dtuple_set_n_fields(tuple, n_fields);
+}
+
+/**************************************************************//**
+Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
+function is row_mysql_store_col_in_innobase_format() in row0mysql.cc. */
+void
+row_sel_field_store_in_mysql_format_func(
+ byte* dest,
+ const mysql_row_templ_t* templ,
+#ifdef UNIV_DEBUG
+ const dict_index_t* index,
+ ulint field_no,
+#endif /* UNIV_DEBUG */
+ const byte* data,
+ ulint len)
+{
+#ifdef UNIV_DEBUG
+ const dict_field_t* field
+ = templ->is_virtual
+ ? NULL : dict_index_get_nth_field(index, field_no);
+#endif /* UNIV_DEBUG */
+
+ ut_ad(len != UNIV_SQL_NULL);
+ MEM_CHECK_DEFINED(data, len);
+ MEM_CHECK_ADDRESSABLE(dest, templ->mysql_col_len);
+ MEM_UNDEFINED(dest, templ->mysql_col_len);
+
+ byte* pad = dest + len;
+
+ switch (templ->type) {
+ const byte* field_end;
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ case DATA_BINARY:
+ field_end = dest + templ->mysql_col_len;
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR. Store the
+ length of the data to the first byte or the first
+ two bytes of dest. */
+
+ dest = row_mysql_store_true_var_len(
+ dest, len, templ->mysql_length_bytes);
+ /* Copy the actual data. Leave the rest of the
+ buffer uninitialized. */
+ memcpy(dest, data, len);
+ break;
+ }
+
+ /* Copy the actual data */
+ memcpy(dest, data, len);
+
+ /* Pad with trailing spaces. */
+
+ if (pad == field_end) {
+ break;
+ }
+
+ if (UNIV_UNLIKELY(templ->type == DATA_FIXBINARY)) {
+ memset(pad, 0, field_end - pad);
+ break;
+ }
+
+ ut_ad(templ->mbminlen <= templ->mbmaxlen);
+
+ /* We treat some Unicode charset strings specially. */
+ switch (templ->mbminlen) {
+ case 4:
+ /* InnoDB should never have stripped partial
+ UTF-32 characters. */
+ ut_a(!(len & 3));
+ break;
+ case 2:
+ /* A space char is two bytes,
+ 0x0020 in UCS2 and UTF-16 */
+
+ if (UNIV_UNLIKELY(len & 1)) {
+ /* A 0x20 has been stripped from the column.
+ Pad it back. */
+
+ if (pad < field_end) {
+ *pad++ = 0x20;
+ }
+ }
+ }
+
+ row_mysql_pad_col(templ->mbminlen, pad,
+ ulint(field_end - pad));
+ break;
+
+ case DATA_BLOB:
+ /* Store a pointer to the BLOB buffer to dest: the BLOB was
+ already copied to the buffer in row_sel_store_mysql_rec */
+
+ row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
+ len);
+ break;
+
+ case DATA_GEOMETRY:
+ /* We store all geometry data as BLOB data at server layer. */
+ row_mysql_store_geometry(dest, templ->mysql_col_len, data, len);
+ break;
+
+ case DATA_MYSQL:
+ memcpy(dest, data, len);
+
+ ut_ad(templ->mysql_col_len >= len);
+ ut_ad(templ->mbmaxlen >= templ->mbminlen);
+
+ /* If field_no equals templ->icp_rec_field_no,
+ we are examining the row pointed to by
+ "icp_rec_field_no". It is possible that
+ icp_rec_field_no refers to a field in a secondary
+ index while templ->rec_field_no points to a field
+ in the primary index. The length should still be
+ equal, unless the field pointed to by
+ icp_rec_field_no has a prefix. */
+ ut_ad(templ->mbmaxlen > templ->mbminlen
+ || templ->mysql_col_len == len
+ || (field_no == templ->icp_rec_field_no
+ && field->prefix_len > 0));
+
+ /* The following assertion would fail for old tables
+ containing UTF-8 ENUM columns due to Bug #9526. */
+ ut_ad(!templ->mbmaxlen
+ || !(templ->mysql_col_len % templ->mbmaxlen));
+ ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len
+ || (field_no == templ->icp_rec_field_no
+ && field->prefix_len > 0)
+ || templ->rec_field_is_prefix);
+
+ ut_ad(templ->is_virtual
+ || !(field->prefix_len % templ->mbmaxlen));
+
+ if (templ->mbminlen == 1 && templ->mbmaxlen != 1) {
+ /* Pad with spaces. This undoes the stripping
+ done in row0mysql.cc, function
+ row_mysql_store_col_in_innobase_format(). */
+
+ memset(pad, 0x20, templ->mysql_col_len - len);
+ }
+ break;
+
+ default:
+#ifdef UNIV_DEBUG
+ case DATA_SYS_CHILD:
+ case DATA_SYS:
+ /* These column types should never be shipped to MySQL. */
+ ut_ad(0);
+ /* fall through */
+
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_DECIMAL:
+#endif /* UNIV_DEBUG */
+ ut_ad((templ->is_virtual && !field)
+ || (field && field->prefix_len
+ ? field->prefix_len == len
+ : templ->mysql_col_len == len));
+ memcpy(dest, data, len);
+ break;
+
+ case DATA_INT:
+ /* Convert InnoDB big-endian integer to little-endian
+ format, sign bit restored to 2's complement form */
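+ /* Worked example: the signed 32-bit value 5 is stored by
+ InnoDB as the big-endian bytes 80 00 00 05 (sign bit
+ flipped). The loop below reverses them into dest as
+ 05 00 00 80, and the XOR restores the sign bit, giving
+ the little-endian two's complement bytes 05 00 00 00. */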
+ DBUG_ASSERT(templ->mysql_col_len == len);
+
+ byte* ptr = pad;
+ do *--ptr = *data++; while (ptr != dest);
+ if (!templ->is_unsigned) {
+ pad[-1] ^= 0x80;
+ }
+ }
+}
+
+/** Convert a field in the Innobase format to a field in the MySQL format.
+@param[out] mysql_rec record in the MySQL format
+@param[in,out] prebuilt prebuilt struct
+@param[in] rec InnoDB record; must be protected
+ by a page latch
+@param[in] index index of rec
+@param[in] offsets array returned by rec_get_offsets()
+@param[in] field_no templ->rec_field_no or
+ templ->clust_rec_field_no
+ or templ->icp_rec_field_no
+@param[in] templ row template
+*/
+static MY_ATTRIBUTE((warn_unused_result))
+ibool
+row_sel_store_mysql_field(
+ byte* mysql_rec,
+ row_prebuilt_t* prebuilt,
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ ulint field_no,
+ const mysql_row_templ_t*templ)
+{
+ DBUG_ENTER("row_sel_store_mysql_field");
+
+ const byte* data;
+ ulint len;
+
+ ut_ad(prebuilt->default_rec);
+ ut_ad(templ);
+ ut_ad(templ >= prebuilt->mysql_template);
+ ut_ad(templ < &prebuilt->mysql_template[prebuilt->n_template]);
+ ut_ad(field_no == templ->clust_rec_field_no
+ || field_no == templ->rec_field_no
+ || field_no == templ->icp_rec_field_no);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no) != 0)) {
+
+ mem_heap_t* heap;
+ /* Copy an externally stored field to a temporary heap */
+
+ ut_ad(field_no == templ->clust_rec_field_no);
+
+ if (DATA_LARGE_MTYPE(templ->type)) {
+ if (prebuilt->blob_heap == NULL) {
+ prebuilt->blob_heap = mem_heap_create(
+ srv_page_size);
+ }
+
+ heap = prebuilt->blob_heap;
+ } else {
+ heap = mem_heap_create(srv_page_size);
+ }
+
+ /* NOTE: if we are retrieving a big BLOB, we may
+ run out of memory in the next call, which causes
+ an assertion failure */
+
+ data = btr_rec_copy_externally_stored_field(
+ rec, offsets, prebuilt->table->space->zip_size(),
+ field_no, &len, heap);
+
+ if (UNIV_UNLIKELY(!data)) {
+ /* The externally stored field was not written
+ yet. This record should only be seen by
+ trx_rollback_recovered() or any
+ TRX_ISO_READ_UNCOMMITTED transactions. */
+
+ if (heap != prebuilt->blob_heap) {
+ mem_heap_free(heap);
+ }
+
+ ut_a(prebuilt->trx->isolation_level
+ == TRX_ISO_READ_UNCOMMITTED);
+ DBUG_RETURN(FALSE);
+ }
+
+ ut_a(len != UNIV_SQL_NULL);
+
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ, index, field_no, data, len);
+
+ if (heap != prebuilt->blob_heap) {
+ mem_heap_free(heap);
+ }
+ } else {
+ /* The field is stored in the index record, or
+ in the metadata for instant ADD COLUMN. */
+ data = rec_get_nth_cfield(rec, index, offsets, field_no, &len);
+
+ if (len == UNIV_SQL_NULL) {
+ /* MySQL assumes that the field for an SQL
+ NULL value is set to the default value. */
+ ut_ad(templ->mysql_null_bit_mask);
+
+ MEM_CHECK_DEFINED(prebuilt->default_rec
+ + templ->mysql_col_offset,
+ templ->mysql_col_len);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+ mysql_rec[templ->mysql_null_byte_offset]
+ |= (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ memcpy(mysql_rec + templ->mysql_col_offset,
+ (const byte*) prebuilt->default_rec
+ + templ->mysql_col_offset,
+ templ->mysql_col_len);
+ DBUG_RETURN(TRUE);
+ }
+
+ if (DATA_LARGE_MTYPE(templ->type)
+ || DATA_GEOMETRY_MTYPE(templ->type)) {
+
+ /* It is a BLOB field locally stored in the
+ InnoDB record: we MUST copy its contents to
+ prebuilt->blob_heap here because
+ row_sel_field_store_in_mysql_format() stores a
+ pointer to the data, and the data passed to us
+ will be invalid as soon as the
+ mini-transaction is committed and the page
+ latch on the clustered index page is
+ released. */
+
+ if (prebuilt->blob_heap == NULL) {
+ prebuilt->blob_heap = mem_heap_create(
+ srv_page_size);
+ DBUG_PRINT("anna", ("blob_heap allocated: %p",
+ prebuilt->blob_heap));
+ }
+
+ data = static_cast<byte*>(
+ mem_heap_dup(prebuilt->blob_heap, data, len));
+ }
+
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ, index, field_no, data, len);
+ }
+
+ ut_ad(len != UNIV_SQL_NULL);
+
+ if (templ->mysql_null_bit_mask) {
+ /* It is a nullable column with a non-NULL
+ value */
+ mysql_rec[templ->mysql_null_byte_offset]
+ &= static_cast<byte>(~templ->mysql_null_bit_mask);
+ }
+
+ DBUG_RETURN(TRUE);
+}
+
+/** Convert a row in the Innobase format to a row in the MySQL format.
+Note that the template in prebuilt may advise us to copy only a few
+columns to mysql_rec, other columns are left blank. All columns may not
+be needed in the query.
+@param[out] mysql_rec row in the MySQL format
+@param[in] prebuilt cursor
+@param[in] rec Innobase record in the index
+ which was described in prebuilt's
+ template, or in the clustered index;
+ must be protected by a page latch
+@param[in] vrow virtual columns
+@param[in] rec_clust whether index must be the clustered index
+@param[in] index index of rec
+@param[in] offsets array returned by rec_get_offsets(rec)
+@retval true on success
+@retval false if not all columns could be retrieved */
+MY_ATTRIBUTE((warn_unused_result))
+static bool row_sel_store_mysql_rec(
+ byte* mysql_rec,
+ row_prebuilt_t* prebuilt,
+ const rec_t* rec,
+ const dtuple_t* vrow,
+ bool rec_clust,
+ const dict_index_t* index,
+ const rec_offs* offsets)
+{
+ DBUG_ENTER("row_sel_store_mysql_rec");
+
+ ut_ad(rec_clust || index == prebuilt->index);
+ ut_ad(!rec_clust || dict_index_is_clust(index));
+
+ if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
+ row_mysql_prebuilt_free_blob_heap(prebuilt);
+ }
+
+ for (ulint i = 0; i < prebuilt->n_template; i++) {
+ const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
+
+ if (templ->is_virtual && dict_index_is_clust(index)) {
+ /* Skip virtual columns if this is not a covering
+ index scan or a virtual key read is not requested. */
+ if (!rec_clust
+ || !prebuilt->index->has_virtual()
+ || !prebuilt->read_just_key) {
+ /* Initialize the NULL bit. */
+ if (templ->mysql_null_bit_mask) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+ mysql_rec[templ->mysql_null_byte_offset]
+ |= (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ }
+ continue;
+ }
+
+ dict_v_col_t* col;
+ col = dict_table_get_nth_v_col(
+ index->table, templ->clust_rec_field_no);
+
+ ut_ad(vrow);
+
+ const dfield_t* dfield = dtuple_get_nth_v_field(
+ vrow, col->v_pos);
+
+ if (dfield_get_type(dfield)->mtype == DATA_MISSING) {
+ ut_ad("no ha_innopart in MariaDB" == 0);
+ continue;
+ }
+
+ if (dfield->len == UNIV_SQL_NULL) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+ mysql_rec[templ->mysql_null_byte_offset]
+ |= (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ memcpy(mysql_rec
+ + templ->mysql_col_offset,
+ (const byte*) prebuilt->default_rec
+ + templ->mysql_col_offset,
+ templ->mysql_col_len);
+ } else {
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ, index, templ->clust_rec_field_no,
+ (const byte*)dfield->data, dfield->len);
+ if (templ->mysql_null_bit_mask) {
+ mysql_rec[
+ templ->mysql_null_byte_offset]
+ &= static_cast<byte>
+ (~templ->mysql_null_bit_mask);
+ }
+ }
+
+ continue;
+ }
+
+ const ulint field_no
+ = rec_clust
+ ? templ->clust_rec_field_no
+ : templ->rec_field_no;
+ /* We should never deliver column prefixes to the SQL layer,
+ except for evaluating handler_index_cond_check()
+ or handler_rowid_filter_check(). */
+ /* ...actually, we do want to do this in order to
+ support the prefix query optimization.
+
+ ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len
+ == 0);
+
+ ...so we disable this assert. */
+
+ if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
+ rec, index, offsets,
+ field_no, templ)) {
+
+ DBUG_RETURN(false);
+ }
+ }
+
+ /* FIXME: We only need to read the doc_id if an FTS indexed
+ column is being updated.
+ NOTE: the record can be a clustered or secondary index record.
+ If a secondary index is used, then the FTS_DOC_ID column should
+ be part of this index. */
+ if (dict_table_has_fts_index(prebuilt->table)) {
+ if (dict_index_is_clust(index)
+ || prebuilt->fts_doc_id_in_read_set) {
+ prebuilt->fts_doc_id = fts_get_doc_id_from_rec(
+ rec, index, offsets);
+ }
+ }
+
+ DBUG_RETURN(true);
+}
+
+/*********************************************************************//**
+Builds a previous version of a clustered index record for a consistent read
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_sel_build_prev_vers_for_mysql(
+/*==============================*/
+ ReadView* read_view, /*!< in: read view */
+ dict_index_t* clust_index, /*!< in: clustered index */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: record in a clustered index */
+ rec_offs** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, clust_index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ dtuple_t** vrow, /*!< out: dtuple to hold old virtual
+ column data */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dberr_t err;
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_empty(prebuilt->old_vers_heap);
+ } else {
+ prebuilt->old_vers_heap = mem_heap_create(200);
+ }
+
+ err = row_vers_build_for_consistent_read(
+ rec, mtr, clust_index, offsets, read_view, offset_heap,
+ prebuilt->old_vers_heap, old_vers, vrow);
+ return(err);
+}
+
+/** Helper class to cache clust_rec and old_vers */
+class Row_sel_get_clust_rec_for_mysql
+{
+ const rec_t *cached_clust_rec;
+ rec_t *cached_old_vers;
+ lsn_t cached_lsn;
+ page_id_t cached_page_id;
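+ /* The cache is keyed on (page LSN, page id, record pointer):
+ as long as none of these change between calls, the previously
+ built old version can be reused instead of reconstructing it
+ from the undo log; see operator() below. */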
+
+#ifdef UNIV_DEBUG
+ void check_eq(const dict_index_t *index, const rec_offs *offsets) const
+ {
+ rec_offs vers_offs[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS];
+ rec_offs_init(vers_offs);
+ mem_heap_t *heap= nullptr;
+
+ ut_ad(rec_offs_validate(cached_clust_rec, index, offsets));
+ ut_ad(index->first_user_field() <= rec_offs_n_fields(offsets));
+ ut_ad(vers_offs == rec_get_offsets(cached_old_vers, index, vers_offs,
+ index->n_core_fields,
+ index->db_trx_id(), &heap));
+ ut_ad(!heap);
+ for (auto n= index->db_trx_id(); n--; )
+ {
+ const dict_col_t *col= dict_index_get_nth_col(index, n);
+ ulint len1, len2;
+ const byte *b1= rec_get_nth_field(cached_clust_rec, offsets, n, &len1);
+ const byte *b2= rec_get_nth_field(cached_old_vers, vers_offs, n, &len2);
+ ut_ad(!cmp_data_data(col->mtype, col->prtype, b1, len1, b2, len2));
+ }
+ }
+#endif
+
+public:
+ Row_sel_get_clust_rec_for_mysql() :
+ cached_clust_rec(NULL), cached_old_vers(NULL), cached_lsn(0),
+ cached_page_id(page_id_t(0,0)) {}
+
+ dberr_t operator()(row_prebuilt_t *prebuilt, dict_index_t *sec_index,
+ const rec_t *rec, que_thr_t *thr, const rec_t **out_rec,
+ rec_offs **offsets, mem_heap_t **offset_heap,
+ dtuple_t **vrow, mtr_t *mtr);
+};
+
+/*********************************************************************//**
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking. Used in the MySQL
+interface.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+dberr_t
+Row_sel_get_clust_rec_for_mysql::operator()(
+/*============================*/
+ row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */
+ dict_index_t* sec_index,/*!< in: secondary index where rec resides */
+ const rec_t* rec, /*!< in: record in a non-clustered index; if
+ this is a locking read, then rec is not
+ allowed to be delete-marked, and that would
+ not make sense either */
+ que_thr_t* thr, /*!< in: query thread */
+ const rec_t** out_rec,/*!< out: clustered record or an old version of
+ it, NULL if the old version did not exist
+ in the read view, i.e., it was a fresh
+ inserted version */
+ rec_offs** offsets,/*!< in: offsets returned by
+ rec_get_offsets(rec, sec_index);
+ out: offsets returned by
+ rec_get_offsets(out_rec, clust_index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ dtuple_t** vrow, /*!< out: virtual column to fill */
+ mtr_t* mtr) /*!< in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* clust_index;
+ const rec_t* clust_rec;
+ rec_t* old_vers;
+ dberr_t err;
+ trx_t* trx;
+
+ *out_rec = NULL;
+ trx = thr_get_trx(thr);
+
+ srv_stats.n_sec_rec_cluster_reads.inc(
+ thd_get_thread_id(trx->mysql_thd));
+
+ row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
+ sec_index, *offsets);
+
+ clust_index = dict_table_get_first_index(sec_index->table);
+
+ btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ prebuilt->clust_pcur, 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
+
+ prebuilt->clust_pcur->trx_if_known = trx;
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(prebuilt->clust_pcur)
+ < dict_index_get_n_unique(clust_index)) {
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(prebuilt->pcur);
+
+ /* If this is a spatial index scan, and we are reading
+ from a shadow buffer, the record could already be
+ deleted (due to rollback etc.). So get the original
+ page and verify it. */
+ if (dict_index_is_spatial(sec_index)
+ && btr_cur->rtr_info->matches
+ && (page_align(rec)
+ == btr_cur->rtr_info->matches->block.frame
+ || rec != btr_pcur_get_rec(prebuilt->pcur))) {
+#ifdef UNIV_DEBUG
+ rtr_info_t* rtr_info = btr_cur->rtr_info;
+ mutex_enter(&rtr_info->matches->rtr_match_mutex);
+ /* The page could be deallocated (by rollback etc.) */
+ if (!rtr_info->matches->valid) {
+ mutex_exit(&rtr_info->matches->rtr_match_mutex);
+ clust_rec = NULL;
+
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+ mutex_exit(&rtr_info->matches->rtr_match_mutex);
+
+ if (rec_get_deleted_flag(rec,
+ dict_table_is_comp(sec_index->table))
+ && prebuilt->select_lock_type == LOCK_NONE) {
+
+ clust_rec = NULL;
+
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ if (rec != btr_pcur_get_rec(prebuilt->pcur)) {
+ clust_rec = NULL;
+
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ /* FIXME: Why is this block not the
+ same as btr_pcur_get_block(prebuilt->pcur),
+ and is it not unsafe to use RW_NO_LATCH here? */
+ buf_block_t* block = buf_page_get_gen(
+ btr_pcur_get_block(prebuilt->pcur)->page.id(),
+ btr_pcur_get_block(prebuilt->pcur)->zip_size(),
+ RW_NO_LATCH, NULL, BUF_GET,
+ __FILE__, __LINE__, mtr, &err);
+ mem_heap_t* heap = mem_heap_create(256);
+ dtuple_t* tuple = dict_index_build_data_tuple(
+ rec, sec_index, true,
+ sec_index->n_fields, heap);
+ page_cur_t page_cursor;
+
+ ulint low_match = page_cur_search(
+ block, sec_index, tuple,
+ PAGE_CUR_LE, &page_cursor);
+
+ ut_ad(low_match < dtuple_get_n_fields_cmp(tuple));
+ mem_heap_free(heap);
+ clust_rec = NULL;
+
+ err = DB_SUCCESS;
+ goto func_exit;
+#endif /* UNIV_DEBUG */
+ } else if (!rec_get_deleted_flag(rec,
+ dict_table_is_comp(sec_index->table))
+ || prebuilt->select_lock_type != LOCK_NONE) {
+ /* In a rare case it is possible that no clust
+ rec is found for a delete-marked secondary index
+ record: if in row0umod.cc in
+ row_undo_mod_remove_clust_low() we have already removed
+ the clust rec, while purge is still cleaning and
+ removing secondary index records associated with
+ earlier versions of the clustered index record.
+ In that case we know that the clustered index
+ record did not exist in the read view of trx. */
+ ib::error() << "Clustered record for sec rec not found"
+ " index " << sec_index->name
+ << " of table " << sec_index->table->name;
+
+ fputs("InnoDB: sec index record ", stderr);
+ rec_print(stderr, rec, sec_index);
+ fputs("\n"
+ "InnoDB: clust index record ", stderr);
+ rec_print(stderr, clust_rec, clust_index);
+ putc('\n', stderr);
+ trx_print(stderr, trx, 600);
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report"
+ " to https://jira.mariadb.org/\n", stderr);
+ ut_ad(0);
+ }
+
+ clust_rec = NULL;
+
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ *offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, offset_heap);
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record; we are searching
+ the clust rec with a unique condition, hence
+ we set a LOCK_REC_NOT_GAP type lock */
+
+ err = lock_clust_rec_read_check_and_lock(
+ 0, btr_pcur_get_block(prebuilt->clust_pcur),
+ clust_rec, clust_index, *offsets,
+ prebuilt->select_lock_type,
+ LOCK_REC_NOT_GAP,
+ thr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_SUCCESS_LOCKED_REC:
+ break;
+ default:
+ goto err_exit;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ old_vers = NULL;
+
+ /* If the isolation level allows reading of uncommitted data,
+ then we never look for an earlier version */
+
+ if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+ && !lock_clust_rec_cons_read_sees(
+ clust_rec, clust_index, *offsets,
+ &trx->read_view)) {
+ const buf_page_t& bpage = btr_pcur_get_block(
+ prebuilt->clust_pcur)->page;
+
+ const lsn_t lsn = mach_read_from_8(
+ page_align(clust_rec) + FIL_PAGE_LSN);
+
+ if (lsn != cached_lsn
+ || bpage.id() != cached_page_id
+ || clust_rec != cached_clust_rec) {
+ /* The following call returns 'offsets' associated with
+ 'old_vers' */
+ err = row_sel_build_prev_vers_for_mysql(
+ &trx->read_view, clust_index, prebuilt,
+ clust_rec, offsets, offset_heap, &old_vers,
+ vrow, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+ cached_lsn = lsn;
+ cached_page_id = bpage.id();
+ cached_clust_rec = clust_rec;
+ cached_old_vers = old_vers;
+ } else {
+ err = DB_SUCCESS;
+ old_vers = cached_old_vers;
+
+ /* The offsets need not be the same for the latest
+ version of clust_rec and its old version
+ old_vers. Re-calculate the offsets for old_vers. */
+
+ if (old_vers) {
+ ut_d(check_eq(clust_index, *offsets));
+ *offsets = rec_get_offsets(
+ old_vers, clust_index, *offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, offset_heap);
+ }
+ }
+
+ if (old_vers == NULL) {
+ goto err_exit;
+ }
+
+ clust_rec = old_vers;
+ }
+
+ /* If we had to go to an earlier version of the row, or the
+ secondary index record is delete-marked, then it may be that
+ the secondary index record corresponding to clust_rec
+ (or old_vers) is not rec; in that case we must ignore
+ such a row, because in our snapshot rec would not have existed.
+ Remember that from rec we cannot see directly which transaction
+ id corresponds to it: we have to go to the clustered index
+ record. A query that fetches all rows where
+ the secondary index value is in some interval would return
+ a wrong result if we did not drop rows that we come to
+ visit through secondary index records that do not really
+ exist in our snapshot. */
+
+ /* For a spatial index, since rec comes from the shadow
+ buffer, we also need to check that it exactly matches
+ clust_rec. */
+ if (clust_rec
+ && (old_vers
+ || trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
+ || dict_index_is_spatial(sec_index)
+ || rec_get_deleted_flag(rec, dict_table_is_comp(
+ sec_index->table)))) {
+ err = row_sel_sec_rec_is_for_clust_rec(rec, sec_index,
+ clust_rec, clust_index, thr);
+ switch (err) {
+ case DB_SUCCESS:
+ clust_rec = NULL;
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ break;
+ default:
+ goto err_exit;
+ }
+ }
+
+ err = DB_SUCCESS;
+ }
+
+func_exit:
+ *out_rec = clust_rec;
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* We may use the cursor in update or in unlock_row():
+ store its position */
+
+ btr_pcur_store_position(prebuilt->clust_pcur, mtr);
+ }
+
+err_exit:
+ return(err);
+}
+
+/********************************************************************//**
+Restores cursor position after it has been stored. We have to take into
+account that the record the cursor was positioned on may have been deleted.
+Then we may have to move the cursor one step up or down.
+@return true if we may need to process the record the cursor is now
+positioned on (i.e. we should not go to the next record yet) */
+static
+bool
+sel_restore_position_for_mysql(
+/*===========================*/
+ ibool* same_user_rec, /*!< out: TRUE if we were able to restore
+ the cursor on a user record with the
+ same ordering prefix in the
+ B-tree index */
+ ulint latch_mode, /*!< in: latch mode wished in
+ restoration */
+ btr_pcur_t* pcur, /*!< in: cursor whose position
+ has been stored */
+ ibool moves_up, /*!< in: TRUE if the cursor moves up
+ in the index */
+ mtr_t* mtr) /*!< in: mtr; CAUTION: may commit
+ mtr temporarily! */
+{
+ ibool success;
+
+ success = btr_pcur_restore_position(latch_mode, pcur, mtr);
+
+ *same_user_rec = success;
+
+ ut_ad(!success || pcur->rel_pos == BTR_PCUR_ON);
+#ifdef UNIV_DEBUG
+ if (pcur->pos_state == BTR_PCUR_IS_POSITIONED_OPTIMISTIC) {
+ ut_ad(pcur->rel_pos == BTR_PCUR_BEFORE
+ || pcur->rel_pos == BTR_PCUR_AFTER);
+ } else {
+ ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad((pcur->rel_pos == BTR_PCUR_ON)
+ == btr_pcur_is_on_user_rec(pcur));
+ }
+#endif /* UNIV_DEBUG */
+
+ /* The position may need to be adjusted for rel_pos and moves_up. */
+
+ switch (pcur->rel_pos) {
+ case BTR_PCUR_ON:
+ if (!success && moves_up) {
+next:
+ if (btr_pcur_move_to_next(pcur, mtr)
+ && rec_is_metadata(btr_pcur_get_rec(pcur),
+ *pcur->btr_cur.index)) {
+ btr_pcur_move_to_next(pcur, mtr);
+ }
+
+ return true;
+ }
+ return(!success);
+ case BTR_PCUR_AFTER_LAST_IN_TREE:
+ case BTR_PCUR_BEFORE_FIRST_IN_TREE:
+ return true;
+ case BTR_PCUR_AFTER:
+ /* positioned to record after pcur->old_rec. */
+ pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+prev:
+ if (btr_pcur_is_on_user_rec(pcur) && !moves_up
+ && !rec_is_metadata(btr_pcur_get_rec(pcur),
+ *pcur->btr_cur.index)) {
+ btr_pcur_move_to_prev(pcur, mtr);
+ }
+ return true;
+ case BTR_PCUR_BEFORE:
+ /* For non-optimistic restoration:
+ The position is now set to the record before pcur->old_rec.
+
+ For optimistic restoration:
+ The position also needs to take the previous search_mode into
+ consideration. */
+
+ switch (pcur->pos_state) {
+ case BTR_PCUR_IS_POSITIONED_OPTIMISTIC:
+ pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+ if (pcur->search_mode == PAGE_CUR_GE) {
+ /* Positioned during Greater or Equal search
+ with BTR_PCUR_BEFORE. Optimistic restore to
+ the same record. If scanning downwards, then
+ we must move to the previous record.
+ This can happen with:
+ HANDLER READ idx a = (const);
+ HANDLER READ idx PREV; */
+ goto prev;
+ }
+ return true;
+ case BTR_PCUR_IS_POSITIONED:
+ if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
+ goto next;
+ }
+ return true;
+ case BTR_PCUR_WAS_POSITIONED:
+ case BTR_PCUR_NOT_POSITIONED:
+ break;
+ }
+ }
+ ut_ad(0);
+ return true;
+}
+
+/********************************************************************//**
+Copies a cached field for MySQL from the fetch cache. */
+static
+void
+row_sel_copy_cached_field_for_mysql(
+/*================================*/
+ byte* buf, /*!< in/out: row buffer */
+ const byte* cache, /*!< in: cached row */
+ const mysql_row_templ_t*templ) /*!< in: column template */
+{
+ ulint len;
+
+ buf += templ->mysql_col_offset;
+ cache += templ->mysql_col_offset;
+
+ MEM_CHECK_ADDRESSABLE(buf, templ->mysql_col_len);
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR
+ && (templ->type != DATA_INT)) {
+ /* Check for != DATA_INT to make sure we do
+ not treat MySQL ENUM or SET as a true VARCHAR!
+ Find the actual length of the true VARCHAR field. */
+ row_mysql_read_true_varchar(
+ &len, cache, templ->mysql_length_bytes);
+ len += templ->mysql_length_bytes;
+ MEM_UNDEFINED(buf, templ->mysql_col_len);
+ } else {
+ len = templ->mysql_col_len;
+ }
+
+ memcpy(buf, cache, len);
+}
+
+/** Copy used fields from a cached row.
+Copy the cached record field by field; don't touch fields that
+are not covered by the current key.
+@param[out] buf Where to copy the MySQL row.
+@param[in] cached_rec What to copy (in MySQL row format).
+@param[in] prebuilt prebuilt struct. */
+void
+row_sel_copy_cached_fields_for_mysql(
+ byte* buf,
+ const byte* cached_rec,
+ row_prebuilt_t* prebuilt)
+{
+ const mysql_row_templ_t*templ;
+ ulint i;
+ for (i = 0; i < prebuilt->n_template; i++) {
+ templ = prebuilt->mysql_template + i;
+
+ /* Skip virtual columns */
+ if (templ->is_virtual) {
+ continue;
+ }
+
+ row_sel_copy_cached_field_for_mysql(
+ buf, cached_rec, templ);
+ /* Copy NULL bit of the current field from cached_rec
+ to buf */
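+ /* The XOR expression below is the usual bit-merge idiom:
+ buf ^= (buf ^ cached) & mask copies exactly the bits selected
+ by mask from cached_rec into buf, leaving the other NULL-flag
+ bits of the byte untouched. */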
+ if (templ->mysql_null_bit_mask) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+ buf[templ->mysql_null_byte_offset]
+ ^= (buf[templ->mysql_null_byte_offset]
+ ^ cached_rec[templ->mysql_null_byte_offset])
+ & (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ }
+ }
+}
+
+/********************************************************************//**
+Pops a cached row for MySQL from the fetch cache. */
+UNIV_INLINE
+void
+row_sel_dequeue_cached_row_for_mysql(
+/*=================================*/
+ byte* buf, /*!< in/out: buffer where to copy the
+ row */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */
+{
+ ulint i;
+ const mysql_row_templ_t*templ;
+ const byte* cached_rec;
+ ut_ad(prebuilt->n_fetch_cached > 0);
+ ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
+
+ MEM_CHECK_ADDRESSABLE(buf, prebuilt->mysql_row_len);
+
+ cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first];
+
+ if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
+ row_sel_copy_cached_fields_for_mysql(buf, cached_rec, prebuilt);
+ } else if (prebuilt->mysql_prefix_len > 63) {
+ /* The record is long. Copy it field by field, in case
+ there are some long VARCHAR columns of which only a
+ small part is actually used. */
+ MEM_UNDEFINED(buf, prebuilt->mysql_prefix_len);
+
+ /* First copy the NULL bits. */
+ memcpy(buf, cached_rec, prebuilt->null_bitmap_len);
+ /* Then copy the requested fields. */
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+ templ = prebuilt->mysql_template + i;
+
+ /* Skip virtual columns */
+ if (templ->is_virtual
+ && !(dict_index_has_virtual(prebuilt->index)
+ && prebuilt->read_just_key)) {
+ continue;
+ }
+
+ row_sel_copy_cached_field_for_mysql(
+ buf, cached_rec, templ);
+ }
+ } else {
+ memcpy(buf, cached_rec, prebuilt->mysql_prefix_len);
+ }
+
+ prebuilt->n_fetch_cached--;
+ prebuilt->fetch_cache_first++;
+
+ if (prebuilt->n_fetch_cached == 0) {
+ prebuilt->fetch_cache_first = 0;
+ }
+}
+
+/********************************************************************//**
+Initialise the prefetch cache. */
+UNIV_INLINE
+void
+row_sel_prefetch_cache_init(
+/*========================*/
+ row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
+{
+ ulint i;
+ ulint sz;
+ byte* ptr;
+
+ /* Reserve space for the magic number. */
+ sz = UT_ARR_SIZE(prebuilt->fetch_cache) * (prebuilt->mysql_row_len + 8);
+ ptr = static_cast<byte*>(ut_malloc_nokey(sz));
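+ /* Resulting layout of each of the fetch cache slots, with two
+ 4-byte magic numbers guarding every row buffer (hence the "+ 8"
+ above):
+
+ [magic (4)][ mysql_row_len bytes ][magic (4)] */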
+
+ for (i = 0; i < UT_ARR_SIZE(prebuilt->fetch_cache); i++) {
+
+ /* A user has reported memory corruption in these
+ buffers on Linux. Put magic numbers there to help
+ track down a possible bug. */
+
+ mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
+ ptr += 4;
+
+ prebuilt->fetch_cache[i] = ptr;
+ ptr += prebuilt->mysql_row_len;
+
+ mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
+ ptr += 4;
+ }
+}
+
+/********************************************************************//**
+Get the last fetch cache buffer from the queue.
+@return pointer to buffer. */
+UNIV_INLINE
+byte*
+row_sel_fetch_last_buf(
+/*===================*/
+ row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
+{
+ ut_ad(!prebuilt->templ_contains_blob);
+ ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+
+ if (prebuilt->fetch_cache[0] == NULL) {
+ /* Allocate memory for the fetch cache */
+ ut_ad(prebuilt->n_fetch_cached == 0);
+
+ row_sel_prefetch_cache_init(prebuilt);
+ }
+
+ ut_ad(prebuilt->fetch_cache_first == 0);
+ MEM_UNDEFINED(prebuilt->fetch_cache[prebuilt->n_fetch_cached],
+ prebuilt->mysql_row_len);
+
+ return(prebuilt->fetch_cache[prebuilt->n_fetch_cached]);
+}
+
+/********************************************************************//**
+Pushes a row for MySQL to the fetch cache. */
+UNIV_INLINE
+void
+row_sel_enqueue_cache_row_for_mysql(
+/*================================*/
+ byte* mysql_rec, /*!< in/out: MySQL record */
+ row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
+{
+ /* For the non-ICP code path the row was already built directly
+ in the next fetch cache slot; only the ICP and rowid filter
+ paths below copy it there from mysql_rec. */
+
+ if (prebuilt->pk_filter || prebuilt->idx_cond) {
+ memcpy(row_sel_fetch_last_buf(prebuilt), mysql_rec,
+ prebuilt->mysql_row_len);
+ }
+
+ ++prebuilt->n_fetch_cached;
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/*********************************************************************//**
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always). We assume that the search
+mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx,
+btr search latch has been locked in S-mode if AHI is enabled.
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+static
+ulint
+row_sel_try_search_shortcut_for_mysql(
+/*==================================*/
+ const rec_t** out_rec,/*!< out: record if found */
+ row_prebuilt_t* prebuilt,/*!< in: prebuilt struct */
+ rec_offs** offsets,/*!< in/out: for rec_get_offsets(*out_rec) */
+ mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
+ mtr_t* mtr) /*!< in: started mtr */
+{
+ dict_index_t* index = prebuilt->index;
+ const dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ const rec_t* rec;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!prebuilt->templ_contains_blob);
+
+ rw_lock_t* ahi_latch = btr_search_sys.get_latch(*index);
+ rw_lock_s_lock(ahi_latch);
+ btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, pcur, ahi_latch, mtr);
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, *index)) {
+retry:
+ rw_lock_s_unlock(ahi_latch);
+ return(SEL_RETRY);
+ }
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched to the search tuple */
+
+ if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
+exhausted:
+ rw_lock_s_unlock(ahi_latch);
+ return(SEL_EXHAUSTED);
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields,
+ ULINT_UNDEFINED, heap);
+
+ if (!lock_clust_rec_cons_read_sees(rec, index, *offsets,
+ &trx->read_view)) {
+ goto retry;
+ }
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(row_get_rec_trx_id(rec, index, *offsets));
+ goto exhausted;
+ }
+
+ *out_rec = rec;
+
+ rw_lock_s_unlock(ahi_latch);
+ return(SEL_FOUND);
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/*********************************************************************//**
+Check a pushed-down index condition.
+@return CHECK_ABORTED_BY_USER, CHECK_NEG, CHECK_POS, or CHECK_OUT_OF_RANGE */
+static
+check_result_t
+row_search_idx_cond_check(
+/*======================*/
+ byte* mysql_rec, /*!< out: record
+ in MySQL format (invalid unless
+ prebuilt->idx_cond != NULL and
+ we return CHECK_POS) */
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
+ for the table handle */
+ const rec_t* rec, /*!< in: InnoDB record */
+ const rec_offs* offsets) /*!< in: rec_get_offsets() */
+{
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, prebuilt->index, offsets));
+
+ if (!prebuilt->idx_cond) {
+ if (!handler_rowid_filter_is_active(prebuilt->pk_filter)) {
+ return(CHECK_POS);
+ }
+ } else {
+ MONITOR_INC(MONITOR_ICP_ATTEMPTS);
+ }
+
+ /* Convert to MySQL format those fields that are needed for
+ evaluating the index condition. */
+
+ if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
+ mem_heap_empty(prebuilt->blob_heap);
+ }
+
+ for (i = 0; i < prebuilt->idx_cond_n_cols; i++) {
+ const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
+
+ /* Skip virtual columns */
+ if (templ->is_virtual) {
+ continue;
+ }
+
+ if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
+ rec, prebuilt->index, offsets,
+ templ->icp_rec_field_no,
+ templ)) {
+ return(CHECK_NEG);
+ }
+ }
+
+ /* We assume that the index conditions on
+ case-insensitive columns are case-insensitive. The
+ case of such columns may be wrong in a secondary
+ index, if the case of the column has been updated in
+ the past, or a record has been deleted and a record
+ inserted in a different case. */
+ check_result_t result = prebuilt->idx_cond
+ ? handler_index_cond_check(prebuilt->idx_cond)
+ : CHECK_POS;
+
+ switch (result) {
+ case CHECK_POS:
+ if (handler_rowid_filter_is_active(prebuilt->pk_filter)) {
+ ut_ad(!prebuilt->index->is_primary());
+ if (prebuilt->clust_index_was_generated) {
+ ulint len;
+ dict_index_t* index = prebuilt->index;
+ const byte* data = rec_get_nth_field(
+ rec, offsets, index->n_fields - 1,
+ &len);
+ ut_ad(dict_index_get_nth_col(index,
+ index->n_fields - 1)
+ ->prtype == (DATA_ROW_ID | DATA_NOT_NULL));
+ ut_ad(len == DATA_ROW_ID_LEN);
+ memcpy(prebuilt->row_id, data, DATA_ROW_ID_LEN);
+ }
+ result = handler_rowid_filter_check(prebuilt->pk_filter);
+ switch (result) {
+ case CHECK_NEG:
+ MONITOR_INC(MONITOR_ICP_NO_MATCH);
+ return(result);
+ case CHECK_OUT_OF_RANGE:
+ MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE);
+ return(result);
+ case CHECK_POS:
+ break;
+ default:
+ return(result);
+ }
+ }
+ /* Convert the remaining fields to MySQL format.
+ If this is a secondary index record, we must defer
+ this until we have fetched the clustered index record. */
+ if (!prebuilt->need_to_access_clustered
+ || dict_index_is_clust(prebuilt->index)) {
+ if (!row_sel_store_mysql_rec(
+ mysql_rec, prebuilt, rec, NULL, false,
+ prebuilt->index, offsets)) {
+ ut_ad(dict_index_is_clust(prebuilt->index));
+ return(CHECK_NEG);
+ }
+ }
+ MONITOR_INC(MONITOR_ICP_MATCH);
+ return(result);
+ case CHECK_NEG:
+ MONITOR_INC(MONITOR_ICP_NO_MATCH);
+ return(result);
+ case CHECK_OUT_OF_RANGE:
+ MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE);
+ return(result);
+ case CHECK_ERROR:
+ case CHECK_ABORTED_BY_USER:
+ return(result);
+ }
+
+ ut_error;
+ return(result);
+}
+
+/** Extract virtual column data from a virtual index record and fill a dtuple
+@param[in] rec the virtual (secondary) index record
+@param[in] index the virtual index
+@param[in,out] vrow the dtuple where the data is extracted to
+@param[in] heap memory heap to allocate memory
+*/
+static
+void
+row_sel_fill_vrow(
+ const rec_t* rec,
+ dict_index_t* index,
+ dtuple_t** vrow,
+ mem_heap_t* heap)
+{
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(!(*vrow));
+ ut_ad(heap);
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(!index->is_instant());
+ ut_ad(page_rec_is_leaf(rec));
+
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ *vrow = dtuple_create_with_vcol(
+ heap, 0, dict_table_get_n_v_cols(index->table));
+
+ /* Initialize the mtype of all virtual fields to DATA_MISSING */
+ dtuple_init_v_fld(*vrow);
+
+ for (ulint i = 0; i < dict_index_get_n_fields(index); i++) {
+ const dict_field_t* field;
+ const dict_col_t* col;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+
+ if (col->is_virtual()) {
+ const byte* data;
+ ulint len;
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ const dict_v_col_t* vcol = reinterpret_cast<
+ const dict_v_col_t*>(col);
+
+ dfield_t* dfield = dtuple_get_nth_v_field(
+ *vrow, vcol->v_pos);
+ dfield_set_data(dfield, data, len);
+ dict_col_copy_type(col, dfield_get_type(dfield));
+ }
+ }
+}
+
+/** Return the record field length in characters.
+@param[in] col table column of the field
+@param[in] field_no field number
+@param[in] rec physical record
+@param[in] offsets field offsets in the physical record
+@return field length in characters. */
+static
+size_t
+rec_field_len_in_chars(
+ const dict_col_t* col,
+ const ulint field_no,
+ const rec_t* rec,
+ const rec_offs* offsets)
+{
+ const ulint cset = dtype_get_charset_coll(col->prtype);
+ const CHARSET_INFO* cs = all_charsets[cset];
+ ulint rec_field_len;
+ const char* rec_field = reinterpret_cast<const char *>(
+ rec_get_nth_field(
+ rec, offsets, field_no, &rec_field_len));
+
+ if (UNIV_UNLIKELY(!cs)) {
+ ib::warn() << "Missing collation " << cset;
+ return SIZE_T_MAX;
+ }
+
+ return cs->numchars(rec_field, rec_field + rec_field_len);
+}
+
+/** Avoid the clustered index lookup if all the following conditions
+are true:
+1) all columns are in secondary index
+2) all values for columns that are prefix-only indexes are shorter
+than the prefix size. This optimization can avoid many IOs for certain schemas.
+@return true, to avoid clustered index lookup. */
+static
+bool row_search_with_covering_prefix(
+ row_prebuilt_t* prebuilt,
+ const rec_t* rec,
+ const rec_offs* offsets)
+{
+ const dict_index_t* index = prebuilt->index;
+ ut_ad(!dict_index_is_clust(index));
+
+ if (dict_index_is_spatial(index)) {
+ return false;
+ }
+
+ if (!srv_prefix_index_cluster_optimization) {
+ return false;
+ }
+
+ /** The optimization is only applicable if the secondary index
+ has at least as many fields as the row template requests. */
+ if (prebuilt->n_template > index->n_fields) {
+ return false;
+ }
+
+ /* We can avoid a clustered index lookup if
+ all of the following hold:
+ (1) all columns are in the secondary index
+ (2) all values for columns that are prefix-only
+ indexes are shorter than the prefix size
+ This optimization can avoid many IOs for certain schemas. */
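+ /* Illustrative example (hypothetical schema, single-byte
+ charset): with a secondary index on col(10) and a query reading
+ only col, a stored key value of 7 bytes is necessarily the
+ complete column value, so the clustered index lookup can be
+ skipped. */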
+ for (ulint i = 0; i < prebuilt->n_template; i++) {
+ mysql_row_templ_t* templ = prebuilt->mysql_template + i;
+ ulint j = templ->rec_prefix_field_no;
+ ut_ad(!templ->mbminlen == !templ->mbmaxlen);
+
+ /** Condition (1) : is the field in the index. */
+ if (j == ULINT_UNDEFINED) {
+ return false;
+ }
+
+ /** Condition (2): if this is a prefix index, then the
+ row's value size must be shorter than the prefix length. */
+
+ if (!templ->rec_field_is_prefix
+ || rec_offs_nth_sql_null(offsets, j)) {
+ continue;
+ }
+
+ const dict_field_t* field = dict_index_get_nth_field(index, j);
+
+ if (!field->prefix_len) {
+ continue;
+ }
+
+ const ulint rec_size = rec_offs_nth_size(offsets, j);
+
+ if (rec_size >= field->prefix_len) {
+ /* The stored value occupies at least the full
+ index prefix in bytes, so the value may have been
+ truncated to the prefix. */
+ return false;
+ }
+
+ if (templ->mbminlen != templ->mbmaxlen
+ && rec_field_len_in_chars(field->col, j, rec, offsets)
+ >= field->prefix_len / templ->mbmaxlen) {
+ /* The number of characters in the stored value
+ reaches the index prefix character limit, so the
+ value may have been truncated. */
+ return false;
+ }
+ }
+
+ /* If prefix index optimization condition satisfied then
+ for all columns above, use rec_prefix_field_no instead of
+ rec_field_no, and skip the clustered lookup below. */
+ for (ulint i = 0; i < prebuilt->n_template; i++) {
+ mysql_row_templ_t* templ = prebuilt->mysql_template + i;
+ templ->rec_field_no = templ->rec_prefix_field_no;
+ ut_a(templ->rec_field_no != ULINT_UNDEFINED);
+ }
+
+ srv_stats.n_sec_rec_cluster_reads_avoided.inc();
+ return true;
+}
+
+/** Searches for rows in the database using a cursor.
+The function is mainly used for tables that are shared across connections,
+so it employs techniques that help re-construct the rows that the
+transaction is supposed to see.
+It also has optimizations such as pre-caching the rows, using the AHI, etc.
+
+@param[out] buf buffer for the fetched row in MySQL format
+@param[in] mode search mode PAGE_CUR_L
+@param[in,out] prebuilt prebuilt struct for the table handler;
+ this contains the info to search_tuple,
+ index; if search tuple contains 0 field then
+ we position the cursor at start or the end of
+ index, depending on 'mode'
+@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX
+@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV;
+ Note: if this is != 0, then prebuilt must have a
+ pcur with a stored position! When opening a
+ cursor, 'direction' should be 0.
+@return DB_SUCCESS or error code */
+dberr_t
+row_search_mvcc(
+ byte* buf,
+ page_cur_mode_t mode,
+ row_prebuilt_t* prebuilt,
+ ulint match_mode,
+ ulint direction)
+{
+ DBUG_ENTER("row_search_mvcc");
+ DBUG_ASSERT(prebuilt->index->table == prebuilt->table);
+
+ dict_index_t* index = prebuilt->index;
+ ibool comp = dict_table_is_comp(prebuilt->table);
+ const dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ dict_index_t* clust_index;
+ que_thr_t* thr;
+ const rec_t* UNINIT_VAR(rec);
+ dtuple_t* vrow = NULL;
+ const rec_t* result_rec = NULL;
+ const rec_t* clust_rec;
+ Row_sel_get_clust_rec_for_mysql row_sel_get_clust_rec_for_mysql;
+ ibool unique_search = FALSE;
+ ibool mtr_has_extra_clust_latch = FALSE;
+ ibool moves_up = FALSE;
+ ulint next_offs;
+ ibool same_user_rec;
+ ibool table_lock_waited = FALSE;
+ byte* next_buf = 0;
+ bool spatial_search = false;
+
+ ut_ad(index && pcur && search_tuple);
+ ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+ ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
+
+ /* We don't support FTS queries from the HANDLER interfaces, because
+ we implemented FTS as a reversed inverted index with auxiliary tables.
+ So anything related to a traditional index query would not apply to
+ it. */
+ if (prebuilt->index->type & DICT_FTS) {
+ DBUG_RETURN(DB_END_OF_INDEX);
+ }
+
+ ut_ad(!sync_check_iterate(sync_check()));
+
+ if (!prebuilt->table->space) {
+ DBUG_RETURN(DB_TABLESPACE_DELETED);
+ } else if (!prebuilt->table->is_readable()) {
+ DBUG_RETURN(prebuilt->table->space
+ ? DB_DECRYPTION_FAILED
+ : DB_TABLESPACE_NOT_FOUND);
+ } else if (!prebuilt->index_usable) {
+ DBUG_RETURN(DB_MISSING_HISTORY);
+ } else if (prebuilt->index->is_corrupted()) {
+ DBUG_RETURN(DB_CORRUPTION);
+ }
+
+	/* We need to get the virtual column values stored in the secondary
+	index key, if this is a covering index scan or a virtual key read
+	is requested. */
+ bool need_vrow = dict_index_has_virtual(prebuilt->index)
+ && prebuilt->read_just_key;
+
+	/* Reset the new record lock info if the READ UNCOMMITTED or
+	READ COMMITTED isolation level is used. Then
+	we are able to remove the record locks set here on an individual
+	row. */
+ prebuilt->new_rec_locks = 0;
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 1: Try to pop the row from the prefetch cache */
+
+ if (UNIV_UNLIKELY(direction == 0)) {
+ trx->op_info = "starting index read";
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+ } else {
+ trx->op_info = "fetching rows";
+
+ if (prebuilt->n_rows_fetched == 0) {
+ prebuilt->fetch_direction = direction;
+ }
+
+ if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
+ if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
+ ut_error;
+ /* TODO: scrollable cursor: restore cursor to
+ the place of the latest returned row,
+ or better: prevent caching for a scroll
+ cursor! */
+ }
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
+ row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
+
+ prebuilt->n_rows_fetched++;
+ trx->op_info = "";
+ DBUG_RETURN(DB_SUCCESS);
+ }
+
+ if (prebuilt->fetch_cache_first > 0
+ && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
+early_not_found:
+ /* The previous returned row was popped from the fetch
+ cache, but the cache was not full at the time of the
+ popping: no more rows can exist in the result set */
+ trx->op_info = "";
+ DBUG_RETURN(DB_RECORD_NOT_FOUND);
+ }
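+		/* Illustration (hypothetical): suppose a previous fill of
+		the prefetch cache stopped short of MYSQL_FETCH_CACHE_SIZE
+		rows because the scan reached the end of the result set.
+		Once those cached rows have all been returned, no further
+		row can exist, so we avoid another B-tree probe here. */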
+
+ prebuilt->n_rows_fetched++;
+
+ if (prebuilt->n_rows_fetched > 1000000000) {
+ /* Prevent wrap-over */
+ prebuilt->n_rows_fetched = 500000000;
+ }
+
+ mode = pcur->search_mode;
+ }
+
+ /* In a search where at most one record in the index may match, we
+ can use a LOCK_REC_NOT_GAP type record lock when locking a
+ non-delete-marked matching record.
+
+ Note that in a unique secondary index there may be different
+ delete-marked versions of a record where only the primary key
+ values differ: thus in a secondary index we must use next-key
+ locks when locking delete-marked records. */
+
+ if (match_mode == ROW_SEL_EXACT
+ && dict_index_is_unique(index)
+ && dtuple_get_n_fields(search_tuple)
+ == dict_index_get_n_unique(index)
+ && (dict_index_is_clust(index)
+ || !dtuple_contains_null(search_tuple))) {
+
+ /* Note above that a UNIQUE secondary index can contain many
+ rows with the same key value if one of the columns is the SQL
+ null. A clustered index under MySQL can never contain null
+ columns because we demand that all the columns in primary key
+ are non-null. */
+
+ unique_search = TRUE;
+
+		/* Even if the condition is unique, MySQL seems to try to
+		retrieve a second row as well if the primary key contains
+		more than one column. Return immediately if this is not a
+		HANDLER command. */
+
+ if (UNIV_UNLIKELY(direction != 0
+ && !prebuilt->used_in_HANDLER)) {
+ goto early_not_found;
+ }
+ }
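+	/* Illustrative example (not from this codebase): with
+	PRIMARY KEY(id), the query SELECT * FROM t WHERE id = 5 FOR UPDATE
+	matches at most one record, so unique_search is TRUE and a
+	LOCK_REC_NOT_GAP lock on the matching record suffices. */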
+
+	/* We don't support a sequential scan on an R-tree index,
+	because doing so would be meaningless. */
+ if (dict_index_is_spatial(index) && !RTREE_SEARCH_MODE(mode)) {
+ trx->op_info = "";
+ DBUG_RETURN(DB_END_OF_INDEX);
+ }
+
+	/* This is set to true if the returned record was locked and we
+	did a semi-consistent read (fetched the newest committed version) */
+	bool did_semi_consistent_read = false;
+ mtr_t mtr;
+ mtr.start();
+
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /*-------------------------------------------------------------*/
+ /* PHASE 2: Try fast adaptive hash index search if possible */
+
+ /* Next test if this is the special case where we can use the fast
+ adaptive hash index to try the search. Since we must release the
+ search system latch when we retrieve an externally stored field, we
+ cannot use the adaptive hash index in a search in the case the row
+ may be long and there may be externally stored fields */
+
+ if (UNIV_UNLIKELY(direction == 0)
+ && unique_search
+ && btr_search_enabled
+ && dict_index_is_clust(index)
+ && !prebuilt->templ_contains_blob
+ && !prebuilt->used_in_HANDLER
+ && (prebuilt->mysql_row_len < srv_page_size / 8)) {
+
+ mode = PAGE_CUR_GE;
+
+ if (prebuilt->select_lock_type == LOCK_NONE
+ && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+ && trx->read_view.is_open()) {
+
+ /* This is a SELECT query done as a consistent read,
+ and the read view has already been allocated:
+ let us try a search shortcut through the hash
+ index. */
+
+ dberr_t err = DB_SUCCESS;
+ switch (row_sel_try_search_shortcut_for_mysql(
+ &rec, prebuilt, &offsets, &heap,
+ &mtr)) {
+ case SEL_FOUND:
+ /* At this point, rec is protected by
+ a page latch that was acquired by
+ row_sel_try_search_shortcut_for_mysql().
+ The latch will not be released until
+ mtr.commit(). */
+ ut_ad(!rec_get_deleted_flag(rec, comp));
+
+ if (prebuilt->pk_filter || prebuilt->idx_cond) {
+ switch (row_search_idx_cond_check(
+ buf, prebuilt,
+ rec, offsets)) {
+ case CHECK_ABORTED_BY_USER:
+ goto aborted;
+ case CHECK_NEG:
+ case CHECK_OUT_OF_RANGE:
+ case CHECK_ERROR:
+ err = DB_RECORD_NOT_FOUND;
+ goto shortcut_done;
+ case CHECK_POS:
+ goto shortcut_done;
+ }
+
+ ut_ad("incorrect code" == 0);
+aborted:
+ err = DB_INTERRUPTED;
+ goto shortcut_done;
+ }
+
+ if (!row_sel_store_mysql_rec(
+ buf, prebuilt,
+ rec, NULL, false, index,
+ offsets)) {
+ /* Only fresh inserts may contain
+ incomplete externally stored
+ columns. Pretend that such
+ records do not exist. Such
+ records may only be accessed
+ at the READ UNCOMMITTED
+ isolation level or when
+ rolling back a recovered
+ transaction. Rollback happens
+ at a lower level, not here. */
+
+ /* Proceed as in case SEL_RETRY. */
+ break;
+ }
+
+ goto shortcut_done;
+
+ case SEL_EXHAUSTED:
+ err = DB_RECORD_NOT_FOUND;
+ shortcut_done:
+ mtr.commit();
+
+ /* NOTE that we do NOT store the cursor
+ position */
+ trx->op_info = "";
+ ut_ad(!sync_check_iterate(sync_check()));
+ ut_ad(!did_semi_consistent_read);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ DBUG_RETURN(err);
+
+ case SEL_RETRY:
+ break;
+
+ default:
+ ut_ad(0);
+ }
+
+ mtr.commit();
+ mtr.start();
+ }
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 3: Open or restore index cursor position */
+
+ spatial_search = dict_index_is_spatial(index)
+ && mode >= PAGE_CUR_CONTAIN;
+
+ /* The state of a running trx can only be changed by the
+ thread that is currently serving the transaction. Because we
+ are that thread, we can read trx->state without holding any
+ mutex. */
+ ut_ad(prebuilt->sql_stat_start
+ || trx->state == TRX_STATE_ACTIVE
+ || (prebuilt->table->no_rollback()
+ && trx->state == TRX_STATE_NOT_STARTED));
+
+ ut_ad(!trx_is_started(trx) || trx->state == TRX_STATE_ACTIVE);
+
+ ut_ad(prebuilt->sql_stat_start
+ || prebuilt->select_lock_type != LOCK_NONE
+ || trx->read_view.is_open()
+ || prebuilt->table->no_rollback()
+ || srv_read_only_mode);
+
+ /* Do not lock gaps at READ UNCOMMITTED or READ COMMITTED
+ isolation level */
+ const bool set_also_gap_locks =
+ prebuilt->select_lock_type != LOCK_NONE
+ && trx->isolation_level > TRX_ISO_READ_COMMITTED
+#ifdef WITH_WSREP
+ && !wsrep_thd_skip_locking(trx->mysql_thd)
+#endif /* WITH_WSREP */
+ ;
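+	/* In broad terms: at REPEATABLE READ or stronger, a locking
+	SELECT sets next-key (record + gap) locks to prevent phantom
+	rows, while at READ COMMITTED or READ UNCOMMITTED only the
+	index records themselves are locked and gaps are left free. */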
+
+ /* Note that if the search mode was GE or G, then the cursor
+ naturally moves upward (in fetch next) in alphabetical order,
+ otherwise downward */
+
+ if (UNIV_UNLIKELY(direction == 0)) {
+ if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G
+ || mode >= PAGE_CUR_CONTAIN) {
+ moves_up = TRUE;
+ }
+ } else if (direction == ROW_SEL_NEXT) {
+ moves_up = TRUE;
+ }
+
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ thr->start_running();
+
+ clust_index = dict_table_get_first_index(prebuilt->table);
+
+ dberr_t err = DB_SUCCESS;
+
+ /* Do some start-of-statement preparations */
+
+ if (prebuilt->table->no_rollback()) {
+ /* NO_ROLLBACK tables do not support MVCC or locking. */
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->sql_stat_start = FALSE;
+ } else if (!prebuilt->sql_stat_start) {
+ /* No need to set an intention lock or assign a read view */
+ ut_a(prebuilt->select_lock_type != LOCK_NONE
+ || srv_read_only_mode || trx->read_view.is_open());
+ } else {
+ prebuilt->sql_stat_start = FALSE;
+ trx_start_if_not_started(trx, false);
+
+ if (prebuilt->select_lock_type == LOCK_NONE) {
+ trx->read_view.open(trx);
+ } else {
+wait_table_again:
+ err = lock_table(0, prebuilt->table,
+ prebuilt->select_lock_type == LOCK_S
+ ? LOCK_IS : LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ table_lock_waited = TRUE;
+ goto lock_table_wait;
+ }
+ }
+ }
+
+ /* Open or restore index cursor position */
+
+ if (UNIV_LIKELY(direction != 0)) {
+ if (spatial_search) {
+			/* R-tree access does not need to store or
+			restore the cursor position */
+ goto next_rec;
+ }
+
+ bool need_to_process = sel_restore_position_for_mysql(
+ &same_user_rec, BTR_SEARCH_LEAF,
+ pcur, moves_up, &mtr);
+
+ if (UNIV_UNLIKELY(need_to_process)) {
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ /* We did a semi-consistent read,
+ but the record was removed in
+ the meantime. */
+ prebuilt->row_read_type
+ = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ } else if (UNIV_LIKELY(prebuilt->row_read_type
+ != ROW_READ_DID_SEMI_CONSISTENT)) {
+
+ /* The cursor was positioned on the record
+ that we returned previously. If we need
+ to repeat a semi-consistent read as a
+ pessimistic locking read, the record
+ cannot be skipped. */
+
+ goto next_rec;
+ }
+
+ } else if (dtuple_get_n_fields(search_tuple) > 0) {
+ pcur->btr_cur.thr = thr;
+
+ if (dict_index_is_spatial(index)) {
+ if (!prebuilt->rtr_info) {
+ prebuilt->rtr_info = rtr_create_rtr_info(
+ set_also_gap_locks, true,
+ btr_pcur_get_btr_cur(pcur), index);
+ prebuilt->rtr_info->search_tuple = search_tuple;
+ prebuilt->rtr_info->search_mode = mode;
+ rtr_info_update_btr(btr_pcur_get_btr_cur(pcur),
+ prebuilt->rtr_info);
+ } else {
+ rtr_info_reinit_in_cursor(
+ btr_pcur_get_btr_cur(pcur),
+ index, set_also_gap_locks);
+ prebuilt->rtr_info->search_tuple = search_tuple;
+ prebuilt->rtr_info->search_mode = mode;
+ }
+ }
+
+ err = btr_pcur_open_with_no_init(index, search_tuple, mode,
+ BTR_SEARCH_LEAF,
+ pcur, 0, &mtr);
+
+ if (err != DB_SUCCESS) {
+ rec = NULL;
+ goto page_read_error;
+ }
+
+ pcur->trx_if_known = trx;
+
+ rec = btr_pcur_get_rec(pcur);
+ ut_ad(page_rec_is_leaf(rec));
+
+ if (!moves_up
+ && set_also_gap_locks
+ && !page_rec_is_supremum(rec)
+ && !dict_index_is_spatial(index)) {
+
+ /* Try to place a gap lock on the next index record
+ to prevent phantoms in ORDER BY ... DESC queries */
+ const rec_t* next_rec = page_rec_get_next_const(rec);
+
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ err = sel_set_rec_lock(pcur,
+ next_rec, index, offsets,
+ prebuilt->select_lock_type,
+ LOCK_GAP, thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) {
+ err = btr_pcur_open_at_index_side(
+ mode == PAGE_CUR_G, index, BTR_SEARCH_LEAF,
+ pcur, false, 0, &mtr);
+
+ if (err != DB_SUCCESS) {
+ if (err == DB_DECRYPTION_FAILED) {
+ ib_push_warning(trx->mysql_thd,
+ DB_DECRYPTION_FAILED,
+ "Table %s is encrypted but encryption service or"
+ " used key_id is not available. "
+ " Can't continue reading table.",
+ prebuilt->table->name.m_name);
+ index->table->file_unreadable = true;
+ }
+ rec = NULL;
+ goto page_read_error;
+ }
+ }
+
+rec_loop:
+ DEBUG_SYNC_C("row_search_rec_loop");
+ if (trx_is_interrupted(trx)) {
+ if (!spatial_search) {
+ btr_pcur_store_position(pcur, &mtr);
+ }
+ err = DB_INTERRUPTED;
+ goto normal_return;
+ }
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 4: Look for matching records in a loop */
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!index->table->is_readable()) {
+ err = DB_DECRYPTION_FAILED;
+ goto page_read_error;
+ }
+
+ ut_ad(!!page_rec_is_comp(rec) == comp);
+ ut_ad(page_rec_is_leaf(rec));
+
+ if (page_rec_is_infimum(rec)) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. */
+
+ goto next_rec;
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ if (set_also_gap_locks
+ && !dict_index_is_spatial(index)) {
+
+ /* Try to place a lock on the index record */
+
+ /* If the transaction isolation level is
+ READ UNCOMMITTED or READ COMMITTED,
+ we do not lock gaps. Supremum record is really
+ a gap and therefore we do not set locks there. */
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ err = sel_set_rec_lock(pcur,
+ rec, index, offsets,
+ prebuilt->select_lock_type,
+ LOCK_ORDINARY, thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ /* A page supremum record cannot be in the result set: skip
+ it now that we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ /*-------------------------------------------------------------*/
+ /* Do sanity checks in case our cursor has bumped into page
+ corruption */
+
+ if (comp) {
+ if (rec_get_info_bits(rec, true) & REC_INFO_MIN_REC_FLAG) {
+ /* Skip the metadata pseudo-record. */
+ ut_ad(index->is_instant());
+ goto next_rec;
+ }
+
+ next_offs = rec_get_next_offs(rec, TRUE);
+ if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
+
+ goto wrong_offs;
+ }
+ } else {
+ if (rec_get_info_bits(rec, false) & REC_INFO_MIN_REC_FLAG) {
+ /* Skip the metadata pseudo-record. */
+ ut_ad(index->is_instant());
+ goto next_rec;
+ }
+
+ next_offs = rec_get_next_offs(rec, FALSE);
+ if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
+
+ goto wrong_offs;
+ }
+ }
+
+ if (UNIV_UNLIKELY(next_offs >= srv_page_size - PAGE_DIR)) {
+
+wrong_offs:
+ if (srv_force_recovery == 0 || moves_up == FALSE) {
+ ib::error() << "Rec address "
+ << static_cast<const void*>(rec)
+ << ", buf block fix count "
+ << btr_pcur_get_block(pcur)->page
+ .buf_fix_count();
+
+ ib::error() << "Index corruption: rec offs "
+ << page_offset(rec) << " next offs "
+ << next_offs
+ << btr_pcur_get_block(pcur)->page.id()
+ << ", index " << index->name
+ << " of table " << index->table->name
+ << ". Run CHECK TABLE. You may need to"
+ " restore from a backup, or dump + drop +"
+ " reimport the table.";
+ ut_ad(0);
+ err = DB_CORRUPTION;
+
+ goto page_read_error;
+ } else {
+ /* The user may be dumping a corrupt table. Jump
+ over the corruption to recover as much as possible. */
+
+ ib::info() << "Index corruption: rec offs "
+ << page_offset(rec) << " next offs "
+ << next_offs
+ << btr_pcur_get_block(pcur)->page.id()
+ << ", index " << index->name
+ << " of table " << index->table->name
+ << ". We try to skip the rest of the page.";
+
+ page_cur_set_after_last(btr_pcur_get_block(pcur),
+ btr_pcur_get_page_cur(pcur));
+ pcur->old_stored = false;
+ goto next_rec;
+ }
+ }
+ /*-------------------------------------------------------------*/
+
+ /* Calculate the 'offsets' associated with 'rec' */
+
+ ut_ad(fil_page_index_page_check(btr_pcur_get_page(pcur)));
+ ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id);
+
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
+ if (!rec_validate(rec, offsets)
+ || !btr_index_rec_validate(rec, index, FALSE)) {
+
+ ib::error() << "Index corruption: rec offs "
+ << page_offset(rec) << " next offs "
+ << next_offs
+ << btr_pcur_get_block(pcur)->page.id()
+ << ", index " << index->name
+ << " of table " << index->table->name
+ << ". We try to skip the record.";
+
+ goto next_rec;
+ }
+ }
+
+ /* Note that we cannot trust the up_match value in the cursor at this
+ place because we can arrive here after moving the cursor! Thus
+ we have to recompare rec and search_tuple to determine if they
+ match enough. */
+
+ if (match_mode == ROW_SEL_EXACT) {
+ /* Test if the index record matches completely to search_tuple
+ in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
+
+ /* fputs("Comparing rec and search tuple\n", stderr); */
+
+ if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
+
+ if (set_also_gap_locks
+ && !dict_index_is_spatial(index)) {
+ err = sel_set_rec_lock(
+ pcur,
+ rec, index, offsets,
+ prebuilt->select_lock_type, LOCK_GAP,
+ thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ /* The found record was not a match, but may be used
+ as NEXT record (index_next). Set the relative position
+ to BTR_PCUR_BEFORE, to reflect that the position of
+ the persistent cursor is before the found/stored row
+ (pcur->old_rec). */
+ ut_ad(pcur->rel_pos == BTR_PCUR_ON);
+ pcur->rel_pos = BTR_PCUR_BEFORE;
+
+ err = DB_RECORD_NOT_FOUND;
+ goto normal_return;
+ }
+
+ } else if (match_mode == ROW_SEL_EXACT_PREFIX) {
+
+ if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
+
+ if (set_also_gap_locks
+ && !dict_index_is_spatial(index)) {
+ err = sel_set_rec_lock(
+ pcur,
+ rec, index, offsets,
+ prebuilt->select_lock_type, LOCK_GAP,
+ thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ /* The found record was not a match, but may be used
+ as NEXT record (index_next). Set the relative position
+ to BTR_PCUR_BEFORE, to reflect that the position of
+ the persistent cursor is before the found/stored row
+ (pcur->old_rec). */
+ ut_ad(pcur->rel_pos == BTR_PCUR_ON);
+ pcur->rel_pos = BTR_PCUR_BEFORE;
+
+ err = DB_RECORD_NOT_FOUND;
+ goto normal_return;
+ }
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record; note that delete
+ marked records are a special case in a unique search. If there
+ is a non-delete marked record, then it is enough to lock its
+ existence with LOCK_REC_NOT_GAP. */
+
+ unsigned lock_type;
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+ /* At READ COMMITTED or READ UNCOMMITTED
+ isolation levels, do not lock committed
+ delete-marked records. */
+ if (!rec_get_deleted_flag(rec, comp)) {
+ goto no_gap_lock;
+ }
+
+ /* At most one transaction can be active
+ for temporary table. */
+ if (clust_index->table->is_temporary()) {
+ goto no_gap_lock;
+ }
+
+ if (index == clust_index) {
+ trx_id_t trx_id = row_get_rec_trx_id(
+ rec, index, offsets);
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(trx_id);
+ if (!trx_sys.is_registered(trx, trx_id)) {
+ /* The clustered index record
+ was delete-marked in a committed
+ transaction. Ignore the record. */
+ goto locks_ok_del_marked;
+ }
+ } else if (trx_t* t = row_vers_impl_x_locked(
+ trx, rec, index, offsets)) {
+ /* The record belongs to an active
+ transaction. We must acquire a lock. */
+ t->release_reference();
+ } else {
+ /* The secondary index record does not
+ point to a delete-marked clustered index
+ record that belongs to an active transaction.
+ Ignore the secondary index record, because
+ it is not locked. */
+ goto next_rec;
+ }
+
+ goto no_gap_lock;
+ }
+
+#ifdef WITH_WSREP
+ if (UNIV_UNLIKELY(!set_also_gap_locks)) {
+ ut_ad(wsrep_thd_skip_locking(trx->mysql_thd));
+ goto no_gap_lock;
+ }
+#else /* WITH_WSREP */
+ ut_ad(set_also_gap_locks);
+#endif /* WITH_WSREP */
+
+ if ((unique_search && !rec_get_deleted_flag(rec, comp))
+ || dict_index_is_spatial(index)) {
+
+ goto no_gap_lock;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ /* If we are doing a 'greater or equal than a primary key
+ value' search from a clustered index, and we find a record
+ that has that exact primary key value, then there is no need
+ to lock the gap before the record, because no insert in the
+ gap can be in our search range. That is, no phantom row can
+ appear that way.
+
+ An example: if col1 is the primary key, the search is WHERE
+ col1 >= 100, and we find a record where col1 = 100, then no
+ need to lock the gap before that record. */
+
+ if (index == clust_index
+ && mode == PAGE_CUR_GE
+ && direction == 0
+ && dtuple_get_n_fields_cmp(search_tuple)
+ == dict_index_get_n_unique(index)
+ && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
+no_gap_lock:
+ lock_type = LOCK_REC_NOT_GAP;
+ }
+
+ err = sel_set_rec_lock(pcur,
+ rec, index, offsets,
+ prebuilt->select_lock_type,
+ lock_type, thr, &mtr);
+
+ switch (err) {
+ const rec_t* old_vers;
+ case DB_SUCCESS_LOCKED_REC:
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+ /* Note that a record of
+ prebuilt->index was locked. */
+ prebuilt->new_rec_locks = 1;
+ }
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ break;
+ case DB_LOCK_WAIT:
+ /* Lock wait for R-tree should already
+ be handled in sel_set_rtr_rec_lock() */
+ ut_ad(!dict_index_is_spatial(index));
+ /* Never unlock rows that were part of a conflict. */
+ prebuilt->new_rec_locks = 0;
+
+ if (UNIV_LIKELY(prebuilt->row_read_type
+ != ROW_READ_TRY_SEMI_CONSISTENT)
+ || unique_search
+ || index != clust_index) {
+
+ goto lock_wait_or_error;
+ }
+
+ /* The following call returns 'offsets'
+ associated with 'old_vers' */
+ row_sel_build_committed_vers_for_mysql(
+ clust_index, prebuilt, rec,
+ &offsets, &heap, &old_vers, need_vrow ? &vrow : NULL,
+ &mtr);
+
+ /* Check whether it was a deadlock or not, if not
+ a deadlock and the transaction had to wait then
+ release the lock it is waiting on. */
+
+ err = lock_trx_handle_wait(trx);
+
+ switch (err) {
+ case DB_SUCCESS:
+ /* The lock was granted while we were
+ searching for the last committed version.
+ Do a normal locking read. */
+
+ offsets = rec_get_offsets(
+ rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ goto locks_ok;
+ case DB_DEADLOCK:
+ goto lock_wait_or_error;
+ case DB_LOCK_WAIT:
+ ut_ad(!dict_index_is_spatial(index));
+ err = DB_SUCCESS;
+ break;
+ default:
+ ut_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The row was not yet committed */
+
+ goto next_rec;
+ }
+
+ did_semi_consistent_read = true;
+ rec = old_vers;
+ break;
+ case DB_RECORD_NOT_FOUND:
+ if (dict_index_is_spatial(index)) {
+ goto next_rec;
+ } else {
+ goto lock_wait_or_error;
+ }
+
+ default:
+
+ goto lock_wait_or_error;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED
+ || prebuilt->table->no_rollback()) {
+
+ /* Do nothing: we let a non-locking SELECT read the
+ latest version of the record */
+
+ } else if (index == clust_index) {
+
+ /* Fetch a previous version of the row if the current
+ one is not visible in the snapshot; if we have a very
+ high force recovery level set, we try to avoid crashes
+ by skipping this lookup */
+
+ if (!lock_clust_rec_cons_read_sees(
+ rec, index, offsets, &trx->read_view)) {
+ ut_ad(srv_force_recovery
+ < SRV_FORCE_NO_UNDO_LOG_SCAN);
+ rec_t* old_vers;
+ /* The following call returns 'offsets'
+ associated with 'old_vers' */
+ err = row_sel_build_prev_vers_for_mysql(
+ &trx->read_view, clust_index,
+ prebuilt, rec, &offsets, &heap,
+ &old_vers, need_vrow ? &vrow : NULL,
+ &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The row did not exist yet in
+ the read view */
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else {
+ /* We are looking into a non-clustered index,
+ and to get the right version of the record we
+ have to look also into the clustered index: this
+ is necessary, because we can only get the undo
+ information via the clustered index record. */
+
+ ut_ad(!dict_index_is_clust(index));
+
+ if (!srv_read_only_mode
+ && !lock_sec_rec_cons_read_sees(
+ rec, index, &trx->read_view)) {
+ /* We should look at the clustered index.
+ However, as this is a non-locking read,
+ we can skip the clustered index lookup if
+ the condition does not match the secondary
+ index entry. */
+ switch (row_search_idx_cond_check(
+ buf, prebuilt, rec, offsets)) {
+ case CHECK_NEG:
+ goto next_rec;
+ case CHECK_ABORTED_BY_USER:
+ err = DB_INTERRUPTED;
+ goto idx_cond_failed;
+ case CHECK_OUT_OF_RANGE:
+ case CHECK_ERROR:
+ err = DB_RECORD_NOT_FOUND;
+ goto idx_cond_failed;
+ case CHECK_POS:
+ goto requires_clust_rec;
+ }
+
+ ut_error;
+ }
+ }
+ }
+
+locks_ok:
+ /* NOTE that at this point rec can be an old version of a clustered
+ index record built for a consistent read. We cannot assume after this
+ point that rec is on a buffer pool page. Functions like
+ page_rec_is_comp() cannot be used! */
+
+ if (rec_get_deleted_flag(rec, comp)) {
+locks_ok_del_marked:
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(index != clust_index
+ || row_get_rec_trx_id(rec, index, offsets));
+
+ /* The record is delete-marked: we can skip it */
+
+ /* This is an optimization to skip setting the next key lock
+ on the record that follows this delete-marked record. This
+ optimization works because of the unique search criteria
+ which precludes the presence of a range lock between this
+ delete marked record and the record following it.
+
+ For now this is applicable only to clustered indexes while
+ doing a unique search except for HANDLER queries because
+ HANDLER allows NEXT and PREV even in unique search on
+ clustered index. There is scope for further optimization
+		applicable to unique secondary indexes. Current behaviour is
+		to widen the scope of a lock on an already delete-marked
+		record if the same record is deleted twice by the same
+		transaction */
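+		/* Illustration (hypothetical): after DELETE FROM t WHERE
+		pk = 5 has committed but purge has not yet removed the
+		record, SELECT * FROM t WHERE pk = 5 FOR UPDATE finds only
+		the delete-marked record; for this unique clustered index
+		search we can return DB_RECORD_NOT_FOUND right away instead
+		of also locking the record that follows. */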
+ if (index == clust_index && unique_search
+ && !prebuilt->used_in_HANDLER) {
+
+ err = DB_RECORD_NOT_FOUND;
+
+ goto normal_return;
+ }
+
+ goto next_rec;
+ }
+
+ /* Check if the record matches the index condition. */
+ switch (row_search_idx_cond_check(buf, prebuilt, rec, offsets)) {
+ case CHECK_NEG:
+ if (did_semi_consistent_read) {
+ row_unlock_for_mysql(prebuilt, TRUE);
+ }
+ goto next_rec;
+ case CHECK_ABORTED_BY_USER:
+ err = DB_INTERRUPTED;
+ goto idx_cond_failed;
+ case CHECK_OUT_OF_RANGE:
+ case CHECK_ERROR:
+ err = DB_RECORD_NOT_FOUND;
+ goto idx_cond_failed;
+ case CHECK_POS:
+ break;
+ }
+
+ if (index != clust_index && prebuilt->need_to_access_clustered) {
+ if (row_search_with_covering_prefix(prebuilt, rec, offsets)) {
+ goto use_covering_index;
+ }
+requires_clust_rec:
+ ut_ad(index != clust_index);
+ /* We use a 'goto' to the preceding label if a consistent
+ read of a secondary index record requires us to look up old
+ versions of the associated clustered index record. */
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ /* It was a non-clustered index and we must fetch also the
+ clustered index record */
+
+ mtr_has_extra_clust_latch = TRUE;
+
+ ut_ad(!vrow);
+ /* The following call returns 'offsets' associated with
+ 'clust_rec'. Note that 'clust_rec' can be an old version
+ built for a consistent read. */
+
+ err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
+ thr, &clust_rec,
+ &offsets, &heap,
+ need_vrow ? &vrow : NULL,
+ &mtr);
+ switch (err) {
+ case DB_SUCCESS:
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(prebuilt->select_lock_type == LOCK_NONE
+ || dict_index_is_spatial(index));
+
+ goto next_rec;
+ }
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ ut_a(clust_rec != NULL);
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+ /* Note that the clustered index record
+ was locked. */
+ prebuilt->new_rec_locks = 2;
+ }
+ err = DB_SUCCESS;
+ break;
+ default:
+ vrow = NULL;
+ goto lock_wait_or_error;
+ }
+
+ if (rec_get_deleted_flag(clust_rec, comp)) {
+
+ /* The record is delete marked: we can skip it */
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* No need to keep a lock on a delete-marked
+ record if we do not want to use next-key
+ locking. */
+
+ row_unlock_for_mysql(prebuilt, TRUE);
+ }
+
+ goto next_rec;
+ }
+
+ if (need_vrow && !vrow) {
+ if (!heap) {
+ heap = mem_heap_create(100);
+ }
+ row_sel_fill_vrow(rec, index, &vrow, heap);
+ }
+
+ result_rec = clust_rec;
+ ut_ad(rec_offs_validate(result_rec, clust_index, offsets));
+
+ if (prebuilt->pk_filter || prebuilt->idx_cond) {
+ /* Convert the record to MySQL format. We were
+ unable to do this in row_search_idx_cond_check(),
+ because the condition is on the secondary index
+ and the requested column is in the clustered index.
+ We convert all fields, including those that
+ may have been used in ICP, because the
+ secondary index may contain a column prefix
+ rather than the full column. Also, as noted
+ in Bug #56680, the column in the secondary
+ index may be in the wrong case, and the
+ authoritative case is in result_rec, the
+ appropriate version of the clustered index record. */
+ if (!row_sel_store_mysql_rec(
+ buf, prebuilt, result_rec, vrow,
+ true, clust_index, offsets)) {
+ goto next_rec;
+ }
+ }
+ } else {
+use_covering_index:
+ result_rec = rec;
+ }
+
+ /* We found a qualifying record 'result_rec'. At this point,
+ 'offsets' are associated with 'result_rec'. */
+
+ ut_ad(rec_offs_validate(result_rec,
+ result_rec != rec ? clust_index : index,
+ offsets));
+ ut_ad(!rec_get_deleted_flag(result_rec, comp));
+
+ /* Decide whether to prefetch extra rows.
+ At this point, the clustered index record is protected
+ by a page latch that was acquired when pcur was positioned.
+ The latch will not be released until mtr.commit(). */
+
+ if ((match_mode == ROW_SEL_EXACT
+ || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
+ && prebuilt->select_lock_type == LOCK_NONE
+ && !prebuilt->templ_contains_blob
+ && !prebuilt->clust_index_was_generated
+ && !prebuilt->used_in_HANDLER
+ && prebuilt->template_type != ROW_MYSQL_DUMMY_TEMPLATE
+ && !prebuilt->in_fts_query) {
+
+ /* Inside an update, for example, we do not cache rows,
+ since we may use the cursor position to do the actual
+ update, that is why we require ...lock_type == LOCK_NONE.
+ Since we keep space in prebuilt only for the BLOBs of
+ a single row, we cannot cache rows in the case there
+ are BLOBs in the fields to be fetched. In HANDLER we do
+ not cache rows because there the cursor is a scrollable
+ cursor. */
+
+ ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+
+ /* We only convert from InnoDB row format to MySQL row
+ format when ICP is disabled. */
+
+ if (!prebuilt->pk_filter && !prebuilt->idx_cond) {
+ /* We use next_buf to track the allocation of buffers
+ where we store and enqueue the buffers for our
+ pre-fetch optimisation.
+
+ If next_buf == 0 then we store the converted record
+ directly into the MySQL record buffer (buf). If it is
+ != 0 then we allocate a pre-fetch buffer and store the
+ converted record there.
+
+ If the conversion fails and the MySQL record buffer
+ was not written to then we reset next_buf so that
+ we can re-use the MySQL record buffer in the next
+ iteration. */
+
+ next_buf = next_buf
+ ? row_sel_fetch_last_buf(prebuilt) : buf;
+
+ if (!row_sel_store_mysql_rec(
+ next_buf, prebuilt, result_rec, vrow,
+ result_rec != rec,
+ result_rec != rec ? clust_index : index,
+ offsets)) {
+
+ if (next_buf == buf) {
+ ut_a(prebuilt->n_fetch_cached == 0);
+ next_buf = 0;
+ }
+
+ /* Only fresh inserts may contain incomplete
+ externally stored columns. Pretend that such
+ records do not exist. Such records may only be
+ accessed at the READ UNCOMMITTED isolation
+ level or when rolling back a recovered
+ transaction. Rollback happens at a lower
+ level, not here. */
+ goto next_rec;
+ }
+
+ if (next_buf != buf) {
+ row_sel_enqueue_cache_row_for_mysql(
+ next_buf, prebuilt);
+ }
+ } else {
+ row_sel_enqueue_cache_row_for_mysql(buf, prebuilt);
+ }
+
+ if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) {
+ goto next_rec;
+ }
+
+ } else {
+ if (UNIV_UNLIKELY
+ (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) {
+ /* CHECK TABLE: fetch the row */
+
+ if (result_rec != rec
+ && !prebuilt->need_to_access_clustered) {
+ /* We used 'offsets' for the clust
+ rec, recalculate them for 'rec' */
+ offsets = rec_get_offsets(rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED,
+ &heap);
+ result_rec = rec;
+ }
+
+ memcpy(buf + 4, result_rec
+ - rec_offs_extra_size(offsets),
+ rec_offs_size(offsets));
+ mach_write_to_4(buf,
+ rec_offs_extra_size(offsets) + 4);
+ } else if (!prebuilt->pk_filter && !prebuilt->idx_cond) {
+ /* The record was not yet converted to MySQL format. */
+ if (!row_sel_store_mysql_rec(
+ buf, prebuilt, result_rec, vrow,
+ result_rec != rec,
+ result_rec != rec ? clust_index : index,
+ offsets)) {
+ /* Only fresh inserts may contain
+ incomplete externally stored
+ columns. Pretend that such records do
+ not exist. Such records may only be
+ accessed at the READ UNCOMMITTED
+ isolation level or when rolling back a
+ recovered transaction. Rollback
+ happens at a lower level, not here. */
+ goto next_rec;
+ }
+ }
+
+ if (!prebuilt->clust_index_was_generated) {
+ } else if (result_rec != rec || index->is_primary()) {
+ memcpy(prebuilt->row_id, result_rec, DATA_ROW_ID_LEN);
+ } else {
+ ulint len;
+ const byte* data = rec_get_nth_field(
+ result_rec, offsets, index->n_fields - 1,
+ &len);
+ ut_ad(dict_index_get_nth_col(index,
+ index->n_fields - 1)
+ ->prtype == (DATA_ROW_ID | DATA_NOT_NULL));
+ ut_ad(len == DATA_ROW_ID_LEN);
+ memcpy(prebuilt->row_id, data, DATA_ROW_ID_LEN);
+ }
+ }
+
+ /* From this point on, 'offsets' are invalid. */
+
+ /* We have an optimization to save CPU time: if this is a consistent
+ read on a unique condition on the clustered index, then we do not
+ store the pcur position, because any fetch next or prev will anyway
+ return 'end of file'. Exceptions are locking reads and the MySQL
+ HANDLER command where the user can move the cursor with PREV or NEXT
+ even after a unique search. */
+
+ err = DB_SUCCESS;
+
+idx_cond_failed:
+ if (!unique_search
+ || !dict_index_is_clust(index)
+ || direction != 0
+ || prebuilt->select_lock_type != LOCK_NONE
+ || prebuilt->used_in_HANDLER) {
+
+ /* Inside an update always store the cursor position */
+
+ if (!spatial_search) {
+ btr_pcur_store_position(pcur, &mtr);
+ }
+ }
+
+ goto normal_return;
+
+next_rec:
+ /* Reset the old and new "did semi-consistent read" flags. */
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ did_semi_consistent_read = false;
+ prebuilt->new_rec_locks = 0;
+ vrow = NULL;
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 5: Move the cursor to the next index record */
+
+ /* NOTE: For moves_up==FALSE, the mini-transaction will be
+ committed and restarted every time when switching b-tree
+ pages. For moves_up==TRUE in index condition pushdown, we can
+ scan an entire secondary index tree within a single
+ mini-transaction. As long as the prebuilt->idx_cond does not
+ match, we do not need to consult the clustered index or
+ return records to MySQL, and thus we can avoid repositioning
+ the cursor. What prevents us from buffer-fixing all leaf pages
+ within the mini-transaction is the btr_leaf_page_release()
+ call in btr_pcur_move_to_next_page(). Only the leaf page where
+ the cursor is positioned will remain buffer-fixed.
+ For R-tree spatial search, we also commit the mini-transaction
+ each time */
+
+ if (spatial_search) {
+		/* No need to store and restore the position for R-tree */
+ mtr.commit();
+ mtr.start();
+ mtr_has_extra_clust_latch = FALSE;
+ } else if (mtr_has_extra_clust_latch) {
+		/* If we hold an extra clustered index latch, we must
+		commit the mtr when moving to the next non-clustered
+		index record, because we could break the latching
+		order if we accessed a different clustered
+		index page right away without releasing the previous one. */
+
+ btr_pcur_store_position(pcur, &mtr);
+ mtr.commit();
+ mtr_has_extra_clust_latch = FALSE;
+
+ mtr.start();
+
+ if (sel_restore_position_for_mysql(&same_user_rec,
+ BTR_SEARCH_LEAF,
+ pcur, moves_up, &mtr)) {
+ goto rec_loop;
+ }
+ }
+
+ if (moves_up) {
+ if (UNIV_UNLIKELY(spatial_search)) {
+ if (rtr_pcur_move_to_next(
+ search_tuple, mode, pcur, 0, &mtr)) {
+ goto rec_loop;
+ }
+ } else {
+ const buf_block_t* block = btr_pcur_get_block(pcur);
+			/* This is based on btr_pcur_move_to_next(),
+			but avoids an infinite read loop on a corrupted page. */
+ ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(pcur->latch_mode != BTR_NO_LATCHES);
+ pcur->old_stored = false;
+ if (btr_pcur_is_after_last_on_page(pcur)) {
+ if (btr_pcur_is_after_last_in_tree(pcur)) {
+ goto not_moved;
+ }
+ btr_pcur_move_to_next_page(pcur, &mtr);
+ if (UNIV_UNLIKELY(btr_pcur_get_block(pcur)
+ == block)) {
+ err = DB_CORRUPTION;
+ goto lock_wait_or_error;
+ }
+ } else {
+ btr_pcur_move_to_next_on_page(pcur);
+ }
+
+ goto rec_loop;
+ }
+ } else {
+ if (btr_pcur_move_to_prev(pcur, &mtr)) {
+ goto rec_loop;
+ }
+ }
+
+not_moved:
+ if (!spatial_search) {
+ btr_pcur_store_position(pcur, &mtr);
+ }
+
+ err = match_mode ? DB_RECORD_NOT_FOUND : DB_END_OF_INDEX;
+ goto normal_return;
+
+lock_wait_or_error:
+ if (!dict_index_is_spatial(index)) {
+ btr_pcur_store_position(pcur, &mtr);
+ }
+page_read_error:
+ /* Reset the old and new "did semi-consistent read" flags. */
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ did_semi_consistent_read = false;
+
+lock_table_wait:
+ mtr.commit();
+ mtr_has_extra_clust_latch = FALSE;
+
+ trx->error_state = err;
+
+ /* The following is a patch for MySQL */
+
+ if (thr->is_active) {
+ que_thr_stop_for_mysql(thr);
+ }
+
+ thr->lock_state = QUE_THR_LOCK_ROW;
+
+ if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
+ /* It was a lock wait, and it ended */
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+ mtr.start();
+
+ /* Table lock waited, go try to obtain table lock
+ again */
+ if (table_lock_waited) {
+ table_lock_waited = FALSE;
+
+ goto wait_table_again;
+ }
+
+ if (!dict_index_is_spatial(index)) {
+ sel_restore_position_for_mysql(
+ &same_user_rec, BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+ }
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && !same_user_rec) {
+
+ /* Since we were not able to restore the cursor
+ on the same user record, we cannot use
+ row_unlock_for_mysql() to unlock any records, and
+ we must thus reset the new rec lock info. Since
+ in lock0lock.cc we have blocked the inheriting of gap
+ X-locks, we actually do not have any new record locks
+ set in this case.
+
+ Note that if we were able to restore on the 'same'
+ user record, it is still possible that we were actually
+ waiting on a delete-marked record, and meanwhile
+ it was removed by purge and inserted again by some
+ other user. But that is no problem, because in
+ rec_loop we will again try to set a lock, and
+ new_rec_lock_info in trx will be right at the end. */
+
+ prebuilt->new_rec_locks = 0;
+ }
+
+ mode = pcur->search_mode;
+
+ goto rec_loop;
+ }
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+ goto func_exit;
+
+normal_return:
+ /*-------------------------------------------------------------*/
+ {
+		/* handler_index_cond_check() may perform a TR_table search,
+		which initiates another row_search_mvcc(). */
+ ut_d(ulint n_active_thrs= trx->lock.n_active_thrs);
+ ut_d(trx->lock.n_active_thrs= 1);
+ thr->stop_no_error();
+ ut_d(trx->lock.n_active_thrs= n_active_thrs - 1);
+ }
+
+ mtr.commit();
+
+ DEBUG_SYNC_C("row_search_for_mysql_before_return");
+
+ if (prebuilt->pk_filter || prebuilt->idx_cond) {
+ /* When ICP is active we don't write to the MySQL buffer
+ directly, only to buffers that are enqueued in the pre-fetch
+ queue. We need to dequeue the first buffer and copy the contents
+ to the record buffer that was passed in by MySQL. */
+
+ if (prebuilt->n_fetch_cached > 0) {
+ row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
+ err = DB_SUCCESS;
+ }
+
+ } else if (next_buf != 0) {
+
+		/* We may or may not have enqueued some buffers to the
+		pre-fetch queue, but we definitely wrote to the record
+		buffer passed to us by MySQL. */
+
+ DEBUG_SYNC_C("row_search_cached_row");
+ err = DB_SUCCESS;
+ }
+
+#ifdef UNIV_DEBUG
+ if (dict_index_is_spatial(index) && err != DB_SUCCESS
+ && err != DB_END_OF_INDEX && err != DB_INTERRUPTED) {
+ rtr_node_path_t* path = pcur->btr_cur.rtr_info->path;
+
+ ut_ad(path->empty());
+ }
+#endif
+
+func_exit:
+ trx->op_info = "";
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ /* Set or reset the "did semi-consistent read" flag on return.
+ The flag did_semi_consistent_read is set if and only if
+ the record being returned was fetched with a semi-consistent read. */
+ ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
+ || !did_semi_consistent_read);
+
+ if (prebuilt->row_read_type != ROW_READ_WITH_LOCKS) {
+ if (did_semi_consistent_read) {
+ prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
+ } else {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ }
+
+ ut_ad(!sync_check_iterate(sync_check()));
+
+ DEBUG_SYNC_C("innodb_row_search_for_mysql_exit");
+
+ DBUG_RETURN(err);
+}
+
+/********************************************************************//**
+Count the rows at the R-tree leaf level.
+@return DB_SUCCESS if successful */
+dberr_t
+row_count_rtree_recs(
+/*=================*/
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct for the
+					table handler; this contains the info
+					on search_tuple, index; if the search
+					tuple contains 0 fields then we
+					position the cursor at the start or
+					the end of the index, depending on
+					'mode' */
+ ulint* n_rows) /*!< out: number of entries
+ seen in the consistent read */
+{
+ dict_index_t* index = prebuilt->index;
+ dberr_t ret = DB_SUCCESS;
+ mtr_t mtr;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dtuple_t* search_entry = prebuilt->search_tuple;
+ ulint entry_len;
+ ulint i;
+ byte* buf;
+
+ ut_a(dict_index_is_spatial(index));
+
+ *n_rows = 0;
+
+ heap = mem_heap_create(256);
+
+ /* Build a search tuple. */
+ entry_len = dict_index_get_n_fields(index);
+ entry = dtuple_create(heap, entry_len);
+
+ for (i = 0; i < entry_len; i++) {
+ const dict_field_t* ind_field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col
+ = ind_field->col;
+ dfield_t* dfield
+ = dtuple_get_nth_field(entry, i);
+
+ if (i == 0) {
+ double* mbr;
+ double tmp_mbr[SPDIMS * 2];
+
+ dfield->type.mtype = DATA_GEOMETRY;
+ dfield->type.prtype |= DATA_GIS_MBR;
+
+ /* Allocate memory for mbr field */
+ mbr = static_cast<double*>
+ (mem_heap_alloc(heap, DATA_MBR_LEN));
+
+ /* Set mbr field data. */
+ dfield_set_data(dfield, mbr, DATA_MBR_LEN);
+
+ for (uint j = 0; j < SPDIMS; j++) {
+ tmp_mbr[j * 2] = DBL_MAX;
+ tmp_mbr[j * 2 + 1] = -DBL_MAX;
+ }
+ dfield_write_mbr(dfield, tmp_mbr);
+ continue;
+ }
+
+ dfield->type.mtype = col->mtype;
+ dfield->type.prtype = col->prtype;
+
+ }
+
+ prebuilt->search_tuple = entry;
+
+ ulint bufsize = std::max<ulint>(srv_page_size,
+ prebuilt->mysql_row_len);
+ buf = static_cast<byte*>(ut_malloc_nokey(bufsize));
+
+ ulint cnt = 1000;
+
+ ret = row_search_for_mysql(buf, PAGE_CUR_WITHIN, prebuilt, 0, 0);
+loop:
+ /* Check thd->killed every 1,000 scanned rows */
+ if (--cnt == 0) {
+ if (trx_is_interrupted(prebuilt->trx)) {
+ ret = DB_INTERRUPTED;
+ goto func_exit;
+ }
+ cnt = 1000;
+ }
+
+ switch (ret) {
+ case DB_SUCCESS:
+ break;
+ case DB_DEADLOCK:
+ case DB_LOCK_TABLE_FULL:
+ case DB_LOCK_WAIT_TIMEOUT:
+ case DB_INTERRUPTED:
+ goto func_exit;
+ default:
+ /* fall through (this error is ignored by CHECK TABLE) */
+ case DB_END_OF_INDEX:
+ ret = DB_SUCCESS;
+func_exit:
+ prebuilt->search_tuple = search_entry;
+ ut_free(buf);
+ mem_heap_free(heap);
+
+ return(ret);
+ }
+
+ *n_rows = *n_rows + 1;
+
+ ret = row_search_for_mysql(
+ buf, PAGE_CUR_WITHIN, prebuilt, 0, ROW_SEL_NEXT);
+
+ goto loop;
+}
+
+/*******************************************************************//**
+Read the AUTOINC column from the current row. If the value is less than
+0 and the type is not unsigned then we reset the value to 0.
+@return value read from the column */
+static
+ib_uint64_t
+row_search_autoinc_read_column(
+/*===========================*/
+ dict_index_t* index, /*!< in: index to read from */
+ const rec_t* rec, /*!< in: current rec */
+ ulint col_no, /*!< in: column number */
+ ulint mtype, /*!< in: column main type */
+ ibool unsigned_type) /*!< in: signed or unsigned flag */
+{
+ ulint len;
+ const byte* data;
+ ib_uint64_t value;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+ ut_ad(page_rec_is_leaf(rec));
+
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ col_no + 1, &heap);
+
+ if (rec_offs_nth_sql_null(offsets, col_no)) {
+ /* There is no non-NULL value in the auto-increment column. */
+ value = 0;
+ goto func_exit;
+ }
+
+ data = rec_get_nth_field(rec, offsets, col_no, &len);
+
+ value = row_parse_int(data, len, mtype, unsigned_type);
+
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(value);
+}
+
+/** Get the maximum and non-delete-marked record in an index.
+@param[in] index index tree
+@param[in,out] mtr mini-transaction (may be committed and restarted)
+@return maximum record, page s-latched in mtr
+@retval NULL if there are no records, or if all of them are delete-marked */
+static
+const rec_t*
+row_search_get_max_rec(
+ dict_index_t* index,
+ mtr_t* mtr)
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ /* Open at the high/right end (false), and init cursor */
+ btr_pcur_open_at_index_side(
+ false, index, BTR_SEARCH_LEAF, &pcur, true, 0, mtr);
+
+ do {
+ const page_t* page;
+
+ page = btr_pcur_get_page(&pcur);
+ rec = page_find_rec_max_not_deleted(page);
+
+ if (page_rec_is_user_rec(rec)) {
+ break;
+ } else {
+ rec = NULL;
+ }
+ btr_pcur_move_before_first_on_page(&pcur);
+ } while (btr_pcur_move_to_prev(&pcur, mtr));
+
+ btr_pcur_close(&pcur);
+
+ ut_ad(!rec
+ || !(rec_get_info_bits(rec, dict_table_is_comp(index->table))
+ & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)));
+ return(rec);
+}
+
+/** Read the max AUTOINC value from an index.
+@param[in] index index starting with an AUTO_INCREMENT column
+@return the largest AUTO_INCREMENT value
+@retval 0 if no records were found */
+ib_uint64_t
+row_search_max_autoinc(dict_index_t* index)
+{
+ const dict_field_t* dfield = dict_index_get_nth_field(index, 0);
+
+ ib_uint64_t value = 0;
+
+ mtr_t mtr;
+ mtr.start();
+
+ if (const rec_t* rec = row_search_get_max_rec(index, &mtr)) {
+ value = row_search_autoinc_read_column(
+ index, rec, 0,
+ dfield->col->mtype,
+ dfield->col->prtype & DATA_UNSIGNED);
+ }
+
+ mtr.commit();
+ return(value);
+}
diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc
new file mode 100644
index 00000000..4cf4a873
--- /dev/null
+++ b/storage/innobase/row/row0uins.cc
@@ -0,0 +1,608 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0uins.cc
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0uins.h"
+#include "dict0dict.h"
+#include "dict0stats.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "row0log.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+#include "fil0fil.h"
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before performing that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
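+
+/* A minimal sketch of the required pattern (illustrative only, not a
+verbatim excerpt from this file):
+
+	log_free_check();	// reserve redo log space; no latches held yet
+	mtr.start();		// only then start the mini-transaction and
+				// acquire page latches
+	...			// B-tree changes that generate redo
+	mtr.commit();
+*/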
+
+/***************************************************************//**
+Removes a clustered index record. The pcur in node was positioned on the
+record, now it is detached.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_ins_remove_clust_rec(
+/*==========================*/
+ undo_node_t* node) /*!< in: undo node */
+{
+ ibool success;
+ dberr_t err;
+ ulint n_tries = 0;
+ mtr_t mtr;
+ dict_index_t* index = node->pcur.btr_cur.index;
+ bool online;
+
+ ut_ad(index->is_primary());
+ ut_ad(node->trx->in_rollback);
+
+ mtr.start();
+ if (index->table->is_temporary()) {
+ ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(index->table->id >= DICT_HDR_FIRST_ID);
+ online = false;
+ } else {
+ index->set_modified(mtr);
+ ut_ad(lock_table_has_locks(index->table));
+ online = dict_index_is_online_ddl(index);
+ if (online) {
+ ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
+ ut_ad(node->trx->dict_operation_lock_mode
+ != RW_X_LATCH);
+ ut_ad(node->table->id != DICT_INDEXES_ID);
+ ut_ad(node->table->id != DICT_COLUMNS_ID);
+ mtr_s_lock_index(index, &mtr);
+ }
+ }
+
+ /* This is similar to row_undo_mod_clust(). The DDL thread may
+ already have copied this row from the log to the new table.
+ We must log the removal, so that the row will be correctly
+ purged. However, we can log the removal out of sync with the
+ B-tree modification. */
+
+ success = btr_pcur_restore_position(
+ online
+ ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
+ : (node->rec_type == TRX_UNDO_INSERT_METADATA)
+ ? BTR_MODIFY_TREE : BTR_MODIFY_LEAF, &node->pcur, &mtr);
+ ut_a(success);
+
+ rec_t* rec = btr_pcur_get_rec(&node->pcur);
+
+ ut_ad(rec_get_trx_id(rec, index) == node->trx->id
+ || node->table->is_temporary());
+ ut_ad(!rec_get_deleted_flag(rec, index->table->not_redundant())
+ || rec_is_alter_metadata(rec, index->table->not_redundant()));
+ ut_ad(rec_is_metadata(rec, index->table->not_redundant())
+ == (node->rec_type == TRX_UNDO_INSERT_METADATA));
+
+ if (online && dict_index_is_online_ddl(index)) {
+ mem_heap_t* heap = NULL;
+ const rec_offs* offsets = rec_get_offsets(
+ rec, index, NULL, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ row_log_table_delete(rec, index, offsets, NULL);
+ mem_heap_free(heap);
+ } else {
+ switch (node->table->id) {
+ case DICT_INDEXES_ID:
+ ut_ad(!online);
+ ut_ad(node->trx->dict_operation_lock_mode
+ == RW_X_LATCH);
+ ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
+
+ dict_drop_index_tree(&node->pcur, node->trx, &mtr);
+ mtr.commit();
+
+ mtr.start();
+ success = btr_pcur_restore_position(
+ BTR_MODIFY_LEAF, &node->pcur, &mtr);
+ ut_a(success);
+ break;
+ case DICT_COLUMNS_ID:
+ /* This is rolling back an INSERT into SYS_COLUMNS.
+ If it was part of an instant ALTER TABLE operation, we
+ must evict the table definition, so that it can be
+ reloaded after the dictionary operation has been
+ completed. At this point, any corresponding operation
+ to the metadata record will have been rolled back. */
+ ut_ad(!online);
+ ut_ad(node->trx->dict_operation_lock_mode
+ == RW_X_LATCH);
+ ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
+ if (rec_get_n_fields_old(rec)
+ != DICT_NUM_FIELDS__SYS_COLUMNS) {
+ break;
+ }
+ ulint len;
+ const byte* data = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len);
+ if (len != 8) {
+ break;
+ }
+ node->trx->evict_table(mach_read_from_8(data));
+ }
+ }
+
+ if (btr_cur_optimistic_delete(&node->pcur.btr_cur, 0, &mtr)) {
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+retry:
+	/* If that did not succeed, try a pessimistic descent into the tree */
+ mtr.start();
+ if (index->table->is_temporary()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(mtr);
+ }
+
+ success = btr_pcur_restore_position(
+ BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+ &node->pcur, &mtr);
+ ut_a(success);
+
+ btr_cur_pessimistic_delete(&err, FALSE, &node->pcur.btr_cur, 0, true,
+ &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (err == DB_OUT_OF_FILE_SPACE
+ && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+func_exit:
+ if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_INSERT_METADATA) {
+ /* When rolling back the very first instant ADD COLUMN
+ operation, reset the root page to the basic state. */
+ btr_reset_instant(*index, true, &mtr);
+ }
+
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+ return(err);
+}
+
+/***************************************************************//**
+Removes a secondary index entry if found.
+@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_ins_remove_sec_low(
+/*========================*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry to remove */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ btr_pcur_t pcur;
+ dberr_t err = DB_SUCCESS;
+ mtr_t mtr;
+ const bool modify_leaf = mode == BTR_MODIFY_LEAF;
+
+ row_mtr_start(&mtr, index, !modify_leaf);
+
+ if (modify_leaf) {
+ mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+ mtr_s_lock_index(index, &mtr);
+ } else {
+ ut_ad(mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE));
+ mtr_sx_lock_index(index, &mtr);
+ }
+
+ if (row_log_online_op_try(index, entry, 0)) {
+ goto func_exit_no_pcur;
+ }
+
+ if (dict_index_is_spatial(index)) {
+ if (modify_leaf) {
+ mode |= BTR_RTREE_DELETE_MARK;
+ }
+ btr_pcur_get_btr_cur(&pcur)->thr = thr;
+ mode |= BTR_RTREE_UNDO_INS;
+ }
+
+ switch (row_search_index_entry(index, entry, mode, &pcur, &mtr)) {
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
+ case ROW_NOT_FOUND:
+ break;
+ case ROW_FOUND:
+ if (dict_index_is_spatial(index)
+ && rec_get_deleted_flag(
+ btr_pcur_get_rec(&pcur),
+ dict_table_is_comp(index->table))) {
+ ib::error() << "Record found in index " << index->name
+ << " is deleted marked on insert rollback.";
+ ut_ad(0);
+ }
+
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (modify_leaf) {
+ err = btr_cur_optimistic_delete(btr_cur, 0, &mtr)
+ ? DB_SUCCESS : DB_FAIL;
+ } else {
+ /* Passing rollback=false here, because we are
+ deleting a secondary index record: the distinction
+ only matters when deleting a record that contains
+ externally stored columns. */
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
+ false, &mtr);
+ }
+ }
+
+ btr_pcur_close(&pcur);
+func_exit_no_pcur:
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***************************************************************//**
+Removes a secondary index entry from the index if found. Tries first
+optimistic, then pessimistic descent down the tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_ins_remove_sec(
+/*====================*/
+ dict_index_t* index, /*!< in: index */
+	dtuple_t*	entry,	/*!< in: index entry to remove */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ ulint n_tries = 0;
+
+	/* First try an optimistic descent to the B-tree */
+
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry, thr);
+
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+	/* Then try a pessimistic descent to the B-tree */
+retry:
+ err = row_undo_ins_remove_sec_low(
+ BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+ index, entry, thr);
+
+	/* The delete operation may fail if we have little file
+	space left. TODO: the easiest fix would be to crash the database
+	and restart with more file space */
+
+ if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ return(err);
+}
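+
+/* The function above follows the common InnoDB retry idiom (an
+illustrative summary only; "op" is a stand-in for
+row_undo_ins_remove_sec_low()):
+
+	err = op(BTR_MODIFY_LEAF);			// optimistic
+	while (err != DB_SUCCESS) {
+		err = op(BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE);
+		if (err == DB_SUCCESS
+		    || n_tries++ >= BTR_CUR_RETRY_DELETE_N_TIMES) {
+			break;
+		}
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+	}
+*/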
+
+/** Parse an insert undo record.
+@param[in,out] node row rollback state
+@param[in] dict_locked whether the data dictionary cache is locked */
+static bool row_undo_ins_parse_undo_rec(undo_node_t* node, bool dict_locked)
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ ulint dummy;
+ bool dummy_extern;
+
+ ut_ad(node->state == UNDO_INSERT_PERSISTENT
+ || node->state == UNDO_INSERT_TEMPORARY);
+ ut_ad(node->trx->in_rollback);
+ ut_ad(trx_undo_roll_ptr_is_insert(node->roll_ptr));
+
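+	/* The undo record header holds the record type, the undo number
+	and the table ID; the remaining payload, parsed further below,
+	contains the PRIMARY KEY value and any indexed virtual columns. */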
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &node->rec_type, &dummy,
+ &dummy_extern, &undo_no, &table_id);
+
+ node->update = NULL;
+ if (node->state == UNDO_INSERT_PERSISTENT) {
+ node->table = dict_table_open_on_id(table_id, dict_locked,
+ DICT_TABLE_OP_NORMAL);
+ } else if (!dict_locked) {
+ mutex_enter(&dict_sys.mutex);
+ node->table = dict_sys.get_temporary_table(table_id);
+ mutex_exit(&dict_sys.mutex);
+ } else {
+ node->table = dict_sys.get_temporary_table(table_id);
+ }
+
+ if (!node->table) {
+ return false;
+ }
+
+ switch (node->rec_type) {
+ default:
+ ut_ad("wrong undo record type" == 0);
+ goto close_table;
+ case TRX_UNDO_INSERT_METADATA:
+ case TRX_UNDO_INSERT_REC:
+ break;
+ case TRX_UNDO_RENAME_TABLE:
+ dict_table_t* table = node->table;
+ ut_ad(!table->is_temporary());
+ ut_ad(dict_table_is_file_per_table(table)
+ == !is_system_tablespace(table->space_id));
+ size_t len = mach_read_from_2(node->undo_rec)
+ + size_t(node->undo_rec - ptr) - 2;
+ ptr[len] = 0;
+ const char* name = reinterpret_cast<char*>(ptr);
+ if (strcmp(table->name.m_name, name)) {
+ dict_table_rename_in_cache(table, name, false,
+ table_id != 0);
+ }
+ goto close_table;
+ }
+
+ if (UNIV_UNLIKELY(!node->table->is_accessible())) {
+close_table:
+ /* Normally, tables should not disappear or become
+		inaccessible during ROLLBACK, because they should be
+ protected by InnoDB table locks. Corruption could be
+ a valid exception.
+
+ FIXME: When running out of temporary tablespace, it
+ would probably be better to just drop all temporary
+ tables (and temporary undo log records) of the current
+ connection, instead of doing this rollback. */
+ dict_table_close(node->table, dict_locked, FALSE);
+ node->table = NULL;
+ return false;
+ } else {
+ ut_ad(!node->table->skip_alter_undo);
+ clust_index = dict_table_get_first_index(node->table);
+
+ if (clust_index != NULL) {
+ if (node->rec_type == TRX_UNDO_INSERT_REC) {
+ ptr = trx_undo_rec_get_row_ref(
+ ptr, clust_index, &node->ref,
+ node->heap);
+ } else {
+ node->ref = &trx_undo_metadata;
+ if (!row_undo_search_clust_to_pcur(node)) {
+ /* An error probably occurred during
+ an insert into the clustered index,
+ after we wrote the undo log record. */
+ goto close_table;
+ }
+ return true;
+ }
+
+ if (!row_undo_search_clust_to_pcur(node)) {
+ /* An error probably occurred during
+ an insert into the clustered index,
+ after we wrote the undo log record. */
+ goto close_table;
+ }
+ if (node->table->n_v_cols) {
+ trx_undo_read_v_cols(node->table, ptr,
+ node->row, false);
+ }
+
+ } else {
+ ib::warn() << "Table " << node->table->name
+ << " has no indexes,"
+ " ignoring the table";
+ goto close_table;
+ }
+ }
+
+ return true;
+}
+
+/***************************************************************//**
+Removes secondary index records.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_ins_remove_sec_rec(
+/*========================*/
+ undo_node_t* node, /*!< in/out: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err = DB_SUCCESS;
+ dict_index_t* index = node->index;
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(1024);
+
+ while (index != NULL) {
+ dtuple_t* entry;
+
+ if (index->type & DICT_FTS) {
+ dict_table_next_uncorrupted_index(index);
+ continue;
+ }
+
+ /* An insert undo record TRX_UNDO_INSERT_REC will
+ always contain all fields of the index. It does not
+ matter if any indexes were created afterwards; all
+ index entries can be reconstructed from the row. */
+ entry = row_build_index_entry(
+ node->row, node->ext, index, heap);
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The database must have crashed after
+ inserting a clustered index record but before
+ writing all the externally stored columns of
+ that record, or a statement is being rolled
+ back because an error occurred while storing
+ off-page columns.
+
+ Because secondary index entries are inserted
+ after the clustered index record, we may
+ assume that the secondary index record does
+ not exist. */
+ } else {
+ err = row_undo_ins_remove_sec(index, entry, thr);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto func_exit;
+ }
+ }
+
+ mem_heap_empty(heap);
+ dict_table_next_uncorrupted_index(index);
+ }
+
+func_exit:
+ node->index = index;
+ mem_heap_free(heap);
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even delete
+marked, at the time of the insert. InnoDB is eager in a rollback:
+if it figures out that an index record will be removed in the purge
+anyway, it will remove it in the rollback.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+row_undo_ins(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ bool dict_locked = node->trx->dict_operation_lock_mode == RW_X_LATCH;
+
+ if (!row_undo_ins_parse_undo_rec(node, dict_locked)) {
+ return DB_SUCCESS;
+ }
+
+	/* Iterate over all the indexes and undo the insert. */
+
+ node->index = dict_table_get_first_index(node->table);
+ ut_ad(dict_index_is_clust(node->index));
+
+ switch (node->rec_type) {
+ default:
+ ut_ad("wrong undo record type" == 0);
+ /* fall through */
+ case TRX_UNDO_INSERT_REC:
+ /* Skip the clustered index (the first index) */
+ node->index = dict_table_get_next_index(node->index);
+
+ dict_table_skip_corrupt_index(node->index);
+
+ err = row_undo_ins_remove_sec_rec(node, thr);
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ log_free_check();
+
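+		/* Rolling back an INSERT into SYS_INDEXES also frees
+		the index tree, so the modification of the data
+		dictionary cache must be covered by dict_sys.mutex. */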
+ if (node->table->id == DICT_INDEXES_ID) {
+ ut_ad(!node->table->is_temporary());
+ if (!dict_locked) {
+ mutex_enter(&dict_sys.mutex);
+ }
+ err = row_undo_ins_remove_clust_rec(node);
+ if (!dict_locked) {
+ mutex_exit(&dict_sys.mutex);
+ }
+ } else {
+ err = row_undo_ins_remove_clust_rec(node);
+ }
+
+ if (err == DB_SUCCESS && node->table->stat_initialized) {
+ /* Not protected by dict_sys.mutex for
+ performance reasons, we would rather get garbage
+ in stat_n_rows (which is just an estimate anyway)
+ than protecting the following code with a latch. */
+ dict_table_n_rows_dec(node->table);
+
+ /* Do not attempt to update statistics when
+ executing ROLLBACK in the InnoDB SQL
+ interpreter, because in that case we would
+ already be holding dict_sys.mutex, which
+ would be acquired when updating statistics. */
+ if (!dict_locked) {
+ dict_stats_update_if_needed(node->table,
+ *node->trx);
+ }
+ }
+ break;
+
+ case TRX_UNDO_INSERT_METADATA:
+ log_free_check();
+ ut_ad(!node->table->is_temporary());
+ err = row_undo_ins_remove_clust_rec(node);
+ }
+
+ dict_table_close(node->table, dict_locked, FALSE);
+
+ node->table = NULL;
+
+ return(err);
+}
diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc
new file mode 100644
index 00000000..5c4d9a3e
--- /dev/null
+++ b/storage/innobase/row/row0umod.cc
@@ -0,0 +1,1418 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0umod.cc
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0umod.h"
+#include "dict0dict.h"
+#include "dict0stats.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "trx0purge.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "ibuf0ibuf.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "row0log.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "log0log.h"
+
+/* Considerations on undoing a modify operation.
+(1) Undoing a delete marking: all index records should be found. Some of
+them may already have the delete mark set to FALSE, if the delete-mark
+operation was stopped underway, or if the undo operation ended
+prematurely because of a system crash.
+(2) Undoing an update of a delete unmarked record: the newer version of
+an updated secondary index entry should be removed if no prior version
+of the clustered index record requires its existence. Otherwise, it should
+be delete marked.
+(3) Undoing an update of a delete marked record. In this kind of update a
+delete marked clustered index record was delete unmarked and possibly also
+some of its fields were changed. Now, it is possible that the delete marked
+version has become obsolete at the time the undo is started. */
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before starting that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module, make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
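+/* A minimal sketch of the call pattern described above (illustrative
+only; the actual call sites in this module vary):
+
+	log_free_check();	// check redo log space; hold no latches here
+	mtr.start();		// only then start the mini-transaction
+	...			// page modifications that generate redo
+	mtr.commit();
+*/
+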
+/***********************************************************//**
+Undoes a modify in a clustered index record.
+@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_clust_low(
+/*===================*/
+ undo_node_t* node, /*!< in: row undo node */
+ rec_offs** offsets,/*!< out: rec_get_offsets() on the record */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: memory heap that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ const dtuple_t**rebuilt_old_pk,
+ /*!< out: row_log_table_get_pk()
+ before the update, or NULL if
+ the table is not being rebuilt online or
+ the PRIMARY KEY definition does not change */
+ byte* sys, /*!< out: DB_TRX_ID, DB_ROLL_PTR
+ for row_log_table_delete() */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in: mtr; must be committed before
+ latching any further pages */
+ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ dberr_t err;
+#ifdef UNIV_DEBUG
+ ibool success;
+#endif /* UNIV_DEBUG */
+
+ pcur = &node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+#ifdef UNIV_DEBUG
+ success =
+#endif /* UNIV_DEBUG */
+ btr_pcur_restore_position(mode, pcur, mtr);
+
+ ut_ad(success);
+ ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur),
+ btr_cur_get_index(btr_cur))
+ == thr_get_trx(thr)->id
+ || btr_cur_get_index(btr_cur)->table->is_temporary());
+ ut_ad(node->ref != &trx_undo_metadata
+ || node->update->info_bits == REC_INFO_METADATA_ADD
+ || node->update->info_bits == REC_INFO_METADATA_ALTER);
+
+ if (mode != BTR_MODIFY_LEAF
+ && dict_index_is_online_ddl(btr_cur_get_index(btr_cur))) {
+ *rebuilt_old_pk = row_log_table_get_pk(
+ btr_cur_get_rec(btr_cur),
+ btr_cur_get_index(btr_cur), NULL, sys, &heap);
+ } else {
+ *rebuilt_old_pk = NULL;
+ }
+
+ if (mode != BTR_MODIFY_TREE) {
+ ut_ad((mode & ulint(~BTR_ALREADY_S_LATCHED))
+ == BTR_MODIFY_LEAF);
+
+ err = btr_cur_optimistic_update(
+ BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, offsets, offsets_heap,
+ node->update, node->cmpl_info,
+ thr, thr_get_trx(thr)->id, mtr);
+ ut_ad(err != DB_SUCCESS || node->ref != &trx_undo_metadata);
+ } else {
+ big_rec_t* dummy_big_rec;
+
+ err = btr_cur_pessimistic_update(
+ BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, offsets, offsets_heap, heap,
+ &dummy_big_rec, node->update,
+ node->cmpl_info, thr, thr_get_trx(thr)->id, mtr);
+
+ ut_a(!dummy_big_rec);
+
+ if (err == DB_SUCCESS
+ && node->ref == &trx_undo_metadata
+ && btr_cur_get_index(btr_cur)->table->instant
+ && node->update->info_bits == REC_INFO_METADATA_ADD) {
+ btr_reset_instant(*btr_cur_get_index(btr_cur), false,
+ mtr);
+ }
+ }
+
+ if (err == DB_SUCCESS
+ && btr_cur_get_index(btr_cur)->table->id == DICT_COLUMNS_ID) {
+ /* This is rolling back an UPDATE or DELETE on SYS_COLUMNS.
+ If it was part of an instant ALTER TABLE operation, we
+ must evict the table definition, so that it can be
+ reloaded after the dictionary operation has been
+ completed. At this point, any corresponding operation
+ to the metadata record will have been rolled back. */
+ const dfield_t& table_id = *dtuple_get_nth_field(node->row, 0);
+ ut_ad(dfield_get_len(&table_id) == 8);
+ node->trx->evict_table(mach_read_from_8(static_cast<byte*>(
+ table_id.data)));
+ }
+
+ return(err);
+}
+
+/** Get the byte offset of the DB_TRX_ID column
+@param[in] rec clustered index record
+@param[in] index clustered index
+@return the byte offset of DB_TRX_ID, from the start of rec */
+static ulint row_trx_id_offset(const rec_t* rec, const dict_index_t* index)
+{
+ ut_ad(index->n_uniq <= MAX_REF_PARTS);
+ ulint trx_id_offset = index->trx_id_offset;
+ if (!trx_id_offset) {
+ /* Reserve enough offsets for the PRIMARY KEY and 2 columns
+ so that we can access DB_TRX_ID, DB_ROLL_PTR. */
+ rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2];
+ rec_offs_init(offsets_);
+ mem_heap_t* heap = NULL;
+ const ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1;
+ rec_offs* offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ trx_id_pos + 1, &heap);
+ ut_ad(!heap);
+ ulint len;
+ trx_id_offset = rec_get_nth_field_offs(
+ offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ }
+
+ return trx_id_offset;
+}
+
+/** Determine if rollback must execute a purge-like operation.
+@param[in,out] node row undo
+@param[in,out] mtr mini-transaction
+@return whether the record should be purged */
+static bool row_undo_mod_must_purge(undo_node_t* node, mtr_t* mtr)
+{
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+ ut_ad(!node->table->is_temporary());
+
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&node->pcur);
+ ut_ad(btr_cur->index->is_primary());
+ DEBUG_SYNC_C("rollback_purge_clust");
+
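+	/* S-latch purge_sys.latch until the mini-transaction commits,
+	so that the purge view cannot advance between this visibility
+	check and the modification of the record. */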
+ mtr->s_lock(&purge_sys.latch, __FILE__, __LINE__);
+
+ if (!purge_sys.changes_visible(node->new_trx_id, node->table->name)) {
+ return false;
+ }
+
+ const rec_t* rec = btr_cur_get_rec(btr_cur);
+
+ return trx_read_trx_id(rec + row_trx_id_offset(rec, btr_cur->index))
+ == node->new_trx_id;
+}
+
+/***********************************************************//**
+Undoes a modify in a clustered index record. Sets also the node state for the
+next round of undo.
+@return DB_SUCCESS or error code: we may run out of file space */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_clust(
+/*===============*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ btr_pcur_t* pcur;
+ mtr_t mtr;
+ dberr_t err;
+ dict_index_t* index;
+ bool online;
+
+ ut_ad(thr_get_trx(thr) == node->trx);
+ ut_ad(node->trx->dict_operation_lock_mode);
+ ut_ad(node->trx->in_rollback);
+ ut_ad(rw_lock_own_flagged(&dict_sys.latch,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+
+ log_free_check();
+ pcur = &node->pcur;
+ index = btr_cur_get_index(btr_pcur_get_btr_cur(pcur));
+ ut_ad(index->is_primary());
+
+ mtr.start();
+ if (index->table->is_temporary()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(mtr);
+ ut_ad(lock_table_has_locks(index->table));
+ }
+
+ online = dict_index_is_online_ddl(index);
+ if (online) {
+ ut_ad(node->trx->dict_operation_lock_mode != RW_X_LATCH);
+ mtr_s_lock_index(index, &mtr);
+ }
+
+ mem_heap_t* heap = mem_heap_create(1024);
+ mem_heap_t* offsets_heap = NULL;
+ rec_offs* offsets = NULL;
+ const dtuple_t* rebuilt_old_pk;
+ byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+
+ /* Try optimistic processing of the record, keeping changes within
+ the index page */
+
+ err = row_undo_mod_clust_low(node, &offsets, &offsets_heap,
+ heap, &rebuilt_old_pk, sys,
+ thr, &mtr, online
+ ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
+ : BTR_MODIFY_LEAF);
+
+ if (err != DB_SUCCESS) {
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ /* We may have to modify tree structure: do a pessimistic
+ descent down the index tree */
+
+ mtr.start();
+ if (index->table->is_temporary()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(mtr);
+ }
+
+ err = row_undo_mod_clust_low(
+ node, &offsets, &offsets_heap,
+ heap, &rebuilt_old_pk, sys,
+ thr, &mtr, BTR_MODIFY_TREE);
+ ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE);
+ }
+
+ /* Online rebuild cannot be initiated while we are holding
+ dict_sys.latch and index->lock. (It can be aborted.) */
+ ut_ad(online || !dict_index_is_online_ddl(index));
+
+ if (err == DB_SUCCESS && online) {
+
+ ut_ad(rw_lock_own_flagged(
+ &index->lock,
+ RW_LOCK_FLAG_S | RW_LOCK_FLAG_X
+ | RW_LOCK_FLAG_SX));
+
+ switch (node->rec_type) {
+ case TRX_UNDO_DEL_MARK_REC:
+ row_log_table_insert(
+ btr_pcur_get_rec(pcur), index, offsets);
+ break;
+ case TRX_UNDO_UPD_EXIST_REC:
+ row_log_table_update(
+ btr_pcur_get_rec(pcur), index, offsets,
+ rebuilt_old_pk);
+ break;
+ case TRX_UNDO_UPD_DEL_REC:
+ row_log_table_delete(
+ btr_pcur_get_rec(pcur), index, offsets, sys);
+ break;
+ default:
+ ut_ad(0);
+ break;
+ }
+ }
+
+	/* When scrubbing, and records get cleared, the transaction
+	id is not present afterwards. This is safe: because the record
+	is on the free list, it can be reallocated at any time after
+	the mtr commit just below. */
+ ut_ad(srv_immediate_scrub_data_uncompressed
+ || row_get_rec_trx_id(btr_pcur_get_rec(pcur), index, offsets)
+ == node->new_trx_id);
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+ DEBUG_SYNC_C("rollback_undo_pk");
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ /* FIXME: Perform the below operations in the above
+ mini-transaction when possible. */
+
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing update_undo log record. */
+ ut_ad(node->new_trx_id);
+
+ mtr.start();
+ if (!btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, &mtr)) {
+ goto mtr_commit_exit;
+ }
+
+ if (index->table->is_temporary()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ if (!row_undo_mod_must_purge(node, &mtr)) {
+ goto mtr_commit_exit;
+ }
+ index->set_modified(mtr);
+ }
+
+ ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(node->table)));
+ if (btr_cur_optimistic_delete(&pcur->btr_cur, 0, &mtr)) {
+ goto mtr_commit_exit;
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ mtr.start();
+ if (!btr_pcur_restore_position(
+ BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+ pcur, &mtr)) {
+ goto mtr_commit_exit;
+ }
+
+ if (index->table->is_temporary()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ if (!row_undo_mod_must_purge(node, &mtr)) {
+ goto mtr_commit_exit;
+ }
+ index->set_modified(mtr);
+ }
+
+ ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(node->table)));
+
+ /* This operation is analogous to purge, we can free
+ also inherited externally stored fields. We can also
+ assume that the record was complete (including BLOBs),
+ because it had been delete-marked after it had been
+ completely inserted. Therefore, we are passing
+ rollback=false, just like purge does. */
+ btr_cur_pessimistic_delete(&err, FALSE, &pcur->btr_cur, 0,
+ false, &mtr);
+ ut_ad(err == DB_SUCCESS
+ || err == DB_OUT_OF_FILE_SPACE);
+ } else if (!index->table->is_temporary() && node->new_trx_id) {
+ /* We rolled back a record so that it still exists.
+ We must reset the DB_TRX_ID if the history is no
+ longer accessible by any active read view. */
+
+ mtr.start();
+ if (!btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, &mtr)) {
+ goto mtr_commit_exit;
+ }
+ rec_t* rec = btr_pcur_get_rec(pcur);
+ mtr.s_lock(&purge_sys.latch, __FILE__, __LINE__);
+ if (!purge_sys.changes_visible(node->new_trx_id,
+ node->table->name)) {
+ goto mtr_commit_exit;
+ }
+
+ ulint trx_id_offset = index->trx_id_offset;
+ ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1;
+ /* Reserve enough offsets for the PRIMARY KEY and
+ 2 columns so that we can access DB_TRX_ID, DB_ROLL_PTR. */
+ rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2];
+ if (trx_id_offset) {
+#ifdef UNIV_DEBUG
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+ if (buf_block_get_page_zip(
+ btr_pcur_get_block(&node->pcur))) {
+ /* Below, page_zip_write_trx_id_and_roll_ptr()
+ needs offsets to access DB_TRX_ID,DB_ROLL_PTR.
+ We already computed offsets for possibly
+ another record in the clustered index.
+ Because the PRIMARY KEY is fixed-length,
+ the offsets for the PRIMARY KEY and
+ DB_TRX_ID,DB_ROLL_PTR are still valid.
+ Silence the rec_offs_validate() assertion. */
+ rec_offs_make_valid(rec, index, true, offsets);
+ }
+#endif
+ } else if (rec_is_metadata(rec, *index)) {
+ ut_ad(!buf_block_get_page_zip(btr_pcur_get_block(
+ pcur)));
+ for (unsigned i = index->first_user_field(); i--; ) {
+ trx_id_offset += index->fields[i].fixed_len;
+ }
+ } else {
+ ut_ad(index->n_uniq <= MAX_REF_PARTS);
+ rec_offs_init(offsets_);
+ offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ trx_id_pos + 2, &heap);
+ ulint len;
+ trx_id_offset = rec_get_nth_field_offs(
+ offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ }
+
+ if (trx_read_trx_id(rec + trx_id_offset) == node->new_trx_id) {
+ ut_ad(!rec_get_deleted_flag(
+ rec, dict_table_is_comp(node->table))
+ || rec_is_alter_metadata(rec, *index));
+ index->set_modified(mtr);
+ buf_block_t* block = btr_pcur_get_block(pcur);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ page_zip_write_trx_id_and_roll_ptr(
+ block, rec, offsets, trx_id_pos,
+ 0, 1ULL << ROLL_PTR_INSERT_FLAG_POS,
+ &mtr);
+ } else {
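+				/* Reset DB_TRX_ID to 0, and DB_ROLL_PTR
+				to 1ULL << ROLL_PTR_INSERT_FLAG_POS (a
+				"fresh insert" roll pointer): the first
+				byte of DB_ROLL_PTR becomes 0x80 and all
+				remaining bytes become 0. */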
+ size_t offs = page_offset(rec + trx_id_offset);
+ mtr.memset(block, offs, DATA_TRX_ID_LEN, 0);
+ offs += DATA_TRX_ID_LEN;
+ mtr.write<1,mtr_t::MAYBE_NOP>(*block,
+ block->frame
+ + offs, 0x80U);
+ mtr.memset(block, offs + 1,
+ DATA_ROLL_PTR_LEN - 1, 0);
+ }
+ }
+ } else {
+ goto func_exit;
+ }
+
+mtr_commit_exit:
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+func_exit:
+ if (offsets_heap) {
+ mem_heap_free(offsets_heap);
+ }
+ mem_heap_free(heap);
+ return(err);
+}
+
+/***********************************************************//**
+Delete marks or removes a secondary index entry if found.
+@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_del_mark_or_remove_sec_low(
+/*====================================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry */
+ ulint mode) /*!< in: latch mode BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ dberr_t err = DB_SUCCESS;
+ mtr_t mtr;
+ mtr_t mtr_vers;
+ row_search_result search_result;
+ const bool modify_leaf = mode == BTR_MODIFY_LEAF;
+
+ row_mtr_start(&mtr, index, !modify_leaf);
+
+ if (!index->is_committed()) {
+ /* The index->online_status may change if the index is
+ or was being created online, but not committed yet. It
+ is protected by index->lock. */
+ if (modify_leaf) {
+ mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+ mtr_s_lock_index(index, &mtr);
+ } else {
+ ut_ad(mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE));
+ mtr_sx_lock_index(index, &mtr);
+ }
+
+ if (row_log_online_op_try(index, entry, 0)) {
+ goto func_exit_no_pcur;
+ }
+ } else {
+ /* For secondary indexes,
+ index->online_status==ONLINE_INDEX_COMPLETE if
+ index->is_committed(). */
+ ut_ad(!dict_index_is_online_ddl(index));
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (dict_index_is_spatial(index)) {
+ if (modify_leaf) {
+ btr_cur->thr = thr;
+ mode |= BTR_RTREE_DELETE_MARK;
+ }
+ mode |= BTR_RTREE_UNDO_INS;
+ }
+
+ search_result = row_search_index_entry(index, entry, mode,
+ &pcur, &mtr);
+
+ switch (UNIV_EXPECT(search_result, ROW_FOUND)) {
+ case ROW_NOT_FOUND:
+		/* In crash recovery, the secondary index record may
+		be missing if the UPDATE did not have time to insert
+		the secondary index records before the crash; when we
+		undo that UPDATE during crash recovery, the record
+		will not be found.
+
+ In normal processing, if an update ends in a deadlock
+ before it has inserted all updated secondary index
+ records, then the undo will not find those records. */
+ goto func_exit;
+ case ROW_FOUND:
+ break;
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
+ }
+
+ /* We should remove the index record if no prior version of the row,
+ which cannot be purged yet, requires its existence. If some requires,
+ we should delete mark the record. */
+
+ mtr_vers.start();
+
+ success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur),
+ &mtr_vers);
+ ut_a(success);
+
+	/* For a temporary table, we can skip checking the older version
+	of the clustered index entry, because there is no MVCC or purge. */
+ if (node->table->is_temporary()
+ || row_vers_old_has_index_entry(
+ false, btr_pcur_get_rec(&node->pcur),
+ &mtr_vers, index, entry, 0, 0)) {
+ btr_rec_set_deleted<true>(btr_cur_get_block(btr_cur),
+ btr_cur_get_rec(btr_cur), &mtr);
+ } else {
+ /* Remove the index record */
+
+ if (dict_index_is_spatial(index)) {
+ rec_t* rec = btr_pcur_get_rec(&pcur);
+ if (rec_get_deleted_flag(rec,
+ dict_table_is_comp(index->table))) {
+ ib::error() << "Record found in index "
+					<< index->name << " is delete-marked"
+					" on update rollback.";
+ ut_ad(0);
+ }
+ }
+
+ if (modify_leaf) {
+ err = btr_cur_optimistic_delete(btr_cur, 0, &mtr)
+ ? DB_SUCCESS : DB_FAIL;
+ } else {
+ /* Passing rollback=false,
+ because we are deleting a secondary index record:
+ the distinction only matters when deleting a
+ record that contains externally stored columns. */
+ ut_ad(!index->is_primary());
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
+ false, &mtr);
+
+			/* The delete operation may fail if we have
+			little file space left. TODO: the easiest fix
+			would be to crash the database and restart
+			with more file space */
+ }
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+
+func_exit:
+ btr_pcur_close(&pcur);
+func_exit_no_pcur:
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***********************************************************//**
+Delete marks or removes a secondary index entry if found.
+NOTE that if we updated the fields of a delete-marked secondary index record
+so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot
+return to the original values because we do not know them. But this should
+not cause problems because in row0sel.cc, in queries we always retrieve the
+clustered index record or an earlier version of it, if the secondary index
+record through which we do the search is delete-marked.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_del_mark_or_remove_sec(
+/*================================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry */
+{
+ dberr_t err;
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_LEAF);
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE);
+ return(err);
+}
+
+/***********************************************************//**
+Delete unmarks a secondary index entry which must be found. It might not be
+delete-marked at the moment, but it does no harm to unmark it anyway. We also
+need to update the fields of the secondary index record if we updated its
+fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'.
+@retval DB_SUCCESS on success
+@retval DB_FAIL if BTR_MODIFY_TREE should be tried
+@retval DB_OUT_OF_FILE_SPACE when running out of tablespace
+@retval DB_DUPLICATE_KEY if the value was missing
+	and an insert would lead to a duplicate key error
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_del_unmark_sec_and_undo_update(
+/*========================================*/
+ ulint mode, /*!< in: search mode: BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
+ upd_t* update;
+ dberr_t err = DB_SUCCESS;
+ big_rec_t* dummy_big_rec;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+ const ulint flags
+ = BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG;
+ row_search_result search_result;
+ ulint orig_mode = mode;
+
+ ut_ad(trx->id != 0);
+
+ if (dict_index_is_spatial(index)) {
+		/* FIXME: Currently we do a 2-pass search for the undo,
+		to avoid delete-unmarking a wrong record when rolling
+		back a partial update. Later, we could log some info in
+ secondary index updates to avoid this. */
+ ut_ad(mode & BTR_MODIFY_LEAF);
+ mode |= BTR_RTREE_DELETE_MARK;
+ }
+
+try_again:
+ row_mtr_start(&mtr, index, !(mode & BTR_MODIFY_LEAF));
+
+ if (!index->is_committed()) {
+ /* The index->online_status may change if the index is
+ or was being created online, but not committed yet. It
+ is protected by index->lock. */
+ if (mode == BTR_MODIFY_LEAF) {
+ mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+ mtr_s_lock_index(index, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ mtr_sx_lock_index(index, &mtr);
+ }
+
+ if (row_log_online_op_try(index, entry, trx->id)) {
+ goto func_exit_no_pcur;
+ }
+ } else {
+ /* For secondary indexes,
+ index->online_status==ONLINE_INDEX_COMPLETE if
+ index->is_committed(). */
+ ut_ad(!dict_index_is_online_ddl(index));
+ }
+
+ btr_cur->thr = thr;
+
+ search_result = row_search_index_entry(index, entry, mode,
+ &pcur, &mtr);
+
+ switch (search_result) {
+ mem_heap_t* heap;
+ mem_heap_t* offsets_heap;
+ rec_offs* offsets;
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
+ case ROW_NOT_FOUND:
+ /* For spatial index, if first search didn't find an
+ undel-marked rec, try to find a del-marked rec. */
+ if (dict_index_is_spatial(index) && btr_cur->rtr_info->fd_del) {
+ if (mode != orig_mode) {
+ mode = orig_mode;
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ goto try_again;
+ }
+ }
+
+ if (index->is_committed()) {
+ /* During online secondary index creation, it
+ is possible that MySQL is waiting for a
+ meta-data lock upgrade before invoking
+ ha_innobase::commit_inplace_alter_table()
+ while this ROLLBACK is executing. InnoDB has
+ finished building the index, but it does not
+ yet exist in MySQL. In this case, we suppress
+ the printout to the error log. */
+ ib::warn() << "Record in index " << index->name
+ << " of table " << index->table->name
+ << " was not found on rollback, trying to"
+ " insert: " << *entry
+ << " at: " << rec_index_print(
+ btr_cur_get_rec(btr_cur), index);
+ }
+
+ if (btr_cur->up_match >= dict_index_get_n_unique(index)
+ || btr_cur->low_match >= dict_index_get_n_unique(index)) {
+ if (index->is_committed()) {
+ ib::warn() << "Record in index " << index->name
+ << " was not found on rollback, and"
+ " a duplicate exists";
+ }
+ err = DB_DUPLICATE_KEY;
+ break;
+ }
+
+ /* Insert the missing record that we were trying to
+ delete-unmark. */
+ big_rec_t* big_rec;
+ rec_t* insert_rec;
+ offsets = NULL;
+ offsets_heap = NULL;
+
+ err = btr_cur_optimistic_insert(
+ flags, btr_cur, &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ 0, thr, &mtr);
+ ut_ad(!big_rec);
+
+ if (err == DB_FAIL && mode == BTR_MODIFY_TREE) {
+ err = btr_cur_pessimistic_insert(
+ flags, btr_cur,
+ &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ 0, thr, &mtr);
+ /* There are no off-page columns in
+ secondary indexes. */
+ ut_ad(!big_rec);
+ }
+
+ if (err == DB_SUCCESS) {
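+			/* Update PAGE_MAX_TRX_ID, because each
+			secondary index page must track the newest
+			transaction that modified it. */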
+ page_update_max_trx_id(
+ btr_cur_get_block(btr_cur),
+ btr_cur_get_page_zip(btr_cur),
+ trx->id, &mtr);
+ }
+
+ if (offsets_heap) {
+ mem_heap_free(offsets_heap);
+ }
+
+ break;
+ case ROW_FOUND:
+ btr_rec_set_deleted<false>(btr_cur_get_block(btr_cur),
+ btr_cur_get_rec(btr_cur), &mtr);
+ heap = mem_heap_create(
+ sizeof(upd_t)
+ + dtuple_get_n_fields(entry) * sizeof(upd_field_t));
+ offsets_heap = NULL;
+ offsets = rec_get_offsets(
+ btr_cur_get_rec(btr_cur),
+ index, nullptr, index->n_core_fields, ULINT_UNDEFINED,
+ &offsets_heap);
+ update = row_upd_build_sec_rec_difference_binary(
+ btr_cur_get_rec(btr_cur), index, offsets, entry, heap);
+ if (upd_get_n_fields(update) == 0) {
+
+ /* Do nothing */
+
+ } else if (mode != BTR_MODIFY_TREE) {
+ /* Try an optimistic updating of the record, keeping
+ changes within the page */
+
+ /* TODO: pass offsets, not &offsets */
+ err = btr_cur_optimistic_update(
+ flags, btr_cur, &offsets, &offsets_heap,
+ update, 0, thr, thr_get_trx(thr)->id, &mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ default:
+ break;
+ }
+ } else {
+ err = btr_cur_pessimistic_update(
+ flags, btr_cur, &offsets, &offsets_heap,
+ heap, &dummy_big_rec,
+ update, 0, thr, thr_get_trx(thr)->id, &mtr);
+ ut_a(!dummy_big_rec);
+ }
+
+ mem_heap_free(heap);
+ mem_heap_free(offsets_heap);
+ }
+
+ btr_pcur_close(&pcur);
+func_exit_no_pcur:
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***********************************************************//**
+Flags a secondary index corrupted. */
+static MY_ATTRIBUTE((nonnull))
+void
+row_undo_mod_sec_flag_corrupted(
+/*============================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_index_t* index) /*!< in: secondary index */
+{
+ ut_ad(!dict_index_is_clust(index));
+
+ switch (trx->dict_operation_lock_mode) {
+ case RW_S_LATCH:
+ /* Because row_undo() is holding an S-latch
+ on the data dictionary during normal rollback,
+ we can only mark the index corrupted in the
+		data dictionary cache. TODO: fix this somehow. */
+ mutex_enter(&dict_sys.mutex);
+ dict_set_corrupted_index_cache_only(index);
+ mutex_exit(&dict_sys.mutex);
+ break;
+ default:
+ ut_ad(0);
+ /* fall through */
+ case RW_X_LATCH:
+ /* This should be the rollback of a data dictionary
+ transaction. */
+ dict_set_corrupted(index, trx, "rollback");
+ }
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is UPD_DEL.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_upd_del_sec(
+/*=====================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+ ut_ad(!node->undo_row);
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ dict_index_t* index = node->index;
+ dtuple_t* entry;
+
+ if (index->type & DICT_FTS) {
+ dict_table_next_uncorrupted_index(node->index);
+ continue;
+ }
+
+ /* During online index creation,
+	HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_NOCOPY_NO_LOCK
+ should guarantee that any active transaction has not modified
+ indexed columns such that col->ord_part was 0 at the
+ time when the undo log record was written. When we get
+ to roll back an undo log entry TRX_UNDO_DEL_MARK_REC,
+ it should always cover all affected indexes. */
+ entry = row_build_index_entry(
+ node->row, node->ext, index, heap);
+
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The database must have crashed after
+ inserting a clustered index record but before
+ writing all the externally stored columns of
+ that record. Because secondary index entries
+ are inserted after the clustered index record,
+ we may assume that the secondary index record
+ does not exist. However, this situation may
+ only occur during the rollback of incomplete
+ transactions. */
+ ut_a(thr_get_trx(thr) == trx_roll_crash_recv_trx);
+ } else {
+ err = row_undo_mod_del_mark_or_remove_sec(
+ node, thr, index, entry);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+
+ break;
+ }
+ }
+
+ mem_heap_empty(heap);
+ dict_table_next_uncorrupted_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is DEL_MARK.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_del_mark_sec(
+/*======================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(!node->undo_row);
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ dict_index_t* index = node->index;
+ dtuple_t* entry;
+
+ if (index->type == DICT_FTS) {
+ dict_table_next_uncorrupted_index(node->index);
+ continue;
+ }
+
+ /* During online index creation,
+ HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_NOCOPY_NO_LOCK
+ should guarantee that any active transaction has not modified
+ indexed columns such that col->ord_part was 0 at the
+ time when the undo log record was written. When we get
+ to roll back an undo log entry TRX_UNDO_DEL_MARK_REC,
+ it should always cover all affected indexes. */
+ entry = row_build_index_entry(
+ node->row, node->ext, index, heap);
+
+ ut_a(entry);
+
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_LEAF, thr, index, entry);
+ if (err == DB_FAIL) {
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_TREE, thr, index, entry);
+ }
+
+ if (err == DB_DUPLICATE_KEY) {
+ row_undo_mod_sec_flag_corrupted(
+ thr_get_trx(thr), index);
+ err = DB_SUCCESS;
+ /* Do not return any error to the caller. The
+ duplicate will be reported by ALTER TABLE or
+ CREATE UNIQUE INDEX. Unfortunately we cannot
+ report the duplicate key value to the DDL
+ thread, because the altered_table object is
+ private to its call stack. */
+ } else if (err != DB_SUCCESS) {
+ break;
+ }
+
+ mem_heap_empty(heap);
+ dict_table_next_uncorrupted_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is UPD_EXIST.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_upd_exist_sec(
+/*=======================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dberr_t err = DB_SUCCESS;
+
+ if (node->index == NULL
+	    || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ /* No change in secondary indexes */
+
+ return(err);
+ }
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ dict_index_t* index = node->index;
+ dtuple_t* entry;
+
+ if (dict_index_is_spatial(index)) {
+ if (!row_upd_changes_ord_field_binary_func(
+ index, node->update,
+#ifdef UNIV_DEBUG
+ thr,
+#endif /* UNIV_DEBUG */
+ node->row,
+ node->ext, ROW_BUILD_FOR_UNDO)) {
+ dict_table_next_uncorrupted_index(node->index);
+ continue;
+ }
+ } else {
+ if (index->type == DICT_FTS
+ || !row_upd_changes_ord_field_binary(index,
+ node->update,
+ thr, node->row,
+ node->ext)) {
+ dict_table_next_uncorrupted_index(node->index);
+ continue;
+ }
+ }
+
+ /* Build the newest version of the index entry */
+ entry = row_build_index_entry(node->row, node->ext,
+ index, heap);
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The server must have crashed in
+ row_upd_clust_rec_by_insert() before
+ the updated externally stored columns (BLOBs)
+ of the new clustered index entry were written. */
+
+ /* The table must be in DYNAMIC or COMPRESSED
+ format. REDUNDANT and COMPACT formats
+ store a local 768-byte prefix of each
+ externally stored column. */
+ ut_a(dict_table_has_atomic_blobs(index->table));
+
+ /* This is only legitimate when
+ rolling back an incomplete transaction
+ after crash recovery. */
+ ut_a(thr_get_trx(thr)->is_recovered);
+
+ /* The server must have crashed before
+ completing the insert of the new
+ clustered index entry and before
+ inserting to the secondary indexes.
+ Because node->row was not yet written
+ to this index, we can ignore it. But
+ we must restore node->undo_row. */
+ } else {
+ /* NOTE that if we updated the fields of a
+ delete-marked secondary index record so that
+ alphabetically they stayed the same, e.g.,
+ 'abc' -> 'aBc', we cannot return to the
+ original values because we do not know them.
+ But this should not cause problems because
+ in row0sel.cc, in queries we always retrieve
+ the clustered index record or an earlier
+ version of it, if the secondary index record
+ through which we do the search is
+ delete-marked. */
+
+ err = row_undo_mod_del_mark_or_remove_sec(
+ node, thr, index, entry);
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ }
+
+ mem_heap_empty(heap);
+ /* We may have to update the delete mark in the
+ secondary index record of the previous version of
+ the row. We also need to update the fields of
+ the secondary index record if we updated its fields
+ but alphabetically they stayed the same, e.g.,
+ 'abc' -> 'aBc'. */
+ if (dict_index_is_spatial(index)) {
+ entry = row_build_index_entry_low(node->undo_row,
+ node->undo_ext,
+ index, heap,
+ ROW_BUILD_FOR_UNDO);
+ } else {
+ entry = row_build_index_entry(node->undo_row,
+ node->undo_ext,
+ index, heap);
+ }
+
+ ut_a(entry);
+
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_LEAF, thr, index, entry);
+ if (err == DB_FAIL) {
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_TREE, thr, index, entry);
+ }
+
+ if (err == DB_DUPLICATE_KEY) {
+ row_undo_mod_sec_flag_corrupted(
+ thr_get_trx(thr), index);
+ err = DB_SUCCESS;
+ } else if (err != DB_SUCCESS) {
+ break;
+ }
+
+ mem_heap_empty(heap);
+ dict_table_next_uncorrupted_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/** Parse an update undo record.
+@param[in,out] node row rollback state
+@param[in] dict_locked whether the data dictionary cache is locked */
+static bool row_undo_mod_parse_undo_rec(undo_node_t* node, bool dict_locked)
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ byte info_bits;
+ ulint type;
+ ulint cmpl_info;
+ bool dummy_extern;
+
+ ut_ad(node->state == UNDO_UPDATE_PERSISTENT
+ || node->state == UNDO_UPDATE_TEMPORARY);
+ ut_ad(node->trx->in_rollback);
+ ut_ad(!trx_undo_roll_ptr_is_insert(node->roll_ptr));
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+ node->rec_type = type;
+
+ if (node->state == UNDO_UPDATE_PERSISTENT) {
+ node->table = dict_table_open_on_id(table_id, dict_locked,
+ DICT_TABLE_OP_NORMAL);
+ } else if (!dict_locked) {
+ mutex_enter(&dict_sys.mutex);
+ node->table = dict_sys.get_temporary_table(table_id);
+ mutex_exit(&dict_sys.mutex);
+ } else {
+ node->table = dict_sys.get_temporary_table(table_id);
+ }
+
+ if (!node->table) {
+ return false;
+ }
+
+ ut_ad(!node->table->skip_alter_undo);
+
+ if (UNIV_UNLIKELY(!node->table->is_accessible())) {
+close_table:
+ /* Normally, tables should not disappear or become
+		inaccessible during ROLLBACK, because they should be
+ protected by InnoDB table locks. Corruption could be
+ a valid exception.
+
+ FIXME: When running out of temporary tablespace, it
+ would probably be better to just drop all temporary
+ tables (and temporary undo log records) of the current
+ connection, instead of doing this rollback. */
+ dict_table_close(node->table, dict_locked, FALSE);
+ node->table = NULL;
+ return false;
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
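+	/* An update undo record stores, in this order: the DB_TRX_ID,
+	DB_ROLL_PTR and info bits of the old record version, then the
+	PRIMARY KEY value, and then the update vector. */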
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits,
+ node->heap, &(node->update));
+ node->new_trx_id = trx_id;
+ node->cmpl_info = cmpl_info;
+ ut_ad(!node->ref->info_bits);
+
+ if (node->update->info_bits & REC_INFO_MIN_REC_FLAG) {
+ if ((node->update->info_bits & ~REC_INFO_DELETED_FLAG)
+ != REC_INFO_MIN_REC_FLAG) {
+ ut_ad("wrong info_bits in undo log record" == 0);
+ goto close_table;
+ }
+ /* This must be an undo log record for a subsequent
+ instant ALTER TABLE, extending the metadata record. */
+ ut_ad(clust_index->is_instant());
+ ut_ad(clust_index->table->instant
+ || !(node->update->info_bits & REC_INFO_DELETED_FLAG));
+ node->ref = &trx_undo_metadata;
+ node->update->info_bits = (node->update->info_bits
+ & REC_INFO_DELETED_FLAG)
+ ? REC_INFO_METADATA_ALTER
+ : REC_INFO_METADATA_ADD;
+ }
+
+ if (!row_undo_search_clust_to_pcur(node)) {
+ /* As long as this rolling-back transaction exists,
+ the PRIMARY KEY value pointed to by the undo log
+ record should exist.
+
+ However, if InnoDB is killed during a rollback, or
+ shut down during the rollback of recovered
+ transactions, then after restart we may try to roll
+ back some of the same undo log records again, because
+ trx_roll_try_truncate() is not being invoked after
+ every undo log record.
+
+ It is also possible that the record
+ was not modified yet (the DB_ROLL_PTR does not match
+ node->roll_ptr) and thus there is nothing to roll back.
+
+ btr_cur_upd_lock_and_undo() only writes the undo log
+ record after successfully acquiring an exclusive lock
+		on the clustered index record. That lock will not
+ be released before the transaction is committed or
+ fully rolled back. (Exception: if the server was
+ killed, restarted, and shut down again before the
+ rollback of the recovered transaction was completed,
+ it is possible that the transaction was partially
+ rolled back and locks released.) */
+ goto close_table;
+ }
+
+ /* Extract indexed virtual columns from undo log */
+ if (node->ref != &trx_undo_metadata && node->table->n_v_cols) {
+ row_upd_replace_vcol(node->row, node->table,
+ node->update, false, node->undo_row,
+ (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)
+ ? NULL : ptr);
+ }
+
+ return true;
+}
+
+/***********************************************************//**
+Undoes a modify operation on a row of a table.
+@return DB_SUCCESS or error code */
+dberr_t
+row_undo_mod(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ ut_ad(thr_get_trx(thr) == node->trx);
+ const bool dict_locked = node->trx->dict_operation_lock_mode
+ == RW_X_LATCH;
+
+ if (!row_undo_mod_parse_undo_rec(node, dict_locked)) {
+ return DB_SUCCESS;
+ }
+
+ node->index = dict_table_get_first_index(node->table);
+ ut_ad(dict_index_is_clust(node->index));
+
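+	/* The metadata pseudo-record has no counterpart in secondary
+	indexes; skip directly to the clustered index rollback. */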
+ if (node->ref->info_bits) {
+ ut_ad(node->ref->is_metadata());
+ goto rollback_clust;
+ }
+
+ /* Skip the clustered index (the first index) */
+ node->index = dict_table_get_next_index(node->index);
+
+ /* Skip all corrupted secondary index */
+ dict_table_skip_corrupt_index(node->index);
+
+ switch (node->rec_type) {
+ case TRX_UNDO_UPD_EXIST_REC:
+ err = row_undo_mod_upd_exist_sec(node, thr);
+ break;
+ case TRX_UNDO_DEL_MARK_REC:
+ err = row_undo_mod_del_mark_sec(node, thr);
+ break;
+ case TRX_UNDO_UPD_DEL_REC:
+ err = row_undo_mod_upd_del_sec(node, thr);
+ break;
+ default:
+ ut_error;
+ err = DB_ERROR;
+ }
+
+ if (err == DB_SUCCESS) {
+rollback_clust:
+ err = row_undo_mod_clust(node, thr);
+
+ bool update_statistics
+ = !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE);
+
+ if (err == DB_SUCCESS && node->table->stat_initialized) {
+ switch (node->rec_type) {
+ case TRX_UNDO_UPD_EXIST_REC:
+ break;
+ case TRX_UNDO_DEL_MARK_REC:
+ dict_table_n_rows_inc(node->table);
+ update_statistics = update_statistics
+ || !srv_stats_include_delete_marked;
+ break;
+ case TRX_UNDO_UPD_DEL_REC:
+ dict_table_n_rows_dec(node->table);
+ update_statistics = update_statistics
+ || !srv_stats_include_delete_marked;
+ break;
+ }
+
+ /* Do not attempt to update statistics when
+ executing ROLLBACK in the InnoDB SQL
+ interpreter, because in that case we would
+ already be holding dict_sys.mutex, which
+ would be acquired when updating statistics. */
+ if (update_statistics && !dict_locked) {
+ dict_stats_update_if_needed(node->table,
+ *node->trx);
+ } else {
+ node->table->stat_modified_counter++;
+ }
+ }
+ }
+
+ dict_table_close(node->table, dict_locked, FALSE);
+
+ node->table = NULL;
+
+ return(err);
+}
diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc
new file mode 100644
index 00000000..3ac8e434
--- /dev/null
+++ b/storage/innobase/row/row0undo.cc
@@ -0,0 +1,491 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0undo.cc
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0undo.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0uins.h"
+#include "row0umod.h"
+#include "row0upd.h"
+#include "row0mysql.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+
+/* How to undo row operations?
+(1) For an insert, we have stored a prefix of the clustered index record
+in the undo log. Using it, we look for the clustered record, and using
+that we look for the records in the secondary indexes. The insert operation
+may have been left incomplete, if the database crashed, for example.
+We may have to look at the trx id and roll ptr to make sure the record in the
+clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row?
+Then the row id changes, and so does the roll ptr. What if the row id was not
+part of the ordering fields in the clustered index? Maybe we have to write
+it to undo log. Well, maybe not, because if we order the row id and trx id
+in descending order, then the only undeleted copy is the first in the
+index. Our searches in row operations always position the cursor before
+the first record in the result set. But, if there is no key defined for
+a table, then it would be desirable that row id is in ascending order.
+So, let's store the row id in descending order only if it is not an ordering
+field in the clustered index.
+
+NOTE: Deletes and inserts may lead to a situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+ord fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr
+in node pointers of B-tree.
+(2) Fix in secondary indexes: include all fields in node pointers, and
+if an entry is inserted, check if it is equal to the right neighbor,
+in which case update the right neighbor: the neighbor must be delete
+marked, set it unmarked and write the trx id of the current transaction.
+
+What if the same trx repeatedly updates the same row, updating a secondary
+index field or not? Updating a clustered index ordering field?
+
+(1) If it updates neither the secondary index nor the clustered index
+ord field. Then the secondary index record stays unchanged, but the
+trx id in the secondary index record may be smaller than in the clustered
+index record. This is no problem?
+(2) If it updates secondary index ord field but not clustered: then in
+secondary index there are delete marked records, which differ in an
+ord field. No problem.
+(3) Updates clustered ord field but not secondary, and secondary index
+is unique. Then the record in secondary index is just updated at the
+clustered ord field.
+(4)
+
+Problem with duplicate records:
+Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
+bigger trx id has inserted and delete marked a similar row, our trx inserts
+again a similar row, and a trx with an even bigger id delete marks it. Then
+the position of the row should change in the index if the trx id affects
+the alphabetical ordering.
+
+Fix 2: If an insert encounters a similar row marked deleted, we turn the
+insert into an 'update' of the row marked deleted. Then we must write undo
+info on the update. A problem: what if a purge operation tries to remove
+the delete marked row?
+
+We can think of the database row versions as a linked list which starts
+from the record in the clustered index, and is linked by roll ptrs
+through undo logs. The secondary index records are references which tell
+what kinds of records can be found in this linked list for a record
+in the clustered index.
+
+How to do the purge? A record can be removed from the clustered index
+if its linked list becomes empty, i.e., the row has been marked deleted
+and its roll ptr points to the record in the undo log we are going through,
+doing the purge. Similarly, during a rollback, a record can be removed
+if the stored roll ptr in the undo log points to a trx already (being) purged,
+or if the roll ptr is NULL, i.e., it was a fresh insert. */
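+
+/* A minimal sketch (not actual InnoDB code) of walking the version list
+described above; the helper names are illustrative only. Each version's
+roll ptr leads to the undo log record from which the previous version of
+the row can be rebuilt:
+
+	const rec_t*	version = clust_rec;
+
+	while (version != NULL) {
+		roll_ptr_t	ptr = rec_get_roll_ptr(version);
+
+		if (roll_ptr_is_insert(ptr)) {
+			break;	// fresh insert: no older version exists
+		}
+
+		// rebuild the previous version from the undo log record
+		version = undo_rec_apply(ptr, version);
+	}
+*/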
+
+/********************************************************************//**
+Creates a row undo node for a query graph.
+@return own: undo node */
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ undo_node_t* undo;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)
+ || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)
+ || trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(parent);
+
+ undo = static_cast<undo_node_t*>(
+ mem_heap_alloc(heap, sizeof(undo_node_t)));
+
+ undo->common.type = QUE_NODE_UNDO;
+ undo->common.parent = parent;
+
+ undo->state = UNDO_NODE_FETCH_NEXT;
+ undo->trx = trx;
+
+ btr_pcur_init(&(undo->pcur));
+
+ undo->heap = mem_heap_create(256);
+
+ return(undo);
+}
+
+/***********************************************************//**
+Looks for the clustered index record when the node has the row reference.
+The pcur in the node is used in the search. If found, stores the row in the
+node, stores the position of the pcur, and detaches it. The pcur must be
+closed by the caller in any case.
+@return true if found; NOTE that node->pcur must be closed by the
+caller, regardless of the return value */
+bool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ undo_node_t* node) /*!< in/out: row undo node */
+{
+ dict_index_t* clust_index;
+ bool found;
+ mtr_t mtr;
+ row_ext_t** ext;
+ const rec_t* rec;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(!node->table->skip_alter_undo);
+
+ mtr_start(&mtr);
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ found = row_search_on_row_ref(&node->pcur, BTR_MODIFY_LEAF,
+ node->table, node->ref, &mtr);
+
+ if (!found) {
+ goto func_exit;
+ }
+
+ rec = btr_pcur_get_rec(&node->pcur);
+
+ offsets = rec_get_offsets(rec, clust_index, offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
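+	/* The clustered index record is the one to roll back only if its
+	roll ptr matches the undo log record being processed. */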
+ found = row_get_rec_roll_ptr(rec, clust_index, offsets)
+ == node->roll_ptr;
+
+ if (found) {
+ ut_ad(row_get_rec_trx_id(rec, clust_index, offsets)
+ == node->trx->id || node->table->is_temporary());
+
+ if (dict_table_has_atomic_blobs(node->table)) {
+ /* There is no prefix of externally stored
+ columns in the clustered index record. Build a
+ cache of column prefixes. */
+ ext = &node->ext;
+ } else {
+ /* REDUNDANT and COMPACT formats store a local
+ 768-byte prefix of each externally stored
+ column. No cache is needed. */
+ ext = NULL;
+ node->ext = NULL;
+ }
+
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec,
+ offsets, NULL,
+ NULL, NULL, ext, node->heap);
+
+		/* We will need to parse virtual column info out of the
+		undo log. First mark the columns DATA_MISSING, so that
+		we will know if a value gets updated. */
+ if (node->table->n_v_cols
+ && (node->state == UNDO_UPDATE_PERSISTENT
+ || node->state == UNDO_UPDATE_TEMPORARY)
+ && !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ for (ulint i = 0;
+ i < dict_table_get_n_v_cols(node->table); i++) {
+ dfield_get_type(dtuple_get_nth_v_field(
+ node->row, i))->mtype = DATA_MISSING;
+ }
+ }
+
+ if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+ ut_ad((node->row->info_bits & ~REC_INFO_DELETED_FLAG)
+ == REC_INFO_MIN_REC_FLAG
+ || node->row->info_bits == 0);
+ node->undo_row = dtuple_copy(node->row, node->heap);
+ row_upd_replace(node->undo_row, &node->undo_ext,
+ clust_index, node->update, node->heap);
+ } else {
+ ut_ad(((node->row->info_bits & ~REC_INFO_DELETED_FLAG)
+ == REC_INFO_MIN_REC_FLAG)
+ == (node->rec_type == TRX_UNDO_INSERT_METADATA));
+ node->undo_row = NULL;
+ node->undo_ext = NULL;
+ }
+
+ btr_pcur_store_position(&node->pcur, &mtr);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+func_exit:
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+ return(found);
+}
+
+/** Try to truncate the undo logs.
+@param[in,out] trx transaction */
+static void row_undo_try_truncate(trx_t* trx)
+{
+ if (trx_undo_t* undo = trx->rsegs.m_redo.undo) {
+ ut_ad(undo->rseg == trx->rsegs.m_redo.rseg);
+ trx_undo_truncate_end(*undo, trx->undo_no, false);
+ }
+
+ if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
+ ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg);
+ trx_undo_truncate_end(*undo, trx->undo_no, true);
+ }
+}
+
+/** Get the latest undo log record for rollback.
+@param[in,out] node rollback context
+@return whether an undo log record was fetched */
+static bool row_undo_rec_get(undo_node_t* node)
+{
+ trx_t* trx = node->trx;
+
+ if (trx->pages_undone) {
+ trx->pages_undone = 0;
+ row_undo_try_truncate(trx);
+ }
+
+ trx_undo_t* undo = NULL;
+ trx_undo_t* update = trx->rsegs.m_redo.undo;
+ trx_undo_t* temp = trx->rsegs.m_noredo.undo;
+ const undo_no_t limit = trx->roll_limit;
+
+ ut_ad(!update || !temp || update->empty() || temp->empty()
+ || update->top_undo_no != temp->top_undo_no);
+
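+	/* Pick the undo log (persistent or temporary) whose latest record
+	has the largest undo number, so that the records of both logs are
+	undone in reverse order of their creation. */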
+ if (update && !update->empty() && update->top_undo_no >= limit) {
+ if (!undo) {
+ undo = update;
+ } else if (undo->top_undo_no < update->top_undo_no) {
+ undo = update;
+ }
+ }
+
+ if (temp && !temp->empty() && temp->top_undo_no >= limit) {
+ if (!undo) {
+ undo = temp;
+ } else if (undo->top_undo_no < temp->top_undo_no) {
+ undo = temp;
+ }
+ }
+
+ if (undo == NULL) {
+ row_undo_try_truncate(trx);
+ /* Mark any ROLLBACK TO SAVEPOINT completed, so that
+ if the transaction object is committed and reused
+ later, we will default to a full ROLLBACK. */
+ trx->roll_limit = 0;
+ trx->in_rollback = false;
+ return false;
+ }
+
+ ut_ad(!undo->empty());
+ ut_ad(limit <= undo->top_undo_no);
+
+ node->roll_ptr = trx_undo_build_roll_ptr(
+ false, undo->rseg->id, undo->top_page_no, undo->top_offset);
+
+ mtr_t mtr;
+ mtr.start();
+
+ buf_block_t* undo_page = trx_undo_page_get_s_latched(
+ page_id_t(undo->rseg->space->id, undo->top_page_no), &mtr);
+
+ uint16_t offset = undo->top_offset;
+
+ buf_block_t* prev_page = undo_page;
+ if (trx_undo_rec_t* prev_rec = trx_undo_get_prev_rec(
+ prev_page, offset, undo->hdr_page_no, undo->hdr_offset,
+ true, &mtr)) {
+ if (prev_page != undo_page) {
+ trx->pages_undone++;
+ }
+
+ undo->top_page_no = prev_page->page.id().page_no();
+ undo->top_offset = page_offset(prev_rec);
+ undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec);
+ ut_ad(!undo->empty());
+ } else {
+ undo->top_undo_no = IB_ID_MAX;
+ ut_ad(undo->empty());
+ }
+
+ node->undo_rec = trx_undo_rec_copy(undo_page->frame + offset,
+ node->heap);
+ mtr.commit();
+
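+	/* Classify the undo log record, so that row_undo() can dispatch
+	to the insert or modify rollback routine. */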
+ switch (trx_undo_rec_get_type(node->undo_rec)) {
+ case TRX_UNDO_INSERT_METADATA:
+ /* This record type was introduced in MDEV-11369
+ instant ADD COLUMN, which was implemented after
+ MDEV-12288 removed the insert_undo log. There is no
+ instant ADD COLUMN for temporary tables. Therefore,
+ this record can only be present in the main undo log. */
+ /* fall through */
+ case TRX_UNDO_RENAME_TABLE:
+ ut_ad(undo == update);
+ /* fall through */
+ case TRX_UNDO_INSERT_REC:
+ node->roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS;
+ node->state = undo == temp
+ ? UNDO_INSERT_TEMPORARY : UNDO_INSERT_PERSISTENT;
+ break;
+ default:
+ node->state = undo == temp
+ ? UNDO_UPDATE_TEMPORARY : UNDO_UPDATE_PERSISTENT;
+ break;
+ }
+
+ trx->undo_no = node->undo_no = trx_undo_rec_get_undo_no(
+ node->undo_rec);
+ return true;
+}
+
+/***********************************************************//**
+Fetches an undo log record and does the undo for the recorded operation.
+If none are left, or a partial rollback has completed, returns control to the
+parent node, which is always a query thread node.
+@return DB_SUCCESS if operation successfully completed, else error code */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_undo(
+/*=====*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(node->trx->in_rollback);
+
+ if (node->state == UNDO_NODE_FETCH_NEXT && !row_undo_rec_get(node)) {
+ /* Rollback completed for this query thread */
+ thr->run_node = que_node_get_parent(node);
+ return DB_SUCCESS;
+ }
+
+ /* Prevent prepare_inplace_alter_table_dict() from adding
+ dict_table_t::indexes while we are processing the record.
+ Recovered transactions are not protected by MDL, and the
+ secondary index creation is not protected by table locks
+ for online operation. (A table lock would only be acquired
+ when committing the ALTER TABLE operation.) */
+ trx_t* trx = node->trx;
+ const bool locked_data_dict = !trx->dict_operation_lock_mode;
+
+ if (UNIV_UNLIKELY(locked_data_dict)) {
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ dberr_t err;
+
+ switch (node->state) {
+ case UNDO_INSERT_PERSISTENT:
+ case UNDO_INSERT_TEMPORARY:
+ err = row_undo_ins(node, thr);
+ break;
+ case UNDO_UPDATE_PERSISTENT:
+ case UNDO_UPDATE_TEMPORARY:
+ err = row_undo_mod(node, thr);
+ break;
+ default:
+ ut_ad("wrong state" == 0);
+ err = DB_CORRUPTION;
+ }
+
+ if (locked_data_dict) {
+
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+ btr_pcur_close(&(node->pcur));
+
+ mem_heap_empty(node->heap);
+
+ thr->run_node = node;
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_undo_step(
+/*==========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ undo_node_t* node;
+ trx_t* trx = thr_get_trx(thr);
+
+ node = static_cast<undo_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
+
+ if (UNIV_UNLIKELY(trx_get_dict_operation(trx) == TRX_DICT_OP_NONE
+ && !srv_undo_sources
+ && srv_shutdown_state != SRV_SHUTDOWN_NONE)
+ && (srv_fast_shutdown == 3 || trx == trx_roll_crash_recv_trx)) {
+ /* Shutdown has been initiated. */
+ trx->error_state = DB_INTERRUPTED;
+ return NULL;
+ }
+
+ if (UNIV_UNLIKELY(trx == trx_roll_crash_recv_trx)) {
+ trx_roll_report_progress();
+ }
+
+ err = row_undo(node, thr);
+
+#ifdef ENABLED_DEBUG_SYNC
+ if (trx->mysql_thd) {
+ DEBUG_SYNC_C("trx_after_rollback_row");
+ }
+#endif /* ENABLED_DEBUG_SYNC */
+
+ trx->error_state = err;
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ ib::fatal() << "Error (" << err << ") in rollback.";
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc
new file mode 100644
index 00000000..3792aab4
--- /dev/null
+++ b/storage/innobase/row/row0upd.cc
@@ -0,0 +1,3237 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0upd.cc
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0upd.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "trx0undo.h"
+#include "rem0rec.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "mach0data.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0ins.h"
+#include "row0log.h"
+#include "row0row.h"
+#include "row0sel.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0sym.h"
+#include "eval0eval.h"
+#include "buf0lru.h"
+#include "trx0rec.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include <algorithm>
+#include <mysql/plugin.h>
+#include <mysql/service_wsrep.h>
+#ifdef WITH_WSREP
+#include "log.h"
+#include "wsrep.h"
+#endif /* WITH_WSREP */
+
+
+/* What kind of latch and lock can we assume when the control comes to
+ -------------------------------------------------------------------
+an update node?
+--------------
+Efficiency of massive updates would require keeping an x-latch on a
+clustered index page through many updates, and not setting an explicit
+x-lock on clustered index records, as they anyway will get an implicit
+x-lock when they are updated. A problem is that the read nodes in the
+graph should know that they must keep the latch when passing the control
+up to the update node, and not set any record lock on the record which
+will be updated. Another problem occurs if the execution is stopped,
+as the kernel switches to another query thread, or the transaction must
+wait for a lock. Then we should be able to release the latch and, maybe,
+acquire an explicit x-lock on the record.
+ Because this seems too complicated, we conclude that the less
+efficient solution of releasing all the latches when the control is
+transferred to another node, and acquiring explicit x-locks, is better. */
+
+/* How is a delete performed? If there is a delete without an
+explicit cursor, i.e., a searched delete, there are at least
+two different situations:
+the implicit select cursor may run on (1) the clustered index or
+on (2) a secondary index. The delete is performed by setting
+the delete bit in the record and substituting the id of the
+deleting transaction for the original trx id, and substituting a
+new roll ptr for previous roll ptr. The old trx id and roll ptr
+are saved in the undo log record. Thus, no physical changes occur
+in the index tree structure at the time of the delete. Only
+when the undo log is purged will the index records be physically
+deleted from the index trees.
+
+The query graph executing a searched delete would consist of
+a delete node which has as a subtree a select subgraph.
+The select subgraph should return a (persistent) cursor
+in the clustered index, placed on page which is x-latched.
+The delete node should look for all secondary index records for
+this clustered index entry and mark them as deleted. When is
+the x-latch freed? The most efficient way for performing a
+searched delete is obviously to keep the x-latch for several
+steps of query graph execution. */
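+
+/* A minimal sketch (not actual InnoDB code) of the delete marking
+described above; the helper names are illustrative only:
+
+	// save the old system columns in the undo log record
+	undo_log_write(rec_get_trx_id(rec), rec_get_roll_ptr(rec));
+
+	set_delete_mark(rec);			// only flip the delete bit
+	rec_set_trx_id(rec, trx->id);		// deleting trx takes ownership
+	rec_set_roll_ptr(rec, new_roll_ptr);	// points to the new undo record
+*/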
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log for that operation before starting it.
+This is done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
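+
+/* A minimal sketch of the pattern prescribed above, around a hypothetical
+operation; the essential point is that log_free_check() is called while no
+latches are held:
+
+	log_free_check();	// may wait for a checkpoint; no latches held
+
+	mtr_t	mtr;
+	mtr.start();		// only now acquire latches and generate redo
+	...			// perform the operation that writes redo
+	mtr.commit();
+*/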
+
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks, and we can assume
+that the index does not contain column prefixes.
+@return TRUE if changes */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+ dtuple_t* entry, /*!< in: old value of index entry */
+ dict_index_t* index, /*!< in: index of entry */
+ const upd_t* update, /*!< in: update vector for the row */
+ ulint n); /*!< in: how many first fields to check */
+
+/*********************************************************************//**
+Checks if the index is currently mentioned as a referenced index in a foreign
+key constraint.
+
+NOTE that since we do not hold dict_sys.latch when leaving the
+function, it may be that the referencing table has been dropped when
+we leave this function: this function is only for heuristic use!
+
+@return true if referenced */
+static
+bool
+row_upd_index_is_referenced(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx) /*!< in: transaction */
+{
+ dict_table_t* table = index->table;
+
+ if (table->referenced_set.empty()) {
+ return false;
+ }
+
+ const bool froze_data_dict = !trx->dict_operation_lock_mode;
+ if (froze_data_dict) {
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ dict_foreign_set::iterator it
+ = std::find_if(table->referenced_set.begin(),
+ table->referenced_set.end(),
+ dict_foreign_with_index(index));
+
+ const bool is_referenced = (it != table->referenced_set.end());
+
+ if (froze_data_dict) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ return is_referenced;
+}
+
+#ifdef WITH_WSREP
+static
+ibool
+wsrep_row_upd_index_is_foreign(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx) /*!< in: transaction */
+{
+ dict_table_t* table = index->table;
+ ibool froze_data_dict = FALSE;
+ ibool is_referenced = FALSE;
+
+ if (table->foreign_set.empty()) {
+ return(FALSE);
+ }
+
+ if (trx->dict_operation_lock_mode == 0) {
+ row_mysql_freeze_data_dictionary(trx);
+ froze_data_dict = TRUE;
+ }
+
+ dict_foreign_set::iterator it
+ = std::find_if(table->foreign_set.begin(),
+ table->foreign_set.end(),
+ dict_foreign_with_foreign_index(index));
+
+ is_referenced = (it != table->foreign_set.end());
+
+ if (froze_data_dict) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ return(is_referenced);
+}
+#endif /* WITH_WSREP */
+
+/*********************************************************************//**
+Checks if possible foreign key constraints hold after a delete of the record
+under pcur.
+
+NOTE that this function will temporarily commit mtr and lose the
+pcur position!
+
+@return DB_SUCCESS or an error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_check_references_constraints(
+/*=================================*/
+ upd_node_t* node, /*!< in: row update node */
+ btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the
+ cursor position is lost in this function! */
+ dict_table_t* table, /*!< in: table in question */
+ dict_index_t* index, /*!< in: index of the cursor */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_foreign_t* foreign;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ trx_t* trx;
+ const rec_t* rec;
+ dberr_t err;
+ ibool got_s_lock = FALSE;
+
+ DBUG_ENTER("row_upd_check_references_constraints");
+
+ if (table->referenced_set.empty()) {
+ DBUG_RETURN(DB_SUCCESS);
+ }
+
+ trx = thr_get_trx(thr);
+
+ rec = btr_pcur_get_rec(pcur);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ heap = mem_heap_create(500);
+
+ entry = row_rec_to_index_entry(rec, index, offsets, heap);
+
+ mtr_commit(mtr);
+
+ DEBUG_SYNC_C("foreign_constraint_check_for_update");
+
+ mtr->start();
+
+ if (trx->dict_operation_lock_mode == 0) {
+ got_s_lock = TRUE;
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "foreign_constraint_check_for_insert");
+
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ /* Note that we may have an update which updates the index
+ record, but does NOT update the first fields which are
+ referenced in a foreign key constraint. Then the update does
+ NOT break the constraint. */
+
+ if (foreign->referenced_index == index
+ && (node->is_delete
+ || row_upd_changes_first_fields_binary(
+ entry, index, node->update,
+ foreign->n_fields))) {
+ dict_table_t* foreign_table = foreign->foreign_table;
+
+ dict_table_t* ref_table = NULL;
+
+ if (foreign_table == NULL) {
+
+ ref_table = dict_table_open_on_name(
+ foreign->foreign_table_name_lookup,
+ FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+ }
+
+ if (foreign_table) {
+ foreign_table->inc_fk_checks();
+ }
+
+ /* NOTE that if the thread ends up waiting for a lock
+ we will release dict_sys.latch temporarily!
+ But the inc_fk_checks() protects foreign_table from
+ being dropped while the check is running. */
+
+ err = row_ins_check_foreign_constraint(
+ FALSE, foreign, table, entry, thr);
+
+ if (foreign_table) {
+ foreign_table->dec_fk_checks();
+ }
+ if (ref_table != NULL) {
+ dict_table_close(ref_table, FALSE, FALSE);
+ }
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+ }
+
+ err = DB_SUCCESS;
+
+func_exit:
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ mem_heap_free(heap);
+
+ DEBUG_SYNC_C("foreign_constraint_check_for_update_done");
+ DBUG_RETURN(err);
+}
+
+#ifdef WITH_WSREP
+static
+dberr_t
+wsrep_row_upd_check_foreign_constraints(
+/*=================================*/
+ upd_node_t* node, /*!< in: row update node */
+ btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the
+ cursor position is lost in this function! */
+ dict_table_t* table, /*!< in: table in question */
+ dict_index_t* index, /*!< in: index of the cursor */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_foreign_t* foreign;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ trx_t* trx;
+ const rec_t* rec;
+ dberr_t err;
+ ibool got_s_lock = FALSE;
+ ibool opened = FALSE;
+
+ if (table->foreign_set.empty()) {
+ return(DB_SUCCESS);
+ }
+
+ trx = thr_get_trx(thr);
+
+ /* TODO: make native slave thread bail out here */
+
+ rec = btr_pcur_get_rec(pcur);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ heap = mem_heap_create(500);
+
+ entry = row_rec_to_index_entry(rec, index, offsets, heap);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ if (trx->dict_operation_lock_mode == 0) {
+ got_s_lock = TRUE;
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+ /* Note that we may have an update which updates the index
+ record, but does NOT update the first fields which are
+ referenced in a foreign key constraint. Then the update does
+ NOT break the constraint. */
+
+ if (foreign->foreign_index == index
+ && (node->is_delete
+ || row_upd_changes_first_fields_binary(
+ entry, index, node->update,
+ foreign->n_fields))) {
+
+ if (foreign->referenced_table == NULL) {
+ foreign->referenced_table =
+ dict_table_open_on_name(
+ foreign->referenced_table_name_lookup,
+ FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+ opened = (foreign->referenced_table) ? TRUE : FALSE;
+ }
+
+ /* NOTE that if the thread ends up waiting for a lock
+ we will release dict_sys.latch temporarily!
+ But the counter on the table protects 'foreign' from
+ being dropped while the check is running. */
+
+ err = row_ins_check_foreign_constraint(
+ TRUE, foreign, table, entry, thr);
+
+ if (foreign->referenced_table) {
+ if (opened == TRUE) {
+ dict_table_close(foreign->referenced_table, FALSE, FALSE);
+ opened = FALSE;
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+ }
+
+ err = DB_SUCCESS;
+func_exit:
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/** Determine if a FOREIGN KEY constraint needs to be processed.
+@param[in] node query node
+@param[in] trx transaction
+@return whether the node cannot be ignored */
+
+inline bool wsrep_must_process_fk(const upd_node_t* node, const trx_t* trx)
+{
+ if (!trx->is_wsrep()) {
+ return false;
+ }
+ return que_node_get_type(node->common.parent) != QUE_NODE_UPDATE
+ || static_cast<upd_node_t*>(node->common.parent)->cascade_node
+ != node;
+}
+#endif /* WITH_WSREP */
+
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return own: update node */
+upd_node_t*
+upd_node_create(
+/*============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ upd_node_t* node;
+
+ node = static_cast<upd_node_t*>(
+ mem_heap_zalloc(heap, sizeof(upd_node_t)));
+
+ node->common.type = QUE_NODE_UPDATE;
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+ node->heap = mem_heap_create(128);
+ node->magic_n = UPD_NODE_MAGIC_N;
+
+ return(node);
+}
+
+/***********************************************************//**
+Returns TRUE if the row update changes the size of some field in the index
+or if some field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update) /*!< in: update vector */
+{
+ const upd_field_t* upd_field;
+ const dfield_t* new_val;
+ ulint old_len;
+ ulint new_len;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+ ut_ad(!index->table->skip_alter_undo);
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+		/* We should ignore a virtual field if the index is
+		not a virtual index */
+ if (upd_fld_is_virtual_col(upd_field)
+ && !index->has_virtual()) {
+ continue;
+ }
+
+ new_val = &(upd_field->new_val);
+ if (dfield_is_ext(new_val)) {
+ return(TRUE);
+ }
+ new_len = dfield_get_len(new_val);
+ ut_ad(new_len != UNIV_SQL_DEFAULT);
+
+ if (dfield_is_null(new_val) && !rec_offs_comp(offsets)) {
+ new_len = dict_col_get_sql_null_size(
+ dict_index_get_nth_col(index,
+ upd_field->field_no),
+ 0);
+ }
+
+ if (rec_offs_nth_default(offsets, upd_field->field_no)) {
+ /* This is an instantly added column that is
+ at the initial default value. */
+ return(TRUE);
+ }
+
+ if (rec_offs_comp(offsets)
+ && rec_offs_nth_sql_null(offsets, upd_field->field_no)) {
+ /* Note that in the compact table format, for a
+ variable length field, an SQL NULL will use zero
+ bytes in the offset array at the start of the physical
+ record, but a zero-length value (empty string) will
+ use one byte! Thus, we cannot use update-in-place
+ if we update an SQL NULL varchar to an empty string! */
+
+ old_len = UNIV_SQL_NULL;
+ } else {
+ old_len = rec_offs_nth_size(offsets,
+ upd_field->field_no);
+ }
+
+ if (old_len != new_len
+ || rec_offs_nth_extern(offsets, upd_field->field_no)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************//**
+Returns true if row update contains disowned external fields.
+@return true if the update contains disowned external fields. */
+bool
+row_upd_changes_disowned_external(
+/*==============================*/
+ const upd_t* update) /*!< in: update vector */
+{
+ const upd_field_t* upd_field;
+ const dfield_t* new_val;
+ ulint new_len;
+ ulint n_fields;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ const byte* field_ref;
+
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+ new_len = dfield_get_len(new_val);
+
+ if (!dfield_is_ext(new_val)) {
+ continue;
+ }
+
+ ut_ad(new_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ field_ref = static_cast<const byte*>(dfield_get_data(new_val))
+ + new_len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return own: update vector of differing fields */
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+ const rec_t* rec, /*!< in: secondary index record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+{
+ upd_field_t* upd_field;
+ const dfield_t* dfield;
+ const byte* data;
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+
+ /* This function is used only for a secondary index */
+ ut_a(!dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_n_fields(offsets) == dtuple_get_n_fields(entry));
+ ut_ad(!rec_offs_any_extern(offsets));
+ ut_ad(!rec_offs_any_default(offsets));
+ ut_ad(!index->table->skip_alter_undo);
+
+ update = upd_create(dtuple_get_n_fields(entry), heap);
+
+ n_diff = 0;
+
+ for (uint16_t i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ /* NOTE that it may be that len != dfield_get_len(dfield) if we
+ are updating in a character set and collation where strings of
+ different length can be equal in an alphabetical comparison,
+ and also in the case where we have a column prefix index
+ and the last characters in the index field are spaces; the
+ latter case probably caused the assertion failures reported at
+ row0upd.cc line 713 in versions 4.0.14 - 4.0.16. */
+
+ /* NOTE: we compare the fields as binary strings!
+ (No collation) */
+
+ if (!dfield_data_is_binary_equal(dfield, len, data)) {
+
+ upd_field = upd_get_nth_field(update, n_diff);
+
+ dfield_copy(&(upd_field->new_val), dfield);
+
+ upd_field_set_field_no(upd_field, i, index);
+
+ n_diff++;
+ }
+ }
+
+ update->n_fields = n_diff;
+
+ return(update);
+}
+
+/** Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+equal ordering fields. NOTE: we compare the fields as binary strings!
+@param[in] index clustered index
+@param[in] entry clustered index entry to insert
+@param[in] rec clustered index record
+@param[in] offsets rec_get_offsets(rec,index), or NULL
+@param[in] no_sys skip the system columns
+ DB_TRX_ID and DB_ROLL_PTR
+@param[in] trx transaction (for diagnostics),
+ or NULL
+@param[in] heap memory heap from which allocated
+@param[in] mysql_table NULL, or mysql table object when
+ user thread invokes dml
+@param[out] error error number in case of failure
+@return own: update vector of differing fields, excluding roll ptr and
+trx id; if error is not equal to DB_SUCCESS, NULL is returned */
+upd_t*
+row_upd_build_difference_binary(
+ dict_index_t* index,
+ const dtuple_t* entry,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ bool no_sys,
+ trx_t* trx,
+ mem_heap_t* heap,
+ TABLE* mysql_table,
+ dberr_t* error)
+{
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint n_v_fld = dtuple_get_n_v_fields(entry);
+ rec_offs_init(offsets_);
+
+ /* This function is used only for a clustered index */
+ ut_a(dict_index_is_clust(index));
+ ut_ad(!index->table->skip_alter_undo);
+ ut_ad(entry->n_fields <= index->n_fields);
+ ut_ad(entry->n_fields >= index->n_core_fields);
+
+ update = upd_create(index->n_fields + n_v_fld, heap);
+
+ n_diff = 0;
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+ for (uint16_t i = 0; i < entry->n_fields; i++) {
+ const byte* data = rec_get_nth_cfield(rec, index, offsets, i,
+ &len);
+ const dfield_t* dfield = dtuple_get_nth_field(entry, i);
+
+ /* NOTE: we compare the fields as binary strings!
+ (No collation) */
+ if (no_sys && (i == index->db_trx_id()
+ || i == index->db_roll_ptr())) {
+ continue;
+ }
+
+ if (!dfield_is_ext(dfield)
+ != !rec_offs_nth_extern(offsets, i)
+ || !dfield_data_is_binary_equal(dfield, len, data)) {
+ upd_field_t* uf = upd_get_nth_field(update, n_diff++);
+ dfield_copy(&uf->new_val, dfield);
+ upd_field_set_field_no(uf, i, index);
+ }
+ }
+
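+	/* Any index fields beyond those in the tuple were instantly
+	added; complete the update vector with their default values. */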
+ for (uint16_t i = static_cast<uint16_t>(entry->n_fields);
+ i < index->n_fields; i++) {
+ upd_field_t* uf = upd_get_nth_field(update, n_diff++);
+ const dict_col_t* col = dict_index_get_nth_col(index, i);
+ /* upd_create() zero-initialized uf */
+ uf->new_val.data = const_cast<byte*>(col->instant_value(&len));
+ uf->new_val.len = static_cast<unsigned>(len);
+ dict_col_copy_type(col, &uf->new_val.type);
+ upd_field_set_field_no(uf, i, index);
+ }
+
+	/* Check the virtual column updates. Even if no non-virtual
+	(base) column changes, we still need to build the indexed
+	virtual column values so that the undo log can record them
+	(for purge/MVCC purposes). */
+ if (n_v_fld > 0) {
+ row_ext_t* ext;
+ THD* thd;
+
+ if (trx == NULL) {
+ thd = current_thd;
+ } else {
+ thd = trx->mysql_thd;
+ }
+
+ ut_ad(!update->old_vrow);
+
+ ib_vcol_row vc(NULL);
+ uchar *record = vc.record(thd, index, &mysql_table);
+
+ for (uint16_t i = 0; i < n_v_fld; i++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(index->table, i);
+
+ if (!col->m_col.ord_part) {
+ continue;
+ }
+
+ if (update->old_vrow == NULL) {
+ update->old_vrow = row_build(
+ ROW_COPY_POINTERS, index, rec, offsets,
+ index->table, NULL, NULL, &ext, heap);
+ }
+
+ dfield_t* vfield = innobase_get_computed_value(
+ update->old_vrow, col, index,
+ &vc.heap, heap, NULL, thd, mysql_table, record,
+ NULL, NULL, NULL);
+ if (vfield == NULL) {
+ *error = DB_COMPUTE_VALUE_FAILED;
+ return(NULL);
+ }
+
+ const dfield_t* dfield = dtuple_get_nth_v_field(
+ entry, i);
+
+ if (!dfield_data_is_binary_equal(
+ dfield, vfield->len,
+ static_cast<byte*>(vfield->data))) {
+ upd_field_t* uf = upd_get_nth_field(update,
+ n_diff++);
+ uf->old_v_val = static_cast<dfield_t*>(
+ mem_heap_alloc(heap,
+ sizeof *uf->old_v_val));
+ dfield_copy(uf->old_v_val, vfield);
+ dfield_copy(&uf->new_val, dfield);
+ upd_field_set_v_field_no(uf, i, index);
+ }
+ }
+ }
+
+ update->n_fields = n_diff;
+ ut_ad(update->validate());
+
+ return(update);
+}
+
+/** Fetch a prefix of an externally stored column.
+This is similar to row_ext_lookup(), but the row_ext_t holds the old values
+of the column and must not be poisoned with the new values.
+@param[in] data 'internally' stored part of the field
+containing also the reference to the external part
+@param[in] local_len length of data, in bytes
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out]	len	input: length of the prefix to
+fetch; output: fetched length of the prefix
+@param[in,out] heap heap where to allocate
+@return BLOB prefix
+@retval NULL if the record is incomplete (should only happen
+in row_vers_vc_matches_cluster() executed concurrently with another purge) */
+static
+byte*
+row_upd_ext_fetch(
+ const byte* data,
+ ulint local_len,
+ ulint zip_size,
+ ulint* len,
+ mem_heap_t* heap)
+{
+ byte* buf = static_cast<byte*>(mem_heap_alloc(heap, *len));
+
+ *len = btr_copy_externally_stored_field_prefix(
+ buf, *len, zip_size, data, local_len);
+
+ return *len ? buf : NULL;
+}
+
+/** Replaces the new column value stored in the update vector in
+the given index entry field.
+@param[in,out] dfield data field of the index entry
+@param[in] field index field
+@param[in] col field->col
+@param[in] uf update field
+@param[in,out] heap memory heap for allocating and copying
+the new value
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return whether the previous version was built successfully */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+static
+bool
+row_upd_index_replace_new_col_val(
+ dfield_t* dfield,
+ const dict_field_t* field,
+ const dict_col_t* col,
+ const upd_field_t* uf,
+ mem_heap_t* heap,
+ ulint zip_size)
+{
+ ulint len;
+ const byte* data;
+
+ dfield_copy_data(dfield, &uf->new_val);
+
+ if (dfield_is_null(dfield)) {
+ return true;
+ }
+
+ len = dfield_get_len(dfield);
+ data = static_cast<const byte*>(dfield_get_data(dfield));
+
+ if (field->prefix_len > 0) {
+ ibool fetch_ext = dfield_is_ext(dfield)
+ && len < (ulint) field->prefix_len
+ + BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (fetch_ext) {
+ ulint l = len;
+
+ len = field->prefix_len;
+
+ data = row_upd_ext_fetch(data, l, zip_size,
+ &len, heap);
+ if (UNIV_UNLIKELY(!data)) {
+ return false;
+ }
+ }
+
+ len = dtype_get_at_most_n_mbchars(col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ field->prefix_len, len,
+ (const char*) data);
+
+ dfield_set_data(dfield, data, len);
+
+ if (!fetch_ext) {
+ dfield_dup(dfield, heap);
+ }
+
+ return true;
+ }
+
+ switch (uf->orig_len) {
+ byte* buf;
+ case BTR_EXTERN_FIELD_REF_SIZE:
+ /* Restore the original locally stored
+ part of the column. In the undo log,
+ InnoDB writes a longer prefix of externally
+ stored columns, so that column prefixes
+ in secondary indexes can be reconstructed. */
+ dfield_set_data(dfield,
+ data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ dfield_set_ext(dfield);
+ /* fall through */
+ case 0:
+ dfield_dup(dfield, heap);
+ break;
+ default:
+ /* Reconstruct the original locally
+ stored part of the column. The data
+ will have to be copied. */
+ ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
+ buf = static_cast<byte*>(mem_heap_alloc(heap, uf->orig_len));
+
+ /* Copy the locally stored prefix. */
+ memcpy(buf, data,
+ unsigned(uf->orig_len) - BTR_EXTERN_FIELD_REF_SIZE);
+
+ /* Copy the BLOB pointer. */
+ memcpy(buf + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE,
+ data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+ dfield_set_data(dfield, buf, uf->orig_len);
+ dfield_set_ext(dfield);
+ break;
+ }
+
+ return true;
+}
+
+/** Apply an update vector to a metadata entry.
+@param[in,out] entry clustered index metadata record to be updated
+@param[in] index index of the entry
+@param[in] update update vector built for the entry
+@param[in,out] heap memory heap for copying off-page columns */
+static
+void
+row_upd_index_replace_metadata(
+ dtuple_t* entry,
+ const dict_index_t* index,
+ const upd_t* update,
+ mem_heap_t* heap)
+{
+ ut_ad(!index->table->skip_alter_undo);
+ ut_ad(update->is_alter_metadata());
+ ut_ad(entry->info_bits == update->info_bits);
+ ut_ad(entry->n_fields == ulint(index->n_fields) + 1);
+ const ulint zip_size = index->table->space->zip_size();
+ const ulint first = index->first_user_field();
+ ut_d(bool found_mblob = false);
+
+ for (ulint i = upd_get_n_fields(update); i--; ) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+ ut_ad(!upd_fld_is_virtual_col(uf));
+ ut_ad(uf->field_no >= first - 2);
+ ulint f = uf->field_no;
+ dfield_t* dfield = dtuple_get_nth_field(entry, f);
+
+ if (f == first) {
+ ut_d(found_mblob = true);
+ ut_ad(!dfield_is_null(&uf->new_val));
+ ut_ad(dfield_is_ext(dfield));
+ ut_ad(dfield_get_len(dfield) == FIELD_REF_SIZE);
+ ut_ad(!dfield_is_null(dfield));
+ dfield_set_data(dfield, uf->new_val.data,
+ uf->new_val.len);
+ if (dfield_is_ext(&uf->new_val)) {
+ dfield_set_ext(dfield);
+ }
+ continue;
+ }
+
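+		/* Skip over the metadata BLOB field when translating the
+		entry field number into an index field number. */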
+ f -= f > first;
+ const dict_field_t* field = dict_index_get_nth_field(index, f);
+ if (!row_upd_index_replace_new_col_val(dfield, field,
+ field->col,
+ uf, heap, zip_size)) {
+ ut_error;
+ }
+ }
+
+ ut_ad(found_mblob);
+}
+
+/** Apply an update vector to an index entry.
+@param[in,out] entry index entry to be updated; the clustered index record
+ must be covered by a lock or a page latch to prevent
+ deletion (rollback or purge)
+@param[in] index index of the entry
+@param[in] update update vector built for the entry
+@param[in,out] heap memory heap for copying off-page columns */
+void
+row_upd_index_replace_new_col_vals_index_pos(
+ dtuple_t* entry,
+ const dict_index_t* index,
+ const upd_t* update,
+ mem_heap_t* heap)
+{
+ ut_ad(!index->table->skip_alter_undo);
+ ut_ad(!entry->is_metadata() || entry->info_bits == update->info_bits);
+
+ if (UNIV_UNLIKELY(entry->is_alter_metadata())) {
+ row_upd_index_replace_metadata(entry, index, update, heap);
+ return;
+ }
+
+ const ulint zip_size = index->table->space->zip_size();
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ for (uint16_t i = index->n_fields; i--; ) {
+ const dict_field_t* field;
+ const dict_col_t* col;
+ const upd_field_t* uf;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+ if (col->is_virtual()) {
+ const dict_v_col_t* vcol = reinterpret_cast<
+ const dict_v_col_t*>(
+ col);
+
+ uf = upd_get_field_by_field_no(
+ update, vcol->v_pos, true);
+ } else {
+ uf = upd_get_field_by_field_no(
+ update, i, false);
+ }
+
+ if (uf && UNIV_UNLIKELY(!row_upd_index_replace_new_col_val(
+ dtuple_get_nth_field(entry, i),
+ field, col, uf, heap,
+ zip_size))) {
+ ut_error;
+ }
+ }
+}
+
+/** Replace the new column values stored in the update vector,
+during trx_undo_prev_version_build().
+@param entry clustered index tuple where the values are replaced
+ (the clustered index leaf page latch must be held)
+@param index clustered index
+@param update update vector for the clustered index
+@param heap memory heap for allocating and copying values
+@return whether the previous version was built successfully */
+bool
+row_upd_index_replace_new_col_vals(dtuple_t *entry, const dict_index_t &index,
+ const upd_t *update, mem_heap_t *heap)
+{
+ ut_ad(index.is_primary());
+ const ulint zip_size= index.table->space->zip_size();
+
+ ut_ad(!index.table->skip_alter_undo);
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ for (ulint i= 0; i < index.n_fields; i++)
+ {
+ const dict_field_t *field= &index.fields[i];
+ const dict_col_t* col= dict_field_get_col(field);
+ const upd_field_t *uf;
+
+ if (col->is_virtual())
+ {
+ const dict_v_col_t *vcol= reinterpret_cast<const dict_v_col_t*>(col);
+ uf= upd_get_field_by_field_no(update, vcol->v_pos, true);
+ }
+ else
+ uf= upd_get_field_by_field_no(update, static_cast<uint16_t>
+ (dict_col_get_clust_pos(col, &index)),
+ false);
+
+ if (!uf)
+ continue;
+
+ if (!row_upd_index_replace_new_col_val(dtuple_get_nth_field(entry, i),
+ field, col, uf, heap, zip_size))
+ return false;
+ }
+
+ return true;
+}
+
+/** Sets a virtual column value in the row, if it has not been set yet.
+@param[in,out]	row	row whose column is to be set
+@param[in] field data to set
+@param[in] len data length
+@param[in] vcol virtual column info */
+static
+void
+row_upd_set_vcol_data(
+ dtuple_t* row,
+ const byte* field,
+ ulint len,
+ dict_v_col_t* vcol)
+{
+ dfield_t* dfield = dtuple_get_nth_v_field(row, vcol->v_pos);
+
+ if (dfield_get_type(dfield)->mtype == DATA_MISSING) {
+ dict_col_copy_type(&vcol->m_col, dfield_get_type(dfield));
+
+ dfield_set_data(dfield, field, len);
+ }
+}
+
+/** Replaces the virtual column values stored in a dtuple with those of
+an update vector.
+@param[in,out] row row whose column to be updated
+@param[in] table table
+@param[in] update an update vector built for the clustered index
+@param[in] upd_new update to new or old value
+@param[in,out] undo_row undo row (if needs to be updated)
+@param[in] ptr remaining part in update undo log */
+void
+row_upd_replace_vcol(
+ dtuple_t* row,
+ const dict_table_t* table,
+ const upd_t* update,
+ bool upd_new,
+ dtuple_t* undo_row,
+ const byte* ptr)
+{
+ ulint col_no;
+ ulint i;
+ ulint n_cols;
+
+ ut_ad(!table->skip_alter_undo);
+
+ n_cols = dtuple_get_n_v_fields(row);
+ for (col_no = 0; col_no < n_cols; col_no++) {
+ dfield_t* dfield;
+
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+		/* If there is no index on the column, do not bother
+		with the value update */
+ if (!col->m_col.ord_part) {
+ dict_index_t* clust_index
+ = dict_table_get_first_index(table);
+
+			/* Skip the column if there is no online ALTER
+			TABLE in progress, or it is not being indexed
+			in the new table */
+ if (!dict_index_is_online_ddl(clust_index)
+ || !row_log_col_is_indexed(clust_index, col_no)) {
+ continue;
+ }
+ }
+
+ dfield = dtuple_get_nth_v_field(row, col_no);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ const upd_field_t* upd_field
+ = upd_get_nth_field(update, i);
+ if (!upd_fld_is_virtual_col(upd_field)
+ || upd_field->field_no != col->v_pos) {
+ continue;
+ }
+
+ if (upd_new) {
+ dfield_copy_data(dfield, &upd_field->new_val);
+ } else {
+ dfield_copy_data(dfield, upd_field->old_v_val);
+ }
+
+ dfield->type = upd_field->new_val.type;
+ break;
+ }
+ }
+
+ bool first_v_col = true;
+ bool is_undo_log = true;
+
+	/* Read in the unchanged (but indexed) virtual columns. */
+ if (ptr != NULL) {
+ const byte* end_ptr;
+
+ end_ptr = ptr + mach_read_from_2(ptr);
+ ptr += 2;
+
+ while (ptr != end_ptr) {
+ const byte* field;
+ uint32_t field_no, len, orig_len;
+
+ field_no = mach_read_next_compressed(&ptr);
+
+ const bool is_v = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_v) {
+ ptr = trx_undo_read_v_idx(
+ table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+ }
+
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+
+ if (field_no == FIL_NULL) {
+ ut_ad(is_v);
+ continue;
+ }
+
+ if (is_v) {
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(
+ table, field_no);
+
+ row_upd_set_vcol_data(row, field, len, vcol);
+
+ if (undo_row) {
+ row_upd_set_vcol_data(
+ undo_row, field, len, vcol);
+ }
+ }
+			ut_ad(ptr <= end_ptr);
+ }
+ }
+}
+
+/***********************************************************//**
+Replaces the new column values stored in the update vector. */
+void
+row_upd_replace(
+/*============*/
+ dtuple_t* row, /*!< in/out: row where replaced,
+ indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ row_ext_t** ext, /*!< out, own: NULL, or externally
+ stored column prefixes */
+ const dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ clustered index */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint col_no;
+ ulint i;
+ ulint n_cols;
+ ulint n_ext_cols;
+ ulint* ext_cols;
+ const dict_table_t* table;
+
+ ut_ad(row);
+ ut_ad(ext);
+ ut_ad(index);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(update);
+ ut_ad(heap);
+ ut_ad(update->validate());
+
+ n_cols = dtuple_get_n_fields(row);
+ table = index->table;
+ ut_ad(n_cols == dict_table_get_n_cols(table));
+
+ ext_cols = static_cast<ulint*>(
+ mem_heap_alloc(heap, n_cols * sizeof *ext_cols));
+
+ n_ext_cols = 0;
+
+ dtuple_set_info_bits(row, update->info_bits);
+
+ for (col_no = 0; col_no < n_cols; col_no++) {
+
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, col_no);
+ const ulint clust_pos
+ = dict_col_get_clust_pos(col, index);
+ dfield_t* dfield;
+
+ if (UNIV_UNLIKELY(clust_pos == ULINT_UNDEFINED)) {
+
+ continue;
+ }
+
+ dfield = dtuple_get_nth_field(row, col_no);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ const upd_field_t* upd_field
+ = upd_get_nth_field(update, i);
+
+ if (upd_field->field_no != clust_pos
+ || upd_fld_is_virtual_col(upd_field)) {
+
+ continue;
+ }
+
+ dfield_copy_data(dfield, &upd_field->new_val);
+ break;
+ }
+
+ if (dfield_is_ext(dfield) && col->ord_part) {
+ ext_cols[n_ext_cols++] = col_no;
+ }
+ }
+
+ if (n_ext_cols) {
+ *ext = row_ext_create(n_ext_cols, ext_cols, *table, row, heap);
+ } else {
+ *ext = NULL;
+ }
+
+ row_upd_replace_vcol(row, table, update, true, NULL, NULL);
+}
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector changes an ordering field in the index record */
+ibool
+row_upd_changes_ord_field_binary_func(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update, /*!< in: update vector for the row; NOTE: the
+ field numbers in this MUST be clustered index
+ positions! */
+#ifdef UNIV_DEBUG
+ const que_thr_t*thr, /*!< in: query thread */
+#endif /* UNIV_DEBUG */
+ const dtuple_t* row, /*!< in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ const row_ext_t*ext, /*!< NULL, or prefixes of the externally
+ stored columns in the old row */
+ ulint flag) /*!< in: ROW_BUILD_NORMAL,
+ ROW_BUILD_FOR_PURGE or ROW_BUILD_FOR_UNDO */
+{
+ ulint n_unique;
+ ulint i;
+ const dict_index_t* clust_index;
+
+ ut_ad(thr);
+ ut_ad(thr->graph);
+ ut_ad(thr->graph->trx);
+ ut_ad(!index->table->skip_alter_undo);
+
+ n_unique = dict_index_get_n_unique(index);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ for (i = 0; i < n_unique; i++) {
+
+ const dict_field_t* ind_field;
+ const dict_col_t* col;
+ ulint col_no;
+ const upd_field_t* upd_field;
+ const dfield_t* dfield;
+ dfield_t dfield_ext;
+ ulint dfield_len= 0;
+ const byte* buf;
+ bool is_virtual;
+ const dict_v_col_t* vcol = NULL;
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_no = dict_col_get_no(col);
+ is_virtual = col->is_virtual();
+
+ if (is_virtual) {
+ vcol = reinterpret_cast<const dict_v_col_t*>(col);
+
+ upd_field = upd_get_field_by_field_no(
+ update, vcol->v_pos, true);
+ } else {
+ upd_field = upd_get_field_by_field_no(
+ update, static_cast<uint16_t>(
+ dict_col_get_clust_pos(
+ col, clust_index)),
+ false);
+ }
+
+ if (upd_field == NULL) {
+ continue;
+ }
+
+ if (row == NULL) {
+ ut_ad(ext == NULL);
+ return(TRUE);
+ }
+
+ if (is_virtual) {
+ dfield = dtuple_get_nth_v_field(
+ row, vcol->v_pos);
+ } else {
+ dfield = dtuple_get_nth_field(row, col_no);
+ }
+
+		/* For a spatial index update: different geometry data can
+		generate the same MBR, so if the new index entry is the same
+		as the old one, the MBR has not changed and we do not need
+		to do anything. */
+ if (dict_index_is_spatial(index) && i == 0) {
+ double mbr1[SPDIMS * 2];
+ double mbr2[SPDIMS * 2];
+ rtr_mbr_t* old_mbr;
+ rtr_mbr_t* new_mbr;
+ const uchar* dptr = NULL;
+ ulint flen = 0;
+ ulint dlen = 0;
+ mem_heap_t* temp_heap = NULL;
+ const dfield_t* new_field = &upd_field->new_val;
+
+ const ulint zip_size = ext
+ ? ext->zip_size
+ : index->table->space->zip_size();
+
+ ut_ad(dfield->data != NULL
+ && dfield->len > GEO_DATA_HEADER_SIZE);
+ ut_ad(dict_col_get_spatial_status(col) != SPATIAL_NONE);
+
+ /* Get the old mbr. */
+ if (dfield_is_ext(dfield)) {
+ /* For off-page stored data, we
+ need to read the whole field data. */
+ flen = dfield_get_len(dfield);
+ dptr = static_cast<const byte*>(
+ dfield_get_data(dfield));
+ temp_heap = mem_heap_create(1000);
+
+ dptr = btr_copy_externally_stored_field(
+ &dlen, dptr,
+ zip_size,
+ flen,
+ temp_heap);
+ } else {
+ dptr = static_cast<const uchar*>(dfield->data);
+ dlen = dfield->len;
+ }
+
+ rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE,
+ static_cast<uint>(dlen
+ - GEO_DATA_HEADER_SIZE),
+ SPDIMS, mbr1);
+ old_mbr = reinterpret_cast<rtr_mbr_t*>(mbr1);
+
+ /* Get the new mbr. */
+ if (dfield_is_ext(new_field)) {
+ if (flag == ROW_BUILD_FOR_UNDO
+ && dict_table_has_atomic_blobs(
+ index->table)) {
+ /* For ROW_FORMAT=DYNAMIC
+ or COMPRESSED, a prefix of
+ off-page records is stored
+ in the undo log record
+ (for any column prefix indexes).
+ For SPATIAL INDEX, we must
+ ignore this prefix. The
+ full column value is stored in
+ the BLOB.
+ For non-spatial index, we
+ would have already fetched a
+ necessary prefix of the BLOB,
+ available in the "ext" parameter.
+
+ Here, for SPATIAL INDEX, we are
+ fetching the full column, which is
+ potentially wasting a lot of I/O,
+ memory, and possibly involving a
+ concurrency problem, similar to ones
+ that existed before the introduction
+ of row_ext_t.
+
+ MDEV-11657 FIXME: write the MBR
+ directly to the undo log record,
+ and avoid recomputing it here! */
+ flen = BTR_EXTERN_FIELD_REF_SIZE;
+ ut_ad(dfield_get_len(new_field) >=
+ BTR_EXTERN_FIELD_REF_SIZE);
+ dptr = static_cast<const byte*>(
+ dfield_get_data(new_field))
+ + dfield_get_len(new_field)
+ - BTR_EXTERN_FIELD_REF_SIZE;
+ } else {
+ flen = dfield_get_len(new_field);
+ dptr = static_cast<const byte*>(
+ dfield_get_data(new_field));
+ }
+
+ if (temp_heap == NULL) {
+ temp_heap = mem_heap_create(1000);
+ }
+
+ dptr = btr_copy_externally_stored_field(
+ &dlen, dptr,
+ zip_size,
+ flen,
+ temp_heap);
+ } else {
+ dptr = static_cast<const byte*>(
+ upd_field->new_val.data);
+ dlen = upd_field->new_val.len;
+ }
+ rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE,
+ static_cast<uint>(dlen
+ - GEO_DATA_HEADER_SIZE),
+ SPDIMS, mbr2);
+ new_mbr = reinterpret_cast<rtr_mbr_t*>(mbr2);
+
+ if (temp_heap) {
+ mem_heap_free(temp_heap);
+ }
+
+ if (!MBR_EQUAL_CMP(old_mbr, new_mbr)) {
+ return(TRUE);
+ } else {
+ continue;
+ }
+ }
+
+ /* This treatment of column prefix indexes is loosely
+ based on row_build_index_entry(). */
+
+ if (UNIV_LIKELY(ind_field->prefix_len == 0)
+ || dfield_is_null(dfield)) {
+ /* do nothing special */
+ } else if (ext) {
+ /* Silence a compiler warning without
+ silencing a Valgrind error. */
+ dfield_len = 0;
+ MEM_UNDEFINED(&dfield_len, sizeof dfield_len);
+ /* See if the column is stored externally. */
+ buf = row_ext_lookup(ext, col_no, &dfield_len);
+
+ ut_ad(col->ord_part);
+
+ if (UNIV_LIKELY_NULL(buf)) {
+ if (UNIV_UNLIKELY(buf == field_ref_zero)) {
+ /* The externally stored field
+ was not written yet. This
+ record should only be seen by
+ trx_rollback_recovered()
+ when the server had crashed before
+ storing the field. */
+ ut_ad(thr->graph->trx->is_recovered);
+ ut_ad(thr->graph->trx
+ == trx_roll_crash_recv_trx);
+ return(TRUE);
+ }
+
+ goto copy_dfield;
+ }
+ } else if (dfield_is_ext(dfield)) {
+ dfield_len = dfield_get_len(dfield);
+ ut_a(dfield_len > BTR_EXTERN_FIELD_REF_SIZE);
+ dfield_len -= BTR_EXTERN_FIELD_REF_SIZE;
+ ut_a(dict_index_is_clust(index)
+ || ind_field->prefix_len <= dfield_len);
+
+ buf= static_cast<const byte*>(dfield_get_data(dfield));
+copy_dfield:
+ ut_a(dfield_len > 0);
+ dfield_copy(&dfield_ext, dfield);
+ dfield_set_data(&dfield_ext, buf, dfield_len);
+ dfield = &dfield_ext;
+ }
+
+ if (!dfield_datas_are_binary_equal(
+ dfield, &upd_field->new_val,
+ ind_field->prefix_len)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector may change an ordering field in an index
+record */
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+ const dict_table_t* table, /*!< in: table */
+ const upd_t* update) /*!< in: update vector for the row */
+{
+ upd_field_t* upd_field;
+ dict_index_t* index;
+ ulint i;
+
+ index = dict_table_get_first_index(table);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ upd_field = upd_get_nth_field(update, i);
+
+ if (upd_fld_is_virtual_col(upd_field)) {
+ if (dict_table_get_nth_v_col(index->table,
+ upd_field->field_no)
+ ->m_col.ord_part) {
+ return(TRUE);
+ }
+ } else {
+ if (dict_field_get_col(dict_index_get_nth_field(
+ index, upd_field->field_no))->ord_part) {
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************//**
+Checks if an FTS Doc ID column is affected by an UPDATE.
+@return whether the Doc ID column is changed */
+bool
+row_upd_changes_doc_id(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* upd_field) /*!< in: field to check */
+{
+ ulint col_no;
+ dict_index_t* clust_index;
+ fts_t* fts = table->fts;
+
+ ut_ad(!table->skip_alter_undo);
+
+ clust_index = dict_table_get_first_index(table);
+
+ /* Convert from index-specific column number to table-global
+ column number. */
+ col_no = dict_index_get_nth_col_no(clust_index, upd_field->field_no);
+
+ return(col_no == fts->doc_col);
+}
+/***********************************************************//**
+Checks if an FTS indexed column is affected by an UPDATE.
+@return offset within fts_t::indexes if an FTS indexed column is updated,
+else ULINT_UNDEFINED */
+ulint
+row_upd_changes_fts_column(
+/*=======================*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* upd_field) /*!< in: field to check */
+{
+ ulint col_no;
+ dict_index_t* clust_index;
+ fts_t* fts = table->fts;
+
+ ut_ad(!table->skip_alter_undo);
+
+ if (upd_fld_is_virtual_col(upd_field)) {
+ col_no = upd_field->field_no;
+ return(dict_table_is_fts_column(fts->indexes, col_no, true));
+ } else {
+ clust_index = dict_table_get_first_index(table);
+
+ /* Convert from index-specific column number to table-global
+ column number. */
+ col_no = dict_index_get_nth_col_no(clust_index,
+ upd_field->field_no);
+ return(dict_table_is_fts_column(fts->indexes, col_no, false));
+ }
+
+}
+
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks, and we can assume
+that the index does not contain column prefixes.
+@return TRUE if changes */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+ dtuple_t* entry, /*!< in: index entry */
+ dict_index_t* index, /*!< in: index of entry */
+ const upd_t* update, /*!< in: update vector for the row */
+ ulint n) /*!< in: how many first fields to check */
+{
+ ulint n_upd_fields;
+ ulint i, j;
+ dict_index_t* clust_index;
+
+ ut_ad(update && index);
+ ut_ad(n <= dict_index_get_n_fields(index));
+
+ n_upd_fields = upd_get_n_fields(update);
+ clust_index = dict_table_get_first_index(index->table);
+
+ for (i = 0; i < n; i++) {
+
+ const dict_field_t* ind_field;
+ const dict_col_t* col;
+ ulint col_pos;
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_pos = dict_col_get_clust_pos(col, clust_index);
+
+ ut_a(ind_field->prefix_len == 0);
+
+ for (j = 0; j < n_upd_fields; j++) {
+
+ upd_field_t* upd_field
+ = upd_get_nth_field(update, j);
+
+ if (col_pos == upd_field->field_no
+ && !dfield_datas_are_binary_equal(
+ dtuple_get_nth_field(entry, i),
+ &upd_field->new_val, 0)) {
+
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Copies the column values from a record. */
+UNIV_INLINE
+void
+row_upd_copy_columns(
+/*=================*/
+ rec_t* rec, /*!< in: record in a clustered index */
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ const dict_index_t* index, /*!< in: index of rec */
+ sym_node_t* column) /*!< in: first column in a column list, or
+ NULL */
+{
+ ut_ad(dict_index_is_clust(index));
+
+ const byte* data;
+ ulint len;
+
+ while (column) {
+ data = rec_get_nth_cfield(
+ rec, index, offsets,
+ column->field_nos[SYM_CLUST_FIELD_NO], &len);
+ eval_node_copy_and_alloc_val(column, data, len);
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*********************************************************************//**
+Calculates the new values for fields to update. Note that row_upd_copy_columns
+must have been called first. */
+UNIV_INLINE
+void
+row_upd_eval_new_vals(
+/*==================*/
+ upd_t* update) /*!< in/out: update vector */
+{
+ que_node_t* exp;
+ upd_field_t* upd_field;
+ ulint n_fields;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ exp = upd_field->exp;
+
+ eval_exp(exp);
+
+ dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp));
+ }
+}
+
+/** Stores to the heap the virtual columns that are needed by any index
+@param[in,out] node row update node
+@param[in] update an update vector if this is an update
+@param[in] thd mysql thread handle
+@param[in,out] mysql_table mysql table object
+@return true on success
+ false if virtual column value computation fails. */
+static
+bool
+row_upd_store_v_row(
+ upd_node_t* node,
+ const upd_t* update,
+ THD* thd,
+ TABLE* mysql_table)
+{
+ dict_index_t* index = dict_table_get_first_index(node->table);
+ ib_vcol_row vc(NULL);
+
+ for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(node->table);
+ col_no++) {
+
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(node->table, col_no);
+
+ if (col->m_col.ord_part) {
+ dfield_t* dfield
+ = dtuple_get_nth_v_field(node->row, col_no);
+ ulint n_upd
+ = update ? upd_get_n_fields(update) : 0;
+ ulint i = 0;
+
+ /* Check if the value is already in update vector */
+ for (i = 0; i < n_upd; i++) {
+ const upd_field_t* upd_field
+ = upd_get_nth_field(update, i);
+ if (!(upd_field->new_val.type.prtype
+ & DATA_VIRTUAL)
+ || upd_field->field_no != col->v_pos) {
+ continue;
+ }
+
+ dfield_copy_data(dfield, upd_field->old_v_val);
+ dfield_dup(dfield, node->heap);
+ break;
+ }
+
+ /* Not updated */
+ if (i >= n_upd) {
+ /* If this is an update, then the value
+ should be in update->old_vrow */
+ if (update) {
+ if (update->old_vrow == NULL) {
+					/* This only happens in a
+					cascade update, and the virtual
+					column cannot be affected, so it
+					is OK to set it to NULL. */
+ dfield_set_null(dfield);
+ } else {
+ dfield_t* vfield
+ = dtuple_get_nth_v_field(
+ update->old_vrow,
+ col_no);
+ dfield_copy_data(dfield, vfield);
+ dfield_dup(dfield, node->heap);
+ }
+ } else {
+ uchar *record = vc.record(thd, index,
+ &mysql_table);
+				/* The value needs to be computed;
+				this happens when deleting a row. */
+ dfield_t* vfield =
+ innobase_get_computed_value(
+ node->row, col, index,
+ &vc.heap, node->heap,
+ NULL, thd, mysql_table,
+ record, NULL, NULL,
+ NULL);
+ if (vfield == NULL) {
+ return false;
+ }
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+/** Stores to the heap the row on which the node->pcur is positioned.
+@param[in] node row update node
+@param[in] thd mysql thread handle
+@param[in,out] mysql_table NULL, or the mysql table object when a
+			user thread invokes DML
+@return false if virtual column value computation fails
+ true otherwise. */
+static
+bool
+row_upd_store_row(
+ upd_node_t* node,
+ THD* thd,
+ TABLE* mysql_table)
+{
+ dict_index_t* clust_index;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ row_ext_t** ext;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ const rec_offs* offsets;
+ rec_offs_init(offsets_);
+
+ ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES);
+
+ if (node->row != NULL) {
+ mem_heap_empty(node->heap);
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ rec = btr_pcur_get_rec(node->pcur);
+
+ offsets = rec_get_offsets(rec, clust_index, offsets_,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (dict_table_has_atomic_blobs(node->table)) {
+ /* There is no prefix of externally stored columns in
+ the clustered index record. Build a cache of column
+ prefixes. */
+ ext = &node->ext;
+ } else {
+ /* REDUNDANT and COMPACT formats store a local
+ 768-byte prefix of each externally stored column.
+ No cache is needed. */
+ ext = NULL;
+ node->ext = NULL;
+ }
+
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets,
+ NULL, NULL, NULL, ext, node->heap);
+
+ if (node->table->n_v_cols) {
+ bool ok = row_upd_store_v_row(node,
+ node->is_delete ? NULL : node->update,
+ thd, mysql_table);
+ if (!ok) {
+ return false;
+ }
+ }
+
+ if (node->is_delete == PLAIN_DELETE) {
+ node->upd_row = NULL;
+ node->upd_ext = NULL;
+ } else {
+ node->upd_row = dtuple_copy(node->row, node->heap);
+ row_upd_replace(node->upd_row, &node->upd_ext,
+ clust_index, node->update, node->heap);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return true;
+}
+
+/***********************************************************//**
+Updates a secondary index entry of a row.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_sec_index_entry(
+/*====================*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mtr_t mtr;
+ const rec_t* rec;
+ btr_pcur_t pcur;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ btr_cur_t* btr_cur;
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx = thr_get_trx(thr);
+ ulint mode;
+ ulint flags;
+ enum row_search_result search_result;
+
+ ut_ad(trx->id != 0);
+
+ index = node->index;
+
+ const bool referenced = row_upd_index_is_referenced(index, trx);
+#ifdef WITH_WSREP
+ bool foreign = wsrep_row_upd_index_is_foreign(index, trx);
+#endif /* WITH_WSREP */
+
+ heap = mem_heap_create(1024);
+
+ /* Build old index entry */
+ entry = row_build_index_entry(node->row, node->ext, index, heap);
+ ut_a(entry);
+
+ log_free_check();
+
+ DEBUG_SYNC_C_IF_THD(trx->mysql_thd,
+ "before_row_upd_sec_index_entry");
+
+ mtr.start();
+
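+	/* Choose the redo logging mode and the locking flags based on
+	the tablespace that the index resides in. */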
+ switch (index->table->space_id) {
+ case SRV_TMP_SPACE_ID:
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ flags = BTR_NO_LOCKING_FLAG;
+ break;
+ default:
+ index->set_modified(mtr);
+ /* fall through */
+ case IBUF_SPACE_ID:
+ flags = index->table->no_rollback() ? BTR_NO_ROLLBACK : 0;
+ break;
+ }
+
+ bool uncommitted = !index->is_committed();
+
+ if (uncommitted) {
+ /* The index->online_status may change if the index is
+ or was being created online, but not committed yet. It
+ is protected by index->lock. */
+
+ mtr_s_lock_index(index, &mtr);
+
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_COMPLETE:
+ /* This is a normal index. Do not log anything.
+ Perform the update on the index tree directly. */
+ break;
+ case ONLINE_INDEX_CREATION:
+ /* Log a DELETE and optionally INSERT. */
+ row_log_online_op(index, entry, 0);
+
+ if (!node->is_delete) {
+ mem_heap_empty(heap);
+ entry = row_build_index_entry(
+ node->upd_row, node->upd_ext,
+ index, heap);
+ ut_a(entry);
+ row_log_online_op(index, entry, trx->id);
+ }
+ /* fall through */
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ mtr_commit(&mtr);
+ goto func_exit;
+ }
+
+ /* We can only buffer delete-mark operations if there
+ are no foreign key constraints referring to the index.
+ Change buffering is disabled for temporary tables and
+ spatial index. */
+ mode = (referenced || index->table->is_temporary()
+ || dict_index_is_spatial(index))
+ ? BTR_MODIFY_LEAF_ALREADY_S_LATCHED
+ : BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED;
+ } else {
+ /* For secondary indexes,
+ index->online_status==ONLINE_INDEX_COMPLETE if
+ index->is_committed(). */
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ /* We can only buffer delete-mark operations if there
+ are no foreign key constraints referring to the index.
+ Change buffering is disabled for temporary tables and
+ spatial index. */
+ mode = (referenced || index->table->is_temporary()
+ || dict_index_is_spatial(index))
+ ? BTR_MODIFY_LEAF
+ : BTR_DELETE_MARK_LEAF;
+ }
+
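+	/* For a spatial index, the delete-mark must be applied through
+	an R-tree specific search mode. */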
+ if (dict_index_is_spatial(index)) {
+ ut_ad(mode & BTR_MODIFY_LEAF);
+ mode |= BTR_RTREE_DELETE_MARK;
+ }
+
+ /* Set the query thread, so that ibuf_insert_low() will be
+ able to invoke thd_get_trx(). */
+ btr_pcur_get_btr_cur(&pcur)->thr = thr;
+
+ search_result = row_search_index_entry(index, entry, mode,
+ &pcur, &mtr);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ rec = btr_cur_get_rec(btr_cur);
+
+ switch (search_result) {
+ case ROW_NOT_DELETED_REF: /* should only occur for BTR_DELETE */
+ ut_error;
+ break;
+ case ROW_BUFFERED:
+ /* Entry was delete marked already. */
+ break;
+
+ case ROW_NOT_FOUND:
+ if (!index->is_committed()) {
+ /* When online CREATE INDEX copied the update
+ that we already made to the clustered index,
+ and completed the secondary index creation
+ before we got here, the old secondary index
+ record would not exist. The CREATE INDEX
+ should be waiting for a MySQL meta-data lock
+ upgrade at least until this UPDATE returns.
+ After that point, set_committed(true) would be
+ invoked by commit_inplace_alter_table(). */
+ break;
+ }
+
+ if (dict_index_is_spatial(index) && btr_cur->rtr_info->fd_del) {
+			/* We found the record, but it was delete-marked */
+ break;
+ }
+
+ ib::error()
+ << "Record in index " << index->name
+ << " of table " << index->table->name
+ << " was not found on update: " << *entry
+ << " at: " << rec_index_print(rec, index);
+#ifdef UNIV_DEBUG
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ ut_ad(btr_validate_index(index, 0) == DB_SUCCESS);
+ ut_ad(0);
+#endif /* UNIV_DEBUG */
+ break;
+ case ROW_FOUND:
+ ut_ad(err == DB_SUCCESS);
+
+ /* Delete mark the old index record; it can already be
+ delete marked if we return after a lock wait in
+ row_ins_sec_index_entry() below */
+ if (!rec_get_deleted_flag(
+ rec, dict_table_is_comp(index->table))) {
+ err = lock_sec_rec_modify_check_and_lock(
+ flags,
+ btr_cur_get_block(btr_cur),
+ btr_cur_get_rec(btr_cur), index, thr, &mtr);
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ btr_rec_set_deleted<true>(btr_cur_get_block(btr_cur),
+ btr_cur_get_rec(btr_cur),
+ &mtr);
+#ifdef WITH_WSREP
+ if (!referenced && foreign
+ && wsrep_must_process_fk(node, trx)
+ && !wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
+
+ rec_offs* offsets = rec_get_offsets(
+ rec, index, NULL, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ err = wsrep_row_upd_check_foreign_constraints(
+ node, &pcur, index->table,
+ index, offsets, thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_NO_REFERENCED_ROW:
+ err = DB_SUCCESS;
+ break;
+ case DB_LOCK_WAIT:
+ case DB_DEADLOCK:
+ case DB_LOCK_WAIT_TIMEOUT:
+ WSREP_DEBUG("Foreign key check fail: "
+ "%s on table %s index %s query %s",
+ ut_strerr(err), index->name(), index->table->name.m_name,
+ wsrep_thd_query(trx->mysql_thd));
+ break;
+ default:
+ WSREP_ERROR("Foreign key check fail: "
+ "%s on table %s index %s query %s",
+ ut_strerr(err), index->name(), index->table->name.m_name,
+ wsrep_thd_query(trx->mysql_thd));
+ break;
+ }
+ }
+#endif /* WITH_WSREP */
+ }
+
+#ifdef WITH_WSREP
+ ut_ad(err == DB_SUCCESS || err == DB_LOCK_WAIT
+ || err == DB_DEADLOCK || err == DB_LOCK_WAIT_TIMEOUT);
+#else
+ ut_ad(err == DB_SUCCESS);
+#endif
+
+ if (referenced) {
+ rec_offs* offsets = rec_get_offsets(
+ rec, index, NULL, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* NOTE that the following call loses
+ the position of pcur ! */
+ err = row_upd_check_references_constraints(
+ node, &pcur, index->table,
+ index, offsets, thr, &mtr);
+ }
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ if (node->is_delete == PLAIN_DELETE || err != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+
+ mem_heap_empty(heap);
+
+ DEBUG_SYNC_C_IF_THD(trx->mysql_thd,
+ "before_row_upd_sec_new_index_entry");
+
+ uncommitted = !index->is_committed();
+ if (uncommitted) {
+ mtr.start();
+		/* The index->online_status may change if the index is
+		being rolled back. It is protected by index->lock. */
+
+ mtr_s_lock_index(index, &mtr);
+
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_COMPLETE:
+ case ONLINE_INDEX_CREATION:
+ break;
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ mtr_commit(&mtr);
+ goto func_exit;
+ }
+
+ }
+
+ /* Build a new index entry */
+ entry = row_build_index_entry(node->upd_row, node->upd_ext,
+ index, heap);
+ ut_a(entry);
+
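+	/* Release the index->lock s-latch that was acquired above for
+	checking the online status; the insert below will acquire its
+	own latches. */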
+ if (uncommitted) {
+ mtr_commit(&mtr);
+ }
+
+ /* Insert new index entry */
+ err = row_ins_sec_index_entry(index, entry, thr, !node->is_delete);
+
+func_exit:
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates the secondary index record if it is changed in the row update or
+deletes it if this is a delete.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_sec_step(
+/*=============*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC)
+ || (node->state == UPD_NODE_UPDATE_SOME_SEC));
+ ut_ad(!dict_index_is_clust(node->index));
+
+ if (node->state == UPD_NODE_UPDATE_ALL_SEC
+ || row_upd_changes_ord_field_binary(node->index, node->update,
+ thr, node->row, node->ext)) {
+ return(row_upd_sec_index_entry(node, thr));
+ }
+
+ return(DB_SUCCESS);
+}
+
+#ifdef UNIV_DEBUG
+# define row_upd_clust_rec_by_insert_inherit(rec,index,offsets,entry,update) \
+ row_upd_clust_rec_by_insert_inherit_func(rec,index,offsets,entry,update)
+#else /* UNIV_DEBUG */
+# define row_upd_clust_rec_by_insert_inherit(rec,index,offsets,entry,update) \
+ row_upd_clust_rec_by_insert_inherit_func(rec,entry,update)
+#endif /* UNIV_DEBUG */
+/*******************************************************************//**
+Mark non-updated off-page columns inherited when the primary key is
+updated. We must mark them as inherited in entry, so that they are not
+freed in a rollback. A limited version of this function used to be
+called btr_cur_mark_dtuple_inherited_extern().
+@return whether any columns were inherited */
+static
+bool
+row_upd_clust_rec_by_insert_inherit_func(
+/*=====================================*/
+ const rec_t* rec, /*!< in: old record, or NULL */
+#ifdef UNIV_DEBUG
+ dict_index_t* index, /*!< in: index, or NULL */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec), or NULL */
+#endif /* UNIV_DEBUG */
+ dtuple_t* entry, /*!< in/out: updated entry to be
+ inserted into the clustered index */
+ const upd_t* update) /*!< in: update vector */
+{
+ bool inherit = false;
+
+ ut_ad(!rec == !offsets);
+ ut_ad(!rec == !index);
+ ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+ ut_ad(!rec || rec_offs_any_extern(offsets));
+
+ for (uint16_t i = 0; i < dtuple_get_n_fields(entry); i++) {
+ dfield_t* dfield = dtuple_get_nth_field(entry, i);
+ byte* data;
+ ulint len;
+
+ ut_ad(!offsets
+ || !rec_offs_nth_extern(offsets, i)
+ == !dfield_is_ext(dfield)
+ || (!dict_index_get_nth_field(index, i)->name
+ && !dfield_is_ext(dfield)
+ && (dfield_is_null(dfield) || dfield->len == 0))
+ || upd_get_field_by_field_no(update, i, false));
+ if (!dfield_is_ext(dfield)
+ || upd_get_field_by_field_no(update, i, false)) {
+ continue;
+ }
+
+#ifdef UNIV_DEBUG
+ if (UNIV_LIKELY(rec != NULL)) {
+ ut_ad(!rec_offs_nth_default(offsets, i));
+ const byte* rec_data
+ = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len == dfield_get_len(dfield));
+ ut_ad(len != UNIV_SQL_NULL);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ rec_data += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* The pointer must not be zero. */
+ ut_ad(memcmp(rec_data, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ /* The BLOB must be owned. */
+ ut_ad(!(rec_data[BTR_EXTERN_LEN]
+ & BTR_EXTERN_OWNER_FLAG));
+ }
+#endif /* UNIV_DEBUG */
+
+ len = dfield_get_len(dfield);
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ data += len - BTR_EXTERN_FIELD_REF_SIZE;
+ /* The pointer must not be zero. */
+ ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
+
+ /* The BLOB must be owned, unless we are resuming from
+ a lock wait and we already had disowned the BLOB. */
+ ut_a(rec == NULL
+ || !(data[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
+ data[BTR_EXTERN_LEN] &= byte(~BTR_EXTERN_OWNER_FLAG);
+ data[BTR_EXTERN_LEN] |= BTR_EXTERN_INHERITED_FLAG;
+ /* The BTR_EXTERN_INHERITED_FLAG only matters in
+ rollback of a fresh insert. Purge will always free
+ the extern fields of a delete-marked row. */
+
+ inherit = true;
+ }
+
+ return(inherit);
+}
+
+/***********************************************************//**
+Marks the clustered index record deleted and inserts the updated version
+of the record to the index. This function should be used when the ordering
+fields of the clustered index record change. This should be quite rare in
+database applications.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_clust_rec_by_insert(
+/*========================*/
+ upd_node_t* node, /*!< in/out: row update node */
+ dict_index_t* index, /*!< in: clustered index of the record */
+ que_thr_t* thr, /*!< in: query thread */
+ bool referenced,/*!< in: whether index may be referenced in
+ a foreign key constraint */
+#ifdef WITH_WSREP
+ bool foreign,/*!< in: whether this is a foreign key */
+#endif
+ mtr_t* mtr) /*!< in/out: mini-transaction,
+ may be committed and restarted */
+{
+ mem_heap_t* heap;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ trx_t* trx;
+ dict_table_t* table;
+ dtuple_t* entry;
+ dberr_t err;
+ rec_t* rec;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ ut_ad(dict_index_is_clust(index));
+
+ rec_offs_init(offsets_);
+
+ trx = thr_get_trx(thr);
+ table = node->table;
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ heap = mem_heap_create(1000);
+
+ entry = row_build_index_entry_low(node->upd_row, node->upd_ext,
+ index, heap, ROW_BUILD_FOR_INSERT);
+ if (index->is_instant()) entry->trim(*index);
+ ut_ad(dtuple_get_info_bits(entry) == 0);
+
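+	/* Stamp the new clustered index entry with the identifier of
+	the updating transaction. */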
+ {
+ dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id());
+ ut_ad(t->len == DATA_TRX_ID_LEN);
+ trx_write_trx_id(static_cast<byte*>(t->data), trx->id);
+ }
+
+ switch (node->state) {
+ default:
+ ut_error;
+ case UPD_NODE_INSERT_CLUSTERED:
+ /* A lock wait occurred in row_ins_clust_index_entry() in
+ the previous invocation of this function. */
+ row_upd_clust_rec_by_insert_inherit(
+ NULL, NULL, NULL, entry, node->update);
+ break;
+ case UPD_NODE_UPDATE_CLUSTERED:
+ /* This is the first invocation of the function where
+ we update the primary key. Delete-mark the old record
+ in the clustered index and prepare to insert a new entry. */
+ rec = btr_cur_get_rec(btr_cur);
+ offsets = rec_get_offsets(rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ ut_ad(page_rec_is_user_rec(rec));
+
+ if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+ /* If the clustered index record is already delete
+ marked, then we are here after a DB_LOCK_WAIT.
+ Skip delete marking clustered index and disowning
+ its blobs. */
+ ut_ad(row_get_rec_trx_id(rec, index, offsets)
+ == trx->id);
+ ut_ad(!trx_undo_roll_ptr_is_insert(
+ row_get_rec_roll_ptr(rec, index,
+ offsets)));
+ goto check_fk;
+ }
+
+ err = btr_cur_del_mark_set_clust_rec(
+ btr_cur_get_block(btr_cur), rec, index, offsets,
+ thr, node->row, mtr);
+ if (err != DB_SUCCESS) {
+ goto err_exit;
+ }
+
+		/* If the new row inherits externally stored
+ fields (off-page columns a.k.a. BLOBs) from the
+ delete-marked old record, mark them disowned by the
+ old record and owned by the new entry. */
+
+ if (rec_offs_any_extern(offsets)) {
+ if (row_upd_clust_rec_by_insert_inherit(
+ rec, index, offsets,
+ entry, node->update)) {
+ /* The blobs are disowned here, expecting the
+ insert down below to inherit them. But if the
+ insert fails, then this disown will be undone
+ when the operation is rolled back. */
+ btr_cur_disown_inherited_fields(
+ btr_cur_get_block(btr_cur),
+ rec, index, offsets, node->update,
+ mtr);
+ }
+ }
+check_fk:
+ if (referenced) {
+ /* NOTE that the following call loses
+ the position of pcur ! */
+
+ err = row_upd_check_references_constraints(
+ node, pcur, table, index, offsets, thr, mtr);
+
+ if (err != DB_SUCCESS) {
+ goto err_exit;
+ }
+#ifdef WITH_WSREP
+ } else if (foreign && wsrep_must_process_fk(node, trx)) {
+ err = wsrep_row_upd_check_foreign_constraints(
+ node, pcur, table, index, offsets, thr, mtr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_NO_REFERENCED_ROW:
+ err = DB_SUCCESS;
+ break;
+ case DB_LOCK_WAIT:
+ case DB_DEADLOCK:
+ case DB_LOCK_WAIT_TIMEOUT:
+ WSREP_DEBUG("Foreign key check fail: "
+ "%s on table %s index %s query %s",
+ ut_strerr(err), index->name(), index->table->name.m_name,
+ wsrep_thd_query(trx->mysql_thd));
+
+ goto err_exit;
+ default:
+ WSREP_ERROR("Foreign key check fail: "
+ "%s on table %s index %s query %s",
+ ut_strerr(err), index->name(), index->table->name.m_name,
+ wsrep_thd_query(trx->mysql_thd));
+
+ goto err_exit;
+ }
+#endif /* WITH_WSREP */
+ }
+ }
+
+ mtr->commit();
+ mtr->start();
+
+ node->state = UPD_NODE_INSERT_CLUSTERED;
+ err = row_ins_clust_index_entry(index, entry, thr,
+ dtuple_get_n_ext(entry));
+err_exit:
+ mem_heap_free(heap);
+ return(err);
+}
+
+/***********************************************************//**
+Updates a clustered index record of a row when the ordering fields do
+not change.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_clust_rec(
+/*==============*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ upd_node_t* node, /*!< in: row update node */
+ dict_index_t* index, /*!< in: clustered index */
+ rec_offs* offsets,/*!< in: rec_get_offsets() on node->pcur */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: memory heap, can be emptied */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in,out: mini-transaction; may be
+ committed and restarted here */
+{
+ mem_heap_t* heap = NULL;
+ big_rec_t* big_rec = NULL;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ dberr_t err;
+ const dtuple_t* rebuilt_old_pk = NULL;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!thr_get_trx(thr)->in_rollback);
+ ut_ad(!node->table->skip_alter_undo);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ ut_ad(btr_cur_get_index(btr_cur) == index);
+ ut_ad(!rec_get_deleted_flag(btr_cur_get_rec(btr_cur),
+ dict_table_is_comp(index->table)));
+ ut_ad(rec_offs_validate(btr_cur_get_rec(btr_cur), index, offsets));
+
+ if (dict_index_is_online_ddl(index)) {
+ rebuilt_old_pk = row_log_table_get_pk(
+ btr_cur_get_rec(btr_cur), index, offsets, NULL, &heap);
+ }
+
+ /* Try optimistic updating of the record, keeping changes within
+ the page; we do not check locks because we assume the x-lock on the
+ record to update */
+
+ if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) {
+ err = btr_cur_update_in_place(
+ flags | BTR_NO_LOCKING_FLAG, btr_cur,
+ offsets, node->update,
+ node->cmpl_info, thr, thr_get_trx(thr)->id, mtr);
+ } else {
+ err = btr_cur_optimistic_update(
+ flags | BTR_NO_LOCKING_FLAG, btr_cur,
+ &offsets, offsets_heap, node->update,
+ node->cmpl_info, thr, thr_get_trx(thr)->id, mtr);
+ }
+
+ if (err == DB_SUCCESS) {
+ goto success;
+ }
+
+ if (buf_pool.running_out()) {
+ err = DB_LOCK_TABLE_FULL;
+ goto func_exit;
+ }
+
+ /* We may have to modify the tree structure: do a pessimistic descent
+ down the index tree */
+
+ mtr->commit();
+ mtr->start();
+
+ if (index->table->is_temporary()) {
+ /* Disable locking, because temporary tables are never
+ shared between transactions or connections. */
+ flags |= BTR_NO_LOCKING_FLAG;
+ mtr->set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(*mtr);
+ }
+
+ /* NOTE: this transaction has an s-lock or x-lock on the record and
+ therefore other transactions cannot modify the record when we have no
+ latch on the page. In addition, we assume that other query threads of
+ the same transaction do not modify the record in the meantime.
+ Therefore we can assert that the restoration of the cursor succeeds. */
+
+ ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+
+ ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(index->table)));
+
+ if (!heap) {
+ heap = mem_heap_create(1024);
+ }
+
+ err = btr_cur_pessimistic_update(
+ flags | BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur,
+ &offsets, offsets_heap, heap, &big_rec,
+ node->update, node->cmpl_info,
+ thr, thr_get_trx(thr)->id, mtr);
+ if (big_rec) {
+ ut_a(err == DB_SUCCESS);
+
+ DEBUG_SYNC_C("before_row_upd_extern");
+ err = btr_store_big_rec_extern_fields(
+ pcur, offsets, big_rec, mtr, BTR_STORE_UPDATE);
+ DEBUG_SYNC_C("after_row_upd_extern");
+ }
+
+ if (err == DB_SUCCESS) {
+success:
+ if (dict_index_is_online_ddl(index)) {
+ row_log_table_update(
+ btr_cur_get_rec(btr_cur),
+ index, offsets, rebuilt_old_pk);
+ }
+ }
+
+func_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ if (big_rec) {
+ dtuple_big_rec_free(big_rec);
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Delete marks a clustered index record.
+@return DB_SUCCESS if operation successfully completed, else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_del_mark_clust_rec(
+/*=======================*/
+ upd_node_t* node, /*!< in: row update node */
+ dict_index_t* index, /*!< in: clustered index */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets() for the
+ record under the cursor */
+ que_thr_t* thr, /*!< in: query thread */
+ bool referenced,
+ /*!< in: whether index may be referenced in
+ a foreign key constraint */
+#ifdef WITH_WSREP
+ bool foreign,/*!< in: whether this is a foreign key */
+#endif
+ mtr_t* mtr) /*!< in,out: mini-transaction;
+ will be committed and restarted */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ rec_t* rec;
+ trx_t* trx = thr_get_trx(thr);
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(node->is_delete == PLAIN_DELETE);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+	/* Store the row, because we also have to build the secondary
+	index entries */
+
+ if (!row_upd_store_row(node, trx->mysql_thd,
+ thr->prebuilt && thr->prebuilt->table == node->table
+ ? thr->prebuilt->m_mysql_table : NULL)) {
+ return DB_COMPUTE_VALUE_FAILED;
+ }
+
+ /* Mark the clustered index record deleted; we do not have to check
+ locks, because we assume that we have an x-lock on the record */
+
+ rec = btr_cur_get_rec(btr_cur);
+
+ dberr_t err = btr_cur_del_mark_set_clust_rec(
+ btr_cur_get_block(btr_cur), rec,
+ index, offsets, thr, node->row, mtr);
+
+ if (err != DB_SUCCESS) {
+ } else if (referenced) {
+ /* NOTE that the following call loses the position of pcur ! */
+
+ err = row_upd_check_references_constraints(
+ node, pcur, index->table, index, offsets, thr, mtr);
+#ifdef WITH_WSREP
+ } else if (foreign && wsrep_must_process_fk(node, trx)) {
+ err = wsrep_row_upd_check_foreign_constraints(
+ node, pcur, index->table, index, offsets, thr, mtr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_NO_REFERENCED_ROW:
+ err = DB_SUCCESS;
+ break;
+ case DB_LOCK_WAIT:
+ case DB_DEADLOCK:
+ case DB_LOCK_WAIT_TIMEOUT:
+			WSREP_DEBUG("Foreign key check fail: "
+				"%s on table %s index %s query %s",
+				ut_strerr(err), index->name(), index->table->name.m_name,
+				wsrep_thd_query(trx->mysql_thd));
+			break;
+		default:
+			WSREP_ERROR("Foreign key check fail: "
+				"%s on table %s index %s query %s",
+				ut_strerr(err), index->name(), index->table->name.m_name,
+				wsrep_thd_query(trx->mysql_thd));
+ break;
+ }
+#endif /* WITH_WSREP */
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates the clustered index record.
+@return DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT
+in case of a lock wait, else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_clust_step(
+/*===============*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ ibool success;
+ dberr_t err;
+ mtr_t mtr;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets;
+ ulint flags;
+ trx_t* trx = thr_get_trx(thr);
+
+ rec_offs_init(offsets_);
+
+ index = dict_table_get_first_index(node->table);
+
+ const bool referenced = row_upd_index_is_referenced(index, trx);
+#ifdef WITH_WSREP
+ const bool foreign = wsrep_row_upd_index_is_foreign(index, trx);
+#endif
+
+ pcur = node->pcur;
+
+ /* We have to restore the cursor to its position */
+
+ mtr.start();
+
+ if (node->table->is_temporary()) {
+ /* Disable locking, because temporary tables are
+ private to the connection (no concurrent access). */
+ flags = node->table->no_rollback()
+ ? BTR_NO_ROLLBACK
+ : BTR_NO_LOCKING_FLAG;
+ /* Redo logging only matters for persistent tables. */
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ flags = node->table->no_rollback() ? BTR_NO_ROLLBACK : 0;
+ index->set_modified(mtr);
+ }
+
+ /* If the restoration does not succeed, then the same
+ transaction has deleted the record on which the cursor was,
+ and that is an SQL error. If the restoration succeeds, it may
+ still be that the same transaction has successively deleted
+ and inserted a record with the same ordering fields, but in
+ that case we know that the transaction has at least an
+ implicit x-lock on the record. */
+
+ ut_a(pcur->rel_pos == BTR_PCUR_ON);
+
+ ulint mode;
+
+ DEBUG_SYNC_C_IF_THD(trx->mysql_thd, "innodb_row_upd_clust_step_enter");
+
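+	/* If the clustered index is being rebuilt online, acquire the
+	index s-latch up front so that index->online_status cannot
+	change while the cursor is being restored. */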
+ if (dict_index_is_online_ddl(index)) {
+ ut_ad(node->table->id != DICT_INDEXES_ID);
+ mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+ mtr_s_lock_index(index, &mtr);
+ } else {
+ mode = BTR_MODIFY_LEAF;
+ }
+
+ success = btr_pcur_restore_position(mode, pcur, &mtr);
+
+ if (!success) {
+ err = DB_RECORD_NOT_FOUND;
+ goto exit_func;
+ }
+
+	/* If this is a row in the SYS_INDEXES table of the data dictionary,
+ then we have to free the file segments of the index tree associated
+ with the index */
+
+ if (node->is_delete == PLAIN_DELETE
+ && node->table->id == DICT_INDEXES_ID) {
+
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ dict_drop_index_tree(pcur, trx, &mtr);
+
+ mtr.commit();
+
+ mtr.start();
+ index->set_modified(mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur,
+ &mtr);
+ if (!success) {
+ err = DB_ERROR;
+
+ mtr.commit();
+
+ return(err);
+ }
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ offsets = rec_get_offsets(rec, index, offsets_, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (!flags && !node->has_clust_rec_x_lock) {
+ err = lock_clust_rec_modify_check_and_lock(
+ 0, btr_pcur_get_block(pcur),
+ rec, index, offsets, thr);
+ if (err != DB_SUCCESS) {
+ goto exit_func;
+ }
+ }
+
+ ut_ad(index->table->no_rollback() || index->table->is_temporary()
+ || row_get_rec_trx_id(rec, index, offsets) == trx->id
+ || lock_trx_has_expl_x_lock(trx, index->table,
+ btr_pcur_get_block(pcur),
+ page_rec_get_heap_no(rec)));
+
+ if (node->is_delete == PLAIN_DELETE) {
+ err = row_upd_del_mark_clust_rec(
+ node, index, offsets, thr, referenced,
+#ifdef WITH_WSREP
+ foreign,
+#endif
+ &mtr);
+ goto all_done;
+ }
+
+ /* If the update is made for MySQL, we already have the update vector
+ ready, else we have to do some evaluation: */
+
+ if (UNIV_UNLIKELY(!node->in_mysql_interface)) {
+ /* Copy the necessary columns from clust_rec and calculate the
+ new values to set */
+ row_upd_copy_columns(rec, offsets, index,
+ UT_LIST_GET_FIRST(node->columns));
+ row_upd_eval_new_vals(node->update);
+ }
+
+ if (!node->is_delete && node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+ err = row_upd_clust_rec(
+ flags, node, index, offsets, &heap, thr, &mtr);
+ goto exit_func;
+ }
+
+ if (!row_upd_store_row(node, trx->mysql_thd, thr->prebuilt
+ ? thr->prebuilt->m_mysql_table : NULL)) {
+ err = DB_COMPUTE_VALUE_FAILED;
+ goto exit_func;
+ }
+
+ if (row_upd_changes_ord_field_binary(index, node->update, thr,
+ node->row, node->ext)) {
+
+ /* Update causes an ordering field (ordering fields within
+ the B-tree) of the clustered index record to change: perform
+ the update by delete marking and inserting.
+
+		TODO! What to do about the 'Halloween problem', where an
+		update moves the record forward in the index so that it is
+		again updated when the cursor arrives there? Solution: the
+		read operation must check the undo record undo number when
+		choosing records to update. MySQL currently solves the
+		problem externally! */
+
+ err = row_upd_clust_rec_by_insert(
+ node, index, thr, referenced,
+#ifdef WITH_WSREP
+ foreign,
+#endif
+ &mtr);
+all_done:
+ if (err == DB_SUCCESS) {
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+success:
+ node->index = dict_table_get_next_index(index);
+ }
+ } else {
+ err = row_upd_clust_rec(
+ flags, node, index, offsets, &heap, thr, &mtr);
+
+ if (err == DB_SUCCESS) {
+ ut_ad(node->is_delete != PLAIN_DELETE);
+ node->state = node->is_delete
+ ? UPD_NODE_UPDATE_ALL_SEC
+ : UPD_NODE_UPDATE_SOME_SEC;
+ goto success;
+ }
+ }
+
+exit_func:
+ mtr.commit();
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return err;
+}
+
+/***********************************************************//**
+Updates the affected index records of a row. When the control is transferred
+to this node, we assume that we have a persistent cursor which was on a
+record, and the position of the cursor is stored in the cursor.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+dberr_t
+row_upd(
+/*====*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err = DB_SUCCESS;
+ DBUG_ENTER("row_upd");
+
+ ut_ad(!thr_get_trx(thr)->in_rollback);
+
+ DBUG_PRINT("row_upd", ("table: %s", node->table->name.m_name));
+ DBUG_PRINT("row_upd", ("info bits in update vector: 0x%x",
+ node->update ? node->update->info_bits: 0));
+ DBUG_PRINT("row_upd", ("foreign_id: %s",
+ node->foreign ? node->foreign->id: "NULL"));
+
+ if (UNIV_LIKELY(node->in_mysql_interface)) {
+
+ /* We do not get the cmpl_info value from the MySQL
+ interpreter: we must calculate it on the fly: */
+
+ if (node->is_delete == PLAIN_DELETE
+ || row_upd_changes_some_index_ord_field_binary(
+ node->table, node->update)) {
+ node->cmpl_info = 0;
+ } else {
+ node->cmpl_info = UPD_NODE_NO_ORD_CHANGE;
+ }
+ }
+
+ switch (node->state) {
+ case UPD_NODE_UPDATE_CLUSTERED:
+ case UPD_NODE_INSERT_CLUSTERED:
+ log_free_check();
+
+ err = row_upd_clust_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ DBUG_RETURN(err);
+ }
+ }
+
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "after_row_upd_clust");
+
+ if (node->index == NULL
+ || (!node->is_delete
+ && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) {
+
+ DBUG_RETURN(DB_SUCCESS);
+ }
+
+ DBUG_EXECUTE_IF("row_upd_skip_sec", node->index = NULL;);
+
+ do {
+ /* Skip corrupted index */
+ dict_table_skip_corrupt_index(node->index);
+
+ if (!node->index) {
+ break;
+ }
+
+ if (node->index->type != DICT_FTS) {
+ err = row_upd_sec_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ DBUG_RETURN(err);
+ }
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ } while (node->index != NULL);
+
+ ut_ad(err == DB_SUCCESS);
+
+ /* Do some cleanup */
+
+ if (node->row != NULL) {
+ node->row = NULL;
+ node->ext = NULL;
+ node->upd_row = NULL;
+ node->upd_ext = NULL;
+ mem_heap_empty(node->heap);
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ DBUG_RETURN(err);
+}
+
+/***********************************************************//**
+Updates a row in a table. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_upd_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ upd_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* parent;
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx;
+ DBUG_ENTER("row_upd_step");
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ node = static_cast<upd_node_t*>(thr->run_node);
+
+ sel_node = node->select;
+
+ parent = que_node_get_parent(node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+ if (thr->prev_node == parent) {
+ node->state = UPD_NODE_SET_IX_LOCK;
+ }
+
+ if (node->state == UPD_NODE_SET_IX_LOCK) {
+
+ if (!node->has_clust_rec_x_lock) {
+ /* It may be that the current session has not yet
+ started its transaction, or it has been committed: */
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ if (node->searched_update) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to update */
+
+ thr->run_node = sel_node;
+
+ DBUG_RETURN(thr);
+ }
+ }
+
+ /* sel_node is NULL if we are in the MySQL interface */
+
+ if (sel_node && (sel_node->state != SEL_NODE_FETCH)) {
+
+ if (!node->searched_update) {
+ /* An explicit cursor should be positioned on a row
+ to update */
+
+ ut_error;
+
+ err = DB_ERROR;
+
+ goto error_handling;
+ }
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to update, or the select node performed the
+ updates directly in-place */
+
+ thr->run_node = parent;
+
+ DBUG_RETURN(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_upd(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ DBUG_RETURN(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->searched_update) {
+ /* Fetch next row to update */
+
+ thr->run_node = sel_node;
+ } else {
+ /* It was an explicit cursor update */
+
+ thr->run_node = parent;
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ DBUG_RETURN(thr);
+}
+
+/** Write query start time as SQL field data to a buffer. Needed by InnoDB.
+@param thd Thread object
+@param buf Buffer to hold start time data */
+void thd_get_query_start_data(THD *thd, char *buf);
+
+/** Appends a row_start or row_end field to the update vector and assigns
+it a CURRENT_TIMESTAMP/trx->id value.
+Supposed to be called only by make_versioned_update() and
+make_versioned_delete().
+@param[in] trx transaction
+@param[in] vers_sys_idx table->row_start or table->row_end */
+void upd_node_t::vers_update_fields(const trx_t *trx, ulint idx)
+{
+ ut_ad(in_mysql_interface); // otherwise needs to recalculate node->cmpl_info
+ ut_ad(idx == table->vers_start || idx == table->vers_end);
+
+ dict_index_t *clust_index= dict_table_get_first_index(table);
+ const dict_col_t *col= dict_table_get_nth_col(table, idx);
+ ulint field_no= dict_col_get_clust_pos(col, clust_index);
+ upd_field_t *ufield;
+
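+  /* If the update vector already contains the row_start/row_end
+  column, reuse that field instead of appending a new one. */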
+ for (ulint i= 0; i < update->n_fields; ++i)
+ {
+ if (update->fields[i].field_no == field_no)
+ {
+ ufield= &update->fields[i];
+ goto skip_append;
+ }
+ }
+
+ /* row_create_update_node_for_mysql() pre-allocated this much.
+ At least one PK column always remains unchanged. */
+ ut_ad(update->n_fields < ulint(table->n_cols + table->n_v_cols));
+
+ update->n_fields++;
+ ufield= upd_get_nth_field(update, update->n_fields - 1);
+ upd_field_set_field_no(ufield, static_cast<uint16_t>(field_no), clust_index);
+
+skip_append:
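+  /* Write either the transaction identifier (TRX_ID-based system
+  versioning) or the query start timestamp into the pre-allocated
+  buffer. */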
+ char *where= reinterpret_cast<char *>(update->vers_sys_value);
+ if (col->vers_native())
+ mach_write_to_8(where, trx->id);
+ else
+ thd_get_query_start_data(trx->mysql_thd, where);
+
+ dfield_set_data(&ufield->new_val, update->vers_sys_value, col->len);
+
+ for (ulint col_no= 0; col_no < dict_table_get_n_v_cols(table); col_no++)
+ {
+ const dict_v_col_t *v_col= dict_table_get_nth_v_col(table, col_no);
+ if (!v_col->m_col.ord_part)
+ continue;
+ for (ulint i= 0; i < unsigned(v_col->num_base); i++)
+ {
+ dict_col_t *base_col= v_col->base_col[i];
+ if (base_col->ind == col->ind)
+ {
+        /* The virtual column depends on the system field value
+        which we updated above. Remove it from the update vector,
+        so that it is recalculated in row_upd_store_v_row()
+        (see the !update branch). */
+ update->remove(v_col->v_pos);
+ break;
+ }
+ }
+ }
+}
diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc
new file mode 100644
index 00000000..b7378607
--- /dev/null
+++ b/storage/innobase/row/row0vers.cc
@@ -0,0 +1,1353 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0vers.cc
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0vers.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "row0mysql.h"
+
+/** Check whether all non-virtual index fields are equal.
+@param[in] index the secondary index
+@param[in] a first index entry to compare
+@param[in] b second index entry to compare
+@return whether all non-virtual fields are equal */
+static
+bool
+row_vers_non_virtual_fields_equal(
+ const dict_index_t* index,
+ const dfield_t* a,
+ const dfield_t* b)
+{
+ const dict_field_t* end = &index->fields[index->n_fields];
+
+ for (const dict_field_t* ifield = index->fields; ifield != end;
+ ifield++) {
+ if (!ifield->col->is_virtual()
+ && cmp_dfield_dfield(a++, b++)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/** Determine if an active transaction has inserted or modified a secondary
+index record.
+@param[in,out] caller_trx trx of current thread
+@param[in] clust_rec clustered index record
+@param[in] clust_index clustered index
+@param[in] rec secondary index record
+@param[in] index secondary index
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in,out] mtr mini-transaction
+@return the active transaction; state must be rechecked after
+trx_mutex_enter(), and trx->release_reference() must be invoked
+@retval NULL if the record was committed */
+UNIV_INLINE
+trx_t*
+row_vers_impl_x_locked_low(
+ trx_t* caller_trx,
+ const rec_t* clust_rec,
+ dict_index_t* clust_index,
+ const rec_t* rec,
+ dict_index_t* index,
+ const rec_offs* offsets,
+ mtr_t* mtr)
+{
+ trx_id_t trx_id;
+ rec_t* prev_version = NULL;
+ rec_offs clust_offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* clust_offsets;
+ mem_heap_t* heap;
+ dtuple_t* ientry = NULL;
+ mem_heap_t* v_heap = NULL;
+ dtuple_t* cur_vrow = NULL;
+
+ rec_offs_init(clust_offsets_);
+
+ DBUG_ENTER("row_vers_impl_x_locked_low");
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
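+	/* If DB_TRX_ID is stored at a fixed offset in the clustered
+	index record, check it before computing the full offsets. */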
+ if (ulint trx_id_offset = clust_index->trx_id_offset) {
+ trx_id = mach_read_from_6(clust_rec + trx_id_offset);
+ if (trx_id == 0) {
+ /* The transaction history was already purged. */
+ DBUG_RETURN(0);
+ }
+ }
+
+ heap = mem_heap_create(1024);
+
+ clust_offsets = rec_get_offsets(clust_rec, clust_index, clust_offsets_,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);
+ if (trx_id == 0) {
+ /* The transaction history was already purged. */
+ mem_heap_free(heap);
+ DBUG_RETURN(0);
+ }
+
+ ut_ad(!clust_index->table->is_temporary());
+
+ trx_t* trx;
+
+ if (trx_id == caller_trx->id) {
+ trx = caller_trx;
+ trx->reference();
+ } else {
+ trx = trx_sys.find(caller_trx, trx_id);
+ if (trx == 0) {
+ /* The transaction that modified or inserted
+ clust_rec is no longer active, or it is
+ corrupt: no implicit lock on rec */
+ lock_check_trx_id_sanity(trx_id, clust_rec,
+ clust_index, clust_offsets);
+ mem_heap_free(heap);
+ DBUG_RETURN(0);
+ }
+ }
+
+ const ulint comp = page_rec_is_comp(rec);
+ ut_ad(index->table == clust_index->table);
+ ut_ad(!!comp == dict_table_is_comp(index->table));
+ ut_ad(!comp == !page_rec_is_comp(clust_rec));
+
+ const ulint rec_del = rec_get_deleted_flag(rec, comp);
+
+ if (dict_index_has_virtual(index)) {
+ ulint est_size = DTUPLE_EST_ALLOC(index->n_fields);
+
+		/* Allocate the dtuple for virtual columns extracted from
+		the undo log with its own heap, so as to avoid it being
+		freed while we iterate in the version loop below. */
+ v_heap = mem_heap_create(est_size);
+ ientry = row_rec_to_index_entry(rec, index, offsets, v_heap);
+ }
+
+ /* We look up if some earlier version, which was modified by
+ the trx_id transaction, of the clustered index record would
+ require rec to be in a different state (delete marked or
+ unmarked, or have different field values, or not existing). If
+ there is such a version, then rec was modified by the trx_id
+ transaction, and it has an implicit x-lock on rec. Note that
+ if clust_rec itself would require rec to be in a different
+ state, then the trx_id transaction has not yet had time to
+ modify rec, and does not necessarily have an implicit x-lock
+ on rec. */
+
+ for (const rec_t* version = clust_rec;; version = prev_version) {
+ row_ext_t* ext;
+ dtuple_t* row;
+ dtuple_t* entry;
+ ulint vers_del;
+ trx_id_t prev_trx_id;
+ mem_heap_t* old_heap = heap;
+ dtuple_t* vrow = NULL;
+
+ /* We keep the semaphore in mtr on the clust_rec page, so
+ that no other transaction can update it and get an
+ implicit x-lock on rec until mtr_commit(mtr). */
+
+ heap = mem_heap_create(1024);
+
+ trx_undo_prev_version_build(
+ clust_rec, mtr, version, clust_index, clust_offsets,
+ heap, &prev_version, NULL,
+ dict_index_has_virtual(index) ? &vrow : NULL, 0);
+
+ trx_mutex_enter(trx);
+ const bool committed = trx_state_eq(
+ trx, TRX_STATE_COMMITTED_IN_MEMORY);
+ trx_mutex_exit(trx);
+
+ /* The oldest visible clustered index version must not be
+ delete-marked, because we never start a transaction by
+ inserting a delete-marked record. */
+ ut_ad(committed || prev_version
+ || !rec_get_deleted_flag(version, comp));
+
+ /* Free version and clust_offsets. */
+ mem_heap_free(old_heap);
+
+ if (committed) {
+ goto not_locked;
+ }
+
+ if (prev_version == NULL) {
+
+ /* We reached the oldest visible version without
+ finding an older version of clust_rec that would
+ match the secondary index record. If the secondary
+ index record is not delete marked, then clust_rec
+ is considered the correct match of the secondary
+ index record and hence holds the implicit lock. */
+
+ if (rec_del) {
+ /* The secondary index record is del marked.
+ So, the implicit lock holder of clust_rec
+ did not modify the secondary index record yet,
+ and is not holding an implicit lock on it.
+
+ This assumes that whenever a row is inserted
+ or updated, the leaf page record always is
+ created with a clear delete-mark flag.
+ (We never insert a delete-marked record.) */
+not_locked:
+ trx->release_reference();
+ trx = 0;
+ }
+
+ break;
+ }
+
+ clust_offsets = rec_get_offsets(
+ prev_version, clust_index, clust_offsets_,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ vers_del = rec_get_deleted_flag(prev_version, comp);
+
+ prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
+ clust_offsets);
+
+ /* The stack of versions is locked by mtr. Thus, it
+ is safe to fetch the prefixes for externally stored
+ columns. */
+
+ row = row_build(ROW_COPY_POINTERS, clust_index, prev_version,
+ clust_offsets,
+ NULL, NULL, NULL, &ext, heap);
+
+ if (dict_index_has_virtual(index)) {
+ if (vrow) {
+ /* Keep the virtual row info for the next
+ version */
+ cur_vrow = dtuple_copy(vrow, v_heap);
+ dtuple_dup_v_fld(cur_vrow, v_heap);
+ }
+
+ if (!cur_vrow) {
+ /* Build index entry out of row */
+ entry = row_build_index_entry(row, ext, index,
+ heap);
+
+ /* entry could only be NULL (the
+ clustered index record could contain
+ BLOB pointers that are NULL) if we
+ were accessing a freshly inserted
+ record before it was fully inserted.
+ prev_version cannot possibly be such
+ an incomplete record, because its
+ transaction would have to be committed
+ in order for later versions of the
+ record to be able to exist. */
+ ut_ad(entry);
+
+				/* If the indexed virtual columns have
+				changed, there must be an undo log record
+				to generate vrow. Otherwise, they are
+				unchanged, so there is no need to compare. */
+ if (!row_vers_non_virtual_fields_equal(
+ index,
+ ientry->fields, entry->fields)) {
+ if (rec_del != vers_del) {
+ break;
+ }
+ } else if (!rec_del) {
+ break;
+ }
+
+ goto result_check;
+ } else {
+ ut_ad(row->n_v_fields == cur_vrow->n_v_fields);
+ dtuple_copy_v_fields(row, cur_vrow);
+ }
+ }
+
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* entry could only be NULL (the clustered index
+ record could contain BLOB pointers that are NULL) if
+ we were accessing a freshly inserted record before it
+ was fully inserted. prev_version cannot possibly be
+ such an incomplete record, because its transaction
+ would have to be committed in order for later versions
+ of the record to be able to exist. */
+ ut_ad(entry);
+
+ /* If we get here, we know that the trx_id transaction
+ modified prev_version. Let us check if prev_version
+ would require rec to be in a different state. */
+
+ /* The previous version of clust_rec must be
+ accessible, because clust_rec was not a fresh insert.
+ There is no guarantee that the transaction is still
+ active. */
+
+		/* We check if entry and rec are identical in the
+		alphabetical ordering */
+ if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
+ /* The delete marks of rec and prev_version should be
+ equal for rec to be in the state required by
+ prev_version */
+
+ if (rec_del != vers_del) {
+
+ break;
+ }
+
+			/* It is possible that the row was updated so that
+			the secondary index record remained the same in
+			the alphabetical ordering, but the field values
+			still changed. For example, 'abc' -> 'ABC'.
+			Check that also. */
+
+ dtuple_set_types_binary(
+ entry, dtuple_get_n_fields(entry));
+
+ if (0 != cmp_dtuple_rec(entry, rec, offsets)) {
+
+ break;
+ }
+
+ } else if (!rec_del) {
+ /* The delete mark should be set in rec for it to be
+ in the state required by prev_version */
+
+ break;
+ }
+
+result_check:
+ if (trx->id != prev_trx_id) {
+ /* prev_version was the first version modified by
+ the trx_id transaction: no implicit x-lock */
+ goto not_locked;
+ }
+ }
+
+ if (trx) {
+ DBUG_PRINT("info", ("Implicit lock is held by trx:" TRX_ID_FMT,
+ trx_id));
+ }
+
+ if (v_heap != NULL) {
+ mem_heap_free(v_heap);
+ }
+
+ mem_heap_free(heap);
+ DBUG_RETURN(trx);
+}
+
+/** Determine if an active transaction has inserted or modified a secondary
+index record.
+@param[in,out] caller_trx trx of current thread
+@param[in] rec secondary index record
+@param[in] index secondary index
+@param[in] offsets rec_get_offsets(rec, index)
+@return the active transaction; state must be rechecked after
+trx_mutex_enter(), and trx->release_reference() must be invoked
+@retval NULL if the record was committed */
+trx_t*
+row_vers_impl_x_locked(
+ trx_t* caller_trx,
+ const rec_t* rec,
+ dict_index_t* index,
+ const rec_offs* offsets)
+{
+ mtr_t mtr;
+ trx_t* trx;
+ const rec_t* clust_rec;
+ dict_index_t* clust_index;
+
+ ut_ad(!lock_mutex_own());
+
+ mtr_start(&mtr);
+
+ /* Search for the clustered index record. The latch on the
+ page of clust_rec locks the top of the stack of versions. The
+ bottom of the version stack is not locked; oldest versions may
+ disappear by the fact that transactions may be committed and
+ collected by the purge. This is not a problem, because we are
+ only interested in active transactions. */
+
+ clust_rec = row_get_clust_rec(
+ BTR_SEARCH_LEAF, rec, index, &clust_index, &mtr);
+
+ if (!clust_rec) {
+ /* In a rare case it is possible that no clust rec is found
+ for a secondary index record: if in row0umod.cc
+ row_undo_mod_remove_clust_low() we have already removed the
+ clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case there cannot be
+ any implicit lock on the secondary index record, because
+ an active transaction which has modified the secondary index
+ record has also modified the clustered index record. And in
+ a rollback we always undo the modifications to secondary index
+ records before the clustered index record. */
+
+ trx = 0;
+ } else {
+ trx = row_vers_impl_x_locked_low(
+ caller_trx, clust_rec, clust_index, rec, index,
+ offsets, &mtr);
+
+ ut_ad(trx == 0 || trx->is_referenced());
+ }
+
+ mtr_commit(&mtr);
+
+ return(trx);
+}
+
+/** Build virtual column values from the current clustered index record data
+@param[in,out] row the clustered index row in dtuple form
+@param[in] clust_index clustered index
+@param[in] index the secondary index
+@param[in] heap heap used to build the virtual dtuple
+@return true on success, false if computing a virtual column value fails */
+static
+bool
+row_vers_build_clust_v_col(
+ dtuple_t* row,
+ dict_index_t* clust_index,
+ dict_index_t* index,
+ mem_heap_t* heap)
+{
+ THD* thd= current_thd;
+ TABLE* maria_table= 0;
+
+ ut_ad(dict_index_has_virtual(index));
+ ut_ad(index->table == clust_index->table);
+
+ DEBUG_SYNC(current_thd, "ib_clust_v_col_before_row_allocated");
+
+ ib_vcol_row vc(nullptr);
+ byte *record = vc.record(thd, index, &maria_table);
+
+ ut_ad(maria_table);
+
+ for (ulint i = 0; i < dict_index_get_n_fields(index); i++) {
+ const dict_col_t* c = dict_index_get_nth_col(index, i);
+
+ if (c->is_virtual()) {
+ const dict_v_col_t* col
+ = reinterpret_cast<const dict_v_col_t*>(c);
+
+ dfield_t *vfield = innobase_get_computed_value(
+ row, col, clust_index, &vc.heap,
+ heap, NULL, thd, maria_table, record, NULL,
+ NULL, NULL);
+ if (!vfield) {
+ innobase_report_computed_value_failed(row);
+ ut_ad(0);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+/** Build latest virtual column data from undo log
+@param[in] in_purge whether this is the purge thread
+@param[in] rec clustered index record
+@param[in] clust_index clustered index
+@param[in,out] clust_offsets offsets on the clustered index record
+@param[in] index the secondary index
+@param[in] roll_ptr the rollback pointer for the purging record
+@param[in] trx_id trx id for the purging record
+@param[in,out] v_heap heap used to build vrow
+@param[out]	vrow	dtuple holding the virtual rows
+@param[in,out] mtr mtr holding the latch on rec */
+static
+void
+row_vers_build_cur_vrow_low(
+ bool in_purge,
+ const rec_t* rec,
+ dict_index_t* clust_index,
+ rec_offs* clust_offsets,
+ dict_index_t* index,
+ roll_ptr_t roll_ptr,
+ trx_id_t trx_id,
+ mem_heap_t* v_heap,
+ dtuple_t** vrow,
+ mtr_t* mtr)
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ mem_heap_t* heap = NULL;
+ ulint num_v = dict_table_get_n_v_cols(index->table);
+ const dfield_t* field;
+ ulint i;
+ bool all_filled = false;
+
+ *vrow = dtuple_create_with_vcol(v_heap, 0, num_v);
+ dtuple_init_v_fld(*vrow);
+
+ for (i = 0; i < num_v; i++) {
+ dfield_get_type(dtuple_get_nth_v_field(*vrow, i))->mtype
+ = DATA_MISSING;
+ }
+
+ version = rec;
+
+	/* If this is called by the purge thread, set the
+	TRX_UNDO_PREV_IN_PURGE bit to search the undo log until we hit
+	the current undo log record with roll_ptr */
+ const ulint status = in_purge
+ ? TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE
+ : TRX_UNDO_GET_OLD_V_VALUE;
+
+ while (!all_filled) {
+ mem_heap_t* heap2 = heap;
+ heap = mem_heap_create(1024);
+ roll_ptr_t cur_roll_ptr = row_get_rec_roll_ptr(
+ version, clust_index, clust_offsets);
+
+ trx_undo_prev_version_build(
+ rec, mtr, version, clust_index, clust_offsets,
+ heap, &prev_version, NULL, vrow, status);
+
+ if (heap2) {
+ mem_heap_free(heap2);
+ }
+
+ if (!prev_version) {
+ /* Versions end here */
+ break;
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ ulint entry_len = dict_index_get_n_fields(index);
+
+ all_filled = true;
+
+ for (i = 0; i < entry_len; i++) {
+ const dict_col_t* col
+ = dict_index_get_nth_col(index, i);
+
+ if (!col->is_virtual()) {
+ continue;
+ }
+
+ const dict_v_col_t* v_col
+ = reinterpret_cast<const dict_v_col_t*>(col);
+ field = dtuple_get_nth_v_field(*vrow, v_col->v_pos);
+
+ if (dfield_get_type(field)->mtype == DATA_MISSING) {
+ all_filled = false;
+ break;
+ }
+
+ }
+
+ trx_id_t rec_trx_id = row_get_rec_trx_id(
+ prev_version, clust_index, clust_offsets);
+
+ if (rec_trx_id < trx_id || roll_ptr == cur_roll_ptr) {
+ break;
+ }
+
+ version = prev_version;
+ }
+
+ mem_heap_free(heap);
+}
+
+/** Check whether a virtual column value in a secondary index record
+matches the value in the current clustered index record, which is
+recreated from information stored in the undo log
+@param[in] rec record in the clustered index
+@param[in] icentry the index entry built from a cluster row
+@param[in] clust_index cluster index
+@param[in] clust_offsets offsets on the cluster record
+@param[in] index the secondary index
+@param[in] ientry the secondary index entry
+@param[in] roll_ptr the rollback pointer for the purging record
+@param[in] trx_id trx id for the purging record
+@param[in,out] v_heap heap used to build virtual dtuple
+@param[in,out]	vrow	dtuple holding the virtual rows (if needed)
+@param[in] mtr mtr holding the latch on rec
+@return true if matches, false otherwise */
+static
+bool
+row_vers_vc_matches_cluster(
+ const rec_t* rec,
+ const dtuple_t* icentry,
+ dict_index_t* clust_index,
+ rec_offs* clust_offsets,
+ dict_index_t* index,
+ const dtuple_t* ientry,
+ roll_ptr_t roll_ptr,
+ trx_id_t trx_id,
+ mem_heap_t* v_heap,
+ dtuple_t** vrow,
+ mtr_t* mtr)
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ mem_heap_t* heap2;
+ mem_heap_t* heap = NULL;
+ mem_heap_t* tuple_heap;
+ ulint num_v = dict_table_get_n_v_cols(index->table);
+ bool compare[REC_MAX_N_FIELDS];
+ ulint n_fields = dtuple_get_n_fields(ientry);
+ ulint n_non_v_col = 0;
+ ulint n_cmp_v_col = 0;
+ const dfield_t* field1;
+ dfield_t* field2;
+ ulint i;
+
+ /* First compare non-virtual columns (primary keys) */
+ ut_ad(index->n_fields == n_fields);
+ ut_ad(n_fields == dtuple_get_n_fields(icentry));
+ {
+ const dfield_t* a = ientry->fields;
+ const dfield_t* b = icentry->fields;
+
+ for (const dict_field_t *ifield = index->fields,
+ *const end = &index->fields[index->n_fields];
+ ifield != end; ifield++, a++, b++) {
+ if (!ifield->col->is_virtual()) {
+ if (cmp_dfield_dfield(a, b)) {
+ return false;
+ }
+ n_non_v_col++;
+ }
+ }
+ }
+
+ tuple_heap = mem_heap_create(1024);
+
+ ut_ad(n_fields > n_non_v_col);
+
+ *vrow = dtuple_create_with_vcol(v_heap ? v_heap : tuple_heap, 0, num_v);
+ dtuple_init_v_fld(*vrow);
+
+ for (i = 0; i < num_v; i++) {
+ dfield_get_type(dtuple_get_nth_v_field(*vrow, i))->mtype
+ = DATA_MISSING;
+ compare[i] = false;
+ }
+
+ version = rec;
+
+ while (n_cmp_v_col < n_fields - n_non_v_col) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ roll_ptr_t cur_roll_ptr = row_get_rec_roll_ptr(
+ version, clust_index, clust_offsets);
+
+ ut_ad(cur_roll_ptr != 0);
+ ut_ad(roll_ptr != 0);
+
+ trx_undo_prev_version_build(
+ rec, mtr, version, clust_index, clust_offsets,
+ heap, &prev_version, NULL, vrow,
+ TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE);
+
+ if (heap2) {
+ mem_heap_free(heap2);
+ }
+
+ if (!prev_version) {
+ /* Versions end here */
+ goto func_exit;
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ ulint entry_len = dict_index_get_n_fields(index);
+
+ for (i = 0; i < entry_len; i++) {
+ const dict_field_t* ind_field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col = ind_field->col;
+ field1 = dtuple_get_nth_field(ientry, i);
+
+ if (!col->is_virtual()) {
+ continue;
+ }
+
+ const dict_v_col_t* v_col
+ = reinterpret_cast<const dict_v_col_t*>(col);
+ field2
+ = dtuple_get_nth_v_field(*vrow, v_col->v_pos);
+
+ if ((dfield_get_type(field2)->mtype != DATA_MISSING)
+ && (!compare[v_col->v_pos])) {
+
+ if (ind_field->prefix_len != 0
+ && !dfield_is_null(field2)
+ && field2->len > ind_field->prefix_len) {
+ field2->len = ind_field->prefix_len;
+ }
+
+				/* Return false on an index field mismatch,
+				or unconditionally if the caller requested
+				the virtual column values (v_heap != NULL) */
+ if (v_heap
+ || cmp_dfield_dfield(field2, field1) != 0) {
+ if (v_heap) {
+ dtuple_dup_v_fld(*vrow, v_heap);
+ }
+
+ mem_heap_free(tuple_heap);
+ mem_heap_free(heap);
+ return(false);
+ }
+
+ compare[v_col->v_pos] = true;
+ n_cmp_v_col++;
+ }
+ }
+
+ trx_id_t rec_trx_id = row_get_rec_trx_id(
+ prev_version, clust_index, clust_offsets);
+
+ if (rec_trx_id < trx_id || roll_ptr == cur_roll_ptr) {
+ break;
+ }
+
+ version = prev_version;
+ }
+
+func_exit:
+ if (n_cmp_v_col == 0) {
+ *vrow = NULL;
+ }
+
+ mem_heap_free(tuple_heap);
+ mem_heap_free(heap);
+
+	/* FIXME: If n_cmp_v_col is not equal to
+	n_fields - n_non_v_col, a callback is needed to compare the
+	remaining columns. For the time being, we return true. */
+ return (true);
+}
+
+/** Build a dtuple that contains virtual column data for the current
+clustered index record
+@param[in]	in_purge	whether this is called by the purge thread
+@param[in]	rec	clustered index record
+@param[in]	clust_index	clustered index
+@param[in,out]	clust_offsets	offsets on the clustered index record
+@param[in]	index	secondary index
+@param[in]	roll_ptr	roll_ptr for the purge record
+@param[in]	trx_id	transaction ID on the purging record
+@param[in,out]	heap	heap memory
+@param[in,out]	v_heap	heap memory to keep virtual column dtuple
+@param[in]	mtr	mtr holding the latch on rec
+@return dtuple containing the virtual column data */
+static
+dtuple_t*
+row_vers_build_cur_vrow(
+ bool in_purge,
+ const rec_t* rec,
+ dict_index_t* clust_index,
+ rec_offs** clust_offsets,
+ dict_index_t* index,
+ roll_ptr_t roll_ptr,
+ trx_id_t trx_id,
+ mem_heap_t* heap,
+ mem_heap_t* v_heap,
+ mtr_t* mtr)
+{
+ dtuple_t* cur_vrow = NULL;
+
+ roll_ptr_t t_roll_ptr = row_get_rec_roll_ptr(
+ rec, clust_index, *clust_offsets);
+
+ /* if the row is newly inserted, then the virtual
+ columns need to be computed */
+ if (trx_undo_roll_ptr_is_insert(t_roll_ptr)) {
+
+ ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec)));
+
+		/* This is a newly inserted record and cannot
+		be deleted, so the externally stored fields
+		cannot be freed yet. */
+ dtuple_t* row = row_build(ROW_COPY_POINTERS, clust_index,
+ rec, *clust_offsets,
+ NULL, NULL, NULL, NULL, heap);
+
+ if (!row_vers_build_clust_v_col(row, clust_index, index,
+ heap)) {
+ return nullptr;
+ }
+
+ cur_vrow = dtuple_copy(row, v_heap);
+ dtuple_dup_v_fld(cur_vrow, v_heap);
+ } else {
+ /* Try to fetch virtual column data from undo log */
+ row_vers_build_cur_vrow_low(
+ in_purge, rec, clust_index, *clust_offsets,
+ index, roll_ptr, trx_id, v_heap, &cur_vrow, mtr);
+ }
+
+ *clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ return(cur_vrow);
+}
+
+/** Finds out whether a version of the record, where the version >= the
+current purge view, should have ientry as its secondary index entry. We
+check if there is any non-delete-marked version of the record where the
+trx id >= purge view, and the secondary index entry == ientry; exactly
+in this case we return TRUE.
+@param[in] also_curr TRUE if also rec is included in the versions
+ to search; otherwise only versions prior
+ to it are searched
+@param[in] rec record in the clustered index; the caller
+ must have a latch on the page
+@param[in] mtr mtr holding the latch on rec; it will
+ also hold the latch on purge_view
+@param[in] index secondary index
+@param[in] ientry secondary index entry
+@param[in] roll_ptr roll_ptr for the purge record
+@param[in] trx_id transaction ID on the purging record
+@return TRUE if an earlier version should have the index entry ientry */
+bool
+row_vers_old_has_index_entry(
+ bool also_curr,
+ const rec_t* rec,
+ mtr_t* mtr,
+ dict_index_t* index,
+ const dtuple_t* ientry,
+ roll_ptr_t roll_ptr,
+ trx_id_t trx_id)
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ dict_index_t* clust_index;
+ rec_offs* clust_offsets;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ dtuple_t* row;
+ const dtuple_t* entry;
+ ulint comp;
+ dtuple_t* vrow = NULL;
+ mem_heap_t* v_heap = NULL;
+ dtuple_t* cur_vrow = NULL;
+
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+ clust_index = dict_table_get_first_index(index->table);
+
+ comp = page_rec_is_comp(rec);
+ ut_ad(!dict_table_is_comp(index->table) == !comp);
+ heap = mem_heap_create(1024);
+ clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (dict_index_has_virtual(index)) {
+ v_heap = mem_heap_create(100);
+ }
+
+ DBUG_EXECUTE_IF("ib_purge_virtual_index_crash",
+ DBUG_SUICIDE(););
+
+ if (also_curr && !rec_get_deleted_flag(rec, comp)) {
+ row_ext_t* ext;
+
+ /* The top of the stack of versions is locked by the
+ mtr holding a latch on the page containing the
+ clustered index record. The bottom of the stack is
+ locked by the fact that the purge_sys.view must
+ 'overtake' any read view of an active transaction.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ rec, clust_offsets,
+ NULL, NULL, NULL, &ext, heap);
+
+ if (dict_index_has_virtual(index)) {
+
+#ifdef DBUG_OFF
+# define dbug_v_purge false
+#else /* DBUG_OFF */
+ bool dbug_v_purge = false;
+#endif /* DBUG_OFF */
+
+ DBUG_EXECUTE_IF(
+ "ib_purge_virtual_index_callback",
+ dbug_v_purge = true;);
+
+ roll_ptr_t t_roll_ptr = row_get_rec_roll_ptr(
+ rec, clust_index, clust_offsets);
+
+ /* if the row is newly inserted, then the virtual
+ columns need to be computed */
+ if (trx_undo_roll_ptr_is_insert(t_roll_ptr)
+ || dbug_v_purge) {
+
+ if (!row_vers_build_clust_v_col(
+ row, clust_index, index, heap)) {
+ goto unsafe_to_purge;
+ }
+
+ entry = row_build_index_entry(
+ row, ext, index, heap);
+ if (entry && !dtuple_coll_cmp(ientry, entry)) {
+ goto unsafe_to_purge;
+ }
+ } else {
+ /* Build index entry out of row */
+ entry = row_build_index_entry(row, ext, index, heap);
+ /* entry could only be NULL if
+ the clustered index record is an uncommitted
+ inserted record whose BLOBs have not been
+ written yet. The secondary index record
+ can be safely removed, because it cannot
+ possibly refer to this incomplete
+ clustered index record. (Insert would
+ always first be completed for the
+ clustered index record, then proceed to
+ secondary indexes.) */
+
+ if (entry && row_vers_vc_matches_cluster(
+ rec, entry,
+ clust_index, clust_offsets,
+ index, ientry, roll_ptr,
+ trx_id, NULL, &vrow, mtr)) {
+ goto unsafe_to_purge;
+ }
+ }
+ clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+ clust_index
+ ->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ } else {
+
+ entry = row_build_index_entry(
+ row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset BLOB
+ pointers. This must be a freshly inserted record. If
+ this is called from
+ row_purge_remove_sec_if_poss_low(), the thread will
+ hold latches on the clustered index and the secondary
+ index. Because the insert works in three steps:
+
+ (1) insert the record to clustered index
+ (2) store the BLOBs and update BLOB pointers
+ (3) insert records to secondary indexes
+
+ the purge thread can safely ignore freshly inserted
+ records and delete the secondary index record. The
+ thread that inserted the new record will be inserting
+ the secondary index records. */
+
+			/* NOTE that we cannot do the comparison as binary
+			fields, because the row may be in the middle of being
+			modified so that the clustered index record has
+			already been updated to a different binary value in
+			a char field, while the collation still identifies
+			the old and new values as equal! */
+ if (entry && !dtuple_coll_cmp(ientry, entry)) {
+unsafe_to_purge:
+ mem_heap_free(heap);
+
+ if (v_heap) {
+ mem_heap_free(v_heap);
+ }
+ return true;
+ }
+ }
+ } else if (dict_index_has_virtual(index)) {
+		/* The current clustered index record could be
+		delete-marked, but a previous version of it might not be.
+		We need to get the virtual column data from the undo
+		record associated with the current clustered index record */
+
+ cur_vrow = row_vers_build_cur_vrow(
+ also_curr, rec, clust_index, &clust_offsets,
+ index, roll_ptr, trx_id, heap, v_heap, mtr);
+ }
+
+ version = rec;
+
+ for (;;) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ vrow = NULL;
+
+ trx_undo_prev_version_build(rec, mtr, version,
+ clust_index, clust_offsets,
+ heap, &prev_version, NULL,
+ dict_index_has_virtual(index)
+ ? &vrow : NULL, 0);
+ mem_heap_free(heap2); /* free version and clust_offsets */
+
+ if (!prev_version) {
+ /* Versions end here */
+ mem_heap_free(heap);
+
+ if (v_heap) {
+ mem_heap_free(v_heap);
+ }
+
+ return false;
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (dict_index_has_virtual(index)) {
+ if (vrow) {
+ /* Keep the virtual row info for the next
+ version, unless it is changed */
+ mem_heap_empty(v_heap);
+ cur_vrow = dtuple_copy(vrow, v_heap);
+ dtuple_dup_v_fld(cur_vrow, v_heap);
+ }
+
+ if (!cur_vrow) {
+ /* Nothing for this index has changed,
+ continue */
+ version = prev_version;
+ continue;
+ }
+ }
+
+ if (!rec_get_deleted_flag(prev_version, comp)) {
+ row_ext_t* ext;
+
+ /* The stack of versions is locked by mtr.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, clust_offsets,
+ NULL, NULL, NULL, &ext, heap);
+
+ if (dict_index_has_virtual(index)) {
+ ut_ad(cur_vrow);
+ ut_ad(row->n_v_fields == cur_vrow->n_v_fields);
+ dtuple_copy_v_fields(row, cur_vrow);
+ }
+
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset
+ BLOB pointers. This must be a freshly
+ inserted record that we can safely ignore.
+ For the justification, see the comments after
+ the previous row_build_index_entry() call. */
+
+			/* NOTE that we cannot do the comparison as binary
+			fields, because the secondary index record may
+			already have been updated to a different binary
+			value in a char field, while the collation still
+			identifies the old and new values as equal! */
+
+ if (entry && !dtuple_coll_cmp(ientry, entry)) {
+ goto unsafe_to_purge;
+ }
+ }
+
+ version = prev_version;
+ }
+}
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+dberr_t
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+				of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ rec_offs** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ ReadView* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers,/*!< out, own: old version, or NULL
+ if the history is missing or the record
+ does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ dtuple_t** vrow) /*!< out: virtual row */
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ trx_id_t trx_id;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ dberr_t err;
+
+ ut_ad(index->is_primary());
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+ ut_ad(!rw_lock_own(&(purge_sys.latch), RW_LOCK_S));
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ trx_id = row_get_rec_trx_id(rec, index, *offsets);
+
+ ut_ad(!view->changes_visible(trx_id, index->table->name));
+
+ ut_ad(!vrow || !(*vrow));
+
+ version = rec;
+
+ for (;;) {
+ mem_heap_t* prev_heap = heap;
+
+ heap = mem_heap_create(1024);
+
+ if (vrow) {
+ *vrow = NULL;
+ }
+
+ /* If purge can't see the record then we can't rely on
+ the UNDO log record. */
+
+ bool purge_sees = trx_undo_prev_version_build(
+ rec, mtr, version, index, *offsets, heap,
+ &prev_version, NULL, vrow, 0);
+
+ err = (purge_sees) ? DB_SUCCESS : DB_MISSING_HISTORY;
+
+ if (prev_heap != NULL) {
+ mem_heap_free(prev_heap);
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ ut_ad(!vrow || !(*vrow));
+ break;
+ }
+
+ *offsets = rec_get_offsets(
+ prev_version, index, *offsets,
+ index->n_core_fields, ULINT_UNDEFINED, offset_heap);
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(prev_version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
+
+ if (view->changes_visible(trx_id, index->table->name)) {
+
+ /* The view already sees this version: we can copy
+ it to in_heap and return */
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(
+ in_heap, rec_offs_size(*offsets)));
+
+ *old_vers = rec_copy(buf, prev_version, *offsets);
+ rec_offs_make_valid(*old_vers, index, true, *offsets);
+
+ if (vrow && *vrow) {
+ *vrow = dtuple_copy(*vrow, in_heap);
+ dtuple_dup_v_fld(*vrow, in_heap);
+ }
+ break;
+ }
+
+ version = prev_version;
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
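+
+/* An illustrative (hypothetical) caller sketch based on the contract
+above. With DB_SUCCESS and *old_vers == NULL, the record was freshly
+inserted after the read view was created and is invisible to it:
+
+	rec_t*		old_vers;
+	dtuple_t*	vrow = NULL;
+	dberr_t		err = row_vers_build_for_consistent_read(
+		rec, &mtr, clust_index, &offsets, &view,
+		&offset_heap, in_heap, &old_vers, &vrow);
+	if (err == DB_SUCCESS && old_vers == NULL) {
+		(the record does not exist in this read view)
+	}
+*/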
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. */
+void
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+ trx_t* caller_trx,/*!<in/out: trx of current thread */
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+				of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ rec_offs** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers,/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ dtuple_t** vrow) /*!< out: virtual row, old version, or NULL
+ if it is not updated in the view */
+{
+ const rec_t* version;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ trx_id_t rec_trx_id = 0;
+
+ ut_ad(index->is_primary());
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+ ut_ad(!rw_lock_own(&(purge_sys.latch), RW_LOCK_S));
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ version = rec;
+ ut_ad(!vrow || !(*vrow));
+
+ for (;;) {
+ mem_heap_t* heap2;
+ rec_t* prev_version;
+ trx_id_t version_trx_id;
+
+ version_trx_id = row_get_rec_trx_id(version, index, *offsets);
+ if (rec == version) {
+ rec_trx_id = version_trx_id;
+ }
+
+ if (!trx_sys.is_registered(caller_trx, version_trx_id)) {
+committed_version_trx:
+ /* We found a version that belongs to a
+ committed transaction: return it. */
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ if (rec == version) {
+ *old_vers = rec;
+ if (vrow) {
+ *vrow = NULL;
+ }
+ break;
+ }
+
+ /* We assume that a rolled-back transaction stays in
+ TRX_STATE_ACTIVE state until all the changes have been
+ rolled back and the transaction is removed from
+ the global list of transactions. */
+
+ if (rec_trx_id == version_trx_id) {
+ /* The transaction was committed while
+ we searched for earlier versions.
+ Return the current version as a
+ semi-consistent read. */
+
+ version = rec;
+ *offsets = rec_get_offsets(
+ version, index, *offsets,
+ index->n_core_fields, ULINT_UNDEFINED,
+ offset_heap);
+ }
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(
+ in_heap, rec_offs_size(*offsets)));
+
+ *old_vers = rec_copy(buf, version, *offsets);
+ rec_offs_make_valid(*old_vers, index, true, *offsets);
+ if (vrow && *vrow) {
+ *vrow = dtuple_copy(*vrow, in_heap);
+ dtuple_dup_v_fld(*vrow, in_heap);
+ }
+ break;
+ }
+
+ DEBUG_SYNC_C("after_row_vers_check_trx_active");
+
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ if (!trx_undo_prev_version_build(rec, mtr, version, index,
+ *offsets, heap,
+ &prev_version,
+ in_heap, vrow, 0)) {
+ mem_heap_free(heap);
+ heap = heap2;
+ heap2 = NULL;
+ goto committed_version_trx;
+ }
+
+ if (heap2) {
+ mem_heap_free(heap2); /* free version */
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ ut_ad(!vrow || !(*vrow));
+ break;
+ }
+
+ version = prev_version;
+ *offsets = rec_get_offsets(version, index, *offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, offset_heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ }/* for (;;) */
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+}
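+
+/* Note (illustrative): this semi-consistent read path is used, for
+example, by UPDATE statements under the READ COMMITTED isolation level,
+where the last committed version is returned instead of waiting for a
+conflicting row lock to be released; see the callers in row0sel.cc. */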
diff --git a/storage/innobase/snappy.cmake b/storage/innobase/snappy.cmake
new file mode 100644
index 00000000..3a2d828e
--- /dev/null
+++ b/storage/innobase/snappy.cmake
@@ -0,0 +1,34 @@
+# Copyright (C) 2015, MariaDB Corporation. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+SET(WITH_INNODB_SNAPPY AUTO CACHE STRING
+ "Build with snappy. Possible values are 'ON', 'OFF', 'AUTO' and default is 'AUTO'")
+
+MACRO (MYSQL_CHECK_SNAPPY)
+ IF (WITH_INNODB_SNAPPY STREQUAL "ON" OR WITH_INNODB_SNAPPY STREQUAL "AUTO")
+ CHECK_INCLUDE_FILES(snappy-c.h HAVE_SNAPPY_H)
+ CHECK_LIBRARY_EXISTS(snappy snappy_uncompress "" HAVE_SNAPPY_SHARED_LIB)
+
+ IF(HAVE_SNAPPY_SHARED_LIB AND HAVE_SNAPPY_H)
+ SET(HAVE_INNODB_SNAPPY TRUE)
+ ADD_DEFINITIONS(-DHAVE_SNAPPY=1)
+ LINK_LIBRARIES(snappy)
+ ELSE()
+ IF (WITH_INNODB_SNAPPY STREQUAL "ON")
+ MESSAGE(FATAL_ERROR "Required snappy library is not found")
+ ENDIF()
+ ENDIF()
+ ENDIF()
+ ADD_FEATURE_INFO(INNODB_SNAPPY HAVE_INNODB_SNAPPY "Snappy compression in the InnoDB storage engine")
+ENDMACRO()
diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc
new file mode 100644
index 00000000..f13af13c
--- /dev/null
+++ b/storage/innobase/srv/srv0mon.cc
@@ -0,0 +1,2108 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0mon.cc
+Database monitor counter interfaces
+
+Created 12/9/2009 Jimmy Yang
+*******************************************************/
+
+#include "buf0buf.h"
+#include "dict0mem.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "mach0data.h"
+#include "os0file.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "trx0rseg.h"
+#include "trx0sys.h"
+
+/* Macro to standardize the counter names for counters in the
+"monitor_buf_page" module as they have very structured defines */
+#define MONITOR_BUF_PAGE(name, description, code, op, op_code) \
+ {"buffer_page_" op "_" name, "buffer_page_io", \
+ "Number of " description " Pages " op, \
+ MONITOR_GROUP_MODULE, MONITOR_DEFAULT_START, \
+ MONITOR_##code##_##op_code}
+
+#define MONITOR_BUF_PAGE_READ(name, description, code) \
+ MONITOR_BUF_PAGE(name, description, code, "read", PAGE_READ)
+
+#define MONITOR_BUF_PAGE_WRITTEN(name, description, code) \
+ MONITOR_BUF_PAGE(name, description, code, "written", PAGE_WRITTEN)
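+
+/* For example, the invocation
+MONITOR_BUF_PAGE_READ("index_leaf", "Index Leaf", INDEX_LEAF)
+expands to the counter definition
+
+	{"buffer_page_read_index_leaf", "buffer_page_io",
+	"Number of Index Leaf Pages read",
+	MONITOR_GROUP_MODULE, MONITOR_DEFAULT_START,
+	MONITOR_INDEX_LEAF_PAGE_READ}
+*/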
+
+/** This array defines basic static information of monitor counters,
+including each monitor's name, module it belongs to, a short
+description and its property/type and corresponding monitor_id.
+Please note: If you add a monitor here, please add its corresponding
+monitor_id to "enum monitor_id_value" structure in srv0mon.h file. */
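+
+/* For example, the "lock_deadlocks" entry below is paired with the
+MONITOR_DEADLOCK identifier declared in "enum monitor_id_value"
+in srv0mon.h. */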
+
+static monitor_info_t innodb_counter_info[] =
+{
+	/* A dummy item to mark the module start; this is
+	to accommodate the default value (0) set for the
+	global variables with the control system. */
+ {"module_start", "module_start", "module_start",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_DEFAULT_START},
+
+ /* ========== Counters for Server Metadata ========== */
+ {"module_metadata", "metadata", "Server Metadata",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_METADATA},
+
+ {"metadata_table_handles_opened", "metadata",
+ "Number of table handles opened",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TABLE_OPEN},
+
+ {"metadata_table_handles_closed", "metadata",
+ "Number of table handles closed",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TABLE_CLOSE},
+
+ {"metadata_table_reference_count", "metadata",
+ "Table reference counter",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TABLE_REFERENCE},
+
+ /* ========== Counters for Lock Module ========== */
+ {"module_lock", "lock", "Lock Module",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_LOCK},
+
+ {"lock_deadlocks", "lock", "Number of deadlocks",
+ MONITOR_DEFAULT_ON,
+ MONITOR_DEFAULT_START, MONITOR_DEADLOCK},
+
+ {"lock_timeouts", "lock", "Number of lock timeouts",
+ MONITOR_DEFAULT_ON,
+ MONITOR_DEFAULT_START, MONITOR_TIMEOUT},
+
+ {"lock_rec_lock_waits", "lock",
+ "Number of times enqueued into record lock wait queue",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_LOCKREC_WAIT},
+
+ {"lock_table_lock_waits", "lock",
+ "Number of times enqueued into table lock wait queue",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TABLELOCK_WAIT},
+
+ {"lock_rec_lock_requests", "lock",
+ "Number of record locks requested",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_NUM_RECLOCK_REQ},
+
+ {"lock_rec_lock_created", "lock", "Number of record locks created",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_RECLOCK_CREATED},
+
+ {"lock_rec_lock_removed", "lock",
+ "Number of record locks removed from the lock queue",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_RECLOCK_REMOVED},
+
+ {"lock_rec_locks", "lock",
+ "Current number of record locks on tables",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_NUM_RECLOCK},
+
+ {"lock_table_lock_created", "lock", "Number of table locks created",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TABLELOCK_CREATED},
+
+ {"lock_table_lock_removed", "lock",
+ "Number of table locks removed from the lock queue",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TABLELOCK_REMOVED},
+
+ {"lock_table_locks", "lock",
+ "Current number of table locks on tables",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_NUM_TABLELOCK},
+
+ {"lock_row_lock_current_waits", "lock",
+ "Number of row locks currently being waited for"
+ " (innodb_row_lock_current_waits)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT},
+
+ {"lock_row_lock_time", "lock",
+ "Time spent in acquiring row locks, in milliseconds"
+ " (innodb_row_lock_time)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_WAIT_TIME},
+
+ {"lock_row_lock_time_max", "lock",
+ "The maximum time to acquire a row lock, in milliseconds"
+ " (innodb_row_lock_time_max)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_MAX_WAIT_TIME},
+
+ {"lock_row_lock_waits", "lock",
+ "Number of times a row lock had to be waited for"
+ " (innodb_row_lock_waits)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_ROW_LOCK_WAIT},
+
+ {"lock_row_lock_time_avg", "lock",
+ "The average time to acquire a row lock, in milliseconds"
+ " (innodb_row_lock_time_avg)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_AVG_WAIT_TIME},
+
+ /* ========== Counters for Buffer Manager and I/O ========== */
+ {"module_buffer", "buffer", "Buffer Manager Module",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_BUFFER},
+
+ {"buffer_pool_size", "server",
+ "Server buffer pool size (all buffer pools) in bytes",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUFFER_POOL_SIZE},
+
+ {"buffer_pool_reads", "buffer",
+ "Number of reads directly from disk (innodb_buffer_pool_reads)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READS},
+
+ {"buffer_pool_read_requests", "buffer",
+ "Number of logical read requests (innodb_buffer_pool_read_requests)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_REQUESTS},
+
+ {"buffer_pool_write_requests", "buffer",
+ "Number of write requests (innodb_buffer_pool_write_requests)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_WRITE_REQUEST},
+
+ {"buffer_pool_wait_free", "buffer",
+ "Number of times waited for free buffer"
+ " (innodb_buffer_pool_wait_free)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_WAIT_FREE},
+
+ {"buffer_pool_read_ahead", "buffer",
+ "Number of pages read as read ahead (innodb_buffer_pool_read_ahead)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_AHEAD},
+
+ {"buffer_pool_read_ahead_evicted", "buffer",
+ "Read-ahead pages evicted without being accessed"
+ " (innodb_buffer_pool_read_ahead_evicted)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED},
+
+ {"buffer_pool_pages_total", "buffer",
+ "Total buffer pool size in pages (innodb_buffer_pool_pages_total)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGE_TOTAL},
+
+ {"buffer_pool_pages_misc", "buffer",
+ "Buffer pages for misc use such as row locks or the adaptive"
+ " hash index (innodb_buffer_pool_pages_misc)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGE_MISC},
+
+ {"buffer_pool_pages_data", "buffer",
+ "Buffer pages containing data (innodb_buffer_pool_pages_data)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_DATA},
+
+ {"buffer_pool_bytes_data", "buffer",
+ "Buffer bytes containing data (innodb_buffer_pool_bytes_data)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_BYTES_DATA},
+
+ {"buffer_pool_pages_dirty", "buffer",
+ "Buffer pages currently dirty (innodb_buffer_pool_pages_dirty)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_DIRTY},
+
+ {"buffer_pool_bytes_dirty", "buffer",
+ "Buffer bytes currently dirty (innodb_buffer_pool_bytes_dirty)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_BYTES_DIRTY},
+
+ {"buffer_pool_pages_free", "buffer",
+ "Buffer pages currently free (innodb_buffer_pool_pages_free)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_FREE},
+
+ {"buffer_pages_created", "buffer",
+ "Number of pages created (innodb_pages_created)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_CREATED},
+
+ {"buffer_pages_written", "buffer",
+ "Number of pages written (innodb_pages_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN},
+
+ {"buffer_index_pages_written", "buffer",
+ "Number of index pages written (innodb_index_pages_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_PAGES_WRITTEN},
+
+ {"buffer_non_index_pages_written", "buffer",
+ "Number of non index pages written (innodb_non_index_pages_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN},
+
+ {"buffer_pages_read", "buffer",
+ "Number of pages read (innodb_pages_read)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_READ},
+
+ {"buffer_index_sec_rec_cluster_reads", "buffer",
+ "Number of secondary record reads triggered cluster read",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS},
+
+ {"buffer_index_sec_rec_cluster_reads_avoided", "buffer",
+ "Number of secondary record reads avoided triggering cluster read",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED},
+
+ {"buffer_data_reads", "buffer",
+ "Amount of data read in bytes (innodb_data_reads)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BYTE_READ},
+
+ {"buffer_data_written", "buffer",
+ "Amount of data written in bytes (innodb_data_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BYTE_WRITTEN},
+
+ /* Cumulative counter for scanning in flush batches */
+ {"buffer_flush_batch_scanned", "buffer",
+ "Total pages scanned as part of flush batch",
+ MONITOR_SET_OWNER,
+ MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+ MONITOR_FLUSH_BATCH_SCANNED},
+
+ {"buffer_flush_batch_num_scan", "buffer",
+ "Number of times buffer flush list flush is called",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_SCANNED,
+ MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL},
+
+ {"buffer_flush_batch_scanned_per_call", "buffer",
+ "Pages scanned per flush batch scan",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_SCANNED,
+ MONITOR_FLUSH_BATCH_SCANNED_PER_CALL},
+
+ /* Cumulative counter for pages flushed in flush batches */
+ {"buffer_flush_batch_total_pages", "buffer",
+ "Total pages flushed as part of flush batch",
+ MONITOR_SET_OWNER, MONITOR_FLUSH_BATCH_COUNT,
+ MONITOR_FLUSH_BATCH_TOTAL_PAGE},
+
+ {"buffer_flush_batches", "buffer",
+ "Number of flush batches",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_COUNT},
+
+ {"buffer_flush_batch_pages", "buffer",
+ "Pages queued as a flush batch",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_PAGES},
+
+ /* Cumulative counter for flush batches because of neighbor */
+ {"buffer_flush_neighbor_total_pages", "buffer",
+ "Total neighbors flushed as part of neighbor flush",
+ MONITOR_SET_OWNER, MONITOR_FLUSH_NEIGHBOR_COUNT,
+ MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE},
+
+ {"buffer_flush_neighbor", "buffer",
+ "Number of times neighbors flushing is invoked",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+ MONITOR_FLUSH_NEIGHBOR_COUNT},
+
+ {"buffer_flush_neighbor_pages", "buffer",
+ "Pages queued as a neighbor batch",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+ MONITOR_FLUSH_NEIGHBOR_PAGES},
+
+ {"buffer_flush_n_to_flush_requested", "buffer",
+ "Number of pages requested for flushing.",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_N_TO_FLUSH_REQUESTED},
+
+ {"buffer_flush_n_to_flush_by_age", "buffer",
+ "Number of pages target by LSN Age for flushing.",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_N_TO_FLUSH_BY_AGE},
+
+ {"buffer_flush_adaptive_avg_time", "buffer",
+ "Avg time (ms) spent for adaptive flushing recently.",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_ADAPTIVE_AVG_TIME},
+
+ {"buffer_flush_adaptive_avg_pass", "buffer",
+ "Number of adaptive flushes passed during the recent Avg period.",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_ADAPTIVE_AVG_PASS},
+
+ {"buffer_LRU_get_free_loops", "buffer",
+ "Total loops in LRU get free.",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_LRU_GET_FREE_LOOPS},
+
+ {"buffer_LRU_get_free_waits", "buffer",
+ "Total sleep waits in LRU get free.",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_LRU_GET_FREE_WAITS},
+
+ {"buffer_flush_avg_page_rate", "buffer",
+ "Average number of pages at which flushing is happening",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_AVG_PAGE_RATE},
+
+ {"buffer_flush_lsn_avg_rate", "buffer",
+ "Average redo generation rate",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_LSN_AVG_RATE},
+
+ {"buffer_flush_pct_for_dirty", "buffer",
+ "Percent of IO capacity used to avoid max dirty page limit",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_PCT_FOR_DIRTY},
+
+ {"buffer_flush_pct_for_lsn", "buffer",
+ "Percent of IO capacity used to avoid reusable redo space limit",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_PCT_FOR_LSN},
+
+ {"buffer_flush_sync_waits", "buffer",
+ "Number of times a wait happens due to sync flushing",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_SYNC_WAITS},
+
+ /* Cumulative counter for flush batches for adaptive flushing */
+ {"buffer_flush_adaptive_total_pages", "buffer",
+ "Total pages flushed as part of adaptive flushing",
+ MONITOR_SET_OWNER, MONITOR_FLUSH_ADAPTIVE_COUNT,
+ MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE},
+
+ {"buffer_flush_adaptive", "buffer",
+ "Number of adaptive batches",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_COUNT},
+
+ {"buffer_flush_adaptive_pages", "buffer",
+ "Pages queued as an adaptive batch",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_PAGES},
+
+ /* Cumulative counter for flush batches because of sync */
+ {"buffer_flush_sync_total_pages", "buffer",
+ "Total pages flushed as part of sync batches",
+ MONITOR_SET_OWNER, MONITOR_FLUSH_SYNC_COUNT,
+ MONITOR_FLUSH_SYNC_TOTAL_PAGE},
+
+ {"buffer_flush_sync", "buffer",
+ "Number of sync batches",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_COUNT},
+
+ {"buffer_flush_sync_pages", "buffer",
+ "Pages queued as a sync batch",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_PAGES},
+
+ /* Cumulative counter for flush batches because of background */
+ {"buffer_flush_background_total_pages", "buffer",
+ "Total pages flushed as part of background batches",
+ MONITOR_SET_OWNER, MONITOR_FLUSH_BACKGROUND_COUNT,
+ MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE},
+
+ {"buffer_flush_background", "buffer",
+ "Number of background batches",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_COUNT},
+
+ {"buffer_flush_background_pages", "buffer",
+ "Pages queued as a background batch",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_PAGES},
+
+ /* Cumulative counter for LRU batch scan */
+ {"buffer_LRU_batch_scanned", "buffer",
+ "Total pages scanned as part of LRU batch",
+ MONITOR_SET_OWNER, MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED},
+
+ {"buffer_LRU_batch_num_scan", "buffer",
+ "Number of times LRU batch is called",
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL},
+
+ {"buffer_LRU_batch_scanned_per_call", "buffer",
+ "Pages scanned per LRU batch call",
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL},
+
+ /* Cumulative counter for LRU batch pages flushed */
+ {"buffer_LRU_batch_flush_total_pages", "buffer",
+ "Total pages flushed as part of LRU batches",
+ MONITOR_SET_OWNER, MONITOR_LRU_BATCH_FLUSH_COUNT,
+ MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE},
+
+ {"buffer_LRU_batches_flush", "buffer",
+ "Number of LRU batches",
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_FLUSH_COUNT},
+
+ {"buffer_LRU_batch_flush_pages", "buffer",
+ "Pages queued as an LRU batch",
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_FLUSH_PAGES},
+
+ /* Cumulative counter for LRU batch pages flushed */
+ {"buffer_LRU_batch_evict_total_pages", "buffer",
+ "Total pages evicted as part of LRU batches",
+ MONITOR_SET_OWNER, MONITOR_LRU_BATCH_EVICT_COUNT,
+ MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE},
+
+ {"buffer_LRU_batches_evict", "buffer",
+ "Number of LRU batches",
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_EVICT_COUNT},
+
+ {"buffer_LRU_batch_evict_pages", "buffer",
+ "Pages queued as an LRU batch",
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_EVICT_PAGES},
+
+ {"buffer_LRU_single_flush_failure_count", "Buffer",
+ "Number of times attempt to flush a single page from LRU failed",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT},
+
+ {"buffer_LRU_get_free_search", "Buffer",
+ "Number of searches performed for a clean page",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_LRU_GET_FREE_SEARCH},
+
+ /* Cumulative counter for LRU search scans */
+ {"buffer_LRU_search_scanned", "buffer",
+ "Total pages scanned as part of LRU search",
+ MONITOR_SET_OWNER,
+ MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SEARCH_SCANNED},
+
+ {"buffer_LRU_search_num_scan", "buffer",
+ "Number of times LRU search is performed",
+ MONITOR_SET_MEMBER, MONITOR_LRU_SEARCH_SCANNED,
+ MONITOR_LRU_SEARCH_SCANNED_NUM_CALL},
+
+ {"buffer_LRU_search_scanned_per_call", "buffer",
+ "Page scanned per single LRU search",
+ MONITOR_SET_MEMBER, MONITOR_LRU_SEARCH_SCANNED,
+ MONITOR_LRU_SEARCH_SCANNED_PER_CALL},
+
+ /* Cumulative counter for LRU unzip search scans */
+ {"buffer_LRU_unzip_search_scanned", "buffer",
+ "Total pages scanned as part of LRU unzip search",
+ MONITOR_SET_OWNER,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED},
+
+ {"buffer_LRU_unzip_search_num_scan", "buffer",
+ "Number of times LRU unzip search is performed",
+ MONITOR_SET_MEMBER, MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL},
+
+ {"buffer_LRU_unzip_search_scanned_per_call", "buffer",
+ "Page scanned per single LRU unzip search",
+ MONITOR_SET_MEMBER, MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL},
+
+ /* ========== Counters for Buffer Page I/O ========== */
+ {"module_buffer_page", "buffer_page_io", "Buffer Page I/O Module",
+ static_cast<monitor_type_t>(
+ MONITOR_MODULE | MONITOR_GROUP_MODULE),
+ MONITOR_DEFAULT_START, MONITOR_MODULE_BUF_PAGE},
+
+ MONITOR_BUF_PAGE_READ("index_leaf","Index Leaf", INDEX_LEAF),
+
+ MONITOR_BUF_PAGE_READ("index_non_leaf","Index Non-leaf",
+ INDEX_NON_LEAF),
+
+ MONITOR_BUF_PAGE_READ("index_ibuf_leaf", "Insert Buffer Index Leaf",
+ INDEX_IBUF_LEAF),
+
+ MONITOR_BUF_PAGE_READ("index_ibuf_non_leaf",
+ "Insert Buffer Index Non-Leaf",
+ INDEX_IBUF_NON_LEAF),
+
+ MONITOR_BUF_PAGE_READ("undo_log", "Undo Log", UNDO_LOG),
+
+ MONITOR_BUF_PAGE_READ("index_inode", "Index Inode", INODE),
+
+ MONITOR_BUF_PAGE_READ("ibuf_free_list", "Insert Buffer Free List",
+ IBUF_FREELIST),
+
+ MONITOR_BUF_PAGE_READ("ibuf_bitmap", "Insert Buffer Bitmap",
+ IBUF_BITMAP),
+
+ MONITOR_BUF_PAGE_READ("system_page", "System", SYSTEM),
+
+ MONITOR_BUF_PAGE_READ("trx_system", "Transaction System", TRX_SYSTEM),
+
+ MONITOR_BUF_PAGE_READ("fsp_hdr", "File Space Header", FSP_HDR),
+
+ MONITOR_BUF_PAGE_READ("xdes", "Extent Descriptor", XDES),
+
+ MONITOR_BUF_PAGE_READ("blob", "Uncompressed BLOB", BLOB),
+
+ MONITOR_BUF_PAGE_READ("zblob", "First Compressed BLOB", ZBLOB),
+
+ MONITOR_BUF_PAGE_READ("zblob2", "Subsequent Compressed BLOB", ZBLOB2),
+
+ MONITOR_BUF_PAGE_READ("other", "other/unknown (old version of InnoDB)",
+ OTHER),
+
+ MONITOR_BUF_PAGE_WRITTEN("index_leaf","Index Leaf", INDEX_LEAF),
+
+ MONITOR_BUF_PAGE_WRITTEN("index_non_leaf","Index Non-leaf",
+ INDEX_NON_LEAF),
+
+ MONITOR_BUF_PAGE_WRITTEN("index_ibuf_leaf", "Insert Buffer Index Leaf",
+ INDEX_IBUF_LEAF),
+
+ MONITOR_BUF_PAGE_WRITTEN("index_ibuf_non_leaf",
+ "Insert Buffer Index Non-Leaf",
+ INDEX_IBUF_NON_LEAF),
+
+ MONITOR_BUF_PAGE_WRITTEN("undo_log", "Undo Log", UNDO_LOG),
+
+ MONITOR_BUF_PAGE_WRITTEN("index_inode", "Index Inode", INODE),
+
+ MONITOR_BUF_PAGE_WRITTEN("ibuf_free_list", "Insert Buffer Free List",
+ IBUF_FREELIST),
+
+ MONITOR_BUF_PAGE_WRITTEN("ibuf_bitmap", "Insert Buffer Bitmap",
+ IBUF_BITMAP),
+
+ MONITOR_BUF_PAGE_WRITTEN("system_page", "System", SYSTEM),
+
+ MONITOR_BUF_PAGE_WRITTEN("trx_system", "Transaction System",
+ TRX_SYSTEM),
+
+ MONITOR_BUF_PAGE_WRITTEN("fsp_hdr", "File Space Header", FSP_HDR),
+
+ MONITOR_BUF_PAGE_WRITTEN("xdes", "Extent Descriptor", XDES),
+
+ MONITOR_BUF_PAGE_WRITTEN("blob", "Uncompressed BLOB", BLOB),
+
+ MONITOR_BUF_PAGE_WRITTEN("zblob", "First Compressed BLOB", ZBLOB),
+
+ MONITOR_BUF_PAGE_WRITTEN("zblob2", "Subsequent Compressed BLOB",
+ ZBLOB2),
+
+ MONITOR_BUF_PAGE_WRITTEN("other", "other/unknown (old version InnoDB)",
+ OTHER),
+
+ /* ========== Counters for OS level operations ========== */
+ {"module_os", "os", "OS Level Operation",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_OS},
+
+ {"os_data_reads", "os",
+ "Number of reads initiated (innodb_data_reads)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FILE_READ},
+
+ {"os_data_writes", "os",
+ "Number of writes initiated (innodb_data_writes)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FILE_WRITE},
+
+ {"os_data_fsyncs", "os",
+ "Number of fsync() calls (innodb_data_fsyncs)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FSYNC},
+
+ {"os_pending_reads", "os", "Number of reads pending",
+ MONITOR_DEFAULT_ON,
+ MONITOR_DEFAULT_START, MONITOR_OS_PENDING_READS},
+
+ {"os_pending_writes", "os", "Number of writes pending",
+ MONITOR_DEFAULT_ON,
+ MONITOR_DEFAULT_START, MONITOR_OS_PENDING_WRITES},
+
+ {"os_log_bytes_written", "os",
+ "Bytes of log written (innodb_os_log_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_WRITTEN},
+
+ {"os_log_fsyncs", "os",
+ "Number of fsync log writes (innodb_os_log_fsyncs)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_FSYNC},
+
+ {"os_log_pending_fsyncs", "os",
+ "Number of pending fsync write (innodb_os_log_pending_fsyncs)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_PENDING_FSYNC},
+
+ {"os_log_pending_writes", "os",
+ "Number of pending log file writes (innodb_os_log_pending_writes)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_PENDING_WRITES},
+
+ /* ========== Counters for Transaction Module ========== */
+ {"module_trx", "transaction", "Transaction Manager",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_TRX},
+
+ {"trx_rw_commits", "transaction",
+ "Number of read-write transactions committed",
+ MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_RW_COMMIT},
+
+ {"trx_ro_commits", "transaction",
+ "Number of read-only transactions committed",
+ MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_RO_COMMIT},
+
+ {"trx_nl_ro_commits", "transaction",
+ "Number of non-locking auto-commit read-only transactions committed",
+ MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_NL_RO_COMMIT},
+
+ {"trx_commits_insert_update", "transaction",
+ "Number of transactions committed with inserts and updates",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TRX_COMMIT_UNDO},
+
+ {"trx_rollbacks", "transaction",
+ "Number of transactions rolled back",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK},
+
+ {"trx_rollbacks_savepoint", "transaction",
+ "Number of transactions rolled back to savepoint",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK_SAVEPOINT},
+
+ {"trx_active_transactions", "transaction",
+ "Number of active transactions",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TRX_ACTIVE},
+
+ {"trx_rseg_history_len", "transaction",
+ "Length of the TRX_RSEG_HISTORY list",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_RSEG_HISTORY_LEN},
+
+ {"trx_undo_slots_used", "transaction", "Number of undo slots used",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_NUM_UNDO_SLOT_USED},
+
+ {"trx_undo_slots_cached", "transaction",
+ "Number of undo slots cached",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_NUM_UNDO_SLOT_CACHED},
+
+ {"trx_rseg_current_size", "transaction",
+ "Current rollback segment size in pages",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_RSEG_CUR_SIZE},
+
+ /* ========== Counters for Purge Module ========== */
+ {"module_purge", "purge", "Purge Module",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_PURGE},
+
+ {"purge_del_mark_records", "purge",
+ "Number of delete-marked rows purged",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_N_DEL_ROW_PURGE},
+
+ {"purge_upd_exist_or_extern_records", "purge",
+ "Number of purges on updates of existing records and"
+ " updates on delete marked record with externally stored field",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_N_UPD_EXIST_EXTERN},
+
+ {"purge_invoked", "purge",
+ "Number of times purge was invoked",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PURGE_INVOKED},
+
+ {"purge_undo_log_pages", "purge",
+ "Number of undo log pages handled by the purge",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PURGE_N_PAGE_HANDLED},
+
+ {"purge_dml_delay_usec", "purge",
+ "Microseconds DML to be delayed due to purge lagging",
+ MONITOR_DISPLAY_CURRENT,
+ MONITOR_DEFAULT_START, MONITOR_DML_PURGE_DELAY},
+
+ {"purge_stop_count", "purge",
+ "Number of times purge was stopped",
+ MONITOR_DISPLAY_CURRENT,
+ MONITOR_DEFAULT_START, MONITOR_PURGE_STOP_COUNT},
+
+ {"purge_resume_count", "purge",
+ "Number of times purge was resumed",
+ MONITOR_DISPLAY_CURRENT,
+ MONITOR_DEFAULT_START, MONITOR_PURGE_RESUME_COUNT},
+
+ /* ========== Counters for Recovery Module ========== */
+ {"module_log", "recovery", "Recovery Module",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_RECOVERY},
+
+ {"log_checkpoints", "recovery", "Number of checkpoints",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_NUM_CHECKPOINT},
+
+ {"log_lsn_last_flush", "recovery", "LSN of Last flush",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_FLUSHDISK},
+
+ {"log_lsn_last_checkpoint", "recovery", "LSN at last checkpoint",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_CHECKPOINT},
+
+ {"log_lsn_current", "recovery", "Current LSN value",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_CURRENT},
+
+ {"log_lsn_checkpoint_age", "recovery",
+ "Current LSN value minus LSN at last checkpoint",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_LSN_CHECKPOINT_AGE},
+
+ {"log_lsn_buf_pool_oldest", "recovery",
+ "The oldest modified block LSN in the buffer pool",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_OLDEST_LSN},
+
+ {"log_max_modified_age_async", "recovery",
+ "Maximum LSN difference; when exceeded, start asynchronous preflush",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_ASYNC},
+
+ {"log_pending_log_flushes", "recovery", "Pending log flushes",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_PENDING_LOG_FLUSH},
+
+ {"log_pending_checkpoint_writes", "recovery", "Pending checkpoints",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_PENDING_CHECKPOINT_WRITE},
+
+ {"log_num_log_io", "recovery", "Number of log I/Os",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_LOG_IO},
+
+ {"log_waits", "recovery",
+ "Number of log waits due to small log buffer (innodb_log_waits)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WAITS},
+
+ {"log_write_requests", "recovery",
+ "Number of log write requests (innodb_log_write_requests)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WRITE_REQUEST},
+
+ {"log_writes", "recovery",
+ "Number of log writes (innodb_log_writes)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WRITES},
+
+ {"log_padded", "recovery",
+ "Bytes of log padded for log write ahead",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_PADDED},
+
+ /* ========== Counters for Page Compression ========== */
+ {"module_compress", "compression", "Page Compression Info",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_PAGE},
+
+ {"compress_pages_compressed", "compression",
+ "Number of pages compressed", MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PAGE_COMPRESS},
+
+ {"compress_pages_decompressed", "compression",
+ "Number of pages decompressed",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PAGE_DECOMPRESS},
+
+ {"compression_pad_increments", "compression",
+ "Number of times padding is incremented to avoid compression failures",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PAD_INCREMENTS},
+
+ {"compression_pad_decrements", "compression",
+ "Number of times padding is decremented due to good compressibility",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS},
+
+ {"compress_saved", "compression",
+ "Number of bytes saved by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_SAVED},
+
+ {"compress_pages_page_compressed", "compression",
+ "Number of pages compressed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSED},
+
+ {"compress_page_compressed_trim_op", "compression",
+	 "Number of TRIM operations performed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP},
+
+ {"compress_pages_page_decompressed", "compression",
+ "Number of pages decompressed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED},
+
+ {"compress_pages_page_compression_error", "compression",
+ "Number of page compression errors",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR},
+
+ {"compress_pages_encrypted", "compression",
+ "Number of pages encrypted",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_ENCRYPTED},
+
+ {"compress_pages_decrypted", "compression",
+ "Number of pages decrypted",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_DECRYPTED},
+
+ /* ========== Counters for Index ========== */
+ {"module_index", "index", "Index Manager",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_INDEX},
+
+ {"index_page_splits", "index", "Number of index page splits",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_INDEX_SPLIT},
+
+ {"index_page_merge_attempts", "index",
+ "Number of index page merge attempts",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_INDEX_MERGE_ATTEMPTS},
+
+ {"index_page_merge_successful", "index",
+ "Number of successful index page merges",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_INDEX_MERGE_SUCCESSFUL},
+
+ {"index_page_reorg_attempts", "index",
+ "Number of index page reorganization attempts",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_INDEX_REORG_ATTEMPTS},
+
+ {"index_page_reorg_successful", "index",
+ "Number of successful index page reorganizations",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_INDEX_REORG_SUCCESSFUL},
+
+ {"index_page_discards", "index", "Number of index pages discarded",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_INDEX_DISCARD},
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* ========== Counters for Adaptive Hash Index ========== */
+ {"module_adaptive_hash", "adaptive_hash_index", "Adaptive Hash Index",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_ADAPTIVE_HASH},
+
+ {"adaptive_hash_searches", "adaptive_hash_index",
+ "Number of successful searches using Adaptive Hash Index",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_ADAPTIVE_HASH_SEARCH},
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ {"adaptive_hash_searches_btree", "adaptive_hash_index",
+ "Number of searches using B-tree on an index search",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE},
+
+#ifdef BTR_CUR_HASH_ADAPT
+ {"adaptive_hash_pages_added", "adaptive_hash_index",
+ "Number of index pages on which the Adaptive Hash Index is built",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_PAGE_ADDED},
+
+ {"adaptive_hash_pages_removed", "adaptive_hash_index",
+ "Number of index pages whose corresponding Adaptive Hash Index"
+ " entries were removed",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_PAGE_REMOVED},
+
+ {"adaptive_hash_rows_added", "adaptive_hash_index",
+ "Number of Adaptive Hash Index rows added",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_ADDED},
+
+ {"adaptive_hash_rows_removed", "adaptive_hash_index",
+ "Number of Adaptive Hash Index rows removed",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_REMOVED},
+
+ {"adaptive_hash_rows_deleted_no_hash_entry", "adaptive_hash_index",
+ "Number of rows deleted that did not have corresponding Adaptive Hash"
+ " Index entries",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND},
+
+ {"adaptive_hash_rows_updated", "adaptive_hash_index",
+ "Number of Adaptive Hash Index rows updated",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_UPDATED},
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /* ========== Counters for tablespace ========== */
+ {"module_file", "file_system", "Tablespace and File System Manager",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_FIL_SYSTEM},
+
+ {"file_num_open_files", "file_system",
+ "Number of files currently open (innodb_num_open_files)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_N_FILE_OPENED},
+
+ /* ========== Counters for Change Buffer ========== */
+ {"module_ibuf_system", "change_buffer", "InnoDB Change Buffer",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_IBUF_SYSTEM},
+
+ {"ibuf_merges_insert", "change_buffer",
+ "Number of inserted records merged by change buffering",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_INSERT},
+
+ {"ibuf_merges_delete_mark", "change_buffer",
+	 "Number of delete-marked records merged by change buffering",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DELETE},
+
+ {"ibuf_merges_delete", "change_buffer",
+ "Number of purge records merged by change buffering",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_PURGE},
+
+ {"ibuf_merges_discard_insert", "change_buffer",
+	 "Number of merged insert operations discarded",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT},
+
+ {"ibuf_merges_discard_delete_mark", "change_buffer",
+	 "Number of merged delete-mark operations discarded",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE},
+
+ {"ibuf_merges_discard_delete", "change_buffer",
+	 "Number of merged purge operations discarded",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE},
+
+ {"ibuf_merges", "change_buffer", "Number of change buffer merges",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGES},
+
+ {"ibuf_size", "change_buffer", "Change buffer size in pages",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_SIZE},
+
+ /* ========== Counters for server operations ========== */
+ {"module_innodb", "innodb",
+ "Counter for general InnoDB server wide operations and properties",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_SERVER},
+
+ {"innodb_master_thread_sleeps", "server",
+	 "Number of times (in seconds) the master thread sleeps",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_MASTER_THREAD_SLEEP},
+
+ {"innodb_activity_count", "server", "Current server activity count",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_SERVER_ACTIVITY},
+
+ {"innodb_master_active_loops", "server",
+ "Number of times master thread performs its tasks when"
+ " server is active",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_MASTER_ACTIVE_LOOPS},
+
+ {"innodb_master_idle_loops", "server",
+ "Number of times master thread performs its tasks when server is idle",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_MASTER_IDLE_LOOPS},
+
+ {"innodb_background_drop_table_usec", "server",
+ "Time (in microseconds) spent to process drop table list",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND},
+
+ {"innodb_log_flush_usec", "server",
+ "Time (in microseconds) spent to flush log records",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_SRV_LOG_FLUSH_MICROSECOND},
+
+ {"innodb_dict_lru_usec", "server",
+ "Time (in microseconds) spent to process DICT LRU list",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_SRV_DICT_LRU_MICROSECOND},
+
+ {"innodb_dict_lru_count_active", "server",
+ "Number of tables evicted from DICT LRU list in the active loop",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE},
+
+ {"innodb_dict_lru_count_idle", "server",
+ "Number of tables evicted from DICT LRU list in the idle loop",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_SRV_DICT_LRU_EVICT_COUNT_IDLE},
+
+ {"innodb_dblwr_writes", "server",
+ "Number of doublewrite operations that have been performed"
+ " (innodb_dblwr_writes)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_DBLWR_WRITES},
+
+ {"innodb_dblwr_pages_written", "server",
+ "Number of pages that have been written for doublewrite operations"
+ " (innodb_dblwr_pages_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN},
+
+ {"innodb_page_size", "server",
+ "InnoDB page size in bytes (innodb_page_size)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_PAGE_SIZE},
+
+ {"innodb_rwlock_s_spin_waits", "server",
+ "Number of rwlock spin waits due to shared latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_SPIN_WAITS},
+
+ {"innodb_rwlock_x_spin_waits", "server",
+ "Number of rwlock spin waits due to exclusive latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_SPIN_WAITS},
+
+ {"innodb_rwlock_sx_spin_waits", "server",
+ "Number of rwlock spin waits due to sx latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_SX_SPIN_WAITS},
+
+ {"innodb_rwlock_s_spin_rounds", "server",
+ "Number of rwlock spin loop rounds due to shared latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS},
+
+ {"innodb_rwlock_x_spin_rounds", "server",
+ "Number of rwlock spin loop rounds due to exclusive latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS},
+
+ {"innodb_rwlock_sx_spin_rounds", "server",
+ "Number of rwlock spin loop rounds due to sx latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_SX_SPIN_ROUNDS},
+
+ {"innodb_rwlock_s_os_waits", "server",
+ "Number of OS waits due to shared latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_OS_WAITS},
+
+ {"innodb_rwlock_x_os_waits", "server",
+ "Number of OS waits due to exclusive latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_OS_WAITS},
+
+ {"innodb_rwlock_sx_os_waits", "server",
+ "Number of OS waits due to sx latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_SX_OS_WAITS},
+
+ /* ========== Counters for DML operations ========== */
+ {"module_dml", "dml", "Statistics for DMLs",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_DML_STATS},
+
+ {"dml_reads", "dml", "Number of rows read",
+ static_cast<monitor_type_t>(MONITOR_EXISTING),
+ MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_READ},
+
+ {"dml_inserts", "dml", "Number of rows inserted",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_INSERTED},
+
+ {"dml_deletes", "dml", "Number of rows deleted",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_DELETED},
+
+ {"dml_updates", "dml", "Number of rows updated",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_UPDTATED},
+
+ {"dml_system_reads", "dml", "Number of system rows read",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OLVD_SYSTEM_ROW_READ},
+
+ {"dml_system_inserts", "dml", "Number of system rows inserted",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OLVD_SYSTEM_ROW_INSERTED},
+
+ {"dml_system_deletes", "dml", "Number of system rows deleted",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OLVD_SYSTEM_ROW_DELETED},
+
+ {"dml_system_updates", "dml", "Number of system rows updated",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OLVD_SYSTEM_ROW_UPDATED},
+
+ /* ========== Counters for DDL operations ========== */
+ {"module_ddl", "ddl", "Statistics for DDLs",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_DDL_STATS},
+
+ {"ddl_background_drop_indexes", "ddl",
+ "Number of indexes waiting to be dropped after failed index creation",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_BACKGROUND_DROP_INDEX},
+
+ {"ddl_background_drop_tables", "ddl",
+ "Number of tables in background drop table list",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_BACKGROUND_DROP_TABLE},
+
+ {"ddl_online_create_index", "ddl",
+ "Number of indexes being created online",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ONLINE_CREATE_INDEX},
+
+ {"ddl_pending_alter_table", "ddl",
+ "Number of ALTER TABLE, CREATE INDEX, DROP INDEX in progress",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PENDING_ALTER_TABLE},
+
+ {"ddl_sort_file_alter_table", "ddl",
+ "Number of sort files created during alter table",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ALTER_TABLE_SORT_FILES},
+
+ {"ddl_log_file_alter_table", "ddl",
+ "Number of log files created during alter table",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ALTER_TABLE_LOG_FILES},
+
+ /* ===== Counters for ICP (Index Condition Pushdown) Module ===== */
+ {"module_icp", "icp", "Index Condition Pushdown",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_ICP},
+
+ {"icp_attempts", "icp",
+ "Number of attempts for index push-down condition checks",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ICP_ATTEMPTS},
+
+ {"icp_no_match", "icp", "Index push-down condition does not match",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ICP_NO_MATCH},
+
+ {"icp_out_of_range", "icp", "Index push-down condition out of range",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ICP_OUT_OF_RANGE},
+
+ {"icp_match", "icp", "Index push-down condition matches",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ICP_MATCH},
+
+ /* ========== Mutex monitoring on/off ========== */
+ {"latch_status", "Latch counters",
+	 "Collect latch counters to display via SHOW ENGINE INNODB MUTEX",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_LATCHES},
+
+ {"latch", "sync", "Latch monitoring control",
+ MONITOR_HIDDEN,
+ MONITOR_DEFAULT_START, MONITOR_LATCHES},
+
+ /* ========== To turn on/off reset all counters ========== */
+ {"all", "All Counters", "Turn on/off and reset all counters",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_ALL_COUNTER}
+};
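+
+/* Editorial note (a sketch, not part of the original source): each
+entry above is a monitor_info_t aggregate. Assuming the field layout
+declared in srv0mon.h, an entry reads as
+
+	{monitor_name, monitor_module, monitor_desc,
+	 monitor_type, monitor_related_id, monitor_id}
+
+so MONITOR_DEFAULT_START fills the related-id slot, and the last field
+is the monitor_id_t used to index innodb_counter_value below. */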
+
+/* The "innodb_counter_value" array stores actual counter values */
+monitor_value_t innodb_counter_value[NUM_MONITOR];
+
+/* monitor_set_tbl is used to record and determine whether a monitor
+has been turned on/off. */
+Atomic_relaxed<ulint>
+ monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) / NUM_BITS_ULINT];
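+
+/* Editorial sketch (not part of the original source): monitor_set_tbl
+packs one on/off bit per monitor into an array of ulint words, so
+looking up a monitor's state amounts to word/bit arithmetic, roughly:
+
+	inline bool monitor_bit_is_set(monitor_id_t id)
+	{
+		ulint	word = ulint(id) / NUM_BITS_ULINT;
+		ulint	mask = ulint(1) << (ulint(id) % NUM_BITS_ULINT);
+		return((monitor_set_tbl[word] & mask) != 0);
+	}
+
+The actual accessors are the MONITOR_ON()/MONITOR_OFF()/MONITOR_IS_ON()
+macros declared in srv0mon.h; the helper above is only an assumed
+equivalent for illustration. */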
+
+/****************************************************************//**
+Get a monitor's "monitor_info" by its monitor id (index into the
+innodb_counter_info array).
+@return pointer to the corresponding monitor_info_t, or NULL if no such
+monitor */
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+ monitor_id_t monitor_id) /*!< id indexing into the
+ innodb_counter_info array */
+{
+ ut_a(monitor_id < NUM_MONITOR);
+
+ return((monitor_id < NUM_MONITOR)
+ ? &innodb_counter_info[monitor_id]
+ : NULL);
+}
+
+/****************************************************************//**
+Get a monitor's name by its monitor id (index into the
+innodb_counter_info array).
+@return the corresponding monitor name, or NULL if no such
+monitor */
+const char*
+srv_mon_get_name(
+/*=============*/
+ monitor_id_t monitor_id) /*!< id index into the
+ innodb_counter_info array */
+{
+ ut_a(monitor_id < NUM_MONITOR);
+
+ return((monitor_id < NUM_MONITOR)
+ ? innodb_counter_info[monitor_id].monitor_name
+ : NULL);
+}
+
+/****************************************************************//**
+Turn on/off or reset the monitor counters in a module. If module_id
+is MONITOR_ALL_COUNTER, then turn on all monitor counters. A counter
+that is already turned on is left as-is and a note is logged. */
+void
+srv_mon_set_module_control(
+/*=======================*/
+ monitor_id_t module_id, /*!< in: Module ID as in
+ monitor_counter_id. If it is
+ set to MONITOR_ALL_COUNTER, this means
+ we shall turn on all the counters */
+ mon_option_t set_option) /*!< in: Turn on/off reset the
+ counter */
+{
+ lint ix;
+ lint start_id;
+ ibool set_current_module = FALSE;
+
+ ut_a(module_id <= NUM_MONITOR);
+ compile_time_assert(array_elements(innodb_counter_info)
+ == NUM_MONITOR);
+
+ /* The module_id must be an ID of MONITOR_MODULE type */
+ ut_a(innodb_counter_info[module_id].monitor_type & MONITOR_MODULE);
+
+ /* start with the first monitor in the module. If module_id
+ is MONITOR_ALL_COUNTER, this means we need to turn on all
+ monitor counters. */
+ if (module_id == MONITOR_ALL_COUNTER) {
+ start_id = 1;
+ } else if (innodb_counter_info[module_id].monitor_type
+ & MONITOR_GROUP_MODULE) {
+ /* Counters in this module are set as a group together
+ and cannot be turned on/off individually. Need to set
+ the on/off bit in the module counter */
+ start_id = module_id;
+ set_current_module = TRUE;
+
+ } else {
+ start_id = module_id + 1;
+ }
+
+ for (ix = start_id; ix < NUM_MONITOR; ix++) {
+		/* If we hit the next module counter, we continue
+		if we want to turn on all monitor counters, and
+		break if we are only setting the counters in the
+		current module. */
+ if (innodb_counter_info[ix].monitor_type & MONITOR_MODULE) {
+
+ if (set_current_module) {
+ /* Continue to set on/off bit on current
+ module */
+ set_current_module = FALSE;
+ } else if (module_id == MONITOR_ALL_COUNTER) {
+ if (!(innodb_counter_info[ix].monitor_type
+ & MONITOR_GROUP_MODULE)) {
+ continue;
+ }
+ } else {
+ /* Hitting the next module, stop */
+ break;
+ }
+ }
+
+		/* Cannot turn on a monitor that has already been
+		turned on. The user should be aware that some counters
+		are already on before turning them on again (which
+		could reset the counter value) */
+ if (MONITOR_IS_ON(ix) && (set_option == MONITOR_TURN_ON)) {
+ ib::info() << "Monitor '"
+ << srv_mon_get_name((monitor_id_t) ix)
+ << "' is already enabled.";
+ continue;
+ }
+
+ /* For some existing counters (server status variables),
+ we will get its counter value at the start/stop time
+ to calculate the actual value during the time. */
+ if (innodb_counter_info[ix].monitor_type & MONITOR_EXISTING) {
+ srv_mon_process_existing_counter(
+ static_cast<monitor_id_t>(ix), set_option);
+ }
+
+ /* Currently support 4 operations on the monitor counters:
+ turn on, turn off, reset and reset all operations. */
+ switch (set_option) {
+ case MONITOR_TURN_ON:
+ MONITOR_ON(ix);
+ MONITOR_INIT(ix);
+ MONITOR_SET_START(ix);
+ break;
+
+ case MONITOR_TURN_OFF:
+ MONITOR_OFF(ix);
+ MONITOR_SET_OFF(ix);
+ break;
+
+ case MONITOR_RESET_VALUE:
+ srv_mon_reset(static_cast<monitor_id_t>(ix));
+ break;
+
+ case MONITOR_RESET_ALL_VALUE:
+ srv_mon_reset_all(static_cast<monitor_id_t>(ix));
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+}
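+
+/* Usage sketch (editorial; the mapping to the SQL layer is an
+assumption): this function backs module-level counter control, e.g.
+SET GLOBAL innodb_monitor_enable = 'module_dml'. A hypothetical direct
+call that turns on every counter in the DML module would be:
+
+	srv_mon_set_module_control(MONITOR_MODULE_DML_STATS,
+				   MONITOR_TURN_ON);
+
+Since MONITOR_MODULE_DML_STATS is a MONITOR_MODULE entry, the loop
+above starts at module_id + 1 and stops at the next module marker. */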
+
+/****************************************************************//**
+Get transaction system's rollback segment size in pages
+@return size in pages */
+static
+ulint
+srv_mon_get_rseg_size(void)
+/*=======================*/
+{
+ ulint i;
+ ulint value = 0;
+
+	/* rseg_array is a static array, so we can go through it without
+	mutex protection. In addition, we only provide an estimate of
+	the total rollback segment size, so to avoid mutex contention
+	we don't acquire rseg->mutex */
+ for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ const trx_rseg_t* rseg = trx_sys.rseg_array[i];
+
+ if (rseg != NULL) {
+ value += rseg->curr_size;
+ }
+ }
+
+ return(value);
+}
+
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not
+have a mechanism to start/stop and reset the counters, so we simulate
+these controls by remembering the counter values when the corresponding
+monitors are turned on/off/reset, and do the appropriate arithmetic to
+deduce the actual value. Please also refer to
+srv_export_innodb_status() for related global counters used by
+the existing status variables. */
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+ monitor_id_t monitor_id, /*!< in: the monitor's ID as in
+ monitor_counter_id */
+ mon_option_t set_option) /*!< in: Turn on/off reset the
+ counter */
+{
+ mon_type_t value;
+ monitor_info_t* monitor_info;
+ ibool update_min = FALSE;
+
+ monitor_info = srv_mon_get_info(monitor_id);
+
+ ut_a(monitor_info->monitor_type & MONITOR_EXISTING);
+ ut_a(monitor_id < NUM_MONITOR);
+
+ /* Get the value from corresponding global variable */
+ switch (monitor_id) {
+	/* export_vars.innodb_buffer_pool_reads: number of reads
+	from disk (page not in buffer pool) */
+ case MONITOR_OVLD_BUF_POOL_READS:
+ value = srv_stats.buf_pool_reads;
+ break;
+
+ /* innodb_buffer_pool_read_requests, the number of logical
+ read requests */
+ case MONITOR_OVLD_BUF_POOL_READ_REQUESTS:
+ value = buf_pool.stat.n_page_gets;
+ break;
+
+ /* innodb_buffer_pool_write_requests, the number of
+	write requests */
+ case MONITOR_OVLD_BUF_POOL_WRITE_REQUEST:
+ value = srv_stats.buf_pool_write_requests;
+ break;
+
+ /* innodb_buffer_pool_wait_free */
+ case MONITOR_OVLD_BUF_POOL_WAIT_FREE:
+ value = buf_pool.stat.LRU_waits;
+ break;
+
+ /* innodb_buffer_pool_read_ahead */
+ case MONITOR_OVLD_BUF_POOL_READ_AHEAD:
+ value = buf_pool.stat.n_ra_pages_read;
+ break;
+
+ /* innodb_buffer_pool_read_ahead_evicted */
+ case MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED:
+ value = buf_pool.stat.n_ra_pages_evicted;
+ break;
+
+ /* innodb_buffer_pool_pages_total */
+ case MONITOR_OVLD_BUF_POOL_PAGE_TOTAL:
+ value = buf_pool.get_n_pages();
+ break;
+
+ /* innodb_buffer_pool_pages_misc */
+ case MONITOR_OVLD_BUF_POOL_PAGE_MISC:
+ value = buf_pool.get_n_pages()
+ - UT_LIST_GET_LEN(buf_pool.LRU)
+ - UT_LIST_GET_LEN(buf_pool.free);
+ break;
+
+ /* innodb_buffer_pool_pages_data */
+ case MONITOR_OVLD_BUF_POOL_PAGES_DATA:
+ value = UT_LIST_GET_LEN(buf_pool.LRU);
+ break;
+
+ /* innodb_buffer_pool_bytes_data */
+ case MONITOR_OVLD_BUF_POOL_BYTES_DATA:
+ value = buf_pool.stat.LRU_bytes
+ + (UT_LIST_GET_LEN(buf_pool.unzip_LRU)
+ << srv_page_size_shift);
+ break;
+
+ /* innodb_buffer_pool_pages_dirty */
+ case MONITOR_OVLD_BUF_POOL_PAGES_DIRTY:
+ value = UT_LIST_GET_LEN(buf_pool.flush_list);
+ break;
+
+ /* innodb_buffer_pool_bytes_dirty */
+ case MONITOR_OVLD_BUF_POOL_BYTES_DIRTY:
+ value = buf_pool.stat.flush_list_bytes;
+ break;
+
+ /* innodb_buffer_pool_pages_free */
+ case MONITOR_OVLD_BUF_POOL_PAGES_FREE:
+ value = UT_LIST_GET_LEN(buf_pool.free);
+ break;
+
+ /* innodb_pages_created, the number of pages created */
+ case MONITOR_OVLD_PAGE_CREATED:
+ value = buf_pool.stat.n_pages_created;
+ break;
+
+	/* innodb_pages_written, the number of pages written */
+ case MONITOR_OVLD_PAGES_WRITTEN:
+ value = buf_pool.stat.n_pages_written;
+ break;
+
+ /* innodb_index_pages_written, the number of index pages written */
+ case MONITOR_OVLD_INDEX_PAGES_WRITTEN:
+ value = srv_stats.index_pages_written;
+ break;
+
+	/* innodb_non_index_pages_written, the number of non-index pages written */
+ case MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN:
+ value = srv_stats.non_index_pages_written;
+ break;
+
+ /* innodb_pages_read */
+ case MONITOR_OVLD_PAGES_READ:
+ value = buf_pool.stat.n_pages_read;
+ break;
+
+ /* Number of times secondary index lookup triggered cluster lookup */
+ case MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS:
+ value = srv_stats.n_sec_rec_cluster_reads;
+ break;
+ /* Number of times prefix optimization avoided triggering cluster
+ lookup */
+ case MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED:
+ value = srv_stats.n_sec_rec_cluster_reads_avoided;
+ break;
+
+	/* innodb_data_read, the total amount of data read in bytes */
+ case MONITOR_OVLD_BYTE_READ:
+ value = srv_stats.data_read;
+ break;
+
+	/* innodb_data_written, the total amount of data written in bytes */
+ case MONITOR_OVLD_BYTE_WRITTEN:
+ value = srv_stats.data_written;
+ break;
+
+ /* innodb_data_reads, the total number of data reads. */
+ case MONITOR_OVLD_OS_FILE_READ:
+ value = os_n_file_reads;
+ break;
+
+	/* innodb_data_writes, the total number of data writes */
+ case MONITOR_OVLD_OS_FILE_WRITE:
+ value = os_n_file_writes;
+ break;
+
+ /* innodb_data_fsyncs, number of fsync() operations so far. */
+ case MONITOR_OVLD_OS_FSYNC:
+ value = os_n_fsyncs;
+ break;
+
+ /* innodb_os_log_written */
+ case MONITOR_OVLD_OS_LOG_WRITTEN:
+ value = (mon_type_t) srv_stats.os_log_written;
+ break;
+
+ /* innodb_os_log_fsyncs */
+ case MONITOR_OVLD_OS_LOG_FSYNC:
+ value = log_sys.get_flushes();
+ break;
+
+ /* innodb_os_log_pending_fsyncs */
+ case MONITOR_OVLD_OS_LOG_PENDING_FSYNC:
+ value = log_sys.get_pending_flushes();
+ update_min = TRUE;
+ break;
+
+ /* innodb_os_log_pending_writes */
+ case MONITOR_OVLD_OS_LOG_PENDING_WRITES:
+ value = srv_stats.os_log_pending_writes;
+ update_min = TRUE;
+ break;
+
+ /* innodb_log_waits */
+ case MONITOR_OVLD_LOG_WAITS:
+ value = srv_stats.log_waits;
+ break;
+
+ /* innodb_log_write_requests */
+ case MONITOR_OVLD_LOG_WRITE_REQUEST:
+ value = srv_stats.log_write_requests;
+ break;
+
+ /* innodb_log_writes */
+ case MONITOR_OVLD_LOG_WRITES:
+ value = srv_stats.log_writes;
+ break;
+
+ case MONITOR_OVLD_LOG_PADDED:
+ value = srv_stats.log_padded;
+ break;
+
+ /* innodb_dblwr_writes */
+ case MONITOR_OVLD_SRV_DBLWR_WRITES:
+ buf_dblwr.lock();
+ value = buf_dblwr.batches();
+ buf_dblwr.unlock();
+ break;
+
+ /* innodb_dblwr_pages_written */
+ case MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN:
+ buf_dblwr.lock();
+ value = buf_dblwr.written();
+ buf_dblwr.unlock();
+ break;
+
+ /* innodb_page_size */
+ case MONITOR_OVLD_SRV_PAGE_SIZE:
+ value = srv_page_size;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_S_SPIN_WAITS:
+ value = rw_lock_stats.rw_s_spin_wait_count;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_X_SPIN_WAITS:
+ value = rw_lock_stats.rw_x_spin_wait_count;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_SX_SPIN_WAITS:
+ value = rw_lock_stats.rw_sx_spin_wait_count;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS:
+ value = rw_lock_stats.rw_s_spin_round_count;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS:
+ value = rw_lock_stats.rw_x_spin_round_count;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_SX_SPIN_ROUNDS:
+ value = rw_lock_stats.rw_sx_spin_round_count;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_S_OS_WAITS:
+ value = rw_lock_stats.rw_s_os_wait_count;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_X_OS_WAITS:
+ value = rw_lock_stats.rw_x_os_wait_count;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_SX_OS_WAITS:
+ value = rw_lock_stats.rw_sx_os_wait_count;
+ break;
+
+ case MONITOR_OVLD_BUFFER_POOL_SIZE:
+ value = srv_buf_pool_size;
+ break;
+
+ /* innodb_rows_read */
+ case MONITOR_OLVD_ROW_READ:
+ value = srv_stats.n_rows_read;
+ break;
+
+ /* innodb_rows_inserted */
+ case MONITOR_OLVD_ROW_INSERTED:
+ value = srv_stats.n_rows_inserted;
+ break;
+
+ /* innodb_rows_deleted */
+ case MONITOR_OLVD_ROW_DELETED:
+ value = srv_stats.n_rows_deleted;
+ break;
+
+ /* innodb_rows_updated */
+ case MONITOR_OLVD_ROW_UPDTATED:
+ value = srv_stats.n_rows_updated;
+ break;
+
+ /* innodb_system_rows_read */
+ case MONITOR_OLVD_SYSTEM_ROW_READ:
+ value = srv_stats.n_system_rows_read;
+ break;
+
+ /* innodb_system_rows_inserted */
+ case MONITOR_OLVD_SYSTEM_ROW_INSERTED:
+ value = srv_stats.n_system_rows_inserted;
+ break;
+
+ /* innodb_system_rows_deleted */
+ case MONITOR_OLVD_SYSTEM_ROW_DELETED:
+ value = srv_stats.n_system_rows_deleted;
+ break;
+
+ /* innodb_system_rows_updated */
+ case MONITOR_OLVD_SYSTEM_ROW_UPDATED:
+ value = srv_stats.n_system_rows_updated;
+ break;
+
+ /* innodb_row_lock_current_waits */
+ case MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT:
+ value = srv_stats.n_lock_wait_current_count;
+ break;
+
+ /* innodb_row_lock_time */
+ case MONITOR_OVLD_LOCK_WAIT_TIME:
+ value = srv_stats.n_lock_wait_time / 1000;
+ break;
+
+ /* innodb_row_lock_time_max */
+ case MONITOR_OVLD_LOCK_MAX_WAIT_TIME:
+ value = lock_sys.n_lock_max_wait_time / 1000;
+ break;
+
+ /* innodb_row_lock_time_avg */
+ case MONITOR_OVLD_LOCK_AVG_WAIT_TIME:
+ if (srv_stats.n_lock_wait_count > 0) {
+ value = srv_stats.n_lock_wait_time / 1000
+ / srv_stats.n_lock_wait_count;
+ } else {
+ value = 0;
+ }
+ break;
+
+ /* innodb_row_lock_waits */
+ case MONITOR_OVLD_ROW_LOCK_WAIT:
+ value = srv_stats.n_lock_wait_count;
+ break;
+
+ case MONITOR_RSEG_HISTORY_LEN:
+ value = trx_sys.rseg_history_len;
+ break;
+
+ case MONITOR_RSEG_CUR_SIZE:
+ value = srv_mon_get_rseg_size();
+ break;
+
+ case MONITOR_OVLD_N_FILE_OPENED:
+ value = fil_system.n_open;
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGE_INSERT:
+ value = ibuf.n_merged_ops[IBUF_OP_INSERT];
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGE_DELETE:
+ value = ibuf.n_merged_ops[IBUF_OP_DELETE_MARK];
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGE_PURGE:
+ value = ibuf.n_merged_ops[IBUF_OP_DELETE];
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT:
+ value = ibuf.n_discarded_ops[IBUF_OP_INSERT];
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE:
+ value = ibuf.n_discarded_ops[IBUF_OP_DELETE_MARK];
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE:
+ value = ibuf.n_discarded_ops[IBUF_OP_DELETE];
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGES:
+ value = ibuf.n_merges;
+ break;
+
+ case MONITOR_OVLD_IBUF_SIZE:
+ value = ibuf.size;
+ break;
+
+ case MONITOR_OVLD_SERVER_ACTIVITY:
+ value = srv_get_activity_count();
+ break;
+
+ case MONITOR_OVLD_LSN_FLUSHDISK:
+ value = log_sys.get_flushed_lsn();
+ break;
+
+ case MONITOR_OVLD_LSN_CURRENT:
+ value = log_sys.get_lsn();
+ break;
+
+ case MONITOR_PENDING_LOG_FLUSH:
+ value = static_cast<mon_type_t>(log_sys.pending_flushes);
+		break;
+
+ case MONITOR_PENDING_CHECKPOINT_WRITE:
+ mysql_mutex_lock(&log_sys.mutex);
+ value = static_cast<mon_type_t>(
+ log_sys.n_pending_checkpoint_writes);
+ mysql_mutex_unlock(&log_sys.mutex);
+ break;
+
+ case MONITOR_LOG_IO:
+ mysql_mutex_lock(&log_sys.mutex);
+ value = static_cast<mon_type_t>(log_sys.n_log_ios);
+ mysql_mutex_unlock(&log_sys.mutex);
+ break;
+
+ case MONITOR_LSN_CHECKPOINT_AGE:
+ mysql_mutex_lock(&log_sys.mutex);
+ value = static_cast<mon_type_t>(log_sys.get_lsn()
+ - log_sys.last_checkpoint_lsn);
+ mysql_mutex_unlock(&log_sys.mutex);
+ break;
+
+ case MONITOR_OVLD_BUF_OLDEST_LSN:
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ value = (mon_type_t) buf_pool.get_oldest_modification(0);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ break;
+
+ case MONITOR_OVLD_LSN_CHECKPOINT:
+ value = (mon_type_t) log_sys.last_checkpoint_lsn;
+ break;
+
+ case MONITOR_OVLD_MAX_AGE_ASYNC:
+ value = log_sys.max_modified_age_async;
+ break;
+
+#ifdef BTR_CUR_HASH_ADAPT
+ case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH:
+ value = btr_cur_n_sea;
+ break;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE:
+ value = btr_cur_n_non_sea;
+ break;
+
+ case MONITOR_OVLD_PAGE_COMPRESS_SAVED:
+ value = srv_stats.page_compression_saved;
+ break;
+ case MONITOR_OVLD_PAGES_PAGE_COMPRESSED:
+ value = srv_stats.pages_page_compressed;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP:
+ value = srv_stats.page_compressed_trim_op;
+ break;
+ case MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED:
+ value = srv_stats.pages_page_decompressed;
+ break;
+ case MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR:
+ value = srv_stats.pages_page_compression_error;
+ break;
+ case MONITOR_OVLD_PAGES_ENCRYPTED:
+ value = srv_stats.pages_encrypted;
+ break;
+ case MONITOR_OVLD_PAGES_DECRYPTED:
+ value = srv_stats.pages_decrypted;
+ break;
+
+ default:
+ ut_error;
+ }
+
+ switch (set_option) {
+ case MONITOR_TURN_ON:
+ /* Save the initial counter value in mon_start_value
+ field */
+ MONITOR_SAVE_START(monitor_id, value);
+ return;
+
+ case MONITOR_TURN_OFF:
+		/* Save the counter value to mon_last_value when we
+		turn off the monitor but have not yet reset it. Note
+		the counter has not yet been set to off in the bitmap
+		table for a normal turn-off. We need to check the
+		counter status (on/off) to avoid resetting the value
+		for an already-off counter */
+ if (MONITOR_IS_ON(monitor_id)) {
+ srv_mon_process_existing_counter(monitor_id,
+ MONITOR_GET_VALUE);
+ MONITOR_SAVE_LAST(monitor_id);
+ }
+ return;
+
+ case MONITOR_GET_VALUE:
+ if (MONITOR_IS_ON(monitor_id)) {
+
+			/* If the MONITOR_DISPLAY_CURRENT bit is on,
+			we only record the current value, rather than
+			an incremental value over a period. Most
+			counters of this type are resource-related,
+			such as the number of buffer pool pages. */
+ if (monitor_info->monitor_type
+ & MONITOR_DISPLAY_CURRENT) {
+ MONITOR_SET(monitor_id, value);
+ } else {
+				/* Most status counters are monotonically
+				increasing; no need to update their
+				minimum values. Only do so
+				if "update_min" is set to TRUE */
+ MONITOR_SET_DIFF(monitor_id, value);
+
+ if (update_min
+ && (MONITOR_VALUE(monitor_id)
+ < MONITOR_MIN_VALUE(monitor_id))) {
+ MONITOR_MIN_VALUE(monitor_id) =
+ MONITOR_VALUE(monitor_id);
+ }
+ }
+ }
+ return;
+
+ case MONITOR_RESET_VALUE:
+ if (!MONITOR_IS_ON(monitor_id)) {
+ MONITOR_LAST_VALUE(monitor_id) = 0;
+ }
+ return;
+
+ /* Nothing special for reset all operation for these existing
+ counters */
+ case MONITOR_RESET_ALL_VALUE:
+ return;
+ }
+}
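+
+/* Worked example (editorial): for a MONITOR_EXISTING counter without
+MONITOR_DISPLAY_CURRENT, the displayed value is a difference, not the
+raw global. Suppose srv_stats.log_writes was 1000 when "log_writes"
+was turned on (saved by MONITOR_SAVE_START) and is 1500 now; a
+MONITOR_GET_VALUE pass through MONITOR_SET_DIFF then yields 500, i.e.
+log writes since the monitor was enabled, not since server startup. */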
+
+/*************************************************************//**
+Reset a monitor, creating a new baseline from the current monitor
+value. The baseline is recorded in MONITOR_VALUE_RESET(monitor). */
+void
+srv_mon_reset(
+/*==========*/
+ monitor_id_t monitor) /*!< in: monitor id */
+{
+ ibool monitor_was_on;
+
+ monitor_was_on = MONITOR_IS_ON(monitor);
+
+ if (monitor_was_on) {
+ /* Temporarily turn off the counter for the resetting
+ operation */
+ MONITOR_OFF(monitor);
+ }
+
+ /* Before resetting the current monitor value, first
+ calculate and set the max/min value since monitor
+ start */
+ srv_mon_calc_max_since_start(monitor);
+ srv_mon_calc_min_since_start(monitor);
+
+ /* Monitors with MONITOR_DISPLAY_CURRENT bit
+ are not incremental, no need to remember
+ the reset value. */
+ if (innodb_counter_info[monitor].monitor_type
+ & MONITOR_DISPLAY_CURRENT) {
+ MONITOR_VALUE_RESET(monitor) = 0;
+ } else {
+ /* Remember the new baseline */
+ MONITOR_VALUE_RESET(monitor) = MONITOR_VALUE_RESET(monitor)
+ + MONITOR_VALUE(monitor);
+ }
+
+ /* Reset the counter value */
+ MONITOR_VALUE(monitor) = 0;
+ MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;
+ MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;
+
+ MONITOR_FIELD((monitor), mon_reset_time) = time(NULL);
+
+ if (monitor_was_on) {
+ MONITOR_ON(monitor);
+ }
+}
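+
+/* Worked example (editorial, continuing the sketch above): if the
+counter currently shows 500 and is reset, MONITOR_VALUE_RESET grows by
+500 and MONITOR_VALUE drops to 0. After 200 further events the diff
+computation reports 200, i.e. activity since the most recent reset,
+while the max/min seen since monitor start have been folded in by
+srv_mon_calc_max_since_start() and srv_mon_calc_min_since_start(). */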
+
+/*************************************************************//**
+Turn on monitor counters that are marked as default ON. */
+void
+srv_mon_default_on(void)
+/*====================*/
+{
+ ulint ix;
+
+ for (ix = 0; ix < NUM_MONITOR; ix++) {
+ if (innodb_counter_info[ix].monitor_type
+ & MONITOR_DEFAULT_ON) {
+ /* Turn on monitor counters that are default on */
+ MONITOR_ON(ix);
+ MONITOR_INIT(ix);
+ MONITOR_SET_START(ix);
+ }
+ }
+}
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
new file mode 100644
index 00000000..ad221dc2
--- /dev/null
+++ b/storage/innobase/srv/srv0srv.cc
@@ -0,0 +1,2135 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0srv.cc
+The database server main program
+
+Created 10/8/1995 Heikki Tuuri
+*******************************************************/
+
+#include "my_global.h"
+// JAN: TODO: MySQL 5.7 missing header
+//#include "my_thread.h"
+//
+#include "mysql/psi/mysql_stage.h"
+#include "mysql/psi/psi.h"
+
+#include "btr0sea.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "dict0boot.h"
+#include "dict0load.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "log0recv.h"
+#include "mem0mem.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "row0mysql.h"
+#include "row0log.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "sync0sync.h"
+#include "trx0i_s.h"
+#include "trx0purge.h"
+#include "ut0crc32.h"
+#include "btr0defragment.h"
+#include "ut0mem.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "fil0pagecompress.h"
+#include "trx0types.h"
+#include <list>
+
+#include <my_service_manager.h>
+/* The following is the maximum allowed duration of a lock wait. */
+UNIV_INTERN ulong srv_fatal_semaphore_wait_threshold = DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT;
+
+/* How long data manipulation language (DML) statements need to be delayed,
+in microseconds, in order to reduce the lagging of the purge thread. */
+ulint srv_dml_needed_delay;
+
+const char* srv_main_thread_op_info = "";
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+const char srv_mysql50_table_name_prefix[10] = "#mysql50#";
+
+/* Server parameters which are read from the initfile */
+
+/* The following three are directory paths which are concatenated before file
+names, where the file name itself may also contain a path */
+
+char* srv_data_home;
+
+/** Rollback files directory, can be absolute. */
+char* srv_undo_dir;
+
+/** The number of tablespaces to use for rollback segments. */
+ulong srv_undo_tablespaces;
+
+/** The number of UNDO tablespaces that are open and ready to use. */
+ulint srv_undo_tablespaces_open;
+
+/** The number of UNDO tablespaces that are active (hosting some rollback
+segment). It is quite possible that some of the tablespaces do not host
+any rollback segments, depending on the configuration used. */
+ulint srv_undo_tablespaces_active;
+
+/** Rate at which UNDO records should be purged. */
+ulong srv_purge_rseg_truncate_frequency;
+
+/** Enable or disable truncation of UNDO tablespaces.
+Note: If enabled, an UNDO tablespace will be selected for truncation.
+If the user disables this option while the server is waiting for an
+undo tablespace to be truncated, the truncate action is still completed,
+but no new tablespace is marked for truncation (the action is never
+aborted). */
+my_bool srv_undo_log_truncate;
+
+/** Maximum size of undo tablespace. */
+unsigned long long srv_max_undo_log_size;
+
+/** Set if InnoDB must operate in read-only mode. We don't do any
+recovery and open all tables in RO mode instead of RW mode. We don't
+sync the max trx id to disk either. */
+my_bool srv_read_only_mode;
+/** store each table created by a user in its own file; data
+dictionary tables are in the system tablespace 0 */
+my_bool srv_file_per_table;
+/** Set if InnoDB operates in read-only mode or innodb-force-recovery
+is greater than SRV_FORCE_NO_TRX_UNDO. */
+my_bool high_level_read_only;
+
+/** Sort buffer size in index creation */
+ulong srv_sort_buf_size;
+/** Maximum modification log file size for online index creation */
+unsigned long long srv_online_max_size;
+
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided Innobase was compiled with it), otherwise we will
+use the simulated aio that we build below with threads.
+Currently we support native aio on Windows and Linux. */
+my_bool srv_use_native_aio;
+my_bool srv_numa_interleave;
+/** copy of innodb_use_atomic_writes; @see innodb_init_params() */
+my_bool srv_use_atomic_writes;
+/** innodb_compression_algorithm; used with page compression */
+ulong innodb_compression_algorithm;
+
+#ifdef UNIV_DEBUG
+/** Used by SET GLOBAL innodb_master_thread_disabled_debug = X. */
+my_bool srv_master_thread_disabled_debug;
+/** Event used to inform that master thread is disabled. */
+static os_event_t srv_master_thread_disabled_event;
+#endif /* UNIV_DEBUG */
+
+/*------------------------- LOG FILES ------------------------ */
+char* srv_log_group_home_dir;
+
+/** The InnoDB redo log file size, or 0 when changing the redo log format
+at startup (while disallowing writes to the redo log). */
+ulonglong srv_log_file_size;
+/** innodb_log_buffer_size, in bytes */
+ulong srv_log_buffer_size;
+/** innodb_flush_log_at_trx_commit */
+ulong srv_flush_log_at_trx_commit;
+/** innodb_flush_log_at_timeout */
+uint srv_flush_log_at_timeout;
+/** innodb_page_size */
+ulong srv_page_size;
+/** log2 of innodb_page_size; @see innodb_init_params() */
+ulong srv_page_size_shift;
+/** innodb_log_write_ahead_size */
+ulong srv_log_write_ahead_size;
+
+/** innodb_adaptive_flushing; try to flush dirty pages so as to avoid
+IO bursts at the checkpoints. */
+my_bool srv_adaptive_flushing;
+
+/** innodb_flush_sync; whether to ignore io_capacity at log checkpoints */
+my_bool srv_flush_sync;
+
+/** common thread pool*/
+tpool::thread_pool* srv_thread_pool;
+
+/** Maximum number of times allowed to conditionally acquire
+mutex before switching to blocking wait on the mutex */
+#define MAX_MUTEX_NOWAIT 2
+
+/** Check whether the number of failed nonblocking mutex
+acquisition attempts exceeds maximum allowed value. If so,
+srv_printf_innodb_monitor() will request mutex acquisition
+with mutex_enter(), which will wait until it gets the mutex. */
+#define MUTEX_NOWAIT(mutex_skipped) ((mutex_skipped) < MAX_MUTEX_NOWAIT)
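+
+/* Usage sketch (editorial): a caller typically keeps a local skip
+counter and falls back to a blocking acquisition once the limit is
+exceeded, roughly:
+
+	if (MUTEX_NOWAIT(mutex_skipped)) {
+		if (mutex_enter_nowait(&mutex)) {
+			mutex_skipped++;	/* contended; retry later */
+			return;
+		}
+	} else {
+		mutex_enter(&mutex);	/* skipped enough; block */
+	}
+
+mutex_enter_nowait() stands here for whichever nonblocking entry point
+the caller uses; the exact call and its return convention are
+assumptions of this sketch. */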
+
+#ifdef WITH_INNODB_DISALLOW_WRITES
+UNIV_INTERN os_event_t srv_allow_writes_event;
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+
+/** copy of innodb_buffer_pool_size */
+ulint srv_buf_pool_size;
+const ulint srv_buf_pool_min_size = 5 * 1024 * 1024;
+/** Default pool size in bytes */
+const ulint srv_buf_pool_def_size = 128 * 1024 * 1024;
+/** Requested buffer pool chunk size */
+ulong srv_buf_pool_chunk_unit;
+/** innodb_lru_scan_depth; number of blocks scanned in LRU flush batch */
+ulong srv_LRU_scan_depth;
+/** innodb_flush_neighbors; whether or not to flush neighbors of a block */
+ulong srv_flush_neighbors;
+/** Previously requested size */
+ulint srv_buf_pool_old_size;
+/** Current size as scaling factor for the other components */
+ulint srv_buf_pool_base_size;
+/** Current size in bytes */
+ulint srv_buf_pool_curr_size;
+/** Dump this % of each buffer pool during BP dump */
+ulong srv_buf_pool_dump_pct;
+/** Abort load after this amount of pages */
+#ifdef UNIV_DEBUG
+ulong srv_buf_pool_load_pages_abort = LONG_MAX;
+#endif
+/** Lock table size in bytes */
+ulint srv_lock_table_size = ULINT_MAX;
+
+/** innodb_read_io_threads */
+uint srv_n_read_io_threads;
+/** innodb_write_io_threads */
+uint srv_n_write_io_threads;
+
+/** innodb_random_read_ahead */
+my_bool srv_random_read_ahead;
+/** innodb_read_ahead_threshold; the number of pages that must be present
+in the buffer cache and accessed sequentially for InnoDB to trigger a
+readahead request. */
+ulong srv_read_ahead_threshold;
+
+/** innodb_change_buffer_max_size; maximum on-disk size of change
+buffer in terms of percentage of the buffer pool. */
+uint srv_change_buffer_max_size;
+
+ulong srv_file_flush_method;
+
+
+/** copy of innodb_open_files; @see innodb_init_params() */
+ulint srv_max_n_open_files;
+
+/** innodb_io_capacity */
+ulong srv_io_capacity;
+/** innodb_io_capacity_max */
+ulong srv_max_io_capacity;
+
+/* The InnoDB main thread tries to keep the ratio of modified pages
+in the buffer pool to all database pages in the buffer pool smaller than
+the following number. But it is not guaranteed that the value stays below
+that during a time of heavy update/insert activity. */
+
+/** innodb_max_dirty_pages_pct */
+double srv_max_buf_pool_modified_pct;
+/** innodb_max_dirty_pages_pct_lwm */
+double srv_max_dirty_pages_pct_lwm;
+
+/** innodb_adaptive_flushing_lwm; the percentage of log capacity at
+which adaptive flushing, if enabled, will kick in. */
+double srv_adaptive_flushing_lwm;
+
+/** innodb_flushing_avg_loops; number of iterations over which
+adaptive flushing is averaged */
+ulong srv_flushing_avg_loops;
+
+/** innodb_purge_threads; the number of purge tasks to use */
+uint srv_n_purge_threads;
+
+/** innodb_purge_batch_size, in pages */
+ulong srv_purge_batch_size;
+
+/** innodb_stats_method decides how InnoDB treats
+NULL values when collecting statistics. By default, it is set to
+SRV_STATS_NULLS_EQUAL(0), i.e. all NULL values are treated as equal */
+ulong srv_innodb_stats_method;
+
+srv_stats_t srv_stats;
+
+/* structure to pass status variables to MySQL */
+export_var_t export_vars;
+
+/** Normally 0. When nonzero, skip some phases of crash recovery,
+starting from SRV_FORCE_IGNORE_CORRUPT, so that data can be recovered
+by SELECT or mysqldump. When this is nonzero, we do not allow any user
+modifications to the data. */
+ulong srv_force_recovery;
+
+/** innodb_print_all_deadlocks; whether to print all user-level
+transactions deadlocks to the error log */
+my_bool srv_print_all_deadlocks;
+
+/** innodb_cmp_per_index_enabled; enable
+INFORMATION_SCHEMA.innodb_cmp_per_index */
+my_bool srv_cmp_per_index_enabled;
+
+/** innodb_fast_shutdown=1 skips purge and change buffer merge.
+innodb_fast_shutdown=2 effectively crashes the server (no log checkpoint).
+innodb_fast_shutdown=3 is a clean shutdown that skips the rollback
+of active transactions (to be done on restart). */
+uint srv_fast_shutdown;
+
+/** copy of innodb_status_file; generate an innodb_status.<pid> file */
+ibool srv_innodb_status;
+
+/** innodb_prefix_index_cluster_optimization; whether to optimize
+prefix index queries to skip cluster index lookup when possible */
+my_bool srv_prefix_index_cluster_optimization;
+
+/** innodb_stats_transient_sample_pages;
+When estimating the number of different key values in an index, sample
+this many index pages. There are two ways to calculate statistics:
+* persistent stats that are calculated by ANALYZE TABLE and saved
+  in the innodb database.
+* quick transient stats, which are used if persistent stats for the given
+  table/index are not found in the innodb database */
+unsigned long long srv_stats_transient_sample_pages;
+/** innodb_stats_persistent */
+my_bool srv_stats_persistent;
+/** innodb_stats_include_delete_marked */
+my_bool srv_stats_include_delete_marked;
+/** innodb_stats_persistent_sample_pages */
+unsigned long long srv_stats_persistent_sample_pages;
+/** innodb_stats_auto_recalc */
+my_bool srv_stats_auto_recalc;
+
+/** innodb_stats_modified_counter; The number of rows modified before
+we calculate new statistics (default 0 = current limits) */
+unsigned long long srv_stats_modified_counter;
+
+/** innodb_stats_traditional; enable traditional statistics calculation
+based on the number of configured pages */
+my_bool srv_stats_sample_traditional;
+
+my_bool srv_use_doublewrite_buf;
+
+/** innodb_sync_spin_loops */
+ulong srv_n_spin_wait_rounds;
+/** innodb_spin_wait_delay */
+uint srv_spin_wait_delay;
+
+static ulint srv_n_rows_inserted_old;
+static ulint srv_n_rows_updated_old;
+static ulint srv_n_rows_deleted_old;
+static ulint srv_n_rows_read_old;
+static ulint srv_n_system_rows_inserted_old;
+static ulint srv_n_system_rows_updated_old;
+static ulint srv_n_system_rows_deleted_old;
+static ulint srv_n_system_rows_read_old;
+
+ulint srv_truncated_status_writes;
+/** Number of initialized rollback segments for persistent undo log */
+ulong srv_available_undo_logs;
+
+/* Defragmentation */
+UNIV_INTERN my_bool srv_defragment;
+/** innodb_defragment_n_pages */
+UNIV_INTERN uint srv_defragment_n_pages;
+UNIV_INTERN uint srv_defragment_stats_accuracy;
+/** innodb_defragment_fill_factor_n_recs */
+UNIV_INTERN uint srv_defragment_fill_factor_n_recs;
+/** innodb_defragment_fill_factor */
+UNIV_INTERN double srv_defragment_fill_factor;
+/** innodb_defragment_frequency */
+UNIV_INTERN uint srv_defragment_frequency;
+/** derived from innodb_defragment_frequency;
+@see innodb_defragment_frequency_update() */
+UNIV_INTERN ulonglong srv_defragment_interval;
+
+/** Current mode of operation */
+UNIV_INTERN enum srv_operation_mode srv_operation;
+
+/* Set the following to 0 if you want InnoDB to write messages on
+stderr on startup/shutdown. Not enabled on the embedded server. */
+ibool srv_print_verbose_log;
+my_bool srv_print_innodb_monitor;
+my_bool srv_print_innodb_lock_monitor;
+/** innodb_force_primary_key; whether to disallow CREATE TABLE without
+PRIMARY KEY */
+my_bool srv_force_primary_key;
+
+/** Key version to encrypt the temporary tablespace */
+my_bool innodb_encrypt_temporary_tables;
+
+my_bool srv_immediate_scrub_data_uncompressed;
+
+static time_t srv_last_monitor_time;
+
+static ib_mutex_t srv_innodb_monitor_mutex;
+
+/** Mutex protecting page_zip_stat_per_index */
+ib_mutex_t page_zip_stat_per_index_mutex;
+
+/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */
+ib_mutex_t srv_monitor_file_mutex;
+
+/** Temporary file for innodb monitor output */
+FILE* srv_monitor_file;
+/** Mutex for locking srv_misc_tmpfile. Not created if srv_read_only_mode.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+ib_mutex_t srv_misc_tmpfile_mutex;
+/** Temporary file for miscellaneous diagnostic output */
+FILE* srv_misc_tmpfile;
+
+static ulint srv_main_thread_process_no;
+static ulint srv_main_thread_id;
+
+/* The following counts are used by the srv_master_callback. */
+
+/** Iterations of the loop bounded by 'srv_active' label. */
+ulint srv_main_active_loops;
+/** Iterations of the loop bounded by the 'srv_idle' label. */
+ulint srv_main_idle_loops;
+/** Iterations of the loop bounded by the 'srv_shutdown' label. */
+static ulint srv_main_shutdown_loops;
+/** Log writes involving flush. */
+ulint srv_log_writes_and_flush;
+
+/* This is only ever touched by the master thread. It records the
+time when the last flush of log file has happened. The master
+thread ensures that we flush the log files at least once per
+second. */
+static time_t srv_last_log_flush_time;
+
+/* Interval in seconds at which various tasks are performed by the
+master thread when server is active. In order to balance the workload,
+we should try to keep intervals such that they are not multiple of
+each other. For example, if we have intervals for various tasks
+defined as 5, 10, 15, 60 then all tasks will be performed when
+current_time % 60 == 0 and no tasks will be performed when
+current_time % 5 != 0. */
+
+# define SRV_MASTER_CHECKPOINT_INTERVAL (7)
+# define SRV_MASTER_DICT_LRU_INTERVAL (47)
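+
+/* Worked example (editorial): the two intervals above, 7 and 47, are
+both prime, so the checkpoint and DICT LRU tasks coincide only when
+current_time % (7 * 47) == 0, i.e. once every 329 seconds; at all other
+multiples of 7 or 47 only one of the two tasks runs, spreading the
+master thread's workload out over time. */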
+
+/** Buffer pool dump status frequency in percent */
+UNIV_INTERN ulong srv_buf_dump_status_frequency;
+
+/*
+ IMPLEMENTATION OF THE SERVER MAIN PROGRAM
+ =========================================
+
+There is the following analogy between this database
+server and an operating system kernel:
+
+DB concept equivalent OS concept
+---------- ---------------------
+transaction -- process;
+
+query thread -- thread;
+
+lock -- semaphore;
+
+kernel -- kernel;
+
+query thread execution:
+(a) without lock mutex
+reserved -- process executing in user mode;
+(b) with lock mutex reserved
+ -- process executing in kernel mode;
+
+The server has several background threads all running at the same
+priority as user threads. It periodically checks if there is anything
+happening in the server which requires intervention of the master
+thread. Such situations may be, for example, when flushing of dirty
+blocks is needed in the buffer pool or old versions of database rows
+have to be cleaned away (purged). The user can also configure separate
+dedicated purge thread(s), in which case the master thread does not
+do any purging.
+
+The threads which we call user threads serve the queries of the MySQL
+server. They run at normal priority.
+
+When there is no activity in the system, the master thread also
+suspends itself to wait for an event, making the server totally silent.
+
+There is still one complication in our server design. If a
+background utility thread obtains a resource (e.g., a mutex) needed by a
+user thread, and there is also some other user activity in the system,
+the user thread may have to wait indefinitely for the
+resource, as the OS does not schedule a background thread if
+there is some other runnable user thread. This problem is called
+priority inversion in real-time programming.
+
+One solution to the priority inversion problem would be to keep a record
+of which thread owns which resource and, in the above case, boost the
+priority of the background thread so that it will be scheduled and can
+release the resource. This solution is called priority inheritance
+in real-time programming. A drawback of this solution is that the overhead
+of acquiring a mutex increases slightly, maybe 0.2 microseconds on a 100
+MHz Pentium, because the thread has to call os_thread_get_curr_id. This may
+be compared to the 0.5 microsecond overhead of a mutex lock-unlock pair. Note
+that the thread cannot store the information in the resource itself, say the
+mutex, because competing threads could wipe out the information if it is
+stored before acquiring the mutex, and if it is stored afterwards, the
+information is outdated for the time of at least one machine instruction.
+(To be precise, the information could be stored in the lock_word of the
+mutex if the machine supports an atomic swap.)
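+
+As an illustrative sketch of that parenthetical (not InnoDB code; it
+assumes only standard C++ <atomic> and <thread>), the owner could be
+recorded in the lock word itself with an atomic compare-and-swap:
+
+	std::atomic<std::thread::id> lock_word{};
+
+	void lock() {
+		std::thread::id unowned{};
+		while (!lock_word.compare_exchange_weak(
+			       unowned, std::this_thread::get_id())) {
+			unowned = std::thread::id();
+		}
+	}
+
+The stored id would then identify the thread whose priority to boost.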
+
+The above solution with priority inheritance may become relevant in the
+future; currently we do not implement any priority manipulation. Our
+general aim is to reduce contention on all mutexes by making
+them more fine grained.
+
+The thread table contains information on the current status of each
+thread existing in the system, and also the event semaphores used in
+suspending the master thread and utility threads when they have nothing
+to do. The thread table can be seen as an analogue to the process table
+in a traditional Unix implementation. */
+
+/** The server system struct */
+struct srv_sys_t{
+ ib_mutex_t tasks_mutex; /*!< variable protecting the
+ tasks queue */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ tasks; /*!< task queue */
+
+ srv_stats_t::ulint_ctr_1_t
+ activity_count; /*!< For tracking server
+ activity */
+};
+
+static srv_sys_t srv_sys;
+
+/*
+ Structure shared by timer and coordinator_callback.
+  No protection is necessary, since the timer and the task never run
+  in parallel (being in the same task group of size 1).
+*/
+struct purge_coordinator_state
+{
+ /** Snapshot of the last history length before the purge call.*/
+ uint32 m_history_length;
+ Atomic_counter<int> m_running;
+ purge_coordinator_state() : m_history_length(), m_running(0) {}
+};
+
+static purge_coordinator_state purge_state;
+
+/** threadpool timer for srv_monitor_task() */
+std::unique_ptr<tpool::timer> srv_monitor_timer;
+
+
+/** The buffer pool dump/load file name */
+char* srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+char srv_buffer_pool_dump_at_shutdown = TRUE;
+char srv_buffer_pool_load_at_startup = TRUE;
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+/** Performance schema stage event for monitoring ALTER TABLE progress:
+everything after the call to log_make_checkpoint(). */
+PSI_stage_info srv_stage_alter_table_end
+ = {0, "alter table (end)", PSI_FLAG_STAGE_PROGRESS};
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_merge_insert_index_tuples(). */
+PSI_stage_info srv_stage_alter_table_insert
+ = {0, "alter table (insert)", PSI_FLAG_STAGE_PROGRESS};
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_log_apply(). */
+PSI_stage_info srv_stage_alter_table_log_index
+ = {0, "alter table (log apply index)", PSI_FLAG_STAGE_PROGRESS};
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_log_table_apply(). */
+PSI_stage_info srv_stage_alter_table_log_table
+ = {0, "alter table (log apply table)", PSI_FLAG_STAGE_PROGRESS};
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_merge_sort(). */
+PSI_stage_info srv_stage_alter_table_merge_sort
+ = {0, "alter table (merge sort)", PSI_FLAG_STAGE_PROGRESS};
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_merge_read_clustered_index(). */
+PSI_stage_info srv_stage_alter_table_read_pk_internal_sort
+ = {0, "alter table (read PK and internal sort)", PSI_FLAG_STAGE_PROGRESS};
+
+/** Performance schema stage event for monitoring buffer pool load progress. */
+PSI_stage_info srv_stage_buffer_pool_load
+ = {0, "buffer pool load", PSI_FLAG_STAGE_PROGRESS};
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+/*********************************************************************//**
+Prints counters for work done by srv_master_thread. */
+static
+void
+srv_print_master_thread_info(
+/*=========================*/
+	FILE *file)	/*!< in: output stream */
+{
+ fprintf(file, "srv_master_thread loops: " ULINTPF " srv_active, "
+ ULINTPF " srv_shutdown, " ULINTPF " srv_idle\n"
+ "srv_master_thread log flush and writes: " ULINTPF "\n",
+ srv_main_active_loops,
+ srv_main_shutdown_loops,
+ srv_main_idle_loops,
+ srv_log_writes_and_flush);
+}
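+
+/* With the format string above, the BACKGROUND THREAD section of the
+monitor output might read as follows (the counter values here are
+illustrative only):
+
+	srv_master_thread loops: 152 srv_active, 0 srv_shutdown, 41 srv_idle
+	srv_master_thread log flush and writes: 193
+*/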
+
+static void thread_pool_thread_init()
+{
+ my_thread_init();
+ pfs_register_thread(thread_pool_thread_key);
+}
+static void thread_pool_thread_end()
+{
+ pfs_delete_thread();
+ my_thread_end();
+}
+
+
+#ifndef DBUG_OFF
+static void dbug_after_task_callback()
+{
+ ut_ad(!sync_check_iterate(sync_check()));
+}
+#endif
+
+void srv_thread_pool_init()
+{
+ DBUG_ASSERT(!srv_thread_pool);
+
+#if defined (_WIN32)
+ srv_thread_pool= tpool::create_thread_pool_win();
+#else
+ srv_thread_pool= tpool::create_thread_pool_generic();
+#endif
+ srv_thread_pool->set_thread_callbacks(thread_pool_thread_init,
+ thread_pool_thread_end);
+#ifndef DBUG_OFF
+ tpool::set_after_task_callback(dbug_after_task_callback);
+#endif
+}
+
+
+void srv_thread_pool_end()
+{
+ ut_ad(!srv_master_timer);
+ delete srv_thread_pool;
+ srv_thread_pool= nullptr;
+}
+
+static bool need_srv_free;
+
+/** Initialize the server. */
+static void srv_init()
+{
+ mutex_create(LATCH_ID_SRV_INNODB_MONITOR, &srv_innodb_monitor_mutex);
+
+ if (!srv_read_only_mode) {
+ mutex_create(LATCH_ID_SRV_SYS_TASKS, &srv_sys.tasks_mutex);
+
+ UT_LIST_INIT(srv_sys.tasks, &que_thr_t::queue);
+ }
+
+ need_srv_free = true;
+ ut_d(srv_master_thread_disabled_event = os_event_create(0));
+
+ /* page_zip_stat_per_index_mutex is acquired from:
+ 1. page_zip_compress() (after SYNC_FSP)
+ 2. page_zip_decompress()
+ 3. i_s_cmp_per_index_fill_low() (where SYNC_DICT is acquired)
+ 4. innodb_cmp_per_index_update(), no other latches
+ since we do not acquire any other latches while holding this mutex,
+ it can have very low level. We pick SYNC_ANY_LATCH for it. */
+ mutex_create(LATCH_ID_PAGE_ZIP_STAT_PER_INDEX,
+ &page_zip_stat_per_index_mutex);
+
+#ifdef WITH_INNODB_DISALLOW_WRITES
+ /* Writes have to be enabled on init or else we hang. Thus, we
+ always set the event here regardless of innobase_disallow_writes.
+ That flag will always be 0 at this point because it isn't settable
+ via my.cnf or command line arg. */
+ srv_allow_writes_event = os_event_create(0);
+ os_event_set(srv_allow_writes_event);
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+
+ /* Initialize some INFORMATION SCHEMA internal structures */
+ trx_i_s_cache_init(trx_i_s_cache);
+
+}
+
+/*********************************************************************//**
+Frees the data structures created in srv_init(). */
+void
+srv_free(void)
+/*==========*/
+{
+ if (!need_srv_free) {
+ return;
+ }
+
+ mutex_free(&srv_innodb_monitor_mutex);
+ mutex_free(&page_zip_stat_per_index_mutex);
+
+ if (!srv_read_only_mode) {
+ mutex_free(&srv_sys.tasks_mutex);
+ }
+
+ ut_d(os_event_destroy(srv_master_thread_disabled_event));
+
+ trx_i_s_cache_free(trx_i_s_cache);
+ srv_thread_pool_end();
+}
+
+/*********************************************************************//**
+Boots the InnoDB server. */
+void
+srv_boot(void)
+/*==========*/
+{
+ srv_thread_pool_init();
+ sync_check_init();
+ trx_pool_init();
+ row_mysql_init();
+ srv_init();
+}
+
+/******************************************************************//**
+Refreshes the values used to calculate per-second averages. */
+static void srv_refresh_innodb_monitor_stats(time_t current_time)
+{
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ if (difftime(current_time, srv_last_monitor_time) < 60) {
+		/* We refresh the InnoDB Monitor values so that averages
+		are printed from at most the last 60 seconds */
+ mutex_exit(&srv_innodb_monitor_mutex);
+ return;
+ }
+
+ srv_last_monitor_time = current_time;
+
+ os_aio_refresh_stats();
+
+#ifdef BTR_CUR_HASH_ADAPT
+ btr_cur_n_sea_old = btr_cur_n_sea;
+#endif /* BTR_CUR_HASH_ADAPT */
+ btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+ log_refresh_stats();
+
+ buf_refresh_io_stats();
+
+ srv_n_rows_inserted_old = srv_stats.n_rows_inserted;
+ srv_n_rows_updated_old = srv_stats.n_rows_updated;
+ srv_n_rows_deleted_old = srv_stats.n_rows_deleted;
+ srv_n_rows_read_old = srv_stats.n_rows_read;
+
+ srv_n_system_rows_inserted_old = srv_stats.n_system_rows_inserted;
+ srv_n_system_rows_updated_old = srv_stats.n_system_rows_updated;
+ srv_n_system_rows_deleted_old = srv_stats.n_system_rows_deleted;
+ srv_n_system_rows_read_old = srv_stats.n_system_rows_read;
+
+ mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/******************************************************************//**
+Writes the output of the InnoDB Monitor to a file.
+@return FALSE if not all information was printed
+due to a failure to obtain a necessary mutex */
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+ FILE* file, /*!< in: output stream */
+	ibool	nowait,		/*!< in: whether to refrain from
+				waiting for the lock system mutex */
+ ulint* trx_start_pos, /*!< out: file position of the start of
+ the list of active transactions */
+ ulint* trx_end) /*!< out: file position of the end of
+ the list of active transactions */
+{
+ double time_elapsed;
+ time_t current_time;
+ ibool ret;
+
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ current_time = time(NULL);
+
+ /* We add 0.001 seconds to time_elapsed to prevent division
+ by zero if two users happen to call SHOW ENGINE INNODB STATUS at the
+ same time */
+
+ time_elapsed = difftime(current_time, srv_last_monitor_time)
+ + 0.001;
+
+ srv_last_monitor_time = time(NULL);
+
+ fputs("\n=====================================\n", file);
+
+ ut_print_timestamp(file);
+ fprintf(file,
+ " INNODB MONITOR OUTPUT\n"
+ "=====================================\n"
+ "Per second averages calculated from the last %lu seconds\n",
+ (ulong) time_elapsed);
+
+ fputs("-----------------\n"
+ "BACKGROUND THREAD\n"
+ "-----------------\n", file);
+ srv_print_master_thread_info(file);
+
+ fputs("----------\n"
+ "SEMAPHORES\n"
+ "----------\n", file);
+
+ sync_print(file);
+
+	/* Conceptually, srv_innodb_monitor_mutex has a very high latching
+	order level in sync0sync.h, while dict_foreign_err_mutex has a very
+	low level, 135. Therefore we can reserve the latter mutex here
+	without danger of a thread deadlock. */
+
+ mutex_enter(&dict_foreign_err_mutex);
+
+ if (!srv_read_only_mode && ftell(dict_foreign_err_file) != 0L) {
+ fputs("------------------------\n"
+ "LATEST FOREIGN KEY ERROR\n"
+ "------------------------\n", file);
+ ut_copy_file(file, dict_foreign_err_file);
+ }
+
+ mutex_exit(&dict_foreign_err_mutex);
+
+	/* Call lock_print_info_all_transactions() to print all the
+	lock information only if lock_print_info_summary() proceeds
+	correctly. IMPORTANT NOTE: lock_print_info_summary() acquires
+	the lock mutex on success. */
+ ret = lock_print_info_summary(file, nowait);
+
+ if (ret) {
+ if (trx_start_pos) {
+ long t = ftell(file);
+ if (t < 0) {
+ *trx_start_pos = ULINT_UNDEFINED;
+ } else {
+ *trx_start_pos = (ulint) t;
+ }
+ }
+
+ /* NOTE: If we get here then we have the lock mutex. This
+ function will release the lock mutex that we acquired when
+ we called the lock_print_info_summary() function earlier. */
+
+ lock_print_info_all_transactions(file);
+
+ if (trx_end) {
+ long t = ftell(file);
+ if (t < 0) {
+ *trx_end = ULINT_UNDEFINED;
+ } else {
+ *trx_end = (ulint) t;
+ }
+ }
+ }
+
+ fputs("--------\n"
+ "FILE I/O\n"
+ "--------\n", file);
+ os_aio_print(file);
+
+ fputs("-------------------------------------\n"
+ "INSERT BUFFER AND ADAPTIVE HASH INDEX\n"
+ "-------------------------------------\n", file);
+ ibuf_print(file);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ for (ulint i = 0; i < btr_ahi_parts && btr_search_enabled; ++i) {
+ const auto part= &btr_search_sys.parts[i];
+ rw_lock_s_lock(&part->latch);
+ ut_ad(part->heap->type == MEM_HEAP_FOR_BTR_SEARCH);
+ fprintf(file, "Hash table size " ULINTPF
+ ", node heap has " ULINTPF " buffer(s)\n",
+ part->table.n_cells,
+ part->heap->base.count - !part->heap->free_block);
+ rw_lock_s_unlock(&part->latch);
+ }
+
+ fprintf(file,
+ "%.2f hash searches/s, %.2f non-hash searches/s\n",
+ static_cast<double>(btr_cur_n_sea - btr_cur_n_sea_old)
+ / time_elapsed,
+ static_cast<double>(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
+ / time_elapsed);
+ btr_cur_n_sea_old = btr_cur_n_sea;
+#else /* BTR_CUR_HASH_ADAPT */
+ fprintf(file,
+ "%.2f non-hash searches/s\n",
+ static_cast<double>(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
+ / time_elapsed);
+#endif /* BTR_CUR_HASH_ADAPT */
+ btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+ fputs("---\n"
+ "LOG\n"
+ "---\n", file);
+ log_print(file);
+
+ fputs("----------------------\n"
+ "BUFFER POOL AND MEMORY\n"
+ "----------------------\n", file);
+ fprintf(file,
+ "Total large memory allocated " ULINTPF "\n"
+ "Dictionary memory allocated " ULINTPF "\n",
+ ulint{os_total_large_mem_allocated},
+ dict_sys.rough_size());
+
+ buf_print_io(file);
+
+ fputs("--------------\n"
+ "ROW OPERATIONS\n"
+ "--------------\n", file);
+ fprintf(file, ULINTPF " read views open inside InnoDB\n",
+ trx_sys.view_count());
+
+ if (ulint n_reserved = fil_system.sys_space->n_reserved_extents) {
+ fprintf(file,
+ ULINTPF " tablespace extents now reserved for"
+ " B-tree split operations\n",
+ n_reserved);
+ }
+
+ fprintf(file,
+ "Process ID=" ULINTPF
+ ", Main thread ID=" ULINTPF
+ ", state: %s\n",
+ srv_main_thread_process_no,
+ srv_main_thread_id,
+ srv_main_thread_op_info);
+ fprintf(file,
+ "Number of rows inserted " ULINTPF
+ ", updated " ULINTPF
+ ", deleted " ULINTPF
+ ", read " ULINTPF "\n",
+ (ulint) srv_stats.n_rows_inserted,
+ (ulint) srv_stats.n_rows_updated,
+ (ulint) srv_stats.n_rows_deleted,
+ (ulint) srv_stats.n_rows_read);
+ fprintf(file,
+ "%.2f inserts/s, %.2f updates/s,"
+ " %.2f deletes/s, %.2f reads/s\n",
+ static_cast<double>(srv_stats.n_rows_inserted
+ - srv_n_rows_inserted_old)
+ / time_elapsed,
+ static_cast<double>(srv_stats.n_rows_updated
+ - srv_n_rows_updated_old)
+ / time_elapsed,
+ static_cast<double>(srv_stats.n_rows_deleted
+ - srv_n_rows_deleted_old)
+ / time_elapsed,
+ static_cast<double>(srv_stats.n_rows_read
+ - srv_n_rows_read_old)
+ / time_elapsed);
+ fprintf(file,
+ "Number of system rows inserted " ULINTPF
+ ", updated " ULINTPF ", deleted " ULINTPF
+ ", read " ULINTPF "\n",
+ (ulint) srv_stats.n_system_rows_inserted,
+ (ulint) srv_stats.n_system_rows_updated,
+ (ulint) srv_stats.n_system_rows_deleted,
+ (ulint) srv_stats.n_system_rows_read);
+ fprintf(file,
+ "%.2f inserts/s, %.2f updates/s,"
+ " %.2f deletes/s, %.2f reads/s\n",
+ static_cast<double>(srv_stats.n_system_rows_inserted
+ - srv_n_system_rows_inserted_old)
+ / time_elapsed,
+ static_cast<double>(srv_stats.n_system_rows_updated
+ - srv_n_system_rows_updated_old)
+ / time_elapsed,
+ static_cast<double>(srv_stats.n_system_rows_deleted
+ - srv_n_system_rows_deleted_old)
+ / time_elapsed,
+ static_cast<double>(srv_stats.n_system_rows_read
+ - srv_n_system_rows_read_old)
+ / time_elapsed);
+ srv_n_rows_inserted_old = srv_stats.n_rows_inserted;
+ srv_n_rows_updated_old = srv_stats.n_rows_updated;
+ srv_n_rows_deleted_old = srv_stats.n_rows_deleted;
+ srv_n_rows_read_old = srv_stats.n_rows_read;
+ srv_n_system_rows_inserted_old = srv_stats.n_system_rows_inserted;
+ srv_n_system_rows_updated_old = srv_stats.n_system_rows_updated;
+ srv_n_system_rows_deleted_old = srv_stats.n_system_rows_deleted;
+ srv_n_system_rows_read_old = srv_stats.n_system_rows_read;
+
+ fputs("----------------------------\n"
+ "END OF INNODB MONITOR OUTPUT\n"
+ "============================\n", file);
+ mutex_exit(&srv_innodb_monitor_mutex);
+ fflush(file);
+
+ return(ret);
+}
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
+void
+srv_export_innodb_status(void)
+/*==========================*/
+{
+ fil_crypt_stat_t crypt_stat;
+
+ if (!srv_read_only_mode) {
+ fil_crypt_total_stat(&crypt_stat);
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ ulint mem_adaptive_hash = 0;
+ for (ulong i = 0; i < btr_ahi_parts; i++) {
+ const auto part= &btr_search_sys.parts[i];
+ rw_lock_s_lock(&part->latch);
+ if (part->heap) {
+ ut_ad(part->heap->type == MEM_HEAP_FOR_BTR_SEARCH);
+
+ mem_adaptive_hash += mem_heap_get_size(part->heap)
+ + part->table.n_cells * sizeof(hash_cell_t);
+ }
+ rw_lock_s_unlock(&part->latch);
+ }
+ export_vars.innodb_mem_adaptive_hash = mem_adaptive_hash;
+#endif
+
+ export_vars.innodb_mem_dictionary = dict_sys.rough_size();
+
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ export_vars.innodb_data_pending_reads =
+ ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
+
+ export_vars.innodb_data_pending_writes =
+ ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
+
+ export_vars.innodb_data_pending_fsyncs =
+ log_sys.get_pending_flushes()
+ + fil_n_pending_tablespace_flushes;
+
+ export_vars.innodb_data_fsyncs = os_n_fsyncs;
+
+ export_vars.innodb_data_read = srv_stats.data_read;
+
+ export_vars.innodb_data_reads = os_n_file_reads;
+
+ export_vars.innodb_data_writes = os_n_file_writes;
+
+ ulint dblwr = 0;
+
+ if (buf_dblwr.is_initialised()) {
+ buf_dblwr.lock();
+ dblwr = buf_dblwr.submitted();
+ export_vars.innodb_dblwr_pages_written = buf_dblwr.written();
+ export_vars.innodb_dblwr_writes = buf_dblwr.batches();
+ buf_dblwr.unlock();
+ }
+
+ export_vars.innodb_data_written = srv_stats.data_written + dblwr;
+
+ export_vars.innodb_buffer_pool_read_requests
+ = buf_pool.stat.n_page_gets;
+
+ export_vars.innodb_buffer_pool_write_requests =
+ srv_stats.buf_pool_write_requests;
+
+ export_vars.innodb_buffer_pool_reads = srv_stats.buf_pool_reads;
+
+ export_vars.innodb_buffer_pool_read_ahead_rnd =
+ buf_pool.stat.n_ra_pages_read_rnd;
+
+ export_vars.innodb_buffer_pool_read_ahead =
+ buf_pool.stat.n_ra_pages_read;
+
+ export_vars.innodb_buffer_pool_read_ahead_evicted =
+ buf_pool.stat.n_ra_pages_evicted;
+
+ export_vars.innodb_buffer_pool_pages_data =
+ UT_LIST_GET_LEN(buf_pool.LRU);
+
+ export_vars.innodb_buffer_pool_bytes_data =
+ buf_pool.stat.LRU_bytes
+ + (UT_LIST_GET_LEN(buf_pool.unzip_LRU)
+ << srv_page_size_shift);
+
+ export_vars.innodb_buffer_pool_pages_dirty =
+ UT_LIST_GET_LEN(buf_pool.flush_list);
+
+ export_vars.innodb_buffer_pool_pages_made_young
+ = buf_pool.stat.n_pages_made_young;
+ export_vars.innodb_buffer_pool_pages_made_not_young
+ = buf_pool.stat.n_pages_not_made_young;
+
+ export_vars.innodb_buffer_pool_pages_old = buf_pool.LRU_old_len;
+
+ export_vars.innodb_buffer_pool_bytes_dirty =
+ buf_pool.stat.flush_list_bytes;
+
+ export_vars.innodb_buffer_pool_pages_free =
+ UT_LIST_GET_LEN(buf_pool.free);
+
+#ifdef UNIV_DEBUG
+ export_vars.innodb_buffer_pool_pages_latched =
+ buf_get_latched_pages_number();
+#endif /* UNIV_DEBUG */
+ export_vars.innodb_buffer_pool_pages_total = buf_pool.get_n_pages();
+
+ export_vars.innodb_buffer_pool_pages_misc =
+ buf_pool.get_n_pages()
+ - UT_LIST_GET_LEN(buf_pool.LRU)
+ - UT_LIST_GET_LEN(buf_pool.free);
+
+ export_vars.innodb_max_trx_id = trx_sys.get_max_trx_id();
+ export_vars.innodb_history_list_length = trx_sys.rseg_history_len;
+
+ export_vars.innodb_log_waits = srv_stats.log_waits;
+
+ export_vars.innodb_os_log_written = srv_stats.os_log_written;
+
+ export_vars.innodb_os_log_fsyncs = log_sys.get_flushes();
+
+ export_vars.innodb_os_log_pending_fsyncs
+ = log_sys.get_pending_flushes();
+
+ export_vars.innodb_os_log_pending_writes =
+ srv_stats.os_log_pending_writes;
+
+ export_vars.innodb_log_write_requests = srv_stats.log_write_requests;
+
+ export_vars.innodb_log_writes = srv_stats.log_writes;
+
+ export_vars.innodb_row_lock_waits = srv_stats.n_lock_wait_count;
+
+ export_vars.innodb_row_lock_current_waits =
+ srv_stats.n_lock_wait_current_count;
+
+ export_vars.innodb_row_lock_time = srv_stats.n_lock_wait_time / 1000;
+
+ if (srv_stats.n_lock_wait_count > 0) {
+
+ export_vars.innodb_row_lock_time_avg = (ulint)
+ (srv_stats.n_lock_wait_time
+ / 1000 / srv_stats.n_lock_wait_count);
+
+ } else {
+ export_vars.innodb_row_lock_time_avg = 0;
+ }
+
+ export_vars.innodb_row_lock_time_max =
+ lock_sys.n_lock_max_wait_time / 1000;
+
+ export_vars.innodb_rows_read = srv_stats.n_rows_read;
+
+ export_vars.innodb_rows_inserted = srv_stats.n_rows_inserted;
+
+ export_vars.innodb_rows_updated = srv_stats.n_rows_updated;
+
+ export_vars.innodb_rows_deleted = srv_stats.n_rows_deleted;
+
+ export_vars.innodb_system_rows_read = srv_stats.n_system_rows_read;
+
+ export_vars.innodb_system_rows_inserted =
+ srv_stats.n_system_rows_inserted;
+
+ export_vars.innodb_system_rows_updated =
+ srv_stats.n_system_rows_updated;
+
+ export_vars.innodb_system_rows_deleted =
+ srv_stats.n_system_rows_deleted;
+
+ export_vars.innodb_truncated_status_writes =
+ srv_truncated_status_writes;
+
+ export_vars.innodb_page_compression_saved = srv_stats.page_compression_saved;
+ export_vars.innodb_index_pages_written = srv_stats.index_pages_written;
+ export_vars.innodb_non_index_pages_written = srv_stats.non_index_pages_written;
+ export_vars.innodb_pages_page_compressed = srv_stats.pages_page_compressed;
+ export_vars.innodb_page_compressed_trim_op = srv_stats.page_compressed_trim_op;
+ export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed;
+ export_vars.innodb_pages_page_compression_error = srv_stats.pages_page_compression_error;
+ export_vars.innodb_pages_decrypted = srv_stats.pages_decrypted;
+ export_vars.innodb_pages_encrypted = srv_stats.pages_encrypted;
+ export_vars.innodb_n_merge_blocks_encrypted = srv_stats.n_merge_blocks_encrypted;
+ export_vars.innodb_n_merge_blocks_decrypted = srv_stats.n_merge_blocks_decrypted;
+ export_vars.innodb_n_rowlog_blocks_encrypted = srv_stats.n_rowlog_blocks_encrypted;
+ export_vars.innodb_n_rowlog_blocks_decrypted = srv_stats.n_rowlog_blocks_decrypted;
+
+ export_vars.innodb_n_temp_blocks_encrypted =
+ srv_stats.n_temp_blocks_encrypted;
+
+ export_vars.innodb_n_temp_blocks_decrypted =
+ srv_stats.n_temp_blocks_decrypted;
+
+ export_vars.innodb_defragment_compression_failures =
+ btr_defragment_compression_failures;
+ export_vars.innodb_defragment_failures = btr_defragment_failures;
+ export_vars.innodb_defragment_count = btr_defragment_count;
+
+ export_vars.innodb_onlineddl_rowlog_rows = onlineddl_rowlog_rows;
+ export_vars.innodb_onlineddl_rowlog_pct_used = onlineddl_rowlog_pct_used;
+ export_vars.innodb_onlineddl_pct_progress = onlineddl_pct_progress;
+
+ export_vars.innodb_sec_rec_cluster_reads =
+ srv_stats.n_sec_rec_cluster_reads;
+ export_vars.innodb_sec_rec_cluster_reads_avoided =
+ srv_stats.n_sec_rec_cluster_reads_avoided;
+
+ if (!srv_read_only_mode) {
+ export_vars.innodb_encryption_rotation_pages_read_from_cache =
+ crypt_stat.pages_read_from_cache;
+ export_vars.innodb_encryption_rotation_pages_read_from_disk =
+ crypt_stat.pages_read_from_disk;
+ export_vars.innodb_encryption_rotation_pages_modified =
+ crypt_stat.pages_modified;
+ export_vars.innodb_encryption_rotation_pages_flushed =
+ crypt_stat.pages_flushed;
+ export_vars.innodb_encryption_rotation_estimated_iops =
+ crypt_stat.estimated_iops;
+ export_vars.innodb_encryption_key_requests =
+ srv_stats.n_key_requests;
+ export_vars.innodb_key_rotation_list_length =
+ srv_stats.key_rotation_list_length;
+ }
+
+ mutex_exit(&srv_innodb_monitor_mutex);
+
+ mysql_mutex_lock(&log_sys.mutex);
+ export_vars.innodb_lsn_current = log_sys.get_lsn();
+ export_vars.innodb_lsn_flushed = log_sys.get_flushed_lsn();
+ export_vars.innodb_lsn_last_checkpoint = log_sys.last_checkpoint_lsn;
+ export_vars.innodb_checkpoint_max_age = static_cast<ulint>(
+ log_sys.max_checkpoint_age);
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ export_vars.innodb_checkpoint_age = static_cast<ulint>(
+ export_vars.innodb_lsn_current
+ - export_vars.innodb_lsn_last_checkpoint);
+}
+
+struct srv_monitor_state_t
+{
+ time_t last_monitor_time;
+ ulint mutex_skipped;
+ bool last_srv_print_monitor;
+ srv_monitor_state_t() : mutex_skipped(0), last_srv_print_monitor(false)
+ {
+ srv_last_monitor_time = time(NULL);
+ last_monitor_time= srv_last_monitor_time;
+ }
+};
+
+static srv_monitor_state_t monitor_state;
+
+/** A task which prints the info output by various InnoDB monitors.*/
+static void srv_monitor()
+{
+ time_t current_time = time(NULL);
+
+ if (difftime(current_time, monitor_state.last_monitor_time) >= 15) {
+ monitor_state.last_monitor_time = current_time;
+
+ if (srv_print_innodb_monitor) {
+			/* Reset the mutex_skipped counter every time
+			srv_print_innodb_monitor changes, so that this
+			short-duration information printing will not be
+			blocked by lock_sys.mutex */
+ if (!monitor_state.last_srv_print_monitor) {
+ monitor_state.mutex_skipped = 0;
+ monitor_state.last_srv_print_monitor = true;
+ }
+
+ if (!srv_printf_innodb_monitor(stderr,
+ MUTEX_NOWAIT(monitor_state.mutex_skipped),
+ NULL, NULL)) {
+ monitor_state.mutex_skipped++;
+ } else {
+ /* Reset the counter */
+ monitor_state.mutex_skipped = 0;
+ }
+ } else {
+ monitor_state.last_monitor_time = 0;
+ }
+
+
+	/* We don't create the temp files or associated
+	mutexes in read-only mode */
+
+ if (!srv_read_only_mode && srv_innodb_status) {
+ mutex_enter(&srv_monitor_file_mutex);
+ rewind(srv_monitor_file);
+ if (!srv_printf_innodb_monitor(srv_monitor_file,
+ MUTEX_NOWAIT(monitor_state.mutex_skipped),
+ NULL, NULL)) {
+ monitor_state.mutex_skipped++;
+ } else {
+ monitor_state.mutex_skipped = 0;
+ }
+
+ os_file_set_eof(srv_monitor_file);
+ mutex_exit(&srv_monitor_file_mutex);
+ }
+ }
+
+ srv_refresh_innodb_monitor_stats(current_time);
+}
+
+/*********************************************************************//**
+A task which prints warnings about semaphore waits which have lasted
+too long. These can be used to track bugs which cause hangs.
+*/
+void srv_monitor_task(void*)
+{
+ /* number of successive fatal timeouts observed */
+ static ulint fatal_cnt;
+ static lsn_t old_lsn = recv_sys.recovered_lsn;
+ /* longest waiting thread for a semaphore */
+ os_thread_id_t waiter;
+ static os_thread_id_t old_waiter = os_thread_get_curr_id();
+ /* the semaphore that is being waited for */
+ const void* sema = NULL;
+ static const void* old_sema = NULL;
+
+ ut_ad(!srv_read_only_mode);
+
+ /* Try to track a strange bug reported by Harald Fuchs and others,
+ where the lsn seems to decrease at times */
+
+ lsn_t new_lsn = log_sys.get_lsn();
+ ut_a(new_lsn >= old_lsn);
+ old_lsn = new_lsn;
+
+ /* Update the statistics collected for deciding LRU
+ eviction policy. */
+ buf_LRU_stat_update();
+
+ if (sync_array_print_long_waits(&waiter, &sema)
+ && sema == old_sema && os_thread_eq(waiter, old_waiter)) {
+#if defined(WITH_WSREP) && defined(WITH_INNODB_DISALLOW_WRITES)
+ if (!os_event_is_set(srv_allow_writes_event)) {
+ fprintf(stderr,
+ "WSREP: avoiding InnoDB self crash due to "
+ "long semaphore wait of > %lu seconds\n"
+ "Server is processing SST donor operation, "
+ "fatal_cnt now: " ULINTPF,
+ srv_fatal_semaphore_wait_threshold, fatal_cnt);
+ return;
+ }
+#endif /* WITH_WSREP */
+ if (fatal_cnt++) {
+ ib::fatal() << "Semaphore wait has lasted > "
+ << srv_fatal_semaphore_wait_threshold
+ << " seconds. We intentionally crash the"
+ " server because it appears to be hung.";
+ }
+ } else {
+ fatal_cnt = 0;
+ old_waiter = waiter;
+ old_sema = sema;
+ }
+
+ srv_monitor();
+}
+
+/******************************************************************//**
+Increment the server activity count. */
+void
+srv_inc_activity_count(void)
+/*========================*/
+{
+ srv_sys.activity_count.inc();
+}
+
+#ifdef UNIV_DEBUG
+/** @return whether purge or master task is active */
+bool srv_any_background_activity()
+{
+ if (purge_sys.enabled() || srv_master_timer.get())
+ {
+ ut_ad(!srv_read_only_mode);
+ return true;
+ }
+ return false;
+}
+#endif /* UNIV_DEBUG */
+
+static void purge_worker_callback(void*);
+static void purge_coordinator_callback(void*);
+static void purge_coordinator_timer_callback(void*);
+
+static tpool::task_group purge_task_group;
+tpool::waitable_task purge_worker_task(purge_worker_callback, nullptr,
+ &purge_task_group);
+static tpool::task_group purge_coordinator_task_group(1);
+static tpool::waitable_task purge_coordinator_task
+ (purge_coordinator_callback, nullptr, &purge_coordinator_task_group);
+
+static tpool::timer *purge_coordinator_timer;
+
+/** Wake up the purge threads if there is work to do. */
+void
+srv_wake_purge_thread_if_not_active()
+{
+ ut_ad(!srv_read_only_mode);
+
+ if (purge_sys.enabled() && !purge_sys.paused()
+ && trx_sys.rseg_history_len) {
+ if(++purge_state.m_running == 1) {
+ srv_thread_pool->submit_task(&purge_coordinator_task);
+ }
+ }
+}
+
+/** @return whether the purge tasks are active */
+bool purge_sys_t::running() const
+{
+ return purge_coordinator_task.is_running();
+}
+
+/** Stop purge during FLUSH TABLES FOR EXPORT */
+void purge_sys_t::stop()
+{
+ rw_lock_x_lock(&latch);
+
+ if (!enabled())
+ {
+ /* Shutdown must have been initiated during FLUSH TABLES FOR EXPORT. */
+ ut_ad(!srv_undo_sources);
+ rw_lock_x_unlock(&latch);
+ return;
+ }
+
+ ut_ad(srv_n_purge_threads > 0);
+
+ const auto paused= m_paused++;
+
+ rw_lock_x_unlock(&latch);
+
+ if (!paused)
+ {
+ ib::info() << "Stopping purge";
+ MONITOR_ATOMIC_INC(MONITOR_PURGE_STOP_COUNT);
+ purge_coordinator_task.disable();
+ }
+}
+
+/** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */
+void purge_sys_t::resume()
+{
+ if (!enabled())
+ {
+ /* Shutdown must have been initiated during FLUSH TABLES FOR EXPORT. */
+ ut_ad(!srv_undo_sources);
+ return;
+ }
+ ut_ad(!srv_read_only_mode);
+ ut_ad(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+ ut_ad(!sync_check_iterate(sync_check()));
+ purge_coordinator_task.enable();
+ rw_lock_x_lock(&latch);
+ int32_t paused= m_paused--;
+ ut_a(paused);
+
+ if (paused == 1)
+ {
+ ib::info() << "Resuming purge";
+ purge_state.m_running = 0;
+ srv_wake_purge_thread_if_not_active();
+ MONITOR_ATOMIC_INC(MONITOR_PURGE_RESUME_COUNT);
+ }
+ rw_lock_x_unlock(&latch);
+}
+
+/*******************************************************************//**
+Get current server activity count.
+@return activity count. */
+ulint
+srv_get_activity_count(void)
+/*========================*/
+{
+ return(srv_sys.activity_count);
+}
+
+/** Check if srv_inc_activity_count() has been called.
+@param activity_count copy of srv_sys.activity_count
+@return whether the activity_count had changed */
+static bool srv_check_activity(ulint *activity_count)
+{
+ ulint new_activity_count= srv_sys.activity_count;
+ if (new_activity_count != *activity_count)
+ {
+ *activity_count= new_activity_count;
+ return true;
+ }
+
+ return false;
+}
+
+/********************************************************************//**
+The master thread is tasked with ensuring that a flush of the log file
+happens once every second in the background. This is to ensure that not
+more than one second of transactions is lost in case of a crash when
+innodb_flush_log_at_trx_commit != 1 */
+static
+void
+srv_sync_log_buffer_in_background(void)
+/*===================================*/
+{
+ time_t current_time = time(NULL);
+
+ srv_main_thread_op_info = "flushing log";
+ if (difftime(current_time, srv_last_log_flush_time)
+ >= srv_flush_log_at_timeout) {
+ log_buffer_flush_to_disk();
+ srv_last_log_flush_time = current_time;
+ srv_log_writes_and_flush++;
+ }
+}
+
+/********************************************************************//**
+Make room in the table cache by evicting unused tables.
+@return number of tables evicted. */
+static
+ulint
+srv_master_evict_from_table_cache(
+/*==============================*/
+ ulint pct_check) /*!< in: max percent to check */
+{
+ ulint n_tables_evicted = 0;
+
+ dict_sys_lock();
+
+ n_tables_evicted = dict_make_room_in_cache(
+ innobase_get_table_cache_size(), pct_check);
+
+ dict_sys_unlock();
+
+ return(n_tables_evicted);
+}
+
+/*********************************************************************//**
+This function prints a progress message every 60 seconds during server
+shutdown, for any activities that the master thread is waiting on.
+static
+void
+srv_shutdown_print_master_pending(
+/*==============================*/
+ time_t* last_print_time, /*!< last time the function
+ print the message */
+ ulint n_tables_to_drop, /*!< number of tables to
+ be dropped */
+	ulint		n_bytes_merged)	/*!< number of change buffer
+					bytes just merged */
+{
+ time_t current_time = time(NULL);
+
+ if (difftime(current_time, *last_print_time) > 60) {
+ *last_print_time = current_time;
+
+ if (n_tables_to_drop) {
+ ib::info() << "Waiting for " << n_tables_to_drop
+ << " table(s) to be dropped";
+ }
+
+		/* Check the change buffer merge; we only wait for the
+		change buffer merge to complete during a slow shutdown */
+ if (!srv_fast_shutdown && n_bytes_merged) {
+ ib::info() << "Waiting for change buffer merge to"
+ " complete number of bytes of change buffer"
+ " just merged: " << n_bytes_merged;
+ }
+ }
+}
+
+#ifdef UNIV_DEBUG
+/** Waits in a loop as long as the master thread is disabled (debug) */
+static
+void
+srv_master_do_disabled_loop(void)
+{
+ if (!srv_master_thread_disabled_debug) {
+ /* We return here to avoid changing op_info. */
+ return;
+ }
+
+ srv_main_thread_op_info = "disabled";
+
+ while (srv_master_thread_disabled_debug) {
+ os_event_set(srv_master_thread_disabled_event);
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
+ break;
+ }
+ os_thread_sleep(100000);
+ }
+
+ srv_main_thread_op_info = "";
+}
+
+/** Disables master thread. It's used by:
+ SET GLOBAL innodb_master_thread_disabled_debug = 1 (0).
+@param[in] save immediate result from check function */
+void
+srv_master_thread_disabled_debug_update(THD*, st_mysql_sys_var*, void*,
+ const void* save)
+{
+ /* This method is protected by mutex, as every SET GLOBAL .. */
+ ut_ad(srv_master_thread_disabled_event != NULL);
+
+ const bool disable = *static_cast<const my_bool*>(save);
+
+ const int64_t sig_count = os_event_reset(
+ srv_master_thread_disabled_event);
+
+ srv_master_thread_disabled_debug = disable;
+
+ if (disable) {
+ os_event_wait_low(
+ srv_master_thread_disabled_event, sig_count);
+ }
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Perform the tasks that the master thread is supposed to do when the
+server is active. There are two types of tasks. The first category
+consists of tasks which are performed at each invocation of this
+function; we assume that this function is called roughly every second
+when the server is active. The second category consists of tasks which
+are performed at some longer interval, e.g. purge and dict_LRU cleanup. */
+static
+void
+srv_master_do_active_tasks(void)
+/*============================*/
+{
+ time_t cur_time = time(NULL);
+ ulonglong counter_time = microsecond_interval_timer();
+
+	/* First do the tasks that we are supposed to do at each
+	invocation of this function. */
+
+ ++srv_main_active_loops;
+
+ MONITOR_INC(MONITOR_MASTER_ACTIVE_LOOPS);
+
+	/* ALTER TABLE in MySQL requires on Unix that the table handler
+	can drop tables lazily after there are no longer any SELECT
+	queries on them. */
+ srv_main_thread_op_info = "doing background drop tables";
+ row_drop_tables_for_mysql_in_background();
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND, counter_time);
+
+ ut_d(srv_master_do_disabled_loop());
+
+ if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
+ return;
+ }
+
+ /* make sure that there is enough reusable space in the redo
+ log files */
+ srv_main_thread_op_info = "checking free log space";
+ log_free_check();
+
+ /* Flush logs if needed */
+ srv_main_thread_op_info = "flushing log";
+ srv_sync_log_buffer_in_background();
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time);
+
+ /* Now see if various tasks that are performed at defined
+ intervals need to be performed. */
+
+ if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
+ return;
+ }
+
+ if (cur_time % SRV_MASTER_DICT_LRU_INTERVAL == 0) {
+ srv_main_thread_op_info = "enforcing dict cache limit";
+ ulint n_evicted = srv_master_evict_from_table_cache(50);
+ if (n_evicted != 0) {
+ MONITOR_INC_VALUE(
+ MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE, n_evicted);
+ }
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time);
+ }
+}
+
+/*********************************************************************//**
+Perform the tasks that the master thread is supposed to do whenever the
+server is idle. We do check the server state during this function,
+and if the server has entered the shutdown phase we may return from
+the function without completing the required tasks.
+Note that the server can move to the active state while we are executing
+this function, but we don't check for that, as we are supposed to
+perform more or less the same tasks when the server is active. */
+static
+void
+srv_master_do_idle_tasks(void)
+/*==========================*/
+{
+ ++srv_main_idle_loops;
+
+ MONITOR_INC(MONITOR_MASTER_IDLE_LOOPS);
+
+
+	/* ALTER TABLE in MySQL requires on Unix that the table handler
+	can drop tables lazily after there are no longer any SELECT
+	queries on them. */
+ ulonglong counter_time = microsecond_interval_timer();
+ srv_main_thread_op_info = "doing background drop tables";
+ row_drop_tables_for_mysql_in_background();
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND,
+ counter_time);
+
+ ut_d(srv_master_do_disabled_loop());
+
+ if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
+ return;
+ }
+
+ /* make sure that there is enough reusable space in the redo
+ log files */
+ srv_main_thread_op_info = "checking free log space";
+ log_free_check();
+
+ if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
+ return;
+ }
+
+ srv_main_thread_op_info = "enforcing dict cache limit";
+ ulint n_evicted = srv_master_evict_from_table_cache(100);
+ if (n_evicted != 0) {
+ MONITOR_INC_VALUE(
+ MONITOR_SRV_DICT_LRU_EVICT_COUNT_IDLE, n_evicted);
+ }
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time);
+
+ /* Flush logs if needed */
+ srv_sync_log_buffer_in_background();
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time);
+}
+
+/**
+Complete the shutdown tasks such as background DROP TABLE,
+and optionally change buffer merge (on innodb_fast_shutdown=0). */
+void srv_shutdown(bool ibuf_merge)
+{
+ ulint n_bytes_merged = 0;
+ ulint n_tables_to_drop;
+ time_t now = time(NULL);
+
+ do {
+ ut_ad(!srv_read_only_mode);
+ ut_ad(srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
+ ++srv_main_shutdown_loops;
+
+ /* FIXME: Remove the background DROP TABLE queue; it is not
+ crash-safe and breaks ACID. */
+ srv_main_thread_op_info = "doing background drop tables";
+ n_tables_to_drop = row_drop_tables_for_mysql_in_background();
+
+ if (ibuf_merge) {
+ srv_main_thread_op_info = "checking free log space";
+ log_free_check();
+ srv_main_thread_op_info = "doing insert buffer merge";
+ n_bytes_merged = ibuf_merge_all();
+
+ /* Flush logs if needed */
+ srv_sync_log_buffer_in_background();
+ }
+
+ /* Print progress message every 60 seconds during shutdown */
+ if (srv_print_verbose_log) {
+ srv_shutdown_print_master_pending(
+ &now, n_tables_to_drop, n_bytes_merged);
+ }
+ } while (n_bytes_merged || n_tables_to_drop);
+}
+
+/** The periodic master task controlling the server. */
+void srv_master_callback(void*)
+{
+ static ulint old_activity_count;
+
+ ut_a(srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);
+
+ srv_main_thread_op_info = "";
+ MONITOR_INC(MONITOR_MASTER_THREAD_SLEEP);
+ if (srv_check_activity(&old_activity_count)) {
+ srv_master_do_active_tasks();
+ } else {
+ srv_master_do_idle_tasks();
+ }
+ srv_main_thread_op_info = "sleeping";
+}
+
+/** @return whether purge should exit due to shutdown */
+static bool srv_purge_should_exit()
+{
+ ut_ad(srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP);
+
+ if (srv_undo_sources)
+ return false;
+
+ if (srv_fast_shutdown)
+ return true;
+
+ /* Slow shutdown was requested. */
+ if (const uint32_t history_size= trx_sys.rseg_history_len)
+ {
+ static time_t progress_time;
+ time_t now= time(NULL);
+ if (now - progress_time >= 15)
+ {
+ progress_time= now;
+#if defined HAVE_SYSTEMD && !defined EMBEDDED_LIBRARY
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "InnoDB: to purge %u transactions",
+ history_size);
+ ib::info() << "to purge " << history_size << " transactions";
+#endif
+ }
+ return false;
+ }
+
+ return !trx_sys.any_active_transactions();
+}
+
+/*********************************************************************//**
+Fetch and execute a task from the work queue.
+@return true if a task was executed */
+static bool srv_task_execute()
+{
+ ut_ad(!srv_read_only_mode);
+ ut_ad(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+
+ mutex_enter(&srv_sys.tasks_mutex);
+
+ if (que_thr_t* thr = UT_LIST_GET_FIRST(srv_sys.tasks)) {
+ ut_a(que_node_get_type(thr->child) == QUE_NODE_PURGE);
+ UT_LIST_REMOVE(srv_sys.tasks, thr);
+ mutex_exit(&srv_sys.tasks_mutex);
+ que_run_threads(thr);
+ return true;
+ }
+
+ ut_ad(UT_LIST_GET_LEN(srv_sys.tasks) == 0);
+ mutex_exit(&srv_sys.tasks_mutex);
+ return false;
+}
+
+std::mutex purge_thread_count_mtx;
+void srv_update_purge_thread_count(uint n)
+{
+ std::lock_guard<std::mutex> lk(purge_thread_count_mtx);
+ srv_n_purge_threads = n;
+ srv_purge_thread_count_changed = 1;
+}
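+
+/* A minimal sketch of the intended caller (a hypothetical sys_var
+update hook; the real one would live in ha_innodb.cc): when the user
+runs SET GLOBAL innodb_purge_threads = N, an update function of the
+same shape as srv_master_thread_disabled_debug_update() below could
+forward the new value here, and srv_do_purge() picks it up via
+srv_purge_thread_count_changed:
+
+	static void innodb_purge_threads_update(
+		THD*, st_mysql_sys_var*, void*, const void* save)
+	{
+		srv_update_purge_thread_count(
+			*static_cast<const uint*>(save));
+	}
+*/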
+
+Atomic_counter<int> srv_purge_thread_count_changed;
+
+/** Do the actual purge operation.
+@param[in,out] n_total_purged total number of purged pages
+@return length of history list before the last purge batch. */
+static uint32_t srv_do_purge(ulint* n_total_purged)
+{
+ ulint n_pages_purged;
+
+ static ulint count = 0;
+ static ulint n_use_threads = 0;
+ static uint32_t rseg_history_len = 0;
+ ulint old_activity_count = srv_get_activity_count();
+ static ulint n_threads = srv_n_purge_threads;
+
+ ut_a(n_threads > 0);
+ ut_ad(!srv_read_only_mode);
+
+ /* Purge until there are no more records to purge and there is
+ no change in configuration or server state. If the user has
+ configured more than one purge thread then we treat that as a
+ pool of threads and only use the extra threads if purge can't
+ keep up with updates. */
+
+ if (n_use_threads == 0) {
+ n_use_threads = n_threads;
+ }
+
+ do {
+ if (UNIV_UNLIKELY(srv_purge_thread_count_changed)) {
+ /* Read the fresh value of srv_n_purge_threads, reset
+ the changed flag. Both variables are protected by
+ purge_thread_count_mtx.
+
+ This code does not run concurrently, it is executed
+ by a single purge_coordinator thread, and no races
+ involving srv_purge_thread_count_changed are possible.
+ */
+
+ std::lock_guard<std::mutex> lk(purge_thread_count_mtx);
+ n_threads = n_use_threads = srv_n_purge_threads;
+ srv_purge_thread_count_changed = 0;
+ } else if (trx_sys.rseg_history_len > rseg_history_len
+ || (srv_max_purge_lag > 0
+ && rseg_history_len > srv_max_purge_lag)) {
+
+ /* History length is now longer than what it was
+ when we took the last snapshot. Use more threads. */
+
+ if (n_use_threads < n_threads) {
+ ++n_use_threads;
+ }
+
+ } else if (srv_check_activity(&old_activity_count)
+ && n_use_threads > 1) {
+
+ /* History length same or smaller since last snapshot,
+ use fewer threads. */
+
+ --n_use_threads;
+ }
+
+		/* Ensure that the number of purge threads in use does
+		not exceed the configured count. */
+
+ ut_a(n_use_threads > 0);
+ ut_a(n_use_threads <= n_threads);
+
+ /* Take a snapshot of the history list before purge. */
+ if (!(rseg_history_len = trx_sys.rseg_history_len)) {
+ break;
+ }
+
+ n_pages_purged = trx_purge(
+ n_use_threads,
+ !(++count % srv_purge_rseg_truncate_frequency)
+ || purge_sys.truncate.current);
+
+ *n_total_purged += n_pages_purged;
+ } while (n_pages_purged > 0 && !purge_sys.paused()
+ && !srv_purge_should_exit());
+
+ return(rseg_history_len);
+}
+
+
+static std::list<THD*> purge_thds;
+static std::mutex purge_thd_mutex;
+extern void* thd_attach_thd(THD*);
+extern void thd_detach_thd(void *);
+
+THD* acquire_thd(void **ctx)
+{
+ std::unique_lock<std::mutex> lk(purge_thd_mutex);
+ if (purge_thds.empty()) {
+ THD* thd = current_thd;
+ purge_thds.push_back(innobase_create_background_thd("InnoDB purge worker"));
+ set_current_thd(thd);
+ }
+ THD* thd = purge_thds.front();
+ purge_thds.pop_front();
+ lk.unlock();
+
+	/* Set the current thd, and thd->mysys_var as well;
+	it might be used by something in the server. */
+ *ctx = thd_attach_thd(thd);
+ return thd;
+}
+
+void release_thd(THD *thd, void *ctx)
+{
+ thd_detach_thd(ctx);
+ std::unique_lock<std::mutex> lk(purge_thd_mutex);
+ purge_thds.push_back(thd);
+ lk.unlock();
+ set_current_thd(0);
+}
+
+
+/*
+  Called by the timer when the purge coordinator decides
+ to delay processing of purge records.
+*/
+static void purge_coordinator_timer_callback(void *)
+{
+ if (!purge_sys.enabled() || purge_sys.paused() ||
+ purge_state.m_running || !trx_sys.rseg_history_len)
+ return;
+
+ if (purge_state.m_history_length < 5000 &&
+ purge_state.m_history_length == trx_sys.rseg_history_len)
+    /* No new records were added since the wait started.
+    Simply keep waiting for new records. The magic number 5000 is an
+    approximation for the case where we have cached UNDO
+    log records which prevent truncation of the UNDO segments. */
+ return;
+ srv_wake_purge_thread_if_not_active();
+}
+
+static void purge_worker_callback(void*)
+{
+ ut_ad(!current_thd);
+ ut_ad(!srv_read_only_mode);
+ ut_ad(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+ void *ctx;
+ THD *thd= acquire_thd(&ctx);
+ while (srv_task_execute())
+ ut_ad(purge_sys.running());
+ release_thd(thd,ctx);
+}
+
+static void purge_coordinator_callback_low()
+{
+ ulint n_total_purged= ULINT_UNDEFINED;
+ purge_state.m_history_length= 0;
+
+ if (!purge_sys.enabled() || purge_sys.paused())
+ return;
+ do
+ {
+ n_total_purged = 0;
+ int sigcount= purge_state.m_running;
+
+ purge_state.m_history_length= srv_do_purge(&n_total_purged);
+
+ /* Check if purge was woken by srv_wake_purge_thread_if_not_active() */
+
+ bool woken_during_purge= purge_state.m_running > sigcount;
+
+    /* If the last purge batch did not process any pages and there is
+    still work to do, delay the next batch by 10ms, unless
+    someone added work and woke us up. */
+ if (n_total_purged == 0)
+ {
+ if (trx_sys.rseg_history_len == 0)
+ return;
+ if (!woken_during_purge)
+ {
+ /* Delay next purge round*/
+ purge_coordinator_timer->set_time(10, 0);
+ return;
+ }
+ }
+ }
+ while ((purge_sys.enabled() && !purge_sys.paused()) ||
+ !srv_purge_should_exit());
+}
+
+static void purge_coordinator_callback(void*)
+{
+ void *ctx;
+ THD *thd= acquire_thd(&ctx);
+ purge_coordinator_callback_low();
+ release_thd(thd,ctx);
+ purge_state.m_running= 0;
+}
+
+void srv_init_purge_tasks()
+{
+ purge_coordinator_timer= srv_thread_pool->create_timer
+ (purge_coordinator_timer_callback, nullptr);
+}
+
+static void srv_shutdown_purge_tasks()
+{
+ purge_coordinator_task.wait();
+ delete purge_coordinator_timer;
+ purge_coordinator_timer= nullptr;
+ purge_worker_task.wait();
+ while (!purge_thds.empty())
+ {
+ innobase_destroy_background_thd(purge_thds.front());
+ purge_thds.pop_front();
+ }
+}
+
+/**********************************************************************//**
+Enqueues a task in the server task queue and releases a worker thread
+if there is a suspended one. */
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(!srv_read_only_mode);
+ mutex_enter(&srv_sys.tasks_mutex);
+
+ UT_LIST_ADD_LAST(srv_sys.tasks, thr);
+
+ mutex_exit(&srv_sys.tasks_mutex);
+}
+
+#ifdef UNIV_DEBUG
+/** @return number of tasks in queue */
+ulint srv_get_task_queue_length()
+{
+ ulint n_tasks;
+
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&srv_sys.tasks_mutex);
+
+ n_tasks = UT_LIST_GET_LEN(srv_sys.tasks);
+
+ mutex_exit(&srv_sys.tasks_mutex);
+
+ return(n_tasks);
+}
+#endif
+
+/** Shut down the purge threads. */
+void srv_purge_shutdown()
+{
+ if (purge_sys.enabled()) {
+ srv_update_purge_thread_count(innodb_purge_threads_MAX);
+ while(!srv_purge_should_exit()) {
+ ut_a(!purge_sys.paused());
+ srv_wake_purge_thread_if_not_active();
+ os_thread_sleep(1000);
+ }
+ purge_sys.coordinator_shutdown();
+ srv_shutdown_purge_tasks();
+ }
+}
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
new file mode 100644
index 00000000..aa6e7ce1
--- /dev/null
+++ b/storage/innobase/srv/srv0start.cc
@@ -0,0 +1,2168 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file srv/srv0start.cc
+Starts the InnoDB database server
+
+Created 2/16/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "my_global.h"
+
+#include "mysqld.h"
+#include "mysql/psi/mysql_stage.h"
+#include "mysql/psi/psi.h"
+
+#include "row0ftsort.h"
+#include "ut0mem.h"
+#include "mem0mem.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "buf0buf.h"
+#include "buf0dblwr.h"
+#include "buf0dump.h"
+#include "os0file.h"
+#include "os0thread.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "fsp0fsp.h"
+#include "rem0rec.h"
+#include "mtr0mtr.h"
+#include "log0crypt.h"
+#include "log0recv.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "trx0trx.h"
+#include "trx0sys.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "rem0rec.h"
+#include "ibuf0ibuf.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "btr0defragment.h"
+#include "mysql/service_wsrep.h" /* wsrep_recovery */
+#include "trx0rseg.h"
+#include "buf0flu.h"
+#include "buf0rea.h"
+#include "dict0boot.h"
+#include "dict0load.h"
+#include "dict0stats_bg.h"
+#include "que0que.h"
+#include "lock0lock.h"
+#include "trx0roll.h"
+#include "trx0purge.h"
+#include "lock0lock.h"
+#include "pars0pars.h"
+#include "btr0sea.h"
+#include "rem0cmp.h"
+#include "dict0crea.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "row0mysql.h"
+#include "btr0pcur.h"
+#include "os0event.h"
+#include "zlib.h"
+#include "ut0crc32.h"
+
+/** We are prepared for a situation where we have this many threads
+waiting for a semaphore inside InnoDB. srv_start() sets the value. */
+ulint srv_max_n_threads;
+
+/** Log sequence number at shutdown */
+lsn_t srv_shutdown_lsn;
+
+/** TRUE if a raw partition is in use */
+ibool srv_start_raw_disk_in_use;
+
+/** Number of IO threads to use */
+uint srv_n_file_io_threads;
+
+/** Space id of the first UNDO tablespace */
+ulint srv_undo_space_id_start;
+
+/** TRUE if the server is being started, before rolling back any
+incomplete transactions */
+bool srv_startup_is_before_trx_rollback_phase;
+/** TRUE if the server is being started */
+bool srv_is_being_started;
+/** TRUE if SYS_TABLESPACES is available for lookups */
+bool srv_sys_tablespaces_open;
+/** TRUE if the server was successfully started */
+bool srv_was_started;
+/** The original value of srv_log_file_size (innodb_log_file_size) */
+static ulonglong srv_log_file_size_requested;
+/** whether srv_start() has been called */
+static bool srv_start_has_been_called;
+
+/** Whether any undo log records can be generated */
+UNIV_INTERN bool srv_undo_sources;
+
+#ifdef UNIV_DEBUG
+/** InnoDB system tablespace size to set during recovery */
+UNIV_INTERN uint srv_sys_space_size_debug;
+/** whether a redo log file has been created at startup */
+UNIV_INTERN bool srv_log_file_created;
+#endif /* UNIV_DEBUG */
+
+/** whether some background threads that create redo log have been started */
+static bool srv_started_redo;
+
+/** During shutdown this value climbs from SRV_SHUTDOWN_NONE to
+SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
+enum srv_shutdown_t srv_shutdown_state = SRV_SHUTDOWN_NONE;
+
+/** Name of srv_monitor_file */
+static char* srv_monitor_file_name;
+std::unique_ptr<tpool::timer> srv_master_timer;
+
+/** Maximum number of pending synchronous I/O operations */
+#define SRV_MAX_N_PENDING_SYNC_IOS 100
+
+#ifdef UNIV_PFS_THREAD
+/* Keys to register InnoDB threads with performance schema */
+mysql_pfs_key_t thread_pool_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+/** Array of all InnoDB stage events for monitoring activities via
+performance schema. */
+static PSI_stage_info* srv_stages[] =
+{
+ &srv_stage_alter_table_end,
+ &srv_stage_alter_table_insert,
+ &srv_stage_alter_table_log_index,
+ &srv_stage_alter_table_log_table,
+ &srv_stage_alter_table_merge_sort,
+ &srv_stage_alter_table_read_pk_internal_sort,
+ &srv_stage_buffer_pool_load,
+};
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+/*********************************************************************//**
+Check if a file can be opened in read-write mode.
+@return true if it doesn't exist or can be opened in rw mode. */
+static
+bool
+srv_file_check_mode(
+/*================*/
+ const char* name) /*!< in: filename to check */
+{
+ os_file_stat_t stat;
+
+ memset(&stat, 0x0, sizeof(stat));
+
+ dberr_t err = os_file_get_status(
+ name, &stat, true, srv_read_only_mode);
+
+ if (err == DB_FAIL) {
+ ib::error() << "os_file_get_status() failed on '" << name
+ << "'. Can't determine file permissions.";
+ return(false);
+
+ } else if (err == DB_SUCCESS) {
+
+		/* Note: stat.rw_perm is only valid for regular files */
+
+ if (stat.type == OS_FILE_TYPE_FILE) {
+
+ if (!stat.rw_perm) {
+ const char* mode = srv_read_only_mode
+ ? "read" : "read-write";
+ ib::error() << name << " can't be opened in "
+ << mode << " mode.";
+ return(false);
+ }
+ } else {
+ /* Not a regular file, bail out. */
+ ib::error() << "'" << name << "' not a regular file.";
+
+ return(false);
+ }
+ } else {
+
+ /* This is OK. If the file create fails on RO media, there
+ is nothing we can do. */
+
+ ut_a(err == DB_NOT_FOUND);
+ }
+
+ return(true);
+}
+
+/** Initial number of the redo log file */
+static const char INIT_LOG_FILE0[]= "101";
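+/* The redo log file is first created under this suffix (ib_logfile101)
+and only renamed to LOG_FILE_NAME by create_log_file_rename() after a
+log checkpoint has been written, so that crash recovery can never see
+a partially created file. */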
+
+/** Create the redo log file.
+@param[in] create_new_db whether the database is being initialized
+@param[in] lsn FIL_PAGE_FILE_FLUSH_LSN value
+@param[out] logfile0 name of the log file
+@return DB_SUCCESS or error code */
+static dberr_t create_log_file(bool create_new_db, lsn_t lsn,
+ std::string& logfile0)
+{
+ if (srv_read_only_mode) {
+ ib::error() << "Cannot create log file in read-only mode";
+ return DB_READ_ONLY;
+ }
+
+ /* Crashing after deleting the first file should be
+ recoverable. The buffer pool was clean, and we can simply
+ create the log file from scratch. */
+ DBUG_EXECUTE_IF("innodb_log_abort_6", delete_log_file("0");
+ return DB_ERROR;);
+
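+ /* Delete ib_logfile0 through ib_logfile101: earlier versions
+ could use multiple redo log files, and an interrupted rebuild
+ may have left the temporary ib_logfile101 behind. */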
+ for (size_t i = 0; i < 102; i++) {
+ delete_log_file(std::to_string(i).c_str());
+ }
+
+ DBUG_PRINT("ib_log", ("After innodb_log_abort_6"));
+ DBUG_ASSERT(!buf_pool.any_io_pending());
+
+ DBUG_EXECUTE_IF("innodb_log_abort_7", return DB_ERROR;);
+ DBUG_PRINT("ib_log", ("After innodb_log_abort_7"));
+
+ logfile0 = get_log_file_path(LOG_FILE_NAME_PREFIX)
+ .append(INIT_LOG_FILE0);
+
+ bool ret;
+ pfs_os_file_t file = os_file_create(
+ innodb_log_file_key, logfile0.c_str(),
+ OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL,
+ OS_LOG_FILE, srv_read_only_mode, &ret);
+
+ if (!ret) {
+ ib::error() << "Cannot create " << logfile0;
+ return DB_ERROR;
+ }
+
+ ib::info() << "Setting log file " << logfile0 << " size to "
+ << srv_log_file_size << " bytes";
+
+ ret = os_file_set_size(logfile0.c_str(), file, srv_log_file_size);
+ if (!ret) {
+ os_file_close(file);
+ ib::error() << "Cannot set log file " << logfile0
+ << " size to " << srv_log_file_size << " bytes";
+ return DB_ERROR;
+ }
+
+ ret = os_file_close(file);
+ ut_a(ret);
+
+ DBUG_EXECUTE_IF("innodb_log_abort_8", return(DB_ERROR););
+ DBUG_PRINT("ib_log", ("After innodb_log_abort_8"));
+
+ /* We did not create the first log file under its final name
+ LOG_FILE_NAME, so that crash recovery cannot find it until it
+ has been completely written and renamed. */
+
+ log_sys.log.create();
+ if (!log_set_capacity(srv_log_file_size_requested)) {
+ return DB_ERROR;
+ }
+
+ log_sys.log.open_file(logfile0);
+ if (!fil_system.sys_space->open(create_new_db)) {
+ return DB_ERROR;
+ }
+
+ /* Create a log checkpoint. */
+ mysql_mutex_lock(&log_sys.mutex);
+ if (log_sys.is_encrypted() && !log_crypt_init()) {
+ mysql_mutex_unlock(&log_sys.mutex);
+ return DB_ERROR;
+ }
+ ut_d(recv_no_log_write = false);
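+ /* Align the initial LSN up to a log block boundary (512 bytes);
+ the first record will begin right after the log block header. */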
+ lsn = ut_uint64_align_up(lsn, OS_FILE_LOG_BLOCK_SIZE);
+ log_sys.set_lsn(lsn + LOG_BLOCK_HDR_SIZE);
+ log_sys.log.set_lsn(lsn);
+ log_sys.log.set_lsn_offset(LOG_FILE_HDR_SIZE);
+
+ log_sys.buf_next_to_write = 0;
+ log_sys.write_lsn = lsn;
+
+ log_sys.next_checkpoint_no = 0;
+ log_sys.last_checkpoint_lsn = 0;
+
+ memset(log_sys.buf, 0, srv_log_buffer_size);
+ log_block_init(log_sys.buf, lsn);
+ log_block_set_first_rec_group(log_sys.buf, LOG_BLOCK_HDR_SIZE);
+ memset(log_sys.flush_buf, 0, srv_log_buffer_size);
+
+ log_sys.buf_free = LOG_BLOCK_HDR_SIZE;
+
+ log_sys.log.write_header_durable(lsn);
+
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ log_make_checkpoint();
+ log_buffer_flush_to_disk();
+
+ return DB_SUCCESS;
+}
+
+/** Rename the first redo log file.
+@param[in] lsn FIL_PAGE_FILE_FLUSH_LSN value
+@param[in,out] logfile0 name of the first log file
+@return error code
+@retval DB_SUCCESS on successful operation */
+MY_ATTRIBUTE((warn_unused_result))
+static dberr_t create_log_file_rename(lsn_t lsn, std::string &logfile0)
+{
+ ut_ad(!srv_log_file_created);
+ ut_d(srv_log_file_created= true);
+
+ DBUG_EXECUTE_IF("innodb_log_abort_9", return (DB_ERROR););
+ DBUG_PRINT("ib_log", ("After innodb_log_abort_9"));
+
+ /* Rename the first log file, now that a log checkpoint has been created. */
+ auto new_name = get_log_file_path();
+
+ ib::info() << "Renaming log file " << logfile0 << " to " << new_name;
+
+ mysql_mutex_lock(&log_sys.mutex);
+ ut_ad(logfile0.size() == 2 + new_name.size());
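+  /* The temporary suffix "101" is 3 characters where the final
+  suffix "0" is 1 character, so the old name is 2 bytes longer. */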
+ logfile0= new_name;
+ dberr_t err= log_sys.log.rename(std::move(new_name));
+
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ DBUG_EXECUTE_IF("innodb_log_abort_10", err= DB_ERROR;);
+
+ if (err == DB_SUCCESS)
+ ib::info() << "New log file created, LSN=" << lsn;
+
+ return err;
+}
+
+/** Create an undo tablespace file
+@param[in] name file name
+@return DB_SUCCESS or error code */
+static dberr_t srv_undo_tablespace_create(const char* name)
+{
+ pfs_os_file_t fh;
+ bool ret;
+ dberr_t err = DB_SUCCESS;
+
+ os_file_create_subdirs_if_needed(name);
+
+ fh = os_file_create(
+ innodb_data_file_key,
+ name,
+ srv_read_only_mode ? OS_FILE_OPEN : OS_FILE_CREATE,
+ OS_FILE_NORMAL, OS_DATA_FILE, srv_read_only_mode, &ret);
+
+ if (!ret) {
+ if (os_file_get_last_error(false) != OS_FILE_ALREADY_EXISTS
+#ifdef UNIV_AIX
+ /* AIX 5.1 after security patch ML7 may have
+ errno set to 0 here, which causes our function
+ to return 100; work around that AIX problem */
+ && os_file_get_last_error(false) != 100
+#endif /* UNIV_AIX */
+ ) {
+ ib::error() << "Can't create UNDO tablespace "
+ << name;
+ }
+ err = DB_ERROR;
+ } else if (srv_read_only_mode) {
+ ib::info() << name << " opened in read-only mode";
+ } else {
+ /* We created the data file and now write it full of zeros */
+
+ ib::info() << "Data file " << name << " did not exist: new to"
+ " be created";
+
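+ /* The file size is SRV_UNDO_TABLESPACE_SIZE_IN_PAGES pages of
+ (1 << srv_page_size_shift) bytes each; shifting the page count
+ right by (20 - srv_page_size_shift) converts it to MiB. */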
+ ib::info() << "Setting file " << name << " size to "
+ << (SRV_UNDO_TABLESPACE_SIZE_IN_PAGES >> (20 - srv_page_size_shift)) << " MB";
+
+ ib::info() << "Database physically writes the file full: "
+ << "wait...";
+
+ if (!os_file_set_size(name, fh, os_offset_t
+ {SRV_UNDO_TABLESPACE_SIZE_IN_PAGES}
+ << srv_page_size_shift)) {
+ ib::error() << "Unable to allocate " << name;
+ err = DB_ERROR;
+ }
+
+ os_file_close(fh);
+ }
+
+ return(err);
+}
+
+/** Validate the number of opened undo tablespaces against the
+configured innodb_undo_tablespaces setting.
+@return DB_SUCCESS if it is valid */
+static dberr_t srv_validate_undo_tablespaces()
+{
+ /* If the user claims fewer undo tablespaces than we find, we
+ tolerate that discrepancy, but not the inverse, because the extra
+ tablespaces may simply be unused ones reserved for future use. */
+
+ if (srv_undo_tablespaces > srv_undo_tablespaces_open)
+ {
+ ib::error() << "Expected to open innodb_undo_tablespaces="
+ << srv_undo_tablespaces
+ << " but was able to find only "
+ << srv_undo_tablespaces_open;
+
+ return DB_ERROR;
+ }
+ else if (srv_undo_tablespaces_open > 0)
+ {
+ ib::info() << "Opened " << srv_undo_tablespaces_open
+ << " undo tablespaces";
+
+ if (srv_undo_tablespaces == 0)
+ ib::warn() << "innodb_undo_tablespaces=0 disables"
+ " dedicated undo log tablespaces";
+ }
+ return DB_SUCCESS;
+}
+
+/** @return the number of active undo tablespaces (except system tablespace) */
+static ulint trx_rseg_get_n_undo_tablespaces()
+{
+ std::set<uint32_t> space_ids;
+ mtr_t mtr;
+ mtr.start();
+
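+ /* The TRX_SYS page contains TRX_SYS_N_RSEGS rollback segment slots.
+ Each allocated slot stores a page number and a space id; the distinct
+ nonzero space ids identify the dedicated undo tablespaces. */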
+ if (const buf_block_t *sys_header= trx_sysf_get(&mtr, false))
+ for (ulint rseg_id= 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++)
+ if (trx_sysf_rseg_get_page_no(sys_header, rseg_id) != FIL_NULL)
+ if (uint32_t space= trx_sysf_rseg_get_space(sys_header, rseg_id))
+ space_ids.insert(space);
+ mtr.commit();
+ return space_ids.size();
+}
+
+/** Open an undo tablespace.
+@param[in] create whether undo tablespaces are being created
+@param[in] name tablespace file name
+@param[in] i undo tablespace index (0-based)
+@return undo tablespace identifier
+@retval 0 on failure */
+static ulint srv_undo_tablespace_open(bool create, const char* name, ulint i)
+{
+ bool success;
+ char undo_name[sizeof "innodb_undo000"];
+ ulint space_id= 0;
+ ulint fsp_flags= 0;
+
+ if (create)
+ {
+ space_id= srv_undo_space_id_start + i;
+ snprintf(undo_name, sizeof(undo_name),
+ "innodb_undo%03u", static_cast<unsigned>(space_id));
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ fsp_flags= FSP_FLAGS_FCRC32_MASK_MARKER | FSP_FLAGS_FCRC32_PAGE_SSIZE();
+ break;
+ default:
+ fsp_flags= FSP_FLAGS_PAGE_SSIZE();
+ }
+ }
+
+ pfs_os_file_t fh= os_file_create(innodb_data_file_key, name, OS_FILE_OPEN |
+ OS_FILE_ON_ERROR_NO_EXIT |
+ OS_FILE_ON_ERROR_SILENT,
+ OS_FILE_AIO, OS_DATA_FILE,
+ srv_read_only_mode, &success);
+
+ if (!success)
+ return 0;
+
+ os_offset_t size= os_file_get_size(fh);
+ ut_a(size != os_offset_t(-1));
+
+ if (!create)
+ {
+ page_t *page= static_cast<byte*>(aligned_malloc(srv_page_size,
+ srv_page_size));
+ dberr_t err= os_file_read(IORequestRead, fh, page, 0, srv_page_size);
+ if (err != DB_SUCCESS)
+ {
+err_exit:
+ ib::error() << "Unable to read first page of file " << name;
+ aligned_free(page);
+ /* This function reports failure by returning 0 (not a valid
+ undo tablespace id), not a dberr_t code. */
+ return 0;
+ }
+
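+ /* The tablespace id is stored both in the page header
+ (FIL_PAGE_SPACE_ID) and in the FSP header of page 0;
+ the two copies must agree. */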
+ uint32_t id= mach_read_from_4(FIL_PAGE_SPACE_ID + page);
+ if (id == 0 || id >= SRV_SPACE_ID_UPPER_BOUND ||
+ memcmp_aligned<2>(FIL_PAGE_SPACE_ID + page,
+ FSP_HEADER_OFFSET + FSP_SPACE_ID + page, 4))
+ {
+ ib::error() << "Inconsistent tablespace ID in file " << name;
+ err= DB_CORRUPTION;
+ goto err_exit;
+ }
+
+ fsp_flags= mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page);
+ if (buf_page_is_corrupted(false, page, fsp_flags))
+ {
+ ib::error() << "Checksum mismatch in the first page of file " << name;
+ err= DB_CORRUPTION;
+ goto err_exit;
+ }
+
+ space_id= id;
+ snprintf(undo_name, sizeof undo_name, "innodb_undo%03u", id);
+ aligned_free(page);
+ }
+
+ /* Load the tablespace into InnoDB's internal data structures. */
+
+ /* Pass the undo tablespace id to fil_set_max_space_id_if_bigger(),
+ because InnoDB has not opened any tablespace apart from the
+ system tablespace yet. */
+
+ fil_set_max_space_id_if_bigger(space_id);
+
+ fil_space_t *space= fil_space_t::create(undo_name, space_id, fsp_flags,
+ FIL_TYPE_TABLESPACE, NULL);
+ ut_a(fil_validate());
+ ut_a(space);
+
+ fil_node_t *file= space->add(name, fh, 0, false, true);
+ mutex_enter(&fil_system.mutex);
+
+ if (create)
+ {
+ space->set_sizes(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES);
+ space->size= file->size= uint32_t(size >> srv_page_size_shift);
+ }
+ else if (!file->read_page0())
+ {
+ os_file_close(file->handle);
+ file->handle= OS_FILE_CLOSED;
+ ut_a(fil_system.n_open > 0);
+ fil_system.n_open--;
+ }
+
+ mutex_exit(&fil_system.mutex);
+ return space_id;
+}
+
+/** Check if undo tablespaces and redo log files exist before creating a
+new system tablespace
+@retval DB_SUCCESS if no undo or redo log files are found
+@retval DB_ERROR if any undo or redo log files are found */
+static
+dberr_t
+srv_check_undo_redo_logs_exists()
+{
+ bool ret;
+ pfs_os_file_t fh;
+ char name[OS_FILE_MAX_PATH];
+
+ /* Check if any undo tablespaces exist */
+ for (ulint i = 1; i <= srv_undo_tablespaces; ++i) {
+
+ snprintf(
+ name, sizeof(name),
+ "%s%cundo%03zu",
+ srv_undo_dir, OS_PATH_SEPARATOR,
+ i);
+
+ fh = os_file_create(
+ innodb_data_file_key, name,
+ OS_FILE_OPEN_RETRY
+ | OS_FILE_ON_ERROR_NO_EXIT
+ | OS_FILE_ON_ERROR_SILENT,
+ OS_FILE_NORMAL,
+ OS_DATA_FILE,
+ srv_read_only_mode,
+ &ret);
+
+ if (ret) {
+ os_file_close(fh);
+ ib::error()
+ << "undo tablespace '" << name << "' exists."
+ " Creating system tablespace with existing undo"
+ " tablespaces is not supported. Please delete"
+ " all undo tablespaces before creating new"
+ " system tablespace.";
+ return(DB_ERROR);
+ }
+ }
+
+ /* Check if redo log file exists */
+ auto logfilename = get_log_file_path();
+
+ fh = os_file_create(innodb_log_file_key, logfilename.c_str(),
+ OS_FILE_OPEN_RETRY | OS_FILE_ON_ERROR_NO_EXIT
+ | OS_FILE_ON_ERROR_SILENT,
+ OS_FILE_NORMAL, OS_LOG_FILE, srv_read_only_mode,
+ &ret);
+
+ if (ret) {
+ os_file_close(fh);
+ ib::error() << "redo log file '" << logfilename
+ << "' exists. Creating system tablespace with"
+ " existing redo log file is not recommended."
+ " Please delete redo log file before"
+ " creating new system tablespace.";
+ return DB_ERROR;
+ }
+
+ return(DB_SUCCESS);
+}
+
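+/** Open all undo tablespaces that are currently in use, plus any
+contiguous unused ones beyond them.
+@param[in] create_new_db whether the database is being initialized
+@param[in] n_undo number of in-use undo tablespaces to open
+@return DB_SUCCESS or error code */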
+static dberr_t srv_all_undo_tablespaces_open(bool create_new_db, ulint n_undo)
+{
+ /* Open all the undo tablespaces that are currently in use. If we
+ fail to open any of these it is a fatal error. The tablespace ids
+ should be contiguous. It is a fatal error because they are required
+ for recovery and are referenced by the UNDO logs (a.k.a. RBS). */
+
+ ulint prev_id= create_new_db ? srv_undo_space_id_start - 1 : 0;
+
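+ /* When creating a new database, space ids are preassigned starting
+ at srv_undo_space_id_start; otherwise each id is read from the first
+ page of the tablespace file by srv_undo_tablespace_open(). */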
+ for (ulint i= 0; i < n_undo; ++i)
+ {
+ char name[OS_FILE_MAX_PATH];
+ snprintf(name, sizeof name, "%s%cundo%03zu", srv_undo_dir,
+ OS_PATH_SEPARATOR, i + 1);
+ ulint space_id= srv_undo_tablespace_open(create_new_db, name, i);
+ if (!space_id)
+ {
+ if (!create_new_db)
+ break;
+ ib::error() << "Unable to open create tablespace '" << name << "'.";
+ return DB_ERROR;
+ }
+
+ /* Should be no gaps in undo tablespace ids. */
+ ut_a(!i || prev_id + 1 == space_id);
+
+ prev_id= space_id;
+
+ /* Remember the space id of the first undo
+ tablespace that we open. */
+ if (0 == srv_undo_tablespaces_open++)
+ srv_undo_space_id_start= space_id;
+ }
+
+ /* Open any extra, currently unused undo tablespaces. These are not
+ required by recovery, so we stop at the first failure; we only
+ check that the space ids are contiguous, with no gaps. */
+
+ for (ulint i= prev_id + 1; i < srv_undo_space_id_start + TRX_SYS_N_RSEGS;
+ ++i)
+ {
+ char name[OS_FILE_MAX_PATH];
+ snprintf(name, sizeof(name),
+ "%s%cundo%03zu", srv_undo_dir, OS_PATH_SEPARATOR, i);
+ if (!srv_undo_tablespace_open(create_new_db, name, i))
+ break;
+ ++srv_undo_tablespaces_open;
+ }
+
+ return srv_validate_undo_tablespaces();
+}
+
+/** Open the configured number of dedicated undo tablespaces.
+@param[in] create_new_db whether the database is being initialized
+@return DB_SUCCESS or error code */
+dberr_t
+srv_undo_tablespaces_init(bool create_new_db)
+{
+ srv_undo_tablespaces_open= 0;
+
+ ut_a(srv_undo_tablespaces <= TRX_SYS_N_RSEGS);
+ ut_a(!create_new_db || srv_operation == SRV_OPERATION_NORMAL);
+
+ if (srv_undo_tablespaces == 1)
+ srv_undo_tablespaces= 0;
+
+ /* Create the undo spaces only if we are creating a new
+ instance. We do not support creating new undo tablespaces
+ in an existing instance (yet). */
+ if (create_new_db)
+ {
+ srv_undo_space_id_start= 1;
+ DBUG_EXECUTE_IF("innodb_undo_upgrade", srv_undo_space_id_start= 3;);
+
+ for (ulint i= 0; i < srv_undo_tablespaces; ++i)
+ {
+ char name[OS_FILE_MAX_PATH];
+ snprintf(name, sizeof name, "%s%cundo%03zu",
+ srv_undo_dir, OS_PATH_SEPARATOR, i + 1);
+ if (dberr_t err= srv_undo_tablespace_create(name))
+ {
+ ib::error() << "Could not create undo tablespace '" << name << "'.";
+ return err;
+ }
+ }
+ }
+
+ /* Get the tablespace ids of all the undo segments excluding
+ the system tablespace (0). If we are creating a new instance then
+ we build the undo_tablespace_ids ourselves since they don't
+ already exist. */
+ srv_undo_tablespaces_active= srv_undo_tablespaces;
+
+ ulint n_undo= (create_new_db || srv_operation == SRV_OPERATION_BACKUP ||
+ srv_operation == SRV_OPERATION_RESTORE_DELTA)
+ ? srv_undo_tablespaces : TRX_SYS_N_RSEGS;
+
+ if (dberr_t err= srv_all_undo_tablespaces_open(create_new_db, n_undo))
+ return err;
+
+ /* Initialize srv_undo_space_id_start=0 when there are no
+ dedicated undo tablespaces. */
+ if (srv_undo_tablespaces_open == 0)
+ srv_undo_space_id_start= 0;
+
+ if (create_new_db)
+ {
+ mtr_t mtr;
+ for (ulint i= 0; i < srv_undo_tablespaces; ++i)
+ {
+ mtr.start();
+ fsp_header_init(fil_space_get(srv_undo_space_id_start + i),
+ SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, &mtr);
+ mtr.commit();
+ }
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Create the temporary file tablespace.
+@param[in] create_new_db whether we are creating a new database
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+srv_open_tmp_tablespace(bool create_new_db)
+{
+ ulint sum_of_new_sizes;
+
+ /* Try to remove any temporary files left over by a previous
+ unclean shutdown. */
+ srv_tmp_space.set_sanity_check_status(true);
+ srv_tmp_space.delete_files();
+ srv_tmp_space.set_ignore_read_only(true);
+
+ ib::info() << "Creating shared tablespace for temporary tables";
+
+ bool create_new_temp_space;
+
+ srv_tmp_space.set_space_id(SRV_TMP_SPACE_ID);
+
+ dberr_t err = srv_tmp_space.check_file_spec(
+ &create_new_temp_space, 12 * 1024 * 1024);
+
+ if (err == DB_FAIL) {
+ ib::error() << "The innodb_temporary"
+ " data file must be writable!";
+ err = DB_ERROR;
+ } else if (err != DB_SUCCESS) {
+ ib::error() << "Could not create the shared innodb_temporary.";
+ } else if ((err = srv_tmp_space.open_or_create(
+ true, create_new_db, &sum_of_new_sizes, NULL))
+ != DB_SUCCESS) {
+ ib::error() << "Unable to create the shared innodb_temporary";
+ } else if (fil_system.temp_space->open(true)) {
+ /* Initialize the header page */
+ mtr_t mtr;
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ fsp_header_init(fil_system.temp_space,
+ srv_tmp_space.get_sum_of_sizes(),
+ &mtr);
+ mtr.commit();
+ } else {
+ /* This file was just opened in the code above! */
+ ib::error() << "The innodb_temporary"
+ " data file cannot be re-opened"
+ " after check_file_spec() succeeded!";
+ err = DB_ERROR;
+ }
+
+ return(err);
+}
+
+/** Shutdown background threads, except the page cleaner. */
+static void srv_shutdown_threads()
+{
+ ut_ad(!srv_undo_sources);
+ srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS;
+
+ lock_sys.timeout_timer.reset();
+ srv_master_timer.reset();
+
+ if (purge_sys.enabled()) {
+ srv_purge_shutdown();
+ }
+
+ if (srv_n_fil_crypt_threads) {
+ fil_crypt_set_thread_cnt(0);
+ }
+}
+
+#ifdef UNIV_DEBUG
+# define srv_init_abort(_db_err) \
+ srv_init_abort_low(create_new_db, __FILE__, __LINE__, _db_err)
+#else
+# define srv_init_abort(_db_err) \
+ srv_init_abort_low(create_new_db, _db_err)
+#endif /* UNIV_DEBUG */
+
+/** Innobase start-up aborted. Perform cleanup actions.
+@param[in] create_new_db TRUE if new db is being created
+@param[in] file File name
+@param[in] line Line number
+@param[in] err Reason for aborting InnoDB startup
+@return DB_SUCCESS or error code. */
+MY_ATTRIBUTE((warn_unused_result, nonnull))
+static
+dberr_t
+srv_init_abort_low(
+ bool create_new_db,
+#ifdef UNIV_DEBUG
+ const char* file,
+ unsigned line,
+#endif /* UNIV_DEBUG */
+ dberr_t err)
+{
+ ut_ad(srv_is_being_started);
+
+ if (create_new_db) {
+ ib::error() << "Database creation was aborted"
+#ifdef UNIV_DEBUG
+ " at " << innobase_basename(file) << "[" << line << "]"
+#endif /* UNIV_DEBUG */
+ " with error " << err << ". You may need"
+ " to delete the ibdata1 file before trying to start"
+ " up again.";
+ } else {
+ ib::error() << "Plugin initialization aborted"
+#ifdef UNIV_DEBUG
+ " at " << innobase_basename(file) << "[" << line << "]"
+#endif /* UNIV_DEBUG */
+ " with error " << err;
+ }
+
+ srv_shutdown_bg_undo_sources();
+ srv_shutdown_threads();
+ return(err);
+}
+
+/** Prepare to delete the redo log file. Flush the dirty pages from the
+buffer pool. Flush the redo log buffer to the redo log file.
+@param[in] old_exists old redo log file exists
+@return lsn up to which data pages have been flushed. */
+static lsn_t srv_prepare_to_delete_redo_log_file(bool old_exists)
+{
+ DBUG_ENTER("srv_prepare_to_delete_redo_log_file");
+
+ lsn_t flushed_lsn;
+ ulint count = 0;
+
+ if (log_sys.log.subformat != 2) {
+ srv_log_file_size = 0;
+ }
+
+ for (;;) {
+ /* Clean the buffer pool. */
+ buf_flush_sync();
+
+ DBUG_EXECUTE_IF("innodb_log_abort_1", DBUG_RETURN(0););
+ DBUG_PRINT("ib_log", ("After innodb_log_abort_1"));
+
+ mysql_mutex_lock(&log_sys.mutex);
+
+ fil_names_clear(log_sys.get_lsn(), false);
+
+ flushed_lsn = log_sys.get_lsn();
+
+ {
+ ib::info info;
+ if (srv_log_file_size == 0
+ || (log_sys.log.format & ~log_t::FORMAT_ENCRYPTED)
+ != log_t::FORMAT_10_5) {
+ info << "Upgrading redo log: ";
+ } else if (!old_exists
+ || srv_log_file_size
+ != srv_log_file_size_requested) {
+ if (srv_encrypt_log
+ == (my_bool)log_sys.is_encrypted()) {
+ info << (srv_encrypt_log
+ ? "Resizing encrypted"
+ : "Resizing");
+ } else if (srv_encrypt_log) {
+ info << "Encrypting and resizing";
+ } else {
+ info << "Removing encryption"
+ " and resizing";
+ }
+
+ info << " redo log from " << srv_log_file_size
+ << " to ";
+ } else if (srv_encrypt_log) {
+ info << "Encrypting redo log: ";
+ } else {
+ info << "Removing redo log encryption: ";
+ }
+
+ info << srv_log_file_size_requested
+ << " bytes; LSN=" << flushed_lsn;
+ }
+
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ if (flushed_lsn != log_sys.get_flushed_lsn()) {
+ log_write_up_to(flushed_lsn, false);
+ log_sys.log.flush();
+ }
+
+ ut_ad(flushed_lsn == log_sys.get_lsn());
+
+ /* Check if the buffer pool is clean. If not,
+ retry until it is. */
+ if (ulint pending_io = buf_pool.io_pending()) {
+ count++;
+ /* Print a message every 60 seconds if we
+ are waiting to clean the buffer pool */
+ if (srv_print_verbose_log && count > 600) {
+ ib::info() << "Waiting for "
+ << pending_io << " buffer "
+ << "page I/Os to complete";
+ count = 0;
+ }
+
+ os_thread_sleep(100000);
+ continue;
+ }
+
+ break;
+ }
+
+ DBUG_RETURN(flushed_lsn);
+}
+
+/** Try to locate LOG_FILE_NAME and check its size.
+@param[out] log_file_found set to true if the correct file was found
+@return DB_SUCCESS or error code */
+static dberr_t find_and_check_log_file(bool &log_file_found)
+{
+ log_file_found= false;
+
+ auto logfile0= get_log_file_path();
+ os_file_stat_t stat_info;
+ const dberr_t err= os_file_get_status(logfile0.c_str(), &stat_info, false,
+ srv_read_only_mode);
+
+ auto is_operation_restore= []() -> bool {
+ return srv_operation == SRV_OPERATION_RESTORE ||
+ srv_operation == SRV_OPERATION_RESTORE_EXPORT;
+ };
+
+ if (err == DB_NOT_FOUND)
+ {
+ if (is_operation_restore())
+ return DB_NOT_FOUND;
+
+ return DB_SUCCESS;
+ }
+
+ if (stat_info.type != OS_FILE_TYPE_FILE)
+ return DB_SUCCESS;
+
+ if (!srv_file_check_mode(logfile0.c_str()))
+ return DB_ERROR;
+
+ const os_offset_t size= stat_info.size;
+ ut_a(size != (os_offset_t) -1);
+
+ if (size % OS_FILE_LOG_BLOCK_SIZE)
+ {
+ ib::error() << "Log file " << logfile0 << " size " << size
+ << " is not a multiple of " << OS_FILE_LOG_BLOCK_SIZE
+ << " bytes";
+ return DB_ERROR;
+ }
+
+ if (size == 0 && is_operation_restore())
+ {
+ /* Tolerate an empty LOG_FILE_NAME from a previous run of
+ mariabackup --prepare. */
+ return DB_NOT_FOUND;
+ }
+ /* The first log file must consist of at least the following 512-byte pages:
+ header, checkpoint page 1, empty, checkpoint page 2, redo log page(s).
+
+ Mariabackup --prepare would create an empty LOG_FILE_NAME. Tolerate it. */
+ if (size != 0 && size <= OS_FILE_LOG_BLOCK_SIZE * 4)
+ {
+ ib::error() << "Log file " << logfile0 << " size " << size
+ << " is too small";
+ return DB_ERROR;
+ }
+ srv_log_file_size= size;
+
+ log_file_found= true;
+ return DB_SUCCESS;
+}
+
+/** Start InnoDB.
+@param[in] create_new_db whether to create a new database
+@return DB_SUCCESS or error code */
+dberr_t srv_start(bool create_new_db)
+{
+ lsn_t flushed_lsn;
+ dberr_t err = DB_SUCCESS;
+ bool srv_log_file_found = true;
+ mtr_t mtr;
+
+ ut_ad(srv_operation == SRV_OPERATION_NORMAL
+ || srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+
+ if (srv_force_recovery) {
+ ib::info() << "!!! innodb_force_recovery is set to "
+ << srv_force_recovery << " !!!";
+ }
+
+ if (srv_force_recovery == SRV_FORCE_NO_LOG_REDO) {
+ srv_read_only_mode = true;
+ }
+
+ high_level_read_only = srv_read_only_mode
+ || srv_force_recovery > SRV_FORCE_NO_IBUF_MERGE
+ || srv_sys_space.created_new_raw();
+
+ srv_started_redo = false;
+
+ compile_time_assert(sizeof(ulint) == sizeof(void*));
+
+#ifdef UNIV_DEBUG
+ ib::info() << "!!!!!!!! UNIV_DEBUG switched on !!!!!!!!!";
+#endif
+
+#ifdef UNIV_IBUF_DEBUG
+ ib::info() << "!!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!";
+#endif
+
+ ib::info() << MUTEX_TYPE;
+
+ ib::info() << "Compressed tables use zlib " ZLIB_VERSION
+#ifdef UNIV_ZIP_DEBUG
+ " with validation"
+#endif /* UNIV_ZIP_DEBUG */
+ ;
+#ifdef UNIV_ZIP_COPY
+ ib::info() << "and extra copying";
+#endif /* UNIV_ZIP_COPY */
+
+ /* Since InnoDB does not currently clean up all its internal data
+ structures in MySQL Embedded Server Library server_end(), we
+ print an error message if someone tries to start up InnoDB a
+ second time during the process lifetime. */
+
+ if (srv_start_has_been_called) {
+ ib::error() << "Startup called second time"
+ " during the process lifetime."
+ " In the MySQL Embedded Server Library"
+ " you cannot call server_init() more than"
+ " once during the process lifetime.";
+ }
+
+ srv_start_has_been_called = true;
+
+ srv_is_being_started = true;
+
+ /* Register performance schema stages before any real work has been
+ started which may need to be instrumented. */
+ mysql_stage_register("innodb", srv_stages,
+ static_cast<int>(UT_ARR_SIZE(srv_stages)));
+
+ /* Set the maximum number of threads which can wait for a semaphore
+ inside InnoDB: this is the 'sync wait array' size */
+
+ srv_max_n_threads = 1 /* io_ibuf_thread */
+ + 1 /* io_log_thread */
+ + 1 /* srv_print_monitor_task */
+ + 1 /* srv_purge_coordinator_thread */
+ + 1 /* buf_dump_thread */
+ + 1 /* dict_stats_thread */
+ + 1 /* fts_optimize_thread */
+ + 1 /* trx_rollback_all_recovered */
+ + 128 /* added as margin, for use of
+ InnoDB Memcached etc. */
+ + 1/* buf_flush_page_cleaner */
+ + max_connections
+ + srv_n_read_io_threads
+ + srv_n_write_io_threads
+ + srv_n_purge_threads
+ /* FTS Parallel Sort */
+ + fts_sort_pll_degree * FTS_NUM_AUX_INDEX
+ * max_connections;
+
+ srv_boot();
+
+ ib::info() << my_crc32c_implementation();
+
+ if (!srv_read_only_mode) {
+
+ mutex_create(LATCH_ID_SRV_MONITOR_FILE,
+ &srv_monitor_file_mutex);
+
+ if (srv_innodb_status) {
+
+ srv_monitor_file_name = static_cast<char*>(
+ ut_malloc_nokey(
+ strlen(fil_path_to_mysql_datadir)
+ + 20 + sizeof "/innodb_status."));
+
+ sprintf(srv_monitor_file_name,
+ "%s/innodb_status." ULINTPF,
+ fil_path_to_mysql_datadir,
+ static_cast<ulint>
+ (IF_WIN(GetCurrentProcessId(), getpid())));
+
+ srv_monitor_file = my_fopen(srv_monitor_file_name,
+ O_RDWR|O_TRUNC|O_CREAT,
+ MYF(MY_WME));
+
+ if (!srv_monitor_file) {
+ ib::error() << "Unable to create "
+ << srv_monitor_file_name << ": "
+ << strerror(errno);
+ if (err == DB_SUCCESS) {
+ err = DB_ERROR;
+ }
+ }
+ } else {
+
+ srv_monitor_file_name = NULL;
+ srv_monitor_file = os_file_create_tmpfile();
+
+ if (!srv_monitor_file && err == DB_SUCCESS) {
+ err = DB_ERROR;
+ }
+ }
+
+ mutex_create(LATCH_ID_SRV_MISC_TMPFILE,
+ &srv_misc_tmpfile_mutex);
+
+ srv_misc_tmpfile = os_file_create_tmpfile();
+
+ if (!srv_misc_tmpfile && err == DB_SUCCESS) {
+ err = DB_ERROR;
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(err));
+ }
+
+ srv_n_file_io_threads = srv_n_read_io_threads + srv_n_write_io_threads;
+
+ if (!srv_read_only_mode) {
+ /* Add the log and ibuf IO threads. */
+ srv_n_file_io_threads += 2;
+ } else {
+ ib::info() << "Disabling background log and ibuf IO write"
+ << " threads.";
+ }
+
+ ut_a(srv_n_file_io_threads <= SRV_MAX_N_IO_THREADS);
+
+ if (os_aio_init()) {
+ ib::error() << "Cannot initialize AIO sub-system";
+
+ return(srv_init_abort(DB_ERROR));
+ }
+
+#ifdef LINUX_NATIVE_AIO
+ if (srv_use_native_aio) {
+ ib::info() << "Using Linux native AIO";
+ }
+#endif
+
+ fil_system.create(srv_file_per_table ? 50000 : 5000);
+
+ ib::info() << "Initializing buffer pool, total size = "
+ << srv_buf_pool_size
+ << ", chunk size = " << srv_buf_pool_chunk_unit;
+
+ if (buf_pool.create()) {
+ ib::error() << "Cannot allocate memory for the buffer pool";
+
+ return(srv_init_abort(DB_ERROR));
+ }
+
+ ib::info() << "Completed initialization of buffer pool";
+
+#ifdef UNIV_DEBUG
+ /* We have observed deadlocks with a 5MB buffer pool but
+ the actual lower limit could very well be a little higher. */
+
+ if (srv_buf_pool_size <= 5 * 1024 * 1024) {
+
+ ib::info() << "Small buffer pool size ("
+ << srv_buf_pool_size / 1024 / 1024
+ << "M), the flst_validate() debug function can cause a"
+ << " deadlock if the buffer pool fills up.";
+ }
+#endif /* UNIV_DEBUG */
+
+ log_sys.create();
+ recv_sys.create();
+ lock_sys.create(srv_lock_table_size);
+
+
+ if (!srv_read_only_mode) {
+ buf_flush_page_cleaner_init();
+ ut_ad(buf_page_cleaner_is_active);
+ }
+
+ srv_startup_is_before_trx_rollback_phase = !create_new_db;
+
+ /* Check if undo tablespaces and redo log files exist before creating
+ a new system tablespace */
+ if (create_new_db) {
+ err = srv_check_undo_redo_logs_exists();
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(DB_ERROR));
+ }
+ recv_sys.debug_free();
+ }
+
+ /* Open or create the data files. */
+ ulint sum_of_new_sizes;
+
+ err = srv_sys_space.open_or_create(
+ false, create_new_db, &sum_of_new_sizes, &flushed_lsn);
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ case DB_CANNOT_OPEN_FILE:
+ ib::error()
+ << "Could not open or create the system tablespace. If"
+ " you tried to add new data files to the system"
+ " tablespace, and it failed here, you should now"
+ " edit innodb_data_file_path in my.cnf back to what"
+ " it was, and remove the new ibdata files InnoDB"
+ " created in this failed attempt. InnoDB only wrote"
+ " those files full of zeros, but did not yet use"
+ " them in any way. But be careful: do not remove"
+ " old data files which contain your precious data!";
+ /* fall through */
+ default:
+ /* Other errors might come from Datafile::validate_first_page() */
+ return(srv_init_abort(err));
+ }
+
+ srv_log_file_size_requested = srv_log_file_size;
+
+ if (innodb_encrypt_temporary_tables && !log_crypt_init()) {
+ return srv_init_abort(DB_ERROR);
+ }
+
+ std::string logfile0;
+ bool create_new_log = create_new_db;
+ if (create_new_db) {
+ flushed_lsn = log_sys.get_lsn();
+ log_sys.set_flushed_lsn(flushed_lsn);
+ buf_flush_sync();
+
+ err = create_log_file(true, flushed_lsn, logfile0);
+
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(err));
+ }
+ } else {
+ srv_log_file_size = 0;
+
+ bool log_file_found;
+ if (dberr_t err = find_and_check_log_file(log_file_found)) {
+ if (err == DB_NOT_FOUND) {
+ return DB_SUCCESS;
+ }
+ return srv_init_abort(err);
+ }
+
+ create_new_log = srv_log_file_size == 0;
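+ /* The redo log file is missing, or it is a zero-length
+ file left behind by mariabackup --prepare; create a
+ new log file. */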
+ if (create_new_log) {
+ if (flushed_lsn < lsn_t(1000)) {
+ ib::error()
+ << "Cannot create log file because"
+ " data files are corrupt or the"
+ " database was not shut down cleanly"
+ " after creating the data files.";
+ return srv_init_abort(DB_ERROR);
+ }
+
+ srv_log_file_size = srv_log_file_size_requested;
+
+ err = create_log_file(false, flushed_lsn, logfile0);
+
+ if (err == DB_SUCCESS) {
+ err = create_log_file_rename(flushed_lsn,
+ logfile0);
+ }
+
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(err));
+ }
+
+ /* Suppress the message about
+ crash recovery. */
+ flushed_lsn = log_sys.get_lsn();
+ goto file_checked;
+ }
+
+ srv_log_file_found = log_file_found;
+
+ log_sys.log.open_file(get_log_file_path());
+
+ log_sys.log.create();
+
+ if (!log_set_capacity(srv_log_file_size_requested)) {
+ return(srv_init_abort(DB_ERROR));
+ }
+ }
+
+file_checked:
+ /* Open the log file and the data files in the system tablespace:
+ we keep them open until database shutdown */
+ ut_d(fil_system.sys_space->recv_size = srv_sys_space_size_debug);
+
+ err = fil_system.sys_space->open(create_new_db)
+ ? srv_undo_tablespaces_init(create_new_db)
+ : DB_ERROR;
+
+ /* If the force recovery is set very high then we carry on regardless
+ of all errors. Basically this is fingers crossed mode. */
+
+ if (err != DB_SUCCESS
+ && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
+
+ return(srv_init_abort(err));
+ }
+
+ /* Initialize objects used by dict stats gathering thread, which
+ can also be used by recovery if it tries to drop some table */
+ if (!srv_read_only_mode) {
+ dict_stats_init();
+ }
+
+ trx_sys.create();
+
+ if (create_new_db) {
+ ut_ad(!srv_read_only_mode);
+
+ mtr_start(&mtr);
+ ut_ad(fil_system.sys_space->id == 0);
+ compile_time_assert(TRX_SYS_SPACE == 0);
+ compile_time_assert(IBUF_SPACE_ID == 0);
+ fsp_header_init(fil_system.sys_space,
+ uint32_t(sum_of_new_sizes), &mtr);
+
+ ulint ibuf_root = btr_create(
+ DICT_CLUSTERED | DICT_IBUF, fil_system.sys_space,
+ DICT_IBUF_ID_MIN, nullptr, &mtr);
+
+ mtr_commit(&mtr);
+
+ if (ibuf_root == FIL_NULL) {
+ return(srv_init_abort(DB_ERROR));
+ }
+
+ ut_ad(ibuf_root == IBUF_TREE_ROOT_PAGE_NO);
+
+ /* To maintain backward compatibility we create only
+ the first rollback segment before the double write buffer.
+ All the remaining rollback segments will be created later,
+ after the double write buffer has been created. */
+ trx_sys_create_sys_pages();
+ err = trx_lists_init_at_db_start();
+
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(err));
+ }
+
+ err = dict_create();
+
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(err));
+ }
+
+ buf_flush_sync();
+
+ flushed_lsn = log_sys.get_lsn();
+
+ err = fil_write_flushed_lsn(flushed_lsn);
+
+ if (err == DB_SUCCESS) {
+ err = create_log_file_rename(flushed_lsn, logfile0);
+ }
+
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(err));
+ }
+ } else {
+ /* Suppress warnings in fil_space_t::create() for files
+ that are being read before dict_boot() has recovered
+ DICT_HDR_MAX_SPACE_ID. */
+ fil_system.space_id_reuse_warned = true;
+
+ /* We always try to do a recovery, even if the database had
+ been shut down normally: this is the normal startup path */
+
+ err = create_new_log
+ ? DB_SUCCESS
+ : recv_recovery_from_checkpoint_start(flushed_lsn);
+ recv_sys.close_files();
+
+ recv_sys.dblwr.pages.clear();
+
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(err));
+ }
+
+ switch (srv_operation) {
+ case SRV_OPERATION_NORMAL:
+ case SRV_OPERATION_RESTORE_EXPORT:
+ /* Initialize the change buffer. */
+ err = dict_boot();
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(err));
+ }
+ /* fall through */
+ case SRV_OPERATION_RESTORE:
+ /* This must precede
+ recv_sys.apply(true). */
+ srv_undo_tablespaces_active
+ = trx_rseg_get_n_undo_tablespaces();
+ err = srv_validate_undo_tablespaces();
+ if (err != DB_SUCCESS) {
+ return srv_init_abort(err);
+ }
+ if (srv_operation == SRV_OPERATION_RESTORE) {
+ break;
+ }
+ err = trx_lists_init_at_db_start();
+ if (err != DB_SUCCESS) {
+ return srv_init_abort(err);
+ }
+ break;
+ case SRV_OPERATION_RESTORE_DELTA:
+ case SRV_OPERATION_BACKUP:
+ ut_ad("wrong mariabackup mode" == 0);
+ }
+
+ if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
+ /* Apply the hashed log records to the
+ respective file pages, for the last batch of
+ recv_group_scan_log_recs(). */
+
+ recv_sys.apply(true);
+
+ if (recv_sys.found_corrupt_log
+ || recv_sys.found_corrupt_fs) {
+ return(srv_init_abort(DB_CORRUPTION));
+ }
+
+ DBUG_PRINT("ib_log", ("apply completed"));
+
+ if (recv_needed_recovery) {
+ trx_sys_print_mysql_binlog_offset();
+ }
+ }
+
+ fil_system.space_id_reuse_warned = false;
+
+ if (!srv_read_only_mode) {
+ const ulint flags = FSP_FLAGS_PAGE_SSIZE();
+ for (ulint id = 0; id <= srv_undo_tablespaces; id++) {
+ if (fil_space_t* space = fil_space_get(id)) {
+ fsp_flags_try_adjust(space, flags);
+ }
+ }
+
+ if (sum_of_new_sizes > 0) {
+ /* New data file(s) were added */
+ mtr.start();
+ mtr.x_lock_space(fil_system.sys_space,
+ __FILE__, __LINE__);
+ buf_block_t* block = buf_page_get(
+ page_id_t(0, 0), 0,
+ RW_SX_LATCH, &mtr);
+ ulint size = mach_read_from_4(
+ FSP_HEADER_OFFSET + FSP_SIZE
+ + block->frame);
+ ut_ad(size == fil_system.sys_space
+ ->size_in_header);
+ size += sum_of_new_sizes;
+ mtr.write<4>(*block,
+ FSP_HEADER_OFFSET + FSP_SIZE
+ + block->frame, size);
+ fil_system.sys_space->size_in_header
+ = uint32_t(size);
+ mtr.commit();
+ /* Immediately write the log record about the
+ increased tablespace size to disk, so that it
+ is durable even if mysqld were to crash
+ right away */
+ log_buffer_flush_to_disk();
+ }
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ mtr.start();
+ buf_block_t* block = buf_page_get(page_id_t(0, 0), 0,
+ RW_S_LATCH, &mtr);
+ ut_ad(mach_read_from_4(FSP_SIZE + FSP_HEADER_OFFSET
+ + block->frame)
+ == fil_system.sys_space->size_in_header);
+ mtr.commit();
+ }
+#endif
+ const ulint tablespace_size_in_header
+ = fil_system.sys_space->size_in_header;
+ const ulint sum_of_data_file_sizes
+ = srv_sys_space.get_sum_of_sizes();
+ /* Compare the system tablespace file size to what is
+ stored in FSP_SIZE. In srv_sys_space.open_or_create()
+ we already checked that the file sizes match the
+ innodb_data_file_path specification. */
+ if (srv_read_only_mode
+ || sum_of_data_file_sizes == tablespace_size_in_header) {
+ /* Do not complain about the size. */
+ } else if (!srv_sys_space.can_auto_extend_last_file()
+ || sum_of_data_file_sizes
+ < tablespace_size_in_header) {
+ ib::error() << "Tablespace size stored in header is "
+ << tablespace_size_in_header
+ << " pages, but the sum of data file sizes is "
+ << sum_of_data_file_sizes << " pages";
+
+ if (srv_force_recovery == 0
+ && sum_of_data_file_sizes
+ < tablespace_size_in_header) {
+ ib::error() <<
+ "Cannot start InnoDB. The tail of"
+ " the system tablespace is"
+ " missing. Have you edited"
+ " innodb_data_file_path in my.cnf"
+ " in an inappropriate way, removing"
+ " data files from there?"
+ " You can set innodb_force_recovery=1"
+ " in my.cnf to force"
+ " a startup if you are trying to"
+ " recover a badly corrupt database.";
+
+ return(srv_init_abort(DB_ERROR));
+ }
+ }
+
+ recv_sys.debug_free();
+
+ if (srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_EXPORT) {
+ /* After applying the redo log from
+ SRV_OPERATION_BACKUP, flush the changes
+ to the data files and truncate or delete the log.
+ Unless --export is specified, no further change to
+ InnoDB files is needed. */
+ ut_ad(srv_force_recovery <= SRV_FORCE_IGNORE_CORRUPT);
+ ut_ad(recv_no_log_write);
+ err = fil_write_flushed_lsn(log_sys.get_lsn());
+ DBUG_ASSERT(!buf_pool.any_io_pending());
+ log_sys.log.close_file();
+ if (err == DB_SUCCESS) {
+ bool trunc = srv_operation
+ == SRV_OPERATION_RESTORE;
+ if (!trunc) {
+ delete_log_file("0");
+ } else {
+ auto logfile0 = get_log_file_path();
+ /* Truncate the first log file. */
+ fclose(fopen(logfile0.c_str(), "w"));
+ }
+ }
+ return(err);
+ }
+
+ /* Upgrade or resize or rebuild the redo logs before
+ generating any dirty pages, so that the old redo log
+ file will not be written to. */
+
+ if (srv_force_recovery == SRV_FORCE_NO_LOG_REDO) {
+ /* Completely ignore the redo log. */
+ } else if (srv_read_only_mode) {
+ /* Leave the redo log alone. */
+ } else if (srv_log_file_size_requested == srv_log_file_size
+ && srv_log_file_found
+ && log_sys.log.format
+ == (srv_encrypt_log
+ ? log_t::FORMAT_ENC_10_5
+ : log_t::FORMAT_10_5)
+ && log_sys.log.subformat == 2) {
+ /* No need to add or remove encryption,
+ upgrade, downgrade, or resize. */
+ } else {
+ /* Prepare to delete the old redo log file */
+ flushed_lsn = srv_prepare_to_delete_redo_log_file(
+ srv_log_file_found);
+
+ DBUG_EXECUTE_IF("innodb_log_abort_1",
+ return(srv_init_abort(DB_ERROR)););
+ /* Prohibit redo log writes from any other
+ threads until creating a log checkpoint at the
+ end of create_log_file(). */
+ ut_d(recv_no_log_write = true);
+ DBUG_ASSERT(!buf_pool.any_io_pending());
+
+ DBUG_EXECUTE_IF("innodb_log_abort_3",
+ return(srv_init_abort(DB_ERROR)););
+ DBUG_PRINT("ib_log", ("After innodb_log_abort_3"));
+
+ /* Stamp the LSN to the data files. */
+ err = fil_write_flushed_lsn(flushed_lsn);
+
+ DBUG_EXECUTE_IF("innodb_log_abort_4", err = DB_ERROR;);
+ DBUG_PRINT("ib_log", ("After innodb_log_abort_4"));
+
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(err));
+ }
+
+ /* Close the redo log file, so that we can replace it */
+ log_sys.log.close_file();
+
+ DBUG_EXECUTE_IF("innodb_log_abort_5",
+ return(srv_init_abort(DB_ERROR)););
+ DBUG_PRINT("ib_log", ("After innodb_log_abort_5"));
+
+ ib::info()
+ << "Starting to delete and rewrite log file.";
+
+ srv_log_file_size = srv_log_file_size_requested;
+
+ err = create_log_file(false, flushed_lsn, logfile0);
+
+ if (err == DB_SUCCESS) {
+ err = create_log_file_rename(flushed_lsn,
+ logfile0);
+ }
+
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(err));
+ }
+ }
+ }
+
+ ut_ad(err == DB_SUCCESS);
+ ut_a(sum_of_new_sizes != ULINT_UNDEFINED);
+
+ /* Create the doublewrite buffer to a new tablespace */
+ if (!srv_read_only_mode && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
+ && !buf_dblwr.create()) {
+ return(srv_init_abort(DB_ERROR));
+ }
+
+ /* Here the double write buffer has already been created and so
+ any new rollback segments will be allocated after the double
+ write buffer. The default segment should already exist.
+ We create the new segments only if it's a new database or
+ the database was shutdown cleanly. */
+
+ /* Note: When creating the extra rollback segments during an upgrade
+ we violate the latching order, even if the change buffer is empty.
+ We make an exception in sync0sync.cc and check srv_is_being_started
+ for that violation. It cannot create a deadlock, because we are still
+ running essentially in single-threaded mode. Only the IO threads
+ should be running at this stage. */
+
+ if (!trx_sys_create_rsegs()) {
+ return(srv_init_abort(DB_ERROR));
+ }
+
+ if (!create_new_db) {
+ ut_ad(high_level_read_only
+ || srv_force_recovery <= SRV_FORCE_NO_IBUF_MERGE);
+
+ /* Validate a few system page types that were left
+ uninitialized before MySQL or MariaDB 5.5. */
+ if (!high_level_read_only
+ && !fil_system.sys_space->full_crc32()) {
+ buf_block_t* block;
+ mtr.start();
+ /* Bitmap page types will be reset in
+ buf_dblwr_check_block() without redo logging. */
+ block = buf_page_get(
+ page_id_t(IBUF_SPACE_ID,
+ FSP_IBUF_HEADER_PAGE_NO),
+ 0, RW_X_LATCH, &mtr);
+ fil_block_check_type(*block, FIL_PAGE_TYPE_SYS, &mtr);
+ /* Already MySQL 3.23.53 initialized
+ FSP_IBUF_TREE_ROOT_PAGE_NO to
+ FIL_PAGE_INDEX. No need to reset that one. */
+ block = buf_page_get(
+ page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+ 0, RW_X_LATCH, &mtr);
+ fil_block_check_type(*block, FIL_PAGE_TYPE_TRX_SYS,
+ &mtr);
+ block = buf_page_get(
+ page_id_t(TRX_SYS_SPACE,
+ FSP_FIRST_RSEG_PAGE_NO),
+ 0, RW_X_LATCH, &mtr);
+ fil_block_check_type(*block, FIL_PAGE_TYPE_SYS, &mtr);
+ block = buf_page_get(
+ page_id_t(TRX_SYS_SPACE, FSP_DICT_HDR_PAGE_NO),
+ 0, RW_X_LATCH, &mtr);
+ fil_block_check_type(*block, FIL_PAGE_TYPE_SYS, &mtr);
+ mtr.commit();
+ }
+
+ /* Roll back any recovered data dictionary
+ transactions, so that the data dictionary tables will
+ be free of any locks. The data dictionary latch
+ should guarantee that there is at most one data
+ dictionary transaction active at a time. */
+ if (!high_level_read_only
+ && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
+ /* If the following call is ever removed, the
+ first-time ha_innobase::open() must hold (or
+ acquire and release) a table lock that
+ conflicts with trx_resurrect_table_locks(), to
+ ensure that any recovered incomplete ALTER
+ TABLE will have been rolled back. Otherwise,
+ dict_table_t::instant could be cleared by
+ rollback invoking
+ dict_index_t::clear_instant_alter() while open
+ table handles exist in client connections. */
+ trx_rollback_recovered(false);
+ }
+
+ /* FIXME: Skip the following if srv_read_only_mode,
+ while avoiding "Allocated tablespace ID" warnings. */
+ if (srv_force_recovery <= SRV_FORCE_NO_IBUF_MERGE) {
+ /* Open or Create SYS_TABLESPACES and SYS_DATAFILES
+ so that tablespace names and other metadata can be
+ found. */
+ err = dict_create_or_check_sys_tablespace();
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(err));
+ }
+
+ /* The following call is necessary for the insert
+ buffer to work with multiple tablespaces. We must
+ know the mapping between space id's and .ibd file
+ names.
+
+ In a crash recovery, we check that the info in data
+ dictionary is consistent with what we already know
+ about space id's from the calls to fil_ibd_load().
+
+ In a normal startup, we create the space objects for
+ every table in the InnoDB data dictionary that has
+ an .ibd file.
+
+ We also determine the maximum tablespace id used. */
+ dict_check_tablespaces_and_store_max_id();
+ }
+
+ if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
+ && !srv_read_only_mode) {
+ /* Drop partially created indexes. */
+ row_merge_drop_temp_indexes();
+ /* Drop garbage tables. */
+ row_mysql_drop_garbage_tables();
+
+ /* Drop any auxiliary tables that were not
+ dropped when the parent table was
+ dropped. This can happen if the parent table
+ was dropped but the server crashed before the
+ auxiliary tables were dropped. */
+ fts_drop_orphaned_tables();
+
+ /* Rollback incomplete non-DDL transactions */
+ trx_rollback_is_active = true;
+ os_thread_create(trx_rollback_all_recovered);
+ }
+ }
+
+ srv_startup_is_before_trx_rollback_phase = false;
+
+ if (!srv_read_only_mode) {
+ /* timer task which watches the timeouts
+ for lock waits */
+ lock_sys.timeout_timer.reset(srv_thread_pool->create_timer(
+ lock_wait_timeout_task));
+
+ DBUG_EXECUTE_IF("innodb_skip_monitors", goto skip_monitors;);
+ /* Create the task which warns of long semaphore waits */
+ srv_start_periodic_timer(srv_monitor_timer, srv_monitor_task,
+ SRV_MONITOR_INTERVAL);
+
+#ifndef DBUG_OFF
+skip_monitors:
+#endif
+ ut_ad(srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN
+ || !purge_sys.enabled());
+
+ if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {
+ srv_undo_sources = true;
+ /* Create the dict stats gathering task */
+ dict_stats_start();
+ /* Create the thread that will optimize the
+ FULLTEXT search index subsystem. */
+ fts_optimize_init();
+ }
+ }
+
+ /* Create the SYS_FOREIGN and SYS_FOREIGN_COLS system tables */
+ err = dict_create_or_check_foreign_constraint_tables();
+ if (err == DB_SUCCESS) {
+ err = dict_create_or_check_sys_tablespace();
+ if (err == DB_SUCCESS) {
+ err = dict_create_or_check_sys_virtual();
+ }
+ }
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ case DB_READ_ONLY:
+ if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) {
+ break;
+ }
+ ib::error() << "Cannot create system tables in read-only mode";
+ /* fall through */
+ default:
+ return(srv_init_abort(err));
+ }
+
+ if (!srv_read_only_mode && srv_operation == SRV_OPERATION_NORMAL) {
+ /* Initialize the innodb_temporary tablespace and keep
+ it open until shutdown. */
+ err = srv_open_tmp_tablespace(create_new_db);
+
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(err));
+ }
+
+ trx_temp_rseg_create();
+
+ if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {
+ srv_start_periodic_timer(srv_master_timer, srv_master_callback, 1000);
+ }
+ }
+
+ if (!srv_read_only_mode && srv_operation == SRV_OPERATION_NORMAL
+ && srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {
+ srv_init_purge_tasks();
+ purge_sys.coordinator_startup();
+ srv_wake_purge_thread_if_not_active();
+ }
+
+ srv_is_being_started = false;
+
+ if (srv_print_verbose_log) {
+ ib::info() << INNODB_VERSION_STR
+ << " started; log sequence number "
+ << recv_sys.recovered_lsn
+ << "; transaction id " << trx_sys.get_max_trx_id();
+ }
+
+ if (srv_force_recovery == 0) {
+ /* In the change buffer we may have even bigger tablespace
+ id's, because we may have dropped those tablespaces, but
+ the buffered records have not been cleaned yet. */
+ ibuf_update_max_tablespace_id();
+ }
+
+ if (!srv_read_only_mode) {
+ if (create_new_db) {
+ srv_buffer_pool_load_at_startup = FALSE;
+ }
+
+#ifdef WITH_WSREP
+ /*
+ Create the dump/load thread only when not running with
+ --wsrep-recover.
+ */
+ if (!get_wsrep_recovery()) {
+#endif /* WITH_WSREP */
+
+ /* Start buffer pool dump/load task */
+ buf_load_at_startup();
+
+#ifdef WITH_WSREP
+ } else {
+ ib::warn() <<
+ "Skipping buffer pool dump/restore during "
+ "wsrep recovery.";
+ }
+#endif /* WITH_WSREP */
+
+ /* Create thread(s) that handles key rotation. This is
+ needed already here as log_preflush_pool_modified_pages
+ will flush dirty pages and that might need e.g.
+ fil_crypt_threads_event. */
+ fil_system_enter();
+ fil_crypt_threads_init();
+ fil_system_exit();
+
+ /* Initialize online defragmentation. */
+ btr_defragment_init();
+
+ srv_started_redo = true;
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Shut down background threads that can generate undo log. */
+void srv_shutdown_bg_undo_sources()
+{
+ srv_shutdown_state = SRV_SHUTDOWN_INITIATED;
+
+ if (srv_undo_sources) {
+ ut_ad(!srv_read_only_mode);
+ fts_optimize_shutdown();
+ dict_stats_shutdown();
+ while (row_get_background_drop_list_len_low()) {
+ srv_inc_activity_count();
+ os_thread_yield();
+ }
+ srv_undo_sources = false;
+ }
+}
+
+/**
+ Shutdown purge to make sure that there is no possibility that we call any
+ plugin code (e.g., audit) inside virtual column computation.
+*/
+void innodb_preshutdown()
+{
+ static bool first_time= true;
+ if (!first_time)
+ return;
+ first_time= false;
+
+ if (srv_read_only_mode)
+ return;
+ if (!srv_fast_shutdown && srv_operation == SRV_OPERATION_NORMAL)
+ {
+ /* Because a slow shutdown must empty the change buffer, we had
+ better prevent any further changes from being buffered. */
+ innodb_change_buffering= 0;
+
+ if (trx_sys.is_initialised())
+ while (trx_sys.any_active_transactions())
+ os_thread_sleep(1000);
+ }
+ srv_shutdown_bg_undo_sources();
+ srv_purge_shutdown();
+
+ if (srv_n_fil_crypt_threads)
+ fil_crypt_set_thread_cnt(0);
+}
+
+
+/** Shut down InnoDB. */
+void innodb_shutdown()
+{
+ innodb_preshutdown();
+ ut_ad(!srv_undo_sources);
+ switch (srv_operation) {
+ case SRV_OPERATION_BACKUP:
+ case SRV_OPERATION_RESTORE_DELTA:
+ break;
+ case SRV_OPERATION_RESTORE:
+ case SRV_OPERATION_RESTORE_EXPORT:
+ srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
+ if (!buf_page_cleaner_is_active) {
+ break;
+ }
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ while (buf_page_cleaner_is_active) {
+ pthread_cond_signal(&buf_pool.do_flush_list);
+ my_cond_wait(&buf_pool.done_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex);
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ break;
+ case SRV_OPERATION_NORMAL:
+ /* Shut down the persistent files. */
+ logs_empty_and_mark_files_at_shutdown();
+ }
+
+ os_aio_free();
+ fil_space_t::close_all();
+ /* Exit any remaining threads. */
+ ut_ad(!buf_page_cleaner_is_active);
+ srv_shutdown_threads();
+
+ if (srv_monitor_file) {
+ my_fclose(srv_monitor_file, MYF(MY_WME));
+ srv_monitor_file = 0;
+ if (srv_monitor_file_name) {
+ unlink(srv_monitor_file_name);
+ ut_free(srv_monitor_file_name);
+ }
+ }
+
+ if (srv_misc_tmpfile) {
+ my_fclose(srv_misc_tmpfile, MYF(MY_WME));
+ srv_misc_tmpfile = 0;
+ }
+
+ ut_ad(dict_sys.is_initialised() || !srv_was_started);
+ ut_ad(trx_sys.is_initialised() || !srv_was_started);
+ ut_ad(buf_dblwr.is_initialised() || !srv_was_started
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
+ ut_ad(lock_sys.is_initialised() || !srv_was_started);
+ ut_ad(log_sys.is_initialised() || !srv_was_started);
+ ut_ad(ibuf.index || !srv_was_started);
+
+ dict_stats_deinit();
+
+ if (srv_started_redo) {
+ ut_ad(!srv_read_only_mode);
+ /* srv_shutdown_bg_undo_sources() already invoked
+ fts_optimize_shutdown(); dict_stats_shutdown(); */
+
+ fil_crypt_threads_cleanup();
+ btr_defragment_shutdown();
+ }
+
+ /* This must be disabled before closing the buffer pool
+ and closing the data dictionary. */
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (dict_sys.is_initialised()) {
+ btr_search_disable();
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+ ibuf_close();
+ log_sys.close();
+ purge_sys.close();
+ trx_sys.close();
+ buf_dblwr.close();
+ lock_sys.close();
+ trx_pool_close();
+
+ if (!srv_read_only_mode) {
+ mutex_free(&srv_monitor_file_mutex);
+ mutex_free(&srv_misc_tmpfile_mutex);
+ }
+
+ dict_sys.close();
+ btr_search_sys_free();
+ row_mysql_close();
+ srv_free();
+ fil_system.close();
+ pars_lexer_close();
+ recv_sys.close();
+
+ ut_ad(buf_pool.is_initialised() || !srv_was_started);
+ buf_pool.close();
+ sync_check_close();
+
+ srv_sys_space.shutdown();
+ if (srv_tmp_space.get_sanity_check_status()) {
+ if (fil_system.temp_space) {
+ fil_system.temp_space->close();
+ }
+ srv_tmp_space.delete_files();
+ }
+ srv_tmp_space.shutdown();
+
+#ifdef WITH_INNODB_DISALLOW_WRITES
+ os_event_destroy(srv_allow_writes_event);
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+
+ if (srv_was_started && srv_print_verbose_log) {
+ ib::info() << "Shutdown completed; log sequence number "
+ << srv_shutdown_lsn
+ << "; transaction id " << trx_sys.get_max_trx_id();
+ }
+ srv_thread_pool_end();
+ srv_started_redo = false;
+ srv_was_started = false;
+ srv_start_has_been_called = false;
+}
+
+/** Get the meta-data filename from the table name for a
+single-table tablespace.
+@param[in] table table object
+@param[out] filename filename
+@param[in] max_len filename max length */
+void
+srv_get_meta_data_filename(
+ dict_table_t* table,
+ char* filename,
+ ulint max_len)
+{
+ ulint len;
+ char* path;
+
+ /* Make sure the data_dir_path is set. */
+ dict_get_and_save_data_dir_path(table, false);
+
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ ut_a(table->data_dir_path);
+
+ path = fil_make_filepath(
+ table->data_dir_path, table->name.m_name, CFG, true);
+ } else {
+ path = fil_make_filepath(NULL, table->name.m_name, CFG, false);
+ }
+
+ ut_a(path);
+ len = strlen(path);
+ ut_a(max_len >= len);
+
+ strcpy(filename, path);
+
+ ut_free(path);
+}
diff --git a/storage/innobase/sync/sync0arr.cc b/storage/innobase/sync/sync0arr.cc
new file mode 100644
index 00000000..5f39325d
--- /dev/null
+++ b/storage/innobase/sync/sync0arr.cc
@@ -0,0 +1,1296 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2013, 2020, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0arr.cc
+The wait array used in synchronization primitives
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0arr.h"
+#include <mysqld_error.h>
+#include <mysql/plugin.h>
+#include <hash.h>
+#include <myisampack.h>
+#include <sql_acl.h>
+#include <mysys_err.h>
+#include <my_sys.h>
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "i_s.h"
+#include <sql_plugin.h>
+#include <innodb_priv.h>
+
+#include "lock0lock.h"
+#include "sync0rw.h"
+
+/*
+ WAIT ARRAY
+ ==========
+
+The wait array consists of cells each of which has an an event object created
+for it. The threads waiting for a mutex, for example, can reserve a cell
+in the array and suspend themselves to wait for the event to become signaled.
+When using the wait array, remember to make sure that some thread holding
+the synchronization object will eventually know that there is a waiter in
+the array and signal the object, to prevent infinite wait. Why we chose
+to implement a wait array? First, to make mutexes fast, we had to code
+our own implementation of them, which only in usually uncommon cases
+resorts to using slow operating system primitives. Then we had the choice of
+assigning a unique OS event for each mutex, which would be simpler, or
+using a global wait array. In some operating systems, the global wait
+array solution is more efficient and flexible, because we can do with
+a very small number of OS events, say 200. In NT 3.51, allocating events
+seems to be a quadratic algorithm, because 10 000 events are created fast,
+but 100 000 events takes a couple of minutes to create.
+
+As of 5.0.30 the above mentioned design was changed. Since operating systems
+can now handle millions of wait events efficiently, we no longer have the
+concept of each cell of the wait array having one event. Instead, the event
+that a thread wants to wait on is embedded in the wait object (mutex or
+rw_lock). We still keep the global wait array for the sake of diagnostics
+and also to avoid infinite wait: the error_monitor thread scans the global
+wait array to signal any waiting threads that have missed the signal. */
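+
+/* A minimal sketch of the wait protocol, using the functions defined in
+this file (the real callers live in the mutex and rw-lock code;
+latch_is_still_locked() is a hypothetical placeholder for the caller's
+own re-check of the latch state after reserving the cell):
+
+	sync_cell_t*	cell = sync_array_reserve_cell(
+		arr, latch, SYNC_MUTEX, __FILE__, __LINE__);
+
+	if (latch_is_still_locked(latch)) {
+		sync_array_wait_event(arr, cell);	// also frees the cell
+	} else {
+		sync_array_free_cell(arr, cell);
+	}
+*/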
+
+typedef TTASEventMutex<GenericPolicy> WaitMutex;
+
+/** The latch types that use the sync array. */
+union sync_object_t {
+
+ /** RW lock instance */
+ rw_lock_t* lock;
+
+ /** Mutex instance */
+ WaitMutex* mutex;
+};
+
+/** A cell where an individual thread may wait suspended until a resource
+is released. The suspending is implemented using an operating system
+event semaphore. */
+
+struct sync_cell_t {
+ sync_object_t latch; /*!< pointer to the object the
+ thread is waiting for; if NULL
+ the cell is free for use */
+ ulint request_type; /*!< lock type requested on the
+ object */
+ const char* file; /*!< in debug version file where
+ requested */
+ ulint line; /*!< in debug version line where
+ requested, or ULINT_UNDEFINED */
+ os_thread_id_t thread_id; /*!< thread id of this waiting
+ thread */
+	bool		waiting;	/*!< TRUE if the thread has already
+					called sync_array_wait_event
+					on this cell */
+ int64_t signal_count; /*!< We capture the signal_count
+ of the latch when we
+ reset the event. This value is
+ then passed on to os_event_wait
+ and we wait only if the event
+ has not been signalled in the
+ period between the reset and
+ wait call. */
+ /** time(NULL) when the wait cell was reserved.
+ FIXME: sync_array_print_long_waits_low() may display bogus
+ warnings when the system time is adjusted to the past! */
+ time_t reservation_time;
+};
+
+/* NOTE: It is allowed for a thread to wait for an event allocated for
+the array without owning the protecting mutex (depending on the case:
+OS or database mutex), but all changes (set or reset) to the state of
+the event must be made while owning the mutex. */
+
+/** Synchronization array */
+struct sync_array_t {
+
+ /** Constructor
+ Creates a synchronization wait array. It is protected by a mutex
+ which is automatically reserved when the functions operating on it
+ are called.
+ @param[in] num_cells Number of cells to create */
+ sync_array_t(ulint num_cells)
+ UNIV_NOTHROW;
+
+ /** Destructor */
+ ~sync_array_t()
+ UNIV_NOTHROW;
+
+ ulint n_reserved; /*!< number of currently reserved
+ cells in the wait array */
+ ulint n_cells; /*!< number of cells in the
+ wait array */
+ sync_cell_t* array; /*!< pointer to wait array */
+ SysMutex mutex; /*!< System mutex protecting the
+ data structure. As this data
+ structure is used in constructing
+ the database mutex, to prevent
+ infinite recursion in implementation,
+ we fall back to an OS mutex. */
+ ulint res_count; /*!< count of cell reservations
+ since creation of the array */
+ ulint next_free_slot; /*!< the next free cell in the array */
+	ulint		first_free_slot;/*!< head of the list of freed
+					slots; ULINT_UNDEFINED if empty */
+};
+
+/** User configured sync array size */
+ulong srv_sync_array_size = 1;
+
+/** Locally stored copy of srv_sync_array_size */
+ulint sync_array_size;
+
+/** The global array of wait cells for implementation of the database's own
+mutexes and read-write locks */
+sync_array_t** sync_wait_array;
+
+/** count of how many times an object has been signalled */
+ulint sg_count;
+
+#define sync_array_exit(a) mutex_exit(&(a)->mutex)
+#define sync_array_enter(a) mutex_enter(&(a)->mutex)
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+This function is called only in the debug version. Detects a deadlock
+of one or more threads because of waits of semaphores.
+@return TRUE if deadlock detected */
+static
+bool
+sync_array_detect_deadlock(
+/*=======================*/
+ sync_array_t* arr, /*!< in: wait array; NOTE! the caller must
+ own the mutex to array */
+ sync_cell_t* start, /*!< in: cell where recursive search started */
+ sync_cell_t* cell, /*!< in: cell to search */
+ ulint depth); /*!< in: recursion depth */
+#endif /* UNIV_DEBUG */
+
+/** Constructor
+Creates a synchronization wait array. It is protected by a mutex
+which is automatically reserved when the functions operating on it
+are called.
+@param[in] num_cells Number of cells to create */
+sync_array_t::sync_array_t(ulint num_cells)
+ UNIV_NOTHROW
+ :
+ n_reserved(),
+ n_cells(num_cells),
+ array(UT_NEW_ARRAY_NOKEY(sync_cell_t, num_cells)),
+ mutex(),
+ res_count(),
+ next_free_slot(),
+ first_free_slot(ULINT_UNDEFINED)
+{
+ ut_a(num_cells > 0);
+
+ memset(array, 0x0, sizeof(sync_cell_t) * n_cells);
+
+ /* Then create the mutex to protect the wait array */
+ mutex_create(LATCH_ID_SYNC_ARRAY_MUTEX, &mutex);
+}
+
+/** Validate the integrity of the wait array. Check
+that the number of reserved cells equals the count variable.
+@param[in,out] arr sync wait array */
+static
+void
+sync_array_validate(sync_array_t* arr)
+{
+ ulint i;
+ ulint count = 0;
+
+ sync_array_enter(arr);
+
+ for (i = 0; i < arr->n_cells; i++) {
+ sync_cell_t* cell;
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->latch.mutex != NULL) {
+ count++;
+ }
+ }
+
+ ut_a(count == arr->n_reserved);
+
+ sync_array_exit(arr);
+}
+
+/** Destructor */
+sync_array_t::~sync_array_t()
+ UNIV_NOTHROW
+{
+ ut_a(n_reserved == 0);
+
+ sync_array_validate(this);
+
+ /* Release the mutex protecting the wait array */
+
+ mutex_free(&mutex);
+
+ UT_DELETE_ARRAY(array);
+}
+
+/*****************************************************************//**
+Gets the nth cell in array.
+@return cell */
+UNIV_INTERN
+sync_cell_t*
+sync_array_get_nth_cell(
+/*====================*/
+ sync_array_t* arr, /*!< in: sync array */
+ ulint n) /*!< in: index */
+{
+ ut_a(n < arr->n_cells);
+
+ return(arr->array + n);
+}
+
+/******************************************************************//**
+Frees the resources in a wait array. */
+static
+void
+sync_array_free(
+/*============*/
+ sync_array_t* arr) /*!< in, own: sync wait array */
+{
+ UT_DELETE(arr);
+}
+
+/*******************************************************************//**
+Returns the event that the thread owning the cell waits for. */
+static
+os_event_t
+sync_cell_get_event(
+/*================*/
+ sync_cell_t* cell) /*!< in: non-empty sync array cell */
+{
+ switch(cell->request_type) {
+ case SYNC_MUTEX:
+ return(cell->latch.mutex->event());
+ case RW_LOCK_X_WAIT:
+ return(cell->latch.lock->wait_ex_event);
+ default:
+ return(cell->latch.lock->event);
+ }
+}
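+
+/* The event to wait on thus lives in the latch itself: a mutex waiter
+uses the mutex's own event, an exclusive waiter uses the rw-lock's
+dedicated wait_ex_event, and all other rw-lock waiters share the
+lock's event. */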
+
+/******************************************************************//**
+Reserves a wait array cell for waiting for an object.
+The event of the cell is reset to nonsignalled state.
+@return sync cell to wait on */
+sync_cell_t*
+sync_array_reserve_cell(
+/*====================*/
+ sync_array_t* arr, /*!< in: wait array */
+ void* object, /*!< in: pointer to the object to wait for */
+ ulint type, /*!< in: lock request type */
+ const char* file, /*!< in: file where requested */
+ unsigned line) /*!< in: line where requested */
+{
+ sync_cell_t* cell;
+
+ sync_array_enter(arr);
+
+ if (arr->first_free_slot != ULINT_UNDEFINED) {
+ /* Try and find a slot in the free list */
+ ut_ad(arr->first_free_slot < arr->next_free_slot);
+ cell = sync_array_get_nth_cell(arr, arr->first_free_slot);
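+		/* The free list is threaded through the cells' line
+		fields: a freed cell's line stores the index of the
+		next free slot (see sync_array_free_cell()). */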
+ arr->first_free_slot = cell->line;
+ } else if (arr->next_free_slot < arr->n_cells) {
+ /* Try and find a slot after the currently allocated slots */
+ cell = sync_array_get_nth_cell(arr, arr->next_free_slot);
+ ++arr->next_free_slot;
+ } else {
+ sync_array_exit(arr);
+
+		// We return NULL; if there is more than one
+		// sync array, the caller may try another instance.
+ return(NULL);
+ }
+
+ ++arr->res_count;
+
+ ut_ad(arr->n_reserved < arr->n_cells);
+ ut_ad(arr->next_free_slot <= arr->n_cells);
+
+ ++arr->n_reserved;
+
+ /* Reserve the cell. */
+ ut_ad(cell->latch.mutex == NULL);
+
+ cell->request_type = type;
+
+ if (cell->request_type == SYNC_MUTEX) {
+ cell->latch.mutex = reinterpret_cast<WaitMutex*>(object);
+ } else {
+ cell->latch.lock = reinterpret_cast<rw_lock_t*>(object);
+ }
+
+ cell->waiting = false;
+
+ cell->file = file;
+ cell->line = line;
+
+ sync_array_exit(arr);
+
+ cell->thread_id = os_thread_get_curr_id();
+
+ cell->reservation_time = time(NULL);
+
+ /* Make sure the event is reset and also store the value of
+ signal_count at which the event was reset. */
+ os_event_t event = sync_cell_get_event(cell);
+ cell->signal_count = os_event_reset(event);
+
+ return(cell);
+}
+
+/******************************************************************//**
+Frees the cell. NOTE! sync_array_wait_event frees the cell
+automatically! */
+void
+sync_array_free_cell(
+/*=================*/
+ sync_array_t* arr, /*!< in: wait array */
+ sync_cell_t*& cell) /*!< in/out: the cell in the array */
+{
+ sync_array_enter(arr);
+
+ ut_a(cell->latch.mutex != NULL);
+
+ cell->waiting = false;
+ cell->signal_count = 0;
+ cell->latch.mutex = NULL;
+
+ /* Setup the list of free slots in the array */
+ cell->line = arr->first_free_slot;
+
+ arr->first_free_slot = cell - arr->array;
+
+ ut_a(arr->n_reserved > 0);
+ arr->n_reserved--;
+
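+	/* If more than half of the slots have been handed out but
+	none is reserved any more, restart allocation from slot 0
+	and discard the free list. */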
+ if (arr->next_free_slot > arr->n_cells / 2 && arr->n_reserved == 0) {
+#ifdef UNIV_DEBUG
+ for (ulint i = 0; i < arr->next_free_slot; ++i) {
+ cell = sync_array_get_nth_cell(arr, i);
+
+ ut_ad(!cell->waiting);
+ ut_ad(cell->latch.mutex == 0);
+ ut_ad(cell->signal_count == 0);
+ }
+#endif /* UNIV_DEBUG */
+ arr->next_free_slot = 0;
+ arr->first_free_slot = ULINT_UNDEFINED;
+ }
+ sync_array_exit(arr);
+
+ cell = 0;
+}
+
+/******************************************************************//**
+This function should be called when a thread starts to wait on
+a wait array cell. In the debug version this function checks
+if the wait for a semaphore will result in a deadlock, in which
+case prints info and asserts. */
+void
+sync_array_wait_event(
+/*==================*/
+ sync_array_t* arr, /*!< in: wait array */
+ sync_cell_t*& cell) /*!< in: index of the reserved cell */
+{
+ sync_array_enter(arr);
+
+ ut_ad(!cell->waiting);
+ ut_ad(cell->latch.mutex);
+ ut_ad(os_thread_get_curr_id() == cell->thread_id);
+
+ cell->waiting = true;
+
+#ifdef UNIV_DEBUG
+
+	/* We use a simple enter on the mutex below, because if
+	we cannot acquire it at once, mutex_enter would recursively
+	call the sync_array routines, leading to trouble.
+	rw_lock_debug_mutex freezes the debug lists. */
+
+ rw_lock_debug_mutex_enter();
+
+ if (sync_array_detect_deadlock(arr, cell, cell, 0)) {
+
+ ib::fatal() << "########################################"
+ " Deadlock Detected!";
+ }
+
+ rw_lock_debug_mutex_exit();
+#endif /* UNIV_DEBUG */
+ sync_array_exit(arr);
+
+ tpool::tpool_wait_begin();
+ os_event_wait_low(sync_cell_get_event(cell), cell->signal_count);
+ tpool::tpool_wait_end();
+
+ sync_array_free_cell(arr, cell);
+
+ cell = 0;
+}
+
+/******************************************************************//**
+Reports info of a wait array cell. */
+static
+void
+sync_array_cell_print(
+/*==================*/
+ FILE* file, /*!< in: file where to print */
+ sync_cell_t* cell) /*!< in: sync cell */
+{
+ rw_lock_t* rwlock;
+ ulint type;
+ ulint writer;
+
+ type = cell->request_type;
+
+ fprintf(file,
+ "--Thread " ULINTPF " has waited at %s line " ULINTPF
+ " for %.2f seconds the semaphore:\n",
+ ulint(cell->thread_id),
+ innobase_basename(cell->file), cell->line,
+ difftime(time(NULL), cell->reservation_time));
+
+ switch (type) {
+ default:
+ ut_error;
+ case RW_LOCK_X:
+ case RW_LOCK_X_WAIT:
+ case RW_LOCK_SX:
+ case RW_LOCK_S:
+ fputs(type == RW_LOCK_X ? "X-lock on"
+ : type == RW_LOCK_X_WAIT ? "X-lock (wait_ex) on"
+ : type == RW_LOCK_SX ? "SX-lock on"
+ : "S-lock on", file);
+
+ rwlock = cell->latch.lock;
+
+ if (rwlock) {
+ fprintf(file,
+ " RW-latch at %p created in file %s line %u\n",
+ (void*) rwlock, innobase_basename(rwlock->cfile_name),
+ rwlock->cline);
+
+ writer = rw_lock_get_writer(rwlock);
+
+ if (writer != RW_LOCK_NOT_LOCKED) {
+
+ fprintf(file,
+ "a writer (thread id " ULINTPF ") has"
+ " reserved it in mode %s",
+ ulint(rwlock->writer_thread),
+ writer == RW_LOCK_X ? " exclusive\n"
+ : writer == RW_LOCK_SX ? " SX\n"
+ : " wait exclusive\n");
+ }
+
+ fprintf(file,
+ "number of readers " ULINTPF
+ ", waiters flag %d, "
+ "lock_word: %x\n"
+ "Last time write locked in file %s line %u"
+#if 0 /* JAN: TODO: FIX LATER */
+ "\nHolder thread " ULINTPF
+ " file %s line " ULINTPF
+#endif
+ "\n",
+ rw_lock_get_reader_count(rwlock),
+ uint32_t{rwlock->waiters},
+ int32_t{rwlock->lock_word},
+ innobase_basename(rwlock->last_x_file_name),
+ rwlock->last_x_line
+#if 0 /* JAN: TODO: FIX LATER */
+ , ulint(rwlock->thread_id),
+ innobase_basename(rwlock->file_name),
+ rwlock->line
+#endif
+ );
+ }
+ break;
+ case SYNC_MUTEX:
+ WaitMutex* mutex = cell->latch.mutex;
+ const WaitMutex::MutexPolicy& policy = mutex->policy();
+#ifdef UNIV_DEBUG
+ const char* name = policy.context.get_enter_filename();
+ if (name == NULL) {
+ /* The mutex might have been released. */
+ name = "NULL";
+ }
+#endif /* UNIV_DEBUG */
+
+ if (mutex) {
+ fprintf(file,
+ "Mutex at %p, %s, lock var %x\n"
+#ifdef UNIV_DEBUG
+ "Last time reserved in file %s line %u"
+#endif /* UNIV_DEBUG */
+ "\n",
+ (void*) mutex,
+ policy.to_string().c_str(),
+ mutex->state()
+#ifdef UNIV_DEBUG
+ ,name,
+ policy.context.get_enter_line()
+#endif /* UNIV_DEBUG */
+ );
+ }
+ break;
+ }
+
+ if (!cell->waiting) {
+ fputs("wait has ended\n", file);
+ }
+}
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Looks for a cell with the given thread id.
+@return pointer to cell or NULL if not found */
+static
+sync_cell_t*
+sync_array_find_thread(
+/*===================*/
+ sync_array_t* arr, /*!< in: wait array */
+ os_thread_id_t thread) /*!< in: thread id */
+{
+ ulint i;
+
+ for (i = 0; i < arr->n_cells; i++) {
+ sync_cell_t* cell;
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->latch.mutex != NULL
+ && os_thread_eq(cell->thread_id, thread)) {
+
+ return(cell); /* Found */
+ }
+ }
+
+ return(NULL); /* Not found */
+}
+
+/******************************************************************//**
+Recursion step for deadlock detection.
+@return TRUE if deadlock detected */
+static
+ibool
+sync_array_deadlock_step(
+/*=====================*/
+ sync_array_t* arr, /*!< in: wait array; NOTE! the caller must
+ own the mutex to array */
+ sync_cell_t* start, /*!< in: cell where recursive search
+ started */
+ os_thread_id_t thread, /*!< in: thread to look at */
+ ulint pass, /*!< in: pass value */
+ ulint depth) /*!< in: recursion depth */
+{
+ sync_cell_t* new_cell;
+
+ if (pass != 0) {
+		/* If pass != 0, then we do not know which threads are
+		responsible for releasing the lock, and no deadlock can
+		be detected. */
+
+ return(FALSE);
+ }
+
+ new_cell = sync_array_find_thread(arr, thread);
+
+ if (new_cell == start) {
+ /* Deadlock */
+ fputs("########################################\n"
+ "DEADLOCK of threads detected!\n", stderr);
+
+ return(TRUE);
+
+ } else if (new_cell) {
+ return(sync_array_detect_deadlock(
+ arr, start, new_cell, depth + 1));
+ }
+ return(FALSE);
+}
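+
+/* Worked example of the recursion above: thread A waits in cell CA for
+a latch held by thread B, while B waits in cell CB for a latch held by
+A. Starting from CA, sync_array_deadlock_step() looks up B's cell CB
+and recurses via sync_array_detect_deadlock(); that in turn looks up
+A's cell, finds new_cell == start, and reports the deadlock. */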
+
+/**
+Report an error to stderr.
+@param lock rw-lock instance
+@param debug rw-lock debug information
+@param cell thread context */
+static
+void
+sync_array_report_error(
+ rw_lock_t* lock,
+ rw_lock_debug_t* debug,
+ sync_cell_t* cell)
+{
+ fprintf(stderr, "rw-lock %p ", (void*) lock);
+ sync_array_cell_print(stderr, cell);
+ rw_lock_debug_print(stderr, debug);
+}
+
+/******************************************************************//**
+This function is called only in the debug version. Detects a deadlock
+of one or more threads because of waits of semaphores.
+@return TRUE if deadlock detected */
+static
+bool
+sync_array_detect_deadlock(
+/*=======================*/
+ sync_array_t* arr, /*!< in: wait array; NOTE! the caller must
+ own the mutex to array */
+ sync_cell_t* start, /*!< in: cell where recursive search started */
+ sync_cell_t* cell, /*!< in: cell to search */
+ ulint depth) /*!< in: recursion depth */
+{
+ rw_lock_t* lock;
+ os_thread_id_t thread;
+ ibool ret;
+ rw_lock_debug_t*debug;
+
+ ut_a(arr);
+ ut_a(start);
+ ut_a(cell);
+ ut_ad(cell->latch.mutex != 0);
+ ut_ad(os_thread_get_curr_id() == start->thread_id);
+ ut_ad(depth < 100);
+
+ depth++;
+
+ if (!cell->waiting) {
+ /* No deadlock here */
+ return(false);
+ }
+
+ switch (cell->request_type) {
+ case SYNC_MUTEX: {
+
+ WaitMutex* mutex = cell->latch.mutex;
+ const WaitMutex::MutexPolicy& policy = mutex->policy();
+
+ if (mutex->state() != MUTEX_STATE_UNLOCKED) {
+ thread = policy.context.get_thread_id();
+
+			/* Note that the thread id obtained above
+			may also be OS_THREAD_ID_UNDEFINED, because
+			the thread which held the mutex may not have
+			updated the value yet, or may already have
+			released the mutex: in this case no deadlock
+			can occur, as the wait array cannot contain
+			a thread with an undefined id. */
+ ret = sync_array_deadlock_step(
+ arr, start, thread, 0, depth);
+
+ if (ret) {
+ const char* name;
+
+ name = policy.context.get_enter_filename();
+
+ if (name == NULL) {
+ /* The mutex might have been
+ released. */
+ name = "NULL";
+ }
+
+ ib::info()
+ << "Mutex " << mutex << " owned by"
+ " thread " << thread
+ << " file " << name << " line "
+ << policy.context.get_enter_line();
+
+ sync_array_cell_print(stderr, cell);
+
+ return(true);
+ }
+ }
+
+ /* No deadlock */
+ return(false);
+ }
+
+ case RW_LOCK_X:
+ case RW_LOCK_X_WAIT:
+
+ lock = cell->latch.lock;
+
+ for (debug = UT_LIST_GET_FIRST(lock->debug_list);
+ debug != NULL;
+ debug = UT_LIST_GET_NEXT(list, debug)) {
+
+ thread = debug->thread_id;
+
+ switch (debug->lock_type) {
+ case RW_LOCK_X:
+ case RW_LOCK_SX:
+ case RW_LOCK_X_WAIT:
+ if (os_thread_eq(thread, cell->thread_id)) {
+ break;
+ }
+ /* fall through */
+ case RW_LOCK_S:
+
+			/* The (wait) x-lock request can block
+			indefinitely only if someone (possibly the
+			cell thread itself) is holding an s-lock, or
+			someone else (not the cell thread) is holding
+			a (wait) x-lock or sx-lock, and that holder
+			is blocked by the start thread */
+
+ ret = sync_array_deadlock_step(
+ arr, start, thread, debug->pass,
+ depth);
+
+ if (ret) {
+ sync_array_report_error(
+ lock, debug, cell);
+ rw_lock_debug_print(stderr, debug);
+ return(TRUE);
+ }
+ }
+ }
+
+ return(false);
+
+ case RW_LOCK_SX:
+
+ lock = cell->latch.lock;
+
+ for (debug = UT_LIST_GET_FIRST(lock->debug_list);
+ debug != 0;
+ debug = UT_LIST_GET_NEXT(list, debug)) {
+
+ thread = debug->thread_id;
+
+ switch (debug->lock_type) {
+ case RW_LOCK_X:
+ case RW_LOCK_SX:
+ case RW_LOCK_X_WAIT:
+
+ if (os_thread_eq(thread, cell->thread_id)) {
+ break;
+ }
+
+			/* The sx-lock request can block indefinitely
+			only if someone (possibly the cell thread
+			itself) is holding a (wait) x-lock or sx-lock,
+			and that holder is blocked by the start
+			thread */
+
+ ret = sync_array_deadlock_step(
+ arr, start, thread, debug->pass,
+ depth);
+
+ if (ret) {
+ sync_array_report_error(
+ lock, debug, cell);
+ return(TRUE);
+ }
+ }
+ }
+
+ return(false);
+
+ case RW_LOCK_S:
+
+ lock = cell->latch.lock;
+
+ for (debug = UT_LIST_GET_FIRST(lock->debug_list);
+ debug != 0;
+ debug = UT_LIST_GET_NEXT(list, debug)) {
+
+ thread = debug->thread_id;
+
+ if (debug->lock_type == RW_LOCK_X
+ || debug->lock_type == RW_LOCK_X_WAIT) {
+
+				/* The s-lock request can block
+				indefinitely only if someone (possibly
+				the cell thread itself) is holding a
+				(wait) x-lock, and that holder is
+				blocked by the start thread */
+
+ ret = sync_array_deadlock_step(
+ arr, start, thread, debug->pass,
+ depth);
+
+ if (ret) {
+ sync_array_report_error(
+ lock, debug, cell);
+ return(TRUE);
+ }
+ }
+ }
+
+ return(false);
+
+ default:
+ ut_error;
+ }
+
+ return(true);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return TRUE if fatal semaphore wait threshold was exceeded */
+static
+bool
+sync_array_print_long_waits_low(
+/*============================*/
+ sync_array_t* arr, /*!< in: sync array instance */
+ os_thread_id_t* waiter, /*!< out: longest waiting thread */
+ const void** sema, /*!< out: longest-waited-for semaphore */
+ ibool* noticed)/*!< out: TRUE if long wait noticed */
+{
+ double fatal_timeout = static_cast<double>(
+ srv_fatal_semaphore_wait_threshold);
+ ibool fatal = FALSE;
+ double longest_diff = 0;
+ ulint i;
+
+ /* For huge tables, skip the check during CHECK TABLE etc... */
+ if (btr_validate_index_running) {
+ return(false);
+ }
+
+#if defined HAVE_valgrind && !__has_feature(memory_sanitizer)
+	/* Increase the timeouts if running under valgrind because it executes
+	extremely slowly. HAVE_valgrind does not necessarily mean that
+	we are running under valgrind, but we have no better way to tell.
+	See Bug#58432 (innodb.innodb_bug56143 fails under valgrind)
+	for an example. */
+# define SYNC_ARRAY_TIMEOUT 2400
+ fatal_timeout *= 10;
+#else
+# define SYNC_ARRAY_TIMEOUT 240
+#endif
+ const time_t now = time(NULL);
+
+	for (i = 0; i < arr->n_cells; i++) {
+
+ sync_cell_t* cell;
+ void* latch;
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ latch = cell->latch.mutex;
+
+ if (latch == NULL || !cell->waiting) {
+
+ continue;
+ }
+
+ double diff = difftime(now, cell->reservation_time);
+
+ if (diff > SYNC_ARRAY_TIMEOUT) {
+ ib::warn() << "A long semaphore wait:";
+ sync_array_cell_print(stderr, cell);
+ *noticed = TRUE;
+ }
+
+ if (diff > fatal_timeout) {
+ fatal = TRUE;
+ }
+
+ if (diff > longest_diff) {
+ longest_diff = diff;
+ *sema = latch;
+ *waiter = cell->thread_id;
+ }
+ }
+
+ /* We found a long semaphore wait, print all threads that are
+ waiting for a semaphore. */
+ if (*noticed) {
+ for (i = 0; i < arr->n_cells; i++) {
+ void* wait_object;
+ sync_cell_t* cell;
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ wait_object = cell->latch.mutex;
+
+ if (wait_object == NULL || !cell->waiting) {
+
+ continue;
+ }
+
+ ib::info() << "A semaphore wait:";
+ sync_array_cell_print(stderr, cell);
+ }
+ }
+
+#undef SYNC_ARRAY_TIMEOUT
+
+ return(fatal);
+}
+
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return TRUE if fatal semaphore wait threshold was exceeded */
+ibool
+sync_array_print_long_waits(
+/*========================*/
+ os_thread_id_t* waiter, /*!< out: longest waiting thread */
+ const void** sema) /*!< out: longest-waited-for semaphore */
+{
+ ulint i;
+ ibool fatal = FALSE;
+ ibool noticed = FALSE;
+
+ for (i = 0; i < sync_array_size; ++i) {
+
+ sync_array_t* arr = sync_wait_array[i];
+
+ sync_array_enter(arr);
+
+ if (sync_array_print_long_waits_low(
+ arr, waiter, sema, &noticed)) {
+
+ fatal = TRUE;
+ }
+
+ sync_array_exit(arr);
+ }
+
+ if (noticed) {
+		/* If some crucial semaphore is reserved, then the InnoDB
+		Monitor can also hang, and we do not get diagnostics. Since
+		in many cases an InnoDB hang is caused by a pwrite() or a
+		pread() call hanging inside the operating system, let us
+		print the counts of pending calls of these right now. */
+
+ fprintf(stderr,
+ "InnoDB: Pending reads " UINT64PF
+ ", writes " UINT64PF "\n",
+ MONITOR_VALUE(MONITOR_OS_PENDING_READS),
+ MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
+
+ lock_wait_timeout_task(nullptr);
+ }
+
+ return(fatal);
+}
+
+/**********************************************************************//**
+Prints info of the wait array. */
+static
+void
+sync_array_print_info_low(
+/*======================*/
+ FILE* file, /*!< in: file where to print */
+ sync_array_t* arr) /*!< in: wait array */
+{
+ ulint i;
+ ulint count = 0;
+
+ fprintf(file,
+ "OS WAIT ARRAY INFO: reservation count " ULINTPF "\n",
+ arr->res_count);
+
+ for (i = 0; count < arr->n_reserved; ++i) {
+ sync_cell_t* cell;
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->latch.mutex != 0) {
+ count++;
+ sync_array_cell_print(file, cell);
+ }
+ }
+}
+
+/**********************************************************************//**
+Prints info of the wait array. */
+static
+void
+sync_array_print_info(
+/*==================*/
+ FILE* file, /*!< in: file where to print */
+ sync_array_t* arr) /*!< in: wait array */
+{
+ sync_array_enter(arr);
+
+ sync_array_print_info_low(file, arr);
+
+ sync_array_exit(arr);
+}
+
+/** Create the primary system wait arrays */
+void sync_array_init()
+{
+ ut_a(sync_wait_array == NULL);
+ ut_a(srv_sync_array_size > 0);
+ ut_a(srv_max_n_threads > 0);
+
+ sync_array_size = srv_sync_array_size;
+
+ sync_wait_array = UT_NEW_ARRAY_NOKEY(sync_array_t*, sync_array_size);
+
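+	/* Distribute the cells over the arrays, rounding up:
+	n_slots = ceil(srv_max_n_threads / sync_array_size). */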
+ ulint n_slots = 1 + (srv_max_n_threads - 1) / sync_array_size;
+
+ for (ulint i = 0; i < sync_array_size; ++i) {
+
+ sync_wait_array[i] = UT_NEW_NOKEY(sync_array_t(n_slots));
+ }
+}
+
+/** Destroy the sync array wait sub-system. */
+void sync_array_close()
+{
+ for (ulint i = 0; i < sync_array_size; ++i) {
+ sync_array_free(sync_wait_array[i]);
+ }
+
+ UT_DELETE_ARRAY(sync_wait_array);
+ sync_wait_array = NULL;
+}
+
+/**********************************************************************//**
+Print info about the sync array(s). */
+void
+sync_array_print(
+/*=============*/
+ FILE* file) /*!< in/out: Print to this stream */
+{
+ for (ulint i = 0; i < sync_array_size; ++i) {
+ sync_array_print_info(file, sync_wait_array[i]);
+ }
+
+ fprintf(file,
+ "OS WAIT ARRAY INFO: signal count " ULINTPF "\n", sg_count);
+
+}
+
+/**********************************************************************//**
+Prints info of the wait array without using any mutexes/semaphores. */
+UNIV_INTERN
+void
+sync_array_print_innodb(void)
+/*=========================*/
+{
+ ulint i;
+ sync_array_t* arr = sync_array_get();
+
+ fputs("InnoDB: Semaphore wait debug output started for InnoDB:\n", stderr);
+
+ for (i = 0; i < arr->n_cells; i++) {
+ void* wait_object;
+ sync_cell_t* cell;
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ wait_object = cell->latch.mutex;
+
+ if (wait_object == NULL || !cell->waiting) {
+
+ continue;
+ }
+
+ fputs("InnoDB: Warning: semaphore wait:\n",
+ stderr);
+ sync_array_cell_print(stderr, cell);
+ }
+
+ fputs("InnoDB: Semaphore wait debug output ended:\n", stderr);
+
+}
+
+/**********************************************************************//**
+Get the number of items in the sync array. */
+UNIV_INTERN
+ulint
+sync_arr_get_n_items(void)
+/*======================*/
+{
+ sync_array_t* sync_arr = sync_array_get();
+ return (ulint) sync_arr->n_cells;
+}
+
+/******************************************************************//**
+Get the specified item from the sync array if it is reserved, and set
+the given pointer to it.
+@return true if the item is reserved, false otherwise */
+UNIV_INTERN
+ibool
+sync_arr_get_item(
+/*==============*/
+ ulint i, /*!< in: requested item */
+ sync_cell_t **cell) /*!< out: cell contents if item
+ reserved */
+{
+ sync_array_t* sync_arr;
+ sync_cell_t* wait_cell;
+ void* wait_object;
+ ibool found = FALSE;
+
+ sync_arr = sync_array_get();
+ wait_cell = sync_array_get_nth_cell(sync_arr, i);
+
+ if (wait_cell) {
+ wait_object = wait_cell->latch.mutex;
+
+		if (wait_object != NULL && wait_cell->waiting) {
+ found = TRUE;
+ *cell = wait_cell;
+ }
+ }
+
+ return found;
+}
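+
+/* Example iteration over the reserved cells, as done by the
+INFORMATION_SCHEMA fill function below:
+
+	for (ulint i = 0; i < sync_arr_get_n_items(); i++) {
+		sync_cell_t*	cell = NULL;
+
+		if (sync_arr_get_item(i, &cell)) {
+			// inspect cell->request_type, cell->file, ...
+		}
+	}
+*/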
+
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS table.
+Loop through each item on sync array, and extract the column
+information and fill the INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS table.
+@return 0 on success */
+UNIV_INTERN
+int
+sync_arr_fill_sys_semphore_waits_table(
+/*===================================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ Field** fields;
+ ulint n_items;
+
+ DBUG_ENTER("i_s_sys_semaphore_waits_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to users without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ fields = tables->table->field;
+ n_items = sync_arr_get_n_items();
+ ulint type;
+
+	for (ulint i = 0; i < n_items; i++) {
+		sync_cell_t*	cell = NULL;
+ if (sync_arr_get_item(i, &cell)) {
+ WaitMutex* mutex;
+ type = cell->request_type;
+ /* JAN: FIXME
+ OK(fields[SYS_SEMAPHORE_WAITS_THREAD_ID]->store(,
+ ulint(cell->thread), true));
+ */
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_FILE], innobase_basename(cell->file)));
+ OK(fields[SYS_SEMAPHORE_WAITS_LINE]->store(cell->line, true));
+ fields[SYS_SEMAPHORE_WAITS_LINE]->set_notnull();
+ OK(fields[SYS_SEMAPHORE_WAITS_WAIT_TIME]->store(
+ difftime(time(NULL),
+ cell->reservation_time)));
+
+ if (type == SYNC_MUTEX) {
+ mutex = static_cast<WaitMutex*>(cell->latch.mutex);
+
+ if (mutex) {
+ // JAN: FIXME
+ // OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_OBJECT_NAME], mutex->cmutex_name));
+ OK(fields[SYS_SEMAPHORE_WAITS_WAIT_OBJECT]->store((longlong)mutex, true));
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_WAIT_TYPE], "MUTEX"));
+ //OK(fields[SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID]->store(mutex->thread_id, true));
+ //OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_HOLDER_FILE], innobase_basename(mutex->file_name)));
+ //OK(fields[SYS_SEMAPHORE_WAITS_HOLDER_LINE]->store(mutex->line, true));
+ //fields[SYS_SEMAPHORE_WAITS_HOLDER_LINE]->set_notnull();
+ //OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_CREATED_FILE], innobase_basename(mutex->cfile_name)));
+ //OK(fields[SYS_SEMAPHORE_WAITS_CREATED_LINE]->store(mutex->cline, true));
+ //fields[SYS_SEMAPHORE_WAITS_CREATED_LINE]->set_notnull();
+ //OK(fields[SYS_SEMAPHORE_WAITS_WAITERS_FLAG]->store(mutex->waiters, true));
+ //OK(fields[SYS_SEMAPHORE_WAITS_LOCK_WORD]->store(mutex->lock_word, true));
+ //OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE], innobase_basename(mutex->file_name)));
+ //OK(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE]->store(mutex->line, true));
+ //fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE]->set_notnull();
+ //OK(fields[SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT]->store(mutex->count_os_wait, true));
+ }
+ } else if (type == RW_LOCK_X_WAIT
+ || type == RW_LOCK_X
+ || type == RW_LOCK_SX
+ || type == RW_LOCK_S) {
+				rw_lock_t*	rwlock;
+
+				rwlock = static_cast<rw_lock_t*>(cell->latch.lock);
+
+ if (rwlock) {
+ ulint writer = rw_lock_get_writer(rwlock);
+
+ OK(fields[SYS_SEMAPHORE_WAITS_WAIT_OBJECT]->store((longlong)rwlock, true));
+ if (type == RW_LOCK_X) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_WAIT_TYPE], "RW_LOCK_X"));
+ } else if (type == RW_LOCK_X_WAIT) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_WAIT_TYPE], "RW_LOCK_X_WAIT"));
+ } else if (type == RW_LOCK_S) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_WAIT_TYPE], "RW_LOCK_S"));
+ } else if (type == RW_LOCK_SX) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_WAIT_TYPE], "RW_LOCK_SX"));
+ }
+
+ if (writer != RW_LOCK_NOT_LOCKED) {
+ // JAN: FIXME
+ // OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_OBJECT_NAME], rwlock->lock_name));
+ OK(fields[SYS_SEMAPHORE_WAITS_WRITER_THREAD]->store(ulint(rwlock->writer_thread), true));
+
+ if (writer == RW_LOCK_X) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_RESERVATION_MODE], "RW_LOCK_X"));
+ } else if (writer == RW_LOCK_X_WAIT) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_RESERVATION_MODE], "RW_LOCK_X_WAIT"));
+						} else if (writer == RW_LOCK_SX) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_RESERVATION_MODE], "RW_LOCK_SX"));
+ }
+
+ //OK(fields[SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID]->store(rwlock->thread_id, true));
+ //OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_HOLDER_FILE], innobase_basename(rwlock->file_name)));
+ //OK(fields[SYS_SEMAPHORE_WAITS_HOLDER_LINE]->store(rwlock->line, true));
+ //fields[SYS_SEMAPHORE_WAITS_HOLDER_LINE]->set_notnull();
+ OK(fields[SYS_SEMAPHORE_WAITS_READERS]->store(rw_lock_get_reader_count(rwlock), true));
+ OK(fields[SYS_SEMAPHORE_WAITS_WAITERS_FLAG]->store(
+ rwlock->waiters,
+ true));
+ OK(fields[SYS_SEMAPHORE_WAITS_LOCK_WORD]->store(
+ rwlock->lock_word,
+ true));
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE], innobase_basename(rwlock->last_x_file_name)));
+ OK(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE]->store(rwlock->last_x_line, true));
+ fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE]->set_notnull();
+ OK(fields[SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT]->store(rwlock->count_os_wait, true));
+ }
+ }
+ }
+
+ OK(schema_table_store_record(thd, tables->table));
+ }
+ }
+
+ DBUG_RETURN(0);
+}
diff --git a/storage/innobase/sync/sync0debug.cc b/storage/innobase/sync/sync0debug.cc
new file mode 100644
index 00000000..7c3e4c05
--- /dev/null
+++ b/storage/innobase/sync/sync0debug.cc
@@ -0,0 +1,1423 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0debug.cc
+Debug checks for latches.
+
+Created 2012-08-21 Sunny Bains
+*******************************************************/
+
+#include "sync0sync.h"
+#include "sync0debug.h"
+#include "srv0start.h"
+#include "fil0fil.h"
+
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <map>
+
+#ifdef UNIV_DEBUG
+
+my_bool srv_sync_debug;
+
+/** The global mutex which protects debug info lists of all rw-locks.
+To modify the debug info list of an rw-lock, this mutex has to be
+acquired in addition to the mutex protecting the lock. */
+static SysMutex rw_lock_debug_mutex;
+
+/** The latch held by a thread */
+struct Latched {
+
+ /** Constructor */
+ Latched() : m_latch(), m_level(SYNC_UNKNOWN) { }
+
+ /** Constructor
+ @param[in] latch Latch instance
+ @param[in] level Level of latch held */
+ Latched(const latch_t* latch,
+ latch_level_t level)
+ :
+ m_latch(latch),
+ m_level(level)
+ {
+ /* No op */
+ }
+
+ /** @return the latch level */
+ latch_level_t get_level() const
+ {
+ return(m_level);
+ }
+
+ /** Check if the rhs latch and level match
+ @param[in] rhs instance to compare with
+ @return true on match */
+ bool operator==(const Latched& rhs) const
+ {
+ return(m_latch == rhs.m_latch && m_level == rhs.m_level);
+ }
+
+ /** The latch instance */
+ const latch_t* m_latch;
+
+ /** The latch level. For buffer blocks we can pass a separate latch
+ level to check against, see buf_block_dbg_add_level() */
+ latch_level_t m_level;
+};
+
+/** Thread specific latches. This is ordered by level in descending order. */
+typedef std::vector<Latched, ut_allocator<Latched> > Latches;
+
+/** The deadlock detector. */
+struct LatchDebug {
+
+ /** Debug mutex for control structures, should not be tracked
+ by this module. */
+ typedef OSMutex Mutex;
+
+ /** Comparator for the ThreadMap. */
+ struct os_thread_id_less
+ : public std::binary_function<
+ os_thread_id_t,
+ os_thread_id_t,
+ bool>
+ {
+ /** @return true if lhs < rhs */
+ bool operator()(
+ const os_thread_id_t& lhs,
+ const os_thread_id_t& rhs) const
+ UNIV_NOTHROW
+ {
+ return(ulint(lhs) < ulint(rhs));
+ }
+ };
+
+ /** For tracking a thread's latches. */
+ typedef std::map<
+ os_thread_id_t,
+ Latches*,
+ os_thread_id_less,
+ ut_allocator<std::pair<const os_thread_id_t, Latches*> > >
+ ThreadMap;
+
+ /** Constructor */
+ LatchDebug()
+ UNIV_NOTHROW;
+
+ /** Destructor */
+ ~LatchDebug()
+ UNIV_NOTHROW
+ {
+ m_mutex.destroy();
+ }
+
+ /** Create a new instance if one doesn't exist else return
+ the existing one.
+ @param[in] add add an empty entry if one is not
+ found (default no)
+ @return pointer to a thread's acquired latches. */
+ Latches* thread_latches(bool add = false)
+ UNIV_NOTHROW;
+
+ /** Check that all the latches already owned by a thread have a lower
+ level than limit.
+ @param[in] latches the thread's existing (acquired) latches
+ @param[in] limit to check against
+	@return latched if there is one with a level <= limit. */
+ const Latched* less(
+ const Latches* latches,
+ latch_level_t limit) const
+ UNIV_NOTHROW;
+
+ /** Checks if the level value exists in the thread's acquired latches.
+ @param[in] latches the thread's existing (acquired) latches
+ @param[in] level to lookup
+ @return latch if found or 0 */
+ const latch_t* find(
+		const Latches*	latches,
+ latch_level_t level) const
+ UNIV_NOTHROW;
+
+ /**
+ Checks if the level value exists in the thread's acquired latches.
+ @param[in] level to lookup
+ @return latch if found or 0 */
+ const latch_t* find(latch_level_t level)
+ UNIV_NOTHROW;
+
+ /** Report error and abort.
+ @param[in] latches thread's existing latches
+ @param[in] latched The existing latch causing the
+ invariant to fail
+ @param[in] level The new level request that breaks
+ the order */
+ void crash(
+ const Latches* latches,
+ const Latched* latched,
+ latch_level_t level) const
+ UNIV_NOTHROW;
+
+ /** Do a basic ordering check.
+ @param[in] latches thread's existing latches
+ @param[in] requested_level Level requested by latch
+ @param[in] level declared ulint so that we can
+ do level - 1. The level of the
+ latch that the thread is trying
+ to acquire
+ @return true if passes, else crash with error message. */
+ inline bool basic_check(
+ const Latches* latches,
+ latch_level_t requested_level,
+ lint level) const
+ UNIV_NOTHROW;
+
+ /** Adds a latch and its level in the thread level array. Allocates
+ the memory for the array if called for the first time for this
+ OS thread. Makes the checks against other latch levels stored
+ in the array for this thread.
+
+	@param[in]	latch	latch that the thread wants to acquire.
+ @param[in] level latch level to check against */
+ void lock_validate(
+ const latch_t* latch,
+ latch_level_t level)
+ UNIV_NOTHROW
+ {
+ /* Ignore diagnostic latches, starting with '.' */
+
+ if (*latch->get_name() != '.'
+ && latch->get_level() != SYNC_LEVEL_VARYING) {
+
+ ut_ad(level != SYNC_LEVEL_VARYING);
+
+ Latches* latches = check_order(latch, level);
+
+ ut_a(latches->empty()
+ || level == SYNC_LEVEL_VARYING
+ || level == SYNC_NO_ORDER_CHECK
+ || latches->back().get_level()
+ == SYNC_NO_ORDER_CHECK
+ || latches->back().m_latch->get_level()
+ == SYNC_LEVEL_VARYING
+ || latches->back().get_level() >= level);
+ }
+ }
+
+ /** Adds a latch and its level in the thread level array. Allocates
+ the memory for the array if called for the first time for this
+ OS thread. Makes the checks against other latch levels stored
+ in the array for this thread.
+
+	@param[in]	latch	latch that the thread wants to acquire.
+ @param[in] level latch level to check against */
+ void lock_granted(
+ const latch_t* latch,
+ latch_level_t level)
+ UNIV_NOTHROW
+ {
+ /* Ignore diagnostic latches, starting with '.' */
+
+ if (*latch->get_name() != '.'
+ && latch->get_level() != SYNC_LEVEL_VARYING) {
+
+ Latches* latches = thread_latches(true);
+
+ latches->push_back(Latched(latch, level));
+ }
+ }
+
+ /** For recursive X rw-locks.
+ @param[in] latch The RW-Lock to relock */
+ void relock(const latch_t* latch)
+ UNIV_NOTHROW
+ {
+ ut_a(latch->m_rw_lock);
+
+ latch_level_t level = latch->get_level();
+
+ /* Ignore diagnostic latches, starting with '.' */
+
+ if (*latch->get_name() != '.'
+ && latch->get_level() != SYNC_LEVEL_VARYING) {
+
+ Latches* latches = thread_latches(true);
+
+ Latches::iterator it = std::find(
+ latches->begin(), latches->end(),
+ Latched(latch, level));
+
+ ut_a(latches->empty()
+ || level == SYNC_LEVEL_VARYING
+ || level == SYNC_NO_ORDER_CHECK
+ || latches->back().m_latch->get_level()
+ == SYNC_LEVEL_VARYING
+ || latches->back().m_latch->get_level()
+ == SYNC_NO_ORDER_CHECK
+ || latches->back().get_level() >= level
+ || it != latches->end());
+
+ if (it == latches->end()) {
+ latches->push_back(Latched(latch, level));
+ } else {
+ latches->insert(it, Latched(latch, level));
+ }
+ }
+ }
+
+ /** Iterate over a thread's latches.
+ @param[in] functor The callback
+ @return true if the functor returns true. */
+ bool for_each(const sync_check_functor_t& functor)
+ UNIV_NOTHROW
+ {
+ if (const Latches* latches = thread_latches()) {
+ Latches::const_iterator end = latches->end();
+ for (Latches::const_iterator it = latches->begin();
+ it != end; ++it) {
+
+ if (functor(it->m_level)) {
+ return(true);
+ }
+ }
+ }
+
+ return(false);
+ }
+
+ /** Removes a latch from the thread level array if it is found there.
+ @param[in] latch The latch that was released
+ @return true if found in the array; it is not an error if the latch is
+ not found, as we presently are not able to determine the level for
+ every latch reservation the program does */
+ void unlock(const latch_t* latch) UNIV_NOTHROW;
+
+ /** Get the level name
+ @param[in] level The level ID to lookup
+ @return level name */
+ const std::string& get_level_name(latch_level_t level) const
+ UNIV_NOTHROW
+ {
+ Levels::const_iterator it = m_levels.find(level);
+
+ ut_ad(it != m_levels.end());
+
+ return(it->second);
+ }
+
+ /** Initialise the debug data structures */
+ static void init()
+ UNIV_NOTHROW;
+
+ /** Shutdown the latch debug checking */
+ static void shutdown()
+ UNIV_NOTHROW;
+
+ /** @return the singleton instance */
+ static LatchDebug* instance()
+ UNIV_NOTHROW
+ {
+ return(s_instance);
+ }
+
+ /** Create the singleton instance */
+ static void create_instance()
+ UNIV_NOTHROW
+ {
+ ut_ad(s_instance == NULL);
+
+ s_instance = UT_NEW_NOKEY(LatchDebug());
+ }
+
+private:
+ /** Disable copying */
+ LatchDebug(const LatchDebug&);
+ LatchDebug& operator=(const LatchDebug&);
+
+ /** Adds a latch and its level in the thread level array. Allocates
+	the memory for the array if called for the first time for this OS thread.
+ Makes the checks against other latch levels stored in the array
+ for this thread.
+
+ @param[in] latch pointer to a mutex or an rw-lock
+ @param[in] level level in the latching order
+ @return the thread's latches */
+ Latches* check_order(
+ const latch_t* latch,
+ latch_level_t level)
+ UNIV_NOTHROW;
+
+ /** Print the latches acquired by a thread
+ @param[in] latches Latches acquired by a thread */
+ void print_latches(const Latches* latches) const
+ UNIV_NOTHROW;
+
+ /** Special handling for the RTR mutexes. We need to add proper
+ levels for them if possible.
+ @param[in] latch Latch to check
+	@return true if it is an _RTR_ mutex */
+ bool is_rtr_mutex(const latch_t* latch) const
+ UNIV_NOTHROW
+ {
+ return(latch->get_id() == LATCH_ID_RTR_ACTIVE_MUTEX
+ || latch->get_id() == LATCH_ID_RTR_PATH_MUTEX
+ || latch->get_id() == LATCH_ID_RTR_MATCH_MUTEX);
+ }
+
+private:
+ /** Comparator for the Levels . */
+ struct latch_level_less
+ : public std::binary_function<
+ latch_level_t,
+ latch_level_t,
+ bool>
+ {
+ /** @return true if lhs < rhs */
+ bool operator()(
+ const latch_level_t& lhs,
+ const latch_level_t& rhs) const
+ UNIV_NOTHROW
+ {
+ return(lhs < rhs);
+ }
+ };
+
+ typedef std::map<
+ latch_level_t,
+ std::string,
+ latch_level_less,
+ ut_allocator<std::pair<const latch_level_t, std::string> > >
+ Levels;
+
+ /** Mutex protecting the deadlock detector data structures. */
+ Mutex m_mutex;
+
+ /** Thread specific data. Protected by m_mutex. */
+ ThreadMap m_threads;
+
+	/** Mapping from latch level to its string representation. */
+ Levels m_levels;
+
+ /** The singleton instance. Must be created in single threaded mode. */
+ static LatchDebug* s_instance;
+
+public:
+ /** For checking whether this module has been initialised or not. */
+ static bool s_initialized;
+};
+
+/** The latch order checking infrastructure */
+LatchDebug* LatchDebug::s_instance = NULL;
+bool LatchDebug::s_initialized = false;
+
+#define LEVEL_MAP_INSERT(T) \
+do { \
+ std::pair<Levels::iterator, bool> result = \
+ m_levels.insert(Levels::value_type(T, #T)); \
+ ut_ad(result.second); \
+} while(0)
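+
+/* For example, LEVEL_MAP_INSERT(SYNC_DICT) inserts the pair
+(SYNC_DICT, "SYNC_DICT") into m_levels, and the ut_ad() asserts that
+the level was not already present. */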
+
+/** Setup the mapping from level ID to level name mapping */
+LatchDebug::LatchDebug()
+{
+ m_mutex.init();
+
+ LEVEL_MAP_INSERT(SYNC_UNKNOWN);
+ LEVEL_MAP_INSERT(SYNC_MUTEX);
+ LEVEL_MAP_INSERT(RW_LOCK_SX);
+ LEVEL_MAP_INSERT(RW_LOCK_X_WAIT);
+ LEVEL_MAP_INSERT(RW_LOCK_S);
+ LEVEL_MAP_INSERT(RW_LOCK_X);
+ LEVEL_MAP_INSERT(RW_LOCK_NOT_LOCKED);
+ LEVEL_MAP_INSERT(SYNC_ANY_LATCH);
+ LEVEL_MAP_INSERT(SYNC_POOL);
+ LEVEL_MAP_INSERT(SYNC_POOL_MANAGER);
+ LEVEL_MAP_INSERT(SYNC_SEARCH_SYS);
+ LEVEL_MAP_INSERT(SYNC_WORK_QUEUE);
+ LEVEL_MAP_INSERT(SYNC_FTS_TOKENIZE);
+ LEVEL_MAP_INSERT(SYNC_FTS_OPTIMIZE);
+ LEVEL_MAP_INSERT(SYNC_FTS_CACHE_INIT);
+ LEVEL_MAP_INSERT(SYNC_RECV);
+ LEVEL_MAP_INSERT(SYNC_PURGE_QUEUE);
+ LEVEL_MAP_INSERT(SYNC_TRX_SYS_HEADER);
+ LEVEL_MAP_INSERT(SYNC_TRX);
+ LEVEL_MAP_INSERT(SYNC_RW_TRX_HASH_ELEMENT);
+ LEVEL_MAP_INSERT(SYNC_READ_VIEW);
+ LEVEL_MAP_INSERT(SYNC_TRX_SYS);
+ LEVEL_MAP_INSERT(SYNC_LOCK_SYS);
+ LEVEL_MAP_INSERT(SYNC_LOCK_WAIT_SYS);
+ LEVEL_MAP_INSERT(SYNC_INDEX_ONLINE_LOG);
+ LEVEL_MAP_INSERT(SYNC_IBUF_BITMAP);
+ LEVEL_MAP_INSERT(SYNC_IBUF_BITMAP_MUTEX);
+ LEVEL_MAP_INSERT(SYNC_IBUF_TREE_NODE);
+ LEVEL_MAP_INSERT(SYNC_IBUF_TREE_NODE_NEW);
+ LEVEL_MAP_INSERT(SYNC_IBUF_INDEX_TREE);
+ LEVEL_MAP_INSERT(SYNC_IBUF_MUTEX);
+ LEVEL_MAP_INSERT(SYNC_FSP_PAGE);
+ LEVEL_MAP_INSERT(SYNC_FSP);
+ LEVEL_MAP_INSERT(SYNC_EXTERN_STORAGE);
+ LEVEL_MAP_INSERT(SYNC_TRX_UNDO_PAGE);
+ LEVEL_MAP_INSERT(SYNC_RSEG_HEADER);
+ LEVEL_MAP_INSERT(SYNC_RSEG_HEADER_NEW);
+ LEVEL_MAP_INSERT(SYNC_NOREDO_RSEG);
+ LEVEL_MAP_INSERT(SYNC_REDO_RSEG);
+ LEVEL_MAP_INSERT(SYNC_PURGE_LATCH);
+ LEVEL_MAP_INSERT(SYNC_TREE_NODE);
+ LEVEL_MAP_INSERT(SYNC_TREE_NODE_FROM_HASH);
+ LEVEL_MAP_INSERT(SYNC_TREE_NODE_NEW);
+ LEVEL_MAP_INSERT(SYNC_INDEX_TREE);
+ LEVEL_MAP_INSERT(SYNC_IBUF_PESS_INSERT_MUTEX);
+ LEVEL_MAP_INSERT(SYNC_IBUF_HEADER);
+ LEVEL_MAP_INSERT(SYNC_DICT_HEADER);
+ LEVEL_MAP_INSERT(SYNC_STATS_AUTO_RECALC);
+ LEVEL_MAP_INSERT(SYNC_DICT);
+ LEVEL_MAP_INSERT(SYNC_FTS_CACHE);
+ LEVEL_MAP_INSERT(SYNC_DICT_OPERATION);
+ LEVEL_MAP_INSERT(SYNC_TRX_I_S_RWLOCK);
+ LEVEL_MAP_INSERT(SYNC_LEVEL_VARYING);
+ LEVEL_MAP_INSERT(SYNC_NO_ORDER_CHECK);
+
+ /* Enum count starts from 0 */
+ ut_ad(m_levels.size() == SYNC_LEVEL_MAX + 1);
+}
+
+/** Print the latches acquired by a thread
+@param[in] latches Latches acquired by a thread */
+void
+LatchDebug::print_latches(const Latches* latches) const
+ UNIV_NOTHROW
+{
+ ib::error() << "Latches already owned by this thread: ";
+
+ Latches::const_iterator end = latches->end();
+
+ for (Latches::const_iterator it = latches->begin();
+ it != end;
+ ++it) {
+
+ ib::error()
+ << sync_latch_get_name(it->m_latch->get_id())
+ << " -> "
+ << it->m_level << " "
+ << "(" << get_level_name(it->m_level) << ")";
+ }
+}
+
+/** Report error and abort
+@param[in] latches thread's existing latches
+@param[in] latched The existing latch causing the invariant to fail
+@param[in] level The new level request that breaks the order */
+void
+LatchDebug::crash(
+ const Latches* latches,
+ const Latched* latched,
+ latch_level_t level) const
+ UNIV_NOTHROW
+{
+ const latch_t* latch = latched->m_latch;
+ const std::string& in_level_name = get_level_name(level);
+
+ const std::string& latch_level_name =
+ get_level_name(latched->m_level);
+
+ ib::error()
+ << "Thread " << os_thread_get_curr_id()
+ << " already owns a latch "
+ << sync_latch_get_name(latch->m_id) << " at level"
+ << " " << latched->m_level << " (" << latch_level_name
+ << " ), which is at a lower/same level than the"
+ << " requested latch: "
+ << level << " (" << in_level_name << "). "
+ << latch->to_string();
+
+ print_latches(latches);
+
+ ut_error;
+}
+
+/** Check that all the latches already owned by a thread have a lower
+level than limit.
+@param[in] latches the thread's existing (acquired) latches
+@param[in] limit to check against
+@return latched info if there is one with a level <= limit. */
+const Latched*
+LatchDebug::less(
+ const Latches* latches,
+ latch_level_t limit) const
+ UNIV_NOTHROW
+{
+ Latches::const_iterator end = latches->end();
+
+ for (Latches::const_iterator it = latches->begin(); it != end; ++it) {
+
+ if (it->m_level <= limit) {
+ return(&(*it));
+ }
+ }
+
+ return(NULL);
+}
+
+/** Do a basic ordering check.
+@param[in] latches thread's existing latches
+@param[in] requested_level Level requested by latch
+@param[in] in_level declared ulint so that we can do level - 1.
+ The level of the latch that the thread is
+ trying to acquire
+@return true if passes, else crash with error message. */
+inline bool
+LatchDebug::basic_check(
+ const Latches* latches,
+ latch_level_t requested_level,
+ lint in_level) const
+ UNIV_NOTHROW
+{
+ latch_level_t level = latch_level_t(in_level);
+
+ ut_ad(level < SYNC_LEVEL_MAX);
+
+ const Latched* latched = less(latches, level);
+
+ if (latched != NULL) {
+ crash(latches, latched, requested_level);
+ return(false);
+ }
+
+ return(true);
+}
+
+/** Create a new instance if one doesn't exist else return the existing one.
+@param[in] add add an empty entry if one is not found
+ (default no)
+@return pointer to a thread's acquired latches. */
+Latches*
+LatchDebug::thread_latches(bool add)
+ UNIV_NOTHROW
+{
+ m_mutex.enter();
+
+ os_thread_id_t thread_id = os_thread_get_curr_id();
+ ThreadMap::iterator lb = m_threads.lower_bound(thread_id);
+
+ if (lb != m_threads.end()
+ && !(m_threads.key_comp()(thread_id, lb->first))) {
+
+ Latches* latches = lb->second;
+
+ m_mutex.exit();
+
+ return(latches);
+
+ } else if (!add) {
+
+ m_mutex.exit();
+
+ return(NULL);
+
+ } else {
+ typedef ThreadMap::value_type value_type;
+
+ Latches* latches = UT_NEW_NOKEY(Latches());
+
+ ut_a(latches != NULL);
+
+ latches->reserve(32);
+
+ m_threads.insert(lb, value_type(thread_id, latches));
+
+ m_mutex.exit();
+
+ return(latches);
+ }
+}
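+
+/* Note on the lookup above: lower_bound() followed by insert() with a
+hint performs a single tree search for both the "already registered"
+and the "register new thread" paths. */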
+
+/** Checks if the level value exists in the thread's acquired latches.
+@param[in]	latches	the thread's existing (acquired) latches
+@param[in] level to lookup
+@return latch if found or 0 */
+const latch_t*
+LatchDebug::find(
+ const Latches* latches,
+ latch_level_t level) const UNIV_NOTHROW
+{
+ Latches::const_iterator end = latches->end();
+
+ for (Latches::const_iterator it = latches->begin(); it != end; ++it) {
+
+ if (it->m_level == level) {
+
+ return(it->m_latch);
+ }
+ }
+
+ return(0);
+}
+
+/** Checks if the level value exists in the thread's acquired latches.
+@param[in] level The level to lookup
+@return latch if found or NULL */
+const latch_t*
+LatchDebug::find(latch_level_t level)
+ UNIV_NOTHROW
+{
+ return(find(thread_latches(), level));
+}
+
+/**
+Adds a latch and its level in the thread level array. Allocates the memory
+for the array if called for the first time for this OS thread. Makes the checks
+against other latch levels stored in the array for this thread.
+@param[in] latch pointer to a mutex or an rw-lock
+@param[in] level level in the latching order
+@return the thread's latches */
+Latches*
+LatchDebug::check_order(
+ const latch_t* latch,
+ latch_level_t level)
+ UNIV_NOTHROW
+{
+ ut_ad(latch->get_level() != SYNC_LEVEL_VARYING);
+
+ Latches* latches = thread_latches(true);
+
+ /* NOTE that there is a problem with _NODE and _LEAF levels: if the
+ B-tree height changes, then a leaf can change to an internal node
+ or the other way around. We do not know at present if this can cause
+ unnecessary assertion failures below. */
+
+ switch (level) {
+ case SYNC_NO_ORDER_CHECK:
+ case SYNC_EXTERN_STORAGE:
+ case SYNC_TREE_NODE_FROM_HASH:
+ /* Do no order checking */
+ break;
+
+ case SYNC_TRX_SYS_HEADER:
+
+ if (srv_is_being_started) {
+ /* This is violated during trx_sys_create_rsegs()
+ when creating additional rollback segments when
+ upgrading in srv_start(). */
+ break;
+ }
+
+ /* Fall through */
+
+ case SYNC_RECV:
+ case SYNC_WORK_QUEUE:
+ case SYNC_FTS_TOKENIZE:
+ case SYNC_FTS_OPTIMIZE:
+ case SYNC_FTS_CACHE:
+ case SYNC_FTS_CACHE_INIT:
+ case SYNC_SEARCH_SYS:
+ case SYNC_LOCK_SYS:
+ case SYNC_LOCK_WAIT_SYS:
+ case SYNC_RW_TRX_HASH_ELEMENT:
+ case SYNC_READ_VIEW:
+ case SYNC_TRX_SYS:
+ case SYNC_IBUF_BITMAP_MUTEX:
+ case SYNC_REDO_RSEG:
+ case SYNC_NOREDO_RSEG:
+ case SYNC_PURGE_LATCH:
+ case SYNC_PURGE_QUEUE:
+ case SYNC_DICT_OPERATION:
+ case SYNC_DICT_HEADER:
+ case SYNC_TRX_I_S_RWLOCK:
+ case SYNC_IBUF_MUTEX:
+ case SYNC_INDEX_ONLINE_LOG:
+ case SYNC_STATS_AUTO_RECALC:
+ case SYNC_POOL:
+ case SYNC_POOL_MANAGER:
+ basic_check(latches, level, level);
+ break;
+
+ case SYNC_ANY_LATCH:
+
+ /* Temporary workaround for LATCH_ID_RTR_*_MUTEX */
+ if (is_rtr_mutex(latch)) {
+
+ const Latched* latched = less(latches, level);
+
+ if (latched == NULL
+ || (latched != NULL
+ && is_rtr_mutex(latched->m_latch))) {
+
+ /* No violation */
+ break;
+
+ }
+
+ crash(latches, latched, level);
+
+ } else {
+ basic_check(latches, level, level);
+ }
+
+ break;
+
+ case SYNC_TRX:
+
+ /* Either the thread must own the lock_sys.mutex, or
+ it is allowed to own only ONE trx_t::mutex. */
+
+ if (less(latches, level) != NULL) {
+ basic_check(latches, level, level - 1);
+ ut_a(find(latches, SYNC_LOCK_SYS) != 0);
+ }
+ break;
+
+ case SYNC_IBUF_BITMAP:
+
+ /* Either the thread must own the master mutex to all
+ the bitmap pages, or it is allowed to latch only ONE
+ bitmap page. */
+
+ if (find(latches, SYNC_IBUF_BITMAP_MUTEX) != 0) {
+
+ basic_check(latches, level, SYNC_IBUF_BITMAP - 1);
+
+ } else if (!srv_is_being_started) {
+
+ /* This is violated during trx_sys_create_rsegs()
+ when creating additional rollback segments during
+ upgrade. */
+
+ basic_check(latches, level, SYNC_IBUF_BITMAP);
+ }
+ break;
+
+ case SYNC_FSP_PAGE:
+ ut_a(find(latches, SYNC_FSP) != 0);
+ break;
+
+ case SYNC_FSP:
+
+ ut_a(find(latches, SYNC_FSP) != 0
+ || basic_check(latches, level, SYNC_FSP));
+ break;
+
+ case SYNC_TRX_UNDO_PAGE:
+
+ /* Purge is allowed to read in as many UNDO pages as it likes.
+ The purge thread can read the UNDO pages without any covering
+ mutex. */
+
+ ut_a(find(latches, SYNC_REDO_RSEG) != 0
+ || find(latches, SYNC_NOREDO_RSEG) != 0
+ || basic_check(latches, level, level - 1));
+ break;
+
+ case SYNC_RSEG_HEADER:
+
+ ut_a(find(latches, SYNC_REDO_RSEG) != 0
+ || find(latches, SYNC_NOREDO_RSEG) != 0);
+ break;
+
+ case SYNC_RSEG_HEADER_NEW:
+
+ ut_a(find(latches, SYNC_FSP_PAGE) != 0);
+ break;
+
+ case SYNC_TREE_NODE:
+
+ ut_a(find(latches, SYNC_FSP) == &fil_system.temp_space->latch
+ || find(latches, SYNC_INDEX_TREE)
+ || find(latches, SYNC_DICT_OPERATION)
+ || basic_check(latches, level, SYNC_TREE_NODE - 1));
+ break;
+
+ case SYNC_TREE_NODE_NEW:
+
+ ut_a(find(latches, SYNC_FSP_PAGE) != 0);
+ break;
+
+ case SYNC_INDEX_TREE:
+
+ basic_check(latches, level, SYNC_TREE_NODE - 1);
+ break;
+
+ case SYNC_IBUF_TREE_NODE:
+
+ ut_a(find(latches, SYNC_IBUF_INDEX_TREE) != 0
+ || basic_check(latches, level, SYNC_IBUF_TREE_NODE - 1));
+ break;
+
+ case SYNC_IBUF_TREE_NODE_NEW:
+
+ /* ibuf_add_free_page() allocates new pages for the change
+ buffer while only holding the tablespace x-latch. These
+ pre-allocated new pages may only be used while holding
+ ibuf_mutex, in btr_page_alloc_for_ibuf(). */
+
+ ut_a(find(latches, SYNC_IBUF_MUTEX) != 0
+ || find(latches, SYNC_FSP) != 0);
+ break;
+
+ case SYNC_IBUF_INDEX_TREE:
+
+ if (find(latches, SYNC_FSP) != 0) {
+ basic_check(latches, level, level - 1);
+ } else {
+ basic_check(latches, level, SYNC_IBUF_TREE_NODE - 1);
+ }
+ break;
+
+ case SYNC_IBUF_PESS_INSERT_MUTEX:
+
+ basic_check(latches, level, SYNC_FSP - 1);
+ ut_a(find(latches, SYNC_IBUF_MUTEX) == 0);
+ break;
+
+ case SYNC_IBUF_HEADER:
+
+ basic_check(latches, level, SYNC_FSP - 1);
+ ut_a(find(latches, SYNC_IBUF_MUTEX) == NULL);
+ ut_a(find(latches, SYNC_IBUF_PESS_INSERT_MUTEX) == NULL);
+ break;
+
+ case SYNC_DICT:
+ basic_check(latches, level, SYNC_DICT);
+ break;
+
+ case SYNC_MUTEX:
+ case SYNC_UNKNOWN:
+ case SYNC_LEVEL_VARYING:
+ case RW_LOCK_X:
+ case RW_LOCK_X_WAIT:
+ case RW_LOCK_S:
+ case RW_LOCK_SX:
+ case RW_LOCK_NOT_LOCKED:
+ /* These levels should never be set for a latch. */
+ ut_error;
+ break;
+ }
+
+ return(latches);
+}
+
+/** Removes a latch from the thread level array if it is found there.
+It is not an error if the latch is not found, as we presently are not
+able to determine the level for every latch reservation the program
+does.
+@param[in] latch latch that was released/unlocked */
+void
+LatchDebug::unlock(const latch_t* latch)
+ UNIV_NOTHROW
+{
+ if (latch->get_level() == SYNC_LEVEL_VARYING) {
+ // We don't have varying level mutexes
+ ut_ad(latch->m_rw_lock);
+ }
+
+ Latches* latches;
+
+ if (*latch->get_name() == '.') {
+
+ /* Ignore diagnostic latches, starting with '.' */
+
+ } else if ((latches = thread_latches()) != NULL) {
+
+ Latches::reverse_iterator rend = latches->rend();
+
+ for (Latches::reverse_iterator it = latches->rbegin();
+ it != rend;
+ ++it) {
+
+ if (it->m_latch != latch) {
+
+ continue;
+ }
+
+ Latches::iterator i = it.base();
+
+ latches->erase(--i);
+
+ /* If this thread doesn't own any more
+ latches, remove its entry from the map.
+
+ FIXME: Perhaps use the master thread
+ to do purge. Or, do it from close connection.
+ This could be expensive. */
+
+ if (latches->empty()) {
+
+ m_mutex.enter();
+
+ os_thread_id_t thread_id;
+
+ thread_id = os_thread_get_curr_id();
+
+ m_threads.erase(thread_id);
+
+ m_mutex.exit();
+
+ UT_DELETE(latches);
+ }
+
+ return;
+ }
+
+ if (latch->get_level() != SYNC_LEVEL_VARYING) {
+ ib::error()
+ << "Couldn't find latch "
+ << sync_latch_get_name(latch->get_id());
+
+ print_latches(latches);
+
+ /* Must find the latch. */
+ ut_error;
+ }
+ }
+}
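+
+/* The erase step above leans on a C++ subtlety: for a
+reverse_iterator 'it', it.base() points one element past the element
+that 'it' refers to, so the element itself must be erased via --i. A
+standalone sketch (assumption: std::vector<int> stands in for
+Latches): */
+#if 0
+std::vector<int> v = {1, 2, 3};
+std::vector<int>::reverse_iterator rit = v.rbegin(); /* refers to 3 */
+std::vector<int>::iterator i = rit.base(); /* points one past 3 */
+v.erase(--i); /* erases 3, not 2 */
+#endif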
+
+/** Get the latch id from a latch name.
+@param[in] name Latch name
+@return latch id if found else LATCH_ID_NONE. */
+latch_id_t
+sync_latch_get_id(const char* name)
+{
+ LatchMetaData::const_iterator end = latch_meta.end();
+
+ /* Linear scan should be OK, this should be extremely rare. */
+
+ for (LatchMetaData::const_iterator it = latch_meta.begin();
+ it != end;
+ ++it) {
+
+ if (*it == NULL || (*it)->get_id() == LATCH_ID_NONE) {
+
+ continue;
+
+ } else if (strcmp((*it)->get_name(), name) == 0) {
+
+ return((*it)->get_id());
+ }
+ }
+
+ return(LATCH_ID_NONE);
+}
+
+/** Get the latch name from a sync level
+@param[in] level Latch level to lookup
+@return latch name, or NULL if not found. */
+const char*
+sync_latch_get_name(latch_level_t level)
+{
+ LatchMetaData::const_iterator end = latch_meta.end();
+
+ /* Linear scan should be OK, this should be extremely rare. */
+
+ for (LatchMetaData::const_iterator it = latch_meta.begin();
+ it != end;
+ ++it) {
+
+ if (*it == NULL || (*it)->get_id() == LATCH_ID_NONE) {
+
+ continue;
+
+ } else if ((*it)->get_level() == level) {
+
+ return((*it)->get_name());
+ }
+ }
+
+ return(NULL);
+}
+
+/** Check if it is OK to acquire the latch.
+@param[in] latch latch type */
+void
+sync_check_lock_validate(const latch_t* latch)
+{
+ if (LatchDebug::instance() != NULL) {
+ LatchDebug::instance()->lock_validate(
+ latch, latch->get_level());
+ }
+}
+
+/** Note that the lock has been granted
+@param[in] latch latch type */
+void
+sync_check_lock_granted(const latch_t* latch)
+{
+ if (LatchDebug::instance() != NULL) {
+ LatchDebug::instance()->lock_granted(latch, latch->get_level());
+ }
+}
+
+/** Check if it is OK to acquire the latch.
+@param[in] latch latch type
+@param[in] level Latch level */
+void
+sync_check_lock(
+ const latch_t* latch,
+ latch_level_t level)
+{
+ if (LatchDebug::instance() != NULL) {
+
+ ut_ad(latch->get_level() == SYNC_LEVEL_VARYING);
+ ut_ad(latch->get_id() == LATCH_ID_BUF_BLOCK_LOCK);
+
+ LatchDebug::instance()->lock_validate(latch, level);
+ LatchDebug::instance()->lock_granted(latch, level);
+ }
+}
+
+/** Check if it is OK to re-acquire the lock.
+@param[in] latch RW-LOCK to relock (recursive X locks) */
+void
+sync_check_relock(const latch_t* latch)
+{
+ if (LatchDebug::instance() != NULL) {
+ LatchDebug::instance()->relock(latch);
+ }
+}
+
+/** Removes a latch from the thread level array if it is found there.
+@param[in] latch The latch to unlock */
+void
+sync_check_unlock(const latch_t* latch)
+{
+ if (LatchDebug::instance() != NULL) {
+ LatchDebug::instance()->unlock(latch);
+ }
+}
+
+/** Checks if the level array for the current thread contains a
+mutex or rw-latch at the specified level.
+@param[in] level to find
+@return a matching latch, or NULL if not found */
+const latch_t*
+sync_check_find(latch_level_t level)
+{
+ if (LatchDebug::instance() != NULL) {
+ return(LatchDebug::instance()->find(level));
+ }
+
+ return(NULL);
+}
+
+/** Iterate over the thread's latches.
+@param[in,out] functor called for each element.
+@return true if the functor returns true for any element */
+bool
+sync_check_iterate(const sync_check_functor_t& functor)
+{
+ if (LatchDebug* debug = LatchDebug::instance()) {
+ return(debug->for_each(functor));
+ }
+
+ return(false);
+}
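+
+/* A hedged usage sketch for sync_check_iterate(). The functor below
+is hypothetical; the real sync_check_functor_t interface is declared
+in sync0types.h and is assumed here only to take a latch level and
+return true on a match. */
+#if 0
+struct find_any_latch : public sync_check_functor_t {
+	bool operator()(const latch_level_t level) const override
+	{
+		return(level != SYNC_NO_ORDER_CHECK);
+	}
+};
+
+/* Assert that this thread holds no order-checked latches. */
+ut_ad(!sync_check_iterate(find_any_latch()));
+#endif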
+
+/** Enable sync order checking.
+
+Note: We don't enforce any synchronisation checks. The caller must ensure
+that no races can occur */
+static void sync_check_enable()
+{
+ if (!srv_sync_debug) {
+
+ return;
+ }
+
+ /* We should always call this before we create threads. */
+
+ LatchDebug::create_instance();
+}
+
+/** Initialise the debug data structures */
+void
+LatchDebug::init()
+ UNIV_NOTHROW
+{
+ mutex_create(LATCH_ID_RW_LOCK_DEBUG, &rw_lock_debug_mutex);
+}
+
+/** Shutdown the latch debug checking
+
+Note: We don't enforce any synchronisation checks. The caller must ensure
+that no races can occur */
+void
+LatchDebug::shutdown()
+ UNIV_NOTHROW
+{
+ mutex_free(&rw_lock_debug_mutex);
+
+ ut_a(s_initialized);
+
+ s_initialized = false;
+
+ UT_DELETE(s_instance);
+
+ LatchDebug::s_instance = NULL;
+}
+
+/** Acquires the debug mutex. We cannot use the mutex defined in sync0sync,
+because the debug mutex is also acquired in sync0arr while holding the OS
+mutex protecting the sync array, and the ordinary mutex_enter might
+recursively call routines in sync0arr, leading to a deadlock on the OS
+mutex. */
+void
+rw_lock_debug_mutex_enter()
+{
+ mutex_enter(&rw_lock_debug_mutex);
+}
+
+/** Releases the debug mutex. */
+void
+rw_lock_debug_mutex_exit()
+{
+ mutex_exit(&rw_lock_debug_mutex);
+}
+#endif /* UNIV_DEBUG */
+
+/* Meta data for all the InnoDB latches. If a latch is not recorded
+here then it will not be considered for deadlock checks. */
+LatchMetaData latch_meta;
+
+/** Load the latch meta data. */
+static
+void
+sync_latch_meta_init()
+ UNIV_NOTHROW
+{
+ latch_meta.resize(LATCH_ID_MAX + 1);
+
+ /* The latches should be ordered on latch_id_t, so that we can
+ index directly into the vector to update and fetch meta-data. */
+
+ LATCH_ADD_MUTEX(DICT_FOREIGN_ERR, SYNC_NO_ORDER_CHECK,
+ dict_foreign_err_mutex_key);
+
+ LATCH_ADD_MUTEX(DICT_SYS, SYNC_DICT, dict_sys_mutex_key);
+
+ LATCH_ADD_MUTEX(FIL_SYSTEM, SYNC_ANY_LATCH, fil_system_mutex_key);
+
+ LATCH_ADD_MUTEX(FTS_DELETE, SYNC_FTS_OPTIMIZE, fts_delete_mutex_key);
+
+ LATCH_ADD_MUTEX(FTS_DOC_ID, SYNC_FTS_OPTIMIZE, fts_doc_id_mutex_key);
+
+ LATCH_ADD_MUTEX(FTS_PLL_TOKENIZE, SYNC_FTS_TOKENIZE,
+ fts_pll_tokenize_mutex_key);
+
+ LATCH_ADD_MUTEX(IBUF_BITMAP, SYNC_IBUF_BITMAP_MUTEX,
+ ibuf_bitmap_mutex_key);
+
+ LATCH_ADD_MUTEX(IBUF, SYNC_IBUF_MUTEX, ibuf_mutex_key);
+
+ LATCH_ADD_MUTEX(IBUF_PESSIMISTIC_INSERT, SYNC_IBUF_PESS_INSERT_MUTEX,
+ ibuf_pessimistic_insert_mutex_key);
+
+ LATCH_ADD_MUTEX(PURGE_SYS_PQ, SYNC_PURGE_QUEUE,
+ purge_sys_pq_mutex_key);
+
+ LATCH_ADD_MUTEX(RECALC_POOL, SYNC_STATS_AUTO_RECALC,
+ recalc_pool_mutex_key);
+
+ LATCH_ADD_MUTEX(RECV_SYS, SYNC_RECV, recv_sys_mutex_key);
+
+ LATCH_ADD_MUTEX(REDO_RSEG, SYNC_REDO_RSEG, redo_rseg_mutex_key);
+
+ LATCH_ADD_MUTEX(NOREDO_RSEG, SYNC_NOREDO_RSEG, noredo_rseg_mutex_key);
+
+#ifdef UNIV_DEBUG
+ /* Mutex names starting with '.' are not tracked. They are assumed
+ to be diagnostic mutexes used in debugging. */
+ latch_meta[LATCH_ID_RW_LOCK_DEBUG] =
+ LATCH_ADD_MUTEX(RW_LOCK_DEBUG,
+ SYNC_NO_ORDER_CHECK,
+ rw_lock_debug_mutex_key);
+#endif /* UNIV_DEBUG */
+
+ LATCH_ADD_MUTEX(RTR_ACTIVE_MUTEX, SYNC_ANY_LATCH,
+ rtr_active_mutex_key);
+
+ LATCH_ADD_MUTEX(RTR_MATCH_MUTEX, SYNC_ANY_LATCH, rtr_match_mutex_key);
+
+ LATCH_ADD_MUTEX(RTR_PATH_MUTEX, SYNC_ANY_LATCH, rtr_path_mutex_key);
+
+ LATCH_ADD_MUTEX(RW_LOCK_LIST, SYNC_NO_ORDER_CHECK,
+ rw_lock_list_mutex_key);
+
+ LATCH_ADD_MUTEX(SRV_INNODB_MONITOR, SYNC_NO_ORDER_CHECK,
+ srv_innodb_monitor_mutex_key);
+
+ LATCH_ADD_MUTEX(SRV_MISC_TMPFILE, SYNC_ANY_LATCH,
+ srv_misc_tmpfile_mutex_key);
+
+ LATCH_ADD_MUTEX(SRV_MONITOR_FILE, SYNC_NO_ORDER_CHECK,
+ srv_monitor_file_mutex_key);
+
+ LATCH_ADD_MUTEX(TRX_POOL, SYNC_POOL, trx_pool_mutex_key);
+
+ LATCH_ADD_MUTEX(TRX_POOL_MANAGER, SYNC_POOL_MANAGER,
+ trx_pool_manager_mutex_key);
+
+ LATCH_ADD_MUTEX(TRX, SYNC_TRX, trx_mutex_key);
+
+ LATCH_ADD_MUTEX(LOCK_SYS, SYNC_LOCK_SYS, lock_mutex_key);
+
+ LATCH_ADD_MUTEX(LOCK_SYS_WAIT, SYNC_LOCK_WAIT_SYS,
+ lock_wait_mutex_key);
+
+ LATCH_ADD_MUTEX(TRX_SYS, SYNC_TRX_SYS, trx_sys_mutex_key);
+
+ LATCH_ADD_MUTEX(SRV_SYS_TASKS, SYNC_ANY_LATCH, srv_threads_mutex_key);
+
+ LATCH_ADD_MUTEX(PAGE_ZIP_STAT_PER_INDEX, SYNC_ANY_LATCH,
+ page_zip_stat_per_index_mutex_key);
+
+ LATCH_ADD_MUTEX(SYNC_ARRAY_MUTEX, SYNC_NO_ORDER_CHECK,
+ sync_array_mutex_key);
+
+ LATCH_ADD_MUTEX(ROW_DROP_LIST, SYNC_NO_ORDER_CHECK,
+ row_drop_list_mutex_key);
+
+ LATCH_ADD_MUTEX(INDEX_ONLINE_LOG, SYNC_INDEX_ONLINE_LOG,
+ index_online_log_key);
+
+ LATCH_ADD_MUTEX(WORK_QUEUE, SYNC_WORK_QUEUE, PFS_NOT_INSTRUMENTED);
+
+ // Add the RW locks
+ LATCH_ADD_RWLOCK(BTR_SEARCH, SYNC_SEARCH_SYS, btr_search_latch_key);
+
+ LATCH_ADD_RWLOCK(BUF_BLOCK_LOCK, SYNC_LEVEL_VARYING,
+ PFS_NOT_INSTRUMENTED);
+
+#ifdef UNIV_DEBUG
+ LATCH_ADD_RWLOCK(BUF_BLOCK_DEBUG, SYNC_LEVEL_VARYING,
+ PFS_NOT_INSTRUMENTED);
+#endif /* UNIV_DEBUG */
+
+ LATCH_ADD_RWLOCK(DICT_OPERATION, SYNC_DICT_OPERATION,
+ dict_operation_lock_key);
+
+ LATCH_ADD_RWLOCK(FIL_SPACE, SYNC_FSP, fil_space_latch_key);
+
+ LATCH_ADD_RWLOCK(FTS_CACHE, SYNC_FTS_CACHE, fts_cache_rw_lock_key);
+
+ LATCH_ADD_RWLOCK(FTS_CACHE_INIT, SYNC_FTS_CACHE_INIT,
+ fts_cache_init_rw_lock_key);
+
+ LATCH_ADD_RWLOCK(TRX_I_S_CACHE, SYNC_TRX_I_S_RWLOCK,
+ trx_i_s_cache_lock_key);
+
+ LATCH_ADD_RWLOCK(TRX_PURGE, SYNC_PURGE_LATCH, trx_purge_latch_key);
+
+ LATCH_ADD_RWLOCK(IBUF_INDEX_TREE, SYNC_IBUF_INDEX_TREE,
+ index_tree_rw_lock_key);
+
+ LATCH_ADD_RWLOCK(INDEX_TREE, SYNC_INDEX_TREE, index_tree_rw_lock_key);
+
+ /* JAN: TODO: Add PFS instrumentation */
+ LATCH_ADD_MUTEX(DEFRAGMENT_MUTEX, SYNC_NO_ORDER_CHECK,
+ PFS_NOT_INSTRUMENTED);
+ LATCH_ADD_MUTEX(BTR_DEFRAGMENT_MUTEX, SYNC_NO_ORDER_CHECK,
+ PFS_NOT_INSTRUMENTED);
+ LATCH_ADD_MUTEX(FIL_CRYPT_STAT_MUTEX, SYNC_NO_ORDER_CHECK,
+ PFS_NOT_INSTRUMENTED);
+ LATCH_ADD_MUTEX(FIL_CRYPT_DATA_MUTEX, SYNC_NO_ORDER_CHECK,
+ PFS_NOT_INSTRUMENTED);
+ LATCH_ADD_MUTEX(FIL_CRYPT_THREADS_MUTEX, SYNC_NO_ORDER_CHECK,
+ PFS_NOT_INSTRUMENTED);
+ LATCH_ADD_MUTEX(RW_TRX_HASH_ELEMENT, SYNC_RW_TRX_HASH_ELEMENT,
+ rw_trx_hash_element_mutex_key);
+ LATCH_ADD_MUTEX(READ_VIEW, SYNC_READ_VIEW, read_view_mutex_key);
+
+ latch_id_t id = LATCH_ID_NONE;
+
+ /* The array should be ordered on latch ID. We need to
+ index directly into it from the mutex policy to update
+ the counters and access the meta-data. */
+
+ for (LatchMetaData::iterator it = latch_meta.begin();
+ it != latch_meta.end();
+ ++it) {
+
+ const latch_meta_t* meta = *it;
+
+ /* Skip blank entries */
+ if (meta == NULL || meta->get_id() == LATCH_ID_NONE) {
+ continue;
+ }
+
+ ut_a(id < meta->get_id());
+
+ id = meta->get_id();
+ }
+}
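+
+/* Because latch_meta is ordered and indexed by latch_id_t (as the
+verification loop above checks), metadata lookup is a direct vector
+access rather than a scan. A sketch of the access pattern the mutex
+policy relies on: */
+#if 0
+latch_meta_t* meta = latch_meta[LATCH_ID_DICT_SYS];
+
+ut_ad(meta->get_id() == LATCH_ID_DICT_SYS);
+meta->get_counter()->reset(); /* O(1) access, no search */
+#endif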
+
+/** Destroy the latch meta data */
+static
+void
+sync_latch_meta_destroy()
+{
+ for (LatchMetaData::iterator it = latch_meta.begin();
+ it != latch_meta.end();
+ ++it) {
+
+ UT_DELETE(*it);
+ }
+
+ latch_meta.clear();
+}
+
+/** Initializes the synchronization data structures. */
+void
+sync_check_init()
+{
+ ut_ad(!LatchDebug::s_initialized);
+ ut_d(LatchDebug::s_initialized = true);
+
+ sync_latch_meta_init();
+
+ /* create the mutex to protect rw_lock list. */
+
+ mutex_create(LATCH_ID_RW_LOCK_LIST, &rw_lock_list_mutex);
+
+ ut_d(LatchDebug::init());
+
+ sync_array_init();
+
+ ut_d(sync_check_enable());
+}
+
+/** Free the InnoDB synchronization data structures. */
+void
+sync_check_close()
+{
+ ut_d(LatchDebug::shutdown());
+
+ mutex_free(&rw_lock_list_mutex);
+
+ sync_array_close();
+
+ sync_latch_meta_destroy();
+}
+
diff --git a/storage/innobase/sync/sync0rw.cc b/storage/innobase/sync/sync0rw.cc
new file mode 100644
index 00000000..2624ffb9
--- /dev/null
+++ b/storage/innobase/sync/sync0rw.cc
@@ -0,0 +1,1216 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0rw.cc
+The read-write lock (for thread synchronization)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0rw.h"
+#include "my_cpu.h"
+#include <my_sys.h>
+
+/*
+ IMPLEMENTATION OF THE RW_LOCK
+ =============================
+The status of a rw_lock is held in lock_word. The initial value of lock_word is
+X_LOCK_DECR. lock_word is decremented by 1 for each s-lock and by X_LOCK_DECR
+or 1 for each x-lock. This describes the lock state for each value of lock_word:
+
+lock_word == X_LOCK_DECR: Unlocked.
+X_LOCK_HALF_DECR < lock_word < X_LOCK_DECR:
+ S locked, no waiting writers.
+ (X_LOCK_DECR - lock_word) is the number
+ of S locks.
+lock_word == X_LOCK_HALF_DECR: SX locked, no waiting writers.
+0 < lock_word < X_LOCK_HALF_DECR:
+ SX locked AND S locked, no waiting writers.
+ (X_LOCK_HALF_DECR - lock_word) is the number
+ of S locks.
+lock_word == 0: X locked, no waiting writers.
+-X_LOCK_HALF_DECR < lock_word < 0:
+ S locked, with a waiting writer.
+ (-lock_word) is the number of S locks.
+lock_word == -X_LOCK_HALF_DECR: X locked and SX locked, no waiting writers.
+-X_LOCK_DECR < lock_word < -X_LOCK_HALF_DECR:
+ S locked, with a waiting writer
+ which has SX lock.
+ -(lock_word + X_LOCK_HALF_DECR) is the number
+ of S locks.
+lock_word == -X_LOCK_DECR: X locked with recursive X lock (2 X locks).
+-(X_LOCK_DECR + X_LOCK_HALF_DECR) < lock_word < -X_LOCK_DECR:
+ X locked. The number of the X locks is:
+ 2 - (lock_word + X_LOCK_DECR)
+lock_word == -(X_LOCK_DECR + X_LOCK_HALF_DECR):
+ X locked with recursive X lock (2 X locks)
+ and SX locked.
+lock_word < -(X_LOCK_DECR + X_LOCK_HALF_DECR):
+ X locked and SX locked.
+ The number of the X locks is:
+ 2 - (lock_word + X_LOCK_DECR + X_LOCK_HALF_DECR)
+
+ LOCK COMPATIBILITY MATRIX
+
+ | S|SX| X|
+ --+--+--+--+
+ S| +| +| -|
+ --+--+--+--+
+ SX| +| -| -|
+ --+--+--+--+
+ X| -| -| -|
+ --+--+--+--+
+
+The lock_word is always read and updated atomically and consistently, so that
+it always represents the state of the lock, and the state of the lock changes
+with a single atomic operation. This lock_word holds all of the information
+that a thread needs in order to determine if it is eligible to gain the lock
+or if it must spin or sleep. The one exception to this is that writer_thread
+must be verified before recursive write locks: to solve this scenario, we make
+writer_thread readable by all threads, but only writeable by the x-lock or
+sx-lock holder.
+
+The other members of the lock obey the following rules to remain consistent:
+
+writer_thread: Is used only in recursive x-locking or sx-locking.
+ This field is 0 at lock creation time and is updated
+ when x-lock is acquired or when move_ownership is called.
+ A thread is only allowed to set the value of this field to
+ its own thread id, i.e., a thread cannot set writer_thread to
+ some other thread's id.
+waiters: May be set to 1 anytime, but to avoid unnecessary wake-up
+ signals, it should only be set to 1 when there are threads
+ waiting on event. Must be 1 when a writer starts waiting to
+ ensure the current x-locking thread sends a wake-up signal
+ during unlock. May only be reset to 0 immediately before
+ a wake-up signal is sent to event. On most platforms, a
+ memory barrier is required after waiters is set, and before
+ verifying lock_word is still held, to ensure some unlocker
+ really does see the flag's new value.
+event: Threads wait on event for read or writer lock when another
+ thread has an x-lock or an x-lock reservation (wait_ex). A
+ thread may only wait on event after performing the following
+ actions in order:
+ (1) Record the counter value of event (with os_event_reset).
+ (2) Set waiters to 1.
+ (3) Verify lock_word <= 0.
+ (1) must come before (2) to ensure signal is not missed.
+ (2) must come before (3) to ensure a signal is sent.
+ These restrictions force the above ordering.
+ Immediately before sending the wake-up signal, we should:
+ (1) Verify lock_word == X_LOCK_DECR (unlocked)
+ (2) Reset waiters to 0.
+wait_ex_event: A thread may only wait on the wait_ex_event after it has
+ performed the following actions in order:
+ (1) Decrement lock_word by X_LOCK_DECR.
+ (2) Record counter value of wait_ex_event (os_event_reset,
+ called from sync_array_reserve_cell).
+ (3) Verify that lock_word < 0.
+ (1) must come first to ensure that no other thread becomes a
+ reader or the next writer, and to notify the unlocker that a
+ signal must be sent.
+ (2) must come before (3) to ensure the signal is not missed.
+ These restrictions force the above ordering.
+ Immediately before sending the wake-up signal, we should:
+ Verify lock_word == 0 (waiting thread holds x_lock)
+*/
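+
+/* A sketch (not part of the build) that decodes a lock_word value
+according to the state table above; handy when reading a core dump.
+It assumes only the X_LOCK_DECR and X_LOCK_HALF_DECR constants from
+sync0rw.h. */
+#if 0
+static const char* rw_lock_decode_sketch(int32_t lock_word)
+{
+	if (lock_word == X_LOCK_DECR) return("unlocked");
+	if (lock_word > X_LOCK_HALF_DECR) return("S locked");
+	if (lock_word == X_LOCK_HALF_DECR) return("SX locked");
+	if (lock_word > 0) return("SX + S locked");
+	if (lock_word == 0) return("X locked");
+	if (lock_word > -X_LOCK_HALF_DECR)
+		return("S locked, waiting writer");
+	if (lock_word == -X_LOCK_HALF_DECR) return("X + SX locked");
+	if (lock_word > -X_LOCK_DECR)
+		return("S locked, waiting SX writer");
+	return("recursive X locked (possibly + SX)");
+}
+#endif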
+
+rw_lock_stats_t rw_lock_stats;
+
+/* The global list of rw-locks */
+ilist<rw_lock_t> rw_lock_list;
+ib_mutex_t rw_lock_list_mutex;
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Creates a debug info struct. */
+static
+rw_lock_debug_t*
+rw_lock_debug_create(void);
+/*======================*/
+/******************************************************************//**
+Frees a debug info struct. */
+static
+void
+rw_lock_debug_free(
+/*===============*/
+ rw_lock_debug_t* info);
+
+/******************************************************************//**
+Creates a debug info struct.
+@return own: debug info struct */
+static
+rw_lock_debug_t*
+rw_lock_debug_create(void)
+/*======================*/
+{
+ return((rw_lock_debug_t*) ut_malloc_nokey(sizeof(rw_lock_debug_t)));
+}
+
+/******************************************************************//**
+Frees a debug info struct. */
+static
+void
+rw_lock_debug_free(
+/*===============*/
+ rw_lock_debug_t* info)
+{
+ ut_free(info);
+}
+#endif /* UNIV_DEBUG */
+
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+void
+rw_lock_create_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+ latch_level_t level, /*!< in: level */
+#endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ unsigned cline) /*!< in: file line where created */
+{
+#if defined(UNIV_DEBUG) && !defined(UNIV_PFS_RWLOCK)
+ /* It should have been created in pfs_rw_lock_create_func() */
+ new(lock) rw_lock_t();
+#endif /* UNIV_DEBUG */
+
+ lock->lock_word = X_LOCK_DECR;
+ lock->waiters = 0;
+
+ lock->sx_recursive = 0;
+ lock->writer_thread= 0;
+
+#ifdef UNIV_DEBUG
+ lock->m_rw_lock = true;
+
+ UT_LIST_INIT(lock->debug_list, &rw_lock_debug_t::list);
+
+ lock->m_id = sync_latch_get_id(sync_latch_get_name(level));
+ ut_a(lock->m_id != LATCH_ID_NONE);
+
+ lock->level = level;
+#endif /* UNIV_DEBUG */
+
+ lock->cfile_name = cfile_name;
+
+ /* This should hold in practice. If it doesn't, then we need to
+ split the source file anyway, or create the locks on lines
+ less than 8192. cline is an unsigned:13 bitfield. */
+ ut_ad(cline <= ((1U << 13) - 1));
+ lock->cline = cline & ((1U << 13) - 1);
+ lock->count_os_wait = 0;
+ lock->last_x_file_name = "not yet reserved";
+ lock->last_x_line = 0;
+ lock->event = os_event_create(0);
+ lock->wait_ex_event = os_event_create(0);
+
+ lock->is_block_lock = 0;
+
+ ut_d(lock->created = true);
+
+ mutex_enter(&rw_lock_list_mutex);
+ rw_lock_list.push_front(*lock);
+ mutex_exit(&rw_lock_list_mutex);
+}
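+
+/* Callers are expected to go through the rw_lock_create() macro
+rather than call rw_lock_create_func() directly; the macro supplies
+__FILE__/__LINE__ (and the PFS key when instrumentation is enabled).
+A representative call site, lightly hedged since the surrounding code
+is illustrative: */
+#if 0
+rw_lock_t latch;
+
+rw_lock_create(fil_space_latch_key, &latch, SYNC_FSP);
+/* ... use rw_lock_s_lock()/rw_lock_x_lock() on &latch ... */
+rw_lock_free(&latch);
+#endif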
+
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the rw-lock is freed. Removes an rw-lock object from the global list. The
+rw-lock is checked to be in the non-locked state. */
+void
+rw_lock_free_func(
+/*==============*/
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ ut_ad(rw_lock_validate(lock));
+ ut_a(lock->lock_word == X_LOCK_DECR);
+
+ ut_d(lock->created = false);
+
+ mutex_enter(&rw_lock_list_mutex);
+
+ os_event_destroy(lock->event);
+
+ os_event_destroy(lock->wait_ex_event);
+
+ rw_lock_list.remove(*lock);
+
+ mutex_exit(&rw_lock_list_mutex);
+}
+
+/******************************************************************//**
+Lock an rw-lock in shared mode for the current thread. If the rw-lock is
+locked in exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by srv_n_spin_wait_rounds), waiting
+for the lock, before suspending the thread. */
+void
+rw_lock_s_lock_spin(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock
+ will be passed to another thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ unsigned line) /*!< in: line where requested */
+{
+ ulint i = 0; /* spin round count */
+ sync_array_t* sync_arr;
+ lint spin_count = 0;
+ int64_t count_os_wait = 0;
+
+ /* We reuse the thread id to index into the counter, cache
+ it here for efficiency. */
+
+ ut_ad(rw_lock_validate(lock));
+
+ rw_lock_stats.rw_s_spin_wait_count.inc();
+
+lock_loop:
+
+ /* Spin waiting for the writer field to become free */
+ HMT_low();
+ ulint j = i;
+ while (i < srv_n_spin_wait_rounds &&
+ lock->lock_word <= 0) {
+ ut_delay(srv_spin_wait_delay);
+ i++;
+ }
+
+ HMT_medium();
+ if (i >= srv_n_spin_wait_rounds) {
+ os_thread_yield();
+ }
+
+ spin_count += lint(i - j);
+
+ /* We try once again to obtain the lock */
+ if (rw_lock_s_lock_low(lock, pass, file_name, line)) {
+
+ if (count_os_wait > 0) {
+ lock->count_os_wait +=
+ static_cast<uint32_t>(count_os_wait);
+ rw_lock_stats.rw_s_os_wait_count.add(count_os_wait);
+ }
+
+ rw_lock_stats.rw_s_spin_round_count.add(spin_count);
+
+ return; /* Success */
+ } else {
+
+ if (i < srv_n_spin_wait_rounds) {
+ goto lock_loop;
+ }
+
+
+ ++count_os_wait;
+
+ sync_cell_t* cell;
+
+ sync_arr = sync_array_get_and_reserve_cell(
+ lock, RW_LOCK_S, file_name, line, &cell);
+
+ /* Set waiters before checking lock_word to ensure wake-up
+ signal is sent. This may lead to some unnecessary signals. */
+ lock->waiters.exchange(1, std::memory_order_acquire);
+
+ if (rw_lock_s_lock_low(lock, pass, file_name, line)) {
+
+ sync_array_free_cell(sync_arr, cell);
+
+ if (count_os_wait > 0) {
+
+ lock->count_os_wait +=
+ static_cast<uint32_t>(count_os_wait);
+
+ rw_lock_stats.rw_s_os_wait_count.add(
+ count_os_wait);
+ }
+
+ rw_lock_stats.rw_s_spin_round_count.add(spin_count);
+
+ return; /* Success */
+ }
+
+ /* See the comments in trx_commit_low() around the DEBUG_SYNC
+ point "before_trx_state_committed_in_memory", which explain
+ why care is needed when invoking the following sync check. */
+#ifndef DBUG_OFF
+#ifdef UNIV_DEBUG
+ if (lock->get_level() != SYNC_DICT_OPERATION) {
+ DEBUG_SYNC_C("rw_s_lock_waiting");
+ }
+#endif
+#endif
+ sync_array_wait_event(sync_arr, cell);
+
+ i = 0;
+
+ goto lock_loop;
+ }
+}
+
+/******************************************************************//**
+This function is used in the insert buffer to move the ownership of an
+x-latch on a buffer frame to the current thread. The x-latch was set by
+the buffer read operation and it protected the buffer frame while the
+read was done. The ownership is moved because we want that the current
+thread is able to acquire a second x-latch which is stored in an mtr.
+This, in turn, is needed to pass the debug checks of index page
+operations. */
+void
+rw_lock_x_lock_move_ownership(
+/*==========================*/
+ rw_lock_t* lock) /*!< in: lock which was x-locked in the
+ buffer read */
+{
+ ut_ad(rw_lock_is_locked(lock, RW_LOCK_X));
+
+ lock->writer_thread = os_thread_get_curr_id();
+}
+
+/******************************************************************//**
+Function for the next writer to call. Waits for readers to exit.
+The caller must have already decremented lock_word by X_LOCK_DECR. */
+UNIV_INLINE
+void
+rw_lock_x_lock_wait_func(
+/*=====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+#ifdef UNIV_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+#endif
+ lint threshold,/*!< in: threshold to wait for */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line) /*!< in: line where requested */
+{
+ ulint i = 0;
+ lint n_spins = 0;
+ sync_array_t* sync_arr;
+ int64_t count_os_wait = 0;
+
+ ut_ad(lock->lock_word <= threshold);
+
+ HMT_low();
+ while (lock->lock_word < threshold) {
+ ut_delay(srv_spin_wait_delay);
+
+ if (i < srv_n_spin_wait_rounds) {
+ i++;
+ continue;
+ }
+
+ /* If there is still a reader, then go to sleep.*/
+ n_spins += i;
+
+ sync_cell_t* cell;
+
+ sync_arr = sync_array_get_and_reserve_cell(
+ lock, RW_LOCK_X_WAIT, file_name, line, &cell);
+
+ i = 0;
+
+ /* Check lock_word to ensure wake-up isn't missed.*/
+ if (lock->lock_word < threshold) {
+ ++count_os_wait;
+
+ /* Add debug info as it is needed to detect possible
+ deadlock. We must add info for WAIT_EX thread for
+ deadlock detection to work properly. */
+ ut_d(rw_lock_add_debug_info(
+ lock, pass, RW_LOCK_X_WAIT,
+ file_name, line));
+
+ sync_array_wait_event(sync_arr, cell);
+
+ ut_d(rw_lock_remove_debug_info(
+ lock, pass, RW_LOCK_X_WAIT));
+
+ /* It is possible to wake when lock_word < 0.
+ We must pass the while-loop check to proceed.*/
+
+ } else {
+ sync_array_free_cell(sync_arr, cell);
+ break;
+ }
+ }
+ HMT_medium();
+ rw_lock_stats.rw_x_spin_round_count.add(n_spins);
+
+ if (count_os_wait > 0) {
+ lock->count_os_wait += static_cast<uint32_t>(count_os_wait);
+ rw_lock_stats.rw_x_os_wait_count.add(count_os_wait);
+ }
+}
+
+#ifdef UNIV_DEBUG
+# define rw_lock_x_lock_wait(L, P, T, F, O) \
+ rw_lock_x_lock_wait_func(L, P, T, F, O)
+#else
+# define rw_lock_x_lock_wait(L, P, T, F, O) \
+ rw_lock_x_lock_wait_func(L, T, F, O)
+#endif /* UNIV_DEBUG */
+
+/******************************************************************//**
+Low-level function for acquiring an exclusive lock.
+@return FALSE if did not succeed, TRUE if success. */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_low(
+/*===============*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line) /*!< in: line where requested */
+{
+ if (rw_lock_lock_word_decr(lock, X_LOCK_DECR, X_LOCK_HALF_DECR)) {
+
+ /* Since we are going to write our own thread id into that
+ field, the current writer_thread value must not be active. */
+ ut_a(!lock->writer_thread);
+
+ /* Decrement occurred: we are writer or next-writer. */
+ if (!pass)
+ {
+ lock->writer_thread = os_thread_get_curr_id();
+ }
+
+ rw_lock_x_lock_wait(lock, pass, 0, file_name, line);
+
+ } else {
+ os_thread_id_t thread_id = os_thread_get_curr_id();
+
+ /* Decrement failed: An X or SX lock is held by either
+ this thread or another. Try to relock. */
+ if (!pass && os_thread_eq(lock->writer_thread, thread_id)) {
+ /* Other s-locks can still be allowed. If the thread
+ requests an x-lock recursively while holding an sx-lock,
+ this x-lock must comply with the latching order. */
+
+ /* The existing X or SX lock is from this thread */
+ if (rw_lock_lock_word_decr(lock, X_LOCK_DECR, 0)) {
+ /* There is at least one SX-lock from this
+ thread, but no X-lock. */
+
+ /* Wait for any other S-locks to be
+ released. */
+ rw_lock_x_lock_wait(
+ lock, pass, -X_LOCK_HALF_DECR,
+ file_name, line);
+
+ } else {
+ int32_t lock_word = lock->lock_word;
+ /* At least one X lock by this thread already
+ exists. Add another. */
+ if (lock_word == 0
+ || lock_word == -X_LOCK_HALF_DECR) {
+ lock->lock_word.fetch_sub(X_LOCK_DECR);
+ } else {
+ ut_ad(lock_word <= -X_LOCK_DECR);
+ lock->lock_word.fetch_sub(1);
+ }
+ }
+
+ } else {
+ /* Another thread locked before us */
+ return(FALSE);
+ }
+ }
+
+ ut_d(rw_lock_add_debug_info(lock, pass, RW_LOCK_X, file_name, line));
+
+ lock->last_x_file_name = file_name;
+ lock->last_x_line = line & ((1U << 14) - 1);
+
+ return(TRUE);
+}
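+
+/* rw_lock_lock_word_decr() (defined in sync0rw.ic) is the primitive
+doing the work above. A simplified sketch of its contract, assuming
+lock_word behaves like a std::atomic integer: decrement lock_word by
+'amount' if and only if it is currently greater than 'threshold', and
+report whether the decrement happened. */
+#if 0
+static bool lock_word_decr_sketch(rw_lock_t* lock, int32_t amount,
+				  int32_t threshold)
+{
+	int32_t lock_word = lock->lock_word;
+
+	while (lock_word > threshold) {
+		/* On failure, lock_word is reloaded and we retry. */
+		if (lock->lock_word.compare_exchange_strong(
+			    lock_word, lock_word - amount)) {
+			return(true); /* we performed the decrement */
+		}
+	}
+
+	return(false); /* a conflicting holder got there first */
+}
+#endif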
+
+/******************************************************************//**
+Low-level function for acquiring an sx lock.
+@return FALSE if did not succeed, TRUE if success. */
+ibool
+rw_lock_sx_lock_low(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line) /*!< in: line where requested */
+{
+ if (rw_lock_lock_word_decr(lock, X_LOCK_HALF_DECR, X_LOCK_HALF_DECR)) {
+
+ /* Since we are going to write our own thread id into that
+ field, the current writer_thread value must not be active. */
+ ut_a(!lock->writer_thread);
+
+ /* Decrement occurred: we are the SX lock owner. */
+ if (!pass)
+ {
+ lock->writer_thread = os_thread_get_curr_id();
+ }
+
+ lock->sx_recursive = 1;
+ } else {
+ os_thread_id_t thread_id = os_thread_get_curr_id();
+
+ /* Decrement failed: It already has an X or SX lock by this
+ thread or another thread. If it is this thread, relock,
+ else fail. */
+ if (!pass && os_thread_eq(lock->writer_thread, thread_id)) {
+ /* This thread owns an X or SX lock */
+ if (lock->sx_recursive++ == 0) {
+ /* This thread is making its first SX-lock request
+ and it must be holding at least one X-lock here
+ because:
+
+ * There can't be a WAIT_EX thread because we are
+ the thread which has its thread_id written in
+ the writer_thread field and we are not waiting.
+
+ * Any other X-lock thread cannot exist because
+ it must update recursive flag only after
+ updating the thread_id. Had there been
+ a concurrent X-locking thread which succeeded
+ in decrementing the lock_word it must have
+ written its thread_id before setting the
+ recursive flag. Since we have passed the if()
+ condition above, we must be the only
+ thread working on this lock and it is safe to
+ read and write the lock_word. */
+
+#ifdef UNIV_DEBUG
+ auto lock_word =
+#endif
+ lock->lock_word.fetch_sub(X_LOCK_HALF_DECR,
+ std::memory_order_relaxed);
+
+ ut_ad((lock_word == 0)
+ || ((lock_word <= -X_LOCK_DECR)
+ && (lock_word
+ > -(X_LOCK_DECR
+ + X_LOCK_HALF_DECR))));
+ }
+ } else {
+ /* Another thread locked before us */
+ return(FALSE);
+ }
+ }
+
+ ut_d(rw_lock_add_debug_info(lock, pass, RW_LOCK_SX, file_name, line));
+
+ lock->last_x_file_name = file_name;
+ lock->last_x_line = line & ((1U << 14) - 1);
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by srv_n_spin_wait_rounds), waiting
+for the lock before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+void
+rw_lock_x_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line) /*!< in: line where requested */
+{
+ ulint i = 0;
+ sync_array_t* sync_arr;
+ lint spin_count = 0;
+ int64_t count_os_wait = 0;
+
+ ut_ad(rw_lock_validate(lock));
+ ut_ad(!rw_lock_own(lock, RW_LOCK_S));
+
+ if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
+ /* Locking succeeded */
+ return;
+ }
+ rw_lock_stats.rw_x_spin_wait_count.inc();
+
+lock_loop:
+
+ if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
+
+ if (count_os_wait > 0) {
+ lock->count_os_wait +=
+ static_cast<uint32_t>(count_os_wait);
+ rw_lock_stats.rw_x_os_wait_count.add(count_os_wait);
+ }
+
+ rw_lock_stats.rw_x_spin_round_count.add(spin_count);
+
+ /* Locking succeeded */
+ return;
+
+ } else {
+
+ /* Spin waiting for the lock_word to become free */
+ HMT_low();
+ ulint j = i;
+ while (i < srv_n_spin_wait_rounds
+ && lock->lock_word <= X_LOCK_HALF_DECR) {
+ ut_delay(srv_spin_wait_delay);
+ i++;
+ }
+
+ HMT_medium();
+ spin_count += lint(i - j);
+
+ if (i >= srv_n_spin_wait_rounds) {
+
+ os_thread_yield();
+
+ } else {
+
+ goto lock_loop;
+ }
+ }
+
+ sync_cell_t* cell;
+
+ sync_arr = sync_array_get_and_reserve_cell(
+ lock, RW_LOCK_X, file_name, line, &cell);
+
+ /* Waiters must be set before checking lock_word, to ensure signal
+ is sent. This could lead to a few unnecessary wake-up signals. */
+ lock->waiters.exchange(1, std::memory_order_acquire);
+
+ if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
+ sync_array_free_cell(sync_arr, cell);
+
+ if (count_os_wait > 0) {
+ lock->count_os_wait +=
+ static_cast<uint32_t>(count_os_wait);
+ rw_lock_stats.rw_x_os_wait_count.add(count_os_wait);
+ }
+
+ rw_lock_stats.rw_x_spin_round_count.add(spin_count);
+
+ /* Locking succeeded */
+ return;
+ }
+
+ ++count_os_wait;
+
+ sync_array_wait_event(sync_arr, cell);
+
+ i = 0;
+
+ goto lock_loop;
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in SX mode for the current thread. If the rw-lock is locked
+in exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by srv_n_spin_wait_rounds), waiting
+for the lock, before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single sx-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+void
+rw_lock_sx_lock_func(
+/*=================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ unsigned line) /*!< in: line where requested */
+
+{
+ ulint i = 0;
+ sync_array_t* sync_arr;
+ lint spin_count = 0;
+ int64_t count_os_wait = 0;
+
+ ut_ad(rw_lock_validate(lock));
+ ut_ad(!rw_lock_own(lock, RW_LOCK_S));
+
+ if (rw_lock_sx_lock_low(lock, pass, file_name, line)) {
+ /* Locking succeeded */
+ return;
+ }
+
+ rw_lock_stats.rw_sx_spin_wait_count.inc();
+
+lock_loop:
+
+ if (rw_lock_sx_lock_low(lock, pass, file_name, line)) {
+
+ if (count_os_wait > 0) {
+ lock->count_os_wait +=
+ static_cast<uint32_t>(count_os_wait);
+ rw_lock_stats.rw_sx_os_wait_count.add(count_os_wait);
+ }
+
+ rw_lock_stats.rw_sx_spin_round_count.add(spin_count);
+
+ /* Locking succeeded */
+ return;
+
+ } else {
+
+ /* Spin waiting for the lock_word to become free */
+ ulint j = i;
+ while (i < srv_n_spin_wait_rounds
+ && lock->lock_word <= X_LOCK_HALF_DECR) {
+ ut_delay(srv_spin_wait_delay);
+ i++;
+ }
+
+ spin_count += lint(i - j);
+
+ if (i >= srv_n_spin_wait_rounds) {
+
+ os_thread_yield();
+
+ } else {
+
+ goto lock_loop;
+ }
+ }
+
+ sync_cell_t* cell;
+
+ sync_arr = sync_array_get_and_reserve_cell(
+ lock, RW_LOCK_SX, file_name, line, &cell);
+
+ /* Waiters must be set before checking lock_word, to ensure signal
+ is sent. This could lead to a few unnecessary wake-up signals. */
+ lock->waiters.exchange(1, std::memory_order_acquire);
+
+ if (rw_lock_sx_lock_low(lock, pass, file_name, line)) {
+
+ sync_array_free_cell(sync_arr, cell);
+
+ if (count_os_wait > 0) {
+ lock->count_os_wait +=
+ static_cast<uint32_t>(count_os_wait);
+ rw_lock_stats.rw_sx_os_wait_count.add(count_os_wait);
+ }
+
+ rw_lock_stats.rw_sx_spin_round_count.add(spin_count);
+
+ /* Locking succeeded */
+ return;
+ }
+
+ ++count_os_wait;
+
+ sync_array_wait_event(sync_arr, cell);
+
+ i = 0;
+
+ goto lock_loop;
+}
+
+#ifdef UNIV_DEBUG
+
+/******************************************************************//**
+Checks that the rw-lock has been initialized and that there are no
+simultaneous shared and exclusive locks.
+@return true */
+bool
+rw_lock_validate(
+/*=============*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ ut_ad(lock);
+
+ ut_ad(lock->created);
+
+ int32_t lock_word = lock->lock_word;
+
+ ut_ad(lock->waiters < 2);
+ ut_ad(lock_word > -(2 * X_LOCK_DECR));
+ ut_ad(lock_word <= X_LOCK_DECR);
+
+ return(true);
+}
+
+/******************************************************************//**
+Checks if somebody has locked the rw-lock in the specified mode.
+@return true if locked */
+bool
+rw_lock_is_locked(
+/*==============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint lock_type) /*!< in: lock type: RW_LOCK_S,
+ RW_LOCK_X or RW_LOCK_SX */
+{
+ ut_ad(rw_lock_validate(lock));
+
+ switch (lock_type) {
+ case RW_LOCK_S:
+ return(rw_lock_get_reader_count(lock) > 0);
+
+ case RW_LOCK_X:
+ return(rw_lock_get_writer(lock) == RW_LOCK_X);
+
+ case RW_LOCK_SX:
+ return(rw_lock_get_sx_lock_count(lock) > 0);
+
+ default:
+ ut_error;
+ }
+ return(false); /* avoid compiler warnings */
+}
+
+/******************************************************************//**
+Inserts the debug information for an rw-lock. */
+void
+rw_lock_add_debug_info(
+/*===================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type, /*!< in: lock type */
+ const char* file_name, /*!< in: file where requested */
+ unsigned line) /*!< in: line where requested */
+{
+ ut_ad(file_name != NULL);
+
+ rw_lock_debug_t* info = rw_lock_debug_create();
+
+ rw_lock_debug_mutex_enter();
+
+ info->pass = pass;
+ info->line = line;
+ info->lock_type = lock_type;
+ info->file_name = file_name;
+ info->thread_id = os_thread_get_curr_id();
+
+ UT_LIST_ADD_FIRST(lock->debug_list, info);
+
+ rw_lock_debug_mutex_exit();
+
+ if (pass == 0 && lock_type != RW_LOCK_X_WAIT) {
+ int32_t lock_word = lock->lock_word;
+
+ /* Recursive x while holding SX
+ (lock_type == RW_LOCK_X && lock_word == -X_LOCK_HALF_DECR)
+ is treated as not-relock (new lock). */
+
+ if ((lock_type == RW_LOCK_X
+ && lock_word < -X_LOCK_HALF_DECR)
+ || (lock_type == RW_LOCK_SX
+ && (lock_word < 0 || lock->sx_recursive == 1))) {
+
+ sync_check_lock_validate(lock);
+ sync_check_lock_granted(lock);
+ } else {
+ sync_check_relock(lock);
+ }
+ }
+}
+
+/******************************************************************//**
+Removes a debug information struct for an rw-lock. */
+void
+rw_lock_remove_debug_info(
+/*======================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type) /*!< in: lock type */
+{
+ rw_lock_debug_t* info;
+
+ ut_ad(lock);
+
+ if (pass == 0 && lock_type != RW_LOCK_X_WAIT) {
+ sync_check_unlock(lock);
+ }
+
+ rw_lock_debug_mutex_enter();
+
+ for (info = UT_LIST_GET_FIRST(lock->debug_list);
+ info != 0;
+ info = UT_LIST_GET_NEXT(list, info)) {
+
+ if (pass == info->pass
+ && (pass != 0
+ || os_thread_eq(info->thread_id,
+ os_thread_get_curr_id()))
+ && info->lock_type == lock_type) {
+
+ /* Found! */
+ UT_LIST_REMOVE(lock->debug_list, info);
+
+ rw_lock_debug_mutex_exit();
+
+ rw_lock_debug_free(info);
+
+ return;
+ }
+ }
+
+ ut_error;
+}
+
+/******************************************************************//**
+Checks if the thread has locked the rw-lock in the specified mode, with
+the pass value == 0.
+@return true if locked */
+bool
+rw_lock_own(
+/*========*/
+ const rw_lock_t*lock, /*!< in: rw-lock */
+ ulint lock_type) /*!< in: lock type: RW_LOCK_S,
+ RW_LOCK_X */
+{
+ ut_ad(lock);
+ ut_ad(rw_lock_validate(lock));
+
+ const os_thread_id_t thread_id = os_thread_get_curr_id();
+
+ if (!os_thread_eq(lock->writer_thread, thread_id)) {
+ } else if (lock_type == RW_LOCK_X && rw_lock_get_x_lock_count(lock)) {
+ return true;
+ } else if (lock_type == RW_LOCK_SX && rw_lock_get_sx_lock_count(lock)) {
+ return true;
+ }
+
+ rw_lock_debug_mutex_enter();
+
+ for (const rw_lock_debug_t* info = UT_LIST_GET_FIRST(lock->debug_list);
+ info != NULL;
+ info = UT_LIST_GET_NEXT(list, info)) {
+
+ if (os_thread_eq(info->thread_id, thread_id)
+ && info->pass == 0
+ && info->lock_type == lock_type) {
+
+ rw_lock_debug_mutex_exit();
+ /* Found! */
+
+ return(true);
+ }
+ }
+ rw_lock_debug_mutex_exit();
+
+ return(false);
+}
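+
+/* rw_lock_own() is meant for debug assertions; a typical guard at the
+top of a function that requires the caller to hold an index latch (the
+variable name is illustrative): */
+#if 0
+ut_ad(rw_lock_own(&index->lock, RW_LOCK_X));
+#endif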
+
+/** Checks if the thread has locked the rw-lock in the specified mode, with
+the pass value == 0.
+@param[in] lock rw-lock
+@param[in] flags specify lock types with OR of the
+ rw_lock_flag_t values
+@return true if locked */
+bool rw_lock_own_flagged(const rw_lock_t* lock, rw_lock_flags_t flags)
+{
+ ut_ad(rw_lock_validate(lock));
+
+ const os_thread_id_t thread_id = os_thread_get_curr_id();
+
+ if (!os_thread_eq(lock->writer_thread, thread_id)) {
+ } else if ((flags & RW_LOCK_FLAG_X)
+ && rw_lock_get_x_lock_count(lock)) {
+ return true;
+ } else if ((flags & RW_LOCK_FLAG_SX)
+ && rw_lock_get_sx_lock_count(lock)) {
+ return true;
+ }
+
+ rw_lock_debug_mutex_enter();
+
+ for (rw_lock_debug_t* info = UT_LIST_GET_FIRST(lock->debug_list);
+ info != NULL;
+ info = UT_LIST_GET_NEXT(list, info)) {
+ if (!os_thread_eq(info->thread_id, thread_id)
+ || info->pass) {
+ continue;
+ }
+
+ switch (info->lock_type) {
+ case RW_LOCK_S:
+ if (!(flags & RW_LOCK_FLAG_S)) {
+ continue;
+ }
+ break;
+
+ case RW_LOCK_X:
+ if (!(flags & RW_LOCK_FLAG_X)) {
+ continue;
+ }
+ break;
+
+ case RW_LOCK_SX:
+ if (!(flags & RW_LOCK_FLAG_SX)) {
+ continue;
+ }
+ break;
+ }
+
+ rw_lock_debug_mutex_exit();
+ return true;
+ }
+
+ rw_lock_debug_mutex_exit();
+ return false;
+}
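+
+/* rw_lock_own_flagged() allows an assertion to accept any of several
+modes at once; a representative (illustrative) call: */
+#if 0
+ut_ad(rw_lock_own_flagged(&index->lock,
+			  RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+#endif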
+
+/***************************************************************//**
+Prints debug info of currently locked rw-locks. */
+void
+rw_lock_list_print_info(
+/*====================*/
+ FILE* file) /*!< in: file where to print */
+{
+ ulint count = 0;
+
+ mutex_enter(&rw_lock_list_mutex);
+
+ fputs("-------------\n"
+ "RW-LATCH INFO\n"
+ "-------------\n", file);
+
+ for (const rw_lock_t& lock : rw_lock_list) {
+
+ count++;
+
+ if (lock.lock_word != X_LOCK_DECR) {
+
+ fprintf(file, "RW-LOCK: %p ", (void*) &lock);
+
+ if (int32_t waiters= lock.waiters) {
+ fprintf(file, " (%d waiters)\n", waiters);
+ } else {
+ putc('\n', file);
+ }
+
+ rw_lock_debug_t* info;
+
+ rw_lock_debug_mutex_enter();
+
+ for (info = UT_LIST_GET_FIRST(lock.debug_list);
+ info != NULL;
+ info = UT_LIST_GET_NEXT(list, info)) {
+
+ rw_lock_debug_print(file, info);
+ }
+
+ rw_lock_debug_mutex_exit();
+ }
+ }
+
+ fprintf(file, "Total number of rw-locks " ULINTPF "\n", count);
+ mutex_exit(&rw_lock_list_mutex);
+}
+
+/*********************************************************************//**
+Prints info of a debug struct. */
+void
+rw_lock_debug_print(
+/*================*/
+ FILE* f, /*!< in: output stream */
+ const rw_lock_debug_t* info) /*!< in: debug struct */
+{
+ ulint rwt = info->lock_type;
+
+ fprintf(f, "Locked: thread " ULINTPF " file %s line %u ",
+ ulint(info->thread_id),
+ sync_basename(info->file_name),
+ info->line);
+
+ switch (rwt) {
+ case RW_LOCK_S:
+ fputs("S-LOCK", f);
+ break;
+ case RW_LOCK_X:
+ fputs("X-LOCK", f);
+ break;
+ case RW_LOCK_SX:
+ fputs("SX-LOCK", f);
+ break;
+ case RW_LOCK_X_WAIT:
+ fputs("WAIT X-LOCK", f);
+ break;
+ default:
+ ut_error;
+ }
+
+ if (info->pass != 0) {
+ fprintf(f, " pass value %lu", (ulong) info->pass);
+ }
+
+ fprintf(f, "\n");
+}
+
+/** Print the rw-lock information.
+@return the string representation */
+std::string
+rw_lock_t::to_string() const
+{
+ /* Note: For X locks it can be locked from multiple places because
+ the same thread can call X lock recursively. */
+
+ std::ostringstream msg;
+ bool written = false;
+
+ ut_ad(rw_lock_validate(this));
+
+ msg << "RW-LATCH: "
+ << "thread id " << os_thread_get_curr_id()
+ << " addr: " << this
+ << " Locked from: ";
+
+ rw_lock_debug_mutex_enter();
+
+ for (rw_lock_debug_t* info = UT_LIST_GET_FIRST(debug_list);
+ info != NULL;
+ info = UT_LIST_GET_NEXT(list, info)) {
+ if (!os_thread_eq(info->thread_id, os_thread_get_curr_id())) {
+ continue;
+ }
+
+ if (written) {
+ msg << ", ";
+ }
+
+ written = true;
+
+ msg << info->file_name << ":" << info->line;
+ }
+
+ rw_lock_debug_mutex_exit();
+
+ return(msg.str());
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc
new file mode 100644
index 00000000..0a6f8bfb
--- /dev/null
+++ b/storage/innobase/sync/sync0sync.cc
@@ -0,0 +1,246 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2020, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0sync.cc
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0rw.h"
+#include "sync0sync.h"
+
+#ifdef UNIV_PFS_MUTEX
+mysql_pfs_key_t buf_pool_mutex_key;
+mysql_pfs_key_t dict_foreign_err_mutex_key;
+mysql_pfs_key_t dict_sys_mutex_key;
+mysql_pfs_key_t fil_system_mutex_key;
+mysql_pfs_key_t flush_list_mutex_key;
+mysql_pfs_key_t fts_delete_mutex_key;
+mysql_pfs_key_t fts_doc_id_mutex_key;
+mysql_pfs_key_t fts_pll_tokenize_mutex_key;
+mysql_pfs_key_t ibuf_bitmap_mutex_key;
+mysql_pfs_key_t ibuf_mutex_key;
+mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
+mysql_pfs_key_t log_sys_mutex_key;
+mysql_pfs_key_t log_cmdq_mutex_key;
+mysql_pfs_key_t log_flush_order_mutex_key;
+mysql_pfs_key_t recalc_pool_mutex_key;
+mysql_pfs_key_t purge_sys_pq_mutex_key;
+mysql_pfs_key_t recv_sys_mutex_key;
+mysql_pfs_key_t redo_rseg_mutex_key;
+mysql_pfs_key_t noredo_rseg_mutex_key;
+mysql_pfs_key_t page_zip_stat_per_index_mutex_key;
+# ifdef UNIV_DEBUG
+mysql_pfs_key_t rw_lock_debug_mutex_key;
+# endif /* UNIV_DEBUG */
+mysql_pfs_key_t rtr_active_mutex_key;
+mysql_pfs_key_t rtr_match_mutex_key;
+mysql_pfs_key_t rtr_path_mutex_key;
+mysql_pfs_key_t rw_lock_list_mutex_key;
+mysql_pfs_key_t srv_innodb_monitor_mutex_key;
+mysql_pfs_key_t srv_misc_tmpfile_mutex_key;
+mysql_pfs_key_t srv_monitor_file_mutex_key;
+mysql_pfs_key_t buf_dblwr_mutex_key;
+mysql_pfs_key_t trx_mutex_key;
+mysql_pfs_key_t trx_pool_mutex_key;
+mysql_pfs_key_t trx_pool_manager_mutex_key;
+mysql_pfs_key_t lock_mutex_key;
+mysql_pfs_key_t lock_wait_mutex_key;
+mysql_pfs_key_t trx_sys_mutex_key;
+mysql_pfs_key_t srv_threads_mutex_key;
+mysql_pfs_key_t sync_array_mutex_key;
+mysql_pfs_key_t thread_mutex_key;
+mysql_pfs_key_t row_drop_list_mutex_key;
+mysql_pfs_key_t rw_trx_hash_element_mutex_key;
+mysql_pfs_key_t read_view_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+#ifdef UNIV_PFS_RWLOCK
+mysql_pfs_key_t btr_search_latch_key;
+mysql_pfs_key_t dict_operation_lock_key;
+mysql_pfs_key_t index_tree_rw_lock_key;
+mysql_pfs_key_t index_online_log_key;
+mysql_pfs_key_t fil_space_latch_key;
+mysql_pfs_key_t fts_cache_rw_lock_key;
+mysql_pfs_key_t fts_cache_init_rw_lock_key;
+mysql_pfs_key_t trx_i_s_cache_lock_key;
+mysql_pfs_key_t trx_purge_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+/** For monitoring active mutexes */
+MutexMonitor mutex_monitor;
+
+/**
+Prints wait info of the sync system.
+@param file - where to print */
+static
+void
+sync_print_wait_info(FILE* file)
+{
+ fprintf(file,
+ "RW-shared spins " UINT64PF ", rounds " UINT64PF ","
+ " OS waits " UINT64PF "\n"
+ "RW-excl spins " UINT64PF ", rounds " UINT64PF ","
+ " OS waits " UINT64PF "\n"
+ "RW-sx spins " UINT64PF ", rounds " UINT64PF ","
+ " OS waits " UINT64PF "\n",
+ (ib_uint64_t) rw_lock_stats.rw_s_spin_wait_count,
+ (ib_uint64_t) rw_lock_stats.rw_s_spin_round_count,
+ (ib_uint64_t) rw_lock_stats.rw_s_os_wait_count,
+ (ib_uint64_t) rw_lock_stats.rw_x_spin_wait_count,
+ (ib_uint64_t) rw_lock_stats.rw_x_spin_round_count,
+ (ib_uint64_t) rw_lock_stats.rw_x_os_wait_count,
+ (ib_uint64_t) rw_lock_stats.rw_sx_spin_wait_count,
+ (ib_uint64_t) rw_lock_stats.rw_sx_spin_round_count,
+ (ib_uint64_t) rw_lock_stats.rw_sx_os_wait_count);
+
+ fprintf(file,
+ "Spin rounds per wait: %.2f RW-shared,"
+ " %.2f RW-excl, %.2f RW-sx\n",
+ rw_lock_stats.rw_s_spin_wait_count
+ ? static_cast<double>(rw_lock_stats.rw_s_spin_round_count) /
+ static_cast<double>(rw_lock_stats.rw_s_spin_wait_count)
+ : static_cast<double>(rw_lock_stats.rw_s_spin_round_count),
+ rw_lock_stats.rw_x_spin_wait_count
+ ? static_cast<double>(rw_lock_stats.rw_x_spin_round_count) /
+ static_cast<double>(rw_lock_stats.rw_x_spin_wait_count)
+ : static_cast<double>(rw_lock_stats.rw_x_spin_round_count),
+ rw_lock_stats.rw_sx_spin_wait_count
+ ? static_cast<double>(rw_lock_stats.rw_sx_spin_round_count) /
+ static_cast<double>(rw_lock_stats.rw_sx_spin_wait_count)
+ : static_cast<double>(rw_lock_stats.rw_sx_spin_round_count));
+}
+
+/**
+Prints info of the sync system.
+@param file - where to print */
+void
+sync_print(FILE* file)
+{
+#ifdef UNIV_DEBUG
+ rw_lock_list_print_info(file);
+#endif /* UNIV_DEBUG */
+
+ sync_array_print(file);
+
+ sync_print_wait_info(file);
+}
+
+/** Print the filename "basename" e.g., p = "/a/b/c/d/e.cc" -> p = "e.cc"
+@param[in] filename Name from where to extract the basename
+@return the basename */
+const char*
+sync_basename(const char* filename)
+{
+ const char* ptr = filename + strlen(filename) - 1;
+
+ while (ptr > filename && *ptr != '/' && *ptr != '\\') {
+ --ptr;
+ }
+
+ ++ptr;
+
+ return(ptr);
+}
+
+/** String representation of the filename and line number where the
+latch was created
+@param[in] id Latch ID
+@param[in] created Filename and line number where it was created
+@return the string representation */
+std::string
+sync_mutex_to_string(
+ latch_id_t id,
+ const std::string& created)
+{
+ std::ostringstream msg;
+
+ msg << "Mutex " << sync_latch_get_name(id) << " "
+ << "created " << created;
+
+ return(msg.str());
+}
+
+/** Enable the mutex monitoring */
+void
+MutexMonitor::enable()
+{
+ /** Note: We don't add any latch meta-data after startup. Therefore
+ there is no need to use a mutex here. */
+
+ LatchMetaData::iterator end = latch_meta.end();
+
+ for (LatchMetaData::iterator it = latch_meta.begin(); it != end; ++it) {
+
+ if (*it != NULL) {
+ (*it)->get_counter()->enable();
+ }
+ }
+}
+
+/** Disable the mutex monitoring */
+void
+MutexMonitor::disable()
+{
+ /** Note: We don't add any latch meta-data after startup. Therefore
+ there is no need to use a mutex here. */
+
+ LatchMetaData::iterator end = latch_meta.end();
+
+ for (LatchMetaData::iterator it = latch_meta.begin(); it != end; ++it) {
+
+ if (*it != NULL) {
+ (*it)->get_counter()->disable();
+ }
+ }
+}
+
+/** Reset the mutex monitoring counters */
+void
+MutexMonitor::reset()
+{
+ /** Note: We don't add any latch meta-data after startup. Therefore
+ there is no need to use a mutex here. */
+
+ LatchMetaData::iterator end = latch_meta.end();
+
+ for (LatchMetaData::iterator it = latch_meta.begin(); it != end; ++it) {
+
+ if (*it != NULL) {
+ (*it)->get_counter()->reset();
+ }
+ }
+
+ mutex_enter(&rw_lock_list_mutex);
+
+ for (rw_lock_t& rw_lock : rw_lock_list) {
+ rw_lock.count_os_wait = 0;
+ }
+
+ mutex_exit(&rw_lock_list_mutex);
+}
diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc
new file mode 100644
index 00000000..d043c3d8
--- /dev/null
+++ b/storage/innobase/trx/trx0i_s.cc
@@ -0,0 +1,1490 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0i_s.cc
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables fetch code.
+
+The code below fetches information needed to fill those
+3 dynamic tables and uploads it into a "transactions
+table cache" for later retrieval.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#include "trx0i_s.h"
+#include "buf0buf.h"
+#include "dict0dict.h"
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "lock0iter.h"
+#include "lock0lock.h"
+#include "mem0mem.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "sync0rw.h"
+#include "sync0sync.h"
+#include "trx0sys.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "sql_class.h"
+
+/** Initial number of rows in the table cache */
+#define TABLE_CACHE_INITIAL_ROWSNUM 1024
+
+/** @brief The maximum number of chunks to allocate for a table cache.
+
+The rows of a table cache are stored in a set of chunks. When a new
+row is added a new chunk is allocated if necessary. Assuming that the
+first one is 1024 rows (TABLE_CACHE_INITIAL_ROWSNUM) and each
+subsequent is N/2 where N is the number of rows we have allocated till
+now, then the 39th chunk would accommodate 1677416425 rows and all chunks
+would accommodate 3354832851 rows. */
+#define MEM_CHUNKS_IN_TABLE_CACHE 39
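+
+/* A sketch of the stated growth rule, runnable in isolation. It
+assumes plain N/2 growth with no allocation caps; the exact row totals
+quoted above may additionally reflect integer-width limits in the real
+allocator. */
+#if 0
+ulint total = TABLE_CACHE_INITIAL_ROWSNUM; /* chunk 1 */
+
+for (int i = 1; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+	ulint chunk = total / 2; /* N/2, where N = rows so far */
+	total += chunk;
+}
+/* 'total' is now the combined capacity of all chunks. */
+#endif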
+
+/** The following are auxiliary macros used for testing. Do not enable
+them in a production environment. */
+/* @{ */
+
+#if 0
+/** If this is enabled then lock folds will always be different,
+resulting in equal rows being put in different cells of the hash
+table. Checking for duplicates will be flawed because a different
+fold will be calculated when a row is searched in the hash table. */
+#define TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+#endif
+
+#if 0
+/** This effectively kills the search-for-duplicate-before-adding-a-row
+function, but searching in the hash is still performed. It will always
+be assumed that the lock is not present and insertion will be performed
+into the hash table. */
+#define TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+#endif
+
+#if 0
+/** This aggressively repeats adding each row many times. Depending on
+the above settings this may be a noop or may result in lots of rows
+being added. */
+#define TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+#endif
+
+#if 0
+/** Very similar to TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T but hash
+table search is not performed at all. */
+#define TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+#endif
+
+#if 0
+/** Do not insert each row into the hash table. Duplicates may appear
+if this is enabled; also, searching the hash becomes a noop because it
+will be empty. */
+#define TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+#endif
+/* @} */
+
+/** Memory limit passed to ha_storage_put_memlim().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_STORAGE(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd)
+
+/** Memory limit in table_cache_create_empty_row().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_ALLOC(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd \
+ - ha_storage_get_size((cache)->storage))
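+
+/* Example budget arithmetic, assuming the usual 16 MiB (16777216-byte)
+TRX_I_S_MEM_LIMIT: with mem_allocd == 1048576,
+MAX_ALLOWED_FOR_STORAGE() yields 15728640, and MAX_ALLOWED_FOR_ALLOC()
+additionally subtracts whatever ha_storage_get_size() reports for the
+string storage, so both budgets shrink towards zero as the cache fills
+up. */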
+
+/** Memory for each table in the intermediate buffer is allocated in
+separate chunks. These chunks are considered to be concatenated to
+represent one flat array of rows. */
+struct i_s_mem_chunk_t {
+ ulint offset; /*!< offset, in number of rows */
+ ulint rows_allocd; /*!< the size of this chunk, in number
+ of rows */
+ void* base; /*!< start of the chunk */
+};
+
+/** This represents one table's cache. */
+struct i_s_table_cache_t {
+ ulint rows_used; /*!< number of used rows */
+ ulint rows_allocd; /*!< number of allocated rows */
+ ulint row_size; /*!< size of a single row */
+ i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of
+ memory chunks that stores the
+ rows */
+};
+
+/** This structure describes the intermediate buffer */
+struct trx_i_s_cache_t {
+ rw_lock_t rw_lock; /*!< read-write lock protecting
+ the rest of this structure */
+ Atomic_relaxed<ulonglong> last_read;
+ /*!< last time the cache was read;
+ measured in nanoseconds */
+ i_s_table_cache_t innodb_trx; /*!< innodb_trx table */
+ i_s_table_cache_t innodb_locks; /*!< innodb_locks table */
+ i_s_table_cache_t innodb_lock_waits;/*!< innodb_lock_waits table */
+/** the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */
+#define LOCKS_HASH_CELLS_NUM 10000
+ hash_table_t locks_hash; /*!< hash table used to eliminate
+ duplicate entries in the
+ innodb_locks table */
+/** Initial size of the cache storage */
+#define CACHE_STORAGE_INITIAL_SIZE 1024
+/** Number of hash cells in the cache storage */
+#define CACHE_STORAGE_HASH_CELLS 2048
+ ha_storage_t* storage; /*!< storage for external volatile
+ data that may become unavailable
+ when we release
+ lock_sys.mutex */
+ ulint mem_allocd; /*!< the amount of memory
+ allocated with mem_alloc*() */
+ bool is_truncated; /*!< this is true if the memory
+ limit was hit and thus the data
+ in the cache is truncated */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+static trx_i_s_cache_t trx_i_s_cache_static;
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static;
+
+/** @return the heap number of a record lock
+@retval 0xFFFF for table locks */
+static uint16_t wait_lock_get_heap_no(const lock_t *lock)
+{
+ return lock_get_type(lock) == LOCK_REC
+ ? static_cast<uint16_t>(lock_rec_find_set_bit(lock))
+ : uint16_t{0xFFFF};
+}
+
+/*******************************************************************//**
+Initializes the members of a table cache. */
+static
+void
+table_cache_init(
+/*=============*/
+ i_s_table_cache_t* table_cache, /*!< out: table cache */
+ size_t row_size) /*!< in: the size of a
+ row */
+{
+ ulint i;
+
+ table_cache->rows_used = 0;
+ table_cache->rows_allocd = 0;
+ table_cache->row_size = row_size;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ table_cache->chunks[i].base = NULL;
+ }
+}
+
+/*******************************************************************//**
+Frees a table cache. */
+static
+void
+table_cache_free(
+/*=============*/
+ i_s_table_cache_t* table_cache) /*!< in/out: table cache */
+{
+ ulint i;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ if (table_cache->chunks[i].base) {
+ ut_free(table_cache->chunks[i].base);
+ table_cache->chunks[i].base = NULL;
+ }
+ }
+}
+
+/*******************************************************************//**
+Returns an empty row from a table cache. The row is allocated if no more
+empty rows are available. The number of used rows is incremented.
+If the memory limit is hit then NULL is returned and nothing is
+allocated.
+@return empty row, or NULL if out of memory */
+static
+void*
+table_cache_create_empty_row(
+/*=========================*/
+ i_s_table_cache_t* table_cache, /*!< in/out: table cache */
+ trx_i_s_cache_t* cache) /*!< in/out: cache to record
+ how many bytes are
+ allocated */
+{
+ ulint i;
+ void* row;
+
+ ut_a(table_cache->rows_used <= table_cache->rows_allocd);
+
+ if (table_cache->rows_used == table_cache->rows_allocd) {
+
+ /* rows_used == rows_allocd means that a new chunk needs
+ to be allocated: either there are no more empty rows in the
+ last allocated chunk or nothing has been allocated yet
+ (rows_used == rows_allocd == 0); */
+
+ i_s_mem_chunk_t* chunk;
+ ulint req_bytes;
+ ulint got_bytes;
+ ulint req_rows;
+ ulint got_rows;
+
+ /* find the first not allocated chunk */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].base == NULL) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ have been allocated :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ /* allocate the chunk we just found */
+
+ if (i == 0) {
+
+ /* first chunk, nothing is allocated yet */
+ req_rows = TABLE_CACHE_INITIAL_ROWSNUM;
+ } else {
+
+ /* Memory is increased by the formula
+ new = old + old / 2. We deliberately avoid the
+ more aggressive common scheme (new = old * 2)
+ because the allocated memory will not be freed
+ until InnoDB exits (it is reused). It is better
+ to allocate the memory in more steps and waste
+ less of it than to use fewer allocation steps
+ (allocation happens only once in a lifetime
+ anyway) but end up with lots of unused/wasted
+ memory. */
+ req_rows = table_cache->rows_allocd / 2;
+ }
+ req_bytes = req_rows * table_cache->row_size;
+
+ if (req_bytes > MAX_ALLOWED_FOR_ALLOC(cache)) {
+
+ return(NULL);
+ }
+
+ chunk = &table_cache->chunks[i];
+
+ got_bytes = req_bytes;
+ chunk->base = ut_malloc_nokey(req_bytes);
+
+ got_rows = got_bytes / table_cache->row_size;
+
+ cache->mem_allocd += got_bytes;
+
+#if 0
+ printf("allocating chunk %d req bytes=%lu, got bytes=%lu,"
+ " row size=%lu,"
+ " req rows=%lu, got rows=%lu\n",
+ i, req_bytes, got_bytes,
+ table_cache->row_size,
+ req_rows, got_rows);
+#endif
+
+ chunk->rows_allocd = got_rows;
+
+ table_cache->rows_allocd += got_rows;
+
+ /* adjust the offset of the next chunk */
+ if (i < MEM_CHUNKS_IN_TABLE_CACHE - 1) {
+
+ table_cache->chunks[i + 1].offset
+ = chunk->offset + chunk->rows_allocd;
+ }
+
+ /* return the first empty row in the newly allocated
+ chunk */
+ row = chunk->base;
+ } else {
+
+ char* chunk_start;
+ ulint offset;
+
+ /* there is an empty row, no need to allocate new
+ chunks */
+
+ /* find the first chunk that contains allocated but
+ empty/unused rows */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd
+ > table_cache->rows_used) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ are full, but
+ table_cache->rows_used != table_cache->rows_allocd means
+ exactly the opposite - there are allocated but
+ empty/unused rows :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ chunk_start = (char*) table_cache->chunks[i].base;
+ offset = table_cache->rows_used
+ - table_cache->chunks[i].offset;
+
+ row = chunk_start + offset * table_cache->row_size;
+ }
+
+ table_cache->rows_used++;
+
+ return(row);
+}
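+
+/* Sketch of the addressing scheme used above: a row with global index n
+lives in the first chunk i satisfying
+chunks[i].offset + chunks[i].rows_allocd > n, at address
+(char*) chunks[i].base + (n - chunks[i].offset) * row_size.
+trx_i_s_cache_get_nth_row() near the end of this file performs exactly
+this lookup. */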
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Validates a row in the locks cache.
+@return TRUE if valid */
+static
+ibool
+i_s_locks_row_validate(
+/*===================*/
+ const i_s_locks_row_t* row) /*!< in: row to validate */
+{
+ ut_ad(row->lock_mode);
+ ut_ad(row->lock_table != NULL);
+ ut_ad(row->lock_table_id != 0);
+
+ if (!row->lock_index) {
+ /* table lock */
+ ut_ad(!row->lock_data);
+ ut_ad(row->lock_page == page_id_t(0, 0));
+ ut_ad(!row->lock_rec);
+ } else {
+ /* record lock */
+ /* row->lock_data == NULL if buf_page_try_get() == NULL */
+ }
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Fills i_s_trx_row_t object.
+If memory cannot be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_trx_row(
+/*=========*/
+ i_s_trx_row_t* row, /*!< out: result object
+ that's filled */
+ const trx_t* trx, /*!< in: transaction to
+ get data from */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ corresponding row in
+ innodb_locks if trx is
+ waiting or NULL if trx
+ is not waiting */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into
+ which to copy volatile
+ strings */
+{
+ const char* s;
+
+ ut_ad(lock_mutex_own());
+
+ row->trx_id = trx_get_id_for_print(trx);
+ row->trx_started = trx->start_time;
+ row->trx_state = trx_get_que_state_str(trx);
+ row->requested_lock_row = requested_lock_row;
+ ut_ad(requested_lock_row == NULL
+ || i_s_locks_row_validate(requested_lock_row));
+
+ if (trx->lock.wait_lock != NULL) {
+
+ ut_a(requested_lock_row != NULL);
+ row->trx_wait_started = trx->lock.wait_started;
+ } else {
+ ut_a(requested_lock_row == NULL);
+ row->trx_wait_started = 0;
+ }
+
+ row->trx_weight = static_cast<uintmax_t>(TRX_WEIGHT(trx));
+
+ if (trx->mysql_thd == NULL) {
+ /* For internal transactions, e.g. purge, and transactions
+ being recovered at startup, there is no associated MySQL
+ thread data structure. */
+ row->trx_mysql_thread_id = 0;
+ row->trx_query = NULL;
+ goto thd_done;
+ }
+
+ row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd);
+
+ char query[TRX_I_S_TRX_QUERY_MAX_LEN + 1];
+ if (size_t stmt_len = thd_query_safe(trx->mysql_thd, query,
+ sizeof query)) {
+ row->trx_query = static_cast<const char*>(
+ ha_storage_put_memlim(
+ cache->storage, query, stmt_len + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache)));
+
+ row->trx_query_cs = thd_charset(trx->mysql_thd);
+
+ if (row->trx_query == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+
+ row->trx_query = NULL;
+ }
+
+thd_done:
+ row->trx_operation_state = trx->op_info;
+
+ row->trx_tables_in_use = trx->n_mysql_tables_in_use;
+
+ row->trx_tables_locked = lock_number_of_tables_locked(&trx->lock);
+
+ /* These are protected either by both trx->mutex and
+ lock_sys.mutex, or by lock_sys.mutex alone. For reading,
+ it suffices to hold lock_sys.mutex. */
+
+ row->trx_lock_structs = UT_LIST_GET_LEN(trx->lock.trx_locks);
+
+ row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock.lock_heap);
+
+ row->trx_rows_locked = lock_number_of_rows_locked(&trx->lock);
+
+ row->trx_rows_modified = trx->undo_no;
+
+ row->trx_isolation_level = trx->isolation_level;
+
+ row->trx_unique_checks = (ibool) trx->check_unique_secondary;
+
+ row->trx_foreign_key_checks = (ibool) trx->check_foreigns;
+
+ s = trx->detailed_error;
+
+ if (s != NULL && s[0] != '\0') {
+
+ TRX_I_S_STRING_COPY(s,
+ row->trx_foreign_key_error,
+ TRX_I_S_TRX_FK_ERROR_MAX_LEN, cache);
+
+ if (row->trx_foreign_key_error == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+ row->trx_foreign_key_error = NULL;
+ }
+
+ row->trx_is_read_only = trx->read_only;
+
+ row->trx_is_autocommit_non_locking = trx->is_autocommit_non_locking();
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Format the nth field of "rec" and put it in "buf". The result is always
+NUL-terminated. Returns the number of bytes that were written to "buf"
+(including the terminating NUL).
+@return number of bytes written, including the terminating NUL */
+static
+ulint
+put_nth_field(
+/*==========*/
+ char* buf, /*!< out: buffer */
+ ulint buf_size,/*!< in: buffer size in bytes */
+ ulint n, /*!< in: number of field */
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record */
+ const rec_offs* offsets)/*!< in: record offsets, returned
+ by rec_get_offsets() */
+{
+ const byte* data;
+ ulint data_len;
+ dict_field_t* dict_field;
+ ulint ret;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ ret = 0;
+
+ if (n > 0) {
+ /* we must append ", " before the actual data */
+
+ if (buf_size < 3) {
+
+ buf[0] = '\0';
+ return(1);
+ }
+
+ memcpy(buf, ", ", 3);
+
+ buf += 2;
+ buf_size -= 2;
+ ret += 2;
+ }
+
+ /* now buf_size >= 1 */
+
+ data = rec_get_nth_field(rec, offsets, n, &data_len);
+
+ dict_field = dict_index_get_nth_field(index, n);
+
+ ret += row_raw_format((const char*) data, data_len,
+ dict_field, buf, buf_size);
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Fills the "lock_data" member of i_s_locks_row_t object.
+If memory cannot be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_lock_data(
+/*===========*/
+ const char** lock_data,/*!< out: "lock_data" to fill */
+ const lock_t* lock, /*!< in: lock used to find the data */
+ ulint heap_no,/*!< in: rec num used to find the data */
+ trx_i_s_cache_t* cache) /*!< in/out: cache where to store
+ volatile data */
+{
+ ut_a(lock_get_type(lock) == LOCK_REC);
+
+ switch (heap_no) {
+ case PAGE_HEAP_NO_INFIMUM:
+ case PAGE_HEAP_NO_SUPREMUM:
+ *lock_data = ha_storage_put_str_memlim(
+ cache->storage,
+ heap_no == PAGE_HEAP_NO_INFIMUM
+ ? "infimum pseudo-record"
+ : "supremum pseudo-record",
+ MAX_ALLOWED_FOR_STORAGE(cache));
+ return(*lock_data != NULL);
+ }
+
+ mtr_t mtr;
+
+ const buf_block_t* block;
+ const page_t* page;
+ const rec_t* rec;
+ const dict_index_t* index;
+ ulint n_fields;
+ mem_heap_t* heap;
+ rec_offs offsets_onstack[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets;
+ char buf[TRX_I_S_LOCK_DATA_MAX_LEN];
+ ulint buf_used;
+ ulint i;
+
+ mtr_start(&mtr);
+
+ block = buf_page_try_get(lock->un_member.rec_lock.page_id, &mtr);
+
+ if (block == NULL) {
+
+ *lock_data = NULL;
+
+ mtr_commit(&mtr);
+
+ return(TRUE);
+ }
+
+ page = reinterpret_cast<const page_t*>(buf_block_get_frame(block));
+
+ rec_offs_init(offsets_onstack);
+ offsets = offsets_onstack;
+
+ rec = page_find_rec_with_heap_no(page, heap_no);
+
+ index = lock_rec_get_index(lock);
+
+ n_fields = dict_index_get_n_unique(index);
+
+ ut_a(n_fields > 0);
+
+ heap = NULL;
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ n_fields, &heap);
+
+ /* format and store the data */
+
+ buf_used = 0;
+ for (i = 0; i < n_fields; i++) {
+
+ buf_used += put_nth_field(
+ buf + buf_used, sizeof(buf) - buf_used,
+ i, index, rec, offsets) - 1;
+ }
+
+ *lock_data = (const char*) ha_storage_put_memlim(
+ cache->storage, buf, buf_used + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ if (heap != NULL) {
+
+ /* this means that rec_get_offsets() has created a new
+ heap and has stored offsets in it; check that this is
+ really the case and free the heap */
+ ut_a(offsets != offsets_onstack);
+ mem_heap_free(heap);
+ }
+
+ mtr_commit(&mtr);
+
+ if (*lock_data == NULL) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Fills i_s_locks_row_t object.
+If memory cannot be allocated then false is returned.
+@return false if allocation fails */
+static bool fill_locks_row(
+ i_s_locks_row_t* row, /*!< out: result object that's filled */
+ const lock_t* lock, /*!< in: lock to get data from */
+ uint16_t heap_no,/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into which to copy
+ volatile strings */
+{
+ row->lock_trx_id = lock->trx->id;
+ const auto lock_type = lock_get_type(lock);
+ ut_ad(lock_type == LOCK_REC || lock_type == LOCK_TABLE);
+
+ const bool is_gap_lock = lock_type == LOCK_REC
+ && (lock->type_mode & LOCK_GAP);
+ switch (lock->type_mode & LOCK_MODE_MASK) {
+ case LOCK_S:
+ row->lock_mode = uint8_t(1 + is_gap_lock);
+ break;
+ case LOCK_X:
+ row->lock_mode = uint8_t(3 + is_gap_lock);
+ break;
+ case LOCK_IS:
+ row->lock_mode = uint8_t(5 + is_gap_lock);
+ break;
+ case LOCK_IX:
+ row->lock_mode = uint8_t(7 + is_gap_lock);
+ break;
+ case LOCK_AUTO_INC:
+ row->lock_mode = 9;
+ break;
+ default:
+ ut_ad("unknown lock mode" == 0);
+ row->lock_mode = 0;
+ }
+
+ row->lock_table = ha_storage_put_str_memlim(
+ cache->storage, lock_get_table_name(lock).m_name,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_table == NULL) {
+
+ return false;
+ }
+
+ if (lock_type == LOCK_REC) {
+ row->lock_index = ha_storage_put_str_memlim(
+ cache->storage, lock_rec_get_index_name(lock),
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_index == NULL) {
+
+ return false;
+ }
+
+ row->lock_page = lock->un_member.rec_lock.page_id;
+ row->lock_rec = heap_no;
+
+ if (!fill_lock_data(&row->lock_data, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ return false;
+ }
+ } else {
+ row->lock_index = NULL;
+
+ row->lock_page = page_id_t(0, 0);
+ row->lock_rec = 0;
+
+ row->lock_data = NULL;
+ }
+
+ row->lock_table_id = lock_get_table_id(lock);
+
+ row->hash_chain.value = row;
+ ut_ad(i_s_locks_row_validate(row));
+
+ return true;
+}
+
+/*******************************************************************//**
+Fills i_s_lock_waits_row_t object. Returns its first argument.
+@return result object that's filled */
+static
+i_s_lock_waits_row_t*
+fill_lock_waits_row(
+/*================*/
+ i_s_lock_waits_row_t* row, /*!< out: result object
+ that's filled */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ ut_ad(i_s_locks_row_validate(requested_lock_row));
+ ut_ad(i_s_locks_row_validate(blocking_lock_row));
+
+ row->requested_lock_row = requested_lock_row;
+ row->blocking_lock_row = blocking_lock_row;
+
+ return(row);
+}
+
+/*******************************************************************//**
+Calculates a hash fold for a lock. For a record lock the fold is
+calculated from 4 elements, which uniquely identify a lock at a given
+point in time: transaction id, space id, page number, record number.
+For a table lock the fold is the table's id.
+@return fold */
+static
+ulint
+fold_lock(
+/*======*/
+ const lock_t* lock, /*!< in: lock object to fold */
+ ulint heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+#ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+ static ulint fold = 0;
+
+ return(fold++);
+#else
+ ulint ret;
+
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ut_a(heap_no != 0xFFFF);
+ ret = ut_fold_ulint_pair((ulint) lock->trx->id,
+ lock->un_member.rec_lock.page_id.
+ fold());
+ ret = ut_fold_ulint_pair(ret, heap_no);
+
+ break;
+ case LOCK_TABLE:
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == 0xFFFF);
+
+ ret = (ulint) lock_get_table_id(lock);
+
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ret);
+#endif
+}
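+
+/* Thus the fold of a record lock is derived from the triple
+(transaction id, page id, heap number) -- the same fields that
+locks_row_eq_lock() compares below -- while a table lock is folded on
+its table id alone. */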
+
+/*******************************************************************//**
+Checks whether i_s_locks_row_t object represents a lock_t object.
+@return TRUE if they match */
+static
+ibool
+locks_row_eq_lock(
+/*==============*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ const lock_t* lock, /*!< in: lock object */
+ ulint heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+ ut_ad(i_s_locks_row_validate(row));
+#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+ return(0);
+#else
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ut_a(heap_no != 0xFFFF);
+
+ return(row->lock_trx_id == lock->trx->id
+ && row->lock_page == lock->un_member.rec_lock.page_id
+ && row->lock_rec == heap_no);
+
+ case LOCK_TABLE:
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == 0xFFFF);
+
+ return(row->lock_trx_id == lock->trx->id
+ && row->lock_table_id == lock_get_table_id(lock));
+
+ default:
+ ut_error;
+ return(FALSE);
+ }
+#endif
+}
+
+/*******************************************************************//**
+Searches for a row in the innodb_locks cache that matches a given lock.
+This happens in O(1) time since a hash table is used. Returns pointer to
+the row or NULL if none is found.
+@return row or NULL */
+static
+i_s_locks_row_t*
+search_innodb_locks(
+/*================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ const lock_t* lock, /*!< in: lock to search for */
+ uint16_t heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+ i_s_hash_chain_t* hash_chain;
+
+ HASH_SEARCH(
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ &cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* the type of the next variable */
+ i_s_hash_chain_t*,
+ /* auxiliary variable */
+ hash_chain,
+ /* assertion on every traversed item */
+ ut_ad(i_s_locks_row_validate(hash_chain->value)),
+ /* this determines if we have found the lock */
+ locks_row_eq_lock(hash_chain->value, lock, heap_no));
+
+ if (hash_chain == NULL) {
+
+ return(NULL);
+ }
+ /* else */
+
+ return(hash_chain->value);
+}
+
+/*******************************************************************//**
+Adds a new element to the locks cache, enlarging it if necessary.
+Returns a pointer to the added row. If the row is already present then
+no row is added and a pointer to the existing row is returned.
+If the row cannot be allocated then NULL is returned.
+@return row */
+static
+i_s_locks_row_t*
+add_lock_to_cache(
+/*==============*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const lock_t* lock, /*!< in: the element to add */
+ uint16_t heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+ i_s_locks_row_t* dst_row;
+
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ ulint i;
+ for (i = 0; i < 10000; i++) {
+#endif
+#ifndef TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+ /* quit if this lock is already present */
+ dst_row = search_innodb_locks(cache, lock, heap_no);
+ if (dst_row != NULL) {
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+ }
+#endif
+
+ dst_row = (i_s_locks_row_t*)
+ table_cache_create_empty_row(&cache->innodb_locks, cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(NULL);
+ }
+
+ if (!fill_locks_row(dst_row, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ cache->innodb_locks.rows_used--;
+ return(NULL);
+ }
+
+#ifndef TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+ HASH_INSERT(
+ /* the type used in the hash chain */
+ i_s_hash_chain_t,
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ &cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* add this data to the hash */
+ &dst_row->hash_chain);
+#endif
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ } /* for()-loop */
+#endif
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+}
+
+/*******************************************************************//**
+Adds a new pair of locks to the lock waits cache.
+If memory cannot be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+add_lock_wait_to_cache(
+/*===================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ i_s_lock_waits_row_t* dst_row;
+
+ dst_row = (i_s_lock_waits_row_t*)
+ table_cache_create_empty_row(&cache->innodb_lock_waits,
+ cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(FALSE);
+ }
+
+ fill_lock_waits_row(dst_row, requested_lock_row, blocking_lock_row);
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Adds a transaction's relevant (important) locks to the cache.
+If the transaction is waiting, then the wait lock is added to
+innodb_locks and a pointer to the added row is returned in
+requested_lock_row, otherwise requested_lock_row is set to NULL.
+If rows cannot be allocated then FALSE is returned and the value of
+requested_lock_row is undefined.
+@return FALSE if allocation fails */
+static
+ibool
+add_trx_relevant_locks_to_cache(
+/*============================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const trx_t* trx, /*!< in: transaction */
+ i_s_locks_row_t** requested_lock_row)/*!< out: pointer to the
+ requested lock row, or NULL or
+ undefined */
+{
+ ut_ad(lock_mutex_own());
+
+ /* If the transaction is waiting, we add the wait lock and all
+ locks from other transactions that are blocking the wait lock. */
+ if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ const lock_t* curr_lock;
+ i_s_locks_row_t* blocking_lock_row;
+ lock_queue_iterator_t iter;
+
+ ut_a(trx->lock.wait_lock != NULL);
+
+ uint16_t wait_lock_heap_no
+ = wait_lock_get_heap_no(trx->lock.wait_lock);
+
+ /* add the requested lock */
+ *requested_lock_row
+ = add_lock_to_cache(cache, trx->lock.wait_lock,
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (*requested_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* then iterate over the locks before the wait lock and
+ add the ones that are blocking it */
+
+ lock_queue_iterator_reset(&iter, trx->lock.wait_lock,
+ ULINT_UNDEFINED);
+
+ for (curr_lock = lock_queue_iterator_get_prev(&iter);
+ curr_lock != NULL;
+ curr_lock = lock_queue_iterator_get_prev(&iter)) {
+
+ if (lock_has_to_wait(trx->lock.wait_lock,
+ curr_lock)) {
+
+ /* add the lock that is
+ blocking trx->lock.wait_lock */
+ blocking_lock_row
+ = add_lock_to_cache(
+ cache, curr_lock,
+ /* heap_no is the same
+ for the requested and the
+ blocking locks */
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (blocking_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* add the relation between both locks
+ to innodb_lock_waits */
+ if (!add_lock_wait_to_cache(
+ cache, *requested_lock_row,
+ blocking_lock_row)) {
+
+ /* memory could not be allocated */
+ return(FALSE);
+ }
+ }
+ }
+ } else {
+
+ *requested_lock_row = NULL;
+ }
+
+ return(TRUE);
+}
+
+/** The minimum time for which the cache must not be updated after it
+was last read; measured in nanoseconds. We use this technique to ensure
+that SELECTs which join several INFORMATION SCHEMA tables read the same
+version of the cache. */
+#define CACHE_MIN_IDLE_TIME_NS 100000000 /* 0.1 sec */
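+
+/* For example, a query that joins INFORMATION_SCHEMA.INNODB_TRX with
+INNODB_LOCKS and INNODB_LOCK_WAITS issues several cache reads in quick
+succession; as long as consecutive reads are less than 0.1 s apart, the
+cache is not refreshed in between, and the join observes one consistent
+snapshot. */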
+
+/*******************************************************************//**
+Checks if the cache can safely be updated.
+@return whether the cache can be updated */
+static bool can_cache_be_updated(trx_i_s_cache_t* cache)
+{
+ /* cache->last_read is only updated when a shared rw lock on the
+ whole cache is being held (see trx_i_s_cache_end_read()) and
+ we are currently holding an exclusive rw lock on the cache.
+ So it is not possible for last_read to be updated while we are
+ reading it. */
+
+ ut_ad(rw_lock_own(&cache->rw_lock, RW_LOCK_X));
+
+ return my_interval_timer() - cache->last_read > CACHE_MIN_IDLE_TIME_NS;
+}
+
+/*******************************************************************//**
+Declare a cache empty, preparing it to be filled up. Not all resources
+are freed because they can be reused. */
+static
+void
+trx_i_s_cache_clear(
+/*================*/
+ trx_i_s_cache_t* cache) /*!< out: cache to clear */
+{
+ cache->innodb_trx.rows_used = 0;
+ cache->innodb_locks.rows_used = 0;
+ cache->innodb_lock_waits.rows_used = 0;
+
+ cache->locks_hash.clear();
+
+ ha_storage_empty(&cache->storage);
+}
+
+
+/**
+ Add transactions to innodb_trx's cache.
+
+ We also add all locks that are relevant to each transaction into
+ innodb_locks' and innodb_lock_waits' caches.
+*/
+
+static void fetch_data_into_cache_low(trx_i_s_cache_t *cache, const trx_t *trx)
+{
+ i_s_locks_row_t *requested_lock_row;
+
+#ifdef UNIV_DEBUG
+ {
+ const auto state= trx->state;
+
+ if (trx->is_autocommit_non_locking())
+ {
+ ut_ad(trx->read_only);
+ ut_ad(!trx->is_recovered);
+ ut_ad(trx->mysql_thd);
+ ut_ad(state == TRX_STATE_NOT_STARTED || state == TRX_STATE_ACTIVE);
+ }
+ else
+ ut_ad(state == TRX_STATE_ACTIVE ||
+ state == TRX_STATE_PREPARED ||
+ state == TRX_STATE_PREPARED_RECOVERED ||
+ state == TRX_STATE_COMMITTED_IN_MEMORY);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (add_trx_relevant_locks_to_cache(cache, trx, &requested_lock_row))
+ {
+ if (i_s_trx_row_t *trx_row= reinterpret_cast<i_s_trx_row_t*>(
+ table_cache_create_empty_row(&cache->innodb_trx, cache)))
+ {
+ if (fill_trx_row(trx_row, trx, requested_lock_row, cache))
+ return;
+ --cache->innodb_trx.rows_used;
+ }
+ }
+
+ /* memory could not be allocated */
+ cache->is_truncated= true;
+}
+
+
+/**
+ Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the
+ table cache buffer. Cache must be locked for write.
+*/
+
+static void fetch_data_into_cache(trx_i_s_cache_t *cache)
+{
+ ut_ad(lock_mutex_own());
+ trx_i_s_cache_clear(cache);
+
+ /* Capture the state of transactions */
+ trx_sys.trx_list.for_each([cache](trx_t &trx) {
+ if (!cache->is_truncated && trx.state != TRX_STATE_NOT_STARTED &&
+ &trx != purge_sys.query->trx)
+ {
+ mutex_enter(&trx.mutex);
+ if (trx.state != TRX_STATE_NOT_STARTED)
+ fetch_data_into_cache_low(cache, &trx);
+ mutex_exit(&trx.mutex);
+ }
+ });
+ cache->is_truncated= false;
+}
+
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+Called from handler/i_s.cc.
+@return 0 - fetched, 1 - not */
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+ trx_i_s_cache_t* cache) /*!< in/out: cache */
+{
+ if (!can_cache_be_updated(cache)) {
+
+ return(1);
+ }
+
+ /* We need to read trx_sys and record/table lock queues */
+
+ lock_mutex_enter();
+ fetch_data_into_cache(cache);
+ lock_mutex_exit();
+
+ /* update cache last read time */
+ cache->last_read = my_interval_timer();
+
+ return(0);
+}
+
+/*******************************************************************//**
+Returns true if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return true if truncated */
+bool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ return(cache->is_truncated);
+}
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_init(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< out: cache to init */
+{
+ /* The latching is done in the following order:
+ acquire trx_i_s_cache_t::rw_lock, X
+ acquire lock mutex
+ release lock mutex
+ release trx_i_s_cache_t::rw_lock
+ acquire trx_i_s_cache_t::rw_lock, S
+ release trx_i_s_cache_t::rw_lock */
+
+ rw_lock_create(trx_i_s_cache_lock_key, &cache->rw_lock,
+ SYNC_TRX_I_S_RWLOCK);
+
+ cache->last_read = 0;
+
+ table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t));
+ table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t));
+ table_cache_init(&cache->innodb_lock_waits,
+ sizeof(i_s_lock_waits_row_t));
+
+ cache->locks_hash.create(LOCKS_HASH_CELLS_NUM);
+
+ cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE,
+ CACHE_STORAGE_HASH_CELLS);
+
+ cache->mem_allocd = 0;
+
+ cache->is_truncated = false;
+}
+
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_free(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< in, own: cache to free */
+{
+ rw_lock_free(&cache->rw_lock);
+
+ cache->locks_hash.free();
+ ha_storage_free(cache->storage);
+ table_cache_free(&cache->innodb_trx);
+ table_cache_free(&cache->innodb_locks);
+ table_cache_free(&cache->innodb_lock_waits);
+}
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ rw_lock_s_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_end_read(
+/*===================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ cache->last_read = my_interval_timer();
+ rw_lock_s_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_start_write(
+/*======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ rw_lock_x_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_end_write(
+/*====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ ut_ad(rw_lock_own(&cache->rw_lock, RW_LOCK_X));
+
+ rw_lock_x_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Selects an INFORMATION SCHEMA table cache from the whole cache.
+@return table cache */
+static
+i_s_table_cache_t*
+cache_select_table(
+/*===============*/
+ trx_i_s_cache_t* cache, /*!< in: whole cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ ut_ad(rw_lock_own_flagged(&cache->rw_lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+
+ switch (table) {
+ case I_S_INNODB_TRX:
+ return &cache->innodb_trx;
+ case I_S_INNODB_LOCKS:
+ return &cache->innodb_locks;
+ case I_S_INNODB_LOCK_WAITS:
+ return &cache->innodb_lock_waits;
+ }
+
+ ut_error;
+ return NULL;
+}
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ i_s_table_cache_t* table_cache;
+
+ table_cache = cache_select_table(cache, table);
+
+ return(table_cache->rows_used);
+}
+
+/*******************************************************************//**
+Retrieves the nth row (zero-based) in the cache for a given
+INFORMATION SCHEMA table.
+@return row */
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table, /*!< in: which table */
+ ulint n) /*!< in: row number */
+{
+ i_s_table_cache_t* table_cache;
+ ulint i;
+ void* row;
+
+ table_cache = cache_select_table(cache, table);
+
+ ut_a(n < table_cache->rows_used);
+
+ row = NULL;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd > n) {
+
+ row = (char*) table_cache->chunks[i].base
+ + (n - table_cache->chunks[i].offset)
+ * table_cache->row_size;
+ break;
+ }
+ }
+
+ ut_a(row != NULL);
+
+ return(row);
+}
+
+/*******************************************************************//**
+Crafts a lock id string from an i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 bytes
+if you want to be 100% sure that it will not abort.
+@return resulting lock id */
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ char* lock_id,/*!< out: resulting lock_id */
+ ulint lock_id_size)/*!< in: size of the lock id
+ buffer */
+{
+ int res_len;
+
+ /* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */
+
+ if (row->lock_index) {
+ /* record lock */
+ res_len = snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT
+ ":%u:%u:%u",
+ row->lock_trx_id, row->lock_page.space(),
+ row->lock_page.page_no(), row->lock_rec);
+ } else {
+ /* table lock */
+ res_len = snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT":" UINT64PF,
+ row->lock_trx_id,
+ row->lock_table_id);
+ }
+
+ /* the typecast is safe because snprintf(3) never returns
+ negative result */
+ ut_a(res_len >= 0);
+ ut_a((ulint) res_len < lock_id_size);
+
+ return(lock_id);
+}
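+
+/* Example outputs of the formats above: a record lock held by
+transaction 1234 on heap number 2 of page 3 in tablespace 5 is rendered
+as "1234:5:3:2", while a table lock of the same transaction on table
+id 567 becomes "1234:567". */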
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
new file mode 100644
index 00000000..28491853
--- /dev/null
+++ b/storage/innobase/trx/trx0purge.cc
@@ -0,0 +1,1297 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0purge.cc
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0purge.h"
+#include "fsp0fsp.h"
+#include "fut0fut.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "os0thread.h"
+#include "que0que.h"
+#include "row0purge.h"
+#include "row0upd.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "sync0sync.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include <mysql/service_wsrep.h>
+
+#include <unordered_map>
+
+/** Maximum allowable purge history length. 0 means 'infinite'. */
+ulong srv_max_purge_lag = 0;
+
+/** Maximum DML user thread delay, in microseconds. */
+ulong srv_max_purge_lag_delay = 0;
+
+/** The global data structure coordinating a purge */
+purge_sys_t purge_sys;
+
+/** A dummy undo record used as a return value when we have a whole undo log
+which needs no purge */
+trx_undo_rec_t trx_purge_dummy_rec;
+
+#ifdef UNIV_DEBUG
+my_bool srv_purge_view_update_only_debug;
+#endif /* UNIV_DEBUG */
+
+/** Sentinel value */
+static const TrxUndoRsegs NullElement;
+
+/** Default constructor */
+TrxUndoRsegsIterator::TrxUndoRsegsIterator()
+ : m_rsegs(NullElement), m_iter(m_rsegs.begin())
+{
+}
+
+/** Sets the next rseg to purge in purge_sys.
+Executed in the purge coordinator thread.
+@return whether anything is to be purged */
+inline bool TrxUndoRsegsIterator::set_next()
+{
+ mutex_enter(&purge_sys.pq_mutex);
+
+ /* Only purge consumes events from the priority queue; user
+ threads only produce them. */
+
+ /* Check if there are more rsegs to process in the
+ current element. */
+ if (m_iter != m_rsegs.end()) {
+ /* We are still processing a rollback segment from
+ the same transaction, so the expected transaction
+ number should not increase. Undo the increment of
+ the expected commit number done by the caller, which
+ assumed that all rollback segments of the given
+ transaction were done. */
+ purge_sys.tail.trx_no = (*m_iter)->last_trx_no();
+ } else if (!purge_sys.purge_queue.empty()) {
+ m_rsegs = purge_sys.purge_queue.top();
+ purge_sys.purge_queue.pop();
+ ut_ad(purge_sys.purge_queue.empty()
+ || purge_sys.purge_queue.top() != m_rsegs);
+ m_iter = m_rsegs.begin();
+ } else {
+ /* Queue is empty, reset iterator. */
+ purge_sys.rseg = NULL;
+ mutex_exit(&purge_sys.pq_mutex);
+ m_rsegs = NullElement;
+ m_iter = m_rsegs.begin();
+ return false;
+ }
+
+ purge_sys.rseg = *m_iter++;
+ mutex_exit(&purge_sys.pq_mutex);
+ mutex_enter(&purge_sys.rseg->mutex);
+
+ ut_a(purge_sys.rseg->last_page_no != FIL_NULL);
+ ut_ad(purge_sys.rseg->last_trx_no() == m_rsegs.trx_no);
+
+ /* In the purge of externally stored fields, we assume that the
+ space id is in the range of UNDO tablespace space ids */
+ ut_ad(purge_sys.rseg->space->id == TRX_SYS_SPACE
+ || srv_is_undo_tablespace(purge_sys.rseg->space->id));
+
+ ut_a(purge_sys.tail.trx_no <= purge_sys.rseg->last_trx_no());
+
+ purge_sys.tail.trx_no = purge_sys.rseg->last_trx_no();
+ purge_sys.hdr_offset = purge_sys.rseg->last_offset();
+ purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
+
+ mutex_exit(&purge_sys.rseg->mutex);
+
+ return(true);
+}
+
+/** Build a purge 'query' graph. The actual purge is performed by executing
+this query graph.
+@return own: the query graph */
+static
+que_t*
+purge_graph_build()
+{
+ ut_a(srv_n_purge_threads > 0);
+
+ trx_t* trx = trx_create();
+ ut_ad(!trx->id);
+ trx->start_time = time(NULL);
+ trx->start_time_micro = microsecond_interval_timer();
+ trx->state = TRX_STATE_ACTIVE;
+ trx->op_info = "purge trx";
+
+ mem_heap_t* heap = mem_heap_create(512);
+ que_fork_t* fork = que_fork_create(
+ NULL, NULL, QUE_FORK_PURGE, heap);
+ fork->trx = trx;
+
+ for (auto i = innodb_purge_threads_MAX; i; i--) {
+ que_thr_t* thr = que_thr_create(fork, heap, NULL);
+ thr->child = new(mem_heap_alloc(heap, sizeof(purge_node_t)))
+ purge_node_t(thr);
+ }
+
+ return(fork);
+}
+
+/** Initialise the purge system. */
+void purge_sys_t::create()
+{
+ ut_ad(this == &purge_sys);
+ ut_ad(!heap);
+ ut_ad(!enabled());
+ m_paused= 0;
+ query= purge_graph_build();
+ next_stored= false;
+ rseg= NULL;
+ page_no= 0;
+ offset= 0;
+ hdr_page_no= 0;
+ hdr_offset= 0;
+ rw_lock_create(trx_purge_latch_key, &latch, SYNC_PURGE_LATCH);
+ mutex_create(LATCH_ID_PURGE_SYS_PQ, &pq_mutex);
+ truncate.current= NULL;
+ truncate.last= NULL;
+ heap= mem_heap_create(4096);
+}
+
+/** Close the purge subsystem on shutdown. */
+void purge_sys_t::close()
+{
+ ut_ad(this == &purge_sys);
+ if (!heap)
+ return;
+
+ ut_ad(!enabled());
+ trx_t* trx = query->trx;
+ que_graph_free(query);
+ ut_ad(!trx->id);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ trx->state= TRX_STATE_NOT_STARTED;
+ trx->free();
+ rw_lock_free(&latch);
+ mutex_free(&pq_mutex);
+ mem_heap_free(heap);
+ heap= nullptr;
+}
+
+/*================ UNDO LOG HISTORY LIST =============================*/
+
+/** Prepend the history list with an undo log.
+Remove the undo log segment from the rseg slot if it is too big for reuse.
+@param[in] trx transaction
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction */
+void
+trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
+{
+ DBUG_PRINT("trx", ("commit(" TRX_ID_FMT "," TRX_ID_FMT ")",
+ trx->id, trx_id_t{trx->rw_trx_hash_element->no}));
+ ut_ad(undo == trx->rsegs.m_redo.undo);
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+ ut_ad(undo->rseg == rseg);
+ buf_block_t* rseg_header = trx_rsegf_get(
+ rseg->space, rseg->page_no, mtr);
+ buf_block_t* undo_page = trx_undo_set_state_at_finish(
+ undo, mtr);
+ trx_ulogf_t* undo_header = undo_page->frame + undo->hdr_offset;
+
+ ut_ad(mach_read_from_2(undo_header + TRX_UNDO_NEEDS_PURGE) <= 1);
+
+ if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rseg_header->frame))) {
+ /* This database must have been upgraded from
+ before MariaDB 10.3.5. */
+ trx_rseg_format_upgrade(rseg_header, mtr);
+ }
+
+ if (undo->state != TRX_UNDO_CACHED) {
+ /* The undo log segment will not be reused */
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ mtr->memset(rseg_header,
+ TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+ + undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff);
+
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
+
+ uint32_t hist_size = mach_read_from_4(TRX_RSEG_HISTORY_SIZE
+ + TRX_RSEG
+ + rseg_header->frame);
+
+ ut_ad(undo->size == flst_get_len(TRX_UNDO_SEG_HDR
+ + TRX_UNDO_PAGE_LIST
+ + undo_page->frame));
+
+ mtr->write<4>(*rseg_header, TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rseg_header->frame,
+ hist_size + undo->size);
+ mtr->write<8>(*rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID
+ + rseg_header->frame,
+ trx_sys.get_max_trx_id());
+ }
+
+ /* After the purge thread has been given permission to exit,
+ we may roll back transactions (trx->undo_no==0)
+ in THD::cleanup() invoked from unlink_thd() in fast shutdown,
+ or in trx_rollback_recovered() in slow shutdown.
+
+ Before any transaction-generating background threads or the
+ purge have been started, we can
+ start transactions in row_merge_drop_temp_indexes() and
+ fts_drop_orphaned_tables(), and roll back recovered transactions.
+
+ Arbitrary user transactions may be executed when all the undo log
+ related background processes (including purge) are disabled due to
+ innodb_force_recovery=2 or innodb_force_recovery=3.
+ DROP TABLE may be executed at any innodb_force_recovery level.
+
+ During fast shutdown, we may also continue to execute
+ user transactions. */
+ ut_ad(srv_undo_sources
+ || trx->undo_no == 0
+ || (!purge_sys.enabled()
+ && (srv_is_being_started
+ || trx_rollback_is_active
+ || srv_force_recovery >= SRV_FORCE_NO_BACKGROUND))
+ || ((trx->mysql_thd || trx->internal)
+ && srv_fast_shutdown));
+
+#ifdef WITH_WSREP
+ if (wsrep_is_wsrep_xid(trx->xid)) {
+ trx_rseg_update_wsrep_checkpoint(rseg_header, trx->xid, mtr);
+ }
+#endif
+
+ if (trx->mysql_log_file_name && *trx->mysql_log_file_name) {
+ /* Update the latest MySQL binlog name and offset information
+ in the rollback segment header if MySQL binlogging is on
+ or the database server is a MySQL replication slave. */
+ trx_rseg_update_binlog_offset(rseg_header, trx, mtr);
+ }
+
+ /* Add the log as the first in the history list */
+ flst_add_first(rseg_header, TRX_RSEG + TRX_RSEG_HISTORY, undo_page,
+ static_cast<uint16_t>(undo->hdr_offset
+ + TRX_UNDO_HISTORY_NODE), mtr);
+
+ mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page,
+ undo_header + TRX_UNDO_TRX_NO,
+ trx->rw_trx_hash_element->no);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_header
+ + TRX_UNDO_NEEDS_PURGE, 1U);
+
+ if (rseg->last_page_no == FIL_NULL) {
+ rseg->last_page_no = undo->hdr_page_no;
+ rseg->set_last_commit(undo->hdr_offset,
+ trx->rw_trx_hash_element->no);
+ rseg->needs_purge = true;
+ }
+
+ trx_sys.rseg_history_len++;
+
+ if (undo->state == TRX_UNDO_CACHED) {
+ UT_LIST_ADD_FIRST(rseg->undo_cached, undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_PURGE);
+ ut_free(undo);
+ }
+
+ undo = NULL;
+}
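+
+/* Note the ordering that the above establishes: flst_add_first() keeps
+the history list sorted with the most recent commit first, so
+trx_purge_truncate_rseg_history() can walk it from the oldest end via
+flst_get_last() and stop at the first undo log that is still needed. */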
+
+/** Remove undo log header from the history list.
+@param[in,out] rseg rollback segment header page
+@param[in] log undo log segment header page
+@param[in] offset byte offset in the undo log segment header page
+@param[in,out] mtr mini-transaction */
+static void trx_purge_remove_log_hdr(buf_block_t *rseg, buf_block_t* log,
+ uint16_t offset, mtr_t *mtr)
+{
+ flst_remove(rseg, TRX_RSEG + TRX_RSEG_HISTORY,
+ log, static_cast<uint16_t>(offset + TRX_UNDO_HISTORY_NODE), mtr);
+ trx_sys.rseg_history_len--;
+}
+
+/** Free an undo log segment, and remove the header from the history list.
+@param[in,out] rseg rollback segment
+@param[in] hdr_addr file address of log_hdr */
+static
+void
+trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr)
+{
+ mtr_t mtr;
+
+ mtr.start();
+ mutex_enter(&rseg->mutex);
+
+ buf_block_t* rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr);
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(rseg->space->id, hdr_addr.page), &mtr);
+
+ /* Mark the last undo log totally purged, so that if the
+ system crashes, the tail of the undo log will not get accessed
+ again. The list of pages in the undo log tail gets
+ inconsistent during the freeing of the segment, and therefore
+ purge should not try to access them again. */
+ mtr.write<2,mtr_t::MAYBE_NOP>(*block, block->frame + hdr_addr.boffset
+ + TRX_UNDO_NEEDS_PURGE, 0U);
+
+ while (!fseg_free_step_not_header(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + block->frame, &mtr)) {
+ mutex_exit(&rseg->mutex);
+
+ mtr.commit();
+ mtr.start();
+
+ mutex_enter(&rseg->mutex);
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr);
+
+ block = trx_undo_page_get(
+ page_id_t(rseg->space->id, hdr_addr.page), &mtr);
+ }
+
+ /* The page list may now be inconsistent, but the length field
+ stored in the list base node tells us how big it was before we
+ started the freeing. */
+
+ const uint32_t seg_size = flst_get_len(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame);
+
+ /* We may free the undo log segment header page; it must be freed
+ within the same mtr as the undo log header is removed from the
+ history list: otherwise, in case of a database crash, the segment
+ could become inaccessible garbage in the file space. */
+
+ trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset, &mtr);
+
+ do {
+
+ /* Here we assume that a file segment with just the header
+ page can be freed in a few steps, so that the buffer pool
+ is not flooded with bufferfixed pages: see the note in
+ fsp0fsp.cc. */
+
+ } while (!fseg_free_step(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + block->frame, &mtr));
+
+ byte* hist = TRX_RSEG + TRX_RSEG_HISTORY_SIZE + rseg_hdr->frame;
+ ut_ad(mach_read_from_4(hist) >= seg_size);
+
+ mtr.write<4>(*rseg_hdr, hist, mach_read_from_4(hist) - seg_size);
+
+ ut_ad(rseg->curr_size >= seg_size);
+
+ rseg->curr_size -= seg_size;
+
+ mutex_exit(&(rseg->mutex));
+
+ mtr_commit(&mtr);
+}
+
+/** Remove unnecessary history data from a rollback segment.
+@param[in,out] rseg rollback segment
+@param[in] limit truncate anything before this */
+static
+void
+trx_purge_truncate_rseg_history(
+ trx_rseg_t& rseg,
+ const purge_sys_t::iterator& limit)
+{
+ fil_addr_t hdr_addr;
+ fil_addr_t prev_hdr_addr;
+ mtr_t mtr;
+ trx_id_t undo_trx_no;
+
+ mtr.start();
+ ut_ad(rseg.is_persistent());
+ mutex_enter(&rseg.mutex);
+
+ buf_block_t* rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr);
+
+ hdr_addr = flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY
+ + rseg_hdr->frame);
+ hdr_addr.boffset = static_cast<uint16_t>(hdr_addr.boffset
+ - TRX_UNDO_HISTORY_NODE);
+
+loop:
+ if (hdr_addr.page == FIL_NULL) {
+func_exit:
+ mutex_exit(&rseg.mutex);
+ mtr.commit();
+ return;
+ }
+
+ buf_block_t* block = trx_undo_page_get(page_id_t(rseg.space->id,
+ hdr_addr.page),
+ &mtr);
+ undo_trx_no = mach_read_from_8(block->frame + hdr_addr.boffset
+ + TRX_UNDO_TRX_NO);
+
+ if (undo_trx_no >= limit.trx_no) {
+ if (undo_trx_no == limit.trx_no) {
+ trx_undo_truncate_start(
+ &rseg, hdr_addr.page,
+ hdr_addr.boffset, limit.undo_no);
+ }
+
+ goto func_exit;
+ }
+
+ prev_hdr_addr = flst_get_prev_addr(block->frame + hdr_addr.boffset
+ + TRX_UNDO_HISTORY_NODE);
+ prev_hdr_addr.boffset = static_cast<uint16_t>(prev_hdr_addr.boffset
+ - TRX_UNDO_HISTORY_NODE);
+
+ if (mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->frame)
+ == TRX_UNDO_TO_PURGE
+ && !mach_read_from_2(block->frame + hdr_addr.boffset
+ + TRX_UNDO_NEXT_LOG)) {
+
+ /* We can free the whole log segment */
+
+ mutex_exit(&rseg.mutex);
+ mtr.commit();
+
+ /* trx_purge_remove_log_hdr() is called
+ inside trx_purge_free_segment(). */
+ trx_purge_free_segment(&rseg, hdr_addr);
+ } else {
+ /* Remove the log hdr from the rseg history. */
+ trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset,
+ &mtr);
+
+ mutex_exit(&rseg.mutex);
+ mtr.commit();
+ }
+
+ mtr.start();
+ mutex_enter(&rseg.mutex);
+
+ rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr);
+
+ hdr_addr = prev_hdr_addr;
+
+ goto loop;
+}
+
+/** Cleanse the purge queue to remove rsegs that reside in the undo
+tablespace marked for truncation.
+@param[in] space undo tablespace being truncated */
+static void trx_purge_cleanse_purge_queue(const fil_space_t& space)
+{
+ typedef std::vector<TrxUndoRsegs> purge_elem_list_t;
+ purge_elem_list_t purge_elem_list;
+
+ mutex_enter(&purge_sys.pq_mutex);
+
+ /* Remove rseg instances that are in the purge queue before we
+ start the truncation of the corresponding undo tablespace. */
+ while (!purge_sys.purge_queue.empty()) {
+ purge_elem_list.push_back(purge_sys.purge_queue.top());
+ purge_sys.purge_queue.pop();
+ }
+
+ for (purge_elem_list_t::iterator it = purge_elem_list.begin();
+ it != purge_elem_list.end();
+ ++it) {
+
+ for (TrxUndoRsegs::iterator it2 = it->begin();
+ it2 != it->end();
+ ++it2) {
+ if ((*it2)->space == &space) {
+ it->erase(it2);
+ break;
+ }
+ }
+
+ if (!it->empty()) {
+ purge_sys.purge_queue.push(*it);
+ }
+ }
+
+ mutex_exit(&purge_sys.pq_mutex);
+}
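+
+/* The drain-filter-rebuild pattern above is used because
+std::priority_queue offers no way to erase an arbitrary element: the
+queue is emptied into a plain vector, rsegs of the truncated tablespace
+are dropped, and the surviving elements are pushed back. */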
+
+/**
+Removes unnecessary history data from rollback segments. NOTE that when this
+function is called, the caller must not have any latches on undo log pages!
+*/
+static void trx_purge_truncate_history()
+{
+ ut_ad(purge_sys.head <= purge_sys.tail);
+ purge_sys_t::iterator& head = purge_sys.head.trx_no
+ ? purge_sys.head : purge_sys.tail;
+
+ if (head.trx_no >= purge_sys.low_limit_no()) {
+ /* This is sometimes necessary. TODO: find out why. */
+ head.trx_no = purge_sys.low_limit_no();
+ head.undo_no = 0;
+ }
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) {
+ ut_ad(rseg->id == i);
+ trx_purge_truncate_rseg_history(*rseg, head);
+ }
+ }
+
+ if (srv_undo_tablespaces_active < 2) {
+ return;
+ }
+
+ while (srv_undo_log_truncate) {
+ if (!purge_sys.truncate.current) {
+ const ulint threshold = ulint(srv_max_undo_log_size
+ >> srv_page_size_shift);
+ for (ulint i = purge_sys.truncate.last
+ ? purge_sys.truncate.last->id
+ - srv_undo_space_id_start
+ : 0, j = i;; ) {
+ ulint space_id = srv_undo_space_id_start + i;
+ ut_ad(srv_is_undo_tablespace(space_id));
+ fil_space_t* space= fil_space_get(space_id);
+
+ if (space && space->get_size() > threshold) {
+ purge_sys.truncate.current = space;
+ break;
+ }
+
+ ++i;
+ i %= srv_undo_tablespaces_active;
+ if (i == j) {
+ break;
+ }
+ }
+ }
+
+ if (!purge_sys.truncate.current) {
+ return;
+ }
+
+ fil_space_t& space = *purge_sys.truncate.current;
+ /* An undo tablespace always consists of a single file. */
+ ut_a(UT_LIST_GET_LEN(space.chain) == 1);
+ fil_node_t* file = UT_LIST_GET_FIRST(space.chain);
+ /* The undo tablespace files are never closed. */
+ ut_ad(file->is_open());
+
+ DBUG_LOG("undo", "marking for truncate: " << file->name);
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) {
+ ut_ad(rseg->is_persistent());
+ if (rseg->space == &space) {
+ /* Once set, this rseg will
+ not be allocated to subsequent
+ transactions, but we will wait
+ for existing active
+ transactions to finish. */
+ rseg->skip_allocation = true;
+ }
+ }
+ }
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ trx_rseg_t* rseg = trx_sys.rseg_array[i];
+ if (!rseg || rseg->space != &space) {
+ continue;
+ }
+ mutex_enter(&rseg->mutex);
+ ut_ad(rseg->skip_allocation);
+ if (rseg->trx_ref_count) {
+not_free:
+ mutex_exit(&rseg->mutex);
+ return;
+ }
+
+ if (rseg->curr_size != 1) {
+ /* Check if all segments are
+ cached and safe to remove. */
+ ulint cached = 0;
+
+ for (trx_undo_t* undo = UT_LIST_GET_FIRST(
+ rseg->undo_cached);
+ undo;
+ undo = UT_LIST_GET_NEXT(undo_list,
+ undo)) {
+ if (head.trx_no < undo->trx_id) {
+ goto not_free;
+ } else {
+ cached += undo->size;
+ }
+ }
+
+ ut_ad(rseg->curr_size > cached);
+
+ if (rseg->curr_size > cached + 1) {
+ goto not_free;
+ }
+ }
+
+ mutex_exit(&rseg->mutex);
+ }
+
+ ib::info() << "Truncating " << file->name;
+ trx_purge_cleanse_purge_queue(space);
+
+ /* Flush all to-be-discarded pages of the tablespace.
+
+ During truncation, we do not want any writes to the
+ to-be-discarded area, because we must set the space.size
+ early in order to have deterministic page allocation.
+
+ If a log checkpoint was completed at LSN earlier than our
+ mini-transaction commit and the server was killed, then
+ discarding the to-be-trimmed pages without flushing would
+ break crash recovery. So, we cannot avoid the write. */
+ while (buf_flush_list_space(&space));
+
+ log_free_check();
+
+ /* Adjust the tablespace metadata. */
+ if (!fil_truncate_prepare(space.id)) {
+ ib::error() << "Failed to find UNDO tablespace "
+ << file->name;
+ return;
+ }
+
+ /* Re-initialize tablespace, in a single mini-transaction. */
+ mtr_t mtr;
+ const ulint size = SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
+ mtr.start();
+ mtr_x_lock_space(purge_sys.truncate.current, &mtr);
+ /* Associate the undo tablespace with mtr.
+ During mtr::commit(), InnoDB can use the undo
+ tablespace object to clear all freed ranges */
+ mtr.set_named_space(purge_sys.truncate.current);
+ mtr.trim_pages(page_id_t(space.id, size));
+ fsp_header_init(purge_sys.truncate.current, size, &mtr);
+ mutex_enter(&fil_system.mutex);
+ purge_sys.truncate.current->size = file->size = size;
+ mutex_exit(&fil_system.mutex);
+
+ buf_block_t* sys_header = trx_sysf_get(&mtr);
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ trx_rseg_t* rseg = trx_sys.rseg_array[i];
+ if (!rseg || rseg->space != &space) {
+ continue;
+ }
+
+ ut_ad(rseg->is_persistent());
+ ut_d(const ulint old_page = rseg->page_no);
+
+ buf_block_t* rblock = trx_rseg_header_create(
+ purge_sys.truncate.current,
+ rseg->id, sys_header, &mtr);
+ ut_ad(rblock);
+ rseg->page_no = rblock
+ ? rblock->page.id().page_no() : FIL_NULL;
+ ut_ad(old_page == rseg->page_no);
+
+ /* Before re-initialization ensure that we
+ free the existing structure. There can't be
+ any active transactions. */
+ ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0);
+
+ trx_undo_t* next_undo;
+
+ for (trx_undo_t* undo = UT_LIST_GET_FIRST(
+ rseg->undo_cached);
+ undo; undo = next_undo) {
+
+ next_undo = UT_LIST_GET_NEXT(undo_list, undo);
+ UT_LIST_REMOVE(rseg->undo_cached, undo);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ ut_free(undo);
+ }
+
+ UT_LIST_INIT(rseg->undo_list,
+ &trx_undo_t::undo_list);
+ UT_LIST_INIT(rseg->undo_cached,
+ &trx_undo_t::undo_list);
+
+ /* These were written by trx_rseg_header_create(). */
+ ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rblock->frame));
+ ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rblock->frame));
+
+ /* Initialize the undo log lists according to
+ the rseg header */
+ rseg->curr_size = 1;
+ rseg->trx_ref_count = 0;
+ rseg->last_page_no = FIL_NULL;
+ rseg->last_commit_and_offset = 0;
+ rseg->needs_purge = false;
+ }
+
+ mtr.commit();
+ /* Write-ahead the redo log record. */
+ log_write_up_to(mtr.commit_lsn(), true);
+
+ /* Trim the file size. */
+ os_file_truncate(file->name, file->handle,
+ os_offset_t(size) << srv_page_size_shift,
+ true);
+
+ /* This is only executed by srv_purge_coordinator_thread. */
+ export_vars.innodb_undo_truncations++;
+
+ /* In MDEV-8319 (10.5) we will PUNCH_HOLE the garbage
+ (with write-ahead logging). */
+ mutex_enter(&fil_system.mutex);
+ ut_ad(&space == purge_sys.truncate.current);
+ ut_ad(space.is_being_truncated);
+ purge_sys.truncate.current->set_stopping(false);
+ purge_sys.truncate.current->is_being_truncated = false;
+ mutex_exit(&fil_system.mutex);
+
+ if (purge_sys.rseg != NULL
+ && purge_sys.rseg->last_page_no == FIL_NULL) {
+ /* If purge_sys.rseg is pointing to rseg that
+ was recently truncated then move to next rseg
+ element. Note: Ideally purge_sys.rseg should
+ be NULL because purge should complete
+ processing of all the records but there is
+ purge_batch_size that can force the purge loop
+ to exit before all the records are purged and
+ in this case purge_sys.rseg could point to a
+ valid rseg waiting for next purge cycle. */
+ purge_sys.next_stored = false;
+ purge_sys.rseg = NULL;
+ }
+
+ DBUG_EXECUTE_IF("ib_undo_trunc",
+ ib::info() << "ib_undo_trunc";
+ log_buffer_flush_to_disk();
+ DBUG_SUICIDE(););
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) {
+ ut_ad(rseg->is_persistent());
+ if (rseg->space == &space) {
+ rseg->skip_allocation = false;
+ }
+ }
+ }
+
+ ib::info() << "Truncated " << file->name;
+ purge_sys.truncate.last = purge_sys.truncate.current;
+ purge_sys.truncate.current = NULL;
+ }
+}
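+/* The tablespace selection loop above is a circular scan: it starts at
+the tablespace after purge_sys.truncate.last (the one truncated most
+recently) and wraps around modulo srv_undo_tablespaces_active, so that
+truncation work is spread across all active undo tablespaces instead of
+repeatedly hitting the first one. A sketch of the wrap-around
+iteration, with needs_truncation() as a stand-in for the size check:
+
+	for (ulint i = start, j = i;;) {
+		if (needs_truncation(i)) { pick(i); break; }
+		i = (i + 1) % n_active;
+		if (i == j) break;	// every tablespace inspected once
+	}
+*/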
+
+/***********************************************************************//**
+Updates the last not yet purged history log info in rseg when we have purged
+a whole undo log. Also advances purge_sys.tail.trx_no past the purged log. */
+static void trx_purge_rseg_get_next_history_log(
+ ulint* n_pages_handled)/*!< in/out: number of UNDO pages
+ handled */
+{
+ fil_addr_t prev_log_addr;
+ trx_id_t trx_no;
+ mtr_t mtr;
+
+ mutex_enter(&purge_sys.rseg->mutex);
+
+ ut_a(purge_sys.rseg->last_page_no != FIL_NULL);
+
+ purge_sys.tail.trx_no = purge_sys.rseg->last_trx_no() + 1;
+ purge_sys.tail.undo_no = 0;
+ purge_sys.next_stored = false;
+
+ mtr.start();
+
+ const buf_block_t* undo_page = trx_undo_page_get_s_latched(
+ page_id_t(purge_sys.rseg->space->id,
+ purge_sys.rseg->last_page_no), &mtr);
+
+ const trx_ulogf_t* log_hdr = undo_page->frame
+ + purge_sys.rseg->last_offset();
+
+ /* Increase the purge page count by one for every handled log */
+
+ (*n_pages_handled)++;
+
+ prev_log_addr = flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE);
+ prev_log_addr.boffset = static_cast<uint16_t>(prev_log_addr.boffset
+ - TRX_UNDO_HISTORY_NODE);
+
+ const bool empty = prev_log_addr.page == FIL_NULL;
+
+ if (empty) {
+ /* No logs left in the history list */
+ purge_sys.rseg->last_page_no = FIL_NULL;
+ }
+
+ mutex_exit(&purge_sys.rseg->mutex);
+ mtr.commit();
+
+ if (empty) {
+ return;
+ }
+
+ /* Read the previous log header. */
+ mtr.start();
+
+ log_hdr = trx_undo_page_get_s_latched(
+ page_id_t(purge_sys.rseg->space->id, prev_log_addr.page),
+ &mtr)->frame
+ + prev_log_addr.boffset;
+
+ trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+ ut_ad(mach_read_from_2(log_hdr + TRX_UNDO_NEEDS_PURGE) <= 1);
+
+ mtr_commit(&mtr);
+
+ mutex_enter(&purge_sys.rseg->mutex);
+
+ purge_sys.rseg->last_page_no = prev_log_addr.page;
+ purge_sys.rseg->set_last_commit(prev_log_addr.boffset, trx_no);
+ purge_sys.rseg->needs_purge = log_hdr[TRX_UNDO_NEEDS_PURGE + 1] != 0;
+
+ /* Purge can also produce events; however, these are already ordered
+ in the rollback segment, and any user-generated event will be greater
+ than the events that purge produces, i.e. purge can never produce
+ events from an empty rollback segment. */
+
+ mutex_enter(&purge_sys.pq_mutex);
+
+ purge_sys.purge_queue.push(*purge_sys.rseg);
+
+ mutex_exit(&purge_sys.pq_mutex);
+
+ mutex_exit(&purge_sys.rseg->mutex);
+}
+
+/** Position the purge sys "iterator" on the undo record to use for purging. */
+static void trx_purge_read_undo_rec()
+{
+ uint16_t offset;
+ uint32_t page_no;
+ ib_uint64_t undo_no;
+
+ purge_sys.hdr_offset = purge_sys.rseg->last_offset();
+ page_no = purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
+
+ if (purge_sys.rseg->needs_purge) {
+ mtr_t mtr;
+ mtr.start();
+ buf_block_t* undo_page;
+ if (trx_undo_rec_t* undo_rec = trx_undo_get_first_rec(
+ *purge_sys.rseg->space, purge_sys.hdr_page_no,
+ purge_sys.hdr_offset, RW_S_LATCH,
+ undo_page, &mtr)) {
+
+ offset = page_offset(undo_rec);
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+ page_no = undo_page->page.id().page_no();
+ } else {
+ offset = 0;
+ undo_no = 0;
+ }
+
+ mtr.commit();
+ } else {
+ offset = 0;
+ undo_no = 0;
+ }
+
+ purge_sys.offset = offset;
+ purge_sys.page_no = page_no;
+ purge_sys.tail.undo_no = undo_no;
+
+ purge_sys.next_stored = true;
+}
+
+/***********************************************************************//**
+Chooses the next undo log to purge and updates the info in purge_sys. This
+function is used to initialize purge_sys when the next record to purge is
+not known, and also to update the purge system info on the next record when
+purge has handled the whole undo log for a transaction. */
+static
+void
+trx_purge_choose_next_log(void)
+/*===========================*/
+{
+ ut_ad(!purge_sys.next_stored);
+
+ if (purge_sys.rseg_iter.set_next()) {
+ trx_purge_read_undo_rec();
+ } else {
+ /* There is nothing to do yet. */
+ os_thread_yield();
+ }
+}
+
+/***********************************************************************//**
+Gets the next record to purge and updates the info in the purge system.
+@return copy of an undo log record or pointer to the dummy undo log record */
+static
+trx_undo_rec_t*
+trx_purge_get_next_rec(
+/*===================*/
+ ulint* n_pages_handled,/*!< in/out: number of UNDO pages
+ handled */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ mtr_t mtr;
+
+ ut_ad(purge_sys.next_stored);
+ ut_ad(purge_sys.tail.trx_no < purge_sys.low_limit_no());
+
+ const ulint space = purge_sys.rseg->space->id;
+ const uint32_t page_no = purge_sys.page_no;
+ const uint16_t offset = purge_sys.offset;
+
+ if (offset == 0) {
+ /* It is the dummy undo log record, which means that there is
+ no need to purge this undo log */
+
+ trx_purge_rseg_get_next_history_log(n_pages_handled);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ return(&trx_purge_dummy_rec);
+ }
+
+ mtr_start(&mtr);
+
+ buf_block_t* undo_page = trx_undo_page_get_s_latched(
+ page_id_t(space, page_no), &mtr);
+ buf_block_t* rec2_page = undo_page;
+
+ const trx_undo_rec_t* rec2 = trx_undo_page_get_next_rec(
+ undo_page, offset, purge_sys.hdr_page_no, purge_sys.hdr_offset);
+
+ if (rec2 == NULL) {
+ rec2 = trx_undo_get_next_rec(rec2_page, offset,
+ purge_sys.hdr_page_no,
+ purge_sys.hdr_offset, &mtr);
+ }
+
+ if (rec2 == NULL) {
+ mtr_commit(&mtr);
+
+ trx_purge_rseg_get_next_history_log(n_pages_handled);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(
+ page_id_t(space, page_no), &mtr);
+ } else {
+ purge_sys.offset = page_offset(rec2);
+ purge_sys.page_no = rec2_page->page.id().page_no();
+ purge_sys.tail.undo_no = trx_undo_rec_get_undo_no(rec2);
+
+ if (undo_page != rec2_page) {
+ /* We advance to a new page of the undo log: */
+ (*n_pages_handled)++;
+ }
+ }
+
+ trx_undo_rec_t* rec_copy = trx_undo_rec_copy(undo_page->frame + offset,
+ heap);
+
+ mtr_commit(&mtr);
+
+ return(rec_copy);
+}
+
+/********************************************************************//**
+Fetches the next undo log record from the history list to purge. It must be
+released with the corresponding release function.
+@return copy of an undo log record or pointer to trx_purge_dummy_rec,
+if the whole undo log can be skipped in purge; NULL if none left */
+static MY_ATTRIBUTE((warn_unused_result))
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+ roll_ptr_t* roll_ptr, /*!< out: roll pointer to undo record */
+ ulint* n_pages_handled,/*!< in/out: number of UNDO log pages
+ handled */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ if (!purge_sys.next_stored) {
+ trx_purge_choose_next_log();
+
+ if (!purge_sys.next_stored) {
+ DBUG_PRINT("ib_purge",
+ ("no logs left in the history list"));
+ return(NULL);
+ }
+ }
+
+ if (purge_sys.tail.trx_no >= purge_sys.low_limit_no()) {
+
+ return(NULL);
+ }
+
+ /* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n",
+ os_thread_get_curr_id(), iter->trx_no, iter->undo_no); */
+
+ *roll_ptr = trx_undo_build_roll_ptr(
+ /* row_purge_record_func() will later set
+ ROLL_PTR_INSERT_FLAG for TRX_UNDO_INSERT_REC */
+ false,
+ purge_sys.rseg->id,
+ purge_sys.page_no, purge_sys.offset);
+
+ /* The following call will advance the stored values of the
+ purge iterator. */
+
+ return(trx_purge_get_next_rec(n_pages_handled, heap));
+}
+
+/** Run a purge batch.
+@param n_purge_threads number of purge threads
+@return number of undo log pages handled in the batch */
+static
+ulint
+trx_purge_attach_undo_recs(ulint n_purge_threads)
+{
+ que_thr_t* thr;
+ ulint i;
+ ulint n_pages_handled = 0;
+ ulint n_thrs = UT_LIST_GET_LEN(purge_sys.query->thrs);
+
+ ut_a(n_purge_threads > 0);
+
+ purge_sys.head = purge_sys.tail;
+
+#ifdef UNIV_DEBUG
+ i = 0;
+ /* Debug code to validate some pre-requisites and reset done flag. */
+ for (thr = UT_LIST_GET_FIRST(purge_sys.query->thrs);
+ thr != NULL && i < n_purge_threads;
+ thr = UT_LIST_GET_NEXT(thrs, thr), ++i) {
+
+ purge_node_t* node;
+
+ /* Get the purge node. */
+ node = (purge_node_t*) thr->child;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+ ut_ad(node->undo_recs.empty());
+ ut_ad(!node->in_progress);
+ ut_d(node->in_progress = true);
+ }
+
+ /* There should never be fewer nodes than threads; the inverse,
+ however, is allowed because we only use purge threads as needed. */
+ ut_ad(i == n_purge_threads);
+#endif
+
+ /* Fetch and parse the UNDO records. The UNDO records are added
+ to a per purge node vector. */
+ thr = UT_LIST_GET_FIRST(purge_sys.query->thrs);
+ ut_a(n_thrs > 0 && thr != NULL);
+
+ ut_ad(purge_sys.head <= purge_sys.tail);
+
+ i = 0;
+
+ const ulint batch_size = srv_purge_batch_size;
+ std::unordered_map<table_id_t, purge_node_t*> table_id_map;
+ mem_heap_empty(purge_sys.heap);
+
+ while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) {
+ purge_node_t* node;
+ trx_purge_rec_t purge_rec;
+
+ ut_a(!thr->is_active);
+
+ /* Get the purge node. */
+ node = (purge_node_t*) thr->child;
+ ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
+
+ /* Track the max {trx_id, undo_no} for truncating the
+ UNDO logs once we have purged the records. */
+
+ if (purge_sys.head <= purge_sys.tail) {
+ purge_sys.head = purge_sys.tail;
+ }
+
+ /* Fetch the next record, and advance the purge_sys.tail. */
+ purge_rec.undo_rec = trx_purge_fetch_next_rec(
+ &purge_rec.roll_ptr, &n_pages_handled,
+ purge_sys.heap);
+
+ if (purge_rec.undo_rec == NULL) {
+ break;
+ } else if (purge_rec.undo_rec == &trx_purge_dummy_rec) {
+ continue;
+ }
+
+ table_id_t table_id = trx_undo_rec_get_table_id(
+ purge_rec.undo_rec);
+
+ purge_node_t *& table_node = table_id_map[table_id];
+
+ if (table_node) {
+ node = table_node;
+ } else {
+ thr = UT_LIST_GET_NEXT(thrs, thr);
+
+ if (!(++i % n_purge_threads)) {
+ thr = UT_LIST_GET_FIRST(
+ purge_sys.query->thrs);
+ }
+
+ ut_a(thr != NULL);
+ table_node = node;
+ }
+
+ node->undo_recs.push(purge_rec);
+
+ if (n_pages_handled >= batch_size) {
+ break;
+ }
+ }
+
+ ut_ad(purge_sys.head <= purge_sys.tail);
+
+ return(n_pages_handled);
+}
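+/* Undo records are routed by table above: table_id_map pins each
+table_id to the purge node that first received a record for it, so all
+records of one table are handled by a single task and applied in order,
+while records of distinct tables are distributed round-robin over the
+available purge threads. */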
+
+/*******************************************************************//**
+Calculate the DML delay required.
+@return delay in microseconds or ULINT_MAX */
+static
+ulint
+trx_purge_dml_delay(void)
+/*=====================*/
+{
+ /* Determine how long data manipulation language (DML) statements
+ need to be delayed in order to reduce the lag of the purge
+ thread. */
+ ulint delay = 0; /* in microseconds; default: no delay */
+
+ /* If purge lag is set then calculate the new DML delay. */
+
+ if (srv_max_purge_lag > 0) {
+ double ratio = static_cast<double>(trx_sys.rseg_history_len) /
+ static_cast<double>(srv_max_purge_lag);
+
+ if (ratio > 1.0) {
+ /* If the history list length exceeds the
+ srv_max_purge_lag, the data manipulation
+ statements are delayed by at least 5000
+ microseconds. */
+ delay = (ulint) ((ratio - .5) * 10000);
+ }
+
+ if (delay > srv_max_purge_lag_delay) {
+ delay = srv_max_purge_lag_delay;
+ }
+
+ MONITOR_SET(MONITOR_DML_PURGE_DELAY, delay);
+ }
+
+ return(delay);
+}
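+/* A worked example of the formula above (illustrative numbers): with
+srv_max_purge_lag = 1000000 and a history list length of 2000000,
+ratio = 2.0 and delay = (2.0 - 0.5) * 10000 = 15000 microseconds,
+subject to the srv_max_purge_lag_delay cap. At a ratio just above 1.0,
+the delay starts near (1.0 - 0.5) * 10000 = 5000 microseconds, which is
+the minimum mentioned in the comment inside the function. */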
+
+extern tpool::waitable_task purge_worker_task;
+
+/** Wait for pending purge jobs to complete. */
+static void trx_purge_wait_for_workers_to_complete()
+{
+ bool notify_wait = purge_worker_task.is_running();
+
+ if (notify_wait)
+ tpool::tpool_wait_begin();
+
+ purge_worker_task.wait();
+
+ if (notify_wait)
+ tpool::tpool_wait_end();
+
+ /* There should be no outstanding tasks as long
+ as the worker threads are active. */
+ ut_ad(srv_get_task_queue_length() == 0);
+}
+
+/**
+Run a purge batch.
+@param n_tasks number of purge tasks to submit to the queue
+@param truncate whether to truncate the history at the end of the batch
+@return number of undo log pages handled in the batch */
+ulint trx_purge(ulint n_tasks, bool truncate)
+{
+ que_thr_t* thr = NULL;
+ ulint n_pages_handled;
+
+ ut_ad(n_tasks > 0);
+
+ srv_dml_needed_delay = trx_purge_dml_delay();
+
+ purge_sys.clone_oldest_view();
+
+#ifdef UNIV_DEBUG
+ if (srv_purge_view_update_only_debug) {
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Fetch the UNDO recs that need to be purged. */
+ n_pages_handled = trx_purge_attach_undo_recs(n_tasks);
+
+ /* Submit tasks to the worker queue if using multi-threaded purge. */
+ for (ulint i = n_tasks; --i; ) {
+ thr = que_fork_scheduler_round_robin(purge_sys.query, thr);
+ ut_a(thr);
+ srv_que_task_enqueue_low(thr);
+ srv_thread_pool->submit_task(&purge_worker_task);
+ }
+
+ thr = que_fork_scheduler_round_robin(purge_sys.query, thr);
+
+ que_run_threads(thr);
+
+ trx_purge_wait_for_workers_to_complete();
+
+ if (truncate) {
+ trx_purge_truncate_history();
+ }
+
+ MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1);
+ MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages_handled);
+
+ return(n_pages_handled);
+}
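+/* Note on the fan-out in trx_purge(): of the n_tasks purge tasks,
+n_tasks - 1 are submitted to the thread pool (the loop runs while --i
+is nonzero) and the final one is executed synchronously in the
+coordinator via que_run_threads(), so a single-task purge never touches
+the worker queue at all. */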
diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc
new file mode 100644
index 00000000..438dfcf9
--- /dev/null
+++ b/storage/innobase/trx/trx0rec.cc
@@ -0,0 +1,2559 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rec.cc
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rec.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "ut0mem.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "trx0rseg.h"
+#include "row0row.h"
+#include "row0mysql.h"
+
+/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA. */
+const dtuple_t trx_undo_metadata = {
+ /* This also works for REC_INFO_METADATA_ALTER, because the
+ delete-mark (REC_INFO_DELETED_FLAG) is ignored when searching. */
+ REC_INFO_METADATA_ADD, 0, 0,
+ NULL, 0, NULL
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif /* UNIV_DEBUG */
+};
+
+/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/
+
+/** Calculate the free space left for extending an undo log record.
+@param undo_block undo log page
+@param ptr current end of the undo page
+@return bytes left */
+static ulint trx_undo_left(const buf_block_t *undo_block, const byte *ptr)
+{
+ ut_ad(ptr >= &undo_block->frame[TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE]);
+ /* The 10 is supposed to be an extra safety margin (and needed for
+ compatibility with older versions) */
+ lint left= srv_page_size - (ptr - undo_block->frame) -
+ (10 + FIL_PAGE_DATA_END);
+ ut_ad(left >= 0);
+ return left < 0 ? 0 : static_cast<ulint>(left);
+}
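+/* For example (assuming the default srv_page_size of 16384 and
+FIL_PAGE_DATA_END of 8 bytes): a record ending at page offset 12000
+leaves 16384 - 12000 - (10 + 8) = 4366 bytes for further undo records
+on the page. */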
+
+/**********************************************************************//**
+Set the next and previous pointers in the undo page for the undo record
+that was written to ptr. Update the first free value by the number of bytes
+written for this undo record.
+@return offset of the inserted entry on the page if succeeded, 0 if fail */
+static
+uint16_t
+trx_undo_page_set_next_prev_and_add(
+/*================================*/
+ buf_block_t* undo_block, /*!< in/out: undo log page */
+ byte* ptr, /*!< in: ptr up to where data has been
+ written on this undo page. */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(page_align(ptr) == undo_block->frame);
+
+ if (UNIV_UNLIKELY(trx_undo_left(undo_block, ptr) < 2))
+ return 0;
+
+ byte *ptr_to_first_free= my_assume_aligned<2>(TRX_UNDO_PAGE_HDR +
+ TRX_UNDO_PAGE_FREE +
+ undo_block->frame);
+
+ const uint16_t first_free= mach_read_from_2(ptr_to_first_free);
+
+ /* Write offset of the previous undo log record */
+ memcpy(ptr, ptr_to_first_free, 2);
+ ptr += 2;
+
+ const uint16_t end_of_rec= static_cast<uint16_t>(ptr - undo_block->frame);
+
+ /* Update the offset to first free undo record */
+ mach_write_to_2(ptr_to_first_free, end_of_rec);
+ /* Write offset of the next undo log record */
+ memcpy(undo_block->frame + first_free, ptr_to_first_free, 2);
+ const byte *start= undo_block->frame + first_free + 2;
+
+ mtr->undo_append(*undo_block, start, ptr - start - 2);
+ return first_free;
+}
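+/* After this function, each complete undo record on the page is
+bracketed by two 16-bit offsets: the 2 bytes at the start of the record
+(reserved by the caller) now point just past its end, i.e. at the next
+record, and the 2 bytes appended at the end hold the record's own start
+offset, which the following record will see as its "previous record"
+pointer. The records on a page can thus be walked in both directions. */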
+
+/** Virtual column undo log version. To distinguish it from a length value
+in 5.7.8 undo log, it starts with 0xF1 */
+static const ulint VIRTUAL_COL_UNDO_FORMAT_1 = 0xF1;
+
+/** Write virtual column index info (index id and column position in index)
+to the undo log
+@param[in,out] undo_block undo log page
+@param[in] table the table
+@param[in] pos the virtual column position
+@param[in] ptr undo log record being written
+@param[in] first_v_col whether this is the first virtual column
+ which could start with a version marker
+@return new undo log pointer */
+static
+byte*
+trx_undo_log_v_idx(
+ buf_block_t* undo_block,
+ const dict_table_t* table,
+ ulint pos,
+ byte* ptr,
+ bool first_v_col)
+{
+ ut_ad(pos < table->n_v_def);
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(table, pos);
+ byte* old_ptr;
+
+ ut_ad(!vcol->v_indexes.empty());
+
+ ulint size = first_v_col ? 1 + 2 : 2;
+ const ulint avail = trx_undo_left(undo_block, ptr);
+
+ /* The mach_write_compressed(ptr, flen) in
+ trx_undo_page_report_modify() will consume additional 1 to 5 bytes. */
+ if (avail < size + 5) {
+ return(NULL);
+ }
+
+ ulint n_idx = 0;
+ for (const auto& v_index : vcol->v_indexes) {
+ n_idx++;
+ /* FIXME: index->id is 64 bits! */
+ size += mach_get_compressed_size(uint32_t(v_index.index->id));
+ size += mach_get_compressed_size(v_index.nth_field);
+ }
+
+ size += mach_get_compressed_size(n_idx);
+
+ if (avail < size + 5) {
+ return(NULL);
+ }
+
+ ut_d(const byte* orig_ptr = ptr);
+
+ if (first_v_col) {
+ /* write the version marker */
+ mach_write_to_1(ptr, VIRTUAL_COL_UNDO_FORMAT_1);
+
+ ptr += 1;
+ }
+
+ old_ptr = ptr;
+
+ ptr += 2;
+
+ ptr += mach_write_compressed(ptr, n_idx);
+
+ for (const auto& v_index : vcol->v_indexes) {
+ ptr += mach_write_compressed(
+ /* FIXME: index->id is 64 bits! */
+ ptr, uint32_t(v_index.index->id));
+
+ ptr += mach_write_compressed(ptr, v_index.nth_field);
+ }
+
+ ut_ad(orig_ptr + size == ptr);
+
+ mach_write_to_2(old_ptr, ulint(ptr - old_ptr));
+
+ return(ptr);
+}
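+/* The bytes emitted above form, in order (a sketch of the layout):
+	[0xF1]			version marker, first virtual column only
+	[2-byte total length]	filled in last through old_ptr
+	[n_idx]			compressed count of indexes on this column
+	n_idx x ( [index id, compressed] [position in index, compressed] )
+The stored total length allows trx_undo_read_v_idx_low() to skip the
+whole entry even when none of the recorded indexes exists any more. */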
+
+/** Read virtual column index from the undo log, verify that the column is
+still indexed, and return its position
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[out] col_pos the column number or FIL_NULL
+ if the column is not indexed any more
+@return remaining part of undo log record after reading these values */
+static
+const byte*
+trx_undo_read_v_idx_low(
+ const dict_table_t* table,
+ const byte* ptr,
+ uint32_t* col_pos)
+{
+ ulint len = mach_read_from_2(ptr);
+ const byte* old_ptr = ptr;
+
+ *col_pos = FIL_NULL;
+
+ ptr += 2;
+
+ ulint num_idx = mach_read_next_compressed(&ptr);
+
+ ut_ad(num_idx > 0);
+
+ dict_index_t* clust_index = dict_table_get_first_index(table);
+
+ for (ulint i = 0; i < num_idx; i++) {
+ index_id_t id = mach_read_next_compressed(&ptr);
+ ulint pos = mach_read_next_compressed(&ptr);
+ dict_index_t* index = dict_table_get_next_index(clust_index);
+
+ while (index != NULL) {
+ /* Return if we find a matching index.
+ TODO: in the future, it might be worth adding
+ checks on other indexes */
+ if (index->id == id) {
+ const dict_col_t* col = dict_index_get_nth_col(
+ index, pos);
+ ut_ad(col->is_virtual());
+ const dict_v_col_t* vcol = reinterpret_cast<
+ const dict_v_col_t*>(col);
+ *col_pos = vcol->v_pos;
+ return(old_ptr + len);
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+ }
+
+ return(old_ptr + len);
+}
+
+/** Read virtual column index from undo log or online log if the log
+contains such info, and in the undo log case, verify the column is
+still indexed, and output its position
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[in] first_v_col if this is the first virtual column, which
+ has the version marker
+@param[in,out] is_undo_log this function is used to parse both the undo
+ log and the online log for virtual columns;
+ it checks whether this is the undo log. When
+ first_v_col is true, is_undo_log is an output;
+ when first_v_col is false, it is an input
+@param[out] field_no the column number, or FIL_NULL if not indexed
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_read_v_idx(
+ const dict_table_t* table,
+ const byte* ptr,
+ bool first_v_col,
+ bool* is_undo_log,
+ uint32_t* field_no)
+{
+ /* Version marker only put on the first virtual column */
+ if (first_v_col) {
+ /* Undo log has the virtual undo log marker */
+ *is_undo_log = (mach_read_from_1(ptr)
+ == VIRTUAL_COL_UNDO_FORMAT_1);
+
+ if (*is_undo_log) {
+ ptr += 1;
+ }
+ }
+
+ if (*is_undo_log) {
+ ptr = trx_undo_read_v_idx_low(table, ptr, field_no);
+ } else {
+ *field_no -= REC_MAX_N_FIELDS;
+ }
+
+ return(ptr);
+}
+
+/** Reports an insert of virtual columns in the undo log.
+@param[in] undo_block undo log page
+@param[in] table the table
+@param[in] row dtuple contains the virtual columns
+@param[in,out] ptr log ptr
+@return true if write goes well, false if out of space */
+static
+bool
+trx_undo_report_insert_virtual(
+ buf_block_t* undo_block,
+ dict_table_t* table,
+ const dtuple_t* row,
+ byte** ptr)
+{
+ byte* start = *ptr;
+ bool first_v_col = true;
+
+ if (trx_undo_left(undo_block, *ptr) < 2) {
+ return(false);
+ }
+
+ /* Reserve 2 bytes to write the number
+ of bytes the stored fields take in this
+ undo record */
+ *ptr += 2;
+
+ for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(table);
+ col_no++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+ if (col->m_col.ord_part) {
+
+ /* make sure there is enough space to write the length */
+ if (trx_undo_left(undo_block, *ptr) < 5) {
+ return(false);
+ }
+
+ ulint pos = col_no;
+ pos += REC_MAX_N_FIELDS;
+ *ptr += mach_write_compressed(*ptr, pos);
+
+ *ptr = trx_undo_log_v_idx(undo_block, table,
+ col_no, *ptr, first_v_col);
+ first_v_col = false;
+
+ if (*ptr == NULL) {
+ return(false);
+ }
+
+ const dfield_t* vfield = dtuple_get_nth_v_field(
+ row, col->v_pos);
+ switch (ulint flen = vfield->len) {
+ case 0: case UNIV_SQL_NULL:
+ if (trx_undo_left(undo_block, *ptr) < 5) {
+ return(false);
+ }
+
+ *ptr += mach_write_compressed(*ptr, flen);
+ break;
+ default:
+ ulint max_len
+ = dict_max_v_field_len_store_undo(
+ table, col_no);
+
+ if (flen > max_len) {
+ flen = max_len;
+ }
+
+ if (trx_undo_left(undo_block, *ptr)
+ < flen + 5) {
+ return(false);
+ }
+ *ptr += mach_write_compressed(*ptr, flen);
+
+ memcpy(*ptr, vfield->data, flen);
+ *ptr += flen;
+ }
+ }
+ }
+
+ /* Always mark the end of the log with a 2-byte length field */
+ mach_write_to_2(start, ulint(*ptr - start));
+
+ return(true);
+}
+
+/**********************************************************************//**
+Reports an insert of a clustered index record in the undo log.
+@return offset of the inserted entry on the page if succeeded, 0 if failed */
+static
+uint16_t
+trx_undo_page_report_insert(
+/*========================*/
+ buf_block_t* undo_block, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: index entry which will be
+ inserted to the clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(index->is_primary());
+ /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes
+ TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote
+ TRX_UNDO_INSERT == 1 into insert_undo pages,
+ or TRX_UNDO_UPDATE == 2 into update_undo pages. */
+ ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
+ + undo_block->frame) <= 2);
+
+ uint16_t first_free = mach_read_from_2(my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + undo_block->frame));
+ byte* ptr = undo_block->frame + first_free;
+
+ if (trx_undo_left(undo_block, ptr) < 2 + 1 + 11 + 11) {
+ /* Not enough space for writing the general parameters */
+ return(0);
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ /* Store first some general parameters to the undo log */
+ *ptr++ = TRX_UNDO_INSERT_REC;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+ ptr += mach_u64_write_much_compressed(ptr, index->table->id);
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the record
+ to be inserted in the clustered index */
+ if (UNIV_UNLIKELY(clust_entry->info_bits != 0)) {
+ ut_ad(clust_entry->is_metadata());
+ ut_ad(index->is_instant());
+ ut_ad(undo_block->frame[first_free + 2]
+ == TRX_UNDO_INSERT_REC);
+ undo_block->frame[first_free + 2] = TRX_UNDO_INSERT_METADATA;
+ goto done;
+ }
+
+ for (unsigned i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ const dfield_t* field = dtuple_get_nth_field(clust_entry, i);
+ ulint flen = dfield_get_len(field);
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ switch (flen) {
+ case 0: case UNIV_SQL_NULL:
+ break;
+ default:
+ if (trx_undo_left(undo_block, ptr) < flen) {
+
+ return(0);
+ }
+
+ memcpy(ptr, dfield_get_data(field), flen);
+ ptr += flen;
+ }
+ }
+
+ if (index->table->n_v_cols) {
+ if (!trx_undo_report_insert_virtual(
+ undo_block, index->table, clust_entry, &ptr)) {
+ return(0);
+ }
+ }
+
+done:
+ return(trx_undo_page_set_next_prev_and_add(undo_block, ptr, mtr));
+}
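+/* The resulting insert undo record layout is, in sketch form:
+	[2-byte next-record offset]
+	[1-byte type: TRX_UNDO_INSERT_REC or TRX_UNDO_INSERT_METADATA]
+	[undo_no, much-compressed] [table id, much-compressed]
+	for each unique column: [length, compressed] [data]
+	[optional virtual-column block, 2-byte length first]
+	[2-byte previous-record offset, appended by
+	trx_undo_page_set_next_prev_and_add()] */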
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ ulint* type, /*!< out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ ulint* cmpl_info, /*!< out: compiler info, relevant only
+ for update type records */
+ bool* updated_extern, /*!< out: true if we updated an
+ externally stored field */
+ undo_no_t* undo_no, /*!< out: undo log record number */
+ table_id_t* table_id) /*!< out: table id */
+{
+ const byte* ptr;
+ ulint type_cmpl;
+
+ ptr = undo_rec + 2;
+
+ type_cmpl = mach_read_from_1(ptr);
+ ptr++;
+
+ *updated_extern = !!(type_cmpl & TRX_UNDO_UPD_EXTERN);
+ type_cmpl &= ~TRX_UNDO_UPD_EXTERN;
+ *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
+ ut_ad(*type >= TRX_UNDO_RENAME_TABLE);
+ ut_ad(*type <= TRX_UNDO_DEL_MARK_REC);
+ *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT;
+
+ *undo_no = mach_read_next_much_compressed(&ptr);
+ *table_id = mach_read_next_much_compressed(&ptr);
+ ut_ad(*table_id);
+
+ return(const_cast<byte*>(ptr));
+}
+
+/** Read from an undo log record a non-virtual column value.
+@param[in,out] ptr pointer to remaining part of the undo record
+@param[in,out] field stored field
+@param[in,out] len length of the field, or UNIV_SQL_NULL
+@param[in,out] orig_len original length of the locally stored part
+of an externally stored column, or 0
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_rec_get_col_val(
+ const byte* ptr,
+ const byte** field,
+ uint32_t* len,
+ uint32_t* orig_len)
+{
+ *len = mach_read_next_compressed(&ptr);
+ *orig_len = 0;
+
+ switch (*len) {
+ case UNIV_SQL_NULL:
+ *field = NULL;
+ break;
+ case UNIV_EXTERN_STORAGE_FIELD:
+ *orig_len = mach_read_next_compressed(&ptr);
+ *len = mach_read_next_compressed(&ptr);
+ *field = ptr;
+ ptr += *len & ~SPATIAL_STATUS_MASK;
+
+ ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_ad(*len > *orig_len);
+ /* @see dtuple_convert_big_rec() */
+ ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ /* we do not have access to index->table here
+ ut_ad(dict_table_has_atomic_blobs(index->table)
+ || *len >= col->max_prefix
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ */
+
+ *len += UNIV_EXTERN_STORAGE_FIELD;
+ break;
+ default:
+ *field = ptr;
+ if (*len >= UNIV_EXTERN_STORAGE_FIELD) {
+ ptr += (*len - UNIV_EXTERN_STORAGE_FIELD)
+ & ~SPATIAL_STATUS_MASK;
+ } else {
+ ptr += *len;
+ }
+ }
+
+ return(const_cast<byte*>(ptr));
+}
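+/* Length encoding used above, in sketch form: UNIV_SQL_NULL denotes a
+NULL value; a stored UNIV_EXTERN_STORAGE_FIELD introduces an
+[orig_len][len] pair for the prefix of an externally stored column, and
+the returned *len is offset by UNIV_EXTERN_STORAGE_FIELD so that
+callers can distinguish this case; any other value is a plain inline
+length, possibly carrying spatial status flags in its top bits
+(SPATIAL_STATUS_MASK). */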
+
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ byte* ptr, /*!< in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t**ref, /*!< out, own: row reference */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(index && ptr && ref && heap);
+ ut_a(dict_index_is_clust(index));
+
+ ref_len = dict_index_get_n_unique(index);
+
+ dtuple_t* tuple = dtuple_create(heap, ref_len);
+ *ref = tuple;
+
+ dict_index_copy_types(tuple, index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ const byte* field;
+ uint32_t len, orig_len;
+
+ dfield_t* dfield = dtuple_get_nth_field(tuple, i);
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Skips a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+static
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, at the start of the row reference */
+ dict_index_t* index) /*!< in: clustered index */
+{
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(index && ptr);
+ ut_a(dict_index_is_clust(index));
+
+ ref_len = dict_index_get_n_unique(index);
+
+ for (i = 0; i < ref_len; i++) {
+ const byte* field;
+ uint32_t len, orig_len;
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+ }
+
+ return(ptr);
+}
+
+/** Fetch a prefix of an externally stored column, for writing to the undo
+log of an update or delete marking of a clustered index record.
+@param[out] ext_buf buffer to hold the prefix data and BLOB pointer
+@param[in] prefix_len prefix size to store in the undo log
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] field an externally stored column
+@param[in,out] len input: length of field; output: used length of
+ext_buf
+@return ext_buf */
+static
+byte*
+trx_undo_page_fetch_ext(
+ byte* ext_buf,
+ ulint prefix_len,
+ ulint zip_size,
+ const byte* field,
+ ulint* len)
+{
+ /* Fetch the BLOB. */
+ ulint ext_len = btr_copy_externally_stored_field_prefix(
+ ext_buf, prefix_len, zip_size, field, *len);
+ /* BLOBs should always be nonempty. */
+ ut_a(ext_len);
+ /* Append the BLOB pointer to the prefix. */
+ memcpy(ext_buf + ext_len,
+ field + *len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ *len = ext_len + BTR_EXTERN_FIELD_REF_SIZE;
+ return(ext_buf);
+}
+
+/** Writes to the undo log a prefix of an externally stored column.
+@param[out] ptr undo log position, at least 15 bytes must be
+available
+@param[out] ext_buf a buffer of DICT_MAX_FIELD_LEN_BY_FORMAT()
+ size, or NULL when should not fetch a longer
+ prefix
+@param[in] prefix_len prefix size to store in the undo log
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] field the locally stored part of the externally
+stored column
+@param[in,out] len length of field, in bytes
+@param[in] spatial_status whether the column is used by spatial index or
+ regular index
+@return undo log position */
+static
+byte*
+trx_undo_page_report_modify_ext(
+ byte* ptr,
+ byte* ext_buf,
+ ulint prefix_len,
+ ulint zip_size,
+ const byte** field,
+ ulint* len,
+ spatial_status_t spatial_status)
+{
+ ulint spatial_len= 0;
+
+ switch (spatial_status) {
+ case SPATIAL_UNKNOWN:
+ case SPATIAL_NONE:
+ break;
+
+ case SPATIAL_MIXED:
+ case SPATIAL_ONLY:
+ spatial_len = DATA_MBR_LEN;
+ break;
+ }
+
+ /* Encode spatial status into length. */
+ spatial_len |= ulint(spatial_status) << SPATIAL_STATUS_SHIFT;
+
+ if (spatial_status == SPATIAL_ONLY) {
+ /* If the column is only used by a spatial index,
+ logging its MBR is enough. */
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD
+ + spatial_len);
+
+ return(ptr);
+ }
+
+ if (ext_buf) {
+ ut_a(prefix_len > 0);
+
+ /* If an ordering column is externally stored, we will
+ have to store a longer prefix of the field. In this
+ case, write to the log a marker followed by the
+ original length and the real length of the field. */
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD);
+
+ ptr += mach_write_compressed(ptr, *len);
+
+ *field = trx_undo_page_fetch_ext(ext_buf, prefix_len,
+ zip_size, *field, len);
+
+ ptr += mach_write_compressed(ptr, *len + spatial_len);
+ } else {
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD
+ + *len + spatial_len);
+ }
+
+ return(ptr);
+}
+
+/** Get MBR from a Geometry column stored externally
+@param[out] mbr MBR to fill
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] field field contain the geometry data
+@param[in,out] len length of field, in bytes
+*/
+static
+void
+trx_undo_get_mbr_from_ext(
+/*======================*/
+ double* mbr,
+ ulint zip_size,
+ const byte* field,
+ ulint* len)
+{
+ uchar* dptr = NULL;
+ ulint dlen;
+ mem_heap_t* heap = mem_heap_create(100);
+
+ dptr = btr_copy_externally_stored_field(
+ &dlen, field, zip_size, *len, heap);
+
+ if (dlen <= GEO_DATA_HEADER_SIZE) {
+ for (uint i = 0; i < SPDIMS; ++i) {
+ mbr[i * 2] = DBL_MAX;
+ mbr[i * 2 + 1] = -DBL_MAX;
+ }
+ } else {
+ rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE,
+ static_cast<uint>(dlen
+ - GEO_DATA_HEADER_SIZE), SPDIMS, mbr);
+ }
+
+ mem_heap_free(heap);
+}
+
+/**********************************************************************//**
+Reports an update or delete marking of a clustered index record in the
+undo log.
+@return byte offset of the inserted undo log entry on the page if
+succeeded, 0 if failed */
+static
+uint16_t
+trx_undo_page_report_modify(
+/*========================*/
+ buf_block_t* undo_block, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index where update or
+ delete marking is done */
+ const rec_t* rec, /*!< in: clustered index record which
+ has NOT yet been modified */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector which tells the
+ columns to be updated; in the case of
+ a delete, this should be set to NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const dtuple_t* row, /*!< in: clustered index row contains
+ virtual column info */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(index->is_primary());
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes
+ TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote
+ TRX_UNDO_INSERT == 1 into insert_undo pages,
+ or TRX_UNDO_UPDATE == 2 into update_undo pages. */
+ ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
+ + undo_block->frame) <= 2);
+
+ byte* ptr_to_first_free = my_assume_aligned<2>(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + undo_block->frame);
+
+ const uint16_t first_free = mach_read_from_2(ptr_to_first_free);
+ byte *ptr = undo_block->frame + first_free;
+
+ if (trx_undo_left(undo_block, ptr) < 50) {
+ /* NOTE: the value 50 must be big enough so that the general
+ fields written below fit on the undo log page */
+ return 0;
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ dict_table_t* table = index->table;
+ const byte* field;
+ ulint flen;
+ ulint col_no;
+ ulint type_cmpl;
+ byte* type_cmpl_ptr;
+ ulint i;
+ trx_id_t trx_id;
+ ibool ignore_prefix = FALSE;
+ byte ext_buf[REC_VERSION_56_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE];
+ bool first_v_col = true;
+
+ /* Store first some general parameters to the undo log */
+
+ if (!update) {
+ ut_ad(!rec_is_delete_marked(rec, dict_table_is_comp(table)));
+ type_cmpl = TRX_UNDO_DEL_MARK_REC;
+ } else if (rec_is_delete_marked(rec, dict_table_is_comp(table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing update_undo log record. */
+ ut_ad(row_get_rec_trx_id(rec, index, offsets));
+
+ type_cmpl = TRX_UNDO_UPD_DEL_REC;
+ /* We are about to update a delete marked record.
+ We don't typically need the prefix in this case unless
+ the delete marking is done by the same transaction
+ (which we check below). */
+ ignore_prefix = TRUE;
+ } else {
+ type_cmpl = TRX_UNDO_UPD_EXIST_REC;
+ }
+
+ type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT;
+ type_cmpl_ptr = ptr;
+
+ *ptr++ = (byte) type_cmpl;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+
+ ptr += mach_u64_write_much_compressed(ptr, table->id);
+
+ /*----------------------------------------*/
+ /* Store the state of the info bits */
+
+ *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table));
+
+ /* Store the values of the system columns */
+ field = rec_get_nth_field(rec, offsets, index->db_trx_id(), &flen);
+ ut_ad(flen == DATA_TRX_ID_LEN);
+
+ trx_id = trx_read_trx_id(field);
+
+ /* If it is an update of a delete marked record, then we are
+ allowed to ignore blob prefixes if the delete marking was done
+ by some other trx as it must have committed by now for us to
+ allow an overwrite. */
+ if (trx_id == trx->id) {
+ ignore_prefix = false;
+ }
+ ptr += mach_u64_write_compressed(ptr, trx_id);
+
+ field = rec_get_nth_field(rec, offsets, index->db_roll_ptr(), &flen);
+ ut_ad(flen == DATA_ROLL_PTR_LEN);
+ ut_ad(memcmp(field, field_ref_zero, DATA_ROLL_PTR_LEN));
+
+ ptr += mach_u64_write_compressed(ptr, trx_read_roll_ptr(field));
+
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the
+ record which will be modified in the clustered index */
+
+ for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ /* The ordering columns must not be instant added columns. */
+ ut_ad(!rec_offs_nth_default(offsets, i));
+ field = rec_get_nth_field(rec, offsets, i, &flen);
+
+ /* The ordering columns must not be stored externally. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ ut_ad(dict_index_get_nth_col(index, i)->ord_part);
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr) < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+
+ /*----------------------------------------*/
+ /* Save to the undo log the old values of the columns to be updated. */
+
+ if (update) {
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ ulint n_updated = upd_get_n_fields(update);
+
+ /* If this is an online update while an inplace alter table
+ is in progress and the table has virtual columns, we need
+ to double-check whether any non-indexed columns are being
+ registered in the update vector, in case they will be indexed
+ in the new table */
+ if (dict_index_is_online_ddl(index) && table->n_v_cols > 0) {
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ upd_field_t* fld = upd_get_nth_field(
+ update, i);
+ ulint pos = fld->field_no;
+
+ /* These columns must not have an index
+ on them */
+ if (upd_fld_is_virtual_col(fld)
+ && dict_table_get_nth_v_col(
+ table, pos)->v_indexes.empty()) {
+ n_updated--;
+ }
+ }
+ }
+
+ i = 0;
+
+ if (UNIV_UNLIKELY(update->is_alter_metadata())) {
+ ut_ad(update->n_fields >= 1);
+ ut_ad(!upd_fld_is_virtual_col(&update->fields[0]));
+ ut_ad(update->fields[0].field_no
+ == index->first_user_field());
+ ut_ad(!dfield_is_ext(&update->fields[0].new_val));
+ ut_ad(!dfield_is_null(&update->fields[0].new_val));
+ /* The instant ADD COLUMN metadata record does not
+ contain the BLOB. Do not write anything for it. */
+ i = !rec_is_alter_metadata(rec, *index);
+ n_updated -= i;
+ }
+
+ ptr += mach_write_compressed(ptr, n_updated);
+
+ for (; i < upd_get_n_fields(update); i++) {
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return 0;
+ }
+
+ upd_field_t* fld = upd_get_nth_field(update, i);
+
+ bool is_virtual = upd_fld_is_virtual_col(fld);
+ ulint max_v_log_len = 0;
+
+ ulint pos = fld->field_no;
+ const dict_col_t* col = NULL;
+
+ if (is_virtual) {
+ /* Skip the non-indexed column during
+ an online alter table */
+ if (dict_index_is_online_ddl(index)
+ && dict_table_get_nth_v_col(
+ table, pos)->v_indexes.empty()) {
+ continue;
+ }
+
+ /* add REC_MAX_N_FIELDS to mark that this
+ is a virtual column */
+ ptr += mach_write_compressed(
+ ptr, pos + REC_MAX_N_FIELDS);
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return 0;
+ }
+
+ ut_ad(fld->field_no < table->n_v_def);
+
+ ptr = trx_undo_log_v_idx(undo_block, table,
+ fld->field_no, ptr,
+ first_v_col);
+ if (ptr == NULL) {
+ return(0);
+ }
+ first_v_col = false;
+
+ max_v_log_len
+ = dict_max_v_field_len_store_undo(
+ table, fld->field_no);
+
+ field = static_cast<byte*>(
+ fld->old_v_val->data);
+ flen = fld->old_v_val->len;
+
+ /* Only log sufficient bytes for index
+ record update */
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ goto store_len;
+ }
+
+ if (UNIV_UNLIKELY(update->is_metadata())) {
+ ut_ad(pos >= index->first_user_field());
+ ut_ad(rec_is_metadata(rec, *index));
+
+ if (rec_is_alter_metadata(rec, *index)) {
+ ut_ad(update->is_alter_metadata());
+
+ field = rec_offs_n_fields(offsets)
+ > pos
+ && !rec_offs_nth_default(
+ offsets, pos)
+ ? rec_get_nth_field(
+ rec, offsets,
+ pos, &flen)
+ : index->instant_field_value(
+ pos - 1, &flen);
+
+ if (pos == index->first_user_field()) {
+ ut_ad(rec_offs_nth_extern(
+ offsets, pos));
+ ut_ad(flen == FIELD_REF_SIZE);
+ goto write_field;
+ }
+ col = dict_index_get_nth_col(index,
+ pos - 1);
+ } else if (!update->is_alter_metadata()) {
+ goto get_field;
+ } else {
+ /* We are converting an ADD COLUMN
+ metadata record to an ALTER TABLE
+ metadata record, with BLOB. Subtract
+ the missing metadata BLOB field. */
+ ut_ad(pos > index->first_user_field());
+ --pos;
+ goto get_field;
+ }
+ } else {
+get_field:
+ col = dict_index_get_nth_col(index, pos);
+ field = rec_get_nth_cfield(
+ rec, index, offsets, pos, &flen);
+ }
+write_field:
+ /* Write field number to undo log */
+ ptr += mach_write_compressed(ptr, pos);
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return 0;
+ }
+
+ if (rec_offs_n_fields(offsets) > pos
+ && rec_offs_nth_extern(offsets, pos)) {
+ ut_ad(col || pos == index->first_user_field());
+ ut_ad(col || update->is_alter_metadata());
+ ut_ad(col
+ || rec_is_alter_metadata(rec, *index));
+ ulint prefix_len = col
+ ? dict_max_field_len_store_undo(
+ table, col)
+ : 0;
+
+ ut_ad(prefix_len + BTR_EXTERN_FIELD_REF_SIZE
+ <= sizeof ext_buf);
+
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ col
+ && col->ord_part
+ && !ignore_prefix
+ && flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ ? ext_buf : NULL, prefix_len,
+ table->space->zip_size(),
+ &field, &flen, SPATIAL_UNKNOWN);
+
+ *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN;
+ } else {
+store_len:
+ ptr += mach_write_compressed(ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr) < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+
+ /* Also record the new value for virtual column */
+ if (is_virtual) {
+ field = static_cast<byte*>(fld->new_val.data);
+ flen = fld->new_val.len;
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+ }
+
+ /* Reset first_v_col so that the virtual column undo
+ version marker is written again when we log all the indexed columns */
+ first_v_col = true;
+
+ /*----------------------------------------*/
+ /* In the case of a delete marking, and also in the case of an update
+ where any ordering field of any index changes, store the values of all
+ columns which occur as ordering fields in any index. This info is used
+ in the purge of old versions where we use it to build and search the
+ delete marked index records, to look if we can remove them from the
+ index tree. Note that starting from 4.0.14, externally stored fields
+ can also be ordering fields in some index. Starting from 5.2, we no longer
+ store REC_MAX_INDEX_COL_LEN first bytes to the undo log record,
+ but we can construct the column prefix fields in the index by
+ fetching the first page of the BLOB that is pointed to by the
+ clustered index. This works also in crash recovery, because all pages
+ (including BLOBs) are recovered before anything is rolled back. */
+
+ if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ byte* old_ptr = ptr;
+ double mbr[SPDIMS * 2];
+ mem_heap_t* row_heap = NULL;
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ /* Reserve 2 bytes to write the number of bytes the stored
+ fields take in this undo record */
+
+ ptr += 2;
+
+ for (col_no = 0; col_no < dict_table_get_n_cols(table);
+ col_no++) {
+
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, col_no);
+
+ if (!col->ord_part) {
+ continue;
+ }
+
+ const ulint pos = dict_index_get_nth_col_pos(
+ index, col_no, NULL);
+ /* All non-virtual columns must be present in
+ the clustered index. */
+ ut_ad(pos != ULINT_UNDEFINED);
+
+ const bool is_ext = rec_offs_nth_extern(offsets, pos);
+ const spatial_status_t spatial_status = is_ext
+ ? dict_col_get_spatial_status(col)
+ : SPATIAL_NONE;
+
+ switch (spatial_status) {
+ case SPATIAL_UNKNOWN:
+ ut_ad(0);
+ /* fall through */
+ case SPATIAL_MIXED:
+ case SPATIAL_ONLY:
+ /* Externally stored spatially indexed
+ columns will be (redundantly) logged
+ again, because we did not write the
+ MBR yet, that is, the previous call to
+ trx_undo_page_report_modify_ext()
+ was with SPATIAL_UNKNOWN. */
+ break;
+ case SPATIAL_NONE:
+ if (!update) {
+ /* This is a DELETE operation. */
+ break;
+ }
+ /* Avoid redundantly logging indexed
+ columns that were updated. */
+
+ for (i = 0; i < update->n_fields; i++) {
+ const ulint field_no
+ = upd_get_nth_field(update, i)
+ ->field_no;
+ if (field_no >= index->n_fields
+ || dict_index_get_nth_field(
+ index, field_no)->col
+ == col) {
+ goto already_logged;
+ }
+ }
+ }
+
+ if (true) {
+ /* Write field number to undo log */
+ if (trx_undo_left(undo_block, ptr) < 5 + 15) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, pos);
+
+ /* Save the old value of field */
+ field = rec_get_nth_cfield(
+ rec, index, offsets, pos, &flen);
+
+ if (is_ext) {
+ const dict_col_t* col =
+ dict_index_get_nth_col(
+ index, pos);
+ ulint prefix_len =
+ dict_max_field_len_store_undo(
+ table, col);
+
+ ut_a(prefix_len < sizeof ext_buf);
+ const ulint zip_size
+ = table->space->zip_size();
+
+ /* If there is a spatial index on it,
+ log its MBR */
+ if (spatial_status != SPATIAL_NONE) {
+ ut_ad(DATA_GEOMETRY_MTYPE(
+ col->mtype));
+
+ trx_undo_get_mbr_from_ext(
+ mbr, zip_size,
+ field, &flen);
+ }
+
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ && !ignore_prefix
+ ? ext_buf : NULL, prefix_len,
+ zip_size,
+ &field, &flen,
+ spatial_status);
+ } else {
+ ptr += mach_write_compressed(
+ ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL
+ && spatial_status != SPATIAL_ONLY) {
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+
+ if (spatial_status != SPATIAL_NONE) {
+ if (trx_undo_left(undo_block, ptr)
+ < DATA_MBR_LEN) {
+ return(0);
+ }
+
+ for (int i = 0; i < SPDIMS * 2;
+ i++) {
+ mach_double_write(
+ ptr, mbr[i]);
+ ptr += sizeof(double);
+ }
+ }
+ }
+
+already_logged:
+ continue;
+ }
+
+ for (col_no = 0; col_no < dict_table_get_n_v_cols(table);
+ col_no++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+ if (col->m_col.ord_part) {
+ ulint pos = col_no;
+ ulint max_v_log_len
+ = dict_max_v_field_len_store_undo(
+ table, pos);
+
+ /* Write field number to undo log.
+ Make sure there is enough space in the log */
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ pos += REC_MAX_N_FIELDS;
+ ptr += mach_write_compressed(ptr, pos);
+
+ ut_ad(col_no < table->n_v_def);
+ ptr = trx_undo_log_v_idx(undo_block, table,
+ col_no, ptr,
+ first_v_col);
+ first_v_col = false;
+
+ if (!ptr) {
+ return(0);
+ }
+
+ const dfield_t* vfield = NULL;
+
+ if (update) {
+ ut_ad(!row);
+ if (update->old_vrow == NULL) {
+ flen = UNIV_SQL_NULL;
+ } else {
+ vfield = dtuple_get_nth_v_field(
+ update->old_vrow,
+ col->v_pos);
+ }
+ } else if (row) {
+ vfield = dtuple_get_nth_v_field(
+ row, col->v_pos);
+ } else {
+ ut_ad(0);
+ }
+
+ if (vfield) {
+ field = static_cast<byte*>(vfield->data);
+ flen = vfield->len;
+ } else {
+ ut_ad(flen == UNIV_SQL_NULL);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ switch (flen) {
+ case 0: case UNIV_SQL_NULL:
+ break;
+ default:
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+
+ mach_write_to_2(old_ptr, ulint(ptr - old_ptr));
+
+ if (row_heap) {
+ mem_heap_free(row_heap);
+ }
+ }
+
+ /*----------------------------------------*/
+ /* Write pointers to the previous and the next undo log records */
+ if (trx_undo_left(undo_block, ptr) < 2) {
+ return(0);
+ }
+
+ mach_write_to_2(ptr, first_free);
+ const uint16_t new_free = static_cast<uint16_t>(
+ ptr + 2 - undo_block->frame);
+ mach_write_to_2(undo_block->frame + first_free, new_free);
+
+ mach_write_to_2(ptr_to_first_free, new_free);
+
+ const byte* start = &undo_block->frame[first_free + 2];
+ mtr->undo_append(*undo_block, start, ptr - start);
+ return(first_free);
+}
+
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ const byte* ptr, /*!< in: remaining part of undo
+ log record after reading
+ general parameters */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr, /*!< out: roll ptr */
+ byte* info_bits) /*!< out: info bits state */
+{
+ /* Read the state of the info bits */
+ *info_bits = *ptr++;
+
+ /* Read the values of the system columns */
+
+ *trx_id = mach_u64_read_next_compressed(&ptr);
+ *roll_ptr = mach_u64_read_next_compressed(&ptr);
+
+ return(const_cast<byte*>(ptr));
+}
+
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, or NULL if an error is detected,
+which means that the record is corrupted */
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ const byte* ptr, /*!< in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+ trx_id_t trx_id, /*!< in: transaction id from this undo record */
+ roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */
+ byte info_bits,/*!< in: info bits from this undo record */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd) /*!< out, own: update vector */
+{
+ upd_field_t* upd_field;
+ upd_t* update;
+ ulint n_fields;
+ byte* buf;
+ bool first_v_col = true;
+ bool is_undo_log = true;
+ ulint n_skip_field = 0;
+
+ ut_a(dict_index_is_clust(index));
+
+ if (type != TRX_UNDO_DEL_MARK_REC) {
+ n_fields = mach_read_next_compressed(&ptr);
+ } else {
+ n_fields = 0;
+ }
+
+ *upd = update = upd_create(n_fields + 2, heap);
+
+ update->info_bits = info_bits;
+
+ /* Store first trx id and roll ptr to update vector */
+
+ upd_field = upd_get_nth_field(update, n_fields);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_TRX_ID_LEN));
+
+ mach_write_to_6(buf, trx_id);
+
+ upd_field_set_field_no(upd_field, index->db_trx_id(), index);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN);
+
+ upd_field = upd_get_nth_field(update, n_fields + 1);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_ROLL_PTR_LEN));
+
+ trx_write_roll_ptr(buf, roll_ptr);
+
+ upd_field_set_field_no(upd_field, index->db_roll_ptr(), index);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN);
+
+ /* Store then the updated ordinary columns to the update vector */
+
+ for (ulint i = 0; i < n_fields; i++) {
+ const byte* field;
+ uint32_t len, orig_len;
+
+ upd_field = upd_get_nth_field(update, i);
+ uint32_t field_no = mach_read_next_compressed(&ptr);
+
+ const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_virtual) {
+			/* If this is a new-version record, we need to
+			check the index list to figure out the correct
+			virtual column position */
+ ptr = trx_undo_read_v_idx(
+ index->table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+			/* This column could have been dropped or may
+			no longer be indexed */
+ if (field_no >= index->n_fields) {
+				/* Mark this field as no longer needed */
+ upd_field->field_no = REC_MAX_N_FIELDS;
+
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ n_skip_field++;
+ continue;
+ }
+
+ upd_field_set_v_field_no(
+ upd_field, static_cast<uint16_t>(field_no),
+ index);
+ } else if (UNIV_UNLIKELY((update->info_bits
+ & ~REC_INFO_DELETED_FLAG)
+ == REC_INFO_MIN_REC_FLAG)) {
+ ut_ad(type == TRX_UNDO_UPD_EXIST_REC);
+ const uint32_t uf = index->first_user_field();
+ ut_ad(field_no >= uf);
+
+ if (update->info_bits != REC_INFO_MIN_REC_FLAG) {
+ /* Generic instant ALTER TABLE */
+ if (field_no == uf) {
+ upd_field->new_val.type
+ .metadata_blob_init();
+ } else if (field_no >= index->n_fields) {
+ /* This is reachable during
+ purge if the table was emptied
+ and converted to the canonical
+ format on a later ALTER TABLE.
+ In this case,
+ row_purge_upd_exist_or_extern()
+ would only be interested in
+ freeing any BLOBs that were
+ updated, that is, the metadata
+ BLOB above. Other BLOBs in
+ the metadata record are never
+ updated; they are for the
+ initial DEFAULT values of the
+ instantly added columns, and
+ they will never change.
+
+ Note: if the table becomes
+ empty during ROLLBACK or is
+ empty during subsequent ALTER
+ TABLE, and btr_page_empty() is
+ called to re-create the root
+ page without the metadata
+ record, in that case we should
+ only free the latest version
+ of BLOBs in the record,
+ which purge would never touch. */
+ field_no = REC_MAX_N_FIELDS;
+ n_skip_field++;
+ } else {
+ dict_col_copy_type(
+ dict_index_get_nth_col(
+ index, field_no - 1),
+ &upd_field->new_val.type);
+ }
+ } else {
+ /* Instant ADD COLUMN...LAST */
+ dict_col_copy_type(
+ dict_index_get_nth_col(index,
+ field_no),
+ &upd_field->new_val.type);
+ }
+ upd_field->field_no = field_no
+ & dict_index_t::MAX_N_FIELDS;
+ } else if (field_no < index->n_fields) {
+ upd_field_set_field_no(upd_field,
+ static_cast<uint16_t>(field_no),
+ index);
+ } else {
+ ib::error() << "Trying to access update undo rec"
+ " field " << field_no
+ << " in index " << index->name
+ << " of table " << index->table->name
+ << " but index has only "
+ << dict_index_get_n_fields(index)
+ << " fields " << BUG_REPORT_MSG
+ << ". Run also CHECK TABLE "
+ << index->table->name << "."
+ " n_fields = " << n_fields << ", i = " << i;
+
+ ut_ad(0);
+ *upd = NULL;
+ return(NULL);
+ }
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ upd_field->orig_len = static_cast<uint16_t>(orig_len);
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_null(&upd_field->new_val);
+ } else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_data(&upd_field->new_val, field, len);
+ } else {
+ len -= UNIV_EXTERN_STORAGE_FIELD;
+
+ dfield_set_data(&upd_field->new_val, field, len);
+ dfield_set_ext(&upd_field->new_val);
+ }
+
+ ut_ad(update->info_bits != (REC_INFO_DELETED_FLAG
+ | REC_INFO_MIN_REC_FLAG)
+ || field_no != index->first_user_field()
+ || (upd_field->new_val.ext
+ && upd_field->new_val.len == FIELD_REF_SIZE));
+
+ if (is_virtual) {
+ upd_field->old_v_val = static_cast<dfield_t*>(
+ mem_heap_alloc(
+ heap, sizeof *upd_field->old_v_val));
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_null(upd_field->old_v_val);
+ } else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_data(
+ upd_field->old_v_val, field, len);
+ } else {
+ ut_ad(0);
+ }
+ }
+ }
+
+ /* We may have to skip dropped indexed virtual columns.
+ Also, we may have to trim the update vector of a metadata record
+ if dict_index_t::clear_instant_alter() was invoked on the table
+ later, and the number of fields no longer matches. */
+
+ if (n_skip_field) {
+ upd_field_t* d = upd_get_nth_field(update, 0);
+ const upd_field_t* const end = d + n_fields + 2;
+
+ for (const upd_field_t* s = d; s != end; s++) {
+ if (s->field_no != REC_MAX_N_FIELDS) {
+ *d++ = *s;
+ }
+ }
+
+ ut_ad(d + n_skip_field == end);
+ update->n_fields = d - upd_get_nth_field(update, 0);
+ }
+
+ return(const_cast<byte*>(ptr));
+}
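+
+/* An illustrative sketch (not part of the build) of the update vector
+built above: upd_create(n_fields + 2, heap) lays out the slots as
+
+	slots 0 .. n_fields - 1	the updated user columns, in undo order
+	slot  n_fields		DB_TRX_ID (6 bytes)
+	slot  n_fields + 1	DB_ROLL_PTR (7 bytes)
+
+and update->n_fields may end up smaller than n_fields + 2 if dropped
+virtual columns or trimmed metadata fields were skipped above. */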
+
+/*******************************************************************//**
+Builds a partial row from an update undo log record, for purge.
+It contains the columns which occur as ordering in any index of the table.
+Any missing columns are indicated by col->mtype == DATA_MISSING.
+@return pointer to remaining part of undo record */
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+ const byte* ptr, /*!< in: remaining part in update undo log
+ record of a suitable type, at the start of
+ the stored index columns;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the partial row is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: updated columns */
+ dtuple_t** row, /*!< out, own: partial row */
+ ibool ignore_prefix, /*!< in: flag to indicate if we
+ expect blob prefixes in undo. Used
+ only in the assertion. */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ const byte* end_ptr;
+ bool first_v_col = true;
+ bool is_undo_log = true;
+
+ ut_ad(index->is_primary());
+
+ *row = dtuple_create_with_vcol(
+ heap, dict_table_get_n_cols(index->table),
+ dict_table_get_n_v_cols(index->table));
+
+ /* Mark all columns in the row uninitialized, so that
+ we can distinguish missing fields from fields that are SQL NULL. */
+ for (ulint i = 0; i < dict_table_get_n_cols(index->table); i++) {
+ dfield_get_type(dtuple_get_nth_field(*row, i))
+ ->mtype = DATA_MISSING;
+ }
+
+ dtuple_init_v_fld(*row);
+
+ for (const upd_field_t* uf = update->fields, * const ue
+ = update->fields + update->n_fields;
+ uf != ue; uf++) {
+ if (uf->old_v_val) {
+ continue;
+ }
+ const dict_col_t& c = *dict_index_get_nth_col(index,
+ uf->field_no);
+ if (!c.is_dropped()) {
+ *dtuple_get_nth_field(*row, c.ind) = uf->new_val;
+ }
+ }
+
+ end_ptr = ptr + mach_read_from_2(ptr);
+ ptr += 2;
+
+ while (ptr != end_ptr) {
+ dfield_t* dfield;
+ const byte* field;
+ uint32_t field_no;
+ const dict_col_t* col;
+ uint32_t len, orig_len;
+
+ field_no = mach_read_next_compressed(&ptr);
+
+ const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_virtual) {
+ ptr = trx_undo_read_v_idx(
+ index->table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+ }
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+		/* This column could have been dropped or may no longer
+		be indexed */
+ if (field_no == FIL_NULL) {
+ ut_ad(is_virtual);
+ continue;
+ }
+
+ if (is_virtual) {
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(
+ index->table, field_no);
+ col = &vcol->m_col;
+ dfield = dtuple_get_nth_v_field(*row, vcol->v_pos);
+ dict_col_copy_type(
+ &vcol->m_col,
+ dfield_get_type(dfield));
+ } else {
+ col = dict_index_get_nth_col(index, field_no);
+
+ if (col->is_dropped()) {
+ continue;
+ }
+
+ dfield = dtuple_get_nth_field(*row, col->ind);
+ ut_ad(dfield->type.mtype == DATA_MISSING
+ || dict_col_type_assert_equal(col,
+ &dfield->type));
+ ut_ad(dfield->type.mtype == DATA_MISSING
+ || dfield->len == len
+ || (len != UNIV_SQL_NULL
+ && len >= UNIV_EXTERN_STORAGE_FIELD));
+ dict_col_copy_type(col, dfield_get_type(dfield));
+ }
+
+ dfield_set_data(dfield, field, len);
+
+ if (len != UNIV_SQL_NULL
+ && len >= UNIV_EXTERN_STORAGE_FIELD) {
+ spatial_status_t spatial_status;
+
+ /* Decode spatial status. */
+ spatial_status = static_cast<spatial_status_t>(
+ (len & SPATIAL_STATUS_MASK)
+ >> SPATIAL_STATUS_SHIFT);
+ len &= ~SPATIAL_STATUS_MASK;
+
+			/* Keep compatibility with the 5.7.9 format. */
+ if (spatial_status == SPATIAL_UNKNOWN) {
+ spatial_status =
+ dict_col_get_spatial_status(col);
+ }
+
+ switch (spatial_status) {
+ case SPATIAL_ONLY:
+ ut_ad(len - UNIV_EXTERN_STORAGE_FIELD
+ == DATA_MBR_LEN);
+ dfield_set_len(
+ dfield,
+ len - UNIV_EXTERN_STORAGE_FIELD);
+ break;
+
+ case SPATIAL_MIXED:
+ dfield_set_len(
+ dfield,
+ len - UNIV_EXTERN_STORAGE_FIELD
+ - DATA_MBR_LEN);
+ break;
+
+ case SPATIAL_NONE:
+ dfield_set_len(
+ dfield,
+ len - UNIV_EXTERN_STORAGE_FIELD);
+ break;
+
+ case SPATIAL_UNKNOWN:
+ ut_ad(0);
+ break;
+ }
+
+ dfield_set_ext(dfield);
+ dfield_set_spatial_status(dfield, spatial_status);
+
+ /* If the prefix of this column is indexed,
+ ensure that enough prefix is stored in the
+ undo log record. */
+ if (!ignore_prefix && col->ord_part
+ && spatial_status != SPATIAL_ONLY) {
+ ut_a(dfield_get_len(dfield)
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_a(dict_table_has_atomic_blobs(index->table)
+ || dfield_get_len(dfield)
+ >= REC_ANTELOPE_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ return(const_cast<byte*>(ptr));
+}
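+
+/* An illustrative sketch (not part of the build): callers can tell a
+column that was absent from the undo record apart from one that is SQL
+NULL, because absent columns keep the DATA_MISSING sentinel set above:
+
+	const dfield_t*	f = dtuple_get_nth_field(row, i);
+	if (dfield_get_type(f)->mtype == DATA_MISSING) {
+		// the column was not stored in this undo record
+	} else if (dfield_is_null(f)) {
+		// the column was stored, and it is SQL NULL
+	}
+*/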
+
+/** Report a RENAME TABLE operation.
+@param[in,out] trx transaction
+@param[in] table table that is being renamed
+@param[in,out] block undo page
+@param[in,out] mtr mini-transaction
+@return byte offset of the undo log record
+@retval 0 in case of failure */
+static
+uint16_t
+trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table,
+ buf_block_t* block, mtr_t* mtr)
+{
+ byte* ptr_first_free = my_assume_aligned<2>(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + block->frame);
+ const uint16_t first_free = mach_read_from_2(ptr_first_free);
+ ut_ad(first_free >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ ut_ad(first_free <= srv_page_size - FIL_PAGE_DATA_END);
+ byte* const start = block->frame + first_free;
+ size_t len = strlen(table->name.m_name);
+ const size_t fixed = 2 + 1 + 11 + 11 + 2;
+ ut_ad(len <= NAME_LEN * 2 + 1);
+ /* The -10 is used in trx_undo_left() */
+	compile_time_assert((NAME_LEN + 1) * 2 + fixed
+ + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE
+ < UNIV_PAGE_SIZE_MIN - 10 - FIL_PAGE_DATA_END);
+
+ if (trx_undo_left(block, start) < fixed + len) {
+ ut_ad(first_free > TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_HDR_SIZE);
+ return 0;
+ }
+
+ byte* ptr = start + 2;
+ *ptr++ = TRX_UNDO_RENAME_TABLE;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+ ptr += mach_u64_write_much_compressed(ptr, table->id);
+ memcpy(ptr, table->name.m_name, len);
+ ptr += len;
+ mach_write_to_2(ptr, first_free);
+ mach_write_to_2(ptr_first_free, ptr + 2 - block->frame);
+ memcpy(start, ptr_first_free, 2);
+ mtr->undo_append(*block, start + 2, ptr - start - 2);
+ return first_free;
+}
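+
+/* An illustrative sketch (not part of the build) of the record that
+trx_undo_page_report_rename() appends at offset first_free:
+
+	2 bytes		offset of the next undo record (written last)
+	1 byte		TRX_UNDO_RENAME_TABLE
+	1..11 bytes	undo_no, much-compressed
+	1..11 bytes	table->id, much-compressed
+	len bytes	the old table name
+	2 bytes		offset of the start of this record
+
+This worst case is where the "fixed" size of 2 + 1 + 11 + 11 + 2 bytes
+above comes from. */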
+
+/** Report a RENAME TABLE operation.
+@param[in,out] trx transaction
+@param[in] table table that is being renamed
+@return DB_SUCCESS or error code */
+dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
+{
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id);
+ ut_ad(!table->is_temporary());
+
+ mtr_t mtr;
+ dberr_t err;
+ mtr.start();
+ if (buf_block_t* block = trx_undo_assign(trx, &err, &mtr)) {
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+ ut_ad(err == DB_SUCCESS);
+ ut_ad(undo);
+ for (ut_d(int loop_count = 0);;) {
+ ut_ad(loop_count++ < 2);
+ ut_ad(undo->last_page_no
+ == block->page.id().page_no());
+
+ if (uint16_t offset = trx_undo_page_report_rename(
+ trx, table, block, &mtr)) {
+ undo->top_page_no = undo->last_page_no;
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no++;
+ undo->guess_block = block;
+ ut_ad(!undo->empty());
+
+ err = DB_SUCCESS;
+ break;
+ } else {
+ mtr.commit();
+ mtr.start();
+ block = trx_undo_add_page(undo, &mtr);
+ if (!block) {
+ err = DB_OUT_OF_FILE_SPACE;
+ break;
+ }
+ }
+ }
+ }
+
+ mtr.commit();
+ return err;
+}
+
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */
+dberr_t
+trx_undo_report_row_operation(
+/*==========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: in the case of an insert,
+ index entry to insert into the
+ clustered index; in updates,
+ may contain a clustered index
+ record tuple that also contains
+ virtual columns of the table;
+ otherwise, NULL */
+ const upd_t* update, /*!< in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /*!< in: case of an update or delete
+ marking, the record in the clustered
+ index; NULL if insert */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+ roll_ptr_t* roll_ptr) /*!< out: DB_ROLL_PTR to the
+ undo log record */
+{
+ trx_t* trx;
+ mtr_t mtr;
+#ifdef UNIV_DEBUG
+ int loop_count = 0;
+#endif /* UNIV_DEBUG */
+
+ ut_a(dict_index_is_clust(index));
+ ut_ad(!update || rec);
+ ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+ ut_ad(!srv_read_only_mode);
+
+ trx = thr_get_trx(thr);
+ /* This function must not be invoked during rollback
+ (of a TRX_STATE_PREPARE transaction or otherwise). */
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(!trx->in_rollback);
+
+ mtr.start();
+ trx_undo_t** pundo;
+ trx_rseg_t* rseg;
+ const bool is_temp = index->table->is_temporary();
+
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ rseg = trx->get_temp_rseg();
+ pundo = &trx->rsegs.m_noredo.undo;
+ } else {
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id);
+ pundo = &trx->rsegs.m_redo.undo;
+ rseg = trx->rsegs.m_redo.rseg;
+ }
+
+ dberr_t err;
+ buf_block_t* undo_block = trx_undo_assign_low(trx, rseg, pundo,
+ &err, &mtr);
+ trx_undo_t* undo = *pundo;
+
+ ut_ad((err == DB_SUCCESS) == (undo_block != NULL));
+ if (UNIV_UNLIKELY(undo_block == NULL)) {
+ goto err_exit;
+ }
+
+ ut_ad(undo != NULL);
+
+ do {
+ uint16_t offset = !rec
+ ? trx_undo_page_report_insert(
+ undo_block, trx, index, clust_entry, &mtr)
+ : trx_undo_page_report_modify(
+ undo_block, trx, index, rec, offsets, update,
+ cmpl_info, clust_entry, &mtr);
+
+ if (UNIV_UNLIKELY(offset == 0)) {
+ const uint16_t first_free = mach_read_from_2(
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + undo_block->frame);
+ memset(undo_block->frame + first_free, 0,
+ (srv_page_size - FIL_PAGE_DATA_END)
+ - first_free);
+
+ if (first_free
+ == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) {
+ /* The record did not fit on an empty
+ undo page. Discard the freshly allocated
+ page and return an error. */
+
+ /* When we remove a page from an undo
+ log, this is analogous to a
+ pessimistic insert in a B-tree, and we
+ must reserve the counterpart of the
+ tree latch, which is the rseg
+ mutex. We must commit the mini-transaction
+ first, because it may be holding lower-level
+ latches, such as SYNC_FSP and SYNC_FSP_PAGE. */
+
+ mtr.commit();
+ mtr.start();
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ mutex_enter(&rseg->mutex);
+ trx_undo_free_last_page(undo, &mtr);
+ mutex_exit(&rseg->mutex);
+
+ err = DB_UNDO_RECORD_TOO_BIG;
+ goto err_exit;
+ } else {
+ /* Write log for clearing the unused
+ tail of the undo page. It might
+ contain some garbage from a previously
+ written record, and mtr_t::write()
+ will optimize away writes of unchanged
+ bytes. Failure to write this caused a
+ recovery failure when we avoided
+ reading the undo log page from the
+ data file and initialized it based on
+ redo log records (which included the
+ write of the previous garbage). */
+ mtr.memset(*undo_block, first_free,
+ srv_page_size - first_free
+ - FIL_PAGE_DATA_END, 0);
+ }
+
+ mtr.commit();
+ } else {
+ /* Success */
+ undo->top_page_no = undo_block->page.id().page_no();
+ mtr.commit();
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no++;
+ undo->guess_block = undo_block;
+ ut_ad(!undo->empty());
+
+ if (!is_temp) {
+ const undo_no_t limit = undo->top_undo_no;
+ /* Determine if this is the first time
+ when this transaction modifies a
+ system-versioned column in this table. */
+ trx_mod_table_time_t& time
+ = trx->mod_tables.insert(
+ trx_mod_tables_t::value_type(
+ index->table, limit))
+ .first->second;
+ ut_ad(time.valid(limit));
+
+ if (!time.is_versioned()
+ && index->table->versioned_by_id()
+ && (!rec /* INSERT */
+ || (update
+ && update->affects_versioned()))) {
+ time.set_versioned(limit);
+ }
+ }
+
+ *roll_ptr = trx_undo_build_roll_ptr(
+ !rec, rseg->id, undo->top_page_no, offset);
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(undo_block->page.id().page_no() == undo->last_page_no);
+
+ /* We have to extend the undo log by one page */
+
+ ut_ad(++loop_count < 2);
+ mtr.start();
+
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ undo_block = trx_undo_add_page(undo, &mtr);
+
+ DBUG_EXECUTE_IF("ib_err_ins_undo_page_add_failure",
+ undo_block = NULL;);
+ } while (UNIV_LIKELY(undo_block != NULL));
+
+ ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ DB_OUT_OF_FILE_SPACE,
+ //ER_INNODB_UNDO_LOG_FULL,
+		"No more space left in the %s tablespace for allocating UNDO"
+		" log pages. Please add a new data file to the tablespace or"
+		" check if the filesystem is full or enable auto-extension"
+		" for the tablespace",
+ undo->rseg->space == fil_system.sys_space
+ ? "system" : is_temp ? "temporary" : "undo");
+
+ /* Did not succeed: out of space */
+ err = DB_OUT_OF_FILE_SPACE;
+
+err_exit:
+ mtr_commit(&mtr);
+ return(err);
+}
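+
+/* An illustrative sketch (not part of the build): the DB_ROLL_PTR
+returned above packs four items into 7 bytes (56 bits), and
+trx_undo_decode_roll_ptr() below inverts the packing:
+
+	roll_ptr_t p = trx_undo_build_roll_ptr(!rec, rseg->id,
+					       undo->top_page_no, offset);
+	// 1 bit	insert flag (1 = fresh insert, no old version)
+	// 7 bits	rollback segment id
+	// 32 bits	undo log page number
+	// 16 bits	byte offset within the undo page
+*/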
+
+/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/
+
+/** Copy an undo record to heap.
+@param[in] roll_ptr roll pointer to a record that exists
+@param[in,out]	heap		memory heap where copied
+@return copy of the undo log record */
+static
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+ roll_ptr_t roll_ptr,
+ mem_heap_t* heap)
+{
+ trx_undo_rec_t* undo_rec;
+ ulint rseg_id;
+ uint32_t page_no;
+ uint16_t offset;
+ trx_rseg_t* rseg;
+ bool is_insert;
+ mtr_t mtr;
+
+ trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no,
+ &offset);
+ ut_ad(page_no > FSP_FIRST_INODE_PAGE_NO);
+ ut_ad(offset >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ rseg = trx_sys.rseg_array[rseg_id];
+ ut_ad(rseg->is_persistent());
+
+ mtr.start();
+
+ buf_block_t* undo_page = trx_undo_page_get_s_latched(
+ page_id_t(rseg->space->id, page_no), &mtr);
+
+ undo_rec = trx_undo_rec_copy(undo_page->frame + offset, heap);
+
+ mtr.commit();
+
+ return(undo_rec);
+}
+
+/** Copy an undo record to heap.
+@param[in] roll_ptr roll pointer to record
+@param[in,out] heap memory heap where copied
+@param[in] trx_id id of the trx that generated
+ the roll pointer: it points to an
+ undo log of this transaction
+@param[in] name table name
+@param[out] undo_rec own: copy of the record
+@retval true if the undo log has been
+truncated and we cannot fetch the old version
+@retval false if the undo log record is available
+NOTE: the caller must have latches on the clustered index page. */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+trx_undo_get_undo_rec(
+ roll_ptr_t roll_ptr,
+ mem_heap_t* heap,
+ trx_id_t trx_id,
+ const table_name_t& name,
+ trx_undo_rec_t** undo_rec)
+{
+ rw_lock_s_lock(&purge_sys.latch);
+
+ bool missing_history = purge_sys.changes_visible(trx_id, name);
+ if (!missing_history) {
+ *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+ }
+
+ rw_lock_s_unlock(&purge_sys.latch);
+
+ return(missing_history);
+}
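+
+/* An illustrative sketch (not part of the build) of the calling
+convention: a "true" return means the history is gone, so the caller
+must give up building the old version:
+
+	trx_undo_rec_t*	rec;
+	if (trx_undo_get_undo_rec(roll_ptr, heap, trx_id, name, &rec)) {
+		// already purged; the old version cannot be fetched
+	}
+*/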
+
+#ifdef UNIV_DEBUG
+#define ATTRIB_USED_ONLY_IN_DEBUG
+#else /* UNIV_DEBUG */
+#define ATTRIB_USED_ONLY_IN_DEBUG MY_ATTRIBUTE((unused))
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Build a previous version of a clustered index record. The caller must
+hold a latch on the index page of the clustered index record.
+@retval true if previous version was built, or if it was an insert
+or the table has been rebuilt
+@retval false if the previous version is earlier than purge_view,
+or being purged, which means that it may have been removed */
+bool
+trx_undo_prev_version_build(
+/*========================*/
+ const rec_t* index_rec ATTRIB_USED_ONLY_IN_DEBUG,
+ /*!< in: clustered index record in the
+ index tree */
+ mtr_t* index_mtr ATTRIB_USED_ONLY_IN_DEBUG,
+ /*!< in: mtr which contains the latch to
+ index_rec page and purge_view */
+ const rec_t* rec, /*!< in: version of a clustered index record */
+ dict_index_t* index, /*!< in: clustered index */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ rec_t** old_vers,/*!< out, own: previous version, or NULL if
+ rec is the first inserted version, or if
+ history data has been deleted (an error),
+ or if the purge COULD have removed the version
+ though it has not yet done so */
+	mem_heap_t*	v_heap,	/*!< in: memory heap used to create vrow
+				dtuple if it is not yet created. This heap
+				differs from "heap" above in that it could be
+				prebuilt->old_vers_heap for selection */
+ dtuple_t** vrow, /*!< out: virtual column info, if any */
+ ulint v_status)
+				/*!< in: status that determines whether this
+				function is invoked by the purge thread, and
+				whether we read the "after image" of the
+				undo log */
+{
+ trx_undo_rec_t* undo_rec = NULL;
+ dtuple_t* entry;
+ trx_id_t rec_trx_id;
+ ulint type;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ upd_t* update;
+ byte* ptr;
+ byte info_bits;
+ ulint cmpl_info;
+ bool dummy_extern;
+ byte* buf;
+
+ ut_ad(!index->table->is_temporary());
+ ut_ad(!rw_lock_own(&purge_sys.latch, RW_LOCK_S));
+ ut_ad(index_mtr->memo_contains_page_flagged(index_rec,
+ MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_a(index->is_primary());
+
+ roll_ptr = row_get_rec_roll_ptr(rec, index, offsets);
+
+ *old_vers = NULL;
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+ /* The record rec is the first inserted version */
+ return(true);
+ }
+
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ ut_ad(!index->table->skip_alter_undo);
+
+ if (trx_undo_get_undo_rec(
+ roll_ptr, heap, rec_trx_id, index->table->name,
+ &undo_rec)) {
+ if (v_status & TRX_UNDO_PREV_IN_PURGE) {
+ /* We are fetching the record being purged */
+ undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+ } else {
+			/* The undo record may already have been purged;
+			this can happen during purge or a semi-consistent
+			read. */
+ return(false);
+ }
+ }
+
+ ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+
+ if (table_id != index->table->id) {
+ /* The table should have been rebuilt, but purge has
+ not yet removed the undo log records for the
+ now-dropped old table (table_id). */
+ return(true);
+ }
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ /* (a) If a clustered index record version is such that the
+ trx id stamp in it is bigger than purge_sys.view, then the
+ BLOBs in that version are known to exist (the purge has not
+ progressed that far);
+
+ (b) if the version is the first version such that trx id in it
+ is less than purge_sys.view, and it is not delete-marked,
+ then the BLOBs in that version are known to exist (the purge
+ cannot have purged the BLOBs referenced by that version
+ yet).
+
+ This function does not fetch any BLOBs. The callers might, by
+ possibly invoking row_ext_create() via row_build(). However,
+ they should have all needed information in the *old_vers
+ returned by this function. This is because *old_vers is based
+ on the transaction undo log records. The function
+ trx_undo_page_fetch_ext() will write BLOB prefixes to the
+ transaction undo log that are at least as long as the longest
+ possible column prefix in a secondary index. Thus, secondary
+ index entries for *old_vers can be constructed without
+ dereferencing any BLOB pointers. */
+
+ ptr = trx_undo_rec_skip_row_ref(ptr, index);
+
+ ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id,
+ roll_ptr, info_bits,
+ heap, &update);
+ ut_a(ptr);
+
+ if (row_upd_changes_field_size_or_external(index, offsets, update)) {
+ /* We should confirm the existence of disowned external data,
+ if the previous version record is delete marked. If the trx_id
+ of the previous record is seen by purge view, we should treat
+ it as missing history, because the disowned external data
+ might be purged already.
+
+ The inherited external data (BLOBs) can be freed (purged)
+ after trx_id was committed, provided that no view was started
+ before trx_id. If the purge view can see the committed
+ delete-marked record by trx_id, no transactions need to access
+ the BLOB. */
+
+ /* the row_upd_changes_disowned_external(update) call could be
+ omitted, but the synchronization on purge_sys.latch is likely
+ more expensive. */
+
+ if ((update->info_bits & REC_INFO_DELETED_FLAG)
+ && row_upd_changes_disowned_external(update)) {
+ bool missing_extern;
+
+ rw_lock_s_lock(&purge_sys.latch);
+
+ missing_extern = purge_sys.changes_visible(
+ trx_id, index->table->name);
+
+ rw_lock_s_unlock(&purge_sys.latch);
+
+ if (missing_extern) {
+ /* treat as a fresh insert, not to
+ cause assertion error at the caller. */
+ return(true);
+ }
+ }
+
+ /* We have to set the appropriate extern storage bits in the
+ old version of the record: the extern bits in rec for those
+ fields that update does NOT update, as well as the bits for
+ those fields that update updates to become externally stored
+ fields. Store the info: */
+
+ entry = row_rec_to_index_entry(rec, index, offsets, heap);
+ /* The page containing the clustered index record
+ corresponding to entry is latched in mtr. Thus the
+ following call is safe. */
+ if (!row_upd_index_replace_new_col_vals(entry, *index, update,
+ heap)) {
+ ut_a(v_status & TRX_UNDO_PREV_IN_PURGE);
+ return false;
+ }
+
+ /* Get number of externally stored columns in updated record */
+ const ulint n_ext = index->is_primary()
+ ? dtuple_get_n_ext(entry) : 0;
+
+ buf = static_cast<byte*>(mem_heap_alloc(
+ heap, rec_get_converted_size(index, entry, n_ext)));
+
+ *old_vers = rec_convert_dtuple_to_rec(buf, index,
+ entry, n_ext);
+ } else {
+ buf = static_cast<byte*>(mem_heap_alloc(
+ heap, rec_offs_size(offsets)));
+
+ *old_vers = rec_copy(buf, rec, offsets);
+ rec_offs_make_valid(*old_vers, index, true, offsets);
+ rec_set_bit_field_1(*old_vers, update->info_bits,
+ rec_offs_comp(offsets)
+ ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+ for (ulint i = 0; i < update->n_fields; i++) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+ if (upd_fld_is_virtual_col(uf)) {
+ /* There are no virtual columns in
+ a clustered index record. */
+ continue;
+ }
+ const ulint n = uf->field_no;
+ ut_ad(!dfield_is_ext(&uf->new_val)
+ == !rec_offs_nth_extern(offsets, n));
+ ut_ad(!rec_offs_nth_default(offsets, n));
+
+ if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) {
+ if (rec_offs_nth_sql_null(offsets, n)) {
+ ut_ad(index->table->is_instant());
+ ut_ad(n >= index->n_core_fields);
+ continue;
+ }
+ ut_ad(!index->table->not_redundant());
+ ulint l = rec_get_1byte_offs_flag(*old_vers)
+ ? (n + 1) : (n + 1) * 2;
+ byte* b = *old_vers - REC_N_OLD_EXTRA_BYTES
+ - l;
+ *b= byte(*b | REC_1BYTE_SQL_NULL_MASK);
+ compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
+ == REC_2BYTE_SQL_NULL_MASK);
+ continue;
+ }
+
+ ulint len;
+ memcpy(rec_get_nth_field(*old_vers, offsets, n, &len),
+ uf->new_val.data, uf->new_val.len);
+ if (UNIV_UNLIKELY(len != uf->new_val.len)) {
+ ut_ad(len == UNIV_SQL_NULL);
+ ut_ad(!rec_offs_comp(offsets));
+ ut_ad(uf->new_val.len
+ == rec_get_nth_field_size(rec, n));
+ ulint l = rec_get_1byte_offs_flag(*old_vers)
+ ? (n + 1) : (n + 1) * 2;
+ *(*old_vers - REC_N_OLD_EXTRA_BYTES - l)
+ &= byte(~REC_1BYTE_SQL_NULL_MASK);
+ }
+ }
+ }
+
+	/* Copy the old values (the after image of the update) from the
+	update vector into the dtuple vrow */
+ if (v_status & TRX_UNDO_GET_OLD_V_VALUE) {
+ row_upd_replace_vcol((dtuple_t*)*vrow, index->table, update,
+ false, NULL, NULL);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ rec_offs offsets_dbg[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_dbg);
+ ut_a(!rec_offs_any_null_extern(
+ *old_vers, rec_get_offsets(*old_vers, index, offsets_dbg,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap)));
+#endif // defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+
+ if (vrow && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ if (!(*vrow)) {
+ *vrow = dtuple_create_with_vcol(
+ v_heap ? v_heap : heap,
+ dict_table_get_n_cols(index->table),
+ dict_table_get_n_v_cols(index->table));
+ dtuple_init_v_fld(*vrow);
+ }
+
+ ut_ad(index->table->n_v_cols);
+ trx_undo_read_v_cols(index->table, ptr, *vrow,
+ v_status & TRX_UNDO_PREV_IN_PURGE);
+ }
+
+ return(true);
+}
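+
+/* An illustrative sketch (not part of the build): a consistent read
+walks the version chain by repeatedly building the previous version
+until one is visible to its read view. Here changes_visible() stands
+in for the caller's visibility test, and offsets would have to be
+recomputed for each version:
+
+	rec_t*	version = rec;
+	while (version && !changes_visible(version)) {
+		rec_t*	prev = NULL;
+		if (!trx_undo_prev_version_build(rec, mtr, version, index,
+						 offsets, heap, &prev,
+						 NULL, NULL, 0)) {
+			break;	// the history may have been purged
+		}
+		version = prev;	// NULL if this was the first version
+	}
+*/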
+
+/** Read virtual column value from undo log
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[in,out] row the dtuple to fill
+@param[in] in_purge whether this is called by purge */
+void
+trx_undo_read_v_cols(
+ const dict_table_t* table,
+ const byte* ptr,
+ dtuple_t* row,
+ bool in_purge)
+{
+ const byte* end_ptr;
+ bool first_v_col = true;
+ bool is_undo_log = true;
+
+ end_ptr = ptr + mach_read_from_2(ptr);
+ ptr += 2;
+ while (ptr < end_ptr) {
+ dfield_t* dfield;
+ const byte* field;
+ uint32_t field_no, len, orig_len;
+
+ field_no = mach_read_next_compressed(
+ const_cast<const byte**>(&ptr));
+
+ const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_virtual) {
+ ptr = trx_undo_read_v_idx(
+ table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+ }
+
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+
+		/* The virtual column is no longer indexed or does not
+		exist. This check must come after trx_undo_rec_get_col_val()
+		so that the undo log pointer still advances */
+ if (field_no == FIL_NULL) {
+ ut_ad(is_virtual);
+ continue;
+ }
+
+ if (is_virtual) {
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(
+ table, field_no);
+
+ dfield = dtuple_get_nth_v_field(row, vcol->v_pos);
+
+ if (!in_purge
+ || dfield_get_type(dfield)->mtype == DATA_MISSING) {
+ dict_col_copy_type(
+ &vcol->m_col,
+ dfield_get_type(dfield));
+ dfield_set_data(dfield, field, len);
+ }
+ }
+ }
+
+ ut_ad(ptr == end_ptr);
+}
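+
+/* An illustrative sketch (not part of the build) of the virtual-column
+section parsed above: a two-byte total length, then one entry per
+column, each consisting of a compressed field number (virtual columns
+are offset by REC_MAX_N_FIELDS), for new-format records the index
+position data read by trx_undo_read_v_idx(), and the column value:
+
+	2 bytes   total length of the section
+	repeated: field_no  [v_idx data]  col_val
+*/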
diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc
new file mode 100644
index 00000000..23aa950a
--- /dev/null
+++ b/storage/innobase/trx/trx0roll.cc
@@ -0,0 +1,984 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0roll.cc
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0roll.h"
+
+#include <my_service_manager.h>
+#include <mysql/service_wsrep.h>
+
+#include "fsp0fsp.h"
+#include "lock0lock.h"
+#include "mach0data.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "row0mysql.h"
+#include "row0undo.h"
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "trx0rec.h"
+#include "trx0rseg.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+
+#ifdef UNIV_PFS_THREAD
+mysql_pfs_key_t trx_rollback_clean_thread_key;
+#endif
+
+/** true if trx_rollback_all_recovered() thread is active */
+bool trx_rollback_is_active;
+
+/** In crash recovery, the current trx to be rolled back; NULL otherwise */
+const trx_t* trx_roll_crash_recv_trx;
+
+/** Finish transaction rollback.
+@return whether the rollback was completed normally
+@retval false if the rollback was aborted by shutdown */
+inline bool trx_t::rollback_finish()
+{
+ mod_tables.clear();
+ if (UNIV_LIKELY(error_state == DB_SUCCESS))
+ {
+ commit();
+ return true;
+ }
+
+ ut_a(error_state == DB_INTERRUPTED);
+ ut_ad(srv_shutdown_state != SRV_SHUTDOWN_NONE);
+ ut_a(!srv_undo_sources);
+ ut_ad(srv_fast_shutdown);
+ ut_d(in_rollback= false);
+ if (trx_undo_t *&undo= rsegs.m_redo.undo)
+ {
+ UT_LIST_REMOVE(rsegs.m_redo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo= nullptr;
+ }
+ if (trx_undo_t *&undo= rsegs.m_noredo.undo)
+ {
+ UT_LIST_REMOVE(rsegs.m_noredo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo= nullptr;
+ }
+ commit_low();
+ lock.que_state= TRX_QUE_RUNNING;
+ return false;
+}
+
+/** Roll back an active transaction. */
+inline void trx_t::rollback_low(trx_savept_t *savept)
+{
+ mem_heap_t *heap= mem_heap_create(512);
+ roll_node_t *roll_node= roll_node_create(heap);
+ roll_node->savept= savept;
+
+ ut_ad(!in_rollback);
+#ifdef UNIV_DEBUG
+ {
+ const auto s= state;
+ ut_ad(s == TRX_STATE_ACTIVE ||
+ s == TRX_STATE_PREPARED ||
+ s == TRX_STATE_PREPARED_RECOVERED);
+ if (savept)
+ {
+ ut_ad(s == TRX_STATE_ACTIVE);
+ ut_ad(mysql_thd);
+ ut_ad(!is_recovered);
+ }
+ }
+#endif
+
+ error_state = DB_SUCCESS;
+
+ if (has_logged())
+ {
+ ut_ad(rsegs.m_redo.rseg || rsegs.m_noredo.rseg);
+ que_thr_t *thr= pars_complete_graph_for_exec(roll_node, this, heap,
+ nullptr);
+ ut_a(thr == que_fork_start_command(static_cast<que_fork_t*>
+ (que_node_get_parent(thr))));
+ que_run_threads(thr);
+ que_run_threads(roll_node->undo_thr);
+
+ /* Free the memory reserved by the undo graph. */
+ que_graph_free(static_cast<que_t*>(roll_node->undo_thr->common.parent));
+ }
+
+ if (!savept)
+ {
+ rollback_finish();
+ MONITOR_INC(MONITOR_TRX_ROLLBACK);
+ }
+ else
+ {
+ ut_a(error_state == DB_SUCCESS);
+ const undo_no_t limit= savept->least_undo_no;
+ for (trx_mod_tables_t::iterator i= mod_tables.begin();
+ i != mod_tables.end(); )
+ {
+ trx_mod_tables_t::iterator j= i++;
+ ut_ad(j->second.valid());
+ if (j->second.rollback(limit))
+ mod_tables.erase(j);
+ }
+ lock.que_state= TRX_QUE_RUNNING;
+ MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT);
+ }
+
+ mem_heap_free(heap);
+
+ MONITOR_DEC(MONITOR_TRX_ACTIVE);
+}
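+
+/* An illustrative sketch (not part of the build): a savepoint rollback
+only undoes records with undo_no >= savept->least_undo_no, which is why
+the loop above erases mod_tables entries whose first modification
+happened after the savepoint:
+
+  trx_savept_t s= trx_savept_take(trx); // s.least_undo_no= trx->undo_no
+  // ... more row modifications ...
+  trx->rollback(&s); // undo back to s.least_undo_no only
+*/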
+
+/** Initiate rollback.
+@param savept savepoint
+@return error code or DB_SUCCESS */
+dberr_t trx_t::rollback(trx_savept_t *savept)
+{
+ ut_ad(!trx_mutex_own(this));
+ if (state == TRX_STATE_NOT_STARTED)
+ {
+ error_state= DB_SUCCESS;
+ return DB_SUCCESS;
+ }
+ ut_ad(state == TRX_STATE_ACTIVE);
+#ifdef WITH_WSREP
+ if (!savept && is_wsrep() && wsrep_thd_is_SR(mysql_thd))
+ wsrep_handle_SR_rollback(nullptr, mysql_thd);
+#endif /* WITH_WSREP */
+ rollback_low(savept);
+ return error_state;
+}
+
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+static
+dberr_t
+trx_rollback_for_mysql_low(
+/*=======================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx->op_info = "rollback";
+
+ /* If we are doing the XA recovery of prepared transactions,
+ then the transaction object does not have an InnoDB session
+ object, and we set a dummy session that we use for all MySQL
+ transactions. */
+
+ trx->rollback_low();
+
+ trx->op_info = "";
+
+ return(trx->error_state);
+}
+
+/** Rollback a transaction used in MySQL
+@param[in, out] trx transaction
+@return error code or DB_SUCCESS */
+dberr_t trx_rollback_for_mysql(trx_t* trx)
+{
+ /* We are reading trx->state without holding trx->mutex
+ here, because the rollback should be invoked for a running
+ active MySQL transaction (or recovered prepared transaction)
+ that is associated with the current thread. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx->will_lock = false;
+ ut_ad(trx->mysql_thd);
+#ifdef WITH_WSREP
+ trx->wsrep= false;
+ trx->lock.was_chosen_as_wsrep_victim= false;
+#endif
+ return(DB_SUCCESS);
+
+ case TRX_STATE_ACTIVE:
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->is_recovered);
+ ut_ad(!trx->is_autocommit_non_locking() || trx->read_only);
+ return(trx_rollback_for_mysql_low(trx));
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ ut_ad(!trx->is_autocommit_non_locking());
+ if (trx->rsegs.m_redo.undo) {
+ /* The XA ROLLBACK of a XA PREPARE transaction
+ will consist of multiple mini-transactions.
+
+ As the very first step of XA ROLLBACK, we must
+ change the undo log state back from
+ TRX_UNDO_PREPARED to TRX_UNDO_ACTIVE, in order
+ to ensure that recovery will complete the
+ rollback.
+
+ Failure to perform this step could cause a
+ situation where we would roll back part of
+ a XA PREPARE transaction, the server would be
+ killed, and finally, the transaction would be
+ recovered in XA PREPARE state, with some of
+ the actions already having been rolled back. */
+ ut_ad(trx->rsegs.m_redo.undo->rseg
+ == trx->rsegs.m_redo.rseg);
+ mtr_t mtr;
+ mtr.start();
+ mutex_enter(&trx->rsegs.m_redo.rseg->mutex);
+ if (trx_undo_t* undo = trx->rsegs.m_redo.undo) {
+ trx_undo_set_state_at_prepare(trx, undo, true,
+ &mtr);
+ }
+ mutex_exit(&trx->rsegs.m_redo.rseg->mutex);
+ /* Write the redo log for the XA ROLLBACK
+ state change to the global buffer. It is
+ not necessary to flush the redo log. If
+ a durable log write of a later mini-transaction
+ takes place for whatever reason, then this state
+ change will be durable as well. */
+ mtr.commit();
+ ut_ad(mtr.commit_lsn() > 0);
+ }
+ return(trx_rollback_for_mysql_low(trx));
+
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ ut_ad(!trx->is_autocommit_non_locking());
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ dberr_t err;
+
+ /* We are reading trx->state without holding trx->mutex
+ here, because the statement rollback should be invoked for a
+ running active MySQL transaction that is associated with the
+ current thread. */
+ ut_ad(trx->mysql_thd);
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ return(DB_SUCCESS);
+
+ case TRX_STATE_ACTIVE:
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->is_recovered);
+ ut_ad(!trx->is_autocommit_non_locking() || trx->read_only);
+
+ trx->op_info = "rollback of SQL statement";
+
+ err = trx->rollback(&trx->last_sql_stat_start);
+
+ if (trx->fts_trx != NULL) {
+ fts_savepoint_rollback_last_stmt(trx);
+ }
+
+ /* The following call should not be needed,
+ but we play it safe: */
+ trx_mark_sql_stat_end(trx);
+
+ trx->op_info = "";
+
+ return(err);
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ /* The statement rollback is only allowed on an ACTIVE
+ transaction, not a PREPARED or COMMITTED one. */
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Search for a savepoint using name.
+@return savepoint if found else NULL */
+static
+trx_named_savept_t*
+trx_savepoint_find(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name) /*!< in: savepoint name */
+{
+ trx_named_savept_t* savep;
+
+ for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+ savep != NULL;
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
+ if (!strcmp(savep->name, name)) {
+ return(savep);
+ }
+ }
+
+ return(NULL);
+}
+
+/*******************************************************************//**
+Frees a single savepoint struct. */
+static
+void
+trx_roll_savepoint_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep) /*!< in: savepoint to free */
+{
+ UT_LIST_REMOVE(trx->trx_savepoints, savep);
+
+ ut_free(savep->name);
+ ut_free(savep);
+}
+
+/*******************************************************************//**
+Frees savepoint structs starting from savep. */
+void
+trx_roll_savepoints_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep) /*!< in: free all savepoints starting
+					with this savepoint */
+{
+ while (savep != NULL) {
+ trx_named_savept_t* next_savep;
+
+ next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+
+ trx_roll_savepoint_free(trx, savep);
+
+ savep = next_savep;
+ }
+}
+
+/*******************************************************************//**
+Rolls back a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+trx_rollback_to_savepoint_for_mysql_low(
+/*====================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ trx_named_savept_t* savep, /*!< in/out: savepoint */
+ int64_t* mysql_binlog_cache_pos)
+ /*!< out: the MySQL binlog
+ cache position corresponding
+ to this savepoint; MySQL needs
+ this information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+{
+ dberr_t err;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(trx->mysql_thd);
+
+ /* Free all savepoints strictly later than savep. */
+
+ trx_roll_savepoints_free(
+ trx, UT_LIST_GET_NEXT(trx_savepoints, savep));
+
+ *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
+
+ trx->op_info = "rollback to a savepoint";
+
+ err = trx->rollback(&savep->savept);
+
+ /* Store the current undo_no of the transaction so that
+ we know where to roll back if we have to roll back the
+ next SQL statement: */
+
+ trx_mark_sql_stat_end(trx);
+
+ trx->op_info = "";
+#ifdef WITH_WSREP
+ trx->lock.was_chosen_as_wsrep_victim = false;
+#endif
+ return(err);
+}
+
+/*******************************************************************//**
+Rolls back a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache
+ position corresponding to this
+ savepoint; MySQL needs this
+ information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+{
+ trx_named_savept_t* savep;
+
+ /* We are reading trx->state without holding trx->mutex
+ here, because the savepoint rollback should be invoked for a
+ running active MySQL transaction that is associated with the
+ current thread. */
+ ut_ad(trx->mysql_thd);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep == NULL) {
+ return(DB_NO_SAVEPOINT);
+ }
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ ib::error() << "Transaction has a savepoint "
+ << savep->name
+ << " though it is not started";
+ return(DB_ERROR);
+
+ case TRX_STATE_ACTIVE:
+
+ return(trx_rollback_to_savepoint_for_mysql_low(
+ trx, savep, mysql_binlog_cache_pos));
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ /* The savepoint rollback is only allowed on an ACTIVE
+ transaction, not a PREPARED or COMMITTED one. */
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new one. Savepoints are deleted in a transaction
+commit or rollback.
+@return always DB_SUCCESS */
+dberr_t
+trx_savepoint_for_mysql(
+/*====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ int64_t binlog_cache_pos) /*!< in: MySQL binlog cache
+ position corresponding to this
+ connection at the time of the
+ savepoint */
+{
+ trx_named_savept_t* savep;
+
+ trx_start_if_not_started_xa(trx, false);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep) {
+ /* There is a savepoint with the same name: free that */
+
+ UT_LIST_REMOVE(trx->trx_savepoints, savep);
+
+ ut_free(savep->name);
+ ut_free(savep);
+ }
+
+ /* Create a new savepoint and add it as the last in the list */
+
+ savep = static_cast<trx_named_savept_t*>(
+ ut_malloc_nokey(sizeof(*savep)));
+
+ savep->name = mem_strdup(savepoint_name);
+
+ savep->savept = trx_savept_take(trx);
+
+ savep->mysql_binlog_cache_pos = binlog_cache_pos;
+
+ UT_LIST_ADD_LAST(trx->trx_savepoints, savep);
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Releases only the named savepoint. Savepoints which were set after this
+savepoint are left as is.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_release_savepoint_for_mysql(
+/*============================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name) /*!< in: savepoint name */
+{
+ trx_named_savept_t* savep;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE, true)
+ || trx_state_eq(trx, TRX_STATE_PREPARED, true));
+ ut_ad(trx->mysql_thd);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep != NULL) {
+ trx_roll_savepoint_free(trx, savep);
+ }
+
+ return(savep != NULL ? DB_SUCCESS : DB_NO_SAVEPOINT);
+}
+
+/*******************************************************************//**
+Returns a transaction savepoint taken at this point in time.
+@return savepoint */
+trx_savept_t
+trx_savept_take(
+/*============*/
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_savept_t savept;
+
+ savept.least_undo_no = trx->undo_no;
+
+ return(savept);
+}
+
+/*******************************************************************//**
+Roll back an active transaction. */
+static
+void
+trx_rollback_active(
+/*================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ roll_node_t* roll_node;
+ const trx_id_t trx_id = trx->id;
+
+ ut_ad(trx_id);
+
+ heap = mem_heap_create(512);
+
+ fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap, NULL);
+
+ roll_node = roll_node_create(heap);
+
+ thr->child = roll_node;
+ roll_node->common.parent = thr;
+
+ trx->graph = fork;
+
+ ut_a(thr == que_fork_start_command(fork));
+
+ trx_roll_crash_recv_trx = trx;
+
+ const bool dictionary_locked = trx_get_dict_operation(trx)
+ != TRX_DICT_OP_NONE;
+
+ if (dictionary_locked) {
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ que_run_threads(thr);
+ ut_a(roll_node->undo_thr != NULL);
+
+ que_run_threads(roll_node->undo_thr);
+
+ que_graph_free(
+ static_cast<que_t*>(roll_node->undo_thr->common.parent));
+
+ if (UNIV_UNLIKELY(!trx->rollback_finish())) {
+ ut_ad(!dictionary_locked);
+ goto func_exit;
+ }
+
+ ut_a(trx->lock.que_state == TRX_QUE_RUNNING);
+
+ if (!dictionary_locked || !trx->table_id) {
+ } else if (dict_table_t* table = dict_table_open_on_id(
+ trx->table_id, TRUE, DICT_TABLE_OP_NORMAL)) {
+ ib::info() << "Dropping table " << table->name
+ << ", with id " << trx->table_id
+ << " in recovery";
+
+ dict_table_close_and_drop(trx, table);
+
+ trx_commit_for_mysql(trx);
+ }
+
+ ib::info() << "Rolled back recovered transaction " << trx_id;
+
+func_exit:
+ if (dictionary_locked) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ mem_heap_free(heap);
+
+ trx_roll_crash_recv_trx = NULL;
+}
+
+
+struct trx_roll_count_callback_arg
+{
+ uint32_t n_trx;
+ uint64_t n_rows;
+ trx_roll_count_callback_arg(): n_trx(0), n_rows(0) {}
+};
+
+
+static my_bool trx_roll_count_callback(rw_trx_hash_element_t *element,
+ trx_roll_count_callback_arg *arg)
+{
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ if (trx->is_recovered && trx_state_eq(trx, TRX_STATE_ACTIVE))
+ {
+ arg->n_trx++;
+ arg->n_rows+= trx->undo_no;
+ }
+ }
+ mutex_exit(&element->mutex);
+ return 0;
+}
+
+/** Report progress when rolling back a row of a recovered transaction. */
+void trx_roll_report_progress()
+{
+ time_t now = time(NULL);
+ mutex_enter(&recv_sys.mutex);
+ bool report = recv_sys.report(now);
+ mutex_exit(&recv_sys.mutex);
+
+ if (report) {
+ trx_roll_count_callback_arg arg;
+
+ /* Get number of recovered active transactions and number of
+ rows they modified. Numbers must be accurate, because only this
+ thread is allowed to touch recovered transactions. */
+ trx_sys.rw_trx_hash.iterate_no_dups(
+ trx_roll_count_callback, &arg);
+
+ if (arg.n_rows > 0) {
+ service_manager_extend_timeout(
+ INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "To roll back: " UINT32PF " transactions, "
+ UINT64PF " rows", arg.n_trx, arg.n_rows);
+ }
+
+ ib::info() << "To roll back: " << arg.n_trx
+ << " transactions, " << arg.n_rows << " rows";
+
+ }
+}
+
+
+static my_bool trx_rollback_recovered_callback(rw_trx_hash_element_t *element,
+ std::vector<trx_t*> *trx_list)
+{
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ mutex_enter(&trx->mutex);
+ if (trx_state_eq(trx, TRX_STATE_ACTIVE) && trx->is_recovered)
+ trx_list->push_back(trx);
+ mutex_exit(&trx->mutex);
+ }
+ mutex_exit(&element->mutex);
+ return 0;
+}
+
+
+/**
+ Rollback any incomplete transactions which were encountered in crash recovery.
+
+ If the transaction already was committed, then we clean up a possible insert
+ undo log. If the transaction was not yet committed, then we roll it back.
+
+ Note: For XA recovered transactions, we rely on MySQL to
+ do rollback. They will be in TRX_STATE_PREPARED state. If the server
+  is shut down and they are still lingering in trx_sys_t::trx_list,
+  then the shutdown will hang.
+
+ @param[in] all true=roll back all recovered active transactions;
+ false=roll back any incomplete dictionary transaction
+*/
+
+void trx_rollback_recovered(bool all)
+{
+ std::vector<trx_t*> trx_list;
+
+ ut_a(srv_force_recovery < SRV_FORCE_NO_TRX_UNDO);
+
+ /*
+ Collect list of recovered ACTIVE transaction ids first. Once collected, no
+ other thread is allowed to modify or remove these transactions from
+ rw_trx_hash.
+ */
+ trx_sys.rw_trx_hash.iterate_no_dups(trx_rollback_recovered_callback,
+ &trx_list);
+
+ while (!trx_list.empty())
+ {
+ trx_t *trx= trx_list.back();
+ trx_list.pop_back();
+
+ ut_ad(trx);
+ ut_d(trx_mutex_enter(trx));
+ ut_ad(trx->is_recovered);
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_d(trx_mutex_exit(trx));
+
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE && !srv_undo_sources &&
+ srv_fast_shutdown)
+ goto discard;
+
+ if (all || trx_get_dict_operation(trx) != TRX_DICT_OP_NONE
+ || trx->has_stats_table_lock())
+ {
+ trx_rollback_active(trx);
+ if (trx->error_state != DB_SUCCESS)
+ {
+ ut_ad(trx->error_state == DB_INTERRUPTED);
+ trx->error_state= DB_SUCCESS;
+ ut_ad(!srv_undo_sources);
+ ut_ad(srv_fast_shutdown);
+discard:
+ /* Note: before kill_server() invoked innobase_end() via
+ unireg_end(), it invoked close_connections(), which should initiate
+ the rollback of any user transactions via THD::cleanup() in the
+ connection threads, and wait for all THD::cleanup() to complete.
+ So, no active user transactions should exist at this point.
+
+      srv_undo_sources was cleared (set to false) early in innobase_end().
+
+ Generally, the server guarantees that all connections using
+ InnoDB must be disconnected by the time we are reaching this code,
+ be it during shutdown or UNINSTALL PLUGIN.
+
+ Because there is no possible race condition with any
+ concurrent user transaction, we do not have to invoke
+ trx->commit_state() or wait for !trx->is_referenced()
+ before trx_sys.deregister_rw(trx). */
+ trx_sys.deregister_rw(trx);
+ trx_free_at_shutdown(trx);
+ }
+ else
+ trx->free();
+ }
+ }
+}
+
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread.
+@return a dummy parameter */
+extern "C"
+os_thread_ret_t
+DECLARE_THREAD(trx_rollback_all_recovered)(void*)
+{
+ my_thread_init();
+ ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(trx_rollback_clean_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+ if (trx_sys.rw_trx_hash.size()) {
+ ib::info() << "Starting in background the rollback of"
+ " recovered transactions";
+ trx_rollback_recovered(true);
+ ib::info() << "Rollback of non-prepared transactions"
+ " completed";
+ }
+
+ trx_rollback_is_active = false;
+
+ my_thread_end();
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit();
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/****************************************************************//**
+Builds an undo 'query' graph for a transaction. The actual rollback is
+performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this
+graph.
+@return own: the query graph */
+static
+que_t*
+trx_roll_graph_build(
+/*=================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+
+ ut_ad(trx_mutex_own(trx));
+
+ heap = mem_heap_create(512);
+ fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap, NULL);
+
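+ /* The fork has a single query thread; its child node will
+ carry out the actual row undo operations. */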
+ thr->child = row_undo_node_create(trx, thr, heap);
+
+ return(fork);
+}
+
+/*********************************************************************//**
+Starts a rollback operation, creates the UNDO graph that will do the
+actual undo operation.
+@return query graph thread that will perform the UNDO operations. */
+static
+que_thr_t*
+trx_rollback_start(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ undo_no_t roll_limit) /*!< in: rollback to undo no (for
+ partial undo), 0 if we are rolling back
+ the entire transaction */
+{
+ ut_ad(trx_mutex_own(trx));
+
+ /* Initialize the rollback field in the transaction */
+
+ ut_ad(!trx->roll_limit);
+ ut_ad(!trx->in_rollback);
+
+ trx->roll_limit = roll_limit;
+ trx->in_rollback = true;
+
+ ut_a(trx->roll_limit <= trx->undo_no);
+
+ trx->pages_undone = 0;
+
+ /* Build a 'query' graph which will perform the undo operations */
+
+ que_t* roll_graph = trx_roll_graph_build(trx);
+
+ trx->graph = roll_graph;
+
+ trx->lock.que_state = TRX_QUE_ROLLING_BACK;
+
+ return(que_fork_start_command(roll_graph));
+}
+
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return own: rollback node struct */
+roll_node_t*
+roll_node_create(
+/*=============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ roll_node_t* node;
+
+ node = static_cast<roll_node_t*>(mem_heap_zalloc(heap, sizeof(*node)));
+
+ node->state = ROLL_NODE_SEND;
+
+ node->common.type = QUE_NODE_ROLLBACK;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ roll_node_t* node;
+
+ node = static_cast<roll_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = ROLL_NODE_SEND;
+ }
+
+ if (node->state == ROLL_NODE_SEND) {
+ trx_t* trx;
+ ib_id_t roll_limit;
+
+ trx = thr_get_trx(thr);
+
+ trx_mutex_enter(trx);
+
+ node->state = ROLL_NODE_WAIT;
+
+ ut_a(node->undo_thr == NULL);
+
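+ /* A nonzero roll_limit requests a partial rollback to a
+ savepoint; 0 rolls back the entire transaction. */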
+ roll_limit = node->savept ? node->savept->least_undo_no : 0;
+
+ trx_commit_or_rollback_prepare(trx);
+
+ node->undo_thr = trx_rollback_start(trx, roll_limit);
+
+ trx_mutex_exit(trx);
+
+ } else {
+ ut_ad(node->state == ROLL_NODE_WAIT);
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc
new file mode 100644
index 00000000..307f8757
--- /dev/null
+++ b/storage/innobase/trx/trx0rseg.cc
@@ -0,0 +1,768 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rseg.cc
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "fut0lst.h"
+#include "srv0srv.h"
+#include "trx0purge.h"
+#include "srv0mon.h"
+
+#ifdef WITH_WSREP
+#include <mysql/service_wsrep.h>
+
+#ifdef UNIV_DEBUG
+/** The latest known WSREP XID sequence number */
+static long long wsrep_seqno = -1;
+#endif /* UNIV_DEBUG */
+/** The latest known WSREP XID UUID */
+static unsigned char wsrep_uuid[16];
+
+/** Write the WSREP XID information into rollback segment header.
+@param[in,out] rseg_header rollback segment header
+@param[in] xid WSREP XID
+@param[in,out]	mtr		mini-transaction */
+static void
+trx_rseg_write_wsrep_checkpoint(
+ buf_block_t* rseg_header,
+ const XID* xid,
+ mtr_t* mtr)
+{
+ DBUG_ASSERT(xid->gtrid_length >= 0);
+ DBUG_ASSERT(xid->bqual_length >= 0);
+ DBUG_ASSERT(xid->gtrid_length + xid->bqual_length < XIDDATASIZE);
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT
+ + rseg_header->frame,
+ uint32_t(xid->formatID));
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN
+ + rseg_header->frame,
+ uint32_t(xid->gtrid_length));
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN
+ + rseg_header->frame,
+ uint32_t(xid->bqual_length));
+
+ const ulint xid_length = static_cast<ulint>(xid->gtrid_length
+ + xid->bqual_length);
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ + rseg_header->frame,
+ xid->data, xid_length);
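+ /* Zero out any stale bytes after the XID payload, unless the
+ tail is already zero, to avoid writing redundant redo log. */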
+ if (xid_length < XIDDATASIZE
+ && memcmp(TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ + rseg_header->frame, field_ref_zero,
+ XIDDATASIZE - xid_length)) {
+ mtr->memset(rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + xid_length,
+ XIDDATASIZE - xid_length, 0);
+ }
+}
+
+/** Update the WSREP XID information in rollback segment header.
+@param[in,out] rseg_header rollback segment header
+@param[in] xid WSREP XID
+@param[in,out] mtr mini-transaction */
+void
+trx_rseg_update_wsrep_checkpoint(
+ buf_block_t* rseg_header,
+ const XID* xid,
+ mtr_t* mtr)
+{
+ ut_ad(wsrep_is_wsrep_xid(xid));
+
+#ifdef UNIV_DEBUG
+ /* Check that seqno is monotonically increasing */
+ long long xid_seqno = wsrep_xid_seqno(xid);
+ const byte* xid_uuid = wsrep_xid_uuid(xid);
+
+ if (xid_seqno != -1
+ && !memcmp(xid_uuid, wsrep_uuid, sizeof wsrep_uuid)) {
+ ut_ad(xid_seqno > wsrep_seqno);
+ } else {
+ memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid);
+ }
+ wsrep_seqno = xid_seqno;
+#endif /* UNIV_DEBUG */
+ trx_rseg_write_wsrep_checkpoint(rseg_header, xid, mtr);
+}
+
+/** Clear the WSREP XID information from rollback segment header.
+@param[in,out] block rollback segment header
+@param[in,out] mtr mini-transaction */
+static void trx_rseg_clear_wsrep_checkpoint(buf_block_t *block, mtr_t *mtr)
+{
+ mtr->memset(block, TRX_RSEG + TRX_RSEG_WSREP_XID_INFO,
+ TRX_RSEG_WSREP_XID_DATA + XIDDATASIZE - TRX_RSEG_WSREP_XID_INFO,
+ 0);
+}
+
+static void
+trx_rseg_update_wsrep_checkpoint(const XID* xid, mtr_t* mtr)
+{
+ const byte* xid_uuid = wsrep_xid_uuid(xid);
+ /* We must check against wsrep_uuid here, before calling
+ trx_rseg_update_wsrep_checkpoint(): in debug mode that function
+ overwrites wsrep_uuid with the xid contents, so a later memcmp()
+ would never give a nonzero result. */
+ const bool must_clear_rsegs = memcmp(wsrep_uuid, xid_uuid,
+ sizeof wsrep_uuid);
+ const trx_rseg_t* rseg = trx_sys.rseg_array[0];
+
+ buf_block_t* rseg_header = trx_rsegf_get(rseg->space, rseg->page_no,
+ mtr);
+ if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rseg_header->frame))) {
+ trx_rseg_format_upgrade(rseg_header, mtr);
+ }
+
+ trx_rseg_update_wsrep_checkpoint(rseg_header, xid, mtr);
+
+ if (must_clear_rsegs) {
+ /* Because the UUID part of the WSREP XID differs
+ from the last known group UUID, the WSREP group UUID
+ has changed, and we must reset the XID in all the
+ other rollback segment headers. */
+ for (ulint rseg_id = 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id) {
+ if (const trx_rseg_t* rseg =
+ trx_sys.rseg_array[rseg_id]) {
+ trx_rseg_clear_wsrep_checkpoint(
+ trx_rsegf_get(rseg->space,
+ rseg->page_no, mtr),
+ mtr);
+ }
+ }
+ }
+}
+
+/** Update WSREP checkpoint XID in first rollback segment header
+as part of wsrep_set_SE_checkpoint() when it is guaranteed that there
+are no wsrep transactions committing.
+If the UUID part of the WSREP XID does not match the UUIDs of the XIDs
+already stored in the rollback segments, the WSREP XID in all the remaining
+rollback segments will be reset.
+@param[in] xid WSREP XID */
+void trx_rseg_update_wsrep_checkpoint(const XID* xid)
+{
+ mtr_t mtr;
+ mtr.start();
+ trx_rseg_update_wsrep_checkpoint(xid, &mtr);
+ mtr.commit();
+}
+
+/** Read the WSREP XID information in rollback segment header.
+@param[in] rseg_header Rollback segment header
+@param[out] xid Transaction XID
+@return whether the WSREP XID was present */
+static
+bool trx_rseg_read_wsrep_checkpoint(const buf_block_t *rseg_header, XID &xid)
+{
+ int formatID = static_cast<int>(
+ mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT
+ + rseg_header->frame));
+ if (formatID == 0) {
+ return false;
+ }
+
+ xid.formatID = formatID;
+ xid.gtrid_length = static_cast<int>(
+ mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN
+ + rseg_header->frame));
+
+ xid.bqual_length = static_cast<int>(
+ mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN
+ + rseg_header->frame));
+
+ memcpy(xid.data, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ + rseg_header->frame, XIDDATASIZE);
+
+ return true;
+}
+
+/** Read the WSREP XID from the TRX_SYS page (in case of upgrade).
+@param[in] page TRX_SYS page
+@param[out] xid WSREP XID (if present)
+@return whether the WSREP XID is present */
+static bool trx_rseg_init_wsrep_xid(const page_t* page, XID& xid)
+{
+ if (mach_read_from_4(TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_MAGIC_N_FLD
+ + page)
+ != TRX_SYS_WSREP_XID_MAGIC_N) {
+ return false;
+ }
+
+ xid.formatID = static_cast<int>(
+ mach_read_from_4(
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_FORMAT + page));
+ xid.gtrid_length = static_cast<int>(
+ mach_read_from_4(
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_GTRID_LEN + page));
+ xid.bqual_length = static_cast<int>(
+ mach_read_from_4(
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_BQUAL_LEN + page));
+ memcpy(xid.data,
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_DATA + page, XIDDATASIZE);
+ return true;
+}
+
+/** Recover the latest WSREP checkpoint XID.
+@param[out] xid WSREP XID
+@return whether the WSREP XID was found */
+bool trx_rseg_read_wsrep_checkpoint(XID& xid)
+{
+ mtr_t mtr;
+ long long max_xid_seqno = -1;
+ bool found = false;
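+ /* Scan all rollback segment headers; the latest checkpoint is
+ the one with the greatest WSREP XID sequence number. */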
+
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS;
+ rseg_id++, mtr.commit()) {
+ mtr.start();
+ const buf_block_t* sys = trx_sysf_get(&mtr, false);
+ const uint32_t page_no = trx_sysf_rseg_get_page_no(
+ sys, rseg_id);
+
+ if (page_no == FIL_NULL) {
+ continue;
+ }
+
+ const buf_block_t* rseg_header = trx_rsegf_get_new(
+ trx_sysf_rseg_get_space(sys, rseg_id), page_no, &mtr);
+
+ if (mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rseg_header->frame)) {
+ continue;
+ }
+
+ XID tmp_xid;
+ long long tmp_seqno = 0;
+ if (trx_rseg_read_wsrep_checkpoint(rseg_header, tmp_xid)
+ && (tmp_seqno = wsrep_xid_seqno(&tmp_xid))
+ > max_xid_seqno) {
+ found = true;
+ max_xid_seqno = tmp_seqno;
+ xid = tmp_xid;
+ memcpy(wsrep_uuid, wsrep_xid_uuid(&tmp_xid),
+ sizeof wsrep_uuid);
+ }
+ }
+
+ return found;
+}
+#endif /* WITH_WSREP */
+
+/** Upgrade a rollback segment header page to MariaDB 10.3 format.
+@param[in,out] rseg_header rollback segment header page
+@param[in,out] mtr mini-transaction */
+void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr)
+{
+ mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_FORMAT, 4, 0);
+ /* Clear also possible garbage at the end of the page. Old
+ InnoDB versions did not initialize unused parts of pages. */
+ mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8,
+ srv_page_size
+ - (FIL_PAGE_DATA_END + TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8),
+ 0);
+}
+
+/** Create a rollback segment header.
+@param[in,out] space system, undo, or temporary tablespace
+@param[in] rseg_id rollback segment identifier
+@param[in,out] sys_header the TRX_SYS page (NULL for temporary rseg)
+@param[in,out] mtr mini-transaction
+@return the created rollback segment
+@retval NULL on failure */
+buf_block_t*
+trx_rseg_header_create(
+ fil_space_t* space,
+ ulint rseg_id,
+ buf_block_t* sys_header,
+ mtr_t* mtr)
+{
+ buf_block_t* block;
+
+ ut_ad(mtr->memo_contains(*space));
+ ut_ad(!sys_header == (space == fil_system.temp_space));
+
+ /* Allocate a new file segment for the rollback segment */
+ block = fseg_create(space, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr);
+
+ if (block == NULL) {
+ /* No space left */
+ return block;
+ }
+
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
+
+ ut_ad(0 == mach_read_from_4(TRX_RSEG_FORMAT + TRX_RSEG
+ + block->frame));
+ ut_ad(0 == mach_read_from_4(TRX_RSEG_HISTORY_SIZE + TRX_RSEG
+ + block->frame));
+
+ /* Initialize the history list */
+ flst_init(block, TRX_RSEG_HISTORY + TRX_RSEG, mtr);
+
+ /* Reset the undo log slots */
+ mtr->memset(block, TRX_RSEG_UNDO_SLOTS + TRX_RSEG,
+ TRX_RSEG_N_SLOTS * 4, 0xff);
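+ /* The 0xff bytes make each 4-byte slot read as FIL_NULL (unused). */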
+
+ if (sys_header) {
+ /* Add the rollback segment info to the free slot in
+ the trx system header */
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(
+ *sys_header,
+ TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
+ + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+ + sys_header->frame, space->id);
+ mtr->write<4,mtr_t::MAYBE_NOP>(
+ *sys_header,
+ TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO
+ + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+ + sys_header->frame, block->page.id().page_no());
+ }
+
+ return block;
+}
+
+/** Free a rollback segment in memory. */
+void
+trx_rseg_mem_free(trx_rseg_t* rseg)
+{
+ trx_undo_t* undo;
+ trx_undo_t* next_undo;
+
+ mutex_free(&rseg->mutex);
+
+ /* There can't be any active transactions. */
+ ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0);
+
+ for (undo = UT_LIST_GET_FIRST(rseg->undo_cached);
+ undo != NULL;
+ undo = next_undo) {
+
+ next_undo = UT_LIST_GET_NEXT(undo_list, undo);
+
+ UT_LIST_REMOVE(rseg->undo_cached, undo);
+
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+
+ ut_free(undo);
+ }
+
+ ut_free(rseg);
+}
+
+/** Create a rollback segment object.
+@param[in] id rollback segment id
+@param[in] space space where the segment is placed
+@param[in] page_no page number of the segment header */
+static
+trx_rseg_t*
+trx_rseg_mem_create(ulint id, fil_space_t* space, uint32_t page_no)
+{
+ trx_rseg_t* rseg = static_cast<trx_rseg_t*>(
+ ut_zalloc_nokey(sizeof *rseg));
+
+ rseg->id = id;
+ rseg->space = space;
+ rseg->page_no = page_no;
+ rseg->last_page_no = FIL_NULL;
+ rseg->curr_size = 1;
+
+ mutex_create(rseg->is_persistent()
+ ? LATCH_ID_REDO_RSEG : LATCH_ID_NOREDO_RSEG,
+ &rseg->mutex);
+
+ UT_LIST_INIT(rseg->undo_list, &trx_undo_t::undo_list);
+ UT_LIST_INIT(rseg->undo_cached, &trx_undo_t::undo_list);
+
+ return(rseg);
+}
+
+/** Read the undo log lists.
+@param[in,out] rseg rollback segment
+@param[in,out] max_trx_id maximum observed transaction identifier
+@param[in] rseg_header rollback segment header
+@return error code */
+static dberr_t trx_undo_lists_init(trx_rseg_t *rseg, trx_id_t &max_trx_id,
+ const buf_block_t *rseg_header)
+{
+ ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN);
+
+ for (ulint i= 0; i < TRX_RSEG_N_SLOTS; i++)
+ {
+ uint32_t page_no= trx_rsegf_get_nth_undo(rseg_header, i);
+ if (page_no != FIL_NULL)
+ {
+ const trx_undo_t *undo= trx_undo_mem_create_at_db_start(rseg, i, page_no,
+ max_trx_id);
+ if (!undo)
+ return DB_CORRUPTION;
+ rseg->curr_size+= undo->size;
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
+ }
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Restore the state of a persistent rollback segment.
+@param[in,out] rseg persistent rollback segment
+@param[in,out] max_trx_id maximum observed transaction identifier
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, trx_id_t &max_trx_id,
+ mtr_t *mtr)
+{
+ buf_block_t* rseg_hdr = trx_rsegf_get_new(
+ rseg->space->id, rseg->page_no, mtr);
+
+ if (!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rseg_hdr->frame)) {
+ trx_id_t id = mach_read_from_8(TRX_RSEG + TRX_RSEG_MAX_TRX_ID
+ + rseg_hdr->frame);
+
+ if (id > max_trx_id) {
+ max_trx_id = id;
+ }
+
+ const byte* binlog_name = TRX_RSEG + TRX_RSEG_BINLOG_NAME
+ + rseg_hdr->frame;
+ if (*binlog_name) {
+ lsn_t lsn = mach_read_from_8(my_assume_aligned<8>(
+ FIL_PAGE_LSN + rseg_hdr->frame));
+ compile_time_assert(TRX_RSEG_BINLOG_NAME_LEN == sizeof
+ trx_sys.recovered_binlog_filename);
+ if (lsn > trx_sys.recovered_binlog_lsn) {
+ trx_sys.recovered_binlog_lsn = lsn;
+ trx_sys.recovered_binlog_offset
+ = mach_read_from_8(
+ TRX_RSEG
+ + TRX_RSEG_BINLOG_OFFSET
+ + rseg_hdr->frame);
+ memcpy(trx_sys.recovered_binlog_filename,
+ binlog_name,
+ TRX_RSEG_BINLOG_NAME_LEN);
+ }
+
+#ifdef WITH_WSREP
+ trx_rseg_read_wsrep_checkpoint(
+ rseg_hdr, trx_sys.recovered_wsrep_xid);
+#endif
+ }
+ }
+
+ if (srv_operation == SRV_OPERATION_RESTORE) {
+ /* mariabackup --prepare only deals with
+ the redo log and the data files, not with
+ transactions or the data dictionary. */
+ return DB_SUCCESS;
+ }
+
+ /* Initialize the undo log lists according to the rseg header */
+
+ rseg->curr_size = mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rseg_hdr->frame)
+ + 1;
+ if (dberr_t err = trx_undo_lists_init(rseg, max_trx_id, rseg_hdr)) {
+ return err;
+ }
+
+ if (auto len = flst_get_len(TRX_RSEG + TRX_RSEG_HISTORY
+ + rseg_hdr->frame)) {
+ trx_sys.rseg_history_len += len;
+
+ fil_addr_t node_addr = flst_get_last(TRX_RSEG
+ + TRX_RSEG_HISTORY
+ + rseg_hdr->frame);
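+ /* The list node is located at TRX_UNDO_HISTORY_NODE within the
+ undo log header; adjust boffset to point to the header start. */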
+ node_addr.boffset = static_cast<uint16_t>(
+ node_addr.boffset - TRX_UNDO_HISTORY_NODE);
+
+ rseg->last_page_no = node_addr.page;
+
+ const buf_block_t* block = trx_undo_page_get(
+ page_id_t(rseg->space->id, node_addr.page), mtr);
+
+ trx_id_t id = mach_read_from_8(block->frame + node_addr.boffset
+ + TRX_UNDO_TRX_ID);
+ if (id > max_trx_id) {
+ max_trx_id = id;
+ }
+ id = mach_read_from_8(block->frame + node_addr.boffset
+ + TRX_UNDO_TRX_NO);
+ if (id > max_trx_id) {
+ max_trx_id = id;
+ }
+
+ rseg->set_last_commit(node_addr.boffset, id);
+ unsigned purge = mach_read_from_2(block->frame
+ + node_addr.boffset
+ + TRX_UNDO_NEEDS_PURGE);
+ ut_ad(purge <= 1);
+ rseg->needs_purge = purge != 0;
+
+ if (rseg->last_page_no != FIL_NULL) {
+
+ /* There is no need to cover this operation by the purge
+ mutex because we are still bootstrapping. */
+ purge_sys.purge_queue.push(*rseg);
+ }
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Read binlog metadata from the TRX_SYS page, in case we are upgrading
+from MySQL or a MariaDB version older than 10.3.5. */
+static void trx_rseg_init_binlog_info(const page_t* page)
+{
+ if (mach_read_from_4(TRX_SYS + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
+ + page)
+ == TRX_SYS_MYSQL_LOG_MAGIC_N) {
+ memcpy(trx_sys.recovered_binlog_filename,
+ TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_NAME
+ + TRX_SYS + page, TRX_SYS_MYSQL_LOG_NAME_LEN);
+ trx_sys.recovered_binlog_offset = mach_read_from_8(
+ TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_OFFSET
+ + TRX_SYS + page);
+ }
+
+#ifdef WITH_WSREP
+ trx_rseg_init_wsrep_xid(page, trx_sys.recovered_wsrep_xid);
+#endif
+}
+
+/** Initialize or recover the rollback segments at startup. */
+dberr_t trx_rseg_array_init()
+{
+ trx_id_t max_trx_id = 0;
+
+ *trx_sys.recovered_binlog_filename = '\0';
+ trx_sys.recovered_binlog_offset = 0;
+#ifdef WITH_WSREP
+ trx_sys.recovered_wsrep_xid.null();
+ XID wsrep_sys_xid;
+ wsrep_sys_xid.null();
+ bool wsrep_xid_in_rseg_found = false;
+#endif
+ mtr_t mtr;
+ dberr_t err = DB_SUCCESS;
+
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ mtr.start();
+ if (const buf_block_t* sys = trx_sysf_get(&mtr, false)) {
+ if (rseg_id == 0) {
+ /* In case this is an upgrade from
+ before MariaDB 10.3.5, fetch the base
+ information from the TRX_SYS page. */
+ max_trx_id = mach_read_from_8(
+ TRX_SYS + TRX_SYS_TRX_ID_STORE
+ + sys->frame);
+ trx_rseg_init_binlog_info(sys->frame);
+#ifdef WITH_WSREP
+ wsrep_sys_xid.set(&trx_sys.recovered_wsrep_xid);
+#endif
+ }
+
+ const uint32_t page_no = trx_sysf_rseg_get_page_no(
+ sys, rseg_id);
+ if (page_no != FIL_NULL) {
+ trx_rseg_t* rseg = trx_rseg_mem_create(
+ rseg_id,
+ fil_space_get(trx_sysf_rseg_get_space(
+ sys, rseg_id)),
+ page_no);
+ ut_ad(rseg->is_persistent());
+ ut_ad(rseg->id == rseg_id);
+ ut_ad(!trx_sys.rseg_array[rseg_id]);
+ trx_sys.rseg_array[rseg_id] = rseg;
+ if ((err = trx_rseg_mem_restore(
+ rseg, max_trx_id, &mtr))
+ != DB_SUCCESS) {
+ mtr.commit();
+ break;
+ }
+#ifdef WITH_WSREP
+ if (!wsrep_sys_xid.is_null() &&
+ !wsrep_sys_xid.eq(&trx_sys.recovered_wsrep_xid)) {
+ wsrep_xid_in_rseg_found = true;
+ ut_ad(memcmp(wsrep_xid_uuid(&wsrep_sys_xid),
+ wsrep_xid_uuid(&trx_sys.recovered_wsrep_xid),
+ sizeof wsrep_uuid)
+ || wsrep_xid_seqno(
+ &wsrep_sys_xid)
+ <= wsrep_xid_seqno(
+ &trx_sys.recovered_wsrep_xid));
+ }
+#endif
+ }
+ }
+
+ mtr.commit();
+ }
+
+ if (err != DB_SUCCESS) {
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ if (trx_rseg_t*& rseg = trx_sys.rseg_array[rseg_id]) {
+ while (trx_undo_t* u= UT_LIST_GET_FIRST(
+ rseg->undo_list)) {
+ UT_LIST_REMOVE(rseg->undo_list, u);
+ ut_free(u);
+ }
+ trx_rseg_mem_free(rseg);
+ rseg = NULL;
+ }
+ }
+ return err;
+ }
+
+#ifdef WITH_WSREP
+ if (!wsrep_sys_xid.is_null()) {
+ /* Upgrade from a version prior to 10.3.5,
+ where WSREP XID was stored in TRX_SYS page.
+ If no rollback segment has a WSREP XID set,
+ we must copy the XID found in TRX_SYS page
+ to rollback segments. */
+ mtr.start();
+
+ if (!wsrep_xid_in_rseg_found) {
+ trx_rseg_update_wsrep_checkpoint(&wsrep_sys_xid, &mtr);
+ }
+
+ /* Finally, clear WSREP XID in TRX_SYS page. */
+ mtr.memset(trx_sysf_get(&mtr),
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO,
+ TRX_SYS_WSREP_XID_LEN, 0);
+ mtr.commit();
+ }
+#endif
+
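+ /* The next assigned transaction identifier must be strictly
+ greater than any identifier observed during recovery. */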
+ trx_sys.init_max_trx_id(max_trx_id + 1);
+ return DB_SUCCESS;
+}
+
+/** Create a persistent rollback segment.
+@param[in] space_id system or undo tablespace id
+@return pointer to new rollback segment
+@retval NULL on failure */
+trx_rseg_t*
+trx_rseg_create(ulint space_id)
+{
+ trx_rseg_t* rseg = NULL;
+ mtr_t mtr;
+
+ mtr.start();
+
+ fil_space_t* space = mtr_x_lock_space(space_id, &mtr);
+ ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
+
+ if (buf_block_t* sys_header = trx_sysf_get(&mtr)) {
+ ulint rseg_id = trx_sys_rseg_find_free(sys_header);
+ if (buf_block_t* rblock = rseg_id == ULINT_UNDEFINED
+ ? NULL
+ : trx_rseg_header_create(space, rseg_id, sys_header,
+ &mtr)) {
+ ut_ad(trx_sysf_rseg_get_space(sys_header, rseg_id)
+ == space_id);
+ rseg = trx_rseg_mem_create(rseg_id, space,
+ rblock->page.id().
+ page_no());
+ ut_ad(rseg->id == rseg_id);
+ ut_ad(rseg->is_persistent());
+ ut_ad(!trx_sys.rseg_array[rseg->id]);
+ trx_sys.rseg_array[rseg->id] = rseg;
+ }
+ }
+
+ mtr.commit();
+
+ return(rseg);
+}
+
+/** Create the temporary rollback segments. */
+void
+trx_temp_rseg_create()
+{
+ mtr_t mtr;
+
+ for (ulong i = 0; i < TRX_SYS_N_RSEGS; i++) {
+ mtr.start();
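+ /* Changes to the temporary tablespace are never redo-logged. */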
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ mtr_x_lock_space(fil_system.temp_space, &mtr);
+
+ buf_block_t* rblock = trx_rseg_header_create(
+ fil_system.temp_space, i, NULL, &mtr);
+ trx_rseg_t* rseg = trx_rseg_mem_create(
+ i, fil_system.temp_space, rblock->page.id().page_no());
+ ut_ad(!rseg->is_persistent());
+ ut_ad(!trx_sys.temp_rsegs[i]);
+ trx_sys.temp_rsegs[i] = rseg;
+ mtr.commit();
+ }
+}
+
+/** Update the offset information about the end of the binlog entry
+which corresponds to the transaction just being committed.
+In a replication slave, this updates the master binlog position
+up to which replication has proceeded.
+@param[in,out] rseg_header rollback segment header
+@param[in] trx committing transaction
+@param[in,out] mtr mini-transaction */
+void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx,
+ mtr_t *mtr)
+{
+ DBUG_LOG("trx", "trx_mysql_binlog_offset: " << trx->mysql_log_offset);
+
+ const size_t len = strlen(trx->mysql_log_file_name) + 1;
+
+ ut_ad(len > 1);
+
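+ /* A binlog file name that does not fit in the fixed-size slot
+ is not stored. */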
+ if (UNIV_UNLIKELY(len > TRX_RSEG_BINLOG_NAME_LEN)) {
+ return;
+ }
+
+ mtr->write<8,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_BINLOG_OFFSET
+ + rseg_header->frame,
+ trx->mysql_log_offset);
+
+ void* name = TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_header->frame;
+
+ if (memcmp(trx->mysql_log_file_name, name, len)) {
+ mtr->memcpy(*rseg_header, name, trx->mysql_log_file_name, len);
+ }
+}
diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc
new file mode 100644
index 00000000..3064645f
--- /dev/null
+++ b/storage/innobase/trx/trx0sys.cc
@@ -0,0 +1,339 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0sys.cc
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0sys.h"
+#include "mysqld.h"
+#include "sql_error.h"
+
+#include "fsp0fsp.h"
+#include "mtr0log.h"
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "os0file.h"
+
+/** The transaction system */
+trx_sys_t trx_sys;
+
+/** Check whether transaction id is valid.
+@param[in] id transaction id to check
+@param[in] name table name */
+void
+ReadViewBase::check_trx_id_sanity(
+ trx_id_t id,
+ const table_name_t& name)
+{
+ if (id >= trx_sys.get_max_trx_id()) {
+
+ ib::warn() << "A transaction id"
+ << " in a record of table "
+ << name
+ << " is newer than the"
+ << " system-wide maximum.";
+ ut_ad(0);
+ THD *thd = current_thd;
+ if (thd != NULL) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name),
+ name.m_name);
+
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_SIGNAL_WARN,
+ "InnoDB: Transaction id"
+ " in a record of table"
+ " %s is newer than system-wide"
+ " maximum.", table_name);
+ }
+ }
+}
+
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior for debugging. */
+uint trx_rseg_n_slots_debug = 0;
+#endif
+
+/** Display the MySQL binlog offset info if it is present in the trx
+system header. */
+void
+trx_sys_print_mysql_binlog_offset()
+{
+ if (!*trx_sys.recovered_binlog_filename) {
+ return;
+ }
+
+ ib::info() << "Last binlog file '"
+ << trx_sys.recovered_binlog_filename
+ << "', position "
+ << trx_sys.recovered_binlog_offset;
+}
+
+/** Find an available rollback segment.
+@param[in]	sys_header	the TRX_SYS page
+@return an unallocated rollback segment slot in the TRX_SYS header
+@retval ULINT_UNDEFINED if not found */
+ulint
+trx_sys_rseg_find_free(const buf_block_t* sys_header)
+{
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ if (trx_sysf_rseg_get_page_no(sys_header, rseg_id)
+ == FIL_NULL) {
+ return rseg_id;
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Count the number of initialized persistent rollback segment slots. */
+static
+void
+trx_sysf_get_n_rseg_slots()
+{
+ mtr_t mtr;
+ mtr.start();
+
+ srv_available_undo_logs = 0;
+ if (const buf_block_t* sys_header = trx_sysf_get(&mtr, false)) {
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ srv_available_undo_logs
+ += trx_sysf_rseg_get_page_no(sys_header,
+ rseg_id)
+ != FIL_NULL;
+ }
+ }
+
+ mtr.commit();
+}
+
+/*****************************************************************//**
+Creates the file page for the transaction system. This function is called only
+at database creation, before trx_sys_init. */
+static
+void
+trx_sysf_create(
+/*============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint slot_no;
+ buf_block_t* block;
+
+ ut_ad(mtr);
+
+ /* Note that below we first reserve the file space x-latch, and
+ then enter the kernel: we must do it in this order to conform
+ to the latching order rules. */
+
+ mtr_x_lock_space(fil_system.sys_space, mtr);
+ compile_time_assert(TRX_SYS_SPACE == 0);
+
+ /* Create the trx sys file block in a new allocated file segment */
+ block = fseg_create(fil_system.sys_space,
+ TRX_SYS + TRX_SYS_FSEG_HEADER,
+ mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+
+ ut_a(block->page.id() == page_id_t(0, TRX_SYS_PAGE_NO));
+
+ mtr->write<2>(*block, FIL_PAGE_TYPE + block->frame,
+ FIL_PAGE_TYPE_TRX_SYS);
+
+ ut_ad(!mach_read_from_4(block->frame
+ + TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_MAGIC));
+
+ /* Reset the rollback segment slots. Old versions of InnoDB
+ (before MySQL 5.5) define TRX_SYS_N_RSEGS as 256 and expect
+ that the whole array is initialized. */
+ compile_time_assert(256 >= TRX_SYS_N_RSEGS);
+ compile_time_assert(TRX_SYS + TRX_SYS_RSEGS
+ + 256 * TRX_SYS_RSEG_SLOT_SIZE
+ <= UNIV_PAGE_SIZE_MIN - FIL_PAGE_DATA_END);
+ mtr->memset(block, TRX_SYS + TRX_SYS_RSEGS,
+ 256 * TRX_SYS_RSEG_SLOT_SIZE, 0xff);
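+ /* The 0xff bytes make every slot read as FIL_NULL (unallocated). */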
+ /* Initialize all of the page. This part used to be uninitialized. */
+ mtr->memset(block, TRX_SYS + TRX_SYS_RSEGS
+ + 256 * TRX_SYS_RSEG_SLOT_SIZE,
+ srv_page_size
+ - (FIL_PAGE_DATA_END + TRX_SYS + TRX_SYS_RSEGS
+ + 256 * TRX_SYS_RSEG_SLOT_SIZE),
+ 0);
+
+ /* Create the first rollback segment in the SYSTEM tablespace */
+ slot_no = trx_sys_rseg_find_free(block);
+ buf_block_t* rblock = trx_rseg_header_create(fil_system.sys_space,
+ slot_no, block, mtr);
+
+ ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
+ ut_a(rblock->page.id() == page_id_t(0, FSP_FIRST_RSEG_PAGE_NO));
+}
+
+/** Create the instance */
+void
+trx_sys_t::create()
+{
+ ut_ad(this == &trx_sys);
+ ut_ad(!is_initialised());
+ m_initialised = true;
+ trx_list.create();
+ rseg_history_len= 0;
+
+ rw_trx_hash.init();
+}
+
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+void
+trx_sys_create_sys_pages(void)
+/*==========================*/
+{
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ trx_sysf_create(&mtr);
+
+ mtr_commit(&mtr);
+}
+
+/** Create the rollback segments.
+@return whether the creation succeeded */
+bool
+trx_sys_create_rsegs()
+{
+ /* srv_available_undo_logs reflects the number of persistent
+ rollback segments that have been initialized in the
+ transaction system header page. */
+ ut_ad(srv_undo_tablespaces <= TRX_SYS_MAX_UNDO_SPACES);
+
+ if (high_level_read_only) {
+ srv_available_undo_logs = 0;
+ return(true);
+ }
+
+ /* This is executed in single-threaded mode; therefore it is not
+ necessary to use the same mtr in trx_rseg_create(). The number of
+ initialized slots cannot change while this function is executing. */
+ trx_sysf_get_n_rseg_slots();
+
+ ut_ad(srv_available_undo_logs <= TRX_SYS_N_RSEGS);
+
+ /* The first persistent rollback segment is always initialized
+ in the system tablespace. */
+ ut_a(srv_available_undo_logs > 0);
+
+ for (ulint i = 0; srv_available_undo_logs < TRX_SYS_N_RSEGS;
+ i++, srv_available_undo_logs++) {
+ /* Tablespace 0 is the system tablespace.
+ Dedicated undo log tablespaces start from 1. */
+ ulint space = srv_undo_tablespaces > 0
+ ? (i % srv_undo_tablespaces)
+ + srv_undo_space_id_start
+ : TRX_SYS_SPACE;
+
+ if (!trx_rseg_create(space)) {
+ ib::error() << "Unable to allocate the"
+ " requested innodb_undo_logs";
+ return(false);
+ }
+
+ /* Increase the number of active undo
+ tablespaces in case a new rollback segment
+ was assigned to a new undo tablespace. */
+ if (space > srv_undo_tablespaces_active) {
+ srv_undo_tablespaces_active++;
+
+ ut_ad(srv_undo_tablespaces_active == space);
+ }
+ }
+
+ ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
+
+ ib::info info;
+ info << srv_available_undo_logs;
+ if (srv_undo_tablespaces_active) {
+ info << " rollback segments in " << srv_undo_tablespaces_active
+ << " undo tablespaces are active.";
+ } else {
+ info << " rollback segments are active.";
+ }
+
+ return(true);
+}
+
+/** Close the transaction system on shutdown */
+void
+trx_sys_t::close()
+{
+ ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
+ if (!is_initialised()) {
+ return;
+ }
+
+ if (size_t size = view_count()) {
+ ib::error() << "All read views were not closed before"
+ " shutdown: " << size << " read views open";
+ }
+
+ rw_trx_hash.destroy();
+
+ /* There can't be any active transactions. */
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ if (trx_rseg_t* rseg = rseg_array[i]) {
+ trx_rseg_mem_free(rseg);
+ }
+
+ if (trx_rseg_t* rseg = temp_rsegs[i]) {
+ trx_rseg_mem_free(rseg);
+ }
+ }
+
+ ut_a(trx_list.empty());
+ trx_list.close();
+ m_initialised = false;
+}
+
+/** @return total number of active (non-prepared) transactions */
+ulint trx_sys_t::any_active_transactions()
+{
+ uint32_t total_trx= 0;
+
+ trx_sys.trx_list.for_each([&total_trx](const trx_t &trx) {
+ if (trx.state == TRX_STATE_COMMITTED_IN_MEMORY ||
+ (trx.state == TRX_STATE_ACTIVE && trx.id))
+ total_trx++;
+ });
+
+ return total_trx;
+}
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
new file mode 100644
index 00000000..cf8fa17c
--- /dev/null
+++ b/storage/innobase/trx/trx0trx.cc
@@ -0,0 +1,2300 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0trx.cc
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+
+#ifdef WITH_WSREP
+#include <mysql/service_wsrep.h>
+#endif
+
+#include <mysql/service_thd_error_context.h>
+
+#include "btr0sea.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "que0que.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "trx0xa.h"
+#include "ut0pool.h"
+#include "ut0vec.h"
+
+#include <set>
+#include <new>
+
+/** The bit pattern corresponding to TRX_ID_MAX */
+const byte trx_id_max_bytes[8] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+/** The bit pattern corresponding to max timestamp */
+const byte timestamp_max_bytes[7] = {
+ 0x7f, 0xff, 0xff, 0xff, 0x0f, 0x42, 0x3f
+};
+
+
+static const ulint MAX_DETAILED_ERROR_LEN = 256;
+
+/** Set of table_id */
+typedef std::set<
+ table_id_t,
+ std::less<table_id_t>,
+ ut_allocator<table_id_t> > table_id_set;
+
+/*************************************************************//**
+Set detailed error message for the transaction. */
+void
+trx_set_detailed_error(
+/*===================*/
+ trx_t* trx, /*!< in: transaction struct */
+ const char* msg) /*!< in: detailed error message */
+{
+ strncpy(trx->detailed_error, msg, MAX_DETAILED_ERROR_LEN - 1);
+ trx->detailed_error[MAX_DETAILED_ERROR_LEN - 1] = '\0';
+}
+
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction struct */
+ FILE* file) /*!< in: file to read message from */
+{
+ os_file_read_string(file, trx->detailed_error, MAX_DETAILED_ERROR_LEN);
+}
+
+/********************************************************************//**
+Initialize transaction object.
+@param trx trx to initialize */
+static
+void
+trx_init(
+/*=====*/
+ trx_t* trx)
+{
+ trx->state = TRX_STATE_NOT_STARTED;
+
+ trx->is_recovered = false;
+
+ trx->op_info = "";
+
+ trx->active_commit_ordered = false;
+
+ trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ trx->check_foreigns = true;
+
+ trx->check_unique_secondary = true;
+
+ trx->lock.n_rec_locks = 0;
+
+ trx->dict_operation = TRX_DICT_OP_NONE;
+
+ trx->table_id = 0;
+
+ trx->error_state = DB_SUCCESS;
+
+ trx->error_key_num = ULINT_UNDEFINED;
+
+ trx->undo_no = 0;
+
+ trx->rsegs.m_redo.rseg = NULL;
+
+ trx->rsegs.m_noredo.rseg = NULL;
+
+ trx->read_only = false;
+
+ trx->auto_commit = false;
+
+ trx->will_lock = false;
+
+ trx->ddl = false;
+
+ trx->internal = false;
+
+ ut_d(trx->start_file = 0);
+
+ ut_d(trx->start_line = 0);
+
+ trx->magic_n = TRX_MAGIC_N;
+
+ trx->lock.que_state = TRX_QUE_RUNNING;
+
+ trx->last_sql_stat_start.least_undo_no = 0;
+
+ ut_ad(!trx->read_view.is_open());
+
+ trx->lock.rec_cached = 0;
+
+ trx->lock.table_cached = 0;
+#ifdef WITH_WSREP
+ ut_ad(!trx->wsrep);
+ ut_ad(!trx->wsrep_UK_scan);
+#endif /* WITH_WSREP */
+}
+
+/** For managing the life-cycle of the trx_t instance that we get
+from the pool. */
+struct TrxFactory {
+
+ /** Initializes a transaction object. It must be explicitly started
+ with trx_start_if_not_started() before using it. The default isolation
+ level is TRX_ISO_REPEATABLE_READ.
+ @param trx Transaction instance to initialise */
+ static void init(trx_t* trx)
+ {
+ /* Explicitly call the constructor of the already
+ allocated object. trx_t objects are allocated by
+ ut_zalloc_nokey() in Pool::Pool() which would not call
+ the constructors of the trx_t members. */
+ new(&trx->mod_tables) trx_mod_tables_t();
+
+ new(&trx->lock.table_locks) lock_list();
+
+ new(&trx->read_view) ReadView();
+
+ trx->rw_trx_hash_pins = 0;
+ trx_init(trx);
+
+ trx->dict_operation_lock_mode = 0;
+
+ trx->xid = UT_NEW_NOKEY(xid_t());
+
+ trx->detailed_error = reinterpret_cast<char*>(
+ ut_zalloc_nokey(MAX_DETAILED_ERROR_LEN));
+
+ trx->lock.lock_heap = mem_heap_create_typed(
+ 1024, MEM_HEAP_FOR_LOCK_HEAP);
+
+ lock_trx_lock_list_init(&trx->lock.trx_locks);
+
+ UT_LIST_INIT(trx->lock.evicted_tables,
+ &dict_table_t::table_LRU);
+
+ UT_LIST_INIT(
+ trx->trx_savepoints,
+ &trx_named_savept_t::trx_savepoints);
+
+ mutex_create(LATCH_ID_TRX, &trx->mutex);
+ }
+
+ /** Release resources held by the transaction object.
+ @param trx the transaction for which to release resources */
+ static void destroy(trx_t* trx)
+ {
+#ifdef __SANITIZE_ADDRESS__
+ /* Unpoison the memory for AddressSanitizer */
+ MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
+#elif !__has_feature(memory_sanitizer)
+ /* In Valgrind, we cannot cancel MEM_NOACCESS() without
+ changing the state of the V bits (which indicate
+ which bits are initialized).
+ We will declare the contents as initialized.
+ We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
+ MEM_MAKE_DEFINED(trx, sizeof *trx);
+#endif
+
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+ ut_ad(!trx->mysql_thd);
+
+ ut_a(trx->lock.wait_lock == NULL);
+ ut_a(trx->lock.wait_thr == NULL);
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ if (trx->lock.lock_heap != NULL) {
+ mem_heap_free(trx->lock.lock_heap);
+ trx->lock.lock_heap = NULL;
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
+
+ UT_DELETE(trx->xid);
+ ut_free(trx->detailed_error);
+
+ mutex_free(&trx->mutex);
+
+ trx->mod_tables.~trx_mod_tables_t();
+
+ ut_ad(!trx->read_view.is_open());
+
+ trx->lock.table_locks.~lock_list();
+
+ trx->read_view.~ReadView();
+ }
+};
+
+/** The lock strategy for TrxPool */
+struct TrxPoolLock {
+ TrxPoolLock() { }
+
+ /** Create the mutex */
+ void create()
+ {
+ mutex_create(LATCH_ID_TRX_POOL, &m_mutex);
+ }
+
+ /** Acquire the mutex */
+ void enter() { mutex_enter(&m_mutex); }
+
+ /** Release the mutex */
+ void exit() { mutex_exit(&m_mutex); }
+
+ /** Free the mutex */
+ void destroy() { mutex_free(&m_mutex); }
+
+ /** Mutex to use */
+ ib_mutex_t m_mutex;
+};
+
+/** The lock strategy for the TrxPoolManager */
+struct TrxPoolManagerLock {
+ TrxPoolManagerLock() { }
+
+ /** Create the mutex */
+ void create()
+ {
+ mutex_create(LATCH_ID_TRX_POOL_MANAGER, &m_mutex);
+ }
+
+ /** Acquire the mutex */
+ void enter() { mutex_enter(&m_mutex); }
+
+ /** Release the mutex */
+ void exit() { mutex_exit(&m_mutex); }
+
+ /** Free the mutex */
+ void destroy() { mutex_free(&m_mutex); }
+
+ /** Mutex to use */
+ ib_mutex_t m_mutex;
+};
+
+/** Use explicit mutexes for the trx_t pool and its manager. */
+typedef Pool<trx_t, TrxFactory, TrxPoolLock> trx_pool_t;
+typedef PoolManager<trx_pool_t, TrxPoolManagerLock > trx_pools_t;
+
+/** The trx_t pool manager */
+static trx_pools_t* trx_pools;
+
+/** Size of one trx_t pool in bytes. */
+static const ulint MAX_TRX_BLOCK_SIZE = 1024 * 1024 * 4;
+
+/** Create the trx_t pool */
+void
+trx_pool_init()
+{
+ trx_pools = UT_NEW_NOKEY(trx_pools_t(MAX_TRX_BLOCK_SIZE));
+
+ ut_a(trx_pools != 0);
+}
+
+/** Destroy the trx_t pool */
+void
+trx_pool_close()
+{
+ UT_DELETE(trx_pools);
+
+ trx_pools = 0;
+}
+
+/** @return an allocated transaction */
+trx_t *trx_create()
+{
+ trx_t* trx = trx_pools->get();
+
+#ifdef __SANITIZE_ADDRESS__
+ /* Unpoison the memory for AddressSanitizer.
+ It may have been poisoned in trx_t::free().*/
+ MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
+#elif !__has_feature(memory_sanitizer)
+ /* In Valgrind, we cannot cancel MEM_NOACCESS() without
+ changing the state of the V bits (which indicate
+ which bits are initialized).
+ We will declare the contents as initialized.
+ We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
+ MEM_MAKE_DEFINED(trx, sizeof *trx);
+#endif
+
+ trx->assert_freed();
+
+ mem_heap_t* heap;
+ ib_alloc_t* alloc;
+
+ /* We just got trx from the pool; it should be non-locking. */
+ ut_ad(!trx->will_lock);
+ ut_ad(!trx->rw_trx_hash_pins);
+
+ DBUG_LOG("trx", "Create: " << trx);
+
+ heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8);
+
+ alloc = ib_heap_allocator_create(heap);
+
+ trx->autoinc_locks = ib_vector_create(alloc, sizeof(void**), 4);
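+ /* The AUTO-INC lock vector lives in the dedicated heap created
+ above; both are freed together in trx_t::free(). */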
+
+ ut_ad(trx->mod_tables.empty());
+ ut_ad(trx->lock.n_rec_locks == 0);
+ ut_ad(trx->lock.table_cached == 0);
+ ut_ad(trx->lock.rec_cached == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
+
+#ifdef WITH_WSREP
+ ut_ad(!trx->wsrep_UK_scan);
+#endif /* WITH_WSREP */
+
+ trx_sys.register_trx(trx);
+
+ return(trx);
+}
+
+/** Free the memory to trx_pools */
+void trx_t::free()
+{
+ MEM_CHECK_DEFINED(this, sizeof *this);
+
+ ut_ad(!n_mysql_tables_in_use);
+ ut_ad(!mysql_log_file_name);
+ ut_ad(!mysql_n_tables_locked);
+ ut_ad(!internal);
+ ut_ad(!will_lock);
+ ut_ad(error_state == DB_SUCCESS);
+ ut_ad(magic_n == TRX_MAGIC_N);
+ ut_ad(!read_only);
+ ut_ad(!lock.wait_lock);
+
+ dict_operation= TRX_DICT_OP_NONE;
+ trx_sys.deregister_trx(this);
+ assert_freed();
+ trx_sys.rw_trx_hash.put_pins(this);
+
+ mysql_thd= nullptr;
+
+ // FIXME: We need to avoid this heap free/alloc for each commit.
+ if (autoinc_locks)
+ {
+ ut_ad(ib_vector_is_empty(autoinc_locks));
+ /* We allocated a dedicated heap for the vector. */
+ ib_vector_free(autoinc_locks);
+ autoinc_locks= NULL;
+ }
+
+ mod_tables.clear();
+
+ MEM_NOACCESS(&n_ref, sizeof n_ref);
+ /* do not poison mutex */
+ MEM_NOACCESS(&id, sizeof id);
+ MEM_NOACCESS(&state, sizeof state);
+ MEM_NOACCESS(&is_recovered, sizeof is_recovered);
+#ifdef WITH_WSREP
+ MEM_NOACCESS(&wsrep, sizeof wsrep);
+#endif
+ read_view.mem_noaccess();
+ MEM_NOACCESS(&lock, sizeof lock);
+ MEM_NOACCESS(&op_info, sizeof op_info);
+ MEM_NOACCESS(&isolation_level, sizeof isolation_level);
+ MEM_NOACCESS(&check_foreigns, sizeof check_foreigns);
+ MEM_NOACCESS(&is_registered, sizeof is_registered);
+ MEM_NOACCESS(&active_commit_ordered, sizeof active_commit_ordered);
+ MEM_NOACCESS(&check_unique_secondary, sizeof check_unique_secondary);
+ MEM_NOACCESS(&flush_log_later, sizeof flush_log_later);
+ MEM_NOACCESS(&must_flush_log_later, sizeof must_flush_log_later);
+ MEM_NOACCESS(&duplicates, sizeof duplicates);
+ MEM_NOACCESS(&dict_operation, sizeof dict_operation);
+ MEM_NOACCESS(&dict_operation_lock_mode, sizeof dict_operation_lock_mode);
+ MEM_NOACCESS(&start_time, sizeof start_time);
+ MEM_NOACCESS(&start_time_micro, sizeof start_time_micro);
+ MEM_NOACCESS(&commit_lsn, sizeof commit_lsn);
+ MEM_NOACCESS(&table_id, sizeof table_id);
+ MEM_NOACCESS(&mysql_thd, sizeof mysql_thd);
+ MEM_NOACCESS(&mysql_log_file_name, sizeof mysql_log_file_name);
+ MEM_NOACCESS(&mysql_log_offset, sizeof mysql_log_offset);
+ MEM_NOACCESS(&n_mysql_tables_in_use, sizeof n_mysql_tables_in_use);
+ MEM_NOACCESS(&mysql_n_tables_locked, sizeof mysql_n_tables_locked);
+ MEM_NOACCESS(&error_state, sizeof error_state);
+ MEM_NOACCESS(&error_info, sizeof error_info);
+ MEM_NOACCESS(&error_key_num, sizeof error_key_num);
+ MEM_NOACCESS(&graph, sizeof graph);
+ MEM_NOACCESS(&trx_savepoints, sizeof trx_savepoints);
+ MEM_NOACCESS(&undo_no, sizeof undo_no);
+ MEM_NOACCESS(&last_sql_stat_start, sizeof last_sql_stat_start);
+ MEM_NOACCESS(&rsegs, sizeof rsegs);
+ MEM_NOACCESS(&roll_limit, sizeof roll_limit);
+ MEM_NOACCESS(&in_rollback, sizeof in_rollback);
+ MEM_NOACCESS(&pages_undone, sizeof pages_undone);
+ MEM_NOACCESS(&n_autoinc_rows, sizeof n_autoinc_rows);
+ MEM_NOACCESS(&autoinc_locks, sizeof autoinc_locks);
+ MEM_NOACCESS(&read_only, sizeof read_only);
+ MEM_NOACCESS(&auto_commit, sizeof auto_commit);
+ MEM_NOACCESS(&will_lock, sizeof will_lock);
+ MEM_NOACCESS(&fts_trx, sizeof fts_trx);
+ MEM_NOACCESS(&fts_next_doc_id, sizeof fts_next_doc_id);
+ MEM_NOACCESS(&flush_tables, sizeof flush_tables);
+ MEM_NOACCESS(&ddl, sizeof ddl);
+ MEM_NOACCESS(&internal, sizeof internal);
+#ifdef UNIV_DEBUG
+ MEM_NOACCESS(&start_line, sizeof start_line);
+ MEM_NOACCESS(&start_file, sizeof start_file);
+#endif /* UNIV_DEBUG */
+ MEM_NOACCESS(&xid, sizeof xid);
+ MEM_NOACCESS(&mod_tables, sizeof mod_tables);
+ MEM_NOACCESS(&detailed_error, sizeof detailed_error);
+#ifdef WITH_WSREP
+ ut_ad(!wsrep_UK_scan);
+ MEM_NOACCESS(&wsrep_UK_scan, sizeof wsrep_UK_scan);
+#endif /* WITH_WSREP */
+ MEM_NOACCESS(&magic_n, sizeof magic_n);
+ trx_pools->mem_free(this);
+}
+
+/** Transition to committed state, to release implicit locks. */
+inline void trx_t::commit_state()
+{
+ ut_ad(state == TRX_STATE_PREPARED
+ || state == TRX_STATE_PREPARED_RECOVERED
+ || state == TRX_STATE_ACTIVE);
+ /* This makes the transaction committed in memory and makes its
+ changes to data visible to other transactions. NOTE that there is a
+ small discrepancy from the strict formal visibility rules here: a
+ user of the database can see modifications made by another
+ transaction T even before the necessary redo log segment has been
+ flushed to the disk. If the database happens to crash before the
+ flush, the user has seen modifications from T which will never be a
+ committed transaction. However, any transaction T2 which sees the
+ modifications of the committing transaction T, and which also itself
+ makes modifications to the database, will get an lsn larger than the
+ committing transaction T. In the case where the log flush fails, and
+ T never gets committed, also T2 will never get committed. */
+ trx_mutex_enter(this);
+ state= TRX_STATE_COMMITTED_IN_MEMORY;
+ trx_mutex_exit(this);
+ ut_ad(id || !is_referenced());
+}
+
+/** Release any explicit locks of a committing transaction. */
+inline void trx_t::release_locks()
+{
+ DBUG_ASSERT(state == TRX_STATE_COMMITTED_IN_MEMORY);
+ DBUG_ASSERT(!is_referenced());
+
+ if (UT_LIST_GET_LEN(lock.trx_locks))
+ {
+ lock_release(this);
+ lock.n_rec_locks = 0;
+ ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+ ut_ad(ib_vector_is_empty(autoinc_locks));
+ mem_heap_empty(lock.lock_heap);
+ }
+
+ lock.table_locks.clear();
+}
+
+/** At shutdown, frees a transaction object. */
+void
+trx_free_at_shutdown(trx_t *trx)
+{
+ ut_ad(trx->is_recovered);
+ ut_a(trx_state_eq(trx, TRX_STATE_PREPARED)
+ || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)
+ || (trx_state_eq(trx, TRX_STATE_ACTIVE)
+ && (!srv_was_started
+ || srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_EXPORT
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+ || (!srv_is_being_started
+ && !srv_undo_sources && srv_fast_shutdown))));
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+
+ trx->commit_state();
+ trx->release_locks();
+ trx_undo_free_at_shutdown(trx);
+
+ ut_a(!trx->read_only);
+
+ DBUG_LOG("trx", "Free prepared: " << trx);
+ trx->state = TRX_STATE_NOT_STARTED;
+ ut_ad(!UT_LIST_GET_LEN(trx->lock.trx_locks));
+ trx->id = 0;
+ trx->free();
+}
+
+
+/**
+ Disconnect a prepared transaction from MySQL
+ @param[in,out] trx transaction
+*/
+void trx_disconnect_prepared(trx_t *trx)
+{
+ ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->mysql_log_file_name);
+ trx->read_view.close();
+ trx->is_recovered= true;
+ trx->mysql_thd= NULL;
+ /* todo/fixme: suggest to do it at innodb prepare */
+ trx->will_lock= false;
+ trx_sys.rw_trx_hash.put_pins(trx);
+}
+
+/****************************************************************//**
+Resurrect the table locks for a resurrected transaction. */
+static
+void
+trx_resurrect_table_locks(
+/*======================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const trx_undo_t* undo) /*!< in: undo log */
+{
+ mtr_t mtr;
+ table_id_set tables;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(undo->rseg == trx->rsegs.m_redo.rseg);
+
+ if (undo->empty()) {
+ return;
+ }
+
+ mtr_start(&mtr);
+
+ /* trx_rseg_mem_create() may have acquired an X-latch on this
+ page, so we cannot acquire an S-latch. */
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(trx->rsegs.m_redo.rseg->space->id,
+ undo->top_page_no), &mtr);
+ buf_block_t* undo_block = block;
+ trx_undo_rec_t* undo_rec = block->frame + undo->top_offset;
+
+ do {
+ ulint type;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ ulint cmpl_info;
+ bool updated_extern;
+
+ if (undo_block != block) {
+ mtr.memo_release(undo_block, MTR_MEMO_PAGE_X_FIX);
+ undo_block = block;
+ }
+
+ trx_undo_rec_get_pars(
+ undo_rec, &type, &cmpl_info,
+ &updated_extern, &undo_no, &table_id);
+ tables.insert(table_id);
+
+ undo_rec = trx_undo_get_prev_rec(
+ block, page_offset(undo_rec), undo->hdr_page_no,
+ undo->hdr_offset, false, &mtr);
+ } while (undo_rec);
+
+ mtr_commit(&mtr);
+
+ for (table_id_set::const_iterator i = tables.begin();
+ i != tables.end(); i++) {
+ if (dict_table_t* table = dict_table_open_on_id(
+ *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) {
+ if (!table->is_readable()) {
+ mutex_enter(&dict_sys.mutex);
+ dict_table_close(table, TRUE, FALSE);
+ dict_sys.remove(table);
+ mutex_exit(&dict_sys.mutex);
+ continue;
+ }
+
+ if (trx->state == TRX_STATE_PREPARED) {
+ trx->mod_tables.insert(
+ trx_mod_tables_t::value_type(table,
+ 0));
+ }
+ lock_table_ix_resurrect(table, trx);
+
+ DBUG_LOG("ib_trx",
+ "resurrect " << ib::hex(trx->id)
+ << " IX lock on " << table->name);
+
+ dict_table_close(table, FALSE, FALSE);
+ }
+ }
+}
+
+
+/**
+ Resurrect the transactions that were doing inserts/updates at the time of
+ the crash; they need to be undone.
+*/
+
+static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg,
+ time_t start_time, ulonglong start_time_micro,
+ uint64_t *rows_to_undo)
+{
+ trx_state_t state;
+ /*
+ This is single-threaded startup code, we do not need the
+ protection of trx->mutex here.
+ */
+ switch (undo->state)
+ {
+ case TRX_UNDO_ACTIVE:
+ state= TRX_STATE_ACTIVE;
+ break;
+ case TRX_UNDO_PREPARED:
+ /*
+ Prepared transactions are left in the prepared state
+ waiting for a commit or abort decision from MySQL
+ */
+ ib::info() << "Transaction " << undo->trx_id
+ << " was in the XA prepared state.";
+
+ state= TRX_STATE_PREPARED;
+ break;
+ default:
+ return;
+ }
+
+ trx_t *trx= trx_create();
+ trx->state= state;
+ ut_d(trx->start_file= __FILE__);
+ ut_d(trx->start_line= __LINE__);
+
+ trx->rsegs.m_redo.undo= undo;
+ trx->undo_no= undo->top_undo_no + 1;
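+ /* Continue undo numbering after the highest recovered undo record. */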
+ trx->rsegs.m_redo.rseg= rseg;
+ /*
+ Transactions with active data will not have rseg size = 1, nor will
+ they qualify for the purge limit criteria. So it is safe to increment
+ this trx_ref_count without mutex protection.
+ */
+ ++trx->rsegs.m_redo.rseg->trx_ref_count;
+ *trx->xid= undo->xid;
+ trx->id= undo->trx_id;
+ trx->is_recovered= true;
+ trx->start_time= start_time;
+ trx->start_time_micro= start_time_micro;
+
+ if (undo->dict_operation)
+ {
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ if (!trx->table_id)
+ trx->table_id= undo->table_id;
+ }
+
+ trx_sys.rw_trx_hash.insert(trx);
+ trx_sys.rw_trx_hash.put_pins(trx);
+ trx_resurrect_table_locks(trx, undo);
+ if (trx_state_eq(trx, TRX_STATE_ACTIVE))
+ *rows_to_undo+= trx->undo_no;
+}
+
+
+/** Initialize (resurrect) transactions at startup. */
+dberr_t trx_lists_init_at_db_start()
+{
+ ut_a(srv_is_being_started);
+ ut_ad(!srv_was_started);
+
+ if (srv_operation == SRV_OPERATION_RESTORE) {
+ /* mariabackup --prepare only deals with
+ the redo log and the data files, not with
+ transactions or the data dictionary. */
+ return trx_rseg_array_init();
+ }
+
+ if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
+ return DB_SUCCESS;
+ }
+
+ purge_sys.create();
+ if (dberr_t err = trx_rseg_array_init()) {
+ ib::info() << "Retry with innodb_force_recovery=5";
+ return err;
+ }
+
+ /* Look from the rollback segments if there exist undo logs for
+ transactions. */
+ const time_t start_time = time(NULL);
+ const ulonglong start_time_micro= microsecond_interval_timer();
+ uint64_t rows_to_undo = 0;
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ trx_undo_t* undo;
+ trx_rseg_t* rseg = trx_sys.rseg_array[i];
+
+ /* Some rollback segment may be unavailable,
+ especially if the server was previously run with a
+ non-default value of innodb_undo_logs. */
+ if (rseg == NULL) {
+ continue;
+ }
+ /* Resurrect other transactions. */
+ for (undo = UT_LIST_GET_FIRST(rseg->undo_list);
+ undo != NULL;
+ undo = UT_LIST_GET_NEXT(undo_list, undo)) {
+ trx_t *trx = trx_sys.find(0, undo->trx_id, false);
+ if (!trx) {
+ trx_resurrect(undo, rseg, start_time,
+ start_time_micro, &rows_to_undo);
+ } else {
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(trx->start_time == start_time);
+ ut_ad(trx->is_recovered);
+ ut_ad(trx->rsegs.m_redo.rseg == rseg);
+ ut_ad(trx->rsegs.m_redo.rseg->trx_ref_count);
+
+ trx->rsegs.m_redo.undo = undo;
+ if (undo->top_undo_no >= trx->undo_no) {
+ if (trx_state_eq(trx,
+ TRX_STATE_ACTIVE)) {
+ rows_to_undo -= trx->undo_no;
+ rows_to_undo +=
+ undo->top_undo_no + 1;
+ }
+
+ trx->undo_no = undo->top_undo_no + 1;
+ }
+ trx_resurrect_table_locks(trx, undo);
+ }
+ }
+ }
+
+ if (const auto size = trx_sys.rw_trx_hash.size()) {
+ ib::info() << size
+ << " transaction(s) which must be rolled back or"
+ " cleaned up in total " << rows_to_undo
+ << " row operations to undo";
+ ib::info() << "Trx id counter is " << trx_sys.get_max_trx_id();
+ }
+
+ purge_sys.clone_oldest_view();
+ return DB_SUCCESS;
+}
+
+/** Assign a persistent rollback segment in a round-robin fashion,
+evenly distributed between 0 and innodb_undo_logs-1
+@return persistent rollback segment
+@retval NULL if innodb_read_only */
+static trx_rseg_t* trx_assign_rseg_low()
+{
+ if (high_level_read_only) {
+ ut_ad(!srv_available_undo_logs);
+ return(NULL);
+ }
+
+ ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
+
+ /* The first slot is always assigned to the system tablespace. */
+ ut_ad(trx_sys.rseg_array[0]->space == fil_system.sys_space);
+
+ /* Choose a rollback segment evenly distributed between 0 and
+ innodb_undo_logs-1 in a round-robin fashion, skipping those
+ undo tablespaces that are scheduled for truncation. */
+ static Atomic_counter<unsigned> rseg_slot;
+ unsigned slot = rseg_slot++ % TRX_SYS_N_RSEGS;
+ ut_d(if (trx_rseg_n_slots_debug) slot = 0);
+ trx_rseg_t* rseg;
+
+#ifdef UNIV_DEBUG
+ ulint start_scan_slot = slot;
+ bool look_for_rollover = false;
+#endif /* UNIV_DEBUG */
+
+ bool allocated = false;
+
+ do {
+ for (;;) {
+ rseg = trx_sys.rseg_array[slot];
+
+#ifdef UNIV_DEBUG
+ /* Ensure that we are not revisiting the same
+ slot that we have already inspected. */
+ if (look_for_rollover) {
+ ut_ad(start_scan_slot != slot);
+ }
+ look_for_rollover = true;
+#endif /* UNIV_DEBUG */
+
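+			/* With trx_rseg_n_slots_debug set (debug builds
+			only), keep inspecting slot 0; otherwise advance
+			the round-robin position. */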
+ ut_d(if (!trx_rseg_n_slots_debug))
+ slot = (slot + 1) % TRX_SYS_N_RSEGS;
+
+ if (rseg == NULL) {
+ continue;
+ }
+
+ ut_ad(rseg->is_persistent());
+
+ if (rseg->space != fil_system.sys_space) {
+ if (rseg->skip_allocation
+ || !srv_undo_tablespaces) {
+ continue;
+ }
+ } else if (trx_rseg_t* next
+ = trx_sys.rseg_array[slot]) {
+ if (next->space != fil_system.sys_space
+ && srv_undo_tablespaces > 0) {
+					/* If dedicated
+					innodb_undo_tablespaces have
+					been configured, try to use them
+					instead of the system tablespace. */
+ continue;
+ }
+ }
+
+ break;
+ }
+
+ /* By now we have only selected the rseg but not marked it
+ allocated. By marking it allocated we are ensuring that it will
+ never be selected for UNDO truncate purge. */
+ mutex_enter(&rseg->mutex);
+ if (!rseg->skip_allocation) {
+ rseg->trx_ref_count++;
+ allocated = true;
+ }
+ mutex_exit(&rseg->mutex);
+ } while (!allocated);
+
+ ut_ad(rseg->trx_ref_count > 0);
+ ut_ad(rseg->is_persistent());
+ return(rseg);
+}
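+
+/* Informal sketch of the selection above, assuming TRX_SYS_N_RSEGS is
+128 (its historical value):
+
+	static Atomic_counter<unsigned> rseg_slot;
+	unsigned slot = rseg_slot++ % 128;	// 0, 1, ..., 127, 0, ...
+
+Slots that are NULL or flagged with skip_allocation are passed over, so
+the distribution is even only across the usable rollback segments, and
+the system tablespace slot is used only if no dedicated undo tablespace
+qualifies. */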
+
+/** Assign a rollback segment for modifying temporary tables.
+@return the assigned rollback segment */
+trx_rseg_t *trx_t::assign_temp_rseg()
+{
+ ut_ad(!rsegs.m_noredo.rseg);
+ ut_ad(!is_autocommit_non_locking());
+ compile_time_assert(ut_is_2pow(TRX_SYS_N_RSEGS));
+
+ /* Choose a temporary rollback segment between 0 and 127
+ in a round-robin fashion. */
+ static Atomic_counter<unsigned> rseg_slot;
+ trx_rseg_t* rseg = trx_sys.temp_rsegs[
+ rseg_slot++ & (TRX_SYS_N_RSEGS - 1)];
+ ut_ad(!rseg->is_persistent());
+ rsegs.m_noredo.rseg = rseg;
+
+ if (id == 0) {
+ trx_sys.register_rw(this);
+ }
+
+ ut_ad(!rseg->is_persistent());
+ return(rseg);
+}
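+
+/* Note on the mask above: because TRX_SYS_N_RSEGS is asserted to be a
+power of two, "rseg_slot++ & (TRX_SYS_N_RSEGS - 1)" is equivalent to
+"rseg_slot++ % TRX_SYS_N_RSEGS"; for example, with 128 slots,
+129 & 127 == 1, just like 129 % 128 == 1. */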
+
+/****************************************************************//**
+Starts a transaction. */
+static
+void
+trx_start_low(
+/*==========*/
+ trx_t* trx, /*!< in: transaction */
+ bool read_write) /*!< in: true if read-write transaction */
+{
+ ut_ad(!trx->in_rollback);
+ ut_ad(!trx->is_recovered);
+ ut_ad(trx->start_line != 0);
+ ut_ad(trx->start_file != 0);
+ ut_ad(trx->roll_limit == 0);
+ ut_ad(trx->error_state == DB_SUCCESS);
+ ut_ad(trx->rsegs.m_redo.rseg == NULL);
+ ut_ad(trx->rsegs.m_noredo.rseg == NULL);
+ ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+ ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+ /* Check whether it is an AUTOCOMMIT SELECT */
+ trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd);
+
+ trx->read_only = srv_read_only_mode
+ || (!trx->ddl && !trx->internal
+ && thd_trx_is_read_only(trx->mysql_thd));
+
+ if (!trx->auto_commit) {
+ trx->will_lock = true;
+ } else if (!trx->will_lock) {
+ trx->read_only = true;
+ }
+
+#ifdef WITH_WSREP
+ trx->xid->null();
+#endif /* WITH_WSREP */
+
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+ ut_a(trx->lock.table_locks.empty());
+
+	/* No other thread can access this trx object through rw_trx_hash,
+	but it can still be found through trx_sys.trx_list. Sometimes it is
+	possible to indirectly protect trx_t::state by freezing
+	trx_sys.trx_list.
+
+	For now we update the state without mutex protection, because the
+	original code did it this way. This has to be reviewed and fixed
+	properly. */
+ trx->state = TRX_STATE_ACTIVE;
+
+ /* By default all transactions are in the read-only list unless they
+ are non-locking auto-commit read only transactions or background
+ (internal) transactions. Note: Transactions marked explicitly as
+ read only can write to temporary tables, we put those on the RO
+ list too. */
+
+ if (!trx->read_only
+ && (trx->mysql_thd == 0 || read_write || trx->ddl)) {
+
+ /* Temporary rseg is assigned only if the transaction
+ updates a temporary table */
+ trx->rsegs.m_redo.rseg = trx_assign_rseg_low();
+ ut_ad(trx->rsegs.m_redo.rseg != 0
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
+
+ trx_sys.register_rw(trx);
+ } else {
+ if (!trx->is_autocommit_non_locking()) {
+
+ /* If this is a read-only transaction that is writing
+ to a temporary table then it needs a transaction id
+ to write to the temporary table. */
+
+ if (read_write) {
+ ut_ad(!srv_read_only_mode);
+ trx_sys.register_rw(trx);
+ }
+ } else {
+ ut_ad(!read_write);
+ }
+ }
+
+ trx->start_time = time(NULL);
+ trx->start_time_micro = trx->mysql_thd
+ ? thd_query_start_micro(trx->mysql_thd)
+ : microsecond_interval_timer();
+
+ ut_a(trx->error_state == DB_SUCCESS);
+
+ MONITOR_INC(MONITOR_TRX_ACTIVE);
+}
+
+/** Set the serialisation number for a persistent committed transaction.
+@param[in,out] trx committed transaction with persistent changes */
+static
+void
+trx_serialise(trx_t* trx)
+{
+ trx_rseg_t *rseg = trx->rsegs.m_redo.rseg;
+ ut_ad(rseg);
+ ut_ad(mutex_own(&rseg->mutex));
+
+ if (rseg->last_page_no == FIL_NULL) {
+ mutex_enter(&purge_sys.pq_mutex);
+ }
+
+ trx_sys.assign_new_trx_no(trx);
+
+ /* If the rollback segment is not empty then the
+ new trx_t::no can't be less than any trx_t::no
+ already in the rollback segment. User threads only
+ produce events when a rollback segment is empty. */
+ if (rseg->last_page_no == FIL_NULL) {
+ purge_sys.purge_queue.push(TrxUndoRsegs(trx->rw_trx_hash_element->no,
+ *rseg));
+ mutex_exit(&purge_sys.pq_mutex);
+ }
+}
+
+/****************************************************************//**
+Assign the transaction its history serialisation number and write the
+update UNDO log record to the assigned rollback segment. */
+static
+void
+trx_write_serialisation_history(
+/*============================*/
+ trx_t* trx, /*!< in/out: transaction */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE to some
+ other state: these modifications to the file data structure define
+ the transaction as committed in the file based domain, at the
+ serialization point of the log sequence number lsn obtained below. */
+
+ /* We have to hold the rseg mutex because update log headers have
+ to be put to the history list in the (serialisation) order of the
+ UNDO trx number. This is required for the purge in-memory data
+ structures too. */
+
+ if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
+ /* Undo log for temporary tables is discarded at transaction
+ commit. There is no purge for temporary tables, and also no
+ MVCC, because they are private to a session. */
+
+ mtr_t temp_mtr;
+ temp_mtr.start();
+ temp_mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ mutex_enter(&trx->rsegs.m_noredo.rseg->mutex);
+ trx_undo_set_state_at_finish(undo, &temp_mtr);
+ mutex_exit(&trx->rsegs.m_noredo.rseg->mutex);
+ temp_mtr.commit();
+ }
+
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+ if (!rseg) {
+ ut_ad(!trx->rsegs.m_redo.undo);
+ return;
+ }
+
+ trx_undo_t*& undo = trx->rsegs.m_redo.undo;
+
+ if (!undo) {
+ return;
+ }
+
+ ut_ad(!trx->read_only);
+ ut_ad(!undo || undo->rseg == rseg);
+ mutex_enter(&rseg->mutex);
+
+ /* Assign the transaction serialisation number and add any
+ undo log to the purge queue. */
+ trx_serialise(trx);
+ if (undo) {
+ UT_LIST_REMOVE(rseg->undo_list, undo);
+ trx_purge_add_undo_to_history(trx, undo, mtr);
+ }
+
+ mutex_exit(&rseg->mutex);
+
+ MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
+}
+
+/********************************************************************
+Finalize a transaction containing updates for an FTS table. */
+static
+void
+trx_finalize_for_fts_table(
+/*=======================*/
+ fts_trx_table_t* ftt) /* in: FTS trx table */
+{
+ fts_t* fts = ftt->table->fts;
+ fts_doc_ids_t* doc_ids = ftt->added_doc_ids;
+
+ ut_a(fts->add_wq);
+
+ mem_heap_t* heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg);
+
+ ib_wqueue_add(fts->add_wq, doc_ids, heap);
+
+ /* fts_trx_table_t no longer owns the list. */
+ ftt->added_doc_ids = NULL;
+}
+
+/******************************************************************//**
+Finalize a transaction containing updates to FTS tables. */
+static
+void
+trx_finalize_for_fts(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ bool is_commit) /*!< in: true if the transaction was
+ committed, false if it was rolled back. */
+{
+ if (is_commit) {
+ const ib_rbt_node_t* node;
+ ib_rbt_t* tables;
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(trx->fts_trx->savepoints));
+
+ tables = savepoint->tables;
+
+ for (node = rbt_first(tables);
+ node;
+ node = rbt_next(tables, node)) {
+ fts_trx_table_t** ftt;
+
+ ftt = rbt_value(fts_trx_table_t*, node);
+
+ if ((*ftt)->added_doc_ids) {
+ trx_finalize_for_fts_table(*ftt);
+ }
+ }
+ }
+
+ fts_trx_free(trx->fts_trx);
+ trx->fts_trx = NULL;
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk based on the value of
+innodb_flush_log_at_trx_commit. */
+static
+void
+trx_flush_log_if_needed_low(
+/*========================*/
+ lsn_t lsn) /*!< in: lsn up to which logs are to be
+ flushed. */
+{
+ bool flush = srv_file_flush_method != SRV_NOSYNC;
+
+ switch (srv_flush_log_at_trx_commit) {
+ case 2:
+ /* Write the log but do not flush it to disk */
+ flush = false;
+ /* fall through */
+ case 1:
+ case 3:
+ /* Write the log and optionally flush it to disk */
+ log_write_up_to(lsn, flush);
+ srv_inc_activity_count();
+ return;
+ case 0:
+ /* Do nothing */
+ return;
+ }
+
+ ut_error;
+}
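+
+/* Summary of the switch above (informal; an actual flush additionally
+requires srv_file_flush_method != SRV_NOSYNC):
+	0	neither write nor flush the log at commit
+	1, 3	write the log up to lsn and flush it to disk
+	2	write the log up to lsn, but do not flush it */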
+
+/**********************************************************************//**
+If required, flushes the log to disk based on the value of
+innodb_flush_log_at_trx_commit. */
+static
+void
+trx_flush_log_if_needed(
+/*====================*/
+ lsn_t lsn, /*!< in: lsn up to which logs are to be
+ flushed. */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx->op_info = "flushing log";
+ trx_flush_log_if_needed_low(lsn);
+ trx->op_info = "";
+}
+
+/**********************************************************************//**
+For each table that has been modified by the given transaction: update
+its dict_table_t::update_time with the current timestamp. Clear the list
+of the modified tables at the end. */
+static
+void
+trx_update_mod_tables_timestamp(
+/*============================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ /* consider using trx->start_time if calling time() is too
+ expensive here */
+ const time_t now = time(NULL);
+
+ trx_mod_tables_t::const_iterator end = trx->mod_tables.end();
+
+ for (trx_mod_tables_t::const_iterator it = trx->mod_tables.begin();
+ it != end;
+ ++it) {
+
+		/* This could be executed by multiple threads concurrently
+		on the same table object. That is fine, because time_t is
+		word size or less. And, purely theoretically, even if a
+		time_t write were not atomic, the value of 'now' is likely
+		to be the same in all threads; even if it were not, storing
+		garbage in table->update_time would still be acceptable,
+		because protecting it with a latch here would be too
+		intrusive performance-wise. */
+ dict_table_t* table = it->first;
+ table->update_time = now;
+ }
+
+ trx->mod_tables.clear();
+}
+
+/** Evict a table definition due to the rollback of ALTER TABLE.
+@param[in] table_id table identifier */
+void trx_t::evict_table(table_id_t table_id)
+{
+ ut_ad(in_rollback);
+
+ dict_table_t* table = dict_table_open_on_id(
+ table_id, true, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
+ if (!table) {
+ return;
+ }
+
+ if (!table->release()) {
+ /* This must be a DDL operation that is being rolled
+ back in an active connection. */
+ ut_a(table->get_ref_count() == 1);
+ ut_ad(!is_recovered);
+ ut_ad(mysql_thd);
+ return;
+ }
+
+ /* This table should only be locked by this transaction, if at all. */
+ ut_ad(UT_LIST_GET_LEN(table->locks) <= 1);
+ const bool locked = UT_LIST_GET_LEN(table->locks);
+ ut_ad(!locked || UT_LIST_GET_FIRST(table->locks)->trx == this);
+ dict_sys.remove(table, true, locked);
+ if (locked) {
+ UT_LIST_ADD_FIRST(lock.evicted_tables, table);
+ }
+}
+
+/** Mark a transaction committed in the main memory data structures. */
+inline void trx_t::commit_in_memory(const mtr_t *mtr)
+{
+ must_flush_log_later= false;
+ read_view.close();
+
+ if (is_autocommit_non_locking())
+ {
+ ut_ad(id == 0);
+ ut_ad(read_only);
+ ut_ad(!will_lock);
+ ut_a(!is_recovered);
+ ut_ad(!rsegs.m_redo.rseg);
+ ut_ad(mysql_thd);
+ ut_ad(state == TRX_STATE_ACTIVE);
+
+    /* Note: We are asserting without holding the lock mutex. That is
+    OK, because this transaction is not waiting and cannot be rolled
+    back, and no new locks can (or should) be added, because it is
+    flagged as a non-locking read-only transaction. */
+ ut_a(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+
+ /* This state change is not protected by any mutex, therefore
+ there is an inherent race here around state transition during
+ printouts. We ignore this race for the sake of efficiency.
+ However, the freezing of trx_sys.trx_list will protect the trx_t
+ instance and it cannot be removed from the trx_list and freed
+ without first unfreezing trx_list. */
+ state= TRX_STATE_NOT_STARTED;
+
+ MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);
+
+ DBUG_LOG("trx", "Autocommit in memory: " << this);
+ }
+ else
+ {
+#ifdef UNIV_DEBUG
+ if (!UT_LIST_GET_LEN(lock.trx_locks))
+ for (auto l : lock.table_locks)
+ ut_ad(!l);
+#endif /* UNIV_DEBUG */
+ commit_state();
+
+ if (id)
+ {
+ trx_sys.deregister_rw(this);
+
+ /* Wait for any implicit-to-explicit lock conversions to cease,
+ so that there will be no race condition in lock_release(). */
+ while (UNIV_UNLIKELY(is_referenced()))
+ ut_delay(srv_spin_wait_delay);
+ }
+ else
+ ut_ad(read_only || !rsegs.m_redo.rseg);
+
+ if (read_only || !rsegs.m_redo.rseg)
+ {
+ MONITOR_INC(MONITOR_TRX_RO_COMMIT);
+ }
+ else
+ {
+ trx_update_mod_tables_timestamp(this);
+ MONITOR_INC(MONITOR_TRX_RW_COMMIT);
+ is_recovered= false;
+ }
+
+ release_locks();
+ id= 0;
+ DEBUG_SYNC_C("after_trx_committed_in_memory");
+
+ while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables))
+ {
+ UT_LIST_REMOVE(lock.evicted_tables, table);
+ dict_mem_table_free(table);
+ }
+ }
+
+ ut_ad(!rsegs.m_redo.undo);
+ ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
+
+ if (trx_rseg_t *rseg= rsegs.m_redo.rseg)
+ {
+ mutex_enter(&rseg->mutex);
+ ut_ad(rseg->trx_ref_count > 0);
+ --rseg->trx_ref_count;
+ mutex_exit(&rseg->mutex);
+ }
+
+ if (mtr)
+ {
+ if (trx_undo_t *&undo= rsegs.m_noredo.undo)
+ {
+ ut_ad(undo->rseg == rsegs.m_noredo.rseg);
+ trx_undo_commit_cleanup(undo);
+ undo= nullptr;
+ }
+
+    /* NOTE that we could possibly make group commit more efficient
+    here: call os_thread_yield() here to allow other transactions to
+    reach their commit as well! */
+
+ /*-------------------------------------*/
+
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the transaction durable if the OS
+ does not crash. We may also flush the log files to disk, making
+ the transaction durable also at an OS crash or a power outage.
+
+ The idea in InnoDB's group commit is that a group of transactions
+ gather behind a trx doing a physical disk write to log files, and
+ when that physical write has been completed, one of those
+ transactions does a write which commits the whole group. Note that
+ this group commit will only bring benefit if there are > 2 users
+ in the database. Then at least 2 users can gather behind one doing
+ the physical log write to disk.
+
+ If we are calling trx_t::commit() under prepare_commit_mutex, we
+ will delay possible log write and flush to a separate function
+ trx_commit_complete_for_mysql(), which is only called when the
+    thread has released the mutex. This is to make the group commit
+    algorithm work. Otherwise, the prepare_commit mutex would
+ serialize all commits and prevent a group of transactions from
+ gathering. */
+
+ commit_lsn= mtr->commit_lsn();
+ if (!commit_lsn)
+ /* Nothing to be done. */;
+ else if (flush_log_later)
+ /* Do nothing yet */
+ must_flush_log_later= true;
+ else if (srv_flush_log_at_trx_commit)
+ trx_flush_log_if_needed(commit_lsn, this);
+ }
+
+ ut_ad(!rsegs.m_noredo.undo);
+
+ /* Free all savepoints, starting from the first. */
+ trx_named_savept_t *savep= UT_LIST_GET_FIRST(trx_savepoints);
+
+ trx_roll_savepoints_free(this, savep);
+
+ if (fts_trx)
+ trx_finalize_for_fts(this, undo_no != 0);
+
+#ifdef WITH_WSREP
+ /* Serialization history has been written and the transaction is
+ committed in memory, which makes this commit ordered. Release commit
+ order critical section. */
+ if (wsrep)
+ {
+ wsrep= false;
+ wsrep_commit_ordered(mysql_thd);
+ }
+ lock.was_chosen_as_wsrep_victim= false;
+#endif /* WITH_WSREP */
+ trx_mutex_enter(this);
+ dict_operation= TRX_DICT_OP_NONE;
+
+ DBUG_LOG("trx", "Commit in memory: " << this);
+ state= TRX_STATE_NOT_STARTED;
+
+ assert_freed();
+ trx_init(this);
+ trx_mutex_exit(this);
+
+ ut_a(error_state == DB_SUCCESS);
+ if (!srv_read_only_mode)
+ srv_wake_purge_thread_if_not_active();
+}
+
+/** Commit the transaction in a mini-transaction.
+@param mtr mini-transaction (if there are any persistent modifications) */
+void trx_t::commit_low(mtr_t *mtr)
+{
+ ut_ad(!mtr || mtr->is_active());
+ ut_d(bool aborted = in_rollback && error_state == DB_DEADLOCK);
+ ut_ad(!mtr == (aborted || !has_logged()));
+ ut_ad(!mtr || !aborted);
+
+ /* undo_no is non-zero if we're doing the final commit. */
+ if (fts_trx && undo_no)
+ {
+ ut_a(!is_autocommit_non_locking());
+    /* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY instead of
+    dying. This is a possible scenario if there is a crash between
+    the insert into the DELETED table committing and the transaction
+    committing. The proper fix would be to return an error from this
+    function. */
+ if (dberr_t error= fts_commit(this))
+ ut_a(error == DB_DUPLICATE_KEY);
+ }
+
+#ifndef DBUG_OFF
+ const bool debug_sync= mysql_thd && has_logged_persistent();
+#endif
+
+ if (mtr)
+ {
+ trx_write_serialisation_history(this, mtr);
+
+ /* The following call commits the mini-transaction, making the
+ whole transaction committed in the file-based world, at this log
+ sequence number. The transaction becomes 'durable' when we write
+ the log to disk, but in the logical sense the commit in the
+ file-based data structures (undo logs etc.) happens here.
+
+ NOTE that transaction numbers, which are assigned only to
+ transactions with an update undo log, do not necessarily come in
+ exactly the same order as commit lsn's, if the transactions have
+ different rollback segments. To get exactly the same order we
+ should hold the kernel mutex up to this point, adding to the
+ contention of the kernel mutex. However, if a transaction T2 is
+ able to see modifications made by a transaction T1, T2 will always
+ get a bigger transaction number and a bigger commit lsn than T1. */
+
+ mtr->commit();
+ }
+#ifndef DBUG_OFF
+ if (debug_sync)
+ DEBUG_SYNC_C("before_trx_state_committed_in_memory");
+#endif
+
+ commit_in_memory(mtr);
+}
+
+
+void trx_t::commit()
+{
+ mtr_t *mtr= nullptr;
+ mtr_t local_mtr;
+
+ if (has_logged())
+ {
+ mtr= &local_mtr;
+ local_mtr.start();
+ }
+ commit_low(mtr);
+}
+
+/****************************************************************//**
+Prepares a transaction for commit/rollback. */
+void
+trx_commit_or_rollback_prepare(
+/*===========================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* We are reading trx->state without holding trx->mutex
+ here, because the commit or rollback should be invoked for a
+ running (or recovered prepared) transaction that is associated
+ with the current thread. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx, true);
+ /* fall through */
+
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+		/* If the trx is in a lock wait state, move the waiting
+		query thread to the suspended state. */
+
+ if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ ut_a(trx->lock.wait_thr != NULL);
+ trx->lock.wait_thr->state = QUE_THR_SUSPENDED;
+ trx->lock.wait_thr = NULL;
+
+ trx->lock.que_state = TRX_QUE_RUNNING;
+ }
+
+ ut_ad(trx->lock.n_active_thrs == 1);
+ return;
+
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*********************************************************************//**
+Creates a commit command node struct.
+@return own: commit node struct */
+commit_node_t*
+trx_commit_node_create(
+/*===================*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ commit_node_t* node;
+
+ node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node)));
+ node->common.type = QUE_NODE_COMMIT;
+ node->state = COMMIT_NODE_SEND;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_commit_step(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ commit_node_t* node;
+
+ node = static_cast<commit_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = COMMIT_NODE_SEND;
+ }
+
+ if (node->state == COMMIT_NODE_SEND) {
+ trx_t* trx;
+
+ node->state = COMMIT_NODE_WAIT;
+
+ trx = thr_get_trx(thr);
+
+ ut_a(trx->lock.wait_thr == NULL);
+ ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT);
+
+ trx_commit_or_rollback_prepare(trx);
+
+ trx->lock.que_state = TRX_QUE_COMMITTING;
+ trx->commit();
+ ut_ad(trx->lock.wait_thr == NULL);
+ trx->lock.que_state = TRX_QUE_RUNNING;
+
+ thr = NULL;
+ } else {
+ ut_ad(node->state == COMMIT_NODE_WAIT);
+
+ node->state = COMMIT_NODE_SEND;
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return DB_SUCCESS or error number */
+dberr_t
+trx_commit_for_mysql(
+/*=================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+	/* Because we do not perform the commit by sending an InnoDB
+	signal to the transaction, we must make sure here that trx has
+	been started. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ ut_d(trx->start_file = __FILE__);
+ ut_d(trx->start_line = __LINE__);
+
+ trx_start_low(trx, true);
+ /* fall through */
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ trx->op_info = "committing";
+ trx->commit();
+ MONITOR_DEC(MONITOR_TRX_ACTIVE);
+ trx->op_info = "";
+ return(DB_SUCCESS);
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE. */
+void
+trx_commit_complete_for_mysql(
+/*==========================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ if (trx->id != 0
+ || !trx->must_flush_log_later
+ || (srv_flush_log_at_trx_commit == 1 && trx->active_commit_ordered)) {
+
+ return;
+ }
+
+ trx_flush_log_if_needed(trx->commit_lsn, trx);
+
+ trx->must_flush_log_later = false;
+}
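+
+/* Hedged usage sketch (the caller-side sequence is an assumption; it
+is not shown in this file): when committing under the prepare_commit
+mutex, the durable log flush is deferred:
+
+	trx->flush_log_later = true;
+	trx_commit_for_mysql(trx);	// commit, but skip the log flush
+	trx->flush_log_later = false;
+	// ... release the prepare_commit mutex ...
+	trx_commit_complete_for_mysql(trx);	// flush up to commit_lsn
+
+Deferring the flush keeps the mutex-protected section short, so that a
+group of transactions can gather for a single physical log write. */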
+
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ ut_a(trx);
+
+ switch (trx->state) {
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ case TRX_STATE_NOT_STARTED:
+ trx->undo_no = 0;
+ /* fall through */
+ case TRX_STATE_ACTIVE:
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+
+ if (trx->fts_trx != NULL) {
+ fts_savepoint_laststmt_refresh(trx);
+ }
+
+ return;
+ }
+
+ ut_error;
+}
+
+/**********************************************************************//**
+Prints info about a transaction. */
+void
+trx_print_low(
+/*==========*/
+ FILE* f,
+ /*!< in: output stream */
+ const trx_t* trx,
+ /*!< in: transaction */
+ ulint max_query_len,
+ /*!< in: max query length to print,
+ or 0 to use the default max length */
+ ulint n_rec_locks,
+ /*!< in: lock_number_of_rows_locked(&trx->lock) */
+ ulint n_trx_locks,
+ /*!< in: length of trx->lock.trx_locks */
+ ulint heap_size)
+ /*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+{
+ ibool newline;
+
+ fprintf(f, "TRANSACTION " TRX_ID_FMT, trx_get_id_for_print(trx));
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ fputs(", not started", f);
+ goto state_ok;
+ case TRX_STATE_ACTIVE:
+ fprintf(f, ", ACTIVE %lu sec",
+ (ulong) difftime(time(NULL), trx->start_time));
+ goto state_ok;
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ fprintf(f, ", ACTIVE (PREPARED) %lu sec",
+ (ulong) difftime(time(NULL), trx->start_time));
+ goto state_ok;
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ fputs(", COMMITTED IN MEMORY", f);
+ goto state_ok;
+ }
+ fprintf(f, ", state %lu", (ulong) trx->state);
+ ut_ad(0);
+state_ok:
+ const char* op_info = trx->op_info;
+
+ if (*op_info) {
+ putc(' ', f);
+ fputs(op_info, f);
+ }
+
+ if (trx->is_recovered) {
+ fputs(" recovered trx", f);
+ }
+
+ putc('\n', f);
+
+ if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+ fprintf(f, "mysql tables in use %lu, locked %lu\n",
+ (ulong) trx->n_mysql_tables_in_use,
+ (ulong) trx->mysql_n_tables_locked);
+ }
+
+ newline = TRUE;
+
+ /* trx->lock.que_state of an ACTIVE transaction may change
+ while we are not holding trx->mutex. We perform a dirty read
+ for performance reasons. */
+
+ switch (trx->lock.que_state) {
+ case TRX_QUE_RUNNING:
+ newline = FALSE; break;
+ case TRX_QUE_LOCK_WAIT:
+ fputs("LOCK WAIT ", f); break;
+ case TRX_QUE_ROLLING_BACK:
+ fputs("ROLLING BACK ", f); break;
+ case TRX_QUE_COMMITTING:
+ fputs("COMMITTING ", f); break;
+ default:
+ fprintf(f, "que state %lu ", (ulong) trx->lock.que_state);
+ }
+
+ if (n_trx_locks > 0 || heap_size > 400) {
+ newline = TRUE;
+
+ fprintf(f, "%lu lock struct(s), heap size %lu,"
+ " %lu row lock(s)",
+ (ulong) n_trx_locks,
+ (ulong) heap_size,
+ (ulong) n_rec_locks);
+ }
+
+ if (trx->undo_no != 0) {
+ newline = TRUE;
+ fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no);
+ }
+
+ if (newline) {
+ putc('\n', f);
+ }
+
+ if (trx->state != TRX_STATE_NOT_STARTED && trx->mysql_thd != NULL) {
+ innobase_mysql_print_thd(
+ f, trx->mysql_thd, static_cast<uint>(max_query_len));
+ }
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+The caller must hold lock_sys.mutex.
+When possible, use trx_print() instead. */
+void
+trx_print_latched(
+/*==============*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print,
+ or 0 to use the default max length */
+{
+ ut_ad(lock_mutex_own());
+
+ trx_print_low(f, trx, max_query_len,
+ lock_number_of_rows_locked(&trx->lock),
+ UT_LIST_GET_LEN(trx->lock.trx_locks),
+ mem_heap_get_size(trx->lock.lock_heap));
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+Acquires and releases lock_sys.mutex. */
+void
+trx_print(
+/*======*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print,
+ or 0 to use the default max length */
+{
+ ulint n_rec_locks;
+ ulint n_trx_locks;
+ ulint heap_size;
+
+ lock_mutex_enter();
+ n_rec_locks = lock_number_of_rows_locked(&trx->lock);
+ n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
+ heap_size = mem_heap_get_size(trx->lock.lock_heap);
+ lock_mutex_exit();
+
+ trx_print_low(f, trx, max_query_len,
+ n_rec_locks, n_trx_locks, heap_size);
+}
+
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. Transactions that
+have edited non-transactional tables are considered heavier than ones
+that have not.
+@return TRUE if weight(a) >= weight(b) */
+bool
+trx_weight_ge(
+/*==========*/
+ const trx_t* a, /*!< in: transaction to be compared */
+ const trx_t* b) /*!< in: transaction to be compared */
+{
+ ibool a_notrans_edit;
+ ibool b_notrans_edit;
+
+ /* If mysql_thd is NULL for a transaction we assume that it has
+ not edited non-transactional tables. */
+
+ a_notrans_edit = a->mysql_thd != NULL
+ && thd_has_edited_nontrans_tables(a->mysql_thd);
+
+ b_notrans_edit = b->mysql_thd != NULL
+ && thd_has_edited_nontrans_tables(b->mysql_thd);
+
+ if (a_notrans_edit != b_notrans_edit) {
+
+ return(a_notrans_edit);
+ }
+
+ /* Either both had edited non-transactional tables or both had
+ not, we fall back to comparing the number of altered/locked
+ rows. */
+
+ return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
+}
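+
+/* TRX_WEIGHT (see trx0trx.h) is, roughly, the number of undo log
+entries plus the number of lock structs held, so among transactions
+that agree on the non-transactional edit flag, the deadlock resolver
+prefers to roll back the one that has done less work. */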
+
+/** Prepare a transaction.
+@return log sequence number that makes the XA PREPARE durable
+@retval 0 if no changes needed to be made durable */
+static lsn_t trx_prepare_low(trx_t *trx)
+{
+ ut_ad(!trx->is_recovered);
+
+ mtr_t mtr;
+
+ if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
+ ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg);
+
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ mutex_enter(&undo->rseg->mutex);
+ trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
+ mutex_exit(&undo->rseg->mutex);
+
+ mtr.commit();
+ }
+
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+
+ if (!undo) {
+ /* There were no changes to persistent tables. */
+ return(0);
+ }
+
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+ ut_ad(undo->rseg == rseg);
+
+ mtr.start();
+
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE to
+ TRX_UNDO_PREPARED: these modifications to the file data
+ structure define the transaction as prepared in the file-based
+ world, at the serialization point of lsn. */
+
+ mutex_enter(&rseg->mutex);
+ trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
+ mutex_exit(&rseg->mutex);
+
+ /* Make the XA PREPARE durable. */
+ mtr.commit();
+ ut_ad(mtr.commit_lsn() > 0);
+ return(mtr.commit_lsn());
+}
+
+/****************************************************************//**
+Prepares a transaction. */
+static
+void
+trx_prepare(
+/*========*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* Only fresh user transactions can be prepared.
+ Recovered transactions cannot. */
+ ut_a(!trx->is_recovered);
+
+ lsn_t lsn = trx_prepare_low(trx);
+
+ DBUG_EXECUTE_IF("ib_trx_crash_during_xa_prepare_step", DBUG_SUICIDE(););
+
+ ut_a(trx->state == TRX_STATE_ACTIVE);
+ trx_mutex_enter(trx);
+ trx->state = TRX_STATE_PREPARED;
+ trx_mutex_exit(trx);
+
+ if (lsn) {
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the prepared state of the
+ transaction durable if the OS does not crash. We may also
+ flush the log files to disk, making the prepared state of the
+ transaction durable also at an OS crash or a power outage.
+
+ The idea in InnoDB's group prepare is that a group of
+ transactions gather behind a trx doing a physical disk write
+ to log files, and when that physical write has been completed,
+ one of those transactions does a write which prepares the whole
+ group. Note that this group prepare will only bring benefit if
+ there are > 2 users in the database. Then at least 2 users can
+ gather behind one doing the physical log write to disk.
+
+ We must not be holding any mutexes or latches here. */
+
+ trx_flush_log_if_needed(lsn, trx);
+ }
+}
+
+/** XA PREPARE a transaction.
+@param[in,out] trx transaction to prepare */
+void trx_prepare_for_mysql(trx_t* trx)
+{
+ trx_start_if_not_started_xa(trx, false);
+
+ trx->op_info = "preparing";
+
+ trx_prepare(trx);
+
+ trx->op_info = "";
+}
+
+
+struct trx_recover_for_mysql_callback_arg
+{
+ XID *xid_list;
+ uint len;
+ uint count;
+};
+
+
+static my_bool trx_recover_for_mysql_callback(rw_trx_hash_element_t *element,
+ trx_recover_for_mysql_callback_arg *arg)
+{
+ DBUG_ASSERT(arg->len > 0);
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ /*
+ The state of a read-write transaction can only change from ACTIVE to
+      PREPARED while we are holding the element->mutex. But since this
+      code is executed at startup, no state change should occur.
+ */
+ if (trx_state_eq(trx, TRX_STATE_PREPARED))
+ {
+ ut_ad(trx->is_recovered);
+ ut_ad(trx->id);
+ if (arg->count == 0)
+ ib::info() << "Starting recovery for XA transactions...";
+      /* Only index xid_list while arg->count is within bounds;
+      arg->count keeps counting all prepared transactions. */
+      if (arg->count < arg->len)
+      {
+        trx->state= TRX_STATE_PREPARED_RECOVERED;
+        ib::info() << "Transaction " << trx->id
+                   << " in prepared state after recovery";
+        ib::info() << "Transaction contains changes to " << trx->undo_no
+                   << " rows";
+        arg->xid_list[arg->count]= *trx->xid;
+      }
+      arg->count++;
+ }
+ }
+ mutex_exit(&element->mutex);
+ /* Do not terminate upon reaching arg->len; count all transactions */
+ return false;
+}
+
+
+static my_bool trx_recover_reset_callback(rw_trx_hash_element_t *element,
+ void*)
+{
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ if (trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED))
+ trx->state= TRX_STATE_PREPARED;
+ }
+ mutex_exit(&element->mutex);
+ return false;
+}
+
+
+/**
+ Find prepared transaction objects for recovery.
+
+ @param[out] xid_list prepared transactions
+ @param[in] len number of slots in xid_list
+
+ @return number of prepared transactions stored in xid_list
+*/
+
+int trx_recover_for_mysql(XID *xid_list, uint len)
+{
+ trx_recover_for_mysql_callback_arg arg= { xid_list, len, 0 };
+
+ ut_ad(xid_list);
+ ut_ad(len);
+
+ /* Fill xid_list with PREPARED transactions. */
+ trx_sys.rw_trx_hash.iterate_no_dups(trx_recover_for_mysql_callback, &arg);
+ if (arg.count)
+ {
+ ib::info() << arg.count
+ << " transactions in prepared state after recovery";
+ /* After returning the full list, reset the state, because
+ init_server_components() wants to recover the collection of
+ transactions twice, by first calling tc_log->open() and then
+ ha_recover() directly. */
+ if (arg.count <= len)
+ trx_sys.rw_trx_hash.iterate(trx_recover_reset_callback);
+ }
+ return int(std::min(arg.count, len));
+}
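+
+/* Usage sketch (hypothetical caller; the real caller is the server's
+XA recovery code, which sizes the list itself):
+
+	XID xid_list[128];
+	int n = trx_recover_for_mysql(xid_list, 128);
+	for (int i = 0; i < n; i++) {
+		// decide XA COMMIT or XA ROLLBACK for xid_list[i],
+		// typically resolving it via trx_get_trx_by_xid() below
+	}
+*/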
+
+
+struct trx_get_trx_by_xid_callback_arg
+{
+ const XID *xid;
+ trx_t *trx;
+};
+
+
+static my_bool trx_get_trx_by_xid_callback(rw_trx_hash_element_t *element,
+ trx_get_trx_by_xid_callback_arg *arg)
+{
+ my_bool found= 0;
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ trx_mutex_enter(trx);
+ if (trx->is_recovered &&
+ (trx_state_eq(trx, TRX_STATE_PREPARED) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) &&
+ arg->xid->eq(reinterpret_cast<XID*>(trx->xid)))
+ {
+#ifdef WITH_WSREP
+ /* The commit of a prepared recovered Galera
+ transaction needs a valid trx->xid for
+ invoking trx_sys_update_wsrep_checkpoint(). */
+ if (!wsrep_is_wsrep_xid(trx->xid))
+#endif /* WITH_WSREP */
+ /* Invalidate the XID, so that subsequent calls will not find it. */
+ trx->xid->null();
+ arg->trx= trx;
+ found= 1;
+ }
+ trx_mutex_exit(trx);
+ }
+ mutex_exit(&element->mutex);
+ return found;
+}
+
+/** Look up an X/Open distributed transaction in XA PREPARE state.
+@param[in] xid X/Open XA transaction identifier
+@return transaction on match (the trx_t::xid will be invalidated);
+note that the trx may have been committed before the caller acquires
+trx_t::mutex
+@retval NULL if no match */
+trx_t* trx_get_trx_by_xid(const XID* xid)
+{
+ trx_get_trx_by_xid_callback_arg arg= { xid, 0 };
+
+ if (xid)
+ trx_sys.rw_trx_hash.iterate(trx_get_trx_by_xid_callback, &arg);
+ return arg.trx;
+}
+
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_xa_low(
+/*============================*/
+ trx_t* trx, /*!< in/out: transaction */
+ bool read_write) /*!< in: true if read write transaction */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx, read_write);
+ return;
+
+ case TRX_STATE_ACTIVE:
+ if (trx->id == 0 && read_write) {
+			/* If the transaction is tagged as read-only,
+			then it can only write to temporary tables, and
+			we do not want to move such transactions to
+			trx_sys_t::rw_trx_hash. */
+ if (!trx->read_only) {
+ trx_set_rw_mode(trx);
+ }
+ }
+ return;
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_low(
+/*==========================*/
+ trx_t* trx, /*!< in: transaction */
+ bool read_write) /*!< in: true if read write transaction */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx, read_write);
+ return;
+
+ case TRX_STATE_ACTIVE:
+ if (read_write && trx->id == 0 && !trx->read_only) {
+ trx_set_rw_mode(trx);
+ }
+ return;
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*************************************************************//**
+Starts a transaction for internal processing. */
+void
+trx_start_internal_low(
+/*===================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* Ensure it is not flagged as an auto-commit-non-locking
+ transaction. */
+
+ trx->will_lock = true;
+
+ trx->internal = true;
+
+ trx_start_low(trx, true);
+}
+
+/** Starts a read-only transaction for internal processing.
+@param[in,out] trx transaction to be started */
+void
+trx_start_internal_read_only_low(
+ trx_t* trx)
+{
+ /* Ensure it is not flagged as an auto-commit-non-locking
+ transaction. */
+
+ trx->will_lock = true;
+
+ trx->internal = true;
+
+ trx_start_low(trx, false);
+}
+
+/*************************************************************//**
+Starts the transaction for a DDL operation. */
+void
+trx_start_for_ddl_low(
+/*==================*/
+ trx_t* trx, /*!< in/out: transaction */
+ trx_dict_op_t op) /*!< in: dictionary operation type */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ /* Flag this transaction as a dictionary operation, so that
+ the data dictionary will be locked in crash recovery. */
+
+ trx_set_dict_operation(trx, op);
+ trx->ddl= true;
+ trx_start_internal_low(trx);
+ return;
+
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*************************************************************//**
+Set the transaction as a read-write transaction if it is not already
+tagged as such. Read-only transactions that are writing to temporary
+tables are assigned an ID and a rollback segment but are not added
+to the trx read-write list because their updates should not be visible
+to other transactions and therefore their changes can be ignored by
+MVCC. */
+void
+trx_set_rw_mode(
+/*============*/
+ trx_t* trx) /*!< in/out: transaction that is RW */
+{
+ ut_ad(trx->rsegs.m_redo.rseg == 0);
+ ut_ad(!trx->is_autocommit_non_locking());
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id == 0);
+
+ if (high_level_read_only) {
+ return;
+ }
+
+ trx->rsegs.m_redo.rseg = trx_assign_rseg_low();
+ ut_ad(trx->rsegs.m_redo.rseg != 0);
+
+ trx_sys.register_rw(trx);
+
+ /* So that we can see our own changes. */
+ if (trx->read_view.is_open()) {
+ trx->read_view.set_creator_trx_id(trx->id);
+ }
+}
+
+bool trx_t::has_stats_table_lock() const
+{
+ for (lock_list::const_iterator it= lock.table_locks.begin(),
+ end= lock.table_locks.end(); it != end; ++it)
+ {
+ const lock_t *lock= *it;
+ if (lock && lock->un_member.tab_lock.table->is_stats_table())
+ return true;
+ }
+
+ return false;
+}
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
new file mode 100644
index 00000000..3d2d9752
--- /dev/null
+++ b/storage/innobase/trx/trx0undo.cc
@@ -0,0 +1,1401 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0undo.cc
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0rseg.h"
+#include "log.h"
+
+/* How should the old versions in the history list be managed?
+ ----------------------------------------------------------
+If each transaction is given a whole page for its update undo log, file
+space consumption can be 10 times higher than necessary. Therefore,
+partly filled update undo log pages should be reusable. But then there
+is no way individual pages can be ordered so that the ordering agrees
+with the serialization numbers of the transactions on the pages. Thus,
+the history list must be formed of undo logs, not their header pages as
+it was in the old implementation.
+ However, on a single header page the transactions are placed in
+the order of their serialization numbers. As old versions are purged, we
+may free the page when the last transaction on the page has been purged.
+ A problem is that the purge has to go through the transactions
+in the serialization order. This means that we have to look through all
+rollback segments for the one that has the smallest transaction number
+in its history list.
+ When should we do a purge? A purge is necessary when space is
+running out in any of the rollback segments. Then we may have to purge
+also old versions which might still be needed by some consistent read.
+How do we trigger the start of a purge? When a transaction writes to an
+undo log, it may notice that the space is running out. When a read view
+is closed, it may make some history superfluous. The server can have a
+utility which periodically checks if it can purge some history.
+	In a parallelized purge we have the problem that a query thread
+can remove a delete marked clustered index record before another query
+thread has processed an earlier version of the record, which cannot then
+be done because the row cannot be constructed from the clustered index
+record. To avoid this problem, we will store in the update and delete mark
+undo record also the columns necessary to construct the secondary index
+entries which are modified.
+ We can latch the stack of versions of a single clustered index record
+by taking a latch on the clustered index page. As long as the latch is held,
+no new versions can be added and no versions removed by undo. But, a purge
+can still remove old versions from the bottom of the stack. */
+
+/* How to protect rollback segments, undo logs, and history lists with
+ -------------------------------------------------------------------
+latches?
+-------
+When a transaction performs its first insert or modification in the clustered
+index, an undo log is assigned to it. Then we must have an x-latch on the
+rollback segment header.
+ When the transaction performs modifications or rolls back, its
+undo log is protected by undo page latches.
+Only the thread that is associated with the transaction may hold multiple
+undo page latches at a time. Undo pages are always private to a single
+transaction. Other threads that are performing MVCC reads
+or checking for implicit locks will lock at most one undo page at a time
+in trx_undo_get_undo_rec_low().
+ When the transaction commits, its persistent undo log is added
+to the history list. If it is not suitable for reuse, its slot is reset.
+In both cases, an x-latch must be acquired on the rollback segment header page.
+ The purge operation steps through the history list without modifying
+it until a truncate operation occurs, which can remove undo logs from the end
+of the list and release undo log segments. In stepping through the list,
+s-latches on the undo log pages are enough, but in a truncate, x-latches must
+be obtained on the rollback segment and individual pages. */
+
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ uint32_t page_no,/*!< in: undo log header page number */
+ uint16_t offset);/*!< in: undo log header byte offset on page */
+
+/** Determine the start offset of undo log records of an undo log page.
+@param[in] block undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset
+@return start offset */
+static
+uint16_t trx_undo_page_get_start(const buf_block_t *block, uint32_t page_no,
+ uint16_t offset)
+{
+ return page_no == block->page.id().page_no()
+ ? mach_read_from_2(offset + TRX_UNDO_LOG_START + block->frame)
+ : TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE;
+}
+
+/** Get the first undo log record on a page.
+@param[in] block undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header page offset
+@return pointer to first record
+@retval NULL if none exists */
+static trx_undo_rec_t*
+trx_undo_page_get_first_rec(const buf_block_t *block, uint32_t page_no,
+ uint16_t offset)
+{
+ uint16_t start= trx_undo_page_get_start(block, page_no, offset);
+ return start == trx_undo_page_get_end(block, page_no, offset)
+ ? nullptr : block->frame + start;
+}
+
+/** Get the last undo log record on a page.
+@param[in] page undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header page offset
+@return pointer to last record
+@retval NULL if none exists */
+static
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(const buf_block_t *block, uint32_t page_no,
+ uint16_t offset)
+{
+ uint16_t end= trx_undo_page_get_end(block, page_no, offset);
+ return trx_undo_page_get_start(block, page_no, offset) == end
+ ? nullptr : block->frame + mach_read_from_2(block->frame + end - 2);
+}
+
+/** Get the previous record in an undo log from the previous page.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+static trx_undo_rec_t*
+trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec,
+ uint32_t page_no, uint16_t offset,
+ bool shared, mtr_t *mtr)
+{
+ uint32_t prev_page_no= flst_get_prev_addr(TRX_UNDO_PAGE_HDR +
+ TRX_UNDO_PAGE_NODE +
+ block->frame).page;
+
+ if (prev_page_no == FIL_NULL)
+ return nullptr;
+
+ block= buf_page_get(page_id_t(block->page.id().space(), prev_page_no),
+ 0, shared ? RW_S_LATCH : RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ return trx_undo_page_get_last_rec(block, page_no, offset);
+}
+
+/** Get the previous undo log record.
+@param[in] block undo log page
+@param[in] rec undo log record
+@param[in] page_no undo log header page number
+@param[in] offset undo log header page offset
+@return pointer to record
+@retval NULL if none */
+static
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(const buf_block_t *block, trx_undo_rec_t *rec,
+ uint32_t page_no, uint16_t offset)
+{
+ ut_ad(block->frame == page_align(rec));
+ return rec == block->frame + trx_undo_page_get_start(block, page_no, offset)
+ ? nullptr
+ : block->frame + mach_read_from_2(rec - 2);
+}
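+
+/* Backward navigation assumed by the accessors above (an informal
+reading of this code, not a normative format description): each undo
+record ends with a 2-byte offset holding the record's own start, so the
+2 bytes just before a record start (rec - 2) yield the start of the
+preceding record, and the 2 bytes just before the end offset yield the
+start of the last record on the page. */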
+
+/** Get the previous record in an undo log.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+ uint16_t offset, bool shared, mtr_t *mtr)
+{
+ if (trx_undo_rec_t *prev= trx_undo_page_get_prev_rec(block,
+ block->frame + rec,
+ page_no, offset))
+ return prev;
+
+ /* We have to go to the previous undo log page to look for the
+ previous record */
+
+ return trx_undo_get_prev_rec_from_prev_page(block, rec, page_no, offset,
+ shared, mtr);
+}
+
+/** Get the next record in an undo log from the next page.
+@param[in,out] block undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+static trx_undo_rec_t*
+trx_undo_get_next_rec_from_next_page(buf_block_t *&block, uint32_t page_no,
+ uint16_t offset, ulint mode, mtr_t *mtr)
+{
+ if (page_no == block->page.id().page_no() &&
+ mach_read_from_2(block->frame + offset + TRX_UNDO_NEXT_LOG))
+ return NULL;
+
+ uint32_t next= flst_get_next_addr(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
+ block->frame).page;
+ if (next == FIL_NULL)
+ return NULL;
+
+ block= buf_page_get(page_id_t(block->page.id().space(), next), 0, mode, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ return trx_undo_page_get_first_rec(block, page_no, offset);
+}
+
+/** Get the next record in an undo log.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_next_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+ uint16_t offset, mtr_t *mtr)
+{
+ if (trx_undo_rec_t *next= trx_undo_page_get_next_rec(block, rec, page_no,
+ offset))
+ return next;
+
+ return trx_undo_get_next_rec_from_next_page(block, page_no, offset,
+ RW_S_LATCH, mtr);
+}
+
+/** Get the first record in an undo log.
+@param[in] space undo log header space
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH
+@param[out] block undo log page
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
+ uint16_t offset, ulint mode, buf_block_t*& block,
+ mtr_t *mtr)
+{
+ block = buf_page_get(page_id_t(space.id, page_no), 0, mode, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(block, page_no, offset))
+ return rec;
+
+ return trx_undo_get_next_rec_from_next_page(block, page_no, offset, mode,
+ mtr);
+}
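+
+/* Iteration sketch (hypothetical caller; assumes "space" points to the
+undo tablespace, hdr_page_no/hdr_offset identify the undo log header,
+and page_offset() maps a record pointer back to its page offset):
+
+	buf_block_t*	block;
+	mtr_t		mtr;
+	mtr.start();
+	for (trx_undo_rec_t* rec = trx_undo_get_first_rec(
+		     *space, hdr_page_no, hdr_offset, RW_S_LATCH,
+		     block, &mtr);
+	     rec != NULL;
+	     rec = trx_undo_get_next_rec(block, page_offset(rec),
+					 hdr_page_no, hdr_offset, &mtr)) {
+		// process one undo log record
+	}
+	mtr.commit();
+*/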
+
+/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/
+
+/** Initialize an undo log page.
+NOTE: This corresponds to a redo log record and must not be changed!
+@see mtr_t::undo_create()
+@param[in,out] block undo log page */
+void trx_undo_page_init(const buf_block_t &block)
+{
+ mach_write_to_2(my_assume_aligned<2>(FIL_PAGE_TYPE + block.frame),
+ FIL_PAGE_UNDO_LOG);
+ static_assert(TRX_UNDO_PAGE_HDR == FIL_PAGE_DATA, "compatibility");
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + block.frame,
+ 0, 2);
+ mach_write_to_2(my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.frame),
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ memcpy_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.frame,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.frame, 2);
+ /* The following corresponds to flst_zero_both(), but without writing log. */
+ memset_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+ FIL_ADDR_PAGE + block.frame, 0xff, 4);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+ FIL_ADDR_BYTE + block.frame, 0, 2);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
+ FIL_ADDR_PAGE + block.frame, 0xff, 4);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
+ FIL_ADDR_BYTE + block.frame, 0, 2);
+ static_assert(TRX_UNDO_PAGE_NODE + FLST_NEXT + FIL_ADDR_BYTE + 2 ==
+ TRX_UNDO_PAGE_HDR_SIZE, "compatibility");
+ /* Preserve TRX_UNDO_SEG_HDR, but clear the rest of the page. */
+ memset_aligned<2>(TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE + block.frame, 0,
+ srv_page_size - (TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE +
+ FIL_PAGE_DATA_END));
+}
+
+/** Look for a free slot for an undo log segment.
+@param rseg_header rollback segment header
+@return slot index
+@retval ULINT_UNDEFINED if not found */
+static ulint trx_rsegf_undo_find_free(const buf_block_t *rseg_header)
+{
+ ulint max_slots= TRX_RSEG_N_SLOTS;
+
+#ifdef UNIV_DEBUG
+ if (trx_rseg_n_slots_debug)
+ max_slots= std::min<ulint>(trx_rseg_n_slots_debug, TRX_RSEG_N_SLOTS);
+#endif
+
+ for (ulint i= 0; i < max_slots; i++)
+ if (trx_rsegf_get_nth_undo(rseg_header, i) == FIL_NULL)
+ return i;
+
+ return ULINT_UNDEFINED;
+}
+
+/** Create an undo log segment.
+@param[in,out] space tablespace
+@param[in,out] rseg_hdr rollback segment header (x-latched)
+@param[out] id undo slot number
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return undo log block
+@retval NULL on failure */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+buf_block_t*
+trx_undo_seg_create(fil_space_t *space, buf_block_t *rseg_hdr, ulint *id,
+ dberr_t *err, mtr_t *mtr)
+{
+ buf_block_t* block;
+ uint32_t n_reserved;
+ bool success;
+
+ const ulint slot_no = trx_rsegf_undo_find_free(rseg_hdr);
+
+ if (slot_no == ULINT_UNDEFINED) {
+ ib::warn() << "Cannot find a free slot for an undo log. Do"
+ " you have too many active transactions running"
+ " concurrently?";
+
+ *err = DB_TOO_MANY_CONCURRENT_TRXS;
+ return NULL;
+ }
+
+ ut_ad(slot_no < TRX_RSEG_N_SLOTS);
+
+ success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO,
+ mtr);
+ if (!success) {
+ *err = DB_OUT_OF_FILE_SPACE;
+ return NULL;
+ }
+
+ /* Allocate a new file segment for the undo log */
+ block = fseg_create(space, TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER,
+ mtr, true);
+
+ space->release_free_extents(n_reserved);
+
+ if (block == NULL) {
+ *err = DB_OUT_OF_FILE_SPACE;
+ return NULL;
+ }
+
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ mtr->undo_create(*block);
+ trx_undo_page_init(*block);
+
+ mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + block->frame,
+ TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block,
+ TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+ + block->frame, 0U);
+
+ flst_init(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame,
+ mtr);
+
+ flst_add_last(block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+
+ *id = slot_no;
+ mtr->write<4>(*rseg_hdr, TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+ + slot_no * TRX_RSEG_SLOT_SIZE + rseg_hdr->frame,
+ block->page.id().page_no());
+
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
+
+ *err = DB_SUCCESS;
+ return block;
+}
+
+/** Initialize an undo log header.
+@param[in,out] undo_page undo log segment header page
+@param[in] trx_id transaction identifier
+@param[in,out] mtr mini-transaction
+@return header byte offset on page */
+static uint16_t trx_undo_header_create(buf_block_t *undo_page, trx_id_t trx_id,
+ mtr_t* mtr)
+{
+ /* Reset the TRX_UNDO_PAGE_TYPE in case this page is being
+ repurposed after upgrading to MariaDB 10.3. */
+ byte *undo_type= my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + undo_page->frame);
+ ut_ad(mach_read_from_2(undo_type) <= 2);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_type, 0U);
+ byte *start= my_assume_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START +
+ undo_page->frame);
+ const uint16_t free= mach_read_from_2(start + 2);
+ static_assert(TRX_UNDO_PAGE_START + 2 == TRX_UNDO_PAGE_FREE,
+ "compatibility");
+ ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < srv_page_size - 100);
+
+ mach_write_to_2(start, free + TRX_UNDO_LOG_XA_HDR_SIZE);
+ /* In the redo log, a WRITE record of 2 bytes is never longer than a
+ MEMMOVE record, so logging WRITE 2+2 bytes beats WRITE+MEMMOVE.
+ But a MEMSET record with a 2-byte pattern is 1 byte shorter still,
+ hence the mtr->memset() below. */
+ memcpy_aligned<2>(start + 2, start, 2);
+ mtr->memset(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START, 4,
+ start, 2);
+ uint16_t prev_log= mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
+ undo_page->frame);
+ alignas(4) byte buf[4];
+ mach_write_to_2(buf, TRX_UNDO_ACTIVE);
+ mach_write_to_2(buf + 2, free);
+ static_assert(TRX_UNDO_STATE + 2 == TRX_UNDO_LAST_LOG, "compatibility");
+ static_assert(!((TRX_UNDO_SEG_HDR + TRX_UNDO_STATE) % 4), "alignment");
+ mtr->memcpy(*undo_page, my_assume_aligned<4>
+ (TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page->frame),
+ buf, 4);
+ if (prev_log)
+ mtr->write<2>(*undo_page, prev_log + TRX_UNDO_NEXT_LOG + undo_page->frame,
+ free);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_TRX_ID +
+ undo_page->frame, trx_id);
+ /* Write TRX_UNDO_NEEDS_PURGE=1 and TRX_UNDO_LOG_START. */
+ mach_write_to_2(buf, 1);
+ memcpy_aligned<2>(buf + 2, start, 2);
+ static_assert(TRX_UNDO_NEEDS_PURGE + 2 == TRX_UNDO_LOG_START,
+ "compatibility");
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_NEEDS_PURGE +
+ undo_page->frame, buf, 4);
+ /* Initialize all fields from TRX_UNDO_XID_EXISTS
+ through TRX_UNDO_HISTORY_NODE. */
+ if (prev_log)
+ {
+ mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+ TRX_UNDO_PREV_LOG - TRX_UNDO_XID_EXISTS, 0);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_PREV_LOG +
+ undo_page->frame, prev_log);
+ static_assert(TRX_UNDO_PREV_LOG + 2 == TRX_UNDO_HISTORY_NODE,
+ "compatibility");
+ mtr->memset(undo_page, free + TRX_UNDO_HISTORY_NODE, FLST_NODE_SIZE, 0);
+ static_assert(TRX_UNDO_LOG_OLD_HDR_SIZE == TRX_UNDO_HISTORY_NODE +
+ FLST_NODE_SIZE, "compatibility");
+ }
+ else
+ mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+ TRX_UNDO_LOG_OLD_HDR_SIZE - TRX_UNDO_XID_EXISTS, 0);
+ return free;
+}
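+
+/* For reference, the undo log header written above is laid out as
+follows (field sizes in bytes, in the order defined in trx0undo.h):
+TRX_UNDO_TRX_ID 8, TRX_UNDO_TRX_NO 8, TRX_UNDO_NEEDS_PURGE 2,
+TRX_UNDO_LOG_START 2, TRX_UNDO_XID_EXISTS 1, TRX_UNDO_DICT_TRANS 1,
+TRX_UNDO_TABLE_ID 8, TRX_UNDO_NEXT_LOG 2, TRX_UNDO_PREV_LOG 2,
+TRX_UNDO_HISTORY_NODE FLST_NODE_SIZE (12), up to
+TRX_UNDO_LOG_OLD_HDR_SIZE; the optional X/Open XA fields extend the
+header to TRX_UNDO_LOG_XA_HDR_SIZE. */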
+
+/** Write X/Open XA Transaction Identifier (XID) to undo log header
+@param[in,out] block undo header page
+@param[in] offset undo header record offset
+@param[in] xid distributed transaction identifier
+@param[in,out] mtr mini-transaction */
+static void trx_undo_write_xid(buf_block_t *block, uint16_t offset,
+ const XID &xid, mtr_t *mtr)
+{
+ DBUG_ASSERT(xid.gtrid_length > 0);
+ DBUG_ASSERT(xid.bqual_length >= 0);
+ DBUG_ASSERT(xid.gtrid_length <= MAXGTRIDSIZE);
+ DBUG_ASSERT(xid.bqual_length <= MAXBQUALSIZE);
+ static_assert(MAXGTRIDSIZE + MAXBQUALSIZE == XIDDATASIZE,
+ "gtrid and bqual don't fit xid data");
+ DBUG_ASSERT(mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
+ block->frame) == offset);
+
+ trx_ulogf_t* log_hdr= block->frame + offset;
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_FORMAT,
+ static_cast<uint32_t>(xid.formatID));
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_TRID_LEN,
+ static_cast<uint32_t>(xid.gtrid_length));
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_BQUAL_LEN,
+ static_cast<uint32_t>(xid.bqual_length));
+ const ulint xid_length= static_cast<ulint>(xid.gtrid_length
+ + xid.bqual_length);
+ mtr->memcpy(*block, &block->frame[offset + TRX_UNDO_XA_XID],
+ xid.data, xid_length);
+ if (UNIV_LIKELY(xid_length < XIDDATASIZE))
+ mtr->memset(block, offset + TRX_UNDO_XA_XID + xid_length,
+ XIDDATASIZE - xid_length, 0);
+}
+
+/********************************************************************//**
+Read X/Open XA Transaction Identifier (XID) from undo log header */
+static
+void
+trx_undo_read_xid(const trx_ulogf_t* log_hdr, XID* xid)
+{
+ xid->formatID=static_cast<long>(mach_read_from_4(
+ log_hdr + TRX_UNDO_XA_FORMAT));
+
+ xid->gtrid_length=static_cast<long>(mach_read_from_4(
+ log_hdr + TRX_UNDO_XA_TRID_LEN));
+
+ xid->bqual_length=static_cast<long>(mach_read_from_4(
+ log_hdr + TRX_UNDO_XA_BQUAL_LEN));
+
+ memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE);
+}
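+
+/* On disk, the XID occupies the XA extension of the undo log header:
+4 bytes TRX_UNDO_XA_FORMAT, 4 bytes TRX_UNDO_XA_TRID_LEN, 4 bytes
+TRX_UNDO_XA_BQUAL_LEN, and XIDDATASIZE (128) bytes of xid.data,
+zero-padded by trx_undo_write_xid() when the XID is shorter. */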
+
+/** Allocate an undo log page.
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction that does not hold any page latch
+@return X-latched block if success
+@retval NULL on failure */
+buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr)
+{
+ trx_rseg_t* rseg = undo->rseg;
+ buf_block_t* new_block = NULL;
+ uint32_t n_reserved;
+
+ /* When we add a page to an undo log, this is analogous to
+ a pessimistic insert in a B-tree, and we must reserve the
+ counterpart of the tree latch, which is the rseg mutex. */
+
+ mutex_enter(&rseg->mutex);
+
+ buf_block_t* header_block = trx_undo_page_get(
+ page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
+
+ if (!fsp_reserve_free_extents(&n_reserved, undo->rseg->space, 1,
+ FSP_UNDO, mtr)) {
+ goto func_exit;
+ }
+
+ new_block = fseg_alloc_free_page_general(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + header_block->frame,
+ undo->top_page_no + 1, FSP_UP, true, mtr, mtr);
+
+ rseg->space->release_free_extents(n_reserved);
+
+ if (!new_block) {
+ goto func_exit;
+ }
+
+ ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+ buf_block_dbg_add_level(new_block, SYNC_TRX_UNDO_PAGE);
+ undo->last_page_no = new_block->page.id().page_no();
+
+ mtr->undo_create(*new_block);
+ trx_undo_page_init(*new_block);
+
+ flst_add_last(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ new_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+ undo->size++;
+ rseg->curr_size++;
+
+func_exit:
+ mutex_exit(&rseg->mutex);
+ return(new_block);
+}
+
+/********************************************************************//**
+Frees an undo log page that is not the header page.
+@return last page number in remaining log */
+static
+uint32_t
+trx_undo_free_page(
+/*===============*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ bool in_history, /*!< in: true if the undo log is in the history
+ list */
+ uint32_t hdr_page_no, /*!< in: header page number */
+ uint32_t page_no, /*!< in: page number to free: must not be the
+ header page */
+ mtr_t* mtr) /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+{
+ const ulint space = rseg->space->id;
+
+ ut_a(hdr_page_no != page_no);
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ buf_block_t* undo_block = trx_undo_page_get(page_id_t(space, page_no),
+ mtr);
+ buf_block_t* header_block = trx_undo_page_get(page_id_t(space,
+ hdr_page_no),
+ mtr);
+
+ flst_remove(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+
+ fseg_free_page(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + header_block->frame,
+ rseg->space, page_no, mtr);
+ buf_page_free(rseg->space, page_no, mtr, __FILE__, __LINE__);
+
+ const fil_addr_t last_addr = flst_get_last(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + header_block->frame);
+ rseg->curr_size--;
+
+ if (in_history) {
+ buf_block_t* rseg_header = trx_rsegf_get(
+ rseg->space, rseg->page_no, mtr);
+ byte* rseg_hist_size = TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rseg_header->frame;
+ uint32_t hist_size = mach_read_from_4(rseg_hist_size);
+ ut_ad(hist_size > 0);
+ mtr->write<4>(*rseg_header, rseg_hist_size, hist_size - 1);
+ }
+
+ return(last_addr.page);
+}
+
+/** Free the last undo log page. The caller must hold the rseg mutex.
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction that does not hold any undo log page
+ or that has allocated the undo log page */
+void
+trx_undo_free_last_page(trx_undo_t* undo, mtr_t* mtr)
+{
+ ut_ad(undo->hdr_page_no != undo->last_page_no);
+ ut_ad(undo->size > 0);
+
+ undo->last_page_no = trx_undo_free_page(
+ undo->rseg, false, undo->hdr_page_no, undo->last_page_no, mtr);
+
+ undo->size--;
+}
+
+/** Truncate the tail of an undo log during rollback.
+@param[in,out] undo undo log
+@param[in] limit all undo records with undo number >= limit are discarded
+@param[in] is_temp whether this is temporary undo log */
+void trx_undo_truncate_end(trx_undo_t& undo, undo_no_t limit, bool is_temp)
+{
+ mtr_t mtr;
+ ut_ad(is_temp == !undo.rseg->is_persistent());
+
+ for (;;) {
+ mtr.start();
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ trx_undo_rec_t* trunc_here = NULL;
+ mutex_enter(&undo.rseg->mutex);
+ buf_block_t* undo_block = trx_undo_page_get(
+ page_id_t(undo.rseg->space->id, undo.last_page_no),
+ &mtr);
+ trx_undo_rec_t* rec = trx_undo_page_get_last_rec(
+ undo_block, undo.hdr_page_no, undo.hdr_offset);
+ while (rec) {
+ if (trx_undo_rec_get_undo_no(rec) < limit) {
+ goto func_exit;
+ }
+ /* Truncate at least this record off, maybe more */
+ trunc_here = rec;
+
+ rec = trx_undo_page_get_prev_rec(undo_block, rec,
+ undo.hdr_page_no,
+ undo.hdr_offset);
+ }
+
+ if (undo.last_page_no != undo.hdr_page_no) {
+ trx_undo_free_last_page(&undo, &mtr);
+ mutex_exit(&undo.rseg->mutex);
+ mtr.commit();
+ continue;
+ }
+
+func_exit:
+ mutex_exit(&undo.rseg->mutex);
+
+ if (trunc_here) {
+ mtr.write<2>(*undo_block,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + undo_block->frame,
+ ulint(trunc_here - undo_block->frame));
+ }
+
+ mtr.commit();
+ return;
+ }
+}
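+
+/* A worked example of the loop above: if the undo log occupies pages
+H (header), P1, P2 and limit falls inside P1, the first iteration finds
+no record below limit on P2 and frees that page; the second iteration
+stops at the last record below limit on P1 and truncates the rest by
+moving TRX_UNDO_PAGE_FREE back to the first record to be discarded. */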
+
+/** Truncate the head of an undo log.
+NOTE that only whole pages are freed; the header page is not
+freed, but emptied, if all the records there are below the limit.
+@param[in,out] rseg rollback segment
+@param[in] hdr_page_no header page number
+@param[in] hdr_offset header offset on the page
+@param[in] limit first undo number to preserve
+(everything below the limit will be truncated) */
+void
+trx_undo_truncate_start(
+ trx_rseg_t* rseg,
+ uint32_t hdr_page_no,
+ uint16_t hdr_offset,
+ undo_no_t limit)
+{
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* last_rec;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (!limit) {
+ return;
+ }
+loop:
+ mtr_start(&mtr);
+
+ if (!rseg->is_persistent()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ buf_block_t* undo_page;
+ rec = trx_undo_get_first_rec(*rseg->space, hdr_page_no, hdr_offset,
+ RW_X_LATCH, undo_page, &mtr);
+ if (rec == NULL) {
+ /* Already empty */
+done:
+ mtr.commit();
+ return;
+ }
+
+ last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no,
+ hdr_offset);
+ if (trx_undo_rec_get_undo_no(last_rec) >= limit) {
+ goto done;
+ }
+
+ if (undo_page->page.id().page_no() == hdr_page_no) {
+ uint16_t end = mach_read_from_2(hdr_offset + TRX_UNDO_NEXT_LOG
+ + undo_page->frame);
+ if (end == 0) {
+ end = mach_read_from_2(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + undo_page->frame);
+ }
+
+ mtr.write<2>(*undo_page, undo_page->frame + hdr_offset
+ + TRX_UNDO_LOG_START, end);
+ } else {
+ trx_undo_free_page(rseg, true, hdr_page_no,
+ undo_page->page.id().page_no(), &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ goto loop;
+}
+
+/** Frees an undo log segment which is not in the history list.
+@param undo temporary undo log */
+static void trx_undo_seg_free(const trx_undo_t *undo)
+{
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
+
+ trx_rseg_t* const rseg = undo->rseg;
+ bool finished;
+ mtr_t mtr;
+ ut_ad(rseg->space == fil_system.temp_space);
+
+ do {
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(SRV_TMP_SPACE_ID, undo->hdr_page_no), &mtr);
+
+ fseg_header_t* file_seg = TRX_UNDO_SEG_HDR
+ + TRX_UNDO_FSEG_HEADER + block->frame;
+
+ finished = fseg_free_step(file_seg, &mtr);
+
+ if (finished) {
+ /* Update the rseg header */
+ buf_block_t* rseg_header = trx_rsegf_get(
+ rseg->space, rseg->page_no, &mtr);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ memset(TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+ + undo->id * TRX_RSEG_SLOT_SIZE +
+ rseg_header->frame, 0xff, 4);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
+ }
+
+ mtr.commit();
+ } while (!finished);
+}
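+
+/* fseg_free_step() frees at most one page per call, so the loop above
+releases the segment in a sequence of small mini-transactions rather
+than one unbounded mini-transaction. */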
+
+/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/
+
+/** Read an undo log when starting up the database.
+@param[in,out] rseg rollback segment
+@param[in] id rollback segment slot
+@param[in] page_no undo log segment page number
+@param[in,out] max_trx_id the largest observed transaction ID
+@return the undo log
+@retval nullptr on error */
+trx_undo_t *
+trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no,
+ trx_id_t &max_trx_id)
+{
+ mtr_t mtr;
+ XID xid;
+
+ ut_ad(id < TRX_RSEG_N_SLOTS);
+
+ mtr.start();
+ const buf_block_t* block = trx_undo_page_get(
+ page_id_t(rseg->space->id, page_no), &mtr);
+ const uint16_t type = mach_read_from_2(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE
+ + block->frame);
+ if (UNIV_UNLIKELY(type > 2)) {
+corrupted_type:
+ sql_print_error("InnoDB: unsupported undo header type %u",
+ type);
+corrupted:
+ mtr.commit();
+ return nullptr;
+ }
+
+ uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+ + block->frame);
+ if (offset < TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE ||
+ offset >= srv_page_size - TRX_UNDO_LOG_OLD_HDR_SIZE) {
+ sql_print_error("InnoDB: invalid undo header offset %u",
+ offset);
+ goto corrupted;
+ }
+
+ const trx_ulogf_t* const undo_header = block->frame + offset;
+ uint16_t state = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->frame);
+ switch (state) {
+ case TRX_UNDO_ACTIVE:
+ case TRX_UNDO_PREPARED:
+ if (UNIV_LIKELY(type != 1)) {
+ break;
+ }
+ sql_print_error("InnoDB: upgrade from older version than"
+ " MariaDB 10.3 requires clean shutdown");
+ goto corrupted;
+ default:
+ sql_print_error("InnoDB: unsupported undo header state %u",
+ state);
+ goto corrupted;
+ case TRX_UNDO_TO_PURGE:
+ if (UNIV_UNLIKELY(type == 1)) {
+ goto corrupted_type;
+ }
+ /* fall through */
+ case TRX_UNDO_CACHED:
+ trx_id_t id = mach_read_from_8(TRX_UNDO_TRX_NO + undo_header);
+ if (id >> 48) {
+ sql_print_error("InnoDB: corrupted TRX_NO %llx", id);
+ goto corrupted;
+ }
+ if (id > max_trx_id) {
+ max_trx_id = id;
+ }
+ }
+
+ /* Read X/Open XA transaction identification if it exists, or
+ set it to NULL. */
+
+ if (undo_header[TRX_UNDO_XID_EXISTS]) {
+ trx_undo_read_xid(undo_header, &xid);
+ } else {
+ xid.null();
+ }
+
+ trx_id_t trx_id = mach_read_from_8(undo_header + TRX_UNDO_TRX_ID);
+ if (trx_id >> 48) {
+ sql_print_error("InnoDB: corrupted TRX_ID %llx", trx_id);
+ goto corrupted;
+ }
+ if (trx_id > max_trx_id) {
+ max_trx_id = trx_id;
+ }
+
+ mutex_enter(&rseg->mutex);
+ trx_undo_t* undo = trx_undo_mem_create(
+ rseg, id, trx_id, &xid, page_no, offset);
+ mutex_exit(&rseg->mutex);
+ if (!undo) {
+ return undo;
+ }
+
+ undo->dict_operation = undo_header[TRX_UNDO_DICT_TRANS];
+ undo->table_id = mach_read_from_8(undo_header + TRX_UNDO_TABLE_ID);
+ undo->size = flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
+ + block->frame);
+
+ fil_addr_t last_addr = flst_get_last(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame);
+
+ undo->last_page_no = last_addr.page;
+ undo->top_page_no = last_addr.page;
+
+ const buf_block_t* last = trx_undo_page_get(
+ page_id_t(rseg->space->id, undo->last_page_no), &mtr);
+
+ if (const trx_undo_rec_t* rec = trx_undo_page_get_last_rec(
+ last, page_no, offset)) {
+ undo->top_offset = static_cast<uint16_t>(rec - last->frame);
+ undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
+ ut_ad(!undo->empty());
+ } else {
+ undo->top_undo_no = IB_ID_MAX;
+ ut_ad(undo->empty());
+ }
+
+ undo->state = state;
+
+ if (state != TRX_UNDO_CACHED) {
+ UT_LIST_ADD_LAST(rseg->undo_list, undo);
+ } else {
+ UT_LIST_ADD_LAST(rseg->undo_cached, undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ }
+
+ mtr.commit();
+ return undo;
+}
+
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open transaction identification */
+ uint32_t page_no,/*!< in: undo log header page number */
+ uint16_t offset) /*!< in: undo log header byte offset on page */
+{
+ trx_undo_t* undo;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ ut_a(id < TRX_RSEG_N_SLOTS);
+
+ undo = static_cast<trx_undo_t*>(ut_malloc_nokey(sizeof(*undo)));
+
+ if (undo == NULL) {
+
+ return(NULL);
+ }
+
+ undo->id = id;
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->rseg = rseg;
+
+ undo->hdr_page_no = page_no;
+ undo->hdr_offset = offset;
+ undo->last_page_no = page_no;
+ undo->size = 1;
+
+ undo->top_undo_no = IB_ID_MAX;
+ undo->top_page_no = page_no;
+ undo->guess_block = NULL;
+ ut_ad(undo->empty());
+
+ return(undo);
+}
+
+/********************************************************************//**
+Initializes a cached undo log object for new use. */
+static
+void
+trx_undo_mem_init_for_reuse(
+/*========================*/
+ trx_undo_t* undo, /*!< in: undo log to init */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ uint16_t offset) /*!< in: undo log header byte offset on page */
+{
+ ut_ad(mutex_own(&((undo->rseg)->mutex)));
+
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->hdr_offset = offset;
+ undo->top_undo_no = IB_ID_MAX;
+ ut_ad(undo->empty());
+}
+
+/** Create an undo log.
+@param[in,out] trx transaction
+@param[in,out] rseg rollback segment
+@param[out] undo undo log object
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return undo log block
+@retval NULL on failure */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+buf_block_t*
+trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
+ dberr_t* err, mtr_t* mtr)
+{
+ ulint id;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ buf_block_t* block = trx_undo_seg_create(
+ rseg->space,
+ trx_rsegf_get(rseg->space, rseg->page_no, mtr), &id, err, mtr);
+
+ if (!block) {
+ return NULL;
+ }
+
+ rseg->curr_size++;
+
+ uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
+
+ *undo = trx_undo_mem_create(rseg, id, trx->id, trx->xid,
+ block->page.id().page_no(), offset);
+ if (*undo == NULL) {
+ *err = DB_OUT_OF_MEMORY;
+ /* FIXME: this will not free the undo block to the file */
+ return NULL;
+ } else if (rseg != trx->rsegs.m_redo.rseg) {
+ return block;
+ }
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ break;
+ case TRX_DICT_OP_INDEX:
+ /* Do not discard the table on recovery. */
+ trx->table_id = 0;
+ /* fall through */
+ case TRX_DICT_OP_TABLE:
+ (*undo)->table_id = trx->table_id;
+ (*undo)->dict_operation = TRUE;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ + TRX_UNDO_DICT_TRANS, 1U);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ + TRX_UNDO_TABLE_ID,
+ trx->table_id);
+ }
+
+ *err = DB_SUCCESS;
+ return block;
+}
+
+/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/
+
+/** Reuse a cached undo log block.
+@param[in,out] trx transaction
+@param[in,out] rseg rollback segment
+@param[out] pundo the undo log memory object
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL if none cached */
+static
+buf_block_t*
+trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo,
+ mtr_t* mtr)
+{
+ ut_ad(mutex_own(&rseg->mutex));
+
+ trx_undo_t* undo = UT_LIST_GET_FIRST(rseg->undo_cached);
+ if (!undo) {
+ return NULL;
+ }
+
+ ut_ad(undo->size == 1);
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t* block = buf_page_get(page_id_t(undo->rseg->space->id,
+ undo->hdr_page_no),
+ 0, RW_X_LATCH, mtr);
+ if (!block) {
+ return NULL;
+ }
+
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ UT_LIST_REMOVE(rseg->undo_cached, undo);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+
+ *pundo = undo;
+
+ uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
+
+ trx_undo_mem_init_for_reuse(undo, trx->id, trx->xid, offset);
+
+ if (rseg != trx->rsegs.m_redo.rseg) {
+ return block;
+ }
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ return block;
+ case TRX_DICT_OP_INDEX:
+ /* Do not discard the table on recovery. */
+ trx->table_id = 0;
+ /* fall through */
+ case TRX_DICT_OP_TABLE:
+ undo->table_id = trx->table_id;
+ undo->dict_operation = TRUE;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ + TRX_UNDO_DICT_TRANS, 1U);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ + TRX_UNDO_TABLE_ID,
+ trx->table_id);
+ }
+
+ return block;
+}
+
+/** Assign an undo log for a persistent transaction.
+A new undo log is created or a cached undo log reused.
+@param[in,out] trx transaction
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL on error */
+buf_block_t*
+trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr)
+{
+ ut_ad(mtr->get_log_mode() == MTR_LOG_ALL);
+
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+
+ if (undo) {
+ return buf_page_get_gen(
+ page_id_t(undo->rseg->space->id, undo->last_page_no),
+ 0, RW_X_LATCH, undo->guess_block,
+ BUF_GET, __FILE__, __LINE__, mtr, err);
+ }
+
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+
+ mutex_enter(&rseg->mutex);
+ buf_block_t* block = trx_undo_reuse_cached(
+ trx, rseg, &trx->rsegs.m_redo.undo, mtr);
+
+ if (!block) {
+ block = trx_undo_create(trx, rseg, &trx->rsegs.m_redo.undo,
+ err, mtr);
+ ut_ad(!block == (*err != DB_SUCCESS));
+ if (!block) {
+ goto func_exit;
+ }
+ } else {
+ *err = DB_SUCCESS;
+ }
+
+ UT_LIST_ADD_FIRST(rseg->undo_list, trx->rsegs.m_redo.undo);
+
+func_exit:
+ mutex_exit(&rseg->mutex);
+ return block;
+}
+
+/** Assign an undo log for a transaction.
+A new undo log is created or a cached undo log reused.
+@param[in,out] trx transaction
+@param[in] rseg rollback segment
+@param[out] undo the undo log
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL on error */
+buf_block_t*
+trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
+ dberr_t* err, mtr_t* mtr)
+{
+ const bool is_temp __attribute__((unused)) = rseg == trx->rsegs.m_noredo.rseg;
+
+ ut_ad(rseg == trx->rsegs.m_redo.rseg
+ || rseg == trx->rsegs.m_noredo.rseg);
+ ut_ad(undo == (is_temp
+ ? &trx->rsegs.m_noredo.undo
+ : &trx->rsegs.m_redo.undo));
+ ut_ad(mtr->get_log_mode()
+ == (is_temp ? MTR_LOG_NO_REDO : MTR_LOG_ALL));
+
+ if (*undo) {
+ return buf_page_get_gen(
+ page_id_t(rseg->space->id, (*undo)->last_page_no),
+ 0, RW_X_LATCH, (*undo)->guess_block,
+ BUF_GET, __FILE__, __LINE__, mtr, err);
+ }
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_too_many_trx",
+ *err = DB_TOO_MANY_CONCURRENT_TRXS; return NULL;
+ );
+
+ mutex_enter(&rseg->mutex);
+
+ buf_block_t* block = trx_undo_reuse_cached(trx, rseg, undo, mtr);
+
+ if (!block) {
+ block = trx_undo_create(trx, rseg, undo, err, mtr);
+ ut_ad(!block == (*err != DB_SUCCESS));
+ if (!block) {
+ goto func_exit;
+ }
+ } else {
+ *err = DB_SUCCESS;
+ }
+
+ UT_LIST_ADD_FIRST(rseg->undo_list, *undo);
+
+func_exit:
+ mutex_exit(&rseg->mutex);
+ return block;
+}
+
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction finish.
+@return undo log segment header page, x-latched */
+buf_block_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
+
+ const uint16_t state = undo->size == 1
+ && TRX_UNDO_PAGE_REUSE_LIMIT
+ > mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + block->frame)
+ ? TRX_UNDO_CACHED
+ : TRX_UNDO_TO_PURGE;
+
+ undo->state = state;
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->frame, state);
+ return block;
+}
+
+/** Set the state of the undo log segment at an XA PREPARE or XA ROLLBACK.
+@param[in,out] trx transaction
+@param[in,out] undo undo log
+@param[in] rollback false=XA PREPARE, true=XA ROLLBACK
+@param[in,out] mtr mini-transaction */
+void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback,
+ mtr_t *mtr)
+{
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
+
+ if (rollback) {
+ ut_ad(undo->state == TRX_UNDO_PREPARED);
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->frame, TRX_UNDO_ACTIVE);
+ return;
+ }
+
+ /*------------------------------*/
+ ut_ad(undo->state == TRX_UNDO_ACTIVE);
+ undo->state = TRX_UNDO_PREPARED;
+ undo->xid = *trx->xid;
+ /*------------------------------*/
+
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->frame,
+ undo->state);
+ uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+ + block->frame);
+ mtr->write<1>(*block, block->frame + offset + TRX_UNDO_XID_EXISTS, 1U);
+
+ trx_undo_write_xid(block, offset, undo->xid, mtr);
+}
+
+/** Free temporary undo log after commit or rollback.
+The information is not needed after a commit or rollback, therefore
+the data can be discarded.
+@param undo temporary undo log */
+void trx_undo_commit_cleanup(trx_undo_t *undo)
+{
+ trx_rseg_t* rseg = undo->rseg;
+ ut_ad(rseg->space == fil_system.temp_space);
+
+ mutex_enter(&rseg->mutex);
+
+ UT_LIST_REMOVE(rseg->undo_list, undo);
+
+ if (undo->state == TRX_UNDO_CACHED) {
+ UT_LIST_ADD_FIRST(rseg->undo_cached, undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_PURGE);
+
+ /* Delete first the undo log segment in the file */
+ trx_undo_seg_free(undo);
+
+ ut_ad(rseg->curr_size > undo->size);
+ rseg->curr_size -= undo->size;
+
+ ut_free(undo);
+ }
+
+ mutex_exit(&rseg->mutex);
+}
+
+/** At shutdown, frees the undo logs of a transaction. */
+void trx_undo_free_at_shutdown(trx_t *trx)
+{
+ if (trx_undo_t*& undo = trx->rsegs.m_redo.undo) {
+ switch (undo->state) {
+ case TRX_UNDO_PREPARED:
+ break;
+ case TRX_UNDO_CACHED:
+ case TRX_UNDO_TO_PURGE:
+ ut_ad(trx_state_eq(trx,
+ TRX_STATE_COMMITTED_IN_MEMORY));
+ /* fall through */
+ case TRX_UNDO_ACTIVE:
+ /* trx_t::commit_state() assigns
+ trx->state = TRX_STATE_COMMITTED_IN_MEMORY. */
+ ut_a(!srv_was_started
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+ || srv_fast_shutdown);
+ break;
+ default:
+ ut_error;
+ }
+
+ UT_LIST_REMOVE(trx->rsegs.m_redo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo = NULL;
+ }
+ if (trx_undo_t*& undo = trx->rsegs.m_noredo.undo) {
+ ut_a(undo->state == TRX_UNDO_PREPARED);
+
+ UT_LIST_REMOVE(trx->rsegs.m_noredo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo = NULL;
+ }
+}
diff --git a/storage/innobase/ut/ut0dbg.cc b/storage/innobase/ut/ut0dbg.cc
new file mode 100644
index 00000000..fc51cce9
--- /dev/null
+++ b/storage/innobase/ut/ut0dbg.cc
@@ -0,0 +1,61 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2018, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*****************************************************************//**
+@file ut/ut0dbg.cc
+Debug utilities for Innobase.
+
+Created 1/30/1994 Heikki Tuuri
+**********************************************************************/
+
+#include "univ.i"
+#include "ut0dbg.h"
+
+/*************************************************************//**
+Report a failed assertion. */
+ATTRIBUTE_NORETURN
+void
+ut_dbg_assertion_failed(
+/*====================*/
+ const char* expr, /*!< in: the failed assertion (optional) */
+ const char* file, /*!< in: source file containing the assertion */
+ unsigned line) /*!< in: line number of the assertion */
+{
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Assertion failure in file %s line %u\n",
+ file, line);
+ if (expr) {
+ fprintf(stderr,
+ "InnoDB: Failing assertion: %s\n", expr);
+ }
+
+ fputs("InnoDB: We intentionally generate a memory trap.\n"
+ "InnoDB: Submit a detailed bug report"
+ " to https://jira.mariadb.org/\n"
+ "InnoDB: If you get repeated assertion failures"
+ " or crashes, even\n"
+ "InnoDB: immediately after the mysqld startup, there may be\n"
+ "InnoDB: corruption in the InnoDB tablespace. Please refer to\n"
+ "InnoDB: https://mariadb.com/kb/en/library/innodb-recovery-modes/\n"
+ "InnoDB: about forcing recovery.\n", stderr);
+
+ fflush(stderr);
+ fflush(stdout);
+ abort();
+}
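+
+/* ut_dbg_assertion_failed() is the failure path of the ut_a() assertion
+macro (and of ut_ad() in debug builds), which passes the stringified
+expression together with __FILE__ and __LINE__. */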
diff --git a/storage/innobase/ut/ut0list.cc b/storage/innobase/ut/ut0list.cc
new file mode 100644
index 00000000..370c18d4
--- /dev/null
+++ b/storage/innobase/ut/ut0list.cc
@@ -0,0 +1,151 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file ut/ut0list.cc
+A doubly linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+#include "ut0list.h"
+
+/****************************************************************//**
+Create a new list.
+@return list */
+ib_list_t*
+ib_list_create(void)
+/*=================*/
+{
+ return(static_cast<ib_list_t*>(ut_zalloc_nokey(sizeof(ib_list_t))));
+}
+
+/****************************************************************//**
+Free a list. */
+void
+ib_list_free(
+/*=========*/
+ ib_list_t* list) /*!< in: list */
+{
+ /* We don't check that the list is empty because it's entirely valid
+ to e.g. have all the nodes allocated from a single heap that is then
+ freed after the list itself is freed. */
+
+ ut_free(list);
+}
+
+/****************************************************************//**
+Add the data after the indicated node.
+@return new list node */
+static
+ib_list_node_t*
+ib_list_add_after(
+/*==============*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* prev_node, /*!< in: node preceding new node (can
+ be NULL) */
+ void* data, /*!< in: data */
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ ib_list_node_t* node;
+
+ node = static_cast<ib_list_node_t*>(
+ mem_heap_alloc(heap, sizeof(*node)));
+
+ node->data = data;
+
+ if (!list->first) {
+ /* Empty list. */
+
+ ut_a(!prev_node);
+
+ node->prev = NULL;
+ node->next = NULL;
+
+ list->first = node;
+ list->last = node;
+ } else if (!prev_node) {
+ /* Start of list. */
+
+ node->prev = NULL;
+ node->next = list->first;
+
+ list->first->prev = node;
+
+ list->first = node;
+ } else {
+ /* Middle or end of list. */
+
+ node->prev = prev_node;
+ node->next = prev_node->next;
+
+ prev_node->next = node;
+
+ if (node->next) {
+ node->next->prev = node;
+ } else {
+ list->last = node;
+ }
+ }
+
+ return(node);
+}
+
+/****************************************************************//**
+Add the data to the end of the list.
+@return new list node */
+ib_list_node_t*
+ib_list_add_last(
+/*=============*/
+ ib_list_t* list, /*!< in: list */
+ void* data, /*!< in: data */
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ return(ib_list_add_after(list, ib_list_get_last(list), data, heap));
+}
+
+/****************************************************************//**
+Remove the node from the list. */
+void
+ib_list_remove(
+/*===========*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* node) /*!< in: node to remove */
+{
+ if (node->prev) {
+ node->prev->next = node->next;
+ } else {
+ /* First item in list. */
+
+ ut_ad(list->first == node);
+
+ list->first = node->next;
+ }
+
+ if (node->next) {
+ node->next->prev = node->prev;
+ } else {
+ /* Last item in list. */
+
+ ut_ad(list->last == node);
+
+ list->last = node->prev;
+ }
+
+ node->prev = node->next = NULL;
+}
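+
+/* A minimal usage sketch: nodes are allocated from a caller-supplied
+heap, which is why ib_list_free() does not require an empty list:
+
+	mem_heap_t*	heap = mem_heap_create(128);
+	ib_list_t*	list = ib_list_create();
+	int		value = 42;
+
+	ib_list_node_t*	node = ib_list_add_last(list, &value, heap);
+	ib_list_remove(list, node);
+
+	ib_list_free(list);
+	mem_heap_free(heap);
+*/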
diff --git a/storage/innobase/ut/ut0mem.cc b/storage/innobase/ut/ut0mem.cc
new file mode 100644
index 00000000..faade827
--- /dev/null
+++ b/storage/innobase/ut/ut0mem.cc
@@ -0,0 +1,54 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file ut/ut0mem.cc
+Memory primitives
+
+Created 5/11/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0mem.h"
+
+/********************************************************************
+Concatenate 3 strings.*/
+char*
+ut_str3cat(
+/*=======*/
+ /* out, own: concatenated string, must be
+ freed with ut_free() */
+ const char* s1, /* in: string 1 */
+ const char* s2, /* in: string 2 */
+ const char* s3) /* in: string 3 */
+{
+ char* s;
+ ulint s1_len = strlen(s1);
+ ulint s2_len = strlen(s2);
+ ulint s3_len = strlen(s3);
+
+ s = static_cast<char*>(ut_malloc_nokey(s1_len + s2_len + s3_len + 1));
+
+ memcpy(s, s1, s1_len);
+ memcpy(s + s1_len, s2, s2_len);
+ memcpy(s + s1_len + s2_len, s3, s3_len);
+
+ s[s1_len + s2_len + s3_len] = '\0';
+
+ return(s);
+}
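+
+/* Usage sketch (dir and file stand for any NUL-terminated strings);
+the result is owned by the caller and must be freed with ut_free():
+
+	char*	path = ut_str3cat(dir, "/", file);
+	...
+	ut_free(path);
+*/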
diff --git a/storage/innobase/ut/ut0new.cc b/storage/innobase/ut/ut0new.cc
new file mode 100644
index 00000000..5e00a4ca
--- /dev/null
+++ b/storage/innobase/ut/ut0new.cc
@@ -0,0 +1,112 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ut/ut0new.cc
+Instrumented memory allocator.
+
+Created May 26, 2014 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include <algorithm>
+/** The total amount of memory currently allocated from the operating
+system with allocate_large(). */
+Atomic_counter<ulint> os_total_large_mem_allocated;
+
+/** Maximum number of retries to allocate memory. */
+const size_t alloc_max_retries = 60;
+
+/** Keys for registering allocations with performance schema.
+Keep this list alphabetically sorted. */
+#ifdef BTR_CUR_HASH_ADAPT
+PSI_memory_key mem_key_ahi;
+#endif /* BTR_CUR_HASH_ADAPT */
+PSI_memory_key mem_key_buf_buf_pool;
+PSI_memory_key mem_key_dict_stats_bg_recalc_pool_t;
+PSI_memory_key mem_key_dict_stats_index_map_t;
+PSI_memory_key mem_key_dict_stats_n_diff_on_level;
+PSI_memory_key mem_key_other;
+PSI_memory_key mem_key_row_log_buf;
+PSI_memory_key mem_key_row_merge_sort;
+PSI_memory_key mem_key_std;
+
+#ifdef UNIV_PFS_MEMORY
+
+/** Auxiliary array of performance schema 'PSI_memory_info'.
+Each allocation appears in
+performance_schema.memory_summary_global_by_event_name (and the like) in the form
+of e.g. 'memory/innodb/NAME' where the last component NAME is picked from
+the list below:
+1. If key is specified, then the respective name is used
+2. Without a specified key, allocations from inside std::* containers use
+ mem_key_std
+3. Without a specified key, allocations from outside std::* pick up the key
+ based on the file name, and if file name is not found in the predefined list
+ (in ut_new_boot()) then mem_key_other is used.
+Keep this list alphabetically sorted. */
+static PSI_memory_info pfs_info[] = {
+#ifdef BTR_CUR_HASH_ADAPT
+ {&mem_key_ahi, "adaptive hash index", 0},
+#endif /* BTR_CUR_HASH_ADAPT */
+ {&mem_key_buf_buf_pool, "buf_buf_pool", 0},
+ {&mem_key_dict_stats_bg_recalc_pool_t, "dict_stats_bg_recalc_pool_t", 0},
+ {&mem_key_dict_stats_index_map_t, "dict_stats_index_map_t", 0},
+ {&mem_key_dict_stats_n_diff_on_level, "dict_stats_n_diff_on_level", 0},
+ {&mem_key_other, "other", 0},
+ {&mem_key_row_log_buf, "row_log_buf", 0},
+ {&mem_key_row_merge_sort, "row_merge_sort", 0},
+ {&mem_key_std, "std", 0},
+};
+
+static const int NKEYS = static_cast<int>(UT_ARR_SIZE(auto_event_names)) - 1;
+static PSI_memory_key auto_event_keys[NKEYS];
+
+/** Setup the internal objects needed for UT_NEW() to operate.
+This must be called before the first call to UT_NEW(). */
+void ut_new_boot()
+{
+ PSI_MEMORY_CALL(register_memory)("innodb", pfs_info,
+ static_cast<int>(UT_ARR_SIZE(pfs_info)));
+
+ PSI_memory_info pfs_info_auto[NKEYS];
+ for (int i= 0; i < NKEYS; i++)
+ {
+ pfs_info_auto[i]= {&auto_event_keys[i], auto_event_names[i], 0};
+ }
+
+ PSI_MEMORY_CALL(register_memory)("innodb", pfs_info_auto, NKEYS);
+}
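+
+/* After registration, these keys surface in
+performance_schema.memory_summary_global_by_event_name as e.g.
+'memory/innodb/row_log_buf', and the file-name based keys as e.g.
+'memory/innodb/buf0buf'. */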
+
+/** Retrieve a memory key (registered with PFS) corresponding to a source file.
+@param[in] autoevent_idx offset into auto_event_names corresponding to the
+file name of the caller
+@return registered memory key or PSI_NOT_INSTRUMENTED */
+PSI_memory_key ut_new_get_key_by_file(uint32_t autoevent_idx)
+{
+ ut_ad(autoevent_idx < NKEYS);
+ return auto_event_keys[autoevent_idx];
+}
+
+#else /* UNIV_PFS_MEMORY */
+void ut_new_boot(){}
+#endif
diff --git a/storage/innobase/ut/ut0rbt.cc b/storage/innobase/ut/ut0rbt.cc
new file mode 100644
index 00000000..cdd1ef06
--- /dev/null
+++ b/storage/innobase/ut/ut0rbt.cc
@@ -0,0 +1,1140 @@
+/***************************************************************************//**
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/********************************************************************//**
+Red-Black tree implementation
+
+(c) 2007 Oracle/Innobase Oy
+
+Created 2007-03-20 Sunny Bains
+***********************************************************************/
+
+#include "ut0rbt.h"
+
+/**********************************************************************//**
+Definition of a red-black tree
+==============================
+
+A red-black tree is a binary search tree which has the following
+red-black properties:
+
+ 1. Every node is either red or black.
+ 2. Every leaf (NULL - in our case tree->nil) is black.
+ 3. If a node is red, then both its children are black.
+ 4. Every simple path from a node to a descendant leaf contains the
+ same number of black nodes.
+
+ from (3) above, the implication is that on any path from the root
+ to a leaf, red nodes must not be adjacent.
+
+ However, any number of black nodes may appear in a sequence.
+ */
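+
+/* A minimal example satisfying the properties above (B = black, R = red;
+the nil leaves hanging off every node are black by property 2):
+
+             7B
+            /  \
+          3R    18B
+         /  \
+       1B    5B
+
+   Every path from the root to a nil leaf passes through exactly two
+   black internal nodes. */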
+
+#if defined(IB_RBT_TESTING)
+#warning "Testing enabled!"
+#endif
+
+#define ROOT(t) (t->root->left)
+#define SIZEOF_NODE(t) ((sizeof(ib_rbt_node_t) + t->sizeof_value) - 1)
+
+#if defined UNIV_DEBUG || defined IB_RBT_TESTING
+/**********************************************************************//**
+Verify that the keys are in order.
+@return TRUE if ordered, FALSE if not ordered */
+static
+ibool
+rbt_check_ordering(
+/*===============*/
+ const ib_rbt_t* tree) /*!< in: tree to verify */
+{
+ const ib_rbt_node_t* node;
+ const ib_rbt_node_t* prev = NULL;
+
+ /* Iterate over all the nodes, comparing each node with the prev */
+ for (node = rbt_first(tree); node; node = rbt_next(tree, prev)) {
+
+ if (prev) {
+ int result;
+
+ if (tree->cmp_arg) {
+ result = tree->compare_with_arg(
+ tree->cmp_arg, prev->value,
+ node->value);
+ } else {
+ result = tree->compare(
+ prev->value, node->value);
+ }
+
+ if (result >= 0) {
+ return(FALSE);
+ }
+ }
+
+ prev = node;
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Check that every path from the root to the leaves has the same count.
+Count is expressed in the number of black nodes.
+@return 0 on failure else black height of the subtree */
+static
+ibool
+rbt_count_black_nodes(
+/*==================*/
+ const ib_rbt_t* tree, /*!< in: tree to verify */
+ const ib_rbt_node_t* node) /*!< in: start of sub-tree */
+{
+ ulint result;
+
+ if (node != tree->nil) {
+ ulint left_height = rbt_count_black_nodes(tree, node->left);
+
+ ulint right_height = rbt_count_black_nodes(tree, node->right);
+
+ if (left_height == 0
+ || right_height == 0
+ || left_height != right_height) {
+
+ result = 0;
+ } else if (node->color == IB_RBT_RED) {
+
+ /* Case 3 */
+ if (node->left->color != IB_RBT_BLACK
+ || node->right->color != IB_RBT_BLACK) {
+
+ result = 0;
+ } else {
+ result = left_height;
+ }
+ /* Check if it's anything other than RED or BLACK. */
+ } else if (node->color != IB_RBT_BLACK) {
+
+ result = 0;
+ } else {
+
+ result = right_height + 1;
+ }
+ } else {
+ result = 1;
+ }
+
+ return(result);
+}
+#endif /* UNIV_DEBUG || IB_RBT_TESTING */
+
+/**********************************************************************//**
+Turn the node's right child's left sub-tree into node's right sub-tree.
+This will also make node's right child its parent. */
+static
+void
+rbt_rotate_left(
+/*============*/
+ const ib_rbt_node_t* nil, /*!< in: nil node of the tree */
+ ib_rbt_node_t* node) /*!< in: node to rotate */
+{
+ ib_rbt_node_t* right = node->right;
+
+ node->right = right->left;
+
+ if (right->left != nil) {
+ right->left->parent = node;
+ }
+
+ /* Right's new parent was node's parent. */
+ right->parent = node->parent;
+
+ /* Since root's parent is tree->nil and root->parent->left points
+ back to root, we can avoid the check. */
+ if (node == node->parent->left) {
+ /* Node was on the left of its parent. */
+ node->parent->left = right;
+ } else {
+ /* Node must have been on the right. */
+ node->parent->right = right;
+ }
+
+ /* Finally, put node on right's left. */
+ right->left = node;
+ node->parent = right;
+}
+
+/**********************************************************************//**
+Turn the node's left child's right sub-tree into node's left sub-tree.
+This will also make node's left child its parent. */
+static
+void
+rbt_rotate_right(
+/*=============*/
+ const ib_rbt_node_t* nil, /*!< in: nil node of tree */
+ ib_rbt_node_t* node) /*!< in: node to rotate */
+{
+ ib_rbt_node_t* left = node->left;
+
+ node->left = left->right;
+
+ if (left->right != nil) {
+ left->right->parent = node;
+ }
+
+ /* Left's new parent was node's parent. */
+ left->parent = node->parent;
+
+ /* Since root's parent is tree->nil and root->parent->left points
+ back to root, we can avoid the check. */
+ if (node == node->parent->right) {
+ /* Node was on the right of its parent. */
+ node->parent->right = left;
+ } else {
+ /* Node must have been on the left. */
+ node->parent->left = left;
+ }
+
+ /* Finally, put node on left's right. */
+ left->right = node;
+ node->parent = left;
+}
+
+/**********************************************************************//**
+Append a node to the tree. */
+static
+ib_rbt_node_t*
+rbt_tree_add_child(
+/*===============*/
+ const ib_rbt_t* tree,
+ ib_rbt_bound_t* parent,
+ ib_rbt_node_t* node)
+{
+ /* Cast away the const. */
+ ib_rbt_node_t* last = (ib_rbt_node_t*) parent->last;
+
+ if (last == tree->root || parent->result < 0) {
+ last->left = node;
+ } else {
+ /* FIXME: We don't handle duplicates (yet)! */
+ ut_a(parent->result != 0);
+
+ last->right = node;
+ }
+
+ node->parent = last;
+
+ return(node);
+}
+
+/**********************************************************************//**
+Generic binary tree insert */
+static
+ib_rbt_node_t*
+rbt_tree_insert(
+/*============*/
+ ib_rbt_t* tree,
+ const void* key,
+ ib_rbt_node_t* node)
+{
+ ib_rbt_bound_t parent;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ parent.result = 0;
+ parent.last = tree->root;
+
+ /* Regular binary search. */
+ while (current != tree->nil) {
+
+ parent.last = current;
+
+ if (tree->cmp_arg) {
+ parent.result = tree->compare_with_arg(
+ tree->cmp_arg, key, current->value);
+ } else {
+ parent.result = tree->compare(key, current->value);
+ }
+
+ if (parent.result < 0) {
+ current = current->left;
+ } else {
+ current = current->right;
+ }
+ }
+
+ ut_a(current == tree->nil);
+
+ rbt_tree_add_child(tree, &parent, node);
+
+ return(node);
+}
+
+/**********************************************************************//**
+Balance a tree after inserting a node. */
+static
+void
+rbt_balance_tree(
+/*=============*/
+ const ib_rbt_t* tree, /*!< in: tree to balance */
+ ib_rbt_node_t* node) /*!< in: node that was inserted */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* parent = node->parent;
+
+ /* Restore the red-black property. */
+ node->color = IB_RBT_RED;
+
+ while (node != ROOT(tree) && parent->color == IB_RBT_RED) {
+ ib_rbt_node_t* grand_parent = parent->parent;
+
+ if (parent == grand_parent->left) {
+ ib_rbt_node_t* uncle = grand_parent->right;
+
+ if (uncle->color == IB_RBT_RED) {
+
+ /* Case 1 - change the colors. */
+ uncle->color = IB_RBT_BLACK;
+ parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ /* Move node up the tree. */
+ node = grand_parent;
+
+ } else {
+
+ if (node == parent->right) {
+ /* The uncle is black and node is
+ the right child, case 2 - move node
+ up and rotate left. */
+ node = parent;
+ rbt_rotate_left(nil, node);
+ }
+
+ grand_parent = node->parent->parent;
+
+ /* Case 3. */
+ node->parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ rbt_rotate_right(nil, grand_parent);
+ }
+
+ } else {
+ ib_rbt_node_t* uncle = grand_parent->left;
+
+ if (uncle->color == IB_RBT_RED) {
+
+ /* Case 1 - change the colors. */
+ uncle->color = IB_RBT_BLACK;
+ parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ /* Move node up the tree. */
+ node = grand_parent;
+
+ } else {
+
+ if (node == parent->left) {
+ /* The uncle is black and node is
+ the left child, case 2 - move node
+ up and rotate right. */
+ node = parent;
+ rbt_rotate_right(nil, node);
+ }
+
+ grand_parent = node->parent->parent;
+
+ /* Case 3. */
+ node->parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ rbt_rotate_left(nil, grand_parent);
+ }
+ }
+
+ parent = node->parent;
+ }
+
+ /* Color the root black. */
+ ROOT(tree)->color = IB_RBT_BLACK;
+}
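+
+/* A worked example of the loop above: insert 4 into a tree whose black
+root is 10 with a red left child 5. The new node 4 is red, its parent 5
+is red and its uncle (nil) is black, and 4 is a left child, so case 3
+applies directly: 5 turns black, 10 turns red, and a right rotation
+about 10 leaves 5 as the new root with children 4R and 10R. */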
+
+/**********************************************************************//**
+Find the given node's successor.
+@return successor node or NULL if no successor */
+static
+ib_rbt_node_t*
+rbt_find_successor(
+/*===============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: this is declared const
+ because it can be called via
+ rbt_next() */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* next = current->right;
+
+ /* Is there a sub-tree to the right that we can follow? */
+ if (next != nil) {
+
+ /* Follow the left most links of the current right child. */
+ while (next->left != nil) {
+ next = next->left;
+ }
+
+ } else { /* We will have to go up the tree to find the successor. */
+ ib_rbt_node_t* parent = current->parent;
+
+ /* Cast away the const. */
+ next = (ib_rbt_node_t*) current;
+
+ while (parent != tree->root && next == parent->right) {
+ next = parent;
+ parent = next->parent;
+ }
+
+ next = (parent == tree->root) ? NULL : parent;
+ }
+
+ return(next);
+}
+
+/**********************************************************************//**
+Find the given node's predecessor.
+@return predecessor node or NULL if no predecessor */
+static
+ib_rbt_node_t*
+rbt_find_predecessor(
+/*=================*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: this is declared const
+ because it can be called via
+ rbt_prev() */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* prev = current->left;
+
+ /* Is there a sub-tree to the left that we can follow? */
+ if (prev != nil) {
+
+ /* Follow the right most links of the current left child. */
+ while (prev->right != nil) {
+ prev = prev->right;
+ }
+
+ } else { /* We will have to go up the tree to find the predecessor. */
+ ib_rbt_node_t* parent = current->parent;
+
+ /* Cast away the const. */
+ prev = (ib_rbt_node_t*) current;
+
+ while (parent != tree->root && prev == parent->left) {
+ prev = parent;
+ parent = prev->parent;
+ }
+
+ prev = (parent == tree->root) ? NULL : parent;
+ }
+
+ return(prev);
+}
+
+/**********************************************************************//**
+Replace one node with another. After the transformation, the ejected
+node becomes an orphan. */
+static
+void
+rbt_eject_node(
+/*===========*/
+ ib_rbt_node_t* eject, /*!< in: node to eject */
+ ib_rbt_node_t* node) /*!< in: node to replace with */
+{
+ /* Update the to be ejected node's parent's child pointers. */
+ if (eject->parent->left == eject) {
+ eject->parent->left = node;
+ } else if (eject->parent->right == eject) {
+ eject->parent->right = node;
+ } else {
+ ut_a(0);
+ }
+ /* eject is now an orphan but otherwise its pointers
+ and color are left intact. */
+
+ node->parent = eject->parent;
+}
+
+/**********************************************************************//**
+Replace a node with another node. */
+static
+void
+rbt_replace_node(
+/*=============*/
+ ib_rbt_node_t* replace, /*!< in: node to replace */
+ ib_rbt_node_t* node) /*!< in: node to replace with */
+{
+ ib_rbt_color_t color = node->color;
+
+ /* Update the node pointers. */
+ node->left = replace->left;
+ node->right = replace->right;
+
+ /* Update the child node pointers. */
+ node->left->parent = node;
+ node->right->parent = node;
+
+ /* Make the parent of replace point to node. */
+ rbt_eject_node(replace, node);
+
+ /* Swap the colors. */
+ node->color = replace->color;
+ replace->color = color;
+}
+
+/**********************************************************************//**
+Detach node from the tree replacing it with one of its children.
+@return the child node that now occupies the position of the detached node */
+static
+ib_rbt_node_t*
+rbt_detach_node(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_node_t* node) /*!< in: node to detach */
+{
+ ib_rbt_node_t* child;
+ const ib_rbt_node_t* nil = tree->nil;
+
+ if (node->left != nil && node->right != nil) {
+ /* Case where the node to be deleted has two children. */
+ ib_rbt_node_t* successor = rbt_find_successor(tree, node);
+
+ ut_a(successor != nil);
+ ut_a(successor->parent != nil);
+ ut_a(successor->left == nil);
+
+ child = successor->right;
+
+ /* Remove the successor node and replace with its child. */
+ rbt_eject_node(successor, child);
+
+ /* Replace the node to delete with its successor node. */
+ rbt_replace_node(node, successor);
+ } else {
+ ut_a(node->left == nil || node->right == nil);
+
+ child = (node->left != nil) ? node->left : node->right;
+
+ /* Replace the node to delete with one of its children. */
+ rbt_eject_node(node, child);
+ }
+
+ /* Reset the node links. */
+ node->parent = node->right = node->left = tree->nil;
+
+ return(child);
+}
+
+/**********************************************************************//**
+Rebalance the right sub-tree after deletion.
+@return node to rebalance if more rebalancing required else NULL */
+static
+ib_rbt_node_t*
+rbt_balance_right(
+/*==============*/
+ const ib_rbt_node_t* nil, /*!< in: rb tree nil node */
+ ib_rbt_node_t* parent, /*!< in: parent node */
+ ib_rbt_node_t* sibling) /*!< in: sibling node */
+{
+ ib_rbt_node_t* node = NULL;
+
+ ut_a(sibling != nil);
+
+ /* Case 3. */
+ if (sibling->color == IB_RBT_RED) {
+
+ parent->color = IB_RBT_RED;
+ sibling->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, parent);
+
+ sibling = parent->right;
+
+ ut_a(sibling != nil);
+ }
+
+ /* The sibling is now black. If both of its children are black,
+ recolor the sibling red and push the rebalancing up to the parent. */
+ if (sibling->left->color == IB_RBT_BLACK
+ && sibling->right->color == IB_RBT_BLACK) {
+
+ node = parent; /* Parent needs to be rebalanced too. */
+ sibling->color = IB_RBT_RED;
+
+ } else {
+ if (sibling->right->color == IB_RBT_BLACK) {
+
+ ut_a(sibling->left->color == IB_RBT_RED);
+
+ sibling->color = IB_RBT_RED;
+ sibling->left->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, sibling);
+
+ sibling = parent->right;
+ ut_a(sibling != nil);
+ }
+
+ sibling->color = parent->color;
+ sibling->right->color = IB_RBT_BLACK;
+
+ parent->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, parent);
+ }
+
+ return(node);
+}
+
+/**********************************************************************//**
+Rebalance the left sub-tree after deletion.
+@return node to rebalance if more rebalancing required else NULL */
+static
+ib_rbt_node_t*
+rbt_balance_left(
+/*=============*/
+ const ib_rbt_node_t* nil, /*!< in: rb tree nil node */
+ ib_rbt_node_t* parent, /*!< in: parent node */
+ ib_rbt_node_t* sibling) /*!< in: sibling node */
+{
+ ib_rbt_node_t* node = NULL;
+
+ ut_a(sibling != nil);
+
+ /* Case 3. */
+ if (sibling->color == IB_RBT_RED) {
+
+ parent->color = IB_RBT_RED;
+ sibling->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, parent);
+ sibling = parent->left;
+
+ ut_a(sibling != nil);
+ }
+
+ /* Handle a possible violation of case 3 caused by the change above. */
+ if (sibling->right->color == IB_RBT_BLACK
+ && sibling->left->color == IB_RBT_BLACK) {
+
+ node = parent; /* Parent needs to be rebalanced too. */
+ sibling->color = IB_RBT_RED;
+
+ } else {
+ if (sibling->left->color == IB_RBT_BLACK) {
+
+ ut_a(sibling->right->color == IB_RBT_RED);
+
+ sibling->color = IB_RBT_RED;
+ sibling->right->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, sibling);
+
+ sibling = parent->left;
+
+ ut_a(sibling != nil);
+ }
+
+ sibling->color = parent->color;
+ sibling->left->color = IB_RBT_BLACK;
+
+ parent->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, parent);
+ }
+
+ return(node);
+}
+
+/**********************************************************************//**
+Delete the node and rebalance the tree if necessary */
+static
+void
+rbt_remove_node_and_rebalance(
+/*==========================*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_node_t* node) /*!< in: node to remove */
+{
+ /* Detach the node and get the node that will be used
+ as the starting point for rebalancing. */
+ ib_rbt_node_t* child = rbt_detach_node(tree, node);
+
+ if (node->color == IB_RBT_BLACK) {
+ ib_rbt_node_t* last = child;
+
+ ROOT(tree)->color = IB_RBT_RED;
+
+ while (child && child->color == IB_RBT_BLACK) {
+ ib_rbt_node_t* parent = child->parent;
+
+ /* Did the deletion cause an imbalance in the
+ parent's left sub-tree? */
+ if (parent->left == child) {
+
+ child = rbt_balance_right(
+ tree->nil, parent, parent->right);
+
+ } else if (parent->right == child) {
+
+ child = rbt_balance_left(
+ tree->nil, parent, parent->left);
+
+ } else {
+ ut_error;
+ }
+
+ if (child) {
+ last = child;
+ }
+ }
+
+ ut_a(last);
+
+ last->color = IB_RBT_BLACK;
+ ROOT(tree)->color = IB_RBT_BLACK;
+ }
+
+ /* Note that we have removed a node from the tree. */
+ --tree->n_nodes;
+}
+
+/**********************************************************************//**
+Recursively free the nodes. */
+static
+void
+rbt_free_node(
+/*==========*/
+ ib_rbt_node_t* node, /*!< in: node to free */
+ ib_rbt_node_t* nil) /*!< in: rb tree nil node */
+{
+ if (node != nil) {
+ rbt_free_node(node->left, nil);
+ rbt_free_node(node->right, nil);
+
+ ut_free(node);
+ }
+}
+
+/**********************************************************************//**
+Free all the nodes and free the tree. */
+void
+rbt_free(
+/*=====*/
+ ib_rbt_t* tree) /*!< in: rb tree to free */
+{
+ rbt_free_node(tree->root, tree->nil);
+ ut_free(tree->nil);
+ ut_free(tree);
+}
+
+/**********************************************************************//**
+Create an instance of a red black tree, whose comparison function takes
+an argument
+@return an empty rb tree */
+ib_rbt_t*
+rbt_create_arg_cmp(
+/*===============*/
+ size_t sizeof_value, /*!< in: sizeof data item */
+ ib_rbt_arg_compare
+ compare, /*!< in: fn to compare items */
+ void* cmp_arg) /*!< in: compare fn arg */
+{
+ ib_rbt_t* tree;
+
+ ut_a(cmp_arg);
+
+ tree = rbt_create(sizeof_value, NULL);
+ tree->cmp_arg = cmp_arg;
+ tree->compare_with_arg = compare;
+
+ return(tree);
+}
+
+/**********************************************************************//**
+Create an instance of a red black tree.
+@return an empty rb tree */
+ib_rbt_t*
+rbt_create(
+/*=======*/
+ size_t sizeof_value, /*!< in: sizeof data item */
+ ib_rbt_compare compare) /*!< in: fn to compare items */
+{
+ ib_rbt_t* tree;
+ ib_rbt_node_t* node;
+
+ tree = (ib_rbt_t*) ut_zalloc_nokey(sizeof(*tree));
+
+ tree->sizeof_value = sizeof_value;
+
+ /* Create the sentinel (NIL) node. */
+ node = tree->nil = (ib_rbt_node_t*) ut_zalloc_nokey(sizeof(*node));
+
+ node->color = IB_RBT_BLACK;
+ node->parent = node->left = node->right = node;
+
+ /* Create the "fake" root, the real root node will be the
+ left child of this node. */
+ node = tree->root = (ib_rbt_node_t*) ut_zalloc_nokey(sizeof(*node));
+
+ node->color = IB_RBT_BLACK;
+ node->parent = node->left = node->right = tree->nil;
+
+ tree->compare = compare;
+
+ return(tree);
+}
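+
+/* Illustrative sketch (not part of the implementation): a minimal
+caller-supplied comparison function matching the ib_rbt_compare
+contract (negative, zero or positive result), and tree creation.
+The name my_int_cmp is hypothetical.
+
+ static int my_int_cmp(const void* p1, const void* p2)
+ {
+ int a = *static_cast<const int*>(p1);
+ int b = *static_cast<const int*>(p2);
+ return(a < b ? -1 : (a > b ? 1 : 0));
+ }
+
+ ib_rbt_t* tree = rbt_create(sizeof(int), my_int_cmp);
+*/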
+
+/**********************************************************************//**
+Generic insert of a value in the rb tree.
+@return inserted node */
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key, /*!< in: key for ordering */
+ const void* value) /*!< in: value of key, this value
+ is copied to the node */
+{
+ ib_rbt_node_t* node;
+
+ /* Create the node that will hold the value data. */
+ node = (ib_rbt_node_t*) ut_malloc_nokey(SIZEOF_NODE(tree));
+
+ memcpy(node->value, value, tree->sizeof_value);
+ node->parent = node->left = node->right = tree->nil;
+
+ /* Insert in the tree in the usual way. */
+ rbt_tree_insert(tree, key, node);
+ rbt_balance_tree(tree, node);
+
+ ++tree->n_nodes;
+
+ return(node);
+}
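+
+/* Usage sketch, continuing the hypothetical integer tree from the
+sketch above. Note that rbt_insert() copies sizeof_value bytes from
+'value' into the node, so key and value may point to the same datum
+here.
+
+ int key = 42;
+
+ rbt_insert(tree, &key, &key);
+ rbt_delete(tree, &key);
+ rbt_free(tree);
+*/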
+
+/**********************************************************************//**
+Add a new node to the tree, useful for data that is pre-sorted.
+@return appended node */
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: bounds */
+ const void* value) /*!< in: this value is copied
+ to the node */
+{
+ ib_rbt_node_t* node;
+
+ /* Create the node that will hold the value data */
+ node = (ib_rbt_node_t*) ut_malloc_nokey(SIZEOF_NODE(tree));
+
+ memcpy(node->value, value, tree->sizeof_value);
+ node->parent = node->left = node->right = tree->nil;
+
+ /* If tree is empty */
+ if (parent->last == NULL) {
+ parent->last = tree->root;
+ }
+
+ /* Append the node; the hope here is that the caller knows
+ what they are doing. */
+ rbt_tree_add_child(tree, parent, node);
+ rbt_balance_tree(tree, node);
+
+ ++tree->n_nodes;
+
+#if defined UNIV_DEBUG || defined IB_RBT_TESTING
+ ut_a(rbt_validate(tree));
+#endif
+ return(node);
+}
+
+/**********************************************************************//**
+Find a matching node in the rb tree.
+@return NULL if not found else the node where key was found */
+static
+const ib_rbt_node_t*
+rbt_lookup(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to use for search */
+{
+ const ib_rbt_node_t* current = ROOT(tree);
+
+ /* Regular binary search. */
+ while (current != tree->nil) {
+ int result;
+
+ if (tree->cmp_arg) {
+ result = tree->compare_with_arg(
+ tree->cmp_arg, key, current->value);
+ } else {
+ result = tree->compare(key, current->value);
+ }
+
+ if (result < 0) {
+ current = current->left;
+ } else if (result > 0) {
+ current = current->right;
+ } else {
+ break;
+ }
+ }
+
+ return(current != tree->nil ? current : NULL);
+}
+
+/**********************************************************************//**
+Delete the node identified by key.
+@return TRUE on success, FALSE if not found */
+ibool
+rbt_delete(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to delete */
+{
+ ibool deleted = FALSE;
+ ib_rbt_node_t* node = (ib_rbt_node_t*) rbt_lookup(tree, key);
+
+ if (node) {
+ rbt_remove_node_and_rebalance(tree, node);
+
+ ut_free(node);
+ deleted = TRUE;
+ }
+
+ return(deleted);
+}
+
+/**********************************************************************//**
+Remove a node from the rb tree; the node is not freed, that is the
+caller's responsibility.
+@return deleted node but without the const */
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* const_node) /*!< in: node to delete, this
+ is a fudge and declared const
+ because the caller can access
+ only const nodes */
+{
+ /* Cast away the const. */
+ rbt_remove_node_and_rebalance(tree, (ib_rbt_node_t*) const_node);
+
+ /* This is to make it easier to do something like this:
+ ut_free(rbt_remove_node(tree, node));
+ */
+
+ return((ib_rbt_node_t*) const_node);
+}
+
+/**********************************************************************//**
+Find the node that has the greatest key that is <= key.
+@return value of result */
+int
+rbt_search(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key) /*!< in: key to search */
+{
+ ib_rbt_node_t* current = ROOT(tree);
+
+ /* Everything is greater than the NULL root. */
+ parent->result = 1;
+ parent->last = NULL;
+
+ while (current != tree->nil) {
+
+ parent->last = current;
+
+ if (tree->cmp_arg) {
+ parent->result = tree->compare_with_arg(
+ tree->cmp_arg, key, current->value);
+ } else {
+ parent->result = tree->compare(key, current->value);
+ }
+
+ if (parent->result > 0) {
+ current = current->right;
+ } else if (parent->result < 0) {
+ current = current->left;
+ } else {
+ break;
+ }
+ }
+
+ return(parent->result);
+}
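+
+/* Illustrative pattern (compare rbt_merge_uniq() below): rbt_search()
+records the last visited node and the last comparison result in
+'parent', so after a missed lookup the caller can insert directly at
+the recorded position:
+
+ ib_rbt_bound_t parent;
+
+ if (rbt_search(tree, &parent, &key) != 0) {
+ rbt_add_node(tree, &parent, &key);
+ }
+*/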
+
+/**********************************************************************//**
+Find the node that has the greatest key that is <= key, using the
+supplied comparison function.
+@return value of result */
+int
+rbt_search_cmp(
+/*===========*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key, /*!< in: key to search */
+ ib_rbt_compare compare, /*!< in: fn to compare items */
+ ib_rbt_arg_compare
+ arg_compare) /*!< in: fn to compare items
+ with argument */
+{
+ ib_rbt_node_t* current = ROOT(tree);
+
+ /* Everything is greater than the NULL root. */
+ parent->result = 1;
+ parent->last = NULL;
+
+ while (current != tree->nil) {
+
+ parent->last = current;
+
+ if (arg_compare) {
+ ut_ad(tree->cmp_arg);
+ parent->result = arg_compare(
+ tree->cmp_arg, key, current->value);
+ } else {
+ parent->result = compare(key, current->value);
+ }
+
+ if (parent->result > 0) {
+ current = current->right;
+ } else if (parent->result < 0) {
+ current = current->left;
+ } else {
+ break;
+ }
+ }
+
+ return(parent->result);
+}
+
+/**********************************************************************//**
+Return the leftmost node in the tree.
+@return the leftmost node or NULL */
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+ const ib_rbt_t* tree) /*!< in: rb tree */
+{
+ ib_rbt_node_t* first = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ first = current;
+ current = current->left;
+ }
+
+ return(first);
+}
+
+/**********************************************************************//**
+Return the rightmost node in the tree.
+@return the rightmost node or NULL */
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+ const ib_rbt_t* tree) /*!< in: rb tree */
+{
+ ib_rbt_node_t* last = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ last = current;
+ current = current->right;
+ }
+
+ return(last);
+}
+
+/**********************************************************************//**
+Return the next node.
+@return node next from current */
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: current node */
+{
+ return(current ? rbt_find_successor(tree, current) : NULL);
+}
+
+/**********************************************************************//**
+Return the previous node.
+@return node prev from current */
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: current node */
+{
+ return(current ? rbt_find_predecessor(tree, current) : NULL);
+}
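+
+/* Iteration sketch: walk the hypothetical integer tree in ascending
+key order, using the rbt_value() accessor from ut0rbt.h.
+
+ for (const ib_rbt_node_t* node = rbt_first(tree);
+ node != NULL;
+ node = rbt_next(tree, node)) {
+ int v = *rbt_value(int, node);
+ }
+*/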
+
+/**********************************************************************//**
+Merge the nodes from src into dst, skipping keys that already exist in dst.
+@return number of nodes merged */
+ulint
+rbt_merge_uniq(
+/*===========*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ const ib_rbt_t* src) /*!< in: src rb tree */
+{
+ ib_rbt_bound_t parent;
+ ulint n_merged = 0;
+ const ib_rbt_node_t* src_node = rbt_first(src);
+
+ if (rbt_empty(src) || dst == src) {
+ return(0);
+ }
+
+ for (/* No op */; src_node; src_node = rbt_next(src, src_node)) {
+
+ if (rbt_search(dst, &parent, src_node->value) != 0) {
+ rbt_add_node(dst, &parent, src_node->value);
+ ++n_merged;
+ }
+ }
+
+ return(n_merged);
+}
+
+#if defined UNIV_DEBUG || defined IB_RBT_TESTING
+/**********************************************************************//**
+Check that every path from the root to the leaves has the same black node
+count and that the tree nodes are in order.
+@return TRUE if OK FALSE otherwise */
+ibool
+rbt_validate(
+/*=========*/
+ const ib_rbt_t* tree) /*!< in: RB tree to validate */
+{
+ if (rbt_count_black_nodes(tree, ROOT(tree)) > 0) {
+ return(rbt_check_ordering(tree));
+ }
+
+ return(FALSE);
+}
+#endif /* UNIV_DEBUG || IB_RBT_TESTING */
diff --git a/storage/innobase/ut/ut0rnd.cc b/storage/innobase/ut/ut0rnd.cc
new file mode 100644
index 00000000..a2e56951
--- /dev/null
+++ b/storage/innobase/ut/ut0rnd.cc
@@ -0,0 +1,93 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0rnd.cc
+Random numbers and hashing
+
+Created 5/11/1994 Heikki Tuuri
+********************************************************************/
+
+#include "ut0rnd.h"
+
+/** Seed value of ut_rnd_gen() */
+std::atomic<uint32_t> ut_rnd_current;
+
+/** These random numbers are used in ut_find_prime */
+/*@{*/
+#define UT_RANDOM_1 1.0412321
+#define UT_RANDOM_2 1.1131347
+#define UT_RANDOM_3 1.0132677
+/*@}*/
+
+/***********************************************************//**
+Looks for a prime number slightly greater than the given argument.
+The prime is chosen so that it is not near any power of 2.
+@return prime */
+ulint
+ut_find_prime(
+/*==========*/
+ ulint n) /*!< in: positive number > 100 */
+{
+ ulint pow2;
+ ulint i;
+
+ n += 100;
+
+ pow2 = 1;
+ while (pow2 * 2 < n) {
+ pow2 = 2 * pow2;
+ }
+
+ if ((double) n < 1.05 * (double) pow2) {
+ n = (ulint) ((double) n * UT_RANDOM_1);
+ }
+
+ pow2 = 2 * pow2;
+
+ if ((double) n > 0.95 * (double) pow2) {
+ n = (ulint) ((double) n * UT_RANDOM_2);
+ }
+
+ if (n > pow2 - 20) {
+ n += 30;
+ }
+
+ /* Now we have n far enough from powers of 2. To make
+n more random (especially if it was not near
+ a power of 2), we then multiply it by a random number. */
+
+ n = (ulint) ((double) n * UT_RANDOM_3);
+
+ for (;; n++) {
+ i = 2;
+ while (i * i <= n) {
+ if (n % i == 0) {
+ goto next_n;
+ }
+ i++;
+ }
+
+ /* Found a prime */
+ break;
+next_n: ;
+ }
+
+ return(n);
+}
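+
+/* Usage sketch: a typical caller sizes an internal hash table with a
+prime that is not close to a power of 2. The variable
+n_expected_entries is a hypothetical caller-chosen estimate.
+
+ ulint n_cells = ut_find_prime(2 * n_expected_entries);
+*/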
diff --git a/storage/innobase/ut/ut0ut.cc b/storage/innobase/ut/ut0ut.cc
new file mode 100644
index 00000000..1dd1cff6
--- /dev/null
+++ b/storage/innobase/ut/ut0ut.cc
@@ -0,0 +1,648 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0ut.cc
+Various utilities for Innobase.
+
+Created 5/11/1994 Heikki Tuuri
+********************************************************************/
+
+#include "ha_prototypes.h"
+
+#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+
+#ifndef UNIV_INNOCHECKSUM
+#include <mysql_com.h>
+#include "os0thread.h"
+#include "ut0ut.h"
+#include "trx0trx.h"
+#include <string>
+#include "log.h"
+#include "my_cpu.h"
+#ifndef DBUG_OFF
+#include "rem0rec.h"
+#endif
+
+/**********************************************************//**
+Returns the number of milliseconds since some epoch. The
+value may wrap around. It should only be used for heuristic
+purposes.
+@return ms since epoch */
+ulint
+ut_time_ms(void)
+/*============*/
+{
+ return static_cast<ulint>(my_interval_timer() / 1000000);
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/**********************************************************//**
+Prints a timestamp to a file. */
+void
+ut_print_timestamp(
+/*===============*/
+ FILE* file) /*!< in: file where to print */
+{
+#ifdef _WIN32
+ SYSTEMTIME cal_tm;
+ GetLocalTime(&cal_tm);
+#else
+ time_t tm;
+ struct tm cal_tm;
+ time(&tm);
+ localtime_r(&tm, &cal_tm);
+#endif
+ fprintf(file,
+ IF_WIN("%u-%02u-%02u %02u:%02u:%02u %#zx",
+ "%d-%02d-%02d %02d:%02d:%02d %#zx"),
+#ifdef _WIN32
+ cal_tm.wYear,
+ cal_tm.wMonth,
+ cal_tm.wDay,
+ cal_tm.wHour,
+ cal_tm.wMinute,
+ cal_tm.wSecond,
+#else
+ cal_tm.tm_year + 1900,
+ cal_tm.tm_mon + 1,
+ cal_tm.tm_mday,
+ cal_tm.tm_hour,
+ cal_tm.tm_min,
+ cal_tm.tm_sec,
+#endif
+#ifdef UNIV_INNOCHECKSUM
+ ulint{0}
+#else
+ ulint(os_thread_get_curr_id())
+#endif
+ );
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/**********************************************************//**
+Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
+void
+ut_sprintf_timestamp(
+/*=================*/
+ char* buf) /*!< in: buffer where to sprintf */
+{
+#ifdef _WIN32
+ SYSTEMTIME cal_tm;
+ GetLocalTime(&cal_tm);
+
+ sprintf(buf, "%02u%02u%02u %2u:%02u:%02u",
+ cal_tm.wYear % 100,
+ cal_tm.wMonth,
+ cal_tm.wDay,
+ cal_tm.wHour,
+ cal_tm.wMinute,
+ cal_tm.wSecond);
+#else
+ time_t tm;
+ struct tm cal_tm;
+ time(&tm);
+ localtime_r(&tm, &cal_tm);
+ sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
+ cal_tm.tm_year % 100,
+ cal_tm.tm_mon + 1,
+ cal_tm.tm_mday,
+ cal_tm.tm_hour,
+ cal_tm.tm_min,
+ cal_tm.tm_sec);
+#endif
+}
+
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ascii. */
+void
+ut_print_buf(
+/*=========*/
+ FILE* file, /*!< in: file where to print */
+ const void* buf, /*!< in: memory buffer */
+ ulint len) /*!< in: length of the buffer */
+{
+ const byte* data;
+ ulint i;
+
+ fprintf(file, " len " ULINTPF "; hex ", len);
+
+ for (data = (const byte*) buf, i = 0; i < len; i++) {
+ fprintf(file, "%02x", *data++);
+ }
+
+ fputs("; asc ", file);
+
+ data = (const byte*) buf;
+
+ for (i = 0; i < len; i++) {
+ int c = (int) *data++;
+ putc(isprint(c) ? c : ' ', file);
+ }
+
+ putc(';', file);
+}
+
+/*************************************************************//**
+Prints the contents of a memory buffer in hex. */
+void
+ut_print_buf_hex(
+/*=============*/
+ std::ostream& o, /*!< in/out: output stream */
+ const void* buf, /*!< in: memory buffer */
+ ulint len) /*!< in: length of the buffer */
+{
+ const byte* data;
+ ulint i;
+
+ static const char hexdigit[16] = {
+ '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F'
+ };
+
+ o << "(0x";
+
+ for (data = static_cast<const byte*>(buf), i = 0; i < len; i++) {
+ byte b = *data++;
+ o << hexdigit[int(b) >> 4] << hexdigit[b & 15];
+ }
+
+ o << ")";
+}
+
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ascii. */
+void
+ut_print_buf(
+/*=========*/
+ std::ostream& o, /*!< in/out: output stream */
+ const void* buf, /*!< in: memory buffer */
+ ulint len) /*!< in: length of the buffer */
+{
+ const byte* data;
+ ulint i;
+
+ for (data = static_cast<const byte*>(buf), i = 0; i < len; i++) {
+ int c = static_cast<int>(*data++);
+ o << (isprint(c) ? static_cast<char>(c) : ' ');
+ }
+
+ ut_print_buf_hex(o, buf, len);
+}
+
+/** Get a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier.
+ @param [in] trx transaction (NULL=no quotes).
+ @param [in] name table name.
+ @return string quoted as an SQL identifier.
+*/
+std::string
+ut_get_name(
+ const trx_t* trx,
+ const char* name)
+{
+ /* 2 * NAME_LEN for database and table name,
+ and some slack for the #mysql50# prefix and quotes */
+ char buf[3 * NAME_LEN];
+ const char* bufend;
+
+ bufend = innobase_convert_name(buf, sizeof buf,
+ name, strlen(name),
+ trx ? trx->mysql_thd : NULL);
+ buf[bufend - buf] = '\0';
+ return(std::string(buf, 0, size_t(bufend - buf)));
+}
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+void
+ut_print_name(
+/*==========*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ const char* name) /*!< in: name to print */
+{
+ /* 2 * NAME_LEN for database and table name,
+ and some slack for the #mysql50# prefix and quotes */
+ char buf[3 * NAME_LEN];
+ const char* bufend;
+
+ bufend = innobase_convert_name(buf, sizeof buf,
+ name, strlen(name),
+ trx ? trx->mysql_thd : NULL);
+
+ if (fwrite(buf, 1, size_t(bufend - buf), f) != size_t(bufend - buf)) {
+ perror("fwrite");
+ }
+}
+
+/** Format a table name, quoted as an SQL identifier.
+If the name contains a slash '/', the result will contain two
+identifiers separated by a period (.), as in SQL
+database_name.table_name.
+@see table_name_t
+@param[in] name table or index name
+@param[out] formatted formatted result, will be NUL-terminated
+@param[in] formatted_size size of the buffer in bytes
+@return pointer to 'formatted' */
+char*
+ut_format_name(
+ const char* name,
+ char* formatted,
+ ulint formatted_size)
+{
+ switch (formatted_size) {
+ case 1:
+ formatted[0] = '\0';
+ /* FALL-THROUGH */
+ case 0:
+ return(formatted);
+ }
+
+ char* end;
+
+ end = innobase_convert_name(formatted, formatted_size,
+ name, strlen(name), NULL);
+
+ /* If the space in 'formatted' was completely used, then sacrifice
+ the last character in order to write '\0' at the end. */
+ if ((ulint) (end - formatted) == formatted_size) {
+ end--;
+ }
+
+ ut_a((ulint) (end - formatted) < formatted_size);
+
+ *end = '\0';
+
+ return(formatted);
+}
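+
+/* Usage sketch: format a name into a caller-supplied buffer, sized
+as in ut_print_name() above. The table name is illustrative.
+
+ char buf[3 * NAME_LEN];
+
+ ut_format_name("db/table", buf, sizeof buf);
+*/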
+
+/**********************************************************************//**
+Catenate files. */
+void
+ut_copy_file(
+/*=========*/
+ FILE* dest, /*!< in: output file */
+ FILE* src) /*!< in: input file to be appended to output */
+{
+ long len = ftell(src);
+ char buf[4096];
+
+ rewind(src);
+ do {
+ size_t maxs = len < (long) sizeof buf
+ ? (size_t) len
+ : sizeof buf;
+ size_t size = fread(buf, 1, maxs, src);
+ if (fwrite(buf, 1, size, dest) != size) {
+ perror("fwrite");
+ }
+ len -= (long) size;
+ if (size < maxs) {
+ break;
+ }
+ } while (len > 0);
+}
+
+/** Convert an error number to a human readable text message.
+The returned string is static and should not be freed or modified.
+@param[in] num InnoDB internal error number
+@return string, describing the error */
+const char*
+ut_strerr(
+ dberr_t num)
+{
+ switch (num) {
+ case DB_SUCCESS:
+ return("Success");
+ case DB_SUCCESS_LOCKED_REC:
+ return("Success, record lock created");
+ case DB_ERROR:
+ return("Generic error");
+ case DB_READ_ONLY:
+ return("Read only transaction");
+ case DB_INTERRUPTED:
+ return("Operation interrupted");
+ case DB_OUT_OF_MEMORY:
+ return("Cannot allocate memory");
+ case DB_OUT_OF_FILE_SPACE:
+ return("Out of disk space");
+ case DB_LOCK_WAIT:
+ return("Lock wait");
+ case DB_DEADLOCK:
+ return("Deadlock");
+ case DB_ROLLBACK:
+ return("Rollback");
+ case DB_DUPLICATE_KEY:
+ return("Duplicate key");
+ case DB_MISSING_HISTORY:
+ return("Required history data has been deleted");
+ case DB_CLUSTER_NOT_FOUND:
+ return("Cluster not found");
+ case DB_TABLE_NOT_FOUND:
+ return("Table not found");
+ case DB_MUST_GET_MORE_FILE_SPACE:
+ return("More file space needed");
+ case DB_TABLE_IS_BEING_USED:
+ return("Table is being used");
+ case DB_TOO_BIG_RECORD:
+ return("Record too big");
+ case DB_TOO_BIG_INDEX_COL:
+ return("Index columns size too big");
+ case DB_LOCK_WAIT_TIMEOUT:
+ return("Lock wait timeout");
+ case DB_NO_REFERENCED_ROW:
+ return("Referenced key value not found");
+ case DB_ROW_IS_REFERENCED:
+ return("Row is referenced");
+ case DB_CANNOT_ADD_CONSTRAINT:
+ return("Cannot add constraint");
+ case DB_CORRUPTION:
+ return("Data structure corruption");
+ case DB_CANNOT_DROP_CONSTRAINT:
+ return("Cannot drop constraint");
+ case DB_NO_SAVEPOINT:
+ return("No such savepoint");
+ case DB_TABLESPACE_EXISTS:
+ return("Tablespace already exists");
+ case DB_TABLESPACE_DELETED:
+ return("Tablespace deleted or being deleted");
+ case DB_TABLESPACE_NOT_FOUND:
+ return("Tablespace not found");
+ case DB_LOCK_TABLE_FULL:
+ return("Lock structs have exhausted the buffer pool");
+ case DB_FOREIGN_DUPLICATE_KEY:
+ return("Foreign key activated with duplicate keys");
+ case DB_FOREIGN_EXCEED_MAX_CASCADE:
+ return("Foreign key cascade delete/update exceeds max depth");
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ return("Too many concurrent transactions");
+ case DB_UNSUPPORTED:
+ return("Unsupported");
+ case DB_INVALID_NULL:
+ return("NULL value encountered in NOT NULL column");
+ case DB_STATS_DO_NOT_EXIST:
+ return("Persistent statistics do not exist");
+ case DB_FAIL:
+ return("Failed, retry may succeed");
+ case DB_OVERFLOW:
+ return("Overflow");
+ case DB_UNDERFLOW:
+ return("Underflow");
+ case DB_STRONG_FAIL:
+ return("Failed, retry will not succeed");
+ case DB_ZIP_OVERFLOW:
+ return("Zip overflow");
+ case DB_RECORD_NOT_FOUND:
+ return("Record not found");
+ case DB_CHILD_NO_INDEX:
+ return("No index on referencing keys in referencing table");
+ case DB_PARENT_NO_INDEX:
+ return("No index on referenced keys in referenced table");
+ case DB_FTS_INVALID_DOCID:
+ return("FTS Doc ID cannot be zero");
+ case DB_INDEX_CORRUPT:
+ return("Index corrupted");
+ case DB_UNDO_RECORD_TOO_BIG:
+ return("Undo record too big");
+ case DB_END_OF_INDEX:
+ return("End of index");
+ case DB_IO_ERROR:
+ return("I/O error");
+ case DB_TABLE_IN_FK_CHECK:
+ return("Table is being used in foreign key check");
+ case DB_NOT_FOUND:
+ return("not found");
+ case DB_ONLINE_LOG_TOO_BIG:
+ return("Log size exceeded during online index creation");
+ case DB_IDENTIFIER_TOO_LONG:
+ return("Identifier name is too long");
+ case DB_FTS_EXCEED_RESULT_CACHE_LIMIT:
+ return("FTS query exceeds result cache limit");
+ case DB_TEMP_FILE_WRITE_FAIL:
+ return("Temp file write failure");
+ case DB_CANT_CREATE_GEOMETRY_OBJECT:
+ return("Can't create specificed geometry data object");
+ case DB_CANNOT_OPEN_FILE:
+ return("Cannot open a file");
+ case DB_TABLE_CORRUPT:
+ return("Table is corrupted");
+ case DB_FTS_TOO_MANY_WORDS_IN_PHRASE:
+ return("Too many words in a FTS phrase or proximity search");
+ case DB_DECRYPTION_FAILED:
+ return("Table is encrypted but decrypt failed.");
+ case DB_IO_PARTIAL_FAILED:
+ return("Partial IO failed");
+ case DB_FORCED_ABORT:
+ return("Transaction aborted by another higher priority "
+ "transaction");
+ case DB_COMPUTE_VALUE_FAILED:
+ return("Compute generated column failed");
+ case DB_NO_FK_ON_S_BASE_COL:
+ return("Cannot add foreign key on the base column "
+ "of stored column");
+ case DB_IO_NO_PUNCH_HOLE:
+ return ("File system does not support punch hole (trim) operation.");
+ case DB_PAGE_CORRUPTED:
+ return("Page read from tablespace is corrupted.");
+
+ /* do not add default: in order to produce a warning if new code
+ is added to the enum but not added here */
+ }
+
+ /* we abort here because if unknown error code is given, this could
+ mean that memory corruption has happened and someone's error-code
+ variable has been overwritten with bogus data */
+ ut_error;
+
+ /* NOT REACHED */
+ return("Unknown error");
+}
+
+#ifdef UNIV_PFS_MEMORY
+
+/** Extract the basename of a file without its extension.
+For example, extract "foo0bar" out of "/path/to/foo0bar.cc".
+@param[in] file file path, e.g. "/path/to/foo0bar.cc"
+@param[out] base result, e.g. "foo0bar"
+@param[in] base_size size of the output buffer 'base', if there
+is not enough space, then the result will be truncated, but always
+'\0'-terminated
+@return number of characters that would have been printed if the size
+were unlimited (not including the final '\0') */
+size_t
+ut_basename_noext(
+ const char* file,
+ char* base,
+ size_t base_size)
+{
+ /* Assuming 'file' contains something like the following,
+ extract the file name without the extension out of it by
+ setting 'beg' and 'len'.
+ ...mysql-trunk/storage/innobase/dict/dict0dict.cc:302
+ ^-- beg, len=9
+ */
+
+ const char* beg = strrchr(file, OS_PATH_SEPARATOR);
+
+ if (beg == NULL) {
+ beg = file;
+ } else {
+ beg++;
+ }
+
+ size_t len = strlen(beg);
+
+ const char* end = strrchr(beg, '.');
+
+ if (end != NULL) {
+ len = end - beg;
+ }
+
+ const size_t copy_len = std::min(len, base_size - 1);
+
+ memcpy(base, beg, copy_len);
+
+ base[copy_len] = '\0';
+
+ return(len);
+}
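+
+/* Worked example, per the comment above; the buffer size below is an
+illustrative choice.
+
+ char base[8];
+ size_t len = ut_basename_noext("/path/to/foo0bar.cc", base, sizeof base);
+
+Afterwards base contains "foo0bar" and len is 7. With base_size == 4,
+base would contain the truncated "foo" while the return value would
+still be 7. */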
+
+#endif /* UNIV_PFS_MEMORY */
+
+namespace ib {
+
+ATTRIBUTE_COLD logger& logger::operator<<(dberr_t err)
+{
+ m_oss << ut_strerr(err);
+ return *this;
+}
+
+info::~info()
+{
+ sql_print_information("InnoDB: %s", m_oss.str().c_str());
+}
+
+warn::~warn()
+{
+ sql_print_warning("InnoDB: %s", m_oss.str().c_str());
+}
+
+/** true if error::~error() was invoked, false otherwise */
+bool error::logged;
+
+error::~error()
+{
+ sql_print_error("InnoDB: %s", m_oss.str().c_str());
+ logged = true;
+}
+
+#ifdef _MSC_VER
+/* disable warning
+ "ib::fatal::~fatal': destructor never returns, potential memory leak"
+ on Windows.
+*/
+#pragma warning (push)
+#pragma warning (disable : 4722)
+#endif
+
+ATTRIBUTE_NORETURN
+fatal::~fatal()
+{
+ sql_print_error("[FATAL] InnoDB: %s", m_oss.str().c_str());
+ abort();
+}
+
+#ifdef _MSC_VER
+#pragma warning (pop)
+#endif
+
+error_or_warn::~error_or_warn()
+{
+ if (m_error) {
+ sql_print_error("InnoDB: %s", m_oss.str().c_str());
+ } else {
+ sql_print_warning("InnoDB: %s", m_oss.str().c_str());
+ }
+}
+
+fatal_or_error::~fatal_or_error()
+{
+ sql_print_error(m_fatal ? "[FATAL] InnoDB: %s" : "InnoDB: %s",
+ m_oss.str().c_str());
+ if (m_fatal) {
+ abort();
+ }
+}
+
+} // namespace ib
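+
+/* Usage sketch: the ib:: loggers are used as temporaries; the message
+is emitted by the destructor, e.g.:
+
+ ib::warn() << "tablespace " << name << ": " << DB_CORRUPTION;
+
+Here 'name' is an assumed string in scope, and the generic stream
+operator is assumed to be provided by ib::logger in ut0ut.h. */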
+
+#ifndef DBUG_OFF
+static char dbug_print_buf[1024];
+
+const char * dbug_print_rec(const rec_t* rec, const rec_offs* offsets)
+{
+ rec_printer r(rec, offsets);
+ strmake(dbug_print_buf, r.str().c_str(), sizeof(dbug_print_buf) - 1);
+ return dbug_print_buf;
+}
+
+const char * dbug_print_rec(const rec_t* rec, ulint info, const rec_offs* offsets)
+{
+ rec_printer r(rec, info, offsets);
+ strmake(dbug_print_buf, r.str().c_str(), sizeof(dbug_print_buf) - 1);
+ return dbug_print_buf;
+}
+
+const char * dbug_print_rec(const dtuple_t* tuple)
+{
+ rec_printer r(tuple);
+ strmake(dbug_print_buf, r.str().c_str(), sizeof(dbug_print_buf) - 1);
+ return dbug_print_buf;
+}
+
+const char * dbug_print_rec(const dfield_t* field, ulint n)
+{
+ rec_printer r(field, n);
+ strmake(dbug_print_buf, r.str().c_str(), sizeof(dbug_print_buf) - 1);
+ return dbug_print_buf;
+}
+
+const char * dbug_print_rec(const rec_t* rec, dict_index_t* index)
+{
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+ mem_heap_t* tmp_heap = NULL;
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &tmp_heap);
+ rec_printer r(rec, offsets);
+ strmake(dbug_print_buf, r.str().c_str(), sizeof(dbug_print_buf) - 1);
+ return dbug_print_buf;
+}
+#endif /* !DBUG_OFF */
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/ut/ut0vec.cc b/storage/innobase/ut/ut0vec.cc
new file mode 100644
index 00000000..c9262bc9
--- /dev/null
+++ b/storage/innobase/ut/ut0vec.cc
@@ -0,0 +1,73 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file ut/ut0vec.cc
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#include "ut0vec.h"
+#include "mem0mem.h"
+
+/********************************************************************
+Create a new vector with the given initial size. */
+ib_vector_t*
+ib_vector_create(
+/*=============*/
+ /* out: vector */
+ ib_alloc_t* allocator, /* in: vector allocator */
+ ulint sizeof_value, /* in: size of data item */
+ ulint size) /* in: initial size */
+{
+ ib_vector_t* vec;
+
+ ut_a(size > 0);
+
+ vec = static_cast<ib_vector_t*>(
+ allocator->mem_malloc(allocator, sizeof(*vec)));
+
+ vec->used = 0;
+ vec->total = size;
+ vec->allocator = allocator;
+ vec->sizeof_value = sizeof_value;
+
+ vec->data = static_cast<void*>(
+ allocator->mem_malloc(allocator, vec->sizeof_value * size));
+
+ return(vec);
+}
+
+/********************************************************************
+Resize the vector. Currently the vector can only grow: we double
+the number of elements it can hold. */
+void
+ib_vector_resize(
+/*=============*/
+ ib_vector_t* vec) /* in: vector */
+{
+ ulint new_total = vec->total * 2;
+ ulint old_size = vec->used * vec->sizeof_value;
+ ulint new_size = new_total * vec->sizeof_value;
+
+ vec->data = static_cast<void*>(vec->allocator->mem_resize(
+ vec->allocator, vec->data, old_size, new_size));
+
+ vec->total = new_total;
+}
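+
+/* Usage sketch: the push/get accessors live in ut0vec.h. A typical
+sequence with the heap-based allocator looks like this (illustrative;
+'heap' is assumed to be an existing mem_heap_t*):
+
+ ib_alloc_t* alloc = ib_heap_allocator_create(heap);
+ ib_vector_t* vec = ib_vector_create(alloc, sizeof(int), 4);
+ int v = 1;
+
+ ib_vector_push(vec, &v);
+*/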
diff --git a/storage/innobase/ut/ut0wqueue.cc b/storage/innobase/ut/ut0wqueue.cc
new file mode 100644
index 00000000..53bb0c8b
--- /dev/null
+++ b/storage/innobase/ut/ut0wqueue.cc
@@ -0,0 +1,133 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#include "ut0list.h"
+#include "mem0mem.h"
+#include "ut0wqueue.h"
+
+/*******************************************************************//**
+@file ut/ut0wqueue.cc
+A work queue
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Create a new work queue.
+@return work queue */
+ib_wqueue_t*
+ib_wqueue_create(void)
+/*===================*/
+{
+ ib_wqueue_t* wq = static_cast<ib_wqueue_t*>(
+ ut_malloc_nokey(sizeof(*wq)));
+
+ /* Function ib_wqueue_create() has not been used anywhere,
+ so it is not necessary to instrument this mutex. */
+
+ mutex_create(LATCH_ID_WORK_QUEUE, &wq->mutex);
+
+ wq->items = ib_list_create();
+
+ return(wq);
+}
+
+/****************************************************************//**
+Free a work queue. */
+void
+ib_wqueue_free(
+/*===========*/
+ ib_wqueue_t* wq) /*!< in: work queue */
+{
+ mutex_free(&wq->mutex);
+ ib_list_free(wq->items);
+
+ ut_free(wq);
+}
+
+/** Add a work item to the queue.
+@param[in,out] wq work queue
+@param[in] item work item
+@param[in,out] heap memory heap to use for allocating list node
+@param[in] wq_locked work queue mutex locked */
+void
+ib_wqueue_add(ib_wqueue_t* wq, void* item, mem_heap_t* heap, bool wq_locked)
+{
+ if (!wq_locked) {
+ mutex_enter(&wq->mutex);
+ }
+
+ ib_list_add_last(wq->items, item, heap);
+
+ if (!wq_locked) {
+ mutex_exit(&wq->mutex);
+ }
+}
+
+/********************************************************************
+Return first item on work queue or NULL if queue is empty
+@return work item or NULL */
+void*
+ib_wqueue_nowait(
+/*=============*/
+ ib_wqueue_t* wq) /*!< in: work queue */
+{
+ ib_list_node_t* node = NULL;
+
+ mutex_enter(&wq->mutex);
+
+ if (!ib_list_is_empty(wq->items)) {
+ node = ib_list_get_first(wq->items);
+
+ if (node) {
+ ib_list_remove(wq->items, node);
+ }
+ }
+
+ mutex_exit(&wq->mutex);
+
+ return (node ? node->data : NULL);
+}
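+
+/* Producer/consumer sketch (illustrative; 'heap' is assumed to be an
+existing mem_heap_t* used for the list node, and 'item' a pointer to
+the work item):
+
+ ib_wqueue_t* wq = ib_wqueue_create();
+
+ ib_wqueue_add(wq, item, heap, false);
+
+ void* work = ib_wqueue_nowait(wq);
+*/
+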
+/** Check if queue is empty.
+@param wq work queue
+@return whether the queue is empty */
+bool ib_wqueue_is_empty(ib_wqueue_t* wq)
+{
+ mutex_enter(&wq->mutex);
+ bool is_empty = ib_list_is_empty(wq->items);
+ mutex_exit(&wq->mutex);
+ return is_empty;
+}
+
+/********************************************************************
+Get number of items on queue.
+@return number of items on queue */
+ulint
+ib_wqueue_len(
+/*==========*/
+ ib_wqueue_t* wq) /*!< in: work queue */
+{
+ ulint len = 0;
+
+ mutex_enter(&wq->mutex);
+ len = ib_list_len(wq->items);
+ mutex_exit(&wq->mutex);
+
+ return(len);
+}